# ==================================================================
#
#
# Advanced Social Network Analysis
#
# UZH, 31 Oct - 4 Nov 2016
#
# Lecturer: Thomas Grund, thomas.grund@ucd.ie
#
# ==================================================================
#
#
# ==================================================================
#
# Session 1: Introduction to R and networks in R
#
# ==================================================================
#
# The examples make use of the LGang data. This data includes attributes
# and the co-offending network of a youth gang. Data were collected by
# James Densley (Metropolitan University) and Thomas Grund (UCD).
#
#
# Overview of topics:
# ===================
# a) Installation of R, RStudio and R packages
# b) Getting help
# c) Basic operations
# d) Vectors
# e) Matrices
# f) Lists
# g) Data frames
# h) Finding built-in datasets
# i) Data import
# j) Basic statistics
# k) Network data in R
# l) Creating networks
# m) Creating the LGang network
# n) Vertex/node attributes
# o) Plotting networks
# p) Basic network statistics
# q) Node centrality
# r) Dyad and triad census
# s) Distances
# t) Largest component
# u) igraph, statnet, tnet, dynamicnetwork, rSonia and more..
#
#
# ============================================
# a) Installation of R, RStudio and R packages
# ============================================
#
# To install R, follow these steps:
# >> Go to http://cran.r-project.org/
# >> Select mirrors on the left-hand menu.
# >> Select the location nearest you.
# >> Select the link for your operating system in the
# >> "Download and Install R" section.
# >> Click on base.
# >> Follow the instructions for download.
#
# To install RStudio:
# >> Go to http:/http://www.rstudio.com/ide/download/
# >> Download and Install RStudio
# >> Follow the insructions
install.packages("network")
# for network data storage & handling
install.packages("sna")
# contains 'classical tools' from sociology tradition
install.packages("coda")
# for Markov Chain Monte Carlo diagnostics
install.packages("ergm")
# for exponential random graph models
install.packages("latticeExtra")
# for exponential random graph models
install.packages("beanplot")
# some fancy graphs
install.packages("lme4")
# random effects logistic regression
# Installation does not yet make the commands in a package available
# for use inside R. To make these commands available, add them to the
# current namespace by typing:
library(network)
library(sna)
# Now your "R namespace" contains the commands in these two packages,
# i.e., R will from now on understand what you mean when you want to
# use these commands.
# ===============
# b) Getting help
# ===============
#
# You can get information about functions (such as their inputs, outputs, and purpose)
# by using the "help()" function. Help files are important as they describe the
# arguments to the functions as well the values that result from running the function.
#
# For example, say we wanted to calculate the density of a network.
# You could type
help(gden)
# and learn how to do it. An alias for this command is "?gden".
#
# If you search for a function but are not sure of the name, type
help.search("gden")
# which gives you function names that might match. An alias for this command is "??gden".
#
# So, what about reciprocity indices available in 'sna'?
help.search("reciprocity")
# If you search for object and function names which include an exact text quote,
# use the function "apropos()".
apropos("rec")
# This will give a list of available commands that contain the quote.
# It also is possible to look up all the functions within a package, by typing:
help(package=sna)
# This shows you the range of things you could do using that particular package.
# ===================
# c) Basic operations
# ===================
#
# Assigning values to variables:
a <- 20
# An alias for this would be "a=20"
#
# Looking at the assigned object:
a
# should give value "20".
#
# Basic calculations:
a+10
a/10
sqrt(a)
# Storing calculation results in new variables:
d=sqrt(a)
d
# Alternatively, you could have written "d<-sqrt(a);d". The semicolon separates
# command lines just like a carriage return.
#
# Logicals:
d==a
# should give value "FALSE" (shorthand "F" or "0" when used in assignments).
d!=a
# should give value "TRUE" (shorthand "T" or "1" when used in assignments).
d=a
# To keep an overview of the available objects, type
ls() # an alias is 'objects()'
# which give a list of all objects. To remove an object, type
rm(d) # an alias is 'remove()'
ls()
# it should be gone now.
# ==========
# d) Vectors
# ==========
#
# Creating a vector:
v=c(2,3,4)
v
# Looking up the second element in the vector:
v[2]
# You can create a longer vector from two smaller vectors:
v2=c(v,v)
v2 = c(2,3,4,2,3,4)
# Mixing text (character type) with numerics makes it all a character vector:
v3=c(2,3,"apple")
v3
# Check this:
is.character(v3)
is.character(v2)
# Creating vectors quickly using replication and sequence:
#
# To create a vector that goes from 1 to 10 by steps of 1, type
1:10
# To create a vector that goes from from 2 to 10 by steps of 0.5, type
seq(2,10,by=.5)
# To create a vector of 100 3s, type
rep(3,100)
# You can also replicate vectors:
rep(c(3,2),100)
# There are some more fancy options of the replicate function:
#
# Doing each element 100 times:
rep(c(3,2),each=100)
# Vary the number of replications for each element:
rep(c(3,2),time=c(1,5))
# Now, consider operations on a vector. First define a nice one:
a=rep(c(3,2),time=c(1,5))
# Elementwise operations:
a*3
a+1
a+a
# Logicals for vectors:
a==3
# How many elements are equal to 3?
sum(a==3)
length(a[a==3])
# The second way of getting this count is interesting, as it relies on a
# pre-selected vector "a[a==3]"... how does that work, exactly?
#
# Create another vector of same length:
b=2*a
b
# Now keep only those elements for which the corresponding entry in vector a
# is equal to the value 3:
b[a==3]
# This will help later on selecting subsets of actors from larger data sets!
# ===========
# e) Matrices
# ===========
#
# Creating a matrix:
mat <- matrix(1:36, nrow=6,ncol=6)
mat
mat[1,] # getting the first row
mat[,1] # getting the first column
mat[2,3] # getting the element in the 2nd row and 3rd column
mat[1:3,] # getting the 1st through 3rd rows
mat[1:3,1:3] # getting the 1st through 3rd rows
# and the 1st through 3rd columns
t(mat) # transposing the matrix
# We can also make matrices using cbind or the rbind command.
# cbind=puts together as columns:
mat2=cbind(1:10,21:30)
mat2
# rbind puts together as rows:
mat3=rbind(1:10,21:30)
mat3
# Some more functions for matrices:
dim(mat2) # check the number of rows and columns using dim
nrow(mat2) # check the number of rows
ncol(mat2) # check the number of columns
# We can operate on the matrix 'cellwise':
mat4 <- mat2*2 # scalar times matrix
mat4
mat2+mat4 # sum per cell
mat2>4 # logicals
# Subset a matrix
mat4[1:3,1:2]
# Subset using a vector
binvec<-c(TRUE,FALSE,FALSE)
binvec
binvec<-(rep(c(TRUE,FALSE),5))
binvec
mat4[binvec,]
# ..or do more complex matrix multiplication:
mat2 %*% mat3
# ========
# f) Lists
# ========
#
# Using lists can be a good way to store data. Anything can be put into a
# list, next to one another: vectors, matrices, characters - without altering
# the type of the other elements in the list.
#
# The results of analytical procedures often come in this format.
alist=list(1:6,mat2,"some text")
alist
alist[[1]] # pulling the first part of the list
alist[[2]] # pulling the second part of the list
alist[[3]] # pulling the third part of the list
alist[[1]][5] # get the 5th element in the first part of the list
# It would be nicer to work with names instead of indices!
# Put names on the list:
alist=list(element1=1:6,element2=mat2)
alist
# Finding out the names a list type object has:
names(alist)
# Using the names to grab the first part of the list
alist$element1
# What if calling the name without the list?
element1
# Should result in an error. But after you type
attach(alist)
# the names of "alist" become general vocabulary, and typing
element1
# yields the desired result.
# To un-attach the names of "alist", type
detach(alist)
# When you now again type
element1
# you should again get the error message you got earlier.
#
# Attaching names can help typing effort!
# ==============
# g) Data frames
# ==============
#
# Data frames are special lists which contain variables with the same number
# of row entries. To learn more, type
?data.frame
# An example:
dat <- data.frame(id=1:6, gender=rep(c("M","F"),3))
dat
# Calculations 'as usual, e.g.:
dat[,1]*8
# Text variables are treated as categorical data:
class(dat[,2])
# The sex character has type "factor"; in many analytical methods this means
# the first category is used as reference category & the data are
# interpreted as nominal; actually "factor" is an ordinal type.
?factor
# To turn the variable type back into character, type
dat[,2] <- as.character(dat[,2])
dat[,2]
# ==============
# h) Finding in-built data
# ==============
#
# Many packages have built-in data for testing and educational purposes
data() # lists them all
?USArrests # get help on a data set
data(USArrests) # load the data set
USArrests # view the object
# R's workhorse is the "plot" command
plot(USArrests$Murder,USArrests$UrbanPop) # using dollar sign notation
plot(USArrests$Murder,USArrests$UrbanPop,log="xy") # log-log scale
# Adding plot title and axis labels
plot(USArrests$Murder,USArrests$Assault,xlab="Murder",ylab="Assault",main="USArrests")
# You can also add text.
plot(USArrests$Murder,USArrests$Assault,xlab="Murder",ylab="Assault", main="USArrests",type="n")
text(USArrests$Murder,USArrests$Assault,rownames(USArrests))
# Histograms and boxplots are often helpful.
hist(USArrests$Murder)
boxplot(USArrests)
# ==============
# i) Data import
# ==============
#
# We will often want to read in data from a saved location. See
?read.table
?read.csv
# For example, let's read in some data on the employees of the IFM
# which should be saved somewhere on your computer.
#
# First, use
getwd()
# which shows what R considers to be the current working directory.
# Then use
setwd("/Users/thomasgrund/OneDrive/TEACHING/Zurich 2016/data")
# Sets the working directory to the location where the data is stored.
# NOTE: YOU NEED TO CHANGE THIS TO WHERE YOU STORED THESE FILES!
# Note that R does not like Windows single backslashes "\" in directory paths.
# Replace them either by forward slashes "/" or double backslashes "\\" as in the example.
list.files()
# Read the attribute data:
LG.attrib <- read.csv("LGang-attributes.csv")
# Let's take a look:
head(LG.attrib)
# ===================
# j) Basic statistics
# ===================
#
# Average, standard deviation, quartiles:
mean(LG.attrib$age) # calculating the mean gpa in our dataset
sd(LG.attrib$age) # calculating the standard deviation of age
quantile(LG.attrib$age) # looking at the quartiles(! default)
hist(LG.attrib$arrests,main="Arrests") # a histogram
# As shown above, you can get rid of always writing "LGattrib$"
# as follows:
attach(LG.attrib)
# Some more examples of what is possible is basic R:
table(birthplace) # tabulation of the data
mean(arrests[age<18]) # mean arrests of non-adults
mean(arrests[age>=18]) # mean arrests of adults
cor(age,arrests) # correlation
plot(birthplace,arrests) # boxplots of arrests by ethnic background
plot(density(arrests)) # a density plot of arrests
plot(arrests,convictions) # a scatterplot of social by ego motivation
# etc.
# You can make the attach command undone by this command:
detach(LG.attrib)
# ====================
# k) Network data in R
# ====================
#
# Before we can describe, plot and ultimately run statistical models on our network
# we need to get the network into R and into a useable format.
# Make sure that network package is loaded
library(network)
# List available datasets in network package
data(package="network")
# Load a built-in data set; see ?flo for more
data(flo)
# Examine the flo adjacency matrix
flo
# For more information. . .
?data
?flo
# ====================
# l) Creating networks
# ====================
#
# Initialze an empty network with 5 nodes
g <- network.initialize(5)
# Add an edge from 1 to 2
g[1,2] <- 1
# Add edges from 2 to everyone else
g[2,] <- 1
# Examine the result
g
# Create an adjacency matrix
m <- matrix(0, nrow=5, ncol=5)
# Add entries from 3 to 4 and 5
m[3,4:5] <- 1
# Add more entries
g[m>0] <- 1
g
# Delete edges
# Remove the edge from 3 to 5
g[3,5] <- 0
# It's gone!
g
# Remove all edges
g[,] <- 0
# Now, an empty graph
g
# For more information. . .
?network.extraction
?add.edge
?delete.edges
?delete.vertices
?get.edges
?upper.tri
# ====================
# m) Creating the LGang network
# ====================
#
# There are lots of different formats, but the most common are edgelists and matrices.
# It is also possible to read in from the pajek format, see "?read.paj".
#
# We are going to work with the LGang co-offending network which should be saved
# as a matrix somewhere on your computer. The network corresponds to the data we've
# already been using above so the ids on the data link to the ids on the network.
# Let's first read in the LGang co-offending network in the MATRIX format and make it
# a network object. For reading in the network as a matrix of 1s and 0s, type...
# (Make sure that the working directory is set correctly before executing this)
LG.mat <- read.csv("LGang-matrix.csv",header=TRUE)
LG.mat
# Grab the ids from the input matrix:
ids <- LG.mat[,1]
ids
# Take off the first column (as it is the ids):
LG.mat <- LG.mat[,-1]
# The entries in the matrix are valued: 0 = no relation, 1 = haning out with each other, 2+ = co-offending
# Let us generate the unvalued co-offending network by replacing the values in the matrix.
# First, set all 1's to 0s'. Second, set everything equal or greater than 2 to 1.
LG.mat[LG.mat==1]<-0
LG.mat[LG.mat>=2]<-1
# What class does the read object have?
class(LG.mat)
# Transform into network object:
LG.mat <- as.matrix(LG.mat)
class(LG.mat)
LG.net <- network(LG.mat,directed=FALSE)
class(LG.net)
LG.net
# ====================
# n) Vertex/node attributes
# ====================
#
# Now that we have our network we want to put the characteristics
# of the people onto the network.
# We've already read in the LGattrib data so we can use that.
# Put ethnicity onto the network:
LG.net %v% "ethnicity" <- as.character(LG.attrib$birthplace)
# The 'as.character()' is necessary because variables of the class
# 'factor' cannot be attached to network objects!
# NOTE that you need to make sure that the characteristics (here LG.attrib$birthplace)
# are in the same order as the ids on the network, otherwise the data won't
# match up with people in the network.
# Notice, you can also set node (vertex) attributes with "set.vertex.attribute". See
?set.vertex.attribute
# Check this by comparing the id vectors:
sum(ids==LG.attrib$ID)
# Should be 54, if the ids match up.
# Add some more attributes:
LG.net %v% "age" <- LG.attrib$age
LG.net %v% "arrests" <- LG.attrib$arrests
LG.net %v% "convictions" <- LG.attrib$convictions
LG.net %v% "prison" <- LG.attrib$prison
LG.net %v% "rank" <- LG.attrib$rankrev
#For more information. . .
?attribute.methods
# ====================
# o) Plotting networks
# ====================
#
# It is important to get impressions of a network before jumping into
# any statistical analysis.
# Are there indegree- or outdegree outliers?
sociomatrixplot(LG.net,diaglab=FALSE,cex.lab=.5)
# Use plot.network from 'network' package. Notice that automatically
# learns that the object to be plotted is a 'network'. And then it looks
# through various 'plot' functions and selects the one which accepts
# a network.
plot(LG.net)
plot.network(LG.net)
# Use different layouts
plot(LG.net, mode="circle")
# Colour the nodes by ethnicity:
plot(LG.net,vertex.col="ethnicity")
# Scale the vertex by arrests:
plot(LG.net,vertex.col="ethnicity",vertex.cex="arrests")
# Oops! scale size down a bit:
plot(LG.net,vertex.col="ethnicity",vertex.cex=1 + LG.attrib$arrests/10)
# But we could als use the 'gplot' command from the 'sna' package. This
# command is a bit more powerful, but it does not really matter what you
# choose for now.
gplot(LG.net)
# Notice that the 'gplot' does not automatically know that the network is
# undirected.
gplot(LG.net, gmode="graph")
# Use different layouts
?gplot.layout
gplot(LG.net, gmode="graph", mode="mds")
gplot(LG.net, gmode="graph", mode="random")
gplot(LG.net, gmode="graph", mode="kamadakawai")
# Notice there are many more ways to plot networks. e.g. plot.igraph.
#
# ===========================
# p) Basic network statistics
# ===========================
# Check out what you can do with the network using the 'sna' package
help(package=sna)
# Some important graph level statistics:
gden(LG.net) # density
# If there would be only triads in the network, this score would be 1.
gtrans(LG.net)
# If the network were directed calculate reciprocity index
?grecip
# ===========================
# q) Node centrality
# ===========================
# Degree distribution:
LG.degree <- degree(LG.net,cmode="outdegree")
LG.degree
hist(LG.degree)
quantile(LG.degree)
# Calculate betweenness centrality
LG.between<-betweenness(LG.net)
LG.between
hist(LG.between)
# For more information. . .
?betweenness
?bonpow
?closeness
?degree
?evcent
?graphcent
?infocent
?prestige
?stresscent
# From centrality to centralization
centralization(LG.net, degree, cmode="indegree")
# Eigenvector centralization
centralization(LG.net, evcent)
# For more information. . .
?centralization
# ===========================
# r) Dyad and triad census
# ===========================
# Dyad and triad census:
dyad.census(LG.net)
triad.census(LG.net)
# ===========================
# s) Distances
# ===========================
# Calculate the distance matrix:
LG.dist <- geodist(LG.net)
LG.dist$gdist
table(LG.dist$gdist)
hist(LG.dist$gdist)
plot(table(LG.dist$gdist),type='b')
# Calculate the reachability matrix:
LG.reach <- reachability(LG.net)
LG.reach
table(LG.reach)
# ===========================
# t) Largest component
# ===========================
# Sometimes we wanto extract the largest component of the network.
#
# Check for isolates in the network.
isolates(LG.net)
# Indentify the connected component the nodes belong to
component.dist(LG.net)
# Are nodes part of the largest component?
lgc<-component.largest(LG.net)
lgc
# Which nodes are not part of the largest component? Of course, all isolates,
# but maybe some other nodes form their own little component?
nlgc<-lgc==FALSE
nlgc
# Get the ID's of the nodes who are not part of the largest component
nlgc.id<-seq(1:54)[nlgc==TRUE]
nlgc.id
# Build a new network which only consists of the largest component
LG.lgc<-LG.net
delete.vertices(LG.lgc,nlgc.id)
LG.lgc
plot(LG.lgc)
# ===========================================================
# u) igraph, statnet, tnet, dynamicnetwork, rSonia and more...
# ===========================================================
#
# In R there are also other packages that you can use for SNA.
# also many other useful tutorials on how to use R for SNA. Basically, you
# just need to play around, follow some example scripts (like this one) and
# eventually venture off yourself.
#
# Check out the website: http://www.stats.ox.ac.uk/~snijders/siena/
#
# Check out Tom Snijders Intro to SNA and R: http://www.stats.ox.ac.uk/~snijders/sna_intro.r
# Check out Michal Bojanowski Intro to SNA and R: http://bojan.3e.pl/bojanorama/doku.php?id=snar:start&redirect=1
install.packages("igraph")
install.packages("statnet")
install.packages("tnet")
help(package=igraph)
help(package=statnet)
help(package=tnet)