# ==================================================================
#
#
# Advanced Social Network Analysis
#
# UZH, 31 Oct - 4 Nov 2016
#
# Lecturer: Thomas Grund, thomas.grund@ucd.ie
#
# ==================================================================
#
#
# Session 3: Introduction to exponential random graph models
#
# ==================================================================
#
# The R-packages 'ergm', 'network', 'coda' and 'sna' need to be
# available.
#
# Overview:
# a) Installation & reading data into R
# b) ERGM analysis of the LGang network (undirected)
# c) ERGM diagnostics and goodness-of-fit
# d) ERGM analysis of the Gomery network (directed)
#
# =====================================
# a) Installation & reading data into R
# =====================================
install.packages("statnet")
# Load the vocabulary of the neccessary packages:
library(statnet)
# Sets the working directory to the location where the data is stored.
setwd("/Users/thomasgrund/OneDrive/DUBLIN TEACHING/Network Dynamics 2015/data")
# NOTE: YOU NEED TO CHANGE THIS TO WHERE YOU STORED THESE FILES!
# Note that R does not like Windows single backslashes "\" in directory paths.
# Replace them either by forward slashes "/" or double backslashes "\\" as in the example."
# Read in the LGang network and LGang attributes as we did before:
LG.mat <- read.csv("LGang-matrix.csv",header=TRUE)
LG.mat <- LG.mat[,-1]
# Recode the values in teh network
LG.mat[LG.mat==1]<-0
LG.mat[LG.mat>=2]<-1
class(LG.mat)
# Ah... LG.mat is still a dataframe. We first need to make a matrix out of it before
# we can create a network object with it....
LG.mat<-as.matrix(LG.mat)
# Create a network object. To do so we need to transform the data.frame in a matrix
# first. Here, we do this without assigning the matrix a new name. In R, you can always
# embedd commands like this...
LG.net <- network(LG.mat,directed=FALSE)
# Read the LGang attributes and put them onto the network object:
# Read the attribute data:
LG.attrib <- read.csv("LGang-attributes.csv")
# Put ethnicity and prison onto the network object:
LG.net %v% "ethnicity" <- as.character(LG.attrib$birthplace)
LG.net %v% "prison" <- LG.attrib$prison
LG.net %v% "age" <- LG.attrib$age
LG.net %v% "rankrev" <- LG.attrib$rankrev
# Plot it just as a reminder and color the nodes according to ethnicity
plot(LG.net,vertex.col="ethnicity")
plot.sociomatrix(LG.net)
# ==================================================
# b) ERGM analysis of the LGang network (undirected)
# ==================================================
#
# Basically, in ERGM`s we simulate lots and lots of networks using certain
# network motifs (specified by the researcher) and then try to estimate
# parameters for these motifs (similar like logistic regression coefficients)
# in such a way that the observed network is most likely to to be drawn from the
# simulated distribution of networks (estimates are derived using Markov Chain
# Monte Carlo Maximum Likelihood Estimation). ERGM`s are specifically designed
# to consider multiple network motifs and effects (for example, several homophily
# effects) simultaneously. Furthermore, ERGM`s can be specified to include structural
# effects like triadic closure, balance, andso on. One unique feature of ERGM`s is
# that essentially the presence of a network tie is modeled conditional on other
# network configurations. At the core of an ERGM is the Markov assumption, i.e.
# network ties are modeled conditional on the present network in continuous time.
# Hence, a generative process is modeled and MLE is performed based on this process.
#
# One distinguishing feature of ERG models, in contrast to earlier attempts in modeling
# networks is their ability to accommodate dependencies between the random variables.
# Earlier, so called p1 models (Holland and Leinhardt 1981) have long been popular and
# implied the highly unrealistic assumption of dyadic independence, i.e. the probability
# for existence of a tie is independent from the existence of any other tie. An extension
# has been the introduction of p2 models (Lazega and van Duijn 1997), which also assume
# dyadic independence, but conditional on node-level attribute effects. Most prominently
# in p2 models, sender and receiver are included as random effects. Some nodes are more
# and others less likely to send/receive ties. Together with actor and dyadic effects
# (e.g. similarity of nodes) this yields more complex, but also more realistic models.
# ERG models (in this tradition also sometimes referred to as p* models), generalize these
# previous attempts, but do not require the assumption of dyadic independence anymore. In
# networks, the probability for a tie to exist most often depends on the presence of other
# ties in the network.
#
# In ERGM's ties are endogenous to the estimation process. That means each tie is estimated
# in a way that considers that this tie affects the presence of others ties and so on. This
# is why we also call these models dyadic dependence models.
#
# Puh.... not easy, I know.
#
# The important help files are these:
?ergm
help("ergm-terms")
?mcmc.diagnostics
?gof
# Now finally, let us estimate some ERGMs for the LGang network!
#
# Begin with a Bernoulli random graph model / Erd?s-R?nyi model:
# First we formulate a model and t
model1 <- LG.net~edges + nodefactor("ethnicity")
# The summary method gives the "target statistics", here: the tie count.
summary(model1)
# Let's estimate this model:
results1 <- ergm(LG.net~edges)
summary(results1)
# The EDGE effect can be best interpreted as some kind of intercept.
# Checkout the odds for a tie to exist in contrast to the odds that the
# the ties does not exist.
133 / ((54 * 53 / 2) - 133)
# And now compare this with the exponentiated coefficient we got from the model.
exp(-2.27823)
# As we do know that ethnicity homophily matters from our previous analyses,
# let us check this using an ergm. There are a lo of terms that could be included.
help("ergm-terms")
# Think of these terms and interpret the estimates you derive for them a bit like logistic
# regression coefficients: How much more likely is a tie between i and j to exist in the
# network when i and j match on ethnicity?
model2 <- LG.net~edges+nodematch("ethnicity")
summary(model2)
results2<-ergm(model2)
summary(results2)
# Note that we get a beta estimate of 0.8523 exactly as before when we ran the
# netlogit analysis! So far so good. Both models are still equivalent.
# But now we can move on and check if the ethnicity homophily effect is equally
# strong for all ethnic groups.
model3<-LG.net~ edges + nodematch("ethnicity", diff=TRUE)
summary(model3)
results3<-ergm(model3)
summary(results3)
# Okay, this seems to be important! The ethnicity homophily effect is much
# stronger amongst Africans, less amongst Caribbeans and least among British.
# This is in line with the graphical depiction (see our EJC paper)
# based on the relational contingency table analysis in Ucinet.
# Question: But what about the activity of certain ethnic groups? Maybe Africans
# just co-offend more than British and that is why we see more homophily between them?
# Check the number of co-offending ties per ethnic group.
d<-degree(LG.net)
# Alternatively, we could also get the ethnicity directly from the attribute object and write:
# ethnicity<-LG.attrib$birthplace
ethnicity <- LG.net %v% "ethnicity"
boxplot(d~ethnicity )
# It looks indeed that the British just co-offend less. This could explain why our previous
# model did not pick up a British homophily effect. Let us include now effects for attributes
# of nodes, i.e. we model the possibility that ties from members of different ethnic groups are
# or more less likely.
model4<-LG.net~ edges + nodematch("ethnicity", diff=TRUE) + nodefactor("ethnicity", base=3)
summary(model4)
results4<-ergm(model4)
summary(results4)
# Now ERGM's become really interesting because tie changes are endogenous to the model.
# We do not have to assume dyadic independence as before. Basically, the model conditions on
# the presence of all other ties. Before we just used logistic regression to see if a tie
# value is one or zero. But implicitly we still had to make the assumption that our
# observations are independent, one has to do that when making a 'normal' regression.
# But most likely in networks the presence of one tie is not independent from the
# presence of other ties. Think of triadic closure. This general tendency says that a
# tie between A-B is more likely if there are already ties between A-C and B-C. If this
# tendency exists on our network, the presence of the tie A-B is not independent from
# other ties! In theory we could simply generate a new variable 'closes_triad' and assign
# it the score one if a dyad closes a triad or not. While in principle this strategy makes
# sense, it is a bit like cheating because our effect captures dependence of ties, but the
# actual estimation still assumes tie independence... That means, we do not capture the
# temporal sequence of how the network came about in the first instance and the role the
# triadic effect might have played in this evolution. So, if you are still with me, you
# could see now that a dyadic independence model (like 'netlogit' or p2) is absolutely fine
# with under one crucial condition: that all ties formed at exactly the same moment in time!
# Obviously, this is highly unrealistic. Networks do not come out of nowhere even when we
# just see a snapshot of them.
# ERGM's take care of that and we can estimate e.g. triadic closure. GWESP is a special effect
# that measures the amount of triadic closure in a network. The reason why we use GWESP and
# not e.g. a simple triad count has to do with what is called 'degeneracy'... but that stuff
# is really complicated. Basically, if one includes simple measures of triadic closure, as one
# used to until not that long ago (well, all these things are pretty new), the networks that
# one simulates (remember ERGM's simulate lots and lots of networks and then adjust parameter
# estimates so that the observed network is most likely to come from the simulated distribution)
# often do not make sense anymore. They are extreme or 'degenerated', e.g. full network.
# Check out the 'edgewise shared partners' distribution (a measure of higher order clustering):
#
# Example: esp2=29 means that in 29 cases the two nodes that are connected by a co-offending link
# have exactly 2 additional co-offending partners in common.
#
# Example: esp4=7 means that for 7 edges between i and j, the nodes i and j have exactly 4 other
# nodes k's as co-offending partner in common.
esp.distr <-summary(LG.net~esp(0:10))
esp.distr
plot(0:10,esp.distr, type="b", xlab="number of edgewise shared partners", ylab="frequency")
# This should be the same as the total number of edges we have. Juhuu!
sum(esp.distr)
LG.net
# Okay, let us go back at a simpler model we used before. Here we see a ethnicity homophily effect
# with 0.85, as we had seen in previous analyses
summary(model2)
summary(results2)
# The GWESP expression gives us a global statistic, which sums over this distribution and
# multiplies the number of edges who have k other co-offenders in common. And while doing so,
# it weights the values in this distribution according to a geometric function. Basically,
# the number of edges who have a lot of common neighbors k are weighted less when summing over
# the esp distribution. To get such a GWESP score we need to assing a parameter alpha. This score
# can very between 0 and 1. Scores between 0.25 and 0.75 make sense and should be choosen
# according to which parameter gets a better fit.
model2b<-LG.net~ edges + gwesp(0.5,fixed=TRUE) + nodematch("ethnicity")
results2b<-ergm(model2b)
summary(results2b)
# One can also leave alpha unfixed. Then the ergm program finds an appropriate tau. But this can
# take some time...
model2c<-LG.net~ edges + gwesp() + nodematch("ethnicity")
results2c<-ergm(model2c)
summary(results2c)
# The important thing is that including a GWESP effect, 'controls' for triadic
# closure. Finding triadic closure is not a huge surprise. We find it in nearly
# all social networks we look at. However, it is still substantively interesting.
#
# You notice that including structural effects, such as GWESP for triadic closure
# make the estimation take longer. Actually, what we see now is that once we control
# for triadic closure in the model, the actual ethnicity homophily is less pronounced.
# The effect drops to 0.54. Hence, a non-negligible amount of co-offending between
# members from the same ethnicity is in fact due to triadic closure.
# Question: Is there also homophily according to age, prison or rank? Is a tie more likely
# between gang members who are similar on these dimensions?
model5<-LG.net~ edges + gwesp() + nodematch("ethnicity") + nodematch("prison") + absdiff("age") + absdiff("rankrev")
results5<-ergm(model5)
summary(results5)
# Besides ethnicity we also find evidence for age homophily. The estimate for absdiff.age is
# siginificant and negative. That means a tie is less likely when the age difference between
# two gang members is high. Remember the nodematch.ethnicity variable is 1 when the two gang
# members have the same ethnicity and 0 otherwise. That is why here a positive estimate stands
# for a homophily effect, while for age a negative estimate indicates homophily... There is no
# evidence for prison or rank homophily though.
# =======================================
# c) ERGM diagnostics and goodness-of-fit
# =======================================
# Nice, but did the model converge, actually?
# Let us look at the model diagnostics, i.e., the sample of network
# statistics that were simulated.
mcmc.diagnostics(results5)
# This looks 'nice' - there is no trend or cyclicity or weird pattern
# in the temporal plot of statistics.
#
# Now let us look at the goodness of fit!
# Does the model reproduce some global features of the network?
# We check whether the empirical distribution of degree, shared partner
# cound the number of triads can be explained by the model:
# Check the triad census again.
triad.census(LG.net)
par(mfrow=c(2,2))
fit5 <- gof(results5~degree + espartner + triadcensus,nsim=40,burnin=200000)
plot(fit5)
# This is an excellent fit! Our model really manages to generate simulated
# networks that 'look' like the observed network. This is great. If that would
# not be the case we would have a problem. We would have to specifiy our model
# differently. Unfortunately, one needs a lot of experience to this properly.
# It can be a bit like an art.
# How would a bad model fit look like?
# Check one of our first models again.
summary(results2)
par(mfrow=c(2,2))
fit2 <- gof(results2~degree + espartner + triadcensus,nsim=40,burnin=200000)
plot(fit2)
# =========================================
# d) ERGM analysis of the Gomery network (directed)
# =========================================
# Let us see if we have some time left...
# This network is different because it is 'directed'
# Load network and the role attribute (VER, POL, PRIVE, ETAT).
# Yanick, what do they stand for again?
gomery.mat<-as.matrix(read.csv("gomery-mat.csv", header=F))
class(gomery.mat)
gomery.net<-network(gomery.mat)
gomery.att<-read.csv("gomery-role.csv", header=T)
gomery.role<-gomery.att$role
gomery.net %v% "role" <- as.character(gomery.role)
plot(gomery.net, vertex.col="role")
# For some practical reasons, let us focus only on those nodes who have some
# sort of tie. How can we extract a subgraph? There are different ways...
# First, we generate a vector which has the degree for each node (in + outdegree)
gomery.degree<-degree(gomery.net)
gomery.degree
# Now, let us generate a boolean vector of the same length as the degree vector,
# which takes the value true if the degree is greater than zero.
connected<-gomery.degree>0
connected
# A simple way to subgraph a network is to only select the appropriate rows
# and columns we want in the original matrix
gomery.mat.connected<-gomery.mat[connected, connected]
dim(gomery.mat.connected)
dim(gomery.mat)
# Generate a new network object
gomery.net2<-network(gomery.mat.connected)
# Do the same kind of subsetting for the attribute
gomery.att2.connected<-gomery.att[connected,]
gomery.role2<-gomery.att.connected$role
gomery.net2 %v% "role" <- as.character(gomery.role2)
plot(gomery.net, vertex.col="role")
model1<-gomery.net ~ edges + gwesp() + mutual() + nodematch("role", diff=T)
results1<-ergm(model1)
summary(results1)
par(mfrow=c(2,2))
fit <- gof(results1~idegree + odegree + espartner)
plot(fit)