# ==================================================================
#
#
# Advanced Social Network Analysis
#
# UZH, 31 Oct - 4 Nov 2016
#
# Lecturer: Thomas Grund, thomas.grund@ucd.ie
#
# ==================================================================
#
#
# ==================================================================
#
# Session 2: Modeling networks
#
# ==================================================================
#
# The examples make use of the LGang data. This data includes attributes
# and the co-offending network of a youth gang. Data were collected by
# James Densley (Metropolitan University) and Thomas Grund (UCD).
#
#
# Overview of topics:
# ===================
# b) Load the LGang network, plot it and characterize it
# c) Simulate networks with certain characteristics
# d) Network inference assuming conditional independence
# e) Network inference using permutation tests, QAP
# f) Regression based approaches, logistic regression
# g) Regression based approaches, multivariate
# h) p2 model - random ego and alter effects
#
# Load the packages used throughout: 'network' for the network data structure,
# 'sna' for descriptive and inferential network analysis.
library(network)
library(sna)
# Sets the working directory to the location where the data is stored.
setwd("/Users/thomasgrund/OneDrive/DUBLIN TEACHING/Network Dynamics 2015/data")
# NOTE: YOU NEED TO CHANGE THIS TO WHERE YOU STORED THESE FILES!
# Note that R does not like Windows single backslashes "\" in directory paths.
# Replace them either by forward slashes "/" or double backslashes "\\" as in the example.
list.files()
# lists the files in the directory, so you can check the data files are there.
# ======================================================
# b) Load the LGang network, plot it and characterize it
# ======================================================
# Read the LGang network, remove first column and recode the values as before:
LG.mat <- read.csv("LGang-matrix.csv",header=TRUE)
# Drop the first column (presumably row labels/IDs from the CSV -- confirm with the file).
LG.mat <- LG.mat[,-1]
# NOTE: the order of the next two recodes matters. Cells equal to 1 are zeroed
# FIRST, and only then are cells >= 2 collapsed to 1. If the lines were swapped,
# the first recode would create 1s that the second would immediately wipe out.
LG.mat[LG.mat==1]<-0
LG.mat[LG.mat>=2]<-1
# Note that LG.mat is a data.frame
class(LG.mat)
# Create a network object. To do so we need to transform the data.frame in a matrix
# first. Here, we do this without assigning the matrix a new name. In R, you can always
# embed commands like this...
LG.net <- network(as.matrix(LG.mat),directed=FALSE)
# Read the LGang attributes:
LG.attrib <- read.csv("LGang-attributes.csv")
# Attach ethnicity and prison attributes to the network object
# (%v% sets a vertex attribute on a network object):
LG.net %v% "ethnicity" <- as.character(LG.attrib$birthplace)
LG.net %v% "prison" <- LG.attrib$prison
# Plot the network
plot(LG.net)
plot(LG.net, vertex.col="ethnicity")
plot(LG.net, vertex.col="prison")
# Show both attributes in one network plot: node size encodes prison
# (sizes 1 or 3, assuming prison is coded 0/1 -- confirm with the attribute file),
# node color encodes ethnicity.
prisonsize<- (1 + 2 * (LG.net %v% "prison"))
plot(LG.net, vertex.cex=prisonsize, vertex.col="ethnicity")
# =================================================
# c) Simulate networks with certain characteristics
# =================================================
#
# One crucial thing in SNA is that even when networks are completely
# 'random' they exhibit certain non-random network features. For example,
# the centrality scores of nodes in random networks is not zero! In fact,
# the scores will be different depending on the density of the network.
# Hence, you cannot really compare the centrality scores of nodes over
# time when the overall density of your network changes over time as well.
# Another example is embeddedness in triads. When a random network
# is dense, nodes will have a higher chance to be embedded in triads by
# default. Hence, you cannot really say that in one network nodes are
# more clustered than in another if you do not consider this.
# Let us generate a directed random Erdos Renyi network with 100 nodes
# and 200 ties.
?rgnm
# Draw 1 directed graph with 100 nodes and exactly 200 edges.
Erdos.graph1<-rgnm(1, 100, 200, mode="digraph")
class(Erdos.graph1)
# Make a network out of this matrix
Erdos.net1<-network(Erdos.graph1)
Erdos.net1
plot(Erdos.net1)
# Another way of getting a random network is specifying the probability
# for a tie.
?rgraph
# Draw 1 directed graph with 100 nodes; each tie exists independently with probability 0.02.
Erdos.graph2<-rgraph(100, 1, tprob=0.02)
class(Erdos.graph2)
# Make a network out of this matrix
Erdos.net2<-network(Erdos.graph2)
Erdos.net2
plot(Erdos.net2)
# Check the density of the network
gden(Erdos.net2)
# Not exactly 0.02? Because ties are generated based on probability and not based on exact number.
# Generate 500 random networks (returned as a stack of 500 adjacency matrices).
Erdos.graphs<-rgraph(100, m=500, tprob=0.02)
# Histogram of the 500 graph densities -- they cluster around 0.02.
hist(gden(Erdos.graphs))
# Generate 50 random networks each for different densities: 0.05, 0.15, ... 0.85
# A clever way to do something like this is to use "lapply". This applies a function
# to each element in a vector and returns a list of results for each element.
# But first check out seq. It generates a vector that we save in d:
d<-seq(from=0.05,to=0.85,by=0.1)
# Now we use the vector d as an argument in lapply. lapply passes on each element of d to the
# function we define as argument, i.e. each element in d is subsequently passed on as x.
# For each density x: simulate 50 random graphs and record their densities.
dat.den <-lapply(d, function(x) gden(rgraph(100,m=50, tprob= x)))
# Also, calculate the absolute number of triangles in the networks, for this we use the option 'weakcensus'
dat.trans <-lapply(d, function(x) gtrans(rgraph(100,m=50, tprob= x), measure="weakcensus"))
# Now we have 9 list entries (one per density in d) where each one contains 50 density
# or number of triangles scores of the random networks
dat.den
dat.trans
# A nice way to plot these 9 different distributions is using a beanplot
library(beanplot)
beanplot(dat.den, names = d)
# Note that as the simulated network gets 'denser' (more ties) you will see
# more triangles in the networks. And even more so, this relationship is not
# linear!
beanplot(dat.trans, names = d)
# Let us look at node-level characteristics, here, betweenness. How
# does the baseline distribution of betweenness in a random network change
# when density is increased?
d <- seq(0.05, 0.5, by=0.05)
nodes <- 100
reps <- 50
# This is a more sophisticated application of lapply...
# For every density x we simulate 'reps' random graphs and collect the
# (rescaled) betweenness scores of all their nodes in one long vector.
# unlist(lapply(...)) replaces the original for-loop that grew a vector
# with c() on every iteration (an O(n^2) copy-on-append anti-pattern);
# the graphs are simulated in the same order, so the result is identical.
dat <- lapply(d,
  function(x) {
    unlist(lapply(seq_len(reps), function(i) {
      r <- rgraph(nodes, tprob = x)
      betweenness(r, rescale = TRUE)
    }))
  })
dat
# what=c(1,1,1,0) drops the per-observation "beanlines" from the beanplot
# (see ?beanplot), which would be unreadable with this many scores.
beanplot(dat, names=d, what=c(1,1,1,0))
# Simulate a network with a specific dyad census:
# exactly 50 mutual, 100 asymmetric and 40 null dyads on 20 nodes
# (20 nodes have 190 dyads in total, and 50 + 100 + 40 = 190).
?rguman
random.graph<-rguman(1, 20, mut=50, asym = 100, null = 40, method="exact")
dyad.census(random.graph)
# ======================================================
# d) Network inference assuming conditional independence
# ======================================================
#
# So far, we have seen how to calculate graph level indices like density
# and transitivity. But how do we know if the values are large, relative
# to some baseline?
#
# One approach is to generate a distribution of simulated, conditioned
# random networks, distill from this a distribution of the statistic of
# interest, and see where the observed value of the statistic is located
# in this distribution.
# Looking at transitivity:
# Question: Are there fewer unclosed triangles in the LGang network than one would expect
# by chance, given the size and number of edges that are there?
LG.trans <- gtrans(LG.net)
LG.trans
LG.density <- gden(LG.net)
LG.density
LG.size <- network.size(LG.net)
LG.size
# Back out the number of edges from the density. round() guards against
# floating-point error: density * n * (n - 1) / 2 can come out as e.g.
# 199.9999999, and rgnm() below needs an exact integer edge count.
LG.edges <- round(LG.density * LG.size * (LG.size - 1) / 2)
LG.edges
# Generate 200 undirected random graphs that have the same size and
# density as the LGang network.
Erdos.graphs <- rgnm(n=200, nv=LG.size, m=LG.edges, mode="graph")
# Plot one of these random graphs. Notice that it looks different from the
# LGang network. It does, however, have the same density.
Erdos.first<-network(Erdos.graphs[1,,], directed=F)
plot(Erdos.first)
plot(LG.net)
# Check the density of this random graph against the density of the
# observed network. They are exactly the same.
gden(Erdos.first)
gden(LG.net)
# Now calculate the transitivity on all these random networks that have
# the same size and density as the observed network.
Erdos.graphs.trans<-gtrans(Erdos.graphs)
# Kernel density plot of the 200 simulated transitivity scores.
plot(density(Erdos.graphs.trans))
# Plot it again and this time add a (dotted, lty=3) line for the observed transitivity
plot(density(Erdos.graphs.trans), xlim=c(0,0.45))
abline(v=LG.trans, lty=3)
# A test that does this for you: the conditional uniform graph (CUG) test.
# cmode="edges" conditions the simulated graphs on the observed edge count.
cg<-cug.test(LG.net,"gtrans",mode="graph", cmode=c("edges"), reps=10000)
cg
plot(cg)
# Basically, the result is that the probability for a random network with the same
# number of edges as the LGang network to have more triangles than the LGang network
# is practically 0. So, relative to a random network with the same amount of edges
# there would appear to be high clustering in the LGang network.
#
# Consult the following to find out how to condition on the whole dyad census:
LG.census<-dyad.census(LG.net)
cg2 <- cug.test(LG.net,gtrans,cmode="dyad.census")
plot(cg2)
?cug.test
# =================================================
# e) Network inference using permutation tests, QAP
# ==================================================
#
# Now we are going to take a slightly different approach and use
# permutation tests to answer questions about the features of a network.
# Specifically, we want to know if the individuals in the LGang are
# more likely to co-offend with someone who has the same ethnicity.
# Is homophily on ethnicity strong?
# We need a matrix showing if every pair i, j match on ethnicity or not.
# This is just another way to get hold of the attribute again. Let us check
# if what we obtain is indeed the same as in the LG.attrib object.
ethnicity <- get.vertex.attribute(LG.net,"ethnicity")
# Should be TRUE: the attribute round-trips through the network object.
identical(ethnicity,as.character(LG.attrib$birthplace))
# Generate the matrix of ethnicity matching (yes or no):
# outer(..., "==") gives a logical matrix; multiplying by 1 coerces it to 0/1.
ethnicity.matrix <- 1 * outer(ethnicity, ethnicity, "==")
ethnicity.matrix[1:10,1:10]
# (Try out what happens if you omit the "1*" in the matrix generation!)
# (Yes, there are many other ways in R to get the same matrix.)
# A first step would be to look at a simple correlation between the LGang
# co-offending network and the ethnicity matrix.
gcor(LG.net, ethnicity.matrix)
# It seems there is a positive correlation of 0.125. But is this significant?
# The problem is the following: Maybe there is another reason why we see such
# a correlation. It could be because of activity, i.e. we see more ties
# between gang members from a specific ethnic group, simply because the members
# of this group offend more. It could also be because of triadic closure, i.e.
# there is a general tendency for co-offenders of co-offenders to co-offend with
# each other :-) Or again, because A-B and B-C, a tie between A and C is more
# likely. If there is some tendency towards homophily it will be inflated by
# triadic closure. You will see a high correlation between ethnicity of co-offenders,
# but not all of it is due to homophily.
# Let us randomly permute the network (simultaneous row/column permutation).
# rmperm relabels the nodes at random, leaving the structure untouched.
LG.permuted<-network(rmperm(LG.net), directed=F)
# Check the density of the permuted graph against the density of the
# observed network. They are exactly the same as before.
gden(LG.permuted)
gden(LG.net)
# And now plot both networks. They look exactly the same as well.
plot(LG.permuted)
plot(LG.net)
# However, the nodes have been relabeled and there is no correlation anymore
gcor(LG.net, LG.permuted)
# Now we can calculate the correlation between the permuted network and the
# ethnic similarity matrix to get a null distribution for no ethnic homophily
# effect.
gcor(LG.permuted, ethnicity.matrix)
# We could do this many times now, but qaptest already does that for us...
# A permutation QAP test follows a simple logic. How would a test-statistic (e.g
# correlation) look like if we were to scramble the network? Practically,
# the nodes in the network are relabeled many times in a random fashion. Notice
# that such a relabeling keeps the whole structure of the underlying network
# intact. It, therefore, "controls" for structural effects.
# QAP test: correlate networks 1 and 2 of the list, reference distribution
# from 5000 random relabelings.
qap<-qaptest(list(LG.net, ethnicity.matrix), g1=1, g2=2, FUN=gcor, reps=5000)
summary(qap)
plot(qap)
# You see that our actual test-statistic (here, correlation) is much higher
# than what we would see if we were to scramble the network. To see a case where
# this is not so run the following:
# Generate two random graphs with 100 nodes (densities 0.1 and 0.5 respectively)
# and see how they would correlate if we scramble the networks.
r1<-rgraph(100, tprob=0.1)
r2<-rgraph(100, tprob=0.5)
plot(network(r1))
qap<-qaptest(list(r1, r2), g1=1, g2=2, FUN=gcor, reps=5000)
summary(qap)
plot(qap)
# Now you see that our observed correlation falls right into the distribution of
# correlations we would expect by chance, controlling for the underlying structure.
# Hence, the correlation between the two random networks is not significant.
# ===================================================
# f) Regression based approaches, logistic regression
# ===================================================
#
# Okay, let us change our general approach a little and run a network regression.
# The idea is to regress the network on some independent variables. Practically,
# we run a regression where the unit of analysis is the dyad between i and j. Each
# dyad is either zero or one; there is no tie or there is a tie. We can think of this
# as a binary outcome, hence, logistic regression makes sense. First we are going to
# do this in a classical sense for you to understand what is going on. Then we run
# the same thing in a much simpler way using 'netlogit'.
# First, we need to create the dependent variable "whether or not there
# is a tie in each dyad", as a long string of one's and zeros.
ties <- as.matrix(LG.net)
# Omit diagonal entries (self-ties are meaningless):
diag(ties) <- NA
ties
# Because LGang co-offending is a symmetric network, the (upper right) half
# of the matrix is fully informative - let's drop the other (lower left) half:
ties <- ties[upper.tri(ties)]
ties
# Do the same for the ethnicity match predictor variable, so that entry k of
# 'ties' and entry k of 'ethnicity.match' refer to the same dyad:
ethnicity.match <- ethnicity.matrix
ethnicity.match <- ethnicity.match[upper.tri(ethnicity.match)]
# The above strings out the ethnicity data so we can correlate having a tie
# with matching on ethnicity or not.
ethnicity.match
dyad.data <- data.frame(ties,ethnicity.match)
# Let's take a look at the data:
dyad.data[1:10,]
# This fits a simple logistic regression in R. The command is actually more
# general and allows you to fit a generalized linear model.
results<-glm(ties~ethnicity.match,family="binomial",data=dyad.data)
summary(results)
# The above specifies a model predicting the presence of a tie as
# a function of matching on ethnicity.
# Okay, after this excursion to 'regular' statistics, back to "sna"!
# We can do the same thing in a much simpler way (which also allows further
# extension) using 'netlogit'
# This command does exactly the same thing.
# Note that we can even assign the result of a regression to be stored in an object:
dyad.ind <- netlogit(LG.net,ethnicity.matrix,mode="graph",nullhyp="classical")
dyad.ind
# The above performs a dyadic independent logistic regression.
# The null hypothesis is that networks are random with the right number of edges
# (so the dyads are independent). The x1 estimate (=beta coefficient) is positive
# at 0.85 and this effect is significant with p = 4.14...e-06 = very small p.
# Basically, the regression says that the odds of a tie between two gang members
# are Exp(b) = 2.34 times higher when the two gang members have the same ethnicity
# (note: an odds ratio, not a ratio of probabilities). There is
# clear evidence for ethnic homophily in co-offending.
# Consult the following to find out other options of this function:
?netlogit
# Let us now do a permutation test with the same regression model!
#
# So, we use the same analysis, but with a different null hypothesis,
# i.e. our standard errors are calculated in a different way.
#
# Permutation regression randomly permutes the actor labels, reruns the regression
# each time, and compares the regression coefficients obtained from the observed
# data to the distribution of coefficients obtained for the permuted datasets.
# This makes sense as there are many other factors that influence the structure
# of the network, e.g. clustering. In the example above, we used as a reference
# point for our null hypothesis (that ethnicity does not matter for co-offending
# in terms of ethnicity) a Erdos-Renyi (random) graph. Permutation tests use
# permutations of the actual/observed network as a point of reference instead.
# Thus, the baseline refers to what ethnic similarity would look like (if it did
# not matter) on random networks that have similar features to the observed one.
# Standard deviations are derived in such a way. We also call analyses using
# such permutations the 'Quadratic Assignment Procedure Regression' (= QAP regression)
# Same regression model, but nullhyp="qapy" draws the reference distribution
# by QAP-permuting the dependent network (see ?netlogit for the variants).
dyad.qap <- netlogit(LG.net,ethnicity.matrix,mode="graph",
nullhyp="qapy",reps=100)
# The above estimates the same parameters as the 'glm' model above
# but it obtains standard error equivalents using the permutation test.
dyad.qap
# It would appear that homophily on ethnicity is strong in the network.
# ============================================
# g) Regression based approaches, multivariate
# ============================================
#
# Question: Is there homophily on ethnicity even after controlling for
# prison homophily (and underlying structural patterns)?
# 1 if i and j have the same prison status, 0 otherwise.
prison.matrix<-1 * outer(LG.attrib$prison, LG.attrib$prison, "==")
prison.matrix[1:10,1:10]
# In order to have more than one independent variable we need to pass the variables
# on as a list:
dyad.qap <- netlogit(LG.net,list(ethnicity.matrix, prison.matrix),mode="graph",
nullhyp="qapy",reps=100)
dyad.qap
# Now the estimate X1 refers to ethnicity similarity and X2 to prison similarity.
# First there is no evidence for prison homophily and second this also does not change
# the result for ethnicity homophily.
# But hang on, maybe the prison effect only plays out for those members who actually
# were in prison and not for those who were not. So far, we just looked at both
# i and j having the same value on prison. No substantive difference had been
# made whether similarity is on having been or not having been in prison.
# Let us generate two variables.
# prison1.matrix: 1 when BOTH i and j were in prison
# (assumes prison is coded 0/1 -- then arg1 + arg2 == 2 iff both are 1).
prison1.matrix<-1 * outer(LG.attrib$prison, LG.attrib$prison, function(arg1,arg2) {
return(arg1 + arg2 == 2)
})
prison1.matrix[1:10,1:10]
# prison0.matrix: 1 when NEITHER i nor j was in prison (both 0).
prison0.matrix<-1 * outer(LG.attrib$prison, LG.attrib$prison, function(arg1,arg2) {
return(arg1 + arg2 == 0)
})
prison0.matrix[1:10,1:10]
# Run the same thing including the two prison variables.
dyad.qap <- netlogit(LG.net,list(ethnicity.matrix, prison0.matrix, prison1.matrix),mode="graph",
nullhyp="qapy",reps=100)
dyad.qap
# This does not change things.
# Summary: Results obtained in such a way are already much better than simple correlation.
# First, the logistic regression framework allows us to test our hypothesis (is
# there ethnicity homophily?) in a multivariate way. We can control for other
# effects, e.g. prison homophily. Second, estimating our standard errors using
# QAP, we take a more realistic null hypothesis. Our reference point is: How
# many ties between members from the same ethnic group would we observe on a
# structurally 'similar' network (one that has similar features, e.g. density,
# number of triads and even degree distribution => effectively exactly like the
# observed network because we just re-label the nodes)? This takes into account
# that randomness does not always mean zero values. Especially, in network contexts,
# a random network will always have certain features, and we will always observe
# certain patterns. Hence, our point of reference should not be that there are
# none of such patterns at all.
# ===========================================
# h) P2 models - random ego and alter effects
# ===========================================
# lme4 provides (generalized) mixed-effects models; we use glmer below.
library(lme4)
# Check the dyad data. Each entry refers to one dyad.
dyad.data[1:10,]
# Generate a vector holding the ID of the nodes.
# seq_len(n) is the safe idiom here; the original seq(1:LG.size) happens to
# give the same result but seq() on a vector is a well-known source of bugs.
id <- seq_len(LG.size)
id
# Generate an LG.size x LG.size matrix filled with ones
# (matrix(1, n, n) recycles the scalar 1, equivalent to rep(1, n * n)).
full <- matrix(1, LG.size, LG.size)
full
# Multiply matrix with vector: column-wise recycling means every cell in
# row r gets the value id[r], i.e. the row (sender) index.
i.matrix <- full * id
i.matrix
# Only keep the upper triangle of the matrix because we have undirected ties
# only; this matches the upper.tri() extraction used for 'ties' earlier, so
# entry k of i.id refers to the same dyad as entry k of ties.
i.id <- i.matrix[upper.tri(i.matrix)]
i.id[1:10]
# Transpose the matrix: the transposed cells hold the column (receiver) index.
j.matrix <- t(i.matrix)
j.id <- j.matrix[upper.tri(j.matrix)]
j.id[1:10]
# Merge in the ID's for node i and node j
dyad.data <- data.frame(ties, ethnicity.match, i.id, j.id)
dyad.data[1:10,]
# Estimate the crossed-random-effects logistic regression: a random intercept
# for each i and each j captures unobserved node-level heterogeneity
# (some members simply co-offend more than others).
results <- glmer(ties~ethnicity.match + (1 | i.id) + (1 | j.id),family="binomial",data=dyad.data)
summary(results)