# ==================================================================
#
#
# Advanced Social Network Analysis
#
# UZH, 31 Oct - 4 Nov 2016
#
# Lecturer: Thomas Grund, thomas.grund@ucd.ie
#
# ==================================================================
#
#
# ==================================================================
#
# Session 4: Stochastic actor-oriented network models
#
# ==================================================================
#
# This R-script guides you through RSiena. Some parts of the script come from scripts available
# on the Siena website: http://www.stats.ox.ac.uk/~snijders/siena/
#
# Check out the Siena website for further information, tutorials and guidance.
#
##############################################
# Overview of topics:
# >>>>>>>>>>>>>>>>>>>
# a) Load data
# b) Create an object for the dependent variable
# c) Create objects for the independent variables
# d) Combine the dependent and independent variables in a Siena object
# e) Specify the model
# f) Run Siena and estimate the model
# g) Look at the results
# h) Goodness-of-fit
##############################################
#
# First, you need to install the packages we need. These commands automatically download and
# install all we need. If you have already installed these packages, you might not have
# to run these commands (but it does not hurt).
install.packages("RSiena")
install.packages("statnet")
# Now, we need to load the libaries we need for this session.
library(statnet)
library(RSiena)
######################################################
#
# a) Load all the data we need
#
######################################################
# For this session we will load the s50 data, an excerpt of the
# Teenage Friends and Lifestyle Study. This dataset contains
# network observation in a school at three points in time, plus
# information about students' drinking and smoking behavior.
# You can download the data from here, but I already included it
# in our data directory.
?s50
# Again, make sure that R can find the data by setting the working
# directory properly.
setwd("/Users/thomasgrund/OneDrive/DUBLIN TEACHING/Network Dynamics 2015/data")
list.files()
# First, let us load the three networks
data1 <- read.table("s50-network1.dat")
class(data1)
# You can see that read.table loads the data as a "data.frame".
# RSiena wants to have a matrix as an input. Let us transform the
# data.frame in a matrix.
matrix1 <- as.matrix(data1)
class(matrix1)
matrix1
# Okay, we could go on and generate a "network"-object out of this
# matrix and plot it.
net1<- as.network(matrix1)
par(mfrow=c(1,1))
plot(net1)
# However, RSiena wants the networks in matrix-form. So, let us generate
# matrices for the second and the third wave by combining some previous
# commands in one line.
matrix2 <- as.matrix(read.table("s50-network2.dat"))
matrix3 <- as.matrix(read.table("s50-network3.dat"))
net2<- as.network(matrix2)
net3<- as.network(matrix3)
plot(net2)
plot(net3)
# Great, now we have the networks loaded in matrix form. Now, let us load
# some attributes of the students.
alcohol<- read.table("s50-alcohol.dat")
alcohol
# This is a "data.frame" with three variables (v1,v2,v3). Each one of these
# stands for the drinking behavior of an individual (represented by the row)
# at one of the three points in time. Let us do the same for smoking behavior
smoke<- read.table("s50-smoke.dat")
smoke
# Perfect. We have all data loaded we need.
# As part of every Siena analysis we need to create four types of objects:
#
# - dependent variables;
# - explanatory variables;
# - combination of dependent and explanatory variables;
# - model specification.
######################################################
#
# b) Create an object for the dependent variable
#
######################################################
# First we have to create objects for the dependent variables. Notice,
# this can be more than one object if we would want to model e.g. the
# co-evolution of network and behavior. For now, we only focus on the
# network and only generate one dependent variable, here, the sequence
# of networks we observe.
#
# We do this by using sienaDependent. This commands creates a sienaDependent
# object, here a network, from a matrix or array or list of sparse matrix of
# triples. This object will have the role of a dependent variable in the
# analysis.The name of this network object (here: friendship) will be used
# in the output file.
friendship <- sienaDependent(array(c(matrix1,matrix2,matrix3),dim = c(50,50,3)))
# Look at what this command does:
#
# First, it generates a list of the three matrices with:
# c(matrix1,matrix2,matrix3)
#
# Next, it generates an array out of this list with certain dimensions:
# array(..., dim =c(50,50,3))
#
# Third, this array is fed to the sienaDependent command
# sienaDependent(...)
#
# Voila!
# Note that this is an object of class
class(friendship)
# See some more information about this object
friendship
# The function sienaDependent can also be used to create a behavior variable object
# with the extra argument type = "behavior".
#
# For example, we could create the dependent variable:
drinkingbeh <- sienaDependent(as.matrix(alcohol), type = "behavior" )
# which is the sequence of drinking behavior in time. By declaring it a dependent
# variable we could study how network structure (in the past) influences individuals
# drinking behavior, e.g. the diffusion of drinking behavior. However,
# for now, we only focus on the network as the dependent variable.
# The options available for defining a sienaDependent object
# are displayed by typing
?sienaDependent
######################################################
#
# c) Create objects for the independent variables
#
######################################################
# Second, we construct objects for the explanatory (independent) variables.
# From the help request
# ?sienaDataCreate
# we see that these can be of five kinds:
# coCovar Constant actor covariates
# varCovar Time-varying actor covariates
# coDyadCovar Constant dyadic covariates
# varDyadCovar Time-varying dyadic covariates
# compositionChange Composition change indicators
# You can get help about this by the following requests:
# ?coCovar
# ?varCovar
# ?coDyadCovar
# ?varDyadCovar
# ?sienaCompositionChange
# The variables available for this data set all are changing actor covariates.
# For illustrative purposes, we use smoking as observed at the first wave
# as a constant covariate:
smoking1 <- coCovar(smoke[ , 1 ])
# This selects the first column of smoke, which contains the first
# wave observations, and makes it available as a constant covariate.
#
# This is the pattern for for every covariate file, e.g.
# Attr1 <- coCovar( Covariate1 )
#
# where Covariate1 is a matrix (or data.frame) with dim(Covariate1) equal
# to n x 1. Note, if the Covariate is a matrix with dim(Covariates) equal
# to n x p you can create constant covariates with:
#
# Attr1 <- coCovar(Covariates[,1])
# ...
# Attrk <- coCovar(Covariates[,p])
# We use the drinking behavior as a changing covariate.
# The function varCovar creates a changing covariate object from a matrix
# the name comes from 'varying covariate'. Notice that "alcohol" is still
# a data.frame. The command "varCovar" wants a "matrix" as an input. But
# we can easily transform our data.frame in a matrix.
drinking <- varCovar(as.matrix(alcohol))
# You need at least three waves in the data set to define a varying covariate
# by the function varCovar as the previous wave is used
# as a predictor of the next wave.
# The command
attributes(drinking)
# will tell you the information that RSiena now has added to the drink data.
######################################################
#
# d) Combine the dependent and independent variable in a Siena object
#
######################################################
# We now combine the dependent and independent variables.
# The function sienaDataCreate creates a Siena data object from input networks,
# covariates and composition change objects; the objects that earlier were
# created by sienaDependent will have the role of dependent variables, and
# similarly the other roles are predetermined by creation by the functions
# coCovar, varCovar, coDyadCovar, varDyadCovar, and sienaCompositionChange.
mydata <- sienaDataCreate(friendship, smoking1, drinking)
# You may check the result by requesting
mydata
# You should now understand how this differs from the result of
# mybehdata <- sienaDataCreate( friendship, smoking1, drinkingbeh )
######################################################
#
# e) Specify the model
#
######################################################
# The function getEffects creates a data.frame of effects with a number of extra
# properties for use in RSiena:
myeff <- getEffects(mydata)
# mydata is needed as an argument as the effects depend on the number
# and types of covariates and dependent variables.
# Before we explain the object myeff and how we shall be going to use it,
# we first produce a data description which is available now:
print01Report( mydata, myeff, modelname = 'report' )
# This writes a basic report of the data to the file report.out in the
# current working directory. Locate and open it! Inspecting this is
# important because it serves as a check and also contains a number of
# basic descriptives. In this description you can see that the third wave
# data for alcohol are not used. This is because changing covariates are
# assumed to be constant from one wave until immediately before the next
# wave, so that the values for the last wave are ignored.
# Let us now consider the myeff object, which is used to specify the model.
# It is of the class "sienaEffects", and contains the model specification.
# You can inspect the current model specification by simply requesting
myeff
# For starting, the model specification is just a very limited default
# (including rates of change, outdegree and reciprocity only)
# To make a meaningful analysis, you will need to add to it.
# The rows of myeff correspond to the effects.
# By requesting
names(myeff)
# You see the type of information that is stored about the effects,
# i.e., the columns (variables) defined for the effects.
# If desired, more information about these variables can be obtained
# from the help files:
# ?getEffects
# Some often used variables are effectName, shortName, type, and parameter.
# The set of available effects and their most used columns
# can be inspected as follows:
# effectsDocumentation(myeff)
# This gives a long list of effects: all defined in Section 12 of the manual,
# as far as they are meaningful for dataset mydata.
# The "include" column defines whether effects are included in the model.
myeff$include
# Here the TRUE values correspond to the default model specification which,
# however, is not meant as a serious model, being too limited.We can specify
# which effects should be included in the model in different ways.
# The simplest is to just type:
fix(myeff)
# fix calls a data editor internal to R, so we can manually edit the effects.
# This operates the same as in the Gui. How to use fix() is presented merely
# for getting to know what myeff is. In practical analysis it is more convenient
# to use routine "includeEffects" instead, as explained below. fix() may not
# be usable if you do not have tcl/tk available!
#
# Note that the top of the dataframe shows the names of the columns:
# name, effectName, etc.
# You can edit the "include" column by changing the TRUE and FALSE values
# as required; when the editor is closed, the new values are stored.
# When you make an error, however, the effects object may be corrupted.
# Intead of adding effects manually by changing the entries after typing fix(mydata)
# we can use "includeEffects". This function uses short names instead of full
# names. The short names are given by the effectsDocumentation() function
# mentioned above, and also are listed in the descriptions given in Section 12
# of the RSiena manual.
# A different table of effect information, including short names,
# that covers effects available for whatever data sets,
# is available as a pdf file in the R directory, and can be opened by
RShowDoc("effects", package="RSiena")
# For illustration, let us start from scratch with a new sienaEffects object,
# and add the transitive triples and 3-cycles effects
myeff <- getEffects(mydata)
myeff <- includeEffects( myeff, transTrip, cycle3 )
# To see the current model specification,
myeff
# Note that we can set several effects in one go!
# To remove an effect, e.g., the 3-cycle effects
myeff <- includeEffects(myeff, cycle3, include=FALSE )
# Check again which effects now are included in the model
myeff
# Let us add the 3-cycle effect again.
myeff <- includeEffects(myeff, cycle3)
# Great, now let us add some effects related to our covariates.
# The short names do not differentiate between the covariates:
# e.g., the effects 'drinking ego' and 'smoking1 ego' both have shortName
# 'egoX', hence, we need to specifiy the interaction1 parameter.
myeff <- includeEffects(myeff, egoX, interaction1="smoking1")
# We could add several effects for the covariate specified by interaction1
myeff <- includeEffects( myeff, egoX, altX, egoXaltX, simX, interaction1 = "drinking" )
myeff <- includeEffects( myeff, egoX, altX, egoXaltX, simX, interaction1 = "smoking1" )
# We check the results again:
myeff
# By looking at the help offered by
?includeEffects
# For now, we do not differentiate between evaluation and endowment effects.
# If we were to do so, we could model e.g. drinking behavior as two effects:
#
# 1) The importance of drinking behavior for the creation of ties (evaluation)
# 2) The importance of drinking behavior for the dissolution of ties (endowment)
#
# Substantively, it can make sense that some variable is import to create ties, but
# not for dissolving ties, or vice-versa.
#
# We would use the following syntax:
#
# myeff <- includeEffects(myeff, egoX, interaction1="smoking1", type="endow")
# myeff <- includeEffects(myeff, egoX, interaction1="smoking1", type="eval")
#
# Effects that depend on other variables, such as egoX, altX, etc. above,
# need the specification of these variables to define them. This is done by
# the interaction1 parameter when only one variable name is needed,
# and by interaction2 if there is a second variable involved, such as AltsAvAlt
# (see the manual). Although the names of these parameters are interaction1
# and interaction2, this does not refer to an interaction as commonly used
# in statistical modelling!
######################################################
#
# f) Run Siena and estimate the model
#
######################################################
# Parameters of the model are estimated by the function siena07. This requires
# the data specification; the effects specification; and a number of parameters,
# or settings, for the estimation algorithm. The latter are contained in an
# object created by the function sienaAlgorithmCreate. You can look at the help
# provided by ?sienaAlgorithmCreate to find out about options that you may use here;
# for beginning users, only the two options mentioned below are relevant.
#
# Output will be written to a file with name projname.out, where projname is
# whatever name is given; the default (used if no name is given) is Siena.
# This file will be written to your current directory. New estimation runs will
# append to it. A new call to print01Report will overwrite it!
myalgorithm <- sienaAlgorithmCreate(useStdInits = FALSE, projname = 'myresults')
# The useStdInits parameter determines the initial values used for the estimation
# algorithm. If useStdInits = TRUE, standard initial values are used;
# if useStdInits = FALSE, the initial values are used that are contained
# in the "initialValue" column of the effects object, which are reported by the
# information request.
# Let us first simplify the model for illustrative purposes:
myeff <- getEffects(mydata)
myeff <- includeEffects( myeff, transTrip, cycle3)
myeff <- includeEffects( myeff, egoX, altX, egoXaltX,interaction1 = "drinking" )
myeff <- includeEffects( myeff, simX, interaction1 = "smoking1")
myeff
# The function siena07 actually fits the specified model to the data
ans <- siena07( myalgorithm, data = mydata, effects = myeff, batch=TRUE)
# Function siena07 produces a so-called sienaFit object, here called ans;
# and it fills in a few things in the sienaEffects object myeff, if this is the first
# use of myeff in a siena07 call. By using various different effects objects, i.e.,
# with different names, you can switch between specifications.
# The batch = FALSE parameters will give a graphical user interface being opened
# which reports on the progress of the estimation algorithm;
# verbose = TRUE leads to extensive diagnostic information being sent
# to the console during the estimation, and results after the estimation
# (these results are also copied to the output file projname.out, see above);
# while batch=TRUE gives only a limited amount of printout sent to the console
# during the estimation (which is seen when clicking in the console,
# or more immediately if the Buffered Output is deselected in the Misc menu)
# which monitors the progress of the estimation algorithm in a different way.
# The call of siena07 leads to output in the file myresults.out
# (or more generally projname.out, where projname is the name given in sienaAlgorithmCreate)
# and to the creation of the object which here is called ans (for "answer").
######################################################
#
# g) Look at the results
#
######################################################
# The file "myresilts.out" will contain the results of the estimation. It is contained in
# the current directory ("getwd()"). This file can be read by any text editor. A summary of
# the results is obtained on the screen by
ans
# and a larger summary by
summary(ans)
# Depending on the random seed and the model specification, the results could be something
# like the following:
# Estimates, standard errors and convergence t-ratios
#
# Estimate Standard Convergence
# Error t-ratio
#
# Rate parameters:
# 0.1 Rate parameter period 1 6.6101 ( 1.1571 )
# 0.2 Rate parameter period 2 5.1962 ( 0.8875 )
#
# Other parameters:
# 1. eval outdegree (density) -2.7203 ( 0.1318 ) 0.0010
# 2. eval reciprocity 2.4351 ( 0.2214 ) -0.0188
# 3. eval transitive triplets 0.6496 ( 0.1450 ) -0.0409
# 4. eval 3-cycles -0.0943 ( 0.2913 ) -0.0376
# 5. eval smoking1 similarity 0.2596 ( 0.2061 ) 0.0396
# 6. eval drinking alter -0.0292 ( 0.0687 ) -0.0620
# 7. eval drinking ego 0.0416 ( 0.0716 ) -0.0714
# 8. eval drinking ego x drinking alter 0.1268 ( 0.0488 ) -0.0278
#
# Total of 2296 iteration steps.val alcohol ego x alcohol alter 0.1295 ( 0.0489 ) 0.0126
#
# CONVERGENCE:
#
# To understand the table above, note that the "convergence t-ratio" is the t-ratio for
# convergence checking, not the t statistic for testing the significance of this effect!
# (See Section 6.1.2 of the manual). In the external output file, these are called
# "t-ratios for deviations from targets". This is the first thing you should always look at!
# The simple reason is that the algorithm could simulate a series of networks that has
# nothing to with the actually observed data. The rule of thumb is that all t-ratios for
# convergence should ideally be less than 0.1 in absolute value; this signifies good
# convergence of the algorithm.
#
# In the example here, this is the case. If this would not be the case, the best thing
# to do would be to continue the estimation, using the estimates produced here,
# and contained in ans, as the new initial values. One could do this by typing:
#
# ans <- siena07(myalgorithm, data=mydata, effects=myeff, prevAns=ans, batch=TRUE)
#
# Because the estimation algorithm is based on random simulations of the network evolution,
# there always will be small differences between different runs of the algorithm. To
# obtain "publication grade" estimates, where this variability is minimized, choose the
# value of parameter n3 in sienaAlgorithmCreate() ("Number of iterations in phase 3")
# larger than the default value of 1000; e.g., n3=3000. Notice, this will take more time.
# ESTIMATES and STANDARD ERRORS
#
# When you have an acceptable convergence of the model, you can look at the effects. Exciting.
# You can interpret the estimates similar to logistic regression coefficients. For example,
# the positive reciprocity effect (2.4351) indicates that actors are more likely to form a tie
# (and also maintain this tie - because we did not distinguish between evaluation and
# endowment effects) to somebody who already sent a tie to them. There is a tendency towards
# reciprocity in the network!
#
# Let us look at the covariate effects. For example, a positive "smoking1 similarity" effect,
# (0.2596) indicates that actors tend to form (and maintain ties) to other individuals who are
# similar to them on the smoking1 variable. This implies homophily! When two actors smoke
# they are more likely to become friends in this data. But be very careful, we did not say
# anything about the significance of these effects yet! This is just like in a normal logistic
# regression. You can have non-zero effects, but they are not significant! We come back this
# in a second.
#
# Let us take another look at the "drinking ego" effect. A positive effect would suggest that
# those indivdiuals who drink a lot tend to send (form and maintain) more ties in general. The
# "drinking alter" effect indicates whether more or less ties are sent to those individuals
# who drink a lot. And the "drinking ego x drinking alter" indicates whether ties are more
# likely to be formed between two individuals who both score high on the drinking variable.
# Notice the difference to a potential "drinking similarity" effect, which would only look
# at whether two individuals are similar in their behavior (which could mean both high and
# low on drinking).
#
# Okay, after looking at 1) the convergence for each effect (indicating whether a particular
# feature in the data is modelled well), and 2) the effect sizes, let us look now at significance
# of the effects.
#
# For this purpose, two types of tests are available in Siena:
#
# 1. t-type tests of single parameters can be carried out by dividing the parameter estimate
# by its standard error. Under the null hypothesis that the parameter is 0, these tests
# have approximately a standard normal distribution. You simply divide the estimate by the
# standard error. If that that is greater than 2 you have a significant effect! Voila.
#
# 2. Score-type tests of single and multiple parameters are described in the manual. We do not
# do this for now. Check the manual for this test and when it is appropriate.
#
######################################################
#
# g) Goodness-of-fit
#
######################################################
# From traditional regression contexts, you are familiar with the idea of "goodness-of-fit".
# For example, the idea of R^2, which technically gives nothing else than the correlation
# between your observed values and the values your regression predicted. When this is high
# your model fits the observed data well. This is important, because you might have a model
# with significant effects, but you are just not predicted the observed data well. Hence,
# your effects are pretty meaningless.
#
# Unfortunately, in SIENA analyses (as well as in ERGM's) this is not so straightforward. The
# reason is that we are actually not trying to come up with a model that fits exactly the
# observed network, but rather with a model that fits the class of networks the observation
# falls in. We need to do this simplification because otherwise the model would be too complex.
# There are just too many potential networks out there. But this also means we do not end up
# with a nice little number like R^2, which tells us how good the fit is (although people
# are trying different things these days to come up with a similar number, e.g. Mahalanobis
# distance).
#
# The underlying issue is that "goodness-of-fit" in SIENA and ERGM means something slightly
# different. Instead of a direct correlation between predicted and observed network, one
# focuses on particular features of the observed network (or sequence of networks). Goodness-
# of-fit means, how well does our model resemble some actual features of the network, e.g.
# indegree distribution, outdegree distribution, triad census and so on.
#
# Think back what SIENA does, it basically simulates a lot of different sequences of evolving
# networks. Similar to an ERGM we have a distribution of networks (or even network sequences)
# that we simulate using some parameters. And ultimately, SIENA estimates these parameters so
# that the observed sequence of networks is the most likely one to have been produced by the
# simulation.
#
# So, for "goodness-of-fit" we look at other characteristics of the networks that we did not
# use for estimation. If our model is good (provides a good fit), it should reproduce these
# other features of the network well. Since recently, there is an easy way to do such analyses
# in SIENA by using sienaGOF. There are two features (out of the box) we can look at:
# indegree distribution and outdegree distribution. But you can also "program" functions
# to look at additional features: ?sienaGOF.
#
# First, however, when we estimate the model with siena07, we need to tell SIENA to keep the
# "simulated" networks. We can then calcualte some features of these simulated networks with
# the same feature on the observed networks.
ans <- siena07( myalgorithm, data = mydata, effects = myeff, returnDeps =TRUE, batch=TRUE)
# With the option returnDeps, the simulated networks are now stored within the ans object.
# Now, let us run sienaGOF and plot the comparison
gofi.indegree <- sienaGOF(ans, IndegreeDistribution, verbose=TRUE, join=TRUE, varName="friendship")
plot(gofi.indegree)
gofi.outdegree <- sienaGOF(ans, OutdegreeDistribution, verbose=TRUE, join=TRUE, varName="friendship")
plot(gofi.outdegree)