-
Notifications
You must be signed in to change notification settings - Fork 0
/
munging.R
214 lines (173 loc) · 8.06 KB
/
munging.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
## To be run from the 'analysis' script or on its own, whatever.
## Data imports ################################################################
#
# * Build a directed network of countries in the international system based on
# trade relationships each year (looks like it'll be from 1870, not 1820)
# * Import conflict dyads
#
## Trade and Economic data for the world systems ###############################
# banks <- read.csv("~/Code/CSS739_Cioffi_CM/Project/Data/World-Systems/1976_Banks_crossNationalTimeSeries/dataBanksCurrent/CNTSDATA.csv")
#
# Maddison per capita GDP, one row per country with one column per year.
# There are 41 countries in the COW trade data that Maddison refers to in
# clumps of countries, _eg_ '14 Caribbean' countries. I massaged these into
# the source file (albeit hurriedly) so others will need to contact me for data
# or repeat the process themselves (with different results).
mpd <- read.csv("~/Code/CSS739_Cioffi_CM/Project/Data/World-Systems/Maddison_various/mpd-cwd-country-gdppc.csv")
#
# COW dyadic trade. Barbieri uses a -9 to indicate NA values. That will cause
# a problem when calculating centrality, so this needs to be handled on import:
# na.strings="-9" turns every field that reads exactly "-9" into NA.
dyadTrade <- read.csv("~/Code/CSS739_Cioffi_CM/Project/Data/COW/COW_Trade_3.0/dyadic_trade_3.0.csv", na.strings="-9")
#
# COW national (country-year) trade, with the same -9 convention for NA.
natTrade <- read.csv("~/Code/CSS739_Cioffi_CM/Project/Data/COW/COW_Trade_3.0/national_trade_3.0_cwd.csv", na.strings="-9")
#
# Need to replace missing trade data with Oneal & Russett's data
# NOTE(review): this path sits under ~/Data/..., not under
# ~/Code/CSS739_Cioffi_CM/Project/Data/... like every other input -- confirm
# that this is the intended location.
library(Hmisc) # It's in STATA and their text version is whack; this works
rod <- stata.get("~/Data/World-Systems/2001_Oneal-Russett_triangulatingPeace/TRIANGLE.DTA", convert.factors=FALSE)
#
## War Data ####################################################################
# The four COW war lists, read with strings kept as character (not factor).
cow.extra <- read.csv("~/Code/CSS739_Cioffi_CM/Project/Data/COW/Extra-StateWarData_v4.0.csv", stringsAsFactors=FALSE)
#
cow.intra <- read.csv("~/Code/CSS739_Cioffi_CM/Project/Data/COW/Intra-StateWarData_v4.1.csv", stringsAsFactors=FALSE)
#
cow.inter <- read.csv("~/Code/CSS739_Cioffi_CM/Project/Data/COW/Inter-StateWarData_v4.0.csv", stringsAsFactors=FALSE)
#
cow.non <- read.csv("~/Code/CSS739_Cioffi_CM/Project/Data/COW/Non-StateWarData_v4.0.csv", stringsAsFactors=FALSE)
#
## No more data ################################################################
################################################################################
library("doParallel") # Optional. Only useful a couple of times
library("foreach") # Required for plyr to work in parallel
library("plyr") # Handy way to split, process and recombine data
library("dplyr") # Functionality I wish were in R already
library("reshape") # For melting horizontal data
library("mice") # For imputing missing per capita GDP values
## Begin the data manipulation #################################################
# Melt the columns of Maddison's per capita GDP for years 1870-2010 so that we
# have a long data set: one row per country-year. Columns 1:3 are kept as the
# identifying columns and columns 54:194 (141 of them) are stacked into a
# 'year' label plus a value.
gdp <- melt(mpd, id = 1:3, measure = 54:194, variable_name = "year")
# Take the leading 'X' off of the year labels by keeping characters 2 to 5.
# substring() is vectorised, so it is applied to the whole column at once
# instead of element by element through sapply(); the result is always a
# character vector, even when there are no rows.
gdp$year <- substring(as.character(gdp$year), 2, 5)
# Rename the columns to the names the rest of the script uses.
names(gdp) <- c("abb", "ccode", "mpd_country", "year", "gdppc")
# Function to impute missing values and return the third of five imputations.
# I picked the third one because I had to pick one and it seemed lazy to pick
# the first one.
# This is kind of a big deal because it's really not true that
# a linear relationship exists for per capita gdp; like, when there's a war or
# something. The trade data determines that, however. The country-year data
# just delivers what the dyadic trade data requires; one way or another.
## This process can be done in parallel.
# Set up the multi-core processor to act like a small cluster
# cnodes <- detectCores()
# cl <- makeCluster(cnodes)
# registerDoParallel(cl)
# Impute missing per capita GDP values for one group of rows.
#
# x: data frame holding one group's rows (ddply passes one per ccode); the
#    `gdppc` column is the one inspected for missing values.
# Returns x untouched when `gdppc` has no missing values, or when it has no
# observed values at all; otherwise returns the third of the five completed
# data sets produced by mice().
thirdimputation <- function(x) {
  missing_gdp <- is.na(x$gdppc)
  # Don't impute anything if there are no missing values or no data at all:
  # with nothing observed there is nothing to base an imputation on.
  if (!any(missing_gdp) || all(missing_gdp)) {
    return(x)
  }
  complete(mice(x, m = 5), 3)
}
# Split gdp by country code, run thirdimputation on each country's rows and
# stack the results back into one data frame.
imp.country <- ddply(gdp, .(ccode), thirdimputation)
# Reshape and reorganize into a list of data frames for each year
# merge() without all= keeps only the country-years present in BOTH tables.
cyears <- merge(imp.country, natTrade, by=c("ccode", "year"))
# Keep and reorder seven of the merged columns, selected by position.
cyears <- cyears[,c(1,3,6,2,5,7,8)]
# One data frame per year, named by the year value.
cya <- split(cyears, cyears$year)
# Keeps the first 140 list elements.
# NOTE(review): this is positional -- it assumes the split produced the years
# in order with 2010 as element 141; confirm against the merged data.
cya <- cya[1:140] #cut out 2010; no trade data
#
# Groupo compacto: country-year attributes (cya) is a list of data frames.
# The next step is to build networks for each year from the dyadic trade data.
# The dyadic trade data has 14 columns, but we only need 4.
# Chase-Dunn argues that trade openness should be calculated as the ratio of
# imports to GDP, and weighted by population. The COW dyadic trade data is
# ultimately agnostic on this point; preference is given to importer data but
# exporter and IMF data fill gaps. I'm just calculating as IMPORTS.
#
# ! Note the two pulls from columns 6 and 7 to build directed edges.
# Each dyad row becomes two directed edges: dag1 takes columns 1 and 2 as
# (to, from) with column 6 as the amount; dag2 swaps the two country columns
# and takes column 7 instead. Column 3 is the year in both.
dag1 <- dyadTrade[,c(1,2,6,3)] # Reduce dataframe to IMPORT amount & year
dag2 <- dyadTrade[,c(2,1,7,3)] # Reduce dataframe to IMPORT amount & year
n <- c("to", "from", "imports", "year") # Update names to allow binding
names(dag1) <- n
names(dag2) <- n
# Stack both directions into a single edge list.
dag <- rbind(dag1, dag2)
# Force both endpoint columns to integer.
# NOTE(review): the trailing comment below talks about Hong Kong and China,
# but the line only coerces the column to integer -- confirm what was meant.
dag$to <- as.integer(dag$to) #merge Hong Kong back into China
dag$from <- as.integer(dag$from)
## Deal with missing data ######################################################
#
# Build a second directed edge list from the Oneal & Russett data, using the
# same two-direction trick as for the trade data: rod1 takes columns 1 and 2
# as (to, from) with column 4 as the weight; rod2 swaps the country columns
# and takes column 5. Column 3 is used as the year in both.
# NOTE(review): what columns 4 and 5 of TRIANGLE.DTA hold is not visible
# here -- confirm against the data set's codebook.
rod1 <- rod[,c(1,2,4,3)]
rod2 <- rod[,c(2,1,5,3)]
# Reuses (overwrites) the name vector n with the column names for this list.
n <- c("to", "from", "weight", "year")
names(rod1) <- n
names(rod2) <- n
rods <- rbind(rod1, rod2)
## come up with some reasonable missing values for dead data years
# Fill in missing values for one group of rows (ddply passes one per to/from
# pair).
#
# d: data frame with a `weight` column.
# Returns d as-is when the observed weights sum to zero while at least one
# weight is missing, or when every row is already complete; otherwise returns
# the second of five completed data sets produced by mice().
fillVoids <- function(d) {
  w <- d$weight
  # Zero observed total plus a missing total: nothing to learn from this pair.
  dead_pair <- (sum(w, na.rm = TRUE) == 0) && is.na(sum(w))
  if (dead_pair) {
    return(d)
  }
  if (!all(complete.cases(d))) {
    d <- complete(mice(d, m = 5), 2)
  }
  d
}
# Run fillVoids on each directed (to, from) pair and stack the results.
irods <- ddply(rods, .(to, from), fillVoids)
# Keep only the years 1920 through 1949 (both bounds are strict comparisons).
irods <- irods[irods$year > 1919,]
irods <- irods[irods$year < 1950,]
# Scale one year's weights by that year's largest observed weight.
#
# x: data frame for a single year (ddply passes one per year) with a numeric
#    `weight` column.
# Returns x with `weight` divided by the year's maximum, so the largest value
# becomes 1; missing weights stay missing. When the year has no observed
# weights, or its maximum is zero, x is returned unchanged.
openNorm <- function(x) {
  observed <- x$weight[!is.na(x$weight)]
  # Nothing observed: leave the weights alone instead of letting max() warn
  # and return -Inf.
  if (length(observed) == 0) {
    return(x)
  }
  omax <- max(observed)
  # A zero maximum would turn every 0 into NaN (0 / 0); keep the zeros.
  if (omax != 0) {
    x$weight <- x$weight / omax
  }
  x
}
# Apply openNorm to each year's rows of irods.
nrods <- ddply(irods, .(year), openNorm)
# Left join: every trade edge is kept (all.x=TRUE) and gains a 'weight' column
# that is NA wherever nrods has no matching to/from/year row.
dag <- merge(dag, nrods, by=c("to", "from", "year"), all.x=TRUE)
# Then I split the dataframe into a list of smaller dataframes for each year.
daglist <- split(dag, dag$year)
# Update the names to match and not be numeric
graph.names <- paste0("g", names(daglist))
names(daglist) <- graph.names
# NOTE(review): cya is given the names derived from daglist, so this assumes
# both lists have the same length and cover the same years in the same order
# -- confirm, otherwise the country data ends up labelled with the wrong year.
names(cya) <- graph.names
## Data Validation #############################################################
# Not because I'm that particular, but because there's a discontinuity between
# the annual split dataframes for country-year attributes and the edgelists.
#
# The code chunk below makes certain that all the nodes in the edge lists are
# described in the annual country data (comment it out to skip the check).
# Extra nodes imply an error in the data but will not stop graphs from being
# built correctly. The other way around breaks things.
#
################################################################################
#
# Report how the vertices of one edge list compare with one country table.
#
# x: edge list data frame with `to` and `from` columns.
# y: country data frame with a `ccode` column.
# Prints the codes found only in the edge list, the codes found only in the
# country table, and the difference in the number of distinct codes. The
# value of the last print() call (the size-difference string) is returned.
checkVertices <- function (x, y){
  edge_nodes <- unique(c(x$to, x$from))
  country_nodes <- unique(y$ccode)
  only_in_edges <- setdiff(edge_nodes, country_nodes)
  only_in_countries <- setdiff(country_nodes, edge_nodes)
  surplus <- length(edge_nodes) - length(country_nodes)
  print(paste("Edgelist has: ", only_in_edges))
  print(paste("Country list has: ", only_in_countries))
  print(paste("Edgelist is bigger by ", surplus))
}
# Run checkVertices on each year's edge list together with the country table
# in the same list position; the comparison is printed as a side effect and
# the returned values are collected in vertexValidation.
vertexValidation <- mapply(checkVertices, daglist, cya)
#
################################################################################
# Now we're ready to actually build the graphs...
################################################################################
# Runs graphs.R, looked up relative to the current working directory.
source("graphs.R")
## And we'll get the war data ready to start working on when we get back to the
# analysis part.
# small cows are calves, so I'll subset it thataway.
# (cow.non is loaded above but not subset here.)
# Slice: keep a subset of columns from each war list, selected by position.
calf.intra <- cow.intra[,c(1,2,4,5,6,7,11,14,23,26,27)]
calf.inter <- cow.inter[,c(1,2,4,5,9,12,24)]
calf.extra <- cow.extra[,c(1,2,4,5,6,7,10,13,26)]
# Dice: keep only the rows whose StartYear1 is 1870 or later.
# NOTE(review): this relies on StartYear1 being among the columns kept by the
# slice above -- confirm against the CSV headers.
calf.intra <- calf.intra[calf.intra$StartYear1 >= 1870,]
calf.inter <- calf.inter[calf.inter$StartYear1 >= 1870,]
calf.extra <- calf.extra[calf.extra$StartYear1 >= 1870,]
# calf.intra$core.a <- FALSE
# calf.intra$core.b <- FALSE
# calf.extra$core.a <- FALSE
# calf.extra$core.b <- FALSE
# calf.inter$core <- FALSE