-
Notifications
You must be signed in to change notification settings - Fork 1
/
dataset_analyse_baci.py
195 lines (152 loc) · 6.69 KB
/
dataset_analyse_baci.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
"""
Analyse Tables, Plots and Construct Meta Data for BACI Data
"""
import os
import gc
import glob
import matplotlib.pyplot as plt
import pandas as pd
#-HS Levels-#
# NOTE(review): HS96L6 is not read anywhere in the visible part of this script;
# presumably a placeholder for per-level switching -- confirm before relying on it.
HS96L6 = True
#-SITC Levels-#
# NOTE(review): none of the SITCR2L* flags are read in the visible part of this
# script either; the sections below process every store found on disk.
SITCR2L5 = True
SITCR2L4 = True
SITCR2L3 = True
SITCR2L2 = True
SITCR2L1 = True
#---------------#
#-Control Logic-#
#---------------#
# Each flag gates one top-level `if <FLAG>:` section of this script.
# Set a flag to False to skip that section.
RAW_SIMPLESTATS_TABLE = True                        # describe() table for the raw store -> .xlsx / .tex
DATASET_PRODUCTCODE_INTERTEMPORAL_TABLES = True     # product code x year tables -> .xlsx
DATASET_COUNTRYCODE_INTERTEMPORAL_TABLES = True     # country code x year tables -> .xlsx
DATASET_SIMPLESTATS_TABLE = True                    # describe() tables per dataset file -> .xlsx / .tex
DATASET_PERCENTWORLDTRADE_PLOTS = True              # yearly value as a percentage of world values -> .pdf
#-----#
#-RAW-#
#-----#
# Simple statistics for the raw store: runs describe() on the datasets held in
# STORE and writes the resulting table as an Excel file and a LaTeX snippet.

from dataset_info import RESULTS_DIR, TARGET_DATASET_DIR

SOURCE_DIR = TARGET_DATASET_DIR["baci96"]
STORE = "raw_baci_hs96-1998-2012.h5"
# NOTE: this rebinds the imported RESULTS_DIR mapping to its "baci96" entry.
RESULTS_DIR = RESULTS_DIR["baci96"]

if RAW_SIMPLESTATS_TABLE:

    from pyeconlab.trade.util import describe

    print("Running RAW_SIMPLESTATS_TABLE ...")

    DIR = RESULTS_DIR + "tables/"
    STORE = SOURCE_DIR + STORE
    print("Running STATS on File %s" % STORE)

    productcode = "hs6"
    dataset_table = None
    # Open read-only and read every dataset through this one handle, instead of
    # re-opening the same file with pd.read_hdf while an append-mode handle is
    # still open (two handles in different modes can be rejected by PyTables).
    store = pd.HDFStore(STORE, mode="r")
    try:
        for dataset in sorted(store.keys()):
            dataset = dataset.strip("/")            #Remove Directory Structure
            print("Computing SIMPLE STATS for dataset: %s" % dataset)
            data = store[dataset]
            # NOTE(review): the table is overwritten on every pass, so only the
            # last dataset (in sorted key order) reaches the output files --
            # presumably the raw store holds a single dataset; confirm.
            dataset_table = describe(data, table_name=dataset, productcode=productcode, exporter="eiso3n", importer="iiso3n")
            # Release the frame before loading the next one.
            del data
            gc.collect()
    finally:
        # Close the handle even if describe() or a read fails.
        store.close()

    if dataset_table is None:
        # An empty store previously surfaced as a NameError on dataset_table.
        print("No datasets found in %s; skipping RAW_SIMPLESTATS_TABLE output" % STORE)
    else:
        #-Excel Table-#
        fl = "baciraw-trade-hs6-1998to2012_stats.xlsx"
        dataset_table.to_excel(DIR + fl)
        #-Latex Snippet-#
        fl = "baciraw-trade-hs6-1998to2012_stats.tex"
        with open(DIR + fl, "w") as latex_file:
            latex_file.write(dataset_table.to_latex())
#----------#
#-DATASETS-#
#----------#

# Re-import the directory mappings: RESULTS_DIR was rebound to a single entry
# in the RAW section of this file, so the mapping has to be fetched again.
from dataset_info import RESULTS_DIR, TARGET_DATASET_DIR

SOURCE_DIR = TARGET_DATASET_DIR["baci96"]
RESULTS_DIR = RESULTS_DIR["baci96"]

# Every *.h5 file in SOURCE_DIR except those whose file name starts with "raw".
STORES = [
    path
    for path in glob.glob(SOURCE_DIR + "*.h5")
    if not path.split("/")[-1].startswith("raw")    #Filter Out RAW Files
]

## ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
## ---> Product Composition Tables <--- ##
## ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
def split_filenames(fl):
    """
    Split a dataset file name into its descriptive components.

    The name must consist of exactly four "-" separated parts:
    ``<dataset>-<data_type>-<classification>-<years>``. The last two
    characters of the classification part are removed from the returned
    classification, and its final character is returned as the product level.
    The years part is only checked for presence; it is not returned.

    Illustrative example::

        >>> split_filenames("baci-export-sitcr2l3-1998to2012.h5")
        ('baci', 'export', 'sitcr2', '3')

    Parameters
    ----------
    fl : str
        File name without any directory component.

    Returns
    -------
    tuple of str
        ``(dataset, data_type, classification, product_level)``

    Raises
    ------
    ValueError
        If ``fl`` does not contain exactly four "-" separated parts.
    """
    parts = fl.split("-")
    if len(parts) != 4:
        # Same exception type as the tuple-unpacking failure it replaces,
        # but the message now names the offending file.
        raise ValueError(
            "Expected '<dataset>-<data_type>-<classification>-<years>' but got: %r" % fl
        )
    dataset, data_type, classification = parts[:3]
    # Drop the trailing two characters and keep the final one as the level.
    return dataset, data_type, classification[:-2], classification[-1:]
# Product code x year tables: for every dataset in every store, sum by
# (year, product code) and write one Excel sheet with years as columns.
if DATASET_PRODUCTCODE_INTERTEMPORAL_TABLES:

    print("Running DATASET_PRODUCTCODE_INTERTEMPORAL_TABLES ...")

    DIR = RESULTS_DIR + "intertemporal-productcodes/"
    for store_file in STORES:
        print("Computing Composition Tables for: %s" % store_file)
        # The first element returned by split_filenames is not used here.
        _, data_type, classification, product_level = split_filenames(store_file.split("/")[-1])
        product_column = "sitc%s" % product_level
        # Read-only handle, closed even when a table fails part-way through.
        store = pd.HDFStore(store_file, mode="r")
        try:
            for key in store.keys():
                print("Computing table for dataset: %s ..." % key)
                dataset = key.strip("/")
                intertemp_product = store[dataset].groupby(["year", product_column]).sum().unstack("year")
                # NOTE(review): dropping the outer column level assumes the summed
                # frame has a single value column, leaving years as labels -- confirm.
                intertemp_product.columns = intertemp_product.columns.droplevel()
                intertemp_product.to_excel(DIR + "intertemporal_product_%s_%sl%s_%s.xlsx" % (data_type, classification, product_level, dataset))
        finally:
            store.close()
## ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
## ---> Country Composition Tables <--- ##
## ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##

# Country code x year tables: for every dataset in every export or import
# store, sum by (year, country code) and write one Excel sheet with years as
# columns. Stores of any other data type are skipped.
if DATASET_COUNTRYCODE_INTERTEMPORAL_TABLES:

    print("Running DATASET_COUNTRYCODE_INTERTEMPORAL_TABLES ...")

    DIR = RESULTS_DIR + "intertemporal-countrycodes/"
    # Column to group on for each supported data type.
    country_columns = {"export": "eiso3c", "import": "iiso3c"}
    for store_file in STORES:
        print("Computing Composition Tables for: %s" % store_file)
        _, data_type, classification, product_level = split_filenames(store_file.split("/")[-1])
        # The previous `if export ... if import ... else: continue` chain sent
        # export stores into the `else` branch, so export tables were computed
        # and then never written. A single lookup handles both types.
        country_column = country_columns.get(data_type)
        if country_column is None:
            continue
        # Read-only handle, closed even when a table fails part-way through.
        store = pd.HDFStore(store_file, mode="r")
        try:
            for key in store.keys():
                print("Computing table for dataset: %s ..." % key)
                dataset = key.strip("/")
                intertemp_country = store[dataset].groupby(["year", country_column]).sum().unstack("year")
                # NOTE(review): dropping the outer column level assumes the summed
                # frame has a single value column, leaving years as labels -- confirm.
                intertemp_country.columns = intertemp_country.columns.droplevel()
                intertemp_country.to_excel(DIR + "intertemporal_country_%s_%s_%s.xlsx" % (data_type, classification, dataset))
        finally:
            store.close()
## ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
## ----> SIMPLE STATS TABLES <---- ##
## ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##

# For every store: run describe() on each dataset, merge the per-dataset
# tables side by side on their index, and write the merged table as an Excel
# file and a LaTeX snippet named after the store file.
if DATASET_SIMPLESTATS_TABLE:

    from pyeconlab.trade.util import describe

    print("Running DATASET_SIMPLESTATS_TABLE: ...")

    DIR = RESULTS_DIR + "tables/"
    for dataset_file in STORES:
        print("Running STATS on File %s" % dataset_file)
        file_name = dataset_file.split("/")[-1]
        # Built from the third "-" separated part of the file name with "r2l"
        # removed; it is the same for every dataset in the file, so compute once.
        productcode = "".join(file_name.split("-")[2].split("r2l"))
        # Reset per file. The table used to be seeded only by a dataset named
        # "A", so a store without one either raised NameError or silently
        # merged into the table left over from the previous file.
        table = None
        # Read-only handle used for both listing and reading, instead of
        # re-opening the file with pd.read_hdf while it is already open.
        store = pd.HDFStore(dataset_file, mode="r")
        try:
            for dataset in sorted(store.keys()):
                dataset = dataset.strip("/")        #Remove Directory Structure
                print("Computing SIMPLE STATS for dataset: %s" % dataset)
                data = store[dataset]
                dataset_table = describe(data, table_name=dataset, productcode=productcode)
                if table is None:
                    table = dataset_table
                else:
                    table = table.merge(dataset_table, left_index=True, right_index=True)
        finally:
            store.close()
        if table is None:
            print("No datasets found in %s; skipping stats tables" % dataset_file)
            continue
        #-Excel Table-#
        fl = file_name.split(".")[0] + "_stats" + ".xlsx"
        table.to_excel(DIR + fl)
        #-Latex Snippet-#
        fl = file_name.split(".")[0] + "_stats" + ".tex"
        with open(DIR + fl, "w") as latex_file:
            latex_file.write(table.to_latex())
# For every dataset in every store: plot its yearly total "value" as a
# percentage of the yearly world values and save the figure as a PDF.
if DATASET_PERCENTWORLDTRADE_PLOTS:

    print("DATASET_PERCENTWORLDTRADE_PLOTS ... ")

    DIR = RESULTS_DIR + "plots/percent_world_values/"
    #-World Values-#
    # NOTE(review): this path is hard-coded relative to the working directory
    # rather than derived from SOURCE_DIR -- confirm it points at the same place.
    fl = "./output/dataset/baci96/raw_baci_world_yearly-1998to2012.h5"
    world_values = pd.read_hdf(fl, key="World")["value"]
    for dataset_file in STORES:
        print("Producing GRAPH on File %s" % dataset_file)
        file_stem = dataset_file.split('/')[-1].split('.')[0]
        # Read-only handle used for both listing and reading, instead of
        # re-opening the file with pd.read_hdf while it is already open.
        store = pd.HDFStore(dataset_file, mode="r")
        try:
            for dataset in sorted(store.keys()):
                # Strip the leading "/" as the other sections do, so the title
                # and the output file name carry the bare dataset name.
                dataset = dataset.strip("/")
                print("Computing GRAPH for dataset: %s" % dataset)
                data = store[dataset]
                # Select "value" before summing so no other column is aggregated.
                yearly_values = data.groupby(["year"])["value"].sum()
                percent_values = yearly_values.div(world_values)*100
                percent_values.plot(title="Dataset: %s (%s)" % (dataset, dataset_file))
                plt.savefig(DIR + "%s_%s_percent_wld.pdf" % (dataset, file_stem))
                # Close the figure so the next dataset starts on a fresh one.
                plt.close()
        finally:
            store.close()