br.py
import re, random, os, json
import pandas as pd
import numpy as np
import scipy as sp
import seaborn as sns
from bokeh import mpl
from bokeh.plotting import output_file, show
from sklearn.feature_extraction.text import TfidfVectorizer
from classifier import Classifier, label2domain, manifestolabels
MANIFESTO_FOLDER = "data/wahlprogramme/"
RESULT_FOLDER = "data/resultate/"
if not os.path.isdir(RESULT_FOLDER):
    os.mkdir(RESULT_FOLDER)
# Tuples with party names, files and plotting colors
partyFiles = [
    ('AfD', "afd.md", "blue"),
    ('CDU/CSU', "cducsu.md", "gray"),
    ('FDP', "fdp.md", "yellow"),
    ('SPD', "spd.md", "red"),
    ('Grüne', "diegruenen.md", "green"),
    ('Die Linke', "dielinke.md", "purple")
]
# political domains (according to manifestocodes) to be analysed
domains = [
    'External Relations',
    'Freedom and Democracy',
    'Political System',
    'Economy',
    'Welfare and Quality of Life',
    'Fabric of Society'
]

def clean_whitespace(txt):
    '''
    Replaces runs of whitespace by a single blank
    '''
    return re.sub(r"\s+", " ", txt)
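
# Illustrative example of clean_whitespace (hypothetical input):
#   clean_whitespace("Wir  fordern\n\t mehr Transparenz") == "Wir fordern mehr Transparenz"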

def read_md(fn, min_len=100):
    '''
    Reads a manifesto from a markdown file;
    text segments shorter than min_len characters are discarded
    '''
    # uncomment the next line for sentence segmentation instead
    # split_symbol = '[\.\!\?\;] '
    # this splits the text into paragraphs, marked by one or more '#'
    split_symbol = '#+'
    with open(fn) as f:
        md_text = f.read()
    len_filter = lambda x: len(x) > min_len
    text_segments = re.split(split_symbol, md_text)
    texts = filter(len_filter, map(clean_whitespace, text_segments))
    return texts
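
# Usage sketch (illustrative; assumes the manifesto file exists under MANIFESTO_FOLDER):
#   segments = list(read_md(os.path.join(MANIFESTO_FOLDER, "spd.md")))
#   segments[0]  # first paragraph with more than 100 characters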

def classify_br(folder, fn, party, clf, max_txts=10000):
    '''
    Computes predictions for a given party
    INPUT:
        folder     folder where [party].md files are stored
        fn         filename of the [party].md file
        party      name of the party (in case it is spelled differently than the filename)
        clf        manifestoproject classifier - see classifier.py
        max_txts   maximal number of texts; subsamples max_txts segments if there are more
    OUTPUT:
        preds      pandas DataFrame with predictions, texts and party as columns
    '''
    content = list(read_md(os.path.join(folder, fn)))
    if len(content) > max_txts:
        content = random.sample(content, max_txts)
    preds = clf.predictBatch(content)
    # keep only those manifesto codes that the classifier actually returned
    manifesto_codes = list(set(manifestolabels().values()).intersection(set(preds.columns.tolist())))
    preds['max_manifesto'] = preds[manifesto_codes].idxmax(axis=1)
    preds['max_domain'] = preds[list(label2domain.keys())].idxmax(axis=1)
    preds['max_leftright'] = preds[['left', 'right']].idxmax(axis=1)
    preds['content'] = content
    preds['party'] = party
    return preds
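
# Usage sketch (illustrative; Classifier and predictBatch are provided by classifier.py):
#   clf = Classifier(train=True)
#   preds = classify_br(MANIFESTO_FOLDER, "spd.md", "SPD", clf)
#   preds[['content', 'max_domain', 'max_leftright']].head()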

def compute_most_distant_statements_per_topic(preds, n_most_distant=5, folder=MANIFESTO_FOLDER):
    '''
    Computes for each topic and party the text segments that are most distant
    to the average text segment of all other parties. These can be interpreted
    as 'characteristic statements' of a party
    INPUT:
        preds           predictions obtained by classify_br
        n_most_distant  number of 'characteristic' text segments to choose
        folder          folder in which to store the results
    '''
    # BoW extraction
    tf = TfidfVectorizer().fit(preds.content)
    preds['tf_idf'] = preds.content.apply(lambda x: tf.transform([x]))
    most_distant_statements = []
    for domain in domains:
        for party in [x[0] for x in partyFiles]:
            # find statements of this party
            this_party = (preds.party == party) & (preds.max_domain == domain)
            # find statements of the other parties
            other_parties = (preds.party != party) & (preds.max_domain == domain)
            # stack BoW features for this party
            partyVecs = sp.sparse.vstack(preds[this_party]['tf_idf'])
            # .values allows positional indexing below, independent of the DataFrame index
            partyTexts = preds[this_party]['content'].values
            # stack the other parties' BoW vectors and take their average
            otherVec = sp.sparse.vstack(preds[other_parties]['tf_idf']).mean(axis=0)
            # compute the L1 distance between each statement of this party and that average
            dists = np.array(abs(partyVecs - otherVec).sum(axis=1)).flatten()
            # find and store the 'characteristic' text segments, largest distance first
            most_distant = [(partyTexts[idx], dists[idx]) for idx in dists.argsort()[-n_most_distant:][::-1]]
            most_distant_statements.extend([(party, domain, m, d) for m, d in most_distant])
    # store results as DataFrame
    most_distant_statements_df = pd.DataFrame(
        most_distant_statements,
        columns=['party', 'domain', 'most_distant_to_other_parties', 'distance'])
    most_distant_statements_df = most_distant_statements_df.sort_values(by=['party', 'domain'])
    most_distant_statements_df.to_csv(os.path.join(folder, 'most_distant_statements_per_topic.csv'), index=False)
    del preds['tf_idf']
    return most_distant_statements_df
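
# Usage sketch (illustrative; df as assembled in plotAll from all parties):
#   stmts = compute_most_distant_statements_per_topic(df, n_most_distant=5)
#   stmts[stmts.party == 'Die Linke']  # characteristic statements of one party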

def plotAll():
    '''
    Runs the analysis for BR:
    - classifies text segments per party
    - creates violin plots for each topic
    - computes the most 'characteristic' text segments for each party
    '''
    predictions = []
    colors = []
    clf = Classifier(train=True)
    for party, fn, color in partyFiles:
        predictions.append(classify_br(MANIFESTO_FOLDER, fn, party, clf))
        colors.append(color)
    df = pd.concat(predictions)
    df.to_csv(os.path.join(RESULT_FOLDER, "results.csv"))
    # compute most distant statements per topic; the return value is discarded as it is dumped to csv
    _ = compute_most_distant_statements_per_topic(df)
    compute_word_code_correlations(df)
    plot_left_right(df.copy(), colors, plot_suffix='all_domains')
    for domain in domains:
        # get rows containing statements for this topic across all parties
        idx = df[domains].idxmax(axis=1) == domain
        plot_left_right(df[idx].copy(), colors, plot_suffix=domain)

def plot_left_right(df,
                    colors,
                    plot_column='right',
                    grouping_column='party',
                    folder=RESULT_FOLDER,
                    plot_suffix=""):
    '''
    Creates a violin plot of the left-right index per party and saves it as html
    '''
    # median-center the per-domain 'right' position
    df[plot_column] = df[plot_column] - df[plot_column].median()
    ax = sns.violinplot(x=plot_column, y=grouping_column,
                        data=df[[plot_column, grouping_column]],
                        palette=sns.color_palette(colors),
                        split=True, scale="count", inner="stick", saturation=0.5)
    ax.set_xlim([0, 1])
    ax.set_xticks(np.arange(0, 1, .1))
    ax.set_xlabel("links-rechts Index")
    ax.set_ylabel("Partei")
    ax.set_facecolor("white")
    ax.set_title(plot_suffix)
    output_file(os.path.join(folder, "violinPlot-%s.html" % plot_suffix))
    show(mpl.to_bokeh())
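
# Note: bokeh.mpl and to_bokeh() belong to bokeh's old matplotlib compatibility
# layer, which was removed in later bokeh releases; this script assumes a bokeh
# version that still ships them.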

def compute_word_code_correlations(preds, folder=RESULT_FOLDER):
    '''
    Computes correlations between single words and manifesto codes
    see also http://www.sciencedirect.com/science/article/pii/S1053811913010914
    '''
    tf = TfidfVectorizer(max_df=.2).fit(preds.content)
    preds['tf_idf'] = preds.content.apply(lambda x: tf.transform([x]))
    labels = list(set(manifestolabels().values()).intersection(set(preds.columns.tolist()))) + domains + ['right']
    idx2word = {v: k for k, v in tf.vocabulary_.items()}
    X = sp.sparse.vstack(preds.tf_idf)
    Xw = preds[labels].values
    # project each word's tf-idf scores onto the label scores; np.asarray guards
    # against scipy returning a matrix instead of an ndarray
    XXw = np.asarray(X.T.dot(Xw))
    correlations = {}
    for idx, mc in enumerate(labels):
        correlations[mc] = [idx2word[widx] for widx in XXw[:, idx].argsort()[::-1]]
        print("Most correlated words for {}:\n{}\n\n".format(mc, ", ".join(correlations[mc][:50])))
    # write results to json file
    with open(os.path.join(folder, "word_correlations.json"), "w") as f:
        json.dump(correlations, f)
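
# The resulting json maps each label to the full vocabulary, sorted by how strongly
# each word's tf-idf scores co-vary with that label, e.g. (hypothetical entries):
#   {"right": ["sicherheit", "grenzen", ...], "Economy": ["wirtschaft", ...], ...}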

def sort_by_word_importance(text, label='right', folder=RESULT_FOLDER):
    '''
    Prints the words of a text sorted by their relevance for a given label
    '''
    # read word correlations
    with open(os.path.join(folder, "word_correlations.json")) as f:
        correlations = json.load(f)
    words = [re.sub('[!?,.-]', '', w) for w in text.lower().split(" ")]
    text_sorted_by_relevance = [w for w in correlations[label] if w in words]
    print("Original text:\n{}\n\n".format(text))
    print("Words sorted by relevance for the classifier:\n{}\n\n".format(" ".join(text_sorted_by_relevance)))

if __name__ == "__main__":
    plotAll()