-
Notifications
You must be signed in to change notification settings - Fork 0
/
manifesto_data.py
85 lines (75 loc) · 3.17 KB
/
manifesto_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
import urllib, itertools, json, os
import urllib.request
import pandas as pd
# Root of the Manifesto Project API; the functions in this module append an
# endpoint path and query string to it.
BASEURL = "https://manifesto-project.wzb.eu/tools"
# Core data set version, sent as the `key` query parameter of api_get_core.
VERSION = "MPDS2016b"
# Placeholder value: replace it with a personal API key string before use.
# It is appended as `api_key` to every request URL built in this module.
APIKEY = "INSERT_APIKEY" #AN API KEY STRING FROM https://manifestoproject.wzb.eu/information/documents/api
# Default country used when selecting manifesto texts from the core data set.
COUNTRY = "Germany"
def get_url(url):
    '''
    Fetch a URL and return the response body as text.

    The response object is used as a context manager so that the
    underlying connection/file handle is closed as soon as the body has
    been read, instead of being left open until garbage collection.

    url: address to request (anything urllib.request.urlopen accepts).
    Returns the body decoded with the default codec of bytes.decode (UTF-8).
    Errors raised by urlopen or by decoding propagate to the caller.
    '''
    with urllib.request.urlopen(url) as response:
        return response.read().decode()
def get_latest_version():
    '''
    Return the id of the latest Corpus version.

    Queries the api_list_metadata_versions endpoint and returns the last
    entry of the 'versions' list in the JSON reply.
    '''
    endpoint = "/api_list_metadata_versions.json?&api_key="
    reply = get_url(BASEURL + endpoint + APIKEY)
    return json.loads(reply)['versions'][-1]
def get_manifesto_id(text_id,version):
    '''
    Look up the manifesto id that belongs to a text id in a given version.

    text_id: key of the text, sent as keys[] to the api_metadata endpoint.
    version: metadata version id, sent as the version parameter.
    Returns the 'manifesto_id' field of the first item in the reply.
    '''
    url_parts = [
        BASEURL,
        "/api_metadata?keys[]=", text_id,
        "&version=", version,
        "&api_key=", APIKEY,
    ]
    reply = get_url("".join(url_parts))
    first_item = json.loads(reply)['items'][0]
    return first_item['manifesto_id']
def get_core(version = VERSION):
    '''
    Downloads core data set, including information about all parties
    https://manifestoproject.wzb.eu/information/documents/api

    version: core data set version id, sent as the `key` query parameter.
             Defaults to the module-level VERSION.
    Returns the parsed JSON reply of the api_get_core endpoint.
    '''
    # Use the argument, not the module constant, so that callers who pass
    # an explicit version actually get that version.
    url = BASEURL + "/api_get_core?key=" + version + "&api_key=" + APIKEY
    return json.loads(get_url(url))
def get_text_keys(country=COUNTRY):
    '''
    Collect the text keys of every core data row belonging to a country.

    A row is selected when its element at index 1 equals `country`; the
    slice [5:7] of each selected row is returned.
    (presumably the party/date pair identifying a manifesto - verify
    against the core data set layout)
    '''
    selected = []
    for row in get_core():
        if row[1] == country:
            selected.append(row[5:7])
    return selected
def get_text(text_id):
    '''
    Retrieves the latest version of the manifesto text with corresponding labels

    text_id: key of the text to download.
    Returns a list of (cmp_code, text) tuples, one per annotated segment,
    or None when the reply does not contain the expected annotated items.
    Errors from the HTTP requests or from JSON parsing are not caught here.
    '''
    # get the latest version of this text
    version = get_latest_version()
    # get the text metadata and manifesto ID
    manifesto_id = get_manifesto_id(text_id, version)
    text_url = (BASEURL + "/api_texts_and_annotations.json?keys[]=" + manifesto_id
                + "&version=" + version + "&api_key=" + APIKEY)
    text_data = json.loads(get_url(text_url))
    try:
        # Only the payload lookups are guarded: a missing key, an empty
        # item list or an unexpected payload type means "no text available".
        segments = text_data['items'][0]['items']
        text = [(t['cmp_code'], t['text']) for t in segments]
    except (KeyError, IndexError, TypeError):
        # Best-effort download: report and signal the miss with None
        # instead of swallowing every exception with a bare except.
        print('Could not get text %s'%text_id)
        return None
    print('Downloaded %d texts for %s'%(len(segments),text_id))
    return text
def get_texts_per_party(country=COUNTRY):
    '''
    Download the annotated texts for every text key of a country.

    Returns a dict mapping "<key[1]>_<key[0]>" to the value returned by
    get_text for that id; ids whose download yielded nothing (None or an
    empty list) are left out.
    '''
    # all key pairs corresponding to a manifesto text in this country
    text_keys = get_text_keys(country)
    # download each text under its combined id
    downloaded = {}
    for key in text_keys:
        text_id = key[1] + "_" + key[0]
        downloaded[text_id] = get_text(text_id)
    # keep only the ids for which something came back
    texts = {}
    for text_id, segments in downloaded.items():
        if segments:
            texts[text_id] = segments
    print("Downloaded %d/%d annotated texts"%(len(texts),len(text_keys)))
    return texts
def get_texts(country=COUNTRY):
    '''
    Return all labelled segments of a country as one flat list.

    The per-text lists from get_texts_per_party are concatenated, and
    segments whose first element (the code) is 'NA' or '0' are dropped.
    '''
    per_party = get_texts_per_party(country)
    labelled = []
    for segment in itertools.chain(*per_party.values()):
        code = segment[0]
        if code == 'NA' or code == '0':
            continue
        labelled.append(segment)
    return labelled
def get_manifesto_texts(country = "Germany",
        folder="data/manifesto",
        min_len=10,
        force_download = False):
    '''
    Load the labelled manifesto texts of a country, downloading on demand.

    A CSV cache named "manifesto-<country>.csv" inside `folder` is read
    when it exists and `force_download` is falsy; otherwise the texts are
    downloaded with get_texts and written to that file.

    country: country whose manifesto texts are wanted.
    folder: directory holding the CSV cache; created if it is missing.
    min_len: only texts whose string form is longer than this are kept.
    force_download: when truthy, ignore an existing cache file and
                    download again.
    Returns a tuple (texts, codes): a list of text strings and the list
    of their cmp_code values converted to int.
    '''
    fn = folder + "/manifesto-%s.csv"%country
    if not force_download and os.path.isfile(fn):
        print("Loading %s"%fn)
        df = pd.read_csv(fn)
    else:
        print("Downloading texts from manifestoproject.")
        manifestotexts = get_texts(country)
        df = pd.DataFrame(manifestotexts,columns=['cmp_code','content'])
        # Writing the cache used to fail on a fresh checkout because the
        # target directory did not exist yet; create it first.
        if folder:
            os.makedirs(folder, exist_ok=True)
        df.to_csv(fn,index=False)
    # str() guards against non-string cells (e.g. NaN read back from the CSV)
    df = df[df.content.apply(lambda x: len(str(x)) > min_len)]
    return df['content'].map(str).tolist(),df['cmp_code'].map(int).tolist()