-
Notifications
You must be signed in to change notification settings - Fork 0
/
preprocessing.py
113 lines (82 loc) · 3.23 KB
/
preprocessing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
import numpy as np
import pandas as pd
import os
def _drop_rows_with_ids(votes, column, bad_ids):
    """Remove every row of ``votes`` whose ``column`` entry occurs in ``bad_ids``.

    :param votes: 2-D numpy array of vote records
    :param column: index of the column that holds the ids to test
    :param bad_ids: array-like of ids whose rows are discarded
    :return: tuple ``(filtered_votes, n_removed)``; ``filtered_votes`` is a copy
    """
    # One vectorised membership test instead of a Python-level scan per row.
    hit = np.isin(votes[:, column], bad_ids)
    return votes[~hit], int(np.count_nonzero(hit))


def preprossessing(fromPolis = False, onCluster = False, path = ""):
    """
    Load a vote table, reduce it to the relevant columns and return it as an
    integer numpy array.

    With ``fromPolis`` the votes are read from a polis open-data export,
    sorted by ``timestamp``, stripped of the ``datetime``/``timestamp``
    columns and then filtered: votes on comments whose ``moderated`` flag is
    -1 and votes of participants with ``n-votes`` <= 5 are removed. The number
    of removed rows is printed for each filter.

    Without ``fromPolis`` the file ``data/vote_hist_<path>.csv`` is read, its
    four columns are renamed to ``idx, comment-id, voter-id, vote`` and the
    ``idx`` column is dropped.

    :param fromPolis: bool, true if data shall be taken from the polis dataset
    :param onCluster: bool, true if program is used on the cluster; the polis
        data is then read from ``openData/`` instead of ``../polis/openData/``
    :param path: dataset selector. For polis data the name of the dataset
        directory (default ``"15-per-hour-seattle"``). Otherwise the suffix of
        the ``vote_hist_`` file, given as string or number; a string that
        already starts with ``vote_hist_`` is used as the file stem as-is
        (default ``"vote_hist_71"``).
    :return: tuple ``(votes, path)`` with the integer vote array and the
        (possibly defaulted / stringified) ``path``
    """
    if fromPolis:
        if path == "":
            path = "15-per-hour-seattle"
        # Other dataset directory names that have been used with this loader:
        #   american-assembly.bowling-green, brexit-consensus,
        #   canadian-electoral-reform, football-concussions,
        #   march-on.operation-marchin-orders,
        #   scoop-hivemind.affordable-housing, scoop-hivemind.biodiversity,
        #   scoop-hivemind.freshwater, scoop-hivemind.ubi,
        #   scoop-hivemind.taxes, vtaiwan.uberx
        base_dir = "openData/" if onCluster else "../polis/openData/"
        data_dir = base_dir + path

        votes = pd.read_csv(data_dir + "/votes.csv")
        votes = votes.sort_values(by = 'timestamp')
        votes = votes.drop(['datetime', 'timestamp'], axis = 1)
        # NOTE(review): the filters below assume the remaining columns start
        # with comment id (column 0) and participant id (column 1) - confirm
        # against the export format.
        df = votes.values.astype(int)

        ### deleting all moderated comments
        comments = pd.read_csv(data_dir + "/comments.csv")
        bad_cmt_id = comments.loc[comments['moderated'] == -1, 'comment-id'].values
        df, n_removed = _drop_rows_with_ids(df, 0, bad_cmt_id)
        print("n_cleaned cmt ", n_removed)

        ### delete all people which aren't included in the clustering
        participants = pd.read_csv(data_dir + "/participants-votes.csv")
        bad_pts_id = participants.loc[participants['n-votes'] <= 5, 'participant'].values
        df, n_removed = _drop_rows_with_ids(df, 1, bad_pts_id)
        print("n_cleaned pts ", n_removed)
    else:
        if path == "":
            path = "vote_hist_71"
        elif not isinstance(path, str):
            # Numeric selectors (plain or numpy ints) are accepted as well.
            path = str(path)
        print(path)
        # The default already carries the "vote_hist_" prefix; prepending it
        # unconditionally would look for data/vote_hist_vote_hist_71.csv.
        prefix = "vote_hist_"
        stem = path if path.startswith(prefix) else prefix + path
        df = pd.read_csv("data/" + stem + '.csv')
        df.columns = ['idx', 'comment-id', 'voter-id', 'vote']
        df = df.drop('idx', axis = 1)
        df = df.values
    df = df.astype(int)
    return df, path
def get_all_sub_dir(directory='openData'):
    """
    List the immediate sub-directories of ``directory``, leaving out ``.git``.

    :param directory: directory to inspect; defaults to ``openData`` relative
        to the current working directory
    :return: list of sub-directory names (not full paths) in the order
        reported by ``os.walk``; empty if ``directory`` does not exist or
        cannot be read
    """
    # os.walk yields nothing for a missing/unreadable directory, so give
    # next() a default instead of letting StopIteration escape.
    _, sub_dirs, _ = next(os.walk(directory), (directory, [], []))
    # Filter rather than list.remove(): a checkout without .git must not raise.
    return [name for name in sub_dirs if name != ".git"]
if __name__ == "__main__":
    # Run as a script: show the dataset directories that are available.
    available_datasets = get_all_sub_dir()
    print(available_datasets)