-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathdataset.py
108 lines (91 loc) · 3.48 KB
/
dataset.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
import math, csv
from tensorflow.feature_column import categorical_column_with_hash_bucket, \
categorical_column_with_identity, crossed_column
from tensorflow.estimator.inputs import pandas_input_fn
import pandas as pd
# Time span, in seconds, that the __main__ check divides by the partition
# interval to estimate how many partitions to expect.
# NOTE(review): presumably the largest click timestamp in the Criteo data --
# confirm against the dataset.
CRITEO_MAX_SECONDS = 5270499
"""
We should make the FCs (feature columns) editable as an experiment parameter.
But basically we have [c1...cn, i1...in, c1c2...cn-1cn, c1i1...cnin],
where the c1 and cnin hash sizes are ~1e6 and the i1 max value is 10000.
I haven't examined the integer columns to determine whether some are numeric.
I also do not know the max value to use, though it would be easy to compute.
"""
# Names of the nine categorical and eight integer Criteo feature columns.
categorical_names = ['c1', 'c2', 'c3', 'c4', 'c5', 'c6', 'c7', 'c8', 'c9']
integer_names = ['i1', 'i2', 'i3', 'i4', 'i5', 'i6', 'i7', 'i8']

# Categorical features are hashed into a fixed number of buckets. The size is
# written as an int (not the float 1e5) because the feature-column API
# documents hash_bucket_size as an integer.
criteo_features = [
    categorical_column_with_hash_bucket(name, hash_bucket_size=100000)
    for name in categorical_names
]

# Integer features are used directly as category ids in [0, 7500); per the
# feature-column docs, out-of-range ids are replaced by default_value (0).
criteo_features += [
    categorical_column_with_identity(name, num_buckets=7500, default_value=0)
    for name in integer_names
]
def cross_all_columns():
    """
    Build one crossed feature column for every unordered pair of columns.

    Pairs are drawn from ``categorical_names + integer_names``. Each pair is
    built exactly once (``[a, b]`` but never ``[b, a]``) and a column is never
    crossed with itself, since a self-cross adds nothing over the base column.

    :return: list of crossed columns, each hashed into 1,000,000 buckets
    """
    names = categorical_names + integer_names
    crossed = []
    for position, first in enumerate(names):
        # Pair only with later names so each unordered pair appears once.
        for second in names[position + 1:]:
            # Bucket size is an int; the API documents hash_bucket_size as int.
            crossed.append(crossed_column([first, second], 1000000))
    return crossed
# Crossing every column pair takes a very long time, as expected.
# Maybe try PCA or manual analysis,
# or another model.
#criteo_features += cross_all_columns()
def _partition_input_fn_factory(int_rows, cat_rows, labels):
    """
    Build the feature/label frames for one partition and return a factory
    that wraps them in ``pandas_input_fn``.

    The frames are locals of this call, so every returned factory keeps its
    own partition even if it is invoked after later partitions were built.

    :param int_rows: list of rows of integer features (``None`` for missing)
    :param cat_rows: list of rows of categorical feature strings
    :param labels: list of booleans, one per row
    :return: callable taking a dict of keyword options for pandas_input_fn
    """
    dfi = pd.DataFrame.from_records(int_rows, columns=integer_names)
    ### if interpreted numerically
    ### dfi = dfi.fillna(dfi.mean())
    # Missing values become -1, then everything is shifted by 2 so N/A == 1.
    dfi = dfi.fillna(int(-1))
    dfi += 2
    dfi = dfi.astype(int)
    dfc = pd.DataFrame.from_records(cat_rows, columns=categorical_names)
    dfy = pd.Series(labels)
    dfx = pd.concat(objs=[dfi, dfc], axis=1)
    return lambda options: pandas_input_fn(dfx, dfy, **options)


def criteo_partition_fn(filename, interval_secs):
    """
    Split a Criteo TSV file into time partitions and yield one input-fn
    factory per partition.

    Criteo Columns:
    Click Time, Conversion Time, 8 Integers, 9 Categoricals

    Rows are consumed in file order. A row's partition number is
    ``floor(click_time / interval_secs)``; whenever it exceeds the previous
    row's, the rows buffered so far are emitted as one partition. The rows
    still buffered at end of file are emitted as the final partition.
    Blank rows are skipped and empty partitions are never emitted.

    :param filename: path to the tab-separated data file
    :param interval_secs: width of one partition, in seconds of click time
    :return: generator of callables; each takes a dict of keyword options and
        returns ``pandas_input_fn(features, labels, **options)`` for its
        partition. The label is True when the conversion-time field is
        non-empty.
    """
    index = 0
    int_rows = []
    cat_rows = []
    labels = []
    with open(filename) as fh:
        for row in csv.reader(fh, delimiter="\t"):
            if not row:
                # csv yields [] for blank lines; nothing to parse.
                continue
            current_partition = int(math.floor(int(row[0]) / interval_secs))
            # Flush only when there is something buffered, so a file whose
            # first row lies past interval 0 does not emit an empty partition.
            if index < current_partition and labels:
                yield _partition_input_fn_factory(int_rows, cat_rows, labels)
                int_rows = []
                cat_rows = []
                labels = []
            int_rows.append(
                [None if value == '' else int(value) for value in row[2:10]]
            )
            cat_rows.append(row[10:19])
            labels.append(row[1] != '')
            index = current_partition
    # Emit the trailing partition, which no later row would otherwise flush.
    if labels:
        yield _partition_input_fn_factory(int_rows, cat_rows, labels)
if __name__ == "__main__":
    # Smoke check: walk the partitions of the local data file and print a
    # running index next to the partition count the time span suggests.
    interval_secs = 500000
    max_sb = int(math.ceil(CRITEO_MAX_SECONDS / interval_secs))
    partitions = criteo_partition_fn("data/data.txt", interval_secs)
    for i, _ in enumerate(partitions):
        print("i: %s, interval: %s, max_should_be: %s" % (i, interval_secs, max_sb))