# ctr_funcs.py

import tensorflow as tf
import numpy as np
import datetime
from sklearn import metrics

def cal_auc(pred_score, label):
    fpr, tpr, thresholds = metrics.roc_curve(label, pred_score, pos_label=1)
    auc_val = metrics.auc(fpr, tpr)
    return auc_val, fpr, tpr
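
# Usage sketch (illustrative numbers):
#   auc_val, fpr, tpr = cal_auc([0.9, 0.2, 0.7, 0.4], [1, 0, 1, 0])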

def cal_rmse(pred_score, label):
    mse = metrics.mean_squared_error(label, pred_score)
    rmse = np.sqrt(mse)
    return rmse

def cal_rectified_rmse(pred_score, label, sample_rate):
    # rectify predictions made on negative-downsampled data: with a fraction
    # sample_rate of negatives kept, a raw score p is calibrated to
    # p / (p + (1-p)/sample_rate); note pred_score is modified in place
    for idx, item in enumerate(pred_score):
        pred_score[idx] = item/(item + (1-item)/sample_rate)
    mse = metrics.mean_squared_error(label, pred_score)
    rmse = np.sqrt(mse)
    return rmse
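
# Worked example: with sample_rate = 0.1 (1 in 10 negatives kept), a raw
# score of 0.5 is rectified to 0.5 / (0.5 + 0.5/0.1) = 0.5/5.5 ~= 0.0909.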

# flatten a 2-D list (list of lists) into a 1-D list; only works for 2-D input
def list_flatten(input_list):
    output_list = [yy for xx in input_list for yy in xx]
    return output_list
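
# Usage sketch: list_flatten([[1, 2], [3], [4, 5]]) returns [1, 2, 3, 4, 5]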

def count_lines(file_name):
    # use a context manager so the file handle is closed promptly
    with open(file_name, 'rt') as f:
        num_lines = sum(1 for line in f)
    return num_lines

# this function is written specifically for the Avito data
def tf_read_data(file_name_queue, label_col_idx, record_defaults):
    reader = tf.TextLineReader()
    key, value = reader.read(file_name_queue)
    # record_defaults supplies values for empty columns and also fixes the
    # type of each decoded column
    cols = tf.decode_csv(value, record_defaults=record_defaults)
    # from here on the data can only be processed with tf ops
    label = cols.pop(label_col_idx)
    feature = cols
    # retrieve a single (feature, label) instance
    return feature, label
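
# Note: TextLineReader and the queue-based readers used here belong to the
# TF 1.x input API, which was superseded by tf.data in TF 2.x.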

# column defaults for the training CSV: 141 columns, all int except
# column 0 (the label), which is decoded as a float
record_defaults = [[0]]*141
record_defaults[0] = [0.0]

def tf_input_pipeline(file_names, batch_size, num_epochs=1, label_col_idx=0, record_defaults=record_defaults):
    # shuffle over files
    file_name_queue = tf.train.string_input_producer(file_names, num_epochs=num_epochs, shuffle=True)
    feature, label = tf_read_data(file_name_queue, label_col_idx, record_defaults)
    # min_after_dequeue defines how big a buffer we will randomly sample from;
    # capacity must be larger than min_after_dequeue, and the difference bounds
    # how much we will prefetch
    min_after_dequeue = 5000
    capacity = min_after_dequeue + 3*batch_size
    feature_batch, label_batch = tf.train.shuffle_batch(
        [feature, label], batch_size=batch_size,
        capacity=capacity, min_after_dequeue=min_after_dequeue)
    return feature_batch, label_batch
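
# Usage sketch (TF 1.x queue runners; the file name is hypothetical).
# local_variables_initializer is needed because num_epochs creates a local
# epoch counter:
#   feature_batch, label_batch = tf_input_pipeline(['train.csv'], batch_size=128)
#   with tf.Session() as sess:
#       sess.run([tf.global_variables_initializer(), tf.local_variables_initializer()])
#       coord = tf.train.Coordinator()
#       threads = tf.train.start_queue_runners(sess=sess, coord=coord)
#       try:
#           while not coord.should_stop():
#               features, labels = sess.run([feature_batch, label_batch])
#       except tf.errors.OutOfRangeError:
#           pass  # num_epochs reached
#       finally:
#           coord.request_stop()
#           coord.join(threads)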

def tf_input_pipeline_test(file_names, batch_size, num_epochs=1, label_col_idx=0, record_defaults=record_defaults):
    # shuffle over files only
    file_name_queue = tf.train.string_input_producer(file_names, num_epochs=num_epochs, shuffle=True)
    feature, label = tf_read_data(file_name_queue, label_col_idx, record_defaults)
    # unlike the training pipeline, tf.train.batch does no example-level
    # shuffling; capacity only bounds how much we prefetch
    capacity = 5000 + 3*batch_size
    feature_batch, label_batch = tf.train.batch(
        [feature, label], batch_size=batch_size, capacity=capacity)
    return feature_batch, label_batch
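
# For reference, a rough tf.data equivalent of the two pipelines above
# (an untested sketch, assuming TF >= 1.4):
#   def tf_data_pipeline(file_names, batch_size, num_epochs=1, shuffle=True):
#       dataset = tf.data.TextLineDataset(file_names)
#       dataset = dataset.map(lambda line: tf.decode_csv(line, record_defaults))
#       if shuffle:
#           dataset = dataset.shuffle(buffer_size=5000)
#       return dataset.repeat(num_epochs).batch(batch_size)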

time_style = '%Y-%m-%d %H:%M:%S'

def print_time():
    # print the current time, e.g. '2019-05-01 12:00:00'
    now = datetime.datetime.now()
    time_str = now.strftime(time_style)
    print(time_str)