-
Notifications
You must be signed in to change notification settings - Fork 0
/
explore_data.py
101 lines (83 loc) · 2.77 KB
/
explore_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
#-*- coding: utf-8 -*-
import pandas as pd
import numpy as np
# Locations of the CSV inputs, given relative to the current working directory.
ACTION_201602_FILE = "data_ori/JData_Action_201602.csv"
ACTION_201603_FILE = "data_ori/JData_Action_201603.csv"
ACTION_201603_EXTRA_FILE = "data_ori/JData_Action_201603_extra.csv"
ACTION_201604_FILE = "data_ori/JData_Action_201604.csv"
COMMENT_FILE = "data_ori/JData_Comment.csv"
PRODUCT_FILE = "data_ori/JData_Product.csv"
# NOTE(review): USER_FILE and NEW_USER_FILE are the same path, so
# tranform_user_age() overwrites the very file it reads. A second run would
# read the already-numeric age column and convert_age() would map every value
# to -1. Presumably USER_FILE should point at the raw user file instead
# (e.g. "data_ori/JData_User.csv") -- TODO confirm against the data directory.
USER_FILE = "data_ori/JData_User_New.csv"
NEW_USER_FILE = "data_ori/JData_User_New.csv"
# Display format: pandas prints floats with thousands separators and three
# decimal places.
pd.options.display.float_format = '{:,.3f}'.format
def convert_age(age_str):
    """Translate an age-range label into its integer bucket code.

    The recognised labels and their codes are:

    * ``'-1'``      -> -1
    * ``'15岁以下'`` -> 0
    * ``'16-25岁'``  -> 1
    * ``'26-35岁'``  -> 2
    * ``'36-45岁'``  -> 3
    * ``'46-55岁'``  -> 4
    * ``'56岁以上'`` -> 5

    Any other value, including non-string input, yields -1.
    """
    # Checked top to bottom with ``==``; the first matching label wins.
    buckets = (
        (u'-1', -1),
        (u'15岁以下', 0),
        (u'16-25岁', 1),
        (u'26-35岁', 2),
        (u'36-45岁', 3),
        (u'46-55岁', 4),
        (u'56岁以上', 5),
    )
    for label, code in buckets:
        if age_str == label:
            return code
    # Unrecognised label: same code as the explicit '-1' marker.
    return -1
def tranform_user_age():
    """Encode the user table's age labels and add a registration-day offset.

    Reads ``USER_FILE`` (CSV with a header row, decoded as GBK), then:

    * replaces the ``age`` column with the integer codes produced by
      ``convert_age``;
    * parses ``user_reg_tm`` into datetimes;
    * adds ``user_reg_diff``, the number of whole days between each
      registration time and the earliest one in the file.

    The result is written to ``NEW_USER_FILE`` without the index column.

    NOTE: when ``USER_FILE`` and ``NEW_USER_FILE`` name the same path, the
    input file is overwritten in place.
    """
    # header=0: the first row of the file holds the column names.
    df = pd.read_csv(USER_FILE, header=0, encoding="gbk")

    df['age'] = df['age'].map(convert_age)
    df['user_reg_tm'] = pd.to_datetime(df['user_reg_tm'])

    # Series.min() skips missing timestamps (NaT). The builtin min() compares
    # element by element and can return NaT when a registration date is
    # missing, which would turn every offset into NaN.
    earliest = df['user_reg_tm'].min()
    df['user_reg_diff'] = (df['user_reg_tm'] - earliest).dt.days

    df.to_csv(NEW_USER_FILE, index=False)
def explore_user():
    """Print a quick overview of the user table stored in ``NEW_USER_FILE``.

    Loads the CSV (first row is the header) and prints, in order: the first
    five rows, the ``describe()`` summary statistics, and the dtype of every
    column.
    """
    users = pd.read_csv(NEW_USER_FILE, header=0)

    # Leading rows; users.tail(10) would show the last ten rows instead.
    preview = users.head(5)
    print(preview)

    # Basic statistical summary.
    summary = users.describe()
    print(summary)

    # Type of each column.
    column_types = users.dtypes
    print(column_types)
def explore_action_02(chunk_size=100000):
    """Load ``ACTION_201602_FILE`` in chunks and print a short overview.

    The CSV is parsed ``chunk_size`` rows at a time, and the pieces are then
    concatenated into a single DataFrame with a fresh 0..n-1 index. Prints the
    first five rows, the column dtypes, and every row whose ``user_id`` equals
    27630.

    Args:
        chunk_size: Maximum number of rows parsed per chunk.
    """
    # Number of Record: 18117303
    reader = pd.read_csv(ACTION_201602_FILE, header=0, iterator=True,
                         chunksize=chunk_size)
    try:
        # Iterating the reader yields one DataFrame per chunk and stops once
        # the file is exhausted.
        chunks = list(reader)
    finally:
        # Release the underlying file handle even if parsing fails midway.
        reader.close()
    print("Iteration is stopped")

    df = pd.concat(chunks, ignore_index=True)
    print(df.head(5))
    print(df.dtypes)
    print(df[df["user_id"] == 27630])
if __name__ == "__main__":
    # Apply the age mapping to the user table.
    tranform_user_age()
    # Other entry points, enabled by uncommenting:
    # explore_user()
    # explore_action_02()