Skip to content

Commit 7e89d92

Browse files
committedOct 24, 2018
Learn pandas day4
第十六课 多表联合(Join) 第十七课 数据校验,轴的概念 第十八课 把一列数据分割成两列 第十九课 求和,求平均,统计导引 第二十课 定位、消除重复数据
1 parent 908b573 commit 7e89d92

File tree

12 files changed

+331
-108
lines changed

12 files changed

+331
-108
lines changed
 

‎.idea/workspace.xml

+166-108
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

‎14&15-ScatterPlot&Histogram&DensityMap/ScatterPlot&Histogram&DensityMap.py

+1
Original file line numberDiff line numberDiff line change
@@ -25,3 +25,4 @@
2525
plt.xticks(range(0,max(homes.sqft_living),500),fontsize=8,rotation=90)
2626
plt.show()
2727

28+

‎16-Join/Join.py

+47
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
# pandasVersusExcel
2+
# http://sa.mentorx.net/course/89/tasks
3+
# 第十六课 多表联合(Join)
4+
# 2018-10-24
5+
6+
import pandas as pd
7+
8+
# Load both sheets of the workbook, using the 'ID' column of each as its index.
students = pd.read_excel('./Student_Score.xlsx', sheet_name='Students', index_col='ID')
scores = pd.read_excel('./Student_Score.xlsx', sheet_name='Scores', index_col='ID')
print('----原始数据----')
print('\n----Students----')
print(students)
print(students.columns)
print('\n----Scores----')
print(scores)
print(scores.columns)

# Combined query

# Method 1: DataFrame.merge
# how: type of join
#     inner (default for merge) - inner join
#     left  - left join
#     right - right join
# on: key to join on (if omitted, merge uses the column names the two frames
#     have in common as the key; it does not compare the index)
# left_on / right_on: specify the join key of each table separately
# fillna(0): replace NaN with 0
table1 = students.merge(scores, how='left', on='ID').fillna(0)
table1.Score = table1.Score.astype(int)  # convert the fractional Score values to integers
# The label names the join type that is actually requested above (how='left').
print('\n----联合查询 方法一(left join)----')
print(table1)

# Method 2: DataFrame.join
# how: type of join
#     left (default for join) - left join
#     right - right join
#     inner - inner join
# on: key to join on (when the frames were loaded with index_col and this is
#     omitted, join matches the rows on that index)
# fillna(0): replace NaN with 0
table2 = students.join(scores, how='left', on='ID').fillna(0)
table2.Score = table2.Score.astype(int)  # convert the fractional Score values to integers
# The label names the join type that is actually requested above (how='left').
print('\n----联合查询 方法二(left join)----')
print(table2)
44+
45+
46+
47+

‎16-Join/Student_Score.xlsx

11.1 KB
Binary file not shown.

‎17-DataValidation/DataValidation.py

+30
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
# pandasVersusExcel
2+
# http://sa.mentorx.net/course/89/tasks
3+
# 第十七课 数据校验,轴的概念
4+
# 2018-10-24
5+
6+
import pandas as pd
7+
8+
# 方法一
9+
def score_validation(row):
    """Print a message when a row's Score is outside the range 0-100 (inclusive).

    Args:
        row: An object exposing ``ID``, ``Name`` and ``Score`` attributes,
            e.g. one row passed in by ``DataFrame.apply(..., axis=1)``.

    Returns:
        None. Invalid rows are reported on standard output.
    """
    # An explicit comparison is used instead of ``assert``: assertions are
    # removed when Python runs with -O, which would silently disable the check.
    try:
        is_valid = 0 <= row.Score <= 100
    except TypeError:
        # A score that cannot be compared with numbers (text, None, ...) is
        # reported as invalid as well, instead of aborting the whole run.
        is_valid = False
    if not is_valid:
        print(f'#{row.ID}\tstudent {row.Name} has an invalid score {row.Score}.')
14+
15+
# 方法二
16+
def score_validation2(row):
    """Print a message when a row's Score is outside the range 0-100 (inclusive).

    Args:
        row: An object exposing ``ID``, ``Name`` and ``Score`` attributes,
            e.g. one row passed in by ``DataFrame.apply(..., axis=1)``.

    Returns:
        None. Invalid rows are reported on standard output.
    """
    try:
        if 0 <= row.Score <= 100:
            return  # valid score: nothing to report
    except TypeError:
        # The score cannot be compared with numbers (text, None, ...); fall
        # through and report it rather than aborting the whole validation.
        pass
    print(f'#{row.ID}\tstudent {row.Name} has an invalid score {row.Score}.')
19+
20+
# Do not set index_col when validating data: keeping every column as regular
# data helps make sure that all of the data gets validated.
students = pd.read_excel('./Students.xlsx')
print('----原始数据----')
print(students)
print(students.columns)

print('\n----校验结果----')
# Call the validator once per row.
students.apply(score_validation, axis='columns')

# axis = 1 (or 'columns'): horizontal
# axis = 0 (or 'index'):   vertical (default)

‎17-DataValidation/Students.xlsx

9.58 KB
Binary file not shown.
+25
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
# pandasVersusExcel
2+
# http://sa.mentorx.net/course/89/tasks
3+
# 第十八课 把一列数据分割成两列
4+
# 2018-10-24
5+
6+
import pandas as pd
7+
8+
employees = pd.read_excel('./Employees.xlsx', index_col='ID')
# Split every full name on whitespace; expand=True returns the pieces as
# separate columns (0, 1, ...) of a new DataFrame.
df = employees['Full Name'].str.split(expand=True)
print('----原始数据----')
print(employees)
print(employees.columns)
print(df)

employees['First Name'] = df[0]
# The last name is the second piece of the split, not the first one again.
employees['Last Name'] = df[1]
print('\n----分割后的结果----')
print(employees)

# Supplement
# The split() method:
# split(' ', n=0, expand=True)
# 1st parameter of split: the separator; defaults to whitespace (space or tab)
# 2nd parameter of split, n: the maximum number of splits (0 or -1 means split into as many pieces as possible)
# 3rd parameter of split, expand: defaults to False (False: the pieces form a list kept in one column; True: the pieces become separate columns)

‎18-DataSegmentation/Employees.xlsx

10.1 KB
Binary file not shown.

‎19-Statistics/Statistics.py

+39
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
# pandasVersusExcel
2+
# http://sa.mentorx.net/course/89/tasks
3+
# 第十九课 求和,求平均,统计导引
4+
# 2018-10-24
5+
6+
import pandas as pd
7+
8+
students = pd.read_excel('./Students.xlsx', index_col='ID')
print('----原始数据----')
print(students)
print(students.columns)

# Only the three test columns take part in the calculations.
temp = students[['Test_1', 'Test_2', 'Test_3']]
print('\n----需要计算的元数据----')
print(temp)

# Sum of each row
raw_sum = temp.sum(axis=1)
print('\n----求和----')
print(raw_sum)

# Mean of each row
raw_mean = temp.mean(axis=1)
print('\n----求平均值----')
print(raw_mean)

students['Total'] = raw_sum
students['Average'] = raw_mean
print('\n----整合结果----')
print(students)

# Column-wise means, labelled 'Summary', are added as one extra row.
col_mean = students[['Test_1', 'Test_2', 'Test_3', 'Total', 'Average']].mean()
col_mean['Name'] = 'Summary'
# DataFrame.append was removed in pandas 2.0; pd.concat with a one-row frame
# is the equivalent. infer_objects() turns the numeric cells of that row back
# into numeric dtypes. ignore_index=True renumbers the rows, as before.
students = pd.concat([students, col_mean.to_frame().T.infer_objects()], ignore_index=True)
print('\n----最终结果----')
print(students)

# axis = 1: horizontal
# axis = 0: vertical (default)

‎19-Statistics/Students.xlsx

10.5 KB
Binary file not shown.

‎20-DuplicateData/DuplicateData.py

+23
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
# pandasVersusExcel
2+
# http://sa.mentorx.net/course/89/tasks
3+
# 第二十课 定位、消除重复数据
4+
# 2018-10-24
5+
6+
import pandas as pd
7+
8+
students = pd.read_excel('./Students_Duplicates.xlsx')
print('----原始数据----')
print(students)
print(students.columns)

# Boolean Series aligned with students: True marks a row whose Name repeats
# an earlier row (pandas' default keep='first').
dupe = students.duplicated(subset='Name')
print('\n----检查重复数据(True为重复)----')
print(dupe)

dupe = dupe[dupe]  # keep only the duplicated rows; same as dupe = dupe[dupe == True]
print('\n----查看重复数据----')
# dupe.index holds index labels, so select with the label-based .loc; the
# positional .iloc only gives the same rows while the index is 0..n-1.
print(students.loc[dupe.index])

students.drop_duplicates(subset='Name', inplace=True)
print('\n----消除重复数据后的数据----')
print(students)
10.7 KB
Binary file not shown.

0 commit comments

Comments
 (0)
Please sign in to comment.