-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.py
165 lines (141 loc) · 6.15 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
import requests
import parsel
import re
import csv
"""
查询参数:
'ssdm': 'xx' 选择省市
dwmc:xx 直接选择单位
'mldm': 'xx' 门类 专科:zyxw
'yjxkdm': 'xxxx' 学科类别
zymc: 选择专业
xxfs: 1 全日制 2 非全日制
"""
first_post_page = 0
with open('data.csv', mode='w', encoding='utf-8', newline='') as file:
csv_writer = csv.writer(file)
csv_writer.writerow(
['招生单位', '所在地', '研究生院', '自划线院校', '博士点',
'考试方式', '院系所', '专业', '研究方向', '学习方式', '指导教师', '拟招生人数', '跨专业', '备注',
'政治', '考试大纲', '英语', '考试大纲', '数学', '考试大纲', '专业课', '考试大纲',
'政治', '考试大纲', '英语', '考试大纲', '数学', '考试大纲', '专业课', '考试大纲', ])
url = 'https://yz.chsi.com.cn/zsml/queryAction.do'
data = {
'ssdm': '11',
'dwmc': '',
'mldm': '08',
'mlmc': '',
'yjxkdm': '0812',
'zymc': '',
'xxfs': '1',
'pageno': str(first_post_page),
}
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36'
}
response = requests.post(url=url, data=data, headers=headers)
html_data = response.text
selector = parsel.Selector(html_data)
max_page_num = int(selector.css('.zsml-page-box li:nth-last-child(3) a::text').get())
print(max_page_num)
for page in range(1, max_page_num + 1):
print(page)
data = {
'ssdm': '',
'dwmc': '',
'mldm': '08',
'mlmc': '',
'yjxkdm': '0801',
'zymc': '',
'xxfs': '',
'pageno': str(page),
}
response = requests.post(url=url, data=data, headers=headers)
html_data = response.text
selector = parsel.Selector(html_data)
trs = selector.css('.ch-table tbody tr')
for tr in trs:
temp_writer = []
temp_all = []
school = tr.css('a::text').get()
school_url = 'https://yz.chsi.com.cn/' + tr.css('a::attr(href)').get()
city = tr.css('td:nth-child(2)::text').get()
statue_1 = tr.css('td:nth-child(3) i::text').get()
if not statue_1:
statue_1 = ''
else:
statue_1 = '是'
statue_2 = tr.css('td:nth-child(4) i::text').get()
if not statue_2:
statue_2 = ''
else:
statue_2 = '是'
statue_3 = tr.css('td:nth-child(5) i::text').get()
if not statue_3:
statue_3 = ''
else:
statue_3 = '是'
print(school, city, statue_1, statue_2, statue_3)
with open('data.csv', mode='a', encoding='utf-8', newline='') as file:
file.write(school + ',' + city + ',' + statue_1 + ',' + statue_2 + ',' + statue_3 + ',')
temp_subject_Information = []
response = requests.get(url=school_url, headers=headers)
selector = parsel.Selector(response.text)
trs = selector.css('.ch-table tbody tr')
extent = selector.css('.ch-table tbody tr td:nth-child(1)::text').getall()
Number_of_studies = len(extent)
control_number = 1
for tr in trs:
form = tr.css('td:nth-child(1)::text').get()
faculty = tr.css('td:nth-child(2)::text').get()
speciality = tr.css('td:nth-child(3)::text').get()
Research_Direction = tr.css('td:nth-child(4)::text').get()
Styles = tr.css('td:nth-child(5)::text').get()
teacher = tr.css('td:nth-child(6)::text').get().strip()
people = tr.css('td:nth-child(7) script::text').get()
transdisciplinary = tr.css('td:nth-child(9) a::text').get()
remark = tr.css('td:nth-child(10) script::text').get()
# try:
# people = re.findall("专业:.*?',", people, re.S)[0]
# except:
# try:
# people = re.findall("研究方向:.*?,", people, re.S)[0]
# except:
# try:
# people = re.findall("一级学科:.*?,", people, re.S)[0]
# except:
# people = re.findall("院系所:.*?,", people, re.S)[0]
remark = eval(re.findall("cutString(.*?),", remark, re.S)[0].replace('(', ''))
people: str
people = people.split('cutString')[-1].replace("'", '').split(',')[0].strip("(")
syllabus_url = 'https://yz.chsi.com.cn/' + tr.css('td:nth-child(8) a::attr(href)').get()
print(form, faculty, speciality, Research_Direction, Styles, teacher, people, transdisciplinary, remark)
temp_subject_Information.append(
[form, faculty, speciality, Research_Direction, Styles, teacher, people, transdisciplinary, remark])
response = requests.get(url=syllabus_url, headers=headers)
selector = parsel.Selector(response.text)
tds = selector.css('tbody.zsml-res-items td')
for td in tds:
politics = td.css('::text').get().strip()
detail = td.css('span::text').get()
print(politics, detail)
temp_subject_Information.append([politics, detail])
for i in temp_subject_Information:
for k in i:
temp_writer.append(k)
temp_subject_Information.clear()
with open('data.csv', mode='a', encoding='utf-8', newline='') as file:
csv_writer = csv.writer(file)
csv_writer.writerow(temp_writer)
temp_writer.clear()
# 决定是否写入新行
if Number_of_studies == 1:
pass
else:
if control_number == Number_of_studies:
pass
else:
with open('data.csv', mode='a', encoding='utf-8', newline='') as file:
file.write('' + ',' + '' + ',' + '' + ',' + '' + ',' + '' + ',')
control_number += 1
print('*' * 100)