-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathsave_as_csv.py
75 lines (62 loc) · 3.23 KB
/
save_as_csv.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
import time
import re
import csv
from selenium import webdriver
from bs4 import BeautifulSoup
import constant.toutiao as toutiao
def get_page(url):
    """Load *url* in Chrome, scroll once to trigger lazy loading, and
    return the resulting page HTML.

    :param url: page to fetch.
    :return: the rendered page source as a string.
    """
    driver = webdriver.Chrome()
    try:
        driver.get(url)
        time.sleep(10)  # crude wait for the initial render; TODO: prefer WebDriverWait
        # Scroll far down so lazily-loaded news items are inserted into the DOM.
        driver.execute_script("window.scrollBy(0, 10000)")
        time.sleep(10)  # give the lazy-loaded content time to arrive
        return driver.page_source
    finally:
        # quit() (not close()) shuts down the whole browser/driver process and
        # runs even when an exception is raised above, so nothing leaks.
        driver.quit()
def _extract_fields(item, link_head):
    """Pull (title, link, source) out of one ``item-inner`` div.

    Text fields are GBK-encoded with errors ignored, matching the encoding
    the CSV consumer expects.
    """
    title_box = item.select('div[class="title-box"]')[0]
    title = title_box.get_text().strip().encode('GBK', 'ignore')
    link = link_head + title_box.a['href'].strip()
    # The source label lives under the div whose class ends in "lfooter";
    # the negative-lookahead regex selects the first child whose class does
    # NOT contain "source" (original matching behavior preserved).
    footer = item.find_all('div', {"class": re.compile("lfooter$")})[0]
    source = footer.find_all(
        '', {"class": re.compile("^((?!source).)+$")})[0].get_text().strip().encode('GBK', 'ignore')
    return title, link, source


def get_news(csv_file, url, category):
    """Scrape one Toutiao listing page and append one CSV row per news item.

    Each row is (image1, image2, image3, title, source, link, layout_code,
    category); missing images are written as "". Items with an image count
    other than 0, 1 or 3 are skipped, as in the original layout handling.

    :param csv_file: open writable file object for the output CSV.
    :param url: listing-page URL to crawl.
    :param category: category label written into each row.
    """
    writer = csv.writer(csv_file)
    link_head = 'http://toutiao.com'
    # Explicit parser avoids bs4's "no parser specified" warning and keeps
    # parsing behavior stable across environments.
    soup = BeautifulSoup(get_page(url), 'html.parser')
    # BUG FIX: the original passed the *set* {"class", "item-inner"}, which
    # bs4 interprets as a class filter matching "class" OR "item-inner".
    # A proper attrs dict matches only class="item-inner".
    items = soup.find_all("div", {"class": "item-inner"})
    # Map image count -> layout code stored in the row (3 images -> code 2).
    layout_code = {0: 0, 1: 1, 3: 2}
    for item in items:
        images = [img.get('src') for img in item.select('img')]
        count = len(images)
        if count not in layout_code:
            continue  # other layouts were silently ignored before as well
        print(count)  # progress trace, same text the original printed
        title, link, source = _extract_fields(item, link_head)
        padded = (images + ["", "", ""])[:3]  # always three image columns
        writer.writerow((padded[0], padded[1], padded[2], title, source,
                         link, layout_code[count], category))
if __name__ == "__main__":
csvFile = open("crawlnews.csv", 'wb')
get_news(csvFile, toutiao.URL_RECOMMEND, "recommend")
get_news(csvFile, toutiao.URL_HOT, "hot")
get_news(csvFile, toutiao.URL_SOCIETY, "society")
get_news(csvFile, toutiao.URL_ENTERTAINMENT, "entertainment")
get_news(csvFile, toutiao.URL_SPORTS, "sports")
get_news(csvFile, toutiao.URL_FINANCE, "finance")
get_news(csvFile, toutiao.URL_WORLD, "world")
get_news(csvFile, toutiao.URL_MILITARY, "military")
get_news(csvFile, toutiao.URL_TECH, "tech")
get_news(csvFile, toutiao.URL_CAR, "car")
get_news(csvFile, toutiao.URL_FUNNY, "funny")
get_news(csvFile, toutiao.URL_FASHION, "fashion")
get_news(csvFile, toutiao.URL_TRAVEL, "travel")