-
Notifications
You must be signed in to change notification settings - Fork 4
/
get_toutiao_news.py
55 lines (46 loc) · 2.37 KB
/
get_toutiao_news.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
import time
import re
from selenium import webdriver
from bs4 import BeautifulSoup
from bean.news import News
def get_page(url):
    """Load *url* in Chrome, scroll down once, and return the rendered HTML.

    A new Chrome WebDriver session is started on every call.  After the page
    has been requested, the window is scrolled down by 5000 pixels and the
    function sleeps for three seconds before reading the page source
    (presumably to give lazily loaded content time to render -- confirm
    against the target site).

    Args:
        url: Address of the page to open.

    Returns:
        The value of ``driver.page_source`` after the scroll and the wait.
    """
    driver = webdriver.Chrome()
    try:
        driver.get(url)
        driver.execute_script("window.scrollBy(0, 5000)", "")
        time.sleep(3)
        return driver.page_source
    finally:
        # Always release the browser, even when loading or scripting fails.
        # quit() is used instead of close(): close() only closes the current
        # window, whereas quit() also ends the WebDriver session.
        driver.quit()
def get_news(url):
    """Scrape the news entries from the page at *url*.

    The page is rendered through ``get_page`` and parsed with BeautifulSoup.
    Every ``<div>`` with class ``item-inner`` is turned into one ``News``
    object built as ``News(image1, image2, image3, title, source, link)``:

    * the image arguments are the ``src`` values of the first three ``<img>``
      tags inside the item; missing slots are filled with ``''``;
    * ``title`` is the stripped text of the item's first ``title-box`` div;
    * ``link`` is ``'http://toutiao.com'`` followed by the stripped ``href``
      of the first ``<a>`` inside that title box;
    * ``source`` is the stripped text of the first tag, inside the item's
      first div whose class ends in ``lfooter``, whose class matches the
      pattern ``^((?!source).)+$``.

    ``title`` and ``source`` are encoded to GBK with unencodable characters
    dropped, exactly as before.

    Args:
        url: Address of the listing page to scrape.

    Returns:
        A list of ``News`` objects, one per item, in document order.

    Raises:
        IndexError: If an item lacks the title box, the footer div, or a
            matching source tag.
    """
    link_head = 'http://toutiao.com'
    # Compile the patterns once instead of once per item.
    footer_class = re.compile("lfooter$")
    source_class = re.compile("^((?!source).)+$")

    # NOTE(review): no parser is named here, so BeautifulSoup picks whichever
    # one is installed; left unchanged to keep parsing results the same.
    soup = BeautifulSoup(get_page(url))
    # Use a real attribute dict.  The previous set literal
    # {"class", "item-inner"} only worked because a non-dict ``attrs`` is
    # treated as a class filter, and it also matched class="class".
    items = soup.find_all("div", {"class": "item-inner"})

    news_list = []
    for item in items:
        images = [img.get('src') for img in item.select('img')]
        # Debug trace of the image count, kept from the original code.
        print(len(images))
        # Always fill exactly three image slots.  Previously only counts of
        # 0, 1 and 3 were handled; any other count either raised NameError
        # or re-appended the News object built for the previous item.
        image1, image2, image3 = (images + ['', '', ''])[:3]

        title_box = item.select('div[class="title-box"]')[0]
        title = title_box.get_text().strip().encode('GBK', 'ignore')
        link = link_head + title_box.a['href'].strip()

        footer = item.find_all('div', {"class": footer_class})[0]
        source_tag = footer.find_all('', {"class": source_class})[0]
        source = source_tag.get_text().strip().encode('GBK', 'ignore')

        news_list.append(News(image1, image2, image3, title, source, link))
    return news_list