# parse_vc.py
import os
import time
from urllib.parse import urlparse

import requests
from bs4 import BeautifulSoup as BS
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

import config
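# Assumed shape of the config module, inferred from its usage below (the
# actual config.py is not part of this listing):
#   chromedriver_path = "/path/to/chromedriver"
#   url = "https://vc.ru"   # news feed to watch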
class VCNewsParser:
    # number of PAGE_DOWN presses per scroll pass
    page_downs_num = 15
    # maximum number of scroll passes before giving up on finding last_post
    max_scroll_attempts = 10

    def __init__(self):
        self.last_post = ""
        # a Chrome driver is used so the page can be scrolled to load more posts;
        # Selenium 4.10+ dropped executable_path, so the path goes via Service
        self.driver = webdriver.Chrome(service=Service(config.chromedriver_path))
        self.driver.get(config.url)
        time.sleep(1)

    def set_last_post(self, last_post):
        self.last_post = last_post
    def new_posts(self):
        """Return URLs of posts published after self.last_post, oldest first."""
        self.driver.refresh()
        body = self.driver.find_element(by=By.TAG_NAME, value="body")

        # collect post hrefs from the feed; if the site markup changes,
        # this selector has to be updated
        def get_posts_urls():
            return [link.get_attribute("href")
                    for link in self.driver.find_elements(by=By.CLASS_NAME, value="content-link")]

        posts = get_posts_urls()
        scroll_count = 0
        # while last_post is not among the loaded posts, scroll down and look again
        while self.last_post not in posts and scroll_count < self.max_scroll_attempts:
            for _ in range(self.page_downs_num):
                body.send_keys(Keys.PAGE_DOWN)
                time.sleep(0.2)
            posts = get_posts_urls()
            scroll_count += 1

        # if last_post was never found, fall back to the newest post on the page
        if self.last_post not in posts:
            return [posts[0]] if posts else []

        # keep only the posts newer than last_post, oldest first
        return posts[:posts.index(self.last_post)][::-1]
    def post_info(self, link):
        """Fetch a post page and scrape its category, title, description and image."""
        r = requests.get(link)
        html = BS(r.content, 'html.parser')
        info = {}
        try:
            # header image URL; if the page markup changes, this block
            # has to be updated
            try:
                img_url = html.find("div", "content").find("div", "andropov_image")["data-image-src"]
            except (AttributeError, KeyError, TypeError):
                img_url = None

            # post metadata; the slicing and replace() cleanup below is tied
            # to the current page markup and must change if the markup does
            info = {
                "category": "#" + html.find("div", "content-header-author__name").text[17:-12].replace(" ", "_"),
                "title": html.find("h1", "content-title").text[21:]
                    .replace("Статьи редакции", '').replace('\n', '').replace(" ", ""),
                "description": html.find("div", "content content--full").find("p").text,
                "link": link,
                "img_url": img_url,
            }
        except Exception as e:
            print(f"Failed to parse {link}: {e}")
        r.close()
        return info
    def download_image(self, url):
        """Download the header image and return its local filename."""
        # drop any trailing slash so basename() yields the file name
        url = url.rstrip("/")
        r = requests.get(url, allow_redirects=True)
        filename = os.path.basename(urlparse(url).path)
        # save the header image to the local disk
        with open(filename, 'wb') as f:
            f.write(r.content)
        return filename

    def delete_image(self, filename):
        os.remove(filename)

    def close_all_sessions(self):
        self.driver.quit()
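
# Minimal usage sketch, not part of the original module. It assumes config.py
# is set up as described above and that the feed still marks post links with
# the "content-link" class; the last-post URL below is purely hypothetical.
if __name__ == "__main__":
    parser = VCNewsParser()
    parser.set_last_post("https://vc.ru/example-known-post")  # hypothetical URL
    for post_url in parser.new_posts():
        info = parser.post_info(post_url)
        print(info.get("title"), "->", post_url)
        if info.get("img_url"):
            filename = parser.download_image(info["img_url"])
            # ...forward the image somewhere, then clean up
            parser.delete_image(filename)
    parser.close_all_sessions()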