parser.py
import requests
import lxml.html
import time
import sqlite3
import pandas as pd
from fake_useragent import UserAgent


def get_html(html):
    """Parse a category page and return the product card elements."""
    tree = lxml.html.fromstring(html)
    item_cards = tree.xpath(
        '//div[@class="items-filter"]/div[@class="Items"]/div[@class="sitem"]')
    return item_cards


def get_pages_num(html):
    """Return the total page count, read from the second-to-last pagination link."""
    tree = lxml.html.fromstring(html)
    pages_num = tree.xpath('//div[@class="pagination"]/a')[-2].text
    return int(pages_num)


def get_data(item_cards):
    """Extract title, brand, price, portion data, image and link from each product card."""
    data = []
    for item in item_cards:
        title = item.xpath('.//div[@class="si-desc"]/a')[0].text
        brand = item.xpath('.//div[@class="TItemTop"]')[0].text
        image = item.xpath('.//a/img')[0].attrib['src']
        url = item.xpath('.//div[@class="si-desc"]/a')[0].attrib['href']
        # The price block is missing for out-of-stock items, so fall back to a marker value.
        try:
            price = item.xpath(
                './/div[@class="si-desc"]/div[@class="si-price"]/div/span')[0].text
            price = float(price.replace(' ', ''))
        except (IndexError, AttributeError, ValueError):
            price = 'Not in stock'
        # Portion count and per-portion price are optional parts of the price block.
        try:
            portions_num = item.xpath(
                './/div[@class="si-desc"]/div[@class="si-price"]/div/text()[2]')[0].split(' ')[-1]
            portions_num = int(portions_num)
        except (IndexError, ValueError):
            portions_num = 'N/A'
        try:
            portion_price = item.xpath(
                './/div[@class="si-desc"]/div[@class="si-price"]//div/text()[3]')[0].split(' ')[-2]
            portion_price = float(portion_price)
        except (IndexError, ValueError):
            portion_price = 'N/A'
        item_info = {
            'title': title,
            'price': price,
            'brand': brand,
            'portions_num': portions_num,
            'portion_price': portion_price,
            'image': 'https://sportivnoepitanie.ru' + image,
            'link': 'https://sportivnoepitanie.ru' + url,
        }
        data.append(item_info)
    return data


def save_to_db(total_data):
    """Dump the collected records into a local SQLite database."""
    df = pd.DataFrame(total_data)
    conn = sqlite3.connect('sport_food.db')
    # Replace the table on repeated runs instead of failing because it already exists.
    df.to_sql('sport_food_info', conn, if_exists='replace', index=False)
    conn.close()
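

# Illustrative only: a minimal read-back helper for sanity-checking the export.
# This function is an addition for demonstration and is not called anywhere in
# the script; the default database file and table names match save_to_db() above.
def load_from_db(db_path='sport_food.db', table='sport_food_info'):
    """Load the saved table back into a DataFrame, e.g. to verify the scrape."""
    conn = sqlite3.connect(db_path)
    df = pd.read_sql(f'SELECT * FROM {table}', conn)
    conn.close()
    return df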


if __name__ == '__main__':
    url = 'https://sportivnoepitanie.ru/vitamins-minerals/'
    # Use a random User-Agent so requests look like an ordinary browser.
    ua = UserAgent()
    headers = {'User-Agent': ua.random}
    response = requests.get(url, headers=headers)
    total_data = []
    pages_num = get_pages_num(response.text)
    for i in range(1, pages_num + 1):
        time.sleep(2)  # be polite to the server between page requests
        # Page 1 was already fetched above; only request the following pages.
        if i > 1:
            response = requests.get(url + f'?page={i}', headers=headers)
        item_cards = get_html(response.text)
        total_data.extend(get_data(item_cards))
    save_to_db(total_data)