# Scrape_Clean_Store.py
# import libraries
import sqlite3
from requests_html import HTMLSession
from fnmatch import fnmatch
from glom import glom
from openpyxl import load_workbook
from io import BytesIO
from itertools import islice
session = HTMLSession()
r = session.get("https://data.nsw.gov.au/data/dataset/fuel-check")
# The links (xlsx files) are listed in a JSON file.
# First step: get the JSON file's URL from the page, download it, and extract all the xlsx links.
# To build the selector, copy the XPath from the webpage.
# I used Google Chrome dev tools:
# right-click the relevant link element and choose Copy -> Copy full XPath
sel = "/html/body/div[3]/div/div[2]/div/article/div/ul/li[1]/a/@href"
# access the JSON URL
json_link = r.html.xpath(sel)[0]
# Next step: append the JSON link to 'https://data.nsw.gov.au' and pass it to session.get again
url = f"https://data.nsw.gov.au{json_link}"
json_data = session.get(url).json()
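# For reference, the JSON structure relied on below looks roughly like this
# (shape inferred from the paths used in this script; the values are illustrative, not real):
# {"result": {"resources": [{"name": "...", "format": "XLSX", "url": "https://....xlsx"}, ...]}}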
# The links are in the 'url' key.
# The path to each link is: json_data -> result -> resources -> url
# We'll filter the URLs for only those that end with 'xlsx'
# (the filter could also be on the 'format' key, which has 'xlsx' in its value)
# create a specification to glom the relevant data
# glom's documentation is pretty detailed
spec = {"name": ("result.resources", ["name"]), "url": ("result.resources", ["url"])}
# extract data
outcome = glom(json_data, spec)
# get the names to be the keys, and urls to be the values
outcome = dict(zip(*outcome.values()))
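# A quick sketch of the reshaping above, with made-up entries:
# glom returns      {"name": ["Price History May 2021"], "url": ["https://.../may-2021.xlsx"]}
# zip + dict gives  {"Price History May 2021": "https://.../may-2021.xlsx"}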
# keep only entries where the file ends with 'xlsx'
# we'll use the fnmatch function from the fnmatch module
pattern = "*.xlsx"
urls = {key: value for key, value in outcome.items() if fnmatch(value, pattern)}
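# fnmatch does shell-style wildcard matching, e.g. with made-up file names:
# fnmatch("prices-may-2021.xlsx", "*.xlsx") -> True
# fnmatch("prices-may-2021.csv", "*.xlsx")  -> False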
# connect to database
con = sqlite3.connect("fuel_nsw_price_data.db")
cur = con.cursor()
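# Note: the script assumes both tables already exist. A minimal schema sketch
# (column names come from the queries below; the types are assumed, not from the source):
# CREATE TABLE IF NOT EXISTS fuel_month_year (fuel_month_year TEXT PRIMARY KEY);
# CREATE TABLE IF NOT EXISTS fuel_data (
#     service_station_name TEXT, address TEXT, suburb TEXT, postcode TEXT,
#     brand TEXT, fuelcode TEXT, transaction_date TEXT, price REAL
# );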
query = """SELECT fuel_month_year
           FROM fuel_month_year;"""
cur.execute(query)
# get current list of keys
db_table_list = cur.fetchall()
# get only urls that have not been downloaded
download_keys = set(urls.keys()) - set(db[0] for db in db_table_list)
def key_generator():
    for key in download_keys:
        yield (key,)
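# sqlite3's executemany expects one parameter sequence per row, hence the 1-tuples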
# add urls not downloaded to database
insert_query = "INSERT INTO fuel_month_year (fuel_month_year) VALUES (?);"
cur.executemany(insert_query, key_generator())
# never forget to commit
con.commit()
# and close the connection
con.close()
# download list for our data
url_download_list = [urls[key] for key in download_keys]
print(len(url_download_list))
# no transformations needed here
# simply download and store in the database
# pandas is not needed here,
# and openpyxl is faster than pandas for this task
collection = []
for xlsx_link in url_download_list:
    # download link
    r = session.get(xlsx_link)
    # read it in
    # note that the contents are bytes, hence the BytesIO usage
    wb = load_workbook(BytesIO(r.content), read_only=True)
    # grab the active (first) sheet
    ws = wb.active
    # get the index of the header row
    index, header = next(
        (index, value)
        for index, value in enumerate(ws.values)
        if "ServiceStationName" in value
    )
    # get the rest of the data
    # ws.values is a fresh generator on each access, so islice skips past the header again
    body = list(islice(ws.values, index + 1, None))
    # store away the data
    collection.extend(body)
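# each entry in 'collection' is a tuple of eight cell values, e.g.
# (an illustrative row, not real data):
# ("Station X", "1 Example St", "SYDNEY", "2000", "BrandY", "U91", "01/05/2021 09:00:00", 124.9)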
# connect to database again
con = sqlite3.connect("fuel_nsw_price_data.db")
cur = con.cursor()
# we have eight columns in our database table
# and in the incoming data
# so eight question marks to represent our data
insert_params = ",".join(["?"] * 8)
# list of columns
columns_list = ",".join(
    [
        "service_station_name",
        "address",
        "suburb",
        "postcode",
        "brand",
        "fuelcode",
        "transaction_date",
        "price",
    ]
)
# query string
insert_query = f"INSERT INTO fuel_data({columns_list}) VALUES ({insert_params});"
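# the resulting query, for reference:
# INSERT INTO fuel_data(service_station_name,address,suburb,postcode,brand,fuelcode,transaction_date,price) VALUES (?,?,?,?,?,?,?,?);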
# speedy multiple row insertions
cur.executemany(insert_query, collection)
# without this, the information will not be saved
con.commit()
con.close()