Skip to content

Commit d96bfb9

Browse files
authored
add script for extracting data from main page
1 parent 6522640 commit d96bfb9

File tree

1 file changed

+112
-0
lines changed

1 file changed

+112
-0
lines changed

scrape-google-finance-main-page.py

+112
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,112 @@
1+
import requests, json, re
2+
from parsel import Selector
3+
4+
5+
_GOOGLE_FINANCE_URL = "https://www.google.com/finance/"

# A percentage such as "2%", "12%" or "0.45%". Group 1 is the bare number.
_PERCENT_RE = re.compile(r"(\d+(?:\.\d+)?)%")


def _first_group(pattern, text):
    """Return capture group 1 of *pattern* searched in *text*, or None if there is no match."""
    if text is None:
        return None
    match = re.search(pattern, text)
    return match.group(1) if match else None


def _signed_percent_change(aria_label, with_percent_sign=True):
    """Build a signed percent string ("+1.23%" / "-1.23%") from a change aria-label.

    The sign is "+" when the label contains "Up" and "-" otherwise. Returns
    None when the label is missing or contains no percentage.
    """
    if not aria_label:
        return None
    match = _PERCENT_RE.search(aria_label)
    if match is None:
        return None
    number = match.group(0) if with_percent_sign else match.group(1)
    sign = "+" if "Up" in aria_label else "-"
    return f"{sign}{number}"


def _absolute_link(prefix, href, relative_prefix):
    """Join *prefix* with *href* after swapping its leading *relative_prefix* for "/"; None if no href."""
    if href is None:
        return None
    return f"{prefix}{href.replace(relative_prefix, '/')}"


def scrape_google_finance_main_page():
    """Scrape the Google Finance main page and return its sections as a dict.

    The page is fetched with a desktop browser User-Agent and parsed with
    parsel CSS selectors. The selectors are the page's generated class names,
    so presumably they stop matching whenever Google changes its markup; a
    field whose element is missing is returned as None instead of aborting
    the whole scrape.

    Returns:
        dict with the keys:
            "market_trends": {"top_position": list of str,
                              "bottom_position": list of dict},
            "interested_in": {"top_position": list of dict,
                              "bottom_position": list of dict},
            "earning_calendar": list of dict,
            "most_followed_on_google": list of dict,
            "news": list of dict (may be empty; re-running the scrape can help).

    Raises:
        requests.HTTPError: if the page responds with an error status code.
        requests.RequestException: on connection problems or timeout.
    """
    # https://docs.python-requests.org/en/master/user/quickstart/#custom-headers
    # https://www.whatismybrowser.com/detect/what-is-my-user-agent
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.134 Safari/537.36"
    }

    response = requests.get(_GOOGLE_FINANCE_URL, headers=headers, timeout=30)
    # Fail loudly on an error page instead of silently returning empty sections.
    response.raise_for_status()
    selector = Selector(text=response.text)

    change_label_css = "[jsname=Fe7oBc]::attr(aria-label)"

    # Container for everything extracted below. Key names are kept exactly as
    # callers already receive them, including the "index"/"position" and
    # "percent_price_change"/"price_percent_change" differences between sections.
    ticker_data = {
        "market_trends": {
            "top_position": [],
            "bottom_position": [],
        },
        "interested_in": {
            "top_position": [],
            "bottom_position": [],
        },
        "earning_calendar": [],
        "most_followed_on_google": [],
        "news": [],
    }

    # Market trends, top of the page.
    ticker_data["market_trends"]["top_position"] = selector.css(".gR2U6::text").getall()

    # Earnings calendar.
    for calendar_quote in selector.css(".d3fRjc"):
        ticker_data["earning_calendar"].append({
            "quote": calendar_quote.css(".yaubCc::text").get(),
            "quote_link": _absolute_link(
                "https://www.google.com/finance/quote",
                calendar_quote.css(".yaubCc::attr(href)").get(),
                "./quote/",
            ),
            "short_date": calendar_quote.css(".JiAI5b").xpath("normalize-space()").get(),
            "full_date": calendar_quote.css(".fVovwd::text").get(),
        })

    # Most followed on Google. The percent change here carries no "%" suffix.
    for most_followed in selector.css(".NaLFgc"):
        ticker_data["most_followed_on_google"].append({
            "title": most_followed.css(".TwnKPb::text").get(),
            # https://regex101.com/r/J3DDIX/1
            "quote": _first_group(r"\.\/quote\/(\w+):", most_followed.attrib.get("href")),
            # https://regex101.com/r/7ptVha/1
            "following": _first_group(r"(\d+\.\d+)M", most_followed.css(".Iap8Fc::text").get()),
            "percent_price_change": _signed_percent_change(
                most_followed.css(change_label_css).get(), with_percent_sign=False
            ),
        })

    # News. If empty, run once again: for some reason it can come back as [].
    for position, news in enumerate(selector.css(".yY3Lee"), start=1):
        ticker_data["news"].append({
            "position": position,
            "title": news.css(".Yfwt5::text").get(),
            "link": news.css(".z4rs2b a::attr(href)").get(),
            "source": news.css(".sfyJob::text").get(),
            "published": news.css(".Adak::text").get(),
            "thumbnail": news.css("img.Z4idke::attr(src)").get(),
        })

    # "You may be interested in", top of the page.
    for index, interested_top in enumerate(selector.css(".sbnBtf:not(.xJvDsc) .SxcTic"), start=1):
        ticker_data["interested_in"]["top_position"].append({
            "index": index,
            "title": interested_top.css(".ZvmM7::text").get(),
            "quote": interested_top.css(".COaKTb::text").get(),
            "price_change": interested_top.css(".SEGxAb .P2Luy::text").get(),
            "percent_price_change": _signed_percent_change(
                interested_top.css(change_label_css).get()
            ),
        })

    # Market trends, bottom of the page. The change label is read from the
    # current market_trend row, not from a variable left over from another loop.
    for index, market_trend in enumerate(selector.css("[jscontroller=mBF9u]"), start=1):
        ticker_data["market_trends"]["bottom_position"].append({
            "index": index,
            "title": market_trend.css(".ZvmM7::text").get(),
            "quote": market_trend.css(".COaKTb::text").get(),
            "price": market_trend.css(".YMlKec::text").get(),
            "price_percent_change": _signed_percent_change(
                market_trend.css(change_label_css).get()
            ),
        })

    # "You may be interested in", bottom of the page.
    for position, interested_bottom in enumerate(selector.css(".HDXgAf .tOzDHb"), start=1):
        ticker_data["interested_in"]["bottom_position"].append({
            "position": position,
            "ticker": interested_bottom.css(".COaKTb::text").get(),
            "ticker_link": _absolute_link(
                "https://www.google.com/finance",
                interested_bottom.attrib.get("href"),
                "./",
            ),
            "title": interested_bottom.css(".RwFyvf::text").get(),
            "price": interested_bottom.css(".YMlKec::text").get(),
            "percent_price_change": _signed_percent_change(
                interested_bottom.css(change_label_css).get()
            ),
        })

    return ticker_data
109+
110+
111+
# Script entry: scrape the main page once and print the result as indented,
# non-ASCII-preserving JSON.
scraped_page = scrape_google_finance_main_page()
print(json.dumps(scraped_page, indent=2, ensure_ascii=False))

0 commit comments

Comments
 (0)