-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathparseGoogle.py
39 lines (33 loc) · 1.09 KB
/
parseGoogle.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
import re
from scraping import *
import time
# Matches a comiXology link (desktop "www" or mobile "m" host) anywhere inside
# an href. Raw string with escaped dots so '.' only matches a literal dot.
_CMX_LINK_RE = re.compile(r'https://(www|m)\.comixology\.com')

# Captures the first run of digits that directly follows a '/' in the URL.
# The trailing part (optional '?' plus the rest of the line) is kept from the
# original pattern so the extracted ID is unchanged.
_CMX_ID_RE = re.compile(r'\/([0-9]+)\??.*$')


def parseGoogleResult(URL, debug = False, sleepseconds = 5):
    """Fetch a Google results page and extract a comiXology ID from it.

    The function waits ``sleepseconds`` before fetching so repeated calls do
    not hammer Google and get blocked, then looks for the first anchor whose
    href contains a comiXology URL and pulls the numeric ID out of that href.

    Args:
        URL: address of the Google results page, passed to ``fetchWebPage``.
        debug: when true, print progress messages to stdout.
        sleepseconds: seconds to wait before fetching (default 5, the
            previously hard-coded value). Values <= 0 skip the wait.

    Returns:
        -1 if ``fetchWebPage`` returned nothing (callers should stop
        searching); the ID as a string when one is found; ``None`` when the
        page has no comiXology link or the link carries no numeric ID.
    """
    if sleepseconds > 0:
        if debug:
            print('sleeping {0} seconds...'.format(sleepseconds))
        # wait X seconds so we don't hammer google and get blocked
        time.sleep(sleepseconds)

    soup = fetchWebPage(URL)
    # no page returned
    if not soup:
        if debug:
            # usually from 429, bot detected
            print("http status != 200")
        return -1  # don't keep searching

    # should be first result - if found
    aResult = soup.find('a', href=_CMX_LINK_RE)
    if not aResult:
        # not all series include the volume year
        if debug:
            print("no match found on google.com")
        return None

    CMXURL = findAttributeValue(aResult, 'href')
    if debug:
        print("URL = " + CMXURL)

    matchCMXID = _CMX_ID_RE.search(CMXURL)
    if not matchCMXID:
        # Link found, but no "/<digits>" segment to use as an ID.
        if debug:
            print("no CMXID found in URL")
        return None

    CMXID = matchCMXID.group(1)
    if debug:
        print("CMXID google result = {0}".format(CMXID))
    return CMXID