-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathProblem3.py
124 lines (101 loc) · 3.67 KB
/
Problem3.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
import requests as req
from lxml import html
import json
def parseOrg(str, dict):
    '''Parse the first column and extract the organisation name.

    The text is stripped, every ':' is removed, and the part preceding the
    first '-' is stored (trimmed) under ``dict['office_name']``.

    Args:
        str: Raw text of the name cell. The parameter shadows the builtin;
            the name is kept so existing callers keep working.
        dict: Row record, updated in place.
    '''
    cleaned = str.strip().replace(':', '')
    # partition('-')[0] is the text before the first '-', like split('-')[0].
    name = cleaned.partition('-')[0]
    # Trim again: "District 1 - Eureka" would otherwise yield "District 1 ".
    dict['office_name'] = name.strip()
def parseOrgAddress(str, dict):
    '''Parse the organisation address into street, city, state and zip.

    Only the first and last lines of the text are used: the first line is
    stored as ``office_address`` and the last line is split into
    ``office_city``, ``office_state`` and ``office_zip``. Any lines in
    between are ignored.

    Two layouts of the last line are handled:
      * "City, ST 12345" - the city precedes the first comma; the state and
        zip are the first two whitespace-separated tokens after it.
      * "City Name ST 12345" (no comma) - the last token is the zip, the
        one before it the state, and everything earlier is the city.

    Parts that are absent from the text are stored as None instead of
    raising IndexError.

    Args:
        str: Raw text of the address cell (name kept for compatibility even
            though it shadows the builtin).
        dict: Row record, updated in place.
    '''
    lines = str.strip().split('\n')
    dict['office_address'] = lines[0].strip()

    locality = lines[-1].strip()
    if ',' in locality:
        pieces = locality.split(',')
        city = pieces[0].strip()
        # split() without an argument tolerates repeated spaces.
        tokens = pieces[1].split()
        state = tokens[0] if tokens else None
        zip_code = tokens[1] if len(tokens) > 1 else None
    else:
        # Some rows omit the comma between city and state.
        tokens = locality.split()
        if len(tokens) >= 2:
            city = ' '.join(tokens[:-2])
            # Index from the end so city names of any word count work.
            state = tokens[-2]
            zip_code = tokens[-1]
        else:
            city, state, zip_code = locality, None, None

    dict['office_city'] = city
    dict['office_state'] = state
    dict['office_zip'] = zip_code
def parseMail(str, dict):
    '''Parse the mailing address into P.O. box/street, city, state and zip.

    The first line is stored under ``mail_pobox`` when it contains 'P.O.'
    and under ``mail_address`` otherwise; the other key is set to None.
    The last line is split into ``mail_city``, ``mail_state`` and
    ``mail_zip``. Lines in between are ignored.

    Two layouts of the last line are handled:
      * "City, ST 12345" - the city precedes the first comma; the state and
        zip are the first two whitespace-separated tokens after it.
      * "City Name ST 12345" (no comma) - the last token is the zip, the
        one before it the state, and everything earlier is the city.

    Parts that are absent from the text are stored as None instead of
    raising IndexError.

    Args:
        str: Raw text of the mailing-address cell (name kept for
            compatibility even though it shadows the builtin).
        dict: Row record, updated in place.
    '''
    lines = str.strip().split('\n')

    first_line = lines[0].strip()
    if 'P.O.' in first_line:
        dict['mail_pobox'] = first_line
        dict['mail_address'] = None
    else:
        dict['mail_pobox'] = None
        dict['mail_address'] = first_line

    locality = lines[-1].strip()
    if ',' in locality:
        pieces = locality.split(',')
        city = pieces[0].strip()
        # split() without an argument tolerates repeated spaces.
        tokens = pieces[1].split()
        state = tokens[0] if tokens else None
        zip_code = tokens[1] if len(tokens) > 1 else None
    else:
        # Some rows omit the comma between city and state.
        tokens = locality.split()
        if len(tokens) >= 2:
            city = ' '.join(tokens[:-2])
            # Index from the end so city names of any word count work.
            state = tokens[-2]
            zip_code = tokens[-1]
        else:
            city, state, zip_code = locality, None, None

    dict['mail_city'] = city
    dict['mail_state'] = state
    dict['mail_zip'] = zip_code
def parseContact(str, dict):
    '''Extract contact (phone) information into ``dict['mail_phone']``.

    A missing or blank value is stored as None. Otherwise all spaces are
    removed and each newline becomes a single space, so a cell holding two
    phone numbers on separate lines yields them separated by one space.

    Args:
        str: Raw text of the contact cell, or None (name kept for
            compatibility even though it shadows the builtin).
        dict: Row record, updated in place.
    '''
    # Check for None before stripping; stripping first would raise on None.
    text = str.strip() if str is not None else ''
    if not text:
        dict['mail_phone'] = None
        return
    # Drop spaces first, then turn line breaks into the number separator.
    dict['mail_phone'] = text.replace(' ', '').replace('\n', ' ')
def parseEmail(str, dict):
    '''Store the stripped text of the last column under ``dict['office_link']``.

    A missing or blank value is stored as None.

    Args:
        str: Raw text of the cell, or None (name kept for compatibility
            even though it shadows the builtin).
        dict: Row record, updated in place.
    '''
    # Check for None before stripping; stripping first would raise on None.
    text = str.strip() if str is not None else ''
    dict['office_link'] = text if text else None
def parse(data):
    '''Convert one table row into a dict.

    The row's ``td`` cells are read in order and the text content of each
    of the first five is handed to its column parser, which fills in the
    shared record. The filled record is returned.
    '''
    cells = data.xpath('td')
    record = {}
    # Column parsers, in the same order as the table's columns.
    columnParsers = (parseOrg, parseOrgAddress, parseMail, parseContact, parseEmail)
    for position, columnParser in enumerate(columnParsers):
        # Index explicitly so a row with fewer than five cells still raises.
        columnParser(cells[position].text_content(), record)
    return record
def fetch(timeout=30):
    '''Download the contact page and return its table rows as a JSON string.

    The page is requested with ``requests``, parsed with ``lxml``, and every
    table row except the first is converted with the helper ``parse``.

    Args:
        timeout: Seconds to wait for the server before giving up. Without
            a timeout the request could block indefinitely.

    Returns:
        A JSON array (as a string) with one object per parsed row.

    Raises:
        requests.HTTPError: If the server answers with an error status, so
            an error page is not silently parsed into an empty result.
    '''
    resp = req.get("https://dot.ca.gov/contact-us", timeout=timeout)
    resp.raise_for_status()
    root = html.fromstring(resp.content)
    rows = root.xpath('//*[@id="main-content"]/div/main/div[1]/div[1]/table//tr')
    # The first row is skipped - presumably the table header.
    result = [parse(row) for row in rows[1:]]
    return json.dumps(result)
if __name__ == "__main__":
    # Run as a script: fetch the table and write the JSON text to stdout.
    jsonText = fetch()
    print(jsonText)