-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathwebscrap.py
62 lines (51 loc) · 2.33 KB
/
webscrap.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
import requests
import re
from bs4 import BeautifulSoup
import pandas
# Scrape every results page of the Rock Springs, WY listing on pythonhow.com
# and write one row per property to ScrapedData.csv.

REQUEST_TIMEOUT = 30  # seconds; without a timeout requests.get() can block forever
RESULTS_PER_PAGE = 10  # the "s=" value in the page URL advances by 10 per results page


def _bold_text(tag, css_class):
    """Return the text of the <b> inside the span with *css_class*, or None.

    A missing span, or a span without a <b> child, yields None instead of
    raising so that one incomplete listing does not abort the whole scrape.
    """
    try:
        return tag.find("span", {"class": css_class}).find("b").text
    except AttributeError:
        # find() returned None somewhere along the chain.
        return None


def _parse_listing(item):
    """Build the output row (a dict) for one "propertyRow" div.

    The "Lot Size" key is only present when the listing advertises one; the
    DataFrame constructor fills the gap for the other rows.
    """
    address_lines = item.find_all("span", {"class": "propAddressCollapse"})
    price = item.find("h4", {"class": "propPrice"}).text
    row = {
        "Address": address_lines[0].text,
        "Locality": address_lines[1].text,
        "Price": price.replace("\n", "").replace(" ", ""),
        "Beds": _bold_text(item, "infoBed"),
        "Area": _bold_text(item, "infoSqFt"),
        "FullBath": _bold_text(item, "infoValueFullBath"),
    }
    for col_group in item.find_all("div", {"class": "columnGroup"}):
        feature_groups = col_group.find_all("span", {"class": "featureGroup"})
        feature_names = col_group.find_all("span", {"class": "featureName"})
        # featureGroup / featureName spans are paired positionally.
        for feat_group, feat_name in zip(feature_groups, feature_names):
            if "Lot Size" in feat_group.text:
                row["Lot Size"] = feat_name.text
    return row


# The landing page is only needed to discover how many result pages exist.
landing_response = requests.get(
    "https://www.pythonhow.com/real-estate/rock-springs-wy/LCWYROCKSPRINGS/",
    timeout=REQUEST_TIMEOUT,
)
landing_soup = BeautifulSoup(landing_response.content, "html.parser")
page_links = landing_soup.find_all("a", {"class": "Page"})
# The last pagination link carries the highest page number.
# NOTE(review): with no pagination links we assume a single results page
# rather than failing with IndexError -- confirm against the site markup.
page_count = int(page_links[-1].text) if page_links else 1

listings = []
for offset in range(0, page_count * RESULTS_PER_PAGE, RESULTS_PER_PAGE):
    page_response = requests.get(
        "http://www.pythonhow.com/real-estate/rock-springs-wy/LCWYROCKSPRINGS/t=0&s="
        + str(offset)
        + ".html",
        timeout=REQUEST_TIMEOUT,
    )
    page_soup = BeautifulSoup(page_response.content, "html.parser")
    for property_row in page_soup.find_all("div", {"class": "propertyRow"}):
        listings.append(_parse_listing(property_row))

df = pandas.DataFrame(listings)
df.to_csv("ScrapedData.csv")
print(df)