Skip to content

Commit 1941ecc

Browse files
committed
extracts pemenang page
1 parent 735c4a3 commit 1941ecc

File tree

4 files changed

+156
-5
lines changed

4 files changed

+156
-5
lines changed

main.py

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,14 +6,25 @@
66
'''
77

88
from pengumumanlelang import *
9+
from pemenang import *
910
import vars as v
1011

1112

12-
def main():
13-
#initialising and executing pengumumanlelang
13+
def pengumumanlelangexecute():
    """Run the pengumumanlelang scraper over the number range set in vars."""
    # Build the scraper and walk v.lowNum .. v.highNum in a single expression.
    pengumumanlelang().iterate(v.lowNum, v.highNum)
1617

1718

19+
def pemenangexecute():
    """Run the pemenang scraper over the number range set in vars."""
    # Build the scraper and walk v.lowNum .. v.highNum in a single expression.
    pemenang().iterate(v.lowNum, v.highNum)
22+
23+
24+
def main():
    """Script entry point: run the pemenang scraper.

    The pengumumanlelang scraper is switched off here; add a call to
    pengumumanlelangexecute() to run it as well.
    """
    pemenangexecute()
27+
28+
1829
# Only start scraping when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

pemenang.py

Lines changed: 139 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,139 @@
1+
'''
2+
sonicskye @2018
3+
4+
pemenang.py parses the pemenang (winner) page of an LPSE website.
5+
It defines the class pemenang.
6+
7+
8+
'''
9+
10+
11+
from bs4 import BeautifulSoup
12+
import utilities as u
13+
import vars as v
14+
import csv
15+
from pathlib import Path
16+
17+
18+
class pemenang:
    """Scrape the pemenang (winner) page of LPSE tenders into a CSV file.

    The class keeps no instance state. ``iterate`` walks a range of tender
    numbers, downloads each page and passes it to ``parsepage``, which appends
    one row per page to ``results/pemenang-<govName>.csv``.
    """

    # Column order of the CSV header row. Only table cells whose header text
    # appears in this list are written to the data rows.
    _HEADERPREFERRED = ["Kode Tender", "Nama Tender", "Kategori", "Instansi", "Satker", "Pagu", "HPS",
                        "Nama Pemenang", "Alamat", "NPWP", "Harga Penawaran"]

    def generateurl(self, num):
        """Return the pemenang page URL for tender number ``num``."""
        return v.menuEvaluasiURL + str(num) + v.staticCode + v.pemenangURL

    def generatecontent(self, url):
        """Fetch ``url`` through ``utilities.getcontent`` and return its result."""
        return u.getcontent(url)

    def parsepage(self, page, num):
        """Parse one pemenang page and append its values to the results CSV.

        Args:
            page: HTML of the page, as returned by ``generatecontent``.
            num: Tender number; combined with ``v.staticCode`` it becomes the
                "Kode Tender" column.

        Raises:
            IndexError: when fewer than two header/data groups are available
                for the cleanup step, or when a preferred header has no
                matching data cell.
        """
        thList, tdList = self._collectgroups(page, num)

        # write to a csv file, named pemenang-<govName>.csv
        # https://realpython.com/python-csv/
        filename: str = "results/pemenang" + "-" + v.govName + ".csv"
        target = Path(filename)
        # Without the directory every open() below fails and the page is lost.
        target.parent.mkdir(parents=True, exist_ok=True)

        # newline='' is what the csv module asks for (avoids blank lines on
        # Windows); an explicit encoding keeps non-ASCII text from failing on
        # platforms whose default encoding cannot represent it.
        if not target.is_file():
            # file does not exist yet: start it with the header row
            with open(filename, mode='w', newline='', encoding='utf-8') as pemenangfile:
                csv.writer(pemenangfile, delimiter=',').writerow(self._HEADERPREFERRED)

        # @ToDo do not allow duplicates
        # https://stackoverflow.com/questions/15741564/removing-duplicate-rows-from-a-csv-file-using-a-python-script

        dataAkhir = self._selectpreferred(thList, tdList)
        with open(filename, mode='a', newline='', encoding='utf-8') as pemenangfile:
            csv.writer(pemenangfile, delimiter=',').writerow(dataAkhir)

    def _collectgroups(self, page, num):
        """Return parallel lists of header groups and data groups for ``page``."""
        # https://medium.freecodecamp.org/how-to-scrape-websites-with-python-and-beautifulsoup-5946935d93fe
        # https://stackoverflow.com/questions/18966368/python-beautifulsoup-scrape-tables
        soup = BeautifulSoup(page, 'html.parser')

        # Seed both lists with the tender code built from num.
        thList = [["Kode Tender"]]
        tdList = [[str(num) + v.staticCode]]
        thTemp = []

        # NOTE(review): the nesting below was reconstructed from the logic
        # because the original indentation was not available; it assumes a row
        # is recorded only when it contains non-empty data cells - confirm.
        for tr in soup.find_all('tr'):
            # Non-empty header cells replace the current header group; a row
            # without header cells reuses the group of the previous row.
            headers = [th.text.strip() for th in tr.find_all('th')]
            headers = [text for text in headers if text != ""]
            if headers:
                thTemp = headers

            cells = [td.text.strip() for td in tr.find_all('td')]
            cells = [text for text in cells if text != ""]
            if cells:
                thList.append(thTemp)
                tdList.append(cells)

        # cleanup unwanted data: drop the group just before the last one
        thList.pop(-2)
        tdList.pop(-2)

        # serialize the last group into one single-item group per cell
        thLast = thList.pop(-1)
        tdLast = tdList.pop(-1)
        thList.extend([header] for header in thLast)
        tdList.extend([value] for value in tdLast)
        return thList, tdList

    def _selectpreferred(self, thList, tdList):
        """Return the data values whose header is listed in _HEADERPREFERRED."""
        dataAkhir = []
        for i, daftarHeader in enumerate(thList):
            # Index (rather than zip) so a missing data cell raises instead of
            # silently producing a misaligned row.
            daftarData = tdList[i]
            for j, header in enumerate(daftarHeader):
                if header in self._HEADERPREFERRED:
                    # repr() keeps a newline inside the data as a literal \n
                    dataAkhir.append(repr(daftarData[j]))
        return dataAkhir

    def iterate(self, lowNum, highNum):
        """Scrape every tender number in ``range(lowNum, highNum)``.

        A page that cannot be fetched or parsed is reported and skipped.
        KeyboardInterrupt and SystemExit are not caught, so the run can still
        be stopped from the keyboard.
        """
        for i in range(lowNum, highNum):
            url = self.generateurl(i)
            print("Processing: " + url)
            # if 404 not found then do not process anything
            try:
                page = self.generatecontent(url)
                self.parsepage(page, i)
            except Exception:
                print ("Page not found or Error has happened")

pengumumanlelang.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ class pengumumanlelang:
2222
"Nilai HPS Paket", "Peserta Tender"]
2323

2424
def generateurl(self, num):
    """Return the pengumumanlelang page URL for tender number ``num``."""
    # Only v.menuLelangURL is used: the older v.frontURL assignment was dead
    # (immediately overwritten) and referred to a name vars.py no longer has.
    return v.menuLelangURL + str(num) + v.staticCode + v.pengumumanLelangURL
2727

2828
def generatecontent(self, url):
@@ -102,12 +102,12 @@ def parsepage(self, page):
102102
def iterate(self, lowNum, highNum):
    """Scrape every tender number in ``range(lowNum, highNum)``.

    A page that cannot be fetched or parsed is reported and skipped.
    KeyboardInterrupt and SystemExit are not caught, so the run can still be
    stopped from the keyboard.
    """
    # iterating from lowNum to highNum
    for i in range(lowNum, highNum):
        url = self.generateurl(i)
        print("Processing: " + url)
        # if 404 not found then do not process anything
        try:
            page = self.generatecontent(url)
            self.parsepage(page)
        except Exception:
            print("Page not found or Error has happened")

vars.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,8 @@
1111
lowNum = 7300
1212
highNum = 7320
1313

14-
frontURL = "https://lpse." + govName + ".go.id/eproc4/lelang/"
14+
menuLelangURL = "https://lpse." + govName + ".go.id/eproc4/lelang/"
15+
menuEvaluasiURL = "https://lpse." + govName + ".go.id/eproc4/evaluasi/"
1516
pengumumanLelangURL = "/pengumumanlelang"
1617
pesertaURL = "/peserta"
1718
hasilURL = "/hasil"

0 commit comments

Comments
 (0)