|
| 1 | +''' |
| 2 | +sonicskye @2018 |
| 3 | +
|
| 4 | +pemenang.py is used to parse pemenang page in LPSE website |
| 5 | +it stores class pemenang |
| 6 | +
|
| 7 | +
|
| 8 | +''' |
| 9 | + |
| 10 | + |
| 11 | +from bs4 import BeautifulSoup |
| 12 | +import utilities as u |
| 13 | +import vars as v |
| 14 | +import csv |
| 15 | +from pathlib import Path |
| 16 | + |
| 17 | + |
class pemenang:
    """Scrape the 'pemenang' (tender winner) page of an LPSE website.

    Parsed fields are appended to ``results/pemenang-<govName>.csv``;
    the header row is written once, when the file is first created.
    """

    # Preferred output columns; any scraped table header not in this list is dropped.
    _HEADERPREFERRED = ["Kode Tender", "Nama Tender", "Kategori", "Instansi", "Satker", "Pagu", "HPS",
                        "Nama Pemenang", "Alamat", "NPWP", "Harga Penawaran"]

    def generateurl(self, num):
        """Return the full URL of the 'pemenang' page for tender number *num*."""
        completeURL = v.menuEvaluasiURL + str(num) + v.staticCode + v.pemenangURL
        return completeURL

    def generatecontent(self, url):
        """Fetch and return the raw HTML of *url* via the utilities helper."""
        page = u.getcontent(url)
        return page

    def parsepage(self, page, num):
        """Parse one 'pemenang' HTML page and append its record to the CSV file.

        page -- raw HTML of the pemenang page
        num  -- numeric tender id; combined with v.staticCode it forms "Kode Tender"
        """
        # https://medium.freecodecamp.org/how-to-scrape-websites-with-python-and-beautifulsoup-5946935d93fe
        soup = BeautifulSoup(page, 'html.parser')

        # Parallel lists: thList[i] holds header cells, tdList[i] the matching data cells.
        # Seed both with the synthetic "Kode Tender" column.
        # https://stackoverflow.com/questions/18966368/python-beautifulsoup-scrape-tables
        thList = [["Kode Tender"]]
        tdList = [[str(num) + v.staticCode]]
        thTemp = []
        tdTemp = []

        for tr in soup.find_all('tr'):
            # Header cells of this row (drop cells that are empty after stripping).
            thTemp2 = [th.text.strip() for th in tr.find_all('th') if th.text.strip() != ""]
            if thTemp2:
                thTemp = thTemp2

            # Data cells of this row.
            tdTemp2 = [td.text.strip() for td in tr.find_all('td') if td.text.strip() != ""]
            if tdTemp2:
                tdTemp = tdTemp2

            # One entry per <tr>: rows without fresh cells repeat the previous
            # non-empty pair; the duplicates are cleaned up below.
            thList.append(thTemp)
            tdList.append(tdTemp)

        # Cleanup unwanted data: drop the duplicated second-to-last entries
        # produced by the repeated appends above, one removal per remaining
        # header of the final table. (The old loop also bound unused
        # `header`/`dat` locals for a commented-out debug print.)
        for _ in range(len(thTemp) - 1):
            thList.pop(-2)
            tdList.pop(-2)

        # Serialize the last (multi-column) row into one single-item list per
        # cell so it lines up with the per-field rows collected earlier.
        thLast = thList.pop(-1)
        tdLast = tdList.pop(-1)
        for header in thLast:
            thList.append([header])
        for data in tdLast:
            tdList.append([data])

        # Output file, one per government instance.
        # https://realpython.com/python-csv/
        filename: str = "results/pemenang" + "-" + v.govName + ".csv"

        # Write the header row only when the file does not exist yet.
        # newline='' is required by the csv module (prevents blank rows on
        # Windows); utf-8 keeps Indonesian names portable across platforms.
        # https://stackoverflow.com/questions/82831/how-do-i-check-whether-a-file-exists-without-exceptions
        if not Path(filename).is_file():
            with open(filename, mode='w', newline='', encoding='utf-8') as pemenangfile:
                pemenangwriter = csv.writer(pemenangfile, delimiter=',')
                pemenangwriter.writerow(self._HEADERPREFERRED)

        # @ToDo do not allow duplicates
        # https://stackoverflow.com/questions/15741564/removing-duplicate-rows-from-a-csv-file-using-a-python-script

        # Append the scraped record, keeping only the preferred columns.
        with open(filename, mode='a', newline='', encoding='utf-8') as pemenangfile:
            pemenangwriter = csv.writer(pemenangfile, delimiter=',')
            dataAkhir = []
            for daftarHeader, daftarData in zip(thList, tdList):
                for j, header in enumerate(daftarHeader):
                    if header in self._HEADERPREFERRED:
                        # repr() keeps the \n escape sequences inside the data string
                        dataAkhir.append(repr(daftarData[j]))
            pemenangwriter.writerow(dataAkhir)

    def iterate(self, lowNum, highNum):
        """Process tender numbers from lowNum (inclusive) to highNum (exclusive)."""
        for i in range(lowNum, highNum):
            url = self.generateurl(i)
            print("Processing: " + url)
            # Skip pages that 404 or fail to parse. Catch Exception (not a bare
            # except:) so KeyboardInterrupt/SystemExit still abort the run.
            try:
                page = self.generatecontent(url)
                self.parsepage(page, i)
            except Exception:
                print ("Page not found or Error has happened")
                continue
0 commit comments