From 38f8a7d1b4b888e4e5b04628243936eee46d9860 Mon Sep 17 00:00:00 2001
From: Victor Raton <43411882+victorfernandesraton@users.noreply.github.com>
Date: Sat, 7 Sep 2024 19:35:11 -0300
Subject: [PATCH 1/5] fix: remove base spider code and fix the PDF URL

---
 .../spiders/ma/ma_sao_jose_dos_basilios.py | 101 +++++++++++++++++-
 1 file changed, 97 insertions(+), 4 deletions(-)

diff --git a/data_collection/gazette/spiders/ma/ma_sao_jose_dos_basilios.py b/data_collection/gazette/spiders/ma/ma_sao_jose_dos_basilios.py
index f29d6c144..f43fa4c83 100644
--- a/data_collection/gazette/spiders/ma/ma_sao_jose_dos_basilios.py
+++ b/data_collection/gazette/spiders/ma/ma_sao_jose_dos_basilios.py
@@ -1,11 +1,104 @@
 import datetime
 
-from gazette.spiders.base.siganet import BaseSiganetSpider
+import scrapy
+from dateparser import parse
 
+from gazette.items import Gazette
+from gazette.spiders.base import BaseGazetteSpider
 
-class MaSaoJoseDosBasiliosSpider(BaseSiganetSpider):
+
+class MaSaoJoseDosBasiliosSpider(BaseGazetteSpider):
     TERRITORY_ID = "2111250"
     name = "ma_sao_jose_dos_basilios"
     start_date = datetime.date(2015, 11, 27)
-    allowed_domains = ["transparencia.saojosedosbasilios.ma.gov.br"]
-    BASE_URL = "https://transparencia.saojosedosbasilios.ma.gov.br/acessoInformacao/diario/diario"
+    allowed_domains = ["diariooficial.saojosedosbasilios.ma.gov.br"]
+    BASE_URL = "https://diariooficial.saojosedosbasilios.ma.gov.br"
+
+    def start_requests(self):
+        yield scrapy.Request(self.get_url())
+
+    def parse_pagination(self, response):
+        """
+        Collect all the available result pages and
+        return a request object for each one
+        """
+        return [
+            scrapy.Request(self.get_url(page), callback=self.parse)
+            for page in range(1, 1 + self.get_last_page(response))
+        ]
+
+    def parse(self, response, page=1):
+        """
+        Parse a results page and yield the gazette editions available on it.
+ """ + gazette_boxes = response.css( + "div#edicoes-anteriores.table-responsive table.table.table-bordered tbody tr" + ) + + for gazette_box in gazette_boxes: + edition_number = self.get_edition_number(gazette_box) + file_url = self.get_pdf_url(edition_number) + date = self.get_gazette_date(gazette_box) + + if date > self.end_date: + continue + elif date < self.start_date: + return + + yield Gazette( + date=date, + file_urls=[file_url], + edition_number=edition_number, + power="executive_legislative", + ) + + last_page = self.get_last_page(response) + if page < last_page: + yield scrapy.Request( + url=self.get_url(page + 1), cb_kwargs={"page": page + 1} + ) + + def get_url(self, page=1): + return f"{self.BASE_URL}/home?page={page}" + + @staticmethod + def get_last_page(response): + """ + Gets the last page number available in the pages navigation menu + """ + pages = response.css("ul.pagination li.page-item a::text").getall() + if len(pages) == 0: + return 1 + return max([int(page) for page in pages if page.isnumeric()]) + + def get_pdf_url(self, edition_number): + """ + Gets the url for the gazette inside one of the 'div#edicoes-anteriores' table + """ + return f"{self.BASE_URL}/diariooficial/getFile/{edition_number}/download=true" + + def get_gazette_date(self, response_item): + """ + Get the date for the gazette inside one of the 'div#edicoes-anteriores' table + """ + date = response_item.css("td:nth-child(3)::text").get().strip() + date_cut = self.__format_date(date) + return parse(date_cut, date_formats=["%d - %B - %Y"], languages=["pt"]).date() + + @staticmethod + def __format_date(date): + split_date = date.split(",") + return split_date[1] + + def get_edition_number(self, response_item): + """ + Get the edition number inside one of the 'div#edicoes-anteriores' table + """ + text_edition = response_item.css("td:nth-child(1) a::text").get().strip() + return self.__cut_edition_number(text_edition) + + @staticmethod + def __cut_edition_number(text): + split_text = text.split(" ") + split_number_year = split_text[3].split("/") + return split_number_year[0] From 7c889bd97586379b19432ecd6ab3747865a4248b Mon Sep 17 00:00:00 2001 From: Victor Raton <43411882+victorfernandesraton@users.noreply.github.com> Date: Sat, 7 Sep 2024 20:37:13 -0300 Subject: [PATCH 2/5] =?UTF-8?q?fix:=20removendo=20m=C3=A9todos=20e=20otimi?= =?UTF-8?q?zando=20a=20classe?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../spiders/ma/ma_sao_jose_dos_basilios.py | 61 +++---------------- 1 file changed, 10 insertions(+), 51 deletions(-) diff --git a/data_collection/gazette/spiders/ma/ma_sao_jose_dos_basilios.py b/data_collection/gazette/spiders/ma/ma_sao_jose_dos_basilios.py index f43fa4c83..d1d2358e7 100644 --- a/data_collection/gazette/spiders/ma/ma_sao_jose_dos_basilios.py +++ b/data_collection/gazette/spiders/ma/ma_sao_jose_dos_basilios.py @@ -15,17 +15,7 @@ class MaSaoJoseDosBasiliosSpider(BaseGazetteSpider): BASE_URL = "https://diariooficial.saojosedosbasilios.ma.gov.br" def start_requests(self): - yield scrapy.Request(self.get_url()) - - def parse_pagination(self, response): - """ - This parse function is used to get all the pages available and - return request object for each one - """ - return [ - scrapy.Request(self.get_url(page), callback=self.parse) - for page in range(1, 1 + self.get_last_page(response)) - ] + yield scrapy.Request(f"{self.BASE_URL}/home") def parse(self, response, page=1): """ @@ -37,7 +27,6 @@ def parse(self, response, page=1): for 
 
         for gazette_box in gazette_boxes:
             edition_number = self.get_edition_number(gazette_box)
-            file_url = self.get_pdf_url(edition_number)
             date = self.get_gazette_date(gazette_box)
 
             if date > self.end_date:
@@ -47,58 +36,28 @@ def parse(self, response, page=1):
             yield Gazette(
                 date=date,
-                file_urls=[file_url],
+                file_urls=[
+                    f"{self.BASE_URL}/diariooficial/getFile/{edition_number}/download=true"
+                ],
                 edition_number=edition_number,
                 power="executive_legislative",
             )
 
-        last_page = self.get_last_page(response)
-        if page < last_page:
-            yield scrapy.Request(
-                url=self.get_url(page + 1), cb_kwargs={"page": page + 1}
-            )
-
-    def get_url(self, page=1):
-        return f"{self.BASE_URL}/home?page={page}"
-
-    @staticmethod
-    def get_last_page(response):
-        """
-        Get the last page number shown in the pagination menu
-        """
-        pages = response.css("ul.pagination li.page-item a::text").getall()
-        if len(pages) == 0:
-            return 1
-        return max([int(page) for page in pages if page.isnumeric()])
-
-    def get_pdf_url(self, edition_number):
-        """
-        Build the PDF URL for a gazette edition listed in the 'div#edicoes-anteriores' table
-        """
-        return f"{self.BASE_URL}/diariooficial/getFile/{edition_number}/download=true"
+            next_page_url = response.css("a.page-link[rel='next']::attr(href)").get()
+            if next_page_url:
+                yield scrapy.Request(url=next_page_url)
 
     def get_gazette_date(self, response_item):
         """
         Get the date of a gazette from a row of the 'div#edicoes-anteriores' table
         """
-        date = response_item.css("td:nth-child(3)::text").get().strip()
-        date_cut = self.__format_date(date)
-        return parse(date_cut, date_formats=["%d - %B - %Y"], languages=["pt"]).date()
-
-    @staticmethod
-    def __format_date(date):
-        split_date = date.split(",")
-        return split_date[1]
+        date = response_item.css("td:nth-child(3)::text").get().strip().split(",")[1]
+        return parse(date, date_formats=["%d - %B - %Y"], languages=["pt"]).date()
 
     def get_edition_number(self, response_item):
         """
         Get the edition number from a row of the 'div#edicoes-anteriores' table
         """
         text_edition = response_item.css("td:nth-child(1) a::text").get().strip()
-        return self.__cut_edition_number(text_edition)
-
-    @staticmethod
-    def __cut_edition_number(text):
-        split_text = text.split(" ")
-        split_number_year = split_text[3].split("/")
+        split_number_year = text_edition.split(" ")[3].split("/")
         return split_number_year[0]

From cc858e87d2335eaea419d21c66b47c6661a8825d Mon Sep 17 00:00:00 2001
From: Victor Raton <43411882+victorfernandesraton@users.noreply.github.com>
Date: Wed, 11 Sep 2024 22:55:46 -0300
Subject: [PATCH 3/5] fix: reduce code complexity by removing unnecessary helpers

---
 .../spiders/ma/ma_sao_jose_dos_basilios.py | 33 ++++++++-----------
 1 file changed, 14 insertions(+), 19 deletions(-)

diff --git a/data_collection/gazette/spiders/ma/ma_sao_jose_dos_basilios.py b/data_collection/gazette/spiders/ma/ma_sao_jose_dos_basilios.py
index d1d2358e7..066034771 100644
--- a/data_collection/gazette/spiders/ma/ma_sao_jose_dos_basilios.py
+++ b/data_collection/gazette/spiders/ma/ma_sao_jose_dos_basilios.py
@@ -1,7 +1,7 @@
 import datetime
 
+import dateparser
 import scrapy
-from dateparser import parse
 
 from gazette.items import Gazette
 from gazette.spiders.base import BaseGazetteSpider
@@ -19,15 +19,25 @@ def start_requests(self):
 
     def parse(self, response, page=1):
         """
-        Parse a results page and yield the gazette editions available on it.
+        Parse a results page, yield the gazette editions available on it, and follow the next page.
         """
         gazette_boxes = response.css(
             "div#edicoes-anteriores.table-responsive table.table.table-bordered tbody tr"
         )
 
         for gazette_box in gazette_boxes:
-            edition_number = self.get_edition_number(gazette_box)
-            date = self.get_gazette_date(gazette_box)
+            edition_number = (
+                gazette_boxes.css("td:nth-child(1) a::text")
+                .get()
+                .strip()
+                .split(" ")[3]
+                .split("/")[0]
+            )
+
+            date = dateparser.parse(
+                gazette_boxes.css("td:nth-child(3)::text").get().strip().split(",")[1],
+                languages=["pt"],
+            ).date()
 
             if date > self.end_date:
                 continue
@@ -46,18 +56,3 @@ def parse(self, response, page=1):
             next_page_url = response.css("a.page-link[rel='next']::attr(href)").get()
             if next_page_url:
                 yield scrapy.Request(url=next_page_url)
-
-    def get_gazette_date(self, response_item):
-        """
-        Get the date of a gazette from a row of the 'div#edicoes-anteriores' table
-        """
-        date = response_item.css("td:nth-child(3)::text").get().strip().split(",")[1]
-        return parse(date, date_formats=["%d - %B - %Y"], languages=["pt"]).date()
-
-    def get_edition_number(self, response_item):
-        """
-        Get the edition number from a row of the 'div#edicoes-anteriores' table
-        """
-        text_edition = response_item.css("td:nth-child(1) a::text").get().strip()
-        split_number_year = text_edition.split(" ")[3].split("/")
-        return split_number_year[0]

From a02e0b34be9b74d88acaffae9f720dfc7f3c4b27 Mon Sep 17 00:00:00 2001
From: Victor Raton <43411882+victorfernandesraton@users.noreply.github.com>
Date: Thu, 12 Sep 2024 16:40:54 -0300
Subject: [PATCH 4/5] fix: move pagination outside the table row loop

---
 .../gazette/spiders/ma/ma_sao_jose_dos_basilios.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/data_collection/gazette/spiders/ma/ma_sao_jose_dos_basilios.py b/data_collection/gazette/spiders/ma/ma_sao_jose_dos_basilios.py
index 066034771..668c2425d 100644
--- a/data_collection/gazette/spiders/ma/ma_sao_jose_dos_basilios.py
+++ b/data_collection/gazette/spiders/ma/ma_sao_jose_dos_basilios.py
@@ -53,6 +53,6 @@ def parse(self, response, page=1):
                 power="executive_legislative",
             )
 
-            next_page_url = response.css("a.page-link[rel='next']::attr(href)").get()
-            if next_page_url:
-                yield scrapy.Request(url=next_page_url)
+        next_page_url = response.css("a.page-link[rel='next']::attr(href)").get()
+        if next_page_url:
+            yield scrapy.Request(url=next_page_url)

From d6e4dd40b935ae1536f93d3d635736dea99a90bc Mon Sep 17 00:00:00 2001
From: Giulio
Date: Wed, 25 Sep 2024 16:00:07 -0300
Subject: [PATCH 5/5] Fix use of list elements for parsing

---
 .../gazette/spiders/ma/ma_sao_jose_dos_basilios.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/data_collection/gazette/spiders/ma/ma_sao_jose_dos_basilios.py b/data_collection/gazette/spiders/ma/ma_sao_jose_dos_basilios.py
index 668c2425d..fd229c013 100644
--- a/data_collection/gazette/spiders/ma/ma_sao_jose_dos_basilios.py
+++ b/data_collection/gazette/spiders/ma/ma_sao_jose_dos_basilios.py
@@ -27,7 +27,7 @@ def parse(self, response, page=1):
 
         for gazette_box in gazette_boxes:
             edition_number = (
-                gazette_boxes.css("td:nth-child(1) a::text")
+                gazette_box.css("td:nth-child(1) a::text")
                 .get()
                 .strip()
                 .split(" ")[3]
                 .split("/")[0]
             )
 
             date = dateparser.parse(
-                gazette_boxes.css("td:nth-child(3)::text").get().strip().split(",")[1],
gazette_box.css("td:nth-child(3)::text").get().strip().split(",")[1], languages=["pt"], ).date()