Skip to content

Commit

Permalink
fix: removendo métodos e otimizando a classe
Browse files Browse the repository at this point in the history
  • Loading branch information
victorfernandesraton committed Sep 7, 2024
1 parent 13bb133 commit adc3eaf
Showing 1 changed file with 10 additions and 51 deletions.
61 changes: 10 additions & 51 deletions data_collection/gazette/spiders/ma/ma_sao_jose_dos_basilios.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,17 +15,7 @@ class MaSaoJoseDosBasiliosSpider(BaseGazetteSpider):
BASE_URL = "https://diariooficial.saojosedosbasilios.ma.gov.br"

def start_requests(self):
yield scrapy.Request(self.get_url())

def parse_pagination(self, response):
    """Yield one request per listing page.

    Reads the highest page number from the pagination menu and builds a
    request for every page from 1 up to it, all handled by ``parse``.
    """
    last_page = self.get_last_page(response)
    requests = []
    for page_number in range(1, last_page + 1):
        requests.append(
            scrapy.Request(self.get_url(page_number), callback=self.parse)
        )
    return requests
yield scrapy.Request(f"{self.BASE_URL}/home")

def parse(self, response, page=1):
"""
Expand All @@ -37,7 +27,6 @@ def parse(self, response, page=1):

for gazette_box in gazette_boxes:
edition_number = self.get_edition_number(gazette_box)
file_url = self.get_pdf_url(edition_number)
date = self.get_gazette_date(gazette_box)

if date > self.end_date:
Expand All @@ -47,58 +36,28 @@ def parse(self, response, page=1):

yield Gazette(
date=date,
file_urls=[file_url],
file_urls=[
f"{self.BASE_URL}/diariooficial/getFile/{edition_number}/download=true"
],
edition_number=edition_number,
power="executive_legislative",
)

last_page = self.get_last_page(response)
if page < last_page:
yield scrapy.Request(
url=self.get_url(page + 1), cb_kwargs={"page": page + 1}
)

def get_url(self, page=1):
return f"{self.BASE_URL}/home?page={page}"

@staticmethod
def get_last_page(response):
"""
Gets the last page number available in the pages navigation menu
"""
pages = response.css("ul.pagination li.page-item a::text").getall()
if len(pages) == 0:
return 1
return max([int(page) for page in pages if page.isnumeric()])

def get_pdf_url(self, edition_number):
"""
Gets the url for the gazette inside one of the 'div#edicoes-anteriores' table
"""
return f"{self.BASE_URL}/diariooficial/getFile/{edition_number}/download=true"
next_page_url = response.css("a.page-link[rel='next']::attr(href)").get()
if next_page_url:
yield scrapy.Request(url=next_page_url)

def get_gazette_date(self, response_item):
    """Extract and parse the publication date from one row of the
    'div#edicoes-anteriores' table (third column, Portuguese month names)."""
    raw_date = response_item.css("td:nth-child(3)::text").get().strip()
    trimmed_date = self.__format_date(raw_date)
    return parse(trimmed_date, date_formats=["%d - %B - %Y"], languages=["pt"]).date()

@staticmethod
def __format_date(date):
split_date = date.split(",")
return split_date[1]
date = response_item.css("td:nth-child(3)::text").get().strip().split(",")[1]
return parse(date, date_formats=["%d - %B - %Y"], languages=["pt"]).date()

def get_edition_number(self, response_item):
    """Read the edition number from the first-column link text of one row
    of the 'div#edicoes-anteriores' table."""
    link_text = response_item.css("td:nth-child(1) a::text").get().strip()
    return self.__cut_edition_number(link_text)

@staticmethod
def __cut_edition_number(text):
split_text = text.split(" ")
split_number_year = split_text[3].split("/")
split_number_year = text_edition.split(" ")[3].split("/")
return split_number_year[0]

0 comments on commit adc3eaf

Please sign in to comment.