obitos_registral_cities_spider.py
"""Scrapy spider that collects weekly death counts, per city and cause of
death, from the Brazilian Registro Civil transparency API."""
import json
from urllib.parse import urlencode, urljoin

import scrapy
from epiweeks import Week

import date_utils


class DeathsSpider(scrapy.Spider):
    name = "obitos_registral_cities"
    cities_url = "https://transparencia.registrocivil.org.br/api/covid-cities"
    registral_url = (
        "https://transparencia.registrocivil.org.br/api/covid-covid-registral"
    )
    # Maps the output field name to the cause-of-death key used by the API
    causes_map = {
        "sars": "SRAG",
        "pneumonia": "PNEUMONIA",
        "respiratory_failure": "INSUFICIENCIA_RESPIRATORIA",
        "septicemia": "SEPTICEMIA",
        "indeterminate": "INDETERMINADA",
        "others": "OUTRAS",
    }

    def make_cities_request(self, total, callback, dont_cache=False):
        data = {
            "total": total,  # Seems not to be working, it's always 100
            "type": "registral-covid",
        }
        return scrapy.Request(
            url=urljoin(self.cities_url, "?" + urlencode(data)),
            callback=callback,
            meta={"row": data, "dont_cache": dont_cache},
        )

    def make_registral_request(
        self, city, ep_week, callback, dont_cache=False,
    ):
        data = {
            "city_id": city["city_id"],
            "state": city["uf"],
            "start_date": str(ep_week.startdate()),
            "end_date": str(ep_week.enddate()),
        }
        return scrapy.Request(
            url=urljoin(self.registral_url, "?" + urlencode(data)),
            callback=callback,
            meta={
                "row": data,
                "city_name": city["nome"],
                "ep_week": ep_week,
                "dont_cache": dont_cache,
            },
        )
    def start_requests(self):
        yield self.make_cities_request(
            total=100, callback=self.parse_cities_request, dont_cache=False
        )

    def parse_cities_request(self, response):
        cities = json.loads(response.body)
        today = date_utils.today()
        current_week = Week.fromdate(today)

        # We have to do separate passes for 2019 and 2020, since the specific
        # calendar days of each epidemiological week differ between the two
        # years (see the illustrative note after this method).
        #
        # The API seems to return the data from the current year as "2020" and
        # from the previous year as "2019", so we exploit that to extract the
        # data only from the "2020" chart.
        for city in cities:
            for year in [2020, 2019]:
                for weeknum in range(1, current_week.week):
                    ep_week = Week(year, weeknum)
                    # Cache requests for weeks more than 4 weeks old
                    should_cache = (current_week.week - weeknum) > 4
                    yield self.make_registral_request(
                        city=city,
                        ep_week=ep_week,
                        callback=self.parse_registral_request,
                        dont_cache=not should_cache,
                    )
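    # Illustrative note (not from the original source): epiweeks maps calendar
    # dates to CDC epidemiological weeks, so the same week number falls on
    # different calendar days each year. For instance, Week(2020, 12) should
    # span 2020-03-15 to 2020-03-21, while Week(2019, 12) should span
    # 2019-03-17 to 2019-03-23, hence the separate passes above.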
    def add_causes(self, row, data):
        for cause, portuguese_name in self.causes_map.items():
            row[cause] = data[portuguese_name]

    def parse_registral_request(self, response):
        ep_week = response.meta["ep_week"]
        row = response.meta["row"].copy()
        row["city_name"] = response.meta["city_name"]
        row["epidemiological_year"] = ep_week.year
        row["epidemiological_week"] = ep_week.week
        data = json.loads(response.body)
        if "dont_cache" in row:
            del row["dont_cache"]
        for cause in self.causes_map:
            row[cause] = 0
        row["covid"] = 0
        chart_data = data["chart"]
        if chart_data:
            if "2020" in chart_data:
                self.add_causes(row, chart_data["2020"])
                row["covid"] = chart_data["2020"]["COVID"]
        yield row
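

# ---------------------------------------------------------------------------
# Minimal sketch (not part of the original spider): the project normally runs
# this through its own Scrapy setup (e.g. `scrapy crawl obitos_registral_cities`).
# The snippet below shows how the spider could be run standalone with
# CrawlerProcess, assuming `date_utils` is importable; the CSV output path is
# illustrative.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    from scrapy.crawler import CrawlerProcess

    process = CrawlerProcess(
        settings={
            # FEEDS is Scrapy's built-in item export configuration
            "FEEDS": {"obitos_registral_cities.csv": {"format": "csv"}},
        }
    )
    process.crawl(DeathsSpider)
    process.start()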