Skip to content

Commit

Permalink
feat(struc): database modularization and code improvement (#137)
Browse files Browse the repository at this point in the history
* feat(struc): database modularization and code improvement

* SIM database

* SIM database

* SINASC database

* Docstrings

* Finish SIM and minor improvements

* SIH Database & removing dateparser in File to increase listing speed by 35x

* Saving work

* minor fix on zfill_year

* Setting a time limit of 5 seconds to every test

* redo poetry lock

* minor fix on dockerfile

* skip tests that take over 5 seconds

* Improve SINAN.get_files to accept multiple queues

* Include asynchronous FTP download method

* Improve FTP objects when listing ftp content

* Implement download methods on File class

* Fix some docstrings

* Replace online_data.SINAN methods with new class

* Replace online_data.SIM download method with new class

* Replace online_data.sinasc download method with new class

* Replace online_data.SIH download method with new class

* Replace online_data.SIA download method with new class

* add humanize pckg

* Start removing references from online_data.__init__
  • Loading branch information
luabida authored Sep 2, 2023
1 parent fb4521b commit d7e6d27
Show file tree
Hide file tree
Showing 35 changed files with 2,421 additions and 1,372 deletions.
3 changes: 2 additions & 1 deletion docker/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,8 @@ RUN curl -sSL https://install.python-poetry.org/ | POETRY_HOME=/opt/poetry pytho
ln -s /opt/poetry/bin/poetry && \
poetry config virtualenvs.create false

RUN chmod -R a+rwx /home/pysus/.config/pypoetry/
RUN chmod -R a+rwx /home/pysus/.config/pypoetry/ \
&& chown -R pysus:pysus /opt/conda/bin/

USER pysus

Expand Down
2,364 changes: 1,228 additions & 1,136 deletions poetry.lock

Large diffs are not rendered by default.

22 changes: 15 additions & 7 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -11,23 +11,25 @@ packages = [{include='pysus'}]
cffi = "1.15.1"
dbfread = "2.0.7"
fastparquet = "^0.8.1"
geocoder = "^1.38.1"
jupyterlab = "^3.4.5"
numpy = "1.23.2"
pyarrow = ">=11.0.0"
pycparser = "2.21"
pyreaddbc = "1.0.0"
python = "^3.9"
python-dateutil = "2.8.2"
pytz = "2022.2.1"
six = "1.16.0"
tqdm = "4.64.0"
wget = "^3.2"
loguru = "^0.6.0"
Unidecode = "^1.3.6"
elasticsearch = "7.16.2"
ipykernel = "^6.22.0"
geobr = "^0.2.0"
dateparser = "^1.1.8"
pandas = ">=1.5.3"
urwid = "^2.1.2"
# Preprocessing Packages
geobr = { version = "^0.2.0", extras=["preprocessing"] }
geocoder = { version = "^1.38.1", extras=["ml"] }
aioftp = "^0.21.4"
humanize = "^4.8.0"

[tool.poetry.group.dev.dependencies]
pytest = ">=6.1.0"
Expand All @@ -42,6 +44,9 @@ folium = "^0.14.0"
seaborn = "^0.12.2"
descartes = "^1.1.0"
keplergl = "^0.3.2"
jupyterlab = "^4.0.5"
ipykernel = "^6.25.1"
pytest-timeout = "^2.1.0"

[build-system]
requires = ["poetry-core>=1.0.0"]
Expand All @@ -66,5 +71,8 @@ testpaths = [
"tests"
]


exclude = ["*.git", "docs/"]

[tool.poetry.extras]
preprocessing = ["geobr", "geocoder"]

279 changes: 279 additions & 0 deletions pysus/ftp/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,279 @@
from loguru import logger
from datetime import datetime
from ftplib import FTP
import os
import pathlib
from aioftp import Client
from typing import List, Optional, Union


CACHEPATH = os.getenv(
'PYSUS_CACHEPATH', os.path.join(str(pathlib.Path.home()), 'pysus')
)

class File:
"""
FTP File class. This class will contain methods for interacting with
files inside DataSUS FTP server. The databases will be responsible for
parsing the files found for each db into File classes, enabling the
databases' files to share state and its reusability.
Parameters
path [str]: entire directory path where the file is located
inside the FTP server
name [str]: basename of the file
info [dict]: a dict containing the keys [size, type, modify], which
are present in every FTP server. In PySUS, this info
is extract using `line_file_parser` with FTP LIST.
Methods
download(local_dir): extract the file to local_dir
async_download(local_dir): async extract the file to local_dir
"""

def __init__(self, path: str, name: str, info: dict) -> None:
name, extension = os.path.splitext(name)
self.name = name
self.extension = extension
self.basename = pathlib.PurePosixPath(self.name + self.extension)
self.path = pathlib.PurePosixPath(
path + str(self.basename)
if path.endswith("/")
else path + "/" + str(self.basename)
)
self.info = info

def __str__(self) -> str:
return str(self.basename)

def __repr__(self) -> str:
return str(self.basename)

def __hash__(self):
return hash(self.path)

def __eq__(self, other):
if isinstance(other, File):
return self.path == other.path
return False

def download(self, local_dir: str = CACHEPATH) -> str:
dir = pathlib.Path(local_dir)
dir.mkdir(exist_ok=True, parents=True)
filepath = dir / self.basename

if filepath.exists():
return str(filepath)

ftp = ftp = FTP("ftp.datasus.gov.br")
ftp.login()
ftp.retrbinary(
f"RETR {self.path}",
open(f"{filepath}", "wb").write,
)
ftp.close()
return str(filepath)

async def async_download(self, local_dir: str = CACHEPATH) -> None:
# aioftp.Client.parse_list_line_custom
def line_file_parser(file_line):
line = file_line.decode("utf-8")
info = {}
if "<DIR>" in line:
date, time, _, *name = str(line).strip().split()
info["size"] = 0
info["type"] = "dir"
name = " ".join(name)
else:
date, time, size, name = str(line).strip().split()
info["size"] = size
info["type"] = "file"

modify = datetime.strptime(
" ".join([date, time]), "%m-%d-%y %I:%M%p"
)
info["modify"] = modify.strftime("%m/%d/%Y %I:%M%p")

return pathlib.PurePosixPath(name), info

dir = pathlib.Path(local_dir)
dir.mkdir(exist_ok=True, parents=True)
filepath = dir / self.basename

output = (
local_dir + str(self.basename)
if local_dir.endswith("/")
else local_dir + "/" + str(self.basename)
)

if filepath.exists():
logger.debug(output)
pass

async with Client.context(
host="ftp.datasus.gov.br", parse_list_line_custom=line_file_parser
) as client:
await client.login()
await client.download(self.path, output, write_into=True)
logger.debug(output)


class Directory:
"""
FTP Directory class.
Parameters
path [str]: entire directory path where the directory is located
inside the FTP server
name [str]: directory name
info [dict]: a dict containing the keys [size, type, modify], which
are present in every FTP server. In PySUS, this info
is extract using `line_file_parser` with FTP LIST.
OBS: A Directory should have size=0
"""

def __init__(
self,
path: str,
info: dict,
content: Optional[List] = None,
) -> None:
self.path = pathlib.PurePosixPath(path)
self.info = info
self.content = content

def __str__(self) -> str:
return str(self.path)

def __repr__(self) -> str:
return str(self.path)

def __hash__(self):
return hash(self.path)

def __eq__(self, other):
if isinstance(other, Directory):
return self.path == other.path
return False


def list_path(path: str) -> List[Union[Directory, File]]:
"""
This method is responsible for listing all the database's
files when the database class is firstly called. It will
convert the files found within the paths into `File`s,
returning a list of Files and Directories.
"""
content = list()
ftp = FTP("ftp.datasus.gov.br")
ftp.connect()
ftp.login()
ftp.cwd(path)

def line_file_parser(file_line):
info = {}
if "<DIR>" in file_line:
date, time, _, *name = str(file_line).strip().split()
info["size"] = 0
info["type"] = "dir"
name = " ".join(name)
modify = datetime.strptime(
" ".join([date, time]), "%m-%d-%y %I:%M%p"
)
info["modify"] = modify
xpath = path + name if path.endswith("/") else path + "/" + name
content.append(Directory(xpath, info))
else:
date, time, size, name = str(file_line).strip().split()
info["size"] = size
info["type"] = "file"
modify = datetime.strptime(
" ".join([date, time]), "%m-%d-%y %I:%M%p"
)
info["modify"] = modify
content.append(File(path, name, info))

ftp.retrlines("LIST", line_file_parser)
ftp.close()
return content


class Database:
"""
Base class for PySUS databases. Contains common functions
for accessing DataSUS FTP server. With this class, it is
possible to construct database classes for different DataSUS
files, sharing state and functionalities.
Parameters
ftp [FTP]: ftplib.FTP object for connecting in DataSUS server.
name [str]: database name
paths [list[str]]: server paths where the files are located
files [list[Files]]: list of parsed Files that will be loaded at
database instantiation.
metadata [dict]: dict containing database's metadata information
Methods
describe(file): describes a file according to each database's
spec. Returns a dict with file information
all_files(): runs at instantiation, enters the FTP server and reads
files within the paths, parsing them into File classes
and returning all Files as a list
"""

ftp: FTP
name: str
paths: List[str]
content: List[Union[Directory, File]]
files: List[File]
metadata: dict

def __init__(self) -> None:
self.ftp = FTP("ftp.datasus.gov.br")
self.content = self.load(self.paths)
self.files = [f for f in self.content if isinstance(f, File)]

def __repr__(self) -> str:
return f'{self.name} - {self.metadata["long_name"]}'

def load(self, paths: List[str]):
"""
This method is responsible for listing all the database's
files when the database class is firstly called. It will
convert the files found within the paths into `File`s,
and `Directory`(ies).
"""
content = []
for path in paths:
content.extend(list_path(path))
return content

def describe(self, file: File) -> dict:
"""
Receives a `File` and returns a dict with its information,
according to the database's specifications. This method is
helpful to return the FTP's file in a humanized format
Parameters
file [File]: a `File` instance
"""
...

def format(self, file: File) -> tuple:
"""
Formats a File based on the database specifications,
extracting its name's parameters given a pattern.
Parameters
file [File]: a `File` instance
"""
...

def get_files(self, *args, **kwargs) -> list[File]:
"""
Filters the list of `File`s according to each database file
pattern, as UFs, Groups, Years, Months, etc. This method will
also be responsible to look for wrong values within the file
pattern and possible extra characters in its basename
"""
...
Loading

0 comments on commit d7e6d27

Please sign in to comment.