diff --git a/notes/scripts/manifest.sh b/notes/scripts/manifest.sh index 91a4d412..552a48bb 100644 --- a/notes/scripts/manifest.sh +++ b/notes/scripts/manifest.sh @@ -9,8 +9,563 @@ docker-compose up -d docker-compose ps docker-compose stop mysql +docker-compose run -T --rm -e PGPASSWORD=secret postgres \ + psql -h postgres -U adp adp-dev <=1.0.0"] diff --git a/scripts/manifest_export.py b/scripts/manifest_export.py index 04d6475f..b2f08c6b 100755 --- a/scripts/manifest_export.py +++ b/scripts/manifest_export.py @@ -1,28 +1,31 @@ import os -import django - -os.environ.setdefault("DJANGO_SETTINGS_MODULE", "vitrina.settings") -django.setup() +from typing import Dict from tqdm import tqdm -from vitrina.datasets.models import DatasetStructure, DatasetStructureMapping, Dataset -from vitrina.orgs.models import Organization, OrganizationMapping -from typer import run, Option +from typer import run, Argument import csv import pandas as pd import hashlib +import pathlib +import dataclasses + +import django + organization_entries = [] dataset_entries = [] repository_dataset_entries = [] +UNKNOWN = 'unknown' + def main( - manifest_path: str = Option('manifest-data/', help=( - 'Path to where dataset manifest files are saved' - )), + manifest_path: str = Argument('manifest-data/', help=( + 'Path to where dataset manifest files are saved' + )), ): - pbar = tqdm('Exporting manifest information', total=DatasetStructure.objects.count()) + os.environ.setdefault("DJANGO_SETTINGS_MODULE", "vitrina.settings") + django.setup() total = 0 total_xls = 0 @@ -31,151 +34,27 @@ def main( random = 0 found_ids = 0 rows_to_append = [] + manifest_path = pathlib.Path(manifest_path) repo_files = [] - with (pbar): - for item in DatasetStructure.objects.all(): - if item.filename is not None: - dataset_id = item.dataset_id - dataset = Dataset.objects.filter(pk=dataset_id).first() - org_id = dataset.organization.pk - org_title = Organization.objects.get(pk=org_id).title + orgs = read_orgs_mapping(manifest_path) + datasets = read_datasets_mapping(manifest_path) + read_structure_files(orgs, datasets) - if item.standardized: - standartized += 1 - else: - random += 1 - if 'csv' in item.filename: - # ignore old structure versions - if dataset.current_structure_id == item.pk: - total += 1 - if item.file is not None: - if item.standardized: - checksum = get_digest(item.file.path) - with open(item.file.path, 'r', encoding='utf-8', errors='ignore') as file: - reader = csv.reader(file) - for row in reader: - for field in row: - if 'datasets/' in field: - found_ids += 1 - if '"' in field: - mk1 = field.find('"') + 1 - mk2 = field.rfind('"', mk1) - sub = field[mk1: mk2] - if 'ref' in sub: - q = sub.rfind('"') + 1 - code = sub[q:len(sub)] - if 'datasets' in code: - if code.startswith('/'): - code = code[1:] - if len(code) > 0: - substrings = code.split('/') - org_code = substrings[2] - org_path = '/'.join(substrings[2:len(substrings) - 1]) - dataset_code = substrings[-1] - if dataset_code[0].islower(): - add_org_mapping(org_id, org_code, org_title) - add_dataset_mapping( - dataset_id, code, dataset.lt_title(), - org_title, checksum, False) - dump_current(file, org_path, manifest_path, - dataset_code) - else: - code = field[mk1: mk2] - if code.startswith('/'): - code = code[1:] - elif '""' in code: - st = code.find('"/') - fin = code.rfind('""') - code = code[st: fin].replace('"/', '') - if len(code) > 0: - if 'datasets' in code: - substrings = code.split('/') - org_code = substrings[2] - org_path = '/'.join(substrings[2:len(substrings) - 1]) 
- dataset_code = substrings[-1] - if dataset_code[0].islower(): - add_org_mapping(org_id, org_code, org_title) - add_dataset_mapping(dataset_id, code, - dataset.lt_title(), org_title, - checksum, False) - dump_current(file, org_path, manifest_path, - dataset_code) - else: - sub = field.find('datasets/') - if ';' in field: - if field.startswith(';;;;;'): - field = field[sub:] - fin = field.find(';') - field = field[:fin] - elif field.startswith(';'): - field = field[1:] - fin = field.find(';') - field = field[:fin] - if field.startswith('/'): - field = field[1:] - elif field.startswith('https') or field.startswith('http'): - fin = ':format' - field = field[sub:] - if fin in field: - field = field[:field.rfind(fin) - 1] - if len(field) > 0: - if ';' in field: - field = field.replace(';', '') - if field.startswith('1'): - field = field[1:] - if 'ref: ' in field: - field = field.replace('ref: ', '') - substrings = field.split('/') - if len(substrings) > 2: - org_code = substrings[2] - org_path = '/'.join(substrings[2:len(substrings) - 1]) - dataset_code = substrings[-1] - add_org_mapping(org_id, org_code, org_title) - if dataset_code[0].islower(): - add_dataset_mapping(dataset_id, field, dataset.lt_title(), - org_title, checksum, False) - dump_current(file, org_path, manifest_path, dataset_code) - elif 'xls' in item.filename or 'xlsx' in item.filename: - if dataset.current_structure_id == item.pk: - if item.file is not None: - checksum = get_digest(item.file.path) - if item.standardized: - dataframe1 = pd.read_excel(item.file.path) - if 'dataset' in dataframe1: - for i in range(0, len(dataframe1)): - code = dataframe1.iloc[i]['dataset'] - if isinstance(code, str): - found_ids += 1 - if '/' in code: - if len(code.split('/')) > 2: - org_code = code.split('/')[2] - org_path = '/'.join(code.split('/')[2:len(code.split('/')) - 1]) - add_org_mapping(org_id, org_code, org_title) - dataset_code = code.split('/')[-1] - if len(dataset_code) > 0: - if dataset_code[0].islower(): - add_dataset_mapping(dataset_id, code, dataset.lt_title(), org_title, - checksum, False) - with open(item.file.path, 'r', encoding='utf-8') as file: - dump_current(file, org_path, manifest_path, dataset_code) - total_xls += 1 - else: - total_not_csv += 1 - pbar.update(1) + return dump_orgs(organization_entries, manifest_path) dump_datasets(dataset_entries, manifest_path) - for root, dirs, files in os.walk(manifest_path + 'datasets/gov/'): + for root, dirs, files in os.walk(os.path.join(manifest_path, 'datasets/gov/')): for filename in files: if '.csv' in filename: repo_files.append(os.path.join(root, filename)) pbar2 = tqdm('Reading repository information', total=len(repo_files)) with (pbar2): - df = pd.read_csv(manifest_path + 'datasets.csv', index_col='name') + df = pd.read_csv(os.path.join(manifest_path, 'datasets.csv'), index_col='name') for repo_item in repo_files: checksum = get_digest(repo_item) with open(repo_item, 'r', encoding='utf-8') as f: @@ -208,47 +87,140 @@ def main( print(f'Organization mappings created: {len(organization_entries)}\n') +@dataclasses.dataclass +class OrganizationMapping: + id: int + name: str + title: str + + +def read_orgs_mapping( + manifest_path: pathlib.Path, +) -> Dict[ + str, # organization id from db + OrganizationMapping, +]: + orgs = {} + orgs_path = manifest_path / 'orgs.csv' + if orgs_path.exists(): + with orgs_path.open(encoding='utf-8') as f: + reader = csv.DictReader(f) + for row in reader: + orgs[row['id']] = OrganizationMapping( + id=row['id'], + name=row['name'], + 
title=row['title'],
+                )
+    return orgs
+
+
+@dataclasses.dataclass
+class DatasetMapping:
+    id: int
+    name: str
+    title: str
+    organization: str
+    checksum: str
+
+
+def read_datasets_mapping(
+    manifest_path: pathlib.Path,
+) -> Dict[
+    str,  # dataset id from db
+    DatasetMapping,
+]:
+    datasets = {}
+    datasets_path = manifest_path / 'datasets.csv'
+    if datasets_path.exists():
+        with datasets_path.open(encoding='utf-8') as f:
+            reader = csv.DictReader(f)
+            for row in reader:
+                datasets[row['id']] = DatasetMapping(
+                    id=row['id'],
+                    name=row['name'],
+                    title=row['title'],
+                    organization=row['org'],
+                    checksum=row['checksum'],
+                )
+    return datasets
+
+
+def read_structure_files(
+    orgs: Dict[str, OrganizationMapping],
+    datasets: Dict[str, DatasetMapping],
+    # Same default as the manifest_path argument in main().
+    manifest_path: pathlib.Path = pathlib.Path('manifest-data/'),
+) -> None:  # updates orgs and datasets in place
+    qs = get_datasets_in_db()
+    pbar = tqdm(qs, 'Exporting manifest information', total=qs.count())
+
+    for dataset in pbar:
+        item = dataset.current_structure
+        org = dataset.organization
+
+        # The mapping files are read with csv.DictReader, so keys are strings.
+        if str(org.id) in orgs:
+            org.name = orgs[str(org.id)].name
+
+        dataset_name = None
+        if str(dataset.id) in datasets:
+            dataset_name = datasets[str(dataset.id)].name
+
+        df = None
+        if item.filename.endswith('.csv'):
+            # Detect the delimiter from a short sample before handing the file to pandas.
+            with open(item.file.path, encoding='utf-8') as f:
+                sample = f.read(20)
+            for sep in (',', ';', '\t'):
+                if sep in sample:
+                    break
+            else:
+                sep = None
+            if sep:
+                df = pd.read_csv(item.file.path, sep=sep)
+        elif item.filename.endswith(('.xls', '.xlsx')):
+            df = pd.read_excel(item.file.path)
+
+        if df is not None and 'dataset' in df.columns:
+            idx = df['dataset'].first_valid_index()
+            if idx is not None:
+                given_dataset_name = df['dataset'].loc[idx]
+                if given_dataset_name.startswith('datasets/gov/'):
+                    parts = given_dataset_name.split('/')
+                    if len(parts) > 2:
+                        given_org_name = parts[2]
+                        print(given_org_name)
+
+        if dataset_name:
+            ns_path = (manifest_path / dataset_name).parent
+            ns_path.mkdir(0o755, parents=True, exist_ok=True)
+
+        checksum = get_digest(item.file.path)
+
+
+def get_datasets_in_db():
+    from vitrina.datasets.models import Dataset
+    return (
+        Dataset.objects.
+        select_related(
+            'current_structure',
+            'organization',
+        ).
+ filter( + current_structure__filename__isnull=False, + current_structure__file__isnull=False, + ) + ) + + def get_digest(file_path): h = hashlib.sha256() with open(file_path, 'rb') as file: - while True: + chunk = True + while chunk: chunk = file.read(h.block_size) - if not chunk: - break h.update(chunk) return h.hexdigest() -def add_dataset_mapping(dataset_id, name, title, org, checksum, repo): - if repo: - dataset_id = None - target = repository_dataset_entries - else: - target = dataset_entries - mapping = DatasetStructureMapping( - dataset_id=dataset_id, - name=name, - title=title, - org=org, - checksum=checksum - ) - target.append(mapping) - - -def add_org_mapping(org_id, name, title): - mapping = OrganizationMapping( - org_id=org_id, - name=name, - title=title - ) - if not any(obj.org_id == org_id for obj in organization_entries): - organization_entries.append(mapping) - - def dump_orgs(data, manifest_path): - path = manifest_path + 'orgs.csv' + path = os.path.join(manifest_path, 'orgs.csv') columns = ['id', 'name', 'title'] lst = [[x.org_id, x.name, x.title] for x in data] df = pd.DataFrame(lst) @@ -273,7 +245,7 @@ def dump_orgs(data, manifest_path): def dump_datasets(data, manifest_path): multiple_entries = {} - path = manifest_path + 'datasets.csv' + path = os.path.join(manifest_path, 'datasets.csv') columns = ['id', 'name', 'title', 'org', 'checksum'] lst = [[x.dataset_id, x.name, x.title, x.org, x.checksum] for x in data] df = pd.DataFrame(lst) @@ -302,7 +274,7 @@ def dump_datasets(data, manifest_path): def dump_repo_datasets(data, manifest_path): - path = manifest_path + 'datasets.csv' + path = os.path.join(manifest_path, 'datasets.csv') dff = pd.DataFrame(data) dff.to_csv(path, mode='a', index=False, header=False, quoting=csv.QUOTE_ALL) @@ -310,7 +282,7 @@ def dump_repo_datasets(data, manifest_path): def dump_current(file, org_path, manifest_path, dataset): unknown_index = 0 original_ext = '.' 
+ file.name.split('.')[-1] - parent_dir = manifest_path + 'datasets/gov/' + parent_dir = os.path.join(manifest_path, 'datasets/gov/') if len(org_path) == 0: org_path = 'unknown/' if not org_path.endswith('/'): diff --git a/tests/scripts/__init__.py b/tests/scripts/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/scripts/test_manifest_export.py b/tests/scripts/test_manifest_export.py new file mode 100644 index 00000000..be0a871f --- /dev/null +++ b/tests/scripts/test_manifest_export.py @@ -0,0 +1,319 @@ +import csv +import hashlib +import shutil +from pathlib import Path +from typing import Iterable +from typing import Iterator + +import pytest +import openpyxl +from factory.django import FileField +from django.db.models import QuerySet + +from vitrina.datasets.factories import DatasetStructureFactory +from vitrina.structure.models import Metadata +from vitrina.datasets.models import Dataset +from vitrina.datasets.models import DatasetStructure +# from vitirna.scripts.manifest.exporter +# from vitirna.scripts.manifest.importer + + +@pytest.mark.parametrize('sample,sep', [ + ('id;dataset;resource', ';'), + ('id,dataset,resource', ','), + ('id\tdataset\tresource', '\t'), +]) +def test_read_csv_table_detect_sep(sample: str, sep: str): + assert detect_csv_sep(sample) == sep + + +def detect_csv_sep(sample: str) -> str: + for sep in (',', ';', '\t'): + if sep in sample: + return sep + + +def write_csv_table(path: Path, table: Iterable[list[str]]): + with open(path, 'w', encoding='utf-8') as f: + writer = csv.writer(f) + for row in table: + writer.writerow(row) + + +def read_csv_table(path: Path) -> Iterator[list[str]]: + with open(path, encoding='utf-8') as f: + sample = f.read(20) + sep = detect_csv_sep(sample) + + with open(path, encoding='utf-8') as f: + if sep: + reader = csv.reader(f, delimiter=sep, quotechar='|') + for row in reader: + yield [v.strip() for v in row] + else: + for row in f: + yield row.strip() + + +def write_xlsx_table(path: Path, table: Iterable[list[str]]): + wb = openpyxl.Workbook() + ws = wb.active + for row in table: + ws.append(row) + wb.save(path) + + +def read_xlsx_table(path: Path) -> Iterator[list[str]]: + wb = openpyxl.load_workbook(path) + for sheet in wb: + buffer = [] + rows = sheet.iter_rows(max_col=20, values_only=True) + header = next(rows, None) + header = [str(v).strip() if v else '' for v in header] + header = rstrip_list(header) + yield header + for row in rows: + row = [str(v).strip() if v else '' for v in row] + row = rstrip_list(row, limit=len(header)) + buffer.append(row) + if any(row): + yield from buffer + buffer = [] + elif len(buffer) > 100: + break + + +@pytest.mark.parametrize('lst, result', [ + ('1,2,3', '1,2,3'), + ('1,2,3,', '1,2,3'), + ('1,2,3,,', '1,2,3'), + (',1,2,3,,', ',1,2,3'), +]) +def test_rstrip_list(lst, result): + assert ','.join(rstrip_list(lst.split(','))) == result + + +@pytest.mark.parametrize('lst, result', [ + ('1,2', '1,2'), + ('1,2,,,,', '1,2,,'), +]) +def test_rstrip_list_limit(lst, result): + assert ','.join(rstrip_list(lst.split(','), limit=4)) == result + + +def rstrip_list(value, *, limit: int | None = None): + size = len(value) + pos = next( + (i for i, v in enumerate(reversed(value)) if v), + size - 1, + ) + pos = size - pos + if limit is not None and size > limit: + pos = limit + return value[:pos] + + +def read_table(format: str, path: Path): + reader = FORMATS[format]['reader'] + yield from reader(path) + + +def write_table(format: str, path: Path, table: Iterable[list[str]]): + writer 
= FORMATS[format]['writer'] + writer(path, table) + + +def get_format_from_path(path: Path): + return path.suffix.strip('.') + + +FORMATS = { + 'csv': { + 'reader': read_csv_table, + 'writer': write_csv_table, + }, + 'xlsx': { + 'reader': read_xlsx_table, + 'writer': write_xlsx_table, + }, +} + + +@pytest.mark.parametrize('format', list(FORMATS)) +def test_table_io(tmp_path: Path, format: str): + table = [ + ['id', 'dataset', 'resource'], + ['', 'datasets/gov/ivpk/adp', ''], + ] + + path = (tmp_path / f'manifest.{format}') + write_table(format, path, table) + assert list(read_table(format, path)) == table + + +def test_table_to_dicts(): + table = [ + ['id', 'dataset', 'resource'], + ['', 'datasets/gov/ivpk/adp', ''], + ] + assert list(iter_table_as_dicts(table)) == [ + {'id': '', 'dataset': 'datasets/gov/ivpk/adp', 'resource': ''}, + ] + + +def iter_table_as_dicts(table: Iterable[list[str]]): + table = iter(table) + header = next(table, None) + if header is None: + return + for row in table: + yield {k: v for k, v in zip(header, row)} + + +@pytest.mark.parametrize('table, name', [ + ([{'dataset': 'datasets/gov/ivpk/adp'}], 'datasets/gov/ivpk/adp'), + ([{'dataset': ''}], None), + ([{'id': ''}], None), +]) +def test_get_dataset_name_from_tabular(table, name): + assert get_dataset_name_from_table(table) == name + + +def get_dataset_name_from_table(table: Iterable[dict[str, str]]) -> str: + for row in table: + if 'dataset' not in row: + return + if row['dataset']: + return row['dataset'] + + +@pytest.mark.parametrize('dataset, org', [ + ('datasets/gov/ivpk/adp', 'ivpk'), + ('', None), + (None, None), +]) +def test_get_org_name_from_dataset_name(dataset, org): + assert get_org_name_from_dataset_name(dataset) == org + + +def get_org_name_from_dataset_name(dataset: str | None) -> str | None: + if dataset and dataset.startswith('datasets/gov/'): + parts = dataset.split('/') + return parts[2] + + +@pytest.mark.django_db +def test_export(tmp_path: Path): + structure = DatasetStructureFactory( + dataset__organization__title="Org 1", + dataset__title="Dataset 1", + file__file=FileField(filename='manifest.csv', data=( + b'id,dataset,resource\n' + b',datasets/gov/ivpk/adp,\n' + )), + ) + datasets = query_datasets_with_metadata() + (dataset, metadata), = datasets + + path = Path(structure.file.path) + format = get_format_from_path(path) + table = read_table(format, path) + table = iter_table_as_dicts(table) + dataset_name = get_dataset_name_from_table(table) + assert dataset_name == 'datasets/gov/ivpk/adp' + org_name = get_org_name_from_dataset_name(dataset_name) + assert org_name == 'ivpk' + + +def query_datasets_with_metadata(datasets: QuerySet[Dataset] = None): + if datasets is None: + datasets = Dataset.objects.all() + datasets = ( + datasets. + select_related( + 'organization', + 'current_structure', + ). 
+        prefetch_related(
+            'metadata',
+        )
+    )
+    for dataset in datasets:
+        try:
+            metadata = dataset.metadata.get()
+        except Metadata.DoesNotExist:
+            metadata = Metadata()
+        yield dataset, metadata
+
+
+def sha256(path: Path) -> str:
+    h = hashlib.sha256()
+    with open(path, 'rb') as f:
+        while chunk := f.read(h.block_size):
+            h.update(chunk)
+    return h.hexdigest()
+
+
+@pytest.mark.parametrize('path', [
+    'foo/bar/baz.csv',
+    'datasets/gov/ivpk/adp.csv',
+])
+def test_fix_manifest_path(tmp_path: Path, path: str):
+    path = tmp_path / path
+    path.parent.mkdir(0o755, parents=True)
+    write_csv_table(path, [
+        ['id', 'dataset', 'resource'],
+        ['', 'datasets/gov/ivpk/adp', ''],
+    ])
+    path = fix_manifest_path(tmp_path, path, 'datasets/gov/ivpk/adp')
+    assert path == tmp_path / 'datasets/gov/ivpk/adp.csv'
+    assert strip_base_from_paths(tmp_path, tmp_path.glob('**/*.csv')) == [
+        Path('datasets/gov/ivpk/adp.csv'),
+    ]
+
+
+def fix_manifest_path(base: Path, path: Path, dataset: str) -> Path:
+    correct_path = base / f'{dataset}.csv'
+    if path.resolve() != correct_path.resolve():
+        correct_path.parent.mkdir(0o755, parents=True, exist_ok=True)
+        shutil.move(path, correct_path)
+    return correct_path
+
+
+def strip_base_from_paths(base: Path, paths: Iterable[Path]) -> list[Path]:
+    return [path.relative_to(base) for path in paths]
+
+
+def test_mapping_tables():
+    orgmap = Mapping(['regno', 'title'])
+    orgmap.update(42, None)
+    orgmap.update(42, 'ORG')
+    orgmap.update(None, 'ORG')
+    assert orgmap.data == [
+        {'id': 42, 'name': 'ORG', 'regno': '', 'title': ''},
+    ]
+
+
+class Mapping:
+    names: list[str]
+    data: list[dict]
+    by_id: dict[str, int]
+    by_name: dict[str, int]
+
+    def __init__(self, names: list[str]):
+        self.names = names
+        self.data = []
+        self.by_id = {}
+        self.by_name = {}
+
+    def update(self, id, name, **kwargs) -> None:
+        key = str(id) if id else ''
+        name = name if name else ''
+        # Reuse the row already indexed by this id or name; otherwise start a
+        # new row with empty values for the extra columns.
+        if key and key in self.by_id:
+            ix = self.by_id[key]
+        elif name and name in self.by_name:
+            ix = self.by_name[name]
+        else:
+            self.data.append({
+                'id': '',
+                'name': '',
+                **{column: '' for column in self.names},
+            })
+            ix = len(self.data) - 1
+        item = self.data[ix]
+        if id:
+            item['id'] = id
+            self.by_id[key] = ix
+        if name:
+            item['name'] = name
+            self.by_name[name] = ix
+        for column, value in kwargs.items():
+            if value:
+                item[column] = value
+
diff --git a/vitrina/datasets/factories.py b/vitrina/datasets/factories.py
index db2b4d3b..1042bfc2 100644
--- a/vitrina/datasets/factories.py
+++ b/vitrina/datasets/factories.py
@@ -50,7 +50,7 @@ def _create(cls, model_class, *args, **kwargs):
     fake = faker.Faker()
     for lang in reversed(settings.LANGUAGES):
         dataset.set_current_language(lang[0])
-        dataset.title = fake.word()
+        dataset.title = dataset.title or fake.word()
         dataset.description = fake.catch_phrase()
         dataset.save()
     return dataset
diff --git a/vitrina/datasets/services.py b/vitrina/datasets/services.py
index 96f0721c..f1f1ae95 100644
--- a/vitrina/datasets/services.py
+++ b/vitrina/datasets/services.py
@@ -3,7 +3,6 @@
 from django.contrib.contenttypes.models import ContentType
 from django.core.exceptions import ObjectDoesNotExist
 from django.core.handlers.wsgi import HttpRequest
-
 from django.db.models import Q
 from vitrina.helpers import get_filter_url
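
Note (not part of the patch): a minimal sketch of how the table helpers introduced in tests/scripts/test_manifest_export.py are meant to compose, assuming the tests package stays importable and using a made-up manifest path; the dedicated scripts/manifest exporter and importer modules referenced in the commented-out imports are still to come.

# Usage sketch only, not part of the diff above. Assumes the helpers remain in
# tests/scripts/test_manifest_export.py and that the tests package is importable;
# the manifest path below is hypothetical.
from pathlib import Path

from tests.scripts.test_manifest_export import (
    get_dataset_name_from_table,
    get_format_from_path,
    get_org_name_from_dataset_name,
    iter_table_as_dicts,
    read_table,
)

path = Path('manifest-data/datasets/gov/ivpk/adp.csv')
table = read_table(get_format_from_path(path), path)    # rows as lists of strings
rows = iter_table_as_dicts(table)                        # dicts keyed by the header row
dataset_name = get_dataset_name_from_table(rows)         # e.g. 'datasets/gov/ivpk/adp'
org_name = get_org_name_from_dataset_name(dataset_name)  # e.g. 'ivpk'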