diff --git a/pyproject.toml b/pyproject.toml index 732f273..caa1004 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -20,5 +20,8 @@ requires-python = ">= 3.10" description = "ETL module for transforming clinical CSV data into properly-formatted packets for ingest into Katsu" readme = "README.md" +[project.scripts] +CSVConvert = "clinical_etl.CSVConvert:main" + [project.urls] Repository = "https://github.com/CanDIG/clinical_ETL_code" \ No newline at end of file diff --git a/src/clinical_ETL.egg-info/PKG-INFO b/src/clinical_ETL.egg-info/PKG-INFO index a351528..4b31839 100644 --- a/src/clinical_ETL.egg-info/PKG-INFO +++ b/src/clinical_ETL.egg-info/PKG-INFO @@ -1,5 +1,5 @@ Metadata-Version: 2.1 -Name: clinical-ETL +Name: clinical_ETL Version: 2.0.0 Summary: ETL module for transforming clinical CSV data into properly-formatted packets for ingest into Katsu Project-URL: Repository, https://github.com/CanDIG/clinical_ETL_code @@ -86,13 +86,15 @@ For each dataset (cohort) that you want to convert, create a directory outside o #### Manifest file The `manifest.yml` file contains settings for the cohort mapping. There is a sample file in [`sample_inputs/manifest.yml`](sample_inputs/manifest.yml) with documentation and example inputs. The fields are: -| field | description | -|-------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| description | A brief description of what mapping task this manifest is being used for | -| mapping | the mapping template csv file that lists the mappings for each field based on `moh_template.csv`, assumed to be in the same directory as the `manifest.yml` file | -| identifier | the unique identifier for the donor or root node | -| schema | a URL to the openapi schema file | -| functions | A list of one or more filenames containing additional mapping functions, can be omitted if not needed. 
Assumed to be in the same directory as the `manifest.yml` file | +| field | description | +|---------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| description | A brief description of what mapping task this manifest is being used for | +| mapping | the mapping template csv file that lists the mappings for each field based on `moh_template.csv`, assumed to be in the same directory as the `manifest.yml` file | +| identifier | the unique identifier for the donor or root node | +| schema | a URL to the openapi schema file | +| schema_class | The name of the class in the schema used as the model for creating the map.json. Currently supported: `MoHSchema` - for clinical MoH data and `GenomicSchema` for creating a genomic ingest linking file. | +| reference_date | a reference date used to calculate date intervals, formatted as a mapping entry for the mapping template | +| functions | A list of one or more filenames containing additional mapping functions, can be omitted if not needed. Assumed to be in the same directory as the `manifest.yml` file | #### Mapping template @@ -137,25 +139,27 @@ CSVConvert requires two inputs: ``` $ python src/clinical_etl/CSVConvert.py -h -usage: CSVConvert.py [-h] [--input INPUT] [--manifest manifest_file] [--test] [--verbose] +usage: CSVConvert.py [-h] --input INPUT --manifest MANIFEST [--test] [--verbose] [--index] [--minify] options: -h, --help show this help message and exit --input INPUT Path to either an xlsx file or a directory of csv files for ingest - --manifest MANIFEST Path to a manifest file describing the mapping. + --manifest MANIFEST Path to a manifest file describing the mapping. 
See README for more information --test Use exact template specified in manifest: do not remove extra lines --verbose, --v Print extra information, useful for debugging and understanding how the code runs. - ---test allows you to add extra lines to your manifest's template file that will be populated in the mapped schema. NOTE: this mapped schema will likely not be a valid mohpacket: it should be used only for debugging. + --index, --i Output 'indexed' file, useful for debugging and seeing relationships. + --minify Remove white space and line breaks from json outputs to reduce file size. Less readable for humans. ``` +* `--test` allows you to add extra lines to your manifest's template file that will be populated in the mapped schema. NOTE: this mapped schema will likely not be a valid mohpacket: it should be used only for debugging. + Example usage: ``` python src/clinical_etl/CSVConvert.py --input test_data/raw_data --manifest test_data/manifest.yml ``` -The output packets `_map.json` and `_indexed.json` will be in the parent of the `INPUT` directory / file. In the example above, this would be in the `test_data` directory. +The main output `_map.json` and optional output `_indexed.json` will be in the parent of the `INPUT` directory / file. In the example above, this would be in the `test_data` directory. Validation will automatically be run after the conversion is complete. Any validation errors or warnings will be reported both on the command line and as part of the `_map.json` file. @@ -211,7 +215,7 @@ A summarised example of the output is below: } ``` -`_indexed.json` contains information about how the ETL is looking up the mappings and can be useful for debugging. +`_indexed.json` contains information about how the ETL is looking up the mappings and can be useful for debugging. It is only generated if the `--index` argument is specified when CSVConvert is run. Note: This file can be very large if the input data is large. 
## Testing @@ -241,7 +245,7 @@ You can validate the generated json mapping file against the MoH data model. The ``` $ python src/clinical_etl/validate_coverage.py -h -validate_coverage.py [-h] [--input map.json] [--manifest MAPPING] +usage: validate_coverage.py [-h] --json JSON [--verbose] options: -h, --help show this help message and exit diff --git a/src/clinical_ETL.egg-info/SOURCES.txt b/src/clinical_ETL.egg-info/SOURCES.txt index 5918bcd..fcd2d7a 100644 --- a/src/clinical_ETL.egg-info/SOURCES.txt +++ b/src/clinical_ETL.egg-info/SOURCES.txt @@ -4,6 +4,7 @@ pyproject.toml src/clinical_ETL.egg-info/PKG-INFO src/clinical_ETL.egg-info/SOURCES.txt src/clinical_ETL.egg-info/dependency_links.txt +src/clinical_ETL.egg-info/entry_points.txt src/clinical_ETL.egg-info/requires.txt src/clinical_ETL.egg-info/top_level.txt src/clinical_etl/CSVConvert.py diff --git a/src/clinical_ETL.egg-info/entry_points.txt b/src/clinical_ETL.egg-info/entry_points.txt new file mode 100644 index 0000000..840cedf --- /dev/null +++ b/src/clinical_ETL.egg-info/entry_points.txt @@ -0,0 +1,2 @@ +[console_scripts] +CSVConvert = clinical_etl.CSVConvert:main diff --git a/src/clinical_etl/CSVConvert.py b/src/clinical_etl/CSVConvert.py index 19bdd68..7e5ee3e 100644 --- a/src/clinical_etl/CSVConvert.py +++ b/src/clinical_etl/CSVConvert.py @@ -743,8 +743,12 @@ def csv_convert(input_path, manifest_file, minify=False, index_output=False, ver return packets -if __name__ == '__main__': +def main(): args = parse_args() input_path = args.input manifest_file = args.manifest - csv_convert(input_path, manifest_file, minify=args.minify, index_output=args.index, verbose=args.verbose) + csv_convert(input_path, manifest_file, minify=args.minify, index_output=args.index, verbose=args.verbose) + + +if __name__ == '__main__': + main()