diff --git a/.Rhistory b/.Rhistory deleted file mode 100644 index bd29281e..00000000 --- a/.Rhistory +++ /dev/null @@ -1,8 +0,0 @@ -# set up environment -options(scipen = 999) -library(tidyr) -library(dplyr) -library(writexl) -#### Original Data Treatment #### -# import the original data -CPHD_QC <- read.csv(file = "centreau_qc_data-V1/CovidPublicHealthData.csv") diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml new file mode 100644 index 00000000..12053a68 --- /dev/null +++ b/.github/workflows/lint.yml @@ -0,0 +1,15 @@ +name: lint + +on: [pull_request] + +jobs: + lint: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - uses: actions/setup-python@v4 + with: + python-version-file: '.python-version-ci' + - uses: py-actions/flake8@v2 + with: + path: 'src tests' diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml new file mode 100644 index 00000000..db2bccc6 --- /dev/null +++ b/.github/workflows/test.yml @@ -0,0 +1,27 @@ +name: test + +on: [pull_request] + +jobs: + test: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + + - uses: actions/setup-python@v4 + with: + python-version-file: '.python-version-ci' + + - name: cache dependencies + uses: actions/cache@v4 + id: cache-pip + with: + path: ~/.cache/pip + key: ${{ runner.os }}-pip-${{ hashFiles('requirements.txt') }} + + - name: install dependencies + run: pip install -r requirements.txt + if: steps.cache-pip.outputs.cache-hit != 'true' + + - run: pip install . + - run: ./run-all-tests.sh diff --git a/.github/workflows/type-check.yml b/.github/workflows/type-check.yml new file mode 100644 index 00000000..fb5eed96 --- /dev/null +++ b/.github/workflows/type-check.yml @@ -0,0 +1,16 @@ +name: type-check + +on: [pull_request] + +jobs: + type-check: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - uses: actions/setup-python@v4 + with: + python-version-file: '.python-version-ci' + cache: 'pip' + - run: pip install -r requirements.txt + - run: pip install -r requirements-dev.txt + - run: mypy diff --git a/.gitignore b/.gitignore index c3ffc9b9..b0ac1eb5 100644 --- a/.gitignore +++ b/.gitignore @@ -1,20 +1,16 @@ -# R project files to ignore - -.Rproj.user/ -.Rhistory -.RData - -# Temporary or generated files to ignore - -.Rapp.history -_.rdb -_.rdx - # Operating system generated files .DS_Store Thumbs.db -# R package cache (optional, see notes) +# python cache +__pycache__ + +# python pyenv +.python-version + +# python venv +.env -.renv/ +# generated by odm-share +debug.txt diff --git a/.python-version-ci b/.python-version-ci new file mode 100644 index 00000000..bd28b9c5 --- /dev/null +++ b/.python-version-ci @@ -0,0 +1 @@ +3.9 diff --git a/LICENSE b/LICENSE new file mode 100644 index 00000000..1333ed77 --- /dev/null +++ b/LICENSE @@ -0,0 +1 @@ +TODO diff --git a/README.md b/README.md index 720e250b..486c6485 100644 --- a/README.md +++ b/README.md @@ -1,858 +1,23 @@ -# Aim/Objective: +# PHES-ODM Sharing Library -The purpose of the ODM is to support wastewater-based surveillance and +The purpose of this library is to support wastewater-based surveillance and epidemiology by facilitating the collection, standardization, and transparency of data by providing a more harmonized ODM data format to share data between different data curators and data repositories. -The ODM supports data sharing in two ways: +## Features -1. **Data sharing schema** - The ODM will have a schema that describes what data can be shared with one or more partners or users. -2. 
**Data filter based on the data sharing schema** - The ODM will support an open-source method for filtering data tables using the data sharing schema. This will be accomplished in large part by generating one, or a series of, SQL queries that can be used to pull selected data from a variety of different database structures.

The data sharing schema will be a csv file (sharing.csv) where each row in the `sharing` file corresponds to one or more headers and/or tables in the PHES-ODM, or one or more organizations with whom to share the output data. See below for an example.

| ruleId | table | mode | key | operator | value | notes |
|--------|----------|--------|------------|----------|---------------------------------------------|------------------------|
| 1 | measures | select | NA | NA | measureRepID,measure,value,unit,aggregation | basic measures details |
| 2 | measures | filter | measure | = | mPox | |
| 3 | measures | filter | reportable | = | TRUE | NA |
| 4 | NA | share | OPH | NA | 1; 2 | link to DSA |
| 5 | NA | share | PHAC | NA | 1; 2; 3 | changed for new DSA |

The `sharing` file should be accompanied by a metadata file, `sharing_metadata.csv`. This csv file provides additional information about the sharing schema, as shown in the example below:

| name | datasetID | organizationID | numberRules | orgsServed | contactID | version | firstReleased | lastUpdated | changes | notes |
|---------------|---------------|----------------|-------------|------------|-------------|---------|---------------|-------------|----------------------------------------|--------------------------------------------|
| quebecSharing | universityLab | university-1 | 5 | PHAC;OPH | lastnamePer | 1.0.0 | 2024-03-26 | 2024-03-26 | Deprecated outdated rules for LPH, OPH | in line with DSA number 234, 565, and 901. |

The data filter is a Python module (or function) that generates a SQL query to pull the shareable data based on the inclusion criteria in the data sharing schema. The function queries ODM-formatted data tables and takes a sharing schema as an input. The function includes (filters) data according to the schema rules, and then returns a data table with only the data that is to be shared. This new, returned data is ready to be shared and used with a partner.
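To make that behaviour concrete, here is a minimal pandas sketch of what rules 1, 2, and 4 from the example schema above amount to. This is an illustration of the concept only (the planned implementation generates SQL rather than using pandas), and the file names are hypothetical:

```python
import pandas as pd

# hypothetical input file holding the ODM "measures" table
measures = pd.read_csv("measures.csv")

# rule 1 (select): keep only the listed columns
selected = ["measureRepID", "measure", "value", "unit", "aggregation"]

# rule 2 (filter): keep only mPox rows
mask = measures["measure"] == "mPox"

# rule 4 (share): apply rules 1 and 2 to produce the output for OPH
shareable = measures.loc[mask, selected]
shareable.to_csv("measures-for-OPH.csv", index=False)
```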
# Features

High level features include:

- The data custodian should be able to define all the sharing rules in a CSV file (`sharing.csv`). A standard schema for defining the rules will be developed.
- The schema should allow a data custodian to define the partner (organization or person - matching to an `organizationID` and/or `contactID` within the model) that each rule pertains to. For example, a certain rule or set of rules may be applicable only to the Public Health Agency of Canada (PHAC) while another rule may be applicable to not only the PHAC but also to Ottawa Public Health.
- The schema should allow data custodians to define rules that apply to rows or to columns. For example, a rule can be made to share all the rows from the `samples` table, and/or to only include the `collType` column from the `samples` table.
- The schema is built using the logic and arguments of the `filter()` and `select()` functions of the dplyr package in R. When specifying details of the filter function (mode), use of the `=`, `>`, `<`, `>=`, and `<=` operators is supported, along with `in` for ranges of continuous data and `NA` where the operator is not relevant, as for the select function (mode).
- Rules can be combined to form more powerful conditions by building across rows of the `sharing` csv. For example, include all rows with `email` equal to "john.doe@email.com", `firstName` equal to "John", and `lastName` equal to "Doe". This is accomplished by grouping rules together using the mode `group` with the operators `AND` or `OR`, creating customized combinations of conditions.
- Rules can be made within the context of an entire table, to a column that may be present in more than one table, or to a column specific to a table. Rules can also be made at the level of all measures or datasets with a given license type.
- The rules may only be inclusive. For example, rules can be defined to include rows but not to exclude them.
- The data custodian will receive a report at the end which provides details about how many rows were filtered for inclusion in the shareable data, as well as the tables and headers selected for inclusion.
- Version 2 of the PHES-ODM allows data generators and custodians to define data licenses. In some jurisdictions, this may be defined in detailed data-sharing agreements (DSA). The DSAs can be short, simply referencing a license type, or they can be many pages identifying specifically who can use the data, for what purpose, and what the data destruction protocols will be, etc. The notes column in the `sharing.csv` is a free-text field, providing an opportunity to reference a longer document or provide more details. Most licenses currently supported by the ODM license field are open.
- The implementation should take into account the relationships between the different tables as defined in the ODM. For example, removing a row with `siteID = ottawa-1` from the sites table should also remove all rows in the samples table with `siteID = ottawa-1`. All nested relationships should also be taken care of. The relationships between the tables can be seen [here](https://lucid.app/lucidchart/847978df-d627-4b8a-a379-faca7a517ef4/edit?invitationId=inv_0de7777b-888b-4d8a-827d-2306bdc48cce&page=4OvE58YH3w..#).
- A Python function that implements these rules should be built.

# Sharing CSV

## Introduction

The sharing CSV file provides different data generators or custodians with a standardized and code-agnostic method to define rules for sharing data with different organizations. Each row in the CSV file defines one rule; combined, the rows define all the sharing rules for a given data generator or custodian. The headers of the CSV file define the different parts of each rule. The following sections outline these different parts, and provide a guide to defining a rule.

### 1. Setting the ruleID

Because each sharing schema is a closed system for a given data generator or data custodian, the ruleIDs only need to be unique within a given schema (`sharing.csv`).
Using sequential integers for ruleIDs works well, and is the recommended approach.

### 2. Rules and Modes

After defining the unique ID for a rule, the next step is to determine the `mode` of the rule. There are four possible values for the `mode` column:

1. `select`: This indicates that the effect of this rule will be to select the tables and columns for inclusion in the output shareable data. It also means that the `key` and `operator` columns do not need to be defined for this rule.
2. `filter`: This is used for rules that will filter the shareable data output rows based on row values. The full rule will require the `key` and `operator` columns to be fully specified.
3. `group`: This defines a rule that groups or combines other rules together for execution. The full rule will require the `operator` column to be fully specified.
4. `share`: This defines a rule that specifies the `organizationID` or `contactID` with which an output will be shared, as well as the rules to apply to generate the specific output data. The full rule will require the `key` column, but not the `operator` column, to be fully specified.

Generally, the bulk of a sharing csv will be composed of `filter` and `select` rules, with a few `group` rules, and the final `share` rules at the very end. Rules should also be written and specified in this same order.
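Putting the modes together, the example schema from the introduction would look roughly like this as a raw `sharing.csv` (a sketch; note that rule 1's multi-column `value` has to be quoted because it contains commas):

```
ruleID,table,mode,key,operator,value,notes
1,measures,select,NA,NA,"measureRepID,measure,value,unit,aggregation",basic measures details
2,measures,filter,measure,=,mPox,
3,measures,filter,reportable,=,TRUE,NA
4,NA,share,OPH,NA,1; 2,link to DSA
5,NA,share,PHAC,NA,1; 2; 3,changed for new DSA
```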
### 3. Selecting an Entity

In order to generate an intelligible output dataset, several `select` and `filter` rules will need to first be stacked and applied. This step involves selecting the parts of the PHES-ODM or entities within the model. The entities that can be selected are:

- Data contained in a table
- Data contained in column(s) of table(s)
- Data contained in row(s) of table(s)

This step uses four columns: `table`, `mode`, `key`, and/or `value`. The `table` column specifies the name(s) of the table(s) to which this rule applies. To list multiple tables in the `table` column, list each table separated by a ";". The `mode` column specifies the action of a rule. For `mode = filter` rules, the `key` column lists the name(s) of the column(s) that the filtering rule acts on. For `mode = select` rules, the names of the selected columns are specified in the `value` column. For rules that select entities, the `filter` and `select` modes will be used.

#### 3.1. Selecting Columns

In order to have any data to share, tables and columns need to be specified for inclusion. These are the first rules to define in your schema. To specify which columns should be shared, specify the table or tables in the `table` column, list `select` in the `mode` column, and then list the column or columns to be shared in the `value` column. When specifying the columns, you can separate distinct column names with a ";". The `key` and `operator` columns should be left blank (or `NA`) as they are not used in these rules, and any values in these columns for `select`-mode rows will be ignored.

To select all tables or all columns, an `all` value can be used in the `table` and/or `value` columns of the sharing csv.

Some examples are given below:

1. Selecting only the `saMaterial` column in the `samples` table

   | ruleId | table | mode | key | operator | value | notes |
   |--------|---------|--------|-----|----------|------------|-----------|
   | 1 | samples | select | NA | NA | saMaterial | NA |

2. Selecting only the `reportable` and the `pooled` columns in the `measures` table

   | ruleId | table | mode | key | operator | value | notes |
   |--------|----------|--------|-----|----------|-------------------|-----------|
   | 2 | measures | select | NA | NA | reportable;pooled | NA |

3. Selecting all the columns in the `measures` table

   | ruleId | table | mode | key | operator | value | notes |
   |--------|----------|--------|-----|----------|-------|-----------|
   | 3 | measures | select | NA | NA | all | NA |

4. Selecting only the `purposeID` column in the `measures` and the `samples` tables

   | ruleId | table | mode | key | operator | value | notes |
   |--------|------------------|--------|-----|----------|-----------|-----------|
   | 4 | measures;samples | select | NA | NA | purposeID | NA |

5. Selecting the `siteID` column in all tables

   | ruleId | table | mode | key | operator | value | notes |
   |--------|-------|--------|-----|----------|--------|-----------|
   | 5 | all | select | NA | NA | siteID | NA |

Notes:

- In examples 2 and 4, where multiple columns and tables were selected respectively, a `;` was used to separate the values. Throughout this entire document, when multiple values need to be listed in a single cell, the `;` symbol should be used to separate discrete values.

- In examples 3 and 5, where all the columns in a table and all the tables were selected respectively, the keyword `all` was used. Similar to the `;` symbol, the keyword `all` may be used in a cell to mean everything.

- The **ruleId** column is mandatory for all rules, and each value must be unique across the entire sheet (`sharing.csv`). It can be a number or a string, though sequential integers are recommended.

#### 3.2. Filtering Rows

Once the columns and tables for inclusion have been specified, users can specify which rows should be shared using rules with the `filter` mode. Note that rules that filter can use values in any columns, including columns that are not being shared in the final output. To specify a `filter` rule, users need to specify the table or tables in the `table` column, and define the `mode` as `filter`. Then users can specify the columns which the filter will act on in the `key` column, specify the nature of the filter using the `operator` column, and the filter values in the `value` column. The general structure for the filter argument is:

```
**column name** **operator** **value**
```

Where the "column name" is the name of a column (specified in the `key` column) from the table(s) specified in the `table` column, and the "value" is the value or range of values that determine whether a row is selected for sharing, stored in the `value` column. The "operator" is a placeholder for the symbol that indicates the nature of the filter to be applied, and the desired relationship between the `key` and the `value`. The currently accepted values for the `operator` column are:

- **=**: Denotes exact equivalence. This should be used for categorical or character variables.
- **\>**: Denotes "greater-than". This can be used for numeric, integer, or date-type variables. Note that it is exclusive of the value used in the expression.
- **\<**: Denotes "lesser-than". This can be used for numeric, integer, or date-type variables. Note that it is exclusive of the value used in the expression.
- **\>=**: Denotes "greater-than-or-equal-to". This can be used for numeric, integer, or date-type variables. Note that it is inclusive of the value used in the expression.
- **\<=**: Denotes "lesser-than-or-equal-to". This can be used for numeric, integer, or date-type variables. Note that it is inclusive of the value used in the expression.
- **in**: Denotes that a value is contained in a range of continuous data. This can be used for numeric, integer, or date-type variables. Note that it is inclusive of the values used in the expression.

Technically the `operator` column also accepts `AND` and `OR` as values, but only for rules of the `group` mode.

Some examples of how these rules can be constructed and applied in practice are given below:

1. Selecting only the rows where the value of `siteID` is exactly equal to "ottawa-1" in the `samples` table.

   | ruleId | table | mode | key | operator | value | notes |
   |--------|---------|--------|--------|----------|----------|-----------|
   | 6 | samples | filter | siteID | = | ottawa-1 | |

2. Selecting only the rows where the value of "Collection period" (`collPer`) is greater than or equal to 5 in the `samples` table.

   | ruleId | table | mode | key | operator | value | notes |
   |--------|---------|--------|---------|----------|-------|-------|
   | 7 | samples | filter | collPer | >= | 5 | |

3. Selecting only the rows where the value of "Collection period" (`collPer`) is less than 5 in the `samples` table.

   | ruleId | table | mode | key | operator | value | notes |
   |--------|---------|--------|---------|----------|-------|-------|
   | 8 | samples | filter | collPer | < | 5 | |

4. Selecting only the rows where the value of "Analysis date end" (`aDateEnd`) is exactly equal to February 1st, 2022 (2022-02-01) from the `measures` table.

   | ruleId | table | mode | key | operator | value | notes |
   |--------|----------|--------|----------|----------|------------|-----------|
   | 9 | measures | filter | aDateEnd | = | 2022-02-01 | |

5. Selecting only the rows where the value of "Analysis date end" (`aDateEnd`) is a date in February from the `measures` table.

   | ruleId | table | mode | key | operator | value | notes |
   | ------ | -------- | ------ | -------- | -------- | --------------------- | ----- |
   | 10 | measures | filter | aDateEnd | in | 2022-02-01:2022-02-28 | |
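As a sketch of the semantics only (not the library's actual code generation), each filter rule translates naturally into a SQL predicate. The helper below is illustrative; real code should use bound parameters rather than string interpolation:

```python
# illustrative translation of a filter rule into a SQL predicate
def to_sql_predicate(key: str, op: str, value: str) -> str:
    if op == 'in':  # inclusive range written as "start:end"
        lo, hi = value.split(':')
        return f"{key} BETWEEN '{lo}' AND '{hi}'"
    assert op in ('=', '<', '>', '<=', '>=')
    return f"{key} {op} '{value}'"

print(to_sql_predicate('siteID', '=', 'ottawa-1'))
# siteID = 'ottawa-1'
print(to_sql_predicate('aDateEnd', 'in', '2022-02-01:2022-02-28'))
# aDateEnd BETWEEN '2022-02-01' AND '2022-02-28'
```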
### 4. Grouping Rules

By default, all `filter` and `select` rules that are applied together are combined with an implicit `AND`. That is to say, data to be shared must meet all the criteria together. To stack particular rules to be applied together, or to combine rules with an `OR`, users can rely on the `group` mode. To create a `group` rule, the mode column needs to be set to `group`, and the rule IDs of the rules to be grouped should be listed in the `value` column, separated by a ";". To specify how the rules are being grouped, the operator needs to be specified as `AND` or `OR`. Group-type rules can also be grouped together, creating nested group rules.

Some examples are given below:

1. Selecting only the rows where the value of "Analysis date end" (`aDateEnd`) is exactly equal to February 1st, 2022 (2022-02-01) or February 1st, 2023 (2023-02-01) from the `measures` table.

   | ruleId | table | mode | key | operator | value | notes |
   |--------|----------|--------|----------|----------|------------|-----------|
   | 11 | measures | select | NA | NA | all | This rule selects all the columns from the measures table for inclusion |
   | 12 | measures | filter | aDateEnd | = | 2022-02-01 | This rule takes all rows where analysis date end is February 1st, 2022 |
   | 13 | measures | filter | aDateEnd | = | 2023-02-01 | This rule takes all rows where analysis date end is February 1st, 2023 |
   | 14 | NA | group | NA | OR | 12;13 | This rule groups rules 12 and 13 together with "OR", such that if either rule is true, the data is selected |

2. Selecting only the rows where the value of `siteID` is exactly equal to "ottawa-1" or "laval-1" in the `samples` table.

   | ruleId | table | mode | key | operator | value | notes |
   |--------|---------|--------|--------|----------|----------|-----------|
   | 15 | samples | select | NA | NA | all | This rule selects all the columns from the samples table for inclusion |
   | 16 | samples | filter | siteID | = | ottawa-1 | This rule takes all rows with a siteID of ottawa-1 |
   | 17 | samples | filter | siteID | = | laval-1 | This rule takes all rows with a siteID of laval-1 |
   | 18 | NA | group | NA | OR | 16;17 | This rule groups rules 16 and 17 together with "OR", such that if either rule is true, the data is selected |

3. Selecting only the rows where the value of `siteID` is "ottawa-1" and the collection datetime (`collDT`) was February 1st, 2023 (2023-02-01) from the `samples` table.

   | ruleId | table | mode | key | operator | value | notes |
   |--------|---------|--------|--------|----------|------------|-----------|
   | 19 | samples | select | NA | NA | all | This rule selects all the columns from the samples table for inclusion |
   | 20 | samples | filter | siteID | = | ottawa-1 | This rule takes all rows with a siteID of ottawa-1 |
   | 21 | samples | filter | collDT | = | 2023-02-01 | This rule takes all rows with a collection date of February 1st, 2023 |
   | 22 | NA | group | NA | AND | 20;21 | This rule groups rules 20 and 21 together with "AND", such that only rows that meet both conditions are selected |

4. Selecting only the rows from the `measures` table that correspond to MPox measures between January 1st, 2021 and December 31st, 2021, or SARS-CoV-2 measures after January 1st, 2020.

   | ruleId | table | mode | key | operator | value | notes |
   |--------|----------|--------|------------|----------|------------|-----------|
   | 23 | measures | select | NA | NA | measure; value; unit; aggregation | This rule selects the measure, value, unit, and aggregation columns from the measures table for inclusion |
   | 24 | measures | filter | measure | = | mPox | This rule takes all rows with an MPox measure in the measures table |
   | 25 | measures | filter | reportDate | in | 2021-01-01:2021-12-31 | This rule takes all rows with a report date between Jan. 1 and Dec. 31, 2021 in the measures table |
   | 26 | NA | group | NA | AND | 24;25 | This rule groups rules 24 and 25 together with "AND", such that only rows that meet both conditions are selected |
   | 27 | measures | filter | measure | = | cov | This rule takes all rows with a SARS-CoV-2 measure in the measures table |
   | 28 | measures | filter | reportDate | >= | 2020-01-01 | This rule takes all rows with a report date after Jan. 1, 2020 in the measures table |
   | 29 | NA | group | NA | AND | 27;28 | This rule groups rules 27 and 28 together with "AND", such that only rows that meet both conditions are selected |
   | 30 | NA | group | NA | OR | 26;29 | This rule groups rules 26 and 29 together with "OR", such that if either grouping of rules is true, the data is selected |
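In pandas terms, grouping simply combines boolean row masks. The sketch below mirrors the grouping in example 4 above, assuming a `measures.csv` with the columns used there (string comparison works for ISO-formatted dates because they sort lexicographically):

```python
import pandas as pd

measures = pd.read_csv('measures.csv')

# rules 24/25: mPox measures reported during 2021 (inclusive range)
r24 = measures['measure'] == 'mPox'
r25 = measures['reportDate'].between('2021-01-01', '2021-12-31')
r26 = r24 & r25   # group with AND
# rules 27/28: SARS-CoV-2 measures reported since 2020
r27 = measures['measure'] == 'cov'
r28 = measures['reportDate'] >= '2020-01-01'
r29 = r27 & r28   # group with AND
r30 = r26 | r29   # nested group with OR

result = measures[r30]
```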
### 5. Selecting an Organization for Sharing

Once the rules in the sharing csv are defined, the next step is deciding to which organization(s) or person/people a rule applies. This is done using an additional rule row with the `mode` column's value set to `share`. A unique identifier for each organization or person should be used and reused throughout the entire document, and is used in the `key` column for sharing rules. This unique identifier should ideally correspond to an organization ID (`organizationID`) in the `organizations` table, or a contact ID (`contactID`) in the `contacts` table of the ODM. To apply a single rule across multiple organizations, the different organizations that a rule pertains to can be listed together in the `key` column. The listed organizations should be separated by a ";". For example, if a rule applies to the **Public Health Agency of Canada** (`organizationID = PHAC`) as well as **Ottawa Public Health** (`organizationID = OPH`), the value of the `key` cell in the row for that rule would be `PHAC;OPH`. The example assumes that PHAC and OPH are the agreed-upon identifiers to represent these organizations. The rules to apply for the shared data output should be listed in the `value` column, with the various rule IDs separated by a ";". To specify different rules for different organizations/people, users will need to generate additional `share`-mode rules.

Some examples of how these rules can be constructed and applied in practice are given below:

1. Selecting all columns of the `measures` table, but only the rows where the value of "Analysis date end" (`aDateEnd`) is exactly equal to February 1st, 2022 (2022-02-01) or February 1st, 2023 (2023-02-01), and everything from the `samples` table, with the Public Health Agency of Canada (`organizationID = PHAC`) and Ottawa Public Health (`organizationID = OPH`). Using those same rules for Laval Public Health (`organizationID = LPH`), except only including the rows of the `samples` table where the value of `siteID` is exactly equal to "ottawa-1" or "laval-1".
   | ruleId | table | mode | key | operator | value | notes |
   |--------|----------|--------|----------|----------|------------|-----------|
   | 11 | measures | select | NA | NA | all | This rule selects all the columns from the measures table for inclusion |
   | 12 | measures | filter | aDateEnd | = | 2022-02-01 | This rule takes all rows where analysis date end is February 1st, 2022 |
   | 13 | measures | filter | aDateEnd | = | 2023-02-01 | This rule takes all rows where analysis date end is February 1st, 2023 |
   | 14 | NA | group | NA | OR | 12;13 | This rule groups rules 12 and 13 together with "OR", such that if either rule is true, the data is selected |
   | 15 | samples | select | NA | NA | all | This rule selects all the columns from the samples table for inclusion |
   | 16 | samples | filter | siteID | = | ottawa-1 | This rule takes all rows with a siteID of ottawa-1 |
   | 17 | samples | filter | siteID | = | laval-1 | This rule takes all rows with a siteID of laval-1 |
   | 18 | NA | group | NA | OR | 16;17 | This rule groups rules 16 and 17 together with "OR", such that if either rule is true, the data is selected |
   | 31 | NA | share | OPH;PHAC | NA | 11;14;15 | Share all measures from Feb. 1, 2022 and 2023, and all samples information |
   | 32 | NA | share | LPH | NA | 11;14;15;18 | Share all measures from Feb. 1, 2022 and 2023, and all samples from Ottawa and Laval |

2. Share MPox data from 2021 with Ottawa Public Health (`organizationID = OPH`), share all SARS-CoV-2 data since 2020 with Laval Public Health (`organizationID = LPH`), and share MPox data from 2021 and all SARS-CoV-2 data since 2020 with the Public Health Agency of Canada (`organizationID = PHAC`).

   | ruleId | table | mode | key | operator | value | notes |
   |--------|----------|--------|------------|----------|------------|-----------|
   | 23 | measures | select | NA | NA | measure; value; unit; aggregation | This rule selects the measure, value, unit, and aggregation columns from the measures table for inclusion |
   | 24 | measures | filter | measure | = | mPox | This rule takes all rows with an MPox measure in the measures table |
   | 25 | measures | filter | reportDate | in | 2021-01-01:2021-12-31 | This rule takes all rows with a report date between Jan. 1 and Dec. 31, 2021 in the measures table |
   | 26 | NA | group | NA | AND | 24;25 | This rule groups rules 24 and 25 together with "AND", such that only rows that meet both conditions are selected |
   | 27 | measures | filter | measure | = | cov | This rule takes all rows with a SARS-CoV-2 measure in the measures table |
   | 28 | measures | filter | reportDate | >= | 2020-01-01 | This rule takes all rows with a report date after Jan. 1, 2020 in the measures table |
   | 29 | NA | group | NA | AND | 27;28 | This rule groups rules 27 and 28 together with "AND", such that only rows that meet both conditions are selected |
   | 30 | NA | group | NA | OR | 26;29 | This rule groups rules 26 and 29 together with "OR", such that if either grouping of rules is true, the data is selected |
   | 33 | NA | share | OPH | NA | 23;26 | Share MPox data from 2021 with Ottawa Public Health |
   | 34 | NA | share | LPH | NA | 23;29 | Share all SARS-CoV-2 data since 2020 with Laval Public Health |
   | 35 | NA | share | PHAC | NA | 23;30 | Share MPox data from 2021 and all SARS-CoV-2 data since 2020 with PHAC |

## Example Scenarios

In this section we will work with some data, providing an example scenario for a rule and showing what the rule looks like in practice.

### Filtering on license type

One special case for filtering is using the license type (`license` in the `datasets` table, or `measureLic` in the `measures` table). This is especially useful for data generators and custodians who work with a mix of open and private data. By filtering on only open data, or open data with a specific license, all of the data and metadata that are open can be shared, without needing to specify additional sharing filters. For example, to share all data in a given dataset:

| ruleId | table | mode | key | operator | value | notes |
|--------|-------|--------|------------|----------|-------|-----------------------------------------------------------------------------------------------------------|
| 1 | all | select | NA | NA | all | This rule selects all the columns and tables for inclusion |
| 2 | all | filter | license | = | open | This rule takes all rows where the license is open |
| 3 | all | filter | measureLic | = | open | This rule takes all rows where the measure license is open |
| 4 | NA | group | NA | OR | 2; 3 | This rule groups rules 2 and 3 together with "OR", such that if either rule is true, the data is selected |
| 5 | NA | share | PHAC | NA | 1; 4 | This rule specifies that the data should be filtered using rules 1 and 4, and shared with PHAC |

For an example that pulls only open measures:

| ruleId | table | mode | key | operator | value | notes |
|--------|----------|--------|------------|----------|-------|------------------------------------------------------------------------------------------------|
| 1 | measures | select | NA | NA | all | This rule selects all the columns from the measures table for inclusion |
| 2 | measures | filter | measureLic | = | open | This rule takes all rows in the measures table where the measure license is open |
| 3 | NA | share | PHAC | NA | 1; 2 | This rule specifies that the data should be filtered using rules 1 and 2, and shared with PHAC |

### General Example

The data we will be working with has two tables from the ODM, **samples** and **sites**. It does not include all the columns present in these tables.
The rows in the samples and sites tables, respectively, are shown below:

**samples**:

| sampleID | siteID | collDT | saMaterial | reportable | notes |
|-----------|----------|------------|------------|------------|--------|
| ottWa19-1 | ottawa-1 | 2021-08-19 | rawWW | TRUE | Note 1 |
| ottWa18-1 | ottawa-1 | 2021-08-18 | sweSed | TRUE | Note 2 |
| ottWa17-1 | laval-1 | 2021-08-17 | pstGrit | TRUE | Note 3 |
| ottWa10-1 | laval-1 | 2020-01-10 | water | FALSE | Note 4 |

**sites**:

| siteID | name | repOrg1 | sampleshed |
|----------|----------------------|---------|------------|
| ottawa-1 | University of Ottawa | OPH | school |
| laval-1 | University of Laval | LPH | school |

#### Basic Example

1. Share all columns in the `samples` table, but select only rows whose site ID is "ottawa-1", for Ottawa Public Health (OPH)

   | ruleId | table | mode | key | operator | value | notes |
   |--------|---------|--------|--------|----------|----------|-----------|
   | 1 | samples | select | | | all | |
   | 2 | samples | filter | siteID | = | ottawa-1 | |
   | 3 | NA | share | OPH | | 1;2 | |

2. Share all columns in the `samples` table, but select rows whose sample material (`saMaterial`) is `rawWW` or `sweSed`, for the Public Health Agency of Canada (PHAC)

   | ruleId | table | mode | key | operator | value | notes |
   |--------|---------|--------|------------|----------|--------------|-----------|
   | 4 | samples | select | | | all | |
   | 5 | samples | filter | saMaterial | = | rawWW;sweSed | |
   | 6 | NA | share | PHAC | | 4;5 | |

3. Share all rows, but select only the `notes` column from all tables, for Laval Public Health (LPH)

   | ruleId | table | mode | key | operator | value | notes |
   |--------|-------|--------|-----|----------|-------|-----------|
   | 7 | all | select | | | notes | |
   | 8 | NA | share | LPH | | 7 | |

4. Share all columns, but select only the rows for samples taken in the year 2021 that have been marked as 'reportable', for Ottawa Public Health (OPH) and the Public Health Agency of Canada (PHAC)

   | ruleId | table | mode | key | operator | value | notes |
   |--------|---------|--------|------------|----------|-----------------------|-------|
   | 9 | all | select | | | all | |
   | 10 | samples | filter | reportable | = | TRUE | |
   | 11 | samples | filter | collDT | in | 2021-01-01:2021-12-31 | |
   | 12 | NA | group | | AND | 10;11 | |
   | 13 | NA | share | OPH;PHAC | | 9;12 | |

5. Select all columns from the samples and sites tables, but only rows that belong to the University of Laval, for Laval Public Health (LPH)

   | ruleId | table | mode | key | operator | value | notes |
   |--------|-------|--------|--------|----------|---------|-----------|
   | 14 | all | select | | | all | |
   | 15 | all | filter | siteID | = | laval-1 | |
   | 16 | NA | share | LPH | | 14;15 | |

### A Note on Filter and Select, Groups

When specifying the columns to include in the shared data with the `select` mode, it is implied that all rows will be included **unless** a filter has also been specified separately. Conversely, specifying the rows you want to include with a `filter` rule **does not** specify that the column used for filtering should be included in the `filtered_data` output. `select` is the only way to specify columns for inclusion.
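In SQL terms, `select` rules determine the column list, while `filter` rules only contribute to the `WHERE` clause. As a sketch, the second ruleset below (sharing measure, value, and unit while filtering on an unshared `siteID`) amounts to:

```sql
-- filter on siteID without including it in the output
SELECT measure, value, unit
FROM measures
WHERE siteID = 'laval-1';
```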
As such, if you wanted to share all of the `samples` table data with Laval Public Health (LPH), it would suffice to define the rules as:

| ruleId | table | mode | key | operator | value | notes |
|--------|---------|--------|-----|----------|-------|-----------|
| 1 | samples | select | | | all | |
| 2 | NA | share | LPH | | 1 | |

Similarly, if you only wanted to share the measure, value, and unit columns for rows whose `siteID` belongs to the University of Laval, but did not want to share the siteID column, the rules would be:

| ruleId | table | mode | key | operator | value | notes |
|--------|----------|--------|--------|----------|--------------------|-------|
| 1 | measures | select | | | measure;value;unit | |
| 2 | measures | filter | siteID | = | laval-1 | |
| 3 | NA | share | LPH | | 1;2 | |

With group-type rules, the rules are combined with an `AND` or `OR` operator, and the rules to be combined are listed in the value field. Similarly, when specifying the sharing target, users also list the rules to apply for the output. The result is that sharing implies a final, implicit grouping action, run by the library as part of this activity as well.

| ruleId | table | mode | key | operator | value | notes |
| --- | --- | --- | --- | --- | --- |-----------|
| 1 | measures | select | NA | NA | measure;value;unit | |
| 4 | measures | filter | measure | = | mPox | |
| 5 | measures | filter | reportDate | in | 2021-01-01:2021-12-31 | |
| 6 | measures | filter | measure | = | cov | |
| 7 | measures | filter | reportDate | >= | 2020-01-01 | |
| 8 | NA | group | NA | AND | 4; 5 | |
| 9 | NA | group | NA | AND | 6; 7 | |
| 10 | NA | group | NA | OR | 8; 9 | |
| 11 | measures | filter | reportable | = | TRUE | |
| 12 | NA | share | PHAC | NA | 10;11 | |

Which implicitly generates -->

| ruleId | table | mode | key | operator | value | notes |
| --- | --- | --- | --- | --- | --- | ----------- |
| 13 | measures | group | NA | AND | 10;11 | |

Which then generates the SQL query for sharing with PHAC for this example -->

```
select measure, value, unit from measures where ((4 and 5) or (6 and 7)) and 11
```

## Sharing CSV Columns

This section summarizes all the columns that are a part of the file:

**ruleId**: Mandatory for all rules. Recommended to use sequential integers for naming, but can be a number or a string. If a string, then it's recommended to use [snake_case](https://en.wikipedia.org/wiki/Snake_case) - spaces in names are not supported. Each value should be unique across an entire sharing file (`sharing.csv`).

**table**: The name(s) of the tables for this rule. Allowable values are names (partIDs) of the tables separated by a `;`, or `all` to select all tables.

**mode**: The activity and modality of a rule. Allowable values are:

- `select`: used for rules that define which tables and columns are to be shared. Requires values in the `ruleID`, `table`, `mode`, and `value` columns of the sharing csv.
- `filter`: used for rules that define which rows of data are appropriate for sharing. Requires values in the `ruleID`, `table`, `mode`, `key`, `operator` and `value` columns of the sharing csv.
- `group`: used for grouping together rules that should be applied as combined conditions, using either `AND` or `OR` as the operator. Requires values in the `ruleID`, `mode`, `operator` and `value` columns of the sharing csv.
- `share`: used for rules defining the target for the sharing data output. Requires values in the `ruleID`, `mode`, `key` (to specify the organizationID(s) or contactID(s)) and `value` (to specify the rules to apply for the output) columns of the sharing csv.

**key**: The argument used to specify the header or headers used for a filtering rule, or the destination organization or person for a sharing rule. Multiple headers can be listed, and likewise multiple organizations/individuals can be separated by a `;`. Also supports the keyword `all`. The organizations here reference the organizations table (`organizationID`), or the contacts table (`contactID`), in the ODM data.

**operator**: The operator used to define the logic of filtering and grouping rules. For `filter`-mode rules, use of the `=`, `>`, `<`, `>=`, and `<=` operators is supported, along with `in` for ranges of continuous data. For `group`-mode rules, the acceptable values for this field are `AND` or `OR`.

**value**: Specifies the values for filtering rules, and the rules to be grouped for grouping rules. Discrete, listed values in this field should be separated by a ";".

**notes**: An optional, free-text description or notes explaining this rule, or other related information deemed worthy of sharing.

## Sharing Metadata CSV Columns

Metadata for the sharing csv is stored in a separate file, `sharing_metadata.csv`. This section summarizes all the columns that are a part of the file:

**name**: the name given to a sharing schema. This is less important for data custodians/generators who only use a single schema, but these are unique names for each `sharing.csv` for each group or dataset. For naming, it is recommended to use [snake_case](https://en.wikipedia.org/wiki/Snake_case) - spaces in names are not supported. Each value should be unique across an entire sharing metadata file (`sharing_metadata.csv`).

**datasetID**: The dataset(s) for which a given sharing schema applies. Multiple datasets can be separated by a `;`. The dataset(s) here reference the datasets table (`datasetID`) in the ODM data.

**version**: The version number of a given sharing schema. Version numbering should be updated with each change, ideally following [semantic versioning](https://semver.org) structure. Given a version number "x.y.z" ("1.0.0", for example), the positions mean MAJOR.MINOR.PATCH. MAJOR version updates are when rules are added or removed, MINOR version updates are when you are editing existing rules, and PATCH version updates are when you tweak the `status` or `valid_period` columns.

**organizationID**: The organization who created a given sharing schema. The organization here should reference the organizations table (`organizationID`) in the ODM data.

**contactID**: The contact information for the person who created a given sharing schema. The contact here references the contacts table (`contactID`) in the ODM data.

**numberRules**: The number of rules defined in the sharing csv schema.

**orgsServed**: A list of the organizations/people served by a sharing csv. This is a list of `organizationID` and/or `contactID` entries in the `key` field for sharing-type rules. The values should be separated with a ";".

**firstReleased**: A date to specify when the sharing schema was made.

**lastUpdated**: A date to specify when the sharing schema was last edited or updated.
**changes**: A free-text field to record changes made at the last update to the sharing schema.

**notes**: An optional, free-text description or notes explaining details about the sharing schema, or other related information deemed worthy of sharing.

An example of this table is found below. For this example, the university lab records data for two different municipalities, and has separate datasetIDs for data from the different municipalities. To make their workflow clearer, they've also opted to create separate sharing schemas for the separate datasets.

| name | datasetID | version | organizationID | contactID | firstReleased | lastUpdated | changes | notes |
|----------------|-----------------|---------|----------------|-------------|---------------|-------------|--------------------------------------|-------|
| ottawaSharingA | cityAReportData | 1.1.0 | university-1 | lastnamePer | 2022-02-01 | 2023-03-01 | Deprecated outdated rules for city A | NA |
| ottawaSharingB | cityBReportData | 1.2.0 | university-1 | lastnamePer | 2022-03-15 | 2023-03-01 | Changed outdated rules for city B | NA |

Many of these values can be generated automatically: `name` can be extracted from the filename of the schema, and `lastUpdated` can be inferred by reading the modified date from the filesystem. `organizationID`, `status`, `version`, and `notes` cannot be inferred automatically at this point, but we hope to be able to infer them automatically in a later version of the sharing system.

# Implementation

## Function Signature

The function which implements the sharing feature takes two arguments:

1. `data`: A series of tables from PHES-ODM formatted data. The data input does not have to contain all the entities defined in the ODM; it can contain only those on which the sharing rules should be applied. An example is shown below:

**measures**

| measureRepID | sampleID | measure | value | unit | aggregation |
| -------------- | ------------ | --------- | -------- | ------ | ------------- |
| ottWW100 | pgsOttS100 | covN1 | 0.0023 | gcml | sin |
| ottWW101 | pgsOttS101 | covN1 | 0.0402 | gcml | sin |

**samples**

| sampleID | siteID | collDT | saMaterial |
| ------------ | ---------- | ----------------------- | ------------- |
| pgsOttS100 | ottawa-1 | 2021-02-01 9:00:00 PM | rawWW |
| pgsOttS101 | ottawa-1 | 2021-02-01 9:00:00 PM | rawWW |
| pgsOttS102 | ottawa-1 | 2021-02-26 9:00:00 PM | rawWW |

**organizations**

| organizationID | name | orgType |
| ---------------- | --------------------- | --------- |
| lab100 | University L100 Lab | academ |
| lab101 | University L101 Lab | academ |

The above `data` example has three tables, **measures**, **samples**, and **organizations**, with each table containing two or three rows. The table names (`partID`s) as specified in the ODM should match the input file names. The names of the columns and their value types should match up with their specification (including the named `partID`) in the ODM.

2. `sharing_rules`: The tabular `sharing.csv` containing the sharing rules to be applied to the data. Each item must reference a table (or multiple tables), and reference some or all of the fields as defined in the data above.
An -example is shown below, - -| ruleId | table | mode | key | operator | value | notes | -|--------|------------------|--------|----------|----------|-----------------------|-------| -| 1 | all | select | NA | NA | all | | -| 2 | samples | filter | collDT | in | 2021-01-25:2021-02-25 | | -| 3 | samples;measures | filter | sampleID | = | pgsOttS101;pgsOttS102 | | -| 4 | NA | share | PHAC | NA | 1;3 | | -| 5 | NA | share | public | NA | 1;2;3 | | - -The above `sharing_rules` example contains three rules to apply to the data, -and 2 rules for targetting the sharing of data. - -The function will then return one dataset output (either xlsx file or series of -csv files) per organization/individual named in the rules with a `share` value -in the `mode` column. This will be the `filtered_data`, with the example shown -below: - -- **filtered_data**: The data to share with an organization. This is a copy - of the `data` parameter with the columns and rows that meet the inclusion - rules defined in the sharing rules for the passed organization. It has the - same structure as the `data` argument described above. To continue our - example: - -**FOR: PUBLIC** - -**measures** - -| measureRepID | sampleID | measure | value | unit | aggregation | -| -------------- | ------------ | --------- | -------- | ------ | ------------- | -| ottWW101 | pgsOttS101 | covN1 | 0.0402 | gcml | sin | - -**samples** - -| sampleID | siteID | collDT | saMaterial | -| ------------ | ---------- | ----------------------- | ------------ | -| pgsOttS101 | ottawa-1 | 2021-02-01 9:00:00 PM | rawWW | - -**organizations** - -| organizationID | name | orgType | -| ---------------- | --------------------- | --------- | -| lab100 | University L100 Lab | academ | -| lab101 | University L101 Lab | academ | - -**FOR: PHAC** - -**measures** - -| measureRepID | sampleID | measure | value | unit | aggregation | -|--------------|------------|---------|--------|------|-------------| -| ottWW101 | pgsOttS101 | covN1 | 0.0402 | gcml | sin | - -**samples** - -| sampleID | siteID | collDT | saMaterial | -|------------|----------|-----------------------|------------| -| pgsOttS101 | ottawa-1 | 2021-02-01 9:00:00 PM | rawWW | -| pgsOttS102 | ottawa-1 | 2021-02-26 9:00:00 PM | rawWW | - -**organizations** - -| organizationID | name | orgType | -| ---------------- | --------------------- | --------- | -| lab100 | University L100 Lab | academ | -| lab101 | University L101 Lab | academ | - -The above data can then be exported as two separate excel files (or sets of csv -files), with one for the public and one for PHAC. - -- **sharing_summary**: A tabular breakdown of entities for whom sharing data - was generated, and for each organization it lists the ruleIDs of the - applied rules, the tables included in the shared data, and the number of - rows for each shared table. An example is shown below: - -**summary_table:** - -| destination_org | rule_ids_used | tables_shared | number_rows_output | -|-----------------|---------------|---------------|--------------------| -| public | 1,2,3 | measures | 1 | -| public | 1,2,3 | samples | 1 | -| public | 1,2,3 | organizations | 2 | -| PHAC | 1,3 | measures | 1 | -| PHAC | 1,3 | samples | 2 | -| PHAC | 1,3 | organizations | 2 | - -- **sharing_rules_summary**: A copy of the sharing rules csv, but with an - additional column for recording the number of cells selected by each rule. - Allows for users to check the fineness of their data filtration, and detect - potential errors. 
As an example:

| ruleId | table | mode | key | operator | value | notes | selectedCells |
|--------|------------------|--------|----------|----------|-----------------------|-------|---------------|
| 1 | all | select | NA | NA | all | | 30 |
| 2 | samples | filter | collDT | in | 2021-01-25:2021-02-25 | | 8 |
| 3 | samples;measures | filter | sampleID | = | pgsOttS101;pgsOttS102 | | 14 |
| 4 | NA | share | PHAC | NA | 1;3 | | 20 |
| 5 | NA | share | public | NA | 1;2;3 | | 16 |

The `sharing_summary` and `sharing_rules_summary` tables should be shared with the `filtered_data` output, along with the `sharing_metadata` file.

Describing the example above,

1. The rule with ID 1 says to include all tables and columns, so all tables and columns were included in the output `filtered_data`, with only the rows that matched the inclusion criteria. If no row filter is provided, all rows of the included columns are shared.
2. For the rule with ID 2, an additional row was filtered out of **samples**, as one of the entries did not match the inclusion criteria for the collection date.
3. The rule with ID 3 says that rows with a **sampleID** of "pgsOttS101" or "pgsOttS102" were included across the `measures` and `samples` tables. This meant that the one row in the **measures** table that met this criterion was included, along with the two rows from the **samples** table that met it.
4. Rule ID 4 says to share with PHAC the data that meets the criteria of rules 1 and 3, and rule ID 5 says to share with the public the data that meets the criteria of rules 1, 2, and 3. So those rules are applied together to generate the two outputs, one for each sharing partner.

See [docs/manual/install.qmd](docs/manual/install.qmd).
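## Quick example

For a quick taste of both interfaces, here is a sketch that assumes a sharing schema `schema.csv` and an ODM-formatted `data.xlsx` (both are described in the manual chapters):

```bash
# one-shot sharing from the command line
odm-share schema.csv data.xlsx
```

or, from Python:

```python
import odm_sharing.sharing as sh

# one filtered dataset per organization named in the schema
for org, tables in sh.extract('schema.csv', 'data.xlsx').items():
    print(org, list(tables))
```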
diff --git a/docs/manual/.gitignore b/docs/manual/.gitignore new file mode 100644 index 00000000..a4377a31 --- /dev/null +++ b/docs/manual/.gitignore @@ -0,0 +1,17 @@ +# generated by quarto +/.quarto/ +_book/ +build/ +search.json +/chapters/*.quarto_ipynb + +# generated by quartodoc +/api-reference/ +objects.json + +# generated by code examples +/chapters/*.csv + +# assets copied by quarto document setup code +/chapters/*.csv +/chapters/*.xlsx diff --git a/docs/manual/_quarto.yml b/docs/manual/_quarto.yml new file mode 100644 index 00000000..1ef7ce4b --- /dev/null +++ b/docs/manual/_quarto.yml @@ -0,0 +1,46 @@ +project: + type: book + output-dir: build + execute-dir: file + +book: + title: 'PHES-ODM Sharing Library Manual' + author: 'OHRI' + chapters: + - index.qmd + - chapters/install.qmd + - chapters/getting-started.qmd + - chapters/cli-reference.qmd + - chapters/cli-usage.qmd + - api-reference/index.qmd + - chapters/api-usage.qmd + appendices: + - chapters/data-sources.qmd + - chapters/schemas.qmd + - chapters/python.qmd + - chapters/sqlite.qmd + +quartodoc: + renderer: _renderer.py + style: single-page + parser: sphinx + package: odm_sharing + dir: api-reference + sections: + - title: API Reference + - contents: + - sharing.extract + - sharing.parse + - sharing.connect + - sharing.get_data + - sharing.get_counts + - sharing.get_columns + - sharing.CsvFile + +theme: + - default + - assets/style.scss + +pdf-engine: pdflatex +toc: true +toc-depth: 2 diff --git a/docs/manual/_renderer.py b/docs/manual/_renderer.py new file mode 100644 index 00000000..e9e41f31 --- /dev/null +++ b/docs/manual/_renderer.py @@ -0,0 +1,21 @@ +from typing import Optional, Union + +from griffe import dataclasses as dc +from plum import dispatch +from quartodoc import MdRenderer + + +class Renderer(MdRenderer): + style = 'odm' + + @dispatch + def signature( + self, + el: Union[dc.Class, dc.Function], + source: Optional[dc.Alias] = None + ): + # exclude package name from function signature + # XXX: doesn't work to set this in __init__ + self.display_name = 'short' + + return super().signature(el) diff --git a/docs/manual/assets/data.xlsx b/docs/manual/assets/data.xlsx new file mode 100644 index 00000000..2417f98b Binary files /dev/null and b/docs/manual/assets/data.xlsx differ diff --git a/docs/manual/assets/measures.csv b/docs/manual/assets/measures.csv new file mode 100644 index 00000000..4f73510f --- /dev/null +++ b/docs/manual/assets/measures.csv @@ -0,0 +1,5 @@ +measureRepID,sampleID,measure,value,unit,aggregation +o.08.08.20covN1,o.08.08.20,covN1,0.00036,gcPMMoV,meanNr +o.08.08.20covN2,o.08.08.20,covN1,0.00003,gcPMMoV,sdNr +o.08.08.20covN4,o.08.08.20,covN2,0.00002,gcPMMoV,meanNr +o.08.08.20covN3,o.08.08.20,covN2,0.00004,gcPMMoV,sdNr diff --git a/docs/manual/assets/odm-logo.png b/docs/manual/assets/odm-logo.png new file mode 100644 index 00000000..99754e6f Binary files /dev/null and b/docs/manual/assets/odm-logo.png differ diff --git a/docs/manual/assets/schema-missing-headers.csv b/docs/manual/assets/schema-missing-headers.csv new file mode 100644 index 00000000..6ec11481 --- /dev/null +++ b/docs/manual/assets/schema-missing-headers.csv @@ -0,0 +1 @@ +ruleID,mode,key,value,notes diff --git a/docs/manual/assets/schema-missing-rules.csv b/docs/manual/assets/schema-missing-rules.csv new file mode 100644 index 00000000..b116cf15 --- /dev/null +++ b/docs/manual/assets/schema-missing-rules.csv @@ -0,0 +1 @@ +ruleID,table,mode,key,operator,value,notes diff --git a/docs/manual/assets/schema.csv 
b/docs/manual/assets/schema.csv new file mode 100644 index 00000000..e495f5e3 --- /dev/null +++ b/docs/manual/assets/schema.csv @@ -0,0 +1,4 @@

ruleID,table,mode,key,operator,value,notes
1,measures,select,NA,NA,all,"select all columns from the measures table"
2,measures,filter,measure,=,covN1,"where measure equals covN1"
3,NA,share,OHRI,NA,1;2,"use rule 1 & 2 for the OHRI organization"

diff --git a/docs/manual/assets/style.scss b/docs/manual/assets/style.scss new file mode 100644 index 00000000..b1e933c5 --- /dev/null +++ b/docs/manual/assets/style.scss @@ -0,0 +1,6 @@

/*-- scss:rules --*/

h3 {
  font-size: calc(1.17em + 0.24vw);
  color: #5a6570;
}

diff --git a/docs/manual/build-api.sh b/docs/manual/build-api.sh new file mode 100755 index 00000000..105c6a99 --- /dev/null +++ b/docs/manual/build-api.sh @@ -0,0 +1,26 @@

#!/usr/bin/env bash
set -e

projdir=$(dirname $0)
cd $projdir

# build API reference
quartodoc build

cd api-reference

# XXX: `sed -i` must be written as `sed -i'' -e` for portability:
# https://stackoverflow.com/questions/4247068/sed-command-with-i-option-failing-on-mac-but-works-on-linux

# fix the generated index
if [[ -f index.qmd.qmd ]]; then

  # HACK: fix generated file name
  mv index.qmd.qmd index.qmd

  # HACK: remove 'None' header and desc
  sed -i'' -e 's/^#*\s*None$//' index.qmd

  # HACK: add cross-reference to title
  sed -i'' -e 's/^# .*/\0 {#sec-api-ref}/' index.qmd
fi

diff --git a/docs/manual/chapters/api-usage.qmd b/docs/manual/chapters/api-usage.qmd new file mode 100644 index 00000000..8ecea402 --- /dev/null +++ b/docs/manual/chapters/api-usage.qmd @@ -0,0 +1,149 @@

# API Usage {#sec-api-usage}

```{python}
#| echo: false
import tempfile

from common import copy_assets

temp_dir = tempfile.gettempdir()

copy_assets(['schema.csv', 'measures.csv', 'data.xlsx'])
```

## Share a single table in CSV format

`extract` is a simple high-level function for retrieving filtered data. It can be seen as equivalent to the `odm-share` CLI tool.

The following example extracts data from table "measures" (residing in `measures.csv`) according to the sharing rules in `schema.csv`, and stores the result for each organization in a separate CSV file:

```{python}
import pandas as pd
import odm_sharing.sharing as sh

# NOTE: the CSV filename must be a valid table name
filtered_data = sh.extract('schema.csv', 'measures.csv')
for org_name, table_data in filtered_data.items():
    df: pd.DataFrame = table_data['measures']
    df.to_csv(f'{temp_dir}/measures-for-{org_name}.csv')
```

## Share multiple tables in Excel format

The above example can be rewritten for Excel files as follows:

```{python}
import pandas as pd
import odm_sharing.sharing as sh

# NOTE: Excel sheet names must be valid table names
data = sh.extract('schema.csv', 'data.xlsx')
for org, table_data in data.items():
    with pd.ExcelWriter(f'{temp_dir}/data-for-{org}.xlsx') as writer:
        for table_name, df in table_data.items():
            df.to_excel(writer, sheet_name=table_name, index=False)
```

## Parse a sharing schema

The `parse` function turns a schema file into queries that can later be used to filter data.
+ +```{python} +import pandas as pd +import odm_sharing.sharing as sh + +org_table_queries = sh.parse('schema.csv') +for org, table_queries in org_table_queries.items(): + print('org:', org) + for table, query in table_queries.items(): + print('tbl:', table) + + # NOTE: query internals are considered private and are subject to + # change + print('sql:', query.data_query.sql) +``` + +## Connecting to data sources + +The `connect` function establishes a connection to a data source to be queried. +Both files and databases can be accessed. + +```{python} +import odm_sharing.sharing as sh + +excel_con = sh.connect('data.xlsx') +sqlite_memory_con = sh.connect('sqlite://') +sqlite_file_con = sh.connect(f'sqlite:///{temp_dir}/data.db') +``` + +The following examples require extra packages to be installed, depending on the +database driver used: + +```python +mysql_con = sh.connect('mysql://:@/') +mssql_con = sh.connect('mssql+pyodbc://:@') +``` + +See +[SQLAlchemy](https://docs.sqlalchemy.org/en/20/core/engines.html#database-urls) +for more database URL examples. + +## Get filtered data + +The following example shows how to use the `connect`, `parse` and `get_data` +functions together, to extract filtered data from a data source. This is +essentially the same thing as using `extract`. + +```{python} +import odm_sharing.sharing as sh + +con = sh.connect('measures.csv') +queries = sh.parse('schema.csv') +for org, table_queries in queries.items(): + for table, query in table_queries.items(): + print(sh.get_data(con, query)) +``` + +## Get column names + +The `get_columns` function provides a way to know which columns will be +selected without performing the actual data extraction. + +```{python} +import odm_sharing.sharing as sh + +con = sh.connect('measures.csv') +queries = sh.parse('schema.csv') +for org, table_queries in queries.items(): + for table, query in table_queries.items(): + (select_rule_id, columns) = sh.get_columns(con, query) + print('columns:') + for col in columns: + print('-', col) +``` + +## Get rule counts + +The `get_counts` function can be used to show how many rows are included by +each rule's filter. This is useful for debugging a schema during its +development. + +```{python} +import odm_sharing.sharing as sh + +con = sh.connect('measures.csv') +queries = sh.parse('schema.csv') +for org, table_queries in queries.items(): + for table, query in table_queries.items(): + print('rule counts:') + rule_counts = sh.get_counts(con, query) + for rule_id, count in rule_counts.items(): + print(f'- #{rule_id}: {count}') + +``` diff --git a/docs/manual/chapters/cli-reference.qmd b/docs/manual/chapters/cli-reference.qmd new file mode 100644 index 00000000..a790f267 --- /dev/null +++ b/docs/manual/chapters/cli-reference.qmd @@ -0,0 +1,84 @@ +```{python} +#| echo: false +from os.path import join + +from odm_sharing.tools.share import share + +from common import ASSET_DIR +``` + +# CLI Reference {#sec-cli-ref} + +```bash +odm-share [OPTION]... SCHEMA INPUT... +``` + +Arguments: + +- SCHEMA + + sharing schema file path + +- INPUT... 
+
+    CSV files, an Excel/SQLite file, or an [SQLAlchemy database URL](https://docs.sqlalchemy.org/en/20/core/engines.html#database-urls)
+
+Options:
+
+- `--orgs=NAME[,...]`
+
+    comma-separated list of organizations to output data for, defaults to all
+
+- `--outfmt=FORMAT`
+
+    output format (`excel` or `csv`), defaults to input format when input is a
+    file and `excel` when it's a database
+
+- `--outdir=PATH`
+
+    output file directory, defaults to the current directory. It's created if
+    it doesn't exist.
+
+- `-d`, `--debug`:
+
+    output debug info to STDOUT (and ./debug.txt) instead of creating sharable
+    output files. This shows which tables and columns are selected, and how
+    many rows each filter returns.
+
+- `-q`, `--quiet`:
+
+    don't log to STDOUT
+
+One or multiple sharable output files will be created in the chosen output
+directory according to the chosen output format and organization(s). Each
+output file is named after the organization (and table or input file) it was
+created for.
+
+::: {.callout-warning}
+Boolean values will be normalized as TRUE/FALSE in the output. See
+@sec-data-sources for more information.
+:::
+
+::: {.callout-warning}
+Even though using a database as input is supported, it hasn't been tested much
+yet.
+:::
+
+## Errors
+
+Error messages will be printed to the terminal (STDERR) when something is
+wrong. Each message begins by stating where the error originated, including
+the filename and line number or rule ID. Here are a few examples:
+
+When headers are missing from the schema:
+
+```{python}
+#| echo: false
+x = share(join(ASSET_DIR, 'schema-missing-headers.csv'), ['assets/measures.csv'])
+```
+
+When no share-rules are contained in the schema:
+
+```{python}
+#| echo: false
+x = share(join(ASSET_DIR, 'schema-missing-rules.csv'), ['assets/measures.csv'])
+``` diff --git a/docs/manual/chapters/cli-usage.qmd b/docs/manual/chapters/cli-usage.qmd new file mode 100644 index 00000000..19737065 --- /dev/null +++ b/docs/manual/chapters/cli-usage.qmd @@ -0,0 +1,104 @@ +```{python}
+#| echo: false
+from odm_sharing.tools.share import share
+
+from common import DATA, SCHEMA
+```
+
+# CLI Usage {#sec-cli-usage}
+
+## Examples
+
+### Using CSV files
+
+::: {.callout-note}
+Please keep in mind that CSV files currently need to have the same names as the
+tables they're representing. `measures.csv` represents the table "measures",
+etc.
+:::
+
+::: {.callout-note}
+In this example, the terminal is pointing to the same directory as the schema
+and input files. If that isn't the case, then their full relative/absolute
+paths must be spelled out.
+:::
+
+To share tables `measures.csv` and `samples.csv`, using sharing schema
+`schema.csv`, the following command can be used:
+
+```bash
+odm-share schema.csv measures.csv samples.csv
+```
+
+It will create one output file per table for each organization specified in
+the schema, with data filtered according to the schema rules, ready to share.
+
+The above command can also be simplified with a wildcard (`*`) if you have a
+directory with only the files you want to share, like this:
+
+```bash
+odm-share schema.csv mytables/*.csv
+```
+
+### Using an Excel file
+
+Excel files can be used as input to share multiple tables at once:
+
+```bash
+odm-share schema.csv data.xlsx
+```
+
+It will make an output file called `<input>-<org>.xlsx` for each
+organization in the schema.
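+
+If you'd rather stay in Python, the same one-shot behaviour is available
+through the `share` function that backs the CLI (it's also what this manual's
+own example cells call). A minimal sketch, assuming `schema.csv` and
+`data.xlsx` are in the current directory:
+
+```python
+from odm_sharing.tools.share import share
+
+# equivalent to: odm-share schema.csv data.xlsx
+# returns the paths of the output files it created
+outpaths = share('schema.csv', ['data.xlsx'])
+print(outpaths)
+```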
+
+### Using a database
+
+To use a MySQL database as input (with the pymysql package):
+
+```bash
+odm-share schema.csv mysql+pymysql://user:pass@host/db
+```
+
+Same as above, using an MS SQL Server database through ODBC (with the pyodbc
+package):
+
+```bash
+odm-share schema.csv mssql+pyodbc://user:pass@mydsn
+```
+
+### Using additional options
+
+- Share CSV files from an Excel file:
+
+    ```bash
+    odm-share --outfmt=csv schema.csv data.xlsx
+    ```
+
+- Create a sharable Excel file in the "~/files" directory, for the "OHRI" and
+  "TOH" organizations, applying the rules from schema.csv to the input from
+  data.xlsx:
+
+    ```bash
+    odm-share --orgs=OHRI,TOH --outdir=~/files schema.csv data.xlsx
+    ```
+
+## Debugging
+
+Debug mode provides information about what would happen when using a specific
+schema, without pulling the actual data to be shared. Debugging is enabled by
+passing the `--debug` flag, or simply `-d`.
+
+Here's an example using the sample files from [getting started](getting-started.qmd):
+
+```bash
+odm-share --debug schema.csv data.xlsx
+```
+```{python}
+#| echo: false
+share(SCHEMA, [DATA], debug=True)
+```
+
+Here we can see the columns that would be selected, as well as the number of
+rows each rule would produce. Specifically, we can see that 4 rows would be
+selected by rule #1, but the filter in rule #2 reduces that number to 2, which
+is the final count as confirmed in the count for rule #3. diff --git a/docs/manual/chapters/common.py b/docs/manual/chapters/common.py new file mode 100644 index 00000000..0dc5d2ce --- /dev/null +++ b/docs/manual/chapters/common.py @@ -0,0 +1,31 @@ +import inspect
+from os.path import abspath, dirname, join
+from shutil import copy2
+from typing import List
+
+import pandas as pd
+from tabulate import tabulate
+
+
+MODULE_DIR = dirname(abspath(inspect.getfile(inspect.currentframe())))
+ASSET_DIR = join(MODULE_DIR, '../assets/')
+SCHEMA = join(ASSET_DIR, 'schema.csv')
+DATA = join(ASSET_DIR, 'measures.csv')
+
+
+def load_csv_md(path: str) -> str:
+    '''read csv file and convert it to markdown'''
+    df = pd.read_csv(path, keep_default_na=False)
+    md = tabulate(df, headers=df.columns.to_list(), showindex=False)
+    return md
+
+
+def print_file(path: str) -> None:
+    with open(path, 'r') as f:
+        print(f.read())
+
+
+def copy_assets(filenames: List[str]) -> None:
+    '''copies assets to the current directory to avoid long paths in examples'''
+    for fn in filenames:
+        copy2(join(ASSET_DIR, fn), fn) diff --git a/docs/manual/chapters/data-sources.qmd b/docs/manual/chapters/data-sources.qmd new file mode 100644 index 00000000..979e0f94 --- /dev/null +++ b/docs/manual/chapters/data-sources.qmd @@ -0,0 +1,28 @@ +# Data Sources {#sec-data-sources}
+
+## Data Types
+
+Boolean values from CSV/Excel input are detected and normalized to make
+filtering work correctly. This may cause certain columns to be incorrectly
+recognized as boolean, and columns intended as boolean to not be recognized.
+Columns with the following values are recognized as boolean:
+
+- CSV:
+    - NA-values: empty string, `NA`
+    - Booleans formatted as strings, in any letter case: `FALSE`, `true`
+- Excel:
+    - NA-values: empty cell, `NA`
+    - Booleans formatted as strings, in any letter case: `FALSE`, `true`
+    - Native Excel boolean values: `FALSE`, `TRUE`
+    - Boolean formulas: `=FALSE()`, `=TRUE()`
+
+Columns with only NA-values won't be recognized, since their intended type is
+ambiguous.
+
+If the first cells in a column have values other than what is shown above,
+they won't be recognized as boolean.
There may also be false-positives if any of the +boolean values shown above are used in a column meant for text. + +Finally, boolean values included in the filtered output are converted to the +string values `FALSE`/`TRUE`. This is also the case when using databases as +input. diff --git a/docs/manual/chapters/getting-started.qmd b/docs/manual/chapters/getting-started.qmd new file mode 100644 index 00000000..8325eb84 --- /dev/null +++ b/docs/manual/chapters/getting-started.qmd @@ -0,0 +1,93 @@ +```{python} +#| echo: false +from pathlib import Path + +import IPython.display as display + +from odm_sharing.tools.share import share + +from common import DATA, SCHEMA, copy_assets, load_csv_md, print_file +``` + +# Getting started {#sec-getting-started} + +So you want to share your data, but not all of it. This library provides the +tools for filtering your data before sharing it. + +## Data + +The input data can either be in CSV or Excel/XLSX format. Here's a CSV example. + +The following text can be copied into a file called `measures.csv`: + +```{python} +#| echo: false +#| output: asis + +print('```bash') +print_file(DATA) +print('```') +``` + +## Schema + +A sharing schema is needed to define the sharing rules. Here's an example +schema that defines one rule for which columns to select, one for which rows to +include, and one that binds them together with a recipient organization. Look +at the `notes` column for an explanation of each rule. + +The following text can be copied into a file called `schema.csv`: + +```{python} +#| echo: false +#| output: asis + +print('```bash') +print_file(SCHEMA) +print('```') +``` + +See @sec-schemas for more information on schemas. + +## CLI + +Run `odm-share` from the command line to filter your data using a schema: + +```bash +odm-share schema.csv measures.csv +``` + +This will produce one output file per organization (as specified in the schema) +with the filename pattern `-.csv`. + +Using the above schema and data examples would result in the following: + +`measures-OHRI.csv` +```{python} +#| echo: false +outpaths = share(SCHEMA, [DATA]) +display.Markdown(load_csv_md(outpaths[0])) +``` +
+See @sec-cli-ref for the CLI reference and @sec-cli-usage for more examples.
+
+## API
+
+```{python}
+#| echo: false
+copy_assets(['schema.csv', 'measures.csv'])
+```
+
+The following code does exactly the same as the above CLI command, but using
+the functions exported by the library:
+
+```{python}
+import odm_sharing.sharing as sh
+
+results = sh.extract('schema.csv', 'measures.csv')
+for org, table_data in results.items():
+    data_frame = table_data['measures']
+    print(data_frame)
+```
+
+See @sec-api-ref for the API reference and @sec-api-usage for more examples. diff --git a/docs/manual/chapters/install.qmd b/docs/manual/chapters/install.qmd new file mode 100644 index 00000000..ca003e55 --- /dev/null +++ b/docs/manual/chapters/install.qmd @@ -0,0 +1,144 @@ +# Installation
+
+The commands below can be entered using any terminal application. Search for
+"terminal" or "command line" in your OS application menu.
+
+::: {.callout-important}
+The python and pip executables may have different names depending on the
+distribution. `python` and `pip` are used below, but they may also be called
+`python3` and `pip3`.
+:::
+
+## System dependencies
+
+- Python >= 3.9 (see @sec-python)
+- SQLite 3 (see @sec-sqlite)
+
+## Package
+
+Install the package:
+
+```bash
+pip install git+https://github.com/Big-Life-Lab/PHES-ODM-sharing.git
+```
+
+You're done! You can now explore how to get started in @sec-getting-started.
+
+## Development
+
+If you want to set up a development environment, then run the commands below as
+you see fit. If you just want to use the library, then you can ignore this
+section.
+
+1. clone the repo
+
+    ```bash
+    git clone https://github.com/Big-Life-Lab/PHES-ODM-sharing odm-sharing
+    cd odm-sharing
+    ```
+
+2. if python != 3.9, then install pyenv and set up the repo to run python 3.9
+   (if `pyenv` isn't available through your package manager, see the
+   [pyenv installation guide](https://github.com/pyenv/pyenv#installation))
+
+    ```bash
+    sudo apt-get install pyenv
+    pyenv install 3.9
+    pyenv local 3.9
+    ```
+
+3. create a virtual env for the repo, using the selected python version
+
+    - with pyenv:
+
+        ```bash
+        pyenv exec python -m venv .env
+        ```
+
+    - without pyenv:
+
+        ```bash
+        python -m venv .env
+        ```
+
+4. activate the virtual env
+
+    (This must be done every time a new terminal is opened, to set the correct
+    python-binary path/env.)
+
+    ```bash
+    source .env/bin/activate
+    ```
+
+5. verify that python now has the correct version (3.9)
+
+    ```bash
+    python --version
+    ```
+
+6. make sure pip is up to date, and install dependencies
+
+    ```bash
+    pip install --upgrade pip
+    pip install -r requirements.txt
+    pip install -r requirements-dev.txt
+    ```
+
+7. if you previously installed the package, then uninstall it again
+
+    ```bash
+    pip uninstall odm_sharing
+    ```
+
+8. build and install the library in development mode
+
+    ```bash
+    pip install -e .
+    ```
+
+You can now edit the source files, and directly see the changes when importing
+the package.
+
+## Documentation
+
+To generate the documentation, you'll first have to:
+
+1. Install [Quarto](https://quarto.org/docs/get-started/)
+2. Activate the Python virtual environment (if any)
+3. Install Python dependencies:
+
+    ```bash
+    pip install -r requirements-doc.txt
+    ```
+
+4. Run `docs/manual/build-api.sh`^[This is currently not very portable, and requires a
+   unix-like shell. Windows users do however have many options, including:
+   GitBash/MSYS/Cygwin/WSL.] to generate the API reference. (This must also be
+   re-run every time the API changes.)
+
+To preview the manual in a web browser while editing:
+
+```bash
+quarto preview docs/manual
+```
+
+To render the manual to a PDF:
+
+```bash
+quarto render docs/manual --to pdf
+```
+
+The rendered PDF file can now be found in `docs/manual/build`.
+
+::: {.callout-note title='Troubleshooting'}
+
+If you're on Linux and the above command fails after mentioning 'tlmgr' or
+'fontawesome5', then do the following:
+
+
+1. Fix `tlmgr` as described in the [Arch Linux wiki](https://wiki.archlinux.org/title/TeX_Live#tlmgr).
+2. Install the fontawesome5 package:
+    ```bash
+    tlmgr install fontawesome5
+    ```
+3. Try the above PDF render command again.
+
+::: diff --git a/docs/manual/chapters/python.qmd b/docs/manual/chapters/python.qmd new file mode 100644 index 00000000..f55d068c --- /dev/null +++ b/docs/manual/chapters/python.qmd @@ -0,0 +1,56 @@ +# Python {#sec-python}
+
+## Install
+
+### Windows & macOS
+
+Python can be downloaded from [here](https://www.python.org/downloads/).
+
+### Linux
+
+Linux users should install Python using their distribution's package manager.
+Installing a specific version is more complicated, so we'll keep it simple and
+use the latest version. On Ubuntu that would be:
+
+```bash
+sudo apt update
+sudo apt install python3
+```
+
+## Test installation
+
+You can now test that Python is working by entering the following in a terminal,
+
+```bash
+python --version
+```
+
+which should output `Python 3.x.x`.
+
+If you get a "command not found" error, then:
+
+1. Make sure you typed it correctly.
+2. Some distributions don't include the `python` alias. Try using `python3`
+   directly, like this: `python3 --version`.
+
+## Set Up Pip
+
+(Replace `python`/`pip` with `python3`/`pip3` as needed.)
+
+Pip should be bundled with Python. Ensure that it's working by running:
+
+```bash
+pip --version
+```
+
+If it's not working, then try the following:
+
+```bash
+python -m ensurepip
+```
+
+Lastly, upgrade pip:
+
+```bash
+pip install --upgrade pip
+``` diff --git a/docs/manual/chapters/schemas.qmd b/docs/manual/chapters/schemas.qmd new file mode 100644 index 00000000..2394367b --- /dev/null +++ b/docs/manual/chapters/schemas.qmd @@ -0,0 +1,544 @@ +# Schemas {#sec-schemas}
+
+## Introduction
+
+The sharing CSV file provides different data generators or custodians with a
+standardized and code-agnostic method to define rules for sharing data with
+different organizations. Each row in the CSV file defines one rule; together,
+the rows define all the sharing rules for a given data generator or custodian.
+The headers of the CSV file define the different parts of the rule. The
+following sections outline these different parts, and provide a guide to
+defining a rule.
+
+### 1. Setting the ruleID
+
+Because each sharing schema is a closed system to a given data generator or
+data custodian, the ruleIDs only need to be unique within a given schema
+(`sharing.csv`). Using sequential integers for ruleIDs works well, and is the
+recommended approach.
+
+### 2. Rules and Modes
+
+After defining the unique ID for a rule, the next step is to determine the
+`mode` of a rule. There are four possible values for the `mode` column:
+
+1. `select`: This indicates that the effect of this rule will be to select the
+   tables and columns for inclusion in the output shareable data. It also means
+   that the `key` and `operator` columns do not need to be defined for this
+   rule.
+2. `filter`: This is used for rules that will filter the shareable data output
+   rows based on row values.
The full rule will require the `key` and
+   `operator` columns to be fully specified.
+3. `group`: This defines a rule that groups or combines other rules
+   together for execution. The full rule will require the `operator` column to
+   be fully specified.
+4. `share`: This defines a rule that specifies the `organizationID` or
+   `contactID` with which an output will be shared, as well as the rules to
+   apply to generate the specific output data. The full rule will require the
+   `key` column, but not the `operator` column, to be fully specified.
+
+Generally, the bulk of a sharing csv will be composed of `filter` and `select`
+rules, with a few `group` rules, and the final `share` rules at the very end.
+Rules should also be written and specified in this same order.
+
+### 3. Selecting an Entity
+
+In order to generate an intelligible output dataset, several `select` and
+`filter` rules will need to first be stacked and applied. This step involves
+selecting the parts of the PHES-ODM or entities within the model. The entities
+that can be selected are:
+
+- Data contained in a table
+- Data contained in column(s) of table(s)
+- Data contained in row(s) of table(s)
+
+This step uses four columns, `table`, `mode`, `key`, and/or `value`. The
+`table` column specifies the name(s) of the table(s) to which this rule
+applies. To list multiple tables in the `table` column, list each table
+separated by a ";". The `mode` column specifies the action of a rule. For
+`mode = filter` rules, the `key` column lists the name(s) of the column(s)
+that the filter acts on. For
+`mode = select` rules, the names of the selected columns are specified in the
+`value` column. For rules that select entities, the `filter` and `select` modes
+will be used.
+
+#### 3.1. Selecting Columns
+
+In order to have any data to share, tables and columns need to be specified for
+inclusion. These are the first rules to define in your schema. To specify which
+columns should be shared, specify the table or tables in the `table` column,
+list `select` in the `mode` column, and then list the column or columns to be
+shared in the `value` column. When specifying the columns, you can separate
+distinct column names with a ";". The `key` and `operator` columns should be
+left blank (or `NA`) as they are not used in these rules, and any values in
+these columns for `select`-mode rows will be ignored.
+
+To select all columns, an `all` value can be used in the `value` column of the
+sharing csv.
+
+Some examples are given below:
+
+1. Selecting only the `saMaterial` column in the `samples` table
+
+    | ruleId | table   | mode   | key | operator | value      | notes     |
+    |--------|---------|--------|-----|----------|------------|-----------|
+    | 1      | samples | select | NA  | NA       | saMaterial | NA        |
+
+2. Selecting only the `reportable` and the `pooled` columns in the `measures`
+table
+
+    | ruleId | table    | mode   | key | operator | value      | notes     |
+    |--------|----------|--------|-----|----------|------------|-----------|
+    | 2      | measures | select | NA  | NA       | reportable;pooled | NA |
+
+3. Selecting all the columns in the `measures` table
+
+    | ruleId       | table    | mode   | key | operator | value      | notes     |
+    |--------------|----------|--------|-----|----------|------------|-----------|
+    | 3            | measures | select | NA  | NA       | all        | NA        |
+
+4. Selecting only the `purposeID` column in the `measures` and the `samples` table
+
+    | ruleId       | table    | mode   | key | operator | value      | notes     |
+    |--------------|----------|--------|-----|----------|------------|-----------|
+    | 4            | measures;samples | select | NA | NA | purposeID  | NA        |
+
+Notes:
+
+- In examples 2 and 4, where multiple columns and tables were selected
+  respectively, a `;` was used to separate the values. Throughout this entire
+  document, when multiple values need to be listed in a single cell, the `;`
+  symbol should be used to separate discrete values.
+
+- In example 3, where all the columns in a table were selected, the keyword
+  `all` was used. Similar to the `;` symbol, the keyword `all` may be used in
+  a cell to mean everything.
+
+- The **ruleId** column is mandatory for all rules and each value must be
+  unique across the entire sheet (`sharing.csv`). Sequential integers are the
+  recommended choice.
+
+#### 3.2. Filtering Rows
+
+Once the columns and tables for inclusion have been specified, users can
+specify which rows should be shared using rules with the `filter` mode. Note
+that rules that filter can use values in any columns, including columns that
+are not being shared in the final output. To specify a `filter` rule, users
+need to specify the table or tables in the `table` column, and define the
+`mode` as filter. Then users can specify the columns which the filter will act
+on in the `key` column, specify the nature of the filter using the `operator`
+column, and the filter values in the `value` column. The general structure for
+the filter argument is:
+
+```
+**column name** **operator** **value**
+```
+
+"column name" is the name of a column (specified in the `key` column) from the
+table(s) specified in the `table` column.
+
+"value" is the value, or range of values, that determine whether a row is
+selected for sharing. It's stored in the `value` column. String values can
+optionally be quoted with double-quotes (`"`) to allow commas, while
+single-quotes are assumed to be part of the value.
+
+"operator" is a placeholder for the symbol that indicates the nature of the
+filter to be applied, and the desired relationship between the `key` and the
+`value`. The currently accepted values for the `operator` column are:
+
+- **=**: Denotes exact equivalence. This should be used for categorical or
+    character variables.
+- **\>**: Denotes "greater-than". This can be used for numeric, integer, or
+    date-type variables. Note that it is exclusive of the value used in the
+    expression.
+- **\<**: Denotes "lesser-than". This can be used for numeric, integer, or
+    date-type variables. Note that it is exclusive of the value used in the
+    expression.
+- **\>=**: Denotes "greater-than-or-equal-to". This can be used for numeric,
+    integer, or date-type variables. Note that it is inclusive of the value
+    used in the expression.
+- **\<=**: Denotes "lesser-than-or-equal-to". This can be used for numeric,
+    integer, or date-type variables. Note that it is inclusive of the value
+    used in the expression.
+- **in**: Denotes that a value is contained in either a range of continuous
+    data or a set of values. Range values are separated by `:` and can be used
+    for numeric, integer, or date-type variables, while sets are separated by
+    `;` and can be of any type. Note that both ranges and sets are inclusive of
+    the values used in the expression.
+
+Technically the `operator` column also accepts `AND` and `OR` as values, but
+only for rules of the `group` mode.
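+
+Conceptually, each filter rule evaluates to a boolean row mask on its table.
+As a rough pandas sketch (not the library's implementation, and assuming a
+samples CSV with `collDT` and `siteID` columns), the two flavours of the `in`
+operator can be read as:
+
+```python
+import pandas as pd
+
+samples = pd.read_csv('samples.csv')
+
+# in 2022-02-01:2022-02-28  (range, inclusive of both endpoints)
+range_mask = samples['collDT'].between('2022-02-01', '2022-02-28')
+
+# in ottawa-1;laval-1  (set of discrete values)
+set_mask = samples['siteID'].isin(['ottawa-1', 'laval-1'])
+
+print(samples[range_mask & set_mask])
+```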
+
+Some examples of how these rules can be constructed and applied in practice are
+given below:
+
+1. Selecting only the rows where the value of `siteID` is exactly equal to
+"ottawa-1" in the `samples` table.
+
+    | ruleId | table    | mode   | key     | operator | value     | notes     |
+    |--------|----------|--------|---------|----------|-----------|-----------|
+    | 6      | samples  | filter | siteID  | =        | ottawa-1  |           |
+
+2. Selecting only the rows where the value of "Collection period" (`collPer`)
+is greater than or equal to 5 in the `samples` table.
+
+    | ruleId | table   | mode   | key     | operator | value | notes |
+    |--------|---------|--------|---------|----------|-------|-------|
+    | 7      | samples | filter | collPer | >=       | 5     |       |
+
+3. Selecting only the rows where the value of "Collection period" (`collPer`)
+is less than or equal to 5 in the `samples` table.
+
+    | ruleId | table    | mode   | key     | operator | value     | notes     |
+    |--------|----------|--------|---------|----------|-----------|-----------|
+    | 8      | samples  | filter | collPer | <=       | 5         |           |
+
+4. Selecting only the rows where the value of "Analysis date end" (`aDateEnd`)
+is exactly equal to February 1st, 2022 (2022-02-01) from the `measures` table.
+
+    | ruleId | table    | mode   | key      | operator | value      | notes     |
+    |--------|----------|--------|----------|----------|------------|-----------|
+    | 9      | measures | filter | aDateEnd | =        | 2022-02-01 |           |
+
+5. Selecting only the rows where the value of "Analysis date end" (`aDateEnd`)
+is a date in February from the `measures` table.
+
+    | ruleId | table    | mode   | key      | operator | value                 | notes |
+    | ------ | -------- | ------ | -------- | -------- | --------------------- | ----- |
+    | 10     | measures | filter | aDateEnd | in       | 2022-02-01:2022-02-28 |       |
+
+6. Selecting only the rows where the value of `siteID` is either "ottawa-1" or
+   "laval-1" in the samples table.
+
+    | ruleId | table    | mode   | key      | operator | value            | notes |
+    | ------ | -------- | ------ | -------- | -------- | ---------------- | ----- |
+    | 10     | samples  | filter | siteID   | in       | ottawa-1;laval-1 |       |
+
+### 4. Grouping Rules
+
+By default, all `filter` and `select` rules that are applied together are
+combined with an implicit `AND`. That is to say, data to be shared must meet all
+the criteria together. To stack particular rules to be applied together, or to
+combine rules with an `OR`, users can rely on the `group` mode. To create a
+`group` rule, the `mode` column needs to be set to `group`, and the rule IDs
+of the rules to be grouped should be listed in the `value` column, separated by a
+";". To specify how the rules are being grouped, the operator needs to be
+specified as `AND` or `OR`. Group-type rules can also be grouped together,
+creating nested group rules.
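+
+In boolean terms (a sketch, not the library's internals), a `group` rule just
+combines the row masks of the rules listed in its `value` column; compare with
+example 1 below, which groups rules 12 and 13 into rule 14:
+
+```python
+import pandas as pd
+
+# assuming a measures table with an aDateEnd column, as in example 1 below
+measures = pd.read_csv('measures.csv')
+
+r12 = measures['aDateEnd'] == '2022-02-01'  # filter rule 12
+r13 = measures['aDateEnd'] == '2023-02-01'  # filter rule 13
+r14 = r12 | r13                             # group rule 14: OR of 12;13
+
+print(measures[r14])
+```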
+
+Some examples are given below:
+
+1. Selecting only the rows where the value of "Analysis date end" (`aDateEnd`)
+is exactly equal to February 1st, 2022 (2022-02-01) or February 1st, 2023
+(2023-02-01) from the `measures` table.
+
+    | ruleId | table    | mode   | key      | operator | value      | notes     |
+    |--------|----------|--------|----------|----------|------------|-----------|
+    | 11     | measures | select | NA       | NA       | all        | This rule selects all the columns from the measures table for inclusion |
+    | 12     | measures | filter | aDateEnd | =        | 2022-02-01 | This rule takes all rows where analysis date end is February 1st, 2022 |
+    | 13     | measures | filter | aDateEnd | =        | 2023-02-01 | This rule takes all rows where analysis date end is February 1st, 2023 |
+    | 14     | NA       | group  | NA       | OR       | 12;13      | This rule groups rules 12 and 13 together with "OR", such that if either rule is true, the data is selected |
+
+2. Selecting only the rows where the value of `siteID` is exactly equal to "ottawa-1" or "laval-1" in the `samples` table.
+
+    | ruleId | table    | mode   | key      | operator | value      | notes     |
+    |--------|----------|--------|----------|----------|------------|-----------|
+    | 15     | samples  | select | NA       | NA       | all        | This rule selects all the columns from the samples table for inclusion |
+    | 16     | samples  | filter | siteID   | =        | ottawa-1   | This rule takes all rows with a siteID of ottawa-1 |
+    | 17     | samples  | filter | siteID   | =        | laval-1    | This rule takes all rows with a siteID of laval-1 |
+    | 18     | NA       | group  | NA       | OR       | 16;17      | This rule groups rules 16 and 17 together with "OR", such that if either rule is true, the data is selected |
+
+3. Selecting only the rows where the value of `siteID` is "ottawa-1" and the collection datetime (`collDT`) was February 1st, 2023 (2023-02-01) from the `samples` table.
+
+    | ruleId | table    | mode   | key      | operator | value      | notes     |
+    |--------|----------|--------|----------|----------|------------|-----------|
+    | 19     | samples  | select | NA       | NA       | all        | This rule selects all the columns from the samples table for inclusion |
+    | 20     | samples  | filter | siteID   | =        | ottawa-1   | This rule takes all rows with a siteID of ottawa-1 |
+    | 21     | samples  | filter | collDT   | =        | 2023-02-01 | This rule takes all rows with a collection date of February 1st, 2023 |
+    | 22     | NA       | group  | NA       | AND      | 20;21      | This rule groups rules 20 and 21 together with "AND", such that only rows that meet both conditions are selected |
+
+4. Selecting only the rows from the `measures` table that correspond to MPox measures between January 1st, 2021 and December 31st, 2021, or SARS-CoV-2 measures after January 1st, 2020.
+
+    | ruleId | table    | mode   | key        | operator | value      | notes     |
+    |--------|----------|--------|------------|----------|------------|-----------|
+    | 23     | measures | select | NA         | NA       | measure; value; unit; aggregation | This rule selects the measure, value, unit, and aggregation columns from the measures table for inclusion |
+    | 24     | measures | filter | measure    | =        | mPox       | This rule takes all rows with an MPox measure in the measures table |
+    | 25     | measures | filter | reportDate | in       | 2021-01-01:2021-12-31 | This rule takes all rows with a report date between Jan. 1 and Dec. 31, 2021 in the measures table |
+    | 26     | NA       | group  | NA         | AND      | 24;25      | This rule groups rules 24 and 25 together with "AND", such that only rows that meet both conditions are selected |
+    | 27     | measures | filter | measure    | =        | cov        | This rule takes all rows with a SARS-CoV-2 measure in the measures table |
+    | 28     | measures | filter | reportDate | >=       | 2020-01-01 | This rule takes all rows with a report date after Jan. 1, 2020 in the measures table |
+    | 29     | NA       | group  | NA         | AND      | 27;28      | This rule groups rules 27 and 28 together with "AND", such that only rows that meet both conditions are selected |
+    | 30     | NA       | group  | NA         | OR       | 26;29      | This rule groups rules 26 and 29 together with "OR", such that if either grouping of rules is true, the data is selected |
+
+### 5. Selecting an Organization for Sharing
+
+Once the rules in the sharing csv are defined, the next step is deciding to
+which organization(s) or person/people a rule applies. This is done using an
+additional rule row with the `mode` column's value set to `share`. A
+unique identifier for each organization or person should be used and reused
+throughout the entire document, and is used in the `key` column for sharing
+rules. This unique identifier should ideally correspond to an organization ID
+(`organizationID`) in the `organizations` table, or a contact ID (`contactID`)
+in the `contacts` table of the ODM. To apply a single rule across multiple
+organizations, the different organizations that a rule pertains to can be
+listed together in the `key` column. The listed organizations should be
+separated by a ";". For example, if a rule applies to the **Public Health
+Agency of Canada** (`organizationID = PHAC`) as well as **Ottawa Public
+Health** (`organizationID = OPH`) the value of the `key` cell in the row for
+that rule would be `PHAC;OPH`. The example assumes that PHAC and OPH are the
+agreed upon identifiers to represent these organizations. The rules to apply
+for the shared data output should be listed in the `value` column, with the
+various rule IDs separated by a ";". To specify different rules for different
+organizations/people, users will need to generate additional `share`-mode rules.
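+
+For instance, the example schema from the getting-started chapter
+(@sec-getting-started) ends with exactly such a rule, sharing the output of
+rules 1 and 2 with the OHRI organization:
+
+```bash
+ruleID,table,mode,key,operator,value,notes
+3,NA,share,OHRI,NA,1;2,"use rule 1 & 2 for the OHRI organization"
+```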
+
+Some examples of how these rules can be constructed and applied in practice are
+given below:
+
+1. Selecting all columns of the `measures` table, but only the rows where
+the value of "Analysis date end" (`aDateEnd`) is exactly equal to February 1st,
+2022 (2022-02-01) or February 1st, 2023 (2023-02-01), and everything from the
+`samples` table with the Public Health Agency of Canada (`organizationID =
+PHAC`) and Ottawa Public Health (`organizationID = OPH`). Using those same
+rules for Laval Public Health (`organizationID = LPH`), except only including
+the rows of the `samples` table where the value of `siteID` is exactly equal to
+"ottawa-1" or "laval-1".
+
+    | ruleId | table    | mode   | key      | operator | value      | notes     |
+    |--------|----------|--------|----------|----------|------------|-----------|
+    | 11     | measures | select | NA       | NA       | all        | This rule selects all the columns from the measures table for inclusion |
+    | 12     | measures | filter | aDateEnd | =        | 2022-02-01 | This rule takes all rows where analysis date end is February 1st, 2022 |
+    | 13     | measures | filter | aDateEnd | =        | 2023-02-01 | This rule takes all rows where analysis date end is February 1st, 2023 |
+    | 14     | NA       | group  | NA       | OR       | 12;13      | This rule groups rules 12 and 13 together with "OR", such that if either rule is true, the data is selected |
+    | 15     | samples  | select | NA       | NA       | all        | This rule selects all the columns from the samples table for inclusion |
+    | 16     | samples  | filter | siteID   | =        | ottawa-1   | This rule takes all rows with a siteID of ottawa-1 |
+    | 17     | samples  | filter | siteID   | =        | laval-1    | This rule takes all rows with a siteID of laval-1 |
+    | 18     | NA       | group  | NA       | OR       | 16;17      | This rule groups rules 16 and 17 together with "OR", such that if either rule is true, the data is selected |
+    | 31     | NA       | share  | OPH;PHAC | NA       | 11;14;15   | Share all measures from Feb. 1 2022 and 2023, and all samples information |
+    | 32     | NA       | share  | LPH      | NA       | 11;14;15;18| Share all measures from Feb. 1 2022 and 2023, and all samples from ottawa and laval |
+
+2. Share MPox data from 2021 with Ottawa Public Health (`organizationID =
+   OPH`), share all SARS-CoV-2 data since 2020 with Laval Public Health
+   (`organizationID = LPH`), and share MPox data from 2021 and all SARS-CoV-2
+   data since 2020 with the Public Health Agency of Canada (`organizationID =
+   PHAC`).
+
+    | ruleId | table    | mode   | key        | operator | value      | notes     |
+    |--------|----------|--------|------------|----------|------------|-----------|
+    | 23     | measures | select | NA         | NA       | measure; value; unit; aggregation | This rule selects the measure, value, unit, and aggregation columns from the measures table for inclusion |
+    | 24     | measures | filter | measure    | =        | mPox       | This rule takes all rows with an MPox measure in the measures table |
+    | 25     | measures | filter | reportDate | in       | 2021-01-01:2021-12-31 | This rule takes all rows with a report date between Jan. 1 and Dec. 31, 2021 in the measures table |
+    | 26     | NA       | group  | NA         | AND      | 24;25      | This rule groups rules 24 and 25 together with "AND", such that only rows that meet both conditions are selected |
+    | 27     | measures | filter | measure    | =        | cov        | This rule takes all rows with a SARS-CoV-2 measure in the measures table |
+    | 28     | measures | filter | reportDate | >=       | 2020-01-01 | This rule takes all rows with a report date after Jan. 1, 2020 in the measures table |
+    | 29     | NA       | group  | NA         | AND      | 27;28      | This rule groups rules 27 and 28 together with "AND", such that only rows that meet both conditions are selected |
+    | 30     | NA       | group  | NA         | OR       | 26;29      | This rule groups rules 26 and 29 together with "OR", such that if either grouping of rules is true, the data is selected |
+    | 33     | NA       | share  | OPH        | NA       | 23;26      | Share MPox data from 2021 with Ottawa Public Health |
+    | 34     | NA       | share  | LPH        | NA       | 23;29      | Share all SARS-CoV-2 data since 2020 with Laval Public Health |
+    | 35     | NA       | share  | PHAC       | NA       | 23;30      | Share MPox data from 2021 and all SARS-CoV-2 data since 2020 with PHAC |
+
+## Example Scenarios
+
+In this section we will be working with some data, providing an example
+scenario for a rule and showing what the rule looks like in practice.
+
+### Filtering on license type
+
+One special case for filtering is using the license type (`license` in the
+`datasets` table, or `measureLic` in the `measures` table). This is more useful
+for data generators and custodians who work with a mix of open and private
+data. By filtering on open data only, or on open data with a specific license, all
+of the data and metadata that are open can be shared, without needing to
+specify additional sharing filters. For example, to share all data in a given
+dataset:
+
+| ruleId | table | mode   | key        | operator | value | notes |
+|--------|-------|--------|------------|----------|-------|-----------------------------------------------------------------------------------------------------------|
+| 1      | all   | select | NA         | NA       | all   | This rule selects all the columns and tables for inclusion |
+| 2      | all   | filter | license    | =        | open  | This rule takes all rows where the license is open |
+| 3      | all   | filter | measureLic | =        | open  | This rule takes all rows where the measure license is open |
+| 4      | NA    | group  | NA         | OR       | 2; 3  | This rule groups rules 2 and 3 together with "OR", such that if either rule is true, the data is selected |
+| 5      | NA    | share  | PHAC       | NA       | 1; 4  | This rule specifies that the data should be filtered using rules 1 and 4, and shared with PHAC |
+
+For an example that pulls only open measures:
+
+| ruleId | table    | mode   | key        | operator | value | notes |
+|--------|----------|--------|------------|----------|-------|------------------------------------------------------------------------------------------------|
+| 1      | measures | select | NA         | NA       | all   | This rule selects all the columns from the measures tables for inclusion |
+| 2      | measures | filter | measureLic | =        | open  | This rule takes all rows in the measures table where the measure license is open |
+| 3      | NA       | share  | PHAC       | NA       | 1; 2  | This rule specifies that the data should be filtered using rules 1 and 2, and shared with PHAC |
+
+### General Example
+
+The data we will be working with has two tables from the ODM, **samples** and
+**sites**. It does not include all the columns present in these tables.
The
+rows in the samples and sites tables respectively are shown below:
+
+**samples**:
+| sampleID  | siteID   | collDT     | saMaterial | reportable | notes  |
+|-----------|----------|------------|------------|------------|--------|
+| ottWa19-1 | ottawa-1 | 2021-08-19 | rawWW      | TRUE       | Note 1 |
+| ottWa18-1 | ottawa-1 | 2021-08-18 | sweSed     | TRUE       | Note 2 |
+| ottWa17-1 | laval-1  | 2021-08-17 | pstGrit    | TRUE       | Note 3 |
+| ottWa10-1 | laval-1  | 2020-01-10 | water      | FALSE      | Note 4 |
+
+**sites**:
+| siteID   | name                 | repOrg1 | sampleshed |
+|----------|----------------------|---------|------------|
+| ottawa-1 | University of Ottawa | OPH     | school     |
+| laval-1  | University of Laval  | LPH     | school     |
+
+#### Basic Example
+
+1. Share all columns in the `samples` table, but select only rows whose site
+ID is "ottawa-1" for Ottawa Public Health (OPH)
+
+    | ruleId | table    | mode   | key        | operator | value      | notes     |
+    |--------|----------|--------|------------|----------|------------|-----------|
+    | 1      | samples  | select |            |          | all        |           |
+    | 2      | samples  | filter | siteID     | =        | ottawa-1   |           |
+    | 3      | NA       | share  | OPH        |          | 1;2        |           |
+
+2. Share all columns in the `samples` table, but select rows whose sample
+material (`saMaterial`) is `rawWW` or `sweSed` for the Public Health Agency of
+Canada (PHAC)
+
+    | ruleId | table   | mode   | key        | operator | value        | notes |
+    |--------|---------|--------|------------|----------|--------------|-------|
+    | 4      | samples | select |            |          | all          |       |
+    | 5      | samples | filter | saMaterial | in       | rawWW;sweSed |       |
+    | 6      | NA      | share  | PHAC       |          | 4;5          |       |
+
+3. Share all rows, but select the `notes` column from all tables for Laval
+Public Health (LPH)
+
+    | ruleId | table    | mode   | key        | operator | value        | notes     |
+    |--------|----------|--------|------------|----------|--------------|-----------|
+    | 7      | all      | select |            |          | notes        |           |
+    | 8      | NA       | share  | LPH        |          | 7            |           |
+
+4. Share all columns, but select only the rows for samples taken in the year
+2021 and that have been marked as 'reportable' for Ottawa Public Health (OPH)
+and the Public Health Agency of Canada (PHAC)
+
+    | ruleId | table   | mode   | key        | operator | value                 | notes |
+    |--------|---------|--------|------------|----------|-----------------------|-------|
+    | 9      | all     | select |            |          | all                   |       |
+    | 10     | samples | filter | reportable | =        | TRUE                  |       |
+    | 11     | samples | filter | collDT     | in       | 2021-01-01:2021-12-31 |       |
+    | 12     | NA      | group  |            | AND      | 10;11                 |       |
+    | 13     | NA      | share  | OPH;PHAC   |          | 9;12                  |       |
+
+5. Select all columns from the samples and sites tables, but only rows that
+   belong to the University of Laval for Laval Public Health (LPH)
+
+    | ruleId | table    | mode   | key        | operator | value        | notes     |
+    |--------|----------|--------|------------|----------|--------------|-----------|
+    | 14     | all      | select |            |          | all          |           |
+    | 15     | all      | filter | siteID     | =        | laval-1      |           |
+    | 16     | NA       | share  | LPH        |          | 14;15        |           |
+
+### A Note on Filter and Select, Groups
+
+When specifying the columns to include in the shared data with a `select`
+rule, it is implied that all rows will be included **unless** a filter has
+also been specified separately. Conversely, specifying the rows you want to
+include with a `filter` rule **does not** specify that the column used for
+filtering should be included in the `filtered_data` output. `select` is the
+only way to specify columns for inclusion.
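+
+As a rough mental model (a pandas sketch, not the library's actual code):
+`select` decides which columns are projected into the output, while `filter`
+only builds a row mask, so the filter's key column never has to appear in the
+shared result:
+
+```python
+import pandas as pd
+
+samples = pd.read_csv('samples.csv')
+
+mask = samples['siteID'] == 'laval-1'        # filter rule: key is siteID
+shared = samples.loc[mask, ['collDT',        # select rule: output columns;
+                            'saMaterial']]   # siteID itself isn't shared
+```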
+
+As such, if you wanted to share all of the `samples` table data with Laval
+Public Health (LPH), it would suffice to define the rules as:
+
+    | ruleId | table    | mode   | key        | operator | value        | notes     |
+    |--------|----------|--------|------------|----------|--------------|-----------|
+    | 1      | samples  | select |            |          | all          |           |
+    | 2      | NA       | share  | LPH        |          | 1            |           |
+
+Similarly, if you only wanted to share the measure, value, and unit columns for
+the rows whose siteID belongs to the University of Laval, but did not want to
+share the siteID column, the rules would be:
+
+    | ruleId | table    | mode   | key    | operator | value              | notes |
+    |--------|----------|--------|--------|----------|--------------------|-------|
+    | 1      | measures | select |        |          | measure;value;unit |       |
+    | 2      | measures | filter | siteID | =        | laval-1            |       |
+    | 3      | NA       | share  | LPH    |          | 1;2                |       |
+
+With group-type rules, the rules are combined with an `AND` or `OR` operator,
+and the rules to be combined are listed in the value field. Similarly, when
+specifying the sharing target, users also list the rules to apply for the
+output. The result is that, when sharing, the library also runs an implicit
+grouping action as part of this activity.
+
+
+| ruleId | table | mode | key | operator | value | notes |
+| --- | --- | --- | --- | --- | --- |-----------|
+| 1 | measures | select | NA | NA | measure;value;unit | |
+| 4 | measures | filter | measure | = | mPox | |
+| 5 | measures | filter | reportDate | in | 2021-01-01:2021-12-31 | |
+| 6 | measures | filter | measure | = | cov | |
+| 7 | measures | filter | reportDate | >= | 2020-01-01 | |
+| 8 | NA | group | NA | AND | 4; 5 | |
+| 9 | NA | group | NA | AND | 6; 7 | |
+| 10 | NA | group | NA | OR | 8; 9 | |
+| 11 | measures | filter | reportable | = | TRUE | |
+| 12 | NA | share | PHAC | NA | 10;11 | |
+
+Which implicitly generates -->
+
+| ruleId | table | mode | key | operator | value | notes |
+| --- | --- | --- | --- | --- | --- | ----------- |
+| 13 | measures | group | NA | AND | 10;11 | |
+
+Which then generates the SQL query for sharing with PHAC for this example -->
+
+```
+select measure, value, unit from measures where ((4 and 5) or (6 and 7)) and 11
+```
+
+## Sharing CSV Columns
+
+This section summarizes all the columns that are a part of the sharing file.
+
+**ruleId**: Mandatory for all rules. Recommended to use sequential integers for
+naming, but can be a number or a string. If a string, then it's recommended to
+use [snake_case](https://en.wikipedia.org/wiki/Snake_case) - spaces in names
+are not supported. Each value should be unique across an entire sharing file
+(`sharing.csv`).
+
+**table**: The name(s) of the tables for this rule. Allowable values are names
+(partIDs) of the tables separated by a `;`, or `all` to select all tables.
+
+**mode**: The activity and modality of a rule. Allowable values are:
+ - `select`: used for rules that define which tables and columns are to be
+   shared. Requires values in the `ruleID`, `table`, `mode`, and `value`
+   columns of the sharing csv.
+ - `filter`: used for rules that define which rows of data are appropriate for
+   sharing. Requires values in the `ruleID`, `table`, `mode`, `key`,
+   `operator` and `value` columns of the sharing csv.
+ - `group`: used for grouping together rules that should be applied as
+   combined conditions, using either `AND` or `OR` as the operator. Requires
+   values in the `ruleID`, `mode`, `operator` and `value` columns of the
+   sharing csv.
 - `share`: used for rules defining the target for the sharing data output.
+   Requires values in the `ruleID`, `mode`, `key` (to specify the
+   organizationID(s) or contactID(s)) and `value` (to specify the rules to
+   apply for the output) columns of the sharing csv.
+
+**key**: The argument used to specify the header or headers used for a
+filtering rule, or the destination organization or person for a sharing rule.
+Multiple headers can be listed, and likewise multiple organizations/individuals
+can be separated by a `;`. Also supports the keyword `all`. The organizations here
+reference the organizations table (`organizationID`), or the contacts table
+(`contactID`) in the ODM data.
+
+**operator**: The operator used to define the logic of filtering and grouping
+rules. For `filter`-mode rules, the `=`, `>`, `<`, `>=`, and `<=`
+operators are supported, along with `in` for ranges of continuous data or sets
+of discrete values. For `group`-mode rules, the acceptable values for this
+field are `AND` or `OR`.
+
+**value**: Specifies the columns for selection rules, the values for filtering
+rules, and the rules to be combined for grouping and sharing rules. Discrete,
+listed values in this field should be separated by a ";".
+
+**notes**: An optional, free-text description or notes explaining this rule, or
+other related information deemed worthy of sharing. diff --git a/docs/manual/chapters/sqlite.qmd b/docs/manual/chapters/sqlite.qmd new file mode 100644 index 00000000..f7da63f2 --- /dev/null +++ b/docs/manual/chapters/sqlite.qmd @@ -0,0 +1,87 @@ +# SQLite {#sec-sqlite}
+
+## Install
+
+To install the SQLite library, follow the steps below for your system, then
+[verify that it's working in Python](#verify-sqlite-support-in-python).
+
+### Windows
+
+1. **Download SQLite:** Go to the
+   [SQLite download page](https://www.sqlite.org/download.html) and download
+   `sqlite-dll-win-x64-3XXXXXX.zip`.
+
+2. **Extract the Files:** Extract the contents of the ZIP file to a directory
+   of your choice, for example, `C:\sqlite`.
+
+3. **Add SQLite to PATH:** Add the directory chosen above, now containing
+   `sqlite3.dll`, to your system's PATH environment variable:
+   1. Open the Start Menu, search for "Environment Variables", and select "Edit
+      the system environment variables".
+   1. In the System Properties window, click on the "Environment Variables"
+      button.
+   1. In the Environment Variables window, find the "Path" variable in the
+      "System variables" section and click "Edit".
+   1. Click "New" and add the path chosen above.
+   1. Click "OK" to close all windows.
+
+4. **Verify the Installation:** Open a new Command Prompt window and run
+   `sqlite3 --version`. This should print the version of SQLite installed.
+
+### macOS
+
+Run the following commands in a terminal.
+
+1. **Install Homebrew.** Homebrew is a popular package manager for macOS. If
+   you don't have Homebrew installed, you can install it by running the
+   following command:
+    ```bash
+    /bin/bash -c "$(curl -fsSL \
+        https://raw.githubusercontent.com/Homebrew/install/HEAD/install.sh)"
+    ```
+
+2. **Install SQLite using Homebrew**:
+    ```bash
+    brew install sqlite3
+    ```
+
+3. **Verify the Installation**:
+    ```bash
+    sqlite3 --version
+    ```
+    This should print the version of SQLite installed.
+
+### Linux
+
+To install SQLite on (Ubuntu) Linux, you can run the following commands:
+
+1. **Update package list**:
+    ```bash
+    sudo apt update
+    ```
+
+2. **Install SQLite**:
+    ```bash
+    sudo apt install sqlite3
+    ```
+
+3. **Verify installation**:
+    ```bash
+    sqlite3 --version
+    ```
+    This should print the version of SQLite installed.
+
+## Verify SQLite Support in Python
+
+ After installing SQLite, you should verify that SQLite works in Python.
+
+ Open a Python shell (by typing `python` (or `python3`) in a terminal) and
+ run the following commands:
+
+ ```python
+ import sqlite3
+ sqlite3.sqlite_version
+ ```
+
+ If the version is displayed, and no errors are encountered, then SQLite is
+ ready for use with Python. diff --git a/docs/manual/index.qmd b/docs/manual/index.qmd new file mode 100644 index 00000000..cd7ca14d --- /dev/null +++ b/docs/manual/index.qmd @@ -0,0 +1,14 @@ +---
+title: Introduction
+---
+
+![](assets/odm-logo.png){width=100px}
+
+The PHES-ODM Sharing Library supports wastewater-based surveillance and
+epidemiology by facilitating the collection, standardization, and transparency
+of data by providing a more harmonized ODM data format to share data between
+different data curators and data repositories.
+
+In the upcoming chapters, we'll explore how to install and use the package.
+
+ diff --git a/docs/spec/design-spec.md b/docs/spec/design-spec.md new file mode 100644 index 00000000..8338fcf6 --- /dev/null +++ b/docs/spec/design-spec.md @@ -0,0 +1,860 @@ +# Aim/Objective:
+
+The purpose of the ODM is to support wastewater-based surveillance and
+epidemiology by facilitating the collection, standardization, and transparency
+of data by providing a more harmonized ODM data format to share data between
+different data curators and data repositories.
+
+The ODM supports data sharing in two ways:
+
+1. **Data sharing schema** - The ODM will have a schema that describes what data can be shared with one or more partners or users.
+2. **Data filter based on the data sharing schema** - The ODM will support an open source method for filtering data tables using the data sharing schema. This will be accomplished in large part by generating one, or a series of, SQL queries that can be used to pull selected data from a variety of different database structures.
+
+The data sharing schema will be a csv file (sharing.csv) where each row in the `sharing` file corresponds to one or more headers and/or tables in the PHES-ODM, or one or more organizations with whom to share the output data. See below for an example.
+
+| ruleId | table    | mode   | key        | operator | value                                       | notes                  |
+|--------|----------|--------|------------|----------|---------------------------------------------|------------------------|
+| 1      | measures | select | NA         | NA       | measureRepID,measure,value,unit,aggregation | basic measures details |
+| 2      | measures | filter | measure    | =        | mPox                                        |                        |
+| 3      | measures | filter | reportable | =        | TRUE                                        | NA                     |
+| 4      | NA       | share  | OPH        | NA       | 1; 2                                        | link to DSA            |
+| 5      | NA       | share  | PHAC       | NA       | 1; 2; 3                                     | changed for new DSA    |
+
+The `sharing` file should be accompanied by a metadata file, `sharing_metadata.csv`.
This csv file provides additional information about the sharing schema, as shown in the example below:
+
+| name          | datasetID     | organizationID | numberRules | orgsServed | contactID   | version | firstReleased | lastUpdated | changes                                | notes                                      |
+|---------------|---------------|----------------|-------------|------------|-------------|---------|---------------|-------------|----------------------------------------|--------------------------------------------|
+| quebecSharing | universityLab | university-1   | 5           | PHAC;OPH   | lastnamePer | 1.0.0   | 2024-03-26    | 2024-03-26  | Deprecated outdated rules for LPH, OPH | in line with DSA number 234, 565, and 901. |
+
+The data filter is a Python module (or function) that generates a SQL query to
+pull the shareable data based on the inclusion criteria in the data sharing
+schema. The function queries ODM-formatted data tables and takes a sharing
+schema as an input. The function includes (filters) data according to the
+schema rules. The function then returns a data table with only the data that is
+to be shared. This new, returned data is ready to be shared and used with a
+partner.
+
+# Features
+
+High level features include:
+
+- The data custodian should be able to define all the sharing rules in a CSV
+  file (`sharing.csv`). A standard schema for defining the rules will be
+  developed.
+- The schema should allow a data custodian to define the partner (organization
+  or person - matching to an `organizationID` and/or `contactID` within the
+  model) that each rule pertains to. For example, a certain rule or set of
+  rules may be applicable only to the Public Health Agency of Canada (PHAC)
+  while another rule may be applicable to not only the PHAC but also to Ottawa
+  Public Health.
+- The schema should allow data custodians to define rules that apply to rows or
+  to columns. For example, a rule can be made to share all the rows from the
+  `samples` table, and/or to only include the `collType` column from the
+  `samples` table.
+- The schema is built using the logic and arguments of the `filter()` and
+  `select()` functions of the dplyr package in R. When specifying details of
+  the filter function (mode), use of the `=`, `>`, `<`, `>=`, and `<=`
+  operators are supported, along with `in` for ranges of continuous data and
+  `NA` where the operator is not relevant, like for the select function (mode).
+- Rules can be combined to form more powerful conditions by building across
+  rows of the `sharing` csv. For example, include all rows with `email` equal
+  to "[john.doe\@email.com](mailto:john.doe@email.com){.email}", `firstName`
+  equal to "John", and `lastName` equal to "Doe". This is accomplished by
+  grouping rules together using the mode `group` with the operators `AND` or
+  `OR`, creating customized combinations of conditions.
+- Rules can be made within the context of an entire table, to a column that may
+  be present in more than one table, or to a column specific to a table. Rules
+  can also be made at the level of all measures or datasets with a given
+  license type.
+- The rules may only be inclusive. For example, rules can be defined to include
+  rows but not to exclude them.
+- The data custodian will be returned a report at the end which will provide
+  details about how many rows were filtered for inclusion in the shareable
+  data, as well as the tables and headers selected for inclusion.
+- Version 2 of the PHES-ODM allows data generators and custodians to define data
+  licenses.
+  In some jurisdictions, this may be defined in detailed data-sharing
+  agreements (DSA). The DSAs can be short, simply referencing a license type,
+  or they can be many pages identifying specifically who can use the data, for
+  what purpose, and what the data destruction protocols will be, etc. The
+  notes column in the `sharing.csv` is a free text field, providing an
+  opportunity to reference a longer document or provide more details. Most
+  licenses currently supported by the ODM license field are open.
+- The implementation should take into account the relationship between the
+  different tables as defined in the ODM. For example, removing a row with
+  `siteID = ottawa-1` from the sites table should also remove all rows in the
+  samples table with `siteID = ottawa-1`. All nested relationships should be
+  handled in the same way. The relationships between the tables can be seen
+  [here](https://lucid.app/lucidchart/847978df-d627-4b8a-a379-faca7a517ef4/edit?invitationId=inv_0de7777b-888b-4d8a-827d-2306bdc48cce&page=4OvE58YH3w..#).
+- A Python function that implements these rules should be built.
+
+# Sharing CSV
+
+## Introduction
+
+The sharing CSV file provides different data generators or custodians with a
+standardized and code-agnostic method to define rules for sharing data with
+different organizations. Each row in the CSV file defines one rule; taken
+together, the rows define all the sharing rules for a given data generator or
+custodian. The headers of the CSV file define the different parts of the rule.
+The following sections outline these different parts, and provide a guide to
+defining a rule.
+
+### 1. Setting the ruleID
+
+Because each sharing schema is a closed system to a given data generator or
+data custodian, the ruleIDs only need to be unique within a given schema
+(`sharing.csv`). Using sequential integers for ruleIDs works well, and is the
+recommended approach.
+
+### 2. Rules and Modes
+
+After defining the unique ID for a rule, the next step is to determine the
+`mode` of a rule. There are four possible values for the `mode` column:
+
+1. `select`: This indicates that the effect of this rule will be to select the
+   tables and columns for inclusion in the output shareable data. It also means
+   that the `key` and `operator` columns do not need to be defined for this
+   rule.
+2. `filter`: This is used for rules that will filter the shareable data output
+   rows based on row values. The full rule will require the `key` and
+   `operator` columns to be fully specified.
+3. `group`: This defines a rule that groups or combines other rules together
+   for execution. The full rule will require the `operator` column to be fully
+   specified.
+4. `share`: This defines a rule that specifies the `organizationID` or
+   `contactID` with which an output will be shared, as well as the rules to
+   apply to generate the specific output data. The full rule will require the
+   `key` column, but not the `operator` column, to be fully specified.
+
+Generally, the bulk of a sharing csv will be composed of `filter` and `select`
+rules, with a few `group` rules, and the final `share` rules at the very end.
+Rules should also be written and specified in this same order.
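+As an illustration of how these columns come together, a parsed rule might be
+represented in code roughly as follows. This is a hypothetical sketch for the
+reader's intuition; the class and field names are not part of the
+specification:
+
+```python
+# A minimal sketch of a parsed sharing rule. Names are illustrative only.
+from dataclasses import dataclass
+from enum import Enum
+from typing import List, Optional
+
+
+class Mode(Enum):
+    SELECT = 'select'
+    FILTER = 'filter'
+    GROUP = 'group'
+    SHARE = 'share'
+
+
+@dataclass
+class Rule:
+    rule_id: int
+    mode: Mode
+    tables: List[str]        # empty for group- and share-mode rules
+    key: Optional[str]       # filter column, or share target org(s)
+    operator: Optional[str]  # =, <, <=, >, >=, in, AND, OR
+    values: List[str]        # the ";"-separated entries of the value column
+
+
+# e.g. rule 2 from the example schema above:
+rule = Rule(rule_id=2, mode=Mode.FILTER, tables=['measures'],
+            key='measure', operator='=', values=['mPox'])
+```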
+### 3. Selecting an Entity
+
+In order to generate an intelligible output dataset, several `select` and
+`filter` rules will need to first be stacked and applied. This step involves
+selecting the parts of the PHES-ODM, or entities, within the model. The
+entities that can be selected are:
+
+- Data contained in a table
+- Data contained in column(s) of table(s)
+- Data contained in row(s) of table(s)
+
+This step uses four columns: `table`, `mode`, `key`, and/or `value`. The
+`table` column specifies the name(s) of the table(s) to which this rule
+applies. To list multiple tables in the `table` column, list each table
+separated by a ";". The `mode` column specifies the action of a rule. For
+`mode = filter` rules, the `key` column lists the name(s) of the column(s)
+that the filter acts on. For `mode = select` rules, the names of the selected
+columns are specified in the `value` column. For rules that select entities,
+the `filter` and `select` modes will be used.
+
+#### 3.1. Selecting Columns
+
+In order to have any data to share, tables and columns need to be specified for
+inclusion. These are the first rules to define in your schema. To specify which
+columns should be shared, specify the table or tables in the `table` column,
+list `select` in the `mode` column, and then list the column or columns to be
+shared in the `value` column. When specifying the columns, you can separate
+distinct column names with a ";". The `key` and `operator` columns should be
+left blank (or `NA`) as they are not used in these rules, and any values in
+these columns for `select`-mode rows will be ignored.
+
+To select all columns, an `all` value can be used in the `value` column of the
+sharing csv.
+
+Some examples are given below:
+
+1. Selecting only the `saMaterial` column in the `samples` table
+
+   | ruleId | table   | mode   | key | operator | value      | notes |
+   |--------|---------|--------|-----|----------|------------|-------|
+   | 1      | samples | select | NA  | NA       | saMaterial | NA    |
+
+2. Selecting only the `reportable` and the `pooled` columns in the `measures`
+table
+
+   | ruleId | table    | mode   | key | operator | value             | notes |
+   |--------|----------|--------|-----|----------|-------------------|-------|
+   | 2      | measures | select | NA  | NA       | reportable;pooled | NA    |
+
+3. Selecting all the columns in the `measures` table
+
+   | ruleId | table    | mode   | key | operator | value | notes |
+   |--------|----------|--------|-----|----------|-------|-------|
+   | 3      | measures | select | NA  | NA       | all   | NA    |
+
+4. Selecting only the `purposeID` column in the `measures` and the `samples` table
+
+   | ruleId | table            | mode   | key | operator | value     | notes |
+   |--------|------------------|--------|-----|----------|-----------|-------|
+   | 4      | measures;samples | select | NA  | NA       | purposeID | NA    |
+
+Notes:
+
+- In examples 2 and 4, where multiple columns and tables were selected
+  respectively, a `;` was used to separate the values. Throughout this entire
+  document, when multiple values need to be listed in a single cell, the `;`
+  symbol should be used to separate discrete values.
+
+- In example 3, where all the columns in a table were selected, the keyword
+  `all` was used. Similar to the `;` symbol, the keyword `all` may be used in
+  a cell to mean everything.
+
+- The **ruleId** column is mandatory for all rules and each value must be
+  unique across the entire sheet (`sharing.csv`). Sequential integers are
+  recommended (see the column reference near the end of this document).
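+To make the effect of `select` rules concrete, the sketch below applies rules
+1 and 2 from the examples above to small pandas DataFrames. This is purely
+illustrative; `apply_select` is a hypothetical helper, and the library itself
+is specified to work through generated SQL queries instead:
+
+```python
+# Sketch: applying a select rule's value cell to an in-memory table.
+from typing import List
+
+import pandas as pd
+
+
+def apply_select(df: pd.DataFrame, value: str) -> pd.DataFrame:
+    '''keep only the columns named in a select rule's value cell'''
+    if value == 'all':
+        return df
+    columns: List[str] = value.split(';')  # ";"-separated column names
+    return df[columns]
+
+
+samples = pd.DataFrame({'sampleID': ['s1'], 'saMaterial': ['rawWW']})
+measures = pd.DataFrame({'measure': ['mPox'], 'reportable': [True],
+                         'pooled': [False]})
+
+print(apply_select(samples, 'saMaterial'))          # rule 1
+print(apply_select(measures, 'reportable;pooled'))  # rule 2
+```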
+#### 3.2. Filtering Rows
+
+Once the columns and tables for inclusion have been specified, users can
+specify which rows should be shared using rules with the `filter` mode. Note
+that filter rules can use values in any columns, including columns that are
+not being shared in the final output. To specify a `filter` rule, users need
+to specify the table or tables in the `table` column, and define the `mode` as
+filter. Then users can specify the columns which the filter will act on in the
+`key` column, specify the nature of the filter using the `operator` column,
+and the filter values in the `value` column. The general structure for the
+filter argument is:
+
+```
+**column name** **operator** **value**
+```
+
+Here the "column name" is the name of a column (specified in the `key` column)
+from the table(s) specified in the `table` column, and the "value" is the value
+or range of values that determines whether a row is selected for sharing,
+stored in the `value` column. The "operator" is the symbol that indicates the
+nature of the filter to be applied, and the desired relationship between the
+`key` and the `value`. The currently accepted values for the `operator` column
+are:
+
+- **=**: Denotes exact equivalence. This should be used for categorical or
+  character variables.
+- **\>**: Denotes "greater-than". This can be used for numeric, integer, or
+  date-type variables. Note that it is exclusive of the value used in the
+  expression.
+- **\<**: Denotes "lesser-than". This can be used for numeric, integer, or
+  date-type variables. Note that it is exclusive of the value used in the
+  expression.
+- **\>=**: Denotes "greater-than-or-equal-to". This can be used for numeric,
+  integer, or date-type variables. Note that it is inclusive of the value
+  used in the expression.
+- **\<=**: Denotes "lesser-than-or-equal-to". This can be used for numeric,
+  integer, or date-type variables. Note that it is inclusive of the value
+  used in the expression.
+- **in**: Denotes that a value is contained in either a range of continuous
+  data or a set of values. Range values are separated by `:` and can be used
+  for numeric, integer, or date-type variables, while sets are separated by
+  `;` and can be of any type. Note that both ranges and sets are inclusive of
+  the values used in the expression.
+
+Technically the `operator` column also accepts `AND` and `OR` as values, but
+only for rules of the `group` mode.
+
+Some examples of how these rules can be constructed and applied in practice are
+given below:
+
+1. Selecting only the rows where the value of `siteID` is exactly equal to
+"ottawa-1" in the `samples` table.
+
+   | ruleId | table   | mode   | key    | operator | value    | notes |
+   |--------|---------|--------|--------|----------|----------|-------|
+   | 6      | samples | filter | siteID | =        | ottawa-1 |       |
+
+2. Selecting only the rows where the value of "Collection period" (`collPer`)
+is greater than or equal to 5 in the `samples` table.
+
+   | ruleId | table   | mode   | key     | operator | value | notes |
+   |--------|---------|--------|---------|----------|-------|-------|
+   | 7      | samples | filter | collPer | >=       | 5     |       |
+
+3. Selecting only the rows where the value of "Collection period" (`collPer`)
+is less than or equal to 5 in the `samples` table.
+
+   | ruleId | table   | mode   | key     | operator | value | notes |
+   |--------|---------|--------|---------|----------|-------|-------|
+   | 8      | samples | filter | collPer | <=       | 5     |       |
+
+4. Selecting only the rows where the value of "Analysis date end" (`aDateEnd`)
+is exactly equal to February 1st, 2022 (2022-02-01) from the `measures` table.
+
+   | ruleId | table    | mode   | key      | operator | value      | notes |
+   |--------|----------|--------|----------|----------|------------|-------|
+   | 9      | measures | filter | aDateEnd | =        | 2022-02-01 |       |
+
+5. Selecting only the rows where the value of "Analysis date end" (`aDateEnd`)
+is a date in February from the `measures` table.
+
+   | ruleId | table    | mode   | key      | operator | value                 | notes |
+   | ------ | -------- | ------ | -------- | -------- | --------------------- | ----- |
+   | 10     | measures | filter | aDateEnd | in       | 2022-02-01:2022-02-28 |       |
+
+6. Selecting only the rows where the value of `siteID` is either "ottawa-1" or
+   "laval-1" in the `samples` table.
+
+   | ruleId | table   | mode   | key    | operator | value            | notes |
+   | ------ | ------- | ------ | ------ | -------- | ---------------- | ----- |
+   | 10     | samples | filter | siteID | in       | ottawa-1;laval-1 |       |
+
+### 4. Grouping Rules
+
+By default, all `filter` and `select` rules that are applied together are
+combined with an implicit `AND`. That is to say, data to be shared must meet all
+the criteria together. To stack particular rules to be applied together, or to
+combine rules with an `OR`, users can rely on the `group` mode. To create a
+`group` rule, the `mode` column needs to be set to `group`, and the rule IDs of
+the rules to be grouped should be listed in the `value` column, separated by a
+";". To specify how the rules are being grouped, the operator needs to be
+specified as `AND` or `OR`. Group-type rules can also be grouped together,
+creating nested group rules.
+
+Some examples are given below:
+
+1. Selecting only the rows where the value of "Analysis date end" (`aDateEnd`)
+is exactly equal to February 1st, 2022 (2022-02-01) or February 1st, 2023
+(2023-02-01) from the `measures` table.
+
+   | ruleId | table    | mode   | key      | operator | value      | notes |
+   |--------|----------|--------|----------|----------|------------|-------|
+   | 11     | measures | select | NA       | NA       | all        | This rule selects all the columns from the measures table for inclusion |
+   | 12     | measures | filter | aDateEnd | =        | 2022-02-01 | This rule takes all rows where analysis date end is February 1st, 2022 |
+   | 13     | measures | filter | aDateEnd | =        | 2023-02-01 | This rule takes all rows where analysis date end is February 1st, 2023 |
+   | 14     | NA       | group  | NA       | OR       | 12;13      | This rule groups rules 12 and 13 together with "OR", such that if either rule is true, the data is selected |
+
+2. Selecting only the rows where the value of `siteID` is exactly equal to "ottawa-1" or "laval-1" in the `samples` table.
+
+   | ruleId | table   | mode   | key    | operator | value    | notes |
+   |--------|---------|--------|--------|----------|----------|-------|
+   | 15     | samples | select | NA     | NA       | all      | This rule selects all the columns from the samples table for inclusion |
+   | 16     | samples | filter | siteID | =        | ottawa-1 | This rule takes all rows with a siteID of ottawa-1 |
+   | 17     | samples | filter | siteID | =        | laval-1  | This rule takes all rows with a siteID of laval-1 |
+   | 18     | NA      | group  | NA     | OR       | 16;17    | This rule groups rules 16 and 17 together with "OR", such that if either rule is true, the data is selected |
+
+3. Selecting only the rows where the value of `siteID` is "ottawa-1" and the collection datetime (`collDT`) was February 1st, 2023 (2023-02-01) from the `samples` table.
+
+   | ruleId | table   | mode   | key    | operator | value      | notes |
+   |--------|---------|--------|--------|----------|------------|-------|
+   | 19     | samples | select | NA     | NA       | all        | This rule selects all the columns from the samples table for inclusion |
+   | 20     | samples | filter | siteID | =        | ottawa-1   | This rule takes all rows with a siteID of ottawa-1 |
+   | 21     | samples | filter | collDT | =        | 2023-02-01 | This rule takes all rows with a collection date of February 1st, 2023 |
+   | 22     | NA      | group  | NA     | AND      | 20;21      | This rule groups rules 20 and 21 together with "AND", such that only rows that meet both conditions are selected |
+
+4. Selecting only the rows from the `measures` table that correspond to MPox measures between January 1st, 2021 and December 31st, 2021, or SARS-CoV-2 measures after January 1st, 2020.
+
+   | ruleId | table    | mode   | key        | operator | value                 | notes |
+   |--------|----------|--------|------------|----------|-----------------------|-------|
+   | 23     | measures | select | NA         | NA       | measure; value; unit; aggregation | This rule selects the measure, value, unit, and aggregation columns from the measures table for inclusion |
+   | 24     | measures | filter | measure    | =        | mPox                  | This rule takes all rows with an MPox measure in the measures table |
+   | 25     | measures | filter | reportDate | in       | 2021-01-01:2021-12-31 | This rule takes all rows with a report date between Jan. 1 and Dec. 31, 2021 in the measures table |
+   | 26     | NA       | group  | NA         | AND      | 24;25                 | This rule groups rules 24 and 25 together with "AND", such that only rows that meet both conditions are selected |
+   | 27     | measures | filter | measure    | =        | cov                   | This rule takes all rows with a SARS-CoV-2 measure in the measures table |
+   | 28     | measures | filter | reportDate | >=       | 2020-01-01            | This rule takes all rows with a report date after Jan. 1, 2020 in the measures table |
+   | 29     | NA       | group  | NA         | AND      | 27;28                 | This rule groups rules 27 and 28 together with "AND", such that only rows that meet both conditions are selected |
+   | 30     | NA       | group  | NA         | OR       | 26;29                 | This rule groups rules 26 and 29 together with "OR", such that if either grouping of rules is true, the data is selected |
+
+### 5. Selecting an Organization for Sharing
+
+Once the rules in the sharing csv are defined, the next step is deciding to
+which organization(s) or person/people a rule applies. This is done using an
+additional rule row with the `mode` column's value specified as `share`. A
+unique identifier for each organization or person should be used and reused
+throughout the entire document, and is used in the `key` column for sharing
+rules. This unique identifier should ideally correspond to an organization ID
+(`organizationID`) in the `organizations` table, or a contact ID (`contactID`)
+in the `contacts` table of the ODM. To apply a single rule across multiple
+organizations, the different organizations that a rule pertains to can be
+listed together in the `key` column. The listed organizations should be
+separated by a ";". For example, if a rule applies to the **Public Health
+Agency of Canada** (`organizationID = PHAC`) as well as **Ottawa Public
+Health** (`organizationID = OPH`) the value of the `key` cell in the row for
+that rule would be `PHAC;OPH`. The example assumes that PHAC and OPH are the
+agreed upon identifiers to represent these organizations. The rules to apply
+for the shared data output should be listed in the `value` column, with the
+various rule IDs separated by a ";".
+To specify different rules for different organizations/people, users will need
+to generate additional `share`-mode rules.
+
+Some examples of how these rules can be constructed and applied in practice are
+given below:
+
+1. Sharing all columns of the `measures` table, but only the rows where the
+value of "Analysis date end" (`aDateEnd`) is exactly equal to February 1st,
+2022 (2022-02-01) or February 1st, 2023 (2023-02-01), and everything from the
+`samples` table, with the Public Health Agency of Canada (`organizationID =
+PHAC`) and Ottawa Public Health (`organizationID = OPH`). Using those same
+rules for Laval Public Health (`organizationID = LPH`), except only including
+the rows of the `samples` table where the value of `siteID` is exactly equal to
+"ottawa-1" or "laval-1".
+
+   | ruleId | table    | mode   | key      | operator | value      | notes |
+   |--------|----------|--------|----------|----------|------------|-------|
+   | 11     | measures | select | NA       | NA       | all        | This rule selects all the columns from the measures table for inclusion |
+   | 12     | measures | filter | aDateEnd | =        | 2022-02-01 | This rule takes all rows where analysis date end is February 1st, 2022 |
+   | 13     | measures | filter | aDateEnd | =        | 2023-02-01 | This rule takes all rows where analysis date end is February 1st, 2023 |
+   | 14     | NA       | group  | NA       | OR       | 12;13      | This rule groups rules 12 and 13 together with "OR", such that if either rule is true, the data is selected |
+   | 15     | samples  | select | NA       | NA       | all        | This rule selects all the columns from the samples table for inclusion |
+   | 16     | samples  | filter | siteID   | =        | ottawa-1   | This rule takes all rows with a siteID of ottawa-1 |
+   | 17     | samples  | filter | siteID   | =        | laval-1    | This rule takes all rows with a siteID of laval-1 |
+   | 18     | NA       | group  | NA       | OR       | 16;17      | This rule groups rules 16 and 17 together with "OR", such that if either rule is true, the data is selected |
+   | 31     | NA       | share  | OPH;PHAC | NA       | 11;14;15   | Share all measures from Feb. 1 2022 and 2023, and all samples information |
+   | 32     | NA       | share  | LPH      | NA       | 11;14;15;18| Share all measures from Feb. 1 2022 and 2023, and all samples from ottawa and laval |
+
+2. Share MPox data from 2021 with Ottawa Public Health (`organizationID =
+   OPH`), share all SARS-CoV-2 data since 2020 with Laval Public Health
+   (`organizationID = LPH`), and share MPox data from 2021 and all SARS-CoV-2
+   data since 2020 with the Public Health Agency of Canada (`organizationID =
+   PHAC`).
+
+   | ruleId | table    | mode   | key        | operator | value                 | notes |
+   |--------|----------|--------|------------|----------|-----------------------|-------|
+   | 23     | measures | select | NA         | NA       | measure; value; unit; aggregation | This rule selects the measure, value, unit, and aggregation columns from the measures table for inclusion |
+   | 24     | measures | filter | measure    | =        | mPox                  | This rule takes all rows with an MPox measure in the measures table |
+   | 25     | measures | filter | reportDate | in       | 2021-01-01:2021-12-31 | This rule takes all rows with a report date between Jan. 1 and Dec. 31, 2021 in the measures table |
+   | 26     | NA       | group  | NA         | AND      | 24;25                 | This rule groups rules 24 and 25 together with "AND", such that only rows that meet both conditions are selected |
+   | 27     | measures | filter | measure    | =        | cov                   | This rule takes all rows with a SARS-CoV-2 measure in the measures table |
+   | 28     | measures | filter | reportDate | >=       | 2020-01-01            | This rule takes all rows with a report date after Jan. 1, 2020 in the measures table |
+   | 29     | NA       | group  | NA         | AND      | 27;28                 | This rule groups rules 27 and 28 together with "AND", such that only rows that meet both conditions are selected |
+   | 30     | NA       | group  | NA         | OR       | 26;29                 | This rule groups rules 26 and 29 together with "OR", such that if either grouping of rules is true, the data is selected |
+   | 33     | NA       | share  | OPH        | NA       | 23;26                 | Share MPox data from 2021 with Ottawa Public Health |
+   | 34     | NA       | share  | LPH        | NA       | 23;29                 | Share all SARS-CoV-2 data since 2020 with Laval Public Health |
+   | 35     | NA       | share  | PHAC       | NA       | 23;30                 | Share MPox data from 2021 and all SARS-CoV-2 data since 2020 with PHAC |
+
+## Example Scenarios
+
+In this section we will be working with some data, providing an example
+scenario for a rule and showing what the rule looks like in practice.
+
+### Filtering on license type
+
+One special case for filtering is using the license type (`license` in the
+`datasets` table, or `measureLic` in the `measures` table). This is most useful
+for data generators and custodians who work with a mix of open and private
+data. By filtering on open data, or on open data with a specific license, all
+of the data and metadata that are open can be shared without needing to
+specify additional sharing filters. For example, to share all data in a given
+dataset:
+
+| ruleId | table | mode   | key        | operator | value | notes |
+|--------|-------|--------|------------|----------|-------|-----------------------------------------------------------------------------------------------------------|
+| 1      | all   | select | NA         | NA       | all   | This rule selects all the columns and tables for inclusion |
+| 2      | all   | filter | license    | =        | open  | This rule takes all rows where the license is open |
+| 3      | all   | filter | measureLic | =        | open  | This rule takes all rows where the measure license is open |
+| 4      | NA    | group  | NA         | OR       | 2; 3  | This rule groups rules 2 and 3 together with "OR", such that if either rule is true, the data is selected |
+| 5      | NA    | share  | PHAC       | NA       | 1; 4  | This rule specifies that the data should be filtered using rules 1 and 4, and shared with PHAC |
+
+For an example that pulls only open measures:
+
+| ruleId | table    | mode   | key        | operator | value | notes |
+|--------|----------|--------|------------|----------|-------|------------------------------------------------------------------------------------------------|
+| 1      | measures | select | NA         | NA       | all   | This rule selects all the columns from the measures table for inclusion |
+| 2      | measures | filter | measureLic | =        | open  | This rule takes all rows in the measures table where the measure license is open |
+| 3      | NA       | share  | PHAC       | NA       | 1; 2  | This rule specifies that the data should be filtered using rules 1 and 2, and shared with PHAC |
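+To make the license filter above concrete, the sketch below applies rules 1-4
+from the first table to a small pandas DataFrame. This is illustrative only;
+the library is specified to work through generated SQL, and the data here is
+made up:
+
+```python
+# Sketch: the OR-grouped license filter (rules 2 and 3) on example data.
+import pandas as pd
+
+datasets = pd.DataFrame({
+    'datasetID':  ['d1', 'd2', 'd3'],
+    'license':    ['open', 'private', 'private'],
+    'measureLic': ['open', 'open', 'restricted'],
+})
+
+# rules 2 and 3, combined with rule 4's OR
+mask = (datasets['license'] == 'open') | (datasets['measureLic'] == 'open')
+print(datasets[mask])  # rule 1 keeps all columns; d1 and d2 are shareable
+```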
+### General Example
+
+The data we will be working with has two tables from the ODM, **samples** and
+**sites**. It does not include all the columns present in these tables. The
+rows in the samples and sites tables, respectively, are shown below:
+
+**samples**:
+| sampleID  | siteID   | collDT     | saMaterial | reportable | notes  |
+|-----------|----------|------------|------------|------------|--------|
+| ottWa19-1 | ottawa-1 | 2021-08-19 | rawWW      | TRUE       | Note 1 |
+| ottWa18-1 | ottawa-1 | 2021-08-18 | sweSed     | TRUE       | Note 2 |
+| ottWa17-1 | laval-1  | 2021-08-17 | pstGrit    | TRUE       | Note 3 |
+| ottWa10-1 | laval-1  | 2020-01-10 | water      | FALSE      | Note 4 |
+
+**sites**:
+| siteID   | name                 | repOrg1 | sampleshed |
+|----------|----------------------|---------|------------|
+| ottawa-1 | University of Ottawa | OPH     | school     |
+| laval-1  | University of Laval  | LPH     | school     |
+
+#### Basic Example
+
+1. Share all columns in the `samples` table, but select only rows whose site
+ID is "ottawa-1" for Ottawa Public Health (OPH)
+
+   | ruleId | table   | mode   | key    | operator | value    | notes |
+   |--------|---------|--------|--------|----------|----------|-------|
+   | 1      | samples | select |        |          | all      |       |
+   | 2      | samples | filter | siteID | =        | ottawa-1 |       |
+   | 3      | NA      | share  | OPH    |          | 1;2      |       |
+
+2. Share all columns in the `samples` table, but select rows whose sample
+material (`saMaterial`) is `rawWW` or `sweSed` for the Public Health Agency of
+Canada (PHAC)
+
+   | ruleId | table   | mode   | key        | operator | value        | notes |
+   |--------|---------|--------|------------|----------|--------------|-------|
+   | 4      | samples | select |            |          | all          |       |
+   | 5      | samples | filter | saMaterial | in       | rawWW;sweSed |       |
+   | 6      | NA      | share  | PHAC       |          | 4;5          |       |
+
+3. Share all rows, but select the `notes` column from all tables for Laval
+Public Health (LPH)
+
+   | ruleId | table | mode   | key | operator | value | notes |
+   |--------|-------|--------|-----|----------|-------|-------|
+   | 7      | all   | select |     |          | notes |       |
+   | 8      | NA    | share  | LPH |          | 7     |       |
+
+4. Share all columns, but select only the rows for samples taken in the year
+2021 and that have been marked as reportable, for Ottawa Public Health (OPH)
+and the Public Health Agency of Canada (PHAC)
+
+   | ruleId | table   | mode   | key        | operator | value                 | notes |
+   |--------|---------|--------|------------|----------|-----------------------|-------|
+   | 9      | all     | select |            |          | all                   |       |
+   | 10     | samples | filter | reportable | =        | TRUE                  |       |
+   | 11     | samples | filter | collDT     | in       | 2021-01-01:2021-12-31 |       |
+   | 12     | NA      | group  |            | AND      | 10;11                 |       |
+   | 13     | NA      | share  | OPH;PHAC   |          | 9;12                  |       |
+
+5. Select all columns from the samples and sites tables, but only rows that
+   belong to the University of Laval, for Laval Public Health (LPH)
+
+   | ruleId | table | mode   | key    | operator | value   | notes |
+   |--------|-------|--------|--------|----------|---------|-------|
+   | 14     | all   | select |        |          | all     |       |
+   | 15     | all   | filter | siteID | =        | laval-1 |       |
+   | 16     | NA    | share  | LPH    |          | 14;15   |       |
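+To see how these rules compose, the sketch below applies basic example 4
+(reportable rows from 2021, grouped with `AND`) to the samples table above,
+using pandas for illustration. The real implementation generates SQL instead:
+
+```python
+# Sketch: basic example 4 (rules 10-12) applied to the samples data above.
+import pandas as pd
+
+samples = pd.DataFrame({
+    'sampleID':   ['ottWa19-1', 'ottWa18-1', 'ottWa17-1', 'ottWa10-1'],
+    'siteID':     ['ottawa-1', 'ottawa-1', 'laval-1', 'laval-1'],
+    'collDT':     pd.to_datetime(['2021-08-19', '2021-08-18',
+                                  '2021-08-17', '2020-01-10']),
+    'reportable': [True, True, True, False],
+})
+
+r10 = samples['reportable']                    # rule 10: reportable = TRUE
+r11 = samples['collDT'].between('2021-01-01',  # rule 11: in-ranges are
+                                '2021-12-31')  # inclusive on both ends
+print(samples[r10 & r11])                      # rule 12: 10 AND 11
+```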
+### A Note on Filter and Select, Groups
+
+When specifying the columns to include in the shared data with a `select`
+rule, it is implied that all rows will be included **unless** a filter has
+also been specified separately. Conversely, specifying the rows you want to
+include with a `filter` rule **does not** specify that the column used for
+filtering should be included in the `filtered_data` output. `select` is the
+only way to specify columns for inclusion.
+
+As such, if you wanted to share all of the `samples` table data with Laval
+Public Health (LPH), it would suffice to define the rules as:
+
+   | ruleId | table   | mode   | key | operator | value | notes |
+   |--------|---------|--------|-----|----------|-------|-------|
+   | 1      | samples | select |     |          | all   |       |
+   | 2      | NA      | share  | LPH |          | 1     |       |
+
+Similarly, if you only wanted to share the measure, value, and unit columns for
+the siteID that belongs to the University of Laval, but did not want to share
+the siteID column, the rules would be:
+
+   | ruleId | table    | mode   | key    | operator | value              | notes |
+   |--------|----------|--------|--------|----------|--------------------|-------|
+   | 1      | measures | select |        |          | measure;value;unit |       |
+   | 2      | measures | filter | siteID | =        | laval-1            |       |
+   | 3      | NA       | share  | LPH    |          | 1;2                |       |
+
+With group-type rules, the rules are combined with an `AND` or `OR` operator,
+and the rules to be combined are listed in the value field. Similarly, when
+specifying the sharing target, users also list the rules to apply for the
+output. The result is that, when sharing, the library also runs an implicit
+grouping action as part of this activity.
+
+| ruleId | table    | mode   | key        | operator | value                 | notes |
+| ---    | ---      | ---    | ---        | ---      | ---                   |-------|
+| 1      | measures | select | NA         | NA       | measure;value;unit    |       |
+| 4      | measures | filter | measure    | =        | mPox                  |       |
+| 5      | measures | filter | reportDate | in       | 2021-01-01:2021-12-31 |       |
+| 6      | measures | filter | measure    | =        | cov                   |       |
+| 7      | measures | filter | reportDate | >=       | 2020-01-01            |       |
+| 8      | NA       | group  | NA         | AND      | 4; 5                  |       |
+| 9      | NA       | group  | NA         | AND      | 6; 7                  |       |
+| 10     | NA       | group  | NA         | OR       | 8; 9                  |       |
+| 11     | measures | filter | reportable | =        | TRUE                  |       |
+| 12     | NA       | share  | PHAC       | NA       | 10;11                 |       |
+
+Which implicitly generates -->
+
+| ruleId | table    | mode  | key | operator | value | notes |
+| ---    | ---      | ---   | --- | ---      | ---   | ----- |
+| 13     | measures | group | NA  | AND      | 10;11 |       |
+
+Which then generates the SQL query for sharing with PHAC for this example -->
+
+```
+select measure, value, unit from measures where ((4 and 5) or (6 and 7)) and 11
+```
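+Reading each rule id in that query as a placeholder for its condition, the
+same result can be sketched as a pandas boolean mask (illustrative only, not
+the library's code):
+
+```python
+# Sketch: the query above with rule ids expanded to their conditions.
+import pandas as pd
+
+measures = pd.DataFrame({
+    'measure':    ['mPox', 'cov', 'cov'],
+    'value':      [0.1, 0.2, 0.3],
+    'unit':       ['gcml', 'gcml', 'gcml'],
+    'reportDate': pd.to_datetime(['2021-06-01', '2020-03-01', '2019-12-01']),
+    'reportable': [True, True, True],
+})
+
+r4 = measures['measure'] == 'mPox'
+r5 = measures['reportDate'].between('2021-01-01', '2021-12-31')
+r6 = measures['measure'] == 'cov'
+r7 = measures['reportDate'] >= '2020-01-01'
+r11 = measures['reportable']
+
+mask = ((r4 & r5) | (r6 & r7)) & r11                     # rules 8, 9, 10, 13
+print(measures.loc[mask, ['measure', 'value', 'unit']])  # rule 1's select
+```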
+## Sharing CSV Columns
+
+This section summarizes all the columns that are a part of the file.
+
+**ruleId**: Mandatory for all rules. Recommended to use sequential integers for
+naming, but can be a number or a string. If a string, then it's recommended to
+use [snake_case](https://en.wikipedia.org/wiki/Snake_case) - spaces in names
+are not supported. Each value should be unique across an entire sharing file
+(`sharing.csv`).
+
+**table**: The name(s) of the tables for this rule. Allowable values are names
+(partIDs) of the tables separated by a `;`, or `all` to select all tables.
+
+**mode**: The activity and modality of a rule. Allowable values are:
+  - `select`: used for rules that define which tables and columns are to be
+    shared. Requires values in the `ruleID`, `table`, `mode`, and `value`
+    columns of the sharing csv.
+  - `filter`: used for rules that define which rows of data are appropriate for
+    sharing. Requires values in the `ruleID`, `table`, `mode`, `key`,
+    `operator` and `value` columns of the sharing csv.
+  - `group`: used for grouping together rules that should be applied as
+    combined conditions, using either `AND` or `OR` as the operator. Requires
+    values in the `ruleID`, `mode`, `operator` and `value` columns of the
+    sharing csv.
+  - `share`: used for rules defining the target for the sharing data output.
+    Requires values in the `ruleID`, `mode`, `key` (to specify the
+    organizationID(s) or contactID(s)) and `value` (to specify the rules to
+    apply for the output) columns of the sharing csv.
+
+**key**: The argument used to specify the header or headers used for a
+filtering rule, or the destination organization or person for a sharing rule.
+Multiple headers can be listed, and likewise multiple organizations/individuals
+can be listed, separated by a `;`. Also supports the keyword `all`. The
+organizations here reference the organizations table (`organizationID`), or the
+contacts table (`contactID`) in the ODM data.
+
+**operator**: The operator used to define the logic of filtering and grouping
+rules. For `filter`-mode rules, use of the `=`, `>`, `<`, `>=`, and `<=`
+operators is supported, along with `in` for ranges of continuous data. For
+`group`-mode rules, the acceptable values for this field are `AND` or `OR`.
+
+**value**: Specifies the values for filtering rules, and the rules to be
+grouped for grouping rules. Discrete, listed values in this field should be
+separated by a ";".
+
+**notes**: An optional, free-text description or notes explaining this rule, or
+other related information deemed worthy of sharing.
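+The per-mode requirements listed above lend themselves to a simple validation
+pass over each row. A minimal sketch (a hypothetical helper, not the library's
+actual parser):
+
+```python
+# Sketch: checking that a rule row has the fields its mode requires.
+from typing import Dict, List
+
+REQUIRED: Dict[str, List[str]] = {
+    'select': ['ruleID', 'table', 'mode', 'value'],
+    'filter': ['ruleID', 'table', 'mode', 'key', 'operator', 'value'],
+    'group':  ['ruleID', 'mode', 'operator', 'value'],
+    'share':  ['ruleID', 'mode', 'key', 'value'],
+}
+
+
+def missing_fields(row: Dict[str, str]) -> List[str]:
+    '''return the required fields that are empty or NA for this row's mode'''
+    mode = row.get('mode', '')
+    return [field for field in REQUIRED.get(mode, [])
+            if row.get(field, '') in ('', 'NA')]
+
+
+row = {'ruleID': '2', 'table': 'measures', 'mode': 'filter',
+       'key': 'measure', 'operator': '=', 'value': 'mPox'}
+assert missing_fields(row) == []
+```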
+## Sharing Metadata CSV Columns
+
+Metadata for the sharing csv is stored in a separate file, the
+`sharingMetadata.csv`. This section summarizes all the columns that are a part
+of the file:
+
+**name**: the name given to a sharing schema. This is less important for data
+custodians/generators who only use a single schema, but these are unique names
+for each `sharing.csv` for each group or dataset. For naming, it is recommended
+to use [snake_case](https://en.wikipedia.org/wiki/Snake_case) - spaces in names
+are not supported. Each value should be unique across an entire sharing
+metadata file (`sharing_metadata.csv`).
+
+**datasetID**: The dataset(s) to which a given sharing schema applies.
+Multiple datasets can be separated by a `;`. The dataset(s) here reference the
+datasets table (`datasetID`) in the ODM data.
+
+**version**: The version number of a given sharing schema. Version numbering
+should be updated with each change, ideally following [semantic
+versioning](https://semver.org) structure. Given a version number "x.y.z", or
+"1.0.0", for example, the meaning of a change to each of these numbers based on
+position is: MAJOR.MINOR.PATCH. MAJOR version updates are when rules are added
+or removed, MINOR version updates are when you are editing rules, and PATCH
+version updates are when you tweak the `status` or `valid_period` columns.
+
+**organizationID**: The organization who created a given sharing schema. The
+organization here should reference the organizations table (`organizationID`)
+in the ODM data.
+
+**contactID**: The contact information for the person who created a given
+sharing schema. The contact here references the contacts table (`contactID`) in
+the ODM data.
+
+**numberRules**: The number of rules defined in the sharing csv schema.
+
+**orgsServed**: A list of the organizations/people served by a sharing csv.
+This is a list of `organizationID` and/or `contactID` entries in the `key`
+field for sharing-type rules. The values should be separated with a ";".
+
+**firstReleased**: A date to specify when the sharing schema was made.
+
+**lastUpdated**: A date to specify when the sharing schema was last edited or
+updated.
+
+**changes**: A free-text field to record changes made at the last update to the
+sharing schema.
+
+**notes**: An optional, free-text description or notes explaining details about
+the sharing schema, or other related information deemed worthy of sharing.
+
+An example of this table is found below. For this example, the university lab
+records data for two different municipalities, and has separate datasetIDs for
+data from the different municipalities. To make their workflow clearer, they've
+also opted to create separate sharing schemas for the separate datasets.
+
+| name | datasetID | version | organizationID | contactID | firstReleased | lastUpdated | changes | notes |
+|----------------|-----------------|---------|----------------|-------------|---------------|-------------|--------------------------------------|-------|
+| ottawaSharingA | cityAReportData | 1.1.0 | university-1 | lastnamePer | 2022-02-01 | 2023-03-01 | Deprecated outdated rules for city A | NA |
+| ottawaSharingB | cityBReportData | 1.2.0 | university-1 | lastnamePer | 2022-03-15 | 2023-03-01 | Changed outdated rules for city B | NA |
+
+Many of these values can be generated automatically: `name` can be extracted
+from the filename of the schema. `lastUpdated` can be inferred by reading the
+modified date from the filesystem. `organizationID`, `status`, `version`, and
+`notes` are not able to be automatically inferred at this point, but we hope to
+be able to infer them automatically in a later version of the sharing system.
+
+# Implementation
+
+## Function Signature
+
+The function which implements the sharing feature takes two arguments:
+
+1. `data`: A series of tables from PHES-ODM-formatted data. The data input
+does not have to contain all the entities defined in the ODM, but may contain
+only those on which the sharing rules should be applied. An example is shown
+below,
+
+**measures**
+
+| measureRepID | sampleID | measure | value | unit | aggregation |
+| -------------- | ------------ | --------- | -------- | ------ | ------------- |
+| ottWW100 | pgsOttS100 | covN1 | 0.0023 | gcml | sin |
+| ottWW101 | pgsOttS101 | covN1 | 0.0402 | gcml | sin |
+
+**samples**
+
+| sampleID | siteID | collDT | saMaterial |
+| ------------ | ---------- | ----------------------- | ------------- |
+| pgsOttS100 | ottawa-1 | 2021-02-01 9:00:00 PM | rawWW |
+| pgsOttS101 | ottawa-1 | 2021-02-01 9:00:00 PM | rawWW |
+| pgsOttS102 | ottawa-1 | 2021-02-26 9:00:00 PM | rawWW |
+
+**organizations**
+
+| organizationID | name | orgType |
+| ---------------- | --------------------- | --------- |
+| lab100 | University L100 Lab | academ |
+| lab101 | University L101 Lab | academ |
+
+The above `data` example has three tables, **measures**, **samples**, and
+**organizations**, with each table containing two or three rows. The table
+names (`partID`s) as specified in the ODM should match the input file names.
+The names of the columns and their value types should match up with their
+specification (including the named `partID`) in the ODM.
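+In code, such an input might be assembled as a mapping from table name to
+DataFrame before being handed to the sharing function. The sketch below is
+illustrative: the file names are made up, and `apply_sharing` is a
+hypothetical stand-in for the actual function:
+
+```python
+# Sketch: assembling the `data` argument as {table name: DataFrame}.
+from typing import Dict
+
+import pandas as pd
+
+data: Dict[str, pd.DataFrame] = {
+    'measures': pd.read_csv('measures.csv'),
+    'samples': pd.read_csv('samples.csv'),
+    'organizations': pd.read_csv('organizations.csv'),
+}
+sharing_rules = pd.read_csv('sharing.csv')
+
+# one filtered output per organization named in a share rule:
+# filtered = apply_sharing(data, sharing_rules)
+```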
+2. `sharing_rules`: The tabular `sharing.csv` containing the sharing rules to
+be applied to the data. Each item must reference a table (or multiple tables),
+and reference some or all of the fields as defined in the data above. An
+example is shown below,
+
+| ruleId | table | mode | key | operator | value | notes |
+|--------|------------------|--------|----------|----------|-----------------------|-------|
+| 1 | all | select | NA | NA | all | |
+| 2 | samples | filter | collDT | in | 2021-01-25:2021-02-25 | |
+| 3 | samples;measures | filter | sampleID | in | pgsOttS101;pgsOttS102 | |
+| 4 | NA | share | PHAC | NA | 1;3 | |
+| 5 | NA | share | public | NA | 1;2;3 | |
+
+The above `sharing_rules` example contains three rules to apply to the data,
+and two rules for targeting the sharing of data.
+
+The function will then return one dataset output (either an xlsx file or a
+series of csv files) per organization/individual named in the rules with a
+`share` value in the `mode` column. This will be the `filtered_data`, with the
+example shown below:
+
+- **filtered_data**: The data to share with an organization. This is a copy
+  of the `data` parameter with the columns and rows that meet the inclusion
+  rules defined in the sharing rules for the passed organization. It has the
+  same structure as the `data` argument described above. To continue our
+  example:
+
+**FOR: PUBLIC**
+
+**measures**
+
+| measureRepID | sampleID | measure | value | unit | aggregation |
+| -------------- | ------------ | --------- | -------- | ------ | ------------- |
+| ottWW101 | pgsOttS101 | covN1 | 0.0402 | gcml | sin |
+
+**samples**
+
+| sampleID | siteID | collDT | saMaterial |
+| ------------ | ---------- | ----------------------- | ------------ |
+| pgsOttS101 | ottawa-1 | 2021-02-01 9:00:00 PM | rawWW |
+
+**organizations**
+
+| organizationID | name | orgType |
+| ---------------- | --------------------- | --------- |
+| lab100 | University L100 Lab | academ |
+| lab101 | University L101 Lab | academ |
+
+**FOR: PHAC**
+
+**measures**
+
+| measureRepID | sampleID | measure | value | unit | aggregation |
+|--------------|------------|---------|--------|------|-------------|
+| ottWW101 | pgsOttS101 | covN1 | 0.0402 | gcml | sin |
+
+**samples**
+
+| sampleID | siteID | collDT | saMaterial |
+|------------|----------|-----------------------|------------|
+| pgsOttS101 | ottawa-1 | 2021-02-01 9:00:00 PM | rawWW |
+| pgsOttS102 | ottawa-1 | 2021-02-26 9:00:00 PM | rawWW |
+
+**organizations**
+
+| organizationID | name | orgType |
+| ---------------- | --------------------- | --------- |
+| lab100 | University L100 Lab | academ |
+| lab101 | University L101 Lab | academ |
+
+The above data can then be exported as two separate excel files (or sets of csv
+files), with one for the public and one for PHAC.
+
+- **sharing_summary**: A tabular breakdown of the entities for whom sharing
+  data was generated; for each organization it lists the ruleIDs of the
+  applied rules, the tables included in the shared data, and the number of
+  rows for each shared table. An example is shown below:
+
+**summary_table:**
+
+| destination_org | rule_ids_used | tables_shared | number_rows_output |
+|-----------------|---------------|---------------|--------------------|
+| public | 1,2,3 | measures | 1 |
+| public | 1,2,3 | samples | 1 |
+| public | 1,2,3 | organizations | 2 |
+| PHAC | 1,3 | measures | 1 |
+| PHAC | 1,3 | samples | 2 |
+| PHAC | 1,3 | organizations | 2 |
+
+- **sharing_rules_summary**: A copy of the sharing rules csv, but with an
+  additional column recording the number of cells selected by each rule.
+  This allows users to check the fineness of their data filtration and detect
+  potential errors.
As an example:
+
+| ruleId | table | mode | key | operator | value | notes | selectedCells |
+|--------|------------------|--------|----------|----------|-----------------------|-------|---------------|
+| 1 | all | select | NA | NA | all | | 30 |
+| 2 | samples | filter | collDT | in | 2021-01-25:2021-02-25 | | 8 |
+| 3 | samples;measures | filter | sampleID | in | pgsOttS101;pgsOttS102 | | 14 |
+| 4 | NA | share | PHAC | NA | 1;3 | | 20 |
+| 5 | NA | share | public | NA | 1;2;3 | | 16 |
+
+The `sharing_summary` and `sharing_rules_summary` tables should be shared with
+the `filtered_data` output, along with the `sharing_metadata` file.
+
+Describing the example above,
+
+1. For the rule with ID 1, it says to include all tables and columns. So all
+tables and columns were included in the output `filtered_data`, with only the
+rows that matched the inclusion criteria. If no row filtration had been
+provided, the column-based rules alone would have included all rows of the
+included columns.
+2. For the rule with ID 2, an additional row was filtered out of **samples**
+as one of the entries did not match the inclusion criteria for the collection
+date.
+3. The rule with ID 3 says that rows with the **sampleID** of "pgsOttS101" or
+"pgsOttS102" were included across the `measures` and `samples` tables. This
+meant that only the one row in the **measures** table that met this criterion
+was included, along with the two rows from the **samples** table that met it.
+4. RuleID 4 says to share with PHAC the data that meets the criteria of both 1
+and 3, and RuleID 5 says to share with the public the data that meets the
+criteria of all of 1, 2, and 3. So those rules are applied together to generate
+the two outputs, one for each sharing partner.
diff --git a/docs/tech-spec.md b/docs/spec/tech-spec.md
similarity index 76%
rename from docs/tech-spec.md
rename to docs/spec/tech-spec.md
index 7c55ec5e..82de4b9e 100644
--- a/docs/tech-spec.md
+++ b/docs/spec/tech-spec.md
@@ -21,74 +21,6 @@
boost when running queries on big indexed databases, as well as enable us to
output the intermediate SQL to the user in case they want to inspect or
execute the queries themselves.

-## CLI
-
-**Usage**
-
-```
-./share.py [OPTION]... SCHEMA INPUT
-```
-
-Arguments:
-
-- SCHEMA
-
-  sharing schema file path
-
-- INPUT
-
-  spreadsheet file path or [SQLAlchemy database url](https://docs.sqlalchemy.org/en/20/core/engines.html#database-urls)
-
-Options:
-
-- `--orgs=NAME[,...]`
-
-  comma separated list of organizations to output data for, defaults to all
-
-- `--outfmt=FORMAT`
-
-  output format (excel or csv), defaults to excel
-
-- `--outdir=PATH`
-
-  output file directory, defaults to the current directory
-
-- `-d`, `--dry-run`, `--debug`:
-
-  only output the intermediary debug information describing what would
-  happen, and don't create any output files.
-
-One or multiple sharable output files will be created in the chosen output
-directory according to the chosen output format and organization(s). Each
-output file will have the input filename followed by a postfix with the org
-name (and table name if CSV).
-
-(Debug) information about the operation will be printed to STDOUT, as well as
-written to a `debug.txt` file in the same directory.
- -**Examples** - -Create a sharable excel file in the "~/ohri" directory, for the "OHRI" -organization, applying the rules from schema.csv on the input from data.xlsx: - -```bash -./share.py --orgs=OHRI --outdir=~/ohri/ schema.csv data.xlsx -``` - -Output to the default (current) directory, for all organizations specified in -the schema, using a MySQL database (with the pymysql package) as input: - -```bash -./share.py schema.csv mysql+pymysql://scott:tiger@localhost/foo -``` - -Same as above, using a MS SQL Server database through ODBC (with the pyodbc -package): - -```bash -./share.py schema.csv mssql+pyodbc://user:pass@mydsn -``` - ## API ### Public modules @@ -105,14 +37,14 @@ package): - `TableName = str` - high level functions: - - `extract(data_source: str, schema_file: str, orgs: List[str]=[]) -> ...` + - `extract(schema_file: str, data_source: str, orgs: List[str]=[]) -> ...` returns a Pandas DataFrame per table per org Parameters: - - data_source: a file path or database url (in SQLAlchemy format) - schema_file: rule schema file path + - data_source: a file path or database url (in SQLAlchemy format) - orgs: orgs to share with, or all if empty Exceptions: ConnectionError, OSError, ParseError @@ -125,10 +57,10 @@ package): Exceptions: ConnectionError - - `parse(schema_file: str, orgs=[]) -> Dict[OrgName, Dict[TableName, Query]]` + - `parse(schema_path: str, orgs=[]) -> Dict[OrgName, Dict[TableName, Query]]` returns queries for each org and table, generated from the rules - specified in `schema_file` + specified in `schema_path` Exceptions: OSError, ParseError @@ -178,26 +110,23 @@ Parsing of rules into abstract syntax trees. (SQL) query generation from ASTs. -- generate(rt: RuleTree) -> Dict[OrgName, Query] +- generate(rt: RuleTree) -> Dict[OrgName, Dict[TableName, TableQuery]] ### Errors The exception types that may be thrown, as well as examples of what they cover: +- DataSourceError: + - table not found in data source + - unable to open/read data source - OSError: - - input file doesn't exist - - failed to read input file + - failed to read schema file - failed to write output file -- ConnectionError: - - failed to establish connection to db - - failed to import data to temporary db - - failed to read from data source - - data source columns don't match query - ParseError: - - header is missing/invalid - - row/rule field value is missing/invalid - - rule is referencing a rule that doesn't exist or doesn't have the - expected field values + - headers are missing + - value can't be coerced to the correct type + - required table/key/operator/mode is missing + - invalid filter/group operator ### Examples @@ -216,7 +145,7 @@ orgs = [org] high-level one-shot function: ```python -results = s.extract(data_file, rules, orgs) +results = s.extract(rules, data_file, orgs) for org, tabledata in results.items(): for table, data in tabledata.items(): data.to_csv(f'{org}-{table}.csv') @@ -256,17 +185,19 @@ for table, query in table_queries.items(): ### CSV rule parsing -1. open schema file -2. parse each line into a rule obj: - - validate and throw exception on error -3. add each rule obj to a dictionary with rule id as key +1. read csv, or fail with OSError +2. normalize NA values +3. validate headers, or fail with ParseError(s) +4. parse each row into a rule obj: + - validate rule, or accumulate ParseError(s): + - coerce values into the right types + - check existence of required values + - check operator values +5. 
return a dict with rule-ids and rules, or raise accumulated errors

Error messages should contain all the necessary info to find and fix the issue,
-including the line number, row number, rule id and column name (if applicable).
-Parsing can be wrapped in a try-block to accumulate errors instead of aborting
-on the first error. This is a viable option since each line is parsed
-individually, and their relationships aren't taken into account before the next
-(AST generation) step.
+including the line number and column name (if applicable). Errors can be
+accumulated, but the result is only valid if no errors occurred.

### AST generation

@@ -286,13 +217,14 @@ Node kinds:

- **filter**: defines a filter with operator, key and value
- **field**: a field name
- **literal**: a string literal
+- **range-kind**: specifies if a range is an interval or a set

Node structure:

-- (ruleId: int)
+- rule_id: int
- kind: NodeKind
- str_val: str
-- children: List[Node]
+- sons: List[Node]

Tree structure:

@@ -305,6 +237,7 @@ specified, so the filter.field node comes before any filter.literal nodes, etc.

- filter:
  - (**filter**, rule.operator):
    - (**field**, rule.key)
+    - (**range-kind**, 'interval'/'set') # only present for 'in' operator
    - (**literal**, x) for x in rule.value
- group:
  - (**group**, rule.operator):
@@ -347,6 +280,7 @@ Example rules with its generated tree:
      (literal, "mPox")
    (filter, "in")
      (field, "reportDate")
+      (range-kind, "interval")
      (literal, "2021-01-01")
      (literal, "2021-12-31")
    (group, "AND")
@@ -361,6 +295,57 @@ Example rules with its generated tree:
    (select, "all")
```

+#### Algorithm
+
+- Each rule in a sharing CSV sheet can only reference rules defined in a
+  previous row.
+- a node has a type/kind, a value, and a list of children
+- the children of a node are called 'sons' since it's shorter
+- nodes are first added to a parsing context (ctx) and then later attached to
+  parent nodes with O(1) lookup; this way the tree is constructed
+  incrementally while parsing each rule
+- share nodes are made children of a single root-node: each org gets its own
+  share-node, there may be multiple share-rules, and the tree can only have a
+  single root node
+- the root-node is updated every time a new share-node is added
+- tables of each rule are cached for O(1) lookup
+
+```
+for each rule:
+  for each table in rule, or just once if no table:
+    init node, depending on rule mode:
+      select:
+        kind = select
+        value = empty if sons are specified, otherwise 'all'
+        sons = a value node for each column name
+      filter:
+        kind = filter
+        value = operator
+        sons =
+          1. a key node for the field name
+          2. a value node for each filter value
+      group:
+        kind = group
+        value = operator
+        sons = nodes matching the rule's list of ids
+      share:
+        kind = root
+        sons =
+          for each organization:
+            kind = share
+            value = org
+            sons =
+              for each select-node referenced in share-rule:
+                for each table in select-node's rule:
+                  kind = "table"
+                  value = table
+                  sons =
+                    1. select node
+                    2. filter/group node referenced in
+                       share-node. Multiple nodes are
+                       implicitly grouped with an AND-node.
+```
+
### SQL query generation

SQL queries are (recursively) generated from each table-node of the AST.
Values @@ -401,7 +386,7 @@ the table node: ``` operator = str_val - result = recurse(first-child) + operator + recurse(second-child) + result = recurse(first-child) + (logic depending on range-kind) ``` - **field**: diff --git a/mypy.ini b/mypy.ini new file mode 100644 index 00000000..0087c279 --- /dev/null +++ b/mypy.ini @@ -0,0 +1,14 @@ +[mypy] +disallow_any_explicit = True +disallow_untyped_calls = True +disallow_untyped_defs = True +no_implicit_optional = True +explicit_package_bases = True +mypy_path = ./src +files = ./src/**/*.py + +# libraries +# + +[mypy-functional] +ignore_missing_imports = True diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 00000000..d8417292 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,34 @@ +[project] +name = "odm_sharing" +version = "1.0.0" +authors = [ + { name="OHRI", email="PHES-ODM@ohri.ca" } +] +description = "PHES-ODM Sharing Library" +readme = "README.md" +license = { file="LICENSE" } +requires-python = ">=3.9" +classifiers = [ + "Programming Language :: Python :: 3", + "Operating System :: OS Independent", +] +dynamic = ["dependencies"] + +[project.urls] +"Homepage" = "https://github.com/Big-Life-Lab/PHES-ODM-sharing" +"Bug Tracker" = "https://github.com/Big-Life-Lab/PHES-ODM-sharing/issues" + +[project.scripts] +odm-share = "odm_sharing.tools.share:main" + +[build-system] +requires = ["hatchling", "hatch-requirements-txt"] +build-backend = "hatchling.build" + +# specify package dir +[tool.hatch.build.targets.wheel] +packages = ["src/odm_sharing"] + +# install dependencies automatically +[tool.hatch.metadata.hooks.requirements_txt] +files = ["requirements.txt"] diff --git a/requirements-dev.txt b/requirements-dev.txt new file mode 100644 index 00000000..9685505d --- /dev/null +++ b/requirements-dev.txt @@ -0,0 +1,4 @@ +pandas-stubs==2.2.2.240603 +sqlalchemy-stubs==0.4 +types-openpyxl==3.1.5.20240806 +types-tabulate==0.9.0.20240106 diff --git a/requirements-doc.txt b/requirements-doc.txt new file mode 100644 index 00000000..2a34b7ea --- /dev/null +++ b/requirements-doc.txt @@ -0,0 +1,3 @@ +griffe==0.47.0 +jupyter==1.0.0 +quartodoc==0.7.5 diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 00000000..81bb3bbe --- /dev/null +++ b/requirements.txt @@ -0,0 +1,7 @@ +SQLAlchemy==2.0.31 +numpy==2.0.0 +openpyxl==3.1.4 +pandas==2.2.2 +pyfunctional==1.5.0 +tabulate==0.9.0 +typer==0.12.3 diff --git a/run-all-tests.sh b/run-all-tests.sh new file mode 100755 index 00000000..b48b4f0a --- /dev/null +++ b/run-all-tests.sh @@ -0,0 +1,10 @@ +#!/bin/sh +dir=$(dirname $0) + +echo "unit tests" +$dir/run-unit-tests.sh + +echo "" +echo "integration tests" +cd $dir/tests +python -m unittest int_test*.py diff --git a/run-unit-tests.sh b/run-unit-tests.sh new file mode 100755 index 00000000..01a22b4e --- /dev/null +++ b/run-unit-tests.sh @@ -0,0 +1,3 @@ +#!/bin/sh +dir=$(dirname $0) +python -m unittest discover $dir/tests diff --git a/sharing/.DS_Store b/sharing/.DS_Store deleted file mode 100644 index 14f6bdb6..00000000 Binary files a/sharing/.DS_Store and /dev/null differ diff --git a/sharing/readme-examples/.DS_Store b/sharing/readme-examples/.DS_Store deleted file mode 100644 index 96377356..00000000 Binary files a/sharing/readme-examples/.DS_Store and /dev/null differ diff --git a/src/odm_sharing/private/common.py b/src/odm_sharing/private/common.py new file mode 100644 index 00000000..c7fb0117 --- /dev/null +++ b/src/odm_sharing/private/common.py @@ -0,0 +1,8 @@ +# type aliases +ColumnName = str +OrgName = str +TableName = 
str + +# constants +F = 'FALSE' +T = 'TRUE' diff --git a/src/odm_sharing/private/cons.py b/src/odm_sharing/private/cons.py new file mode 100644 index 00000000..9f6a369b --- /dev/null +++ b/src/odm_sharing/private/cons.py @@ -0,0 +1,360 @@ +import logging +import os +from collections import defaultdict +from dataclasses import dataclass +from io import IOBase +from pathlib import Path +from typing import Dict, Generator, List, Set, Tuple, Union, cast + +import openpyxl as xl +import pandas as pd +import sqlalchemy as sa +from functional import seq +from openpyxl.workbook import Workbook + +from odm_sharing.private.common import ColumnName, TableName, F, T +from odm_sharing.private.utils import qt + + +@dataclass(frozen=True) +class CsvFile: + '''a table's CSV file''' + + table: str + '''table name''' + + file: IOBase + '''file object''' + + +CsvPath = str +CsvDataSourceList = Union[List[CsvPath], List[CsvFile]] +Sheet = xl.worksheet._read_only.ReadOnlyWorksheet + + +@dataclass(frozen=True) +class Connection: + handle: sa.engine.Engine + tables: Set[TableName] + bool_cols: Dict[TableName, Set[ColumnName]] + + +class DataSourceError(Exception): + pass + + +F_FORMULA = '=FALSE()' +T_FORMULA = '=TRUE()' + +BOOL_FORMULAS = [F_FORMULA, T_FORMULA] +BOOL_VALS = [F, T] +NA_VALS = ['', 'NA'] + + +def _create_temp_db() -> sa.engine.Engine: + path = '' # in-memory by default + custom_path = os.environ.get('ODM_TEMP_DB', '') + if custom_path: + # XXX: extra initial slash required for both rel and abs paths + path = '/' + custom_path + logging.info(f'using temp db {custom_path}') + return sa.create_engine(f'sqlite://{path}', echo=False) + + +def _write_table_to_db(db: sa.engine.Engine, table: str, df: pd.DataFrame + ) -> None: + logging.info(f'- table {table}') + df.to_sql(table, db, index=False, if_exists='replace') + + +def _datasets_to_db(datasets: Dict[TableName, pd.DataFrame] + ) -> sa.engine.Engine: + '''creates a temp db and writes the datasets as tables''' + db = _create_temp_db() + for table, df in datasets.items(): + _write_table_to_db(db, table, df) + return db + + +def _find_bool_cols(df: pd.DataFrame, bool_vals: List[str], + exclude_cols: Set[str] = set()) -> Set[str]: + '''Finds boolean columns in a dataframe. + + :returns: a set of column names + ''' + + # The following columns are included: + # - col-type=bool + # - col-type=object and val-type=bool + # - col-type=object and val-type=str and upper(val) in ['FALSE', 'TRUE'] + # + # NA-values are ignored and won't interfere with the result. + # + # The search is non-exhaustive, and will assume a valid bool column after + # the first match even if there are invalid values further down the column. 
+ + result: Set[str] = set() + + for col in df: + col_name = str(col) + if col_name in exclude_cols: + continue + + def add_bool_col() -> None: + result.add(col_name) + + # check column type + # XXX: columns with mixed types have dtype=object + if df[col].dtype == bool: + add_bool_col() + continue + if df[col].dtype != object: + continue + + # check cell value type + for val in df[col]: + if val is None: # empty cell -> NA + continue + if isinstance(val, bool): + add_bool_col() + elif isinstance(val, str): + norm_val = val.strip().upper() + if norm_val in NA_VALS: + continue + if norm_val in bool_vals: + add_bool_col() + break + + return result + + +def _sheet_to_df(sheet: Sheet) -> pd.DataFrame: + '''converts an excel sheet to a pandas dataframe''' + row_iter = sheet.values + columns = list(next(row_iter)) # consumes first row + return pd.DataFrame(row_iter, columns=columns) + + +def _normalize_bool_values(df: pd.DataFrame, bool_cols: Set[ColumnName] + ) -> None: + '''normalize bool (string) values to 0/1''' + # XXX: this is needed to be able to run the same query filter on booleans + # coming from different data types (like string) + for col in bool_cols: + if df[col].dtype == object: # potentially str + df[col] = df[col].replace({F: '0', T: '1'}) + + +def _import_csv(data_source: Union[CsvPath, CsvFile] + ) -> Tuple[TableName, pd.DataFrame]: + # XXX: NA-values are not normalized to avoid mutating user data (#31) + ds = data_source + if isinstance(ds, CsvPath): + path = ds + assert path.endswith('.csv') + table = Path(path).stem + logging.info(f'importing {qt(table)} from {path}') + df = pd.read_csv(path, na_filter=False) + return (table, df) + else: + assert isinstance(ds, CsvFile) + logging.info(f'importing {qt(ds.table)} from a file') + df = pd.read_csv(ds.file, na_filter=False) # type: ignore + return (ds.table, df) + + +def _connect_csv(data_sources: CsvDataSourceList) -> Connection: + '''copies file data to in-memory db + + :raises DataSourceError: + :raises OSError: + ''' + assert len(data_sources) > 0 + if isinstance(data_sources[0], CsvPath): + _check_csv_paths(cast(List[CsvPath], data_sources)) + + dfs = {} + tables = set() + bool_cols = {} + for ds in data_sources: + (table, df) = _import_csv(ds) + bool_cols[table] = _find_bool_cols(df, BOOL_VALS) + _normalize_bool_values(df, bool_cols[table]) + dfs[table] = df + tables.add(table) + db = _datasets_to_db(dfs) + return Connection(db, tables, bool_cols) + + +def _iter_sheets(wb: Workbook, included_tables: Set[str]) -> Generator: + for sheet in wb: + table_name = sheet.title + if table_name in included_tables: + yield (sheet, table_name) + + +def _connect_excel(path: str, table_whitelist: Set[str]) -> Connection: + '''copies excel file data to in-memory db + + :returns: a connection to the db + + :raises OSError: + ''' + # XXX: we must NOT change the data (#31). + + # XXX: We can NOT use Pandas to import Excel files, since it may convert + # booleans to float when the first column cell value isn't a valid boolean. + # This happens even with `dtype=str`. Invalid booleans happen because + # we must allow empty cells and NA values, and we can't normalize the data + # (#31). Pandas uses openpyxl under the hood as its Excel backend, and we + # can use it directly to avoid this issue. + + # XXX: We will NOT use `dtype=str` when converting the imported data to + # Pandas-dataframes, since there's no need at this point. 
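+
+    # In short, the workbook is loaded twice below: once with data_only=True
+    # to read plain cell values, and once with data_only=False so that
+    # columns holding =TRUE()/=FALSE() formulas are also registered as
+    # boolean (a values-only pass would miss them).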
+ + logging.info('importing excel workbook') + + # load excel file + wb = xl.load_workbook(path, read_only=True, data_only=True) + sheet_names = seq(wb).map(lambda sheet: sheet.title).list() + included_tables = set(sheet_names) + if table_whitelist: + included_tables &= table_whitelist + + # convert to dataframes + dfs = {} + bool_cols = {} + for sheet, table in _iter_sheets(wb, included_tables): + df = _sheet_to_df(sheet) + bool_cols[table] = _find_bool_cols(df, BOOL_VALS) + _normalize_bool_values(df, bool_cols[table]) + dfs[table] = df + + # include bool formulas when looking for bool columns + formula_wb = xl.load_workbook(path, read_only=True, data_only=False) + for sheet, table in _iter_sheets(formula_wb, included_tables): + df = _sheet_to_df(sheet) + bc = bool_cols[table] + bc |= _find_bool_cols(df, BOOL_FORMULAS, bc) + + # write to db + db = _datasets_to_db(dfs) + return Connection(db, included_tables, bool_cols) + + +def _connect_db(url: str) -> Connection: + ''':raises sa.exc.OperationalError:''' + handle = sa.create_engine(url) + db_info = sa.inspect(handle) + tables = set(db_info.get_table_names()) + + # find bool cols + bool_cols = defaultdict(set) + for table in tables: + for col_info in db_info.get_columns(table): + if isinstance(col_info['type'], sa.sql.sqltypes.BOOLEAN): + bool_cols[table].add(col_info['name']) + + return Connection(handle, tables, bool_cols) + + +def _detect_sqlite(path: str) -> bool: + # https://www.sqlite.org/fileformat.html + MAGIC = b'SQLite format 3' + try: + with open(path, 'rb') as f: + return f.read(len(MAGIC)) == MAGIC + except Exception: + return False + + +def _detect_sqlalchemy(path: str) -> bool: + if not path: + return False + try: + sa.engine.url.make_url(path) + return True + except sa.exc.ArgumentError: + return False + + +def _check_csv_paths(paths: List[str]) -> None: + if not seq(paths).map(lambda p: p.endswith('.csv')).all(): + raise DataSourceError( + 'mixing CSV files with other file types is not allowed') + + +def _detect_csv_input(data_sources: Union[str, CsvDataSourceList]) -> bool: + is_str = isinstance(data_sources, str) + is_list = isinstance(data_sources, list) + is_path_list = is_list and isinstance(data_sources[0], str) + is_file_list = is_list and not is_path_list + return ((is_str and cast(str, data_sources).endswith('.csv')) or + (is_path_list and cast(str, data_sources[0]).endswith('.csv')) or + is_file_list) + + +def connect( + data_sources: Union[str, CsvDataSourceList], + tables: Set[str] = set() +) -> Connection: + ''' + connects to one or more data sources and returns the connection + + :param data_sources: filepath or database URL, or list of multiple CSV + paths/files + + :param tables: when connecting to an excel file, this acts as a sheet + whitelist + + :raises DataSourceError: + ''' + + # XXX: After import of CSV/Excel files, boolean values will be normalized + # as 0/1, which we'll have to convert back (using previously detected bool + # columns) to 'FALSE'/'TRUE' before returning the data to the user. This + # happens in `odm_sharing.sharing.get_data`. 
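+    # For example (hypothetical inputs), each of these would be a valid call:
+    #
+    #     connect('data.xlsx', tables={'samples'})    # excel workbook
+    #     connect(['samples.csv', 'measures.csv'])    # multiple CSV tables
+    #     connect('data.db')                          # sqlite file
+    #     connect('postgresql://user:pw@host/db')     # sqlalchemy URL
+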
+ if not data_sources: + raise DataSourceError('no data source') + try: + if _detect_csv_input(data_sources): + csv_data_sources = ( + [data_sources] if isinstance(data_sources, str) + else data_sources) + return _connect_csv(csv_data_sources) + + is_list = isinstance(data_sources, list) + if is_list: + if len(data_sources) > 1: + raise DataSourceError('specifying multiple inputs is only ' + + 'allowed for CSV files') + + path = (cast(str, data_sources[0]) if is_list + else cast(str, data_sources)) + if path.endswith('.xlsx'): + return _connect_excel(path, tables) + elif _detect_sqlite(path): + return _connect_db(f'sqlite:///{path}') + elif _detect_sqlalchemy(path): + return _connect_db(path) + else: + raise DataSourceError('unrecognized data source format for ' + + f'path {path}') + except (OSError, sa.exc.OperationalError) as e: + raise DataSourceError(str(e)) + + +def get_dialect_name(c: Connection) -> str: + '''returns the name of the dialect used for the connection''' + return c.handle.dialect.name + + +def exec(c: Connection, sql: str, sql_args: List[str] = []) -> pd.DataFrame: + '''executes sql with args on connection + + :raises DataSourceError: + ''' + try: + return pd.read_sql_query(sql, c.handle, params=tuple(sql_args)) + except sa.exc.OperationalError as e: + raise DataSourceError(str(e)) diff --git a/src/odm_sharing/private/queries.py b/src/odm_sharing/private/queries.py new file mode 100644 index 00000000..ba155288 --- /dev/null +++ b/src/odm_sharing/private/queries.py @@ -0,0 +1,337 @@ +from collections import defaultdict +from dataclasses import dataclass, field +from functools import partial +from typing import Dict, List, Tuple +# from pprint import pprint + +from functional import seq + +from odm_sharing.private.common import OrgName, TableName +from odm_sharing.private.rules import RuleId +from odm_sharing.private.stdext import StrEnum, sorted_dict +from odm_sharing.private.utils import dqt + +from odm_sharing.private.trees import ( + ALL_LIT, + Node, + NodeKind, + Op, + ParseError, + RangeKind, + RuleTree, + parse_op, +) + + +Sql = str +SqlArgs = List[str] + + +class SqlDialect(StrEnum): + OTHER = '' + MSSQL = 'mssql' + SYBASE = 'sybase' + + +@dataclass(frozen=True) +class Query: + sql: Sql + args: SqlArgs = field(default_factory=list) + + +PartialQuery = Query # incomplete query + + +@dataclass(frozen=True) +class TableQuery: + '''collection of queries for a single table''' + table_name: str + + columns: List[str] + '''columns specified in the select-rule (unless "all" is used), which can + be used instead of querying the columns using `get_column_sql`''' + + data_query: Query + rule_count_queries: Dict[RuleId, Query] + select_rule_id: RuleId + _select_query: Query + + +OrgTableQueries = Dict[OrgName, Dict[TableName, TableQuery]] + + +def ident(x: str) -> str: + '''make a sanitized/quoted sql identifier + + :raises ParseError: + ''' + # Double-quotes should be used as the delimiter for column-name + # identifiers. (https://stackoverflow.com/a/2901499) + # + # It should be enough to simply disallow double-quotes in the name. 
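+    # e.g. ident('siteID') gives '"siteID"', preserving the mixed-case ODM
+    # column name in any sql dialect; names containing a double-quote are
+    # rejected below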
+ if '"' in x: + raise ParseError('the following column-name contains double-quotes, ' + + f'which is not allowed: \'{x}\'') + return dqt(x) + + +def convert(val: str) -> str: + '''convert to sql equivalent value''' + norm_val = val.lower() + if norm_val == 'true': + return '1' + elif norm_val == 'false': + return '0' + else: + return val + + +def gen_data_sql( + node: Node, + args: SqlArgs, + rule_queries: Dict[RuleId, PartialQuery], +) -> Sql: + '''recursive helper function of ``gen_data_query`` + + :param node: should be a table-node in the outer call for a complete query + to be generated, but it'll also work on any node in the table-subtree. + :param args: (output) sql arguments + :param rule_queries: (output) see ``gen_data_query`` + + :raises ParseError: + ''' + + def recurse(node: Node) -> str: + return gen_data_sql(node, args, rule_queries) + + def record(node: Node, sql: str, args: SqlArgs) -> None: + '''associate (partial) sql query with node's rule''' + assert node.kind in [NodeKind.TABLE, NodeKind.FILTER, NodeKind.GROUP] + rule_queries[node.rule_id] = PartialQuery(sql=sql, args=args) + + n = node + if n.kind == NodeKind.TABLE: + # table has a select followed by an optional filter/group + table = n.str_val + select = n.sons[0] + filter_root = n.sons[1] if len(n.sons) > 1 else None + select_sql = f'SELECT {recurse(select)} FROM {ident(table)}' + filter_sql = f'WHERE {recurse(filter_root)}' if filter_root else '' + record(n, select_sql, []) + return (select_sql + ' ' + filter_sql) + elif n.kind == NodeKind.SELECT: + # select has an all-value or fields as children + if n.str_val == ALL_LIT: + return '*' + else: + columns = seq(n.sons).map(lambda x: ident(x.str_val)) + return ','.join(columns) + elif n.kind == NodeKind.GROUP: + # group combines any number of children with its operator + op_str = n.str_val.upper() + assert op_str in ['AND', 'OR'] + arg_start = len(args) + sql = seq(n.sons)\ + .map(recurse)\ + .reduce(lambda x, y: f'({x} {op_str} {y})') + record(n, sql, args[arg_start:]) + return sql + elif n.kind == NodeKind.FILTER: + # filter has op as value, children define field, kind and literals + op = parse_op(n.str_val) + key_ident = recurse(n.sons[0]) + + def gen_range_sql(range_kind: RangeKind, values: List[str]) -> str: + '''generates sql for a range of values''' + if range_kind == RangeKind.INTERVAL: + a = values[0] + b = values[1] + return f'({key_ident} BETWEEN {a} AND {b})' + elif range_kind == RangeKind.SET: + return f"({key_ident} IN ({','.join(values)}))" + else: + assert False, 'unreachable' + + if op == Op.RANGE: + range_kind = RangeKind(n.sons[1].str_val) + literals: List[str] = seq(n.sons[2:]).map(recurse).list() + sql = gen_range_sql(range_kind, literals) + record(n, sql, args[-len(literals):]) + return sql + else: + val = recurse(n.sons[1]) + sql = f'({key_ident} {op} {val})' + record(n, sql, args[-1:]) + return sql + elif n.kind == NodeKind.FIELD: + return ident(n.str_val) + elif n.kind == NodeKind.LITERAL: + # literal value is converted and added to list of args, while only a + # parameter placeholder is added to the sql + val = convert(n.str_val) + args.append(val) + return '?' 
+    else:
+        assert False, 'unreachable'
+        return ''


+def gen_data_query(
+    table_node: Node,
+    rule_queries: Dict[RuleId, PartialQuery],
+) -> Query:
+    '''generates sql from a node-tree
+
+    :param table_node: the root-node of the tree to generate sql from
+    :param rule_queries: (output) partial filter and select queries
+
+    :return: complete query
+
+    :raises ParseError:
+    '''
+    args: List[str] = []
+    sql = gen_data_sql(table_node, args, rule_queries)
+    return Query(sql=sql, args=args)
+
+
+def get_table_node_columns(table_node: Node) -> List[str]:
+    '''returns the column-values of a select-node of a table-node'''
+    assert table_node.kind == NodeKind.TABLE
+    select_node = table_node.sons[0]
+    assert select_node.kind == NodeKind.SELECT
+    return seq(select_node.sons).map(lambda x: x.str_val).list()
+
+
+def parse_sql_dialect(dialect_str: str) -> SqlDialect:
+    '''parse dialect str to enum value, or fall back to default'''
+    try:
+        return SqlDialect(dialect_str.lower())
+    except ValueError:
+        return SqlDialect.OTHER
+
+
+def get_share_query(
+    table_node: Node,
+    rule_queries: Dict[RuleId, PartialQuery]
+) -> Query:
+    '''returns the complete query associated with a table-node (belonging to a
+    share-node)'''
+    # if a zero-rule exists (which is an implicit top-level AND-group) then
+    # use it, otherwise, if the table-node has a filter/group-node then use
+    # it, otherwise, use the table-node's select-node
+    if 0 in rule_queries:
+        return rule_queries[0]
+    else:
+        if len(table_node.sons) == 1:
+            select_node = table_node.sons[0]
+            assert select_node.kind == NodeKind.SELECT
+            return rule_queries[select_node.rule_id]
+        else:
+            filter_root = table_node.sons[1]
+            assert filter_root.kind in [NodeKind.FILTER, NodeKind.GROUP]
+            return rule_queries[filter_root.rule_id]
+
+
+def gen_count_query_sql(
+    table: str,
+    rule_id: int,
+    filter_query: PartialQuery,
+) -> Tuple[RuleId, Query]:
+    '''generate count query for table from partial filter query
+
+    :raises ParseError:
+    '''
+    sql = (
+        f'SELECT COUNT(*) FROM {ident(table)}' +
+        (f' WHERE {filter_query.sql}' if filter_query.sql else '')
+    )
+    return (rule_id, Query(sql=sql, args=filter_query.args))
+
+
+def gen_table_query(share_node: Node, table_node: Node) -> TableQuery:
+    '''generates a table-query for a specific table node of a share node
+
+    :raises ParseError:
+    '''
+    assert share_node.kind == NodeKind.SHARE
+    assert table_node.kind == NodeKind.TABLE
+    assert table_node in share_node.sons
+
+    # generate complete sql and partial rule queries
+    rule_queries: Dict[RuleId, PartialQuery] = {}
+    data_query = gen_data_query(table_node, rule_queries)
+
+    select_node = table_node.sons[0]
+    assert select_node.kind == NodeKind.SELECT
+    select_id = select_node.rule_id
+
+    # save select query and replace with an empty partial query so that we'll
+    # get a count-query without a filter for the select rule
+    select_query = rule_queries[select_id]
+    no_filter_query = PartialQuery(sql='')
+    rule_queries[select_id] = no_filter_query
+
+    # add share-rule to partial queries so that it'll be included in the count
+    share_id = share_node.rule_id
+    assert share_id not in rule_queries
+    rule_queries[share_id] = get_share_query(table_node, rule_queries)
+
+    # generate count queries, making sure they're sorted to retain rule
+    # order for the user
+    gen_count_query_sql2 = partial(gen_count_query_sql, table_node.str_val)
+    count_queries = sorted_dict(
+        seq(rule_queries.items()).smap(gen_count_query_sql2).dict())
+
+    return TableQuery(
+        table_name=table_node.str_val,
+        data_query=data_query,
+        rule_count_queries=count_queries,
+        select_rule_id=select_id,
+        columns=get_table_node_columns(table_node),
+        _select_query=select_query,
+    )
+
+
+def generate(rule_tree: RuleTree) -> OrgTableQueries:
+    '''generate queries from a rule tree
+
+    :param rule_tree: the tree to generate queries from
+
+    :return: query-objects for each org and table
+
+    :raises ParseError:
+    '''
+
+    def gen_table_query_entry(share_node: Node, table_node: Node
+                              ) -> Tuple[TableName, TableQuery]:
+        table_name = table_node.str_val
+        table_query = gen_table_query(share_node, table_node)
+        return (table_name, table_query)
+
+    result: OrgTableQueries = defaultdict(dict)
+    root: Node = rule_tree
+    assert root.kind == NodeKind.ROOT
+    for share in root.sons:
+        assert share.kind == NodeKind.SHARE
+        org = share.str_val
+        table_nodes = seq(share.sons)
+        gen_table_query_entry2 = partial(gen_table_query_entry, share)
+        result[org] = table_nodes.map(gen_table_query_entry2).dict()
+    return result
+
+
+def gen_column_sql(select_sql: Sql, dialect: SqlDialect) -> Sql:
+    '''returns an sql statement that selects zero rows of data, but gives the
+    column names'''
+    if dialect in [SqlDialect.MSSQL, SqlDialect.SYBASE]:
+        return 'SELECT TOP 0 ' + select_sql[len('SELECT '):]
+    else:
+        return select_sql + ' LIMIT 0'
+
+
+def get_column_sql(q: TableQuery, dialect: SqlDialect) -> Sql:
+    '''returns sql for querying actual columns, for when columns are not
+    pre-specified'''
+    # XXX: this query is generated on demand due to dependency on sql dialect,
+    # which we want to keep separate from query generation to keep modularity
+    return gen_column_sql(q._select_query.sql, dialect)
diff --git a/src/odm_sharing/private/rules.py b/src/odm_sharing/private/rules.py
new file mode 100644
index 00000000..f5b9ba7d
--- /dev/null
+++ b/src/odm_sharing/private/rules.py
@@ -0,0 +1,258 @@
+import sys
+from io import IOBase
+
+from dataclasses import dataclass, field
+from enum import EnumMeta
+from typing import Any, Dict, List, Union
+
+import pandas as pd
+from functional import seq
+
+from odm_sharing.private.stdext import StrValueEnum
+from odm_sharing.private.utils import fmt_set, get_filename, qt
+
+RuleId = int
+SchemaFile = IOBase
+SchemaPath = str
+
+
+class RuleMode(StrValueEnum):
+    SELECT = 'select'
+    FILTER = 'filter'
+    GROUP = 'group'
+    SHARE = 'share'
+
+
+class SchemaCtx:
+    '''Keeps track of the current state of the parsing process.
This object + should be created at the beginning of the parsing process and its fields + updated throughout.''' + filename: str + row_ix: int # current row being parsed + column: str # current field being parsed + + def __init__(self, filename: str) -> None: + self.filename = filename + self.row_ix = 0 + self.column = '' + + @property + def line_num(self) -> int: + '''line number of current row being parsed''' + return self.row_ix + 1 + + +@dataclass(frozen=True) +class Rule: + '''A rule mapped from a sharing schema row''' + id: int # aka ruleID + table: str + mode: RuleMode + key: str = field(default='') + operator: str = field(default='') + value: str = field(default='') + + +class ParseError(Exception): + pass + + +RULE_ID = 'ruleID' + +HEADERS = [ + RULE_ID, + 'table', + 'mode', + 'key', + 'operator', + 'value', + 'notes', +] + +FILTER_OPERATORS = set([ + '<', + '<=', + '=', + '>', + '>=', + 'in', +]) + +ALL_MODES = set(RuleMode) +GROUP_OPERATORS = set(['and', 'or']) +RULE_FIELD_TYPES = Rule.__annotations__ +RULE_FIELDS = set(RULE_FIELD_TYPES.keys()) +HEADER_LIST_STR = ','.join(HEADERS) +TABLE_MODES = [RuleMode.SELECT, RuleMode.FILTER] + + +def gen_error(ctx: SchemaCtx, desc: str) -> ParseError: + '''returns a ParseError''' + col = f', col: {qt(ctx.column)}' if ctx.column else '' + msg = f'{ctx.filename}(ln: {ctx.line_num}{col}): {desc}' + print('Error: ' + msg, file=sys.stderr) + return ParseError(msg) + + +def fail(ctx: SchemaCtx, desc: str) -> None: + '''raises a ParseError''' + raise gen_error(ctx, desc) + + +def coerce_value( # type: ignore + ctx: SchemaCtx, + type_class, + value: str +) -> Any: + '''converts a value from string to the specified type, using the type class + (aka. class-constructor) for that type. + + :param type_class: str, int, MyEnum, etc. 
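+
+    e.g. coerce_value(ctx, RuleMode, 'select') would give RuleMode.SELECT,
+    while coerce_value(ctx, int, 'abc') would raise a ParseError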
+ :raises ParseError: + ''' + try: + typed_val = type_class(value) + return typed_val + except ValueError: + + def get_expected(type_class) -> str: # type: ignore + if type(type_class) is EnumMeta: + return 'one of ' + fmt_set(list(type_class)) + else: + return type_class.__name__ + + expected = get_expected(type_class) + fail(ctx, f'got {qt(value)}, expected {expected}') + + +def init_rule(ctx: SchemaCtx, schema_row: dict) -> Rule: + '''constructs a rule from a schema row, or raises list of ParseError(s)''' + + def get_field_name(column: str) -> str: + return ('id' if column == RULE_ID else column) + + def init_default_rule() -> Rule: + # XXX: `mode` doesn't have a default value, but it'll be overwritten + return Rule(id=0, table='', mode=RuleMode.SELECT) + + rule = init_default_rule() + errors: List[ParseError] = [] + for column in HEADERS: + if column == 'notes': + continue + val = schema_row[column] + ctx.column = column + field = get_field_name(column) + type_class = RULE_FIELD_TYPES[field] + try: + typed_val = coerce_value(ctx, type_class, val) + object.__setattr__(rule, field, typed_val) + except ParseError as e: + errors.append(e) + if errors: + raise ParseError(errors) + return rule + + +def validate_headers(ctx: SchemaCtx, schema_headers: List[str]) -> None: + '''validates schema headers, or raises ParseError''' + expected = set(HEADERS) + actual = set(schema_headers) + missing = expected - actual + if missing: + msg = f'missing headers: {", ".join(missing)}' + fail(ctx, msg) + + +def validate_rule(ctx: SchemaCtx, rule: Rule) -> None: + '''checks that the rule's values are valid according to its mode, or raises + list of ParseError(s)''' + errors: List[ParseError] = [] + + def err(msg: str) -> None: + errors.append(gen_error(ctx, msg)) + + def check_required(ctx: SchemaCtx, val: str, mode: RuleMode, + modes: Union[set, list]) -> None: + # XXX: empty filter string value must be allowed + has = bool(val) or (mode == RuleMode.FILTER and val == '') + should_have = mode in modes + if has and not should_have: + err(f'{ctx.column} must be empty/NA for mode {qt(mode)}') + elif not has and should_have: + err(f'{ctx.column} required for modes {fmt_set(modes)}') + + def check_set(ctx: SchemaCtx, actual: str, expected: Union[set, list] + ) -> None: + if actual not in expected: + err(f'got {qt(actual)}, expected one of {fmt_set(expected)}') + + ctx.column = RULE_ID + if rule.id <= 0: + err(f'{ctx.column} must be greater than zero') + + ctx.column = 'table' + check_required(ctx, rule.table, rule.mode, TABLE_MODES) + + ctx.column = 'key' + check_required(ctx, rule.key, rule.mode, + [RuleMode.FILTER, RuleMode.SHARE]) + + ctx.column = 'operator' + check_required(ctx, rule.operator, rule.mode, + [RuleMode.FILTER, RuleMode.GROUP]) + if rule.operator: + if rule.mode == RuleMode.FILTER: + check_set(ctx, rule.operator, FILTER_OPERATORS) + elif rule.mode == RuleMode.GROUP: + check_set(ctx, rule.operator.lower(), GROUP_OPERATORS) + + ctx.column = 'value' + check_required(ctx, rule.value, rule.mode, ALL_MODES) + + if errors: + raise ParseError(errors) + + +def load(schema: Union[SchemaPath, SchemaFile]) -> Dict[RuleId, Rule]: + '''loads a sharing schema + + :param schema: file path/object + + :returns: rules parsed from schema, by rule id + :raises OSError, ParseError: + ''' + filename = get_filename(schema) + ctx = SchemaCtx(filename) + data = pd.read_csv(schema) # type: ignore + + # replace all different NA values with an empty string + data = data.fillna('') + + # trim column names to avoid silly 
str-compare issues (#33) + data.columns = seq(data.columns).map(str.strip).list() + + # XXX: loading is aborted on header errors since row-parsing depends on it + validate_headers(ctx, data.columns.to_list()) + + # iterate dataset and parse each row into a sharing rule obj + result: Dict[RuleId, Rule] = {} + errors: List[ParseError] = [] + for i, row in enumerate(data.itertuples(index=False)): + ctx.row_ix = i + 1 + try: + + # type-checkers can't handle dicts with values of multiple types + row_dict = row._asdict() # type: ignore + + rule = init_rule(ctx, row_dict) + if rule.id in result: + ctx.column = RULE_ID + fail(ctx, f'rule with id {rule.id} already exists') + validate_rule(ctx, rule) + result[rule.id] = rule + except ParseError as e: + errors.append(e) + if errors: + raise ParseError(errors) + return result diff --git a/src/odm_sharing/private/stdext.py b/src/odm_sharing/private/stdext.py new file mode 100644 index 00000000..ab51aedb --- /dev/null +++ b/src/odm_sharing/private/stdext.py @@ -0,0 +1,26 @@ +'''python standard library extensions''' + +from enum import Enum + + +class StrEnum(str, Enum): + '''shim for python < 3.11 + + Gives the enum's assigned string value when converted to string, which is + useful for printing the value or comparing it with another string. + + See https://docs.python.org/3.11/library/enum.html#enum.StrEnum + ''' + def __str__(self) -> str: + return str(self.value) + + +class StrValueEnum(StrEnum): + '''extension of `StrEnum` that makes it look cleaner with pprint''' + def __repr__(self) -> str: + return self.value + + +def sorted_dict(d: dict) -> dict: + '''sorts a dict so that the keys are in sorted order''' + return {k: d[k] for k in sorted(d.keys())} diff --git a/src/odm_sharing/private/trees.py b/src/odm_sharing/private/trees.py new file mode 100644 index 00000000..fb8a64d6 --- /dev/null +++ b/src/odm_sharing/private/trees.py @@ -0,0 +1,522 @@ +'''see docs/trees-algo.md''' + +import sys +# from overloading import overload +from collections import defaultdict +from dataclasses import dataclass, field +from functools import partial +from typing import Dict, List, Optional, Set, Union, cast +# from pprint import pprint + +from functional import seq + +from odm_sharing.private.common import TableName +from odm_sharing.private.stdext import StrEnum +from odm_sharing.private.utils import fmt_set, not_empty, qt +from odm_sharing.private.rules import ( + ParseError, + Rule, + RuleId, + RuleMode, + TABLE_MODES, +) + + +# {{{1 types + + +class Op(StrEnum): + AND = 'and' + EQ = '=' + GT = '>' + GTE = '>=' + LT = '<' + LTE = '<=' + OR = 'or' + RANGE = 'in' + + +class NodeKind(StrEnum): + ROOT = 'root' + SHARE = 'share' + TABLE = 'table' + SELECT = 'select' + GROUP = 'group' + FILTER = 'filter' + FIELD = 'field' + LITERAL = 'literal' + RANGE_KIND = 'range-kind' + + +class RangeKind(StrEnum): + INTERVAL = 'interval' + SET = 'set' + + +@dataclass(frozen=True) +class Node: + rule_id: RuleId + kind: NodeKind + str_val: str = field(default_factory=str) + sons: list = field(default_factory=list) + + @staticmethod + def _get_repr(node, depth: int = 0) -> str: # type: ignore + result = (' ' * depth) + str(node) + '\n' + for child in node.sons: + result += Node._get_repr(child, depth+1) + return result + + def __repr__(self) -> str: + return Node._get_repr(self) + + def __str__(self) -> str: + return f'({self.rule_id}, {self.kind}, {qt(self.str_val)})' + + +RuleTree = Node # alias for a complete node tree + + +class Ctx: + '''Keeps track of the current parsing process. 
This object should be + created at the beginning of the parsing process and its fields updated + throughout''' + filename: str # filename reference for error messages + rule_tables: Dict[RuleId, List[str]] # rule-table mapping + nodes: Dict[RuleId, Node] # collection of nodes + root: Optional[Node] # current tree root + rule_id: RuleId # current node's rule-id + + def __init__(self, filename: str) -> None: + self.filename = filename + self.rule_tables = defaultdict(list) + self.nodes = {} + self.root = None + self.rule_id = 0 + + +# {{{1 constants + + +ALL_LIT = "all" +VAL_SEP = ";" +INTERVAL_SEP = ":" +ALL_SEPARATORS = set([VAL_SEP, INTERVAL_SEP]) + + +# {{{1 error gen + + +def gen_error(ctx: Ctx, desc: str) -> ParseError: + loc = f'(id: {ctx.rule_id})' if ctx.rule_id else '' + msg = f'{ctx.filename}{loc}: {desc}' + print('Error: ' + msg, file=sys.stderr) + return ParseError(msg) + + +def fail(ctx: Ctx, desc: str) -> None: + raise gen_error(ctx, desc) + + +# {{{1 input text parsing + + +def parse_list( + ctx: Ctx, + val_str: str, + min: int = 0, + max: int = 0, + sep: str = VAL_SEP, +) -> List[str]: + '''splits a multiple-value string into a list, and validates the number of + elements + + :param val_str: the string to parse values from + :param min: min required number of elements, or zero + :param max: max required number of elements, or zero + :param sep: the value separator + + :raises ParseError + ''' + result = seq(val_str.split(sep))\ + .map(str.strip)\ + .filter(not_empty)\ + .list() + n = len(result) + + has_constraint = min > 0 or max > 0 + absolute = min == max + no_max = max == 0 + in_range = min <= n <= max + + if has_constraint: + if absolute: + if n != min: + fail(ctx, f'expected {min} values, got {n}') + elif no_max: + if n < min: + fail(ctx, f'expected at least {min} values, got {n}') + elif not in_range: + fail(ctx, f'expected {min}-{max} values, got {n}') + + return result + + +def parse_int_list(ctx: Ctx, val_str: str, + min: int = 0, max: int = 0) -> List[int]: + '''splits a multiple-value string into a list of ints, or raises + ParseError. 
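+    e.g. parse_int_list(ctx, '1; 2; 3', min=2) would give [1, 2, 3].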
See `parse_list`.'''
+    int_strings = parse_list(ctx, val_str, min, max)
+    result = [0] * len(int_strings)
+    for i, int_str in enumerate(int_strings):
+        try:
+            result[i] = int(int_str)
+        except ValueError:
+            fail(ctx, f'invalid integer {qt(int_str)} (#{i}) in value list')
+    return result
+
+
+def parse_mode(ctx: Ctx, mode_str: str) -> RuleMode:
+    '''converts str to mode enum, or raises ParseError'''
+    try:
+        return RuleMode(mode_str.lower())
+    except ValueError:
+        raise gen_error(ctx, f'invalid mode {qt(mode_str)}')
+
+
+def parse_op(op_str: str) -> Op:
+    '''converts str to operator enum'''
+    return Op(op_str.lower())
+
+
+def parse_ctx_op(ctx: Ctx, op_str: str) -> Op:
+    '''converts str to operator enum, or raises ParseError'''
+    try:
+        return parse_op(op_str)
+    except ValueError:
+        raise gen_error(ctx, f'invalid operator {qt(op_str)}')
+
+
+# {{{1 ast gen
+
+
+def is_filter_for_table(ctx: Ctx, table: str, node: Node) -> bool:
+    '''checks if the node is of kind filter/group and if its children
+    reference the table'''
+    assert table != ALL_LIT
+    if node.kind == NodeKind.FILTER:
+        return (table in ctx.rule_tables[node.rule_id])
+    elif node.kind == NodeKind.GROUP:
+        if node.sons:
+            return is_filter_for_table(ctx, table, node.sons[0])
+    return False
+
+
+def get_table_select_ids(ctx: Ctx, select_nodes: List[Node]
+                         ) -> Dict[TableName, RuleId]:
+    '''returns mapping between tables and their select-rule ids, or raises
+    ParseError'''
+    # enforces only one select per table
+    result: Dict[TableName, RuleId] = {}
+    for node in select_nodes:
+        assert node.kind == NodeKind.SELECT
+        id = node.rule_id
+        select_tables = ctx.rule_tables[id]
+        for table in select_tables:
+            if table in result:
+                orig_id = result[table]
+                fail(ctx, f'select-rule {id}\'s table {qt(table)} ' +
+                     f'is already used by select-rule {orig_id}')
+            result[table] = id
+    return result
+
+
+def to_literal_node(rule_id: int, val: str) -> Node:
+    '''init literal-node with value'''
+    return Node(rule_id=rule_id, kind=NodeKind.LITERAL, str_val=val.strip())
+
+
+def to_literal_nodes(rule_id: RuleId, values: List[str]) -> List[Node]:
+    '''init literal-nodes from list of values'''
+    to_literal_node2 = partial(to_literal_node, rule_id)
+    return seq(values).map(to_literal_node2).list()
+
+
+def get_filter_root(ctx: Ctx, table: str, nodes: List[Node]) -> Optional[Node]:
+    '''returns the root of the filter node tree for the table'''
+    is_filter_for_table2 = partial(is_filter_for_table, ctx, table)
+    filter_nodes = seq(nodes).filter(is_filter_for_table2).list()
+    n = len(filter_nodes)
+    if n == 0:
+        return None
+    elif n == 1:
+        return filter_nodes[0]
+    else:
+        return Node(
+            rule_id=0,
+            kind=NodeKind.GROUP,
+            str_val=Op.AND.value,
+            sons=filter_nodes,
+        )
+
+
+def get_node(ctx: Ctx, rule_id: RuleId) -> Node:
+    '''returns node generated from the rule id, or raises ParseError'''
+    assert rule_id
+    try:
+        return ctx.nodes[rule_id]
+    except KeyError:
+        msg = (f'missing rule {rule_id}. 
' + + 'Hint: Rules must be declared before they are referenced.') + raise gen_error(ctx, msg) + + +def parse_filter_values(ctx: Ctx, op: Op, is_interval: bool, val_str: str + ) -> List[str]: + ''' + :raises ParseError: + ''' + if op == Op.RANGE: + if is_interval: + return parse_list(ctx, val_str, 2, 2, INTERVAL_SEP) + else: # is set + return parse_list(ctx, val_str, min=1) + else: + if seq(ALL_SEPARATORS).map(lambda x: x in val_str).any(): + msg = ('multiple values ' + + f'(using separators {fmt_set(ALL_SEPARATORS)}) ' + + f'are only allowed with operator {qt(Op.RANGE)}') + fail(ctx, msg) + return [val_str] + + +def filter_is_interval(op: Op, val_str: str) -> bool: + return op == Op.RANGE and INTERVAL_SEP in val_str + + +def init_node(ctx: Ctx, rule_id: RuleId, mode: RuleMode, key: str, op_str: str, + val_str: str) -> Node: + '''initializes and returns a new node from rule attributes, or raises + ParseError''' + get_ctx_node = partial(get_node, ctx) + + if mode == RuleMode.SELECT: + values = parse_list(ctx, val_str, 1) + use_all = ALL_LIT in values + to_literal_node2 = partial(to_literal_node, rule_id) + return Node( + rule_id=rule_id, + kind=NodeKind.SELECT, + str_val=(ALL_LIT if use_all else ''), + sons=([] if use_all else seq(values).map(to_literal_node2).list()), + ) + elif mode == RuleMode.FILTER: + + def init_range_kind_node(is_interval: bool) -> Node: + kind = RangeKind.INTERVAL if is_interval else RangeKind.SET + return Node( + rule_id=rule_id, + kind=NodeKind.RANGE_KIND, + str_val=kind.value + ) + + # XXX: range-kind node is added before literals for range operator + # XXX: a set with a single element isn't required to have a separator + op = parse_ctx_op(ctx, op_str) + is_interval = filter_is_interval(op, val_str) + values = parse_filter_values(ctx, op, is_interval, val_str) + field_node = Node(rule_id=rule_id, kind=NodeKind.FIELD, str_val=key) + literal_nodes = to_literal_nodes(rule_id, values) + sons = ( + [field_node] + + ([init_range_kind_node(is_interval)] if op == Op.RANGE else []) + + literal_nodes + ) + return Node( + rule_id=rule_id, + kind=NodeKind.FILTER, + str_val=op_str, + sons=sons, + ) + elif mode == RuleMode.GROUP: + + def not_filter_group(node: Node) -> bool: + return node.kind not in [NodeKind.FILTER, NodeKind.GROUP] + + op = parse_ctx_op(ctx, op_str) + if op not in [Op.AND, Op.OR]: + fail(ctx, 'incompatible group operator') + ids = parse_int_list(ctx, val_str, min=2) + sons = seq(ids).map(get_ctx_node).list() + if seq(sons).map(not_filter_group).any(): + fail(ctx, 'group-rules can only refer to other filter/group-rules') + return Node( + rule_id=rule_id, + kind=NodeKind.GROUP, + str_val=op.value, + sons=sons, + ) + elif mode == RuleMode.SHARE: + + def is_select(node: Node) -> bool: + return node.kind == NodeKind.SELECT + + def init_table_node(share_value_nodes: List[Node], + table: str, select_id: int) -> Node: + assert select_id + select_node = get_ctx_node(select_id) + filter_root = get_filter_root(ctx, table, share_value_nodes) + return Node( + rule_id=select_id, + kind=NodeKind.TABLE, + str_val=table, + sons=([select_node] + ([filter_root] if filter_root else [])), + ) + + def init_share_node(rule_id: int, table_nodes: List[Node], org: str + ) -> Node: + return Node( + rule_id=rule_id, + kind=NodeKind.SHARE, + str_val=org, + sons=table_nodes, + ) + + orgs = parse_list(ctx, key, min=1) + ids = parse_int_list(ctx, val_str, min=1) + + # the nodes the user wants to share with the specified orgs + share_value_nodes = seq(ids).map(get_ctx_node).list() + + 
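+        # e.g. (hypothetical rule ids) a share-rule for org 'OHRI' over
+        # select-rule 1 on table 'samples' plus filter-rule 2 produces:
+        #
+        #     (share 'OHRI') -> (table 'samples') -> [(select), (filter)]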
select_nodes = seq(share_value_nodes).filter(is_select).list() + table_select_ids = get_table_select_ids(ctx, select_nodes) + + init_table_node2 = partial(init_table_node, share_value_nodes) + table_nodes = seq(table_select_ids.items())\ + .smap(init_table_node2)\ + .list() + + init_share_node2 = partial(init_share_node, rule_id, table_nodes) + share_nodes = seq(orgs).map(init_share_node2).list() + + root_node_candidate = Node( + rule_id=0, + kind=NodeKind.ROOT, + sons=share_nodes + ) + return root_node_candidate + else: + assert False, 'not all cases covered' + + +def add_node(ctx: Ctx, rule_id: RuleId, table: str, mode: RuleMode, + key: str, op_str: str, val_str: str) -> None: + '''parses a rule into a tree node, and adds it to the context object + + :param ctx: the context object to update + :param rule_id: rule id + :param table: rule table, required for select/filter rules, otherwise empty + :param mode: rule mode + :param key: rule key + :param op_str: rule operator + :param val_str: rule value + + :raises ParseError: + ''' + # record rule table for later lookup + if table: + ctx.rule_tables[rule_id].append(table) + + def get_or_add_node(rule_id: RuleId) -> Node: + # node may already have been added in the context of another table, in + # that case we can reuse it + if rule_id in ctx.nodes: + return ctx.nodes[rule_id] + else: + node = init_node(ctx, rule_id, mode, key, op_str, val_str) + ctx.nodes[rule_id] = node + return node + + node = get_or_add_node(rule_id) + + # assign initial root or merge with existing root + if node.kind == NodeKind.ROOT: + if not ctx.root: + ctx.root = node + else: + ctx.root = Node( + rule_id=0, + kind=NodeKind.ROOT, + sons=(ctx.root.sons+node.sons) + ) + + +def validate_schema(ctx: Ctx, rules: List[Rule]) -> None: + '''checks that the required rules are present in the schema + + :raises ParseError''' + if not seq(rules).filter(lambda x: x.mode == RuleMode.SHARE).any(): + fail(ctx, 'no share-rules in schema') + if not seq(rules).filter(lambda x: x.mode == RuleMode.SELECT).any(): + fail(ctx, 'no select-rules in schema') + + +def filter_rule_orgs(ctx: Ctx, rule: Rule, org_whitelist: Set[str]) -> Rule: + '''return a new rule with orgs filtered using whitelist''' + assert rule.mode == RuleMode.SHARE + rule_orgs = set(parse_list(ctx, rule.key, 1)) + new_orgs = rule_orgs & org_whitelist + orgs_str = VAL_SEP.join(new_orgs) + return Rule(id=rule.id, table='', mode=RuleMode.SHARE, key=orgs_str, + value=rule.value) + + +def parse(rules: Union[Dict[RuleId, Rule], List[Rule]], + orgs: List[str] = [], filename: str = '') -> RuleTree: + '''parses rules into an abstract syntax tree + + :param rules: collection of rules to be parsed + :param orgs: list of organization names to include, or an empty list for + all orgs + :param filename: the filename of the schema the rules were loaded from. 
+ Only used as context in error messages + + :raises ParseError: + + :return: an opaque rule-tree object for query generation + ''' + ctx = Ctx(filename) + + if isinstance(rules, dict): + rules = list(rules.values()) + org_whitelist = set(orgs) + orgs_in_schema = seq(rules)\ + .filter(lambda r: r.mode == RuleMode.SHARE)\ + .map(lambda r: r.key)\ + .set() + + if not (org_whitelist <= orgs_in_schema): + invalid = org_whitelist - orgs_in_schema + fail(ctx, f'the specified orgs {fmt_set(invalid)} are not part of ' + + f'any share-rule in the schema {fmt_set(orgs_in_schema)}.') + + # make sure schema has the required (share and select) rules + validate_schema(ctx, rules) + + for rule in rules: + ctx.rule_id = rule.id + + # remove non-whitelisted orgs, skip if no orgs left + if orgs and rule.mode == RuleMode.SHARE: + rule = filter_rule_orgs(ctx, rule, org_whitelist) + if not rule.key: + continue + + min_tables = 1 if rule.mode in TABLE_MODES else 0 + tables = parse_list(ctx, rule.table, min_tables) or [''] + for table in tables: + add_node(ctx, rule.id, table, rule.mode, rule.key, rule.operator, + rule.value) + + assert ctx.root + return cast(RuleTree, ctx.root) diff --git a/src/odm_sharing/private/utils.py b/src/odm_sharing/private/utils.py new file mode 100644 index 00000000..f99cfaee --- /dev/null +++ b/src/odm_sharing/private/utils.py @@ -0,0 +1,37 @@ +from io import IOBase +from pathlib import Path +from typing import Iterable, Union + + +def qt(x: str) -> str: + '''quote `x`''' + return f"'{x}'" + + +def dqt(x: str) -> str: + '''double-quote `x`''' + return f"\"{x}\"" + + +def not_empty(x: Union[list, set, str]) -> bool: + return len(x) > 0 + + +def fmt_set(values: Iterable) -> str: + '''returns a comma-separated string of the items in `values`, surrounded by + curly-brackets''' + items = ', '.join(values) + return f'{{{items}}}' + + +def gen_output_filename(input_name: str, schema_name: str, org: str, + table: str, ext: str) -> str: + parts = (([input_name] if input_name else []) + + [schema_name, org] + + ([table] if table else [])) + return '-'.join(parts) + f'.{ext}' + + +def get_filename(file: Union[str, IOBase]) -> str: + '''returns the path filename, or a dummy name for file objects''' + return Path(file).name if isinstance(file, str) else 'file-obj' diff --git a/src/odm_sharing/sharing.py b/src/odm_sharing/sharing.py new file mode 100644 index 00000000..b820d2dd --- /dev/null +++ b/src/odm_sharing/sharing.py @@ -0,0 +1,187 @@ +from io import IOBase +from typing import Dict, List, Tuple, Union + +import numpy as np +import pandas +from functional import seq + +import odm_sharing.private.cons as cons +import odm_sharing.private.queries as queries +import odm_sharing.private.rules as rules +import odm_sharing.private.trees as trees +from odm_sharing.private.common import ColumnName, OrgName, TableName, F, T +from odm_sharing.private.cons import Connection, CsvFile +from odm_sharing.private.queries import OrgTableQueries, Query, TableQuery +from odm_sharing.private.rules import RuleId +from odm_sharing.private.utils import get_filename, qt + + +def parse(schema_file: Union[str, IOBase], + orgs: List[str] = []) -> OrgTableQueries: + '''loads and parses a schema file into query objects + + :param schema_file: schema file path/object + :param orgs: organization whitelist, disabled if empty + + :return: a query per table per org. `OrgName` and `TableName` are + strings. 
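+
+    e.g. with a hypothetical schema, parse('schema.csv')['OHRI']['samples']
+    would be the query object for org 'OHRI' and table 'samples'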
:rtype: Dict[OrgName, Dict[TableName, TableQuery]]
+
+    :raises OSError: if the schema file can't be loaded
+    :raises ParseError: if the schema parsing fails
+    '''
+    ruleset = rules.load(schema_file)
+    filename = get_filename(schema_file)
+    tree = trees.parse(ruleset, orgs, filename)
+    return queries.generate(tree)
+
+
+def connect(
+    data_sources: Union[str, List[str], List[CsvFile]],
+    tables: List[str] = [],
+) -> Connection:
+    '''
+    creates a connection to a data source that later can be used with a query
+    to retrieve data
+
+    Warning: Even though using a database as input is supported, it hasn't
+    been tested properly.
+
+    :param data_sources: filepath, database URL, list of CSV filepaths, or list
+        of [CsvFiles](#odm_sharing.sharing.CsvFile).
+    :param tables: table name whitelist, disabled if empty
+
+    :return: the data source connection object
+
+    :raises DataSourceError: if the connection couldn't be established
+    '''
+    # normalize single str/path input as list
+    if not isinstance(data_sources, list):
+        data_sources = [data_sources]
+    return cons.connect(data_sources, set(tables))
+
+
+def _check_con_query(c: Connection, tq: TableQuery) -> None:
+    if tq.table_name not in c.tables:
+        msg = f'table {qt(tq.table_name)} is missing from input'
+        raise cons.DataSourceError(msg)
+
+
+def get_data(c: Connection, tq: TableQuery) -> pandas.DataFrame:
+    '''retrieves filtered data from a specific table of a data source
+
+    Warning: Boolean values from CSV/Excel files will be normalized as
+    `TRUE`/`FALSE`.
+
+    :param c: the data source connection
+    :param tq: the table query
+
+    :return: the resulting (filtered) dataset
+
+    :raises DataSourceError: if an error occurred while retrieving data
+    '''
+
+    _check_con_query(c, tq)
+    dq = tq.data_query
+    df = cons.exec(c, dq.sql, dq.args)
+
+    # At this point bool values are 0/1, so we have to convert them to
+    # FALSE/TRUE (which is the ODM standard).
+    #
+    # XXX: selected columns are only a subset of all columns, and may not even
+    # include any of the previously found bool columns
+    for col in c.bool_cols[tq.table_name]:
+        if col not in df:
+            continue
+        kind = df[col].dtype
+        if kind == object:  # potentially str
+            df[col] = df[col].replace({'0': F, '1': T})
+        elif kind == np.int64:
+            df[col] = df[col].astype(str).replace({'0': F, '1': T})
+        elif kind == np.float64:
+            df[col] = df[col].astype(str).replace(
+                {'nan': '', '0': F, '1': T, '0.0': F, '1.0': T})
+        else:
+            assert False, f'invalid bool type {kind}'
+
+    # normalize None/empty to empty string
+    #
+    # XXX: bool columns may end up storing empty values as NULL in the
+    # database, which in turn are extracted as None
+    for series in df:
+        if df[series].dtype == object:  # str
+            df[series] = df[series].replace({None: ''})
+
+    return df
+
+
+def get_counts(c: Connection, tq: TableQuery) -> Dict[RuleId, int]:
+    '''gives the row count of the query for each rule
+
+    :param c: connection
+    :param tq: table query
+
+    :return: the row count for each rule. `RuleId` is an integer.
+
+    :raises DataSourceError: if an error occurred while counting rows
+    '''
+    def get_rule_count(rule_id: RuleId, q: Query) -> Tuple[RuleId, int]:
+        count = int(cons.exec(c, q.sql, q.args).iat[0, 0])
+        return (rule_id, count)
+
+    _check_con_query(c, tq)
+    return seq(tq.rule_count_queries.items()).smap(get_rule_count).dict()
+
+
+def get_columns(c: Connection, tq: TableQuery
+                ) -> Tuple[RuleId, List[ColumnName]]:
+    '''gives the column names of a query
+
+    :param c: connection
+    :param tq: table query
+
+    :return: the select-rule's ID, and the list of column names
+    associated with it. `RuleId` is an integer, and `ColumnName` is a string.
+
+    :raises DataSourceError: if an error occurred while retrieving the column
+    names
+    '''
+    _check_con_query(c, tq)
+    if tq.columns:
+        return (tq.select_rule_id, tq.columns)
+    else:
+        dialect = queries.parse_sql_dialect(cons.get_dialect_name(c))
+        sql = queries.get_column_sql(tq, dialect)
+        columns = cons.exec(c, sql).columns.array.tolist()
+        return (tq.select_rule_id, columns)
+
+
+def extract(
+    schema_file: Union[str, IOBase],
+    data_sources: Union[str, List[str], List[CsvFile]],
+    orgs: List[str] = [],
+) -> Dict[OrgName, Dict[TableName, pandas.DataFrame]]:
+    '''high-level function for retrieving filtered data
+
+    Warning: Boolean values from CSV/Excel files will be normalized as
+    `TRUE`/`FALSE`.
+
+    :param schema_file: rule schema file path/object
+    :param data_sources: filepath, database URL, list of CSV filepaths, or list
+        of [CsvFiles](#odm_sharing.sharing.CsvFile).
+    :param orgs: organization whitelist, disabled if empty
+
+    :return: a dataset per table per org. `OrgName` and `TableName` are
+        strings.
+
+    :raises DataSourceError: if an error occurred while extracting data from
+        the data source
+    '''
+    con = connect(data_sources)
+    queries = parse(schema_file, orgs)
+    result: Dict[OrgName, Dict[TableName, pandas.DataFrame]] = {}
+    for org, tablequeries in queries.items():
+        result[org] = {}
+        for table, query in tablequeries.items():
+            result[org][table] = get_data(con, query)
+    return result
diff --git a/src/odm_sharing/tools/share.py b/src/odm_sharing/tools/share.py
new file mode 100644
index 00000000..300856be
--- /dev/null
+++ b/src/odm_sharing/tools/share.py
@@ -0,0 +1,284 @@
+import contextlib
+import logging
+import os
+import sys
+from collections import namedtuple
+from enum import Enum
+from os import linesep
+from pathlib import Path
+from typing import Dict, List, Set, TextIO, Union
+from typing_extensions import Annotated
+
+import pandas as pd
+import typer
+from tabulate import tabulate
+from functional import seq
+
+import odm_sharing.sharing as sh
+
+import odm_sharing.private.cons as cons
+import odm_sharing.private.queries as queries
+import odm_sharing.private.rules as rules
+import odm_sharing.private.trees as trees
+from odm_sharing.private.rules import Rule, RuleId, RuleMode
+from odm_sharing.private.utils import gen_output_filename, qt
+
+
+FilePath = namedtuple('FilePath', ['abspath', 'relpath', 'filename'])
+
+
+class OutFmt(str, Enum):
+    '''output format'''
+    AUTO = 'auto'
+    CSV = 'csv'
+    EXCEL = 'excel'
+
+
+SCHEMA_DESC = 'Sharing schema file path.'
+INPUT_DESC = 'Input spreadsheet file-path(s) or SQLAlchemy database-url.'
+
+ORGS_DESC = '''Comma separated list of organizations to share with, defaults to
+all.'''
+
+OUTFMT_DESC = 'Output format.'
+OUTDIR_DESC = 'Output directory.'
+
+DEBUG_DESC = '''Output debug info to STDOUT (and ./debug.txt) instead of
+creating sharable output files. This shows which tables and columns are
+selected, and how many rows each filter returns.'''
+
+QUIET_DESC = 'Don\'t log to STDOUT.'
+LIST_DESC = 'Write output file-paths to STDOUT, separated by newlines.'
+
+# default cli args
+DEBUG_DEFAULT = False
+ORGS_DEFAULT: List[str] = []
+OUTDIR_DEFAULT = './'
+OUTFMT_DEFAULT = OutFmt.AUTO
+QUIET_DEFAULT = False
+LIST_DEFAULT = False
+
+app = typer.Typer(pretty_exceptions_show_locals=False)
+
+
+def error(msg: str) -> None:
+    print(msg, file=sys.stderr)
+    logging.error(msg)
+
+
+def write_line(file: TextIO, text: str = '') -> None:
+    '''writes a line to STDOUT and file'''
+    print(text)
+    file.write(text + linesep)
+
+
+def write_header(file: TextIO, level: int, text: str) -> None:
+    '''writes a markdown header'''
+    write_line(file, ('#'*level) + f' {text}{linesep}')
+
+
+def fmt_rule(r: Rule) -> List[str]:
+    # [id, mode, filter]
+    result = [f'{r.id:>2}', r.mode.value]
+    if r.mode == RuleMode.FILTER:
+        result.append(f'{r.key} {r.operator} ({r.value})')
+    elif r.mode == RuleMode.GROUP:
+        result.append(f'{r.operator:3} ({r.value})')
+    return result
+
+
+def write_debug(
+    file: TextIO,
+    con: cons.Connection,
+    org_name: str,
+    table_name: str,
+    table_query: queries.TableQuery,
+    ruleset: Dict[RuleId, Rule]
+) -> None:
+    '''write debug output'''
+    write_line(file, '')
+    write_header(file, 1, f'org {qt(org_name)} - table {qt(table_name)}')
+
+    write_header(file, 2, 'data sql')
+    write_line(file, table_query.data_query.sql)
+    write_line(file, '')
+
+    (select_id, columns) = sh.get_columns(con, table_query)
+    write_header(file, 2, 'columns')
+    for col in columns:
+        write_line(file, f'- {col}')
+    write_line(file)
+
+    write_header(file, 2, 'counts')
+    counts = sh.get_counts(con, table_query)
+
+    # XXX: the rule with ID 0 is not from the input schema, but is generated
+    # implicitly during schema parsing, so it's not included in this output
+    # table
+    count_table = seq(counts.keys())\
+        .filter(lambda id: id > 0)\
+        .map(lambda id: ruleset[id])\
+        .map(lambda r: (counts[r.id],) + tuple(fmt_rule(r)))\
+        .list()
+
+    headers = ['count', 'id', 'mode', 'filter']
+    write_line(file, tabulate(count_table, headers=headers))
+    write_line(file)
+
+
+def get_tables(org_queries: sh.queries.OrgTableQueries) -> Set[str]:
+    '''returns all table names in the query collection'''
+    result = set()
+    for table_query in org_queries.values():
+        for table in table_query.keys():
+            result.add(table)
+    return result
+
+
+def gen_filepath(outdir: str, input_name: str, schema_name: str, org: str,
+                 table: str, ext: str) -> FilePath:
+    filename = gen_output_filename(input_name, schema_name, org, table, ext)
+    abspath = os.path.join(outdir, filename)
+    relpath = os.path.relpath(abspath, os.getcwd())
+    return FilePath(abspath=abspath, relpath=relpath, filename=filename)
+
+
+def get_debug_writer(debug: bool) -> Union[TextIO, contextlib.nullcontext]:
+    # XXX: this function is only used for brevity with the below `with` clause
+    if debug:
+        return open('debug.txt', 'w')
+    else:
+        return contextlib.nullcontext()
+
+
+def infer_outfmt(inputs: List[str]) -> OutFmt:
+    first = inputs[0]
+    (_, ext) = os.path.splitext(first)
+    if ext == '.csv' and len(inputs) == 1:
+        return OutFmt.CSV
+    return OutFmt.EXCEL
+
+
+def get_output_prefix_from_input(input: str) -> str:
+    # - ignore CSV files since their names may double as table names, which
+    #   are already included in the generated output name
+    # - ignore non-existing files like database URLs
+    if (not input.endswith('.csv') and
os.path.exists(input)): + return Path(input).stem + else: + return '' + + +def share( + schema: str, + inputs: List[str], + orgs: List[str] = ORGS_DEFAULT, + outfmt: OutFmt = OUTFMT_DEFAULT, + outdir: str = OUTDIR_DEFAULT, + debug: bool = DEBUG_DEFAULT, +) -> List[str]: + '''returns list of output files''' + schema_path = schema + schema_filename = Path(schema_path).name + schema_name = Path(schema_path).stem + output_prefix = get_output_prefix_from_input(inputs[0]) + if outfmt == OutFmt.AUTO: + outfmt = infer_outfmt(inputs) + logging.info(f'inferred output format as {outfmt}') + + logging.info(f'loading schema {qt(schema_filename)}') + try: + ruleset = rules.load(schema_path) + ruletree = trees.parse(ruleset, orgs, schema_filename) + org_queries = queries.generate(ruletree) + table_filter = get_tables(org_queries) + except rules.ParseError: + # XXX: error messages are already printed at this point + return [] + + # XXX: only tables found in the schema are considered in the data source + logging.info('connecting...') + con = cons.connect(inputs, table_filter) + + # create outdir + os.makedirs(outdir, exist_ok=True) + + # one debug file per run + output_paths = [] + with get_debug_writer(debug) as debug_file: + for org, table_queries in org_queries.items(): + org_data = {} + for table, tq in table_queries.items(): + assert table in table_filter + if debug: + write_debug(debug_file, con, org, table, tq, ruleset) + else: + org_data[table] = sh.get_data(con, tq) + + # one excel file per org + excel_path = gen_filepath(outdir, output_prefix, schema_name, org, + '', 'xlsx') + excel_file = None + if not debug and outfmt == OutFmt.EXCEL: + excel_file = pd.ExcelWriter(excel_path.abspath, + engine='openpyxl') + logging.info('writing ' + excel_path.relpath) + try: + for table, data in org_data.items(): + if outfmt == OutFmt.CSV: + p = gen_filepath(outdir, output_prefix, schema_name, + org, table, 'csv') + logging.info('writing ' + p.relpath) + data.to_csv(p.abspath, index=False) + output_paths.append(p.relpath) + elif outfmt == OutFmt.EXCEL: + logging.info(f'- {qt(table)}') + data.to_excel(excel_file, sheet_name=table, + index=False) + else: + assert False, f'format {outfmt} not impl' + except IndexError: + # XXX: this is thrown from excel writer when nothing is written + # XXX: no need to return paths since excel file didn't finish + assert outfmt == OutFmt.EXCEL + error('failed to write output, most likely due to empty input') + return [] + finally: + if excel_file: + excel_file.close() + if excel_file: + output_paths.append(excel_path.relpath) + logging.info('done') + return output_paths + + +@app.command() +def main_cli( + schema: str = typer.Argument(default=..., help=SCHEMA_DESC), + inputs: List[str] = typer.Argument(default=..., help=INPUT_DESC), + orgs: List[str] = typer.Option(default=ORGS_DEFAULT, help=ORGS_DESC), + outfmt: OutFmt = typer.Option(default=OUTFMT_DEFAULT, help=OUTFMT_DESC), + outdir: str = typer.Option(default=OUTDIR_DEFAULT, help=OUTDIR_DESC), + debug: Annotated[bool, typer.Option("-d", "--debug", + help=DEBUG_DESC)] = DEBUG_DEFAULT, + quiet: Annotated[bool, typer.Option("-q", "--quiet", + help=QUIET_DESC)] = QUIET_DEFAULT, + list_output: Annotated[bool, typer.Option("-l", "--list", + help=LIST_DESC)] = LIST_DEFAULT, +) -> None: + if not quiet: + logging.basicConfig(stream=sys.stdout, level=logging.DEBUG) + paths = share(schema, inputs, orgs, outfmt, outdir, debug) + if list_output: + cwd = os.getcwd() + relpaths = seq(paths).map(lambda abs: os.path.relpath(abs, cwd)) + 
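+        # the newline-separated list makes the output easy to consume from
+        # shell scripts, e.g. (hypothetical invocation):
+        #     odm-share --list schema.csv data.xlsx | xargs -n1 gzip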
print(linesep.join(relpaths)) + + +def main() -> None: + # runs main_cli + app() + + +if __name__ == '__main__': + main() diff --git a/tests/.gitignore b/tests/.gitignore new file mode 100644 index 00000000..50e50392 --- /dev/null +++ b/tests/.gitignore @@ -0,0 +1 @@ +data.sqlite diff --git a/tests/api/id-schema.csv b/tests/api/id-schema.csv new file mode 100644 index 00000000..c323663c --- /dev/null +++ b/tests/api/id-schema.csv @@ -0,0 +1,3 @@ +ruleID,table,mode,key,operator,value,notes +1,mytable,select,NA,NA,id, +2,NA,share,OHRI,NA,1, diff --git a/tests/api/issue-69/protocols.csv b/tests/api/issue-69/protocols.csv new file mode 100644 index 00000000..d1d9ff97 --- /dev/null +++ b/tests/api/issue-69/protocols.csv @@ -0,0 +1,4 @@ +Protocol.ID +a +b +c diff --git a/tests/api/issue-69/schema-org1-protocols.csv b/tests/api/issue-69/schema-org1-protocols.csv new file mode 100644 index 00000000..d1d9ff97 --- /dev/null +++ b/tests/api/issue-69/schema-org1-protocols.csv @@ -0,0 +1,4 @@ +Protocol.ID +a +b +c diff --git a/tests/api/issue-69/schema.csv b/tests/api/issue-69/schema.csv new file mode 100644 index 00000000..9ee5bc70 --- /dev/null +++ b/tests/api/issue-69/schema.csv @@ -0,0 +1,3 @@ +ruleID,table,mode,key,operator,value,notes +1,protocols,select,,,Protocol.ID, +2,,share,org1,,1, diff --git a/tests/api/str-filter-schema.csv b/tests/api/str-filter-schema.csv new file mode 100644 index 00000000..4e6e9483 --- /dev/null +++ b/tests/api/str-filter-schema.csv @@ -0,0 +1,11 @@ +ruleID,table,mode,key,operator,value,notes +1,mytable,select,NA,NA,str1;str2, +2,mytable,filter,str1,=,a, +3,mytable,filter,str1,=,"a", +4,mytable,filter,str1,<,b, +5,mytable,filter,str2,=,, +6,mytable,filter,str2,=,"", +7,,group,,and,2;3;4, +8,,group,,and,5;6, +9,,group,,or,7;8, +10,NA,share,OHRI,NA,1;9, diff --git a/tests/api/true-schema.csv b/tests/api/true-schema.csv new file mode 100644 index 00000000..b2a58c60 --- /dev/null +++ b/tests/api/true-schema.csv @@ -0,0 +1,4 @@ +ruleID,table,mode,key,operator,value,notes +1,mytable,select,NA,NA,all, +2,mytable,filter,bool3,=,TRUE, +3,NA,share,OHRI,NA,1;2, diff --git a/tests/cli/multi-schema.csv b/tests/cli/multi-schema.csv new file mode 100644 index 00000000..9722e740 --- /dev/null +++ b/tests/cli/multi-schema.csv @@ -0,0 +1,4 @@ +ruleID,table,mode,key,operator,value,notes +1,mytable1,select,NA,NA,x, +2,mytable2,select,NA,NA,x, +3,NA,share,OHRI,NA,1;2, diff --git a/tests/cli/mytable1.csv b/tests/cli/mytable1.csv new file mode 100644 index 00000000..e3a3c2b4 --- /dev/null +++ b/tests/cli/mytable1.csv @@ -0,0 +1,2 @@ +id,x +1,a diff --git a/tests/cli/mytable2.csv b/tests/cli/mytable2.csv new file mode 100644 index 00000000..0ee74356 --- /dev/null +++ b/tests/cli/mytable2.csv @@ -0,0 +1,2 @@ +id,x +2,b diff --git a/tests/common.py b/tests/common.py new file mode 100644 index 00000000..70581f25 --- /dev/null +++ b/tests/common.py @@ -0,0 +1,16 @@ +import unittest +from os.path import abspath, dirname + + +def readfile(path: str) -> str: + with open(path) as f: + return f.read() + + +class OdmTestCase(unittest.TestCase): + @classmethod + def setUpClass(cls) -> None: + cls.maxDiff = None + + def setUp(self) -> None: + self.dir = dirname(abspath(__file__)) diff --git a/tests/common/3.1.1.csv b/tests/common/3.1.1.csv new file mode 100644 index 00000000..fca13c2d --- /dev/null +++ b/tests/common/3.1.1.csv @@ -0,0 +1,3 @@ +ruleID,table,mode,key,operator,value,notes +1,samples,select,NA,NA,saMaterial, +2,NA,share,ohri,NA,1, diff --git a/tests/common/3.1.2.csv b/tests/common/3.1.2.csv 
new file mode 100644 index 00000000..4739a45d --- /dev/null +++ b/tests/common/3.1.2.csv @@ -0,0 +1,3 @@ +ruleID,table,mode,key,operator,value,notes +1,measures,select,NA,NA,reportable;pooled, +2,NA,share,ohri,NA,1, diff --git a/tests/common/3.1.3.csv b/tests/common/3.1.3.csv new file mode 100644 index 00000000..9873caae --- /dev/null +++ b/tests/common/3.1.3.csv @@ -0,0 +1,3 @@ +ruleID,table,mode,key,operator,value,notes +1,measures,select,NA,NA,all, +2,NA,share,ohri,NA,1, diff --git a/tests/common/3.1.4.csv b/tests/common/3.1.4.csv new file mode 100644 index 00000000..83e569bd --- /dev/null +++ b/tests/common/3.1.4.csv @@ -0,0 +1,3 @@ +ruleID,table,mode,key,operator,value,notes +1,measures;samples,select,NA,NA,purposeID, +2,NA,share,ohri,NA,1, diff --git a/tests/common/3.2.csv b/tests/common/3.2.csv new file mode 100644 index 00000000..98cde6f8 --- /dev/null +++ b/tests/common/3.2.csv @@ -0,0 +1,10 @@ +ruleID,table,mode,key,operator,value,notes +1,samples;measures,select,NA,NA,all, +2,samples,filter,siteID,=,ottawa-1, +3,samples,filter,collPer,>=,5, +4,samples,filter,collPer,<=,5, +5,measures,filter,aDateEnd,=,2022-02-01, +6,measures,filter,aDateEnd,in,2022-02-01:2022-02-28, +7,NA,group,NA,AND,2;3;4, +8,NA,group,NA,AND,5;6, +9,NA,share,ohri,NA,1;7;8, diff --git a/tests/common/4.1.csv b/tests/common/4.1.csv new file mode 100644 index 00000000..03d42b49 --- /dev/null +++ b/tests/common/4.1.csv @@ -0,0 +1,6 @@ +ruleID,table,mode,key,operator,value,notes +11,measures,select,NA,NA,all, +12,measures,filter,aDateEnd,=,2022-02-01, +13,measures,filter,aDateEnd,=,2023-02-01, +14,NA,group,NA,OR,12;13, +15,NA,share,ohri,NA,11;14, diff --git a/tests/common/4.3.csv b/tests/common/4.3.csv new file mode 100644 index 00000000..a64998ac --- /dev/null +++ b/tests/common/4.3.csv @@ -0,0 +1,6 @@ +ruleID,table,mode,key,operator,value,notes +11,samples,select,NA,NA,all, +12,samples,filter,siteID,=,ottawa-1, +13,samples,filter,collDT,=,2023-02-01, +14,NA,group,NA,AND,12;13, +15,NA,share,ohri,NA,11;14, diff --git a/tests/common/4.4.csv b/tests/common/4.4.csv new file mode 100644 index 00000000..d2566be7 --- /dev/null +++ b/tests/common/4.4.csv @@ -0,0 +1,10 @@ +ruleID,table,mode,key,operator,value,notes +11,measures,select,NA,NA,measure;value;unit;aggregation, +12,measures,filter,measure,=,mPox, +13,measures,filter,reportDate,in,2021-01-01:2021-12-31, +14,NA,group,NA,AND,12;13, +15,measures,filter,measure,=,cov, +16,measures,filter,reportDate,>=,2020-01-01, +17,NA,group,NA,AND,15;16, +18,NA,group,NA,OR,14;17, +19,NA,share,ohri,NA,11;18, diff --git a/tests/common/5.1.csv b/tests/common/5.1.csv new file mode 100644 index 00000000..416afb39 --- /dev/null +++ b/tests/common/5.1.csv @@ -0,0 +1,11 @@ +ruleID,table,mode,key,operator,value,notes +11,measures,select,NA,NA,all, +12,measures,filter,aDateEnd,=,2022-02-01, +13,measures,filter,aDateEnd,=,2023-02-01, +14,NA,group,NA,OR,12;13, +15,samples,select,NA,NA,all, +16,samples,filter,siteID,=,ottawa-1, +17,samples,filter,siteID,=,laval-1, +18,NA,group,NA,OR,16;17, +31,NA,share,OPH;PHAC,NA,11;14;15, +32,NA,share,LPH,NA,11;14;15;18, diff --git a/tests/common/filter-set.csv b/tests/common/filter-set.csv new file mode 100644 index 00000000..f6e2b9f3 --- /dev/null +++ b/tests/common/filter-set.csv @@ -0,0 +1,4 @@ +ruleID,table,mode,key,operator,value,notes +4,samples,select,,,all, +5,samples,filter,saMaterial,in,rawWW;sweSed, +6,,share,PHAC,,4;5, diff --git a/tests/common/mytable.csv b/tests/common/mytable.csv new file mode 100644 index 00000000..4b883a9a --- /dev/null +++ 
b/tests/common/mytable.csv @@ -0,0 +1,6 @@ +id,bool1,bool2,bool3,bool4,bool5,int1,int2,str1,str2 +1,TRUE,FALSE,,NA,,0,,a, +2,TRUE,TRUE,FALSE,TRUE,,1,NA,b,NA +3,TRUE,,TRUE,TRUE,,2,2,c,c +4,TRUE,,NA,TRUE,,3,3,d,d +5,FALSE,,,FALSE,FALSE,4,4,e,e diff --git a/tests/common/passthrough-schema.csv b/tests/common/passthrough-schema.csv new file mode 100644 index 00000000..eed284b0 --- /dev/null +++ b/tests/common/passthrough-schema.csv @@ -0,0 +1,3 @@ +ruleID,table,mode,key,operator,value,notes +1,mytable,select,NA,NA,all, +2,NA,share,OHRI,NA,1, diff --git a/tests/common/testdata.xlsx b/tests/common/testdata.xlsx new file mode 100644 index 00000000..5a4c5c9a Binary files /dev/null and b/tests/common/testdata.xlsx differ diff --git a/tests/int/delatolla/AUTHORS.md b/tests/int/delatolla/AUTHORS.md new file mode 100644 index 00000000..e1ad526f --- /dev/null +++ b/tests/int/delatolla/AUTHORS.md @@ -0,0 +1,7 @@ +Authors for data.xlsx: + +Mercier, É., D’Aoust, P., Pisharody, L. K., Hegazy, N., Wan, S., Tian, X., +Tomalty, E., Kabir, M. P., Nguyen, T. B., Wong, C. H., Ramsay, N. T., Addo, F., +& Delatolla, R. (2024). The Public Health Environmental Surveillance Database +(PHESD) - Delatolla Lab v2.0.0 (Wastewater_Surveillance). Zenodo. +https://doi.org/10.5281/zenodo.10794558 diff --git a/tests/int/delatolla/data.xlsx b/tests/int/delatolla/data.xlsx new file mode 100644 index 00000000..11743fad Binary files /dev/null and b/tests/int/delatolla/data.xlsx differ diff --git a/tests/int/delatolla/expected/data-schema-org1-measures.csv b/tests/int/delatolla/expected/data-schema-org1-measures.csv new file mode 100644 index 00000000..d003aba5 --- /dev/null +++ b/tests/int/delatolla/expected/data-schema-org1-measures.csv @@ -0,0 +1,13 @@ +sampleID,siteID,aDateEnd,reportDate,measure,value,unit,aggregation,reportable,contactID +o.04.08.20,Ottawa-1,2020-04-08,,covN1,0.000260146,gcPMMoV,meanNr,TRUE,delatollaLab +o.04.08.20,Ottawa-1,2020-04-08,,covN2,0.00012698899999999999,gcPMMoV,meanNr,TRUE,delatollaLab +o.04.08.20,Ottawa-1,2020-04-08,,nPPMoV,27.35,Ct,mean,TRUE,delatollaLab +o.01.28.24,Ottawa-1,2024-01-28,2024-01-29,covN1,0.00039488,gcPMMoV,meanNr,TRUE,delatollaLab +o.01.28.24,Ottawa-1,2024-01-28,2024-01-29,covN2,0.000247411,gcPMMoV,meanNr,TRUE,delatollaLab +o.01.28.24,Ottawa-1,2024-01-28,2024-01-29,nPPMoV,28.37666667,Ct,mean,TRUE,delatollaLab +o.02.02.22,Ottawa-1,2022-02-02,2022-02-03,infA,0.0,gcPMMoV,meanNr,TRUE,delatollaLab +o.02.02.22,Ottawa-1,2022-02-02,2022-02-03,infB,999.0,gcPMMoV,meanNr,TRUE,delatollaLab +o.02.02.22,Ottawa-1,2022-02-02,2022-02-03,rsv,999.0,gcPMMoV,meanNr,TRUE,delatollaLab +o.12.31.23,Ottawa-1,2023-12-31,2024-01-03,infA,0.000148,gcPMMoV,meanNr,TRUE,delatollaLab +o.12.31.23,Ottawa-1,2023-12-31,2024-01-03,infB,4.01e-08,gcPMMoV,meanNr,TRUE,delatollaLab +o.12.31.23,Ottawa-1,2023-12-31,2024-01-03,rsv,2.79e-05,gcPMMoV,meanNr,TRUE,delatollaLab diff --git a/tests/int/delatolla/expected/data-schema-org1-samples.csv b/tests/int/delatolla/expected/data-schema-org1-samples.csv new file mode 100644 index 00000000..e357f05e --- /dev/null +++ b/tests/int/delatolla/expected/data-schema-org1-samples.csv @@ -0,0 +1,16 @@ +sampleID,contactID,siteID,saMaterial,origin,repType,collType,collPer,collNum,collDT,reportable +o.01.05.21,delatollaLab,Ottawa-1,pSludge,field,unique,timePr,24,24,2021-01-05,TRUE +o.01.05.21,delatollaLab,Ottawa-1,pSludge,field,unique,timePr,24,24,2021-01-05,TRUE +o.01.05.21,delatollaLab,Ottawa-1,pSludge,field,unique,timePr,24,24,2021-01-05,TRUE 
+o.01.05.21,delatollaLab,Ottawa-1,pSludge,field,unique,timePr,24,24,2021-01-05,TRUE +o.01.05.21,delatollaLab,Ottawa-1,pSludge,field,unique,timePr,24,24,2021-01-05,TRUE +o.01.24.24,delatollaLab,Ottawa-1,pSludge,field,unique,timePr,24,24,2024-01-24,TRUE +o.01.24.24,delatollaLab,Ottawa-1,pSludge,field,unique,timePr,24,24,2024-01-24,TRUE +o.01.24.24,delatollaLab,Ottawa-1,pSludge,field,unique,timePr,24,24,2024-01-24,TRUE +o.01.24.24,delatollaLab,Ottawa-1,pSludge,field,unique,timePr,24,24,2024-01-24,TRUE +o.01.24.24,delatollaLab,Ottawa-1,pSludge,field,unique,timePr,24,24,2024-01-24,TRUE +o.01.05.21,delatollaLab,Ottawa-1,pSludge,field,unique,timePr,24,24,2021-01-05,TRUE +o.01.24.24,delatollaLab,Ottawa-1,pSludge,field,unique,timePr,24,24,2024-01-24,TRUE +o.01.24.24,delatollaLab,Ottawa-1,pSludge,field,unique,timePr,24,24,2024-01-24,TRUE +o.01.24.24,delatollaLab,Ottawa-1,pSludge,field,unique,timePr,24,24,2024-01-24,TRUE +o.01.24.24,delatollaLab,Ottawa-1,pSludge,field,unique,timePr,24,24,2024-01-24,TRUE diff --git a/tests/int/delatolla/expected/data-schema-org1-sites.csv b/tests/int/delatolla/expected/data-schema-org1-sites.csv new file mode 100644 index 00000000..36bf8fdf --- /dev/null +++ b/tests/int/delatolla/expected/data-schema-org1-sites.csv @@ -0,0 +1,2 @@ +siteID,datasetID,polygonID,siteType,sampleShed,addressID,organizationID,contactID,name,descr,repOrg1,repOrg2,healthRegion,popServ,geoLat,geoLong,geoEPSG,lastEdited +Ottawa-1,Ottawa-1,,wwtp,municp,ropec,Ottawa-1,,Ottawa-ROPEC,"The Robert O. Pickard Environmental Centre is a waste water treatment facility in Ottawa, Ontario, Canada. It provides secondary treatment to about 720,000 people.",Ottawa-1,,Ontario Health Region - East,1100000,45.454147,-75.59248,,2024-02-05 00:00:00.000000 diff --git a/tests/int/delatolla/expected/data-schema-org2-measures.csv b/tests/int/delatolla/expected/data-schema-org2-measures.csv new file mode 100644 index 00000000..04077942 --- /dev/null +++ b/tests/int/delatolla/expected/data-schema-org2-measures.csv @@ -0,0 +1,7 @@ +sampleID,siteID,aDateEnd,reportDate,measure,value,unit,aggregation,reportable,contactID +o.04.08.20,Ottawa-1,2020-04-08,,covN1,0.000260146,gcPMMoV,meanNr,TRUE,delatollaLab +o.04.08.20,Ottawa-1,2020-04-08,,covN2,0.00012698899999999999,gcPMMoV,meanNr,TRUE,delatollaLab +o.04.08.20,Ottawa-1,2020-04-08,,nPPMoV,27.35,Ct,mean,TRUE,delatollaLab +o.01.28.24,Ottawa-1,2024-01-28,2024-01-29,covN1,0.00039488,gcPMMoV,meanNr,TRUE,delatollaLab +o.01.28.24,Ottawa-1,2024-01-28,2024-01-29,covN2,0.000247411,gcPMMoV,meanNr,TRUE,delatollaLab +o.01.28.24,Ottawa-1,2024-01-28,2024-01-29,nPPMoV,28.37666667,Ct,mean,TRUE,delatollaLab diff --git a/tests/int/delatolla/expected/data-schema-org2-samples.csv b/tests/int/delatolla/expected/data-schema-org2-samples.csv new file mode 100644 index 00000000..e357f05e --- /dev/null +++ b/tests/int/delatolla/expected/data-schema-org2-samples.csv @@ -0,0 +1,16 @@ +sampleID,contactID,siteID,saMaterial,origin,repType,collType,collPer,collNum,collDT,reportable +o.01.05.21,delatollaLab,Ottawa-1,pSludge,field,unique,timePr,24,24,2021-01-05,TRUE +o.01.05.21,delatollaLab,Ottawa-1,pSludge,field,unique,timePr,24,24,2021-01-05,TRUE +o.01.05.21,delatollaLab,Ottawa-1,pSludge,field,unique,timePr,24,24,2021-01-05,TRUE +o.01.05.21,delatollaLab,Ottawa-1,pSludge,field,unique,timePr,24,24,2021-01-05,TRUE +o.01.05.21,delatollaLab,Ottawa-1,pSludge,field,unique,timePr,24,24,2021-01-05,TRUE +o.01.24.24,delatollaLab,Ottawa-1,pSludge,field,unique,timePr,24,24,2024-01-24,TRUE 
+o.01.24.24,delatollaLab,Ottawa-1,pSludge,field,unique,timePr,24,24,2024-01-24,TRUE
+o.01.24.24,delatollaLab,Ottawa-1,pSludge,field,unique,timePr,24,24,2024-01-24,TRUE
+o.01.24.24,delatollaLab,Ottawa-1,pSludge,field,unique,timePr,24,24,2024-01-24,TRUE
+o.01.24.24,delatollaLab,Ottawa-1,pSludge,field,unique,timePr,24,24,2024-01-24,TRUE
+o.01.05.21,delatollaLab,Ottawa-1,pSludge,field,unique,timePr,24,24,2021-01-05,TRUE
+o.01.24.24,delatollaLab,Ottawa-1,pSludge,field,unique,timePr,24,24,2024-01-24,TRUE
+o.01.24.24,delatollaLab,Ottawa-1,pSludge,field,unique,timePr,24,24,2024-01-24,TRUE
+o.01.24.24,delatollaLab,Ottawa-1,pSludge,field,unique,timePr,24,24,2024-01-24,TRUE
+o.01.24.24,delatollaLab,Ottawa-1,pSludge,field,unique,timePr,24,24,2024-01-24,TRUE
diff --git a/tests/int/delatolla/expected/data-schema-org2-sites.csv b/tests/int/delatolla/expected/data-schema-org2-sites.csv
new file mode 100644
index 00000000..36bf8fdf
--- /dev/null
+++ b/tests/int/delatolla/expected/data-schema-org2-sites.csv
@@ -0,0 +1,2 @@
+siteID,datasetID,polygonID,siteType,sampleShed,addressID,organizationID,contactID,name,descr,repOrg1,repOrg2,healthRegion,popServ,geoLat,geoLong,geoEPSG,lastEdited
+Ottawa-1,Ottawa-1,,wwtp,municp,ropec,Ottawa-1,,Ottawa-ROPEC,"The Robert O. Pickard Environmental Centre is a waste water treatment facility in Ottawa, Ontario, Canada. It provides secondary treatment to about 720,000 people.",Ottawa-1,,Ontario Health Region - East,1100000,45.454147,-75.59248,,2024-02-05 00:00:00.000000
diff --git a/tests/int/delatolla/schema.csv b/tests/int/delatolla/schema.csv
new file mode 100644
index 00000000..d4f80e74
--- /dev/null
+++ b/tests/int/delatolla/schema.csv
@@ -0,0 +1,16 @@
+ruleID,table,mode,key,operator,value,notes
+1,measures,select,NA,NA,sampleID;siteID;aDateEnd;reportDate;measure;value;unit;aggregation;reportable;contactID,
+2,measures,filter,measure,in,covN1;covN2;nPPMoV,
+3,measures,filter,measure,in,mPox; infA;infB;rsv,
+4,measures,filter,reportable,=,TRUE,
+5,measures,filter,aggregation,in,mean;meanNr,
+6,measures,filter,aDateEnd,in,2020-04-08;2024-01-28,
+7,measures,filter,aDateEnd,in,2022-02-02;2023-12-31,
+8,NA,group,NA,AND,2;6,
+9,NA,group,NA,AND,3;7,
+10,NA,group,NA,OR,8;9,
+11,samples,select,NA,NA,sampleID;contactID;siteID;saMaterial;origin;repType;collType;collPer;collNum;collDT;reportable,
+12,samples,filter,collDT,in,2021-01-05;2024-01-24,
+13,sites,select,NA,NA,siteID;datasetID;polygonID;siteType;sampleShed;addressID;organizationID;contactID;name;descr;repOrg1;repOrg2;healthRegion;popServ;geoLat;geoLong;geoEPSG;lastEdited,
+14,,share,org1,NA,1;4;5;10;11;12;13,
+15,,share,org2,NA,1;4;5;8;11;12;13,
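The integration tests that follow run this exact schema end-to-end through the `odm-share` CLI (roughly `odm-share --outdir out --outfmt csv schema.csv data.xlsx`) and compare the per-organization outputs against the expected CSVs above. The same extraction is also available programmatically; a minimal sketch, based on the `sh.extract()` calls exercised in `tests/test_api.py` below, with illustrative paths:

```python
# A sketch of running the delatolla schema from Python instead of the CLI,
# assuming only the extract() behaviour demonstrated by the unit tests.
import odm_sharing.sharing as sh

results = sh.extract('tests/int/delatolla/schema.csv',
                     'tests/int/delatolla/data.xlsx')
# results are keyed by org (rules 14/15) and then by table
org1_measures = results['org1']['measures']
print(org1_measures.head())
```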
diff --git a/tests/int_tests.py b/tests/int_tests.py
new file mode 100644
index 00000000..e15d2f03
--- /dev/null
+++ b/tests/int_tests.py
@@ -0,0 +1,85 @@
+import os
+import subprocess
+import sys
+import unittest
+from os.path import exists, join
+from pathlib import Path
+from tempfile import TemporaryDirectory
+from typing import List
+
+from odm_sharing.tools.share import OutFmt
+
+from common import OdmTestCase, readfile
+
+
+def _remove_file(path: str) -> None:
+    try:
+        os.remove(path)
+    except FileNotFoundError:
+        pass
+
+
+def run(
+    args: List[str], outdir: str, outfmt: OutFmt = OutFmt.AUTO
+) -> List[str]:
+    res = subprocess.run([
+        'odm-share',
+        '--quiet',
+        '--list',
+        '--outdir', outdir,
+        '--outfmt', outfmt.value,
+    ] + args,
+        capture_output=True)
+    if res.stderr:
+        sys.stderr.buffer.write(res.stderr)
+        raise Exception('failed to run')
+    assert res.returncode == 0
+    assert res.stdout
+    paths = res.stdout.decode().splitlines()
+    return paths
+
+
+class DelatollaIntTests(OdmTestCase):
+    def setUp(self) -> None:
+        super().setUp()
+        self.delatolla_dir = join(self.dir, 'int', 'delatolla')
+        self.dbname = 'data.sqlite'
+        self.dbpath = join(self.delatolla_dir, self.dbname)
+
+        # remove db from previous run
+        _remove_file(self.dbpath)
+
+    def _test_impl(self, data_filename: str) -> None:
+        data_path = join(self.delatolla_dir, data_filename)
+        schema_path = join(self.delatolla_dir, 'schema.csv')
+        with TemporaryDirectory() as tmpdir:
+            paths = run([schema_path, data_path], tmpdir, OutFmt.CSV)
+            for path in paths:
+                actual = readfile(path)
+                fn = Path(path).name
+                expected = readfile(join(self.delatolla_dir, 'expected', fn))
+                self.assertEqual(actual, expected)
+
+    def test(self) -> None:
+
+        # test excel
+        os.environ['ODM_TEMP_DB'] = self.dbpath
+        self._test_impl('data.xlsx')
+
+        # test sqlite using the db generated from the excel file (to save time)
+        self._test_impl(self.dbname)
+
+
+class MiscIntTests(OdmTestCase):
+    def test_outdir_creation(self) -> None:
+        with TemporaryDirectory() as tmpdir:
+            subdir = join(tmpdir, 'mysubdir')
+            schema_path = join(self.dir, 'common', 'passthrough-schema.csv')
+            data_path = join(self.dir, 'common', 'mytable.csv')
+            self.assertFalse(exists(subdir))
+            run([schema_path, data_path], outdir=subdir)
+            self.assertTrue(exists(subdir))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/rules/schema-33.csv b/tests/rules/schema-33.csv
new file mode 100644
index 00000000..04823c9a
--- /dev/null
+++ b/tests/rules/schema-33.csv
@@ -0,0 +1 @@
+ ruleID ,table,mode ,key,operator, value,notes
diff --git a/tests/rules/schema-35.csv b/tests/rules/schema-35.csv
new file mode 100644
index 00000000..1e63fa6a
--- /dev/null
+++ b/tests/rules/schema-35.csv
@@ -0,0 +1,4 @@
+ruleID,table,mode,key,operator,value,notes
+1,mytable,select,NA,NA,all,
+2,mytable,filter,bool3,=,TRUE,
+2,NA,share,OHRI,NA,1;2,
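The two fixtures above encode schema-parsing edge cases: schema-33's header contains stray whitespace, which must be tolerated, while schema-35 repeats `ruleID` 2, which must be rejected up front rather than mis-parsed later. A minimal sketch of the behaviour that `tests/test_rules.py` (further below) asserts for them, with paths written relative to the tests directory:

```python
# A sketch based on the assertions in tests/test_rules.py; the paths are
# illustrative.
import odm_sharing.private.rules as rules

rules.load('rules/schema-33.csv')  # ok: header whitespace is tolerated
try:
    rules.load('rules/schema-35.csv')
except rules.ParseError as err:
    print(err)  # reports that ruleID 2 already exists
```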
diff --git a/tests/test_api.py b/tests/test_api.py
new file mode 100644
index 00000000..ae06cc73
--- /dev/null
+++ b/tests/test_api.py
@@ -0,0 +1,77 @@
+import unittest
+from os.path import join
+
+import odm_sharing.sharing as sh
+from odm_sharing.private.common import F, T
+
+from common import OdmTestCase
+
+
+class TestApi(OdmTestCase):
+    '''test cases related to the whole sharing pipeline'''
+    def setUp(self) -> None:
+        super().setUp()
+
+    def _test_extract(self, data_relpath: str) -> None:
+        schema_path = join(self.dir, 'common', 'passthrough-schema.csv')
+        data_path = join(self.dir, 'common', data_relpath)
+        df = sh.extract(schema_path, data_path)['OHRI']['mytable']
+        self.assertEqual(list(df['bool1']), [T] * 4 + [F])
+        self.assertEqual(list(df['bool2']), [F, T] + [''] * 3)
+        self.assertEqual(list(df['bool3']), ['', F, T, 'NA', ''])
+        self.assertEqual(list(df['bool4']), ['NA'] + [T] * 3 + [F])
+        self.assertEqual(list(df['bool5']), [''] * 4 + [F])
+        self.assertEqual(list(df['int1']), [0, 1, 2, 3, 4])
+        self.assertEqual(list(df['int2']), ['', 'NA', '2', '3', '4'])
+        self.assertEqual(list(df['str1']), ['a', 'b', 'c', 'd', 'e'])
+        self.assertEqual(list(df['str2']), ['', 'NA', 'c', 'd', 'e'])
+
+    def _test_extract_true(self, data_relpath: str) -> None:
+        schema_path = join(self.dir, 'api', 'true-schema.csv')
+        data_path = join(self.dir, 'common', data_relpath)
+        df = sh.extract(schema_path, data_path)['OHRI']['mytable']
+        df = df.set_index('id')
+        actual_rows = list(df.itertuples())
+        self.assertEqual(len(actual_rows), 1)
+        actual = actual_rows[0][1:6]
+        expected = (T, '', T, T, '')  # row 4 (incl. header)
+        self.assertEqual(actual, expected)
+
+    def _test_extract_strict_subset(self, data_relpath: str) -> None:
+        '''test that selecting just a single column works, which is a strict
+        subset of all columns and may be less than the code expects.'''
+        schema_path = join(self.dir, 'api', 'id-schema.csv')
+        data_path = join(self.dir, 'common', data_relpath)
+        sh.extract(schema_path, data_path)['OHRI']['mytable']
+
+    def test_csv(self) -> None:
+        fn = 'mytable.csv'
+        self._test_extract(fn)
+        self._test_extract_true(fn)
+        self._test_extract_strict_subset(fn)
+
+    def test_excel(self) -> None:
+        fn = 'testdata.xlsx'
+        self._test_extract(fn)
+        self._test_extract_true(fn)
+        self._test_extract_strict_subset(fn)
+
+    def test_excel_string_filter(self) -> None:
+        '''tests that '=' and '<' filters work with strings'''
+        schema_path = join(self.dir, 'api', 'str-filter-schema.csv')
+        data_path = join(self.dir, 'common', 'testdata.xlsx')
+        df = sh.extract(schema_path, data_path)['OHRI']['mytable']
+        self.assertEqual(df['str1'].to_list(), ['a'])
+        self.assertEqual(df['str2'].to_list(), [''])
+
+    def test_header_with_dot(self) -> None:
+        HEADER = 'Protocol.ID'
+        dir = join(self.dir, 'api', 'issue-69')
+        res = sh.extract(join(dir, 'schema.csv'), join(dir, 'protocols.csv'))
+        df = res['org1']['protocols']
+        self.assertEqual(df.columns.to_list(), [HEADER])
+        self.assertEqual(df[HEADER].to_list(), ['a', 'b', 'c'])
+
+
+if __name__ == '__main__':
+    unittest.main()
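The assertions above (`set_index`, `itertuples`, `columns.to_list`) show that `extract()` returns plain pandas DataFrames, keyed first by organization and then by table, so callers can post-process results before handing them over. A small sketch, assuming only the behaviour these tests demonstrate (the paths are placeholders):

```python
# Dump every shared table to its own CSV file, one per org and table.
import odm_sharing.sharing as sh

results = sh.extract('schema.csv', 'data.xlsx')
for org, tables in results.items():
    for table, df in tables.items():
        df.to_csv(f'{org}-{table}.csv', index=False)
```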
diff --git a/tests/test_cli.py b/tests/test_cli.py
new file mode 100644
index 00000000..a5ff4423
--- /dev/null
+++ b/tests/test_cli.py
@@ -0,0 +1,67 @@
+import unittest
+from os.path import join
+from tempfile import TemporaryDirectory
+from typing import List
+
+from functional import seq
+from odm_sharing.tools.share import OutFmt, share
+from odm_sharing.private.cons import DataSourceError
+
+from common import OdmTestCase, readfile
+
+
+def share_csv(schema_path: str, data_path: str) -> str:
+    with TemporaryDirectory() as dir:
+        outpaths = share(schema_path, [data_path], outdir=dir)
+        return readfile(outpaths[0])
+
+
+def share_excel(schema_path: str, data_path: str, outfmt: OutFmt) -> str:
+    with TemporaryDirectory() as dir:
+        outpaths = share(schema_path, [data_path], outdir=dir, outfmt=outfmt)
+        return readfile(outpaths[0])
+
+
+class TestCli(OdmTestCase):
+    def test_csv_to_csv(self) -> None:
+        schema_path = join(self.dir, 'common', 'passthrough-schema.csv')
+        data_path = join(self.dir, 'common', 'mytable.csv')
+        src_content = readfile(data_path)
+        dst_content = share_csv(schema_path, data_path)
+        self.assertEqual(src_content, dst_content)
+
+    def test_excel_to_csv(self) -> None:
+        schema_path = join(self.dir, 'common', 'passthrough-schema.csv')
+        data_path = join(self.dir, 'common', 'testdata.xlsx')
+        src_content = readfile(join(self.dir, 'common', 'mytable.csv'))
+        dst_content = share_excel(schema_path, data_path, OutFmt.CSV)
+        self.assertEqual(src_content, dst_content)
+
+    def _multi_impl(self, schema_path: str, inputs: List[str], outdir: str):
+        outpaths = share(schema_path, inputs, outdir=outdir, outfmt=OutFmt.CSV)
+        actual = (''.join(seq(outpaths).map(readfile))).splitlines()
+        expected = ['x', 'a', 'x', 'b']
+        self.assertEqual(actual, expected)
+
+    def test_multi_csv(self) -> None:
+        schema_path = join(self.dir, 'cli', 'multi-schema.csv')
+        data_paths = [
+            join(self.dir, 'cli', 'mytable1.csv'),
+            join(self.dir, 'cli', 'mytable2.csv'),
+        ]
+        with TemporaryDirectory() as dir:
+            self._multi_impl(schema_path, data_paths, dir)
+
+    def test_multi_csv_missing(self) -> None:
+        schema_path = join(self.dir, 'cli', 'multi-schema.csv')
+        data_paths = [
+            join(self.dir, 'cli', 'mytable1.csv'),
+        ]
+        with TemporaryDirectory() as dir:
+            expr = '.*mytable2.*missing'
+            with self.assertRaisesRegex(DataSourceError, expr):
+                self._multi_impl(schema_path, data_paths, dir)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/test_cons.py b/tests/test_cons.py
new file mode 100644
index 00000000..7e78454c
--- /dev/null
+++ b/tests/test_cons.py
@@ -0,0 +1,45 @@
+import unittest
+
+import odm_sharing.private.cons as cons
+
+from common import OdmTestCase
+
+
+class TestCons(OdmTestCase):
+    def test_detect_sqlalchemy(self) -> None:
+        valid_urls = [
+            'postgresql+pg8000://dbuser:kx%40jj5%2Fg@pghost10/appdb',
+            'postgresql://scott:tiger@localhost/mydatabase',
+            'postgresql+psycopg2://scott:tiger@localhost/mydatabase',
+            'postgresql+pg8000://scott:tiger@localhost/mydatabase',
+            'mysql://scott:tiger@localhost/foo',
+            'mysql+mysqldb://scott:tiger@localhost/foo',
+            'mysql+pymysql://scott:tiger@localhost/foo',
+            'oracle://scott:tiger@127.0.0.1:1521/sidname',
+            'oracle+cx_oracle://scott:tiger@tnsname',
+            'mssql+pyodbc://scott:tiger@mydsn',
+            'mssql+pymssql://scott:tiger@hostname:8080/dbname',
+            'sqlite:///foo.db',
+            'sqlite:////absolute/path/to/foo.db',
+            'sqlite:///C:\\path\\to\\foo.db',
+            r'sqlite:///C:\path\to\foo.db',
+            'sqlite://',
+        ]
+        for path in valid_urls:
+            self.assertTrue(cons._detect_sqlalchemy(path))
+
+        invalid_urls = [
+            'myfile.db',
+            'x:/',
+            '',
+        ]
+        for path in invalid_urls:
+            self.assertFalse(cons._detect_sqlalchemy(path))
+
+    def test_input_mix_not_allowed(self) -> None:
+        with self.assertRaisesRegex(cons.DataSourceError, 'mix.*type'):
+            cons.connect(['a.csv', 'b.xlsx'])
+
+
+if __name__ == '__main__':
+    unittest.main()
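`test_detect_sqlalchemy()` suggests that data sources may be given as SQLAlchemy database URLs as well as file paths. A hedged sketch; that `share()` accepts such a URL in place of a file path is an assumption drawn from these detection tests, and the URL and schema path are placeholders:

```python
from odm_sharing.tools.share import OutFmt, share

# assumed: a SQLAlchemy URL can stand in for a data file path
outpaths = share('schema.csv', ['sqlite:///data.sqlite'],
                 outdir='out', outfmt=OutFmt.CSV)
print(outpaths)  # one output file per org and table
```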
AND ("reportDate" >= ?))' + ) + + ')' + ) + self.assertEqual(actual_sql, expected_sql) + + actual_args = q.data_query.args + expected_args = ['mPox', '2021-01-01', '2021-12-31', + 'cov', '2020-01-01'] + self.assertEqual(actual_args, expected_args) + + def test_in_set(self) -> None: + ruleset = self.get_ruleset('filter-set.csv') + ruletree = trees.parse(ruleset) + q = queries.generate(ruletree)['PHAC']['samples'] + actual_sql = q.data_query.sql + expected_sql = ( + 'SELECT * ' + + 'FROM "samples" ' + + 'WHERE ("saMaterial" IN (?,?))' + ) + self.assertEqual(actual_sql, expected_sql) + + actual_args = q.data_query.args + expected_args = ['rawWW', 'sweSed'] + self.assertEqual(actual_args, expected_args) + + def test_rule_count_args(self) -> None: + ruleset = [ + Rule(id=1, table='t', mode=RuleMode.SELECT, value='all'), + Rule(id=2, table='t', mode=RuleMode.FILTER, key='x', + operator='=', value='a'), + Rule(id=3, table='t', mode=RuleMode.FILTER, key='y', + operator='in', value='1;2'), + Rule(id=4, table='', mode=RuleMode.SHARE, key='ohri', + value='1;2;3'), + ] + ruletree = trees.parse(ruleset) + q = queries.generate(ruletree)['ohri']['t'] + implicit_group_id = 0 + actual = q.rule_count_queries[implicit_group_id].args + expected = ['a', '1', '2'] + self.assertEqual(actual, expected) + + def test_share_table_rule_count_queries(self) -> None: + '''tests that each table of a share rule gets the right count-query''' + ruleset = [ + Rule(id=1, table='a;b', mode=RuleMode.SELECT, value='all'), + Rule(id=2, table='a', mode=RuleMode.FILTER, key='x', + operator='=', value='1'), + Rule(id=3, table='b', mode=RuleMode.FILTER, key='y', + operator='=', value='1'), + Rule(id=4, table='', mode=RuleMode.SHARE, key='ohri', + value='1;2;3'), + ] + share_id = ruleset[-1].id + ruletree = trees.parse(ruleset) + q1 = queries.generate(ruletree)['ohri']['a'] + q2 = queries.generate(ruletree)['ohri']['b'] + expected1 = 'SELECT COUNT(*) FROM "a" WHERE ("x" = ?)' + expected2 = 'SELECT COUNT(*) FROM "b" WHERE ("y" = ?)' + actual1 = q1.rule_count_queries[share_id].sql + actual2 = q2.rule_count_queries[share_id].sql + self.assertEqual(actual1, expected1) + self.assertEqual(actual2, expected2) + + def test_sanitize(self) -> None: + '''double-quotes are not allowed in identifiers and parameter values + are separated, to prevent injections''' + injection = '" OR 1=1 --' + ruleset = [ + Rule(id=1, table='t', mode=RuleMode.SELECT, value=injection), + Rule(id=2, table='t', mode=RuleMode.FILTER, key=injection, + operator='=', value=injection), + Rule(id=3, table='', mode=RuleMode.SHARE, key='ohri', value='1;2'), + ] + ruletree = trees.parse(ruleset) + with self.assertRaisesRegex(rules.ParseError, 'quote.*not allowed'): + queries.generate(ruletree) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/test_rules.py b/tests/test_rules.py new file mode 100644 index 00000000..ff01b658 --- /dev/null +++ b/tests/test_rules.py @@ -0,0 +1,42 @@ +import unittest +from os.path import join + +import odm_sharing.private.rules as rules +from odm_sharing.private.rules import Rule, RuleMode, SchemaCtx, init_rule + +from common import OdmTestCase + + +class TestRules(OdmTestCase): + def setUp(self) -> None: + super().setUp() + self.ctx = SchemaCtx('test') + + def test_init_rule(self) -> None: + schema_row = { + 'ruleID': '1', + 'table': 'mytable', + 'mode': 'filter', + 'key': 'x', + 'operator': '=', + 'value': '2', + 'notes': '', + } + assert list(schema_row.keys()) == rules.HEADERS + actual = init_rule(self.ctx, schema_row) + 
diff --git a/tests/test_rules.py b/tests/test_rules.py
new file mode 100644
index 00000000..ff01b658
--- /dev/null
+++ b/tests/test_rules.py
@@ -0,0 +1,42 @@
+import unittest
+from os.path import join
+
+import odm_sharing.private.rules as rules
+from odm_sharing.private.rules import Rule, RuleMode, SchemaCtx, init_rule
+
+from common import OdmTestCase
+
+
+class TestRules(OdmTestCase):
+    def setUp(self) -> None:
+        super().setUp()
+        self.ctx = SchemaCtx('test')
+
+    def test_init_rule(self) -> None:
+        schema_row = {
+            'ruleID': '1',
+            'table': 'mytable',
+            'mode': 'filter',
+            'key': 'x',
+            'operator': '=',
+            'value': '2',
+            'notes': '',
+        }
+        assert list(schema_row.keys()) == rules.HEADERS
+        actual = init_rule(self.ctx, schema_row)
+        expected = Rule(id=1, table='mytable', mode=RuleMode.FILTER,
+                        key='x', operator='=', value='2')
+        self.assertEqual(actual, expected)
+
+    def test_dup_ruleid_error(self) -> None:
+        # trees.parse may throw a misleading error if rules.load doesn't
+        # check for duplicate rule-ids
+        with self.assertRaisesRegex(rules.ParseError, "already exists"):
+            rules.load(join(self.dir, 'rules', 'schema-35.csv'))
+
+    def test_header_whitespace_allowed(self) -> None:
+        rules.load(join(self.dir, 'rules', 'schema-33.csv'))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/test_trees.py b/tests/test_trees.py
new file mode 100644
index 00000000..238273c1
--- /dev/null
+++ b/tests/test_trees.py
@@ -0,0 +1,302 @@
+import unittest +from os.path import join +from typing import List +# from pprint import pprint + +import odm_sharing.private.trees as trees +from odm_sharing.private.rules import ParseError, Rule, RuleMode, load +from odm_sharing.private.trees import Op, parse + +from common import OdmTestCase + + +class TestParseList(OdmTestCase): + def setUp(self) -> None: + super().setUp() + self.ctx = trees.Ctx('test') + + def test_no_constraint(self) -> None: + trees.parse_list(self.ctx, 'a') + trees.parse_list(self.ctx, 'a;b') + + def test_absolute(self) -> None: + trees.parse_list(self.ctx, 'a', 1, 1) + trees.parse_list(self.ctx, 'a;b', 2, 2) + with self.assertRaises(ParseError): + trees.parse_list(self.ctx, 'a;b;c', 2, 2) + + def test_min_only(self) -> None: + trees.parse_list(self.ctx, 'a', 1) + trees.parse_list(self.ctx, 'a;b', 1) + with self.assertRaises(ParseError): + trees.parse_list(self.ctx, 'a', 2) + + def test_not_in_range(self) -> None: + trees.parse_list(self.ctx, 'a', 1, 2) + trees.parse_list(self.ctx, 'a;b', 2, 3) + trees.parse_list(self.ctx, 'a;b;c', 2, 3) + with self.assertRaises(ParseError): + trees.parse_list(self.ctx, '', 1, 2) + with self.assertRaises(ParseError): + trees.parse_list(self.ctx, 'a;b', 3, 10) + + def test_set_with_one_element(self) -> None: + actual = trees.parse_list(self.ctx, 'a;', 1) + expected = ['a'] + self.assertEqual(actual, expected) + + +class TestParseFilterValues(OdmTestCase): + def setUp(self) -> None: + super().setUp() + self.ctx = trees.Ctx('test') + + def parse(self, op: Op, val_str: str) -> List[str]: + is_interval = trees.filter_is_interval(op, val_str) + return trees.parse_filter_values(self.ctx, op, is_interval, val_str) + + def test_eq(self) -> None: + actual = self.parse(Op.EQ, 'a') + expected = ['a'] + self.assertEqual(actual, expected) + with self.assertRaises(ParseError): + self.parse(Op.EQ, 'a;b') + + def test_in(self) -> None: + actual = self.parse(Op.RANGE, 'a:b') + expected = ['a', 'b'] + self.assertEqual(actual, expected) + with self.assertRaises(ParseError): + self.parse(Op.EQ, 'a;b') + + +class TestParse(OdmTestCase): + def get_actual(self, schema_fn: str) -> str: + rules = load(join(self.dir, 'common', schema_fn)) + tree = parse(rules) + return tree.__repr__() + + def test_simple(self) -> None: + rules = [ + Rule(id=1, table='t', mode=RuleMode.SELECT, value='all'), + Rule(id=2, table='t', mode=RuleMode.FILTER, key='a', operator='=', + value='x'), + Rule(id=3, table='', mode=RuleMode.SHARE, key='OHRI', value='1;2'), + ] + tree = parse(rules, filename='test') + actual = tree.__repr__() + expected = '''(0, root, '') + (3, share, 'OHRI') + (1, table, 't') + (1, select, 'all') + (2, filter, '=') + (2, field, 'a') + (2, literal, 'x') +''' + self.assertEqual(actual, expected) + + def test_select_one(self) -> None: + actual = 
self.get_actual('3.1.1.csv') + expected = '''(0, root, '') + (2, share, 'ohri') + (1, table, 'samples') + (1, select, '') + (1, literal, 'saMaterial') +''' + self.assertEqual(actual, expected, actual) + + def test_select_two(self) -> None: + actual = self.get_actual('3.1.2.csv') + expected = '''(0, root, '') + (2, share, 'ohri') + (1, table, 'measures') + (1, select, '') + (1, literal, 'reportable') + (1, literal, 'pooled') +''' + self.assertEqual(actual, expected, actual) + + def test_select_all(self) -> None: + actual = self.get_actual('3.1.3.csv') + expected = '''(0, root, '') + (2, share, 'ohri') + (1, table, 'measures') + (1, select, 'all') +''' + self.assertEqual(actual, expected, actual) + + def test_select_multiple_tables(self) -> None: + actual = self.get_actual('3.1.4.csv') + expected = '''(0, root, '') + (2, share, 'ohri') + (1, table, 'measures') + (1, select, '') + (1, literal, 'purposeID') + (1, table, 'samples') + (1, select, '') + (1, literal, 'purposeID') +''' + self.assertEqual(actual, expected, actual) + + def test_filter(self) -> None: + actual = self.get_actual('3.2.csv') + expected = '''(0, root, '') + (9, share, 'ohri') + (1, table, 'samples') + (1, select, 'all') + (7, group, 'and') + (2, filter, '=') + (2, field, 'siteID') + (2, literal, 'ottawa-1') + (3, filter, '>=') + (3, field, 'collPer') + (3, literal, '5') + (4, filter, '<=') + (4, field, 'collPer') + (4, literal, '5') + (1, table, 'measures') + (1, select, 'all') + (8, group, 'and') + (5, filter, '=') + (5, field, 'aDateEnd') + (5, literal, '2022-02-01') + (6, filter, 'in') + (6, field, 'aDateEnd') + (6, range-kind, 'interval') + (6, literal, '2022-02-01') + (6, literal, '2022-02-28') +''' + self.assertEqual(actual, expected) + + def test_group_or(self) -> None: + actual = self.get_actual('4.1.csv') + expected = '''(0, root, '') + (15, share, 'ohri') + (11, table, 'measures') + (11, select, 'all') + (14, group, 'or') + (12, filter, '=') + (12, field, 'aDateEnd') + (12, literal, '2022-02-01') + (13, filter, '=') + (13, field, 'aDateEnd') + (13, literal, '2023-02-01') +''' + self.assertEqual(actual, expected) + + def test_group_and(self) -> None: + actual = self.get_actual('4.3.csv') + expected = '''(0, root, '') + (15, share, 'ohri') + (11, table, 'samples') + (11, select, 'all') + (14, group, 'and') + (12, filter, '=') + (12, field, 'siteID') + (12, literal, 'ottawa-1') + (13, filter, '=') + (13, field, 'collDT') + (13, literal, '2023-02-01') +''' + self.assertEqual(actual, expected) + + def test_group_or_and(self) -> None: + actual = self.get_actual('4.4.csv') + expected = '''(0, root, '') + (19, share, 'ohri') + (11, table, 'measures') + (11, select, '') + (11, literal, 'measure') + (11, literal, 'value') + (11, literal, 'unit') + (11, literal, 'aggregation') + (18, group, 'or') + (14, group, 'and') + (12, filter, '=') + (12, field, 'measure') + (12, literal, 'mPox') + (13, filter, 'in') + (13, field, 'reportDate') + (13, range-kind, 'interval') + (13, literal, '2021-01-01') + (13, literal, '2021-12-31') + (17, group, 'and') + (15, filter, '=') + (15, field, 'measure') + (15, literal, 'cov') + (16, filter, '>=') + (16, field, 'reportDate') + (16, literal, '2020-01-01') +''' + self.assertEqual(actual, expected) + + def test_share_multi(self) -> None: + actual = self.get_actual('5.1.csv') + expected = '''(0, root, '') + (31, share, 'OPH') + (11, table, 'measures') + (11, select, 'all') + (14, group, 'or') + (12, filter, '=') + (12, field, 'aDateEnd') + (12, literal, '2022-02-01') + (13, filter, '=') + (13, 
field, 'aDateEnd') + (13, literal, '2023-02-01') + (15, table, 'samples') + (15, select, 'all') + (31, share, 'PHAC') + (11, table, 'measures') + (11, select, 'all') + (14, group, 'or') + (12, filter, '=') + (12, field, 'aDateEnd') + (12, literal, '2022-02-01') + (13, filter, '=') + (13, field, 'aDateEnd') + (13, literal, '2023-02-01') + (15, table, 'samples') + (15, select, 'all') + (32, share, 'LPH') + (11, table, 'measures') + (11, select, 'all') + (14, group, 'or') + (12, filter, '=') + (12, field, 'aDateEnd') + (12, literal, '2022-02-01') + (13, filter, '=') + (13, field, 'aDateEnd') + (13, literal, '2023-02-01') + (15, table, 'samples') + (15, select, 'all') + (18, group, 'or') + (16, filter, '=') + (16, field, 'siteID') + (16, literal, 'ottawa-1') + (17, filter, '=') + (17, field, 'siteID') + (17, literal, 'laval-1') +''' + self.assertEqual(actual, expected) + + def test_filter_multi_value(self) -> None: + actual = self.get_actual('filter-set.csv') + expected = '''(0, root, '') + (6, share, 'PHAC') + (4, table, 'samples') + (4, select, 'all') + (5, filter, 'in') + (5, field, 'saMaterial') + (5, range-kind, 'set') + (5, literal, 'rawWW') + (5, literal, 'sweSed') +''' + self.assertEqual(actual, expected) + + def test_invalid_org_error(self): + with self.assertRaisesRegex(ParseError, 'org123.*not.*in.*schema'): + trees.parse(rules=[], orgs=['org123']) + + +if __name__ == '__main__': + unittest.main()
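The expected strings compared throughout `tests/test_trees.py` are the repr of the parsed rule tree: one `(ruleID, kind, value)` node per line, with children indented beneath their parent. Printing that repr is the quickest way to sanity-check a sharing schema before running it; a sketch with a placeholder path:

```python
import odm_sharing.private.rules as rules
import odm_sharing.private.trees as trees

tree = trees.parse(rules.load('schema.csv'))
print(tree)  # same nested (ruleID, kind, value) dump the tests compare against
```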