diff --git a/assets/specs/module-functions/schema-1.yml b/assets/specs/module-functions/schema-1.yml index 764a91ae..9a06f3bb 100644 --- a/assets/specs/module-functions/schema-1.yml +++ b/assets/specs/module-functions/schema-1.yml @@ -7,9 +7,11 @@ schema: schema: SiteID: required: true - allowed: - - Ottawa Site - - Montreal Site + anyof: + - allowed: + - Ottawa Site + - Montreal Site + empty: true meta: - ruleID: missing_mandatory_column meta: diff --git a/assets/specs/module-functions/schema-2.yml b/assets/specs/module-functions/schema-2.yml index 14084195..448201a6 100644 --- a/assets/specs/module-functions/schema-2.yml +++ b/assets/specs/module-functions/schema-2.yml @@ -6,12 +6,14 @@ schema: type: dict schema: collection: - allowed: - - comp3h - - comp8h - - flowPr - - comp3 - - comp3dep + anyof: + - allowed: + - comp3h + - comp8h + - flowPr + - comp3 + - comp3dep + empty: true meta: - ruleID: invalid_category meta: diff --git a/assets/specs/module-functions/schema-additions-1.json b/assets/specs/module-functions/schema-additions-1.json index 59350c3d..82b76e29 100644 --- a/assets/specs/module-functions/schema-additions-1.json +++ b/assets/specs/module-functions/schema-additions-1.json @@ -1,7 +1,12 @@ { "Site": { "SiteID": { - "allowed": ["Ottawa Site", "Montreal Site"] + "anyof": [ + { + "allowed": ["Ottawa Site", "Montreal Site"], + "empty": true + } + ] } } } diff --git a/assets/specs/module-functions/schema-additions-2.json b/assets/specs/module-functions/schema-additions-2.json index 56672131..20bf903d 100644 --- a/assets/specs/module-functions/schema-additions-2.json +++ b/assets/specs/module-functions/schema-additions-2.json @@ -1,7 +1,11 @@ { "samples": { "collection": { - "allowed": ["comp3", "comp3dep"] + "anyof": [ + { + "allowed": ["comp3", "comp3dep"] + } + ] } } } diff --git a/assets/validation-rules/empty/dataset-allowed.csv b/assets/validation-rules/empty/dataset-allowed.csv new file mode 100644 index 00000000..30b54c39 --- /dev/null +++ b/assets/validation-rules/empty/dataset-allowed.csv @@ -0,0 +1,6 @@ +siteID,allowedField +1, +2,"" +3," " +4,NA +5,a diff --git a/assets/validation-rules/empty/dataset-minlength.csv b/assets/validation-rules/empty/dataset-minlength.csv new file mode 100644 index 00000000..89cdc473 --- /dev/null +++ b/assets/validation-rules/empty/dataset-minlength.csv @@ -0,0 +1,7 @@ +siteID,minlengthField +1, +2,"" +3," " +4,NA +5,x +6,xxxxx diff --git a/assets/validation-rules/empty/error-report-allowed.json b/assets/validation-rules/empty/error-report-allowed.json new file mode 100644 index 00000000..f67551d7 --- /dev/null +++ b/assets/validation-rules/empty/error-report-allowed.json @@ -0,0 +1,66 @@ +{ + "warnings": [ + { + "columnName": "allowedField", + "invalidValue": "", + "message": "missing_values_found rule triggered in table sites, column allowedField, row(s) 1: Empty string found", + "row": {"allowedField": "", "siteID": "1"}, + "rowNumber": 1, + "tableName": "sites", + "validationRuleFields": [], + "warningType": "missing_values_found" + }, + { + "columnName": "allowedField", + "invalidValue": "", + "message": "missing_values_found rule triggered in table sites, column allowedField, row(s) 2: Empty string found", + "row": {"allowedField": "", "siteID": "2"}, + "rowNumber": 2, + "tableName": "sites", + "validationRuleFields": [], + "warningType": "missing_values_found" + }, + { + "columnName": "allowedField", + "invalidValue": " ", + "message": "missing_values_found rule triggered in table sites, column allowedField, row(s) 3: Missing value \" \"", + "row": {"allowedField": " ", "siteID": "3"}, + "rowNumber": 3, + "tableName": "sites", + "validationRuleFields": [], + "warningType": "missing_values_found" + }, + { + "columnName": "allowedField", + "invalidValue": "NA", + "message": "missing_values_found rule triggered in table sites, column allowedField, row(s) 4: Missing value \"NA\"", + "row": {"allowedField": "NA", "siteID": "4"}, + "rowNumber": 4, + "tableName": "sites", + "validationRuleFields": [], + "warningType": "missing_values_found" + } + ], + "errors": [ + { + "columnName": "allowedField", + "errorType": "invalid_category", + "invalidValue": " ", + "message": "invalid_category rule violated in table sites, column allowedField, row(s) 3: Invalid category \" \"", + "row": {"allowedField": " ", "siteID": "3"}, + "rowNumber": 3, + "tableName": "sites", + "validationRuleFields": [] + }, + { + "columnName": "allowedField", + "errorType": "invalid_category", + "invalidValue": "NA", + "message": "invalid_category rule violated in table sites, column allowedField, row(s) 4: Invalid category \"NA\"", + "row": {"allowedField": "NA", "siteID": "4"}, + "rowNumber": 4, + "tableName": "sites", + "validationRuleFields": [] + } + ] +} diff --git a/assets/validation-rules/empty/error-report-minlength.json b/assets/validation-rules/empty/error-report-minlength.json new file mode 100644 index 00000000..bbbb2a8a --- /dev/null +++ b/assets/validation-rules/empty/error-report-minlength.json @@ -0,0 +1,75 @@ +{ + "warnings": [ + { + "warningType": "missing_values_found", + "tableName": "sites", + "columnName": "minlengthField", + "validationRuleFields": [], + "message": "missing_values_found rule triggered in table sites, column minlengthField, row(s) 1: Empty string found", + "rowNumber": 1, + "row": {"siteID": "1", "minlengthField": ""}, + "invalidValue": "" + }, + { + "warningType": "missing_values_found", + "tableName": "sites", + "columnName": "minlengthField", + "validationRuleFields": [], + "message": "missing_values_found rule triggered in table sites, column minlengthField, row(s) 2: Empty string found", + "rowNumber": 2, + "row": {"siteID": "2", "minlengthField": ""}, "invalidValue": "" + }, + { + "warningType": "missing_values_found", + "tableName": "sites", + "columnName": "minlengthField", + "validationRuleFields": [], + "message": "missing_values_found rule triggered in table sites, column minlengthField, row(s) 3: Missing value \" \"", + "rowNumber": 3, + "row": {"siteID": "3", "minlengthField": " "}, + "invalidValue": " " + }, + { + "warningType": "missing_values_found", + "tableName": "sites", + "columnName": "minlengthField", + "validationRuleFields": [], + "message": "missing_values_found rule triggered in table sites, column minlengthField, row(s) 4: Missing value \"NA\"", + "rowNumber": 4, + "row": {"siteID": "4", "minlengthField": "NA"}, + "invalidValue": "NA" + } + ], + "errors": [ + { + "errorType": "less_than_min_length", + "tableName": "sites", + "columnName": "minlengthField", + "validationRuleFields": [], + "message": "less_than_min_length rule violated in table sites, column minlengthField, row(s) 3: Value \" \" (of length 3) is less than the min length of \"5\"", + "rowNumber": 3, + "row": {"siteID": "3", "minlengthField": " "}, + "invalidValue": " " + }, + { + "errorType": "less_than_min_length", + "tableName": "sites", + "columnName": "minlengthField", + "validationRuleFields": [], + "message": "less_than_min_length rule violated in table sites, column minlengthField, row(s) 4: Value \"NA\" (of length 2) is less than the min length of \"5\"", + "rowNumber": 4, + "row": {"siteID": "4", "minlengthField": "NA"}, + "invalidValue": "NA" + }, + { + "errorType": "less_than_min_length", + "tableName": "sites", + "columnName": "minlengthField", + "validationRuleFields": [], + "message": "less_than_min_length rule violated in table sites, column minlengthField, row(s) 5: Value \"x\" (of length 1) is less than the min length of \"5\"", + "rowNumber" : 5, + "row": {"siteID": "5", "minlengthField": "x"}, + "invalidValue": "x" + } + ] +} diff --git a/assets/validation-rules/empty/schema-v2-allowed.yml b/assets/validation-rules/empty/schema-v2-allowed.yml new file mode 100644 index 00000000..acfcd472 --- /dev/null +++ b/assets/validation-rules/empty/schema-v2-allowed.yml @@ -0,0 +1,16 @@ +schemaVersion: 2.0.0 +schema: + sites: + type: list + schema: + type: dict + schema: + allowedField: + anyof: + - allowed: + - a + - b + empty: true + emptyTrimmed: false + forbidden: + - NA diff --git a/assets/validation-rules/empty/schema-v2-minlength.yml b/assets/validation-rules/empty/schema-v2-minlength.yml new file mode 100644 index 00000000..6a94839f --- /dev/null +++ b/assets/validation-rules/empty/schema-v2-minlength.yml @@ -0,0 +1,14 @@ +schemaVersion: 2.0.0 +schema: + sites: + type: list + schema: + type: dict + schema: + minlengthField: + anyof: + - minlength: 5 + empty: true + emptyTrimmed: false + forbidden: + - NA diff --git a/assets/validation-rules/invalid-category/schema-v1.yml b/assets/validation-rules/invalid-category/schema-v1.yml index 80ac3f4b..7b7db65c 100644 --- a/assets/validation-rules/invalid-category/schema-v1.yml +++ b/assets/validation-rules/invalid-category/schema-v1.yml @@ -7,12 +7,14 @@ schema: type: dict schema: Collection: - allowed: - - Comp3h - - Comp8h - - FlowPr - - FlowRatePr - - other + anyof: + - allowed: + - Comp3h + - Comp8h + - FlowPr + - FlowRatePr + - other + empty: true meta: - ruleID: invalid_category meta: @@ -47,11 +49,13 @@ schema: type: dict schema: type: - allowed: + anyof: + - allowed: - other - someOldCat1 - wwtpMuC - wwtpMuS + empty: true meta: - ruleID: invalid_category meta: @@ -86,10 +90,12 @@ schema: type: dict schema: type: - allowed: - - other - - wqCOD - - wwCOD + anyof: + - allowed: + - other + - wqCOD + - wwCOD + empty: true meta: - ruleID: invalid_category meta: diff --git a/assets/validation-rules/invalid-category/schema-v2.yml b/assets/validation-rules/invalid-category/schema-v2.yml index b1447c21..0390a269 100644 --- a/assets/validation-rules/invalid-category/schema-v2.yml +++ b/assets/validation-rules/invalid-category/schema-v2.yml @@ -6,10 +6,12 @@ schema: type: dict schema: coll: - allowed: - - comp3h - - comp8h - - flowPr + anyof: + - allowed: + - comp3h + - comp8h + - flowPr + empty: True meta: - ruleID: invalid_category meta: diff --git a/assets/validation-rules/invalid-category/valid-dataset-2.csv b/assets/validation-rules/invalid-category/valid-dataset-2.csv index 881689d1..0e9194ac 100644 --- a/assets/validation-rules/invalid-category/valid-dataset-2.csv +++ b/assets/validation-rules/invalid-category/valid-dataset-2.csv @@ -1,3 +1,4 @@ coll NA -"" \ No newline at end of file + +"" diff --git a/assets/validation-rules/less-than-min-length/schema-v1.yml b/assets/validation-rules/less-than-min-length/schema-v1.yml index 5074ea68..f683ce77 100644 --- a/assets/validation-rules/less-than-min-length/schema-v1.yml +++ b/assets/validation-rules/less-than-min-length/schema-v1.yml @@ -6,7 +6,9 @@ schema: type: dict schema: phoneNumber: - minlength: 10 + anyof: + - empty: True + minlength: 10 meta: - ruleID: less_than_min_length meta: diff --git a/assets/validation-rules/less-than-min-length/schema-v2.yml b/assets/validation-rules/less-than-min-length/schema-v2.yml index e61d0841..ddec6c69 100644 --- a/assets/validation-rules/less-than-min-length/schema-v2.yml +++ b/assets/validation-rules/less-than-min-length/schema-v2.yml @@ -6,7 +6,9 @@ schema: type: dict schema: phone: - minlength: 10 + anyof: + - empty: True + minlength: 10 meta: - ruleID: less_than_min_length meta: diff --git a/assets/validation-schemas/schema-v1.0.0.yml b/assets/validation-schemas/schema-v1.0.0.yml index 8ce90f00..48816ce1 100644 --- a/assets/validation-schemas/schema-v1.0.0.yml +++ b/assets/validation-schemas/schema-v1.0.0.yml @@ -111,13 +111,15 @@ schema: ruleID: invalid_type type: string unit: - allowed: - - gcCrA - - gcGs - - gcL - - gcMl - - gcPMMoV - - other + anyof: + - allowed: + - gcCrA + - gcGs + - gcL + - gcMl + - gcPMMoV + - other + empty: true maxlength: 30 meta: - meta: @@ -951,12 +953,14 @@ schema: ruleID: invalid_type type: string type: - allowed: - - atline - - hand - - lab - - online - - other + anyof: + - allowed: + - atline + - hand + - lab + - online + - other + empty: true emptyTrimmed: false forbidden: - NA @@ -1131,6 +1135,9 @@ schema: ruleID: invalid_type type: string contactPhone: + anyof: + - empty: true + minlength: 10 maxlength: 12 meta: - meta: @@ -1154,7 +1161,6 @@ schema: version1Table: Lab version1Variable: contactPhone ruleID: less_than_min_length - minlength: 10 type: string labID: maxlength: 10 @@ -1303,10 +1309,12 @@ schema: type: string unique: true type: - allowed: - - hlthReg - - other - - swrCatSet + anyof: + - allowed: + - hlthReg + - other + - swrCatSet + empty: true emptyTrimmed: false forbidden: - NA @@ -1492,15 +1500,17 @@ schema: version1Table: Sample schema: collection: - allowed: - - cpFP24h - - cpTP24h - - grb - - grbCp3 - - grbCp3h - - grbCp8h - - mooreSw - - other + anyof: + - allowed: + - cpFP24h + - cpTP24h + - grb + - grbCp3 + - grbCp3h + - grbCp8h + - mooreSw + - other + empty: true emptyTrimmed: false forbidden: - NA @@ -1748,17 +1758,19 @@ schema: ruleID: invalid_type type: string type: - allowed: - - hTank - - other - - pEfflu - - pSludge - - pstGrit - - rawWW - - sEfflu - - sSludge - - swrSed - - water + anyof: + - allowed: + - hTank + - other + - pEfflu + - pSludge + - pstGrit + - rawWW + - sEfflu + - sSludge + - swrSed + - water + empty: true emptyTrimmed: false forbidden: - NA @@ -1989,28 +2001,30 @@ schema: ruleID: invalid_type type: string type: - allowed: - - airPln - - corFcil - - estuary - - faeces - - holdTnk - - hosptl - - lagoon - - lake - - ltcf - - mSwrPpl - - ocean - - other - - pStat - - retPond - - river - - school - - sea - - septTnk - - swgTrck - - wwtpInd - - wwtpMuC + anyof: + - allowed: + - airPln + - corFcil + - estuary + - faeces + - holdTnk + - hosptl + - lagoon + - lake + - ltcf + - mSwrPpl + - ocean + - other + - pStat + - retPond + - river + - school + - sea + - septTnk + - swgTrck + - wwtpInd + - wwtpMuC + empty: true emptyTrimmed: false forbidden: - NA diff --git a/assets/validation-schemas/schema-v1.1.0.yml b/assets/validation-schemas/schema-v1.1.0.yml index b568351f..9eea4458 100644 --- a/assets/validation-schemas/schema-v1.1.0.yml +++ b/assets/validation-schemas/schema-v1.1.0.yml @@ -111,13 +111,15 @@ schema: ruleID: invalid_type type: string unit: - allowed: - - gcCrA - - gcGs - - gcL - - gcMl - - gcPMMoV - - other + anyof: + - allowed: + - gcCrA + - gcGs + - gcL + - gcMl + - gcPMMoV + - other + empty: true maxlength: 30 meta: - meta: @@ -1054,12 +1056,14 @@ schema: ruleID: invalid_type type: string type: - allowed: - - atline - - hand - - lab - - online - - other + anyof: + - allowed: + - atline + - hand + - lab + - online + - other + empty: true emptyTrimmed: false forbidden: - NA @@ -1234,6 +1238,9 @@ schema: ruleID: invalid_type type: string contactPhone: + anyof: + - empty: true + minlength: 10 maxlength: 12 meta: - meta: @@ -1257,7 +1264,6 @@ schema: version1Table: Lab version1Variable: contactPhone ruleID: less_than_min_length - minlength: 10 type: string labID: maxlength: 10 @@ -1406,10 +1412,12 @@ schema: type: string unique: true type: - allowed: - - hlthReg - - other - - swrCatSet + anyof: + - allowed: + - hlthReg + - other + - swrCatSet + empty: true emptyTrimmed: false forbidden: - NA @@ -1631,15 +1639,17 @@ schema: version1Table: Sample schema: collection: - allowed: - - cpFP24h - - cpTP24h - - grb - - grbCp3 - - grbCp3h - - grbCp8h - - mooreSw - - other + anyof: + - allowed: + - cpFP24h + - cpTP24h + - grb + - grbCp3 + - grbCp3h + - grbCp8h + - mooreSw + - other + empty: true emptyTrimmed: false forbidden: - NA @@ -1972,17 +1982,19 @@ schema: type: string unique: true type: - allowed: - - hTank - - other - - pEfflu - - pSludge - - pstGrit - - rawWW - - sEfflu - - sSludge - - swrSed - - water + anyof: + - allowed: + - hTank + - other + - pEfflu + - pSludge + - pstGrit + - rawWW + - sEfflu + - sSludge + - swrSed + - water + empty: true emptyTrimmed: false forbidden: - NA @@ -2257,29 +2269,31 @@ schema: ruleID: invalid_type type: string type: - allowed: - - airPln - - corFcil - - estuary - - faeces - - holdTnk - - hosptl - - lagoon - - lake - - ltcf - - mSwrPpl - - ocean - - other - - pStat - - retPond - - river - - school - - sea - - septTnk - - swgTrck - - uCampus - - wwtpInd - - wwtpMuC + anyof: + - allowed: + - airPln + - corFcil + - estuary + - faeces + - holdTnk + - hosptl + - lagoon + - lake + - ltcf + - mSwrPpl + - ocean + - other + - pStat + - retPond + - river + - school + - sea + - septTnk + - swgTrck + - uCampus + - wwtpInd + - wwtpMuC + empty: true emptyTrimmed: false forbidden: - NA diff --git a/assets/validation-schemas/schema-v2.0.0.yml b/assets/validation-schemas/schema-v2.0.0.yml index b8de4d57..7467ce7f 100644 --- a/assets/validation-schemas/schema-v2.0.0.yml +++ b/assets/validation-schemas/schema-v2.0.0.yml @@ -568,6 +568,9 @@ schema: required: true type: string phone: + anyof: + - empty: true + minlength: 10 maxlength: 12 meta: - meta: @@ -588,7 +591,6 @@ schema: minLength: '10' partID: phone ruleID: less_than_min_length - minlength: 10 type: string role: maxlength: 30 @@ -1031,12 +1033,14 @@ schema: ruleID: invalid_type type: string insType: - allowed: - - aas - - hma - - instrumentTypeOther - - ola - - onse + anyof: + - allowed: + - aas + - hma + - instrumentTypeOther + - ola + - onse + empty: true emptyTrimmed: false forbidden: - NA @@ -2047,10 +2051,12 @@ schema: ruleID: invalid_type type: string fraction: - allowed: - - liq - - mix - - sol + anyof: + - allowed: + - liq + - mix + - sol + empty: true maxlength: 30 meta: - meta: @@ -2334,14 +2340,16 @@ schema: ruleID: invalid_type type: string purpose: - allowed: - - education - - multiple - - provisional - - qualityControl - - regular - - testing - - validationStudy + anyof: + - allowed: + - education + - multiple + - provisional + - qualityControl + - regular + - testing + - validationStudy + empty: true maxlength: 30 meta: - meta: @@ -2414,6 +2422,11 @@ schema: allowed: - 'false' - 'true' + anyof: + - allowed: + - 'false' + - 'true' + empty: true maxlength: 10 meta: - meta: @@ -2488,10 +2501,12 @@ schema: required: true type: string severity: - allowed: - - high - - low - - mid + anyof: + - allowed: + - high + - low + - mid + empty: true maxlength: 30 meta: - meta: @@ -2897,13 +2912,15 @@ schema: ruleID: invalid_type type: string orgLevel: - allowed: - - admRegLevel - - countryLevel - - countyLevel - - municipalLevel - - neighborLevel - - stateProvLevel + anyof: + - allowed: + - admRegLevel + - countryLevel + - countyLevel + - municipalLevel + - neighborLevel + - stateProvLevel + empty: true maxlength: 30 meta: - meta: @@ -2938,19 +2955,21 @@ schema: ruleID: invalid_type type: string orgSector: - allowed: - - airport - - ccc - - dorm - - fiNa - - healthAdm - - lab - - ltcf - - ltcfAl - - pubHealth - - school - - sss - - uCampus + anyof: + - allowed: + - airport + - ccc + - dorm + - fiNa + - healthAdm + - lab + - ltcf + - ltcfAl + - pubHealth + - school + - sss + - uCampus + empty: true maxlength: 30 meta: - meta: @@ -2997,10 +3016,12 @@ schema: ruleID: invalid_type type: string orgType: - allowed: - - academ - - govt - - priv + anyof: + - allowed: + - academ + - govt + - priv + empty: true maxlength: 30 meta: - meta: @@ -3203,9 +3224,11 @@ schema: required: true type: float geoType: - allowed: - - hlthReg - - swrSet + anyof: + - allowed: + - hlthReg + - swrSet + empty: true emptyTrimmed: false forbidden: - NA @@ -3586,6 +3609,11 @@ schema: allowed: - 'false' - 'true' + anyof: + - allowed: + - 'false' + - 'true' + empty: true maxlength: 10 meta: - meta: @@ -4179,6 +4207,11 @@ schema: allowed: - 'false' - 'true' + anyof: + - allowed: + - 'false' + - 'true' + empty: true maxlength: 10 meta: - meta: @@ -4416,10 +4449,12 @@ schema: ruleID: invalid_type type: string severity: - allowed: - - high - - low - - mid + anyof: + - allowed: + - high + - low + - mid + empty: true maxlength: 30 meta: - meta: @@ -4863,16 +4898,18 @@ schema: required: true type: float collType: - allowed: - - areaPr - - comp - - cosca - - flowPr - - grb - - moorSw - - surfSw - - timePr - - volPr + anyof: + - allowed: + - areaPr + - comp + - cosca + - flowPr + - grb + - moorSw + - surfSw + - timePr + - volPr + empty: true emptyTrimmed: false forbidden: - NA @@ -5041,6 +5078,11 @@ schema: allowed: - 'false' - 'true' + anyof: + - allowed: + - 'false' + - 'true' + empty: true maxlength: 30 meta: - meta: @@ -5087,14 +5129,16 @@ schema: ruleID: invalid_type type: string purpose: - allowed: - - education - - multiple - - provisional - - qualityControl - - regular - - testing - - validationStudy + anyof: + - allowed: + - education + - multiple + - provisional + - qualityControl + - regular + - testing + - validationStudy + empty: true maxlength: 30 meta: - meta: @@ -5148,13 +5192,15 @@ schema: ruleID: invalid_type type: datetime repType: - allowed: - - colocated - - fieldReplicate - - labDuplicate - - lcsd - - msd - - unique + anyof: + - allowed: + - colocated + - fieldReplicate + - labDuplicate + - lcsd + - msd + - unique + empty: true maxlength: 30 meta: - meta: @@ -5192,6 +5238,11 @@ schema: allowed: - 'false' - 'true' + anyof: + - allowed: + - 'false' + - 'true' + empty: true maxlength: 10 meta: - meta: @@ -5222,24 +5273,26 @@ schema: ruleID: invalid_type type: string saMaterial: - allowed: - - afu - - desk - - faeces - - floor - - htSam - - nww - - pEfflu - - pSludge - - pstGrit - - rawWW - - rawWWdown - - rawWWup - - sEfflu - - sSludge - - septage - - surface - - swrSed + anyof: + - allowed: + - afu + - desk + - faeces + - floor + - htSam + - nww + - pEfflu + - pSludge + - pstGrit + - rawWW + - rawWWdown + - rawWWup + - sEfflu + - sSludge + - septage + - surface + - swrSed + empty: true emptyTrimmed: false forbidden: - NA @@ -5391,10 +5444,12 @@ schema: ruleID: invalid_type type: datetime severity: - allowed: - - high - - low - - mid + anyof: + - allowed: + - high + - low + - mid + empty: true maxlength: 30 meta: - meta: @@ -5789,26 +5844,28 @@ schema: ruleID: invalid_type type: string sampleShed: - allowed: - - airpln - - airport - - ccc - - cdc - - corFcil - - dorm - - fiNa - - hosptl - - ltcf - - ltcfAl - - ltcfO - - municp - - neigh - - orb - - school - - ship - - sss - - terminal - - uCampus + anyof: + - allowed: + - airpln + - airport + - ccc + - cdc + - corFcil + - dorm + - fiNa + - hosptl + - ltcf + - ltcfAl + - ltcfO + - municp + - neigh + - orb + - school + - ship + - sss + - terminal + - uCampus + empty: true emptyTrimmed: false forbidden: - NA @@ -5947,28 +6004,30 @@ schema: type: string unique: true siteType: - allowed: - - bSwrPpl - - buildCO - - estuary - - htSite - - lagoon - - lake - - mSwrPpl - - ocean - - pStat - - retPond - - river - - sea - - septTnk - - stabPnd - - swgTrck - - upstream - - wwtp - - wwtpBack - - wwtpInd - - wwtpMuC - - wwtpMuS + anyof: + - allowed: + - bSwrPpl + - buildCO + - estuary + - htSite + - lagoon + - lake + - mSwrPpl + - ocean + - pStat + - retPond + - river + - sea + - septTnk + - stabPnd + - swgTrck + - upstream + - wwtp + - wwtpBack + - wwtpInd + - wwtpMuC + - wwtpMuS + empty: true emptyTrimmed: false forbidden: - NA diff --git a/src/odm_validation/cerberusext.py b/src/odm_validation/cerberusext.py index 87c9973b..bde4098f 100644 --- a/src/odm_validation/cerberusext.py +++ b/src/odm_validation/cerberusext.py @@ -212,7 +212,7 @@ def _validate_emptyTrimmed(self, constraint, field, raw_value): expect_empty = constraint is_str = isinstance(raw_value, str) value = raw_value.strip() if is_str else raw_value - is_empty = not value or (is_str and value == '') + is_empty = not value if is_empty != expect_empty: err = ErrorDefinition(EMPTY_TRIMMED_RULE, 'emptyTrimmed') self._error(field, err) diff --git a/src/odm_validation/reports.py b/src/odm_validation/reports.py index 70220cee..cb06bc90 100644 --- a/src/odm_validation/reports.py +++ b/src/odm_validation/reports.py @@ -7,7 +7,7 @@ import part_tables as pt from input_data import DataKind -from rules import RuleId +from rules import get_anyof_constraint, RuleId from stdext import ( get_len, quote, @@ -186,10 +186,20 @@ def _gen_error_msg(ctx: ErrorCtx, template: Optional[str] = None, xfix.get('suffix', ''), ]) + # XXX: constraint may be a combination of rules due to a need for the + # 'empty' rule, which we'll have to exclude to get the actual rule value we + # want. This is the case when constraint has the type List[dict] + constraint_val = ctx.constraint + if isinstance(constraint_val, list): + rules = constraint_val[0] + if isinstance(rules, dict): + (_, val) = get_anyof_constraint(rules) + constraint_val = val + return full_template.format( allowed_values=_fmt_allowed_values(ctx.allowed_values), column_id=ctx.column_id, - constraint=_fmt_msg_value(ctx.constraint, relaxed=True), + constraint=_fmt_msg_value(constraint_val, relaxed=True), row_num=_fmt_list(ctx.row_numbers), rule_id=ctx.rule_id.name, table_id=ctx.table_id, diff --git a/src/odm_validation/rule_errors.py b/src/odm_validation/rule_errors.py index 9201f780..edbf2b2e 100644 --- a/src/odm_validation/rule_errors.py +++ b/src/odm_validation/rule_errors.py @@ -8,7 +8,7 @@ from input_data import DataKind from reports import ErrorKind, ValidationCtx, get_row_num from rule_filters import RuleFilter -from rules import Rule, RuleId, ruleset +from rules import Rule, RuleId, get_anyof_constraint, ruleset from schemas import CerberusSchema, init_table_schema from stdext import ( countdown, @@ -146,11 +146,22 @@ def _gen_error_entry(vctx, cerb_rule, table_id, column_id, value, row_numbers, return (rule.id, entry) +def _get_cerb_rule(e): + rule = e.schema_path[-1] + + # 'anyof' may be used to wrap the actual rule together with 'empty' + if rule == 'anyof': + (key, _) = get_anyof_constraint(e.constraint[0]) + rule = key + + return rule + + def _gen_cerb_error_entry(vctx, e, row, schema: CerberusSchema, rule_filter: RuleFilter, offset: int, data_kind: DataKind) -> Optional[RuleError]: "Transforms a single Cerberus error into a validation error." - cerb_rule = e.schema_path[-1] + cerb_rule = _get_cerb_rule(e) (table_id, _, column_id) = e.document_path row_index = e.document_path[1] schema_column = schema[table_id]['schema']['schema'][column_id] diff --git a/src/odm_validation/rules.py b/src/odm_validation/rules.py index ee642ec4..8d4a1926 100644 --- a/src/odm_validation/rules.py +++ b/src/odm_validation/rules.py @@ -10,8 +10,9 @@ import part_tables as pt from input_data import DataKind -from schemas import Schema, update_schema +from schemas import Schema, init_attr_schema, init_table_schema from stdext import ( + deep_update, try_parse_int, ) from rule_primitives import ( @@ -78,18 +79,46 @@ class Rule: these should be mapped to a missing_values_found error.""" +def get_anyof_constraint(anyof_constraint: dict) -> Tuple[str, str]: + '''returns actual constraint (key, val) from anyof-rule containing an empty + rule in addition to the actual rule''' + rules = anyof_constraint + assert 'empty' in rules and len(rules) == 2 + key = next(filter(lambda x: x != 'empty', rules)) + val = rules[key] + return (key, val) + + +def extract_cerb_keys(gen_cerb_rules: Callable) -> List[str]: + '''extracts the cerberus' rule-keys from an ODM rule's `gen_cerb_rules` + function''' + dummy_ctx = OdmValueCtx(value=1, datatype='integer', bool_set=set(), + null_set=set()) + cerb_rules = gen_cerb_rules(dummy_ctx) + assert isinstance(cerb_rules, dict) + + # 'anyof' may be used to wrap the actual rule together with 'empty' + if 'anyof' in cerb_rules: + assert len(cerb_rules) == 1 + anyof_list = cerb_rules['anyof'] + assert len(anyof_list) == 1 + (key, _) = get_anyof_constraint(anyof_list[0]) + return [key] + + return list(cerb_rules.keys()) + + def init_rule(rule_id, error, gen_cerb_rules, gen_schema, is_column=False, is_warning=False, match_all_keys=False): """ - `error` can either be a string or a function taking a value and returning a string. - `gen_cerb_rules` must accept a dummy context of `None` values, and return - a dict with cerberus rule names as keys. + a dict with cerberus rule names as keys. Only the keys are used, so the + values can be empty. - `is_column` determines if the rule is validating columns/headers. """ - dummy_ctx = OdmValueCtx(value=1, datatype='integer', bool_set=set(), - null_set=set()) - cerb_keys = list(gen_cerb_rules(dummy_ctx).keys()) + cerb_keys = extract_cerb_keys(gen_cerb_rules) get_error_template = error if callable(error) else (lambda x, y, z: error) return Rule( id=rule_id, @@ -199,7 +228,7 @@ def less_than_min_length(): def gen_cerb_rules(val_ctx: OdmValueCtx): val = try_parse_int(val_ctx.value) if val > 0: - return {'minlength': val} + return {'anyof': [{'empty': True, 'minlength': val}]} def gen_schema(data: pt.OdmData, ver): return gen_value_schema(data, ver, rule_id.name, odm_key, @@ -230,9 +259,6 @@ def invalid_category(): cerb_rule_key = 'allowed' err = 'Invalid category {value}' - def gen_cerb_rules(val_ctx: OdmValueCtx): - return {cerb_rule_key: None} - def gen_schema(data: pt.OdmData, ver: Version): # FIXME: `cat_ids1` contains duplicates due to v1 categories belonging # to multiple tables. @@ -251,12 +277,22 @@ def gen_schema(data: pt.OdmData, ver: Version): cat_ids1 = pt.map_ids(data.mappings, cat_ids0, ver) if len(cat_ids1) == 0: continue - cerb_rule = (cerb_rule_key, sorted(set(cat_ids1 + other_cat))) + cerb_rules = {'anyof': [{ + 'empty': True, + cerb_rule_key: sorted(set(cat_ids1 + other_cat)), + }]} attr_meta = get_catset_meta(table_id0, cs, categories, ver) - update_schema(schema, table_id1, attr_id1, rule_id.name, - cerb_rule, table_meta, attr_meta) + attr_schema = init_attr_schema(attr_id1, rule_id.name, + cerb_rules, attr_meta) + table_schema = init_table_schema(table_id1, table_meta, + attr_schema) + deep_update(schema, table_schema) + return schema + def gen_cerb_rules(val_ctx: OdmValueCtx): + return {cerb_rule_key: None} + return init_rule(rule_id, err, gen_cerb_rules, gen_schema) diff --git a/src/odm_validation/schemas.py b/src/odm_validation/schemas.py index 706bfe97..21313d6e 100644 --- a/src/odm_validation/schemas.py +++ b/src/odm_validation/schemas.py @@ -1,7 +1,6 @@ from copy import deepcopy from part_tables import Meta -from stdext import deep_update CerberusSchema = dict Schema = dict # {'schemaVersion': str, 'schema': CerberusSchema} @@ -35,11 +34,3 @@ def init_attr_schema(attr_id: str, rule_id: str, cerb_rules: dict, } ] return {attr_id: inner} - - -def update_schema(schema, table_id, attr_id, rule_id: str, cerb_rule, - table_meta: Meta, attr_meta: Meta): - cerb_rules = {cerb_rule[0]: cerb_rule[1]} - attr_schema = init_attr_schema(attr_id, rule_id, cerb_rules, attr_meta) - table_schema = init_table_schema(table_id, table_meta, attr_schema) - deep_update(schema, table_schema) diff --git a/src/odm_validation/stdext.py b/src/odm_validation/stdext.py index f246ecea..61174212 100644 --- a/src/odm_validation/stdext.py +++ b/src/odm_validation/stdext.py @@ -19,28 +19,44 @@ def hash2(x) -> int: return hash(x) -def deep_update(dst: dict, src: dict): - """ - Recursively update a dict. - Subdict's won't be overwritten but also updated. - List values will be joined, but not recursed. - - Originally from: https://stackoverflow.com/a/8310229 - """ - for key, value in src.items(): +def deep_update(dst: dict, src: dict, merge_dict_lists: bool = False): + '''recursively merge two dictionaries + + :param dst: the dict to merge into + :param src: the dict to merge from + :param merge_dict_lists: recurse into lists of dictionaries instead of + simply appending to the lists. + ''' + for key, src_val in src.items(): if key not in dst: - dst[key] = value - elif isinstance(value, dict): - deep_update(dst[key], value) - elif isinstance(value, list): - src_list = value - if len(src_list) == 0: + dst[key] = src_val + elif isinstance(src_val, dict): + deep_update(dst[key], src_val, merge_dict_lists) + elif isinstance(src_val, list): + src_list = src_val + n = len(src_list) + if n == 0: continue dst_list = dst[key] + assert isinstance(dst_list, list) + + # match dicts for as long as possible... + off = 0 + if merge_dict_lists: + for i in range(min(len(dst_list), n)): + dst_dict = dst_list[i] + src_dict = src_list[i] + if not (isinstance(dst_dict, dict) and + isinstance(src_dict, dict)): + break + deep_update(dst_dict, src_dict, merge_dict_lists) + off = i + 1 + + # ...then resort to appending the rest dst_hashset = set(map(hash2, dst_list)) - for item in src_list: - if hash2(item) not in dst_hashset: - dst_list.append(item) + for src_item in src_list[off:]: + if hash2(src_item) not in dst_hashset: + dst_list.append(src_item) def strip_dict_key(d: dict, target_key: str): diff --git a/src/odm_validation/validation.py b/src/odm_validation/validation.py index 01ad7b0b..b59924b3 100644 --- a/src/odm_validation/validation.py +++ b/src/odm_validation/validation.py @@ -69,7 +69,7 @@ def _generate_validation_schema_ext(parts: pt.Dataset, assert s is not None deep_update(cerb_schema, s) additions_schema = gen_additions_schema(schema_additions) - deep_update(cerb_schema, additions_schema) + deep_update(cerb_schema, additions_schema, merge_dict_lists=True) # strip empty tables for table in list(cerb_schema): diff --git a/tests/test_empty.py b/tests/test_empty.py new file mode 100644 index 00000000..d669fc06 --- /dev/null +++ b/tests/test_empty.py @@ -0,0 +1,34 @@ +import unittest +from os.path import join + +from parameterized import parameterized + +import common +from common import asset, root_dir +from utils import ( + import_dataset, + import_schema, +) +from validation import _validate_data_ext + + +class TestEmpty(common.OdmTestCase): + '''tests minlength and allowed together with missing_values_found due to + anyof-empty constraint''' + + @classmethod + def setUpClass(cls): + cls.maxDiff = None + + @parameterized.expand(['allowed', 'minlength']) + def test_empty(self, rulename: str): + common.ASSET_DIR = join(root_dir, 'assets/validation-rules/empty') + schema = import_schema(asset(f'schema-v2-{rulename}.yml')) + data = {'sites': import_dataset(asset(f'dataset-{rulename}.csv'))} + report = _validate_data_ext(schema, data) + expected = import_dataset(asset(f'error-report-{rulename}.json')) + self.assertReportEqual(expected, report) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/test_stdext.py b/tests/test_stdext.py index 28e891e5..dab794f9 100644 --- a/tests/test_stdext.py +++ b/tests/test_stdext.py @@ -1,7 +1,8 @@ import unittest +from copy import deepcopy import common -from stdext import keep, swapDelete, try_parse_int +from stdext import deep_update, keep, swapDelete, try_parse_int common.unused_import_dummy = 1 @@ -34,6 +35,87 @@ def test_keep(self): self.assertEqual(a, [{'target': [{'a': 1}]}]) self.assertEqual(b, [{'b': [{'target': 1}]}]) + def test_deep_update(self): + a = { + 'attr': { + 'meta': [ + { + 'meta': [ + {'maxLength': 10} + ], + 'ruleID': 'ml' + } + ] + } + } + b = { + 'attr': { + 'meta': [ + { + 'meta': [ + {'dataType': 'varchar'} + ], + 'ruleID': 't' + } + ] + } + } + expected = { + 'attr': { + 'meta': [ + { + 'meta': [ + {'maxLength': 10} + ], + 'ruleID': 'ml' + }, + { + 'meta': [ + {'dataType': 'varchar'} + ], + 'ruleID': 't' + }, + ] + } + } + actual = deepcopy(a) + deep_update(actual, b) + self.assertEqual(actual, expected) + + def test_deep_update_with_merge_dict_lists(self): + initial = { + 'a': { + 'b': [ + { + 'x': [1, 2], + 'y': True, + } + ] + } + } + addition = { + 'a': { + 'b': [ + { + 'x': [3, 4] + } + ] + } + } + expected = { + 'a': { + 'b': [ + { + 'x': [1, 2, 3, 4], + 'y': True, + } + ] + } + } + actual = deepcopy(initial) + deep_update(actual, addition, merge_dict_lists=True) + self.assertEqual(actual, expected) + if __name__ == '__main__': unittest.main()