Skip to content

Commit

Permalink
feat(rules): WRONG_ENTITY_IS_HEADER, NON_CONCEPT_HEADER and WRONG_ENT…
Browse files Browse the repository at this point in the history
…ITY_IS_VALUE rules

Closes #81 #82 #83 #56
  • Loading branch information
buchslava committed May 12, 2016
1 parent 9604c0a commit d2a6924
Show file tree
Hide file tree
Showing 24 changed files with 500 additions and 72 deletions.
50 changes: 50 additions & 0 deletions doc/rules/NON_CONCEPT_HEADER.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
# NON_CONCEPT_HEADER

## Rule test folder

`test/fixtures/rules-cases/non-concept-header`

## Description
Each column name in any header should be a concept (`is--` fields are excluded from this check).

## Examples of correct data

ddf--concepts.csv
```
concept,concept_type,domain,name
name,string,,
geo,entity_domain,,
country,entity_set,geo,Country
pop,measure,geo,Population
year,time,,year
```

ddf--datapoints--pop--by--country--year.csv
```
country,year,pop
vat,1960,100000
```

## Examples of incorrect data

ddf--concepts.csv
```
concept,concept_type,domain,name
name,string,,
geo,entity_domain,,
country,entity_set,geo,Country
pop,measure,geo,Population
year,time,,year
```

ddf--datapoints--pop--by--country--year.csv
```
countryFOO,year,pop
vat,1960,100000
```

## Output data format

The following information should be included:

incorrect header value
78 changes: 78 additions & 0 deletions doc/rules/WRONG_ENTITY_IS_HEADER.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
# WRONG_ENTITY_IS_HEADER

## Rule test folder

`test/fixtures/rules-cases/wrong-entity-is-header`

## Description
An issue under this rule is raised when an `is--` header is defined but not valid, i.e. the name after `is--` is not a concept with the `entity_set` type.

## Examples of correct data

`ddf--concepts.csv`
```
"concept","name","concept_type","domain",
"income_groups","Income groups","entity_set","geo",
"geo","Geographic location","entity_domain",,
```
and
`ddf--entities--geo--income_groups.csv`
```
income_groups,name,gwid,is--income_groups
high_income,High income,i268,TRUE
lower_middle_income,Lower middle income,i269,TRUE
low_income,Low income,i266,TRUE
upper_middle_income,Upper middle income,i267,TRUE
```

## Examples of incorrect data

`ddf--concepts.csv`
```
"concept","name","concept_type","domain",
"income_groups","Income groups","entity_set","geo",
"geo","Geographic location","entity_domain",,
```
and
`ddf--entities--geo--income_groups.csv`
```
income_groups,name,gwid,is--foo_groups
high_income,High income,i268,TRUE
lower_middle_income,Lower middle income,i269,TRUE
low_income,Low income,i266,TRUE
upper_middle_income,Upper middle income,i267,TRUE
```

## Output data format

* `message` - kind of issue. It should be `Not a concept` or `Wrong concept type`
* `header name` - csv's column name

### Additional information

An `is--` header is not mandatory anywhere; its absence just means that all entities have the value `false` for that `is--` header.

The only error case is when `is--xxx` is used and `xxx` is not defined in concepts as an `entity_set`. No other case should give an error.

So the following is also valid (though the `is--country` is nonsensical):
`ddf--concepts.csv`
```
"concept","name","concept_type","domain",
"income_groups","Income groups","entity_set","geo",
"geo","Geographic location","entity_domain",,
"country","Country","entity_set","geo"
```
`incomegroups.csv`
```
income_groups,name,gwid,is--income_groups,is--country
high_income,High income,i268,TRUE,FALSE
lower_middle_income,Lower middle income,i269,TRUE,FALSE
low_income,Low income,i266,TRUE,TRUE
upper_middle_income,Upper middle income,i267,TRUE,TRUE
```
`ddf--index.csv`
```
"key","value","file"
"income_groups","name","incomegroups.csv"
"income_groups","gwid","incomegroups.csv"
```
33 changes: 33 additions & 0 deletions doc/rules/WRONG_ENTITY_IS_VALUE.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# WRONG_ENTITY_IS_VALUE

## Rule test folder

`test/fixtures/rules-cases/wrong-entity-is-value`

## Description
An issue under this rule is raised when a value under an `is--` header doesn't look like a boolean.

## Examples of correct data

`ddf--entities--geo--income_groups.csv`
```
income_groups,name,gwid,is--income_groups
high_income,High income,i268,TRUE
lower_middle_income,Lower middle income,i269,TRUE
low_income,Low income,i266,TRUE
upper_middle_income,Upper middle income,i267,TRUE
```

## Examples of incorrect data

`ddf--entities--geo--income_groups.csv`
```
income_groups,name,gwid,is--income_groups
high_income,High income,i268,FOO
```

## Output data format

* `header name` - csv's column name
* `header value`
* `line in csv`
9 changes: 9 additions & 0 deletions lib/ddf-definitions/concept.js
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,15 @@ class Concept {
return this.collection.find();
}

/*eslint camelcase: ["error", {properties: "never"}]*/
getDataIdsByType(type) {
return this.collection.find({concept_type: type}).map(record => record.concept);
}

getRecordByKey(concept) {
return _.head(this.collection.find({concept}));
}

getDataByFiles() {
return _.groupBy(this.getAllData(), record => record.$$source);
}
Expand Down
2 changes: 2 additions & 0 deletions lib/ddf-definitions/constants.js
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,5 @@
// Identifiers for the three kinds of DDF data. `Symbol.for` looks the key up in
// the global symbol registry, so every module asking for the same key gets the
// same symbol.
exports.CONCEPT = Symbol.for('concepts');
exports.ENTITY = Symbol.for('entities');
exports.DATA_POINT = Symbol.for('datapoints');

// Offset between a zero-based record index and the `line` value reported in
// issue data: rules add it to the index, the JSON corrector subtracts it again.
// NOTE(review): presumably 1 for the CSV header row plus 1 for one-based line
// numbering - confirm.
exports.LINE_NUM_INCLUDING_HEADER = 2;
5 changes: 2 additions & 3 deletions lib/ddf-definitions/ddf-json-corrector.js
Original file line number Diff line number Diff line change
Expand Up @@ -4,20 +4,19 @@ const _ = require('lodash');
const async = require('async');
const json2csv = require('json2csv');
const DdfDataSet = require('./ddf-data-set');
const constants = require('../ddf-definitions/constants');
const generalDdfRules = require('../ddf-rules/general-rules');
const rulesRegistry = require('../ddf-rules/registry');
const fileUtils = require('../utils/file');

const LINE_NUM_INCLUDING_HEADER = 2;

function correctFile(data, cb) {
fileUtils.readFile(data.file, (err, content) => {
if (err) {
return cb(err);
}

data.warnings.forEach(issue => {
content[issue.data.line - LINE_NUM_INCLUDING_HEADER][issue.data.column] =
content[issue.data.line - constants.LINE_NUM_INCLUDING_HEADER][issue.data.column] =
_.first(issue.suggestions);
});

Expand Down
65 changes: 65 additions & 0 deletions lib/ddf-rules/concept-rules.js
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,61 @@
const _ = require('lodash');
const registry = require('./registry');
const Issue = require('./issue');
const Levenshtein = require('levenshtein');
const SUGGEST_TOLERANCE = 3;

/**
 * Flattens the headers of all data point details into `{header, detail}` pairs.
 *
 * @param {Object} ddfDataSet - data set exposing `getDataPoint().details`
 * @returns {Array<{header: *, detail: Object}>} one entry per header of each
 *   detail's `fileDescriptor.headers`, in detail order
 */
function getDataPointHeaderDetails(ddfDataSet) {
  return ddfDataSet.getDataPoint().details.reduce((collected, detail) => {
    detail.fileDescriptor.headers.forEach(header => collected.push({header, detail}));

    return collected;
  }, []);
}

/**
 * Flattens the headers of all entity details into `{header, detail}` pairs,
 * leaving out headers that start with `is--`.
 *
 * @param {Object} ddfDataSet - data set exposing `getEntity().details`
 * @returns {Array<{header: *, detail: Object}>} one entry per non-`is--` header
 *   of each detail's `header` list, in detail order
 */
function getEntityHeaderDetails(ddfDataSet) {
  return ddfDataSet.getEntity().details.reduce((collected, detail) => {
    detail.header
      .filter(header => !_.startsWith(header, 'is--'))
      .forEach(header => collected.push({header, detail}));

    return collected;
  }, []);
}

/**
 * Gathers the header/detail pairs of data points followed by those of entities.
 *
 * @param {Object} ddfDataSet - data set passed on to both collectors
 * @returns {Array} data point pairs first, then entity pairs
 */
function getHeaderDetailObjects(ddfDataSet) {
  const dataPointHeaders = getDataPointHeaderDetails(ddfDataSet);
  const entityHeaders = getEntityHeaderDetails(ddfDataSet);

  return dataPointHeaders.concat(entityHeaders);
}

/**
 * Appends a NON_CONCEPT_HEADER issue to `result` when the header of
 * `detailObject` is not one of the known concept ids.
 *
 * The issue carries the file path, the offending header and, as suggestions,
 * the unique concept ids whose Levenshtein distance to the header is below
 * SUGGEST_TOLERANCE.
 *
 * @param {Array<string>} conceptIds - known concept ids
 * @param {{header: string, detail: Object}} detailObject - header with its owning detail
 * @param {Array} result - issue accumulator, mutated in place
 */
function setNonConceptHeaderIssue(conceptIds, detailObject, result) {
  const header = detailObject.header;

  // Known concept: nothing to report.
  if (conceptIds.indexOf(header) >= 0) {
    return;
  }

  const closeConcepts = conceptIds.filter(concept =>
    new Levenshtein(concept, header).distance < SUGGEST_TOLERANCE);
  const suggestions = _.uniq(closeConcepts);

  result.push(
    new Issue(registry.NON_CONCEPT_HEADER)
      .setPath(detailObject.detail.fileDescriptor.fullPath)
      .setData(header)
      .setSuggestions(suggestions)
  );
}

module.exports = {
[registry.CONCEPT_ID_IS_NOT_UNIQUE]: ddfDataSet => {
Expand All @@ -28,6 +83,16 @@ module.exports = {
.setData(nonUniqueConceptIds);
}

return result;
},
[registry.NON_CONCEPT_HEADER]: ddfDataSet => {
const result = [];
const conceptIds = ddfDataSet.getConcept().getIds();

getHeaderDetailObjects(ddfDataSet)
.map(headerDetailObject =>
setNonConceptHeaderIssue(conceptIds, headerDetailObject, result));

return result;
}
};
12 changes: 6 additions & 6 deletions lib/ddf-rules/data-point-rules.js
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,13 @@ const _ = require('lodash');
const ddfTimeUtils = require('ddf-time-utils');
const registry = require('./registry');
const Issue = require('./issue');
const LINE_NUM_INCLUDING_HEADER = 2;
const constants = require('../ddf-definitions/constants');

/**
 * Builds a query condition matching records whose `is--<entity>` field holds
 * one of the accepted "true" spellings.
 *
 * @param {string} entity - entity set name appended to the `is--` prefix
 * @returns {Object} condition of the form `{'is--<entity>': {$in: [...]}}`
 */
function constructEntityCondition(entity) {
  const expectedKey = `is--${entity}`;

  return {
    [expectedKey]: {$in: ['1', 'TRUE', 'true']}
  };
}

Expand All @@ -30,7 +30,7 @@ module.exports = {
if (isNaN(dataPointRecord[measure])) {
const data = {
measure,
line: line + LINE_NUM_INCLUDING_HEADER,
line: line + constants.LINE_NUM_INCLUDING_HEADER,
value: dataPointRecord[measure]
};
const issue = new Issue(registry.DATA_POINT_VALUE_NOT_NUMERIC)
Expand Down Expand Up @@ -63,7 +63,7 @@ module.exports = {
if (!_.includes(entityValueHash[entityKey], dataPointRecord[entityKey])) {
const data = {
concept: entityKey,
line: line + LINE_NUM_INCLUDING_HEADER,
line: line + constants.LINE_NUM_INCLUDING_HEADER,
value: dataPointRecord[entityKey]
};
const issue = new Issue(registry.DATA_POINT_UNEXPECTED_ENTITY_VALUE)
Expand All @@ -89,7 +89,7 @@ module.exports = {
if (!ddfTimeUtils.detectTimeType(dataPointRecord[timeKey])) {
const data = {
concept: timeKey,
line: line + LINE_NUM_INCLUDING_HEADER,
line: line + constants.LINE_NUM_INCLUDING_HEADER,
value: dataPointRecord[timeKey]
};
const issue = new Issue(registry.DATA_POINT_UNEXPECTED_TIME_VALUE)
Expand Down
Loading

0 comments on commit d2a6924

Please sign in to comment.