Skip to content

Commit

Permalink
feat(rules): Two new datapackage validation rules
Browse files Browse the repository at this point in the history
Closes #494
  • Loading branch information
buchslava committed Apr 11, 2018
1 parent d7bd477 commit 76c0659
Show file tree
Hide file tree
Showing 45 changed files with 1,860 additions and 76 deletions.
2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
"test-travis": "istanbul cover mocha _mocha -- -R spec --timeout 200000 --compilers ts:ts-node/register --recursive test/**/*.spec.ts && codecov",
"changelog": "conventional-changelog -i CHANGELOG.md -s -p angular",
"github-release": "conventional-github-releaser -p angular",
"build": "tsc && touch lib/package.json && echo \\{\\\"version\\\": \\\"1.15.2\\\"\\} > lib/package.json",
"build": "tsc && touch lib/package.json && echo \\{\\\"version\\\": \\\"1.16.0\\\"\\} > lib/package.json",
"prepublish": "npm run build",
"preversion": "npm test",
"version": "npm run changelog && git add CHANGELOG.md",
Expand Down
55 changes: 55 additions & 0 deletions src/ddf-rules/data-package-rules/nonexistent-concept.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
import { isArray, includes, compact } from 'lodash';
import { DdfDataSet } from '../../ddf-definitions/ddf-data-set';
import { Issue } from '../issue';
import { DATAPACKAGE_NONEXISTENT_CONCEPT } from '../registry';
import { DATA_PACKAGE_FILE } from '../../data/data-package';
import * as path from 'path';
import { DDFRoot } from '../../data/ddf-root';

/**
 * Normalizes a value that may be either a single item or a list of items
 * into a list, so callers can always iterate over the result.
 *
 * @param value - a single item or an array of items
 * @returns `value` itself when it is already an array, otherwise a one-element array wrapping it
 */
const toArray = <T>(value: T | T[]): T[] => Array.isArray(value) ? value : [value];
/**
 * Collects every name referenced by the schema part of the datapackage
 * into `conceptsSet` (the set is mutated in place).
 *
 * For each record of the `concepts`, `entities` and `datapoints` sections both the
 * primary key (a single name or a list of names) and the `value` are added.
 * A section that is absent from the schema is treated as empty, so an incomplete
 * schema does not abort the rule with a TypeError.
 *
 * @param dataPackageSchema - object exposing the `concepts`, `entities` and `datapoints` record lists
 * @param conceptsSet - accumulator that receives the referenced names
 */
const fillConceptsSetBySchema = (dataPackageSchema, conceptsSet: Set<string>) => {
  const records = [
    ...(dataPackageSchema.concepts || []),
    ...(dataPackageSchema.entities || []),
    ...(dataPackageSchema.datapoints || [])
  ];

  for (const record of records) {
    for (const pk of toArray(record.primaryKey)) {
      conceptsSet.add(pk);
    }

    // Added as-is even when it is empty: the rule drops falsy entries with `compact` afterwards.
    conceptsSet.add(record.value);
  }
};
/**
 * Adds to `conceptsSet` every name used by the datapackage resources:
 * the primary key (single name or list) and the name of each field
 * of every resource schema. The set is mutated in place.
 *
 * @param ddfRoot - source of the datapackage resources
 * @param conceptsSet - accumulator that receives the names
 */
const fillResources = (ddfRoot: DDFRoot, conceptsSet: Set<string>) => {
  for (const {schema} of ddfRoot.getDataPackageResources()) {
    const keyParts = toArray(schema.primaryKey);

    for (const keyPart of keyParts) {
      conceptsSet.add(keyPart);
    }

    for (const {name} of schema.fields) {
      conceptsSet.add(name);
    }
  }
};

export const rule = {
  /**
   * Reports names that the datapackage refers to but that are not present
   * among the dataset's concept records.
   *
   * Names are gathered from the datapackage schema (when there is one) and from the
   * primary keys and field names of every resource. A leading `is--` prefix is
   * stripped before the lookup, and the names `concept` and `concept_type` are
   * never reported.
   *
   * @param ddfDataSet - dataset providing the datapackage root and the concept records
   * @returns one issue per distinct missing name; each issue points at the datapackage
   *          file and carries the name as its data
   */
  rule: (ddfDataSet: DdfDataSet) => {
    const ddfRoot = ddfDataSet.ddfRoot;
    const dataPackagePath = path.resolve(ddfRoot.dataPackageDescriptor.rootFolder, DATA_PACKAGE_FILE);
    const conceptsSet = new Set<string>();
    const dataPackageSchema = ddfRoot.getDataPackageSchema();

    if (dataPackageSchema) {
      fillConceptsSetBySchema(dataPackageSchema, conceptsSet);
    }

    fillResources(ddfRoot, conceptsSet);

    const originalConcepts = ddfDataSet.getConcept().getAllData().map(record => record.concept);

    // Stripping the prefix can make two collected names equal (`is--foo` and `foo`);
    // passing them through a Set keeps one entry, so the same name is not reported twice.
    const referencedConcepts = new Set<string>(
      compact(Array.from(conceptsSet.values())).map(concept => concept.replace(/^is--/, ''))
    );

    return Array.from(referencedConcepts.values())
      .filter(concept => concept !== 'concept' && concept !== 'concept_type' && !includes(originalConcepts, concept))
      .map(concept => new Issue(DATAPACKAGE_NONEXISTENT_CONCEPT).setPath(dataPackagePath).setData(concept));
  }
};
83 changes: 83 additions & 0 deletions src/ddf-rules/data-package-rules/nonexistent-resource.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
import * as path from 'path';
import { isEmpty, compact } from 'lodash';
import { DdfDataSet } from '../../ddf-definitions/ddf-data-set';
import { Issue } from '../issue';
import { DATAPACKAGE_NONEXISTENT_RESOURCE } from '../registry';
import { DATA_PACKAGE_FILE } from '../../data/data-package';
import { DDFRoot } from '../../data/ddf-root';

/**
 * Finds schema records that point at resources which are not declared
 * in the resources section of the datapackage.
 *
 * The `concepts`, `entities` and `datapoints` sections of the schema are scanned;
 * a record produces one issue listing all of its unknown resources. The schema is
 * read once, and an absent section or an absent `resources` list counts as empty.
 *
 * @param ddfRoot - source of the datapackage schema
 * @param dataPackagePath - path put on every produced issue
 * @param resourcesMap - map whose keys are the declared resource names
 * @returns the issues, or an empty list when there is no schema
 */
const getNonexistentResourcesIssues = (
  ddfRoot: DDFRoot,
  dataPackagePath: string,
  resourcesMap: Map<string, number>): Issue[] => {
  const schema = ddfRoot.getDataPackageSchema();

  if (!schema) {
    return [];
  }

  const records = [
    ...(schema.concepts || []),
    ...(schema.entities || []),
    ...(schema.datapoints || [])
  ];

  // Records without unknown resources map to null and are removed by `compact`.
  return compact(records.map(record => {
    const nonexistentResources = (record.resources || []).filter(resource => !resourcesMap.has(resource));

    if (isEmpty(nonexistentResources)) {
      return null;
    }

    return new Issue(DATAPACKAGE_NONEXISTENT_RESOURCE)
      .setPath(dataPackagePath)
      .setData({
        nonexistentResources, record,
        specific: 'is NOT found in resources, but found in schema section'
      });
  }));
};

/**
 * Counts how many times each declared resource is referenced by the schema.
 *
 * Every occurrence of a resource name in the `resources` list of a `concepts`,
 * `entities` or `datapoints` record increments that name's counter in
 * `resourcesMap` (mutated in place). Names that are not already keys of the map
 * are ignored. Nothing happens when there is no schema; an absent section or an
 * absent `resources` list counts as empty.
 *
 * @param ddfRoot - source of the datapackage schema
 * @param resourcesMap - usage counters keyed by declared resource name
 */
const fillResourceMapCounters = (ddfRoot: DDFRoot, resourcesMap: Map<string, number>) => {
  const schema = ddfRoot.getDataPackageSchema();

  if (!schema) {
    return;
  }

  const records = [
    ...(schema.concepts || []),
    ...(schema.entities || []),
    ...(schema.datapoints || [])
  ];

  for (const record of records) {
    for (const resource of record.resources || []) {
      const usageCount = resourcesMap.get(resource);

      // Only declared resources have a counter; unknown names are not added here.
      if (usageCount !== undefined) {
        resourcesMap.set(resource, usageCount + 1);
      }
    }
  }
};

/**
 * Produces an issue for each resource whose counter in `resourcesMap` is zero.
 *
 * @param dataPackagePath - path put on every produced issue
 * @param resourcesMap - counters keyed by resource name
 * @returns one issue per zero-count resource, in the map's iteration order
 */
const getNonexistentSchemaResourcesIssues = (dataPackagePath: string, resourcesMap: Map<string, number>): Issue[] => {
  const issues: Issue[] = [];

  resourcesMap.forEach((usageCount, resource) => {
    if (usageCount !== 0) {
      return;
    }

    issues.push(
      new Issue(DATAPACKAGE_NONEXISTENT_RESOURCE)
        .setPath(dataPackagePath)
        .setData({resource, specific: 'is NOT found in ddfSchema schema, but found in resources section'})
    );
  });

  return issues;
};

export const rule = {
  /**
   * Cross-checks the resources section of the datapackage against its schema.
   *
   * A counter map is seeded with every resource name at zero and then updated
   * by `fillResourceMapCounters`. The result lists first the issues from
   * `getNonexistentResourcesIssues`, then those from `getNonexistentSchemaResourcesIssues`.
   *
   * @param ddfDataSet - dataset providing the datapackage root
   * @returns the combined list of issues
   */
  rule: (ddfDataSet: DdfDataSet) => {
    const ddfRoot = ddfDataSet.ddfRoot;
    const dataPackagePath = path.resolve(ddfRoot.dataPackageDescriptor.rootFolder, DATA_PACKAGE_FILE);
    const resourcesMap = new Map<string, number>();

    for (const resource of ddfRoot.getDataPackageResources()) {
      resourcesMap.set(resource.name, 0);
    }

    fillResourceMapCounters(ddfRoot, resourcesMap);

    const missingInResources = getNonexistentResourcesIssues(ddfRoot, dataPackagePath, resourcesMap);
    const missingInSchema = getNonexistentSchemaResourcesIssues(dataPackagePath, resourcesMap);

    return missingInResources.concat(missingInSchema);
  }
};
4 changes: 4 additions & 0 deletions src/ddf-rules/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@ import { rule as nonUniqueResourceName } from './data-package-rules/non-unique-r
import { rule as nonUniqueResourceFile } from './data-package-rules/non-unique-resource-file';
import { rule as dataPointWithoutIndicator } from './data-package-rules/datapoint-without-indicator';
import { rule as sameKeyValueConcept } from './data-package-rules/same-key-value-concept';
import { rule as nonexistentResource } from './data-package-rules/nonexistent-resource';
import { rule as nonexistentConcept } from './data-package-rules/nonexistent-concept';

// import { rule as measureValueNotNumeric } from './data-point-rules/measure-value-not-numeric';
import { rule as unexpectedEntityValue } from './data-point-rules/unexpected-entity-value';
Expand Down Expand Up @@ -62,6 +64,8 @@ export const allRules = {
[registry.DATA_POINT_UNEXPECTED_ENTITY_VALUE]: unexpectedEntityValue,
[registry.DATA_POINT_UNEXPECTED_TIME_VALUE]: unexpectedTimeValue,
[registry.SAME_KEY_VALUE_CONCEPT]: sameKeyValueConcept,
[registry.DATAPACKAGE_NONEXISTENT_RESOURCE]: nonexistentResource,
[registry.DATAPACKAGE_NONEXISTENT_CONCEPT]: nonexistentConcept,
[registry.WRONG_ENTITY_IS_HEADER]: wrongEntityIsHeader,
[registry.WRONG_ENTITY_IS_VALUE]: wrongEntityIsValue,
[registry.NON_UNIQUE_ENTITY_VALUE]: nonUniqueEntityValue,
Expand Down
12 changes: 10 additions & 2 deletions src/ddf-rules/registry.ts
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@ export const DATAPACKAGE_NON_CONCEPT_FIELD = Symbol.for('DATAPACKAGE_NON_CONCEPT
export const DATAPACKAGE_INCORRECT_PRIMARY_KEY = Symbol.for('DATAPACKAGE_INCORRECT_PRIMARY_KEY');
export const DATAPACKAGE_NON_UNIQUE_RESOURCE_NAME = Symbol.for('DATAPACKAGE_NON_UNIQUE_RESOURCE_NAME');
export const DATAPACKAGE_NON_UNIQUE_RESOURCE_FILE = Symbol.for('DATAPACKAGE_NON_UNIQUE_RESOURCE_FILE');
export const DATAPACKAGE_NONEXISTENT_RESOURCE = Symbol.for('DATAPACKAGE_NONEXISTENT_RESOURCE');
export const DATAPACKAGE_NONEXISTENT_CONCEPT = Symbol.for('DATAPACKAGE_NONEXISTENT_CONCEPT');
export const UNEXPECTED_TRANSLATION_HEADER = Symbol.for('UNEXPECTED_TRANSLATION_HEADER');
export const UNEXPECTED_TRANSLATIONS_DATA = Symbol.for('UNEXPECTED_TRANSLATIONS_DATA');
export const UNEXPECTED_DATA_POINT_TRANSLATIONS_DATA = Symbol.for('UNEXPECTED_DATA_POINT_TRANSLATIONS_DATA');
Expand Down Expand Up @@ -63,6 +65,8 @@ export const tags: any = {
[DATA_POINT_UNEXPECTED_TIME_VALUE]: [DATAPOINT_TAG],
[WRONG_DATA_POINT_HEADER]: [DATAPOINT_TAG],
[SAME_KEY_VALUE_CONCEPT]: [DATAPACKAGE_TAG],
[DATAPACKAGE_NONEXISTENT_RESOURCE]: [DATAPACKAGE_TAG],
[DATAPACKAGE_NONEXISTENT_CONCEPT]: [DATAPACKAGE_TAG],
[WRONG_ENTITY_IS_HEADER]: [],
[WRONG_ENTITY_IS_VALUE]: [],
[NON_UNIQUE_ENTITY_VALUE]: [],
Expand Down Expand Up @@ -113,6 +117,8 @@ export const descriptions = {
[DATAPACKAGE_INCORRECT_PRIMARY_KEY]: 'Datapackage: Fields section does not contain primary key.',
[DATAPACKAGE_NON_UNIQUE_RESOURCE_NAME]: 'Datapackage: Non-unique resource name found in datapackage.json.',
[DATAPACKAGE_NON_UNIQUE_RESOURCE_FILE]: 'Datapackage: Non-unique resource file found in datapackage.json.',
[DATAPACKAGE_NONEXISTENT_RESOURCE]: 'Resource found in datapackage that is not present in folder',
[DATAPACKAGE_NONEXISTENT_CONCEPT]: 'Concept found in datapackage but is not listed in concepts table',
[UNEXPECTED_TRANSLATION_HEADER]: 'Translations: Unexpected header in translation files',
[UNEXPECTED_TRANSLATIONS_DATA]: 'Translations: Unexpected translations data: primary key is not consistent.',
[UNEXPECTED_DATA_POINT_TRANSLATIONS_DATA]: 'Translations: Unexpected translations datapoint data: primary key is not consistent.',
Expand Down Expand Up @@ -153,8 +159,10 @@ export const howToFix = {
[DATAPACKAGE_INCORRECT_PRIMARY_KEY]: 'Regenerate or update datapackage as described here: https://github.com/Gapminder/ddf-validation#datapackage',
[DATAPACKAGE_NON_UNIQUE_RESOURCE_NAME]: 'Regenerate or update datapackage as described here: https://github.com/Gapminder/ddf-validation#datapackage',
[DATAPACKAGE_NON_UNIQUE_RESOURCE_FILE]: 'Regenerate or update datapackage as described here: https://github.com/Gapminder/ddf-validation#datapackage',
[UNEXPECTED_TRANSLATION_HEADER]: '',
[UNEXPECTED_TRANSLATIONS_DATA]: '',
// A hint states the remedy; it must not repeat the rule description or land on an unrelated key.
[DATAPACKAGE_NONEXISTENT_RESOURCE]: 'Remove the resource from datapackage or restore it in the folder',
[DATAPACKAGE_NONEXISTENT_CONCEPT]: 'Remove the concept from datapackage or add it to the concepts table',
[UNEXPECTED_TRANSLATION_HEADER]: '',
[UNEXPECTED_TRANSLATIONS_DATA]: '',
[UNEXPECTED_DATA_POINT_TRANSLATIONS_DATA]: '',
[DUPLICATED_DATA_POINT_TRANSLATION_KEY]: '',
[DUPLICATED_TRANSLATION_KEY]: '',
Expand Down
8 changes: 4 additions & 4 deletions test/api.spec.ts
Original file line number Diff line number Diff line change
Expand Up @@ -215,7 +215,7 @@ describe('api', () => {
describe('and DDF dataset is correct', () => {
it('should dataset is correct', done => {
const issues = [];
const streamValidator = new StreamValidator('./test/fixtures/good-folder', {});
const streamValidator = new StreamValidator('./test/fixtures/good-folder-dp', {});

streamValidator.on('issue', issue => {
issues.push(issue);
Expand All @@ -233,7 +233,7 @@ describe('api', () => {

it('should custom settings be processed correctly (excludeDirs as string)', done => {
const issues = [];
const streamValidator = new StreamValidator('./test/fixtures/good-folder', {
const streamValidator = new StreamValidator('./test/fixtures/good-folder-dp', {
excludeRules: 'WRONG_DATA_POINT_HEADER',
excludeDirs: '.gitingore, .git',
isCheckHidden: true
Expand Down Expand Up @@ -266,7 +266,7 @@ describe('api', () => {
excludeDirs: ['.gitingore', '.git'],
isCheckHidden: true
};
const streamValidator = new StreamValidator('./test/fixtures/good-folder', EXPECTED_SETTINGS);
const streamValidator = new StreamValidator('./test/fixtures/good-folder-dp', EXPECTED_SETTINGS);

streamValidator.on('issue', issue => {
issues.push(issue);
Expand Down Expand Up @@ -320,7 +320,7 @@ describe('api', () => {
const _StreamValidator = require('../lib/index').StreamValidator;

it('should result for generic and multi thread modes be same ', done => {
const EXPECTED_ISSUES_COUNT = 4;
const EXPECTED_ISSUES_COUNT = 15;
const DATA_SET_PATH = './test/fixtures/rules-cases/data-point-constraint-violation';

parallel({
Expand Down
117 changes: 116 additions & 1 deletion test/data-package-rules.spec.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,8 @@ import {
DATAPACKAGE_NON_UNIQUE_RESOURCE_FILE,
DATAPACKAGE_NON_UNIQUE_RESOURCE_NAME,
DATA_POINT_WITHOUT_INDICATOR,
SAME_KEY_VALUE_CONCEPT
SAME_KEY_VALUE_CONCEPT,
DATAPACKAGE_NONEXISTENT_RESOURCE, DATAPACKAGE_NONEXISTENT_CONCEPT
} from '../src/ddf-rules/registry';
import { DdfDataSet } from '../src/ddf-definitions/ddf-data-set';
import { Issue } from '../src/ddf-rules/issue';
Expand Down Expand Up @@ -368,4 +369,118 @@ describe('ddf datapackage.json validation', () => {
});
});
});

describe('when "DATAPACKAGE_NONEXISTENT_RESOURCE" rule', () => {
  // Resource name that the fixture's schema mentions but its resources section does not declare.
  const MISSING_RESOURCE = 'company_size_string--by--company--anno-2';

  // Expected issue data for a schema record keyed by `anno` plus the given second key.
  const schemaRecordIssue = (secondKey: string) => ({
    nonexistentResources: [MISSING_RESOURCE],
    record: {
      primaryKey: ['anno', secondKey],
      value: 'company_size_string',
      resources: [MISSING_RESOURCE]
    },
    specific: 'is NOT found in resources, but found in schema section'
  });

  it('any issue should NOT be found for a folder without the problem', done => {
    const ddfDataSet = new DdfDataSet('./test/fixtures/dummy-companies-with-dp', null);

    ddfDataSet.load(() => {
      const found = allRules[DATAPACKAGE_NONEXISTENT_RESOURCE].rule(ddfDataSet);

      expect(found.length).to.equal(0);

      done();
    });
  });

  it('an issue should be found for a folder with the problem', done => {
    const ddfDataSet = new DdfDataSet('./test/fixtures/rules-cases/dp-nonexistent-resource', null);
    const expectedData = [
      schemaRecordIssue('company'),
      schemaRecordIssue('english_speaking_company'),
      schemaRecordIssue('foundation'),
      {
        resource: 'company_size-2',
        specific: 'is NOT found in ddfSchema schema, but found in resources section'
      }
    ];

    ddfDataSet.load(() => {
      const found: Issue[] = allRules[DATAPACKAGE_NONEXISTENT_RESOURCE].rule(ddfDataSet);

      expect(found.length).to.equal(expectedData.length);

      // Issues are compared position by position with the expected list.
      found.forEach((issue, position) => {
        expect(endsWith(issue.path, 'datapackage.json')).to.be.true;
        expect(issue.data).to.deep.equal(expectedData[position]);
      });

      done();
    });
  });
});

describe('when "DATAPACKAGE_NONEXISTENT_CONCEPT" rule', () => {
  it('any issue should NOT be found for a folder without the problem', done => {
    const ddfDataSet = new DdfDataSet('./test/fixtures/dummy-companies-with-dp', null);

    ddfDataSet.load(() => {
      const found = allRules[DATAPACKAGE_NONEXISTENT_CONCEPT].rule(ddfDataSet);

      expect(found.length).to.equal(0);

      done();
    });
  });

  it('an issue should be found for a folder with the problem', done => {
    const ddfDataSet = new DdfDataSet('./test/fixtures/rules-cases/dp-nonexistent-concept', null);

    ddfDataSet.load(() => {
      const found: Issue[] = allRules[DATAPACKAGE_NONEXISTENT_CONCEPT].rule(ddfDataSet);
      // Only the first issue is inspected; the count is asserted to be exactly one.
      const firstIssue = head(found);

      expect(found.length).to.equal(1);
      expect(endsWith(firstIssue.path, 'datapackage.json')).to.be.true;
      expect(firstIssue.data).to.equal('company_size');

      done();
    });
  });
});
});
Loading

0 comments on commit 76c0659

Please sign in to comment.