Skip to content

Commit

Permalink
Add lint script for default model
Browse files Browse the repository at this point in the history
This change adds a script to test for common issues with the default FtM model:

* Divergent types: Multiple properties with the same name, but different types
* Divergent labels: Multiple properties with the same name, but different labels
* Label collisions: Multiple properties with different names, but using the same label

These issues can cause problems in applications built on the model, such as Aleph. For instance, divergent types can cause errors when querying multiple Elasticsearch indexes, and divergent labels result in a confusing user experience.
  • Loading branch information
tillprochaska committed Jun 9, 2023
1 parent 3c51eaf commit cf3efbd
Show file tree
Hide file tree
Showing 3 changed files with 145 additions and 21 deletions.
3 changes: 3 additions & 0 deletions .github/workflows/build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,9 @@ jobs:
sudo apt-get install -y -qq libicu-dev
pip install wheel pyicu
pip install -e ".[dev]"
- name: Run checks for default model
run: |
python contrib/check_model.py
- name: Run the tests
run: |
make test
Expand Down
142 changes: 142 additions & 0 deletions contrib/check_model.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
import sys
from collections import defaultdict
from followthemoney import model


# Property names exempt from the divergent-type check: test_divergent_types()
# skips a name listed here even if the properties sharing it differ in type.
IGNORE_DIVERGENT_TYPES = [
    "author",
    "organization",
    "classification",
    "gender",
    "number",
    "authority",
    "duration",
    "cpvCode",
    "nutsCode",
    "area",
    "subject",
    "sender",
]

# Property names exempt from the divergent-label check:
# test_divergent_labels() skips a name listed here even if the properties
# sharing it carry different labels.
IGNORE_DIVERGENT_LABELS = [
    "wikidataId",
    "parent",
    "holder",
    "number",
    "authority",
    "title",
    "cpvCode",
    "nutsCode",
    "criteria",
    "procedure",
    "callForTenders",
    "ticker",
]

# Labels exempt from the label-collision check: test_label_collisions() skips
# a label listed here even if properties with different names share it.
IGNORE_LABEL_COLLISIONS = [
    "Description",
    "Address",
    "Notes",
    "Customs declarations",
    "Country of origin",
    "Payments received",
    "Payments made",
    "Entity",
    "Passport number",
    "The language of the translated text",
    "Responding to",
    "Document number",
    "ISIN",
]


def test_divergent_types(by_name):
    """Find property names whose properties do not all share one type.

    ``by_name`` maps a property name to the collection of properties that
    carry that name. A name is skipped when its collection holds exactly one
    property or when it is listed in ``IGNORE_DIVERGENT_TYPES``.

    Returns a dict mapping each offending name to its collection of
    properties, i.e. those collections with more than one distinct ``type``.
    """
    return {
        prop_name: group
        for prop_name, group in by_name.items()
        if len(group) != 1
        and prop_name not in IGNORE_DIVERGENT_TYPES
        and len({member.type for member in group}) > 1
    }


def test_divergent_labels(by_name):
    """Find property names whose properties do not all share one label.

    ``by_name`` maps a property name to the collection of properties that
    carry that name. A name is skipped when its collection holds exactly one
    property or when it is listed in ``IGNORE_DIVERGENT_LABELS``.

    Returns a dict mapping each offending name to its collection of
    properties, i.e. those collections with more than one distinct ``label``.
    """
    return {
        prop_name: group
        for prop_name, group in by_name.items()
        if len(group) != 1
        and prop_name not in IGNORE_DIVERGENT_LABELS
        and len({member.label for member in group}) > 1
    }


def test_label_collisions(by_label):
    """Find labels that are shared by properties with different names.

    ``by_label`` maps a label to the collection of properties that use it.
    A label is skipped when its collection holds exactly one property or
    when it is listed in ``IGNORE_LABEL_COLLISIONS``.

    Returns a dict mapping each colliding label to its collection of
    properties, i.e. those collections with more than one distinct ``name``.
    """
    return {
        shared_label: group
        for shared_label, group in by_label.items()
        if len(group) != 1
        and shared_label not in IGNORE_LABEL_COLLISIONS
        and len({member.name for member in group}) > 1
    }


if __name__ == '__main__':
    # Index every property of every schema in the model by its name and by
    # its label, run the three checks on those indexes, print one report
    # section per check that found something, and exit with status 1 if any
    # check did.
    by_name = defaultdict(set)
    by_label = defaultdict(set)

    for schema in model:
        for prop in schema.properties.values():
            by_name[prop.name].add(prop)
            by_label[prop.label].add(prop)

    divergent_types = test_divergent_types(by_name)
    divergent_labels = test_divergent_labels(by_name)
    label_collisions = test_label_collisions(by_label)

    failed = False

    # NOTE: each group of properties is a set, which has no defined
    # iteration order. The groups are sorted by qualified name before
    # printing so that the report is stable from run to run.

    if divergent_types:
        failed = True
        print("DIVERGENT TYPES\n")
        for name, props in divergent_types.items():
            print(f" {name}:")
            for prop in sorted(props, key=lambda p: p.qname):
                print(f" * {prop.qname} - {prop.type.name}")
            print()
        print()

    if divergent_labels:
        failed = True
        print("DIVERGENT LABELS\n")
        for name, props in divergent_labels.items():
            print(f" {name}:")
            for prop in sorted(props, key=lambda p: p.qname):
                print(f" * {prop.qname} - {prop.label}")
            print()
        print()

    if label_collisions:
        failed = True
        print("COLLIDING LABELS\n")
        for label, props in label_collisions.items():
            print(f" {label}:")
            for prop in sorted(props, key=lambda p: p.qname):
                print(f" * {prop.qname}")
            print()

    if failed:
        # A non-zero exit status lets the calling process (e.g. the CI step
        # that runs this script) detect that issues were found.
        sys.exit(1)

    print("No issues found.")
21 changes: 0 additions & 21 deletions contrib/collisions.py

This file was deleted.

0 comments on commit cf3efbd

Please sign in to comment.