Skip to content

Commit

Permalink
Major rewrite
Browse files Browse the repository at this point in the history
* Proper processing order implemented (class->resource->source
  instead of class->source->resource)
* Implement merging of conflicting resources in the repository
* Streaming output to the file.
* Use quickRdf as broadly as possible (and rdfInterface2easyRdf
  library to convert)
* and probably more
  • Loading branch information
zozlak committed Jan 9, 2023
1 parent 5b443fc commit 58bc180
Show file tree
Hide file tree
Showing 15 changed files with 533 additions and 297 deletions.
5 changes: 3 additions & 2 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
vendor
/vendor
/composer.lock
/nbproject
composer.lock
/config.yaml
/*.ttl*
186 changes: 109 additions & 77 deletions arche-ref-sources
Original file line number Diff line number Diff line change
Expand Up @@ -26,21 +26,22 @@
*/

use zozlak\argparse\ArgumentParser as AP;
use GuzzleHttp\Exception\ClientException;
use quickRdf\Dataset;
use quickRdf\DataFactory as DF;
use quickRdf\Dataset;
use quickRdfIo\Util as ioUtil;
use rdfInterface2easyRdf\AsRdfInterface;
use rdfHelpers\DatasetNode;
use acdhOeaw\UriNormalizer;
use acdhOeaw\UriNormRules;
use acdhOeaw\UriNormalizerRule;
use acdhOeaw\UriNormalizerCache;
use acdhOeaw\UriNormalizerException;
use acdhOeaw\arche\lib\Repo;
use acdhOeaw\arche\refSources\NamedEntityIteratorFile;
use acdhOeaw\arche\refSources\NamedEntityIteratorRepo;
use acdhOeaw\arche\refSources\NamedEntityIteratorInterface;
use acdhOeaw\arche\refSources\NamedEntityInterface;
use acdhOeaw\arche\refSources\Util;
use acdhOeaw\arche\refSources\PropertyMapping;
use acdhOeaw\arche\refSources\PropertyMappings;
use acdhOeaw\arche\refSources\RefSourcesException;

if (file_exists(__DIR__ . '/../../autoload.php')) {
require_once __DIR__ . '/../../autoload.php';
Expand All @@ -59,17 +60,18 @@ $modes = [
'test' => MODE_TEST,
'update' => MODE_UPDATE,
];
$parser = new AP("Enriches metadata by fetching additional information from external reference sources (GND, geonames, etc.)\nWhich resource classes are processed, which external reference sources are used and which properties are fetched from them is driven by the configuration file.");
$parser = new AP(epilog: "Enriches metadata by fetching additional information from external reference sources (GND, geonames, etc.)\nWhich resource classes are processed, which external reference sources are used and which properties are fetched from them is driven by the configuration file.");
$parser->addArgument('--limit', type: AP::TYPE_INT, default: PHP_INT_MAX, help: 'limit number of processed resources');
$parser->addArgument('--after', help: 'process only resources modified after a given date');
$parser->addArgument('--id', help: 'process only repository resource with a given id');
$parser->addArgument('--id', help: 'process only resource with a given id');
$parser->addArgument('--class', help: 'process only resource of a given class');
$parser->addArgument('--repoUrl', help: 'use a given repository instance (overwrites the `repositoryUrl` property read from the config file)');
$parser->addArgument('--inputFile', help: 'read resources from a given RDF file instead of the ARCHE repository');
$parser->addArgument('--user', help: 'user name used for repository authentication (not important if --test or --resolveOnly are used)');
$parser->addArgument('--pswd', help: 'password used for repository authentication (not important if --test or --resolveOnly are used)');
$parser->addArgument('--mode', default: 'parse', choices: array_keys($modes), help: "operation mode\n - resolve - only try to resolve the external URI (finds broken external URIs)\n - parse [default] - resolve the external URI and parse the output (when used with --verbose and/or --output it allows to inspect the data provided by the external source and test the metadata mapping defined in the configuration file)\n - test - tries to update the repository resource with data fetched from the external source (so doorkeeper checks are performed) but rolls back the update no matter if it was successful or not\n - update - updates the repository resource with data fetched from the external source\n");
$parser->addArgument('--verbose', action: AP::ACTION_STORE_TRUE, help: 'provide more verbose output, especially print the data fetched from the external reference source');
$parser->addArgument('--output', help: 'when used, the data to be saved to the repository are also saved in a TTL file');
$parser->addArgument('--output', help: "when used, the data to be saved to the repository is also saved in a TTL file (the output isn't created in mode=resolve)");
$parser->addArgument('cfgFile', help: 'path to the configuration file');
$param = $parser->parseArgs();
$param->mode = $modes[$param->mode];
Expand All @@ -89,92 +91,122 @@ $dateFilter = !empty($param->after) ? $param->after : null;

// Helper objects initialization
$repo = Repo::factoryFromUrl($param->repositoryUrl ?? $cfg->repositoryUrl ?? die("ARCHE repository URL unknown"), $guzzleOpts);
$idProp = $repo->getSchema()->id;
$idPropNN = DF::namedNode($idProp);
$cache = new UriNormalizerCache();
$rules = UriNormRules::getRules();
$normalizer = new UriNormalizer($rules, cache: $cache);
$normRules = UriNormRules::getRules();
$normalizer = new UriNormalizer($normRules);

/* @var $source NamedEntityIteratorInterface */
if (!empty($param->inputFile)) {
$source = new NamedEntityIteratorFile($param->inputFile, $repo);
} else {
$source = new NamedEntityIteratorRepo($repo);
}
if (!empty($param->id)) {
$param->id = $normalizer->normalize($param->id);
}

// Fetch the data from external sources
$outputMeta = new Dataset(false);
if (!empty($param->output) && $param->mode >= MODE_PARSE) {
$nmsp = new quickRdf\RdfNamespace();
foreach ($cfg->namespaces ?? [] as $alias => $prefix) {
$nmsp->add($prefix, $alias);
}
$output = fopen($param->output, 'w');
}
foreach ($cfg->classes as $class => $cCfg) {
echo "\n### Processing resources of class $class\n\n";

//TODO - first resources should be collected,
// - then all namespaces should be processed for a given resource
// - then update should be performed
foreach ($cCfg as $namespace => $mappings) {
echo "\n ### Processing resources in the $namespace namespace\n\n";

$normRule = array_filter($rules, fn($x) => $x->name === $namespace);
$normRule = array_pop($normRule) ?? die("Normalization rules unknown for the $namespace namespace");
$normRule = UriNormalizerRule::factory($normRule);
$db = new UriNormalizer([$normRule], cache: $cache);
$idFilter = !empty($param->id) ? "^" . $param->id . "$" : $normRule->match;
$source->setFilter($class, $idFilter, $dateFilter, $param->limit);

$mappings = PropertyMapping::fromConfig($mappings);

foreach ($source->getNamedEntities() as $N => $namedEntity) {
/* @var $namedEntity NamedEntityInterface */
$N = $N + 1;
$T = $source->getCount();
$NN = round(100 * $N / $T);
$namedEntityUri = DF::namedNode($namedEntity->getUri());
echo " Resource $namedEntityUri ($N/$T $NN%)\n";

$ids = $namedEntity->getIdentifiers($normRule->match, $db);
if (count($ids) === 0) {
echo " WARNING: no matching identifiers\n";
continue;
}
if (!empty($param->class) && $class !== $param->class) {
continue;
}
echo "\n### Processing resources of class $class\n";

$newMeta = new Dataset(false);
foreach ($ids as $id) {
try {
$dbMeta = $db->fetch($id);
} catch (Throwable $e) {
echo " ERROR: Failed to load data from $id with error: " . $e->getMessage() . "\n";
continue;
}
if ($param->mode <= MODE_RESOLVE) {
continue;
}
$extDbNames = array_keys(get_object_vars($cCfg));

$mappings = new PropertyMappings($normalizer, $idPropNN);
$idFilter = [];
foreach ($extDbNames as $extDbName) {
$rule = array_filter($normRules, fn($x) => $x->name === $extDbName);
if (count($rule) === 0) {
die("No normalization rules found for the '$extDbName' external reference database\n");
}
$rule = UriNormalizerRule::factory(reset($rule));
$mappings->addExternalDatabase($extDbName, $rule, $cCfg->$extDbName);
$idFilter[] = $rule->match;
}

foreach ($mappings as $pCfg) {
/* @var $pCfg PropertyMapping */
$pCfg->merge($newMeta, $dbMeta, $namedEntityUri, $normalizer);
if (!empty($param->id)) {
$idFilter = "^" . $param->id . "$";
} else {
$idFilter = '(' . implode(')|(', $idFilter) . ')';
}
$source->setFilter($class, $idFilter, $dateFilter, $param->limit);

$N = 1;
foreach ($source->getNamedEntities() as $entity) {
/* @var NamedEntityInterface $entity */
$T = $source->getCount();
$NN = round(100 * $N / $T);
echo " " . $entity->getUri() . " ($N/$T $NN%)\n";
$N++;

// collect data from external databases
$entityExtMeta = [];
$idsToProcess = $entity->getIdentifiers($normalizer);
$idsProcessed = [];
while (count($idsToProcess) > 0) {
try {
$id = array_pop($idsToProcess);
$idsProcessed[$id] = 1;
$id = $normalizer->normalize($id);
$idsProcessed[$id] = 1;
// don't even try to resolve identifiers for which there's no mapping
$mappings->matchExternalDatabase($id);
echo " fetching data for $id\n";
$meta = AsRdfInterface::addDatasetNode($normalizer->fetch($id), new DF(), fn($x) => new DatasetNode(new Dataset(), $x));
$uriStr = $meta->getNode()->getValue();
$extDbName = $mappings->matchExternalDatabase($uriStr);
if (!isset($entityExtMeta[$extDbName])) {
$entityExtMeta[$extDbName] = [];
} if (!isset($entityExtMeta[$extDbName][$uriStr])) {
$entityExtMeta[$extDbName][$uriStr] = $meta;
foreach ($mappings->mapIdentifiers($meta, $extDbName) as $i) {
if (!isset($idsProcessed[$i]) && !in_array($i, $idsToProcess)) {
$idsToProcess[] = $i;
}
}
}
} catch (RefSourcesException | UriNormalizerException $e) {
echo $param->verbose ? " unsupported source: " . $e->getMessage() . "\n" : '';
}
if (!empty($param->output)) {
$outputMeta->add($newMeta);
}
if ($param->mode <= MODE_PARSE) {
continue;
}
if ($param->mode < MODE_RESOLVE) {
continue;
}

// update entity's metadata
$entityMeta = $entity->getMetadata();
$entityMetaOrig = $entityMeta->getDataset()->copy();
foreach ($extDbNames as $dbName) {
foreach ($entityExtMeta[$dbName] ?? [] as $extDbMeta) {
$mappings->resolveAndMerge($dbName, $entityMeta, $extDbMeta);
}
// ARCHE update goes here
}
echo $param->verbose ? " fetched data: \n " . trim(str_replace("\n", "\n ", (string) $entityMeta->getDataset()->copyExcept($entityMetaOrig))) . "\n" : '';

// save entity's metadata
if ($param->mode >= MODE_TEST) {
echo " updating ARCHE resource " . ($param->mode === MODE_TEST ? '(test)' : '') . "\n";
try {
$namedEntity->updateMetadata(Util::asEasyRdfResource($newMeta, $namedEntity->getUri()), $param->mode <= MODE_TEST);
echo $param->verbose ? " INFO: updated successfully\n" : "";
} catch (ClientException $ex) {
echo " ERROR: " . (string) $ex->getResponse()->getBody() . "\n";
$merged = $entity->updateMetadata($entityMeta, $param->mode === MODE_TEST);
$merged = count($merged) > 0 ? '(merged with: ' . implode(', ', $merged) . ')' : '';
echo " succeeded $merged\n";
} catch (\Exception $e) {
echo " failed with: " . ($param->verbose ? print_r($e, true) : $e->getMessage()) . "\n";
}
}
if (isset($output)) {
ioUtil::serialize($entityMeta, 'text/turtle', $output, $nmsp);
}
}
}
echo $param->verbose ? "\n### Collected metadata:\n\n$outputMeta\n" : '';
if (!empty($param->output)) {
echo "\n### Writing collected metadata to $param->output\n";
$nmsp = new quickRdf\RdfNamespace();
foreach ($cfg->namespaces ?? [] as $alias => $prefix) {
$nmsp->add($prefix, $alias);
}
ioUtil::serialize($outputMeta, 'text/turtle', $param->output, $nmsp);
}
if (isset($output)) {
fclose($output);
}
16 changes: 9 additions & 7 deletions composer.json
Original file line number Diff line number Diff line change
Expand Up @@ -14,23 +14,24 @@
"require": {
"php": ">=8",
"guzzlehttp/guzzle": "^7",
"acdh-oeaw/arche-lib": "*",
"psr/log": "*",
"acdh-oeaw/arche-lib": "^5",
"psr/log": "^3",
"acdh-oeaw/arche-assets": "^3.8.1",
"acdh-oeaw/uri-normalizer": "^2",
"zozlak/argparse": "*",
"zozlak/argparse": "^1",
"sweetrdf/quick-rdf-io": "^1",
"sweetrdf/quick-rdf": "^1",
"sweetrdf/term-templates": "^1"
"sweetrdf/term-templates": "^1",
"sweetrdf/rdfinterface2easyrdf": "^0.2"
},
"autoload": {
"psr-4": {
"acdhOeaw\\": "src/acdhOeaw"
}
},
"require-dev": {
"phpunit/phpunit": "*",
"phpstan/phpstan": "*"
"phpunit/phpunit": "^9",
"phpstan/phpstan": "^1"
},
"autoload-dev": {
"psr-4": {
Expand All @@ -39,5 +40,6 @@
},
"bin": [
"arche-ref-sources"
]
],
"minimum-stability": "dev"
}
3 changes: 3 additions & 0 deletions config-sample.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -184,8 +184,11 @@ classes:
- property: https://vocabs.acdh.oeaw.ac.at/schema#hasCountry
action: replace
type: literal
langProcess: assure
langValue: und
path:
- https://d-nb.info/standards/elementset/gnd#geographicAreaCode
- http://www.w3.org/2004/02/skos/core#prefLabel
- property: https://vocabs.acdh.oeaw.ac.at/schema#hasUrl
action: add
type: literal
Expand Down
56 changes: 27 additions & 29 deletions src/acdhOeaw/arche/refSources/NamedEntityFile.php
Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,12 @@

namespace acdhOeaw\arche\refSources;

use EasyRdf\Resource;
use quickRdf\DataFactory as DF;
use rdfHelpers\DatasetNode;
use termTemplates\QuadTemplate as QT;
use acdhOeaw\UriNormalizer;
use acdhOeaw\arche\lib\RepoResource;
use acdhOeaw\UriNormalizerException;
use acdhOeaw\arche\lib\Repo;

/**
* Named entity whose metadata is held in an RDF dataset node supplied by NamedEntityIteratorFile
Expand All @@ -37,47 +40,42 @@
*/
class NamedEntityFile implements NamedEntityInterface {

private Resource $res;
use NamedEntityTrait;

private DatasetNode $node;
private NamedEntityIteratorFile $iter;
private Repo $repo;

public function __construct(Resource $res, NamedEntityIteratorFile $iter) {
$this->res = $res;
public function __construct(DatasetNode $node,
NamedEntityIteratorFile $iter, Repo $repo) {
$this->node = $node;
$this->iter = $iter;
$this->repo = $repo;
}

/**
 * Returns the metadata node this entity was constructed with.
 *
 * The very same object stored by the constructor is returned (no copy is made here).
 *
 * @return DatasetNode
 */
public function getMetadata(): DatasetNode {
return $this->node;
}

/**
*
* @param string $match
* @param UriNormalizer $normalizer
* @return array<string>
*/
public function getIdentifiers(string $match, UriNormalizer $normalizer): array {
$match = "`$match`";
$ids = [];
foreach ($this->res->allResources($this->iter->getIdProp()) as $id) {
$id = (string) $id;
if (preg_match($match, $id)) {
$ids[] = $normalizer->normalize($id);
public function getIdentifiers(UriNormalizer $normalizer): array {
$allIds = $this->node->getIterator(new QT(predicate: DF::namedNode($this->iter->getIdProp())));
$ids = [];
foreach (iterator_to_array($allIds) as $id) {
try {
$ids[] = $normalizer->normalize((string) $id->getObject()->getValue(), true);
} catch (UriNormalizerException $e) {

}
}
return $ids;
}

public function getUri(): string {
return $this->res->getUri();
}

public function updateMetadata(Resource $meta, bool $test = true): void {

$repoRes = $this->iter->getRepoResource($this->res);
$repo = $repoRes->getRepo();
/* @var $repo \acdhOeaw\arche\lib\Repo */
$repo->begin();
$repoRes->setMetadata($meta);
$repoRes->updateMetadata(RepoResource::UPDATE_MERGE);
if ($test) {
$repo->rollback();
} else {
$repo->commit();
}
return $this->node->getNode()->getValue();
}
}
Loading

0 comments on commit 58bc180

Please sign in to comment.