diff --git a/Concordance-RVK-Verbundbibliothek/README.md b/Concordance-RVK-Verbundbibliothek/README.md new file mode 100644 index 0000000..380d7a7 --- /dev/null +++ b/Concordance-RVK-Verbundbibliothek/README.md @@ -0,0 +1,58 @@ +Create a json file of concordance of Verbundbibliotheks-IDs <-> RVK +==================== +This is an example of how to load the `culturegraph aggregate MARC21 XML` file, +select a Library Union Catalog based on an ISIL, associate the IDs of the +libraries with the [RVK](https://de.wikipedia.org/wiki/Regensburger_Verbundklassifikation) and generate a JSON bulk file which can be indexed into Elasticsearch. + +- [Create a json file of concordance of Verbundbibliotheks-IDs <-> RVK](#create-a-json-file-of-concordance-of-verbundbibliotheks-ids-----rvk) +- [Installation](#installation) +- [Run](#run) +- [Index](#index) +- [Enrichment](#enrichment) + +# Installation +Until the next metafacture release (> 5.1.0, coming this year) the easiest way is to +load this prebundle: +```bash +wget http://lobid.org/download/tmp/dini-kim-2020_rvk/metafacture-core-rvk-dist.zip +unzip metafacture-core-rvk-dist.zip +cd metafacture-core-rvk-dist +``` +# Run +```bash +bash flux.sh culturegraph_to_Rvk-Verbundbibliothek_concordance_jsonbulk.flux +``` +The parameter after `flux.sh` (the flux file) can be the path to the flux file, e.g. the +flux from this repo. Just make sure that the files used in the flux (input, morph) +reside in the same directory as the flux itself (as it is in this repo). + +To process the real aggregate data dump (~7GB), obtain it and adjust the `infile` path in the flux. 
+ +# Index +_This should work at least for all Elasticsearch versions <8.0, where the "index-type" setting is still valid._ +The generated `bulk.ndjson` looks like this: + +> {"index":{"_index":"cgrvk","_type":"rvk"}} +> {"rvk":["CI 1100","5,1"],"hbzId":"HT018839495, HT018625006"} + +This is Elasticsearch's bulk format, where the odd-numbered lines are the index +metadata and the following even-numbered lines are the actual data to be indexed. + +Make sure your Elasticsearch is up and running. Index: +``` +curl -XPOST --header 'Content-Type: application/x-ndjson' --data-binary @bulk.ndjson 'http://localhost:9200/_bulk' +``` +*Note*: upload sizes are limited by the Elasticsearch HTTP receive buffer size (default 100 MB). See the script `bulkIndexingEs.sh` for how to split +the `bulk.ndjson` and index the parts when you use culturegraph's complete aggregate data. + +Test-query: +```bash +curl 'localhost:9200/cgrvk/_search?q=hbzId:HT018625006' +``` + +# Enrichment +The resulting Elasticsearch index can be used to enrich your data. +*Note*: as you may have quite a lot of records (several million), don't use +HTTP requests when doing lookups against the index but use the native `TransportClients` +of Elasticsearch, thus avoiding the HTTP overhead for performance reasons. Elasticsearch +provides client libraries for nearly all programming languages. 
diff --git a/Concordance-RVK-Verbundbibliothek/aggregate_auslieferung_20191212.small.marcxml b/Concordance-RVK-Verbundbibliothek/aggregate_auslieferung_20191212.small.marcxml new file mode 100644 index 0000000..00954fd --- /dev/null +++ b/Concordance-RVK-Verbundbibliothek/aggregate_auslieferung_20191212.small.marcxml @@ -0,0 +1,112 @@ + + + + 00000nam a2200000 a 4500 + CG_1_2019-12-08T10:23:41.073Z + DE-101 + + (AT-OBV)990034557380203331 + 7\p + + + Quelle + f + 7\p + + + + 00000nam a2200000 a 4500 + CG_2_2019-12-08T10:23:41.073Z + DE-101 + + (AT-OBV)990032216710203331 + 4\p + + + (DE-101)962757853 + 6\p + + + (DE-605)HT018839495 + 1\p + + + (DE-605)HT018625006 + 9\p + + + (DE-607)HT01862500i7 + 9\p + + + CI 1100 + rvk + (DE-625)18356: + (DE-603)407647848 + 3\p + 5\p + + + 5,1 + ssgn + 10\p + 11\p + 12\p + 13\p + + + (DE-588)118572350 + (DE-627)135606780 + (DE-576)20901315X + Lévinas, Emmanuel + 1906-1995 + gnd + 11\p + + + Ethics + 11\p + 12\p + 13\p + + + Phenomenology + 10\p + + + DE-101 + DE-101 + 6\p + + + DE-605 + 7\p + 9\p + + + DE-605 + 7\p + 9\p + + + + 00000nam a2200000 a 4500 + CG_2_2019-12-08T10:23:42.074Z + DE-101 + (DE-605)HT013166356 + 1\p + + (DE-605)HT018625006 + 9\p + + + (DE-588)118572350 + (DE-627)135606780 + (DE-576)20901315X + Lévinas, Emmanuel + 1906-1995 + gnd + 11\p + + + diff --git a/Concordance-RVK-Verbundbibliothek/bulkIndexingEs.sh b/Concordance-RVK-Verbundbibliothek/bulkIndexingEs.sh new file mode 100644 index 0000000..71b4d51 --- /dev/null +++ b/Concordance-RVK-Verbundbibliothek/bulkIndexingEs.sh @@ -0,0 +1,10 @@ +#Upload sizes are limited to the Elasticsearch HTTP receive buffer size (default 100 Mb). +# 1. split bulk.ndjson +# Because every two lines are one complete bulk index request one must split them +# even-numbered. E.g.: +split --lines=1000000 bulk.ndjson +# 2. 
now all the resulting files can be indexed: +for i in x*; do # glob over the split output (xaa, xab, ...) instead of parsing `ls` + [ -e "$i" ] || continue; echo "$i" + curl -H "Content-Type: application/x-ndjson" -XPOST localhost:9200/_bulk --data-binary "@$i" 2>&1 >/dev/null # response body is discarded; curl's stderr goes to the original stdout +done diff --git a/Concordance-RVK-Verbundbibliothek/culturegraph_to_Rvk-Verbundbibliothek_concordance_jsonbulk.flux b/Concordance-RVK-Verbundbibliothek/culturegraph_to_Rvk-Verbundbibliothek_concordance_jsonbulk.flux new file mode 100644 index 0000000..e2c360f --- /dev/null +++ b/Concordance-RVK-Verbundbibliothek/culturegraph_to_Rvk-Verbundbibliothek_concordance_jsonbulk.flux @@ -0,0 +1,30 @@ +// This flux uses morph-cg-to-es.xml to filter the ISIL DE-605 out of +// the culturegraph aggregate marcxml and builds a json bulk +// file; a snippet of it: +// +//{"index":{"_index":"cgrvk","_type":"rvk"}} +//{"rvk":["CI 1100","5,1"],"hbzId":"HT018839495, HT018625006"} +// +// This file can be loaded into an Elasticsearch index via curl: +// +// curl -XPOST --header 'Content-Type: application/x-ndjson' --data-binary @bulk.ndjson 'http://localhost:9200/_bulk' + +default outfile = "bulk.ndjson"; +default infile = FLUX_DIR + "aggregate_auslieferung_20191212.small.marcxml"; +default morphfile = FLUX_DIR + "morph-cg-to-es.xml"; + + +infile| +open-file| +decode-xml| +split-xml-elements(topLevelElement="marc:collection",elementName="record")| +literal-to-object| +read-string| +decode-xml| +handle-marcxml| +filter(morphfile)| +morph(morphfile)| +encode-json| +json-to-elasticsearch-bulk(type="rvk", index="cgrvk")| +write(outfile); + diff --git a/Concordance-RVK-Verbundbibliothek/morph-cg-to-es.xml b/Concordance-RVK-Verbundbibliothek/morph-cg-to-es.xml new file mode 100644 index 0000000..172e889 --- /dev/null +++ b/Concordance-RVK-Verbundbibliothek/morph-cg-to-es.xml @@ -0,0 +1,29 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + +