Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Hbz lobid rdf to json 25 contributor types #106

Merged
merged 11 commits into from
Sep 22, 2016
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
<dependency>
<groupId>org.culturegraph</groupId>
<artifactId>metafacture-core</artifactId>
<version>2.0.1-HBZ-SNAPSHOT</version> <!-- 1.2.0</version> -->
<version>4.0.0-HBZ-SNAPSHOT</version> <!-- 1.2.0</version> -->
</dependency>
<dependency>
<groupId>org.apache.jena</groupId>
Expand Down
4 changes: 2 additions & 2 deletions src/main/java/org/lobid/resources/run/MabXml2lobidJsonEs.java
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
import org.culturegraph.mf.morph.Metamorph;
import org.culturegraph.mf.stream.converter.xml.AlephMabXmlHandler;
import org.culturegraph.mf.stream.converter.xml.XmlDecoder;
import org.culturegraph.mf.stream.pipe.BatchLogger;
import org.culturegraph.mf.stream.pipe.StreamBatchLogger;
import org.culturegraph.mf.stream.pipe.ObjectBatchLogger;
import org.culturegraph.mf.stream.pipe.StreamTee;
import org.culturegraph.mf.stream.source.FileOpener;
Expand Down Expand Up @@ -78,7 +78,7 @@ public static void main(String... args) {
esIndexer.setIndexAliasSuffix(indexAliasSuffix);
esIndexer.setUpdateNewestIndex(update);
esIndexer.onSetReceiver();
BatchLogger batchLogger = new BatchLogger();
StreamBatchLogger batchLogger = new StreamBatchLogger();
batchLogger.setBatchSize(100000);
ObjectBatchLogger<HashMap<String, String>> objectBatchLogger =
new ObjectBatchLogger<>();
Expand Down
140 changes: 103 additions & 37 deletions src/main/resources/morph-hbz01-to-lobid.xml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
<!-- Aleph MAB differs from MAB in that: -->
<!-- an indicator2 exists -->
<!-- most fields have a subfield 'a' -->
<!-- => structure of alehp MAB: $field$indicator1$indicator2$subfield -->
<!-- => structure of aleph MAB: $field$indicator1$indicator2$subfield -->
<vars>
<var name="ns-lobid-resource" value="http://lobid.org/resources/"/>
<var name="ns-lobid-vocab" value="http://purl.org/lobid/lv#"/>
Expand Down Expand Up @@ -60,44 +60,67 @@
</concat>
</combine>
</macro>
<!-- prefer GND subject IDs to literals -->
<macro name="subjectOrder">
<combine name="@subjectOrder" value="${a}">
<concat delimiter=", " name="a">
<choose name="a" flushWith="$[field]-[-12].9|record">
<choose name="a" flushWith="$[field]-[-12]" sameEntity="true">
<data source="$[field]-[-12].9">
<regexp match="\(DE-588\)(.*)" format="http://d-nb.info/gnd/${1}"/>
</data>
<concat delimiter=", " name="ignored" flushWith="$[field]-[-12].9|record">
<data source="$[field]-[-12].[acfghkpstkz-]"/>
</concat>
<data source="$[field]-[-12].[acfghkpstkz-]"/>
</choose>
</concat>
</combine>
</macro>
<!-- subjects without ID -->
<macro name="dctSubject">
<combine name="@subjectIdOrElseLiteral" value="${a}">
<choose name="a" flushWith="$[field]-[-12].9|record">
<combine name="@rdfTypeSubject" value="${a}" flushWith="$[field]-[-12]|record"
reset="true" sameEntity="true">
<choose name="a" flushWith="$[field]-[-12]|record" sameEntity="true">
<data source="$[field]-[-12].9">
<regexp match="\(DE-588\)(.*)" format="http://d-nb.info/gnd/${1}"/>
<regexp match="\(DE-588\)(.*)" format=""/> <!-- will be ignored -->
</data>
<data source="$[field]-[-12].e" name="a">
<constant value="http://d-nb.info/standards/elementset/gnd#ConferenceOrEvent"/>
</data>
<data source="$[field]-[-12].g" name="a">
<constant value="http://d-nb.info/standards/elementset/gnd#PlaceOrGeographicName"/>
</data>
<data source="$[field]-[-12].k" name="a">
<constant value="http://d-nb.info/standards/elementset/gnd#CorporateBody"/>
</data>
<data source="$[field]-[-12].[acfghkpstkz-]">
<regexp match="(.*)" format="${1}"/>
<data source="$[field]-[-12].p" name="a">
<constant value="http://d-nb.info/standards/elementset/gnd#Person"/>
</data>
<data source="$[field]-[-12].t" name="a">
<constant value="http://d-nb.info/standards/elementset/gnd#Work"/>
</data>
<data source="$[field]-[-12].?" name="a">
<constant value="http://d-nb.info/standards/elementset/gnd#SubjectHeading"/>
</data>
</choose>
</combine>
</macro>
</macros>
<rules>
<data source="@subjectIdOrElseLiteral" name="@subjectId">
<regexp match="http.*"/>
<sanitizeUrl></sanitizeUrl>
</data>
<data source="@subjectId" name="http://purl.org/dc/terms/subject"/>
<entity name="http://purl.org/dc/terms/subject">
<data source="@subjectIdOrElseLiteral" name="http://d-nb.info/standards/elementset/gnd#preferredName">
<regexp match="^[^http].*"/>
</data>
<entity name="http://purl.org/dc/terms/subject" flushWith="@rdfTypeSubject"
reset="true" sameEntity="true">
<data source="@rdfTypeSubject" name="http://www.w3.org/1999/02/22-rdf-syntax-ns#type">
<regexp match="^(http.*)"/>
</data>
<combine name="$[ns-gnd]preferredName" value="${a}" reset="true" sameEntity="true">
<data source="@rdfTypeSubject">
<regexp match="^http"/>
</data>
<data source="9[01234][27]-[-12].[aefghpstz-]" name="a">
<regexp match="(.*)" format="${1}"/>
</data>
</combine>
</entity>
<data source="9[01234][27]-[-12].9" name="http://purl.org/dc/terms/subject">
<regexp match="\(DE-588\)(.*)" format="http://d-nb.info/gnd/${1}"/>
</data>
<!-- ####################### -->
<!-- ####### Get subject uri of each record -->
<!-- ####################### -->
Expand Down Expand Up @@ -858,7 +881,8 @@
<replace pattern="&lt;&lt;" with=""/>
</data>
</combine>
<choose name="@creatorNameOrId" flushWith="[12]????">
<!-- flushWith="@creatorPersonId|@creatorCorporateBodyId|@creatorLabel|@nameOfCreatingCorporateBody" -->
<choose name="@creatorNameOrId" flushWith="[12]????" sameEntity="true" reset="false">
<data source="@creatorPersonId"/>
<data source="@creatorCorporateBodyId"/>
<data source="@creatorLabel"/>
Expand All @@ -868,22 +892,22 @@
<!-- Emit dc:contributor only for literal names, i.e. values of @contributorNameOrId that do NOT start with "http". -->
<!-- A negative lookahead is needed here: the character class [^http] would instead drop every value whose first character is 'h', 't' or 'p'. -->
<data source="@contributorNameOrId" name="http://purl.org/dc/elements/1.1/contributor">
<regexp match="^(?!http).+"/>
</data>
<choose name="@contributorNameOrId" flushWith="[12]????">
<choose name="@contributorNameOrId" flushWith="[12]????" sameEntity="true" reset="false">
<data source="@contributorPersonId"/>
<data source="@contributorCorporateBodyId"/>
<data source="@contributorLabel"/>
<data source="@nameOfContributingCorporateBody"/>
</choose>
<entity name="http://purl.org/dc/terms/contributor" sameEntity="true">
<data source="@contributorNameOrId" name="http://d-nb.info/standards/elementset/gnd#preferredName">
<data source="@contributorNameOrId" name="$[ns-gnd]preferredName">
<!-- Only literal names (values not starting with "http") become a preferredName. -->
<!-- Uses a negative lookahead; the character class [^http] would wrongly reject names beginning with 'h', 't' or 'p'. -->
<regexp match="^(?!http).+"/>
</data>
<!-- For literal contributor names (values not starting with "http") emit the constant rdf:type gnd#Person. -->
<!-- Uses a negative lookahead; the character class [^http] would skip names beginning with 'h', 't' or 'p'. -->
<data source="@contributorNameOrId" name="http://www.w3.org/1999/02/22-rdf-syntax-ns#type">
<regexp match="^(?!http).+" format="http://d-nb.info/standards/elementset/gnd#Person"/>
</data>
</entity>
<entity name="http://purl.org/dc/terms/contributor" sameEntity="true">
<data source="2?[02468]-1.a" name="http://d-nb.info/standards/elementset/gnd#preferredName"/>
<data source="2?[02468]-1.a" name="$[ns-gnd]preferredName"/>
<data source="2?[02468]-1.a" name="http://www.w3.org/1999/02/22-rdf-syntax-ns#type">
<regexp match=".*" format="http://d-nb.info/standards/elementset/gnd#CorporateBody"/>
</data>
Expand Down Expand Up @@ -926,6 +950,7 @@
<data source="@nameOfCreatingCorporateBodyK"/>
</choose>
</combine>
<data source="2?[02468][-a][12].g" name="@nameOfCreatingCorporateBodyG"/>
<!-- only contributor, not creator -->
<data source="2?[02468][bcfep][12].k" name="@nameOfContributingCorporateBodyK">
<replace pattern="&gt;&gt;" with=""/>
Expand All @@ -936,7 +961,6 @@
<data source="@nameOfContributingCorporateBodyK" name="k"/>
<data source="@nameOfContributingCorporateBodyH" name="h"/>
</combine>
<data source="2?[02468][bcfep][12].b" name=""/>
<combine name="@nameOfContributingCorporateBodyB" value="${a}">
<concat delimiter=", " name="a" flushWith="@contributorCorporateBodyId">
<data source="2?[02468][bcfep][12].b"/>
Expand Down Expand Up @@ -1236,17 +1260,19 @@
</concat>
</combine>
<combine name="@subjectPerson" value="${b}" sameEntity="true">
<data source="9[01234][27]-[-12].p"/>
<choose name="b" flushWith="@subjectLinks">
<data source="9[01234][27]-[-12].[efgkstz]">
<regexp match=".*" format=""/>
</data>
<data source="9[01234][27]-[-12].c">
<regexp match="^Familie"/>
</data>
<data source="@preferredNamePersonConcatSubjectPC"/>
</choose>
</combine>
<combine name="$[ns-gnd]preferredNameForThePerson" value="${a}" flushWith="@preferredNameP">
<data source="@subjectPerson" name="a">
<regexp match=".*"/>
<!-- Pass every @subjectPerson value except those starting with "Familie". -->
<!-- Negative lookahead instead of the character class [^Familie], which would drop all names starting with F, a, m, i, l or e (e.g. "Fontane"). -->
<regexp match="^(?!Familie).+"/>
</data>
</combine>
<!-- ########### -->
Expand All @@ -1273,14 +1299,29 @@
</concat>
</combine>
<!-- START contributor/creator -->
<combine name="$[ns-gnd]preferredNameForThePerson" value="${a}" sameEntity="true"
reset="true">
<combine name="@preferredNameForThePerson" value="${a}" sameEntity="true" reset="true">
<data source="@preferredName" name="a">
<replace pattern="&gt;&gt;" with=""/>
<replace pattern="&lt;&lt;" with=""/>
</data>
<data source="@creatorPersonId|@contributorPersonId"/>
</combine>
<data source="@preferredNameForThePerson" name="$[ns-gnd]preferredNameForThePerson"/>
<combine name="http://www.w3.org/1999/02/22-rdf-syntax-ns#type" value="http://d-nb.info/standards/elementset/gnd#Family"
flushWith="@subjectLinks">
<data source="9[01234][27]-[-12].c">
<regexp match="Familie"/>
</data>
</combine>
<combine name="http://www.w3.org/1999/02/22-rdf-syntax-ns#type" value="http://d-nb.info/standards/elementset/gnd#Person">
<data source="@preferredNameForThePerson"/>
</combine>
<!-- Emit rdf:type gnd#Person for a subject person, flushed with @preferredNameP. -->
<!-- Values starting with "Familie" are excluded here. -->
<!-- The exclusion uses a negative lookahead: the character class [^Familie] would also exclude every name starting with F, a, m, i, l or e. -->
<combine name="http://www.w3.org/1999/02/22-rdf-syntax-ns#type" value="http://d-nb.info/standards/elementset/gnd#Person"
flushWith="@preferredNameP">
<data source="@subjectPerson">
<regexp match="^(?!Familie).+"/>
</data>
</combine>
<combine name="$[ns-gnd]preferredNameForTheCorporateBody" value="${a}"
flushWith="@creatorCorporateBodyId">
<data source="@nameOfCreatingCorporateBody" name="a"/>
Expand Down Expand Up @@ -1309,21 +1350,31 @@
<replace pattern="&lt;" with=""/>
</data>
</combine>
<choose name="$[ns-gnd]preferredNameForTheWork" flushWith="@subjectLinks">
<choose name="@preferredNameForTheWork" flushWith="@subjectLinks">
<data source="@preferredNameSemicolon"/>
<data source="9[01234][27]-[-12].t"/>
</choose>
<combine name="$[ns-gnd]preferredNameForTheSubjectHeading" value="${a}"
sameEntity="true">
<data source="9[01234][27]-[-12].s"/>
<data source="@preferredNameForTheWork" name="$[ns-gnd]preferredNameForTheWork"/>
<combine name="http://www.w3.org/1999/02/22-rdf-syntax-ns#type" value="http://d-nb.info/standards/elementset/gnd#Work">
<data source="@preferredNameForTheWork"/>
</combine>
<!-- default, generic type -->
<combine name="@preferredNameForTheSubjectHeading" value="${a}" sameEntity="true">
<data source="9[01234][27]-[-12].[szf]"/>
<data source="@preferredName" name="a"/>
</combine>
<combine name="$[ns-gnd]preferredNameForThePlaceOrGeographicName" value="${a}"
<data source="@preferredNameForTheSubjectHeading" name="$[ns-gnd]preferredNameForTheSubjectHeading"/>
<combine name="http://www.w3.org/1999/02/22-rdf-syntax-ns#type" value="http://d-nb.info/standards/elementset/gnd#SubjectHeading">
<data source="@preferredNameForTheSubjectHeading"/>
</combine>
<combine name="@preferredNameForThePlaceOrGeographicName" value="${a}"
sameEntity="true">
<data source="@subjectLinks"/>
<data source="9[01234][27]-[-12].g"/>
<data source="@preferredNameConcatSubject" name="a"/>
</combine>
<data source="@preferredNameForThePlaceOrGeographicName|@nameOfCreatingCorporateBodyG"
name="$[ns-gnd]preferredNameForThePlaceOrGeographicName"/>
<combine name="$[ns-gnd]dateOfBirth" value="${a}" sameEntity="true">
<data source="@subjectLinks"/>
<data source="@preferredNameP"/>
Expand All @@ -1338,6 +1389,22 @@
<regexp match="[0-9]+-([0-9]+)" format="${1}"/>
</data>
</combine>
<!-- Emits an rdf-syntax-ns#type statement whose value comes from the nested <choose>. -->
<!-- The <choose> offers three alternatives, each replaced by a constant GND type: -->
<!--   @preferredNameConcatEventEDC => ConferenceOrEvent -->
<!--   @preferredNameForThePlaceOrGeographicName or @nameOfCreatingCorporateBodyG => PlaceOrGeographicName -->
<!--   subfield k of a 9xx subject field, @creatorCorporateBodyId or @contributorCorporateBodyId => CorporateBody -->
<!-- Both the combine and the choose flush on @subjectLinks, @creatorCorporateBodyId or @contributorCorporateBodyId. -->
<!-- NOTE(review): presumably <choose> prefers the alternative defined first when several fire; confirm against the Metamorph documentation. -->
<combine name="http://www.w3.org/1999/02/22-rdf-syntax-ns#type" value="${a}"
flushWith="@subjectLinks|@creatorCorporateBodyId|@contributorCorporateBodyId"
sameEntity="true">
<choose name="a" flushWith="@subjectLinks|@creatorCorporateBodyId|@contributorCorporateBodyId"
sameEntity="true">
<data source="@preferredNameConcatEventEDC">
<constant value="$[ns-gnd]ConferenceOrEvent"/>
</data>
<data source="@preferredNameForThePlaceOrGeographicName|@nameOfCreatingCorporateBodyG">
<constant value="$[ns-gnd]PlaceOrGeographicName"/>
</data>
<data source="9[01234][27]-[-12].k|@creatorCorporateBodyId|@contributorCorporateBodyId">
<constant value="$[ns-gnd]CorporateBody"/>
</data>
</choose>
</combine>
<!-- /subjects -->
<choose name="@preferredName"
flushWith="@subjectLinks|@preferredNameP|@creatorPersonId|@creatorCorporateBodyId|@contributorPersonId|@contributorCorporateBodyId">
Expand Down Expand Up @@ -1429,8 +1496,7 @@
<replace pattern=", :" with=":"/>
</data>
<!-- ############ "schlagwortketten" (aka subject chains) as ordered lists -->
<!-- prefer gnd subjects IDs to literals, make rdf-lists -->
<entity name="~rdf:list">
<entity name="~rdf:list" reset="true">
<data name="$[ns-lobid-vocab]subjectOrder" source="@subjectOrder"/>
</entity>
<call-macro name="subjectOrder" field="902"/>
Expand All @@ -1443,7 +1509,7 @@
<call-macro name="subjectOrder" field="937"/>
<call-macro name="subjectOrder" field="942"/>
<call-macro name="subjectOrder" field="947"/>
<!-- subjects as an json object -->
<!-- subjects without ID -->
<call-macro name="dctSubject" field="902"/>
<call-macro name="dctSubject" field="907"/>
<call-macro name="dctSubject" field="912"/>
Expand Down Expand Up @@ -1718,4 +1784,4 @@
<filemap name="sigel2isilMap" files="sigel2isilMap.csv"/>
<filemap name="iso639xToIso639-3-Map" files="iso639xToIso639-3-Map.tsv"/>
</maps>
</metamorph>
</metamorph>
2 changes: 2 additions & 0 deletions src/main/resources/schemata/metamorph.xsd
Original file line number Diff line number Diff line change
Expand Up @@ -325,7 +325,9 @@ decreasing order of definition.
<attribute name="name" type="string" use="optional"/>
<attribute name="value" type="string" use="optional"/>
<!-- <attribute name="defaultValue" type="string" use="optional" /> -->
<attribute name="reset" type="boolean" use="optional" default="true" />
<attribute name="flushWith" type="string" use="optional" default="record"/>
<attribute name="sameEntity" type="boolean" use="optional" default="false"/>
</complexType>
</element>
<element name="tuples">
Expand Down
1 change: 1 addition & 0 deletions src/test/resources/data
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
curl -XGET http://gaia.hbz-nrw.de:9200/resources-smalltest-20160315-132731/item/HT008351497:DE-6:46%205244
Loading