Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

1058-enrichWithCulturegraphRvkWithFix #1921

Merged
merged 19 commits into from
Jun 4, 2024
Merged
Show file tree
Hide file tree
Changes from 18 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 22 additions & 1 deletion pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,12 @@
</dependency>
<dependency>
<groupId>org.metafacture</groupId>
<artifactId>metafacture-triples</artifactId>
<artifactId>metafacture-elasticsearch</artifactId>
<version>6.0.0</version>
</dependency>
<dependency>
<groupId>org.metafacture</groupId>
<artifactId>metafacture-csv</artifactId>
<version>6.0.0</version>
</dependency>
<dependency>
Expand Down Expand Up @@ -124,6 +129,21 @@
<artifactId>core</artifactId>
<version>1.47.1</version>
</dependency>
<dependency>
<groupId>org.apache.logging.log4j</groupId>
<artifactId>log4j-api</artifactId>
<version>2.9.1</version>
</dependency>
<dependency>
<groupId>org.apache.logging.log4j</groupId>
<artifactId>log4j-1.2-api</artifactId>
<version>2.9.1</version>
</dependency>
<dependency>
<groupId>commons-validator</groupId>
<artifactId>commons-validator</artifactId>
<version>1.5.1</version>
</dependency>
<dependency>
<groupId>org.elasticsearch</groupId>
<artifactId>elasticsearch</artifactId>
Expand Down Expand Up @@ -263,6 +283,7 @@
</executions>
<configuration>
<excludes>
<exclude>tmp/</exclude>
<exclude>web/public/javascripts/leaflet.js</exclude>
<exclude>**/*.woff2</exclude>
<exclude>web/conf/context.jsonld</exclude>
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
/* Copyright 2020 hbz, Pascal Christoph. Licensed under the EPL 2.0*/

package org.lobid.resources.run;

import java.io.File;
import java.io.IOException;

import org.metafacture.biblio.marc21.MarcXmlHandler;
import org.metafacture.csv.CsvEncoder;
import org.metafacture.json.JsonDecoder;
import org.metafacture.json.JsonEncoder;
import org.metafacture.io.FileOpener;
import org.metafacture.io.ObjectWriter;
import org.metafacture.xml.XmlDecoder;
import org.metafacture.metafix.Metafix;

/**
 * Filters resources with hbz holdings from culturegraph's MARCXML while
 * transforming it with reject() into a CSV file.
 *
 * @author Pascal Christoph (dr0i)
 * @author Tobias Bülte (TobiasNx)
 **/
public final class CulturegraphXmlFilterHbzRvkToCsv {
	// Default output file name; may be overridden by the second CLI argument.
	private static String OUTPUT_FILE = "cg-concordance.csv";

	/**
	 * Runs the MARCXML-to-CSV filter pipeline.
	 *
	 * @param args args[0] (required): the MARCXML input file; args[1]
	 *             (optional): the CSV output file, defaults to
	 *             "cg-concordance.csv"
	 */
	public static void main(String... args) {
		final String xmlInputFile = new File(args[0]).getAbsolutePath();

		if (args.length > 1) {
			OUTPUT_FILE = args[1];
		}

		final FileOpener opener = new FileOpener();
		// The fix produces a "records" array with one sub-record per hbz id.
		// The JSON roundtrip (encode, then decode with record path "records")
		// splits these sub-records into individual records for the CSV.
		final JsonDecoder jsonDecoder = new JsonDecoder();
		jsonDecoder.setRecordPath("records");
		try {
			opener.setReceiver(new XmlDecoder()).setReceiver(new MarcXmlHandler())
					.setReceiver(new Metafix("src/main/resources/rvk/cg-to-rvk-csv.fix"))
					.setReceiver(new JsonEncoder())
					.setReceiver(jsonDecoder)
					.setReceiver(new CsvEncoder())
					.setReceiver(new ObjectWriter<>(OUTPUT_FILE));
		} catch (IOException e) {
			// Fail fast instead of processing with a half-built pipeline,
			// which would only produce confusing follow-up errors.
			throw new IllegalStateException(
					"Could not set up the CSV filter pipeline", e);
		}
		opener.process(xmlInputFile);
		try {
			opener.closeStream();
		} catch (final NullPointerException e) {
			// ignore, see https://github.com/hbz/lobid-resources/issues/1030
		}
	}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
/* Copyright 2020 hbz, Pascal Christoph. Licensed under the EPL 2.0*/

package org.lobid.resources.run;

import java.io.File;
import java.io.IOException;

import org.metafacture.biblio.marc21.MarcXmlHandler;
import org.metafacture.elasticsearch.JsonToElasticsearchBulk;
import org.metafacture.io.FileOpener;
import org.metafacture.io.ObjectWriter;
import org.metafacture.json.JsonEncoder;
import org.metafacture.xml.XmlDecoder;
import org.metafacture.metafix.Metafix;

/**
 * Filters resources with hbz holdings from culturegraph's MARCXML while
 * transforming it with reject() into JSON and writes this as an
 * elasticsearch bulk json file.
 *
 * @author Pascal Christoph (dr0i)
 * @author Tobias Bülte (TobiasNx)
 **/
public final class CulturegraphXmlFilterHbzToJson {
	private static final String ELASTICSEARCH_INDEX_NAME = "cg";
	public static final String ELASTICSEARCH_INDEX_TYPE_NAME = "rvk";
	// Default output file name; may be overridden by the second CLI argument.
	private static String JSON_FILE = "bulk.ndjson";

	/**
	 * Runs the MARCXML-to-elasticsearch-bulk filter pipeline.
	 *
	 * @param args args[0] (required): the MARCXML input file; args[1]
	 *             (optional): the bulk JSON output file, defaults to
	 *             "bulk.ndjson"
	 */
	public static void main(String... args) {
		final String xmlInputFile = new File(args[0]).getAbsolutePath();

		if (args.length > 1) {
			JSON_FILE = args[1];
		}

		final FileOpener opener = new FileOpener();
		try {
			opener.setReceiver(new XmlDecoder()).setReceiver(new MarcXmlHandler())
					.setReceiver(new Metafix("src/main/resources/rvk/cg-to-rvk-json.fix"))
					.setReceiver(new JsonEncoder())
					.setReceiver(new JsonToElasticsearchBulk(ELASTICSEARCH_INDEX_TYPE_NAME,
							ELASTICSEARCH_INDEX_NAME))
					.setReceiver(new ObjectWriter<>(JSON_FILE));
		} catch (IOException e) {
			// Fail fast instead of processing with a half-built pipeline,
			// which would only produce confusing follow-up errors.
			throw new IllegalStateException(
					"Could not set up the JSON filter pipeline", e);
		}
		opener.process(xmlInputFile);
		try {
			opener.closeStream();
		} catch (final NullPointerException e) {
			// ignore, see https://github.com/hbz/lobid-resources/issues/1030
		}
	}
}
40 changes: 40 additions & 0 deletions src/main/resources/rvk/cg-to-rvk-csv.fix
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
# Builds, per MARC record, a "records" array of sub-records (one per hbz id),
# each mapping an hbz id to the record's RVK notations. Records without RVK
# notations or without hbz ids are rejected.

set_array("records[]")
set_array("@id[]")
set_array("rvk[]")

# Collect all RVK notations: 084 fields qualified with $2 = "rvk".
do list(path: "084??", "var": "$i")
if any_match("$i.2", "rvk")
copy_field("$i.a","rvk[].$append")
end
end

uniq("rvk[]")
join_field("rvk[]",",")


# Collect all hbz ids: "(DE-605)"-prefixed 035 $a values.
do list(path: "035??", "var": "$i")
if any_match("$i.a", "^\\(DE-605\\)(.*)")
copy_field("$i.a","@id[].$append")
end
end
# Strip the "(DE-605)" prefix. NOTE(review): this previously addressed the
# non-existent path "id[].*" (the field is "@id[]") and so was a no-op.
replace_all("@id[].*","^\\(DE-605\\)(.*)","$1")

# Emit one sub-record per hbz id, each carrying the full RVK list.
do list(path: "@id[]", "var": "$i")
copy_field("$i","records[].$append.id")
copy_field("rvk[]","records[].$last.rvk[]")
end
# Defensive: ids are already stripped above; keep this as a safety net.
replace_all("records[].*.id","^\\(DE-605\\)(.*)","$1")

vacuum()

# Filter records without RVK
unless exists("rvk[]")
reject()
end

# Filter records without hbz ids
unless exists("@id[]")
reject()
end

retain("records[]")
29 changes: 29 additions & 0 deletions src/main/resources/rvk/cg-to-rvk-json.fix
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why do we need two fixes?

Copy link
Member

@dr0i dr0i Jun 3, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In json we don't need a record for every id on its own - the search is done by the search engine, in contrast to a csv where we need a single unique key. The json is more performant when using search engines, the csv is the only way to go when using tables.
It could also be, if the CSV works well, that we can get rid of JSON altogether.

Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# Collect all RVK notations: 084 fields qualified with $2 = "rvk".
set_array("rvk[]")

do list(path: "084??", "var": "$i")
if any_match("$i.2", "rvk")
copy_field("$i.a","rvk[].$append")
end
end

dr0i marked this conversation as resolved.
Show resolved Hide resolved
# Collect all hbz ids: "(DE-605)"-prefixed 035 $a values.
set_array("id")
do list(path: "035??", "var": "$i")
if any_match("$i.a", "^\\(DE-605\\)(.*)")
copy_field("$i.a","id.$append")
end
end
# Strip the "(DE-605)" prefix from every collected id.
replace_all("id.*","^\\(DE-605\\)(.*)","$1")
# Join all ids into one comma-separated string field (unlike the CSV fix,
# which emits one sub-record per id; here the search engine handles lookup).
join_field("id",", ")

retain("rvk[]","id")
vacuum()

# Filter records without RVK
unless exists("rvk[]")
reject()
end

# Filter records without hbz ids
unless exists("id")
reject()
end
4 changes: 3 additions & 1 deletion src/test/java/UnitTests.java
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,9 @@
@RunWith(Suite.class)
@Suite.SuiteClasses({
TestGenerateContext.class,
org.lobid.resources.AlmaMarc21XmlToLobidJsonMetafixTest.class})
org.lobid.resources.AlmaMarc21XmlToLobidJsonMetafixTest.class,
org.lobid.resources.CulturegraphXmlFilterHbzRvkToCsvTest.class,
org.lobid.resources.CulturegraphXmlFilterHbzToJsonTest.class})

public final class UnitTests {
/* Suite class, groups tests via annotation above */
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
/* Copyright 2020 hbz, Pascal Christoph. Licensed under the EPL 2.0*/

package org.lobid.resources;

import static org.junit.Assert.assertEquals;

import java.io.File;
import java.io.IOException;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.junit.Test;
import org.lobid.resources.run.CulturegraphXmlFilterHbzRvkToCsv;

/**
* Test of filtering resources with hbz holdings from culturegraph MARCXML,
* tranforming into a CSV file.
*
* @author Pascal Christoph(dr0i)
**/
public final class CulturegraphXmlFilterHbzRvkToCsvTest {

private static final Logger LOG =
LoggerFactory.getLogger(CulturegraphXmlFilterHbzRvkToCsvTest.class);

private static final String PATH_TO_TEST = "src/test/resources/";
public static final String OUTPUT_FILE =
PATH_TO_TEST + "cg/output.csv";

private static final String XML_INPUT_FILE = "cg/aggregate_20240507_example.marcxml";

@SuppressWarnings("static-method")
@Test
public void testExtractLookupTableFromCgAsHbzRvk() {
CulturegraphXmlFilterHbzRvkToCsv.main(PATH_TO_TEST + XML_INPUT_FILE,
OUTPUT_FILE);
}

/**private static void ingest() throws IOException {
File jsonFile = new File(OUTPUT_FILE);
}*/


}
Loading
Loading