-
Notifications
You must be signed in to change notification settings - Fork 7
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
1058-enrichWithCulturegraphRvkWithFix #1921
Merged
Merged
Changes from all commits
Commits
Show all changes
19 commits
Select commit
Hold shift + click to select a range
ca6c3ef
Add extraction and transformation of culturegraph RVK
dr0i ed0344f
Add filter to omit empty records
dr0i f535aa3
Add rule to make id AND rvk mandatory
dr0i 95d87a6
Add junit test
dr0i 573cf05
Test runner
dr0i d9235a3
WIP
dr0i 3454ae9
Use fix instead of morph for culturegraph #1058
TobiasNx bdbdefa
Update logger dependencies
TobiasNx 87f63af
Catch FileNotFoundException #1058
TobiasNx 3fdcc5c
Catch Exception (#1058)
dr0i 8fb1fe3
Remove metafacture-triples; add metafacture-elasticsearch (#1058)
dr0i 26a55a7
Update test data (#1058)
dr0i 554949b
Format
dr0i 848c948
Reduce complexity of flow (#1058)
dr0i f6c150f
Generate also CSV from CultureGraph as hbz-RVK concordance (#1085)
dr0i eae4a69
Enforce two columns' CSV; set ID to first column (#1058)
dr0i ee6ad10
Exclude tmp directory for editorconfig
dr0i 9c4a21a
Create record for each ID; separate CSV and JSON (#1058)
dr0i e297026
Update src/main/resources/rvk/cg-to-rvk-json.fix
dr0i File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
53 changes: 53 additions & 0 deletions
53
src/main/java/org/lobid/resources/run/CulturegraphXmlFilterHbzRvkToCsv.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,53 @@ | ||
/* Copyright 2020 hbz, Pascal Christoph. Licensed under the EPL 2.0*/ | ||
|
||
package org.lobid.resources.run; | ||
|
||
import java.io.File; | ||
import java.io.IOException; | ||
|
||
import org.metafacture.biblio.marc21.MarcXmlHandler; | ||
import org.metafacture.csv.CsvEncoder; | ||
import org.metafacture.json.JsonDecoder; | ||
import org.metafacture.json.JsonEncoder; | ||
import org.metafacture.io.FileOpener; | ||
import org.metafacture.io.ObjectWriter; | ||
import org.metafacture.xml.XmlDecoder; | ||
import org.metafacture.metafix.Metafix; | ||
|
||
/** | ||
* Filter resources with hbz holdings from culturegraph's MARCXML while tranform it with reject() | ||
* into a CSV file. | ||
* | ||
* @author Pascal Christoph (dr0i) | ||
* @author Tobias Bülte (TobiasNx) | ||
**/ | ||
public final class CulturegraphXmlFilterHbzRvkToCsv { | ||
private static String OUTPUT_FILE="cg-concordance.csv"; | ||
|
||
public static void main(String... args) { | ||
String XML_INPUT_FILE = new File(args[0]).getAbsolutePath(); | ||
|
||
if (args.length > 1) OUTPUT_FILE = args[1]; | ||
|
||
final FileOpener opener = new FileOpener(); | ||
JsonDecoder jsonDecoder = new JsonDecoder(); | ||
jsonDecoder.setRecordPath("records"); | ||
try { | ||
opener.setReceiver(new XmlDecoder()).setReceiver(new MarcXmlHandler()) | ||
.setReceiver(new Metafix("src/main/resources/rvk/cg-to-rvk-csv.fix")) | ||
.setReceiver(new JsonEncoder()) | ||
.setReceiver(jsonDecoder) | ||
.setReceiver(new CsvEncoder()) | ||
.setReceiver(new ObjectWriter<>(OUTPUT_FILE)); | ||
} catch (IOException e) { | ||
e.printStackTrace(); | ||
} | ||
opener.process( | ||
new File(XML_INPUT_FILE).getAbsolutePath()); | ||
try { | ||
opener.closeStream(); | ||
} catch (final NullPointerException e) { | ||
// ignore, see https://github.com/hbz/lobid-resources/issues/1030 | ||
} | ||
} | ||
} |
51 changes: 51 additions & 0 deletions
51
src/main/java/org/lobid/resources/run/CulturegraphXmlFilterHbzToJson.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,51 @@ | ||
/* Copyright 2020 hbz, Pascal Christoph. Licensed under the EPL 2.0*/ | ||
|
||
package org.lobid.resources.run; | ||
|
||
import java.io.File; | ||
import java.io.IOException; | ||
|
||
import org.metafacture.biblio.marc21.MarcXmlHandler; | ||
import org.metafacture.elasticsearch.JsonToElasticsearchBulk; | ||
import org.metafacture.io.FileOpener; | ||
import org.metafacture.io.ObjectWriter; | ||
import org.metafacture.json.JsonEncoder; | ||
import org.metafacture.xml.XmlDecoder; | ||
import org.metafacture.metafix.Metafix; | ||
|
||
/** | ||
* Filter resources with hbz holdings from culturegraph's MARCXML while tranform it with reject() | ||
* into JSON and write this as an elasticsearch bulk json file. | ||
* | ||
* @author Pascal Christoph (dr0i) | ||
* @author Tobias Bülte (TobiasNx) | ||
**/ | ||
public final class CulturegraphXmlFilterHbzToJson { | ||
private static final String ELASTICSEARCH_INDEX_NAME = "cg"; | ||
public static final String ELASTICSEARCH_INDEX_TYPE_NAME="rvk"; | ||
private static String JSON_FILE="bulk.ndjson"; | ||
|
||
public static void main(String... args) { | ||
String XML_INPUT_FILE = new File(args[0]).getAbsolutePath(); | ||
|
||
if (args.length > 1) JSON_FILE = args[1]; | ||
|
||
final FileOpener opener = new FileOpener(); | ||
try { | ||
opener.setReceiver(new XmlDecoder()).setReceiver(new MarcXmlHandler()) | ||
.setReceiver(new Metafix("src/main/resources/rvk/cg-to-rvk-json.fix")) | ||
.setReceiver(new JsonEncoder()) | ||
.setReceiver(new JsonToElasticsearchBulk(ELASTICSEARCH_INDEX_TYPE_NAME, ELASTICSEARCH_INDEX_NAME)) | ||
.setReceiver(new ObjectWriter<>(JSON_FILE)); | ||
} catch (IOException e) { | ||
e.printStackTrace(); | ||
} | ||
opener.process( | ||
new File(XML_INPUT_FILE).getAbsolutePath()); | ||
try { | ||
opener.closeStream(); | ||
} catch (final NullPointerException e) { | ||
// ignore, see https://github.com/hbz/lobid-resources/issues/1030 | ||
} | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,40 @@ | ||
# Builds "records[]" with one entry per hbz ID of the incoming MARC record.
# Each entry carries the ID (without its "(DE-605)" prefix) and the record's
# RVK notations joined into a single comma separated string.
set_array("records[]")
set_array("@id[]")
set_array("rvk[]")

# Collect subfield a of every 084 field whose subfield 2 matches "rvk"
do list(path: "084??", "var": "$i")
  if any_match("$i.2", "rvk")
    copy_field("$i.a","rvk[].$append")
  end
end

uniq("rvk[]")
join_field("rvk[]",",")

# Collect subfield a of every 035 field that starts with the "(DE-605)" prefix
do list(path: "035??", "var": "$i")
  if any_match("$i.a", "^\\(DE-605\\)(.*)")
    copy_field("$i.a","@id[].$append")
  end
end

# One output record per collected ID, each with the same joined RVK value
do list(path: "@id[]", "var": "$i")
  copy_field("$i","records[].$append.id")
  copy_field("rvk[]","records[].$last.rvk[]")
end
# Strip the "(DE-605)" prefix from the IDs of the output records
replace_all("records[].*.id","^\\(DE-605\\)(.*)","$1")

vacuum()

# Filter records without RVK
unless exists("rvk[]")
  reject()
end

# Filter records without hbz ids
unless exists("@id[]")
  reject()
end

retain("records[]")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,29 @@ | ||
# Reduces a MARC record to two fields: "rvk[]" (the RVK notations) and "id" | ||
# (the hbz IDs joined into one string). Records lacking either are rejected. | ||
set_array("rvk[]") | ||
|
||
# Collect subfield a of every 084 field whose subfield 2 matches "rvk" | ||
do list(path: "084??", "var": "$i") | ||
if any_match("$i.2", "rvk") | ||
copy_field("$i.a","rvk[].$append") | ||
end | ||
end | ||
uniq("rvk[]") | ||
set_array("id") | ||
# Collect subfield a of every 035 field that starts with the "(DE-605)" prefix | ||
do list(path: "035??", "var": "$i") | ||
if any_match("$i.a", "^\\(DE-605\\)(.*)") | ||
copy_field("$i.a","id.$append") | ||
end | ||
end | ||
# Strip the "(DE-605)" prefix, then join all IDs into one ", " separated string | ||
replace_all("id.*","^\\(DE-605\\)(.*)","$1") | ||
join_field("id",", ") | ||
|
||
# Keep only the two fields built above; vacuum() runs before the checks below, | ||
# presumably so that empty values count as missing - TODO confirm | ||
retain("rvk[]","id") | ||
vacuum() | ||
|
||
# Filter records without RVK | ||
unless exists("rvk[]") | ||
reject() | ||
end | ||
|
||
# Filter records without hbz ids | ||
unless exists("id") | ||
reject() | ||
end |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
45 changes: 45 additions & 0 deletions
45
src/test/java/org/lobid/resources/CulturegraphXmlFilterHbzRvkToCsvTest.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,45 @@ | ||
/* Copyright 2020 hbz, Pascal Christoph. Licensed under the EPL 2.0*/ | ||
|
||
package org.lobid.resources; | ||
|
||
import static org.junit.Assert.assertEquals; | ||
|
||
import java.io.File; | ||
import java.io.IOException; | ||
|
||
import org.slf4j.Logger; | ||
import org.slf4j.LoggerFactory; | ||
|
||
import org.junit.Test; | ||
import org.lobid.resources.run.CulturegraphXmlFilterHbzRvkToCsv; | ||
|
||
/** | ||
* Test of filtering resources with hbz holdings from culturegraph MARCXML, | ||
* tranforming into a CSV file. | ||
* | ||
* @author Pascal Christoph(dr0i) | ||
**/ | ||
public final class CulturegraphXmlFilterHbzRvkToCsvTest { | ||
|
||
private static final Logger LOG = | ||
LoggerFactory.getLogger(CulturegraphXmlFilterHbzRvkToCsvTest.class); | ||
|
||
private static final String PATH_TO_TEST = "src/test/resources/"; | ||
public static final String OUTPUT_FILE = | ||
PATH_TO_TEST + "cg/output.csv"; | ||
|
||
private static final String XML_INPUT_FILE = "cg/aggregate_20240507_example.marcxml"; | ||
|
||
@SuppressWarnings("static-method") | ||
@Test | ||
public void testExtractLookupTableFromCgAsHbzRvk() { | ||
CulturegraphXmlFilterHbzRvkToCsv.main(PATH_TO_TEST + XML_INPUT_FILE, | ||
OUTPUT_FILE); | ||
} | ||
|
||
/**private static void ingest() throws IOException { | ||
File jsonFile = new File(OUTPUT_FILE); | ||
}*/ | ||
|
||
|
||
} |
Oops, something went wrong.
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why do we need two fixes?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
In `json` we don't need a separate record for every `id` - the search is done by the search engine - in contrast to a `csv`, where we need a single unique key. The `json` is more performant when using search engines; the `csv` is the only way to go when using tables.
Could also be, if the `csv` is working great, that we can get rid of `json` altogether.