Skip to content

Commit

Permalink
complete consolidation
Browse files Browse the repository at this point in the history
  • Loading branch information
kermitt2 committed Nov 24, 2018
1 parent d58c820 commit f0e5bf4
Show file tree
Hide file tree
Showing 3 changed files with 109 additions and 32 deletions.
10 changes: 8 additions & 2 deletions build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -389,13 +389,19 @@ project(":grobid-trainer") {
jvmArgs '-Xmx3072m'
}

task(EvaluationDOIMatching, dependsOn: 'classes', type: JavaExec, group: 'modelevaluation') {
// Launches org.grobid.trainer.evaluation.EvaluationDOIMatching with the 'data' action,
// which builds the DOI matching evaluation dataset (no evaluation report is produced here).
// The second program argument comes from getArg('p2t', '.') -- presumably the 'p2t'
// build argument with '.' as fallback; confirm against the getArg helper.
task(PrepareDOIMatching, dependsOn: 'classes', type: JavaExec, group: 'modelevaluation') {
main = 'org.grobid.trainer.evaluation.EvaluationDOIMatching'
classpath = sourceSets.main.runtimeClasspath
args 'data', getArg('p2t', '.')
// cap the forked JVM heap at 3072 MB
jvmArgs '-Xmx3072m'
}
// jvmArgs '-Djava.net.useSystemProxies=true' is not working

// Launches org.grobid.trainer.evaluation.EvaluationDOIMatching with the 'eval' action,
// which runs the DOI matching evaluation and prints the resulting report.
// The second program argument comes from getArg('p2t', '.') -- presumably the 'p2t'
// build argument with '.' as fallback; confirm against the getArg helper.
task(EvaluationDOIMatching, dependsOn: 'classes', type: JavaExec, group: 'modelevaluation') {
main = 'org.grobid.trainer.evaluation.EvaluationDOIMatching'
classpath = sourceSets.main.runtimeClasspath
args 'eval', getArg('p2t', '.')
// cap the forked JVM heap at 3072 MB
jvmArgs '-Xmx3072m'
}
}

// coveralls does not support multi-module builds, therefore merging reports
Expand Down
55 changes: 45 additions & 10 deletions grobid-core/src/main/java/org/grobid/core/engines/Engine.java
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
import org.grobid.core.factory.GrobidFactory;
import org.grobid.core.factory.GrobidPoolingFactory;
import org.grobid.core.lang.Language;
import org.grobid.core.utilities.Consolidation;
import org.grobid.core.utilities.GrobidProperties;
import org.grobid.core.utilities.LanguageUtilities;
import org.grobid.core.utilities.Utilities;
Expand All @@ -47,8 +48,7 @@
import org.slf4j.LoggerFactory;

import java.io.*;
import java.util.ArrayList;
import java.util.List;
import java.util.*;

/**
* Class for managing the extraction of bibliographical information from PDF
Expand Down Expand Up @@ -196,16 +196,51 @@ public BiblioItem processRawReference(String reference, int consolidate) {
* @return the list of recognized bibliographical objects
*/
public List<BiblioItem> processRawReferences(List<String> references, int consolidate) throws Exception {
    // Always hand back a list (possibly empty) so callers never receive null.
    List<BiblioItem> finalResults = new ArrayList<BiblioItem>();
    if (references == null || references.isEmpty())
        return finalResults;

    // First stage: parse every raw reference string WITHOUT consolidation (mode 0).
    // Every input yields exactly one entry, in input order, so that result i
    // always corresponds to references.get(i) -- callers index the output positionally.
    List<BibDataSet> results = new ArrayList<BibDataSet>();
    for (String reference : references) {
        BiblioItem bib = parsers.getCitationParser().processing(reference, 0);
        BibDataSet bds = new BibDataSet();
        bds.setResBib(bib);
        bds.setRawBib(reference);
        results.add(bds);
    }

    // Second stage: consolidation is run once for the whole batch to take
    // advantage of parallel calls. The returned map is keyed by the index of
    // the reference in the batch.
    Map<Integer,BiblioItem> resConsolidation = null;
    if (consolidate != 0) {
        Consolidation consolidator = new Consolidation(cntManager);
        try {
            resConsolidation = consolidator.consolidate(results);
        } catch(Exception e) {
            throw new GrobidException(
                "An exception occured while running consolidation on bibliographical references.", e);
        }
        // NOTE(review): the consolidator is not closed here, as in the previous
        // version where the close() call was commented out -- confirm whether it should be.
    }

    // Final stage: build the output for ALL parsed references. Previously the
    // output was only filled when consolidation was requested and returned a
    // non-null map, so consolidate == 0 silently produced an empty list.
    for (int i = 0; i < results.size(); i++) {
        BiblioItem resCitation = results.get(i).getResBib();
        BiblioItem bibo = (resConsolidation == null) ? null : resConsolidation.get(Integer.valueOf(i));
        // Only merge when both the parsed item and a consolidated record exist.
        if (bibo != null && resCitation != null) {
            if (consolidate == 1)
                BiblioItem.correct(resCitation, bibo);
            else if (consolidate == 2)
                BiblioItem.injectDOI(resCitation, bibo);
        }
        finalResults.add(resCitation);
    }

    return finalResults;
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,9 @@ public class EvaluationDOIMatching {
private static final String path_nlm_title = "article-title/text()";
private static final String path_nlm_author = "person-group[@person-group-type=\"author\"]/name/surname/text()";
private static final String path_nlm_host = "source/text()";
private static final String path_nlm_first_page = "fpage/text()";
private static final String path_nlm_volume = "volume/text()";


// xpath expressions for tei
private static final String path_tei_ref = "//back/div/listBibl/biblStruct";
Expand Down Expand Up @@ -117,40 +120,52 @@ public boolean accept(File dir, String name) {
ObjectMapper mapper = new ObjectMapper();
for (File dir : refFiles) {
// get the PDF file in the directory

final File jsonFile = refFiles[0];

JsonNode rootNode = mapper.readTree(jsonFile);

Iterator<JsonNode> ite = rootNode.elements();

List<String> rawRefs = new ArrayList<String>();
List<String> dois = new ArrayList<String>();
List<String> pmids = new ArrayList<String>();
while (ite.hasNext()) {
if (nbRef > 1000)
break;
JsonNode entryNode = ite.next();

String rawRef = null;
JsonNode refNode = entryNode.findPath("reference");
if ((refNode != null) && (!refNode.isMissingNode())) {
rawRef = refNode.textValue();
}
rawRefs.add(rawRef);

String doi = null;
JsonNode doiNode = entryNode.findPath("doi");
if ((doiNode != null) && (!doiNode.isMissingNode())) {
doi = doiNode.textValue();
}
dois.add(doi);

String pmid = null;
JsonNode pmidNode = entryNode.findPath("pmid");
if ((pmidNode != null) && (!pmidNode.isMissingNode())) {
pmid = pmidNode.textValue();
}

System.out.println("\n\tDOI: " + doi);
System.out.println("\trawRef: " + rawRef);
pmids.add(pmid);
nbRef++;
}
// run Grobid reference parser on this raw string, todo: call by batch of n citations
try {
List<BiblioItem> biblios = engine.processRawReferences(rawRefs, 2);
for(int i=0; i<rawRefs.size(); i++) {
BiblioItem biblio = biblios.get(i);
String doi = dois.get(i);
String pmid = pmids.get(i);

// run Grobid reference parser on this raw string
try {
BiblioItem biblio = engine.processRawReference(rawRef, 2);
System.out.println("\n\tDOI: " + doi);
System.out.println("\trawRef: " + rawRefs.get(i));

if (biblio.getDOI() != null) {
nbDOIFound++;
Expand All @@ -163,12 +178,12 @@ public boolean accept(File dir, String name) {
System.out.println("!!!!!!!!!!!!! Mismatch DOI: " + doi + " / " + biblio.getDOI());
}
}
}
catch (Exception e) {
System.out.println("Error when processing: " + jsonFile.getPath());
e.printStackTrace();
}
}
catch (Exception e) {
System.out.println("Error when processing: " + jsonFile.getPath());
e.printStackTrace();
}
}

double processTime = ((double)System.currentTimeMillis() - start) / 1000.0;
Expand Down Expand Up @@ -289,7 +304,7 @@ public boolean accept(File dir, String name) {
DocumentBuilderFactory docFactory = DocumentBuilderFactory.newInstance();
docFactory.setValidating(false);

System.out.println("\n\nFile: " + pdfFile.getPath());
//System.out.println("\n\nFile: " + pdfFile.getPath());

try {
DocumentBuilder docBuilder = docFactory.newDocumentBuilder();
Expand All @@ -306,7 +321,7 @@ public InputSource resolveEntity(String publicId, String systemId) {
goldFile = nlmFile;
Document gold = docBuilder.parse(goldFile);

System.out.println("Reference DOIs in : " + goldFile.getPath());
//System.out.println("Reference DOIs in : " + goldFile.getPath());

// get the DOI or PMID of the bibliographical references via xpath
String path_doi = null;
Expand Down Expand Up @@ -356,10 +371,12 @@ public InputSource resolveEntity(String publicId, String systemId) {
try {
System.out.println(n + " - " + pdfFile.getPath());
List<BibDataSet> bibrefs = engine.processReferences(pdfFile, 0);

for(BibDataSet bib : bibrefs) {
String rawRef = bib.getRawBib();
// we remove a DOI possibly in the raw ref, as the whole exercise is about DOI
// matching
if (rawRef != null) {
rawRef = TextUtilities.DOIPattern.matcher(rawRef).replaceAll(" ");
// we need to align this raw ref bib string with a gold ref bib
for(BibRefAggregated goldReference : goldReferences) {
if ( (goldReference.getRawRef() == null) &&
Expand Down Expand Up @@ -391,6 +408,20 @@ public InputSource resolveEntity(String publicId, String systemId) {
if ((nodeList != null) && nodeList.getLength()>0)
host = nodeList.item(0).getNodeValue();

// first page
String firstPage = null;
nodeList = (NodeList) xp.compile(path_nlm_first_page).
evaluate(refNode, XPathConstants.NODESET);
if ((nodeList != null) && nodeList.getLength()>0)
firstPage = nodeList.item(0).getNodeValue();

// volume
String volume = null;
nodeList = (NodeList) xp.compile(path_nlm_volume).
evaluate(refNode, XPathConstants.NODESET);
if ((nodeList != null) && nodeList.getLength()>0)
volume = nodeList.item(0).getNodeValue();

//System.out.println(title + " " + author + " " + host);
if ( (title == null) && (author == null) && (host == null) ) {
// nlm might contain the raw string but usually not DOI or PMID
Expand All @@ -399,7 +430,9 @@ public InputSource resolveEntity(String publicId, String systemId) {
String titleSignature = this.getSignature(title);
String authorSignature = this.getSignature(author);
String hostSignature = this.getSignature(host);
int ind1 = -1, ind2 = -1, ind3 = -1;
String firstPageSignature = this.getSignature(firstPage);
String volumeSignature = this.getSignature(volume);
int ind1 = -1, ind2 = -1, ind3 = -1, ind4 =-1, ind5 =-1;
if (title != null) {
ind1 = rawRefSignature.indexOf(titleSignature);
}
Expand All @@ -409,6 +442,12 @@ public InputSource resolveEntity(String publicId, String systemId) {
if (host != null) {
ind3 = rawRefSignature.indexOf(hostSignature);
}
if (firstPage != null) {
ind4 = rawRefSignature.indexOf(firstPageSignature);
}
if (volume != null) {
ind5 = rawRefSignature.indexOf(volumeSignature);
}
// soft match for the title using Ratcliff Obershelp string distance
//double similarity = 0.0;
//Option<Object> similarityObject =
Expand All @@ -418,8 +457,7 @@ public InputSource resolveEntity(String publicId, String systemId) {

// intra-document matching
if ( (ind1 != -1) ||
(ind2 != -1 && ind3 != -1) ) {
//if ( (ind1 != -1) ) {
(ind2 != -1 && ind3 != -1 && (ind4 != -1 || ind5 != -1)) ) {
goldReference.setRawRef(rawRef);
p++;
continue;
Expand Down Expand Up @@ -605,8 +643,6 @@ public static void main(String[] args) {
if (action.equals("data")) {
EvaluationDOIMatching data = new EvaluationDOIMatching(inputPath);
data.buildEvaluationDataset();
String report = data.evaluation();
System.out.println(report);
} else if (action.equals("eval")) {
EvaluationDOIMatching eval = new EvaluationDOIMatching(inputPath);
String report = eval.evaluation();
Expand Down

0 comments on commit f0e5bf4

Please sign in to comment.