From 4e02dc71046026a295b342b96d7981bf34ccb611 Mon Sep 17 00:00:00 2001
From: Patrice Lopez
Date: Sat, 24 Nov 2018 12:43:44 +0100
Subject: [PATCH] complete consolidation

Former-commit-id: f0e5bf4072a1bc8ff890c64121da72d557a1362e
---
 build.gradle                                  | 10 ++-
 .../java/org/grobid/core/engines/Engine.java  | 55 +++++++++++---
 .../evaluation/EvaluationDOIMatching.java     | 76 ++++++++++++++-----
 3 files changed, 109 insertions(+), 32 deletions(-)

diff --git a/build.gradle b/build.gradle
index 23a5e6acb0..639a8e635c 100644
--- a/build.gradle
+++ b/build.gradle
@@ -389,13 +389,19 @@ project(":grobid-trainer") {
         jvmArgs '-Xmx3072m'
     }
 
-    task(EvaluationDOIMatching, dependsOn: 'classes', type: JavaExec, group: 'modelevaluation') {
+    task(PrepareDOIMatching, dependsOn: 'classes', type: JavaExec, group: 'modelevaluation') {
         main = 'org.grobid.trainer.evaluation.EvaluationDOIMatching'
         classpath = sourceSets.main.runtimeClasspath
         args 'data', getArg('p2t', '.')
         jvmArgs '-Xmx3072m'
     }
-//    jvmArgs '-Djava.net.useSystemProxies=true' is not working
+
+    task(EvaluationDOIMatching, dependsOn: 'classes', type: JavaExec, group: 'modelevaluation') {
+        main = 'org.grobid.trainer.evaluation.EvaluationDOIMatching'
+        classpath = sourceSets.main.runtimeClasspath
+        args 'eval', getArg('p2t', '.')
+        jvmArgs '-Xmx3072m'
+    }
 }
 
 // coveralls does not support mutimodule, therefore merging reports
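Note on the build change above: the former EvaluationDOIMatching task is split into a data-preparation step (PrepareDOIMatching, action 'data') and an evaluation step (EvaluationDOIMatching, action 'eval'), both driven by the same main class. Presumably they are run as ./gradlew PrepareDOIMatching -Pp2t=... and ./gradlew EvaluationDOIMatching -Pp2t=..., following the existing getArg('p2t', '.') convention. A minimal sketch of the equivalent direct invocation, assuming the main method takes the action followed by the corpus path as the Gradle args suggest (class name and path below are placeholders, not part of the patch):

    // Hypothetical driver, not part of the patch: mirrors the two Gradle tasks above.
    import org.grobid.trainer.evaluation.EvaluationDOIMatching;

    public class RunDOIMatchingEval {
        public static void main(String[] args) throws Exception {
            String corpus = "/path/to/doi-matching-corpus";  // placeholder path
            // PrepareDOIMatching: build the evaluation dataset ("data" action)
            EvaluationDOIMatching.main(new String[] { "data", corpus });
            // EvaluationDOIMatching: run the DOI matching evaluation ("eval" action)
            EvaluationDOIMatching.main(new String[] { "eval", corpus });
        }
    }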
diff --git a/grobid-core/src/main/java/org/grobid/core/engines/Engine.java b/grobid-core/src/main/java/org/grobid/core/engines/Engine.java
index 87565df5de..1dc8634a93 100755
--- a/grobid-core/src/main/java/org/grobid/core/engines/Engine.java
+++ b/grobid-core/src/main/java/org/grobid/core/engines/Engine.java
@@ -36,6 +36,7 @@
 import org.grobid.core.factory.GrobidFactory;
 import org.grobid.core.factory.GrobidPoolingFactory;
 import org.grobid.core.lang.Language;
+import org.grobid.core.utilities.Consolidation;
 import org.grobid.core.utilities.GrobidProperties;
 import org.grobid.core.utilities.LanguageUtilities;
 import org.grobid.core.utilities.Utilities;
@@ -47,8 +48,7 @@
 import org.slf4j.LoggerFactory;
 
 import java.io.*;
-import java.util.ArrayList;
-import java.util.List;
+import java.util.*;
 
 /**
  * Class for managing the extraction of bibliographical information from PDF
@@ -196,16 +196,51 @@ public BiblioItem processRawReference(String reference, int consolidate) {
      * @return the list of recognized bibliographical objects
      */
     public List<BiblioItem> processRawReferences(List<String> references, int consolidate) throws Exception {
-        if (references == null)
-            return null;
-        if (references.size() == 0)
-            return null;
-        List<BiblioItem> results = new ArrayList<>();
+        List<BibDataSet> results = new ArrayList<>();
+        List<BiblioItem> finalResults = new ArrayList<>();
+        if (references == null || references.size() == 0)
+            return finalResults;
         for (String reference : references) {
-            BiblioItem bit = parsers.getCitationParser().processing(reference, consolidate);
-            results.add(bit);
+            BiblioItem bib = parsers.getCitationParser().processing(reference, 0);
+            //if ((bib != null) && !bib.rejectAsReference())
+            {
+                BibDataSet bds = new BibDataSet();
+                bds.setResBib(bib);
+                bds.setRawBib(reference);
+                results.add(bds);
+            }
         }
-        return results;
+
+        if (results.size() == 0)
+            return finalResults;
+        // consolidation in a second stage to take advantage of parallel calls
+        if (consolidate != 0) {
+            Consolidation consolidator = new Consolidation(cntManager);
+            Map<Integer,BiblioItem> resConsolidation = null;
+            try {
+                resConsolidation = consolidator.consolidate(results);
+            } catch(Exception e) {
+                throw new GrobidException(
+                    "An exception occurred while running consolidation on bibliographical references.", e);
+            } finally {
+                //consolidator.close();
+            }
+            if (resConsolidation != null) {
+                for(int i=0; i<results.size(); i++) {
[...]
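For context, a minimal usage sketch of the revised Engine.processRawReferences (not part of the patch): each reference is now parsed without consolidation first, and a single batched consolidation pass is then applied to the whole list when a non-zero consolidation mode is requested. The snippet assumes GROBID has already been initialized (grobid-home configured); the reference strings are invented examples.

    import java.util.Arrays;
    import java.util.List;

    import org.grobid.core.data.BiblioItem;
    import org.grobid.core.engines.Engine;
    import org.grobid.core.factory.GrobidFactory;

    public class BatchRawReferenceDemo {
        public static void main(String[] args) throws Exception {
            // assumes GROBID properties/grobid-home are already set up
            Engine engine = GrobidFactory.getInstance().createEngine();
            List<String> rawRefs = Arrays.asList(
                "J. Smith, A study of citation matching, Journal of Examples, 12(3):45-67, 2010.",
                "A. Doe, Another worked example, Proceedings of Nowhere, 2015.");
            // a non-zero consolidate value triggers the batched consolidation stage added above
            List<BiblioItem> results = engine.processRawReferences(rawRefs, 2);
            for (BiblioItem item : results) {
                System.out.println(item.getTitle() + " -> " + item.getDOI());
            }
        }
    }

Running the consolidation once over the whole batch lets the consolidation service issue its requests in parallel, which is the point of the refactoring ("consolidation in a second stage to take advantage of parallel calls").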
diff --git a/.../evaluation/EvaluationDOIMatching.java b/.../evaluation/EvaluationDOIMatching.java
[...]
         Iterator<JsonNode> ite = rootNode.elements();
+
+        List<String> rawRefs = new ArrayList<>();
+        List<String> dois = new ArrayList<>();
+        List<String> pmids = new ArrayList<>();
         while (ite.hasNext()) {
+            if (nbRef > 1000)
+                break;
             JsonNode entryNode = ite.next();
 
             String rawRef = null;
@@ -131,26 +139,33 @@ public boolean accept(File dir, String name) {
             if ((refNode != null) && (!refNode.isMissingNode())) {
                 rawRef = refNode.textValue();
             }
+            rawRefs.add(rawRef);
 
             String doi = null;
             JsonNode doiNode = entryNode.findPath("doi");
             if ((doiNode != null) && (!doiNode.isMissingNode())) {
                 doi = doiNode.textValue();
             }
+            dois.add(doi);
 
             String pmid = null;
             JsonNode pmidNode = entryNode.findPath("pmid");
             if ((pmidNode != null) && (!pmidNode.isMissingNode())) {
                 pmid = pmidNode.textValue();
             }
-
-            System.out.println("\n\tDOI: " + doi);
-            System.out.println("\trawRef: " + rawRef);
+            pmids.add(pmid);
 
             nbRef++;
+        }
 
         // run Grobid reference parser on this raw string, todo: call by batch of n citations
+        try {
+            List<BiblioItem> biblios = engine.processRawReferences(rawRefs, 2);
+            for(int i=0; i<biblios.size(); i++) {
[...]
         List<BibDataSet> bibrefs = engine.processReferences(pdfFile, 0);
-
         for(BibDataSet bib : bibrefs) {
             String rawRef = bib.getRawBib();
+            // we remove a DOI possibly in the raw ref, as the whole exercise is about DOI
+            // matching
             if (rawRef != null) {
+                rawRef = TextUtilities.DOIPattern.matcher(rawRef).replaceAll(" ");
                 // we need to align this raw ref bib string with a gold ref bib
                 for(BibRefAggregated goldReference : goldReferences) {
                     if ( (goldReference.getRawRef() == null) &&
@@ -391,6 +408,20 @@ public InputSource resolveEntity(String publicId, String systemId) {
             if ((nodeList != null) && nodeList.getLength()>0)
                 host = nodeList.item(0).getNodeValue();
 
+            // first page
+            String firstPage = null;
+            nodeList = (NodeList) xp.compile(path_nlm_first_page).
+                evaluate(refNode, XPathConstants.NODESET);
+            if ((nodeList != null) && nodeList.getLength()>0)
+                firstPage = nodeList.item(0).getNodeValue();
+
+            // volume
+            String volume = null;
+            nodeList = (NodeList) xp.compile(path_nlm_volume).
+                evaluate(refNode, XPathConstants.NODESET);
+            if ((nodeList != null) && nodeList.getLength()>0)
+                volume = nodeList.item(0).getNodeValue();
+
             //System.out.println(title + " " + author + " " + host);
             if ( (title == null) && (author == null) && (host == null) ) {
                 // nlm might contain the raw string but usually not DOI or PMID
@@ -399,7 +430,9 @@
             String titleSignature = this.getSignature(title);
             String authorSignature = this.getSignature(author);
             String hostSignature = this.getSignature(host);
-            int ind1 = -1, ind2 = -1, ind3 = -1;
+            String firstPageSignature = this.getSignature(firstPage);
+            String volumeSignature = this.getSignature(volume);
+            int ind1 = -1, ind2 = -1, ind3 = -1, ind4 = -1, ind5 = -1;
             if (title != null) {
                 ind1 = rawRefSignature.indexOf(titleSignature);
             }
@@ -409,6 +442,12 @@
             if (host != null) {
                 ind3 = rawRefSignature.indexOf(hostSignature);
             }
+            if (firstPage != null) {
+                ind4 = rawRefSignature.indexOf(firstPageSignature);
+            }
+            if (volume != null) {
+                ind5 = rawRefSignature.indexOf(volumeSignature);
+            }
             // soft match for the title using Ratcliff Obershelp string distance
             //double similarity = 0.0;
             //Option similarityObject =
@@ -418,8 +457,7 @@
             // intra-document matching
             if ( (ind1 != -1) ||
-                 (ind2 != -1 && ind3 != -1) ) {
-            //if ( (ind1 != -1) ) {
+                 (ind2 != -1 && ind3 != -1 && (ind4 != -1 || ind5 != -1)) ) {
                 goldReference.setRawRef(rawRef);
                 p++;
                 continue;
@@ -605,8 +643,6 @@ public static void main(String[] args) {
         if (action.equals("data")) {
             EvaluationDOIMatching data = new EvaluationDOIMatching(inputPath);
             data.buildEvaluationDataset();
-            String report = data.evaluation();
-            System.out.println(report);
         } else if (action.equals("eval")) {
             EvaluationDOIMatching eval = new EvaluationDOIMatching(inputPath);
             String report = eval.evaluation();
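The last hunks strengthen the heuristic that aligns a raw reference extracted from the PDF with a gold NLM entry: a match on the title signature remains sufficient, but the author + host combination now also requires the first page or the volume to occur in the raw string. A simplified, self-contained illustration of that matching logic (the signature normalization below is an assumption made for the example, not the actual getSignature implementation):

    import java.util.Locale;

    public class SignatureMatchSketch {

        // assumed normalization: lower-case and keep only letters and digits
        static String signature(String s) {
            return (s == null) ? null : s.toLowerCase(Locale.US).replaceAll("[^a-z0-9]", "");
        }

        static boolean contains(String rawSignature, String field) {
            return field != null && rawSignature.contains(signature(field));
        }

        // mirrors the patched condition: title alone, or author + host plus first page or volume
        static boolean aligned(String rawRef, String title, String author,
                               String host, String firstPage, String volume) {
            String raw = signature(rawRef);
            if (raw == null) return false;
            return contains(raw, title) ||
                   (contains(raw, author) && contains(raw, host) &&
                    (contains(raw, firstPage) || contains(raw, volume)));
        }

        public static void main(String[] args) {
            String rawRef = "Smith J. A study of X. Journal of Y 12(3): 45-67, 2010.";
            // no title available: author + host alone would be a loose match,
            // the added page/volume requirement makes the alignment stricter
            System.out.println(aligned(rawRef, null, "Smith J", "Journal of Y", "45", "12"));
        }
    }

Requiring the first page or the volume in addition to author and host reduces false alignments between references that share the same authors and venue, which matters here because a wrongly aligned raw reference would be scored against the wrong gold DOI.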