diff --git a/build.gradle b/build.gradle index d5174df060..af5556b4c0 100644 --- a/build.gradle +++ b/build.gradle @@ -55,10 +55,10 @@ subprojects { } } - configurations { + /*configurations { all*.exclude group: 'org.slf4j', module: "slf4j-log4j12" all*.exclude group: 'log4j', module: "log4j" - } + }*/ ext { // treating them separately, these jars will be flattened into grobid-core.jar on installing, @@ -199,6 +199,7 @@ project("grobid-core") { compile "org.apache.lucene:lucene-analyzers-common:4.5.1" compile 'javax.xml.bind:jaxb-api:2.3.0' compile 'black.ninia:jep:3.8.2' + compile 'org.slf4j:slf4j-log4j12:1.7.25' shadedLib "org.apache.lucene:lucene-analyzers-common:4.5.1" @@ -280,6 +281,8 @@ project(":grobid-service") { configurations { all*.exclude group: 'org.slf4j', module: "slf4j-jdk14" + all*.exclude group: 'org.slf4j', module: "slf4j-log4j12" + all*.exclude group: 'log4j', module: "log4j" } // tasks.distZip.enabled = false @@ -372,8 +375,8 @@ project(":grobid-trainer") { } // run like: - // gradle PubMedCentralEval -Pp2t=/path/to/goldenSet - // gradle PubMedCentralEval -Pp2t=/path/to/goldenSet -Prun=1 -PfileRatio=0.1 + // ./gradlew PubMedCentralEval -Pp2t=/path/to/goldenSet + // ./gradlew PubMedCentralEval -Pp2t=/path/to/goldenSet -Prun=1 -PfileRatio=0.1 // ./gradlew PrepareDOIMatching -Pp2t=ABS_PATH_TO_PMC/PMC_sample_1943 // ./gradlew EvaluateDOIMatching -Pp2t=ABS_PATH_TO_PMC/PMC_sample_1943 task(PubMedCentralEval, dependsOn: 'classes', type: JavaExec, group: 'modelevaluation') { diff --git a/grobid-core/src/main/java/org/grobid/core/data/BiblioItem.java b/grobid-core/src/main/java/org/grobid/core/data/BiblioItem.java index 8d75cd41cf..b537bd28e4 100755 --- a/grobid-core/src/main/java/org/grobid/core/data/BiblioItem.java +++ b/grobid-core/src/main/java/org/grobid/core/data/BiblioItem.java @@ -3275,7 +3275,7 @@ public void attachAffiliations() { aff.setFailAffiliation(false); } } else if (hasMarker) { - // we get the marker for each affiliation and try to 
find the related author in the + // we get the marker for each affiliation and try to find the related author in the // original author field for (Affiliation aff : fullAffiliations) { if (aff.getMarker() != null) { @@ -3391,16 +3391,18 @@ public void attachAffiliations() { } } } - } else if (nbAuthors == nbAffiliations) { + } /*else if (nbAuthors == nbAffiliations) { // risky heuristics, we distribute in this case one affiliation per author // preserving author // sometimes 2 affiliations belong both to 2 authors, for these case, the layout // positioning should be studied for (int p = 0; p < nbAuthors; p++) { fullAuthors.get(p).addAffiliation(fullAffiliations.get(p)); + System.out.println("attachment: " + p); + System.out.println(fullAuthors.get(p)); fullAffiliations.get(p).setFailAffiliation(false); } - } + }*/ } @@ -3412,7 +3414,6 @@ public String toTEIAuthorBlock(int nbTag, boolean withCoordinates) { int nbAuthors = 0; int nbAffiliations = 0; int nbAddresses = 0; - // uncomment below when collaboration will be concretely added to headers /* if ( (collaboration != null) && @@ -3497,7 +3498,6 @@ public String toTEIAuthorBlock(int nbTag, boolean withCoordinates) { } if (author.getAffiliations() != null) { - for (Affiliation aff : author.getAffiliations()) { TextUtilities.appendN(tei, '\t', nbTag + 1); tei.append(" processingReflow(List affiliationBlocks, return resultBuilder(res, tokenizations, false); // normally use pre-label because it is a reflow } - - static class DebugTahher { - private String str = ""; - - public void add(String s) { - str += s; - } - - public void clear() { - str = ""; - } - - String[] split; - - - public boolean parse() { - System.out.println("Parsing:\n" + str + "\n------------------"); - split = str.split("\n"); - - return true; - } - - public int size() { - return split.length; - } - - - } - private String runReflow(List affiliationBlocks, List tokenizations) { -// StringBuilder res = new StringBuilder(); -// DebugTahher tagger = new 
DebugTahher(); try { List> placesPositions = new ArrayList>(); placesPositions.add(lexicon.tokenPositionsCityNames(tokenizations)); @@ -217,10 +189,7 @@ private String runReflow(List affiliationBlocks, return null; } - String res = label(header); - res = label(res); - - return res; + return label(header); } catch (Exception e) { throw new GrobidException("An exception occured while running Grobid.", e); } @@ -276,7 +245,10 @@ private ArrayList resultBuilder(String result, hasAddress = false; continue; } - StringTokenizer st3 = new StringTokenizer(line, "\t"); + String delimiter = "\t"; + if (line.indexOf(delimiter) == -1) + delimiter = " "; + StringTokenizer st3 = new StringTokenizer(line, delimiter); int ll = st3.countTokens(); int i = 0; String s1 = null; // predicted label diff --git a/grobid-core/src/main/java/org/grobid/core/engines/HeaderParser.java b/grobid-core/src/main/java/org/grobid/core/engines/HeaderParser.java index 79b4c161f7..715906b8f2 100755 --- a/grobid-core/src/main/java/org/grobid/core/engines/HeaderParser.java +++ b/grobid-core/src/main/java/org/grobid/core/engines/HeaderParser.java @@ -6,6 +6,7 @@ import org.grobid.core.GrobidModels; import org.grobid.core.data.BiblioItem; import org.grobid.core.data.Date; +import org.grobid.core.data.Affiliation; import org.grobid.core.data.Keyword; import org.grobid.core.data.Person; import org.grobid.core.document.Document; @@ -123,7 +124,6 @@ public String processingHeaderBlock(int consolidate, Document doc, BiblioItem re header = doc.getHeaderFeatured(false, true); }*/ List tokenizations = doc.getTokenizationsHeader(); -//System.out.println(tokenizations.toString()); if ((header != null) && (header.trim().length() > 0)) { String res = label(header); @@ -199,8 +199,8 @@ public String processingHeaderBlock(int consolidate, Document doc, BiblioItem re } } - resHeader.setFullAffiliations( - parsers.getAffiliationAddressParser().processReflow(res, tokenizations)); + List affiliations = 
parsers.getAffiliationAddressParser().processReflow(res, tokenizations); + resHeader.setFullAffiliations(affiliations); resHeader.attachEmails(); boolean attached = false; if (fragmentedAuthors && !hasMarker) { @@ -264,10 +264,7 @@ public String processingHeaderBlock(int consolidate, Document doc, BiblioItem re } } - //if (consolidate) - { - resHeader = consolidateHeader(resHeader, consolidate); - } + resHeader = consolidateHeader(resHeader, consolidate); // normalization of dates if (resHeader != null) { @@ -478,10 +475,7 @@ public String processingHeaderSection(int consolidate, Document doc, BiblioItem } } - //if (consolidate) - { - resHeader = consolidateHeader(resHeader, consolidate); - } + resHeader = consolidateHeader(resHeader, consolidate); // normalization of dates if (resHeader != null) { @@ -1076,7 +1070,10 @@ public BiblioItem resultExtraction(String result, boolean intro, List localFeatures = new ArrayList(); int i = 0; diff --git a/grobid-core/src/main/java/org/grobid/core/engines/tagging/TaggerFactory.java b/grobid-core/src/main/java/org/grobid/core/engines/tagging/TaggerFactory.java index f659835a90..46928d5132 100644 --- a/grobid-core/src/main/java/org/grobid/core/engines/tagging/TaggerFactory.java +++ b/grobid-core/src/main/java/org/grobid/core/engines/tagging/TaggerFactory.java @@ -26,7 +26,12 @@ public static synchronized GenericTagger getTagger(GrobidModel model) { t = new WapitiTagger(model); break; case DELFT: - t = new DeLFTTagger(model); + // if model is fulltext or segmentation we use currently WAPITI as fallback because they + // are not covered by DeLFT for the moment + if (model.getModelName().equals("fulltext") || model.getModelName().equals("segmentation")) + t = new WapitiTagger(model); + else + t = new DeLFTTagger(model); break; default: throw new IllegalStateException("Unsupported Grobid sequence labelling engine: " + GrobidProperties.getGrobidCRFEngine()); diff --git a/grobid-core/src/main/java/org/grobid/core/jni/DeLFTModel.java 
b/grobid-core/src/main/java/org/grobid/core/jni/DeLFTModel.java index d13258578b..6015cb5bec 100644 --- a/grobid-core/src/main/java/org/grobid/core/jni/DeLFTModel.java +++ b/grobid-core/src/main/java/org/grobid/core/jni/DeLFTModel.java @@ -176,9 +176,14 @@ public void run() { jep.eval("print(len(x_train), 'train sequences')"); jep.eval("print(len(x_valid), 'validation sequences')"); + String useELMo = "False"; + if (GrobidProperties.getInstance().useELMo()) { + useELMo = "True"; + } + // init model to be trained jep.eval("model = sequenceLabelling.Sequence('"+this.modelName+ - "', max_epoch=100, recurrent_dropout=0.50, embeddings_name='glove-840B', use_ELMo=False)"); + "', max_epoch=100, recurrent_dropout=0.50, embeddings_name='glove-840B', use_ELMo="+useELMo+")"); // actual training //start_time = time.time() @@ -212,12 +217,19 @@ public void run() { public static void train(String modelName, File trainingData, File outputModel) { try { LOGGER.info("Train DeLFT model " + modelName + "..."); - ProcessBuilder pb = new ProcessBuilder("python3", + List command = Arrays.asList("python3", "grobidTagger.py", modelName, "train", "--input", trainingData.getAbsolutePath(), "--output", GrobidProperties.getInstance().getModelPath().getAbsolutePath()); + if (GrobidProperties.getInstance().useELMo()) { + // Arrays.asList returns a fixed-size list, so copy it before appending the optional flag + command = new java.util.ArrayList<>(command); + command.add("--use-ELMo"); + } + + ProcessBuilder pb = new ProcessBuilder(command); File delftPath = new File(GrobidProperties.getInstance().getDeLFTFilePath()); pb.directory(delftPath); Process process = pb.start(); diff --git a/grobid-core/src/main/java/org/grobid/core/jni/JEPThreadPool.java b/grobid-core/src/main/java/org/grobid/core/jni/JEPThreadPool.java index 15872edf3d..3ea990857e 100644 --- a/grobid-core/src/main/java/org/grobid/core/jni/JEPThreadPool.java +++ b/grobid-core/src/main/java/org/grobid/core/jni/JEPThreadPool.java @@ -75,6 +75,7 @@ public Jep getJEPInstance() { throw new GrobidResourceException("DeLFT installation path is not a directory"); } 
config.addIncludePaths(delftPath.getAbsolutePath()); + config.setClassLoader(Thread.currentThread().getContextClassLoader()); //System.out.println("jep instance thread: " + Thread.currentThread().getId()); Jep jep = new Jep(config); jepInstances.put(Thread.currentThread().getId(), jep); diff --git a/grobid-core/src/main/java/org/grobid/core/main/GrobidConstants.java b/grobid-core/src/main/java/org/grobid/core/main/GrobidConstants.java deleted file mode 100755 index 2a0d8f8413..0000000000 --- a/grobid-core/src/main/java/org/grobid/core/main/GrobidConstants.java +++ /dev/null @@ -1,13 +0,0 @@ -package org.grobid.core.main; - -/** - * @author Slava - */ -public class GrobidConstants { - //a name of a native CRF++ library without an extension - public static final String CRFPP_NATIVE_LIB_NAME = "libcrfpp"; - public static final String WAPITI_NATIVE_LIB_NAME = "libwapiti"; - public static final String DELFT_NATIVE_LIB_NAME = "libjep"; - - public static final String TEST_RESOURCES_PATH = "./src/test/resources/test"; -} diff --git a/grobid-core/src/main/java/org/grobid/core/main/LibraryLoader.java b/grobid-core/src/main/java/org/grobid/core/main/LibraryLoader.java index 4aa405ae86..0d831a67cc 100755 --- a/grobid-core/src/main/java/org/grobid/core/main/LibraryLoader.java +++ b/grobid-core/src/main/java/org/grobid/core/main/LibraryLoader.java @@ -24,6 +24,11 @@ public class LibraryLoader { private static Logger LOGGER = LoggerFactory.getLogger(LibraryLoader.class); + //a name of a native CRF++ library without an extension + public static final String CRFPP_NATIVE_LIB_NAME = "libcrfpp"; + public static final String WAPITI_NATIVE_LIB_NAME = "libwapiti"; + public static final String DELFT_NATIVE_LIB_NAME = "libjep"; + private static boolean loaded = false; // private static boolean isContextMocked = false; @@ -33,6 +38,13 @@ public static void load() { LOGGER.info("Loading external native sequence labelling library"); // mockContextIfNotSet(); LOGGER.debug(getLibraryFolder()); 
+ + if (GrobidProperties.getGrobidCRFEngine() != GrobidCRFEngine.CRFPP && + GrobidProperties.getGrobidCRFEngine() != GrobidCRFEngine.WAPITI && + GrobidProperties.getGrobidCRFEngine() != GrobidCRFEngine.DELFT) { + throw new IllegalStateException("Unsupported sequence labelling engine: " + GrobidProperties.getGrobidCRFEngine()); + } + File libraryFolder = new File(getLibraryFolder()); if (!libraryFolder.exists() || !libraryFolder.isDirectory()) { LOGGER.error("Unable to find a native sequence labelling library: Folder " @@ -46,17 +58,17 @@ public static void load() { File[] files = libraryFolder.listFiles(new FileFilter() { public boolean accept(File file) { return file.getName().toLowerCase() - .startsWith(GrobidConstants.CRFPP_NATIVE_LIB_NAME); + .startsWith(CRFPP_NATIVE_LIB_NAME); } }); if (files.length == 0) { LOGGER.error("Unable to find a native CRF++ library: No files starting with " - + GrobidConstants.CRFPP_NATIVE_LIB_NAME + + CRFPP_NATIVE_LIB_NAME + " are in folder " + libraryFolder); throw new RuntimeException( "Unable to find a native CRF++ library: No files starting with " - + GrobidConstants.CRFPP_NATIVE_LIB_NAME + + CRFPP_NATIVE_LIB_NAME + " are in folder " + libraryFolder); } @@ -81,11 +93,16 @@ public boolean accept(File file) { + libPath, e); } - } else if (GrobidProperties.getGrobidCRFEngine() == GrobidCRFEngine.WAPITI) { + } + + if (GrobidProperties.getGrobidCRFEngine() == GrobidCRFEngine.WAPITI || + GrobidProperties.getGrobidCRFEngine() == GrobidCRFEngine.DELFT) { + // note: if DeLFT is used, we still make Wapiti available for models not existing in DeLFT (currently segmentation and + // fulltext) File[] wapitiLibFiles = libraryFolder.listFiles(new FilenameFilter() { @Override public boolean accept(File dir, String name) { - return name.startsWith(GrobidConstants.WAPITI_NATIVE_LIB_NAME); + return name.startsWith(WAPITI_NATIVE_LIB_NAME); } }); @@ -93,46 +110,49 @@ public boolean accept(File dir, String name) { LOGGER.info("No wapiti library in the 
grobid home folder"); } else { LOGGER.info("Loading Wapiti native library..."); - System.load(wapitiLibFiles[0].getAbsolutePath()); - } - - } else if (GrobidProperties.getGrobidCRFEngine() == GrobidCRFEngine.DELFT) { - File[] delftLibFiles = libraryFolder.listFiles(new FilenameFilter() { - @Override - public boolean accept(File dir, String name) { - return name.startsWith(GrobidConstants.DELFT_NATIVE_LIB_NAME); + if (GrobidProperties.getGrobidCRFEngine() == GrobidCRFEngine.DELFT) { + // if DeLFT will be used, we must not load libstdc++, it would create a conflict with tensorflow libstdc++ version + // so we temporary rename the lib so that it is not loaded in this case + // note that we know that, in this case, the local lib can be ignored because as DeFLT and tensorflow are installed + // we are sure that a compatible libstdc++ lib is installed on the system and can be dynamically loaded + + String libstdcppPath = libraryFolder.getAbsolutePath() + File.separator + "libstdc++.so.6"; + File libstdcppFile = new File(libstdcppPath); + if (libstdcppFile.exists()) { + File libstdcppFileNew = new File(libstdcppPath+".new"); + libstdcppFile.renameTo(libstdcppFileNew); + } } - }); - - if (delftLibFiles.length == 0) { - LOGGER.info("No JEP library in the grobid home folder, this is required for using DeLFT"); - } else { - LOGGER.info("Loading JEP native library for DeLFT... 
" + delftLibFiles[0].getAbsolutePath()); - // actual loading will be made at JEP initialization, so we just need to add the path in the - // java.library.path (JEP will anyway try to load from java.library.path, so explicit file - // loading here will not help) try { - addLibraryPath(delftLibFiles[0].getParent()); - } catch (Exception e) { - LOGGER.info("Loading JEP native library for DeLFT failed", e); + System.load(wapitiLibFiles[0].getAbsolutePath()); + } finally { + if (GrobidProperties.getGrobidCRFEngine() == GrobidCRFEngine.DELFT) { + // restore libstdc++ + String libstdcppPathNew = libraryFolder.getAbsolutePath() + File.separator + "libstdc++.so.6.new"; + File libstdcppFileNew = new File(libstdcppPathNew); + if (libstdcppFileNew.exists()) { + File libstdcppFile = new File(libraryFolder.getAbsolutePath() + File.separator + "libstdc++.so.6"); + libstdcppFileNew.renameTo(libstdcppFile); + } + } } - //System.load(delftLibFiles[0].getAbsolutePath()); } + } - } else { - throw new IllegalStateException("Unsupported sequence labelling engine: " + GrobidProperties.getGrobidCRFEngine()); - } + if (GrobidProperties.getGrobidCRFEngine() == GrobidCRFEngine.DELFT) { + LOGGER.info("Loading JEP native library for DeLFT... " + libraryFolder.getAbsolutePath()); + // actual loading will be made at JEP initialization, so we just need to add the path in the + // java.library.path (JEP will anyway try to load from java.library.path, so explicit file + // loading here will not help) + try { + addLibraryPath(libraryFolder.getAbsolutePath()); + } catch (Exception e) { + LOGGER.info("Loading JEP native library for DeLFT failed", e); + } + } + + loaded = true; - -// if (isContextMocked) { -// try { -// MockContext.destroyInitialContext(); -// } catch (Exception exp) { -// LOGGER.error("Could not unmock the context." 
+ exp); -// throw new GrobidException("Could not unmock the context.", exp); -// } -// isContextMocked = false; -// } LOGGER.info("Native library for sequence labelling loaded"); } } diff --git a/grobid-core/src/main/java/org/grobid/core/utilities/GrobidProperties.java b/grobid-core/src/main/java/org/grobid/core/utilities/GrobidProperties.java index 9d63b02391..8c39184be5 100755 --- a/grobid-core/src/main/java/org/grobid/core/utilities/GrobidProperties.java +++ b/grobid-core/src/main/java/org/grobid/core/utilities/GrobidProperties.java @@ -435,6 +435,15 @@ public static String getDeLFTFilePath() { return pathFile.getAbsolutePath(); } + /** + * Reads the PROP_GROBID_DELFT_ELMO property. + * @return true only when the value is exactly "true"; any other value, or a null value, yields false + */ + public static boolean useELMo() { + String rawValue = getPropertyValue(GrobidPropertyKeys.PROP_GROBID_DELFT_ELMO); + return "true".equals(rawValue); + } + /** * Returns the host for a proxy connection, given in the grobid-property * file. @@ -602,9 +611,13 @@ public static GrobidCRFEngine getGrobidCRFEngine() { } public static File getModelPath(final GrobidModel model) { + String extension = grobidCRFEngine.getExt(); + if (GrobidProperties.getGrobidCRFEngine() == GrobidCRFEngine.DELFT && + (model.getModelName().equals("fulltext") || model.getModelName().equals("segmentation"))) + extension = "wapiti"; return new File(get_GROBID_HOME_PATH(), FOLDER_NAME_MODELS + File.separator + model.getFolderName() + File.separator - + FILE_NAME_MODEL + "." + grobidCRFEngine.getExt()); + + FILE_NAME_MODEL + "." 
+ extension); } public static File getModelPath() { diff --git a/grobid-core/src/main/java/org/grobid/core/utilities/GrobidPropertyKeys.java b/grobid-core/src/main/java/org/grobid/core/utilities/GrobidPropertyKeys.java index 18054a30fa..34507f9334 100755 --- a/grobid-core/src/main/java/org/grobid/core/utilities/GrobidPropertyKeys.java +++ b/grobid-core/src/main/java/org/grobid/core/utilities/GrobidPropertyKeys.java @@ -18,6 +18,7 @@ public interface GrobidPropertyKeys { String PROP_GROBID_CRF_ENGINE = "grobid.crf.engine"; String PROP_GROBID_DELFT_PATH = "grobid.delft.install"; + String PROP_GROBID_DELFT_ELMO = "grobid.delft.useELMo"; String PROP_USE_LANG_ID = "grobid.use_language_id"; String PROP_LANG_DETECTOR_FACTORY = "grobid.language_detector_factory"; diff --git a/grobid-core/src/test/java/org/grobid/core/test/TestFullTextParser.java b/grobid-core/src/test/java/org/grobid/core/test/TestFullTextParser.java index 32317b6c36..2273ba70ec 100755 --- a/grobid-core/src/test/java/org/grobid/core/test/TestFullTextParser.java +++ b/grobid-core/src/test/java/org/grobid/core/test/TestFullTextParser.java @@ -11,7 +11,6 @@ import org.grobid.core.engines.label.TaggingLabel; import org.grobid.core.factory.GrobidFactory; import org.grobid.core.layout.Block; -import org.grobid.core.main.GrobidConstants; import org.grobid.core.utilities.GrobidProperties; import org.junit.*; diff --git a/grobid-core/src/test/java/org/grobid/core/test/TestHeaderParser.java b/grobid-core/src/test/java/org/grobid/core/test/TestHeaderParser.java index f585936070..4f02ba7109 100755 --- a/grobid-core/src/test/java/org/grobid/core/test/TestHeaderParser.java +++ b/grobid-core/src/test/java/org/grobid/core/test/TestHeaderParser.java @@ -2,7 +2,6 @@ import org.grobid.core.data.BiblioItem; import org.grobid.core.factory.GrobidFactory; -import org.grobid.core.main.GrobidConstants; import org.grobid.core.utilities.GrobidProperties; import org.junit.AfterClass; import org.junit.Test; @@ -21,6 +20,7 @@ public 
class TestHeaderParser extends EngineTest { private String testPath = null; private String newTrainingPath = null; + public static final String TEST_RESOURCES_PATH = "./src/test/resources/test"; @AfterClass public static void tearDown(){ @@ -28,7 +28,7 @@ public static void tearDown(){ } private void getTestResourcePath() { - testPath = GrobidConstants.TEST_RESOURCES_PATH; + testPath = TEST_RESOURCES_PATH; GrobidProperties.getInstance(); newTrainingPath = GrobidProperties.getTempPath().getAbsolutePath(); } diff --git a/grobid-core/src/test/java/org/grobid/core/test/TestReferencesParser.java b/grobid-core/src/test/java/org/grobid/core/test/TestReferencesParser.java index 092c1f78ac..e2378b8b84 100755 --- a/grobid-core/src/test/java/org/grobid/core/test/TestReferencesParser.java +++ b/grobid-core/src/test/java/org/grobid/core/test/TestReferencesParser.java @@ -1,7 +1,6 @@ package org.grobid.core.test; import org.grobid.core.data.BibDataSet; -import org.grobid.core.main.GrobidConstants; import org.junit.Test; import java.io.File; @@ -16,13 +15,11 @@ */ public class TestReferencesParser extends EngineTest { - private String getTestResourcePath() { - return GrobidConstants.TEST_RESOURCES_PATH; - } + public static final String TEST_RESOURCES_PATH = "./src/test/resources/test"; //@Test public void testReferences() throws Exception { - String testPath = getTestResourcePath(); + String testPath = TEST_RESOURCES_PATH; String pdfPath = testPath + File.separator + "Wang-paperAVE2008.pdf"; List resRefs = engine.processReferences(new File(pdfPath), 1); diff --git a/grobid-home/config/grobid.properties b/grobid-home/config/grobid.properties index 1ed6d275ce..f280b25c0e 100755 --- a/grobid-home/config/grobid.properties +++ b/grobid-home/config/grobid.properties @@ -25,10 +25,11 @@ grobid.proxy_port=null #------------------------------------------------------ #-------------------- runtime ------------------ -#grobid.crf.engine=wapiti -grobid.crf.engine=delft 
+grobid.crf.engine=wapiti +#grobid.crf.engine=delft #grobid.crf.engine=crfpp grobid.delft.install=../delft +grobid.delft.useELMo=false grobid.pdf.blocks.max=100000 grobid.pdf.tokens.max=1000000 diff --git a/grobid-home/models/affiliation-address/config.json b/grobid-home/models/affiliation-address/config.json index 74ef48978b..f97a0e0457 100644 --- a/grobid-home/models/affiliation-address/config.json +++ b/grobid-home/models/affiliation-address/config.json @@ -1,19 +1,20 @@ { - "case_vocab_size": 8, - "word_embedding_size": 300, "fold_number": 1, - "embeddings_name": "glove-840B", - "num_char_lstm_units": 25, - "max_char_length": 30, - "recurrent_dropout": 0.25, + "case_vocab_size": 8, + "use_ELMo": false, + "char_vocab_size": 152, "model_name": "affiliation-address", + "dropout": 0.5, + "num_char_lstm_units": 25, "num_word_lstm_units": 100, - "char_embedding_size": 25, - "char_vocab_size": 149, - "batch_size": 20, "use_char_feature": true, + "word_embedding_size": 300, "model_type": "BidLSTM_CRF", + "embeddings_name": "glove-840B", + "use_crf": true, + "max_char_length": 30, + "recurrent_dropout": 0.5, + "char_embedding_size": 25, "case_embedding_size": 5, - "dropout": 0.5, - "use_crf": true -} + "batch_size": 20 +} \ No newline at end of file diff --git a/grobid-home/models/affiliation-address/model_weights.hdf5 b/grobid-home/models/affiliation-address/model_weights.hdf5 index 8ac313ea8f..be83e8bdd1 100644 Binary files a/grobid-home/models/affiliation-address/model_weights.hdf5 and b/grobid-home/models/affiliation-address/model_weights.hdf5 differ diff --git a/grobid-home/models/affiliation-address/preprocessor.pkl b/grobid-home/models/affiliation-address/preprocessor.pkl index dc276a53c4..9c1b1dc8b7 100644 Binary files a/grobid-home/models/affiliation-address/preprocessor.pkl and b/grobid-home/models/affiliation-address/preprocessor.pkl differ diff --git a/grobid-home/models/figure/config.json b/grobid-home/models/figure/config.json index 74d2434b80..47dc640a5c 
100644 --- a/grobid-home/models/figure/config.json +++ b/grobid-home/models/figure/config.json @@ -1,20 +1,20 @@ { - "use_char_feature": true, - "case_embedding_size": 5, - "batch_size": 20, - "fold_number": 1, - "char_vocab_size": 103, + "use_ELMo": false, + "num_word_lstm_units": 100, + "embeddings_name": "glove-840B", "use_crf": true, + "batch_size": 20, "word_embedding_size": 300, - "recurrent_dropout": 0.5, - "max_char_length": 30, - "num_word_lstm_units": 100, "char_embedding_size": 25, - "embeddings_name": "glove-840B", - "num_char_lstm_units": 25, - "model_type": "BidLSTM_CRF", - "use_ELMo": false, - "dropout": 0.5, "case_vocab_size": 8, - "model_name": "figure" -} \ No newline at end of file + "char_vocab_size": 103, + "dropout": 0.5, + "model_type": "BidLSTM_CRF", + "model_name": "figure", + "recurrent_dropout": 0.5, + "fold_number": 1, + "num_char_lstm_units": 25, + "case_embedding_size": 5, + "use_char_feature": true, + "max_char_length": 30 +} diff --git a/grobid-home/models/figure/model_weights.hdf5 b/grobid-home/models/figure/model_weights.hdf5 index 653b1f8d07..c22b578c0f 100644 Binary files a/grobid-home/models/figure/model_weights.hdf5 and b/grobid-home/models/figure/model_weights.hdf5 differ diff --git a/grobid-home/models/figure/preprocessor.pkl b/grobid-home/models/figure/preprocessor.pkl index 12f1c2e506..6ff8e71584 100644 Binary files a/grobid-home/models/figure/preprocessor.pkl and b/grobid-home/models/figure/preprocessor.pkl differ diff --git a/grobid-home/models/header/config.json b/grobid-home/models/header/config.json new file mode 100644 index 0000000000..9d3edfed1b --- /dev/null +++ b/grobid-home/models/header/config.json @@ -0,0 +1,20 @@ +{ + "use_ELMo": false, + "fold_number": 1, + "char_embedding_size": 25, + "use_crf": true, + "num_char_lstm_units": 25, + "model_name": "header", + "dropout": 0.5, + "char_vocab_size": 338, + "batch_size": 10, + "max_char_length": 30, + "case_vocab_size": 8, + "model_type": "BidLSTM_CRF", + 
"case_embedding_size": 5, + "num_word_lstm_units": 100, + "recurrent_dropout": 0.5, + "word_embedding_size": 300, + "embeddings_name": "glove-840B", + "use_char_feature": true +} diff --git a/grobid-home/models/header/model_weights.hdf5 b/grobid-home/models/header/model_weights.hdf5 new file mode 100644 index 0000000000..5272a7c35b Binary files /dev/null and b/grobid-home/models/header/model_weights.hdf5 differ diff --git a/grobid-home/models/header/preprocessor.pkl b/grobid-home/models/header/preprocessor.pkl new file mode 100644 index 0000000000..31e5ba6ff8 Binary files /dev/null and b/grobid-home/models/header/preprocessor.pkl differ diff --git a/grobid-home/models/reference-segmenter/config.json b/grobid-home/models/reference-segmenter/config.json new file mode 100644 index 0000000000..6e0159c852 --- /dev/null +++ b/grobid-home/models/reference-segmenter/config.json @@ -0,0 +1,20 @@ +{ + "max_char_length": 30, + "char_vocab_size": 146, + "batch_size": 20, + "case_embedding_size": 5, + "recurrent_dropout": 0.5, + "model_type": "BidLSTM_CRF", + "fold_number": 1, + "use_char_feature": true, + "word_embedding_size": 300, + "embeddings_name": "glove-840B", + "num_word_lstm_units": 100, + "use_crf": true, + "model_name": "reference-segmenter", + "num_char_lstm_units": 25, + "case_vocab_size": 8, + "char_embedding_size": 25, + "dropout": 0.5, + "use_ELMo": false +} diff --git a/grobid-home/models/reference-segmenter/model_weights.hdf5 b/grobid-home/models/reference-segmenter/model_weights.hdf5 new file mode 100644 index 0000000000..d2245f8ac5 Binary files /dev/null and b/grobid-home/models/reference-segmenter/model_weights.hdf5 differ diff --git a/grobid-home/models/reference-segmenter/preprocessor.pkl b/grobid-home/models/reference-segmenter/preprocessor.pkl new file mode 100644 index 0000000000..70b007dccc Binary files /dev/null and b/grobid-home/models/reference-segmenter/preprocessor.pkl differ diff --git a/grobid-home/models/table/config.json 
b/grobid-home/models/table/config.json index 24e9865a8b..d666f0181a 100644 --- a/grobid-home/models/table/config.json +++ b/grobid-home/models/table/config.json @@ -1,20 +1,20 @@ { - "model_type": "BidLSTM_CRF", - "char_vocab_size": 109, - "dropout": 0.5, - "batch_size": 20, "max_char_length": 30, - "num_char_lstm_units": 25, - "char_embedding_size": 25, - "use_ELMo": false, - "fold_number": 1, - "use_crf": true, + "embeddings_name": "glove-840B", + "dropout": 0.5, + "case_embedding_size": 5, "model_name": "table", + "model_type": "BidLSTM_CRF", "num_word_lstm_units": 100, + "char_vocab_size": 109, + "fold_number": 1, + "char_embedding_size": 25, "use_char_feature": true, - "embeddings_name": "glove-840B", + "num_char_lstm_units": 25, "case_vocab_size": 8, - "case_embedding_size": 5, + "batch_size": 20, + "use_ELMo": false, "recurrent_dropout": 0.5, - "word_embedding_size": 300 -} \ No newline at end of file + "word_embedding_size": 300, + "use_crf": true +} diff --git a/grobid-home/models/table/model_weights.hdf5 b/grobid-home/models/table/model_weights.hdf5 index d3ba6de5ba..3bb0c5f304 100644 Binary files a/grobid-home/models/table/model_weights.hdf5 and b/grobid-home/models/table/model_weights.hdf5 differ diff --git a/grobid-home/models/table/preprocessor.pkl b/grobid-home/models/table/preprocessor.pkl index 51e0d3bdc1..bac0024632 100644 Binary files a/grobid-home/models/table/preprocessor.pkl and b/grobid-home/models/table/preprocessor.pkl differ diff --git a/grobid-trainer/doc/PMC_sample_1943.results.grobid-0.5.4-DeLFT-29.12.2018 b/grobid-trainer/doc/PMC_sample_1943.results.grobid-0.5.4-DeLFT-29.12.2018 new file mode 100644 index 0000000000..f01a833be7 --- /dev/null +++ b/grobid-trainer/doc/PMC_sample_1943.results.grobid-0.5.4-DeLFT-29.12.2018 @@ -0,0 +1,319 @@ +Evaluation metrics produced in 657.811 seconds + +======= Header metadata ======= + +Evaluation on 1943 random PDF files out of 1943 PDF (ratio 1.0). 
+ +======= Strict Matching ======= (exact matches) + +===== Field-level results ===== + +label accuracy precision recall f1 + +abstract 81.16 10.9 10.73 10.81 +authors 91.75 62.45 61.1 61.77 +first_author 97.18 89.71 86.71 88.18 +keywords 89.14 33.33 29.42 31.25 +title 90.08 54.39 53.27 53.82 + +all fields 89.86 51.44 49.53 50.47 (micro average) + 89.86 50.16 48.25 49.17 (macro average) + + +======== Soft Matching ======== (ignoring punctuation, case and space characters mismatches) + +===== Field-level results ===== + +label accuracy precision recall f1 + +abstract 87.09 39.77 39.14 39.45 +authors 91.82 62.77 61.41 62.08 +first_author 97.24 89.98 86.97 88.45 +keywords 89.98 39.66 35 37.18 +title 91.8 62.64 61.35 61.99 + +all fields 91.58 60.42 58.17 59.27 (micro average) + 91.58 58.96 56.77 57.83 (macro average) + + +==== Levenshtein Matching ===== (Minimum Levenshtein distance at 0.8) + +===== Field-level results ===== + +label accuracy precision recall f1 + +abstract 93.81 72.46 71.32 71.89 +authors 95.04 78.3 76.61 77.45 +first_author 97.31 90.35 87.33 88.81 +keywords 94.07 70.36 62.1 65.97 +title 96.1 83.34 81.63 82.48 + +all fields 95.27 79.62 76.66 78.11 (micro average) + 95.27 78.96 75.8 77.32 (macro average) + + += Ratcliff/Obershelp Matching = (Minimum Ratcliff/Obershelp similarity at 0.95) + +===== Field-level results ===== + +label accuracy precision recall f1 + +abstract 92.73 67.2 66.14 66.67 +authors 93.05 68.72 67.23 67.97 +first_author 97.19 89.77 86.76 88.24 +keywords 91.23 49.01 43.26 45.96 +title 93.71 71.83 70.36 71.09 + +all fields 93.58 70.83 68.2 69.49 (micro average) + 93.58 69.31 66.75 67.98 (macro average) + +===== Instance-level results ===== + +Total expected instances: 1943 +Total correct instances: 47 (strict) +Total correct instances: 178 (soft) +Total correct instances: 741 (Levenshtein) +Total correct instances: 422 (ObservedRatcliffObershelp) + +Instance-level recall: 2.42 (strict) +Instance-level recall: 9.16 (soft) 
+Instance-level recall: 38.14 (Levenshtein) +Instance-level recall: 21.72 (RatcliffObershelp) + +======= Citation metadata ======= + +Evaluation on 1943 random PDF files out of 1943 PDF (ratio 1.0). + +======= Strict Matching ======= (exact matches) + +===== Field-level results ===== + +label accuracy precision recall f1 + +authors 93.42 53.24 33.21 40.91 +date 97.6 83.35 51.23 63.46 +first_author 94.68 62.43 38.83 47.88 +inTitle 94.56 61.15 40.92 49.03 +issue 99.08 77.26 60.58 67.91 +page 96.1 80.13 45.61 58.13 +title 96.42 73.55 47.58 57.78 +volume 97.84 84.51 56.15 67.47 + +all fields 96.21 71.19 45.14 55.25 (micro average) + 96.21 71.95 46.77 56.57 (macro average) + + +======== Soft Matching ======== (ignoring punctuation, case and space characters mismatches) + +===== Field-level results ===== + +label accuracy precision recall f1 + +authors 93.47 53.58 33.42 41.17 +date 97.6 83.35 51.23 63.46 +first_author 94.68 62.46 38.85 47.9 +inTitle 96 71.41 47.79 57.26 +issue 99.08 77.26 60.58 67.91 +page 96.1 80.13 45.61 58.13 +title 97.9 84.53 54.68 66.41 +volume 97.84 84.51 56.15 67.47 + +all fields 96.58 74.22 47.06 57.6 (micro average) + 96.58 74.65 48.54 58.71 (macro average) + + +==== Levenshtein Matching ===== (Minimum Levenshtein distance at 0.8) + +===== Field-level results ===== + +label accuracy precision recall f1 + +authors 95.09 65.29 40.73 50.17 +date 97.6 83.35 51.23 63.46 +first_author 94.69 62.57 38.92 47.99 +inTitle 96.17 72.67 48.63 58.27 +issue 99.08 77.26 60.58 67.91 +page 96.1 80.13 45.61 58.13 +title 98.17 86.52 55.97 67.97 +volume 97.84 84.51 56.15 67.47 + +all fields 96.84 76.34 48.41 59.25 (micro average) + 96.84 76.54 49.73 60.17 (macro average) + + += Ratcliff/Obershelp Matching = (Minimum Ratcliff/Obershelp similarity at 0.95) + +===== Field-level results ===== + +label accuracy precision recall f1 + +authors 93.81 56.03 34.95 43.05 +date 97.6 83.35 51.23 63.46 +first_author 94.68 62.44 38.84 47.89 +inTitle 95.81 70.12 46.93 56.23 +issue 
99.08 77.26 60.58 67.91 +page 96.1 80.13 45.61 58.13 +title 98.11 86.08 55.69 67.63 +volume 97.84 84.51 56.15 67.47 + +all fields 96.63 74.59 47.3 57.89 (micro average) + 96.63 74.99 48.75 58.97 (macro average) + +===== Instance-level results ===== + +Total expected instances: 90125 +Total extracted instances: 69072 +Total correct instances: 13055 (strict) +Total correct instances: 17184 (soft) +Total correct instances: 18715 (Levenshtein) +Total correct instances: 17496 (RatcliffObershelp) + +Instance-level precision: 18.9 (strict) +Instance-level precision: 24.88 (soft) +Instance-level precision: 27.09 (Levenshtein) +Instance-level precision: 25.33 (RatcliffObershelp) + +Instance-level recall: 14.49 (strict) +Instance-level recall: 19.07 (soft) +Instance-level recall: 20.77 (Levenshtein) +Instance-level recall: 19.41 (RatcliffObershelp) + +Instance-level f-score: 16.4 (strict) +Instance-level f-score: 21.59 (soft) +Instance-level f-score: 23.51 (Levenshtein) +Instance-level f-score: 21.98 (RatcliffObershelp) + +Matching 1 : 41359 + +Matching 2 : 2223 + +Matching 3 : 1892 + +Matching 4 : 2304 + +Total matches : 47778 + +======= Fulltext structures ======= + +Evaluation on 1943 random PDF files out of 1943 PDF (ratio 1.0). 
+ +======= Strict Matching ======= (exact matches) + +===== Field-level results ===== + +label accuracy precision recall f1 + +figure_title 96.64 28.88 20.42 23.92 +reference_citation 56.44 56.14 52.34 54.18 +reference_figure 94.47 60.98 60.96 60.97 +reference_table 99.07 83.06 82.28 82.67 +section_title 94.3 74.48 66.52 70.28 +table_title 98.61 0 0 0 + +all fields 89.92 59.48 54.12 56.67 (micro average) + 89.92 50.59 47.09 48.67 (macro average) + + +======== Soft Matching ======== (ignoring punctuation, case and space characters mismatches) + +===== Field-level results ===== + +label accuracy precision recall f1 + +figure_title 98.2 73.08 51.66 60.53 +reference_citation 58.93 60.29 56.21 58.18 +reference_figure 94.43 61.97 61.96 61.96 +reference_table 99.07 83.57 82.79 83.18 +section_title 94.95 78.78 70.36 74.33 +table_title 98.57 0 0 0 + +all fields 90.69 64.27 58.48 61.24 (micro average) + 90.69 59.62 53.83 56.36 (macro average) + + +************************************************************************************ +COUNTER: org.grobid.core.engines.counters.TableRejectionCounters +************************************************************************************ +------------------------------------------------------------------------------------ + EMPTY_LABEL_OR_HEADER_OR_CONTENT: 5408 +==================================================================================== + +************************************************************************************ +COUNTER: org.grobid.core.engines.counters.ReferenceMarkerMatcherCounters +************************************************************************************ +------------------------------------------------------------------------------------ + UNMATCHED_REF_MARKERS: 43396 + MATCHED_REF_MARKERS_AFTER_POST_FILTERING: 995 + STYLE_AUTHORS: 35414 + STYLE_NUMBERED: 48321 + MANY_CANDIDATES: 3768 + MANY_CANDIDATES_AFTER_POST_FILTERING: 188 + NO_CANDIDATES: 63513 + INPUT_REF_STRINGS_CNT: 88311 + 
MATCHED_REF_MARKERS: 62303 + NO_CANDIDATES_AFTER_POST_FILTERING: 1894 + STYLE_OTHER: 4576 +==================================================================================== + +************************************************************************************ +COUNTER: org.grobid.core.engines.label.TaggingLabelImpl +************************************************************************************ +------------------------------------------------------------------------------------ + CITATION_TITLE: 81843 + NAME-HEADER_MIDDLENAME: 4381 + TABLE_FIGDESC: 7 + NAME-CITATION_PAD: 523 + NAME-HEADER_SURNAME: 11644 + NAME-CITATION_OTHER: 350453 + CITATION_BOOKTITLE: 6817 + CITATION_NOTE: 2723 + FULLTEXT_CITATION_MARKER: 176254 + FULLTEXT_TABLE_MARKER: 14605 + CITATION_WEB: 1813 + FULLTEXT_SECTION: 51259 + NAME-HEADER_FORENAME: 11822 + TABLE_CONTENT: 32649 + CITATION_COLLABORATION: 163 + CITATION_ISSUE: 19057 + CITATION_JOURNAL: 77177 + NAME-CITATION_SURNAME: 283318 + FULLTEXT_EQUATION_MARKER: 1721 + CITATION_OTHER: 439213 + FULLTEXT_FIGURE_MARKER: 38906 + CITATION_TECH: 258 + FIGURE_CONTENT: 4371 + FIGURE_LABEL: 5037 + FULLTEXT_EQUATION_LABEL: 1819 + FULLTEXT_EQUATION: 3923 + TABLE_PAD: 28101 + NAME-HEADER_PAD: 47 + CITATION_DATE: 88025 + FULLTEXT_FIGURE: 14735 + CITATION_AUTHOR: 94783 + FULLTEXT_TABLE: 11213 + CITATION_EDITOR: 6560 + FULLTEXT_OTHER: 355 + NAME-HEADER_OTHER: 13684 + FIGURE_FIGDESC: 8994 + NAME-HEADER_SUFFIX: 9 + CITATION_VOLUME: 75596 + CITATION_LOCATION: 7180 + NAME-CITATION_SUFFIX: 305 + NAME-HEADER_TITLE: 536 + CITATION_INSTITUTION: 1048 + CITATION_PAD: 90 + CITATION_PAGES: 77316 + NAME-HEADER_MARKER: 7571 + NAME-CITATION_FORENAME: 269482 + CITATION_PUBLISHER: 4732 + CITATION_PUBNUM: 10289 + FIGURE_PAD: 1762 + NAME-CITATION_MIDDLENAME: 46218 + FULLTEXT_PARAGRAPH: 370985 + FIGURE_FIGURE_HEAD: 8523 +==================================================================================== 
+==================================================================================== + + + +BUILD SUCCESSFUL in 3h 8m 7s \ No newline at end of file diff --git a/grobid-trainer/src/main/java/org/grobid/trainer/evaluation/EndToEndEvaluation.java b/grobid-trainer/src/main/java/org/grobid/trainer/evaluation/EndToEndEvaluation.java index caf275aa2d..e652c7000d 100644 --- a/grobid-trainer/src/main/java/org/grobid/trainer/evaluation/EndToEndEvaluation.java +++ b/grobid-trainer/src/main/java/org/grobid/trainer/evaluation/EndToEndEvaluation.java @@ -152,8 +152,8 @@ public boolean accept(File dir, String name) { System.out.println(n + " - " + pdfFile.getPath()); GrobidAnalysisConfig config = GrobidAnalysisConfig.builder() - .consolidateHeader(0) - .consolidateCitations(1) + .consolidateHeader(1) + .consolidateCitations(0) .withPreprocessImages(true) .build(); String tei = engine.fullTextToTEI(pdfFile, config); diff --git a/grobid-trainer/src/main/resources/log4j.xml b/grobid-trainer/src/main/resources/log4j.xml index 483fb2d744..314d067193 100644 --- a/grobid-trainer/src/main/resources/log4j.xml +++ b/grobid-trainer/src/main/resources/log4j.xml @@ -10,7 +10,8 @@ - - + + + \ No newline at end of file