Skip to content

Commit

Permalink
add DeLFT models; make both Wapiti and DeLFT usable at the same time;…
Browse files Browse the repository at this point in the history
… some fixes

Former-commit-id: ab43d65
  • Loading branch information
kermitt2 committed Dec 29, 2018
1 parent 9da8702 commit 8d086f4
Show file tree
Hide file tree
Showing 34 changed files with 540 additions and 168 deletions.
11 changes: 7 additions & 4 deletions build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -55,10 +55,10 @@ subprojects {
}
}

configurations {
/*configurations {
all*.exclude group: 'org.slf4j', module: "slf4j-log4j12"
all*.exclude group: 'log4j', module: "log4j"
}
}*/

ext {
// treating them separately, these jars will be flattened into grobid-core.jar on installing,
Expand Down Expand Up @@ -199,6 +199,7 @@ project("grobid-core") {
compile "org.apache.lucene:lucene-analyzers-common:4.5.1"
compile 'javax.xml.bind:jaxb-api:2.3.0'
compile 'black.ninia:jep:3.8.2'
compile 'org.slf4j:slf4j-log4j12:1.7.25'

shadedLib "org.apache.lucene:lucene-analyzers-common:4.5.1"

Expand Down Expand Up @@ -280,6 +281,8 @@ project(":grobid-service") {

configurations {
all*.exclude group: 'org.slf4j', module: "slf4j-jdk14"
all*.exclude group: 'org.slf4j', module: "slf4j-log4j12"
all*.exclude group: 'log4j', module: "log4j"
}

// tasks.distZip.enabled = false
Expand Down Expand Up @@ -372,8 +375,8 @@ project(":grobid-trainer") {
}

// run like:
// gradle PubMedCentralEval -Pp2t=/path/to/goldenSet
// gradle PubMedCentralEval -Pp2t=/path/to/goldenSet -Prun=1 -PfileRatio=0.1
// ./gradlew PubMedCentralEval -Pp2t=/path/to/goldenSet
// ./gradlew PubMedCentralEval -Pp2t=/path/to/goldenSet -Prun=1 -PfileRatio=0.1
// ./gradlew PrepareDOIMatching -Pp2t=ABS_PATH_TO_PMC/PMC_sample_1943
// ./gradlew EvaluateDOIMatching -Pp2t=ABS_PATH_TO_PMC/PMC_sample_1943
task(PubMedCentralEval, dependsOn: 'classes', type: JavaExec, group: 'modelevaluation') {
Expand Down
10 changes: 5 additions & 5 deletions grobid-core/src/main/java/org/grobid/core/data/BiblioItem.java
Original file line number Diff line number Diff line change
Expand Up @@ -3275,7 +3275,7 @@ public void attachAffiliations() {
aff.setFailAffiliation(false);
}
} else if (hasMarker) {
// we get the marker for each affiliation and try to find the related author in the
// we get the marker for each affiliation and try to find the related author in the
// original author field
for (Affiliation aff : fullAffiliations) {
if (aff.getMarker() != null) {
Expand Down Expand Up @@ -3391,16 +3391,18 @@ public void attachAffiliations() {
}
}
}
} else if (nbAuthors == nbAffiliations) {
} /*else if (nbAuthors == nbAffiliations) {
// risky heuristics, we distribute in this case one affiliation per author
// preserving author
// sometimes 2 affiliations belong both to 2 authors, for these case, the layout
// positioning should be studied
for (int p = 0; p < nbAuthors; p++) {
fullAuthors.get(p).addAffiliation(fullAffiliations.get(p));
System.out.println("attachment: " + p);
System.out.println(fullAuthors.get(p));
fullAffiliations.get(p).setFailAffiliation(false);
}
}
}*/
}


Expand All @@ -3412,7 +3414,6 @@ public String toTEIAuthorBlock(int nbTag, boolean withCoordinates) {
int nbAuthors = 0;
int nbAffiliations = 0;
int nbAddresses = 0;

// uncomment below when collaboration will be concretely added to headers
/*
if ( (collaboration != null) &&
Expand Down Expand Up @@ -3497,7 +3498,6 @@ public String toTEIAuthorBlock(int nbTag, boolean withCoordinates) {
}

if (author.getAffiliations() != null) {

for (Affiliation aff : author.getAffiliations()) {
TextUtilities.appendN(tei, '\t', nbTag + 1);
tei.append("<affiliation");
Expand Down
5 changes: 5 additions & 0 deletions grobid-core/src/main/java/org/grobid/core/data/Person.java
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,11 @@ public String toString() {
if (email != null) {
res += " (email:" + email + ")";
}
if (affiliations != null) {
for(Affiliation aff : affiliations) {
res += " (affiliation: " + aff.toString() + ") ";
}
}
return res.trim();
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,10 @@ private void filterAffiliationAddress(String result,
lastLabel = null;
}
else {
String[] s = line.split("\t");
String delimiter = "\t";
if (line.indexOf(delimiter) == -1)
delimiter = " ";
String[] s = line.split(delimiter);
String s0 = s[0].trim();
int p0 = p;
boolean strop = false;
Expand Down Expand Up @@ -172,39 +175,8 @@ private ArrayList<Affiliation> processingReflow(List<String> affiliationBlocks,
return resultBuilder(res, tokenizations, false); // normally use pre-label because it is a reflow
}


/**
 * Debug helper that accumulates raw tagger output line by line and splits it
 * into individual lines for inspection. Output goes to stdout only.
 * NOTE(review): the name "Tahher" looks like a typo for "Tagger"; kept
 * unchanged for source compatibility with existing call sites.
 */
static class DebugTahher {
    // Accumulated text. A StringBuilder replaces the original `str += s`
    // String concatenation, which is O(n^2) when add() is called repeatedly.
    private final StringBuilder str = new StringBuilder();

    /** Appends a fragment of tagger output to the accumulated text. */
    public void add(String s) {
        str.append(s);
    }

    /** Discards all accumulated text. */
    public void clear() {
        str.setLength(0);
    }

    // Lines of the accumulated text, populated by parse().
    // Package-private visibility preserved from the original declaration.
    String[] split;

    /**
     * Dumps the accumulated text to stdout and splits it on '\n'.
     *
     * @return always {@code true} (kept for call-site compatibility)
     */
    public boolean parse() {
        System.out.println("Parsing:\n" + str + "\n------------------");
        split = str.toString().split("\n");
        return true;
    }

    /**
     * @return the number of lines produced by the last {@link #parse()} call;
     *         throws {@link NullPointerException} if parse() has not been
     *         called yet (original behavior, intentionally preserved)
     */
    public int size() {
        return split.length;
    }
}

private String runReflow(List<String> affiliationBlocks,
List<LayoutToken> tokenizations) {
// StringBuilder res = new StringBuilder();
// DebugTahher tagger = new DebugTahher();
try {
List<List<OffsetPosition>> placesPositions = new ArrayList<List<OffsetPosition>>();
placesPositions.add(lexicon.tokenPositionsCityNames(tokenizations));
Expand All @@ -217,10 +189,7 @@ private String runReflow(List<String> affiliationBlocks,
return null;
}

String res = label(header);
res = label(res);

return res;
return label(header);
} catch (Exception e) {
throw new GrobidException("An exception occured while running Grobid.", e);
}
Expand Down Expand Up @@ -276,7 +245,10 @@ private ArrayList<Affiliation> resultBuilder(String result,
hasAddress = false;
continue;
}
StringTokenizer st3 = new StringTokenizer(line, "\t");
String delimiter = "\t";
if (line.indexOf(delimiter) == -1)
delimiter = " ";
StringTokenizer st3 = new StringTokenizer(line, delimiter);
int ll = st3.countTokens();
int i = 0;
String s1 = null; // predicted label
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import org.grobid.core.GrobidModels;
import org.grobid.core.data.BiblioItem;
import org.grobid.core.data.Date;
import org.grobid.core.data.Affiliation;
import org.grobid.core.data.Keyword;
import org.grobid.core.data.Person;
import org.grobid.core.document.Document;
Expand Down Expand Up @@ -123,7 +124,6 @@ public String processingHeaderBlock(int consolidate, Document doc, BiblioItem re
header = doc.getHeaderFeatured(false, true);
}*/
List<LayoutToken> tokenizations = doc.getTokenizationsHeader();
//System.out.println(tokenizations.toString());

if ((header != null) && (header.trim().length() > 0)) {
String res = label(header);
Expand Down Expand Up @@ -199,8 +199,8 @@ public String processingHeaderBlock(int consolidate, Document doc, BiblioItem re
}
}

resHeader.setFullAffiliations(
parsers.getAffiliationAddressParser().processReflow(res, tokenizations));
List<Affiliation> affiliations = parsers.getAffiliationAddressParser().processReflow(res, tokenizations);
resHeader.setFullAffiliations(affiliations);
resHeader.attachEmails();
boolean attached = false;
if (fragmentedAuthors && !hasMarker) {
Expand Down Expand Up @@ -264,10 +264,7 @@ public String processingHeaderBlock(int consolidate, Document doc, BiblioItem re
}
}

//if (consolidate)
{
resHeader = consolidateHeader(resHeader, consolidate);
}
resHeader = consolidateHeader(resHeader, consolidate);

// normalization of dates
if (resHeader != null) {
Expand Down Expand Up @@ -478,10 +475,7 @@ public String processingHeaderSection(int consolidate, Document doc, BiblioItem
}
}

//if (consolidate)
{
resHeader = consolidateHeader(resHeader, consolidate);
}
resHeader = consolidateHeader(resHeader, consolidate);

// normalization of dates
if (resHeader != null) {
Expand Down Expand Up @@ -1076,7 +1070,10 @@ public BiblioItem resultExtraction(String result, boolean intro, List<LayoutToke
if (tok.length() == 0) {
continue;
}
StringTokenizer stt = new StringTokenizer(tok, "\t");
String delimiter = "\t";
if (tok.indexOf(delimiter) == -1)
delimiter = " ";
StringTokenizer stt = new StringTokenizer(tok, delimiter);
List<String> localFeatures = new ArrayList<String>();
int i = 0;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,12 @@ public static synchronized GenericTagger getTagger(GrobidModel model) {
t = new WapitiTagger(model);
break;
case DELFT:
t = new DeLFTTagger(model);
// if model is fulltext or segmentation we use currently WAPITI as fallback because they
// are not covered by DeLFT for the moment
if (model.getModelName().equals("fulltext") || model.getModelName().equals("segmentation"))
t = new WapitiTagger(model);
else
t = new DeLFTTagger(model);
break;
default:
throw new IllegalStateException("Unsupported Grobid sequence labelling engine: " + GrobidProperties.getGrobidCRFEngine());
Expand Down
14 changes: 12 additions & 2 deletions grobid-core/src/main/java/org/grobid/core/jni/DeLFTModel.java
Original file line number Diff line number Diff line change
Expand Up @@ -176,9 +176,14 @@ public void run() {
jep.eval("print(len(x_train), 'train sequences')");
jep.eval("print(len(x_valid), 'validation sequences')");

String useELMo = "False";
if (GrobidProperties.getInstance().useELMo()) {
useELMo = "True";
}

// init model to be trained
jep.eval("model = sequenceLabelling.Sequence('"+this.modelName+
"', max_epoch=100, recurrent_dropout=0.50, embeddings_name='glove-840B', use_ELMo=False)");
"', max_epoch=100, recurrent_dropout=0.50, embeddings_name='glove-840B', use_ELMo="+useELMo+")");

// actual training
//start_time = time.time()
Expand Down Expand Up @@ -212,12 +217,17 @@ public void run() {
public static void train(String modelName, File trainingData, File outputModel) {
try {
LOGGER.info("Train DeLFT model " + modelName + "...");
ProcessBuilder pb = new ProcessBuilder("python3",
List<String> command = Arrays.asList("python3",
"grobidTagger.py",
modelName,
"train",
"--input", trainingData.getAbsolutePath(),
"--output", GrobidProperties.getInstance().getModelPath().getAbsolutePath());
if (GrobidProperties.getInstance().useELMo()) {
command.add("--use-ELMo");
}

ProcessBuilder pb = new ProcessBuilder(command);
File delftPath = new File(GrobidProperties.getInstance().getDeLFTFilePath());
pb.directory(delftPath);
Process process = pb.start();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@ public Jep getJEPInstance() {
throw new GrobidResourceException("DeLFT installation path is not a directory");
}
config.addIncludePaths(delftPath.getAbsolutePath());
config.setClassLoader(Thread.currentThread().getContextClassLoader());
//System.out.println("jep instance thread: " + Thread.currentThread().getId());
Jep jep = new Jep(config);
jepInstances.put(Thread.currentThread().getId(), jep);
Expand Down

This file was deleted.

Loading

0 comments on commit 8d086f4

Please sign in to comment.