Skip to content

Commit

Permalink
add DeLFT models; make both Wapiti and DeLFT usable at the same time;…
Browse files Browse the repository at this point in the history
… some fixes

Former-commit-id: ab43d65
  • Loading branch information
kermitt2 committed Dec 29, 2018
1 parent 9da8702 commit 8d086f4
Show file tree
Hide file tree
Showing 34 changed files with 540 additions and 168 deletions.
11 changes: 7 additions & 4 deletions build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -55,10 +55,10 @@ subprojects {
}
}

configurations {
/*configurations {
all*.exclude group: 'org.slf4j', module: "slf4j-log4j12"
all*.exclude group: 'log4j', module: "log4j"
}
}*/

ext {
// treating them separately, these jars will be flattened into grobid-core.jar on installing,
Expand Down Expand Up @@ -199,6 +199,7 @@ project("grobid-core") {
compile "org.apache.lucene:lucene-analyzers-common:4.5.1"
compile 'javax.xml.bind:jaxb-api:2.3.0'
compile 'black.ninia:jep:3.8.2'
compile 'org.slf4j:slf4j-log4j12:1.7.25'

shadedLib "org.apache.lucene:lucene-analyzers-common:4.5.1"

Expand Down Expand Up @@ -280,6 +281,8 @@ project(":grobid-service") {

configurations {
all*.exclude group: 'org.slf4j', module: "slf4j-jdk14"
all*.exclude group: 'org.slf4j', module: "slf4j-log4j12"
all*.exclude group: 'log4j', module: "log4j"
}

// tasks.distZip.enabled = false
Expand Down Expand Up @@ -372,8 +375,8 @@ project(":grobid-trainer") {
}

// run like:
// gradle PubMedCentralEval -Pp2t=/path/to/goldenSet
// gradle PubMedCentralEval -Pp2t=/path/to/goldenSet -Prun=1 -PfileRatio=0.1
// ./gradlew PubMedCentralEval -Pp2t=/path/to/goldenSet
// ./gradlew PubMedCentralEval -Pp2t=/path/to/goldenSet -Prun=1 -PfileRatio=0.1
// ./gradlew PrepareDOIMatching -Pp2t=ABS_PATH_TO_PMC/PMC_sample_1943
// ./gradlew EvaluateDOIMatching -Pp2t=ABS_PATH_TO_PMC/PMC_sample_1943
task(PubMedCentralEval, dependsOn: 'classes', type: JavaExec, group: 'modelevaluation') {
Expand Down
10 changes: 5 additions & 5 deletions grobid-core/src/main/java/org/grobid/core/data/BiblioItem.java
Original file line number Diff line number Diff line change
Expand Up @@ -3275,7 +3275,7 @@ public void attachAffiliations() {
aff.setFailAffiliation(false);
}
} else if (hasMarker) {
// we get the marker for each affiliation and try to find the related author in the
// we get the marker for each affiliation and try to find the related author in the
// original author field
for (Affiliation aff : fullAffiliations) {
if (aff.getMarker() != null) {
Expand Down Expand Up @@ -3391,16 +3391,18 @@ public void attachAffiliations() {
}
}
}
} else if (nbAuthors == nbAffiliations) {
} /*else if (nbAuthors == nbAffiliations) {
// risky heuristics, we distribute in this case one affiliation per author
// preserving author
// sometimes 2 affiliations belong both to 2 authors, for these case, the layout
// positioning should be studied
for (int p = 0; p < nbAuthors; p++) {
fullAuthors.get(p).addAffiliation(fullAffiliations.get(p));
System.out.println("attachment: " + p);
System.out.println(fullAuthors.get(p));
fullAffiliations.get(p).setFailAffiliation(false);
}
}
}*/
}


Expand All @@ -3412,7 +3414,6 @@ public String toTEIAuthorBlock(int nbTag, boolean withCoordinates) {
int nbAuthors = 0;
int nbAffiliations = 0;
int nbAddresses = 0;

// uncomment below when collaboration will be concretely added to headers
/*
if ( (collaboration != null) &&
Expand Down Expand Up @@ -3497,7 +3498,6 @@ public String toTEIAuthorBlock(int nbTag, boolean withCoordinates) {
}

if (author.getAffiliations() != null) {

for (Affiliation aff : author.getAffiliations()) {
TextUtilities.appendN(tei, '\t', nbTag + 1);
tei.append("<affiliation");
Expand Down
5 changes: 5 additions & 0 deletions grobid-core/src/main/java/org/grobid/core/data/Person.java
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,11 @@ public String toString() {
if (email != null) {
res += " (email:" + email + ")";
}
if (affiliations != null) {
for(Affiliation aff : affiliations) {
res += " (affiliation: " + aff.toString() + ") ";
}
}
return res.trim();
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,10 @@ private void filterAffiliationAddress(String result,
lastLabel = null;
}
else {
String[] s = line.split("\t");
String delimiter = "\t";
if (line.indexOf(delimiter) == -1)
delimiter = " ";
String[] s = line.split(delimiter);
String s0 = s[0].trim();
int p0 = p;
boolean strop = false;
Expand Down Expand Up @@ -172,39 +175,8 @@ private ArrayList<Affiliation> processingReflow(List<String> affiliationBlocks,
return resultBuilder(res, tokenizations, false); // normally use pre-label because it is a reflow
}


/**
 * Debug helper that accumulates raw tagger output line by line and splits it
 * into individual lines for inspection. Output goes to stdout only.
 * NOTE(review): the name "Tahher" looks like a typo for "Tagger"; kept
 * unchanged for source compatibility with existing call sites.
 */
static class DebugTahher {
    // Accumulated text. A StringBuilder replaces the original `str += s`
    // String concatenation, which is O(n^2) when add() is called repeatedly.
    private final StringBuilder str = new StringBuilder();

    /** Appends a fragment of tagger output to the accumulated text. */
    public void add(String s) {
        str.append(s);
    }

    /** Discards all accumulated text. */
    public void clear() {
        str.setLength(0);
    }

    // Lines of the accumulated text, populated by parse().
    // Package-private visibility preserved from the original declaration.
    String[] split;

    /**
     * Dumps the accumulated text to stdout and splits it on '\n'.
     *
     * @return always {@code true} (kept for call-site compatibility)
     */
    public boolean parse() {
        System.out.println("Parsing:\n" + str + "\n------------------");
        split = str.toString().split("\n");
        return true;
    }

    /**
     * @return the number of lines produced by the last {@link #parse()} call;
     *         throws {@link NullPointerException} if parse() has not been
     *         called yet (original behavior, intentionally preserved)
     */
    public int size() {
        return split.length;
    }
}

private String runReflow(List<String> affiliationBlocks,
List<LayoutToken> tokenizations) {
// StringBuilder res = new StringBuilder();
// DebugTahher tagger = new DebugTahher();
try {
List<List<OffsetPosition>> placesPositions = new ArrayList<List<OffsetPosition>>();
placesPositions.add(lexicon.tokenPositionsCityNames(tokenizations));
Expand All @@ -217,10 +189,7 @@ private String runReflow(List<String> affiliationBlocks,
return null;
}

String res = label(header);
res = label(res);

return res;
return label(header);
} catch (Exception e) {
throw new GrobidException("An exception occured while running Grobid.", e);
}
Expand Down Expand Up @@ -276,7 +245,10 @@ private ArrayList<Affiliation> resultBuilder(String result,
hasAddress = false;
continue;
}
StringTokenizer st3 = new StringTokenizer(line, "\t");
String delimiter = "\t";
if (line.indexOf(delimiter) == -1)
delimiter = " ";
StringTokenizer st3 = new StringTokenizer(line, delimiter);
int ll = st3.countTokens();
int i = 0;
String s1 = null; // predicted label
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import org.grobid.core.GrobidModels;
import org.grobid.core.data.BiblioItem;
import org.grobid.core.data.Date;
import org.grobid.core.data.Affiliation;
import org.grobid.core.data.Keyword;
import org.grobid.core.data.Person;
import org.grobid.core.document.Document;
Expand Down Expand Up @@ -123,7 +124,6 @@ public String processingHeaderBlock(int consolidate, Document doc, BiblioItem re
header = doc.getHeaderFeatured(false, true);
}*/
List<LayoutToken> tokenizations = doc.getTokenizationsHeader();
//System.out.println(tokenizations.toString());

if ((header != null) && (header.trim().length() > 0)) {
String res = label(header);
Expand Down Expand Up @@ -199,8 +199,8 @@ public String processingHeaderBlock(int consolidate, Document doc, BiblioItem re
}
}

resHeader.setFullAffiliations(
parsers.getAffiliationAddressParser().processReflow(res, tokenizations));
List<Affiliation> affiliations = parsers.getAffiliationAddressParser().processReflow(res, tokenizations);
resHeader.setFullAffiliations(affiliations);
resHeader.attachEmails();
boolean attached = false;
if (fragmentedAuthors && !hasMarker) {
Expand Down Expand Up @@ -264,10 +264,7 @@ public String processingHeaderBlock(int consolidate, Document doc, BiblioItem re
}
}

//if (consolidate)
{
resHeader = consolidateHeader(resHeader, consolidate);
}
resHeader = consolidateHeader(resHeader, consolidate);

// normalization of dates
if (resHeader != null) {
Expand Down Expand Up @@ -478,10 +475,7 @@ public String processingHeaderSection(int consolidate, Document doc, BiblioItem
}
}

//if (consolidate)
{
resHeader = consolidateHeader(resHeader, consolidate);
}
resHeader = consolidateHeader(resHeader, consolidate);

// normalization of dates
if (resHeader != null) {
Expand Down Expand Up @@ -1076,7 +1070,10 @@ public BiblioItem resultExtraction(String result, boolean intro, List<LayoutToke
if (tok.length() == 0) {
continue;
}
StringTokenizer stt = new StringTokenizer(tok, "\t");
String delimiter = "\t";
if (tok.indexOf(delimiter) == -1)
delimiter = " ";
StringTokenizer stt = new StringTokenizer(tok, delimiter);
List<String> localFeatures = new ArrayList<String>();
int i = 0;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,12 @@ public static synchronized GenericTagger getTagger(GrobidModel model) {
t = new WapitiTagger(model);
break;
case DELFT:
t = new DeLFTTagger(model);
// if model is fulltext or segmentation we use currently WAPITI as fallback because they
// are not covered by DeLFT for the moment
if (model.getModelName().equals("fulltext") || model.getModelName().equals("segmentation"))
t = new WapitiTagger(model);
else
t = new DeLFTTagger(model);
break;
default:
throw new IllegalStateException("Unsupported Grobid sequence labelling engine: " + GrobidProperties.getGrobidCRFEngine());
Expand Down
14 changes: 12 additions & 2 deletions grobid-core/src/main/java/org/grobid/core/jni/DeLFTModel.java
Original file line number Diff line number Diff line change
Expand Up @@ -176,9 +176,14 @@ public void run() {
jep.eval("print(len(x_train), 'train sequences')");
jep.eval("print(len(x_valid), 'validation sequences')");

String useELMo = "False";
if (GrobidProperties.getInstance().useELMo()) {
useELMo = "True";
}

// init model to be trained
jep.eval("model = sequenceLabelling.Sequence('"+this.modelName+
"', max_epoch=100, recurrent_dropout=0.50, embeddings_name='glove-840B', use_ELMo=False)");
"', max_epoch=100, recurrent_dropout=0.50, embeddings_name='glove-840B', use_ELMo="+useELMo+")");

// actual training
//start_time = time.time()
Expand Down Expand Up @@ -212,12 +217,17 @@ public void run() {
public static void train(String modelName, File trainingData, File outputModel) {
try {
LOGGER.info("Train DeLFT model " + modelName + "...");
ProcessBuilder pb = new ProcessBuilder("python3",
List<String> command = Arrays.asList("python3",
"grobidTagger.py",
modelName,
"train",
"--input", trainingData.getAbsolutePath(),
"--output", GrobidProperties.getInstance().getModelPath().getAbsolutePath());
if (GrobidProperties.getInstance().useELMo()) {
command.add("--use-ELMo");
}

ProcessBuilder pb = new ProcessBuilder(command);
File delftPath = new File(GrobidProperties.getInstance().getDeLFTFilePath());
pb.directory(delftPath);
Process process = pb.start();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@ public Jep getJEPInstance() {
throw new GrobidResourceException("DeLFT installation path is not a directory");
}
config.addIncludePaths(delftPath.getAbsolutePath());
config.setClassLoader(Thread.currentThread().getContextClassLoader());
//System.out.println("jep instance thread: " + Thread.currentThread().getId());
Jep jep = new Jep(config);
jepInstances.put(Thread.currentThread().getId(), jep);
Expand Down

This file was deleted.

Loading

0 comments on commit 8d086f4

Please sign in to comment.