Skip to content

Commit

Permalink
implement etymQuote model as a first model to get complex like quote …
Browse files Browse the repository at this point in the history
…from etym #14
  • Loading branch information
MedKhem committed Oct 6, 2017
1 parent 2f57c66 commit c368e5b
Show file tree
Hide file tree
Showing 10 changed files with 490 additions and 14 deletions.
23 changes: 23 additions & 0 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -352,6 +352,29 @@
</build>

<profiles>
<profile>
<!-- Profile "train_etymQuote": binds the exec-maven-plugin "java" goal to the
     generate-resources phase so that org.grobid.trainer.EtymQuoteTrainer is run
     as the main class when this profile is active. -->
<id>train_etymQuote</id>
<build>
<plugins>
<plugin>
<groupId>org.codehaus.mojo</groupId>
<artifactId>exec-maven-plugin</artifactId>
<version>1.6.0</version>
<executions>
<execution>
<phase>generate-resources</phase>
<goals>
<goal>java</goal>
</goals>
<configuration>
<mainClass>org.grobid.trainer.EtymQuoteTrainer</mainClass>
</configuration>
</execution>
</executions>
</plugin>
</plugins>
</build>
</profile>
<profile>
<id>train_etym</id>
<build>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1267,7 +1267,7 @@ private String checkFullABodyComponentToTEI(String tagLabel, List<LayoutToken> a
LexicalEntryParser lexicalEntryParser = new LexicalEntryParser();
FormParser formParser = new FormParser();
SenseParser senseParser = new SenseParser();
EtymParser etymParser = new EtymParser();
EtymQuoteParser etymQuoteParser = new EtymQuoteParser();
if (tagLabel.equals(DictionaryBodySegmentationLabels.PUNCTUATION_LABEL)) {
clusterContent = LayoutTokensUtil.normalizeText(LayoutTokensUtil.toText(allTokensOfaLE));
}else {
Expand All @@ -1281,7 +1281,7 @@ private String checkFullABodyComponentToTEI(String tagLabel, List<LayoutToken> a
clusterContent = clusterContent + senseParser.processToTEI(segmentedEntryComponent.getA()).toString();
} else if (segmentedEntryComponent.getB().equals(LEXICAL_ENTRY_ETYM_LABEL)) {

clusterContent = clusterContent + etymParser.processToTei(segmentedEntryComponent.getA()).toString();
clusterContent = clusterContent + etymQuoteParser.processToTei(segmentedEntryComponent.getA()).toString();
} else {
String xmlTag = segmentedEntryComponent.getB().replace("<", "").replace(">", "");
clusterContent = clusterContent + createMyXMLString(xmlTag, LayoutTokensUtil.normalizeText(LayoutTokensUtil.toText(segmentedEntryComponent.getA())));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ public class DictionaryModels {
public static final GrobidModel FORM = GrobidModels.modelFor("form");
public static final GrobidModel SENSE = GrobidModels.modelFor("sense");
public static final GrobidModel GRAMMATICAL_GROUP = GrobidModels.modelFor("grammatical-group");

public static final GrobidModel ETYM_QUOTE = GrobidModels.modelFor("etymQuote");
public static final GrobidModel ETYM = GrobidModels.modelFor("etym");

}
99 changes: 99 additions & 0 deletions src/main/java/org/grobid/core/engines/EtymQuoteParser.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
package org.grobid.core.engines;

import org.apache.commons.lang3.StringUtils;
import org.grobid.core.data.LabeledLexicalInformation;
import org.grobid.core.engines.label.TaggingLabel;
import org.grobid.core.features.FeatureVectorLexicalEntry;
import org.grobid.core.layout.LayoutToken;
import org.grobid.core.layout.LayoutTokenization;
import org.grobid.core.tokenization.TaggingTokenCluster;
import org.grobid.core.tokenization.TaggingTokenClusteror;
import org.grobid.core.utilities.LayoutTokensUtil;
import org.grobid.core.utilities.Pair;
import org.grobid.core.utilities.TextUtilities;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.List;

import static org.grobid.core.document.TEIDictionaryFormatter.createMyXMLString;
import static org.grobid.core.engines.label.DictionaryBodySegmentationLabels.DICTIONARY_ENTRY_LABEL;
import static org.grobid.service.DictionaryPaths.PATH_FULL_DICTIONARY;

/**
 * Parser that labels the tokens of an etymology entry with the
 * {@link DictionaryModels#ETYM_QUOTE} model and serialises the labelled
 * clusters as the children of a TEI {@code <etym>} element.
 *
 * Created by Med on 06.10.17.
 */
public class EtymQuoteParser extends AbstractParser {
    private static final Logger LOGGER = LoggerFactory.getLogger(EtymQuoteParser.class);

    // NOTE(review): the lazily created singleton is typed (and instantiated) as
    // FormParser, which looks like a copy-paste leftover. The declared type is kept
    // because it is part of the public signature of getInstance(); confirm with the
    // callers before switching it to EtymQuoteParser.
    private static volatile FormParser instance;

    /**
     * Creates a parser bound to the etymQuote model.
     */
    public EtymQuoteParser() {
        super(DictionaryModels.ETYM_QUOTE);
    }

    /**
     * Returns the shared parser instance, creating it on first use.
     *
     * @return the shared instance (currently a {@code FormParser}, see the note on the field)
     */
    public static FormParser getInstance() {
        if (instance == null) {
            getNewInstance();
        }
        return instance;
    }

    /**
     * Creates the shared instance if it does not exist yet. The null check is repeated
     * under the lock so that two callers racing through {@link #getInstance()} cannot
     * replace an instance that has already been published.
     */
    private static synchronized void getNewInstance() {
        if (instance == null) {
            instance = new FormParser();
        }
    }

    /**
     * Labels the given etymology tokens and renders them as TEI.
     *
     * @param etymEntry the layout tokens of one etymology block
     * @return a builder holding {@code <etym>}, one XML element per labelled cluster,
     *         and the closing {@code </etym>}, each followed by a newline where appended here
     */
    public StringBuilder processToTei(List<LayoutToken> etymEntry) {
        LabeledLexicalInformation labeledEtym = process(etymEntry, PATH_FULL_DICTIONARY);
        StringBuilder sb = new StringBuilder();

        sb.append("<etym>").append("\n");

        for (Pair<List<LayoutToken>, String> labeledCluster : labeledEtym.getLabels()) {
            String clusterText = LayoutTokensUtil.normalizeText(labeledCluster.getA());
            String clusterLabel = labeledCluster.getB();

            String content = TextUtilities.HTMLEncode(clusterText);
            // Line-break markers were escaped together with the rest of the text; turn them back into tags.
            content = content.replace("&lt;lb/&gt;", "<lb/>");

            // The label comes as "<tag>"; strip the angle brackets to obtain the element name.
            sb.append(createMyXMLString(clusterLabel.replaceAll("[<>]", ""), content));
        }

        sb.append("</etym>").append("\n");
        return sb;
    }

    /**
     * Runs the model on the given tokens and groups them into labelled clusters.
     *
     * @param etymEntry the layout tokens of one etymology block
     * @param parentTag not used by this method
     * @return the clusters paired with their labels; empty when no features could be produced
     */
    public LabeledLexicalInformation process(List<LayoutToken> etymEntry, String parentTag) {
        LabeledLexicalInformation labeledLexicalEntry = new LabeledLexicalInformation();

        LayoutTokenization layoutTokenization = new LayoutTokenization(etymEntry);

        String featSeg = FeatureVectorLexicalEntry
                .createFeaturesFromLayoutTokens(layoutTokenization.getTokenization(), DICTIONARY_ENTRY_LABEL)
                .toString();

        if (StringUtils.isBlank(featSeg)) {
            return labeledLexicalEntry;
        }

        // Run the model this parser was constructed with to label the features.
        String modelOutput = label(featSeg);

        // NOTE(review): the parser is built on ETYM_QUOTE but the clusters are resolved against
        // DictionaryModels.ETYM. EtymQuoteLabels also registers its labels under ETYM, so the two
        // are consistent with each other and this is left unchanged; confirm whether both should
        // move to ETYM_QUOTE together.
        TaggingTokenClusteror clusteror = new TaggingTokenClusteror(DictionaryModels.ETYM, modelOutput, etymEntry);

        List<TaggingTokenCluster> clusters = clusteror.cluster();

        for (TaggingTokenCluster cluster : clusters) {
            if (cluster == null) {
                continue;
            }
            TaggingLabel clusterLabel = cluster.getTaggingLabel();
            Engine.getCntManager().i(clusterLabel);

            List<LayoutToken> concatenatedTokens = cluster.concatTokens();
            String tagLabel = clusterLabel.getLabel();

            labeledLexicalEntry.addLabel(new Pair<>(concatenatedTokens, tagLabel));
        }

        return labeledLexicalEntry;
    }
}
26 changes: 17 additions & 9 deletions src/main/java/org/grobid/core/engines/label/EtymLabels.java
Original file line number Diff line number Diff line change
Expand Up @@ -13,23 +13,31 @@ private EtymLabels() {
super();
}

public static final String ATTESTED_FORM_ETYM_LABEL = "<attForm>";
public static final String ETYM_RELATION_ETYM_LABEL = "<etymRel>";
public static final String MENTIONED_ETYM_LABEL = "<mentioned>";
public static final String LANG_ETYM_LABEL = "<lang>";
public static final String SEG_ETYM_LABEL = "<seg>";
public static final String LITERARY_CITATION_ETYM_LABEL = "<litCitation>";
public static final String DEF_ETYM_LABEL = "<def>";
//public static final String QUOTE_ETYM_LABEL = "<quote>";
public static final String BIBL_ETYM_LABEL = "<bibl>";

public static final TaggingLabel ETYM_ATTESTED_FORM = new TaggingLabelImpl(DictionaryModels.ETYM, ATTESTED_FORM_ETYM_LABEL);
public static final TaggingLabel ETYM_ETYM_RELATION = new TaggingLabelImpl(DictionaryModels.ETYM, ETYM_RELATION_ETYM_LABEL);
public static final TaggingLabel ETYM_MENTIONED = new TaggingLabelImpl(DictionaryModels.ETYM, MENTIONED_ETYM_LABEL);
public static final TaggingLabel ETYM_LANG = new TaggingLabelImpl(DictionaryModels.ETYM, LANG_ETYM_LABEL);
public static final TaggingLabel ETYM_SEG = new TaggingLabelImpl(DictionaryModels.ETYM, SEG_ETYM_LABEL);
public static final TaggingLabel ETYM_LITERARY_CITATION = new TaggingLabelImpl(DictionaryModels.ETYM, LITERARY_CITATION_ETYM_LABEL);
public static final TaggingLabel ETYM_DEF = new TaggingLabelImpl(DictionaryModels.ETYM, DEF_ETYM_LABEL);
// public static final TaggingLabel ETYM_CIT = new TaggingLabelImpl(DictionaryModels.ETYM, CIT_ETYM_LABEL);
//public static final TaggingLabel ETYM_QUOTE = new TaggingLabelImpl(DictionaryModels.ETYM, QUOTE_ETYM_LABEL);
public static final TaggingLabel ETYM_BIBL = new TaggingLabelImpl(DictionaryModels.ETYM, BIBL_ETYM_LABEL);
public static final TaggingLabel FORM_OTHER = new TaggingLabelImpl(DictionaryModels.ETYM, OTHER_LABEL);
public static final TaggingLabel FORM_PUNCTUATION = new TaggingLabelImpl(DictionaryModels.ETYM, PUNCTUATION_LABEL);

static {
register(ETYM_ATTESTED_FORM);
register(ETYM_ETYM_RELATION);
register(ETYM_MENTIONED);
register(ETYM_LANG);
register(ETYM_SEG);
register(ETYM_LITERARY_CITATION);
register(ETYM_DEF);
// register(ETYM_CIT);
//register(ETYM_QUOTE);
register(ETYM_BIBL);
register(FORM_PUNCTUATION);
register(FORM_OTHER);
}
Expand Down
30 changes: 30 additions & 0 deletions src/main/java/org/grobid/core/engines/label/EtymQuoteLabels.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
package org.grobid.core.engines.label;

import org.grobid.core.engines.DictionaryModels;

import static org.grobid.core.engines.label.DictionaryBodySegmentationLabels.PUNCTUATION_LABEL;

/**
 * Label definitions used when tagging etymology content into
 * {@code <seg>} and {@code <quote>} clusters, plus the shared
 * "other" and punctuation labels.
 *
 * Created by Med on 06.10.17.
 */
public class EtymQuoteLabels extends TaggingLabels {

    // Raw label strings, including the angle brackets.
    public static final String SEG_ETYM_LABEL = "<seg>";
    public static final String QUOTE_ETYM_LABEL = "<quote>";

    // NOTE(review): every label below is bound to DictionaryModels.ETYM rather than
    // DictionaryModels.ETYM_QUOTE; confirm that this is intended.
    public static final TaggingLabel ETYM_SEG =
            new TaggingLabelImpl(DictionaryModels.ETYM, SEG_ETYM_LABEL);
    public static final TaggingLabel ETYM_QUOTE =
            new TaggingLabelImpl(DictionaryModels.ETYM, QUOTE_ETYM_LABEL);
    public static final TaggingLabel FORM_OTHER =
            new TaggingLabelImpl(DictionaryModels.ETYM, OTHER_LABEL);
    public static final TaggingLabel FORM_PUNCTUATION =
            new TaggingLabelImpl(DictionaryModels.ETYM, PUNCTUATION_LABEL);

    // Register the labels when the class is initialised; the order is kept as
    // seg, quote, punctuation, other.
    static {
        register(ETYM_SEG);
        register(ETYM_QUOTE);
        register(FORM_PUNCTUATION);
        register(FORM_OTHER);
    }

    /** Not instantiable: this class only holds constants. */
    private EtymQuoteLabels() {
    }
}
3 changes: 3 additions & 0 deletions src/main/java/org/grobid/core/engines/label/FormLabels.java
Original file line number Diff line number Diff line change
Expand Up @@ -16,17 +16,20 @@ private FormLabels() {
public static final String ORTHOGRAPHY_FORM_LABEL = "<orth>";
public static final String PRONUNCIATION_FORM_LABEL = "<pron>";
public static final String GRAMMATICAL_GROUP_FORM_LABEL = "<gramGrp>";
public static final String LANG_LABEL = "<lang>";

public static final TaggingLabel FORM_ORTHOGRAPHY = new TaggingLabelImpl(DictionaryModels.FORM, ORTHOGRAPHY_FORM_LABEL);
public static final TaggingLabel FORM_PRONUNCIATION = new TaggingLabelImpl(DictionaryModels.FORM, PRONUNCIATION_FORM_LABEL);
public static final TaggingLabel FORM_GRAMMATICAL_GROUP = new TaggingLabelImpl(DictionaryModels.FORM, GRAMMATICAL_GROUP_FORM_LABEL);
public static final TaggingLabel FORM_LANG = new TaggingLabelImpl(DictionaryModels.FORM, LANG_LABEL);
public static final TaggingLabel FORM_OTHER = new TaggingLabelImpl(DictionaryModels.FORM, OTHER_LABEL);
public static final TaggingLabel FORM_PUNCTUATION = new TaggingLabelImpl(DictionaryModels.FORM, PUNCTUATION_LABEL);

static {
register(FORM_ORTHOGRAPHY);
register(FORM_PRONUNCIATION);
register(FORM_GRAMMATICAL_GROUP);
register(FORM_LANG);
register(FORM_PUNCTUATION);
register(FORM_OTHER);
}
Expand Down
Loading

0 comments on commit c368e5b

Please sign in to comment.