implement etymQuote model as a first model to get complex like quote …

…from etym #14
MedKhem · Oct 6, 2017 · c368e5b · c368e5b
1 parent 2f57c66
commit c368e5b
Show file tree

Hide file tree

Showing 10 changed files with 490 additions and 14 deletions.
diff --git a/pom.xml b/pom.xml
@@ -352,6 +352,29 @@
     </build>
 
     <profiles>
+        <profile>
+            <!-- Training profile for the etymQuote model: binds the exec-maven-plugin "java" goal
+                 to the generate-resources phase so that org.grobid.trainer.EtymQuoteTrainer is run
+                 when this profile is active. -->
+            <id>train_etymQuote</id>
+            <build>
+                <plugins>
+                    <plugin>
+                        <groupId>org.codehaus.mojo</groupId>
+                        <artifactId>exec-maven-plugin</artifactId>
+                        <version>1.6.0</version>
+                        <executions>
+                            <execution>
+                                <phase>generate-resources</phase>
+                                <goals>
+                                    <goal>java</goal>
+                                </goals>
+                                <configuration>
+                                    <mainClass>org.grobid.trainer.EtymQuoteTrainer</mainClass>
+                                </configuration>
+                            </execution>
+                        </executions>
+                    </plugin>
+                </plugins>
+            </build>
+        </profile>
         <profile>
             <id>train_etym</id>
             <build>

diff --git a/src/main/java/org/grobid/core/engines/DictionaryBodySegmentationParser.java b/src/main/java/org/grobid/core/engines/DictionaryBodySegmentationParser.java
@@ -1267,7 +1267,7 @@ private String checkFullABodyComponentToTEI(String tagLabel, List<LayoutToken> a
         LexicalEntryParser lexicalEntryParser = new LexicalEntryParser();
         FormParser formParser = new FormParser();
         SenseParser senseParser = new SenseParser();
-        EtymParser etymParser = new EtymParser();
+        EtymQuoteParser etymQuoteParser = new EtymQuoteParser();
         if (tagLabel.equals(DictionaryBodySegmentationLabels.PUNCTUATION_LABEL)) {
             clusterContent = LayoutTokensUtil.normalizeText(LayoutTokensUtil.toText(allTokensOfaLE));
         }else {
@@ -1281,7 +1281,7 @@ private String checkFullABodyComponentToTEI(String tagLabel, List<LayoutToken> a
                     clusterContent = clusterContent + senseParser.processToTEI(segmentedEntryComponent.getA()).toString();
                 } else if (segmentedEntryComponent.getB().equals(LEXICAL_ENTRY_ETYM_LABEL)) {
 
-                    clusterContent = clusterContent + etymParser.processToTei(segmentedEntryComponent.getA()).toString();
+                    clusterContent = clusterContent + etymQuoteParser.processToTei(segmentedEntryComponent.getA()).toString();
                 } else {
                     String xmlTag = segmentedEntryComponent.getB().replace("<", "").replace(">", "");
                     clusterContent = clusterContent +  createMyXMLString(xmlTag, LayoutTokensUtil.normalizeText(LayoutTokensUtil.toText(segmentedEntryComponent.getA())));

diff --git a/src/main/java/org/grobid/core/engines/DictionaryModels.java b/src/main/java/org/grobid/core/engines/DictionaryModels.java
@@ -14,7 +14,7 @@ public class DictionaryModels {
     public static final GrobidModel FORM = GrobidModels.modelFor("form");
     public static final GrobidModel SENSE = GrobidModels.modelFor("sense");
     public static final GrobidModel GRAMMATICAL_GROUP = GrobidModels.modelFor("grammatical-group");
-
+    public static final GrobidModel ETYM_QUOTE = GrobidModels.modelFor("etymQuote");
     public static final GrobidModel ETYM = GrobidModels.modelFor("etym");
 
 }
diff --git a/src/main/java/org/grobid/core/engines/EtymQuoteParser.java b/src/main/java/org/grobid/core/engines/EtymQuoteParser.java
@@ -0,0 +1,99 @@
+package org.grobid.core.engines;
+
+import org.apache.commons.lang3.StringUtils;
+import org.grobid.core.data.LabeledLexicalInformation;
+import org.grobid.core.engines.label.TaggingLabel;
+import org.grobid.core.features.FeatureVectorLexicalEntry;
+import org.grobid.core.layout.LayoutToken;
+import org.grobid.core.layout.LayoutTokenization;
+import org.grobid.core.tokenization.TaggingTokenCluster;
+import org.grobid.core.tokenization.TaggingTokenClusteror;
+import org.grobid.core.utilities.LayoutTokensUtil;
+import org.grobid.core.utilities.Pair;
+import org.grobid.core.utilities.TextUtilities;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.util.List;
+
+import static org.grobid.core.document.TEIDictionaryFormatter.createMyXMLString;
+import static org.grobid.core.engines.label.DictionaryBodySegmentationLabels.DICTIONARY_ENTRY_LABEL;
+import static org.grobid.service.DictionaryPaths.PATH_FULL_DICTIONARY;
+
+/**
+ * Parser that labels the tokens of an etymology entry and serialises the labelled segments
+ * inside a TEI {@code <etym>} element.
+ *
+ * <p>The parser is constructed on top of {@link DictionaryModels#ETYM_QUOTE}.
+ *
+ * Created by Med on 06.10.17.
+ */
+public class EtymQuoteParser  extends AbstractParser {
+    private static final Logger LOGGER = LoggerFactory.getLogger(EtymQuoteParser.class);
+
+    // NOTE(review): the cached singleton is a FormParser, not an EtymQuoteParser. This looks like
+    // a copy-paste leftover; the type is kept so that the public signature of getInstance() does
+    // not change for existing callers. Confirm and migrate callers before changing it.
+    private static volatile FormParser instance;
+
+    /**
+     * Creates a parser bound to the {@link DictionaryModels#ETYM_QUOTE} model.
+     */
+    public EtymQuoteParser() {
+        super(DictionaryModels.ETYM_QUOTE);
+    }
+
+
+    /**
+     * Returns the lazily created shared parser instance.
+     *
+     * @return the shared instance (currently a {@link FormParser}, see the note on the field)
+     */
+    public static FormParser getInstance() {
+        if (instance == null) {
+            getNewInstance();
+        }
+        return instance;
+    }
+
+    /**
+     * Creates the shared instance if it does not exist yet.
+     */
+    private static synchronized void getNewInstance() {
+        // Re-check under the lock so that racing first callers do not each build an instance.
+        if (instance == null) {
+            instance = new FormParser();
+        }
+    }
+
+    /**
+     * Labels the given etymology tokens and renders them as TEI.
+     *
+     * @param etymEntry the layout tokens of the etymology block
+     * @return a builder holding {@code <etym>}, one XML element per labelled segment, and the
+     *         closing {@code </etym>}, each of the two wrapper tags followed by a newline
+     */
+    public StringBuilder processToTei(List<LayoutToken> etymEntry) {
+        LabeledLexicalInformation labeledEtym = process(etymEntry, PATH_FULL_DICTIONARY);
+        StringBuilder tei = new StringBuilder();
+
+        tei.append("<etym>").append("\n");
+
+        for (Pair<List<LayoutToken>, String> labeledSegment : labeledEtym.getLabels()) {
+            String segmentText = LayoutTokensUtil.normalizeText(labeledSegment.getA());
+            String segmentLabel = labeledSegment.getB();
+
+            // Escape the text, then restore the line-break markers that were escaped with it.
+            String content = TextUtilities.HTMLEncode(segmentText);
+            content = content.replace("&lt;lb/&gt;", "<lb/>");
+
+            // The label arrives as "<name>"; strip the angle brackets to get the element name.
+            tei.append(createMyXMLString(segmentLabel.replaceAll("[<>]", ""), content));
+
+        }
+        tei.append("</etym>").append("\n");
+        return tei;
+
+    }
+
+    /**
+     * Computes features for the etymology tokens, labels them and groups the labelled tokens
+     * into clusters.
+     *
+     * @param etymEntry the layout tokens of the etymology block
+     * @param parentTag not used by this implementation
+     * @return the (token list, label) pairs, one per cluster; empty when the feature string is
+     *         blank
+     */
+    public LabeledLexicalInformation process(List<LayoutToken> etymEntry, String parentTag) {
+        LabeledLexicalInformation labeledEtym = new LabeledLexicalInformation();
+
+        LayoutTokenization layoutTokenization = new LayoutTokenization(etymEntry);
+
+        String featSeg = FeatureVectorLexicalEntry.createFeaturesFromLayoutTokens(layoutTokenization.getTokenization(), DICTIONARY_ENTRY_LABEL).toString();
+
+        if (StringUtils.isNotBlank(featSeg)) {
+            // Label the feature vectors with this parser's model
+            String modelOutput = label(featSeg);
+            // NOTE(review): clustering resolves labels against DictionaryModels.ETYM although the
+            // parser is built on ETYM_QUOTE - confirm this matches the model the labels are
+            // registered under before changing either side.
+            TaggingTokenClusteror clusteror = new TaggingTokenClusteror(DictionaryModels.ETYM, modelOutput, etymEntry);
+
+            List<TaggingTokenCluster> clusters = clusteror.cluster();
+
+            for (TaggingTokenCluster cluster : clusters) {
+                if (cluster == null) {
+                    continue;
+                }
+                TaggingLabel clusterLabel = cluster.getTaggingLabel();
+                Engine.getCntManager().i(clusterLabel);
+
+                List<LayoutToken> concatenatedTokens = cluster.concatTokens();
+                String tagLabel = clusterLabel.getLabel();
+
+                labeledEtym.addLabel(new Pair<List<LayoutToken>, String>(concatenatedTokens, tagLabel));
+            }
+        }
+
+
+        return labeledEtym;
+    }
+}
diff --git a/src/main/java/org/grobid/core/engines/label/EtymLabels.java b/src/main/java/org/grobid/core/engines/label/EtymLabels.java
@@ -13,23 +13,31 @@ private EtymLabels() {
         super();
     }
 
-    public static final String ATTESTED_FORM_ETYM_LABEL = "<attForm>";
-    public static final String ETYM_RELATION_ETYM_LABEL = "<etymRel>";
+    public static final String MENTIONED_ETYM_LABEL = "<mentioned>";
+    public static final String LANG_ETYM_LABEL = "<lang>";
     public static final String SEG_ETYM_LABEL = "<seg>";
-    public static final String LITERARY_CITATION_ETYM_LABEL = "<litCitation>";
+    public static final String DEF_ETYM_LABEL = "<def>";
+    //public static final String QUOTE_ETYM_LABEL = "<quote>";
+    public static final String BIBL_ETYM_LABEL = "<bibl>";
 
-    public static final TaggingLabel ETYM_ATTESTED_FORM = new TaggingLabelImpl(DictionaryModels.ETYM, ATTESTED_FORM_ETYM_LABEL);
-    public static final TaggingLabel ETYM_ETYM_RELATION = new TaggingLabelImpl(DictionaryModels.ETYM, ETYM_RELATION_ETYM_LABEL);
+    public static final TaggingLabel ETYM_MENTIONED = new TaggingLabelImpl(DictionaryModels.ETYM, MENTIONED_ETYM_LABEL);
+    public static final TaggingLabel ETYM_LANG = new TaggingLabelImpl(DictionaryModels.ETYM, LANG_ETYM_LABEL);
     public static final TaggingLabel ETYM_SEG = new TaggingLabelImpl(DictionaryModels.ETYM, SEG_ETYM_LABEL);
-    public static final TaggingLabel ETYM_LITERARY_CITATION = new TaggingLabelImpl(DictionaryModels.ETYM, LITERARY_CITATION_ETYM_LABEL);
+    public static final TaggingLabel ETYM_DEF = new TaggingLabelImpl(DictionaryModels.ETYM, DEF_ETYM_LABEL);
+   // public static final TaggingLabel ETYM_CIT = new TaggingLabelImpl(DictionaryModels.ETYM, CIT_ETYM_LABEL);
+    //public static final TaggingLabel ETYM_QUOTE = new TaggingLabelImpl(DictionaryModels.ETYM, QUOTE_ETYM_LABEL);
+    public static final TaggingLabel ETYM_BIBL = new TaggingLabelImpl(DictionaryModels.ETYM, BIBL_ETYM_LABEL);
     public static final TaggingLabel FORM_OTHER = new TaggingLabelImpl(DictionaryModels.ETYM, OTHER_LABEL);
     public static final TaggingLabel FORM_PUNCTUATION = new TaggingLabelImpl(DictionaryModels.ETYM, PUNCTUATION_LABEL);
 
     static {
-        register(ETYM_ATTESTED_FORM);
-        register(ETYM_ETYM_RELATION);
+        register(ETYM_MENTIONED);
+        register(ETYM_LANG);
         register(ETYM_SEG);
-        register(ETYM_LITERARY_CITATION);
+        register(ETYM_DEF);
+       // register(ETYM_CIT);
+        //register(ETYM_QUOTE);
+        register(ETYM_BIBL);
         register(FORM_PUNCTUATION);
         register(FORM_OTHER);
     }

diff --git a/src/main/java/org/grobid/core/engines/label/EtymQuoteLabels.java b/src/main/java/org/grobid/core/engines/label/EtymQuoteLabels.java
@@ -0,0 +1,30 @@
+package org.grobid.core.engines.label;
+
+import org.grobid.core.engines.DictionaryModels;
+
+import static org.grobid.core.engines.label.DictionaryBodySegmentationLabels.PUNCTUATION_LABEL;
+
+/**
+ * Label constants for the {@code <seg>} and {@code <quote>} tags, together with the
+ * corresponding {@link TaggingLabel} instances, which are registered in the static initialiser.
+ *
+ * <p>NOTE(review): every label below is created for {@code DictionaryModels.ETYM}, not for
+ * {@code DictionaryModels.ETYM_QUOTE} - confirm that this is intended.
+ *
+ * Created by Med on 06.10.17.
+ */
+public class EtymQuoteLabels  extends TaggingLabels {
+
+    // Not instantiable: the class only exposes static constants.
+    private EtymQuoteLabels() {
+        super();
+    }
+    // Raw label strings, including the angle brackets.
+    public static final String SEG_ETYM_LABEL = "<seg>";
+    public static final String QUOTE_ETYM_LABEL = "<quote>";
+
+    // Tagging labels built from the strings above.
+    // OTHER_LABEL is not imported in this file; presumably it is inherited from TaggingLabels.
+    public static final TaggingLabel ETYM_SEG = new TaggingLabelImpl(DictionaryModels.ETYM, SEG_ETYM_LABEL);
+    public static final TaggingLabel ETYM_QUOTE = new TaggingLabelImpl(DictionaryModels.ETYM, QUOTE_ETYM_LABEL);
+    public static final TaggingLabel FORM_OTHER = new TaggingLabelImpl(DictionaryModels.ETYM, OTHER_LABEL);
+    public static final TaggingLabel FORM_PUNCTUATION = new TaggingLabelImpl(DictionaryModels.ETYM, PUNCTUATION_LABEL);
+
+    // Register every label when the class is initialised.
+    static {
+
+        register(ETYM_SEG);
+        register(ETYM_QUOTE);
+        register(FORM_PUNCTUATION);
+        register(FORM_OTHER);
+    }
+}
diff --git a/src/main/java/org/grobid/core/engines/label/FormLabels.java b/src/main/java/org/grobid/core/engines/label/FormLabels.java
@@ -16,17 +16,20 @@ private FormLabels() {
     public static final String ORTHOGRAPHY_FORM_LABEL = "<orth>";
     public static final String PRONUNCIATION_FORM_LABEL = "<pron>";
     public static final String GRAMMATICAL_GROUP_FORM_LABEL = "<gramGrp>";
+    public static final String LANG_LABEL = "<lang>";
 
     public static final TaggingLabel FORM_ORTHOGRAPHY = new TaggingLabelImpl(DictionaryModels.FORM, ORTHOGRAPHY_FORM_LABEL);
     public static final TaggingLabel FORM_PRONUNCIATION = new TaggingLabelImpl(DictionaryModels.FORM, PRONUNCIATION_FORM_LABEL);
     public static final TaggingLabel FORM_GRAMMATICAL_GROUP = new TaggingLabelImpl(DictionaryModels.FORM, GRAMMATICAL_GROUP_FORM_LABEL);
+    public static final TaggingLabel FORM_LANG = new TaggingLabelImpl(DictionaryModels.FORM, LANG_LABEL);
     public static final TaggingLabel FORM_OTHER = new TaggingLabelImpl(DictionaryModels.FORM, OTHER_LABEL);
     public static final TaggingLabel FORM_PUNCTUATION = new TaggingLabelImpl(DictionaryModels.FORM, PUNCTUATION_LABEL);
 
     static {
         register(FORM_ORTHOGRAPHY);
         register(FORM_PRONUNCIATION);
         register(FORM_GRAMMATICAL_GROUP);
+        register(FORM_LANG);
         register(FORM_PUNCTUATION);
         register(FORM_OTHER);
     }