Skip to content

Commit

Permalink
1) adapt a number of codes in several classes as a consequence of Tag…
Browse files Browse the repository at this point in the history
…gingTokenClusteror implementation in AcknowledgmentParser class; 2) add new feature vectors to improve the acknowledgment model; 3) automatically split the data (80/20) to train and evaluate a new acknowledgment model; 4) add new n-fold evaluation results. The old results are kept for comparison purposes.
  • Loading branch information
tantikristanti committed Oct 25, 2019
1 parent 3e4171a commit c85abd9
Show file tree
Hide file tree
Showing 19 changed files with 259,512 additions and 180,107 deletions.
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
package org.grobid.core.engines;

/**
* @created by Tanti
* @created by Tanti, 2019
*/

import org.apache.commons.collections4.CollectionUtils;
Expand Down Expand Up @@ -55,12 +55,12 @@ public List<AcknowledgmentItem> processing(String input) {
}

public List<AcknowledgmentItem> processing(List<LayoutToken> tokens) {
List<AcknowledgmentItem> acknowledgments = null;
List<AcknowledgmentItem> acknowledgments = new ArrayList<>();
if (CollectionUtils.isEmpty(tokens)) {
return null;
}
try {
String headerAcknowledgment = FeaturesVectorAcknowledgment.addFeaturesAcknowledgment(tokens);
String headerAcknowledgment = FeaturesVectorAcknowledgment.addFeaturesAcknowledgment(tokens, null);
String resAcknowledgment = label(headerAcknowledgment);
acknowledgments = resultExtractionLayoutTokens(resAcknowledgment, tokens);
return acknowledgments;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ public class TaggingLabels {
public final static String GLOSSARY_LABEL = "<glossary>";
public final static String BACK_LABEL = "<back>";

// acknowledgment
// for acknowledgment model
public final static String AFFILIATION = "<affiliation>";
public final static String EDUCATIONAL_INSTITUTION = "<educationalInstitution>";
public final static String FUNDING_AGENCY = "<fundingAgency>";
Expand Down Expand Up @@ -356,7 +356,7 @@ protected static void register(TaggingLabel label) {
register(MONOGRAPH_BACK);
register(MONOGRAPH_OTHER);

// acknowledgment
// for acknowledgment model
register(ACKNOWLEDGMENT_AFFILIATION);
register(ACKNOWLEDGMENT_EDUCATIONAL_INSTITUTION);
register(ACKNOWLEDGMENT_FUNDING_AGENCY);
Expand Down
Original file line number Diff line number Diff line change
@@ -1,9 +1,6 @@
package org.grobid.core.features;

import org.grobid.core.analyzers.GrobidAnalyzer;
import org.grobid.core.exceptions.GrobidException;
import org.grobid.core.layout.LayoutToken;
import org.grobid.core.utilities.OffsetPosition;
import org.grobid.core.utilities.TextUtilities;
import org.grobid.core.utilities.UnicodeUtil;

Expand All @@ -12,8 +9,9 @@
import java.util.regex.Matcher;

/**
* Class for features used for parsing acknowledgment chunk.
* Created by Tanti, 2019
* Class containing the feature vectors for the acknowledgment parser model.
*
* @Created by Tanti, 2019
*/

public class FeaturesVectorAcknowledgment {
Expand All @@ -25,6 +23,8 @@ public class FeaturesVectorAcknowledgment {
public boolean singleChar = false;
public boolean properName = false;
public boolean commonName = false;
public boolean locationName = false;
public boolean countryName = false;
public boolean affiliation = false;
public boolean educationalInsitution = false;
public boolean fundingAgency = false;
Expand Down Expand Up @@ -90,6 +90,18 @@ public String printVector() {
else
res.append(" 0");

// location name
if (locationName)
res.append(" 1");
else
res.append(" 0");

// country name
if (countryName)
res.append(" 1");
else
res.append(" 0");

// lexical information (9)
if (affiliation)
res.append(" 1");
Expand Down Expand Up @@ -139,6 +151,8 @@ public String printVector() {
// punctuation information (1)
res.append(" " + punctType); // in case the token is a punctuation (NO otherwise)

res.append(" ").append(wordShape);

// label - for training data (1)
if (label != null)
res.append(" " + label + "\n");
Expand All @@ -149,7 +163,7 @@ public String printVector() {
}

/**
* Add feature for acknowledgment parsing.
* Add feature for acknowledgment parsing with string as input.
*/
static public String addFeaturesAcknowledgmentString(List<String> lines) throws Exception {

Expand Down Expand Up @@ -285,19 +299,21 @@ static private FeaturesVectorAcknowledgment addFeaturesAcknowledgment(String lin
/**
* Add feature for acknowledgment parsing with tokens as input.
*/
static public String addFeaturesAcknowledgment(List<LayoutToken> tokens) throws Exception {
static public String addFeaturesAcknowledgment(List<LayoutToken> tokens, List<String> labels) throws Exception {
FeatureFactory featureFactory = FeatureFactory.getInstance();

StringBuilder acknowledgment = new StringBuilder();

String previousTag = null;
String previousText = null;
FeaturesVectorAcknowledgment featuresVectorAcknowledgment = null;
int sentenceLenth = tokens.size(); // length of the current sentence
for (int n=0; n < tokens.size(); n++) {
for (int n = 0; n < tokens.size(); n++) {
LayoutToken token = tokens.get(n);
String tag = null;

if ((labels != null) && (labels.size() > 0) && (n < labels.size()))
tag = labels.get(n);

boolean outputLineStatus = false;

String text = token.getText();
Expand All @@ -311,7 +327,7 @@ static public String addFeaturesAcknowledgment(List<LayoutToken> tokens) throws

// paranoid (defensive) normalisation
text = UnicodeUtil.normaliseTextAndRemoveSpaces(text);
if (text.trim().length() == 0 ) {
if (text.trim().length() == 0) {
continue;
}

Expand All @@ -334,7 +350,7 @@ static public String addFeaturesAcknowledgment(List<LayoutToken> tokens) throws
featuresVectorAcknowledgment.lineStatus = "LINESTART";
outputLineStatus = true;
}
} else if (tokens.size() == n+1) {
} else if (tokens.size() == n + 1) {
if (!outputLineStatus) {
featuresVectorAcknowledgment.lineStatus = "LINEEND";
outputLineStatus = true;
Expand All @@ -345,18 +361,13 @@ static public String addFeaturesAcknowledgment(List<LayoutToken> tokens) throws
outputLineStatus = true;
}

// single character
if (text.length() == 1) {
featuresVectorAcknowledgment.singleChar = true;
// capitalisation
if (Character.isUpperCase(text.charAt(0))) {
featuresVectorAcknowledgment.capitalisation = "INITCAP";
}

// capital
if (featureFactory.test_all_capital(text)) {
featuresVectorAcknowledgment.capitalisation = "ALLCAPS";
}

if (Character.isUpperCase(text.charAt(0))) {
featuresVectorAcknowledgment.capitalisation = "INITCAP";
featuresVectorAcknowledgment.capitalisation = "ALLCAP";
}

if (featuresVectorAcknowledgment.capitalisation == null) {
Expand All @@ -376,17 +387,32 @@ static public String addFeaturesAcknowledgment(List<LayoutToken> tokens) throws
if (featuresVectorAcknowledgment.digit == null)
featuresVectorAcknowledgment.digit = "NODIGIT";

// common name
if (featureFactory.test_common(text)) {
featuresVectorAcknowledgment.commonName = true;
// single character
if (text.length() == 1) {
featuresVectorAcknowledgment.singleChar = true;
}

// proper name
if (featureFactory.test_names(text)) {
featuresVectorAcknowledgment.properName = true;
}

// find the punctuations
// common name
if (featureFactory.test_common(text)) {
featuresVectorAcknowledgment.commonName = true;
}

// location name
if (featureFactory.test_city(text)) {
featuresVectorAcknowledgment.locationName = true;
}

// country name
if (featureFactory.test_country(text)) {
featuresVectorAcknowledgment.countryName = true;
}

// punctuations
Matcher m0 = featureFactory.isPunct.matcher(text);
if (m0.find()) {
featuresVectorAcknowledgment.punctType = "PUNCT";
Expand All @@ -395,7 +421,6 @@ static public String addFeaturesAcknowledgment(List<LayoutToken> tokens) throws
if (featuresVectorAcknowledgment.punctType == null)
featuresVectorAcknowledgment.punctType = "NOPUNCT";

// token containing special character
if ((text.equals("(")) | (text.equals("["))) {
featuresVectorAcknowledgment.punctType = "OPENBRACKET";
} else if ((text.equals(")")) | (text.equals("]"))) {
Expand All @@ -414,6 +439,7 @@ static public String addFeaturesAcknowledgment(List<LayoutToken> tokens) throws

previousTag = tag;
previousText = text;

}

return acknowledgment.toString();
Expand Down
Original file line number Diff line number Diff line change
@@ -1,11 +1,9 @@
package org.grobid.core.engines;

import org.grobid.core.data.AcknowledgmentItem;
import org.grobid.core.data.Acknowledgment;
import org.grobid.core.factory.AbstractEngineFactory;
import org.junit.Before;
import org.junit.BeforeClass;
import org.junit.Ignore;
import org.junit.Test;

import java.util.List;
Expand All @@ -19,8 +17,7 @@
*/

public class AcknowledgmentParserTest {
AcknowledgmentParserOld targetOld;
AcknowledgmentParser targetNew;
AcknowledgmentParser target;

@BeforeClass
public static void setInitialContext() throws Exception {
Expand All @@ -29,47 +26,40 @@ public static void setInitialContext() throws Exception {

@Before
public void setUp() throws Exception {
targetOld = new AcknowledgmentParserOld();
targetNew = new AcknowledgmentParser();
}

@Ignore ("old method")
@Test
public void processing_akwnowledgmentOld_shouldWork() throws Exception {
List<Acknowledgment> output = targetOld.processing("This research was supported by the Deutsche Forschungsgemeinschaft through the SFB 649 \"Economic Risk\". http://sfb649.wiwi.hu-berlin.de ISSN 1860-5664");

final Acknowledgment acknowledgment = output.get(0);
assertThat(acknowledgment.getFundingAgency(), is("the Deutsche Forschungsgemeinschaft"));
assertThat(acknowledgment.getProjectName(), is("the SFB 649 \" Economic Risk \""));
target = new AcknowledgmentParser();
}

@Test
public void processing_akwnowledgmentNew1_shouldWork() throws Exception {
List<AcknowledgmentItem> output = targetNew.processing("This research was supported by the Deutsche Forschungsgemeinschaft through the SFB 649 \"Economic Risk\". http://sfb649.wiwi.hu-berlin.de ISSN 1860-5664");
AcknowledgmentItem acknowledgment = new AcknowledgmentItem();
List<AcknowledgmentItem> output = target.processing("This research was supported by the Deutsche Forschungsgemeinschaft through the SFB 649 \"Economic Risk\". http://sfb649.wiwi.hu-berlin.de ISSN 1860-5664");


acknowledgment = output.get(0);
assertThat(output.get(0).getText(), is("the Deutsche Forschungsgemeinschaft"));
assertThat(output.get(0).getLabel(), is("fundingAgency"));

assertThat(acknowledgment.getText(), is("the Deutsche Forschungsgemeinschaft"));
assertThat(acknowledgment.getLabel(), is("fundingAgency"));
assertThat(output.get(1).getText(), is("the SFB 649 \"Economic Risk\""));
assertThat(output.get(1).getLabel(), is("projectName"));
}

@Test
public void processing_akwnowledgmentNew2_shouldWork() throws Exception {
List<AcknowledgmentItem> output = targetNew.processing("This research was funded by Computational Science grant #635.000.014 from " +
List<AcknowledgmentItem> output = target.processing("This research was funded by Computational Science grant #635.000.014 from " +
"the Netherlands Organization for Scientific Research (NWO). " +
"Mikas Vengris, Denitsa Grancharova and Rienk van Grondelle provided the data modeled in Section 5.6. Rob Koehorst, " +
"Bart van Oort, Sergey Laptenok, Ton Visser and Herbert van Amerongen provided the data modeled in Section 6.3. " +
"Joris Snellenburg is thanked for constructive comments on the text. Uwe Ligges and Martin Mächler collaborated " +
"in the implementation of the nls options described in Section B. Achim Zeileis contributed helpful suggestions regarding the figures.");

assertThat(output.get(1).getText(), is("the Netherlands Organization for Scientific Research (NWO)"));
assertThat(output.get(1).getLabel(), is("fundingAgency"));
assertThat(output.get(1).getText(), is("#635.000"));
assertThat(output.get(1).getLabel(), is("grantNumber"));

assertThat(output.get(2).getText(), is("Mikas Vengris"));
assertThat(output.get(2).getLabel(), is("individual"));
assertThat(output.get(2).getText(), is("the Netherlands Organization for Scientific Research (NWO)"));
assertThat(output.get(2).getLabel(), is("fundingAgency"));

assertThat(output.get(3).getText(), is("Denitsa Grancharova"));
assertThat(output.get(3).getText(), is("Mikas Vengris"));
assertThat(output.get(3).getLabel(), is("individual"));

assertThat(output.get(4).getText(), is("Denitsa Grancharova"));
assertThat(output.get(4).getLabel(), is("individual"));
}
}
37 changes: 17 additions & 20 deletions grobid-home/models/acknowledgment/config.json
Original file line number Diff line number Diff line change
@@ -1,22 +1,19 @@
{
"model_name": "acknowledgment",
"model_type": "BidLSTM_CRF",
"embeddings_name": "glove-840B",
"char_vocab_size": 111,
"case_vocab_size": 8,
"char_embedding_size": 25,
"num_char_lstm_units": 25,
"max_char_length": 30,
"max_sequence_length": null,
"word_embedding_size": 300,
"num_word_lstm_units": 100,
"case_embedding_size": 5,
"dropout": 0.5,
"recurrent_dropout": 0.25,
"use_char_feature": true,
"use_crf": true,
"fold_number": 1,
"batch_size": 20,
"use_ELMo": false,
"use_BERT": false
"case_vocab_size": 8,
"model_name": "acknowledgment",
"num_char_lstm_units": 25,
"num_word_lstm_units": 100,
"case_embedding_size": 5,
"use_crf": true,
"fold_number": 1,
"char_vocab_size": 206,
"dropout": 0.5,
"model_type": "BidLSTM_CRF",
"batch_size": 20,
"recurrent_dropout": 0.25,
"max_char_length": 30,
"char_embedding_size": 25,
"use_char_feature": true,
"embeddings_name": "glove-840B",
"word_embedding_size": 300
}
Loading

0 comments on commit c85abd9

Please sign in to comment.