From 21a4ef84a3d36d5b139397d0b3219518a3e0368a Mon Sep 17 00:00:00 2001 From: tantikristanti Date: Wed, 23 Oct 2019 15:47:16 +0200 Subject: [PATCH] adapt acknowledgmentParser with a new method using TokenClusteror, fix all class linked with it, fix the test class as well --- .../java/org/grobid/core/data/Acknow.java | 33 - .../org/grobid/core/data/Acknowledgment.java | 65 -- .../grobid/core/data/AcknowledgmentItem.java | 58 ++ .../grobid/core/document/TEIFormatter.java | 127 ++-- .../core/engines/AcknowledgmentParser.java | 706 +++--------------- .../core/engines/AcknowledgmentParserOld.java | 302 ++++++++ .../java/org/grobid/core/engines/Engine.java | 9 +- .../core/engines/label/TaggingLabels.java | 43 +- .../FeaturesVectorAcknowledgment.java | 154 +++- .../engines/AcknowledgmentParserTest.java | 65 +- .../org/grobid/core/engines/EngineTest.java | 3 +- .../process/GrobidRestProcessString.java | 9 +- 12 files changed, 686 insertions(+), 888 deletions(-) delete mode 100644 grobid-core/src/main/java/org/grobid/core/data/Acknow.java create mode 100644 grobid-core/src/main/java/org/grobid/core/data/AcknowledgmentItem.java create mode 100644 grobid-core/src/main/java/org/grobid/core/engines/AcknowledgmentParserOld.java diff --git a/grobid-core/src/main/java/org/grobid/core/data/Acknow.java b/grobid-core/src/main/java/org/grobid/core/data/Acknow.java deleted file mode 100644 index cc16b1b0d6..0000000000 --- a/grobid-core/src/main/java/org/grobid/core/data/Acknow.java +++ /dev/null @@ -1,33 +0,0 @@ -package org.grobid.core.data; - -public class Acknow { - String label; - String text; - String coords; - - public String getLabel() { - return label; - } - - public void setLabel(String label) { - this.label = label; - } - - public String getText() { - return text; - } - - public void setText(String text) { - this.text = text; - } - - public String getCoords() { - return coords; - } - - public void setCoords(String coords) { - this.coords = coords; - } - - -} diff --git 
a/grobid-core/src/main/java/org/grobid/core/data/Acknowledgment.java b/grobid-core/src/main/java/org/grobid/core/data/Acknowledgment.java index fb564f4f44..ddaae5fa35 100644 --- a/grobid-core/src/main/java/org/grobid/core/data/Acknowledgment.java +++ b/grobid-core/src/main/java/org/grobid/core/data/Acknowledgment.java @@ -243,70 +243,5 @@ public String toTEI(){ return tei.toString(); } - - - public String toTEI(int n, int indent, GrobidAnalysisConfig config) { - StringBuilder tei = new StringBuilder(); - boolean generateIDs = config.isGenerateTeiIds(); - try { - for (int i = 0; i < indent; i++) { - tei.append("\t"); - } - tei.append(""); - boolean withCoords = (config.getGenerateTeiCoordinates() != null) && (config.getGenerateTeiCoordinates().contains("listAcknowledment")); - tei.append(" "); - if (withCoords) - tei.append(TEIFormatter.getCoordsAttribute(coordinates, withCoords)).append(" "); - // title - for (int i = 0; i < indent + 2; i++) { - tei.append("\t"); - } - if (affiliation != null) { - tei.append("").append(TextUtilities.HTMLEncode(affiliation)).append(""); - } - - if (educationalInstitution != null) { - tei.append("").append(TextUtilities.HTMLEncode(educationalInstitution)).append(""); - } - - if (fundingAgency != null) { - tei.append("").append(TextUtilities.HTMLEncode(fundingAgency)).append(""); - } - - if (grantName != null) { - tei.append("").append(TextUtilities.HTMLEncode(grantName)).append(""); - } - - if (grantNumber != null) { - tei.append("").append(TextUtilities.HTMLEncode(grantNumber)).append(""); - } - - if (individual != null) { - tei.append("").append(TextUtilities.HTMLEncode(individual)).append(""); - } - - if (otherInstitution != null) { - tei.append("").append(TextUtilities.HTMLEncode(otherInstitution)).append(""); - } - - if (projectName != null) { - tei.append("").append(TextUtilities.HTMLEncode(projectName)).append(""); - } - - if (researchInstitution != null) { - 
tei.append("").append(TextUtilities.HTMLEncode(researchInstitution)).append(""); - } - - for (int i = 0; i < indent; i++) { - tei.append("\t"); - } - tei.append("\n"); - } catch (Exception e) { - throw new GrobidException("Cannot convert bibliographical item into a TEI, " + - "because of nested exception.", e); - } - - return tei.toString(); - } } diff --git a/grobid-core/src/main/java/org/grobid/core/data/AcknowledgmentItem.java b/grobid-core/src/main/java/org/grobid/core/data/AcknowledgmentItem.java new file mode 100644 index 0000000000..1b669a5157 --- /dev/null +++ b/grobid-core/src/main/java/org/grobid/core/data/AcknowledgmentItem.java @@ -0,0 +1,58 @@ +package org.grobid.core.data; + +import org.grobid.core.utilities.TextUtilities; + +/** + * A class for saving and exchanging information regarding an acknowledgment item consisting of the text, + * the type label (affiliation, educational institution, individual, etc) + * and the bounding box coordinates. + * + * Created by Tanti, 2019 + */ + + +public class AcknowledgmentItem { + String label; + String text; + String coords; + + public String getLabel() { + return label; + } + + public void setLabel(String label) { + this.label = label; + } + + public String getText() { + return text; + } + + public void setText(String text) { + this.text = text; + } + + public String getCoords() { + return coords; + } + + public void setCoords(String coords) { + this.coords = coords; + } + + // result of acknowledgment string processing + public String toTEI(){ + StringBuilder tei = new StringBuilder(); + if (label== null) { + return null; + } else { + + tei.append("<").append(label).append(">"). + append(TextUtilities.HTMLEncode(text)). 
+ append(""); + } + return tei.toString(); + } + + +} diff --git a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java index 2f8da4a846..f4bfb1d851 100755 --- a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java +++ b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java @@ -30,6 +30,7 @@ import org.grobid.core.utilities.*; import org.grobid.core.utilities.matching.EntityMatcherException; import org.grobid.core.utilities.matching.ReferenceMarkerMatcher; + import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -60,19 +61,6 @@ public class TEIFormatter { TaggingLabels.TABLE_MARKER, TaggingLabels.EQUATION_MARKER); - // for acknowledgment parser - public static final Set ACKNOWLEDGMENT_LABELS = Sets.newHashSet( - TaggingLabels.ACKNOWLEDGMENT, - TaggingLabels.AKNOWLEDGMENT_AFFILIATION, - TaggingLabels.AKNOWLEDGMENT_EDUCATIONAL_INSTITUTION, - TaggingLabels.AKNOWLEDGMENT_FUNDING_AGENCY, - TaggingLabels.AKNOWLEDGMENT_GRANT_NAME, - TaggingLabels.AKNOWLEDGMENT_GRANT_NUMBER, - TaggingLabels.AKNOWLEDGMENT_INDIVIDUAL, - TaggingLabels.AKNOWLEDGMENT_OTHER_INSTITUTION, - TaggingLabels.AKNOWLEDGMENT_PROJECT_NAME, - TaggingLabels.AKNOWLEDGMENT_RESEARCH_INSTITUTION); - // possible association to Grobid customised TEI schemas: DTD, XML schema, RelaxNG or compact RelaxNG // DEFAULT means no schema association in the generated XML documents public enum SchemaDeclaration { @@ -1119,8 +1107,6 @@ public StringBuilder toTEIAcknowledgement(StringBuilder buffer, List bds, GrobidAnalysisConfig config) throws Exception { AcknowledgmentParser acknowledgmentParser = new AcknowledgmentParser(); - Map> ackInMap = new HashMap<>(); - List resultAcknowledgment = new ArrayList<>(); if ((reseAcknowledgement == null) || (tokenizationsAcknowledgement == null)) { return buffer; @@ -1128,22 +1114,27 @@ public StringBuilder toTEIAcknowledgement(StringBuilder buffer, 
buffer.append("\n\t\t\t
\n"); - StringBuilder buffer2 = new StringBuilder(); + StringBuilder buffer2 = new StringBuilder(), buffer3 = new StringBuilder(); StringBuilder resultAcknow = new StringBuilder(); + // get the acknowledgment sections together with other results in a whole document buffer2 = toTEITextPiece(buffer2, reseAcknowledgement, null, bds, false, new LayoutTokenization(tokenizationsAcknowledgement), null, null, null, doc, config); - String acknowResult = buffer2.toString(); + String acknowWholeText = buffer2.toString(); + + // take only acknowledgment section content + buffer3 = takeOnlyAcknowledgmentText(buffer3, reseAcknowledgement, + new LayoutTokenization(tokenizationsAcknowledgement)); + String acknowOnlyText = buffer3.toString(); // call the acknowledgment parser - resultAcknowledgment = acknowledgmentParser.processing(acknowResult); + List resultAcknowledgment = acknowledgmentParser.processing(acknowOnlyText); // get the coordinates for acknowledgment results - List listAcknow = listAcknowledgmentResult(resultAcknowledgment); - getCoordAcknowledgment(listAcknow, tokenizationsAcknowledgement); + getCoordAcknowledgment(resultAcknowledgment, tokenizationsAcknowledgement); // put the results together with the raw text - resultAcknow = markReferencesTEIAcknowledgment(acknowResult, listAcknow); + resultAcknow = markReferencesTEIAcknowledgment(acknowWholeText, resultAcknowledgment); if (resultAcknow != null) { String resultAcknowInString = resultAcknow.toString(); @@ -1161,78 +1152,44 @@ public StringBuilder toTEIAcknowledgement(StringBuilder buffer, return buffer; } - public List listAcknowledgmentResult(List listAck) { - List acklist = new ArrayList<>(); + public StringBuilder takeOnlyAcknowledgmentText(StringBuilder buffer, + String reseAcknowledgement, + LayoutTokenization layoutTokenization) { + List tokenizations = layoutTokenization.getTokenization(); - for (Acknowledgment acknowledgment : listAck) { - Acknow acknow = null; - if (acknowledgment.getAffiliation() != 
null) { - acknow = new Acknow(); - acknow.setText(acknowledgment.getAffiliation()); - acknow.setLabel("affiliation"); - acklist.add(acknow); - } - if (acknowledgment.getEducationalInstitution() != null) { - acknow = new Acknow(); - acknow.setText(acknowledgment.getEducationalInstitution()); - acknow.setLabel("educationalInstitution"); - acklist.add(acknow); - } - if (acknowledgment.getFundingAgency() != null) { - acknow = new Acknow(); - acknow.setText(acknowledgment.getFundingAgency()); - acknow.setLabel("fundingAgency"); - acklist.add(acknow); - } - if (acknowledgment.getGrantName() != null) { - acknow = new Acknow(); - acknow.setText(acknowledgment.getGrantName()); - acknow.setLabel("grantName"); - acklist.add(acknow); - } - if (acknowledgment.getGrantNumber() != null) { - acknow = new Acknow(); - acknow.setText(acknowledgment.getGrantNumber()); - acknow.setLabel("grantNumber"); - acklist.add(acknow); - } - if (acknowledgment.getOtherInstitution() != null) { - acknow = new Acknow(); - acknow.setText(acknowledgment.getOtherInstitution()); - acknow.setLabel("otherInstitution"); - acklist.add(acknow); - } - if (acknowledgment.getProjectName() != null) { - acknow = new Acknow(); - acknow.setText(acknowledgment.getProjectName()); - acknow.setLabel("projectName"); - acklist.add(acknow); - } - if (acknowledgment.getResearchInstitution() != null) { - acknow = new Acknow(); - acknow.setText(acknowledgment.getResearchInstitution()); - acknow.setLabel("researchInstitution"); - acklist.add(acknow); + TaggingTokenClusteror clusteror = new TaggingTokenClusteror(GrobidModels.FULLTEXT, reseAcknowledgement, tokenizations); + String tokenLabel = null; + List clusters = clusteror.cluster(); + + for (TaggingTokenCluster cluster : clusters) { + if (cluster == null) { + continue; } - if (acknowledgment.getIndividual() != null) { - acknow = new Acknow(); - acknow.setText(acknowledgment.getIndividual()); - acknow.setLabel("individual"); - acklist.add(acknow); + + TaggingLabel 
clusterLabel = cluster.getTaggingLabel(); + + Engine.getCntManager().i(clusterLabel); + if (clusterLabel.equals(TaggingLabels.SECTION)) { + String clusterContent = LayoutTokensUtil.normalizeDehyphenizeText(cluster.concatTokens()); + buffer.append(clusterContent); + } else if (clusterLabel.equals(TaggingLabels.PARAGRAPH)) { + String clusterContent = LayoutTokensUtil.normalizeDehyphenizeText(cluster.concatTokens()); + buffer.append(clusterContent); } + buffer.append("\n"); } - return acklist; + return buffer; } - public StringBuilder markReferencesTEIAcknowledgment(String acknowText, List acknowList) { + public StringBuilder markReferencesTEIAcknowledgment(String acknowText, List acknowledgmentItemList) { String acknowWholeText = acknowText; StringBuilder result = new StringBuilder(); - for (Acknow acknow : acknowList) { + for (AcknowledgmentItem acknowledgmentItem : acknowledgmentItemList) { // text information - String text = acknow.getText(); - String label = acknow.getLabel(); - String coords = acknow.getCoords(); + String text = acknowledgmentItem.getText(); + String label = acknowledgmentItem.getLabel(); + String coords = acknowledgmentItem.getCoords(); int lengTextAckPart = text.length(); int lengTextWhole = acknowWholeText.length(); @@ -1257,8 +1214,8 @@ public StringBuilder markReferencesTEIAcknowledgment(String acknowText, List listAcknow, List tokenizationsAcknowledgement) { - for (Acknow ack : listAcknow) { // iterate through all acknowledgment text as results of acknowledgment parser + public void getCoordAcknowledgment(List listAcknowledgmentItem, List tokenizationsAcknowledgement) { + for (AcknowledgmentItem ack : listAcknowledgmentItem) { // iterate through all acknowledgment text as results of acknowledgment parser List layaoutTokenRelevant = new ArrayList<>(); List tokenListToBeFound = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(ack.getText()); int lastIdx = 0; diff --git 
a/grobid-core/src/main/java/org/grobid/core/engines/AcknowledgmentParser.java b/grobid-core/src/main/java/org/grobid/core/engines/AcknowledgmentParser.java index eaa8bc47b5..a29c5c7b7f 100644 --- a/grobid-core/src/main/java/org/grobid/core/engines/AcknowledgmentParser.java +++ b/grobid-core/src/main/java/org/grobid/core/engines/AcknowledgmentParser.java @@ -4,656 +4,120 @@ * @created by Tanti */ +import org.apache.commons.collections4.CollectionUtils; +import org.apache.commons.lang3.StringUtils; import org.grobid.core.GrobidModels; -import org.grobid.core.data.Acknowledgment; +import org.grobid.core.data.AcknowledgmentItem; +import org.grobid.core.engines.label.TaggingLabel; +import org.grobid.core.engines.label.TaggingLabels; import org.grobid.core.exceptions.GrobidException; import org.grobid.core.features.FeaturesVectorAcknowledgment; -import org.grobid.core.lang.Language; -import org.grobid.core.utilities.TextUtilities; +import org.grobid.core.layout.LayoutToken; +import org.grobid.core.tokenization.TaggingTokenCluster; +import org.grobid.core.tokenization.TaggingTokenClusteror; +import org.grobid.core.utilities.LayoutTokensUtil; +import org.grobid.core.utilities.UnicodeUtil; +import org.grobid.core.utilities.counters.CntManager; import java.util.ArrayList; import java.util.List; -import java.util.StringTokenizer; public class AcknowledgmentParser extends AbstractParser { + private EngineParsers parsers; + + public AcknowledgmentParser(EngineParsers parsers, CntManager cntManager) { + super(GrobidModels.ACKNOWLEDGMENT, cntManager); + this.parsers = parsers; + } + + public AcknowledgmentParser(EngineParsers parsers) { + super(GrobidModels.ACKNOWLEDGMENT); + this.parsers = parsers; + } + public AcknowledgmentParser() { super(GrobidModels.ACKNOWLEDGMENT); } /** - * Processing of acknowledgment in header + * Processing of acknowledgment */ - public List processing(String input) { - if (input == null) + + public List processing(String input) { + if 
(StringUtils.isBlank(input)) { return null; + } - List acknowledgments = null; + // cleaning + input = UnicodeUtil.normaliseText(input); + List tokens = analyzer.tokenizeWithLayoutToken(input); + return processing(tokens); + } - List acknowledgmentBlocks = new ArrayList(); + public List processing(List tokens) { + List acknowledgments = null; + if (CollectionUtils.isEmpty(tokens)) { + return null; + } try { - // force English language for the tokenization only - List tokenizations = analyzer.tokenize(input, new Language("en", 1.0)); - if (tokenizations.size() == 0) - return null; - for (String tok : tokenizations) { - if (!tok.equals(" ") && !tok.equals("\n")) { - tok = tok.replaceAll("[ \n]", ""); - acknowledgmentBlocks.add(tok + " "); - } - } - - String headerAcknowledgment = FeaturesVectorAcknowledgment.addFeaturesAcknowledgment(acknowledgmentBlocks); - String res = label(headerAcknowledgment); - - //System.out.print(res.toString()); - StringTokenizer st2 = new StringTokenizer(res, "\n"); - String lastTag = null; - Acknowledgment acknowledgment = new Acknowledgment(); - int lineCount = 0; - while (st2.hasMoreTokens()) { - String line = st2.nextToken(); - if ((line.trim().length() == 0)) { - if (acknowledgment.isNotNull()) { - if (acknowledgments == null) - acknowledgments = new ArrayList(); - acknowledgments.add(acknowledgment); - } - acknowledgment = new Acknowledgment(); - continue; - } - StringTokenizer st3 = new StringTokenizer(line, "\t "); - int ll = st3.countTokens(); - int i = 0; - String s1 = null; - String s2 = null; - while (st3.hasMoreTokens()) { - String s = st3.nextToken().trim(); - if (i == 0) { - s2 = s; // string - } else if (i == ll - 1) { - s1 = s; // label - } - i++; - } - - if (s1.equals("") || s1.equals("I-")) { - if (acknowledgment.getAffiliation() != null) { - if ((s1.equals("I-")) || - (!s1.equals(lastTag) && !lastTag.equals("I-")) - ) { - // new acknowledgment - if (acknowledgment.isNotNull()) { - if (acknowledgments == null) - 
acknowledgments = new ArrayList(); - acknowledgments.add(acknowledgment); - } - - acknowledgment = new Acknowledgment(); - acknowledgment.setAffiliation(s2); - } else { - if (acknowledgment.getAffiliation().length() == 0) - acknowledgment.setAffiliation(s2); - else - acknowledgment.setAffiliation(acknowledgment.getAffiliation() + " " + s2); - } - } else { - acknowledgment.setAffiliation(s2); - } - } else if (s1.equals("") || s1.equals("I-")) { - if (acknowledgment.getEducationalInstitution() != null) { - if ((s1.equals("I-")) || - (!s1.equals(lastTag) && !lastTag.equals("I-")) - ) { - // new acknowledgment - if (acknowledgment.isNotNull()) { - if (acknowledgments == null) - acknowledgments = new ArrayList(); - acknowledgments.add(acknowledgment); - } - - acknowledgment = new Acknowledgment(); - acknowledgment.setEducationalInstitution(s2); - } else { - if (acknowledgment.getEducationalInstitution().length() == 0) - acknowledgment.setEducationalInstitution(s2); - else - acknowledgment.setEducationalInstitution(acknowledgment.getEducationalInstitution() + " " + s2); - } - } else { - acknowledgment.setEducationalInstitution(s2); - } - } else if (s1.equals("") || s1.equals("I-")) { - if (acknowledgment.getFundingAgency() != null) { - if ((s1.equals("I-")) || - (!s1.equals(lastTag) && !lastTag.equals("I-")) - ) { - // new acknowledgment - if (acknowledgment.isNotNull()) { - if (acknowledgments == null) - acknowledgments = new ArrayList(); - acknowledgments.add(acknowledgment); - } - - acknowledgment = new Acknowledgment(); - acknowledgment.setFundingAgency(s2); - } else { - if (acknowledgment.getFundingAgency().length() == 0) - acknowledgment.setFundingAgency(s2); - else - acknowledgment.setFundingAgency(acknowledgment.getFundingAgency() + " " + s2); - } - } else { - acknowledgment.setFundingAgency(s2); - } - } else if (s1.equals("") || s1.equals("I-")) { - if (acknowledgment.getGrantName() != null) { - if ((s1.equals("I-")) || - (!s1.equals(lastTag) && 
!lastTag.equals("I-")) - ) { - // new acknowledgment - if (acknowledgment.isNotNull()) { - if (acknowledgments == null) - acknowledgments = new ArrayList(); - acknowledgments.add(acknowledgment); - } - - acknowledgment = new Acknowledgment(); - acknowledgment.setGrantName(s2); - } else { - if (acknowledgment.getGrantName().length() == 0) - acknowledgment.setGrantName(s2); - else - acknowledgment.setGrantName(acknowledgment.getGrantName() + " " + s2); - } - } else { - acknowledgment.setGrantName(s2); - } - } else if (s1.equals("") || s1.equals("I-")) { - if (acknowledgment.getGrantNumber() != null) { - if ((s1.equals("I-")) || - (!s1.equals(lastTag) && !lastTag.equals("I-")) - ) { - // new acknowledgment - if (acknowledgment.isNotNull()) { - if (acknowledgments == null) - acknowledgments = new ArrayList(); - acknowledgments.add(acknowledgment); - } - - acknowledgment = new Acknowledgment(); - acknowledgment.setGrantNumber(s2); - } else { - if (acknowledgment.getGrantNumber().length() == 0) - acknowledgment.setGrantNumber(s2); - else - acknowledgment.setGrantNumber(acknowledgment.getGrantNumber() + " " + s2); - } - } else { - acknowledgment.setGrantNumber(s2); - } - } else if (s1.equals("") || s1.equals("I-")) { - if (acknowledgment.getIndividual() != null) { - if ((s1.equals("I-")) || - (!s1.equals(lastTag) && !lastTag.equals("I-")) - ) { - // new acknowledgment - if (acknowledgment.isNotNull()) { - if (acknowledgments == null) - acknowledgments = new ArrayList(); - acknowledgments.add(acknowledgment); - } - - acknowledgment = new Acknowledgment(); - acknowledgment.setIndividual(s2); - } else { - if (acknowledgment.getIndividual().length() == 0) - acknowledgment.setIndividual(s2); - else - acknowledgment.setIndividual(acknowledgment.getIndividual() + " " + s2); - } - } else { - acknowledgment.setIndividual(s2); - } - } else if (s1.equals("") || s1.equals("I-")) { - if (acknowledgment.getOtherInstitution() != null) { - if ((s1.equals("I-")) || - (!s1.equals(lastTag) 
&& !lastTag.equals("I-")) - ) { - // new acknowledgment - if (acknowledgment.isNotNull()) { - if (acknowledgments == null) - acknowledgments = new ArrayList(); - acknowledgments.add(acknowledgment); - } - - acknowledgment = new Acknowledgment(); - acknowledgment.setOtherInstitution(s2); - } else { - if (acknowledgment.getOtherInstitution().length() == 0) - acknowledgment.setOtherInstitution(s2); - else - acknowledgment.setOtherInstitution(acknowledgment.getOtherInstitution() + " " + s2); - } - } else { - acknowledgment.setOtherInstitution(s2); - } - } else if (s1.equals("") || s1.equals("I-")) { - if (acknowledgment.getProjectName() != null) { - if ((s1.equals("I-")) || - (!s1.equals(lastTag) && !lastTag.equals("I-")) - ) { - // new acknowledgment - if (acknowledgment.isNotNull()) { - if (acknowledgments == null) - acknowledgments = new ArrayList(); - acknowledgments.add(acknowledgment); - } - - acknowledgment = new Acknowledgment(); - acknowledgment.setProjectName(s2); - } else { - if (acknowledgment.getProjectName().length() == 0) - acknowledgment.setProjectName(s2); - else - acknowledgment.setProjectName(acknowledgment.getProjectName() + " " + s2); - } - } else { - acknowledgment.setProjectName(s2); - } - } else if (s1.equals("") || s1.equals("I-")) { - if (acknowledgment.getResearchInstitution() != null) { - if ((s1.equals("I-")) || - (!s1.equals(lastTag) && !lastTag.equals("I-")) - ) { - // new acknowledgment - if (acknowledgment.isNotNull()) { - if (acknowledgments == null) - acknowledgments = new ArrayList(); - acknowledgments.add(acknowledgment); - } - - acknowledgment = new Acknowledgment(); - acknowledgment.setResearchInstitution(s2); - } else { - if (acknowledgment.getResearchInstitution().length() == 0) - acknowledgment.setResearchInstitution(s2); - else - acknowledgment.setResearchInstitution(acknowledgment.getResearchInstitution() + " " + s2); - } - } else { - acknowledgment.setResearchInstitution(s2); - } - } - - lastTag = s1; - lineCount++; - } - if 
(acknowledgment.isNotNull()) { - if (acknowledgments == null) - acknowledgments = new ArrayList(); - acknowledgments.add(acknowledgment); - } - + String headerAcknowledgment = FeaturesVectorAcknowledgment.addFeaturesAcknowledgment(tokens); + String resAcknowledgment = label(headerAcknowledgment); + acknowledgments = resultExtractionLayoutTokens(resAcknowledgment, tokens); + return acknowledgments; } catch (Exception e) { throw new GrobidException("An exception occured while running Grobid.", e); } - return acknowledgments; } + public List resultExtractionLayoutTokens(String result, List tokenizations) { + List acknowledgments = new ArrayList<>(); - /** - * Extract results from a date string in the training format without any string modification. - */ - public StringBuilder trainingExtraction(List inputs) { - StringBuilder buffer = new StringBuilder(); - try { - if (inputs == null) - return null; - - if (inputs.size() == 0) - return null; - - List tokenizations = null; - List acknowledgmentBlocks = new ArrayList(); - for (String input : inputs) { - if (input == null) - continue; - - tokenizations = analyzer.tokenize(input); - - if (tokenizations.size() == 0) - return null; - - for (String tok : tokenizations) { - if (tok.equals("\n")) { - acknowledgmentBlocks.add("@newline"); - } else if (!tok.equals(" ")) { - acknowledgmentBlocks.add(tok + " "); - } - } - acknowledgmentBlocks.add("\n"); - } - - String headerAcknowledgment = FeaturesVectorAcknowledgment.addFeaturesAcknowledgment(acknowledgmentBlocks); - String res = label(headerAcknowledgment); - - // extract results from the processed file - - StringTokenizer st2 = new StringTokenizer(res, "\n"); - String lastTag = null; - boolean tagClosed = false; - int q = 0; - boolean addSpace; - boolean hasAffiliation = false; - boolean hasEducationalInstitution = false; - boolean hasFundingAgency = false; - boolean hasGrantName = false; - boolean hasGrantNumber = false; - boolean hasIndividual = false; - boolean 
hasOtherInstitution = false; - boolean hasProjectName = false; - boolean hasResearchInstitution = false; - - String lastTag0; - String currentTag0; - boolean start = true; - while (st2.hasMoreTokens()) { - String line = st2.nextToken(); - addSpace = false; - if ((line.trim().length() == 0)) { - // new acknowledgment - buffer.append("\n"); - - hasAffiliation = false; - hasEducationalInstitution = false; - hasFundingAgency = false; - hasGrantName = false; - hasGrantNumber = false; - hasIndividual = false; - hasOtherInstitution = false; - hasProjectName = false; - hasResearchInstitution = false; - - buffer.append("\t"); - continue; - } else { - String theTok = tokenizations.get(q); - while (theTok.equals(" ")) { - addSpace = true; - q++; - theTok = tokenizations.get(q); - } - q++; - } - - StringTokenizer st3 = new StringTokenizer(line, "\t"); - int ll = st3.countTokens(); - int i = 0; - String s1 = null; - String s2 = null; - - while (st3.hasMoreTokens()) { - String s = st3.nextToken().trim(); - if (i == 0) { - s2 = TextUtilities.HTMLEncode(s); // string - } else if (i == ll - 1) { - s1 = s; // label - } - i++; - } - - if (start && (s1 != null)) { - buffer.append("\t"); - start = false; - } - - lastTag0 = null; - if (lastTag != null) { - if (lastTag.startsWith("I-")) { - lastTag0 = lastTag.substring(2, lastTag.length()); - } else { - lastTag0 = lastTag; - } - } - currentTag0 = null; - if (s1 != null) { - if (s1.startsWith("I-")) { - currentTag0 = s1.substring(2, s1.length()); - } else { - currentTag0 = s1; - } - } - - tagClosed = lastTag0 != null && testClosingTag(buffer, currentTag0, lastTag0); - - String output = writeField(s1, lastTag0, s2, "", "", addSpace, 0); - if (output != null) { - if (lastTag0 != null) { - if (hasAffiliation && !lastTag0.equals("")) { - buffer.append("\n"); - hasEducationalInstitution = false; - hasFundingAgency = false; - hasGrantName = false; - hasGrantNumber = false; - hasIndividual = false; - hasOtherInstitution = false; - hasProjectName 
= false; - hasResearchInstitution = false; - buffer.append("\t"); - } - } - hasAffiliation = true; - buffer.append(output); - lastTag = s1; - continue; - } else { - output = writeField(s1, lastTag0, s2, "", "", addSpace, 0); - } - - if (output == null) { - output = writeField(s1, lastTag0, s2, "", "", addSpace, 0); - } else { - if (lastTag0 != null) { - if (hasEducationalInstitution && !lastTag0.equals("")) { - buffer.append("\n"); - buffer.append("\t"); - } - } - buffer.append(output); - hasEducationalInstitution = true; - lastTag = s1; - continue; - } - - if (output == null) { - output = writeField(s1, lastTag0, s2, "", "", addSpace, 0); - } else { - if (lastTag0 != null) { - if (hasFundingAgency && !lastTag0.equals("")) { - buffer.append("\n"); - buffer.append("\t"); - } - } - buffer.append(output); - hasFundingAgency = true; - lastTag = s1; - continue; - } - - if (output == null) { - output = writeField(s1, lastTag0, s2, "", "", addSpace, 0); - } else { - if (lastTag0 != null) { - if (hasGrantName && !lastTag0.equals("")) { - buffer.append("\n"); - buffer.append("\t"); - } - } - buffer.append(output); - hasGrantName = true; - lastTag = s1; - continue; - } - - if (output == null) { - output = writeField(s1, lastTag0, s2, "", "", addSpace, 0); - } else { - if (lastTag0 != null) { - if (hasGrantNumber && !lastTag0.equals("")) { - buffer.append("\n"); - buffer.append("\t"); - } - } - buffer.append(output); - hasGrantNumber = true; - lastTag = s1; - continue; - } - - if (output == null) { - output = writeField(s1, lastTag0, s2, "", "", addSpace, 0); - } else { - if (lastTag0 != null) { - if (hasIndividual && !lastTag0.equals("")) { - buffer.append("\n"); - buffer.append("\t"); - } - } - buffer.append(output); - hasIndividual = true; - lastTag = s1; - continue; - } - - if (output == null) { - output = writeField(s1, lastTag0, s2, "", "", addSpace, 0); - } else { - if (lastTag0 != null) { - if (hasOtherInstitution && !lastTag0.equals("")) { - buffer.append("\n"); - 
buffer.append("\t"); - } - } - buffer.append(output); - hasOtherInstitution = true; - lastTag = s1; - continue; - } - - if (output == null) { - output = writeField(s1, lastTag0, s2, "", "", addSpace, 0); - } else { - if (lastTag0 != null) { - if (hasProjectName && !lastTag0.equals("")) { - buffer.append("\n"); - buffer.append("\t"); - } - } - buffer.append(output); - hasProjectName = true; - lastTag = s1; - continue; - } - - if (output == null) { - output = writeField(s1, lastTag0, s2, "", "", addSpace, 0); - } else { - if (lastTag0 != null) { - if (hasResearchInstitution && !lastTag0.equals("")) { - buffer.append("\n"); - buffer.append("\t"); - } - } - buffer.append(output); - hasResearchInstitution = true; - lastTag = s1; - continue; - } + TaggingTokenClusteror clusteror = new TaggingTokenClusteror(GrobidModels.ACKNOWLEDGMENT, result, tokenizations); + List clusters = clusteror.cluster(); + AcknowledgmentItem acknowledgmentItem = null; + for (TaggingTokenCluster cluster : clusters) { + if (cluster == null) { + continue; } - - if (lastTag != null) { - if (lastTag.startsWith("I-")) { - lastTag0 = lastTag.substring(2, lastTag.length()); - } else { - lastTag0 = lastTag; - } - currentTag0 = ""; - testClosingTag(buffer, currentTag0, lastTag0); - buffer.append("\n"); + acknowledgmentItem = new AcknowledgmentItem(); + TaggingLabel clusterLabel = cluster.getTaggingLabel(); + Engine.getCntManager().i(clusterLabel); + + String clusterContent = LayoutTokensUtil.normalizeDehyphenizeText(cluster.concatTokens()); + if (clusterLabel.equals(TaggingLabels.ACKNOWLEDGMENT_AFFILIATION)) { + acknowledgmentItem.setText(clusterContent); + acknowledgmentItem.setLabel("affiliation"); + } else if (clusterLabel.equals(TaggingLabels.ACKNOWLEDGMENT_EDUCATIONAL_INSTITUTION)) { + acknowledgmentItem.setText(clusterContent); + acknowledgmentItem.setLabel("educationalInstitution"); + } else if (clusterLabel.equals(TaggingLabels.ACKNOWLEDGMENT_FUNDING_AGENCY)) { + 
acknowledgmentItem.setText(clusterContent); + acknowledgmentItem.setLabel("fundingAgency"); + } else if (clusterLabel.equals(TaggingLabels.ACKNOWLEDGMENT_GRANT_NAME)) { + acknowledgmentItem.setText(clusterContent); + acknowledgmentItem.setLabel("grantName"); + } else if (clusterLabel.equals(TaggingLabels.ACKNOWLEDGMENT_GRANT_NUMBER)) { + acknowledgmentItem.setText(clusterContent); + acknowledgmentItem.setLabel("grantNumber"); + } else if (clusterLabel.equals(TaggingLabels.ACKNOWLEDGMENT_INDIVIDUAL)) { + acknowledgmentItem.setText(clusterContent); + acknowledgmentItem.setLabel("individual"); + } else if (clusterLabel.equals(TaggingLabels.ACKNOWLEDGMENT_OTHER_INSTITUTION)) { + acknowledgmentItem.setText(clusterContent); + acknowledgmentItem.setLabel("otherInstitution"); + } else if (clusterLabel.equals(TaggingLabels.ACKNOWLEDGMENT_PROJECT_NAME)) { + acknowledgmentItem.setText(clusterContent); + acknowledgmentItem.setLabel("projectName"); + } else if (clusterLabel.equals(TaggingLabels.ACKNOWLEDGMENT_RESEARCH_INSTITUTION)) { + acknowledgmentItem.setText(clusterContent); + acknowledgmentItem.setLabel("researchInstitution"); } - } catch (Exception e) { -// e.printStackTrace(); - throw new GrobidException("An exception occured while running Grobid.", e); - } - return buffer; - } - - private String writeField(String s1, - String lastTag0, - String s2, - String field, - String outField, - boolean addSpace, - int nbIndent) { - String result = null; - if ((s1.equals(field)) || (s1.equals("I-" + field))) { - if ((s1.equals("") || s1.equals("I-"))) { - if (addSpace) - result = " " + s2; - else - result = s2; - } else if (s1.equals(lastTag0) || s1.equals("I-" + lastTag0)) { - if (addSpace) - result = " " + s2; - else - result = s2; - } else { - result = ""; - for (int i = 0; i < nbIndent; i++) { - result += "\t"; - } - if (addSpace) - result += " " + outField + s2; - else - result += outField + s2; - } - } - return result; - } - private boolean testClosingTag(StringBuilder 
buffer, - String currentTag0, - String lastTag0) { - boolean res = false; - if (!currentTag0.equals(lastTag0)) { - res = true; - // we close the current tag - if (lastTag0.equals("")) { - buffer.append(""); - } else if (lastTag0.equals("")) { - buffer.append(""); - } else if (lastTag0.equals("")) { - buffer.append(""); - } else if (lastTag0.equals("")) { - buffer.append(""); - } else if (lastTag0.equals("")) { - buffer.append(""); - } else if (lastTag0.equals("")) { - buffer.append(""); - } else if (lastTag0.equals("")) { - buffer.append(""); - } else if (lastTag0.equals("")) { - buffer.append(""); - } else if (lastTag0.equals("")) { - buffer.append(""); - } else if (lastTag0.equals("")) { - buffer.append(""); - } else { - res = false; + if (acknowledgmentItem.getLabel() != null) { + acknowledgments.add(acknowledgmentItem); } - } - return res; + return acknowledgments; } } diff --git a/grobid-core/src/main/java/org/grobid/core/engines/AcknowledgmentParserOld.java b/grobid-core/src/main/java/org/grobid/core/engines/AcknowledgmentParserOld.java new file mode 100644 index 0000000000..8ccc7374e3 --- /dev/null +++ b/grobid-core/src/main/java/org/grobid/core/engines/AcknowledgmentParserOld.java @@ -0,0 +1,302 @@ +package org.grobid.core.engines; + +/** + * @created by Tanti + */ + +import org.grobid.core.GrobidModels; +import org.grobid.core.data.Acknowledgment; +import org.grobid.core.exceptions.GrobidException; +import org.grobid.core.features.FeaturesVectorAcknowledgment; +import org.grobid.core.lang.Language; +import org.grobid.core.utilities.TextUtilities; + +import java.util.ArrayList; +import java.util.List; +import java.util.StringTokenizer; + +public class AcknowledgmentParserOld extends AbstractParser { + public AcknowledgmentParserOld() { + super(GrobidModels.ACKNOWLEDGMENT); + } + + /** + * Processing of acknowledgment + */ + public List processing(String input) { + if (input == null) + return null; + + List acknowledgments = null; + + List 
acknowledgmentBlocks = new ArrayList(); + try { + // force English language for the tokenization only + List tokenizations = analyzer.tokenize(input, new Language("en", 1.0)); + if (tokenizations.size() == 0) + return null; + for (String tok : tokenizations) { + if (!tok.equals(" ") && !tok.equals("\n")) { + tok = tok.replaceAll("[ \n]", ""); + acknowledgmentBlocks.add(tok + " "); + } + } + + String headerAcknowledgment = FeaturesVectorAcknowledgment.addFeaturesAcknowledgmentString(acknowledgmentBlocks); + String res = label(headerAcknowledgment); + + //System.out.print(res.toString()); + StringTokenizer st2 = new StringTokenizer(res, "\n"); + String lastTag = null; + Acknowledgment acknowledgment = new Acknowledgment(); + int lineCount = 0; + while (st2.hasMoreTokens()) { + String line = st2.nextToken(); + if ((line.trim().length() == 0)) { + if (acknowledgment.isNotNull()) { + if (acknowledgments == null) + acknowledgments = new ArrayList(); + acknowledgments.add(acknowledgment); + } + acknowledgment = new Acknowledgment(); + continue; + } + StringTokenizer st3 = new StringTokenizer(line, "\t "); + int ll = st3.countTokens(); + int i = 0; + String s1 = null; + String s2 = null; + while (st3.hasMoreTokens()) { + String s = st3.nextToken().trim(); + if (i == 0) { + s2 = s; // string + } else if (i == ll - 1) { + s1 = s; // label + } + i++; + } + + if (s1.equals("") || s1.equals("I-")) { + if (acknowledgment.getAffiliation() != null) { + if ((s1.equals("I-")) || + (!s1.equals(lastTag) && !lastTag.equals("I-")) + ) { + // new acknowledgment + if (acknowledgment.isNotNull()) { + if (acknowledgments == null) + acknowledgments = new ArrayList(); + acknowledgments.add(acknowledgment); + } + + acknowledgment = new Acknowledgment(); + acknowledgment.setAffiliation(s2); + } else { + if (acknowledgment.getAffiliation().length() == 0) + acknowledgment.setAffiliation(s2); + else + acknowledgment.setAffiliation(acknowledgment.getAffiliation() + " " + s2); + } + } else { + 
acknowledgment.setAffiliation(s2); + } + } else if (s1.equals("") || s1.equals("I-")) { + if (acknowledgment.getEducationalInstitution() != null) { + if ((s1.equals("I-")) || + (!s1.equals(lastTag) && !lastTag.equals("I-")) + ) { + // new acknowledgment + if (acknowledgment.isNotNull()) { + if (acknowledgments == null) + acknowledgments = new ArrayList(); + acknowledgments.add(acknowledgment); + } + + acknowledgment = new Acknowledgment(); + acknowledgment.setEducationalInstitution(s2); + } else { + if (acknowledgment.getEducationalInstitution().length() == 0) + acknowledgment.setEducationalInstitution(s2); + else + acknowledgment.setEducationalInstitution(acknowledgment.getEducationalInstitution() + " " + s2); + } + } else { + acknowledgment.setEducationalInstitution(s2); + } + } else if (s1.equals("") || s1.equals("I-")) { + if (acknowledgment.getFundingAgency() != null) { + if ((s1.equals("I-")) || + (!s1.equals(lastTag) && !lastTag.equals("I-")) + ) { + // new acknowledgment + if (acknowledgment.isNotNull()) { + if (acknowledgments == null) + acknowledgments = new ArrayList(); + acknowledgments.add(acknowledgment); + } + + acknowledgment = new Acknowledgment(); + acknowledgment.setFundingAgency(s2); + } else { + if (acknowledgment.getFundingAgency().length() == 0) + acknowledgment.setFundingAgency(s2); + else + acknowledgment.setFundingAgency(acknowledgment.getFundingAgency() + " " + s2); + } + } else { + acknowledgment.setFundingAgency(s2); + } + } else if (s1.equals("") || s1.equals("I-")) { + if (acknowledgment.getGrantName() != null) { + if ((s1.equals("I-")) || + (!s1.equals(lastTag) && !lastTag.equals("I-")) + ) { + // new acknowledgment + if (acknowledgment.isNotNull()) { + if (acknowledgments == null) + acknowledgments = new ArrayList(); + acknowledgments.add(acknowledgment); + } + + acknowledgment = new Acknowledgment(); + acknowledgment.setGrantName(s2); + } else { + if (acknowledgment.getGrantName().length() == 0) + acknowledgment.setGrantName(s2); + 
else + acknowledgment.setGrantName(acknowledgment.getGrantName() + " " + s2); + } + } else { + acknowledgment.setGrantName(s2); + } + } else if (s1.equals("") || s1.equals("I-")) { + if (acknowledgment.getGrantNumber() != null) { + if ((s1.equals("I-")) || + (!s1.equals(lastTag) && !lastTag.equals("I-")) + ) { + // new acknowledgment + if (acknowledgment.isNotNull()) { + if (acknowledgments == null) + acknowledgments = new ArrayList(); + acknowledgments.add(acknowledgment); + } + + acknowledgment = new Acknowledgment(); + acknowledgment.setGrantNumber(s2); + } else { + if (acknowledgment.getGrantNumber().length() == 0) + acknowledgment.setGrantNumber(s2); + else + acknowledgment.setGrantNumber(acknowledgment.getGrantNumber() + " " + s2); + } + } else { + acknowledgment.setGrantNumber(s2); + } + } else if (s1.equals("") || s1.equals("I-")) { + if (acknowledgment.getIndividual() != null) { + if ((s1.equals("I-")) || + (!s1.equals(lastTag) && !lastTag.equals("I-")) + ) { + // new acknowledgment + if (acknowledgment.isNotNull()) { + if (acknowledgments == null) + acknowledgments = new ArrayList(); + acknowledgments.add(acknowledgment); + } + + acknowledgment = new Acknowledgment(); + acknowledgment.setIndividual(s2); + } else { + if (acknowledgment.getIndividual().length() == 0) + acknowledgment.setIndividual(s2); + else + acknowledgment.setIndividual(acknowledgment.getIndividual() + " " + s2); + } + } else { + acknowledgment.setIndividual(s2); + } + } else if (s1.equals("") || s1.equals("I-")) { + if (acknowledgment.getOtherInstitution() != null) { + if ((s1.equals("I-")) || + (!s1.equals(lastTag) && !lastTag.equals("I-")) + ) { + // new acknowledgment + if (acknowledgment.isNotNull()) { + if (acknowledgments == null) + acknowledgments = new ArrayList(); + acknowledgments.add(acknowledgment); + } + + acknowledgment = new Acknowledgment(); + acknowledgment.setOtherInstitution(s2); + } else { + if (acknowledgment.getOtherInstitution().length() == 0) + 
acknowledgment.setOtherInstitution(s2); + else + acknowledgment.setOtherInstitution(acknowledgment.getOtherInstitution() + " " + s2); + } + } else { + acknowledgment.setOtherInstitution(s2); + } + } else if (s1.equals("") || s1.equals("I-")) { + if (acknowledgment.getProjectName() != null) { + if ((s1.equals("I-")) || + (!s1.equals(lastTag) && !lastTag.equals("I-")) + ) { + // new acknowledgment + if (acknowledgment.isNotNull()) { + if (acknowledgments == null) + acknowledgments = new ArrayList(); + acknowledgments.add(acknowledgment); + } + + acknowledgment = new Acknowledgment(); + acknowledgment.setProjectName(s2); + } else { + if (acknowledgment.getProjectName().length() == 0) + acknowledgment.setProjectName(s2); + else + acknowledgment.setProjectName(acknowledgment.getProjectName() + " " + s2); + } + } else { + acknowledgment.setProjectName(s2); + } + } else if (s1.equals("") || s1.equals("I-")) { + if (acknowledgment.getResearchInstitution() != null) { + if ((s1.equals("I-")) || + (!s1.equals(lastTag) && !lastTag.equals("I-")) + ) { + // new acknowledgment + if (acknowledgment.isNotNull()) { + if (acknowledgments == null) + acknowledgments = new ArrayList(); + acknowledgments.add(acknowledgment); + } + + acknowledgment = new Acknowledgment(); + acknowledgment.setResearchInstitution(s2); + } else { + if (acknowledgment.getResearchInstitution().length() == 0) + acknowledgment.setResearchInstitution(s2); + else + acknowledgment.setResearchInstitution(acknowledgment.getResearchInstitution() + " " + s2); + } + } else { + acknowledgment.setResearchInstitution(s2); + } + } + + lastTag = s1; + lineCount++; + } + if (acknowledgment.isNotNull()) { + if (acknowledgments == null) + acknowledgments = new ArrayList(); + acknowledgments.add(acknowledgment); + } + + } catch (Exception e) { + throw new GrobidException("An exception occured while running Grobid.", e); + } + return acknowledgments; + } +} diff --git 
a/grobid-core/src/main/java/org/grobid/core/engines/Engine.java b/grobid-core/src/main/java/org/grobid/core/engines/Engine.java index 8e6d55f9bc..7e81ad7dc0 100755 --- a/grobid-core/src/main/java/org/grobid/core/engines/Engine.java +++ b/grobid-core/src/main/java/org/grobid/core/engines/Engine.java @@ -15,8 +15,6 @@ package org.grobid.core.engines; -import com.google.common.io.Files; -import org.apache.commons.io.FileUtils; import org.apache.commons.lang3.tuple.Pair; //import org.grobid.core.annotations.TeiStAXParser; @@ -24,10 +22,7 @@ import org.grobid.core.document.Document; import org.grobid.core.document.DocumentSource; import org.grobid.core.engines.config.GrobidAnalysisConfig; -import org.grobid.core.engines.label.SegmentationLabels; import org.grobid.core.exceptions.GrobidException; -import org.grobid.core.exceptions.GrobidResourceException; -import org.grobid.core.factory.GrobidFactory; import org.grobid.core.factory.GrobidPoolingFactory; import org.grobid.core.lang.Language; import org.grobid.core.utilities.Consolidation; @@ -105,8 +100,8 @@ public List> processAuthorsCitationLists(List authorSequenc * @return the list of all structured date objects recognized in the string. 
* @throws IOException */ - public List processAcknowledgment(String acknowledgmentBlock) throws IOException { - List result = parsers.getAcknowledgmentParser().processing(acknowledgmentBlock); + public List processAcknowledgment(String acknowledgmentBlock) throws IOException { + List result = parsers.getAcknowledgmentParser().processing(acknowledgmentBlock); return result; } diff --git a/grobid-core/src/main/java/org/grobid/core/engines/label/TaggingLabels.java b/grobid-core/src/main/java/org/grobid/core/engines/label/TaggingLabels.java index db0f1655b5..c8a3714297 100644 --- a/grobid-core/src/main/java/org/grobid/core/engines/label/TaggingLabels.java +++ b/grobid-core/src/main/java/org/grobid/core/engines/label/TaggingLabels.java @@ -91,7 +91,6 @@ public class TaggingLabels { public final static String BACK_LABEL = ""; // acknowledgment - public final static String ACKNOWLEDGMENT_LABEL = ""; public final static String AFFILIATION = ""; public final static String EDUCATIONAL_INSTITUTION = ""; public final static String FUNDING_AGENCY = ""; @@ -102,7 +101,6 @@ public class TaggingLabels { public final static String PROJECT_NAME = ""; public final static String RESEARCH_INSTITUTION = ""; - /* title page (secondary title page) * publisher page (publication information, including usually the copyrights info) * summary (include executive summary) @@ -132,7 +130,6 @@ public class TaggingLabels { public static final TaggingLabel TABLE = new TaggingLabelImpl(GrobidModels.FULLTEXT, TABLE_LABEL); public static final TaggingLabel EQUATION = new TaggingLabelImpl(GrobidModels.FULLTEXT, EQUATION_LAB); public static final TaggingLabel EQUATION_LABEL = new TaggingLabelImpl(GrobidModels.FULLTEXT, EQUATION_ID_LABEL); - public static final TaggingLabel ACKNOWLEDGMENT_MARKER = new TaggingLabelImpl(GrobidModels.FULLTEXT, ACKNOWLEDGMENT_LABEL); public static final TaggingLabel HEADER_DATE = new TaggingLabelImpl(GrobidModels.HEADER, DATE_LABEL); public static final TaggingLabel 
HEADER_TITLE = new TaggingLabelImpl(GrobidModels.HEADER, TITLE_LABEL); @@ -230,17 +227,16 @@ public class TaggingLabels { public static final TaggingLabel MONOGRAPH_OTHER = new TaggingLabelImpl(GrobidModels.MONOGRAPH, OTHER_LABEL); // tagging label for acknowledgment - public static final TaggingLabel ACKNOWLEDGMENT = new TaggingLabelImpl(GrobidModels.ACKNOWLEDGMENT, ACKNOWLEDGMENT_LABEL); - public static final TaggingLabel AKNOWLEDGMENT_AFFILIATION = new TaggingLabelImpl(GrobidModels.ACKNOWLEDGMENT, AFFILIATION); - public static final TaggingLabel AKNOWLEDGMENT_EDUCATIONAL_INSTITUTION = new TaggingLabelImpl(GrobidModels.ACKNOWLEDGMENT, EDUCATIONAL_INSTITUTION); - public static final TaggingLabel AKNOWLEDGMENT_FUNDING_AGENCY = new TaggingLabelImpl(GrobidModels.ACKNOWLEDGMENT, FUNDING_AGENCY); - public static final TaggingLabel AKNOWLEDGMENT_GRANT_NAME = new TaggingLabelImpl(GrobidModels.ACKNOWLEDGMENT, GRANT_NAME); - public static final TaggingLabel AKNOWLEDGMENT_GRANT_NUMBER = new TaggingLabelImpl(GrobidModels.ACKNOWLEDGMENT, GRANT_NUMBER); - public static final TaggingLabel AKNOWLEDGMENT_INDIVIDUAL = new TaggingLabelImpl(GrobidModels.ACKNOWLEDGMENT, INDIVIDUAL); - public static final TaggingLabel AKNOWLEDGMENT_OTHER_INSTITUTION = new TaggingLabelImpl(GrobidModels.ACKNOWLEDGMENT, OTHER_INSTITUTION); - public static final TaggingLabel AKNOWLEDGMENT_PROJECT_NAME = new TaggingLabelImpl(GrobidModels.ACKNOWLEDGMENT, PROJECT_NAME); - public static final TaggingLabel AKNOWLEDGMENT_RESEARCH_INSTITUTION = new TaggingLabelImpl(GrobidModels.ACKNOWLEDGMENT, RESEARCH_INSTITUTION); - + public static final TaggingLabel ACKNOWLEDGMENT_AFFILIATION = new TaggingLabelImpl(GrobidModels.ACKNOWLEDGMENT, AFFILIATION); + public static final TaggingLabel ACKNOWLEDGMENT_EDUCATIONAL_INSTITUTION = new TaggingLabelImpl(GrobidModels.ACKNOWLEDGMENT, EDUCATIONAL_INSTITUTION); + public static final TaggingLabel ACKNOWLEDGMENT_FUNDING_AGENCY = new TaggingLabelImpl(GrobidModels.ACKNOWLEDGMENT, 
FUNDING_AGENCY); + public static final TaggingLabel ACKNOWLEDGMENT_GRANT_NAME = new TaggingLabelImpl(GrobidModels.ACKNOWLEDGMENT, GRANT_NAME); + public static final TaggingLabel ACKNOWLEDGMENT_GRANT_NUMBER = new TaggingLabelImpl(GrobidModels.ACKNOWLEDGMENT, GRANT_NUMBER); + public static final TaggingLabel ACKNOWLEDGMENT_INDIVIDUAL = new TaggingLabelImpl(GrobidModels.ACKNOWLEDGMENT, INDIVIDUAL); + public static final TaggingLabel ACKNOWLEDGMENT_OTHER_INSTITUTION = new TaggingLabelImpl(GrobidModels.ACKNOWLEDGMENT, OTHER_INSTITUTION); + public static final TaggingLabel ACKNOWLEDGMENT_PROJECT_NAME = new TaggingLabelImpl(GrobidModels.ACKNOWLEDGMENT, PROJECT_NAME); + public static final TaggingLabel ACKNOWLEDGMENT_RESEARCH_INSTITUTION = new TaggingLabelImpl(GrobidModels.ACKNOWLEDGMENT, RESEARCH_INSTITUTION); + protected static void register(TaggingLabel label) { cache.putIfAbsent(new Pair<>(label.getGrobidModel(), label.getLabel()), label); } @@ -361,18 +357,17 @@ protected static void register(TaggingLabel label) { register(MONOGRAPH_OTHER); // acknowledgment - register(AKNOWLEDGMENT_AFFILIATION); - register(AKNOWLEDGMENT_EDUCATIONAL_INSTITUTION); - register(AKNOWLEDGMENT_FUNDING_AGENCY); - register(AKNOWLEDGMENT_GRANT_NAME); - register(AKNOWLEDGMENT_GRANT_NUMBER); - register(AKNOWLEDGMENT_INDIVIDUAL); - register(AKNOWLEDGMENT_OTHER_INSTITUTION); - register(AKNOWLEDGMENT_PROJECT_NAME); - register(AKNOWLEDGMENT_RESEARCH_INSTITUTION); + register(ACKNOWLEDGMENT_AFFILIATION); + register(ACKNOWLEDGMENT_EDUCATIONAL_INSTITUTION); + register(ACKNOWLEDGMENT_FUNDING_AGENCY); + register(ACKNOWLEDGMENT_GRANT_NAME); + register(ACKNOWLEDGMENT_GRANT_NUMBER); + register(ACKNOWLEDGMENT_INDIVIDUAL); + register(ACKNOWLEDGMENT_OTHER_INSTITUTION); + register(ACKNOWLEDGMENT_PROJECT_NAME); + register(ACKNOWLEDGMENT_RESEARCH_INSTITUTION); } - protected TaggingLabels() { } diff --git a/grobid-core/src/main/java/org/grobid/core/features/FeaturesVectorAcknowledgment.java 
b/grobid-core/src/main/java/org/grobid/core/features/FeaturesVectorAcknowledgment.java index 28c9678271..15c38a4937 100644 --- a/grobid-core/src/main/java/org/grobid/core/features/FeaturesVectorAcknowledgment.java +++ b/grobid-core/src/main/java/org/grobid/core/features/FeaturesVectorAcknowledgment.java @@ -1,7 +1,11 @@ package org.grobid.core.features; import org.grobid.core.analyzers.GrobidAnalyzer; +import org.grobid.core.exceptions.GrobidException; +import org.grobid.core.layout.LayoutToken; +import org.grobid.core.utilities.OffsetPosition; import org.grobid.core.utilities.TextUtilities; +import org.grobid.core.utilities.UnicodeUtil; import java.util.List; import java.util.StringTokenizer; @@ -147,7 +151,7 @@ public String printVector() { /** * Add feature for acknowledgment parsing. */ - static public String addFeaturesAcknowledgment(List lines) throws Exception { + static public String addFeaturesAcknowledgmentString(List lines) throws Exception { StringBuffer result = new StringBuffer(); List block = null; @@ -194,19 +198,12 @@ static public String addFeaturesAcknowledgment(List lines) throws Except } + static private FeaturesVectorAcknowledgment addFeaturesAcknowledgment(String line, String lineStatus) { FeatureFactory featureFactory = FeatureFactory.getInstance(); FeaturesVectorAcknowledgment featuresVectorAcknowledgment = new FeaturesVectorAcknowledgment(); - //List tokens = GrobidAnalyzer.getInstance().tokenize(line); StringTokenizer st = new StringTokenizer(line.trim(), "\t "); - /*for (String tok : tokens) { - String word = tok; - - String label = null; - if (tok != null) { - label = tok; - }*/ if (st.hasMoreTokens()) { String word = st.nextToken(); @@ -230,7 +227,7 @@ static private FeaturesVectorAcknowledgment addFeaturesAcknowledgment(String lin } else if (featureFactory.test_first_capital(word)) { featuresVectorAcknowledgment.capitalisation = "INITCAP"; } else - featuresVectorAcknowledgment.capitalisation = "NOCAP"; + 
featuresVectorAcknowledgment.capitalisation = "NOCAPS"; // digit if (featureFactory.test_number(word)) { @@ -285,4 +282,141 @@ static private FeaturesVectorAcknowledgment addFeaturesAcknowledgment(String lin } return featuresVectorAcknowledgment; } + + /** + * Add feature for acknowledgment parsing with tokens as input. + */ + static public String addFeaturesAcknowledgment(List tokens) throws Exception { + FeatureFactory featureFactory = FeatureFactory.getInstance(); + + StringBuilder acknowledgment = new StringBuilder(); + + String previousTag = null; + String previousText = null; + FeaturesVectorAcknowledgment featuresVectorAcknowledgment = null; + int sentenceLenth = tokens.size(); // length of the current sentence + for (int n=0; n < tokens.size(); n++) { + LayoutToken token = tokens.get(n); + String tag = null; + + boolean outputLineStatus = false; + + String text = token.getText(); + if (text.equals(" ")) { + continue; + } + + if (text.equals("\n")) { + continue; + } + + // parano normalisation + text = UnicodeUtil.normaliseTextAndRemoveSpaces(text); + if (text.trim().length() == 0 ) { + continue; + } + + if (TextUtilities.filterLine(text)) { + continue; + } + + featuresVectorAcknowledgment = new FeaturesVectorAcknowledgment(); + + featuresVectorAcknowledgment.string = text; + featuresVectorAcknowledgment.label = tag; + + // line status + if (n == 0) { + featuresVectorAcknowledgment.lineStatus = "LINESTART"; + outputLineStatus = true; + } + if (n == 0) { + if (!outputLineStatus) { + featuresVectorAcknowledgment.lineStatus = "LINESTART"; + outputLineStatus = true; + } + } else if (tokens.size() == n+1) { + if (!outputLineStatus) { + featuresVectorAcknowledgment.lineStatus = "LINEEND"; + outputLineStatus = true; + } + } + if (!outputLineStatus) { + featuresVectorAcknowledgment.lineStatus = "LINEIN"; + outputLineStatus = true; + } + + // single character + if (text.length() == 1) { + featuresVectorAcknowledgment.singleChar = true; + } + + // capital + if 
(featureFactory.test_all_capital(text)) { + featuresVectorAcknowledgment.capitalisation = "ALLCAPS"; + } + + if (Character.isUpperCase(text.charAt(0))) { + featuresVectorAcknowledgment.capitalisation = "INITCAP"; + } + + if (featuresVectorAcknowledgment.capitalisation == null) { + featuresVectorAcknowledgment.capitalisation = "NOCAPS"; + } + + // digit + Matcher m = featureFactory.isDigit.matcher(text); + if (m.find()) { + featuresVectorAcknowledgment.digit = "ALLDIGIT"; + } + + if (featureFactory.test_digit(text)) { + featuresVectorAcknowledgment.digit = "CONTAINSDIGITS"; + } + + if (featuresVectorAcknowledgment.digit == null) + featuresVectorAcknowledgment.digit = "NODIGIT"; + + // common name + if (featureFactory.test_common(text)) { + featuresVectorAcknowledgment.commonName = true; + } + + // proper name + if (featureFactory.test_names(text)) { + featuresVectorAcknowledgment.properName = true; + } + + // find the punctuations + Matcher m0 = featureFactory.isPunct.matcher(text); + if (m0.find()) { + featuresVectorAcknowledgment.punctType = "PUNCT"; + } + + if (featuresVectorAcknowledgment.punctType == null) + featuresVectorAcknowledgment.punctType = "NOPUNCT"; + + // token containing special character + if ((text.equals("(")) | (text.equals("["))) { + featuresVectorAcknowledgment.punctType = "OPENBRACKET"; + } else if ((text.equals(")")) | (text.equals("]"))) { + featuresVectorAcknowledgment.punctType = "ENDBRACKET"; + } else if (text.equals(".")) { + featuresVectorAcknowledgment.punctType = "DOT"; + } else if (text.equals(",")) { + featuresVectorAcknowledgment.punctType = "COMMA"; + } else if (text.equals("-")) { + featuresVectorAcknowledgment.punctType = "HYPHEN"; + } else if (text.equals("\"") | text.equals("\'") | text.equals("`")) { + featuresVectorAcknowledgment.punctType = "QUOTE"; + } + + acknowledgment.append(featuresVectorAcknowledgment.printVector()); + + previousTag = tag; + previousText = text; + } + + return acknowledgment.toString(); + } } diff 
--git a/grobid-core/src/test/java/org/grobid/core/engines/AcknowledgmentParserTest.java b/grobid-core/src/test/java/org/grobid/core/engines/AcknowledgmentParserTest.java index c54ebb921c..5fa8985566 100644 --- a/grobid-core/src/test/java/org/grobid/core/engines/AcknowledgmentParserTest.java +++ b/grobid-core/src/test/java/org/grobid/core/engines/AcknowledgmentParserTest.java @@ -1,8 +1,7 @@ package org.grobid.core.engines; +import org.grobid.core.data.AcknowledgmentItem; import org.grobid.core.data.Acknowledgment; -import org.grobid.core.data.Date; -import org.grobid.core.document.TEIFormatter; import org.grobid.core.factory.AbstractEngineFactory; import org.junit.Before; import org.junit.BeforeClass; @@ -19,7 +18,8 @@ */ public class AcknowledgmentParserTest { - AcknowledgmentParser target; + AcknowledgmentParserOld targetOld; + AcknowledgmentParser targetNew; @BeforeClass public static void setInitialContext() throws Exception { @@ -28,57 +28,46 @@ public static void setInitialContext() throws Exception { @Before public void setUp() throws Exception { - target = new AcknowledgmentParser(); + targetOld = new AcknowledgmentParserOld(); + targetNew = new AcknowledgmentParser(); } @Test - public void processing_akwnowledgment1_shouldWork() throws Exception { - List output = target.processing("This research was supported by the Deutsche Forschungsgemeinschaft through the SFB 649 \"Economic Risk\". http://sfb649.wiwi.hu-berlin.de ISSN 1860-5664"); + public void processing_akwnowledgmentOld_shouldWork() throws Exception { + List output = targetOld.processing("This research was supported by the Deutsche Forschungsgemeinschaft through the SFB 649 \"Economic Risk\". 
http://sfb649.wiwi.hu-berlin.de ISSN 1860-5664"); final Acknowledgment acknowledgment = output.get(0); assertThat(acknowledgment.getFundingAgency(), is("the Deutsche Forschungsgemeinschaft")); assertThat(acknowledgment.getProjectName(), is("the SFB 649 \" Economic Risk \"")); + } + + @Test + public void processing_akwnowledgmentNew1_shouldWork() throws Exception { + List output = targetNew.processing("This research was supported by the Deutsche Forschungsgemeinschaft through the SFB 649 \"Economic Risk\". http://sfb649.wiwi.hu-berlin.de ISSN 1860-5664"); + AcknowledgmentItem acknowledgment = new AcknowledgmentItem(); - //System.out.println(output.get(0).getFundingAgency()); + acknowledgment = output.get(0); + + assertThat(acknowledgment.getText(), is("the Deutsche Forschungsgemeinschaft")); + assertThat(acknowledgment.getLabel(), is("fundingAgency")); } @Test - public void processing_akwnowledgment2_shouldWork() throws Exception { - List output = target.processing("This research was funded by Computational Science grant #635.000.014 from " + + public void processing_akwnowledgmentNew2_shouldWork() throws Exception { + List output = targetNew.processing("This research was funded by Computational Science grant #635.000.014 from " + "the Netherlands Organization for Scientific Research (NWO). " + "Mikas Vengris, Denitsa Grancharova and Rienk van Grondelle provided the data modeled in Section 5.6. Rob Koehorst, " + "Bart van Oort, Sergey Laptenok, Ton Visser and Herbert van Amerongen provided the data modeled in Section 6.3. " + "Joris Snellenburg is thanked for constructive comments on the text. Uwe Ligges and Martin Mächler collaborated " + "in the implementation of the nls options described in Section B. 
Achim Zeileis contributed helpful suggestions regarding the figures."); - /*for (Acknowledgment acknowledgment : output) { - if (acknowledgment.getAffiliation() != null){ - System.out.println(acknowledgment.getAffiliation()); - } - if (acknowledgment.getEducationalInstitution() != null){ - System.out.println(acknowledgment.getEducationalInstitution()); - } - if (acknowledgment.getFundingAgency() != null){ - System.out.println(acknowledgment.getFundingAgency()); - } - if (acknowledgment.getGrantName() != null){ - System.out.println(acknowledgment.getGrantName()); - } - if (acknowledgment.getGrantNumber() != null){ - System.out.println(acknowledgment.getGrantNumber()); - } - if (acknowledgment.getOtherInstitution() != null){ - System.out.println(acknowledgment.getOtherInstitution()); - } - if (acknowledgment.getProjectName() != null){ - System.out.println(acknowledgment.getProjectName()); - } - if (acknowledgment.getResearchInstitution() != null){ - System.out.println(acknowledgment.getResearchInstitution()); - } - if (acknowledgment.getIndividual() != null){ - System.out.println(acknowledgment.getIndividual()); - } - }*/ + assertThat(output.get(1).getText(), is("the Netherlands Organization for Scientific Research (NWO)")); + assertThat(output.get(1).getLabel(), is("fundingAgency")); + + assertThat(output.get(2).getText(), is("Mikas Vengris")); + assertThat(output.get(2).getLabel(), is("individual")); + + assertThat(output.get(3).getText(), is("Denitsa Grancharova")); + assertThat(output.get(3).getLabel(), is("individual")); } } \ No newline at end of file diff --git a/grobid-core/src/test/java/org/grobid/core/engines/EngineTest.java b/grobid-core/src/test/java/org/grobid/core/engines/EngineTest.java index 7e064c6c19..9847880dd3 100644 --- a/grobid-core/src/test/java/org/grobid/core/engines/EngineTest.java +++ b/grobid-core/src/test/java/org/grobid/core/engines/EngineTest.java @@ -5,7 +5,6 @@ import fr.limsi.wapiti.SWIGTYPE_p_mdl_t; import fr.limsi.wapiti.Wapiti; 
import org.apache.pdfbox.pdmodel.PDDocument; -import org.grobid.core.GrobidModels; import org.grobid.core.data.Acknowledgment; import org.grobid.core.data.BibDataSet; import org.grobid.core.data.BiblioItem; @@ -674,7 +673,7 @@ public void run() { public void testAcknowledgmentParser() throws Exception { String acknowledgmentText = "Acknowledgements. We thank E. Brockmann and two anonymous reviewers " + "for their helpful reviews. This work was supported by the Centre National de la Recherche Scientifique (CNRS-INSU)"; - List acknowledgmentList = new AcknowledgmentParser().processing(acknowledgmentText); + List acknowledgmentList = new AcknowledgmentParserOld().processing(acknowledgmentText); assertThat(acknowledgmentList.size(), is(1)); assertThat(acknowledgmentList.get(0).getIndividual(), is("E . Brockmann")); diff --git a/grobid-service/src/main/java/org/grobid/service/process/GrobidRestProcessString.java b/grobid-service/src/main/java/org/grobid/service/process/GrobidRestProcessString.java index 816720d36c..7d24db2a56 100644 --- a/grobid-service/src/main/java/org/grobid/service/process/GrobidRestProcessString.java +++ b/grobid-service/src/main/java/org/grobid/service/process/GrobidRestProcessString.java @@ -210,15 +210,18 @@ public Response processAcknowledgments(String acknowledgment) { engine = Engine.getEngine(true); acknowledgment = acknowledgment.replaceAll("\\t", " "); - List acknowledgmentList = engine.processAcknowledgment(acknowledgment); + List acknowledgmentList = engine.processAcknowledgment(acknowledgment); if (acknowledgmentList != null) { - for(Acknowledgment ack : acknowledgmentList) { + retVal = ""; + for(AcknowledgmentItem ack : acknowledgmentList) { if (retVal == null) { retVal = ""; + }else { + retVal += ack.toTEI(); } - retVal += ack.toTEI(); } + retVal += ""; } if (GrobidRestUtils.isResultNullOrEmpty(retVal)) { response = Response.status(Status.NO_CONTENT).build();