Skip to content

Commit

Permalink
adapt acknowledgmentParser with a new method using TokenClusteror, fi…
Browse files Browse the repository at this point in the history
…x all class linked with it, fix the test class as well
  • Loading branch information
tantikristanti committed Oct 23, 2019
1 parent b1c8152 commit 21a4ef8
Show file tree
Hide file tree
Showing 12 changed files with 686 additions and 888 deletions.
33 changes: 0 additions & 33 deletions grobid-core/src/main/java/org/grobid/core/data/Acknow.java

This file was deleted.

65 changes: 0 additions & 65 deletions grobid-core/src/main/java/org/grobid/core/data/Acknowledgment.java
Original file line number Diff line number Diff line change
Expand Up @@ -243,70 +243,5 @@ public String toTEI(){
return tei.toString();
}



/**
 * Serializes this acknowledgment as a {@code <listAcknowledment>} fragment.
 *
 * The opening tag is preceded by {@code indent} tab characters and followed by a
 * space, the optional coordinates attribute text, and {@code indent + 2} further tabs.
 * One child element is then written for every non-null entity field, in a fixed order,
 * with its text passed through {@link TextUtilities#HTMLEncode}. Everything after the
 * opening indentation is emitted on a single line, terminated by a newline after the
 * closing tag.
 *
 * @param n      not used by this method
 * @param indent number of tab characters written before the opening tag and
 *               before the closing tag
 * @param config analysis configuration; read to decide whether the coordinates
 *               attribute is emitted for {@code listAcknowledment}
 * @return the serialized fragment
 * @throws GrobidException wrapping any exception raised while building the fragment
 */
public String toTEI(int n, int indent, GrobidAnalysisConfig config) {
    StringBuilder out = new StringBuilder();
    // The value is not used below; the read is kept outside the try block so a failure
    // here is not wrapped into a GrobidException.
    boolean generateIDs = config.isGenerateTeiIds();
    try {
        for (int remaining = indent; remaining > 0; remaining--) {
            out.append("\t");
        }
        out.append("<listAcknowledment>");

        boolean withCoords = false;
        if (config.getGenerateTeiCoordinates() != null) {
            withCoords = config.getGenerateTeiCoordinates().contains("listAcknowledment");
        }
        out.append(" ");
        if (withCoords) {
            out.append(TEIFormatter.getCoordsAttribute(coordinates, withCoords)).append(" ");
        }

        for (int remaining = indent + 2; remaining > 0; remaining--) {
            out.append("\t");
        }

        // Element name / value pairs, listed in the order in which they are written.
        String[][] elements = {
            {"affiliation", affiliation},
            {"educationalInstitution", educationalInstitution},
            {"fundingAgency", fundingAgency},
            {"grantName", grantName},
            {"grantNumber", grantNumber},
            {"individual", individual},
            {"otherInstitution", otherInstitution},
            {"projectName", projectName},
            {"researchInstitution", researchInstitution}
        };
        for (String[] element : elements) {
            String tag = element[0];
            String value = element[1];
            if (value == null) {
                continue; // fields that were never set produce no element
            }
            out.append("<").append(tag).append(">")
                .append(TextUtilities.HTMLEncode(value))
                .append("</").append(tag).append(">");
        }

        for (int remaining = indent; remaining > 0; remaining--) {
            out.append("\t");
        }
        out.append("</listAcknowledment>\n");
    } catch (Exception e) {
        throw new GrobidException("Cannot convert bibliographical item into a TEI, " +
            "because of nested exception.", e);
    }

    return out.toString();
}
}

Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
package org.grobid.core.data;

import org.grobid.core.utilities.TextUtilities;

/**
 * Holder used for storing and exchanging the information of one acknowledgment item:
 * its text, its type label (affiliation, educational institution, individual, etc.)
 * and its bounding box coordinates.
 *
 * Created by Tanti, 2019
 */
public class AcknowledgmentItem {
    String label;
    String text;
    String coords;

    /** @return the type label of this item, or null when none has been set */
    public String getLabel() {
        return this.label;
    }

    /** @param label the type label to store for this item */
    public void setLabel(String label) {
        this.label = label;
    }

    /** @return the text of this item, or null when none has been set */
    public String getText() {
        return this.text;
    }

    /** @param text the text to store for this item */
    public void setText(String text) {
        this.text = text;
    }

    /** @return the coordinates string of this item, or null when none has been set */
    public String getCoords() {
        return this.coords;
    }

    /** @param coords the coordinates string to store for this item */
    public void setCoords(String coords) {
        this.coords = coords;
    }

    /**
     * Renders this item as one element named after its label, with the text passed
     * through {@link TextUtilities#HTMLEncode} as the element content.
     *
     * @return the element as a string, or null when the label is not set
     */
    public String toTEI() {
        // Without a label there is no element name to emit.
        if (label == null) {
            return null;
        }
        String encodedText = TextUtilities.HTMLEncode(text);
        return "<" + label + ">" + encodedText + "</" + label + ">";
    }
}
127 changes: 42 additions & 85 deletions grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
import org.grobid.core.utilities.*;
import org.grobid.core.utilities.matching.EntityMatcherException;
import org.grobid.core.utilities.matching.ReferenceMarkerMatcher;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

Expand Down Expand Up @@ -60,19 +61,6 @@ public class TEIFormatter {
TaggingLabels.TABLE_MARKER,
TaggingLabels.EQUATION_MARKER);

// Labels for the acknowledgment parser: the general ACKNOWLEDGMENT label followed by
// the acknowledgment-specific labels (affiliation, educational institution, funding
// agency, grant name, grant number, individual, other institution, project name,
// research institution). The "AKNOWLEDGMENT_" prefix is spelled exactly as the
// constants are referenced from TaggingLabels.
public static final Set<TaggingLabel> ACKNOWLEDGMENT_LABELS = Sets.newHashSet(
TaggingLabels.ACKNOWLEDGMENT,
TaggingLabels.AKNOWLEDGMENT_AFFILIATION,
TaggingLabels.AKNOWLEDGMENT_EDUCATIONAL_INSTITUTION,
TaggingLabels.AKNOWLEDGMENT_FUNDING_AGENCY,
TaggingLabels.AKNOWLEDGMENT_GRANT_NAME,
TaggingLabels.AKNOWLEDGMENT_GRANT_NUMBER,
TaggingLabels.AKNOWLEDGMENT_INDIVIDUAL,
TaggingLabels.AKNOWLEDGMENT_OTHER_INSTITUTION,
TaggingLabels.AKNOWLEDGMENT_PROJECT_NAME,
TaggingLabels.AKNOWLEDGMENT_RESEARCH_INSTITUTION);

// possible association to Grobid customised TEI schemas: DTD, XML schema, RelaxNG or compact RelaxNG
// DEFAULT means no schema association in the generated XML documents
public enum SchemaDeclaration {
Expand Down Expand Up @@ -1119,31 +1107,34 @@ public StringBuilder toTEIAcknowledgement(StringBuilder buffer,
List<BibDataSet> bds,
GrobidAnalysisConfig config) throws Exception {
AcknowledgmentParser acknowledgmentParser = new AcknowledgmentParser();
Map<String, List<String>> ackInMap = new HashMap<>();
List<Acknowledgment> resultAcknowledgment = new ArrayList<>();

if ((reseAcknowledgement == null) || (tokenizationsAcknowledgement == null)) {
return buffer;
}

buffer.append("\n\t\t\t<div type=\"acknowledgement\">\n");

StringBuilder buffer2 = new StringBuilder();
StringBuilder buffer2 = new StringBuilder(), buffer3 = new StringBuilder();
StringBuilder resultAcknow = new StringBuilder();

// get the acknowledgment sections together with other results in a whole document
buffer2 = toTEITextPiece(buffer2, reseAcknowledgement, null, bds, false,
new LayoutTokenization(tokenizationsAcknowledgement), null, null, null, doc, config);
String acknowResult = buffer2.toString();
String acknowWholeText = buffer2.toString();

// take only acknowledgment section content
buffer3 = takeOnlyAcknowledgmentText(buffer3, reseAcknowledgement,
new LayoutTokenization(tokenizationsAcknowledgement));
String acknowOnlyText = buffer3.toString();

// call the acknowledgment parser
resultAcknowledgment = acknowledgmentParser.processing(acknowResult);
List<AcknowledgmentItem> resultAcknowledgment = acknowledgmentParser.processing(acknowOnlyText);

// get the coordinates for acknowledgment results
List<Acknow> listAcknow = listAcknowledgmentResult(resultAcknowledgment);
getCoordAcknowledgment(listAcknow, tokenizationsAcknowledgement);
getCoordAcknowledgment(resultAcknowledgment, tokenizationsAcknowledgement);

// put the results together with the raw text
resultAcknow = markReferencesTEIAcknowledgment(acknowResult, listAcknow);
resultAcknow = markReferencesTEIAcknowledgment(acknowWholeText, resultAcknowledgment);
if (resultAcknow != null) {
String resultAcknowInString = resultAcknow.toString();

Expand All @@ -1161,78 +1152,44 @@ public StringBuilder toTEIAcknowledgement(StringBuilder buffer,
return buffer;
}

public List<Acknow> listAcknowledgmentResult(List<Acknowledgment> listAck) {
List<Acknow> acklist = new ArrayList<>();
public StringBuilder takeOnlyAcknowledgmentText(StringBuilder buffer,
String reseAcknowledgement,
LayoutTokenization layoutTokenization) {
List<LayoutToken> tokenizations = layoutTokenization.getTokenization();

for (Acknowledgment acknowledgment : listAck) {
Acknow acknow = null;
if (acknowledgment.getAffiliation() != null) {
acknow = new Acknow();
acknow.setText(acknowledgment.getAffiliation());
acknow.setLabel("affiliation");
acklist.add(acknow);
}
if (acknowledgment.getEducationalInstitution() != null) {
acknow = new Acknow();
acknow.setText(acknowledgment.getEducationalInstitution());
acknow.setLabel("educationalInstitution");
acklist.add(acknow);
}
if (acknowledgment.getFundingAgency() != null) {
acknow = new Acknow();
acknow.setText(acknowledgment.getFundingAgency());
acknow.setLabel("fundingAgency");
acklist.add(acknow);
}
if (acknowledgment.getGrantName() != null) {
acknow = new Acknow();
acknow.setText(acknowledgment.getGrantName());
acknow.setLabel("grantName");
acklist.add(acknow);
}
if (acknowledgment.getGrantNumber() != null) {
acknow = new Acknow();
acknow.setText(acknowledgment.getGrantNumber());
acknow.setLabel("grantNumber");
acklist.add(acknow);
}
if (acknowledgment.getOtherInstitution() != null) {
acknow = new Acknow();
acknow.setText(acknowledgment.getOtherInstitution());
acknow.setLabel("otherInstitution");
acklist.add(acknow);
}
if (acknowledgment.getProjectName() != null) {
acknow = new Acknow();
acknow.setText(acknowledgment.getProjectName());
acknow.setLabel("projectName");
acklist.add(acknow);
}
if (acknowledgment.getResearchInstitution() != null) {
acknow = new Acknow();
acknow.setText(acknowledgment.getResearchInstitution());
acknow.setLabel("researchInstitution");
acklist.add(acknow);
TaggingTokenClusteror clusteror = new TaggingTokenClusteror(GrobidModels.FULLTEXT, reseAcknowledgement, tokenizations);
String tokenLabel = null;
List<TaggingTokenCluster> clusters = clusteror.cluster();

for (TaggingTokenCluster cluster : clusters) {
if (cluster == null) {
continue;
}
if (acknowledgment.getIndividual() != null) {
acknow = new Acknow();
acknow.setText(acknowledgment.getIndividual());
acknow.setLabel("individual");
acklist.add(acknow);

TaggingLabel clusterLabel = cluster.getTaggingLabel();

Engine.getCntManager().i(clusterLabel);
if (clusterLabel.equals(TaggingLabels.SECTION)) {
String clusterContent = LayoutTokensUtil.normalizeDehyphenizeText(cluster.concatTokens());
buffer.append(clusterContent);
} else if (clusterLabel.equals(TaggingLabels.PARAGRAPH)) {
String clusterContent = LayoutTokensUtil.normalizeDehyphenizeText(cluster.concatTokens());
buffer.append(clusterContent);
}
buffer.append("\n");
}
return acklist;
return buffer;
}

public StringBuilder markReferencesTEIAcknowledgment(String acknowText, List<Acknow> acknowList) {
public StringBuilder markReferencesTEIAcknowledgment(String acknowText, List<AcknowledgmentItem> acknowledgmentItemList) {
String acknowWholeText = acknowText;
StringBuilder result = new StringBuilder();

for (Acknow acknow : acknowList) {
for (AcknowledgmentItem acknowledgmentItem : acknowledgmentItemList) {
// text information
String text = acknow.getText();
String label = acknow.getLabel();
String coords = acknow.getCoords();
String text = acknowledgmentItem.getText();
String label = acknowledgmentItem.getLabel();
String coords = acknowledgmentItem.getCoords();

int lengTextAckPart = text.length();
int lengTextWhole = acknowWholeText.length();
Expand All @@ -1257,8 +1214,8 @@ public StringBuilder markReferencesTEIAcknowledgment(String acknowText, List<Ack
return result;
}

public void getCoordAcknowledgment(List<Acknow> listAcknow, List<LayoutToken> tokenizationsAcknowledgement) {
for (Acknow ack : listAcknow) { // iterate through all acknowledgment text as results of acknowledgment parser
public void getCoordAcknowledgment(List<AcknowledgmentItem> listAcknowledgmentItem, List<LayoutToken> tokenizationsAcknowledgement) {
for (AcknowledgmentItem ack : listAcknowledgmentItem) { // iterate through all acknowledgment text as results of acknowledgment parser
List<LayoutToken> layaoutTokenRelevant = new ArrayList<>();
List<LayoutToken> tokenListToBeFound = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(ack.getText());
int lastIdx = 0;
Expand Down
Loading

0 comments on commit 21a4ef8

Please sign in to comment.