Skip to content

Commit

Permalink
batch funder consolidation
Browse files Browse the repository at this point in the history
  • Loading branch information
kermitt2 committed Aug 18, 2023
1 parent c00384d commit 12996fa
Show file tree
Hide file tree
Showing 2 changed files with 83 additions and 1 deletion.
Original file line number Diff line number Diff line change
Expand Up @@ -180,9 +180,17 @@ public StringBuilder toTEIHeader(BiblioItem biblio,
}
}

List<Funder> localFunders = new ArrayList<>();
for (Map.Entry<Funder, List<Funding>> entry : fundingRelation.entrySet()) {
localFunders.add(entry.getKey());
}

Map<Integer,Funder> consolidatedFunders = Consolidation.getInstance().consolidateFunders(localFunders);

int n =0;
for (Map.Entry<Funder, List<Funding>> entry : fundingRelation.entrySet()) {
Funder consolidatedFunder = Consolidation.getInstance().consolidateFunder(entry.getKey());
String funderPiece = null;
Funder consolidatedFunder = consolidatedFunders.get(n);
if (consolidatedFunder != null)
funderPiece = consolidatedFunder.toTEI(4);
else
Expand All @@ -200,6 +208,7 @@ public StringBuilder toTEIHeader(BiblioItem biblio,
funderPiece = funderPiece.replace("<funder>", "<funder ref=\"" + referenceString.trim() + "\">");
tei.append(funderPiece);
}
n++;
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -626,4 +626,77 @@ public void onError(int status, String message, Exception exception) {
return results.get(0);
}

/**
 * Consolidate a batch of funders against the funder lookup service in one pass.
 *
 * One "funders" request is pushed per input funder, tagged with the funder's
 * position (rank) in the input list. Each response is post-validated by comparing
 * candidate names with the query, and the first accepted candidate is stored
 * under that rank.
 *
 * A funder that cannot be queried (null entry, missing or empty name, or a name
 * that is empty once stopwords are removed) is skipped: its rank stays mapped to
 * null and the rest of the batch is still processed.
 *
 * @param funders the funders to consolidate, in the order the caller will use
 *                to read the result back
 * @return a map from input position (0-based) to the consolidated funder, with a
 *         null value when no candidate was accepted for that position; or null
 *         when the input list is null or empty
 */
public Map<Integer,Funder> consolidateFunders(List<Funder> funders) {
    if (CollectionUtils.isEmpty(funders))
        return null;

    final Map<Integer,Funder> results = new HashMap<Integer,Funder>();
    // Register every rank up front so the listeners below only ever overwrite
    // the value of an existing key and never add a new one.
    // NOTE(review): listeners presumably run on client worker threads; confirm that
    // client.finish() gives the visibility guarantee needed for this plain HashMap.
    for (int i = 0; i < funders.size(); i++) {
        results.put(i, null);
    }

    long threadId = Thread.currentThread().getId();
    int rank = -1;
    for (Funder funder : funders) {
        rank++;

        // A funder without a usable name cannot be queried: skip this one only,
        // instead of discarding the whole batch and leaving pushed requests unfinished.
        if (funder == null)
            continue;
        String fullName = funder.getFullName();
        if (fullName == null || fullName.length() == 0)
            continue;

        // CrossRef does not manage stopwords in funder search and has no usable term frequency, so we need
        // to remove basic stopwords in the query to have something manageable from CrossRef
        final String query = TextUtilities.removeFieldStopwords(fullName);
        if (query == null || query.length() == 0)
            continue;

        Map<String, String> arguments = new HashMap<String,String>();
        arguments.put("query", query);
        arguments.put("rows", "10"); // we request the top-10 results, because there are a lot of noise
        // and we need many candidates in the pairwise comparison step

        try {
            client.pushRequest("funders", arguments, funderDeserializer, threadId, new CrossrefRequestListener<Funder>(rank) {
                @Override
                public void onSuccess(List<Funder> res) {
                    if (res == null || res.isEmpty())
                        return;

                    // Post-check the candidates in a pairwise comparison in order to avoid
                    // false positives. Glutton integrates its own post-validation, but with
                    // CrossRef this step is mandatory.
                    for (Funder candidate : res) {
                        if (candidate == null || candidate.getFullName() == null)
                            continue;

                        String candidateName = TextUtilities.removeFieldStopwords(candidate.getFullName());
                        // The first candidate that matches exactly (ignoring case) or is
                        // similar enough is kept; later candidates are not considered.
                        if (candidateName.equalsIgnoreCase(query)
                                || ratcliffObershelpDistance(candidateName, query, false) > 0.9) {
                            results.put(Integer.valueOf(getRank()), candidate);
                            return;
                        }
                    }
                }

                @Override
                public void onError(int status, String message, Exception exception) {
                    LOGGER.info("Funder consolidation service returns error ("+status+") : "+message, exception);
                }
            });
        } catch(Exception e) {
            // Best effort: a failure on one funder must not prevent the others from being consolidated.
            LOGGER.info("Funder consolidation error - ", e);
        }
    }

    // Always reached once at least one funder was supplied, so every pushed request is finished.
    client.finish(threadId);
    return results;
}

}

0 comments on commit 12996fa

Please sign in to comment.