Skip to content

Commit

Permalink
prefix Catalogue of Life ids with their datasetKey as suggested by @m…
Browse files Browse the repository at this point in the history
  • Loading branch information
Jorrit Poelen committed Aug 14, 2023
1 parent 2c355fb commit 8bcdd64
Show file tree
Hide file tree
Showing 6 changed files with 56,960 additions and 31 deletions.
Original file line number Diff line number Diff line change
@@ -1,19 +1,11 @@
package org.eol.globi.taxon;

import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.eol.globi.domain.NameType;
import org.eol.globi.domain.TaxonImpl;
import org.eol.globi.domain.Term;
import org.eol.globi.domain.TermImpl;
import org.eol.globi.service.NameSuggester;
import org.eol.globi.service.PropertyEnricherException;
import org.globalbioticinteractions.nomer.util.TermMatcherContext;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.ArrayList;
import java.util.List;
import java.util.stream.Stream;

public abstract class TaxonNameSuggestorBase implements TermMatcher {

Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
package org.globalbioticinteractions.nomer.match;

import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.RegExUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.time.StopWatch;
Expand All @@ -26,12 +27,18 @@
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URI;
import java.nio.charset.StandardCharsets;
import java.util.Iterator;
import java.util.Map;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.function.Supplier;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class CatalogueOfLifeTaxonService extends CommonStringTaxonService {
private static final Logger LOG = LoggerFactory.getLogger(CatalogueOfLifeTaxonService.class);
private static final String DATASET_KEY = "datasetKey";

private boolean reverseSorted;


Expand Down Expand Up @@ -63,16 +70,20 @@ protected void lazyInit() throws PropertyEnricherException {
&& db.exists(CHILD_PARENT)
&& db.exists(MERGED_NODES)
&& db.exists(NAME_TO_NODE_IDS)
&& db.exists(DATASET_KEY)
) {
LOG.debug("[Catalogue of Life] taxonomy already indexed at [" + taxonomyDir.getAbsolutePath() + "], no need to import.");
nodes = db.getTreeMap(NODES);
childParent = db.getTreeMap(CHILD_PARENT);
mergedNodes = db.getTreeMap(MERGED_NODES);
name2nodeIds = db.getTreeMap(NAME_TO_NODE_IDS);
datasetKey = db.getAtomicLong(DATASET_KEY);
} else {
LOG.info("[" + getTaxonomyProvider().name() + "] taxonomy importing...");
StopWatch watch = new StopWatch();
watch.start();
indexDatasetKey(db);

if (reverseSorted) {
LOG.info("indexing taxon names...");
nodes = populateNodes(db, watch);
Expand All @@ -87,8 +98,10 @@ protected void lazyInit() throws PropertyEnricherException {
watch.reset();
watch.start();
} else {

try (InputStream resource = getNameUsageStream()) {


nodes = db
.createTreeMap(NODES)
.keySerializer(BTreeKeySerializer.STRING)
Expand All @@ -114,7 +127,11 @@ protected void lazyInit() throws PropertyEnricherException {
.valueSerializer(Serializer.STRING)
.make();

NameUsageListener nameUsageListener = new NameUsageListenerImpl(mergedNodes, nodes, childParent);
NameUsageListener nameUsageListener = new NameUsageListenerImpl(
mergedNodes,
nodes,
childParent
);
parseNameUsage(resource, nameUsageListener);
} catch (IOException e) {
throw new PropertyEnricherException("failed to parse taxon", e);
Expand All @@ -129,6 +146,34 @@ protected void lazyInit() throws PropertyEnricherException {
}
}

/**
 * Reads the metadata resource configured via {@code nomer.col.metadata.url},
 * takes the first line of the form {@code key: <digits>} and stores that number
 * in the {@code DATASET_KEY} atomic long of the provided database.
 *
 * @param db database in which the dataset key is persisted
 * @throws PropertyEnricherException if the metadata location is not configured,
 *                                   the resource is missing or unreadable, or no
 *                                   dataset key line is present
 */
private void indexDatasetKey(DB db) throws PropertyEnricherException {
    final String propertyValue = getCtx().getProperty("nomer.col.metadata.url");
    if (StringUtils.isBlank(propertyValue)) {
        // fail with a descriptive error instead of a bare NullPointerException from URI.create(null)
        throw new PropertyEnricherException("no metadata location configured for [nomer.col.metadata.url]");
    }

    final Pattern keyPattern = Pattern.compile("^key:[ ]+(?<datasetKey>[0-9]+)$");
    try (InputStream resource = getCtx().retrieve(URI.create(propertyValue))) {
        if (resource == null) {
            throw new PropertyEnricherException("failed to locate metadata at [" + propertyValue + "]");
        }
        BufferedReader reader = IOUtils.toBufferedReader(new InputStreamReader(resource, StandardCharsets.UTF_8));
        // match every line only once and keep the successful matcher around
        long key = reader
                .lines()
                .map(keyPattern::matcher)
                .filter(Matcher::matches)
                .findFirst()
                .map(matcher -> Long.parseLong(matcher.group("datasetKey")))
                .orElseThrow(() -> new PropertyEnricherException("failed to locate dataset key in [" + propertyValue + "]"));
        datasetKey = db
                .createAtomicLong(DATASET_KEY, -1L);
        datasetKey.set(key);
    } catch (IOException | RuntimeException e) {
        // covers I/O failures, malformed URIs and unparsable keys; JVM errors are no longer swallowed,
        // and the "failed to locate" exceptions above propagate unwrapped
        throw new PropertyEnricherException("failed to read metadata at [" + propertyValue + "]", e);
    }
}

private BTreeMap<String, Map<String, String>> populateNodes(DB db, StopWatch watch) throws PropertyEnricherException {
BTreeMap<String, Map<String, String>> nodes;
InputStream is = getNameUsageStream();
Expand Down Expand Up @@ -332,14 +377,14 @@ private void parseNameUsage(InputStream resource, NameUsageListener nameUsageLis
private void parseLine(NameUsageListener nameUsageListener, String line) {
String[] rowValues = StringUtils.splitByWholeSeparatorPreserveAllTokens(line, "\t");
if (rowValues.length > 8) {
String taxId = rowValues[0];
String parentTaxId = rowValues[2];
String taxId = prefixIdWithDatasetKey(rowValues[0]);
String parentTaxId = prefixIdWithDatasetKey(rowValues[2]);
String status = rowValues[4];
String completeName = RegExUtils.replaceAll(rowValues[5], "[ ]+\\(.*\\)[ ]+", " ");
String authorship = rowValues[6];
String rank = rowValues[7];

String idPrefix = getTaxonomyProvider().getIdPrefix();
String idPrefix = getIdPrefix();
TaxonImpl taxon = new TaxonImpl(completeName, idPrefix + taxId);
if (StringUtils.isNoneBlank(authorship)) {
taxon.setAuthorship(StringUtils.trim(authorship));
Expand All @@ -351,6 +396,13 @@ private void parseLine(NameUsageListener nameUsageListener, String line) {
}
}

/**
 * Qualifies a raw identifier with the current dataset key, yielding
 * {@code <datasetKey>:<rowValue>}.
 *
 * @param rowValue raw identifier as read from a row
 * @return the qualified identifier, or {@code rowValue} unchanged when it is
 * blank or when no dataset key is available
 */
private String prefixIdWithDatasetKey(String rowValue) {
    if (datasetKey == null || StringUtils.isBlank(rowValue)) {
        return rowValue;
    }
    return datasetKey.get() + ":" + rowValue;
}


/**
 * Sets the {@code reverseSorted} flag of this service.
 *
 * @param reverseSorted new value of the flag
 */
public void setReverseSorted(boolean reverseSorted) {
this.reverseSorted = reverseSorted;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
import org.eol.globi.taxon.TermMatchListener;
import org.eol.globi.taxon.TermMatcher;
import org.globalbioticinteractions.nomer.util.TermMatcherContext;
import org.mapdb.Atomic;
import org.mapdb.BTreeMap;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
Expand Down Expand Up @@ -42,6 +43,7 @@ public abstract class CommonTaxonService<T> extends PropertyEnricherSimple imple
BTreeMap<T, Map<String, String>> nodes;
BTreeMap<T, T> childParent;
BTreeMap<String, List<T>> name2nodeIds;
Atomic.Long datasetKey;


public CommonTaxonService(TermMatcherContext ctx) {
Expand Down Expand Up @@ -175,9 +177,14 @@ private void registerRelation(TermMatchListener termMatchListener, String name,
}

/**
 * Tells whether the provided identifier belongs to the id scheme of this service.
 *
 * @param externalId identifier to inspect
 * @return {@code true} if {@code externalId} starts with {@link #getIdPrefix()}
 */
protected boolean isIdSchemeSupported(String externalId) {
    return StringUtils.startsWith(externalId, getIdPrefix());
}

/**
 * Provides the prefix used for identifiers handled by this service.
 *
 * @return the id prefix reported by {@code getTaxonomyProvider()}
 */
protected String getIdPrefix() {
return getTaxonomyProvider().getIdPrefix();
}


@Override
public Map<String, String> enrich(Map<String, String> toBeEnriched) throws PropertyEnricherException {
checkInit();
Expand Down Expand Up @@ -406,7 +413,7 @@ private void resolveHierarchyIfNeeded(T focalTaxonKey, Map<T, T> childParent, Ma
visitedParents.add(focalTaxonKey);
while (parent != null
&& !visitedParents.contains(parent)
&& !pathIds.contains(primaryTaxonProvider.getIdPrefix() + parent)) {
&& !pathIds.contains(getIdPrefix() + parent)) {
Map<String, String> parentTaxonProperties = nodes.get(parent);
if (parentTaxonProperties != null) {
Taxon parentTaxon = TaxonUtil.mapToTaxon(parentTaxonProperties);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,21 +25,23 @@ public void enrichById() throws PropertyEnricherException {
@Test
public void enrichByIdReverseSorted() throws PropertyEnricherException {
CatalogueOfLifeTaxonService service = createService(
"/org/globalbioticinteractions/nomer/match/col/NameUsageReverseSorted.tsv");
"/org/globalbioticinteractions/nomer/match/col/NameUsageReverseSorted.tsv",
"/org/globalbioticinteractions/nomer/match/col/metadata.yaml"
);
service.setReverseSorted(true);

assertEnrichById(service);
}

/**
 * Enriches a taxon known only by the id {@code COL:9916:63MJH} and verifies the
 * resolved id, name, rank and path information.
 *
 * @param service service under test
 * @throws PropertyEnricherException if enrichment fails
 */
public void assertEnrichById(CatalogueOfLifeTaxonService service) throws PropertyEnricherException {
    TaxonImpl taxon = new TaxonImpl(null, "COL:9916:63MJH");
    Map<String, String> enriched = service.enrich(TaxonUtil.taxonToMap(taxon));

    // parse the enriched properties once instead of once per assertion
    Taxon enrichedTaxon = TaxonUtil.mapToTaxon(enriched);
    assertThat(enrichedTaxon.getExternalId(), is("COL:9916:63MJH"));
    assertThat(enrichedTaxon.getName(), is("Phryganella"));
    assertThat(enrichedTaxon.getRank(), is("genus"));
    assertThat(enrichedTaxon.getPath(), is("Phryganellidae | Phryganella"));
    assertThat(enrichedTaxon.getPathIds(), is("COL:9916:625ZT | COL:9916:63MJH"));
    assertThat(enrichedTaxon.getPathNames(), is("family | genus"));
}

Expand All @@ -50,11 +52,11 @@ public void enrichByName() throws PropertyEnricherException {
Taxon phryganella = new TaxonImpl("Phryganella", null);
Map<String, String> enriched = service.enrich(TaxonUtil.taxonToMap(phryganella));

assertThat(TaxonUtil.mapToTaxon(enriched).getExternalId(), is("COL:63MJH"));
assertThat(TaxonUtil.mapToTaxon(enriched).getExternalId(), is("COL:9916:63MJH"));
assertThat(TaxonUtil.mapToTaxon(enriched).getName(), is("Phryganella"));
assertThat(TaxonUtil.mapToTaxon(enriched).getRank(), is("genus"));
assertThat(TaxonUtil.mapToTaxon(enriched).getPath(), is("Phryganellidae | Phryganella"));
assertThat(TaxonUtil.mapToTaxon(enriched).getPathIds(), is("COL:625ZT | COL:63MJH"));
assertThat(TaxonUtil.mapToTaxon(enriched).getPathIds(), is("COL:9916:625ZT | COL:9916:63MJH"));
assertThat(TaxonUtil.mapToTaxon(enriched).getPathNames(), is("family | genus"));
}

Expand All @@ -65,27 +67,27 @@ public void enrichByNameWitIgnoreSubgenus() throws PropertyEnricherException {
Taxon phryganella = new TaxonImpl("Pteronotus macleayii", null);
Map<String, String> enriched = service.enrich(TaxonUtil.taxonToMap(phryganella));

assertThat(TaxonUtil.mapToTaxon(enriched).getExternalId(), is("COL:7WP8W"));
assertThat(TaxonUtil.mapToTaxon(enriched).getExternalId(), is("COL:9916:7WP8W"));
assertThat(TaxonUtil.mapToTaxon(enriched).getName(), is("Pteronotus macleayii"));
assertThat(TaxonUtil.mapToTaxon(enriched).getRank(), is("species"));
assertThat(TaxonUtil.mapToTaxon(enriched).getPath(), is("Pteronotus | Chilonycteris | Pteronotus macleayii"));
assertThat(TaxonUtil.mapToTaxon(enriched).getPathIds(), is("COL:74SW | COL:8P3CB | COL:7WP8W"));
assertThat(TaxonUtil.mapToTaxon(enriched).getPathIds(), is("COL:9916:74SW | COL:9916:8P3CB | COL:9916:7WP8W"));
assertThat(TaxonUtil.mapToTaxon(enriched).getPathNames(), is("genus | subgenus | species"));
}

/**
 * Enriches by the id {@code COL:9916:4BP2T} and verifies that the result is the
 * taxon {@code COL:9916:6TH9B} (Ozyptila yosemitica).
 */
@Test
public void enrichBySynonymId() throws PropertyEnricherException {
    CatalogueOfLifeTaxonService service = createService();

    String externalId = "COL:9916:4BP2T";
    Map<String, String> enriched = service.enrich(TaxonUtil.taxonToMap(new TaxonImpl(null, externalId)));

    // parse the enriched properties once instead of once per assertion
    Taxon enrichedTaxon = TaxonUtil.mapToTaxon(enriched);
    assertThat(enrichedTaxon.getExternalId(), is("COL:9916:6TH9B"));
    assertThat(enrichedTaxon.getPath(), is("Ozyptila yosemitica"));
    assertThat(enrichedTaxon.getName(), is("Ozyptila yosemitica"));
    assertThat(enrichedTaxon.getAuthorship(), is("Schick, 1965"));
    assertThat(enrichedTaxon.getRank(), is("species"));
    assertThat(enrichedTaxon.getPathIds(), is("COL:9916:6TH9B"));
    assertThat(enrichedTaxon.getPathNames(), is("species"));
}

Expand All @@ -96,20 +98,23 @@ public void enrichBySynonymName() throws PropertyEnricherException {
TaxonImpl taxon = new TaxonImpl("Ozyptila schusteri", null);
Map<String, String> enriched = service.enrich(TaxonUtil.taxonToMap(taxon));

assertThat(TaxonUtil.mapToTaxon(enriched).getExternalId(), is("COL:6TH9B"));
assertThat(TaxonUtil.mapToTaxon(enriched).getExternalId(), is("COL:9916:6TH9B"));
assertThat(TaxonUtil.mapToTaxon(enriched).getPath(), is("Ozyptila yosemitica"));
assertThat(TaxonUtil.mapToTaxon(enriched).getName(), is("Ozyptila yosemitica"));
assertThat(TaxonUtil.mapToTaxon(enriched).getAuthorship(), is("Schick, 1965"));
assertThat(TaxonUtil.mapToTaxon(enriched).getRank(), is("species"));
assertThat(TaxonUtil.mapToTaxon(enriched).getPathIds(), is("COL:6TH9B"));
assertThat(TaxonUtil.mapToTaxon(enriched).getPathIds(), is("COL:9916:6TH9B"));
assertThat(TaxonUtil.mapToTaxon(enriched).getPathNames(), is("species"));
}

/**
 * Creates a service backed by the default name usage and metadata test fixtures.
 *
 * @return service configured with {@code NameUsage.tsv} and {@code metadata.yaml}
 */
private CatalogueOfLifeTaxonService createService() {
    return createService(
            "/org/globalbioticinteractions/nomer/match/col/NameUsage.tsv",
            "/org/globalbioticinteractions/nomer/match/col/metadata.yaml"
    );
}

private CatalogueOfLifeTaxonService createService(final String nameUrl) {
private CatalogueOfLifeTaxonService createService(final String nameUrl, final String metaDataUrl) {
return new CatalogueOfLifeTaxonService(new TermMatcherContextClasspath() {
@Override
public String getCacheDir() {
Expand All @@ -126,6 +131,7 @@ public String getProperty(String key) {
return new TreeMap<String, String>() {
{
put("nomer.col.name_usage.url", nameUrl);
put("nomer.col.metadata.url", metaDataUrl);
}
}.get(key);
}
Expand Down
Loading

0 comments on commit 8bcdd64

Please sign in to comment.