Skip to content

Commit

Permalink
Some improvements for the 'organism' and 'datasource' index/search fi…
Browse files Browse the repository at this point in the history
…elds.
  • Loading branch information
IgorRodchenkov committed Apr 24, 2024
1 parent 3e8ff28 commit 74bf3a6
Show file tree
Hide file tree
Showing 5 changed files with 45 additions and 42 deletions.
4 changes: 2 additions & 2 deletions src/main/java/cpath/service/ConsoleApplication.java
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ public void run(String... args) throws Exception {
.hasArg().argName("filename").build();
options.addOption(o);
o = Option.builder("F").longOpt("F")
.desc("filters for the export option, e.g., -Furis=<uri,..> -Fdatasources=<nameOrUri,..> -Ftypes=<interface,..> " +
.desc("filters for the export option, e.g., -Furis=<uri,..> -Fdatasources=<name,..> -Ftypes=<interface,..> " +
"(when 'uris' is defined, other options are ignored)")
.argName("property=value").hasArgs().valueSeparator().numberOfArgs(2).build();
options.addOption(o);
Expand Down Expand Up @@ -219,7 +219,7 @@ private void merge() {
*
* @param output - output BioPAX file name (path)
* @param uris - optional, the list of valid (existing) URIs to extract a sub-model
* @param datasources filter by datasource if 'uris' is not empty
* @param datasources filter by datasource (name or identifier) if 'uris' is not empty
* @param types filter by BioPAX type if 'uris' is not empty
* @throws IOException, IllegalStateException (in maintenance mode)
*/
Expand Down
66 changes: 31 additions & 35 deletions src/main/java/cpath/service/IndexImpl.java
Original file line number Diff line number Diff line change
Expand Up @@ -248,7 +248,7 @@ private SearchResponse transform(Query query, IndexSearcher searcher, TopDocs to
}
}

// extract organisms (URI only)
// extract organisms (URIs only)
if(doc.get(FIELD_ORGANISM) != null) {
Set<String> uniqueVals = new TreeSet<>();
for(String o : doc.getValues(FIELD_ORGANISM)) {
Expand Down Expand Up @@ -360,19 +360,19 @@ public void save(BioPAXElement bpe) {

// create a new document
final Document doc = new Document();
// using StringField and KeywordAnalyser for this field
// using StringField and KeywordAnalyzer (when searching) for the 'uri' field
final String uri = bpe.getUri();
// save URI: indexed, not analyzed, stored
// save URI: indexed, not analyzed, stored
doc.add(new StringField(FIELD_URI, uri, Field.Store.YES));
//extract and index the last part of the uri (e.g., 'hsa00010' or like 'ProteinReference_ca123bd44...')
if(uri.startsWith("http://")) {
String id = (uri.endsWith("/")) ? uri.substring(0, uri.length()-1) : uri;
id = id.replaceAll(".*[/#]", "").trim();
doc.add(new StringField(FIELD_URI, id, Field.Store.NO));
}

// index and store but not analyze/tokenize the biopax class name:

//index the last part of the uri (e.g., 'hsa00010' or like 'ProteinReference_ca123bd44...'); todo: why?..
String luri = (uri.endsWith("/")) ? uri.substring(0, uri.length()-1) : uri;
luri = luri.replaceAll(".*[/#]", "").trim();
doc.add(new StringField(FIELD_URI, luri, Field.Store.NO));

// index and store but not analyze/tokenize the biopax class name (lowercase, as we use StandardAnalyzer for searching/filtering in this field):
doc.add(new StringField(FIELD_TYPE, bpe.getModelInterface().getSimpleName().toLowerCase(), Field.Store.YES));

// extra index fields
addPathways(ModelUtils.getParentPathways(bpe), doc);
addOrganisms(ModelUtils.getOrganisms(bpe), doc);
Expand All @@ -394,11 +394,10 @@ public void save(BioPAXElement bpe) {
}

// Add more xref IDs to the index using id-mapping
Set<String> ids = CPathUtils.getXrefIds(bpe);
final Set<String> ids = CPathUtils.getXrefIds(bpe);
Pattern isoformIdPattern = Pattern.compile(Resolver.getNamespace("uniprot.isoform", true).getPattern());
Pattern uniprotIdPattern = Pattern.compile(Resolver.getNamespace("uniprot", true).getPattern()); //"uniprot protein" is the preferred name
// in addition, collect ChEBI and UniProt IDs and then
// use id-mapping to associate the bpe with more IDs:
Pattern uniprotIdPattern = Pattern.compile(Resolver.getNamespace("uniprot", true).getPattern());
// also collect ChEBI and UniProt IDs and then use id-mapping to associate the bpe with more IDs:
final List<String> uniprotIds = new ArrayList<>();
final List<String> chebiIds = new ArrayList<>();
for(String id : ids) {
Expand All @@ -407,16 +406,17 @@ public void save(BioPAXElement bpe) {
chebiIds.add(id);
} else if(isoformIdPattern.matcher(id).find()) {
//cut the isoform num. suffix
id = id.replaceFirst("-\\d+$", "");
uniprotIds.add(id);
uniprotIds.add(id.replaceFirst("-\\d+$", ""));
} else if(uniprotIdPattern.matcher(id).find()) {
uniprotIds.add(id);
}
}
//id-mapping to find some other ids that map to the chebi/uniprot ones that we collected from the bpe.
addSupportedIdsThatMapToChebi(chebiIds, ids);
addSupportedIdsThatMapToUniprotId(uniprotIds, ids);
for (String id : ids) {//index as: not analyzed, not tokenized
// doc.add(new StringField(FIELD_XREFID, id.toLowerCase(), Field.Store.NO)); // TODO: why did we do this? IDs are case-sensitive.
for (String id : ids) {
//index as: not analyzed, not tokenized; we use KeywordAnalyzer when searching this field...
//doc.add(new StringField(FIELD_XREFID, id.toLowerCase(), Field.Store.NO));//todo: why did we have it? (ID is normally case-sensitive)
doc.add(new StringField(FIELD_XREFID, id, Field.Store.NO));
//also store a lower-case prefix (banana, e.g. 'chebi:1234' version of the id)
if(StringUtils.contains(id,":")) {
Expand Down Expand Up @@ -452,7 +452,7 @@ public void save(BioPAXElement bpe) {
// save/update the lucene document
try {
indexWriter.updateDocument(new Term(FIELD_URI, uri), doc);
} catch (IOException e) {
} catch (Exception e) {
throw new RuntimeException("Failed to index: " + bpe.getUri(), e);
}
}
Expand Down Expand Up @@ -523,30 +523,26 @@ public long count(String queryString) {

private void addDatasources(Set<Provenance> set, Document doc) {
for (Provenance p : set) {
// Index (!) and store URI (untokenized) -
// required to accurately calculate no. entities or to filter by data source
// (different data sources might share same names)
//store but do not index/tokenize the URI
doc.add(new StoredField(FIELD_DATASOURCE, p.getUri()));

//index the last/local (collection prefix) part of the normalized Provenance uri
String u = p.getUri();
doc.add(new StringField(FIELD_DATASOURCE, u, Field.Store.YES));

//index the identifier part of uri as well
if(u.startsWith("http://")) {
if (u.endsWith("/"))
u = u.substring(0, u.length() - 1);
u = u.replaceAll(".*/", "");
doc.add(new StringField(FIELD_DATASOURCE, u.toLowerCase(), Field.Store.NO));
}
if (u.endsWith("/")) u = u.substring(0, u.length() - 1);
u = u.replaceAll(".*[/#]", "");
doc.add(new TextField(FIELD_DATASOURCE, u.toLowerCase(), Field.Store.NO));

// index names
//index names (including the datasource identifier from metadata json config; see premerge/merge)
//different data sources can have the same name e.g. 'intact'; tokenized - to search by partial name
for (String s : p.getName()) {
doc.add(new StringField(FIELD_DATASOURCE, s.toLowerCase(), Field.Store.NO));
doc.add(new TextField(FIELD_DATASOURCE, s.toLowerCase(), Field.Store.NO));
}
}
}

private void addOrganisms(Set<BioSource> set, Document doc) {
for(BioSource bs : set) {
// store URI as is (not indexed, not tokenized)
// store but do not index URI (see transform method above, where the organism URIs are added to search hits)
doc.add(new StoredField(FIELD_ORGANISM, bs.getUri()));

// add organism names
Expand Down
4 changes: 2 additions & 2 deletions src/main/java/cpath/service/Merger.java
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,7 @@ private Model merge(Datasource datasource) {
ModelUtils.normalizeGenerics(providerModel);

//for (already normalized) BioSource, also add the name from
//application.properties (it helps full-text search)
//application.properties (it helps full-text search in case the orig. BioSource had no names but taxon ref...)
Map<String,String> orgMap = service.settings().getOrganismsAsTaxonomyToNameMap();
for(BioSource org : providerModel.getObjects(BioSource.class)) {
for(UnificationXref x : new ClassFilterSet<>(org.getXref(), UnificationXref.class)) {
Expand Down Expand Up @@ -510,7 +510,7 @@ private void chemXrefByMapping(final Model m, Named bpe, final int maxNumXrefsTo
* This step won't much improve full-text index/search and graph queries
* (where id-mapping is used again anyway), but may help improve export to SIF and GSEA formats.
* This method is called only for original PEs or their ERs that were not mapped/merged
* with a warehouse canonical ERs for various known reasons (no match for a ID or no ID, ambiguous ID, etc.)
* with warehouse canonical ERs for various known reasons (no match for an ID, no ID at all, ambiguous ID, etc.)
*
* This method won't add additional xrefs if a UniProt/HGNC one is already present despite it'd map
* to many canonical ERs/IDs (in fact, it'd even map to hundreds (Trembl) IDs, e.g., in cases like 'ND5',
Expand Down
1 change: 1 addition & 0 deletions src/main/java/cpath/service/metadata/Datasource.java
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,7 @@ public void setProvenanceFor(Model model, String xmlBase) {
String displayName = getName().iterator().next();
pro.setDisplayName(displayName);
pro.setStandardName(standardName());
pro.addName(identifier);

if (getName().size() > 2)
for (int i = 2; i < getName().size(); i++)
Expand Down
12 changes: 9 additions & 3 deletions src/test/java/cpath/service/IndexIT.java
Original file line number Diff line number Diff line change
Expand Up @@ -96,12 +96,18 @@ public final void search() throws IOException {
response = index.search("*", 0, Provenance.class, new String[] {"kegg"}, null);
assertEquals(1, response.getSearchHit().size());

//datasource filter using a URI (required for -update-counts console command and datasources.html page to work)
//datasource filter using the Provenance absolute URI - no longer supported (the URI is still stored, but not indexed anymore)
response = index.search("*", 0, Pathway.class, new String[] {"http://identifiers.org/kegg.pathway/"}, null);
assertTrue(response.isEmpty());

//using the local/last part of the URI (standard bio collection prefix/name)
response = index.search("*", 0, Pathway.class, new String[] {"kegg.pathway"}, null);
assertFalse(response.isEmpty());
assertEquals(1, response.getSearchHit().size());
//using metadata identifier
response = index.search("*", 0, Pathway.class, new String[] {"kegg.pathway"}, null);
assertTrue(response.getSearchHit().stream().anyMatch(h -> h.getDataSource().contains("http://identifiers.org/kegg.pathway/")));

//find by partial name of a datasource - "pathway" of "KEGG Pathway"...
response = index.search("*", 0, Pathway.class, new String[] {"pathway"}, null);
assertFalse(response.isEmpty());
assertEquals(1, response.getSearchHit().size());

Expand Down

0 comments on commit 74bf3a6

Please sign in to comment.