Skip to content

Commit

Permalink
Add highlighted filter term hits. Resolve #135.
Browse files Browse the repository at this point in the history
  • Loading branch information
khituras committed Oct 21, 2021
1 parent 0963ea4 commit 7f57c54
Show file tree
Hide file tree
Showing 12 changed files with 103 additions and 65 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -10,19 +10,16 @@
import java.util.stream.Stream;

public class EventRetrievalResult {
private final static Logger log = LoggerFactory.getLogger(EventRetrievalResult.class);
public enum EventResultType {OUTSIDE, BIPARTITE, FULLTEXT_FILTERED}
private List<Event> eventList;
private EventResultType resultType;

/**
 * Returns a read-only view of the events of this result.
 *
 * @return An unmodifiable view of the event list, or an empty list if no events have been set yet.
 */
public List<Event> getEventList() {
    // Deliberately no logging: this accessor is on the regular request path and
    // must not emit warn-level messages for a perfectly normal access.
    if (eventList == null) {
        return Collections.emptyList();
    }
    return Collections.unmodifiableList(eventList);
}

/**
 * Collects the given event stream into a list and stores it as the event list of this result.
 *
 * @param events The events of this result. The stream is consumed by this call.
 */
public void setEvents(Stream<Event> events) {
    // Setting the events is the normal case, so no warn-level logging is done here.
    eventList = events.collect(Collectors.toList());
}

public EventResultType getResultType() {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@

public class EventResponseProcessingService implements IEventResponseProcessingService {

private final static Pattern FULLTEXT_QUERY_HIGHLIGHT_PATTERN = Pattern.compile("<em>");
private final static Pattern FULLTEXT_QUERY_HIGHLIGHT_PATTERN = Pattern.compile("<b>");
@Inject
private IEventPostProcessingService eventPPService;
private Logger log;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -189,6 +189,7 @@ public CompletableFuture<EventRetrievalResult> getBipartiteEvents(Future<IdConve
FIELD_PMCID,
FIELD_EVENT_LIKELIHOOD,
FIELD_EVENT_SENTENCE,
FIELD_EVENT_PARAGRAPH,
FIELD_EVENT_MAINEVENTTYPE,
FIELD_EVENT_ALL_EVENTTYPES,
FIELD_EVENT_ARG_GENE_IDS,
Expand All @@ -200,6 +201,23 @@ public CompletableFuture<EventRetrievalResult> getBipartiteEvents(Future<IdConve
FIELD_EVENT_ARG_TEXT);
serverCmd.downloadCompleteResults = true;
serverCmd.addSortCommand("_doc", SortOrder.ASCENDING);
if (!StringUtils.isBlank(sentenceFilter) || !StringUtils.isBlank(paragraphFilter)) {
HighlightCommand hlc = new HighlightCommand();
hlc.addField(FIELD_EVENT_SENTENCE, 10, 0);
hlc.addField(FIELD_EVENT_PARAGRAPH, 10, 0);
hlc.fields.forEach(f -> {
// f.boundaryChars = new char[]{'\n', '\t'};
// f.type = HighlightCommand.Highlighter.fastvector;
f.pre = "<b>";
f.post = "</b>";
// MatchQuery hlQuery = new MatchQuery();
// hlQuery.field = FIELD_EVENT_SENTENCE;
// hlQuery.query = "xargumentx";
// f.highlightQuery = hlQuery;
});
serverCmd.addHighlightCmd(hlc);
}


ElasticSearchCarrier<ElasticServerResponse> carrier = new ElasticSearchCarrier<>("BipartiteEvents");
carrier.addSearchServerRequest(serverCmd);
Expand Down Expand Up @@ -229,13 +247,14 @@ public CompletableFuture<EventRetrievalResult> getBipartiteEvents(Future<IdConve
* @param eventQuery The top event query that is currently constructed.
*/
private void addFulltextSearchQuery(String filterQuery, String field, Occur occur, BoolQuery eventQuery) {
    // The filter clause is built and added exactly once. The method serves any text
    // field (sentence or paragraph), hence the field-neutral local names.
    final SimpleQueryStringQuery textFilterQuery = new SimpleQueryStringQuery();
    // NOTE(review): Flag.ALL presumably corresponds to the "ALL" flag of Elasticsearch's
    // simple_query_string query, i.e. all query operators are enabled - confirm.
    textFilterQuery.flags = List.of(SimpleQueryStringQuery.Flag.ALL);
    textFilterQuery.query = filterQuery;
    textFilterQuery.fields = Arrays.asList(field);

    // Wrap the query into a clause with the requested occurrence and attach it to the event query.
    final BoolClause textFilterClause = new BoolClause();
    textFilterClause.addQuery(textFilterQuery);
    textFilterClause.occur = occur;
    eventQuery.addClause(textFilterClause);
}

@Override
Expand All @@ -260,8 +279,11 @@ private void reorderBipartiteEventResultArguments(Set<String> idSetA, Set<String

for (Event e : eventResult.getEventList()) {
Argument firstArg = e.getFirstArgument();
Argument secondArg = e.getSecondArgument();
if (!(idSetA.contains(firstArg.getGeneId()) || idSetA.contains(firstArg.getTopHomologyId())))
e.swapArguments();
else if (!(idSetB.contains(secondArg.getGeneId()) || idSetB.contains(secondArg.getTopHomologyId())))
e.swapArguments();
}
}

Expand Down Expand Up @@ -400,13 +422,13 @@ public CompletableFuture<EventRetrievalResult> getFulltextFilteredEvents(List<St
serverCmd.addSortCommand("_doc", SortOrder.ASCENDING);

HighlightCommand hlc = new HighlightCommand();
hlc.addField(FIELD_EVENT_SENTENCE, 1, 0);
hlc.addField(FIELD_EVENT_PARAGRAPH, 1, 0);
hlc.addField(FIELD_EVENT_SENTENCE, 10, 0);
hlc.addField(FIELD_EVENT_PARAGRAPH, 10, 0);
hlc.fields.forEach(f -> {
// f.boundaryChars = new char[]{'\n', '\t'};
// f.type = HighlightCommand.Highlighter.fastvector;
// f.pre = "<b>";
// f.post = "</b>";
f.pre = "<b>";
f.post = "</b>";
// MatchQuery hlQuery = new MatchQuery();
// hlQuery.field = FIELD_EVENT_SENTENCE;
// hlQuery.query = "xargumentx";
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -239,17 +239,17 @@ public JSONArray convertToJson(List<Event> eventList) {
}

@Override
public File getOverviewExcel(List<Event> events, long dataSessionId, EnumSet<InputMode> inputMode) throws IOException {
public File getOverviewExcel(List<Event> events, long dataSessionId, EnumSet<InputMode> inputMode, String sentenceFilterString, String paragraphFilterString) throws IOException {
log.debug("Creating event statistics Excel file for dataSessionId {}", dataSessionId);
File tsvFile = getTempTsvDataFile(dataSessionId);
File xlsFile = getTempXlsDataFile(dataSessionId);
writeOverviewTsvFile(events, tsvFile);
createExcelSummaryFile(tsvFile, xlsFile, inputMode);
createExcelSummaryFile(tsvFile, xlsFile, inputMode, sentenceFilterString, paragraphFilterString);
return xlsFile;
}

private void createExcelSummaryFile(File tsvFile, File xlsFile, EnumSet<InputMode> inputMode) throws IOException {
ProcessBuilder builder = new ProcessBuilder().command("python", "-c", excelResultCreationScript, tsvFile.getAbsolutePath(), xlsFile.getAbsolutePath(), inputMode.stream().map(InputMode::name).collect(Collectors.joining(" ")));
private void createExcelSummaryFile(File tsvFile, File xlsFile, EnumSet<InputMode> inputMode, String sentenceFilterString, String paragraphFilterString) throws IOException {
ProcessBuilder builder = new ProcessBuilder().command("python", "-c", excelResultCreationScript, tsvFile.getAbsolutePath(), xlsFile.getAbsolutePath(), inputMode.stream().map(InputMode::name).collect(Collectors.joining(" ")), sentenceFilterString != null ? sentenceFilterString : "<none>", paragraphFilterString != null ? paragraphFilterString : "<none>");
Process process = builder.start();
InputStream processInput = process.getInputStream();
InputStream processErrors = process.getErrorStream();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,9 @@ public interface IGePiDataService {
* resulting Excel file is then read back in the form of the InputStream.</p>
* @param events The events to create the result workbook for.
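* @param dataSessionId The ID of the data session; used to determine the temporary files of the export.
* @param inputMode The input modes of the request; their names are passed on to the Excel creation script.
* @param sentenceFilterString The sentence-level filter terms to report in the Excel file; may be <code>null</code> if no such filter was given.
* @param paragraphFilterString The paragraph-level filter terms to report in the Excel file; may be <code>null</code> if no such filter was given.
* @return The created Excel file.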
*/
File getOverviewExcel(List<Event> events, long dataSessionId, EnumSet<InputMode> inputMode) throws IOException;
File getOverviewExcel(List<Event> events, long dataSessionId, EnumSet<InputMode> inputMode, String sentenceFilterString, String paragraphFilterString) throws IOException;
}
42 changes: 24 additions & 18 deletions gepi/gepi-core/src/main/resources/ExcelResultCreation.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ def makeArgumentSymbolPivotTable(df, column, order):
givengenesfreq[('both','total sum')] = givengenesfreq[('exact','sum')] + givengenesfreq[('fuzzy','sum')]
return givengenesfreq

def writeresults(input,output,inputMode):
def writeresults(input,output,inputMode,sentenceFilterString,paragraphFilterString):
header = ["arg1symbol", "arg2symbol", "arg1text", "arg2text", "arg1entrezid", "arg2entrezid", "arg1matchtype", "arg2matchtype", "relationtypes", "docid", "eventid", "fulltextmatchtype", "context"]
columndesc=[ 'Input gene symbol',
'Event partner gene symbol',
Expand Down Expand Up @@ -124,33 +124,39 @@ def writeresults(input,output,inputMode):
bold = ew.book.add_format({'bold': True})
frontpage.write(0,0, f'This is a GePi statistics file which contains results of event extraction. Creation date is {date.today()}.')
frontpage.write(1,0, 'The contained worksheets contain the actual text mining results as well as statistics extracted from them.')
frontpage.write(3,0, 'The "Results" sheet is a large table containing the gene event arguments, an indication of how well the text matched')
frontpage.write(4,0, 'a gene synonym ("exact" or "fuzzy"), the recognized type of the event (such as "phosphorylation" or "regulation"),')
frontpage.write(5,0, 'the document ID (PubMed ID for PubMed results, PMC ID for PubMed Central results) and the sentence in which the')
frontpage.write(6,0, 'respective event was found.')
frontpage.write(2,0, 'The result was obtained using the following filter terms:')
frontpage.write(3,0, f'Sentence level filter terms: {sentenceFilterString}')
frontpage.write(4,0, f'Paragraph level filter terms: {paragraphFilterString}')
frontpage.write(5,0, 'Only molecular events that were described in a sentence or a paragraph containing the filter terms was returned for this result.')
frontpage.write(7,0, 'The "Results" sheet is a large table containing the gene event arguments, an indication of how well the text matched')
frontpage.write(8,0, 'a gene synonym ("exact" or "fuzzy"), the recognized type of the event (such as "phosphorylation" or "regulation"),')
frontpage.write(9,0, 'the document ID (PubMed ID for PubMed results, PMC ID for PubMed Central results) and the sentence in which the')
frontpage.write(10,0, 'respective event was found.')
resultsdesc.to_excel(ew, startrow=7, index=False, sheet_name='Frontpage')
frontpage.write(20,0, 'The matchtype "exact" means that the textual gene name could be matched perfectly to a synonym of a NCBI Gene database entry.')
frontpage.write(21,0, '"Fuzzy" means that the gene name found in the literature could only be mapped to an NCBI Gene record by allowing minor differences when comparing with the synonyms.')
frontpage.write(22,0, 'Example: Assume the text match was "{}". This cannot be found exactly in NCBI Gene. However, the synonym "{}" exists which could be used for the mapping.'.format('25 kDa lysophospholipid-specific lysophospholipase', 'lysophospholipid-specific lysophospholipase'))
frontpage.write(24,0, 'The matchtype "exact" means that the textual gene name could be matched perfectly to a synonym of a NCBI Gene database entry.')
frontpage.write(25,0, '"Fuzzy" means that the gene name found in the literature could only be mapped to an NCBI Gene record by allowing minor differences when comparing with the synonyms.')
frontpage.write(26,0, 'Example: Assume the text match was "{}". This cannot be found exactly in NCBI Gene. However, the synonym "{}" exists which could be used for the mapping.'.format('25 kDa lysophospholipid-specific lysophospholipase', 'lysophospholipid-specific lysophospholipase'))
#frontpage.write(24,0, 'Description of the sheets:', bold)
frontpage.write(24,0, 'Description of the sheets:')
frontpage.write(28,0, 'Description of the sheets:')
if 'A' in inputMode or 'AB' in inputMode:
frontpage.write(25,0, '"Given Genes Statistics" shows how often the input gene symbols were found in relations with other genes, separated by exact and fuzzy matches.')
frontpage.write(26,0, '"Event Partner Statistics" shows the same but from the perspective of the interaction partners of the input genes.')
frontpage.write(27,0, '"Event Statistics" lists the extracted events grouped by their combination of input and event partner genes. In other words, it counts how often two genes interact with each other in the results.')
frontpage.write(28,0, '"Input Gene Event Diversity" shows for each input gene symbol how many different interaction partners it has in the results.')
frontpage.write(29,0, '"Gene Argument Event Diversity" shows for each gene that participated in an event the number of different interaction partners in the results.')
frontpage.write(29,0, '"Given Genes Statistics" shows how often the input gene symbols were found in relations with other genes, separated by exact and fuzzy matches.')
frontpage.write(30,0, '"Event Partner Statistics" shows the same but from the perspective of the interaction partners of the input genes.')
frontpage.write(31,0, '"Event Statistics" lists the extracted events grouped by their combination of input and event partner genes. In other words, it counts how often two genes interact with each other in the results.')
frontpage.write(32,0, '"Input Gene Event Diversity" shows for each input gene symbol how many different interaction partners it has in the results.')
frontpage.write(33,0, '"Gene Argument Event Diversity" shows for each gene that participated in an event the number of different interaction partners in the results.')
else:
frontpage.write(25,0, '"Gene Interaction Statistics" shows how often gene symbols were found in relations with other genes, separated by exact and fuzzy matches.')
frontpage.write(26,0, '"Event Statistics" lists the extracted events grouped by their combination of input and event partner genes. In other words, it counts how often two genes interact with each other in the results.')
frontpage.write(27,0, '"Gene Argument Event Diversity" shows for each gene that participated in an event the number of different interaction partners in the results.')
frontpage.write(29,0, '"Gene Interaction Statistics" shows how often gene symbols were found in relations with other genes, separated by exact and fuzzy matches.')
frontpage.write(30,0, '"Event Statistics" lists the extracted events grouped by their combination of input and event partner genes. In other words, it counts how often two genes interact with each other in the results.')
frontpage.write(31,0, '"Gene Argument Event Diversity" shows for each gene that participated in an event the number of different interaction partners in the results.')

return df

if __name__ == "__main__":
    # Command line arguments:
    #   argv[1]  path of the input file with the event data
    #   argv[2]  path of the Excel file to create
    #   argv[3]  space-separated input mode names
    #   argv[4]  sentence level filter terms (optional)
    #   argv[5]  paragraph level filter terms (optional)
    input = sys.argv[1]
    output = sys.argv[2]
    inputMode = sys.argv[3].split(' ')
    # The two filter arguments were added after the first three. Fall back to the
    # "<none>" placeholder (the same value the Java caller sends for missing filters)
    # so that an invocation with only three arguments still works.
    sentenceFilterString = sys.argv[4] if len(sys.argv) > 4 else '<none>'
    paragraphFilterString = sys.argv[5] if len(sys.argv) > 5 else '<none>'

    # Single call with the full argument list; writeresults requires all five parameters.
    writeresults(input,output,inputMode,sentenceFilterString,paragraphFilterString)

Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ public void writeExcelSummary() throws Exception {


GePiDataService gePiDataService = new GePiDataService();
File outputFile = gePiDataService.getOverviewExcel(events, 1234, EnumSet.of(InputMode.A));
File outputFile = gePiDataService.getOverviewExcel(events, 1234, EnumSet.of(InputMode.A), null, null);
assertThat(outputFile).exists();
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -122,9 +122,11 @@ public class GepiInput {
private List<String> selectedDevSettings;

@Property
@Parameter
private String sentenceFilterString;

@Property
@Parameter
private String paragraphFilterString;

@Property
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,6 @@ public class GepiWidget {
@Property
protected long dataSessionId;

@Parameter
protected EnumSet<InputMode> inputMode;

@InjectPage
private Index index;
Expand Down
Original file line number Diff line number Diff line change
@@ -1,37 +1,36 @@
package de.julielab.gepi.webapp.components;

import java.io.*;
import java.text.FieldPosition;
import java.text.Format;
import java.text.MessageFormat;
import java.text.ParsePosition;
import java.util.*;
import java.util.concurrent.ExecutionException;
import java.util.stream.Collectors;
import java.util.stream.Stream;

import de.julielab.gepi.core.retrieval.data.EventRetrievalResult;
import de.julielab.gepi.core.retrieval.data.Argument;
import de.julielab.gepi.core.retrieval.data.Event;
import de.julielab.gepi.core.retrieval.data.InputMode;
import de.julielab.gepi.core.services.IGePiDataService;
import de.julielab.gepi.webapp.base.TabPersistentField;
import de.julielab.java.utilities.FileUtilities;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.ss.usermodel.*;
import org.apache.tapestry5.ComponentResources;
import org.apache.tapestry5.StreamResponse;
import org.apache.tapestry5.annotations.Log;
import org.apache.tapestry5.annotations.Parameter;
import org.apache.tapestry5.annotations.Persist;
import org.apache.tapestry5.annotations.Property;
import org.apache.tapestry5.beanmodel.BeanModel;
import org.apache.tapestry5.beanmodel.services.BeanModelSource;
import org.apache.tapestry5.commons.Messages;
import org.apache.tapestry5.http.services.Response;
import org.apache.tapestry5.ioc.annotations.Inject;

import de.julielab.gepi.core.retrieval.data.Argument;
import de.julielab.gepi.core.retrieval.data.Event;
import org.slf4j.Logger;

import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.text.FieldPosition;
import java.text.Format;
import java.text.ParsePosition;
import java.util.ArrayList;
import java.util.EnumSet;
import java.util.List;
import java.util.concurrent.ExecutionException;
import java.util.stream.Collectors;

public class TableResultWidget extends GepiWidget {

@Inject
Expand Down Expand Up @@ -59,6 +58,15 @@ public class TableResultWidget extends GepiWidget {
@Inject
private ComponentResources resources;

@Parameter
protected EnumSet<InputMode> inputMode;

@Parameter
private String sentenceFilterString;

@Parameter
private String paragraphFilterString;

@Property
@Persist(TabPersistentField.TAB)
private BeanModel<BeanModelEvent> tableModel;
Expand Down Expand Up @@ -158,7 +166,7 @@ StreamResponse onDownload(long dataSessionId) {
@Override
public void prepareResponse(Response response) {
try {
statisticsFile = dataService.getOverviewExcel(getEsResult().get().getEventList(), dataSessionId, inputMode);
statisticsFile = dataService.getOverviewExcel(getEsResult().get().getEventList(), dataSessionId, inputMode, sentenceFilterString, paragraphFilterString);

response.setHeader("Content-Length", "" + statisticsFile.length()); // output into file
response.setHeader("Content-disposition", "attachment; filename=" + statisticsFile.getName());
Expand Down Expand Up @@ -243,9 +251,9 @@ public String getAllEventTypes() {

/**
 * Returns the textual context of the event for display.
 * <p>
 * The plain event sentence always comes first. If the fulltext query matched, the
 * highlighted text is appended after a line break: the highlighted sentence when the
 * sentence matched, otherwise the highlighted paragraph when only the paragraph matched.
 * </p>
 *
 * @return The event sentence, optionally followed by <code>&lt;br&gt;</code> and the highlighted match.
 */
public String getContext() {
    // NOTE(review): the returned string contains markup ("<br>"); confirm how the
    // sentence and highlight texts are escaped before this is rendered.
    if (event.isParagraphMatchingFulltextQuery() && !event.isSentenceMatchingFulltextQuery()) {
        return event.getSentence() + "<br>" + event.getHlParagraph();
    }
    if (event.isSentenceMatchingFulltextQuery()) {
        return event.getSentence() + "<br>" + event.getHlSentence();
    }
    return event.getSentence();
}

Expand Down
Loading

0 comments on commit 7f57c54

Please sign in to comment.