From 7f57c543e55a579fb14e71583dcc04a5ed68eb48 Mon Sep 17 00:00:00 2001 From: khituras Date: Thu, 21 Oct 2021 12:49:49 +0200 Subject: [PATCH] Add highlighted filter term hits. Resolve #135. --- .../retrieval/data/EventRetrievalResult.java | 3 -- .../EventResponseProcessingService.java | 2 +- .../services/EventRetrievalService.java | 44 +++++++++++++----- .../gepi/core/services/GePiDataService.java | 8 ++-- .../gepi/core/services/IGePiDataService.java | 4 +- .../src/main/resources/ExcelResultCreation.py | 42 +++++++++-------- .../core/services/GePiDataServiceTest.java | 2 +- .../gepi/webapp/components/GepiInput.java | 2 + .../gepi/webapp/components/GepiWidget.java | 2 - .../webapp/components/TableResultWidget.java | 46 +++++++++++-------- .../de/julielab/gepi/webapp/pages/Index.java | 9 ++-- .../de/julielab/gepi/webapp/pages/Index.tml | 4 +- 12 files changed, 103 insertions(+), 65 deletions(-) diff --git a/gepi/gepi-core/src/main/java/de/julielab/gepi/core/retrieval/data/EventRetrievalResult.java b/gepi/gepi-core/src/main/java/de/julielab/gepi/core/retrieval/data/EventRetrievalResult.java index d18c8d6f..f210c843 100644 --- a/gepi/gepi-core/src/main/java/de/julielab/gepi/core/retrieval/data/EventRetrievalResult.java +++ b/gepi/gepi-core/src/main/java/de/julielab/gepi/core/retrieval/data/EventRetrievalResult.java @@ -10,19 +10,16 @@ import java.util.stream.Stream; public class EventRetrievalResult { - private final static Logger log = LoggerFactory.getLogger(EventRetrievalResult.class); public enum EventResultType {OUTSIDE, BIPARTITE, FULLTEXT_FILTERED} private List eventList; private EventResultType resultType; public List getEventList() { - log.warn("Returning {} events from {}", eventList.size(), this); return Collections.unmodifiableList(eventList); } public void setEvents(Stream events) { eventList = events.collect(Collectors.toList()); - log.warn("Got {} events", eventList.size()); } public EventResultType getResultType() { diff --git a/gepi/gepi-core/src/main/java/de/julielab/gepi/core/retrieval/services/EventResponseProcessingService.java b/gepi/gepi-core/src/main/java/de/julielab/gepi/core/retrieval/services/EventResponseProcessingService.java index 62845971..30a3954f 100644 --- a/gepi/gepi-core/src/main/java/de/julielab/gepi/core/retrieval/services/EventResponseProcessingService.java +++ b/gepi/gepi-core/src/main/java/de/julielab/gepi/core/retrieval/services/EventResponseProcessingService.java @@ -20,7 +20,7 @@ public class EventResponseProcessingService implements IEventResponseProcessingService { - private final static Pattern FULLTEXT_QUERY_HIGHLIGHT_PATTERN = Pattern.compile(""); + private final static Pattern FULLTEXT_QUERY_HIGHLIGHT_PATTERN = Pattern.compile(""); @Inject private IEventPostProcessingService eventPPService; private Logger log; diff --git a/gepi/gepi-core/src/main/java/de/julielab/gepi/core/retrieval/services/EventRetrievalService.java b/gepi/gepi-core/src/main/java/de/julielab/gepi/core/retrieval/services/EventRetrievalService.java index 8e11252f..47cdb294 100644 --- a/gepi/gepi-core/src/main/java/de/julielab/gepi/core/retrieval/services/EventRetrievalService.java +++ b/gepi/gepi-core/src/main/java/de/julielab/gepi/core/retrieval/services/EventRetrievalService.java @@ -189,6 +189,7 @@ public CompletableFuture getBipartiteEvents(Future getBipartiteEvents(Future { +// f.boundaryChars = new char[]{'\n', '\t'}; +// f.type = HighlightCommand.Highlighter.fastvector; + f.pre = ""; + f.post = ""; +// MatchQuery hlQuery = new MatchQuery(); +// hlQuery.field = FIELD_EVENT_SENTENCE; +// hlQuery.query = "xargumentx"; +// f.highlightQuery = hlQuery; + }); + serverCmd.addHighlightCmd(hlc); + } + ElasticSearchCarrier carrier = new ElasticSearchCarrier<>("BipartiteEvents"); carrier.addSearchServerRequest(serverCmd); @@ -229,13 +247,14 @@ public CompletableFuture getBipartiteEvents(Future idSetA, Set getFulltextFilteredEvents(List { // f.boundaryChars = new char[]{'\n', '\t'}; // f.type = HighlightCommand.Highlighter.fastvector; -// f.pre = ""; -// f.post = ""; + f.pre = ""; + f.post = ""; // MatchQuery hlQuery = new MatchQuery(); // hlQuery.field = FIELD_EVENT_SENTENCE; // hlQuery.query = "xargumentx"; diff --git a/gepi/gepi-core/src/main/java/de/julielab/gepi/core/services/GePiDataService.java b/gepi/gepi-core/src/main/java/de/julielab/gepi/core/services/GePiDataService.java index fa034b78..13623b44 100644 --- a/gepi/gepi-core/src/main/java/de/julielab/gepi/core/services/GePiDataService.java +++ b/gepi/gepi-core/src/main/java/de/julielab/gepi/core/services/GePiDataService.java @@ -239,17 +239,17 @@ public JSONArray convertToJson(List eventList) { } @Override - public File getOverviewExcel(List events, long dataSessionId, EnumSet inputMode) throws IOException { + public File getOverviewExcel(List events, long dataSessionId, EnumSet inputMode, String sentenceFilterString, String paragraphFilterString) throws IOException { log.debug("Creating event statistics Excel file for dataSessionId {}", dataSessionId); File tsvFile = getTempTsvDataFile(dataSessionId); File xlsFile = getTempXlsDataFile(dataSessionId); writeOverviewTsvFile(events, tsvFile); - createExcelSummaryFile(tsvFile, xlsFile, inputMode); + createExcelSummaryFile(tsvFile, xlsFile, inputMode, sentenceFilterString, paragraphFilterString); return xlsFile; } - private void createExcelSummaryFile(File tsvFile, File xlsFile, EnumSet inputMode) throws IOException { - ProcessBuilder builder = new ProcessBuilder().command("python", "-c", excelResultCreationScript, tsvFile.getAbsolutePath(), xlsFile.getAbsolutePath(), inputMode.stream().map(InputMode::name).collect(Collectors.joining(" "))); + private void createExcelSummaryFile(File tsvFile, File xlsFile, EnumSet inputMode, String sentenceFilterString, String paragraphFilterString) throws IOException { + ProcessBuilder builder = new ProcessBuilder().command("python", "-c", excelResultCreationScript, tsvFile.getAbsolutePath(), xlsFile.getAbsolutePath(), inputMode.stream().map(InputMode::name).collect(Collectors.joining(" ")), sentenceFilterString != null ? sentenceFilterString : "", paragraphFilterString != null ? paragraphFilterString : ""); Process process = builder.start(); InputStream processInput = process.getInputStream(); InputStream processErrors = process.getErrorStream(); diff --git a/gepi/gepi-core/src/main/java/de/julielab/gepi/core/services/IGePiDataService.java b/gepi/gepi-core/src/main/java/de/julielab/gepi/core/services/IGePiDataService.java index b7c542a7..265711f2 100644 --- a/gepi/gepi-core/src/main/java/de/julielab/gepi/core/services/IGePiDataService.java +++ b/gepi/gepi-core/src/main/java/de/julielab/gepi/core/services/IGePiDataService.java @@ -68,7 +68,9 @@ public interface IGePiDataService { * resulting Excel file is then read back in the form of the InputStream.

* @param events The events to create the result workbook for. * @param inputMode + * @param sentenceFilterString + * @param paragraphFilterString * @return An InputStream of the created Excel file. */ - File getOverviewExcel(List events, long dataSessionId, EnumSet inputMode) throws IOException; + File getOverviewExcel(List events, long dataSessionId, EnumSet inputMode, String sentenceFilterString, String paragraphFilterString) throws IOException; } diff --git a/gepi/gepi-core/src/main/resources/ExcelResultCreation.py b/gepi/gepi-core/src/main/resources/ExcelResultCreation.py index 129cf41b..9e21e70a 100644 --- a/gepi/gepi-core/src/main/resources/ExcelResultCreation.py +++ b/gepi/gepi-core/src/main/resources/ExcelResultCreation.py @@ -33,7 +33,7 @@ def makeArgumentSymbolPivotTable(df, column, order): givengenesfreq[('both','total sum')] = givengenesfreq[('exact','sum')] + givengenesfreq[('fuzzy','sum')] return givengenesfreq -def writeresults(input,output,inputMode): +def writeresults(input,output,inputMode,sentenceFilterString,paragraphFilterString): header = ["arg1symbol", "arg2symbol", "arg1text", "arg2text", "arg1entrezid", "arg2entrezid", "arg1matchtype", "arg2matchtype", "relationtypes", "docid", "eventid", "fulltextmatchtype", "context"] columndesc=[ 'Input gene symbol', 'Event partner gene symbol', @@ -124,26 +124,30 @@ def writeresults(input,output,inputMode): bold = ew.book.add_format({'bold': True}) frontpage.write(0,0, f'This is a GePi statistics file which contains results of event extraction. Creation date is {date.today()}.') frontpage.write(1,0, 'The contained worksheets contain the actual text mining results as well as statistics extracted from them.') - frontpage.write(3,0, 'The "Results" sheet is a large table containing the gene event arguments, an indication of how well the text matched') - frontpage.write(4,0, 'a gene synonym ("exact" or "fuzzy"), the recognized type of the event (such as "phosphorylation" or "regulation"),') - frontpage.write(5,0, 'the document ID (PubMed ID for PubMed results, PMC ID for PubMed Central results) and the sentence in which the') - frontpage.write(6,0, 'respective event was found.') + frontpage.write(2,0, 'The result was obtained using the following filter terms:') + frontpage.write(3,0, f'Sentence level filter terms: {sentenceFilterString}') + frontpage.write(4,0, f'Paragraph level filter terms: {paragraphFilterString}') + frontpage.write(5,0, 'Only molecular events that were described in a sentence or a paragraph containing the filter terms was returned for this result.') + frontpage.write(7,0, 'The "Results" sheet is a large table containing the gene event arguments, an indication of how well the text matched') + frontpage.write(8,0, 'a gene synonym ("exact" or "fuzzy"), the recognized type of the event (such as "phosphorylation" or "regulation"),') + frontpage.write(9,0, 'the document ID (PubMed ID for PubMed results, PMC ID for PubMed Central results) and the sentence in which the') + frontpage.write(10,0, 'respective event was found.') resultsdesc.to_excel(ew, startrow=7, index=False, sheet_name='Frontpage') - frontpage.write(20,0, 'The matchtype "exact" means that the textual gene name could be matched perfectly to a synonym of a NCBI Gene database entry.') - frontpage.write(21,0, '"Fuzzy" means that the gene name found in the literature could only be mapped to an NCBI Gene record by allowing minor differences when comparing with the synonyms.') - frontpage.write(22,0, 'Example: Assume the text match was "{}". This cannot be found exactly in NCBI Gene. However, the synonym "{}" exists which could be used for the mapping.'.format('25 kDa lysophospholipid-specific lysophospholipase', 'lysophospholipid-specific lysophospholipase')) + frontpage.write(24,0, 'The matchtype "exact" means that the textual gene name could be matched perfectly to a synonym of a NCBI Gene database entry.') + frontpage.write(25,0, '"Fuzzy" means that the gene name found in the literature could only be mapped to an NCBI Gene record by allowing minor differences when comparing with the synonyms.') + frontpage.write(26,0, 'Example: Assume the text match was "{}". This cannot be found exactly in NCBI Gene. However, the synonym "{}" exists which could be used for the mapping.'.format('25 kDa lysophospholipid-specific lysophospholipase', 'lysophospholipid-specific lysophospholipase')) #frontpage.write(24,0, 'Description of the sheets:', bold) - frontpage.write(24,0, 'Description of the sheets:') + frontpage.write(28,0, 'Description of the sheets:') if 'A' in inputMode or 'AB' in inputMode: - frontpage.write(25,0, '"Given Genes Statistics" shows how often the input gene symbols were found in relations with other genes, separated by exact and fuzzy matches.') - frontpage.write(26,0, '"Event Partner Statistics" shows the same but from the perspective of the interaction partners of the input genes.') - frontpage.write(27,0, '"Event Statistics" lists the extracted events grouped by their combination of input and event partner genes. In other words, it counts how often two genes interact with each other in the results.') - frontpage.write(28,0, '"Input Gene Event Diversity" shows for each input gene symbol how many different interaction partners it has in the results.') - frontpage.write(29,0, '"Gene Argument Event Diversity" shows for each gene that participated in an event the number of different interaction partners in the results.') + frontpage.write(29,0, '"Given Genes Statistics" shows how often the input gene symbols were found in relations with other genes, separated by exact and fuzzy matches.') + frontpage.write(30,0, '"Event Partner Statistics" shows the same but from the perspective of the interaction partners of the input genes.') + frontpage.write(31,0, '"Event Statistics" lists the extracted events grouped by their combination of input and event partner genes. In other words, it counts how often two genes interact with each other in the results.') + frontpage.write(32,0, '"Input Gene Event Diversity" shows for each input gene symbol how many different interaction partners it has in the results.') + frontpage.write(33,0, '"Gene Argument Event Diversity" shows for each gene that participated in an event the number of different interaction partners in the results.') else: - frontpage.write(25,0, '"Gene Interaction Statistics" shows how often gene symbols were found in relations with other genes, separated by exact and fuzzy matches.') - frontpage.write(26,0, '"Event Statistics" lists the extracted events grouped by their combination of input and event partner genes. In other words, it counts how often two genes interact with each other in the results.') - frontpage.write(27,0, '"Gene Argument Event Diversity" shows for each gene that participated in an event the number of different interaction partners in the results.') + frontpage.write(29,0, '"Gene Interaction Statistics" shows how often gene symbols were found in relations with other genes, separated by exact and fuzzy matches.') + frontpage.write(30,0, '"Event Statistics" lists the extracted events grouped by their combination of input and event partner genes. In other words, it counts how often two genes interact with each other in the results.') + frontpage.write(31,0, '"Gene Argument Event Diversity" shows for each gene that participated in an event the number of different interaction partners in the results.') return df @@ -151,6 +155,8 @@ def writeresults(input,output,inputMode): input = sys.argv[1] output = sys.argv[2] inputMode = sys.argv[3].split(' ') + sentenceFilterString = sys.argv[4] + paragraphFilterString = sys.argv[5] - writeresults(input,output,inputMode) + writeresults(input,output,inputMode,sentenceFilterString,paragraphFilterString) diff --git a/gepi/gepi-core/src/test/java/de/julielab/gepi/core/services/GePiDataServiceTest.java b/gepi/gepi-core/src/test/java/de/julielab/gepi/core/services/GePiDataServiceTest.java index f9e59cda..2e21f729 100644 --- a/gepi/gepi-core/src/test/java/de/julielab/gepi/core/services/GePiDataServiceTest.java +++ b/gepi/gepi-core/src/test/java/de/julielab/gepi/core/services/GePiDataServiceTest.java @@ -73,7 +73,7 @@ public void writeExcelSummary() throws Exception { GePiDataService gePiDataService = new GePiDataService(); - File outputFile = gePiDataService.getOverviewExcel(events, 1234, EnumSet.of(InputMode.A)); + File outputFile = gePiDataService.getOverviewExcel(events, 1234, EnumSet.of(InputMode.A), null, null); assertThat(outputFile).exists(); } diff --git a/gepi/gepi-webapp/src/main/java/de/julielab/gepi/webapp/components/GepiInput.java b/gepi/gepi-webapp/src/main/java/de/julielab/gepi/webapp/components/GepiInput.java index db4aa3e1..6fbfc718 100644 --- a/gepi/gepi-webapp/src/main/java/de/julielab/gepi/webapp/components/GepiInput.java +++ b/gepi/gepi-webapp/src/main/java/de/julielab/gepi/webapp/components/GepiInput.java @@ -122,9 +122,11 @@ public class GepiInput { private List selectedDevSettings; @Property + @Parameter private String sentenceFilterString; @Property + @Parameter private String paragraphFilterString; @Property diff --git a/gepi/gepi-webapp/src/main/java/de/julielab/gepi/webapp/components/GepiWidget.java b/gepi/gepi-webapp/src/main/java/de/julielab/gepi/webapp/components/GepiWidget.java index bd2bb053..7ec253e2 100644 --- a/gepi/gepi-webapp/src/main/java/de/julielab/gepi/webapp/components/GepiWidget.java +++ b/gepi/gepi-webapp/src/main/java/de/julielab/gepi/webapp/components/GepiWidget.java @@ -33,8 +33,6 @@ public class GepiWidget { @Property protected long dataSessionId; - @Parameter - protected EnumSet inputMode; @InjectPage private Index index; diff --git a/gepi/gepi-webapp/src/main/java/de/julielab/gepi/webapp/components/TableResultWidget.java b/gepi/gepi-webapp/src/main/java/de/julielab/gepi/webapp/components/TableResultWidget.java index 4c484757..8a800cee 100644 --- a/gepi/gepi-webapp/src/main/java/de/julielab/gepi/webapp/components/TableResultWidget.java +++ b/gepi/gepi-webapp/src/main/java/de/julielab/gepi/webapp/components/TableResultWidget.java @@ -1,25 +1,15 @@ package de.julielab.gepi.webapp.components; -import java.io.*; -import java.text.FieldPosition; -import java.text.Format; -import java.text.MessageFormat; -import java.text.ParsePosition; -import java.util.*; -import java.util.concurrent.ExecutionException; -import java.util.stream.Collectors; -import java.util.stream.Stream; - -import de.julielab.gepi.core.retrieval.data.EventRetrievalResult; +import de.julielab.gepi.core.retrieval.data.Argument; +import de.julielab.gepi.core.retrieval.data.Event; import de.julielab.gepi.core.retrieval.data.InputMode; import de.julielab.gepi.core.services.IGePiDataService; import de.julielab.gepi.webapp.base.TabPersistentField; import de.julielab.java.utilities.FileUtilities; -import org.apache.poi.hssf.usermodel.HSSFWorkbook; -import org.apache.poi.ss.usermodel.*; import org.apache.tapestry5.ComponentResources; import org.apache.tapestry5.StreamResponse; import org.apache.tapestry5.annotations.Log; +import org.apache.tapestry5.annotations.Parameter; import org.apache.tapestry5.annotations.Persist; import org.apache.tapestry5.annotations.Property; import org.apache.tapestry5.beanmodel.BeanModel; @@ -27,11 +17,20 @@ import org.apache.tapestry5.commons.Messages; import org.apache.tapestry5.http.services.Response; import org.apache.tapestry5.ioc.annotations.Inject; - -import de.julielab.gepi.core.retrieval.data.Argument; -import de.julielab.gepi.core.retrieval.data.Event; import org.slf4j.Logger; +import java.io.File; +import java.io.IOException; +import java.io.InputStream; +import java.text.FieldPosition; +import java.text.Format; +import java.text.ParsePosition; +import java.util.ArrayList; +import java.util.EnumSet; +import java.util.List; +import java.util.concurrent.ExecutionException; +import java.util.stream.Collectors; + public class TableResultWidget extends GepiWidget { @Inject @@ -59,6 +58,15 @@ public class TableResultWidget extends GepiWidget { @Inject private ComponentResources resources; + @Parameter + protected EnumSet inputMode; + + @Parameter + private String sentenceFilterString; + + @Parameter + private String paragraphFilterString; + @Property @Persist(TabPersistentField.TAB) private BeanModel tableModel; @@ -158,7 +166,7 @@ StreamResponse onDownload(long dataSessionId) { @Override public void prepareResponse(Response response) { try { - statisticsFile = dataService.getOverviewExcel(getEsResult().get().getEventList(), dataSessionId, inputMode); + statisticsFile = dataService.getOverviewExcel(getEsResult().get().getEventList(), dataSessionId, inputMode, sentenceFilterString, paragraphFilterString); response.setHeader("Content-Length", "" + statisticsFile.length()); // output into file response.setHeader("Content-disposition", "attachment; filename=" + statisticsFile.getName()); @@ -243,9 +251,9 @@ public String getAllEventTypes() { public String getContext() { if (event.isParagraphMatchingFulltextQuery() && !event.isSentenceMatchingFulltextQuery()) - return event.getHlParagraph(); + return event.getSentence() + "
" + event.getHlParagraph(); if (event.isSentenceMatchingFulltextQuery()) - return event.getHlSentence(); + return event.getSentence() + "
" + event.getHlSentence(); return event.getSentence(); } diff --git a/gepi/gepi-webapp/src/main/java/de/julielab/gepi/webapp/pages/Index.java b/gepi/gepi-webapp/src/main/java/de/julielab/gepi/webapp/pages/Index.java index a9c4d2b8..949b2011 100644 --- a/gepi/gepi-webapp/src/main/java/de/julielab/gepi/webapp/pages/Index.java +++ b/gepi/gepi-webapp/src/main/java/de/julielab/gepi/webapp/pages/Index.java @@ -56,6 +56,12 @@ public class Index { @Property @Persist(TabPersistentField.TAB) private EnumSet inputMode; + @Property + @Persist(TabPersistentField.TAB) + private String sentenceFilterString; + @Property + @Persist(TabPersistentField.TAB) + private String paragraphFilterString; @Persist(TabPersistentField.TAB) private boolean hasLargeWidget; @@ -179,9 +185,6 @@ JSONObject onLoadDataToClient() { } else { List eventList = data.getUnrolledResult().get().getEventList(); - Set idSet = eventList.stream().map(Event::getEventId).collect(Collectors.toSet()); - Set duplicates = new HashSet<>(); - log.warn("All returned events: {}; unique event IDs: {}; some duplicate eventIDs: {}", eventList.size(), idSet.size(), eventList.stream().map(Event::getEventId).filter(Predicate.not(duplicates::add)).limit(10).collect(Collectors.toList())); log.debug("Obtained unrolled list of individual events of size {}.", eventList.size()); jsonObject = dataService.getPairedArgsCount(eventList); } diff --git a/gepi/gepi-webapp/src/main/resources/de/julielab/gepi/webapp/pages/Index.tml b/gepi/gepi-webapp/src/main/resources/de/julielab/gepi/webapp/pages/Index.tml index 8dd01f9a..d2d05c0e 100644 --- a/gepi/gepi-webapp/src/main/resources/de/julielab/gepi/webapp/pages/Index.tml +++ b/gepi/gepi-webapp/src/main/resources/de/julielab/gepi/webapp/pages/Index.tml @@ -7,7 +7,7 @@
- +
@@ -27,7 +27,7 @@
- +