Skip to content

Commit

Permalink
Fixes #183.
Browse files Browse the repository at this point in the history
  • Loading branch information
khituras committed May 20, 2024
1 parent f26e26b commit d098f1f
Show file tree
Hide file tree
Showing 7 changed files with 61 additions and 22 deletions.
4 changes: 2 additions & 2 deletions jcore-gnormplus-ae/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,13 @@
<version>2.6.1</version>
</parent>

<version>2.6.9</version>
<version>2.6.10</version>

<dependencies>
<dependency>
<groupId>de.julielab</groupId>
<artifactId>julielab-gnormplus</artifactId>
<version>1.0.1</version>
<version>[1.0.2,1.1)</version>
</dependency>
<dependency>
<groupId>de.julielab</groupId>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -61,10 +61,17 @@ public static Path processWithGNormPlus(BioCCollection bioCCollection, String ou
w.writeCollection(bioCCollection);
}
GNormPlus.processFile(filePath.toString(), filePath.getFileName().toString(), outputFilePath.toString(), System.currentTimeMillis(), "Test");
Files.delete(filePath);
} catch (IOException | XMLStreamException e) {
log.error("Could not process document {}", collectionId);
throw new AnalysisEngineProcessException(e);
} finally {
try {
if (Files.exists(filePath))
Files.delete(filePath);
} catch (IOException e) {
log.error("Could not delete temporary GNormPlus File {}", filePath);
throw new AnalysisEngineProcessException(e);
}
}
return outputFilePath;
}
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
package de.julielab.jcore.multiplier.gnp;

import GNormPluslib.InconsistentDataException;
import com.pengyifan.bioc.BioCCollection;
import com.pengyifan.bioc.BioCDocument;
import de.julielab.jcore.ae.gnp.GNormPlusProcessing;
Expand Down Expand Up @@ -81,6 +82,7 @@ public AbstractCas next() throws AnalysisEngineProcessException {
// This allows batch-processing within GNP which reduces file writes and reads (GNP internally
// writes a lot of temporary files that contain all the documents given to it in one single batch file).
cachedCasData.clear();

while (baseMultiplierHasNext.get()) {
final JCas jCas = baseMultiplierNext.get();
boolean isDocumentHashUnchanged = false;
Expand Down Expand Up @@ -110,8 +112,9 @@ public AbstractCas next() throws AnalysisEngineProcessException {
// now process the whole batch with GNP
if (gnormPlusInputCollection.getDocmentCount() > 0) {
log.trace("Processing {} documents with GNormPlus.", gnormPlusInputCollection.getDocmentCount());
final Path outputFilePath = GNormPlusProcessing.processWithGNormPlus(gnormPlusInputCollection, outputDirectory);
Path outputFilePath = null;
try {
outputFilePath = GNormPlusProcessing.processWithGNormPlus(gnormPlusInputCollection, outputDirectory);
bioCCasPopulator = new BioCCasPopulator(outputFilePath, Class.forName(outputGeneTypeName).getConstructor(JCas.class));
// delete the GNP output if we don't want to keep it
if (outputDirectory.isBlank()) {
Expand All @@ -123,6 +126,11 @@ public AbstractCas next() throws AnalysisEngineProcessException {
} catch (ClassNotFoundException | NoSuchMethodException e) {
log.error("Could not obtain UIMA gene annotation type constructor for class {}", outputGeneTypeName);
throw new AnalysisEngineProcessException(e);
} catch (InconsistentDataException e) {
log.warn("GNormPlus encountered a data issue it cannot recover from: {} - no gene annotations will be created for this document batch.", e.getMessage());
// We set the populator to null as a signal that there are no annotations to be read. This is
// used further down to skip entity population of cached CASes before returning the CASes.
bioCCasPopulator = null;
}
}
}
Expand All @@ -144,7 +152,8 @@ public AbstractCas next() throws AnalysisEngineProcessException {
}
// If the document is unchanged and we skip unchanged documents, we do not have a GNormPlus result for this
// document, skip.
if (!(isDocumentHashUnchanged && skipUnchangedDocuments)) {
// Also skip if the casPopulator is null. This can happen above when there is an error in GNormPlus.
if (!(isDocumentHashUnchanged && skipUnchangedDocuments) && bioCCasPopulator != null) {
bioCCasPopulator.populateWithNextDocument(jCas, true);
bioCCasPopulator.clearDocument(currentBiocResultCollectionIndex++);
}
Expand Down
2 changes: 1 addition & 1 deletion jcore-pmc-db-reader/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@
<dependency>
<groupId>de.julielab</groupId>
<artifactId>jcore-db-reader</artifactId>
<version>2.6.2</version>
<version>2.6.3</version>
</dependency>
<dependency>
<groupId>de.julielab</groupId>
Expand Down
2 changes: 1 addition & 1 deletion jcore-xmi-db-reader/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
<description>Reads CAS XMI data from a relational database (Postgres). Thus, the stored CASes may then be processed
further.
</description>

<version>2.6.4-SNAPSHOT</version>
<dependencies>
<dependency>
<groupId>de.julielab</groupId>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -162,11 +162,11 @@ public void populateCas(byte[][] data, JCas jCas) throws CasPopulationException
JCoReTools.deserializeXmi(jCas.getCas(), new ByteArrayInputStream(xmiByteData), xercesAttributeBufferSize);
} catch (SAXException e) {
String docData = new String(xmiByteData, StandardCharsets.UTF_8);
if (!docData.contains("xmi:XMI xmlns:xmi=\"http://www.omg.org/XMI\""))
if (!docData.contains("xmi:XMI") || !docData.contains("xmlns:xmi=\"http://www.omg.org/XMI\""))
throw new CollectionException(new IllegalArgumentException("The document that has been received from the database does not " +
"appear to contain XMI data. The beginning of the document data is: " +
StringUtils.abbreviate(docData, 200), e));
log.error("SAXException while deserializing CAS XMI data from a segmented and re-assemblied XMI " +
log.error("SAXException while deserializing CAS XMI data from a segmented and re-assembled XMI " +
"document. Beginning of data was: {}", StringUtils.abbreviate(docData, 200));
throw new CollectionException(e);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.cas.AbstractCas;
import org.apache.uima.cas.impl.XCASParsingException;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.util.JCasUtil;
import org.apache.uima.jcas.JCas;
Expand Down Expand Up @@ -49,7 +50,7 @@ public class XmiDBMultiplier extends DBMultiplier implements Initializable {
public void initialize(UimaContext aContext) throws ResourceInitializationException {
super.initialize(aContext);
logFinalXmi = Optional.ofNullable((Boolean) aContext.getConfigParameterValue(PARAM_LOG_FINAL_XMI)).orElse(false);
truncationSize = Optional.ofNullable((Integer)aContext.getConfigParameterValue(PARAM_TRUNCATE_AT_SIZE)).orElse(0);
truncationSize = Optional.ofNullable((Integer) aContext.getConfigParameterValue(PARAM_TRUNCATE_AT_SIZE)).orElse(0);
}

@Override
Expand Down Expand Up @@ -87,26 +88,48 @@ public void process(JCas aJCas) throws AnalysisEngineProcessException {

@Override
public AbstractCas next() throws AnalysisEngineProcessException {
JCas jCas = getEmptyJCas();
try {
if (documentDataIterator.hasNext()) {
JCas jCas = null;
// we use a loop here because further down we catch a particular exception that should cause the current
// document to be skipped over
while (jCas == null && documentDataIterator.hasNext()) {
try {
jCas = getEmptyJCas();
log.trace("Returning next CAS");
try {
initializer.initializeAnnotationTableNames(jCas);
} catch (ResourceInitializationException e) {
throw new AnalysisEngineProcessException(e);
}
populateCas(jCas);
if (log.isTraceEnabled()) {
log.trace("Outgoing multiplier jCas instance: {}", jCas);
log.trace("Returning CAS containing document {}", JCoReTools.getDocId(jCas));
}
} catch (Throwable throwable) {
jCas.release();
                // We want to skip XMI exceptions on the assumption that they stem from a corrupted JeDIS XMI
                // annotation store. Because we can't know which documents in the complete dataset are corrupted,
                // we just skip the corrupted ones, so they remain stuck in the in_process state and we can collect
                // them after processing.
Throwable cause = throwable.getCause();
while (cause != null && cause.getCause() != null)
cause = cause.getCause();
if (cause != null && cause instanceof XCASParsingException) {
log.warn("XCASParsingException occurred. That means that the JeDIS XMI modules could not be assembled into a complete, valid XMI document. The CAS is skipped and the next document is read. After processing, the skipped documents will still have 'in_process' state in the CoStoSys subset. This information can be used to repair/redo the preprocessing for the corrupted documents.");
// setting jCas to null, so we repeat the loop at the top and continue to the next document
jCas = null;
} else if (cause != null && cause instanceof IllegalArgumentException && cause.getMessage() != null && cause.getMessage().startsWith("Detected XMI ID clash")){
log.warn("XMI ID clash in assembled XMI detected. The XMI elements in the CoStoSys storage have non-unique XMI IDs, the data is corrupt. The CAS is skipped and the next document is read. After processing, the skipped documents will still have 'in_process' state in the CoStoSys subset. This information can be used to repair/redo the preprocessing for the corrupted documents.");
// setting jCas to null, so we repeat the loop at the top and continue to the next document
jCas = null;
}
else {
log.error("Error while reading document from the database. Releasing the CAS. ", throwable);
throw new AnalysisEngineProcessException(throwable);
}
}
} catch (Throwable throwable) {
log.error("Error while reading document from the database. Releasing the CAS. ", throwable);
jCas.release();
throw new AnalysisEngineProcessException(throwable);
}
if (log.isTraceEnabled()) {
log.trace("Outgoing multiplier jCas instance: {}", jCas);
log.trace("Returning CAS containing document {}", JCoReTools.getDocId(jCas));
}

return jCas;
}

Expand All @@ -129,7 +152,7 @@ private void populateCas(JCas jCas) throws AnalysisEngineProcessException {
}
boolean truncate = false;
if (truncationSize > 0) {
if(data[pkSize].length > truncationSize)
if (data[pkSize].length > truncationSize)
truncate = true;
}
if (data != null && !truncate)
Expand Down

0 comments on commit d098f1f

Please sign in to comment.