Fixes #183.

JULIELab · May 20, 2024 · d098f1f · d098f1f
1 parent f26e26b
commit d098f1f
Show file tree

Hide file tree

Showing 7 changed files with 61 additions and 22 deletions.
diff --git a/jcore-gnormplus-ae/pom.xml b/jcore-gnormplus-ae/pom.xml
@@ -13,13 +13,13 @@
  <version>2.6.1</version>
  </parent>
 
- <version>2.6.9</version>
+ <version>2.6.10</version>
 
  <dependencies>
  <dependency>
  <groupId>de.julielab</groupId>
  <artifactId>julielab-gnormplus</artifactId>
- <version>1.0.1</version>
+ <version>[1.0.2,1.1)</version>
  </dependency>
  <dependency>
  <groupId>de.julielab</groupId>

diff --git a/jcore-gnormplus-ae/src/main/java/de/julielab/jcore/ae/gnp/GNormPlusProcessing.java b/jcore-gnormplus-ae/src/main/java/de/julielab/jcore/ae/gnp/GNormPlusProcessing.java
@@ -61,10 +61,17 @@ public static Path processWithGNormPlus(BioCCollection bioCCollection, String ou
  w.writeCollection(bioCCollection);
  }
  GNormPlus.processFile(filePath.toString(), filePath.getFileName().toString(), outputFilePath.toString(), System.currentTimeMillis(), "Test");
- Files.delete(filePath);
  } catch (IOException | XMLStreamException e) {
  log.error("Could not process document {}", collectionId);
  throw new AnalysisEngineProcessException(e);
+ } finally {
+ try {
+ if (Files.exists(filePath))
+ Files.delete(filePath);
+ } catch (IOException e) {
+ log.error("Could not delete temporary GNormPlus File {}", filePath);
+ throw new AnalysisEngineProcessException(e);
+ }
  }
  return outputFilePath;
  }

diff --git a/...gnormplus-ae/src/main/java/de/julielab/jcore/multiplier/gnp/GNormPlusMultiplierLogic.java b/...gnormplus-ae/src/main/java/de/julielab/jcore/multiplier/gnp/GNormPlusMultiplierLogic.java
@@ -1,5 +1,6 @@
 package de.julielab.jcore.multiplier.gnp;
 
+import GNormPluslib.InconsistentDataException;
 import com.pengyifan.bioc.BioCCollection;
 import com.pengyifan.bioc.BioCDocument;
 import de.julielab.jcore.ae.gnp.GNormPlusProcessing;
@@ -81,6 +82,7 @@ public AbstractCas next() throws AnalysisEngineProcessException {
  // This allows batch-processing within GNP which reduces file writes and reads (GNP internally
  // writes a lot of temporary files that contain all the documents given to it in one single batch file).
  cachedCasData.clear();
+
  while (baseMultiplierHasNext.get()) {
  final JCas jCas = baseMultiplierNext.get();
  boolean isDocumentHashUnchanged = false;
@@ -110,8 +112,9 @@ public AbstractCas next() throws AnalysisEngineProcessException {
  // now process the whole batch with GNP
  if (gnormPlusInputCollection.getDocmentCount() > 0) {
  log.trace("Processing {} documents with GNormPlus.", gnormPlusInputCollection.getDocmentCount());
- final Path outputFilePath = GNormPlusProcessing.processWithGNormPlus(gnormPlusInputCollection, outputDirectory);
+ Path outputFilePath = null;
  try {
+ outputFilePath = GNormPlusProcessing.processWithGNormPlus(gnormPlusInputCollection, outputDirectory);
  bioCCasPopulator = new BioCCasPopulator(outputFilePath, Class.forName(outputGeneTypeName).getConstructor(JCas.class));
  // delete the GNP output if we don't want to keep it
  if (outputDirectory.isBlank()) {
@@ -123,6 +126,11 @@ public AbstractCas next() throws AnalysisEngineProcessException {
  } catch (ClassNotFoundException | NoSuchMethodException e) {
  log.error("Could not obtain UIMA gene annotation type constructor for class {}", outputGeneTypeName);
  throw new AnalysisEngineProcessException(e);
+ } catch (InconsistentDataException e) {
+ log.warn("GNormPlus encountered a data issue it cannot recover from: {} - no gene annotations will be created for this document batch.", e.getMessage());
+ // We set the populator to null as a signal that there are no annotations to be read. This is
+ // used further down to skip entity population of cached CASes before returning the CASes.
+ bioCCasPopulator = null;
  }
  }
  }
@@ -144,7 +152,8 @@ public AbstractCas next() throws AnalysisEngineProcessException {
  }
  // If the document is unchanged and we skip unchanged documents, we do not have a GNormPlus result for this
  // document, skip.
- if (!(isDocumentHashUnchanged && skipUnchangedDocuments)) {
+ // Also skip if the casPopulator is null. This can happen above when there is an error in GNormPlus.
+ if (!(isDocumentHashUnchanged && skipUnchangedDocuments) && bioCCasPopulator != null) {
  bioCCasPopulator.populateWithNextDocument(jCas, true);
  bioCCasPopulator.clearDocument(currentBiocResultCollectionIndex++);
  }

diff --git a/jcore-pmc-db-reader/pom.xml b/jcore-pmc-db-reader/pom.xml
@@ -30,7 +30,7 @@
  <dependency>
  <groupId>de.julielab</groupId>
  <artifactId>jcore-db-reader</artifactId>
- <version>2.6.2</version>
+ <version>2.6.3</version>
  </dependency>
  <dependency>
  <groupId>de.julielab</groupId>

diff --git a/jcore-xmi-db-reader/pom.xml b/jcore-xmi-db-reader/pom.xml
@@ -13,7 +13,7 @@
  <description>Reads CAS XMI data from a relational database (Postgres). Thus, the stored CASes may then be processed
  further.
  </description>
-
+ <version>2.6.4-SNAPSHOT</version>
  <dependencies>
  <dependency>
  <groupId>de.julielab</groupId>

diff --git a/jcore-xmi-db-reader/src/main/java/de/julielab/jcore/reader/xmi/CasPopulator.java b/jcore-xmi-db-reader/src/main/java/de/julielab/jcore/reader/xmi/CasPopulator.java
@@ -162,11 +162,11 @@ public void populateCas(byte[][] data, JCas jCas) throws CasPopulationException
  JCoReTools.deserializeXmi(jCas.getCas(), new ByteArrayInputStream(xmiByteData), xercesAttributeBufferSize);
  } catch (SAXException e) {
  String docData = new String(xmiByteData, StandardCharsets.UTF_8);
- if (!docData.contains("xmi:XMI xmlns:xmi=\"http://www.omg.org/XMI\""))
+ if (!docData.contains("xmi:XMI") || !docData.contains("xmlns:xmi=\"http://www.omg.org/XMI\""))
  throw new CollectionException(new IllegalArgumentException("The document that has been received from the database does not " +
  "appear to contain XMI data. The beginning of the document data is: " +
  StringUtils.abbreviate(docData, 200), e));
- log.error("SAXException while deserializing CAS XMI data from a segmented and re-assemblied XMI " +
+ log.error("SAXException while deserializing CAS XMI data from a segmented and re-assembled XMI " +
  "document. Beginning of data was: {}", StringUtils.abbreviate(docData, 200));
  throw new CollectionException(e);
  }

diff --git a/jcore-xmi-db-reader/src/main/java/de/julielab/jcore/reader/xmi/XmiDBMultiplier.java b/jcore-xmi-db-reader/src/main/java/de/julielab/jcore/reader/xmi/XmiDBMultiplier.java
@@ -14,6 +14,7 @@
 import org.apache.uima.UimaContext;
 import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
 import org.apache.uima.cas.AbstractCas;
+import org.apache.uima.cas.impl.XCASParsingException;
 import org.apache.uima.fit.descriptor.ConfigurationParameter;
 import org.apache.uima.fit.util.JCasUtil;
 import org.apache.uima.jcas.JCas;
@@ -49,7 +50,7 @@ public class XmiDBMultiplier extends DBMultiplier implements Initializable {
  public void initialize(UimaContext aContext) throws ResourceInitializationException {
  super.initialize(aContext);
  // Read the two settings of this multiplier. A null parameter value falls back to a default:
  // false for logFinalXmi and 0 for truncationSize.
  logFinalXmi = Optional.ofNullable((Boolean) aContext.getConfigParameterValue(PARAM_LOG_FINAL_XMI)).orElse(false);
- truncationSize = Optional.ofNullable((Integer)aContext.getConfigParameterValue(PARAM_TRUNCATE_AT_SIZE)).orElse(0);
+ truncationSize = Optional.ofNullable((Integer) aContext.getConfigParameterValue(PARAM_TRUNCATE_AT_SIZE)).orElse(0);
  }
 
  @Override
@@ -87,26 +88,48 @@ public void process(JCas aJCas) throws AnalysisEngineProcessException {
 
  @Override
  public AbstractCas next() throws AnalysisEngineProcessException {
+ // Returns the next successfully populated CAS. Documents whose assembled XMI data turns out to be
+ // corrupt are skipped. Returns null when the document iterator is exhausted without a usable CAS.
- JCas jCas = getEmptyJCas();
- try {
- if (documentDataIterator.hasNext()) {
+ JCas jCas = null;
+ // We use a loop here because further down we catch particular exceptions that should cause the
+ // current document to be skipped over.
+ while (jCas == null && documentDataIterator.hasNext()) {
+ try {
+ jCas = getEmptyJCas();
  log.trace("Returning next CAS");
  try {
  initializer.initializeAnnotationTableNames(jCas);
  } catch (ResourceInitializationException e) {
  throw new AnalysisEngineProcessException(e);
  }
  populateCas(jCas);
+ if (log.isTraceEnabled()) {
+ log.trace("Outgoing multiplier jCas instance: {}", jCas);
+ log.trace("Returning CAS containing document {}", JCoReTools.getDocId(jCas));
+ }
+ } catch (Throwable throwable) {
+ // getEmptyJCas() itself may have thrown, in which case there is no CAS to release. Releasing
+ // unconditionally would replace the real error with a NullPointerException.
+ if (jCas != null) {
+ jCas.release();
+ }
+ // We skip XMI exceptions on the assumption that they stem from a corrupted JeDIS XMI
+ // annotation store. We cannot know in advance which documents of the dataset are corrupted,
+ // so we skip the corrupted ones. They keep the in_process state and can be collected after
+ // processing.
+ // Walk to the root of the cause chain, starting at the caught throwable itself so that an
+ // exception that arrives without a wrapper is classified as well.
+ Throwable cause = throwable;
+ while (cause.getCause() != null) {
+ cause = cause.getCause();
+ }
+ if (cause instanceof XCASParsingException) {
+ log.warn("XCASParsingException occurred. That means that the JeDIS XMI modules could not be assembled into a complete, valid XMI document. The CAS is skipped and the next document is read. After processing, the skipped documents will still have 'in_process' state in the CoStoSys subset. This information can be used to repair/redo the preprocessing for the corrupted documents.");
+ // setting jCas to null, so we repeat the loop at the top and continue to the next document
+ jCas = null;
+ } else if (cause instanceof IllegalArgumentException && cause.getMessage() != null && cause.getMessage().startsWith("Detected XMI ID clash")) {
+ log.warn("XMI ID clash in assembled XMI detected. The XMI elements in the CoStoSys storage have non-unique XMI IDs, the data is corrupt. The CAS is skipped and the next document is read. After processing, the skipped documents will still have 'in_process' state in the CoStoSys subset. This information can be used to repair/redo the preprocessing for the corrupted documents.");
+ // setting jCas to null, so we repeat the loop at the top and continue to the next document
+ jCas = null;
+ } else {
+ log.error("Error while reading document from the database. Releasing the CAS. ", throwable);
+ throw new AnalysisEngineProcessException(throwable);
+ }
  }
- } catch (Throwable throwable) {
- log.error("Error while reading document from the database. Releasing the CAS. ", throwable);
- jCas.release();
- throw new AnalysisEngineProcessException(throwable);
- }
- if (log.isTraceEnabled()) {
- log.trace("Outgoing multiplier jCas instance: {}", jCas);
- log.trace("Returning CAS containing document {}", JCoReTools.getDocId(jCas));
  }
+
  return jCas;
  }
 
@@ -129,7 +152,7 @@ private void populateCas(JCas jCas) throws AnalysisEngineProcessException {
  }
  boolean truncate = false;
  if (truncationSize > 0) {
- if(data[pkSize].length > truncationSize)
+ if (data[pkSize].length > truncationSize)
  truncate = true;
  }
  if (data != null && !truncate)