From dd4f354f1d7d50c5c285ea28625a9644130e29e5 Mon Sep 17 00:00:00 2001 From: Richard Eckart de Castilho Date: Wed, 21 Feb 2024 18:40:00 +0100 Subject: [PATCH] #4533 - BioC exported from INCEpTION cannot be imported again due to missing mandatory metadata - Make sure that key, date and source are generated if they are not present in the CAS when writing BioC - Make reading BioC more robust so that it does not fail if key, date or source are missing --- .../inception/io/bioc/BioCReaderImplBase.java | 48 +++++++++++----- .../ukp/inception/io/bioc/BioCWriter.java | 9 ++- .../ukp/inception/io/bioc/BioCReaderTest.java | 15 +++++ .../ukp/inception/io/bioc/BioCWriterTest.java | 56 +++++++++++++++++++ .../bioc/example-with-incomplete-metadata.xml | 24 ++++++++ 5 files changed, 136 insertions(+), 16 deletions(-) create mode 100644 inception/inception-io-bioc/src/test/java/de/tudarmstadt/ukp/inception/io/bioc/BioCWriterTest.java create mode 100644 inception/inception-io-bioc/src/test/resources/bioc/example-with-incomplete-metadata.xml diff --git a/inception/inception-io-bioc/src/main/java/de/tudarmstadt/ukp/inception/io/bioc/BioCReaderImplBase.java b/inception/inception-io-bioc/src/main/java/de/tudarmstadt/ukp/inception/io/bioc/BioCReaderImplBase.java index 888485047d0..0394dbeb425 100644 --- a/inception/inception-io-bioc/src/main/java/de/tudarmstadt/ukp/inception/io/bioc/BioCReaderImplBase.java +++ b/inception/inception-io-bioc/src/main/java/de/tudarmstadt/ukp/inception/io/bioc/BioCReaderImplBase.java @@ -18,9 +18,11 @@ package de.tudarmstadt.ukp.inception.io.bioc; import static de.tudarmstadt.ukp.inception.support.xml.XmlParserUtils.isStartElement; +import static java.util.Arrays.asList; import java.io.IOException; import java.io.InputStream; +import java.util.Optional; import javax.xml.stream.XMLEventReader; import javax.xml.stream.XMLInputFactory; @@ -147,24 +149,40 @@ protected void readCollectionMetdata() throws XMLStreamException while ((collectionSource == null || 
collectionDate == null || collectionKey == null) && xmlEventReader.hasNext()) { var event = xmlEventReader.nextEvent(); - if (event.isStartElement()) { - if (event.asStartElement().getName().getLocalPart().equals(E_KEY)) { - event = xmlEventReader.nextEvent(); - collectionKey = event.asCharacters().getData(); - xmlEventReader.nextEvent(); // Reader closing element - } - else if (event.asStartElement().getName().getLocalPart().equals(E_SOURCE)) { - event = xmlEventReader.nextEvent(); - collectionSource = event.asCharacters().getData(); - xmlEventReader.nextEvent(); // Reader closing element - } - else if (event.asStartElement().getName().getLocalPart().equals(E_DATE)) { - event = xmlEventReader.nextEvent(); - collectionDate = event.asCharacters().getData(); - xmlEventReader.nextEvent(); // Reader closing element + + tryReadingMetadata(event, E_SOURCE).ifPresent($ -> collectionSource = $); + tryReadingMetadata(event, E_DATE).ifPresent($ -> collectionDate = $); + tryReadingMetadata(event, E_KEY).ifPresent($ -> collectionKey = $); + + if (xmlEventReader.hasNext()) { + var nextEvent = xmlEventReader.peek(); + if (nextEvent.isStartElement() && asList(E_DOCUMENT, E_INFON) + .contains(nextEvent.asStartElement().getName().getLocalPart())) { + // Make sure we do not consume the documents while looking for collection + // metadata. Although all metadata fields are mandatory in BioC, some of + // them may still be missing in practice, so stop searching here. 
+ break; } } } } } + + private Optional<String> tryReadingMetadata(XMLEvent event, String element) + throws XMLStreamException + { + if (event.isStartElement()) { + if (event.asStartElement().getName().getLocalPart().equals(element)) { + event = xmlEventReader.nextEvent(); + if (event.isCharacters()) { + return Optional.of(event.asCharacters().getData()); + } + else if (!event.isEndElement()) { + xmlEventReader.next(); // Read closing element + } + } + } + + return Optional.empty(); + } } diff --git a/inception/inception-io-bioc/src/main/java/de/tudarmstadt/ukp/inception/io/bioc/BioCWriter.java b/inception/inception-io-bioc/src/main/java/de/tudarmstadt/ukp/inception/io/bioc/BioCWriter.java index cb91b6d7897..d08fe365997 100644 --- a/inception/inception-io-bioc/src/main/java/de/tudarmstadt/ukp/inception/io/bioc/BioCWriter.java +++ b/inception/inception-io-bioc/src/main/java/de/tudarmstadt/ukp/inception/io/bioc/BioCWriter.java @@ -19,6 +19,9 @@ import static de.tudarmstadt.ukp.inception.io.bioc.BioCComponent.getCollectionMetadataField; +import java.time.LocalDate; +import java.time.format.DateTimeFormatter; + import javax.xml.bind.JAXBContext; import javax.xml.bind.JAXBException; import javax.xml.bind.Marshaller; @@ -80,8 +83,10 @@ public void initialize(UimaContext aContext) throws ResourceInitializationExcept @Override public void process(JCas aJCas) throws AnalysisEngineProcessException { + var formatter = DateTimeFormatter.ofPattern("yyyyMMdd"); + try (var docOS = getOutputStream(aJCas, filenameSuffix)) { - Marshaller marshaller = context.createMarshaller(); + var marshaller = context.createMarshaller(); marshaller.setProperty(Marshaller.JAXB_FORMATTED_OUTPUT, true); // Set to fragment mode to omit XML declaration marshaller.setProperty(Marshaller.JAXB_FRAGMENT, true); @@ -91,6 +96,8 @@ public void process(JCas aJCas) throws AnalysisEngineProcessException // Base-information - may be overwritten by the metadata fields below var dmd = DocumentMetaData.get(aJCas); 
bioCCollection.setSource(dmd.getCollectionId()); + bioCCollection.setKey(dmd.getDocumentId()); + bioCCollection.setDate(LocalDate.now().format(formatter)); // Use BioC metadata fields if available getCollectionMetadataField(aJCas.getCas(), E_SOURCE) diff --git a/inception/inception-io-bioc/src/test/java/de/tudarmstadt/ukp/inception/io/bioc/BioCReaderTest.java b/inception/inception-io-bioc/src/test/java/de/tudarmstadt/ukp/inception/io/bioc/BioCReaderTest.java index 5126f592489..656446b4999 100644 --- a/inception/inception-io-bioc/src/test/java/de/tudarmstadt/ukp/inception/io/bioc/BioCReaderTest.java +++ b/inception/inception-io-bioc/src/test/java/de/tudarmstadt/ukp/inception/io/bioc/BioCReaderTest.java @@ -115,4 +115,19 @@ void testReadMultipleFromOneFile() throws Exception assertThat(texts) // .containsExactly("Document 1 text.", "Document 2 text.", "Document 3 text."); } + + @Test + void testReadFileWithIncompleteMetadata() throws Exception + { + var reader = createReaderDescription( // + BioCReader.class, // + BioCReader.PARAM_SOURCE_LOCATION, + "src/test/resources/bioc/example-with-incomplete-metadata.xml"); + + var texts = new ArrayList(); + iteratePipeline(reader).forEach(cas -> texts.add(cas.getDocumentText().trim())); + + assertThat(texts) // + .containsExactly("Document 1 text.", "Document 2 text.", "Document 3 text."); + } } diff --git a/inception/inception-io-bioc/src/test/java/de/tudarmstadt/ukp/inception/io/bioc/BioCWriterTest.java b/inception/inception-io-bioc/src/test/java/de/tudarmstadt/ukp/inception/io/bioc/BioCWriterTest.java new file mode 100644 index 00000000000..8a6733ccdbe --- /dev/null +++ b/inception/inception-io-bioc/src/test/java/de/tudarmstadt/ukp/inception/io/bioc/BioCWriterTest.java @@ -0,0 +1,56 @@ +/* + * Licensed to the Technische Universität Darmstadt under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The Technische Universität Darmstadt + * licenses this file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package de.tudarmstadt.ukp.inception.io.bioc; + +import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngine; +import static org.assertj.core.api.Assertions.assertThat; + +import java.io.File; + +import org.apache.uima.fit.factory.CasFactory; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; + +import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData; + +class BioCWriterTest +{ + @Test + void thatMetadataIsGeneratedToOutput(@TempDir File aTmp) throws Exception + { + var cas = CasFactory.createCas(); + cas.setDocumentText("This is a test"); + + var dmd = DocumentMetaData.create(cas); + dmd.setCollectionId("collectionId"); + dmd.setDocumentId("documentId"); + + var writer = createEngine( // + BioCWriter.class, // + BioCWriter.PARAM_TARGET_LOCATION, aTmp); + + writer.process(cas); + + var out = new File(aTmp, "documentId.xml"); + assertThat(out).exists() // + .content() // + .contains("collectionId") // + .contains("") // + .contains("documentId"); + } +} diff --git a/inception/inception-io-bioc/src/test/resources/bioc/example-with-incomplete-metadata.xml b/inception/inception-io-bioc/src/test/resources/bioc/example-with-incomplete-metadata.xml new file mode 100644 index 00000000000..3dfd9640208 --- /dev/null +++ b/inception/inception-io-bioc/src/test/resources/bioc/example-with-incomplete-metadata.xml @@ -0,0 +1,24 @@ + + + + 1 + + 0 + 
Document 1 text. + + + + 2 + + 0 + Document 2 text. + + + + 3 + + 0 + Document 3 text. + + +