From e93b60769123e6e27104bbdec5c32e7a93d41033 Mon Sep 17 00:00:00 2001 From: Ludovic DANIEL Date: Mon, 22 Jul 2024 16:09:33 +0200 Subject: [PATCH 1/4] While harvesting OAI DC dc opening tag is not more required but other tags are ignored (cherry picked from commit 8514c7fca20b5ba0d9b889f5f72eb7c93d551075) --- .../api/imports/ImportGenericServiceBean.java | 20 +++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/api/imports/ImportGenericServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/api/imports/ImportGenericServiceBean.java index d32a548c8bf..778d5a4167c 100644 --- a/src/main/java/edu/harvard/iq/dataverse/api/imports/ImportGenericServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/api/imports/ImportGenericServiceBean.java @@ -175,8 +175,6 @@ public DatasetDTO processOAIDCxml(String DcXmlToParse) throws XMLStreamException //while (xmlr.next() == XMLStreamConstants.COMMENT); // skip pre root comments xmlr.nextTag(); - xmlr.require(XMLStreamConstants.START_ELEMENT, null, OAI_DC_OPENING_TAG); - processXMLElement(xmlr, ":", OAI_DC_OPENING_TAG, dublinCoreMapping, datasetDTO); } catch (XMLStreamException ex) { throw new EJBException("ERROR occurred while parsing XML fragment (" + DcXmlToParse.substring(0, 64) + "...); ", ex); @@ -205,10 +203,24 @@ public DatasetDTO processOAIDCxml(String DcXmlToParse) throws XMLStreamException private void processXMLElement(XMLStreamReader xmlr, String currentPath, String openingTag, ForeignMetadataFormatMapping foreignFormatMapping, DatasetDTO datasetDTO) throws XMLStreamException { logger.fine("entering processXMLElement; ("+currentPath+")"); - - for (int event = xmlr.next(); event != XMLStreamConstants.END_DOCUMENT; event = xmlr.next()) { + + while (xmlr.hasNext()) { + + int event; + try { + event = xmlr.next(); + } catch (XMLStreamException ex) { + continue; // Skip Undeclared namespace prefix and Unexpected close tag related to 
com.ctc.wstx.exc.WstxParsingException + } + if (event == XMLStreamConstants.START_ELEMENT) { + String prefix = xmlr.getPrefix(); String currentElement = xmlr.getLocalName(); + + if (prefix != null && !prefix.equals(OAI_DC_OPENING_TAG)) { // Ignore non "dc:" prefix + logger.warning("Element " + prefix + ":" + currentElement + " is ignored"); + continue; + } ForeignMetadataFieldMapping mappingDefined = datasetfieldService.findFieldMapping(foreignFormatMapping.getName(), currentPath+currentElement); From 3925baff88d79a548a60e465fd8f4168ec63e82e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20ROUCOU?= Date: Thu, 12 Sep 2024 17:21:16 +0200 Subject: [PATCH 2/4] Remove unnecessary code --- .../dataverse/api/imports/ImportGenericServiceBean.java | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/api/imports/ImportGenericServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/api/imports/ImportGenericServiceBean.java index 778d5a4167c..661d99b3d52 100644 --- a/src/main/java/edu/harvard/iq/dataverse/api/imports/ImportGenericServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/api/imports/ImportGenericServiceBean.java @@ -175,6 +175,8 @@ public DatasetDTO processOAIDCxml(String DcXmlToParse) throws XMLStreamException //while (xmlr.next() == XMLStreamConstants.COMMENT); // skip pre root comments xmlr.nextTag(); + xmlr.require(XMLStreamConstants.START_ELEMENT, null, OAI_DC_OPENING_TAG); + processXMLElement(xmlr, ":", OAI_DC_OPENING_TAG, dublinCoreMapping, datasetDTO); } catch (XMLStreamException ex) { throw new EJBException("ERROR occurred while parsing XML fragment (" + DcXmlToParse.substring(0, 64) + "...); ", ex); @@ -210,18 +212,13 @@ private void processXMLElement(XMLStreamReader xmlr, String currentPath, String try { event = xmlr.next(); } catch (XMLStreamException ex) { + logger.warning("Error occurred in the OAI_DC XML parsing : " + ex.getMessage()); continue; // Skip Undeclared 
namespace prefix and Unexpected close tag related to com.ctc.wstx.exc.WstxParsingException } if (event == XMLStreamConstants.START_ELEMENT) { - String prefix = xmlr.getPrefix(); String currentElement = xmlr.getLocalName(); - if (prefix != null && !prefix.equals(OAI_DC_OPENING_TAG)) { // Ignore non "dc:" prefix - logger.warning("Element " + prefix + ":" + currentElement + " is ignored"); - continue; - } - ForeignMetadataFieldMapping mappingDefined = datasetfieldService.findFieldMapping(foreignFormatMapping.getName(), currentPath+currentElement); if (mappingDefined != null) { From 484e1d272188c8f46125ccf0a4b44cce3662a93d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20ROUCOU?= Date: Thu, 12 Sep 2024 17:21:16 +0200 Subject: [PATCH 3/4] Remove unnecessary code --- .../iq/dataverse/api/imports/ImportGenericServiceBean.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/api/imports/ImportGenericServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/api/imports/ImportGenericServiceBean.java index 661d99b3d52..41a57665010 100644 --- a/src/main/java/edu/harvard/iq/dataverse/api/imports/ImportGenericServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/api/imports/ImportGenericServiceBean.java @@ -212,13 +212,13 @@ private void processXMLElement(XMLStreamReader xmlr, String currentPath, String try { event = xmlr.next(); } catch (XMLStreamException ex) { - logger.warning("Error occurred in the OAI_DC XML parsing : " + ex.getMessage()); + logger.warning("Error occurred in the XML parsing : " + ex.getMessage()); continue; // Skip Undeclared namespace prefix and Unexpected close tag related to com.ctc.wstx.exc.WstxParsingException } if (event == XMLStreamConstants.START_ELEMENT) { String currentElement = xmlr.getLocalName(); - + ForeignMetadataFieldMapping mappingDefined = datasetfieldService.findFieldMapping(foreignFormatMapping.getName(), currentPath+currentElement); if (mappingDefined != 
null) { From 34cf77df9babadae038665fe87e6ccd9b6096308 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20ROUCOU?= Date: Fri, 13 Sep 2024 14:58:31 +0200 Subject: [PATCH 4/4] release note --- doc/release-notes/10837-exclude-others-ns-harvesting-oai-dc.md | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 doc/release-notes/10837-exclude-others-ns-harvesting-oai-dc.md diff --git a/doc/release-notes/10837-exclude-others-ns-harvesting-oai-dc.md b/doc/release-notes/10837-exclude-others-ns-harvesting-oai-dc.md new file mode 100644 index 00000000000..c1826bfaed5 --- /dev/null +++ b/doc/release-notes/10837-exclude-others-ns-harvesting-oai-dc.md @@ -0,0 +1,3 @@ +Some repositories extend the "oai_dc" metadata prefix with specific namespaces. In this case, harvesting of these datasets is not possible, as an XML parsing error is raised. + +The PR [#10837](https://github.com/IQSS/dataverse/pull/10837) allows the harvesting of these datasets by excluding tags with namespaces that are not "dc:", and harvesting only metadata with the "dc" namespace.