DS4SD · ceberam · Feb 17, 2025 · Feb 5, 2025 · Feb 5, 2025 · Feb 14, 2025
diff --git a/docling/backend/xml/jats_backend.py b/docling/backend/xml/jats_backend.py
diff --git a/docling/backend/xml/pubmed_backend.py b/docling/backend/xml/pubmed_backend.py
diff --git a/docling/datamodel/base_models.py b/docling/datamodel/base_models.py
@@ -34,14 +34,14 @@ class InputFormat(str, Enum):
     DOCX = "docx"
     PPTX = "pptx"
     HTML = "html"
-    XML_PUBMED = "xml_pubmed"
     IMAGE = "image"
     PDF = "pdf"
     ASCIIDOC = "asciidoc"
     MD = "md"
     CSV = "csv"
     XLSX = "xlsx"
     XML_USPTO = "xml_uspto"
+    XML_JATS = "xml_jats"
     JSON_DOCLING = "json_docling"
 
 
@@ -59,7 +59,7 @@ class OutputFormat(str, Enum):
     InputFormat.PDF: ["pdf"],
     InputFormat.MD: ["md"],
     InputFormat.HTML: ["html", "htm", "xhtml"],
-    InputFormat.XML_PUBMED: ["xml", "nxml"],
+    InputFormat.XML_JATS: ["xml", "nxml"],
     InputFormat.IMAGE: ["jpg", "jpeg", "png", "tif", "tiff", "bmp"],
     InputFormat.ASCIIDOC: ["adoc", "asciidoc", "asc"],
     InputFormat.CSV: ["csv"],
@@ -79,7 +79,7 @@ class OutputFormat(str, Enum):
         "application/vnd.openxmlformats-officedocument.presentationml.presentation",
     ],
     InputFormat.HTML: ["text/html", "application/xhtml+xml"],
-    InputFormat.XML_PUBMED: ["application/xml"],
+    InputFormat.XML_JATS: ["application/xml"],
     InputFormat.IMAGE: [
         "image/png",
         "image/jpeg",

diff --git a/docling/datamodel/document.py b/docling/datamodel/document.py
@@ -333,11 +333,11 @@ def _guess_from_content(
                 ):
                     input_format = InputFormat.XML_USPTO
 
-                if (
-                    InputFormat.XML_PUBMED in formats
-                    and "/NLM//DTD JATS" in xml_doctype
+                if InputFormat.XML_JATS in formats and (
+                    "JATS-journalpublishing" in xml_doctype
+                    or "JATS-archive" in xml_doctype
                 ):
-                    input_format = InputFormat.XML_PUBMED
+                    input_format = InputFormat.XML_JATS
 
         elif mime == "text/plain":
             if InputFormat.XML_USPTO in formats and content_str.startswith("PATN\r\n"):

diff --git a/docling/document_converter.py b/docling/document_converter.py
@@ -18,7 +18,7 @@
 from docling.backend.msexcel_backend import MsExcelDocumentBackend
 from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
 from docling.backend.msword_backend import MsWordDocumentBackend
-from docling.backend.xml.pubmed_backend import PubMedDocumentBackend
+from docling.backend.xml.jats_backend import JatsDocumentBackend
 from docling.backend.xml.uspto_backend import PatentUsptoDocumentBackend
 from docling.datamodel.base_models import (
     ConversionStatus,
@@ -102,9 +102,9 @@ class PatentUsptoFormatOption(FormatOption):
     backend: Type[PatentUsptoDocumentBackend] = PatentUsptoDocumentBackend
 
 
-class XMLPubMedFormatOption(FormatOption):
+class XMLJatsFormatOption(FormatOption):
     pipeline_cls: Type = SimplePipeline
-    backend: Type[AbstractDocumentBackend] = PubMedDocumentBackend
+    backend: Type[AbstractDocumentBackend] = JatsDocumentBackend
 
 
 class ImageFormatOption(FormatOption):
@@ -143,8 +143,8 @@ def _get_default_option(format: InputFormat) -> FormatOption:
         InputFormat.XML_USPTO: FormatOption(
             pipeline_cls=SimplePipeline, backend=PatentUsptoDocumentBackend
         ),
-        InputFormat.XML_PUBMED: FormatOption(
-            pipeline_cls=SimplePipeline, backend=PubMedDocumentBackend
+        InputFormat.XML_JATS: FormatOption(
+            pipeline_cls=SimplePipeline, backend=JatsDocumentBackend
         ),
         InputFormat.IMAGE: FormatOption(
             pipeline_cls=StandardPdfPipeline, backend=DoclingParseV2DocumentBackend