openapi.json

{
  "openapi": "3.1.0",
  "info": {
    "title": "Unstructured Pipeline API",
    "version": "0.0.1",
    "summary": "Partition documents with the Unstructured library"
  },
  "servers": [
    {
      "url": "https://api.unstructured.io",
      "description": "Hosted API",
      "x-speakeasy-server-id": "prod"
    },
    {
      "url": "http://localhost:8000",
      "description": "Development server",
      "x-speakeasy-server-id": "local"
    }
  ],
  "x-speakeasy-retries": {
    "strategy": "backoff",
    "backoff": {
      "initialInterval": 500,
      "maxInterval": 60000,
      "maxElapsedTime": 900000,
      "exponent": 1.5
    },
    "statusCodes": ["5xx"],
    "retryConnectionErrors": true
  },
  "security": [{ "ApiKeyAuth": [] }],
  "tags": [{ "name": "general" }],
  "paths": {
    "/general/v0/general": {
      "post": {
        "tags": ["general"],
        "summary": "Pipeline 1",
        "operationId": "partition",
        "x-speakeasy-name-override": "partition",
        "requestBody": {
          "content": {
            "multipart/form-data": {
              "schema": { "$ref": "#/components/schemas/partition_parameters" }
            }
          }
        },
        "responses": {
          "200": {
            "description": "Successful Response",
            "content": {
              "application/json": {
                "schema": { "$ref": "#/components/schemas/Elements" }
              }
            }
          },
          "422": {
            "description": "Validation Error",
            "content": {
              "application/json": {
                "schema": { "$ref": "#/components/schemas/HTTPValidationError" }
              }
            }
          }
        },
        "x-codeSamples": [
          {
            "lang": "python",
            "label": "partition",
            "source": "import unstructured_client\nfrom unstructured_client.models import shared\n\ns = unstructured_client.UnstructuredClient(\n    api_key_auth=\"YOUR_API_KEY\",\n)\n\nreq = shared.PartitionParameters(\n    chunking_strategy='by_title',\n    combine_under_n_chars=500,\n    encoding='utf-8',\n    extract_image_block_types=[\n        'image',\n        'table',\n    ],\n    gz_uncompressed_content_type='application/pdf',\n    hi_res_model_name='yolox',\n    languages=[\n        '[',\n        'e',\n        'n',\n        'g',\n        ']',\n    ],\n    max_characters=1500,\n    new_after_n_chars=1500,\n    output_format='application/json',\n    overlap=25,\n    strategy='hi_res',\n)\n\nres = s.partition(req)\n\nif res.elements is not None:\n    # handle response\n    pass"
          },
          {
            "lang": "typescript",
            "label": "partition",
            "source": "import { openAsBlob } from \"node:fs\";\nimport { UnstructuredClient } from \"unstructured-client\";\n\nasync function run() {\n  const sdk = new UnstructuredClient({\n    security: {\n      apiKeyAuth: \"YOUR_API_KEY\",\n    },\n  });\n\n  const result = await sdk.partition({\n    chunkingStrategy: \"by_title\",\n    combineUnderNChars: 500,\n    encoding: \"utf-8\",\n    extractImageBlockTypes: [\n      \"image\",\n      \"table\",\n    ],\n    files: await openAsBlob(\"./sample-file\"),\n    gzUncompressedContentType: \"application/pdf\",\n    hiResModelName: \"yolox\",\n    languages: [\n      \"[\",\n      \"e\",\n      \"n\",\n      \"g\",\n      \"]\",\n    ],\n    maxCharacters: 1500,\n    newAfterNChars: 1500,\n    outputFormat: \"application/json\",\n    overlap: 25,\n    skipInferTableTypes: [\n      \"pdf\",\n    ],\n    strategy: \"hi_res\",\n  });\n\n  // Handle the result\n  console.log(result)\n}\n\nrun();"
          }
        ]
      }
    }
  },
  "components": {
    "securitySchemes": {
      "ApiKeyAuth": {
        "type": "apiKey",
        "name": "unstructured-api-key",
        "in": "header",
        "x-speakeasy-example": "YOUR_API_KEY"
      }
    },
    "schemas": {
      "Elements": {
        "type": "array",
        "items": {
          "Element": {
            "type": "object",
            "properties": {
              "type": {},
              "element_id": {},
              "metadata": {},
              "text": {}
            }
          }
        }
      },
      "partition_parameters": {
        "properties": {
          "files": {
            "type": "string",
            "format": "binary",
            "description": "The file to extract",
            "required": "true",
            "example": {
              "summary": "File to be partitioned",
              "externalValue": "https://github.com/Unstructured-IO/unstructured-ingest/blob/main/example-docs/pdf/layout-parser-paper.pdf"
            }
          },
          "strategy": {
            "type": "string",
            "title": "Strategy",
            "description": "The strategy to use for partitioning PDF/image. Options are fast, hi_res, auto. Default: auto",
            "example": "hi_res"
          },
          "gz_uncompressed_content_type": {
            "type": "string",
            "title": "Uncompressed Content Type",
            "description": "If file is gzipped, use this content type after unzipping",
            "example": "application/pdf"
          },
          "output_format": {
            "type": "string",
            "title": "Output Format",
            "description": "The format of the response. Supported formats are application/json and text/csv. Default: application/json.",
            "example": "application/json"
          },
          "coordinates": {
            "type": "boolean",
            "title": "Coordinates",
            "description": "If true, return coordinates for each element. Default: false"
          },
          "encoding": {
            "type": "string",
            "title": "Encoding",
            "description": "The encoding method used to decode the text input. Default: utf-8",
            "example": "utf-8"
          },
          "hi_res_model_name": {
            "type": "string",
            "title": "Hi Res Model Name",
            "description": "The name of the inference model used when strategy is hi_res",
            "example": "yolox"
          },
          "include_page_breaks": {
            "type": "boolean",
            "title": "Include Page Breaks",
            "description": "If True, the output will include page breaks if the filetype supports it. Default: false"
          },
          "languages": {
            "items": { "type": "string", "example": "eng" },
            "type": "array",
            "title": "OCR Languages",
            "default": [],
            "description": "The languages present in the document, for use in partitioning and/or OCR",
            "example": "[eng]"
          },
          "pdf_infer_table_structure": {
            "type": "boolean",
            "title": "Pdf Infer Table Structure",
            "description": "If True and strategy=hi_res, any Table Elements extracted from a PDF will include an additional metadata field, 'text_as_html', where the value (string) is a just a transformation of the data into an HTML <table>."
          },
          "skip_infer_table_types": {
            "items": { "type": "string", "example": "pdf" },
            "type": "array",
            "title": "Skip Infer Table Types",
            "description": "The document types that you want to skip table extraction with. Default: ['pdf', 'jpg', 'png']"
          },
          "xml_keep_tags": {
            "type": "boolean",
            "title": "Xml Keep Tags",
            "description": "If True, will retain the XML tags in the output. Otherwise it will simply extract the text from within the tags. Only applies to partition_xml."
          },
          "chunking_strategy": {
            "type": "string",
            "title": "Chunking Strategy",
            "description": "Use one of the supported strategies to chunk the returned elements. Currently supports: by_title",
            "example": "by_title"
          },
          "combine_under_n_chars": {
            "type": "integer",
            "title": "Combine Under N Chars",
            "description": "If chunking strategy is set, combine elements until a section reaches a length of n chars. Default: max_characters",
            "example": 500
          },
          "include_orig_elements": {
            "type": "boolean",
            "title": "Original-elements flag",
            "description": "When True (the default), the elements used to form a chunk appear in `.metadata.orig_elements` for that chunk. Only applies when chunking is specified using the `chunking_strategy` argument."
          },
          "max_characters": {
            "type": "integer",
            "title": "Max Characters",
            "description": "If chunking strategy is set, cut off new sections after reaching a length of n chars (hard max). Default: 500",
            "example": 1500
          },
          "multipage_sections": {
            "type": "boolean",
            "title": "Multipage Sections",
            "description": "If chunking strategy is set, determines if sections can span multiple pages. Only applies to by_title chunking strategy.Default: true"
          },
          "new_after_n_chars": {
            "type": "integer",
            "title": "New after n chars",
            "description": "If chunking strategy is set, cut off new sections after reaching a length of n chars (soft max). Default: max_characters (off)",
            "example": 1500
          },
          "overlap": {
            "type": "integer",
            "title": "Intra-chunk overlap",
            "description": "A prefix of this many trailing characters from the prior text-split chunk is applied to second and later chunks formed from oversized elements by text-splitting. Default: None",
            "example": 25
          },
          "overlap_all": {
            "type": "boolean",
            "title": "Inter-chunk overlap",
            "description": "When True, overlap is also applied to 'normal' chunks formed by combining whole elements. Use with caution as this can introduce noise into otherwise clean semantic units. Default: None"
          },
          "extract_image_block_types": {
            "items": { "type": "string", "example": "image" },
            "type": "array",
            "title": "Image block types to extract",
            "default": [],
            "description": "The types of elements to extract, for use in extracting image blocks as base64 encoded data stored in metadata fields",
            "example": ["image", "table"]
          }
        },
        "type": "object",
        "title": "Partition Parameters"
      },
      "HTTPValidationError": {
        "properties": {
          "detail": {
            "items": { "$ref": "#/components/schemas/ValidationError" },
            "type": "array",
            "title": "Detail"
          }
        },
        "type": "object",
        "title": "HTTPValidationError"
      },
      "ValidationError": {
        "properties": {
          "loc": {
            "items": { "oneOf": [{ "type": "string" }, { "type": "integer" }] },
            "type": "array",
            "title": "Location"
          },
          "msg": { "type": "string", "title": "Message" },
          "type": { "type": "string", "title": "Error Type" }
        },
        "type": "object",
        "required": ["loc", "msg", "type"],
        "title": "ValidationError"
      }
    }
  }
}