diff --git a/assets/request-processing.jpg b/assets/request-processing.jpg
new file mode 100644
index 0000000..8104882
Binary files /dev/null and b/assets/request-processing.jpg differ
diff --git a/assets/web-api-distributed-queue.jpg b/assets/web-api-distributed-queue.jpg
index 1d3a868..5073960 100644
Binary files a/assets/web-api-distributed-queue.jpg and b/assets/web-api-distributed-queue.jpg differ
diff --git a/assets/web-api-distributed.jpg b/assets/web-api-distributed.jpg
new file mode 100644
index 0000000..dca3e37
Binary files /dev/null and b/assets/web-api-distributed.jpg differ
diff --git a/openapi.yml b/openapi.yml
index b187b64..ef0152e 100644
--- a/openapi.yml
+++ b/openapi.yml
@@ -24,16 +24,12 @@ info:
     Defined in [https://ocr-d.de/en/spec/ocrd_zip](https://ocr-d.de/en/spec/ocrd_zip)
 
-    ### `text/vnd.ocrd.workflow`
-
-    Workflow format, currently (April 2022) still to be determined.
-
   contact:
     email: info@ocr-d.de
   license:
     name: Apache 2.0
     url: 'https://www.apache.org/licenses/LICENSE-2.0.html'
-  version: 0.0.1
+  version: 0.1.0
 externalDocs:
   description: OCR-D Website
   url: 'https://ocr-d.de'
@@ -62,11 +58,16 @@ paths:
       operationId: listProcessors
       responses:
         '200':
-          description: A list of all processors
+          description: A list of the names of all available processors
          content:
-            application/json: {schema: {$ref: '#/components/schemas/ProcessorList'}}
+            application/json:
+              schema:
+                type: array
+                items:
+                  $ref: '#/components/schemas/OcrdExecutable'
 
-  '/processor/{executable}':
+  '/processor/info/{executable}':
     get:
       tags: ['processing', 'discovery']
       operationId: getProcessor
@@ -74,139 +75,157 @@ paths:
         - name: executable
           in: path
           description: Name of the executable
-          schema: {$ref: '#/components/schemas/OcrdExecutable'}
+          schema:
+            $ref: '#/components/schemas/OcrdExecutable'
           required: true
       responses:
         '200':
           description: Get this processor
           content:
-            application/json: {schema: {$ref: '#/components/schemas/Processor'}}
+            application/json:
+              schema:
+                type: object
+                description: The ocrd-tool.json of the processor
         '404':
-          description: 'Processor not available'
+          description: Processor not available
+
+  '/processor/run/{executable}':
     post:
       tags: ['processing']
+      description: Run a processor
       operationId: runProcessor
       parameters:
        - name: executable
          in: path
          description: Name of the executable
-          schema: {$ref: '#/components/schemas/OcrdExecutable'}
+          schema:
+            $ref: '#/components/schemas/OcrdExecutable'
          required: true
      requestBody:
-        description: Execute this ProcessorCall
        content:
-          application/json: {schema: {$ref: '#/components/schemas/ProcessorArgs'}}
+          application/json:
+            schema:
+              $ref: '#/components/schemas/ProcessorArgs'
        required: true
      responses:
        '200':
-          description: Return the ProcessorJob running this ProcessorCall
-          content: {application/json: {schema: {$ref: '#/components/schemas/ProcessorJob'}}}
+          description: Return the ProcessorJob
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/ProcessorJob'
 
-  '/processor/{executable}/{job-id}':
+  '/processor/job/{job-id}':
     get:
       tags: ['processing']
+      description: Get information about a ProcessorJob
       operationId: getProcessorJob
       parameters:
-        - name: executable
-          in: path
-          description: Name of the executable
-          schema: {$ref: '#/components/schemas/OcrdExecutable'}
-          required: true
         - name: job-id
           in: path
           description: ID of the ProcessorJob
-          schema: {type: string}
+          schema:
+            type: string
           required: true
       responses:
         '200':
           description: Return ProcessorJob
-          content: {application/json: {schema: {$ref: '#/components/schemas/ProcessorJob'}}}
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/ProcessorJob'
         '404':
-          description: 'ProcessorJob not found'
+          description: ProcessorJob not found
 
-  '/processor/{executable}/{job-id}/log':
+  '/processor/log/{job-id}':
     get:
       tags: ['processing']
+      description: Get logs of a Processor
       operationId: getProcessorJobLog
       parameters:
-        - name: executable
-          in: path
-          description: Name of the executable
-          schema: {$ref: '#/components/schemas/OcrdExecutable'}
-          required: true
         - name: job-id
           in: path
           description: ID of the ProcessorJob
-          schema: {type: string}
+          schema:
+            type: string
           required: true
       responses:
         '200':
           description: Return log
           content:
-            'text/plain': {}
-        '404':
-          description: 'ProcessorJobLog not found'
-    post:
-      tags: ['processing']
-      operationId: postProcessorJobLogEntry
-      parameters:
-        - name: executable
-          in: path
-          description: LogEntry to be logged
-          schema: {$ref: '#/components/schemas/LogEntry'}
-          required: true
-        - name: job-id
-          in: path
-          description: ID of the ProcessorJob
-          schema: {type: string}
-          required: true
-      responses:
-        '200':
-          description: Return Log
-          content:
-            'text/plain': {}
+            'text/plain':
+              schema:
+                type: string
         '404':
-          description: 'ProcessorJob not found'
+          description: Logs not found
 
   '/workflow':
     post:
-      tags: ['workflow', 'discovery']
+      tags: ['workflow']
+      description: Upload/Register a new workflow file
       operationId: postWorkflow
       requestBody:
-        description: 'Register a new workflow'
-        content:
-          'text/vnd.ocrd.workflow': {}
         required: true
+        content:
+          multipart/form-data:
+            schema:
+              type: object
+              properties:
+                workflow:
+                  type: string
+                  format: binary
+              required:
+                - workflow
       responses:
         '200':
           description: Created a new OCR-D workflow
-          content: {application/json: {schema: {$ref: '#/components/schemas/Workflow'}}}
-        '400':
+          content:
+            application/json:
+              schema:
+                type: object
+                properties:
+                  workflow_id:
+                    type: string
+                    description: ID of the workflow created
+        '422':
           description: Invalid workflow
 
   '/workflow/{workflow-id}':
     put:
       tags: ['workflow']
+      description: Update/Replace a workflow file
       operationId: putWorkflow
       parameters:
         - name: workflow-id
           in: path
           description: ID of the Workflow
-          schema: {type: string}
+          schema:
+            type: string
           required: true
       requestBody:
-        description: 'Replace existing or create new workflow'
-        content:
-          'text/vnd.ocrd.workflow': {}
         required: true
+        content:
+          multipart/form-data:
+            schema:
+              type: object
+              properties:
+                workflow:
+                  type: string
+                  format: binary
       responses:
         '200':
-          description: Created/updated a new OCR-D workflow
-          content: {application/json: {schema: {$ref: '#/components/schemas/Workflow'}}}
-        '400':
+          description: Successful Response
+          content:
+            application/json:
+              schema:
+                type: string
+                description: ID of the workflow script
+        '404':
+          description: Workflow to update does not exist
+        '422':
           description: Invalid workflow
     get:
       tags: ['workflow', 'discovery']
+      description: Download a workflow file
       operationId: getWorkflow
       parameters:
         - name: workflow-id
@@ -218,54 +237,105 @@
       responses:
         '200':
           description: Return Workflow
           content:
-            application/json: {schema: {$ref: '#/components/schemas/Workflow'}}
-            'text/vnd.ocrd.workflow': {}
+            text/plain:
+              schema:
+                type: string
+                format: binary
         '404':
-          description: 'Workflow not available'
+          description: Workflow not available
+
+  '/workflow/run':
     post:
-      tags: ['workflow']
+      tags: ['workflow', 'processing']
+      description: Run a workflow
       operationId: runWorkflow
       parameters:
-        - name: workflow-id
-          in: path
-          description: ID of the Workflow
-          schema: {type: string}
+        - name: mets_path
+          in: query
+          description: Path to the mets file
+          schema:
+            type: string
           required: true
+        - name: agent_type
+          in: query
+          description: 'Type of agent to run the processor: worker or server'
+          schema:
+            type: string
+            default: worker
+          required: false
+        - name: page_id
+          in: query
+          description: Page ID or page range to process
+          schema:
+            type: string
+          required: false
+        - name: page_wise
+          in: query
+          required: false
+          schema:
+            type: boolean
+            default: false
+          description: Split requests into pages to process in parallel
+        - name: workflow_callback_url
+          in: query
+          description: URL to receive a callback after the workflow finishes
+          schema:
+            type: string
+          required: false
+        - name: workflow_id
+          in: query
+          required: false
+          description: Use a previously uploaded workflow instead of the workflow in the requestBody
+          schema:
+            type: string
       requestBody:
-        description: Execute this Workflow
+        description: Text file in `ocrd process` syntax
         content:
-          application/json: {schema: {$ref: '#/components/schemas/WorkflowArgs'}}
-        required: true
+          multipart/form-data:
+            schema:
+              properties:
+                workflow:
+                  type: string
+                  format: binary
+                  example: |
+                    cis-ocropy-binarize -I DEFAULT -O BIN
+                    anybaseocr-crop -I BIN -O CROP
+                    cis-ocropy-denoise -I CROP -O DENOISE -P dpi 300 -P level-of-operation page
+        required: false
       responses:
         '200':
-          description: Return WorkflowJob
+          description: Successful Response
           content:
-            application/json: {schema: {$ref: '#/components/schemas/WorkflowJob'}}
+            application/json:
+              schema:
+                $ref: '#/components/schemas/WorkflowJob'
+        '404':
+          description: Workspace not found
+        '422':
+          description: Input validation error
 
-  '/workflow/{workflow-id}/{job-id}':
+  '/workflow/job/{job-id}':
     get:
       tags: ['workflow']
+      description: Get information about a workflow job
       operationId: getWorkflowJob
       parameters:
-        - name: workflow-id
-          in: path
-          description: ID of the Workflow
-          schema: {type: string}
-          required: true
         - name: job-id
           in: path
-          description: ID of the ProcessorJob
-          schema: {type: string}
           required: true
+          description: ID of the WorkflowJob
+          schema:
+            type: string
       responses:
         '200':
-          description: Return WorkflowJob
+          description: Return the status of the workflow job
           content:
-            application/json: {schema: {$ref: '#/components/schemas/WorkflowJob'}}
-        '400':
-          description: Workflow failed
+            application/json:
+              schema:
+                type: object
+                description: Processor jobs of the workflow with their status
         '404':
-          description: WorkflowJob not found
+          description: WorkflowJob with the given job-id not found
 
   '/workspace':
     get:
@@ -296,7 +366,9 @@
         '201':
           description: Created Workspace
           content:
-            application/json: {schema: {$ref: '#/components/schemas/Workspace'}}
+            application/json:
+              schema:
+                $ref: '#/components/schemas/Workspace'
         '400':
           description: Invalid workspace
 
@@ -309,7 +381,8 @@
         - name: workspace-id
           in: path
           description: ID of the workspace
-          schema: {type: string}
+          schema:
+            type: string
           required: true
       requestBody:
         description: OCRD-ZIP of the updated workspace
@@ -323,7 +396,9 @@
         '200':
           description: Workspace replaced or created
           content:
-            application/json: {schema: {$ref: '#/components/schemas/Workspace'}}
+            application/json:
+              schema:
+                $ref: '#/components/schemas/Workspace'
         '400':
           description: Workspace invalid
     get:
@@ -339,7 +414,9 @@
         '200':
           description: Return workspace or OCRD-ZIP
           content:
-            application/json: {schema: {$ref: '#/components/schemas/Workspace'}}
+            application/json:
+              schema:
+                $ref: '#/components/schemas/Workspace'
             application/vnd.ocrd+zip: {}
         '404':
           description: Workspace not found
@@ -354,13 +431,16 @@
         - name: workspace-id
           in: path
           description: ID of the workspace
-          schema: {type: string}
+          schema:
+            type: string
           required: true
       responses:
         '200':
           description: Workspace deleted
           content:
-            application/json: {schema: {$ref: '#/components/schemas/Workspace'}}
+            application/json:
+              schema:
+                $ref: '#/components/schemas/Workspace'
         '404':
           description: Workspace not found
           content: {}
@@ -376,7 +456,9 @@
         '200':
           description: Return DiscoveryResponse
           content:
-            application/json: {schema: {$ref: '#/components/schemas/DiscoveryResponse'}}
+            application/json:
+              schema:
+                $ref: '#/components/schemas/DiscoveryResponse'
 
 components:
   schemas:
@@ -392,81 +474,144 @@
       description: Description of the thing
     JobState:
       type: string
-      pattern: '^(QUEUED|RUNNING|STOPPED)'
-    Log:
-      allOf:
-        - {$ref: '#/components/schemas/Resource'}
+      pattern: '^(CACHED|CANCELLED|QUEUED|RUNNING|SUCCESS|FAILED)'
     Workspace:
       allOf:
-        - {$ref: '#/components/schemas/Resource'}
-    Job:
-      allOf:
-        - {$ref: '#/components/schemas/Resource'}
-        - type: object
-          properties:
-            state:
-              $ref: '#/components/schemas/JobState'
+        - $ref: '#/components/schemas/Resource'
     OcrdExecutable:
       type: string
       pattern: '^ocrd-.*'
-    Processor:
-      description: The ocrd-tool.json for a specific tool
-      x-$ref: 'https://ocr-d.de/ocrd_tool.schema.json#/properties/tools/patternProperties/ocrd-.*'
-    ProcessorList:
-      description: List all available processors
-      type: array
-      items:
-        type: array
-        items:
-          $ref: '#/components/schemas/Processor'
     ProcessorArgs:
-      description: The CLI arguments passed to an OCR-D processor
-      type: object
       properties:
-        workspace: {$ref: '#/components/schemas/Workspace'}
-        input_file_grps: {type: string}
-        output_file_grps: {type: string}
-        page_id: {type: string}
+        processor_name:
+          type: string
+          description: Processor Name
+        path_to_mets:
+          type: string
+          description: Path to the mets file of the workspace to process, if available
+        workspace_id:
+          type: string
+          description: ID of the workspace to process, if available
+        description:
+          type: string
+          description: Description of the execution
+        input_file_grps:
+          items:
+            type: string
+          type: array
+          description: Input file groups
+        output_file_grps:
+          items:
+            type: string
+          type: array
+          description: Output file groups
+        page_id:
+          type: string
+          description: Page ID
         parameters:
           type: object
+          description: Parameters for the processor execution
           default: {}
-    ProcessorCall:
-      allOf:
-        - {$ref: '#/components/schemas/ProcessorArgs'}
-        - type: object
-          description: Full representation of a CLI call of a processor
-          required: ['executable']
-          properties:
-            executable: {$ref: '#/components/schemas/OcrdExecutable'}
+        result_queue_name:
+          type: string
+          description: Result Queue Name
+        callback_url:
+          type: string
+          description: URL to send the response to after finished execution
+        agent_type:
+          type: string
+          description: Agent Type, worker or server
+          default: worker
+        depends_on:
+          items:
+            type: string
+          type: array
+          description: List of job_ids to wait for before starting execution
+      type: object
+      required:
+        - input_file_grps
+      description: Wraps the parameters required to make a run-processor request
+      example:
+        path_to_mets: /path/to/mets.xml
+        description: The description of this execution
+        input_file_grps:
+          - INPUT_FILE_GROUP
+        output_file_grps:
+          - OUTPUT_FILE_GROUP
+        page_id: PAGE_ID
        parameters: {}
     ProcessorJob:
-      allOf:
-        - {$ref: '#/components/schemas/Job'}
-        - type: object
-          properties:
-            processor: {$ref: '#/components/schemas/Processor'}
-            workspace: {$ref: '#/components/schemas/Workspace'}
+      description: Wraps output information for a job response
+      properties:
+        job_id:
+          type: string
+          description: Job ID
+        processor_name:
+          type: string
+          description: Processor Name
+        state:
+          $ref: '#/components/schemas/JobState'
+        path_to_mets:
+          type: string
+          description: Path to the mets file of the workspace to process, if available
+        workspace_id:
+          type: string
+          description: ID of the workspace to process, if available
+        input_file_grps:
+          items:
+            type: string
+          type: array
+          description: Input File Grps
+        output_file_grps:
+          items:
+            type: string
+          type: array
+          description: Output File Grps
+        page_id:
+          type: string
+          title: Page Id
+        log_file_path:
+          type: string
+          description: Path to the logfile for this job
+      type: object
+      required:
+        - job_id
+        - processor_name
+        - state
+        - input_file_grps
     Workflow:
       allOf:
         - {$ref: '#/components/schemas/Resource'}
-    WorkflowArgs:
-      description: The arguments needed to run the Workflow
-      type: object
-      required: ['workspace_id']
+    WorkflowJob:
       properties:
-        workspace_id: {type: string}
-        workflow_parameters:
-          description: >-
-            Possibility to add additional parameters for the workflow. The parameters that can/must
-            be specified here depend on the respective implementation of the workflow.
+        job_id:
+          type: string
+          title: Job Id
+        page_id:
+          type: string
+          title: Page Id
+        page_wise:
+          type: boolean
+          title: Page Wise
+          default: false
+        processing_job_ids:
           type: object
-          default: {}
-    WorkflowJob:
-      allOf:
-        - {$ref: '#/components/schemas/Job'}
-        - type: object
-          properties:
-            workflow: {$ref: '#/components/schemas/Workflow'}
-            workspace: {$ref: '#/components/schemas/Workspace'}
+          title: Processing Job Ids
+        path_to_mets:
+          type: string
+          title: Path To Mets
+        workspace_id:
+          type: string
+          title: Workspace Id
+        description:
+          type: string
+          title: Description
+      type: object
+      required:
+        - job_id
+        - page_id
+        - processing_job_ids
+      description: Wraps output information for a workflow job response
     DiscoveryResponse:
       type: object
       properties:
@@ -490,21 +635,3 @@
       ocrd_all_version:
         description: Git tag of the ocrd_all version implemented
         type: string
-    LogEntry:
-      type: object
-      required:
-        - level
-        - message
-      properties:
-        level:
-          description: Log level
-          type: string
-          enum: ['debug', 'info', 'warning', 'error']
-        message:
-          description: Log message
-          type: string
-        time:
-          description: Log UTC time
-          type: string
-          format: date-time
-
diff --git a/web_api.md b/web_api.md
index 829542b..52d5d74 100644
--- a/web_api.md
+++ b/web_api.md
@@ -1,18 +1,29 @@
-# Web API
+# OCR-D Network
 
-## Terminology
+## 1. Why do we need OCR-D Network?
+
+After having processors running locally via the [CLI](https://ocr-d.de/en/spec/cli), communication over the network is
+the natural extension. The [OCR-D Network](https://github.com/OCR-D/core/tree/master/ocrd_network/ocrd_network) package,
+which is implemented as part of [OCR-D/core](https://github.com/OCR-D/core), allows users to set up OCR-D in a
+distributed environment. This setup greatly improves the flexibility, scalability and reliability of OCR-D.
+
+## 2. Terminology
 
 * **Processing Worker**: a Processing Worker is an [OCR-D Processor](https://ocr-d.de/en/spec/glossary#ocr-d-processor)
   running as a worker, i.e. listening to the Process Queue, pulling new jobs when available, processing them, and
   pushing the updated job statuses back to the queue if necessary.
+* **Processor Server**: a Processor Server is an [OCR-D Processor](https://ocr-d.de/en/spec/glossary#ocr-d-processor)
+  running as a server over HTTP. It accepts requests, executes the processor with the parameters provided in the
+  requests, and returns responses.
 * **Workflow Server**: a Workflow Server is a server which exposes REST endpoints in the `Workflow` section of
-  the [Web API specification](openapi.yml). In particular, for each `POST /workflow/{workflow-id}` request, the
-  corresponding Nextflow script is executed. The script comprises a chain of call to the `POST /processor/{executable}`
-  endpoint in an appropriate order.
+  the [Web API specification](openapi.yml). In particular, a workflow can be executed with a `POST /workflow/run`
+  request. The Workflow Server issues a chain of calls to the `POST /processor/run/{executable}` endpoint in an
+  appropriate order.
 * **Processing Server**: a Processing Server is a server which exposes REST endpoints in the `Processing` section of
-  the [Web API specification](openapi.yml). In particular, for each `POST /processor/{executable}` request,
-  a Processing Message is added to the respective Job Queue.
-* **Process Queue**: a Process Queue is a queueing system for workflow jobs (i.e. single processor runs on one
+  the [Web API specification](openapi.yml). In particular, for each `POST /processor/run/{executable}` request, either
+  a processing message is added to the respective Job Queue or the request is delegated to the respective Processor
+  Server.
+* **Process Queue**: a Process Queue is a queuing system for workflow jobs (i.e. single processor runs on one
   workspace) to be executed by Processing Workers and to be enqueued by the Workflow Server via the Processing Server.
   In our implementation, it's [RabbitMQ](https://www.rabbitmq.com/).
 * **Job queue**: one or many queues in the Process Queue, which contain processing messages. Processing Workers listen
@@ -24,22 +35,19 @@
   Processing Worker to process data and perform actions after the processing has finished. These actions include
   `POST`ing the result message to the provided callback URL, or publishing the result message to the result queue. The
   schema of processing messages can be found [here](web_api/processing-message.schema.yml).
-* **Result message**: a message published to the result queue. This message contains information about a job (ID,
-  status, etc.). Depending on the configuration in the processing message, a result message can be `POST`ed to the
-  callback URL, published to the result queue, or both. The schema for result messages can be
+* **Result message**: a message sent by a Processing Worker when it has finished processing. This message contains
+  information about a job (ID, status, etc.). Depending on the configuration in the processing message, a result message
+  can be `POST`ed to the callback URL, published to the result queue, or both. The schema for result messages can be
   found [here](web_api/result-message.schema.yml).
+* **METS Server**: a METS Server makes a workspace accessible over HTTP or a Unix file socket. Thanks to this server,
+  all operations on a METS file can be executed asynchronously.
 
-## Why do we need a Web API?
-
-After having processors running locally via the [CLI](https://ocr-d.de/en/spec/cli), communication over network is the
-natural extension. This feature will improve the flexibility, scalability and reliability of the system. This
-specification presents ideas behind the endpoints, how to use them, and technical details happening in the background.
+## 3. The Specification
 
-## The Specification
-
-The Web API specification can be found [here](openapi.yml). It follows
-the [OpenAPI specification](https://swagger.io/specification/). There are 4 parts to be implemented: discovery,
-processing, workflow, and workspace.
+When OCR-D runs over the network, it exposes endpoints for users to interact with. Those endpoints are
+described [here](openapi.yml), following the [OpenAPI specification](https://swagger.io/specification/). Most endpoints
+are already implemented in [OCR-D/core](https://github.com/OCR-D/core); the rest can be implemented by the organization
+which uses them. There are 4 parts in the specification: discovery, processing, workflow, and workspace.
 
 **Discovery**: The service endpoints in this section provide information about the server. They include, but are not
 limited to, hardware configuration, installed processors, and information about each processor.
@@ -48,80 +56,51 @@ limited to, hardware configuration, installed processors, and information about each processor.
 specific [processor](https://ocr-d.de/en/spec/glossary#ocr-d-processor), trigger a processor run, and check the
 status of a running processor. By exposing these endpoints, the server can encapsulate the detailed setup of the
 system and offer users a single entry to the processors. The implementation of this section is provided
-by [OCR-D/core](https://github.com/OCR-D/core). Implementors do not need to implement it themselves, they can reuse
-and/or extend the reference implementation from OCR-D/core.
+by [OCR-D/core](https://github.com/OCR-D/core).
 
 **Workflow**: Beyond single processors, one can manage
 entire [workflows](https://ocr-d.de/en/spec/glossary#ocr-d-workflow), i.e. a series of connected processor
-instances. In this spec, a workflow amounts to a [Nextflow](https://www.nextflow.io/) script. Some information
-about Nextflow and how to use it in OCR-D is documented [in the Nextflow spec](nextflow).
+instances. A workflow is a text file that describes the OCR-D workflow using the `ocrd process` syntax.
 
 **Workspace**: The service endpoints in this section concern data management, which in OCR-D is handled
 via [workspaces](https://ocr-d.de/en/spec/glossary#workspace). (A workspace is the combination of
 a [METS](https://ocr-d.de/en/spec/mets) file and any number of referenced files already downloaded, i.e. with locations
 relative to the METS file path.) Processing (via single processors or workflows) always refers to existing workspaces,
-i.e. workspaces residing in the server's filesystem.
-
-## Usage
+i.e. workspaces residing in the server's file system.
 
-When a system implements the Web API completely, it can be used as follows:
-
-1. Retrieve information about the system via endpoints in the `Discovery` section.
-2. Create a workspace (from an [OCRD-ZIP](https://ocr-d.de/en/spec/ocrd_zip) or METS URL) via the `POST /workspace`
-   endpoint and get back a workspace ID.
-3. Create a workflow by uploading a Nextflow script to the system via the `POST /workflow` endpoint and get back a
-   workflow ID.
-4. One can either:
-   * Trigger a single processor on a workspace by calling the `POST /processor/{executable}` endpoint with the chosen
-     processor name, workspace ID and parameters, or
-   * Start a workflow on a workspace by calling the `POST /workflow/{workflow-id}` endpoint with the chosen workflow ID
-     and workspace ID.
-   * In both cases, a job ID is returned.
-5. With the given job ID, it is possible to check the job status by calling:
-   * `GET /processor/{executable}/{job-id}` for a single processor, or
-   * `GET /workflow/{workflow-id}/{job-id}` for the workflow.
-6. Download the resulting workspace via the `GET /workspace/{workspace-id}` endpoint and get back an OCRD-ZIP.
-   Set the request header to `Accept: application/json` in case you only want the meta-data of the workspace but not the
-   files.
+## 4. Suggested OCR-D System Architecture
 
-## Suggested OCR-D System Architecture
+This document presents two possible architecture setups using OCR-D Network, together with the technical details
+behind them. In both setups, all servers are implemented using [FastAPI](https://fastapi.tiangolo.com/). Behind the
+scenes, FastAPI runs [Uvicorn](https://www.uvicorn.org/), an [ASGI](https://asgi.readthedocs.io/en/latest/) web server
+implementation for Python. [RabbitMQ](https://www.rabbitmq.com/) is used for the Process Queue,
+and [MongoDB](https://www.mongodb.com/) is the database system. There are many options for a reverse proxy, such as
+Nginx, Apache, or HAProxy. From our side, we recommend using [Traefik](https://doc.traefik.io/traefik/).
 
-There are various ways to build a system which implements this Web API. In this section, we describe a distributed
-architecture, which greatly improves the scalability, flexibility, and reliability of the system compared to
-the [CLI](https://ocr-d.de/en/spec/cli) and the Distributed Processor REST Calls approach.
+### 4.1 Processors as workers
 
 <figure>
-  <img src="assets/web-api-distributed-queue.jpg" alt="Distributed architecture with the Web API">
+  <img src="assets/web-api-distributed-queue.jpg" alt="Distributed architecture where processors are deployed as workers.">
   <figcaption>
-    Fig. 1: OCR-D System Architecture
+    Fig. 1: A distributed architecture with a message queue. In this architecture, processors are deployed as workers.
   </figcaption>
 </figure>
-In this architecture, all servers are implemented using [FastAPI](https://fastapi.tiangolo.com/). Behind the scene, it
-runs [Uvicorn](https://www.uvicorn.org/), an [ASGI](https://asgi.readthedocs.io/en/latest/) web server implementation
-for Python. [RabbitMQ](https://www.rabbitmq.com/) is used for the Process Queue, and [MongoDB](https://www.mongodb.com/)
-is the database system. There are many options for a reverse proxy, such as Nginx, Apache, or HAProxy. From our side, we
-recommend using [Traefik](https://doc.traefik.io/traefik/).
-
-### Description
-
-As shown in Fig. 1, each section in the [Web API specification](#the-specification) is implemented by different servers,
-which are Discovery Server, Processing Server, Workflow Server, and Workspace Server respectively. Although each server
-in the figure is deployed on its own machine, it is completely up to the implementors to decide which machines run which
-servers. However, having each processor run on its own machine reduces the risk of version and resource conflicts.
-Furthermore, the machine can be customized to best fit the processor's hardware requirements and throughput demand. For
-example, some processors need GPU computation, while others do not, or some need more CPU capacity while others need
-more memory.
+As shown in Fig. 1, each section in the [Web API specification](#3-the-specification) is implemented by different
+servers, which are Discovery Server, Processing Server, Workflow Server, and Workspace Server respectively. Although
+each server in the figure is deployed on its own machine, it is completely up to the implementers to decide which
+machines run which servers. However, having each processor run on its own machine reduces the risk of version and
+resource conflicts. Furthermore, the machine can be customized to best fit the processor's hardware requirements and
+throughput demand. For example, some processors need GPU computation, while others do not, or some need more CPU
+capacity while others need more memory.
 
-**Processing**: since the `Processing` section is provided by [OCR-D Core](https://github.com/OCR-D/core), implementors
-do not need to implement Processing Server, Process Queue, and Processing Worker themselves, they can reuse/customize
-the existing implementation. Once a request arrives, it will be pushed to a job queue. A job queue always has the same
-name as its respective processors. For example, `ocrd-olena-binarize`processors listen only to the queue
+**Processing**: once a request arrives, it will be pushed to a job queue. A job queue always has the same name as its
+respective processors. For example, `ocrd-olena-binarize` processors listen only to the queue
 named `ocrd-olena-binarize`. A Processing Worker, which is
 an [OCR-D Processor](https://ocr-d.de/en/spec/glossary#ocr-d-processor) running as a worker, listens to the queue,
 pulls new jobs when available, processes them, and pushes the job statuses back to the queue if necessary. One
 normally does not run a Processing Worker directly, but via a Processing Server. Job statuses can be pushed back to
 the queue, depending
-on the [job configuration](#process-queue), so that other services get updates and act accordingly.
+on the [job configuration](#63-process-queue), so that other services get updates and act accordingly.
 
 **Database**: in this architecture, a database is required to store information such as user requests, job statuses,
 workspaces, etc. [MongoDB](https://www.mongodb.com/) is required here.
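+
+The queue-per-processor pattern can be illustrated with a short [pika](https://pika.readthedocs.io/) sketch. This is
+not OCR-D/core's actual worker code; the connection URL and the processing logic are placeholders, and only the
+queue-naming and manual-acknowledgment behaviour described above are meant to be accurate:
+
+```python
+import pika
+
+def on_message(channel, method, properties, body):
+    # ... run ocrd-olena-binarize on the job described in `body` ...
+    channel.basic_ack(delivery_tag=method.delivery_tag)  # positive ACK on success
+
+connection = pika.BlockingConnection(pika.URLParameters("amqp://user:password@rabbitmq-host:5672"))
+channel = connection.channel()
+# The queue carries the same name as the processor executable.
+channel.queue_declare(queue="ocrd-olena-binarize")
+channel.basic_consume(queue="ocrd-olena-binarize", on_message_callback=on_message)
+channel.start_consuming()
+```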
@@ -131,20 +110,83 @@ a [Network File System (NFS)](https://en.wikipedia.org/wiki/Network_File_System)
 Servers (specifically processors) can work in a shared storage environment and access files as if they are local
 files. To get data into the NFS, one could use the `POST /workspace` endpoint to
 upload [OCRD-ZIP](https://ocr-d.de/en/spec/ocrd_zip) files. However, this approach is only appropriate for testing or
-very limited data sizes. Usually, Workspace Server should be able to pull data from other storages.
+very limited data sizes. Usually, the Workspace Server should be able to pull data from other storage.
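+
+For instance, a client could create a workspace from an OCRD-ZIP roughly like this. This is a sketch only: the server
+address is a placeholder, and the exact request and response bodies are defined by `POST /workspace`
+in [openapi.yml](openapi.yml):
+
+```python
+import requests
+
+with open("workspace.ocrd.zip", "rb") as ocrd_zip:
+    response = requests.post(
+        "http://workspace-server.example/workspace",  # assumed address
+        data=ocrd_zip,
+        headers={"Content-Type": "application/vnd.ocrd+zip"},
+    )
+response.raise_for_status()
+print(response.json())  # the created Workspace resource, including its ID
+```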
+
+### 4.2 Processors as servers
+
+<figure>
+  <img src="assets/web-api-distributed.jpg" alt="Distributed architecture where processors are deployed as servers.">
+  <figcaption>
+    Fig. 2: A distributed architecture where processors are deployed as servers.
+  </figcaption>
+</figure>
+
+The difference between this architecture and the one shown in Fig. 1 is the processors. In this architecture, each
+processor runs as a server and exposes one endpoint. When the Processing Server receives a request, it forwards that
+request to the respective Processor Server and waits for the response.
 
-### Processing Server
+This architecture is simpler than the other one, since there is no need to have a Process Queue involved. Without a
+queue, all communication is synchronous: clients need to wait for responses from the Processing Server. This can take
+a long time, so a high timeout is recommended.
 
-The Processing Server is a server which exposes REST endpoints in the `Processing` section of
+## 5. Usage
+
+Both setups above can be used as follows:
+
+1. Retrieve information about the system via endpoints in the `Discovery` section.
+2. Create a workspace (from an [OCRD-ZIP](https://ocr-d.de/en/spec/ocrd_zip) or METS URL) via the `POST /workspace`
+   endpoint and get back a workspace ID.
+3. Create a workflow by uploading a workflow script to the system via the `POST /workflow` endpoint and get back a
+   workflow ID.
+4. One can either (see the sketch after this list):
+   * Trigger a single processor on a workspace by calling the `POST /processor/run/{executable}` endpoint with the
+     chosen processor name, workspace ID and parameters, or
+   * Start a workflow on a workspace by calling the `POST /workflow/run` endpoint with the chosen workflow ID
+     and workspace ID.
+   * In both cases, a job ID is returned.
+5. With the given job ID, it is possible to check the job status by calling:
+   * `GET /processor/job/{job-id}` for a single processor, or
+   * `GET /workflow/job/{job-id}` for the workflow.
+6. Download the resulting workspace via the `GET /workspace/{workspace-id}` endpoint and get back an OCRD-ZIP.
+   Set the request header to `Accept: application/json` in case you only want the meta-data of the workspace but not
+   the files.
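+
+A minimal sketch of steps 3 to 5 with Python's `requests`, assuming a server reachable at `http://localhost:8000` and
+an existing workspace (all paths and IDs are placeholders; the endpoints and response fields follow
+[openapi.yml](openapi.yml)):
+
+```python
+import time
+import requests
+
+BASE = "http://localhost:8000"  # assumed server address
+
+# Step 3: upload a workflow file written in `ocrd process` syntax.
+with open("workflow.txt", "rb") as f:
+    workflow_id = requests.post(f"{BASE}/workflow", files={"workflow": f}).json()["workflow_id"]
+
+# Step 4: run the workflow on an existing workspace.
+job = requests.post(
+    f"{BASE}/workflow/run",
+    params={"mets_path": "/data/workspace/mets.xml", "workflow_id": workflow_id},
+).json()
+
+# Step 5: poll the status of the workflow job.
+for _ in range(60):
+    print(requests.get(f"{BASE}/workflow/job/{job['job_id']}").json())
+    time.sleep(10)
+```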
+
+## 6. Technical Details
+
+### 6.1 How does the Processing Server process requests?
+
+As one can see from the two setups above, the Processing Server needs to go through many steps when it receives a
+request. These steps are illustrated in Fig. 3 below.
+
+<figure>
+  <img src="assets/request-processing.jpg" alt="This activity diagram shows how a Processing Server handles a request">
+  <figcaption>
+    Fig. 3: This activity diagram shows how a Processing Server handles a request.
+  </figcaption>
+</figure>
+
+**Job cache**: there are usually dependencies between jobs, i.e. one job can only run after other jobs have finished.
+To support this, when the Processing Server receives a job at the `/processor/run/{processor-name}` endpoint, it first
+checks whether all jobs it depends on have finished. If not, the newly arrived job is cached and executed later.
+
+**Page lock**: to avoid conflicts, only one job is allowed to write to a file group at a time. Therefore, before a job
+is executed, its output file group is locked so that other jobs cannot write to it. The group is then unlocked when
+the job finishes. If a job needs to write to a locked file group, it is cached and executed later.
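+
+The two rules can be summarized in a few lines of Python. This is an illustrative sketch of the decision logic only,
+not OCR-D/core's actual implementation; the job fields follow the ProcessorArgs schema in [openapi.yml](openapi.yml),
+and the other names are made up for the example:
+
+```python
+from dataclasses import dataclass, field
+
+@dataclass
+class SchedulerState:
+    finished: set = field(default_factory=set)   # job_ids that reached a final state
+    locked: set = field(default_factory=set)     # output file groups of running jobs
+    cached: list = field(default_factory=list)   # jobs waiting to be executed later
+
+def can_run(job: dict, state: SchedulerState) -> bool:
+    # Job cache rule: every job listed in `depends_on` must have finished.
+    if not set(job.get("depends_on", [])) <= state.finished:
+        return False
+    # Page lock rule: none of the output file groups may be locked.
+    return not set(job["output_file_grps"]) & state.locked
+```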
+
+### 6.2 Processing Server
+
+A Processing Server is a server which exposes REST endpoints in the `Processing` section of
 the [Web API specification](openapi.yml). In the queue-based system architecture, a Processing Server is responsible for
-deployment management and enqueueing workflow jobs. For the former, a Processing Server can deploy, re-use, and shutdown
+deployment management and enqueuing workflow jobs. For the former, a Processing Server can deploy, re-use, and shut down
 Processing Workers, Process Queue, and Database, depending on the configuration. For the latter, it decodes requests and
-delegates them to the Process Queue.
+delegates them to the Process Queue. Additionally, it is possible to start the needed components externally, e.g. with
+Docker. In that case, `skip_deployment: true` can be set in the `process_queue` and `database` sections of the
+configuration file.
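+
+For example, the relevant part of a configuration pointing to externally started components might look like this (a
+sketch only; the addresses and ports are placeholders, and the full set of keys is given by the configuration file
+schema referenced below):
+
+```yaml
+process_queue:
+  address: rabbitmq.example.org
+  port: 5672
+  skip_deployment: true
+database:
+  address: mongodb.example.org
+  port: 27017
+  skip_deployment: true
+```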
 
 To start a Processing Server, run
 
 ```shell
-$ ocrd processing-server --address=<host>:<port> /path/to/config.yml
+$ ocrd network processing-server --address=<host>:<port> /path/to/config.yml
 ```
 
 This command starts a Processing Server on the provided IP address and port. It accepts only one argument, which is the
@@ -193,7 +235,7 @@ hosts:
 
 There are three main sections in the configuration file.
 
-1. `process_queue`: it contains the `address` and `port`, where the Process Queue was deployed, or will be deployed with
+1. `process_queue`: it contains the `address` and `port`, where the Process Queue is deployed, or will be deployed with
    the specified `credentials`. If the `ssh` property is present, the Processing Server will try to connect to
    the `address` via `ssh` with the provided `username` and `password` and deploy [RabbitMQ](https://www.rabbitmq.com/) at
    the specified `port`. The remote machine must have [Docker](https://www.docker.com/) installed since the deployment
@@ -205,20 +247,22 @@ There are three main sections in the configuration file.
    The `ssh` section behaves exactly the same as described in the `process_queue` section above.
 3. `hosts`: this section contains a list of hosts, usually virtual machines, where Processing Workers should be
    deployed. To be able to connect to a host, an `address` and `username` are required, then comes either `password`
-   or `path_to_privkey` (path to a private key). All Processing Workers, which will be deployed, must be declared under
+   or `path_to_privkey` (path to a private key). All Processing Workers which should be deployed must be declared under
    the `workers` property. In case `deploy_type` is `docker`, make sure that [Docker](https://www.docker.com/) is
    installed in the target machine and the provided `username` has enough rights to execute Docker commands.
 
-Among three sections, only the `process_queue` is required. However, if `hosts` is present, `database` must be there
-as well. For more information, please check the [configuration file schema](web_api/config.schema.yml).
+Among the three sections, `process_queue` and `database` are required; `hosts` is optional. Processing Workers can
+additionally be started externally and register themselves with the `process_queue`. For more information, please
+check the [configuration file schema](web_api/config.schema.yml).
 
-### Process Queue
+### 6.3 Process Queue
 
-By using a queuing system for individual per-workspace per-job processor runs, specifically as message queueing
+By using a queuing system for individual per-workspace per-job processor runs, specifically as message queuing
 with [RabbitMQ](https://www.rabbitmq.com/), the reliability and flexibility of the Processing Server are greatly
 improved over a system directly coupling the workflow engine and distributed processor instances.
 
-In our implementation of the Process Queue, manual acknowledgement mode is used. This means, when a Processing Worker
+In our implementation of the Process Queue, manual acknowledgment mode is used. This means, when a Processing Worker
 finishes successfully, it sends a positive ACK signal to RabbitMQ. In case of failure, it tries again three times before
 sending a negative ACK signal. When a negative signal is received, RabbitMQ will re-queue the message. If there is not
 any ACK signal sent for any reason (e.g. consumer crash, power outage, network problem, etc.), RabbitMQ will
@@ -232,10 +276,9 @@
 re-queued. If yes, and the status of this process in the database is not `SUCCESS`, the Processing Worker will process
 the job in the message again.
 
 When a Processing Server receives a request, it creates a message based on the request content, then pushes it to a
-job queue. A job queue always has the same name as its respective processors. For
-example, `ocrd-olena-binarize` processors listen only to the job queue named `ocrd-olena-binarize`. Below is an example
-of how a message looks like. For a detailed schema, please check
-the [message schema](web_api/processing-message.schema.yml).
+job queue. A job queue always has the same name as its respective processors. For example, `ocrd-olena-binarize`
+processors listen only to the job queue named `ocrd-olena-binarize`. Below is an example of what a message looks like.
+For a detailed schema, please check the [message schema](web_api/processing-message.schema.yml).
 
 ```yaml
 job_id: uuid
@@ -253,17 +296,18 @@ parameters:
 result_queue_name: ocrd-cis-ocropy-binarize-result
 callback_url: https://my.domain.com/callback
+internal_callback_url: http://ocrd-processing-server:8000
 created_time: 1668782988590
 ```
 
-In the message content, `job_id`, `processor_name`, and `created_time` are added by the Processing Server, while the
-rest comes from the body of the `POST /processor/{executable}` request.
+In the message content, `job_id`, `processor_name`, `internal_callback_url` and `created_time` are added by the
+Processing Server, while the rest comes from the body of the `POST /processor/run/{executable}` request.
 
 Instead of `path_to_mets`, one can also use `workspace_id` to specify a workspace. An ID of a workspace can be obtained
-from the Workspace Server.
+from the Workspace Server, which is not part of OCR-D/core.
 
-In case `result_queue_name` property is presented, the result of the processing will be pushed to the queue with the
+In case the `result_queue_name` property is present, the result of the processing will be pushed to the queue with the
 provided name. If the queue does not exist yet, it will be created on the fly. This is useful when there is another
 service waiting for the results of processing. That service can simply listen to that queue and will be immediately
 notified when the results are available. Below is a simple Python script to demonstrate how a service can listen to
@@ -293,8 +337,8 @@ def main():
 It is important that the result queue exists before one starts listening on it, otherwise an error is thrown. The best
 way to ensure this is trying to create the result queue in the listener service, as shown in the Python script above.
 In RabbitMQ, this action is idempotent, which means that the creation only happens if the queue doesn't exist yet,
-otherwise nothing will happen. For more information, please check
-the [RabbitMQ tutorials](https://www.rabbitmq.com/getstarted.html).
+otherwise nothing will happen. For more information, please check the
+[RabbitMQ tutorials](https://www.rabbitmq.com/getstarted.html).
 
 If the `callback_url` in the processing message is set, a `POST` request will be made to the provided endpoint when the
 processing is finished. The body of the request is the result message described below.
@@ -310,33 +354,44 @@ path_to_mets: /path/to/mets.xml
 ```
 
 With the returned `job_id`, one can retrieve more information by sending a `GET` request to
-the `/processor/{executable}/{job_id}` endpoint, or to `/processor/{executable}/{job_id}/log` to get all logs of that
+the `/processor/job/{job_id}` endpoint, or to `/processor/log/{job_id}` to get all logs of that
 job.
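+
+A callback receiver can be as small as the following sketch. It is hypothetical: the route, host, and printed fields
+are placeholders, and the actual payload is defined by the [result message schema](web_api/result-message.schema.yml).
+FastAPI is used here simply because it is the framework used throughout this architecture:
+
+```python
+from fastapi import FastAPI, Request
+
+app = FastAPI()
+
+@app.post("/callback")
+async def on_result(request: Request):
+    result = await request.json()  # the result message
+    print(result.get("job_id"), result.get("state"))
+    return {}
+```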
 
-### Processing Worker
+### 6.4 Processing Worker
 
-There is normally no need to start (or stop) a Processing Worker manually, since it can be managed by a Processing
-Server via a [configuration file](#processing-server). However, if it is necessary to do so, there are two ways to start
-a Processing Worker:
+A Processing Worker can be started manually, or it can be managed by a Processing Server via
+a [configuration file](#62-processing-server). There are two ways to start a Processing Worker:
 
 ```shell
-# 1. Use ocrd CLI bundled with OCR-D/core
-$ ocrd server --type=worker --queue=<queue-url> --database=<database-url>
+# 1. Use the processor name
+$ <processor-name> worker --queue=<queue-url> --database=<database-url>
 
-# 2. Use processor name
-$ <processor-name> --server --type=worker --queue=<queue-url> --database=<database-url>
+# 2. Use the ocrd CLI bundled with OCR-D/core
+$ ocrd network processing-worker --queue=<queue-url> --database=<database-url>
 ```
 
 * `--queue`: a [RabbitMQ connection string](https://www.rabbitmq.com/uri-spec.html) to a running instance.
 * `--database`: a [MongoDB connection string](https://www.mongodb.com/docs/manual/reference/connection-string/) to a
   running instance.
 
-### Database
+### 6.5 Processor Server
+
+As with the Processing Worker, there are two ways to start a Processor Server:
+
+```shell
+# 1. Use the processor name
+$ <processor-name> server --address=<host>:<port> --database=<database-url>
+
+# 2. Use the ocrd CLI bundled with OCR-D/core
+$ ocrd network processor-server --address=<host>:<port> --database=<database-url>
+```
+
+* `--address`: the address to run the Processor Server on, in the format `host:port`.
+* `--database`: a [MongoDB connection string](https://www.mongodb.com/docs/manual/reference/connection-string/) to a
+  running instance.
+
+### 6.6 Database
 
 A database is required to store necessary information such as user requests, job statuses, workspaces,
 etc. [MongoDB](https://www.mongodb.com/) is used in this case. To connect to MongoDB via a Graphical User
 Interface, [MongoDB Compass](https://www.mongodb.com/products/compass) is recommended.
-
-When a Processing Worker connects to the database for the first time, it will create a database called `ocrd`.
-For collections, each processor creates and works on a collection with the same name as its own. For example,
-all `ocrd-olena-binarize` processors will read and write to the `ocrd-olena-binarize` collection only.
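+
+For a quick look at job metadata outside of a GUI, a few lines of [pymongo](https://pymongo.readthedocs.io/) suffice.
+This is a hypothetical sketch: the connection string, database name, and collection name are placeholders that depend
+on the deployment, so check them first (e.g. with MongoDB Compass):
+
+```python
+from pymongo import MongoClient
+
+client = MongoClient("mongodb://mongodb.example.org:27017")  # assumed connection string
+for job in client["ocrd"]["jobs"].find({"state": "FAILED"}):  # assumed db/collection names
+    print(job.get("job_id"), job.get("log_file_path"))
+```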