From e2d40380b72bc3f4c0c4bca29f5ac608abd54652 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Tue, 6 Sep 2022 17:44:48 +0200 Subject: [PATCH 01/56] wip --- ocrd_eval.md | 23 +++++++++++ ocrd_eval.sample.yml | 38 +++++++++++++++++ ocrd_eval.schema.yml | 97 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 158 insertions(+) create mode 100644 ocrd_eval.md create mode 100644 ocrd_eval.sample.yml create mode 100644 ocrd_eval.schema.yml diff --git a/ocrd_eval.md b/ocrd_eval.md new file mode 100644 index 0000000..3c9b72c --- /dev/null +++ b/ocrd_eval.md @@ -0,0 +1,23 @@ +# Quality Assurance in OCR-D + +## Rationale + +Estimating the quality of OCR requires workflows run on representative data, +evaluation metrics and evaluation tools that need to work together in a +well-defined manner to allow users to make informed decisions about which OCR +solution works best for their use case. + +## Evaluation metrics + + + +## Evaluation JSON schema + + + +The results of an evaluation should be expressed in JSON according to +the [`ocrd-eval.json`](https://ocr-d.de/en/spec/ocrd-eval.schema.json). + +## Tools + + diff --git a/ocrd_eval.sample.yml b/ocrd_eval.sample.yml new file mode 100644 index 0000000..c543e0b --- /dev/null +++ b/ocrd_eval.sample.yml @@ -0,0 +1,38 @@ +wf1-data345-eval1: + label: Workflow 1 on Data 345 + metadata: + workflow: https://example.org/workflow/1 + eval_workflow: https://example.org/workflow/eval1 + eval_data: https://example.org/workspace/345 + gt_data: https://gt.ocr-d.de/workspace/789 + document: + publication_year: 1789 + number_of_pages: 10 + evaluations: + document_wide: + cer: 0.57 + cer_document_mean: 0.38 + cer_document_median: 0.52 + cer_document_standard_deviation: 0.12 + by_page: + PHYS_0001: + cer: 0.8 +wf2-data345-eval1: + label: Workflow 2 on Data 345 + metadata: + workflow: https://example.org/workflow/2 + eval_workflow: https://example.org/workflow/eval1 + eval_data: https://example.org/workspace/345 + gt_data: https://gt.ocr-d.de/workspace/789 + document: + publication_year: 1789 + number_of_pages: 10 + evaluations: + document_wide: + cer: 0.88 + cer_document_mean: 0.77 + cer_document_median: 0.66 + cer_document_standard_deviation: 0.55 + by_page: + PHYS_0001: + cer: 0.9 diff --git a/ocrd_eval.schema.yml b/ocrd_eval.schema.yml new file mode 100644 index 0000000..839568f --- /dev/null +++ b/ocrd_eval.schema.yml @@ -0,0 +1,97 @@ +type: object +description: An evaluation report +properties: + type: object + patternProperties: + '*': + type: object + properties: + label: + type: string + description: Label to be displayed in UI + metadata: + type: object + description: Metadata about the evaluation(s) + properties: + workflow: + type: object + properties: + $id: + type: string + format: uri + description: The Nextflow workflow used to generate the data + label: + type: string + description: Label to be displayed in UI + workflow_job: + type: object + properties: + $id: + type: string + format: uri + description: The WorkflowJob that produced the OCR + label: + type: string + description: Label to be displayed in UI + eval_data: + type: object + properties: + $id: + type: string + format: uri + description: The OCRD-ZIP of the data to be evaluated + label: + type: string + description: Label to be displayed in UI + gt_data: + type: object + properties: + $id: + type: string + format: uri + description: The OCRD-ZIP of the Ground Truth for this evaluationi + label: + type: string + description: Label to be displayed in UI + document: + type: 
object + properties: + publication_year: + type: number + description: Year of the document was originally published + number_of_pages: + type: number + description: Number of pages in this OCRD-ZIP + provenance: + type: object + description: Information on which tools in which version were used in determining metrics + evaluation: + type: object + description: The metrics measured for this document + document_wide: + type: object + description: Document-wide metrics + properties: { $ref: '#$defs/metrics' } + by_page: + type: object + description: Metrics page-by-page, key is the page ID + patternProperties: + '*': + type: object + properties: { $ref: '#$defs/metrics' } +$defs: + metrics: + cer_document_total: + description: CER calculated over the text of a page (in by_page) or combined text of all pages (in document_wide) of GT and OCR + cer_document_mean: + description: Arithmetic mean of the page-wise CER + cer_document_median: + description: Median of the page-wise CER + cer_document_standard_deviation: + description: Standard deviation the page-wise CER + wall_time: + description: Actual time needed for processing workflow + cpu_time: + description: Cumulative CPU time used for processing workflow + + From 5aa6bd5bcfa3cb89edb807a429fbccdb94891269 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Wed, 21 Sep 2022 17:23:21 +0200 Subject: [PATCH 02/56] rewrite eval schema and saple according to OCR-D/zenhub#123 --- Makefile | 14 +-- ocrd_eval.sample.json | 120 ++++++++++++++++++ ocrd_eval.sample.yml | 109 ++++++++++++----- ocrd_eval.schema.json | 238 ++++++++++++++++++++++++++++++++++++ ocrd_eval.schema.yml | 273 +++++++++++++++++++++++++++-------------- ocrd_tool.schema.json | 276 +++++++++++++++++++++++++++++++++++++++++- 6 files changed, 902 insertions(+), 128 deletions(-) create mode 100644 ocrd_eval.sample.json create mode 100644 ocrd_eval.schema.json diff --git a/Makefile b/Makefile index 111a114..66f48f9 100644 --- a/Makefile +++ b/Makefile @@ -1,10 +1,10 @@ -json: \ - bagit-profile.json \ - ocrd_tool.schema.json \ - openapi.json - -deps: - pip install yaml click +json: $(shell find -name '*.json') %.json: %.yml python3 scripts/yaml-to-json.py $< $@ + +validate: json + jsonschema --output pretty --validator Draft201909Validator --instance ocrd_eval.sample.json ocrd_eval.schema.json + +deps: + pip install yaml click jsonschema diff --git a/ocrd_eval.sample.json b/ocrd_eval.sample.json new file mode 100644 index 0000000..e1a540f --- /dev/null +++ b/ocrd_eval.sample.json @@ -0,0 +1,120 @@ +[ + { + "@id": "https://github.com/OCR-D/quiver/tree/data/evaluations/wf1-data345-eval1.json", + "label": "OCR workflow 1 on workspace 345", + "metadata": { + "ocr_workflow": { + "@id": "https://github.com/OCR-D/quiver/tree/data/workflows/1.nf", + "label": "OCR Workflow 1" + }, + "eval_workflow": { + "@id": "https://github.com/OCR-D/quiver/tree/data/workflows/eval1.nf", + "label": "Evaluation Workflow 1" + }, + "gt_workspace": { + "@id": "https://gt.ocr-d.de/workspace/789", + "label": "GT workspace 789 (19th century fraktur)" + }, + "ocr_workspace": { + "@id": "https://github.com/OCR-D/quiver/tree/data/workspaces/3000.ocrd.zip", + "label": "OCR result workspace 3000" + }, + "eval_workspace": { + "@id": "https://github.com/OCR-D/quiver/tree/data/workspaces/345.ocrd.zip", + "label": "Evaluation Workspace 345" + }, + "workflow_steps": { + "0": "Processor A", + "1": "Processor B" + }, + "workflow_model": "Fraktur_GT4HistOCR", + "document_metadata": { + "fonts": [ + "antiqua", + "fraktur" + ], + 
"publication_century": "1800-1900", + "publication_decade": "1850-1860", + "publication_year": 1855, + "number_of_pages": 100, + "layout": "simple" + } + }, + "evaluation": { + "document_wide": { + "wall_time": 1234, + "cer": 0.57, + "cer_min_max": [ + 0.2, + 0.57 + ] + }, + "by_page": [ + { + "page_id": "PHYS_0001", + "cer": 0.8, + "processing_time": 2.1 + } + ] + } + }, + { + "@id": "https://github.com/OCR-D/quiver/tree/data/evaluations/wf2-data345-eval1.json", + "label": "OCR Workflow 2 on Data 345", + "metadata": { + "ocr_workflow": { + "@id": "https://github.com/OCR-D/quiver/tree/data/workflows/2.nf", + "label": "OCR Workflow 2" + }, + "eval_workflow": { + "@id": "https://github.com/OCR-D/quiver/tree/data/workflows/eval1.nf", + "label": "Evaluation Workflow 1" + }, + "gt_workspace": { + "@id": "https://gt.ocr-d.de/workspace/789", + "label": "GT workspace 789 (19th century fraktur)" + }, + "ocr_workspace": { + "@id": "https://github.com/OCR-D/quiver/tree/data/workspaces/3000.ocrd.zip", + "label": "OCR result workspace 3000" + }, + "eval_workspace": { + "@id": "https://github.com/OCR-D/quiver/tree/data/workspaces/345.ocrd.zip", + "label": "Evaluation Workspace 345" + }, + "workflow_steps": { + "0": "Processor A", + "1": "Processor B" + }, + "workflow_model": "Fraktur_GT4HistOCR", + "document_metadata": { + "fonts": [ + "antiqua", + "fraktur" + ], + "publication_century": "1800-1900", + "publication_decade": "1850-1860", + "publication_year": 1855, + "number_of_pages": 100, + "layout": "simple" + } + }, + "evaluation": { + "document_wide": { + "wall_time": 4567, + "cer": 0.9, + "cer_min_max": [ + 0.2, + 0.99 + ] + }, + "by_page": [ + { + "page_id": "PHYS_0001", + "cer": 0.9, + "processing_time": 2.1 + } + ] + } + } +] \ No newline at end of file diff --git a/ocrd_eval.sample.yml b/ocrd_eval.sample.yml index c543e0b..74383f8 100644 --- a/ocrd_eval.sample.yml +++ b/ocrd_eval.sample.yml @@ -1,38 +1,85 @@ -wf1-data345-eval1: - label: Workflow 1 on Data 345 +- '@id': https://github.com/OCR-D/quiver/tree/data/evaluations/wf1-data345-eval1.json + label: OCR workflow 1 on workspace 345 metadata: - workflow: https://example.org/workflow/1 - eval_workflow: https://example.org/workflow/eval1 - eval_data: https://example.org/workspace/345 - gt_data: https://gt.ocr-d.de/workspace/789 - document: - publication_year: 1789 - number_of_pages: 10 - evaluations: + ocr_workflow: + '@id': https://github.com/OCR-D/quiver/tree/data/workflows/1.nf + label: OCR Workflow 1 + eval_workflow: + '@id': https://github.com/OCR-D/quiver/tree/data/workflows/eval1.nf + label: Evaluation Workflow 1 + gt_workspace: + '@id': https://gt.ocr-d.de/workspace/789 + label: GT workspace 789 (19th century fraktur) + ocr_workspace: + '@id': https://github.com/OCR-D/quiver/tree/data/workspaces/3000.ocrd.zip + label: OCR result workspace 3000 + eval_workspace: + '@id': https://github.com/OCR-D/quiver/tree/data/workspaces/345.ocrd.zip + label: Evaluation Workspace 345 + workflow_steps: + '0': Processor A + '1': Processor B + workflow_model: Fraktur_GT4HistOCR + document_metadata: + fonts: + - antiqua + - fraktur + publication_century: 1800-1900 + publication_decade: 1850-1860 + publication_year: 1855 + number_of_pages: 100 + layout: simple + evaluation: document_wide: + wall_time: 1234 cer: 0.57 - cer_document_mean: 0.38 - cer_document_median: 0.52 - cer_document_standard_deviation: 0.12 + cer_min_max: + - 0.2 + - 0.57 by_page: - PHYS_0001: - cer: 0.8 -wf2-data345-eval1: - label: Workflow 2 on Data 345 + - page_id: PHYS_0001 + cer: 0.8 
+ processing_time: 2.1 + +- '@id': https://github.com/OCR-D/quiver/tree/data/evaluations/wf2-data345-eval1.json + label: OCR Workflow 2 on Data 345 metadata: - workflow: https://example.org/workflow/2 - eval_workflow: https://example.org/workflow/eval1 - eval_data: https://example.org/workspace/345 - gt_data: https://gt.ocr-d.de/workspace/789 - document: - publication_year: 1789 - number_of_pages: 10 - evaluations: + ocr_workflow: + '@id': https://github.com/OCR-D/quiver/tree/data/workflows/2.nf + label: OCR Workflow 2 + eval_workflow: + '@id': https://github.com/OCR-D/quiver/tree/data/workflows/eval1.nf + label: Evaluation Workflow 1 + gt_workspace: + '@id': https://gt.ocr-d.de/workspace/789 + label: GT workspace 789 (19th century fraktur) + ocr_workspace: + '@id': https://github.com/OCR-D/quiver/tree/data/workspaces/3000.ocrd.zip + label: OCR result workspace 3000 + eval_workspace: + '@id': https://github.com/OCR-D/quiver/tree/data/workspaces/345.ocrd.zip + label: Evaluation Workspace 345 + workflow_steps: + '0': Processor A + '1': Processor B + workflow_model: Fraktur_GT4HistOCR + document_metadata: + fonts: + - antiqua + - fraktur + publication_century: 1800-1900 + publication_decade: 1850-1860 + publication_year: 1855 + number_of_pages: 100 + layout: simple + evaluation: document_wide: - cer: 0.88 - cer_document_mean: 0.77 - cer_document_median: 0.66 - cer_document_standard_deviation: 0.55 + wall_time: 4567 + cer: 0.9 + cer_min_max: + - 0.2 + - 0.99 by_page: - PHYS_0001: - cer: 0.9 + - page_id: PHYS_0001 + cer: 0.9 + processing_time: 2.1 diff --git a/ocrd_eval.schema.json b/ocrd_eval.schema.json new file mode 100644 index 0000000..9fa0463 --- /dev/null +++ b/ocrd_eval.schema.json @@ -0,0 +1,238 @@ +{ + "$schema": "https://json-schema.org/draft/2019-09/schema", + "$id": "https://ocr-d.de/en/spec/ocrd_eval.schema.json", + "title": "A list of evaluations for OCR-D", + "description": "- All references to URL are JSON-LD-like objects with at least an `@id`\n property referencing the URL and `label` for a human-readable label to be\n used in the UI\n", + "type": "array", + "items": { + "required": [ + "@id", + "label", + "metadata", + "evaluation" + ], + "unevaluatedProperties": false, + "allOf": [ + { + "$ref": "#/$defs/LabeledUrl" + }, + { + "properties": { + "metadata": { + "$ref": "#/$defs/EvaluationMetadata" + }, + "evaluation": { + "$ref": "#/$defs/EvaluationReport" + } + } + } + ] + }, + "$defs": { + "LabeledUrl": { + "type": "object", + "required": [ + "@id" + ], + "properties": { + "@id": { + "type": "string", + "format": "uri", + "description": "URL of the thing" + }, + "label": { + "type": "string", + "description": "Description of the thing for UI purposes" + } + } + }, + "EvaluationMetadata": { + "type": "object", + "title": "Metadata about one evaluation", + "additionalProperties": false, + "description": "EvaluationMetadata contains all the info on how an EvaluationReport came to be.\nThere are two OCR-D *workflows* involved:\n - ocr_workflow: The workflow which produced the OCR results to evaluate\n - eval_workflow: The workflow run to evaluate OCR and GT\n\nThere are three OCR-D *workspaces* involved:\n - gt_workspace: The workspace containing the GT\n - ocr_workspace: The workspace containing the OCR results from ocr_workflow\n - eval_workspace: The workspace on which the eval_workflow was run\n", + "required": [ + "ocr_workflow", + "ocr_workspace", + "eval_workflow", + "eval_workspace", + "gt_workspace", + "document_metadata" + ], + "properties": { + "ocr_workflow": { + 
"allOf": [ + { + "$ref": "#/$defs/LabeledUrl" + } + ], + "description": "The OCR-D workflow that produced the ocr_workspace" + }, + "ocr_workspace": { + "allOf": [ + { + "$ref": "#/$defs/LabeledUrl" + } + ], + "description": "The workspace containing the OCR" + }, + "eval_workflow": { + "allOf": [ + { + "$ref": "#/$defs/LabeledUrl" + } + ], + "description": "The OCR-D workflow that produced the eval_workspace" + }, + "eval_workspace": { + "allOf": [ + { + "$ref": "#/$defs/LabeledUrl" + } + ], + "description": "The workspace containing the evaluation results" + }, + "gt_workspace": { + "allOf": [ + { + "$ref": "#/$defs/LabeledUrl" + } + ], + "description": "The workspace containing the GT" + }, + "workflow_steps": { + "type": "object", + "description": "Human readable description of the individual steps in the workflow (for UI)", + "patternProperties": { + "^[0-9]+$": { + "type": "string", + "description": "Description of this workflow step" + } + } + }, + "workflow_model": { + "type": "string", + "description": "Human readable name of the main model used for recognition in the OCR workflow (for UI)" + }, + "eval_tool": { + "type": "string", + "description": "Human readable name and version of evaluation tool used (for UI" + }, + "document_metadata": { + "type": "object", + "title": "Bibliographical and typographical metadata about the work to be evaluated", + "properties": { + "publication_year": { + "type": "number", + "description": "Year he document was originally published" + }, + "publication_century": { + "type": "string", + "description": "Century he document was originally published", + "pattern": "[12][0-9]{3}-[12][0-9]{3}" + }, + "publication_decade": { + "type": "string", + "description": "Decade the document was originally published", + "pattern": "[12][0-9]{3}-[12][0-9]{3}" + }, + "number_of_pages": { + "type": "number", + "description": "Number of pages in this work (i.e. 
the number of images in the gt_workspace)" + }, + "layout": { + "type": "string", + "enum": [ + "simple", + "complex" + ] + }, + "fonts": { + "type": "array", + "items": { + "type": "string", + "enum": [ + "antiqua", + "fraktur" + ] + } + } + } + }, + "provenance": { + "type": "object", + "description": "Information on which tools in which version were used in determining metrics", + "properties": { + "paramters": { + "type": "object", + "description": "Parameters passed to the evaluation processor" + } + } + } + } + }, + "EvaluationReport": { + "type": "object", + "additionalProperties": false, + "description": "The metrics measured for this document", + "properties": { + "document_wide": { + "type": "object", + "description": "Document-wide metrics" + }, + "by_page": { + "type": "array", + "description": "Metrics page-by-page", + "items": { + "type": "object", + "allOf": [ + { + "properties": { + "page_id": { + "type": "string", + "description": "PAGE ID" + } + } + } + ] + } + } + } + }, + "EvaluationMetrics": { + "cer": { + "description": "CER calculated over the text of a whole page (in by_page) or combined text of all pages (in document_wide)" + }, + "cer_mean": { + "description": "Arithmetic mean of the page-wise CER (in document_wide) or regions on a page (in by_page)" + }, + "cer_median": { + "description": "Median of the page-wise CER (in document_wide) or regions on a page (in by_page)" + }, + "cer_range": { + "type": "array", + "minItems": 2, + "maxItems": 2, + "items": { + "type": "number", + "description": "Minimum and maximum of CER calculated over the text of a whole page (in by_page) or combined text of all pages (in document_wide)" + } + }, + "cer_standard_deviation": { + "description": "Standard deviation the page-wise CER (in document_wide) or regions on a page (in by_page)" + }, + "wer": { + "description": "CER calculated over the text of a whole page (in by_page) or combined text of all pages (in document_wide)" + }, + "wall_time": { + "description": "Actual time needed for processing workflow" + }, + "cpu_time": { + "description": "Cumulative CPU time used for processing workflow" + }, + "pages_per_minute": { + "description": "Number of pages processed per minute" + } + } + } +} \ No newline at end of file diff --git a/ocrd_eval.schema.yml b/ocrd_eval.schema.yml index 839568f..fe1c655 100644 --- a/ocrd_eval.schema.yml +++ b/ocrd_eval.schema.yml @@ -1,97 +1,192 @@ -type: object -description: An evaluation report -properties: - type: object - patternProperties: - '*': - type: object - properties: - label: - type: string - description: Label to be displayed in UI - metadata: +$schema: https://json-schema.org/draft/2019-09/schema +$id: https://ocr-d.de/en/spec/ocrd_eval.schema.json + +title: A list of evaluations for OCR-D +description: > + - All references to URL are JSON-LD-like objects with at least an `@id` + property referencing the URL and `label` for a human-readable label to be + used in the UI +type: array +items: + required: ['@id', 'label', 'metadata', 'evaluation'] + unevaluatedProperties: false + allOf: + - { '$ref': '#/$defs/LabeledUrl' } + - properties: + metadata: { '$ref': '#/$defs/EvaluationMetadata' } + evaluation: { '$ref': '#/$defs/EvaluationReport' } + +# Reusable definitions +$defs: + + LabeledUrl: + type: object + required: ['@id'] + properties: + '@id': + type: string + format: uri + description: URL of the thing + label: + type: string + description: Description of the thing for UI purposes + + EvaluationMetadata: + type: object + title: Metadata 
about one evaluation + additionalProperties: false + description: > + EvaluationMetadata contains all the info on how an EvaluationReport came to be. + + There are two OCR-D *workflows* involved: + - ocr_workflow: The workflow which produced the OCR results to evaluate + - eval_workflow: The workflow run to evaluate OCR and GT + + There are three OCR-D *workspaces* involved: + - gt_workspace: The workspace containing the GT + - ocr_workspace: The workspace containing the OCR results from ocr_workflow + - eval_workspace: The workspace on which the eval_workflow was run + + required: + - ocr_workflow + - ocr_workspace + - eval_workflow + - eval_workspace + - gt_workspace + - document_metadata + + properties: + + ocr_workflow: + allOf: [{ '$ref': '#/$defs/LabeledUrl' }] + description: The OCR-D workflow that produced the ocr_workspace + + ocr_workspace: + allOf: [{ '$ref': '#/$defs/LabeledUrl' }] + description: The workspace containing the OCR + + eval_workflow: + allOf: [{ '$ref': '#/$defs/LabeledUrl' }] + description: The OCR-D workflow that produced the eval_workspace + + eval_workspace: + allOf: [{ '$ref': '#/$defs/LabeledUrl' }] + description: The workspace containing the evaluation results + + gt_workspace: + allOf: [{ '$ref': '#/$defs/LabeledUrl' }] + description: The workspace containing the GT + + workflow_steps: + type: object + description: Human readable description of the individual steps in the workflow (for UI) + patternProperties: + '^[0-9]+$': + type: string + description: Description of this workflow step + + workflow_model: + type: string + description: Human readable name of the main model used for recognition in the OCR workflow (for UI) + + eval_tool: + type: string + description: Human readable name and version of evaluation tool used (for UI + + document_metadata: + type: object + title: Bibliographical and typographical metadata about the work to be evaluated + properties: + + publication_year: + type: number + description: Year he document was originally published + + publication_century: + type: string + description: Century he document was originally published + pattern: '[12][0-9]{3}-[12][0-9]{3}' + + publication_decade: + type: string + description: Decade the document was originally published + pattern: '[12][0-9]{3}-[12][0-9]{3}' + + number_of_pages: + type: number + description: Number of pages in this work (i.e. 
the number of images in the gt_workspace) + + layout: + type: string + enum: ['simple', 'complex'] + + fonts: + type: array + items: + type: string + enum: ['antiqua', 'fraktur'] + + provenance: + type: object + description: Information on which tools in which version were used in determining metrics + properties: + paramters: + type: object + description: Parameters passed to the evaluation processor + + EvaluationReport: + type: object + additionalProperties: false + description: The metrics measured for this document + properties: + document_wide: + type: object + description: Document-wide metrics + #properties: { $ref: '#$defs/EvaluationMetrics' } + by_page: + type: array + description: Metrics page-by-page + items: type: object - description: Metadata about the evaluation(s) - properties: - workflow: - type: object - properties: - $id: - type: string - format: uri - description: The Nextflow workflow used to generate the data - label: - type: string - description: Label to be displayed in UI - workflow_job: - type: object - properties: - $id: - type: string - format: uri - description: The WorkflowJob that produced the OCR - label: + allOf: + - properties: + page_id: type: string - description: Label to be displayed in UI - eval_data: - type: object - properties: - $id: - type: string - format: uri - description: The OCRD-ZIP of the data to be evaluated - label: - type: string - description: Label to be displayed in UI - gt_data: - type: object - properties: - $id: - type: string - format: uri - description: The OCRD-ZIP of the Ground Truth for this evaluationi - label: - type: string - description: Label to be displayed in UI - document: - type: object - properties: - publication_year: - type: number - description: Year of the document was originally published - number_of_pages: - type: number - description: Number of pages in this OCRD-ZIP - provenance: - type: object - description: Information on which tools in which version were used in determining metrics - evaluation: - type: object - description: The metrics measured for this document - document_wide: - type: object - description: Document-wide metrics - properties: { $ref: '#$defs/metrics' } - by_page: - type: object - description: Metrics page-by-page, key is the page ID - patternProperties: - '*': - type: object - properties: { $ref: '#$defs/metrics' } -$defs: - metrics: - cer_document_total: - description: CER calculated over the text of a page (in by_page) or combined text of all pages (in document_wide) of GT and OCR - cer_document_mean: - description: Arithmetic mean of the page-wise CER - cer_document_median: - description: Median of the page-wise CER - cer_document_standard_deviation: - description: Standard deviation the page-wise CER + description: PAGE ID + #- properties: { $ref: '#$defs/EvaluationMetrics' } + + EvaluationMetrics: + + cer: + description: CER calculated over the text of a whole page (in by_page) or combined text of all pages (in document_wide) + + cer_mean: + description: Arithmetic mean of the page-wise CER (in document_wide) or regions on a page (in by_page) + + cer_median: + description: Median of the page-wise CER (in document_wide) or regions on a page (in by_page) + + cer_range: + type: array + minItems: 2 + maxItems: 2 + items: + type: number + description: Minimum and maximum of CER calculated over the text of a whole page (in by_page) or combined text of all pages (in document_wide) + + cer_standard_deviation: + description: Standard deviation the page-wise CER (in document_wide) or regions 
on a page (in by_page) + + wer: + description: CER calculated over the text of a whole page (in by_page) or combined text of all pages (in document_wide) + wall_time: description: Actual time needed for processing workflow + cpu_time: description: Cumulative CPU time used for processing workflow + pages_per_minute: + description: Number of pages processed per minute + diff --git a/ocrd_tool.schema.json b/ocrd_tool.schema.json index 6e722a4..2a42cb1 100644 --- a/ocrd_tool.schema.json +++ b/ocrd_tool.schema.json @@ -1 +1,275 @@ -{"type": "object", "description": "Schema for tools by OCR-D MP", "required": ["version", "git_url", "tools"], "additionalProperties": false, "properties": {"version": {"description": "Version of the tool, expressed as MAJOR.MINOR.PATCH.", "type": "string", "pattern": "^[0-9]+\\.[0-9]+\\.[0-9]+$"}, "git_url": {"description": "Github/Gitlab URL", "type": "string", "format": "url"}, "dockerhub": {"description": "DockerHub image", "type": "string"}, "tools": {"type": "object", "additionalProperties": false, "patternProperties": {"ocrd-.*": {"type": "object", "additionalProperties": false, "required": ["description", "steps", "executable", "categories", "input_file_grp"], "properties": {"executable": {"description": "The name of the CLI executable in $PATH", "type": "string"}, "input_file_grp": {"description": "Input fileGrp@USE this tool expects by default", "type": "array", "items": {"type": "string"}}, "output_file_grp": {"description": "Output fileGrp@USE this tool produces by default", "type": "array", "items": {"type": "string"}}, "parameters": {"description": "Object describing the parameters of a tool. Keys are parameter names, values sub-schemas.", "type": "object", "patternProperties": {".*": {"type": "object", "additionalProperties": false, "required": ["description", "type"], "properties": {"type": {"type": "string", "description": "Data type of this parameter", "enum": ["string", "number", "boolean", "object", "array"]}, "format": {"description": "Subtype, such as `float` for type `number` or `uri` for type `string`."}, "description": {"description": "Concise description of syntax and semantics of this parameter"}, "items": {"type": "object", "description": "describe the items of an array further"}, "minimum": {"type": "number", "description": "Minimum value for number parameters, including the minimum"}, "maximum": {"type": "number", "description": "Maximum value for number parameters, including the maximum"}, "exclusiveMinimum": {"type": "number", "description": "Minimum value for number parameters, excluding the minimum"}, "exclusiveMaximum": {"type": "number", "description": "Maximum value for number parameters, excluding the maximum"}, "multipleOf": {"type": "number", "description": "For number values, those values must be multiple of this number"}, "properties": {"type": "object", "description": "Describe the properties of an object value"}, "additionalProperties": {"type": "boolean", "description": "Whether an object value may contain properties not explicitly defined"}, "required": {"type": "boolean", "description": "Whether this parameter is required"}, "default": {"description": "Default value when not provided by the user"}, "enum": {"type": "array", "description": "List the allowed values if a fixed list."}, "content-type": {"type": "string", "default": "application/octet-stream", "description": "The media type of resources this processor expects for this parameter. Most processors use files for resources (e.g. 
`*.traineddata` for `ocrd-tesserocr-recognize`) while others use directories of files (e.g. `default` for `ocrd-eynollah-segment`). If a parameter requires directories, it must set `content-type` to `text/directory`.\n"}, "cacheable": {"type": "boolean", "description": "If parameter is reference to file: Whether the file should be cached, e.g. because it is large and won't change.", "default": false}}}}}, "description": {"description": "Concise description what the tool does"}, "categories": {"description": "Tools belong to this categories, representing modules within the OCR-D project structure", "type": "array", "items": {"type": "string", "enum": ["Image preprocessing", "Layout analysis", "Text recognition and optimization", "Model training", "Long-term preservation", "Quality assurance"]}}, "steps": {"description": "This tool can be used at these steps in the OCR-D functional model", "type": "array", "items": {"type": "string", "enum": ["preprocessing/characterization", "preprocessing/optimization", "preprocessing/optimization/cropping", "preprocessing/optimization/deskewing", "preprocessing/optimization/despeckling", "preprocessing/optimization/dewarping", "preprocessing/optimization/binarization", "preprocessing/optimization/grayscale_normalization", "recognition/text-recognition", "recognition/font-identification", "recognition/post-correction", "layout/segmentation", "layout/segmentation/text-nontext", "layout/segmentation/region", "layout/segmentation/line", "layout/segmentation/word", "layout/segmentation/classification", "layout/analysis"]}}, "resource_locations": {"type": "array", "description": "The locations in the filesystem this processor supports for resource lookup", "default": ["data", "cwd", "system", "module"], "items": {"type": "string", "enum": ["data", "cwd", "system", "module"]}}, "resources": {"type": "array", "description": "Resources for this processor", "items": {"type": "object", "additionalProperties": false, "required": ["url", "description", "name", "size"], "properties": {"url": {"type": "string", "description": "URLs of all components of this resource"}, "description": {"type": "string", "description": "A description of the resource"}, "name": {"type": "string", "description": "Name to store the resource as"}, "type": {"type": "string", "enum": ["file", "directory", "archive"], "default": "file", "description": "Type of the URL"}, "parameter_usage": {"type": "string", "description": "Defines how the parameter is to be used", "enum": ["as-is", "without-extension"], "default": "as-is"}, "path_in_archive": {"type": "string", "description": "if type is archive, the resource is at this location in the archive", "default": "."}, "version_range": {"type": "string", "description": "Range of supported versions, syntax like in PEP 440", "default": ">= 0.0.1"}, "size": {"type": "number", "description": "Size of the resource in bytes"}}}}}}}}}} \ No newline at end of file +{ + "type": "object", + "description": "Schema for tools by OCR-D MP", + "required": [ + "version", + "git_url", + "tools" + ], + "additionalProperties": false, + "properties": { + "version": { + "description": "Version of the tool, expressed as MAJOR.MINOR.PATCH.", + "type": "string", + "pattern": "^[0-9]+\\.[0-9]+\\.[0-9]+$" + }, + "git_url": { + "description": "Github/Gitlab URL", + "type": "string", + "format": "url" + }, + "dockerhub": { + "description": "DockerHub image", + "type": "string" + }, + "tools": { + "type": "object", + "additionalProperties": false, + "patternProperties": { + 
"ocrd-.*": { + "type": "object", + "additionalProperties": false, + "required": [ + "description", + "steps", + "executable", + "categories", + "input_file_grp" + ], + "properties": { + "executable": { + "description": "The name of the CLI executable in $PATH", + "type": "string" + }, + "input_file_grp": { + "description": "Input fileGrp@USE this tool expects by default", + "type": "array", + "items": { + "type": "string" + } + }, + "output_file_grp": { + "description": "Output fileGrp@USE this tool produces by default", + "type": "array", + "items": { + "type": "string" + } + }, + "parameters": { + "description": "Object describing the parameters of a tool. Keys are parameter names, values sub-schemas.", + "type": "object", + "default": {}, + "patternProperties": { + ".*": { + "type": "object", + "additionalProperties": false, + "required": [ + "description", + "type" + ], + "properties": { + "type": { + "type": "string", + "description": "Data type of this parameter", + "enum": [ + "string", + "number", + "boolean", + "object", + "array" + ] + }, + "format": { + "description": "Subtype, such as `float` for type `number` or `uri` for type `string`." + }, + "description": { + "description": "Concise description of syntax and semantics of this parameter" + }, + "items": { + "type": "object", + "description": "describe the items of an array further" + }, + "minimum": { + "type": "number", + "description": "Minimum value for number parameters, including the minimum" + }, + "maximum": { + "type": "number", + "description": "Maximum value for number parameters, including the maximum" + }, + "exclusiveMinimum": { + "type": "number", + "description": "Minimum value for number parameters, excluding the minimum" + }, + "exclusiveMaximum": { + "type": "number", + "description": "Maximum value for number parameters, excluding the maximum" + }, + "multipleOf": { + "type": "number", + "description": "For number values, those values must be multiple of this number" + }, + "properties": { + "type": "object", + "description": "Describe the properties of an object value" + }, + "additionalProperties": { + "type": "boolean", + "description": "Whether an object value may contain properties not explicitly defined" + }, + "required": { + "type": "boolean", + "description": "Whether this parameter is required" + }, + "default": { + "description": "Default value when not provided by the user" + }, + "enum": { + "type": "array", + "description": "List the allowed values if a fixed list." + }, + "content-type": { + "type": "string", + "default": "application/octet-stream", + "description": "The media type of resources this processor expects for this parameter. Most processors use files for resources (e.g. `*.traineddata` for `ocrd-tesserocr-recognize`) while others use directories of files (e.g. `default` for `ocrd-eynollah-segment`). If a parameter requires directories, it must set `content-type` to `text/directory`.\n" + }, + "cacheable": { + "type": "boolean", + "description": "If parameter is reference to file: Whether the file should be cached, e.g. 
because it is large and won't change.", + "default": false + } + } + } + } + }, + "description": { + "description": "Concise description what the tool does" + }, + "categories": { + "description": "Tools belong to this categories, representing modules within the OCR-D project structure", + "type": "array", + "items": { + "type": "string", + "enum": [ + "Image preprocessing", + "Layout analysis", + "Text recognition and optimization", + "Model training", + "Long-term preservation", + "Quality assurance" + ] + } + }, + "steps": { + "description": "This tool can be used at these steps in the OCR-D functional model", + "type": "array", + "items": { + "type": "string", + "enum": [ + "preprocessing/characterization", + "preprocessing/optimization", + "preprocessing/optimization/cropping", + "preprocessing/optimization/deskewing", + "preprocessing/optimization/despeckling", + "preprocessing/optimization/dewarping", + "preprocessing/optimization/binarization", + "preprocessing/optimization/grayscale_normalization", + "recognition/text-recognition", + "recognition/font-identification", + "recognition/post-correction", + "layout/segmentation", + "layout/segmentation/text-nontext", + "layout/segmentation/region", + "layout/segmentation/line", + "layout/segmentation/word", + "layout/segmentation/classification", + "layout/analysis" + ] + } + }, + "resource_locations": { + "type": "array", + "description": "The locations in the filesystem this processor supports for resource lookup", + "default": [ + "data", + "cwd", + "system", + "module" + ], + "items": { + "type": "string", + "enum": [ + "data", + "cwd", + "system", + "module" + ] + } + }, + "resources": { + "type": "array", + "description": "Resources for this processor", + "items": { + "type": "object", + "additionalProperties": false, + "required": [ + "url", + "description", + "name", + "size" + ], + "properties": { + "url": { + "type": "string", + "description": "URLs of all components of this resource" + }, + "description": { + "type": "string", + "description": "A description of the resource" + }, + "name": { + "type": "string", + "description": "Name to store the resource as" + }, + "type": { + "type": "string", + "enum": [ + "file", + "directory", + "archive" + ], + "default": "file", + "description": "Type of the URL" + }, + "parameter_usage": { + "type": "string", + "description": "Defines how the parameter is to be used", + "enum": [ + "as-is", + "without-extension" + ], + "default": "as-is" + }, + "path_in_archive": { + "type": "string", + "description": "if type is archive, the resource is at this location in the archive", + "default": "." + }, + "version_range": { + "type": "string", + "description": "Range of supported versions, syntax like in PEP 440", + "default": ">= 0.0.1" + }, + "size": { + "type": "number", + "description": "Size of the resource in bytes" + } + } + } + } + } + } + } + } + } +} \ No newline at end of file From 6cd0caf368c68000f4be8048fea06f11ff01362b Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Wed, 21 Sep 2022 19:02:11 +0200 Subject: [PATCH 03/56] add metrics to ocrd_eval.md --- ocrd_eval.md | 365 ++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 363 insertions(+), 2 deletions(-) diff --git a/ocrd_eval.md b/ocrd_eval.md index 3c9b72c..49478ca 100644 --- a/ocrd_eval.md +++ b/ocrd_eval.md @@ -9,7 +9,350 @@ solution works best for their use case. 
## Evaluation metrics - +The evaluation of the success (accuracy) of OCR is a complex task for which multiple methods and metrics are available. It aims to capture quality in different aspects, such as the recognition of text, but also the detection of layout, for which different methods and metrics are needed. + +Furthermore, the time and resources required for OCR processing also have to be captured. Here we describe the metrics that were selected for use in OCR-D, how exactly they are applied, and what was the motivation. + +### Scope of these Definitions + +At this stage (Q3 2022) these definitions serve as a basis of common understanding for the metrics used in the benchmarking presented in OCR-D QUIVER. Further implications for evaluation tools do not yet apply. + +### Text Evaluation + +The most important measure to assess the quality of OCR is the accuracy of the recognized text. The majority of metrics for this are based on the Levenshtein distance, an algorithm to compute the distance between two strings. In OCR, one of these strings is generally the Ground Truth text and the other the recognized text which is the result of an OCR. + +#### Levenshtein Distance + +Levenshtein distance between two strings `a` and `b` is the number of edit operations needed to turn `a` into `b`. Edit operations depend on the specific variant of the algorithm but for OCR, relevant operations are deletion, insertion and substitution. + +The Levenshtein distance forms the basis for the calculation of [CER/WER](https://pad.gwdg.de/#CERWER). + +General example: + +The Levenshtein distance between "Monday" and "Tuesday" is 4, because 4 edit operations are necessary to turn "Monday" into "Tuesday": + +* **M**onday --> **T**onday (substitution) +* T**o**nday --> T**u**nday (substitution) +* Tu**n**day --> Tu**e**day (substitution) +* Tueday --> Tue**s**day (insertion) + +OCR example: + +Given a Ground truth that reads `ſind` and the recognized text `fmd`. + +The Levenshtein distance between these texts is 4, because 4 edit operations are necessary to turn `fmd` into `ſind`: + +* `fmd` --> `ſmd` (substitution) +* `ſmd` --> `ſimd` (insertion) +* `ſimd` --> `ſind` (substitution) + + +#### CER and WER + +##### Characters + +A text consists of a set of characters that have a certain meaning. A character is a glyph that represents a word, a letter in a word, or a symbol. + +Examples: + +* the character `a` in the text `babst` represents the German letter `a` +* the character `&` represents the Latin abbreviation `etc.` +* the character `☿` represents an Astronomical symbol for the planet Mercury + +##### Character Error Rate (CER) + +The character error rate (CER) describes how many faulty characters the output of an OCR engine contains compaired to the Ground Truth text in relation to the text length. + +Errors fall into one of the following three categories: + +* **deletion**: a character that is present in the text has been deleted from the output. Example: +![](https://pad.gwdg.de/uploads/304cf855-3436-42b7-86af-87c16106f1ad.jpg) +This reads `Sonnenfinſterniſſe:`. The output contains `Sonnenfinſterniſſe`, deleting `:`. + +* **substitution**: a character is replaced by another character in the output. Example: +![](https://pad.gwdg.de/uploads/d7fa6f23-7c79-4fb2-ad94-7e98084c69d6.jpg) + +This heading reads `Die Finſterniſſe des 1801ſten Jahrs`. The output contains `180iſten`, replacing `1` with `i`. + +* **insertion**: a new character is introduced in the output. 
Example: +![](https://pad.gwdg.de/uploads/e6b6432e-d79c-4568-9aef-15a026c05b39.jpg) +This reads `diese Strahlen, und`. The output contains `Strahlen ,`, inserting a white space before the comma. + + +CER can be calculated in several ways, depending on whether a normalized CER is used or not. + +Given $i$ as the number of insertions, $d$ the number of deletions, $s$ the number of substitutions and $n$ the total number of characters in a text, the CER can be obtained by + +$CER = \frac{i + s+ d}{n}$ + +If the CER value is calculated this way, it represents the percentage of characters incorrectly recognized by the OCR engine. Also, we can easily reach error rates beyond 100% when the output contains a lot of insertions. + +The *normalized* CER tries to mitigate this effect by considering the number of correct characters, $c$: + +$CER_n = \frac{i + s+ d}{i + s + d + c}$ + +In OCR-D's benchmarking we calculate the *non-normalized* CER where values over 1 should be read as 100%. + + +###### CER Granularity + +In OCR-D we distinguish between the CER per **page** and the **overall** CER of a text. The reasoning behind this is that the material OCR-D mainly aims at (historical prints) is very heterogeneous: Some pages might have an almost simplistic layout while others can be highly complex and difficult to process. Providing only an overall CER would cloud these differences between pages. + +At this point we only provide a CER per page; an overall CER might be calculated as a weighted aggregate at a later stage. + +##### Word Error Rate (WER) + +The word error rate (WER) is closely connected to the CER. While the CER focusses on differences between characters, the WER represents the percentage of words incorrectly recognized in a text. + +CER and WER share categories of errors, and the WER is similarly calculated: + +$WER = \frac{i_w + s_w + d_w}{n_w}$ + +where $i_w$ is the number of inserted, $s_w$ the number of substituted, $d_w$ the number of deleted and $n_w$ the total number of words. + +More specific cases of WER consider only the "significant" words, omitting e.g. stopwords from the calculation. + + +###### WER Granularity + +In OCR-D we distinguish between the WER per **page** and the **overall** WER of a text. The reasoning here follows the one of CER granularity. + +At this point we only provide a WER per page; an overall WER might be calculated at a later stage. + + +#### Bag of Words + +In the "Bag of Words" model a text is represented as a set of its word irregardless of word order or grammar; Only the words themselves and their number of occurence are considered. + +Example: + +![](https://pad.gwdg.de/uploads/4d33b422-6c77-436c-a3e6-bf27e67dc203.jpg) + + +> Eine Mondfinsternis ist die Himmelsbegebenheit welche sich zur Zeit des Vollmondes ereignet, wenn die Erde zwischen der Sonne und dem Monde steht, so daß die Strahlen der Sonne von der Erde aufgehalten werden, und daß man so den Schatten der Erde in dem Monde siehet. In diesem Jahre sind zwey Monfinsternisse, davon ist ebenfalls nur Eine bey uns sichtbar, und zwar am 30sten März des Morgens nach 4 Uhr, und währt bis nach 6 Uhr. 
+ +To get the Bag of Words of this paragraph a set containing each word and its number of occurence is created: + +$BoW$ = +```json= +{ + "Eine": 2, "Mondfinsternis": 1, "ist": 2, "die": 2, "Himmelsbegebenheit": 1, + "welche": 1, "sich": 1, "zur": 1, "Zeit": 1, "des": 2, "Vollmondes": 1, + "ereignet,": 1, "wenn":1, "Erde": 3, "zwischen": 1, "der": 4, "Sonne": 2, + "und": 4, "dem": 2, "Monde": 2, "steht,": 1, "so": 2, "daß": 2, + "Strahlen": 1, "von": 1, "aufgehalten": 1, "werden,": 1, "man": 1, "den": 1, + "Schatten": 1, "in": 1, "siehet.": 1, "In": 1, "diesem": 1, "Jahre": 1, + "sind": 1, "zwey": 1, "Monfinsternisse,": 1, "davon": 1, "ebenfalls": 1, "nur": 1, + "bey": 1, "uns": 1, "sichtbar,": 1, "zwar": 1, "am": 1, "30sten": 1, + "März": 1, "Morgens": 1, "nach": 2, "4": 1, "Uhr,": 1, "währt": 1, + "bis": 1, "6": 1, "Uhr.": 1 +} +``` + + +### Layout Evaluation + +For documents with a complex structure, looking at the recognized text's accuracy alone is often insufficient to accurately determine the quality of OCR. An example can help to illustrate this: in a document containing two columns, all characters and words may be recognized correctly, but when the two columns are detected by layout analysis as just one, the OCR result will contain the text for the first lines of the first and second column, followed by the second lines of the first and second column asf., rendering the sequence of words and paragraphs in the Ground Truth text wrongly, which defeats almost all downstream processes. + +While the comprehensive evaluation of OCR with consideration of layout analysis is still a research topic, several established metrics can be used to capture different aspects of it. + +#### Reading Order + +Reading order describes the order in which segments on a page are intended to be read. While the reading order might be easily obtained in monographs with a single column where only a few page segments exist, identifying the reading order in more complex layouts (e.g. newspapers or multi-column layouts) can be more challenging. + +Example of a simple page layout with reading order: + +![](https://pad.gwdg.de/uploads/bc5258cb-bf91-479e-8a91-abf5ff8bbbfa.jpg) +(http://resolver.sub.uni-goettingen.de/purl?PPN1726778096) + + +Example of a complex page layout with reading order: + +![](https://pad.gwdg.de/uploads/100f14c4-19b0-4810-b3e5-74c674575424.jpg) +(http://resolver.sub.uni-goettingen.de/purl?PPN1726778096) + + + +#### IoU (Intersection over Union) + +Intersection over Union is a term which describes the degree of overlap of two regions of a (document) image defined either by a bounding box or polygon. Example: + +![](https://pad.gwdg.de/uploads/62945a01-a7a7-48f3-86c2-6bb8f97d67fe.jpg) + +(where green represents the Ground Truth and red the detected bounding box) + +Given a region A with an area $area_1$, a region B with the area $area_2$, and their overlap (or intersection) $area_o$, the IoU can then be expressed as + +$IoU = \frac{area_o}{area_1+area_2-area_o}$ + +where $area_1+area_2-area_o$ expresses the union of the two regions ($area_1+area_2$) while not counting the overlapping area twice. + +The IoU ranges between 0 (no overlap at all) and 1 (the two regions overlap perfectly). Users executing object detection can choose a [threshold](#Threshold) that defines which degree of overlap must be given to define a prediction as correct. If e.g. a threshold of 0.6 is chosen, all prediction that have an IoU of 0.6 or higher are correct. 
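+
+The following minimal sketch illustrates the computation for two axis-aligned bounding boxes (illustrative only, not part of any OCR-D evaluation tool; the `bbox_iou` helper and the `(x_min, y_min, x_max, y_max)` pixel coordinates are assumptions for the example):
+
+```python
+def bbox_iou(box_a, box_b):
+    """IoU of two axis-aligned boxes given as (x_min, y_min, x_max, y_max)."""
+    # intersection rectangle (empty if the boxes do not overlap)
+    x_min, y_min = max(box_a[0], box_b[0]), max(box_a[1], box_b[1])
+    x_max, y_max = min(box_a[2], box_b[2]), min(box_a[3], box_b[3])
+    inter = max(0, x_max - x_min) * max(0, y_max - y_min)
+    area_a = (box_a[2] - box_a[0]) * (box_a[3] - box_a[1])
+    area_b = (box_b[2] - box_b[0]) * (box_b[3] - box_b[1])
+    union = area_a + area_b - inter
+    return inter / union if union else 0.0
+
+# Ground Truth region vs. detected region
+print(round(bbox_iou((10, 10, 110, 60), (30, 20, 130, 70)), 2))  # 0.47
+```
+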
+ +In OCR-D we use IoU to measure how well segments on a page are recognized during the segmentation step. The area of one region represents the area identified in the Ground Truth, while the second region represents the area identified by an OCR-D processor. + +### Resource Utilization + +Last but not least, it is important to collect information about the resource utilization of each processing step, so that informed decisions can be made when e.g. having to decide between results quality and throughput speed. + +#### CPU Time + +CPU time is the time taken by the CPU to process an instruction. It does not include idle time. + +#### Wall Time + +Wall time (or elapsed time) is the time taken by a processor to process an instruction including idle time. + +#### I/O + +I/O (input / output) is the number of bytes read and written during a process. + +#### Memory Usage + +Memory usage is the number of bytes the process allocates in memory (RAM). + +#### Disk Usage + +Disk usage is the number of bytes the process allocates on hard disk. + +### Unicode normalization + +In Unicode there can be multiple ways to express characters that have multiple components, such as a base letter and an accent. For evaluation it is essential that both Ground Truth and OCR results are normalized *in the same way* before evaluation. + +For example, the letter `ä` can be expressed directly as `ä` (`U+00E4` in Unicode) or as a combination of `a` and `◌̈` (`U+0061 + U+0308`). Both encodings are semantically equivalent but technically different. + +Unicode has the notion of *normalization forms* to provide canonically normalized text. The most common forms are *NFC* (Normalization Form Canonical Composed) and *NFD* (Normalization Form Canonical Decomposed). When a Unicode string is in NFC, all decomposed codepoints are replaced with their decomposed equivalent (e.g. `U+0061 + U+0308` to `U+00E4`). In an NFD encoding, all decomposed codepoints are replaced with their composed equivalents (e.g. `U+00E4` to `U+0061 + U+0308`). + + + +In accordance with the concept of [GT levels in OCR-D](https://ocr-d.de/en/gt-guidelines/trans/trLevels.html), it is preferable for strings to be normalized as NFC. + +The Unicode normalization algorithms rely on data from the Unicode database on equivalence classes and other script- and language-related metadata. For graphemes from the Private Use Area (PUA), such as MUFI, this information is not readily available and can lead to inconsistent normalization. Therefore, it is essential that evaluation tools normalize PUA codepoints in addition to canonical Unicode normalization. + + + +### Metrics Not in Use Yet + +:::info +The following metrics are not part of the MVP (minimal viable product) and will (if ever) be implemented at a later stage. +::: + +#### GPU metrics + +##### GPU time + +GPU time is the time a GPU (graphics card) spent processing instructions + +##### GPU avg memory + +GPU avg memory refers to the average amount of memory of the GPU (in GiB) that was used during processing. + +#### Text Evaluation + +##### Flexible Character Accuracy Measure + +The flexible character accuracy measure has been introduced to mitigate a major flaw of the CER: The CER is heavily dependent on the reading order an OCR engine detects; When content blocks are e.g. mixed up or merged during the text recognition step but single characters have been perfectly recognized, the CER is still very low. 
+ +The flexible character accuracy measure circumvents this effect by splitting the recognized text and the Ground Truth in smaller chunks and measure their partial edit distance. After all partial edit distances have been obtained, they are summed up to receive the overall character accuracy measure. + +The algorithm can be summarized as follows: + +> 1. Split the two input texts into text lines +> 2. Sort the ground truth text lines by length (in descending order) +> 3. For the first ground truth line, find the best matching OCR result line segment (by minimising a penalty that is partly based on string edit distance) +> 4. If full match (full length of line) +> a. Mark as done and remove line from list +> b. Else subdivide and add to respective list of text lines; resort +> 5. If any more lines available repeat step 3 +> 6. Count non-matched lines / strings as insertions or deletions (depending on origin: ground truth or result) +> 7. Sum up all partial edit distances and calculate overall character accuracy + +(C. Clausner, S. Pletschacher and A. Antonacopoulos / Pattern Recognition Letters 131 (2020) 390–397, p. 392) + +#### Layout Evalutation + +##### mAP (mean Average Precision) + +###### Precision and Recall + +**Precision** is a means to describe how accurate a model can identify an object within an image. The higher the precision of a model, the more confidently we can assume that a prediction (e.g. the model having identified a bicycle in an image) is correct. A precision of 1 indicates that each identified object in an image has been correctly identified (true positives) and no false positives have been detected. As the precision value descreases, the result contains more and more false positives. + +**Recall**, on the other hand, measures how well a model performs in finding all instances of an object in an image (true positives), irregardless of false positives. Given a model tries to identify bicycles in an image, a recall of 1 indicates that all bicycles have been found by the model (while not considering other objects that have been falsely labelled as a bicycle). + +###### Prediction Score + +When a model tries to identify objects in an image, it predicts that a certain area in an image represents said object with a certain confidence or prediction score. The prediction score varies between 0 and 1 and represents the percentage of certainty of having correctly identified an object. Given a model tries to identify ornaments on a page. If the model returns an area of a page with a prediction score of 0.6, the model is "60% sure" that this area is an ornament. If this area is then considered to be a positive, depends on the chosen threshold. + +###### Thresholds + +A threshold is a freely chosen number between 0 and 1. It divides the output of a model into two groups: Outputs that have a prediction score or IoU greater than or equal to the threshold represent an object. Outputs with a prediction score or IoU below the threshold are discarded as not representing the object. + +Example: +Given a threshold of 0.6 and a model that tries to detect bicycles in an image. The model returns two areas in an image that might be bicycles, one with a prediction score of 0.4 and one with 0.9. Since the threshold equals 0.6, the first area is tossed and not regarded as bicycle while the second one is kept and counted as recognized. + +###### Precision-Recall-Curve + +Precision and recall are connected to each other since both depend on the true positives detected. 
A precision-recall-curve is a means to balance these values while maximizing them. + +Given a dataset with 100 images in total of which 50 depict a bicycle. Also given a model trying to identify bicycles on images. The model is run 7 times using the given dataset while gradually increasing the threshold from 0.1 to 0.7. + + +| run | threshold | true positives | false positives | false negatives |precision | recall | +|-----|-----------|----------------|-----------------|-----------------|----------|--------| +| 1 | 0.1 | 50 | 25 | 0 | 0.66 | 1 | +| 2 | 0.2 | 45 | 20 | 5 | 0.69 | 0.9 | +| 3 | 0.3 | 40 | 15 | 10 | 0.73 | 0.8 | +| 4 | 0.4 | 35 | 5 | 15 | 0.88 | 0.7 | +| 5 | 0.5 | 30 | 3 | 20 | 0.91 | 0.6 | +| 6 | 0.6 | 20 | 0 | 30 | 1 | 0.4 | +| 7 | 0.7 | 10 | 0 | 40 | 1 | 0.2 | + +For each threshold a pair of precision and recall can be computed and plotted to a curve: + +![](https://pad.gwdg.de/uploads/2d3c62ff-cab4-4a12-8043-014fe0440459.png) + + +This graph is called Precision-Recall-Curve. + + +###### Average Precision + +The average precision (AP) describes how well a model can detect objects in an image for recall values over 0 to 1 by computing the average of all precisions given in the Precision-Recall-Curve. It is equal to the area under the curve. + +![](https://pad.gwdg.de/uploads/799e6a05-e64a-4956-9ede-440ac0463a3f.png) + +The Average Precision can be computed with the weighted mean of precision at each confidence threshold: + +$AP = \displaystyle\sum_{k=0}^{k=n-1}[r(k) - r(k+1)] * p(k)$ + +with $n$ being the number of thresholds and $r(k)$/$p(k)$ being the respective recall/precision values for the current confidence threshold $k$. + +Example: +Given the example above, we get: + +$$ +\begin{array}{2} +AP & = \displaystyle\sum_{k=0}^{k=n-1}[r(k) - r(k+1)] * p(k) \\ +& = \displaystyle\sum_{k=0}^{k=6}[r(k) - r(k+1)] * p(k) \\ +& = (1-0.9) * 0.66 + (0.9-0.8) * 0.69 + \text{...} + (0.2-0) * 1\\ +& = 0.878 +\end{array} +$$ + +###### mAP (mean Average Precision) + +The mean Average Precision is a metric used to measure how accurate an object detector is. [As stated](#Thresholds), a threshold can be chosen freely, so there is some room for errors when picking one single threshold. To mitigate this effect, the mean Average Precision metric has been introduced which considers a set of IoU thresholds to determine the detector's performance. It is calculated by first computing the Average Precision for each IoU threshold and then finding the average: + +$mAP = \displaystyle\frac{1}{N}\sum_{i=1}^{N}AP_i$ + +with $N$ being the number of thresholds. + + +##### Scenario-driven Performance Evaluation + +Scenario-driven performance evaluation as described in [Clausner et al., 2011](https://primaresearch.org/publications/ICDAR2011_Clausner_PerformanceEvaluation) is currently the most comprehensive and sophisticated approach to evaluate OCR success with consideration of layout. + +The approach is based on the definition of so called evaluation scenarios, which allow the flexible combination of a selection of metrics together with their weights, targeted at a specific use case. ## Evaluation JSON schema @@ -20,4 +363,22 @@ the [`ocrd-eval.json`](https://ocr-d.de/en/spec/ocrd-eval.schema.json). 
## Tools - +See [OCR-D workflow guide](https://ocr-d.de/en/workflows#evaluation) + +## References + +* CER/WER: + * https://sites.google.com/site/textdigitisation/qualitymeasures + * https://towardsdatascience.com/evaluating-ocr-output-quality-with-character-error-rate-cer-and-word-error-rate-wer-853175297510#5aec +* IoU: + * https://medium.com/analytics-vidhya/iou-intersection-over-union-705a39e7acef +* mAP: + * https://blog.paperspace.com/mean-average-precision/ + * https://jonathan-hui.medium.com/map-mean-average-precision-for-object-detection-45c121a31173 +* BoW: + * https://en.wikipedia.org/wiki/Bag-of-words_model +* FCA: + * https://www.primaresearch.org/www/assets/papers/PRL_Clausner_FlexibleCharacterAccuracy.pdf +* More background on evaluation of OCR + * https://doi.org/10.1145/3476887.3476888 + * https://doi.org/10.1515/9783110691597-009 From b5295313b6cf18f88c5e264ac7b6e885f06664ed Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Thu, 22 Sep 2022 15:43:23 +0200 Subject: [PATCH 04/56] ocrd_eval: \begin{array}{ll} instead of .. {2} --- ocrd_eval.md | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/ocrd_eval.md b/ocrd_eval.md index 49478ca..4b211da 100644 --- a/ocrd_eval.md +++ b/ocrd_eval.md @@ -331,7 +331,7 @@ Example: Given the example above, we get: $$ -\begin{array}{2} +\begin{array}{ll} AP & = \displaystyle\sum_{k=0}^{k=n-1}[r(k) - r(k+1)] * p(k) \\ & = \displaystyle\sum_{k=0}^{k=6}[r(k) - r(k+1)] * p(k) \\ & = (1-0.9) * 0.66 + (0.9-0.8) * 0.69 + \text{...} + (0.2-0) * 1\\ @@ -343,9 +343,7 @@ $$ The mean Average Precision is a metric used to measure how accurate an object detector is. [As stated](#Thresholds), a threshold can be chosen freely, so there is some room for errors when picking one single threshold. To mitigate this effect, the mean Average Precision metric has been introduced which considers a set of IoU thresholds to determine the detector's performance. It is calculated by first computing the Average Precision for each IoU threshold and then finding the average: -$mAP = \displaystyle\frac{1}{N}\sum_{i=1}^{N}AP_i$ - -with $N$ being the number of thresholds. +$mAP = \displaystyle\frac{1}{N}\sum_{i=1}^{N}AP_i$ with $N$ being the number of thresholds. ##### Scenario-driven Performance Evaluation From 18333b8ef2f96adf80eafd2cb5edb1c8f6889021 Mon Sep 17 00:00:00 2001 From: Michelle Weidling Date: Mon, 26 Sep 2022 11:36:47 +0200 Subject: [PATCH 05/56] style(ocrd_eval.md): linting, formatting and correcting images --- ocrd_eval.md | 97 +++++++++++++++++++++++++--------------------------- 1 file changed, 46 insertions(+), 51 deletions(-) diff --git a/ocrd_eval.md b/ocrd_eval.md index 4b211da..be2414a 100644 --- a/ocrd_eval.md +++ b/ocrd_eval.md @@ -9,9 +9,9 @@ solution works best for their use case. ## Evaluation metrics -The evaluation of the success (accuracy) of OCR is a complex task for which multiple methods and metrics are available. It aims to capture quality in different aspects, such as the recognition of text, but also the detection of layout, for which different methods and metrics are needed. +The evaluation of the success (accuracy) of OCR is a complex task for which multiple methods and metrics are available. It aims to capture quality in different aspects, such as the recognition of text, but also the detection of layout, for which different methods and metrics are needed. -Furthermore, the time and resources required for OCR processing also have to be captured. 
Here we describe the metrics that were selected for use in OCR-D, how exactly they are applied, and what was the motivation. +Furthermore, the time and resources required for OCR processing also have to be captured. Here we describe the metrics that were selected for use in OCR-D, how exactly they are applied, and what was the motivation. ### Scope of these Definitions @@ -27,7 +27,7 @@ Levenshtein distance between two strings `a` and `b` is the number of edit opera The Levenshtein distance forms the basis for the calculation of [CER/WER](https://pad.gwdg.de/#CERWER). -General example: +##### General example The Levenshtein distance between "Monday" and "Tuesday" is 4, because 4 edit operations are necessary to turn "Monday" into "Tuesday": @@ -36,7 +36,7 @@ The Levenshtein distance between "Monday" and "Tuesday" is 4, because 4 edit ope * Tu**n**day --> Tu**e**day (substitution) * Tueday --> Tue**s**day (insertion) -OCR example: +##### OCR example Given a Ground truth that reads `ſind` and the recognized text `fmd`. @@ -46,14 +46,13 @@ The Levenshtein distance between these texts is 4, because 4 edit operations are * `ſmd` --> `ſimd` (insertion) * `ſimd` --> `ſind` (substitution) - #### CER and WER ##### Characters A text consists of a set of characters that have a certain meaning. A character is a glyph that represents a word, a letter in a word, or a symbol. -Examples: +###### Examples * the character `a` in the text `babst` represents the German letter `a` * the character `&` represents the Latin abbreviation `etc.` @@ -65,19 +64,28 @@ The character error rate (CER) describes how many faulty characters the output o Errors fall into one of the following three categories: -* **deletion**: a character that is present in the text has been deleted from the output. Example: -![](https://pad.gwdg.de/uploads/304cf855-3436-42b7-86af-87c16106f1ad.jpg) +* **deletion**: a character that is present in the text has been deleted from the output. + +Example: +![A Fraktur sample reading "Sonnenfinſterniſſe:"](https://pad.gwdg.de/uploads/d7fa6f23-7c79-4fb2-ad94-7e98084c69d6.jpg) + This reads `Sonnenfinſterniſſe:`. The output contains `Sonnenfinſterniſſe`, deleting `:`. -* **substitution**: a character is replaced by another character in the output. Example: -![](https://pad.gwdg.de/uploads/d7fa6f23-7c79-4fb2-ad94-7e98084c69d6.jpg) +* **substitution**: a character is replaced by another character in the output. + +Example: + +![A Fraktur sample reading "Die Finſterniſſe des 1801ſten Jahrs"](https://pad.gwdg.de/uploads/b894049b-8d98-4fe7-ac31-71b2c9393a6c.jpg) This heading reads `Die Finſterniſſe des 1801ſten Jahrs`. The output contains `180iſten`, replacing `1` with `i`. -* **insertion**: a new character is introduced in the output. Example: -![](https://pad.gwdg.de/uploads/e6b6432e-d79c-4568-9aef-15a026c05b39.jpg) -This reads `diese Strahlen, und`. The output contains `Strahlen ,`, inserting a white space before the comma. +* **insertion**: a new character is introduced in the output. + +Example: +![A Fraktur sample reading "diese Strahlen, und"](https://pad.gwdg.de/uploads/e6b6432e-d79c-4568-9aef-15a026c05b39.jpg) + +This reads `diese Strahlen, und`. The output contains `Strahlen ,`, inserting a white space before the comma. CER can be calculated in several ways, depending on whether a normalized CER is used or not. @@ -93,7 +101,6 @@ $CER_n = \frac{i + s+ d}{i + s + d + c}$ In OCR-D's benchmarking we calculate the *non-normalized* CER where values over 1 should be read as 100%. 
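For illustration only, the relation between the Levenshtein distance and the CER defined above can be sketched in a few lines of Python. This is not part of the OCR-D evaluation tooling and the function names are made up for this example; it simply uses the standard dynamic-programming formulation of the edit distance and divides the result (i.e. $i + s + d$) by the number of characters in the Ground Truth:

```python
def levenshtein(gt: str, ocr: str) -> int:
    """Minimum number of single-character insertions, deletions and
    substitutions needed to turn one string into the other."""
    prev = list(range(len(ocr) + 1))
    for i, g in enumerate(gt, start=1):
        curr = [i]
        for j, o in enumerate(ocr, start=1):
            curr.append(min(
                prev[j] + 1,             # deletion
                curr[j - 1] + 1,         # insertion
                prev[j - 1] + (g != o),  # substitution (0 if characters match)
            ))
        prev = curr
    return prev[-1]

def cer(gt: str, ocr: str) -> float:
    """Non-normalized CER: number of edit operations divided by the number of GT characters."""
    return levenshtein(gt, ocr) / len(gt)

print(levenshtein("Monday", "Tuesday"))  # 4, as in the general example above
print(round(cer("Sonnenfinſterniſſe:", "Sonnenfinſterniſſe"), 3))  # one deletion in 19 characters, ca. 0.053
```
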
- ###### CER Granularity In OCR-D we distinguish between the CER per **page** and the **overall** CER of a text. The reasoning behind this is that the material OCR-D mainly aims at (historical prints) is very heterogeneous: Some pages might have an almost simplistic layout while others can be highly complex and difficult to process. Providing only an overall CER would cloud these differences between pages. @@ -112,28 +119,26 @@ where $i_w$ is the number of inserted, $s_w$ the number of substituted, $d_w$ th More specific cases of WER consider only the "significant" words, omitting e.g. stopwords from the calculation. - ###### WER Granularity In OCR-D we distinguish between the WER per **page** and the **overall** WER of a text. The reasoning here follows the one of CER granularity. At this point we only provide a WER per page; an overall WER might be calculated at a later stage. - #### Bag of Words In the "Bag of Words" model a text is represented as a set of its word irregardless of word order or grammar; Only the words themselves and their number of occurence are considered. Example: -![](https://pad.gwdg.de/uploads/4d33b422-6c77-436c-a3e6-bf27e67dc203.jpg) - +![A sample paragraph in German Fraktur](https://pad.gwdg.de/uploads/4d33b422-6c77-436c-a3e6-bf27e67dc203.jpg) > Eine Mondfinsternis ist die Himmelsbegebenheit welche sich zur Zeit des Vollmondes ereignet, wenn die Erde zwischen der Sonne und dem Monde steht, so daß die Strahlen der Sonne von der Erde aufgehalten werden, und daß man so den Schatten der Erde in dem Monde siehet. In diesem Jahre sind zwey Monfinsternisse, davon ist ebenfalls nur Eine bey uns sichtbar, und zwar am 30sten März des Morgens nach 4 Uhr, und währt bis nach 6 Uhr. To get the Bag of Words of this paragraph a set containing each word and its number of occurence is created: -$BoW$ = +$BoW$ = + ```json= { "Eine": 2, "Mondfinsternis": 1, "ist": 2, "die": 2, "Himmelsbegebenheit": 1, @@ -149,12 +154,11 @@ $BoW$ = } ``` - ### Layout Evaluation -For documents with a complex structure, looking at the recognized text's accuracy alone is often insufficient to accurately determine the quality of OCR. An example can help to illustrate this: in a document containing two columns, all characters and words may be recognized correctly, but when the two columns are detected by layout analysis as just one, the OCR result will contain the text for the first lines of the first and second column, followed by the second lines of the first and second column asf., rendering the sequence of words and paragraphs in the Ground Truth text wrongly, which defeats almost all downstream processes. +For documents with a complex structure, looking at the recognized text's accuracy alone is often insufficient to accurately determine the quality of OCR. An example can help to illustrate this: in a document containing two columns, all characters and words may be recognized correctly, but when the two columns are detected by layout analysis as just one, the OCR result will contain the text for the first lines of the first and second column, followed by the second lines of the first and second column asf., rendering the sequence of words and paragraphs in the Ground Truth text wrongly, which defeats almost all downstream processes. -While the comprehensive evaluation of OCR with consideration of layout analysis is still a research topic, several established metrics can be used to capture different aspects of it. 
+While the comprehensive evaluation of OCR with consideration of layout analysis is still a research topic, several established metrics can be used to capture different aspects of it. #### Reading Order @@ -162,22 +166,19 @@ Reading order describes the order in which segments on a page are intended to be Example of a simple page layout with reading order: -![](https://pad.gwdg.de/uploads/bc5258cb-bf91-479e-8a91-abf5ff8bbbfa.jpg) -(http://resolver.sub.uni-goettingen.de/purl?PPN1726778096) - +![A sample page in German Fraktur with a simple page layout showing the intended reading order](https://pad.gwdg.de/uploads/bc5258cb-bf91-479e-8a91-abf5ff8bbbfa.jpg) +() Example of a complex page layout with reading order: -![](https://pad.gwdg.de/uploads/100f14c4-19b0-4810-b3e5-74c674575424.jpg) -(http://resolver.sub.uni-goettingen.de/purl?PPN1726778096) - - +![A sample page in German Fraktur with a complex page layout showing the intended reading order](https://pad.gwdg.de/uploads/100f14c4-19b0-4810-b3e5-74c674575424.jpg) +() #### IoU (Intersection over Union) Intersection over Union is a term which describes the degree of overlap of two regions of a (document) image defined either by a bounding box or polygon. Example: -![](https://pad.gwdg.de/uploads/62945a01-a7a7-48f3-86c2-6bb8f97d67fe.jpg) +![A sample heading in German Fraktur illustrating a Ground Truth bounding box and a detected bounding box](https://pad.gwdg.de/uploads/62945a01-a7a7-48f3-86c2-6bb8f97d67fe.jpg) (where green represents the Ground Truth and red the detected bounding box) @@ -219,7 +220,7 @@ Disk usage is the number of bytes the process allocates on hard disk. In Unicode there can be multiple ways to express characters that have multiple components, such as a base letter and an accent. For evaluation it is essential that both Ground Truth and OCR results are normalized *in the same way* before evaluation. -For example, the letter `ä` can be expressed directly as `ä` (`U+00E4` in Unicode) or as a combination of `a` and `◌̈` (`U+0061 + U+0308`). Both encodings are semantically equivalent but technically different. +For example, the letter `ä` can be expressed directly as `ä` (`U+00E4` in Unicode) or as a combination of `a` and `◌̈` (`U+0061 + U+0308`). Both encodings are semantically equivalent but technically different. Unicode has the notion of *normalization forms* to provide canonically normalized text. The most common forms are *NFC* (Normalization Form Canonical Composed) and *NFD* (Normalization Form Canonical Decomposed). When a Unicode string is in NFC, all decomposed codepoints are replaced with their decomposed equivalent (e.g. `U+0061 + U+0308` to `U+00E4`). In an NFD encoding, all decomposed codepoints are replaced with their composed equivalents (e.g. `U+00E4` to `U+0061 + U+0308`). @@ -233,9 +234,7 @@ The Unicode normalization algorithms rely on data from the Unicode database on e ### Metrics Not in Use Yet -:::info The following metrics are not part of the MVP (minimal viable product) and will (if ever) be implemented at a later stage. -::: #### GPU metrics @@ -296,7 +295,6 @@ Precision and recall are connected to each other since both depend on the true p Given a dataset with 100 images in total of which 50 depict a bicycle. Also given a model trying to identify bicycles on images. The model is run 7 times using the given dataset while gradually increasing the threshold from 0.1 to 0.7. 
- | run | threshold | true positives | false positives | false negatives |precision | recall | |-----|-----------|----------------|-----------------|-----------------|----------|--------| | 1 | 0.1 | 50 | 25 | 0 | 0.66 | 1 | @@ -309,17 +307,15 @@ Given a dataset with 100 images in total of which 50 depict a bicycle. Also give For each threshold a pair of precision and recall can be computed and plotted to a curve: -![](https://pad.gwdg.de/uploads/2d3c62ff-cab4-4a12-8043-014fe0440459.png) - +![A sample precision/recall curve](https://pad.gwdg.de/uploads/2d3c62ff-cab4-4a12-8043-014fe0440459.png) This graph is called Precision-Recall-Curve. - ###### Average Precision The average precision (AP) describes how well a model can detect objects in an image for recall values over 0 to 1 by computing the average of all precisions given in the Precision-Recall-Curve. It is equal to the area under the curve. -![](https://pad.gwdg.de/uploads/799e6a05-e64a-4956-9ede-440ac0463a3f.png) +![A sample precision/recall curve with highlighted area under curve](https://pad.gwdg.de/uploads/799e6a05-e64a-4956-9ede-440ac0463a3f.png) The Average Precision can be computed with the weighted mean of precision at each confidence threshold: @@ -345,12 +341,11 @@ The mean Average Precision is a metric used to measure how accurate an object de $mAP = \displaystyle\frac{1}{N}\sum_{i=1}^{N}AP_i$ with $N$ being the number of thresholds. - ##### Scenario-driven Performance Evaluation -Scenario-driven performance evaluation as described in [Clausner et al., 2011](https://primaresearch.org/publications/ICDAR2011_Clausner_PerformanceEvaluation) is currently the most comprehensive and sophisticated approach to evaluate OCR success with consideration of layout. +Scenario-driven performance evaluation as described in [Clausner et al., 2011](https://primaresearch.org/publications/ICDAR2011_Clausner_PerformanceEvaluation) is currently the most comprehensive and sophisticated approach to evaluate OCR success with consideration of layout. -The approach is based on the definition of so called evaluation scenarios, which allow the flexible combination of a selection of metrics together with their weights, targeted at a specific use case. +The approach is based on the definition of so called evaluation scenarios, which allow the flexible combination of a selection of metrics together with their weights, targeted at a specific use case. ## Evaluation JSON schema @@ -361,22 +356,22 @@ the [`ocrd-eval.json`](https://ocr-d.de/en/spec/ocrd-eval.schema.json). ## Tools -See [OCR-D workflow guide](https://ocr-d.de/en/workflows#evaluation) +See [OCR-D workflow guide](https://ocr-d.de/en/workflows#evaluation). 
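As a purely illustrative aside (not one of the tools referenced above), the Average Precision sum from the worked example in the mAP section can be written as a short Python sketch; the function names are chosen for this example only:

```python
def average_precision(precisions, recalls):
    """AP as the weighted mean of the precisions, each weighted by the drop
    in recall to the next threshold (recalls sorted in descending order,
    with an implicit final recall of 0)."""
    recalls = list(recalls) + [0.0]
    return sum((recalls[k] - recalls[k + 1]) * precisions[k]
               for k in range(len(precisions)))

def mean_average_precision(aps):
    """mAP: the arithmetic mean of the AP values obtained for the individual IoU thresholds."""
    return sum(aps) / len(aps)

# Values taken from the precision/recall table above (thresholds 0.1 to 0.7)
precisions = [0.66, 0.69, 0.73, 0.88, 0.91, 1.0, 1.0]
recalls = [1.0, 0.9, 0.8, 0.7, 0.6, 0.4, 0.2]
print(round(average_precision(precisions, recalls), 3))  # 0.878, matching the example above
```
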
## References * CER/WER: - * https://sites.google.com/site/textdigitisation/qualitymeasures - * https://towardsdatascience.com/evaluating-ocr-output-quality-with-character-error-rate-cer-and-word-error-rate-wer-853175297510#5aec + * + * * IoU: - * https://medium.com/analytics-vidhya/iou-intersection-over-union-705a39e7acef + * * mAP: - * https://blog.paperspace.com/mean-average-precision/ - * https://jonathan-hui.medium.com/map-mean-average-precision-for-object-detection-45c121a31173 + * + * * BoW: - * https://en.wikipedia.org/wiki/Bag-of-words_model + * * FCA: - * https://www.primaresearch.org/www/assets/papers/PRL_Clausner_FlexibleCharacterAccuracy.pdf + * * More background on evaluation of OCR - * https://doi.org/10.1145/3476887.3476888 - * https://doi.org/10.1515/9783110691597-009 + * + * From fe9d6ffd2c6d4e6048a718027733007f3433add5 Mon Sep 17 00:00:00 2001 From: Michelle Weidling Date: Mon, 26 Sep 2022 12:14:03 +0200 Subject: [PATCH 06/56] stlye: add new line --- ocrd_eval.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ocrd_eval.md b/ocrd_eval.md index be2414a..bf87a06 100644 --- a/ocrd_eval.md +++ b/ocrd_eval.md @@ -167,11 +167,13 @@ Reading order describes the order in which segments on a page are intended to be Example of a simple page layout with reading order: ![A sample page in German Fraktur with a simple page layout showing the intended reading order](https://pad.gwdg.de/uploads/bc5258cb-bf91-479e-8a91-abf5ff8bbbfa.jpg) + () Example of a complex page layout with reading order: ![A sample page in German Fraktur with a complex page layout showing the intended reading order](https://pad.gwdg.de/uploads/100f14c4-19b0-4810-b3e5-74c674575424.jpg) + () #### IoU (Intersection over Union) From d7854a16201db54166ed9f23b74c555c54d1b19a Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Tue, 27 Sep 2022 11:59:49 +0200 Subject: [PATCH 07/56] Apply suggestions from code review Co-authored-by: mweidling <13831557+mweidling@users.noreply.github.com> --- ocrd_eval.schema.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/ocrd_eval.schema.yml b/ocrd_eval.schema.yml index fe1c655..3484960 100644 --- a/ocrd_eval.schema.yml +++ b/ocrd_eval.schema.yml @@ -91,7 +91,7 @@ $defs: eval_tool: type: string - description: Human readable name and version of evaluation tool used (for UI + description: Human readable name and version of evaluation tool used (for UI) document_metadata: type: object @@ -100,11 +100,11 @@ $defs: publication_year: type: number - description: Year he document was originally published + description: Year the document was originally published publication_century: type: string - description: Century he document was originally published + description: Century the document was originally published pattern: '[12][0-9]{3}-[12][0-9]{3}' publication_decade: @@ -124,7 +124,7 @@ $defs: type: array items: type: string - enum: ['antiqua', 'fraktur'] + enum: ['antiqua', 'fraktur', 'ancient_greek', 'hebrew'] provenance: type: object From ee67881540f86b36e404f32d0c7cdb356e6b7965 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Tue, 27 Sep 2022 12:03:06 +0200 Subject: [PATCH 08/56] Apply suggestions from code review Co-authored-by: mweidling <13831557+mweidling@users.noreply.github.com> --- ocrd_eval.schema.json | 12 +++++++----- ocrd_eval.schema.yml | 2 +- ocrd_tool.schema.json | 8 ++++---- 3 files changed, 12 insertions(+), 10 deletions(-) diff --git a/ocrd_eval.schema.json b/ocrd_eval.schema.json index 9fa0463..1f7af3a 100644 --- 
a/ocrd_eval.schema.json +++ b/ocrd_eval.schema.json @@ -116,7 +116,7 @@ }, "eval_tool": { "type": "string", - "description": "Human readable name and version of evaluation tool used (for UI" + "description": "Human readable name and version of evaluation tool used (for UI)" }, "document_metadata": { "type": "object", @@ -124,11 +124,11 @@ "properties": { "publication_year": { "type": "number", - "description": "Year he document was originally published" + "description": "Year the document was originally published" }, "publication_century": { "type": "string", - "description": "Century he document was originally published", + "description": "Century the document was originally published", "pattern": "[12][0-9]{3}-[12][0-9]{3}" }, "publication_decade": { @@ -153,7 +153,9 @@ "type": "string", "enum": [ "antiqua", - "fraktur" + "fraktur", + "ancient_greek", + "hebrew" ] } } @@ -163,7 +165,7 @@ "type": "object", "description": "Information on which tools in which version were used in determining metrics", "properties": { - "paramters": { + "parameters": { "type": "object", "description": "Parameters passed to the evaluation processor" } diff --git a/ocrd_eval.schema.yml b/ocrd_eval.schema.yml index 3484960..3fc8b43 100644 --- a/ocrd_eval.schema.yml +++ b/ocrd_eval.schema.yml @@ -130,7 +130,7 @@ $defs: type: object description: Information on which tools in which version were used in determining metrics properties: - paramters: + parameters: type: object description: Parameters passed to the evaluation processor diff --git a/ocrd_tool.schema.json b/ocrd_tool.schema.json index 2a42cb1..10aef43 100644 --- a/ocrd_tool.schema.json +++ b/ocrd_tool.schema.json @@ -14,7 +14,7 @@ "pattern": "^[0-9]+\\.[0-9]+\\.[0-9]+$" }, "git_url": { - "description": "Github/Gitlab URL", + "description": "GitHub/GitLab URL", "type": "string", "format": "url" }, @@ -143,10 +143,10 @@ } }, "description": { - "description": "Concise description what the tool does" + "description": "Concise description of what the tool does" }, "categories": { - "description": "Tools belong to this categories, representing modules within the OCR-D project structure", + "description": "Tools belong to these categories, representing modules within the OCR-D project structure", "type": "array", "items": { "type": "string", @@ -252,7 +252,7 @@ }, "path_in_archive": { "type": "string", - "description": "if type is archive, the resource is at this location in the archive", + "description": "If type is archive, the resource is at this location in the archive", "default": "." 
}, "version_range": { From 5b35358a862829bd1e362a8fd23b9c1e730269d9 Mon Sep 17 00:00:00 2001 From: mweidling <13831557+mweidling@users.noreply.github.com> Date: Tue, 27 Sep 2022 12:04:21 +0200 Subject: [PATCH 09/56] retcon JSON changes to YAML --- bagit-profile.json | 50 +++++++++++++++++++++++++++++++++++++++++++- ocrd_tool.md | 1 + ocrd_tool.schema.yml | 8 +++---- 3 files changed, 54 insertions(+), 5 deletions(-) diff --git a/bagit-profile.json b/bagit-profile.json index 805bbb4..ca0d5b9 100644 --- a/bagit-profile.json +++ b/bagit-profile.json @@ -1 +1,49 @@ -{"BagIt-Profile-Info":{"BagIt-Profile-Identifier":"https://ocr-d.de/en/spec/bagit-profile.json","BagIt-Profile-Version":"1.2.0","Source-Organization":"OCR-D","External-Description":"BagIt profile for OCR data","Contact-Name":"Konstantin Baierer","Contact-Email":"konstantin.baierer@sbb.spk-berlin.de","Version":0.1},"Bag-Info":{"Bagging-Date":{"required":false},"Source-Organization":{"required":false},"Ocrd-Mets":{"required":false,"default":"mets.xml"},"Ocrd-Identifier":{"required":true},"Ocrd-Checksum":{"required":false,"default":"cf83e1357eefb8bdf1542850d66d8007d620e4050b5715dc83f4a921d36ce9ce47d0d13c5d85f2b0ff8318d2877eec2f63b931bd47417a81a538327af927da3e"}},"Manifests-Required":["sha512"],"Tag-Manifests-Required":[],"Tag-Files-Required":[],"Tag-Files-Allowed":["README.md","Makefile","build.sh","sources.csv","metadata/*.xml","metadata/*.txt"],"Allow-Fetch.txt":false,"Serialization":"required","Accept-Serialization":"application/zip","Accept-BagIt-Version":["1.0"]} \ No newline at end of file +{ + "BagIt-Profile-Info": { + "BagIt-Profile-Identifier": "https://ocr-d.de/en/spec/bagit-profile.json", + "BagIt-Profile-Version": "1.2.0", + "Source-Organization": "OCR-D", + "External-Description": "BagIt profile for OCR data", + "Contact-Name": "Konstantin Baierer", + "Contact-Email": "konstantin.baierer@sbb.spk-berlin.de", + "Version": 0.1 + }, + "Bag-Info": { + "Bagging-Date": { + "required": false + }, + "Source-Organization": { + "required": false + }, + "Ocrd-Mets": { + "required": false, + "default": "mets.xml" + }, + "Ocrd-Identifier": { + "required": true + }, + "Ocrd-Checksum": { + "required": false, + "default": "cf83e1357eefb8bdf1542850d66d8007d620e4050b5715dc83f4a921d36ce9ce47d0d13c5d85f2b0ff8318d2877eec2f63b931bd47417a81a538327af927da3e" + } + }, + "Manifests-Required": [ + "sha512" + ], + "Tag-Manifests-Required": [], + "Tag-Files-Required": [], + "Tag-Files-Allowed": [ + "README.md", + "Makefile", + "build.sh", + "sources.csv", + "metadata/*.xml", + "metadata/*.txt" + ], + "Allow-Fetch.txt": false, + "Serialization": "required", + "Accept-Serialization": "application/zip", + "Accept-BagIt-Version": [ + "1.0" + ] +} \ No newline at end of file diff --git a/ocrd_tool.md b/ocrd_tool.md index 2cf8219..d3a42e9 100644 --- a/ocrd_tool.md +++ b/ocrd_tool.md @@ -113,6 +113,7 @@ properties: parameters: description: Object describing the parameters of a tool. Keys are parameter names, values sub-schemas. type: object + default: {} patternProperties: ".*": type: object diff --git a/ocrd_tool.schema.yml b/ocrd_tool.schema.yml index 0153db5..32d00d6 100644 --- a/ocrd_tool.schema.yml +++ b/ocrd_tool.schema.yml @@ -11,7 +11,7 @@ properties: type: string pattern: '^[0-9]+\.[0-9]+\.[0-9]+$' git_url: - description: Github/Gitlab URL + description: GitHub/GitLab URL type: string format: url dockerhub: @@ -122,9 +122,9 @@ properties: description: "If parameter is reference to file: Whether the file should be cached, e.g. 
because it is large and won't change." default: false description: - description: Concise description what the tool does + description: Concise description of what the tool does categories: - description: Tools belong to this categories, representing modules within the OCR-D project structure + description: Tools belong to these categories, representing modules within the OCR-D project structure type: array items: type: string @@ -199,7 +199,7 @@ properties: default: 'as-is' path_in_archive: type: string - description: if type is archive, the resource is at this location in the archive + description: If type is archive, the resource is at this location in the archive default: '.' version_range: type: string From 1aa048cdae13a828317868d1acc38baf0337e0a5 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Tue, 27 Sep 2022 12:06:51 +0200 Subject: [PATCH 10/56] comment EvaluationMetrics back in --- ocrd_eval.schema.json | 10 +++++++++- ocrd_eval.schema.yml | 4 ++-- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/ocrd_eval.schema.json b/ocrd_eval.schema.json index 1f7af3a..7546d8a 100644 --- a/ocrd_eval.schema.json +++ b/ocrd_eval.schema.json @@ -180,7 +180,10 @@ "properties": { "document_wide": { "type": "object", - "description": "Document-wide metrics" + "description": "Document-wide metrics", + "properties": { + "$ref": "#$defs/EvaluationMetrics" + } }, "by_page": { "type": "array", @@ -195,6 +198,11 @@ "description": "PAGE ID" } } + }, + { + "properties": { + "$ref": "#$defs/EvaluationMetrics" + } } ] } diff --git a/ocrd_eval.schema.yml b/ocrd_eval.schema.yml index 3fc8b43..596bc90 100644 --- a/ocrd_eval.schema.yml +++ b/ocrd_eval.schema.yml @@ -142,7 +142,7 @@ $defs: document_wide: type: object description: Document-wide metrics - #properties: { $ref: '#$defs/EvaluationMetrics' } + properties: { $ref: '#$defs/EvaluationMetrics' } by_page: type: array description: Metrics page-by-page @@ -153,7 +153,7 @@ $defs: page_id: type: string description: PAGE ID - #- properties: { $ref: '#$defs/EvaluationMetrics' } + - properties: { $ref: '#$defs/EvaluationMetrics' } EvaluationMetrics: From 5840476b3bb705613974fe47e755180b1ab3740e Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Tue, 27 Sep 2022 12:12:11 +0200 Subject: [PATCH 11/56] generate minimal JSON from YAML src --- Makefile | 2 +- bagit-profile.json | 50 +------- ocrd_eval.sample.json | 121 +----------------- ocrd_eval.schema.json | 249 +------------------------------------ ocrd_tool.schema.json | 276 +----------------------------------------- 5 files changed, 5 insertions(+), 693 deletions(-) diff --git a/Makefile b/Makefile index 66f48f9..3eeb0e7 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ json: $(shell find -name '*.json') %.json: %.yml - python3 scripts/yaml-to-json.py $< $@ + python3 scripts/yaml-to-json.py --indent 0 $< $@ validate: json jsonschema --output pretty --validator Draft201909Validator --instance ocrd_eval.sample.json ocrd_eval.schema.json diff --git a/bagit-profile.json b/bagit-profile.json index ca0d5b9..a461ec0 100644 --- a/bagit-profile.json +++ b/bagit-profile.json @@ -1,49 +1 @@ -{ - "BagIt-Profile-Info": { - "BagIt-Profile-Identifier": "https://ocr-d.de/en/spec/bagit-profile.json", - "BagIt-Profile-Version": "1.2.0", - "Source-Organization": "OCR-D", - "External-Description": "BagIt profile for OCR data", - "Contact-Name": "Konstantin Baierer", - "Contact-Email": "konstantin.baierer@sbb.spk-berlin.de", - "Version": 0.1 - }, - "Bag-Info": { - "Bagging-Date": { - "required": false - }, 
- "Source-Organization": { - "required": false - }, - "Ocrd-Mets": { - "required": false, - "default": "mets.xml" - }, - "Ocrd-Identifier": { - "required": true - }, - "Ocrd-Checksum": { - "required": false, - "default": "cf83e1357eefb8bdf1542850d66d8007d620e4050b5715dc83f4a921d36ce9ce47d0d13c5d85f2b0ff8318d2877eec2f63b931bd47417a81a538327af927da3e" - } - }, - "Manifests-Required": [ - "sha512" - ], - "Tag-Manifests-Required": [], - "Tag-Files-Required": [], - "Tag-Files-Allowed": [ - "README.md", - "Makefile", - "build.sh", - "sources.csv", - "metadata/*.xml", - "metadata/*.txt" - ], - "Allow-Fetch.txt": false, - "Serialization": "required", - "Accept-Serialization": "application/zip", - "Accept-BagIt-Version": [ - "1.0" - ] -} \ No newline at end of file +{"BagIt-Profile-Info": {"BagIt-Profile-Identifier": "https://ocr-d.de/en/spec/bagit-profile.json", "BagIt-Profile-Version": "1.2.0", "Source-Organization": "OCR-D", "External-Description": "BagIt profile for OCR data", "Contact-Name": "Konstantin Baierer", "Contact-Email": "konstantin.baierer@sbb.spk-berlin.de", "Version": 0.1}, "Bag-Info": {"Bagging-Date": {"required": false}, "Source-Organization": {"required": false}, "Ocrd-Mets": {"required": false, "default": "mets.xml"}, "Ocrd-Identifier": {"required": true}, "Ocrd-Checksum": {"required": false, "default": "cf83e1357eefb8bdf1542850d66d8007d620e4050b5715dc83f4a921d36ce9ce47d0d13c5d85f2b0ff8318d2877eec2f63b931bd47417a81a538327af927da3e"}}, "Manifests-Required": ["sha512"], "Tag-Manifests-Required": [], "Tag-Files-Required": [], "Tag-Files-Allowed": ["README.md", "Makefile", "build.sh", "sources.csv", "metadata/*.xml", "metadata/*.txt"], "Allow-Fetch.txt": false, "Serialization": "required", "Accept-Serialization": "application/zip", "Accept-BagIt-Version": ["1.0"]} \ No newline at end of file diff --git a/ocrd_eval.sample.json b/ocrd_eval.sample.json index e1a540f..80de251 100644 --- a/ocrd_eval.sample.json +++ b/ocrd_eval.sample.json @@ -1,120 +1 @@ -[ - { - "@id": "https://github.com/OCR-D/quiver/tree/data/evaluations/wf1-data345-eval1.json", - "label": "OCR workflow 1 on workspace 345", - "metadata": { - "ocr_workflow": { - "@id": "https://github.com/OCR-D/quiver/tree/data/workflows/1.nf", - "label": "OCR Workflow 1" - }, - "eval_workflow": { - "@id": "https://github.com/OCR-D/quiver/tree/data/workflows/eval1.nf", - "label": "Evaluation Workflow 1" - }, - "gt_workspace": { - "@id": "https://gt.ocr-d.de/workspace/789", - "label": "GT workspace 789 (19th century fraktur)" - }, - "ocr_workspace": { - "@id": "https://github.com/OCR-D/quiver/tree/data/workspaces/3000.ocrd.zip", - "label": "OCR result workspace 3000" - }, - "eval_workspace": { - "@id": "https://github.com/OCR-D/quiver/tree/data/workspaces/345.ocrd.zip", - "label": "Evaluation Workspace 345" - }, - "workflow_steps": { - "0": "Processor A", - "1": "Processor B" - }, - "workflow_model": "Fraktur_GT4HistOCR", - "document_metadata": { - "fonts": [ - "antiqua", - "fraktur" - ], - "publication_century": "1800-1900", - "publication_decade": "1850-1860", - "publication_year": 1855, - "number_of_pages": 100, - "layout": "simple" - } - }, - "evaluation": { - "document_wide": { - "wall_time": 1234, - "cer": 0.57, - "cer_min_max": [ - 0.2, - 0.57 - ] - }, - "by_page": [ - { - "page_id": "PHYS_0001", - "cer": 0.8, - "processing_time": 2.1 - } - ] - } - }, - { - "@id": "https://github.com/OCR-D/quiver/tree/data/evaluations/wf2-data345-eval1.json", - "label": "OCR Workflow 2 on Data 345", - "metadata": { - "ocr_workflow": { - "@id": 
"https://github.com/OCR-D/quiver/tree/data/workflows/2.nf", - "label": "OCR Workflow 2" - }, - "eval_workflow": { - "@id": "https://github.com/OCR-D/quiver/tree/data/workflows/eval1.nf", - "label": "Evaluation Workflow 1" - }, - "gt_workspace": { - "@id": "https://gt.ocr-d.de/workspace/789", - "label": "GT workspace 789 (19th century fraktur)" - }, - "ocr_workspace": { - "@id": "https://github.com/OCR-D/quiver/tree/data/workspaces/3000.ocrd.zip", - "label": "OCR result workspace 3000" - }, - "eval_workspace": { - "@id": "https://github.com/OCR-D/quiver/tree/data/workspaces/345.ocrd.zip", - "label": "Evaluation Workspace 345" - }, - "workflow_steps": { - "0": "Processor A", - "1": "Processor B" - }, - "workflow_model": "Fraktur_GT4HistOCR", - "document_metadata": { - "fonts": [ - "antiqua", - "fraktur" - ], - "publication_century": "1800-1900", - "publication_decade": "1850-1860", - "publication_year": 1855, - "number_of_pages": 100, - "layout": "simple" - } - }, - "evaluation": { - "document_wide": { - "wall_time": 4567, - "cer": 0.9, - "cer_min_max": [ - 0.2, - 0.99 - ] - }, - "by_page": [ - { - "page_id": "PHYS_0001", - "cer": 0.9, - "processing_time": 2.1 - } - ] - } - } -] \ No newline at end of file +[{"@id": "https://github.com/OCR-D/quiver/tree/data/evaluations/wf1-data345-eval1.json", "label": "OCR workflow 1 on workspace 345", "metadata": {"ocr_workflow": {"@id": "https://github.com/OCR-D/quiver/tree/data/workflows/1.nf", "label": "OCR Workflow 1"}, "eval_workflow": {"@id": "https://github.com/OCR-D/quiver/tree/data/workflows/eval1.nf", "label": "Evaluation Workflow 1"}, "gt_workspace": {"@id": "https://gt.ocr-d.de/workspace/789", "label": "GT workspace 789 (19th century fraktur)"}, "ocr_workspace": {"@id": "https://github.com/OCR-D/quiver/tree/data/workspaces/3000.ocrd.zip", "label": "OCR result workspace 3000"}, "eval_workspace": {"@id": "https://github.com/OCR-D/quiver/tree/data/workspaces/345.ocrd.zip", "label": "Evaluation Workspace 345"}, "workflow_steps": {"0": "Processor A", "1": "Processor B"}, "workflow_model": "Fraktur_GT4HistOCR", "document_metadata": {"fonts": ["antiqua", "fraktur"], "publication_century": "1800-1900", "publication_decade": "1850-1860", "publication_year": 1855, "number_of_pages": 100, "layout": "simple"}}, "evaluation": {"document_wide": {"wall_time": 1234, "cer": 0.57, "cer_min_max": [0.2, 0.57]}, "by_page": [{"page_id": "PHYS_0001", "cer": 0.8, "processing_time": 2.1}]}}, {"@id": "https://github.com/OCR-D/quiver/tree/data/evaluations/wf2-data345-eval1.json", "label": "OCR Workflow 2 on Data 345", "metadata": {"ocr_workflow": {"@id": "https://github.com/OCR-D/quiver/tree/data/workflows/2.nf", "label": "OCR Workflow 2"}, "eval_workflow": {"@id": "https://github.com/OCR-D/quiver/tree/data/workflows/eval1.nf", "label": "Evaluation Workflow 1"}, "gt_workspace": {"@id": "https://gt.ocr-d.de/workspace/789", "label": "GT workspace 789 (19th century fraktur)"}, "ocr_workspace": {"@id": "https://github.com/OCR-D/quiver/tree/data/workspaces/3000.ocrd.zip", "label": "OCR result workspace 3000"}, "eval_workspace": {"@id": "https://github.com/OCR-D/quiver/tree/data/workspaces/345.ocrd.zip", "label": "Evaluation Workspace 345"}, "workflow_steps": {"0": "Processor A", "1": "Processor B"}, "workflow_model": "Fraktur_GT4HistOCR", "document_metadata": {"fonts": ["antiqua", "fraktur"], "publication_century": "1800-1900", "publication_decade": "1850-1860", "publication_year": 1855, "number_of_pages": 100, "layout": "simple"}}, "evaluation": {"document_wide": 
{"wall_time": 4567, "cer": 0.9, "cer_min_max": [0.2, 0.99]}, "by_page": [{"page_id": "PHYS_0001", "cer": 0.9, "processing_time": 2.1}]}}] \ No newline at end of file diff --git a/ocrd_eval.schema.json b/ocrd_eval.schema.json index 7546d8a..c903137 100644 --- a/ocrd_eval.schema.json +++ b/ocrd_eval.schema.json @@ -1,248 +1 @@ -{ - "$schema": "https://json-schema.org/draft/2019-09/schema", - "$id": "https://ocr-d.de/en/spec/ocrd_eval.schema.json", - "title": "A list of evaluations for OCR-D", - "description": "- All references to URL are JSON-LD-like objects with at least an `@id`\n property referencing the URL and `label` for a human-readable label to be\n used in the UI\n", - "type": "array", - "items": { - "required": [ - "@id", - "label", - "metadata", - "evaluation" - ], - "unevaluatedProperties": false, - "allOf": [ - { - "$ref": "#/$defs/LabeledUrl" - }, - { - "properties": { - "metadata": { - "$ref": "#/$defs/EvaluationMetadata" - }, - "evaluation": { - "$ref": "#/$defs/EvaluationReport" - } - } - } - ] - }, - "$defs": { - "LabeledUrl": { - "type": "object", - "required": [ - "@id" - ], - "properties": { - "@id": { - "type": "string", - "format": "uri", - "description": "URL of the thing" - }, - "label": { - "type": "string", - "description": "Description of the thing for UI purposes" - } - } - }, - "EvaluationMetadata": { - "type": "object", - "title": "Metadata about one evaluation", - "additionalProperties": false, - "description": "EvaluationMetadata contains all the info on how an EvaluationReport came to be.\nThere are two OCR-D *workflows* involved:\n - ocr_workflow: The workflow which produced the OCR results to evaluate\n - eval_workflow: The workflow run to evaluate OCR and GT\n\nThere are three OCR-D *workspaces* involved:\n - gt_workspace: The workspace containing the GT\n - ocr_workspace: The workspace containing the OCR results from ocr_workflow\n - eval_workspace: The workspace on which the eval_workflow was run\n", - "required": [ - "ocr_workflow", - "ocr_workspace", - "eval_workflow", - "eval_workspace", - "gt_workspace", - "document_metadata" - ], - "properties": { - "ocr_workflow": { - "allOf": [ - { - "$ref": "#/$defs/LabeledUrl" - } - ], - "description": "The OCR-D workflow that produced the ocr_workspace" - }, - "ocr_workspace": { - "allOf": [ - { - "$ref": "#/$defs/LabeledUrl" - } - ], - "description": "The workspace containing the OCR" - }, - "eval_workflow": { - "allOf": [ - { - "$ref": "#/$defs/LabeledUrl" - } - ], - "description": "The OCR-D workflow that produced the eval_workspace" - }, - "eval_workspace": { - "allOf": [ - { - "$ref": "#/$defs/LabeledUrl" - } - ], - "description": "The workspace containing the evaluation results" - }, - "gt_workspace": { - "allOf": [ - { - "$ref": "#/$defs/LabeledUrl" - } - ], - "description": "The workspace containing the GT" - }, - "workflow_steps": { - "type": "object", - "description": "Human readable description of the individual steps in the workflow (for UI)", - "patternProperties": { - "^[0-9]+$": { - "type": "string", - "description": "Description of this workflow step" - } - } - }, - "workflow_model": { - "type": "string", - "description": "Human readable name of the main model used for recognition in the OCR workflow (for UI)" - }, - "eval_tool": { - "type": "string", - "description": "Human readable name and version of evaluation tool used (for UI)" - }, - "document_metadata": { - "type": "object", - "title": "Bibliographical and typographical metadata about the work to be evaluated", - "properties": { - 
"publication_year": { - "type": "number", - "description": "Year the document was originally published" - }, - "publication_century": { - "type": "string", - "description": "Century the document was originally published", - "pattern": "[12][0-9]{3}-[12][0-9]{3}" - }, - "publication_decade": { - "type": "string", - "description": "Decade the document was originally published", - "pattern": "[12][0-9]{3}-[12][0-9]{3}" - }, - "number_of_pages": { - "type": "number", - "description": "Number of pages in this work (i.e. the number of images in the gt_workspace)" - }, - "layout": { - "type": "string", - "enum": [ - "simple", - "complex" - ] - }, - "fonts": { - "type": "array", - "items": { - "type": "string", - "enum": [ - "antiqua", - "fraktur", - "ancient_greek", - "hebrew" - ] - } - } - } - }, - "provenance": { - "type": "object", - "description": "Information on which tools in which version were used in determining metrics", - "properties": { - "parameters": { - "type": "object", - "description": "Parameters passed to the evaluation processor" - } - } - } - } - }, - "EvaluationReport": { - "type": "object", - "additionalProperties": false, - "description": "The metrics measured for this document", - "properties": { - "document_wide": { - "type": "object", - "description": "Document-wide metrics", - "properties": { - "$ref": "#$defs/EvaluationMetrics" - } - }, - "by_page": { - "type": "array", - "description": "Metrics page-by-page", - "items": { - "type": "object", - "allOf": [ - { - "properties": { - "page_id": { - "type": "string", - "description": "PAGE ID" - } - } - }, - { - "properties": { - "$ref": "#$defs/EvaluationMetrics" - } - } - ] - } - } - } - }, - "EvaluationMetrics": { - "cer": { - "description": "CER calculated over the text of a whole page (in by_page) or combined text of all pages (in document_wide)" - }, - "cer_mean": { - "description": "Arithmetic mean of the page-wise CER (in document_wide) or regions on a page (in by_page)" - }, - "cer_median": { - "description": "Median of the page-wise CER (in document_wide) or regions on a page (in by_page)" - }, - "cer_range": { - "type": "array", - "minItems": 2, - "maxItems": 2, - "items": { - "type": "number", - "description": "Minimum and maximum of CER calculated over the text of a whole page (in by_page) or combined text of all pages (in document_wide)" - } - }, - "cer_standard_deviation": { - "description": "Standard deviation the page-wise CER (in document_wide) or regions on a page (in by_page)" - }, - "wer": { - "description": "CER calculated over the text of a whole page (in by_page) or combined text of all pages (in document_wide)" - }, - "wall_time": { - "description": "Actual time needed for processing workflow" - }, - "cpu_time": { - "description": "Cumulative CPU time used for processing workflow" - }, - "pages_per_minute": { - "description": "Number of pages processed per minute" - } - } - } -} \ No newline at end of file +{"$schema": "https://json-schema.org/draft/2019-09/schema", "$id": "https://ocr-d.de/en/spec/ocrd_eval.schema.json", "title": "A list of evaluations for OCR-D", "description": "- All references to URL are JSON-LD-like objects with at least an `@id`\n property referencing the URL and `label` for a human-readable label to be\n used in the UI\n", "type": "array", "items": {"required": ["@id", "label", "metadata", "evaluation"], "unevaluatedProperties": false, "allOf": [{"$ref": "#/$defs/LabeledUrl"}, {"properties": {"metadata": {"$ref": "#/$defs/EvaluationMetadata"}, "evaluation": {"$ref": 
"#/$defs/EvaluationReport"}}}]}, "$defs": {"LabeledUrl": {"type": "object", "required": ["@id"], "properties": {"@id": {"type": "string", "format": "uri", "description": "URL of the thing"}, "label": {"type": "string", "description": "Description of the thing for UI purposes"}}}, "EvaluationMetadata": {"type": "object", "title": "Metadata about one evaluation", "additionalProperties": false, "description": "EvaluationMetadata contains all the info on how an EvaluationReport came to be.\nThere are two OCR-D *workflows* involved:\n - ocr_workflow: The workflow which produced the OCR results to evaluate\n - eval_workflow: The workflow run to evaluate OCR and GT\n\nThere are three OCR-D *workspaces* involved:\n - gt_workspace: The workspace containing the GT\n - ocr_workspace: The workspace containing the OCR results from ocr_workflow\n - eval_workspace: The workspace on which the eval_workflow was run\n", "required": ["ocr_workflow", "ocr_workspace", "eval_workflow", "eval_workspace", "gt_workspace", "document_metadata"], "properties": {"ocr_workflow": {"allOf": [{"$ref": "#/$defs/LabeledUrl"}], "description": "The OCR-D workflow that produced the ocr_workspace"}, "ocr_workspace": {"allOf": [{"$ref": "#/$defs/LabeledUrl"}], "description": "The workspace containing the OCR"}, "eval_workflow": {"allOf": [{"$ref": "#/$defs/LabeledUrl"}], "description": "The OCR-D workflow that produced the eval_workspace"}, "eval_workspace": {"allOf": [{"$ref": "#/$defs/LabeledUrl"}], "description": "The workspace containing the evaluation results"}, "gt_workspace": {"allOf": [{"$ref": "#/$defs/LabeledUrl"}], "description": "The workspace containing the GT"}, "workflow_steps": {"type": "object", "description": "Human readable description of the individual steps in the workflow (for UI)", "patternProperties": {"^[0-9]+$": {"type": "string", "description": "Description of this workflow step"}}}, "workflow_model": {"type": "string", "description": "Human readable name of the main model used for recognition in the OCR workflow (for UI)"}, "eval_tool": {"type": "string", "description": "Human readable name and version of evaluation tool used (for UI)"}, "document_metadata": {"type": "object", "title": "Bibliographical and typographical metadata about the work to be evaluated", "properties": {"publication_year": {"type": "number", "description": "Year the document was originally published"}, "publication_century": {"type": "string", "description": "Century the document was originally published", "pattern": "[12][0-9]{3}-[12][0-9]{3}"}, "publication_decade": {"type": "string", "description": "Decade the document was originally published", "pattern": "[12][0-9]{3}-[12][0-9]{3}"}, "number_of_pages": {"type": "number", "description": "Number of pages in this work (i.e. 
the number of images in the gt_workspace)"}, "layout": {"type": "string", "enum": ["simple", "complex"]}, "fonts": {"type": "array", "items": {"type": "string", "enum": ["antiqua", "fraktur", "ancient_greek", "hebrew"]}}}}, "provenance": {"type": "object", "description": "Information on which tools in which version were used in determining metrics", "properties": {"parameters": {"type": "object", "description": "Parameters passed to the evaluation processor"}}}}}, "EvaluationReport": {"type": "object", "additionalProperties": false, "description": "The metrics measured for this document", "properties": {"document_wide": {"type": "object", "description": "Document-wide metrics", "properties": {"$ref": "#$defs/EvaluationMetrics"}}, "by_page": {"type": "array", "description": "Metrics page-by-page", "items": {"type": "object", "allOf": [{"properties": {"page_id": {"type": "string", "description": "PAGE ID"}}}, {"properties": {"$ref": "#$defs/EvaluationMetrics"}}]}}}}, "EvaluationMetrics": {"cer": {"description": "CER calculated over the text of a whole page (in by_page) or combined text of all pages (in document_wide)"}, "cer_mean": {"description": "Arithmetic mean of the page-wise CER (in document_wide) or regions on a page (in by_page)"}, "cer_median": {"description": "Median of the page-wise CER (in document_wide) or regions on a page (in by_page)"}, "cer_range": {"type": "array", "minItems": 2, "maxItems": 2, "items": {"type": "number", "description": "Minimum and maximum of CER calculated over the text of a whole page (in by_page) or combined text of all pages (in document_wide)"}}, "cer_standard_deviation": {"description": "Standard deviation the page-wise CER (in document_wide) or regions on a page (in by_page)"}, "wer": {"description": "CER calculated over the text of a whole page (in by_page) or combined text of all pages (in document_wide)"}, "wall_time": {"description": "Actual time needed for processing workflow"}, "cpu_time": {"description": "Cumulative CPU time used for processing workflow"}, "pages_per_minute": {"description": "Number of pages processed per minute"}}}} \ No newline at end of file diff --git a/ocrd_tool.schema.json b/ocrd_tool.schema.json index 10aef43..c2ab758 100644 --- a/ocrd_tool.schema.json +++ b/ocrd_tool.schema.json @@ -1,275 +1 @@ -{ - "type": "object", - "description": "Schema for tools by OCR-D MP", - "required": [ - "version", - "git_url", - "tools" - ], - "additionalProperties": false, - "properties": { - "version": { - "description": "Version of the tool, expressed as MAJOR.MINOR.PATCH.", - "type": "string", - "pattern": "^[0-9]+\\.[0-9]+\\.[0-9]+$" - }, - "git_url": { - "description": "GitHub/GitLab URL", - "type": "string", - "format": "url" - }, - "dockerhub": { - "description": "DockerHub image", - "type": "string" - }, - "tools": { - "type": "object", - "additionalProperties": false, - "patternProperties": { - "ocrd-.*": { - "type": "object", - "additionalProperties": false, - "required": [ - "description", - "steps", - "executable", - "categories", - "input_file_grp" - ], - "properties": { - "executable": { - "description": "The name of the CLI executable in $PATH", - "type": "string" - }, - "input_file_grp": { - "description": "Input fileGrp@USE this tool expects by default", - "type": "array", - "items": { - "type": "string" - } - }, - "output_file_grp": { - "description": "Output fileGrp@USE this tool produces by default", - "type": "array", - "items": { - "type": "string" - } - }, - "parameters": { - "description": "Object describing the 
parameters of a tool. Keys are parameter names, values sub-schemas.", - "type": "object", - "default": {}, - "patternProperties": { - ".*": { - "type": "object", - "additionalProperties": false, - "required": [ - "description", - "type" - ], - "properties": { - "type": { - "type": "string", - "description": "Data type of this parameter", - "enum": [ - "string", - "number", - "boolean", - "object", - "array" - ] - }, - "format": { - "description": "Subtype, such as `float` for type `number` or `uri` for type `string`." - }, - "description": { - "description": "Concise description of syntax and semantics of this parameter" - }, - "items": { - "type": "object", - "description": "describe the items of an array further" - }, - "minimum": { - "type": "number", - "description": "Minimum value for number parameters, including the minimum" - }, - "maximum": { - "type": "number", - "description": "Maximum value for number parameters, including the maximum" - }, - "exclusiveMinimum": { - "type": "number", - "description": "Minimum value for number parameters, excluding the minimum" - }, - "exclusiveMaximum": { - "type": "number", - "description": "Maximum value for number parameters, excluding the maximum" - }, - "multipleOf": { - "type": "number", - "description": "For number values, those values must be multiple of this number" - }, - "properties": { - "type": "object", - "description": "Describe the properties of an object value" - }, - "additionalProperties": { - "type": "boolean", - "description": "Whether an object value may contain properties not explicitly defined" - }, - "required": { - "type": "boolean", - "description": "Whether this parameter is required" - }, - "default": { - "description": "Default value when not provided by the user" - }, - "enum": { - "type": "array", - "description": "List the allowed values if a fixed list." - }, - "content-type": { - "type": "string", - "default": "application/octet-stream", - "description": "The media type of resources this processor expects for this parameter. Most processors use files for resources (e.g. `*.traineddata` for `ocrd-tesserocr-recognize`) while others use directories of files (e.g. `default` for `ocrd-eynollah-segment`). If a parameter requires directories, it must set `content-type` to `text/directory`.\n" - }, - "cacheable": { - "type": "boolean", - "description": "If parameter is reference to file: Whether the file should be cached, e.g. 
because it is large and won't change.", - "default": false - } - } - } - } - }, - "description": { - "description": "Concise description of what the tool does" - }, - "categories": { - "description": "Tools belong to these categories, representing modules within the OCR-D project structure", - "type": "array", - "items": { - "type": "string", - "enum": [ - "Image preprocessing", - "Layout analysis", - "Text recognition and optimization", - "Model training", - "Long-term preservation", - "Quality assurance" - ] - } - }, - "steps": { - "description": "This tool can be used at these steps in the OCR-D functional model", - "type": "array", - "items": { - "type": "string", - "enum": [ - "preprocessing/characterization", - "preprocessing/optimization", - "preprocessing/optimization/cropping", - "preprocessing/optimization/deskewing", - "preprocessing/optimization/despeckling", - "preprocessing/optimization/dewarping", - "preprocessing/optimization/binarization", - "preprocessing/optimization/grayscale_normalization", - "recognition/text-recognition", - "recognition/font-identification", - "recognition/post-correction", - "layout/segmentation", - "layout/segmentation/text-nontext", - "layout/segmentation/region", - "layout/segmentation/line", - "layout/segmentation/word", - "layout/segmentation/classification", - "layout/analysis" - ] - } - }, - "resource_locations": { - "type": "array", - "description": "The locations in the filesystem this processor supports for resource lookup", - "default": [ - "data", - "cwd", - "system", - "module" - ], - "items": { - "type": "string", - "enum": [ - "data", - "cwd", - "system", - "module" - ] - } - }, - "resources": { - "type": "array", - "description": "Resources for this processor", - "items": { - "type": "object", - "additionalProperties": false, - "required": [ - "url", - "description", - "name", - "size" - ], - "properties": { - "url": { - "type": "string", - "description": "URLs of all components of this resource" - }, - "description": { - "type": "string", - "description": "A description of the resource" - }, - "name": { - "type": "string", - "description": "Name to store the resource as" - }, - "type": { - "type": "string", - "enum": [ - "file", - "directory", - "archive" - ], - "default": "file", - "description": "Type of the URL" - }, - "parameter_usage": { - "type": "string", - "description": "Defines how the parameter is to be used", - "enum": [ - "as-is", - "without-extension" - ], - "default": "as-is" - }, - "path_in_archive": { - "type": "string", - "description": "If type is archive, the resource is at this location in the archive", - "default": "." 
- }, - "version_range": { - "type": "string", - "description": "Range of supported versions, syntax like in PEP 440", - "default": ">= 0.0.1" - }, - "size": { - "type": "number", - "description": "Size of the resource in bytes" - } - } - } - } - } - } - } - } - } -} \ No newline at end of file +{"type": "object", "description": "Schema for tools by OCR-D MP", "required": ["version", "git_url", "tools"], "additionalProperties": false, "properties": {"version": {"description": "Version of the tool, expressed as MAJOR.MINOR.PATCH.", "type": "string", "pattern": "^[0-9]+\\.[0-9]+\\.[0-9]+$"}, "git_url": {"description": "GitHub/GitLab URL", "type": "string", "format": "url"}, "dockerhub": {"description": "DockerHub image", "type": "string"}, "tools": {"type": "object", "additionalProperties": false, "patternProperties": {"ocrd-.*": {"type": "object", "additionalProperties": false, "required": ["description", "steps", "executable", "categories", "input_file_grp"], "properties": {"executable": {"description": "The name of the CLI executable in $PATH", "type": "string"}, "input_file_grp": {"description": "Input fileGrp@USE this tool expects by default", "type": "array", "items": {"type": "string"}}, "output_file_grp": {"description": "Output fileGrp@USE this tool produces by default", "type": "array", "items": {"type": "string"}}, "parameters": {"description": "Object describing the parameters of a tool. Keys are parameter names, values sub-schemas.", "type": "object", "default": {}, "patternProperties": {".*": {"type": "object", "additionalProperties": false, "required": ["description", "type"], "properties": {"type": {"type": "string", "description": "Data type of this parameter", "enum": ["string", "number", "boolean", "object", "array"]}, "format": {"description": "Subtype, such as `float` for type `number` or `uri` for type `string`."}, "description": {"description": "Concise description of syntax and semantics of this parameter"}, "items": {"type": "object", "description": "describe the items of an array further"}, "minimum": {"type": "number", "description": "Minimum value for number parameters, including the minimum"}, "maximum": {"type": "number", "description": "Maximum value for number parameters, including the maximum"}, "exclusiveMinimum": {"type": "number", "description": "Minimum value for number parameters, excluding the minimum"}, "exclusiveMaximum": {"type": "number", "description": "Maximum value for number parameters, excluding the maximum"}, "multipleOf": {"type": "number", "description": "For number values, those values must be multiple of this number"}, "properties": {"type": "object", "description": "Describe the properties of an object value"}, "additionalProperties": {"type": "boolean", "description": "Whether an object value may contain properties not explicitly defined"}, "required": {"type": "boolean", "description": "Whether this parameter is required"}, "default": {"description": "Default value when not provided by the user"}, "enum": {"type": "array", "description": "List the allowed values if a fixed list."}, "content-type": {"type": "string", "default": "application/octet-stream", "description": "The media type of resources this processor expects for this parameter. Most processors use files for resources (e.g. `*.traineddata` for `ocrd-tesserocr-recognize`) while others use directories of files (e.g. `default` for `ocrd-eynollah-segment`). 
If a parameter requires directories, it must set `content-type` to `text/directory`.\n"}, "cacheable": {"type": "boolean", "description": "If parameter is reference to file: Whether the file should be cached, e.g. because it is large and won't change.", "default": false}}}}}, "description": {"description": "Concise description of what the tool does"}, "categories": {"description": "Tools belong to these categories, representing modules within the OCR-D project structure", "type": "array", "items": {"type": "string", "enum": ["Image preprocessing", "Layout analysis", "Text recognition and optimization", "Model training", "Long-term preservation", "Quality assurance"]}}, "steps": {"description": "This tool can be used at these steps in the OCR-D functional model", "type": "array", "items": {"type": "string", "enum": ["preprocessing/characterization", "preprocessing/optimization", "preprocessing/optimization/cropping", "preprocessing/optimization/deskewing", "preprocessing/optimization/despeckling", "preprocessing/optimization/dewarping", "preprocessing/optimization/binarization", "preprocessing/optimization/grayscale_normalization", "recognition/text-recognition", "recognition/font-identification", "recognition/post-correction", "layout/segmentation", "layout/segmentation/text-nontext", "layout/segmentation/region", "layout/segmentation/line", "layout/segmentation/word", "layout/segmentation/classification", "layout/analysis"]}}, "resource_locations": {"type": "array", "description": "The locations in the filesystem this processor supports for resource lookup", "default": ["data", "cwd", "system", "module"], "items": {"type": "string", "enum": ["data", "cwd", "system", "module"]}}, "resources": {"type": "array", "description": "Resources for this processor", "items": {"type": "object", "additionalProperties": false, "required": ["url", "description", "name", "size"], "properties": {"url": {"type": "string", "description": "URLs of all components of this resource"}, "description": {"type": "string", "description": "A description of the resource"}, "name": {"type": "string", "description": "Name to store the resource as"}, "type": {"type": "string", "enum": ["file", "directory", "archive"], "default": "file", "description": "Type of the URL"}, "parameter_usage": {"type": "string", "description": "Defines how the parameter is to be used", "enum": ["as-is", "without-extension"], "default": "as-is"}, "path_in_archive": {"type": "string", "description": "If type is archive, the resource is at this location in the archive", "default": "."}, "version_range": {"type": "string", "description": "Range of supported versions, syntax like in PEP 440", "default": ">= 0.0.1"}, "size": {"type": "number", "description": "Size of the resource in bytes"}}}}}}}}}} \ No newline at end of file From c9d313f61927c6e153205ea4aa10f57e1cd2dd37 Mon Sep 17 00:00:00 2001 From: Michelle Weidling Date: Tue, 27 Sep 2022 13:25:43 +0200 Subject: [PATCH 12/56] comment out undiscussed CER metrics --- ocrd_eval.schema.yml | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/ocrd_eval.schema.yml b/ocrd_eval.schema.yml index 596bc90..4a9c1df 100644 --- a/ocrd_eval.schema.yml +++ b/ocrd_eval.schema.yml @@ -160,11 +160,12 @@ $defs: cer: description: CER calculated over the text of a whole page (in by_page) or combined text of all pages (in document_wide) - cer_mean: - description: Arithmetic mean of the page-wise CER (in document_wide) or regions on a page (in by_page) - - cer_median: - description: 
Median of the page-wise CER (in document_wide) or regions on a page (in by_page) +# To be implemented in the future. +# cer_mean: +# description: Arithmetic mean of the page-wise CER (in document_wide) or regions on a page (in by_page) +# +# cer_median: +# description: Median of the page-wise CER (in document_wide) or regions on a page (in by_page) cer_range: type: array @@ -174,8 +175,9 @@ $defs: type: number description: Minimum and maximum of CER calculated over the text of a whole page (in by_page) or combined text of all pages (in document_wide) - cer_standard_deviation: - description: Standard deviation the page-wise CER (in document_wide) or regions on a page (in by_page) +# To be implemented in the future. +# cer_standard_deviation: +# description: Standard deviation the page-wise CER (in document_wide) or regions on a page (in by_page) wer: description: CER calculated over the text of a whole page (in by_page) or combined text of all pages (in document_wide) From a814c891b3415b536087e21c255bfc51ba44da60 Mon Sep 17 00:00:00 2001 From: Michelle Weidling Date: Thu, 24 Nov 2022 15:10:11 +0100 Subject: [PATCH 13/56] feat: move workflow_steps to ocr_workflow object --- ocrd_eval.sample.json | 2 +- ocrd_eval.sample.yml | 12 ++++++------ ocrd_eval.schema.json | 2 +- ocrd_eval.schema.yml | 26 +++++++++++++++++--------- 4 files changed, 25 insertions(+), 17 deletions(-) diff --git a/ocrd_eval.sample.json b/ocrd_eval.sample.json index 80de251..df71d17 100644 --- a/ocrd_eval.sample.json +++ b/ocrd_eval.sample.json @@ -1 +1 @@ -[{"@id": "https://github.com/OCR-D/quiver/tree/data/evaluations/wf1-data345-eval1.json", "label": "OCR workflow 1 on workspace 345", "metadata": {"ocr_workflow": {"@id": "https://github.com/OCR-D/quiver/tree/data/workflows/1.nf", "label": "OCR Workflow 1"}, "eval_workflow": {"@id": "https://github.com/OCR-D/quiver/tree/data/workflows/eval1.nf", "label": "Evaluation Workflow 1"}, "gt_workspace": {"@id": "https://gt.ocr-d.de/workspace/789", "label": "GT workspace 789 (19th century fraktur)"}, "ocr_workspace": {"@id": "https://github.com/OCR-D/quiver/tree/data/workspaces/3000.ocrd.zip", "label": "OCR result workspace 3000"}, "eval_workspace": {"@id": "https://github.com/OCR-D/quiver/tree/data/workspaces/345.ocrd.zip", "label": "Evaluation Workspace 345"}, "workflow_steps": {"0": "Processor A", "1": "Processor B"}, "workflow_model": "Fraktur_GT4HistOCR", "document_metadata": {"fonts": ["antiqua", "fraktur"], "publication_century": "1800-1900", "publication_decade": "1850-1860", "publication_year": 1855, "number_of_pages": 100, "layout": "simple"}}, "evaluation": {"document_wide": {"wall_time": 1234, "cer": 0.57, "cer_min_max": [0.2, 0.57]}, "by_page": [{"page_id": "PHYS_0001", "cer": 0.8, "processing_time": 2.1}]}}, {"@id": "https://github.com/OCR-D/quiver/tree/data/evaluations/wf2-data345-eval1.json", "label": "OCR Workflow 2 on Data 345", "metadata": {"ocr_workflow": {"@id": "https://github.com/OCR-D/quiver/tree/data/workflows/2.nf", "label": "OCR Workflow 2"}, "eval_workflow": {"@id": "https://github.com/OCR-D/quiver/tree/data/workflows/eval1.nf", "label": "Evaluation Workflow 1"}, "gt_workspace": {"@id": "https://gt.ocr-d.de/workspace/789", "label": "GT workspace 789 (19th century fraktur)"}, "ocr_workspace": {"@id": "https://github.com/OCR-D/quiver/tree/data/workspaces/3000.ocrd.zip", "label": "OCR result workspace 3000"}, "eval_workspace": {"@id": "https://github.com/OCR-D/quiver/tree/data/workspaces/345.ocrd.zip", "label": "Evaluation Workspace 345"}, 
"workflow_steps": {"0": "Processor A", "1": "Processor B"}, "workflow_model": "Fraktur_GT4HistOCR", "document_metadata": {"fonts": ["antiqua", "fraktur"], "publication_century": "1800-1900", "publication_decade": "1850-1860", "publication_year": 1855, "number_of_pages": 100, "layout": "simple"}}, "evaluation": {"document_wide": {"wall_time": 4567, "cer": 0.9, "cer_min_max": [0.2, 0.99]}, "by_page": [{"page_id": "PHYS_0001", "cer": 0.9, "processing_time": 2.1}]}}] \ No newline at end of file +[{"@id": "https://github.com/OCR-D/quiver/tree/data/evaluations/wf1-data345-eval1.json", "label": "OCR workflow 1 on workspace 345", "metadata": {"ocr_workflow": {"@id": "https://github.com/OCR-D/quiver/tree/data/workflows/1.nf", "label": "OCR Workflow 1", "steps": {"0": "Processor A", "1": "Processor B"}}, "eval_workflow": {"@id": "https://github.com/OCR-D/quiver/tree/data/workflows/eval1.nf", "label": "Evaluation Workflow 1"}, "gt_workspace": {"@id": "https://gt.ocr-d.de/workspace/789", "label": "GT workspace 789 (19th century fraktur)"}, "ocr_workspace": {"@id": "https://github.com/OCR-D/quiver/tree/data/workspaces/3000.ocrd.zip", "label": "OCR result workspace 3000"}, "eval_workspace": {"@id": "https://github.com/OCR-D/quiver/tree/data/workspaces/345.ocrd.zip", "label": "Evaluation Workspace 345"}, "workflow_model": "Fraktur_GT4HistOCR", "document_metadata": {"fonts": ["antiqua", "fraktur"], "publication_century": "1800-1900", "publication_decade": "1850-1860", "publication_year": 1855, "number_of_pages": 100, "layout": "simple"}}, "evaluation": {"document_wide": {"wall_time": 1234, "cer": 0.57, "cer_min_max": [0.2, 0.57]}, "by_page": [{"page_id": "PHYS_0001", "cer": 0.8, "processing_time": 2.1}]}}, {"@id": "https://github.com/OCR-D/quiver/tree/data/evaluations/wf2-data345-eval1.json", "label": "OCR Workflow 2 on Data 345", "metadata": {"ocr_workflow": {"@id": "https://github.com/OCR-D/quiver/tree/data/workflows/2.nf", "label": "OCR Workflow 2", "steps": {"0": "Processor A", "1": "Processor B"}}, "eval_workflow": {"@id": "https://github.com/OCR-D/quiver/tree/data/workflows/eval1.nf", "label": "Evaluation Workflow 1"}, "gt_workspace": {"@id": "https://gt.ocr-d.de/workspace/789", "label": "GT workspace 789 (19th century fraktur)"}, "ocr_workspace": {"@id": "https://github.com/OCR-D/quiver/tree/data/workspaces/3000.ocrd.zip", "label": "OCR result workspace 3000"}, "eval_workspace": {"@id": "https://github.com/OCR-D/quiver/tree/data/workspaces/345.ocrd.zip", "label": "Evaluation Workspace 345"}, "workflow_model": "Fraktur_GT4HistOCR", "document_metadata": {"fonts": ["antiqua", "fraktur"], "publication_century": "1800-1900", "publication_decade": "1850-1860", "publication_year": 1855, "number_of_pages": 100, "layout": "simple"}}, "evaluation": {"document_wide": {"wall_time": 4567, "cer": 0.9, "cer_min_max": [0.2, 0.99]}, "by_page": [{"page_id": "PHYS_0001", "cer": 0.9, "processing_time": 2.1}]}}] \ No newline at end of file diff --git a/ocrd_eval.sample.yml b/ocrd_eval.sample.yml index 74383f8..9924204 100644 --- a/ocrd_eval.sample.yml +++ b/ocrd_eval.sample.yml @@ -4,6 +4,9 @@ ocr_workflow: '@id': https://github.com/OCR-D/quiver/tree/data/workflows/1.nf label: OCR Workflow 1 + steps: + '0': Processor A + '1': Processor B eval_workflow: '@id': https://github.com/OCR-D/quiver/tree/data/workflows/eval1.nf label: Evaluation Workflow 1 @@ -16,9 +19,6 @@ eval_workspace: '@id': https://github.com/OCR-D/quiver/tree/data/workspaces/345.ocrd.zip label: Evaluation Workspace 345 - workflow_steps: - '0': Processor 
A - '1': Processor B workflow_model: Fraktur_GT4HistOCR document_metadata: fonts: @@ -47,6 +47,9 @@ ocr_workflow: '@id': https://github.com/OCR-D/quiver/tree/data/workflows/2.nf label: OCR Workflow 2 + steps: + '0': Processor A + '1': Processor B eval_workflow: '@id': https://github.com/OCR-D/quiver/tree/data/workflows/eval1.nf label: Evaluation Workflow 1 @@ -59,9 +62,6 @@ eval_workspace: '@id': https://github.com/OCR-D/quiver/tree/data/workspaces/345.ocrd.zip label: Evaluation Workspace 345 - workflow_steps: - '0': Processor A - '1': Processor B workflow_model: Fraktur_GT4HistOCR document_metadata: fonts: diff --git a/ocrd_eval.schema.json b/ocrd_eval.schema.json index c903137..b11d918 100644 --- a/ocrd_eval.schema.json +++ b/ocrd_eval.schema.json @@ -1 +1 @@ -{"$schema": "https://json-schema.org/draft/2019-09/schema", "$id": "https://ocr-d.de/en/spec/ocrd_eval.schema.json", "title": "A list of evaluations for OCR-D", "description": "- All references to URL are JSON-LD-like objects with at least an `@id`\n property referencing the URL and `label` for a human-readable label to be\n used in the UI\n", "type": "array", "items": {"required": ["@id", "label", "metadata", "evaluation"], "unevaluatedProperties": false, "allOf": [{"$ref": "#/$defs/LabeledUrl"}, {"properties": {"metadata": {"$ref": "#/$defs/EvaluationMetadata"}, "evaluation": {"$ref": "#/$defs/EvaluationReport"}}}]}, "$defs": {"LabeledUrl": {"type": "object", "required": ["@id"], "properties": {"@id": {"type": "string", "format": "uri", "description": "URL of the thing"}, "label": {"type": "string", "description": "Description of the thing for UI purposes"}}}, "EvaluationMetadata": {"type": "object", "title": "Metadata about one evaluation", "additionalProperties": false, "description": "EvaluationMetadata contains all the info on how an EvaluationReport came to be.\nThere are two OCR-D *workflows* involved:\n - ocr_workflow: The workflow which produced the OCR results to evaluate\n - eval_workflow: The workflow run to evaluate OCR and GT\n\nThere are three OCR-D *workspaces* involved:\n - gt_workspace: The workspace containing the GT\n - ocr_workspace: The workspace containing the OCR results from ocr_workflow\n - eval_workspace: The workspace on which the eval_workflow was run\n", "required": ["ocr_workflow", "ocr_workspace", "eval_workflow", "eval_workspace", "gt_workspace", "document_metadata"], "properties": {"ocr_workflow": {"allOf": [{"$ref": "#/$defs/LabeledUrl"}], "description": "The OCR-D workflow that produced the ocr_workspace"}, "ocr_workspace": {"allOf": [{"$ref": "#/$defs/LabeledUrl"}], "description": "The workspace containing the OCR"}, "eval_workflow": {"allOf": [{"$ref": "#/$defs/LabeledUrl"}], "description": "The OCR-D workflow that produced the eval_workspace"}, "eval_workspace": {"allOf": [{"$ref": "#/$defs/LabeledUrl"}], "description": "The workspace containing the evaluation results"}, "gt_workspace": {"allOf": [{"$ref": "#/$defs/LabeledUrl"}], "description": "The workspace containing the GT"}, "workflow_steps": {"type": "object", "description": "Human readable description of the individual steps in the workflow (for UI)", "patternProperties": {"^[0-9]+$": {"type": "string", "description": "Description of this workflow step"}}}, "workflow_model": {"type": "string", "description": "Human readable name of the main model used for recognition in the OCR workflow (for UI)"}, "eval_tool": {"type": "string", "description": "Human readable name and version of evaluation tool used (for UI)"}, "document_metadata": 
{"type": "object", "title": "Bibliographical and typographical metadata about the work to be evaluated", "properties": {"publication_year": {"type": "number", "description": "Year the document was originally published"}, "publication_century": {"type": "string", "description": "Century the document was originally published", "pattern": "[12][0-9]{3}-[12][0-9]{3}"}, "publication_decade": {"type": "string", "description": "Decade the document was originally published", "pattern": "[12][0-9]{3}-[12][0-9]{3}"}, "number_of_pages": {"type": "number", "description": "Number of pages in this work (i.e. the number of images in the gt_workspace)"}, "layout": {"type": "string", "enum": ["simple", "complex"]}, "fonts": {"type": "array", "items": {"type": "string", "enum": ["antiqua", "fraktur", "ancient_greek", "hebrew"]}}}}, "provenance": {"type": "object", "description": "Information on which tools in which version were used in determining metrics", "properties": {"parameters": {"type": "object", "description": "Parameters passed to the evaluation processor"}}}}}, "EvaluationReport": {"type": "object", "additionalProperties": false, "description": "The metrics measured for this document", "properties": {"document_wide": {"type": "object", "description": "Document-wide metrics", "properties": {"$ref": "#$defs/EvaluationMetrics"}}, "by_page": {"type": "array", "description": "Metrics page-by-page", "items": {"type": "object", "allOf": [{"properties": {"page_id": {"type": "string", "description": "PAGE ID"}}}, {"properties": {"$ref": "#$defs/EvaluationMetrics"}}]}}}}, "EvaluationMetrics": {"cer": {"description": "CER calculated over the text of a whole page (in by_page) or combined text of all pages (in document_wide)"}, "cer_mean": {"description": "Arithmetic mean of the page-wise CER (in document_wide) or regions on a page (in by_page)"}, "cer_median": {"description": "Median of the page-wise CER (in document_wide) or regions on a page (in by_page)"}, "cer_range": {"type": "array", "minItems": 2, "maxItems": 2, "items": {"type": "number", "description": "Minimum and maximum of CER calculated over the text of a whole page (in by_page) or combined text of all pages (in document_wide)"}}, "cer_standard_deviation": {"description": "Standard deviation the page-wise CER (in document_wide) or regions on a page (in by_page)"}, "wer": {"description": "CER calculated over the text of a whole page (in by_page) or combined text of all pages (in document_wide)"}, "wall_time": {"description": "Actual time needed for processing workflow"}, "cpu_time": {"description": "Cumulative CPU time used for processing workflow"}, "pages_per_minute": {"description": "Number of pages processed per minute"}}}} \ No newline at end of file +{"$schema": "https://json-schema.org/draft/2019-09/schema", "$id": "https://ocr-d.de/en/spec/ocrd_eval.schema.json", "title": "A list of evaluations for OCR-D", "description": "- All references to URL are JSON-LD-like objects with at least an `@id`\n property referencing the URL and `label` for a human-readable label to be\n used in the UI\n", "type": "array", "items": {"required": ["@id", "label", "metadata", "evaluation"], "unevaluatedProperties": false, "allOf": [{"$ref": "#/$defs/LabeledUrl"}, {"properties": {"metadata": {"$ref": "#/$defs/EvaluationMetadata"}, "evaluation": {"$ref": "#/$defs/EvaluationReport"}}}]}, "$defs": {"LabeledUrl": {"type": "object", "required": ["@id"], "properties": {"@id": {"type": "string", "format": "uri", "description": "URL of the thing"}, "label": {"type": 
"string", "description": "Description of the thing for UI purposes"}}}, "EvaluationMetadata": {"type": "object", "title": "Metadata about one evaluation", "additionalProperties": false, "description": "EvaluationMetadata contains all the info on how an EvaluationReport came to be.\nThere are two OCR-D *workflows* involved:\n - ocr_workflow: The workflow which produced the OCR results to evaluate\n - eval_workflow: The workflow run to evaluate OCR and GT\n\nThere are three OCR-D *workspaces* involved:\n - gt_workspace: The workspace containing the GT\n - ocr_workspace: The workspace containing the OCR results from ocr_workflow\n - eval_workspace: The workspace on which the eval_workflow was run\n", "required": ["ocr_workflow", "ocr_workspace", "eval_workflow", "eval_workspace", "gt_workspace", "document_metadata"], "properties": {"ocr_workflow": {"type": "object", "required": ["@id", "steps"], "properties": {"@id": {"type": "string", "format": "uri", "description": "URL of the thing"}, "label": {"type": "string", "description": "Description of the thing for UI purposes"}, "steps": {"type": "object", "description": "Human readable description of the individual steps in the workflow (for UI)", "patternProperties": {"^[0-9]+$": {"type": "string", "description": "Description of this workflow step"}}}}, "description": "The OCR-D workflow that produced the ocr_workspace"}, "ocr_workspace": {"allOf": [{"$ref": "#/$defs/LabeledUrl"}], "description": "The workspace containing the OCR"}, "eval_workflow": {"allOf": [{"$ref": "#/$defs/LabeledUrl"}], "description": "The OCR-D workflow that produced the eval_workspace"}, "eval_workspace": {"allOf": [{"$ref": "#/$defs/LabeledUrl"}], "description": "The workspace containing the evaluation results"}, "gt_workspace": {"allOf": [{"$ref": "#/$defs/LabeledUrl"}], "description": "The workspace containing the GT"}, "workflow_steps": {"type": "object", "description": "Human readable description of the individual steps in the workflow (for UI)", "patternProperties": {"^[0-9]+$": {"type": "string", "description": "Description of this workflow step"}}}, "workflow_model": {"type": "string", "description": "Human readable name of the main model used for recognition in the OCR workflow (for UI)"}, "eval_tool": {"type": "string", "description": "Human readable name and version of evaluation tool used (for UI)"}, "document_metadata": {"type": "object", "title": "Bibliographical and typographical metadata about the work to be evaluated", "properties": {"publication_year": {"type": "number", "description": "Year the document was originally published"}, "publication_century": {"type": "string", "description": "Century the document was originally published", "pattern": "[12][0-9]{3}-[12][0-9]{3}"}, "publication_decade": {"type": "string", "description": "Decade the document was originally published", "pattern": "[12][0-9]{3}-[12][0-9]{3}"}, "number_of_pages": {"type": "number", "description": "Number of pages in this work (i.e. 
the number of images in the gt_workspace)"}, "layout": {"type": "string", "enum": ["simple", "complex"]}, "fonts": {"type": "array", "items": {"type": "string", "enum": ["antiqua", "fraktur", "ancient_greek", "hebrew"]}}}}, "provenance": {"type": "object", "description": "Information on which tools in which version were used in determining metrics", "properties": {"parameters": {"type": "object", "description": "Parameters passed to the evaluation processor"}}}}}, "EvaluationReport": {"type": "object", "additionalProperties": false, "description": "The metrics measured for this document", "properties": {"document_wide": {"type": "object", "description": "Document-wide metrics", "properties": {"$ref": "#$defs/EvaluationMetrics"}}, "by_page": {"type": "array", "description": "Metrics page-by-page", "items": {"type": "object", "allOf": [{"properties": {"page_id": {"type": "string", "description": "PAGE ID"}}}, {"properties": {"$ref": "#$defs/EvaluationMetrics"}}]}}}}, "EvaluationMetrics": {"cer": {"description": "CER calculated over the text of a whole page (in by_page) or combined text of all pages (in document_wide)"}, "cer_range": {"type": "array", "minItems": 2, "maxItems": 2, "items": {"type": "number", "description": "Minimum and maximum of CER calculated over the text of a whole page (in by_page) or combined text of all pages (in document_wide)"}}, "wer": {"description": "CER calculated over the text of a whole page (in by_page) or combined text of all pages (in document_wide)"}, "wall_time": {"description": "Actual time needed for processing workflow"}, "cpu_time": {"description": "Cumulative CPU time used for processing workflow"}, "pages_per_minute": {"description": "Number of pages processed per minute"}}}} \ No newline at end of file diff --git a/ocrd_eval.schema.yml b/ocrd_eval.schema.yml index 4a9c1df..a3ef08a 100644 --- a/ocrd_eval.schema.yml +++ b/ocrd_eval.schema.yml @@ -58,7 +58,23 @@ $defs: properties: ocr_workflow: - allOf: [{ '$ref': '#/$defs/LabeledUrl' }] + type: object + required: ['@id', 'steps'] + properties: + '@id': + type: string + format: uri + description: URL of the thing + label: + type: string + description: Description of the thing for UI purposes + steps: + type: object + description: Human readable description of the individual steps in the workflow (for UI) + patternProperties: + '^[0-9]+$': + type: string + description: Description of this workflow step description: The OCR-D workflow that produced the ocr_workspace ocr_workspace: @@ -77,14 +93,6 @@ $defs: allOf: [{ '$ref': '#/$defs/LabeledUrl' }] description: The workspace containing the GT - workflow_steps: - type: object - description: Human readable description of the individual steps in the workflow (for UI) - patternProperties: - '^[0-9]+$': - type: string - description: Description of this workflow step - workflow_model: type: string description: Human readable name of the main model used for recognition in the OCR workflow (for UI) From a881e085603958ea09db6b2b110c47beff4872df Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Mon, 19 Dec 2022 13:55:33 +0100 Subject: [PATCH 14/56] remove schema from this branch, cf. 
#236 --- ocrd_eval.sample.json | 1 - ocrd_eval.sample.yml | 85 ------------------ ocrd_eval.schema.json | 1 - ocrd_eval.schema.yml | 202 ------------------------------------------ 4 files changed, 289 deletions(-) delete mode 100644 ocrd_eval.sample.json delete mode 100644 ocrd_eval.sample.yml delete mode 100644 ocrd_eval.schema.json delete mode 100644 ocrd_eval.schema.yml diff --git a/ocrd_eval.sample.json b/ocrd_eval.sample.json deleted file mode 100644 index df71d17..0000000 --- a/ocrd_eval.sample.json +++ /dev/null @@ -1 +0,0 @@ -[{"@id": "https://github.com/OCR-D/quiver/tree/data/evaluations/wf1-data345-eval1.json", "label": "OCR workflow 1 on workspace 345", "metadata": {"ocr_workflow": {"@id": "https://github.com/OCR-D/quiver/tree/data/workflows/1.nf", "label": "OCR Workflow 1", "steps": {"0": "Processor A", "1": "Processor B"}}, "eval_workflow": {"@id": "https://github.com/OCR-D/quiver/tree/data/workflows/eval1.nf", "label": "Evaluation Workflow 1"}, "gt_workspace": {"@id": "https://gt.ocr-d.de/workspace/789", "label": "GT workspace 789 (19th century fraktur)"}, "ocr_workspace": {"@id": "https://github.com/OCR-D/quiver/tree/data/workspaces/3000.ocrd.zip", "label": "OCR result workspace 3000"}, "eval_workspace": {"@id": "https://github.com/OCR-D/quiver/tree/data/workspaces/345.ocrd.zip", "label": "Evaluation Workspace 345"}, "workflow_model": "Fraktur_GT4HistOCR", "document_metadata": {"fonts": ["antiqua", "fraktur"], "publication_century": "1800-1900", "publication_decade": "1850-1860", "publication_year": 1855, "number_of_pages": 100, "layout": "simple"}}, "evaluation": {"document_wide": {"wall_time": 1234, "cer": 0.57, "cer_min_max": [0.2, 0.57]}, "by_page": [{"page_id": "PHYS_0001", "cer": 0.8, "processing_time": 2.1}]}}, {"@id": "https://github.com/OCR-D/quiver/tree/data/evaluations/wf2-data345-eval1.json", "label": "OCR Workflow 2 on Data 345", "metadata": {"ocr_workflow": {"@id": "https://github.com/OCR-D/quiver/tree/data/workflows/2.nf", "label": "OCR Workflow 2", "steps": {"0": "Processor A", "1": "Processor B"}}, "eval_workflow": {"@id": "https://github.com/OCR-D/quiver/tree/data/workflows/eval1.nf", "label": "Evaluation Workflow 1"}, "gt_workspace": {"@id": "https://gt.ocr-d.de/workspace/789", "label": "GT workspace 789 (19th century fraktur)"}, "ocr_workspace": {"@id": "https://github.com/OCR-D/quiver/tree/data/workspaces/3000.ocrd.zip", "label": "OCR result workspace 3000"}, "eval_workspace": {"@id": "https://github.com/OCR-D/quiver/tree/data/workspaces/345.ocrd.zip", "label": "Evaluation Workspace 345"}, "workflow_model": "Fraktur_GT4HistOCR", "document_metadata": {"fonts": ["antiqua", "fraktur"], "publication_century": "1800-1900", "publication_decade": "1850-1860", "publication_year": 1855, "number_of_pages": 100, "layout": "simple"}}, "evaluation": {"document_wide": {"wall_time": 4567, "cer": 0.9, "cer_min_max": [0.2, 0.99]}, "by_page": [{"page_id": "PHYS_0001", "cer": 0.9, "processing_time": 2.1}]}}] \ No newline at end of file diff --git a/ocrd_eval.sample.yml b/ocrd_eval.sample.yml deleted file mode 100644 index 9924204..0000000 --- a/ocrd_eval.sample.yml +++ /dev/null @@ -1,85 +0,0 @@ -- '@id': https://github.com/OCR-D/quiver/tree/data/evaluations/wf1-data345-eval1.json - label: OCR workflow 1 on workspace 345 - metadata: - ocr_workflow: - '@id': https://github.com/OCR-D/quiver/tree/data/workflows/1.nf - label: OCR Workflow 1 - steps: - '0': Processor A - '1': Processor B - eval_workflow: - '@id': https://github.com/OCR-D/quiver/tree/data/workflows/eval1.nf - 
label: Evaluation Workflow 1 - gt_workspace: - '@id': https://gt.ocr-d.de/workspace/789 - label: GT workspace 789 (19th century fraktur) - ocr_workspace: - '@id': https://github.com/OCR-D/quiver/tree/data/workspaces/3000.ocrd.zip - label: OCR result workspace 3000 - eval_workspace: - '@id': https://github.com/OCR-D/quiver/tree/data/workspaces/345.ocrd.zip - label: Evaluation Workspace 345 - workflow_model: Fraktur_GT4HistOCR - document_metadata: - fonts: - - antiqua - - fraktur - publication_century: 1800-1900 - publication_decade: 1850-1860 - publication_year: 1855 - number_of_pages: 100 - layout: simple - evaluation: - document_wide: - wall_time: 1234 - cer: 0.57 - cer_min_max: - - 0.2 - - 0.57 - by_page: - - page_id: PHYS_0001 - cer: 0.8 - processing_time: 2.1 - -- '@id': https://github.com/OCR-D/quiver/tree/data/evaluations/wf2-data345-eval1.json - label: OCR Workflow 2 on Data 345 - metadata: - ocr_workflow: - '@id': https://github.com/OCR-D/quiver/tree/data/workflows/2.nf - label: OCR Workflow 2 - steps: - '0': Processor A - '1': Processor B - eval_workflow: - '@id': https://github.com/OCR-D/quiver/tree/data/workflows/eval1.nf - label: Evaluation Workflow 1 - gt_workspace: - '@id': https://gt.ocr-d.de/workspace/789 - label: GT workspace 789 (19th century fraktur) - ocr_workspace: - '@id': https://github.com/OCR-D/quiver/tree/data/workspaces/3000.ocrd.zip - label: OCR result workspace 3000 - eval_workspace: - '@id': https://github.com/OCR-D/quiver/tree/data/workspaces/345.ocrd.zip - label: Evaluation Workspace 345 - workflow_model: Fraktur_GT4HistOCR - document_metadata: - fonts: - - antiqua - - fraktur - publication_century: 1800-1900 - publication_decade: 1850-1860 - publication_year: 1855 - number_of_pages: 100 - layout: simple - evaluation: - document_wide: - wall_time: 4567 - cer: 0.9 - cer_min_max: - - 0.2 - - 0.99 - by_page: - - page_id: PHYS_0001 - cer: 0.9 - processing_time: 2.1 diff --git a/ocrd_eval.schema.json b/ocrd_eval.schema.json deleted file mode 100644 index b11d918..0000000 --- a/ocrd_eval.schema.json +++ /dev/null @@ -1 +0,0 @@ -{"$schema": "https://json-schema.org/draft/2019-09/schema", "$id": "https://ocr-d.de/en/spec/ocrd_eval.schema.json", "title": "A list of evaluations for OCR-D", "description": "- All references to URL are JSON-LD-like objects with at least an `@id`\n property referencing the URL and `label` for a human-readable label to be\n used in the UI\n", "type": "array", "items": {"required": ["@id", "label", "metadata", "evaluation"], "unevaluatedProperties": false, "allOf": [{"$ref": "#/$defs/LabeledUrl"}, {"properties": {"metadata": {"$ref": "#/$defs/EvaluationMetadata"}, "evaluation": {"$ref": "#/$defs/EvaluationReport"}}}]}, "$defs": {"LabeledUrl": {"type": "object", "required": ["@id"], "properties": {"@id": {"type": "string", "format": "uri", "description": "URL of the thing"}, "label": {"type": "string", "description": "Description of the thing for UI purposes"}}}, "EvaluationMetadata": {"type": "object", "title": "Metadata about one evaluation", "additionalProperties": false, "description": "EvaluationMetadata contains all the info on how an EvaluationReport came to be.\nThere are two OCR-D *workflows* involved:\n - ocr_workflow: The workflow which produced the OCR results to evaluate\n - eval_workflow: The workflow run to evaluate OCR and GT\n\nThere are three OCR-D *workspaces* involved:\n - gt_workspace: The workspace containing the GT\n - ocr_workspace: The workspace containing the OCR results from ocr_workflow\n - eval_workspace: The 
workspace on which the eval_workflow was run\n", "required": ["ocr_workflow", "ocr_workspace", "eval_workflow", "eval_workspace", "gt_workspace", "document_metadata"], "properties": {"ocr_workflow": {"type": "object", "required": ["@id", "steps"], "properties": {"@id": {"type": "string", "format": "uri", "description": "URL of the thing"}, "label": {"type": "string", "description": "Description of the thing for UI purposes"}, "steps": {"type": "object", "description": "Human readable description of the individual steps in the workflow (for UI)", "patternProperties": {"^[0-9]+$": {"type": "string", "description": "Description of this workflow step"}}}}, "description": "The OCR-D workflow that produced the ocr_workspace"}, "ocr_workspace": {"allOf": [{"$ref": "#/$defs/LabeledUrl"}], "description": "The workspace containing the OCR"}, "eval_workflow": {"allOf": [{"$ref": "#/$defs/LabeledUrl"}], "description": "The OCR-D workflow that produced the eval_workspace"}, "eval_workspace": {"allOf": [{"$ref": "#/$defs/LabeledUrl"}], "description": "The workspace containing the evaluation results"}, "gt_workspace": {"allOf": [{"$ref": "#/$defs/LabeledUrl"}], "description": "The workspace containing the GT"}, "workflow_steps": {"type": "object", "description": "Human readable description of the individual steps in the workflow (for UI)", "patternProperties": {"^[0-9]+$": {"type": "string", "description": "Description of this workflow step"}}}, "workflow_model": {"type": "string", "description": "Human readable name of the main model used for recognition in the OCR workflow (for UI)"}, "eval_tool": {"type": "string", "description": "Human readable name and version of evaluation tool used (for UI)"}, "document_metadata": {"type": "object", "title": "Bibliographical and typographical metadata about the work to be evaluated", "properties": {"publication_year": {"type": "number", "description": "Year the document was originally published"}, "publication_century": {"type": "string", "description": "Century the document was originally published", "pattern": "[12][0-9]{3}-[12][0-9]{3}"}, "publication_decade": {"type": "string", "description": "Decade the document was originally published", "pattern": "[12][0-9]{3}-[12][0-9]{3}"}, "number_of_pages": {"type": "number", "description": "Number of pages in this work (i.e. 
the number of images in the gt_workspace)"}, "layout": {"type": "string", "enum": ["simple", "complex"]}, "fonts": {"type": "array", "items": {"type": "string", "enum": ["antiqua", "fraktur", "ancient_greek", "hebrew"]}}}}, "provenance": {"type": "object", "description": "Information on which tools in which version were used in determining metrics", "properties": {"parameters": {"type": "object", "description": "Parameters passed to the evaluation processor"}}}}}, "EvaluationReport": {"type": "object", "additionalProperties": false, "description": "The metrics measured for this document", "properties": {"document_wide": {"type": "object", "description": "Document-wide metrics", "properties": {"$ref": "#$defs/EvaluationMetrics"}}, "by_page": {"type": "array", "description": "Metrics page-by-page", "items": {"type": "object", "allOf": [{"properties": {"page_id": {"type": "string", "description": "PAGE ID"}}}, {"properties": {"$ref": "#$defs/EvaluationMetrics"}}]}}}}, "EvaluationMetrics": {"cer": {"description": "CER calculated over the text of a whole page (in by_page) or combined text of all pages (in document_wide)"}, "cer_range": {"type": "array", "minItems": 2, "maxItems": 2, "items": {"type": "number", "description": "Minimum and maximum of CER calculated over the text of a whole page (in by_page) or combined text of all pages (in document_wide)"}}, "wer": {"description": "CER calculated over the text of a whole page (in by_page) or combined text of all pages (in document_wide)"}, "wall_time": {"description": "Actual time needed for processing workflow"}, "cpu_time": {"description": "Cumulative CPU time used for processing workflow"}, "pages_per_minute": {"description": "Number of pages processed per minute"}}}} \ No newline at end of file diff --git a/ocrd_eval.schema.yml b/ocrd_eval.schema.yml deleted file mode 100644 index a3ef08a..0000000 --- a/ocrd_eval.schema.yml +++ /dev/null @@ -1,202 +0,0 @@ -$schema: https://json-schema.org/draft/2019-09/schema -$id: https://ocr-d.de/en/spec/ocrd_eval.schema.json - -title: A list of evaluations for OCR-D -description: > - - All references to URL are JSON-LD-like objects with at least an `@id` - property referencing the URL and `label` for a human-readable label to be - used in the UI -type: array -items: - required: ['@id', 'label', 'metadata', 'evaluation'] - unevaluatedProperties: false - allOf: - - { '$ref': '#/$defs/LabeledUrl' } - - properties: - metadata: { '$ref': '#/$defs/EvaluationMetadata' } - evaluation: { '$ref': '#/$defs/EvaluationReport' } - -# Reusable definitions -$defs: - - LabeledUrl: - type: object - required: ['@id'] - properties: - '@id': - type: string - format: uri - description: URL of the thing - label: - type: string - description: Description of the thing for UI purposes - - EvaluationMetadata: - type: object - title: Metadata about one evaluation - additionalProperties: false - description: > - EvaluationMetadata contains all the info on how an EvaluationReport came to be. 
- - There are two OCR-D *workflows* involved: - - ocr_workflow: The workflow which produced the OCR results to evaluate - - eval_workflow: The workflow run to evaluate OCR and GT - - There are three OCR-D *workspaces* involved: - - gt_workspace: The workspace containing the GT - - ocr_workspace: The workspace containing the OCR results from ocr_workflow - - eval_workspace: The workspace on which the eval_workflow was run - - required: - - ocr_workflow - - ocr_workspace - - eval_workflow - - eval_workspace - - gt_workspace - - document_metadata - - properties: - - ocr_workflow: - type: object - required: ['@id', 'steps'] - properties: - '@id': - type: string - format: uri - description: URL of the thing - label: - type: string - description: Description of the thing for UI purposes - steps: - type: object - description: Human readable description of the individual steps in the workflow (for UI) - patternProperties: - '^[0-9]+$': - type: string - description: Description of this workflow step - description: The OCR-D workflow that produced the ocr_workspace - - ocr_workspace: - allOf: [{ '$ref': '#/$defs/LabeledUrl' }] - description: The workspace containing the OCR - - eval_workflow: - allOf: [{ '$ref': '#/$defs/LabeledUrl' }] - description: The OCR-D workflow that produced the eval_workspace - - eval_workspace: - allOf: [{ '$ref': '#/$defs/LabeledUrl' }] - description: The workspace containing the evaluation results - - gt_workspace: - allOf: [{ '$ref': '#/$defs/LabeledUrl' }] - description: The workspace containing the GT - - workflow_model: - type: string - description: Human readable name of the main model used for recognition in the OCR workflow (for UI) - - eval_tool: - type: string - description: Human readable name and version of evaluation tool used (for UI) - - document_metadata: - type: object - title: Bibliographical and typographical metadata about the work to be evaluated - properties: - - publication_year: - type: number - description: Year the document was originally published - - publication_century: - type: string - description: Century the document was originally published - pattern: '[12][0-9]{3}-[12][0-9]{3}' - - publication_decade: - type: string - description: Decade the document was originally published - pattern: '[12][0-9]{3}-[12][0-9]{3}' - - number_of_pages: - type: number - description: Number of pages in this work (i.e. the number of images in the gt_workspace) - - layout: - type: string - enum: ['simple', 'complex'] - - fonts: - type: array - items: - type: string - enum: ['antiqua', 'fraktur', 'ancient_greek', 'hebrew'] - - provenance: - type: object - description: Information on which tools in which version were used in determining metrics - properties: - parameters: - type: object - description: Parameters passed to the evaluation processor - - EvaluationReport: - type: object - additionalProperties: false - description: The metrics measured for this document - properties: - document_wide: - type: object - description: Document-wide metrics - properties: { $ref: '#$defs/EvaluationMetrics' } - by_page: - type: array - description: Metrics page-by-page - items: - type: object - allOf: - - properties: - page_id: - type: string - description: PAGE ID - - properties: { $ref: '#$defs/EvaluationMetrics' } - - EvaluationMetrics: - - cer: - description: CER calculated over the text of a whole page (in by_page) or combined text of all pages (in document_wide) - -# To be implemented in the future. 
-# cer_mean: -# description: Arithmetic mean of the page-wise CER (in document_wide) or regions on a page (in by_page) -# -# cer_median: -# description: Median of the page-wise CER (in document_wide) or regions on a page (in by_page) - - cer_range: - type: array - minItems: 2 - maxItems: 2 - items: - type: number - description: Minimum and maximum of CER calculated over the text of a whole page (in by_page) or combined text of all pages (in document_wide) - -# To be implemented in the future. -# cer_standard_deviation: -# description: Standard deviation the page-wise CER (in document_wide) or regions on a page (in by_page) - - wer: - description: CER calculated over the text of a whole page (in by_page) or combined text of all pages (in document_wide) - - wall_time: - description: Actual time needed for processing workflow - - cpu_time: - description: Cumulative CPU time used for processing workflow - - pages_per_minute: - description: Number of pages processed per minute - - From c7ae88d012358119ac406128cf239a37386cabc7 Mon Sep 17 00:00:00 2001 From: Michelle Weidling Date: Mon, 23 Jan 2023 11:29:32 +0100 Subject: [PATCH 15/56] integrate Uwe's feedback --- ocrd_eval.md | 31 ++++++++++++------------------- 1 file changed, 12 insertions(+), 19 deletions(-) diff --git a/ocrd_eval.md b/ocrd_eval.md index bf87a06..a44feb5 100644 --- a/ocrd_eval.md +++ b/ocrd_eval.md @@ -7,13 +7,13 @@ evaluation metrics and evaluation tools that need to work together in a well-defined manner to allow users to make informed decisions about which OCR solution works best for their use case. -## Evaluation metrics +## Evaluation Metrics The evaluation of the success (accuracy) of OCR is a complex task for which multiple methods and metrics are available. It aims to capture quality in different aspects, such as the recognition of text, but also the detection of layout, for which different methods and metrics are needed. Furthermore, the time and resources required for OCR processing also have to be captured. Here we describe the metrics that were selected for use in OCR-D, how exactly they are applied, and what was the motivation. -### Scope of these Definitions +### Scope of These Definitions At this stage (Q3 2022) these definitions serve as a basis of common understanding for the metrics used in the benchmarking presented in OCR-D QUIVER. Further implications for evaluation tools do not yet apply. @@ -21,22 +21,14 @@ At this stage (Q3 2022) these definitions serve as a basis of common understandi The most important measure to assess the quality of OCR is the accuracy of the recognized text. The majority of metrics for this are based on the Levenshtein distance, an algorithm to compute the distance between two strings. In OCR, one of these strings is generally the Ground Truth text and the other the recognized text which is the result of an OCR. -#### Levenshtein Distance +#### Levenshtein Distance (Edit Distance) Levenshtein distance between two strings `a` and `b` is the number of edit operations needed to turn `a` into `b`. Edit operations depend on the specific variant of the algorithm but for OCR, relevant operations are deletion, insertion and substitution. The Levenshtein distance forms the basis for the calculation of [CER/WER](https://pad.gwdg.de/#CERWER). +As there are different implementations of the edit distance available (e.g. rapidfuzz, jellyfish, …), the OCR-D coordination project will provide a recommendation in the final version of this document. 
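For orientation, a minimal sketch (not mandated by this spec) of how the two edit-distance implementations named above can be tried out on the GT/OCR pair used as an example in this section. It assumes the `rapidfuzz` and `jellyfish` packages are installed; the spec itself does not yet prescribe either one.

```python
# Compare two candidate edit-distance implementations on the spec's GT/OCR example.
# Assumption: rapidfuzz >= 2.x and jellyfish are installed (pip install rapidfuzz jellyfish).
from rapidfuzz.distance import Levenshtein
import jellyfish

gt, ocr = "ſind", "fmd"

# Both calls count the minimum number of single-character insertions,
# deletions and substitutions needed to turn one string into the other.
print(Levenshtein.distance(gt, ocr))            # 3  (f -> ſ, insert i, m -> n)
print(jellyfish.levenshtein_distance(gt, ocr))  # 3
```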
-##### General example - -The Levenshtein distance between "Monday" and "Tuesday" is 4, because 4 edit operations are necessary to turn "Monday" into "Tuesday": - -* **M**onday --> **T**onday (substitution) -* T**o**nday --> T**u**nday (substitution) -* Tu**n**day --> Tu**e**day (substitution) -* Tueday --> Tue**s**day (insertion) - -##### OCR example +##### Example Given a Ground truth that reads `ſind` and the recognized text `fmd`. @@ -51,6 +43,7 @@ The Levenshtein distance between these texts is 4, because 4 edit operations are ##### Characters A text consists of a set of characters that have a certain meaning. A character is a glyph that represents a word, a letter in a word, or a symbol. +Not included in the character definition are all forms of white spaces. ###### Examples @@ -60,7 +53,7 @@ A text consists of a set of characters that have a certain meaning. A character ##### Character Error Rate (CER) -The character error rate (CER) describes how many faulty characters the output of an OCR engine contains compaired to the Ground Truth text in relation to the text length. +The character error rate (CER) describes how many faulty characters the output of an OCR engine contains compaired to the Ground Truth text in relation to the text length (i.e. the number of characters of the text). Errors fall into one of the following three categories: @@ -218,7 +211,7 @@ Memory usage is the number of bytes the process allocates in memory (RAM). Disk usage is the number of bytes the process allocates on hard disk. -### Unicode normalization +### Unicode Normalization In Unicode there can be multiple ways to express characters that have multiple components, such as a base letter and an accent. For evaluation it is essential that both Ground Truth and OCR results are normalized *in the same way* before evaluation. @@ -238,13 +231,13 @@ The Unicode normalization algorithms rely on data from the Unicode database on e The following metrics are not part of the MVP (minimal viable product) and will (if ever) be implemented at a later stage. -#### GPU metrics +#### GPU Metrics -##### GPU time +##### GPU Time GPU time is the time a GPU (graphics card) spent processing instructions -##### GPU avg memory +##### GPU Avg Memory GPU avg memory refers to the average amount of memory of the GPU (in GiB) that was used during processing. @@ -343,7 +336,7 @@ The mean Average Precision is a metric used to measure how accurate an object de $mAP = \displaystyle\frac{1}{N}\sum_{i=1}^{N}AP_i$ with $N$ being the number of thresholds. -##### Scenario-driven Performance Evaluation +##### Scenario-Driven Performance Evaluation Scenario-driven performance evaluation as described in [Clausner et al., 2011](https://primaresearch.org/publications/ICDAR2011_Clausner_PerformanceEvaluation) is currently the most comprehensive and sophisticated approach to evaluate OCR success with consideration of layout. 
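To make the Unicode normalization requirement above concrete, here is a standard-library sketch (illustration only; NFC is chosen arbitrarily, the point is that GT and OCR must use the *same* form) of how two visually identical strings can differ at the code-point level and thus inflate any distance-based metric:

```python
# Standard-library illustration of canonically equivalent but byte-different strings.
import unicodedata

precomposed = "\u00fc"   # 'ü' as a single code point (U+00FC)
decomposed = "u\u0308"   # 'u' followed by U+0308 COMBINING DIAERESIS

print(precomposed == decomposed)   # False – same glyph, different code points

# After normalizing both sides to the same form (here NFC) the spurious difference disappears.
print(unicodedata.normalize("NFC", precomposed) ==
      unicodedata.normalize("NFC", decomposed))   # True
```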
From 7ce6c1ac9de07eaffed0a081a6aeeed401333e3d Mon Sep 17 00:00:00 2001 From: mweidling <13831557+mweidling@users.noreply.github.com> Date: Mon, 23 Jan 2023 14:22:36 +0100 Subject: [PATCH 16/56] Update ocrd_eval.md Co-authored-by: Konstantin Baierer --- ocrd_eval.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocrd_eval.md b/ocrd_eval.md index a44feb5..6afdcda 100644 --- a/ocrd_eval.md +++ b/ocrd_eval.md @@ -26,7 +26,7 @@ The most important measure to assess the quality of OCR is the accuracy of the r Levenshtein distance between two strings `a` and `b` is the number of edit operations needed to turn `a` into `b`. Edit operations depend on the specific variant of the algorithm but for OCR, relevant operations are deletion, insertion and substitution. The Levenshtein distance forms the basis for the calculation of [CER/WER](https://pad.gwdg.de/#CERWER). -As there are different implementations of the edit distance available (e.g. rapidfuzz, jellyfish, …), the OCR-D coordination project will provide a recommendation in the final version of this document. +As there are different implementations of the edit distance available (e.g. [rapidfuzz](https://maxbachmann.github.io/RapidFuzz/Usage/distance/Levenshtein.html), [jellyfish](https://jamesturk.github.io/jellyfish/functions/#levenshtein-distance), …), the OCR-D coordination project will provide a recommendation in the final version of this document. ##### Example From ef8aeea26c982095c10f279f436406ce7f451f05 Mon Sep 17 00:00:00 2001 From: mweidling <13831557+mweidling@users.noreply.github.com> Date: Mon, 23 Jan 2023 14:25:16 +0100 Subject: [PATCH 17/56] Update ocrd_eval.md Co-authored-by: Konstantin Baierer --- ocrd_eval.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocrd_eval.md b/ocrd_eval.md index 6afdcda..9c5e4b0 100644 --- a/ocrd_eval.md +++ b/ocrd_eval.md @@ -53,7 +53,7 @@ Not included in the character definition are all forms of white spaces. ##### Character Error Rate (CER) -The character error rate (CER) describes how many faulty characters the output of an OCR engine contains compaired to the Ground Truth text in relation to the text length (i.e. the number of characters of the text). +The character error rate (CER) describes how many faulty characters the output of an OCR engine contains compaired to the Ground Truth text in relation to the text length (i.e. the number of characters of the text in the GT) Errors fall into one of the following three categories: From e2d2ec93c5cf627a3c08119bea6d4871aa41b856 Mon Sep 17 00:00:00 2001 From: mweidling <13831557+mweidling@users.noreply.github.com> Date: Fri, 3 Feb 2023 08:00:29 +0100 Subject: [PATCH 18/56] Update ocrd_eval.md Co-authored-by: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> --- ocrd_eval.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocrd_eval.md b/ocrd_eval.md index 9c5e4b0..f89bcbd 100644 --- a/ocrd_eval.md +++ b/ocrd_eval.md @@ -32,7 +32,7 @@ As there are different implementations of the edit distance available (e.g. [rap Given a Ground truth that reads `ſind` and the recognized text `fmd`. -The Levenshtein distance between these texts is 4, because 4 edit operations are necessary to turn `fmd` into `ſind`: +The Levenshtein distance between these texts is 3, because 3 single-character edit operations are necessary to turn `fmd` into `ſind`. 
For example: * `fmd` --> `ſmd` (substitution) * `ſmd` --> `ſimd` (insertion) From ad975948706617f0757a834ffc01b327afabb3a6 Mon Sep 17 00:00:00 2001 From: mweidling <13831557+mweidling@users.noreply.github.com> Date: Fri, 3 Feb 2023 08:06:22 +0100 Subject: [PATCH 19/56] Update ocrd_eval.md Co-authored-by: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> --- ocrd_eval.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ocrd_eval.md b/ocrd_eval.md index f89bcbd..b68fe70 100644 --- a/ocrd_eval.md +++ b/ocrd_eval.md @@ -9,7 +9,8 @@ solution works best for their use case. ## Evaluation Metrics -The evaluation of the success (accuracy) of OCR is a complex task for which multiple methods and metrics are available. It aims to capture quality in different aspects, such as the recognition of text, but also the detection of layout, for which different methods and metrics are needed. +The evaluation of the quality (accuracy and precision) of OCR is a complex task, for which multiple methods and metrics are available. +It needs to capture several aspects corresponding to the interdependent subtasks of an OCR workflow, viz. layout analysis and text recognition, which themselves require different methods and metrics. Furthermore, the time and resources required for OCR processing also have to be captured. Here we describe the metrics that were selected for use in OCR-D, how exactly they are applied, and what was the motivation. From 34a78cbbfe045b017b631029290dd9d05096454b Mon Sep 17 00:00:00 2001 From: mweidling <13831557+mweidling@users.noreply.github.com> Date: Fri, 3 Feb 2023 08:09:18 +0100 Subject: [PATCH 20/56] Update ocrd_eval.md Co-authored-by: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> --- ocrd_eval.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocrd_eval.md b/ocrd_eval.md index b68fe70..c0133f3 100644 --- a/ocrd_eval.md +++ b/ocrd_eval.md @@ -56,7 +56,7 @@ Not included in the character definition are all forms of white spaces. The character error rate (CER) describes how many faulty characters the output of an OCR engine contains compaired to the Ground Truth text in relation to the text length (i.e. the number of characters of the text in the GT) -Errors fall into one of the following three categories: +Thus, CER defines a (single-character) **error** in terms of the above three categories of edit operations: * **deletion**: a character that is present in the text has been deleted from the output. From c95ce0bfecb6cf937268e8a0043c168e0c3d9add Mon Sep 17 00:00:00 2001 From: mweidling <13831557+mweidling@users.noreply.github.com> Date: Fri, 3 Feb 2023 08:10:37 +0100 Subject: [PATCH 21/56] Update ocrd_eval.md Co-authored-by: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> --- ocrd_eval.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/ocrd_eval.md b/ocrd_eval.md index c0133f3..cab5cec 100644 --- a/ocrd_eval.md +++ b/ocrd_eval.md @@ -24,7 +24,9 @@ The most important measure to assess the quality of OCR is the accuracy of the r #### Levenshtein Distance (Edit Distance) -Levenshtein distance between two strings `a` and `b` is the number of edit operations needed to turn `a` into `b`. Edit operations depend on the specific variant of the algorithm but for OCR, relevant operations are deletion, insertion and substitution. +Levenshtein distance between two strings is defined as the (minimum) number of (single-character) edit operations needed to turn the one into the other. 
+Edit operations depend on the specific variant of the algorithm but for OCR, relevant operations are deletion, insertion and substitution. +To calculate the edit distance, the two strings first have to be (optimally) aligned. The Levenshtein distance forms the basis for the calculation of [CER/WER](https://pad.gwdg.de/#CERWER). As there are different implementations of the edit distance available (e.g. [rapidfuzz](https://maxbachmann.github.io/RapidFuzz/Usage/distance/Levenshtein.html), [jellyfish](https://jamesturk.github.io/jellyfish/functions/#levenshtein-distance), …), the OCR-D coordination project will provide a recommendation in the final version of this document. From 13b2bcde91eb18ba99efd94f799b975ef56332ae Mon Sep 17 00:00:00 2001 From: mweidling <13831557+mweidling@users.noreply.github.com> Date: Fri, 3 Feb 2023 08:12:08 +0100 Subject: [PATCH 22/56] Update ocrd_eval.md Co-authored-by: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> --- ocrd_eval.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ocrd_eval.md b/ocrd_eval.md index cab5cec..09d0958 100644 --- a/ocrd_eval.md +++ b/ocrd_eval.md @@ -85,7 +85,8 @@ This reads `diese Strahlen, und`. The output contains `Strahlen ,`, inserting a CER can be calculated in several ways, depending on whether a normalized CER is used or not. -Given $i$ as the number of insertions, $d$ the number of deletions, $s$ the number of substitutions and $n$ the total number of characters in a text, the CER can be obtained by +Given $i$ as the number of insertions, $d$ the number of deletions, $s$ the number of substitutions of the OCR text, +and $n$ the total number of characters of the GT text, the CER can be obtained by $CER = \frac{i + s+ d}{n}$ From b823afc1be9fd9aaf2e6e9f6b6a05b911a409633 Mon Sep 17 00:00:00 2001 From: mweidling <13831557+mweidling@users.noreply.github.com> Date: Fri, 3 Feb 2023 08:13:45 +0100 Subject: [PATCH 23/56] Update ocrd_eval.md Co-authored-by: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> --- ocrd_eval.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/ocrd_eval.md b/ocrd_eval.md index 09d0958..2849066 100644 --- a/ocrd_eval.md +++ b/ocrd_eval.md @@ -342,7 +342,9 @@ $mAP = \displaystyle\frac{1}{N}\sum_{i=1}^{N}AP_i$ with $N$ being the number of ##### Scenario-Driven Performance Evaluation -Scenario-driven performance evaluation as described in [Clausner et al., 2011](https://primaresearch.org/publications/ICDAR2011_Clausner_PerformanceEvaluation) is currently the most comprehensive and sophisticated approach to evaluate OCR success with consideration of layout. +Scenario-driven, layout-dedicated, text-flow informed performance evaluation as described in +[Clausner et al., 2011](https://primaresearch.org/publications/ICDAR2011_Clausner_PerformanceEvaluation) +is currently the most comprehensive and sophisticated approach to evaluate the quality of layout analysis. The approach is based on the definition of so called evaluation scenarios, which allow the flexible combination of a selection of metrics together with their weights, targeted at a specific use case. 
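A self-contained sketch of the non-normalized CER defined by the formula above, together with the analogous word-level calculation; the helper names and the naive whitespace tokenization are illustrative only and not part of the spec (an actual evaluation tool would tokenize words per Unicode TR29):

```python
# Non-normalized CER/WER: edit operations divided by the number of GT characters / GT words.
def levenshtein(a, b):
    """Minimum number of single-element insertions, deletions and substitutions."""
    prev = list(range(len(b) + 1))
    for i, x in enumerate(a, 1):
        cur = [i]
        for j, y in enumerate(b, 1):
            cur.append(min(prev[j] + 1,              # delete x
                           cur[j - 1] + 1,           # insert y
                           prev[j - 1] + (x != y)))  # substitute (or match)
        prev = cur
    return prev[-1]

def cer(gt: str, ocr: str) -> float:
    return levenshtein(gt, ocr) / len(gt)

def wer(gt: str, ocr: str) -> float:
    gt_words, ocr_words = gt.split(), ocr.split()    # simplistic whitespace tokenization
    return levenshtein(gt_words, ocr_words) / len(gt_words)

print(cer("ſind", "fmd"))   # 0.75  (3 edits / 4 GT characters)
print(wer("diese Strahlen, und", "diese Strahlen , und"))
# ~0.67 (2 word-level edits / 3 GT words under this naive tokenization)
```

Because the denominator is the GT length, values above 1.0 are possible when the OCR output contains many insertions, as noted above.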
From 8ab13910edeb9b114aecfbd5f48d10ba6c9d0f86 Mon Sep 17 00:00:00 2001 From: mweidling <13831557+mweidling@users.noreply.github.com> Date: Fri, 3 Feb 2023 08:32:23 +0100 Subject: [PATCH 24/56] Apply suggestions from code review Co-authored-by: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> --- ocrd_eval.md | 50 ++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 36 insertions(+), 14 deletions(-) diff --git a/ocrd_eval.md b/ocrd_eval.md index 2849066..ebe1dd9 100644 --- a/ocrd_eval.md +++ b/ocrd_eval.md @@ -92,7 +92,10 @@ $CER = \frac{i + s+ d}{n}$ If the CER value is calculated this way, it represents the percentage of characters incorrectly recognized by the OCR engine. Also, we can easily reach error rates beyond 100% when the output contains a lot of insertions. -The *normalized* CER tries to mitigate this effect by considering the number of correct characters, $c$: +Sometimes, this is mitigated by defining $n$ as the maximum of both lengths, or by clipping the rate at 100%. +Neither of these strategies yields an unbiased estimate. + +The *normalized* CER avoids this effect by considering the number of correct characters (or identity operations), $c$: $CER_n = \frac{i + s+ d}{i + s + d + c}$ @@ -102,11 +105,14 @@ In OCR-D's benchmarking we calculate the *non-normalized* CER where values over In OCR-D we distinguish between the CER per **page** and the **overall** CER of a text. The reasoning behind this is that the material OCR-D mainly aims at (historical prints) is very heterogeneous: Some pages might have an almost simplistic layout while others can be highly complex and difficult to process. Providing only an overall CER would cloud these differences between pages. -At this point we only provide a CER per page; an overall CER might be calculated as a weighted aggregate at a later stage. +Currently we only provide CER per page; higher-level CER results might be calculated as a weighted aggregate at a later stage. ##### Word Error Rate (WER) -The word error rate (WER) is closely connected to the CER. While the CER focusses on differences between characters, the WER represents the percentage of words incorrectly recognized in a text. +Word error rate (WER) is analogous to CER: While CER operates on (differences between) characters, +WER measures the percentage of incorrectly recognized words in a text. + +A **word** in that context is usually defined as any sequence of characters between white space (including line breaks), with leading and trailing punctuation removed (according to [Unicode TR29 Word Boundary algorithm](http://unicode.org/reports/tr29/#Word_Boundaries)). CER and WER share categories of errors, and the WER is similarly calculated: @@ -120,7 +126,7 @@ More specific cases of WER consider only the "significant" words, omitting e.g. In OCR-D we distinguish between the WER per **page** and the **overall** WER of a text. The reasoning here follows the one of CER granularity. -At this point we only provide a WER per page; an overall WER might be calculated at a later stage. +Currently we only provide WER per page; higher-level WER results might be calculated at a later stage. #### Bag of Words @@ -197,7 +203,7 @@ Last but not least, it is important to collect information about the resource ut #### CPU Time -CPU time is the time taken by the CPU to process an instruction. It does not include idle time. +CPU time is the time taken by the CPU(s) on the processors. It does not include idle time, but does grow with the number of threads/processes. 
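To illustrate the distinction between the two timing metrics, a small standard-library sketch; how an evaluation tool actually measures these values is not prescribed here:

```python
# Wall time keeps running while a process sleeps or waits for I/O;
# CPU time only accumulates while instructions are actually being executed
# (and sums over all threads of the process).
import time

wall_start, cpu_start = time.perf_counter(), time.process_time()

sum(i * i for i in range(2_000_000))   # CPU-bound work: advances both clocks
time.sleep(1.0)                        # idle wait: advances only the wall clock

wall = time.perf_counter() - wall_start
cpu = time.process_time() - cpu_start
print(f"wall time: {wall:.2f} s, CPU time: {cpu:.2f} s")
```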
#### Wall Time @@ -205,15 +211,15 @@ Wall time (or elapsed time) is the time taken by a processor to process an instr #### I/O -I/O (input / output) is the number of bytes read and written during a process. +I/O (input / output) bandwith is the (average/peak) number of bytes per second read and written from disk during processing. #### Memory Usage -Memory usage is the number of bytes the process allocates in memory (RAM). +Memory usage is the (average/peak) number of bytes the process allocates in memory (RAM), i.e. resident set size (RSS) or proportional set size (PSS). #### Disk Usage -Disk usage is the number of bytes the process allocates on hard disk. +Disk usage is the total number of bytes the process reads and writes on disk. ### Unicode Normalization @@ -249,9 +255,14 @@ GPU avg memory refers to the average amount of memory of the GPU (in GiB) that w ##### Flexible Character Accuracy Measure -The flexible character accuracy measure has been introduced to mitigate a major flaw of the CER: The CER is heavily dependent on the reading order an OCR engine detects; When content blocks are e.g. mixed up or merged during the text recognition step but single characters have been perfectly recognized, the CER is still very low. +The Flexible Character Accuracy (FCA) measure has been introduced to mitigate a major drawback of CER: +CER (if applied naively by comparing concatenated page-level texts) is heavily dependent on the reading order an OCR engine detects. +Thus, where text blocks are rearranged or merged, no suitable text alignment can be made, so CER is very low, +even if single characters, words and even lines have been perfectly recognized. -The flexible character accuracy measure circumvents this effect by splitting the recognized text and the Ground Truth in smaller chunks and measure their partial edit distance. After all partial edit distances have been obtained, they are summed up to receive the overall character accuracy measure. +FCA avoids this by splitting the recognized text and GT into lines and, if necessary, sub-line chunks, +finding pairs that align maximally until only unmatched lines remain (which must be treated as errors), +and measuring average CER of all pairs. The algorithm can be summarized as follows: @@ -271,6 +282,11 @@ The algorithm can be summarized as follows: ##### mAP (mean Average Precision) +This score was originally devised for object detection in photo scenery (where overlaps are allowed and cannot conflict with text flow). +It is not adequate for document layout for various reasons, but since it is a standard metric in the domain of neural computer vision, +methods and tools of which are increasingly used for layout analysis as well, it is still somewhat useful for reference. + +The following paragraphs will first introduce the intermediate concepts needed to define the mAP metric itself. ###### Precision and Recall **Precision** is a means to describe how accurate a model can identify an object within an image. The higher the precision of a model, the more confidently we can assume that a prediction (e.g. the model having identified a bicycle in an image) is correct. A precision of 1 indicates that each identified object in an image has been correctly identified (true positives) and no false positives have been detected. As the precision value descreases, the result contains more and more false positives. @@ -288,7 +304,7 @@ A threshold is a freely chosen number between 0 and 1. 
It divides the output of Example: Given a threshold of 0.6 and a model that tries to detect bicycles in an image. The model returns two areas in an image that might be bicycles, one with a prediction score of 0.4 and one with 0.9. Since the threshold equals 0.6, the first area is tossed and not regarded as bicycle while the second one is kept and counted as recognized. -###### Precision-Recall-Curve +###### Precision-Recall Curve Precision and recall are connected to each other since both depend on the true positives detected. A precision-recall-curve is a means to balance these values while maximizing them. @@ -320,7 +336,7 @@ The Average Precision can be computed with the weighted mean of precision at eac $AP = \displaystyle\sum_{k=0}^{k=n-1}[r(k) - r(k+1)] * p(k)$ -with $n$ being the number of thresholds and $r(k)$/$p(k)$ being the respective recall/precision values for the current confidence threshold $k$. +with $n$ being the number of thresholds and $r(k)/p(k)$ being the respective recall/precision values for the current confidence threshold $k$. Example: Given the example above, we get: @@ -334,19 +350,25 @@ AP & = \displaystyle\sum_{k=0}^{k=n-1}[r(k) - r(k+1)] * p(k) \\ \end{array} $$ -###### mAP (mean Average Precision) +###### Mean Average Precision The mean Average Precision is a metric used to measure how accurate an object detector is. [As stated](#Thresholds), a threshold can be chosen freely, so there is some room for errors when picking one single threshold. To mitigate this effect, the mean Average Precision metric has been introduced which considers a set of IoU thresholds to determine the detector's performance. It is calculated by first computing the Average Precision for each IoU threshold and then finding the average: $mAP = \displaystyle\frac{1}{N}\sum_{i=1}^{N}AP_i$ with $N$ being the number of thresholds. +Often, this mAP for a range of IoU thresholds gets complemented by additional mAP runs for a set of fixed values, or for various classes and object sizes only. +The common understanding is that those different measures collectively allow drawing better conclusions and comparisons about the model's quality. ##### Scenario-Driven Performance Evaluation Scenario-driven, layout-dedicated, text-flow informed performance evaluation as described in [Clausner et al., 2011](https://primaresearch.org/publications/ICDAR2011_Clausner_PerformanceEvaluation) is currently the most comprehensive and sophisticated approach to evaluate the quality of layout analysis. -The approach is based on the definition of so called evaluation scenarios, which allow the flexible combination of a selection of metrics together with their weights, targeted at a specific use case. +It is not a single metric, but comprises a multitude of measures derived in a unified method, which considers +the crucial effects that segmentation can have on text flow, i.e. which kinds of overlaps (merges and splits) +amount to benign deviations (extra white-space) or pathological ones (breaking lines and words apart). +In this approach, all the derived measures are aggregated under various sets of weights, called evaluation scenarios, +which target specific use cases (like headline or keyword extraction, linear fulltext, newspaper or figure extraction). 
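A small sketch of the AP summation introduced above: precision values are weighted by the drop in recall between consecutive confidence thresholds. The precision/recall pairs below are made up for this illustration.

```python
def average_precision(points):
    """AP = sum_k [r(k) - r(k+1)] * p(k) over (recall, precision) pairs
    ordered by increasing confidence threshold; r(n) is taken as 0."""
    ap = 0.0
    for k, (recall, precision) in enumerate(points):
        next_recall = points[k + 1][0] if k + 1 < len(points) else 0.0
        ap += (recall - next_recall) * precision
    return ap

# hypothetical precision/recall pairs for increasing confidence thresholds
curve = [(1.0, 0.5), (0.8, 0.57), (0.6, 0.67), (0.4, 0.8), (0.2, 1.0)]
print(round(average_precision(curve), 3))  # 0.708 for these made-up values
```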
## Evaluation JSON schema From 5e1da316367baba6f3ded7689adad8743526c098 Mon Sep 17 00:00:00 2001 From: mweidling <13831557+mweidling@users.noreply.github.com> Date: Fri, 3 Feb 2023 09:05:03 +0100 Subject: [PATCH 25/56] Apply suggestions from code review Co-authored-by: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> --- ocrd_eval.md | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/ocrd_eval.md b/ocrd_eval.md index ebe1dd9..5480957 100644 --- a/ocrd_eval.md +++ b/ocrd_eval.md @@ -289,7 +289,16 @@ methods and tools of which are increasingly used for layout analysis as well, it The following paragraphs will first introduce the intermediate concepts needed to define the mAP metric itself. ###### Precision and Recall -**Precision** is a means to describe how accurate a model can identify an object within an image. The higher the precision of a model, the more confidently we can assume that a prediction (e.g. the model having identified a bicycle in an image) is correct. A precision of 1 indicates that each identified object in an image has been correctly identified (true positives) and no false positives have been detected. As the precision value descreases, the result contains more and more false positives. +**Precision** describes to which degree the predictions of a model are correct. +The higher the precision of a model, the more confidently we can assume that each prediction is correct +(e.g. the model having identified a bicycle in an image actually depicts a bicycle). +A precision of 1 (or 100%) indicates all predictions are correct (true positives) and no predictions are incorrect (false positives). The lower the precision value, the more false positives. + +In the context of object detection in images, it measures either +- the ratio of correctly detected segments over all detected segments + (where _correct_ is defined as having sufficient overlap with some GT segment), or +- the ratio of correctly segmented pixels over the image size + (assuming all predictions can be combined into some coherent segmentation). **Recall**, on the other hand, measures how well a model performs in finding all instances of an object in an image (true positives), irregardless of false positives. Given a model tries to identify bicycles in an image, a recall of 1 indicates that all bicycles have been found by the model (while not considering other objects that have been falsely labelled as a bicycle). @@ -328,7 +337,9 @@ This graph is called Precision-Recall-Curve. ###### Average Precision -The average precision (AP) describes how well a model can detect objects in an image for recall values over 0 to 1 by computing the average of all precisions given in the Precision-Recall-Curve. It is equal to the area under the curve. +Average Precision (AP) describes how well (flexible and robust) a model can detect objects in an image, +by averaging precision over the full range (from 0 to 1) of confidence thresholds (and thus, recall results). +It is equal to the area under the Precision-Recall Curve. ![A sample precision/recall curve with highlighted area under curve](https://pad.gwdg.de/uploads/799e6a05-e64a-4956-9ede-440ac0463a3f.png) @@ -352,7 +363,10 @@ $$ ###### Mean Average Precision -The mean Average Precision is a metric used to measure how accurate an object detector is. [As stated](#Thresholds), a threshold can be chosen freely, so there is some room for errors when picking one single threshold. 
To mitigate this effect, the mean Average Precision metric has been introduced which considers a set of IoU thresholds to determine the detector's performance. It is calculated by first computing the Average Precision for each IoU threshold and then finding the average: +Mean Average Precision (mAP) is a metric used to measure the full potential of an object detector over various conditions. +AP is merely an average over confidence thresholds. But as [stated earlier](#iou-thresholds), the IoU threshold can be chosen freely, +so AP only reflects the performance under that particular choice. In general though, how accurately every object must be matched may depend on the use-case, and on the class or size of the objects. +That's why the mAP metric has been introduced: It is calculated by computing the AP over a range of IoU thresholds, and averaging over them: $mAP = \displaystyle\frac{1}{N}\sum_{i=1}^{N}AP_i$ with $N$ being the number of thresholds. From 0183ea9792e191d76d8615c99478ce88ea0f2854 Mon Sep 17 00:00:00 2001 From: mweidling <13831557+mweidling@users.noreply.github.com> Date: Fri, 3 Feb 2023 10:00:25 +0100 Subject: [PATCH 26/56] Update ocrd_eval.md Co-authored-by: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> --- ocrd_eval.md | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/ocrd_eval.md b/ocrd_eval.md index 5480957..d2cda8f 100644 --- a/ocrd_eval.md +++ b/ocrd_eval.md @@ -2,10 +2,17 @@ ## Rationale -Estimating the quality of OCR requires workflows run on representative data, -evaluation metrics and evaluation tools that need to work together in a -well-defined manner to allow users to make informed decisions about which OCR -solution works best for their use case. +Evaluating the quality of OCR requires comparing the OCR results on representative **ground truth** (GT) +– i.e. realistic data (images) with manual transcriptions (segmentation, text). +OCR results can be obtained via several distinct **OCR workflows**. + +The comparison requires evaluation tools which themselves build on a number of established +evaluation metrics. +The evaluation results must be presented in a way that allows factorising and localising aberrations, +both within documents (page types, individual pages, region types, individual regions) and across classes of similar documents. + + All this needs to work together in a well-defined and automatically repeatable manner, so + users can make informed decisions about which OCR workflow works best for which material and use case. ## Evaluation Metrics From 5519120e476a2d8d648375628c9345f75597aeb7 Mon Sep 17 00:00:00 2001 From: Michelle Weidling Date: Tue, 7 Feb 2023 09:07:43 +0100 Subject: [PATCH 27/56] update character definition wrt. white spaces --- ocrd_eval.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocrd_eval.md b/ocrd_eval.md index d2cda8f..0c8b272 100644 --- a/ocrd_eval.md +++ b/ocrd_eval.md @@ -53,7 +53,7 @@ The Levenshtein distance between these texts is 3, because 3 single-character ed ##### Characters A text consists of a set of characters that have a certain meaning. A character is a glyph that represents a word, a letter in a word, or a symbol. -Not included in the character definition are all forms of white spaces. +White spaces are considered as characters. 
###### Examples From dd2d63b0f8d1acc42d45a8880dad42d30224eca4 Mon Sep 17 00:00:00 2001 From: Michelle Weidling Date: Thu, 9 Feb 2023 11:45:42 +0100 Subject: [PATCH 28/56] refine paragraph about characters --- ocrd_eval.md | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/ocrd_eval.md b/ocrd_eval.md index 0c8b272..a222bd3 100644 --- a/ocrd_eval.md +++ b/ocrd_eval.md @@ -2,7 +2,7 @@ ## Rationale -Evaluating the quality of OCR requires comparing the OCR results on representative **ground truth** (GT) +Evaluating the quality of OCR requires comparing the OCR results on representative **ground truth** (GT) – i.e. realistic data (images) with manual transcriptions (segmentation, text). OCR results can be obtained via several distinct **OCR workflows**. @@ -52,14 +52,16 @@ The Levenshtein distance between these texts is 3, because 3 single-character ed ##### Characters -A text consists of a set of characters that have a certain meaning. A character is a glyph that represents a word, a letter in a word, or a symbol. +A text consists of a set of characters that have a certain meaning. +In OCR-D, a character is technically defined as a grapheme cluster, i.e. one or more Unicode (or Private Use Area) codepoint(s) that represents an element of a writing system in NFC (see [#Unicode-Normalization]). White spaces are considered as characters. +Special codepoints like Byte-Order Marks or directional marks are ignored. + ###### Examples -* the character `a` in the text `babst` represents the German letter `a` -* the character `&` represents the Latin abbreviation `etc.` -* the character `☿` represents an Astronomical symbol for the planet Mercury +* the character `ä` in the word `Kälte` is encoded by Unicode `U+00E4` +* the character `ܡܿ` in the word `ܡܿܢ` is encoded by Unicode `U+0721` + `U+073F` ##### Character Error Rate (CER) From 025372072baddef44b84bfaf09480c84da2aafc1 Mon Sep 17 00:00:00 2001 From: Michelle Weidling Date: Thu, 9 Feb 2023 12:39:25 +0100 Subject: [PATCH 29/56] move character section before edit distance section --- ocrd_eval.md | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/ocrd_eval.md b/ocrd_eval.md index a222bd3..73f2c5e 100644 --- a/ocrd_eval.md +++ b/ocrd_eval.md @@ -29,6 +29,19 @@ At this stage (Q3 2022) these definitions serve as a basis of common understandi The most important measure to assess the quality of OCR is the accuracy of the recognized text. The majority of metrics for this are based on the Levenshtein distance, an algorithm to compute the distance between two strings. In OCR, one of these strings is generally the Ground Truth text and the other the recognized text which is the result of an OCR. +#### Characters + +A text consists of a set of characters that have a certain meaning. +In OCR-D, a character is technically defined as a grapheme cluster, i.e. one or more Unicode (or Private Use Area) codepoint(s) that represents an element of a writing system in NFC (see [#Unicode-Normalization]). +White spaces are considered as characters. + +Special codepoints like Byte-Order Marks or directional marks are ignored. + +##### Examples + +* the character `ä` in the word `Kälte` is encoded by Unicode `U+00E4` +* the character `ܡܿ` in the word `ܡܿܢ` is encoded by Unicode `U+0721` + `U+073F` + #### Levenshtein Distance (Edit Distance) Levenshtein distance between two strings is defined as the (minimum) number of (single-character) edit operations needed to turn the one into the other. 
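The character definition above (grapheme clusters in NFC) can be made concrete with a short sketch. It uses the standard `unicodedata` module and the third-party `regex` package, whose `\X` pattern matches extended grapheme clusters (an assumption to verify for your environment; the built-in `re` module does not support it).

```python
import unicodedata
import regex  # third-party package; re does not support \X

decomposed = "Ka\u0308lte"                   # "Kälte" with a combining diaeresis
composed = unicodedata.normalize("NFC", decomposed)
print(len(decomposed), len(composed))        # 6 codepoints vs. 5 after NFC

syriac = "\u0721\u073F"                      # two codepoints ...
print(len(regex.findall(r"\X", syriac)))     # ... but one grapheme cluster
```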
@@ -50,19 +63,6 @@ The Levenshtein distance between these texts is 3, because 3 single-character ed #### CER and WER -##### Characters - -A text consists of a set of characters that have a certain meaning. -In OCR-D, a character is technically defined as a grapheme cluster, i.e. one or more Unicode (or Private Use Area) codepoint(s) that represents an element of a writing system in NFC (see [#Unicode-Normalization]). -White spaces are considered as characters. - -Special codepoints like Byte-Order Marks or directional marks are ignored. - -###### Examples - -* the character `ä` in the word `Kälte` is encoded by Unicode `U+00E4` -* the character `ܡܿ` in the word `ܡܿܢ` is encoded by Unicode `U+0721` + `U+073F` - ##### Character Error Rate (CER) The character error rate (CER) describes how many faulty characters the output of an OCR engine contains compaired to the Ground Truth text in relation to the text length (i.e. the number of characters of the text in the GT) From 19deddd3ab0f775025c628c04aafbe5670a501a3 Mon Sep 17 00:00:00 2001 From: Michelle Weidling Date: Thu, 9 Feb 2023 12:41:56 +0100 Subject: [PATCH 30/56] add placeholder for letter accuracy --- ocrd_eval.md | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/ocrd_eval.md b/ocrd_eval.md index 73f2c5e..aa968c8 100644 --- a/ocrd_eval.md +++ b/ocrd_eval.md @@ -262,6 +262,10 @@ GPU avg memory refers to the average amount of memory of the GPU (in GiB) that w #### Text Evaluation +##### Letter Accuracy + +TODO + ##### Flexible Character Accuracy Measure The Flexible Character Accuracy (FCA) measure has been introduced to mitigate a major drawback of CER: @@ -296,6 +300,7 @@ It is not adequate for document layout for various reasons, but since it is a st methods and tools of which are increasingly used for layout analysis as well, it is still somewhat useful for reference. The following paragraphs will first introduce the intermediate concepts needed to define the mAP metric itself. + ###### Precision and Recall **Precision** describes to which degree the predictions of a model are correct. @@ -303,10 +308,11 @@ The higher the precision of a model, the more confidently we can assume that eac (e.g. the model having identified a bicycle in an image actually depicts a bicycle). A precision of 1 (or 100%) indicates all predictions are correct (true positives) and no predictions are incorrect (false positives). The lower the precision value, the more false positives. -In the context of object detection in images, it measures either -- the ratio of correctly detected segments over all detected segments - (where _correct_ is defined as having sufficient overlap with some GT segment), or -- the ratio of correctly segmented pixels over the image size +In the context of object detection in images, it measures either + +* the ratio of correctly detected segments over all detected segments + (where *correct* is defined as having sufficient overlap with some GT segment), or +* the ratio of correctly segmented pixels over the image size (assuming all predictions can be combined into some coherent segmentation). **Recall**, on the other hand, measures how well a model performs in finding all instances of an object in an image (true positives), irregardless of false positives. Given a model tries to identify bicycles in an image, a recall of 1 indicates that all bicycles have been found by the model (while not considering other objects that have been falsely labelled as a bicycle). 
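To keep the two notions apart, here is a minimal sketch deriving precision and recall from counts of true positives, false positives and false negatives; the counts are hypothetical.

```python
def precision_recall(tp, fp, fn):
    """Precision = TP / (TP + FP); Recall = TP / (TP + FN)."""
    return tp / (tp + fp), tp / (tp + fn)

# hypothetical detection result: 8 correct regions, 2 spurious, 4 missed
precision, recall = precision_recall(tp=8, fp=2, fn=4)
print(round(precision, 2), round(recall, 2))  # 0.8 0.67
```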
From 149b2714adf29834bae63bfffd9196e9c43523c8 Mon Sep 17 00:00:00 2001 From: Michelle Weidling Date: Thu, 9 Feb 2023 12:44:21 +0100 Subject: [PATCH 31/56] fix link --- ocrd_eval.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocrd_eval.md b/ocrd_eval.md index aa968c8..e2c984c 100644 --- a/ocrd_eval.md +++ b/ocrd_eval.md @@ -32,7 +32,7 @@ The most important measure to assess the quality of OCR is the accuracy of the r #### Characters A text consists of a set of characters that have a certain meaning. -In OCR-D, a character is technically defined as a grapheme cluster, i.e. one or more Unicode (or Private Use Area) codepoint(s) that represents an element of a writing system in NFC (see [#Unicode-Normalization]). +In OCR-D, a character is technically defined as a grapheme cluster, i.e. one or more Unicode (or Private Use Area) codepoint(s) that represents an element of a writing system in NFC (see [Unicode Normalization](#unicode-normalization)). White spaces are considered as characters. Special codepoints like Byte-Order Marks or directional marks are ignored. From 7d0bbf6505952dde29fb2b2e31f391b24ddb3fb8 Mon Sep 17 00:00:00 2001 From: mweidling <13831557+mweidling@users.noreply.github.com> Date: Thu, 9 Feb 2023 12:48:07 +0100 Subject: [PATCH 32/56] Update ocrd_eval.md Co-authored-by: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> --- ocrd_eval.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocrd_eval.md b/ocrd_eval.md index e2c984c..d08c8a2 100644 --- a/ocrd_eval.md +++ b/ocrd_eval.md @@ -92,7 +92,7 @@ Example: This reads `diese Strahlen, und`. The output contains `Strahlen ,`, inserting a white space before the comma. -CER can be calculated in several ways, depending on whether a normalized CER is used or not. +CER can be defined in multiple ways, depending on what exactly counts as the **length** of the text. Given $i$ as the number of insertions, $d$ the number of deletions, $s$ the number of substitutions of the OCR text, and $n$ the total number of characters of the GT text, the CER can be obtained by From 851aeb718f524f51e3ffbbb5afba2bc9d2b25f56 Mon Sep 17 00:00:00 2001 From: mweidling <13831557+mweidling@users.noreply.github.com> Date: Thu, 9 Feb 2023 12:49:23 +0100 Subject: [PATCH 33/56] Update ocrd_eval.md Co-authored-by: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> --- ocrd_eval.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/ocrd_eval.md b/ocrd_eval.md index d08c8a2..816486f 100644 --- a/ocrd_eval.md +++ b/ocrd_eval.md @@ -65,7 +65,9 @@ The Levenshtein distance between these texts is 3, because 3 single-character ed ##### Character Error Rate (CER) -The character error rate (CER) describes how many faulty characters the output of an OCR engine contains compaired to the Ground Truth text in relation to the text length (i.e. the number of characters of the text in the GT) +The character error rate (CER) is defined as the quotient of the edit distance over the length +with respect to the character string pair of GT and OCR text. It thus describes an empirical estimate +of the probability of some random character to be misrecognised. 
Thus, CER defines a (single-character) **error** in terms of the above three categories of edit operations: From c81079ab8b6b37043ae0b16211d6a09ceb9b5e37 Mon Sep 17 00:00:00 2001 From: mweidling <13831557+mweidling@users.noreply.github.com> Date: Fri, 10 Feb 2023 08:05:56 +0100 Subject: [PATCH 34/56] Update ocrd_eval.md Co-authored-by: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> --- ocrd_eval.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocrd_eval.md b/ocrd_eval.md index 816486f..661778e 100644 --- a/ocrd_eval.md +++ b/ocrd_eval.md @@ -218,7 +218,7 @@ CPU time is the time taken by the CPU(s) on the processors. It does not include #### Wall Time -Wall time (or elapsed time) is the time taken by a processor to process an instruction including idle time. +Wall-clock time (or elapsed time) is the time taken on the processors including idle time but ignoring concurrency. #### I/O From fab62029984df5ec02b8e10019249cfaaf3bb90f Mon Sep 17 00:00:00 2001 From: mweidling <13831557+mweidling@users.noreply.github.com> Date: Fri, 10 Feb 2023 08:07:29 +0100 Subject: [PATCH 35/56] Update ocrd_eval.md Co-authored-by: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> --- ocrd_eval.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocrd_eval.md b/ocrd_eval.md index 661778e..2b26b72 100644 --- a/ocrd_eval.md +++ b/ocrd_eval.md @@ -110,7 +110,7 @@ The *normalized* CER avoids this effect by considering the number of correct cha $CER_n = \frac{i + s+ d}{i + s + d + c}$ -In OCR-D's benchmarking we calculate the *non-normalized* CER where values over 1 should be read as 100%. +In OCR-D's benchmarking we calculate the *normalized* CER where values naturally range between 0 and 100%. ###### CER Granularity From f678050667bb6bcbbc4002a84075c1789395d267 Mon Sep 17 00:00:00 2001 From: Michelle Weidling Date: Fri, 10 Feb 2023 08:09:52 +0100 Subject: [PATCH 36/56] implement feedback --- ocrd_eval.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/ocrd_eval.md b/ocrd_eval.md index 2b26b72..107aa19 100644 --- a/ocrd_eval.md +++ b/ocrd_eval.md @@ -27,7 +27,10 @@ At this stage (Q3 2022) these definitions serve as a basis of common understandi ### Text Evaluation -The most important measure to assess the quality of OCR is the accuracy of the recognized text. The majority of metrics for this are based on the Levenshtein distance, an algorithm to compute the distance between two strings. In OCR, one of these strings is generally the Ground Truth text and the other the recognized text which is the result of an OCR. +The most important measure to assess the quality of OCR is the accuracy of the recognized text. +The majority of metrics for this are based on the Levenshtein distance, an algorithm to compute the distance between two strings. +In OCR, one of these strings is generally the Ground Truth text and the other the recognized text which is the result of an OCR. +The text is concatenated at page level from smaller constituents in reading order. 
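Putting the normalized CER described above into code, a short sketch that counts the individual edit operations; it assumes the `editops` API of the rapidfuzz package mentioned earlier in this document (any edit-distance implementation exposing the operation counts would serve equally well).

```python
from rapidfuzz.distance import Levenshtein  # assumed third-party API

def normalized_cer(gt: str, ocr: str) -> float:
    ops = Levenshtein.editops(gt, ocr)       # minimal edit script GT -> OCR
    s = sum(op.tag == "replace" for op in ops)
    d = sum(op.tag == "delete" for op in ops)
    i = sum(op.tag == "insert" for op in ops)
    c = len(gt) - s - d                      # correct (untouched) GT characters
    return (i + s + d) / (i + s + d + c)

print(round(normalized_cer("diese Strahlen, und", "diese Strahlen , und"), 3))  # 0.05
```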
#### Characters From a680c7072f607a6c6337942317ac866ae13e55b4 Mon Sep 17 00:00:00 2001 From: Michelle Weidling Date: Fri, 10 Feb 2023 08:11:49 +0100 Subject: [PATCH 37/56] be more precise about CER/WER granularity --- ocrd_eval.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ocrd_eval.md b/ocrd_eval.md index 107aa19..f1384e5 100644 --- a/ocrd_eval.md +++ b/ocrd_eval.md @@ -117,7 +117,7 @@ In OCR-D's benchmarking we calculate the *normalized* CER where values naturally ###### CER Granularity -In OCR-D we distinguish between the CER per **page** and the **overall** CER of a text. The reasoning behind this is that the material OCR-D mainly aims at (historical prints) is very heterogeneous: Some pages might have an almost simplistic layout while others can be highly complex and difficult to process. Providing only an overall CER would cloud these differences between pages. +In OCR-D we distinguish between the CER per **page** and the **overall** CER of a document. The reasoning behind this is that the material OCR-D mainly aims at (historical prints) is very heterogeneous: Some pages might have an almost simplistic layout while others can be highly complex and difficult to process. Providing only an overall CER would cloud these differences between pages. Currently we only provide CER per page; higher-level CER results might be calculated as a weighted aggregate at a later stage. @@ -138,7 +138,7 @@ More specific cases of WER consider only the "significant" words, omitting e.g. ###### WER Granularity -In OCR-D we distinguish between the WER per **page** and the **overall** WER of a text. The reasoning here follows the one of CER granularity. +In OCR-D we distinguish between the WER per **page** and the **overall** WER of a document. The reasoning here follows the one of CER granularity. Currently we only provide WER per page; higher-level WER results might be calculated at a later stage. From 5e94aa089afbd60e37e9c23db7306448702299ae Mon Sep 17 00:00:00 2001 From: Michelle Weidling Date: Fri, 10 Feb 2023 08:18:41 +0100 Subject: [PATCH 38/56] change GPU metrics --- ocrd_eval.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/ocrd_eval.md b/ocrd_eval.md index f1384e5..efdec98 100644 --- a/ocrd_eval.md +++ b/ocrd_eval.md @@ -257,13 +257,13 @@ The following metrics are not part of the MVP (minimal viable product) and will #### GPU Metrics -##### GPU Time +##### GPU Avg Usage -GPU time is the time a GPU (graphics card) spent processing instructions +GPU avg usage is the average GPU load during the execution of a workflow represented by a real number between 0 and 1. -##### GPU Avg Memory +##### GPU Peak Usage -GPU avg memory refers to the average amount of memory of the GPU (in GiB) that was used during processing. +GPU peak usage is the maximum GPU load during the execution of a workflow represented by a real number between 0 and 1. #### Text Evaluation From e8dc864896beb89aa35ad7e180f378b2728f9d61 Mon Sep 17 00:00:00 2001 From: Michelle Weidling Date: Fri, 10 Feb 2023 08:22:54 +0100 Subject: [PATCH 39/56] change citation hint --- ocrd_eval.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocrd_eval.md b/ocrd_eval.md index efdec98..5ac8380 100644 --- a/ocrd_eval.md +++ b/ocrd_eval.md @@ -294,7 +294,7 @@ The algorithm can be summarized as follows: > 6. Count non-matched lines / strings as insertions or deletions (depending on origin: ground truth or result) > 7. 
Sum up all partial edit distances and calculate overall character accuracy -(C. Clausner, S. Pletschacher and A. Antonacopoulos / Pattern Recognition Letters 131 (2020) 390–397, p. 392) +(paraphrase of C. Clausner, S. Pletschacher and A. Antonacopoulos / Pattern Recognition Letters 131 (2020) 390–397, p. 392) #### Layout Evalutation From ee330c9a6739c537a2ad9e308d6823350d025f7c Mon Sep 17 00:00:00 2001 From: Michelle Weidling Date: Fri, 10 Feb 2023 08:26:52 +0100 Subject: [PATCH 40/56] adjust WER definition --- ocrd_eval.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ocrd_eval.md b/ocrd_eval.md index 5ac8380..4ebc333 100644 --- a/ocrd_eval.md +++ b/ocrd_eval.md @@ -130,9 +130,9 @@ A **word** in that context is usually defined as any sequence of characters betw CER and WER share categories of errors, and the WER is similarly calculated: -$WER = \frac{i_w + s_w + d_w}{n_w}$ +$WER = \frac{i_w + s_w + d_w}{i_w + s_w + d_w + c_w}$ -where $i_w$ is the number of inserted, $s_w$ the number of substituted, $d_w$ the number of deleted and $n_w$ the total number of words. +where $i_w$ is the number of inserted, $s_w$ the number of substituted, $d_w$ the number of deleted and $c_w$ the number of correctl words. More specific cases of WER consider only the "significant" words, omitting e.g. stopwords from the calculation. From 9ea4b62f4dc84ab1d52c518f6314eaa088ba4b9f Mon Sep 17 00:00:00 2001 From: mweidling <13831557+mweidling@users.noreply.github.com> Date: Fri, 10 Feb 2023 08:33:50 +0100 Subject: [PATCH 41/56] Apply suggestions from code review Co-authored-by: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> --- ocrd_eval.md | 63 +++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 48 insertions(+), 15 deletions(-) diff --git a/ocrd_eval.md b/ocrd_eval.md index 4ebc333..9d5bf69 100644 --- a/ocrd_eval.md +++ b/ocrd_eval.md @@ -144,7 +144,7 @@ Currently we only provide WER per page; higher-level WER results might be calcul #### Bag of Words -In the "Bag of Words" model a text is represented as a set of its word irregardless of word order or grammar; Only the words themselves and their number of occurence are considered. +In the "Bag of Words" (BaW) model, a text is represented as a multiset of the words (as defined in the previous section) it contains, regardless of their order. Example: @@ -284,15 +284,18 @@ and measuring average CER of all pairs. The algorithm can be summarized as follows: -> 1. Split the two input texts into text lines -> 2. Sort the ground truth text lines by length (in descending order) -> 3. For the first ground truth line, find the best matching OCR result line segment (by minimising a penalty that is partly based on string edit distance) -> 4. If full match (full length of line) -> a. Mark as done and remove line from list -> b. Else subdivide and add to respective list of text lines; resort +> 1. Split both input texts into text lines +> 2. Sort the GT lines by length +> (in descending order) +> 3. For the top GT line, find the best fully or partially matching OCR line +> (by lowest edit distance and highest coverage) +> 4. If full match (i.e. full length of line) +> a. Mark as done and remove line from both lists +> b. Else mark matching part as done, +> then cut off unmatched part and add to respective list of text lines; resort > 5. If any more lines available repeat step 3 -> 6. Count non-matched lines / strings as insertions or deletions (depending on origin: ground truth or result) -> 7. 
Sum up all partial edit distances and calculate overall character accuracy +> 6. Count remaining unmatched lines as insertions or deletions (depending on origin – GT or OCR) +> 7. Calculate the (micro-)average CER of all marked pairs and return as overall FCER (paraphrase of C. Clausner, S. Pletschacher and A. Antonacopoulos / Pattern Recognition Letters 131 (2020) 390–397, p. 392) @@ -320,22 +323,50 @@ In the context of object detection in images, it measures either * the ratio of correctly segmented pixels over the image size (assuming all predictions can be combined into some coherent segmentation). -**Recall**, on the other hand, measures how well a model performs in finding all instances of an object in an image (true positives), irregardless of false positives. Given a model tries to identify bicycles in an image, a recall of 1 indicates that all bicycles have been found by the model (while not considering other objects that have been falsely labelled as a bicycle). +**Recall**, on the other hand, describes to which degree a model predicts what is actually present. +The higher the recall of a model, the more confidently we can assume that it covers everything to be found +(e.g. the model having identified every bicycle, car, person etc. in an image). +A recall of 1 (or 100%) indicates that all objects have a correct prediction (true positives) and no predictions are missing or mislabelled (false negatives). The lower the recall value, the more false negatives. + +In the context of object detection in images, it measures either +- the ratio of correctly detected segments over all actual segments, or +- the ratio of correctly segmented pixels over the image size. + +Notice that both goals are naturally conflicting each other. A good predictor needs both high precision and recall. +But the optimal trade-off depens on the application. + +For layout analysis though, the underlying notion of sufficient overlap itself is inadequate: +- it does not discern oversegmentation from undersegmentation +- it does not discern splits/merges that are allowable (irrelevant w.r.t. text flow) or not (break up or conflate lines) +- it does not discern foreground from background, or when partial overlap starts breaking character legibility or introducing ghost characters ###### Prediction Score -When a model tries to identify objects in an image, it predicts that a certain area in an image represents said object with a certain confidence or prediction score. The prediction score varies between 0 and 1 and represents the percentage of certainty of having correctly identified an object. Given a model tries to identify ornaments on a page. If the model returns an area of a page with a prediction score of 0.6, the model is "60% sure" that this area is an ornament. If this area is then considered to be a positive, depends on the chosen threshold. +Most types of model can output a confidence score alongside each predicted object, +which represents the model's certainty that the prediction is correct. +For example, when a model tries to identify ornaments on a page, if it returns a segment (polygon / mask) +with a prediction score of 0.6, the model asserts there is a 60% probability that there is an ornament at that location. +Whether this prediction is then considered to be a positive detection, depends on the chosen threshold. 
+ +###### IoU Thresholds -###### Thresholds +For object detection, the metrics precision and recall are usually defined in terms of a threshold for the degree of overlap +(represented by the IoU as defined [above](#iou-intersection-over-union)), ranging between 0 and 1) +above which pairs of detected and GT segments are qualified as matches. -A threshold is a freely chosen number between 0 and 1. It divides the output of a model into two groups: Outputs that have a prediction score or IoU greater than or equal to the threshold represent an object. Outputs with a prediction score or IoU below the threshold are discarded as not representing the object. +(Predictions that are non-matches across all GT objects – false positives – and GT objects that are non-matches across all predictions – false negatives – contribute indirectly in the denominator.) Example: -Given a threshold of 0.6 and a model that tries to detect bicycles in an image. The model returns two areas in an image that might be bicycles, one with a prediction score of 0.4 and one with 0.9. Since the threshold equals 0.6, the first area is tossed and not regarded as bicycle while the second one is kept and counted as recognized. +Given a prediction threshold of 0.8, an IoU threshold of 0.6 and a model that tries to detect bicycles in an image which depicts two bicycles. +The model returns two areas in an image that might be bicycles, one with a confidence score of 0.4 and one with 0.9. Since the prediction threshold equals 0.8, the first candidate gets immediately tossed out. The other +is compared to both bicycles in the GT. One GT object is missed (false negative), the other intersects the remaining prediction, but the latter is twice as large. +Therefore, the union of that pair is more than double the intersection. But since the IoU threshold equals 0.6, even the second candidate is not regarded as a match and thus also counted as false negative. Overall, both precision and recall are zero (becaue 1 kept prediction is a false positive and 2 GTs are false negatives). ###### Precision-Recall Curve -Precision and recall are connected to each other since both depend on the true positives detected. A precision-recall-curve is a means to balance these values while maximizing them. +By varying the prediction threshold (and/or the IoU threshold), the tradeoff between precision and recall can be tuned. +When the full range of combinations has been gauged, the result can be visualised in a precision-recall curve (or receiver operator characteristic, ROC). +Usually the optimum balance is where the product of precision and recall (i.e. area under the curve) is maximal. Given a dataset with 100 images in total of which 50 depict a bicycle. Also given a model trying to identify bicycles on images. The model is run 7 times using the given dataset while gradually increasing the threshold from 0.1 to 0.7. @@ -381,6 +412,8 @@ AP & = \displaystyle\sum_{k=0}^{k=n-1}[r(k) - r(k+1)] * p(k) \\ \end{array} $$ +Usually, AP calculation also involves _smoothing_ (i.e. clipping local minima) and _interpolation_ (i.e. adding data points between the measured confidence thresholds). + ###### Mean Average Precision Mean Average Precision (mAP) is a metric used to measure the full potential of an object detector over various conditions. 
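The IoU-threshold matching from the bicycle example above can be sketched as follows. Real segments are polygons or masks; axis-aligned boxes with made-up coordinates keep the sketch short, and the naive any-overlap matching is only an approximation of a proper one-to-one assignment.

```python
def iou(a, b):
    """Intersection over Union of two boxes given as (x0, y0, x1, y1)."""
    x0, y0 = max(a[0], b[0]), max(a[1], b[1])
    x1, y1 = min(a[2], b[2]), min(a[3], b[3])
    inter = max(0, x1 - x0) * max(0, y1 - y0)
    union = ((a[2] - a[0]) * (a[3] - a[1]) +
             (b[2] - b[0]) * (b[3] - b[1]) - inter)
    return inter / union

predictions = [((0, 0, 100, 50), 0.4), ((200, 0, 300, 50), 0.9)]  # (box, confidence)
ground_truth = [(200, 0, 250, 50), (400, 0, 450, 50)]             # two bicycles

score_thr, iou_thr = 0.8, 0.6
kept = [box for box, score in predictions if score >= score_thr]
# a prediction counts as correct if it overlaps any GT box above the IoU threshold
matched = sum(any(iou(p, g) >= iou_thr for g in ground_truth) for p in kept)
precision = matched / len(kept) if kept else 0.0
recall = matched / len(ground_truth)
print(precision, recall)  # 0.0 0.0 - as in the example above
```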
From c910f0e41f83f9edb23fd9060a000c9a1cd2823c Mon Sep 17 00:00:00 2001 From: Michelle Weidling Date: Tue, 14 Feb 2023 10:22:31 +0100 Subject: [PATCH 42/56] add bow metric --- ocrd_eval.md | 28 +++++++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) diff --git a/ocrd_eval.md b/ocrd_eval.md index 9d5bf69..056ef2e 100644 --- a/ocrd_eval.md +++ b/ocrd_eval.md @@ -152,15 +152,15 @@ Example: > Eine Mondfinsternis ist die Himmelsbegebenheit welche sich zur Zeit des Vollmondes ereignet, wenn die Erde zwischen der Sonne und dem Monde steht, so daß die Strahlen der Sonne von der Erde aufgehalten werden, und daß man so den Schatten der Erde in dem Monde siehet. In diesem Jahre sind zwey Monfinsternisse, davon ist ebenfalls nur Eine bey uns sichtbar, und zwar am 30sten März des Morgens nach 4 Uhr, und währt bis nach 6 Uhr. -To get the Bag of Words of this paragraph a set containing each word and its number of occurence is created: +To get the Bag of Words of this paragraph a multiset containing each word and its number of occurence is created: -$BoW$ = +$BoW_{GT}$ = ```json= { "Eine": 2, "Mondfinsternis": 1, "ist": 2, "die": 2, "Himmelsbegebenheit": 1, "welche": 1, "sich": 1, "zur": 1, "Zeit": 1, "des": 2, "Vollmondes": 1, - "ereignet,": 1, "wenn":1, "Erde": 3, "zwischen": 1, "der": 4, "Sonne": 2, + "ereignet,": 1, "wenn": 1, "Erde": 3, "zwischen": 1, "der": 4, "Sonne": 2, "und": 4, "dem": 2, "Monde": 2, "steht,": 1, "so": 2, "daß": 2, "Strahlen": 1, "von": 1, "aufgehalten": 1, "werden,": 1, "man": 1, "den": 1, "Schatten": 1, "in": 1, "siehet.": 1, "In": 1, "diesem": 1, "Jahre": 1, @@ -171,6 +171,28 @@ $BoW$ = } ``` +##### Bag of Words Metric + +The Bag of Words Metric describes how many words in a recognized text correspond to words given in the Ground Truth, independent of a page's layout. + +$BoW_m = \frac{BoW_{GT} - |\Delta_{GT/recognized}|}{|n_{GT}|}$ + +###### Example + +Given + +$BoW_{GT} = \{"Eine": 1, "Mondfinsternis": 1, "steht": 1, "bevor": 1\}$ + +and + +$BoW_{recognized} = \{"Eine": 1, "Mondfinsternis": 1, "fteht": 1, "bevor": 1\}$ + +results in: + +$BoW_m = \frac{4 - 1}{4}$ = 0.75 + +In this example 75% of the words have been correctly recognized. + ### Layout Evaluation For documents with a complex structure, looking at the recognized text's accuracy alone is often insufficient to accurately determine the quality of OCR. An example can help to illustrate this: in a document containing two columns, all characters and words may be recognized correctly, but when the two columns are detected by layout analysis as just one, the OCR result will contain the text for the first lines of the first and second column, followed by the second lines of the first and second column asf., rendering the sequence of words and paragraphs in the Ground Truth text wrongly, which defeats almost all downstream processes. From 8c22169870e3674540ab34c78bf61df9a0c63c92 Mon Sep 17 00:00:00 2001 From: Michelle Weidling Date: Tue, 14 Feb 2023 10:24:35 +0100 Subject: [PATCH 43/56] format document --- ocrd_eval.md | 41 ++++++++++++++++++++++------------------- 1 file changed, 22 insertions(+), 19 deletions(-) diff --git a/ocrd_eval.md b/ocrd_eval.md index 056ef2e..5aebc0f 100644 --- a/ocrd_eval.md +++ b/ocrd_eval.md @@ -3,11 +3,11 @@ ## Rationale Evaluating the quality of OCR requires comparing the OCR results on representative **ground truth** (GT) -– i.e. realistic data (images) with manual transcriptions (segmentation, text). +– i.e. 
realistic data (images) with manual transcriptions (segmentation, text). OCR results can be obtained via several distinct **OCR workflows**. The comparison requires evaluation tools which themselves build on a number of established -evaluation metrics. +evaluation metrics. The evaluation results must be presented in a way that allows factorising and localising aberrations, both within documents (page types, individual pages, region types, individual regions) and across classes of similar documents. @@ -123,7 +123,7 @@ Currently we only provide CER per page; higher-level CER results might be calcul ##### Word Error Rate (WER) -Word error rate (WER) is analogous to CER: While CER operates on (differences between) characters, +Word error rate (WER) is analogous to CER: While CER operates on (differences between) characters, WER measures the percentage of incorrectly recognized words in a text. A **word** in that context is usually defined as any sequence of characters between white space (including line breaks), with leading and trailing punctuation removed (according to [Unicode TR29 Word Boundary algorithm](http://unicode.org/reports/tr29/#Word_Boundaries)). @@ -295,12 +295,12 @@ TODO ##### Flexible Character Accuracy Measure -The Flexible Character Accuracy (FCA) measure has been introduced to mitigate a major drawback of CER: +The Flexible Character Accuracy (FCA) measure has been introduced to mitigate a major drawback of CER: CER (if applied naively by comparing concatenated page-level texts) is heavily dependent on the reading order an OCR engine detects. Thus, where text blocks are rearranged or merged, no suitable text alignment can be made, so CER is very low, even if single characters, words and even lines have been perfectly recognized. -FCA avoids this by splitting the recognized text and GT into lines and, if necessary, sub-line chunks, +FCA avoids this by splitting the recognized text and GT into lines and, if necessary, sub-line chunks, finding pairs that align maximally until only unmatched lines remain (which must be treated as errors), and measuring average CER of all pairs. @@ -333,9 +333,9 @@ The following paragraphs will first introduce the intermediate concepts needed t ###### Precision and Recall -**Precision** describes to which degree the predictions of a model are correct. +**Precision** describes to which degree the predictions of a model are correct. The higher the precision of a model, the more confidently we can assume that each prediction is correct -(e.g. the model having identified a bicycle in an image actually depicts a bicycle). +(e.g. the model having identified a bicycle in an image actually depicts a bicycle). A precision of 1 (or 100%) indicates all predictions are correct (true positives) and no predictions are incorrect (false positives). The lower the precision value, the more false positives. In the context of object detection in images, it measures either @@ -351,16 +351,18 @@ The higher the recall of a model, the more confidently we can assume that it cov A recall of 1 (or 100%) indicates that all objects have a correct prediction (true positives) and no predictions are missing or mislabelled (false negatives). The lower the recall value, the more false negatives. In the context of object detection in images, it measures either -- the ratio of correctly detected segments over all actual segments, or -- the ratio of correctly segmented pixels over the image size. 
+ +* the ratio of correctly detected segments over all actual segments, or +* the ratio of correctly segmented pixels over the image size. Notice that both goals are naturally conflicting each other. A good predictor needs both high precision and recall. But the optimal trade-off depens on the application. For layout analysis though, the underlying notion of sufficient overlap itself is inadequate: -- it does not discern oversegmentation from undersegmentation -- it does not discern splits/merges that are allowable (irrelevant w.r.t. text flow) or not (break up or conflate lines) -- it does not discern foreground from background, or when partial overlap starts breaking character legibility or introducing ghost characters + +* it does not discern oversegmentation from undersegmentation +* it does not discern splits/merges that are allowable (irrelevant w.r.t. text flow) or not (break up or conflate lines) +* it does not discern foreground from background, or when partial overlap starts breaking character legibility or introducing ghost characters ###### Prediction Score @@ -374,7 +376,7 @@ Whether this prediction is then considered to be a positive detection, depends o For object detection, the metrics precision and recall are usually defined in terms of a threshold for the degree of overlap (represented by the IoU as defined [above](#iou-intersection-over-union)), ranging between 0 and 1) -above which pairs of detected and GT segments are qualified as matches. +above which pairs of detected and GT segments are qualified as matches. (Predictions that are non-matches across all GT objects – false positives – and GT objects that are non-matches across all predictions – false negatives – contribute indirectly in the denominator.) @@ -386,7 +388,7 @@ Therefore, the union of that pair is more than double the intersection. But sinc ###### Precision-Recall Curve -By varying the prediction threshold (and/or the IoU threshold), the tradeoff between precision and recall can be tuned. +By varying the prediction threshold (and/or the IoU threshold), the tradeoff between precision and recall can be tuned. When the full range of combinations has been gauged, the result can be visualised in a precision-recall curve (or receiver operator characteristic, ROC). Usually the optimum balance is where the product of precision and recall (i.e. area under the curve) is maximal. @@ -410,8 +412,8 @@ This graph is called Precision-Recall-Curve. ###### Average Precision -Average Precision (AP) describes how well (flexible and robust) a model can detect objects in an image, -by averaging precision over the full range (from 0 to 1) of confidence thresholds (and thus, recall results). +Average Precision (AP) describes how well (flexible and robust) a model can detect objects in an image, +by averaging precision over the full range (from 0 to 1) of confidence thresholds (and thus, recall results). It is equal to the area under the Precision-Recall Curve. ![A sample precision/recall curve with highlighted area under curve](https://pad.gwdg.de/uploads/799e6a05-e64a-4956-9ede-440ac0463a3f.png) @@ -434,7 +436,7 @@ AP & = \displaystyle\sum_{k=0}^{k=n-1}[r(k) - r(k+1)] * p(k) \\ \end{array} $$ -Usually, AP calculation also involves _smoothing_ (i.e. clipping local minima) and _interpolation_ (i.e. adding data points between the measured confidence thresholds). +Usually, AP calculation also involves *smoothing* (i.e. clipping local minima) and *interpolation* (i.e. adding data points between the measured confidence thresholds). 
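The smoothing step mentioned above is commonly realized as a "precision envelope": each precision value is replaced by the maximum precision reached at equal or higher recall, which clips local dips in the curve. A short sketch with made-up values:

```python
def smooth(precisions):
    """Clip local minima by taking the running maximum from the right."""
    best, out = 0.0, []
    for p in reversed(precisions):
        best = max(best, p)
        out.append(best)
    return out[::-1]

# precision values along increasing recall, containing a local dip
print(smooth([1.0, 0.8, 0.6, 0.7, 0.5]))  # [1.0, 0.8, 0.7, 0.7, 0.5]
```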
###### Mean Average Precision @@ -447,14 +449,15 @@ $mAP = \displaystyle\frac{1}{N}\sum_{i=1}^{N}AP_i$ with $N$ being the number of Often, this mAP for a range of IoU thresholds gets complemented by additional mAP runs for a set of fixed values, or for various classes and object sizes only. The common understanding is that those different measures collectively allow drawing better conclusions and comparisons about the model's quality. + ##### Scenario-Driven Performance Evaluation -Scenario-driven, layout-dedicated, text-flow informed performance evaluation as described in +Scenario-driven, layout-dedicated, text-flow informed performance evaluation as described in [Clausner et al., 2011](https://primaresearch.org/publications/ICDAR2011_Clausner_PerformanceEvaluation) is currently the most comprehensive and sophisticated approach to evaluate the quality of layout analysis. It is not a single metric, but comprises a multitude of measures derived in a unified method, which considers -the crucial effects that segmentation can have on text flow, i.e. which kinds of overlaps (merges and splits) +the crucial effects that segmentation can have on text flow, i.e. which kinds of overlaps (merges and splits) amount to benign deviations (extra white-space) or pathological ones (breaking lines and words apart). In this approach, all the derived measures are aggregated under various sets of weights, called evaluation scenarios, which target specific use cases (like headline or keyword extraction, linear fulltext, newspaper or figure extraction). From 149a2ebe4c8fd715c489c224f58f1e455cc52341 Mon Sep 17 00:00:00 2001 From: Michelle Weidling Date: Tue, 14 Feb 2023 10:29:05 +0100 Subject: [PATCH 44/56] gpu mem instead of util --- ocrd_eval.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ocrd_eval.md b/ocrd_eval.md index 5aebc0f..07ba3cd 100644 --- a/ocrd_eval.md +++ b/ocrd_eval.md @@ -279,13 +279,13 @@ The following metrics are not part of the MVP (minimal viable product) and will #### GPU Metrics -##### GPU Avg Usage +##### GPU Avg Memory -GPU avg usage is the average GPU load during the execution of a workflow represented by a real number between 0 and 1. +GPU avg memory refers to the average amount of memory of the GPU (in GiB) that was used during processing. ##### GPU Peak Usage -GPU peak usage is the maximum GPU load during the execution of a workflow represented by a real number between 0 and 1. +GPU peak memory refers to the largest amount of memory of the GPU (in GiB) that was used during processing. #### Text Evaluation From 2999ef4274e1f577188bd4cb1ddab9f1eb622863 Mon Sep 17 00:00:00 2001 From: mweidling <13831557+mweidling@users.noreply.github.com> Date: Tue, 14 Feb 2023 10:30:53 +0100 Subject: [PATCH 45/56] Update ocrd_eval.md Co-authored-by: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> --- ocrd_eval.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocrd_eval.md b/ocrd_eval.md index 07ba3cd..ec97ed4 100644 --- a/ocrd_eval.md +++ b/ocrd_eval.md @@ -283,7 +283,7 @@ The following metrics are not part of the MVP (minimal viable product) and will GPU avg memory refers to the average amount of memory of the GPU (in GiB) that was used during processing. -##### GPU Peak Usage +##### GPU Peak Memory GPU peak memory refers to the largest amount of memory of the GPU (in GiB) that was used during processing. 
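As an illustration, GPU memory figures like these could be sampled while a workflow runs, e.g. by polling `nvidia-smi` (this assumes an NVIDIA GPU and that the tool is on the PATH); the actual benchmarking infrastructure may use a different mechanism such as NVML bindings.

```python
import subprocess
import time

def gpu_memory_used_mib(device=0):
    """Current GPU memory usage in MiB, read from nvidia-smi."""
    out = subprocess.run(
        ["nvidia-smi", "--query-gpu=memory.used", "--format=csv,noheader,nounits"],
        capture_output=True, text=True, check=True)
    return int(out.stdout.splitlines()[device])

samples = []
for _ in range(5):               # sample once per second while the workflow runs
    samples.append(gpu_memory_used_mib())
    time.sleep(1)
print("peak:", max(samples), "MiB; avg:", sum(samples) / len(samples), "MiB")
```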
From 87f943877c886f56636a21317b7e90f5d9d0e739 Mon Sep 17 00:00:00 2001 From: Michelle Weidling Date: Tue, 14 Feb 2023 10:32:19 +0100 Subject: [PATCH 46/56] GPU Peak Memory definition --- ocrd_eval.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocrd_eval.md b/ocrd_eval.md index ec97ed4..d5e7462 100644 --- a/ocrd_eval.md +++ b/ocrd_eval.md @@ -285,7 +285,7 @@ GPU avg memory refers to the average amount of memory of the GPU (in GiB) that w ##### GPU Peak Memory -GPU peak memory refers to the largest amount of memory of the GPU (in GiB) that was used during processing. +GPU peak memory is the maximum GPU memory allocated during the execution of a workflow in MB. #### Text Evaluation From 5e80c94e1f0073a9d4055eedf7f2d92deacabdc8 Mon Sep 17 00:00:00 2001 From: mweidling <13831557+mweidling@users.noreply.github.com> Date: Tue, 14 Feb 2023 10:33:15 +0100 Subject: [PATCH 47/56] Update ocrd_eval.md Co-authored-by: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> --- ocrd_eval.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocrd_eval.md b/ocrd_eval.md index d5e7462..223548b 100644 --- a/ocrd_eval.md +++ b/ocrd_eval.md @@ -132,7 +132,7 @@ CER and WER share categories of errors, and the WER is similarly calculated: $WER = \frac{i_w + s_w + d_w}{i_w + s_w + d_w + c_w}$ -where $i_w$ is the number of inserted, $s_w$ the number of substituted, $d_w$ the number of deleted and $c_w$ the number of correctl words. +where $i_w$ is the number of inserted, $s_w$ the number of substituted, $d_w$ the number of deleted and $c_w$ the number of correct words. More specific cases of WER consider only the "significant" words, omitting e.g. stopwords from the calculation. From 5cd5efbe0c33dcbe37130b66c4279ab4e3356be7 Mon Sep 17 00:00:00 2001 From: mweidling <13831557+mweidling@users.noreply.github.com> Date: Wed, 15 Feb 2023 08:11:35 +0100 Subject: [PATCH 48/56] Update ocrd_eval.md Co-authored-by: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> --- ocrd_eval.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/ocrd_eval.md b/ocrd_eval.md index 223548b..51586f2 100644 --- a/ocrd_eval.md +++ b/ocrd_eval.md @@ -171,9 +171,11 @@ $BoW_{GT}$ = } ``` -##### Bag of Words Metric +##### Bag-of-Words Error Rate -The Bag of Words Metric describes how many words in a recognized text correspond to words given in the Ground Truth, independent of a page's layout. +Based on the above concept, the Bag-of-Words Error Rate is defined as the sum over the modulus of the GT count minus OCR count of each word, divided by the sum total of words in GT and OCR. + +The BoW error therefore describes how many words are misrecognized (positively or negatively), independent of a page's layout (order/segmentation). 
$BoW_m = \frac{BoW_{GT} - |\Delta_{GT/recognized}|}{|n_{GT}|}$ From 492b6ee2f37d01da839d3555df85826431ae541b Mon Sep 17 00:00:00 2001 From: mweidling <13831557+mweidling@users.noreply.github.com> Date: Wed, 15 Feb 2023 08:12:09 +0100 Subject: [PATCH 49/56] Update ocrd_eval.md Co-authored-by: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> --- ocrd_eval.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocrd_eval.md b/ocrd_eval.md index 51586f2..b758990 100644 --- a/ocrd_eval.md +++ b/ocrd_eval.md @@ -177,7 +177,7 @@ Based on the above concept, the Bag-of-Words Error Rate is defined as the sum ov The BoW error therefore describes how many words are misrecognized (positively or negatively), independent of a page's layout (order/segmentation). -$BoW_m = \frac{BoW_{GT} - |\Delta_{GT/recognized}|}{|n_{GT}|}$ +$BWE = \frac{|BoW_{GT} - BoW_{OCR}|}{{n_w}_{GT} + {n_w}_{OCR}}$ ###### Example From d8d4cef987445514d66cece19b9639305cfa8b28 Mon Sep 17 00:00:00 2001 From: mweidling <13831557+mweidling@users.noreply.github.com> Date: Wed, 15 Feb 2023 08:13:01 +0100 Subject: [PATCH 50/56] Update ocrd_eval.md Co-authored-by: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> --- ocrd_eval.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/ocrd_eval.md b/ocrd_eval.md index b758990..22f288e 100644 --- a/ocrd_eval.md +++ b/ocrd_eval.md @@ -181,19 +181,19 @@ $BWE = \frac{|BoW_{GT} - BoW_{OCR}|}{{n_w}_{GT} + {n_w}_{OCR}}$ ###### Example -Given +Given the GT text `der Mann steht an der Ampel`, recognised by OCR as `cer Mann fteht an der Ampel`: -$BoW_{GT} = \{"Eine": 1, "Mondfinsternis": 1, "steht": 1, "bevor": 1\}$ +$BoW_{GT} = \{"Ampel": 1, "an": 1, "der": 2, "Mann": 1, "steht": 1\}$ and -$BoW_{recognized} = \{"Eine": 1, "Mondfinsternis": 1, "fteht": 1, "bevor": 1\}$ +$BoW_{OCR} = \{"Ampel": 1, "an": 1, "cer": 1, "der": 1, "Mann": 1, "fteht": 1\}$ results in: -$BoW_m = \frac{4 - 1}{4}$ = 0.75 +$BWE = \frac{|1 - 1| + |1 - 1| + |2 - 1| + |0 - 1| + |1 - 1| + |1 - 0| + |0 - 1|}{12} = \frac{0 + 0 + 1 + 1 + 0 + 1 + 1}{12}$ = 0.33 -In this example 75% of the words have been correctly recognized. +In this example, 66% of the words have been correctly recognized. ### Layout Evaluation From 48e69f8043253314f0dc9677c55b741649210e63 Mon Sep 17 00:00:00 2001 From: Michelle Weidling Date: Wed, 15 Feb 2023 08:29:48 +0100 Subject: [PATCH 51/56] add letter accuracy --- ocrd_eval.md | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/ocrd_eval.md b/ocrd_eval.md index 22f288e..4e48ccf 100644 --- a/ocrd_eval.md +++ b/ocrd_eval.md @@ -293,7 +293,14 @@ GPU peak memory is the maximum GPU memory allocated during the execution of a wo ##### Letter Accuracy -TODO +Letter Accuracy is a metric that focusses on a defined set of characters for evaluation while ignoring the rest. +The set relevant for Letter Accuracy can be chosen freely, but it is common to omit punctuation and white spaces which are removed from both the candidate text and the ground truth before evaluation. 
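Before the formula is stated in the continuation below, a minimal sketch of this filtering step may be helpful. The use of `str.isalpha()` as the definition of a relevant letter is only an assumption for illustration; the character set actually used for evaluation is a free choice, as noted above.

```python
def relevant_letters(text: str) -> str:
    """Keep only the characters that count as letters for this metric
    (assumption here: Unicode letters; punctuation, digits and whitespace are dropped)."""
    return "".join(c for c in text if c.isalpha())

def letter_accuracy(gt_text: str, ocr_text: str) -> float:
    """LA = 1 - (|L_GT| - |L_r|) / |L_GT|, with the counts taken over the filtered texts."""
    n_gt = len(relevant_letters(gt_text))
    n_ocr = len(relevant_letters(ocr_text))
    return 1 - (n_gt - n_ocr) / n_gt
```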
+
+Letter Accuracy can be calculated as follows:
+
+Let $|L_{GT}|$ be the number of relevant letters in the ground truth, $|L_{r}|$ the number of recognized letters, then
+
+$LA = 1 - \frac{|L_{GT}| - |L_{r}|}{|L_{GT}|}$

 ##### Flexible Character Accuracy Measure

From 3cc5beef903f5e4ac7837531a1979b100df78b3f Mon Sep 17 00:00:00 2001
From: Michelle Weidling
Date: Wed, 15 Feb 2023 08:41:11 +0100
Subject: [PATCH 52/56] rephrase layout eval intro

---
 ocrd_eval.md | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/ocrd_eval.md b/ocrd_eval.md
index 4e48ccf..9936ff6 100644
--- a/ocrd_eval.md
+++ b/ocrd_eval.md
@@ -197,9 +197,15 @@ In this example, 66% of the words have been correctly recognized.

 ### Layout Evaluation

-For documents with a complex structure, looking at the recognized text's accuracy alone is often insufficient to accurately determine the quality of OCR. An example can help to illustrate this: in a document containing two columns, all characters and words may be recognized correctly, but when the two columns are detected by layout analysis as just one, the OCR result will contain the text for the first lines of the first and second column, followed by the second lines of the first and second column asf., rendering the sequence of words and paragraphs in the Ground Truth text wrongly, which defeats almost all downstream processes.
+A good text segmentation is the basis for measuring text accuracy.
+
+An example can help to illustrate this:
+Given a document containing two columns, assume these two columns are detected by layout analysis as just one.
+The OCR result will then contain the text for the first lines of the first and second column, followed by the second lines of the first and second column and so forth, which does not correspond to the sequence of words and paragraphs given in the Ground Truth.
+Even if all characters and words may be recognized correctly, all downstream processes to measure text accuracy will be defeated.

 While the comprehensive evaluation of OCR with consideration of layout analysis is still a research topic, several established metrics can be used to capture different aspects of it.
+For pragmatic reasons we set aside errors resulting from misdetecting the reading order for the moment (though this might be implemented in the future).

 #### Reading Order

From f8175210c93498ffa476d98816471cf63428fb68 Mon Sep 17 00:00:00 2001
From: Michelle Weidling
Date: Wed, 15 Feb 2023 09:56:46 +0100
Subject: [PATCH 53/56] add reading order evaluation

---
 ocrd_eval.md | 30 +++++++++++++++++++++++++++++-
 1 file changed, 29 insertions(+), 1 deletion(-)

diff --git a/ocrd_eval.md b/ocrd_eval.md
index 9936ff6..029da74 100644
--- a/ocrd_eval.md
+++ b/ocrd_eval.md
@@ -207,7 +207,9 @@ Even if all characters and words may be recognized correctly, all downstream pro
 While the comprehensive evaluation of OCR with consideration of layout analysis is still a research topic, several established metrics can be used to capture different aspects of it.
 For pragmatic reasons we set aside errors resulting from misdetecting the reading order for the moment (though this might be implemented in the future).

-#### Reading Order
+Any layout evaluation in the context of OCR-D focusses on the region level, which should be sufficient for most use cases.
+
+#### Reading Order (Definition)

 Reading order describes the order in which segments on a page are intended to be read.
While the reading order might be easily obtained in monographs with a single column where only a few page segments exist, identifying the reading order in more complex layouts (e.g. newspapers or multi-column layouts) can be more challenging. @@ -223,6 +225,8 @@ Example of a complex page layout with reading order: () +See [Reading Order Evaluation](#reading-order-evaluation) for the actual metric. + #### IoU (Intersection over Union) Intersection over Union is a term which describes the degree of overlap of two regions of a (document) image defined either by a bounding box or polygon. Example: @@ -338,6 +342,25 @@ The algorithm can be summarized as follows: #### Layout Evalutation +##### Reading Order Evaluation + +[Clausner, Pletschacher and Antonacopoulos 2013](https://www.primaresearch.org/www/assets/papers/ICDAR2013_Clausner_ReadingOrder.pdf) propose a way of evaluating reading order by classifying relations between two regions. +For both the ground truth and the detected reading order the type of relation between two regions are calculated and then compared. +The authors introduce a penalty matrix in which a penality (as integer) for a misclassified relation is given. +The more the detected relation differs from the relation in the ground truth, the higher the penalty. + +If for example the relation given in the ground truth is "Somewhere after (but unordered group involved)" and the detected relation is "directly before" the penalty is lower (`10`) than as if the ground truth relation was "directly after" (`40`) because the latter is more specific than the former. + +To calculate the success measure $s$ of the detected reading order, first all penalties obtained from comparing all GT to detected relations are summed up ($e$) and the error value at a success rate of 50% is determined by + +$e_{50} = p_{max} * n_{GT} / 2$ + +where $p_{max}$ is the highest single penality and $n_{GT}$ is the number of regions in the ground truth. + +The success measure is then given by + +$s = \frac{1}{e * (1/e_{50}) + 1}$ + ##### mAP (mean Average Precision) This score was originally devised for object detection in photo scenery (where overlaps are allowed and cannot conflict with text flow). @@ -502,6 +525,11 @@ See [OCR-D workflow guide](https://ocr-d.de/en/workflows#evaluation). * * FCA: * +* Letter Accuary: + * +* Reading Order Evaluation: + * + * More background on evaluation of OCR * * From 04c5c27af52e871edbb8833abf8bb606d4e20e28 Mon Sep 17 00:00:00 2001 From: Michelle Weidling Date: Wed, 15 Feb 2023 10:02:34 +0100 Subject: [PATCH 54/56] implement Uwe's feedback reg. Letter Accuracy --- ocrd_eval.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/ocrd_eval.md b/ocrd_eval.md index 029da74..9c32b4f 100644 --- a/ocrd_eval.md +++ b/ocrd_eval.md @@ -303,8 +303,10 @@ GPU peak memory is the maximum GPU memory allocated during the execution of a wo ##### Letter Accuracy -Letter Accuracy is a metric that focusses on a defined set of characters for evaluation while ignoring the rest. -The set relevant for Letter Accuracy can be chosen freely, but it is common to omit punctuation and white spaces which are removed from both the candidate text and the ground truth before evaluation. +Letter Accuracy is a metric that focusses on a pre-defined set of characters classes for evaluation while ignoring others. +Letters in a common sense do not include white spaces and punctuations or Arabic and Indic digits. +Furthermore, even letter capitalization might be ignored. 
+The relevant character classes must be removed from both the candidate text and the ground truth before evaluation. Letter Accuracy can be calculated as follows: From d078b1bbcc4f2744399ba9a6987c89a9b5d90ca7 Mon Sep 17 00:00:00 2001 From: mweidling <13831557+mweidling@users.noreply.github.com> Date: Thu, 16 Feb 2023 08:29:47 +0100 Subject: [PATCH 55/56] Apply suggestions from code review Co-authored-by: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> --- ocrd_eval.md | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/ocrd_eval.md b/ocrd_eval.md index 9c32b4f..eeeddb6 100644 --- a/ocrd_eval.md +++ b/ocrd_eval.md @@ -346,14 +346,25 @@ The algorithm can be summarized as follows: ##### Reading Order Evaluation -[Clausner, Pletschacher and Antonacopoulos 2013](https://www.primaresearch.org/www/assets/papers/ICDAR2013_Clausner_ReadingOrder.pdf) propose a way of evaluating reading order by classifying relations between two regions. -For both the ground truth and the detected reading order the type of relation between two regions are calculated and then compared. -The authors introduce a penalty matrix in which a penality (as integer) for a misclassified relation is given. -The more the detected relation differs from the relation in the ground truth, the higher the penalty. +[Clausner, Pletschacher and Antonacopoulos 2013](https://www.primaresearch.org/www/assets/papers/ICDAR2013_Clausner_ReadingOrder.pdf) +propose a method to evaluate reading order by classifying relations between any two regions: +direct or indirect successor / predecessor, unordered, undefined. -If for example the relation given in the ground truth is "Somewhere after (but unordered group involved)" and the detected relation is "directly before" the penalty is lower (`10`) than as if the ground truth relation was "directly after" (`40`) because the latter is more specific than the former. +Next, text regions on both sides, ground truth and detected reading order, are matched and assigned (depending on overlap area). +A GT region can have multiple corresponding detections. Then, for each pair of regions, the relation type +on GT is compared to the relation types of the corresponding predictions. Any deviation introduces costs, +depending both on the kind of relation (e.g. direct vs indirect, or successor vs predecessor) +and the relative size of the overlap. -To calculate the success measure $s$ of the detected reading order, first all penalties obtained from comparing all GT to detected relations are summed up ($e$) and the error value at a success rate of 50% is determined by +The authors introduce a predefined penalty matrix where the cost for each misclassification is given. +(Direct opposition is more expensive than indirect.) + +For example, if the relation given in GT is "somewhere after (but unordered group involved)", +but the detected relation is "directly before", then the penalty will be lower (`10`) than +if the GT relation is "directly after" (`40`) – because the latter is more specific than the former. + +To calculate the success measure $s$ of the detected reading order, first the costs obtained from comparing all GT to all detected relations are summed up ($e$). 
+Then this error value is normalised by the hypothetical error value at 50% agreement ($e_{50}$): $e_{50} = p_{max} * n_{GT} / 2$ From 43b364a314e106711c8e37b5276ea31220dac88b Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Tue, 28 Feb 2023 15:20:55 +0100 Subject: [PATCH 56/56] eval: Improvements to TeX formulas Co-authored-by: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> --- ocrd_eval.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ocrd_eval.md b/ocrd_eval.md index eeeddb6..9038e83 100644 --- a/ocrd_eval.md +++ b/ocrd_eval.md @@ -177,17 +177,17 @@ Based on the above concept, the Bag-of-Words Error Rate is defined as the sum ov The BoW error therefore describes how many words are misrecognized (positively or negatively), independent of a page's layout (order/segmentation). -$BWE = \frac{|BoW_{GT} - BoW_{OCR}|}{{n_w}_{GT} + {n_w}_{OCR}}$ +$$ BWE = \frac{|BoW_{GT} - BoW_{OCR}|}{ {n_w}_{GT} + {n_w}_{OCR} } $$ ###### Example Given the GT text `der Mann steht an der Ampel`, recognised by OCR as `cer Mann fteht an der Ampel`: -$BoW_{GT} = \{"Ampel": 1, "an": 1, "der": 2, "Mann": 1, "steht": 1\}$ +$$ BoW_{GT} = \{ \text{Ampel}: 1, \text{an}: 1, \text{der}: 2, \text{Mann}: 1, \text{steht}: 1 \} $$ and -$BoW_{OCR} = \{"Ampel": 1, "an": 1, "cer": 1, "der": 1, "Mann": 1, "fteht": 1\}$ +$$ BoW_{OCR} = \{ \text{Ampel}: 1, \text{an}: 1, \text{cer}: 1, \text{der}: 1, \text{Mann}: 1, \text{fteht}: 1 \} $$ results in: