diff --git a/docs/CHANGES.TXT b/docs/CHANGES.TXT
index eb567dd83..54301cd9c 100644
--- a/docs/CHANGES.TXT
+++ b/docs/CHANGES.TXT
@@ -35,6 +35,7 @@
 - Fix: infinite loop in MP4 file type detector.
 - Improvement: Use Corrosion to build Rust code
 - Improvement: Ignore MXF Caption Essence Container version byte to enhance SRT subtitle extraction compatibility
+- New: Add tesseract page segmentation modes control with `--psm` flag
 
 0.94 (2021-12-14)
 -----------------
diff --git a/src/lib_ccx/ccx_common_option.c b/src/lib_ccx/ccx_common_option.c
index 4ac0a3fef..eb61f6412 100644
--- a/src/lib_ccx/ccx_common_option.c
+++ b/src/lib_ccx/ccx_common_option.c
@@ -72,6 +72,7 @@ void init_options(struct ccx_s_options *options)
 	options->dvblang = NULL; // By default, autodetect DVB language
 	options->ocrlang = NULL; // By default, autodetect .traineddata file
 	options->ocr_oem = -1; // By default, OEM mode depends on the tesseract version
+	options->psm = 3; // Default PSM mode (3 is also Tesseract's own default)
 	options->ocr_quantmode = 1; // CCExtractor's internal
 	options->mkvlang = NULL; // By default, all the languages are extracted
 	options->ignore_pts_jumps = 1;
diff --git a/src/lib_ccx/ccx_common_option.h b/src/lib_ccx/ccx_common_option.h
index fcc9799d4..7bc35ac1a 100644
--- a/src/lib_ccx/ccx_common_option.h
+++ b/src/lib_ccx/ccx_common_option.h
@@ -147,6 +147,7 @@ struct ccx_s_options // Options from user parameters
 	char *dvblang; // The name of the language stream for DVB
 	const char *ocrlang; // The name of the .traineddata file to be loaded with tesseract
 	int ocr_oem; // The Tesseract OEM mode, could be 0 (default), 1 or 2
+	int psm; // The Tesseract PSM mode, could be between 0 and 13. 3 is tesseract default
 	int ocr_quantmode; // How to quantize the bitmap before passing to to tesseract (0=no quantization at all, 1=CCExtractor's internal)
 	char *mkvlang; // The name of the language stream for MKV
 	int analyze_video_stream; // If 1, the video stream will be processed even if we're using a different one for subtitles.
diff --git a/src/lib_ccx/ocr.c b/src/lib_ccx/ocr.c
index 9571030d5..c43ca5589 100644
--- a/src/lib_ccx/ocr.c
+++ b/src/lib_ccx/ocr.c
@@ -177,6 +177,9 @@ void *init_ocr(int lang_index)
 					   &pars_values, 1, false);
 	}
 
+	// set PSM mode
+	TessBaseAPISetPageSegMode(ctx->api, ccx_options.psm);
+
 	free(pars_vec);
 	free(pars_values);
 
diff --git a/src/lib_ccx/params.c b/src/lib_ccx/params.c
index cd2515806..4a5c73420 100644
--- a/src/lib_ccx/params.c
+++ b/src/lib_ccx/params.c
@@ -679,6 +679,23 @@ void print_usage(void)
 	mprint(" Default value depends on the tesseract version linked :\n");
 	mprint(" Tesseract v3 : default mode is 0,\n");
 	mprint(" Tesseract v4 : default mode is 1.\n");
+	mprint(" --psm: Select the PSM mode for Tesseract.\n");
+	mprint(" Available Page segmentation modes:\n");
+	mprint(" 0 Orientation and script detection (OSD) only.\n");
+	mprint(" 1 Automatic page segmentation with OSD.\n");
+	mprint(" 2 Automatic page segmentation, but no OSD, or OCR.\n");
+	mprint(" 3 Fully automatic page segmentation, but no OSD. (Default)\n");
+	mprint(" 4 Assume a single column of text of variable sizes.\n");
+	mprint(" 5 Assume a single uniform block of vertically aligned text.\n");
+	mprint(" 6 Assume a single uniform block of text.\n");
+	mprint(" 7 Treat the image as a single text line.\n");
+	mprint(" 8 Treat the image as a single word.\n");
+	mprint(" 9 Treat the image as a single word in a circle.\n");
+	mprint(" 10 Treat the image as a single character.\n");
+	mprint(" 11 Sparse text. Find as much text as possible in no particular order.\n");
+	mprint(" 12 Sparse text with OSD.\n");
+	mprint(" 13 Raw line. Treat the image as a single text line,\n");
+	mprint(" bypassing hacks that are Tesseract-specific.\n");
 	mprint(" --mkvlang: For MKV subtitles, select which language's caption\n");
 	mprint(" stream will be processed. e.g. 'eng' for English.\n");
 	mprint(" Language codes can be either the 3 letters bibliographic\n");
@@ -1696,6 +1713,26 @@ int parse_parameters(struct ccx_s_options *opt, int argc, char *argv[])
 				fatal(EXIT_MALFORMED_PARAMETER, "--oem has no argument.\n");
 			}
 		}
+		if (strcmp(argv[i], "--psm") == 0)
+		{
+			if (i < argc - 1)
+			{
+				i++;
+
+				// Parse argv[i] in place: a sizeof(char *) scratch copy would overflow and leak.
+				opt->psm = atoi(argv[i]);
+				if (opt->psm < 0 || opt->psm > 13)
+				{
+					fatal(EXIT_MALFORMED_PARAMETER, "--psm must be between 0 and 13\n");
+				}
+
+				continue;
+			}
+			else
+			{
+				fatal(EXIT_MALFORMED_PARAMETER, "--psm has no argument.\n");
+			}
+		}
 		if (strcmp(argv[i], "--mkvlang") == 0)
 		{
 			if (i < argc - 1)
diff --git a/src/lib_ccx/params_dump.c b/src/lib_ccx/params_dump.c
index 2b991ba1e..f679eea73 100644
--- a/src/lib_ccx/params_dump.c
+++ b/src/lib_ccx/params_dump.c
@@ -216,6 +216,8 @@ void params_dump(struct lib_ccx_ctx *ctx)
 			mprint("Reduced color palette]\n");
 			break;
 	}
+
+	mprint("[Tesseract PSM: %d]\n", ccx_options.psm);
 }
 
 #define Y_N(cond) ((cond) ? "Yes" : "No")
diff --git a/src/rust/lib_ccxr/src/common/options.rs b/src/rust/lib_ccxr/src/common/options.rs
index c982203f6..63fbcbb97 100644
--- a/src/rust/lib_ccxr/src/common/options.rs
+++ b/src/rust/lib_ccxr/src/common/options.rs
@@ -455,6 +455,8 @@ pub struct Options {
     pub ocrlang: PathBuf,
     /// The Tesseract OEM mode, could be 0 (default), 1 or 2
     pub ocr_oem: i8,
+    /// The Tesseract PSM mode, could be between 0 and 13. 3 is tesseract default
+    pub psm: i32,
     /// How to quantize the bitmap before passing to to tesseract
     /// (0 = no quantization at all, 1 = CCExtractor's internal,
     /// 2 = reduce distinct color count in image for faster results.)
@@ -589,6 +591,7 @@ impl Default for Options {
             dvblang: Default::default(),
             ocrlang: Default::default(),
             ocr_oem: -1,
+            psm: 3,
             ocr_quantmode: 1,
             mkvlang: Default::default(),
             analyze_video_stream: Default::default(),
diff --git a/src/rust/src/args.rs b/src/rust/src/args.rs
index d437f232d..e39ab09b2 100644
--- a/src/rust/src/args.rs
+++ b/src/rust/src/args.rs
@@ -600,6 +600,25 @@ pub struct Args {
     /// Tesseract v4 : default mode is 1.
     #[arg(long, verbatim_doc_comment, value_name="mode", help_heading=OUTPUT_AFFECTING_OUTPUT_FILES)]
     pub oem: Option<u8>,
+    /// Select the PSM mode for Tesseract.
+    /// Available Page segmentation modes:
+    /// 0 Orientation and script detection (OSD) only.
+    /// 1 Automatic page segmentation with OSD.
+    /// 2 Automatic page segmentation, but no OSD, or OCR.
+    /// 3 Fully automatic page segmentation, but no OSD. (Default)
+    /// 4 Assume a single column of text of variable sizes.
+    /// 5 Assume a single uniform block of vertically aligned text.
+    /// 6 Assume a single uniform block of text.
+    /// 7 Treat the image as a single text line.
+    /// 8 Treat the image as a single word.
+    /// 9 Treat the image as a single word in a circle.
+    /// 10 Treat the image as a single character.
+    /// 11 Sparse text. Find as much text as possible in no particular order.
+    /// 12 Sparse text with OSD.
+    /// 13 Raw line. Treat the image as a single text line,
+    /// bypassing hacks that are Tesseract-specific.
+    #[arg(long, verbatim_doc_comment, value_name="mode", help_heading=OUTPUT_AFFECTING_OUTPUT_FILES)]
+    pub psm: Option<u8>,
     /// For MKV subtitles, select which language's caption
     /// stream will be processed. e.g. 'eng' for English.
     /// Language codes can be either the 3 letters bibliographic
diff --git a/src/rust/src/parser.rs b/src/rust/src/parser.rs
index d7e4d39a7..d3833fbaa 100644
--- a/src/rust/src/parser.rs
+++ b/src/rust/src/parser.rs
@@ -801,6 +801,16 @@ impl OptionsExt for Options {
             self.ocr_oem = *oem as _;
         }
 
+        if let Some(ref psm) = args.psm {
+            if !(0..=13).contains(psm) {
+                fatal!(
+                    cause = ExitCause::MalformedParameter;
+                    "--psm must be between 0 and 13"
+                );
+            }
+            self.psm = *psm as _;
+        }
+
         if let Some(ref lang) = args.mkvlang {
             self.mkvlang = Some(Language::from_str(lang.as_str()).unwrap());
             let str = lang.as_str();