Skip to content

Commit

Permalink
Move unsafe handling code to plumbing module
Browse files Browse the repository at this point in the history
This has given me the opportunity to review the safety of the unsafe
code - and it was found lacking.

But it's fixed in the plumbing module.

#16

I've tried to not put naming opinions within the plumbing module.
That is, things within it are named to match the C or C++ libraries of
Leptonica and Tesseract as much as possible.

This addresses #17 at
least within the plumbing module.
  • Loading branch information
ccouzens committed May 29, 2020
1 parent 4d207df commit f203642
Show file tree
Hide file tree
Showing 6 changed files with 411 additions and 56 deletions.
1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -12,3 +12,4 @@ categories = ["api-bindings", "multimedia::images"]
[dependencies]
leptonica-sys = "0.3.0"
tesseract-sys = "0.5.1"
thiserror = "1.0"
86 changes: 30 additions & 56 deletions src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,26 +1,9 @@
extern crate leptonica_sys;
extern crate tesseract_sys;
pub mod plumbing;

use leptonica_sys::{pixFreeData, pixRead, pixReadMem};
use std::ffi::CStr;
use std::ffi::CString;
use std::ptr;
use std::str;
use tesseract_sys::{
TessBaseAPI, TessBaseAPICreate, TessBaseAPIDelete, TessBaseAPIGetUTF8Text, TessBaseAPIInit3,
TessBaseAPIRecognize, TessBaseAPISetImage, TessBaseAPISetImage2,
TessBaseAPISetSourceResolution, TessBaseAPISetVariable, TessDeleteText,
};

pub struct Tesseract {
raw: *mut TessBaseAPI,
}

impl Drop for Tesseract {
fn drop(&mut self) {
unsafe { TessBaseAPIDelete(self.raw) }
}
}
/// OCR handle that wraps a single [`plumbing::TessBaseAPI`] value.
pub struct Tesseract(plumbing::TessBaseAPI);

impl Default for Tesseract {
fn default() -> Self {
Expand All @@ -29,27 +12,24 @@ impl Default for Tesseract {
}

/// Converts a Rust string slice into an owned, NUL-terminated [`CString`].
///
/// The owned value is handed back instead of a raw pointer obtained through
/// `as_ptr`, because a pointer taken here would dangle as soon as the
/// `CString` was dropped at the end of this function.
///
/// # Panics
///
/// Panics if `string` contains an interior NUL byte.
fn cs(string: &str) -> CString {
    let converted = CString::new(string);
    converted.unwrap()
}

impl Tesseract {
pub fn new() -> Tesseract {
Tesseract {
raw: unsafe { TessBaseAPICreate() },
}
Tesseract(plumbing::TessBaseAPI::new())
}
pub fn set_lang(&mut self, language: &str) -> i32 {
let cs_language = cs(language);
unsafe { TessBaseAPIInit3(self.raw, ptr::null(), cs_language.as_ptr()) }
match self.0.init_2(None, Some(&cs_language)) {
Ok(()) => 0,
Err(_) => -1,
}
}
pub fn set_image(&mut self, filename: &str) {
let cs_filename = cs(filename);
unsafe {
let img = pixRead(cs_filename.as_ptr());
TessBaseAPISetImage2(self.raw, img);
pixFreeData(img);
}
let img = plumbing::Pix::read(&cs_filename).unwrap();
self.0.set_image_2(&img);
}
pub fn set_frame(
&mut self,
Expand All @@ -59,47 +39,41 @@ impl Tesseract {
bytes_per_pixel: i32,
bytes_per_line: i32,
) {
unsafe {
TessBaseAPISetImage(
self.raw,
frame_data.as_ptr(),
width,
height,
bytes_per_pixel,
bytes_per_line,
);
}
self.0
.set_image_1(frame_data, width, height, bytes_per_pixel, bytes_per_line)
.unwrap();
}
pub fn set_image_from_mem(&mut self, img: &[u8]) {
unsafe {
let img = pixReadMem(img.as_ptr(), img.len());
TessBaseAPISetImage2(self.raw, img);
pixFreeData(img);
}
let pix = plumbing::Pix::read_mem(img).unwrap();
self.0.set_image_2(&pix);
}

pub fn set_source_resolution(&mut self, ppi: i32) {
unsafe {
TessBaseAPISetSourceResolution(self.raw, ppi);
}
self.0.set_source_resolution(ppi)
}

pub fn set_variable(&mut self, name: &str, value: &str) -> i32 {
let cs_name = cs(name);
let cs_value = cs(value);
unsafe { TessBaseAPISetVariable(self.raw, cs_name.as_ptr(), cs_value.as_ptr()) }
match self.0.set_variable(&cs_name, &cs_value) {
Ok(()) => 1,
Err(_) => 0,
}
}
pub fn recognize(&mut self) -> i32 {
unsafe { TessBaseAPIRecognize(self.raw, ptr::null_mut()) }
}
pub fn get_text(&self) -> String {
unsafe {
let cs_value = TessBaseAPIGetUTF8Text(self.raw);
let string = CStr::from_ptr(cs_value).to_string_lossy().into_owned();
TessDeleteText(cs_value);
string
match self.0.recognize() {
Ok(()) => 0,
Err(_) => -1,
}
}
pub fn get_text(&mut self) -> String {
self.0
.get_utf8_text()
.unwrap()
.as_ref()
.to_string_lossy()
.into_owned()
}
}

pub fn ocr(filename: &str, language: &str) -> String {
Expand Down
67 changes: 67 additions & 0 deletions src/plumbing/mod.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
//! A direct but safe wrapper for `tesseract-sys`. It should stick as close as
//! possible to the upstream API whilst avoiding unsafe behaviour.
//!
//! Are you interested in using this on its own?
//! Raise an issue, and I'll split it into its own crate.
mod pix;
mod tess_base_api;
mod tesseract_text;

pub use self::pix::Pix;
pub use self::pix::PixReadError;
pub use self::pix::PixReadMemError;
pub use self::tess_base_api::TessBaseAPI;
pub use self::tess_base_api::TessBaseAPIGetUTF8TextError;
pub use self::tess_base_api::TessBaseAPIInitError;
pub use self::tess_base_api::TessBaseAPIRecogniseError;
pub use self::tess_base_api::TessBaseAPISetImageSafetyError;
pub use self::tess_base_api::TessBaseAPISetVariableError;
pub use self::tesseract_text::TesseractText;

// Recognises text from an in-memory TIFF after explicitly setting the source
// resolution, and compares the result with the expected text fixture.
#[test]
fn ocr_from_mem_with_ppi() -> Result<(), Box<dyn std::error::Error>> {
    use std::ffi::CString;

    let image = Pix::read_mem(include_bytes!("../../img.tiff"))?;

    let mut api = TessBaseAPI::new();
    let language = CString::new("eng")?;
    api.init_2(None, Some(&language))?;
    api.set_image_2(&image);

    api.set_source_resolution(70);

    let recognised = api.get_utf8_text()?;
    assert_eq!(
        recognised.as_ref().to_str()?,
        include_str!("../../img.txt")
    );
    Ok(())
}

// Exercises a longer call sequence: set a variable, initialise with default
// arguments, load an image from disk, run recognition explicitly and then
// compare the extracted text with the expected text fixture.
#[test]
fn expanded_test() -> Result<(), Box<dyn std::error::Error>> {
    use std::ffi::CString;

    let mut cube = TessBaseAPI::new();
    cube.set_variable(
        &CString::new("tessedit_char_blacklist")?,
        &CString::new("z")?,
    )?;
    cube.init_2(None, None)?;
    // Unlike `include_str!`/`include_bytes!`, which resolve relative to this
    // source file at compile time, `Pix::read` opens the path at runtime
    // relative to the working directory. Cargo runs tests from the package
    // root, so the fixture is addressed as `img.png` rather than
    // `../img.png`, which would point outside the package.
    // NOTE(review): assumes img.png sits in the package root next to
    // img.tiff and img.txt — confirm.
    let pix = Pix::read(&CString::new("img.png")?)?;
    cube.set_image_2(&pix);
    cube.recognize()?;
    assert_eq!(
        cube.get_utf8_text()?.as_ref().to_str()?,
        include_str!("../../img.txt")
    );
    Ok(())
}

// Checks that recognition and text extraction both report an error when an
// image is set on an API handle that has not been initialised.
#[test]
fn setting_image_without_initializing_test() -> Result<(), PixReadMemError> {
    let mut api = TessBaseAPI::new();
    let image = Pix::read_mem(include_bytes!("../../img.tiff"))?;
    api.set_image_2(&image);

    assert!(matches!(api.recognize(), Err(_)));
    assert!(matches!(api.get_utf8_text(), Err(_)));
    Ok(())
}
70 changes: 70 additions & 0 deletions src/plumbing/pix.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
extern crate leptonica_sys;
extern crate thiserror;

use self::leptonica_sys::{pixFreeData, pixRead, pixReadMem};
use self::thiserror::Error;
use std::convert::AsRef;
use std::ffi::CStr;

/// Wrapper around Leptonica's [`Pix`](https://tpgit.github.io/Leptonica/struct_pix.html) structure
///
/// Holds the raw `*mut leptonica_sys::Pix` pointer in a private field; the
/// pointer is released by this type's `Drop` implementation.
pub struct Pix(*mut leptonica_sys::Pix);

impl Drop for Pix {
fn drop(&mut self) {
unsafe {
pixFreeData(self.0);
}
}
}

impl AsRef<*mut leptonica_sys::Pix> for Pix {
fn as_ref(&self) -> &*mut leptonica_sys::Pix {
&self.0
}
}

/// Error returned by [`Pix::read`] when the underlying `pixRead` call yields
/// a null pointer.
#[derive(Debug, Error)]
#[error("Pix::read returned null")]
pub struct PixReadError();

/// Error returned by [`Pix::read_mem`] when the underlying `pixReadMem` call
/// yields a null pointer.
#[derive(Debug, Error)]
#[error("Pix::read_mem returned null")]
pub struct PixReadMemError();

impl Pix {
    /// Wrapper for [`pixRead`](https://tpgit.github.io/Leptonica/leptprotos_8h.html#a84634846cbb5e01df667d6e9241dfc53)
    ///
    /// Loads an image from the file named by `filename`.
    ///
    /// # Errors
    ///
    /// Returns [`PixReadError`] when `pixRead` hands back a null pointer.
    pub fn read(filename: &CStr) -> Result<Self, PixReadError> {
        // SAFETY: `filename` is a valid NUL-terminated string for the
        // duration of the call, as guaranteed by `&CStr`.
        let raw = unsafe { pixRead(filename.as_ptr()) };
        if raw.is_null() {
            return Err(PixReadError {});
        }
        Ok(Self(raw))
    }

    /// Wrapper for [`pixReadMem`](https://tpgit.github.io/Leptonica/leptprotos_8h.html#a027a927dc3438192e3bdae8c219d7f6a)
    ///
    /// Decodes an image from the encoded bytes in `img`.
    ///
    /// # Errors
    ///
    /// Returns [`PixReadMemError`] when `pixReadMem` hands back a null pointer.
    pub fn read_mem(img: &[u8]) -> Result<Self, PixReadMemError> {
        // SAFETY: the pointer and length both come from the same live slice.
        let raw = unsafe { pixReadMem(img.as_ptr(), img.len()) };
        if raw.is_null() {
            return Err(PixReadMemError {});
        }
        Ok(Self(raw))
    }
}

// Reading from the path "fail" must be reported as an error.
#[test]
fn read_error_test() -> Result<(), Box<dyn std::error::Error>> {
    let missing = std::ffi::CString::new("fail")?;
    let outcome = Pix::read(&missing);
    assert!(outcome.is_err());
    Ok(())
}

// Decoding an empty byte slice must be reported as an error.
#[test]
fn read_mem_error_test() {
    let empty: &[u8] = &[];
    let outcome = Pix::read_mem(empty);
    assert!(outcome.is_err());
}
Loading

0 comments on commit f203642

Please sign in to comment.