From daf62e255031bd06f172bd05403a71c347448e59 Mon Sep 17 00:00:00 2001 From: RikaKit <118338010+RikaKit@users.noreply.github.com> Date: Sun, 12 May 2024 21:16:51 +0400 Subject: [PATCH 1/4] added functionality for receiving and parsing stext from page --- Cargo.toml | 2 + examples/extract_stext.rs | 31 ++++++++++++++ src/text_page.rs | 88 +++++++++++++++++++++++++++++++++++++-- 3 files changed, 118 insertions(+), 3 deletions(-) create mode 100644 examples/extract_stext.rs diff --git a/Cargo.toml b/Cargo.toml index 198df77..0b66483 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -47,6 +47,8 @@ mupdf-sys = { version = "0.4.2", path = "mupdf-sys" } once_cell = "1.3.1" num_enum = "0.7.0" bitflags = "2.0.2" +serde = { version = "1.0.201", features = ["derive"] } +serde_json = "1.0.117" [dependencies.font-kit] version = "0.12.0" diff --git a/examples/extract_stext.rs b/examples/extract_stext.rs new file mode 100644 index 0000000..5224a4b --- /dev/null +++ b/examples/extract_stext.rs @@ -0,0 +1,31 @@ +use std::io; + +fn main() { + // cargo run --example extract_stext + let mut path_to_doc = String::new(); + println!("Enter a path to document: "); + io::stdin().read_line(&mut path_to_doc).expect("Failed to read line"); + let doc = mupdf::document::Document::open(path_to_doc.trim()).unwrap(); + let page = doc.load_page(0).unwrap(); + let stext_page = page.to_text_page(mupdf::text_page::TextPageOptions::empty()).unwrap(); + match stext_page.stext_page_as_json(1.0) { + Ok(stext_json) => { + let stext_page: serde_json::Result = serde_json::from_str(stext_json.as_str()); + match stext_page { + Ok(res) => { + for block in res.blocks { + if block.r#type.eq("text") { + for line in block.lines { + println!("{:?}", &line.text); + } + } + } + } + Err(err) => { + println!("stext_json parsing error: {:?}", &err); + } + } + } + Err(_) => {} + } +} diff --git a/src/text_page.rs b/src/text_page.rs index 38cf030..168507e 100644 --- a/src/text_page.rs +++ b/src/text_page.rs @@ -8,6 +8,7 @@ use std::slice; use bitflags::bitflags; use mupdf_sys::*; use num_enum::TryFromPrimitive; +use serde::{Deserialize, Serialize}; use crate::{context, Buffer, Error, Image, Matrix, Point, Quad, Rect, WriteMode}; @@ -43,6 +44,18 @@ impl TextPage { buf.read_to_string(&mut text)?; Ok(text) } + pub fn stext_page_as_json(&self, scale: f32) -> Result { + let mut buf = unsafe { + let buf = fz_new_buffer(context(), 1024); + let out = fz_new_output_with_buffer(context(), buf); + fz_print_stext_page_as_json(context(), out, self.inner, scale); + fz_close_output(context(), out); + Buffer::from_raw(buf) + }; + let mut res = String::new(); + buf.read_to_string(&mut res).unwrap(); + Ok(res) + } pub fn blocks(&self) -> TextBlockIter { TextBlockIter { @@ -99,6 +112,46 @@ pub enum TextBlockType { Image = FZ_STEXT_BLOCK_IMAGE as u32, } +#[derive(Deserialize, Serialize, Debug)] +pub struct Font { + pub name: String, + pub family: String, + pub weight: String, + pub style: String, + pub size: u32, +} + +#[derive(Deserialize, Serialize, Debug)] +pub struct BBox { + pub x: u32, + pub y: u32, + pub w: u32, + pub h: u32, +} + +#[derive(Deserialize, Serialize, Debug)] +pub struct Line { + pub wmode: u32, + pub bbox: BBox, + pub font: Font, + pub x: u32, + pub y: u32, + pub text: String, +} + +#[derive(Deserialize, Serialize, Debug)] +pub struct Block { + pub r#type: String, + pub bbox: BBox, + pub lines: Vec, +} + +// StructuredText +#[derive(Deserialize, Serialize, Debug)] +pub struct StextPage { + pub blocks: Vec, +} + /// A text block is a list of lines of text (typically a paragraph), or an image. pub struct TextBlock<'a> { inner: &'a fz_stext_block, @@ -115,7 +168,7 @@ impl TextBlock<'_> { pub fn lines(&self) -> TextLineIter { unsafe { - if self.inner.type_ == FZ_STEXT_BLOCK_TEXT as _ { + if self.inner.type_ == FZ_STEXT_BLOCK_TEXT as i32 { return TextLineIter { next: self.inner.u.t.first_line, _marker: PhantomData, @@ -130,7 +183,7 @@ impl TextBlock<'_> { pub fn ctm(&self) -> Option { unsafe { - if self.inner.type_ == FZ_STEXT_BLOCK_IMAGE as _ { + if self.inner.type_ == FZ_STEXT_BLOCK_IMAGE as i32 { return Some(self.inner.u.i.transform.into()); } } @@ -139,7 +192,7 @@ impl TextBlock<'_> { pub fn image(&self) -> Option { unsafe { - if self.inner.type_ == FZ_STEXT_BLOCK_IMAGE as _ { + if self.inner.type_ == FZ_STEXT_BLOCK_IMAGE as i32 { let inner = self.inner.u.i.image; fz_keep_image(context(), inner); return Some(Image::from_raw(inner)); @@ -258,6 +311,35 @@ impl<'a> Iterator for TextCharIter<'a> { mod test { use crate::{Document, TextPageOptions}; + #[test] + fn test_get_stext_page_as_json() { + let path_to_doc = std::env::current_dir().unwrap() + .join("tests").join("files").join("dummy.pdf"); + let doc = Document::open(path_to_doc.to_str().unwrap()).unwrap(); + let page = doc.load_page(0).unwrap(); + let stext_page = page.to_text_page(TextPageOptions::empty()).unwrap(); + match stext_page.stext_page_as_json(1.0) { + Ok(stext_json) => { + let stext_page: serde_json::Result = serde_json::from_str(stext_json.as_str()); + match stext_page { + Ok(res) => { + for block in res.blocks { + if block.r#type.eq("text") { + for line in block.lines { + assert_eq!(&line.text, &"Dummy PDF file".to_string()); + } + } + } + } + Err(err) => { + println!("stext_json parsing error: {:?}", &err); + } + } + } + Err(_) => {} + } + } + #[test] fn test_text_page_search() { use crate::{Point, Quad}; From 6e2278136623bfa068658394b1843603fc852db6 Mon Sep 17 00:00:00 2001 From: RikaKit <118338010+RikaKit@users.noreply.github.com> Date: Mon, 13 May 2024 17:04:16 +0400 Subject: [PATCH 2/4] added error handling --- examples/extract_stext.rs | 9 ++-- mupdf-sys/wrapper.c | 28 ++++++++++++ src/page.rs | 95 ++++++++++++++++++++++++++++++++++++--- src/text_page.rs | 82 --------------------------------- 4 files changed, 120 insertions(+), 94 deletions(-) diff --git a/examples/extract_stext.rs b/examples/extract_stext.rs index 5224a4b..d82ed8e 100644 --- a/examples/extract_stext.rs +++ b/examples/extract_stext.rs @@ -7,10 +7,9 @@ fn main() { io::stdin().read_line(&mut path_to_doc).expect("Failed to read line"); let doc = mupdf::document::Document::open(path_to_doc.trim()).unwrap(); let page = doc.load_page(0).unwrap(); - let stext_page = page.to_text_page(mupdf::text_page::TextPageOptions::empty()).unwrap(); - match stext_page.stext_page_as_json(1.0) { + match page.stext_page_as_json_from_page(1.0) { Ok(stext_json) => { - let stext_page: serde_json::Result = serde_json::from_str(stext_json.as_str()); + let stext_page: serde_json::Result = serde_json::from_str(stext_json.as_str()); match stext_page { Ok(res) => { for block in res.blocks { @@ -22,10 +21,10 @@ fn main() { } } Err(err) => { - println!("stext_json parsing error: {:?}", &err); + println!("stext_page parsing error: {:?}", &err); } } } - Err(_) => {} + Err(_err) => {} } } diff --git a/mupdf-sys/wrapper.c b/mupdf-sys/wrapper.c index 97390c4..42ea7d2 100644 --- a/mupdf-sys/wrapper.c +++ b/mupdf-sys/wrapper.c @@ -889,6 +889,34 @@ fz_buffer *mupdf_page_to_html(fz_context *ctx, fz_page *page, mupdf_error_t **er return buf; } +fz_buffer *mupdf_stext_page_as_json_from_page(fz_context *ctx, fz_page *page, float scale, mupdf_error_t **errptr) +{ + fz_buffer *buf = NULL; + fz_output *out = NULL; + fz_stext_page *stext_page = NULL; + fz_var(stext_page); + fz_var(buf); + fz_var(out); + fz_try(ctx) + { + stext_page = fz_new_stext_page_from_page(ctx, page, NULL); + buf = fz_new_buffer(ctx, 8192); + out = fz_new_output_with_buffer(ctx, buf); + fz_print_stext_page_as_json(ctx, out, stext_page, scale); + fz_close_output(ctx, out); + } + fz_always(ctx) + { + fz_drop_output(ctx, out); + fz_drop_stext_page(ctx, stext_page); + } + fz_catch(ctx) + { + mupdf_save_error(ctx, errptr); + } + return buf; +} + fz_buffer *mupdf_page_to_xhtml(fz_context *ctx, fz_page *page, mupdf_error_t **errptr) { fz_buffer *buf = NULL; diff --git a/src/page.rs b/src/page.rs index 880c4cc..52b38ed 100644 --- a/src/page.rs +++ b/src/page.rs @@ -3,10 +3,12 @@ use std::io::Read; use std::ptr; use std::slice; +use serde::{Deserialize, Serialize}; + use mupdf_sys::*; use crate::{ - context, Buffer, Colorspace, Cookie, Device, DisplayList, Error, Link, Matrix, Pixmap, Quad, + Buffer, Colorspace, context, Cookie, Device, DisplayList, Error, Link, Matrix, Pixmap, Quad, Rect, Separations, TextPage, TextPageOptions, }; @@ -235,6 +237,16 @@ impl Page { Ok(out) } + pub fn stext_page_as_json_from_page(&self, scale: f32) -> Result { + let mut buf = unsafe { + let inner = ffi_try!(mupdf_stext_page_as_json_from_page(context(), self.inner, scale)); + Buffer::from_raw(inner) + }; + let mut res = String::new(); + buf.read_to_string(&mut res).unwrap(); + Ok(res) + } + pub fn to_xhtml(&self) -> Result { let mut buf = unsafe { let inner = ffi_try!(mupdf_page_to_xhtml(context(), self.inner)); @@ -348,7 +360,7 @@ impl Iterator for LinkIter { ptr::null_mut(), ptr::null_mut(), ) - .page; + .page; } Some(Link { bounds, @@ -359,9 +371,78 @@ impl Iterator for LinkIter { } } +#[derive(Deserialize, Serialize, Debug)] +pub struct Font { + pub name: String, + pub family: String, + pub weight: String, + pub style: String, + pub size: u32, +} + +#[derive(Deserialize, Serialize, Debug)] +pub struct BBox { + pub x: u32, + pub y: u32, + pub w: u32, + pub h: u32, +} + +#[derive(Deserialize, Serialize, Debug)] +pub struct Line { + pub wmode: u32, + pub bbox: BBox, + pub font: Font, + pub x: u32, + pub y: u32, + pub text: String, +} + +#[derive(Deserialize, Serialize, Debug)] +pub struct Block { + pub r#type: String, + pub bbox: BBox, + pub lines: Vec, +} + +// StructuredText +#[derive(Deserialize, Serialize, Debug)] +pub struct StextPage { + pub blocks: Vec, +} + #[cfg(test)] mod test { use crate::{Document, Matrix}; + use crate::page::StextPage; + + #[test] + fn test_get_stext_page_as_json() { + let path_to_doc = std::env::current_dir().unwrap() + .join("tests").join("files").join("dummy.pdf"); + let doc = Document::open(path_to_doc.to_str().unwrap()).unwrap(); + let page = doc.load_page(0).unwrap(); + match page.stext_page_as_json_from_page(1.0) { + Ok(stext_json) => { + let stext_page: serde_json::Result = serde_json::from_str(stext_json.as_str()); + match stext_page { + Ok(res) => { + for block in res.blocks { + if block.r#type.eq("text") { + for line in block.lines { + assert_eq!(&line.text, &"Dummy PDF file".to_string()); + } + } + } + } + Err(err) => { + println!("stext_page parsing error: {:?}", &err); + } + } + } + Err(_err) => {} + } + } #[test] fn test_page_to_svg() { @@ -454,20 +535,20 @@ mod test { [Quad { ul: Point { x: 56.8, - y: 69.32512 + y: 69.32512, }, ur: Point { x: 115.85405, - y: 69.32512 + y: 69.32512, }, ll: Point { x: 56.8, - y: 87.311844 + y: 87.311844, }, lr: Point { x: 115.85405, - y: 87.311844 - } + y: 87.311844, + }, }] ); diff --git a/src/text_page.rs b/src/text_page.rs index 168507e..8da02cb 100644 --- a/src/text_page.rs +++ b/src/text_page.rs @@ -8,7 +8,6 @@ use std::slice; use bitflags::bitflags; use mupdf_sys::*; use num_enum::TryFromPrimitive; -use serde::{Deserialize, Serialize}; use crate::{context, Buffer, Error, Image, Matrix, Point, Quad, Rect, WriteMode}; @@ -44,18 +43,6 @@ impl TextPage { buf.read_to_string(&mut text)?; Ok(text) } - pub fn stext_page_as_json(&self, scale: f32) -> Result { - let mut buf = unsafe { - let buf = fz_new_buffer(context(), 1024); - let out = fz_new_output_with_buffer(context(), buf); - fz_print_stext_page_as_json(context(), out, self.inner, scale); - fz_close_output(context(), out); - Buffer::from_raw(buf) - }; - let mut res = String::new(); - buf.read_to_string(&mut res).unwrap(); - Ok(res) - } pub fn blocks(&self) -> TextBlockIter { TextBlockIter { @@ -112,46 +99,6 @@ pub enum TextBlockType { Image = FZ_STEXT_BLOCK_IMAGE as u32, } -#[derive(Deserialize, Serialize, Debug)] -pub struct Font { - pub name: String, - pub family: String, - pub weight: String, - pub style: String, - pub size: u32, -} - -#[derive(Deserialize, Serialize, Debug)] -pub struct BBox { - pub x: u32, - pub y: u32, - pub w: u32, - pub h: u32, -} - -#[derive(Deserialize, Serialize, Debug)] -pub struct Line { - pub wmode: u32, - pub bbox: BBox, - pub font: Font, - pub x: u32, - pub y: u32, - pub text: String, -} - -#[derive(Deserialize, Serialize, Debug)] -pub struct Block { - pub r#type: String, - pub bbox: BBox, - pub lines: Vec, -} - -// StructuredText -#[derive(Deserialize, Serialize, Debug)] -pub struct StextPage { - pub blocks: Vec, -} - /// A text block is a list of lines of text (typically a paragraph), or an image. pub struct TextBlock<'a> { inner: &'a fz_stext_block, @@ -311,35 +258,6 @@ impl<'a> Iterator for TextCharIter<'a> { mod test { use crate::{Document, TextPageOptions}; - #[test] - fn test_get_stext_page_as_json() { - let path_to_doc = std::env::current_dir().unwrap() - .join("tests").join("files").join("dummy.pdf"); - let doc = Document::open(path_to_doc.to_str().unwrap()).unwrap(); - let page = doc.load_page(0).unwrap(); - let stext_page = page.to_text_page(TextPageOptions::empty()).unwrap(); - match stext_page.stext_page_as_json(1.0) { - Ok(stext_json) => { - let stext_page: serde_json::Result = serde_json::from_str(stext_json.as_str()); - match stext_page { - Ok(res) => { - for block in res.blocks { - if block.r#type.eq("text") { - for line in block.lines { - assert_eq!(&line.text, &"Dummy PDF file".to_string()); - } - } - } - } - Err(err) => { - println!("stext_json parsing error: {:?}", &err); - } - } - } - Err(_) => {} - } - } - #[test] fn test_text_page_search() { use crate::{Point, Quad}; From 38fa1ff07c19c3c9d3881bbdc11cceb55f123aeb Mon Sep 17 00:00:00 2001 From: messense Date: Tue, 14 May 2024 09:45:19 +0800 Subject: [PATCH 3/4] Impl Clone --- src/page.rs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/page.rs b/src/page.rs index 52b38ed..0ca6305 100644 --- a/src/page.rs +++ b/src/page.rs @@ -371,7 +371,7 @@ impl Iterator for LinkIter { } } -#[derive(Deserialize, Serialize, Debug)] +#[derive(Deserialize, Serialize, Debug, Clone)] pub struct Font { pub name: String, pub family: String, @@ -380,7 +380,7 @@ pub struct Font { pub size: u32, } -#[derive(Deserialize, Serialize, Debug)] +#[derive(Deserialize, Serialize, Debug, Clone)] pub struct BBox { pub x: u32, pub y: u32, @@ -388,7 +388,7 @@ pub struct BBox { pub h: u32, } -#[derive(Deserialize, Serialize, Debug)] +#[derive(Deserialize, Serialize, Debug, Clone)] pub struct Line { pub wmode: u32, pub bbox: BBox, @@ -398,7 +398,7 @@ pub struct Line { pub text: String, } -#[derive(Deserialize, Serialize, Debug)] +#[derive(Deserialize, Serialize, Debug, Clone)] pub struct Block { pub r#type: String, pub bbox: BBox, @@ -406,7 +406,7 @@ pub struct Block { } // StructuredText -#[derive(Deserialize, Serialize, Debug)] +#[derive(Deserialize, Serialize, Debug, Clone)] pub struct StextPage { pub blocks: Vec, } From 504556504d80e38ba0ea1347de66c04459dff24f Mon Sep 17 00:00:00 2001 From: RikaKit <118338010+RikaKit@users.noreply.github.com> Date: Tue, 14 May 2024 09:41:38 +0400 Subject: [PATCH 4/4] applied cargo fmt --- examples/extract_stext.rs | 7 +++++-- src/page.rs | 22 +++++++++++++++------- 2 files changed, 20 insertions(+), 9 deletions(-) diff --git a/examples/extract_stext.rs b/examples/extract_stext.rs index d82ed8e..52508d8 100644 --- a/examples/extract_stext.rs +++ b/examples/extract_stext.rs @@ -4,12 +4,15 @@ fn main() { // cargo run --example extract_stext let mut path_to_doc = String::new(); println!("Enter a path to document: "); - io::stdin().read_line(&mut path_to_doc).expect("Failed to read line"); + io::stdin() + .read_line(&mut path_to_doc) + .expect("Failed to read line"); let doc = mupdf::document::Document::open(path_to_doc.trim()).unwrap(); let page = doc.load_page(0).unwrap(); match page.stext_page_as_json_from_page(1.0) { Ok(stext_json) => { - let stext_page: serde_json::Result = serde_json::from_str(stext_json.as_str()); + let stext_page: serde_json::Result = + serde_json::from_str(stext_json.as_str()); match stext_page { Ok(res) => { for block in res.blocks { diff --git a/src/page.rs b/src/page.rs index 0ca6305..111c1ea 100644 --- a/src/page.rs +++ b/src/page.rs @@ -8,7 +8,7 @@ use serde::{Deserialize, Serialize}; use mupdf_sys::*; use crate::{ - Buffer, Colorspace, context, Cookie, Device, DisplayList, Error, Link, Matrix, Pixmap, Quad, + context, Buffer, Colorspace, Cookie, Device, DisplayList, Error, Link, Matrix, Pixmap, Quad, Rect, Separations, TextPage, TextPageOptions, }; @@ -239,7 +239,11 @@ impl Page { pub fn stext_page_as_json_from_page(&self, scale: f32) -> Result { let mut buf = unsafe { - let inner = ffi_try!(mupdf_stext_page_as_json_from_page(context(), self.inner, scale)); + let inner = ffi_try!(mupdf_stext_page_as_json_from_page( + context(), + self.inner, + scale + )); Buffer::from_raw(inner) }; let mut res = String::new(); @@ -360,7 +364,7 @@ impl Iterator for LinkIter { ptr::null_mut(), ptr::null_mut(), ) - .page; + .page; } Some(Link { bounds, @@ -413,18 +417,22 @@ pub struct StextPage { #[cfg(test)] mod test { - use crate::{Document, Matrix}; use crate::page::StextPage; + use crate::{Document, Matrix}; #[test] fn test_get_stext_page_as_json() { - let path_to_doc = std::env::current_dir().unwrap() - .join("tests").join("files").join("dummy.pdf"); + let path_to_doc = std::env::current_dir() + .unwrap() + .join("tests") + .join("files") + .join("dummy.pdf"); let doc = Document::open(path_to_doc.to_str().unwrap()).unwrap(); let page = doc.load_page(0).unwrap(); match page.stext_page_as_json_from_page(1.0) { Ok(stext_json) => { - let stext_page: serde_json::Result = serde_json::from_str(stext_json.as_str()); + let stext_page: serde_json::Result = + serde_json::from_str(stext_json.as_str()); match stext_page { Ok(res) => { for block in res.blocks {