Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

added functionality for receiving and parsing stext from page #87

Merged
merged 4 commits into from
May 14, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,8 @@ mupdf-sys = { version = "0.4.2", path = "mupdf-sys" }
once_cell = "1.3.1"
num_enum = "0.7.0"
bitflags = "2.0.2"
serde = { version = "1.0.201", features = ["derive"] }
serde_json = "1.0.117"

[dependencies.font-kit]
version = "0.12.0"
Expand Down
33 changes: 33 additions & 0 deletions examples/extract_stext.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
use std::io;

/// Interactive example: reads a document path from stdin, extracts the
/// structured text of the first page as JSON, parses it into
/// `mupdf::page::StextPage` and prints the text of every line that belongs to
/// a block whose type is `"text"`.
///
/// Run with: `cargo run --example extract_stext`
fn main() {
    let mut path_to_doc = String::new();
    println!("Enter a path to document: ");
    io::stdin()
        .read_line(&mut path_to_doc)
        .expect("Failed to read line");

    // `read_line` keeps the trailing newline, so trim before opening.
    let doc = mupdf::document::Document::open(path_to_doc.trim()).unwrap();
    let page = doc.load_page(0).unwrap();

    let stext_json = match page.stext_page_as_json_from_page(1.0) {
        Ok(json) => json,
        Err(err) => {
            // Previously this error was dropped silently, leaving the user
            // with no output and no hint about what went wrong.
            eprintln!("stext extraction error: {:?}", &err);
            return;
        }
    };

    let stext_page: mupdf::page::StextPage = match serde_json::from_str(stext_json.as_str()) {
        Ok(parsed) => parsed,
        Err(err) => {
            println!("stext_page parsing error: {:?}", &err);
            return;
        }
    };

    for block in stext_page.blocks {
        if block.r#type.eq("text") {
            for line in block.lines {
                println!("{:?}", &line.text);
            }
        }
    }
}
28 changes: 28 additions & 0 deletions mupdf-sys/wrapper.c
Original file line number Diff line number Diff line change
Expand Up @@ -889,6 +889,34 @@ fz_buffer *mupdf_page_to_html(fz_context *ctx, fz_page *page, mupdf_error_t **er
return buf;
}

/*
 * Builds a structured-text page from `page`, prints it as JSON (using `scale`
 * as the scale argument of fz_print_stext_page_as_json) into a freshly
 * allocated buffer, and returns that buffer.
 *
 * On failure the error is stored through `errptr` and NULL is returned; the
 * partially filled buffer is released so it cannot leak. On success the caller
 * owns the returned buffer.
 */
fz_buffer *mupdf_stext_page_as_json_from_page(fz_context *ctx, fz_page *page, float scale, mupdf_error_t **errptr)
{
    fz_buffer *buf = NULL;
    fz_output *out = NULL;
    fz_stext_page *stext_page = NULL;
    /* These locals are assigned inside fz_try and read in fz_always/fz_catch. */
    fz_var(stext_page);
    fz_var(buf);
    fz_var(out);
    fz_try(ctx)
    {
        stext_page = fz_new_stext_page_from_page(ctx, page, NULL);
        buf = fz_new_buffer(ctx, 8192);
        out = fz_new_output_with_buffer(ctx, buf);
        fz_print_stext_page_as_json(ctx, out, stext_page, scale);
        fz_close_output(ctx, out);
    }
    fz_always(ctx)
    {
        fz_drop_output(ctx, out);
        fz_drop_stext_page(ctx, stext_page);
    }
    fz_catch(ctx)
    {
        /* Do not hand back a half-written buffer together with an error. */
        fz_drop_buffer(ctx, buf);
        buf = NULL;
        mupdf_save_error(ctx, errptr);
    }
    return buf;
}

fz_buffer *mupdf_page_to_xhtml(fz_context *ctx, fz_page *page, mupdf_error_t **errptr)
{
fz_buffer *buf = NULL;
Expand Down
99 changes: 94 additions & 5 deletions src/page.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@ use std::io::Read;
use std::ptr;
use std::slice;

use serde::{Deserialize, Serialize};

use mupdf_sys::*;

use crate::{
Expand Down Expand Up @@ -235,6 +237,20 @@ impl Page {
Ok(out)
}

/// Returns the structured text of this page serialized as a JSON string.
///
/// `scale` is forwarded unchanged to the native
/// `mupdf_stext_page_as_json_from_page` wrapper.
///
/// # Errors
///
/// Returns an error when the native call fails (propagated by `ffi_try!`) or
/// when the resulting buffer cannot be read into a `String`. The latter used
/// to panic via `unwrap()`; it is now reported to the caller instead.
pub fn stext_page_as_json_from_page(&self, scale: f32) -> Result<String, Error> {
    let mut buf = unsafe {
        let inner = ffi_try!(mupdf_stext_page_as_json_from_page(
            context(),
            self.inner,
            scale
        ));
        Buffer::from_raw(inner)
    };
    let mut res = String::new();
    buf.read_to_string(&mut res)?;
    Ok(res)
}

pub fn to_xhtml(&self) -> Result<String, Error> {
let mut buf = unsafe {
let inner = ffi_try!(mupdf_page_to_xhtml(context(), self.inner));
Expand Down Expand Up @@ -359,10 +375,83 @@ impl Iterator for LinkIter {
}
}

/// Font description attached to a [`Line`] in the structured-text JSON.
///
/// All fields are deserialized by name from the JSON object; every field is
/// required.
#[derive(Deserialize, Serialize, Debug, Clone)]
pub struct Font {
/// Font name as reported in the JSON.
pub name: String,
/// Font family string (presumably a generic family such as serif/sans-serif — TODO confirm).
pub family: String,
/// Font weight string (presumably e.g. "normal"/"bold" — TODO confirm).
pub weight: String,
/// Font style string (presumably e.g. "normal"/"italic" — TODO confirm).
pub style: String,
/// Font size as an unsigned integer (presumably already multiplied by the requested scale — TODO confirm).
pub size: u32,
}

/// Bounding box used by [`Block`] and [`Line`] in the structured-text JSON.
///
/// NOTE(review): all four values are `u32`. If the producer ever emits a
/// negative coordinate (e.g. content positioned outside the page origin),
/// deserialization will fail — confirm against the MuPDF JSON output.
#[derive(Deserialize, Serialize, Debug, Clone)]
pub struct BBox {
/// Horizontal position (presumably the left edge — TODO confirm).
pub x: u32,
/// Vertical position (presumably the top edge — TODO confirm).
pub y: u32,
/// Width of the box.
pub w: u32,
/// Height of the box.
pub h: u32,
}

/// A single line of text inside a [`Block`] of the structured-text JSON.
///
/// Every field is required during deserialization.
#[derive(Deserialize, Serialize, Debug, Clone)]
pub struct Line {
/// Writing mode of the line (presumably 0 = horizontal, 1 = vertical — TODO confirm).
pub wmode: u32,
/// Bounding box of the line.
pub bbox: BBox,
/// Font information reported for the line.
pub font: Font,
/// Horizontal coordinate reported for the line (presumably the origin of its first character — TODO confirm).
pub x: u32,
/// Vertical coordinate reported for the line (presumably the origin of its first character — TODO confirm).
pub y: u32,
/// Text content of the line.
pub text: String,
}

#[derive(Deserialize, Serialize, Debug, Clone)]
pub struct Block {
pub r#type: String,
pub bbox: BBox,
pub lines: Vec<Line>,
}

/// Root object of the structured-text ("stext") JSON of a page.
///
/// Deserialize the JSON string of a page's structured text into this type
/// with `serde_json::from_str`.
#[derive(Deserialize, Serialize, Debug, Clone)]
pub struct StextPage {
/// Blocks found on the page, in the order they appear in the JSON.
pub blocks: Vec<Block>,
}

#[cfg(test)]
mod test {
use crate::page::StextPage;
use crate::{Document, Matrix};

/// Extracts the structured text of the first page of `tests/files/dummy.pdf`
/// as JSON, parses it into `StextPage` and checks the text of every line in
/// every `"text"` block.
///
/// Extraction or parsing failures now fail the test instead of being printed
/// and ignored, and at least one line must be checked so the test cannot pass
/// vacuously.
#[test]
fn test_get_stext_page_as_json() {
    let path_to_doc = std::env::current_dir()
        .unwrap()
        .join("tests")
        .join("files")
        .join("dummy.pdf");
    let doc = Document::open(path_to_doc.to_str().unwrap()).unwrap();
    let page = doc.load_page(0).unwrap();

    let stext_json = page
        .stext_page_as_json_from_page(1.0)
        .expect("stext JSON extraction failed");
    let stext_page: StextPage =
        serde_json::from_str(stext_json.as_str()).expect("stext_page parsing error");

    let mut checked_lines = 0usize;
    for block in stext_page.blocks {
        if block.r#type.eq("text") {
            for line in block.lines {
                assert_eq!(&line.text, &"Dummy PDF file".to_string());
                checked_lines += 1;
            }
        }
    }
    assert!(
        checked_lines > 0,
        "expected at least one text line in dummy.pdf"
    );
}

#[test]
fn test_page_to_svg() {
let doc = Document::open("tests/files/dummy.pdf").unwrap();
Expand Down Expand Up @@ -454,20 +543,20 @@ mod test {
[Quad {
ul: Point {
x: 56.8,
y: 69.32512
y: 69.32512,
},
ur: Point {
x: 115.85405,
y: 69.32512
y: 69.32512,
},
ll: Point {
x: 56.8,
y: 87.311844
y: 87.311844,
},
lr: Point {
x: 115.85405,
y: 87.311844
}
y: 87.311844,
},
}]
);

Expand Down
6 changes: 3 additions & 3 deletions src/text_page.rs
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,7 @@ impl TextBlock<'_> {

pub fn lines(&self) -> TextLineIter {
unsafe {
if self.inner.type_ == FZ_STEXT_BLOCK_TEXT as _ {
if self.inner.type_ == FZ_STEXT_BLOCK_TEXT as i32 {
return TextLineIter {
next: self.inner.u.t.first_line,
_marker: PhantomData,
Expand All @@ -130,7 +130,7 @@ impl TextBlock<'_> {

pub fn ctm(&self) -> Option<Matrix> {
unsafe {
if self.inner.type_ == FZ_STEXT_BLOCK_IMAGE as _ {
if self.inner.type_ == FZ_STEXT_BLOCK_IMAGE as i32 {
return Some(self.inner.u.i.transform.into());
}
}
Expand All @@ -139,7 +139,7 @@ impl TextBlock<'_> {

pub fn image(&self) -> Option<Image> {
unsafe {
if self.inner.type_ == FZ_STEXT_BLOCK_IMAGE as _ {
if self.inner.type_ == FZ_STEXT_BLOCK_IMAGE as i32 {
let inner = self.inner.u.i.image;
fz_keep_image(context(), inner);
return Some(Image::from_raw(inner));
Expand Down
Loading