Skip to content

Commit

Permalink
wip
Browse files Browse the repository at this point in the history
  • Loading branch information
diegobernardes committed Feb 20, 2025
1 parent 1dcb8d0 commit f18595d
Show file tree
Hide file tree
Showing 4 changed files with 26 additions and 104 deletions.
88 changes: 10 additions & 78 deletions main.c
Original file line number Diff line number Diff line change
Expand Up @@ -290,9 +290,7 @@ char *strdup(const char *s1) {
return str;
}

fz_stext_page *
tmp_fz_new_stext_page_from_page(fz_context *ctx, pdf_page *page, const fz_stext_options *options, save_to_html_input input)
{
fz_stext_page *nitro_new_stext_page_from_page(fz_context *ctx, pdf_page *page, const fz_stext_options *options, save_to_html_input input) {
fz_stext_page *text;
fz_device *dev = NULL;

Expand All @@ -301,7 +299,6 @@ tmp_fz_new_stext_page_from_page(fz_context *ctx, pdf_page *page, const fz_stext_
if (page == NULL)
return NULL;

// ------
float scale_factor = 1.5;
fz_rect bounds = pdf_bound_page(ctx, page, FZ_CROP_BOX);
if (input.width != 0) {
Expand All @@ -318,50 +315,37 @@ tmp_fz_new_stext_page_from_page(fz_context *ctx, pdf_page *page, const fz_stext_
scale_factor = 1.5;
}
}

float resolution = (float)(input.dpi) / 72;
fz_matrix ctm = fz_concat(fz_scale(resolution, resolution), fz_scale(scale_factor, scale_factor));
bounds = fz_transform_rect(bounds, ctm);
// fz_irect bbox = fz_round_rect(bounds);
// ------
fz_rect bounds2 = fz_bound_page(ctx, &(page->super));
printf("Rectangle1: (%.2f, %.2f) to (%.2f, %.2f)\n", bounds.x0, bounds.y0, bounds.x1, bounds.y1);
printf("Rectangle2: (%.2f, %.2f) to (%.2f, %.2f)\n\n", bounds2.x0, bounds2.y0, bounds2.x1, bounds2.y1);

// text = fz_new_stext_page(ctx, fz_bound_page(ctx, &(page->super)));
text = fz_new_stext_page(ctx, bounds);
fz_try(ctx)
{
fz_try(ctx) {
dev = fz_new_stext_device(ctx, text, options);
fz_run_page_contents(ctx, &(page->super), dev, ctm, NULL);
fz_close_device(ctx, dev);
}
fz_always(ctx)
{
} fz_always(ctx) {
fz_drop_device(ctx, dev);
}
fz_catch(ctx)
{
} fz_catch(ctx) {
fz_drop_stext_page(ctx, text);
fz_rethrow(ctx);
}

return text;
}

fz_stext_page *
tmp_fz_new_stext_page_from_page_number(fz_context *ctx, pdf_document *doc, int number, const fz_stext_options *options, save_to_html_input input)
{
fz_stext_page *nitro_new_stext_page_from_page_number(fz_context *ctx, pdf_document *doc, int number, const fz_stext_options *options, save_to_html_input input) {
pdf_page *page;
fz_stext_page *text = NULL;

page = pdf_load_page(ctx, doc, number);
fz_try(ctx)
text = tmp_fz_new_stext_page_from_page(ctx, page, options, input);
text = nitro_new_stext_page_from_page(ctx, page, options, input);
fz_always(ctx)
pdf_drop_page(ctx, page);
fz_catch(ctx)
fz_rethrow(ctx);

return text;
}

Expand All @@ -374,25 +358,17 @@ save_to_html_output save_to_html(save_to_html_input input) {
return output;
}

pdf_document *doc = NULL;
fz_stream *stream = NULL;
pdf_document *doc = NULL;
fz_buffer *html_buffer = NULL;
fz_output *out = NULL;
fz_stext_page *text_page = NULL;
fz_matrix ctm;
fz_rect bounds;
fz_device *dev = NULL;
fz_page *page = NULL;

fz_var(doc);
fz_var(stream);
fz_var(doc);
fz_var(html_buffer);
fz_var(out);
fz_var(text_page);
fz_var(ctm);
fz_var(bounds);
fz_var(dev);
fz_var(page);

fz_try(ctx) {
stream = fz_open_memory(ctx, (unsigned char *)input.payload, input.payload_length);
Expand All @@ -404,40 +380,18 @@ save_to_html_output save_to_html(save_to_html_input input) {

html_buffer = fz_new_buffer(ctx, 8192);
out = fz_new_output_with_buffer(ctx, html_buffer);

fz_write_string(ctx, out, "<!DOCTYPE html>\n<html>\n<head>\n<style>\np{position:absolute;white-space:pre;margin:0}\n</style>\n</head>\n<body>\n");

// page = fz_load_page(ctx, doc, input.page);
// bounds = fz_bound_page_box(ctx, page, FZ_CROP_BOX);
// // TODO: calculate the correct size.
// ctm = fz_identity;
// bounds = fz_transform_rect(bounds, ctm);

fz_stext_options stext_options = { 0 };
stext_options.flags |= FZ_STEXT_CLIP;
stext_options.flags |= FZ_STEXT_ACCURATE_BBOXES;
stext_options.flags |= FZ_STEXT_PRESERVE_WHITESPACE;
stext_options.flags |= FZ_STEXT_COLLECT_STRUCTURE;
stext_options.flags |= FZ_STEXT_COLLECT_VECTORS;

// text_page = fz_new_stext_page(ctx, bounds);
// dev = fz_new_stext_device(ctx, text_page, &stext_options);
// fz_run_page(ctx, page, dev, ctm, input.cookie);

// text_page = tmp_fz_new_stext_page_from_page_number(ctx, doc, input.page, &stext_options);
// text_page = fz_new_stext_page_from_page(ctx, page, &stext_options);
text_page = tmp_fz_new_stext_page_from_page_number(ctx, doc, input.page, &stext_options, input);

// TODO: This can be use to capture payload as a JSON.
// fz_print_stext_page_as_json(ctx, out, text_page, 1);

// print_custom_stext_page_as_html(ctx, out, text_page);

text_page = nitro_new_stext_page_from_page_number(ctx, doc, input.page, &stext_options, input);

fz_print_stext_page_as_html(ctx, out, text_page, input.page);

fz_write_string(ctx, out, "</body></html>");

fz_close_output(ctx, out);

output.payload = je_malloc(html_buffer->len + 1);
Expand All @@ -456,9 +410,7 @@ save_to_html_output save_to_html(save_to_html_input input) {
}

cleanup:
if(dev) fz_drop_device(ctx, dev);
if(text_page) fz_drop_stext_page(ctx, text_page);
if(page) fz_drop_page(ctx, page);
if(out) fz_drop_output(ctx, out);
if(html_buffer) fz_drop_buffer(ctx, html_buffer);
if(doc) pdf_drop_document(ctx, doc);
Expand All @@ -467,23 +419,3 @@ save_to_html_output save_to_html(save_to_html_input input) {

return output;
}

/*
add to the css page
p {
position: absolute;
}
span {
color: transparent !important;
user-select: text !important;
}
body {
background-image: url('sample_page0.png');
background-size: cover;
background-position: center;
background-repeat: no-repeat;
background-attachment: fixed;
width: 1190.6pt;
height: 841.9pt;
}
*/
11 changes: 9 additions & 2 deletions main.go
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,9 @@ func SaveToPNG(
return nil
}

func SaveToHTML(ctx context.Context, page uint16, rawPayload io.Reader, output io.Writer) (err error) {
func SaveToHTML(
ctx context.Context, page, width uint16, scale float32, dpi int, rawPayload io.Reader, output io.Writer,
) (err error) {
span, _ := ddTracer.StartSpanFromContext(ctx, "lazypdf.SaveToHTML")
defer func() { span.Finish(ddTracer.WithError(err)) }()

Expand All @@ -97,11 +99,16 @@ func SaveToHTML(ctx context.Context, page uint16, rawPayload io.Reader, output i

input := C.save_to_html_input{
page: C.int(page),
dpi: C.int(72),
width: C.int(width),
scale: C.float(scale),
dpi: C.int(dpi),
payload: (*C.char)(unsafe.Pointer(&payload[0])),
payload_length: C.size_t(len(payload)),
cookie: &C.fz_cookie{abort: 0},
}
if dpi < defaultDPI {
input.dpi = C.int(defaultDPI)
}
go func() {
<-ctx.Done()
input.cookie.abort = 1
Expand Down
14 changes: 6 additions & 8 deletions main_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,16 +18,14 @@ func TestSaveToHTMLOK(t *testing.T) {
defer func() { require.NoError(t, file.Close()) }()

buf := bytes.NewBuffer([]byte{})
err = SaveToHTML(context.Background(), i, file, buf)
err = SaveToHTML(context.Background(), i, 0, 0, 0, file, buf)
require.NoError(t, err)

//expectedPage, err := os.ReadFile(fmt.Sprintf("testdata/sample_page%d.html", i))
//require.NoError(t, err)
expectedPage, err := os.ReadFile(fmt.Sprintf("testdata/sample_page%d.html", i))
require.NoError(t, err)
resultPage, err := io.ReadAll(buf)
//require.NoError(t, err)
//equire.Equal(t, expectedPage, resultPage)

os.WriteFile(fmt.Sprintf("testdata/sample_page%d.html", i), resultPage, 0644)
require.NoError(t, err)
require.Equal(t, expectedPage, resultPage)
}
}

Expand All @@ -36,7 +34,7 @@ func TestSaveToHTMLFail(t *testing.T) {
require.NoError(t, err)
defer func() { require.NoError(t, file.Close()) }()

err = SaveToHTML(context.Background(), 0, file, bytes.NewBuffer([]byte{}))
err = SaveToHTML(context.Background(), 0, 0, 0, 0, file, bytes.NewBuffer([]byte{}))
require.Error(t, err)
require.Equal(t, "failure at the C/MuPDF layer: no objects found", err.Error())
}
Expand Down
17 changes: 1 addition & 16 deletions testdata/sample_page0.html
Original file line number Diff line number Diff line change
Expand Up @@ -2,22 +2,7 @@
<html>
<head>
<style>
p {
position: absolute;
}
span {
color: transparent !important; /* Makes the font transparent */
user-select: text !important; /* Allows text selection */
}
body {
background-image: url('sample_page0.png');
background-size: cover; /* Covers the entire viewport */
background-position: center;
background-repeat: no-repeat;
background-attachment: fixed; /* Keeps the image fixed during scroll */
width: 1190.6pt;
height: 841.9pt;
}
p{position:absolute;white-space:pre;margin:0}
</style>
</head>
<body>
Expand Down

0 comments on commit f18595d

Please sign in to comment.