Skip to content

Commit

Permalink
Added savePDF option to recognize per #488; cleaned up code for linter
Browse files Browse the repository at this point in the history
  • Loading branch information
Balearica committed Sep 18, 2022
1 parent 689a150 commit 622c841
Show file tree
Hide file tree
Showing 4 changed files with 78 additions and 49 deletions.
11 changes: 7 additions & 4 deletions examples/browser/download-pdf.html
Original file line number Diff line number Diff line change
Expand Up @@ -8,26 +8,29 @@
<button id="download-pdf" disabled="true">Download PDF</button>
</div>
<textarea id="board" readonly rows="8" cols="80">Upload an image file</textarea>
<script>
<script type="module">
const { createWorker } = Tesseract;
const worker = await createWorker({
corePath: '/node_modules/tesseract.js-core/tesseract-core.wasm.js',
workerPath: "/dist/worker.dev.js",
logger: m => console.log(m),
});
const uploader = document.getElementById('uploader');
const dlBtn = document.getElementById('download-pdf');
let pdf;
const recognize = async ({ target: { files } }) => {
await worker.loadLanguage('eng');
await worker.initialize('eng');
const { data: { text } } = await worker.recognize(files[0]);
const res = await worker.recognize(files[0], {savePDF: true});
pdf = res.data.pdf;
const text = res.data.text;
const board = document.getElementById('board');
board.value = text;
dlBtn.disabled = false;
};
const downloadPDF = async () => {
const filename = 'tesseract-ocr-result.pdf';
const { data } = await worker.getPDF('Tesseract OCR Result');
const blob = new Blob([new Uint8Array(data)], { type: 'application/pdf' });
const blob = new Blob([new Uint8Array(pdf)], { type: 'application/pdf' });
if (navigator.msSaveBlob) {
// IE 10+
navigator.msSaveBlob(blob, filename);
Expand Down
34 changes: 17 additions & 17 deletions src/createWorker.js
Original file line number Diff line number Diff line change
Expand Up @@ -28,14 +28,14 @@ module.exports = async (_options = {}) => {
const resolves = {};
const rejects = {};

let resReject;
let resResolve;
const res = new Promise((resolve, reject) => {
resResolve = resolve;
resReject = reject;
let workerResReject;
let workerResResolve;
const workerRes = new Promise((resolve, reject) => {
workerResResolve = resolve;
workerResReject = reject;
});
let workerError = (event) => {resReject(event.message)};
const workerError = (event) => { workerResReject(event.message); };

let worker = spawnWorker(options);
worker.onerror = workerError;

Expand Down Expand Up @@ -63,8 +63,8 @@ module.exports = async (_options = {}) => {
})
);

const load = (jobId) => (
console.warn("`load` is depreciated and should be removed from code (workers now come pre-loaded)")
/**
 * Deprecated no-op retained so existing callers of `load` keep working.
 *
 * Takes no arguments (any passed are ignored) and only emits a console
 * warning telling the caller to remove the call.
 *
 * @returns {undefined} The return value of `console.warn`.
 */
const load = () => (
  // Wording fix: the API is "deprecated" (discouraged), not "depreciated".
  console.warn('`load` is deprecated and should be removed from code (workers now come pre-loaded)')
);

const loadInternal = (jobId) => (
Expand Down Expand Up @@ -145,13 +145,14 @@ module.exports = async (_options = {}) => {
}))
);

const getPDF = (title = 'Tesseract OCR Result', textonly = false, jobId) => (
startJob(createJob({
const getPDF = (title = 'Tesseract OCR Result', textonly = false, jobId) => {
console.log('`getPDF` function is depreciated. `recognize` option `savePDF` should be used instead.');
return startJob(createJob({
id: jobId,
action: 'getPDF',
payload: { title, textonly },
}))
);
}));
};

const detect = async (image, jobId) => (
startJob(createJob({
Expand Down Expand Up @@ -189,7 +190,7 @@ module.exports = async (_options = {}) => {
resolves[action]({ jobId, data: d });
} else if (status === 'reject') {
rejects[action](data);
if (action === "load") resReject(data);
if (action === 'load') workerResReject(data);
if (errorHandler) {
errorHandler(data);
} else {
Expand Down Expand Up @@ -220,8 +221,7 @@ module.exports = async (_options = {}) => {
terminate,
};

loadInternal().then(() => resResolve(resolveObj)).catch(() => {});

return res;
loadInternal().then(() => workerResResolve(resolveObj)).catch(() => {});

return workerRes;
};
12 changes: 8 additions & 4 deletions src/index.d.ts
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,9 @@ declare namespace Tesseract {
saveImageOriginal: boolean
saveImageGrey: boolean
saveImageBinary: boolean
savePDF: boolean
pdfTitle: string
pdfTextOnly: boolean
rotateAuto: boolean
rotateRadians: float
}
Expand Down Expand Up @@ -231,10 +234,11 @@ declare namespace Tesseract {
box: string | null;
unlv: string | null;
sd: string | null;
imageOriginal: string;
imageGrey: string;
imageBinary: string;
rotateRadians: number;
imageOriginal: string | null;
imageGrey: string | null;
imageBinary: string | null;
rotateRadians: number | null;
pdf: number[] | null;
}
}

Expand Down
70 changes: 46 additions & 24 deletions src/worker-script/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -200,6 +200,20 @@ const initialize = async ({
}
};

/**
 * Renders the module-level `api` object to a PDF through a
 * `TessModule.TessPDFRenderer` and returns the contents of the resulting
 * file read from the in-memory filesystem.
 *
 * @param {string} title - Passed to the renderer's `BeginDocument`.
 * @param {boolean} textonly - Third argument of the `TessPDFRenderer` constructor.
 * @returns {*} Whatever `TessModule.FS.readFile('/tesseract-ocr.pdf')` returns.
 * @throws Re-throws any error raised by the renderer calls or by `readFile`.
 */
const getPDFInternal = (title, textonly) => {
  const pdfRenderer = new TessModule.TessPDFRenderer('tesseract-ocr', '/', textonly);
  try {
    pdfRenderer.BeginDocument(title);
    pdfRenderer.AddImage(api);
    pdfRenderer.EndDocument();
  } finally {
    // Release the renderer even if one of the rendering steps throws, so a
    // failed render does not leave the renderer allocated.
    TessModule._free(pdfRenderer);
  }

  // Read only after the document has been ended and the renderer released,
  // matching the original call order on the success path.
  return TessModule.FS.readFile('/tesseract-ocr.pdf');
};

/**
 * Worker-side handler for the `getPDF` action.
 *
 * Pulls `title` and `textonly` out of the packet payload, builds the PDF via
 * `getPDFInternal`, and hands the result to `res.resolve`.
 *
 * @param {{payload: {title: *, textonly: *}}} packet - Incoming job packet.
 * @param {{resolve: Function}} res - Response object for the current job.
 * @returns {Promise<undefined>} Settles once `res.resolve` has been called;
 *   rejects if the packet has no payload or `getPDFInternal` throws.
 */
const getPDF = async (packet, res) => {
  const { title, textonly } = packet.payload;
  const pdf = getPDFInternal(title, textonly);
  res.resolve(pdf);
};

const getImage = (type) => {
api.WriteImage(type, '/image.png');
const pngBuffer = TessModule.FS.readFile('/image.png');
Expand All @@ -211,7 +225,8 @@ const getImage = (type) => {
const recognize = async ({
payload: {
image, options: {
rectangle: rec, saveImageOriginal, saveImageGrey, saveImageBinary, rotateAuto, rotateRadians,
rectangle: rec, saveImageOriginal, saveImageGrey, saveImageBinary, savePDF, pdfTitle,
pdfTextOnly, rotateAuto, rotateRadians,
},
},
}, res) => {
Expand Down Expand Up @@ -263,12 +278,23 @@ const recognize = async ({
const result = dump(TessModule, api, params);
if (saveImageOriginal) {
result.imageOriginal = getImage(imageType.ORIGINAL);
} else {
result.imageOriginal = null;
}
if (saveImageGrey) {
result.imageGrey = getImage(imageType.GREY);
} else {
result.imageGrey = null;
}
if (saveImageBinary) {
result.imageBinary = getImage(imageType.BINARY);
} else {
result.imageBinary = null;
}
if (savePDF) {
result.pdf = getPDFInternal(pdfTitle ?? 'Tesseract OCR Result', pdfTextOnly ?? false);
} else {
result.pdf = null;
}
result.rotateRadians = rotateRadiansFinal;
res.resolve(result);
Expand Down Expand Up @@ -325,12 +351,18 @@ const threshold = async ({
const result = {};
if (saveImageOriginal) {
result.imageOriginal = getImage(imageType.ORIGINAL);
} else {
result.imageOriginal = null;
}
if (saveImageGrey) {
result.imageGrey = getImage(imageType.GREY);
} else {
result.imageGrey = null;
}
if (saveImageBinary) {
result.imageBinary = getImage(imageType.BINARY);
} else {
result.imageBinary = null;
}
result.rotateRadians = rotateRadiansFinal;
res.resolve(result);
Expand All @@ -340,24 +372,14 @@ const threshold = async ({
}
};

/**
 * Worker-side handler for the `getPDF` action: renders the module-level `api`
 * object to a PDF through a `TessModule.TessPDFRenderer` and resolves the job
 * with the contents of the generated file.
 *
 * @param {{payload: {title: *, textonly: *}}} packet - Incoming job packet;
 *   `title` goes to `BeginDocument`, `textonly` to the renderer constructor.
 * @param {{resolve: Function}} res - Response object for the current job.
 * @returns {Promise<undefined>} Rejects if any renderer call or the file read throws.
 */
const getPDF = async ({ payload: { title, textonly } }, res) => {
  const pdfRenderer = new TessModule.TessPDFRenderer('tesseract-ocr', '/', textonly);
  try {
    pdfRenderer.BeginDocument(title);
    pdfRenderer.AddImage(api);
    pdfRenderer.EndDocument();
  } finally {
    // Release the renderer even if a rendering step throws, so a failed
    // render does not leave the renderer allocated.
    TessModule._free(pdfRenderer);
  }

  res.resolve(TessModule.FS.readFile('/tesseract-ocr.pdf'));
};

const detect = async ({ payload: { image } }, res) => {
try {
const ptr = setImage(TessModule, api, image);
const results = new TessModule.OSResults();

if (!api.DetectOS(results)) {
TessModule._free(ptr);

res.resolve({
tesseract_script_id: null,
script: null,
Expand Down Expand Up @@ -422,18 +444,18 @@ exports.dispatchHandlers = (packet, send) => {

latestJob = res;

({
load,
FS,
loadLanguage,
initialize,
setParameters,
recognize,
threshold,
getPDF,
detect,
terminate,
})[packet.action](packet, res)
({
load,
FS,
loadLanguage,
initialize,
setParameters,
recognize,
threshold,
getPDF,
detect,
terminate,
})[packet.action](packet, res)
.catch((err) => res.reject(err.toString()));
};

Expand Down

0 comments on commit 622c841

Please sign in to comment.