From 40ebf911fbb0a124db0f49367fdc2257f5e57ef9 Mon Sep 17 00:00:00 2001 From: mrsekut Date: Wed, 14 Aug 2024 12:51:33 +0900 Subject: [PATCH 01/11] add getPDFs, getImageDirs --- src/index.ts | 6 ++---- src/utils/file.ts | 13 +++++++++++++ 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/src/index.ts b/src/index.ts index d7f1a84..183dbd5 100644 --- a/src/index.ts +++ b/src/index.ts @@ -1,11 +1,10 @@ -import fs from 'node:fs'; import path from 'node:path'; import Bottleneck from 'bottleneck'; import cliProgress from 'cli-progress'; import * as dotenv from 'dotenv'; import { sleep } from 'app/utils/utils.js'; -import { getFileInfo, mkdir } from 'app/utils/file.js'; +import { getFileInfo, getPDFs, mkdir } from 'app/utils/file.js'; import { readPDF, saveFiles } from 'app/pdf.js'; import * as Gyazo from './gyazo.js'; @@ -22,8 +21,7 @@ type Config = { }; export async function main(config: Config) { - const files = fs.readdirSync(config.workspace); - const pdfs = files.filter(file => path.extname(file) === '.pdf'); + const pdfs = await getPDFs(config.workspace); for (const pdf of pdfs) { const filepath = path.join(config.workspace, pdf); diff --git a/src/utils/file.ts b/src/utils/file.ts index 9b9ef13..8ce552c 100644 --- a/src/utils/file.ts +++ b/src/utils/file.ts @@ -1,6 +1,19 @@ import fs from 'node:fs/promises'; import path from 'node:path'; +export const getPDFs = async (path_: string) => { + const files = await fs.readdir(path_); + return files.filter(file => path.extname(file) === '.pdf'); +}; + +export const getImageDirs = async (path_: string) => { + const files = await fs.readdir(path_); + return files.filter(async file => { + const stats = await fs.lstat(path.join(path_, file)); + return stats.isDirectory(); + }); +}; + type Extension = '.json' | '.pdf'; export function getFileInfo(filepath: string, extension: Extension) { From 0374c502724662769d6a191455f14857116bc752 Mon Sep 17 00:00:00 2001 From: mrsekut Date: Wed, 14 Aug 2024 12:52:04 +0900 Subject: [PATCH 02/11] todo comment --- src/index.ts | 1 + src/utils/file.ts | 1 + 2 files changed, 2 insertions(+) diff --git a/src/index.ts b/src/index.ts index 183dbd5..3e76f20 100644 --- a/src/index.ts +++ b/src/index.ts @@ -81,6 +81,7 @@ const generatePage = async ( pageLength: number, config: Config, ) => { + // TODO: out const path = `out/${filename}/${index}.jpg`; const imagePath = await saveFiles(img, path); diff --git a/src/utils/file.ts b/src/utils/file.ts index 8ce552c..c80400e 100644 --- a/src/utils/file.ts +++ b/src/utils/file.ts @@ -27,6 +27,7 @@ export function getFileInfo(filepath: string, extension: Extension) { }; } +// TODO: out export async function mkdir(filename: string) { try { await fs.stat(`out/${filename}`); From bd22ada8ef3853d882206f23fdc1f338c19bba61 Mon Sep 17 00:00:00 2001 From: mrsekut Date: Wed, 14 Aug 2024 12:58:08 +0900 Subject: [PATCH 03/11] define Path type --- src/utils/file.ts | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/utils/file.ts b/src/utils/file.ts index c80400e..6f8b476 100644 --- a/src/utils/file.ts +++ b/src/utils/file.ts @@ -1,12 +1,14 @@ import fs from 'node:fs/promises'; import path from 'node:path'; -export const getPDFs = async (path_: string) => { +export type Path = string; + +export const getPDFs = async (path_: Path) => { const files = await fs.readdir(path_); return files.filter(file => path.extname(file) === '.pdf'); }; -export const getImageDirs = async (path_: string) => { +export const getImageDirs = async (path_: Path) => { const files = await fs.readdir(path_); return files.filter(async file => { const stats = await fs.lstat(path.join(path_, file)); @@ -16,7 +18,7 @@ export const getImageDirs = async (path_: string) => { type Extension = '.json' | '.pdf'; -export function getFileInfo(filepath: string, extension: Extension) { +export function getFileInfo(filepath: Path, extension: Extension) { if (filepath === '') { throw new Error('invalid argument'); } From f3b81edad3c695abadde9e4ee7b5e0d3c4d75c20 Mon Sep 17 00:00:00 2001 From: mrsekut Date: Wed, 14 Aug 2024 13:23:14 +0900 Subject: [PATCH 04/11] update gitignore --- .gitignore | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/.gitignore b/.gitignore index fa92f44..e1f9672 100644 --- a/.gitignore +++ b/.gitignore @@ -1,11 +1,13 @@ node_modules -out .env -*.pdf .direnv/ .envrc + *.jpg +*.png +*.pdf +*.json .DS_Store \ No newline at end of file From 8f773047edfefc47ffe4fb0573a89d788919f211 Mon Sep 17 00:00:00 2001 From: mrsekut Date: Wed, 14 Aug 2024 13:24:17 +0900 Subject: [PATCH 05/11] return Path --- src/utils/file.ts | 37 ++++++++++++++++++++++--------------- 1 file changed, 22 insertions(+), 15 deletions(-) diff --git a/src/utils/file.ts b/src/utils/file.ts index 6f8b476..bde1e3f 100644 --- a/src/utils/file.ts +++ b/src/utils/file.ts @@ -1,19 +1,23 @@ import fs from 'node:fs/promises'; -import path from 'node:path'; +import np from 'node:path'; export type Path = string; -export const getPDFs = async (path_: Path) => { - const files = await fs.readdir(path_); - return files.filter(file => path.extname(file) === '.pdf'); +export const getPDFs = async (path: Path): Promise => { + const files = await fs.readdir(path); + return files + .filter(file => np.extname(file) === '.pdf') + .map(file => np.join(path, file)); }; -export const getImageDirs = async (path_: Path) => { - const files = await fs.readdir(path_); - return files.filter(async file => { - const stats = await fs.lstat(path.join(path_, file)); - return stats.isDirectory(); - }); +export const getImageDirs = async (path: Path): Promise => { + const files = await fs.readdir(path); + return files + .filter(async file => { + const stats = await fs.lstat(np.join(path, file)); + return stats.isDirectory(); + }) + .map(file => np.join(path, file)); }; type Extension = '.json' | '.pdf'; @@ -25,15 +29,18 @@ export function getFileInfo(filepath: Path, extension: Extension) { return { filepath, - filename: path.basename(filepath, extension), + filename: np.basename(filepath, extension), }; } -// TODO: out -export async function mkdir(filename: string) { +/** + * e.g. mkdir('out') + * e.g. mkdir('out/2021-01-01') + */ +export async function mkdir(name: string) { try { - await fs.stat(`out/${filename}`); + await fs.stat(name); } catch { - await fs.mkdir(`out/${filename}`, { recursive: true }); + await fs.mkdir(name, { recursive: true }); } } From 3007c5083457448408c2e7f573f5e05547dfdfcf Mon Sep 17 00:00:00 2001 From: mrsekut Date: Wed, 14 Aug 2024 13:26:54 +0900 Subject: [PATCH 06/11] wip: pdf2images --- src/index.ts | 141 +++++++++++++++++++++++++++------------------- src/pdf.ts | 33 ++++++++++- src/utils/file.ts | 19 +++++++ 3 files changed, 134 insertions(+), 59 deletions(-) diff --git a/src/index.ts b/src/index.ts index 3e76f20..3508261 100644 --- a/src/index.ts +++ b/src/index.ts @@ -1,12 +1,19 @@ -import path from 'node:path'; +import np from 'node:path'; import Bottleneck from 'bottleneck'; import cliProgress from 'cli-progress'; import * as dotenv from 'dotenv'; import { sleep } from 'app/utils/utils.js'; -import { getFileInfo, getPDFs, mkdir } from 'app/utils/file.js'; - -import { readPDF, saveFiles } from 'app/pdf.js'; +import { + Path, + fileInfo, + getFileInfo, + getImageDirs, + getPDFs, + mkdir, +} from 'app/utils/file.js'; + +import { pdfToImages, pdfs2images, saveFiles } from 'app/pdf.js'; import * as Gyazo from './gyazo.js'; import { renderPage, saveJson } from 'app/renderScrapboxPage.js'; import { ProfilePath, createProfilePage } from 'app/profilePage.js'; @@ -21,76 +28,96 @@ type Config = { }; export async function main(config: Config) { - const pdfs = await getPDFs(config.workspace); + const pdfPaths = await getPDFs(config.workspace); + await pdfs2images(pdfPaths, config.workspace); + + // const dirs = await getImageDirs(config.inputs); + + // const images = [ss, ...dirs]; + // console.log({ images }); - for (const pdf of pdfs) { - const filepath = path.join(config.workspace, pdf); - await processSinglePDF(config, filepath); - } + // for (const pdf of images) { + // const filepath = path.join(config.inputs, pdf); + // console.log({ filepath }); + // // await processSinglePDF(config, filepath); + // } } -async function processSinglePDF(config: Config, filepath: string) { - const { filename } = getFileInfo(filepath, '.pdf'); +// // TODO: name, move +// async function createCosenseJson(config: Config, dirpath: string) { +// const pages = await Promise.all( +// imgs.map((img, index) => +// limiter.schedule(() => { +// progressBar.increment(); +// return generatePage(img, filename, index, imgs.length, config); +// }), +// ), +// ); +// // +// } - console.log(`\nProcessing PDF: ${filename}\n`); +// async function processSinglePDF(config: Config, filepath: string) { +// const { filename } = getFileInfo(filepath, '.pdf'); - await mkdir(filename); +// console.log(`\nProcessing PDF: ${filename}\n`); - const imgs = await readPDF(filepath); +// await mkdir(filename); - const limiter = new Bottleneck({ - maxConcurrent: 30, - minTime: 1000, - }); +// const imgs = await pdfToImages(filepath); - const progressBar = new cliProgress.SingleBar( - {}, - cliProgress.Presets.shades_classic, - ); +// const limiter = new Bottleneck({ +// maxConcurrent: 30, +// minTime: 1000, +// }); - progressBar.start(imgs.length, 0); +// const progressBar = new cliProgress.SingleBar( +// {}, +// cliProgress.Presets.shades_classic, +// ); - const pages = await Promise.all( - imgs.map((img, index) => - limiter.schedule(() => { - progressBar.increment(); - return generatePage(img, filename, index, imgs.length, config); - }), - ), - ); +// progressBar.start(imgs.length, 0); - const pagesWithProfile = await (async () => { - if (config.profile == null) return pages; +// const pages = await Promise.all( +// imgs.map((img, index) => +// limiter.schedule(() => { +// progressBar.increment(); +// return generatePage(img, filename, index, imgs.length, config); +// }), +// ), +// ); - const profilePage = await createProfilePage(config.profile); - return [...pages, profilePage]; - })(); +// const pagesWithProfile = await (async () => { +// if (config.profile == null) return pages; - progressBar.stop(); +// const profilePage = await createProfilePage(config.profile); +// return [...pages, profilePage]; +// })(); - // TODO: out - await saveJson(`out/${filename}-ocr.json`, { pages: pagesWithProfile }); +// progressBar.stop(); - console.log(`Finished processing PDF: ${filename}\n`); -} +// // TODO: out +// await saveJson(`out/${filename}-ocr.json`, { pages: pagesWithProfile }); -const generatePage = async ( - img: Buffer, - filename: string, - index: number, - pageLength: number, - config: Config, -) => { - // TODO: out - const path = `out/${filename}/${index}.jpg`; +// console.log(`Finished processing PDF: ${filename}\n`); +// } - const imagePath = await saveFiles(img, path); - const gyazoImageId = await Gyazo.upload(imagePath); +// const generatePage = async ( +// img: Buffer, +// filename: string, +// index: number, +// pageLength: number, +// config: Config, +// ) => { +// // TODO: out +// const path = `out/${filename}/${index}.jpg`; - await sleep(config.waitTimeForOcr); +// const imagePath = await saveFiles(img, path); +// const gyazoImageId = await Gyazo.upload(imagePath); - const ocr = await Gyazo.getGyazoOCR(gyazoImageId); - const page = renderPage(index, pageLength, gyazoImageId, ocr); +// await sleep(config.waitTimeForOcr); - return page; -}; +// const ocr = await Gyazo.getGyazoOCR(gyazoImageId); +// const page = renderPage(index, pageLength, gyazoImageId, ocr); + +// return page; +// }; diff --git a/src/pdf.ts b/src/pdf.ts index 8e22a73..4e567d1 100644 --- a/src/pdf.ts +++ b/src/pdf.ts @@ -1,8 +1,37 @@ import fs from 'node:fs/promises'; +import np from 'node:path'; import { pdf } from 'pdf-to-img'; import { range } from './utils/utils.js'; +import { Path, fileInfo, mkdir } from 'app/utils/file.js'; -export async function readPDF(path: string): Promise { +export async function pdfs2images( + pdfs: Path[], + outDir: string, +): Promise { + return Promise.all(pdfs.map(pdf => pdf2images(pdf, outDir))); +} + +/** + * - take a path of pdf + * - convert pdf to images + * - create outDir + * - save to outDir + */ +async function pdf2images(pdf: Path, outDir: string): Promise { + const { filename } = fileInfo(pdf); + const outPath = np.join(outDir, filename); + const imgs = await pdfToImages(pdf); + + await mkdir(outPath); + + Promise.all( + imgs.map((img, index) => saveFiles(img, `${outPath}/${index}.jpg`)), + ); + + return outPath; +} + +async function pdfToImages(path: Path): Promise { const buffer = await fs.readFile(path); const src = new Uint8Array(buffer); const doc = await pdf(src, { scale: 3 }); @@ -13,7 +42,7 @@ export async function readPDF(path: string): Promise { return pages; } -export async function saveFiles(img: Buffer, path: string) { +async function saveFiles(img: Buffer, path: string) { await fs.writeFile(path, img); return path; } diff --git a/src/utils/file.ts b/src/utils/file.ts index bde1e3f..9159dda 100644 --- a/src/utils/file.ts +++ b/src/utils/file.ts @@ -22,6 +22,25 @@ export const getImageDirs = async (path: Path): Promise => { type Extension = '.json' | '.pdf'; +/** + * e.g. fileInfo('out/2021-01-01/2021-01-01.pdf') + * - path: 'out/2021-01-01/2021-01-01.pdf', + * - dir: 'out/2021-01-01', + * - name: '2021-01-01.pdf', + * - filename: '2021-01-01', + * - ext: '.pdf', + */ +export function fileInfo(path: Path) { + return { + path: path, + dir: np.dirname(path), + name: np.basename(path), + filename: np.basename(path, np.extname(path)), + ext: np.extname(path), + }; +} + +/** @deprecated */ export function getFileInfo(filepath: Path, extension: Extension) { if (filepath === '') { throw new Error('invalid argument'); From 1eed61072c8ed82974f41d5e93ccc75e7b75f31a Mon Sep 17 00:00:00 2001 From: mrsekut Date: Wed, 14 Aug 2024 13:53:44 +0900 Subject: [PATCH 07/11] =?UTF-8?q?fix:=20=E5=90=8C=E6=9C=9F=E5=87=A6?= =?UTF-8?q?=E7=90=86=E3=81=A8filter=E3=82=92=E5=90=8C=E6=99=82=E3=81=AB?= =?UTF-8?q?=E8=A1=8C=E3=81=86=E3=81=A8=E3=83=90=E3=82=B0=E3=81=A3=E3=81=A6?= =?UTF-8?q?pdf=E3=82=82=E8=BF=94=E3=81=A3=E3=81=A6=E3=81=8D=E3=81=A6?= =?UTF-8?q?=E3=81=84=E3=81=9F?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/utils/file.ts | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/src/utils/file.ts b/src/utils/file.ts index 9159dda..2014754 100644 --- a/src/utils/file.ts +++ b/src/utils/file.ts @@ -10,7 +10,20 @@ export const getPDFs = async (path: Path): Promise => { .map(file => np.join(path, file)); }; -export const getImageDirs = async (path: Path): Promise => { +export const getImageDirs = async (dirPath: string): Promise => { + const files = await fs.readdir(dirPath); + + const dirs = await Promise.all( + files.map(async file => { + const fullPath = np.join(dirPath, file); + const stats = await fs.lstat(fullPath); + return stats.isDirectory() ? fullPath : null; + }), + ); + + return dirs.filter(d => d != null); +}; + const files = await fs.readdir(path); return files .filter(async file => { From 4870a2eaef71ff35c1b15b27d9990a5dc7cb2750 Mon Sep 17 00:00:00 2001 From: mrsekut Date: Wed, 14 Aug 2024 14:06:02 +0900 Subject: [PATCH 08/11] =?UTF-8?q?pdf=E2=86=92image=E3=81=A8image=E2=86=92j?= =?UTF-8?q?son=E3=82=92=E5=88=86=E9=9B=A2?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/index.ts | 127 +++++++++++++--------------------------------- src/utils/file.ts | 20 +------- 2 files changed, 36 insertions(+), 111 deletions(-) diff --git a/src/index.ts b/src/index.ts index 3508261..790d5bc 100644 --- a/src/index.ts +++ b/src/index.ts @@ -1,19 +1,8 @@ -import np from 'node:path'; -import Bottleneck from 'bottleneck'; -import cliProgress from 'cli-progress'; import * as dotenv from 'dotenv'; import { sleep } from 'app/utils/utils.js'; -import { - Path, - fileInfo, - getFileInfo, - getImageDirs, - getPDFs, - mkdir, -} from 'app/utils/file.js'; - -import { pdfToImages, pdfs2images, saveFiles } from 'app/pdf.js'; +import { Path, getImageDirs, getImages, getPDFs } from 'app/utils/file.js'; +import { pdfs2images } from 'app/pdf.js'; import * as Gyazo from './gyazo.js'; import { renderPage, saveJson } from 'app/renderScrapboxPage.js'; import { ProfilePath, createProfilePage } from 'app/profilePage.js'; @@ -27,97 +16,49 @@ type Config = { profile?: ProfilePath; }; +// TODO: log export async function main(config: Config) { const pdfPaths = await getPDFs(config.workspace); await pdfs2images(pdfPaths, config.workspace); - // const dirs = await getImageDirs(config.inputs); - - // const images = [ss, ...dirs]; - // console.log({ images }); - - // for (const pdf of images) { - // const filepath = path.join(config.inputs, pdf); - // console.log({ filepath }); - // // await processSinglePDF(config, filepath); - // } + const dirs = await getImageDirs(config.workspace); + await dirs2Cosense(config, dirs); } -// // TODO: name, move -// async function createCosenseJson(config: Config, dirpath: string) { -// const pages = await Promise.all( -// imgs.map((img, index) => -// limiter.schedule(() => { -// progressBar.increment(); -// return generatePage(img, filename, index, imgs.length, config); -// }), -// ), -// ); -// // -// } - -// async function processSinglePDF(config: Config, filepath: string) { -// const { filename } = getFileInfo(filepath, '.pdf'); - -// console.log(`\nProcessing PDF: ${filename}\n`); - -// await mkdir(filename); - -// const imgs = await pdfToImages(filepath); - -// const limiter = new Bottleneck({ -// maxConcurrent: 30, -// minTime: 1000, -// }); - -// const progressBar = new cliProgress.SingleBar( -// {}, -// cliProgress.Presets.shades_classic, -// ); - -// progressBar.start(imgs.length, 0); - -// const pages = await Promise.all( -// imgs.map((img, index) => -// limiter.schedule(() => { -// progressBar.increment(); -// return generatePage(img, filename, index, imgs.length, config); -// }), -// ), -// ); - -// const pagesWithProfile = await (async () => { -// if (config.profile == null) return pages; - -// const profilePage = await createProfilePage(config.profile); -// return [...pages, profilePage]; -// })(); +async function dirs2Cosense(config: Config, dirPaths: Path[]) { + Promise.all(dirPaths.map(dirPath => dir2Cosense(config, dirPath))); +} -// progressBar.stop(); +async function dir2Cosense(config: Config, dirPath: Path) { + const images = await getImages(dirPath); + const pages = await Promise.all( + images.map((img, index) => { + return generatePage(img, index, images.length, config); + }), + ); -// // TODO: out -// await saveJson(`out/${filename}-ocr.json`, { pages: pagesWithProfile }); + const pagesWithProfile = await (async () => { + if (config.profile == null) return pages; -// console.log(`Finished processing PDF: ${filename}\n`); -// } + const profilePage = await createProfilePage(config.profile); + return [...pages, profilePage]; + })(); -// const generatePage = async ( -// img: Buffer, -// filename: string, -// index: number, -// pageLength: number, -// config: Config, -// ) => { -// // TODO: out -// const path = `out/${filename}/${index}.jpg`; + await saveJson(`${dirPath}-ocr.json`, { pages: pagesWithProfile }); +} -// const imagePath = await saveFiles(img, path); -// const gyazoImageId = await Gyazo.upload(imagePath); +const generatePage = async ( + path: Path, + index: number, + pageLength: number, + config: Config, +) => { + const gyazoImageId = await Gyazo.upload(path); -// await sleep(config.waitTimeForOcr); + await sleep(config.waitTimeForOcr); -// const ocr = await Gyazo.getGyazoOCR(gyazoImageId); -// const page = renderPage(index, pageLength, gyazoImageId, ocr); + const ocr = await Gyazo.getGyazoOCR(gyazoImageId); + const page = renderPage(index, pageLength, gyazoImageId, ocr); -// return page; -// }; + return page; +}; diff --git a/src/utils/file.ts b/src/utils/file.ts index 2014754..ac06747 100644 --- a/src/utils/file.ts +++ b/src/utils/file.ts @@ -24,17 +24,13 @@ export const getImageDirs = async (dirPath: string): Promise => { return dirs.filter(d => d != null); }; +export const getImages = async (path: Path): Promise => { const files = await fs.readdir(path); return files - .filter(async file => { - const stats = await fs.lstat(np.join(path, file)); - return stats.isDirectory(); - }) + .filter(file => np.extname(file) === '.jpg' || np.extname(file) === '.png') .map(file => np.join(path, file)); }; -type Extension = '.json' | '.pdf'; - /** * e.g. fileInfo('out/2021-01-01/2021-01-01.pdf') * - path: 'out/2021-01-01/2021-01-01.pdf', @@ -53,18 +49,6 @@ export function fileInfo(path: Path) { }; } -/** @deprecated */ -export function getFileInfo(filepath: Path, extension: Extension) { - if (filepath === '') { - throw new Error('invalid argument'); - } - - return { - filepath, - filename: np.basename(filepath, extension), - }; -} - /** * e.g. mkdir('out') * e.g. mkdir('out/2021-01-01') From 1a9e97cbbc923337d36fa9921f5ecd7a85624e01 Mon Sep 17 00:00:00 2001 From: mrsekut Date: Wed, 14 Aug 2024 14:11:46 +0900 Subject: [PATCH 09/11] add log --- src/index.ts | 21 +++++++++++++++++---- src/pdf.ts | 3 +++ 2 files changed, 20 insertions(+), 4 deletions(-) diff --git a/src/index.ts b/src/index.ts index 790d5bc..60ebbcc 100644 --- a/src/index.ts +++ b/src/index.ts @@ -1,4 +1,5 @@ import * as dotenv from 'dotenv'; +import cp from 'cli-progress'; import { sleep } from 'app/utils/utils.js'; import { Path, getImageDirs, getImages, getPDFs } from 'app/utils/file.js'; @@ -6,6 +7,7 @@ import { pdfs2images } from 'app/pdf.js'; import * as Gyazo from './gyazo.js'; import { renderPage, saveJson } from 'app/renderScrapboxPage.js'; import { ProfilePath, createProfilePage } from 'app/profilePage.js'; +import Bottleneck from 'bottleneck'; dotenv.config(); @@ -16,7 +18,6 @@ type Config = { profile?: ProfilePath; }; -// TODO: log export async function main(config: Config) { const pdfPaths = await getPDFs(config.workspace); await pdfs2images(pdfPaths, config.workspace); @@ -30,11 +31,20 @@ async function dirs2Cosense(config: Config, dirPaths: Path[]) { } async function dir2Cosense(config: Config, dirPath: Path) { + const limiter = new Bottleneck({ maxConcurrent: 30, minTime: 1000 }); + const progressBar = new cp.SingleBar({}, cp.Presets.shades_classic); + + console.log(`imgs→cosense: start ${dirPath}\n`); const images = await getImages(dirPath); + progressBar.start(images.length, 0); + const pages = await Promise.all( - images.map((img, index) => { - return generatePage(img, index, images.length, config); - }), + images.map((img, index) => + limiter.schedule(() => { + progressBar.increment(); + return generatePage(img, index, images.length, config); + }), + ), ); const pagesWithProfile = await (async () => { @@ -45,6 +55,9 @@ async function dir2Cosense(config: Config, dirPath: Path) { })(); await saveJson(`${dirPath}-ocr.json`, { pages: pagesWithProfile }); + + progressBar.stop(); + console.log(`imgs→cosense: end ${dirPath}\n`); } const generatePage = async ( diff --git a/src/pdf.ts b/src/pdf.ts index 4e567d1..4795568 100644 --- a/src/pdf.ts +++ b/src/pdf.ts @@ -18,6 +18,8 @@ export async function pdfs2images( * - save to outDir */ async function pdf2images(pdf: Path, outDir: string): Promise { + console.log(`pdf→images: start ${pdf}\n`); + const { filename } = fileInfo(pdf); const outPath = np.join(outDir, filename); const imgs = await pdfToImages(pdf); @@ -28,6 +30,7 @@ async function pdf2images(pdf: Path, outDir: string): Promise { imgs.map((img, index) => saveFiles(img, `${outPath}/${index}.jpg`)), ); + console.log(`pdf→images: end ${pdf}\n`); return outPath; } From b7cd9edf123190bad0bd8533529608e95143f110 Mon Sep 17 00:00:00 2001 From: mrsekut Date: Wed, 14 Aug 2024 14:27:40 +0900 Subject: [PATCH 10/11] add empty workspace dir --- .gitignore | 5 ++++- workspace/.gitkeep | 0 2 files changed, 4 insertions(+), 1 deletion(-) create mode 100644 workspace/.gitkeep diff --git a/.gitignore b/.gitignore index e1f9672..26604cb 100644 --- a/.gitignore +++ b/.gitignore @@ -10,4 +10,7 @@ node_modules *.pdf *.json -.DS_Store \ No newline at end of file +.DS_Store + +workspaces/* +!workspaces/.gitkeep \ No newline at end of file diff --git a/workspace/.gitkeep b/workspace/.gitkeep new file mode 100644 index 0000000..e69de29 From aaf0056c9a9a78473baf3800d11cf7a74e87c042 Mon Sep 17 00:00:00 2001 From: mrsekut Date: Wed, 14 Aug 2024 14:33:12 +0900 Subject: [PATCH 11/11] =?UTF-8?q?dos=E9=98=B2=E6=AD=A2=E3=81=AE=E3=81=9F?= =?UTF-8?q?=E3=82=81=E7=9B=B4=E5=88=97=E3=81=AB=E5=AE=9F=E8=A1=8C=E3=81=99?= =?UTF-8?q?=E3=82=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/index.ts | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/index.ts b/src/index.ts index 60ebbcc..79e7421 100644 --- a/src/index.ts +++ b/src/index.ts @@ -27,7 +27,9 @@ export async function main(config: Config) { } async function dirs2Cosense(config: Config, dirPaths: Path[]) { - Promise.all(dirPaths.map(dirPath => dir2Cosense(config, dirPath))); + for (const dirPath of dirPaths) { + await dir2Cosense(config, dirPath); + } } async function dir2Cosense(config: Config, dirPath: Path) {