diff --git a/.gitignore b/.gitignore index fa92f44..26604cb 100644 --- a/.gitignore +++ b/.gitignore @@ -1,11 +1,16 @@ node_modules -out .env -*.pdf .direnv/ .envrc + *.jpg +*.png +*.pdf +*.json + +.DS_Store -.DS_Store \ No newline at end of file +workspaces/* +!workspaces/.gitkeep \ No newline at end of file diff --git a/src/index.ts b/src/index.ts index d7f1a84..79e7421 100644 --- a/src/index.ts +++ b/src/index.ts @@ -1,16 +1,13 @@ -import fs from 'node:fs'; -import path from 'node:path'; -import Bottleneck from 'bottleneck'; -import cliProgress from 'cli-progress'; import * as dotenv from 'dotenv'; +import cp from 'cli-progress'; import { sleep } from 'app/utils/utils.js'; -import { getFileInfo, mkdir } from 'app/utils/file.js'; - -import { readPDF, saveFiles } from 'app/pdf.js'; +import { Path, getImageDirs, getImages, getPDFs } from 'app/utils/file.js'; +import { pdfs2images } from 'app/pdf.js'; import * as Gyazo from './gyazo.js'; import { renderPage, saveJson } from 'app/renderScrapboxPage.js'; import { ProfilePath, createProfilePage } from 'app/profilePage.js'; +import Bottleneck from 'bottleneck'; dotenv.config(); @@ -22,41 +19,32 @@ type Config = { }; export async function main(config: Config) { - const files = fs.readdirSync(config.workspace); - const pdfs = files.filter(file => path.extname(file) === '.pdf'); + const pdfPaths = await getPDFs(config.workspace); + await pdfs2images(pdfPaths, config.workspace); - for (const pdf of pdfs) { - const filepath = path.join(config.workspace, pdf); - await processSinglePDF(config, filepath); - } + const dirs = await getImageDirs(config.workspace); + await dirs2Cosense(config, dirs); } -async function processSinglePDF(config: Config, filepath: string) { - const { filename } = getFileInfo(filepath, '.pdf'); - - console.log(`\nProcessing PDF: ${filename}\n`); - - await mkdir(filename); - - const imgs = await readPDF(filepath); - - const limiter = new Bottleneck({ - maxConcurrent: 30, - minTime: 1000, - }); +async function dirs2Cosense(config: Config, dirPaths: Path[]) { + for (const dirPath of dirPaths) { + await dir2Cosense(config, dirPath); + } +} - const progressBar = new cliProgress.SingleBar( - {}, - cliProgress.Presets.shades_classic, - ); +async function dir2Cosense(config: Config, dirPath: Path) { + const limiter = new Bottleneck({ maxConcurrent: 30, minTime: 1000 }); + const progressBar = new cp.SingleBar({}, cp.Presets.shades_classic); - progressBar.start(imgs.length, 0); + console.log(`imgs→cosense: start ${dirPath}\n`); + const images = await getImages(dirPath); + progressBar.start(images.length, 0); const pages = await Promise.all( - imgs.map((img, index) => + images.map((img, index) => limiter.schedule(() => { progressBar.increment(); - return generatePage(img, filename, index, imgs.length, config); + return generatePage(img, index, images.length, config); }), ), ); @@ -68,25 +56,19 @@ async function processSinglePDF(config: Config, filepath: string) { return [...pages, profilePage]; })(); - progressBar.stop(); + await saveJson(`${dirPath}-ocr.json`, { pages: pagesWithProfile }); - // TODO: out - await saveJson(`out/${filename}-ocr.json`, { pages: pagesWithProfile }); - - console.log(`Finished processing PDF: ${filename}\n`); + progressBar.stop(); + console.log(`imgs→cosense: end ${dirPath}\n`); } const generatePage = async ( - img: Buffer, - filename: string, + path: Path, index: number, pageLength: number, config: Config, ) => { - const path = `out/${filename}/${index}.jpg`; - - const imagePath = await saveFiles(img, path); - const gyazoImageId = await Gyazo.upload(imagePath); + const gyazoImageId = await Gyazo.upload(path); await sleep(config.waitTimeForOcr); diff --git a/src/pdf.ts b/src/pdf.ts index 8e22a73..4795568 100644 --- a/src/pdf.ts +++ b/src/pdf.ts @@ -1,8 +1,40 @@ import fs from 'node:fs/promises'; +import np from 'node:path'; import { pdf } from 'pdf-to-img'; import { range } from './utils/utils.js'; +import { Path, fileInfo, mkdir } from 'app/utils/file.js'; -export async function readPDF(path: string): Promise { +export async function pdfs2images( + pdfs: Path[], + outDir: string, +): Promise { + return Promise.all(pdfs.map(pdf => pdf2images(pdf, outDir))); +} + +/** + * - take a path of pdf + * - convert pdf to images + * - create outDir + * - save to outDir + */ +async function pdf2images(pdf: Path, outDir: string): Promise { + console.log(`pdf→images: start ${pdf}\n`); + + const { filename } = fileInfo(pdf); + const outPath = np.join(outDir, filename); + const imgs = await pdfToImages(pdf); + + await mkdir(outPath); + + Promise.all( + imgs.map((img, index) => saveFiles(img, `${outPath}/${index}.jpg`)), + ); + + console.log(`pdf→images: end ${pdf}\n`); + return outPath; +} + +async function pdfToImages(path: Path): Promise { const buffer = await fs.readFile(path); const src = new Uint8Array(buffer); const doc = await pdf(src, { scale: 3 }); @@ -13,7 +45,7 @@ export async function readPDF(path: string): Promise { return pages; } -export async function saveFiles(img: Buffer, path: string) { +async function saveFiles(img: Buffer, path: string) { await fs.writeFile(path, img); return path; } diff --git a/src/utils/file.ts b/src/utils/file.ts index 9b9ef13..ac06747 100644 --- a/src/utils/file.ts +++ b/src/utils/file.ts @@ -1,23 +1,62 @@ import fs from 'node:fs/promises'; -import path from 'node:path'; +import np from 'node:path'; -type Extension = '.json' | '.pdf'; +export type Path = string; -export function getFileInfo(filepath: string, extension: Extension) { - if (filepath === '') { - throw new Error('invalid argument'); - } +export const getPDFs = async (path: Path): Promise => { + const files = await fs.readdir(path); + return files + .filter(file => np.extname(file) === '.pdf') + .map(file => np.join(path, file)); +}; + +export const getImageDirs = async (dirPath: string): Promise => { + const files = await fs.readdir(dirPath); + + const dirs = await Promise.all( + files.map(async file => { + const fullPath = np.join(dirPath, file); + const stats = await fs.lstat(fullPath); + return stats.isDirectory() ? fullPath : null; + }), + ); + + return dirs.filter(d => d != null); +}; + +export const getImages = async (path: Path): Promise => { + const files = await fs.readdir(path); + return files + .filter(file => np.extname(file) === '.jpg' || np.extname(file) === '.png') + .map(file => np.join(path, file)); +}; +/** + * e.g. fileInfo('out/2021-01-01/2021-01-01.pdf') + * - path: 'out/2021-01-01/2021-01-01.pdf', + * - dir: 'out/2021-01-01', + * - name: '2021-01-01.pdf', + * - filename: '2021-01-01', + * - ext: '.pdf', + */ +export function fileInfo(path: Path) { return { - filepath, - filename: path.basename(filepath, extension), + path: path, + dir: np.dirname(path), + name: np.basename(path), + filename: np.basename(path, np.extname(path)), + ext: np.extname(path), }; } -export async function mkdir(filename: string) { +/** + * e.g. mkdir('out') + * e.g. mkdir('out/2021-01-01') + */ +export async function mkdir(name: string) { try { - await fs.stat(`out/${filename}`); + await fs.stat(name); } catch { - await fs.mkdir(`out/${filename}`, { recursive: true }); + await fs.mkdir(name, { recursive: true }); } } diff --git a/workspace/.gitkeep b/workspace/.gitkeep new file mode 100644 index 0000000..e69de29