Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

pdf→imageとimage→jsonを分離 #5

Merged
merged 11 commits into from
Aug 14, 2024
11 changes: 8 additions & 3 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,11 +1,16 @@
node_modules
out

.env
*.pdf

.direnv/
.envrc

*.jpg
*.png
*.pdf
*.json

.DS_Store

.DS_Store
workspaces/*
!workspaces/.gitkeep
70 changes: 26 additions & 44 deletions src/index.ts
Original file line number Diff line number Diff line change
@@ -1,16 +1,13 @@
import fs from 'node:fs';
import path from 'node:path';
import Bottleneck from 'bottleneck';
import cliProgress from 'cli-progress';
import * as dotenv from 'dotenv';
import cp from 'cli-progress';

import { sleep } from 'app/utils/utils.js';
import { getFileInfo, mkdir } from 'app/utils/file.js';

import { readPDF, saveFiles } from 'app/pdf.js';
import { Path, getImageDirs, getImages, getPDFs } from 'app/utils/file.js';
import { pdfs2images } from 'app/pdf.js';
import * as Gyazo from './gyazo.js';
import { renderPage, saveJson } from 'app/renderScrapboxPage.js';
import { ProfilePath, createProfilePage } from 'app/profilePage.js';
import Bottleneck from 'bottleneck';

dotenv.config();

Expand All @@ -22,41 +19,32 @@ type Config = {
};

export async function main(config: Config) {
const files = fs.readdirSync(config.workspace);
const pdfs = files.filter(file => path.extname(file) === '.pdf');
const pdfPaths = await getPDFs(config.workspace);
await pdfs2images(pdfPaths, config.workspace);

for (const pdf of pdfs) {
const filepath = path.join(config.workspace, pdf);
await processSinglePDF(config, filepath);
}
const dirs = await getImageDirs(config.workspace);
await dirs2Cosense(config, dirs);
}

async function processSinglePDF(config: Config, filepath: string) {
const { filename } = getFileInfo(filepath, '.pdf');

console.log(`\nProcessing PDF: ${filename}\n`);

await mkdir(filename);

const imgs = await readPDF(filepath);

const limiter = new Bottleneck({
maxConcurrent: 30,
minTime: 1000,
});
/**
 * Runs `dir2Cosense` for every directory in `dirPaths`.
 *
 * Directories are handled one at a time: each call is awaited before the
 * next one starts.
 *
 * @param config - Run configuration, forwarded unchanged to `dir2Cosense`.
 * @param dirPaths - Directories to process, in the order given.
 */
async function dirs2Cosense(config: Config, dirPaths: Path[]): Promise<void> {
  for (const imageDir of dirPaths) {
    await dir2Cosense(config, imageDir);
  }
}

const progressBar = new cliProgress.SingleBar(
{},
cliProgress.Presets.shades_classic,
);
async function dir2Cosense(config: Config, dirPath: Path) {
const limiter = new Bottleneck({ maxConcurrent: 30, minTime: 1000 });
const progressBar = new cp.SingleBar({}, cp.Presets.shades_classic);

progressBar.start(imgs.length, 0);
console.log(`imgs→cosense: start ${dirPath}\n`);
const images = await getImages(dirPath);
progressBar.start(images.length, 0);

const pages = await Promise.all(
imgs.map((img, index) =>
images.map((img, index) =>
limiter.schedule(() => {
progressBar.increment();
return generatePage(img, filename, index, imgs.length, config);
return generatePage(img, index, images.length, config);
}),
),
);
Expand All @@ -68,25 +56,19 @@ async function processSinglePDF(config: Config, filepath: string) {
return [...pages, profilePage];
})();

progressBar.stop();
await saveJson(`${dirPath}-ocr.json`, { pages: pagesWithProfile });

// TODO: out
await saveJson(`out/${filename}-ocr.json`, { pages: pagesWithProfile });

console.log(`Finished processing PDF: ${filename}\n`);
progressBar.stop();
console.log(`imgs→cosense: end ${dirPath}\n`);
}

const generatePage = async (
img: Buffer,
filename: string,
path: Path,
index: number,
pageLength: number,
config: Config,
) => {
const path = `out/${filename}/${index}.jpg`;

const imagePath = await saveFiles(img, path);
const gyazoImageId = await Gyazo.upload(imagePath);
const gyazoImageId = await Gyazo.upload(path);

await sleep(config.waitTimeForOcr);

Expand Down
36 changes: 34 additions & 2 deletions src/pdf.ts
Original file line number Diff line number Diff line change
@@ -1,8 +1,40 @@
import fs from 'node:fs/promises';
import np from 'node:path';
import { pdf } from 'pdf-to-img';
import { range } from './utils/utils.js';
import { Path, fileInfo, mkdir } from 'app/utils/file.js';

export async function readPDF(path: string): Promise<Buffer[]> {
/**
 * Converts several PDFs to images by calling `pdf2images` for each of them.
 *
 * All conversions are started at once and awaited together.
 *
 * @param pdfs - Paths of the PDF files to convert.
 * @param outDir - Output root, forwarded to `pdf2images` for every PDF.
 * @returns The value `pdf2images` resolves to for each PDF, in input order.
 */
export async function pdfs2images(
  pdfs: Path[],
  outDir: string,
): Promise<Path[]> {
  // Named `pdfPath` so the callback parameter does not shadow the imported `pdf`.
  const conversions = pdfs.map(pdfPath => pdf2images(pdfPath, outDir));
  return Promise.all(conversions);
}

/**
 * Converts one PDF to images and saves them in a directory named after the PDF.
 *
 * Steps:
 * - convert the PDF to image buffers with `pdfToImages`
 * - create `<outDir>/<PDF file name without extension>`
 * - write every image there as `<index>.jpg` (0-based) and wait for all writes
 *
 * @param pdf - Path of the PDF file to convert.
 * @param outDir - Directory in which the per-PDF image directory is created.
 * @returns Path of the directory the images were written to.
 */
async function pdf2images(pdf: Path, outDir: string): Promise<Path> {
  console.log(`pdf→images: start ${pdf}\n`);

  const { filename } = fileInfo(pdf);
  const outPath = np.join(outDir, filename);
  const imgs = await pdfToImages(pdf);

  await mkdir(outPath);

  // The writes must be awaited: otherwise this function reports "end" and
  // returns while files are still being written, callers that list the
  // directory right away can see an incomplete set of pages, and a failed
  // write surfaces as an unhandled rejection instead of rejecting this call.
  await Promise.all(
    imgs.map((img, index) => saveFiles(img, np.join(outPath, `${index}.jpg`))),
  );

  console.log(`pdf→images: end ${pdf}\n`);
  return outPath;
}

async function pdfToImages(path: Path): Promise<Buffer[]> {
const buffer = await fs.readFile(path);
const src = new Uint8Array(buffer);
const doc = await pdf(src, { scale: 3 });
Expand All @@ -13,7 +45,7 @@ export async function readPDF(path: string): Promise<Buffer[]> {
return pages;
}

export async function saveFiles(img: Buffer, path: string) {
/**
 * Writes an image buffer to a file.
 *
 * @param img - Image data to write.
 * @param path - Destination file path.
 * @returns The same `path`, once the write has completed.
 */
async function saveFiles(img: Buffer, path: string): Promise<string> {
  const destination = path;
  await fs.writeFile(destination, img);
  return destination;
}
61 changes: 50 additions & 11 deletions src/utils/file.ts
Original file line number Diff line number Diff line change
@@ -1,23 +1,62 @@
import fs from 'node:fs/promises';
import path from 'node:path';
import np from 'node:path';

type Extension = '.json' | '.pdf';
export type Path = string;

export function getFileInfo(filepath: string, extension: Extension) {
if (filepath === '') {
throw new Error('invalid argument');
}
/**
 * Lists the entries directly inside a directory whose extension is `.pdf`.
 *
 * The scan is not recursive and the extension match is case-sensitive.
 *
 * @param path - Directory to scan.
 * @returns `path` joined with the name of each matching entry.
 */
export const getPDFs = async (path: Path): Promise<Path[]> => {
  const pdfPaths: Path[] = [];
  for (const entry of await fs.readdir(path)) {
    if (np.extname(entry) === '.pdf') {
      pdfPaths.push(np.join(path, entry));
    }
  }
  return pdfPaths;
};

/**
 * Lists the sub-directories directly inside `dirPath`.
 *
 * Only real directories are returned; regular files and symbolic links
 * (including links that point at directories) are left out. The scan is not
 * recursive.
 *
 * @param dirPath - Directory to scan.
 * @returns `dirPath` joined with the name of each sub-directory.
 */
export const getImageDirs = async (dirPath: string): Promise<string[]> => {
  // `withFileTypes` reports each entry's type with the listing itself, so no
  // per-entry `lstat` call and no nullable intermediate array are needed.
  const entries = await fs.readdir(dirPath, { withFileTypes: true });

  return entries
    .filter(entry => entry.isDirectory())
    .map(entry => np.join(dirPath, entry.name));
};

/**
 * Lists the `.jpg` and `.png` entries directly inside a directory, in
 * page order.
 *
 * The result is sorted with numeric-aware comparison so that `2.jpg` comes
 * before `10.jpg`. The order `readdir` returns is not specified, and callers
 * that treat the array position as the page index would otherwise number
 * pages incorrectly.
 *
 * @param path - Directory to scan (not recursive).
 * @returns `path` joined with the name of each image, sorted by file name.
 */
export const getImages = async (path: Path): Promise<Path[]> => {
  const files = await fs.readdir(path);
  // Fixed locale keeps the ordering independent of the host's locale settings.
  const byPageNumber = new Intl.Collator('en', { numeric: true });

  return files
    .filter(file => {
      const ext = np.extname(file);
      return ext === '.jpg' || ext === '.png';
    })
    .sort(byPageNumber.compare)
    .map(file => np.join(path, file));
};

/**
* e.g. fileInfo('out/2021-01-01/2021-01-01.pdf')
* - path: 'out/2021-01-01/2021-01-01.pdf',
* - dir: 'out/2021-01-01',
* - name: '2021-01-01.pdf',
* - filename: '2021-01-01',
* - ext: '.pdf',
*/
export function fileInfo(path: Path) {
return {
filepath,
filename: path.basename(filepath, extension),
path: path,
dir: np.dirname(path),
name: np.basename(path),
filename: np.basename(path, np.extname(path)),
ext: np.extname(path),
};
}

export async function mkdir(filename: string) {
/**
* e.g. mkdir('out')
* e.g. mkdir('out/2021-01-01')
*/
export async function mkdir(name: string) {
try {
await fs.stat(`out/${filename}`);
await fs.stat(name);
} catch {
await fs.mkdir(`out/${filename}`, { recursive: true });
await fs.mkdir(name, { recursive: true });
}
}
Empty file added workspace/.gitkeep
Empty file.
Loading