From ce94780b956c35cc5ad51ef137c62243249f56fd Mon Sep 17 00:00:00 2001 From: Marcus Schiesser Date: Tue, 7 May 2024 11:45:55 +0800 Subject: [PATCH] feat: add page number to read PDFs (#815) --- .changeset/ninety-doors-impress.md | 5 +++++ packages/core/src/readers/PDFReader.ts | 11 +++++++---- 2 files changed, 12 insertions(+), 4 deletions(-) create mode 100644 .changeset/ninety-doors-impress.md diff --git a/.changeset/ninety-doors-impress.md b/.changeset/ninety-doors-impress.md new file mode 100644 index 0000000000..856d529f6e --- /dev/null +++ b/.changeset/ninety-doors-impress.md @@ -0,0 +1,5 @@ +--- +"llamaindex": patch +--- + +Add page number to read PDFs and use generated IDs for PDF and markdown content diff --git a/packages/core/src/readers/PDFReader.ts b/packages/core/src/readers/PDFReader.ts index ee1b1b2f70..659ed51346 100644 --- a/packages/core/src/readers/PDFReader.ts +++ b/packages/core/src/readers/PDFReader.ts @@ -12,10 +12,13 @@ export class PDFReader implements BaseReader { fs: GenericFileSystem = defaultFS, ): Promise { const content = await fs.readRawFile(file); - const text = await readPDF(content); - return text.map((text, page) => { - const id_ = `${file}_${page}`; - return new Document({ text, id_ }); + const pages = await readPDF(content); + return pages.map((text, page) => { + const id_ = `${file}_${page + 1}`; + const metadata = { + page_number: page + 1, + }; + return new Document({ text, id_, metadata }); }); } }