Skip to content

Commit

Permalink
fix: use sha256 hash value as the Document.id_ in MarkdownReader (run-llama#768)
Browse files Browse the repository at this point in the history

Co-authored-by: Alex Yang <himself65@outlook.com>
  • Loading branch information
ezirmusitua and himself65 authored May 7, 2024
1 parent e37fa5d commit 645fcf6
Show file tree
Hide file tree
Showing 2 changed files with 10 additions and 14 deletions.
12 changes: 6 additions & 6 deletions packages/core/src/readers/MarkdownReader.ts
Original file line number Diff line number Diff line change
Expand Up @@ -95,16 +95,16 @@ export class MarkdownReader implements FileReader {
const content = await fs.readFile(file);
const tups = this.parseTups(content);
const results: Document[] = [];
let counter = 0;
for (const [header, value] of tups) {
const id_ = `${file}_${counter}`;
if (header) {
results.push(
new Document({
text: `\n\n${header}\n${value}`,
}),
);
const text = `\n\n${header}\n${value}`;
results.push(new Document({ text, id_ }));
} else {
results.push(new Document({ text: value }));
results.push(new Document({ text: value, id_ }));
}
counter += 1;
}
return results;
}
Expand Down
12 changes: 4 additions & 8 deletions packages/core/src/readers/PDFReader.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import type { GenericFileSystem } from "@llamaindex/env";
import { createSHA256, defaultFS } from "@llamaindex/env";
import { defaultFS } from "@llamaindex/env";
import { Document } from "../Node.js";
import type { BaseReader } from "./type.js";

Expand All @@ -13,13 +13,9 @@ export class PDFReader implements BaseReader {
): Promise<Document[]> {
const content = await fs.readRawFile(file);
const text = await readPDF(content);
return text.map((text) => {
const sha256 = createSHA256();
sha256.update(text);
return new Document({
text,
id_: sha256.digest(),
});
return text.map((text, page) => {
const id_ = `${file}_${page}`;
return new Document({ text, id_ });
});
}
}
Expand Down

0 comments on commit 645fcf6

Please sign in to comment.