forked from run-llama/LlamaIndexTS
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add ingestion pipeline with doc store strategies (run-llama#418)
- Loading branch information
1 parent
ba42aa5
commit e2790da
Showing
18 changed files
with
363 additions
and
3 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
--- | ||
"llamaindex": patch | ||
--- | ||
|
||
Preview: Add ingestion pipeline (incl. different strategies to handle doc store duplicates) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,36 @@ | ||
import fs from "node:fs/promises"; | ||
|
||
import { | ||
Document, | ||
IngestionPipeline, | ||
MetadataMode, | ||
OpenAIEmbedding, | ||
SimpleNodeParser, | ||
} from "llamaindex"; | ||
|
||
/**
 * Example: read the bundled essay, wrap it in a Document, push it through an
 * IngestionPipeline (node parsing followed by embedding) and print the text
 * of every resulting node.
 */
async function main() {
  // Location of abramov.txt when running under Node
  const essayPath = "node_modules/llamaindex/examples/abramov.txt";
  const essayText = await fs.readFile(essayPath, "utf-8");

  // The file path doubles as the document id
  const essayDocument = new Document({ text: essayText, id_: essayPath });

  // Steps are applied in array order: split into chunks, then embed
  const transformations = [
    new SimpleNodeParser({ chunkSize: 1024, chunkOverlap: 20 }),
    // new TitleExtractor(llm),
    new OpenAIEmbedding(),
  ];
  const pipeline = new IngestionPipeline({ transformations });

  // Run the pipeline on the single document
  const producedNodes = await pipeline.run({ documents: [essayDocument] });

  // Print the content of each node without metadata
  producedNodes.forEach((node) => {
    console.log(node.getContent(MetadataMode.NONE));
  });
}
|
||
main().catch(console.error); |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,94 @@ | ||
import { BaseNode, Document } from "../Node"; | ||
import { BaseReader } from "../readers/base"; | ||
import { BaseDocumentStore, VectorStore } from "../storage"; | ||
import { DocStoreStrategy, createDocStoreStrategy } from "./strategies"; | ||
import { TransformComponent } from "./types"; | ||
|
||
/**
 * Per-run arguments accepted by `IngestionPipeline.run` and forwarded to
 * `runTransformations`.
 */
interface IngestionRunArgs {
  /** Documents to add to the pipeline input for this run. */
  documents?: Document[];
  /** Already-built nodes to add to the pipeline input for this run. */
  nodes?: BaseNode[];
  /**
   * Read by `runTransformations`: when `false`, the node array is copied
   * before the first transformation is applied. Treated as `true` when omitted.
   */
  inPlace?: boolean;
}
|
||
/**
 * Applies the given transformations one after another, feeding the nodes
 * returned by each step into the next one.
 *
 * @param nodesToRun - Nodes handed to the first transformation.
 * @param transformations - Steps to apply, in array order.
 * @param transformOptions - Passed unchanged as the second argument of every
 *   `transform` call.
 * @param options - Only `inPlace` is read. When it is `false` the input array
 *   is shallow-copied first (the node objects themselves are not cloned).
 *   The whole argument may be omitted.
 * @returns The nodes returned by the last transformation; with no
 *   transformations this is the input array (or its shallow copy).
 */
export async function runTransformations(
  nodesToRun: BaseNode[],
  transformations: TransformComponent[],
  transformOptions: any = {},
  // Defaulted so that callers relying on the `transformOptions` default can
  // also leave this argument out; destructuring `undefined` would throw.
  { inPlace = true }: IngestionRunArgs = {},
): Promise<BaseNode[]> {
  let nodes = inPlace ? nodesToRun : [...nodesToRun];
  for (const transform of transformations) {
    nodes = await transform.transform(nodes, transformOptions);
  }
  return nodes;
}
|
||
// TODO: add caching, add concurrency | ||
/**
 * Collects documents/nodes from several sources, optionally filters them
 * through a doc store strategy, runs the configured transformations and, when
 * a vector store is set, adds the nodes that carry an embedding to it.
 */
export class IngestionPipeline {
  transformations: TransformComponent[] = [];
  documents?: Document[];
  reader?: BaseReader;
  vectorStore?: VectorStore;
  docStore?: BaseDocumentStore;
  docStoreStrategy: DocStoreStrategy = DocStoreStrategy.UPSERTS;
  disableCache: boolean = true;

  // Strategy instance built once in the constructor; undefined when
  // createDocStoreStrategy returns nothing for the given configuration.
  private _docStoreStrategy?: TransformComponent;

  /**
   * @param init - Any subset of the public fields; copied onto the instance
   *   before the doc store strategy is created.
   */
  constructor(init?: Partial<IngestionPipeline>) {
    Object.assign(this, init);
    this._docStoreStrategy = createDocStoreStrategy(
      this.docStoreStrategy,
      this.docStore,
      this.vectorStore,
    );
  }

  /**
   * Builds the input list for a run, in this order: the `documents` argument,
   * the `nodes` argument, the pipeline's own `documents`, and whatever the
   * configured reader loads.
   */
  async prepareInput(
    documents?: Document[],
    nodes?: BaseNode[],
  ): Promise<BaseNode[]> {
    const collected: BaseNode[] = [];
    const sources = [documents, nodes, this.documents];
    for (const source of sources) {
      if (source) {
        collected.push(...source);
      }
    }
    if (this.reader) {
      const loaded = await this.reader.loadData();
      collected.push(...loaded);
    }
    return collected;
  }

  /**
   * Runs the pipeline once.
   *
   * @param args - Extra documents/nodes for this run and the `inPlace` flag
   *   forwarded to `runTransformations`.
   * @param transformOptions - Forwarded to every transformation.
   * @returns The nodes produced by the transformations.
   */
  async run(
    args: IngestionRunArgs = {},
    transformOptions?: any,
  ): Promise<BaseNode[]> {
    const gathered = await this.prepareInput(args.documents, args.nodes);

    // Let the doc store strategy (if any) decide which nodes still need work
    const strategy = this._docStoreStrategy;
    const pending = strategy ? await strategy.transform(gathered) : gathered;

    const transformed = await runTransformations(
      pending,
      this.transformations,
      transformOptions,
      args,
    );

    if (!this.vectorStore) {
      return transformed;
    }
    // Only nodes that carry an embedding are added to the vector store
    const embedded = transformed.filter((node) => node.embedding);
    await this.vectorStore.add(embedded);
    return transformed;
  }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
export * from "./IngestionPipeline"; | ||
export * from "./types"; |
32 changes: 32 additions & 0 deletions
32
packages/core/src/ingestion/strategies/DuplicatesStrategy.ts
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,32 @@ | ||
import { BaseNode } from "../../Node"; | ||
import { BaseDocumentStore } from "../../storage"; | ||
import { TransformComponent } from "../types"; | ||
|
||
/**
 * Handle doc store duplicates by checking all hashes.
 *
 * A node is kept only if its hash is neither a key of the hash map returned
 * by the doc store nor already seen earlier in the same batch.
 */
export class DuplicatesStrategy implements TransformComponent {
  private docStore: BaseDocumentStore;

  constructor(docStore: BaseDocumentStore) {
    this.docStore = docStore;
  }

  /**
   * Filters out duplicate nodes and records the remaining ones in the doc store.
   *
   * @param nodes - Candidate nodes.
   * @returns The nodes whose hash was not known before, in input order.
   *   Resolves only after the doc store writes have completed; a failing
   *   write rejects the returned promise.
   */
  async transform(nodes: BaseNode[]): Promise<BaseNode[]> {
    const storedHashes = await this.docStore.getAllDocumentHashes();
    // Own keys only: an `in` check would also match inherited properties
    // such as "constructor" or "toString".
    const seenHashes = new Set<string>(Object.keys(storedHashes));
    const nodesToRun: BaseNode[] = [];

    for (const node of nodes) {
      if (seenHashes.has(node.hash)) {
        continue;
      }
      seenHashes.add(node.hash);
      // Awaited so a rejected write surfaces here instead of becoming an
      // unhandled rejection (awaiting a non-promise result is harmless).
      await this.docStore.setDocumentHash(node.id_, node.hash);
      nodesToRun.push(node);
    }

    // Awaited for the same reason, and so callers see the documents stored
    // before the pipeline moves on.
    await this.docStore.addDocuments(nodesToRun, true);

    return nodesToRun;
  }
}
44 changes: 44 additions & 0 deletions
44
packages/core/src/ingestion/strategies/UpsertsAndDeleteStrategy.ts
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,44 @@ | ||
import { BaseNode } from "../../Node"; | ||
import { BaseDocumentStore, VectorStore } from "../../storage"; | ||
import { classify } from "./classify"; | ||
|
||
/**
 * Doc store strategy that upserts by comparing hashes and ids, and also
 * removes documents that are stored but absent from the incoming nodes —
 * from the doc store and, when present, from the vector store.
 */
export class UpsertsAndDeleteStrategy {
  constructor(
    protected docStore: BaseDocumentStore,
    protected vectorStore?: VectorStore,
  ) {}

  /**
   * @param nodes - Incoming nodes to reconcile with the doc store.
   * @returns The nodes `classify` reported as new or changed, after they
   *   have been added to the doc store.
   */
  async transform(nodes: BaseNode[]): Promise<BaseNode[]> {
    const classification = await classify(this.docStore, nodes);

    // Changed documents: drop the stale version before re-adding
    for (const staleRefDocId of classification.unusedDocs) {
      await this.docStore.deleteRefDoc(staleRefDocId, false);
      if (this.vectorStore) {
        await this.vectorStore.delete(staleRefDocId);
      }
    }

    // Documents that are stored but no longer part of the input
    for (const goneDocId of classification.missingDocs) {
      await this.docStore.deleteDocument(goneDocId, true);
      if (this.vectorStore) {
        await this.vectorStore.delete(goneDocId);
      }
    }

    const freshNodes = classification.dedupedNodes;
    await this.docStore.addDocuments(freshNodes, true);
    return freshNodes;
  }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
import { BaseNode } from "../../Node"; | ||
import { BaseDocumentStore, VectorStore } from "../../storage"; | ||
import { TransformComponent } from "../types"; | ||
import { classify } from "./classify"; | ||
|
||
/**
 * Handles doc store upserts by checking hashes and ids.
 */
export class UpsertsStrategy implements TransformComponent {
  protected docStore: BaseDocumentStore;
  protected vectorStore?: VectorStore;

  constructor(docStore: BaseDocumentStore, vectorStore?: VectorStore) {
    this.docStore = docStore;
    this.vectorStore = vectorStore;
  }

  /**
   * Reconciles the incoming nodes with the doc store.
   *
   * Documents that `classify` reports as changed are removed from the doc
   * store (and from the vector store when one is configured); the new and
   * changed nodes are then added to the doc store.
   *
   * @param nodes - Incoming nodes.
   * @returns The new or changed nodes. Resolves only after they have been
   *   written to the doc store; a failing write rejects the returned promise.
   */
  async transform(nodes: BaseNode[]): Promise<BaseNode[]> {
    const { dedupedNodes, unusedDocs } = await classify(this.docStore, nodes);
    // remove unused docs
    for (const refDocId of unusedDocs) {
      await this.docStore.deleteRefDoc(refDocId, false);
      if (this.vectorStore) {
        await this.vectorStore.delete(refDocId);
      }
    }
    // add non-duplicate docs; awaited so the write finishes (and any error
    // surfaces) before the pipeline continues
    await this.docStore.addDocuments(dedupedNodes, true);
    return dedupedNodes;
  }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
import { BaseNode } from "../../Node"; | ||
import { BaseDocumentStore } from "../../storage"; | ||
|
||
/**
 * Sorts incoming nodes against the hashes kept in the doc store.
 *
 * Each node is looked up by its reference document id (the source node's id
 * when there is one, otherwise the node's own id).
 *
 * @returns
 *  - `dedupedNodes`: nodes with no stored hash, or whose hash differs from the stored one
 *  - `unusedDocs`: reference document ids whose stored hash differs from the node's hash
 *  - `missingDocs`: values of the doc store's hash map that no incoming node referred to
 */
export async function classify(docStore: BaseDocumentStore, nodes: BaseNode[]) {
  const storedHashes = await docStore.getAllDocumentHashes();
  const existingDocIds = Object.values(storedHashes);

  const referencedDocIds = new Set<string>();
  const dedupedNodes: BaseNode[] = [];
  const unusedDocs: string[] = [];

  for (const node of nodes) {
    const refDocId = node.sourceNode?.nodeId || node.id_;
    referencedDocIds.add(refDocId);

    const storedHash = await docStore.getDocumentHash(refDocId);

    // Stored with an identical hash: nothing to do for this node
    if (storedHash && storedHash === node.hash) {
      continue;
    }
    // Stored with a different hash: the stored version is stale
    if (storedHash) {
      unusedDocs.push(refDocId);
    }
    // New or changed: keep the node
    dedupedNodes.push(node);
  }

  const missingDocs = existingDocIds.filter(
    (id) => !referencedDocIds.has(id),
  );
  return { dedupedNodes, missingDocs, unusedDocs };
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,40 @@ | ||
import { BaseDocumentStore, VectorStore } from "../../storage";
import { TransformComponent } from "../types";
import { DuplicatesStrategy } from "./DuplicatesStrategy";
import { UpsertsAndDeleteStrategy } from "./UpsertsAndDeleteStrategy";
import { UpsertsStrategy } from "./UpsertsStrategy";
|
||
/**
 * Selects how the ingestion pipeline reconciles incoming nodes with the doc
 * store. `createDocStoreStrategy` decides which strategy implementation each
 * value maps to, depending on which stores are configured.
 */
export enum DocStoreStrategy {
  UPSERTS = "upserts",
  DUPLICATES_ONLY = "duplicates_only",
  UPSERTS_AND_DELETE = "upserts_and_delete",
}
|
||
/**
 * Builds the doc store strategy for the given configuration.
 *
 * - No doc store: no strategy (`undefined`).
 * - Doc store only: always `DuplicatesStrategy`; a warning is logged when one
 *   of the upsert strategies was requested.
 * - Doc store and vector store: the strategy matching `docStoreStrategy`.
 *
 * @param docStoreStrategy - Requested strategy.
 * @param docStore - Doc store used to detect known documents.
 * @param vectorStore - Vector store the upsert strategies delete from.
 * @returns The strategy component, or `undefined` without a doc store.
 * @throws Error when both stores are given and `docStoreStrategy` is not a
 *   known value.
 */
export function createDocStoreStrategy(
  docStoreStrategy: DocStoreStrategy,
  docStore?: BaseDocumentStore,
  vectorStore?: VectorStore,
): TransformComponent | undefined {
  if (!docStore) {
    // Nothing to deduplicate against
    return undefined;
  }

  if (!vectorStore) {
    if (docStoreStrategy === DocStoreStrategy.UPSERTS) {
      console.warn(
        "Docstore strategy set to upserts, but no vector store. Switching to duplicates_only strategy.",
      );
    } else if (docStoreStrategy === DocStoreStrategy.UPSERTS_AND_DELETE) {
      console.warn(
        "Docstore strategy set to upserts and delete, but no vector store. Switching to duplicates_only strategy.",
      );
    }
    return new DuplicatesStrategy(docStore);
  }

  switch (docStoreStrategy) {
    case DocStoreStrategy.UPSERTS:
      return new UpsertsStrategy(docStore, vectorStore);
    case DocStoreStrategy.UPSERTS_AND_DELETE:
      // Previously this value was also mapped to UpsertsStrategy, so stored
      // documents missing from the input were never deleted.
      return new UpsertsAndDeleteStrategy(docStore, vectorStore);
    case DocStoreStrategy.DUPLICATES_ONLY:
      return new DuplicatesStrategy(docStore);
    default:
      throw new Error(`Invalid docstore strategy: ${docStoreStrategy}`);
  }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
import { BaseNode } from "../Node"; | ||
|
||
/**
 * A single pipeline step. `runTransformations` calls the steps in order and
 * passes the nodes resolved by one step to the next.
 */
export interface TransformComponent {
  /**
   * @param nodes - Nodes to process.
   * @param options - Opaque options; `runTransformations` passes the same
   *   value to every component.
   * @returns The nodes to hand to the next step.
   */
  transform(nodes: BaseNode[], options?: any): Promise<BaseNode[]>;
}
Oops, something went wrong.