Skip to content

Commit

Permalink
feat: LlamaCloudIndex from documents (run-llama#677)
Browse files Browse the repository at this point in the history
  • Loading branch information
EmanuelCampos authored Apr 2, 2024
1 parent d256cbe commit 3cbfa98
Show file tree
Hide file tree
Showing 8 changed files with 494 additions and 104 deletions.
5 changes: 5 additions & 0 deletions .changeset/warm-ligers-hammer.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
---
"llamaindex": patch
---

feat: llamacloud index from documents
44 changes: 44 additions & 0 deletions examples/cloud/from_documents.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
import fs from "node:fs/promises";

import { stdin as input, stdout as output } from "node:process";

import readline from "node:readline/promises";

import { Document, LlamaCloudIndex } from "llamaindex";

/**
 * Interactive demo: uploads a single essay to LlamaCloud as an index named
 * "test" in the "default" project, then answers questions typed on stdin,
 * streaming each answer to stdout.
 *
 * Credentials and endpoint are read from the LLAMA_CLOUD_API_KEY and
 * LLAMA_CLOUD_BASE_URL environment variables. The prompt loop never
 * terminates on its own; stop the process to exit.
 */
async function main() {
  const essayPath = "node_modules/llamaindex/examples/abramov.txt";
  const essayText = await fs.readFile(essayPath, "utf-8");

  // The file path doubles as the document id.
  const essayDocument = new Document({ text: essayText, id_: essayPath });

  const cloudIndex = await LlamaCloudIndex.fromDocuments({
    documents: [essayDocument],
    name: "test",
    projectName: "default",
    apiKey: process.env.LLAMA_CLOUD_API_KEY,
    baseUrl: process.env.LLAMA_CLOUD_BASE_URL,
  });

  const engine = cloudIndex.asQueryEngine({ denseSimilarityTopK: 5 });
  const prompt = readline.createInterface({ input, output });

  for (;;) {
    const question = await prompt.question("Query: ");
    const answerStream = await engine.query({ query: question, stream: true });

    // Blank line between the prompt and the streamed answer.
    console.log();
    for await (const part of answerStream) {
      process.stdout.write(part.response);
    }
  }
}

main().catch(console.error);
2 changes: 1 addition & 1 deletion packages/core/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
"@aws-crypto/sha256-js": "^5.2.0",
"@datastax/astra-db-ts": "^0.1.4",
"@grpc/grpc-js": "^1.10.2",
"@llamaindex/cloud": "0.0.4",
"@llamaindex/cloud": "0.0.5",
"@llamaindex/env": "workspace:*",
"@mistralai/mistralai": "^0.0.10",
"@notionhq/client": "^2.2.14",
Expand Down
5 changes: 3 additions & 2 deletions packages/core/src/TextSplitter.ts
Original file line number Diff line number Diff line change
Expand Up @@ -66,8 +66,9 @@ export const defaultParagraphSeparator = EOL + EOL + EOL;
* One of the advantages of SentenceSplitter is that even in the fixed length chunks it will try to keep sentences together.
*/
export class SentenceSplitter {
private chunkSize: number;
private chunkOverlap: number;
public chunkSize: number;
public chunkOverlap: number;

private tokenizer: any;
private tokenizerDecoder: any;
private paragraphSeparator: string;
Expand Down
154 changes: 154 additions & 0 deletions packages/core/src/cloud/LlamaCloudIndex.ts
Original file line number Diff line number Diff line change
@@ -1,11 +1,20 @@
import { PlatformApi } from "@llamaindex/cloud";
import type { Document } from "../Node.js";
import type { BaseRetriever } from "../Retriever.js";
import { RetrieverQueryEngine } from "../engines/query/RetrieverQueryEngine.js";
import type { TransformComponent } from "../ingestion/types.js";
import type { BaseNodePostprocessor } from "../postprocessors/types.js";
import type { BaseSynthesizer } from "../synthesizers/types.js";
import type { BaseQueryEngine } from "../types.js";
import type { CloudRetrieveParams } from "./LlamaCloudRetriever.js";
import { LlamaCloudRetriever } from "./LlamaCloudRetriever.js";
import { getPipelineCreate } from "./config.js";
import type { CloudConstructorParams } from "./types.js";
import { getAppBaseUrl, getClient } from "./utils.js";

import { getEnv } from "@llamaindex/env";
import { OpenAIEmbedding } from "../embeddings/OpenAIEmbedding.js";
import { SimpleNodeParser } from "../nodeParsers/SimpleNodeParser.js";

export class LlamaCloudIndex {
params: CloudConstructorParams;
Expand All @@ -14,6 +23,151 @@ export class LlamaCloudIndex {
this.params = params;
}

/**
 * Builds a managed LlamaCloud pipeline from in-memory documents, runs it to
 * completion, and returns an index bound to the same parameters.
 *
 * Steps, in order:
 *  1. Build the pipeline-create payload from the documents and transformations.
 *  2. Upsert the project (`params.projectName`, or "default") and the pipeline.
 *  3. Start one execution per pipeline data source and wait until ALL of them
 *     report success.
 *  4. Start the managed ingestion and wait until it reports success.
 *
 * Any status other than success or error is treated as "still running" and
 * polled again after one second.
 *
 * @param params - Cloud constructor parameters plus:
 *   - `documents`: documents sent as the pipeline's input nodes.
 *   - `transformations`: optional; defaults to an `OpenAIEmbedding` (API key
 *     read from the `OPENAI_API_KEY` env var) followed by a `SimpleNodeParser`.
 *   - `verbose`: when true, progress is written to the console / stdout.
 * @returns A `LlamaCloudIndex` constructed from a copy of `params`.
 * @throws Error when the project, pipeline, a data source execution or the
 *   ingestion comes back without an id, or when a data source execution or
 *   the ingestion reports an error status.
 */
static async fromDocuments(
  params: {
    documents: Document[];
    transformations?: TransformComponent[];
    verbose?: boolean;
  } & CloudConstructorParams,
): Promise<LlamaCloudIndex> {
  const POLL_INTERVAL_MS = 1000;

  // Pause between status polls; prints a progress dot in verbose mode.
  const waitForNextPoll = async (): Promise<void> => {
    await new Promise<void>((resolve) =>
      setTimeout(resolve, POLL_INTERVAL_MS),
    );
    if (params.verbose) {
      process.stdout.write(".");
    }
  };

  const defaultTransformations: TransformComponent[] = [
    new OpenAIEmbedding({
      apiKey: getEnv("OPENAI_API_KEY"),
    }),
    new SimpleNodeParser(),
  ];

  const appUrl = getAppBaseUrl(params.baseUrl);

  const client = await getClient({ ...params, baseUrl: appUrl });

  const pipelineCreateParams = await getPipelineCreate({
    pipelineName: params.name,
    pipelineType: "MANAGED",
    inputNodes: params.documents,
    transformations: params.transformations ?? defaultTransformations,
  });

  const project = await client.project.upsertProject({
    name: params.projectName ?? "default",
  });

  if (!project.id) {
    throw new Error("Project ID should be defined");
  }

  const pipeline = await client.project.upsertPipelineForProject(
    project.id,
    pipelineCreateParams,
  );

  if (!pipeline.id) {
    throw new Error("Pipeline ID must be defined");
  }
  const pipelineId = pipeline.id;

  if (params.verbose) {
    console.log(`Created pipeline ${pipelineId} with name ${params.name}`);
  }

  // Kick off one execution per data source attached to the pipeline.
  const executions: {
    executionId: string;
    dataSourceId: string;
  }[] = [];

  for (const dataSource of pipeline.dataSources) {
    const dataSourceExecution =
      await client.dataSource.createDataSourceExecution(dataSource.id);

    if (!dataSourceExecution.id) {
      throw new Error("Data Source Execution ID must be defined");
    }

    executions.push({
      executionId: dataSourceExecution.id,
      dataSourceId: dataSource.id,
    });
  }

  // Wait until EVERY data source execution has succeeded. Executions that
  // already succeeded are dropped from the pending list so they are not
  // polled again. An empty list (no data sources) falls straight through
  // instead of spinning forever.
  let pending = executions;

  while (pending.length > 0) {
    const stillRunning: typeof executions = [];

    for (const execution of pending) {
      const { status } = await client.dataSource.getDataSourceExecution(
        execution.dataSourceId,
        execution.executionId,
      );

      if (status === PlatformApi.StatusEnum.Error) {
        throw new Error("Data Source Execution failed");
      }
      if (status !== PlatformApi.StatusEnum.Success) {
        stillRunning.push(execution);
      }
    }

    pending = stillRunning;

    if (pending.length > 0) {
      await waitForNextPoll();
    }
  }

  if (params.verbose) {
    console.info("Data Source Execution completed");
  }

  const ingestion = await client.pipeline.runManagedPipelineIngestion(
    pipelineId,
  );

  const ingestionId = ingestion.id;

  if (!ingestionId) {
    throw new Error("Ingestion ID must be defined");
  }

  // Poll the managed ingestion until it reaches a terminal status.
  for (;;) {
    const { status } = await client.pipeline.getManagedIngestionExecution(
      pipelineId,
      ingestionId,
    );

    if (status === PlatformApi.StatusEnum.Success) {
      if (params.verbose) {
        console.info("Ingestion completed");
      }
      break;
    }
    if (status === PlatformApi.StatusEnum.Error) {
      throw new Error("Ingestion failed");
    }

    await waitForNextPoll();
  }

  if (params.verbose) {
    console.info(
      `Ingestion completed, find your index at ${appUrl}/project/${project.id}/deploy/${pipelineId}`,
    );
  }

  return new LlamaCloudIndex({ ...params });
}

/**
 * Creates a retriever for this index.
 *
 * @param params - Optional retrieval settings; any key given here overrides
 *   the same key from the index's constructor parameters.
 * @returns A `LlamaCloudRetriever` built from the merged parameters.
 */
asRetriever(params: CloudRetrieveParams = {}): BaseRetriever {
  const retrieverParams = { ...this.params, ...params };
  return new LlamaCloudRetriever(retrieverParams);
}
Expand Down
21 changes: 12 additions & 9 deletions packages/core/src/cloud/config.ts
Original file line number Diff line number Diff line change
Expand Up @@ -18,21 +18,22 @@ function getTransformationConfig(
return {
configurableTransformationType: "SENTENCE_AWARE_NODE_PARSER",
component: {
// TODO: API returns 422 if these parameters are included
// chunkSize: transformation.textSplitter.chunkSize, // TODO: set to public in SentenceSplitter
// chunkOverlap: transformation.textSplitter.chunkOverlap, // TODO: set to public in SentenceSplitter
// includeMetadata: transformation.includeMetadata,
// includePrevNextRel: transformation.includePrevNextRel,
// TODO: API doesn't accept camelCase
chunk_size: transformation.textSplitter.chunkSize, // TODO: set to public in SentenceSplitter
chunk_overlap: transformation.textSplitter.chunkOverlap, // TODO: set to public in SentenceSplitter
include_metadata: transformation.includeMetadata,
include_prev_next_rel: transformation.includePrevNextRel,
},
};
}
if (transformation instanceof OpenAIEmbedding) {
return {
configurableTransformationType: "OPENAI_EMBEDDING",
component: {
modelName: transformation.model,
apiKey: transformation.apiKey,
embedBatchSize: transformation.embedBatchSize,
// TODO: API doesn't accept camelCase
model: transformation.model,
api_key: transformation.apiKey,
embed_batch_size: transformation.embedBatchSize,
dimensions: transformation.dimensions,
},
};
Expand Down Expand Up @@ -71,10 +72,12 @@ export async function getPipelineCreate(
inputNodes = [],
} = params;

const dataSources = inputNodes.map(getDataSourceConfig);

return {
name: pipelineName,
configuredTransformations: transformations.map(getTransformationConfig),
dataSources: inputNodes.map(getDataSourceConfig),
dataSources,
dataSinks: [],
pipelineType,
};
Expand Down
2 changes: 1 addition & 1 deletion packages/edge/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
"@aws-crypto/sha256-js": "^5.2.0",
"@datastax/astra-db-ts": "^0.1.4",
"@grpc/grpc-js": "^1.10.2",
"@llamaindex/cloud": "0.0.4",
"@llamaindex/cloud": "0.0.5",
"@llamaindex/env": "workspace:*",
"@mistralai/mistralai": "^0.0.10",
"@notionhq/client": "^2.2.14",
Expand Down
Loading

0 comments on commit 3cbfa98

Please sign in to comment.