Skip to content

Commit

Permalink
fix: use Blob instead of File (run-llama#1231)
Browse files Browse the repository at this point in the history
  • Loading branch information
himself65 authored Sep 19, 2024
1 parent d24d3d1 commit fb36eff
Show file tree
Hide file tree
Showing 10 changed files with 593 additions and 87 deletions.
7 changes: 7 additions & 0 deletions .changeset/neat-maps-visit.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
---
"@llamaindex/cloud": patch
---

fix: backport for node.js 18

Node.js 18 may be missing an API that we rely on, so we need to backport a fix to make it work there.
1 change: 0 additions & 1 deletion .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@ concurrency:
cancel-in-progress: true

env:
POSTGRES_USER: runneradmin
POSTGRES_HOST_AUTH_METHOD: trust

jobs:
Expand Down
22 changes: 8 additions & 14 deletions packages/cloud/src/reader.ts
Original file line number Diff line number Diff line change
Expand Up @@ -229,20 +229,18 @@ export class LlamaParseReader extends FileReader {
}

// Create a job for the LlamaParse API
private async createJob(
data: Uint8Array,
fileName: string = "unknown",
): Promise<string> {
private async createJob(data: Uint8Array): Promise<string> {
// Load data, set the mime type
const { mime, extension } = await LlamaParseReader.getMimeType(data);
const { mime } = await LlamaParseReader.getMimeType(data);

if (this.verbose) {
const name = fileName ? fileName : extension;
console.log(`Starting load for ${name} file`);
console.log("Started uploading the file");
}

const body = {
file: new File([data], fileName, { type: mime }),
file: new Blob([data], {
type: mime,
}),
language: this.language,
parsing_instruction: this.parsingInstruction,
skip_diagonal_text: this.skipDiagonalText,
Expand Down Expand Up @@ -373,14 +371,10 @@ export class LlamaParseReader extends FileReader {
* To be used with resultType = "text" and "markdown"
*
* @param {Uint8Array} fileContent - The content of the file to be loaded.
* @param {string} [fileName] - The optional name of the file to be loaded.
* @return {Promise<Document[]>} A Promise object that resolves to an array of Document objects.
*/
async loadDataAsContent(
fileContent: Uint8Array,
fileName?: string,
): Promise<Document[]> {
return this.createJob(fileContent, fileName)
async loadDataAsContent(fileContent: Uint8Array): Promise<Document[]> {
return this.createJob(fileContent)
.then(async (jobId) => {
if (this.verbose) {
console.log(`Started parsing the file under job id ${jobId}`);
Expand Down
1 change: 1 addition & 0 deletions packages/llamaindex/e2e/.env.ci
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
POSTGRES_USER=runner
Original file line number Diff line number Diff line change
@@ -1,19 +1,27 @@
/* eslint-disable turbo/no-undeclared-env-vars */
import { config } from "dotenv";
import { Document, VectorStoreQueryMode } from "llamaindex";
import { PGVectorStore } from "llamaindex/vector-store/PGVectorStore";
import assert from "node:assert";
import { test } from "node:test";
import pg from "pg";
import { registerTypes } from "pgvector/pg";

config({ path: [".env.local", ".env", ".env.ci"] });

let pgClient: pg.Client | pg.Pool;
test.afterEach(async () => {
await pgClient.end();
});

const pgConfig = {
user: process.env.POSTGRES_USER ?? "user",
password: process.env.POSTGRES_PASSWORD ?? "password",
database: "llamaindex_node_test",
};

await test("init with client", async () => {
pgClient = new pg.Client({
database: "llamaindex_node_test",
});
pgClient = new pg.Client(pgConfig);
await pgClient.connect();
await pgClient.query("CREATE EXTENSION IF NOT EXISTS vector");
await registerTypes(pgClient);
Expand All @@ -22,9 +30,7 @@ await test("init with client", async () => {
});

await test("init with pool", async () => {
pgClient = new pg.Pool({
database: "llamaindex_node_test",
});
pgClient = new pg.Pool(pgConfig);
await pgClient.query("CREATE EXTENSION IF NOT EXISTS vector");
const client = await pgClient.connect();
await registerTypes(client);
Expand All @@ -34,9 +40,7 @@ await test("init with pool", async () => {
});

await test("init without client", async () => {
const vectorStore = new PGVectorStore({
database: "llamaindex_node_test",
});
const vectorStore = new PGVectorStore(pgConfig);
pgClient = (await vectorStore.client()) as pg.Client;
assert.notDeepStrictEqual(pgClient, undefined);
});
Expand All @@ -52,7 +56,7 @@ await test("simple node", async () => {
embedding: [0.1, 0.2, 0.3],
});
const vectorStore = new PGVectorStore({
database: "llamaindex_node_test",
...pgConfig,
dimensions,
schemaName,
});
Expand Down
9 changes: 5 additions & 4 deletions packages/llamaindex/e2e/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,15 @@
"version": "0.0.7",
"type": "module",
"scripts": {
"e2e": "node --import tsx --import ./mock-register.js --test ./node/*.e2e.ts",
"e2e:nomock": "node --import tsx --test ./node/*.e2e.ts",
"e2e:updatesnap": "UPDATE_SNAPSHOT=1 node --import tsx --test ./node/*.e2e.ts"
"e2e": "node --import tsx --import ./mock-register.js --test ./node/**/*.e2e.ts",
"e2e:nomock": "node --import tsx --test ./node/**/*.e2e.ts",
"e2e:updatesnap": "UPDATE_SNAPSHOT=1 node --import tsx --test ./node/**/*.e2e.ts"
},
"devDependencies": {
"@faker-js/faker": "^8.4.1",
"@faker-js/faker": "^9.0.1",
"@types/node": "^22.5.1",
"consola": "^3.2.3",
"dotenv": "^16.4.5",
"llamaindex": "workspace:*",
"tsx": "^4.19.0"
}
Expand Down
19 changes: 14 additions & 5 deletions packages/llamaindex/src/vector-store/PGVectorStore.ts
Original file line number Diff line number Diff line change
Expand Up @@ -20,11 +20,12 @@ import { Document, MetadataMode } from "@llamaindex/core/schema";
export const PGVECTOR_SCHEMA = "public";
export const PGVECTOR_TABLE = "llamaindex_embedding";

export type PGVectorStoreConfig = {
export type PGVectorStoreConfig = Pick<
pg.ClientConfig,
"user" | "database" | "password" | "connectionString"
> & {
schemaName?: string | undefined;
tableName?: string | undefined;
database?: string | undefined;
connectionString?: string | undefined;
dimensions?: number | undefined;
embedModel?: BaseEmbedding | undefined;
};
Expand All @@ -43,8 +44,12 @@ export class PGVectorStore
private schemaName: string = PGVECTOR_SCHEMA;
private tableName: string = PGVECTOR_TABLE;

private database: string | undefined = undefined;
private connectionString: string | undefined = undefined;
private user: pg.ClientConfig["user"] | undefined = undefined;
private password: pg.ClientConfig["password"] | undefined = undefined;
private database: pg.ClientConfig["database"] | undefined = undefined;
private connectionString: pg.ClientConfig["connectionString"] | undefined =
undefined;

private dimensions: number = 1536;

private db?: pg.ClientBase;
Expand Down Expand Up @@ -76,6 +81,8 @@ export class PGVectorStore
super(config?.embedModel);
this.schemaName = config?.schemaName ?? PGVECTOR_SCHEMA;
this.tableName = config?.tableName ?? PGVECTOR_TABLE;
this.user = config?.user;
this.password = config?.password;
this.database = config?.database;
this.connectionString = config?.connectionString;
this.dimensions = config?.dimensions ?? 1536;
Expand Down Expand Up @@ -114,6 +121,8 @@ export class PGVectorStore
// Create DB connection
// Read connection params from env - see comment block above
const db = new Client({
user: this.user,
password: this.password,
database: this.database,
connectionString: this.connectionString,
});
Expand Down
2 changes: 2 additions & 0 deletions packages/llamaindex/tests/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,9 @@
"test": "vitest run"
},
"devDependencies": {
"@faker-js/faker": "^9.0.1",
"llamaindex": "workspace:*",
"msw": "^2.4.8",
"vitest": "^2.0.5"
}
}
176 changes: 176 additions & 0 deletions packages/llamaindex/tests/readers/llama-parse.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,176 @@
/**
 * DO NOT MOVE THIS TEST CASE FROM VITEST TO THE NODE.JS TEST RUNNER
 *
 * msw has a side effect that replaces the global fetch function,
 * which causes the test runner to hang indefinitely for some reason.
 * vitest starts a new process for each test case, so it is safe to use msw there,
 * whereas the node.js test runner runs everything in a single process.
 */
import { faker } from "@faker-js/faker";
import { http, HttpResponse } from "msw";
import { setupServer } from "msw/node";
import { fileURLToPath } from "node:url";
import { afterAll, afterEach, beforeAll, expect, test } from "vitest";

// Tracks which parsing job ids have been polled at least once. The first
// status poll for a job returns "PENDING"; every later poll returns
// "SUCCESS", which exercises the reader's polling loop exactly once.
const jobsHashMap = new Map<string, boolean>();

// Builds the `job_metadata` object that every result endpoint includes.
// Extracted because the same literal was previously duplicated three times.
const makeJobMetadata = () => ({
  credits_used: faker.number.int({ min: 1, max: 10 }),
  credits_max: 1000,
  job_credits_usage: faker.number.int({ min: 1, max: 10 }),
  job_pages: faker.number.int({ min: 0, max: 5 }),
  job_is_cache_hit: faker.datatype.boolean(),
});

// msw request handlers that fake the LlamaParse cloud API surface used by
// LlamaParseReader: upload, job-status polling, and the three result formats.
const handlers = [
  // Upload endpoint: always succeeds and hands back a fresh job id.
  http.post("https://api.cloud.llamaindex.ai/api/v1/parsing/upload", () => {
    return HttpResponse.json({
      id: faker.string.uuid(),
    });
  }),
  // Job-status endpoint: "PENDING" on the first poll, "SUCCESS" afterwards.
  // Return type is inferred; annotating it as `HttpResponse` was incorrect
  // (HttpResponse.json returns a StrictResponse, not the class instance type).
  http.get(
    "https://api.cloud.llamaindex.ai/api/v1/parsing/job/:id",
    ({ params }) => {
      const jobId = params.id as string;
      if (jobsHashMap.has(jobId)) {
        return HttpResponse.json({
          id: jobId,
          status: "SUCCESS",
        });
      }
      jobsHashMap.set(jobId, true);
      return HttpResponse.json({
        id: jobId,
        status: "PENDING",
      });
    },
  ),
  // Markdown result endpoint.
  http.get(
    "https://api.cloud.llamaindex.ai/api/v1/parsing/job/:id/result/markdown",
    () => {
      return HttpResponse.json({
        markdown: faker.lorem.paragraphs({
          min: 3,
          max: 1000,
        }),
        job_metadata: makeJobMetadata(),
      });
    },
  ),
  // Plain-text result endpoint.
  http.get(
    "https://api.cloud.llamaindex.ai/api/v1/parsing/job/:id/result/text",
    () => {
      return HttpResponse.json({
        text: faker.lorem.paragraphs({
          min: 3,
          max: 1000,
        }),
        job_metadata: makeJobMetadata(),
      });
    },
  ),
  // Structured JSON result endpoint: one fake page with image metadata and a
  // heading/table/text item layout, mimicking the real parser's output shape.
  http.get(
    "https://api.cloud.llamaindex.ai/api/v1/parsing/job/:id/result/json",
    () => {
      const pages = Array.from({ length: 1 }, () => ({
        page: 1,
        text: faker.lorem.paragraphs(2),
        md: `# ${faker.lorem.sentence()}\n\n${faker.lorem.paragraph()}`,
        images: [
          {
            name: faker.system.fileName(),
            height: faker.number.int({ min: 100, max: 500 }),
            width: faker.number.int({ min: 600, max: 1600 }),
            x: faker.number.int({ min: 0, max: 50 }),
            y: faker.number.int({ min: 0, max: 50 }),
            original_width: faker.number.int({ min: 1800, max: 2000 }),
            original_height: faker.number.int({ min: 400, max: 600 }),
          },
        ],
        items: [
          {
            type: "heading",
            lvl: 1,
            value: faker.lorem.sentence(),
            md: `# ${faker.lorem.sentence()}`,
            bBox: {
              x: faker.number.float({ min: 20, max: 40 }),
              y: faker.number.float({ min: 20, max: 30 }),
              w: faker.number.float({ min: 300, max: 400 }),
              h: faker.number.float({ min: 30, max: 50 }),
            },
          },
          {
            type: "table",
            rows: [
              [faker.lorem.word(), faker.lorem.sentence()],
              [faker.lorem.word(), faker.lorem.sentence()],
              [faker.lorem.word(), faker.lorem.sentence()],
              [faker.lorem.word(), faker.lorem.sentence()],
            ],
            md: faker.lorem.sentences(4),
            isPerfectTable: faker.datatype.boolean(),
            csv: faker.lorem.sentences(4),
          },
          {
            type: "text",
            value: faker.lorem.paragraphs(2),
            md: faker.lorem.paragraphs(2),
            bBox: {
              x: faker.number.float({ min: 5, max: 10 }),
              y: faker.number.float({ min: 20, max: 30 }),
              w: faker.number.float({ min: 800, max: 900 }),
              h: faker.number.float({ min: 30, max: 50 }),
            },
          },
        ],
      }));

      return HttpResponse.json({
        pages,
        job_metadata: makeJobMetadata(),
      });
    },
  ),
];

// msw server that intercepts fetch calls made by the cloud reader.
const server = setupServer(...handlers);

beforeAll(() => {
  // Fail loudly if the reader hits an endpoint we have not mocked.
  server.listen({
    onUnhandledRequest: "error",
  });
});

afterEach(() => {
  // Drop any per-test handler overrides so tests stay isolated.
  server.resetHandlers();
});

afterAll(() => {
  // Restore the original global fetch.
  server.close();
});

test("llama parse should return a successful document", async () => {
  // Import lazily so msw's fetch patching (installed in beforeAll) is active
  // before the reader module captures any network machinery.
  const { LlamaParseReader } = await import("@llamaindex/cloud/reader");
  const parser = new LlamaParseReader({
    verbose: false,
    apiKey: "llx-fake-api-key",
  });
  // Resolve the sample PDF relative to this test file.
  const samplePdfPath = fileURLToPath(
    new URL("../../../../examples/data/TOS.pdf", import.meta.url),
  );
  const parsed = await parser.loadData(samplePdfPath);
  expect(parsed).toHaveLength(1);
});
Loading

0 comments on commit fb36eff

Please sign in to comment.