Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: use Blob instead of File #1231

Merged
merged 18 commits into from
Sep 19, 2024
7 changes: 7 additions & 0 deletions .changeset/neat-maps-visit.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
---
"@llamaindex/cloud": patch
---

fix: backport for node.js 18

Node.js 18 may be missing one of the APIs we rely on, so we backport it to make the package work there.
1 change: 0 additions & 1 deletion .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@ concurrency:
cancel-in-progress: true

env:
POSTGRES_USER: runneradmin
POSTGRES_HOST_AUTH_METHOD: trust

jobs:
Expand Down
22 changes: 8 additions & 14 deletions packages/cloud/src/reader.ts
Original file line number Diff line number Diff line change
Expand Up @@ -229,20 +229,18 @@ export class LlamaParseReader extends FileReader {
}

// Create a job for the LlamaParse API
private async createJob(
data: Uint8Array,
fileName: string = "unknown",
): Promise<string> {
private async createJob(data: Uint8Array): Promise<string> {
// Load data, set the mime type
const { mime, extension } = await LlamaParseReader.getMimeType(data);
const { mime } = await LlamaParseReader.getMimeType(data);

if (this.verbose) {
const name = fileName ? fileName : extension;
console.log(`Starting load for ${name} file`);
console.log("Started uploading the file");
}

const body = {
file: new File([data], fileName, { type: mime }),
file: new Blob([data], {
type: mime,
}),
language: this.language,
parsing_instruction: this.parsingInstruction,
skip_diagonal_text: this.skipDiagonalText,
Expand Down Expand Up @@ -373,14 +371,10 @@ export class LlamaParseReader extends FileReader {
* To be used with resultType = "text" and "markdown"
*
* @param {Uint8Array} fileContent - The content of the file to be loaded.
* @param {string} [fileName] - The optional name of the file to be loaded.
* @return {Promise<Document[]>} A Promise object that resolves to an array of Document objects.
*/
async loadDataAsContent(
fileContent: Uint8Array,
fileName?: string,
): Promise<Document[]> {
return this.createJob(fileContent, fileName)
async loadDataAsContent(fileContent: Uint8Array): Promise<Document[]> {
return this.createJob(fileContent)
.then(async (jobId) => {
if (this.verbose) {
console.log(`Started parsing the file under job id ${jobId}`);
Expand Down
1 change: 1 addition & 0 deletions packages/llamaindex/e2e/.env.ci
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
POSTGRES_USER=runner
Original file line number Diff line number Diff line change
@@ -1,19 +1,27 @@
/* eslint-disable turbo/no-undeclared-env-vars */
import { config } from "dotenv";
import { Document, VectorStoreQueryMode } from "llamaindex";
import { PGVectorStore } from "llamaindex/vector-store/PGVectorStore";
import assert from "node:assert";
import { test } from "node:test";
import pg from "pg";
import { registerTypes } from "pgvector/pg";

config({ path: [".env.local", ".env", ".env.ci"] });
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is confusing: the Node.js test runner doesn't load environment variables (e.g. from .env files) by default.


let pgClient: pg.Client | pg.Pool;
test.afterEach(async () => {
await pgClient.end();
});

const pgConfig = {
user: process.env.POSTGRES_USER ?? "user",
password: process.env.POSTGRES_PASSWORD ?? "password",
database: "llamaindex_node_test",
};

await test("init with client", async () => {
pgClient = new pg.Client({
database: "llamaindex_node_test",
});
pgClient = new pg.Client(pgConfig);
await pgClient.connect();
await pgClient.query("CREATE EXTENSION IF NOT EXISTS vector");
await registerTypes(pgClient);
Expand All @@ -22,9 +30,7 @@ await test("init with client", async () => {
});

await test("init with pool", async () => {
pgClient = new pg.Pool({
database: "llamaindex_node_test",
});
pgClient = new pg.Pool(pgConfig);
await pgClient.query("CREATE EXTENSION IF NOT EXISTS vector");
const client = await pgClient.connect();
await registerTypes(client);
Expand All @@ -34,9 +40,7 @@ await test("init with pool", async () => {
});

await test("init without client", async () => {
const vectorStore = new PGVectorStore({
database: "llamaindex_node_test",
});
const vectorStore = new PGVectorStore(pgConfig);
pgClient = (await vectorStore.client()) as pg.Client;
assert.notDeepStrictEqual(pgClient, undefined);
});
Expand All @@ -52,7 +56,7 @@ await test("simple node", async () => {
embedding: [0.1, 0.2, 0.3],
});
const vectorStore = new PGVectorStore({
database: "llamaindex_node_test",
...pgConfig,
dimensions,
schemaName,
});
Expand Down
9 changes: 5 additions & 4 deletions packages/llamaindex/e2e/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,15 @@
"version": "0.0.7",
"type": "module",
"scripts": {
"e2e": "node --import tsx --import ./mock-register.js --test ./node/*.e2e.ts",
"e2e:nomock": "node --import tsx --test ./node/*.e2e.ts",
"e2e:updatesnap": "UPDATE_SNAPSHOT=1 node --import tsx --test ./node/*.e2e.ts"
"e2e": "node --import tsx --import ./mock-register.js --test ./node/**/*.e2e.ts",
"e2e:nomock": "node --import tsx --test ./node/**/*.e2e.ts",
"e2e:updatesnap": "UPDATE_SNAPSHOT=1 node --import tsx --test ./node/**/*.e2e.ts"
},
"devDependencies": {
"@faker-js/faker": "^8.4.1",
"@faker-js/faker": "^9.0.1",
"@types/node": "^22.5.1",
"consola": "^3.2.3",
"dotenv": "^16.4.5",
"llamaindex": "workspace:*",
"tsx": "^4.19.0"
}
Expand Down
19 changes: 14 additions & 5 deletions packages/llamaindex/src/vector-store/PGVectorStore.ts
Original file line number Diff line number Diff line change
Expand Up @@ -20,11 +20,12 @@ import { Document, MetadataMode } from "@llamaindex/core/schema";
export const PGVECTOR_SCHEMA = "public";
export const PGVECTOR_TABLE = "llamaindex_embedding";

export type PGVectorStoreConfig = {
export type PGVectorStoreConfig = Pick<
pg.ClientConfig,
"user" | "database" | "password" | "connectionString"
> & {
schemaName?: string | undefined;
tableName?: string | undefined;
database?: string | undefined;
connectionString?: string | undefined;
dimensions?: number | undefined;
embedModel?: BaseEmbedding | undefined;
};
Expand All @@ -43,8 +44,12 @@ export class PGVectorStore
private schemaName: string = PGVECTOR_SCHEMA;
private tableName: string = PGVECTOR_TABLE;

private database: string | undefined = undefined;
private connectionString: string | undefined = undefined;
private user: pg.ClientConfig["user"] | undefined = undefined;
private password: pg.ClientConfig["password"] | undefined = undefined;
private database: pg.ClientConfig["database"] | undefined = undefined;
private connectionString: pg.ClientConfig["connectionString"] | undefined =
undefined;

private dimensions: number = 1536;

private db?: pg.ClientBase;
Expand Down Expand Up @@ -76,6 +81,8 @@ export class PGVectorStore
super(config?.embedModel);
this.schemaName = config?.schemaName ?? PGVECTOR_SCHEMA;
this.tableName = config?.tableName ?? PGVECTOR_TABLE;
this.user = config?.user;
this.password = config?.password;
this.database = config?.database;
this.connectionString = config?.connectionString;
this.dimensions = config?.dimensions ?? 1536;
Expand Down Expand Up @@ -114,6 +121,8 @@ export class PGVectorStore
// Create DB connection
// Read connection params from env - see comment block above
const db = new Client({
user: this.user,
password: this.password,
database: this.database,
connectionString: this.connectionString,
});
Expand Down
2 changes: 2 additions & 0 deletions packages/llamaindex/tests/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,9 @@
"test": "vitest run"
},
"devDependencies": {
"@faker-js/faker": "^9.0.1",
"llamaindex": "workspace:*",
"msw": "^2.4.8",
"vitest": "^2.0.5"
}
}
176 changes: 176 additions & 0 deletions packages/llamaindex/tests/readers/llama-parse.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,176 @@
/**
* DO NOT PUT THIS TEST CASE FROM VITEST TO NODE.JS TEST RUNNER
*
* msw has side effect that will replace the global fetch function,
* which will cause the test runner to hang indefinitely for some reason.
* but vitest will start new process for each test case, so it's safe to use msw in vitest,
* in the meanwhile, node.js test runner only run in single process.
*/
import { faker } from "@faker-js/faker";
import { http, HttpResponse } from "msw";
import { setupServer } from "msw/node";
import { fileURLToPath } from "node:url";
import { afterAll, afterEach, beforeAll, expect, test } from "vitest";

// Tracks which job ids have been polled at least once, so the first status
// check can report PENDING and subsequent checks SUCCESS.
const seenJobs = new Map<string, boolean>();

// Builds the job_metadata payload attached to every parse-result response.
function fakeJobMetadata() {
  return {
    credits_used: faker.number.int({ min: 1, max: 10 }),
    credits_max: 1000,
    job_credits_usage: faker.number.int({ min: 1, max: 10 }),
    job_pages: faker.number.int({ min: 0, max: 5 }),
    job_is_cache_hit: faker.datatype.boolean(),
  };
}

const handlers = [
  // Upload endpoint: always accepts and hands back a fresh job id.
  http.post("https://api.cloud.llamaindex.ai/api/v1/parsing/upload", () =>
    HttpResponse.json({
      id: faker.string.uuid(),
    }),
  ),
  // Status poll: PENDING on the first poll for a job, SUCCESS afterwards.
  http.get(
    "https://api.cloud.llamaindex.ai/api/v1/parsing/job/:id",
    ({ params }): HttpResponse => {
      const jobId = params.id as string;
      if (!seenJobs.has(jobId)) {
        seenJobs.set(jobId, true);
        return HttpResponse.json({
          id: jobId,
          status: "PENDING",
        });
      }
      return HttpResponse.json({
        id: jobId,
        status: "SUCCESS",
      });
    },
  ),
  // Markdown result: random paragraphs plus the usual metadata.
  http.get(
    "https://api.cloud.llamaindex.ai/api/v1/parsing/job/:id/result/markdown",
    () => {
      const job_metadata = fakeJobMetadata();
      return HttpResponse.json({
        markdown: faker.lorem.paragraphs({
          min: 3,
          max: 1000,
        }),
        job_metadata,
      });
    },
  ),
  // Plain-text result: same shape as markdown but keyed as `text`.
  http.get(
    "https://api.cloud.llamaindex.ai/api/v1/parsing/job/:id/result/text",
    () => {
      const job_metadata = fakeJobMetadata();
      return HttpResponse.json({
        text: faker.lorem.paragraphs({
          min: 3,
          max: 1000,
        }),
        job_metadata,
      });
    },
  ),
  // Structured JSON result: a single fake page with an image and a few
  // layout items (heading, table, text), mirroring the LlamaParse schema.
  http.get(
    "https://api.cloud.llamaindex.ai/api/v1/parsing/job/:id/result/json",
    () => {
      const fakePages = Array.from({ length: 1 }, () => ({
        page: 1,
        text: faker.lorem.paragraphs(2),
        md: `# ${faker.lorem.sentence()}\n\n${faker.lorem.paragraph()}`,
        images: [
          {
            name: faker.system.fileName(),
            height: faker.number.int({ min: 100, max: 500 }),
            width: faker.number.int({ min: 600, max: 1600 }),
            x: faker.number.int({ min: 0, max: 50 }),
            y: faker.number.int({ min: 0, max: 50 }),
            original_width: faker.number.int({ min: 1800, max: 2000 }),
            original_height: faker.number.int({ min: 400, max: 600 }),
          },
        ],
        items: [
          {
            type: "heading",
            lvl: 1,
            value: faker.lorem.sentence(),
            md: `# ${faker.lorem.sentence()}`,
            bBox: {
              x: faker.number.float({ min: 20, max: 40 }),
              y: faker.number.float({ min: 20, max: 30 }),
              w: faker.number.float({ min: 300, max: 400 }),
              h: faker.number.float({ min: 30, max: 50 }),
            },
          },
          {
            type: "table",
            rows: [
              [faker.lorem.word(), faker.lorem.sentence()],
              [faker.lorem.word(), faker.lorem.sentence()],
              [faker.lorem.word(), faker.lorem.sentence()],
              [faker.lorem.word(), faker.lorem.sentence()],
            ],
            md: faker.lorem.sentences(4),
            isPerfectTable: faker.datatype.boolean(),
            csv: faker.lorem.sentences(4),
          },
          {
            type: "text",
            value: faker.lorem.paragraphs(2),
            md: faker.lorem.paragraphs(2),
            bBox: {
              x: faker.number.float({ min: 5, max: 10 }),
              y: faker.number.float({ min: 20, max: 30 }),
              w: faker.number.float({ min: 800, max: 900 }),
              h: faker.number.float({ min: 30, max: 50 }),
            },
          },
        ],
      }));

      return HttpResponse.json({
        pages: fakePages,
        job_metadata: fakeJobMetadata(),
      });
    },
  ),
];

// Wire the msw mock server into the test lifecycle.
const server = setupServer(...handlers);

// Fail loudly on any request that has no stub registered.
beforeAll(() => server.listen({ onUnhandledRequest: "error" }));

// Restore the original handlers after each test so per-test overrides
// cannot leak into the next case.
afterEach(() => server.resetHandlers());

// Release the patched fetch when the file is done.
afterAll(() => server.close());

test("llama parse should return a successful document", async () => {
  // Imported lazily — presumably so msw has patched fetch before the
  // reader module captures it (see the header note); confirm if moved.
  const { LlamaParseReader } = await import("@llamaindex/cloud/reader");
  const parser = new LlamaParseReader({
    apiKey: "llx-fake-api-key",
    verbose: false,
  });
  // Use a real sample PDF from the examples folder as the upload payload.
  const pdfPath = fileURLToPath(
    new URL("../../../../examples/data/TOS.pdf", import.meta.url),
  );
  const documents = await parser.loadData(pdfPath);
  expect(documents).toHaveLength(1);
});
Loading