Skip to content

Commit

Permalink
fix: use Blob instead of File (run-llama#1231)
Browse files Browse the repository at this point in the history
  • Loading branch information
himself65 authored Sep 19, 2024
1 parent d24d3d1 commit fb36eff
Show file tree
Hide file tree
Showing 10 changed files with 593 additions and 87 deletions.
7 changes: 7 additions & 0 deletions .changeset/neat-maps-visit.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
---
"@llamaindex/cloud": patch
---

fix: backport for node.js 18

Node.js 18 may be missing an API that we rely on, so we need to backport a fix to make it work there.
1 change: 0 additions & 1 deletion .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@ concurrency:
cancel-in-progress: true

env:
POSTGRES_USER: runneradmin
POSTGRES_HOST_AUTH_METHOD: trust

jobs:
Expand Down
22 changes: 8 additions & 14 deletions packages/cloud/src/reader.ts
Original file line number Diff line number Diff line change
Expand Up @@ -229,20 +229,18 @@ export class LlamaParseReader extends FileReader {
}

// Create a job for the LlamaParse API
private async createJob(
data: Uint8Array,
fileName: string = "unknown",
): Promise<string> {
private async createJob(data: Uint8Array): Promise<string> {
// Load data, set the mime type
const { mime, extension } = await LlamaParseReader.getMimeType(data);
const { mime } = await LlamaParseReader.getMimeType(data);

if (this.verbose) {
const name = fileName ? fileName : extension;
console.log(`Starting load for ${name} file`);
console.log("Started uploading the file");
}

const body = {
file: new File([data], fileName, { type: mime }),
file: new Blob([data], {
type: mime,
}),
language: this.language,
parsing_instruction: this.parsingInstruction,
skip_diagonal_text: this.skipDiagonalText,
Expand Down Expand Up @@ -373,14 +371,10 @@ export class LlamaParseReader extends FileReader {
* To be used with resultType = "text" and "markdown"
*
* @param {Uint8Array} fileContent - The content of the file to be loaded.
* @param {string} [fileName] - The optional name of the file to be loaded.
* @return {Promise<Document[]>} A Promise object that resolves to an array of Document objects.
*/
async loadDataAsContent(
fileContent: Uint8Array,
fileName?: string,
): Promise<Document[]> {
return this.createJob(fileContent, fileName)
async loadDataAsContent(fileContent: Uint8Array): Promise<Document[]> {
return this.createJob(fileContent)
.then(async (jobId) => {
if (this.verbose) {
console.log(`Started parsing the file under job id ${jobId}`);
Expand Down
1 change: 1 addition & 0 deletions packages/llamaindex/e2e/.env.ci
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
POSTGRES_USER=runner
Original file line number Diff line number Diff line change
@@ -1,19 +1,27 @@
/* eslint-disable turbo/no-undeclared-env-vars */
import { config } from "dotenv";
import { Document, VectorStoreQueryMode } from "llamaindex";
import { PGVectorStore } from "llamaindex/vector-store/PGVectorStore";
import assert from "node:assert";
import { test } from "node:test";
import pg from "pg";
import { registerTypes } from "pgvector/pg";

config({ path: [".env.local", ".env", ".env.ci"] });

let pgClient: pg.Client | pg.Pool;
test.afterEach(async () => {
await pgClient.end();
});

const pgConfig = {
user: process.env.POSTGRES_USER ?? "user",
password: process.env.POSTGRES_PASSWORD ?? "password",
database: "llamaindex_node_test",
};

await test("init with client", async () => {
pgClient = new pg.Client({
database: "llamaindex_node_test",
});
pgClient = new pg.Client(pgConfig);
await pgClient.connect();
await pgClient.query("CREATE EXTENSION IF NOT EXISTS vector");
await registerTypes(pgClient);
Expand All @@ -22,9 +30,7 @@ await test("init with client", async () => {
});

await test("init with pool", async () => {
pgClient = new pg.Pool({
database: "llamaindex_node_test",
});
pgClient = new pg.Pool(pgConfig);
await pgClient.query("CREATE EXTENSION IF NOT EXISTS vector");
const client = await pgClient.connect();
await registerTypes(client);
Expand All @@ -34,9 +40,7 @@ await test("init with pool", async () => {
});

await test("init without client", async () => {
const vectorStore = new PGVectorStore({
database: "llamaindex_node_test",
});
const vectorStore = new PGVectorStore(pgConfig);
pgClient = (await vectorStore.client()) as pg.Client;
assert.notDeepStrictEqual(pgClient, undefined);
});
Expand All @@ -52,7 +56,7 @@ await test("simple node", async () => {
embedding: [0.1, 0.2, 0.3],
});
const vectorStore = new PGVectorStore({
database: "llamaindex_node_test",
...pgConfig,
dimensions,
schemaName,
});
Expand Down
9 changes: 5 additions & 4 deletions packages/llamaindex/e2e/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,15 @@
"version": "0.0.7",
"type": "module",
"scripts": {
"e2e": "node --import tsx --import ./mock-register.js --test ./node/*.e2e.ts",
"e2e:nomock": "node --import tsx --test ./node/*.e2e.ts",
"e2e:updatesnap": "UPDATE_SNAPSHOT=1 node --import tsx --test ./node/*.e2e.ts"
"e2e": "node --import tsx --import ./mock-register.js --test ./node/**/*.e2e.ts",
"e2e:nomock": "node --import tsx --test ./node/**/*.e2e.ts",
"e2e:updatesnap": "UPDATE_SNAPSHOT=1 node --import tsx --test ./node/**/*.e2e.ts"
},
"devDependencies": {
"@faker-js/faker": "^8.4.1",
"@faker-js/faker": "^9.0.1",
"@types/node": "^22.5.1",
"consola": "^3.2.3",
"dotenv": "^16.4.5",
"llamaindex": "workspace:*",
"tsx": "^4.19.0"
}
Expand Down
19 changes: 14 additions & 5 deletions packages/llamaindex/src/vector-store/PGVectorStore.ts
Original file line number Diff line number Diff line change
Expand Up @@ -20,11 +20,12 @@ import { Document, MetadataMode } from "@llamaindex/core/schema";
export const PGVECTOR_SCHEMA = "public";
export const PGVECTOR_TABLE = "llamaindex_embedding";

export type PGVectorStoreConfig = {
export type PGVectorStoreConfig = Pick<
pg.ClientConfig,
"user" | "database" | "password" | "connectionString"
> & {
schemaName?: string | undefined;
tableName?: string | undefined;
database?: string | undefined;
connectionString?: string | undefined;
dimensions?: number | undefined;
embedModel?: BaseEmbedding | undefined;
};
Expand All @@ -43,8 +44,12 @@ export class PGVectorStore
private schemaName: string = PGVECTOR_SCHEMA;
private tableName: string = PGVECTOR_TABLE;

private database: string | undefined = undefined;
private connectionString: string | undefined = undefined;
private user: pg.ClientConfig["user"] | undefined = undefined;
private password: pg.ClientConfig["password"] | undefined = undefined;
private database: pg.ClientConfig["database"] | undefined = undefined;
private connectionString: pg.ClientConfig["connectionString"] | undefined =
undefined;

private dimensions: number = 1536;

private db?: pg.ClientBase;
Expand Down Expand Up @@ -76,6 +81,8 @@ export class PGVectorStore
super(config?.embedModel);
this.schemaName = config?.schemaName ?? PGVECTOR_SCHEMA;
this.tableName = config?.tableName ?? PGVECTOR_TABLE;
this.user = config?.user;
this.password = config?.password;
this.database = config?.database;
this.connectionString = config?.connectionString;
this.dimensions = config?.dimensions ?? 1536;
Expand Down Expand Up @@ -114,6 +121,8 @@ export class PGVectorStore
// Create DB connection
// Read connection params from env - see comment block above
const db = new Client({
user: this.user,
password: this.password,
database: this.database,
connectionString: this.connectionString,
});
Expand Down
2 changes: 2 additions & 0 deletions packages/llamaindex/tests/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,9 @@
"test": "vitest run"
},
"devDependencies": {
"@faker-js/faker": "^9.0.1",
"llamaindex": "workspace:*",
"msw": "^2.4.8",
"vitest": "^2.0.5"
}
}
176 changes: 176 additions & 0 deletions packages/llamaindex/tests/readers/llama-parse.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,176 @@
/**
 * DO NOT MOVE THIS TEST CASE FROM VITEST TO THE NODE.JS TEST RUNNER
 *
 * msw has a side effect that replaces the global fetch function,
 * which causes the test runner to hang indefinitely for some reason.
 * vitest starts a new process for each test case, so it is safe to use msw there,
 * whereas the node.js test runner runs everything in a single process.
 */
import { faker } from "@faker-js/faker";
import { http, HttpResponse } from "msw";
import { setupServer } from "msw/node";
import { fileURLToPath } from "node:url";
import { afterAll, afterEach, beforeAll, expect, test } from "vitest";

// Tracks which parsing job ids have been polled at least once. The first
// status poll for a job returns "PENDING"; every later poll returns
// "SUCCESS", which exercises the reader's polling loop exactly once.
const jobsHashMap = new Map<string, boolean>();

// Builds the `job_metadata` object that every result endpoint includes.
// Extracted because the same literal was previously duplicated three times.
const makeJobMetadata = () => ({
  credits_used: faker.number.int({ min: 1, max: 10 }),
  credits_max: 1000,
  job_credits_usage: faker.number.int({ min: 1, max: 10 }),
  job_pages: faker.number.int({ min: 0, max: 5 }),
  job_is_cache_hit: faker.datatype.boolean(),
});

// msw request handlers that fake the LlamaParse cloud API surface used by
// LlamaParseReader: upload, job-status polling, and the three result formats.
const handlers = [
  // Upload endpoint: always succeeds and hands back a fresh job id.
  http.post("https://api.cloud.llamaindex.ai/api/v1/parsing/upload", () => {
    return HttpResponse.json({
      id: faker.string.uuid(),
    });
  }),
  // Job-status endpoint: "PENDING" on the first poll, "SUCCESS" afterwards.
  // Return type is inferred; annotating it as `HttpResponse` was incorrect
  // (HttpResponse.json returns a StrictResponse, not the class instance type).
  http.get(
    "https://api.cloud.llamaindex.ai/api/v1/parsing/job/:id",
    ({ params }) => {
      const jobId = params.id as string;
      if (jobsHashMap.has(jobId)) {
        return HttpResponse.json({
          id: jobId,
          status: "SUCCESS",
        });
      }
      jobsHashMap.set(jobId, true);
      return HttpResponse.json({
        id: jobId,
        status: "PENDING",
      });
    },
  ),
  // Markdown result endpoint.
  http.get(
    "https://api.cloud.llamaindex.ai/api/v1/parsing/job/:id/result/markdown",
    () => {
      return HttpResponse.json({
        markdown: faker.lorem.paragraphs({
          min: 3,
          max: 1000,
        }),
        job_metadata: makeJobMetadata(),
      });
    },
  ),
  // Plain-text result endpoint.
  http.get(
    "https://api.cloud.llamaindex.ai/api/v1/parsing/job/:id/result/text",
    () => {
      return HttpResponse.json({
        text: faker.lorem.paragraphs({
          min: 3,
          max: 1000,
        }),
        job_metadata: makeJobMetadata(),
      });
    },
  ),
  // Structured JSON result endpoint: one fake page with image metadata and a
  // heading/table/text item layout, mimicking the real parser's output shape.
  http.get(
    "https://api.cloud.llamaindex.ai/api/v1/parsing/job/:id/result/json",
    () => {
      const pages = Array.from({ length: 1 }, () => ({
        page: 1,
        text: faker.lorem.paragraphs(2),
        md: `# ${faker.lorem.sentence()}\n\n${faker.lorem.paragraph()}`,
        images: [
          {
            name: faker.system.fileName(),
            height: faker.number.int({ min: 100, max: 500 }),
            width: faker.number.int({ min: 600, max: 1600 }),
            x: faker.number.int({ min: 0, max: 50 }),
            y: faker.number.int({ min: 0, max: 50 }),
            original_width: faker.number.int({ min: 1800, max: 2000 }),
            original_height: faker.number.int({ min: 400, max: 600 }),
          },
        ],
        items: [
          {
            type: "heading",
            lvl: 1,
            value: faker.lorem.sentence(),
            md: `# ${faker.lorem.sentence()}`,
            bBox: {
              x: faker.number.float({ min: 20, max: 40 }),
              y: faker.number.float({ min: 20, max: 30 }),
              w: faker.number.float({ min: 300, max: 400 }),
              h: faker.number.float({ min: 30, max: 50 }),
            },
          },
          {
            type: "table",
            rows: [
              [faker.lorem.word(), faker.lorem.sentence()],
              [faker.lorem.word(), faker.lorem.sentence()],
              [faker.lorem.word(), faker.lorem.sentence()],
              [faker.lorem.word(), faker.lorem.sentence()],
            ],
            md: faker.lorem.sentences(4),
            isPerfectTable: faker.datatype.boolean(),
            csv: faker.lorem.sentences(4),
          },
          {
            type: "text",
            value: faker.lorem.paragraphs(2),
            md: faker.lorem.paragraphs(2),
            bBox: {
              x: faker.number.float({ min: 5, max: 10 }),
              y: faker.number.float({ min: 20, max: 30 }),
              w: faker.number.float({ min: 800, max: 900 }),
              h: faker.number.float({ min: 30, max: 50 }),
            },
          },
        ],
      }));

      return HttpResponse.json({
        pages,
        job_metadata: makeJobMetadata(),
      });
    },
  ),
];

// msw server that intercepts fetch calls made by the cloud reader.
const server = setupServer(...handlers);

beforeAll(() => {
  // Fail loudly if the reader hits an endpoint we have not mocked.
  server.listen({
    onUnhandledRequest: "error",
  });
});

afterEach(() => {
  // Drop any per-test handler overrides so tests stay isolated.
  server.resetHandlers();
});

afterAll(() => {
  // Restore the original global fetch.
  server.close();
});

test("llama parse should return a successful document", async () => {
  // Import lazily so msw's fetch patching (installed in beforeAll) is active
  // before the reader module captures any network machinery.
  const { LlamaParseReader } = await import("@llamaindex/cloud/reader");
  const parser = new LlamaParseReader({
    verbose: false,
    apiKey: "llx-fake-api-key",
  });
  // Resolve the sample PDF relative to this test file.
  const samplePdfPath = fileURLToPath(
    new URL("../../../../examples/data/TOS.pdf", import.meta.url),
  );
  const parsed = await parser.loadData(samplePdfPath);
  expect(parsed).toHaveLength(1);
});
Loading

0 comments on commit fb36eff

Please sign in to comment.