From 61dda5ce7f8fa9a93cb3f235195b66eaa135b941 Mon Sep 17 00:00:00 2001 From: Ben Perlmutter <90647379+mongodben@users.noreply.github.com> Date: Tue, 15 Aug 2023 16:27:22 -0400 Subject: [PATCH 01/12] Rebuild services --- .drone.yml | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/.drone.yml b/.drone.yml index 970c675ce..494623a24 100644 --- a/.drone.yml +++ b/.drone.yml @@ -53,7 +53,7 @@ steps: # Chat Server pipelines # --- --- -depends_on: ["test-all"] +# depends_on: ["test-all"] kind: pipeline type: kubernetes name: staging-build-chat-server @@ -61,14 +61,15 @@ name: staging-build-chat-server trigger: branch: - main + - DOCSP-32194 event: - push - paths: - include: - - chat-server/**/* - - chat-core/**/* - - chat-ui/**/* + # paths: + # include: + # - chat-server/**/* + # - chat-core/**/* + # - chat-ui/**/* steps: # Builds and publishes Docker image for staging @@ -302,7 +303,7 @@ steps: # Ingest service # --- --- -depends_on: ["test-all"] +# depends_on: ["test-all"] kind: pipeline type: kubernetes name: staging-build-ingest-service @@ -310,12 +311,13 @@ name: staging-build-ingest-service trigger: branch: - main + - DOCSP-32194 event: - push - paths: - include: - - ingest/**/* - - chat-core/**/* + # paths: + # include: + # - ingest/**/* + # - chat-core/**/* steps: # Builds and publishes Docker image for staging From 7db2d9d87a2e1b6a246aee476d5ffc58d9e9da2c Mon Sep 17 00:00:00 2001 From: Ben Perlmutter <90647379+mongodben@users.noreply.github.com> Date: Tue, 15 Aug 2023 17:05:57 -0400 Subject: [PATCH 02/12] Trigger staging deploy --- .drone.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.drone.yml b/.drone.yml index 494623a24..16c8ff21c 100644 --- a/.drone.yml +++ b/.drone.yml @@ -135,9 +135,9 @@ steps: api_server: https://api.staging.corp.mongodb.com kubernetes_token: from_secret: staging_kubernetes_token - when: - branch: - - main + # when: + # branch: + # - main --- depends_on: ["test-all"] kind: pipeline From ebbd71791bba0e632391b225d2be52ace7272561 Mon Sep 17 00:00:00 2001 From: Ben Perlmutter <90647379+mongodben@users.noreply.github.com> Date: Tue, 15 Aug 2023 17:50:53 -0400 Subject: [PATCH 03/12] Trigger ingest build --- ingest/trigger | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 ingest/trigger diff --git a/ingest/trigger b/ingest/trigger new file mode 100644 index 000000000..e69de29bb From 0f152bbb3a840f2332d96c4e5df5135f28d93660 Mon Sep 17 00:00:00 2001 From: Ben Perlmutter <90647379+mongodben@users.noreply.github.com> Date: Wed, 16 Aug 2023 11:13:39 -0400 Subject: [PATCH 04/12] clean up drone file + PR --- .drone.yml | 30 ++++++++++++++---------------- ingest/trigger | 0 2 files changed, 14 insertions(+), 16 deletions(-) delete mode 100644 ingest/trigger diff --git a/.drone.yml b/.drone.yml index 16c8ff21c..970c675ce 100644 --- a/.drone.yml +++ b/.drone.yml @@ -53,7 +53,7 @@ steps: # Chat Server pipelines # --- --- -# depends_on: ["test-all"] +depends_on: ["test-all"] kind: pipeline type: kubernetes name: staging-build-chat-server @@ -61,15 +61,14 @@ name: staging-build-chat-server trigger: branch: - main - - DOCSP-32194 event: - push - # paths: - # include: - # - chat-server/**/* - # - chat-core/**/* - # - chat-ui/**/* + paths: + include: + - chat-server/**/* + - chat-core/**/* + - chat-ui/**/* steps: # Builds and publishes Docker image for staging @@ -135,9 +134,9 @@ steps: api_server: https://api.staging.corp.mongodb.com kubernetes_token: from_secret: staging_kubernetes_token - # when: - # branch: - # - main + when: + branch: + - main --- depends_on: ["test-all"] kind: pipeline @@ -303,7 +302,7 @@ steps: # Ingest service # --- --- -# depends_on: ["test-all"] +depends_on: ["test-all"] kind: pipeline type: kubernetes name: staging-build-ingest-service @@ -311,13 +310,12 @@ name: staging-build-ingest-service trigger: branch: - main - - DOCSP-32194 event: - push - # paths: - # include: - # - ingest/**/* - # - chat-core/**/* + paths: + include: + - ingest/**/* + - chat-core/**/* steps: # Builds and publishes Docker image for staging diff --git a/ingest/trigger b/ingest/trigger deleted file mode 100644 index e69de29bb..000000000 From b2005a66923dd3218649175a1626fe5c597dc0bf Mon Sep 17 00:00:00 2001 From: Ben Perlmutter <90647379+mongodben@users.noreply.github.com> Date: Wed, 16 Aug 2023 16:03:30 -0400 Subject: [PATCH 05/12] refactor with Page.metadata --- chat-core/src/DatabaseConnection.test.ts | 16 +++++++++---- chat-core/src/Page.ts | 9 ++++++-- ingest/src/DevCenterDataSource.ts | 4 +++- ingest/src/ProjectBase.d.ts | 6 +++++ ingest/src/SnootyDataSource.test.ts | 20 ++++++++++++---- ingest/src/SnootyDataSource.ts | 17 +++++++++++--- ingest/src/chunkPage.test.ts | 17 +++++--------- ingest/src/chunkPage.ts | 11 +++++---- ingest/src/getChangedPages.test.ts | 19 ++++++++++++---- ingest/src/getChangedPages.ts | 4 ++-- ingest/src/projectSources.ts | 29 ++++++++++++++++++++++++ ingest/src/updateEmbeddedContent.test.ts | 4 +++- ingest/src/updatePages.test.ts | 4 +++- 13 files changed, 122 insertions(+), 38 deletions(-) diff --git a/chat-core/src/DatabaseConnection.test.ts b/chat-core/src/DatabaseConnection.test.ts index eb7a10ad3..ce632f8f9 100644 --- a/chat-core/src/DatabaseConnection.test.ts +++ b/chat-core/src/DatabaseConnection.test.ts @@ -51,7 +51,9 @@ describe("DatabaseConnection", () => { body: "foo", format: "md", sourceName: "source1", - tags: [], + metadata: { + tags: [], + }, updated: new Date(), url: "/x/y/z", }; @@ -113,7 +115,9 @@ describe("DatabaseConnection", () => { body: "foo", format: "md", sourceName: "source1", - tags: [], + metadata: { + tags: [], + }, updated: new Date(), url: "/x/y/z", }; @@ -153,7 +157,9 @@ describe("DatabaseConnection", () => { body: "The Matrix (1999) comes out", format: "md", sourceName: "", - tags: [], + metadata: { + tags: [], + }, updated: new Date("1999-03-31"), url: "matrix1", }, @@ -162,7 +168,9 @@ describe("DatabaseConnection", () => { body: "The Matrix: Reloaded (2003) comes out", format: "md", sourceName: "", - tags: [], + metadata: { + tags: [], + }, updated: new Date("2003-05-15"), url: "matrix2", }, diff --git a/chat-core/src/Page.ts b/chat-core/src/Page.ts index 46a11ed10..333063e19 100644 --- a/chat-core/src/Page.ts +++ b/chat-core/src/Page.ts @@ -22,9 +22,14 @@ export type Page = { sourceName: string; /** - Arbitrary tags. + Arbitrary metadata for page. */ - tags: string[]; + metadata?: { + /** + Arbitrary tags. + */ + tags?: string[]; + } & Record; }; export type PageAction = "created" | "updated" | "deleted"; diff --git a/ingest/src/DevCenterDataSource.ts b/ingest/src/DevCenterDataSource.ts index 953a4b3e0..fcc26a6e3 100644 --- a/ingest/src/DevCenterDataSource.ts +++ b/ingest/src/DevCenterDataSource.ts @@ -52,7 +52,9 @@ export const makeDevCenterDataSource = async ({ }), format: "md", sourceName: name, - tags: [], // TODO + metadata: { + tags: [], // TODO + }, url: /^https?:\/\//.test(document.calculated_slug) ? document.calculated_slug : new URL( diff --git a/ingest/src/ProjectBase.d.ts b/ingest/src/ProjectBase.d.ts index 4357d5d2b..86a8b8fe6 100644 --- a/ingest/src/ProjectBase.d.ts +++ b/ingest/src/ProjectBase.d.ts @@ -17,4 +17,10 @@ export interface ProjectBase { @example ["kotlin", "docs", "driver"] */ tags?: string[]; + + /** + Name of the product. + @example "MongoDB Atlas" + */ + productName?: string; } diff --git a/ingest/src/SnootyDataSource.test.ts b/ingest/src/SnootyDataSource.test.ts index e47a2ea4d..4f7d54f51 100644 --- a/ingest/src/SnootyDataSource.test.ts +++ b/ingest/src/SnootyDataSource.test.ts @@ -57,7 +57,9 @@ describe("SnootyDataSource", () => { expect(pages[1]).toMatchObject({ format: "md", sourceName: "snooty-test", - tags: ["docs", "manual"], + metadata: { + tags: ["docs", "manual"], + }, url: "https://mongodb.com/docs/v6.0/administration/", body: firstPageText, }); @@ -73,7 +75,9 @@ describe("SnootyDataSource", () => { expect(pages[0]).toMatchObject({ format: "md", sourceName: "snooty-docs", - tags: ["docs", "manual"], + metadata: { + tags: ["docs", "manual"], + }, url: "https://mongodb.com/docs/v6.0/", }); @@ -83,7 +87,9 @@ describe("SnootyDataSource", () => { expect(pages[2]).toMatchObject({ format: "md", sourceName: "snooty-docs", - tags: ["docs", "manual"], + metadata: { + tags: ["docs", "manual"], + }, url: "https://mongodb.com/docs/v6.0/administration/analyzing-mongodb-performance/index/", }); @@ -91,7 +97,9 @@ describe("SnootyDataSource", () => { expect(pages[3]).toMatchObject({ format: "md", sourceName: "snooty-docs", - tags: ["docs", "manual"], + metadata: { + tags: ["docs", "manual"], + }, url: "https://mongodb.com/docs/v6.0/administration/index/backup-sharded-clusters/", }); @@ -99,7 +107,9 @@ describe("SnootyDataSource", () => { expect(pages[4]).toMatchObject({ format: "md", sourceName: "snooty-docs", - tags: ["docs", "manual"], + metadata: { + tags: ["docs", "manual"], + }, url: "https://mongodb.com/docs/v6.0/administration/change-streams-production-recommendations/how-to-index/", }); }); diff --git a/ingest/src/SnootyDataSource.ts b/ingest/src/SnootyDataSource.ts index 84daa3433..74789ca05 100644 --- a/ingest/src/SnootyDataSource.ts +++ b/ingest/src/SnootyDataSource.ts @@ -97,7 +97,13 @@ export const makeSnootyDataSource = async ({ _snootyProjectName: string; } > => { - const { baseUrl, currentBranch, name: snootyProjectName, tags } = project; + const { + baseUrl, + currentBranch, + name: snootyProjectName, + tags, + productName, + } = project; return { // Additional members for testing purposes _baseUrl: baseUrl, @@ -128,7 +134,7 @@ export const makeSnootyDataSource = async ({ (async () => { const page = await handlePage( (entry as SnootyPageEntry).data, - { sourceName, baseUrl, tags: tags ?? [] } + { sourceName, baseUrl, tags: tags ?? [], productName } ); pages.push(page); })() @@ -214,10 +220,12 @@ const handlePage = async ( sourceName, baseUrl, tags = [], + productName, }: { sourceName: string; baseUrl: string; tags: string[]; + productName?: string; } ): Promise => { // Strip first three path segments - according to Snooty team, they'll always @@ -242,6 +250,9 @@ const handlePage = async ( title: getTitleFromSnootyAst(page.ast), body: snootyAstToMd(page.ast, { baseUrl }), format: "md", - tags, + metadata: { + tags, + productName, + }, }; }; diff --git a/ingest/src/chunkPage.test.ts b/ingest/src/chunkPage.test.ts index d622aafd8..f91a02b68 100644 --- a/ingest/src/chunkPage.test.ts +++ b/ingest/src/chunkPage.test.ts @@ -16,7 +16,9 @@ Praesent a neque diam. Sed ultricies nunc quam, sed maximus risus dignissim sit Vestibulum tempus aliquet convallis. Aenean ac dolor sed tortor malesuada bibendum in vel diam. Pellentesque varius dapibus molestie. Pellentesque habitant morbi tristique senectus et netus et malesuada fames ac turpis egestas. Mauris blandit metus sit amet libero pretium, sit amet cursus sem tempor. Proin euismod ut mi vitae luctus. Etiam pulvinar lacus nulla, vel placerat lacus pharetra auctor.`, format: "md", sourceName: "test-source", - tags: ["a", "b"], + metadata: { + tags: ["a", "b"], + }, }; it("chunks pages", async () => { const chunks = await chunkPage(page, { chunkSize: 500, chunkOverlap: 0 }); @@ -95,12 +97,10 @@ Vestibulum tempus aliquet convallis. Aenean ac dolor sed tortor malesuada bibend metadata: { hasCodeBlock: false, pageTitle: "Test Page", - sourceName: "test-source", tags: ["a", "b"], }, text: `--- pageTitle: Test Page -sourceName: test-source hasCodeBlock: false tags: - a @@ -108,7 +108,7 @@ tags: --- This is some text`, - tokenCount: 39, // Calculated after transformation + tokenCount: 32, // Calculated after transformation url: "test", }, ]); @@ -130,13 +130,11 @@ This is some text`, metadata: { hasCodeBlock: true, pageTitle: "Test Page", - sourceName: "test-source", codeBlockLanguages: ["js"], tags: ["a", "b"], }, text: `--- pageTitle: Test Page -sourceName: test-source hasCodeBlock: true codeBlockLanguages: - js @@ -152,7 +150,7 @@ let foo = 1 + 1; \`\`\` Neat, huh?`, - tokenCount: 75, + tokenCount: 68, url: "test", }, ]); @@ -174,12 +172,10 @@ Neat, huh?`, metadata: { hasCodeBlock: true, pageTitle: "Test Page", - sourceName: "test-source", tags: ["a", "b"], }, text: `--- pageTitle: Test Page -sourceName: test-source hasCodeBlock: true tags: - a @@ -193,7 +189,7 @@ let foo = 1 + 1; \`\`\` Neat, huh?`, - tokenCount: 65, + tokenCount: 58, url: "test", }, ]); @@ -230,7 +226,6 @@ someArray: - foo hasCodeBlock: false pageTitle: Test Page -sourceName: test-source tags: - a - b diff --git a/ingest/src/chunkPage.ts b/ingest/src/chunkPage.ts index f1287f040..40ef79d89 100644 --- a/ingest/src/chunkPage.ts +++ b/ingest/src/chunkPage.ts @@ -4,6 +4,7 @@ import { RecursiveCharacterTextSplitter } from "langchain/text_splitter"; import GPT3Tokenizer from "gpt3-tokenizer"; import { EmbeddedContent, Page } from "chat-core"; import { updateFrontMatter, extractFrontMatter } from "chat-core"; +import { string } from "yargs"; export type ContentChunk = Omit; @@ -143,10 +144,11 @@ export const makeChunkFrontMatterUpdater = < */ export const standardMetadataGetter: ChunkMetadataGetter<{ pageTitle?: string; - sourceName: string; + productName?: string; hasCodeBlock: boolean; codeBlockLanguages?: string[]; tags?: string[]; + [string: string]: unknown; }> = async ({ page, text }) => { // Detect code blocks const mdCodeBlockToken = /```([A-z0-1-_]*)/; @@ -165,7 +167,6 @@ export const standardMetadataGetter: ChunkMetadataGetter<{ const metadata: Awaited> = { pageTitle: page.title, - sourceName: page.sourceName, hasCodeBlock: codeBlockLanguages.length !== 0, }; @@ -178,8 +179,10 @@ export const standardMetadataGetter: ChunkMetadataGetter<{ metadata["codeBlockLanguages"] = specifiedLanguages; } - if (page.tags.length !== 0) { - metadata["tags"] = page.tags; + if (page.metadata) { + for (const key in page.metadata) { + metadata[key] = page.metadata[key]; + } } return metadata; diff --git a/ingest/src/getChangedPages.test.ts b/ingest/src/getChangedPages.test.ts index f696866b2..e7fcdcc2e 100644 --- a/ingest/src/getChangedPages.test.ts +++ b/ingest/src/getChangedPages.test.ts @@ -8,7 +8,9 @@ describe("getChangedPages", () => { body: "abc", format: "md", sourceName: "test", - tags: [], + metadata: { + tags: [], + }, }; const [page0, page1, page2, page3] = Array(4) .fill(0) @@ -54,18 +56,27 @@ describe("getChangedPages", () => { format: "md", sourceName: "test", url: "test", - tags: ["test1", "test2"], + metadata: { + tags: ["test1", "test2"], + }, }; const { created, updated, deleted } = await getChangedPages({ oldPages: [{ ...page, action: "updated" }], - newPages: [{ ...page, tags: ["newTag", ...page.tags] }], + newPages: [ + { + ...page, + metadata: { tags: ["newTag", ...(page?.metadata?.tags || [])] }, + }, + ], }); const changedPages = [...deleted, ...created, ...updated]; expect(changedPages.length).toBe(1); expect(changedPages[0]).toMatchObject({ action: "updated", - tags: ["newTag", ...page.tags], + metadata: { + tags: ["newTag", ...(page?.metadata?.tags || [])], + }, }); }); }); diff --git a/ingest/src/getChangedPages.ts b/ingest/src/getChangedPages.ts index c276d21a9..927653d8d 100644 --- a/ingest/src/getChangedPages.ts +++ b/ingest/src/getChangedPages.ts @@ -91,11 +91,11 @@ const comparablePartialPage = ({ sourceName, body, format, - tags, + metadata, }: Page): Partial => ({ url, sourceName, body, format, - tags, + metadata, }); diff --git a/ingest/src/projectSources.ts b/ingest/src/projectSources.ts index e5fabde97..401c3ad6e 100644 --- a/ingest/src/projectSources.ts +++ b/ingest/src/projectSources.ts @@ -19,12 +19,14 @@ export const snootyProjectConfig: LocallySpecifiedSnootyProjectConfig[] = [ name: "cloud-docs", currentBranch: "master", tags: ["atlas", "docs"], + productName: "MongoDB Atlas", }, { type: "snooty", name: "cloudgov", currentBranch: "master", tags: ["atlas", "docs", "government"], + productName: "MongoDB Atlas for Government", }, { // MongoDB Manual @@ -32,96 +34,112 @@ export const snootyProjectConfig: LocallySpecifiedSnootyProjectConfig[] = [ name: "docs", currentBranch: "v6.0", tags: ["docs", "manual"], + productName: "MongoDB Server", }, { type: "snooty", name: "atlas-app-services", currentBranch: "master", tags: ["atlas", "docs", "app-services"], + productName: "Atlas App Services", }, { type: "snooty", name: "atlas-cli", currentBranch: "v1.9", tags: ["atlas", "docs", "cli", "atlas-cli"], + productName: "Atlas CLI", }, { type: "snooty", name: "bi-connector", currentBranch: "master", tags: ["bi-connector", "docs"], + productName: "MongoDB Connector for BI", }, { type: "snooty", name: "charts", currentBranch: "master", tags: ["charts", "docs", "atlas"], + productName: "Atlas Charts", }, { type: "snooty", name: "cluster-sync", currentBranch: "master", tags: ["cluster-sync", "docs"], + productName: "Cluster-to-Cluster Sync", }, { type: "snooty", name: "database-tools", currentBranch: "master", tags: ["database-tools", "docs", "cli"], + productName: "MongoDB Database Tools", }, { type: "snooty", name: "compass", currentBranch: "master", tags: ["compass", "docs", "gui"], + productName: "MongoDB Compass", }, { type: "snooty", name: "csharp", currentBranch: "v2.20", tags: ["docs", "driver", "csharp"], + productName: "C# Driver", }, { type: "snooty", name: "datalake", currentBranch: "master", tags: ["datalake", "docs", "atlas"], + productName: "Atlas Data Lake", }, { type: "snooty", name: "drivers", currentBranch: "master", tags: ["docs", "driver"], + productName: "MongoDB Drivers", }, { type: "snooty", name: "golang", currentBranch: "v1.12", tags: ["docs", "driver", "golang"], + productName: "Go Driver", }, { type: "snooty", name: "java", currentBranch: "v4.10", tags: ["docs", "driver", "java", "java-sync"], + productName: "Java Driver", }, { type: "snooty", name: "kubernetes-operator", currentBranch: "master", tags: ["docs", "kubernetes-operator", "kubernetes"], + productName: "MongoDB Kubernetes Operator", }, { type: "snooty", name: "kafka-connector", currentBranch: "v1.10", tags: ["docs", "kafka-connector", "kafka"], + productName: "MongoDB Kafka Connector", }, { type: "snooty", name: "kotlin", currentBranch: "v4.10", tags: ["docs", "driver", "kotlin", "kotlin-coroutines"], + productName: "Kotlin Driver", }, { type: "snooty", @@ -134,60 +152,70 @@ export const snootyProjectConfig: LocallySpecifiedSnootyProjectConfig[] = [ name: "mongocli", currentBranch: "v1.30", tags: ["docs", "cli", "mongocli"], + productName: "MongoDB CLI", }, { type: "snooty", name: "mongodb-shell", currentBranch: "master", tags: ["docs", "cli", "mongodb-shell"], + productName: "MongoDB Shell", }, { type: "snooty", name: "mongodb-vscode", currentBranch: "master", tags: ["docs", "mongodb-vscode", "vscode", "gui"], + productName: "MongoDB for VS Code", }, { type: "snooty", name: "mongoid", currentBranch: "8.0", tags: ["docs", "driver", "mongoid", "ruby"], + productName: "Mongoid ODM", }, { type: "snooty", name: "node", currentBranch: "v5.7", tags: ["docs", "driver", "node", "javascript"], + productName: "Node.js Driver", }, { type: "snooty", name: "php-library", currentBranch: "master", tags: ["docs", "driver", "php", "php-library"], + productName: "PHP Library", }, { type: "snooty", name: "realm", currentBranch: "master", tags: ["docs", "realm", "mobile", "sdk"], + productName: "Realm SDKs", }, { type: "snooty", name: "docs-relational-migrator", currentBranch: "master", tags: ["docs", "relational-migrator"], + productName: "MongoDB Relational Migrator", }, { type: "snooty", name: "ruby-driver", currentBranch: "v2.19", tags: ["docs", "driver", "ruby"], + productName: "Ruby Driver", }, { type: "snooty", name: "spark-connector", currentBranch: "v10.2", tags: ["docs", "spark-connector", "spark", "apache-spark"], + productName: "MongoDB Spark Connector", }, { type: "snooty", @@ -200,6 +228,7 @@ export const snootyProjectConfig: LocallySpecifiedSnootyProjectConfig[] = [ name: "visual-studio-extension", currentBranch: "v1.2", tags: ["docs", "visual-studio-extension", "visual-studio", "gui"], + productName: "MongoDB Visual Studio Extension", }, ]; diff --git a/ingest/src/updateEmbeddedContent.test.ts b/ingest/src/updateEmbeddedContent.test.ts index 384dc06fb..d5d011022 100644 --- a/ingest/src/updateEmbeddedContent.test.ts +++ b/ingest/src/updateEmbeddedContent.test.ts @@ -26,7 +26,9 @@ const examplePage: Page = { body: "this is a test page", format: "md", sourceName: "test", - tags: [], + metadata: { + tags: [], + }, url: "https://example.com/test", }; diff --git a/ingest/src/updatePages.test.ts b/ingest/src/updatePages.test.ts index 0670f40ce..39e86dc8c 100644 --- a/ingest/src/updatePages.test.ts +++ b/ingest/src/updatePages.test.ts @@ -18,7 +18,9 @@ const examplePage: Page = { body: "", format: "md", sourceName: "test", - tags: [], + metadata: { + tags: [], + }, url: "https://example.com/test", }; From 4821628b9ff3645ab09d3a3d7fe595f0f512af77 Mon Sep 17 00:00:00 2001 From: Ben Perlmutter <90647379+mongodben@users.noreply.github.com> Date: Wed, 16 Aug 2023 16:22:29 -0400 Subject: [PATCH 06/12] add test for arbitrary metadata --- ingest/src/chunkPage.test.ts | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/ingest/src/chunkPage.test.ts b/ingest/src/chunkPage.test.ts index f91a02b68..a4a0458dc 100644 --- a/ingest/src/chunkPage.test.ts +++ b/ingest/src/chunkPage.test.ts @@ -233,4 +233,40 @@ tags: This is some text`); }); + it("can add arbitrary page metadata", async () => { + const pageWithMetadata: Page = { + ...page, + body: "FOO", + metadata: { + ...page.metadata, + arbitrary: "metadata", + }, + }; + const chunks = await chunkPage(pageWithMetadata, { + transform: standardChunkFrontMatterUpdater, + }); + expect(chunks).toHaveLength(1); + expect(chunks[0]).toStrictEqual({ + chunkIndex: 0, + sourceName: "test-source", + metadata: { + pageTitle: "Test Page", + hasCodeBlock: false, + tags: ["a", "b"], + arbitrary: "metadata", + }, + text: `--- +pageTitle: Test Page +hasCodeBlock: false +tags: + - a + - b +arbitrary: metadata +--- + +FOO`, + tokenCount: 36, + url: "test", + }); + }); }); From 8cae2a16f9d9f7199da8d97249ea4d9606246bd6 Mon Sep 17 00:00:00 2001 From: Ben Perlmutter <90647379+mongodben@users.noreply.github.com> Date: Wed, 16 Aug 2023 16:22:51 -0400 Subject: [PATCH 07/12] update EmbeddedContent description --- chat-core/src/EmbeddedContent.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/chat-core/src/EmbeddedContent.ts b/chat-core/src/EmbeddedContent.ts index 3c11139c1..c1e36c5cb 100644 --- a/chat-core/src/EmbeddedContent.ts +++ b/chat-core/src/EmbeddedContent.ts @@ -12,7 +12,7 @@ export interface EmbeddedContent { sourceName: string; /** - The original text. + The text associated with the vector embedding. */ text: string; From 0e5e8d37ce6275213da0a341b81f1baf69710926 Mon Sep 17 00:00:00 2001 From: Ben Perlmutter <90647379+mongodben@users.noreply.github.com> Date: Wed, 16 Aug 2023 16:23:21 -0400 Subject: [PATCH 08/12] update MongoDbUserQueryPreprocessorResponse for greater semantic meaning --- .../src/processors/MongoDbUserQueryPreprocessorResponse.d.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/chat-server/src/processors/MongoDbUserQueryPreprocessorResponse.d.ts b/chat-server/src/processors/MongoDbUserQueryPreprocessorResponse.d.ts index 52f53ed6d..33ec67dcc 100644 --- a/chat-server/src/processors/MongoDbUserQueryPreprocessorResponse.d.ts +++ b/chat-server/src/processors/MongoDbUserQueryPreprocessorResponse.d.ts @@ -10,8 +10,8 @@ export interface MongoDbUserQueryPreprocessorResponse { programmingLanguages: string[]; /** One or more MongoDB products present in the content. Which MongoDB products is the user interested in? Ordered by relevancy. - Include driver if the user is asking about a programming language with a MongoDB driver. - @example ["atlas", "charts", "server", "compass", "bi-connector", "realm", "driver", ...other MongoDB products] + Include "Driver" if the user is asking about a programming language with a MongoDB driver. + @example ["MongoDB Atlas", "Atlas Charts", "Atlas Search", "Aggregation Framework", "MongoDB Server", "Compass", "MongoDB Connector for BI", "Realm SDK", "Driver", "Atlas App Services", ...other MongoDB products] */ mongoDbProducts: string[]; /** Using your knowledge of MongoDB and the conversational context, From c28801c5cadff71f2422149a9ce48d09c75bc7fe Mon Sep 17 00:00:00 2001 From: Ben Perlmutter <90647379+mongodben@users.noreply.github.com> Date: Wed, 16 Aug 2023 16:30:47 -0400 Subject: [PATCH 09/12] update pre-processor tests --- .../src/processors/makePreprocessMongoDbUserQuery.test.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/chat-server/src/processors/makePreprocessMongoDbUserQuery.test.ts b/chat-server/src/processors/makePreprocessMongoDbUserQuery.test.ts index fe875c337..1a09c3530 100644 --- a/chat-server/src/processors/makePreprocessMongoDbUserQuery.test.ts +++ b/chat-server/src/processors/makePreprocessMongoDbUserQuery.test.ts @@ -60,7 +60,7 @@ describe("makePreprocessMongoDbUserQuery()", () => { } = response; expect(outputQuery).toContain("MongoDB"); expect(outputQuery).toContain("code example"); - expect(outputQuery).toContain("aggregation"); + expect(outputQuery.toLowerCase()).toContain("aggregation"); expect(outputQuery).toContain("?"); expect(programmingLanguages).toStrictEqual(["shell"]); expect(mongoDbProducts[0]).toBeDefined(); @@ -83,7 +83,7 @@ describe("makePreprocessMongoDbUserQuery()", () => { messages, }); const { mongoDbProducts } = response; - expect(mongoDbProducts[0]).toBe("charts"); + expect(mongoDbProducts[0]).toBe("Atlas Charts"); }); test("should be aware of MongoDB", async () => { const query = "ruby lookup example"; From 062ea41dc9c9ec758546a9c4e6abdbab4237515f Mon Sep 17 00:00:00 2001 From: Ben Perlmutter <90647379+mongodben@users.noreply.github.com> Date: Fri, 18 Aug 2023 13:07:05 -0400 Subject: [PATCH 10/12] implement review feedback --- chat-core/src/EmbeddedContent.ts | 2 +- chat-core/src/Page.ts | 3 ++- ingest/src/chunkPage.ts | 11 ++--------- 3 files changed, 5 insertions(+), 11 deletions(-) diff --git a/chat-core/src/EmbeddedContent.ts b/chat-core/src/EmbeddedContent.ts index c1e36c5cb..a8557c012 100644 --- a/chat-core/src/EmbeddedContent.ts +++ b/chat-core/src/EmbeddedContent.ts @@ -12,7 +12,7 @@ export interface EmbeddedContent { sourceName: string; /** - The text associated with the vector embedding. + The text represented by the vector embedding. */ text: string; diff --git a/chat-core/src/Page.ts b/chat-core/src/Page.ts index 333063e19..4c6a0fa66 100644 --- a/chat-core/src/Page.ts +++ b/chat-core/src/Page.ts @@ -29,7 +29,8 @@ export type Page = { Arbitrary tags. */ tags?: string[]; - } & Record; + [k: string]: unknown; + }; }; export type PageAction = "created" | "updated" | "deleted"; diff --git a/ingest/src/chunkPage.ts b/ingest/src/chunkPage.ts index 40ef79d89..37c068ccb 100644 --- a/ingest/src/chunkPage.ts +++ b/ingest/src/chunkPage.ts @@ -144,11 +144,10 @@ export const makeChunkFrontMatterUpdater = < */ export const standardMetadataGetter: ChunkMetadataGetter<{ pageTitle?: string; - productName?: string; hasCodeBlock: boolean; codeBlockLanguages?: string[]; tags?: string[]; - [string: string]: unknown; + [k: string]: unknown; }> = async ({ page, text }) => { // Detect code blocks const mdCodeBlockToken = /```([A-z0-1-_]*)/; @@ -179,13 +178,7 @@ export const standardMetadataGetter: ChunkMetadataGetter<{ metadata["codeBlockLanguages"] = specifiedLanguages; } - if (page.metadata) { - for (const key in page.metadata) { - metadata[key] = page.metadata[key]; - } - } - - return metadata; + return { ...(page.metadata ?? {}), ...metadata }; }; export const standardChunkFrontMatterUpdater = makeChunkFrontMatterUpdater( From c4c91ec3c3d2f0555baae95c82cce04b5e7702ee Mon Sep 17 00:00:00 2001 From: Ben Perlmutter <90647379+mongodben@users.noreply.github.com> Date: Mon, 21 Aug 2023 10:46:02 -0400 Subject: [PATCH 11/12] fix broken tests --- ingest/src/chunkPage.test.ts | 39 +++++++++++++++++++----------------- 1 file changed, 21 insertions(+), 18 deletions(-) diff --git a/ingest/src/chunkPage.test.ts b/ingest/src/chunkPage.test.ts index a4a0458dc..e3732dd53 100644 --- a/ingest/src/chunkPage.test.ts +++ b/ingest/src/chunkPage.test.ts @@ -90,7 +90,9 @@ Vestibulum tempus aliquet convallis. Aenean ac dolor sed tortor malesuada bibend } ); expect(chunks).toHaveLength(1); - expect(chunks).toStrictEqual([ + console.log("Results", chunks); + + const expected = [ { chunkIndex: 0, sourceName: "test-source", @@ -100,20 +102,21 @@ Vestibulum tempus aliquet convallis. Aenean ac dolor sed tortor malesuada bibend tags: ["a", "b"], }, text: `--- -pageTitle: Test Page -hasCodeBlock: false tags: - a - b +pageTitle: Test Page +hasCodeBlock: false --- This is some text`, tokenCount: 32, // Calculated after transformation url: "test", }, - ]); + ]; + expect(chunks).toStrictEqual(expected); - chunks = await chunkPage( + const codeBlockChunks = await chunkPage( { ...page, body: "This text has a code example:\n\n```js\nlet foo = 1 + 1;\n```\n\nNeat, huh?", @@ -122,8 +125,8 @@ This is some text`, transform: standardChunkFrontMatterUpdater, } ); - expect(chunks).toHaveLength(1); - expect(chunks).toStrictEqual([ + expect(codeBlockChunks).toHaveLength(1); + expect(codeBlockChunks).toStrictEqual([ { chunkIndex: 0, sourceName: "test-source", @@ -134,13 +137,13 @@ This is some text`, tags: ["a", "b"], }, text: `--- +tags: + - a + - b pageTitle: Test Page hasCodeBlock: true codeBlockLanguages: - js -tags: - - a - - b --- This text has a code example: @@ -155,7 +158,7 @@ Neat, huh?`, }, ]); - chunks = await chunkPage( + const unspecifiedCodeBlockChunks = await chunkPage( { ...page, body: "This text has an unspecified code example:\n\n```\nlet foo = 1 + 1;\n```\n\nNeat, huh?", @@ -164,8 +167,8 @@ Neat, huh?`, transform: standardChunkFrontMatterUpdater, } ); - expect(chunks).toHaveLength(1); - expect(chunks).toStrictEqual([ + expect(unspecifiedCodeBlockChunks).toHaveLength(1); + expect(unspecifiedCodeBlockChunks).toStrictEqual([ { chunkIndex: 0, sourceName: "test-source", @@ -175,11 +178,11 @@ Neat, huh?`, tags: ["a", "b"], }, text: `--- -pageTitle: Test Page -hasCodeBlock: true tags: - a - b +pageTitle: Test Page +hasCodeBlock: true --- This text has an unspecified code example: @@ -225,10 +228,10 @@ someArray: - 2 - foo hasCodeBlock: false -pageTitle: Test Page tags: - a - b +pageTitle: Test Page --- This is some text`); @@ -256,12 +259,12 @@ This is some text`); arbitrary: "metadata", }, text: `--- -pageTitle: Test Page -hasCodeBlock: false tags: - a - b arbitrary: metadata +pageTitle: Test Page +hasCodeBlock: false --- FOO`, From 20b6a8d62a127a28767437c12d25dd30d743ec89 Mon Sep 17 00:00:00 2001 From: Ben Perlmutter <90647379+mongodben@users.noreply.github.com> Date: Mon, 21 Aug 2023 10:54:04 -0400 Subject: [PATCH 12/12] Fix lint err --- ingest/src/chunkPage.test.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ingest/src/chunkPage.test.ts b/ingest/src/chunkPage.test.ts index e3732dd53..af61c250d 100644 --- a/ingest/src/chunkPage.test.ts +++ b/ingest/src/chunkPage.test.ts @@ -83,7 +83,7 @@ Vestibulum tempus aliquet convallis. Aenean ac dolor sed tortor malesuada bibend }); it("can add frontmatter", async () => { - let chunks = await chunkPage( + const chunks = await chunkPage( { ...page, body: "This is some text\n" }, { transform: standardChunkFrontMatterUpdater,