From 54fd0f02d07f42648efefece79370188a87ac0ad Mon Sep 17 00:00:00 2001 From: Brace Sproul Date: Thu, 1 Aug 2024 10:41:38 -0700 Subject: [PATCH] scripts[minor]: Add CLI for document loader integration docs (#6303) * scripts[minor]: Add CLI for document loader integration docs * drop mdx file * chore: lint files * cr * cr --- .../docs/integrations/chat/fireworks.ipynb | 2 +- .../docs/integrations/chat/mistral.ipynb | 2 +- .../docs/integrations/chat/togetherai.ipynb | 2 +- .../web_loaders/web_cheerio.ipynb | 304 ++++++++++++++++++ .../web_loaders/web_cheerio.mdx | 46 --- libs/langchain-scripts/package.json | 2 + libs/langchain-scripts/src/cli/docs/chat.ts | 22 +- .../src/cli/docs/document_loaders.ts | 175 ++++++++++ libs/langchain-scripts/src/cli/docs/index.ts | 14 +- .../src/cli/docs/templates/chat.ipynb | 4 +- .../cli/docs/templates/document_loaders.ipynb | 189 +++++++++++ yarn.lock | 2 + 12 files changed, 697 insertions(+), 67 deletions(-) create mode 100644 docs/core_docs/docs/integrations/document_loaders/web_loaders/web_cheerio.ipynb delete mode 100644 docs/core_docs/docs/integrations/document_loaders/web_loaders/web_cheerio.mdx create mode 100644 libs/langchain-scripts/src/cli/docs/document_loaders.ts create mode 100644 libs/langchain-scripts/src/cli/docs/templates/document_loaders.ipynb diff --git a/docs/core_docs/docs/integrations/chat/fireworks.ipynb b/docs/core_docs/docs/integrations/chat/fireworks.ipynb index 9c2219315fca..aaaf3e03e75a 100644 --- a/docs/core_docs/docs/integrations/chat/fireworks.ipynb +++ b/docs/core_docs/docs/integrations/chat/fireworks.ipynb @@ -26,7 +26,7 @@ "## Overview\n", "### Integration details\n", "\n", - "| Class | Package | Local | Serializable | [PY support](https:/python.langchain.com/v0.2/docs/integrations/chat/fireworks) | Package downloads | Package latest |\n", + "| Class | Package | Local | Serializable | [PY support](https://python.langchain.com/docs/integrations/chat/fireworks) | Package downloads | Package latest |\n", 
"| :--- | :--- | :---: | :---: | :---: | :---: | :---: |\n", "| [ChatFireworks](https://api.js.langchain.com/classes/langchain_community_chat_models_fireworks.ChatFireworks.html) | [@langchain/community](https://api.js.langchain.com/modules/langchain_community_chat_models_fireworks.html) | ❌ | ✅ | ✅ | ![NPM - Downloads](https://img.shields.io/npm/dm/@langchain/community?style=flat-square&label=%20&) | ![NPM - Version](https://img.shields.io/npm/v/@langchain/community?style=flat-square&label=%20&) |\n", "\n", diff --git a/docs/core_docs/docs/integrations/chat/mistral.ipynb b/docs/core_docs/docs/integrations/chat/mistral.ipynb index 047ec3c4e1ad..f3f61fec8bff 100644 --- a/docs/core_docs/docs/integrations/chat/mistral.ipynb +++ b/docs/core_docs/docs/integrations/chat/mistral.ipynb @@ -26,7 +26,7 @@ "## Overview\n", "### Integration details\n", "\n", - "| Class | Package | Local | Serializable | [PY support](https:/python.langchain.com/v0.2/docs/integrations/chat/mistralai) | Package downloads | Package latest |\n", + "| Class | Package | Local | Serializable | [PY support](https://python.langchain.com/docs/integrations/chat/mistralai) | Package downloads | Package latest |\n", "| :--- | :--- | :---: | :---: | :---: | :---: | :---: |\n", "| [ChatMistralAI](https://api.js.langchain.com/classes/langchain_mistralai.ChatMistralAI.html) | [@langchain/mistralai](https://api.js.langchain.com/modules/langchain_mistralai.html) | ❌ | ❌ | ✅ | ![NPM - Downloads](https://img.shields.io/npm/dm/@langchain/mistralai?style=flat-square&label=%20&) | ![NPM - Version](https://img.shields.io/npm/v/@langchain/mistralai?style=flat-square&label=%20&) |\n", "\n", diff --git a/docs/core_docs/docs/integrations/chat/togetherai.ipynb b/docs/core_docs/docs/integrations/chat/togetherai.ipynb index 1b28aa9f2582..8ed09f8d41c6 100644 --- a/docs/core_docs/docs/integrations/chat/togetherai.ipynb +++ b/docs/core_docs/docs/integrations/chat/togetherai.ipynb @@ -26,7 +26,7 @@ "## Overview\n", "### 
Integration details\n", "\n", - "| Class | Package | Local | Serializable | [PY support](https://python.langchain.com/v0.2/docs/integrations/chat/togetherai) | Package downloads | Package latest |\n", + "| Class | Package | Local | Serializable | [PY support](https://python.langchain.com/docs/integrations/chat/togetherai) | Package downloads | Package latest |\n", "| :--- | :--- | :---: | :---: | :---: | :---: | :---: |\n", "| [ChatTogetherAI](https://api.js.langchain.com/classes/langchain_community_chat_models_togetherai.ChatTogetherAI.html) | [@langchain/community](https://api.js.langchain.com/modules/langchain_community_chat_models_togetherai.html) | ❌ | ✅ | ✅ | ![NPM - Downloads](https://img.shields.io/npm/dm/@langchain/community?style=flat-square&label=%20&) | ![NPM - Version](https://img.shields.io/npm/v/@langchain/community?style=flat-square&label=%20&) |\n", "\n", diff --git a/docs/core_docs/docs/integrations/document_loaders/web_loaders/web_cheerio.ipynb b/docs/core_docs/docs/integrations/document_loaders/web_loaders/web_cheerio.ipynb new file mode 100644 index 000000000000..b12e3a8e5a00 --- /dev/null +++ b/docs/core_docs/docs/integrations/document_loaders/web_loaders/web_cheerio.ipynb @@ -0,0 +1,304 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "sidebar_label: CheerioWebBaseLoader\n", + "---" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Cheerio\n", + "\n", + "This notebook provides a quick overview for getting started with [CheerioWebBaseLoader](/docs/integrations/document_loaders/). For detailed documentation of all CheerioWebBaseLoader features and configurations head to the [API reference](https://api.js.langchain.com/classes/langchain_community_document_loaders_web_cheerio.CheerioWebBaseLoader.html).\n", + "\n", + "## Overview\n", + "### Integration details\n", + "\n", + "This example goes over how to load data from webpages using Cheerio. 
One document will be created for each webpage.\n", + "\n", + "Cheerio is a fast and lightweight library that allows you to parse and traverse HTML documents using a jQuery-like syntax. You can use Cheerio to extract data from web pages, without having to render them in a browser.\n", + "\n", + "However, Cheerio does not simulate a web browser, so it cannot execute JavaScript code on the page. This means that it cannot extract data from dynamic web pages that require JavaScript to render. To do that, you can use the [`PlaywrightWebBaseLoader`](/docs/integrations/document_loaders/web_loaders/web_playwright) or [`PuppeteerWebBaseLoader`](/docs/integrations/document_loaders/web_loaders/web_puppeteer) instead.\n", + "\n", + "| Class | Package | Local | Serializable | PY support|\n", + "| :--- | :--- | :---: | :---: | :---: |\n", + "| [CheerioWebBaseLoader](https://api.js.langchain.com/classes/langchain_community_document_loaders_web_cheerio.CheerioWebBaseLoader.html) | @langchain/community | ✅ | ✅ | ❌ | \n", + "### Loader features\n", + "| Source | Web Support | Node Support\n", + "| :---: | :---: | :---: | \n", + "| CheerioWebBaseLoader | ✅ | ✅ | \n", + "\n", + "## Setup\n", + "\n", + "- TODO: Update with relevant info.\n", + "\n", + "To access `CheerioWebBaseLoader` document loader you'll need to install the `@langchain/community` integration package, along with the `cheerio` peer dependency.\n", + "\n", + "### Credentials\n", + "\n", + "If you want to get automated tracing of your model calls you can also set your [LangSmith](https://docs.smith.langchain.com/) API key by uncommenting below:\n", + "\n", + "```bash\n", + "# export LANGCHAIN_TRACING_V2=\"true\"\n", + "# export LANGCHAIN_API_KEY=\"your-api-key\"\n", + "```\n", + "\n", + "### Installation\n", + "\n", + "The LangChain CheerioWebBaseLoader integration lives in the `@langchain/community` package:\n", + "\n", + "```{=mdx}\n", + "import IntegrationInstallTooltip from 
\"@mdx_components/integration_install_tooltip.mdx\";\n", + "import Npm2Yarn from \"@theme/Npm2Yarn\";\n", + "\n", + "\n", + "\n", + "\n", + " @langchain/community cheerio\n", + "\n", + "\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Instantiation\n", + "\n", + "Now we can instantiate our model object and load documents:\n", + "\n", + "- TODO: Update model instantiation with relevant params." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import { CheerioWebBaseLoader } from \"@langchain/community/document_loaders/web/cheerio\"\n", + "\n", + "const loader = new CheerioWebBaseLoader(\"https://news.ycombinator.com/item?id=34817881\", {\n", + " // optional params: ...\n", + "})" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Load" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Document {\n", + " pageContent: '\\n' +\n", + " ' \\n' +\n", + " ' Hacker News\\n' +\n", + " ' new | past | comments | ask | show | jobs | submit \\n' +\n", + " ' login\\n' +\n", + " ' \\n' +\n", + " ' \\n' +\n", + " '\\n' +\n", + " ' \\n' +\n", + " ' What Lights the Universe’s Standard Candles? (quantamagazine.org)\\n' +\n", + " ' 75 points by Amorymeltzer on Feb 17, 2023 | hide | past | favorite | 6 comments \\n' +\n", + " ' \\n' +\n", + " ' \\n' +\n", + " ' \\n' +\n", + " ' \\n' +\n", + " ' delta_p_delta_x on Feb 17, 2023 \\n' +\n", + " ' | next [–] \\n' +\n", + " ' \\n' +\n", + " \" Astrophysical and cosmological simulations are often insightful. 
They're also very cross-disciplinary; besides the obvious astrophysics, there's networking and sysadmin, parallel computing and algorithm theory (so that the simulation programs are actually fast but still accurate), systems design, and even a bit of graphic design for the visualisations.Some of my favourite simulation projects:- IllustrisTNG: https://www.tng-project.org/- SWIFT: https://swift.dur.ac.uk/- CO5BOLD: https://www.astro.uu.se/~bf/co5bold_main.html (which produced these animations of a red-giant star: https://www.astro.uu.se/~bf/movie/AGBmovie.html)- AbacusSummit: https://abacussummit.readthedocs.io/en/latest/And I can add the simulations in the article, too.\\n\" +\n", + " ' \\n' +\n", + " ' \\n' +\n", + " ' \\n' +\n", + " ' \\n' +\n", + " ' \\n' +\n", + " ' \\n' +\n", + " ' froeb on Feb 18, 2023 \\n' +\n", + " ' | parent | next [–] \\n' +\n", + " ' \\n' +\n", + " \" Supernova simulations are especially interesting too. I have heard them described as the only time in physics when all 4 of the fundamental forces are important. The explosion can be quite finicky too. If I remember right, you can't get supernova to explode properly in 1D simulations, only in higher dimensions. This was a mystery until the realization that turbulence is necessary for supernova to trigger--there is no turbulent flow in 1D.\\n\" +\n", + " ' \\n' +\n", + " ' \\n' +\n", + " ' \\n' +\n", + " ' \\n' +\n", + " ' \\n' +\n", + " ' \\n' +\n", + " ' andrewflnr on Feb 17, 2023 \\n' +\n", + " ' | prev | next [–] \\n' +\n", + " ' \\n' +\n", + " \" Whoa. 
I didn't know the accretion theory of Ia supernovae was dead, much less that it had been since 2011.\\n\" +\n", + " ' \\n' +\n", + " ' \\n' +\n", + " ' \\n' +\n", + " ' \\n' +\n", + " ' \\n' +\n", + " ' \\n' +\n", + " ' andreareina on Feb 17, 2023 \\n' +\n", + " ' | prev | next [–] \\n' +\n", + " ' \\n' +\n", + " ' This seems to be the paper https://academic.oup.com/mnras/article/517/4/5260/6779709\\n' +\n", + " ' \\n' +\n", + " ' \\n' +\n", + " ' \\n' +\n", + " ' \\n' +\n", + " ' \\n' +\n", + " ' \\n' +\n", + " ' andreareina on Feb 17, 2023 \\n' +\n", + " ' | prev [–] \\n' +\n", + " ' \\n' +\n", + " \" Wouldn't double detonation show up as variance in the brightness?\\n\" +\n", + " ' \\n' +\n", + " ' \\n' +\n", + " ' \\n' +\n", + " ' \\n' +\n", + " ' \\n' +\n", + " ' \\n' +\n", + " ' yencabulator on Feb 18, 2023 \\n' +\n", + " ' | parent [–] \\n' +\n", + " ' \\n' +\n", + " ' Or widening of the peak. If one type Ia supernova goes 1,2,3,2,1, the sum of two could go 1+0=1\\n' +\n", + " ' 2+1=3\\n' +\n", + " ' 3+2=5\\n' +\n", + " ' 2+3=5\\n' +\n", + " ' 1+2=3\\n' +\n", + " ' 0+1=1\\n' +\n", + " ' \\n' +\n", + " ' \\n' +\n", + " ' \\n' +\n", + " ' \\n' +\n", + " ' \\n' +\n", + " ' \\n' +\n", + " '\\n' +\n", + " '\\n' +\n", + " 'Guidelines | FAQ | Lists | API | Security | Legal | Apply to YC | Contact\\n' +\n", + " 'Search: \\n' +\n", + " ' \\n' +\n", + " ' \\n',\n", + " metadata: { source: 'https://news.ycombinator.com/item?id=34817881' },\n", + " id: undefined\n", + "}\n" + ] + } + ], + "source": [ + "const docs = await loader.load()\n", + "docs[0]" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{ source: 'https://news.ycombinator.com/item?id=34817881' }\n" + ] + } + ], + "source": [ + "console.log(docs[0].metadata)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Additional configurations\n", + "\n", + "`CheerioWebBaseLoader` 
supports additional configuration when instantiating the loader. Here is an example of how to use it with the `selector` field passed, making it only load content from elements matching the provided CSS selector:" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Some of my favourite simulation projects:- IllustrisTNG: https://www.tng-project.org/- SWIFT: https://swift.dur.ac.uk/- CO5BOLD: https://www.astro.uu.se/~bf/co5bold_main.html (which produced these animations of a red-giant star: https://www.astro.uu.se/~bf/movie/AGBmovie.html)- AbacusSummit: https://abacussummit.readthedocs.io/en/latest/And I can add the simulations in the article, too.\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n" + ] + } + ], + "source": [ + "import { CheerioWebBaseLoader } from \"@langchain/community/document_loaders/web/cheerio\"\n", + "\n", + "const loaderWithSelector = new CheerioWebBaseLoader(\"https://news.ycombinator.com/item?id=34817881\", {\n", + " selector: \"p\",\n", + "});\n", + "\n", + "const docsWithSelector = await loaderWithSelector.load();\n", + "docsWithSelector[0].pageContent;" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## API reference\n", + "\n", + "For detailed documentation of all CheerioWebBaseLoader features and configurations head to the API reference: https://api.js.langchain.com/classes/langchain_community_document_loaders_web_cheerio.CheerioWebBaseLoader.html" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "TypeScript", + "language": "typescript", + "name": "tslab" + }, + "language_info": { + "codemirror_mode": { + "mode": "typescript", + "name": "javascript", + "typescript": true + }, + "file_extension": ".ts", + "mimetype": "text/typescript", + "name": "typescript", + "version": 
"3.7.2" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/docs/core_docs/docs/integrations/document_loaders/web_loaders/web_cheerio.mdx b/docs/core_docs/docs/integrations/document_loaders/web_loaders/web_cheerio.mdx deleted file mode 100644 index a33912f424ae..000000000000 --- a/docs/core_docs/docs/integrations/document_loaders/web_loaders/web_cheerio.mdx +++ /dev/null @@ -1,46 +0,0 @@ ---- -sidebar_position: 1 -sidebar_label: Cheerio -hide_table_of_contents: true ---- - -# Webpages, with Cheerio - -This example goes over how to load data from webpages using Cheerio. One document will be created for each webpage. - -Cheerio is a fast and lightweight library that allows you to parse and traverse HTML documents using a jQuery-like syntax. You can use Cheerio to extract data from web pages, without having to render them in a browser. - -However, Cheerio does not simulate a web browser, so it cannot execute JavaScript code on the page. This means that it cannot extract data from dynamic web pages that require JavaScript to render. To do that, you can use the [`PlaywrightWebBaseLoader`](/docs/integrations/document_loaders/web_loaders/web_playwright) or [`PuppeteerWebBaseLoader`](/docs/integrations/document_loaders/web_loaders/web_puppeteer) instead. 
- -## Setup - -```bash npm2yarn -npm install cheerio -``` - -## Usage - -```typescript -import { CheerioWebBaseLoader } from "@langchain/community/document_loaders/web/cheerio"; - -const loader = new CheerioWebBaseLoader( - "https://news.ycombinator.com/item?id=34817881" -); - -const docs = await loader.load(); -``` - -## Usage, with a custom selector - -```typescript -import { CheerioWebBaseLoader } from "@langchain/community/document_loaders/web/cheerio"; - -const loader = new CheerioWebBaseLoader( - "https://news.ycombinator.com/item?id=34817881", - { - selector: "p.athing", - } -); - -const docs = await loader.load(); -``` diff --git a/libs/langchain-scripts/package.json b/libs/langchain-scripts/package.json index 82048e1f88a6..1b9e36114282 100644 --- a/libs/langchain-scripts/package.json +++ b/libs/langchain-scripts/package.json @@ -44,6 +44,7 @@ "axios": "^1.6.7", "commander": "^11.1.0", "glob": "^10.3.10", + "lodash": "^4.17.21", "readline": "^1.3.0", "rimraf": "^5.0.1", "rollup": "^4.5.2", @@ -55,6 +56,7 @@ "@swc/core": "^1.3.90", "@swc/jest": "^0.2.29", "@tsconfig/recommended": "^1.0.3", + "@types/lodash": "^4", "@typescript-eslint/eslint-plugin": "^6.12.0", "@typescript-eslint/parser": "^6.12.0", "dotenv": "^16.3.1", diff --git a/libs/langchain-scripts/src/cli/docs/chat.ts b/libs/langchain-scripts/src/cli/docs/chat.ts index a272a553ef39..196caabddc19 100644 --- a/libs/langchain-scripts/src/cli/docs/chat.ts +++ b/libs/langchain-scripts/src/cli/docs/chat.ts @@ -69,57 +69,57 @@ type ExtraFields = { async function promptExtraFields(): Promise { const hasToolCalling = await getUserInput( - "Does the tool support tool calling? (y/n) ", + "Does this integration support tool calling? (y/n) ", undefined, true ); const hasJsonMode = await getUserInput( - "Does the tool support JSON mode? (y/n) ", + "Does this integration support JSON mode? (y/n) ", undefined, true ); const hasImageInput = await getUserInput( - "Does the tool support image input? 
(y/n) ", + "Does this integration support image input? (y/n) ", undefined, true ); const hasAudioInput = await getUserInput( - "Does the tool support audio input? (y/n) ", + "Does this integration support audio input? (y/n) ", undefined, true ); const hasVideoInput = await getUserInput( - "Does the tool support video input? (y/n) ", + "Does this integration support video input? (y/n) ", undefined, true ); const hasTokenLevelStreaming = await getUserInput( - "Does the tool support token level streaming? (y/n) ", + "Does this integration support token level streaming? (y/n) ", undefined, true ); const hasTokenUsage = await getUserInput( - "Does the tool support token usage? (y/n) ", + "Does this integration support token usage? (y/n) ", undefined, true ); const hasLogprobs = await getUserInput( - "Does the tool support logprobs? (y/n) ", + "Does this integration support logprobs? (y/n) ", undefined, true ); const hasLocal = await getUserInput( - "Does the tool support local usage? (y/n) ", + "Does this integration support local usage? (y/n) ", undefined, true ); const hasSerializable = await getUserInput( - "Does the tool support serializable output? (y/n) ", + "Does this integration support serializable output? (y/n) ", undefined, true ); const hasPySupport = await getUserInput( - "Does the tool support Python support? (y/n) ", + "Does this integration have Python support? 
(y/n) ", undefined, true ); diff --git a/libs/langchain-scripts/src/cli/docs/document_loaders.ts b/libs/langchain-scripts/src/cli/docs/document_loaders.ts new file mode 100644 index 000000000000..359b94034e44 --- /dev/null +++ b/libs/langchain-scripts/src/cli/docs/document_loaders.ts @@ -0,0 +1,175 @@ +import * as path from "node:path"; +import * as fs from "node:fs"; +import _ from "lodash"; +import { + boldText, + getUserInput, + greenText, + redBackground, +} from "../utils/get-input.js"; + +const NODE_OR_WEB_PLACEHOLDER = "__fs_or_web__"; +const PACKAGE_NAME_PLACEHOLDER = "__package_name__"; +const MODULE_NAME_PLACEHOLDER = "__ModuleName__"; +const PACKAGE_NAME_SHORT_SNAKE_CASE_PLACEHOLDER = + "__package_name_short_snake_case__"; +const PACKAGE_NAME_SNAKE_CASE_PLACEHOLDER = "__package_name_snake_case__"; +const PACKAGE_IMPORT_PATH_PLACEHOLDER = "__import_path__"; + +// This should not be prefixed with `Chat` as it's used for API keys. +const MODULE_NAME_ALL_CAPS_PLACEHOLDER = "__MODULE_NAME_ALL_CAPS__"; + +const SERIALIZABLE_PLACEHOLDER = "__serializable__"; +const LOCAL_PLACEHOLDER = "__local__"; +const PY_SUPPORT_PLACEHOLDER = "__py_support__"; + +const WEB_SUPPORT_PLACEHOLDER = "__web_support__"; +const NODE_SUPPORT_PLACEHOLDER = "__fs_support__"; + +const API_REF_BASE_MODULE_URL = `https://api.js.langchain.com/classes/langchain_community_document_loaders_${NODE_OR_WEB_PLACEHOLDER}_${PACKAGE_NAME_PLACEHOLDER}.${MODULE_NAME_PLACEHOLDER}.html`; + +const TEMPLATE_PATH = path.resolve( + "./src/cli/docs/templates/document_loaders.ipynb" +); +const INTEGRATIONS_DOCS_PATH = path.resolve( + "../../docs/core_docs/docs/integrations/document_loaders" +); + +const fetchAPIRefUrl = async (url: string): Promise => { + try { + const res = await fetch(url); + if (res.status !== 200) { + throw new Error(`API Reference URL ${url} not found.`); + } + return true; + } catch (_) { + return false; + } +}; + +type ExtraFields = { + nodeSupport: boolean; + webSupport: boolean; + 
serializable: boolean; + pySupport: boolean; + local: boolean; +}; + +async function promptExtraFields(): Promise { + const hasNodeSupport = await getUserInput( + "Does this integration support Node environments? (y/n) ", + undefined, + true + ); + const hasWebSupport = await getUserInput( + "Does this integration support web environments? (y/n) ", + undefined, + true + ); + const hasSerializable = await getUserInput( + "Does this integration support serializable output? (y/n) ", + undefined, + true + ); + const hasPySupport = await getUserInput( + "Does this integration have Python support? (y/n) ", + undefined, + true + ); + const hasLocalSupport = await getUserInput( + "Does this integration support running locally? (y/n) ", + undefined, + true + ); + + return { + nodeSupport: hasNodeSupport.toLowerCase() === "y", + webSupport: hasWebSupport.toLowerCase() === "y", + serializable: hasSerializable.toLowerCase() === "y", + pySupport: hasPySupport.toLowerCase() === "y", + local: hasLocalSupport.toLowerCase() === "y", + }; +} + +export async function fillDocLoaderIntegrationDocTemplate(fields: { + packageName: string; + moduleName: string; + webSupport?: boolean; + nodeSupport?: boolean; +}) { + // Ask the user if they'd like to fill in extra fields, if so, prompt them. + let extraFields: ExtraFields | undefined; + const shouldPromptExtraFields = await getUserInput( + "Would you like to fill out optional fields? (y/n) ", + "white_background" + ); + if (shouldPromptExtraFields.toLowerCase() === "y") { + extraFields = await promptExtraFields(); + } + + const formattedApiRefModuleUrl = API_REF_BASE_MODULE_URL.replace( + PACKAGE_NAME_PLACEHOLDER, + fields.packageName + ) + .replace(MODULE_NAME_PLACEHOLDER, fields.moduleName) + .replace(NODE_OR_WEB_PLACEHOLDER, extraFields?.webSupport ? "web" : "fs"); + + const success = await fetchAPIRefUrl(formattedApiRefModuleUrl); + if (!success) { + // Don't error out because this might be used before the package is released. 
+ console.error("Invalid package or module name. API reference not found."); + } + + const packageNameShortSnakeCase = fields.packageName.replaceAll("-", "_"); + const fullPackageNameSnakeCase = `langchain_community_document_loaders_${ + extraFields?.webSupport ? "web" : "fs" + }_${packageNameShortSnakeCase}`; + const fullPackageImportPath = `@langchain/community/document_loaders/${ + extraFields?.webSupport ? "web" : "fs" + }/${fields.packageName}`; + + let moduleNameAllCaps = _.snakeCase(fields.moduleName).toUpperCase(); + if (moduleNameAllCaps.endsWith("DOCUMENT_LOADER")) { + moduleNameAllCaps = moduleNameAllCaps.replace("DOCUMENT_LOADER", ""); + } + + const docTemplate = (await fs.promises.readFile(TEMPLATE_PATH, "utf-8")) + .replaceAll(PACKAGE_NAME_PLACEHOLDER, fields.packageName) + .replaceAll(PACKAGE_NAME_SNAKE_CASE_PLACEHOLDER, fullPackageNameSnakeCase) + .replaceAll( + PACKAGE_NAME_SHORT_SNAKE_CASE_PLACEHOLDER, + packageNameShortSnakeCase + ) + .replaceAll(PACKAGE_IMPORT_PATH_PLACEHOLDER, fullPackageImportPath) + .replaceAll(MODULE_NAME_PLACEHOLDER, fields.moduleName) + .replaceAll(MODULE_NAME_ALL_CAPS_PLACEHOLDER, moduleNameAllCaps) + .replace(WEB_SUPPORT_PLACEHOLDER, extraFields?.webSupport ? "✅" : "❌") + .replace(NODE_SUPPORT_PLACEHOLDER, extraFields?.nodeSupport ? "✅" : "❌") + .replace(LOCAL_PLACEHOLDER, extraFields?.local ? "✅" : "❌") + .replace( + SERIALIZABLE_PLACEHOLDER, + extraFields?.serializable ? "✅" : "beta" + ) + .replace(PY_SUPPORT_PLACEHOLDER, extraFields?.pySupport ? "✅" : "❌"); + + const docPath = path.join( + INTEGRATIONS_DOCS_PATH, + extraFields?.webSupport ? "web_loaders" : "file_loaders", + `${packageNameShortSnakeCase}.ipynb` + ); + await fs.promises.writeFile(docPath, docTemplate); + const prettyDocPath = docPath.split("docs/core_docs/")[1]; + + const updatePythonDocUrlText = ` ${redBackground( + "- Update the Python documentation URL with the proper URL." 
+ )}`; + const successText = `\nSuccessfully created new document loader integration doc at ${prettyDocPath}.`; + + console.log( + `${greenText(successText)}\n +${boldText("Next steps:")} +${extraFields?.pySupport ? updatePythonDocUrlText : ""} + - Run all code cells in the generated doc to record the outputs. + - Add extra sections on integration specific features.\n` + ); +} diff --git a/libs/langchain-scripts/src/cli/docs/index.ts b/libs/langchain-scripts/src/cli/docs/index.ts index d86109618e0d..a7a89745e7a1 100644 --- a/libs/langchain-scripts/src/cli/docs/index.ts +++ b/libs/langchain-scripts/src/cli/docs/index.ts @@ -3,6 +3,7 @@ // --------------------------------------------- import { Command } from "commander"; import { fillChatIntegrationDocTemplate } from "./chat.js"; +import { fillDocLoaderIntegrationDocTemplate } from "./document_loaders.js"; type CLIInput = { package: string; @@ -15,10 +16,7 @@ async function main() { const program = new Command(); program .description("Create a new integration doc.") - .option( - "--package ", - "Package name, eg openai. Should be value of @langchain/" - ) + .option("--package ", "Package name, eg openai.") .option("--module ", "Module name, e.g ChatOpenAI") .option("--type ", "Type of integration, e.g. 
'chat'") .option( @@ -45,9 +43,15 @@ async function main() { isCommunity, }); break; + case "doc_loader": + await fillDocLoaderIntegrationDocTemplate({ + packageName, + moduleName, + }); + break; default: console.error( - `Invalid type: ${type}.\nOnly 'chat' is supported at this time.` + `Invalid type: ${type}.\nOnly 'chat' and 'doc_loader' are supported at this time.` ); process.exit(1); } diff --git a/libs/langchain-scripts/src/cli/docs/templates/chat.ipynb b/libs/langchain-scripts/src/cli/docs/templates/chat.ipynb index 57019e3eb0d6..1f6508d8a861 100644 --- a/libs/langchain-scripts/src/cli/docs/templates/chat.ipynb +++ b/libs/langchain-scripts/src/cli/docs/templates/chat.ipynb @@ -24,9 +24,9 @@ "\n", "- TODO: Make sure Python integration doc link is correct, if applicable.\n", "\n", - "| Class | Package | Local | Serializable | [PY support](https:/python.langchain.com/v0.2/docs/integrations/chat/__package_name_short_snake_case__) | Package downloads | Package latest |\n", + "| Class | Package | Local | Serializable | [PY support](https://python.langchain.com/docs/integrations/chat/__package_name_short_snake_case__) | Package downloads | Package latest |\n", "| :--- | :--- | :---: | :---: | :---: | :---: | :---: |\n", - "| [__ModuleName__](https://api.js.langchain.com/classes/__package_name_snake_case__.__ModuleName__.html) | [__package_name_pretty__](https://api.js.langchain.com/modules/__package_name_snake_case__.html) | __local__ | __serializable__ | __py_support__ | ![NPM - Downloads](https://img.shields.io/npm/dm/__package_name_pretty__?style=flat-square&label=%20) | ![NPM - Version](https://img.shields.io/npm/v/__package_name_pretty__?style=flat-square&label=%20) |\n", + "| [__ModuleName__](https://api.js.langchain.com/classes/__package_name_snake_case__.__ModuleName__.html) | [__package_name_pretty__](https://api.js.langchain.com/modules/__package_name_snake_case__.html) | __local__ | __serializable__ | __py_support__ | ![NPM - 
Downloads](https://img.shields.io/npm/dm/__package_name_pretty__?style=flat-square&label=%20&) | ![NPM - Version](https://img.shields.io/npm/v/__package_name_pretty__?style=flat-square&label=%20&) |\n", "\n", "### Model features\n", "| [Tool calling](/docs/how_to/tool_calling) | [Structured output](/docs/how_to/structured_output/) | JSON mode | [Image input](/docs/how_to/multimodal_inputs/) | Audio input | Video input | [Token-level streaming](/docs/how_to/chat_streaming/) | [Token usage](/docs/how_to/chat_token_usage_tracking/) | [Logprobs](/docs/how_to/logprobs/) |\n", diff --git a/libs/langchain-scripts/src/cli/docs/templates/document_loaders.ipynb b/libs/langchain-scripts/src/cli/docs/templates/document_loaders.ipynb new file mode 100644 index 000000000000..bc49c5c0801c --- /dev/null +++ b/libs/langchain-scripts/src/cli/docs/templates/document_loaders.ipynb @@ -0,0 +1,189 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "sidebar_label: __ModuleName__\n", + "---" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# __ModuleName__\n", + "\n", + "- TODO: Make sure API reference link is correct.\n", + "\n", + "This notebook provides a quick overview for getting started with [__ModuleName__](/docs/integrations/document_loaders/). 
For detailed documentation of all __ModuleName__ features and configurations head to the [API reference](https://api.js.langchain.com/classes/__package_name_snake_case__.__ModuleName__.html).\n", + "\n", + "- TODO: Add any other relevant links, like information about underlying API, etc.\n", + "\n", + "## Overview\n", + "### Integration details\n", + "\n", + "- TODO: Fill in table features.\n", + "- TODO: Remove PY support link if not relevant, otherwise ensure link is correct.\n", + "- TODO: Make sure API reference links are correct.\n", + "\n", + "| Class | Package | Local | Serializable | [PY support](https://python.langchain.com/docs/integrations/document_loaders/__package_name_short_snake_case__)|\n", + "| :--- | :--- | :---: | :---: | :---: |\n", + "| [__ModuleName__](https://api.js.langchain.com/classes/__package_name_snake_case__.__ModuleName__.html) | @langchain/community | __local__ | __serializable__ | __py_support__ | \n", + "### Loader features\n", + "| Source | Web Support | Node Support\n", + "| :---: | :---: | :---: | \n", + "| __ModuleName__ | __web_support__ | __fs_support__ | \n", + "\n", + "## Setup\n", + "\n", + "- TODO: Update with relevant info.\n", + "\n", + "To access `__ModuleName__` document loader you'll need to install the `@langchain/community` integration package, and create a **__ModuleName__** account and get an API key.\n", + "\n", + "### Credentials\n", + "\n", + "- TODO: Update with relevant info.\n", + "\n", + "Head to (TODO: link) to sign up to __ModuleName__ and generate an API key. 
Once you've done this set the `__MODULE_NAME_ALL_CAPS___API_KEY` environment variable:\n", + "\n", + "```bash\n", + "export __MODULE_NAME_ALL_CAPS___API_KEY=\"your-api-key\"\n", + "```\n", + "\n", + "If you want to get automated tracing of your model calls you can also set your [LangSmith](https://docs.smith.langchain.com/) API key by uncommenting below:\n", + "\n", + "```bash\n", + "# export LANGCHAIN_TRACING_V2=\"true\"\n", + "# export LANGCHAIN_API_KEY=\"your-api-key\"\n", + "```\n", + "\n", + "### Installation\n", + "\n", + "The LangChain __ModuleName__ integration lives in the `@langchain/community` package:\n", + "\n", + "```{=mdx}\n", + "import IntegrationInstallTooltip from \"@mdx_components/integration_install_tooltip.mdx\";\n", + "import Npm2Yarn from \"@theme/Npm2Yarn\";\n", + "\n", + "<IntegrationInstallTooltip></IntegrationInstallTooltip>\n", + "\n", + "<Npm2Yarn>\n", + " @langchain/community\n", + "</Npm2Yarn>\n", + "\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Instantiation\n", + "\n", + "Now we can instantiate our loader object and load documents:\n", + "\n", + "- TODO: Update loader instantiation with relevant params." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "vscode": { + "languageId": "typescript" + } + }, + "outputs": [], + "source": [ + "import { __ModuleName__ } from \"__import_path__\"\n", + "\n", + "const loader = new __ModuleName__({\n", + " // required params = ...\n", + " // optional params = ...\n", + "})" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Load\n", + "\n", + "- TODO: Run cells to show loading capabilities" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "vscode": { + "languageId": "typescript" + } + }, + "outputs": [], + "source": [ + "const docs = await loader.load()\n", + "docs[0]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "vscode": { + "languageId": "typescript" + } + }, + "outputs": [], + "source": [ + "console.log(docs[0].metadata)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## TODO: Any functionality specific to this document loader\n", + "\n", + "E.g. using specific configs for different loading behavior. Delete if not relevant." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## API reference\n", + "\n", + "For detailed documentation of all __ModuleName__ features and configurations head to the API reference: https://api.js.langchain.com/classes/__package_name_snake_case__.__ModuleName__.html" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.9" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/yarn.lock b/yarn.lock index 20617a68d1fb..f2c95da3ec90 100644 --- a/yarn.lock +++ b/yarn.lock @@ -12339,6 +12339,7 @@ __metadata: "@swc/core": ^1.3.90 "@swc/jest": ^0.2.29 "@tsconfig/recommended": ^1.0.3 + "@types/lodash": ^4 "@typescript-eslint/eslint-plugin": ^6.12.0 "@typescript-eslint/parser": ^6.12.0 axios: ^1.6.7 @@ -12354,6 +12355,7 @@ __metadata: glob: ^10.3.10 jest: ^29.5.0 jest-environment-node: ^29.6.4 + lodash: ^4.17.21 prettier: ^2.8.3 readline: ^1.3.0 release-it: ^15.10.1