From f4da1344ec395c08b8dae1e3cd0904e490f40513 Mon Sep 17 00:00:00 2001 From: bracesproul Date: Thu, 1 Aug 2024 14:21:33 -0700 Subject: [PATCH 1/4] docs[minor]: Update webpdf loader docs --- .../document_loaders/web_loaders/pdf.ipynb | 323 ++++++++++++++++++ .../document_loaders/web_loaders/pdf.mdx | 53 --- 2 files changed, 323 insertions(+), 53 deletions(-) create mode 100644 docs/core_docs/docs/integrations/document_loaders/web_loaders/pdf.ipynb delete mode 100644 docs/core_docs/docs/integrations/document_loaders/web_loaders/pdf.mdx diff --git a/docs/core_docs/docs/integrations/document_loaders/web_loaders/pdf.ipynb b/docs/core_docs/docs/integrations/document_loaders/web_loaders/pdf.ipynb new file mode 100644 index 000000000000..a0789ee0f93f --- /dev/null +++ b/docs/core_docs/docs/integrations/document_loaders/web_loaders/pdf.ipynb @@ -0,0 +1,323 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "sidebar_label: PDF files\n", + "\n", + "---" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# WebPDFLoader\n", + "\n", + "This notebook provides a quick overview for getting started with [WebPDFLoader](/docs/integrations/document_loaders/). For detailed documentation of all WebPDFLoader features and configurations head to the [API reference](https://api.js.langchain.com/classes/langchain_community_document_loaders_web_pdf.WebPDFLoader.html).\n", + "\n", + "## Overview\n", + "### Integration details\n", + "\n", + "| Class | Package | Local | Serializable | PY support |\n", + "| :--- | :--- | :---: | :---: | :---: |\n", + "| [WebPDFLoader](https://api.js.langchain.com/classes/langchain_community_document_loaders_web_pdf.WebPDFLoader.html) | [@langchain/community](https://api.js.langchain.com/modules/langchain_community_document_loaders_web_pdf.html) | ✅ | beta | ❌ | \n", + "### Loader features\n", + "| Source | Web Loader | Node Envs Only\n", + "| :---: | :---: | :---: | \n", + "| WebPDFLoader | ✅ | ❌ | \n", + "\n", + "You can use this version of the popular PDFLoader in web environments.\n", + "By default, one document will be created for each page in the PDF file, you can change this behavior by setting the `splitPages` option to `false`.\n", + "\n", + "## Setup\n", + "\n", + "To access `WebPDFLoader` document loader you'll need to install the `@langchain/community` integration, along with the `pdf-parse` package:\n", + "\n", + "### Credentials\n", + "\n", + "If you want to get automated tracing of your model calls you can also set your [LangSmith](https://docs.smith.langchain.com/) API key by uncommenting below:\n", + "\n", + "```bash\n", + "# export LANGCHAIN_TRACING_V2=\"true\"\n", + "# export LANGCHAIN_API_KEY=\"your-api-key\"\n", + "```\n", + "\n", + "### Installation\n", + "\n", + "The LangChain WebPDFLoader integration lives in the `@langchain/community` package:\n", + "\n", + "```{=mdx}\n", + "import IntegrationInstallTooltip from \"@mdx_components/integration_install_tooltip.mdx\";\n", + "import Npm2Yarn from \"@theme/Npm2Yarn\";\n", + "\n", + "\n", + "\n", + "\n", + " @langchain/community pdf-parse\n", + "\n", + "\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Instantiation\n", + "\n", + "Now we can instantiate our model object and load documents:\n", + "\n", + "- TODO: Update model instantiation with relevant params." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import fs from \"fs/promises\";\n", + "import { WebPDFLoader } from \"@langchain/community/document_loaders/web/pdf\"\n", + "\n", + "const nike10kPDFPath = \"../../../../data/nke-10k-2023.pdf\";\n", + "\n", + "// Read the file as a buffer\n", + "const buffer = await fs.readFile(nike10kPDFPath);\n", + "\n", + "// Create a Blob from the buffer\n", + "const nike10kPDFBlob = new Blob([buffer], { type: 'application/pdf' });\n", + "\n", + "const loader = new WebPDFLoader(nike10kPDFBlob, {\n", + " // required params = ...\n", + " // optional params = ...\n", + "})" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Load" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Document {\n", + " pageContent: 'Table of Contents\\n' +\n", + " 'UNITED STATES\\n' +\n", + " 'SECURITIES AND EXCHANGE COMMISSION\\n' +\n", + " 'Washington, D.C. 20549\\n' +\n", + " 'FORM 10-K\\n' +\n", + " '(Mark One)\\n' +\n", + " '☑ ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(D) OF THE SECURITIES EXCHANGE ACT OF 1934\\n' +\n", + " 'FOR THE FISCAL YEAR ENDED MAY 31, 2023\\n' +\n", + " 'OR\\n' +\n", + " '☐ TRANSITION REPORT PURSUANT TO SECTION 13 OR 15(D) OF THE SECURITIES EXCHANGE ACT OF 1934\\n' +\n", + " 'FOR THE TRANSITION PERIOD FROM TO .\\n' +\n", + " 'Commission File No. 1-10635\\n' +\n", + " 'NIKE, Inc.\\n' +\n", + " '(Exact name of Registrant as specified in its charter)\\n' +\n", + " 'Oregon93-0584541\\n' +\n", + " '(State or other jurisdiction of incorporation)(IRS Employer Identification No.)\\n' +\n", + " 'One Bowerman Drive, Beaverton, Oregon 97005-6453\\n' +\n", + " '(Address of principal executive offices and zip code)\\n' +\n", + " '(503) 671-6453\\n' +\n", + " \"(Registrant's telephone number, including area code)\\n\" +\n", + " 'SECURITIES REGISTERED PURSUANT TO SECTION 12(B) OF THE ACT:\\n' +\n", + " 'Class B Common StockNKENew York Stock Exchange\\n' +\n", + " '(Title of each class)(Trading symbol)(Name of each exchange on which registered)\\n' +\n", + " 'SECURITIES REGISTERED PURSUANT TO SECTION 12(G) OF THE ACT:\\n' +\n", + " 'NONE\\n' +\n", + " 'Indicate by check mark:YESNO\\n' +\n", + " '•if the registrant is a well-known seasoned issuer, as defined in Rule 405 of the Securities Act.þ ̈\\n' +\n", + " '•if the registrant is not required to file reports pursuant to Section 13 or Section 15(d) of the Act. ̈þ\\n' +\n", + " '•whether the registrant (1) has filed all reports required to be filed by Section 13 or 15(d) of the Securities Exchange Act of 1934 during the preceding\\n' +\n", + " '12 months (or for such shorter period that the registrant was required to file such reports), and (2) has been subject to such filing requirements for the\\n' +\n", + " 'past 90 days.\\n' +\n", + " 'þ ̈\\n' +\n", + " '•whether the registrant has submitted electronically every Interactive Data File required to be submitted pursuant to Rule 405 of Regulation S-T\\n' +\n", + " '(§232.405 of this chapter) during the preceding 12 months (or for such shorter period that the registrant was required to submit such files).\\n' +\n", + " 'þ ̈\\n' +\n", + " '•whether the registrant is a large accelerated filer, an accelerated filer, a non-accelerated filer, a smaller reporting company or an emerging growth company. See the definitions of “large accelerated filer,”\\n' +\n", + " '“accelerated filer,” “smaller reporting company,” and “emerging growth company” in Rule 12b-2 of the Exchange Act.\\n' +\n", + " 'Large accelerated filerþAccelerated filer☐Non-accelerated filer☐Smaller reporting company☐Emerging growth company☐\\n' +\n", + " '•if an emerging growth company, if the registrant has elected not to use the extended transition period for complying with any new or revised financial\\n' +\n", + " 'accounting standards provided pursuant to Section 13(a) of the Exchange Act.\\n' +\n", + " ' ̈\\n' +\n", + " \"•whether the registrant has filed a report on and attestation to its management's assessment of the effectiveness of its internal control over financial\\n\" +\n", + " 'reporting under Section 404(b) of the Sarbanes-Oxley Act (15 U.S.C. 7262(b)) by the registered public accounting firm that prepared or issued its audit\\n' +\n", + " 'report.\\n' +\n", + " 'þ\\n' +\n", + " '•if securities are registered pursuant to Section 12(b) of the Act, whether the financial statements of the registrant included in the filing reflect the\\n' +\n", + " 'correction of an error to previously issued financial statements.\\n' +\n", + " ' ̈\\n' +\n", + " '•whether any of those error corrections are restatements that required a recovery analysis of incentive-based compensation received by any of the\\n' +\n", + " \"registrant's executive officers during the relevant recovery period pursuant to § 240.10D-1(b).\\n\" +\n", + " ' ̈\\n' +\n", + " '•\\n' +\n", + " 'whether the registrant is a shell company (as defined in Rule 12b-2 of the Act).☐þ\\n' +\n", + " \"As of November 30, 2022, the aggregate market values of the Registrant's Common Stock held by non-affiliates were:\\n\" +\n", + " 'Class A$7,831,564,572 \\n' +\n", + " 'Class B136,467,702,472 \\n' +\n", + " '$144,299,267,044 ',\n", + " metadata: {\n", + " pdf: {\n", + " version: '1.10.100',\n", + " info: [Object],\n", + " metadata: null,\n", + " totalPages: 107\n", + " },\n", + " loc: { pageNumber: 1 }\n", + " },\n", + " id: undefined\n", + "}\n" + ] + } + ], + "source": [ + "const docs = await loader.load()\n", + "docs[0]" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " pdf: {\n", + " version: '1.10.100',\n", + " info: {\n", + " PDFFormatVersion: '1.4',\n", + " IsAcroFormPresent: false,\n", + " IsXFAPresent: false,\n", + " Title: '0000320187-23-000039',\n", + " Author: 'EDGAR Online, a division of Donnelley Financial Solutions',\n", + " Subject: 'Form 10-K filed on 2023-07-20 for the period ending 2023-05-31',\n", + " Keywords: '0000320187-23-000039; ; 10-K',\n", + " Creator: 'EDGAR Filing HTML Converter',\n", + " Producer: 'EDGRpdf Service w/ EO.Pdf 22.0.40.0',\n", + " CreationDate: \"D:20230720162200-04'00'\",\n", + " ModDate: \"D:20230720162208-04'00'\"\n", + " },\n", + " metadata: null,\n", + " totalPages: 107\n", + " },\n", + " loc: { pageNumber: 1 }\n", + "}\n" + ] + } + ], + "source": [ + "console.log(docs[0].metadata)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Usage, custom `pdfjs` build\n", + "\n", + "By default we use the `pdfjs` build bundled with `pdf-parse`, which is compatible with most environments, including Node.js and modern browsers. If you want to use a more recent version of `pdfjs-dist` or if you want to use a custom build of `pdfjs-dist`, you can do so by providing a custom `pdfjs` function that returns a promise that resolves to the `PDFJS` object.\n", + "\n", + "In the following example we use the \"legacy\" (see [pdfjs docs](https://github.com/mozilla/pdf.js/wiki/Frequently-Asked-Questions#which-browsersenvironments-are-supported)) build of `pdfjs-dist`, which includes several polyfills not included in the default build.\n", + "\n", + "```{=mdx}\n", + "\n", + " pdfjs-dist\n", + "\n", + "\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import { WebPDFLoader } from \"@langchain/community/document_loaders/web/pdf\";\n", + "\n", + "const blob = new Blob(); // e.g. from a file input\n", + "\n", + "const loader = new WebPDFLoader(blob, {\n", + " // you may need to add `.then(m => m.default)` to the end of the import\n", + " pdfjs: () => import(\"pdfjs-dist/legacy/build/pdf.js\"),\n", + "});" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Eliminating extra spaces\n", + "\n", + "PDFs come in many varieties, which makes reading them a challenge. The loader parses individual text elements and joins them together with a space by default, but\n", + "if you are seeing excessive spaces, this may not be the desired behavior. In that case, you can override the separator with an empty string like this:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import { WebPDFLoader } from \"@langchain/community/document_loaders/web/pdf\";\n", + "\n", + "const blob = new Blob(); // e.g. from a file input\n", + "\n", + "const loader = new WebPDFLoader(blob, {\n", + " parsedItemSeparator: \"\",\n", + "});" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## API reference\n", + "\n", + "For detailed documentation of all WebPDFLoader features and configurations head to the API reference: https://api.js.langchain.com/classes/langchain_community_document_loaders_web_pdf.WebPDFLoader.html" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "TypeScript", + "language": "typescript", + "name": "tslab" + }, + "language_info": { + "codemirror_mode": { + "mode": "typescript", + "name": "javascript", + "typescript": true + }, + "file_extension": ".ts", + "mimetype": "text/typescript", + "name": "typescript", + "version": "3.7.2" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/docs/core_docs/docs/integrations/document_loaders/web_loaders/pdf.mdx b/docs/core_docs/docs/integrations/document_loaders/web_loaders/pdf.mdx deleted file mode 100644 index 64e96247765c..000000000000 --- a/docs/core_docs/docs/integrations/document_loaders/web_loaders/pdf.mdx +++ /dev/null @@ -1,53 +0,0 @@ -# PDF files - -You can use this version of the popular PDFLoader in web environments. -By default, one document will be created for each page in the PDF file, you can change this behavior by setting the `splitPages` option to `false`. - -## Setup - -```bash npm2yarn -npm install pdf-parse -``` - -## Usage - -import CodeBlock from "@theme/CodeBlock"; -import Example from "@examples/document_loaders/web_pdf.ts"; - -{Example} - -## Usage, custom `pdfjs` build - -By default we use the `pdfjs` build bundled with `pdf-parse`, which is compatible with most environments, including Node.js and modern browsers. If you want to use a more recent version of `pdfjs-dist` or if you want to use a custom build of `pdfjs-dist`, you can do so by providing a custom `pdfjs` function that returns a promise that resolves to the `PDFJS` object. - -In the following example we use the "legacy" (see [pdfjs docs](https://github.com/mozilla/pdf.js/wiki/Frequently-Asked-Questions#which-browsersenvironments-are-supported)) build of `pdfjs-dist`, which includes several polyfills not included in the default build. - -```bash npm2yarn -npm install pdfjs-dist -``` - -```typescript -import { WebPDFLoader } from "@langchain/community/document_loaders/web/pdf"; - -const blob = new Blob(); // e.g. from a file input - -const loader = new WebPDFLoader(blob, { - // you may need to add `.then(m => m.default)` to the end of the import - pdfjs: () => import("pdfjs-dist/legacy/build/pdf.js"), -}); -``` - -## Eliminating extra spaces - -PDFs come in many varieties, which makes reading them a challenge. The loader parses individual text elements and joins them together with a space by default, but -if you are seeing excessive spaces, this may not be the desired behavior. In that case, you can override the separator with an empty string like this: - -```typescript -import { WebPDFLoader } from "@langchain/community/document_loaders/web/pdf"; - -const blob = new Blob(); // e.g. from a file input - -const loader = new WebPDFLoader(blob, { - parsedItemSeparator: "", -}); -``` From ebf5be7a3081d969210c1d6d9751f4a6486a61ac Mon Sep 17 00:00:00 2001 From: bracesproul Date: Thu, 1 Aug 2024 14:32:49 -0700 Subject: [PATCH 2/4] fix dup vars error --- .../docs/integrations/document_loaders/web_loaders/pdf.ipynb | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/docs/core_docs/docs/integrations/document_loaders/web_loaders/pdf.ipynb b/docs/core_docs/docs/integrations/document_loaders/web_loaders/pdf.ipynb index a0789ee0f93f..a2b6c56d4065 100644 --- a/docs/core_docs/docs/integrations/document_loaders/web_loaders/pdf.ipynb +++ b/docs/core_docs/docs/integrations/document_loaders/web_loaders/pdf.ipynb @@ -254,7 +254,7 @@ "\n", "const blob = new Blob(); // e.g. from a file input\n", "\n", - "const loader = new WebPDFLoader(blob, {\n", + "const customBuildLoader = new WebPDFLoader(blob, {\n", " // you may need to add `.then(m => m.default)` to the end of the import\n", " pdfjs: () => import(\"pdfjs-dist/legacy/build/pdf.js\"),\n", "});" @@ -280,7 +280,8 @@ "\n", "const blob = new Blob(); // e.g. from a file input\n", "\n", - "const loader = new WebPDFLoader(blob, {\n", + "// new Blob(); e.g. from a file input\n", + "const eliminatingExtraSpacesLoader = new WebPDFLoader(new Blob(), {\n", " parsedItemSeparator: \"\",\n", "});" ] From 71a9083b498ef69287b35d3b5570ea05ab6b0e13 Mon Sep 17 00:00:00 2001 From: bracesproul Date: Thu, 1 Aug 2024 15:17:40 -0700 Subject: [PATCH 3/4] cr --- .../docs/integrations/document_loaders/web_loaders/pdf.ipynb | 2 -- 1 file changed, 2 deletions(-) diff --git a/docs/core_docs/docs/integrations/document_loaders/web_loaders/pdf.ipynb b/docs/core_docs/docs/integrations/document_loaders/web_loaders/pdf.ipynb index a2b6c56d4065..1a6052fff8f5 100644 --- a/docs/core_docs/docs/integrations/document_loaders/web_loaders/pdf.ipynb +++ b/docs/core_docs/docs/integrations/document_loaders/web_loaders/pdf.ipynb @@ -278,8 +278,6 @@ "source": [ "import { WebPDFLoader } from \"@langchain/community/document_loaders/web/pdf\";\n", "\n", - "const blob = new Blob(); // e.g. from a file input\n", - "\n", "// new Blob(); e.g. from a file input\n", "const eliminatingExtraSpacesLoader = new WebPDFLoader(new Blob(), {\n", " parsedItemSeparator: \"\",\n", From cfac9fbea67bc40f94ae4a4a6ac3e378679f1cfa Mon Sep 17 00:00:00 2001 From: bracesproul Date: Thu, 1 Aug 2024 17:45:41 -0700 Subject: [PATCH 4/4] add lc tsignore --- .../docs/integrations/document_loaders/web_loaders/pdf.ipynb | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/core_docs/docs/integrations/document_loaders/web_loaders/pdf.ipynb b/docs/core_docs/docs/integrations/document_loaders/web_loaders/pdf.ipynb index 1a6052fff8f5..812ed2961124 100644 --- a/docs/core_docs/docs/integrations/document_loaders/web_loaders/pdf.ipynb +++ b/docs/core_docs/docs/integrations/document_loaders/web_loaders/pdf.ipynb @@ -256,6 +256,7 @@ "\n", "const customBuildLoader = new WebPDFLoader(blob, {\n", " // you may need to add `.then(m => m.default)` to the end of the import\n", + " // @lc-ts-ignore\n", " pdfjs: () => import(\"pdfjs-dist/legacy/build/pdf.js\"),\n", "});" ]