From 4cf8726bba7a1a02f879323bf9c0cafd2739b0ce Mon Sep 17 00:00:00 2001 From: Rafael Sales Date: Tue, 18 Nov 2025 22:58:00 -0300 Subject: [PATCH 1/3] feat: integrate llmstxt.org standard into FetchWebPageTool --- .../tools/vscode-node/fetchWebPageTool.tsx | 68 ++++++++++++++++--- 1 file changed, 59 insertions(+), 9 deletions(-) diff --git a/src/extension/tools/vscode-node/fetchWebPageTool.tsx b/src/extension/tools/vscode-node/fetchWebPageTool.tsx index 3436aa7f97..a149ea15f0 100644 --- a/src/extension/tools/vscode-node/fetchWebPageTool.tsx +++ b/src/extension/tools/vscode-node/fetchWebPageTool.tsx @@ -3,7 +3,7 @@ * Licensed under the MIT License. See License.txt in the project root for license information. *--------------------------------------------------------------------------------------------*/ import { BasePromptElementProps, Chunk, PromptElement, PromptSizing, TextChunk, useKeepWith } from '@vscode/prompt-tsx'; -import { CancellationToken, LanguageModelPromptTsxPart, LanguageModelTextPart, LanguageModelDataPart, LanguageModelToolInvocationOptions, LanguageModelToolInvocationPrepareOptions, LanguageModelToolResult, lm, PreparedToolInvocation, ProviderResult } from 'vscode'; +import { CancellationToken, LanguageModelDataPart, LanguageModelPromptTsxPart, LanguageModelTextPart, LanguageModelToolInvocationOptions, LanguageModelToolInvocationPrepareOptions, LanguageModelToolResult, lm, PreparedToolInvocation, ProviderResult } from 'vscode'; import { FileChunkAndScore } from '../../../platform/chunking/common/chunk'; import { ILogService } from '../../../platform/log/common/logService'; import { UrlChunkEmbeddingsIndex } from '../../../platform/urlChunkSearch/node/urlChunkEmbeddingsIndex'; @@ -26,6 +26,19 @@ interface IFetchWebPageParams { */ const internalToolName = 'vscode_fetchWebPage_internal'; +function getLlmTxtCandidates(urlStr: string): string[] { + try { + const uri = URI.parse(urlStr); + if (uri.scheme !== 'http' && uri.scheme !== 'https') { + return []; + } + const root = `${uri.scheme}://${uri.authority}`; + return [`${root}/llms.txt`, `${root}/llms-full.txt`]; + } catch { + return []; + } +} + interface WebPageChunkResult { uri: URI; chunks: FileChunkAndScore[]; @@ -68,7 +81,23 @@ class FetchWebPageTool implements ICopilotTool { throw new Error('Tool not found'); } const { urls } = options.input; - const { content } = await lm.invokeTool(internalToolName, options, token); + + // Generate candidates + const candidateUrls = [...new Set(urls.flatMap(url => getLlmTxtCandidates(url)))].filter(u => !urls.includes(u)); + + const [originalResult, candidateResult] = await Promise.all([ + lm.invokeTool(internalToolName, options, token), + candidateUrls.length > 0 ? lm.invokeTool(internalToolName, { + input: { urls: candidateUrls }, + toolInvocationToken: options.toolInvocationToken, + tokenizationOptions: options.tokenizationOptions + }, token).catch(e => { + this._logService.warn('Failed to fetch llms.txt candidates', e); + return new LanguageModelToolResult([]); + }) : Promise.resolve(new LanguageModelToolResult([])) + ]); + + const { content } = originalResult; if (urls.length !== content.length) { this._logService.error(`Expected ${urls.length} responses but got ${content.length}`); return new LanguageModelToolResult([ @@ -80,30 +109,51 @@ class FetchWebPageTool implements ICopilotTool { const validTextContent: Array<{ readonly uri: URI; readonly content: string }> = []; const imageResults: WebPageImageResult[] = []; - for (let i = 0; i < urls.length; i++) { + const processContent = (url: string, contentPart: any, isCandidate: boolean) => { try { - const uri = URI.parse(urls[i]); - const contentPart = content[i]; + const uri = URI.parse(url); if (options.model?.capabilities.supportsImageToText && isImageDataPart(contentPart)) { // Handle image data - don't chunk it, just pass it through imageResults.push({ uri, imagePart: contentPart }); } else if (contentPart instanceof LanguageModelTextPart) { // Handle text content - this will be chunked + if (isCandidate && !contentPart.value.trim()) { + return; + } validTextContent.push({ uri, content: contentPart.value }); } else { // Handle other data parts as text if they have a value property const textValue = (contentPart as any).value; if (typeof textValue === 'string') { + if (isCandidate && !textValue.trim()) { + return; + } validTextContent.push({ uri, content: textValue }); } else { - this._logService.warn(`Unsupported content type at index ${i}: ${urls[i]}`); - invalidUrls.push(urls[i]); + if (!isCandidate) { + this._logService.warn(`Unsupported content type for: ${url}`); + invalidUrls.push(url); + } } } } catch (error) { - this._logService.error(`Invalid URL at index ${i}: ${urls[i]}`, error); - invalidUrls.push(urls[i]); + if (!isCandidate) { + this._logService.error(`Invalid URL: ${url}`, error); + invalidUrls.push(url); + } + } + }; + + for (let i = 0; i < urls.length; i++) { + processContent(urls[i], content[i], false); + } + + if (candidateResult && candidateResult.content) { + for (let i = 0; i < candidateUrls.length; i++) { + if (i < candidateResult.content.length) { + processContent(candidateUrls[i], candidateResult.content[i], true); + } } } From 0954f1ad23898a6b976440421cc2d344f151ae03 Mon Sep 17 00:00:00 2001 From: Rafael Sales Date: Wed, 19 Nov 2025 08:08:02 -0300 Subject: [PATCH 2/3] fix: update processContent parameter types for improved type safety --- src/extension/tools/vscode-node/fetchWebPageTool.tsx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/extension/tools/vscode-node/fetchWebPageTool.tsx b/src/extension/tools/vscode-node/fetchWebPageTool.tsx index a149ea15f0..e328dbf3a8 100644 --- a/src/extension/tools/vscode-node/fetchWebPageTool.tsx +++ b/src/extension/tools/vscode-node/fetchWebPageTool.tsx @@ -109,7 +109,7 @@ class FetchWebPageTool implements ICopilotTool { const validTextContent: Array<{ readonly uri: URI; readonly content: string }> = []; const imageResults: WebPageImageResult[] = []; - const processContent = (url: string, contentPart: any, isCandidate: boolean) => { + const processContent = (url: string, contentPart: LanguageModelTextPart | LanguageModelPromptTsxPart | LanguageModelDataPart | unknown, isCandidate: boolean) => { try { const uri = URI.parse(url); From 4c9da1afbc5e939a34ba5fad0c42454789d20556 Mon Sep 17 00:00:00 2001 From: Rafael Sales Date: Wed, 19 Nov 2025 08:11:58 -0300 Subject: [PATCH 3/3] feat: enhance candidate URL generation logic for llms.txt support --- src/extension/tools/vscode-node/fetchWebPageTool.tsx | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/extension/tools/vscode-node/fetchWebPageTool.tsx b/src/extension/tools/vscode-node/fetchWebPageTool.tsx index e328dbf3a8..070afcb747 100644 --- a/src/extension/tools/vscode-node/fetchWebPageTool.tsx +++ b/src/extension/tools/vscode-node/fetchWebPageTool.tsx @@ -82,7 +82,14 @@ class FetchWebPageTool implements ICopilotTool { } const { urls } = options.input; - // Generate candidates + /** + * For each input URL, generate possible llms.txt/llms-full.txt candidates. + * Deduplicate with Set, and filter out any candidate URLs that are already present in the original `urls` array. + * This ensures: + * - If a user explicitly provides a `llms.txt` or `llms-full.txt` URL, it is only fetched as an original URL, not as a candidate. + * - If the same URL appears multiple times in the input, only one candidate is generated and fetched. + * - This logic prevents redundant fetches and avoids surprising behavior if the user explicitly requests a candidate URL. + */ const candidateUrls = [...new Set(urls.flatMap(url => getLlmTxtCandidates(url)))].filter(u => !urls.includes(u)); const [originalResult, candidateResult] = await Promise.all([