From 1bc0e48d0a8eab803ebd624a0f7582f341963cc6 Mon Sep 17 00:00:00 2001 From: cygaar Date: Fri, 29 Nov 2024 11:16:58 -0500 Subject: [PATCH] fix: embeddings for messages with urls --- packages/core/src/knowledge.ts | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/packages/core/src/knowledge.ts b/packages/core/src/knowledge.ts index 3d7d32b494..c29cf48f31 100644 --- a/packages/core/src/knowledge.ts +++ b/packages/core/src/knowledge.ts @@ -94,6 +94,10 @@ export function preprocess(content: string): string { .replace(/!\[(.*?)\]\(.*?\)/g, "$1") // Remove links but keep text .replace(/\[(.*?)\]\(.*?\)/g, "$1") + // Simplify URLs: remove protocol and simplify to domain+path + .replace(/(https?:\/\/)?(www\.)?([^\s]+\.[^\s]+)/g, "$3") + // Remove Discord mentions specifically + .replace(/<@[!&]?\d+>/g, "") // Remove HTML tags .replace(/<[^>]*>/g, "") // Remove horizontal rules @@ -105,10 +109,8 @@ export function preprocess(content: string): string { .replace(/\s+/g, " ") // Remove multiple newlines .replace(/\n{3,}/g, "\n\n") - // strip all special characters - .replace(/[^a-zA-Z0-9\s]/g, "") - // Remove Discord mentions - .replace(/<@!?\d+>/g, "") + // Remove special characters except those common in URLs + .replace(/[^a-zA-Z0-9\s\-_./:?=&]/g, "") .trim() .toLowerCase() );