Skip to content

Commit

Permalink
fix: embeddings for messages with urls
Browse files Browse the repository at this point in the history
  • Loading branch information
cygaar committed Nov 29, 2024
1 parent ec87587 commit 1bc0e48
Showing 1 changed file with 6 additions and 4 deletions.
10 changes: 6 additions & 4 deletions packages/core/src/knowledge.ts
Original file line number | Diff line number | Diff line change
Expand Up @@ -94,6 +94,10 @@ export function preprocess(content: string): string {
.replace(/!\[(.*?)\]\(.*?\)/g, "$1")
// Remove links but keep text
.replace(/\[(.*?)\]\(.*?\)/g, "$1")
// Simplify URLs: strip the optional protocol and leading "www.", keeping the domain and path
.replace(/(https?:\/\/)?(www\.)?([^\s]+\.[^\s]+)/g, "$3")
// Remove Discord mentions specifically
.replace(/<@[!&]?\d+>/g, "")
// Remove HTML tags
.replace(/<[^>]*>/g, "")
// Remove horizontal rules
Expand All @@ -105,10 +109,8 @@ export function preprocess(content: string): string {
.replace(/\s+/g, " ")
// Collapse runs of three or more newlines into a single blank line
.replace(/\n{3,}/g, "\n\n")
// strip all special characters
.replace(/[^a-zA-Z0-9\s]/g, "")
// Remove Discord mentions
.replace(/<@!?\d+>/g, "")
// Remove special characters except those common in URLs
.replace(/[^a-zA-Z0-9\s\-_./:?=&]/g, "")
.trim()
.toLowerCase()
);
Expand Down

0 comments on commit 1bc0e48

Please sign in to comment.