Skip to content
This repository has been archived by the owner on Oct 30, 2024. It is now read-only.

Commit

Permalink
change: use markdowntextsplitter for context-aware splitting
Browse files Browse the repository at this point in the history
  • Loading branch information
iwilltry42 committed May 13, 2024
1 parent 65f50ec commit d293a6a
Show file tree
Hide file tree
Showing 3 changed files with 11 additions and 4 deletions.
2 changes: 0 additions & 2 deletions pkg/datastore/datastore.go
Original file line number Diff line number Diff line change
Expand Up @@ -80,8 +80,6 @@ func NewDatastore(dsn string, automigrate bool, vectorDBPath string, openAIConfi
openAIConfig.APIKey,
openAIConfig.EmbeddingModel,
z.Pointer(true),
nil,
nil,
)
}

Expand Down
4 changes: 2 additions & 2 deletions pkg/datastore/ingest.go
Original file line number Diff line number Diff line change
Expand Up @@ -193,6 +193,7 @@ func GetDocuments(ctx context.Context, filename, filetype string, reader io.Read
textSplitterOpts = z.Pointer(NewTextSplitterOpts())
}
lcgoTextSplitter := NewLcgoTextSplitter(*textSplitterOpts)
lcgoMarkdownSplitter := NewLcgoMarkdownSplitter(*textSplitterOpts)

/*
* Load documents from the content
Expand Down Expand Up @@ -237,7 +238,7 @@ func GetDocuments(ctx context.Context, filename, filetype string, reader io.Read
case ".html", "text/html":
lcgodocs, err = lcgodocloaders.NewHTML(reader).LoadAndSplit(ctx, lcgoTextSplitter)
case ".md", "text/markdown":
lcgodocs, err = lcgodocloaders.NewText(reader).LoadAndSplit(ctx, lcgoTextSplitter)
lcgodocs, err = lcgodocloaders.NewText(reader).LoadAndSplit(ctx, lcgoMarkdownSplitter)
case ".txt", "text/plain":
lcgodocs, err = lcgodocloaders.NewText(reader).LoadAndSplit(ctx, lcgoTextSplitter)
case ".csv", "text/csv":
Expand Down Expand Up @@ -268,7 +269,6 @@ func GetDocuments(ctx context.Context, filename, filetype string, reader io.Read
}
lcgodocs, err = lcgodocloaders.NewText(strings.NewReader(text)).LoadAndSplit(ctx, lcgoTextSplitter)
default:
// TODO(@iwilltry42): Fallback to plaintext reader? Example: Makefile, Dockerfile, Source Files, etc.
slog.Error("Unsupported file type", "filename", filename, "type", filetype)
return nil, fmt.Errorf("file %q has unsupported file type %q", filename, filetype)
}
Expand Down
9 changes: 9 additions & 0 deletions pkg/datastore/textsplitter.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,3 +28,12 @@ func NewLcgoTextSplitter(opts TextSplitterOpts) lcgosplitter.TokenSplitter {
lcgosplitter.WithEncodingName(opts.EncodingName),
)
}

// NewLcgoMarkdownSplitter returns a markdown text splitter constructed via
// lcgosplitter.NewMarkdownTextSplitter, forwarding the chunk size, chunk
// overlap, model name, and encoding name found in opts as splitter options.
//
// NOTE(review): which of these options the markdown splitter actually honors
// is decided inside lcgosplitter and is not visible here — confirm that the
// model/encoding names have an effect for markdown splitting.
func NewLcgoMarkdownSplitter(opts TextSplitterOpts) *lcgosplitter.MarkdownTextSplitter {
	// Collect the options first so the set of forwarded settings is easy to scan.
	splitterOptions := []lcgosplitter.Option{
		lcgosplitter.WithChunkSize(opts.ChunkSize),
		lcgosplitter.WithChunkOverlap(opts.ChunkOverlap),
		lcgosplitter.WithModelName(opts.ModelName),
		lcgosplitter.WithEncodingName(opts.EncodingName),
	}

	return lcgosplitter.NewMarkdownTextSplitter(splitterOptions...)
}

0 comments on commit d293a6a

Please sign in to comment.