Skip to content
This repository has been archived by the owner on Oct 30, 2024. It is now read-only.

Commit

Permalink
add: metadata-manipulator transformer & post-processor (#97)
Browse files Browse the repository at this point in the history
  • Loading branch information
iwilltry42 authored Aug 29, 2024
1 parent 09184d6 commit ded7674
Show file tree
Hide file tree
Showing 5 changed files with 102 additions and 5 deletions.
17 changes: 17 additions & 0 deletions examples/no-filenames.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
flows:
bm25:
default: true
retrieval:
retriever:
name: basic
options:
topK: 1
postprocessors:
- name: metadata
options:
manipulations:
- operator: add
key: foobar
value: 42
- operator: remove
key: absPath
17 changes: 16 additions & 1 deletion pkg/datastore/postprocessors/postprocessors.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import (

"github.com/gptscript-ai/knowledge/pkg/datastore/transformers"
"github.com/gptscript-ai/knowledge/pkg/datastore/types"
"github.com/mitchellh/mapstructure"
)

// Postprocessor is similar to types.DocumentTransformer, but can take into account the retrieval query
Expand All @@ -16,7 +17,7 @@ type Postprocessor interface {
}

type TransformerWrapper struct {
types.DocumentTransformer
DocumentTransformer types.DocumentTransformer
}

func NewTransformerWrapper(transformer types.DocumentTransformer) *TransformerWrapper {
Expand All @@ -38,10 +39,24 @@ func (t *TransformerWrapper) Name() string {
return t.DocumentTransformer.Name()
}

// Decode configures the wrapped transformer from the given options map.
//
// It obtains a transformer for t.Name() from transformers.GetTransformer,
// decodes cfg into it using mapstructure, and then installs the result as the
// wrapper's DocumentTransformer. A lookup error is returned unchanged; a
// decoding error is wrapped with additional context.
//
// NOTE(review): whether GetTransformer hands out a fresh instance or a shared
// one is not visible here. If it is shared, decoding would modify it for
// every other user as well — confirm.
func (t *TransformerWrapper) Decode(cfg map[string]any) error {
	decoded, err := transformers.GetTransformer(t.Name())
	if err != nil {
		return err
	}

	if err := mapstructure.Decode(cfg, &decoded); err != nil {
		return fmt.Errorf("failed to decode transformer configuration: %w", err)
	}

	t.DocumentTransformer = decoded
	return nil
}

var PostprocessorMap = map[string]Postprocessor{
transformers.ExtraMetadataName: NewTransformerWrapper(&transformers.ExtraMetadata{}),
transformers.KeywordExtractorName: NewTransformerWrapper(&transformers.KeywordExtractor{}),
transformers.FilterMarkdownDocsNoContentName: NewTransformerWrapper(&transformers.FilterMarkdownDocsNoContent{}),
transformers.MetadataManipulatorName: NewTransformerWrapper(&transformers.MetadataManipulator{}),
SimilarityPostprocessorName: &SimilarityPostprocessor{},
ContentSubstringFilterPostprocessorName: &ContentSubstringFilterPostprocessor{},
ContentFilterPostprocessorName: &ContentFilterPostprocessor{},
Expand Down
58 changes: 56 additions & 2 deletions pkg/datastore/transformers/metadata.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@ package transformers

import (
"context"
"fmt"
"log/slog"

vs "github.com/gptscript-ai/knowledge/pkg/vectorstore"
)
Expand All @@ -13,14 +15,66 @@ type ExtraMetadata struct {
}

func (e *ExtraMetadata) Transform(_ context.Context, docs []vs.Document) ([]vs.Document, error) {
for _, doc := range docs {
for i, doc := range docs {
metadata := doc.Metadata
for k, v := range e.Metadata {
doc.Metadata[k] = v
metadata[k] = v
}
docs[i].Metadata = metadata
}
return docs, nil
}

func (e *ExtraMetadata) Name() string {
return ExtraMetadataName
}

// MetadataManipulatorName is the name reported by MetadataManipulator.Name and
// the key under which the transformer is registered.
const MetadataManipulatorName = "metadata"

// MetadataManipulationOperator selects what a MetadataManipulation does to a
// document's metadata key.
type MetadataManipulationOperator string

const (
// MetadataManipulationOperatorAdd sets a key and fails if the key already exists.
MetadataManipulationOperatorAdd MetadataManipulationOperator = "add"
// MetadataManipulationOperatorUpdate sets a key, overwriting any existing value.
// Note that the configuration value is "upsert", not "update".
MetadataManipulationOperatorUpdate MetadataManipulationOperator = "upsert"
// MetadataManipulationOperatorRemove deletes a key.
MetadataManipulationOperatorRemove MetadataManipulationOperator = "remove"
)

// MetadataManipulation describes a single operation on one metadata key.
type MetadataManipulation struct {
// Operator is the kind of operation to perform.
Operator MetadataManipulationOperator `json:"operator,omitempty" mapstructure:"operator"`
// Key is the metadata key the operation targets.
Key string `json:"key,omitempty" mapstructure:"key"`
// Value is the value written by the add and upsert operators; remove does not use it.
Value any `json:"value,omitempty" mapstructure:"value"`
}

// MetadataManipulator is a document transformer that applies a list of
// metadata manipulations to every document it is given.
type MetadataManipulator struct {
// Manipulations are applied to each document in slice order.
Manipulations []MetadataManipulation
}

// Name reports the identifier of this transformer, MetadataManipulatorName.
func (m *MetadataManipulator) Name() string {
	return MetadataManipulatorName
}

// Transform applies every configured manipulation, in order, to the metadata
// of each document and returns the same slice with the updated metadata.
//
// A document whose Metadata is nil gets a fresh map. The operators behave as
// follows:
//   - add:    sets the key; fails if the key is already present.
//   - upsert: sets the key, overwriting any existing value.
//   - remove: deletes the key; deleting a missing key is a no-op.
//
// An error (with a nil slice) is returned when an add hits an existing key or
// when a manipulation carries an operator other than the three above.
// Manipulations already applied to an existing metadata map are made in place
// and are not rolled back on error.
func (m *MetadataManipulator) Transform(_ context.Context, docs []vs.Document) ([]vs.Document, error) {
	for i, doc := range docs {
		metadata := doc.Metadata
		if metadata == nil {
			// Writing to a nil map panics, so start from an empty one.
			metadata = make(map[string]any)
		}
		slog.Debug("metadata manipulator", "docMetadata", metadata, "manipulations", m.Manipulations)
		for _, manipulation := range m.Manipulations {
			switch manipulation.Operator {
			case MetadataManipulationOperatorAdd:
				if _, exists := metadata[manipulation.Key]; exists {
					return nil, fmt.Errorf("metadata key %q already exists in document", manipulation.Key)
				}
				metadata[manipulation.Key] = manipulation.Value
			case MetadataManipulationOperatorUpdate:
				metadata[manipulation.Key] = manipulation.Value
			case MetadataManipulationOperatorRemove:
				delete(metadata, manipulation.Key)
			default:
				// Reject unknown or empty operators instead of silently
				// skipping them, so a typo in the configuration (e.g.
				// "update" instead of "upsert") does not become a no-op.
				return nil, fmt.Errorf("unsupported metadata manipulation operator %q for key %q", manipulation.Operator, manipulation.Key)
			}
		}
		slog.Debug("metadata manipulator DONE", "docMetadata", metadata)
		// Write back explicitly: doc is a copy, and metadata may be a newly
		// created map.
		docs[i].Metadata = metadata
	}
	return docs, nil
}
1 change: 1 addition & 0 deletions pkg/datastore/transformers/transformers.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ var TransformerMap = map[string]types.DocumentTransformer{
ExtraMetadataName: &ExtraMetadata{},
FilterMarkdownDocsNoContentName: &FilterMarkdownDocsNoContent{},
KeywordExtractorName: &KeywordExtractor{},
MetadataManipulatorName: &MetadataManipulator{},
}

func GetTransformer(name string) (types.DocumentTransformer, error) {
Expand Down
14 changes: 12 additions & 2 deletions pkg/flows/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -299,9 +299,19 @@ func (r *RetrievalFlowConfig) AsRetrievalFlow() (*flows.RetrievalFlow, error) {
if err != nil {
return nil, err
}

if len(pp.Options) > 0 {
if err := mapstructure.Decode(pp.Options, &postprocessor); err != nil {
return nil, fmt.Errorf("failed to decode postprocessor configuration: %w", err)
// if it's a transformer wrapper, call decode
if transformerWrapper, ok := postprocessor.(*postprocessors.TransformerWrapper); ok {
if err := transformerWrapper.Decode(pp.Options); err != nil {
return nil, fmt.Errorf("failed to decode postprocessor configuration: %w", err)
}
postprocessor = transformerWrapper
} else {

if err := mapstructure.Decode(pp.Options, &postprocessor); err != nil {
return nil, fmt.Errorf("failed to decode postprocessor configuration: %w", err)
}
}
slog.Debug("Postprocessor custom configuration", "name", pp.Name, "config", output.RedactSensitive(postprocessor))
}
Expand Down

0 comments on commit ded7674

Please sign in to comment.