Skip to content

Commit

Permalink
feat(multimodal): allow to template placeholders (#3728)
Browse files Browse the repository at this point in the history
feat(multimodal): allow to template image placeholders

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
  • Loading branch information
mudler authored Oct 4, 2024
1 parent 04c0841 commit 648ffdf
Show file tree
Hide file tree
Showing 5 changed files with 66 additions and 4 deletions.
4 changes: 4 additions & 0 deletions core/config/backend_config.go
Original file line number Diff line number Diff line change
Expand Up @@ -196,6 +196,10 @@ type TemplateConfig struct {
// JoinChatMessagesByCharacter is a string that will be used to join chat messages together.
// It defaults to \n
JoinChatMessagesByCharacter *string `yaml:"join_chat_messages_by_character"`

Video string `yaml:"video"`
Image string `yaml:"image"`
Audio string `yaml:"audio"`
}

func (c *BackendConfig) UnmarshalYAML(value *yaml.Node) error {
Expand Down
21 changes: 18 additions & 3 deletions core/http/endpoints/openai/request.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ import (
"github.com/mudler/LocalAI/core/schema"
"github.com/mudler/LocalAI/pkg/functions"
"github.com/mudler/LocalAI/pkg/model"
"github.com/mudler/LocalAI/pkg/templates"
"github.com/mudler/LocalAI/pkg/utils"
"github.com/rs/zerolog/log"
)
Expand Down Expand Up @@ -168,8 +169,13 @@ func updateRequestConfig(config *config.BackendConfig, input *schema.OpenAIReque
continue CONTENT
}
input.Messages[i].StringVideos = append(input.Messages[i].StringVideos, base64) // TODO: make sure that we only return base64 stuff

t := "[vid-{{.ID}}]{{.Text}}"
if config.TemplateConfig.Video != "" {
t = config.TemplateConfig.Video
}
// set a placeholder for each image
input.Messages[i].StringContent = fmt.Sprintf("[vid-%d]", vidIndex) + input.Messages[i].StringContent
input.Messages[i].StringContent, _ = templates.TemplateMultiModal(t, vidIndex, input.Messages[i].StringContent)
vidIndex++
case "audio_url", "audio":
// Decode content as base64 either if it's an URL or base64 text
Expand All @@ -180,7 +186,11 @@ func updateRequestConfig(config *config.BackendConfig, input *schema.OpenAIReque
}
input.Messages[i].StringAudios = append(input.Messages[i].StringAudios, base64) // TODO: make sure that we only return base64 stuff
// set a placeholder for each image
input.Messages[i].StringContent = fmt.Sprintf("[audio-%d]", audioIndex) + input.Messages[i].StringContent
t := "[audio-{{.ID}}]{{.Text}}"
if config.TemplateConfig.Audio != "" {
t = config.TemplateConfig.Audio
}
input.Messages[i].StringContent, _ = templates.TemplateMultiModal(t, audioIndex, input.Messages[i].StringContent)
audioIndex++
case "image_url", "image":
// Decode content as base64 either if it's an URL or base64 text
Expand All @@ -189,9 +199,14 @@ func updateRequestConfig(config *config.BackendConfig, input *schema.OpenAIReque
log.Error().Msgf("Failed encoding image: %s", err)
continue CONTENT
}

t := "[img-{{.ID}}]{{.Text}}"
if config.TemplateConfig.Image != "" {
t = config.TemplateConfig.Image
}
input.Messages[i].StringImages = append(input.Messages[i].StringImages, base64) // TODO: make sure that we only return base64 stuff
// set a placeholder for each image
input.Messages[i].StringContent = fmt.Sprintf("[img-%d]", imgIndex) + input.Messages[i].StringContent
input.Messages[i].StringContent, _ = templates.TemplateMultiModal(t, imgIndex, input.Messages[i].StringContent)
imgIndex++
}
}
Expand Down
2 changes: 1 addition & 1 deletion pkg/model/initializers.go
Original file line number Diff line number Diff line change
Expand Up @@ -314,7 +314,7 @@ func (ml *ModelLoader) grpcModel(backend string, o *Options) func(string, string

client = NewModel(modelID, serverAddress, process)
} else {
log.Debug().Msg("external backend is uri")
log.Debug().Msg("external backend is a uri")
// address
client = NewModel(modelID, uri, nil)
}
Expand Down
24 changes: 24 additions & 0 deletions pkg/templates/multimodal.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
package templates

import (
"bytes"
"text/template"
)

func TemplateMultiModal(templateString string, templateID int, text string) (string, error) {
// compile the template
tmpl, err := template.New("template").Parse(templateString)
if err != nil {
return "", err
}
result := bytes.NewBuffer(nil)
// execute the template
err = tmpl.Execute(result, struct {
ID int
Text string
}{
ID: templateID,
Text: text,
})
return result.String(), err
}
19 changes: 19 additions & 0 deletions pkg/templates/multimodal_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
package templates_test

import (
. "github.com/mudler/LocalAI/pkg/templates" // Update with your module path

// Update with your module path
. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"
)

var _ = Describe("EvaluateTemplate", func() {
Context("templating simple strings for multimodal chat", func() {
It("should template messages correctly", func() {
result, err := TemplateMultiModal("[img-{{.ID}}]{{.Text}}", 1, "bar")
Expect(err).NotTo(HaveOccurred())
Expect(result).To(Equal("[img-1]bar"))
})
})
})

0 comments on commit 648ffdf

Please sign in to comment.