Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[full-ci] experimental search backport #5221

Merged
merged 5 commits into from
Dec 13, 2022
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions changelog/unreleased/enhancement-search.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
Enhancement: Search enhancements

Provides multiple enhancements to the current search implementation.
* content extraction: search now supports Apache Tika to extract resource contents.
* search engine: the underlying search engine is now swappable.
* event consumers: the number of event consumers can now be set, which improves the speed of the individual tasks.

https://github.com/owncloud/ocis/pull/5221
2 changes: 2 additions & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ require (
github.com/Masterminds/semver v1.5.0
github.com/MicahParks/keyfunc v1.5.1
github.com/armon/go-radix v1.0.0
github.com/bbalet/stopwords v1.0.0
github.com/blevesearch/bleve/v2 v2.3.5
github.com/coreos/go-oidc/v3 v3.4.0
github.com/cs3org/go-cs3apis v0.0.0-20221012090518-ef2996678965
Expand Down Expand Up @@ -36,6 +37,7 @@ require (
github.com/gofrs/uuid v4.3.1+incompatible
github.com/golang-jwt/jwt/v4 v4.4.2
github.com/golang/protobuf v1.5.2
github.com/google/go-tika v0.2.0
github.com/gookit/config/v2 v2.1.8
github.com/gorilla/mux v1.8.0
github.com/grpc-ecosystem/grpc-gateway/v2 v2.13.0
Expand Down
5 changes: 5 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -220,6 +220,8 @@ github.com/aws/aws-sdk-go v1.43.11/go.mod h1:y4AeaBuwd2Lk+GepC1E9v0qOiTws0MIWAX4
github.com/aws/aws-sdk-go v1.44.114/go.mod h1:y4AeaBuwd2Lk+GepC1E9v0qOiTws0MIWAX4oIKwKHZo=
github.com/aws/aws-sdk-go v1.44.122 h1:p6mw01WBaNpbdP2xrisz5tIkcNwzj/HysobNoaAHjgo=
github.com/aws/aws-sdk-go v1.44.122/go.mod h1:y4AeaBuwd2Lk+GepC1E9v0qOiTws0MIWAX4oIKwKHZo=
github.com/bbalet/stopwords v1.0.0 h1:0TnGycCtY0zZi4ltKoOGRFIlZHv0WqpoIGUsObjztfo=
github.com/bbalet/stopwords v1.0.0/go.mod h1:sAWrQoDMfqARGIn4s6dp7OW7ISrshUD8IP2q3KoqPjc=
github.com/beevik/etree v1.1.0 h1:T0xke/WvNtMoCqgzPhkX2r4rjY3GDZFi+FjpRZY2Jbs=
github.com/beevik/etree v1.1.0/go.mod h1:r8Aw8JqVegEf0w2fDnATrX9VpkMcyFeM0FhwO62wh+A=
github.com/benbjohnson/clock v1.1.0 h1:Q92kusRqC1XV2MjkWETPvjJVqKetz1OzxZB7mHJLju8=
Expand Down Expand Up @@ -661,6 +663,8 @@ github.com/google/go-github/v32 v32.1.0/go.mod h1:rIEpZD9CTDQwDK9GDrtMTycQNA4JU3
github.com/google/go-querystring v1.0.0/go.mod h1:odCYkC5MyYFN7vkCjXpyrEuKhc/BUO6wN/zVPAxq5ck=
github.com/google/go-querystring v1.1.0 h1:AnCroh3fv4ZBgVIf1Iwtovgjaw/GiKJo8M8yD/fhyJ8=
github.com/google/go-querystring v1.1.0/go.mod h1:Kcdr2DB4koayq7X8pmAG4sNG59So17icRSOU623lUBU=
github.com/google/go-tika v0.2.0 h1:+1dnOoJ/pJrko2XH/3Rm5ssG9+ixOgjmPEz94ikUsxI=
github.com/google/go-tika v0.2.0/go.mod h1:vnMADwNG1A2AJx+ycQgTNMGe3ZG4CZUowEhK2FykumQ=
github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg=
github.com/google/gofuzz v1.2.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg=
github.com/google/martian v2.1.0+incompatible/go.mod h1:9I4somxYTbIHy5NJKHRl3wXiIaQGbYVAs8BPL6v8lEs=
Expand Down Expand Up @@ -1451,6 +1455,7 @@ golang.org/x/net v0.0.0-20190311183353-d8887717615a/go.mod h1:t9HGtf8HONx5eT2rtn
golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
golang.org/x/net v0.0.0-20190501004415-9ce7a6920f09/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
golang.org/x/net v0.0.0-20190503192946-f4e77d36d62c/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
golang.org/x/net v0.0.0-20190522155817-f3200d17e092/go.mod h1:HSz+uSET+XFnRR8LxR5pz3Of3rY3CfYBVs4xY44aLks=
golang.org/x/net v0.0.0-20190603091049-60506f45cf65/go.mod h1:HSz+uSET+XFnRR8LxR5pz3Of3rY3CfYBVs4xY44aLks=
golang.org/x/net v0.0.0-20190613194153-d28f0bde5980/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
Expand Down
6 changes: 4 additions & 2 deletions services/search/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,10 @@ include ../../.make/generate.mk

.PHONY: ci-go-generate
ci-go-generate: $(MOCKERY) # CI runs ci-node-generate automatically before this target
$(MOCKERY) --dir pkg/search --output pkg/search/mocks --case underscore --name IndexClient
$(MOCKERY) --dir pkg/search --output pkg/search/mocks --case underscore --name ProviderClient
$(MOCKERY) --dir pkg/engine --output pkg/engine/mocks --case underscore --name Engine
$(MOCKERY) --dir pkg/content --output pkg/content/mocks --case underscore --name Extractor
$(MOCKERY) --dir pkg/content --output pkg/content/mocks --case underscore --name Retriever
$(MOCKERY) --dir pkg/search --output pkg/search/mocks --case underscore --name Searcher

.PHONY: ci-node-generate
ci-node-generate:
Expand Down
8 changes: 6 additions & 2 deletions services/search/pkg/command/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -50,13 +50,18 @@ func Server(cfg *config.Config) *cli.Command {
mtrcs := metrics.New()
mtrcs.BuildInfo.WithLabelValues(version.GetString()).Set(1)

grpcServer := grpc.Server(
grpcServer, teardown, err := grpc.Server(
grpc.Config(cfg),
grpc.Logger(logger),
grpc.Name(cfg.Service.Name),
grpc.Context(ctx),
grpc.Metrics(mtrcs),
)
defer teardown()
if err != nil {
logger.Info().Err(err).Str("transport", "grpc").Msg("Failed to initialize server")
return err
}

gr.Add(grpcServer.Run, func(_ error) {
logger.Error().
Expand All @@ -71,7 +76,6 @@ func Server(cfg *config.Config) *cli.Command {
debug.Context(ctx),
debug.Config(cfg),
)

if err != nil {
logger.Info().Err(err).Str("transport", "debug").Msg("Failed to initialize server")
return err
Expand Down
15 changes: 2 additions & 13 deletions services/search/pkg/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,24 +18,13 @@ type Config struct {

GRPC GRPCConfig `yaml:"grpc"`

Datapath string `yaml:"data_path" env:"SEARCH_DATA_PATH" desc:"The directory where the filesystem storage will store search data. If not definied, the root directory derives from $OCIS_BASE_DATA_PATH:/search."`
DebounceDuration int `yaml:"debounce_duration" env:"SEARCH_REINDEX_DEBOUNCE_DURATION" desc:"The duration in milliseconds the reindex debouncer waits before triggering a reindex of a space that was modified."`

Reva *shared.Reva `yaml:"reva"`
GRPCClientTLS *shared.GRPCClientTLS `yaml:"grpc_client_tls"`
Events Events `yaml:"events"`
Engine Engine `yaml:"engine"`
Extractor Extractor `yaml:"extractor"`

MachineAuthAPIKey string `yaml:"machine_auth_api_key" env:"OCIS_MACHINE_AUTH_API_KEY;SEARCH_MACHINE_AUTH_API_KEY" desc:"Machine auth API key used to validate internal requests necessary for the access to resources from other services."`

Context context.Context `yaml:"-"`
}

// Events combines the configuration options for the event bus.
//
// For every option the struct tags name the YAML key (`yaml`), the environment
// variable(s) (`env`) and a human-readable description (`desc`). Where an `env`
// tag lists two names separated by ';', the first is a global OCIS_* variable
// and the second the SEARCH_* one.
type Events struct {
	Endpoint             string `yaml:"endpoint" env:"SEARCH_EVENTS_ENDPOINT" desc:"The address of the event system. The event system is the message queuing service. It is used as message broker for the microservice architecture."`
	Cluster              string `yaml:"cluster" env:"SEARCH_EVENTS_CLUSTER" desc:"The clusterID of the event system. The event system is the message queuing service. It is used as message broker for the microservice architecture. Mandatory when using NATS as event system."`
	ConsumerGroup        string `yaml:"group" env:"SEARCH_EVENTS_GROUP" desc:"The consumer group of the service. One group will only get one copy of an event"`
	// TLSInsecure disables certificate verification when true; the description
	// of TLSRootCACertificate below treats "insecure = false" as "validate".
	TLSInsecure          bool   `yaml:"tls_insecure" env:"OCIS_INSECURE;SEARCH_EVENTS_TLS_INSECURE" desc:"Whether to skip verification of the server TLS certificates."`
	TLSRootCACertificate string `yaml:"tls_root_ca_certificate" env:"SEARCH_EVENTS_TLS_ROOT_CA_CERTIFICATE" desc:"The root CA certificate used to validate the server's TLS certificate. If provided SEARCH_EVENTS_TLS_INSECURE will be seen as false."`
	EnableTLS            bool   `yaml:"enable_tls" env:"OCIS_EVENTS_ENABLE_TLS;SEARCH_EVENTS_ENABLE_TLS" desc:"Enable TLS for the connection to the events broker. The events broker is the ocis service which receives and delivers events between the services."`
}
13 changes: 13 additions & 0 deletions services/search/pkg/config/content.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
package config

// Extractor defines which extractor to use.
//
// The struct tags name the YAML key (`yaml`), the environment variable(s)
// (`env`) and a human-readable description (`desc`) of each option.
type Extractor struct {
// Type names the content extraction engine; the default configuration in
// this change sets it to "basic".
Type string `yaml:"type" env:"SEARCH_EXTRACTOR_TYPE" desc:"Defines the content extraction engine."`
// CS3AllowInsecure can be set through either of the two environment
// variables listed in the env tag.
// NOTE(review): precedence between the two names is decided by the env
// decoder, which is not visible here — confirm.
CS3AllowInsecure bool `yaml:"cs3_allow_insecure" env:"OCIS_INSECURE;SEARCH_EXTRACTOR_CS3SOURCE_INSECURE" desc:"Ignore untrusted SSL certificates when connecting to the CS3 source."`
// Tika groups the options that apply to the Tika extractor only.
Tika ExtractorTika `yaml:"tika"`
}

// ExtractorTika configures the Tika extractor.
type ExtractorTika struct {
// TikaURL is the URL of the tika server; the default configuration in this
// change points it at "http://127.0.0.1:9998".
TikaURL string `yaml:"tika_url" env:"SEARCH_EXTRACTOR_TIKA_TIKA_URL" desc:"URL of the tika server."`
}
26 changes: 19 additions & 7 deletions services/search/pkg/config/defaults/defaultconfig.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,14 +29,26 @@ func DefaultConfig() *config.Config {
Service: config.Service{
Name: "search",
},
Datapath: path.Join(defaults.BaseDataPath(), "search"),
DebounceDuration: 1000,
Reva: shared.DefaultRevaConfig(),
Reva: shared.DefaultRevaConfig(),
Engine: config.Engine{
Type: "bleve",
Bleve: config.EngineBleve{
Datapath: path.Join(defaults.BaseDataPath(), "search"),
fschade marked this conversation as resolved.
Show resolved Hide resolved
},
},
Extractor: config.Extractor{
Type: "basic",
CS3AllowInsecure: false,
Tika: config.ExtractorTika{
TikaURL: "http://127.0.0.1:9998",
},
},
Events: config.Events{
Endpoint: "127.0.0.1:9233",
Cluster: "ocis-cluster",
ConsumerGroup: "search",
EnableTLS: false,
Endpoint: "127.0.0.1:9233",
Cluster: "ocis-cluster",
DebounceDuration: 1000,
AsyncUploads: false,
EnableTLS: false,
},
MachineAuthAPIKey: "",
}
Expand Down
12 changes: 12 additions & 0 deletions services/search/pkg/config/engine.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
package config

// Engine defines which search engine to use.
//
// The struct tags name the YAML key (`yaml`), the environment variable
// (`env`) and a human-readable description (`desc`) of each option.
type Engine struct {
// Type names the search engine; the default configuration in this change
// sets it to "bleve".
Type string `yaml:"type" env:"SEARCH_ENGINE_TYPE" desc:"Defines which search engine to use."`
// Bleve groups the options that apply to the bleve engine only.
Bleve EngineBleve `yaml:"bleve"`
}

// EngineBleve configures the bleve engine.
type EngineBleve struct {
// Datapath is the directory used to persist search data; the default
// configuration in this change sets it to "search" below the base data path.
Datapath string `yaml:"data_path" env:"SEARCH_ENGINE_BLEVE_DATA_PATH" desc:"Path for the search persistence directory."`
}
6 changes: 6 additions & 0 deletions services/search/pkg/config/reva.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
package config

// Reva defines all available REVA configuration.
//
// NOTE(review): this struct tags its key with `ocisConfig` while the other
// config structs of this service use `yaml`, and the service Config refers to
// shared.Reva rather than to this type — confirm whether this type is still
// needed and which tag the config loader actually reads.
type Reva struct {
// Address is the CS3 gateway endpoint, settable through REVA_GATEWAY.
Address string `ocisConfig:"address" env:"REVA_GATEWAY" desc:"The CS3 gateway endpoint."`
}
14 changes: 14 additions & 0 deletions services/search/pkg/config/search.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
package config

// Events combines the configuration options for the event bus.
//
// For every option the struct tags name the YAML key (`yaml`), the environment
// variable(s) (`env`) and a human-readable description (`desc`). Where an `env`
// tag lists two names separated by ';', either variable can be used to set the
// option.
type Events struct {
	Endpoint         string `yaml:"endpoint" env:"SEARCH_EVENTS_ENDPOINT" desc:"The address of the event system. The event system is the message queuing service. It is used as message broker for the microservice architecture."`
	Cluster          string `yaml:"cluster" env:"SEARCH_EVENTS_CLUSTER" desc:"The clusterID of the event system. The event system is the message queuing service. It is used as message broker for the microservice architecture. Mandatory when using NATS as event system."`
	AsyncUploads     bool   `yaml:"async_uploads" env:"STORAGE_USERS_OCIS_ASYNC_UPLOADS;SEARCH_EVENTS_ASYNC_UPLOADS" desc:"Enable asynchronous file uploads."`
	NumConsumers     int    `yaml:"num_consumers" env:"SEARCH_EVENTS_NUM_CONSUMERS" desc:"number of event consumers per service instance"`
	DebounceDuration int    `yaml:"debounce_duration" env:"SEARCH_REINDEX_DEBOUNCE_DURATION" desc:"The duration in milliseconds the reindex debouncer waits before triggering a reindex of a space that was modified."`

	// TLSInsecure disables certificate verification when true; the description
	// of TLSRootCACertificate below treats "insecure = false" as "validate".
	TLSInsecure          bool   `yaml:"tls_insecure" env:"OCIS_INSECURE;SEARCH_EVENTS_TLS_INSECURE" desc:"Whether to skip verification of the server TLS certificates."`
	TLSRootCACertificate string `yaml:"tls_root_ca_certificate" env:"SEARCH_EVENTS_TLS_ROOT_CA_CERTIFICATE" desc:"The root CA certificate used to validate the server's TLS certificate. If provided SEARCH_EVENTS_TLS_INSECURE will be seen as false."`
	EnableTLS            bool   `yaml:"enable_tls" env:"OCIS_EVENTS_ENABLE_TLS;SEARCH_EVENTS_ENABLE_TLS" desc:"Enable TLS for the connection to the events broker. The events broker is the ocis service which receives and delivers events between the services."`
}
41 changes: 41 additions & 0 deletions services/search/pkg/content/basic.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
package content

import (
"context"
"time"

storageProvider "github.com/cs3org/go-cs3apis/cs3/storage/provider/v1beta1"
//"github.com/cs3org/reva/v2/pkg/tags"
"github.com/owncloud/ocis/v2/ocis-pkg/log"
)

// Basic is the simplest Extractor implementation.
// Its Extract method builds a Document from the fields of a ResourceInfo only.
type Basic struct {
// logger is set by NewBasicExtractor; Extract does not use it.
logger log.Logger
}

// NewBasicExtractor builds a Basic extractor that keeps the given logger.
// The returned error is always nil.
func NewBasicExtractor(logger log.Logger) (*Basic, error) {
	extractor := new(Basic)
	extractor.logger = logger

	return extractor, nil
}

// Extract maps the metadata of ri onto a Document: Path becomes Name, Size and
// MimeType are copied as they are, and Mtime, when set, is rendered as an
// RFC 3339 timestamp in UTC. The context is ignored and the returned error is
// always nil.
func (b Basic) Extract(_ context.Context, ri *storageProvider.ResourceInfo) (Document, error) {
	var doc Document
	doc.Name = ri.Path
	doc.Size = ri.Size
	doc.MimeType = ri.MimeType

	// Tag extraction is currently disabled, the code is kept for reference:
	//if m := ri.ArbitraryMetadata.GetMetadata(); m != nil {
	//	if t, ok := m["tags"]; ok {
	//		doc.Tags = tags.FromList(t).AsSlice()
	//	}
	//}

	// Without a modification time doc.Mtime stays the empty string.
	if mtime := ri.Mtime; mtime != nil {
		modified := time.Unix(int64(mtime.Seconds), int64(mtime.Nanos))
		doc.Mtime = modified.UTC().Format(time.RFC3339)
	}

	return doc, nil
}
89 changes: 89 additions & 0 deletions services/search/pkg/content/basic_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
package content_test

import (
"context"

storageProvider "github.com/cs3org/go-cs3apis/cs3/storage/provider/v1beta1"
cs3Types "github.com/cs3org/go-cs3apis/cs3/types/v1beta1"
. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"
"github.com/owncloud/ocis/v2/ocis-pkg/log"
"github.com/owncloud/ocis/v2/services/search/pkg/content"
)

// Specs for the Basic extractor: it must copy path, size and mime type from the
// ResourceInfo and render the modification time as an RFC 3339 UTC string.
//
// Extract returns a Document by value, so the specs assert on its fields; a
// nil check on the value itself could never fail.
var _ = Describe("Basic", func() {
	var (
		basic  content.Extractor
		logger = log.NewLogger()
		ctx    = context.TODO()
	)

	BeforeEach(func() {
		var err error
		basic, err = content.NewBasicExtractor(logger)
		// Report a failing constructor here instead of in the specs that use it.
		Expect(err).ToNot(HaveOccurred())
	})

	Describe("extract", func() {
		It("basic fields", func() {
			ri := &storageProvider.ResourceInfo{
				Path:     "./foo/bar.pdf",
				Size:     1024,
				MimeType: "application/pdf",
			}

			doc, err := basic.Extract(ctx, ri)

			Expect(err).ToNot(HaveOccurred())
			Expect(doc.Name).To(Equal(ri.Path))
			Expect(doc.Size).To(Equal(ri.Size))
			Expect(doc.MimeType).To(Equal(ri.MimeType))
		})

		/*It("adds tags", func() {
			for _, data := range []struct {
				tags   string
				expect []string
			}{
				{tags: "", expect: []string{}},
				{tags: ",,,", expect: []string{}},
				{tags: ",foo,,", expect: []string{"foo"}},
				{tags: ",foo,,bar,", expect: []string{"foo", "bar"}},
			} {
				ri := &storageProvider.ResourceInfo{
					ArbitraryMetadata: &storageProvider.ArbitraryMetadata{
						Metadata: map[string]string{
							"tags": data.tags,
						},
					},
				}

				doc, err := basic.Extract(ctx, ri)
				Expect(err).To(BeNil())
				Expect(doc).ToNot(BeNil())
				Expect(doc.Tags).To(Equal(data.expect))
			}
		})*/

		It("RFC3339 mtime", func() {
			for _, data := range []struct {
				second uint64
				expect string
			}{
				{second: 4000, expect: "1970-01-01T01:06:40Z"},
				{second: 3000, expect: "1970-01-01T00:50:00Z"},
				// No mtime at all: the Document keeps an empty Mtime.
				{expect: ""},
			} {
				ri := &storageProvider.ResourceInfo{}

				if data.second != 0 {
					ri.Mtime = &cs3Types.Timestamp{Seconds: data.second}
				}

				doc, err := basic.Extract(ctx, ri)
				Expect(err).ToNot(HaveOccurred())
				Expect(doc.Mtime).To(Equal(data.expect))
			}
		})
	})
})
13 changes: 13 additions & 0 deletions services/search/pkg/content/content.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
package content

// Document wraps all resource meta fields,
// it is used as a content extraction result.
type Document struct {
Title string
Name string
Content string
Size uint64
Mtime string
MimeType string
//Tags []string
}
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
package provider_test
package content_test

import (
"testing"
Expand All @@ -7,7 +7,7 @@ import (
. "github.com/onsi/gomega"
)

func TestProvider(t *testing.T) {
func TestContent(t *testing.T) {
RegisterFailHandler(Fail)
RunSpecs(t, "Provider Suite")
RunSpecs(t, "Content Suite")
}
Loading