diff --git a/.gitignore b/.gitignore index 7ab3181..e69de29 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +0,0 @@ -.idea/ -.vscode/ -*.json -.env \ No newline at end of file diff --git a/.vscode/launch.json b/.vscode/launch.json new file mode 100644 index 0000000..9104999 --- /dev/null +++ b/.vscode/launch.json @@ -0,0 +1,26 @@ +{ + // Use IntelliSense to learn about possible attributes. + // Hover to view descriptions of existing attributes. + // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 + "version": "0.2.0", + "configurations": [ + { + "name": "NLP tokenize example", + "type": "go", + "request": "launch", + "mode": "auto", + "program": "${workspaceFolder}/examples/nlp.go", + "env": { + "GOOGLE_NLP_CREDS_PATH": "${input:google_nlp_creds_path}" + }, + "args": [] + } + ], + "inputs": [ + { + "id": "google_nlp_creds_path", + "description": "Google Natural Language API credentials path", + "type": "promptString" + } + ] +} \ No newline at end of file diff --git a/LICENSE b/LICENSE index d69b668..281de00 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ MIT License -Copyright (c) 2020 Julian Claus +Copyright (c) 2025 Julian Claus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/Makefile b/Makefile index e82ffef..8c582b4 100644 --- a/Makefile +++ b/Makefile @@ -15,17 +15,14 @@ build: windows linux darwin @echo version: $(VERSION) windows: $(WINDOWS) - -linux: $(LINUX) - -darwin: $(DARWIN) - $(WINDOWS): env GOOS=windows GOARCH=amd64 go build -v -o bin/$(WINDOWS) -ldflags="-s -w -X main.version=$(VERSION)" ./cli/main.go +linux: $(LINUX) $(LINUX): env GOOS=linux GOARCH=amd64 go build -v -o bin/$(LINUX) -ldflags="-s -w -X main.version=$(VERSION)" ./cli/main.go +darwin: $(DARWIN) $(DARWIN): env GOOS=darwin GOARCH=amd64 go build -v -o bin/$(DARWIN) -ldflags="-s -w -X main.version=$(VERSION)" ./cli/main.go diff --git a/README.md b/README.md index 4c84719..fe562ef 100644 --- a/README.md +++ b/README.md @@ -1,166 +1,3 @@ -# assocentity - -[![Go Report Card](https://goreportcard.com/badge/github.com/ndabAP/assocentity/v13)](https://goreportcard.com/report/github.com/ndabAP/assocentity/v13) - -Package assocentity is a social science tool to analyze the relative distance -from tokens to entities. The motiviation is to make conclusions based on the -distance from interesting tokens to a certain entity and its synonyms. - -## Features - -- Provide your own tokenizer -- Provides a default NLP tokenizer (by Google) -- Define aliases for entities -- Provides a multi-OS, language-agnostic CLI version - -## Installation - -```bash -$ go get github.com/ndabAP/assocentity/v13 -``` - -## Prerequisites - -If you want to analyze human readable texts you can use the provided Natural -Language tokenizer (powered by Google). To do so, sign-up for a Cloud Natural -Language API service account key and download the generated JSON file. This -equals the `credentialsFile` at the example below. You should never commit that -file. - -A possible offline tokenizer would be a white space tokenizer. You also might -use a parser depending on your purposes. - -## Example - -We would like to find out which adjectives are how close in average to a certain -public person. Let's take George W. Bush and 1,000 NBC news articles as an -example. "George Bush" is the entity and synonyms are "George Walker Bush" and -"Bush" and so on. The text is each of the 1,000 NBC news articles. 
- -Defining a text source and to set the entity would be first step. Next, we need -to instantiate our tokenizer. In this case, we use the provided Google NLP -tokenizer. Finally, we can calculate our mean distances. We can use -`assocentity.Distances`, which accepts multiple texts. Notice -how we pass `tokenize.ADJ` to only include adjectives as part of speech. -Finally, we can take the mean by passing the result to `assocentity.Mean`. - -```go -// Define texts source and entity -texts := []string{ - "Former Presidents Barack Obama, Bill Clinton and ...", // Truncated - "At the pentagon on the afternoon of 9/11, ...", - "Tony Blair moved swiftly to place his relationship with ...", -} -entities := []string{ - "Goerge Walker Bush", - "Goerge Bush", - "Bush", -} -source := assocentity.NewSource(entities, texts) - -// Instantiate the NLP tokenizer (powered by Google) -nlpTok := nlp.NewNLPTokenizer(credentialsFile, nlp.AutoLang) - -// Get the distances to adjectives -ctx := context.TODO() -dists, err := assocentity.Distances(ctx, nlpTok, tokenize.ADJ, source) -if err != nil { - // Handle error -} -// Get the mean from the distances -mean := assocentity.Mean(dists) -``` - -### Tokenization - -If you provide your own tokenizer you must implement the interface with the -method `Tokenize` and the following signature: - -```go -type Tokenizer interface { - Tokenize(ctx context.Context, text string) ([]Token, error) -} -``` - -`Token` is of type: - -```go -type Token struct { - PoS PoS // Part of speech - Text string // Text -} - -// Part of speech -type PoS int -``` - -For example, given the text: - -```go -text := "Punchinello was burning to get me" -``` - -The result from `Tokenize` would be: - -```go -[]Token{ - { - Text: "Punchinello", - PoS: tokenize.NOUN, - }, - { - Text: "was", - PoS: tokenize.VERB, - }, - { - Text: "burning", - PoS: tokenize.VERB, - }, - { - Text: "to", - PoS: tokenize.PRT, - }, - { - Text: "get", - PoS: tokenize.VERB, - }, - { - Text: "me", - PoS: tokenize.PRON, - }, -} -``` - -## CLI - -There is also a language-agnostic terminal version available for either Windows, -Mac (Darwin) or Linux (only with 64-bit support) if you don't have Go available. -The application expects the text from "stdin" and accepts the following flags: - -| Flag | Description | Type | Default | -| ------------- | ------------------------------------------------------------------------------------------------- | -------- | ------- | -| `entities` | Define entities to be searched within input, example: `-entities="Max Payne,Payne"` | `string` | | -| `gog-svc-loc` | Google Clouds NLP JSON service account file, example: `-gog-svc-loc="/home/max/gog-svc-loc.json"` | `string` | | -| `op` | Operation to excute: `-op="mean"` | `string` | `mean` | -| `pos` | Defines part of speeches to keep, example: `-pos=noun,verb,pron` | `string` | `any` | - -Example: - -```bash -echo "Relax, Max. You're a nice guy." | ./bin/assocentity_linux_amd64_v13.0.0-0-g948274a-dirty -gog-svc-loc=/home/max/.config/assocentity/google-service.json -entities="Max Payne,Payne,Max" -``` - -The output is written to "stdout" in appropoiate formats. - -## Projects using assocentity - -- [entityscrape](https://github.com/ndabAP/entityscrape) - Distance between word - types (default: adjectives) in news articles and persons - -## Author - -[Julian Claus](https://www.julian-claus.de) and contributors. 
- -## License - -MIT +- TODO: Entities European leaders, reuse text entities, create sentiment with +descriptions like "Macron is a bad president" +- WithLema normalizer \ No newline at end of file diff --git a/bin/assocentity_darwin_amd64_v13.0.3-0-g9d9e27c-dirty b/__debug_bin2883779316 similarity index 51% rename from bin/assocentity_darwin_amd64_v13.0.3-0-g9d9e27c-dirty rename to __debug_bin2883779316 index f6fc464..b26ae57 100755 Binary files a/bin/assocentity_darwin_amd64_v13.0.3-0-g9d9e27c-dirty and b/__debug_bin2883779316 differ diff --git a/__debug_bin3359489860 b/__debug_bin3359489860 new file mode 100755 index 0000000..750303f Binary files /dev/null and b/__debug_bin3359489860 differ diff --git a/analyses.go b/analyses.go new file mode 100644 index 0000000..3c5c95d --- /dev/null +++ b/analyses.go @@ -0,0 +1,48 @@ +package assocentity + +import ( + "iter" + + "github.com/ndabAP/assocentity/v15/tokenize" +) + +type ( + Analyses struct { + frames []frame + entities map[string][]*tokenize.Token + } + + frame struct { + dataset tokenize.Dataset + locs map[int]int + } +) + +func (frame frame) Sentences() iter.Seq[[]*tokenize.Token] { + return func(yield func([]*tokenize.Token) bool) { + var ( + sentences = frame.dataset.Sentences + tokens = frame.dataset.Tokens + ) + for i := range sentences { + var next int32 = 0 + if i < len(sentences)-1 { + next = sentences[i+1].Text.BeginOffset + } + + toks := make([]*tokenize.Token, 0) + for _, token := range tokens { + if next == token.Text.BeginOffset { + // Next sentence + break + } + + toks = append(toks, token) + } + + if !yield(toks) { + break + } + } + } +} diff --git a/analyses_deps.go b/analyses_deps.go new file mode 100644 index 0000000..27cb689 --- /dev/null +++ b/analyses_deps.go @@ -0,0 +1,27 @@ +package assocentity + +import ( + "cloud.google.com/go/language/apiv1beta2/languagepb" +) + +type PartOfSpeech = languagepb.PartOfSpeech_Tag + +func (analyses Analyses) Deps(pos PartOfSpeech) []string { + // for i, text := range analyses.dataset { + // coords := analyses.coords(i) + + // // TODO: For each sentence + + // for _, token := range text.Tokens { + // if token == tokenize.NilToken { + // continue + // } + + // for s, e := range coords { + // if token.DependencyEdge.HeadTokenIndex == int32(s) { + // } + // } + // } + // } + return nil +} diff --git a/analyses_test.go b/analyses_test.go new file mode 100644 index 0000000..27133e4 --- /dev/null +++ b/analyses_test.go @@ -0,0 +1,154 @@ +package assocentity + +import ( + "bytes" + "encoding/json" + "reflect" + "testing" + + "cloud.google.com/go/language/apiv1beta2/languagepb" + "github.com/ndabAP/assocentity/v15/tokenize" +) + +func TestFrameSentences(t *testing.T) { + tests := []struct { + frame frame + want [][]*tokenize.Token + }{ + { + frame: frame{ + dataset: tokenize.Dataset{ + Sentences: []*tokenize.Sentence{ + {Text: &languagepb.TextSpan{BeginOffset: 0}}, + }, + Tokens: []*tokenize.Token{ + {Text: &languagepb.TextSpan{BeginOffset: 0}}, + {Text: &languagepb.TextSpan{BeginOffset: 3}}, + {Text: &languagepb.TextSpan{BeginOffset: 6}}, + }, + }, + }, + want: [][]*tokenize.Token{ + { + {Text: &languagepb.TextSpan{BeginOffset: 0}}, + {Text: &languagepb.TextSpan{BeginOffset: 3}}, + {Text: &languagepb.TextSpan{BeginOffset: 6}}, + }, + }, + }, + { + frame: frame{ + dataset: tokenize.Dataset{ + Sentences: []*tokenize.Sentence{ + {Text: &languagepb.TextSpan{BeginOffset: 0}}, + {Text: &languagepb.TextSpan{BeginOffset: 10}}, + }, + Tokens: []*tokenize.Token{ + {Text: 
&languagepb.TextSpan{BeginOffset: 0}}, + {Text: &languagepb.TextSpan{BeginOffset: 3}}, + {Text: &languagepb.TextSpan{BeginOffset: 6}}, + {Text: &languagepb.TextSpan{BeginOffset: 10}}, + {Text: &languagepb.TextSpan{BeginOffset: 12}}, + }, + }, + }, + want: [][]*tokenize.Token{ + { + {Text: &languagepb.TextSpan{BeginOffset: 0}}, + {Text: &languagepb.TextSpan{BeginOffset: 3}}, + {Text: &languagepb.TextSpan{BeginOffset: 6}}, + }, + { + {Text: &languagepb.TextSpan{BeginOffset: 10}}, + {Text: &languagepb.TextSpan{BeginOffset: 12}}, + }, + }, + }, + { + frame: frame{ + dataset: tokenize.Dataset{ + Sentences: []*tokenize.Sentence{ + {Text: &languagepb.TextSpan{BeginOffset: 0}}, + {Text: &languagepb.TextSpan{BeginOffset: 10}}, + }, + Tokens: []*tokenize.Token{ + {Text: &languagepb.TextSpan{BeginOffset: 0}}, + {Text: &languagepb.TextSpan{BeginOffset: 3}}, + {Text: &languagepb.TextSpan{BeginOffset: 6}}, + {Text: &languagepb.TextSpan{BeginOffset: 10}}, + }, + }, + }, + want: [][]*tokenize.Token{ + { + {Text: &languagepb.TextSpan{BeginOffset: 0}}, + {Text: &languagepb.TextSpan{BeginOffset: 3}}, + {Text: &languagepb.TextSpan{BeginOffset: 6}}, + }, + { + {Text: &languagepb.TextSpan{BeginOffset: 10}}, + }, + }, + }, + } + + for _, test := range tests { + var got [][]*tokenize.Token + for t := range test.frame.Sentences() { + got = append(got, t) + } + + if !bytes.Equal(marshalJSON(t, got), marshalJSON(t, test.want)) { + t.Errorf("frame.Sentences() = %v, want %v", got, test.want) + } + + } +} + +func marshalJSON(t *testing.T, v any) (b []byte) { + t.Helper() + m := structToMap(v) + b, _ = json.Marshal(m) + return +} + +func structToMap(s interface{}) map[string]interface{} { + v := reflect.ValueOf(s) + + if v.Kind() == reflect.Ptr { + v = v.Elem() // Dereference the pointer if it is one + } + + if v.Kind() != reflect.Struct { + panic("not a struct") + } + + m := make(map[string]interface{}) + for i := 0; i < v.NumField(); i++ { + f := v.Type().Field(i) + fieldValue := v.Field(i) + + fieldMap := make(map[string]interface{}) + fieldMap["name"] = f.Name + fieldMap["type"] = f.Type.String() + + if f.PkgPath == "" { // Exported field + switch fieldValue.Kind() { + case reflect.Slice: + sliceValues := make([]interface{}, fieldValue.Len()) + for j := 0; j < fieldValue.Len(); j++ { + sliceValues[j] = fieldValue.Index(j).Interface() + } + fieldMap["value"] = sliceValues + case reflect.Struct: + fieldMap["value"] = structToMap(fieldValue.Interface()) // Recursive call + default: + fieldMap["value"] = fieldValue.Interface() + } + } else { + // Unexported field - value not accessible + } + m[f.Name] = fieldMap + } + return m +} diff --git a/analyses_vecs.go b/analyses_vecs.go new file mode 100644 index 0000000..42bebf2 --- /dev/null +++ b/analyses_vecs.go @@ -0,0 +1,60 @@ +package assocentity + +import ( + "golang.org/x/exp/constraints" +) + +// Vecs returns a vector from each token to every entity found +// func (analyses Analyses) Vecs() map[*tokenize.Token][]int { +// vecs := make(map[*tokenize.Token][]int) +// for i, text := range analyses.texts { +// coords := analyses.coords(i) + +// // Foward +// func() { +// next, stop := iter.Pull(maps.Values(coords)) +// defer stop() +// for j, token := range text.Tokens { +// if token == tokenize.NilToken { +// continue +// } + +// pairs, ok := next() +// if !ok { +// continue +// } +// for i := range pairs.x1s() { +// vecs[token] = append(vecs[token], abs(i-j)) +// } +// } +// }() + +// // Backward +// func() { +// next, stop := iter.Pull(maps.Values(coords)) +// defer 
stop() +// for j, token := range slices.Backward(text.Tokens) { +// if token == tokenize.NilToken { +// continue +// } + +// pairs, ok := next() +// if !ok { +// continue +// } +// for i := range pairs.x2s() { +// vecs[token] = append(vecs[token], abs(i-j)) // 8-5 +// } +// } +// }() +// } + +// return vecs +// } + +func abs[T constraints.Float | constraints.Integer](x T) T { + if x < 0 { + return -x + } + return x +} diff --git a/assocentity.go b/assocentity.go deleted file mode 100644 index b6a3469..0000000 --- a/assocentity.go +++ /dev/null @@ -1,185 +0,0 @@ -package assocentity - -import ( - "context" - "math" - - "github.com/ndabAP/assocentity/v13/internal/comp" - "github.com/ndabAP/assocentity/v13/internal/iterator" - "github.com/ndabAP/assocentity/v13/internal/pos" - "github.com/ndabAP/assocentity/v13/tokenize" -) - -// source wraps entities and texts -type source struct { - Entities []string - Texts []string -} - -// NewSource returns a new source consisting of entities and texts -func NewSource(entities, texts []string) source { - return source{ - Entities: entities, - Texts: texts, - } -} - -// Distances returns the distances from entities to a list of texts -func Distances( - ctx context.Context, - tokenizer tokenize.Tokenizer, - poS tokenize.PoS, - source source, -) (map[tokenize.Token][]float64, error) { - var ( - dists = make(map[tokenize.Token][]float64) - err error - ) - for _, text := range source.Texts { - d, err := distances(ctx, tokenizer, poS, text, source.Entities) - if err != nil { - return dists, err - } - - for tok, dist := range d { - dists[tok] = append(dists[tok], dist...) - } - } - - return dists, err -} - -// distances returns the distances to entities for one text -func distances( - ctx context.Context, - tokenizer tokenize.Tokenizer, - poS tokenize.PoS, - text string, - entities []string, -) (map[tokenize.Token][]float64, error) { - var ( - dists = make(map[tokenize.Token][]float64) - err error - ) - - // Tokenize text - textTokens, err := tokenizer.Tokenize(ctx, text) - if err != nil { - return dists, err - } - - // Tokenize entities - var entityTokens [][]tokenize.Token - for _, entity := range entities { - tokens, err := tokenizer.Tokenize(ctx, entity) - if err != nil { - return dists, err - } - entityTokens = append(entityTokens, tokens) - } - - // Determinate part of speech - posDetermer := pos.NewPoSDetermer(poS) - determTokens := posDetermer.DetermPoS(textTokens, entityTokens) - - // Check if any given PoS was found in text tokens - if len(determTokens) == 0 { - return dists, nil - } - - // Create iterators - - determTokensIter := iterator.New(determTokens) - - // Use iterators to search for entities in positive and negative direction - posDirIter := iterator.New(determTokens) - negDirIter := iterator.New(determTokens) - - entityTokensIter := iterator.New(entityTokens) - - // Iterate through part of speech determinated text tokens - for determTokensIter.Next() { - // If the current text token is an entity, we skip about the entity - currDetermTokensPos := determTokensIter.CurrPos() - isEntity, entity := comp.TextWithEntities( - determTokensIter, - entityTokensIter, - comp.DirPos, - ) - if isEntity { - determTokensIter.Forward(len(entity) - 1) - continue - } - - // Now we can collect the actual distances - - // Finds/counts entities in positive direction - posDirIter.SetPos(currDetermTokensPos) - for posDirIter.Next() { - // [I, was, (with), Max, Payne, here] -> true, Max Payne - // [I, was, with, Max, Payne, (here)] -> false, "" - isEntity, entity := 
comp.TextWithEntities( - posDirIter, - entityTokensIter, - comp.DirPos, - ) - if isEntity { - appendDist(dists, determTokensIter, posDirIter) - // Skip about entity - posDirIter.Forward(len(entity) - 1) // Next increments - } - } - - // Finds/counts entities in negative direction - negDirIter.SetPos(currDetermTokensPos) - for negDirIter.Prev() { - // [I, was, (with), Max, Payne, here] -> false, "" - // [I, was, with, Max, Payne, (here)] -> true, Max Payne - isEntity, entity := comp.TextWithEntities( - negDirIter, - entityTokensIter, - comp.DirNeg, - ) - if isEntity { - appendDist(dists, determTokensIter, negDirIter) - negDirIter.Rewind(len(entity) - 1) - } - } - } - - return dists, err -} - -// Helper to append a float64 to a map of tokens and distances -func appendDist( - m map[tokenize.Token][]float64, - k *iterator.Iterator[tokenize.Token], - v *iterator.Iterator[tokenize.Token], -) { - token := k.CurrElem() - dist := math.Abs(float64(v.CurrPos() - k.CurrPos())) - m[token] = append(m[token], dist) -} - -// Mean returns the mean of the provided distances -func Mean(dists map[tokenize.Token][]float64) map[tokenize.Token]float64 { - mean := make(map[tokenize.Token]float64) - for token, d := range dists { - mean[token] = meanFloat64(d) - } - return mean -} - -// Returns the mean of a 64-bit float slice -func meanFloat64(xs []float64) float64 { - // Prevent /0 - if len(xs) == 0 { - return 0 - } - - sum := 0.0 - for _, x := range xs { - sum += x - } - return sum / float64(len(xs)) -} diff --git a/assocentity_test.go b/assocentity_test.go deleted file mode 100644 index 775c509..0000000 --- a/assocentity_test.go +++ /dev/null @@ -1,340 +0,0 @@ -package assocentity - -import ( - "context" - "reflect" - "strings" - "testing" - - "github.com/ndabAP/assocentity/v13/tokenize" -) - -// whiteSpaceTokenizer tokenizes a text by empty space and assigns unknown -// pos -type whiteSpaceTokenizer int - -func (t whiteSpaceTokenizer) Tokenize(ctx context.Context, text string) ([]tokenize.Token, error) { - spl := strings.Split(text, " ") - tokens := make([]tokenize.Token, 0) - for _, s := range spl { - tokens = append(tokens, tokenize.Token{ - PoS: tokenize.UNKN, - Text: s, - }) - } - - return tokens, nil -} - -func TestMean(t *testing.T) { - type args struct { - ctx context.Context - tokenizer tokenize.Tokenizer - poS tokenize.PoS - texts []string - entities []string - } - tests := []struct { - args args - want map[tokenize.Token]float64 - wantErr bool - }{ - { - args: args{ - ctx: context.Background(), - tokenizer: new(whiteSpaceTokenizer), - poS: tokenize.ANY, - texts: []string{ - "AA B $ CCC ++", - "$ E ++ AA $ B", - }, - entities: []string{"$", "++"}, - }, - want: map[tokenize.Token]float64{ - { - PoS: tokenize.UNKN, - Text: "AA", - }: 2.2, - { - PoS: tokenize.UNKN, - Text: "B", - }: 2.6, - { - PoS: tokenize.UNKN, - Text: "CCC", - }: 1, - { - PoS: tokenize.UNKN, - Text: "E", - }: 1.6666666666666667, - }, - }, - } - for _, tt := range tests { - t.Run("", func(t *testing.T) { - source := NewSource(tt.args.entities, tt.args.texts) - dists, err := Distances( - tt.args.ctx, - tt.args.tokenizer, - tt.args.poS, - source, - ) - if err != nil { - t.Error(err) - } - - got := Mean(dists) - if !reflect.DeepEqual(got, tt.want) { - t.Errorf("Mean() = %v, want %v", got, tt.want) - } - }) - } -} - -// concreteTokenizer is a tokenizer with a fixed set of tokens -type concreteTokenizer int - -func (t concreteTokenizer) Tokenize(ctx context.Context, text string) ([]tokenize.Token, error) { - spl := strings.Split(text, " ") 
- tokens := make([]tokenize.Token, 0) - for _, s := range spl { - var poS tokenize.PoS - switch s { - case "English": - poS = tokenize.NOUN - - case ".": - poS = tokenize.PUNCT - - case "run": - poS = tokenize.VERB - - default: - continue - } - - tokens = append(tokens, tokenize.Token{ - PoS: poS, - Text: s, - }) - } - - return tokens, nil -} - -func Test_distances(t *testing.T) { - type args struct { - ctx context.Context - tokenizer tokenize.Tokenizer - poS tokenize.PoS - text string - entities []string - } - tests := []struct { - args args - want map[tokenize.Token][]float64 - wantErr bool - }{ - { - args: args{ - ctx: context.Background(), - tokenizer: new(concreteTokenizer), - poS: tokenize.NOUN | tokenize.PUNCT | tokenize.VERB, - text: "English x . x xx run", - entities: []string{"run"}, - }, - want: map[tokenize.Token][]float64{ - { - PoS: tokenize.NOUN, - Text: "English", - }: {2}, - { - PoS: tokenize.PUNCT, - Text: ".", - }: {1}, - }, - }, - } - for _, tt := range tests { - t.Run("", func(t *testing.T) { - got, err := distances( - tt.args.ctx, - tt.args.tokenizer, - tt.args.poS, - tt.args.text, - tt.args.entities, - ) - if err != nil { - t.Error(err) - } - if !reflect.DeepEqual(got, tt.want) { - t.Errorf("dist() = %v, want %v", got, tt.want) - } - }) - } -} - -func TestNormalize(t *testing.T) { - t.Run("HumandReadableNormalizer", func(t *testing.T) { - got := map[tokenize.Token][]float64{ - { - PoS: tokenize.UNKN, - Text: "A", - }: {}, - { - PoS: tokenize.UNKN, - Text: "a", - }: {}, - { - PoS: tokenize.UNKN, - Text: "b", - }: {}, - { - PoS: tokenize.UNKN, - Text: "&", - }: {}, - } - want := map[tokenize.Token][]float64{ - { - PoS: tokenize.UNKN, - Text: "a", - }: {}, - { - PoS: tokenize.UNKN, - Text: "b", - }: {}, - { - PoS: tokenize.UNKN, - Text: "and", - }: {}, - } - Normalize(got, HumandReadableNormalizer) - - if !reflect.DeepEqual(got, want) { - t.Errorf("Normalize() = %v, want %v", got, want) - } - }) -} - -func TestThreshold(t *testing.T) { - type args struct { - dists map[tokenize.Token][]float64 - threshold float64 - } - tests := []struct { - args args - want map[tokenize.Token][]float64 - }{ - { - args: args{ - dists: map[tokenize.Token][]float64{ - { - PoS: tokenize.UNKN, - Text: "A", - }: {1}, - { - PoS: tokenize.UNKN, - Text: "B", - }: {1, 1}, - { - PoS: tokenize.UNKN, - Text: "C", - }: {1, 1, 1}, - { - PoS: tokenize.UNKN, - Text: "D", - }: {1, 1, 1}, - }, - threshold: 75, - }, - want: map[tokenize.Token][]float64{ - { - PoS: tokenize.UNKN, - Text: "C", - }: {1, 1, 1}, - { - PoS: tokenize.UNKN, - Text: "D", - }: {1, 1, 1}, - }, - }, - { - args: args{ - dists: map[tokenize.Token][]float64{ - { - PoS: tokenize.UNKN, - Text: "A", - }: {1}, - { - PoS: tokenize.UNKN, - Text: "B", - }: {1, 1}, - { - PoS: tokenize.UNKN, - Text: "C", - }: {1, 1, 1}, - { - PoS: tokenize.UNKN, - Text: "D", - }: {1, 1, 1, 1}, - }, - threshold: 76, - }, - want: map[tokenize.Token][]float64{ - { - PoS: tokenize.UNKN, - Text: "D", - }: {1, 1, 1, 1}, - }, - }, - { - args: args{ - dists: map[tokenize.Token][]float64{ - { - PoS: tokenize.UNKN, - Text: "A", - }: {1}, - { - PoS: tokenize.UNKN, - Text: "B", - }: {1, 1}, - { - PoS: tokenize.UNKN, - Text: "C", - }: {1, 1, 1}, - { - PoS: tokenize.UNKN, - Text: "D", - }: {1, 1, 1, 1}, - }, - threshold: 1, - }, - want: map[tokenize.Token][]float64{ - { - PoS: tokenize.UNKN, - Text: "A", - }: {1}, - { - PoS: tokenize.UNKN, - Text: "B", - }: {1, 1}, - { - PoS: tokenize.UNKN, - Text: "C", - }: {1, 1, 1}, - { - PoS: tokenize.UNKN, - Text: "D", - }: {1, 1, 1, 1}, - 
}, - }, - } - for _, tt := range tests { - t.Run("", func(t *testing.T) { - Threshold(tt.args.dists, tt.args.threshold) - if !reflect.DeepEqual(tt.args.dists, tt.want) { - t.Errorf("Threshold() = %v, want %v", tt.args.dists, tt.want) - } - }) - } -} diff --git a/bin/assocentity_linux_amd64_v13.0.3-0-g9d9e27c-dirty b/bin/assocentity_linux_amd64_v13.0.3-0-g9d9e27c-dirty deleted file mode 100755 index 96663ec..0000000 Binary files a/bin/assocentity_linux_amd64_v13.0.3-0-g9d9e27c-dirty and /dev/null differ diff --git a/bin/assocentity_windows_amd64_v13.0.3-0-g9d9e27c-dirty.exe b/bin/assocentity_windows_amd64_v13.0.3-0-g9d9e27c-dirty.exe deleted file mode 100755 index 9738e51..0000000 Binary files a/bin/assocentity_windows_amd64_v13.0.3-0-g9d9e27c-dirty.exe and /dev/null differ diff --git a/cli/main.go b/cli/main.go deleted file mode 100644 index 6a5b4a5..0000000 --- a/cli/main.go +++ /dev/null @@ -1,151 +0,0 @@ -package main - -import ( - "context" - "encoding/csv" - "errors" - "flag" - "fmt" - "io" - "log" - "os" - "strings" - - "github.com/ndabAP/assocentity/v13" - "github.com/ndabAP/assocentity/v13/nlp" - "github.com/ndabAP/assocentity/v13/tokenize" -) - -var logger = log.Default() - -func init() { - log.SetFlags(0) - logger.SetOutput(os.Stderr) - flag.Parse() -} - -var ( - entitiesF = flag.String( - "entities", - "", - "Define entities to be searched within input, example: -entities=\"Max Payne,Payne\"", - ) - gogSvcLocF = flag.String( - "gog-svc-loc", - "", - "Google Clouds NLP JSON service account file, example: -gog-svc-loc=\"~/gog-svc-loc.json\"", - ) - opF = flag.String( - "op", - "mean", - "Operation to execute", - ) - posF = flag.String( - "pos", - "any", - "Defines part of speeches to be included, example: -pos=noun,verb,pron", - ) -) - -func main() { - if len(*gogSvcLocF) == 0 { - printHelpAndFail(errors.New("missing google service account file")) - } - - // Read text from stdin - textBytes, err := io.ReadAll(os.Stdin) - if err != nil { - printHelpAndFail(err) - } - if len(textBytes) == 0 { - printHelpAndFail(errors.New("empty text")) - } - - credentialsFilename := *gogSvcLocF - nlpTok := nlp.NewNLPTokenizer(credentialsFilename, nlp.AutoLang) - - // Set part of speech - posArr := strings.Split(*posF, ",") - if len(posArr) == 0 { - printHelpAndFail(errors.New("missing pos")) - } - // Parse part of speech flag and use PoS type - poS := parsePoS(posArr) - - // Prepare text and entities - text := string(textBytes) - entities := strings.Split(*entitiesF, ",") - if len(entities) == 0 { - printHelpAndFail(errors.New("missing entities")) - } - - // Recover to provide an unified API response - defer func() { - if r := recover(); r != nil { - printHelpAndFail(r) - } - }() - - // Should we set a timeout? 
- var ctx = context.Background() - - switch *opF { - case "mean": - source := assocentity.NewSource(entities, []string{text}) - dists, err := assocentity.Distances( - ctx, - nlpTok, - poS, - source, - ) - if err != nil { - printHelpAndFail(err) - } - mean := assocentity.Mean(dists) - - // Write CSV to stdout - csvwr := csv.NewWriter(os.Stdout) - defer csvwr.Flush() - for tok, dist := range mean { - poS, ok := tokenize.PoSMapStr[tok.PoS] - if !ok { - printHelpAndFail(errors.New("unassigned part of speech")) - } - record := []string{ - // Text - tok.Text, - // Part of speech - poS, - // Distance - fmt.Sprintf("%f", dist), - } - if err := csvwr.Write(record); err != nil { - printHelpAndFail(err) - } - } - - default: - printHelpAndFail(errors.New("unknown operation")) - } -} - -// ["noun", "adj", "verb"] -> 11 -func parsePoS(posArr []string) (pos tokenize.PoS) { - for _, p := range posArr { - if p, ok := tokenize.PoSMap[p]; ok { - // Add bits - pos += p - } - } - return -} - -func printHelpAndFail(reason any) { - logger.Println(reason) - logger.Println() - logger.Println("Usage:") - logger.Println() - flag.PrintDefaults() - - os.Exit(1) -} diff --git a/errors.go b/errors.go new file mode 100644 index 0000000..686a8dd --- /dev/null +++ b/errors.go @@ -0,0 +1 @@ +package assocentity diff --git a/examples/nlp.go b/examples/nlp.go new file mode 100644 index 0000000..c919a8b --- /dev/null +++ b/examples/nlp.go @@ -0,0 +1,47 @@ +package main + +import ( + "context" + "log" + "os" + + "github.com/ndabAP/assocentity/v15/tokenize" + "github.com/ndabAP/assocentity/v15/tokenize/nlp" +) + +func main() { + log.SetFlags(0) + + var ( + ctx = context.Background() + creds = os.Getenv("GOOGLE_NLP_CREDS_PATH") + ) + texts := []string{ + "You can't win this one, Max.", + "Vlad was one of those old time bad guys with honor and morals, which almost made him one of the good guys. None of us was a saint.", + "So what does B.B. stand for anyway, backstabbing bastard?", + "I lied to myself that it was over. I was still alive, my loved ones were still dead. 
It wasn't over.", + } + nlp := nlp.New(creds, nlp.AutoLang) + for i, text := range texts { + log.Printf("text: %s", text) + + dataset, err := nlp.Tokenize(ctx, text, tokenize.FeatureAll) + if err != nil { + panic(err) + } + + for i, sentence := range dataset.Sentences { + log.Printf("index: %d %s", i, sentence) + } + for i, token := range dataset.Tokens { + log.Printf("index: %d %s", i, token) + } + + log.Printf("sentiment: {magnitude: %f score:%f}", dataset.Sentiment.Magnitude, dataset.Sentiment.Score) + + if i != len(texts)-1 { + log.Println() + } + } +} diff --git a/go.mod b/go.mod index f1ec677..7f930dc 100644 --- a/go.mod +++ b/go.mod @@ -1,33 +1,54 @@ -module github.com/ndabAP/assocentity/v13 +module github.com/ndabAP/assocentity/v15 -go 1.18 +go 1.23.5 require ( - cloud.google.com/go v0.34.0 + cloud.google.com/go v0.116.0 github.com/joho/godotenv v1.3.0 - google.golang.org/api v0.102.0 - google.golang.org/genproto v0.0.0-20221024183307-1bc688fe9f3e + google.golang.org/api v0.214.0 + google.golang.org/genproto v0.0.0-20241118233622-e639e219e697 +) + +require ( + cloud.google.com/go/auth v0.13.0 // indirect + cloud.google.com/go/auth/oauth2adapt v0.2.6 // indirect + cloud.google.com/go/compute/metadata v0.6.0 // indirect + cloud.google.com/go/language v1.14.3 // indirect + github.com/felixge/httpsnoop v1.0.4 // indirect + github.com/go-logr/logr v1.4.2 // indirect + github.com/go-logr/stdr v1.2.2 // indirect + github.com/google/s2a-go v0.1.8 // indirect + go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.54.0 // indirect + go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.54.0 // indirect + go.opentelemetry.io/otel v1.29.0 // indirect + go.opentelemetry.io/otel/metric v1.29.0 // indirect + go.opentelemetry.io/otel/trace v1.29.0 // indirect + golang.org/x/crypto v0.32.0 // indirect + golang.org/x/sync v0.10.0 // indirect + golang.org/x/time v0.8.0 // indirect + google.golang.org/genproto/googleapis/api v0.0.0-20241118233622-e639e219e697 // indirect + google.golang.org/genproto/googleapis/rpc v0.0.0-20241209162323-e6fa225c2576 // indirect ) require ( github.com/BurntSushi/toml v0.3.1 // indirect - github.com/golang/groupcache v0.0.0-20200121045136-8c9f03a8e57e // indirect - github.com/golang/protobuf v1.5.2 // indirect - github.com/google/go-cmp v0.5.9 // indirect - github.com/googleapis/enterprise-certificate-proxy v0.2.0 // indirect + github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect + github.com/golang/protobuf v1.5.4 // indirect + github.com/google/go-cmp v0.6.0 // indirect + github.com/googleapis/enterprise-certificate-proxy v0.3.4 // indirect github.com/googleapis/gax-go v1.0.3 // indirect - github.com/googleapis/gax-go/v2 v2.7.0 - go.opencensus.io v0.23.0 // indirect - golang.org/x/exp v0.0.0-20221026153819-32f3d567a233 // indirect + github.com/googleapis/gax-go/v2 v2.14.0 + go.opencensus.io v0.24.0 // indirect + golang.org/x/exp v0.0.0-20250106191152-7588d65b2ba8 // indirect golang.org/x/lint v0.0.0-20190313153728-d0100b6bd8b3 // indirect - golang.org/x/mod v0.6.0 // indirect - golang.org/x/net v0.7.0 // indirect - golang.org/x/oauth2 v0.0.0-20221014153046-6fdb5e3db783 // indirect - golang.org/x/sys v0.5.0 // indirect - golang.org/x/text v0.7.0 // indirect - golang.org/x/tools v0.2.0 // indirect - google.golang.org/appengine v1.6.7 // indirect - google.golang.org/grpc v1.50.1 // indirect - google.golang.org/protobuf v1.28.1 // indirect + golang.org/x/mod v0.22.0 // indirect + golang.org/x/net v0.34.0 // 
indirect + golang.org/x/oauth2 v0.24.0 // indirect + golang.org/x/sys v0.29.0 // indirect + golang.org/x/text v0.21.0 // indirect + golang.org/x/tools v0.29.0 // indirect + google.golang.org/appengine v1.6.8 // indirect + google.golang.org/grpc v1.67.3 // indirect + google.golang.org/protobuf v1.35.2 // indirect honnef.co/go/tools v0.0.0-20190523083050-ea95bdfd59fc // indirect ) diff --git a/go.sum b/go.sum index 20722e7..8051a17 100644 --- a/go.sum +++ b/go.sum @@ -1,19 +1,40 @@ cloud.google.com/go v0.26.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMTw= cloud.google.com/go v0.34.0 h1:eOI3/cP2VTU6uZLDYAoic+eyzzB9YyGmJ7eIjl8rOPg= cloud.google.com/go v0.34.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMTw= +cloud.google.com/go v0.116.0 h1:B3fRrSDkLRt5qSHWe40ERJvhvnQwdZiHu0bJOpldweE= +cloud.google.com/go v0.116.0/go.mod h1:cEPSRWPzZEswwdr9BxE6ChEn01dWlTaF05LiC2Xs70U= +cloud.google.com/go/auth v0.13.0 h1:8Fu8TZy167JkW8Tj3q7dIkr2v4cndv41ouecJx0PAHs= +cloud.google.com/go/auth v0.13.0/go.mod h1:COOjD9gwfKNKz+IIduatIhYJQIc0mG3H102r/EMxX6Q= +cloud.google.com/go/auth/oauth2adapt v0.2.6 h1:V6a6XDu2lTwPZWOawrAa9HUK+DB2zfJyTuciBG5hFkU= +cloud.google.com/go/auth/oauth2adapt v0.2.6/go.mod h1:AlmsELtlEBnaNTL7jCj8VQFLy6mbZv0s4Q7NGBeQ5E8= +cloud.google.com/go/compute v1.29.0 h1:Lph6d8oPi38NHkOr6S55Nus/Pbbcp37m/J0ohgKAefs= +cloud.google.com/go/compute/metadata v0.6.0 h1:A6hENjEsCDtC1k8byVsgwvVcioamEHvZ4j01OwKxG9I= +cloud.google.com/go/compute/metadata v0.6.0/go.mod h1:FjyFAW1MW0C203CEOMDTu3Dk1FlqW3Rga40jzHL4hfg= +cloud.google.com/go/language v1.14.3 h1:8hmFMiS3wjjj3TX/U1zZYTgzwZoUjDbo9PaqcYEmuB4= +cloud.google.com/go/language v1.14.3/go.mod h1:hjamj+KH//QzF561ZuU2J+82DdMlFUjmiGVWpovGGSA= github.com/BurntSushi/toml v0.3.1 h1:WXkYYl6Yr3qBf1K79EBnL4mak0OimBfB0XUf9Vl28OQ= github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU= github.com/census-instrumentation/opencensus-proto v0.2.1/go.mod h1:f6KPmirojxKA12rnyqOA5BBL4O983OfeGPqjHWSTneU= github.com/client9/misspell v0.3.4/go.mod h1:qj6jICC3Q7zFZvVWo7KLAzC3yx5G7kyvSDkc90ppPyw= github.com/cncf/udpa/go v0.0.0-20191209042840-269d4d468f6f/go.mod h1:M8M6+tZqaGXZJjfX53e64911xZQV5JYwmTeXPW+k8Sc= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/envoyproxy/go-control-plane v0.9.0/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4= github.com/envoyproxy/go-control-plane v0.9.1-0.20191026205805-5f8ba28d4473/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4= github.com/envoyproxy/go-control-plane v0.9.4/go.mod h1:6rpuAdCZL397s3pYoYcLgu1mIlRU8Am5FuJP05cCM98= github.com/envoyproxy/protoc-gen-validate v0.1.0/go.mod h1:iSmxcyjqTsJpI2R4NaDN7+kN2VEUnK/pcBlmesArF7c= +github.com/felixge/httpsnoop v1.0.4 h1:NFTV2Zj1bL4mc9sqWACXbQFVBBg2W3GPvqp8/ESS2Wg= +github.com/felixge/httpsnoop v1.0.4/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSwevea8zH2U= +github.com/go-logr/logr v1.2.2/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A= +github.com/go-logr/logr v1.4.2 h1:6pFjapn8bFcIbiKo3XT4j/BhANplGihG6tvd+8rYgrY= +github.com/go-logr/logr v1.4.2/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= +github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag= +github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE= github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b/go.mod h1:SBH7ygxi8pfUlaOkMMuAQtPIUF8ecWP5IEl/CR7VP2Q= github.com/golang/groupcache 
v0.0.0-20200121045136-8c9f03a8e57e h1:1r7pUrabqp18hOBcwBwiTsbnFeTZHV9eER/QT5JVZxY= github.com/golang/groupcache v0.0.0-20200121045136-8c9f03a8e57e/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= +github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da h1:oI5xCqsCo564l8iNU+DwB5epxmsaqB+rhGL0m5jtYqE= +github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= github.com/golang/lint v0.0.0-20180702182130-06c8688daad7/go.mod h1:tluoj9z5200jBnyusfRPU2LqT6J+DAorxEvtC7LHB+E= github.com/golang/mock v1.1.1/go.mod h1:oTYuIxOrZwtPieC+H1uAHpcLFnEyAGVDL/k47Jfbm0A= github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= @@ -29,6 +50,8 @@ github.com/golang/protobuf v1.4.3/go.mod h1:oDoupMAO8OvCJWAcko0GGGIgR6R6ocIYbsSw github.com/golang/protobuf v1.5.0/go.mod h1:FsONVRAS9T7sI+LIUmWTfcYkHO4aIWwzhcaSAoJOfIk= github.com/golang/protobuf v1.5.2 h1:ROPKBNFfQgOUMifHyP+KYbvpjbdoFNs+aK7DXlji0Tw= github.com/golang/protobuf v1.5.2/go.mod h1:XVQd3VNwM+JqD3oG2Ue2ip4fOMUkwXdXDdiuN0vRsmY= +github.com/golang/protobuf v1.5.4 h1:i7eJL8qZTpSEXOPTxNKhASYpMn+8e5Q6AdndVa1dWek= +github.com/golang/protobuf v1.5.4/go.mod h1:lnTiLA8Wa4RWRcIUkrtSVa5nRhsEGBg48fD6rSs7xps= github.com/google/go-cmp v0.2.0/go.mod h1:oXzfMopK8JAjlY9xF4vHSVASa0yLyX7SntLO5aqRK0M= github.com/google/go-cmp v0.3.0/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU= github.com/google/go-cmp v0.3.1/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU= @@ -38,82 +61,159 @@ github.com/google/go-cmp v0.5.3/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/ github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.5.9 h1:O2Tfq5qg4qc4AmwVlvv0oLiVAGB7enBSJ2x2DqQFi38= github.com/google/go-cmp v0.5.9/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= +github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= +github.com/google/s2a-go v0.1.8 h1:zZDs9gcbt9ZPLV0ndSyQk6Kacx2g/X+SKYovpnz3SMM= +github.com/google/s2a-go v0.1.8/go.mod h1:6iNWHTpQ+nfNRN5E00MSdfDwVesa8hhS32PhPO8deJA= github.com/google/uuid v1.1.2/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/googleapis/enterprise-certificate-proxy v0.2.0 h1:y8Yozv7SZtlU//QXbezB6QkpuE6jMD2/gfzk4AftXjs= github.com/googleapis/enterprise-certificate-proxy v0.2.0/go.mod h1:8C0jb7/mgJe/9KK8Lm7X9ctZC2t60YyIpYEI16jx0Qg= +github.com/googleapis/enterprise-certificate-proxy v0.3.4 h1:XYIDZApgAnrN1c855gTgghdIA6Stxb52D5RnLI1SLyw= +github.com/googleapis/enterprise-certificate-proxy v0.3.4/go.mod h1:YKe7cfqYXjKGpGvmSg28/fFvhNzinZQm8DGnaburhGA= github.com/googleapis/gax-go v1.0.3 h1:9dMLqhaibYONnDRcnHdUs9P8Mw64jLlZTYlDe3leBtQ= github.com/googleapis/gax-go v1.0.3/go.mod h1:QyXYajJFdARxGzjwUfbDFIse7Spkw81SJ4LrBJXtlQ8= github.com/googleapis/gax-go/v2 v2.0.2/go.mod h1:LLvjysVCY1JZeum8Z6l8qUty8fiNwE08qbEPm1M08qg= github.com/googleapis/gax-go/v2 v2.7.0 h1:IcsPKeInNvYi7eqSaDjiZqDDKu5rsmunY0Y1YupQSSQ= github.com/googleapis/gax-go/v2 v2.7.0/go.mod h1:TEop28CZZQ2y+c0VxMUmu1lV+fQx57QpBWsYpwqHJx8= +github.com/googleapis/gax-go/v2 v2.14.0 h1:f+jMrjBPl+DL9nI4IQzLUxMq7XrAqFYB7hBPqMNIe8o= +github.com/googleapis/gax-go/v2 v2.14.0/go.mod h1:lhBCnjdLrWRaPvLWhmc8IS24m9mr07qSYnHncrgo+zk= github.com/joho/godotenv v1.3.0 h1:Zjp+RcGpHhGlrMbJzXTrZZPrWj+1vfm90La1wgB6Bhc= github.com/joho/godotenv v1.3.0/go.mod h1:7hK45KPybAkOC6peb+G5yklZfMxEjkZhHbwpqxOKXbg= github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck= 
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/prometheus/client_model v0.0.0-20190812154241-14fe0d1b01d4/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= +github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo= github.com/stretchr/testify v1.6.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= +github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= +github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= +github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4= +github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= go.opencensus.io v0.23.0 h1:gqCw0LfLxScz8irSi8exQc7fyQ0fKQU/qnC/X8+V/1M= go.opencensus.io v0.23.0/go.mod h1:XItmlyltB5F7CS4xOC1DcqMoFqwtC6OG2xF7mCv7P7E= +go.opencensus.io v0.24.0/go.mod h1:vNK8G9p7aAivkbmorf4v+7Hgx+Zs0yY+0fOtgBfjQKo= +go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.54.0 h1:r6I7RJCN86bpD/FQwedZ0vSixDpwuWREjW9oRMsmqDc= +go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.54.0/go.mod h1:B9yO6b04uB80CzjedvewuqDhxJxi11s7/GtiGa8bAjI= +go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.54.0 h1:TT4fX+nBOA/+LUkobKGW1ydGcn+G3vRw9+g5HwCphpk= +go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.54.0/go.mod h1:L7UH0GbB0p47T4Rri3uHjbpCFYrVrwc1I25QhNPiGK8= +go.opentelemetry.io/otel v1.29.0 h1:PdomN/Al4q/lN6iBJEN3AwPvUiHPMlt93c8bqTG5Llw= +go.opentelemetry.io/otel v1.29.0/go.mod h1:N/WtXPs1CNCUEx+Agz5uouwCba+i+bJGFicT8SR4NP8= +go.opentelemetry.io/otel/metric v1.29.0 h1:vPf/HFWTNkPu1aYeIsc98l4ktOQaL6LeSoeV2g+8YLc= +go.opentelemetry.io/otel/metric v1.29.0/go.mod h1:auu/QWieFVWx+DmQOUMgj0F8LHWdgalxXqvp7BII/W8= +go.opentelemetry.io/otel/trace v1.29.0 h1:J/8ZNK4XgR7a21DZUAsbF8pZ5Jcw1VhACmnYt39JTi4= +go.opentelemetry.io/otel/trace v1.29.0/go.mod h1:eHl3w0sp3paPkYstJOmAimxhiFXPg+MMTlEh3nsQgWQ= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= +golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= +golang.org/x/crypto v0.31.0 h1:ihbySMvVjLAeSH1IbfcRTkD/iNscyz8rGzjF/E5hV6U= +golang.org/x/crypto v0.31.0/go.mod h1:kDsLvtWBEx7MV9tJOj9bnXsPbxwJQ6csT/x4KIN4Ssk= +golang.org/x/crypto v0.32.0 h1:euUpcYgM8WcP71gNpTqQCn6rC2t6ULUPiOzfWaXVVfc= +golang.org/x/crypto v0.32.0/go.mod h1:ZnnJkOaASj8g0AjIduWNlq2NRxL0PlBrbKVyZ6V/Ugc= golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= golang.org/x/exp v0.0.0-20190221220918-438050ddec5e/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= golang.org/x/exp v0.0.0-20221026153819-32f3d567a233 h1:9bNbSKT4RPLEzne0Xh1v3NaNecsa1DKjkOuTbY6V9rI= golang.org/x/exp v0.0.0-20221026153819-32f3d567a233/go.mod h1:CxIveKay+FTh1D0yPZemJVgC/95VzuuOLq5Qi4xnoYc= +golang.org/x/exp v0.0.0-20250106191152-7588d65b2ba8 h1:yqrTHse8TCMW1M1ZCP+VAR/l0kKxwaAIqN/il7x4voA= +golang.org/x/exp v0.0.0-20250106191152-7588d65b2ba8/go.mod h1:tujkw807nyEEAamNbDrEGzRav+ilXA7PCRAd6xsmwiU= golang.org/x/lint 
v0.0.0-20180702182130-06c8688daad7/go.mod h1:UVdnD1Gm6xHRNCYTkRU2/jEulfH38KcIWyp/GAMgvoE= golang.org/x/lint v0.0.0-20181026193005-c67002cb31c3/go.mod h1:UVdnD1Gm6xHRNCYTkRU2/jEulfH38KcIWyp/GAMgvoE= golang.org/x/lint v0.0.0-20190227174305-5b3e6a55c961/go.mod h1:wehouNa3lNwaWXcvxsM5YxQ5yQlVC4a0KAMCusXpPoU= golang.org/x/lint v0.0.0-20190313153728-d0100b6bd8b3 h1:XQyxROzUlZH+WIQwySDgnISgOivlhjIEwaQaJEJrrN0= golang.org/x/lint v0.0.0-20190313153728-d0100b6bd8b3/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc= +golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= golang.org/x/mod v0.6.0 h1:b9gGHsz9/HhJ3HF5DHQytPpuwocVTChQJK3AvoLRD5I= golang.org/x/mod v0.6.0/go.mod h1:4mET923SAdbXp2ki8ey+zGs1SLqsuM2Y0uvdZR/fUNI= +golang.org/x/mod v0.17.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c= +golang.org/x/mod v0.22.0/go.mod h1:6SkKJ3Xj0I0BrPOZoBy3bdMptDDU9oJrpohJ3eWZ1fY= golang.org/x/net v0.0.0-20180724234803-3673e40ba225/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20180826012351-8a410e7b638d/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20190213061140-3a22650c66bd/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20190311183353-d8887717615a/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= golang.org/x/net v0.0.0-20190603091049-60506f45cf65/go.mod h1:HSz+uSET+XFnRR8LxR5pz3Of3rY3CfYBVs4xY44aLks= +golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20201110031124-69a78807bb2b/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= +golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= +golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= golang.org/x/net v0.7.0 h1:rJrUqqhjsgNp7KqAIc25s9pZnjU7TUcSY7HcVZjdn1g= golang.org/x/net v0.7.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= +golang.org/x/net v0.33.0 h1:74SYHlV8BIgHIFC/LrYkOGIwL19eTYXQ5wc6TBuO36I= +golang.org/x/net v0.33.0/go.mod h1:HXLR5J+9DxmrqMwG9qjGCxZ+zKXxBru04zlTvWlWuN4= +golang.org/x/net v0.34.0 h1:Mb7Mrk043xzHgnRM88suvJFwzVrRfHEHJEl5/71CKw0= +golang.org/x/net v0.34.0/go.mod h1:di0qlW3YNM5oh6GqDGQr92MyTozJPmybPK4Ev/Gm31k= golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= golang.org/x/oauth2 v0.0.0-20221014153046-6fdb5e3db783 h1:nt+Q6cXKz4MosCSpnbMtqiQ8Oz0pxTef2B4Vca2lvfk= golang.org/x/oauth2 v0.0.0-20221014153046-6fdb5e3db783/go.mod h1:h4gKUeWbJ4rQPri7E0u6Gs4e9Ri2zaLxzw5DI5XGrYg= +golang.org/x/oauth2 v0.24.0 h1:KTBBxWqUa0ykRPLtV69rRto9TLXcqYkeswu48x/gvNE= +golang.org/x/oauth2 v0.24.0/go.mod h1:XYTD2NtWslqkgxebSiOHnXEap4TF09sJSc7H1sXbhtI= golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20181108010431-42b317875d0f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.10.0 h1:3NQrjDixjgGwUOCaF8w2+VYHv0Ve/vGYSbdkTa98gmQ= +golang.org/x/sync v0.10.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= 
golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.5.0 h1:MUK/U/4lj1t1oPg0HfuXDN/Z1wv31ZJ/YcPiGccS4DU= golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.28.0 h1:Fksou7UEQUWlKvIdsqzJmUmCX3cZuD2+P3XyyzwMhlA= +golang.org/x/sys v0.28.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/sys v0.29.0 h1:TPYlXGxvx1MGTn2GiZDhnjPA9wZzZeGKHHmKhHYvgaU= +golang.org/x/sys v0.29.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= +golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk= golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= +golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= +golang.org/x/text v0.3.8/go.mod h1:E6s5w1FMmriuDzIBO73fBruAKo1PCIq6d2Q6DHfQ8WQ= golang.org/x/text v0.7.0 h1:4BRB4x83lYWy72KwLD/qYDuTu7q9PjSagHvijDw7cLo= golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= +golang.org/x/text v0.21.0 h1:zyQAAkrwaneQ066sspRyJaG9VNi/YJ1NfzcGB3hZ/qo= +golang.org/x/text v0.21.0/go.mod h1:4IBbMaMmOPCJ8SecivzSH54+73PCFmPWxNTLm+vZkEQ= +golang.org/x/time v0.8.0 h1:9i3RxcPv3PZnitoVGMPDKZSq1xW1gK1Xy3ArNOGZfEg= +golang.org/x/time v0.8.0/go.mod h1:3BpzKBy/shNhVucY/MWOyx10tF3SFh9QdLuxbVysPQM= golang.org/x/tools v0.0.0-20180828015842-6cd1fcedba52/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20190114222345-bf090417da8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20190226205152-f727befe758c/go.mod h1:9Yl7xja0Znq3iFh3HoIrodX9oNMXvdceNzlUR8zjMvY= golang.org/x/tools v0.0.0-20190311212946-11955173bddd/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs= golang.org/x/tools v0.0.0-20190524140312-2c0ae7006135/go.mod h1:RgjU9mgBXZiqYHBnxXauZ1Gv1EHHAz9KjViQ78xBX0Q= +golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= golang.org/x/tools v0.2.0 h1:G6AHpWxTMGY1KyEYoAQ5WTtIekUUvDNjan3ugu60JvE= golang.org/x/tools v0.2.0/go.mod h1:y4OqIKeOV/fWJetJ8bXPU1sEVniLMIyDAZWeHdV+NTA= +golang.org/x/tools v0.21.1-0.20240508182429-e35e4ccd0d2d h1:vU5i/LfpvrRCpgM/VPfJLg5KjxD3E+hfT1SH+d9zLwg= +golang.org/x/tools 
v0.21.1-0.20240508182429-e35e4ccd0d2d/go.mod h1:aiJjzUbINMkxbQROHiO6hDPo2LHcIPhhQsa9DLh0yGk= +golang.org/x/tools v0.29.0/go.mod h1:KMQVMRsVxU6nHCFXrBPhDB8XncLNLM0lIy/F14RP588= +golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= google.golang.org/api v0.102.0 h1:JxJl2qQ85fRMPNvlZY/enexbxpCjLwGhZUtgfGeQ51I= google.golang.org/api v0.102.0/go.mod h1:3VFl6/fzoA+qNuS1N1/VfXY4LjoXN/wzeIp7TweWwGo= +google.golang.org/api v0.214.0 h1:h2Gkq07OYi6kusGOaT/9rnNljuXmqPnaig7WGPmKbwA= +google.golang.org/api v0.214.0/go.mod h1:bYPpLG8AyeMWwDU6NXoB00xC0DFkikVvd5MfwoxjLqE= google.golang.org/appengine v1.1.0/go.mod h1:EbEs0AVv82hx2wNQdGPgUI5lhzA/G0D9YwlJXL52JkM= google.golang.org/appengine v1.4.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= google.golang.org/appengine v1.6.7 h1:FZR1q0exgwxzPzp/aF+VccGrSfxfPpkBqjIIEq3ru6c= google.golang.org/appengine v1.6.7/go.mod h1:8WjMMxjGQR8xUklV/ARdw2HLXBOI7O7uCIDZVag1xfc= +google.golang.org/appengine v1.6.8/go.mod h1:1jJ3jBArFh5pcgW8gCtRJnepW8FzD1V44FJffLiz/Ds= google.golang.org/genproto v0.0.0-20180817151627-c66870c02cf8/go.mod h1:JiN7NxoALGmiZfu7CAH4rXhgtRTLTxftemlI0sWmxmc= google.golang.org/genproto v0.0.0-20190819201941-24fa4b261c55/go.mod h1:DMBHOl98Agz4BDEuKkezgsaosCRResVns1a3J2ZsMNc= google.golang.org/genproto v0.0.0-20200526211855-cb27e3aa2013/go.mod h1:NbSheEEYHJ7i3ixzK3sjbqSGDJWnxyFXZblF3eUsNvo= google.golang.org/genproto v0.0.0-20221024183307-1bc688fe9f3e h1:S9GbmC1iCgvbLyAokVCwiO6tVIrU9Y7c5oMx1V/ki/Y= google.golang.org/genproto v0.0.0-20221024183307-1bc688fe9f3e/go.mod h1:9qHF0xnpdSfF6knlcsnpzUu5y+rpwgbvsyGAZPBMg4s= +google.golang.org/genproto v0.0.0-20241118233622-e639e219e697 h1:ToEetK57OidYuqD4Q5w+vfEnPvPpuTwedCNVohYJfNk= +google.golang.org/genproto v0.0.0-20241118233622-e639e219e697/go.mod h1:JJrvXBWRZaFMxBufik1a4RpFw4HhgVtBBWQeQgUj2cc= +google.golang.org/genproto/googleapis/api v0.0.0-20241118233622-e639e219e697 h1:pgr/4QbFyktUv9CtQ/Fq4gzEE6/Xs7iCXbktaGzLHbQ= +google.golang.org/genproto/googleapis/api v0.0.0-20241118233622-e639e219e697/go.mod h1:+D9ySVjN8nY8YCVjc5O7PZDIdZporIDY3KaGfJunh88= +google.golang.org/genproto/googleapis/rpc v0.0.0-20241209162323-e6fa225c2576 h1:8ZmaLZE4XWrtU3MyClkYqqtl6Oegr3235h7jxsDyqCY= +google.golang.org/genproto/googleapis/rpc v0.0.0-20241209162323-e6fa225c2576/go.mod h1:5uTbfoYQed2U9p3KIj2/Zzm02PYhndfdmML0qC3q3FU= google.golang.org/grpc v1.16.0/go.mod h1:0JHn/cJsOMiMfNA9+DeHDlAU7KAAB5GDlYFpa9MZMio= google.golang.org/grpc v1.19.0/go.mod h1:mqu4LbDTu4XGKhr4mRzUsmM4RtVoemTSY81AxZiDr8c= google.golang.org/grpc v1.23.0/go.mod h1:Y5yQAOtifL1yxbo5wqy6BxZv8vAUGQwXBOALyacEbxg= @@ -122,6 +222,8 @@ google.golang.org/grpc v1.27.0/go.mod h1:qbnxyOmOxrQa7FizSgH+ReBfzJrCY1pSN7KXBS8 google.golang.org/grpc v1.33.2/go.mod h1:JMHMWHQWaTccqQQlmk3MJZS+GWXOdAesneDmEnv2fbc= google.golang.org/grpc v1.50.1 h1:DS/BukOZWp8s6p4Dt/tOaJaTQyPyOoCcrjroHuCeLzY= google.golang.org/grpc v1.50.1/go.mod h1:ZgQEeidpAuNRZ8iRrlBKXZQP1ghovWIVhdJRyCDK+GI= +google.golang.org/grpc v1.67.3 h1:OgPcDAFKHnH8X3O4WcO4XUc8GRDeKsKReqbQtiCj7N8= +google.golang.org/grpc v1.67.3/go.mod h1:YGaHCc6Oap+FzBJTZLBzkGSYt/cvGPFTPxkn7QfSU8s= google.golang.org/protobuf v0.0.0-20200109180630-ec00e32a8dfd/go.mod h1:DFci5gLYBciE7Vtevhsrf46CRTquxDuWsQurQQe4oz8= google.golang.org/protobuf v0.0.0-20200221191635-4d8936d0db64/go.mod h1:kwYJMbMJ01Woi6D6+Kah6886xMZcty6N08ah7+eCXa0= google.golang.org/protobuf 
v0.0.0-20200228230310-ab0ca4ff8a60/go.mod h1:cfTl7dwQJ+fmap5saPgwCLgHXTUD7jkjRqWcaiX5VyM= @@ -135,8 +237,11 @@ google.golang.org/protobuf v1.26.0-rc.1/go.mod h1:jlhhOSvTdKEhbULTjvd4ARK9grFBp0 google.golang.org/protobuf v1.26.0/go.mod h1:9q0QmTI4eRPtz6boOQmLYwt+qCgq0jsYwAQnmE0givc= google.golang.org/protobuf v1.28.1 h1:d0NfwRgPtno5B1Wa6L2DAG+KivqkdutMf1UhdNx175w= google.golang.org/protobuf v1.28.1/go.mod h1:HV8QOd/L58Z+nl8r43ehVNZIU/HEI6OcFqwMG9pJV4I= +google.golang.org/protobuf v1.35.2 h1:8Ar7bF+apOIoThw1EdZl0p1oWvMqTHmpA2fRTyZO8io= +google.golang.org/protobuf v1.35.2/go.mod h1:9fA7Ob0pmnwhb644+1+CVWFRbNajQ6iRojtC/QF5bRE= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= honnef.co/go/tools v0.0.0-20180728063816-88497007e858/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= honnef.co/go/tools v0.0.0-20190102054323-c2f93a96b099/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= honnef.co/go/tools v0.0.0-20190523083050-ea95bdfd59fc h1:/hemPrYIhOhy8zYrNj+069zDB68us2sMGsfkFJO0iZs= diff --git a/internal/comp/comp.go b/internal/comp/comp.go deleted file mode 100644 index 53eda72..0000000 --- a/internal/comp/comp.go +++ /dev/null @@ -1,72 +0,0 @@ -package comp - -import ( - "github.com/ndabAP/assocentity/v13/internal/iterator" - "github.com/ndabAP/assocentity/v13/tokenize" -) - -type Direction int - -var ( - DirPos Direction = 1 - DirNeg Direction = -1 -) - -// Checks if current text token is entity and if, returns entity -func TextWithEntities(textIter *iterator.Iterator[tokenize.Token], entityTokensIter *iterator.Iterator[[]tokenize.Token], entityIterDir Direction) (bool, []tokenize.Token) { - // Reset iterators before and comparing - entityTokensIter.Reset() - defer entityTokensIter.Reset() - currTextPos := textIter.CurrPos() - defer textIter.SetPos(currTextPos) - - // By default, we assume an entity - var isEntity bool = true - - for entityTokensIter.Next() { - // Reset - isEntity = true - - entityIter := iterator.New(entityTokensIter.CurrElem()) - - switch entityIterDir { - - // -> - case DirPos: - for entityIter.Next() { - // Check if text token matches the entity token - if !eqItersElems(textIter, entityIter) { - isEntity = false - } - - // Advance text iterator to compare against - textIter.Next() - } - - // <- - case DirNeg: - // We scan backwards and start from top - entityIter.SetPos(entityIter.Len()) // [1, 2, 3],(4) - for entityIter.Prev() { // [1, 2, (3)] - if !eqItersElems(textIter, entityIter) { - isEntity = false - } - - textIter.Prev() - } - } - - if isEntity { - return true, entityTokensIter.CurrElem() - } - - // Reset to compare with next entity tokens - textIter.SetPos(currTextPos) - } - - return false, []tokenize.Token{} -} - -func eqItersElems(x *iterator.Iterator[tokenize.Token], y *iterator.Iterator[tokenize.Token]) bool { - return x.CurrElem() == y.CurrElem() -} diff --git a/internal/comp/comp_test.go b/internal/comp/comp_test.go deleted file mode 100644 index bd40a9f..0000000 --- a/internal/comp/comp_test.go +++ /dev/null @@ -1,212 +0,0 @@ -package comp_test - -import ( - "reflect" - "testing" - - "github.com/ndabAP/assocentity/v13/internal/comp" - "github.com/ndabAP/assocentity/v13/internal/iterator" - "github.com/ndabAP/assocentity/v13/tokenize" -) - -func TestTextWithEntity(t *testing.T) { - type args struct { - 
textIter *iterator.Iterator[tokenize.Token] - entityTokensIter *iterator.Iterator[[]tokenize.Token] - dir comp.Direction - } - tests := []struct { - name string - args args - want bool - want1 []tokenize.Token - }{ - { - name: "no entity", - args: args{ - textIter: iterator.New([]tokenize.Token{ - { - PoS: tokenize.ADP, - Text: "Without", - }, - { - PoS: tokenize.NOUN, - Text: "Mona", - }, - { - PoS: tokenize.PRT, - Text: "'s'", - }, - { - PoS: tokenize.NOUN, - Text: "help", - }, - { - PoS: tokenize.PUNCT, - Text: ",", - }, - { - PoS: tokenize.PRON, - Text: "I", - }, - { - PoS: tokenize.VERB, - Text: "'d'", - }, - { - PoS: tokenize.VERB, - Text: "be", - }, - { - PoS: tokenize.DET, - Text: "a", - }, - { - PoS: tokenize.ADJ, - Text: "dead", - }, - { - PoS: tokenize.NOUN, - Text: "man", - }, - }), - entityTokensIter: iterator.New([][]tokenize.Token{ - { - { - PoS: tokenize.NOUN, - Text: "Alex", - }, - }, - }), - dir: comp.DirPos, - }, - want: false, - want1: make([]tokenize.Token, 0), - }, - { - name: "one entity", - args: args{ - textIter: iterator.New([]tokenize.Token{ - { - PoS: tokenize.ADP, - Text: "Without", - }, - { - PoS: tokenize.NOUN, - Text: "Mona", - }, - { - PoS: tokenize.PRT, - Text: "'s'", - }, - { - PoS: tokenize.NOUN, - Text: "help", - }, - { - PoS: tokenize.PUNCT, - Text: ",", - }, - { - PoS: tokenize.PRON, - Text: "I", - }, - { - PoS: tokenize.VERB, - Text: "'d'", - }, - { - PoS: tokenize.VERB, - Text: "be", - }, - { - PoS: tokenize.DET, - Text: "a", - }, - { - PoS: tokenize.ADJ, - Text: "dead", - }, - { - PoS: tokenize.NOUN, - Text: "man", - }, - }).SetPos(1), - entityTokensIter: iterator.New([][]tokenize.Token{ - { - tokenize.Token{ - PoS: tokenize.NOUN, - Text: "Mona", - }, - }, - }), - dir: comp.DirPos, - }, - want: true, - want1: []tokenize.Token{ - { - PoS: tokenize.NOUN, - Text: "Mona", - }, - }, - }, - { - name: "one two tokens long entity", - args: args{ - textIter: iterator.New([]tokenize.Token{ - { - PoS: tokenize.ANY, - Text: "a", - }, - { - PoS: tokenize.ANY, - Text: "a", - }, - { - PoS: tokenize.ANY, - Text: "b", - }, - { - PoS: tokenize.ANY, - Text: "b", - }, - }).SetPos(2), - entityTokensIter: iterator.New([][]tokenize.Token{ - { - tokenize.Token{ - PoS: tokenize.ANY, - Text: "b", - }, - tokenize.Token{ - PoS: tokenize.ANY, - Text: "b", - }, - }, - }), - dir: comp.DirPos, - }, - want: true, - want1: []tokenize.Token{ - { - PoS: tokenize.ANY, - Text: "b", - }, - { - PoS: tokenize.ANY, - Text: "b", - }, - }, - }, - } - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - got, got1 := comp.TextWithEntities(tt.args.textIter, tt.args.entityTokensIter, tt.args.dir) - if got != tt.want { - t.Errorf("TextWithEntity() got = %v, want %v", got, tt.want) - } - if !reflect.DeepEqual(got1, tt.want1) { - t.Errorf("TextWithEntity() got1 = %v, want %v", got1, tt.want1) - } - }) - } -} diff --git a/internal/iterator/iterator.go b/internal/iterator/iterator.go deleted file mode 100644 index 3175502..0000000 --- a/internal/iterator/iterator.go +++ /dev/null @@ -1,77 +0,0 @@ -package iterator - -type Iterator[T any] struct { - el T - elems []T - len int - pos int -} - -func New[T any](elems []T) *Iterator[T] { - return &Iterator[T]{ - *new(T), - elems, - len(elems), - -1, - } -} - -func (it *Iterator[T]) Next() bool { - if it.pos+1 >= it.len { - return false - } - it.pos++ - it.el = it.elems[it.pos] - return true -} - -func (it *Iterator[T]) Prev() bool { - if it.pos-1 < 0 { - return false - } - it.pos-- - it.el = it.elems[it.pos] - return true -} - -func (it 
*Iterator[T]) Reset() *Iterator[T] { - it.pos = -1 - it.el = *new(T) - return it -} - -func (it *Iterator[T]) CurrPos() int { - return it.pos -} - -func (it *Iterator[T]) CurrElem() T { - return it.el -} - -func (it *Iterator[T]) Len() int { - return it.len -} - -func (it *Iterator[T]) SetPos(pos int) *Iterator[T] { - it.pos = pos - it.setEl() - return it -} - -func (it *Iterator[T]) Rewind(pos int) *Iterator[T] { - it.pos -= pos - it.setEl() - return it -} - -func (it *Iterator[T]) Forward(pos int) *Iterator[T] { - it.pos += pos - it.setEl() - return it -} - -func (it *Iterator[T]) setEl() { - if len(it.elems)-1 > it.pos && it.pos >= 0 { - it.el = it.elems[it.pos] - } -} diff --git a/internal/iterator/iterator_test.go b/internal/iterator/iterator_test.go deleted file mode 100644 index c4f441d..0000000 --- a/internal/iterator/iterator_test.go +++ /dev/null @@ -1,67 +0,0 @@ -package iterator_test - -import ( - "testing" - - "github.com/ndabAP/assocentity/v13/internal/iterator" -) - -var testElems = []int{1, 2, 3, 3, 1, 5, 6} - -func TestNav(t *testing.T) { - it := iterator.New(testElems) - - it.Next() - if it.CurrElem() != testElems[0] { - t.Errorf("CurrElem() got = %v, want = %v", it.CurrElem(), testElems[0]) - } - - it.Prev() - if it.CurrElem() != testElems[0] { - t.Errorf("CurrElem() got = %v, want = %v", it.CurrElem(), testElems[0]) - } - - it.Forward(1) - if it.CurrElem() != testElems[1] { - t.Errorf("CurrElem() got = %v, want = %v", it.CurrElem(), testElems[1]) - } - - it.Rewind(1) - if it.CurrElem() != testElems[0] { - t.Errorf("CurrElem() got = %v, want = %v", it.CurrElem(), testElems[0]) - } - - it.Reset() - // We need an independent counter - i := 0 - for it.Next() { - if testElems[i] != it.CurrElem() { - t.Errorf("CurrElem() got = %v, want = %v", it.CurrElem(), testElems[i]) - } - i++ - } - - it.SetPos(len(testElems)) - i = len(testElems) - 1 - for it.Prev() { - if testElems[i] != it.CurrElem() { - t.Errorf("CurrElem() got = %v, want = %v", it.CurrElem(), testElems[i]) - } - i-- - } -} - -func TestCurrElem(t *testing.T) { - it := iterator.New(testElems) - - it.SetPos(1) - if it.CurrElem() != testElems[1] { - t.Errorf("SetPos(1) got = %v, want = %v", it.CurrElem(), testElems[1]) - } - - it.Reset() - it.Next() - if it.CurrElem() != testElems[0] { - t.Errorf("Reset() got = %v, want = %v", it.CurrElem(), testElems[1]) - } -} diff --git a/internal/pos/pos_determ.go b/internal/pos/pos_determ.go deleted file mode 100644 index 6c628dc..0000000 --- a/internal/pos/pos_determ.go +++ /dev/null @@ -1,45 +0,0 @@ -package pos - -import ( - "github.com/ndabAP/assocentity/v13/internal/comp" - "github.com/ndabAP/assocentity/v13/internal/iterator" - "github.com/ndabAP/assocentity/v13/tokenize" -) - -// poSDetermer represents the default part of speech determinator -type poSDetermer struct{ poS tokenize.PoS } - -// NewPoSDetermer returns a new default part of speech determinator -func NewPoSDetermer(poS tokenize.PoS) poSDetermer { return poSDetermer{poS} } - -// DetermPoS deterimantes if a part of speech tag should be kept. 
It always -// appends entities -func (dps poSDetermer) DetermPoS(textTokens []tokenize.Token, entityTokens [][]tokenize.Token) []tokenize.Token { - // If any part of speech, no need to determinate - if dps.poS == tokenize.ANY { - return textTokens - } - - var determTokens []tokenize.Token - - textIter := iterator.New(textTokens) - entityTokensIter := iterator.New(entityTokens) - - for textIter.Next() { - currTextPos := textIter.CurrPos() - isEntity, entity := comp.TextWithEntities(textIter, entityTokensIter, comp.DirPos) - if isEntity { - textIter.SetPos(currTextPos + len(entity)) - // Entity is always kept - determTokens = append(determTokens, entity...) - continue - } - - // Non-entity tokens - if textIter.CurrElem().PoS&dps.poS != 0 { - determTokens = append(determTokens, textIter.CurrElem()) - } - } - - return determTokens -} diff --git a/internal/pos/pos_determ_test.go b/internal/pos/pos_determ_test.go deleted file mode 100644 index 166ffb5..0000000 --- a/internal/pos/pos_determ_test.go +++ /dev/null @@ -1,171 +0,0 @@ -package pos - -import ( - "reflect" - "testing" - - "github.com/ndabAP/assocentity/v13/tokenize" -) - -func TestPoSDetermer_DetermPoS(t *testing.T) { - type fields struct { - poS tokenize.PoS - } - type args struct { - textTokens []tokenize.Token - entityTokens [][]tokenize.Token - } - tests := []struct { - name string - fields fields - args args - want []tokenize.Token - }{ - { - name: "any", - fields: fields{ - poS: tokenize.ANY, - }, - args: args{ - textTokens: []tokenize.Token{ - {PoS: tokenize.NOUN, Text: "Cold"}, - {PoS: tokenize.ADP, Text: "as"}, - {PoS: tokenize.DET, Text: "a"}, - {PoS: tokenize.NOUN, Text: "gun"}, - }, - entityTokens: [][]tokenize.Token{ - { - { - Text: "Max", - PoS: tokenize.NOUN, - }, - { - Text: "Payne", - PoS: tokenize.NOUN, - }, - }, - }, - }, - want: []tokenize.Token{ - {PoS: tokenize.NOUN, Text: "Cold"}, - {PoS: tokenize.ADP, Text: "as"}, - {PoS: tokenize.DET, Text: "a"}, - {PoS: tokenize.NOUN, Text: "gun"}, - }, - }, - { - name: "noun", - fields: fields{ - poS: tokenize.NOUN, - }, - args: args{ - textTokens: []tokenize.Token{ - {PoS: tokenize.NOUN, Text: "Cold"}, - {PoS: tokenize.ADP, Text: "as"}, - {PoS: tokenize.DET, Text: "a"}, - {PoS: tokenize.NOUN, Text: "gun"}, - }, - entityTokens: [][]tokenize.Token{ - { - { - Text: "Max", - PoS: tokenize.NOUN, - }, - { - Text: "Payne", - PoS: tokenize.NOUN, - }, - }, - }, - }, - want: []tokenize.Token{ - {PoS: tokenize.NOUN, Text: "Cold"}, - {PoS: tokenize.NOUN, Text: "gun"}, - }, - }, - { - name: "noun, adposition", - fields: fields{ - poS: tokenize.NOUN | tokenize.ADP, - }, - args: args{ - textTokens: []tokenize.Token{ - {PoS: tokenize.NOUN, Text: "Cold"}, - {PoS: tokenize.ADP, Text: "as"}, - {PoS: tokenize.DET, Text: "a"}, - {PoS: tokenize.NOUN, Text: "gun"}, - }, - entityTokens: [][]tokenize.Token{ - { - { - Text: "Max", - PoS: tokenize.NOUN, - }, - { - Text: "Payne", - PoS: tokenize.NOUN, - }, - }, - }, - }, - want: []tokenize.Token{ - {PoS: tokenize.NOUN, Text: "Cold"}, - {PoS: tokenize.ADP, Text: "as"}, - {PoS: tokenize.NOUN, Text: "gun"}, - }, - }, - { - name: "skip entity", - fields: fields{ - poS: tokenize.VERB, - }, - args: args{ - textTokens: []tokenize.Token{ - {PoS: tokenize.VERB, Text: "Relax"}, - {PoS: tokenize.PUNCT, Text: ","}, - {PoS: tokenize.NOUN, Text: "Max"}, - {PoS: tokenize.PUNCT, Text: "."}, - {PoS: tokenize.PRON, Text: "You"}, - {PoS: tokenize.VERB, Text: "'re"}, - {PoS: tokenize.DET, Text: "a"}, - {PoS: tokenize.ADJ, Text: "nice"}, - {PoS: tokenize.NOUN, Text: "guy"}, 
- {PoS: tokenize.PUNCT, Text: "."}, - }, - entityTokens: [][]tokenize.Token{ - { - { - Text: "Max", - PoS: tokenize.NOUN, - }, - { - Text: "Payne", - PoS: tokenize.NOUN, - }, - }, - { - { - Text: "Max", - PoS: tokenize.NOUN, - }, - }, - }, - }, - want: []tokenize.Token{ - {PoS: tokenize.VERB, Text: "Relax"}, - {PoS: tokenize.NOUN, Text: "Max"}, - {PoS: tokenize.VERB, Text: "'re"}, - }, - }, - } - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - dps := poSDetermer{ - poS: tt.fields.poS, - } - if got := dps.DetermPoS(tt.args.textTokens, tt.args.entityTokens); !reflect.DeepEqual(got, tt.want) { - t.Errorf("NLPPoSDetermer.DetermPoS() = %v, want %v", got, tt.want) - } - }) - } -} diff --git a/nlp/tokenize.go b/nlp/tokenize.go deleted file mode 100644 index 2023a54..0000000 --- a/nlp/tokenize.go +++ /dev/null @@ -1,143 +0,0 @@ -package nlp - -import ( - "context" - "errors" - "time" - - language "cloud.google.com/go/language/apiv1" - "github.com/googleapis/gax-go/v2/apierror" - "github.com/ndabAP/assocentity/v13/tokenize" - "google.golang.org/api/option" - "google.golang.org/genproto/googleapis/api/error_reason" - languagepb "google.golang.org/genproto/googleapis/cloud/language/v1" -) - -var ( - ErrMaxRetries = errors.New("max retries reached") -) - -var poSMap = map[languagepb.PartOfSpeech_Tag]tokenize.PoS{ - languagepb.PartOfSpeech_ADJ: tokenize.ADJ, - languagepb.PartOfSpeech_ADP: tokenize.ADP, - languagepb.PartOfSpeech_ADV: tokenize.ADV, - languagepb.PartOfSpeech_AFFIX: tokenize.AFFIX, - languagepb.PartOfSpeech_CONJ: tokenize.CONJ, - languagepb.PartOfSpeech_DET: tokenize.DET, - languagepb.PartOfSpeech_NOUN: tokenize.NOUN, - languagepb.PartOfSpeech_NUM: tokenize.NUM, - languagepb.PartOfSpeech_PRON: tokenize.PRON, - languagepb.PartOfSpeech_PRT: tokenize.PRT, - languagepb.PartOfSpeech_PUNCT: tokenize.PUNCT, - languagepb.PartOfSpeech_UNKNOWN: tokenize.UNKN, - languagepb.PartOfSpeech_VERB: tokenize.VERB, - languagepb.PartOfSpeech_X: tokenize.X, -} - -// AutoLang tries to automatically recognize the language -var AutoLang string = "auto" - -// NLPTokenizer tokenizes a text using Google NLP -type NLPTokenizer struct { - credsFilename string - lang string -} - -// NewNLPTokenizer returns a new NLP tokenizer instance. Note that NLPTokenizer -// has a built-in retrier -func NewNLPTokenizer(credentialsFilename string, lang string) tokenize.Tokenizer { - return NLPTokenizer{ - credsFilename: credentialsFilename, - lang: lang, - } -} - -// Tokenize tokenizes a text -func (nlp NLPTokenizer) Tokenize(ctx context.Context, text string) ([]tokenize.Token, error) { - res, err := nlp.req(ctx, text) - if err != nil { - return []tokenize.Token{}, err - } - - tokens := make([]tokenize.Token, 0) - for _, tok := range res.GetTokens() { - if _, ok := poSMap[tok.PartOfSpeech.Tag]; !ok { - return tokens, errors.New("can't find pos match") - } - - tokens = append(tokens, tokenize.Token{ - PoS: poSMap[tok.PartOfSpeech.Tag], - Text: tok.GetText().GetContent(), - }) - } - return tokens, nil -} - -// req sends a request to the Google server. 
It retries if the API rate limited -// is reached -func (nlp NLPTokenizer) req(ctx context.Context, text string) (*languagepb.AnnotateTextResponse, error) { - client, err := language.NewClient(ctx, option.WithCredentialsFile(nlp.credsFilename)) - if err != nil { - return &languagepb.AnnotateTextResponse{}, err - } - - defer client.Close() - - doc := &languagepb.Document{ - Source: &languagepb.Document_Content{ - Content: text, - }, - Type: languagepb.Document_PLAIN_TEXT, - } - // Set the desired language if not auto - if nlp.lang != AutoLang { - doc.Language = nlp.lang - } - - // Google rate limit timeout - const apiRateTimeout = 1.0 // In Minutes - var ( - // Google errors - apiErr *apierror.APIError - errReasonRateLimitExceeded = error_reason.ErrorReason_RATE_LIMIT_EXCEEDED.String() - - delay = apiRateTimeout - delayMult = 1.05 // Delay multiplier - retries = 0 - ) - const ( - delayGrowth = 1.05 // Delay growth rate - maxRetries = 6 - ) - // Retry request up to maxRetries times if rate limit exceeded with an - // growing delay - for { - if retries >= maxRetries { - return &languagepb.AnnotateTextResponse{}, ErrMaxRetries - } - - // Do the actual request - res, err := client.AnnotateText(ctx, &languagepb.AnnotateTextRequest{ - Document: doc, - Features: &languagepb.AnnotateTextRequest_Features{ - ExtractSyntax: true, - }, - EncodingType: languagepb.EncodingType_UTF8, - }) - // Check for rate limit exceeded error to retry - if errors.As(err, &apiErr) { - if apiErr.Reason() == errReasonRateLimitExceeded { - time.Sleep(time.Minute * time.Duration(delay)) - - // Retryer logic - retries += 1 - delay *= delayMult - delayMult *= delayGrowth - - continue - } - } else { - return res, err - } - } -} diff --git a/nlp/tokenize_test.go b/nlp/tokenize_test.go deleted file mode 100644 index 8fe6896..0000000 --- a/nlp/tokenize_test.go +++ /dev/null @@ -1,77 +0,0 @@ -package nlp_test - -import ( - "context" - "os" - "reflect" - "testing" - - "github.com/joho/godotenv" - "github.com/ndabAP/assocentity/v13/nlp" - "github.com/ndabAP/assocentity/v13/tokenize" -) - -func TestTokenize(t *testing.T) { - if testing.Short() { - t.SkipNow() - } - - if err := godotenv.Load("../.env"); err != nil { - t.Fatal(err) - } - - credentialsFile := os.Getenv("GOOGLE_NLP_SERVICE_ACCOUNT_FILE_LOCATION") - - tests := []struct { - text string - want []tokenize.Token - wantErr bool - }{ - { - text: "Punchinello was burning to get me", - want: []tokenize.Token{ - { - Text: "Punchinello", - PoS: tokenize.NOUN, - }, - { - Text: "was", - PoS: tokenize.VERB, - }, - { - Text: "burning", - PoS: tokenize.VERB, - }, - { - Text: "to", - PoS: tokenize.PRT, - }, - { - Text: "get", - PoS: tokenize.VERB, - }, - { - Text: "me", - PoS: tokenize.PRON, - }, - }, - wantErr: false, - }, - } - for _, tt := range tests { - t.Run("", func(t *testing.T) { - nlp := nlp.NewNLPTokenizer( - credentialsFile, - nlp.AutoLang, - ) - got, err := nlp.Tokenize(context.Background(), tt.text) - if (err != nil) != tt.wantErr { - t.Errorf("nlp.Tokenize() error = %v, wantErr %v", err, tt.wantErr) - return - } - if !reflect.DeepEqual(got, tt.want) { - t.Errorf("nlp.Tokenize() = %v, want %v", got, tt.want) - } - }) - } -} diff --git a/normalizer.go b/normalizer.go deleted file mode 100644 index c99b9fb..0000000 --- a/normalizer.go +++ /dev/null @@ -1,62 +0,0 @@ -package assocentity - -import ( - "strings" - - "github.com/ndabAP/assocentity/v13/tokenize" -) - -// Normalizer normalizes tokens like lower casing them -type Normalizer func(tokenize.Token) tokenize.Token - -// 
HumandReadableNormalizer normalizes tokens through lower casing them and -// replacing them with their synonyms -var HumandReadableNormalizer Normalizer = func(tok tokenize.Token) tokenize.Token { - t := tokenize.Token{ - PoS: tok.PoS, - Text: strings.ToLower(tok.Text), - } - - // This can increase the result data quality and could include more synonyms - switch tok.Text { - case "&": - t.Text = "and" - } - - return t -} - -// Normalize normalizes tokens with provided normalizer -func Normalize(dists map[tokenize.Token][]float64, norm Normalizer) { - for tok, d := range dists { - t := norm(tok) - - // Check if text is the same as non-normalized - if t == tok { - continue - } - if _, ok := dists[t]; ok { - dists[t] = append(dists[tok], d...) - } else { - dists[t] = d - } - - delete(dists, tok) - } -} - -// Threshold excludes results that are below the given threshold. The threshold -// is described through the amount of distances per token relative to the total -// amount of tokens -func Threshold(dists map[tokenize.Token][]float64, threshold float64) { - // Length of dists is amount of total tokens - distsN := len(dists) - for tok, d := range dists { - dN := len(d) - // Amount of distances per token relative to the amount of all tokens - t := (float64(dN) / float64(distsN)) * 100 - if t < threshold { - delete(dists, tok) - } - } -} diff --git a/source.go b/source.go new file mode 100644 index 0000000..ae95d13 --- /dev/null +++ b/source.go @@ -0,0 +1,30 @@ +package assocentity + +import ( + "slices" +) + +type ( + // source wraps entities and texts, and returns a Analyses instance + source struct { + Entities []string + Texts []string + } +) + +// NewSource returns a new source, consisting of entities and texts. Duplicate +// entities are removed. A source is the base for an analysis +func NewSource(entities, texts []string) source { + // De-duplicate entities + dedup := make([]string, 0) + for _, entity := range entities { + if !slices.Contains(dedup, entity) { + dedup = append(dedup, entity) + } + } + + return source{ + Entities: dedup, + Texts: texts, + } +} diff --git a/source_analyses.go b/source_analyses.go new file mode 100644 index 0000000..daf5613 --- /dev/null +++ b/source_analyses.go @@ -0,0 +1,169 @@ +package assocentity + +import ( + "context" + "iter" + "maps" + "slices" + + "github.com/ndabAP/assocentity/v15/tokenize" +) + +type ( + WithTransformer func(*tokenize.Token) *tokenize.Token +) + +func (source source) Analyses( + ctx context.Context, + tokenizer tokenize.Tokenizer, + feats tokenize.Feature, + transformers ...WithTransformer, +) (Analyses, error) { + var analyses Analyses + + // Tokenize entities + entities := make(map[string][]*tokenize.Token, len(source.Entities)) + for _, entity := range source.Entities { + select { + case <-ctx.Done(): + return analyses, ctx.Err() + default: + } + + dataset, err := tokenizer.Tokenize(ctx, entity, tokenize.FeatureSyntax) + if err != nil { + return analyses, err + } + + for _, token := range dataset.Tokens { + // protoimpl.MessageState should not be copied + t := tokenize.Token{ + DependencyEdge: token.DependencyEdge, + Lemma: token.Lemma, + PartOfSpeech: token.PartOfSpeech, + Text: token.Text, + } + entities[entity] = append(entities[entity], &t) + } + } + analyses.entities = entities + + // Tokenize texts + frames := make([]frame, 0, len(source.Texts)) + for _, text := range source.Texts { + select { + case <-ctx.Done(): + return analyses, ctx.Err() + default: + } + + var frame frame + dataset, err := tokenizer.Tokenize(ctx, 
text, feats) + if err != nil { + return analyses, err + } + for _, token := range dataset.Tokens { + // Apply mutators + for _, f := range transformers { + *token = *f(token) + } + } + frame.dataset = dataset + + // Entity locations + var ( + tokens = dataset.Tokens + + locs = make(map[int]int) + i = 0 + ) + for { + if i == len(tokens)-1 { + break + } + + // Peek entity + _, _, j := source.peek(tokens[i:], entities) + if j > -1 { + locs[i] = j + + // Skip entity + switch j { + case 0: + i += 1 + default: + i += j + } + + continue + } + + i++ + } + frame.locs = locs + + frames = append(frames, frame) + } + analyses.frames = frames + + return analyses, nil +} + +// peek checks if the next text tokens are entity tokens +func (s source) peek(text []*tokenize.Token, entities map[string][]*tokenize.Token) (string, []*tokenize.Token, int) { + // i contains the final index when the entity was found + var i int = 0 + + // Entity alias iterator + next, stop := iter.Pull2(maps.All(entities)) + defer stop() + for { + found := true + entity, tok, ok := next() + if !ok { + break + } + + // Entity buffer + buf := make([]*tokenize.Token, 0, len(tok)) + + // Entity iterator + n, s := iter.Pull2(slices.All(tok)) + // Text iterator + m, t := iter.Pull2(slices.All(text)) + for { + // If no entity is left, cancel + j, v, ok := n() + if !ok { + s() + t() + break + } + // If no text is left, cancel + _, w, ok := m() + if !ok { + s() + t() + break + } + + if w != v { + i = 0 + found = false + s() + t() + // Continue with next entity + break + } + + buf = append(buf, w) + i = j + } + + if found { + return entity, buf, i + } + } + + return "", nil, -1 +} diff --git a/source_analyses_test.go b/source_analyses_test.go new file mode 100644 index 0000000..9f2bf69 --- /dev/null +++ b/source_analyses_test.go @@ -0,0 +1,106 @@ +package assocentity + +import ( + "bytes" + "context" + "testing" + + "cloud.google.com/go/language/apiv1beta2/languagepb" + "github.com/ndabAP/assocentity/v15/tokenize" + "github.com/ndabAP/assocentity/v15/tokenize/ascii" +) + +type mockTokenizer struct{} + +func (t mockTokenizer) Tokenize(ctx context.Context, text string, feats tokenize.Feature) (tokenize.Dataset, error) { + tokenizer := ascii.New(ascii.EnglishSep, ascii.EnglishIntPunct, ascii.EnglishTermPunct) + dataset, err := tokenizer.Tokenize(context.Background(), text, feats) + if err != nil { + return tokenize.Dataset{}, err + } + + return dataset, nil +} + +func TestSourceAnalyses(t *testing.T) { + tests := []struct { + source source + want Analyses + }{ + { + source: source{ + Texts: []string{"You can't win this one, Max."}, + Entities: []string{"Max"}, + }, + want: Analyses{ + frames: []frame{ + { + dataset: tokenize.Dataset{ + Sentences: []*tokenize.Sentence{ + {Text: &languagepb.TextSpan{Content: "You can't win this one, Max."}}, + }, + Tokens: []*tokenize.Token{ + {Text: &languagepb.TextSpan{Content: "You"}}, + {Text: &languagepb.TextSpan{Content: "can"}}, + {Text: &languagepb.TextSpan{Content: "'"}}, + {Text: &languagepb.TextSpan{Content: "t"}}, + {Text: &languagepb.TextSpan{Content: "win"}}, + {Text: &languagepb.TextSpan{Content: "this"}}, + {Text: &languagepb.TextSpan{Content: "one"}}, + {Text: &languagepb.TextSpan{Content: ","}}, + {Text: &languagepb.TextSpan{Content: "Max"}}, + {Text: &languagepb.TextSpan{Content: "."}}, + }, + }, + locs: map[int]int{5: 5}, + }, + }, + entities: map[string][]*tokenize.Token{ + "Max": {{Text: &languagepb.TextSpan{Content: "Max"}}}, + }, + }, + }, + { + source: source{ + Texts: 
[]string{"Punchinello wanted Payne? He'd see the pain"}, + Entities: []string{"Payne"}, + }, + want: Analyses{ + frames: []frame{ + { + dataset: tokenize.Dataset{ + Sentences: []*tokenize.Sentence{ + {Text: &languagepb.TextSpan{Content: "Punchinello wanted Payne?"}}, + {Text: &languagepb.TextSpan{Content: "He'd see the pain"}}, + }, + Tokens: []*tokenize.Token{ + {Text: &languagepb.TextSpan{Content: "Punchinello"}}, + {Text: &languagepb.TextSpan{Content: "wanted"}}, + {Text: &languagepb.TextSpan{Content: "Payne"}}, + {Text: &languagepb.TextSpan{Content: "?"}}, + {Text: &languagepb.TextSpan{Content: "He"}}, + {Text: &languagepb.TextSpan{Content: "'"}}, + {Text: &languagepb.TextSpan{Content: "d"}}, + {Text: &languagepb.TextSpan{Content: "see"}}, + {Text: &languagepb.TextSpan{Content: "the"}}, + {Text: &languagepb.TextSpan{Content: "pain"}}, + }, + }, + locs: map[int]int{2: 2}, + }, + }, + entities: map[string][]*tokenize.Token{ + "Payne": {{Text: &languagepb.TextSpan{Content: "Payne"}}}, + }, + }, + }, + } + + tokenizer := mockTokenizer{} + for _, test := range tests { + got, _ := test.source.Analyses(context.Background(), tokenizer, tokenize.FeatureSyntax) + if !bytes.Equal(marshalJSON(t, got), marshalJSON(t, test.want)) { + t.Errorf("Source.Analyses() = %+v, want %+v", got, test.want) + } + } +} diff --git a/tokenize/ascii/tokenizer.go b/tokenize/ascii/tokenizer.go new file mode 100644 index 0000000..4fb34ee --- /dev/null +++ b/tokenize/ascii/tokenizer.go @@ -0,0 +1,191 @@ +package ascii + +import ( + "context" + "strings" + + "cloud.google.com/go/language/apiv1beta2/languagepb" + "github.com/ndabAP/assocentity/v15/tokenize" +) + +type ( + ascii struct { + sep rune + intpunct, termpunct func(rune) bool + } +) + +const ( + ReplacementChar = '\uFFFD' +) + +var ( + // Separator + EnglishSep = ' ' + + // Internal punctuation, such as ",", "#" and "(" + EnglishIntPunct = func(r rune) bool { + switch r { + case '"', '\'': + return true + case ':', ';': + return true + case '?', '!': + return true + case ',': + return true + case '(', ')', '[', ']', '{', '}': + return true + case '/', '\\': + return true + case '#': + return true + case '@': + return true + case '$': + return true + case '%': + return true + case '&': + return true + case '<', '>': + return true + case '*': + return true + case '+', '-', '=': + return true + case '`': + return true + case '~': + return true + case '|': + return true + case '^': + return true + case '_': + return true + } + + return false + } + + // Terminal punctuation: ".", "!", "?" 
+ EnglishTermPunct = func(r rune) bool { + switch r { + case '.', '!', '?': + return true + } + + return false + } + + _ tokenize.Tokenizer = (*ascii)(nil) +) + +// New returns an ASCII tokenizer +func New(sep rune, punctmarks, termpunct func(rune) bool) ascii { + return ascii{sep, punctmarks, termpunct} +} + +func (ascii ascii) Tokenize(ctx context.Context, text string, feats tokenize.Feature) (tokenize.Dataset, error) { + var dataset tokenize.Dataset + sentences := make([]*tokenize.Sentence, 0) + tokens := make([]*tokenize.Token, 0) + + var sentence, token strings.Builder + for _, r := range text { + if r > 127 { + // Unsupported range + token.WriteRune(ReplacementChar) + continue + } + + // Seperator terminates a token + if r == ascii.sep { + // Preceding token has been consumed + tok := terminate(&token, &tokens) + + sentence.WriteString(tok) + sentence.WriteRune(r) + + continue + } + + // Internal punctuation + if ascii.intpunct(r) { + tok := terminate(&token, &tokens) + + // Punctuation token + tokens = append(tokens, &tokenize.Token{ + Text: &languagepb.TextSpan{ + Content: string(r), + }, + }) + + sentence.WriteString(tok) + sentence.WriteRune(r) + + continue + } + + // Terminal punctuation + if ascii.termpunct(r) { + tok := terminate(&token, &tokens) + + // Punctuation token + tokens = append(tokens, &tokenize.Token{ + Text: &languagepb.TextSpan{ + Content: string(r), + }, + }) + + // Sentence has been consumed + sentence.WriteString(tok) + sentence.WriteRune(r) + sentences = append(sentences, &tokenize.Sentence{ + Text: &languagepb.TextSpan{ + Content: sentence.String(), + }, + }) + sentence.Reset() + + continue + } + + token.WriteRune(r) + } + + // Special case: Final token is not a separator + if token.Len() > 0 { + tokens = append(tokens, &tokenize.Token{ + Text: &languagepb.TextSpan{ + Content: token.String(), + }, + }) + } + + // Special case: No terminal punctuation + if len(sentences) == 0 { + sentences = append(sentences, &tokenize.Sentence{ + Text: &languagepb.TextSpan{ + Content: text, + }, + }) + } + + dataset.Tokens = tokens + dataset.Sentences = sentences + + return dataset, nil +} + +func terminate(token *strings.Builder, tokens *[]*tokenize.Token) (tok string) { + tok = token.String() + *tokens = append(*tokens, &tokenize.Token{ + Text: &languagepb.TextSpan{ + Content: tok, + }, + }) + token.Reset() + return +} diff --git a/tokenize/dataset.go b/tokenize/dataset.go new file mode 100644 index 0000000..43a8798 --- /dev/null +++ b/tokenize/dataset.go @@ -0,0 +1,8 @@ +package tokenize + +type Dataset struct { + Sentiment *Sentiment + + Sentences []*Sentence + Tokens []*Token +} diff --git a/tokenize/feats.go b/tokenize/feats.go new file mode 100644 index 0000000..be05fbc --- /dev/null +++ b/tokenize/feats.go @@ -0,0 +1,10 @@ +package tokenize + +type Feature int + +const ( + FeatureAll Feature = FeatureSyntax | FeatureSentiment + + FeatureSyntax Feature = 1 << iota + FeatureSentiment +) diff --git a/tokenize/interface.go b/tokenize/interface.go new file mode 100644 index 0000000..c30aa33 --- /dev/null +++ b/tokenize/interface.go @@ -0,0 +1,9 @@ +package tokenize + +import ( + "context" +) + +type Tokenizer interface { + Tokenize(ctx context.Context, text string, feats Feature) (Dataset, error) +} diff --git a/tokenize/nlp/retry/errors.go b/tokenize/nlp/retry/errors.go new file mode 100644 index 0000000..1980029 --- /dev/null +++ b/tokenize/nlp/retry/errors.go @@ -0,0 +1,5 @@ +package retry + +import "errors" + +var ErrRetriesExhausted = errors.New("max retries 
reached") diff --git a/tokenize/nlp/retry/retry.go b/tokenize/nlp/retry/retry.go new file mode 100644 index 0000000..7bf7348 --- /dev/null +++ b/tokenize/nlp/retry/retry.go @@ -0,0 +1,60 @@ +package retry + +import ( + "context" + "errors" + "math/rand/v2" + "time" + + "github.com/googleapis/gax-go/v2/apierror" + "google.golang.org/genproto/googleapis/api/error_reason" +) + +func Retry(ctx context.Context, req func() error) error { + // Retry request up to retries times if rate limit exceeded with an + // growing delay + const ( + retries = 6 + backoff = 180 // In seconds + ) + + var ( + try = 0 + delay = 1 // In seconds + ) + for { + select { + case <-ctx.Done(): + return ctx.Err() + default: + } + + // Retrier exhausted + if try >= retries { + return ErrRetriesExhausted + } + + // Request + err := req() + var e *apierror.APIError + if errors.As(err, &e) { + // Check for rate limit exceeded error + if e.Reason() != error_reason.ErrorReason_RATE_LIMIT_EXCEEDED.String() { + // Other error + return err + } + + // Exponentially back-off + time.Sleep(time.Second * time.Duration(delay)) + delay = min( + backoff, + delay*delay+rand.IntN(10-1)+1, // delay² + jitter[1, 10) + ) + + try++ + continue + } + + return err + } +} diff --git a/tokenize/nlp/tokenizer.go b/tokenize/nlp/tokenizer.go new file mode 100644 index 0000000..52978bb --- /dev/null +++ b/tokenize/nlp/tokenizer.go @@ -0,0 +1,85 @@ +package nlp + +import ( + "context" + + "github.com/ndabAP/assocentity/v15/tokenize" + v1 "github.com/ndabAP/assocentity/v15/tokenize/nlp/v1" + v2 "github.com/ndabAP/assocentity/v15/tokenize/nlp/v2" + "golang.org/x/sync/errgroup" +) + +// AutoLang tries to automatically recognize the language +const AutoLang = "auto" + +// nlp tokenizers a text using Googles Natural Language AI +type nlp struct { + creds string + lang string +} + +// New returns a new Google Natural Language AI tokenizer instance. 
Note that +// NLP has a built-in retrier +func New(creds, lang string) tokenize.Tokenizer { + return nlp{ + creds: creds, + lang: lang, + } +} + +// Tokenize implements tokenize.Tokenizer +func (nlp nlp) Tokenize(ctx context.Context, text string, feats tokenize.Feature) (tokenize.Dataset, error) { + dataset := tokenize.Dataset{} + + fns := make([]func() error, 0) + + // Analyse syntax + syntaxfn := func() error { + res, err := v1.NewV1(nlp.creds, nlp.lang).Syntax(ctx, text) + if err != nil { + return err + } + + dataset.Sentences = res.GetSentences() + dataset.Tokens = res.GetTokens() + return nil + } + if feats&tokenize.FeatureSyntax != 0 { + fns = append(fns, syntaxfn) + } + // Analyse sentiment + var v2feats v2.Features + annotatefn := func(feats v2.Features) func() error { + return func() error { + res, err := v2.NewV2(nlp.creds, nlp.lang).Annotate(ctx, text, feats) + if err != nil { + return err + } + + dataset.Sentiment = res.GetDocumentSentiment() + return nil + } + } + + // Features + if feats&tokenize.FeatureSentiment != 0 { + v2feats.ExtractSentiment = true + } + if feats&tokenize.FeatureSentiment != 0 { + fns = append(fns, annotatefn(v2feats)) + } + // All features + if feats == tokenize.FeatureAll { + fns = []func() error{syntaxfn, annotatefn(v2feats)} + } + + g, ctx := errgroup.WithContext(ctx) + for _, fn := range fns { + g.Go(fn) + } + if err := g.Wait(); err != nil { + return dataset, err + } + + return dataset, nil +} diff --git a/tokenize/nlp/v1/req.go b/tokenize/nlp/v1/req.go new file mode 100644 index 0000000..5b9d76f --- /dev/null +++ b/tokenize/nlp/v1/req.go @@ -0,0 +1,54 @@ +package v1 + +import ( + "context" + + language "cloud.google.com/go/language/apiv1beta2" + "cloud.google.com/go/language/apiv1beta2/languagepb" + "github.com/ndabAP/assocentity/v15/tokenize/nlp/retry" + "google.golang.org/api/option" +) + +type api struct { + creds string + lang string +} + +func NewV1(creds, lang string) api { + return api{ + creds: creds, + lang: lang, + } +} + +func (v1 api) Syntax(ctx context.Context, text string) (*languagepb.AnalyzeSyntaxResponse, error) { + client, err := language.NewClient(ctx, option.WithCredentialsFile(v1.creds)) + if err != nil { + return &languagepb.AnalyzeSyntaxResponse{}, err + } + defer client.Close() + + doc := &languagepb.Document{ + Source: &languagepb.Document_Content{ + Content: text, + }, + Type: languagepb.Document_PLAIN_TEXT, + } + if v1.lang != "auto" { + doc.Language = v1.lang + } + + var res *languagepb.AnalyzeSyntaxResponse + if err := retry.Retry(ctx, func() error { + res, err = client.AnalyzeSyntax(ctx, &languagepb.AnalyzeSyntaxRequest{ + EncodingType: languagepb.EncodingType_UTF8, + Document: doc, + }) + + return err + }); err != nil { + return res, err + } + + return res, err +} diff --git a/tokenize/nlp/v2/req.go b/tokenize/nlp/v2/req.go new file mode 100644 index 0000000..023fb85 --- /dev/null +++ b/tokenize/nlp/v2/req.go @@ -0,0 +1,63 @@ +package v2 + +import ( + "context" + + language "cloud.google.com/go/language/apiv2" + "cloud.google.com/go/language/apiv2/languagepb" + "github.com/ndabAP/assocentity/v15/tokenize/nlp/retry" + "google.golang.org/api/option" +) + +type api struct { + creds string + lang string +} + +type Features struct { + ExtractSentiment bool +} + +func NewV2(creds, lang string) api { + return api{ + creds: creds, + lang: lang, + } +} + +func (v2 api) Annotate(ctx context.Context, text string, feats Features) (*languagepb.AnnotateTextResponse, error) { + client, err := language.NewClient(ctx, 
option.WithCredentialsFile(v2.creds)) + if err != nil { + return &languagepb.AnnotateTextResponse{}, err + } + defer client.Close() + + doc := &languagepb.Document{ + Source: &languagepb.Document_Content{ + Content: text, + }, + Type: languagepb.Document_PLAIN_TEXT, + } + if v2.lang != "auto" { + doc.LanguageCode = v2.lang + } + + var res *languagepb.AnnotateTextResponse + if err := retry.Retry(ctx, func() error { + f := &languagepb.AnnotateTextRequest_Features{} + if feats.ExtractSentiment { + f.ExtractDocumentSentiment = true + } + + res, err = client.AnnotateText(ctx, &languagepb.AnnotateTextRequest{ + Document: doc, + Features: f, + }) + + return err + }); err != nil { + return res, err + } + + return res, err +} diff --git a/tokenize/relationship.go b/tokenize/relationship.go new file mode 100644 index 0000000..94ec223 --- /dev/null +++ b/tokenize/relationship.go @@ -0,0 +1,71 @@ +package tokenize + +import ( + v1beta2 "cloud.google.com/go/language/apiv1beta2/languagepb" +) + +// Relationships returns all relationsships the given token has +func (dataset Dataset) Relationships(token *Token) *[]*v1beta2.DependencyEdge { + if token == NilToken { + return nil + } + if token == nil { + return nil + } + if token.DependencyEdge == nil { + return nil + } + + deps := make([]*v1beta2.DependencyEdge, 0) + for _, t := range dataset.Tokens { + if t == NilToken { + continue + } + if t.DependencyEdge == nil { + continue + } + + if t.DependencyEdge.HeadTokenIndex == token.DependencyEdge.HeadTokenIndex { + deps = append(deps, t.DependencyEdge) + } + } + + return &deps +} + +func (dataset Dataset) HasRelationship(token, u *Token, dep v1beta2.DependencyEdge_Label) bool { + if token == NilToken { + return false + } + if token == nil { + return false + } + if token.DependencyEdge == nil { + return false + } + if u == NilToken { + return false + } + if u == nil { + return false + } + if u.DependencyEdge == nil { + return false + } + + deps := dataset.Relationships(token) + if deps == nil { + return false + } + for _, d := range *deps { + if u.DependencyEdge.HeadTokenIndex != d.HeadTokenIndex { + continue + } + + if d.Label == dep { + return true + } + } + + return false +} diff --git a/tokenize/sentence.go b/tokenize/sentence.go new file mode 100644 index 0000000..c3a7872 --- /dev/null +++ b/tokenize/sentence.go @@ -0,0 +1,7 @@ +package tokenize + +import ( + v1beta2 "cloud.google.com/go/language/apiv1beta2/languagepb" +) + +type Sentence = v1beta2.Sentence diff --git a/tokenize/sentiment.go b/tokenize/sentiment.go new file mode 100644 index 0000000..51d82f4 --- /dev/null +++ b/tokenize/sentiment.go @@ -0,0 +1,5 @@ +package tokenize + +import v2 "cloud.google.com/go/language/apiv2/languagepb" + +type Sentiment = v2.Sentiment diff --git a/tokenize/token.go b/tokenize/token.go new file mode 100644 index 0000000..0de492b --- /dev/null +++ b/tokenize/token.go @@ -0,0 +1,11 @@ +package tokenize + +import ( + v1beta2 "cloud.google.com/go/language/apiv1beta2/languagepb" +) + +type Token = v1beta2.Token + +// NilToken can be used as placeholder tokens to not loose relationship +// properties +var NilToken = &Token{} diff --git a/tokenize/tokenize.go b/tokenize/tokenize.go deleted file mode 100644 index ad8e637..0000000 --- a/tokenize/tokenize.go +++ /dev/null @@ -1,77 +0,0 @@ -package tokenize - -import ( - "context" -) - -// Part of speech -type PoS int - -const ( - ANY = ADJ | ADP | ADV | AFFIX | CONJ | DET | NOUN | NUM | PRON | PRT | PUNCT | UNKN | VERB | X - - UNKN PoS = 1 << iota // Unknown - X // Other: 
foreign words, typos, abbreviations - - ADJ // Adjective - ADP // Adposition - ADV // Adverb - AFFIX // Affix - CONJ // Conjunction - DET // Determiner - NOUN // Noun - NUM // Cardinal number - PRON // Pronoun - PRT // Particle or other function word - PUNCT // Punctuation - VERB // Verb (all tenses and modes) -) - -// Tokenizer tokenizes a text and entities -type Tokenizer interface { - Tokenize(ctx context.Context, text string) ([]Token, error) -} - -// Token represents a tokenized text unit -type Token struct { - PoS PoS // Part of speech - Text string // Text -} - -var ( - // PoSMap maps pos strings to types - PoSMap = map[string]PoS{ - "any": ANY, - "adj": ADJ, - "adv": ADV, - "affix": AFFIX, - "conj": CONJ, - "det": DET, - "noun": NOUN, - "num": NUM, - "pron": PRON, - "prt": PRT, - "punct": PUNCT, - "unknown": UNKN, - "verb": VERB, - "x": X, - } - - // PoSMap maps pos types to strings - PoSMapStr = map[PoS]string{ - UNKN: "UNKNOWN", - ADJ: "ADJ", - ADP: "ADP", - ADV: "ADV", - CONJ: "CONJ", - DET: "DET", - NOUN: "NOUN", - NUM: "NUM", - PRON: "PRON", - PRT: "PRT", - PUNCT: "PUNCT", - VERB: "VERB", - X: "X", - AFFIX: "AFFIX", - } -)
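
For orientation, a minimal end-to-end sketch of the reworked API this diff introduces, assuming the root package is imported from github.com/ndabAP/assocentity/v15 and using the new offline ASCII tokenizer added here; error handling is kept short, and whatever is done with the returned Analyses value lives outside this diff:

```go
package main

import (
	"context"
	"fmt"
	"log"

	assocentity "github.com/ndabAP/assocentity/v15"
	"github.com/ndabAP/assocentity/v15/tokenize"
	"github.com/ndabAP/assocentity/v15/tokenize/ascii"
)

func main() {
	// Entities (duplicates are dropped by NewSource) and the texts to analyze
	source := assocentity.NewSource(
		[]string{"Max Payne", "Max"},
		[]string{"Relax, Max. You're a nice guy."},
	)

	// Offline ASCII tokenizer from this change; the Google NLP tokenizer
	// (tokenize/nlp) is a drop-in alternative
	tokenizer := ascii.New(ascii.EnglishSep, ascii.EnglishIntPunct, ascii.EnglishTermPunct)

	// Syntax-only analysis of the texts
	analyses, err := source.Analyses(context.TODO(), tokenizer, tokenize.FeatureSyntax)
	if err != nil {
		log.Fatal(err)
	}
	fmt.Printf("%+v\n", analyses)
}
```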
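
The Analyses method also accepts variadic WithTransformer hooks that mutate each text token in place right after tokenization. Below is a hypothetical transformer that mirrors the "&" to "and" synonym replacement of the removed HumandReadableNormalizer; it is a sketch against the signatures in this diff, not part of the change itself:

```go
package main

import (
	"context"
	"log"

	assocentity "github.com/ndabAP/assocentity/v15"
	"github.com/ndabAP/assocentity/v15/tokenize"
	"github.com/ndabAP/assocentity/v15/tokenize/ascii"
)

func main() {
	// Hypothetical transformer: tokenize.Token aliases the Google languagepb
	// Token, so the surface text lives in Text.Content
	ampersand := func(tok *tokenize.Token) *tokenize.Token {
		if tok.Text != nil && tok.Text.Content == "&" {
			tok.Text.Content = "and"
		}
		return tok
	}

	source := assocentity.NewSource(
		[]string{"Payne"},
		[]string{"Punchinello & Payne"},
	)
	tokenizer := ascii.New(ascii.EnglishSep, ascii.EnglishIntPunct, ascii.EnglishTermPunct)

	// Transformers run against every text token before entity locations are
	// collected
	if _, err := source.Analyses(context.TODO(), tokenizer, tokenize.FeatureSyntax, ampersand); err != nil {
		log.Fatal(err)
	}
}
```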