feat: v15 #37

Draft · wants to merge 33 commits into base: main
4 changes: 0 additions & 4 deletions .gitignore
@@ -1,4 +0,0 @@
.idea/
.vscode/
*.json
.env
26 changes: 26 additions & 0 deletions .vscode/launch.json
@@ -0,0 +1,26 @@
{
// Use IntelliSense to learn about possible attributes.
// Hover to view descriptions of existing attributes.
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
"version": "0.2.0",
"configurations": [
{
"name": "NLP tokenize example",
"type": "go",
"request": "launch",
"mode": "auto",
"program": "${workspaceFolder}/examples/nlp.go",
"env": {
"GOOGLE_NLP_CREDS_PATH": "${input:google_nlp_creds_path}"
},
"args": []
}
],
"inputs": [
{
"id": "google_nlp_creds_path",
"description": "Google Natural Language API credentials path",
"type": "promptString"
}
]
}
2 changes: 1 addition & 1 deletion LICENSE
@@ -1,6 +1,6 @@
MIT License

Copyright (c) 2020 Julian Claus
Copyright (c) 2025 Julian Claus

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
7 changes: 2 additions & 5 deletions Makefile
@@ -15,17 +15,14 @@ build: windows linux darwin
@echo version: $(VERSION)

windows: $(WINDOWS)

linux: $(LINUX)

darwin: $(DARWIN)

$(WINDOWS):
env GOOS=windows GOARCH=amd64 go build -v -o bin/$(WINDOWS) -ldflags="-s -w -X main.version=$(VERSION)" ./cli/main.go

linux: $(LINUX)
$(LINUX):
env GOOS=linux GOARCH=amd64 go build -v -o bin/$(LINUX) -ldflags="-s -w -X main.version=$(VERSION)" ./cli/main.go

darwin: $(DARWIN)
$(DARWIN):
env GOOS=darwin GOARCH=amd64 go build -v -o bin/$(DARWIN) -ldflags="-s -w -X main.version=$(VERSION)" ./cli/main.go

169 changes: 3 additions & 166 deletions README.md
@@ -1,166 +1,3 @@
# assocentity

[![Go Report Card](https://goreportcard.com/badge/github.com/ndabAP/assocentity/v13)](https://goreportcard.com/report/github.com/ndabAP/assocentity/v13)

Package assocentity is a social science tool to analyze the relative distance
from tokens to entities. The motivation is to draw conclusions based on the
distance from interesting tokens to a certain entity and its synonyms.

## Features

- Provide your own tokenizer
- Provides a default NLP tokenizer (by Google)
- Define aliases for entities
- Provides a multi-OS, language-agnostic CLI version

## Installation

```bash
$ go get github.com/ndabAP/assocentity/v13
```

## Prerequisites

If you want to analyze human-readable texts, you can use the provided Natural
Language tokenizer (powered by Google). To do so, sign up for a Cloud Natural
Language API service account key and download the generated JSON file. This
file corresponds to the `credentialsFile` in the example below. You should
never commit that file.

A possible offline alternative is a whitespace tokenizer. Depending on your
purposes, you might also use a parser.

## Example

We would like to find out, on average, how close adjectives are to a certain
public person. Let's take George W. Bush and 1,000 NBC news articles as an
example. "George Bush" is the entity, and "George Walker Bush", "Bush" and so
on are its synonyms. The text is each of the 1,000 NBC news articles.

The first step is to define a text source and set the entity. Next, we
instantiate our tokenizer; in this case, we use the provided Google NLP
tokenizer. Then we calculate the distances with `assocentity.Distances`,
which accepts multiple texts. Notice how we pass `tokenize.ADJ` to only
include adjectives as parts of speech. Finally, we take the mean by passing
the result to `assocentity.Mean`.

```go
// Define texts source and entity
texts := []string{
"Former Presidents Barack Obama, Bill Clinton and ...", // Truncated
"At the pentagon on the afternoon of 9/11, ...",
"Tony Blair moved swiftly to place his relationship with ...",
}
entities := []string{
"George Walker Bush",
"George Bush",
"Bush",
}
source := assocentity.NewSource(entities, texts)

// Instantiate the NLP tokenizer (powered by Google)
nlpTok := nlp.NewNLPTokenizer(credentialsFile, nlp.AutoLang)

// Get the distances to adjectives
ctx := context.TODO()
dists, err := assocentity.Distances(ctx, nlpTok, tokenize.ADJ, source)
if err != nil {
// Handle error
}
// Get the mean from the distances
mean := assocentity.Mean(dists)
```

### Tokenization

If you provide your own tokenizer you must implement the interface with the
method `Tokenize` and the following signature:

```go
type Tokenizer interface {
Tokenize(ctx context.Context, text string) ([]Token, error)
}
```

`Token` is of type:

```go
type Token struct {
PoS PoS // Part of speech
Text string // Text
}

// Part of speech
type PoS int
```

For example, given the text:

```go
text := "Punchinello was burning to get me"
```

The result from `Tokenize` would be:

```go
[]Token{
{
Text: "Punchinello",
PoS: tokenize.NOUN,
},
{
Text: "was",
PoS: tokenize.VERB,
},
{
Text: "burning",
PoS: tokenize.VERB,
},
{
Text: "to",
PoS: tokenize.PRT,
},
{
Text: "get",
PoS: tokenize.VERB,
},
{
Text: "me",
PoS: tokenize.PRON,
},
}
```

## CLI

There is also a language-agnostic terminal version available for Windows,
macOS (Darwin), and Linux (64-bit only) if you don't have Go available.
The application reads the text from stdin and accepts the following flags:

| Flag | Description | Type | Default |
| ------------- | ------------------------------------------------------------------------------------------------- | -------- | ------- |
| `entities` | Define entities to be searched within input, example: `-entities="Max Payne,Payne"` | `string` | |
| `gog-svc-loc` | Google Cloud NLP JSON service account file, example: `-gog-svc-loc="/home/max/gog-svc-loc.json"` | `string` | |
| `op` | Operation to execute: `-op="mean"` | `string` | `mean` |
| `pos` | Parts of speech to keep, example: `-pos=noun,verb,pron` | `string` | `any` |

Example:

```bash
echo "Relax, Max. You're a nice guy." | ./bin/assocentity_linux_amd64_v13.0.0-0-g948274a-dirty -gog-svc-loc=/home/max/.config/assocentity/google-service.json -entities="Max Payne,Payne,Max"
```

The output is written to stdout in an appropriate format.

## Projects using assocentity

- [entityscrape](https://github.com/ndabAP/entityscrape) - Distance between word
types (default: adjectives) in news articles and persons

## Author

[Julian Claus](https://www.julian-claus.de) and contributors.

## License

MIT
- TODO: Entities European leaders, reuse text entities, create sentiment with
descriptions like "Macron is a bad president"
- WithLema normalizer
Binary file added __debug_bin1563460954
Binary file not shown.
48 changes: 48 additions & 0 deletions analyses.go
@@ -0,0 +1,48 @@
package assocentity

import (
"iter"

"github.com/ndabAP/assocentity/v15/tokenize"
)

type (
Analyses struct {
frames []frame
entities map[string][]*tokenize.Token
}

frame struct {
dataset tokenize.Dataset
locs map[int]int
}
)

func (frame frame) Sentences() iter.Seq[[]*tokenize.Token] {
	return func(yield func([]*tokenize.Token) bool) {
		var (
			sentences = frame.dataset.Sentences
			tokens    = frame.dataset.Tokens
		)
		// Index into the token stream; tokens are consumed once,
		// in order, so each token belongs to exactly one sentence
		j := 0
		for i := range sentences {
			// Offset at which the next sentence begins; -1 means
			// this is the last sentence and it takes all remaining
			// tokens
			var next int32 = -1
			if i < len(sentences)-1 {
				next = sentences[i+1].Text.BeginOffset
			}

			toks := make([]*tokenize.Token, 0)
			for ; j < len(tokens); j++ {
				if tokens[j].Text.BeginOffset == next {
					// Next sentence starts here
					break
				}
				toks = append(toks, tokens[j])
			}

			if !yield(toks) {
				return
			}
		}
	}
}
27 changes: 27 additions & 0 deletions analyses_deps.go
@@ -0,0 +1,27 @@
package assocentity

import (
"cloud.google.com/go/language/apiv1beta2/languagepb"
)

type PartOfSpeech = languagepb.PartOfSpeech_Tag

func (analyses Analyses) Deps(pos PartOfSpeech) []string {
// for i, text := range analyses.dataset {
// coords := analyses.coords(i)

// // TODO: For each sentence

// for _, token := range text.Tokens {
// if token == tokenize.NilToken {
// continue
// }

// for s, e := range coords {
// if token.DependencyEdge.HeadTokenIndex == int32(s) {
// }
// }
// }
// }
return nil
}