Merge pull request #1 from londogard/Lundez/gitpod-setup
GitPod + BeamSearch (& more)
Lundez authored Aug 21, 2020
2 parents b0b16ef + 7969d45 commit e3c1467
Showing 56 changed files with 2,613 additions and 2,098 deletions.
10 changes: 10 additions & 0 deletions .github/dependabot.yml
@@ -0,0 +1,10 @@
version: 2

updates:
  # Enable version updates for Gradle
  - package-ecosystem: "gradle"
    # Look for build files in the root directory
    directory: "/"
    # Check for updates once a week
    schedule:
      interval: "weekly"
3 changes: 0 additions & 3 deletions .github/workflows/build_project.yml
@@ -18,9 +18,6 @@ jobs:
      - name: Download Dependencies
        run: ./gradlew dependencies

      #- name: Check Kotlin Formatting
      #  run: ./gradlew ktlintCheck --continue

17 changes: 0 additions & 17 deletions .github/workflows/dependencies.yml

This file was deleted.

1 change: 1 addition & 0 deletions .gitpod.Dockerfile
@@ -0,0 +1 @@
RUN brew install kotlin
7 changes: 7 additions & 0 deletions .gitpod.yml
@@ -0,0 +1,7 @@
tasks:
  - init: ./gradlew clean build
    command: ./gradlew test

vscode:
  extensions:
    - mathiasfrohlich.Kotlin@1.7.1:2Uw0hH38NcpgInldjuwF3g==
183 changes: 134 additions & 49 deletions README.md
@@ -1,71 +1,156 @@
[![Gitpod ready-to-code](https://img.shields.io/badge/Gitpod-ready--to--code-blue?logo=gitpod)](https://gitpod.io/#https://github.com/londogard/text-gen-kt)

<a href='https://ko-fi.com/O5O819SEH' target='_blank'><img height='24' style='border:0px;height:24px;' src='https://az743702.vo.msecnd.net/cdn/kofi2.png?v=2' border='0' alt='Buy Me a Coffee at ko-fi.com' /></a>[![](https://jitpack.io/v/com.londogard/text-gen-kt.svg)](https://jitpack.io/#com.londogard/text-gen-kt)

# text-gen-kt
Text Generation in Kotlin that's 'light' on resources.

- Pre-trained models (Shakespeare & Cards Against Humanity)
- Easy-to-use API (training & generating text)
- Customizable

## Installation
<details open>
<summary><b>Jitpack</b> (the easiest)</summary>
<br>
Add the following to your <code>build.gradle</code>. <code>$version</code> should be equal to the version supplied by tag above.
<br>
<br>
<pre>
repositories {
    maven { url "https://jitpack.io" }
}
dependencies {
    implementation 'com.londogard:text-gen-kt:$version'
}
</pre>
</details>
<details>
<summary><b>GitHub Packages</b></summary>
<br>
Add the following to your <code>build.gradle</code>. <code>$version</code> should be equal to the version supplied by tag above.
Logging in to the GitHub repository this way is how I understand that you need to authenticate. If you know a better way, please ping me in an issue.
<br>
<br>
<pre>
repositories {
    maven {
        url = uri("https://maven.pkg.github.com/londogard/text-gen-kt")
        credentials {
            username = project.findProperty("gpr.user") ?: System.getenv("GH_USERNAME")
            password = project.findProperty("gpr.key") ?: System.getenv("GH_TOKEN")
        }
    }
}
dependencies {
    implementation "com.londogard:text-gen-kt:$version"
}
</pre>
</details>

## Usage
Only the simplest API usage is shown, with no overrides. It should be straightforward to override
the different options.

**Loading a Pretrained Model and Text Generation**
Find a few pre-trained models [here](https://github.com/londogard/text-gen-kt/blob/master/files/models/).
They include Shakespeare and Cards Against Humanity (Black & White card versions).
```kotlin
// Have a pretrained model locally, in say 'shakespeare.cbor'
val absPathToModel = "/path/to/shakespeare.cbor"
val languageModel = LanguageModel.loadPretrainedModel(absPathToModel)

// There are a lot of configs to change if you'd like, but this is the simplest text generation.
val generatedSentences: List<String> = SimpleTextGeneration.generateText(languageModel = languageModel)
generatedSentences.forEach(::println)

SimpleTextGeneration
    .generateText(languageModel, seed = "This is who I am")
    .forEach(::println)

// Prints the generated sentences, all of which start with "This is who I am"
```
The models are saved using `Concise Binary Object Representation` (RFC 7049).

**Training your own Model**
```kotlin
// Have some text you wish to train on
val documents: List<String> = listOf(File("somePath").readText())

// n selects how much context you want the model to remember. We use the default tokenizer here.
val trainedModel = LanguageModel.trainModel(documents, n = 3)
trainedModel.serialize("/path/to/model.cbor")

val generatedSentences: List<String> = SimpleTextGeneration.generateText(languageModel = trainedModel)
generatedSentences.forEach(::println)
```

## Steps in text-generation
Search calls smoothing to retrieve tokens & probabilities.
Smoothing accesses the Language Model to retrieve probabilities; if they don't
exist, it smooths them out somehow, meaning that you find the closest match.

> **Smoothing example**
> ["hej", "där", "borta"] has never been seen in the data, so we don't know what to generate as
> the next word. Simple back-off smoothing would then check whether ["där", "borta"] exists in the data and try
> to generate a word from that instead.
> There are different ways to smooth data, but in essence it's the idea of finding a value for something we've never
> seen before.

Smoothing then applies penalties and finally normalization.
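The flow above can be sketched as a single generation step. This is a simplified, hypothetical sketch with made-up names (`generationStep`, a plain counts map), not the library's actual API:

```kotlin
// One generation step: smoothing -> penalties -> normalization -> search.
fun generationStep(
    counts: Map<List<String>, Map<String, Int>>,
    context: List<String>,
    generated: Set<String>,
    penalty: Double = 0.5
): String? {
    // Smoothing: back off until we find a context we have counts for.
    var ctx = context
    var candidates: Map<String, Int> = emptyMap()
    while (ctx.isNotEmpty() && candidates.isEmpty()) {
        candidates = counts[ctx] ?: emptyMap()
        ctx = ctx.drop(1)
    }
    if (candidates.isEmpty()) return null
    // Penalties: down-weight tokens we've already generated.
    val penalized = candidates.mapValues { (token, count) ->
        if (token in generated) count * penalty else count.toDouble()
    }
    // Normalization: turn scores into a probability distribution.
    val total = penalized.values.sum()
    val probs = penalized.mapValues { it.value / total }
    // Search (greedy): pick the most probable token.
    return probs.maxByOrNull { it.value }?.key
}
```

In a real generator this step would be repeated, appending each picked token to the context.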

## Structure
There are a few different components:

1. Language Model
2. Tokenizer
3. Normalization
4. Smoothing
5. Search
6. Penalties

The idea is that the Language Model is basically a storage of probabilities.
To generate text we somehow need to tap into this 'database' and fetch values
in an interesting way. This is done using the tools in 2-6.
This division is made in such a fashion that we actually don't care whether it's word-level
or character-level text generation (or anything else, really). The trained Language Model
can simply be used to generate text in a lot of different fashions, with different penalties
and a lot more!

### Language Model
The Language Model is basically just a storage, with some clever structure.
There are two ways to get a Language Model: either load a pretrained model through
a config-file, or train it yourself on some text!

### Tokenizer
The Tokenizer is a tool that splits text into tokens. A simple tokenizer could
tokenize characters, i.e. one character per token. Another could split on words, e.g.
`tokenize("vem kan hitta min keps?") = ["vem", "kan", "hitta", "min", "keps", "?"]`.
Clever approaches sometimes split words like `kasper's` into `kasper` & `'s`, which
reduces the dimensionality a bit.
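As a toy illustration, the word-splitting behaviour described above could look like this (a hypothetical `tokenize` helper, not the library's actual Tokenizer):

```kotlin
// Toy word-level tokenizer: runs of word characters become tokens,
// and each punctuation character is split off as its own token.
fun tokenize(text: String): List<String> =
    Regex("""\w+|[^\w\s]""").findAll(text).map { it.value }.toList()

// tokenize("vem kan hitta min keps?") → [vem, kan, hitta, min, keps, ?]
```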

### Normalization
When all probabilities are retrieved they need to be normalized to lie in `[0, 1]` (0-100 %) and sum to 1.
This can be done in different ways, the simplest being to divide each value by the sum of all of them.
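A minimal sketch of the simplest variant (divide by the sum); the names are illustrative, not the library's API:

```kotlin
// Normalize raw scores into probabilities that sum to 1.
fun normalize(scores: Map<String, Double>): Map<String, Double> {
    val total = scores.values.sum()
    return scores.mapValues { (_, score) -> score / total }
}

// normalize(mapOf("a" to 1.0, "b" to 3.0)) → {a=0.25, b=0.75}
```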

### Smoothing
Smoothing, in this case, is how probabilities are retrieved. If the Language Model does not contain
a context we still need to generate text, so how is this done? Smoothing decides how the probabilities
and tokens should be found.
A simple method is to "back off": if we don't find anything for `"who is there"`, the
model may still contain `"is there"`, which we then want to return.
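The back-off idea can be sketched like this (hypothetical names; the plain counts map stands in for whatever the Language Model actually stores):

```kotlin
// Back-off-style lookup: if the full context is unseen,
// drop the oldest token and try the shorter context instead.
fun backoff(
    counts: Map<List<String>, Map<String, Int>>,
    context: List<String>
): Map<String, Int> {
    var ctx = context
    while (ctx.isNotEmpty()) {
        counts[ctx]?.let { return it }
        ctx = ctx.drop(1)
    }
    return emptyMap()
}
```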

### Search
Search is basically how we select among the tokens returned by the Smoothing.
A greedy search just selects the top probability each time.
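A greedy pick can be sketched in one line (hypothetical helper; the beam search this commit adds would instead keep several candidate sequences rather than only the single best token):

```kotlin
// Greedy search: always take the token with the highest probability.
fun greedyPick(probs: Map<String, Double>): String? =
    probs.maxByOrNull { it.value }?.key
```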

### Penalties
A penalty is simply a way to penalize certain features. E.g. swear words might be off-limits,
or we might not want to generate the same n-gram again. It's up to you!
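For example, a repetition penalty could down-weight tokens that have already been generated. A hedged sketch with made-up names, not the library's Penalty API:

```kotlin
// Multiply the probability of already-generated tokens by a penalty factor < 1.
fun applyRepetitionPenalty(
    probs: Map<String, Double>,
    alreadyGenerated: Set<String>,
    penalty: Double = 0.5
): Map<String, Double> =
    probs.mapValues { (token, p) -> if (token in alreadyGenerated) p * penalty else p }
```

After penalizing, the scores would typically be re-normalized before search picks a token.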

## Available Models
- shakespeare_char.cbor (n=100, keepMinFreq=5)
- shakespeare_word.cbor (n=100, keepMinFreq=1)
- cardsagainst_white_char.cbor (n=100, keepMinFreq=1)
- cardsagainst_white_word.cbor (n=100, keepMinFreq=1)
- cardsagainst_black_char.cbor (n=100, keepMinFreq=1)
- cardsagainst_black_word.cbor (n=100, keepMinFreq=1)
39 changes: 18 additions & 21 deletions build.gradle.kts
@@ -1,31 +1,25 @@
import org.jetbrains.kotlin.gradle.tasks.KotlinCompile

buildscript {
    repositories {
        mavenCentral()
        jcenter()
    }

    val kotlinVersion = "1.4.0"
    dependencies {
        classpath(kotlin("gradle-plugin", kotlinVersion))
        classpath("org.jetbrains.kotlin:kotlin-serialization:$kotlinVersion")
    }
}

plugins {
    `maven-publish`
    id("org.jetbrains.dokka") version "0.10.1"
    id("org.jetbrains.kotlin.plugin.serialization") version ("1.4.0")
    kotlin("jvm") version "1.4.0"
}

group = "com.londogard"
version = "1.1.0"
val serializationVersion = "1.0.0-RC"
val kotlinVersion = "1.4.0"

repositories {
    mavenCentral()
@@ -34,13 +28,16 @@ repositories {
}

dependencies {
    implementation(kotlin("stdlib"))
    api("org.slf4j:slf4j-api:1.7.30")
    // implementation("it.unimi.dsi:fastutil:8.4.1")

    implementation("org.jetbrains.kotlinx:kotlinx-serialization-core:$serializationVersion")
    implementation("org.jetbrains.kotlinx:kotlinx-serialization-cbor:$serializationVersion")

    testImplementation("org.jetbrains.kotlin:kotlin-test:$kotlinVersion")
    testImplementation("org.jetbrains.kotlin:kotlin-test-junit:$kotlinVersion")
    testImplementation("org.amshove.kluent:kluent:1.61")
}

tasks.withType<KotlinCompile> {
