Added tests

londogard · Apr 6, 2021 · a6e954b · a6e954b
1 parent 78cd483
commit a6e954b
Show file tree

Hide file tree

Showing 30 changed files with 440 additions and 126 deletions.
diff --git a/build.gradle.kts b/build.gradle.kts
@@ -2,14 +2,13 @@ import org.jetbrains.kotlin.gradle.tasks.KotlinCompile
 
 plugins {
     `maven-publish`
-    id("idea")
     kotlin("jvm") version "1.4.32"
     kotlin("plugin.serialization") version "1.4.32"
     id("org.jetbrains.dokka") version "1.4.30"
 }
 
 group = "com.londogard"
-version = "1.0-beta"
+version = "1.0"
 
 repositories {
     mavenCentral()
@@ -27,26 +26,24 @@ dependencies {
     implementation("org.ejml:ejml-simple:0.40")
     implementation("org.ejml:ejml-kotlin:0.40")
 
-    // implementation("com.github.levyfan:sentencepiece-jni:v0.0.2")
     implementation("ai.djl.sentencepiece:sentencepiece:0.10.0")
     implementation("com.github.rholder:snowball-stemmer:1.3.0.581.1")
-
-    // https://mvnrepository.com/artifact/org.apache.commons/commons-compress
     implementation("org.apache.commons:commons-compress:1.20")
 
-    implementation("org.codehaus.plexus:plexus-archiver:4.2.4")
-
     testImplementation("org.amshove.kluent:kluent:$kluentVersion")
     testImplementation("org.jetbrains.kotlin:kotlin-test:1.4.32")
     testImplementation(kotlin("test-junit"))
+    implementation(kotlin("stdlib-jdk8"))
 }
 
 tasks.test {
     useJUnit()
 }
 
-tasks.withType<KotlinCompile> {
-    kotlinOptions.jvmTarget = "1.8"
+// Applies to every Kotlin compile task (main and test); configureEach configures them lazily.
+tasks.withType<KotlinCompile>().configureEach {
+    kotlinOptions {
+        // Keep the 1.8 bytecode target that the replaced block set for all source sets;
+        // without it only compileTestKotlin (configured at the end of this script) targets 1.8.
+        jvmTarget = "1.8"
+        useIR = true
+    }
 }
 
 publishing {
@@ -65,4 +62,9 @@ publishing {
             from(components["java"])
         }
     }
+}
+
+val compileTestKotlin: KotlinCompile by tasks
+compileTestKotlin.kotlinOptions {
+    jvmTarget = "1.8"
 }
diff --git a/src/main/kotlin/com/londogard/nlp/embeddings/BpeEmbeddings.kt b/src/main/kotlin/com/londogard/nlp/embeddings/BpeEmbeddings.kt
@@ -23,10 +23,14 @@ class BpeEmbeddings(
             ?.avgNorm()
     }
 
+    /**
+     * True when every piece that [tokenizer] splits [word] into is present in [embeddings].
+     * A split that yields no pieces also gives true, because `all` holds for an empty list.
+     */
+    override fun contains(word: String): Boolean =
+        tokenizer.split(word).all { subword -> embeddings.contains(subword) }
+
     fun subwordVector(subword: String): SimpleMatrix? = embeddings[subword]
 
     companion object {
-        fun toTokenizer(filePath: Path): Tokenizer {
+        @JvmStatic fun toTokenizer(filePath: Path): Tokenizer {
             val rawNameTokens = filePath.fileName.toString().split('.')
 
             val languageSupport = LanguageSupport.valueOf(rawNameTokens.first())

diff --git a/src/main/kotlin/com/londogard/nlp/embeddings/EmbeddingLoader.kt b/src/main/kotlin/com/londogard/nlp/embeddings/EmbeddingLoader.kt
@@ -5,7 +5,6 @@ import com.londogard.nlp.utils.LanguageSupport
 import com.londogard.nlp.utils.useLines
 import org.ejml.simple.SimpleMatrix
 import java.nio.file.Path
-import kotlin.io.path.bufferedReader
 import kotlin.math.min
 
 object EmbeddingLoader {
@@ -24,7 +23,13 @@ object EmbeddingLoader {
         }
     }
 
-    // TODO inline fun <reified T: Embeddings> fromUrl(url: String): Map<String, SimpleMatrix> = TODO("")
+    /**
+     * Loads embeddings of the requested type [T] from the file at [path].
+     *
+     * [LightWordEmbeddings] and [BpeEmbeddings] are constructed when requested explicitly;
+     * every other [T] falls back to [WordEmbeddings], which is then cast to [T].
+     */
+    inline fun <reified T: Embeddings> fromFile(path: Path): T =
+        when (T::class) {
+            LightWordEmbeddings::class -> LightWordEmbeddings(path) as T
+            BpeEmbeddings::class -> BpeEmbeddings(path) as T
+            else -> WordEmbeddings(path) as T
+        }
 
     internal fun fromFile(path: Path,
                           delimiter: Char,
@@ -47,9 +52,4 @@ object EmbeddingLoader {
                     }
                     .toMap(LinkedHashMap(numLinesToUse)) // optimization by creating the full map directly
             }
-}
-
-fun main() {
-    val embeddings = EmbeddingLoader.fromLanguageOrNull<LightWordEmbeddings>(LanguageSupport.sv)
-    println(embeddings?.vector("Hej"))
 }
diff --git a/src/main/kotlin/com/londogard/nlp/embeddings/WordEmbeddings.kt b/src/main/kotlin/com/londogard/nlp/embeddings/WordEmbeddings.kt
@@ -89,7 +89,7 @@ class WordEmbeddings(
         /** Pretty print the list of words and their associated scores.
          * @param words List of (word, score) pairs to be printed.
          */
-        fun pprint(words: List<Pair<String, Double>>) {
+        @JvmStatic fun pprint(words: List<Pair<String, Double>>) {
             println("\n%50s${" ".repeat(7)}Cosine distance\n${"-".repeat(72)}".format("Word"))
             println(words.joinToString("\n") { (word, dist) -> "%50s${" ".repeat(7)}%15f".format(word, dist) })
         }

diff --git a/src/main/kotlin/com/londogard/nlp/embeddings/sentence/AverageSentenceEmbeddings.kt b/src/main/kotlin/com/londogard/nlp/embeddings/sentence/AverageSentenceEmbeddings.kt
@@ -1,6 +1,7 @@
 package com.londogard.nlp.embeddings.sentence
 
 import com.londogard.nlp.embeddings.Embeddings
+import com.londogard.nlp.utils.avgNorm
 import com.londogard.nlp.utils.normalize
 import org.ejml.simple.SimpleMatrix
 
@@ -11,7 +12,6 @@ class AverageSentenceEmbeddings(override val tokenEmbeddings: Embeddings): Sente
     override fun getSentenceEmbedding(sentence: List<String>): SimpleMatrix {
         return tokenEmbeddings
             .traverseVectors(sentence)
-            .reduce { acc, simpleMatrix -> acc + simpleMatrix }
-            .normalize()
+            .avgNorm()
     }
 }
diff --git a/src/main/kotlin/com/londogard/nlp/embeddings/sentence/USifSentenceEmbeddings.kt b/src/main/kotlin/com/londogard/nlp/embeddings/sentence/USifSentenceEmbeddings.kt
@@ -12,7 +12,7 @@ import kotlin.math.pow
 class USifSentenceEmbeddings(
     override val tokenEmbeddings: Embeddings,
     private val wordProb: Map<String, Float>,
-    randomWalkLength: Int, // = n, ~11
+    randomWalkLength: Int = 11, // = n, ~11
     private val numCommonDiscourseVector: Int = 5 // = m, 0 should work. In practise max 5.
 ) : SentenceEmbeddings {
     private val vocabSize = wordProb.size.toFloat()

diff --git a/src/main/kotlin/com/londogard/nlp/stemmer/Stemmer.kt b/src/main/kotlin/com/londogard/nlp/stemmer/Stemmer.kt
@@ -37,7 +37,7 @@ class Stemmer(language: LanguageSupport) {
         var cache: Pair<LanguageSupport, Stemmer>? = null
 
         // Default to PorterStemmer if not supported!
-        fun stem(word: String, language: LanguageSupport): String {
+        @JvmStatic fun stem(word: String, language: LanguageSupport): String {
             val cachedStemmer = cache
 
             return when (cachedStemmer?.first) {

diff --git a/src/main/kotlin/com/londogard/nlp/stopwords/Stopwords.kt b/src/main/kotlin/com/londogard/nlp/stopwords/Stopwords.kt
@@ -15,6 +15,7 @@ object Stopwords {
     fun isStopword(word: String, language: LanguageSupport): Boolean =
         stopwordsOrNull(language)?.contains(word) == true
 
+    @Throws(IllegalArgumentException::class)
     fun stopwords(language: LanguageSupport): Set<String> =
         stopwordsOrNull(language)
             ?: throw IllegalArgumentException("There exists not stopwords for language ${language.name}. Please try again with one of the supported languages.")

diff --git a/src/main/kotlin/com/londogard/nlp/structures/trie/Trie.kt b/src/main/kotlin/com/londogard/nlp/structures/trie/Trie.kt
@@ -64,20 +64,4 @@ fun findFirstMerger(trie: TrieNode, string: String): String? {
                 .firstOrNull()
         }
     }
-}
-
-fun main() {
-    val vocab = WordFrequencies.getAllWordFrequenciesOrNull(LanguageSupport.sv)?.toVocab() ?: emptyMap()
-    println(vocab.entries.sortedBy { it.value }.reversed().take(5))
-    val trie = Trie(vocab)
-    println(findFirstMerger(trie.rootNode.childNodes.entries.first().value, ""))
-
-    // could use foldRight (goes from other end..!)
-    // val reverseTrie = Trie(vocab.mapKeys { (key,_) -> key.reversed() })
-    // println(reverseTrie.rootNode.childNodes.map { it.key to it.value.count })
-
-    println(trie.rootNode.childNodes.getValue('ä').childNodes.map { it.key to it.value.count })
-    println(trie.rootNode.childNodes.map { it.key to it.value.count })
-    println(trie.rootNode.char)
-    println(trie.rootNode.count)
 }
diff --git a/src/main/kotlin/com/londogard/nlp/tokenizer/SentencePieceTokenizer.kt b/src/main/kotlin/com/londogard/nlp/tokenizer/SentencePieceTokenizer.kt
@@ -17,12 +17,12 @@ class SentencePieceTokenizer(modelPath: Path, vocabPath: Path? = null): Tokenize
     override fun split(text: String): List<String> = sentencePieceTokenizer.tokenize(text)
 
     companion object {
-        fun fromLanguageSupportOrNull(languageSupport: LanguageSupport): SentencePieceTokenizer? =
+        const val beginningOfWord: Char = '▁'
+        @JvmStatic fun fromLanguageSupportOrNull(languageSupport: LanguageSupport): SentencePieceTokenizer? =
             fromLanguageSupportAndSizeOrNull(languageSupport, VocabSize.v10_000)
-
-        fun fromLanguageSupportAndSizeOrNull(languageSupport: LanguageSupport, vocabSize: VocabSize) =
+        @JvmStatic fun fromLanguageSupportAndSizeOrNull(languageSupport: LanguageSupport, vocabSize: VocabSize) =
             if (languageSupport.hasSentencePiece()) {
-                val (vocab, model) = DownloadHelper.getBpeVocabModel(languageSupport, vocabSize.size)
+                val (vocab, model) = DownloadHelper.getSentencePieceVocabModel(languageSupport, vocabSize.size)
                 SentencePieceTokenizer(model, vocab)
             } else null
     }

diff --git a/src/main/kotlin/com/londogard/nlp/utils/CompressionUtil.kt b/src/main/kotlin/com/londogard/nlp/utils/CompressionUtil.kt
@@ -6,12 +6,12 @@ import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream
 import java.io.BufferedInputStream
 import java.io.IOException
 import java.io.InputStream
-import java.io.OutputStream
 import java.nio.file.Files
 import java.nio.file.Path
 import java.nio.file.StandardCopyOption
 import java.util.zip.GZIPInputStream
 
+/** Simplified usage of apache compress through Object functions. */
 object CompressionUtil {
     fun gunzip(path: Path): InputStream =
         path.toFile()

diff --git a/src/main/kotlin/com/londogard/nlp/utils/CustomExtensions.kt b/src/main/kotlin/com/londogard/nlp/utils/CustomExtensions.kt
@@ -5,6 +5,8 @@ import java.nio.file.Files
 import java.nio.file.Path
 import java.nio.file.attribute.FileAttribute
 
+/** Custom extensions for Path taken from Kotlin EXPERIMENTAL. */
+
 // Taken from Kotlin stdlib (EXPERIMENTAL)
 internal inline fun Path.readLines(charset: Charset = Charsets.UTF_8): List<String> =
     Files.readAllLines(this, charset)

diff --git a/src/main/kotlin/com/londogard/nlp/utils/DownloadHelper.kt b/src/main/kotlin/com/londogard/nlp/utils/DownloadHelper.kt
@@ -1,115 +1,79 @@
 package com.londogard.nlp.utils
 
 import com.londogard.nlp.embeddings.EmbeddingLoader.BpeDefaultEmbeddingDimension
-import com.londogard.nlp.tokenizer.toVocabSize
 import com.londogard.nlp.wordfreq.WordFrequencySize
 import java.net.URL
 import java.nio.file.Files
 import java.nio.file.Path
-import java.nio.file.Paths
 
 @PublishedApi
 internal object DownloadHelper {
-    private val rootPath: Path = Paths.get(System.getProperty("user.home")).resolve(".londogard")
-    private const val dataUrl: String = "https://raw.githubusercontent.com/londogard/londogard-nlp-toolkit/main/data"
-    private const val bpeUrl: String = "https://nlp.h-its.org/bpemb/"
-    private val stopwordPath: Path = rootPath.resolve("stopwords")
-    private val wordFrequencyPath: Path = rootPath.resolve("wordfreq")
-    private val embeddingPath: Path = rootPath.resolve("embeddings")
-    private val bpePath: Path = rootPath.resolve("bpe")
-
     fun getStopWords(language: LanguageSupport): Path {
-        val path = stopwordPath.resolve(language.name)
-        if (!Files.exists(path)) {
-            println("Language ${language.name} does not have stopwords locally. Will download (few KBs)...")
+        val fileInfo = UrlProvider.stopwords(language) // local path and source URL of the stopword file
+        downloadFileIfMissing(fileInfo) // no-op when the file is already on disk
 
-            "$dataUrl/stopwords/${language.name}".saveTo(path)
+        return fileInfo.path // local path of the stopword file
+    }
 
-            println("Download done! ${language.name} stopwords located at ${path.toAbsolutePath()}")
-        }
+    fun getWordFrequencies(language: LanguageSupport, size: WordFrequencySize = WordFrequencySize.Smallest): Path { // size defaults to the Smallest list
+        val fileInfo = UrlProvider.wordfreq(language, size) // local path and source URL for this language/size
+        downloadFileIfMissing(fileInfo) // no-op when the file is already on disk
 
-        return path
+        return fileInfo.path // local path of the word-frequency file
     }
 
-    fun getWordFrequencies(language: LanguageSupport, size: WordFrequencySize = WordFrequencySize.Smallest): Path {
-        val filename = size.toFileName(language)
-        val path = wordFrequencyPath.resolve(filename)
-        if (!Files.exists(path)) {
-            println("Language ${language.name} does not have (${size.name}) word frequencies locally. Will download (few KBs)...")
+    private fun downloadFileIfMissing(fileInfo: FileInfo) { // fetches fileInfo's URL into fileInfo.path unless something already exists there
+        if (!Files.exists(fileInfo.path)) { // any existing file, even an empty one, counts as present and skips the download
+            println("Downloading ${fileInfo.description} for ${fileInfo.language} as files don't exist locally.")
 
-            "$dataUrl/wordfreq/${filename}".saveTo(path)
+            fileInfo.toUrl().saveTo(fileInfo.path) // saveTo creates the parent directory before writing
 
-            println("Download done! ${language.name} (${size.name}) word frequencies located at ${path.toAbsolutePath()}")
+            println("Download completed! ${fileInfo.language} ${fileInfo.description} located at ${fileInfo.path.toAbsolutePath()}")
         }
-
-        return path
     }
 
     fun getWordEmbeddings(language: LanguageSupport): Path {
-        val filename = "cc.${language.name}.300.vec"
-        val path = embeddingPath.resolve(filename)
-        if (!Files.exists(path)) {
-            path.parent.createDirectories()
-            println("Language ${language.name} does not have word embeddings locally. Will download (could be GBs)...")
-            val url = "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/$filename.gz"
+        // Returns the local path of the fastText embedding file for `language`. When the file is
+        // missing, the gzip archive is downloaded to a temp file and decompressed into place first.
+        val fileInfo = UrlProvider.fastText(language)
+        if (!Files.exists(fileInfo.path)) {
+            println("Downloading ${fileInfo.description} for ${fileInfo.language} as files don't exist locally.")
             val tmpPath = Files.createTempFile("tmp", ".gz")
-
-            url.saveTo(tmpPath)
-            Files.newOutputStream(path).use { out ->
+            // Files.createTempFile has already created tmpPath, so downloadFileIfMissing would see an
+            // existing file and skip the download. Fetch straight into the temp file instead.
+            fileInfo.toUrl().saveTo(tmpPath)
+            // saveTo only creates the temp file's parent directory; the target directory must exist too.
+            Files.createDirectories(fileInfo.path.parent)
+            Files.newOutputStream(fileInfo.path).use { out ->
                 CompressionUtil.gunzip(tmpPath).use { input -> input.copyTo(out) }
             }
             Files.deleteIfExists(tmpPath)
-
-            println("Download completed! ${language.name} word embeddings located at ${path.toAbsolutePath()}")
+            println("Download completed! ${fileInfo.language} ${fileInfo.description} located at ${fileInfo.path.toAbsolutePath()}")
         }
-        return path
+        return fileInfo.path
     }
 
-    private fun getBpeBaseUrl(language: LanguageSupport, numMerges: Int): String =
-        "$bpeUrl/$language/$language.wiki.bpe.vs$numMerges"
-
     // TODO improve by `data class`
-    fun getBpeVocabModel(language: LanguageSupport, vocabSize: Int = 10_000): Pair<Path, Path> {
-        val baseUrl = getBpeBaseUrl(language, vocabSize)
-        val vocab = getBpeFile("$baseUrl.vocab")
-        val model = getBpeFile("$baseUrl.model")
-
-        return vocab to model
-    }
-
-    private fun getBpeFile(url: String): Path {
-        val filename = url.takeLastWhile { it != '/' }
-        val path = bpePath.resolve(filename)
-        if (!Files.exists(path)) {
-            println("Downloading BPE Model/Vocab/Embedding ($filename)")
-            url.saveTo(path)
-            println("Download completed! $filename located at ${path.parent}")
-        }
+    fun getSentencePieceVocabModel(language: LanguageSupport, vocabSize: Int = 10_000): Pair<Path, Path> { // result is (vocab path, model path), in that order
+        val (vocab, model) = UrlProvider.sentencePiece(language, vocabSize) // one FileInfo per file
+        downloadFileIfMissing(vocab) // each file is fetched only when it is not already on disk
+        downloadFileIfMissing(model)
 
-        return path
+        return vocab.path to model.path
     }
 
-    fun getBpeEmbeddings(language: LanguageSupport, vocabSize: Int = 10_000, dimensions: Int = BpeDefaultEmbeddingDimension): Path {
-        val filePath = bpePath.resolve("$language.wiki.bpe.vs$vocabSize.d$dimensions.w2v.txt")
-
-        return if (Files.exists(filePath)) {
-            filePath
-        } else {
-            val baseUrl = getBpeBaseUrl(language, vocabSize)
-            val embeddingsCompressed = getBpeFile("$baseUrl.d$dimensions.w2v.txt.tar.gz")
-            val tmpPath = CompressionUtil.uncompressTarGz(embeddingsCompressed)
-            embeddingsCompressed.toFile().deleteRecursively()
-            Files.move(tmpPath, filePath)
-            bpePath.resolve("data").toFile().deleteRecursively()
-
-            filePath
+    fun getBpeEmbeddings(
+        language: LanguageSupport,
+        vocabSize: Int = 10_000,
+        dimensions: Int = BpeDefaultEmbeddingDimension
+    ): Path { // returns the local path of the BPE embedding file, fetching and unpacking its .tar.gz archive when the file is missing
+        val fileInfo = UrlProvider.bpeEmbedding(language, vocabSize, dimensions)
+
+        if (!Files.exists(fileInfo.path)) {
+            val tmpPath = fileInfo.path.parent.resolve("${fileInfo.filename}.tar.gz") // archive is stored next to the target file
+            downloadFileIfMissing(fileInfo.copy(path = tmpPath)) // copy(path = ...) makes the helper save to the archive path
+            CompressionUtil.uncompressTarGz(tmpPath) // NOTE(review): any return value is discarded and nothing here moves a file to fileInfo.path; presumably the helper extracts it there, confirm
+            Files.deleteIfExists(tmpPath) // the archive is no longer needed once unpacked
         }
+        return fileInfo.path
     }
 
-    private fun String.saveTo(path: Path) {
+    private fun URL.saveTo(path: Path) {
         Files.createDirectories(path.parent)
 
-        URL(this).openStream().use { input ->
+        openStream().use { input ->
             path.toFile().outputStream().use { output ->
                 input.copyTo(output)
             }

diff --git a/src/main/kotlin/com/londogard/nlp/utils/EjmlExtensions.kt b/src/main/kotlin/com/londogard/nlp/utils/EjmlExtensions.kt
@@ -7,6 +7,10 @@ import org.ejml.dense.row.NormOps_FDRM
 import org.ejml.kotlin.*
 import org.ejml.simple.SimpleMatrix
 
+/**
+ * Custom extensions for EJML simplification in Kotlin. Some optimized for speed.
+ */
+
 /** Basic Retrieval */
 fun SimpleMatrix.getRow(index: Int): SimpleMatrix = extractVector(true, index)
 fun SimpleMatrix.getRows(rows: IntArray): SimpleMatrix =

diff --git a/src/main/kotlin/com/londogard/nlp/utils/FileInfo.kt b/src/main/kotlin/com/londogard/nlp/utils/FileInfo.kt
@@ -0,0 +1,8 @@
+package com.londogard.nlp.utils
+
+import java.net.URL
+import java.nio.file.Path
+
+/**
+ * Describes one downloadable resource.
+ *
+ * @property filename name of the file
+ * @property path local location of the file
+ * @property url address the file is fetched from, kept as text
+ * @property description label used in download log messages
+ * @property language language the resource belongs to
+ */
+data class FileInfo(
+    val filename: String,
+    val path: Path,
+    val url: String,
+    val description: String,
+    val language: LanguageSupport
+) {
+    /** Parses [url] into a [URL]; the [URL] constructor throws `MalformedURLException` for an invalid address. */
+    fun toUrl(): URL {
+        return URL(url)
+    }
+}