diff --git a/build.gradle.kts b/build.gradle.kts index cf5ed556..ac600654 100644 --- a/build.gradle.kts +++ b/build.gradle.kts @@ -2,14 +2,13 @@ import org.jetbrains.kotlin.gradle.tasks.KotlinCompile plugins { `maven-publish` - id("idea") kotlin("jvm") version "1.4.32" kotlin("plugin.serialization") version "1.4.32" id("org.jetbrains.dokka") version "1.4.30" } group = "com.londogard" -version = "1.0-beta" +version = "1.0" repositories { mavenCentral() @@ -27,26 +26,24 @@ dependencies { implementation("org.ejml:ejml-simple:0.40") implementation("org.ejml:ejml-kotlin:0.40") - // implementation("com.github.levyfan:sentencepiece-jni:v0.0.2") implementation("ai.djl.sentencepiece:sentencepiece:0.10.0") implementation("com.github.rholder:snowball-stemmer:1.3.0.581.1") - - // https://mvnrepository.com/artifact/org.apache.commons/commons-compress implementation("org.apache.commons:commons-compress:1.20") - implementation("org.codehaus.plexus:plexus-archiver:4.2.4") - testImplementation("org.amshove.kluent:kluent:$kluentVersion") testImplementation("org.jetbrains.kotlin:kotlin-test:1.4.32") testImplementation(kotlin("test-junit")) + implementation(kotlin("stdlib-jdk8")) } tasks.test { useJUnit() } -tasks.withType { - kotlinOptions.jvmTarget = "1.8" +tasks.withType().configureEach { + kotlinOptions { + useIR = true + } } publishing { @@ -65,4 +62,9 @@ publishing { from(components["java"]) } } +} + +val compileTestKotlin: KotlinCompile by tasks +compileTestKotlin.kotlinOptions { + jvmTarget = "1.8" } \ No newline at end of file diff --git a/src/main/kotlin/com/londogard/nlp/embeddings/BpeEmbeddings.kt b/src/main/kotlin/com/londogard/nlp/embeddings/BpeEmbeddings.kt index 59f29343..b4c81f5a 100644 --- a/src/main/kotlin/com/londogard/nlp/embeddings/BpeEmbeddings.kt +++ b/src/main/kotlin/com/londogard/nlp/embeddings/BpeEmbeddings.kt @@ -23,10 +23,14 @@ class BpeEmbeddings( ?.avgNorm() } + override fun contains(word: String): Boolean { + return tokenizer.split(word).all(embeddings::contains) + } + fun subwordVector(subword: String): SimpleMatrix? = embeddings[subword] companion object { - fun toTokenizer(filePath: Path): Tokenizer { + @JvmStatic fun toTokenizer(filePath: Path): Tokenizer { val rawNameTokens = filePath.fileName.toString().split('.') val languageSupport = LanguageSupport.valueOf(rawNameTokens.first()) diff --git a/src/main/kotlin/com/londogard/nlp/embeddings/EmbeddingLoader.kt b/src/main/kotlin/com/londogard/nlp/embeddings/EmbeddingLoader.kt index 49cadc95..c0fd19e4 100644 --- a/src/main/kotlin/com/londogard/nlp/embeddings/EmbeddingLoader.kt +++ b/src/main/kotlin/com/londogard/nlp/embeddings/EmbeddingLoader.kt @@ -5,7 +5,6 @@ import com.londogard.nlp.utils.LanguageSupport import com.londogard.nlp.utils.useLines import org.ejml.simple.SimpleMatrix import java.nio.file.Path -import kotlin.io.path.bufferedReader import kotlin.math.min object EmbeddingLoader { @@ -24,7 +23,13 @@ object EmbeddingLoader { } } - // TODO inline fun fromUrl(url: String): Map = TODO("") + inline fun fromFile(path: Path): T { + return when { + T::class == LightWordEmbeddings::class -> LightWordEmbeddings(path) as T + T::class == BpeEmbeddings::class -> BpeEmbeddings(path) as T + else -> WordEmbeddings(path) as T + } + } internal fun fromFile(path: Path, delimiter: Char, @@ -47,9 +52,4 @@ object EmbeddingLoader { } .toMap(LinkedHashMap(numLinesToUse)) // optimization by creating the full map directly } -} - -fun main() { - val embeddings = EmbeddingLoader.fromLanguageOrNull(LanguageSupport.sv) - println(embeddings?.vector("Hej")) } \ No newline at end of file diff --git a/src/main/kotlin/com/londogard/nlp/embeddings/WordEmbeddings.kt b/src/main/kotlin/com/londogard/nlp/embeddings/WordEmbeddings.kt index fe749ce9..f553a09a 100644 --- a/src/main/kotlin/com/londogard/nlp/embeddings/WordEmbeddings.kt +++ b/src/main/kotlin/com/londogard/nlp/embeddings/WordEmbeddings.kt @@ -89,7 +89,7 @@ class WordEmbeddings( /** Pretty print the list of words and their associated scores. * @param words List of (word, score) pairs to be printed. */ - fun pprint(words: List>) { + @JvmStatic fun pprint(words: List>) { println("\n%50s${" ".repeat(7)}Cosine distance\n${"-".repeat(72)}".format("Word")) println(words.joinToString("\n") { (word, dist) -> "%50s${" ".repeat(7)}%15f".format(word, dist) }) } diff --git a/src/main/kotlin/com/londogard/nlp/embeddings/sentence/AverageSentenceEmbeddings.kt b/src/main/kotlin/com/londogard/nlp/embeddings/sentence/AverageSentenceEmbeddings.kt index e8f8d4bb..68912a2d 100644 --- a/src/main/kotlin/com/londogard/nlp/embeddings/sentence/AverageSentenceEmbeddings.kt +++ b/src/main/kotlin/com/londogard/nlp/embeddings/sentence/AverageSentenceEmbeddings.kt @@ -1,6 +1,7 @@ package com.londogard.nlp.embeddings.sentence import com.londogard.nlp.embeddings.Embeddings +import com.londogard.nlp.utils.avgNorm import com.londogard.nlp.utils.normalize import org.ejml.simple.SimpleMatrix @@ -11,7 +12,6 @@ class AverageSentenceEmbeddings(override val tokenEmbeddings: Embeddings): Sente override fun getSentenceEmbedding(sentence: List): SimpleMatrix { return tokenEmbeddings .traverseVectors(sentence) - .reduce { acc, simpleMatrix -> acc + simpleMatrix } - .normalize() + .avgNorm() } } \ No newline at end of file diff --git a/src/main/kotlin/com/londogard/nlp/embeddings/sentence/USifSentenceEmbeddings.kt b/src/main/kotlin/com/londogard/nlp/embeddings/sentence/USifSentenceEmbeddings.kt index 8f283065..19caf76a 100644 --- a/src/main/kotlin/com/londogard/nlp/embeddings/sentence/USifSentenceEmbeddings.kt +++ b/src/main/kotlin/com/londogard/nlp/embeddings/sentence/USifSentenceEmbeddings.kt @@ -12,7 +12,7 @@ import kotlin.math.pow class USifSentenceEmbeddings( override val tokenEmbeddings: Embeddings, private val wordProb: Map, - randomWalkLength: Int, // = n, ~11 + randomWalkLength: Int = 11, // = n, ~11 private val numCommonDiscourseVector: Int = 5 // = m, 0 should work. In practise max 5. ) : SentenceEmbeddings { private val vocabSize = wordProb.size.toFloat() diff --git a/src/main/kotlin/com/londogard/nlp/stemmer/Stemmer.kt b/src/main/kotlin/com/londogard/nlp/stemmer/Stemmer.kt index 54cc18ba..e104cc53 100644 --- a/src/main/kotlin/com/londogard/nlp/stemmer/Stemmer.kt +++ b/src/main/kotlin/com/londogard/nlp/stemmer/Stemmer.kt @@ -37,7 +37,7 @@ class Stemmer(language: LanguageSupport) { var cache: Pair? = null // Default to PorterStemmer if not supported! - fun stem(word: String, language: LanguageSupport): String { + @JvmStatic fun stem(word: String, language: LanguageSupport): String { val cachedStemmer = cache return when (cachedStemmer?.first) { diff --git a/src/main/kotlin/com/londogard/nlp/stopwords/Stopwords.kt b/src/main/kotlin/com/londogard/nlp/stopwords/Stopwords.kt index 5ef38ef3..c50cc785 100644 --- a/src/main/kotlin/com/londogard/nlp/stopwords/Stopwords.kt +++ b/src/main/kotlin/com/londogard/nlp/stopwords/Stopwords.kt @@ -15,6 +15,7 @@ object Stopwords { fun isStopword(word: String, language: LanguageSupport): Boolean = stopwordsOrNull(language)?.contains(word) == true + @Throws(IllegalArgumentException::class) fun stopwords(language: LanguageSupport): Set = stopwordsOrNull(language) ?: throw IllegalArgumentException("There exists not stopwords for language ${language.name}. Please try again with one of the supported languages.") diff --git a/src/main/kotlin/com/londogard/nlp/structures/trie/Trie.kt b/src/main/kotlin/com/londogard/nlp/structures/trie/Trie.kt index 047e594c..31eef4c7 100644 --- a/src/main/kotlin/com/londogard/nlp/structures/trie/Trie.kt +++ b/src/main/kotlin/com/londogard/nlp/structures/trie/Trie.kt @@ -64,20 +64,4 @@ fun findFirstMerger(trie: TrieNode, string: String): String? { .firstOrNull() } } -} - -fun main() { - val vocab = WordFrequencies.getAllWordFrequenciesOrNull(LanguageSupport.sv)?.toVocab() ?: emptyMap() - println(vocab.entries.sortedBy { it.value }.reversed().take(5)) - val trie = Trie(vocab) - println(findFirstMerger(trie.rootNode.childNodes.entries.first().value, "")) - - // could use foldRight (goes from other end..!) - // val reverseTrie = Trie(vocab.mapKeys { (key,_) -> key.reversed() }) - // println(reverseTrie.rootNode.childNodes.map { it.key to it.value.count }) - - println(trie.rootNode.childNodes.getValue('ä').childNodes.map { it.key to it.value.count }) - println(trie.rootNode.childNodes.map { it.key to it.value.count }) - println(trie.rootNode.char) - println(trie.rootNode.count) } \ No newline at end of file diff --git a/src/main/kotlin/com/londogard/nlp/tokenizer/SentencePieceTokenizer.kt b/src/main/kotlin/com/londogard/nlp/tokenizer/SentencePieceTokenizer.kt index 496d1881..37af1a85 100644 --- a/src/main/kotlin/com/londogard/nlp/tokenizer/SentencePieceTokenizer.kt +++ b/src/main/kotlin/com/londogard/nlp/tokenizer/SentencePieceTokenizer.kt @@ -17,12 +17,12 @@ class SentencePieceTokenizer(modelPath: Path, vocabPath: Path? = null): Tokenize override fun split(text: String): List = sentencePieceTokenizer.tokenize(text) companion object { - fun fromLanguageSupportOrNull(languageSupport: LanguageSupport): SentencePieceTokenizer? = + const val beginningOfWord: Char = '▁' + @JvmStatic fun fromLanguageSupportOrNull(languageSupport: LanguageSupport): SentencePieceTokenizer? = fromLanguageSupportAndSizeOrNull(languageSupport, VocabSize.v10_000) - - fun fromLanguageSupportAndSizeOrNull(languageSupport: LanguageSupport, vocabSize: VocabSize) = + @JvmStatic fun fromLanguageSupportAndSizeOrNull(languageSupport: LanguageSupport, vocabSize: VocabSize) = if (languageSupport.hasSentencePiece()) { - val (vocab, model) = DownloadHelper.getBpeVocabModel(languageSupport, vocabSize.size) + val (vocab, model) = DownloadHelper.getSentencePieceVocabModel(languageSupport, vocabSize.size) SentencePieceTokenizer(model, vocab) } else null } diff --git a/src/main/kotlin/com/londogard/nlp/utils/CompressionUtil.kt b/src/main/kotlin/com/londogard/nlp/utils/CompressionUtil.kt index 98930f37..56093611 100644 --- a/src/main/kotlin/com/londogard/nlp/utils/CompressionUtil.kt +++ b/src/main/kotlin/com/londogard/nlp/utils/CompressionUtil.kt @@ -6,12 +6,12 @@ import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream import java.io.BufferedInputStream import java.io.IOException import java.io.InputStream -import java.io.OutputStream import java.nio.file.Files import java.nio.file.Path import java.nio.file.StandardCopyOption import java.util.zip.GZIPInputStream +/** Simplified usage of apache compress through Object functions. */ object CompressionUtil { fun gunzip(path: Path): InputStream = path.toFile() diff --git a/src/main/kotlin/com/londogard/nlp/utils/CustomExtensions.kt b/src/main/kotlin/com/londogard/nlp/utils/CustomExtensions.kt index 1ff97c0d..6e5d86e6 100644 --- a/src/main/kotlin/com/londogard/nlp/utils/CustomExtensions.kt +++ b/src/main/kotlin/com/londogard/nlp/utils/CustomExtensions.kt @@ -5,6 +5,8 @@ import java.nio.file.Files import java.nio.file.Path import java.nio.file.attribute.FileAttribute +/** Custom extensions for Path taken from Kotlin EXPERIMENTAL. */ + // Taken from Kotlin stdlib (EXPERIMENTAL) internal inline fun Path.readLines(charset: Charset = Charsets.UTF_8): List = Files.readAllLines(this, charset) diff --git a/src/main/kotlin/com/londogard/nlp/utils/DownloadHelper.kt b/src/main/kotlin/com/londogard/nlp/utils/DownloadHelper.kt index 22aea6c2..3f795901 100644 --- a/src/main/kotlin/com/londogard/nlp/utils/DownloadHelper.kt +++ b/src/main/kotlin/com/londogard/nlp/utils/DownloadHelper.kt @@ -1,115 +1,79 @@ package com.londogard.nlp.utils import com.londogard.nlp.embeddings.EmbeddingLoader.BpeDefaultEmbeddingDimension -import com.londogard.nlp.tokenizer.toVocabSize import com.londogard.nlp.wordfreq.WordFrequencySize import java.net.URL import java.nio.file.Files import java.nio.file.Path -import java.nio.file.Paths @PublishedApi internal object DownloadHelper { - private val rootPath: Path = Paths.get(System.getProperty("user.home")).resolve(".londogard") - private const val dataUrl: String = "https://raw.githubusercontent.com/londogard/londogard-nlp-toolkit/main/data" - private const val bpeUrl: String = "https://nlp.h-its.org/bpemb/" - private val stopwordPath: Path = rootPath.resolve("stopwords") - private val wordFrequencyPath: Path = rootPath.resolve("wordfreq") - private val embeddingPath: Path = rootPath.resolve("embeddings") - private val bpePath: Path = rootPath.resolve("bpe") - fun getStopWords(language: LanguageSupport): Path { - val path = stopwordPath.resolve(language.name) - if (!Files.exists(path)) { - println("Language ${language.name} does not have stopwords locally. Will download (few KBs)...") + val fileInfo = UrlProvider.stopwords(language) + downloadFileIfMissing(fileInfo) - "$dataUrl/stopwords/${language.name}".saveTo(path) + return fileInfo.path + } - println("Download done! ${language.name} stopwords located at ${path.toAbsolutePath()}") - } + fun getWordFrequencies(language: LanguageSupport, size: WordFrequencySize = WordFrequencySize.Smallest): Path { + val fileInfo = UrlProvider.wordfreq(language, size) + downloadFileIfMissing(fileInfo) - return path + return fileInfo.path } - fun getWordFrequencies(language: LanguageSupport, size: WordFrequencySize = WordFrequencySize.Smallest): Path { - val filename = size.toFileName(language) - val path = wordFrequencyPath.resolve(filename) - if (!Files.exists(path)) { - println("Language ${language.name} does not have (${size.name}) word frequencies locally. Will download (few KBs)...") + private fun downloadFileIfMissing(fileInfo: FileInfo) { + if (!Files.exists(fileInfo.path)) { + println("Downloading ${fileInfo.description} for ${fileInfo.language} as files don't exist locally.") - "$dataUrl/wordfreq/${filename}".saveTo(path) + fileInfo.toUrl().saveTo(fileInfo.path) - println("Download done! ${language.name} (${size.name}) word frequencies located at ${path.toAbsolutePath()}") + println("Download completed! ${fileInfo.language} ${fileInfo.description} located at ${fileInfo.path.toAbsolutePath()}") } - - return path } fun getWordEmbeddings(language: LanguageSupport): Path { - val filename = "cc.${language.name}.300.vec" - val path = embeddingPath.resolve(filename) - if (!Files.exists(path)) { - path.parent.createDirectories() - println("Language ${language.name} does not have word embeddings locally. Will download (could be GBs)...") - val url = "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/$filename.gz" + val fileInfo = UrlProvider.fastText(language) + if (!Files.exists(fileInfo.path)) { val tmpPath = Files.createTempFile("tmp", ".gz") - - url.saveTo(tmpPath) - Files.newOutputStream(path).use { out -> + downloadFileIfMissing(fileInfo.copy(path = tmpPath)) + Files.newOutputStream(fileInfo.path).use { out -> CompressionUtil.gunzip(tmpPath).use { input -> input.copyTo(out) } } Files.deleteIfExists(tmpPath) - - println("Download completed! ${language.name} word embeddings located at ${path.toAbsolutePath()}") } - return path + return fileInfo.path } - private fun getBpeBaseUrl(language: LanguageSupport, numMerges: Int): String = - "$bpeUrl/$language/$language.wiki.bpe.vs$numMerges" - // TODO improve by `data class` - fun getBpeVocabModel(language: LanguageSupport, vocabSize: Int = 10_000): Pair { - val baseUrl = getBpeBaseUrl(language, vocabSize) - val vocab = getBpeFile("$baseUrl.vocab") - val model = getBpeFile("$baseUrl.model") - - return vocab to model - } - - private fun getBpeFile(url: String): Path { - val filename = url.takeLastWhile { it != '/' } - val path = bpePath.resolve(filename) - if (!Files.exists(path)) { - println("Downloading BPE Model/Vocab/Embedding ($filename)") - url.saveTo(path) - println("Download completed! $filename located at ${path.parent}") - } + fun getSentencePieceVocabModel(language: LanguageSupport, vocabSize: Int = 10_000): Pair { + val (vocab, model) = UrlProvider.sentencePiece(language, vocabSize) + downloadFileIfMissing(vocab) + downloadFileIfMissing(model) - return path + return vocab.path to model.path } - fun getBpeEmbeddings(language: LanguageSupport, vocabSize: Int = 10_000, dimensions: Int = BpeDefaultEmbeddingDimension): Path { - val filePath = bpePath.resolve("$language.wiki.bpe.vs$vocabSize.d$dimensions.w2v.txt") - - return if (Files.exists(filePath)) { - filePath - } else { - val baseUrl = getBpeBaseUrl(language, vocabSize) - val embeddingsCompressed = getBpeFile("$baseUrl.d$dimensions.w2v.txt.tar.gz") - val tmpPath = CompressionUtil.uncompressTarGz(embeddingsCompressed) - embeddingsCompressed.toFile().deleteRecursively() - Files.move(tmpPath, filePath) - bpePath.resolve("data").toFile().deleteRecursively() - - filePath + fun getBpeEmbeddings( + language: LanguageSupport, + vocabSize: Int = 10_000, + dimensions: Int = BpeDefaultEmbeddingDimension + ): Path { + val fileInfo = UrlProvider.bpeEmbedding(language, vocabSize, dimensions) + + if (!Files.exists(fileInfo.path)) { + val tmpPath = fileInfo.path.parent.resolve("${fileInfo.filename}.tar.gz") + downloadFileIfMissing(fileInfo.copy(path = tmpPath)) + CompressionUtil.uncompressTarGz(tmpPath) + Files.deleteIfExists(tmpPath) } + return fileInfo.path } - private fun String.saveTo(path: Path) { + private fun URL.saveTo(path: Path) { Files.createDirectories(path.parent) - URL(this).openStream().use { input -> + openStream().use { input -> path.toFile().outputStream().use { output -> input.copyTo(output) } diff --git a/src/main/kotlin/com/londogard/nlp/utils/EjmlExtensions.kt b/src/main/kotlin/com/londogard/nlp/utils/EjmlExtensions.kt index dfc00577..5f5b70bd 100644 --- a/src/main/kotlin/com/londogard/nlp/utils/EjmlExtensions.kt +++ b/src/main/kotlin/com/londogard/nlp/utils/EjmlExtensions.kt @@ -7,6 +7,10 @@ import org.ejml.dense.row.NormOps_FDRM import org.ejml.kotlin.* import org.ejml.simple.SimpleMatrix +/** + * Custom extensions for EJML simplification in Kotlin. Some optimized for speed. + */ + /** Basic Retrieval */ fun SimpleMatrix.getRow(index: Int): SimpleMatrix = extractVector(true, index) fun SimpleMatrix.getRows(rows: IntArray): SimpleMatrix = diff --git a/src/main/kotlin/com/londogard/nlp/utils/FileInfo.kt b/src/main/kotlin/com/londogard/nlp/utils/FileInfo.kt new file mode 100644 index 00000000..9090b8f2 --- /dev/null +++ b/src/main/kotlin/com/londogard/nlp/utils/FileInfo.kt @@ -0,0 +1,8 @@ +package com.londogard.nlp.utils + +import java.net.URL +import java.nio.file.Path + +data class FileInfo(val filename: String, val path: Path, val url: String, val description: String, val language: LanguageSupport) { + fun toUrl(): URL = URL(url) +} diff --git a/src/main/kotlin/com/londogard/nlp/utils/LanguageSupport.kt b/src/main/kotlin/com/londogard/nlp/utils/LanguageSupport.kt index e2a45f9d..aa675278 100644 --- a/src/main/kotlin/com/londogard/nlp/utils/LanguageSupport.kt +++ b/src/main/kotlin/com/londogard/nlp/utils/LanguageSupport.kt @@ -3,7 +3,9 @@ package com.londogard.nlp.utils /** * All languages and their support. * For conversion from ISO-code, see https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes - * 'nb' = Norwegian Bokmål. But you can call 'no' too. + * + * 'nb' = Norwegian Bokmål. Calling 'no' is the recommended way as that supports + * most and automatically cast to 'nb' if required. */ enum class LanguageSupport { ab, ace, ady, af, ak, als, am, an, ang, ar, arc, arz, `as`, ast, atj, av, ay, az, azb, @@ -20,14 +22,14 @@ enum class LanguageSupport { ta, tcy, te, tet, tg, th, ti, tk, tl, tn, to, tpi, tr, ts, tt, tum, tw, ty, tyv, udm, ug, uk, ur, uz, ve, vec, vep, vi, vls, vo, wa, war, wo, wuu, xal, xh, xmf, yi, yo, za, zea, zh, zu; - // Supported through SnowballStemmer (http://snowball.tartarus.org/) + /** Validates if Stemmer is supported for LanguageSupport. Support via [SnowballStemmer](http://snowball.tartarus.org/) */ fun hasStemmer(): Boolean = when (this) { sv, nl, en, fi, fr, de, hu, it, no, pt, ro, ru, es, tr -> true else -> false } - // Supported through FastText vectors (https://fasttext.cc/docs/en/crawl-vectors.html) - fun hasWordEmbeddings(): Boolean = when (this) { // TODO add fastText vector extraction + /** Validates if WordEmbedding (fastText) is downloadable for LanguageSupport. Support via [fastText](https://fasttext.cc/docs/en/crawl-vectors.html) */ + fun hasWordEmbeddings(): Boolean = when (this) { en, ky, xmf, mwl, tt, vec, ml, pfl, ro, war, tk, mhr, sc, am, cv, `as`, nn, vo, az, ia, th, ka, gl, sco, co, mt, rm, bar, zh, pt, kk, fy, pms, mzn, ba, cy, li, et, fa, bg, sl, ast, `is`, ja, de, hif, nds, bcl, so, ceb, @@ -41,14 +43,14 @@ enum class LanguageSupport { else -> false } - // Supported through NLTKs stopword lists (https://www.nltk.org/) + /** Validates if StopWord is supported for LanguageSupport. Support via NLTKs lists, [NLTK](https://www.nltk.org/) */ fun hasStopWordSupport(): Boolean = when (this) { ar, az, da, de, el, en, es, fi, fr, hu, id, it, kk, ne, nl, no, pt, ro, ru, sl, sv, tg, tr -> true else -> false } - // Supported through wordfreq.py datasets (https://pypi.org/project/wordfreq/) + /** Validates if WordFrequencies is supported for LanguageSupport. Support via [wordfreq.py](https://pypi.org/project/wordfreq/) amazing files. */ fun hasWordFrequencySupport(): Boolean = when (this) { ar, cs, de, en, es, fi, fr, it, ja, nl, pl, uk, pt, ru, zh, bg, bn, ca, da, el, fa, he, hi, ba, hr, rs, me, @@ -56,13 +58,14 @@ enum class LanguageSupport { else -> false } + /** Returns largestWordFrequency or null for LanguageSupport */ fun largestWordFrequency(): String? = when (this) { ar, cs, de, en, es, fi, fr, it, ja, nl, pl, pt, ru, zh -> "large" bg, bn, ca, da, el, fa, he, hi, hu, id, ko, lv, mk, ms, nb, no, ro, sh, sv, tr, uk,ba, hr, rs, me -> "small" else -> null } - // Download custom model/vocab from here: https://nlp.h-its.org/bpemb/ or create your own. + /** Validates if SentencePiece exists pretrained for LanguageSupport. Support via [BPEmb](https://nlp.h-its.org/bpemb/) which are trained on Wikipedia.org. */ fun hasSentencePiece(): Boolean = when (this) { nb, nah, bh, eml -> false else -> true diff --git a/src/main/kotlin/com/londogard/nlp/utils/MapExtensions.kt b/src/main/kotlin/com/londogard/nlp/utils/MapExtensions.kt index 2b7ec5c0..4a9da62e 100644 --- a/src/main/kotlin/com/londogard/nlp/utils/MapExtensions.kt +++ b/src/main/kotlin/com/londogard/nlp/utils/MapExtensions.kt @@ -2,6 +2,9 @@ package com.londogard.nlp.utils import kotlin.math.roundToInt +/** + * Custom extension functions for Map(s). + */ object MapExtensions { fun Map.toVocab(): Map { val min = this.values.minOrNull() ?: 1f diff --git a/src/main/kotlin/com/londogard/nlp/utils/UrlProvider.kt b/src/main/kotlin/com/londogard/nlp/utils/UrlProvider.kt new file mode 100644 index 00000000..4d024ece --- /dev/null +++ b/src/main/kotlin/com/londogard/nlp/utils/UrlProvider.kt @@ -0,0 +1,53 @@ +package com.londogard.nlp.utils + +import com.londogard.nlp.embeddings.EmbeddingLoader +import com.londogard.nlp.wordfreq.WordFrequencySize +import java.net.URL +import java.nio.file.Path +import java.nio.file.Paths + +object UrlProvider { + private const val githubDataUrl: String = "https://raw.githubusercontent.com/londogard/londogard-nlp-toolkit/main/data" + private const val bpeUrl: String = "https://nlp.h-its.org/bpemb/" + + private val rootPath: Path = Paths.get(System.getProperty("user.home")).resolve(".londogard") + private val stopwordPath: Path = rootPath.resolve("stopwords") + private val wordFrequencyPath: Path = rootPath.resolve("wordfreq") + private val embeddingPath: Path = rootPath.resolve("embeddings") + private val bpePath: Path = rootPath.resolve("bpe") + + fun fastText(language: LanguageSupport): FileInfo { + val filename = "cc.${language.name}.300.vec" + val url = "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/$filename.gz" + return FileInfo(filename, embeddingPath.resolve(filename), url, "fastText (GB(s))", language) + } + + fun bpeEmbedding(language: LanguageSupport, vocabSize: Int, dimensions: Int): FileInfo { + val filename = "$language.wiki.bpe.vs$vocabSize.d$dimensions.w2v.txt" + val url = "${getBpeBaseUrl(language, vocabSize)}.d$dimensions.w2v.txt.tar.gz" + + return FileInfo(filename, bpePath.resolve(filename), url, "bpemb embeddings (100KB - 45MB)", language) + } + + fun sentencePiece(language: LanguageSupport, vocabSize: Int): Pair { + val baseUrl = getBpeBaseUrl(language, vocabSize) + val baseFileName = baseUrl.takeLastWhile { char -> char != '/' } + val vocabFilename = "$baseFileName.vocab" + val modelFilename = "$baseFileName.model" + val vocab = FileInfo(vocabFilename, bpePath.resolve(vocabFilename), "$baseUrl.vocab", "sentencepiece (bpemb) vocab (<10 KB)", language) + val model = FileInfo(modelFilename, bpePath.resolve(modelFilename), "$baseUrl.model", "sentencepiece (bpemb) model (<4 MB)", language) + + return vocab to model + } + + fun stopwords(language: LanguageSupport): FileInfo = + FileInfo(language.toString(), stopwordPath.resolve(language.toString()),"$githubDataUrl/stopwords/$language", "stopwords (<2 KB)", language) + + fun wordfreq(language: LanguageSupport, size: WordFrequencySize): FileInfo { + val filename = size.toFileName(language) + return FileInfo(filename, wordFrequencyPath.resolve(filename), "$githubDataUrl/wordfreq/$filename", "wordfreq (<3 MB)", language) + } + + private fun getBpeBaseUrl(language: LanguageSupport, vocabSize: Int): String = + "$bpeUrl/$language/$language.wiki.bpe.vs$vocabSize" +} \ No newline at end of file diff --git a/src/main/kotlin/com/londogard/nlp/wordfreq/WordFrequencies.kt b/src/main/kotlin/com/londogard/nlp/wordfreq/WordFrequencies.kt index 15c45720..9a9182f5 100644 --- a/src/main/kotlin/com/londogard/nlp/wordfreq/WordFrequencies.kt +++ b/src/main/kotlin/com/londogard/nlp/wordfreq/WordFrequencies.kt @@ -5,7 +5,6 @@ import com.londogard.nlp.utils.DownloadHelper import com.londogard.nlp.utils.LanguageSupport import java.io.InputStream import java.nio.file.Path -import java.util.zip.GZIPInputStream import kotlin.math.log10 import kotlin.math.pow diff --git a/src/main/kotlin/com/londogard/nlp/wordfreq/WordFrequencySize.kt b/src/main/kotlin/com/londogard/nlp/wordfreq/WordFrequencySize.kt index 9d110d40..9322bba9 100644 --- a/src/main/kotlin/com/londogard/nlp/wordfreq/WordFrequencySize.kt +++ b/src/main/kotlin/com/londogard/nlp/wordfreq/WordFrequencySize.kt @@ -7,7 +7,7 @@ enum class WordFrequencySize { Largest, Smallest; private fun stringify(languageSupport: LanguageSupport): String = when(languageSupport) { - nb -> no.toString() + no -> nb.toString() ba, hr, rs, cs, me -> sh.toString() else -> languageSupport.toString() } diff --git a/src/test/kotlin/com/londogard/nlp/CompressionUtilTests.kt b/src/test/kotlin/com/londogard/nlp/CompressionUtilTests.kt new file mode 100644 index 00000000..a92af87a --- /dev/null +++ b/src/test/kotlin/com/londogard/nlp/CompressionUtilTests.kt @@ -0,0 +1,36 @@ +package com.londogard.nlp + +import com.londogard.nlp.utils.CompressionUtil +import org.amshove.kluent.shouldContain +import org.amshove.kluent.shouldHaveSize +import java.nio.file.Files +import java.nio.file.Paths +import kotlin.test.Test + +class CompressionUtilTests { + @Test + fun testGunzip() { + val lines = CompressionUtil.gunzip(Paths.get(javaClass.getResource("/hej.txt.gz")?.path ?: "")) + .bufferedReader() + .readLines() + .filter(String::isNotBlank) + + lines shouldHaveSize 1 + lines shouldContain "hej" + } + + @Test + fun testUncompressTarGz() { + val lines = CompressionUtil.uncompressTarGz( + Paths.get(javaClass.getResource("/hej.tar.gz")?.path ?: ""), + Files.createTempDirectory("tmp") + ) + .toFile() + .readLines() + .filter(String::isNotBlank) + + lines shouldHaveSize 1 + lines shouldContain "hej" + } + +} \ No newline at end of file diff --git a/src/test/kotlin/com/londogard/nlp/EmbeddingTest.kt b/src/test/kotlin/com/londogard/nlp/EmbeddingTest.kt new file mode 100644 index 00000000..d1ed743e --- /dev/null +++ b/src/test/kotlin/com/londogard/nlp/EmbeddingTest.kt @@ -0,0 +1,46 @@ +package com.londogard.nlp + +import com.londogard.nlp.embeddings.BpeEmbeddings +import com.londogard.nlp.embeddings.EmbeddingLoader +import com.londogard.nlp.embeddings.LightWordEmbeddings +import com.londogard.nlp.embeddings.WordEmbeddings +import com.londogard.nlp.utils.LanguageSupport +import org.amshove.kluent.shouldBe +import org.amshove.kluent.shouldNotBe +import java.nio.file.Path +import kotlin.test.Test + +class EmbeddingTest { + @Test + fun testBpeEmb() { + val embeddings = EmbeddingLoader.fromLanguageOrNull(LanguageSupport.sv) + + embeddings shouldNotBe null + + embeddings?.vector("hej") shouldNotBe null + embeddings?.subwordVector("h") shouldNotBe null + } + + @Test + fun testLightWordEmbeddings() { + val embeddings = LightWordEmbeddings(Path.of(javaClass.getResource("/sv_embeddings_cut.txt")!!.toURI()), maxWordCount = 1) + + embeddings.embeddings.size shouldBe 1 + embeddings.contains("hej") shouldBe true + embeddings.addWords(setOf("då")) + embeddings.embeddings.size shouldBe 1 + embeddings.contains("då") shouldBe true + embeddings.contains("hej") shouldBe false + embeddings.vector("då")?.numCols() shouldBe 3 + } + + @Test + fun testWordEmbeddings() { + val embeddings = WordEmbeddings(Path.of(javaClass.getResource("/sv_embeddings_cut.txt")!!.toURI())) + embeddings.embeddings.size shouldBe 2 + embeddings.contains("hej") shouldBe true + embeddings.contains("då") shouldBe true + embeddings.contains("Då") shouldBe false + embeddings.vector("då")?.numCols() shouldBe 3 + } +} \ No newline at end of file diff --git a/src/test/kotlin/com/londogard/nlp/SentenceEmbeddingTests.kt b/src/test/kotlin/com/londogard/nlp/SentenceEmbeddingTests.kt new file mode 100644 index 00000000..77150682 --- /dev/null +++ b/src/test/kotlin/com/londogard/nlp/SentenceEmbeddingTests.kt @@ -0,0 +1,41 @@ +package com.londogard.nlp + +import com.londogard.nlp.embeddings.BpeEmbeddings +import com.londogard.nlp.embeddings.EmbeddingLoader +import com.londogard.nlp.embeddings.WordEmbeddings +import com.londogard.nlp.embeddings.sentence.AverageSentenceEmbeddings +import com.londogard.nlp.embeddings.sentence.USifSentenceEmbeddings +import com.londogard.nlp.utils.LanguageSupport +import com.londogard.nlp.utils.avgNorm +import com.londogard.nlp.utils.normalize +import com.londogard.nlp.wordfreq.WordFrequencies +import org.amshove.kluent.shouldBe +import org.amshove.kluent.shouldBeEqualTo +import org.amshove.kluent.shouldNotBeEqualTo +import java.nio.file.Path +import kotlin.test.Test + +class SentenceEmbeddingTests { + @Test + fun testUsifEmbeddings() { + val embeddings = EmbeddingLoader.fromLanguageOrNull(LanguageSupport.sv)!! + val usif = USifSentenceEmbeddings(embeddings, WordFrequencies.getAllWordFrequenciesOrNull(LanguageSupport.sv) ?: emptyMap()) + val avgSentenceEmbedding = AverageSentenceEmbeddings(embeddings) + val embedding = usif.getSentenceEmbedding(listOf("hej", "där", "borta")) + val rawData = embedding.fdrm.data + + rawData[0] shouldNotBeEqualTo rawData[1] + rawData.size shouldBe 50 + + usif.getSentenceEmbedding(listOf("hej", "då")).toString() shouldNotBeEqualTo avgSentenceEmbedding.getSentenceEmbedding(listOf("hej", "då")).toString() + } + + @Test + fun testAvgSentenceEmbeddings() { + val embeddings = WordEmbeddings(Path.of(javaClass.getResource("/sv_embeddings_cut.txt")!!.toURI())) + val avgSentenceEmbeddings = AverageSentenceEmbeddings(embeddings) + + embeddings.traverseVectors(listOf("hej", "då")).avgNorm().toString() shouldBeEqualTo avgSentenceEmbeddings.getSentenceEmbedding(listOf("hej", "då")).toString() + embeddings.vector("hej")?.normalize()?.toString() shouldBeEqualTo avgSentenceEmbeddings.getSentenceEmbedding(listOf("hej")).toString() + } +} \ No newline at end of file diff --git a/src/test/kotlin/com/londogard/nlp/StemmerTests.kt b/src/test/kotlin/com/londogard/nlp/StemmerTests.kt new file mode 100644 index 00000000..df13e03c --- /dev/null +++ b/src/test/kotlin/com/londogard/nlp/StemmerTests.kt @@ -0,0 +1,22 @@ +package com.londogard.nlp + +import com.londogard.nlp.stemmer.Stemmer +import com.londogard.nlp.utils.LanguageSupport +import org.amshove.kluent.shouldBeEqualTo +import kotlin.test.Test + +class StemmerTests { + @Test + fun testStemmer() { + val stemmer = Stemmer(LanguageSupport.sv) + + stemmer.stem("hej") shouldBeEqualTo "hej" + stemmer.stem("katten") shouldBeEqualTo "katt" + } + + @Test + fun testStemmerObject() { + Stemmer.stem("hej", LanguageSupport.sv) shouldBeEqualTo "hej" + Stemmer.stem("katten", LanguageSupport.sv) shouldBeEqualTo "katt" + } +} \ No newline at end of file diff --git a/src/test/kotlin/com/londogard/nlp/StopwordTests.kt b/src/test/kotlin/com/londogard/nlp/StopwordTests.kt new file mode 100644 index 00000000..3503a79d --- /dev/null +++ b/src/test/kotlin/com/londogard/nlp/StopwordTests.kt @@ -0,0 +1,24 @@ +package com.londogard.nlp + +import com.londogard.nlp.stopwords.Stopwords +import com.londogard.nlp.utils.LanguageSupport +import org.amshove.kluent.shouldBe +import org.amshove.kluent.shouldContain +import org.amshove.kluent.shouldNotBe +import kotlin.test.Test + +class StopwordTests { + @Test + fun testFullStopwords() { + val stopwords = Stopwords.stopwordsOrNull(LanguageSupport.tr) + + stopwords shouldNotBe null + stopwords!! shouldContain "acaba" + } + + @Test + fun testStopwords() { + Stopwords.isStopword("acaba", LanguageSupport.tr) shouldBe true + Stopwords.isStopword("d", LanguageSupport.tr) shouldBe false + } +} \ No newline at end of file diff --git a/src/test/kotlin/com/londogard/nlp/TokenizerTests.kt b/src/test/kotlin/com/londogard/nlp/TokenizerTests.kt new file mode 100644 index 00000000..7379637b --- /dev/null +++ b/src/test/kotlin/com/londogard/nlp/TokenizerTests.kt @@ -0,0 +1,35 @@ +package com.londogard.nlp + +import com.londogard.nlp.tokenizer.CharTokenizer +import com.londogard.nlp.tokenizer.SentencePieceTokenizer +import com.londogard.nlp.tokenizer.SimpleTokenizer +import com.londogard.nlp.tokenizer.VocabSize +import com.londogard.nlp.utils.LanguageSupport +import org.amshove.kluent.shouldBeEqualTo +import kotlin.test.Test + +class TokenizerTests { + @Test + fun testCharTokenizer() { + val tokenizer = CharTokenizer() + + tokenizer.split("abc") shouldBeEqualTo listOf("a", "b", "c") + tokenizer.split("a bc") shouldBeEqualTo listOf("a", " ", "b", "c") + } + + @Test + fun testSimpleTokenizer() { + val tokenizer = SimpleTokenizer() + + tokenizer.split("abc") shouldBeEqualTo listOf("abc") + tokenizer.split("a bc") shouldBeEqualTo listOf("a", "bc") + tokenizer.split("and, some") shouldBeEqualTo listOf("and", ",", "some") + } + + @Test + fun testSentencePieceTokenizer() { + val tokenizer = SentencePieceTokenizer.fromLanguageSupportAndSizeOrNull(LanguageSupport.sv, VocabSize.v1000) + + tokenizer?.split("hej där borta?") shouldBeEqualTo listOf("▁h", "e", "j", "▁där", "▁b", "or", "ta", "?") + } +} \ No newline at end of file diff --git a/src/test/kotlin/com/londogard/nlp/UrlProviderTest.kt b/src/test/kotlin/com/londogard/nlp/UrlProviderTest.kt new file mode 100644 index 00000000..f4aaf052 --- /dev/null +++ b/src/test/kotlin/com/londogard/nlp/UrlProviderTest.kt @@ -0,0 +1,56 @@ +package com.londogard.nlp + +import com.londogard.nlp.embeddings.EmbeddingLoader +import com.londogard.nlp.tokenizer.VocabSize +import com.londogard.nlp.utils.FileInfo +import com.londogard.nlp.utils.LanguageSupport +import com.londogard.nlp.utils.UrlProvider +import com.londogard.nlp.wordfreq.WordFrequencySize +import org.amshove.kluent.shouldBe +import java.net.HttpURLConnection +import kotlin.test.Test + +class UrlProviderTest { + @Test + fun testFastText() { + val fileInfo = UrlProvider.fastText(LanguageSupport.sv) + checkUrlWorks(fileInfo) shouldBe true + } + + @Test + fun testSentencePiece() { + val (vocab, model) = UrlProvider.sentencePiece(LanguageSupport.sv, VocabSize.v1000.size) + + checkUrlWorks(vocab) shouldBe true + checkUrlWorks(model) shouldBe true + } + + @Test + fun testBpeEmbedding() { + val fileInfo = UrlProvider.bpeEmbedding(LanguageSupport.sv, VocabSize.v1000.size, EmbeddingLoader.BpeDefaultEmbeddingDimension) + + checkUrlWorks(fileInfo) shouldBe true + } + + @Test + fun testStopwords() { + val fileInfo = UrlProvider.stopwords(LanguageSupport.sv) + + checkUrlWorks(fileInfo) shouldBe true + } + + @Test + fun testWordFreq() { + val fileInfo = UrlProvider.wordfreq(LanguageSupport.sv, WordFrequencySize.Smallest) + + checkUrlWorks(fileInfo) shouldBe true + } + + private fun checkUrlWorks(fileInfo: FileInfo): Boolean { + val url = fileInfo.toUrl() + println(fileInfo.url) + val connection: HttpURLConnection = (url.openConnection() as HttpURLConnection).apply { requestMethod = "HEAD" } + + return connection.responseCode == 200 + } +} \ No newline at end of file diff --git a/src/test/kotlin/com/londogard/nlp/WordFrequencyTests.kt b/src/test/kotlin/com/londogard/nlp/WordFrequencyTests.kt new file mode 100644 index 00000000..b9789fe9 --- /dev/null +++ b/src/test/kotlin/com/londogard/nlp/WordFrequencyTests.kt @@ -0,0 +1,24 @@ +package com.londogard.nlp + +import com.londogard.nlp.utils.LanguageSupport +import com.londogard.nlp.wordfreq.WordFrequencies +import org.amshove.kluent.shouldBe +import org.amshove.kluent.shouldNotBe +import kotlin.test.Test + +class WordFrequencyTests { + @Test fun testFullWordFreq() { + val wordFreqNb = WordFrequencies.getAllWordFrequenciesOrNull(LanguageSupport.nb) + + wordFreqNb shouldNotBe null + wordFreqNb?.containsKey("er") shouldBe true + } + + @Test fun testSingleWordFreq() { + WordFrequencies.wordFrequencyOrNull("er", LanguageSupport.nb) shouldNotBe null + } + + @Test fun testSingleZipf() { + WordFrequencies.zipfFrequencyOrNull("er", LanguageSupport.nb) shouldNotBe null + } +} \ No newline at end of file diff --git a/src/test/resources/hej.txt.gz b/src/test/resources/hej.txt.gz new file mode 100644 index 00000000..86d21900 Binary files /dev/null and b/src/test/resources/hej.txt.gz differ diff --git a/src/test/resources/sv_embeddings_cut.txt b/src/test/resources/sv_embeddings_cut.txt new file mode 100644 index 00000000..64970a2c --- /dev/null +++ b/src/test/resources/sv_embeddings_cut.txt @@ -0,0 +1,3 @@ +2 3 +hej 0 1 1 +då 0 2 2 \ No newline at end of file