Skip to content

Commit

Permalink
Added tests
Browse files Browse the repository at this point in the history
  • Loading branch information
Lundez committed Apr 6, 2021
1 parent 78cd483 commit a6e954b
Show file tree
Hide file tree
Showing 30 changed files with 440 additions and 126 deletions.
20 changes: 11 additions & 9 deletions build.gradle.kts
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,13 @@ import org.jetbrains.kotlin.gradle.tasks.KotlinCompile

plugins {
`maven-publish`
id("idea")
kotlin("jvm") version "1.4.32"
kotlin("plugin.serialization") version "1.4.32"
id("org.jetbrains.dokka") version "1.4.30"
}

group = "com.londogard"
version = "1.0-beta"
version = "1.0"

repositories {
mavenCentral()
Expand All @@ -27,26 +26,24 @@ dependencies {
implementation("org.ejml:ejml-simple:0.40")
implementation("org.ejml:ejml-kotlin:0.40")

// implementation("com.github.levyfan:sentencepiece-jni:v0.0.2")
implementation("ai.djl.sentencepiece:sentencepiece:0.10.0")
implementation("com.github.rholder:snowball-stemmer:1.3.0.581.1")

// https://mvnrepository.com/artifact/org.apache.commons/commons-compress
implementation("org.apache.commons:commons-compress:1.20")

implementation("org.codehaus.plexus:plexus-archiver:4.2.4")

testImplementation("org.amshove.kluent:kluent:$kluentVersion")
testImplementation("org.jetbrains.kotlin:kotlin-test:1.4.32")
testImplementation(kotlin("test-junit"))
implementation(kotlin("stdlib-jdk8"))
}

tasks.test {
useJUnit()
}

tasks.withType<KotlinCompile> {
kotlinOptions.jvmTarget = "1.8"
tasks.withType<KotlinCompile>().configureEach {
kotlinOptions {
useIR = true
}
}

publishing {
Expand All @@ -65,4 +62,9 @@ publishing {
from(components["java"])
}
}
}

// Look up the compileTestKotlin task by name and make it emit Java 8 bytecode.
// This block configures only the test compilation task.
val compileTestKotlin: KotlinCompile by tasks
compileTestKotlin.kotlinOptions {
jvmTarget = "1.8"
}
Original file line number Diff line number Diff line change
Expand Up @@ -23,10 +23,14 @@ class BpeEmbeddings(
?.avgNorm()
}

/**
 * Reports whether [word] is fully covered by the embeddings.
 *
 * The word is split by the tokenizer and every resulting piece must be present
 * in the embeddings for the result to be `true`.
 */
override fun contains(word: String): Boolean =
    tokenizer.split(word).all { piece -> embeddings.contains(piece) }

/** Looks up [subword] in the embeddings as-is, without tokenizing it first; the result is nullable. */
fun subwordVector(subword: String): SimpleMatrix? = embeddings[subword]

companion object {
fun toTokenizer(filePath: Path): Tokenizer {
@JvmStatic fun toTokenizer(filePath: Path): Tokenizer {
val rawNameTokens = filePath.fileName.toString().split('.')

val languageSupport = LanguageSupport.valueOf(rawNameTokens.first())
Expand Down
14 changes: 7 additions & 7 deletions src/main/kotlin/com/londogard/nlp/embeddings/EmbeddingLoader.kt
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@ import com.londogard.nlp.utils.LanguageSupport
import com.londogard.nlp.utils.useLines
import org.ejml.simple.SimpleMatrix
import java.nio.file.Path
import kotlin.io.path.bufferedReader
import kotlin.math.min

object EmbeddingLoader {
Expand All @@ -24,7 +23,13 @@ object EmbeddingLoader {
}
}

// TODO inline fun <reified T: Embeddings> fromUrl(url: String): Map<String, SimpleMatrix> = TODO("")
/**
 * Builds the embeddings implementation selected by [T] from the file at [path].
 *
 * [LightWordEmbeddings] and [BpeEmbeddings] are matched by exact class; every other [T]
 * falls back to [WordEmbeddings]. The result is then cast to [T], so requesting a type
 * that the constructed instance does not satisfy fails at that cast.
 */
inline fun <reified T: Embeddings> fromFile(path: Path): T {
    val loaded = when (T::class) {
        LightWordEmbeddings::class -> LightWordEmbeddings(path)
        BpeEmbeddings::class -> BpeEmbeddings(path)
        else -> WordEmbeddings(path)
    }
    return loaded as T
}

internal fun fromFile(path: Path,
delimiter: Char,
Expand All @@ -47,9 +52,4 @@ object EmbeddingLoader {
}
.toMap(LinkedHashMap(numLinesToUse)) // optimization by creating the full map directly
}
}

fun main() {
val embeddings = EmbeddingLoader.fromLanguageOrNull<LightWordEmbeddings>(LanguageSupport.sv)
println(embeddings?.vector("Hej"))
}
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ class WordEmbeddings(
/** Pretty print the list of words and their associated scores.
* @param words List of (word, score) pairs to be printed.
*/
fun pprint(words: List<Pair<String, Double>>) {
@JvmStatic fun pprint(words: List<Pair<String, Double>>) {
println("\n%50s${" ".repeat(7)}Cosine distance\n${"-".repeat(72)}".format("Word"))
println(words.joinToString("\n") { (word, dist) -> "%50s${" ".repeat(7)}%15f".format(word, dist) })
}
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package com.londogard.nlp.embeddings.sentence

import com.londogard.nlp.embeddings.Embeddings
import com.londogard.nlp.utils.avgNorm
import com.londogard.nlp.utils.normalize
import org.ejml.simple.SimpleMatrix

Expand All @@ -11,7 +12,6 @@ class AverageSentenceEmbeddings(override val tokenEmbeddings: Embeddings): Sente
override fun getSentenceEmbedding(sentence: List<String>): SimpleMatrix {
return tokenEmbeddings
.traverseVectors(sentence)
.reduce { acc, simpleMatrix -> acc + simpleMatrix }
.normalize()
.avgNorm()
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ import kotlin.math.pow
class USifSentenceEmbeddings(
override val tokenEmbeddings: Embeddings,
private val wordProb: Map<String, Float>,
randomWalkLength: Int, // = n, ~11
randomWalkLength: Int = 11, // = n, ~11
private val numCommonDiscourseVector: Int = 5 // = m, 0 should work. In practise max 5.
) : SentenceEmbeddings {
private val vocabSize = wordProb.size.toFloat()
Expand Down
2 changes: 1 addition & 1 deletion src/main/kotlin/com/londogard/nlp/stemmer/Stemmer.kt
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ class Stemmer(language: LanguageSupport) {
var cache: Pair<LanguageSupport, Stemmer>? = null

// Default to PorterStemmer if not supported!
fun stem(word: String, language: LanguageSupport): String {
@JvmStatic fun stem(word: String, language: LanguageSupport): String {
val cachedStemmer = cache

return when (cachedStemmer?.first) {
Expand Down
1 change: 1 addition & 0 deletions src/main/kotlin/com/londogard/nlp/stopwords/Stopwords.kt
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ object Stopwords {
/** Tells whether [word] is a stopword in [language]; yields `false` when no stopword set is available for it. */
fun isStopword(word: String, language: LanguageSupport): Boolean {
    val known = stopwordsOrNull(language) ?: return false
    return known.contains(word)
}

/**
 * Returns the stopwords for [language].
 *
 * @throws IllegalArgumentException if no stopwords are available for [language].
 */
@Throws(IllegalArgumentException::class)
fun stopwords(language: LanguageSupport): Set<String> =
    requireNotNull(stopwordsOrNull(language)) {
        "There exists not stopwords for language ${language.name}. Please try again with one of the supported languages."
    }
Expand Down
16 changes: 0 additions & 16 deletions src/main/kotlin/com/londogard/nlp/structures/trie/Trie.kt
Original file line number Diff line number Diff line change
Expand Up @@ -64,20 +64,4 @@ fun findFirstMerger(trie: TrieNode, string: String): String? {
.firstOrNull()
}
}
}

fun main() {
val vocab = WordFrequencies.getAllWordFrequenciesOrNull(LanguageSupport.sv)?.toVocab() ?: emptyMap()
println(vocab.entries.sortedBy { it.value }.reversed().take(5))
val trie = Trie(vocab)
println(findFirstMerger(trie.rootNode.childNodes.entries.first().value, ""))

// could use foldRight (goes from other end..!)
// val reverseTrie = Trie(vocab.mapKeys { (key,_) -> key.reversed() })
// println(reverseTrie.rootNode.childNodes.map { it.key to it.value.count })

println(trie.rootNode.childNodes.getValue('ä').childNodes.map { it.key to it.value.count })
println(trie.rootNode.childNodes.map { it.key to it.value.count })
println(trie.rootNode.char)
println(trie.rootNode.count)
}
Original file line number Diff line number Diff line change
Expand Up @@ -17,12 +17,12 @@ class SentencePieceTokenizer(modelPath: Path, vocabPath: Path? = null): Tokenize
/** Splits [text] into tokens by delegating to the wrapped sentencePieceTokenizer. */
override fun split(text: String): List<String> = sentencePieceTokenizer.tokenize(text)

companion object {
fun fromLanguageSupportOrNull(languageSupport: LanguageSupport): SentencePieceTokenizer? =
const val beginningOfWord: Char = ''
@JvmStatic fun fromLanguageSupportOrNull(languageSupport: LanguageSupport): SentencePieceTokenizer? =
fromLanguageSupportAndSizeOrNull(languageSupport, VocabSize.v10_000)

fun fromLanguageSupportAndSizeOrNull(languageSupport: LanguageSupport, vocabSize: VocabSize) =
@JvmStatic fun fromLanguageSupportAndSizeOrNull(languageSupport: LanguageSupport, vocabSize: VocabSize) =
if (languageSupport.hasSentencePiece()) {
val (vocab, model) = DownloadHelper.getBpeVocabModel(languageSupport, vocabSize.size)
val (vocab, model) = DownloadHelper.getSentencePieceVocabModel(languageSupport, vocabSize.size)
SentencePieceTokenizer(model, vocab)
} else null
}
Expand Down
2 changes: 1 addition & 1 deletion src/main/kotlin/com/londogard/nlp/utils/CompressionUtil.kt
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,12 @@ import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream
import java.io.BufferedInputStream
import java.io.IOException
import java.io.InputStream
import java.io.OutputStream
import java.nio.file.Files
import java.nio.file.Path
import java.nio.file.StandardCopyOption
import java.util.zip.GZIPInputStream

/** Simplified usage of apache compress through Object functions. */
object CompressionUtil {
fun gunzip(path: Path): InputStream =
path.toFile()
Expand Down
2 changes: 2 additions & 0 deletions src/main/kotlin/com/londogard/nlp/utils/CustomExtensions.kt
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@ import java.nio.file.Files
import java.nio.file.Path
import java.nio.file.attribute.FileAttribute

/** Custom extensions for [Path], copied from the experimental part of the Kotlin standard library. */

// Taken from Kotlin stdlib (EXPERIMENTAL)
/** Reads every line of the file at this path into a list, decoding it with [charset] (UTF-8 unless specified). */
internal inline fun Path.readLines(charset: Charset = Charsets.UTF_8): List<String> =
Files.readAllLines(this, charset)
Expand Down
112 changes: 38 additions & 74 deletions src/main/kotlin/com/londogard/nlp/utils/DownloadHelper.kt
Original file line number Diff line number Diff line change
@@ -1,115 +1,79 @@
package com.londogard.nlp.utils

import com.londogard.nlp.embeddings.EmbeddingLoader.BpeDefaultEmbeddingDimension
import com.londogard.nlp.tokenizer.toVocabSize
import com.londogard.nlp.wordfreq.WordFrequencySize
import java.net.URL
import java.nio.file.Files
import java.nio.file.Path
import java.nio.file.Paths

@PublishedApi
internal object DownloadHelper {
private val rootPath: Path = Paths.get(System.getProperty("user.home")).resolve(".londogard")
private const val dataUrl: String = "https://raw.githubusercontent.com/londogard/londogard-nlp-toolkit/main/data"
private const val bpeUrl: String = "https://nlp.h-its.org/bpemb/"
private val stopwordPath: Path = rootPath.resolve("stopwords")
private val wordFrequencyPath: Path = rootPath.resolve("wordfreq")
private val embeddingPath: Path = rootPath.resolve("embeddings")
private val bpePath: Path = rootPath.resolve("bpe")

fun getStopWords(language: LanguageSupport): Path {
val path = stopwordPath.resolve(language.name)
if (!Files.exists(path)) {
println("Language ${language.name} does not have stopwords locally. Will download (few KBs)...")
val fileInfo = UrlProvider.stopwords(language)
downloadFileIfMissing(fileInfo)

"$dataUrl/stopwords/${language.name}".saveTo(path)
return fileInfo.path
}

println("Download done! ${language.name} stopwords located at ${path.toAbsolutePath()}")
}
fun getWordFrequencies(language: LanguageSupport, size: WordFrequencySize = WordFrequencySize.Smallest): Path {
val fileInfo = UrlProvider.wordfreq(language, size)
downloadFileIfMissing(fileInfo)

return path
return fileInfo.path
}

fun getWordFrequencies(language: LanguageSupport, size: WordFrequencySize = WordFrequencySize.Smallest): Path {
val filename = size.toFileName(language)
val path = wordFrequencyPath.resolve(filename)
if (!Files.exists(path)) {
println("Language ${language.name} does not have (${size.name}) word frequencies locally. Will download (few KBs)...")
private fun downloadFileIfMissing(fileInfo: FileInfo) {
if (!Files.exists(fileInfo.path)) {
println("Downloading ${fileInfo.description} for ${fileInfo.language} as files don't exist locally.")

"$dataUrl/wordfreq/${filename}".saveTo(path)
fileInfo.toUrl().saveTo(fileInfo.path)

println("Download done! ${language.name} (${size.name}) word frequencies located at ${path.toAbsolutePath()}")
println("Download completed! ${fileInfo.language} ${fileInfo.description} located at ${fileInfo.path.toAbsolutePath()}")
}

return path
}

fun getWordEmbeddings(language: LanguageSupport): Path {
val filename = "cc.${language.name}.300.vec"
val path = embeddingPath.resolve(filename)
if (!Files.exists(path)) {
path.parent.createDirectories()
println("Language ${language.name} does not have word embeddings locally. Will download (could be GBs)...")
val url = "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/$filename.gz"
val fileInfo = UrlProvider.fastText(language)
if (!Files.exists(fileInfo.path)) {
val tmpPath = Files.createTempFile("tmp", ".gz")

url.saveTo(tmpPath)
Files.newOutputStream(path).use { out ->
downloadFileIfMissing(fileInfo.copy(path = tmpPath))
Files.newOutputStream(fileInfo.path).use { out ->
CompressionUtil.gunzip(tmpPath).use { input -> input.copyTo(out) }
}
Files.deleteIfExists(tmpPath)

println("Download completed! ${language.name} word embeddings located at ${path.toAbsolutePath()}")
}
return path
return fileInfo.path
}

private fun getBpeBaseUrl(language: LanguageSupport, numMerges: Int): String =
"$bpeUrl/$language/$language.wiki.bpe.vs$numMerges"

// TODO improve by `data class`
fun getBpeVocabModel(language: LanguageSupport, vocabSize: Int = 10_000): Pair<Path, Path> {
val baseUrl = getBpeBaseUrl(language, vocabSize)
val vocab = getBpeFile("$baseUrl.vocab")
val model = getBpeFile("$baseUrl.model")

return vocab to model
}

private fun getBpeFile(url: String): Path {
val filename = url.takeLastWhile { it != '/' }
val path = bpePath.resolve(filename)
if (!Files.exists(path)) {
println("Downloading BPE Model/Vocab/Embedding ($filename)")
url.saveTo(path)
println("Download completed! $filename located at ${path.parent}")
}
fun getSentencePieceVocabModel(language: LanguageSupport, vocabSize: Int = 10_000): Pair<Path, Path> {
val (vocab, model) = UrlProvider.sentencePiece(language, vocabSize)
downloadFileIfMissing(vocab)
downloadFileIfMissing(model)

return path
return vocab.path to model.path
}

fun getBpeEmbeddings(language: LanguageSupport, vocabSize: Int = 10_000, dimensions: Int = BpeDefaultEmbeddingDimension): Path {
val filePath = bpePath.resolve("$language.wiki.bpe.vs$vocabSize.d$dimensions.w2v.txt")

return if (Files.exists(filePath)) {
filePath
} else {
val baseUrl = getBpeBaseUrl(language, vocabSize)
val embeddingsCompressed = getBpeFile("$baseUrl.d$dimensions.w2v.txt.tar.gz")
val tmpPath = CompressionUtil.uncompressTarGz(embeddingsCompressed)
embeddingsCompressed.toFile().deleteRecursively()
Files.move(tmpPath, filePath)
bpePath.resolve("data").toFile().deleteRecursively()

filePath
/**
 * Returns the local path of the BPE embeddings for [language], downloading and unpacking
 * the archive first when that path does not exist yet.
 *
 * @param language language whose embeddings are requested.
 * @param vocabSize vocabulary size forwarded to the URL provider.
 * @param dimensions embedding dimensionality forwarded to the URL provider.
 * @return the path taken from the provider's file info.
 */
fun getBpeEmbeddings(
language: LanguageSupport,
vocabSize: Int = 10_000,
dimensions: Int = BpeDefaultEmbeddingDimension
): Path {
val fileInfo = UrlProvider.bpeEmbedding(language, vocabSize, dimensions)

if (!Files.exists(fileInfo.path)) {
// The archive is stored next to the final file, named "<filename>.tar.gz".
val tmpPath = fileInfo.path.parent.resolve("${fileInfo.filename}.tar.gz")
// downloadFileIfMissing only downloads when its target path does not exist.
// NOTE(review): a leftover archive from an interrupted run would be reused as-is — confirm this is acceptable.
downloadFileIfMissing(fileInfo.copy(path = tmpPath))
// NOTE(review): the return value is ignored; this presumes the extracted file ends up at fileInfo.path — confirm.
CompressionUtil.uncompressTarGz(tmpPath)
Files.deleteIfExists(tmpPath)
}
return fileInfo.path
}

private fun String.saveTo(path: Path) {
private fun URL.saveTo(path: Path) {
Files.createDirectories(path.parent)

URL(this).openStream().use { input ->
openStream().use { input ->
path.toFile().outputStream().use { output ->
input.copyTo(output)
}
Expand Down
4 changes: 4 additions & 0 deletions src/main/kotlin/com/londogard/nlp/utils/EjmlExtensions.kt
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,10 @@ import org.ejml.dense.row.NormOps_FDRM
import org.ejml.kotlin.*
import org.ejml.simple.SimpleMatrix

/**
* Custom extensions for EJML simplification in Kotlin. Some optimized for speed.
*/

/** Basic Retrieval */
fun SimpleMatrix.getRow(index: Int): SimpleMatrix = extractVector(true, index)
fun SimpleMatrix.getRows(rows: IntArray): SimpleMatrix =
Expand Down
8 changes: 8 additions & 0 deletions src/main/kotlin/com/londogard/nlp/utils/FileInfo.kt
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
package com.londogard.nlp.utils

import java.net.URL
import java.nio.file.Path

/**
 * Describes a single downloadable resource.
 *
 * @property filename name of the file.
 * @property path local filesystem location associated with the resource.
 * @property url textual form of the remote address, see [toUrl].
 * @property description short human-readable label for the resource.
 * @property language language the resource belongs to.
 */
data class FileInfo(
    val filename: String,
    val path: Path,
    val url: String,
    val description: String,
    val language: LanguageSupport
) {
    /** Converts the textual [url] into a [URL] instance. */
    fun toUrl(): URL {
        return URL(url)
    }
}
Loading

0 comments on commit a6e954b

Please sign in to comment.