Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Penn tree bank #49

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
package com.londogard.nlp.meachinelearning.datasets.prepare

import org.jetbrains.kotlinx.multik.api.mk
import org.jetbrains.kotlinx.multik.api.ndarray
import org.jetbrains.kotlinx.multik.ndarray.data.D1Array

/**
 * Converts Penn Treebank-style text (one `token<delimiter>tag` pair per line)
 * into integer-indexed data for the sequence classifiers.
 */
object PennTreeBankPreparer {
    /**
     * Integer-indexed dataset produced by [prepare].
     *
     * @property tokenIndexing maps each distinct token to its integer id.
     * @property tagIndexing maps each distinct tag, plus the extra `"BOS"` entry, to its integer id.
     * @property trainDataset pair of (token-id sequences, tag-id sequences).
     */
    data class PennTreeBankDataset(
        val tokenIndexing: Map<String, Int>,
        val tagIndexing: Map<String, Int>,
        val trainDataset: Pair<List<D1Array<Int>>, List<D1Array<Int>>>,
        // val testDataset: Pair<List<D1Array<Int>>, List<D1Array<Int>>>
    ) {
        /** Inverse of [tagIndexing] (id -> tag), built on first access. */
        val reverseTagIndexing by lazy { tagIndexing.entries.associate { it.value to it.key } }

        /** Inverse of [tokenIndexing] (id -> token), built on first access. */
        val reverseTokenIndexing by lazy { tokenIndexing.entries.associate { it.value to it.key } }
    }

    /**
     * Prepares text in the standard format
     *
     *     Oct.    NNP
     *     19      CD
     *     ..
     *
     * into a usable format for our sequence classifiers.
     *
     * Each non-blank line must hold exactly one token and one tag separated by
     * [delimeter]. Blank lines are ignored and both `\n` and `\r\n` line endings
     * are accepted, so a stray empty line or Windows line ending can no longer
     * shift the token/tag pairing. All pairs are emitted as a single sequence.
     *
     * @param text the raw tagged corpus.
     * @param delimeter character separating token and tag on a line; must not be a line-break character.
     * @return the token/tag indices together with the encoded training sequence.
     * @throws IllegalArgumentException if a non-blank line does not contain exactly two fields.
     */
    fun prepare(text: String, delimeter: Char = '\t'): PennTreeBankDataset {
        val (tokens, tags) = text
            .lineSequence()
            .withIndex()
            .filter { it.value.isNotBlank() }
            .map { (index, line) ->
                val fields = line.split(delimeter)
                // Fail loudly instead of silently mis-pairing every following token and tag.
                require(fields.size == 2) {
                    "Line ${index + 1} must contain exactly one token and one tag separated by " +
                        "'$delimeter', but was: '$line'"
                }
                fields[0] to fields[1]
            }
            .toList()
            .unzip()

        val tokenIndices = tokens.toSet().withIndex().associate { it.value to it.index }
        // "BOS" is added so that a begin-of-sentence tag id always exists in the index.
        val tagIndices = (tags + "BOS").toSet().withIndex().associate { it.value to it.index }
        val X = listOf(mk.ndarray(tokens.map(tokenIndices::getValue).toIntArray()))
        val y = listOf(mk.ndarray(tags.map(tagIndices::getValue).toIntArray()))

        return PennTreeBankDataset(tokenIndices, tagIndices, X to y)
    }
}
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
package com.londogard.nlp.meachinelearning.predictors.sequence

import com.londogard.nlp.meachinelearning.datasets.prepare.PennTreeBankPreparer
import org.jetbrains.kotlinx.multik.api.empty
import org.jetbrains.kotlinx.multik.api.mk
import org.jetbrains.kotlinx.multik.api.ndarray
Expand Down Expand Up @@ -131,4 +132,25 @@ class HiddenMarkovModel(
}

}

companion object {
/**
* Simplifies the toolchain
*/
fun fromPennTreebank(
pennTreeBankDataset: PennTreeBankPreparer.PennTreeBankDataset,
alpha: Float = 0.001f,
BegginingOfSentence: Int = pennTreeBankDataset.tagIndexing.getOrDefault("BOS", 0)
): HiddenMarkovModel {
val hmm = HiddenMarkovModel(
pennTreeBankDataset.reverseTagIndexing,
pennTreeBankDataset.reverseTokenIndexing,
alpha,
BegginingOfSentence
)
hmm.fit(pennTreeBankDataset.trainDataset.first, pennTreeBankDataset.trainDataset.second)

return hmm
}
}
}
Original file line number Diff line number Diff line change
@@ -1,9 +1,8 @@
package com.londogard.nlp.machinelearning

import com.londogard.nlp.meachinelearning.datasets.prepare.PennTreeBankPreparer
import com.londogard.nlp.meachinelearning.predictors.sequence.HiddenMarkovModel
import org.amshove.kluent.shouldBeEqualTo
import org.jetbrains.kotlinx.multik.api.mk
import org.jetbrains.kotlinx.multik.api.ndarray
import kotlin.test.Test

class SequenceClassifierTest {
Expand Down Expand Up @@ -34,26 +33,10 @@ Stage NN
in IN
Windy NNP
City NNP"""
val (tokensText, tagsText) = text
.split('\n')
.map {
val (a, b) = it.split('\t')
a to b
}.unzip()
val tokenMap = (tokensText).toSet().withIndex().associate { elem -> elem.value to elem.index }
val tagMap = (tagsText + "BOS").toSet().withIndex().associate { elem -> elem.value to elem.index }
val reversetagMap = tagMap.asIterable().associate { (key, value) -> value to key }
val hmm = HiddenMarkovModel(
tagMap.asIterable().associate { (key, value) -> value to key },
tokenMap.asIterable().associate { (key, value) -> value to key },
BegginingOfSentence = tokenMap.getOrDefault("BOS", 0)
)
val pennTreeBankDataset = PennTreeBankPreparer.prepare(text)
val hmm = HiddenMarkovModel.fromPennTreebank(pennTreeBankDataset = pennTreeBankDataset)

val x = listOf(mk.ndarray(tokensText.mapNotNull(tokenMap::get).toIntArray()))
val y = listOf(mk.ndarray(tagsText.mapNotNull(tagMap::get).toIntArray()))

hmm.fit(x, y)
// predict.map { t -> t.data.map { reversetagMap[it] } } to get the real labels!
hmm.predict(x) shouldBeEqualTo y
// use the reverse map to get the true labels (string) rather than int
hmm.predict(pennTreeBankDataset.trainDataset.first) shouldBeEqualTo pennTreeBankDataset.trainDataset.second
}
}