Skip to content

Commit

Permalink
feat: Multiclass Logistic Regression (#143)
Browse files Browse the repository at this point in the history
* feat: MultiClass LogisticRegression

* bump: to 1.2.0-BETA2
  • Loading branch information
Lundez authored Aug 31, 2022
1 parent e8571bb commit 955c97a
Show file tree
Hide file tree
Showing 7 changed files with 130 additions and 10 deletions.
2 changes: 1 addition & 1 deletion build.gradle.kts
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ plugins {
}

group = "com.londogard"
version = "1.2.0-BETA"
version = "1.2.0-BETA2"

repositories {
mavenCentral()
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
package com.londogard.nlp.meachinelearning.encoders

import org.jetbrains.kotlinx.multik.ndarray.data.*


/**
 * Encodes 1-D integer class labels into a 2-D representation (e.g. one-hot) and back.
 *
 * Implementations learn any required state in [fit] (e.g. the number of classes)
 * before [transform] or [invert] may be called.
 */
interface Encoder {
    /** Learns encoder state from the label vector [input]. */
    fun fit(input: D1Array<Int>): Unit
    /** Encodes the label vector [input] into a 2-D array; requires a prior [fit]. */
    fun transform(input: D1Array<Int>): D2Array<Int>
    /** Convenience: [fit] on [input], then [transform] the same input. */
    fun fitTransform(input: D1Array<Int>): D2Array<Int> {
        fit(input)
        return transform(input)
    }
    /** Decodes the 2-D encoding [input] back into a 1-D label vector. */
    fun invert(input: D2Array<Int>): D1Array<Int>
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
package com.londogard.nlp.meachinelearning.encoders

import org.jetbrains.kotlinx.multik.api.mk
import org.jetbrains.kotlinx.multik.api.zeros
import org.jetbrains.kotlinx.multik.ndarray.data.*
import org.jetbrains.kotlinx.multik.ndarray.operations.forEachIndexed
import org.jetbrains.kotlinx.multik.ndarray.operations.max
import kotlin.properties.Delegates

/**
 * One-hot encoder for integer class labels in the range [0, max(label)].
 *
 * [fit] learns the number of columns from the largest label seen; [transform]
 * then maps each label `i` to a row with a single 1 at column `i`, and [invert]
 * recovers labels via a row-wise argmax.
 */
class OneHotEncoder : Encoder {
    // Number of one-hot columns = max(label) + 1; must be set by fit() before use.
    private var yMax by Delegates.notNull<Int>()

    override fun fit(input: D1Array<Int>) {
        // An empty input degenerates to a single class (column 0); otherwise
        // labels are assumed to be 0-based, so max + 1 columns are needed.
        yMax = (input.max() ?: 0) + 1
    }

    override fun transform(input: D1Array<Int>): D2Array<Int> {
        val out = mk.zeros<Int>(input.shape[0], yMax)
        input.forEachIndexed { index, label ->
            // Fail fast with a clear message (instead of an opaque index error)
            // when a label is negative or was never seen during fit().
            require(label in 0 until yMax) {
                "Label $label at position $index is outside the fitted range [0, ${yMax - 1}]"
            }
            out[index, label] = 1
        }

        return out
    }

    override fun invert(input: D2Array<Int>): D1Array<Int> {
        return mk.math.argMaxD2(input, 1)
    }
}
Original file line number Diff line number Diff line change
@@ -1,10 +1,14 @@
package com.londogard.nlp.meachinelearning.predictors

import com.londogard.nlp.meachinelearning.predictors.classifiers.AutoOneHotClassifier
import org.jetbrains.kotlinx.multik.ndarray.data.D2
import org.jetbrains.kotlinx.multik.ndarray.data.D2Array
import org.jetbrains.kotlinx.multik.ndarray.data.MultiArray

interface BasePredictor<T: Number> {
/**
 * Common contract for predictors over dense/sparse 2-D float feature matrices.
 *
 * @param T numeric type of the target matrix (e.g. Int one-hot indicators).
 */
interface BasePredictor<T : Number> {
    /** Fits the model on features [X] (samples x features) and 2-D targets [y]. */
    fun fit(X: MultiArray<Float, D2>, y: D2Array<T>)
    /** Predicts 2-D targets for features [X]; requires a prior [fit]. */
    fun predict(X: MultiArray<Float, D2>): D2Array<T>
}

/** Wraps this predictor so plain 1-D integer labels are one-hot encoded/decoded automatically. */
fun <T : BasePredictor<Int>> T.asAutoOneHotClassifier(): AutoOneHotClassifier<T> =
    AutoOneHotClassifier(this)
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
package com.londogard.nlp.meachinelearning.predictors.classifiers

import com.londogard.nlp.meachinelearning.encoders.OneHotEncoder
import com.londogard.nlp.meachinelearning.predictors.BasePredictor
import org.jetbrains.kotlinx.multik.ndarray.data.D1Array
import org.jetbrains.kotlinx.multik.ndarray.data.D2
import org.jetbrains.kotlinx.multik.ndarray.data.MultiArray

/**
 * Adapter that lets a [BasePredictor] of one-hot targets be trained and queried
 * with plain 1-D integer labels: labels are one-hot encoded before fitting, and
 * the predictor's 2-D output is decoded back to labels on prediction.
 *
 * The underlying 2-D [BasePredictor] API remains available through delegation.
 */
class AutoOneHotClassifier<T : BasePredictor<Int>>(val predictor: T) : BasePredictor<Int> by predictor {
    private val oneHotEncoder = OneHotEncoder()

    /** Fits [predictor] on [X] using the one-hot encoding of the raw labels [y]. */
    @JvmName("fitSimple")
    fun fit(X: MultiArray<Float, D2>, y: D1Array<Int>) =
        predictor.fit(X, oneHotEncoder.fitTransform(y))

    /** Predicts raw 1-D integer labels by decoding the predictor's one-hot output. */
    fun predictSimple(X: MultiArray<Float, D2>): D1Array<Int> =
        oneHotEncoder.invert(predictor.predict(X))
}
Original file line number Diff line number Diff line change
Expand Up @@ -13,24 +13,25 @@ import org.jetbrains.kotlinx.multik.ndarray.data.D2
import org.jetbrains.kotlinx.multik.ndarray.data.D2Array
import org.jetbrains.kotlinx.multik.ndarray.data.MultiArray

class LogisticRegression: Classifier {
class LogisticRegression(
val optimizer: GradientDescent = GradientDescent(1000, 0.01f, 1e-6f)
) : Classifier {
private lateinit var weights: D2Array<Float>
private lateinit var losses: List<Float>

    override fun fit(X: MultiArray<Float, D2>, y: D2Array<Int>) {
        // One weight row per class (y is one-hot: samples x classes) and one
        // column per feature; all-zero start is the optimizer's initial point.
        weights = mk.zeros(y.shape[1], X.shape[1])

        // Delegates training to the injected GradientDescent instance;
        // returns the final weights and the per-iteration loss trace.
        val (weightOut, lossesOut) = optimizer.optimize(LogisticLoss(), weights, X, y.asType())

        weights = weightOut
        losses = lossesOut
    }

    override fun predict(X: MultiArray<Float, D2>): D2Array<Int> {
        val proba = predictProba(X.toDense())

        // Binarize each probability at 0.5 over the flat (row-major) buffer,
        // producing a (samples x classes) indicator matrix.
        // NOTE(review): for multiclass this can yield all-zero or multi-hot rows;
        // a per-row argmax may be intended — confirm against OneHotEncoder.invert.
        return mk.d2array(X.shape[0], weights.shape[0]) { i -> if (proba.data[i] < 0.5f) 0 else 1 }
    }

fun predictProba(X: MultiArray<Float, D2>): MultiArray<Float, D2> =
Expand Down
Original file line number Diff line number Diff line change
@@ -1,21 +1,36 @@
package com.londogard.nlp.machinelearning

import com.londogard.nlp.meachinelearning.predictors.asAutoOneHotClassifier
import com.londogard.nlp.meachinelearning.predictors.classifiers.LogisticRegression
import com.londogard.nlp.meachinelearning.predictors.classifiers.NaiveBayes
import com.londogard.nlp.meachinelearning.toDense
import com.londogard.nlp.meachinelearning.vectorizer.TfIdfVectorizer
import com.londogard.nlp.tokenizer.SimpleTokenizer
import org.amshove.kluent.shouldBeEqualTo
import org.jetbrains.kotlinx.multik.api.d1array
import org.jetbrains.kotlinx.multik.api.mk
import org.jetbrains.kotlinx.multik.api.ndarray
import org.jetbrains.kotlinx.multik.api.zeros
import org.jetbrains.kotlinx.multik.ndarray.data.set
import org.jetbrains.kotlinx.multik.ndarray.operations.first
import org.jetbrains.kotlinx.multik.ndarray.operations.map
import org.junit.Test
import kotlin.time.ExperimentalTime
import kotlin.time.measureTime

class ClassifierTest {
    // Shared fixtures: a tiny Swedish corpus with binary labels (1 = Sweden-related).
    val simpleTok = SimpleTokenizer()
    val simpleTexts = listOf(
        "hejsan jag älskar sverige",
        "hej vad bra det är i sverige",
        "jag älskar sverige",
        "norge är ett land i norden",
        "norge norden",
        "norge norden",
        "norge norden"
    )
        .map(simpleTok::split)
    // Targets as a 7x1 column matrix, matching BasePredictor's 2-D contract.
    val y = mk.ndarray(intArrayOf(1, 1, 1, 0, 0, 0, 0), 7, 1)

@Test
fun testLogisticRegression() {
Expand All @@ -29,6 +44,44 @@ class ClassifierTest {
lr.predict(tfidf.transform(listOf(simpleTexts.first()))).first() shouldBeEqualTo 1
}

@Test
fun logisticTest() {
val labelsMap = mapOf(
0 to "Bank Charges",
1 to "Betting",
2 to "Card fees",
3 to "Food",
4 to "Lifestyle",
5 to "Loan",
6 to "Reversal",
7 to "Salary",
8 to "Unknown",
9 to "Utilities & Bills",
10 to "Withdrawal"
)
val reversedLabelMap = labelsMap.asSequence().map { it.value to it.key }.toMap()

val (data, categories) = listOf(
"Vat amount charges" to "Bank Charges",
"Loan payment credit" to "Loan",
"Salary for Aug" to "Salary",
"Payment from betking" to "Betting",
"Purchase from Shoprite" to "Food",
).unzip()
val simpleTok = SimpleTokenizer()
val xData = data.map(simpleTok::split)
val yList = categories.map { category -> reversedLabelMap.getOrDefault(category, 0) }
val y = mk.ndarray(yList)

val tfidf = TfIdfVectorizer<Float>()
val lr = LogisticRegression().asAutoOneHotClassifier()

val transformedData = tfidf.fitTransform(xData)
lr.fit(transformedData, y)

lr.predictSimple(transformedData) shouldBeEqualTo y
}

@Test
fun testNaiveBayes() {
val tfidf = TfIdfVectorizer<Float>()
Expand Down

0 comments on commit 955c97a

Please sign in to comment.