Skip to content

Commit

Permalink
Add sorting to Dense vector check (#148)
Browse files: browse the repository at this point in the history
  • Loading branch information
blee1234 authored Apr 25, 2022
1 parent 3cc5cf7 commit 2cedb34
Show file tree
Hide file tree
Showing 3 changed files with 24 additions and 5 deletions.
6 changes: 3 additions & 3 deletions build.sbt
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ val localAndCloudDiffDependencies = Seq(
"org.apache.spark" %% "spark-catalyst" % sparkVersion,
"org.apache.logging.log4j" % "log4j-core" % "2.17.1",
"com.typesafe" % "config" % "1.3.2",
"com.fasterxml.jackson.core" % "jackson-databind" % "2.6.5",
"com.fasterxml.jackson.core" % "jackson-databind" % "2.10.2",
"org.apache.hadoop" % "hadoop-mapreduce-client-core" % "2.7.2",
"org.apache.hadoop" % "hadoop-common" % "2.7.2",
"org.apache.avro" % "avro" % "1.8.2",
Expand All @@ -33,8 +33,8 @@ val localAndCloudCommonDependencies = Seq(
"it.unimi.dsi" % "fastutil" % "8.1.1",
"org.mvel" % "mvel2" % "2.2.7.Final",
"com.fasterxml.jackson.module" %% "jackson-module-scala" % "2.6.7.1",
"com.fasterxml.jackson.dataformat" % "jackson-dataformat-yaml" % "2.6.5",
"com.fasterxml.jackson.dataformat" % "jackson-dataformat-csv" % "2.4.4",
"com.fasterxml.jackson.dataformat" % "jackson-dataformat-yaml" % "2.10.2",
"com.fasterxml.jackson.dataformat" % "jackson-dataformat-csv" % "2.10.2",
"com.jasonclawson" % "jackson-dataformat-hocon" % "1.1.0",
"com.redislabs" %% "spark-redis" % "3.0.0",
"org.scalatest" %% "scalatest" % "3.0.0" % "test",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -62,10 +62,10 @@ private[offline] object CoercionUtilsScala {
if (valueSet.size == 1 && valueSet.contains(1.0f)) { // a categorical set feature
featureValue.getValue.keySet().toSeq
} else {
val isDenseVector = featureValue.getValue.keys.zipWithIndex.filter(pr => pr._1 != pr._2.toString).size == 0
val isDenseVector = featureValue.getValue.keys.toSeq.map(_.toInt).sorted.zipWithIndex.filter(pr => pr._1 != pr._2).size == 0
val isCategoricalSet = featureValue.getValue.values.filter(!_.equals(1.0f)).size == 0
if (isDenseVector) {
featureValue.getValue.values().map(_.toLong.toString).toSeq
featureValue.getValue.toSeq.sortBy(_._1.toInt).map(_._2.toLong.toString)
} else if (isCategoricalSet) {
featureValue.getValue.keys.toSeq
} else {
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
package com.linkedin.feathr.offline.util

import com.linkedin.feathr.common
import com.linkedin.feathr.offline.util.CoercionUtilsScala.coerceFeatureValueToStringKey
import org.testng.Assert.assertEquals
import org.testng.annotations.Test

import scala.util.Random.shuffle
import scala.collection.JavaConverters._

/**
 * Unit tests for [[CoercionUtilsScala.coerceFeatureValueToStringKey]].
 */
class TestCoercionUtilsScala {

  /**
   * Builds a term -> value map in which term "i" carries the float value i (for i in 0 until 50),
   * wraps it in a FeatureValue, and expects the coerced string key to be "0", "1", ..., "49" in
   * ascending order, regardless of the order in which the entries were supplied.
   */
  @Test(description = "Verifies that coerceFeatureValueToStringKey works properly for unordered NTV values")
  def testFDSTensorSchemas(): Unit = {
    // The expected output doubles as the list of terms used to build the input.
    val expectedKeys = List.range(0, 50).map(_.toString)

    // Pair every term with its index, then randomize the order of the pairs.
    val shuffledEntries = shuffle(expectedKeys.zipWithIndex)

    // NOTE(review): scala's immutable Map does not promise insertion order, so the iteration
    // order seen by the function under test may not be the shuffled order -- confirm whether
    // an order-preserving map is wanted here.
    val inputNTV = shuffledEntries.map { case (term, position) => term -> position.toFloat }.toMap

    val featureValue = new common.FeatureValue(inputNTV.asJava)
    val stringKey = coerceFeatureValueToStringKey(featureValue)

    assertEquals(stringKey.toList, expectedKeys)
  }
}

0 comments on commit 2cedb34

Please sign in to comment.