Add sorting to Dense vector check (#148)

feathr-ai · Apr 25, 2022 · 2cedb34
1 parent 3cc5cf7
commit 2cedb34
Show file tree

Hide file tree

Showing 3 changed files with 24 additions and 5 deletions.
diff --git a/build.sbt b/build.sbt
@@ -12,7 +12,7 @@ val localAndCloudDiffDependencies = Seq(
     "org.apache.spark" %% "spark-catalyst" % sparkVersion,
     "org.apache.logging.log4j" % "log4j-core" % "2.17.1",
     "com.typesafe" % "config" % "1.3.2",
-    "com.fasterxml.jackson.core" % "jackson-databind" % "2.6.5",
+    "com.fasterxml.jackson.core" % "jackson-databind" % "2.10.2",
     "org.apache.hadoop" % "hadoop-mapreduce-client-core" % "2.7.2",
     "org.apache.hadoop" % "hadoop-common" % "2.7.2",
     "org.apache.avro" % "avro" % "1.8.2",
@@ -33,8 +33,8 @@ val localAndCloudCommonDependencies = Seq(
     "it.unimi.dsi" % "fastutil" % "8.1.1",
     "org.mvel" % "mvel2" % "2.2.7.Final",
     "com.fasterxml.jackson.module" %% "jackson-module-scala" % "2.6.7.1",
-    "com.fasterxml.jackson.dataformat" % "jackson-dataformat-yaml" % "2.6.5",
-    "com.fasterxml.jackson.dataformat" % "jackson-dataformat-csv" % "2.4.4",
+    "com.fasterxml.jackson.dataformat" % "jackson-dataformat-yaml" % "2.10.2",
+    "com.fasterxml.jackson.dataformat" % "jackson-dataformat-csv" % "2.10.2",
     "com.jasonclawson" % "jackson-dataformat-hocon" % "1.1.0",
     "com.redislabs" %% "spark-redis" % "3.0.0",
     "org.scalatest" %% "scalatest" % "3.0.0" % "test",

diff --git a/src/main/scala/com/linkedin/feathr/offline/util/CoercionUtilsScala.scala b/src/main/scala/com/linkedin/feathr/offline/util/CoercionUtilsScala.scala
@@ -62,10 +62,10 @@ private[offline] object CoercionUtilsScala {
       if (valueSet.size == 1 && valueSet.contains(1.0f)) { // a categorical set feature
         featureValue.getValue.keySet().toSeq
       } else {
-        val isDenseVector = featureValue.getValue.keys.zipWithIndex.filter(pr => pr._1 != pr._2.toString).size == 0
+        val isDenseVector = featureValue.getValue.keys.toSeq.map(_.toInt).sorted.zipWithIndex.filter(pr => pr._1 != pr._2).size == 0
         val isCategoricalSet = featureValue.getValue.values.filter(!_.equals(1.0f)).size == 0
         if (isDenseVector) {
-          featureValue.getValue.values().map(_.toLong.toString).toSeq
+          featureValue.getValue.toSeq.sortBy(_._1.toInt).map(_._2.toLong.toString)
         } else if (isCategoricalSet) {
           featureValue.getValue.keys.toSeq
         } else {

diff --git a/src/test/scala/com/linkedin/feathr/offline/util/TestCoercionUtilsScala.scala b/src/test/scala/com/linkedin/feathr/offline/util/TestCoercionUtilsScala.scala
@@ -0,0 +1,19 @@
+package com.linkedin.feathr.offline.util
+
+import com.linkedin.feathr.common
+import com.linkedin.feathr.offline.util.CoercionUtilsScala.coerceFeatureValueToStringKey
+import org.testng.Assert.assertEquals
+import org.testng.annotations.Test
+
+import scala.util.Random.shuffle
+import scala.collection.JavaConverters._
+
+class TestCoercionUtilsScala {
+  @Test(description = "Verifies that coerceFeatureValueToStringKey works properly for unordered NTV values")
+  def testFDSTensorSchemas(): Unit = {
+    val inputNTV = shuffle(List.range(0, 50).map(_.toString).zipWithIndex).map(kv => (kv._1, kv._2.toFloat)).toMap
+    val stringKey = coerceFeatureValueToStringKey(new common.FeatureValue(inputNTV.asJava))
+    assertEquals(stringKey.toList, List.range(0, 50).map(_.toString))
+  }
+
+}