Auto Indexing #247

Merged: 36 commits, merged on Dec 15, 2023

Changes from 34 commits

Commits (36)
- `fa69b4f` Merge pull request #239 from osopardo1/235-add-delta-data-skipping (osopardo1, Nov 30, 2023)
- `75f1128` Bootstrap AutoIndexing (Dec 5, 2023)
- `ba29a0e` change names (Dec 5, 2023)
- `03b737d` Add headers and optional parameters (Dec 5, 2023)
- `e7012b4` fix (Dec 5, 2023)
- `338d061` Add SparkAutoIndexerTest (Dec 5, 2023)
- `c63093b` scalafix (Dec 5, 2023)
- `485bccd` add AUTO_INDEXING_ENABLED check in QbeastDataSource (Dec 5, 2023)
- `9f14557` (WIP) defining method to autoselect best columns to index. Tests in (Dec 5, 2023)
- `8db0a51` Code cleaning (Dec 5, 2023)
- `838a225` remove cs (Dec 5, 2023)
- `9e9801d` Minor changes (SrTangente, Dec 5, 2023)
- `a45c99f` reorganization (Dec 7, 2023)
- `e051e47` Change AutoIndexer trait (Dec 7, 2023)
- `80629c5` fix (Dec 7, 2023)
- `b48074e` 2 methods AutoIndexer (Dec 7, 2023)
- `ea11e90` Delete checks (Dec 7, 2023)
- `85a7ac9` remove coma (Dec 7, 2023)
- `cf93459` Add checking again (Dec 11, 2023)
- `304ed4f` Merge pull request #245 from osopardo1/backport-delta-file-skipping (osopardo1, Dec 11, 2023)
- `3c8c918` Added some documentation (SrTangente, Dec 11, 2023)
- `a4b0731` Merge branch 'main' into 244-autoindexing (SrTangente, Dec 11, 2023)
- `963e04e` Revert "Added some documentation" (SrTangente, Dec 11, 2023)
- `3caa4d6` Revert "Merge branch 'main' into 244-autoindexing" (SrTangente, Dec 11, 2023)
- `13be525` Reverted merge (SrTangente, Dec 12, 2023)
- `255c70c` fix tests (Dec 12, 2023)
- `f84f1f9` Add more docs (Dec 12, 2023)
- `78a8fef` Refomat Dependencies.scala (Dec 13, 2023)
- `5c51bf8` Move implementation to the trait (Dec 13, 2023)
- `d5be51a` Rename AutoIndexer, separate methods (Dec 14, 2023)
- `91d4c06` Remove whiole package reference (Dec 14, 2023)
- `e4ab8b0` small naming change (Dec 14, 2023)
- `eeeaf71` Change documentation (Dec 14, 2023)
- `7138a57` Change config AUTO_INDEXING_ENABLED name (Dec 14, 2023)
- `0a9d9cf` Change config variable on tests (Dec 14, 2023)
- `e423a66` fix scalafmt (Dec 14, 2023)
1 change: 1 addition & 0 deletions build.sbt
@@ -20,6 +20,7 @@ lazy val qbeastSpark = (project in file("."))
sparkSql % Provided,
hadoopClient % Provided,
deltaCore % Provided,
sparkml % Provided,
amazonAws % Test,
hadoopCommons % Test,
hadoopAws % Test),
@@ -0,0 +1,36 @@
package io.qbeast.core.model

/**
* ColumnsToIndexSelector interface to automatically select which columns to index.
* @tparam DATA
* the data to index
*/
trait ColumnsToIndexSelector[DATA] {

/**
* The maximum number of columns to index.
* @return
*/
def MAX_COLUMNS_TO_INDEX: Int

/**
* Selects the columns to index given a DataFrame
* @param data
* the data to index
* @return
*/
def selectColumnsToIndex(data: DATA): Seq[String] =
selectColumnsToIndex(data, MAX_COLUMNS_TO_INDEX)

/**
* Selects the columns to index with a given number of columns to index
* @param data
* the data to index
* @param numColumnsToIndex
* the number of columns to index
* @return
* A sequence with the names of the columns to index
*/
def selectColumnsToIndex(data: DATA, numColumnsToIndex: Int): Seq[String]

}
@@ -21,6 +21,7 @@ trait QbeastCoreContext[DATA, DataSchema, QbeastOptions, FileDescriptor] {
def indexManager: IndexManager[DATA]
def queryManager[QUERY: ClassTag]: QueryManager[QUERY, DATA]
def revisionBuilder: RevisionFactory[DataSchema, QbeastOptions]
def columnSelector: ColumnsToIndexSelector[DATA]
def keeper: Keeper

}
16 changes: 16 additions & 0 deletions docs/AdvancedConfiguration.md
@@ -65,6 +65,22 @@ You can specify different advanced options to the columns to index:
df.write.format("qbeast").option("columnsToIndex", "column:type,column2:type...")
```

## Automatic Column Selection

To **avoid specifying the `columnsToIndex`** option, you can enable the auto indexer through the Spark configuration:

```shell
--conf spark.qbeast.index.columnsToIndex.auto=true \
--conf spark.qbeast.index.columnsToIndex.auto.max=10
```
And write the DataFrame without any extra options:

```scala
df.write.format("qbeast").save("path/to/table")
```
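
To verify which columns were selected, you can inspect the table metadata after writing. A minimal sketch, assuming the `QbeastTable` utility and its `indexedColumns()` method are available in your version:

```scala
import io.qbeast.spark.QbeastTable

// Load the Qbeast table metadata and print the automatically selected columns
val qbeastTable = QbeastTable.forPath(spark, "path/to/table")
println(qbeastTable.indexedColumns())
```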

Read more about it in the [Columns To Index Selector](ColumnsToIndexSelector.md) section.

## CubeSize

CubeSize option lets you specify the maximum size of the cube, in number of records. By default, it's set to 5M.
73 changes: 73 additions & 0 deletions docs/ColumnsToIndexSelector.md
@@ -0,0 +1,73 @@
## Columns To Index Selector

Qbeast Format organizes the records using a multidimensional index built on a subset of the columns in the table. Starting with version `1.0.0`, **the columns can be selected automatically, by enabling the automatic column selector, or manually by the user**.

If you prefer not to reason about the data distribution and want to let Qbeast handle the indexing pre-processing, there is no need to specify the `columnsToIndex` option when writing the **DataFrame**.

You only need to **enable the Columns To Index Selector in the `SparkConf`**:

```shell
--conf spark.qbeast.index.columnsToIndex.auto=true \
--conf spark.qbeast.index.columnsToIndex.auto.max=10
```
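
Equivalently, a sketch of setting the same properties programmatically when building the Spark session (the property names are the ones shown above; the application name is just an example):

```scala
import org.apache.spark.sql.SparkSession

// Enable automatic column selection through the SparkSession configuration
val spark = SparkSession
  .builder()
  .appName("qbeast-auto-indexing")
  .config("spark.qbeast.index.columnsToIndex.auto", "true")
  .config("spark.qbeast.index.columnsToIndex.auto.max", "10")
  .getOrCreate()
```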

And **write the DataFrame as usual**:

```scala
df.write.format("qbeast").save("path/to/table")
```

Or use SQL:

```scala
spark.sql("CREATE TABLE table_name USING qbeast LOCATION 'path/to/table'")
```
### Interface

The `ColumnsToIndexSelector` is an interface that can be implemented by different classes. The interface is defined as follows:

```scala
trait ColumnsToIndexSelector[DATA] {

/**
* The maximum number of columns to index.
* @return
*/
def MAX_COLUMNS_TO_INDEX: Int

/**
* Selects the columns to index given a DataFrame
* @param data
* the data to index
* @return
*/
def selectColumnsToIndex(data: DATA): Seq[String] =
selectColumnsToIndex(data, MAX_COLUMNS_TO_INDEX)

/**
* Selects the columns to index with a given number of columns to index
* @param data
* the data to index
* @param numColumnsToIndex
* the number of columns to index
* @return
* A sequence with the names of the columns to index
*/
def selectColumnsToIndex(data: DATA, numColumnsToIndex: Int): Seq[String]

}

```
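
For illustration, here is a minimal sketch of a custom implementation. The `NumericColumnsSelector` name and its selection strategy (pick the first N numeric columns, ignoring correlations) are hypothetical and not part of the library:

```scala
import io.qbeast.core.model.ColumnsToIndexSelector
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.types.NumericType

// Hypothetical selector: index the first N numeric columns of the DataFrame
object NumericColumnsSelector extends ColumnsToIndexSelector[DataFrame] {

  override def MAX_COLUMNS_TO_INDEX: Int = 3

  override def selectColumnsToIndex(data: DataFrame, numColumnsToIndex: Int): Seq[String] =
    data.schema.fields
      .collect { case field if field.dataType.isInstanceOf[NumericType] => field.name }
      .take(numColumnsToIndex)
      .toSeq
}
```

Only the two abstract members need to be provided; the one-argument `selectColumnsToIndex(data)` overload falls back to `MAX_COLUMNS_TO_INDEX` through the default implementation shown above.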

### SparkColumnsToIndexSelector

`SparkColumnsToIndexSelector` is the first implementation of the `ColumnsToIndexSelector` interface. It is designed to work with Apache Spark DataFrames and **automatically selects columns for indexing based on a correlation analysis of the data**.

The steps are the following (a usage sketch follows the list):

1. **Convert Timestamp columns** to Unix timestamps and update the DataFrame.
2. **Initialize a VectorAssembler** for each column. String columns are first turned into numeric features with a StringIndexer and a OneHotEncoder.
3. **Combine the features** from the VectorAssembler stages into a single vector column.
4. Calculate the **correlation matrix**.
5. Calculate the **average absolute correlation** for each column.
6. Select the **top N columns with the lowest average correlation**.
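
To make the steps above concrete, here is a small usage sketch. The selector object and its `selectColumnsToIndex` method come from this PR; the toy column names and the local session setup are made up for illustration:

```scala
import io.qbeast.spark.index.SparkColumnsToIndexSelector
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().master("local[*]").getOrCreate()
import spark.implicits._

// Toy DataFrame: column names are hypothetical
val df = Seq(
  (1, 10.0, "a", 100L),
  (2, 20.0, "b", 200L),
  (3, 35.0, "a", 150L)).toDF("id", "price", "category", "size")

// Ask for the 2 least-correlated columns
val columnsToIndex = SparkColumnsToIndexSelector.selectColumnsToIndex(df, 2)
println(columnsToIndex)

// The selection can also be passed explicitly at write time
df.write
  .format("qbeast")
  .option("columnsToIndex", columnsToIndex.mkString(","))
  .save("/tmp/qbeast_table")
```
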
1 change: 1 addition & 0 deletions project/Dependencies.scala
@@ -20,4 +20,5 @@ object Dependencies {
val hadoopCommons = "org.apache.hadoop" % "hadoop-common" % hadoopVersion
val hadoopAws = "org.apache.hadoop" % "hadoop-aws" % hadoopVersion
val fasterxml = "com.fasterxml.jackson.module" %% "jackson-module-scala" % "2.12.0"
val sparkml = "org.apache.spark" %% "spark-mllib" % sparkVersion
}
6 changes: 5 additions & 1 deletion src/main/scala/io/qbeast/context/QbeastContext.scala
@@ -8,6 +8,7 @@ import io.qbeast.core.keeper.LocalKeeper
import io.qbeast.core.model._
import io.qbeast.spark.delta.writer.RollupDataWriter
import io.qbeast.spark.delta.SparkDeltaMetadataManager
import io.qbeast.spark.index.SparkColumnsToIndexSelector
import io.qbeast.spark.index.SparkOTreeManager
import io.qbeast.spark.index.SparkRevisionFactory
import io.qbeast.spark.internal.QbeastOptions
@@ -92,6 +93,8 @@ object QbeastContext
override def revisionBuilder: RevisionFactory[StructType, QbeastOptions] =
SparkRevisionFactory

override def columnSelector: ColumnsToIndexSelector[DataFrame] = SparkColumnsToIndexSelector

/**
* Sets the unmanaged context. The specified context will not be disposed automatically at the
* end of the Spark session.
@@ -146,7 +149,8 @@ object QbeastContext
indexManager,
metadataManager,
dataWriter,
revisionBuilder)
revisionBuilder,
columnSelector)

private def destroyManaged(): Unit = this.synchronized {
managedOption.foreach(_.keeper.stop())
146 changes: 146 additions & 0 deletions src/main/scala/io/qbeast/spark/index/SparkColumnsToIndexSelector.scala
@@ -0,0 +1,146 @@
/*
* Copyright 2021 Qbeast Analytics, S.L.
*/
package io.qbeast.spark.index

import io.qbeast.core.model.ColumnsToIndexSelector
import org.apache.spark.ml.feature.OneHotEncoder
import org.apache.spark.ml.feature.StringIndexer
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.linalg.Matrix
import org.apache.spark.ml.stat.Correlation
import org.apache.spark.ml.Pipeline
import org.apache.spark.qbeast.config.MAX_NUM_COLUMNS_TO_INDEX
import org.apache.spark.sql.functions.col
import org.apache.spark.sql.functions.unix_timestamp
import org.apache.spark.sql.types.StringType
import org.apache.spark.sql.types.StructField
import org.apache.spark.sql.types.TimestampType
import org.apache.spark.sql.DataFrame

object SparkColumnsToIndexSelector extends ColumnsToIndexSelector[DataFrame] with Serializable {

/**
* The maximum number of columns to index.
*
* @return
*/
override def MAX_COLUMNS_TO_INDEX: Int = MAX_NUM_COLUMNS_TO_INDEX

/**
* Adds unix timestamp columns to the DataFrame for the columns specified
* @param data
* the DataFrame
* @param inputCols
* the columns to inspect; TimestampType columns are converted to unix timestamps
* @return
* the DataFrame with the converted timestamp columns
*/
private def withUnixTimestamp(data: DataFrame, inputCols: Seq[StructField]): DataFrame = {
val timestampColsTransformation = inputCols
.filter(_.dataType == TimestampType)
.map(c => (c.name, unix_timestamp(col(c.name))))
.toMap

data.withColumns(timestampColsTransformation)
}

/**
* Adds preprocessing transformers to the DataFrame for the columns specified
* @param data
* the DataFrame
* @param inputCols
* the columns to preprocess
* @return
*/
protected def withPreprocessedPipeline(
data: DataFrame,
inputCols: Seq[StructField]): DataFrame = {

val transformers = inputCols
.collect {
case column if column.dataType == StringType =>
val colName = column.name
val indexer = new StringIndexer().setInputCol(colName).setOutputCol(s"${colName}_Index")
val encoder =
new OneHotEncoder().setInputCol(s"${colName}_Index").setOutputCol(s"${colName}_Vec")
Seq(indexer, encoder)

case column =>
val colName = column.name
Seq(
new VectorAssembler()
.setInputCols(Array(colName))
.setOutputCol(s"${colName}_Vec")
.setHandleInvalid("keep"))
}
.flatten
.toArray

val preprocessingPipeline = new Pipeline().setStages(transformers)
val preprocessingModel = preprocessingPipeline.fit(data)
val preprocessedData = preprocessingModel.transform(data)

preprocessedData
}

/**
* Selects the N columns with the lowest average absolute correlation
* @param data
* the DataFrame
* @param inputCols
* the columns to preprocess
* @param numCols
* the number of columns to return
* @return
*/
protected def selectTopNCorrelatedColumns(
data: DataFrame,
inputCols: Seq[StructField],
numCols: Int): Array[String] = {

val inputVecCols = inputCols.map(_.name + "_Vec").toArray

val assembler = new VectorAssembler()
.setInputCols(inputVecCols)
.setOutputCol("features")
.setHandleInvalid("keep")

val vectorDf = assembler.transform(data)

// Calculate the correlation matrix
val correlationMatrix: DataFrame = Correlation.corr(vectorDf, "features")
// Extract the correlation matrix as a Matrix
val corrArray = correlationMatrix.select("pearson(features)").head.getAs[Matrix](0)

// Calculate the average absolute correlation for each column
val averageCorrelation =
corrArray.toArray.map(Math.abs).grouped(inputVecCols.length).toArray.head

// Get the indices of columns with the lowest average correlation
val sortedIndices = averageCorrelation.zipWithIndex.sortBy { case (corr, _) => corr }
val selectedIndices = sortedIndices.take(numCols).map(_._2)

val selectedCols = selectedIndices.map(inputCols(_).name)
selectedCols

}

override def selectColumnsToIndex(data: DataFrame, numColumnsToIndex: Int): Seq[String] = {

// If there is no data to write, return the first numColumnsToIndex columns
if (data.isEmpty) {
return data.columns.take(numColumnsToIndex)
}

val inputCols = data.schema
// Add unix timestamp columns
val updatedData = withUnixTimestamp(data, inputCols)
// Add column transformers
val preprocessedPipeline = withPreprocessedPipeline(updatedData, inputCols)
// Select the N columns with the lowest average absolute correlation
val selectedColumns =
selectTopNCorrelatedColumns(preprocessedPipeline, inputCols, numColumnsToIndex)

selectedColumns

}

}
@@ -10,6 +10,7 @@ import io.qbeast.spark.internal.QbeastOptions
import io.qbeast.spark.table.IndexedTableFactory
import org.apache.hadoop.fs.FileStatus
import org.apache.hadoop.fs.Path
import org.apache.spark.qbeast.config.COLUMN_SELECTOR_ENABLED
import org.apache.spark.sql.catalyst.TableIdentifier
import org.apache.spark.sql.connector.catalog._
import org.apache.spark.sql.connector.expressions.Transform
@@ -94,7 +95,7 @@ class QbeastDataSource private[sources] (private val tableFactory: IndexedTableF
data: DataFrame): BaseRelation = {

require(
parameters.contains("columnsToIndex") || mode == SaveMode.Append,
parameters.contains("columnsToIndex") || mode == SaveMode.Append || COLUMN_SELECTOR_ENABLED,
throw AnalysisExceptionFactory.create("'columnsToIndex' is not specified"))

val tableId = QbeastOptions.loadTableIDFromParameters(parameters)
@@ -6,7 +6,6 @@ package io.qbeast.spark.internal.sources.catalog
import io.qbeast.context.QbeastContext
import io.qbeast.spark.internal.sources.v2.QbeastStagedTableImpl
import io.qbeast.spark.internal.sources.v2.QbeastTableImpl
import io.qbeast.spark.internal.QbeastOptions.checkQbeastProperties
import org.apache.hadoop.fs.Path
import org.apache.spark.sql.catalyst.analysis.NoSuchDatabaseException
import org.apache.spark.sql.catalyst.analysis.NoSuchNamespaceException
@@ -109,7 +108,6 @@ class QbeastCatalog[T <: TableCatalog with SupportsNamespaces with FunctionCatal
properties: util.Map[String, String]): Table = {

if (QbeastCatalogUtils.isQbeastProvider(properties)) {
checkQbeastProperties(properties.asScala.toMap)
// Create the table
QbeastCatalogUtils.createQbeastTable(
ident,
@@ -5,7 +5,6 @@ package io.qbeast.spark.internal.sources.v2

import io.qbeast.spark.internal.sources.catalog.CreationMode
import io.qbeast.spark.internal.sources.catalog.QbeastCatalogUtils
import io.qbeast.spark.internal.QbeastOptions.checkQbeastProperties
import io.qbeast.spark.table.IndexedTableFactory
import org.apache.spark.sql.catalyst.catalog.SessionCatalog
import org.apache.spark.sql.connector.catalog.Identifier
@@ -72,9 +71,6 @@ private[sources] class QbeastStagedTableImpl(
// we pass all the writeOptions to the properties as well
writeOptions.foreach { case (k, v) => props.put(k, v) }

// Check all the Qbeast properties are correctly specified
checkQbeastProperties(props.asScala.toMap)

// Creates the corresponding table on the Catalog and executes
// the writing of the dataFrame (if any)
QbeastCatalogUtils.createQbeastTable(