[SPARK-53732][SQL] Remember TimeTravelSpec in DataSourceV2Relation #52599
Changes from all commits
**`TimeTravelSpec`:**

```diff
@@ -27,8 +27,13 @@ import org.apache.spark.sql.util.CaseInsensitiveStringMap
 
 sealed trait TimeTravelSpec
 
-case class AsOfTimestamp(timestamp: Long) extends TimeTravelSpec
-case class AsOfVersion(version: String) extends TimeTravelSpec
+case class AsOfTimestamp(timestamp: Long) extends TimeTravelSpec {
+  override def toString: String = s"TIMESTAMP AS OF $timestamp"
+}
+
+case class AsOfVersion(version: String) extends TimeTravelSpec {
+  override def toString: String = s"VERSION AS OF '$version'"
+}
 
 object TimeTravelSpec {
   def create(
```

> **Author (Contributor):** Needed for proper `simpleString` output.

*aokolnychyi marked this conversation as resolved.*
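For illustration, a self-contained sketch of what the new `toString` overrides render; the timestamp and version values below are made up:

```scala
// Mirrors the diff above; the concrete values are invented for the demo.
sealed trait TimeTravelSpec

case class AsOfTimestamp(timestamp: Long) extends TimeTravelSpec {
  override def toString: String = s"TIMESTAMP AS OF $timestamp"
}

case class AsOfVersion(version: String) extends TimeTravelSpec {
  override def toString: String = s"VERSION AS OF '$version'"
}

object TimeTravelSpecDemo extends App {
  // The rendered forms match the SQL time travel clauses.
  assert(AsOfTimestamp(1700000000L).toString == "TIMESTAMP AS OF 1700000000")
  assert(AsOfVersion("5").toString == "VERSION AS OF '5'")
}
```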
**`SessionCatalog`:**

```diff
@@ -748,6 +748,13 @@ class SessionCatalog(
     getRawLocalOrGlobalTempView(toNameParts(name)).map(getTempViewPlan)
   }
 
+  /**
+   * Generate a [[View]] operator from the local or global temporary view stored.
+   */
+  def getLocalOrGlobalTempView(name: Seq[String]): Option[View] = {
+    getRawLocalOrGlobalTempView(name).map(getTempViewPlan)
+  }
+
   /**
    * Return the raw logical plan of a temporary local or global view for the given name.
    */
```

> **Author (Contributor):** An overloaded version that takes name parts directly instead of taking a `TableIdentifier`.
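A hypothetical call site for the new overload (the view names and the `catalog` instance are assumptions, not part of the diff); callers that already hold parsed name parts no longer need to build an identifier first:

```scala
// Assumed: `catalog` is an org.apache.spark.sql.catalyst.catalog.SessionCatalog.
// One-part name: resolved as a local temp view.
val local = catalog.getLocalOrGlobalTempView(Seq("my_temp_view"))

// Two-part name: resolved as a global temp view.
val global = catalog.getLocalOrGlobalTempView(Seq("global_temp", "my_view"))

// Both return Option[View]; None means no matching temp view exists.
```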
**`DataSourceV2Relation`:**

```diff
@@ -18,7 +18,7 @@
 package org.apache.spark.sql.execution.datasources.v2
 
 import org.apache.spark.SparkException
-import org.apache.spark.sql.catalyst.analysis.{MultiInstanceRelation, NamedRelation}
+import org.apache.spark.sql.catalyst.analysis.{MultiInstanceRelation, NamedRelation, TimeTravelSpec}
 import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeMap, AttributeReference, Expression, SortOrder}
 import org.apache.spark.sql.catalyst.plans.logical.{ColumnStat, ExposesMetadataColumns, Histogram, HistogramBin, LeafNode, LogicalPlan, Statistics}
 import org.apache.spark.sql.catalyst.types.DataTypeUtils.toAttributes

@@ -45,7 +45,8 @@ abstract class DataSourceV2RelationBase(
     output: Seq[AttributeReference],
     catalog: Option[CatalogPlugin],
     identifier: Option[Identifier],
-    options: CaseInsensitiveStringMap)
+    options: CaseInsensitiveStringMap,
+    timeTravelSpec: Option[TimeTravelSpec] = None)
   extends LeafNode with MultiInstanceRelation with NamedRelation {
 
   import DataSourceV2Implicits._
```

*cloud-fan marked this conversation as resolved.*

```diff
@@ -65,7 +66,12 @@ abstract class DataSourceV2RelationBase(
   override def skipSchemaResolution: Boolean = table.supports(TableCapability.ACCEPT_ANY_SCHEMA)
 
   override def simpleString(maxFields: Int): String = {
-    s"RelationV2${truncatedString(output, "[", ", ", "]", maxFields)} $name"
+    val outputString = truncatedString(output, "[", ", ", "]", maxFields)
+    val nameWithTimeTravelSpec = timeTravelSpec match {
+      case Some(spec) => s"$name $spec"
+      case _ => name
+    }
+    s"RelationV2$outputString $nameWithTimeTravelSpec"
   }
 
   override def computeStats(): Statistics = {
```

> **Author (Contributor):** Covered with tests.

> **Reviewer (Member):** nit: seems clearer to have this just return either `$spec` or empty, and have the final string be `"RelationV2$outputString $timeTravelSpec $name"`.

> **Author (Contributor):** This would require an extra unnecessary space after the name if the spec is empty.
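To make the trade-off discussed above concrete, a standalone sketch of the string composition (the relation name and attribute IDs are made up):

```scala
object SimpleStringDemo extends App {
  val name = "testcat.ns.tbl"
  val outputString = "[id#1L, data#2]"

  // Re-implements the composition from the diff with plain strings.
  def render(timeTravelSpec: Option[String]): String = {
    val nameWithTimeTravelSpec = timeTravelSpec match {
      case Some(spec) => s"$name $spec"
      case _ => name
    }
    s"RelationV2$outputString $nameWithTimeTravelSpec"
  }

  assert(render(Some("VERSION AS OF '2'")) ==
    "RelationV2[id#1L, data#2] testcat.ns.tbl VERSION AS OF '2'")

  // No trailing space when the spec is absent, which is the author's point.
  assert(render(None) == "RelationV2[id#1L, data#2] testcat.ns.tbl")
}
```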
```diff
@@ -96,8 +102,9 @@ case class DataSourceV2Relation(
     override val output: Seq[AttributeReference],
     catalog: Option[CatalogPlugin],
     identifier: Option[Identifier],
-    options: CaseInsensitiveStringMap)
-  extends DataSourceV2RelationBase(table, output, catalog, identifier, options)
+    options: CaseInsensitiveStringMap,
+    timeTravelSpec: Option[TimeTravelSpec] = None)
+  extends DataSourceV2RelationBase(table, output, catalog, identifier, options, timeTravelSpec)
   with ExposesMetadataColumns {
 
   import DataSourceV2Implicits._

@@ -117,7 +124,7 @@ case class DataSourceV2Relation(
   def withMetadataColumns(): DataSourceV2Relation = {
     val newMetadata = metadataOutput.filterNot(outputSet.contains)
     if (newMetadata.nonEmpty) {
-      DataSourceV2Relation(table, output ++ newMetadata, catalog, identifier, options)
+      copy(output = output ++ newMetadata)
     } else {
       this
     }
```
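One observation (mine, not stated in the thread): switching to `copy` means constructor fields added later, such as the new `timeTravelSpec`, are carried over automatically instead of being silently dropped. A stand-in case class shows the mechanism:

```scala
// Stand-in for DataSourceV2Relation; only the copy semantics matter here.
case class Rel(output: Seq[String], timeTravelSpec: Option[String] = None)

object CopyDemo extends App {
  val rel = Rel(Seq("id"), timeTravelSpec = Some("VERSION AS OF '2'"))
  val withMeta = rel.copy(output = rel.output :+ "_partition")
  assert(withMeta.timeTravelSpec.contains("VERSION AS OF '2'")) // spec preserved
}
```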
```diff
@@ -151,7 +158,12 @@ case class DataSourceV2ScanRelation(
   override def name: String = relation.name
 
   override def simpleString(maxFields: Int): String = {
-    s"RelationV2${truncatedString(output, "[", ", ", "]", maxFields)} $name"
+    val outputString = truncatedString(output, "[", ", ", "]", maxFields)
+    val nameWithTimeTravelSpec = relation.timeTravelSpec match {
+      case Some(spec) => s"$name $spec"
+      case _ => name
+    }
+    s"RelationV2$outputString $nameWithTimeTravelSpec"
   }
 
   override def computeStats(): Statistics = {
```

> **Reviewer (Member):** same comment
```diff
@@ -235,17 +247,29 @@ object ExtractV2Table {
   def unapply(relation: DataSourceV2Relation): Option[Table] = Some(relation.table)
 }
 
+object ExtractV2CatalogAndIdentifier {
+  def unapply(relation: DataSourceV2Relation): Option[(CatalogPlugin, Identifier)] = {
+    relation match {
+      case DataSourceV2Relation(_, _, Some(catalog), Some(identifier), _, _) =>
+        Some((catalog, identifier))
+      case _ =>
+        None
+    }
+  }
+}
+
 object DataSourceV2Relation {
   def create(
       table: Table,
       catalog: Option[CatalogPlugin],
       identifier: Option[Identifier],
-      options: CaseInsensitiveStringMap): DataSourceV2Relation = {
+      options: CaseInsensitiveStringMap,
+      timeTravelSpec: Option[TimeTravelSpec] = None): DataSourceV2Relation = {
     import org.apache.spark.sql.connector.catalog.CatalogV2Implicits._
     // The v2 source may return schema containing char/varchar type. We replace char/varchar
     // with "annotated" string type here as the query engine doesn't support char/varchar yet.
     val schema = CharVarcharUtils.replaceCharVarcharWithStringInSchema(table.columns.asSchema)
-    DataSourceV2Relation(table, toAttributes(schema), catalog, identifier, options)
+    DataSourceV2Relation(table, toAttributes(schema), catalog, identifier, options, timeTravelSpec)
   }
 
   def create(
```
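A minimal, self-contained sketch of the extractor pattern that `ExtractV2CatalogAndIdentifier` follows, using stand-in types rather than Spark's:

```scala
// Simplified stand-ins for DataSourceV2Relation, CatalogPlugin, and Identifier.
case class Relation(catalog: Option[String], identifier: Option[String])

object ExtractCatalogAndIdentifier {
  def unapply(r: Relation): Option[(String, String)] =
    (r.catalog, r.identifier) match {
      case (Some(c), Some(i)) => Some((c, i))
      case _ => None
    }
}

object ExtractorDemo extends App {
  Relation(Some("testcat"), Some("ns.tbl")) match {
    case ExtractCatalogAndIdentifier(c, i) => println(s"$c -> $i") // testcat -> ns.tbl
    case _ => println("catalog or identifier missing")
  }
}
```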
**`BasicInMemoryTableCatalog` (test catalog):**

```diff
@@ -66,6 +66,19 @@ class BasicInMemoryTableCatalog extends TableCatalog {
   }
 
+  def pinTable(ident: Identifier, version: String): Unit = {
+    Option(tables.get(ident)) match {
+      case Some(table: InMemoryBaseTable) =>
+        val versionIdent = Identifier.of(ident.namespace, ident.name + version)
+        val versionTable = table.copy()
+        tables.put(versionIdent, versionTable)
+      case Some(table) =>
+        throw new UnsupportedOperationException(s"Can't pin ${table.getClass.getName}")
+      case _ =>
+        throw new NoSuchTableException(ident.asMultipartIdentifier)
+    }
+  }
+
   override def loadTable(ident: Identifier, version: String): Table = {
     val versionIdent = Identifier.of(ident.namespace, ident.name + version)
     Option(tables.get(versionIdent)) match {
```

> **Author (Contributor):** Used in time travel tests.
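A hypothetical test flow built on `pinTable` (the `testcat` registration, namespace, table, and data are assumptions): pin the table's current state under version "1", mutate the table, then time travel back to the pinned copy.

```scala
// Assumed: `catalog` is a BasicInMemoryTableCatalog registered as `testcat`,
// and testcat.ns.tbl already exists with some rows.
catalog.pinTable(Identifier.of(Array("ns"), "tbl"), "1")

spark.sql("INSERT INTO testcat.ns.tbl VALUES (2, 'b')")

// loadTable(ident, version) resolves the pinned copy, i.e. the pre-insert state.
spark.sql("SELECT * FROM testcat.ns.tbl VERSION AS OF '1'").show()
```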
**`Catalog` (`spark.catalog` API):**

```diff
@@ -39,6 +39,7 @@ import org.apache.spark.sql.connector.catalog.CatalogV2Implicits.{CatalogHelper,
 import org.apache.spark.sql.connector.catalog.CatalogV2Util.v2ColumnsToStructType
 import org.apache.spark.sql.errors.QueryCompilationErrors
 import org.apache.spark.sql.execution.command.{ShowNamespacesCommand, ShowTablesCommand}
+import org.apache.spark.sql.execution.command.CommandUtils
 import org.apache.spark.sql.execution.datasources.{DataSource, LogicalRelation}
 import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Relation
 import org.apache.spark.sql.internal.connector.V1Function

@@ -810,20 +811,13 @@ class Catalog(sparkSession: SparkSession) extends catalog.Catalog {
    * @since 2.0.0
    */
   override def uncacheTable(tableName: String): Unit = {
-    // We first try to parse `tableName` to see if it is 2 part name. If so, then in HMS we check
-    // if it is a temp view and uncache the temp view from HMS, otherwise we uncache it from the
-    // cache manager.
-    // if `tableName` is not 2 part name, then we directly uncache it from the cache manager.
-    try {
-      val tableIdent = sparkSession.sessionState.sqlParser.parseTableIdentifier(tableName)
-      sessionCatalog.getLocalOrGlobalTempView(tableIdent).map(uncacheView).getOrElse {
-        sparkSession.sharedState.cacheManager.uncacheQuery(sparkSession.table(tableName),
-          cascade = true)
-      }
-    } catch {
-      case e: org.apache.spark.sql.catalyst.parser.ParseException =>
-        sparkSession.sharedState.cacheManager.uncacheQuery(sparkSession.table(tableName),
-          cascade = true)
-    }
+    // parse the table name and check if it's a temp view (must have 1-2 name parts)
+    // temp views are uncached using uncacheView which respects view text semantics (SPARK-33142)
+    // use CommandUtils for all tables (including with 3+ part names)
+    val nameParts = sparkSession.sessionState.sqlParser.parseMultipartIdentifier(tableName)
+    sessionCatalog.getLocalOrGlobalTempView(nameParts).map(uncacheView).getOrElse {
+      val relation = resolveRelation(tableName)
+      CommandUtils.uncacheTableOrView(sparkSession, relation, cascade = true)
+    }
   }
```

> **Author (Contributor):** I needed to migrate this logic to uncaching by name. I feel like the try/catch was redundant. Keep in mind that … internally calls … It would be great to have another pair of eyes on this one, though.
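For context, a hedged sketch of the caller-visible behavior (the names below are made up): the old code relied on a `ParseException` fallback for names that the 1-2 part parser rejected, while the new code parses every name as a multipart identifier up front.

```scala
// Temp views (1-2 part names) are still uncached via uncacheView.
spark.catalog.uncacheTable("my_temp_view")
spark.catalog.uncacheTable("global_temp.my_view")

// A 3-part v2 name no longer needs the ParseException fallback: it is parsed
// by parseMultipartIdentifier and uncached through CommandUtils.
spark.catalog.uncacheTable("testcat.ns.tbl")
```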
```diff
@@ -868,7 +862,7 @@ class Catalog(sparkSession: SparkSession) extends catalog.Catalog {
    * @since 2.0.0
    */
   override def refreshTable(tableName: String): Unit = {
-    val relation = sparkSession.table(tableName).queryExecution.analyzed
+    val relation = resolveRelation(tableName)
 
     relation.refresh()

@@ -891,7 +885,11 @@ class Catalog(sparkSession: SparkSession) extends catalog.Catalog {
     // Note this is a no-op for the relation itself if it's not cached, but will clear all
     // caches referencing this relation. If this relation is cached as an InMemoryRelation,
     // this will clear the relation cache and caches of all its dependents.
-    sparkSession.sharedState.cacheManager.recacheByPlan(sparkSession, relation)
+    CommandUtils.recacheTableOrView(sparkSession, relation)
   }
 
+  private def resolveRelation(tableName: String): LogicalPlan = {
+    sparkSession.table(tableName).queryExecution.analyzed
+  }
+
   /**
```
> It should be impossible to reach this line with a valid time travel spec. Just a sanity check.