diff --git a/.gitignore b/.gitignore index c2def9d8a1..bc3924242b 100644 --- a/.gitignore +++ b/.gitignore @@ -9,3 +9,4 @@ winpkg/target .DS_Store .idea +/derby.log diff --git a/agents-audit/pom.xml b/agents-audit/pom.xml index 4aaec0996a..86f1056d0a 100644 --- a/agents-audit/pom.xml +++ b/agents-audit/pom.xml @@ -104,4 +104,9 @@ + + + target/classes + target/test-classes + diff --git a/agents-common/pom.xml b/agents-common/pom.xml index 4486af3598..9964c5fc10 100644 --- a/agents-common/pom.xml +++ b/agents-common/pom.xml @@ -21,6 +21,8 @@ Common library for Plugins Plugins Common + target/classes + target/test-classes org.apache.maven.plugins @@ -86,4 +88,5 @@ mysql-connector-java + diff --git a/agents-cred/pom.xml b/agents-cred/pom.xml index c62268329a..9542d7d93e 100644 --- a/agents-cred/pom.xml +++ b/agents-cred/pom.xml @@ -51,4 +51,9 @@ junit + + + target/classes + target/test-classes + diff --git a/agents-installer/pom.xml b/agents-installer/pom.xml index 20902cb62d..776962547c 100644 --- a/agents-installer/pom.xml +++ b/agents-installer/pom.xml @@ -34,4 +34,9 @@ ${commons.cli.version} + + + target/classes + target/test-classes + diff --git a/credentialbuilder/pom.xml b/credentialbuilder/pom.xml index 461dcd0b67..71ce3f62cb 100644 --- a/credentialbuilder/pom.xml +++ b/credentialbuilder/pom.xml @@ -78,4 +78,9 @@ ${hadoop.version} + + + target/classes + target/test-classes + diff --git a/embeddedwebserver/pom.xml b/embeddedwebserver/pom.xml index a8c7cba751..6ef58766fc 100644 --- a/embeddedwebserver/pom.xml +++ b/embeddedwebserver/pom.xml @@ -84,4 +84,9 @@ ${project.version} + + + target/classes + target/test-classes + diff --git a/hbase-agent/pom.xml b/hbase-agent/pom.xml index c044ef0c30..d4af2a1343 100644 --- a/hbase-agent/pom.xml +++ b/hbase-agent/pom.xml @@ -95,6 +95,8 @@ + target/classes + target/test-classes src/test/resources diff --git a/hive-agent/pom.xml b/hive-agent/pom.xml index 0a091b202c..23a55d7195 100644 --- a/hive-agent/pom.xml +++ b/hive-agent/pom.xml @@ -84,6 +84,8 @@ + target/classes + target/test-classes src/test/resources diff --git a/jisql/pom.xml b/jisql/pom.xml index 07fb08b360..81695af3d2 100644 --- a/jisql/pom.xml +++ b/jisql/pom.xml @@ -34,4 +34,9 @@ 3.2 + + + target/classes + target/test-classes + diff --git a/kms/pom.xml b/kms/pom.xml index 043c67fde4..49e4e6cc85 100644 --- a/kms/pom.xml +++ b/kms/pom.xml @@ -263,6 +263,8 @@ + target/classes + target/test-classes diff --git a/knox-agent/pom.xml b/knox-agent/pom.xml index 0c45c84baa..193b385673 100644 --- a/knox-agent/pom.xml +++ b/knox-agent/pom.xml @@ -155,6 +155,8 @@ + target/classes + target/test-classes ${basedir}/src/main/java ${basedir}/src/test/java diff --git a/plugin-atlas/pom.xml b/plugin-atlas/pom.xml index 87ff40d149..868844ba46 100644 --- a/plugin-atlas/pom.xml +++ b/plugin-atlas/pom.xml @@ -85,6 +85,11 @@ ${google.guava.version} + + + target/classes + target/test-classes + diff --git a/plugin-elasticsearch/pom.xml b/plugin-elasticsearch/pom.xml index b9973105c9..acefa8462d 100644 --- a/plugin-elasticsearch/pom.xml +++ b/plugin-elasticsearch/pom.xml @@ -66,4 +66,9 @@ ${httpcomponents.httpcore.version} + + + target/classes + target/test-classes + diff --git a/plugin-kafka/pom.xml b/plugin-kafka/pom.xml index 132baf7679..001d336390 100644 --- a/plugin-kafka/pom.xml +++ b/plugin-kafka/pom.xml @@ -92,6 +92,8 @@ + target/classes + target/test-classes src/test/resources diff --git a/plugin-kms/pom.xml b/plugin-kms/pom.xml index 1940f41691..546db6d212 100644 --- a/plugin-kms/pom.xml +++ 
b/plugin-kms/pom.xml @@ -64,6 +64,8 @@ + target/classes + target/test-classes src/test/resources diff --git a/plugin-kylin/pom.xml b/plugin-kylin/pom.xml index 69badfc7ac..298966c310 100644 --- a/plugin-kylin/pom.xml +++ b/plugin-kylin/pom.xml @@ -88,4 +88,8 @@ ${httpcomponents.httpcore.version} + + target/classes + target/test-classes + \ No newline at end of file diff --git a/plugin-nifi-registry/pom.xml b/plugin-nifi-registry/pom.xml index 3e99f04a4c..095a744268 100644 --- a/plugin-nifi-registry/pom.xml +++ b/plugin-nifi-registry/pom.xml @@ -62,4 +62,8 @@ + + target/classes + target/test-classes + diff --git a/plugin-nifi/pom.xml b/plugin-nifi/pom.xml index c510d87fe9..3380b1ac60 100644 --- a/plugin-nifi/pom.xml +++ b/plugin-nifi/pom.xml @@ -59,4 +59,9 @@ test + + + target/classes + target/test-classes + diff --git a/plugin-solr/pom.xml b/plugin-solr/pom.xml index 619cc9ebdb..a9e76d51a4 100644 --- a/plugin-solr/pom.xml +++ b/plugin-solr/pom.xml @@ -52,4 +52,9 @@ ${solr.version} + + + target/classes + target/test-classes + diff --git a/plugin-spark/.gitignore b/plugin-spark/.gitignore new file mode 100644 index 0000000000..eebb5db03c --- /dev/null +++ b/plugin-spark/.gitignore @@ -0,0 +1,3 @@ +target/ +dependency-reduced-pom.xml +/derby.log diff --git a/plugin-spark/pom.xml b/plugin-spark/pom.xml new file mode 100644 index 0000000000..7b9130323d --- /dev/null +++ b/plugin-spark/pom.xml @@ -0,0 +1,405 @@ + + + + 4.0.0 + ranger-spark-plugin + Spark SQL Security Plugin + jar + + 2.11.8 + 2.11 + 2.2.6 + 2.3.2 + provided + ranger_spark_project + + + ranger + org.apache.ranger + 2.0.0-SNAPSHOT + .. + + + + org.scala-lang + scala-library + ${scala.version} + provided + + + org.apache.spark + spark-catalyst_${scala.binary.version} + ${spark.version} + ${spark.scope} + + + org.apache.spark + spark-hive_${scala.binary.version} + ${spark.version} + ${spark.scope} + + + org.spark-project + * + + + + + org.apache.ranger + ranger-plugins-common + ${project.version} + + + * + * + + + + + org.apache.ranger + ranger-plugins-cred + ${project.version} + + + * + * + + + + + org.apache.ranger + ranger-plugins-audit + ${project.version} + + + * + * + + + + + org.apache.hive + hive-exec + 2.3.2 + + + * + * + + + + + org.eclipse.persistence + eclipselink + ${eclipse.jpa.version} + + + * + * + + + + + com.google.code.gson + gson + ${gson.version} + + + * + * + + + + + org.eclipse.persistence + javax.persistence + ${javax.persistence.version} + + + * + * + + + + + org.apache.httpcomponents + httpcore + ${httpcomponents.httpcore.version} + + + * + * + + + + + org.apache.httpcomponents + httpmime + ${httpcomponents.httpmime.version} + + + * + * + + + + + org.apache.httpcomponents + httpclient + ${httpcomponents.httpclient.version} + + + * + * + + + + + com.sun.jersey + jersey-bundle + ${jersey-bundle.version} + + + * + * + + + + + org.noggit + noggit + ${noggit.version} + + + * + * + + + + + org.apache.solr + solr-solrj + ${solr.version} + + + * + * + + + + + + + org.scalatest + scalatest_${scala.binary.version} + 3.0.3 + + + + org.apache.spark + spark-core_${scala.binary.version} + ${spark.version} + test-jar + + + + org.apache.spark + spark-hive_${scala.binary.version} + ${spark.version} + test-jar + + + + com.kstruct + gethostname4j + 0.0.2 + test + + + + + + target/classes + target/test-classes + + + ${project.basedir}/src/test/resources + + + + + net.alchim31.maven + scala-maven-plugin + 3.2.2 + + + eclipse-add-source + + add-source + + + + scala-compile-first + + compile + + + + scala-test-compile-first + + 
testCompile + + + + + ${scala.version} + incremental + true + + -unchecked + -deprecation + -feature + -explaintypes + -Yno-adapted-args + + + -Xms1024m + -Xmx1024m + -XX:ReservedCodeCacheSize=512M + + + -source + ${java.version.required} + -target + ${java.version.required} + -Xlint:all,-serial,-path,-try + + + + + + org.apache.maven.plugins + maven-shade-plugin + + false + + + com.google.code.gson:gson + com.sun.jersey:jersey-bundle + org.apache.httpcomponents:httpclient + org.apache.httpcomponents:httpcore + org.apache.httpcomponents:httpmime + org.apache.ranger:ranger-plugins-common + org.apache.ranger:ranger-plugins-cred + org.apache.ranger:ranger-plugins-audit + org.apache.solr:solr-solrj + org.codehaus.jackson:jackson-core-asl + org.codehaus.jackson:jackson-jaxrs + org.codehaus.jackson:jackson-mapper-asl + org.codehaus.jackson:jackson-xc + org.eclipse.persistence:eclipselink + org.eclipse.persistence:javax.persistence + org.apache.hive:hive-exec + org.noggit:noggit + + + + + org.apache.hive:hive-exec + + + org/apache/hadoop/hive/ql/udf/generic/**Mask** + org/apache/hadoop/hive/ql/udf/generic/**Transform** + + + + + + com.sun.jersey + ${ranger.spark.package}.jersey + + + + + + package + + shade + + + + + + + + + + + + + org.apache.maven.plugins + maven-surefire-plugin + 2.12.4 + + true + + + + + org.scalatest + scalatest-maven-plugin + 1.0 + + ${project.build.directory}/surefire-reports + . + TestSuite.txt + + + + test + + test + + + + + + + + + + + spark-2.3 + + 2.3.2 + 3.0.3 + + + + + spark-2.4 + + 2.4.0 + 3.0.3 + + + + + diff --git a/plugin-spark/src/main/scala/org/apache/ranger/authorization/spark/authorizer/RangerSparkAccessRequest.scala b/plugin-spark/src/main/scala/org/apache/ranger/authorization/spark/authorizer/RangerSparkAccessRequest.scala new file mode 100644 index 0000000000..11a0746054 --- /dev/null +++ b/plugin-spark/src/main/scala/org/apache/ranger/authorization/spark/authorizer/RangerSparkAccessRequest.scala @@ -0,0 +1,85 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.ranger.authorization.spark.authorizer + +import java.util.Date + +import org.apache.ranger.authorization.spark.authorizer.SparkAccessType.SparkAccessType +import org.apache.ranger.plugin.policyengine.{RangerAccessRequestImpl, RangerPolicyEngine} +import org.apache.ranger.plugin.util.RangerAccessRequestUtil + +import scala.collection.JavaConverters._ + +class RangerSparkAccessRequest private extends RangerAccessRequestImpl { + + private var accessType = SparkAccessType.NONE + + def this( + resource: RangerSparkResource, + user: String, + groups: Set[String], + opType: String, + accessType: SparkAccessType, + clusterName: String) { + this() + this.setResource(resource) + this.setUser(user) + this.setUserGroups(groups.asJava) + this.setAccessTime(new Date) + this.setAction(opType) + this.setSparkAccessType(accessType) + this.setUser(user) + this.setClusterName(clusterName) + } + + def this(resource: RangerSparkResource, user: String, groups: Set[String], + clusterName: String) = { + this(resource, user, groups, "METADATA OPERATION", SparkAccessType.USE, clusterName) + } + + def getSparkAccessType: SparkAccessType = accessType + + def setSparkAccessType(accessType: SparkAccessType): Unit = { + this.accessType = accessType + accessType match { + case SparkAccessType.USE => this.setAccessType(RangerPolicyEngine.ANY_ACCESS) + case SparkAccessType.ADMIN => this.setAccessType(RangerPolicyEngine.ADMIN_ACCESS) + case _ => this.setAccessType(accessType.toString.toLowerCase) + } + } + + def copy(): RangerSparkAccessRequest = { + val ret = new RangerSparkAccessRequest() + ret.setResource(getResource) + ret.setAccessType(getAccessType) + ret.setUser(getUser) + ret.setUserGroups(getUserGroups) + ret.setAccessTime(getAccessTime) + ret.setAction(getAction) + ret.setClientIPAddress(getClientIPAddress) + ret.setRemoteIPAddress(getRemoteIPAddress) + ret.setForwardedAddresses(getForwardedAddresses) + ret.setRequestData(getRequestData) + ret.setClientType(getClientType) + ret.setSessionId(getSessionId) + ret.setContext(RangerAccessRequestUtil.copyContext(getContext)) + ret.accessType = accessType + ret.setClusterName(getClusterName) + ret + } +} diff --git a/plugin-spark/src/main/scala/org/apache/ranger/authorization/spark/authorizer/RangerSparkAuditHandler.scala b/plugin-spark/src/main/scala/org/apache/ranger/authorization/spark/authorizer/RangerSparkAuditHandler.scala new file mode 100644 index 0000000000..9303808e1d --- /dev/null +++ b/plugin-spark/src/main/scala/org/apache/ranger/authorization/spark/authorizer/RangerSparkAuditHandler.scala @@ -0,0 +1,26 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.ranger.authorization.spark.authorizer + +import org.apache.ranger.plugin.audit.RangerDefaultAuditHandler + +class RangerSparkAuditHandler extends RangerDefaultAuditHandler { + + // TODO(Kent Yao): Implementing meaningfully audit functions + +} diff --git a/plugin-spark/src/main/scala/org/apache/ranger/authorization/spark/authorizer/RangerSparkAuthorizer.scala b/plugin-spark/src/main/scala/org/apache/ranger/authorization/spark/authorizer/RangerSparkAuthorizer.scala new file mode 100644 index 0000000000..dce419f791 --- /dev/null +++ b/plugin-spark/src/main/scala/org/apache/ranger/authorization/spark/authorizer/RangerSparkAuthorizer.scala @@ -0,0 +1,292 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.ranger.authorization.spark.authorizer + +import java.util.{List => JList} + +import org.apache.commons.lang.StringUtils +import org.apache.commons.logging.LogFactory +import org.apache.hadoop.conf.Configuration +import org.apache.hadoop.fs.permission.FsAction +import org.apache.hadoop.fs.{FileSystem, Path} +import org.apache.hadoop.hive.common.FileUtils +import org.apache.hadoop.hive.ql.security.authorization.plugin._ +import org.apache.hadoop.security.UserGroupInformation +import org.apache.ranger.authorization.spark.authorizer.SparkAccessType.SparkAccessType +import org.apache.ranger.authorization.spark.authorizer.SparkObjectType.SparkObjectType +import org.apache.ranger.authorization.spark.authorizer.SparkOperationType.SparkOperationType +import org.apache.ranger.authorization.utils.StringUtil +import org.apache.ranger.plugin.policyengine.RangerAccessRequest +import org.apache.ranger.plugin.util.RangerPerfTracer +import org.apache.spark.sql.SparkSession + +import scala.collection.JavaConverters._ +import scala.collection.mutable.ArrayBuffer + +object RangerSparkAuthorizer { + private val LOG = LogFactory.getLog(this.getClass.getSimpleName.stripSuffix("$")) + private val sparkPlugin = RangerSparkPlugin.build().getOrCreate() + + private def currentUser: UserGroupInformation = UserGroupInformation.getCurrentUser + + def checkPrivileges( + spark: SparkSession, + opType: SparkOperationType, + inputs: Seq[SparkPrivilegeObject], + outputs: Seq[SparkPrivilegeObject]): Unit = { + + val ugi = currentUser + val user = ugi.getShortUserName + val groups = ugi.getGroupNames.toSet + val auditHandler = new RangerSparkAuditHandler + val perf = if (RangerPerfTracer.isPerfTraceEnabled(PERF_SPARKAUTH_REQUEST_LOG)) { + RangerPerfTracer.getPerfTracer(PERF_SPARKAUTH_REQUEST_LOG, + "RangerSparkAuthorizer.checkPrivileges()") + } else { + null + } + try { + val requests = new ArrayBuffer[RangerSparkAccessRequest]() + if (inputs.isEmpty && opType == SparkOperationType.SHOWDATABASES) { + val resource = new 
RangerSparkResource(SparkObjectType.DATABASE, None) + requests += new RangerSparkAccessRequest(resource, user, groups, opType.toString, + SparkAccessType.USE, sparkPlugin.getClusterName) + } + + def addAccessRequest(objs: Seq[SparkPrivilegeObject], isInput: Boolean): Unit = { + objs.foreach { obj => + val resource = getSparkResource(obj, opType) + if (resource != null) { + val objectName = obj.getObjectName + val objectType = resource.getObjectType + if (objectType == SparkObjectType.URI && isPathInFSScheme(objectName)) { + val fsAction = getURIAccessType(opType) + val hadoopConf = spark.sparkContext.hadoopConfiguration + if (!canAccessURI(user, fsAction, objectName, hadoopConf)) { + throw new HiveAccessControlException(s"Permission denied: user [$user] does not" + + s" have [${fsAction.name}] privilege on [$objectName]") + } + } else { + val accessType = getAccessType(obj, opType, objectType, isInput) + if (accessType != SparkAccessType.NONE && !requests.exists( + o => o.getSparkAccessType == accessType && o.getResource == resource)) { + requests += new RangerSparkAccessRequest(resource, user, groups, opType.toString, + accessType, sparkPlugin.getClusterName) + } + } + } + } + } + + addAccessRequest(inputs, isInput = true) + addAccessRequest(outputs, isInput = false) + requests.foreach { request => + val resource = request.getResource.asInstanceOf[RangerSparkResource] + if (resource.getObjectType == SparkObjectType.COLUMN && + StringUtils.contains(resource.getColumn, ",")) { + resource.setServiceDef(sparkPlugin.getServiceDef) + val colReqs: JList[RangerAccessRequest] = resource.getColumn.split(",") + .filter(StringUtils.isNotBlank).map { c => + val colRes = new RangerSparkResource(SparkObjectType.COLUMN, + Option(resource.getDatabase), resource.getTable, c) + val colReq = request.copy() + colReq.setResource(colRes) + colReq.asInstanceOf[RangerAccessRequest] + }.toList.asJava + val colResults = sparkPlugin.isAccessAllowed(colReqs, auditHandler) + if (colResults != null) { + for (c <- colResults.asScala) { + if (c != null && !c.getIsAllowed) { + throw new SparkAccessControlException(s"Permission denied: user [$user] does not" + + s" have [${request.getSparkAccessType}] privilege on [${resource.getAsString}]") + } + } + } + } else { + val result = sparkPlugin.isAccessAllowed(request, auditHandler) + if (result != null && !result.getIsAllowed) { + throw new SparkAccessControlException(s"Permission denied: user [$user] does not" + + s" have [${request.getSparkAccessType}] privilege on [${resource.getAsString}]") + } + } + } + } finally { + // TODO(Kent Yao) add auditHandler.flush() + RangerPerfTracer.log(perf) + } + } + + def isAllowed(obj: SparkPrivilegeObject): Boolean = { + val ugi = currentUser + val user = ugi.getShortUserName + val groups = ugi.getGroupNames.toSet + createSparkResource(obj) match { + case Some(resource) => + val request = + new RangerSparkAccessRequest(resource, user, groups, sparkPlugin.getClusterName) + val result = sparkPlugin.isAccessAllowed(request) + if (result == null) { + LOG.error("Internal error: null RangerAccessResult received back from isAccessAllowed") + false + } else if (!result.getIsAllowed) { + if (LOG.isDebugEnabled) { + val path = resource.getAsString + LOG.debug(s"Permission denied: user [$user] does not have" + + s" [${request.getSparkAccessType}] privilege on [$path]. 
resource[$resource]," + + s" request[$request], result[$result]") + } + false + } else { + true + } + case _ => + LOG.error("RangerSparkResource returned by createSparkResource is null") + false + } + + } + + private val PERF_SPARKAUTH_REQUEST_LOG = RangerPerfTracer.getPerfLogger("sparkauth.request") + + def createSparkResource(privilegeObject: SparkPrivilegeObject): Option[RangerSparkResource] = { + val objectName = privilegeObject.getObjectName + val dbName = privilegeObject.getDbname + val objectType = privilegeObject.getType + objectType match { + case SparkPrivilegeObjectType.DATABASE => + Some(RangerSparkResource(SparkObjectType.DATABASE, Option(objectName))) + case SparkPrivilegeObjectType.TABLE_OR_VIEW => + Some(RangerSparkResource(SparkObjectType.DATABASE, Option(dbName), objectName)) + case _ => + LOG.warn(s"RangerSparkAuthorizer.createSparkResource: unexpected objectType: $objectType") + None + } + } + + private def getAccessType(obj: SparkPrivilegeObject, opType: SparkOperationType, + objectType: SparkObjectType, isInput: Boolean): SparkAccessType = { + objectType match { + case SparkObjectType.URI if isInput => SparkAccessType.READ + case SparkObjectType.URI => SparkAccessType.WRITE + case _ => obj.getActionType match { + case SparkPrivObjectActionType.INSERT | SparkPrivObjectActionType.INSERT_OVERWRITE => + SparkAccessType.UPDATE + case SparkPrivObjectActionType.OTHER => + import SparkOperationType._ + opType match { + case CREATEDATABASE if obj.getType == SparkPrivilegeObjectType.DATABASE => + SparkAccessType.CREATE + case CREATEFUNCTION if obj.getType == SparkPrivilegeObjectType.FUNCTION => + SparkAccessType.CREATE + case CREATETABLE | CREATEVIEW | CREATETABLE_AS_SELECT + if obj.getType == SparkPrivilegeObjectType.TABLE_OR_VIEW => + if (isInput) SparkAccessType.SELECT else SparkAccessType.CREATE + case ALTERDATABASE | ALTERTABLE_ADDCOLS | + ALTERTABLE_ADDPARTS | ALTERTABLE_DROPPARTS | + ALTERTABLE_LOCATION | ALTERTABLE_PROPERTIES | ALTERTABLE_SERDEPROPERTIES | + ALTERVIEW_RENAME | MSCK => SparkAccessType.ALTER + case DROPFUNCTION | DROPTABLE | DROPVIEW | DROPDATABASE => + SparkAccessType.DROP + case LOAD => if (isInput) SparkAccessType.SELECT else SparkAccessType.UPDATE + case QUERY | SHOW_CREATETABLE | SHOWPARTITIONS | + SHOW_TBLPROPERTIES => SparkAccessType.SELECT + case SHOWCOLUMNS | DESCTABLE => + StringUtil.toLower(RangerSparkPlugin.showColumnsOption) match { + case "show-all" => SparkAccessType.USE + case _ => SparkAccessType.SELECT + } + case SHOWDATABASES | SWITCHDATABASE | DESCDATABASE| SHOWTABLES => SparkAccessType.USE + case TRUNCATETABLE => SparkAccessType.UPDATE + case _ => SparkAccessType.NONE + } + } + } + } + + private def getObjectType( + obj: SparkPrivilegeObject, opType: SparkOperationType): SparkObjectType = { + obj.getType match { + case SparkPrivilegeObjectType.DATABASE | null => SparkObjectType.DATABASE + case SparkPrivilegeObjectType.TABLE_OR_VIEW if !StringUtil.isEmpty(obj.getColumns.asJava) => + SparkObjectType.COLUMN + case SparkPrivilegeObjectType.TABLE_OR_VIEW if opType.toString.toLowerCase.contains("view") => + SparkObjectType.VIEW + case SparkPrivilegeObjectType.TABLE_OR_VIEW => SparkObjectType.TABLE + case SparkPrivilegeObjectType.FUNCTION => SparkObjectType.FUNCTION + case SparkPrivilegeObjectType.DFS_URI => SparkObjectType.URI + case _ => SparkObjectType.NONE + } + } + + private def getSparkResource( + obj: SparkPrivilegeObject, opType: SparkOperationType): RangerSparkResource = { + import SparkObjectType._ + val objectType = 
getObjectType(obj, opType) + val resource = objectType match { + case DATABASE => RangerSparkResource(objectType, Option(obj.getDbname)) + case TABLE | VIEW | FUNCTION => + RangerSparkResource(objectType, Option(obj.getDbname), obj.getObjectName) + case COLUMN => + RangerSparkResource(objectType, Option(obj.getDbname), obj.getObjectName, + obj.getColumns.mkString(",")) + case _ => null + } + if (resource != null) resource.setServiceDef(sparkPlugin.getServiceDef) + resource + } + + private def canAccessURI( + user: String, action: FsAction, uri: String, conf: Configuration): Boolean = action match { + case FsAction.NONE => true + case _ => + try { + val filePath = new Path(uri) + val fs = FileSystem.get(filePath.toUri, conf) + val fileStat = fs.globStatus(filePath) + if (fileStat != null && fileStat.nonEmpty) fileStat.forall { file => + FileUtils.isOwnerOfFileHierarchy(fs, file, user) || + FileUtils.isActionPermittedForFileHierarchy(fs, file, user, action) + } else { + val file = FileUtils.getPathOrParentThatExists(fs, filePath) + FileUtils.checkFileAccessWithImpersonation(fs, file, action, user) + true + } + } catch { + case e: Exception => + LOG.error("Error getting permissions for " + uri, e) + false + } + } + + private def getURIAccessType(operationType: SparkOperationType): FsAction = { + import SparkOperationType._ + + operationType match { + case LOAD => FsAction.READ + case CREATEDATABASE | CREATETABLE | CREATETABLE_AS_SELECT | ALTERDATABASE | + ALTERTABLE_ADDCOLS | ALTERTABLE_RENAMECOL | ALTERTABLE_RENAMEPART | ALTERTABLE_RENAME | + ALTERTABLE_DROPPARTS | ALTERTABLE_ADDPARTS | ALTERTABLE_PROPERTIES | + ALTERTABLE_SERDEPROPERTIES | ALTERTABLE_LOCATION | QUERY => FsAction.ALL + case _ => FsAction.NONE + } + } + + private def isPathInFSScheme(objectName: String): Boolean = { + objectName.nonEmpty && sparkPlugin.fsScheme.exists(objectName.startsWith) + } +} diff --git a/plugin-spark/src/main/scala/org/apache/ranger/authorization/spark/authorizer/RangerSparkPlugin.scala b/plugin-spark/src/main/scala/org/apache/ranger/authorization/spark/authorizer/RangerSparkPlugin.scala new file mode 100644 index 0000000000..2a42e8fc9a --- /dev/null +++ b/plugin-spark/src/main/scala/org/apache/ranger/authorization/spark/authorizer/RangerSparkPlugin.scala @@ -0,0 +1,78 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.ranger.authorization.spark.authorizer + +import java.io.{File, IOException} + +import org.apache.commons.logging.LogFactory +import org.apache.hadoop.hive.ql.security.authorization.plugin.HiveAuthzSessionContext +import org.apache.hadoop.hive.ql.security.authorization.plugin.HiveAuthzSessionContext.CLIENT_TYPE +import org.apache.ranger.authorization.hadoop.config.RangerConfiguration +import org.apache.ranger.plugin.service.RangerBasePlugin + +class RangerSparkPlugin private extends RangerBasePlugin("spark", "sparkSql") { + import RangerSparkPlugin._ + + private val LOG = LogFactory.getLog(classOf[RangerSparkPlugin]) + + lazy val fsScheme: Array[String] = RangerConfiguration.getInstance() + .get("ranger.plugin.spark.urlauth.filesystem.schemes", "hdfs:,file:") + .split(",") + .map(_.trim) + + override def init(): Unit = { + super.init() + val cacheDir = new File(rangerConf.get("ranger.plugin.spark.policy.cache.dir")) + if (cacheDir.exists() && + (!cacheDir.isDirectory || !cacheDir.canRead || !cacheDir.canWrite)) { + throw new IOException("Policy cache directory already exists at" + + cacheDir.getAbsolutePath + ", but it is unavailable") + } + + if (!cacheDir.exists() && !cacheDir.mkdirs()) { + throw new IOException("Unable to create ranger policy cache directory at" + + cacheDir.getAbsolutePath) + } + LOG.info("Policy cache directory successfully set to " + cacheDir.getAbsolutePath) + } +} + +object RangerSparkPlugin { + + private val rangerConf: RangerConfiguration = RangerConfiguration.getInstance + + val showColumnsOption: String = rangerConf.get( + "xasecure.spark.describetable.showcolumns.authorization.option", "NONE") + + def build(): Builder = new Builder + + class Builder { + + @volatile private var sparkPlugin: RangerSparkPlugin = _ + + def getOrCreate(): RangerSparkPlugin = RangerSparkPlugin.synchronized { + if (sparkPlugin == null) { + sparkPlugin = new RangerSparkPlugin + sparkPlugin.init() + sparkPlugin + } else { + sparkPlugin + } + } + } +} diff --git a/plugin-spark/src/main/scala/org/apache/ranger/authorization/spark/authorizer/RangerSparkResource.scala b/plugin-spark/src/main/scala/org/apache/ranger/authorization/spark/authorizer/RangerSparkResource.scala new file mode 100644 index 0000000000..2daa6368fc --- /dev/null +++ b/plugin-spark/src/main/scala/org/apache/ranger/authorization/spark/authorizer/RangerSparkResource.scala @@ -0,0 +1,90 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.ranger.authorization.spark.authorizer + +import org.apache.ranger.authorization.spark.authorizer.SparkObjectType.SparkObjectType +import org.apache.ranger.plugin.policyengine.RangerAccessResourceImpl + +class RangerSparkResource( + objectType: SparkObjectType, + databaseOrUrl: Option[String], + tableOrUdf: String, + column: String) extends RangerAccessResourceImpl { + import SparkObjectType._ + import RangerSparkResource._ + + def this(objectType: SparkObjectType, databaseOrUrl: Option[String], tableOrUdf: String) = { + this(objectType, databaseOrUrl, tableOrUdf, null) + } + + def this(objectType: SparkObjectType, databaseOrUrl: Option[String]) = { + this(objectType, databaseOrUrl, null) + } + + objectType match { + case DATABASE => setValue(KEY_DATABASE, databaseOrUrl.getOrElse("*")) + case FUNCTION => + setValue(KEY_DATABASE, databaseOrUrl.getOrElse("")) + setValue(KEY_UDF, tableOrUdf) + case COLUMN => + setValue(KEY_DATABASE, databaseOrUrl.getOrElse("*")) + setValue(KEY_TABLE, tableOrUdf) + setValue(KEY_COLUMN, column) + case TABLE | VIEW => + setValue(KEY_DATABASE, databaseOrUrl.getOrElse("*")) + setValue(KEY_TABLE, tableOrUdf) + case URI => setValue(KEY_URL, databaseOrUrl.getOrElse("*")) + case _ => + } + + def getObjectType: SparkObjectType = objectType + + def getDatabase: String = getValue(KEY_DATABASE).asInstanceOf[String] + + def getTable: String = getValue(KEY_TABLE).asInstanceOf[String] + + def getUdf: String = getValue(KEY_UDF).asInstanceOf[String] + + def getColumn: String = getValue(KEY_COLUMN).asInstanceOf[String] + + def getUrl: String = getValue(KEY_URL).asInstanceOf[String] + +} + +object RangerSparkResource { + + def apply(objectType: SparkObjectType, databaseOrUrl: Option[String], tableOrUdf: String, + column: String): RangerSparkResource = { + new RangerSparkResource(objectType, databaseOrUrl, tableOrUdf, column) + } + + def apply(objectType: SparkObjectType, databaseOrUrl: Option[String], + tableOrUdf: String): RangerSparkResource = { + new RangerSparkResource(objectType, databaseOrUrl, tableOrUdf) + } + + def apply(objectType: SparkObjectType, databaseOrUrl: Option[String]): RangerSparkResource = { + new RangerSparkResource(objectType, databaseOrUrl) + } + + private val KEY_DATABASE = "database" + private val KEY_TABLE = "table" + private val KEY_UDF = "udf" + private val KEY_COLUMN = "column" + private val KEY_URL = "url" +} diff --git a/plugin-spark/src/main/scala/org/apache/ranger/authorization/spark/authorizer/RangerSparkSQLExtension.scala b/plugin-spark/src/main/scala/org/apache/ranger/authorization/spark/authorizer/RangerSparkSQLExtension.scala new file mode 100644 index 0000000000..5d7b2d07fa --- /dev/null +++ b/plugin-spark/src/main/scala/org/apache/ranger/authorization/spark/authorizer/RangerSparkSQLExtension.scala @@ -0,0 +1,31 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.ranger.authorization.spark.authorizer + +import org.apache.spark.sql.SparkSessionExtensions +import org.apache.spark.sql.catalyst.optimizer.{RangerSparkAuthorizerExtension, RangerSparkMaskingExtension, RangerSparkRowFilterExtension} +import org.apache.spark.sql.execution.RangerSparkPlanOmitStrategy + +class RangerSparkSQLExtension extends Extensions { + override def apply(ext: SparkSessionExtensions): Unit = { + ext.injectOptimizerRule(RangerSparkAuthorizerExtension) + ext.injectOptimizerRule(RangerSparkRowFilterExtension) + ext.injectOptimizerRule(RangerSparkMaskingExtension) + ext.injectPlannerStrategy(RangerSparkPlanOmitStrategy) + } +} diff --git a/plugin-spark/src/main/scala/org/apache/ranger/authorization/spark/authorizer/SparkAccessControlException.scala b/plugin-spark/src/main/scala/org/apache/ranger/authorization/spark/authorizer/SparkAccessControlException.scala new file mode 100644 index 0000000000..78837dfa9a --- /dev/null +++ b/plugin-spark/src/main/scala/org/apache/ranger/authorization/spark/authorizer/SparkAccessControlException.scala @@ -0,0 +1,20 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.ranger.authorization.spark.authorizer + +class SparkAccessControlException(msg: String) extends Exception(msg) diff --git a/plugin-spark/src/main/scala/org/apache/ranger/authorization/spark/authorizer/SparkAccessType.scala b/plugin-spark/src/main/scala/org/apache/ranger/authorization/spark/authorizer/SparkAccessType.scala new file mode 100644 index 0000000000..1ce31d1697 --- /dev/null +++ b/plugin-spark/src/main/scala/org/apache/ranger/authorization/spark/authorizer/SparkAccessType.scala @@ -0,0 +1,25 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.ranger.authorization.spark.authorizer + +object SparkAccessType extends Enumeration { + type SparkAccessType = Value + + val NONE, CREATE, ALTER, DROP, SELECT, UPDATE, USE, READ, WRITE, ALL, ADMIN = Value +} + diff --git a/plugin-spark/src/main/scala/org/apache/ranger/authorization/spark/authorizer/SparkObjectType.scala b/plugin-spark/src/main/scala/org/apache/ranger/authorization/spark/authorizer/SparkObjectType.scala new file mode 100644 index 0000000000..826a3d368d --- /dev/null +++ b/plugin-spark/src/main/scala/org/apache/ranger/authorization/spark/authorizer/SparkObjectType.scala @@ -0,0 +1,24 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.ranger.authorization.spark.authorizer + +object SparkObjectType extends Enumeration { + type SparkObjectType = Value + + val NONE, DATABASE, TABLE, VIEW, COLUMN, FUNCTION, URI = Value +} diff --git a/plugin-spark/src/main/scala/org/apache/ranger/authorization/spark/authorizer/SparkOperationType.scala b/plugin-spark/src/main/scala/org/apache/ranger/authorization/spark/authorizer/SparkOperationType.scala new file mode 100644 index 0000000000..b23885563d --- /dev/null +++ b/plugin-spark/src/main/scala/org/apache/ranger/authorization/spark/authorizer/SparkOperationType.scala @@ -0,0 +1,36 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.ranger.authorization.spark.authorizer + +/** + * Subset of HiveOperationTypes supported by Apache Spark. 
+ * + */ +object SparkOperationType extends Enumeration { + type SparkOperationType = Value + + val + ALTERDATABASE, ALTERTABLE_ADDCOLS, ALTERTABLE_ADDPARTS, ALTERTABLE_RENAMECOL, + ALTERTABLE_DROPPARTS, MSCK, ALTERTABLE_RENAMEPART, ALTERTABLE_RENAME, + ALTERVIEW_RENAME, ALTERTABLE_PROPERTIES, ALTERTABLE_SERDEPROPERTIES, + ALTERTABLE_LOCATION, QUERY, CREATEDATABASE, CREATETABLE_AS_SELECT, CREATEFUNCTION, CREATETABLE, + CREATEVIEW, DESCTABLE, DESCDATABASE, DESCFUNCTION, DROPDATABASE, DROPTABLE, DROPFUNCTION, LOAD, + SHOWCONF, SWITCHDATABASE, SHOW_CREATETABLE, SHOWCOLUMNS, SHOWDATABASES, SHOWFUNCTIONS, + SHOWPARTITIONS, SHOWTABLES, SHOW_TBLPROPERTIES, TRUNCATETABLE, DROPVIEW, EXPLAIN = Value + +} diff --git a/plugin-spark/src/main/scala/org/apache/ranger/authorization/spark/authorizer/SparkPrivObjectActionType.scala b/plugin-spark/src/main/scala/org/apache/ranger/authorization/spark/authorizer/SparkPrivObjectActionType.scala new file mode 100644 index 0000000000..e320439dfa --- /dev/null +++ b/plugin-spark/src/main/scala/org/apache/ranger/authorization/spark/authorizer/SparkPrivObjectActionType.scala @@ -0,0 +1,23 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.ranger.authorization.spark.authorizer + +object SparkPrivObjectActionType extends Enumeration { + type SparkPrivObjectActionType = Value + val OTHER, INSERT, INSERT_OVERWRITE = Value +} diff --git a/plugin-spark/src/main/scala/org/apache/ranger/authorization/spark/authorizer/SparkPrivilegeObject.scala b/plugin-spark/src/main/scala/org/apache/ranger/authorization/spark/authorizer/SparkPrivilegeObject.scala new file mode 100644 index 0000000000..4f67ff775a --- /dev/null +++ b/plugin-spark/src/main/scala/org/apache/ranger/authorization/spark/authorizer/SparkPrivilegeObject.scala @@ -0,0 +1,136 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.ranger.authorization.spark.authorizer + +import org.apache.ranger.authorization.spark.authorizer.SparkPrivilegeObjectType.SparkPrivilegeObjectType + +import scala.collection.JavaConverters._ +import org.apache.ranger.authorization.spark.authorizer.SparkPrivObjectActionType.SparkPrivObjectActionType + +class SparkPrivilegeObject( + private val typ: SparkPrivilegeObjectType, + private val dbname: String, + private val objectName: String, + private val partKeys: Seq[String], + private val columns: Seq[String], + private val actionType: SparkPrivObjectActionType) + extends Ordered[SparkPrivilegeObject] { + + override def compare(that: SparkPrivilegeObject): Int = { + typ compareTo that.typ match { + case 0 => + compare(dbname, that.dbname) match { + case 0 => + compare(objectName, that.objectName) match { + case 0 => + compare(partKeys, that.partKeys) match { + case 0 => compare(columns, that.columns) + case o => o + } + case o => o + } + case o => o + } + case o => o + } + } + + private def compare(o1: String, o2: String): Int = { + if (o1 != null) { + if (o2 != null) o1.compareTo(o2) else 1 + } else { + if (o2 != null) -1 else 0 + } + } + + private def compare(o1: Seq[String], o2: Seq[String]): Int = { + if (o1 != null) { + if (o2 != null) { + for ((x, y) <- o1.zip(o2)) { + val ret = compare(x, y) + if (ret != 0) { + return ret + } + } + if (o1.size > o2.size) { + 1 + } else if (o1.size < o2.size) { + -1 + } else { + 0 + } + } else { + 1 + } + } else { + if (o2 != null) { + -1 + } else { + 0 + } + } + } + + def this(typ: SparkPrivilegeObjectType, dbname: String, objectName: String, + partKeys: Seq[String], columns: Seq[String]) = + this(typ, dbname, objectName, partKeys, columns, SparkPrivObjectActionType.OTHER) + + def this(typ: SparkPrivilegeObjectType, dbname: String, objectName: String, + actionType: SparkPrivObjectActionType) = + this(typ, dbname, objectName, Nil, Nil, actionType) + + def this(typ: SparkPrivilegeObjectType, dbname: String, objectName: String) = + this(typ, dbname, objectName, SparkPrivObjectActionType.OTHER) + + def getType: SparkPrivilegeObjectType = typ + + def getDbname: String = dbname + + def getObjectName: String = objectName + + def getActionType: SparkPrivObjectActionType = actionType + + def getPartKeys: Seq[String] = partKeys + + def getColumns: Seq[String] = columns + + override def toString: String = { + val name = typ match { + case SparkPrivilegeObjectType.DATABASE => dbname + case SparkPrivilegeObjectType.TABLE_OR_VIEW => + getDbObjectName + (if (partKeys != null) partKeys.asJava.toString else "") + case SparkPrivilegeObjectType.FUNCTION => getDbObjectName + case _ => "" + } + + val at = if (actionType != null) { + actionType match { + case SparkPrivObjectActionType.INSERT | + SparkPrivObjectActionType.INSERT_OVERWRITE => ", action=" + actionType + case _ => "" + } + } else { + "" + } + "Object [type=" + typ + ", name=" + name + at + "]" + } + + private def getDbObjectName: String = { + (if (dbname == null) "" else dbname + ".") + objectName + } +} diff --git a/plugin-spark/src/main/scala/org/apache/ranger/authorization/spark/authorizer/SparkPrivilegeObjectType.scala b/plugin-spark/src/main/scala/org/apache/ranger/authorization/spark/authorizer/SparkPrivilegeObjectType.scala new file mode 100644 index 0000000000..e1b9e70df7 --- /dev/null +++ b/plugin-spark/src/main/scala/org/apache/ranger/authorization/spark/authorizer/SparkPrivilegeObjectType.scala @@ -0,0 +1,23 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under 
one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.ranger.authorization.spark.authorizer + +object SparkPrivilegeObjectType extends Enumeration { + type SparkPrivilegeObjectType = Value + val DATABASE, TABLE_OR_VIEW, FUNCTION, DFS_URI = Value +} diff --git a/plugin-spark/src/main/scala/org/apache/ranger/authorization/spark/authorizer/authorizer.scala b/plugin-spark/src/main/scala/org/apache/ranger/authorization/spark/authorizer/authorizer.scala new file mode 100644 index 0000000000..fdf0c850cc --- /dev/null +++ b/plugin-spark/src/main/scala/org/apache/ranger/authorization/spark/authorizer/authorizer.scala @@ -0,0 +1,26 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.ranger.authorization.spark + +import org.apache.spark.sql.SparkSessionExtensions + +package object authorizer { + + type Extensions = SparkSessionExtensions => Unit + +} diff --git a/plugin-spark/src/main/scala/org/apache/spark/sql/AuthzUtils.scala b/plugin-spark/src/main/scala/org/apache/spark/sql/AuthzUtils.scala new file mode 100644 index 0000000000..ae124bf75a --- /dev/null +++ b/plugin-spark/src/main/scala/org/apache/spark/sql/AuthzUtils.scala @@ -0,0 +1,45 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql + +import scala.util.{Failure, Success, Try} + +private[sql] object AuthzUtils { + + def getFieldVal(o: Any, name: String): Any = { + Try { + val field = o.getClass.getDeclaredField(name) + field.setAccessible(true) + field.get(o) + } match { + case Success(value) => value + case Failure(exception) => throw exception + } + } + + def setFieldVal(o: Any, name: String, value: Any): Unit = { + Try { + val field = o.getClass.getDeclaredField(name) + field.setAccessible(true) + field.set(o, value.asInstanceOf[AnyRef]) + } match { + case Failure(exception) => throw exception + case _ => + } + } +} diff --git a/plugin-spark/src/main/scala/org/apache/spark/sql/catalyst/optimizer/RangerSparkAuthorizerExtension.scala b/plugin-spark/src/main/scala/org/apache/spark/sql/catalyst/optimizer/RangerSparkAuthorizerExtension.scala new file mode 100644 index 0000000000..da68923a08 --- /dev/null +++ b/plugin-spark/src/main/scala/org/apache/spark/sql/catalyst/optimizer/RangerSparkAuthorizerExtension.scala @@ -0,0 +1,168 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.catalyst.optimizer + +import org.apache.commons.logging.LogFactory +import org.apache.ranger.authorization.spark.authorizer.{RangerSparkAuthorizer, SparkAccessControlException, SparkOperationType} +import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.catalyst.plans.logical.{Command, LogicalPlan} +import org.apache.spark.sql.catalyst.rules.Rule +import org.apache.spark.sql.execution.command._ +import org.apache.spark.sql.execution.datasources.{CreateTempViewUsing, InsertIntoDataSourceCommand, InsertIntoHadoopFsRelationCommand} +import org.apache.spark.sql.execution.{RangerShowDatabasesCommand, RangerShowTablesCommand} +import org.apache.spark.sql.hive.PrivilegesBuilder +import org.apache.spark.sql.hive.execution.CreateHiveTableAsSelectCommand + +/** + * An Optimizer Rule to do Hive Authorization V2 for Spark SQL. + * + * For Apache Spark 2.2.x and later + */ +case class RangerSparkAuthorizerExtension(spark: SparkSession) extends Rule[LogicalPlan] { + import SparkOperationType._ + + private val LOG = LogFactory.getLog(classOf[RangerSparkAuthorizerExtension]) + + /** + * Visit the [[LogicalPlan]] recursively to get all spark privilege objects, check the privileges + * + * If the user is authorized, then the original plan will be returned; otherwise, interrupted by + * some particular privilege exceptions. + * @param plan a spark LogicalPlan for verifying privileges + * @return a plan itself which has gone through the privilege check. 
+ */ + override def apply(plan: LogicalPlan): LogicalPlan = { + plan match { + case s: ShowTablesCommand => RangerShowTablesCommand(s) + case s: ShowDatabasesCommand => RangerShowDatabasesCommand(s) + case r: RangerShowTablesCommand => r + case r: RangerShowDatabasesCommand => r + case _ => + val operationType: SparkOperationType = toOperationType(plan) + val (in, out) = PrivilegesBuilder.build(plan) + try { + RangerSparkAuthorizer.checkPrivileges(spark, operationType, in, out) + plan + } catch { + case ace: SparkAccessControlException => + LOG.error( + s""" + |+===============================+ + ||Spark SQL Authorization Failure| + ||-------------------------------| + ||${ace.getMessage} + ||-------------------------------| + ||Spark SQL Authorization Failure| + |+===============================+ + """.stripMargin) + throw ace + } + } + } + + /** + * Mapping of [[LogicalPlan]] -> [[SparkOperationType]] + * @param plan a spark LogicalPlan + * @return + */ + private def toOperationType(plan: LogicalPlan): SparkOperationType = { + plan match { + case c: Command => c match { + case _: AlterDatabasePropertiesCommand => ALTERDATABASE + case p if p.nodeName == "AlterTableAddColumnsCommand" => ALTERTABLE_ADDCOLS + case _: AlterTableAddPartitionCommand => ALTERTABLE_ADDPARTS + case p if p.nodeName == "AlterTableChangeColumnCommand" => ALTERTABLE_RENAMECOL + case _: AlterTableDropPartitionCommand => ALTERTABLE_DROPPARTS + case _: AlterTableRecoverPartitionsCommand => MSCK + case _: AlterTableRenamePartitionCommand => ALTERTABLE_RENAMEPART + case a: AlterTableRenameCommand => if (!a.isView) ALTERTABLE_RENAME else ALTERVIEW_RENAME + case _: AlterTableSetPropertiesCommand + | _: AlterTableUnsetPropertiesCommand => ALTERTABLE_PROPERTIES + case _: AlterTableSerDePropertiesCommand => ALTERTABLE_SERDEPROPERTIES + case _: AlterTableSetLocationCommand => ALTERTABLE_LOCATION + case _: AlterViewAsCommand => QUERY + + case _: AnalyzeColumnCommand => QUERY + // case _: AnalyzeTableCommand => HiveOperation.ANALYZE_TABLE + // Hive treat AnalyzeTableCommand as QUERY, obey it. + case _: AnalyzeTableCommand => QUERY + case p if p.nodeName == "AnalyzePartitionCommand" => QUERY + + case _: CreateDatabaseCommand => CREATEDATABASE + case _: CreateDataSourceTableAsSelectCommand + | _: CreateHiveTableAsSelectCommand => CREATETABLE_AS_SELECT + case _: CreateFunctionCommand => CREATEFUNCTION + case _: CreateTableCommand + | _: CreateDataSourceTableCommand => CREATETABLE + case _: CreateTableLikeCommand => CREATETABLE + case _: CreateViewCommand + | _: CacheTableCommand + | _: CreateTempViewUsing => CREATEVIEW + + case p if p.nodeName == "DescribeColumnCommand" => DESCTABLE + case _: DescribeDatabaseCommand => DESCDATABASE + case _: DescribeFunctionCommand => DESCFUNCTION + case _: DescribeTableCommand => DESCTABLE + + case _: DropDatabaseCommand => DROPDATABASE + // Hive don't check privileges for `drop function command`, what about a unverified user + // try to drop functions. + // We treat permanent functions as tables for verifying. 
+ case d: DropFunctionCommand if !d.isTemp => DROPTABLE + case d: DropFunctionCommand if d.isTemp => DROPFUNCTION + case _: DropTableCommand => DROPTABLE + + case e: ExplainCommand => toOperationType(e.logicalPlan) + + case _: InsertIntoDataSourceCommand => QUERY + case p if p.nodeName == "InsertIntoDataSourceDirCommand" => QUERY + case _: InsertIntoHadoopFsRelationCommand => CREATETABLE_AS_SELECT + case p if p.nodeName == "InsertIntoHiveDirCommand" => QUERY + case p if p.nodeName == "InsertIntoHiveTable" => QUERY + + case _: LoadDataCommand => LOAD + + case p if p.nodeName == "SaveIntoDataSourceCommand" => QUERY + case s: SetCommand if s.kv.isEmpty || s.kv.get._2.isEmpty => SHOWCONF + case _: SetDatabaseCommand => SWITCHDATABASE + case _: ShowCreateTableCommand => SHOW_CREATETABLE + case _: ShowColumnsCommand => SHOWCOLUMNS + case _: ShowDatabasesCommand => SHOWDATABASES + case _: ShowFunctionsCommand => SHOWFUNCTIONS + case _: ShowPartitionsCommand => SHOWPARTITIONS + case _: ShowTablesCommand => SHOWTABLES + case _: ShowTablePropertiesCommand => SHOW_TBLPROPERTIES + case s: StreamingExplainCommand => + toOperationType(s.queryExecution.optimizedPlan) + + case _: TruncateTableCommand => TRUNCATETABLE + + case _: UncacheTableCommand => DROPVIEW + + // Commands that do not need build privilege goes as explain type + case _ => + // AddFileCommand + // AddJarCommand + // ... + EXPLAIN + } + case _ => QUERY + } + } + +} diff --git a/plugin-spark/src/main/scala/org/apache/spark/sql/catalyst/optimizer/RangerSparkMaskingExtension.scala b/plugin-spark/src/main/scala/org/apache/spark/sql/catalyst/optimizer/RangerSparkMaskingExtension.scala new file mode 100644 index 0000000000..b758e3bb63 --- /dev/null +++ b/plugin-spark/src/main/scala/org/apache/spark/sql/catalyst/optimizer/RangerSparkMaskingExtension.scala @@ -0,0 +1,226 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+package org.apache.spark.sql.catalyst.optimizer
+
+import org.apache.commons.lang3.StringUtils
+import org.apache.hadoop.security.UserGroupInformation
+import org.apache.ranger.authorization.spark.authorizer._
+import org.apache.ranger.plugin.model.RangerPolicy
+import org.apache.ranger.plugin.policyengine.RangerAccessResult
+import org.apache.spark.sql.SparkSession
+import org.apache.spark.sql.catalyst.catalog.{CatalogFunction, CatalogTable, HiveTableRelation}
+import org.apache.spark.sql.catalyst.expressions.{Alias, Attribute, AttributeReference, ExprId, NamedExpression, SubqueryExpression}
+import org.apache.spark.sql.catalyst.plans.logical._
+import org.apache.spark.sql.catalyst.rules.Rule
+import org.apache.spark.sql.catalyst.FunctionIdentifier
+import org.apache.spark.sql.execution.command.{CreateDataSourceTableAsSelectCommand, CreateViewCommand, InsertIntoDataSourceDirCommand}
+import org.apache.spark.sql.execution.datasources.{InsertIntoDataSourceCommand, InsertIntoHadoopFsRelationCommand, LogicalRelation, SaveIntoDataSourceCommand}
+import org.apache.spark.sql.hive.execution.{CreateHiveTableAsSelectCommand, InsertIntoHiveDirCommand, InsertIntoHiveTable}
+
+import scala.collection.JavaConverters._
+import scala.collection.mutable
+
+/**
+ * An Apache Spark [[Optimizer]] extension for column data masking.
+ */
+case class RangerSparkMaskingExtension(spark: SparkSession) extends Rule[LogicalPlan] {
+  import RangerPolicy._
+
+  // Register all built-in masking UDFs
+  Map("mask" -> "org.apache.hadoop.hive.ql.udf.generic.GenericUDFMask",
+    "mask_first_n" -> "org.apache.hadoop.hive.ql.udf.generic.GenericUDFMaskFirstN",
+    "mask_hash" -> "org.apache.hadoop.hive.ql.udf.generic.GenericUDFMaskHash",
+    "mask_last_n" -> "org.apache.hadoop.hive.ql.udf.generic.GenericUDFMaskLastN",
+    "mask_show_first_n" -> "org.apache.hadoop.hive.ql.udf.generic.GenericUDFMaskShowFirstN",
+    "mask_show_last_n" -> "org.apache.hadoop.hive.ql.udf.generic.GenericUDFMaskShowLastN")
+    .map(x => CatalogFunction(FunctionIdentifier(x._1), x._2, Seq.empty))
+    .foreach(spark.sessionState.catalog.registerFunction(_, true))
+
+  private lazy val sparkPlugin = RangerSparkPlugin.build().getOrCreate()
+  private lazy val sqlParser = spark.sessionState.sqlParser
+  private lazy val analyzer = spark.sessionState.analyzer
+  private lazy val rangerSparkOptimizer = new RangerSparkOptimizer(spark)
+
+  /**
+   * Collect transformers from Ranger data masking policies and map them to the
+   * [[LogicalPlan]] output attributes.
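+   *
+   * For instance, given a (hypothetical) policy whose mask type definition carries the
+   * transformer `mask_show_last_n({col}, 4)` for column `value` of `default.src`, the
+   * transformer is instantiated with `{col}` replaced by the column name and analyzed as
+   * {{{
+   *   SELECT mask_show_last_n(value, 4) AS value FROM default.src
+   * }}}
+   * whose projected alias is then keyed by the original attribute's [[ExprId]].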
+   *
+   * @param plan the original logical plan with an underlying catalog table
+   * @param table the catalog table
+   * @param aliases aliases found in the plan, mapped to the ids of the expressions they rename
+   * @return a map from the original expressions' ids to their masking representations
+   */
+  private def collectTransformers(
+      plan: LogicalPlan,
+      table: CatalogTable,
+      aliases: mutable.Map[Alias, ExprId]): Map[ExprId, NamedExpression] = {
+    val auditHandler = new RangerSparkAuditHandler()
+    val ugi = UserGroupInformation.getCurrentUser
+    val userName = ugi.getShortUserName
+    val groups = ugi.getGroupNames.toSet
+    try {
+      val identifier = table.identifier
+      import SparkObjectType._
+
+      val maskEnableResults = plan.output.map { expr =>
+        val resource = RangerSparkResource(COLUMN, identifier.database, identifier.table, expr.name)
+        val req = new RangerSparkAccessRequest(resource, userName, groups, COLUMN.toString,
+          SparkAccessType.SELECT, sparkPlugin.getClusterName)
+        (expr, sparkPlugin.evalDataMaskPolicies(req, auditHandler))
+      }.filter(x => isMaskEnabled(x._2))
+
+      val originMaskers = maskEnableResults.map { case (expr, result) =>
+        if (StringUtils.equalsIgnoreCase(result.getMaskType, MASK_TYPE_NULL)) {
+          val sql = s"SELECT NULL AS ${expr.name} FROM ${table.qualifiedName}"
+          val plan = analyzer.execute(sqlParser.parsePlan(sql))
+          (expr, plan)
+        } else if (StringUtils.equalsIgnoreCase(result.getMaskType, MASK_TYPE_CUSTOM)) {
+          val maskVal = result.getMaskedValue
+          if (maskVal == null) {
+            val sql = s"SELECT NULL AS ${expr.name} FROM ${table.qualifiedName}"
+            val plan = analyzer.execute(sqlParser.parsePlan(sql))
+            (expr, plan)
+          } else {
+            val sql = s"SELECT ${maskVal.replace("{col}", expr.name)} AS ${expr.name} FROM" +
+              s" ${table.qualifiedName}"
+            val plan = analyzer.execute(sqlParser.parsePlan(sql))
+            (expr, plan)
+          }
+        } else if (result.getMaskTypeDef != null) {
+          val transformer = result.getMaskTypeDef.getTransformer
+          if (StringUtils.isNotEmpty(transformer)) {
+            val trans = transformer.replace("{col}", expr.name)
+            val sql = s"SELECT $trans AS ${expr.name} FROM ${table.qualifiedName}"
+            val plan = analyzer.execute(sqlParser.parsePlan(sql))
+            (expr, plan)
+          } else {
+            (expr, null)
+          }
+        } else {
+          (expr, null)
+        }
+      }.filter(_._2 != null)
+
+      val formedMaskers: Map[ExprId, Alias] =
+        originMaskers.map { case (expr, p) => (expr, p.asInstanceOf[Project].projectList.head) }
+          .map { case (expr, attr) =>
+            val originalAlias = attr.asInstanceOf[Alias]
+            val newChild = originalAlias.child mapChildren {
+              case _: AttributeReference => expr
+              case o => o
+            }
+            val newAlias = originalAlias.copy(child = newChild)(
+              originalAlias.exprId, originalAlias.qualifier, originalAlias.explicitMetadata)
+            (expr.exprId, newAlias)
+          }.toMap
+
+      val aliasedMaskers = new mutable.HashMap[ExprId, Alias]()
+      for ((alias, id) <- aliases if formedMaskers.contains(id)) {
+        val originalAlias = formedMaskers(id)
+        val newChild = originalAlias.child mapChildren {
+          case ar: AttributeReference =>
+            ar.copy(name = alias.name)(alias.exprId, alias.qualifier)
+          case o => o
+        }
+        val newAlias = originalAlias.copy(child = newChild, alias.name)(
+          originalAlias.exprId, originalAlias.qualifier, originalAlias.explicitMetadata)
+        aliasedMaskers.put(alias.exprId, newAlias)
+      }
+      formedMaskers ++ aliasedMaskers
+    } catch {
+      case e: Exception => throw e
+    }
+  }
+
+  private def isMaskEnabled(result: RangerAccessResult): Boolean = {
+    result != null && result.isMaskEnabled
+  }
+
+  private def hasCatalogTable(plan: LogicalPlan): Boolean = plan match {
+    case _: HiveTableRelation => true
+    case l: LogicalRelation
if l.catalogTable.isDefined => true + case _ => false + } + + private def collectAllAliases(plan: LogicalPlan): mutable.HashMap[Alias, ExprId] = { + val aliases = new mutable.HashMap[Alias, ExprId]() + plan.transformAllExpressions { + case a: Alias => + a.child match { + case ne: NamedExpression => + aliases.put(a, ne.exprId) + case _ => + } + a + } + aliases + } + + private def collectAllTransformers( + plan: LogicalPlan, aliases: mutable.Map[Alias, ExprId]): Map[ExprId, NamedExpression] = { + plan.collectLeaves().flatMap { + case h: HiveTableRelation => + collectTransformers(h, h.tableMeta, aliases) + case l: LogicalRelation if l.catalogTable.isDefined => + collectTransformers(l, l.catalogTable.get, aliases) + case _ => Seq.empty + }.toMap + } + + private def doMasking(plan: LogicalPlan): LogicalPlan = plan match { + case s: Subquery => s + case m: RangerSparkMasking => m // escape the optimize iteration if already masked + case fixed if fixed.find(_.isInstanceOf[RangerSparkMasking]).nonEmpty => fixed + case _ => + val aliases = collectAllAliases(plan) + val transformers = collectAllTransformers(plan, aliases) + val newPlan = + if (transformers.nonEmpty && plan.output.exists(o => transformers.get(o.exprId).nonEmpty)) { + val newOutput = plan.output.map(attr => transformers.getOrElse(attr.exprId, attr)) + Project(newOutput, plan) + } else { + plan + } + + val marked = newPlan transformUp { + case p if hasCatalogTable(p) => RangerSparkMasking(p) + } + + marked transformAllExpressions { + case s: SubqueryExpression => + val Subquery(newPlan) = + rangerSparkOptimizer.execute(Subquery(RangerSparkMasking(s.plan))) + s.withNewPlan(newPlan) + } + } + + override def apply(plan: LogicalPlan): LogicalPlan = plan match { + case c: Command => c match { + case c: CreateDataSourceTableAsSelectCommand => c.copy(query = doMasking(c.query)) + case c: CreateHiveTableAsSelectCommand => c.copy(query = doMasking(c.query)) + case c: CreateViewCommand => c.copy(child = doMasking(c.child)) + case i: InsertIntoDataSourceCommand => i.copy(query = doMasking(i.query)) + case i: InsertIntoDataSourceDirCommand => i.copy(query = doMasking(i.query)) + case i: InsertIntoHadoopFsRelationCommand => i.copy(query = doMasking(i.query)) + case i: InsertIntoHiveDirCommand => i.copy(query = doMasking(i.query)) + case i: InsertIntoHiveTable => i.copy(query = doMasking(i.query)) + case s: SaveIntoDataSourceCommand => s.copy(query = doMasking(s.query)) + case cmd => cmd + } + case other => doMasking(other) + } +} diff --git a/plugin-spark/src/main/scala/org/apache/spark/sql/catalyst/optimizer/RangerSparkOptimizer.scala b/plugin-spark/src/main/scala/org/apache/spark/sql/catalyst/optimizer/RangerSparkOptimizer.scala new file mode 100644 index 0000000000..18a7e1a4b7 --- /dev/null +++ b/plugin-spark/src/main/scala/org/apache/spark/sql/catalyst/optimizer/RangerSparkOptimizer.scala @@ -0,0 +1,37 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.catalyst.optimizer
+
+import org.apache.spark.sql.SparkSession
+import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
+import org.apache.spark.sql.catalyst.rules.RuleExecutor
+
+/**
+ * An Optimizer that reuses the session's optimizer batches but drops all extended operator
+ * optimization rules, i.e. those injected through `spark.sql.extensions`.
+ */
+class RangerSparkOptimizer(spark: SparkSession) extends RuleExecutor[LogicalPlan] {
+
+  override def batches: Seq[Batch] = {
+    val optimizer = spark.sessionState.optimizer
+    val extRules = optimizer.extendedOperatorOptimizationRules
+    optimizer.batches.map { batch =>
+      val ruleSet = batch.rules.toSet -- extRules
+      Batch(batch.name, FixedPoint(batch.strategy.maxIterations), ruleSet.toSeq: _*)
+    }
+  }
+}
diff --git a/plugin-spark/src/main/scala/org/apache/spark/sql/catalyst/optimizer/RangerSparkRowFilterExtension.scala b/plugin-spark/src/main/scala/org/apache/spark/sql/catalyst/optimizer/RangerSparkRowFilterExtension.scala
new file mode 100644
index 0000000000..7edc9455d0
--- /dev/null
+++ b/plugin-spark/src/main/scala/org/apache/spark/sql/catalyst/optimizer/RangerSparkRowFilterExtension.scala
@@ -0,0 +1,127 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.catalyst.optimizer
+
+import org.apache.commons.lang.StringUtils
+import org.apache.hadoop.security.UserGroupInformation
+import org.apache.ranger.authorization.spark.authorizer._
+import org.apache.ranger.plugin.policyengine.RangerAccessResult
+import org.apache.spark.sql.AuthzUtils.getFieldVal
+import org.apache.spark.sql.SparkSession
+import org.apache.spark.sql.catalyst.catalog.CatalogTable
+import org.apache.spark.sql.catalyst.expressions.SubqueryExpression
+import org.apache.spark.sql.catalyst.plans.logical._
+import org.apache.spark.sql.catalyst.rules.Rule
+import org.apache.spark.sql.execution.command.{CreateDataSourceTableAsSelectCommand, CreateViewCommand, InsertIntoDataSourceDirCommand}
+import org.apache.spark.sql.execution.datasources.{InsertIntoDataSourceCommand, InsertIntoHadoopFsRelationCommand, LogicalRelation, SaveIntoDataSourceCommand}
+import org.apache.spark.sql.hive.execution.{CreateHiveTableAsSelectCommand, InsertIntoHiveDirCommand, InsertIntoHiveTable}
+
+import scala.collection.JavaConverters._
+
+/**
+ * An Apache Spark [[Optimizer]] extension for row-level filtering.
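+ *
+ * For example, given a row filter policy with the filter expression `key < 20` on table
+ * `default.src`, here assumed to have columns `key` and `value` (as in the test policies
+ * shipped with this plugin), a scan of `default.src` is rewritten as if the query were
+ * {{{
+ *   SELECT key, value FROM default.src WHERE key < 20
+ * }}}
+ * with the original relation substituted back in and the result wrapped in a
+ * [[RangerSparkRowFilter]] marker so the rule is not applied twice.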
+ */
+case class RangerSparkRowFilterExtension(spark: SparkSession) extends Rule[LogicalPlan] {
+  private lazy val sparkPlugin = RangerSparkPlugin.build().getOrCreate()
+  private lazy val rangerSparkOptimizer = new RangerSparkOptimizer(spark)
+
+  /**
+   * Transform a Relation to a parsed [[LogicalPlan]] with specified row filter expressions
+   * @param plan the original [[LogicalPlan]]
+   * @param table a Spark [[CatalogTable]] representation
+   * @return A new Spark [[LogicalPlan]] with specified row filter expressions
+   */
+  private def applyingRowFilterExpr(plan: LogicalPlan, table: CatalogTable): LogicalPlan = {
+    val auditHandler = new RangerSparkAuditHandler()
+    try {
+      val identifier = table.identifier
+      val resource =
+        RangerSparkResource(SparkObjectType.TABLE, identifier.database, identifier.table)
+      val ugi = UserGroupInformation.getCurrentUser
+      val request = new RangerSparkAccessRequest(resource, ugi.getShortUserName,
+        ugi.getGroupNames.toSet, SparkObjectType.TABLE.toString, SparkAccessType.SELECT,
+        sparkPlugin.getClusterName)
+      val result = sparkPlugin.evalRowFilterPolicies(request, auditHandler)
+      if (isRowFilterEnabled(result)) {
+        val sql = s"select ${plan.output.map(_.name).mkString(",")} from ${table.qualifiedName}" +
+          s" where ${result.getFilterExpr}"
+        val parsed = spark.sessionState.sqlParser.parsePlan(sql)
+
+        val parsedNew = parsed transform {
+          case Filter(condition, child) if !child.fastEquals(plan) => Filter(condition, plan)
+        }
+        val analyzed = spark.sessionState.analyzer.execute(parsedNew)
+        val optimized = analyzed transformAllExpressions {
+          case s: SubqueryExpression =>
+            val Subquery(newPlan) =
+              rangerSparkOptimizer.execute(Subquery(RangerSparkRowFilter(s.plan)))
+            s.withNewPlan(newPlan)
+        }
+        RangerSparkRowFilter(optimized)
+      } else {
+        RangerSparkRowFilter(plan)
+      }
+    } catch {
+      case e: Exception => throw e
+    }
+  }
+
+  private def isRowFilterEnabled(result: RangerAccessResult): Boolean = {
+    result != null && result.isRowFilterEnabled && StringUtils.isNotEmpty(result.getFilterExpr)
+  }
+
+  private def doFiltering(plan: LogicalPlan): LogicalPlan = plan match {
+    case rf: RangerSparkRowFilter => rf
+    case fixed if fixed.find(_.isInstanceOf[RangerSparkRowFilter]).nonEmpty => fixed
+    case _ =>
+      val plansWithTables = plan.collectLeaves().map {
+        case h if h.nodeName == "HiveTableRelation" =>
+          (h, getFieldVal(h, "tableMeta").asInstanceOf[CatalogTable])
+        case m if m.nodeName == "MetastoreRelation" =>
+          (m, getFieldVal(m, "catalogTable").asInstanceOf[CatalogTable])
+        case l: LogicalRelation if l.catalogTable.isDefined =>
+          (l, l.catalogTable.get)
+        case _ => null
+      }.filter(_ != null).map(lt => (lt._1, applyingRowFilterExpr(lt._1, lt._2))).toMap
+
+      plan transformUp {
+        case p => plansWithTables.getOrElse(p, p)
+      }
+  }
+
+  /**
+   * Transform a Spark logical plan to another plan with the row filter expressions
+   * @param plan the original [[LogicalPlan]]
+   * @return the logical plan with row filter expressions applied
+   */
+  override def apply(plan: LogicalPlan): LogicalPlan = plan match {
+    case c: Command => c match {
+      case c: CreateDataSourceTableAsSelectCommand => c.copy(query = doFiltering(c.query))
+      case c: CreateHiveTableAsSelectCommand => c.copy(query = doFiltering(c.query))
+      case c: CreateViewCommand => c.copy(child = doFiltering(c.child))
+      case i: InsertIntoDataSourceCommand => i.copy(query = doFiltering(i.query))
+      case i: InsertIntoDataSourceDirCommand => i.copy(query = doFiltering(i.query))
+      case i: InsertIntoHadoopFsRelationCommand =>
i.copy(query = doFiltering(i.query)) + case i: InsertIntoHiveDirCommand => i.copy(query = doFiltering(i.query)) + case i: InsertIntoHiveTable => i.copy(query = doFiltering(i.query)) + case s: SaveIntoDataSourceCommand => s.copy(query = doFiltering(s.query)) + case cmd => cmd + } + case other => doFiltering(other) + } +} diff --git a/plugin-spark/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/RangerSparkMasking.scala b/plugin-spark/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/RangerSparkMasking.scala new file mode 100644 index 0000000000..8ec1e2bb2b --- /dev/null +++ b/plugin-spark/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/RangerSparkMasking.scala @@ -0,0 +1,27 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.catalyst.plans.logical + +import org.apache.spark.sql.catalyst.expressions.Attribute + +/** + * A marker [[LogicalPlan]] for column data masking + */ +case class RangerSparkMasking(child: LogicalPlan) extends UnaryNode { + override def output: Seq[Attribute] = child.output +} diff --git a/plugin-spark/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/RangerSparkRowFilter.scala b/plugin-spark/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/RangerSparkRowFilter.scala new file mode 100644 index 0000000000..bbc9324e4c --- /dev/null +++ b/plugin-spark/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/RangerSparkRowFilter.scala @@ -0,0 +1,28 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.catalyst.plans.logical +import org.apache.spark.sql.catalyst.expressions.Attribute + +/** + * A wrapper for a transformed plan with row level filter applied, which will be removed during + * LogicalPlan -> PhysicalPlan + * + */ +case class RangerSparkRowFilter(child: LogicalPlan) extends UnaryNode { + override def output: Seq[Attribute] = child.output +} diff --git a/plugin-spark/src/main/scala/org/apache/spark/sql/execution/RangerShowDatabasesCommand.scala b/plugin-spark/src/main/scala/org/apache/spark/sql/execution/RangerShowDatabasesCommand.scala new file mode 100644 index 0000000000..1cde50b3e7 --- /dev/null +++ b/plugin-spark/src/main/scala/org/apache/spark/sql/execution/RangerShowDatabasesCommand.scala @@ -0,0 +1,38 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution + +import org.apache.ranger.authorization.spark.authorizer.{RangerSparkAuthorizer, SparkPrivilegeObject, SparkPrivilegeObjectType} +import org.apache.spark.sql.execution.command.{RunnableCommand, ShowDatabasesCommand} +import org.apache.spark.sql.{Row, SparkSession} + +case class RangerShowDatabasesCommand(child: ShowDatabasesCommand) extends RunnableCommand { + override val output = child.output + + override def run(sparkSession: SparkSession): Seq[Row] = { + val rows = child.run(sparkSession) + rows.filter(r => RangerSparkAuthorizer.isAllowed(toSparkPrivilegeObject(r))) + } + + private def toSparkPrivilegeObject(row: Row): SparkPrivilegeObject = { + val database = row.getString(0) + new SparkPrivilegeObject(SparkPrivilegeObjectType.DATABASE, database, database) + } + + +} diff --git a/plugin-spark/src/main/scala/org/apache/spark/sql/execution/RangerShowTablesCommand.scala b/plugin-spark/src/main/scala/org/apache/spark/sql/execution/RangerShowTablesCommand.scala new file mode 100644 index 0000000000..1c31fe32e4 --- /dev/null +++ b/plugin-spark/src/main/scala/org/apache/spark/sql/execution/RangerShowTablesCommand.scala @@ -0,0 +1,38 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.execution
+
+import org.apache.ranger.authorization.spark.authorizer.{RangerSparkAuthorizer, SparkPrivilegeObject, SparkPrivilegeObjectType}
+import org.apache.spark.sql.execution.command.{RunnableCommand, ShowTablesCommand}
+import org.apache.spark.sql.{Row, SparkSession}
+import org.apache.spark.sql.catalyst.expressions.Attribute
+
+case class RangerShowTablesCommand(child: ShowTablesCommand) extends RunnableCommand {
+
+  override val output: Seq[Attribute] = child.output
+  override def run(sparkSession: SparkSession): Seq[Row] = {
+    val rows = child.run(sparkSession)
+    rows.filter(r => RangerSparkAuthorizer.isAllowed(toSparkPrivilegeObject(r)))
+  }
+
+  private def toSparkPrivilegeObject(row: Row): SparkPrivilegeObject = {
+    val database = row.getString(0)
+    val table = row.getString(1)
+    new SparkPrivilegeObject(SparkPrivilegeObjectType.TABLE_OR_VIEW, database, table)
+  }
+}
diff --git a/plugin-spark/src/main/scala/org/apache/spark/sql/execution/RangerSparkPlanOmitStrategy.scala b/plugin-spark/src/main/scala/org/apache/spark/sql/execution/RangerSparkPlanOmitStrategy.scala
new file mode 100644
index 0000000000..4483b09953
--- /dev/null
+++ b/plugin-spark/src/main/scala/org/apache/spark/sql/execution/RangerSparkPlanOmitStrategy.scala
@@ -0,0 +1,33 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.execution
+
+import org.apache.spark.sql.{SparkSession, Strategy}
+import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, RangerSparkMasking, RangerSparkRowFilter}
+
+/**
+ * An Apache Spark [[Strategy]] extension that omits the marker nodes used for row-level
+ * filtering and data masking during physical planning.
+ */
+case class RangerSparkPlanOmitStrategy(spark: SparkSession) extends Strategy {
+  override def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match {
+    case RangerSparkRowFilter(child) => planLater(child) :: Nil
+    case RangerSparkMasking(child) => planLater(child) :: Nil
+    case _ => Nil
+  }
+}
diff --git a/plugin-spark/src/main/scala/org/apache/spark/sql/hive/PrivilegesBuilder.scala b/plugin-spark/src/main/scala/org/apache/spark/sql/hive/PrivilegesBuilder.scala
new file mode 100644
index 0000000000..fc20a24fc5
--- /dev/null
+++ b/plugin-spark/src/main/scala/org/apache/spark/sql/hive/PrivilegesBuilder.scala
@@ -0,0 +1,459 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.hive
+
+
+import org.apache.ranger.authorization.spark.authorizer.{SparkPrivilegeObject, SparkPrivilegeObjectType, SparkPrivObjectActionType}
+import org.apache.ranger.authorization.spark.authorizer.SparkPrivObjectActionType.SparkPrivObjectActionType
+import org.apache.spark.sql.SaveMode
+import org.apache.spark.sql.catalyst.TableIdentifier
+import org.apache.spark.sql.catalyst.analysis.UnresolvedRelation
+import org.apache.spark.sql.catalyst.catalog.CatalogTable
+import org.apache.spark.sql.catalyst.expressions.NamedExpression
+import org.apache.spark.sql.catalyst.plans.logical._
+import org.apache.spark.sql.execution.command._
+import org.apache.spark.sql.execution.datasources._
+import org.apache.spark.sql.AuthzUtils._
+import org.apache.spark.sql.hive.execution.CreateHiveTableAsSelectCommand
+import org.apache.spark.sql.types.StructField
+
+import scala.collection.mutable.ArrayBuffer
+
+/**
+ * [[LogicalPlan]] -> list of [[SparkPrivilegeObject]]s
+ */
+private[sql] object PrivilegesBuilder {
+
+  /**
+   * Build input and output privilege objects from a Spark [[LogicalPlan]].
+   *
+   * For [[ExplainCommand]]s, build privileges from the wrapped plan.
+   * For [[RunnableCommand]]s, build outputs if the command has a target to write to, and build
+   * inputs for the inner query if one exists.
+   *
+   * For other queries, build inputs.
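+   *
+   * For example, for a statement such as
+   * {{{
+   *   INSERT INTO TABLE default.t1 SELECT key FROM default.src
+   * }}}
+   * (with `default.t1` an arbitrary target table), the inputs contain a TABLE_OR_VIEW object
+   * for `default.src` restricted to column `key`, and the outputs contain a TABLE_OR_VIEW
+   * object for `default.t1`.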
+ * + * @param plan A Spark [[LogicalPlan]] + */ + def build(plan: LogicalPlan): (Seq[SparkPrivilegeObject], Seq[SparkPrivilegeObject]) = { + + def doBuild(plan: LogicalPlan): (Seq[SparkPrivilegeObject], Seq[SparkPrivilegeObject]) = { + val inputObjs = new ArrayBuffer[SparkPrivilegeObject] + val outputObjs = new ArrayBuffer[SparkPrivilegeObject] + plan match { + // RunnableCommand + case cmd: Command => buildCommand(cmd, inputObjs, outputObjs) + // Queries + case _ => buildQuery(plan, inputObjs) + } + (inputObjs, outputObjs) + } + + plan match { + case e: ExplainCommand => doBuild(e.logicalPlan) + case p => doBuild(p) + } + } + + /** + * Build SparkPrivilegeObjects from Spark LogicalPlan + * @param plan a Spark LogicalPlan used to generate SparkPrivilegeObjects + * @param privilegeObjects input or output spark privilege object list + * @param projectionList Projection list after pruning + */ + private def buildQuery( + plan: LogicalPlan, + privilegeObjects: ArrayBuffer[SparkPrivilegeObject], + projectionList: Seq[NamedExpression] = Nil): Unit = { + + /** + * Columns in Projection take priority for column level privilege checking + * @param table catalogTable of a given relation + */ + def mergeProjection(table: CatalogTable): Unit = { + if (projectionList.isEmpty) { + addTableOrViewLevelObjs( + table.identifier, + privilegeObjects, + table.partitionColumnNames, + table.schema.fieldNames) + } else { + addTableOrViewLevelObjs( + table.identifier, + privilegeObjects, + table.partitionColumnNames.filter(projectionList.map(_.name).contains(_)), + projectionList.map(_.name)) + } + } + + plan match { + case p: Project => buildQuery(p.child, privilegeObjects, p.projectList) + + case h if h.nodeName == "HiveTableRelation" => + mergeProjection(getFieldVal(h, "tableMeta").asInstanceOf[CatalogTable]) + + case m if m.nodeName == "MetastoreRelation" => + mergeProjection(getFieldVal(m, "catalogTable").asInstanceOf[CatalogTable]) + + case l: LogicalRelation if l.catalogTable.nonEmpty => mergeProjection(l.catalogTable.get) + + case u: UnresolvedRelation => + // Normally, we shouldn't meet UnresolvedRelation here in an optimized plan. + // Unfortunately, the real world is always a place where miracles happen. + // We check the privileges directly without resolving the plan and leave everything + // to spark to do. 
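+        // Only a table/view level object can be derived from an unresolved identifier,
+        // so no column level pruning applies in this branch.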
+ addTableOrViewLevelObjs(u.tableIdentifier, privilegeObjects) + + case p => + for (child <- p.children) { + buildQuery(child, privilegeObjects, projectionList) + } + } + } + + /** + * Build SparkPrivilegeObjects from Spark LogicalPlan + * @param plan a Spark LogicalPlan used to generate SparkPrivilegeObjects + * @param inputObjs input spark privilege object list + * @param outputObjs output spark privilege object list + */ + private def buildCommand( + plan: LogicalPlan, + inputObjs: ArrayBuffer[SparkPrivilegeObject], + outputObjs: ArrayBuffer[SparkPrivilegeObject]): Unit = { + plan match { + case a: AlterDatabasePropertiesCommand => addDbLevelObjs(a.databaseName, outputObjs) + + case a if a.nodeName == "AlterTableAddColumnsCommand" => + addTableOrViewLevelObjs( + getFieldVal(a, "table").asInstanceOf[TableIdentifier], + inputObjs, + columns = getFieldVal(a, "colsToAdd").asInstanceOf[Seq[StructField]].map(_.name)) + addTableOrViewLevelObjs( + getFieldVal(a, "table").asInstanceOf[TableIdentifier], + outputObjs, + columns = getFieldVal(a, "colsToAdd").asInstanceOf[Seq[StructField]].map(_.name)) + + case a: AlterTableAddPartitionCommand => + addTableOrViewLevelObjs(a.tableName, inputObjs) + addTableOrViewLevelObjs(a.tableName, outputObjs) + + case a if a.nodeName == "AlterTableChangeColumnCommand" => + addTableOrViewLevelObjs( + getFieldVal(a, "tableName").asInstanceOf[TableIdentifier], + inputObjs, + columns = Seq(getFieldVal(a, "columnName").asInstanceOf[String])) + + case a: AlterTableDropPartitionCommand => + addTableOrViewLevelObjs(a.tableName, inputObjs) + addTableOrViewLevelObjs(a.tableName, outputObjs) + + case a: AlterTableRecoverPartitionsCommand => + addTableOrViewLevelObjs(a.tableName, inputObjs) + addTableOrViewLevelObjs(a.tableName, outputObjs) + + case a: AlterTableRenameCommand if !a.isView || a.oldName.database.nonEmpty => + // rename tables / permanent views + addTableOrViewLevelObjs(a.oldName, inputObjs) + addTableOrViewLevelObjs(a.newName, outputObjs) + + case a: AlterTableRenamePartitionCommand => + addTableOrViewLevelObjs(a.tableName, inputObjs) + addTableOrViewLevelObjs(a.tableName, outputObjs) + + case a: AlterTableSerDePropertiesCommand => + addTableOrViewLevelObjs(a.tableName, inputObjs) + addTableOrViewLevelObjs(a.tableName, outputObjs) + + case a: AlterTableSetLocationCommand => + addTableOrViewLevelObjs(a.tableName, inputObjs) + addTableOrViewLevelObjs(a.tableName, outputObjs) + + case a: AlterTableSetPropertiesCommand => + addTableOrViewLevelObjs(a.tableName, inputObjs) + addTableOrViewLevelObjs(a.tableName, outputObjs) + + case a: AlterTableUnsetPropertiesCommand => + addTableOrViewLevelObjs(a.tableName, inputObjs) + addTableOrViewLevelObjs(a.tableName, outputObjs) + + case a: AlterViewAsCommand => + if (a.name.database.nonEmpty) { + // it's a permanent view + addTableOrViewLevelObjs(a.name, outputObjs) + } + buildQuery(a.query, inputObjs) + + case a: AnalyzeColumnCommand => + addTableOrViewLevelObjs( + a.tableIdent, inputObjs, columns = a.columnNames) + addTableOrViewLevelObjs( + a.tableIdent, outputObjs, columns = a.columnNames) + + case a if a.nodeName == "AnalyzePartitionCommand" => + addTableOrViewLevelObjs( + getFieldVal(a, "tableIdent").asInstanceOf[TableIdentifier], inputObjs) + addTableOrViewLevelObjs( + getFieldVal(a, "tableIdent").asInstanceOf[TableIdentifier], outputObjs) + + case a: AnalyzeTableCommand => + addTableOrViewLevelObjs(a.tableIdent, inputObjs, columns = Seq("RAW__DATA__SIZE")) + addTableOrViewLevelObjs(a.tableIdent, outputObjs) + + case 
c: CacheTableCommand => c.plan.foreach {
+        buildQuery(_, inputObjs)
+      }
+
+      case c: CreateDatabaseCommand => addDbLevelObjs(c.databaseName, outputObjs)
+
+      case c: CreateDataSourceTableAsSelectCommand =>
+        addDbLevelObjs(c.table.identifier, outputObjs)
+        addTableOrViewLevelObjs(c.table.identifier, outputObjs, mode = c.mode)
+        buildQuery(c.query, inputObjs)
+
+      case c: CreateDataSourceTableCommand =>
+        addTableOrViewLevelObjs(c.table.identifier, outputObjs)
+
+      case c: CreateFunctionCommand if !c.isTemp =>
+        addDbLevelObjs(c.databaseName, outputObjs)
+        addFunctionLevelObjs(c.databaseName, c.functionName, outputObjs)
+
+      case c: CreateHiveTableAsSelectCommand =>
+        addDbLevelObjs(c.tableDesc.identifier, outputObjs)
+        addTableOrViewLevelObjs(c.tableDesc.identifier, outputObjs)
+        buildQuery(c.query, inputObjs)
+
+      case c: CreateTableCommand => addTableOrViewLevelObjs(c.table.identifier, outputObjs)
+
+      case c: CreateTableLikeCommand =>
+        addDbLevelObjs(c.targetTable, outputObjs)
+        addTableOrViewLevelObjs(c.targetTable, outputObjs)
+        // Hive doesn't check the source table's privileges here; we deliberately diverge,
+        // because skipping the check would leak meta information about the source table
+        addDbLevelObjs(c.sourceTable, inputObjs)
+        addTableOrViewLevelObjs(c.sourceTable, inputObjs)
+
+      case c: CreateViewCommand =>
+        c.viewType match {
+          case PersistedView =>
+            // PersistedView will be tied to a database
+            addDbLevelObjs(c.name, outputObjs)
+            addTableOrViewLevelObjs(c.name, outputObjs)
+          case _ =>
+        }
+        buildQuery(c.child, inputObjs)
+
+      case d if d.nodeName == "DescribeColumnCommand" =>
+        addTableOrViewLevelObjs(
+          getFieldVal(d, "table").asInstanceOf[TableIdentifier],
+          inputObjs,
+          columns = getFieldVal(d, "colNameParts").asInstanceOf[Seq[String]])
+
+      case d: DescribeDatabaseCommand =>
+        addDbLevelObjs(d.databaseName, inputObjs)
+
+      case d: DescribeFunctionCommand =>
+        addFunctionLevelObjs(d.functionName.database, d.functionName.funcName, inputObjs)
+
+      case d: DescribeTableCommand => addTableOrViewLevelObjs(d.table, inputObjs)
+
+      case d: DropDatabaseCommand =>
+        // outputObjs are enough for the privilege check; inputObjs are added for consistency
+        // with Hive behaviour in case of unexpected issues.
+        addDbLevelObjs(d.databaseName, inputObjs)
+        addDbLevelObjs(d.databaseName, outputObjs)
+
+      case d: DropFunctionCommand =>
+        addFunctionLevelObjs(d.databaseName, d.functionName, outputObjs)
+
+      case d: DropTableCommand => addTableOrViewLevelObjs(d.tableName, outputObjs)
+
+      case i: InsertIntoDataSourceCommand =>
+        i.logicalRelation.catalogTable.foreach { table =>
+          addTableOrViewLevelObjs(
+            table.identifier,
+            outputObjs)
+        }
+        buildQuery(i.query, inputObjs)
+
+      case i if i.nodeName == "InsertIntoDataSourceDirCommand" =>
+        buildQuery(getFieldVal(i, "query").asInstanceOf[LogicalPlan], inputObjs)
+
+      case i: InsertIntoHadoopFsRelationCommand =>
+        // We could obtain the overwrite mode here, but CTAS for Hive tables with text/orc
+        // format, and for parquet with spark.sql.hive.convertMetastoreParquet=false, can pass
+        // privilege checking without claiming the UPDATE privilege on the target table,
+        // which seems to match Hive behaviour.
+        // So, here we ignore the overwrite mode for consistency.
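+        // Since no SaveMode is passed below, toActionType falls back to
+        // SparkPrivObjectActionType.OTHER for this output object.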
+ i.catalogTable foreach { t => + addTableOrViewLevelObjs( + t.identifier, + outputObjs, + i.partitionColumns.map(_.name), + t.schema.fieldNames) + } + buildQuery(i.query, inputObjs) + + case i if i.nodeName == "InsertIntoHiveDirCommand" => + buildQuery(getFieldVal(i, "query").asInstanceOf[LogicalPlan], inputObjs) + + case i if i.nodeName == "InsertIntoHiveTable" => + addTableOrViewLevelObjs( + getFieldVal(i, "table").asInstanceOf[CatalogTable].identifier, outputObjs) + buildQuery(getFieldVal(i, "query").asInstanceOf[LogicalPlan], inputObjs) + + case l: LoadDataCommand => + addTableOrViewLevelObjs(l.table, outputObjs) + if (!l.isLocal) { + inputObjs += new SparkPrivilegeObject(SparkPrivilegeObjectType.DFS_URI, l.path, l.path) + } + + case s if s.nodeName == "SaveIntoDataSourceCommand" => + buildQuery(getFieldVal(s, "query").asInstanceOf[LogicalPlan], outputObjs) + + case s: SetDatabaseCommand => addDbLevelObjs(s.databaseName, inputObjs) + + case s: ShowColumnsCommand => addTableOrViewLevelObjs(s.tableName, inputObjs) + + case s: ShowCreateTableCommand => addTableOrViewLevelObjs(s.table, inputObjs) + + case s: ShowFunctionsCommand => s.db.foreach(addDbLevelObjs(_, inputObjs)) + + case s: ShowPartitionsCommand => addTableOrViewLevelObjs(s.tableName, inputObjs) + + case s: ShowTablePropertiesCommand => addTableOrViewLevelObjs(s.table, inputObjs) + + case s: ShowTablesCommand => addDbLevelObjs(s.databaseName, inputObjs) + + case s: TruncateTableCommand => addTableOrViewLevelObjs(s.tableName, outputObjs) + + case _ => + // AddFileCommand + // AddJarCommand + // AnalyzeColumnCommand + // ClearCacheCommand + // CreateTempViewUsing + // ListFilesCommand + // ListJarsCommand + // RefreshTable + // RefreshTable + // ResetCommand + // SetCommand + // ShowDatabasesCommand + // StreamingExplainCommand + // UncacheTableCommand + } + } + + /** + * Add database level spark privilege objects to input or output list + * @param dbName database name as spark privilege object + * @param privilegeObjects input or output list + */ + private def addDbLevelObjs( + dbName: String, + privilegeObjects: ArrayBuffer[SparkPrivilegeObject]): Unit = { + privilegeObjects += new SparkPrivilegeObject(SparkPrivilegeObjectType.DATABASE, dbName, dbName) + } + + /** + * Add database level spark privilege objects to input or output list + * @param dbOption an option of database name as spark privilege object + * @param privilegeObjects input or output spark privilege object list + */ + private def addDbLevelObjs( + dbOption: Option[String], + privilegeObjects: ArrayBuffer[SparkPrivilegeObject]): Unit = { + dbOption match { + case Some(db) => + privilegeObjects += new SparkPrivilegeObject(SparkPrivilegeObjectType.DATABASE, db, db) + case _ => + } + } + + /** + * Add database level spark privilege objects to input or output list + * @param identifier table identifier contains database name as hive privilege object + * @param privilegeObjects input or output spark privilege object list + */ + private def addDbLevelObjs( + identifier: TableIdentifier, + privilegeObjects: ArrayBuffer[SparkPrivilegeObject]): Unit = { + identifier.database match { + case Some(db) => + privilegeObjects += new SparkPrivilegeObject(SparkPrivilegeObjectType.DATABASE, db, db) + case _ => + } + } + + /** + * Add function level spark privilege objects to input or output list + * @param databaseName database name + * @param functionName function name as spark privilege object + * @param privilegeObjects input or output list + */ + private def 
addFunctionLevelObjs( + databaseName: Option[String], + functionName: String, + privilegeObjects: ArrayBuffer[SparkPrivilegeObject]): Unit = { + databaseName match { + case Some(db) => + privilegeObjects += new SparkPrivilegeObject( + SparkPrivilegeObjectType.FUNCTION, db, functionName) + case _ => + } + } + + /** + * Add table level spark privilege objects to input or output list + * @param identifier table identifier contains database name, and table name as hive + * privilege object + * @param privilegeObjects input or output list + * @param mode Append or overwrite + */ + private def addTableOrViewLevelObjs(identifier: TableIdentifier, + privilegeObjects: ArrayBuffer[SparkPrivilegeObject], partKeys: Seq[String] = Nil, + columns: Seq[String] = Nil, mode: SaveMode = SaveMode.ErrorIfExists): Unit = { + identifier.database match { + case Some(db) => + val tbName = identifier.table + val actionType = toActionType(mode) + privilegeObjects += new SparkPrivilegeObject( + SparkPrivilegeObjectType.TABLE_OR_VIEW, + db, + tbName, + partKeys, + columns, + actionType) + case _ => + } + } + + /** + * SparkPrivObjectActionType INSERT or INSERT_OVERWRITE + * + * @param mode Append or Overwrite + */ + private def toActionType(mode: SaveMode): SparkPrivObjectActionType = { + mode match { + case SaveMode.Append => SparkPrivObjectActionType.INSERT + case SaveMode.Overwrite => SparkPrivObjectActionType.INSERT_OVERWRITE + case _ => SparkPrivObjectActionType.OTHER + } + } +} diff --git a/plugin-spark/src/test/resources/data/files/kv1.txt b/plugin-spark/src/test/resources/data/files/kv1.txt new file mode 100644 index 0000000000..9825414ecf --- /dev/null +++ b/plugin-spark/src/test/resources/data/files/kv1.txt @@ -0,0 +1,500 @@ +238val_238 +86val_86 +311val_311 +27val_27 +165val_165 +409val_409 +255val_255 +278val_278 +98val_98 +484val_484 +265val_265 +193val_193 +401val_401 +150val_150 +273val_273 +224val_224 +369val_369 +66val_66 +128val_128 +213val_213 +146val_146 +406val_406 +429val_429 +374val_374 +152val_152 +469val_469 +145val_145 +495val_495 +37val_37 +327val_327 +281val_281 +277val_277 +209val_209 +15val_15 +82val_82 +403val_403 +166val_166 +417val_417 +430val_430 +252val_252 +292val_292 +219val_219 +287val_287 +153val_153 +193val_193 +338val_338 +446val_446 +459val_459 +394val_394 +237val_237 +482val_482 +174val_174 +413val_413 +494val_494 +207val_207 +199val_199 +466val_466 +208val_208 +174val_174 +399val_399 +396val_396 +247val_247 +417val_417 +489val_489 +162val_162 +377val_377 +397val_397 +309val_309 +365val_365 +266val_266 +439val_439 +342val_342 +367val_367 +325val_325 +167val_167 +195val_195 +475val_475 +17val_17 +113val_113 +155val_155 +203val_203 +339val_339 +0val_0 +455val_455 +128val_128 +311val_311 +316val_316 +57val_57 +302val_302 +205val_205 +149val_149 +438val_438 +345val_345 +129val_129 +170val_170 +20val_20 +489val_489 +157val_157 +378val_378 +221val_221 +92val_92 +111val_111 +47val_47 +72val_72 +4val_4 +280val_280 +35val_35 +427val_427 +277val_277 +208val_208 +356val_356 +399val_399 +169val_169 +382val_382 +498val_498 +125val_125 +386val_386 +437val_437 +469val_469 +192val_192 +286val_286 +187val_187 +176val_176 +54val_54 +459val_459 +51val_51 +138val_138 +103val_103 +239val_239 +213val_213 +216val_216 +430val_430 +278val_278 +176val_176 +289val_289 +221val_221 +65val_65 +318val_318 +332val_332 +311val_311 +275val_275 +137val_137 +241val_241 +83val_83 +333val_333 +180val_180 +284val_284 +12val_12 +230val_230 +181val_181 +67val_67 +260val_260 +404val_404 +384val_384 +489val_489 
+353val_353 +373val_373 +272val_272 +138val_138 +217val_217 +84val_84 +348val_348 +466val_466 +58val_58 +8val_8 +411val_411 +230val_230 +208val_208 +348val_348 +24val_24 +463val_463 +431val_431 +179val_179 +172val_172 +42val_42 +129val_129 +158val_158 +119val_119 +496val_496 +0val_0 +322val_322 +197val_197 +468val_468 +393val_393 +454val_454 +100val_100 +298val_298 +199val_199 +191val_191 +418val_418 +96val_96 +26val_26 +165val_165 +327val_327 +230val_230 +205val_205 +120val_120 +131val_131 +51val_51 +404val_404 +43val_43 +436val_436 +156val_156 +469val_469 +468val_468 +308val_308 +95val_95 +196val_196 +288val_288 +481val_481 +457val_457 +98val_98 +282val_282 +197val_197 +187val_187 +318val_318 +318val_318 +409val_409 +470val_470 +137val_137 +369val_369 +316val_316 +169val_169 +413val_413 +85val_85 +77val_77 +0val_0 +490val_490 +87val_87 +364val_364 +179val_179 +118val_118 +134val_134 +395val_395 +282val_282 +138val_138 +238val_238 +419val_419 +15val_15 +118val_118 +72val_72 +90val_90 +307val_307 +19val_19 +435val_435 +10val_10 +277val_277 +273val_273 +306val_306 +224val_224 +309val_309 +389val_389 +327val_327 +242val_242 +369val_369 +392val_392 +272val_272 +331val_331 +401val_401 +242val_242 +452val_452 +177val_177 +226val_226 +5val_5 +497val_497 +402val_402 +396val_396 +317val_317 +395val_395 +58val_58 +35val_35 +336val_336 +95val_95 +11val_11 +168val_168 +34val_34 +229val_229 +233val_233 +143val_143 +472val_472 +322val_322 +498val_498 +160val_160 +195val_195 +42val_42 +321val_321 +430val_430 +119val_119 +489val_489 +458val_458 +78val_78 +76val_76 +41val_41 +223val_223 +492val_492 +149val_149 +449val_449 +218val_218 +228val_228 +138val_138 +453val_453 +30val_30 +209val_209 +64val_64 +468val_468 +76val_76 +74val_74 +342val_342 +69val_69 +230val_230 +33val_33 +368val_368 +103val_103 +296val_296 +113val_113 +216val_216 +367val_367 +344val_344 +167val_167 +274val_274 +219val_219 +239val_239 +485val_485 +116val_116 +223val_223 +256val_256 +263val_263 +70val_70 +487val_487 +480val_480 +401val_401 +288val_288 +191val_191 +5val_5 +244val_244 +438val_438 +128val_128 +467val_467 +432val_432 +202val_202 +316val_316 +229val_229 +469val_469 +463val_463 +280val_280 +2val_2 +35val_35 +283val_283 +331val_331 +235val_235 +80val_80 +44val_44 +193val_193 +321val_321 +335val_335 +104val_104 +466val_466 +366val_366 +175val_175 +403val_403 +483val_483 +53val_53 +105val_105 +257val_257 +406val_406 +409val_409 +190val_190 +406val_406 +401val_401 +114val_114 +258val_258 +90val_90 +203val_203 +262val_262 +348val_348 +424val_424 +12val_12 +396val_396 +201val_201 +217val_217 +164val_164 +431val_431 +454val_454 +478val_478 +298val_298 +125val_125 +431val_431 +164val_164 +424val_424 +187val_187 +382val_382 +5val_5 +70val_70 +397val_397 +480val_480 +291val_291 +24val_24 +351val_351 +255val_255 +104val_104 +70val_70 +163val_163 +438val_438 +119val_119 +414val_414 +200val_200 +491val_491 +237val_237 +439val_439 +360val_360 +248val_248 +479val_479 +305val_305 +417val_417 +199val_199 +444val_444 +120val_120 +429val_429 +169val_169 +443val_443 +323val_323 +325val_325 +277val_277 +230val_230 +478val_478 +178val_178 +468val_468 +310val_310 +317val_317 +333val_333 +493val_493 +460val_460 +207val_207 +249val_249 +265val_265 +480val_480 +83val_83 +136val_136 +353val_353 +172val_172 +214val_214 +462val_462 +233val_233 +406val_406 +133val_133 +175val_175 +189val_189 +454val_454 +375val_375 +401val_401 +421val_421 +407val_407 +384val_384 +256val_256 +26val_26 +134val_134 +67val_67 +384val_384 +379val_379 +18val_18 +462val_462 
+492val_492 +100val_100 +298val_298 +9val_9 +341val_341 +498val_498 +146val_146 +458val_458 +362val_362 +186val_186 +285val_285 +348val_348 +167val_167 +18val_18 +273val_273 +183val_183 +281val_281 +344val_344 +97val_97 +469val_469 +315val_315 +84val_84 +28val_28 +37val_37 +448val_448 +152val_152 +348val_348 +307val_307 +194val_194 +414val_414 +477val_477 +222val_222 +126val_126 +90val_90 +169val_169 +403val_403 +400val_400 +200val_200 +97val_97 diff --git a/plugin-spark/src/test/resources/log4j.properties b/plugin-spark/src/test/resources/log4j.properties new file mode 100644 index 0000000000..41535d66b2 --- /dev/null +++ b/plugin-spark/src/test/resources/log4j.properties @@ -0,0 +1,23 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# Set everything to be logged to the console +log4j.rootCategory=FATAL, console +log4j.appender.console=org.apache.log4j.ConsoleAppender +log4j.appender.console.target=System.err +log4j.appender.console.layout=org.apache.log4j.PatternLayout +log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n \ No newline at end of file diff --git a/plugin-spark/src/test/resources/ranger-spark-security.xml b/plugin-spark/src/test/resources/ranger-spark-security.xml new file mode 100644 index 0000000000..a19d424106 --- /dev/null +++ b/plugin-spark/src/test/resources/ranger-spark-security.xml @@ -0,0 +1,45 @@ + + + + + + + ranger.plugin.spark.service.name + hive_jenkins + + Name of the Ranger service containing policies for this SampleApp instance + + + + + ranger.plugin.spark.policy.source.impl + org.apache.ranger.services.spark.RangerAdminClientImpl + + Policy source. 
+ + + + + ranger.plugin.spark.policy.cache.dir + target/test-classes + + Directory where Ranger policies are cached after successful retrieval from the source + + + + diff --git a/plugin-spark/src/test/resources/sparkSql_hive_jenkins.json b/plugin-spark/src/test/resources/sparkSql_hive_jenkins.json new file mode 100644 index 0000000000..6691216d7c --- /dev/null +++ b/plugin-spark/src/test/resources/sparkSql_hive_jenkins.json @@ -0,0 +1,2591 @@ +{ + "serviceName": "hive_jenkins", + "serviceId": 1, + "policyVersion": 85, + "policyUpdateTime": "20190429-21:36:09.000-+0800", + "policies": [ + { + "service": "hive_jenkins", + "name": "all - url", + "policyType": 0, + "policyPriority": 0, + "description": "Policy for all - url", + "isAuditEnabled": true, + "resources": { + "url": { + "values": [ + "*" + ], + "isExcludes": false, + "isRecursive": true + } + }, + "policyItems": [ + { + "accesses": [ + { + "type": "select", + "isAllowed": true + }, + { + "type": "update", + "isAllowed": true + }, + { + "type": "create", + "isAllowed": true + }, + { + "type": "drop", + "isAllowed": true + }, + { + "type": "alter", + "isAllowed": true + }, + { + "type": "index", + "isAllowed": true + }, + { + "type": "lock", + "isAllowed": true + }, + { + "type": "all", + "isAllowed": true + }, + { + "type": "read", + "isAllowed": true + }, + { + "type": "write", + "isAllowed": true + } + ], + "users": [ + "admin" + ], + "groups": [], + "conditions": [], + "delegateAdmin": true + } + ], + "denyPolicyItems": [], + "allowExceptions": [], + "denyExceptions": [], + "dataMaskPolicyItems": [], + "rowFilterPolicyItems": [], + "options": {}, + "validitySchedules": [], + "policyLabels": [], + "id": 1, + "guid": "cf7e6725-492f-434f-bffe-6bb4e3147246", + "isEnabled": true, + "version": 1 + }, + { + "service": "hive_jenkins", + "name": "all - database, table, column", + "policyType": 0, + "policyPriority": 0, + "description": "Policy for all - database, table, column", + "isAuditEnabled": true, + "resources": { + "database": { + "values": [ + "*" + ], + "isExcludes": false, + "isRecursive": false + }, + "column": { + "values": [ + "*" + ], + "isExcludes": false, + "isRecursive": false + }, + "table": { + "values": [ + "*" + ], + "isExcludes": false, + "isRecursive": false + } + }, + "policyItems": [ + { + "accesses": [ + { + "type": "select", + "isAllowed": true + }, + { + "type": "update", + "isAllowed": true + }, + { + "type": "create", + "isAllowed": true + }, + { + "type": "drop", + "isAllowed": true + }, + { + "type": "alter", + "isAllowed": true + }, + { + "type": "index", + "isAllowed": true + }, + { + "type": "lock", + "isAllowed": true + }, + { + "type": "all", + "isAllowed": true + }, + { + "type": "read", + "isAllowed": true + }, + { + "type": "write", + "isAllowed": true + } + ], + "users": [ + "admin" + ], + "groups": [], + "conditions": [], + "delegateAdmin": true + } + ], + "denyPolicyItems": [], + "allowExceptions": [], + "denyExceptions": [], + "dataMaskPolicyItems": [], + "rowFilterPolicyItems": [], + "options": {}, + "validitySchedules": [], + "policyLabels": [], + "id": 2, + "guid": "3b96138a-af4d-48bc-9544-58c5bfa1979b", + "isEnabled": true, + "version": 1 + }, + { + "service": "hive_jenkins", + "name": "all - database, udf", + "policyType": 0, + "policyPriority": 0, + "description": "Policy for all - database, udf", + "isAuditEnabled": true, + "resources": { + "database": { + "values": [ + "*" + ], + "isExcludes": false, + "isRecursive": false + }, + "udf": { + "values": [ + "*" + ], + "isExcludes": false, 
+ "isRecursive": false + } + }, + "policyItems": [ + { + "accesses": [ + { + "type": "select", + "isAllowed": true + }, + { + "type": "update", + "isAllowed": true + }, + { + "type": "create", + "isAllowed": true + }, + { + "type": "drop", + "isAllowed": true + }, + { + "type": "alter", + "isAllowed": true + }, + { + "type": "index", + "isAllowed": true + }, + { + "type": "lock", + "isAllowed": true + }, + { + "type": "all", + "isAllowed": true + }, + { + "type": "read", + "isAllowed": true + }, + { + "type": "write", + "isAllowed": true + } + ], + "users": [ + "admin" + ], + "groups": [], + "conditions": [], + "delegateAdmin": true + } + ], + "denyPolicyItems": [], + "allowExceptions": [], + "denyExceptions": [], + "dataMaskPolicyItems": [], + "rowFilterPolicyItems": [], + "options": {}, + "validitySchedules": [], + "policyLabels": [], + "id": 3, + "guid": "db08fbb0-61da-4f33-8144-ccd89816151d", + "isEnabled": true, + "version": 1 + }, + { + "service": "hive_jenkins", + "name": "src_key _less_than_20", + "policyType": 2, + "policyPriority": 0, + "description": "", + "isAuditEnabled": true, + "resources": { + "database": { + "values": [ + "default" + ], + "isExcludes": false, + "isRecursive": false + }, + "table": { + "values": [ + "src" + ], + "isExcludes": false, + "isRecursive": false + } + }, + "policyItems": [], + "denyPolicyItems": [], + "allowExceptions": [], + "denyExceptions": [], + "dataMaskPolicyItems": [], + "rowFilterPolicyItems": [ + { + "rowFilterInfo": { + "filterExpr": "key\u003c20" + }, + "accesses": [ + { + "type": "select", + "isAllowed": true + } + ], + "users": [ + "bob" + ], + "groups": [], + "conditions": [], + "delegateAdmin": false + } + ], + "serviceType": "hive", + "options": {}, + "validitySchedules": [], + "policyLabels": [ + "" + ], + "id": 4, + "guid": "f588a9ed-f7b1-48f7-9d0d-c12cf2b9b7ed", + "isEnabled": true, + "version": 26 + }, + { + "service": "hive_jenkins", + "name": "default", + "policyType": 0, + "policyPriority": 0, + "description": "", + "isAuditEnabled": true, + "resources": { + "database": { + "values": [ + "default" + ], + "isExcludes": false, + "isRecursive": false + }, + "column": { + "values": [ + "*" + ], + "isExcludes": false, + "isRecursive": false + }, + "table": { + "values": [ + "*" + ], + "isExcludes": false, + "isRecursive": false + } + }, + "policyItems": [ + { + "accesses": [ + { + "type": "select", + "isAllowed": true + }, + { + "type": "update", + "isAllowed": true + }, + { + "type": "create", + "isAllowed": true + }, + { + "type": "drop", + "isAllowed": true + }, + { + "type": "alter", + "isAllowed": true + }, + { + "type": "index", + "isAllowed": true + }, + { + "type": "lock", + "isAllowed": true + }, + { + "type": "all", + "isAllowed": true + }, + { + "type": "read", + "isAllowed": true + }, + { + "type": "write", + "isAllowed": true + } + ], + "users": [ + "bob" + ], + "groups": [], + "conditions": [], + "delegateAdmin": false + } + ], + "denyPolicyItems": [], + "allowExceptions": [], + "denyExceptions": [], + "dataMaskPolicyItems": [], + "rowFilterPolicyItems": [], + "options": {}, + "validitySchedules": [], + "policyLabels": [ + "" + ], + "id": 5, + "guid": "2db6099d-e4f1-41df-9d24-f2f47bed618e", + "isEnabled": true, + "version": 5 + }, + { + "service": "hive_jenkins", + "name": "default_kent", + "policyType": 0, + "policyPriority": 0, + "description": "", + "isAuditEnabled": true, + "resources": { + "database": { + "values": [ + "default" + ], + "isExcludes": false, + "isRecursive": false + }, + "column": { + "values": [ 
+ "key" + ], + "isExcludes": false, + "isRecursive": false + }, + "table": { + "values": [ + "src" + ], + "isExcludes": false, + "isRecursive": false + } + }, + "policyItems": [ + { + "accesses": [ + { + "type": "select", + "isAllowed": true + }, + { + "type": "update", + "isAllowed": true + }, + { + "type": "create", + "isAllowed": true + }, + { + "type": "drop", + "isAllowed": true + }, + { + "type": "alter", + "isAllowed": true + }, + { + "type": "index", + "isAllowed": true + }, + { + "type": "lock", + "isAllowed": true + }, + { + "type": "all", + "isAllowed": true + }, + { + "type": "read", + "isAllowed": true + }, + { + "type": "write", + "isAllowed": true + } + ], + "users": [ + "kent" + ], + "groups": [], + "conditions": [], + "delegateAdmin": false + } + ], + "denyPolicyItems": [], + "allowExceptions": [], + "denyExceptions": [], + "dataMaskPolicyItems": [], + "rowFilterPolicyItems": [], + "options": {}, + "validitySchedules": [], + "policyLabels": [ + "" + ], + "id": 5, + "guid": "fd24db19-f7cc-4e13-a8ba-bbd5a07a2d8d", + "isEnabled": true, + "version": 5 + }, + { + "service": "hive_jenkins", + "name": "src_val_show_last_4", + "policyType": 1, + "policyPriority": 0, + "description": "", + "isAuditEnabled": true, + "resources": { + "database": { + "values": [ + "default" + ], + "isExcludes": false, + "isRecursive": false + }, + "column": { + "values": [ + "value" + ], + "isExcludes": false, + "isRecursive": false + }, + "table": { + "values": [ + "src" + ], + "isExcludes": false, + "isRecursive": false + } + }, + "policyItems": [], + "denyPolicyItems": [], + "allowExceptions": [], + "denyExceptions": [], + "dataMaskPolicyItems": [ + { + "dataMaskInfo": { + "dataMaskType": "MASK_SHOW_LAST_4", + "valueExpr": "" + }, + "accesses": [ + { + "type": "select", + "isAllowed": true + } + ], + "users": [ + "bob" + ], + "groups": [], + "conditions": [], + "delegateAdmin": false + } + ], + "rowFilterPolicyItems": [], + "options": {}, + "validitySchedules": [], + "policyLabels": [ + "" + ], + "id": 6, + "guid": "b1261fcc-b2cd-49f2-85e8-93f254f987ec", + "isEnabled": true, + "version": 10 + }, + { + "service": "hive_jenkins", + "name": "store_sales", + "policyType": 2, + "policyPriority": 0, + "description": "equality", + "isAuditEnabled": true, + "resources": { + "database": { + "values": [ + "default" + ], + "isExcludes": false, + "isRecursive": false + }, + "table": { + "values": [ + "store_sales" + ], + "isExcludes": false, + "isRecursive": false + } + }, + "policyItems": [], + "denyPolicyItems": [], + "allowExceptions": [], + "denyExceptions": [], + "dataMaskPolicyItems": [], + "rowFilterPolicyItems": [ + { + "rowFilterInfo": { + "filterExpr": "ss_sold_date_sk\u003d2451546" + }, + "accesses": [ + { + "type": "select", + "isAllowed": true + } + ], + "users": [ + "bob" + ], + "groups": [], + "conditions": [], + "delegateAdmin": false + } + ], + "options": {}, + "validitySchedules": [], + "policyLabels": [ + "" + ], + "id": 7, + "guid": "08fa307f-77fa-4586-83d0-21d0eb68b0fc", + "isEnabled": true, + "version": 4 + }, + { + "service": "hive_jenkins", + "name": "default", + "policyType": 0, + "policyPriority": 0, + "description": "", + "isAuditEnabled": true, + "resources": { + "database": { + "values": [ + "default" + ], + "isExcludes": false, + "isRecursive": false + }, + "column": { + "values": [ + "*" + ], + "isExcludes": false, + "isRecursive": false + }, + "table": { + "values": [ + "*" + ], + "isExcludes": false, + "isRecursive": false + } + }, + "policyItems": [ + { + "accesses": [ + { + 
"type": "select", + "isAllowed": true + }, + { + "type": "update", + "isAllowed": true + }, + { + "type": "create", + "isAllowed": true + }, + { + "type": "drop", + "isAllowed": true + }, + { + "type": "alter", + "isAllowed": true + }, + { + "type": "index", + "isAllowed": true + }, + { + "type": "lock", + "isAllowed": true + }, + { + "type": "all", + "isAllowed": true + }, + { + "type": "read", + "isAllowed": true + }, + { + "type": "write", + "isAllowed": true + } + ], + "users": [ + "bob" + ], + "groups": [], + "conditions": [], + "delegateAdmin": false + } + ], + "denyPolicyItems": [], + "allowExceptions": [], + "denyExceptions": [], + "dataMaskPolicyItems": [], + "rowFilterPolicyItems": [], + "options": {}, + "validitySchedules": [], + "policyLabels": [ + "" + ], + "id": 8, + "guid": "cfd49756-2d80-492d-bd26-6f67d531f28c", + "isEnabled": true, + "version": 1 + }, + { + "service": "hive_jenkins", + "name": "catalog_page", + "policyType": 2, + "policyPriority": 0, + "description": "key in another table", + "isAuditEnabled": true, + "resources": { + "database": { + "values": [ + "default" + ], + "isExcludes": false, + "isRecursive": false + }, + "table": { + "values": [ + "catalog_page" + ], + "isExcludes": false, + "isRecursive": false + } + }, + "policyItems": [], + "denyPolicyItems": [], + "allowExceptions": [], + "denyExceptions": [], + "dataMaskPolicyItems": [], + "rowFilterPolicyItems": [ + { + "rowFilterInfo": { + "filterExpr": "cp_start_date_sk in (select d_date_sk from date_dim)" + }, + "accesses": [ + { + "type": "select", + "isAllowed": true + } + ], + "users": [ + "bob" + ], + "groups": [], + "conditions": [], + "delegateAdmin": false + } + ], + "options": {}, + "validitySchedules": [], + "policyLabels": [ + "" + ], + "id": 9, + "guid": "ec617d1b-b85d-434f-b9db-8ef0178620f1", + "isEnabled": true, + "version": 2 + }, + { + "service": "hive_jenkins", + "name": "call_center", + "policyType": 2, + "policyPriority": 0, + "description": "is not null", + "isAuditEnabled": true, + "resources": { + "database": { + "values": [ + "default" + ], + "isExcludes": false, + "isRecursive": false + }, + "table": { + "values": [ + "call_center" + ], + "isExcludes": false, + "isRecursive": false + } + }, + "policyItems": [], + "denyPolicyItems": [], + "allowExceptions": [], + "denyExceptions": [], + "dataMaskPolicyItems": [], + "rowFilterPolicyItems": [ + { + "rowFilterInfo": { + "filterExpr": "cc_name is not null" + }, + "accesses": [ + { + "type": "select", + "isAllowed": true + } + ], + "users": [ + "bob" + ], + "groups": [], + "conditions": [], + "delegateAdmin": false + } + ], + "options": {}, + "validitySchedules": [], + "policyLabels": [ + "" + ], + "id": 10, + "guid": "c8259509-61ae-48f8-867f-be8cac339764", + "isEnabled": true, + "version": 2 + }, + { + "service": "hive_jenkins", + "name": "catalog_returns", + "policyType": 2, + "policyPriority": 0, + "description": "or expression", + "isAuditEnabled": true, + "resources": { + "database": { + "values": [ + "default" + ], + "isExcludes": false, + "isRecursive": false + }, + "table": { + "values": [ + "catalog_returns" + ], + "isExcludes": false, + "isRecursive": false + } + }, + "policyItems": [], + "denyPolicyItems": [], + "allowExceptions": [], + "denyExceptions": [], + "dataMaskPolicyItems": [], + "rowFilterPolicyItems": [ + { + "rowFilterInfo": { + "filterExpr": "cr_item_sk is null or cr_item_sk \u003e\u003d0" + }, + "accesses": [ + { + "type": "select", + "isAllowed": true + } + ], + "users": [ + "bob" + ], + "groups": [], + 
"conditions": [], + "delegateAdmin": false + } + ], + "options": {}, + "validitySchedules": [], + "policyLabels": [ + "" + ], + "id": 11, + "guid": "58aa8789-799b-4ce7-820e-9ed625ff2206", + "isEnabled": true, + "version": 2 + }, + { + "service": "hive_jenkins", + "name": "date_dim", + "policyType": 2, + "policyPriority": 0, + "description": "AND and UDF", + "isAuditEnabled": true, + "resources": { + "database": { + "values": [ + "default" + ], + "isExcludes": false, + "isRecursive": false + }, + "table": { + "values": [ + "date_dim" + ], + "isExcludes": false, + "isRecursive": false + } + }, + "policyItems": [], + "denyPolicyItems": [], + "allowExceptions": [], + "denyExceptions": [], + "dataMaskPolicyItems": [], + "rowFilterPolicyItems": [ + { + "rowFilterInfo": { + "filterExpr": "d_date_sk\u003d0 and d_date\u003dcurrent_date()" + }, + "accesses": [ + { + "type": "select", + "isAllowed": true + } + ], + "users": [ + "bob" + ], + "groups": [], + "conditions": [], + "delegateAdmin": false + } + ], + "options": {}, + "validitySchedules": [], + "policyLabels": [ + "" + ], + "id": 12, + "guid": "cc7b3ede-e483-4ba9-9584-2907f3237df0", + "isEnabled": true, + "version": 2 + }, + { + "service": "hive_jenkins", + "name": "reason", + "policyType": 2, + "policyPriority": 0, + "description": "row filter expression with a key in the table itself", + "isAuditEnabled": true, + "resources": { + "database": { + "values": [ + "default" + ], + "isExcludes": false, + "isRecursive": false + }, + "table": { + "values": [ + "reason" + ], + "isExcludes": false, + "isRecursive": false + } + }, + "policyItems": [], + "denyPolicyItems": [], + "allowExceptions": [], + "denyExceptions": [], + "dataMaskPolicyItems": [], + "rowFilterPolicyItems": [ + { + "rowFilterInfo": { + "filterExpr": "r_reason_sk in (select r_reason_sk from reason)" + }, + "accesses": [ + { + "type": "select", + "isAllowed": true + } + ], + "users": [ + "bob" + ], + "groups": [], + "conditions": [], + "delegateAdmin": false + } + ], + "options": {}, + "validitySchedules": [], + "policyLabels": [ + "" + ], + "id": 14, + "guid": "4c8d06ae-73ea-4ff8-aedb-4aeae6865768", + "isEnabled": true, + "version": 2 + }, + { + "service": "hive_jenkins", + "name": "inventory", + "policyType": 2, + "policyPriority": 0, + "description": "scalar expression with the table itself", + "isAuditEnabled": true, + "resources": { + "database": { + "values": [ + "default" + ], + "isExcludes": false, + "isRecursive": false + }, + "table": { + "values": [ + "inventory" + ], + "isExcludes": false, + "isRecursive": false + } + }, + "policyItems": [], + "denyPolicyItems": [], + "allowExceptions": [], + "denyExceptions": [], + "dataMaskPolicyItems": [], + "rowFilterPolicyItems": [ + { + "rowFilterInfo": { + "filterExpr": "inv_item_sk\u003d(select count(1) from inventory)" + }, + "accesses": [ + { + "type": "select", + "isAllowed": true + } + ], + "users": [ + "bob" + ], + "groups": [], + "conditions": [], + "delegateAdmin": false + } + ], + "options": {}, + "validitySchedules": [], + "policyLabels": [ + "" + ], + "id": 15, + "guid": "1e3da1db-47f3-465e-a604-aaf3d3a8de8e", + "isEnabled": true, + "version": 1 + }, + { + "service": "hive_jenkins", + "name": "item_i_item_id", + "policyType": 1, + "policyPriority": 0, + "description": "", + "isAuditEnabled": true, + "resources": { + "database": { + "values": [ + "default" + ], + "isExcludes": false, + "isRecursive": false + }, + "column": { + "values": [ + "i_item_id" + ], + "isExcludes": false, + "isRecursive": false + }, + "table": { 
+ "values": [ + "item" + ], + "isExcludes": false, + "isRecursive": false + } + }, + "policyItems": [], + "denyPolicyItems": [], + "allowExceptions": [], + "denyExceptions": [], + "dataMaskPolicyItems": [ + { + "dataMaskInfo": { + "dataMaskType": "MASK_SHOW_LAST_4", + "valueExpr": "" + }, + "accesses": [ + { + "type": "select", + "isAllowed": true + } + ], + "users": [ + "bob" + ], + "groups": [], + "conditions": [], + "delegateAdmin": false + } + ], + "rowFilterPolicyItems": [], + "options": {}, + "validitySchedules": [], + "policyLabels": [ + "" + ], + "id": 16, + "guid": "3bf13c7b-14b7-40cf-a7ed-913a3e528a11", + "isEnabled": true, + "version": 3 + }, + { + "service": "hive_jenkins", + "name": "customer_address", + "policyType": 1, + "policyPriority": 0, + "description": "", + "isAuditEnabled": true, + "resources": { + "database": { + "values": [ + "default" + ], + "isExcludes": false, + "isRecursive": false + }, + "column": { + "values": [ + "ca_state" + ], + "isExcludes": false, + "isRecursive": false + }, + "table": { + "values": [ + "customer_address" + ], + "isExcludes": false, + "isRecursive": false + } + }, + "policyItems": [], + "denyPolicyItems": [], + "allowExceptions": [], + "denyExceptions": [], + "dataMaskPolicyItems": [ + { + "dataMaskInfo": { + "dataMaskType": "MASK_SHOW_LAST_4", + "valueExpr": "" + }, + "accesses": [ + { + "type": "select", + "isAllowed": true + } + ], + "users": [ + "bob" + ], + "groups": [], + "conditions": [], + "delegateAdmin": false + } + ], + "rowFilterPolicyItems": [], + "options": {}, + "validitySchedules": [], + "policyLabels": [ + "" + ], + "id": 17, + "guid": "a047b76d-ea97-4893-b469-94cc944b3edc", + "isEnabled": true, + "version": 4 + }, + { + "service": "hive_jenkins", + "name": "customer", + "policyType": 1, + "policyPriority": 0, + "description": "", + "isAuditEnabled": true, + "resources": { + "database": { + "values": [ + "default" + ], + "isExcludes": false, + "isRecursive": false + }, + "column": { + "values": [ + "c_customer_id" + ], + "isExcludes": false, + "isRecursive": false + }, + "table": { + "values": [ + "customer" + ], + "isExcludes": false, + "isRecursive": false + } + }, + "policyItems": [], + "denyPolicyItems": [], + "allowExceptions": [], + "denyExceptions": [], + "dataMaskPolicyItems": [ + { + "dataMaskInfo": { + "dataMaskType": "MASK" + }, + "accesses": [ + { + "type": "select", + "isAllowed": true + } + ], + "users": [ + "bob" + ], + "groups": [], + "conditions": [], + "delegateAdmin": false + } + ], + "rowFilterPolicyItems": [], + "options": {}, + "validitySchedules": [], + "policyLabels": [ + "" + ], + "id": 18, + "guid": "ac2d963e-635f-49a8-a96c-ded88f68e731", + "isEnabled": true, + "version": 1 + }, + { + "service": "hive_jenkins", + "name": "date_dim_2", + "policyType": 1, + "policyPriority": 0, + "description": "", + "isAuditEnabled": true, + "resources": { + "database": { + "values": [ + "default" + ], + "isExcludes": false, + "isRecursive": false + }, + "column": { + "values": [ + "d_year" + ], + "isExcludes": false, + "isRecursive": false + }, + "table": { + "values": [ + "date_dim" + ], + "isExcludes": false, + "isRecursive": false + } + }, + "policyItems": [], + "denyPolicyItems": [], + "allowExceptions": [], + "denyExceptions": [], + "dataMaskPolicyItems": [ + { + "dataMaskInfo": { + "dataMaskType": "MASK_NULL" + }, + "accesses": [ + { + "type": "select", + "isAllowed": true + } + ], + "users": [ + "bob" + ], + "groups": [], + "conditions": [], + "delegateAdmin": false + } + ], + "rowFilterPolicyItems": [], 
+ "options": {}, + "validitySchedules": [], + "policyLabels": [ + "" + ], + "id": 19, + "guid": "07e7df0d-2cf7-4630-b796-31798a4496d4", + "isEnabled": true, + "version": 1 + }, + { + "service": "hive_jenkins", + "name": "item_i_brand_id", + "policyType": 1, + "policyPriority": 0, + "description": "", + "isAuditEnabled": true, + "resources": { + "database": { + "values": [ + "default" + ], + "isExcludes": false, + "isRecursive": false + }, + "column": { + "values": [ + "i_brand_id" + ], + "isExcludes": false, + "isRecursive": false + }, + "table": { + "values": [ + "item" + ], + "isExcludes": false, + "isRecursive": false + } + }, + "policyItems": [], + "denyPolicyItems": [], + "allowExceptions": [], + "denyExceptions": [], + "dataMaskPolicyItems": [ + { + "dataMaskInfo": { + "dataMaskType": "MASK_HASH" + }, + "accesses": [ + { + "type": "select", + "isAllowed": true + } + ], + "users": [ + "bob" + ], + "groups": [], + "conditions": [], + "delegateAdmin": false + } + ], + "rowFilterPolicyItems": [], + "options": {}, + "validitySchedules": [], + "policyLabels": [ + "" + ], + "id": 20, + "guid": "35b5e3f7-c9f0-42d1-9118-56dc37ff42f5", + "isEnabled": true, + "version": 1 + }, + { + "service": "hive_jenkins", + "name": "item_i_item_sk", + "policyType": 1, + "policyPriority": 0, + "description": "", + "isAuditEnabled": true, + "resources": { + "database": { + "values": [ + "default" + ], + "isExcludes": false, + "isRecursive": false + }, + "column": { + "values": [ + "i_item_sk" + ], + "isExcludes": false, + "isRecursive": false + }, + "table": { + "values": [ + "item" + ], + "isExcludes": false, + "isRecursive": false + } + }, + "policyItems": [], + "denyPolicyItems": [], + "allowExceptions": [], + "denyExceptions": [], + "dataMaskPolicyItems": [ + { + "dataMaskInfo": { + "dataMaskType": "MASK_SHOW_FIRST_4" + }, + "accesses": [ + { + "type": "select", + "isAllowed": true + } + ], + "users": [ + "bob" + ], + "groups": [], + "conditions": [], + "delegateAdmin": false + } + ], + "rowFilterPolicyItems": [], + "options": {}, + "validitySchedules": [], + "policyLabels": [ + "" + ], + "id": 21, + "guid": "7e16c0ca-927a-4e95-b42e-c93b62cb6dfa", + "isEnabled": true, + "version": 1 + }, + { + "service": "hive_jenkins", + "name": "item_i_class_id", + "policyType": 1, + "policyPriority": 0, + "description": "", + "isAuditEnabled": true, + "resources": { + "database": { + "values": [ + "default" + ], + "isExcludes": false, + "isRecursive": false + }, + "column": { + "values": [ + "i_class_id" + ], + "isExcludes": false, + "isRecursive": false + }, + "table": { + "values": [ + "item" + ], + "isExcludes": false, + "isRecursive": false + } + }, + "policyItems": [], + "denyPolicyItems": [], + "allowExceptions": [], + "denyExceptions": [], + "dataMaskPolicyItems": [ + { + "dataMaskInfo": { + "dataMaskType": "MASK_NULL" + }, + "accesses": [ + { + "type": "select", + "isAllowed": true + } + ], + "users": [ + "bob" + ], + "groups": [], + "conditions": [], + "delegateAdmin": false + } + ], + "rowFilterPolicyItems": [], + "options": {}, + "validitySchedules": [], + "policyLabels": [ + "" + ], + "id": 22, + "guid": "b7847238-3a14-4d56-8257-b8625a7f25a1", + "isEnabled": true, + "version": 1 + }, + { + "service": "hive_jenkins", + "name": "rangertbl1_key_equals_0", + "policyType": 2, + "policyPriority": 0, + "description": "", + "isAuditEnabled": true, + "resources": { + "database": { + "values": [ + "default" + ], + "isExcludes": false, + "isRecursive": false + }, + "table": { + "values": [ + "rangertbl1" + ], + 
"isExcludes": false, + "isRecursive": false + } + }, + "policyItems": [], + "denyPolicyItems": [], + "allowExceptions": [], + "denyExceptions": [], + "dataMaskPolicyItems": [], + "rowFilterPolicyItems": [ + { + "rowFilterInfo": { + "filterExpr": "key\u003d0" + }, + "accesses": [ + { + "type": "select", + "isAllowed": true + } + ], + "users": [ + "bob" + ], + "groups": [], + "conditions": [], + "delegateAdmin": false + } + ], + "options": {}, + "validitySchedules": [], + "policyLabels": [ + "" + ], + "id": 23, + "guid": "d52bc8de-2a6b-4f7c-ab26-fbaf22c05eb7", + "isEnabled": true, + "version": 1 + }, + { + "service": "hive_jenkins", + "name": "rangertbl2_key_in_set", + "policyType": 2, + "policyPriority": 0, + "description": "", + "isAuditEnabled": true, + "resources": { + "database": { + "values": [ + "default" + ], + "isExcludes": false, + "isRecursive": false + }, + "table": { + "values": [ + "rangertbl2" + ], + "isExcludes": false, + "isRecursive": false + } + }, + "policyItems": [], + "denyPolicyItems": [], + "allowExceptions": [], + "denyExceptions": [], + "dataMaskPolicyItems": [], + "rowFilterPolicyItems": [ + { + "rowFilterInfo": { + "filterExpr": "key in (0, 1, 2)" + }, + "accesses": [ + { + "type": "select", + "isAllowed": true + } + ], + "users": [ + "bob" + ], + "groups": [], + "conditions": [], + "delegateAdmin": false + } + ], + "options": {}, + "validitySchedules": [], + "policyLabels": [ + "" + ], + "id": 24, + "guid": "06008a40-9b33-4699-8782-cc7e85101b85", + "isEnabled": true, + "version": 1 + }, + { + "service": "hive_jenkins", + "name": "rangertbl3_key_in_subquery", + "policyType": 2, + "policyPriority": 0, + "description": "", + "isAuditEnabled": true, + "resources": { + "database": { + "values": [ + "default" + ], + "isExcludes": false, + "isRecursive": false + }, + "table": { + "values": [ + "rangertbl3" + ], + "isExcludes": false, + "isRecursive": false + } + }, + "policyItems": [], + "denyPolicyItems": [], + "allowExceptions": [], + "denyExceptions": [], + "dataMaskPolicyItems": [], + "rowFilterPolicyItems": [ + { + "rowFilterInfo": { + "filterExpr": "key in (select key from rangertbl2 where key \u003c 100)" + }, + "accesses": [ + { + "type": "select", + "isAllowed": true + } + ], + "users": [ + "bob" + ], + "groups": [], + "conditions": [], + "delegateAdmin": false + } + ], + "options": {}, + "validitySchedules": [], + "policyLabels": [ + "" + ], + "id": 25, + "guid": "d0ca382a-1d62-4faa-8b9b-aeb36d4e443e", + "isEnabled": true, + "version": 1 + }, + { + "service": "hive_jenkins", + "name": "rangertbl4_key_in_self", + "policyType": 2, + "policyPriority": 0, + "description": "", + "isAuditEnabled": true, + "resources": { + "database": { + "values": [ + "default" + ], + "isExcludes": false, + "isRecursive": false + }, + "table": { + "values": [ + "rangertbl4" + ], + "isExcludes": false, + "isRecursive": false + } + }, + "policyItems": [], + "denyPolicyItems": [], + "allowExceptions": [], + "denyExceptions": [], + "dataMaskPolicyItems": [], + "rowFilterPolicyItems": [ + { + "rowFilterInfo": { + "filterExpr": "key in (select key from rangertbl4)" + }, + "accesses": [ + { + "type": "select", + "isAllowed": true + } + ], + "users": [ + "bob" + ], + "groups": [], + "conditions": [], + "delegateAdmin": false + } + ], + "options": {}, + "validitySchedules": [], + "policyLabels": [ + "" + ], + "id": 26, + "guid": "b2b730af-d106-41f2-a21e-c29626adf6f3", + "isEnabled": true, + "version": 1 + }, + { + "service": "hive_jenkins", + "name": "rangertbl5_key_udf", + "policyType": 2, 
+ "policyPriority": 0, + "description": "", + "isAuditEnabled": true, + "resources": { + "database": { + "values": [ + "default" + ], + "isExcludes": false, + "isRecursive": false + }, + "table": { + "values": [ + "rangertbl5" + ], + "isExcludes": false, + "isRecursive": false + } + }, + "policyItems": [], + "denyPolicyItems": [], + "allowExceptions": [], + "denyExceptions": [], + "dataMaskPolicyItems": [], + "rowFilterPolicyItems": [ + { + "rowFilterInfo": { + "filterExpr": "current_date()\u003d\"2019-04-28\"" + }, + "accesses": [ + { + "type": "select", + "isAllowed": true + } + ], + "users": [ + "bob" + ], + "groups": [], + "conditions": [], + "delegateAdmin": false + } + ], + "options": {}, + "validitySchedules": [], + "policyLabels": [ + "" + ], + "id": 27, + "guid": "0540df7e-fa14-4a41-b7d2-479fb42ddf5f", + "isEnabled": true, + "version": 1 + }, + { + "service": "hive_jenkins", + "name": "rangertbl6_key_and_or", + "policyType": 2, + "policyPriority": 0, + "description": "", + "isAuditEnabled": true, + "resources": { + "database": { + "values": [ + "default" + ], + "isExcludes": false, + "isRecursive": false + }, + "table": { + "values": [ + "rangertbl6" + ], + "isExcludes": false, + "isRecursive": false + } + }, + "policyItems": [], + "denyPolicyItems": [], + "allowExceptions": [], + "denyExceptions": [], + "dataMaskPolicyItems": [], + "rowFilterPolicyItems": [ + { + "rowFilterInfo": { + "filterExpr": "key\u003e1 and key\u003c10 or key \u003d500" + }, + "accesses": [ + { + "type": "select", + "isAllowed": true + } + ], + "users": [ + "bob" + ], + "groups": [], + "conditions": [], + "delegateAdmin": false + } + ], + "options": {}, + "validitySchedules": [], + "policyLabels": [ + "" + ], + "id": 28, + "guid": "5805bb62-291e-44b1-81e2-9f5c5b2b3cca", + "isEnabled": true, + "version": 1 + }, + { + "service": "hive_jenkins", + "name": "rangertbl1_value_redact", + "policyType": 1, + "policyPriority": 0, + "description": "", + "isAuditEnabled": true, + "resources": { + "database": { + "values": [ + "default" + ], + "isExcludes": false, + "isRecursive": false + }, + "column": { + "values": [ + "value" + ], + "isExcludes": false, + "isRecursive": false + }, + "table": { + "values": [ + "rangertbl1" + ], + "isExcludes": false, + "isRecursive": false + } + }, + "policyItems": [], + "denyPolicyItems": [], + "allowExceptions": [], + "denyExceptions": [], + "dataMaskPolicyItems": [ + { + "dataMaskInfo": { + "dataMaskType": "MASK" + }, + "accesses": [ + { + "type": "select", + "isAllowed": true + } + ], + "users": [ + "bob" + ], + "groups": [], + "conditions": [], + "delegateAdmin": false + } + ], + "rowFilterPolicyItems": [], + "options": {}, + "validitySchedules": [], + "policyLabels": [ + "" + ], + "id": 29, + "guid": "9e7a290a-3d24-4f19-a4c6-2cf0637204ab", + "isEnabled": true, + "version": 1 + }, + { + "service": "hive_jenkins", + "name": "rangertbl2_value_sf4", + "policyType": 1, + "policyPriority": 0, + "description": "", + "isAuditEnabled": true, + "resources": { + "database": { + "values": [ + "default" + ], + "isExcludes": false, + "isRecursive": false + }, + "column": { + "values": [ + "value" + ], + "isExcludes": false, + "isRecursive": false + }, + "table": { + "values": [ + "rangertbl2" + ], + "isExcludes": false, + "isRecursive": false + } + }, + "policyItems": [], + "denyPolicyItems": [], + "allowExceptions": [], + "denyExceptions": [], + "dataMaskPolicyItems": [ + { + "dataMaskInfo": { + "dataMaskType": "MASK_SHOW_FIRST_4" + }, + "accesses": [ + { + "type": "select", + "isAllowed": 
true + } + ], + "users": [ + "bob" + ], + "groups": [], + "conditions": [], + "delegateAdmin": false + } + ], + "rowFilterPolicyItems": [], + "options": {}, + "validitySchedules": [], + "policyLabels": [ + "" + ], + "id": 30, + "guid": "9d50a525-b24c-4cf5-a885-d10d426368d1", + "isEnabled": true, + "version": 1 + }, + { + "service": "hive_jenkins", + "name": "rangertbl3_value_hash", + "policyType": 1, + "policyPriority": 0, + "description": "", + "isAuditEnabled": true, + "resources": { + "database": { + "values": [ + "default" + ], + "isExcludes": false, + "isRecursive": false + }, + "column": { + "values": [ + "value" + ], + "isExcludes": false, + "isRecursive": false + }, + "table": { + "values": [ + "rangertbl3" + ], + "isExcludes": false, + "isRecursive": false + } + }, + "policyItems": [], + "denyPolicyItems": [], + "allowExceptions": [], + "denyExceptions": [], + "dataMaskPolicyItems": [ + { + "dataMaskInfo": { + "dataMaskType": "MASK_HASH" + }, + "accesses": [ + { + "type": "select", + "isAllowed": true + } + ], + "users": [ + "bob" + ], + "groups": [], + "conditions": [], + "delegateAdmin": false + } + ], + "rowFilterPolicyItems": [], + "options": {}, + "validitySchedules": [], + "policyLabels": [ + "" + ], + "id": 31, + "guid": "ed1868a1-bf79-4721-a3d5-6815cc7d4986", + "isEnabled": true, + "version": 1 + }, + { + "service": "hive_jenkins", + "name": "rangertbl4_value_nullify", + "policyType": 1, + "policyPriority": 0, + "description": "", + "isAuditEnabled": true, + "resources": { + "database": { + "values": [ + "default" + ], + "isExcludes": false, + "isRecursive": false + }, + "column": { + "values": [ + "value" + ], + "isExcludes": false, + "isRecursive": false + }, + "table": { + "values": [ + "rangertbl4" + ], + "isExcludes": false, + "isRecursive": false + } + }, + "policyItems": [], + "denyPolicyItems": [], + "allowExceptions": [], + "denyExceptions": [], + "dataMaskPolicyItems": [ + { + "dataMaskInfo": { + "dataMaskType": "MASK_NULL" + }, + "accesses": [ + { + "type": "select", + "isAllowed": true + } + ], + "users": [ + "bob" + ], + "groups": [], + "conditions": [], + "delegateAdmin": false + } + ], + "rowFilterPolicyItems": [], + "options": {}, + "validitySchedules": [], + "policyLabels": [ + "" + ], + "id": 32, + "guid": "98a04cd7-8d14-4466-adc9-126d87a3af69", + "isEnabled": true, + "version": 1 + } + ], + "serviceDef": { + "name": "hive", + "implClass": "org.apache.ranger.services.hive.RangerServiceHive", + "label": "Hive Server2", + "description": "Hive Server2", + "options": { + "enableDenyAndExceptionsInPolicies": "true" + }, + "configs": [ + { + "itemId": 1, + "name": "username", + "type": "string", + "mandatory": true, + "validationRegEx": "", + "validationMessage": "", + "uiHint": "", + "label": "Username" + }, + { + "itemId": 2, + "name": "password", + "type": "password", + "mandatory": true, + "validationRegEx": "", + "validationMessage": "", + "uiHint": "", + "label": "Password" + }, + { + "itemId": 3, + "name": "jdbc.driverClassName", + "type": "string", + "mandatory": true, + "defaultValue": "org.apache.hive.jdbc.HiveDriver", + "validationRegEx": "", + "validationMessage": "", + "uiHint": "" + }, + { + "itemId": 4, + "name": "jdbc.url", + "type": "string", + "mandatory": true, + "defaultValue": "", + "validationRegEx": "", + "validationMessage": "", + "uiHint": "{\"TextFieldWithIcon\":true, \"info\": \"1.For Remote Mode, eg.\u003cbr\u003ejdbc:hive2://\u0026lt;host\u0026gt;:\u0026lt;port\u0026gt;\u003cbr\u003e2.For Embedded Mode (no host or port), 
eg.\u003cbr\u003ejdbc:hive2:///;initFile\u003d\u0026lt;file\u0026gt;\u003cbr\u003e3.For HTTP Mode, eg.\u003cbr\u003ejdbc:hive2://\u0026lt;host\u0026gt;:\u0026lt;port\u0026gt;/;\u003cbr\u003etransportMode\u003dhttp;httpPath\u003d\u0026lt;httpPath\u0026gt;\u003cbr\u003e4.For SSL Mode, eg.\u003cbr\u003ejdbc:hive2://\u0026lt;host\u0026gt;:\u0026lt;port\u0026gt;/;ssl\u003dtrue;\u003cbr\u003esslTrustStore\u003dtStore;trustStorePassword\u003dpw\u003cbr\u003e5.For ZooKeeper Mode, eg.\u003cbr\u003ejdbc:hive2://\u0026lt;host\u0026gt;/;serviceDiscoveryMode\u003d\u003cbr\u003ezooKeeper;zooKeeperNamespace\u003dhiveserver2\u003cbr\u003e6.For Kerberos Mode, eg.\u003cbr\u003ejdbc:hive2://\u0026lt;host\u0026gt;:\u0026lt;port\u0026gt;/;\u003cbr\u003eprincipal\u003dhive/domain@EXAMPLE.COM\u003cbr\u003e\"}" + }, + { + "itemId": 5, + "name": "commonNameForCertificate", + "type": "string", + "mandatory": false, + "validationRegEx": "", + "validationMessage": "", + "uiHint": "", + "label": "Common Name for Certificate" + } + ], + "resources": [ + { + "itemId": 1, + "name": "database", + "type": "string", + "level": 10, + "mandatory": true, + "lookupSupported": true, + "recursiveSupported": false, + "excludesSupported": true, + "matcher": "org.apache.ranger.plugin.resourcematcher.RangerDefaultResourceMatcher", + "matcherOptions": { + "wildCard": "true", + "ignoreCase": "true" + }, + "validationRegEx": "", + "validationMessage": "", + "uiHint": "", + "label": "Hive Database", + "description": "Hive Database", + "accessTypeRestrictions": [], + "isValidLeaf": false + }, + { + "itemId": 5, + "name": "url", + "type": "string", + "level": 10, + "mandatory": true, + "lookupSupported": false, + "recursiveSupported": true, + "excludesSupported": false, + "matcher": "org.apache.ranger.plugin.resourcematcher.RangerPathResourceMatcher", + "matcherOptions": { + "wildCard": "true", + "ignoreCase": "false" + }, + "validationRegEx": "", + "validationMessage": "", + "uiHint": "", + "label": "URL", + "description": "URL", + "accessTypeRestrictions": [], + "isValidLeaf": true + }, + { + "itemId": 2, + "name": "table", + "type": "string", + "level": 20, + "parent": "database", + "mandatory": true, + "lookupSupported": true, + "recursiveSupported": false, + "excludesSupported": true, + "matcher": "org.apache.ranger.plugin.resourcematcher.RangerDefaultResourceMatcher", + "matcherOptions": { + "wildCard": "true", + "ignoreCase": "true" + }, + "validationRegEx": "", + "validationMessage": "", + "uiHint": "", + "label": "Hive Table", + "description": "Hive Table", + "accessTypeRestrictions": [], + "isValidLeaf": false + }, + { + "itemId": 3, + "name": "udf", + "type": "string", + "level": 20, + "parent": "database", + "mandatory": true, + "lookupSupported": true, + "recursiveSupported": false, + "excludesSupported": true, + "matcher": "org.apache.ranger.plugin.resourcematcher.RangerDefaultResourceMatcher", + "matcherOptions": { + "wildCard": "true", + "ignoreCase": "true" + }, + "validationRegEx": "", + "validationMessage": "", + "uiHint": "", + "label": "Hive UDF", + "description": "Hive UDF", + "accessTypeRestrictions": [], + "isValidLeaf": true + }, + { + "itemId": 4, + "name": "column", + "type": "string", + "level": 30, + "parent": "table", + "mandatory": true, + "lookupSupported": true, + "recursiveSupported": false, + "excludesSupported": true, + "matcher": "org.apache.ranger.plugin.resourcematcher.RangerDefaultResourceMatcher", + "matcherOptions": { + "wildCard": "true", + "ignoreCase": "true" + }, + "validationRegEx": "", + 
"validationMessage": "", + "uiHint": "", + "label": "Hive Column", + "description": "Hive Column", + "accessTypeRestrictions": [], + "isValidLeaf": true + } + ], + "accessTypes": [ + { + "itemId": 1, + "name": "select", + "label": "select", + "impliedGrants": [] + }, + { + "itemId": 2, + "name": "update", + "label": "update", + "impliedGrants": [] + }, + { + "itemId": 3, + "name": "create", + "label": "Create", + "impliedGrants": [] + }, + { + "itemId": 4, + "name": "drop", + "label": "Drop", + "impliedGrants": [] + }, + { + "itemId": 5, + "name": "alter", + "label": "Alter", + "impliedGrants": [] + }, + { + "itemId": 6, + "name": "index", + "label": "Index", + "impliedGrants": [] + }, + { + "itemId": 7, + "name": "lock", + "label": "Lock", + "impliedGrants": [] + }, + { + "itemId": 8, + "name": "all", + "label": "All", + "impliedGrants": [ + "select", + "update", + "create", + "drop", + "alter", + "index", + "lock", + "read", + "write" + ] + }, + { + "itemId": 9, + "name": "read", + "label": "Read", + "impliedGrants": [] + }, + { + "itemId": 10, + "name": "write", + "label": "Write", + "impliedGrants": [] + } + ], + "policyConditions": [], + "contextEnrichers": [], + "enums": [], + "dataMaskDef": { + "maskTypes": [ + { + "itemId": 1, + "name": "MASK", + "label": "Redact", + "description": "Replace lowercase with \u0027x\u0027, uppercase with \u0027X\u0027, digits with \u00270\u0027", + "transformer": "mask({col})", + "dataMaskOptions": {} + }, + { + "itemId": 2, + "name": "MASK_SHOW_LAST_4", + "label": "Partial mask: show last 4", + "description": "Show last 4 characters; replace rest with \u0027x\u0027", + "transformer": "mask_show_last_n({col}, 4, \u0027x\u0027, \u0027x\u0027, \u0027x\u0027, -1, \u00271\u0027)", + "dataMaskOptions": {} + }, + { + "itemId": 3, + "name": "MASK_SHOW_FIRST_4", + "label": "Partial mask: show first 4", + "description": "Show first 4 characters; replace rest with \u0027x\u0027", + "transformer": "mask_show_first_n({col}, 4, \u0027x\u0027, \u0027x\u0027, \u0027x\u0027, -1, \u00271\u0027)", + "dataMaskOptions": {} + }, + { + "itemId": 4, + "name": "MASK_HASH", + "label": "Hash", + "description": "Hash the value", + "transformer": "mask_hash({col})", + "dataMaskOptions": {} + }, + { + "itemId": 5, + "name": "MASK_NULL", + "label": "Nullify", + "description": "Replace with NULL", + "dataMaskOptions": {} + }, + { + "itemId": 6, + "name": "MASK_NONE", + "label": "Unmasked (retain original value)", + "description": "No masking", + "dataMaskOptions": {} + }, + { + "itemId": 12, + "name": "MASK_DATE_SHOW_YEAR", + "label": "Date: show only year", + "description": "Date: show only year", + "transformer": "mask({col}, \u0027x\u0027, \u0027x\u0027, \u0027x\u0027, -1, \u00271\u0027, 1, 0, -1)", + "dataMaskOptions": {} + }, + { + "itemId": 13, + "name": "CUSTOM", + "label": "Custom", + "description": "Custom", + "dataMaskOptions": {} + } + ], + "accessTypes": [ + { + "itemId": 1, + "name": "select", + "label": "select", + "impliedGrants": [] + } + ], + "resources": [ + { + "itemId": 1, + "name": "database", + "type": "string", + "level": 10, + "mandatory": true, + "lookupSupported": true, + "recursiveSupported": false, + "excludesSupported": false, + "matcher": "org.apache.ranger.plugin.resourcematcher.RangerDefaultResourceMatcher", + "matcherOptions": { + "wildCard": "false", + "ignoreCase": "true" + }, + "validationRegEx": "", + "validationMessage": "", + "uiHint": "{ \"singleValue\":true }", + "label": "Hive Database", + "description": "Hive Database", + 
"accessTypeRestrictions": [], + "isValidLeaf": false + }, + { + "itemId": 2, + "name": "table", + "type": "string", + "level": 20, + "parent": "database", + "mandatory": true, + "lookupSupported": true, + "recursiveSupported": false, + "excludesSupported": false, + "matcher": "org.apache.ranger.plugin.resourcematcher.RangerDefaultResourceMatcher", + "matcherOptions": { + "wildCard": "false", + "ignoreCase": "true" + }, + "validationRegEx": "", + "validationMessage": "", + "uiHint": "{ \"singleValue\":true }", + "label": "Hive Table", + "description": "Hive Table", + "accessTypeRestrictions": [], + "isValidLeaf": false + }, + { + "itemId": 4, + "name": "column", + "type": "string", + "level": 30, + "parent": "table", + "mandatory": true, + "lookupSupported": true, + "recursiveSupported": false, + "excludesSupported": false, + "matcher": "org.apache.ranger.plugin.resourcematcher.RangerDefaultResourceMatcher", + "matcherOptions": { + "wildCard": "false", + "ignoreCase": "true" + }, + "validationRegEx": "", + "validationMessage": "", + "uiHint": "{ \"singleValue\":true }", + "label": "Hive Column", + "description": "Hive Column", + "accessTypeRestrictions": [], + "isValidLeaf": true + } + ] + }, + "rowFilterDef": { + "accessTypes": [ + { + "itemId": 1, + "name": "select", + "label": "select", + "impliedGrants": [] + } + ], + "resources": [ + { + "itemId": 1, + "name": "database", + "type": "string", + "level": 10, + "mandatory": true, + "lookupSupported": true, + "recursiveSupported": false, + "excludesSupported": false, + "matcher": "org.apache.ranger.plugin.resourcematcher.RangerDefaultResourceMatcher", + "matcherOptions": { + "wildCard": "false", + "ignoreCase": "true" + }, + "validationRegEx": "", + "validationMessage": "", + "uiHint": "{ \"singleValue\":true }", + "label": "Hive Database", + "description": "Hive Database", + "accessTypeRestrictions": [], + "isValidLeaf": false + }, + { + "itemId": 2, + "name": "table", + "type": "string", + "level": 20, + "parent": "database", + "mandatory": true, + "lookupSupported": true, + "recursiveSupported": false, + "excludesSupported": false, + "matcher": "org.apache.ranger.plugin.resourcematcher.RangerDefaultResourceMatcher", + "matcherOptions": { + "wildCard": "false", + "ignoreCase": "true" + }, + "validationRegEx": "", + "validationMessage": "", + "uiHint": "{ \"singleValue\":true }", + "label": "Hive Table", + "description": "Hive Table", + "accessTypeRestrictions": [], + "isValidLeaf": true + } + ] + }, + "id": 3, + "guid": "3e1afb5a-184a-4e82-9d9c-87a5cacc243c", + "isEnabled": true, + "createTime": "20190401-20:14:36.000-+0800", + "updateTime": "20190401-20:14:36.000-+0800", + "version": 1 + }, + "auditMode": "audit-default" +} \ No newline at end of file diff --git a/plugin-spark/src/test/resources/tpcds/q1.sql b/plugin-spark/src/test/resources/tpcds/q1.sql new file mode 100755 index 0000000000..4d20faad8e --- /dev/null +++ b/plugin-spark/src/test/resources/tpcds/q1.sql @@ -0,0 +1,19 @@ +WITH customer_total_return AS +( SELECT + sr_customer_sk AS ctr_customer_sk, + sr_store_sk AS ctr_store_sk, + sum(sr_return_amt) AS ctr_total_return + FROM store_returns, date_dim + WHERE sr_returned_date_sk = d_date_sk AND d_year = 2000 + GROUP BY sr_customer_sk, sr_store_sk) +SELECT c_customer_id +FROM customer_total_return ctr1, store, customer +WHERE ctr1.ctr_total_return > + (SELECT avg(ctr_total_return) * 1.2 + FROM customer_total_return ctr2 + WHERE ctr1.ctr_store_sk = ctr2.ctr_store_sk) + AND s_store_sk = ctr1.ctr_store_sk + AND s_state = 'TN' + 
AND ctr1.ctr_customer_sk = c_customer_sk +ORDER BY c_customer_id +LIMIT 100 diff --git a/plugin-spark/src/test/resources/tpcds/q10.sql b/plugin-spark/src/test/resources/tpcds/q10.sql new file mode 100755 index 0000000000..5500e1aea1 --- /dev/null +++ b/plugin-spark/src/test/resources/tpcds/q10.sql @@ -0,0 +1,57 @@ +SELECT + cd_gender, + cd_marital_status, + cd_education_status, + count(*) cnt1, + cd_purchase_estimate, + count(*) cnt2, + cd_credit_rating, + count(*) cnt3, + cd_dep_count, + count(*) cnt4, + cd_dep_employed_count, + count(*) cnt5, + cd_dep_college_count, + count(*) cnt6 +FROM + customer c, customer_address ca, customer_demographics +WHERE + c.c_current_addr_sk = ca.ca_address_sk AND + ca_county IN ('Rush County', 'Toole County', 'Jefferson County', + 'Dona Ana County', 'La Porte County') AND + cd_demo_sk = c.c_current_cdemo_sk AND + exists(SELECT * + FROM store_sales, date_dim + WHERE c.c_customer_sk = ss_customer_sk AND + ss_sold_date_sk = d_date_sk AND + d_year = 2002 AND + d_moy BETWEEN 1 AND 1 + 3) AND + (exists(SELECT * + FROM web_sales, date_dim + WHERE c.c_customer_sk = ws_bill_customer_sk AND + ws_sold_date_sk = d_date_sk AND + d_year = 2002 AND + d_moy BETWEEN 1 AND 1 + 3) OR + exists(SELECT * + FROM catalog_sales, date_dim + WHERE c.c_customer_sk = cs_ship_customer_sk AND + cs_sold_date_sk = d_date_sk AND + d_year = 2002 AND + d_moy BETWEEN 1 AND 1 + 3)) +GROUP BY cd_gender, + cd_marital_status, + cd_education_status, + cd_purchase_estimate, + cd_credit_rating, + cd_dep_count, + cd_dep_employed_count, + cd_dep_college_count +ORDER BY cd_gender, + cd_marital_status, + cd_education_status, + cd_purchase_estimate, + cd_credit_rating, + cd_dep_count, + cd_dep_employed_count, + cd_dep_college_count +LIMIT 100 diff --git a/plugin-spark/src/test/resources/tpcds/q11.sql b/plugin-spark/src/test/resources/tpcds/q11.sql new file mode 100755 index 0000000000..3618fb14fa --- /dev/null +++ b/plugin-spark/src/test/resources/tpcds/q11.sql @@ -0,0 +1,68 @@ +WITH year_total AS ( + SELECT + c_customer_id customer_id, + c_first_name customer_first_name, + c_last_name customer_last_name, + c_preferred_cust_flag customer_preferred_cust_flag, + c_birth_country customer_birth_country, + c_login customer_login, + c_email_address customer_email_address, + d_year dyear, + sum(ss_ext_list_price - ss_ext_discount_amt) year_total, + 's' sale_type + FROM customer, store_sales, date_dim + WHERE c_customer_sk = ss_customer_sk + AND ss_sold_date_sk = d_date_sk + GROUP BY c_customer_id + , c_first_name + , c_last_name + , d_year + , c_preferred_cust_flag + , c_birth_country + , c_login + , c_email_address + , d_year + UNION ALL + SELECT + c_customer_id customer_id, + c_first_name customer_first_name, + c_last_name customer_last_name, + c_preferred_cust_flag customer_preferred_cust_flag, + c_birth_country customer_birth_country, + c_login customer_login, + c_email_address customer_email_address, + d_year dyear, + sum(ws_ext_list_price - ws_ext_discount_amt) year_total, + 'w' sale_type + FROM customer, web_sales, date_dim + WHERE c_customer_sk = ws_bill_customer_sk + AND ws_sold_date_sk = d_date_sk + GROUP BY + c_customer_id, c_first_name, c_last_name, c_preferred_cust_flag, c_birth_country, + c_login, c_email_address, d_year) +SELECT t_s_secyear.customer_preferred_cust_flag +FROM year_total t_s_firstyear + , year_total t_s_secyear + , year_total t_w_firstyear + , year_total t_w_secyear +WHERE t_s_secyear.customer_id = t_s_firstyear.customer_id + AND t_s_firstyear.customer_id = t_w_secyear.customer_id 
+ AND t_s_firstyear.customer_id = t_w_firstyear.customer_id + AND t_s_firstyear.sale_type = 's' + AND t_w_firstyear.sale_type = 'w' + AND t_s_secyear.sale_type = 's' + AND t_w_secyear.sale_type = 'w' + AND t_s_firstyear.dyear = 2001 + AND t_s_secyear.dyear = 2001 + 1 + AND t_w_firstyear.dyear = 2001 + AND t_w_secyear.dyear = 2001 + 1 + AND t_s_firstyear.year_total > 0 + AND t_w_firstyear.year_total > 0 + AND CASE WHEN t_w_firstyear.year_total > 0 + THEN t_w_secyear.year_total / t_w_firstyear.year_total + ELSE NULL END + > CASE WHEN t_s_firstyear.year_total > 0 + THEN t_s_secyear.year_total / t_s_firstyear.year_total + ELSE NULL END +ORDER BY t_s_secyear.customer_preferred_cust_flag +LIMIT 100 diff --git a/plugin-spark/src/test/resources/tpcds/q12.sql b/plugin-spark/src/test/resources/tpcds/q12.sql new file mode 100755 index 0000000000..0382737f5a --- /dev/null +++ b/plugin-spark/src/test/resources/tpcds/q12.sql @@ -0,0 +1,22 @@ +SELECT + i_item_desc, + i_category, + i_class, + i_current_price, + sum(ws_ext_sales_price) AS itemrevenue, + sum(ws_ext_sales_price) * 100 / sum(sum(ws_ext_sales_price)) + OVER + (PARTITION BY i_class) AS revenueratio +FROM + web_sales, item, date_dim +WHERE + ws_item_sk = i_item_sk + AND i_category IN ('Sports', 'Books', 'Home') + AND ws_sold_date_sk = d_date_sk + AND d_date BETWEEN cast('1999-02-22' AS DATE) + AND (cast('1999-02-22' AS DATE) + INTERVAL 30 days) +GROUP BY + i_item_id, i_item_desc, i_category, i_class, i_current_price +ORDER BY + i_category, i_class, i_item_id, i_item_desc, revenueratio +LIMIT 100 diff --git a/plugin-spark/src/test/resources/tpcds/q13.sql b/plugin-spark/src/test/resources/tpcds/q13.sql new file mode 100755 index 0000000000..32dc9e2609 --- /dev/null +++ b/plugin-spark/src/test/resources/tpcds/q13.sql @@ -0,0 +1,49 @@ +SELECT + avg(ss_quantity), + avg(ss_ext_sales_price), + avg(ss_ext_wholesale_cost), + sum(ss_ext_wholesale_cost) +FROM store_sales + , store + , customer_demographics + , household_demographics + , customer_address + , date_dim +WHERE s_store_sk = ss_store_sk + AND ss_sold_date_sk = d_date_sk AND d_year = 2001 + AND ((ss_hdemo_sk = hd_demo_sk + AND cd_demo_sk = ss_cdemo_sk + AND cd_marital_status = 'M' + AND cd_education_status = 'Advanced Degree' + AND ss_sales_price BETWEEN 100.00 AND 150.00 + AND hd_dep_count = 3 +) OR + (ss_hdemo_sk = hd_demo_sk + AND cd_demo_sk = ss_cdemo_sk + AND cd_marital_status = 'S' + AND cd_education_status = 'College' + AND ss_sales_price BETWEEN 50.00 AND 100.00 + AND hd_dep_count = 1 + ) OR + (ss_hdemo_sk = hd_demo_sk + AND cd_demo_sk = ss_cdemo_sk + AND cd_marital_status = 'W' + AND cd_education_status = '2 yr Degree' + AND ss_sales_price BETWEEN 150.00 AND 200.00 + AND hd_dep_count = 1 + )) + AND ((ss_addr_sk = ca_address_sk + AND ca_country = 'United States' + AND ca_state IN ('TX', 'OH', 'TX') + AND ss_net_profit BETWEEN 100 AND 200 +) OR + (ss_addr_sk = ca_address_sk + AND ca_country = 'United States' + AND ca_state IN ('OR', 'NM', 'KY') + AND ss_net_profit BETWEEN 150 AND 300 + ) OR + (ss_addr_sk = ca_address_sk + AND ca_country = 'United States' + AND ca_state IN ('VA', 'TX', 'MS') + AND ss_net_profit BETWEEN 50 AND 250 + )) diff --git a/plugin-spark/src/test/resources/tpcds/q14a.sql b/plugin-spark/src/test/resources/tpcds/q14a.sql new file mode 100755 index 0000000000..954ddd41be --- /dev/null +++ b/plugin-spark/src/test/resources/tpcds/q14a.sql @@ -0,0 +1,120 @@ +WITH cross_items AS +(SELECT i_item_sk ss_item_sk + FROM item, + (SELECT + iss.i_brand_id brand_id, + iss.i_class_id 
class_id, + iss.i_category_id category_id + FROM store_sales, item iss, date_dim d1 + WHERE ss_item_sk = iss.i_item_sk + AND ss_sold_date_sk = d1.d_date_sk + AND d1.d_year BETWEEN 1999 AND 1999 + 2 + INTERSECT + SELECT + ics.i_brand_id, + ics.i_class_id, + ics.i_category_id + FROM catalog_sales, item ics, date_dim d2 + WHERE cs_item_sk = ics.i_item_sk + AND cs_sold_date_sk = d2.d_date_sk + AND d2.d_year BETWEEN 1999 AND 1999 + 2 + INTERSECT + SELECT + iws.i_brand_id, + iws.i_class_id, + iws.i_category_id + FROM web_sales, item iws, date_dim d3 + WHERE ws_item_sk = iws.i_item_sk + AND ws_sold_date_sk = d3.d_date_sk + AND d3.d_year BETWEEN 1999 AND 1999 + 2) x + WHERE i_brand_id = brand_id + AND i_class_id = class_id + AND i_category_id = category_id +), + avg_sales AS + (SELECT avg(quantity * list_price) average_sales + FROM ( + SELECT + ss_quantity quantity, + ss_list_price list_price + FROM store_sales, date_dim + WHERE ss_sold_date_sk = d_date_sk + AND d_year BETWEEN 1999 AND 2001 + UNION ALL + SELECT + cs_quantity quantity, + cs_list_price list_price + FROM catalog_sales, date_dim + WHERE cs_sold_date_sk = d_date_sk + AND d_year BETWEEN 1999 AND 1999 + 2 + UNION ALL + SELECT + ws_quantity quantity, + ws_list_price list_price + FROM web_sales, date_dim + WHERE ws_sold_date_sk = d_date_sk + AND d_year BETWEEN 1999 AND 1999 + 2) x) +SELECT + channel, + i_brand_id, + i_class_id, + i_category_id, + sum(sales), + sum(number_sales) +FROM ( + SELECT + 'store' channel, + i_brand_id, + i_class_id, + i_category_id, + sum(ss_quantity * ss_list_price) sales, + count(*) number_sales + FROM store_sales, item, date_dim + WHERE ss_item_sk IN (SELECT ss_item_sk + FROM cross_items) + AND ss_item_sk = i_item_sk + AND ss_sold_date_sk = d_date_sk + AND d_year = 1999 + 2 + AND d_moy = 11 + GROUP BY i_brand_id, i_class_id, i_category_id + HAVING sum(ss_quantity * ss_list_price) > (SELECT average_sales + FROM avg_sales) + UNION ALL + SELECT + 'catalog' channel, + i_brand_id, + i_class_id, + i_category_id, + sum(cs_quantity * cs_list_price) sales, + count(*) number_sales + FROM catalog_sales, item, date_dim + WHERE cs_item_sk IN (SELECT ss_item_sk + FROM cross_items) + AND cs_item_sk = i_item_sk + AND cs_sold_date_sk = d_date_sk + AND d_year = 1999 + 2 + AND d_moy = 11 + GROUP BY i_brand_id, i_class_id, i_category_id + HAVING sum(cs_quantity * cs_list_price) > (SELECT average_sales FROM avg_sales) + UNION ALL + SELECT + 'web' channel, + i_brand_id, + i_class_id, + i_category_id, + sum(ws_quantity * ws_list_price) sales, + count(*) number_sales + FROM web_sales, item, date_dim + WHERE ws_item_sk IN (SELECT ss_item_sk + FROM cross_items) + AND ws_item_sk = i_item_sk + AND ws_sold_date_sk = d_date_sk + AND d_year = 1999 + 2 + AND d_moy = 11 + GROUP BY i_brand_id, i_class_id, i_category_id + HAVING sum(ws_quantity * ws_list_price) > (SELECT average_sales + FROM avg_sales) + ) y +GROUP BY ROLLUP (channel, i_brand_id, i_class_id, i_category_id) +ORDER BY channel, i_brand_id, i_class_id, i_category_id +LIMIT 100 diff --git a/plugin-spark/src/test/resources/tpcds/q14b.sql b/plugin-spark/src/test/resources/tpcds/q14b.sql new file mode 100755 index 0000000000..929a8484bf --- /dev/null +++ b/plugin-spark/src/test/resources/tpcds/q14b.sql @@ -0,0 +1,95 @@ +WITH cross_items AS +(SELECT i_item_sk ss_item_sk + FROM item, + (SELECT + iss.i_brand_id brand_id, + iss.i_class_id class_id, + iss.i_category_id category_id + FROM store_sales, item iss, date_dim d1 + WHERE ss_item_sk = iss.i_item_sk + AND ss_sold_date_sk = d1.d_date_sk 
+ AND d1.d_year BETWEEN 1999 AND 1999 + 2 + INTERSECT + SELECT + ics.i_brand_id, + ics.i_class_id, + ics.i_category_id + FROM catalog_sales, item ics, date_dim d2 + WHERE cs_item_sk = ics.i_item_sk + AND cs_sold_date_sk = d2.d_date_sk + AND d2.d_year BETWEEN 1999 AND 1999 + 2 + INTERSECT + SELECT + iws.i_brand_id, + iws.i_class_id, + iws.i_category_id + FROM web_sales, item iws, date_dim d3 + WHERE ws_item_sk = iws.i_item_sk + AND ws_sold_date_sk = d3.d_date_sk + AND d3.d_year BETWEEN 1999 AND 1999 + 2) x + WHERE i_brand_id = brand_id + AND i_class_id = class_id + AND i_category_id = category_id +), + avg_sales AS + (SELECT avg(quantity * list_price) average_sales + FROM (SELECT + ss_quantity quantity, + ss_list_price list_price + FROM store_sales, date_dim + WHERE ss_sold_date_sk = d_date_sk AND d_year BETWEEN 1999 AND 1999 + 2 + UNION ALL + SELECT + cs_quantity quantity, + cs_list_price list_price + FROM catalog_sales, date_dim + WHERE cs_sold_date_sk = d_date_sk AND d_year BETWEEN 1999 AND 1999 + 2 + UNION ALL + SELECT + ws_quantity quantity, + ws_list_price list_price + FROM web_sales, date_dim + WHERE ws_sold_date_sk = d_date_sk AND d_year BETWEEN 1999 AND 1999 + 2) x) +SELECT * +FROM + (SELECT + 'store' channel, + i_brand_id, + i_class_id, + i_category_id, + sum(ss_quantity * ss_list_price) sales, + count(*) number_sales + FROM store_sales, item, date_dim + WHERE ss_item_sk IN (SELECT ss_item_sk + FROM cross_items) + AND ss_item_sk = i_item_sk + AND ss_sold_date_sk = d_date_sk + AND d_week_seq = (SELECT d_week_seq + FROM date_dim + WHERE d_year = 1999 + 1 AND d_moy = 12 AND d_dom = 11) + GROUP BY i_brand_id, i_class_id, i_category_id + HAVING sum(ss_quantity * ss_list_price) > (SELECT average_sales + FROM avg_sales)) this_year, + (SELECT + 'store' channel, + i_brand_id, + i_class_id, + i_category_id, + sum(ss_quantity * ss_list_price) sales, + count(*) number_sales + FROM store_sales, item, date_dim + WHERE ss_item_sk IN (SELECT ss_item_sk + FROM cross_items) + AND ss_item_sk = i_item_sk + AND ss_sold_date_sk = d_date_sk + AND d_week_seq = (SELECT d_week_seq + FROM date_dim + WHERE d_year = 1999 AND d_moy = 12 AND d_dom = 11) + GROUP BY i_brand_id, i_class_id, i_category_id + HAVING sum(ss_quantity * ss_list_price) > (SELECT average_sales + FROM avg_sales)) last_year +WHERE this_year.i_brand_id = last_year.i_brand_id + AND this_year.i_class_id = last_year.i_class_id + AND this_year.i_category_id = last_year.i_category_id +ORDER BY this_year.channel, this_year.i_brand_id, this_year.i_class_id, this_year.i_category_id +LIMIT 100 diff --git a/plugin-spark/src/test/resources/tpcds/q15.sql b/plugin-spark/src/test/resources/tpcds/q15.sql new file mode 100755 index 0000000000..b8182e23b0 --- /dev/null +++ b/plugin-spark/src/test/resources/tpcds/q15.sql @@ -0,0 +1,15 @@ +SELECT + ca_zip, + sum(cs_sales_price) +FROM catalog_sales, customer, customer_address, date_dim +WHERE cs_bill_customer_sk = c_customer_sk + AND c_current_addr_sk = ca_address_sk + AND (substr(ca_zip, 1, 5) IN ('85669', '86197', '88274', '83405', '86475', + '85392', '85460', '80348', '81792') + OR ca_state IN ('CA', 'WA', 'GA') + OR cs_sales_price > 500) + AND cs_sold_date_sk = d_date_sk + AND d_qoy = 2 AND d_year = 2001 +GROUP BY ca_zip +ORDER BY ca_zip +LIMIT 100 diff --git a/plugin-spark/src/test/resources/tpcds/q16.sql b/plugin-spark/src/test/resources/tpcds/q16.sql new file mode 100755 index 0000000000..732ad0d848 --- /dev/null +++ b/plugin-spark/src/test/resources/tpcds/q16.sql @@ -0,0 +1,23 @@ +SELECT + 
count(DISTINCT cs_order_number) AS `order count `, + sum(cs_ext_ship_cost) AS `total shipping cost `, + sum(cs_net_profit) AS `total net profit ` +FROM + catalog_sales cs1, date_dim, customer_address, call_center +WHERE + d_date BETWEEN '2002-02-01' AND (CAST('2002-02-01' AS DATE) + INTERVAL 60 days) + AND cs1.cs_ship_date_sk = d_date_sk + AND cs1.cs_ship_addr_sk = ca_address_sk + AND ca_state = 'GA' + AND cs1.cs_call_center_sk = cc_call_center_sk + AND cc_county IN + ('Williamson County', 'Williamson County', 'Williamson County', 'Williamson County', 'Williamson County') + AND EXISTS(SELECT * + FROM catalog_sales cs2 + WHERE cs1.cs_order_number = cs2.cs_order_number + AND cs1.cs_warehouse_sk <> cs2.cs_warehouse_sk) + AND NOT EXISTS(SELECT * + FROM catalog_returns cr1 + WHERE cs1.cs_order_number = cr1.cr_order_number) +ORDER BY count(DISTINCT cs_order_number) +LIMIT 100 diff --git a/plugin-spark/src/test/resources/tpcds/q17.sql b/plugin-spark/src/test/resources/tpcds/q17.sql new file mode 100755 index 0000000000..4d647f7956 --- /dev/null +++ b/plugin-spark/src/test/resources/tpcds/q17.sql @@ -0,0 +1,33 @@ +SELECT + i_item_id, + i_item_desc, + s_state, + count(ss_quantity) AS store_sales_quantitycount, + avg(ss_quantity) AS store_sales_quantityave, + stddev_samp(ss_quantity) AS store_sales_quantitystdev, + stddev_samp(ss_quantity) / avg(ss_quantity) AS store_sales_quantitycov, + count(sr_return_quantity) as_store_returns_quantitycount, + avg(sr_return_quantity) as_store_returns_quantityave, + stddev_samp(sr_return_quantity) as_store_returns_quantitystdev, + stddev_samp(sr_return_quantity) / avg(sr_return_quantity) AS store_returns_quantitycov, + count(cs_quantity) AS catalog_sales_quantitycount, + avg(cs_quantity) AS catalog_sales_quantityave, + stddev_samp(cs_quantity) / avg(cs_quantity) AS catalog_sales_quantitystdev, + stddev_samp(cs_quantity) / avg(cs_quantity) AS catalog_sales_quantitycov +FROM store_sales, store_returns, catalog_sales, date_dim d1, date_dim d2, date_dim d3, store, item +WHERE d1.d_quarter_name = '2001Q1' + AND d1.d_date_sk = ss_sold_date_sk + AND i_item_sk = ss_item_sk + AND s_store_sk = ss_store_sk + AND ss_customer_sk = sr_customer_sk + AND ss_item_sk = sr_item_sk + AND ss_ticket_number = sr_ticket_number + AND sr_returned_date_sk = d2.d_date_sk + AND d2.d_quarter_name IN ('2001Q1', '2001Q2', '2001Q3') + AND sr_customer_sk = cs_bill_customer_sk + AND sr_item_sk = cs_item_sk + AND cs_sold_date_sk = d3.d_date_sk + AND d3.d_quarter_name IN ('2001Q1', '2001Q2', '2001Q3') +GROUP BY i_item_id, i_item_desc, s_state +ORDER BY i_item_id, i_item_desc, s_state +LIMIT 100 diff --git a/plugin-spark/src/test/resources/tpcds/q18.sql b/plugin-spark/src/test/resources/tpcds/q18.sql new file mode 100755 index 0000000000..4055c80fde --- /dev/null +++ b/plugin-spark/src/test/resources/tpcds/q18.sql @@ -0,0 +1,28 @@ +SELECT + i_item_id, + ca_country, + ca_state, + ca_county, + avg(cast(cs_quantity AS DECIMAL(12, 2))) agg1, + avg(cast(cs_list_price AS DECIMAL(12, 2))) agg2, + avg(cast(cs_coupon_amt AS DECIMAL(12, 2))) agg3, + avg(cast(cs_sales_price AS DECIMAL(12, 2))) agg4, + avg(cast(cs_net_profit AS DECIMAL(12, 2))) agg5, + avg(cast(c_birth_year AS DECIMAL(12, 2))) agg6, + avg(cast(cd1.cd_dep_count AS DECIMAL(12, 2))) agg7 +FROM catalog_sales, customer_demographics cd1, + customer_demographics cd2, customer, customer_address, date_dim, item +WHERE cs_sold_date_sk = d_date_sk AND + cs_item_sk = i_item_sk AND + cs_bill_cdemo_sk = cd1.cd_demo_sk AND + cs_bill_customer_sk = c_customer_sk 
AND + cd1.cd_gender = 'F' AND + cd1.cd_education_status = 'Unknown' AND + c_current_cdemo_sk = cd2.cd_demo_sk AND + c_current_addr_sk = ca_address_sk AND + c_birth_month IN (1, 6, 8, 9, 12, 2) AND + d_year = 1998 AND + ca_state IN ('MS', 'IN', 'ND', 'OK', 'NM', 'VA', 'MS') +GROUP BY ROLLUP (i_item_id, ca_country, ca_state, ca_county) +ORDER BY ca_country, ca_state, ca_county, i_item_id +LIMIT 100 diff --git a/plugin-spark/src/test/resources/tpcds/q19.sql b/plugin-spark/src/test/resources/tpcds/q19.sql new file mode 100755 index 0000000000..e38ab7f268 --- /dev/null +++ b/plugin-spark/src/test/resources/tpcds/q19.sql @@ -0,0 +1,19 @@ +SELECT + i_brand_id brand_id, + i_brand brand, + i_manufact_id, + i_manufact, + sum(ss_ext_sales_price) ext_price +FROM date_dim, store_sales, item, customer, customer_address, store +WHERE d_date_sk = ss_sold_date_sk + AND ss_item_sk = i_item_sk + AND i_manager_id = 8 + AND d_moy = 11 + AND d_year = 1998 + AND ss_customer_sk = c_customer_sk + AND c_current_addr_sk = ca_address_sk + AND substr(ca_zip, 1, 5) <> substr(s_zip, 1, 5) + AND ss_store_sk = s_store_sk +GROUP BY i_brand, i_brand_id, i_manufact_id, i_manufact +ORDER BY ext_price DESC, brand, brand_id, i_manufact_id, i_manufact +LIMIT 100 diff --git a/plugin-spark/src/test/resources/tpcds/q2.sql b/plugin-spark/src/test/resources/tpcds/q2.sql new file mode 100755 index 0000000000..52c0e90c46 --- /dev/null +++ b/plugin-spark/src/test/resources/tpcds/q2.sql @@ -0,0 +1,81 @@ +WITH wscs AS +( SELECT + sold_date_sk, + sales_price + FROM (SELECT + ws_sold_date_sk sold_date_sk, + ws_ext_sales_price sales_price + FROM web_sales) x + UNION ALL + (SELECT + cs_sold_date_sk sold_date_sk, + cs_ext_sales_price sales_price + FROM catalog_sales)), + wswscs AS + ( SELECT + d_week_seq, + sum(CASE WHEN (d_day_name = 'Sunday') + THEN sales_price + ELSE NULL END) + sun_sales, + sum(CASE WHEN (d_day_name = 'Monday') + THEN sales_price + ELSE NULL END) + mon_sales, + sum(CASE WHEN (d_day_name = 'Tuesday') + THEN sales_price + ELSE NULL END) + tue_sales, + sum(CASE WHEN (d_day_name = 'Wednesday') + THEN sales_price + ELSE NULL END) + wed_sales, + sum(CASE WHEN (d_day_name = 'Thursday') + THEN sales_price + ELSE NULL END) + thu_sales, + sum(CASE WHEN (d_day_name = 'Friday') + THEN sales_price + ELSE NULL END) + fri_sales, + sum(CASE WHEN (d_day_name = 'Saturday') + THEN sales_price + ELSE NULL END) + sat_sales + FROM wscs, date_dim + WHERE d_date_sk = sold_date_sk + GROUP BY d_week_seq) +SELECT + d_week_seq1, + round(sun_sales1 / sun_sales2, 2), + round(mon_sales1 / mon_sales2, 2), + round(tue_sales1 / tue_sales2, 2), + round(wed_sales1 / wed_sales2, 2), + round(thu_sales1 / thu_sales2, 2), + round(fri_sales1 / fri_sales2, 2), + round(sat_sales1 / sat_sales2, 2) +FROM + (SELECT + wswscs.d_week_seq d_week_seq1, + sun_sales sun_sales1, + mon_sales mon_sales1, + tue_sales tue_sales1, + wed_sales wed_sales1, + thu_sales thu_sales1, + fri_sales fri_sales1, + sat_sales sat_sales1 + FROM wswscs, date_dim + WHERE date_dim.d_week_seq = wswscs.d_week_seq AND d_year = 2001) y, + (SELECT + wswscs.d_week_seq d_week_seq2, + sun_sales sun_sales2, + mon_sales mon_sales2, + tue_sales tue_sales2, + wed_sales wed_sales2, + thu_sales thu_sales2, + fri_sales fri_sales2, + sat_sales sat_sales2 + FROM wswscs, date_dim + WHERE date_dim.d_week_seq = wswscs.d_week_seq AND d_year = 2001 + 1) z +WHERE d_week_seq1 = d_week_seq2 - 53 +ORDER BY d_week_seq1 diff --git a/plugin-spark/src/test/resources/tpcds/q20.sql b/plugin-spark/src/test/resources/tpcds/q20.sql 
new file mode 100755 index 0000000000..7ac6c7a75d --- /dev/null +++ b/plugin-spark/src/test/resources/tpcds/q20.sql @@ -0,0 +1,18 @@ +SELECT + i_item_desc, + i_category, + i_class, + i_current_price, + sum(cs_ext_sales_price) AS itemrevenue, + sum(cs_ext_sales_price) * 100 / sum(sum(cs_ext_sales_price)) + OVER + (PARTITION BY i_class) AS revenueratio +FROM catalog_sales, item, date_dim +WHERE cs_item_sk = i_item_sk + AND i_category IN ('Sports', 'Books', 'Home') + AND cs_sold_date_sk = d_date_sk + AND d_date BETWEEN cast('1999-02-22' AS DATE) +AND (cast('1999-02-22' AS DATE) + INTERVAL 30 days) +GROUP BY i_item_id, i_item_desc, i_category, i_class, i_current_price +ORDER BY i_category, i_class, i_item_id, i_item_desc, revenueratio +LIMIT 100 diff --git a/plugin-spark/src/test/resources/tpcds/q21.sql b/plugin-spark/src/test/resources/tpcds/q21.sql new file mode 100755 index 0000000000..550881143f --- /dev/null +++ b/plugin-spark/src/test/resources/tpcds/q21.sql @@ -0,0 +1,25 @@ +SELECT * +FROM ( + SELECT + w_warehouse_name, + i_item_id, + sum(CASE WHEN (cast(d_date AS DATE) < cast('2000-03-11' AS DATE)) + THEN inv_quantity_on_hand + ELSE 0 END) AS inv_before, + sum(CASE WHEN (cast(d_date AS DATE) >= cast('2000-03-11' AS DATE)) + THEN inv_quantity_on_hand + ELSE 0 END) AS inv_after + FROM inventory, warehouse, item, date_dim + WHERE i_current_price BETWEEN 0.99 AND 1.49 + AND i_item_sk = inv_item_sk + AND inv_warehouse_sk = w_warehouse_sk + AND inv_date_sk = d_date_sk + AND d_date BETWEEN (cast('2000-03-11' AS DATE) - INTERVAL 30 days) + AND (cast('2000-03-11' AS DATE) + INTERVAL 30 days) + GROUP BY w_warehouse_name, i_item_id) x +WHERE (CASE WHEN inv_before > 0 + THEN inv_after / inv_before + ELSE NULL + END) BETWEEN 2.0 / 3.0 AND 3.0 / 2.0 +ORDER BY w_warehouse_name, i_item_id +LIMIT 100 diff --git a/plugin-spark/src/test/resources/tpcds/q22.sql b/plugin-spark/src/test/resources/tpcds/q22.sql new file mode 100755 index 0000000000..add3b41f7c --- /dev/null +++ b/plugin-spark/src/test/resources/tpcds/q22.sql @@ -0,0 +1,14 @@ +SELECT + i_product_name, + i_brand, + i_class, + i_category, + avg(inv_quantity_on_hand) qoh +FROM inventory, date_dim, item, warehouse +WHERE inv_date_sk = d_date_sk + AND inv_item_sk = i_item_sk + AND inv_warehouse_sk = w_warehouse_sk + AND d_month_seq BETWEEN 1200 AND 1200 + 11 +GROUP BY ROLLUP (i_product_name, i_brand, i_class, i_category) +ORDER BY qoh, i_product_name, i_brand, i_class, i_category +LIMIT 100 diff --git a/plugin-spark/src/test/resources/tpcds/q23a.sql b/plugin-spark/src/test/resources/tpcds/q23a.sql new file mode 100755 index 0000000000..37791f6433 --- /dev/null +++ b/plugin-spark/src/test/resources/tpcds/q23a.sql @@ -0,0 +1,53 @@ +WITH frequent_ss_items AS +(SELECT + substr(i_item_desc, 1, 30) itemdesc, + i_item_sk item_sk, + d_date solddate, + count(*) cnt + FROM store_sales, date_dim, item + WHERE ss_sold_date_sk = d_date_sk + AND ss_item_sk = i_item_sk + AND d_year IN (2000, 2000 + 1, 2000 + 2, 2000 + 3) + GROUP BY substr(i_item_desc, 1, 30), i_item_sk, d_date + HAVING count(*) > 4), + max_store_sales AS + (SELECT max(csales) tpcds_cmax + FROM (SELECT + c_customer_sk, + sum(ss_quantity * ss_sales_price) csales + FROM store_sales, customer, date_dim + WHERE ss_customer_sk = c_customer_sk + AND ss_sold_date_sk = d_date_sk + AND d_year IN (2000, 2000 + 1, 2000 + 2, 2000 + 3) + GROUP BY c_customer_sk) x), + best_ss_customer AS + (SELECT + c_customer_sk, + sum(ss_quantity * ss_sales_price) ssales + FROM store_sales, customer + WHERE ss_customer_sk = 
c_customer_sk + GROUP BY c_customer_sk + HAVING sum(ss_quantity * ss_sales_price) > (50 / 100.0) * + (SELECT * + FROM max_store_sales)) +SELECT sum(sales) +FROM ((SELECT cs_quantity * cs_list_price sales +FROM catalog_sales, date_dim +WHERE d_year = 2000 + AND d_moy = 2 + AND cs_sold_date_sk = d_date_sk + AND cs_item_sk IN (SELECT item_sk +FROM frequent_ss_items) + AND cs_bill_customer_sk IN (SELECT c_customer_sk +FROM best_ss_customer)) + UNION ALL + (SELECT ws_quantity * ws_list_price sales + FROM web_sales, date_dim + WHERE d_year = 2000 + AND d_moy = 2 + AND ws_sold_date_sk = d_date_sk + AND ws_item_sk IN (SELECT item_sk + FROM frequent_ss_items) + AND ws_bill_customer_sk IN (SELECT c_customer_sk + FROM best_ss_customer))) y +LIMIT 100 diff --git a/plugin-spark/src/test/resources/tpcds/q23b.sql b/plugin-spark/src/test/resources/tpcds/q23b.sql new file mode 100755 index 0000000000..01150197af --- /dev/null +++ b/plugin-spark/src/test/resources/tpcds/q23b.sql @@ -0,0 +1,68 @@ +WITH frequent_ss_items AS +(SELECT + substr(i_item_desc, 1, 30) itemdesc, + i_item_sk item_sk, + d_date solddate, + count(*) cnt + FROM store_sales, date_dim, item + WHERE ss_sold_date_sk = d_date_sk + AND ss_item_sk = i_item_sk + AND d_year IN (2000, 2000 + 1, 2000 + 2, 2000 + 3) + GROUP BY substr(i_item_desc, 1, 30), i_item_sk, d_date + HAVING count(*) > 4), + max_store_sales AS + (SELECT max(csales) tpcds_cmax + FROM (SELECT + c_customer_sk, + sum(ss_quantity * ss_sales_price) csales + FROM store_sales, customer, date_dim + WHERE ss_customer_sk = c_customer_sk + AND ss_sold_date_sk = d_date_sk + AND d_year IN (2000, 2000 + 1, 2000 + 2, 2000 + 3) + GROUP BY c_customer_sk) x), + best_ss_customer AS + (SELECT + c_customer_sk, + sum(ss_quantity * ss_sales_price) ssales + FROM store_sales + , customer + WHERE ss_customer_sk = c_customer_sk + GROUP BY c_customer_sk + HAVING sum(ss_quantity * ss_sales_price) > (50 / 100.0) * + (SELECT * + FROM max_store_sales)) +SELECT + c_last_name, + c_first_name, + sales +FROM ((SELECT + c_last_name, + c_first_name, + sum(cs_quantity * cs_list_price) sales +FROM catalog_sales, customer, date_dim +WHERE d_year = 2000 + AND d_moy = 2 + AND cs_sold_date_sk = d_date_sk + AND cs_item_sk IN (SELECT item_sk +FROM frequent_ss_items) + AND cs_bill_customer_sk IN (SELECT c_customer_sk +FROM best_ss_customer) + AND cs_bill_customer_sk = c_customer_sk +GROUP BY c_last_name, c_first_name) + UNION ALL + (SELECT + c_last_name, + c_first_name, + sum(ws_quantity * ws_list_price) sales + FROM web_sales, customer, date_dim + WHERE d_year = 2000 + AND d_moy = 2 + AND ws_sold_date_sk = d_date_sk + AND ws_item_sk IN (SELECT item_sk + FROM frequent_ss_items) + AND ws_bill_customer_sk IN (SELECT c_customer_sk + FROM best_ss_customer) + AND ws_bill_customer_sk = c_customer_sk + GROUP BY c_last_name, c_first_name)) y +ORDER BY c_last_name, c_first_name, sales +LIMIT 100 diff --git a/plugin-spark/src/test/resources/tpcds/q24a.sql b/plugin-spark/src/test/resources/tpcds/q24a.sql new file mode 100755 index 0000000000..bcc1894866 --- /dev/null +++ b/plugin-spark/src/test/resources/tpcds/q24a.sql @@ -0,0 +1,34 @@ +WITH ssales AS +(SELECT + c_last_name, + c_first_name, + s_store_name, + ca_state, + s_state, + i_color, + i_current_price, + i_manager_id, + i_units, + i_size, + sum(ss_net_paid) netpaid + FROM store_sales, store_returns, store, item, customer, customer_address + WHERE ss_ticket_number = sr_ticket_number + AND ss_item_sk = sr_item_sk + AND ss_customer_sk = c_customer_sk + AND ss_item_sk = i_item_sk + 
AND ss_store_sk = s_store_sk + AND c_birth_country = upper(ca_country) + AND s_zip = ca_zip + AND s_market_id = 8 + GROUP BY c_last_name, c_first_name, s_store_name, ca_state, s_state, i_color, + i_current_price, i_manager_id, i_units, i_size) +SELECT + c_last_name, + c_first_name, + s_store_name, + sum(netpaid) paid +FROM ssales +WHERE i_color = 'pale' +GROUP BY c_last_name, c_first_name, s_store_name +HAVING sum(netpaid) > (SELECT 0.05 * avg(netpaid) +FROM ssales) diff --git a/plugin-spark/src/test/resources/tpcds/q24b.sql b/plugin-spark/src/test/resources/tpcds/q24b.sql new file mode 100755 index 0000000000..830eb670bc --- /dev/null +++ b/plugin-spark/src/test/resources/tpcds/q24b.sql @@ -0,0 +1,34 @@ +WITH ssales AS +(SELECT + c_last_name, + c_first_name, + s_store_name, + ca_state, + s_state, + i_color, + i_current_price, + i_manager_id, + i_units, + i_size, + sum(ss_net_paid) netpaid + FROM store_sales, store_returns, store, item, customer, customer_address + WHERE ss_ticket_number = sr_ticket_number + AND ss_item_sk = sr_item_sk + AND ss_customer_sk = c_customer_sk + AND ss_item_sk = i_item_sk + AND ss_store_sk = s_store_sk + AND c_birth_country = upper(ca_country) + AND s_zip = ca_zip + AND s_market_id = 8 + GROUP BY c_last_name, c_first_name, s_store_name, ca_state, s_state, + i_color, i_current_price, i_manager_id, i_units, i_size) +SELECT + c_last_name, + c_first_name, + s_store_name, + sum(netpaid) paid +FROM ssales +WHERE i_color = 'chiffon' +GROUP BY c_last_name, c_first_name, s_store_name +HAVING sum(netpaid) > (SELECT 0.05 * avg(netpaid) +FROM ssales) diff --git a/plugin-spark/src/test/resources/tpcds/q25.sql b/plugin-spark/src/test/resources/tpcds/q25.sql new file mode 100755 index 0000000000..a4d78a3c56 --- /dev/null +++ b/plugin-spark/src/test/resources/tpcds/q25.sql @@ -0,0 +1,33 @@ +SELECT + i_item_id, + i_item_desc, + s_store_id, + s_store_name, + sum(ss_net_profit) AS store_sales_profit, + sum(sr_net_loss) AS store_returns_loss, + sum(cs_net_profit) AS catalog_sales_profit +FROM + store_sales, store_returns, catalog_sales, date_dim d1, date_dim d2, date_dim d3, + store, item +WHERE + d1.d_moy = 4 + AND d1.d_year = 2001 + AND d1.d_date_sk = ss_sold_date_sk + AND i_item_sk = ss_item_sk + AND s_store_sk = ss_store_sk + AND ss_customer_sk = sr_customer_sk + AND ss_item_sk = sr_item_sk + AND ss_ticket_number = sr_ticket_number + AND sr_returned_date_sk = d2.d_date_sk + AND d2.d_moy BETWEEN 4 AND 10 + AND d2.d_year = 2001 + AND sr_customer_sk = cs_bill_customer_sk + AND sr_item_sk = cs_item_sk + AND cs_sold_date_sk = d3.d_date_sk + AND d3.d_moy BETWEEN 4 AND 10 + AND d3.d_year = 2001 +GROUP BY + i_item_id, i_item_desc, s_store_id, s_store_name +ORDER BY + i_item_id, i_item_desc, s_store_id, s_store_name +LIMIT 100 \ No newline at end of file diff --git a/plugin-spark/src/test/resources/tpcds/q26.sql b/plugin-spark/src/test/resources/tpcds/q26.sql new file mode 100755 index 0000000000..6d395a1d79 --- /dev/null +++ b/plugin-spark/src/test/resources/tpcds/q26.sql @@ -0,0 +1,19 @@ +SELECT + i_item_id, + avg(cs_quantity) agg1, + avg(cs_list_price) agg2, + avg(cs_coupon_amt) agg3, + avg(cs_sales_price) agg4 +FROM catalog_sales, customer_demographics, date_dim, item, promotion +WHERE cs_sold_date_sk = d_date_sk AND + cs_item_sk = i_item_sk AND + cs_bill_cdemo_sk = cd_demo_sk AND + cs_promo_sk = p_promo_sk AND + cd_gender = 'M' AND + cd_marital_status = 'S' AND + cd_education_status = 'College' AND + (p_channel_email = 'N' OR p_channel_event = 'N') AND + d_year = 2000 +GROUP BY 
i_item_id +ORDER BY i_item_id +LIMIT 100 diff --git a/plugin-spark/src/test/resources/tpcds/q27.sql b/plugin-spark/src/test/resources/tpcds/q27.sql new file mode 100755 index 0000000000..b0e2fd95fd --- /dev/null +++ b/plugin-spark/src/test/resources/tpcds/q27.sql @@ -0,0 +1,21 @@ +SELECT + i_item_id, + s_state, + grouping(s_state) g_state, + avg(ss_quantity) agg1, + avg(ss_list_price) agg2, + avg(ss_coupon_amt) agg3, + avg(ss_sales_price) agg4 +FROM store_sales, customer_demographics, date_dim, store, item +WHERE ss_sold_date_sk = d_date_sk AND + ss_item_sk = i_item_sk AND + ss_store_sk = s_store_sk AND + ss_cdemo_sk = cd_demo_sk AND + cd_gender = 'M' AND + cd_marital_status = 'S' AND + cd_education_status = 'College' AND + d_year = 2002 AND + s_state IN ('TN', 'TN', 'TN', 'TN', 'TN', 'TN') +GROUP BY ROLLUP (i_item_id, s_state) +ORDER BY i_item_id, s_state +LIMIT 100 diff --git a/plugin-spark/src/test/resources/tpcds/q28.sql b/plugin-spark/src/test/resources/tpcds/q28.sql new file mode 100755 index 0000000000..f34c2bb0e3 --- /dev/null +++ b/plugin-spark/src/test/resources/tpcds/q28.sql @@ -0,0 +1,56 @@ +SELECT * +FROM (SELECT + avg(ss_list_price) B1_LP, + count(ss_list_price) B1_CNT, + count(DISTINCT ss_list_price) B1_CNTD +FROM store_sales +WHERE ss_quantity BETWEEN 0 AND 5 + AND (ss_list_price BETWEEN 8 AND 8 + 10 + OR ss_coupon_amt BETWEEN 459 AND 459 + 1000 + OR ss_wholesale_cost BETWEEN 57 AND 57 + 20)) B1, + (SELECT + avg(ss_list_price) B2_LP, + count(ss_list_price) B2_CNT, + count(DISTINCT ss_list_price) B2_CNTD + FROM store_sales + WHERE ss_quantity BETWEEN 6 AND 10 + AND (ss_list_price BETWEEN 90 AND 90 + 10 + OR ss_coupon_amt BETWEEN 2323 AND 2323 + 1000 + OR ss_wholesale_cost BETWEEN 31 AND 31 + 20)) B2, + (SELECT + avg(ss_list_price) B3_LP, + count(ss_list_price) B3_CNT, + count(DISTINCT ss_list_price) B3_CNTD + FROM store_sales + WHERE ss_quantity BETWEEN 11 AND 15 + AND (ss_list_price BETWEEN 142 AND 142 + 10 + OR ss_coupon_amt BETWEEN 12214 AND 12214 + 1000 + OR ss_wholesale_cost BETWEEN 79 AND 79 + 20)) B3, + (SELECT + avg(ss_list_price) B4_LP, + count(ss_list_price) B4_CNT, + count(DISTINCT ss_list_price) B4_CNTD + FROM store_sales + WHERE ss_quantity BETWEEN 16 AND 20 + AND (ss_list_price BETWEEN 135 AND 135 + 10 + OR ss_coupon_amt BETWEEN 6071 AND 6071 + 1000 + OR ss_wholesale_cost BETWEEN 38 AND 38 + 20)) B4, + (SELECT + avg(ss_list_price) B5_LP, + count(ss_list_price) B5_CNT, + count(DISTINCT ss_list_price) B5_CNTD + FROM store_sales + WHERE ss_quantity BETWEEN 21 AND 25 + AND (ss_list_price BETWEEN 122 AND 122 + 10 + OR ss_coupon_amt BETWEEN 836 AND 836 + 1000 + OR ss_wholesale_cost BETWEEN 17 AND 17 + 20)) B5, + (SELECT + avg(ss_list_price) B6_LP, + count(ss_list_price) B6_CNT, + count(DISTINCT ss_list_price) B6_CNTD + FROM store_sales + WHERE ss_quantity BETWEEN 26 AND 30 + AND (ss_list_price BETWEEN 154 AND 154 + 10 + OR ss_coupon_amt BETWEEN 7326 AND 7326 + 1000 + OR ss_wholesale_cost BETWEEN 7 AND 7 + 20)) B6 +LIMIT 100 diff --git a/plugin-spark/src/test/resources/tpcds/q29.sql b/plugin-spark/src/test/resources/tpcds/q29.sql new file mode 100755 index 0000000000..3f1fd553f6 --- /dev/null +++ b/plugin-spark/src/test/resources/tpcds/q29.sql @@ -0,0 +1,32 @@ +SELECT + i_item_id, + i_item_desc, + s_store_id, + s_store_name, + sum(ss_quantity) AS store_sales_quantity, + sum(sr_return_quantity) AS store_returns_quantity, + sum(cs_quantity) AS catalog_sales_quantity +FROM + store_sales, store_returns, catalog_sales, date_dim d1, date_dim d2, + date_dim d3, store, item 
+WHERE + d1.d_moy = 9 + AND d1.d_year = 1999 + AND d1.d_date_sk = ss_sold_date_sk + AND i_item_sk = ss_item_sk + AND s_store_sk = ss_store_sk + AND ss_customer_sk = sr_customer_sk + AND ss_item_sk = sr_item_sk + AND ss_ticket_number = sr_ticket_number + AND sr_returned_date_sk = d2.d_date_sk + AND d2.d_moy BETWEEN 9 AND 9 + 3 + AND d2.d_year = 1999 + AND sr_customer_sk = cs_bill_customer_sk + AND sr_item_sk = cs_item_sk + AND cs_sold_date_sk = d3.d_date_sk + AND d3.d_year IN (1999, 1999 + 1, 1999 + 2) +GROUP BY + i_item_id, i_item_desc, s_store_id, s_store_name +ORDER BY + i_item_id, i_item_desc, s_store_id, s_store_name +LIMIT 100 diff --git a/plugin-spark/src/test/resources/tpcds/q3.sql b/plugin-spark/src/test/resources/tpcds/q3.sql new file mode 100755 index 0000000000..181509df9d --- /dev/null +++ b/plugin-spark/src/test/resources/tpcds/q3.sql @@ -0,0 +1,13 @@ +SELECT + dt.d_year, + item.i_brand_id brand_id, + item.i_brand brand, + SUM(ss_ext_sales_price) sum_agg +FROM date_dim dt, store_sales, item +WHERE dt.d_date_sk = store_sales.ss_sold_date_sk + AND store_sales.ss_item_sk = item.i_item_sk + AND item.i_manufact_id = 128 + AND dt.d_moy = 11 +GROUP BY dt.d_year, item.i_brand, item.i_brand_id +ORDER BY dt.d_year, sum_agg DESC, brand_id +LIMIT 100 diff --git a/plugin-spark/src/test/resources/tpcds/q30.sql b/plugin-spark/src/test/resources/tpcds/q30.sql new file mode 100755 index 0000000000..986bef566d --- /dev/null +++ b/plugin-spark/src/test/resources/tpcds/q30.sql @@ -0,0 +1,35 @@ +WITH customer_total_return AS +(SELECT + wr_returning_customer_sk AS ctr_customer_sk, + ca_state AS ctr_state, + sum(wr_return_amt) AS ctr_total_return + FROM web_returns, date_dim, customer_address + WHERE wr_returned_date_sk = d_date_sk + AND d_year = 2002 + AND wr_returning_addr_sk = ca_address_sk + GROUP BY wr_returning_customer_sk, ca_state) +SELECT + c_customer_id, + c_salutation, + c_first_name, + c_last_name, + c_preferred_cust_flag, + c_birth_day, + c_birth_month, + c_birth_year, + c_birth_country, + c_login, + c_email_address, + c_last_review_date, + ctr_total_return +FROM customer_total_return ctr1, customer_address, customer +WHERE ctr1.ctr_total_return > (SELECT avg(ctr_total_return) * 1.2 +FROM customer_total_return ctr2 +WHERE ctr1.ctr_state = ctr2.ctr_state) + AND ca_address_sk = c_current_addr_sk + AND ca_state = 'GA' + AND ctr1.ctr_customer_sk = c_customer_sk +ORDER BY c_customer_id, c_salutation, c_first_name, c_last_name, c_preferred_cust_flag + , c_birth_day, c_birth_month, c_birth_year, c_birth_country, c_login, c_email_address + , c_last_review_date, ctr_total_return +LIMIT 100 diff --git a/plugin-spark/src/test/resources/tpcds/q31.sql b/plugin-spark/src/test/resources/tpcds/q31.sql new file mode 100755 index 0000000000..3e543d5436 --- /dev/null +++ b/plugin-spark/src/test/resources/tpcds/q31.sql @@ -0,0 +1,60 @@ +WITH ss AS +(SELECT + ca_county, + d_qoy, + d_year, + sum(ss_ext_sales_price) AS store_sales + FROM store_sales, date_dim, customer_address + WHERE ss_sold_date_sk = d_date_sk + AND ss_addr_sk = ca_address_sk + GROUP BY ca_county, d_qoy, d_year), + ws AS + (SELECT + ca_county, + d_qoy, + d_year, + sum(ws_ext_sales_price) AS web_sales + FROM web_sales, date_dim, customer_address + WHERE ws_sold_date_sk = d_date_sk + AND ws_bill_addr_sk = ca_address_sk + GROUP BY ca_county, d_qoy, d_year) +SELECT + ss1.ca_county, + ss1.d_year, + ws2.web_sales / ws1.web_sales web_q1_q2_increase, + ss2.store_sales / ss1.store_sales store_q1_q2_increase, + ws3.web_sales / ws2.web_sales 
web_q2_q3_increase, + ss3.store_sales / ss2.store_sales store_q2_q3_increase +FROM + ss ss1, ss ss2, ss ss3, ws ws1, ws ws2, ws ws3 +WHERE + ss1.d_qoy = 1 + AND ss1.d_year = 2000 + AND ss1.ca_county = ss2.ca_county + AND ss2.d_qoy = 2 + AND ss2.d_year = 2000 + AND ss2.ca_county = ss3.ca_county + AND ss3.d_qoy = 3 + AND ss3.d_year = 2000 + AND ss1.ca_county = ws1.ca_county + AND ws1.d_qoy = 1 + AND ws1.d_year = 2000 + AND ws1.ca_county = ws2.ca_county + AND ws2.d_qoy = 2 + AND ws2.d_year = 2000 + AND ws1.ca_county = ws3.ca_county + AND ws3.d_qoy = 3 + AND ws3.d_year = 2000 + AND CASE WHEN ws1.web_sales > 0 + THEN ws2.web_sales / ws1.web_sales + ELSE NULL END + > CASE WHEN ss1.store_sales > 0 + THEN ss2.store_sales / ss1.store_sales + ELSE NULL END + AND CASE WHEN ws2.web_sales > 0 + THEN ws3.web_sales / ws2.web_sales + ELSE NULL END + > CASE WHEN ss2.store_sales > 0 + THEN ss3.store_sales / ss2.store_sales + ELSE NULL END +ORDER BY ss1.ca_county diff --git a/plugin-spark/src/test/resources/tpcds/q32.sql b/plugin-spark/src/test/resources/tpcds/q32.sql new file mode 100755 index 0000000000..1a907961e7 --- /dev/null +++ b/plugin-spark/src/test/resources/tpcds/q32.sql @@ -0,0 +1,15 @@ +SELECT 1 AS `excess discount amount ` +FROM + catalog_sales, item, date_dim +WHERE + i_manufact_id = 977 + AND i_item_sk = cs_item_sk + AND d_date BETWEEN '2000-01-27' AND (cast('2000-01-27' AS DATE) + interval 90 days) + AND d_date_sk = cs_sold_date_sk + AND cs_ext_discount_amt > ( + SELECT 1.3 * avg(cs_ext_discount_amt) + FROM catalog_sales, date_dim + WHERE cs_item_sk = i_item_sk + AND d_date BETWEEN '2000-01-27' AND (cast('2000-01-27' AS DATE) + interval 90 days) + AND d_date_sk = cs_sold_date_sk) +LIMIT 100 diff --git a/plugin-spark/src/test/resources/tpcds/q33.sql b/plugin-spark/src/test/resources/tpcds/q33.sql new file mode 100755 index 0000000000..d24856aa5c --- /dev/null +++ b/plugin-spark/src/test/resources/tpcds/q33.sql @@ -0,0 +1,65 @@ +WITH ss AS ( + SELECT + i_manufact_id, + sum(ss_ext_sales_price) total_sales + FROM + store_sales, date_dim, customer_address, item + WHERE + i_manufact_id IN (SELECT i_manufact_id + FROM item + WHERE i_category IN ('Electronics')) + AND ss_item_sk = i_item_sk + AND ss_sold_date_sk = d_date_sk + AND d_year = 1998 + AND d_moy = 5 + AND ss_addr_sk = ca_address_sk + AND ca_gmt_offset = -5 + GROUP BY i_manufact_id), cs AS +(SELECT + i_manufact_id, + sum(cs_ext_sales_price) total_sales + FROM catalog_sales, date_dim, customer_address, item + WHERE + i_manufact_id IN ( + SELECT i_manufact_id + FROM item + WHERE + i_category IN ('Electronics')) + AND cs_item_sk = i_item_sk + AND cs_sold_date_sk = d_date_sk + AND d_year = 1998 + AND d_moy = 5 + AND cs_bill_addr_sk = ca_address_sk + AND ca_gmt_offset = -5 + GROUP BY i_manufact_id), + ws AS ( + SELECT + i_manufact_id, + sum(ws_ext_sales_price) total_sales + FROM + web_sales, date_dim, customer_address, item + WHERE + i_manufact_id IN (SELECT i_manufact_id + FROM item + WHERE i_category IN ('Electronics')) + AND ws_item_sk = i_item_sk + AND ws_sold_date_sk = d_date_sk + AND d_year = 1998 + AND d_moy = 5 + AND ws_bill_addr_sk = ca_address_sk + AND ca_gmt_offset = -5 + GROUP BY i_manufact_id) +SELECT + i_manufact_id, + sum(total_sales) total_sales +FROM (SELECT * + FROM ss + UNION ALL + SELECT * + FROM cs + UNION ALL + SELECT * + FROM ws) tmp1 +GROUP BY i_manufact_id +ORDER BY total_sales +LIMIT 100 diff --git a/plugin-spark/src/test/resources/tpcds/q34.sql b/plugin-spark/src/test/resources/tpcds/q34.sql new file mode 100755 index
0000000000..33396bf16e --- /dev/null +++ b/plugin-spark/src/test/resources/tpcds/q34.sql @@ -0,0 +1,32 @@ +SELECT + c_last_name, + c_first_name, + c_salutation, + c_preferred_cust_flag, + ss_ticket_number, + cnt +FROM + (SELECT + ss_ticket_number, + ss_customer_sk, + count(*) cnt + FROM store_sales, date_dim, store, household_demographics + WHERE store_sales.ss_sold_date_sk = date_dim.d_date_sk + AND store_sales.ss_store_sk = store.s_store_sk + AND store_sales.ss_hdemo_sk = household_demographics.hd_demo_sk + AND (date_dim.d_dom BETWEEN 1 AND 3 OR date_dim.d_dom BETWEEN 25 AND 28) + AND (household_demographics.hd_buy_potential = '>10000' OR + household_demographics.hd_buy_potential = 'unknown') + AND household_demographics.hd_vehicle_count > 0 + AND (CASE WHEN household_demographics.hd_vehicle_count > 0 + THEN household_demographics.hd_dep_count / household_demographics.hd_vehicle_count + ELSE NULL + END) > 1.2 + AND date_dim.d_year IN (1999, 1999 + 1, 1999 + 2) + AND store.s_county IN + ('Williamson County', 'Williamson County', 'Williamson County', 'Williamson County', + 'Williamson County', 'Williamson County', 'Williamson County', 'Williamson County') + GROUP BY ss_ticket_number, ss_customer_sk) dn, customer +WHERE ss_customer_sk = c_customer_sk + AND cnt BETWEEN 15 AND 20 +ORDER BY c_last_name, c_first_name, c_salutation, c_preferred_cust_flag DESC diff --git a/plugin-spark/src/test/resources/tpcds/q35.sql b/plugin-spark/src/test/resources/tpcds/q35.sql new file mode 100755 index 0000000000..cfe4342d8b --- /dev/null +++ b/plugin-spark/src/test/resources/tpcds/q35.sql @@ -0,0 +1,46 @@ +SELECT + ca_state, + cd_gender, + cd_marital_status, + count(*) cnt1, + min(cd_dep_count), + max(cd_dep_count), + avg(cd_dep_count), + cd_dep_employed_count, + count(*) cnt2, + min(cd_dep_employed_count), + max(cd_dep_employed_count), + avg(cd_dep_employed_count), + cd_dep_college_count, + count(*) cnt3, + min(cd_dep_college_count), + max(cd_dep_college_count), + avg(cd_dep_college_count) +FROM + customer c, customer_address ca, customer_demographics +WHERE + c.c_current_addr_sk = ca.ca_address_sk AND + cd_demo_sk = c.c_current_cdemo_sk AND + exists(SELECT * + FROM store_sales, date_dim + WHERE c.c_customer_sk = ss_customer_sk AND + ss_sold_date_sk = d_date_sk AND + d_year = 2002 AND + d_qoy < 4) AND + (exists(SELECT * + FROM web_sales, date_dim + WHERE c.c_customer_sk = ws_bill_customer_sk AND + ws_sold_date_sk = d_date_sk AND + d_year = 2002 AND + d_qoy < 4) OR + exists(SELECT * + FROM catalog_sales, date_dim + WHERE c.c_customer_sk = cs_ship_customer_sk AND + cs_sold_date_sk = d_date_sk AND + d_year = 2002 AND + d_qoy < 4)) +GROUP BY ca_state, cd_gender, cd_marital_status, cd_dep_count, + cd_dep_employed_count, cd_dep_college_count +ORDER BY ca_state, cd_gender, cd_marital_status, cd_dep_count, + cd_dep_employed_count, cd_dep_college_count +LIMIT 100 diff --git a/plugin-spark/src/test/resources/tpcds/q36.sql b/plugin-spark/src/test/resources/tpcds/q36.sql new file mode 100755 index 0000000000..a8f93df76a --- /dev/null +++ b/plugin-spark/src/test/resources/tpcds/q36.sql @@ -0,0 +1,26 @@ +SELECT + sum(ss_net_profit) / sum(ss_ext_sales_price) AS gross_margin, + i_category, + i_class, + grouping(i_category) + grouping(i_class) AS lochierarchy, + rank() + OVER ( + PARTITION BY grouping(i_category) + grouping(i_class), + CASE WHEN grouping(i_class) = 0 + THEN i_category END + ORDER BY sum(ss_net_profit) / sum(ss_ext_sales_price) ASC) AS rank_within_parent +FROM + store_sales, date_dim d1, item, store +WHERE 
+ d1.d_year = 2001 + AND d1.d_date_sk = ss_sold_date_sk + AND i_item_sk = ss_item_sk + AND s_store_sk = ss_store_sk + AND s_state IN ('TN', 'TN', 'TN', 'TN', 'TN', 'TN', 'TN', 'TN') +GROUP BY ROLLUP (i_category, i_class) +ORDER BY + lochierarchy DESC + , CASE WHEN lochierarchy = 0 + THEN i_category END + , rank_within_parent +LIMIT 100 diff --git a/plugin-spark/src/test/resources/tpcds/q37.sql b/plugin-spark/src/test/resources/tpcds/q37.sql new file mode 100755 index 0000000000..11b3821fa4 --- /dev/null +++ b/plugin-spark/src/test/resources/tpcds/q37.sql @@ -0,0 +1,15 @@ +SELECT + i_item_id, + i_item_desc, + i_current_price +FROM item, inventory, date_dim, catalog_sales +WHERE i_current_price BETWEEN 68 AND 68 + 30 + AND inv_item_sk = i_item_sk + AND d_date_sk = inv_date_sk + AND d_date BETWEEN cast('2000-02-01' AS DATE) AND (cast('2000-02-01' AS DATE) + INTERVAL 60 days) + AND i_manufact_id IN (677, 940, 694, 808) + AND inv_quantity_on_hand BETWEEN 100 AND 500 + AND cs_item_sk = i_item_sk +GROUP BY i_item_id, i_item_desc, i_current_price +ORDER BY i_item_id +LIMIT 100 diff --git a/plugin-spark/src/test/resources/tpcds/q38.sql b/plugin-spark/src/test/resources/tpcds/q38.sql new file mode 100755 index 0000000000..1c8d53ee2b --- /dev/null +++ b/plugin-spark/src/test/resources/tpcds/q38.sql @@ -0,0 +1,30 @@ +SELECT count(*) +FROM ( + SELECT DISTINCT + c_last_name, + c_first_name, + d_date + FROM store_sales, date_dim, customer + WHERE store_sales.ss_sold_date_sk = date_dim.d_date_sk + AND store_sales.ss_customer_sk = customer.c_customer_sk + AND d_month_seq BETWEEN 1200 AND 1200 + 11 + INTERSECT + SELECT DISTINCT + c_last_name, + c_first_name, + d_date + FROM catalog_sales, date_dim, customer + WHERE catalog_sales.cs_sold_date_sk = date_dim.d_date_sk + AND catalog_sales.cs_bill_customer_sk = customer.c_customer_sk + AND d_month_seq BETWEEN 1200 AND 1200 + 11 + INTERSECT + SELECT DISTINCT + c_last_name, + c_first_name, + d_date + FROM web_sales, date_dim, customer + WHERE web_sales.ws_sold_date_sk = date_dim.d_date_sk + AND web_sales.ws_bill_customer_sk = customer.c_customer_sk + AND d_month_seq BETWEEN 1200 AND 1200 + 11 + ) hot_cust +LIMIT 100 diff --git a/plugin-spark/src/test/resources/tpcds/q39a.sql b/plugin-spark/src/test/resources/tpcds/q39a.sql new file mode 100755 index 0000000000..9fc4c1701c --- /dev/null +++ b/plugin-spark/src/test/resources/tpcds/q39a.sql @@ -0,0 +1,47 @@ +WITH inv AS +(SELECT + w_warehouse_name, + w_warehouse_sk, + i_item_sk, + d_moy, + stdev, + mean, + CASE mean + WHEN 0 + THEN NULL + ELSE stdev / mean END cov + FROM (SELECT + w_warehouse_name, + w_warehouse_sk, + i_item_sk, + d_moy, + stddev_samp(inv_quantity_on_hand) stdev, + avg(inv_quantity_on_hand) mean + FROM inventory, item, warehouse, date_dim + WHERE inv_item_sk = i_item_sk + AND inv_warehouse_sk = w_warehouse_sk + AND inv_date_sk = d_date_sk + AND d_year = 2001 + GROUP BY w_warehouse_name, w_warehouse_sk, i_item_sk, d_moy) foo + WHERE CASE mean + WHEN 0 + THEN 0 + ELSE stdev / mean END > 1) +SELECT + inv1.w_warehouse_sk, + inv1.i_item_sk, + inv1.d_moy, + inv1.mean, + inv1.cov, + inv2.w_warehouse_sk, + inv2.i_item_sk, + inv2.d_moy, + inv2.mean, + inv2.cov +FROM inv inv1, inv inv2 +WHERE inv1.i_item_sk = inv2.i_item_sk + AND inv1.w_warehouse_sk = inv2.w_warehouse_sk + AND inv1.d_moy = 1 + AND inv2.d_moy = 1 + 1 +ORDER BY inv1.w_warehouse_sk, inv1.i_item_sk, inv1.d_moy, inv1.mean, inv1.cov + , inv2.d_moy, inv2.mean, inv2.cov diff --git a/plugin-spark/src/test/resources/tpcds/q39b.sql 
b/plugin-spark/src/test/resources/tpcds/q39b.sql new file mode 100755 index 0000000000..6f8493029f --- /dev/null +++ b/plugin-spark/src/test/resources/tpcds/q39b.sql @@ -0,0 +1,48 @@ +WITH inv AS +(SELECT + w_warehouse_name, + w_warehouse_sk, + i_item_sk, + d_moy, + stdev, + mean, + CASE mean + WHEN 0 + THEN NULL + ELSE stdev / mean END cov + FROM (SELECT + w_warehouse_name, + w_warehouse_sk, + i_item_sk, + d_moy, + stddev_samp(inv_quantity_on_hand) stdev, + avg(inv_quantity_on_hand) mean + FROM inventory, item, warehouse, date_dim + WHERE inv_item_sk = i_item_sk + AND inv_warehouse_sk = w_warehouse_sk + AND inv_date_sk = d_date_sk + AND d_year = 2001 + GROUP BY w_warehouse_name, w_warehouse_sk, i_item_sk, d_moy) foo + WHERE CASE mean + WHEN 0 + THEN 0 + ELSE stdev / mean END > 1) +SELECT + inv1.w_warehouse_sk, + inv1.i_item_sk, + inv1.d_moy, + inv1.mean, + inv1.cov, + inv2.w_warehouse_sk, + inv2.i_item_sk, + inv2.d_moy, + inv2.mean, + inv2.cov +FROM inv inv1, inv inv2 +WHERE inv1.i_item_sk = inv2.i_item_sk + AND inv1.w_warehouse_sk = inv2.w_warehouse_sk + AND inv1.d_moy = 1 + AND inv2.d_moy = 1 + 1 + AND inv1.cov > 1.5 +ORDER BY inv1.w_warehouse_sk, inv1.i_item_sk, inv1.d_moy, inv1.mean, inv1.cov + , inv2.d_moy, inv2.mean, inv2.cov diff --git a/plugin-spark/src/test/resources/tpcds/q4.sql b/plugin-spark/src/test/resources/tpcds/q4.sql new file mode 100755 index 0000000000..b9f27fbc9a --- /dev/null +++ b/plugin-spark/src/test/resources/tpcds/q4.sql @@ -0,0 +1,120 @@ +WITH year_total AS ( + SELECT + c_customer_id customer_id, + c_first_name customer_first_name, + c_last_name customer_last_name, + c_preferred_cust_flag customer_preferred_cust_flag, + c_birth_country customer_birth_country, + c_login customer_login, + c_email_address customer_email_address, + d_year dyear, + sum(((ss_ext_list_price - ss_ext_wholesale_cost - ss_ext_discount_amt) + + ss_ext_sales_price) / 2) year_total, + 's' sale_type + FROM customer, store_sales, date_dim + WHERE c_customer_sk = ss_customer_sk AND ss_sold_date_sk = d_date_sk + GROUP BY c_customer_id, + c_first_name, + c_last_name, + c_preferred_cust_flag, + c_birth_country, + c_login, + c_email_address, + d_year + UNION ALL + SELECT + c_customer_id customer_id, + c_first_name customer_first_name, + c_last_name customer_last_name, + c_preferred_cust_flag customer_preferred_cust_flag, + c_birth_country customer_birth_country, + c_login customer_login, + c_email_address customer_email_address, + d_year dyear, + sum((((cs_ext_list_price - cs_ext_wholesale_cost - cs_ext_discount_amt) + + cs_ext_sales_price) / 2)) year_total, + 'c' sale_type + FROM customer, catalog_sales, date_dim + WHERE c_customer_sk = cs_bill_customer_sk AND cs_sold_date_sk = d_date_sk + GROUP BY c_customer_id, + c_first_name, + c_last_name, + c_preferred_cust_flag, + c_birth_country, + c_login, + c_email_address, + d_year + UNION ALL + SELECT + c_customer_id customer_id, + c_first_name customer_first_name, + c_last_name customer_last_name, + c_preferred_cust_flag customer_preferred_cust_flag, + c_birth_country customer_birth_country, + c_login customer_login, + c_email_address customer_email_address, + d_year dyear, + sum((((ws_ext_list_price - ws_ext_wholesale_cost - ws_ext_discount_amt) + ws_ext_sales_price) / + 2)) year_total, + 'w' sale_type + FROM customer, web_sales, date_dim + WHERE c_customer_sk = ws_bill_customer_sk AND ws_sold_date_sk = d_date_sk + GROUP BY c_customer_id, + c_first_name, + c_last_name, + c_preferred_cust_flag, + c_birth_country, + c_login, + c_email_address, + 
d_year) +SELECT + t_s_secyear.customer_id, + t_s_secyear.customer_first_name, + t_s_secyear.customer_last_name, + t_s_secyear.customer_preferred_cust_flag, + t_s_secyear.customer_birth_country, + t_s_secyear.customer_login, + t_s_secyear.customer_email_address +FROM year_total t_s_firstyear, year_total t_s_secyear, year_total t_c_firstyear, + year_total t_c_secyear, year_total t_w_firstyear, year_total t_w_secyear +WHERE t_s_secyear.customer_id = t_s_firstyear.customer_id + AND t_s_firstyear.customer_id = t_c_secyear.customer_id + AND t_s_firstyear.customer_id = t_c_firstyear.customer_id + AND t_s_firstyear.customer_id = t_w_firstyear.customer_id + AND t_s_firstyear.customer_id = t_w_secyear.customer_id + AND t_s_firstyear.sale_type = 's' + AND t_c_firstyear.sale_type = 'c' + AND t_w_firstyear.sale_type = 'w' + AND t_s_secyear.sale_type = 's' + AND t_c_secyear.sale_type = 'c' + AND t_w_secyear.sale_type = 'w' + AND t_s_firstyear.dyear = 2001 + AND t_s_secyear.dyear = 2001 + 1 + AND t_c_firstyear.dyear = 2001 + AND t_c_secyear.dyear = 2001 + 1 + AND t_w_firstyear.dyear = 2001 + AND t_w_secyear.dyear = 2001 + 1 + AND t_s_firstyear.year_total > 0 + AND t_c_firstyear.year_total > 0 + AND t_w_firstyear.year_total > 0 + AND CASE WHEN t_c_firstyear.year_total > 0 + THEN t_c_secyear.year_total / t_c_firstyear.year_total + ELSE NULL END + > CASE WHEN t_s_firstyear.year_total > 0 + THEN t_s_secyear.year_total / t_s_firstyear.year_total + ELSE NULL END + AND CASE WHEN t_c_firstyear.year_total > 0 + THEN t_c_secyear.year_total / t_c_firstyear.year_total + ELSE NULL END + > CASE WHEN t_w_firstyear.year_total > 0 + THEN t_w_secyear.year_total / t_w_firstyear.year_total + ELSE NULL END +ORDER BY + t_s_secyear.customer_id, + t_s_secyear.customer_first_name, + t_s_secyear.customer_last_name, + t_s_secyear.customer_preferred_cust_flag, + t_s_secyear.customer_birth_country, + t_s_secyear.customer_login, + t_s_secyear.customer_email_address +LIMIT 100 diff --git a/plugin-spark/src/test/resources/tpcds/q40.sql b/plugin-spark/src/test/resources/tpcds/q40.sql new file mode 100755 index 0000000000..66d8b73ac1 --- /dev/null +++ b/plugin-spark/src/test/resources/tpcds/q40.sql @@ -0,0 +1,25 @@ +SELECT + w_state, + i_item_id, + sum(CASE WHEN (cast(d_date AS DATE) < cast('2000-03-11' AS DATE)) + THEN cs_sales_price - coalesce(cr_refunded_cash, 0) + ELSE 0 END) AS sales_before, + sum(CASE WHEN (cast(d_date AS DATE) >= cast('2000-03-11' AS DATE)) + THEN cs_sales_price - coalesce(cr_refunded_cash, 0) + ELSE 0 END) AS sales_after +FROM + catalog_sales + LEFT OUTER JOIN catalog_returns ON + (cs_order_number = cr_order_number + AND cs_item_sk = cr_item_sk) + , warehouse, item, date_dim +WHERE + i_current_price BETWEEN 0.99 AND 1.49 + AND i_item_sk = cs_item_sk + AND cs_warehouse_sk = w_warehouse_sk + AND cs_sold_date_sk = d_date_sk + AND d_date BETWEEN (cast('2000-03-11' AS DATE) - INTERVAL 30 days) + AND (cast('2000-03-11' AS DATE) + INTERVAL 30 days) +GROUP BY w_state, i_item_id +ORDER BY w_state, i_item_id +LIMIT 100 diff --git a/plugin-spark/src/test/resources/tpcds/q41.sql b/plugin-spark/src/test/resources/tpcds/q41.sql new file mode 100755 index 0000000000..25e317e0e2 --- /dev/null +++ b/plugin-spark/src/test/resources/tpcds/q41.sql @@ -0,0 +1,49 @@ +SELECT DISTINCT (i_product_name) +FROM item i1 +WHERE i_manufact_id BETWEEN 738 AND 738 + 40 + AND (SELECT count(*) AS item_cnt +FROM item +WHERE (i_manufact = i1.i_manufact AND + ((i_category = 'Women' AND + (i_color = 'powder' OR i_color = 'khaki') AND + (i_units = 
'Ounce' OR i_units = 'Oz') AND + (i_size = 'medium' OR i_size = 'extra large') + ) OR + (i_category = 'Women' AND + (i_color = 'brown' OR i_color = 'honeydew') AND + (i_units = 'Bunch' OR i_units = 'Ton') AND + (i_size = 'N/A' OR i_size = 'small') + ) OR + (i_category = 'Men' AND + (i_color = 'floral' OR i_color = 'deep') AND + (i_units = 'N/A' OR i_units = 'Dozen') AND + (i_size = 'petite' OR i_size = 'large') + ) OR + (i_category = 'Men' AND + (i_color = 'light' OR i_color = 'cornflower') AND + (i_units = 'Box' OR i_units = 'Pound') AND + (i_size = 'medium' OR i_size = 'extra large') + ))) OR + (i_manufact = i1.i_manufact AND + ((i_category = 'Women' AND + (i_color = 'midnight' OR i_color = 'snow') AND + (i_units = 'Pallet' OR i_units = 'Gross') AND + (i_size = 'medium' OR i_size = 'extra large') + ) OR + (i_category = 'Women' AND + (i_color = 'cyan' OR i_color = 'papaya') AND + (i_units = 'Cup' OR i_units = 'Dram') AND + (i_size = 'N/A' OR i_size = 'small') + ) OR + (i_category = 'Men' AND + (i_color = 'orange' OR i_color = 'frosted') AND + (i_units = 'Each' OR i_units = 'Tbl') AND + (i_size = 'petite' OR i_size = 'large') + ) OR + (i_category = 'Men' AND + (i_color = 'forest' OR i_color = 'ghost') AND + (i_units = 'Lb' OR i_units = 'Bundle') AND + (i_size = 'medium' OR i_size = 'extra large') + )))) > 0 +ORDER BY i_product_name +LIMIT 100 diff --git a/plugin-spark/src/test/resources/tpcds/q42.sql b/plugin-spark/src/test/resources/tpcds/q42.sql new file mode 100755 index 0000000000..4d2e71760d --- /dev/null +++ b/plugin-spark/src/test/resources/tpcds/q42.sql @@ -0,0 +1,18 @@ +SELECT + dt.d_year, + item.i_category_id, + item.i_category, + sum(ss_ext_sales_price) +FROM date_dim dt, store_sales, item +WHERE dt.d_date_sk = store_sales.ss_sold_date_sk + AND store_sales.ss_item_sk = item.i_item_sk + AND item.i_manager_id = 1 + AND dt.d_moy = 11 + AND dt.d_year = 2000 +GROUP BY dt.d_year + , item.i_category_id + , item.i_category +ORDER BY sum(ss_ext_sales_price) DESC, dt.d_year + , item.i_category_id + , item.i_category +LIMIT 100 diff --git a/plugin-spark/src/test/resources/tpcds/q43.sql b/plugin-spark/src/test/resources/tpcds/q43.sql new file mode 100755 index 0000000000..45411772c1 --- /dev/null +++ b/plugin-spark/src/test/resources/tpcds/q43.sql @@ -0,0 +1,33 @@ +SELECT + s_store_name, + s_store_id, + sum(CASE WHEN (d_day_name = 'Sunday') + THEN ss_sales_price + ELSE NULL END) sun_sales, + sum(CASE WHEN (d_day_name = 'Monday') + THEN ss_sales_price + ELSE NULL END) mon_sales, + sum(CASE WHEN (d_day_name = 'Tuesday') + THEN ss_sales_price + ELSE NULL END) tue_sales, + sum(CASE WHEN (d_day_name = 'Wednesday') + THEN ss_sales_price + ELSE NULL END) wed_sales, + sum(CASE WHEN (d_day_name = 'Thursday') + THEN ss_sales_price + ELSE NULL END) thu_sales, + sum(CASE WHEN (d_day_name = 'Friday') + THEN ss_sales_price + ELSE NULL END) fri_sales, + sum(CASE WHEN (d_day_name = 'Saturday') + THEN ss_sales_price + ELSE NULL END) sat_sales +FROM date_dim, store_sales, store +WHERE d_date_sk = ss_sold_date_sk AND + s_store_sk = ss_store_sk AND + s_gmt_offset = -5 AND + d_year = 2000 +GROUP BY s_store_name, s_store_id +ORDER BY s_store_name, s_store_id, sun_sales, mon_sales, tue_sales, wed_sales, + thu_sales, fri_sales, sat_sales +LIMIT 100 diff --git a/plugin-spark/src/test/resources/tpcds/q44.sql b/plugin-spark/src/test/resources/tpcds/q44.sql new file mode 100755 index 0000000000..379e604788 --- /dev/null +++ b/plugin-spark/src/test/resources/tpcds/q44.sql @@ -0,0 +1,46 @@ +SELECT + asceding.rnk, + 
i1.i_product_name best_performing, + i2.i_product_name worst_performing +FROM (SELECT * +FROM (SELECT + item_sk, + rank() + OVER ( + ORDER BY rank_col ASC) rnk +FROM (SELECT + ss_item_sk item_sk, + avg(ss_net_profit) rank_col +FROM store_sales ss1 +WHERE ss_store_sk = 4 +GROUP BY ss_item_sk +HAVING avg(ss_net_profit) > 0.9 * (SELECT avg(ss_net_profit) rank_col +FROM store_sales +WHERE ss_store_sk = 4 + AND ss_addr_sk IS NULL +GROUP BY ss_store_sk)) V1) V11 +WHERE rnk < 11) asceding, + (SELECT * + FROM (SELECT + item_sk, + rank() + OVER ( + ORDER BY rank_col DESC) rnk + FROM (SELECT + ss_item_sk item_sk, + avg(ss_net_profit) rank_col + FROM store_sales ss1 + WHERE ss_store_sk = 4 + GROUP BY ss_item_sk + HAVING avg(ss_net_profit) > 0.9 * (SELECT avg(ss_net_profit) rank_col + FROM store_sales + WHERE ss_store_sk = 4 + AND ss_addr_sk IS NULL + GROUP BY ss_store_sk)) V2) V21 + WHERE rnk < 11) descending, + item i1, item i2 +WHERE asceding.rnk = descending.rnk + AND i1.i_item_sk = asceding.item_sk + AND i2.i_item_sk = descending.item_sk +ORDER BY asceding.rnk +LIMIT 100 diff --git a/plugin-spark/src/test/resources/tpcds/q45.sql b/plugin-spark/src/test/resources/tpcds/q45.sql new file mode 100755 index 0000000000..907438f196 --- /dev/null +++ b/plugin-spark/src/test/resources/tpcds/q45.sql @@ -0,0 +1,21 @@ +SELECT + ca_zip, + ca_city, + sum(ws_sales_price) +FROM web_sales, customer, customer_address, date_dim, item +WHERE ws_bill_customer_sk = c_customer_sk + AND c_current_addr_sk = ca_address_sk + AND ws_item_sk = i_item_sk + AND (substr(ca_zip, 1, 5) IN + ('85669', '86197', '88274', '83405', '86475', '85392', '85460', '80348', '81792') + OR + i_item_id IN (SELECT i_item_id + FROM item + WHERE i_item_sk IN (2, 3, 5, 7, 11, 13, 17, 19, 23, 29) + ) +) + AND ws_sold_date_sk = d_date_sk + AND d_qoy = 2 AND d_year = 2001 +GROUP BY ca_zip, ca_city +ORDER BY ca_zip, ca_city +LIMIT 100 diff --git a/plugin-spark/src/test/resources/tpcds/q46.sql b/plugin-spark/src/test/resources/tpcds/q46.sql new file mode 100755 index 0000000000..0911677dff --- /dev/null +++ b/plugin-spark/src/test/resources/tpcds/q46.sql @@ -0,0 +1,32 @@ +SELECT + c_last_name, + c_first_name, + ca_city, + bought_city, + ss_ticket_number, + amt, + profit +FROM + (SELECT + ss_ticket_number, + ss_customer_sk, + ca_city bought_city, + sum(ss_coupon_amt) amt, + sum(ss_net_profit) profit + FROM store_sales, date_dim, store, household_demographics, customer_address + WHERE store_sales.ss_sold_date_sk = date_dim.d_date_sk + AND store_sales.ss_store_sk = store.s_store_sk + AND store_sales.ss_hdemo_sk = household_demographics.hd_demo_sk + AND store_sales.ss_addr_sk = customer_address.ca_address_sk + AND (household_demographics.hd_dep_count = 4 OR + household_demographics.hd_vehicle_count = 3) + AND date_dim.d_dow IN (6, 0) + AND date_dim.d_year IN (1999, 1999 + 1, 1999 + 2) + AND store.s_city IN ('Fairview', 'Midway', 'Fairview', 'Fairview', 'Fairview') + GROUP BY ss_ticket_number, ss_customer_sk, ss_addr_sk, ca_city) dn, customer, + customer_address current_addr +WHERE ss_customer_sk = c_customer_sk + AND customer.c_current_addr_sk = current_addr.ca_address_sk + AND current_addr.ca_city <> bought_city +ORDER BY c_last_name, c_first_name, ca_city, bought_city, ss_ticket_number +LIMIT 100 diff --git a/plugin-spark/src/test/resources/tpcds/q47.sql b/plugin-spark/src/test/resources/tpcds/q47.sql new file mode 100755 index 0000000000..cfc37a4cec --- /dev/null +++ b/plugin-spark/src/test/resources/tpcds/q47.sql @@ -0,0 +1,63 @@ +WITH v1 AS ( + SELECT 
+ i_category, + i_brand, + s_store_name, + s_company_name, + d_year, + d_moy, + sum(ss_sales_price) sum_sales, + avg(sum(ss_sales_price)) + OVER + (PARTITION BY i_category, i_brand, + s_store_name, s_company_name, d_year) + avg_monthly_sales, + rank() + OVER + (PARTITION BY i_category, i_brand, + s_store_name, s_company_name + ORDER BY d_year, d_moy) rn + FROM item, store_sales, date_dim, store + WHERE ss_item_sk = i_item_sk AND + ss_sold_date_sk = d_date_sk AND + ss_store_sk = s_store_sk AND + ( + d_year = 1999 OR + (d_year = 1999 - 1 AND d_moy = 12) OR + (d_year = 1999 + 1 AND d_moy = 1) + ) + GROUP BY i_category, i_brand, + s_store_name, s_company_name, + d_year, d_moy), + v2 AS ( + SELECT + v1.i_category, + v1.i_brand, + v1.s_store_name, + v1.s_company_name, + v1.d_year, + v1.d_moy, + v1.avg_monthly_sales, + v1.sum_sales, + v1_lag.sum_sales psum, + v1_lead.sum_sales nsum + FROM v1, v1 v1_lag, v1 v1_lead + WHERE v1.i_category = v1_lag.i_category AND + v1.i_category = v1_lead.i_category AND + v1.i_brand = v1_lag.i_brand AND + v1.i_brand = v1_lead.i_brand AND + v1.s_store_name = v1_lag.s_store_name AND + v1.s_store_name = v1_lead.s_store_name AND + v1.s_company_name = v1_lag.s_company_name AND + v1.s_company_name = v1_lead.s_company_name AND + v1.rn = v1_lag.rn + 1 AND + v1.rn = v1_lead.rn - 1) +SELECT * +FROM v2 +WHERE d_year = 1999 AND + avg_monthly_sales > 0 AND + CASE WHEN avg_monthly_sales > 0 + THEN abs(sum_sales - avg_monthly_sales) / avg_monthly_sales + ELSE NULL END > 0.1 +ORDER BY sum_sales - avg_monthly_sales, 3 +LIMIT 100 diff --git a/plugin-spark/src/test/resources/tpcds/q48.sql b/plugin-spark/src/test/resources/tpcds/q48.sql new file mode 100755 index 0000000000..fdb9f38e29 --- /dev/null +++ b/plugin-spark/src/test/resources/tpcds/q48.sql @@ -0,0 +1,63 @@ +SELECT sum(ss_quantity) +FROM store_sales, store, customer_demographics, customer_address, date_dim +WHERE s_store_sk = ss_store_sk + AND ss_sold_date_sk = d_date_sk AND d_year = 2001 + AND + ( + ( + cd_demo_sk = ss_cdemo_sk + AND + cd_marital_status = 'M' + AND + cd_education_status = '4 yr Degree' + AND + ss_sales_price BETWEEN 100.00 AND 150.00 + ) + OR + ( + cd_demo_sk = ss_cdemo_sk + AND + cd_marital_status = 'D' + AND + cd_education_status = '2 yr Degree' + AND + ss_sales_price BETWEEN 50.00 AND 100.00 + ) + OR + ( + cd_demo_sk = ss_cdemo_sk + AND + cd_marital_status = 'S' + AND + cd_education_status = 'College' + AND + ss_sales_price BETWEEN 150.00 AND 200.00 + ) + ) + AND + ( + ( + ss_addr_sk = ca_address_sk + AND + ca_country = 'United States' + AND + ca_state IN ('CO', 'OH', 'TX') + AND ss_net_profit BETWEEN 0 AND 2000 + ) + OR + (ss_addr_sk = ca_address_sk + AND + ca_country = 'United States' + AND + ca_state IN ('OR', 'MN', 'KY') + AND ss_net_profit BETWEEN 150 AND 3000 + ) + OR + (ss_addr_sk = ca_address_sk + AND + ca_country = 'United States' + AND + ca_state IN ('VA', 'CA', 'MS') + AND ss_net_profit BETWEEN 50 AND 25000 + ) + ) diff --git a/plugin-spark/src/test/resources/tpcds/q49.sql b/plugin-spark/src/test/resources/tpcds/q49.sql new file mode 100755 index 0000000000..9568d8b92d --- /dev/null +++ b/plugin-spark/src/test/resources/tpcds/q49.sql @@ -0,0 +1,126 @@ +SELECT + 'web' AS channel, + web.item, + web.return_ratio, + web.return_rank, + web.currency_rank +FROM ( + SELECT + item, + return_ratio, + currency_ratio, + rank() + OVER ( + ORDER BY return_ratio) AS return_rank, + rank() + OVER ( + ORDER BY currency_ratio) AS currency_rank + FROM + (SELECT + ws.ws_item_sk AS item, + 
(cast(sum(coalesce(wr.wr_return_quantity, 0)) AS DECIMAL(15, 4)) / + cast(sum(coalesce(ws.ws_quantity, 0)) AS DECIMAL(15, 4))) AS return_ratio, + (cast(sum(coalesce(wr.wr_return_amt, 0)) AS DECIMAL(15, 4)) / + cast(sum(coalesce(ws.ws_net_paid, 0)) AS DECIMAL(15, 4))) AS currency_ratio + FROM + web_sales ws LEFT OUTER JOIN web_returns wr + ON (ws.ws_order_number = wr.wr_order_number AND + ws.ws_item_sk = wr.wr_item_sk) + , date_dim + WHERE + wr.wr_return_amt > 10000 + AND ws.ws_net_profit > 1 + AND ws.ws_net_paid > 0 + AND ws.ws_quantity > 0 + AND ws_sold_date_sk = d_date_sk + AND d_year = 2001 + AND d_moy = 12 + GROUP BY ws.ws_item_sk + ) in_web + ) web +WHERE (web.return_rank <= 10 OR web.currency_rank <= 10) +UNION +SELECT + 'catalog' AS channel, + catalog.item, + catalog.return_ratio, + catalog.return_rank, + catalog.currency_rank +FROM ( + SELECT + item, + return_ratio, + currency_ratio, + rank() + OVER ( + ORDER BY return_ratio) AS return_rank, + rank() + OVER ( + ORDER BY currency_ratio) AS currency_rank + FROM + (SELECT + cs.cs_item_sk AS item, + (cast(sum(coalesce(cr.cr_return_quantity, 0)) AS DECIMAL(15, 4)) / + cast(sum(coalesce(cs.cs_quantity, 0)) AS DECIMAL(15, 4))) AS return_ratio, + (cast(sum(coalesce(cr.cr_return_amount, 0)) AS DECIMAL(15, 4)) / + cast(sum(coalesce(cs.cs_net_paid, 0)) AS DECIMAL(15, 4))) AS currency_ratio + FROM + catalog_sales cs LEFT OUTER JOIN catalog_returns cr + ON (cs.cs_order_number = cr.cr_order_number AND + cs.cs_item_sk = cr.cr_item_sk) + , date_dim + WHERE + cr.cr_return_amount > 10000 + AND cs.cs_net_profit > 1 + AND cs.cs_net_paid > 0 + AND cs.cs_quantity > 0 + AND cs_sold_date_sk = d_date_sk + AND d_year = 2001 + AND d_moy = 12 + GROUP BY cs.cs_item_sk + ) in_cat + ) catalog +WHERE (catalog.return_rank <= 10 OR catalog.currency_rank <= 10) +UNION +SELECT + 'store' AS channel, + store.item, + store.return_ratio, + store.return_rank, + store.currency_rank +FROM ( + SELECT + item, + return_ratio, + currency_ratio, + rank() + OVER ( + ORDER BY return_ratio) AS return_rank, + rank() + OVER ( + ORDER BY currency_ratio) AS currency_rank + FROM + (SELECT + sts.ss_item_sk AS item, + (cast(sum(coalesce(sr.sr_return_quantity, 0)) AS DECIMAL(15, 4)) / + cast(sum(coalesce(sts.ss_quantity, 0)) AS DECIMAL(15, 4))) AS return_ratio, + (cast(sum(coalesce(sr.sr_return_amt, 0)) AS DECIMAL(15, 4)) / + cast(sum(coalesce(sts.ss_net_paid, 0)) AS DECIMAL(15, 4))) AS currency_ratio + FROM + store_sales sts LEFT OUTER JOIN store_returns sr + ON (sts.ss_ticket_number = sr.sr_ticket_number AND sts.ss_item_sk = sr.sr_item_sk) + , date_dim + WHERE + sr.sr_return_amt > 10000 + AND sts.ss_net_profit > 1 + AND sts.ss_net_paid > 0 + AND sts.ss_quantity > 0 + AND ss_sold_date_sk = d_date_sk + AND d_year = 2001 + AND d_moy = 12 + GROUP BY sts.ss_item_sk + ) in_store + ) store +WHERE (store.return_rank <= 10 OR store.currency_rank <= 10) +ORDER BY 1, 4, 5 +LIMIT 100 diff --git a/plugin-spark/src/test/resources/tpcds/q5.sql b/plugin-spark/src/test/resources/tpcds/q5.sql new file mode 100755 index 0000000000..b87cf3a448 --- /dev/null +++ b/plugin-spark/src/test/resources/tpcds/q5.sql @@ -0,0 +1,131 @@ +WITH ssr AS +( SELECT + s_store_id, + sum(sales_price) AS sales, + sum(profit) AS profit, + sum(return_amt) AS RETURNS, + sum(net_loss) AS profit_loss + FROM + (SELECT + ss_store_sk AS store_sk, + ss_sold_date_sk AS date_sk, + ss_ext_sales_price AS sales_price, + ss_net_profit AS profit, + cast(0 AS DECIMAL(7, 2)) AS return_amt, + cast(0 AS DECIMAL(7, 2)) AS net_loss + FROM store_sales 
+ UNION ALL + SELECT + sr_store_sk AS store_sk, + sr_returned_date_sk AS date_sk, + cast(0 AS DECIMAL(7, 2)) AS sales_price, + cast(0 AS DECIMAL(7, 2)) AS profit, + sr_return_amt AS return_amt, + sr_net_loss AS net_loss + FROM store_returns) + salesreturns, date_dim, store + WHERE date_sk = d_date_sk + AND d_date BETWEEN cast('2000-08-23' AS DATE) + AND ((cast('2000-08-23' AS DATE) + INTERVAL 14 days)) + AND store_sk = s_store_sk + GROUP BY s_store_id), + csr AS + ( SELECT + cp_catalog_page_id, + sum(sales_price) AS sales, + sum(profit) AS profit, + sum(return_amt) AS RETURNS, + sum(net_loss) AS profit_loss + FROM + (SELECT + cs_catalog_page_sk AS page_sk, + cs_sold_date_sk AS date_sk, + cs_ext_sales_price AS sales_price, + cs_net_profit AS profit, + cast(0 AS DECIMAL(7, 2)) AS return_amt, + cast(0 AS DECIMAL(7, 2)) AS net_loss + FROM catalog_sales + UNION ALL + SELECT + cr_catalog_page_sk AS page_sk, + cr_returned_date_sk AS date_sk, + cast(0 AS DECIMAL(7, 2)) AS sales_price, + cast(0 AS DECIMAL(7, 2)) AS profit, + cr_return_amount AS return_amt, + cr_net_loss AS net_loss + FROM catalog_returns + ) salesreturns, date_dim, catalog_page + WHERE date_sk = d_date_sk + AND d_date BETWEEN cast('2000-08-23' AS DATE) + AND ((cast('2000-08-23' AS DATE) + INTERVAL 14 days)) + AND page_sk = cp_catalog_page_sk + GROUP BY cp_catalog_page_id) + , + wsr AS + ( SELECT + web_site_id, + sum(sales_price) AS sales, + sum(profit) AS profit, + sum(return_amt) AS RETURNS, + sum(net_loss) AS profit_loss + FROM + (SELECT + ws_web_site_sk AS wsr_web_site_sk, + ws_sold_date_sk AS date_sk, + ws_ext_sales_price AS sales_price, + ws_net_profit AS profit, + cast(0 AS DECIMAL(7, 2)) AS return_amt, + cast(0 AS DECIMAL(7, 2)) AS net_loss + FROM web_sales + UNION ALL + SELECT + ws_web_site_sk AS wsr_web_site_sk, + wr_returned_date_sk AS date_sk, + cast(0 AS DECIMAL(7, 2)) AS sales_price, + cast(0 AS DECIMAL(7, 2)) AS profit, + wr_return_amt AS return_amt, + wr_net_loss AS net_loss + FROM web_returns + LEFT OUTER JOIN web_sales ON + (wr_item_sk = ws_item_sk + AND wr_order_number = ws_order_number) + ) salesreturns, date_dim, web_site + WHERE date_sk = d_date_sk + AND d_date BETWEEN cast('2000-08-23' AS DATE) + AND ((cast('2000-08-23' AS DATE) + INTERVAL 14 days)) + AND wsr_web_site_sk = web_site_sk + GROUP BY web_site_id) +SELECT + channel, + id, + sum(sales) AS sales, + sum(returns) AS returns, + sum(profit) AS profit +FROM + (SELECT + 'store channel' AS channel, + concat('store', s_store_id) AS id, + sales, + returns, + (profit - profit_loss) AS profit + FROM ssr + UNION ALL + SELECT + 'catalog channel' AS channel, + concat('catalog_page', cp_catalog_page_id) AS id, + sales, + returns, + (profit - profit_loss) AS profit + FROM csr + UNION ALL + SELECT + 'web channel' AS channel, + concat('web_site', web_site_id) AS id, + sales, + returns, + (profit - profit_loss) AS profit + FROM wsr + ) x +GROUP BY ROLLUP (channel, id) +ORDER BY channel, id +LIMIT 100 diff --git a/plugin-spark/src/test/resources/tpcds/q50.sql b/plugin-spark/src/test/resources/tpcds/q50.sql new file mode 100755 index 0000000000..f1d4b15449 --- /dev/null +++ b/plugin-spark/src/test/resources/tpcds/q50.sql @@ -0,0 +1,47 @@ +SELECT + s_store_name, + s_company_id, + s_street_number, + s_street_name, + s_street_type, + s_suite_number, + s_city, + s_county, + s_state, + s_zip, + sum(CASE WHEN (sr_returned_date_sk - ss_sold_date_sk <= 30) + THEN 1 + ELSE 0 END) AS `30 days `, + sum(CASE WHEN (sr_returned_date_sk - ss_sold_date_sk > 30) AND + (sr_returned_date_sk 
- ss_sold_date_sk <= 60) + THEN 1 + ELSE 0 END) AS `31 - 60 days `, + sum(CASE WHEN (sr_returned_date_sk - ss_sold_date_sk > 60) AND + (sr_returned_date_sk - ss_sold_date_sk <= 90) + THEN 1 + ELSE 0 END) AS `61 - 90 days `, + sum(CASE WHEN (sr_returned_date_sk - ss_sold_date_sk > 90) AND + (sr_returned_date_sk - ss_sold_date_sk <= 120) + THEN 1 + ELSE 0 END) AS `91 - 120 days `, + sum(CASE WHEN (sr_returned_date_sk - ss_sold_date_sk > 120) + THEN 1 + ELSE 0 END) AS `>120 days ` +FROM + store_sales, store_returns, store, date_dim d1, date_dim d2 +WHERE + d2.d_year = 2001 + AND d2.d_moy = 8 + AND ss_ticket_number = sr_ticket_number + AND ss_item_sk = sr_item_sk + AND ss_sold_date_sk = d1.d_date_sk + AND sr_returned_date_sk = d2.d_date_sk + AND ss_customer_sk = sr_customer_sk + AND ss_store_sk = s_store_sk +GROUP BY + s_store_name, s_company_id, s_street_number, s_street_name, s_street_type, + s_suite_number, s_city, s_county, s_state, s_zip +ORDER BY + s_store_name, s_company_id, s_street_number, s_street_name, s_street_type, + s_suite_number, s_city, s_county, s_state, s_zip +LIMIT 100 diff --git a/plugin-spark/src/test/resources/tpcds/q51.sql b/plugin-spark/src/test/resources/tpcds/q51.sql new file mode 100755 index 0000000000..62b003eb67 --- /dev/null +++ b/plugin-spark/src/test/resources/tpcds/q51.sql @@ -0,0 +1,55 @@ +WITH web_v1 AS ( + SELECT + ws_item_sk item_sk, + d_date, + sum(sum(ws_sales_price)) + OVER (PARTITION BY ws_item_sk + ORDER BY d_date + ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) cume_sales + FROM web_sales, date_dim + WHERE ws_sold_date_sk = d_date_sk + AND d_month_seq BETWEEN 1200 AND 1200 + 11 + AND ws_item_sk IS NOT NULL + GROUP BY ws_item_sk, d_date), + store_v1 AS ( + SELECT + ss_item_sk item_sk, + d_date, + sum(sum(ss_sales_price)) + OVER (PARTITION BY ss_item_sk + ORDER BY d_date + ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) cume_sales + FROM store_sales, date_dim + WHERE ss_sold_date_sk = d_date_sk + AND d_month_seq BETWEEN 1200 AND 1200 + 11 + AND ss_item_sk IS NOT NULL + GROUP BY ss_item_sk, d_date) +SELECT * +FROM (SELECT + item_sk, + d_date, + web_sales, + store_sales, + max(web_sales) + OVER (PARTITION BY item_sk + ORDER BY d_date + ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) web_cumulative, + max(store_sales) + OVER (PARTITION BY item_sk + ORDER BY d_date + ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) store_cumulative +FROM (SELECT + CASE WHEN web.item_sk IS NOT NULL + THEN web.item_sk + ELSE store.item_sk END item_sk, + CASE WHEN web.d_date IS NOT NULL + THEN web.d_date + ELSE store.d_date END d_date, + web.cume_sales web_sales, + store.cume_sales store_sales +FROM web_v1 web FULL OUTER JOIN store_v1 store ON (web.item_sk = store.item_sk + AND web.d_date = store.d_date) + ) x) y +WHERE web_cumulative > store_cumulative +ORDER BY item_sk, d_date +LIMIT 100 diff --git a/plugin-spark/src/test/resources/tpcds/q52.sql b/plugin-spark/src/test/resources/tpcds/q52.sql new file mode 100755 index 0000000000..467d1ae050 --- /dev/null +++ b/plugin-spark/src/test/resources/tpcds/q52.sql @@ -0,0 +1,14 @@ +SELECT + dt.d_year, + item.i_brand_id brand_id, + item.i_brand brand, + sum(ss_ext_sales_price) ext_price +FROM date_dim dt, store_sales, item +WHERE dt.d_date_sk = store_sales.ss_sold_date_sk + AND store_sales.ss_item_sk = item.i_item_sk + AND item.i_manager_id = 1 + AND dt.d_moy = 11 + AND dt.d_year = 2000 +GROUP BY dt.d_year, item.i_brand, item.i_brand_id +ORDER BY dt.d_year, ext_price DESC, brand_id +LIMIT 100 diff --git 
a/plugin-spark/src/test/resources/tpcds/q53.sql b/plugin-spark/src/test/resources/tpcds/q53.sql new file mode 100755 index 0000000000..b42c68dcf8 --- /dev/null +++ b/plugin-spark/src/test/resources/tpcds/q53.sql @@ -0,0 +1,30 @@ +SELECT * +FROM + (SELECT + i_manufact_id, + sum(ss_sales_price) sum_sales, + avg(sum(ss_sales_price)) + OVER (PARTITION BY i_manufact_id) avg_quarterly_sales + FROM item, store_sales, date_dim, store + WHERE ss_item_sk = i_item_sk AND + ss_sold_date_sk = d_date_sk AND + ss_store_sk = s_store_sk AND + d_month_seq IN (1200, 1200 + 1, 1200 + 2, 1200 + 3, 1200 + 4, 1200 + 5, 1200 + 6, + 1200 + 7, 1200 + 8, 1200 + 9, 1200 + 10, 1200 + 11) AND + ((i_category IN ('Books', 'Children', 'Electronics') AND + i_class IN ('personal', 'portable', 'reference', 'self-help') AND + i_brand IN ('scholaramalgamalg #14', 'scholaramalgamalg #7', + 'exportiunivamalg #9', 'scholaramalgamalg #9')) + OR + (i_category IN ('Women', 'Music', 'Men') AND + i_class IN ('accessories', 'classical', 'fragrances', 'pants') AND + i_brand IN ('amalgimporto #1', 'edu packscholar #1', 'exportiimporto #1', + 'importoamalg #1'))) + GROUP BY i_manufact_id, d_qoy) tmp1 +WHERE CASE WHEN avg_quarterly_sales > 0 + THEN abs(sum_sales - avg_quarterly_sales) / avg_quarterly_sales + ELSE NULL END > 0.1 +ORDER BY avg_quarterly_sales, + sum_sales, + i_manufact_id +LIMIT 100 diff --git a/plugin-spark/src/test/resources/tpcds/q54.sql b/plugin-spark/src/test/resources/tpcds/q54.sql new file mode 100755 index 0000000000..897237fb6e --- /dev/null +++ b/plugin-spark/src/test/resources/tpcds/q54.sql @@ -0,0 +1,61 @@ +WITH my_customers AS ( + SELECT DISTINCT + c_customer_sk, + c_current_addr_sk + FROM + (SELECT + cs_sold_date_sk sold_date_sk, + cs_bill_customer_sk customer_sk, + cs_item_sk item_sk + FROM catalog_sales + UNION ALL + SELECT + ws_sold_date_sk sold_date_sk, + ws_bill_customer_sk customer_sk, + ws_item_sk item_sk + FROM web_sales + ) cs_or_ws_sales, + item, + date_dim, + customer + WHERE sold_date_sk = d_date_sk + AND item_sk = i_item_sk + AND i_category = 'Women' + AND i_class = 'maternity' + AND c_customer_sk = cs_or_ws_sales.customer_sk + AND d_moy = 12 + AND d_year = 1998 +) + , my_revenue AS ( + SELECT + c_customer_sk, + sum(ss_ext_sales_price) AS revenue + FROM my_customers, + store_sales, + customer_address, + store, + date_dim + WHERE c_current_addr_sk = ca_address_sk + AND ca_county = s_county + AND ca_state = s_state + AND ss_sold_date_sk = d_date_sk + AND c_customer_sk = ss_customer_sk + AND d_month_seq BETWEEN (SELECT DISTINCT d_month_seq + 1 + FROM date_dim + WHERE d_year = 1998 AND d_moy = 12) + AND (SELECT DISTINCT d_month_seq + 3 + FROM date_dim + WHERE d_year = 1998 AND d_moy = 12) + GROUP BY c_customer_sk +) + , segments AS +(SELECT cast((revenue / 50) AS INT) AS segment + FROM my_revenue) +SELECT + segment, + count(*) AS num_customers, + segment * 50 AS segment_base +FROM segments +GROUP BY segment +ORDER BY segment, num_customers +LIMIT 100 diff --git a/plugin-spark/src/test/resources/tpcds/q55.sql b/plugin-spark/src/test/resources/tpcds/q55.sql new file mode 100755 index 0000000000..bc5d888c9a --- /dev/null +++ b/plugin-spark/src/test/resources/tpcds/q55.sql @@ -0,0 +1,13 @@ +SELECT + i_brand_id brand_id, + i_brand brand, + sum(ss_ext_sales_price) ext_price +FROM date_dim, store_sales, item +WHERE d_date_sk = ss_sold_date_sk + AND ss_item_sk = i_item_sk + AND i_manager_id = 28 + AND d_moy = 11 + AND d_year = 1999 +GROUP BY i_brand, i_brand_id +ORDER BY ext_price DESC, brand_id +LIMIT 100 
diff --git a/plugin-spark/src/test/resources/tpcds/q56.sql b/plugin-spark/src/test/resources/tpcds/q56.sql new file mode 100755 index 0000000000..2fa1738dcf --- /dev/null +++ b/plugin-spark/src/test/resources/tpcds/q56.sql @@ -0,0 +1,65 @@ +WITH ss AS ( + SELECT + i_item_id, + sum(ss_ext_sales_price) total_sales + FROM + store_sales, date_dim, customer_address, item + WHERE + i_item_id IN (SELECT i_item_id + FROM item + WHERE i_color IN ('slate', 'blanched', 'burnished')) + AND ss_item_sk = i_item_sk + AND ss_sold_date_sk = d_date_sk + AND d_year = 2001 + AND d_moy = 2 + AND ss_addr_sk = ca_address_sk + AND ca_gmt_offset = -5 + GROUP BY i_item_id), + cs AS ( + SELECT + i_item_id, + sum(cs_ext_sales_price) total_sales + FROM + catalog_sales, date_dim, customer_address, item + WHERE + i_item_id IN (SELECT i_item_id + FROM item + WHERE i_color IN ('slate', 'blanched', 'burnished')) + AND cs_item_sk = i_item_sk + AND cs_sold_date_sk = d_date_sk + AND d_year = 2001 + AND d_moy = 2 + AND cs_bill_addr_sk = ca_address_sk + AND ca_gmt_offset = -5 + GROUP BY i_item_id), + ws AS ( + SELECT + i_item_id, + sum(ws_ext_sales_price) total_sales + FROM + web_sales, date_dim, customer_address, item + WHERE + i_item_id IN (SELECT i_item_id + FROM item + WHERE i_color IN ('slate', 'blanched', 'burnished')) + AND ws_item_sk = i_item_sk + AND ws_sold_date_sk = d_date_sk + AND d_year = 2001 + AND d_moy = 2 + AND ws_bill_addr_sk = ca_address_sk + AND ca_gmt_offset = -5 + GROUP BY i_item_id) +SELECT + i_item_id, + sum(total_sales) total_sales +FROM (SELECT * + FROM ss + UNION ALL + SELECT * + FROM cs + UNION ALL + SELECT * + FROM ws) tmp1 +GROUP BY i_item_id +ORDER BY total_sales +LIMIT 100 diff --git a/plugin-spark/src/test/resources/tpcds/q57.sql b/plugin-spark/src/test/resources/tpcds/q57.sql new file mode 100755 index 0000000000..cf70d4b905 --- /dev/null +++ b/plugin-spark/src/test/resources/tpcds/q57.sql @@ -0,0 +1,56 @@ +WITH v1 AS ( + SELECT + i_category, + i_brand, + cc_name, + d_year, + d_moy, + sum(cs_sales_price) sum_sales, + avg(sum(cs_sales_price)) + OVER + (PARTITION BY i_category, i_brand, cc_name, d_year) + avg_monthly_sales, + rank() + OVER + (PARTITION BY i_category, i_brand, cc_name + ORDER BY d_year, d_moy) rn + FROM item, catalog_sales, date_dim, call_center + WHERE cs_item_sk = i_item_sk AND + cs_sold_date_sk = d_date_sk AND + cc_call_center_sk = cs_call_center_sk AND + ( + d_year = 1999 OR + (d_year = 1999 - 1 AND d_moy = 12) OR + (d_year = 1999 + 1 AND d_moy = 1) + ) + GROUP BY i_category, i_brand, + cc_name, d_year, d_moy), + v2 AS ( + SELECT + v1.i_category, + v1.i_brand, + v1.cc_name, + v1.d_year, + v1.d_moy, + v1.avg_monthly_sales, + v1.sum_sales, + v1_lag.sum_sales psum, + v1_lead.sum_sales nsum + FROM v1, v1 v1_lag, v1 v1_lead + WHERE v1.i_category = v1_lag.i_category AND + v1.i_category = v1_lead.i_category AND + v1.i_brand = v1_lag.i_brand AND + v1.i_brand = v1_lead.i_brand AND + v1.cc_name = v1_lag.cc_name AND + v1.cc_name = v1_lead.cc_name AND + v1.rn = v1_lag.rn + 1 AND + v1.rn = v1_lead.rn - 1) +SELECT * +FROM v2 +WHERE d_year = 1999 AND + avg_monthly_sales > 0 AND + CASE WHEN avg_monthly_sales > 0 + THEN abs(sum_sales - avg_monthly_sales) / avg_monthly_sales + ELSE NULL END > 0.1 +ORDER BY sum_sales - avg_monthly_sales, 3 +LIMIT 100 diff --git a/plugin-spark/src/test/resources/tpcds/q58.sql b/plugin-spark/src/test/resources/tpcds/q58.sql new file mode 100755 index 0000000000..5f63f33dc9 --- /dev/null +++ b/plugin-spark/src/test/resources/tpcds/q58.sql @@ -0,0 +1,59 @@ +WITH 
ss_items AS +(SELECT + i_item_id item_id, + sum(ss_ext_sales_price) ss_item_rev + FROM store_sales, item, date_dim + WHERE ss_item_sk = i_item_sk + AND d_date IN (SELECT d_date + FROM date_dim + WHERE d_week_seq = (SELECT d_week_seq + FROM date_dim + WHERE d_date = '2000-01-03')) + AND ss_sold_date_sk = d_date_sk + GROUP BY i_item_id), + cs_items AS + (SELECT + i_item_id item_id, + sum(cs_ext_sales_price) cs_item_rev + FROM catalog_sales, item, date_dim + WHERE cs_item_sk = i_item_sk + AND d_date IN (SELECT d_date + FROM date_dim + WHERE d_week_seq = (SELECT d_week_seq + FROM date_dim + WHERE d_date = '2000-01-03')) + AND cs_sold_date_sk = d_date_sk + GROUP BY i_item_id), + ws_items AS + (SELECT + i_item_id item_id, + sum(ws_ext_sales_price) ws_item_rev + FROM web_sales, item, date_dim + WHERE ws_item_sk = i_item_sk + AND d_date IN (SELECT d_date + FROM date_dim + WHERE d_week_seq = (SELECT d_week_seq + FROM date_dim + WHERE d_date = '2000-01-03')) + AND ws_sold_date_sk = d_date_sk + GROUP BY i_item_id) +SELECT + ss_items.item_id, + ss_item_rev, + ss_item_rev / (ss_item_rev + cs_item_rev + ws_item_rev) / 3 * 100 ss_dev, + cs_item_rev, + cs_item_rev / (ss_item_rev + cs_item_rev + ws_item_rev) / 3 * 100 cs_dev, + ws_item_rev, + ws_item_rev / (ss_item_rev + cs_item_rev + ws_item_rev) / 3 * 100 ws_dev, + (ss_item_rev + cs_item_rev + ws_item_rev) / 3 average +FROM ss_items, cs_items, ws_items +WHERE ss_items.item_id = cs_items.item_id + AND ss_items.item_id = ws_items.item_id + AND ss_item_rev BETWEEN 0.9 * cs_item_rev AND 1.1 * cs_item_rev + AND ss_item_rev BETWEEN 0.9 * ws_item_rev AND 1.1 * ws_item_rev + AND cs_item_rev BETWEEN 0.9 * ss_item_rev AND 1.1 * ss_item_rev + AND cs_item_rev BETWEEN 0.9 * ws_item_rev AND 1.1 * ws_item_rev + AND ws_item_rev BETWEEN 0.9 * ss_item_rev AND 1.1 * ss_item_rev + AND ws_item_rev BETWEEN 0.9 * cs_item_rev AND 1.1 * cs_item_rev +ORDER BY item_id, ss_item_rev +LIMIT 100 diff --git a/plugin-spark/src/test/resources/tpcds/q59.sql b/plugin-spark/src/test/resources/tpcds/q59.sql new file mode 100755 index 0000000000..3cef202768 --- /dev/null +++ b/plugin-spark/src/test/resources/tpcds/q59.sql @@ -0,0 +1,75 @@ +WITH wss AS +(SELECT + d_week_seq, + ss_store_sk, + sum(CASE WHEN (d_day_name = 'Sunday') + THEN ss_sales_price + ELSE NULL END) sun_sales, + sum(CASE WHEN (d_day_name = 'Monday') + THEN ss_sales_price + ELSE NULL END) mon_sales, + sum(CASE WHEN (d_day_name = 'Tuesday') + THEN ss_sales_price + ELSE NULL END) tue_sales, + sum(CASE WHEN (d_day_name = 'Wednesday') + THEN ss_sales_price + ELSE NULL END) wed_sales, + sum(CASE WHEN (d_day_name = 'Thursday') + THEN ss_sales_price + ELSE NULL END) thu_sales, + sum(CASE WHEN (d_day_name = 'Friday') + THEN ss_sales_price + ELSE NULL END) fri_sales, + sum(CASE WHEN (d_day_name = 'Saturday') + THEN ss_sales_price + ELSE NULL END) sat_sales + FROM store_sales, date_dim + WHERE d_date_sk = ss_sold_date_sk + GROUP BY d_week_seq, ss_store_sk +) +SELECT + s_store_name1, + s_store_id1, + d_week_seq1, + sun_sales1 / sun_sales2, + mon_sales1 / mon_sales2, + tue_sales1 / tue_sales2, + wed_sales1 / wed_sales2, + thu_sales1 / thu_sales2, + fri_sales1 / fri_sales2, + sat_sales1 / sat_sales2 +FROM + (SELECT + s_store_name s_store_name1, + wss.d_week_seq d_week_seq1, + s_store_id s_store_id1, + sun_sales sun_sales1, + mon_sales mon_sales1, + tue_sales tue_sales1, + wed_sales wed_sales1, + thu_sales thu_sales1, + fri_sales fri_sales1, + sat_sales sat_sales1 + FROM wss, store, date_dim d + WHERE d.d_week_seq = wss.d_week_seq AND + 
ss_store_sk = s_store_sk AND + d_month_seq BETWEEN 1212 AND 1212 + 11) y, + (SELECT + s_store_name s_store_name2, + wss.d_week_seq d_week_seq2, + s_store_id s_store_id2, + sun_sales sun_sales2, + mon_sales mon_sales2, + tue_sales tue_sales2, + wed_sales wed_sales2, + thu_sales thu_sales2, + fri_sales fri_sales2, + sat_sales sat_sales2 + FROM wss, store, date_dim d + WHERE d.d_week_seq = wss.d_week_seq AND + ss_store_sk = s_store_sk AND + d_month_seq BETWEEN 1212 + 12 AND 1212 + 23) x +WHERE s_store_id1 = s_store_id2 + AND d_week_seq1 = d_week_seq2 - 52 +ORDER BY s_store_name1, s_store_id1, d_week_seq1 +LIMIT 100 diff --git a/plugin-spark/src/test/resources/tpcds/q6.sql b/plugin-spark/src/test/resources/tpcds/q6.sql new file mode 100755 index 0000000000..f0f5cf05ae --- /dev/null +++ b/plugin-spark/src/test/resources/tpcds/q6.sql @@ -0,0 +1,21 @@ +SELECT + a.ca_state state, + count(*) cnt +FROM + customer_address a, customer c, store_sales s, date_dim d, item i +WHERE a.ca_address_sk = c.c_current_addr_sk + AND c.c_customer_sk = s.ss_customer_sk + AND s.ss_sold_date_sk = d.d_date_sk + AND s.ss_item_sk = i.i_item_sk + AND d.d_month_seq = + (SELECT DISTINCT (d_month_seq) + FROM date_dim + WHERE d_year = 2000 AND d_moy = 1) + AND i.i_current_price > 1.2 * + (SELECT avg(j.i_current_price) + FROM item j + WHERE j.i_category = i.i_category) +GROUP BY a.ca_state +HAVING count(*) >= 10 +ORDER BY cnt +LIMIT 100 diff --git a/plugin-spark/src/test/resources/tpcds/q60.sql b/plugin-spark/src/test/resources/tpcds/q60.sql new file mode 100755 index 0000000000..41b963f44b --- /dev/null +++ b/plugin-spark/src/test/resources/tpcds/q60.sql @@ -0,0 +1,62 @@ +WITH ss AS ( + SELECT + i_item_id, + sum(ss_ext_sales_price) total_sales + FROM store_sales, date_dim, customer_address, item + WHERE + i_item_id IN (SELECT i_item_id + FROM item + WHERE i_category IN ('Music')) + AND ss_item_sk = i_item_sk + AND ss_sold_date_sk = d_date_sk + AND d_year = 1998 + AND d_moy = 9 + AND ss_addr_sk = ca_address_sk + AND ca_gmt_offset = -5 + GROUP BY i_item_id), + cs AS ( + SELECT + i_item_id, + sum(cs_ext_sales_price) total_sales + FROM catalog_sales, date_dim, customer_address, item + WHERE + i_item_id IN (SELECT i_item_id + FROM item + WHERE i_category IN ('Music')) + AND cs_item_sk = i_item_sk + AND cs_sold_date_sk = d_date_sk + AND d_year = 1998 + AND d_moy = 9 + AND cs_bill_addr_sk = ca_address_sk + AND ca_gmt_offset = -5 + GROUP BY i_item_id), + ws AS ( + SELECT + i_item_id, + sum(ws_ext_sales_price) total_sales + FROM web_sales, date_dim, customer_address, item + WHERE + i_item_id IN (SELECT i_item_id + FROM item + WHERE i_category IN ('Music')) + AND ws_item_sk = i_item_sk + AND ws_sold_date_sk = d_date_sk + AND d_year = 1998 + AND d_moy = 9 + AND ws_bill_addr_sk = ca_address_sk + AND ca_gmt_offset = -5 + GROUP BY i_item_id) +SELECT + i_item_id, + sum(total_sales) total_sales +FROM (SELECT * + FROM ss + UNION ALL + SELECT * + FROM cs + UNION ALL + SELECT * + FROM ws) tmp1 +GROUP BY i_item_id +ORDER BY i_item_id, total_sales +LIMIT 100 diff --git a/plugin-spark/src/test/resources/tpcds/q61.sql b/plugin-spark/src/test/resources/tpcds/q61.sql new file mode 100755 index 0000000000..b0a872b4b8 --- /dev/null +++ b/plugin-spark/src/test/resources/tpcds/q61.sql @@ -0,0 +1,33 @@ +SELECT + promotions, + total, + cast(promotions AS DECIMAL(15, 4)) / cast(total AS DECIMAL(15, 4)) * 100 +FROM + (SELECT sum(ss_ext_sales_price) promotions + FROM store_sales, store, promotion, date_dim, customer, customer_address, item + WHERE 
ss_sold_date_sk = d_date_sk + AND ss_store_sk = s_store_sk + AND ss_promo_sk = p_promo_sk + AND ss_customer_sk = c_customer_sk + AND ca_address_sk = c_current_addr_sk + AND ss_item_sk = i_item_sk + AND ca_gmt_offset = -5 + AND i_category = 'Jewelry' + AND (p_channel_dmail = 'Y' OR p_channel_email = 'Y' OR p_channel_tv = 'Y') + AND s_gmt_offset = -5 + AND d_year = 1998 + AND d_moy = 11) promotional_sales, + (SELECT sum(ss_ext_sales_price) total + FROM store_sales, store, date_dim, customer, customer_address, item + WHERE ss_sold_date_sk = d_date_sk + AND ss_store_sk = s_store_sk + AND ss_customer_sk = c_customer_sk + AND ca_address_sk = c_current_addr_sk + AND ss_item_sk = i_item_sk + AND ca_gmt_offset = -5 + AND i_category = 'Jewelry' + AND s_gmt_offset = -5 + AND d_year = 1998 + AND d_moy = 11) all_sales +ORDER BY promotions, total +LIMIT 100 diff --git a/plugin-spark/src/test/resources/tpcds/q62.sql b/plugin-spark/src/test/resources/tpcds/q62.sql new file mode 100755 index 0000000000..8a414f154b --- /dev/null +++ b/plugin-spark/src/test/resources/tpcds/q62.sql @@ -0,0 +1,35 @@ +SELECT + substr(w_warehouse_name, 1, 20), + sm_type, + web_name, + sum(CASE WHEN (ws_ship_date_sk - ws_sold_date_sk <= 30) + THEN 1 + ELSE 0 END) AS `30 days `, + sum(CASE WHEN (ws_ship_date_sk - ws_sold_date_sk > 30) AND + (ws_ship_date_sk - ws_sold_date_sk <= 60) + THEN 1 + ELSE 0 END) AS `31 - 60 days `, + sum(CASE WHEN (ws_ship_date_sk - ws_sold_date_sk > 60) AND + (ws_ship_date_sk - ws_sold_date_sk <= 90) + THEN 1 + ELSE 0 END) AS `61 - 90 days `, + sum(CASE WHEN (ws_ship_date_sk - ws_sold_date_sk > 90) AND + (ws_ship_date_sk - ws_sold_date_sk <= 120) + THEN 1 + ELSE 0 END) AS `91 - 120 days `, + sum(CASE WHEN (ws_ship_date_sk - ws_sold_date_sk > 120) + THEN 1 + ELSE 0 END) AS `>120 days ` +FROM + web_sales, warehouse, ship_mode, web_site, date_dim +WHERE + d_month_seq BETWEEN 1200 AND 1200 + 11 + AND ws_ship_date_sk = d_date_sk + AND ws_warehouse_sk = w_warehouse_sk + AND ws_ship_mode_sk = sm_ship_mode_sk + AND ws_web_site_sk = web_site_sk +GROUP BY + substr(w_warehouse_name, 1, 20), sm_type, web_name +ORDER BY + substr(w_warehouse_name, 1, 20), sm_type, web_name +LIMIT 100 diff --git a/plugin-spark/src/test/resources/tpcds/q63.sql b/plugin-spark/src/test/resources/tpcds/q63.sql new file mode 100755 index 0000000000..ef6867e0a9 --- /dev/null +++ b/plugin-spark/src/test/resources/tpcds/q63.sql @@ -0,0 +1,31 @@ +SELECT * +FROM (SELECT + i_manager_id, + sum(ss_sales_price) sum_sales, + avg(sum(ss_sales_price)) + OVER (PARTITION BY i_manager_id) avg_monthly_sales +FROM item + , store_sales + , date_dim + , store +WHERE ss_item_sk = i_item_sk + AND ss_sold_date_sk = d_date_sk + AND ss_store_sk = s_store_sk + AND d_month_seq IN (1200, 1200 + 1, 1200 + 2, 1200 + 3, 1200 + 4, 1200 + 5, 1200 + 6, 1200 + 7, + 1200 + 8, 1200 + 9, 1200 + 10, 1200 + 11) + AND ((i_category IN ('Books', 'Children', 'Electronics') + AND i_class IN ('personal', 'portable', 'refernece', 'self-help') + AND i_brand IN ('scholaramalgamalg #14', 'scholaramalgamalg #7', + 'exportiunivamalg #9', 'scholaramalgamalg #9')) + OR (i_category IN ('Women', 'Music', 'Men') + AND i_class IN ('accessories', 'classical', 'fragrances', 'pants') + AND i_brand IN ('amalgimporto #1', 'edu packscholar #1', 'exportiimporto #1', + 'importoamalg #1'))) +GROUP BY i_manager_id, d_moy) tmp1 +WHERE CASE WHEN avg_monthly_sales > 0 + THEN abs(sum_sales - avg_monthly_sales) / avg_monthly_sales + ELSE NULL END > 0.1 +ORDER BY i_manager_id + , avg_monthly_sales + , sum_sales 
+LIMIT 100 diff --git a/plugin-spark/src/test/resources/tpcds/q64.sql b/plugin-spark/src/test/resources/tpcds/q64.sql new file mode 100755 index 0000000000..8ec1d31b61 --- /dev/null +++ b/plugin-spark/src/test/resources/tpcds/q64.sql @@ -0,0 +1,92 @@ +WITH cs_ui AS +(SELECT + cs_item_sk, + sum(cs_ext_list_price) AS sale, + sum(cr_refunded_cash + cr_reversed_charge + cr_store_credit) AS refund + FROM catalog_sales + , catalog_returns + WHERE cs_item_sk = cr_item_sk + AND cs_order_number = cr_order_number + GROUP BY cs_item_sk + HAVING sum(cs_ext_list_price) > 2 * sum(cr_refunded_cash + cr_reversed_charge + cr_store_credit)), + cross_sales AS + (SELECT + i_product_name product_name, + i_item_sk item_sk, + s_store_name store_name, + s_zip store_zip, + ad1.ca_street_number b_street_number, + ad1.ca_street_name b_streen_name, + ad1.ca_city b_city, + ad1.ca_zip b_zip, + ad2.ca_street_number c_street_number, + ad2.ca_street_name c_street_name, + ad2.ca_city c_city, + ad2.ca_zip c_zip, + d1.d_year AS syear, + d2.d_year AS fsyear, + d3.d_year s2year, + count(*) cnt, + sum(ss_wholesale_cost) s1, + sum(ss_list_price) s2, + sum(ss_coupon_amt) s3 + FROM store_sales, store_returns, cs_ui, date_dim d1, date_dim d2, date_dim d3, + store, customer, customer_demographics cd1, customer_demographics cd2, + promotion, household_demographics hd1, household_demographics hd2, + customer_address ad1, customer_address ad2, income_band ib1, income_band ib2, item + WHERE ss_store_sk = s_store_sk AND + ss_sold_date_sk = d1.d_date_sk AND + ss_customer_sk = c_customer_sk AND + ss_cdemo_sk = cd1.cd_demo_sk AND + ss_hdemo_sk = hd1.hd_demo_sk AND + ss_addr_sk = ad1.ca_address_sk AND + ss_item_sk = i_item_sk AND + ss_item_sk = sr_item_sk AND + ss_ticket_number = sr_ticket_number AND + ss_item_sk = cs_ui.cs_item_sk AND + c_current_cdemo_sk = cd2.cd_demo_sk AND + c_current_hdemo_sk = hd2.hd_demo_sk AND + c_current_addr_sk = ad2.ca_address_sk AND + c_first_sales_date_sk = d2.d_date_sk AND + c_first_shipto_date_sk = d3.d_date_sk AND + ss_promo_sk = p_promo_sk AND + hd1.hd_income_band_sk = ib1.ib_income_band_sk AND + hd2.hd_income_band_sk = ib2.ib_income_band_sk AND + cd1.cd_marital_status <> cd2.cd_marital_status AND + i_color IN ('purple', 'burlywood', 'indian', 'spring', 'floral', 'medium') AND + i_current_price BETWEEN 64 AND 64 + 10 AND + i_current_price BETWEEN 64 + 1 AND 64 + 15 + GROUP BY i_product_name, i_item_sk, s_store_name, s_zip, ad1.ca_street_number, + ad1.ca_street_name, ad1.ca_city, ad1.ca_zip, ad2.ca_street_number, + ad2.ca_street_name, ad2.ca_city, ad2.ca_zip, d1.d_year, d2.d_year, d3.d_year + ) +SELECT + cs1.product_name, + cs1.store_name, + cs1.store_zip, + cs1.b_street_number, + cs1.b_streen_name, + cs1.b_city, + cs1.b_zip, + cs1.c_street_number, + cs1.c_street_name, + cs1.c_city, + cs1.c_zip, + cs1.syear, + cs1.cnt, + cs1.s1, + cs1.s2, + cs1.s3, + cs2.s1, + cs2.s2, + cs2.s3, + cs2.syear, + cs2.cnt +FROM cross_sales cs1, cross_sales cs2 +WHERE cs1.item_sk = cs2.item_sk AND + cs1.syear = 1999 AND + cs2.syear = 1999 + 1 AND + cs2.cnt <= cs1.cnt AND + cs1.store_name = cs2.store_name AND + cs1.store_zip = cs2.store_zip +ORDER BY cs1.product_name, cs1.store_name, cs2.cnt diff --git a/plugin-spark/src/test/resources/tpcds/q65.sql b/plugin-spark/src/test/resources/tpcds/q65.sql new file mode 100755 index 0000000000..aad04be1bc --- /dev/null +++ b/plugin-spark/src/test/resources/tpcds/q65.sql @@ -0,0 +1,33 @@ +SELECT + s_store_name, + i_item_desc, + sc.revenue, + i_current_price, + i_wholesale_cost, + i_brand +FROM 
store, item, + (SELECT + ss_store_sk, + avg(revenue) AS ave + FROM + (SELECT + ss_store_sk, + ss_item_sk, + sum(ss_sales_price) AS revenue + FROM store_sales, date_dim + WHERE ss_sold_date_sk = d_date_sk AND d_month_seq BETWEEN 1176 AND 1176 + 11 + GROUP BY ss_store_sk, ss_item_sk) sa + GROUP BY ss_store_sk) sb, + (SELECT + ss_store_sk, + ss_item_sk, + sum(ss_sales_price) AS revenue + FROM store_sales, date_dim + WHERE ss_sold_date_sk = d_date_sk AND d_month_seq BETWEEN 1176 AND 1176 + 11 + GROUP BY ss_store_sk, ss_item_sk) sc +WHERE sb.ss_store_sk = sc.ss_store_sk AND + sc.revenue <= 0.1 * sb.ave AND + s_store_sk = sc.ss_store_sk AND + i_item_sk = sc.ss_item_sk +ORDER BY s_store_name, i_item_desc +LIMIT 100 diff --git a/plugin-spark/src/test/resources/tpcds/q66.sql b/plugin-spark/src/test/resources/tpcds/q66.sql new file mode 100755 index 0000000000..f826b41643 --- /dev/null +++ b/plugin-spark/src/test/resources/tpcds/q66.sql @@ -0,0 +1,240 @@ +SELECT + w_warehouse_name, + w_warehouse_sq_ft, + w_city, + w_county, + w_state, + w_country, + ship_carriers, + year, + sum(jan_sales) AS jan_sales, + sum(feb_sales) AS feb_sales, + sum(mar_sales) AS mar_sales, + sum(apr_sales) AS apr_sales, + sum(may_sales) AS may_sales, + sum(jun_sales) AS jun_sales, + sum(jul_sales) AS jul_sales, + sum(aug_sales) AS aug_sales, + sum(sep_sales) AS sep_sales, + sum(oct_sales) AS oct_sales, + sum(nov_sales) AS nov_sales, + sum(dec_sales) AS dec_sales, + sum(jan_sales / w_warehouse_sq_ft) AS jan_sales_per_sq_foot, + sum(feb_sales / w_warehouse_sq_ft) AS feb_sales_per_sq_foot, + sum(mar_sales / w_warehouse_sq_ft) AS mar_sales_per_sq_foot, + sum(apr_sales / w_warehouse_sq_ft) AS apr_sales_per_sq_foot, + sum(may_sales / w_warehouse_sq_ft) AS may_sales_per_sq_foot, + sum(jun_sales / w_warehouse_sq_ft) AS jun_sales_per_sq_foot, + sum(jul_sales / w_warehouse_sq_ft) AS jul_sales_per_sq_foot, + sum(aug_sales / w_warehouse_sq_ft) AS aug_sales_per_sq_foot, + sum(sep_sales / w_warehouse_sq_ft) AS sep_sales_per_sq_foot, + sum(oct_sales / w_warehouse_sq_ft) AS oct_sales_per_sq_foot, + sum(nov_sales / w_warehouse_sq_ft) AS nov_sales_per_sq_foot, + sum(dec_sales / w_warehouse_sq_ft) AS dec_sales_per_sq_foot, + sum(jan_net) AS jan_net, + sum(feb_net) AS feb_net, + sum(mar_net) AS mar_net, + sum(apr_net) AS apr_net, + sum(may_net) AS may_net, + sum(jun_net) AS jun_net, + sum(jul_net) AS jul_net, + sum(aug_net) AS aug_net, + sum(sep_net) AS sep_net, + sum(oct_net) AS oct_net, + sum(nov_net) AS nov_net, + sum(dec_net) AS dec_net +FROM ( + (SELECT + w_warehouse_name, + w_warehouse_sq_ft, + w_city, + w_county, + w_state, + w_country, + concat('DHL', ',', 'BARIAN') AS ship_carriers, + d_year AS year, + sum(CASE WHEN d_moy = 1 + THEN ws_ext_sales_price * ws_quantity + ELSE 0 END) AS jan_sales, + sum(CASE WHEN d_moy = 2 + THEN ws_ext_sales_price * ws_quantity + ELSE 0 END) AS feb_sales, + sum(CASE WHEN d_moy = 3 + THEN ws_ext_sales_price * ws_quantity + ELSE 0 END) AS mar_sales, + sum(CASE WHEN d_moy = 4 + THEN ws_ext_sales_price * ws_quantity + ELSE 0 END) AS apr_sales, + sum(CASE WHEN d_moy = 5 + THEN ws_ext_sales_price * ws_quantity + ELSE 0 END) AS may_sales, + sum(CASE WHEN d_moy = 6 + THEN ws_ext_sales_price * ws_quantity + ELSE 0 END) AS jun_sales, + sum(CASE WHEN d_moy = 7 + THEN ws_ext_sales_price * ws_quantity + ELSE 0 END) AS jul_sales, + sum(CASE WHEN d_moy = 8 + THEN ws_ext_sales_price * ws_quantity + ELSE 0 END) AS aug_sales, + sum(CASE WHEN d_moy = 9 + THEN ws_ext_sales_price * ws_quantity + ELSE 0 END) AS sep_sales, + 
sum(CASE WHEN d_moy = 10 + THEN ws_ext_sales_price * ws_quantity + ELSE 0 END) AS oct_sales, + sum(CASE WHEN d_moy = 11 + THEN ws_ext_sales_price * ws_quantity + ELSE 0 END) AS nov_sales, + sum(CASE WHEN d_moy = 12 + THEN ws_ext_sales_price * ws_quantity + ELSE 0 END) AS dec_sales, + sum(CASE WHEN d_moy = 1 + THEN ws_net_paid * ws_quantity + ELSE 0 END) AS jan_net, + sum(CASE WHEN d_moy = 2 + THEN ws_net_paid * ws_quantity + ELSE 0 END) AS feb_net, + sum(CASE WHEN d_moy = 3 + THEN ws_net_paid * ws_quantity + ELSE 0 END) AS mar_net, + sum(CASE WHEN d_moy = 4 + THEN ws_net_paid * ws_quantity + ELSE 0 END) AS apr_net, + sum(CASE WHEN d_moy = 5 + THEN ws_net_paid * ws_quantity + ELSE 0 END) AS may_net, + sum(CASE WHEN d_moy = 6 + THEN ws_net_paid * ws_quantity + ELSE 0 END) AS jun_net, + sum(CASE WHEN d_moy = 7 + THEN ws_net_paid * ws_quantity + ELSE 0 END) AS jul_net, + sum(CASE WHEN d_moy = 8 + THEN ws_net_paid * ws_quantity + ELSE 0 END) AS aug_net, + sum(CASE WHEN d_moy = 9 + THEN ws_net_paid * ws_quantity + ELSE 0 END) AS sep_net, + sum(CASE WHEN d_moy = 10 + THEN ws_net_paid * ws_quantity + ELSE 0 END) AS oct_net, + sum(CASE WHEN d_moy = 11 + THEN ws_net_paid * ws_quantity + ELSE 0 END) AS nov_net, + sum(CASE WHEN d_moy = 12 + THEN ws_net_paid * ws_quantity + ELSE 0 END) AS dec_net + FROM + web_sales, warehouse, date_dim, time_dim, ship_mode + WHERE + ws_warehouse_sk = w_warehouse_sk + AND ws_sold_date_sk = d_date_sk + AND ws_sold_time_sk = t_time_sk + AND ws_ship_mode_sk = sm_ship_mode_sk + AND d_year = 2001 + AND t_time BETWEEN 30838 AND 30838 + 28800 + AND sm_carrier IN ('DHL', 'BARIAN') + GROUP BY + w_warehouse_name, w_warehouse_sq_ft, w_city, w_county, w_state, w_country, d_year) + UNION ALL + (SELECT + w_warehouse_name, + w_warehouse_sq_ft, + w_city, + w_county, + w_state, + w_country, + concat('DHL', ',', 'BARIAN') AS ship_carriers, + d_year AS year, + sum(CASE WHEN d_moy = 1 + THEN cs_sales_price * cs_quantity + ELSE 0 END) AS jan_sales, + sum(CASE WHEN d_moy = 2 + THEN cs_sales_price * cs_quantity + ELSE 0 END) AS feb_sales, + sum(CASE WHEN d_moy = 3 + THEN cs_sales_price * cs_quantity + ELSE 0 END) AS mar_sales, + sum(CASE WHEN d_moy = 4 + THEN cs_sales_price * cs_quantity + ELSE 0 END) AS apr_sales, + sum(CASE WHEN d_moy = 5 + THEN cs_sales_price * cs_quantity + ELSE 0 END) AS may_sales, + sum(CASE WHEN d_moy = 6 + THEN cs_sales_price * cs_quantity + ELSE 0 END) AS jun_sales, + sum(CASE WHEN d_moy = 7 + THEN cs_sales_price * cs_quantity + ELSE 0 END) AS jul_sales, + sum(CASE WHEN d_moy = 8 + THEN cs_sales_price * cs_quantity + ELSE 0 END) AS aug_sales, + sum(CASE WHEN d_moy = 9 + THEN cs_sales_price * cs_quantity + ELSE 0 END) AS sep_sales, + sum(CASE WHEN d_moy = 10 + THEN cs_sales_price * cs_quantity + ELSE 0 END) AS oct_sales, + sum(CASE WHEN d_moy = 11 + THEN cs_sales_price * cs_quantity + ELSE 0 END) AS nov_sales, + sum(CASE WHEN d_moy = 12 + THEN cs_sales_price * cs_quantity + ELSE 0 END) AS dec_sales, + sum(CASE WHEN d_moy = 1 + THEN cs_net_paid_inc_tax * cs_quantity + ELSE 0 END) AS jan_net, + sum(CASE WHEN d_moy = 2 + THEN cs_net_paid_inc_tax * cs_quantity + ELSE 0 END) AS feb_net, + sum(CASE WHEN d_moy = 3 + THEN cs_net_paid_inc_tax * cs_quantity + ELSE 0 END) AS mar_net, + sum(CASE WHEN d_moy = 4 + THEN cs_net_paid_inc_tax * cs_quantity + ELSE 0 END) AS apr_net, + sum(CASE WHEN d_moy = 5 + THEN cs_net_paid_inc_tax * cs_quantity + ELSE 0 END) AS may_net, + sum(CASE WHEN d_moy = 6 + THEN cs_net_paid_inc_tax * cs_quantity + ELSE 0 END) AS jun_net, + sum(CASE WHEN d_moy 
= 7 + THEN cs_net_paid_inc_tax * cs_quantity + ELSE 0 END) AS jul_net, + sum(CASE WHEN d_moy = 8 + THEN cs_net_paid_inc_tax * cs_quantity + ELSE 0 END) AS aug_net, + sum(CASE WHEN d_moy = 9 + THEN cs_net_paid_inc_tax * cs_quantity + ELSE 0 END) AS sep_net, + sum(CASE WHEN d_moy = 10 + THEN cs_net_paid_inc_tax * cs_quantity + ELSE 0 END) AS oct_net, + sum(CASE WHEN d_moy = 11 + THEN cs_net_paid_inc_tax * cs_quantity + ELSE 0 END) AS nov_net, + sum(CASE WHEN d_moy = 12 + THEN cs_net_paid_inc_tax * cs_quantity + ELSE 0 END) AS dec_net + FROM + catalog_sales, warehouse, date_dim, time_dim, ship_mode + WHERE + cs_warehouse_sk = w_warehouse_sk + AND cs_sold_date_sk = d_date_sk + AND cs_sold_time_sk = t_time_sk + AND cs_ship_mode_sk = sm_ship_mode_sk + AND d_year = 2001 + AND t_time BETWEEN 30838 AND 30838 + 28800 + AND sm_carrier IN ('DHL', 'BARIAN') + GROUP BY + w_warehouse_name, w_warehouse_sq_ft, w_city, w_county, w_state, w_country, d_year + ) + ) x +GROUP BY + w_warehouse_name, w_warehouse_sq_ft, w_city, w_county, w_state, w_country, + ship_carriers, year +ORDER BY w_warehouse_name +LIMIT 100 diff --git a/plugin-spark/src/test/resources/tpcds/q67.sql b/plugin-spark/src/test/resources/tpcds/q67.sql new file mode 100755 index 0000000000..f66e2252bd --- /dev/null +++ b/plugin-spark/src/test/resources/tpcds/q67.sql @@ -0,0 +1,38 @@ +SELECT * +FROM + (SELECT + i_category, + i_class, + i_brand, + i_product_name, + d_year, + d_qoy, + d_moy, + s_store_id, + sumsales, + rank() + OVER (PARTITION BY i_category + ORDER BY sumsales DESC) rk + FROM + (SELECT + i_category, + i_class, + i_brand, + i_product_name, + d_year, + d_qoy, + d_moy, + s_store_id, + sum(coalesce(ss_sales_price * ss_quantity, 0)) sumsales + FROM store_sales, date_dim, store, item + WHERE ss_sold_date_sk = d_date_sk + AND ss_item_sk = i_item_sk + AND ss_store_sk = s_store_sk + AND d_month_seq BETWEEN 1200 AND 1200 + 11 + GROUP BY ROLLUP (i_category, i_class, i_brand, i_product_name, d_year, d_qoy, + d_moy, s_store_id)) dw1) dw2 +WHERE rk <= 100 +ORDER BY + i_category, i_class, i_brand, i_product_name, d_year, + d_qoy, d_moy, s_store_id, sumsales, rk +LIMIT 100 diff --git a/plugin-spark/src/test/resources/tpcds/q68.sql b/plugin-spark/src/test/resources/tpcds/q68.sql new file mode 100755 index 0000000000..adb8a7189d --- /dev/null +++ b/plugin-spark/src/test/resources/tpcds/q68.sql @@ -0,0 +1,34 @@ +SELECT + c_last_name, + c_first_name, + ca_city, + bought_city, + ss_ticket_number, + extended_price, + extended_tax, + list_price +FROM (SELECT + ss_ticket_number, + ss_customer_sk, + ca_city bought_city, + sum(ss_ext_sales_price) extended_price, + sum(ss_ext_list_price) list_price, + sum(ss_ext_tax) extended_tax +FROM store_sales, date_dim, store, household_demographics, customer_address +WHERE store_sales.ss_sold_date_sk = date_dim.d_date_sk + AND store_sales.ss_store_sk = store.s_store_sk + AND store_sales.ss_hdemo_sk = household_demographics.hd_demo_sk + AND store_sales.ss_addr_sk = customer_address.ca_address_sk + AND date_dim.d_dom BETWEEN 1 AND 2 + AND (household_demographics.hd_dep_count = 4 OR + household_demographics.hd_vehicle_count = 3) + AND date_dim.d_year IN (1999, 1999 + 1, 1999 + 2) + AND store.s_city IN ('Midway', 'Fairview') +GROUP BY ss_ticket_number, ss_customer_sk, ss_addr_sk, ca_city) dn, + customer, + customer_address current_addr +WHERE ss_customer_sk = c_customer_sk + AND customer.c_current_addr_sk = current_addr.ca_address_sk + AND current_addr.ca_city <> bought_city +ORDER BY c_last_name, ss_ticket_number +LIMIT 
100 diff --git a/plugin-spark/src/test/resources/tpcds/q69.sql b/plugin-spark/src/test/resources/tpcds/q69.sql new file mode 100755 index 0000000000..1f0ee64f56 --- /dev/null +++ b/plugin-spark/src/test/resources/tpcds/q69.sql @@ -0,0 +1,38 @@ +SELECT + cd_gender, + cd_marital_status, + cd_education_status, + count(*) cnt1, + cd_purchase_estimate, + count(*) cnt2, + cd_credit_rating, + count(*) cnt3 +FROM + customer c, customer_address ca, customer_demographics +WHERE + c.c_current_addr_sk = ca.ca_address_sk AND + ca_state IN ('KY', 'GA', 'NM') AND + cd_demo_sk = c.c_current_cdemo_sk AND + exists(SELECT * + FROM store_sales, date_dim + WHERE c.c_customer_sk = ss_customer_sk AND + ss_sold_date_sk = d_date_sk AND + d_year = 2001 AND + d_moy BETWEEN 4 AND 4 + 2) AND + (NOT exists(SELECT * + FROM web_sales, date_dim + WHERE c.c_customer_sk = ws_bill_customer_sk AND + ws_sold_date_sk = d_date_sk AND + d_year = 2001 AND + d_moy BETWEEN 4 AND 4 + 2) AND + NOT exists(SELECT * + FROM catalog_sales, date_dim + WHERE c.c_customer_sk = cs_ship_customer_sk AND + cs_sold_date_sk = d_date_sk AND + d_year = 2001 AND + d_moy BETWEEN 4 AND 4 + 2)) +GROUP BY cd_gender, cd_marital_status, cd_education_status, + cd_purchase_estimate, cd_credit_rating +ORDER BY cd_gender, cd_marital_status, cd_education_status, + cd_purchase_estimate, cd_credit_rating +LIMIT 100 diff --git a/plugin-spark/src/test/resources/tpcds/q7.sql b/plugin-spark/src/test/resources/tpcds/q7.sql new file mode 100755 index 0000000000..6630a00548 --- /dev/null +++ b/plugin-spark/src/test/resources/tpcds/q7.sql @@ -0,0 +1,19 @@ +SELECT + i_item_id, + avg(ss_quantity) agg1, + avg(ss_list_price) agg2, + avg(ss_coupon_amt) agg3, + avg(ss_sales_price) agg4 +FROM store_sales, customer_demographics, date_dim, item, promotion +WHERE ss_sold_date_sk = d_date_sk AND + ss_item_sk = i_item_sk AND + ss_cdemo_sk = cd_demo_sk AND + ss_promo_sk = p_promo_sk AND + cd_gender = 'M' AND + cd_marital_status = 'S' AND + cd_education_status = 'College' AND + (p_channel_email = 'N' OR p_channel_event = 'N') AND + d_year = 2000 +GROUP BY i_item_id +ORDER BY i_item_id +LIMIT 100 diff --git a/plugin-spark/src/test/resources/tpcds/q70.sql b/plugin-spark/src/test/resources/tpcds/q70.sql new file mode 100755 index 0000000000..625011b212 --- /dev/null +++ b/plugin-spark/src/test/resources/tpcds/q70.sql @@ -0,0 +1,38 @@ +SELECT + sum(ss_net_profit) AS total_sum, + s_state, + s_county, + grouping(s_state) + grouping(s_county) AS lochierarchy, + rank() + OVER ( + PARTITION BY grouping(s_state) + grouping(s_county), + CASE WHEN grouping(s_county) = 0 + THEN s_state END + ORDER BY sum(ss_net_profit) DESC) AS rank_within_parent +FROM + store_sales, date_dim d1, store +WHERE + d1.d_month_seq BETWEEN 1200 AND 1200 + 11 + AND d1.d_date_sk = ss_sold_date_sk + AND s_store_sk = ss_store_sk + AND s_state IN + (SELECT s_state + FROM + (SELECT + s_state AS s_state, + rank() + OVER (PARTITION BY s_state + ORDER BY sum(ss_net_profit) DESC) AS ranking + FROM store_sales, store, date_dim + WHERE d_month_seq BETWEEN 1200 AND 1200 + 11 + AND d_date_sk = ss_sold_date_sk + AND s_store_sk = ss_store_sk + GROUP BY s_state) tmp1 + WHERE ranking <= 5) +GROUP BY ROLLUP (s_state, s_county) +ORDER BY + lochierarchy DESC + , CASE WHEN lochierarchy = 0 + THEN s_state END + , rank_within_parent +LIMIT 100 diff --git a/plugin-spark/src/test/resources/tpcds/q71.sql b/plugin-spark/src/test/resources/tpcds/q71.sql new file mode 100755 index 0000000000..8d724b9244 --- /dev/null +++ 
b/plugin-spark/src/test/resources/tpcds/q71.sql @@ -0,0 +1,44 @@ +SELECT + i_brand_id brand_id, + i_brand brand, + t_hour, + t_minute, + sum(ext_price) ext_price +FROM item, + (SELECT + ws_ext_sales_price AS ext_price, + ws_sold_date_sk AS sold_date_sk, + ws_item_sk AS sold_item_sk, + ws_sold_time_sk AS time_sk + FROM web_sales, date_dim + WHERE d_date_sk = ws_sold_date_sk + AND d_moy = 11 + AND d_year = 1999 + UNION ALL + SELECT + cs_ext_sales_price AS ext_price, + cs_sold_date_sk AS sold_date_sk, + cs_item_sk AS sold_item_sk, + cs_sold_time_sk AS time_sk + FROM catalog_sales, date_dim + WHERE d_date_sk = cs_sold_date_sk + AND d_moy = 11 + AND d_year = 1999 + UNION ALL + SELECT + ss_ext_sales_price AS ext_price, + ss_sold_date_sk AS sold_date_sk, + ss_item_sk AS sold_item_sk, + ss_sold_time_sk AS time_sk + FROM store_sales, date_dim + WHERE d_date_sk = ss_sold_date_sk + AND d_moy = 11 + AND d_year = 1999 + ) AS tmp, time_dim +WHERE + sold_item_sk = i_item_sk + AND i_manager_id = 1 + AND time_sk = t_time_sk + AND (t_meal_time = 'breakfast' OR t_meal_time = 'dinner') +GROUP BY i_brand, i_brand_id, t_hour, t_minute +ORDER BY ext_price DESC, brand_id diff --git a/plugin-spark/src/test/resources/tpcds/q72.sql b/plugin-spark/src/test/resources/tpcds/q72.sql new file mode 100755 index 0000000000..99b3eee54a --- /dev/null +++ b/plugin-spark/src/test/resources/tpcds/q72.sql @@ -0,0 +1,33 @@ +SELECT + i_item_desc, + w_warehouse_name, + d1.d_week_seq, + count(CASE WHEN p_promo_sk IS NULL + THEN 1 + ELSE 0 END) no_promo, + count(CASE WHEN p_promo_sk IS NOT NULL + THEN 1 + ELSE 0 END) promo, + count(*) total_cnt +FROM catalog_sales + JOIN inventory ON (cs_item_sk = inv_item_sk) + JOIN warehouse ON (w_warehouse_sk = inv_warehouse_sk) + JOIN item ON (i_item_sk = cs_item_sk) + JOIN customer_demographics ON (cs_bill_cdemo_sk = cd_demo_sk) + JOIN household_demographics ON (cs_bill_hdemo_sk = hd_demo_sk) + JOIN date_dim d1 ON (cs_sold_date_sk = d1.d_date_sk) + JOIN date_dim d2 ON (inv_date_sk = d2.d_date_sk) + JOIN date_dim d3 ON (cs_ship_date_sk = d3.d_date_sk) + LEFT OUTER JOIN promotion ON (cs_promo_sk = p_promo_sk) + LEFT OUTER JOIN catalog_returns ON (cr_item_sk = cs_item_sk AND cr_order_number = cs_order_number) +WHERE d1.d_week_seq = d2.d_week_seq + AND inv_quantity_on_hand < cs_quantity + AND d3.d_date > (cast(d1.d_date AS DATE) + interval 5 days) + AND hd_buy_potential = '>10000' + AND d1.d_year = 1999 + AND hd_buy_potential = '>10000' + AND cd_marital_status = 'D' + AND d1.d_year = 1999 +GROUP BY i_item_desc, w_warehouse_name, d1.d_week_seq +ORDER BY total_cnt DESC, i_item_desc, w_warehouse_name, d_week_seq +LIMIT 100 diff --git a/plugin-spark/src/test/resources/tpcds/q73.sql b/plugin-spark/src/test/resources/tpcds/q73.sql new file mode 100755 index 0000000000..881be2e902 --- /dev/null +++ b/plugin-spark/src/test/resources/tpcds/q73.sql @@ -0,0 +1,30 @@ +SELECT + c_last_name, + c_first_name, + c_salutation, + c_preferred_cust_flag, + ss_ticket_number, + cnt +FROM + (SELECT + ss_ticket_number, + ss_customer_sk, + count(*) cnt + FROM store_sales, date_dim, store, household_demographics + WHERE store_sales.ss_sold_date_sk = date_dim.d_date_sk + AND store_sales.ss_store_sk = store.s_store_sk + AND store_sales.ss_hdemo_sk = household_demographics.hd_demo_sk + AND date_dim.d_dom BETWEEN 1 AND 2 + AND (household_demographics.hd_buy_potential = '>10000' OR + household_demographics.hd_buy_potential = 'unknown') + AND household_demographics.hd_vehicle_count > 0 + AND CASE WHEN 
household_demographics.hd_vehicle_count > 0 + THEN + household_demographics.hd_dep_count / household_demographics.hd_vehicle_count + ELSE NULL END > 1 + AND date_dim.d_year IN (1999, 1999 + 1, 1999 + 2) + AND store.s_county IN ('Williamson County', 'Franklin Parish', 'Bronx County', 'Orange County') + GROUP BY ss_ticket_number, ss_customer_sk) dj, customer +WHERE ss_customer_sk = c_customer_sk + AND cnt BETWEEN 1 AND 5 +ORDER BY cnt DESC diff --git a/plugin-spark/src/test/resources/tpcds/q74.sql b/plugin-spark/src/test/resources/tpcds/q74.sql new file mode 100755 index 0000000000..154b26d680 --- /dev/null +++ b/plugin-spark/src/test/resources/tpcds/q74.sql @@ -0,0 +1,58 @@ +WITH year_total AS ( + SELECT + c_customer_id customer_id, + c_first_name customer_first_name, + c_last_name customer_last_name, + d_year AS year, + sum(ss_net_paid) year_total, + 's' sale_type + FROM + customer, store_sales, date_dim + WHERE c_customer_sk = ss_customer_sk + AND ss_sold_date_sk = d_date_sk + AND d_year IN (2001, 2001 + 1) + GROUP BY + c_customer_id, c_first_name, c_last_name, d_year + UNION ALL + SELECT + c_customer_id customer_id, + c_first_name customer_first_name, + c_last_name customer_last_name, + d_year AS year, + sum(ws_net_paid) year_total, + 'w' sale_type + FROM + customer, web_sales, date_dim + WHERE c_customer_sk = ws_bill_customer_sk + AND ws_sold_date_sk = d_date_sk + AND d_year IN (2001, 2001 + 1) + GROUP BY + c_customer_id, c_first_name, c_last_name, d_year) +SELECT + t_s_secyear.customer_id, + t_s_secyear.customer_first_name, + t_s_secyear.customer_last_name +FROM + year_total t_s_firstyear, year_total t_s_secyear, + year_total t_w_firstyear, year_total t_w_secyear +WHERE t_s_secyear.customer_id = t_s_firstyear.customer_id + AND t_s_firstyear.customer_id = t_w_secyear.customer_id + AND t_s_firstyear.customer_id = t_w_firstyear.customer_id + AND t_s_firstyear.sale_type = 's' + AND t_w_firstyear.sale_type = 'w' + AND t_s_secyear.sale_type = 's' + AND t_w_secyear.sale_type = 'w' + AND t_s_firstyear.year = 2001 + AND t_s_secyear.year = 2001 + 1 + AND t_w_firstyear.year = 2001 + AND t_w_secyear.year = 2001 + 1 + AND t_s_firstyear.year_total > 0 + AND t_w_firstyear.year_total > 0 + AND CASE WHEN t_w_firstyear.year_total > 0 + THEN t_w_secyear.year_total / t_w_firstyear.year_total + ELSE NULL END + > CASE WHEN t_s_firstyear.year_total > 0 + THEN t_s_secyear.year_total / t_s_firstyear.year_total + ELSE NULL END +ORDER BY 1, 1, 1 +LIMIT 100 diff --git a/plugin-spark/src/test/resources/tpcds/q75.sql b/plugin-spark/src/test/resources/tpcds/q75.sql new file mode 100755 index 0000000000..2a143232b5 --- /dev/null +++ b/plugin-spark/src/test/resources/tpcds/q75.sql @@ -0,0 +1,76 @@ +WITH all_sales AS ( + SELECT + d_year, + i_brand_id, + i_class_id, + i_category_id, + i_manufact_id, + SUM(sales_cnt) AS sales_cnt, + SUM(sales_amt) AS sales_amt + FROM ( + SELECT + d_year, + i_brand_id, + i_class_id, + i_category_id, + i_manufact_id, + cs_quantity - COALESCE(cr_return_quantity, 0) AS sales_cnt, + cs_ext_sales_price - COALESCE(cr_return_amount, 0.0) AS sales_amt + FROM catalog_sales + JOIN item ON i_item_sk = cs_item_sk + JOIN date_dim ON d_date_sk = cs_sold_date_sk + LEFT JOIN catalog_returns ON (cs_order_number = cr_order_number + AND cs_item_sk = cr_item_sk) + WHERE i_category = 'Books' + UNION + SELECT + d_year, + i_brand_id, + i_class_id, + i_category_id, + i_manufact_id, + ss_quantity - COALESCE(sr_return_quantity, 0) AS sales_cnt, + ss_ext_sales_price - COALESCE(sr_return_amt, 0.0) AS sales_amt + 
FROM store_sales + JOIN item ON i_item_sk = ss_item_sk + JOIN date_dim ON d_date_sk = ss_sold_date_sk + LEFT JOIN store_returns ON (ss_ticket_number = sr_ticket_number + AND ss_item_sk = sr_item_sk) + WHERE i_category = 'Books' + UNION + SELECT + d_year, + i_brand_id, + i_class_id, + i_category_id, + i_manufact_id, + ws_quantity - COALESCE(wr_return_quantity, 0) AS sales_cnt, + ws_ext_sales_price - COALESCE(wr_return_amt, 0.0) AS sales_amt + FROM web_sales + JOIN item ON i_item_sk = ws_item_sk + JOIN date_dim ON d_date_sk = ws_sold_date_sk + LEFT JOIN web_returns ON (ws_order_number = wr_order_number + AND ws_item_sk = wr_item_sk) + WHERE i_category = 'Books') sales_detail + GROUP BY d_year, i_brand_id, i_class_id, i_category_id, i_manufact_id) +SELECT + prev_yr.d_year AS prev_year, + curr_yr.d_year AS year, + curr_yr.i_brand_id, + curr_yr.i_class_id, + curr_yr.i_category_id, + curr_yr.i_manufact_id, + prev_yr.sales_cnt AS prev_yr_cnt, + curr_yr.sales_cnt AS curr_yr_cnt, + curr_yr.sales_cnt - prev_yr.sales_cnt AS sales_cnt_diff, + curr_yr.sales_amt - prev_yr.sales_amt AS sales_amt_diff +FROM all_sales curr_yr, all_sales prev_yr +WHERE curr_yr.i_brand_id = prev_yr.i_brand_id + AND curr_yr.i_class_id = prev_yr.i_class_id + AND curr_yr.i_category_id = prev_yr.i_category_id + AND curr_yr.i_manufact_id = prev_yr.i_manufact_id + AND curr_yr.d_year = 2002 + AND prev_yr.d_year = 2002 - 1 + AND CAST(curr_yr.sales_cnt AS DECIMAL(17, 2)) / CAST(prev_yr.sales_cnt AS DECIMAL(17, 2)) < 0.9 +ORDER BY sales_cnt_diff +LIMIT 100 diff --git a/plugin-spark/src/test/resources/tpcds/q76.sql b/plugin-spark/src/test/resources/tpcds/q76.sql new file mode 100755 index 0000000000..815fa922be --- /dev/null +++ b/plugin-spark/src/test/resources/tpcds/q76.sql @@ -0,0 +1,47 @@ +SELECT + channel, + col_name, + d_year, + d_qoy, + i_category, + COUNT(*) sales_cnt, + SUM(ext_sales_price) sales_amt +FROM ( + SELECT + 'store' AS channel, + ss_store_sk col_name, + d_year, + d_qoy, + i_category, + ss_ext_sales_price ext_sales_price + FROM store_sales, item, date_dim + WHERE ss_store_sk IS NULL + AND ss_sold_date_sk = d_date_sk + AND ss_item_sk = i_item_sk + UNION ALL + SELECT + 'web' AS channel, + ws_ship_customer_sk col_name, + d_year, + d_qoy, + i_category, + ws_ext_sales_price ext_sales_price + FROM web_sales, item, date_dim + WHERE ws_ship_customer_sk IS NULL + AND ws_sold_date_sk = d_date_sk + AND ws_item_sk = i_item_sk + UNION ALL + SELECT + 'catalog' AS channel, + cs_ship_addr_sk col_name, + d_year, + d_qoy, + i_category, + cs_ext_sales_price ext_sales_price + FROM catalog_sales, item, date_dim + WHERE cs_ship_addr_sk IS NULL + AND cs_sold_date_sk = d_date_sk + AND cs_item_sk = i_item_sk) foo +GROUP BY channel, col_name, d_year, d_qoy, i_category +ORDER BY channel, col_name, d_year, d_qoy, i_category +LIMIT 100 diff --git a/plugin-spark/src/test/resources/tpcds/q77.sql b/plugin-spark/src/test/resources/tpcds/q77.sql new file mode 100755 index 0000000000..a69df9fbcd --- /dev/null +++ b/plugin-spark/src/test/resources/tpcds/q77.sql @@ -0,0 +1,100 @@ +WITH ss AS +(SELECT + s_store_sk, + sum(ss_ext_sales_price) AS sales, + sum(ss_net_profit) AS profit + FROM store_sales, date_dim, store + WHERE ss_sold_date_sk = d_date_sk + AND d_date BETWEEN cast('2000-08-03' AS DATE) AND + (cast('2000-08-03' AS DATE) + INTERVAL 30 days) + AND ss_store_sk = s_store_sk + GROUP BY s_store_sk), + sr AS + (SELECT + s_store_sk, + sum(sr_return_amt) AS returns, + sum(sr_net_loss) AS profit_loss + FROM store_returns, date_dim, store + WHERE 
sr_returned_date_sk = d_date_sk + AND d_date BETWEEN cast('2000-08-03' AS DATE) AND + (cast('2000-08-03' AS DATE) + INTERVAL 30 days) + AND sr_store_sk = s_store_sk + GROUP BY s_store_sk), + cs AS + (SELECT + cs_call_center_sk, + sum(cs_ext_sales_price) AS sales, + sum(cs_net_profit) AS profit + FROM catalog_sales, date_dim + WHERE cs_sold_date_sk = d_date_sk + AND d_date BETWEEN cast('2000-08-03' AS DATE) AND + (cast('2000-08-03' AS DATE) + INTERVAL 30 days) + GROUP BY cs_call_center_sk), + cr AS + (SELECT + sum(cr_return_amount) AS returns, + sum(cr_net_loss) AS profit_loss + FROM catalog_returns, date_dim + WHERE cr_returned_date_sk = d_date_sk + AND d_date BETWEEN cast('2000-08-03' AS DATE) AND + (cast('2000-08-03' AS DATE) + INTERVAL 30 days)), + ws AS + (SELECT + wp_web_page_sk, + sum(ws_ext_sales_price) AS sales, + sum(ws_net_profit) AS profit + FROM web_sales, date_dim, web_page + WHERE ws_sold_date_sk = d_date_sk + AND d_date BETWEEN cast('2000-08-03' AS DATE) AND + (cast('2000-08-03' AS DATE) + INTERVAL 30 days) + AND ws_web_page_sk = wp_web_page_sk + GROUP BY wp_web_page_sk), + wr AS + (SELECT + wp_web_page_sk, + sum(wr_return_amt) AS returns, + sum(wr_net_loss) AS profit_loss + FROM web_returns, date_dim, web_page + WHERE wr_returned_date_sk = d_date_sk + AND d_date BETWEEN cast('2000-08-03' AS DATE) AND + (cast('2000-08-03' AS DATE) + INTERVAL 30 days) + AND wr_web_page_sk = wp_web_page_sk + GROUP BY wp_web_page_sk) +SELECT + channel, + id, + sum(sales) AS sales, + sum(returns) AS returns, + sum(profit) AS profit +FROM + (SELECT + 'store channel' AS channel, + ss.s_store_sk AS id, + sales, + coalesce(returns, 0) AS returns, + (profit - coalesce(profit_loss, 0)) AS profit + FROM ss + LEFT JOIN sr + ON ss.s_store_sk = sr.s_store_sk + UNION ALL + SELECT + 'catalog channel' AS channel, + cs_call_center_sk AS id, + sales, + returns, + (profit - profit_loss) AS profit + FROM cs, cr + UNION ALL + SELECT + 'web channel' AS channel, + ws.wp_web_page_sk AS id, + sales, + coalesce(returns, 0) returns, + (profit - coalesce(profit_loss, 0)) AS profit + FROM ws + LEFT JOIN wr + ON ws.wp_web_page_sk = wr.wp_web_page_sk + ) x +GROUP BY ROLLUP (channel, id) +ORDER BY channel, id +LIMIT 100 diff --git a/plugin-spark/src/test/resources/tpcds/q78.sql b/plugin-spark/src/test/resources/tpcds/q78.sql new file mode 100755 index 0000000000..07b0940e26 --- /dev/null +++ b/plugin-spark/src/test/resources/tpcds/q78.sql @@ -0,0 +1,64 @@ +WITH ws AS +(SELECT + d_year AS ws_sold_year, + ws_item_sk, + ws_bill_customer_sk ws_customer_sk, + sum(ws_quantity) ws_qty, + sum(ws_wholesale_cost) ws_wc, + sum(ws_sales_price) ws_sp + FROM web_sales + LEFT JOIN web_returns ON wr_order_number = ws_order_number AND ws_item_sk = wr_item_sk + JOIN date_dim ON ws_sold_date_sk = d_date_sk + WHERE wr_order_number IS NULL + GROUP BY d_year, ws_item_sk, ws_bill_customer_sk +), + cs AS + (SELECT + d_year AS cs_sold_year, + cs_item_sk, + cs_bill_customer_sk cs_customer_sk, + sum(cs_quantity) cs_qty, + sum(cs_wholesale_cost) cs_wc, + sum(cs_sales_price) cs_sp + FROM catalog_sales + LEFT JOIN catalog_returns ON cr_order_number = cs_order_number AND cs_item_sk = cr_item_sk + JOIN date_dim ON cs_sold_date_sk = d_date_sk + WHERE cr_order_number IS NULL + GROUP BY d_year, cs_item_sk, cs_bill_customer_sk + ), + ss AS + (SELECT + d_year AS ss_sold_year, + ss_item_sk, + ss_customer_sk, + sum(ss_quantity) ss_qty, + sum(ss_wholesale_cost) ss_wc, + sum(ss_sales_price) ss_sp + FROM store_sales + LEFT JOIN store_returns ON sr_ticket_number = 
ss_ticket_number AND ss_item_sk = sr_item_sk + JOIN date_dim ON ss_sold_date_sk = d_date_sk + WHERE sr_ticket_number IS NULL + GROUP BY d_year, ss_item_sk, ss_customer_sk + ) +SELECT + round(ss_qty / (coalesce(ws_qty + cs_qty, 1)), 2) ratio, + ss_qty store_qty, + ss_wc store_wholesale_cost, + ss_sp store_sales_price, + coalesce(ws_qty, 0) + coalesce(cs_qty, 0) other_chan_qty, + coalesce(ws_wc, 0) + coalesce(cs_wc, 0) other_chan_wholesale_cost, + coalesce(ws_sp, 0) + coalesce(cs_sp, 0) other_chan_sales_price +FROM ss + LEFT JOIN ws + ON (ws_sold_year = ss_sold_year AND ws_item_sk = ss_item_sk AND ws_customer_sk = ss_customer_sk) + LEFT JOIN cs + ON (cs_sold_year = ss_sold_year AND cs_item_sk = ss_item_sk AND cs_customer_sk = ss_customer_sk) +WHERE coalesce(ws_qty, 0) > 0 AND coalesce(cs_qty, 0) > 0 AND ss_sold_year = 2000 +ORDER BY + ratio, + ss_qty DESC, ss_wc DESC, ss_sp DESC, + other_chan_qty, + other_chan_wholesale_cost, + other_chan_sales_price, + round(ss_qty / (coalesce(ws_qty + cs_qty, 1)), 2) +LIMIT 100 diff --git a/plugin-spark/src/test/resources/tpcds/q79.sql b/plugin-spark/src/test/resources/tpcds/q79.sql new file mode 100755 index 0000000000..08f86dc203 --- /dev/null +++ b/plugin-spark/src/test/resources/tpcds/q79.sql @@ -0,0 +1,27 @@ +SELECT + c_last_name, + c_first_name, + substr(s_city, 1, 30), + ss_ticket_number, + amt, + profit +FROM + (SELECT + ss_ticket_number, + ss_customer_sk, + store.s_city, + sum(ss_coupon_amt) amt, + sum(ss_net_profit) profit + FROM store_sales, date_dim, store, household_demographics + WHERE store_sales.ss_sold_date_sk = date_dim.d_date_sk + AND store_sales.ss_store_sk = store.s_store_sk + AND store_sales.ss_hdemo_sk = household_demographics.hd_demo_sk + AND (household_demographics.hd_dep_count = 6 OR + household_demographics.hd_vehicle_count > 2) + AND date_dim.d_dow = 1 + AND date_dim.d_year IN (1999, 1999 + 1, 1999 + 2) + AND store.s_number_employees BETWEEN 200 AND 295 + GROUP BY ss_ticket_number, ss_customer_sk, ss_addr_sk, store.s_city) ms, customer +WHERE ss_customer_sk = c_customer_sk +ORDER BY c_last_name, c_first_name, substr(s_city, 1, 30), profit +LIMIT 100 diff --git a/plugin-spark/src/test/resources/tpcds/q8.sql b/plugin-spark/src/test/resources/tpcds/q8.sql new file mode 100755 index 0000000000..497725111f --- /dev/null +++ b/plugin-spark/src/test/resources/tpcds/q8.sql @@ -0,0 +1,87 @@ +SELECT + s_store_name, + sum(ss_net_profit) +FROM store_sales, date_dim, store, + (SELECT ca_zip + FROM ( + (SELECT substr(ca_zip, 1, 5) ca_zip + FROM customer_address + WHERE substr(ca_zip, 1, 5) IN ( + '24128','76232','65084','87816','83926','77556','20548', + '26231','43848','15126','91137','61265','98294','25782', + '17920','18426','98235','40081','84093','28577','55565', + '17183','54601','67897','22752','86284','18376','38607', + '45200','21756','29741','96765','23932','89360','29839', + '25989','28898','91068','72550','10390','18845','47770', + '82636','41367','76638','86198','81312','37126','39192', + '88424','72175','81426','53672','10445','42666','66864', + '66708','41248','48583','82276','18842','78890','49448', + '14089','38122','34425','79077','19849','43285','39861', + '66162','77610','13695','99543','83444','83041','12305', + '57665','68341','25003','57834','62878','49130','81096', + '18840','27700','23470','50412','21195','16021','76107', + '71954','68309','18119','98359','64544','10336','86379', + '27068','39736','98569','28915','24206','56529','57647', + '54917','42961','91110','63981','14922','36420','23006', + 
'67467','32754','30903','20260','31671','51798','72325', + '85816','68621','13955','36446','41766','68806','16725', + '15146','22744','35850','88086','51649','18270','52867', + '39972','96976','63792','11376','94898','13595','10516', + '90225','58943','39371','94945','28587','96576','57855', + '28488','26105','83933','25858','34322','44438','73171', + '30122','34102','22685','71256','78451','54364','13354', + '45375','40558','56458','28286','45266','47305','69399', + '83921','26233','11101','15371','69913','35942','15882', + '25631','24610','44165','99076','33786','70738','26653', + '14328','72305','62496','22152','10144','64147','48425', + '14663','21076','18799','30450','63089','81019','68893', + '24996','51200','51211','45692','92712','70466','79994', + '22437','25280','38935','71791','73134','56571','14060', + '19505','72425','56575','74351','68786','51650','20004', + '18383','76614','11634','18906','15765','41368','73241', + '76698','78567','97189','28545','76231','75691','22246', + '51061','90578','56691','68014','51103','94167','57047', + '14867','73520','15734','63435','25733','35474','24676', + '94627','53535','17879','15559','53268','59166','11928', + '59402','33282','45721','43933','68101','33515','36634', + '71286','19736','58058','55253','67473','41918','19515', + '36495','19430','22351','77191','91393','49156','50298', + '87501','18652','53179','18767','63193','23968','65164', + '68880','21286','72823','58470','67301','13394','31016', + '70372','67030','40604','24317','45748','39127','26065', + '77721','31029','31880','60576','24671','45549','13376', + '50016','33123','19769','22927','97789','46081','72151', + '15723','46136','51949','68100','96888','64528','14171', + '79777','28709','11489','25103','32213','78668','22245', + '15798','27156','37930','62971','21337','51622','67853', + '10567','38415','15455','58263','42029','60279','37125', + '56240','88190','50308','26859','64457','89091','82136', + '62377','36233','63837','58078','17043','30010','60099', + '28810','98025','29178','87343','73273','30469','64034', + '39516','86057','21309','90257','67875','40162','11356', + '73650','61810','72013','30431','22461','19512','13375', + '55307','30625','83849','68908','26689','96451','38193', + '46820','88885','84935','69035','83144','47537','56616', + '94983','48033','69952','25486','61547','27385','61860', + '58048','56910','16807','17871','35258','31387','35458', + '35576')) + INTERSECT + (SELECT ca_zip + FROM + (SELECT + substr(ca_zip, 1, 5) ca_zip, + count(*) cnt + FROM customer_address, customer + WHERE ca_address_sk = c_current_addr_sk AND + c_preferred_cust_flag = 'Y' + GROUP BY ca_zip + HAVING count(*) > 10) A1) + ) A2 + ) V1 +WHERE ss_store_sk = s_store_sk + AND ss_sold_date_sk = d_date_sk + AND d_qoy = 2 AND d_year = 1998 + AND (substr(s_zip, 1, 2) = substr(V1.ca_zip, 1, 2)) +GROUP BY s_store_name +ORDER BY s_store_name +LIMIT 100 diff --git a/plugin-spark/src/test/resources/tpcds/q80.sql b/plugin-spark/src/test/resources/tpcds/q80.sql new file mode 100755 index 0000000000..433db87d2a --- /dev/null +++ b/plugin-spark/src/test/resources/tpcds/q80.sql @@ -0,0 +1,94 @@ +WITH ssr AS +(SELECT + s_store_id AS store_id, + sum(ss_ext_sales_price) AS sales, + sum(coalesce(sr_return_amt, 0)) AS returns, + sum(ss_net_profit - coalesce(sr_net_loss, 0)) AS profit + FROM store_sales + LEFT OUTER JOIN store_returns ON + (ss_item_sk = sr_item_sk AND + ss_ticket_number = sr_ticket_number) + , + date_dim, store, item, promotion + WHERE ss_sold_date_sk = d_date_sk + AND d_date BETWEEN 
cast('2000-08-23' AS DATE) + AND (cast('2000-08-23' AS DATE) + INTERVAL 30 days) + AND ss_store_sk = s_store_sk + AND ss_item_sk = i_item_sk + AND i_current_price > 50 + AND ss_promo_sk = p_promo_sk + AND p_channel_tv = 'N' + GROUP BY s_store_id), + csr AS + (SELECT + cp_catalog_page_id AS catalog_page_id, + sum(cs_ext_sales_price) AS sales, + sum(coalesce(cr_return_amount, 0)) AS returns, + sum(cs_net_profit - coalesce(cr_net_loss, 0)) AS profit + FROM catalog_sales + LEFT OUTER JOIN catalog_returns ON + (cs_item_sk = cr_item_sk AND + cs_order_number = cr_order_number) + , + date_dim, catalog_page, item, promotion + WHERE cs_sold_date_sk = d_date_sk + AND d_date BETWEEN cast('2000-08-23' AS DATE) + AND (cast('2000-08-23' AS DATE) + INTERVAL 30 days) + AND cs_catalog_page_sk = cp_catalog_page_sk + AND cs_item_sk = i_item_sk + AND i_current_price > 50 + AND cs_promo_sk = p_promo_sk + AND p_channel_tv = 'N' + GROUP BY cp_catalog_page_id), + wsr AS + (SELECT + web_site_id, + sum(ws_ext_sales_price) AS sales, + sum(coalesce(wr_return_amt, 0)) AS returns, + sum(ws_net_profit - coalesce(wr_net_loss, 0)) AS profit + FROM web_sales + LEFT OUTER JOIN web_returns ON + (ws_item_sk = wr_item_sk AND ws_order_number = wr_order_number) + , + date_dim, web_site, item, promotion + WHERE ws_sold_date_sk = d_date_sk + AND d_date BETWEEN cast('2000-08-23' AS DATE) + AND (cast('2000-08-23' AS DATE) + INTERVAL 30 days) + AND ws_web_site_sk = web_site_sk + AND ws_item_sk = i_item_sk + AND i_current_price > 50 + AND ws_promo_sk = p_promo_sk + AND p_channel_tv = 'N' + GROUP BY web_site_id) +SELECT + channel, + id, + sum(sales) AS sales, + sum(returns) AS returns, + sum(profit) AS profit +FROM (SELECT + 'store channel' AS channel, + concat('store', store_id) AS id, + sales, + returns, + profit + FROM ssr + UNION ALL + SELECT + 'catalog channel' AS channel, + concat('catalog_page', catalog_page_id) AS id, + sales, + returns, + profit + FROM csr + UNION ALL + SELECT + 'web channel' AS channel, + concat('web_site', web_site_id) AS id, + sales, + returns, + profit + FROM wsr) x +GROUP BY ROLLUP (channel, id) +ORDER BY channel, id +LIMIT 100 diff --git a/plugin-spark/src/test/resources/tpcds/q81.sql b/plugin-spark/src/test/resources/tpcds/q81.sql new file mode 100755 index 0000000000..18f0ffa7e8 --- /dev/null +++ b/plugin-spark/src/test/resources/tpcds/q81.sql @@ -0,0 +1,38 @@ +WITH customer_total_return AS +(SELECT + cr_returning_customer_sk AS ctr_customer_sk, + ca_state AS ctr_state, + sum(cr_return_amt_inc_tax) AS ctr_total_return + FROM catalog_returns, date_dim, customer_address + WHERE cr_returned_date_sk = d_date_sk + AND d_year = 2000 + AND cr_returning_addr_sk = ca_address_sk + GROUP BY cr_returning_customer_sk, ca_state ) +SELECT + c_customer_id, + c_salutation, + c_first_name, + c_last_name, + ca_street_number, + ca_street_name, + ca_street_type, + ca_suite_number, + ca_city, + ca_county, + ca_state, + ca_zip, + ca_country, + ca_gmt_offset, + ca_location_type, + ctr_total_return +FROM customer_total_return ctr1, customer_address, customer +WHERE ctr1.ctr_total_return > (SELECT avg(ctr_total_return) * 1.2 +FROM customer_total_return ctr2 +WHERE ctr1.ctr_state = ctr2.ctr_state) + AND ca_address_sk = c_current_addr_sk + AND ca_state = 'GA' + AND ctr1.ctr_customer_sk = c_customer_sk +ORDER BY c_customer_id, c_salutation, c_first_name, c_last_name, ca_street_number, ca_street_name + , ca_street_type, ca_suite_number, ca_city, ca_county, ca_state, ca_zip, ca_country, ca_gmt_offset + , ca_location_type, 
ctr_total_return +LIMIT 100 diff --git a/plugin-spark/src/test/resources/tpcds/q82.sql b/plugin-spark/src/test/resources/tpcds/q82.sql new file mode 100755 index 0000000000..20942cfeb0 --- /dev/null +++ b/plugin-spark/src/test/resources/tpcds/q82.sql @@ -0,0 +1,15 @@ +SELECT + i_item_id, + i_item_desc, + i_current_price +FROM item, inventory, date_dim, store_sales +WHERE i_current_price BETWEEN 62 AND 62 + 30 + AND inv_item_sk = i_item_sk + AND d_date_sk = inv_date_sk + AND d_date BETWEEN cast('2000-05-25' AS DATE) AND (cast('2000-05-25' AS DATE) + INTERVAL 60 days) + AND i_manufact_id IN (129, 270, 821, 423) + AND inv_quantity_on_hand BETWEEN 100 AND 500 + AND ss_item_sk = i_item_sk +GROUP BY i_item_id, i_item_desc, i_current_price +ORDER BY i_item_id +LIMIT 100 diff --git a/plugin-spark/src/test/resources/tpcds/q83.sql b/plugin-spark/src/test/resources/tpcds/q83.sql new file mode 100755 index 0000000000..53c10c7ded --- /dev/null +++ b/plugin-spark/src/test/resources/tpcds/q83.sql @@ -0,0 +1,56 @@ +WITH sr_items AS +(SELECT + i_item_id item_id, + sum(sr_return_quantity) sr_item_qty + FROM store_returns, item, date_dim + WHERE sr_item_sk = i_item_sk + AND d_date IN (SELECT d_date + FROM date_dim + WHERE d_week_seq IN + (SELECT d_week_seq + FROM date_dim + WHERE d_date IN ('2000-06-30', '2000-09-27', '2000-11-17'))) + AND sr_returned_date_sk = d_date_sk + GROUP BY i_item_id), + cr_items AS + (SELECT + i_item_id item_id, + sum(cr_return_quantity) cr_item_qty + FROM catalog_returns, item, date_dim + WHERE cr_item_sk = i_item_sk + AND d_date IN (SELECT d_date + FROM date_dim + WHERE d_week_seq IN + (SELECT d_week_seq + FROM date_dim + WHERE d_date IN ('2000-06-30', '2000-09-27', '2000-11-17'))) + AND cr_returned_date_sk = d_date_sk + GROUP BY i_item_id), + wr_items AS + (SELECT + i_item_id item_id, + sum(wr_return_quantity) wr_item_qty + FROM web_returns, item, date_dim + WHERE wr_item_sk = i_item_sk AND d_date IN + (SELECT d_date + FROM date_dim + WHERE d_week_seq IN + (SELECT d_week_seq + FROM date_dim + WHERE d_date IN ('2000-06-30', '2000-09-27', '2000-11-17'))) + AND wr_returned_date_sk = d_date_sk + GROUP BY i_item_id) +SELECT + sr_items.item_id, + sr_item_qty, + sr_item_qty / (sr_item_qty + cr_item_qty + wr_item_qty) / 3.0 * 100 sr_dev, + cr_item_qty, + cr_item_qty / (sr_item_qty + cr_item_qty + wr_item_qty) / 3.0 * 100 cr_dev, + wr_item_qty, + wr_item_qty / (sr_item_qty + cr_item_qty + wr_item_qty) / 3.0 * 100 wr_dev, + (sr_item_qty + cr_item_qty + wr_item_qty) / 3.0 average +FROM sr_items, cr_items, wr_items +WHERE sr_items.item_id = cr_items.item_id + AND sr_items.item_id = wr_items.item_id +ORDER BY sr_items.item_id, sr_item_qty +LIMIT 100 diff --git a/plugin-spark/src/test/resources/tpcds/q84.sql b/plugin-spark/src/test/resources/tpcds/q84.sql new file mode 100755 index 0000000000..a1076b57ce --- /dev/null +++ b/plugin-spark/src/test/resources/tpcds/q84.sql @@ -0,0 +1,19 @@ +SELECT + c_customer_id AS customer_id, + concat(c_last_name, ', ', c_first_name) AS customername +FROM customer + , customer_address + , customer_demographics + , household_demographics + , income_band + , store_returns +WHERE ca_city = 'Edgewood' + AND c_current_addr_sk = ca_address_sk + AND ib_lower_bound >= 38128 + AND ib_upper_bound <= 38128 + 50000 + AND ib_income_band_sk = hd_income_band_sk + AND cd_demo_sk = c_current_cdemo_sk + AND hd_demo_sk = c_current_hdemo_sk + AND sr_cdemo_sk = cd_demo_sk +ORDER BY c_customer_id +LIMIT 100 diff --git a/plugin-spark/src/test/resources/tpcds/q85.sql 
b/plugin-spark/src/test/resources/tpcds/q85.sql new file mode 100755 index 0000000000..cf718b0f8a --- /dev/null +++ b/plugin-spark/src/test/resources/tpcds/q85.sql @@ -0,0 +1,82 @@ +SELECT + substr(r_reason_desc, 1, 20), + avg(ws_quantity), + avg(wr_refunded_cash), + avg(wr_fee) +FROM web_sales, web_returns, web_page, customer_demographics cd1, + customer_demographics cd2, customer_address, date_dim, reason +WHERE ws_web_page_sk = wp_web_page_sk + AND ws_item_sk = wr_item_sk + AND ws_order_number = wr_order_number + AND ws_sold_date_sk = d_date_sk AND d_year = 2000 + AND cd1.cd_demo_sk = wr_refunded_cdemo_sk + AND cd2.cd_demo_sk = wr_returning_cdemo_sk + AND ca_address_sk = wr_refunded_addr_sk + AND r_reason_sk = wr_reason_sk + AND + ( + ( + cd1.cd_marital_status = 'M' + AND + cd1.cd_marital_status = cd2.cd_marital_status + AND + cd1.cd_education_status = 'Advanced Degree' + AND + cd1.cd_education_status = cd2.cd_education_status + AND + ws_sales_price BETWEEN 100.00 AND 150.00 + ) + OR + ( + cd1.cd_marital_status = 'S' + AND + cd1.cd_marital_status = cd2.cd_marital_status + AND + cd1.cd_education_status = 'College' + AND + cd1.cd_education_status = cd2.cd_education_status + AND + ws_sales_price BETWEEN 50.00 AND 100.00 + ) + OR + ( + cd1.cd_marital_status = 'W' + AND + cd1.cd_marital_status = cd2.cd_marital_status + AND + cd1.cd_education_status = '2 yr Degree' + AND + cd1.cd_education_status = cd2.cd_education_status + AND + ws_sales_price BETWEEN 150.00 AND 200.00 + ) + ) + AND + ( + ( + ca_country = 'United States' + AND + ca_state IN ('IN', 'OH', 'NJ') + AND ws_net_profit BETWEEN 100 AND 200 + ) + OR + ( + ca_country = 'United States' + AND + ca_state IN ('WI', 'CT', 'KY') + AND ws_net_profit BETWEEN 150 AND 300 + ) + OR + ( + ca_country = 'United States' + AND + ca_state IN ('LA', 'IA', 'AR') + AND ws_net_profit BETWEEN 50 AND 250 + ) + ) +GROUP BY r_reason_desc +ORDER BY substr(r_reason_desc, 1, 20) + , avg(ws_quantity) + , avg(wr_refunded_cash) + , avg(wr_fee) +LIMIT 100 diff --git a/plugin-spark/src/test/resources/tpcds/q86.sql b/plugin-spark/src/test/resources/tpcds/q86.sql new file mode 100755 index 0000000000..789a4abf7b --- /dev/null +++ b/plugin-spark/src/test/resources/tpcds/q86.sql @@ -0,0 +1,24 @@ +SELECT + sum(ws_net_paid) AS total_sum, + i_category, + i_class, + grouping(i_category) + grouping(i_class) AS lochierarchy, + rank() + OVER ( + PARTITION BY grouping(i_category) + grouping(i_class), + CASE WHEN grouping(i_class) = 0 + THEN i_category END + ORDER BY sum(ws_net_paid) DESC) AS rank_within_parent +FROM + web_sales, date_dim d1, item +WHERE + d1.d_month_seq BETWEEN 1200 AND 1200 + 11 + AND d1.d_date_sk = ws_sold_date_sk + AND i_item_sk = ws_item_sk +GROUP BY ROLLUP (i_category, i_class) +ORDER BY + lochierarchy DESC, + CASE WHEN lochierarchy = 0 + THEN i_category END, + rank_within_parent +LIMIT 100 diff --git a/plugin-spark/src/test/resources/tpcds/q87.sql b/plugin-spark/src/test/resources/tpcds/q87.sql new file mode 100755 index 0000000000..4aaa9f39dc --- /dev/null +++ b/plugin-spark/src/test/resources/tpcds/q87.sql @@ -0,0 +1,28 @@ +SELECT count(*) +FROM ((SELECT DISTINCT + c_last_name, + c_first_name, + d_date +FROM store_sales, date_dim, customer +WHERE store_sales.ss_sold_date_sk = date_dim.d_date_sk + AND store_sales.ss_customer_sk = customer.c_customer_sk + AND d_month_seq BETWEEN 1200 AND 1200 + 11) + EXCEPT + (SELECT DISTINCT + c_last_name, + c_first_name, + d_date + FROM catalog_sales, date_dim, customer + WHERE catalog_sales.cs_sold_date_sk = 
date_dim.d_date_sk + AND catalog_sales.cs_bill_customer_sk = customer.c_customer_sk + AND d_month_seq BETWEEN 1200 AND 1200 + 11) + EXCEPT + (SELECT DISTINCT + c_last_name, + c_first_name, + d_date + FROM web_sales, date_dim, customer + WHERE web_sales.ws_sold_date_sk = date_dim.d_date_sk + AND web_sales.ws_bill_customer_sk = customer.c_customer_sk + AND d_month_seq BETWEEN 1200 AND 1200 + 11) + ) cool_cust diff --git a/plugin-spark/src/test/resources/tpcds/q88.sql b/plugin-spark/src/test/resources/tpcds/q88.sql new file mode 100755 index 0000000000..25bcd90f41 --- /dev/null +++ b/plugin-spark/src/test/resources/tpcds/q88.sql @@ -0,0 +1,122 @@ +SELECT * +FROM + (SELECT count(*) h8_30_to_9 + FROM store_sales, household_demographics, time_dim, store + WHERE ss_sold_time_sk = time_dim.t_time_sk + AND ss_hdemo_sk = household_demographics.hd_demo_sk + AND ss_store_sk = s_store_sk + AND time_dim.t_hour = 8 + AND time_dim.t_minute >= 30 + AND ( + (household_demographics.hd_dep_count = 4 AND household_demographics.hd_vehicle_count <= 4 + 2) + OR + (household_demographics.hd_dep_count = 2 AND household_demographics.hd_vehicle_count <= 2 + 2) + OR + (household_demographics.hd_dep_count = 0 AND + household_demographics.hd_vehicle_count <= 0 + 2)) + AND store.s_store_name = 'ese') s1, + (SELECT count(*) h9_to_9_30 + FROM store_sales, household_demographics, time_dim, store + WHERE ss_sold_time_sk = time_dim.t_time_sk + AND ss_hdemo_sk = household_demographics.hd_demo_sk + AND ss_store_sk = s_store_sk + AND time_dim.t_hour = 9 + AND time_dim.t_minute < 30 + AND ( + (household_demographics.hd_dep_count = 4 AND household_demographics.hd_vehicle_count <= 4 + 2) + OR + (household_demographics.hd_dep_count = 2 AND household_demographics.hd_vehicle_count <= 2 + 2) + OR + (household_demographics.hd_dep_count = 0 AND + household_demographics.hd_vehicle_count <= 0 + 2)) + AND store.s_store_name = 'ese') s2, + (SELECT count(*) h9_30_to_10 + FROM store_sales, household_demographics, time_dim, store + WHERE ss_sold_time_sk = time_dim.t_time_sk + AND ss_hdemo_sk = household_demographics.hd_demo_sk + AND ss_store_sk = s_store_sk + AND time_dim.t_hour = 9 + AND time_dim.t_minute >= 30 + AND ( + (household_demographics.hd_dep_count = 4 AND household_demographics.hd_vehicle_count <= 4 + 2) + OR + (household_demographics.hd_dep_count = 2 AND household_demographics.hd_vehicle_count <= 2 + 2) + OR + (household_demographics.hd_dep_count = 0 AND + household_demographics.hd_vehicle_count <= 0 + 2)) + AND store.s_store_name = 'ese') s3, + (SELECT count(*) h10_to_10_30 + FROM store_sales, household_demographics, time_dim, store + WHERE ss_sold_time_sk = time_dim.t_time_sk + AND ss_hdemo_sk = household_demographics.hd_demo_sk + AND ss_store_sk = s_store_sk + AND time_dim.t_hour = 10 + AND time_dim.t_minute < 30 + AND ( + (household_demographics.hd_dep_count = 4 AND household_demographics.hd_vehicle_count <= 4 + 2) + OR + (household_demographics.hd_dep_count = 2 AND household_demographics.hd_vehicle_count <= 2 + 2) + OR + (household_demographics.hd_dep_count = 0 AND + household_demographics.hd_vehicle_count <= 0 + 2)) + AND store.s_store_name = 'ese') s4, + (SELECT count(*) h10_30_to_11 + FROM store_sales, household_demographics, time_dim, store + WHERE ss_sold_time_sk = time_dim.t_time_sk + AND ss_hdemo_sk = household_demographics.hd_demo_sk + AND ss_store_sk = s_store_sk + AND time_dim.t_hour = 10 + AND time_dim.t_minute >= 30 + AND ( + (household_demographics.hd_dep_count = 4 AND household_demographics.hd_vehicle_count <= 4 + 
2) + OR + (household_demographics.hd_dep_count = 2 AND household_demographics.hd_vehicle_count <= 2 + 2) + OR + (household_demographics.hd_dep_count = 0 AND + household_demographics.hd_vehicle_count <= 0 + 2)) + AND store.s_store_name = 'ese') s5, + (SELECT count(*) h11_to_11_30 + FROM store_sales, household_demographics, time_dim, store + WHERE ss_sold_time_sk = time_dim.t_time_sk + AND ss_hdemo_sk = household_demographics.hd_demo_sk + AND ss_store_sk = s_store_sk + AND time_dim.t_hour = 11 + AND time_dim.t_minute < 30 + AND ( + (household_demographics.hd_dep_count = 4 AND household_demographics.hd_vehicle_count <= 4 + 2) + OR + (household_demographics.hd_dep_count = 2 AND household_demographics.hd_vehicle_count <= 2 + 2) + OR + (household_demographics.hd_dep_count = 0 AND + household_demographics.hd_vehicle_count <= 0 + 2)) + AND store.s_store_name = 'ese') s6, + (SELECT count(*) h11_30_to_12 + FROM store_sales, household_demographics, time_dim, store + WHERE ss_sold_time_sk = time_dim.t_time_sk + AND ss_hdemo_sk = household_demographics.hd_demo_sk + AND ss_store_sk = s_store_sk + AND time_dim.t_hour = 11 + AND time_dim.t_minute >= 30 + AND ( + (household_demographics.hd_dep_count = 4 AND household_demographics.hd_vehicle_count <= 4 + 2) + OR + (household_demographics.hd_dep_count = 2 AND household_demographics.hd_vehicle_count <= 2 + 2) + OR + (household_demographics.hd_dep_count = 0 AND + household_demographics.hd_vehicle_count <= 0 + 2)) + AND store.s_store_name = 'ese') s7, + (SELECT count(*) h12_to_12_30 + FROM store_sales, household_demographics, time_dim, store + WHERE ss_sold_time_sk = time_dim.t_time_sk + AND ss_hdemo_sk = household_demographics.hd_demo_sk + AND ss_store_sk = s_store_sk + AND time_dim.t_hour = 12 + AND time_dim.t_minute < 30 + AND ( + (household_demographics.hd_dep_count = 4 AND household_demographics.hd_vehicle_count <= 4 + 2) + OR + (household_demographics.hd_dep_count = 2 AND household_demographics.hd_vehicle_count <= 2 + 2) + OR + (household_demographics.hd_dep_count = 0 AND + household_demographics.hd_vehicle_count <= 0 + 2)) + AND store.s_store_name = 'ese') s8 diff --git a/plugin-spark/src/test/resources/tpcds/q89.sql b/plugin-spark/src/test/resources/tpcds/q89.sql new file mode 100755 index 0000000000..75408cb032 --- /dev/null +++ b/plugin-spark/src/test/resources/tpcds/q89.sql @@ -0,0 +1,30 @@ +SELECT * +FROM ( + SELECT + i_category, + i_class, + i_brand, + s_store_name, + s_company_name, + d_moy, + sum(ss_sales_price) sum_sales, + avg(sum(ss_sales_price)) + OVER + (PARTITION BY i_category, i_brand, s_store_name, s_company_name) + avg_monthly_sales + FROM item, store_sales, date_dim, store + WHERE ss_item_sk = i_item_sk AND + ss_sold_date_sk = d_date_sk AND + ss_store_sk = s_store_sk AND + d_year IN (1999) AND + ((i_category IN ('Books', 'Electronics', 'Sports') AND + i_class IN ('computers', 'stereo', 'football')) + OR (i_category IN ('Men', 'Jewelry', 'Women') AND + i_class IN ('shirts', 'birdal', 'dresses'))) + GROUP BY i_category, i_class, i_brand, + s_store_name, s_company_name, d_moy) tmp1 +WHERE CASE WHEN (avg_monthly_sales <> 0) + THEN (abs(sum_sales - avg_monthly_sales) / avg_monthly_sales) + ELSE NULL END > 0.1 +ORDER BY sum_sales - avg_monthly_sales, s_store_name +LIMIT 100 diff --git a/plugin-spark/src/test/resources/tpcds/q9.sql b/plugin-spark/src/test/resources/tpcds/q9.sql new file mode 100755 index 0000000000..de3db9d988 --- /dev/null +++ b/plugin-spark/src/test/resources/tpcds/q9.sql @@ -0,0 +1,48 @@ +SELECT + CASE WHEN (SELECT count(*) 
+ FROM store_sales + WHERE ss_quantity BETWEEN 1 AND 20) > 62316685 + THEN (SELECT avg(ss_ext_discount_amt) + FROM store_sales + WHERE ss_quantity BETWEEN 1 AND 20) + ELSE (SELECT avg(ss_net_paid) + FROM store_sales + WHERE ss_quantity BETWEEN 1 AND 20) END bucket1, + CASE WHEN (SELECT count(*) + FROM store_sales + WHERE ss_quantity BETWEEN 21 AND 40) > 19045798 + THEN (SELECT avg(ss_ext_discount_amt) + FROM store_sales + WHERE ss_quantity BETWEEN 21 AND 40) + ELSE (SELECT avg(ss_net_paid) + FROM store_sales + WHERE ss_quantity BETWEEN 21 AND 40) END bucket2, + CASE WHEN (SELECT count(*) + FROM store_sales + WHERE ss_quantity BETWEEN 41 AND 60) > 365541424 + THEN (SELECT avg(ss_ext_discount_amt) + FROM store_sales + WHERE ss_quantity BETWEEN 41 AND 60) + ELSE (SELECT avg(ss_net_paid) + FROM store_sales + WHERE ss_quantity BETWEEN 41 AND 60) END bucket3, + CASE WHEN (SELECT count(*) + FROM store_sales + WHERE ss_quantity BETWEEN 61 AND 80) > 216357808 + THEN (SELECT avg(ss_ext_discount_amt) + FROM store_sales + WHERE ss_quantity BETWEEN 61 AND 80) + ELSE (SELECT avg(ss_net_paid) + FROM store_sales + WHERE ss_quantity BETWEEN 61 AND 80) END bucket4, + CASE WHEN (SELECT count(*) + FROM store_sales + WHERE ss_quantity BETWEEN 81 AND 100) > 184483884 + THEN (SELECT avg(ss_ext_discount_amt) + FROM store_sales + WHERE ss_quantity BETWEEN 81 AND 100) + ELSE (SELECT avg(ss_net_paid) + FROM store_sales + WHERE ss_quantity BETWEEN 81 AND 100) END bucket5 +FROM reason +WHERE r_reason_sk = 1 diff --git a/plugin-spark/src/test/resources/tpcds/q90.sql b/plugin-spark/src/test/resources/tpcds/q90.sql new file mode 100755 index 0000000000..85e35bf8bf --- /dev/null +++ b/plugin-spark/src/test/resources/tpcds/q90.sql @@ -0,0 +1,19 @@ +SELECT cast(amc AS DECIMAL(15, 4)) / cast(pmc AS DECIMAL(15, 4)) am_pm_ratio +FROM (SELECT count(*) amc +FROM web_sales, household_demographics, time_dim, web_page +WHERE ws_sold_time_sk = time_dim.t_time_sk + AND ws_ship_hdemo_sk = household_demographics.hd_demo_sk + AND ws_web_page_sk = web_page.wp_web_page_sk + AND time_dim.t_hour BETWEEN 8 AND 8 + 1 + AND household_demographics.hd_dep_count = 6 + AND web_page.wp_char_count BETWEEN 5000 AND 5200) at, + (SELECT count(*) pmc + FROM web_sales, household_demographics, time_dim, web_page + WHERE ws_sold_time_sk = time_dim.t_time_sk + AND ws_ship_hdemo_sk = household_demographics.hd_demo_sk + AND ws_web_page_sk = web_page.wp_web_page_sk + AND time_dim.t_hour BETWEEN 19 AND 19 + 1 + AND household_demographics.hd_dep_count = 6 + AND web_page.wp_char_count BETWEEN 5000 AND 5200) pt +ORDER BY am_pm_ratio +LIMIT 100 diff --git a/plugin-spark/src/test/resources/tpcds/q91.sql b/plugin-spark/src/test/resources/tpcds/q91.sql new file mode 100755 index 0000000000..9ca7ce00ac --- /dev/null +++ b/plugin-spark/src/test/resources/tpcds/q91.sql @@ -0,0 +1,23 @@ +SELECT + cc_call_center_id Call_Center, + cc_name Call_Center_Name, + cc_manager Manager, + sum(cr_net_loss) Returns_Loss +FROM + call_center, catalog_returns, date_dim, customer, customer_address, + customer_demographics, household_demographics +WHERE + cr_call_center_sk = cc_call_center_sk + AND cr_returned_date_sk = d_date_sk + AND cr_returning_customer_sk = c_customer_sk + AND cd_demo_sk = c_current_cdemo_sk + AND hd_demo_sk = c_current_hdemo_sk + AND ca_address_sk = c_current_addr_sk + AND d_year = 1998 + AND d_moy = 11 + AND ((cd_marital_status = 'M' AND cd_education_status = 'Unknown') + OR (cd_marital_status = 'W' AND cd_education_status = 'Advanced Degree')) + AND 
hd_buy_potential LIKE 'Unknown%' + AND ca_gmt_offset = -7 +GROUP BY cc_call_center_id, cc_name, cc_manager, cd_marital_status, cd_education_status +ORDER BY sum(cr_net_loss) DESC diff --git a/plugin-spark/src/test/resources/tpcds/q92.sql b/plugin-spark/src/test/resources/tpcds/q92.sql new file mode 100755 index 0000000000..99129c3bd9 --- /dev/null +++ b/plugin-spark/src/test/resources/tpcds/q92.sql @@ -0,0 +1,16 @@ +SELECT sum(ws_ext_discount_amt) AS `Excess Discount Amount ` +FROM web_sales, item, date_dim +WHERE i_manufact_id = 350 + AND i_item_sk = ws_item_sk + AND d_date BETWEEN '2000-01-27' AND (cast('2000-01-27' AS DATE) + INTERVAL 90 days) + AND d_date_sk = ws_sold_date_sk + AND ws_ext_discount_amt > + ( + SELECT 1.3 * avg(ws_ext_discount_amt) + FROM web_sales, date_dim + WHERE ws_item_sk = i_item_sk + AND d_date BETWEEN '2000-01-27' AND (cast('2000-01-27' AS DATE) + INTERVAL 90 days) + AND d_date_sk = ws_sold_date_sk + ) +ORDER BY sum(ws_ext_discount_amt) +LIMIT 100 diff --git a/plugin-spark/src/test/resources/tpcds/q93.sql b/plugin-spark/src/test/resources/tpcds/q93.sql new file mode 100755 index 0000000000..222dc31c1f --- /dev/null +++ b/plugin-spark/src/test/resources/tpcds/q93.sql @@ -0,0 +1,19 @@ +SELECT + ss_customer_sk, + sum(act_sales) sumsales +FROM (SELECT + ss_item_sk, + ss_ticket_number, + ss_customer_sk, + CASE WHEN sr_return_quantity IS NOT NULL + THEN (ss_quantity - sr_return_quantity) * ss_sales_price + ELSE (ss_quantity * ss_sales_price) END act_sales +FROM store_sales + LEFT OUTER JOIN store_returns + ON (sr_item_sk = ss_item_sk AND sr_ticket_number = ss_ticket_number) + , + reason +WHERE sr_reason_sk = r_reason_sk AND r_reason_desc = 'reason 28') t +GROUP BY ss_customer_sk +ORDER BY sumsales, ss_customer_sk +LIMIT 100 diff --git a/plugin-spark/src/test/resources/tpcds/q94.sql b/plugin-spark/src/test/resources/tpcds/q94.sql new file mode 100755 index 0000000000..d6de3d75b8 --- /dev/null +++ b/plugin-spark/src/test/resources/tpcds/q94.sql @@ -0,0 +1,23 @@ +SELECT + count(DISTINCT ws_order_number) AS `order count `, + sum(ws_ext_ship_cost) AS `total shipping cost `, + sum(ws_net_profit) AS `total net profit ` +FROM + web_sales ws1, date_dim, customer_address, web_site +WHERE + d_date BETWEEN '1999-02-01' AND + (CAST('1999-02-01' AS DATE) + INTERVAL 60 days) + AND ws1.ws_ship_date_sk = d_date_sk + AND ws1.ws_ship_addr_sk = ca_address_sk + AND ca_state = 'IL' + AND ws1.ws_web_site_sk = web_site_sk + AND web_company_name = 'pri' + AND EXISTS(SELECT * + FROM web_sales ws2 + WHERE ws1.ws_order_number = ws2.ws_order_number + AND ws1.ws_warehouse_sk <> ws2.ws_warehouse_sk) + AND NOT EXISTS(SELECT * + FROM web_returns wr1 + WHERE ws1.ws_order_number = wr1.wr_order_number) +ORDER BY count(DISTINCT ws_order_number) +LIMIT 100 diff --git a/plugin-spark/src/test/resources/tpcds/q95.sql b/plugin-spark/src/test/resources/tpcds/q95.sql new file mode 100755 index 0000000000..df71f00bd6 --- /dev/null +++ b/plugin-spark/src/test/resources/tpcds/q95.sql @@ -0,0 +1,29 @@ +WITH ws_wh AS +(SELECT + ws1.ws_order_number, + ws1.ws_warehouse_sk wh1, + ws2.ws_warehouse_sk wh2 + FROM web_sales ws1, web_sales ws2 + WHERE ws1.ws_order_number = ws2.ws_order_number + AND ws1.ws_warehouse_sk <> ws2.ws_warehouse_sk) +SELECT + count(DISTINCT ws_order_number) AS `order count `, + sum(ws_ext_ship_cost) AS `total shipping cost `, + sum(ws_net_profit) AS `total net profit ` +FROM + web_sales ws1, date_dim, customer_address, web_site +WHERE + d_date BETWEEN '1999-02-01' AND + (CAST('1999-02-01' AS DATE) + 
INTERVAL 60 DAY) + AND ws1.ws_ship_date_sk = d_date_sk + AND ws1.ws_ship_addr_sk = ca_address_sk + AND ca_state = 'IL' + AND ws1.ws_web_site_sk = web_site_sk + AND web_company_name = 'pri' + AND ws1.ws_order_number IN (SELECT ws_order_number + FROM ws_wh) + AND ws1.ws_order_number IN (SELECT wr_order_number + FROM web_returns, ws_wh + WHERE wr_order_number = ws_wh.ws_order_number) +ORDER BY count(DISTINCT ws_order_number) +LIMIT 100 diff --git a/plugin-spark/src/test/resources/tpcds/q96.sql b/plugin-spark/src/test/resources/tpcds/q96.sql new file mode 100755 index 0000000000..7ab17e7bc4 --- /dev/null +++ b/plugin-spark/src/test/resources/tpcds/q96.sql @@ -0,0 +1,11 @@ +SELECT count(*) +FROM store_sales, household_demographics, time_dim, store +WHERE ss_sold_time_sk = time_dim.t_time_sk + AND ss_hdemo_sk = household_demographics.hd_demo_sk + AND ss_store_sk = s_store_sk + AND time_dim.t_hour = 20 + AND time_dim.t_minute >= 30 + AND household_demographics.hd_dep_count = 7 + AND store.s_store_name = 'ese' +ORDER BY count(*) +LIMIT 100 diff --git a/plugin-spark/src/test/resources/tpcds/q97.sql b/plugin-spark/src/test/resources/tpcds/q97.sql new file mode 100755 index 0000000000..e7e0b1a052 --- /dev/null +++ b/plugin-spark/src/test/resources/tpcds/q97.sql @@ -0,0 +1,30 @@ +WITH ssci AS ( + SELECT + ss_customer_sk customer_sk, + ss_item_sk item_sk + FROM store_sales, date_dim + WHERE ss_sold_date_sk = d_date_sk + AND d_month_seq BETWEEN 1200 AND 1200 + 11 + GROUP BY ss_customer_sk, ss_item_sk), + csci AS ( + SELECT + cs_bill_customer_sk customer_sk, + cs_item_sk item_sk + FROM catalog_sales, date_dim + WHERE cs_sold_date_sk = d_date_sk + AND d_month_seq BETWEEN 1200 AND 1200 + 11 + GROUP BY cs_bill_customer_sk, cs_item_sk) +SELECT + sum(CASE WHEN ssci.customer_sk IS NOT NULL AND csci.customer_sk IS NULL + THEN 1 + ELSE 0 END) store_only, + sum(CASE WHEN ssci.customer_sk IS NULL AND csci.customer_sk IS NOT NULL + THEN 1 + ELSE 0 END) catalog_only, + sum(CASE WHEN ssci.customer_sk IS NOT NULL AND csci.customer_sk IS NOT NULL + THEN 1 + ELSE 0 END) store_and_catalog +FROM ssci + FULL OUTER JOIN csci ON (ssci.customer_sk = csci.customer_sk + AND ssci.item_sk = csci.item_sk) +LIMIT 100 diff --git a/plugin-spark/src/test/resources/tpcds/q98.sql b/plugin-spark/src/test/resources/tpcds/q98.sql new file mode 100755 index 0000000000..bb10d4bf8d --- /dev/null +++ b/plugin-spark/src/test/resources/tpcds/q98.sql @@ -0,0 +1,21 @@ +SELECT + i_item_desc, + i_category, + i_class, + i_current_price, + sum(ss_ext_sales_price) AS itemrevenue, + sum(ss_ext_sales_price) * 100 / sum(sum(ss_ext_sales_price)) + OVER + (PARTITION BY i_class) AS revenueratio +FROM + store_sales, item, date_dim +WHERE + ss_item_sk = i_item_sk + AND i_category IN ('Sports', 'Books', 'Home') + AND ss_sold_date_sk = d_date_sk + AND d_date BETWEEN cast('1999-02-22' AS DATE) + AND (cast('1999-02-22' AS DATE) + INTERVAL 30 days) +GROUP BY + i_item_id, i_item_desc, i_category, i_class, i_current_price +ORDER BY + i_category, i_class, i_item_id, i_item_desc, revenueratio diff --git a/plugin-spark/src/test/resources/tpcds/q99.sql b/plugin-spark/src/test/resources/tpcds/q99.sql new file mode 100755 index 0000000000..f1a3d4d2b7 --- /dev/null +++ b/plugin-spark/src/test/resources/tpcds/q99.sql @@ -0,0 +1,34 @@ +SELECT + substr(w_warehouse_name, 1, 20), + sm_type, + cc_name, + sum(CASE WHEN (cs_ship_date_sk - cs_sold_date_sk <= 30) + THEN 1 + ELSE 0 END) AS `30 days `, + sum(CASE WHEN (cs_ship_date_sk - cs_sold_date_sk > 30) AND + (cs_ship_date_sk - 
cs_sold_date_sk <= 60) + THEN 1 + ELSE 0 END) AS `31 - 60 days `, + sum(CASE WHEN (cs_ship_date_sk - cs_sold_date_sk > 60) AND + (cs_ship_date_sk - cs_sold_date_sk <= 90) + THEN 1 + ELSE 0 END) AS `61 - 90 days `, + sum(CASE WHEN (cs_ship_date_sk - cs_sold_date_sk > 90) AND + (cs_ship_date_sk - cs_sold_date_sk <= 120) + THEN 1 + ELSE 0 END) AS `91 - 120 days `, + sum(CASE WHEN (cs_ship_date_sk - cs_sold_date_sk > 120) + THEN 1 + ELSE 0 END) AS `>120 days ` +FROM + catalog_sales, warehouse, ship_mode, call_center, date_dim +WHERE + d_month_seq BETWEEN 1200 AND 1200 + 11 + AND cs_ship_date_sk = d_date_sk + AND cs_warehouse_sk = w_warehouse_sk + AND cs_ship_mode_sk = sm_ship_mode_sk + AND cs_call_center_sk = cc_call_center_sk +GROUP BY + substr(w_warehouse_name, 1, 20), sm_type, cc_name +ORDER BY substr(w_warehouse_name, 1, 20), sm_type, cc_name +LIMIT 100 diff --git a/plugin-spark/src/test/scala/org/apache/ranger/services/spark/RangerAdminClientImpl.scala b/plugin-spark/src/test/scala/org/apache/ranger/services/spark/RangerAdminClientImpl.scala new file mode 100644 index 0000000000..2c96876479 --- /dev/null +++ b/plugin-spark/src/test/scala/org/apache/ranger/services/spark/RangerAdminClientImpl.scala @@ -0,0 +1,55 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.ranger.services.spark + +import java.nio.file.{Files, FileSystems} +import java.util + +import com.google.gson.GsonBuilder +import org.apache.commons.logging.{Log, LogFactory} +import org.apache.ranger.admin.client.RangerAdminClient +import org.apache.ranger.plugin.util.{GrantRevokeRequest, ServicePolicies, ServiceTags} + +class RangerAdminClientImpl extends RangerAdminClient { + private val LOG: Log = LogFactory.getLog(classOf[RangerAdminClientImpl]) + private val cacheFilename = "sparkSql_hive_jenkins.json" + private val gson = + new GsonBuilder().setDateFormat("yyyyMMdd-HH:mm:ss.SSS-Z").setPrettyPrinting().create + + override def init(serviceName: String, appId: String, configPropertyPrefix: String): Unit = {} + + override def getServicePoliciesIfUpdated( + lastKnownVersion: Long, + lastActivationTimeInMillis: Long): ServicePolicies = { + val basedir = this.getClass.getProtectionDomain.getCodeSource.getLocation.getPath + val cachePath = FileSystems.getDefault.getPath(basedir, cacheFilename) + LOG.info("Reading policies from " + cachePath) + val bytes = Files.readAllBytes(cachePath) + gson.fromJson(new String(bytes), classOf[ServicePolicies]) + } + + override def grantAccess(request: GrantRevokeRequest): Unit = {} + + override def revokeAccess(request: GrantRevokeRequest): Unit = {} + + override def getServiceTagsIfUpdated( + lastKnownVersion: Long, + lastActivationTimeInMillis: Long): ServiceTags = null + + override def getTagTypes(tagTypePattern: String): util.List[String] = null +} diff --git a/plugin-spark/src/test/scala/org/apache/ranger/services/spark/SparkRangerAuthorizerTest.scala b/plugin-spark/src/test/scala/org/apache/ranger/services/spark/SparkRangerAuthorizerTest.scala new file mode 100644 index 0000000000..879df9fdc9 --- /dev/null +++ b/plugin-spark/src/test/scala/org/apache/ranger/services/spark/SparkRangerAuthorizerTest.scala @@ -0,0 +1,574 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.ranger.services.spark + +import org.apache.spark.sql.hive.test.TestHive +import org.apache.spark.sql.RangerSparkTestUtils +import org.apache.spark.sql.catalyst.expressions.codegen.{CodeFormatter, CodeGenerator} +import org.apache.spark.sql.catalyst.plans.logical.{Project, RangerSparkMasking, RangerSparkRowFilter} +import org.apache.spark.sql.catalyst.util.resourceToString +import org.apache.spark.sql.execution.{SparkPlan, WholeStageCodegenExec} +import org.apache.spark.sql.internal.SQLConf +import org.scalatest.{BeforeAndAfterAll, FunSuite} + +class SparkRangerAuthorizerTest extends FunSuite with BeforeAndAfterAll { + + import RangerSparkTestUtils._ + private val spark = TestHive.sparkSession + private lazy val sql = spark.sql _ + + override def beforeAll(): Unit = { + super.beforeAll() + injectRules(spark) + spark.conf.set(SQLConf.CROSS_JOINS_ENABLED.key, "true") + + sql( + """ + |CREATE TABLE `catalog_page` ( + |`cp_catalog_page_sk` INT, `cp_catalog_page_id` STRING, `cp_start_date_sk` INT, + |`cp_end_date_sk` INT, `cp_department` STRING, `cp_catalog_number` INT, + |`cp_catalog_page_number` INT, `cp_description` STRING, `cp_type` STRING) + |USING parquet + """.stripMargin) + + sql( + """ + |CREATE TABLE `catalog_returns` ( + |`cr_returned_date_sk` INT, `cr_returned_time_sk` INT, `cr_item_sk` INT, + |`cr_refunded_customer_sk` INT, `cr_refunded_cdemo_sk` INT, `cr_refunded_hdemo_sk` INT, + |`cr_refunded_addr_sk` INT, `cr_returning_customer_sk` INT, `cr_returning_cdemo_sk` INT, + |`cr_returning_hdemo_sk` INT, `cr_returning_addr_sk` INT, `cr_call_center_sk` INT, + |`cr_catalog_page_sk` INT, `cr_ship_mode_sk` INT, `cr_warehouse_sk` INT, `cr_reason_sk` INT, + |`cr_order_number` INT, `cr_return_quantity` INT, `cr_return_amount` DECIMAL(7,2), + |`cr_return_tax` DECIMAL(7,2), `cr_return_amt_inc_tax` DECIMAL(7,2), `cr_fee` DECIMAL(7,2), + |`cr_return_ship_cost` DECIMAL(7,2), `cr_refunded_cash` DECIMAL(7,2), + |`cr_reversed_charge` DECIMAL(7,2), `cr_store_credit` DECIMAL(7,2), + |`cr_net_loss` DECIMAL(7,2)) + |USING parquet + """.stripMargin) + + sql( + """ + |CREATE TABLE `customer` ( + |`c_customer_sk` INT, `c_customer_id` STRING, `c_current_cdemo_sk` INT, + |`c_current_hdemo_sk` INT, `c_current_addr_sk` INT, `c_first_shipto_date_sk` INT, + |`c_first_sales_date_sk` INT, `c_salutation` STRING, `c_first_name` STRING, + |`c_last_name` STRING, `c_preferred_cust_flag` STRING, `c_birth_day` INT, + |`c_birth_month` INT, `c_birth_year` INT, `c_birth_country` STRING, `c_login` STRING, + |`c_email_address` STRING, `c_last_review_date` STRING) + |USING parquet + """.stripMargin) + + sql( + """ + |CREATE TABLE `customer_address` ( + |`ca_address_sk` INT, `ca_address_id` STRING, `ca_street_number` STRING, + |`ca_street_name` STRING, `ca_street_type` STRING, `ca_suite_number` STRING, + |`ca_city` STRING, `ca_county` STRING, `ca_state` STRING, `ca_zip` STRING, + |`ca_country` STRING, `ca_gmt_offset` DECIMAL(5,2), `ca_location_type` STRING) + |USING parquet + """.stripMargin) + + sql( + """ + |CREATE TABLE `customer_demographics` ( + |`cd_demo_sk` INT, `cd_gender` STRING, `cd_marital_status` STRING, + |`cd_education_status` STRING, `cd_purchase_estimate` INT, `cd_credit_rating` STRING, + |`cd_dep_count` INT, `cd_dep_employed_count` INT, `cd_dep_college_count` INT) + |USING parquet + """.stripMargin) + + sql( + """ + |CREATE TABLE `date_dim` ( + |`d_date_sk` INT, `d_date_id` STRING, `d_date` STRING, + |`d_month_seq` INT, `d_week_seq` INT, `d_quarter_seq` INT, `d_year` INT, `d_dow` INT, + 
|`d_moy` INT, `d_dom` INT, `d_qoy` INT, `d_fy_year` INT, `d_fy_quarter_seq` INT, + |`d_fy_week_seq` INT, `d_day_name` STRING, `d_quarter_name` STRING, `d_holiday` STRING, + |`d_weekend` STRING, `d_following_holiday` STRING, `d_first_dom` INT, `d_last_dom` INT, + |`d_same_day_ly` INT, `d_same_day_lq` INT, `d_current_day` STRING, `d_current_week` STRING, + |`d_current_month` STRING, `d_current_quarter` STRING, `d_current_year` STRING) + |USING parquet + """.stripMargin) + + sql( + """ + |CREATE TABLE `household_demographics` ( + |`hd_demo_sk` INT, `hd_income_band_sk` INT, `hd_buy_potential` STRING, `hd_dep_count` INT, + |`hd_vehicle_count` INT) + |USING parquet + """.stripMargin) + + sql( + """ + |CREATE TABLE `inventory` (`inv_date_sk` INT, `inv_item_sk` INT, `inv_warehouse_sk` INT, + |`inv_quantity_on_hand` INT) + |USING parquet + """.stripMargin) + + sql( + """ + |CREATE TABLE `item` (`i_item_sk` INT, `i_item_id` STRING, `i_rec_start_date` STRING, + |`i_rec_end_date` STRING, `i_item_desc` STRING, `i_current_price` DECIMAL(7,2), + |`i_wholesale_cost` DECIMAL(7,2), `i_brand_id` INT, `i_brand` STRING, `i_class_id` INT, + |`i_class` STRING, `i_category_id` INT, `i_category` STRING, `i_manufact_id` INT, + |`i_manufact` STRING, `i_size` STRING, `i_formulation` STRING, `i_color` STRING, + |`i_units` STRING, `i_container` STRING, `i_manager_id` INT, `i_product_name` STRING) + |USING parquet + """.stripMargin) + + sql( + """ + |CREATE TABLE `promotion` ( + |`p_promo_sk` INT, `p_promo_id` STRING, `p_start_date_sk` INT, `p_end_date_sk` INT, + |`p_item_sk` INT, `p_cost` DECIMAL(15,2), `p_response_target` INT, `p_promo_name` STRING, + |`p_channel_dmail` STRING, `p_channel_email` STRING, `p_channel_catalog` STRING, + |`p_channel_tv` STRING, `p_channel_radio` STRING, `p_channel_press` STRING, + |`p_channel_event` STRING, `p_channel_demo` STRING, `p_channel_details` STRING, + |`p_purpose` STRING, `p_discount_active` STRING) + |USING parquet + """.stripMargin) + + sql( + """ + |CREATE TABLE `store` ( + |`s_store_sk` INT, `s_store_id` STRING, `s_rec_start_date` STRING, + |`s_rec_end_date` STRING, `s_closed_date_sk` INT, `s_store_name` STRING, + |`s_number_employees` INT, `s_floor_space` INT, `s_hours` STRING, `s_manager` STRING, + |`s_market_id` INT, `s_geography_class` STRING, `s_market_desc` STRING, + |`s_market_manager` STRING, `s_division_id` INT, `s_division_name` STRING, + |`s_company_id` INT, `s_company_name` STRING, `s_street_number` STRING, + |`s_street_name` STRING, `s_street_type` STRING, `s_suite_number` STRING, `s_city` STRING, + |`s_county` STRING, `s_state` STRING, `s_zip` STRING, `s_country` STRING, + |`s_gmt_offset` DECIMAL(5,2), `s_tax_precentage` DECIMAL(5,2)) + |USING parquet + """.stripMargin) + + sql( + """ + |CREATE TABLE `store_returns` ( + |`sr_returned_date_sk` BIGINT, `sr_return_time_sk` BIGINT, `sr_item_sk` BIGINT, + |`sr_customer_sk` BIGINT, `sr_cdemo_sk` BIGINT, `sr_hdemo_sk` BIGINT, `sr_addr_sk` BIGINT, + |`sr_store_sk` BIGINT, `sr_reason_sk` BIGINT, `sr_ticket_number` BIGINT, + |`sr_return_quantity` BIGINT, `sr_return_amt` DECIMAL(7,2), `sr_return_tax` DECIMAL(7,2), + |`sr_return_amt_inc_tax` DECIMAL(7,2), `sr_fee` DECIMAL(7,2), + |`sr_return_ship_cost` DECIMAL(7,2), `sr_refunded_cash` DECIMAL(7,2), + |`sr_reversed_charge` DECIMAL(7,2), `sr_store_credit` DECIMAL(7,2), + |`sr_net_loss` DECIMAL(7,2)) + |USING parquet + """.stripMargin) + + sql( + """ + |CREATE TABLE `catalog_sales` ( + |`cs_sold_date_sk` INT, `cs_sold_time_sk` INT, `cs_ship_date_sk` INT, + 
|`cs_bill_customer_sk` INT, `cs_bill_cdemo_sk` INT, `cs_bill_hdemo_sk` INT, + |`cs_bill_addr_sk` INT, `cs_ship_customer_sk` INT, `cs_ship_cdemo_sk` INT, + |`cs_ship_hdemo_sk` INT, `cs_ship_addr_sk` INT, `cs_call_center_sk` INT, + |`cs_catalog_page_sk` INT, `cs_ship_mode_sk` INT, `cs_warehouse_sk` INT, + |`cs_item_sk` INT, `cs_promo_sk` INT, `cs_order_number` INT, `cs_quantity` INT, + |`cs_wholesale_cost` DECIMAL(7,2), `cs_list_price` DECIMAL(7,2), + |`cs_sales_price` DECIMAL(7,2), `cs_ext_discount_amt` DECIMAL(7,2), + |`cs_ext_sales_price` DECIMAL(7,2), `cs_ext_wholesale_cost` DECIMAL(7,2), + |`cs_ext_list_price` DECIMAL(7,2), `cs_ext_tax` DECIMAL(7,2), `cs_coupon_amt` DECIMAL(7,2), + |`cs_ext_ship_cost` DECIMAL(7,2), `cs_net_paid` DECIMAL(7,2), + |`cs_net_paid_inc_tax` DECIMAL(7,2), `cs_net_paid_inc_ship` DECIMAL(7,2), + |`cs_net_paid_inc_ship_tax` DECIMAL(7,2), `cs_net_profit` DECIMAL(7,2)) + |USING parquet + """.stripMargin) + + sql( + """ + |CREATE TABLE `web_sales` ( + |`ws_sold_date_sk` INT, `ws_sold_time_sk` INT, `ws_ship_date_sk` INT, `ws_item_sk` INT, + |`ws_bill_customer_sk` INT, `ws_bill_cdemo_sk` INT, `ws_bill_hdemo_sk` INT, + |`ws_bill_addr_sk` INT, `ws_ship_customer_sk` INT, `ws_ship_cdemo_sk` INT, + |`ws_ship_hdemo_sk` INT, `ws_ship_addr_sk` INT, `ws_web_page_sk` INT, `ws_web_site_sk` INT, + |`ws_ship_mode_sk` INT, `ws_warehouse_sk` INT, `ws_promo_sk` INT, `ws_order_number` INT, + |`ws_quantity` INT, `ws_wholesale_cost` DECIMAL(7,2), `ws_list_price` DECIMAL(7,2), + |`ws_sales_price` DECIMAL(7,2), `ws_ext_discount_amt` DECIMAL(7,2), + |`ws_ext_sales_price` DECIMAL(7,2), `ws_ext_wholesale_cost` DECIMAL(7,2), + |`ws_ext_list_price` DECIMAL(7,2), `ws_ext_tax` DECIMAL(7,2), + |`ws_coupon_amt` DECIMAL(7,2), `ws_ext_ship_cost` DECIMAL(7,2), `ws_net_paid` DECIMAL(7,2), + |`ws_net_paid_inc_tax` DECIMAL(7,2), `ws_net_paid_inc_ship` DECIMAL(7,2), + |`ws_net_paid_inc_ship_tax` DECIMAL(7,2), `ws_net_profit` DECIMAL(7,2)) + |USING parquet + """.stripMargin) + + sql( + """ + |CREATE TABLE `store_sales` ( + |`ss_sold_date_sk` INT, `ss_sold_time_sk` INT, `ss_item_sk` INT, `ss_customer_sk` INT, + |`ss_cdemo_sk` INT, `ss_hdemo_sk` INT, `ss_addr_sk` INT, `ss_store_sk` INT, + |`ss_promo_sk` INT, `ss_ticket_number` INT, `ss_quantity` INT, + |`ss_wholesale_cost` DECIMAL(7,2), `ss_list_price` DECIMAL(7,2), + |`ss_sales_price` DECIMAL(7,2), `ss_ext_discount_amt` DECIMAL(7,2), + |`ss_ext_sales_price` DECIMAL(7,2), `ss_ext_wholesale_cost` DECIMAL(7,2), + |`ss_ext_list_price` DECIMAL(7,2), `ss_ext_tax` DECIMAL(7,2), + |`ss_coupon_amt` DECIMAL(7,2), `ss_net_paid` DECIMAL(7,2), + |`ss_net_paid_inc_tax` DECIMAL(7,2), `ss_net_profit` DECIMAL(7,2)) + |USING parquet + """.stripMargin) + + sql( + """ + |CREATE TABLE `web_returns` ( + |`wr_returned_date_sk` BIGINT, `wr_returned_time_sk` BIGINT, `wr_item_sk` BIGINT, + |`wr_refunded_customer_sk` BIGINT, `wr_refunded_cdemo_sk` BIGINT, + |`wr_refunded_hdemo_sk` BIGINT, `wr_refunded_addr_sk` BIGINT, + |`wr_returning_customer_sk` BIGINT, `wr_returning_cdemo_sk` BIGINT, + |`wr_returning_hdemo_sk` BIGINT, `wr_returning_addr_sk` BIGINT, `wr_web_page_sk` BIGINT, + |`wr_reason_sk` BIGINT, `wr_order_number` BIGINT, `wr_return_quantity` BIGINT, + |`wr_return_amt` DECIMAL(7,2), `wr_return_tax` DECIMAL(7,2), + |`wr_return_amt_inc_tax` DECIMAL(7,2), `wr_fee` DECIMAL(7,2), + |`wr_return_ship_cost` DECIMAL(7,2), `wr_refunded_cash` DECIMAL(7,2), + |`wr_reversed_charge` DECIMAL(7,2), `wr_account_credit` DECIMAL(7,2), + |`wr_net_loss` DECIMAL(7,2)) + |USING parquet + 
""".stripMargin) + + sql( + """ + |CREATE TABLE `web_site` ( + |`web_site_sk` INT, `web_site_id` STRING, `web_rec_start_date` DATE, + |`web_rec_end_date` DATE, `web_name` STRING, `web_open_date_sk` INT, + |`web_close_date_sk` INT, `web_class` STRING, `web_manager` STRING, `web_mkt_id` INT, + |`web_mkt_class` STRING, `web_mkt_desc` STRING, `web_market_manager` STRING, + |`web_company_id` INT, `web_company_name` STRING, `web_street_number` STRING, + |`web_street_name` STRING, `web_street_type` STRING, `web_suite_number` STRING, + |`web_city` STRING, `web_county` STRING, `web_state` STRING, `web_zip` STRING, + |`web_country` STRING, `web_gmt_offset` STRING, `web_tax_percentage` DECIMAL(5,2)) + |USING parquet + """.stripMargin) + + sql( + """ + |CREATE TABLE `reason` ( + |`r_reason_sk` INT, `r_reason_id` STRING, `r_reason_desc` STRING) + |USING parquet + """.stripMargin) + + sql( + """ + |CREATE TABLE `call_center` ( + |`cc_call_center_sk` INT, `cc_call_center_id` STRING, `cc_rec_start_date` DATE, + |`cc_rec_end_date` DATE, `cc_closed_date_sk` INT, `cc_open_date_sk` INT, `cc_name` STRING, + |`cc_class` STRING, `cc_employees` INT, `cc_sq_ft` INT, `cc_hours` STRING, + |`cc_manager` STRING, `cc_mkt_id` INT, `cc_mkt_class` STRING, `cc_mkt_desc` STRING, + |`cc_market_manager` STRING, `cc_division` INT, `cc_division_name` STRING, `cc_company` INT, + |`cc_company_name` STRING, `cc_street_number` STRING, `cc_street_name` STRING, + |`cc_street_type` STRING, `cc_suite_number` STRING, `cc_city` STRING, `cc_county` STRING, + |`cc_state` STRING, `cc_zip` STRING, `cc_country` STRING, `cc_gmt_offset` DECIMAL(5,2), + |`cc_tax_percentage` DECIMAL(5,2)) + |USING parquet + """.stripMargin) + + sql( + """ + |CREATE TABLE `warehouse` ( + |`w_warehouse_sk` INT, `w_warehouse_id` STRING, `w_warehouse_name` STRING, + |`w_warehouse_sq_ft` INT, `w_street_number` STRING, `w_street_name` STRING, + |`w_street_type` STRING, `w_suite_number` STRING, `w_city` STRING, `w_county` STRING, + |`w_state` STRING, `w_zip` STRING, `w_country` STRING, `w_gmt_offset` DECIMAL(5,2)) + |USING parquet + """.stripMargin) + + sql( + """ + |CREATE TABLE `ship_mode` ( + |`sm_ship_mode_sk` INT, `sm_ship_mode_id` STRING, `sm_type` STRING, `sm_code` STRING, + |`sm_carrier` STRING, `sm_contract` STRING) + |USING parquet + """.stripMargin) + + sql( + """ + |CREATE TABLE `income_band` ( + |`ib_income_band_sk` INT, `ib_lower_bound` INT, `ib_upper_bound` INT) + |USING parquet + """.stripMargin) + + sql( + """ + |CREATE TABLE `time_dim` ( + |`t_time_sk` INT, `t_time_id` STRING, `t_time` INT, `t_hour` INT, `t_minute` INT, + |`t_second` INT, `t_am_pm` STRING, `t_shift` STRING, `t_sub_shift` STRING, + |`t_meal_time` STRING) + |USING parquet + """.stripMargin) + + sql( + """ + |CREATE TABLE `web_page` (`wp_web_page_sk` INT, `wp_web_page_id` STRING, + |`wp_rec_start_date` DATE, `wp_rec_end_date` DATE, `wp_creation_date_sk` INT, + |`wp_access_date_sk` INT, `wp_autogen_flag` STRING, `wp_customer_sk` INT, + |`wp_url` STRING, `wp_type` STRING, `wp_char_count` INT, `wp_link_count` INT, + |`wp_image_count` INT, `wp_max_ad_count` INT) + |USING parquet + """.stripMargin) + + sql( + """ + |CREATE TABLE default.rangertbl1 AS SELECT * FROM default.src + """.stripMargin) + + sql( + """ + |CREATE TABLE default.rangertbl2 AS SELECT * FROM default.src + """.stripMargin) + + sql( + """ + |CREATE TABLE default.rangertbl3 AS SELECT * FROM default.src + """.stripMargin) + + sql( + """ + |CREATE TABLE default.rangertbl4 AS SELECT * FROM default.src + """.stripMargin) + + sql( + 
""" + |CREATE TABLE default.rangertbl5 AS SELECT * FROM default.src + """.stripMargin) + + sql( + """ + |CREATE TABLE default.rangertbl6 AS SELECT * FROM default.src + """.stripMargin) + } + + test("simple query") { + val statement = "select * from default.src" + withUser("bob") { + val df = sql(statement) + assert(df.queryExecution.optimizedPlan.find(_.isInstanceOf[RangerSparkMasking]).nonEmpty) + assert(df.queryExecution.optimizedPlan.find(_.isInstanceOf[RangerSparkRowFilter]).nonEmpty) + assert(df.queryExecution.optimizedPlan.isInstanceOf[Project]) + val project = df.queryExecution.optimizedPlan.asInstanceOf[Project] + val masker = project.projectList(1) + assert(masker.name === "value") + assert(masker.children.exists(_.sql.contains("mask_show_last_n"))) + val row = df.take(1)(0) + assert(row.getInt(0) < 20, "keys above 20 should be filtered automatically") + assert(row.getString(1).startsWith("x"), "values should be masked") + assert(df.count() === 20, "keys above 20 should be filtered automatically") + } + withUser("alice") { + val df = sql(statement) + assert(df.count() === 500) + } + } + + test("projection with ranger filter key") { + val statement = "select key from default.src" + withUser("bob") { + val df = sql(statement) + val row = df.take(1)(0) + assert(row.getInt(0) < 20) + } + withUser("alice") { + val df = sql(statement) + assert(df.count() === 500) + } + } + + test("projection without ranger filter key") { + val statement = "select value from default.src" + withUser("bob") { + val df = sql(statement) + val row = df.take(1)(0) + assert(row.getString(0).split("_")(1).toInt < 20) + } + withUser("alice") { + val df = sql(statement) + assert(df.count() === 500) + } + } + + test("filter with with ranger filter key") { + val statement = "select key from default.src where key = 0" + val statement2 = "select key from default.src where key >= 20" + withUser("bob") { + val df = sql(statement) + val row = df.take(1)(0) + assert(row.getInt(0) === 0) + val df2 = sql(statement2) + assert(df2.count() === 0, "all keys should be filtered") + } + withUser("alice") { + val df = sql(statement) + assert(df.count() === 3) + val df2 = sql(statement2) + assert(df2.count() === 480) + } + } + + test("alias") { + val statement = "select key as k1, value v1 from default.src" + withUser("bob") { + val df = sql(statement) + val row = df.take(1)(0) + assert(row.getInt(0) < 20, "keys above 20 should be filtered automatically") + assert(row.getString(1).startsWith("x"), "values should be masked") + assert(df.count() === 20, "keys above 20 should be filtered automatically") + } + withUser("alice") { + val df = sql(statement) + assert(df.count() === 500) + } + } + + test("agg") { + val statement = "select sum(key) as k1, value v1 from default.src group by v1" + withUser("bob") { + val df = sql(statement) + println(df.queryExecution.optimizedPlan) + val row = df.take(1)(0) + assert(row.getString(1).startsWith("x"), "values should be masked") + assert(row.getString(1).split("_")(1).toInt < 20) + } + withUser("alice") { + val df = sql(statement) + val row = df.take(1)(0) + assert(row.getString(1).startsWith("val"), "values should not be masked") + } + } + + test("with equal expression") { + val statement = "select * from default.rangertbl1" + withUser("bob") { + val df = sql(statement) + println(df.queryExecution.optimizedPlan) + val row = df.take(1)(0) + assert(row.getInt(0) === 0, "rangertbl1 has an internal expression key=0") + assert(row.getString(1).startsWith("x"), "values should be masked") + } + } + + 
test("with in set") { + val statement = "select * from default.rangertbl2" + withUser("bob") { + val df = sql(statement) + println(df.queryExecution.optimizedPlan) + val row = df.take(1)(0) + assert(row.getInt(0) === 0, "rangertbl2 has an internal expression key in (0, 1, 2)") + assert(row.getString(1).startsWith("val_x"), "values should show first 4 characters") + } + } + + test("with in subquery") { + val statement = "select * from default.rangertbl3" + withUser("bob") { + val df = sql(statement) + println(df.queryExecution.optimizedPlan) + val rows = df.collect() + assert(rows.forall(_.getInt(0) < 100), "rangertbl3 has an internal expression key in (query)") + assert(rows.forall(_.getString(1).length > 10), "values should be hashed") + } + } + + test("with in subquery self joined") { + val statement = "select * from default.rangertbl4" + withUser("bob") { + val df = sql(statement) + println(df.queryExecution.optimizedPlan) + val rows = df.collect() + assert(rows.length === 500) + assert(rows.forall(_.getString(1) === null), "values should be hashed") + } + } + + test("with udf") { + val statement = "select * from default.rangertbl5" + withUser("bob") { + val df = sql(statement) + println(df.queryExecution.optimizedPlan) + val rows = df.collect() + assert(rows.length === 0) + } + } + + test("with multiple expressions") { + val statement = "select * from default.rangertbl6" + withUser("bob") { + val df = sql(statement) + println(df.queryExecution.optimizedPlan) + val rows = df.collect() + assert(rows.forall { r => val x = r.getInt(0); x > 1 && x < 10 || x == 500 }) + } + } + + private val tpcdsQueries = Seq( + "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", + "q12", "q13", "q14a", "q14b", "q15", "q16", "q17", "q18", "q19", "q20", + "q21", "q22", "q23a", "q23b", "q24a", "q24b", "q25", "q26", "q27", "q28", "q29", "q30", + "q31", "q32", "q33", "q34", "q35", "q36", "q37", "q38", "q39a", "q39b", "q40", + "q41", "q42", "q43", "q44", "q45", "q46", "q47", "q48", "q49", "q50", + "q51", "q52", "q53", "q54", "q55", "q56", "q57", "q58", "q59", "q60", + "q61", "q62", "q63", "q64", "q65", "q66", "q67", "q68", "q69", "q70", + "q71", "q72", "q73", "q74", "q75", "q76", "q77", "q78", "q79", "q80", + "q81", "q82", "q83", "q84", "q85", "q86", "q87", "q88", "q89", "q90", + "q91", "q92", "q93", "q94", "q95", "q96", "q97", "q98", "q99") + + tpcdsQueries.foreach { name => + val queryString = resourceToString(s"tpcds/$name.sql", + classLoader = Thread.currentThread().getContextClassLoader) + test(name) { + withUser("bob") { + val queryExecution = sql(queryString).queryExecution + val optimized = queryExecution.optimizedPlan + // println(optimized) + assert(optimized.find(_.isInstanceOf[RangerSparkRowFilter]).nonEmpty) + assert(optimized.find(_.isInstanceOf[RangerSparkMasking]).nonEmpty) + val plan = queryExecution.executedPlan + checkGeneratedCode(plan) + } + } + } + + /** + * Check whether the Modified queries can be properly compiled + */ + def checkGeneratedCode(plan: SparkPlan): Unit = { + val codegenSubtrees = new collection.mutable.HashSet[WholeStageCodegenExec]() + plan foreach { + case s: WholeStageCodegenExec => + codegenSubtrees += s + case s => s + } + codegenSubtrees.toSeq.foreach { subtree => + val code = subtree.doCodeGen()._2 + try { + // Just check the generated code can be properly compiled + CodeGenerator.compile(code) + } catch { + case e: Exception => + val msg = + s""" + |failed to compile: + |Subtree: + |$subtree + |Generated code: + |${CodeFormatter.format(code)} + 
""".stripMargin + throw new Exception(msg, e) + } + } + } +} diff --git a/plugin-spark/src/test/scala/org/apache/spark/sql/RangerSparkTestUtils.scala b/plugin-spark/src/test/scala/org/apache/spark/sql/RangerSparkTestUtils.scala new file mode 100644 index 0000000000..a723b4dce0 --- /dev/null +++ b/plugin-spark/src/test/scala/org/apache/spark/sql/RangerSparkTestUtils.scala @@ -0,0 +1,40 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql + +import java.security.PrivilegedExceptionAction + +import org.apache.hadoop.security.UserGroupInformation +import org.apache.spark.sql.catalyst.optimizer.{RangerSparkMaskingExtension, RangerSparkRowFilterExtension} +import org.apache.spark.sql.execution.RangerSparkPlanOmitStrategy + +object RangerSparkTestUtils { + + def injectRules(spark: SparkSession): Unit = { + spark.extensions.injectOptimizerRule(RangerSparkRowFilterExtension) + spark.extensions.injectOptimizerRule(RangerSparkMaskingExtension) + spark.extensions.injectPlannerStrategy(RangerSparkPlanOmitStrategy) + } + + def withUser[T](user: String)(f: => T): T = { + val ugi = UserGroupInformation.createRemoteUser(user) + ugi.doAs(new PrivilegedExceptionAction[T] { + override def run(): T = f + }) + } +} diff --git a/plugin-spark/src/test/scala/org/apache/spark/sql/catalyst/optimizer/RangerSparkAuthorizerExtensionTest.scala b/plugin-spark/src/test/scala/org/apache/spark/sql/catalyst/optimizer/RangerSparkAuthorizerExtensionTest.scala new file mode 100644 index 0000000000..a650a10835 --- /dev/null +++ b/plugin-spark/src/test/scala/org/apache/spark/sql/catalyst/optimizer/RangerSparkAuthorizerExtensionTest.scala @@ -0,0 +1,100 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.catalyst.optimizer + +import org.apache.ranger.authorization.spark.authorizer.SparkAccessControlException +import org.apache.spark.sql.hive.test.TestHive +import org.scalatest.FunSuite +import org.apache.spark.sql.RangerSparkTestUtils._ +import org.apache.spark.sql.execution.{RangerShowDatabasesCommand, RangerShowTablesCommand} + +class RangerSparkAuthorizerExtensionTest extends FunSuite { + private val spark = TestHive.sparkSession + val extension = RangerSparkAuthorizerExtension(spark) + + test("convert show tables command") { + val df = spark.sql("show tables") + val plan = df.queryExecution.optimizedPlan + val newPlan = extension.apply(plan) + assert(newPlan.isInstanceOf[RangerShowTablesCommand]) + assert(extension.apply(newPlan) === newPlan) + } + + test("convert show databases command") { + val df = spark.sql("show databases") + val plan = df.queryExecution.optimizedPlan + val newPlan = extension.apply(plan) + assert(newPlan.isInstanceOf[RangerShowDatabasesCommand]) + assert(extension.apply(newPlan) === newPlan) + } + + test("simple select") { + val df = spark.sql("select * from src") + val plan = df.queryExecution.optimizedPlan + withUser("bob") { + assert(extension.apply(plan) === plan, "bob has all privileges on table src") + } + withUser("alice") { + val e = intercept[SparkAccessControlException](extension.apply(plan)) + assert(e.getMessage === "Permission denied: user [alice] does not have [SELECT] privilege" + + " on [default/src/key,value]", "alice is not allowed to access table src") + } + withUser("kent") { + val e = intercept[SparkAccessControlException](extension.apply(plan)) + assert(e.getMessage === "Permission denied: user [kent] does not have [SELECT] privilege" + + " on [default/src/key,value]", "kent can only access table src.key") + } + } + + test("projection select") { + val df1 = spark.sql("select key from src") + val df2 = spark.sql("select value from src") + + val plan1 = df1.queryExecution.optimizedPlan + val plan2 = df2.queryExecution.optimizedPlan + + withUser("bob") { + assert(extension.apply(plan1) === plan1, "bob has all privileges on table src") + assert(extension.apply(plan2) === plan2, "bob has all privileges on table src") + } + withUser("alice") { + val e = intercept[SparkAccessControlException](extension.apply(plan1)) + assert(e.getMessage === "Permission denied: user [alice] does not have [SELECT] privilege" + + " on [default/src/key]", "alice is not allowed to access table src") + } + withUser("kent") { + assert(extension.apply(plan1) === plan1, "kent can only access table src.key") + val e = intercept[SparkAccessControlException](extension.apply(plan2)) + assert(e.getMessage === "Permission denied: user [kent] does not have [SELECT] privilege" + + " on [default/src/value]", "kent can only access table src.key") + } + } + + test("alter database set properties") { + val df = spark.sql("ALTER DATABASE default SET DBPROPERTIES (hero='i am iron man')") + val plan = df.queryExecution.optimizedPlan + withUser("bob") { + assert(extension.apply(plan) === plan) + } + withUser("alice") { + val e = intercept[SparkAccessControlException](extension.apply(plan)) + assert(e.getMessage === "Permission denied: user [alice] does not have [ALTER] privilege" + + " on [default]", "alice is not allowed to set properties on default") + } + } +} diff --git a/plugin-spark/src/test/scala/org/apache/spark/sql/catalyst/optimizer/RangerSparkMaskingExtensionTest.scala
b/plugin-spark/src/test/scala/org/apache/spark/sql/catalyst/optimizer/RangerSparkMaskingExtensionTest.scala new file mode 100644 index 0000000000..ce4c574183 --- /dev/null +++ b/plugin-spark/src/test/scala/org/apache/spark/sql/catalyst/optimizer/RangerSparkMaskingExtensionTest.scala @@ -0,0 +1,52 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.catalyst.optimizer + +import org.apache.spark.sql.hive.test.TestHive +import org.apache.spark.sql.RangerSparkTestUtils._ +import org.apache.spark.sql.catalyst.expressions.Alias +import org.apache.spark.sql.catalyst.plans.logical.{Project, RangerSparkMasking} +import org.scalatest.FunSuite + +class RangerSparkMaskingExtensionTest extends FunSuite { + + private val spark = TestHive.sparkSession + + test("data masking for bob show last 4") { + val extension = RangerSparkMaskingExtension(spark) + val plan = spark.sql("select * from src").queryExecution.optimizedPlan + println(plan) + withUser("bob") { + val newPlan = extension.apply(plan) + assert(newPlan.isInstanceOf[Project]) + val project = newPlan.asInstanceOf[Project] + val key = project.projectList.head + assert(key.name === "key", "no effect on the non-masked attribute") + val value = project.projectList.tail + assert(value.head.name === "value", "attribute name should be unchanged") + assert(value.head.asInstanceOf[Alias].child.sql === + "mask_show_last_n(`value`, 4, 'x', 'x', 'x', -1, '1')") + } + + withUser("alice") { + val newPlan = extension.apply(plan) + assert(newPlan === RangerSparkMasking(plan)) + } + } + +} diff --git a/plugin-spark/src/test/scala/org/apache/spark/sql/catalyst/optimizer/RangerSparkRowFilterExtensionTest.scala b/plugin-spark/src/test/scala/org/apache/spark/sql/catalyst/optimizer/RangerSparkRowFilterExtensionTest.scala new file mode 100644 index 0000000000..176948f5a5 --- /dev/null +++ b/plugin-spark/src/test/scala/org/apache/spark/sql/catalyst/optimizer/RangerSparkRowFilterExtensionTest.scala @@ -0,0 +1,49 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.catalyst.optimizer + +import org.apache.spark.sql.hive.test.TestHive +import org.scalatest.FunSuite +import org.apache.spark.sql.RangerSparkTestUtils._ +import org.apache.spark.sql.catalyst.plans.logical.{Filter, RangerSparkRowFilter} + +class RangerSparkRowFilterExtensionTest extends FunSuite { + + private val spark = TestHive.sparkSession + + test("ranger spark row filter extension") { + val extension = RangerSparkRowFilterExtension(spark) + val plan = spark.sql("select * from src").queryExecution.optimizedPlan + println(plan) + withUser("bob") { + val newPlan = extension.apply(plan) + assert(newPlan.isInstanceOf[RangerSparkRowFilter]) + val filters = newPlan.collect { case f: Filter => f } + assert(filters.nonEmpty, "ranger row level filters should be applied automatically") + println(newPlan) + } + withUser("alice") { + val newPlan = extension.apply(plan) + assert(newPlan.isInstanceOf[RangerSparkRowFilter]) + val filters = newPlan.collect { case f: Filter => f } + assert(filters.isEmpty, "alice does not have implicit filters") + println(newPlan) + } + } + +} diff --git a/plugin-spark/src/test/scala/org/apache/spark/sql/execution/RangerSparkPlanOmitStrategyTest.scala b/plugin-spark/src/test/scala/org/apache/spark/sql/execution/RangerSparkPlanOmitStrategyTest.scala new file mode 100644 index 0000000000..f5c11bd6f6 --- /dev/null +++ b/plugin-spark/src/test/scala/org/apache/spark/sql/execution/RangerSparkPlanOmitStrategyTest.scala @@ -0,0 +1,42 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.execution + +import org.apache.spark.sql.catalyst.plans.logical.{RangerSparkMasking, RangerSparkRowFilter} +import org.apache.spark.sql.hive.test.TestHive +import org.scalatest.FunSuite + +class RangerSparkPlanOmitStrategyTest extends FunSuite { + + private val spark = TestHive.sparkSession + + test("ranger spark plan omit strategy") { + val strategy = RangerSparkPlanOmitStrategy(spark) + val df = spark.range(0, 5) + val plan1 = df.queryExecution.optimizedPlan + assert(strategy.apply(plan1) === Nil) + val plan2 = RangerSparkRowFilter(plan1) + assert(strategy.apply(plan2) === PlanLater(plan1) :: Nil) + val plan3 = RangerSparkMasking(plan1) + assert(strategy.apply(plan3) === PlanLater(plan1) :: Nil) + val plan4 = RangerSparkMasking(plan2) + assert(strategy.apply(plan4) === PlanLater(plan2) :: Nil) + val plan5 = RangerSparkRowFilter(plan3) + assert(strategy.apply(plan5) === PlanLater(plan3) :: Nil) + } +} diff --git a/plugin-sqoop/pom.xml b/plugin-sqoop/pom.xml index af00e97782..06ff981652 100644 --- a/plugin-sqoop/pom.xml +++ b/plugin-sqoop/pom.xml @@ -73,4 +73,9 @@ ${httpcomponents.httpcore.version} + + + target/classes + target/test-classes + diff --git a/plugin-yarn/pom.xml b/plugin-yarn/pom.xml index f2735699af..445762b0e8 100644 --- a/plugin-yarn/pom.xml +++ b/plugin-yarn/pom.xml @@ -62,4 +62,9 @@ ${httpcomponents.httpcore.version} + + + target/classes + target/test-classes + diff --git a/pom.xml b/pom.xml index 2a2540a0d5..3dcee01484 100644 --- a/pom.xml +++ b/pom.xml @@ -224,6 +224,7 @@ ranger-kylin-plugin-shim plugin-elasticsearch ranger-elasticsearch-plugin-shim + plugin-spark @@ -251,6 +252,7 @@ src/main/assembly/plugin-sqoop.xml src/main/assembly/plugin-kylin.xml src/main/assembly/plugin-elasticsearch.xml + src/main/assembly/plugin-spark.xml diff --git a/ranger-atlas-plugin-shim/pom.xml b/ranger-atlas-plugin-shim/pom.xml index 86d6a264b1..380cc3f595 100644 --- a/ranger-atlas-plugin-shim/pom.xml +++ b/ranger-atlas-plugin-shim/pom.xml @@ -90,4 +90,9 @@ ${slf4j-api.version} + + + target/classes + target/test-classes + diff --git a/ranger-elasticsearch-plugin-shim/pom.xml b/ranger-elasticsearch-plugin-shim/pom.xml index 59a653b1cc..2bcc57047e 100644 --- a/ranger-elasticsearch-plugin-shim/pom.xml +++ b/ranger-elasticsearch-plugin-shim/pom.xml @@ -63,4 +63,9 @@ runtime + + + target/classes + target/test-classes + \ No newline at end of file diff --git a/ranger-examples/pom.xml b/ranger-examples/pom.xml index fa4cb2024c..fc0156e3e8 100644 --- a/ranger-examples/pom.xml +++ b/ranger-examples/pom.xml @@ -30,6 +30,8 @@ plugin-sampleapp + target/classes + target/test-classes diff --git a/ranger-examples/sampleapp/pom.xml b/ranger-examples/sampleapp/pom.xml index add9546115..ce4e404dfe 100644 --- a/ranger-examples/sampleapp/pom.xml +++ b/ranger-examples/sampleapp/pom.xml @@ -38,4 +38,9 @@ ${log4j.version} + + + target/classes + target/test-classes + diff --git a/ranger-hbase-plugin-shim/pom.xml b/ranger-hbase-plugin-shim/pom.xml index 9ed0aebe8e..150a870d82 100644 --- a/ranger-hbase-plugin-shim/pom.xml +++ b/ranger-hbase-plugin-shim/pom.xml @@ -56,4 +56,9 @@ gson + + + target/classes + target/test-classes + diff --git a/ranger-hdfs-plugin-shim/pom.xml b/ranger-hdfs-plugin-shim/pom.xml index f45621c5b8..75bd83f279 100644 --- a/ranger-hdfs-plugin-shim/pom.xml +++ b/ranger-hdfs-plugin-shim/pom.xml @@ -62,4 +62,9 @@ ${project.version} + + + target/classes + target/test-classes + diff --git a/ranger-hive-plugin-shim/pom.xml b/ranger-hive-plugin-shim/pom.xml 
index 26258d7bf2..9368ba7664 100644 --- a/ranger-hive-plugin-shim/pom.xml +++ b/ranger-hive-plugin-shim/pom.xml @@ -83,4 +83,9 @@ ${project.version} + + + target/classes + target/test-classes + diff --git a/ranger-kafka-plugin-shim/pom.xml b/ranger-kafka-plugin-shim/pom.xml index 5a7353f02d..1e2f3ba101 100644 --- a/ranger-kafka-plugin-shim/pom.xml +++ b/ranger-kafka-plugin-shim/pom.xml @@ -58,4 +58,9 @@ ${kafka.version} + + + target/classes + target/test-classes + diff --git a/ranger-kms-plugin-shim/pom.xml b/ranger-kms-plugin-shim/pom.xml index 0b505f6afa..d5e64ccbf6 100644 --- a/ranger-kms-plugin-shim/pom.xml +++ b/ranger-kms-plugin-shim/pom.xml @@ -57,4 +57,9 @@ ${project.version} + + + target/classes + target/test-classes + diff --git a/ranger-knox-plugin-shim/pom.xml b/ranger-knox-plugin-shim/pom.xml index db342b71df..03289916f7 100644 --- a/ranger-knox-plugin-shim/pom.xml +++ b/ranger-knox-plugin-shim/pom.xml @@ -79,4 +79,9 @@ ${project.version} + + + target/classes + target/test-classes + diff --git a/ranger-kylin-plugin-shim/pom.xml b/ranger-kylin-plugin-shim/pom.xml index 4bb9f34edb..d8a420a646 100644 --- a/ranger-kylin-plugin-shim/pom.xml +++ b/ranger-kylin-plugin-shim/pom.xml @@ -69,4 +69,9 @@ ${project.version} + + + target/classes + target/test-classes + \ No newline at end of file diff --git a/ranger-plugin-classloader/pom.xml b/ranger-plugin-classloader/pom.xml index 0aa71c36e9..517f89596a 100644 --- a/ranger-plugin-classloader/pom.xml +++ b/ranger-plugin-classloader/pom.xml @@ -48,4 +48,9 @@ ${slf4j-api.version} + + + target/classes + target/test-classes + diff --git a/ranger-solr-plugin-shim/pom.xml b/ranger-solr-plugin-shim/pom.xml index 46b78218a8..34ad042298 100644 --- a/ranger-solr-plugin-shim/pom.xml +++ b/ranger-solr-plugin-shim/pom.xml @@ -57,4 +57,9 @@ ${project.version} + + + target/classes + target/test-classes + diff --git a/ranger-sqoop-plugin-shim/pom.xml b/ranger-sqoop-plugin-shim/pom.xml index 86ab432332..ab12bc7461 100644 --- a/ranger-sqoop-plugin-shim/pom.xml +++ b/ranger-sqoop-plugin-shim/pom.xml @@ -59,4 +59,9 @@ ${project.version} + + + target/classes + target/test-classes + \ No newline at end of file diff --git a/src/main/assembly/plugin-spark.xml b/src/main/assembly/plugin-spark.xml new file mode 100644 index 0000000000..1d357beaec --- /dev/null +++ b/src/main/assembly/plugin-spark.xml @@ -0,0 +1,51 @@ + + + + spark-plugin + + tar.gz + zip + + ${project.name}-${project.version}-spark-plugin + true + + + + /jars + true + 755 + 644 + false + + + org.apache.ranger:ranger-spark-plugin + + + + + + + / + ${project.build.outputDirectory} + + version + + 444 + + + diff --git a/ugsync/pom.xml b/ugsync/pom.xml index c30d8ca7c6..ca4459d5fe 100644 --- a/ugsync/pom.xml +++ b/ugsync/pom.xml @@ -137,6 +137,8 @@ + target/classes + target/test-classes src/test/resources diff --git a/unixauthclient/pom.xml b/unixauthclient/pom.xml index 4625925a2a..27211e2b5b 100644 --- a/unixauthclient/pom.xml +++ b/unixauthclient/pom.xml @@ -89,4 +89,9 @@ + + + target/classes + target/test-classes + diff --git a/unixauthpam/pom.xml b/unixauthpam/pom.xml index 0f2a4596f2..251bfe08b2 100644 --- a/unixauthpam/pom.xml +++ b/unixauthpam/pom.xml @@ -28,6 +28,8 @@ PAM Authenticator PAM authentication service + target/classes + target/test-classes org.codehaus.mojo