-
Notifications
You must be signed in to change notification settings - Fork 1.2k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Support resultSet direct push for SparkSqlExecutor (#5170)
* support direct push in SparkSqlExecutor
* fix the default type of fetchSize
* disable the feature of higher versions of scala
* add dependency of arrow to spark engine
* add UT for arrowUtil
* add restful api server
* Remove methods that use features with excessively high versions and are not used for the time being.
* remove unused ExecutionNodedStatus
* fix conversion problem
* fix ArrowConverter of valueCount issue
* remove ReadyForFetchResponse
* disable job-way to fetch cached data
- Loading branch information
Showing
11 changed files
with
369 additions
and
15 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
70 changes: 70 additions & 0 deletions
70
...lugins/spark/src/main/java/org/apache/linkis/engineplugin/spark/DirectPushRestfulApi.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,70 @@ | ||
/* | ||
* Licensed to the Apache Software Foundation (ASF) under one or more | ||
* contributor license agreements. See the NOTICE file distributed with | ||
* this work for additional information regarding copyright ownership. | ||
* The ASF licenses this file to You under the Apache License, Version 2.0 | ||
* (the "License"); you may not use this file except in compliance with | ||
* the License. You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
package org.apache.linkis.engineplugin.spark; | ||
|
||
import org.apache.linkis.engineplugin.spark.utils.DataFrameResponse; | ||
import org.apache.linkis.engineplugin.spark.utils.DirectPushCache; | ||
import org.apache.linkis.server.Message; | ||
|
||
import org.springframework.web.bind.annotation.RequestBody; | ||
import org.springframework.web.bind.annotation.RequestMapping; | ||
import org.springframework.web.bind.annotation.RequestMethod; | ||
import org.springframework.web.bind.annotation.RestController; | ||
|
||
import javax.servlet.http.HttpServletRequest; | ||
|
||
import java.util.Map; | ||
|
||
import io.swagger.annotations.Api; | ||
import org.slf4j.Logger; | ||
import org.slf4j.LoggerFactory; | ||
|
||
@Api(tags = "DirectPush") | ||
@RestController | ||
@RequestMapping(path = "directpush") | ||
public class DirectPushRestfulApi { | ||
private static final Logger logger = LoggerFactory.getLogger(DirectPushRestfulApi.class); | ||
|
||
@RequestMapping(path = "pull", method = RequestMethod.POST) | ||
public Message getDirectPushResult( | ||
HttpServletRequest req, @RequestBody Map<String, Object> json) { | ||
Message message = null; | ||
try { | ||
String taskId = (String) json.getOrDefault("taskId", null); | ||
if (taskId == null) { | ||
message = Message.error("taskId is null"); | ||
return message; | ||
} | ||
int fetchSize = (int) json.getOrDefault("fetchSize", 1000); | ||
|
||
DataFrameResponse response = DirectPushCache.fetchResultSetOfDataFrame(taskId, fetchSize); | ||
if (response.dataFrame() == null) { | ||
message = Message.error("No result found for taskId: " + taskId); | ||
} else { | ||
message = | ||
Message.ok() | ||
.data("data", response.dataFrame()) | ||
.data("hasMoreData", response.hasMoreData()); | ||
} | ||
} catch (Exception e) { | ||
logger.error("Failed to get direct push result", e); | ||
message = Message.error("Failed to get direct push result: " + e.getMessage()); | ||
} | ||
return message; | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
101 changes: 101 additions & 0 deletions
101
...-plugins/spark/src/main/scala/org/apache/linkis/engineplugin/spark/utils/ArrowUtils.scala
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,101 @@ | ||
/* | ||
* Licensed to the Apache Software Foundation (ASF) under one or more | ||
* contributor license agreements. See the NOTICE file distributed with | ||
* this work for additional information regarding copyright ownership. | ||
* The ASF licenses this file to You under the Apache License, Version 2.0 | ||
* (the "License"); you may not use this file except in compliance with | ||
* the License. You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
package org.apache.linkis.engineplugin.spark.utils | ||
|
||
import org.apache.arrow.memory.RootAllocator | ||
import org.apache.arrow.vector._ | ||
import org.apache.arrow.vector.ipc.ArrowStreamWriter | ||
import org.apache.spark.sql.DataFrame | ||
import org.apache.spark.sql.types._ | ||
|
||
import java.io.ByteArrayOutputStream | ||
import java.util | ||
|
||
/**
 * Helpers for serializing a Spark DataFrame with the Arrow stream writer.
 */
object ArrowUtils {

  import java.nio.charset.StandardCharsets

  import scala.util.control.NonFatal

  /**
   * Collects `df` to the driver and serializes it as a single Arrow record batch.
   *
   * Integer, long, double and boolean columns keep their type; every other column is written
   * as a UTF-8 string using the value's `toString`. Null cells are written as Arrow nulls.
   *
   * @param df the DataFrame to serialize; it is fully collected, so it must fit in driver memory
   * @return the bytes produced by the Arrow stream writer (schema followed by one batch)
   */
  def toArrow(df: DataFrame): Array[Byte] = {
    val allocator = new RootAllocator(Long.MaxValue)
    try {
      val (root, fieldVectors) = createArrowVectors(df, allocator)
      try {
        val outStream = new ByteArrayOutputStream()
        val writer = new ArrowStreamWriter(root, null, outStream)
        try {
          writer.start()
          writer.writeBatch()
          writer.end()
        } finally {
          writer.close()
        }
        outStream.toByteArray
      } finally {
        // Release the vector buffers before the allocator is closed, also on failure.
        fieldVectors.foreach(_.close())
      }
    } finally {
      allocator.close()
    }
  }

  /**
   * Builds one Arrow vector per column of `df` and fills it with the collected rows.
   *
   * @param df the DataFrame to convert
   * @param allocator the allocator the vectors take their buffers from
   * @return the schema root wrapping the vectors, and the vectors themselves so that the caller
   *         can close them
   */
  private def createArrowVectors(
      df: DataFrame,
      allocator: RootAllocator
  ): (VectorSchemaRoot, List[FieldVector]) = {
    val fields = df.schema.fields
    // Collect exactly once: the row count used for allocation and the data written below then
    // come from the same Spark action instead of one count() per column plus a collect().
    val rows = df.collect()
    val rowCount = rows.length

    // Construct every vector first so that all of them are tracked for cleanup before any
    // buffer is requested from the allocator.
    val vectors: Array[FieldVector] = fields.map { field =>
      val vector: FieldVector = field.dataType match {
        case IntegerType => new IntVector(field.name, allocator)
        case LongType => new BigIntVector(field.name, allocator)
        case DoubleType => new Float8Vector(field.name, allocator)
        case BooleanType => new BitVector(field.name, allocator)
        case _ => new VarCharVector(field.name, allocator)
      }
      vector
    }

    try {
      vectors.foreach {
        case v: VarCharVector => v.allocateNew(rowCount)
        case v: FixedWidthVector => v.allocateNew(rowCount)
      }

      for (i <- rows.indices; j <- vectors.indices) {
        val row = rows(i)
        val isNull = row.isNullAt(j)
        vectors(j) match {
          case v: IntVector =>
            if (isNull) v.setNull(i) else v.setSafe(i, row.getInt(j))
          case v: BigIntVector =>
            if (isNull) v.setNull(i) else v.setSafe(i, row.getLong(j))
          case v: Float8Vector =>
            if (isNull) v.setNull(i) else v.setSafe(i, row.getDouble(j))
          case v: BitVector =>
            if (isNull) v.setNull(i) else v.setSafe(i, if (row.getBoolean(j)) 1 else 0)
          case v: VarCharVector =>
            // Fallback column: row.get + toString works for any Spark type, whereas
            // row.getString only works for string columns.
            if (isNull) v.setNull(i)
            else v.setSafe(i, row.get(j).toString.getBytes(StandardCharsets.UTF_8))
        }
      }

      // Publish the number of written values once, after all cells are set.
      vectors.foreach(_.setValueCount(rowCount))

      val javaFieldVectors: util.ArrayList[FieldVector] =
        new util.ArrayList[FieldVector](vectors.length)
      vectors.foreach(v => javaFieldVectors.add(v))
      val root = new VectorSchemaRoot(javaFieldVectors)
      root.setRowCount(rowCount)

      (root, vectors.toList)
    } catch {
      case NonFatal(e) =>
        // Do not leave buffers behind when allocation or conversion fails.
        vectors.foreach(_.close())
        throw e
    }
  }

}
65 changes: 65 additions & 0 deletions
65
...ins/spark/src/main/scala/org/apache/linkis/engineplugin/spark/utils/DirectPushCache.scala
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,65 @@ | ||
/* | ||
* Licensed to the Apache Software Foundation (ASF) under one or more | ||
* contributor license agreements. See the NOTICE file distributed with | ||
* this work for additional information regarding copyright ownership. | ||
* The ASF licenses this file to You under the Apache License, Version 2.0 | ||
* (the "License"); you may not use this file except in compliance with | ||
* the License. You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
package org.apache.linkis.engineplugin.spark.utils | ||
|
||
import org.apache.linkis.engineconn.common.conf.{EngineConnConf, EngineConnConstant} | ||
|
||
import org.apache.spark.sql.DataFrame | ||
|
||
import java.util.concurrent.TimeUnit | ||
|
||
import com.google.common.cache.{Cache, CacheBuilder} | ||
|
||
/**
 * One batch handed out by [[DirectPushCache]].
 *
 * @param dataFrame the rows of this batch
 * @param hasMoreData whether further batches can still be fetched for the same task
 */
case class DataFrameResponse(
    dataFrame: DataFrame,
    hasMoreData: Boolean
)
|
||
/**
 * Cache of task result sets (one DataFrame per task id) that clients consume in batches.
 *
 * Entries expire after `EngineConnConf.ENGINE_TASK_EXPIRE_TIME` milliseconds without access
 * and the cache holds at most `EngineConnConstant.MAX_TASK_NUM` entries.
 */
object DirectPushCache {

  private val resultSet: Cache[String, DataFrame] = CacheBuilder
    .newBuilder()
    .expireAfterAccess(EngineConnConf.ENGINE_TASK_EXPIRE_TIME.getValue, TimeUnit.MILLISECONDS)
    .maximumSize(EngineConnConstant.MAX_TASK_NUM)
    .build()

  /**
   * Hands out the next batch of at most `fetchSize` rows of a task's result set.
   *
   * This method is not idempotent: the rows of the returned batch are removed from the cached
   * DataFrame, and the cache entry is dropped once a batch smaller than `fetchSize` is served.
   *
   * NOTE(review): the batch is a lazy `limit` over a DataFrame without explicit ordering, so
   * the rows it yields when evaluated later are presumably not guaranteed to be the same rows
   * that were subtracted from the cache - confirm against the caller's expectations.
   *
   * @param taskId id of the task whose result is requested
   * @param fetchSize maximum number of rows in the batch, must be positive
   * @return the batch and whether more data remains for the task
   * @throws IllegalArgumentException if `fetchSize` is not positive
   * @throws IllegalAccessException if no result set is cached for `taskId`
   */
  def fetchResultSetOfDataFrame(taskId: String, fetchSize: Int): DataFrameResponse = {
    // A zero size would never drain the cache entry and a negative one is rejected by Spark's
    // limit anyway, so fail fast with a clear message.
    require(fetchSize > 0, s"fetchSize must be positive, but got $fetchSize")

    Option(resultSet.getIfPresent(taskId)) match {
      case None =>
        throw new IllegalAccessException(s"Task $taskId not exists in resultSet cache.")
      case Some(df) =>
        val batchDf = df.limit(fetchSize)
        if (batchDf.count() < fetchSize) {
          // All the data in df has been consumed.
          resultSet.invalidate(taskId)
          DataFrameResponse(batchDf, hasMoreData = false)
        } else {
          // Keep only the rows that have not been handed out. exceptAll (EXCEPT ALL) removes
          // one occurrence per batch row, whereas except (EXCEPT DISTINCT) would also drop
          // every duplicate of a served row and de-duplicate the remainder.
          resultSet.put(taskId, df.exceptAll(batchDf))
          DataFrameResponse(batchDf, hasMoreData = true)
        }
    }
  }

  /**
   * @param taskId id of the task to look up
   * @return true if a result set is currently cached for `taskId`
   */
  def isTaskCached(taskId: String): Boolean =
    resultSet.getIfPresent(taskId) != null

  /**
   * Stores (or replaces) the result set of a task.
   *
   * @param taskId id of the task that produced the result
   * @param df the result set to expose for batch fetching
   */
  def submitExecuteResult(taskId: String, df: DataFrame): Unit =
    resultSet.put(taskId, df)

}
Oops, something went wrong.