SPARK-17829 [SQL] Stable format for offset log #15626
Changes from all commits
FileStreamSource.scala

@@ -131,8 +131,8 @@ class FileStreamSource(
    * Returns the data that is between the offsets (`start`, `end`].
    */
   override def getBatch(start: Option[Offset], end: Offset): DataFrame = {
-    val startId = start.map(_.asInstanceOf[LongOffset].offset).getOrElse(-1L)
-    val endId = end.asInstanceOf[LongOffset].offset
+    val startId = start.flatMap(LongOffset.convert(_)).getOrElse(LongOffset(-1L)).offset
+    val endId = LongOffset.convert(end).getOrElse(LongOffset(0)).offset

     assert(startId <= endId)
     val files = metadataLog.get(Some(startId + 1), Some(endId)).flatMap(_._2)

Contributor (on getBatch): Given this change, the start and end Offset will not be the custom Offset class defined by the source, so I think it is important to document this in Source.getBatch so that the user does not make that assumption.
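To make the reviewer's point concrete: once an offset has been round-tripped through the serialized log, the Offset handed to getBatch may no longer be the source's own class, so a blind asInstanceOf cast can throw. The sketch below illustrates the defensive conversion pattern the new code relies on; the Offset, LongOffset, and SerializedOffset definitions here are simplified stand-ins for illustration only, not Spark's actual classes.

```scala
// Simplified stand-ins for illustration; not the real Spark classes.
trait Offset { def json: String }

// A source's own offset type, backed by a simple counter.
case class LongOffset(offset: Long) extends Offset {
  override def json: String = offset.toString
}

// What a source may receive after its offset was recovered from the log:
// only the serialized form, not the source-specific class.
case class SerializedOffset(json: String) extends Offset

object LongOffset {
  // Defensive conversion instead of end.asInstanceOf[LongOffset]:
  // accept either representation and return None for anything else.
  def convert(offset: Offset): Option[LongOffset] = offset match {
    case lo: LongOffset      => Some(lo)
    case SerializedOffset(j) => scala.util.Try(LongOffset(j.toLong)).toOption
    case _                   => None
  }
}

object ConvertDemo extends App {
  val recovered: Offset = SerializedOffset("42") // e.g. read back from the offset log
  println(LongOffset.convert(recovered))         // Some(LongOffset(42))
  println(LongOffset.convert(LongOffset(7)))     // Some(LongOffset(7))
  // recovered.asInstanceOf[LongOffset] would throw a ClassCastException here.
}
```

With a conversion like this, getBatch can fall back to a default (LongOffset(-1L) / LongOffset(0) in the diff) instead of failing when the engine hands it a generic offset.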
HDFSMetadataLog.scala

@@ -17,7 +17,8 @@

 package org.apache.spark.sql.execution.streaming

-import java.io.{FileNotFoundException, InputStream, IOException, OutputStream}
+import java.io._
+import java.nio.charset.StandardCharsets
 import java.util.{ConcurrentModificationException, EnumSet, UUID}

 import scala.reflect.ClassTag

@@ -26,9 +27,10 @@ import org.apache.commons.io.IOUtils
 import org.apache.hadoop.conf.Configuration
 import org.apache.hadoop.fs._
 import org.apache.hadoop.fs.permission.FsPermission
+import org.json4s.NoTypeHints
+import org.json4s.jackson.Serialization

 import org.apache.spark.internal.Logging
-import org.apache.spark.serializer.JavaSerializer
 import org.apache.spark.sql.SparkSession
 import org.apache.spark.util.UninterruptibleThread

@@ -44,9 +46,14 @@ import org.apache.spark.util.UninterruptibleThread
  * Note: [[HDFSMetadataLog]] doesn't support S3-like file systems as they don't guarantee listing
  * files in a directory always shows the latest files.
  */
-class HDFSMetadataLog[T: ClassTag](sparkSession: SparkSession, path: String)
+class HDFSMetadataLog[T <: AnyRef : ClassTag](sparkSession: SparkSession, path: String)
   extends MetadataLog[T] with Logging {

+  private implicit val formats = Serialization.formats(NoTypeHints)
+
+  /** Needed to serialize type T into JSON when using Jackson */
+  private implicit val manifest = Manifest.classType[T](implicitly[ClassTag[T]].runtimeClass)
Contributor (on the implicit manifest): Why is this needed now? This wasn't needed earlier when we were using JSON serialization in FileStreamSinkLog, so why is the manifest needed here?

Contributor: I see now why this is needed.

Contributor: Can you add "Needed by Jackson to serialize/deserialize ..." to the comment?

Author: Done.

   // Avoid serializing generic sequences, see SPARK-17372
   require(implicitly[ClassTag[T]].runtimeClass != classOf[Seq[_]],
     "Should not create a log with type Seq, use Arrays instead - see SPARK-17372")
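To illustrate the manifest discussion above: json4s needs an implicit Manifest to deserialize into T, but HDFSMetadataLog only carries a ClassTag context bound, so the Manifest has to be derived from the ClassTag's runtime class. Below is a minimal, self-contained sketch of that pattern, assuming json4s 3.x (as Spark used at the time); JsonLog and BatchInfo are made-up names, not the actual HDFSMetadataLog code.

```scala
import scala.reflect.ClassTag

import org.json4s.{Formats, NoTypeHints}
import org.json4s.jackson.Serialization

// Made-up metadata type standing in for an offset log entry.
case class BatchInfo(batchId: Long, path: String)

class JsonLog[T <: AnyRef : ClassTag] {
  private implicit val formats: Formats = Serialization.formats(NoTypeHints)

  // Serialization.read[T] needs a Manifest[T]; only a ClassTag[T] is in scope,
  // so build the Manifest from the ClassTag's runtime class.
  private implicit val manifest: Manifest[T] =
    Manifest.classType[T](implicitly[ClassTag[T]].runtimeClass)

  def serialize(metadata: T): String = Serialization.write(metadata)

  def deserialize(json: String): T = Serialization.read[T](json)
}

object JsonLogDemo extends App {
  val log = new JsonLog[BatchInfo]
  val json = log.serialize(BatchInfo(7, "hdfs:///tmp/offsets/7"))
  println(json)                  // {"batchId":7,"path":"hdfs:///tmp/offsets/7"}
  println(log.deserialize(json)) // BatchInfo(7,hdfs:///tmp/offsets/7)
}
```

Without the manifest val, the Serialization.read[T] call does not compile, because the compiler cannot materialize a Manifest for an abstract T on its own.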
@@ -67,8 +74,6 @@ class HDFSMetadataLog[T: ClassTag](sparkSession: SparkSession, path: String)
     override def accept(path: Path): Boolean = isBatchFile(path)
   }

-  private val serializer = new JavaSerializer(sparkSession.sparkContext.conf).newInstance()
-
   protected def batchIdToPath(batchId: Long): Path = {
     new Path(metadataPath, batchId.toString)
   }
@@ -88,14 +93,13 @@ class HDFSMetadataLog[T: ClassTag](sparkSession: SparkSession, path: String)

   protected def serialize(metadata: T, out: OutputStream): Unit = {
     // called inside a try-finally where the underlying stream is closed in the caller
-    val outStream = serializer.serializeStream(out)
-    outStream.writeObject(metadata)
+    Serialization.write(metadata, out)
   }

   protected def deserialize(in: InputStream): T = {
     // called inside a try-finally where the underlying stream is closed in the caller
-    val inStream = serializer.deserializeStream(in)
-    inStream.readObject[T]()
+    val reader = new InputStreamReader(in, StandardCharsets.UTF_8)
+    Serialization.read[T](reader)
   }

   /**
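A rough sketch of the shape of the new serialize/deserialize pair, exercising the same json4s calls against in-memory streams; OffsetEntry and the explicit OutputStreamWriter are illustrative choices rather than the PR's exact code. The point of the change is that what ends up on disk is plain UTF-8 JSON instead of Java-serialized bytes, which is the stable format the PR title refers to.

```scala
import java.io.{ByteArrayInputStream, ByteArrayOutputStream, InputStream, InputStreamReader, OutputStream, OutputStreamWriter}
import java.nio.charset.StandardCharsets

import org.json4s.{Formats, NoTypeHints}
import org.json4s.jackson.Serialization

// Illustrative metadata type; not the PR's actual offset classes.
case class OffsetEntry(batchId: Long, offset: Long)

object StableFormatDemo extends App {
  implicit val formats: Formats = Serialization.formats(NoTypeHints)

  def serialize(metadata: OffsetEntry, out: OutputStream): Unit = {
    // Write JSON text; as in the diff, the caller owns closing the stream.
    val writer = new OutputStreamWriter(out, StandardCharsets.UTF_8)
    Serialization.write(metadata, writer)
    writer.flush()
  }

  def deserialize(in: InputStream): OffsetEntry = {
    val reader = new InputStreamReader(in, StandardCharsets.UTF_8)
    Serialization.read[OffsetEntry](reader)
  }

  val buffer = new ByteArrayOutputStream()
  serialize(OffsetEntry(batchId = 3, offset = 42), buffer)

  // Human-readable JSON in the log file, not Java serialization's binary format.
  println(buffer.toString("UTF-8"))  // {"batchId":3,"offset":42}
  println(deserialize(new ByteArrayInputStream(buffer.toByteArray)))
}
```

Because the log entries are now plain text, they can be inspected and parsed without having the exact serialized class on the classpath, which Java serialization would require.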
Contributor (on the T <: AnyRef bound): Why is this AnyRef bound needed?

Author: It's required by json4s that T be an AnyRef.
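A tiny illustration of that constraint, using a stand-in like the JsonLog sketch above: json4s serializes AnyRef values, so the bound rejects primitive type arguments at compile time.

```scala
import scala.reflect.ClassTag

// Stand-in mirroring the bound added in this diff.
class JsonLog[T <: AnyRef : ClassTag]

case class MyOffset(value: Long)

object BoundDemo extends App {
  new JsonLog[MyOffset]  // compiles: a case class is an AnyRef
  // new JsonLog[Long]   // rejected at compile time: Long is an AnyVal, not an AnyRef
  println("ok")
}
```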