-
Notifications
You must be signed in to change notification settings - Fork 702
Scalding viz options #1426
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Scalding viz options #1426
Changes from all commits
d6709d2
11f232a
a18ec2d
d33bdf2
4ea6e5e
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -16,12 +16,14 @@ limitations under the License. | |
| package com.twitter.scalding | ||
|
|
||
| import org.apache.hadoop.conf.Configuration | ||
| import org.apache.hadoop.mapred.JobConf | ||
| import org.apache.hadoop.io.serializer.{ Serialization => HSerialization } | ||
| import com.twitter.chill.KryoInstantiator | ||
| import com.twitter.chill.{ ExternalizerCodec, ExternalizerInjection, Externalizer, KryoInstantiator } | ||
| import com.twitter.chill.config.{ ScalaMapConfig, ConfiguredInstantiator } | ||
| import com.twitter.bijection.{ Base64String, Injection } | ||
|
|
||
| import cascading.pipe.assembly.AggregateBy | ||
| import cascading.flow.FlowProps | ||
| import cascading.flow.{ FlowListener, FlowStepListener, FlowProps, FlowStepStrategy } | ||
| import cascading.property.AppProps | ||
| import cascading.tuple.collect.SpillableProps | ||
|
|
||
|
|
@@ -298,6 +300,57 @@ trait Config extends Serializable { | |
| def setReducerEstimators(clsList: String): Config = | ||
| this + (Config.ReducerEstimators -> clsList) | ||
|
|
||
| /** | ||
| * configure flow listeners for observability | ||
| */ | ||
| def addFlowListener(flowListenerProvider: (Mode, Config) => FlowListener): Config = { | ||
| val serializedListener = flowListenerSerializer(flowListenerProvider) | ||
| update(Config.FlowListeners) { | ||
| case None => (Some(serializedListener), ()) | ||
| case Some(lst) => (Some(s"$serializedListener,$lst"), ()) | ||
| }._2 | ||
| } | ||
|
|
||
| def getFlowListeners: List[Try[(Mode, Config) => FlowListener]] = | ||
| get(Config.FlowListeners) | ||
| .toIterable | ||
| .flatMap(s => StringUtility.fastSplit(s, ",")) | ||
| .map(flowListenerSerializer.invert(_)) | ||
| .toList | ||
|
|
||
| def addFlowStepListener(flowListenerProvider: (Mode, Config) => FlowStepListener): Config = { | ||
| val serializedListener = flowStepListenerSerializer(flowListenerProvider) | ||
| update(Config.FlowStepListeners) { | ||
| case None => (Some(serializedListener), ()) | ||
| case Some(lst) => (Some(s"$serializedListener,$lst"), ()) | ||
| }._2 | ||
| } | ||
|
|
||
| def getFlowStepListeners: List[Try[(Mode, Config) => FlowStepListener]] = | ||
| get(Config.FlowStepListeners) | ||
| .toIterable | ||
| .flatMap(s => StringUtility.fastSplit(s, ",")) | ||
| .map(flowStepListenerSerializer.invert(_)) | ||
| .toList | ||
|
|
||
| def addFlowStepStrategy(flowStrategyProvider: (Mode, Config) => FlowStepStrategy[JobConf]): Config = { | ||
| val serializedListener = flowStepStrategiesSerializer(flowStrategyProvider) | ||
| update(Config.FlowStepStrategies) { | ||
| case None => (Some(serializedListener), ()) | ||
| case Some(lst) => (Some(s"$serializedListener,$lst"), ()) | ||
| }._2 | ||
| } | ||
|
|
||
| def clearFlowStepStrategies: Config = | ||
| this.-(Config.FlowStepStrategies) | ||
|
|
||
| def getFlowStepStrategies: List[Try[(Mode, Config) => FlowStepStrategy[JobConf]]] = | ||
| get(Config.FlowStepStrategies) | ||
| .toIterable | ||
| .flatMap(s => StringUtility.fastSplit(s, ",")) | ||
| .map(flowStepStrategiesSerializer.invert(_)) | ||
| .toList | ||
|
|
||
| /** Get the number of reducers (this is the parameter Hadoop will use) */ | ||
| def getNumReducers: Option[Int] = get(Config.HadoopNumReducers).map(_.toInt) | ||
| def setNumReducers(n: Int): Config = this + (Config.HadoopNumReducers -> n.toString) | ||
|
|
@@ -326,6 +379,9 @@ object Config { | |
| val ScaldingVersion: String = "scalding.version" | ||
| val HRavenHistoryUserName: String = "hraven.history.user.name" | ||
| val ScaldingRequireOrderedSerialization: String = "scalding.require.orderedserialization" | ||
| val FlowListeners: String = "scalding.observability.flowlisteners" | ||
| val FlowStepListeners: String = "scalding.observability.flowsteplisteners" | ||
| val FlowStepStrategies: String = "scalding.strategies.flowstepstrategies" | ||
|
|
||
| /** | ||
| * Parameter that actually controls the number of reduce tasks. | ||
|
|
@@ -476,4 +532,11 @@ object Config { | |
| is.close() | ||
| md5Hex(bytes) | ||
| } | ||
|
|
||
| private[this] def buildInj[T: ExternalizerInjection: ExternalizerCodec]: Injection[T, String] = | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. While we're at it, why not stack in GzippedBase64String? It might make the jobconfs slightly smaller. |
||
| Injection.connect[T, Externalizer[T], Array[Byte], Base64String, String] | ||
|
|
||
| @transient private[scalding] lazy val flowStepListenerSerializer = buildInj[(Mode, Config) => FlowStepListener] | ||
| @transient private[scalding] lazy val flowListenerSerializer = buildInj[(Mode, Config) => FlowListener] | ||
| @transient private[scalding] lazy val flowStepStrategiesSerializer = buildInj[(Mode, Config) => FlowStepStrategy[JobConf]] | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -16,7 +16,7 @@ limitations under the License. | |
| package com.twitter.scalding | ||
|
|
||
| import cascading.flow.hadoop.HadoopFlow | ||
| import cascading.flow.{ FlowDef, Flow } | ||
| import cascading.flow.{ Flow, FlowDef, FlowListener, FlowStepListener, FlowStepStrategy } | ||
| import cascading.flow.planner.BaseFlowStep | ||
| import cascading.pipe.Pipe | ||
| import com.twitter.scalding.reducer_estimation.ReducerEstimatorStepStrategy | ||
|
|
@@ -25,6 +25,7 @@ import org.apache.hadoop.mapred.JobConf | |
| import scala.collection.JavaConverters._ | ||
| import scala.concurrent.Future | ||
| import scala.util.{ Failure, Success, Try } | ||
| import org.slf4j.{ Logger, LoggerFactory } | ||
|
|
||
| /* | ||
| * This has all the state needed to build a single flow | ||
|
|
@@ -36,6 +37,8 @@ trait ExecutionContext { | |
| def flowDef: FlowDef | ||
| def mode: Mode | ||
|
|
||
| import ExecutionContext._ | ||
|
|
||
| private def getIdentifierOpt(descriptions: Seq[String]): Option[String] = { | ||
| if (descriptions.nonEmpty) Some(descriptions.distinct.mkString(", ")) else None | ||
| } | ||
|
|
@@ -67,8 +70,8 @@ trait ExecutionContext { | |
| name.foreach(flowDef.setName) | ||
|
|
||
| // identify the flowDef | ||
| val withId = config.addUniqueId(UniqueID.getIDFor(flowDef)) | ||
| val flow = mode.newFlowConnector(withId).connect(flowDef) | ||
| val configWithId = config.addUniqueId(UniqueID.getIDFor(flowDef)) | ||
| val flow = mode.newFlowConnector(configWithId).connect(flowDef) | ||
| if (config.getRequireOrderedSerialization) { | ||
| // This will throw, but be caught by the outer try if | ||
| // we have groupby/cogroupby not using OrderedSerializations | ||
|
|
@@ -89,8 +92,30 @@ trait ExecutionContext { | |
| // which instantiates and runs them | ||
| mode match { | ||
| case _: HadoopMode => | ||
| config.get(Config.ReducerEstimators) | ||
| .foreach(_ => flow.setFlowStepStrategy(ReducerEstimatorStepStrategy)) | ||
| val reducerEstimatorStrategy: Seq[FlowStepStrategy[JobConf]] = config.get(Config.ReducerEstimators).toList.map(_ => ReducerEstimatorStepStrategy) | ||
|
|
||
|
|
||
| val otherStrategies: Seq[FlowStepStrategy[JobConf]] = config.getFlowStepStrategies.map { tTry: Try[(Mode, Config) => FlowStepStrategy[JobConf]] => | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Why take the mode as a parameter if we only pass it a HadoopMode? Why not do all these in any case and let the user attach an empty FlowListener/etc... (which we can even provide, as there is a monoid on these things).
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I can move it to Hadoop mode, sure. You mean have a default strategy so it's not an option below? |
||
| val t: (Mode, Config) => FlowStepStrategy[JobConf] = tTry.getOrElse(throw new Exception(s"Failed to decode flow step strategy $tTry when submitting job")) | ||
| t(mode, configWithId) | ||
| } | ||
|
|
||
| val optionalFinalStrategy = FlowStepStrategies().sumOption(reducerEstimatorStrategy ++ otherStrategies) | ||
|
|
||
| optionalFinalStrategy.foreach { strategy => | ||
| flow.setFlowStepStrategy(strategy) | ||
| } | ||
|
|
||
| config.getFlowListeners.foreach { tTry: Try[(Mode, Config) => FlowListener] => | ||
| val t: (Mode, Config) => FlowListener = tTry.getOrElse(throw new Exception(s"Failed to decode flow listener $tTry when submitting job")) | ||
| flow.addListener(t(mode, configWithId)) | ||
| } | ||
|
|
||
| config.getFlowStepListeners.foreach { tTry: Try[(Mode, Config) => FlowStepListener] => | ||
| val t: (Mode, Config) => FlowStepListener = tTry.getOrElse(throw new Exception(s"Failed to decode flow step listener $tTry when submitting job")) | ||
| flow.addStepListener(t(mode, configWithId)) | ||
| } | ||
|
|
||
| case _ => () | ||
| } | ||
|
|
||
|
|
@@ -124,6 +149,8 @@ trait ExecutionContext { | |
| * modeFromImplicit, etc... below. | ||
| */ | ||
| object ExecutionContext { | ||
| private val LOG: Logger = LoggerFactory.getLogger(ExecutionContext.getClass) | ||
|
|
||
| private[scalding] def getDesc[T](baseFlowStep: BaseFlowStep[T]): Seq[String] = { | ||
| baseFlowStep.getGraph.vertexSet.asScala.toSeq.flatMap(_ match { | ||
| case pipe: Pipe => RichPipe.getPipeDescriptions(pipe) | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1 +1 @@ | ||
| version in ThisBuild := "0.15.1-SNAPSHOT" | ||
| version in ThisBuild := "0.15.1-exec.1" |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
just to be clear, we wouldn't merge this yet, right? This is just to get comments?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, exactly: this version isn't on Maven Central, so this won't pass tests. The branch hasn't been merged in Chill either, and there is no release there.