-
Notifications
You must be signed in to change notification settings - Fork 29k
[SPARK-3936] Remove Bytecode Inspection for Join Elimination #2815
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Closed
Closed
Changes from all commits
Commits
Show all changes
7 commits
Select commit
Hold shift + click to select a range
2e47158
Removing bytecode inspection from triplet operations and introducing …
jegonzal f25a6ac
Fixing typos
jegonzal d32ec1c
dropping commented references to bytecode inspection
jegonzal 8cf4726
updating applications
jegonzal 459fa8c
fixing typo in docs
jegonzal e8a8a8d
removing long lines
jegonzal 4097ca7
Adding an enum for the triplet fields.
jegonzal File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -195,6 +195,12 @@ abstract class Graph[VD: ClassTag, ED: ClassTag] protected () extends Serializab | |
| * the underlying index structures can be reused. | ||
| * | ||
| * @param map the function from an edge object to a new edge value. | ||
| * @param tripletFields indicates which fields of the triplet should be included when it is | ||
| * passed to the map function. Leaving out a vertex attribute that is not needed can improve | ||
| * performance. | ||
| * | ||
| * @tparam ED2 the new edge data type | ||
| * | ||
|
|
@@ -207,8 +213,10 @@ abstract class Graph[VD: ClassTag, ED: ClassTag] protected () extends Serializab | |
| * }}} | ||
| * | ||
| */ | ||
| def mapTriplets[ED2: ClassTag](map: EdgeTriplet[VD, ED] => ED2): Graph[VD, ED2] = { | ||
| mapTriplets((pid, iter) => iter.map(map)) | ||
| def mapTriplets[ED2: ClassTag](map: EdgeTriplet[VD, ED] => ED2, | ||
| tripletFields: TripletFields = TripletFields.All) | ||
| : Graph[VD, ED2] = { | ||
| mapTriplets((pid, iter) => iter.map(map), tripletFields) | ||
| } | ||
|
|
||
| /** | ||
|
|
@@ -223,12 +231,18 @@ abstract class Graph[VD: ClassTag, ED: ClassTag] protected () extends Serializab | |
| * the underlying index structures can be reused. | ||
| * | ||
| * @param map the iterator transform | ||
| * @param tripletFields indicates which fields of the triplets should be included when they | ||
| * are passed to the iterator transform. Leaving out a vertex attribute that is not needed can | ||
| * improve performance. | ||
| * | ||
| * @tparam ED2 the new edge data type | ||
| * | ||
| */ | ||
| def mapTriplets[ED2: ClassTag](map: (PartitionID, Iterator[EdgeTriplet[VD, ED]]) => Iterator[ED2]) | ||
| : Graph[VD, ED2] | ||
| def mapTriplets[ED2: ClassTag](map: (PartitionID, Iterator[EdgeTriplet[VD, ED]]) => Iterator[ED2], | ||
| tripletFields: TripletFields): Graph[VD, ED2] | ||
|
|
||
| /** | ||
| * Reverses all edges in the graph. If this graph contains an edge from a to b then the returned | ||
|
|
@@ -258,7 +272,8 @@ abstract class Graph[VD: ClassTag, ED: ClassTag] protected () extends Serializab | |
| */ | ||
| def subgraph( | ||
| epred: EdgeTriplet[VD,ED] => Boolean = (x => true), | ||
| vpred: (VertexId, VD) => Boolean = ((v, d) => true)) | ||
| vpred: (VertexId, VD) => Boolean = ((v, d) => true), | ||
| tripletFields: TripletFields = TripletFields.All) | ||
| : Graph[VD, ED] | ||
|
|
||
| /** | ||
|
|
@@ -303,6 +318,12 @@ abstract class Graph[VD: ClassTag, ED: ClassTag] protected () extends Serializab | |
| * direction is `Either`, `mapFunc` will be run on edges with *either* vertex in the active set | ||
| * . If the direction is `Both`, `mapFunc` will be run on edges with *both* vertices in the | ||
| * active set. The active set must have the same index as the graph's vertices. | ||
| * @param tripletFields indicates which fields of the triplet should be included when it is | ||
| * passed to `mapFunc`. Leaving out a vertex attribute that is not needed can improve | ||
| * performance. | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. These comments are out of date now that we have tripletFields. |
||
| * | ||
| * @example We can use this function to compute the in-degree of each | ||
| * vertex | ||
|
|
@@ -322,12 +343,13 @@ abstract class Graph[VD: ClassTag, ED: ClassTag] protected () extends Serializab | |
| def mapReduceTriplets[A: ClassTag]( | ||
| mapFunc: EdgeTriplet[VD, ED] => Iterator[(VertexId, A)], | ||
| reduceFunc: (A, A) => A, | ||
| tripletFields: TripletFields = TripletFields.All, | ||
| activeSetOpt: Option[(VertexRDD[_], EdgeDirection)] = None) | ||
| : VertexRDD[A] | ||
|
|
||
| /** | ||
| * Joins the vertices with entries in the `table` RDD and merges the results using `mapFunc`. The | ||
| * input table should contain at most one entry for each vertex. If no entry in `other` is | ||
| * Joins the vertices with entries in the `table` RDD and merges the results using `mapFunc`. | ||
| * The input table should contain at most one entry for each vertex. If no entry in `other` is | ||
| * provided for a particular vertex in the graph, the map function receives `None`. | ||
| * | ||
| * @tparam U the type of entry in the table of updates | ||
|
|
||
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
47 changes: 47 additions & 0 deletions
47
graphx/src/main/scala/org/apache/spark/graphx/TripletFields.scala
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,47 @@ | ||
| /* | ||
| * Licensed to the Apache Software Foundation (ASF) under one or more | ||
| * contributor license agreements. See the NOTICE file distributed with | ||
| * this work for additional information regarding copyright ownership. | ||
| * The ASF licenses this file to You under the Apache License, Version 2.0 | ||
| * (the "License"); you may not use this file except in compliance with | ||
| * the License. You may obtain a copy of the License at | ||
| * | ||
| * http://www.apache.org/licenses/LICENSE-2.0 | ||
| * | ||
| * Unless required by applicable law or agreed to in writing, software | ||
| * distributed under the License is distributed on an "AS IS" BASIS, | ||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| * See the License for the specific language governing permissions and | ||
| * limitations under the License. | ||
| */ | ||
|
|
||
| package org.apache.spark.graphx | ||
|
|
||
|
|
||
| class TripletFields private( | ||
| val useSrc: Boolean, | ||
| val useDst: Boolean, | ||
| val useEdge: Boolean) | ||
| extends Serializable { | ||
| /** | ||
| * The default set of triplet fields includes all fields. | ||
| */ | ||
| def this() = this(true, true, true) | ||
| } | ||
|
|
||
|
|
||
| /** | ||
| * Predefined [[TripletFields]] values for common combinations of triplet fields. | ||
| */ | ||
| object TripletFields { | ||
| final val None = new TripletFields(useSrc = false, useDst = false, useEdge = false) | ||
| final val EdgeOnly = new TripletFields(useSrc = false, useDst = false, useEdge = true) | ||
| final val SrcOnly = new TripletFields(useSrc = true, useDst = false, useEdge = false) | ||
| final val DstOnly = new TripletFields(useSrc = false, useDst = true, useEdge = false) | ||
| final val SrcDstOnly = new TripletFields(useSrc = true, useDst = true, useEdge = false) | ||
| final val SrcAndEdge = new TripletFields(useSrc = true, useDst = false, useEdge = true) | ||
| final val Src = SrcAndEdge | ||
| final val DstAndEdge = new TripletFields(useSrc = false, useDst = true, useEdge = true) | ||
| final val Dst = DstAndEdge | ||
| final val All = new TripletFields(useSrc = true, useDst = true, useEdge = true) | ||
| } |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -127,26 +127,26 @@ class GraphImpl[VD: ClassTag, ED: ClassTag] protected ( | |
| } | ||
|
|
||
| override def mapTriplets[ED2: ClassTag]( | ||
| f: (PartitionID, Iterator[EdgeTriplet[VD, ED]]) => Iterator[ED2]): Graph[VD, ED2] = { | ||
| f: (PartitionID, Iterator[EdgeTriplet[VD, ED]]) => Iterator[ED2], | ||
| tripletFields: TripletFields): Graph[VD, ED2] = { | ||
| vertices.cache() | ||
| val mapUsesSrcAttr = accessesVertexAttr(f, "srcAttr") | ||
| val mapUsesDstAttr = accessesVertexAttr(f, "dstAttr") | ||
| replicatedVertexView.upgrade(vertices, mapUsesSrcAttr, mapUsesDstAttr) | ||
| replicatedVertexView.upgrade(vertices, tripletFields.useSrc, tripletFields.useDst) | ||
| val newEdges = replicatedVertexView.edges.mapEdgePartitions { (pid, part) => | ||
| part.map(f(pid, part.tripletIterator(mapUsesSrcAttr, mapUsesDstAttr))) | ||
| part.map(f(pid, part.tripletIterator(tripletFields.useSrc, tripletFields.useDst))) | ||
| } | ||
| new GraphImpl(vertices, replicatedVertexView.withEdges(newEdges)) | ||
| } | ||
|
|
||
| override def subgraph( | ||
| epred: EdgeTriplet[VD, ED] => Boolean = x => true, | ||
| vpred: (VertexId, VD) => Boolean = (a, b) => true): Graph[VD, ED] = { | ||
| vpred: (VertexId, VD) => Boolean = (a, b) => true, | ||
| tripletFields: TripletFields = TripletFields.All): Graph[VD, ED] = { | ||
| vertices.cache() | ||
| // Filter the vertices, reusing the partitioner and the index from this graph | ||
| val newVerts = vertices.mapVertexPartitions(_.filter(vpred)) | ||
| // Filter the triplets. We must always upgrade the triplet view fully because vpred always runs | ||
| // on both src and dst vertices | ||
| replicatedVertexView.upgrade(vertices, true, true) | ||
| replicatedVertexView.upgrade(vertices, tripletFields.useSrc, tripletFields.useDst) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I don't think this change is correct - see the comment on the line above. |
||
| val newEdges = replicatedVertexView.edges.filter(epred, vpred) | ||
| new GraphImpl(newVerts, replicatedVertexView.withEdges(newEdges)) | ||
| } | ||
|
|
@@ -171,15 +171,13 @@ class GraphImpl[VD: ClassTag, ED: ClassTag] protected ( | |
| override def mapReduceTriplets[A: ClassTag]( | ||
| mapFunc: EdgeTriplet[VD, ED] => Iterator[(VertexId, A)], | ||
| reduceFunc: (A, A) => A, | ||
| activeSetOpt: Option[(VertexRDD[_], EdgeDirection)] = None): VertexRDD[A] = { | ||
| tripletFields: TripletFields, | ||
| activeSetOpt: Option[(VertexRDD[_], EdgeDirection)]): VertexRDD[A] = { | ||
|
|
||
| vertices.cache() | ||
|
|
||
| // For each vertex, replicate its attribute only to partitions where it is | ||
| // in the relevant position in an edge. | ||
| val mapUsesSrcAttr = accessesVertexAttr(mapFunc, "srcAttr") | ||
| val mapUsesDstAttr = accessesVertexAttr(mapFunc, "dstAttr") | ||
| replicatedVertexView.upgrade(vertices, mapUsesSrcAttr, mapUsesDstAttr) | ||
| replicatedVertexView.upgrade(vertices, tripletFields.useSrc, tripletFields.useDst) | ||
| val view = activeSetOpt match { | ||
| case Some((activeSet, _)) => | ||
| replicatedVertexView.withActiveSet(activeSet) | ||
|
|
@@ -220,8 +218,8 @@ class GraphImpl[VD: ClassTag, ED: ClassTag] protected ( | |
| } | ||
|
|
||
| // Scan edges and run the map function | ||
| val mapOutputs = edgePartition.upgradeIterator(edgeIter, mapUsesSrcAttr, mapUsesDstAttr) | ||
| .flatMap(mapFunc(_)) | ||
| val mapOutputs = edgePartition.upgradeIterator(edgeIter, tripletFields.useSrc, | ||
| tripletFields.useDst).flatMap(mapFunc(_)) | ||
| // Note: This doesn't allow users to send messages to arbitrary vertices. | ||
| edgePartition.vertices.aggregateUsingIndex(mapOutputs, reduceFunc).iterator | ||
| }).setName("GraphImpl.mapReduceTriplets - preAgg") | ||
|
|
@@ -251,14 +249,6 @@ class GraphImpl[VD: ClassTag, ED: ClassTag] protected ( | |
| } | ||
| } | ||
|
|
||
| /** Test whether the closure accesses the attribute with name `attrName`. */ | ||
| private def accessesVertexAttr(closure: AnyRef, attrName: String): Boolean = { | ||
| try { | ||
| BytecodeUtils.invokedMethod(closure, classOf[EdgeTriplet[VD, ED]], attrName) | ||
| } catch { | ||
| case _: ClassNotFoundException => true // if we don't know, be conservative | ||
| } | ||
| } | ||
| } // end of class GraphImpl | ||
|
|
||
|
|
||
|
|
||
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'd deprecate the old one and create a new one for the new API.
The old one should keep the old behavior.