@@ -20,8 +20,6 @@ package org.apache.spark.mllib.recommendation
2020import java .io .IOException
2121import java .lang .{Integer => JavaInteger }
2222
23- import scala .collection .mutable
24-
2523import com .clearspring .analytics .stream .cardinality .HyperLogLogPlus
2624import com .github .fommil .netlib .BLAS .{getInstance => blas }
2725import org .apache .hadoop .fs .Path
@@ -33,7 +31,7 @@ import org.apache.spark.SparkContext
3331import org .apache .spark .annotation .Since
3432import org .apache .spark .api .java .{JavaPairRDD , JavaRDD }
3533import org .apache .spark .internal .Logging
36- import org .apache .spark .mllib .linalg ._
34+ import org .apache .spark .mllib .linalg .BLAS
3735import org .apache .spark .mllib .rdd .MLPairRDDFunctions ._
3836import org .apache .spark .mllib .util .{Loader , Saveable }
3937import org .apache .spark .rdd .RDD
@@ -263,6 +261,19 @@ object MatrixFactorizationModel extends Loader[MatrixFactorizationModel] {
263261
264262 /**
265263 * Makes recommendations for all users (or products).
264+ *
265+ * Note: the previous approach used for computing top-k recommendations aimed to group
266+ * individual factor vectors into blocks, so that Level 3 BLAS operations (gemm) could
267+ * be used for efficiency. However, this causes excessive GC pressure due to the large
268+ * arrays required for intermediate result storage, as well as a high sensitivity to the
269+ * block size used.
270+ *
271+ * The following approach still groups factors into blocks, but instead computes the
272+ * top-k elements per block, using dot product and an efficient [[BoundedPriorityQueue ]]
273+ * (instead of gemm). This avoids any large intermediate data structures and results
274+ * in significantly reduced GC pressure as well as shuffle data, which far outweighs
275+ * any cost incurred from not using Level 3 BLAS operations.
276+ *
266277 * @param rank rank
267278 * @param srcFeatures src features to receive recommendations
268279 * @param dstFeatures dst features used to make recommendations
@@ -277,46 +288,22 @@ object MatrixFactorizationModel extends Loader[MatrixFactorizationModel] {
277288 num : Int ): RDD [(Int , Array [(Int , Double )])] = {
278289 val srcBlocks = blockify(srcFeatures)
279290 val dstBlocks = blockify(dstFeatures)
280- /**
281- * The previous approach used for computing top-k recommendations aimed to group
282- * individual factor vectors into blocks, so that Level 3 BLAS operations (gemm) could
283- * be used for efficiency. However, this causes excessive GC pressure due to the large
284- * arrays required for intermediate result storage, as well as a high sensitivity to the
285- * block size used.
286- * The following approach still groups factors into blocks, but instead computes the
287- * top-k elements per block, using a simple dot product (instead of gemm) and an efficient
288- * [[BoundedPriorityQueue ]]. This avoids any large intermediate data structures and results
289- * in significantly reduced GC pressure as well as shuffle data, which far outweighs
290- * any cost incurred from not using Level 3 BLAS operations.
291- */
292291 val ratings = srcBlocks.cartesian(dstBlocks).flatMap { case (srcIter, dstIter) =>
293292 val m = srcIter.size
294293 val n = math.min(dstIter.size, num)
295294 val output = new Array [(Int , (Int , Double ))](m * n)
296- var j = 0
295+ var i = 0
297296 val pq = new BoundedPriorityQueue [(Int , Double )](n)(Ordering .by(_._2))
298297 srcIter.foreach { case (srcId, srcFactor) =>
299298 dstIter.foreach { case (dstId, dstFactor) =>
300- /*
301- * The below code is equivalent to
302- * `val score = blas.ddot(rank, srcFactor, 1, dstFactor, 1)`
303- * This handwritten version is as or more efficient as BLAS calls in this case.
304- */
305- var score : Double = 0
306- var k = 0
307- while (k < rank) {
308- score += srcFactor(k) * dstFactor(k)
309- k += 1
310- }
299+ // We use F2jBLAS which is faster than a call to native BLAS for vector dot product
300+ val score = BLAS .f2jBLAS.ddot(rank, srcFactor, 1 , dstFactor, 1 )
311301 pq += dstId -> score
312302 }
313- val pqIter = pq.iterator
314- var i = 0
315- while (i < n) {
316- output(j + i) = (srcId, pqIter.next())
303+ pq.foreach { case (dstId, score) =>
304+ output(i) = (srcId, (dstId, score))
317305 i += 1
318306 }
319- j += n
320307 pq.clear()
321308 }
322309 output.toSeq
0 commit comments