diff --git a/.gitignore b/.gitignore
index 6a98303bb..d61a9bf82 100644
--- a/.gitignore
+++ b/.gitignore
@@ -17,3 +17,5 @@ modules.xml
 *.pdf
 *.gz
 *.sc
+
+*.o
diff --git a/src/main/scala/apps/harrisCornerDetection.scala b/src/main/scala/apps/harrisCornerDetection.scala
index 80d426c65..9f5803b4e 100644
--- a/src/main/scala/apps/harrisCornerDetection.scala
+++ b/src/main/scala/apps/harrisCornerDetection.scala
@@ -12,6 +12,13 @@ import shine.OpenCL.KernelExecutor._
 
 import scala.reflect.ClassTag
 
+/** This version of Harris is based on the following paper:
+  * https://dl.acm.org/doi/abs/10.1145/2568058.2568067
+  *
+  * Compared to Halide's version:
+  * - it starts from grayscale images instead of color images
+  * - it uses a binomial filter instead of a box filter
+  */
 object harrisCornerDetection {
   private val C2D = separableConvolution2D
   private val id = C2D.id
diff --git a/src/main/scala/apps/harrisCornerDetection2/binomialCoarsity.scala b/src/main/scala/apps/harrisCornerDetection2/binomialCoarsity.scala
new file mode 100644
index 000000000..eac19963f
--- /dev/null
+++ b/src/main/scala/apps/harrisCornerDetection2/binomialCoarsity.scala
@@ -0,0 +1,242 @@
+package apps.harrisCornerDetection2
+
+import rise.core._
+import rise.core.DSL._
+import rise.core.primitives.{id => _, _}
+import Type._
+import rise.core.types._
+import rise.core.types.DataType._
+import rise.openCL.DSL._
+import rise.openCL.primitives.oclRotateValues
+import shine.OpenCL.{GlobalSize, LocalSize}
+
+object binomialCoarsity { // 3x3 binomial filter fused with the coarsity formula (det - kappa * trace^2), in several variants
+  def check(module: shine.OpenCL.Module, h: Int, w: Int, kappa: Float): Unit = { // builds a C++ driver comparing the kernel with binomial_gold + coarsity_gold
+    val main = s"""
+#include "src/main/scala/apps/harrisCornerDetection2/common.cpp"
+
+int main(int argc, char** argv) {
+  Context ctx = createDefaultContext();
+  size_t in_bytes = $h * $w * sizeof(float);
+  size_t out_bytes = ${h - 2*bd_h} * ${w - 2*bd_w} * sizeof(float);
+  Buffer input_ixx = createBuffer(ctx, in_bytes, HOST_WRITE | HOST_READ | DEVICE_READ);
+  Buffer input_ixy = createBuffer(ctx, in_bytes, HOST_WRITE | HOST_READ | DEVICE_READ);
+  Buffer input_iyy = createBuffer(ctx, in_bytes, HOST_WRITE | HOST_READ | DEVICE_READ);
+  Buffer output = createBuffer(ctx, out_bytes, HOST_READ | HOST_WRITE | DEVICE_WRITE);
+
+  float* sxx_gold = (float*) malloc(out_bytes);
+  float* sxy_gold = (float*) malloc(out_bytes);
+  float* syy_gold = (float*) malloc(out_bytes);
+  float* out_gold = (float*) malloc(out_bytes);
+
+  std::random_device rand_d;
+  std::default_random_engine rand_e(rand_d());
+  // bigger range results in higher output differences
+  std::uniform_real_distribution<float> dist(0, 200);
+
+  float* in_ixx = (float*) hostBufferSync(ctx, input_ixx, in_bytes, HOST_WRITE | HOST_READ);
+  float* in_ixy = (float*) hostBufferSync(ctx, input_ixy, in_bytes, HOST_WRITE | HOST_READ);
+  float* in_iyy = (float*) hostBufferSync(ctx, input_iyy, in_bytes, HOST_WRITE | HOST_READ);
+  for (int y = 0; y < $h; y++) {
+    for (int x = 0; x < $w; x++) {
+      in_ixx[y*$w + x] = dist(rand_e);
+      in_ixy[y*$w + x] = dist(rand_e);
+      in_iyy[y*$w + x] = dist(rand_e);
+    }
+  }
+
+  binomial_gold(sxx_gold, $h, $w, in_ixx);
+  binomial_gold(sxy_gold, $h, $w, in_ixy);
+  binomial_gold(syy_gold, $h, $w, in_iyy);
+  coarsity_gold(out_gold, ${h - 2*bd_h}, ${w - 2*bd_w}, sxx_gold, sxy_gold, syy_gold, $kappa);
+
+  foo_init_run(ctx, output, $h, $w, input_ixx, input_ixy, input_iyy, $kappa);
+
+  ErrorStats errors;
+  init_error_stats(&errors);
+  float* out = (float*) hostBufferSync(ctx, output, out_bytes, HOST_READ);
+  accumulate_error_stats(&errors, out, out_gold, ${h - 2*bd_h}, ${w - 2*bd_w});
+  finish_error_stats(&errors, 0.05, 0.0001);
+
+  free(sxx_gold);
+  free(sxy_gold);
+  free(syy_gold);
+  free(out_gold);
+  destroyBuffer(ctx, input_ixx);
+  destroyBuffer(ctx, input_ixy);
+  destroyBuffer(ctx, input_iyy);
+  destroyBuffer(ctx, output);
+  destroyContext(ctx);
+  return EXIT_SUCCESS;
+}
+"""
+    util.ExecuteOpenCL.using_cpp(main, module, "one_copy") // NOTE(review): presumably compiles and runs the driver with the module — confirm in util.ExecuteOpenCL
+  }
+  // Baseline variant: outer dimension through mapGlobal, inner through mapSeq, 2D 3x3 dot product with binomialWeights2d.
+  val base: ToBeTyped[Expr] =
+    depFun(hFrom(3), (h: Nat) =>
+    depFun(wFrom(12), (w: Nat) => fun(
+      (h`.`w`.`f32) ->: (h`.`w`.`f32) ->: (h`.`w`.`f32) ->: f32
+        ->: ((h - 2*bd_h)`.`(w - 2*bd_w)`.`f32)
+    )((sxx, sxy, syy, kappa) =>
+      oclRun(LocalSize(1), GlobalSize(num_threads))(
+        makeArray(3)(sxx)(sxy)(syy) |> // stack the three inputs into a single 3-element array of images
+        transpose >> map(transpose) >>
+        map(drop(bd_w-1) >> take(w - 2*(bd_w-1)) >> slide(3)(1)) >> // keep one extra column per side, then 3-wide windows
+        drop(bd_h-1) >> take(h - 2*(bd_h-1)) >> slide(3)(1) >> // keep one extra row per side, then 3-high windows
+        map(transpose) >>
+        mapGlobal(mapSeq(
+          map(transpose) >> transpose >>
+          toPrivateFun(mapSeqUnroll(fun(nbh =>
+            dotSeqU(join(binomialWeights2d))(join(nbh))
+          ))) >>
+          letf(fun(s => { // s holds the three filtered values, indexed 0..2 below
+            val sxx = s `@` lidx(0, 3)
+            val sxy = s `@` lidx(1, 3)
+            val syy = s `@` lidx(2, 3)
+            val det = sxx * syy - sxy * sxy
+            val trace = sxx + syy
+            det - kappa * trace * trace
+          }))
+        ))
+    ))))
+  // Line-vectorised variant: separable filter (binomialWeightsV then binomialWeightsH) on vecw-wide vectors, vertical pass written with toGlobal.
+  val lineVec: ToBeTyped[Expr] =
+    depFun(hFrom(3), (h: Nat) =>
+    depFun(wFrom(12), (w: Nat) => fun(
+      (h`.`w`.`f32) ->: (h`.`w`.`f32) ->: (h`.`w`.`f32) ->: f32
+        ->: ((h - 2*bd_h)`.`(w - 2*bd_w)`.`f32)
+    )((sxx, sxy, syy, kappa) =>
+      oclRun(LocalSize(1), GlobalSize(num_threads))(
+        makeArray(3)(sxx)(sxy)(syy) |>
+        map(
+          map(drop(bd_w-vecw) >> take(w - 2*(bd_w-vecw))) >> // keep a vecw-wide margin of columns on each side
+          drop(bd_h-1) >> take(h - 2*(bd_h-1)) >>
+          map(asVectorAligned(vecw))
+        ) >>
+        transpose >> map(transpose) >>
+        slide(3)(1) >> mapGlobal(
+          transpose >> map(transpose) >>
+          mapSeq(mapSeqUnroll(dotSeqUWV(binomialWeightsV))) >>
+          toGlobal >>
+          slide(3)(1) >>
+          mapSeq(
+            transpose >> map(shuffle) >>
+            toPrivateFun(mapSeqUnroll(dotSeqUWV(binomialWeightsH))) >>
+            letf(fun(s => {
+              val sxx = s `@` lidx(0, 3)
+              val sxy = s `@` lidx(1, 3)
+              val syy = s `@` lidx(2, 3)
+              val det = sxx * syy - sxy * sxy
+              val trace = sxx + syy
+              det - vectorFromScalar(kappa) * trace * trace // kappa is broadcast to a vector to match the vector operands
+            }))
+          ) >> asScalar
+        )
+      ))))
+  // Same pipeline as lineVec, except the vertical results go through oclRotateValues(AddressSpace.Private) + iterateStream instead of toGlobal.
+  val rotvVec: ToBeTyped[Expr] =
+    depFun(hFrom(3), (h: Nat) =>
+    depFun(wFrom(12), (w: Nat) => fun(
+      (h`.`w`.`f32) ->: (h`.`w`.`f32) ->: (h`.`w`.`f32)
+        ->: f32 ->: ((h - 2*bd_h)`.`(w - 2*bd_w)`.`f32)
+    )((sxx, sxy, syy, kappa) =>
+      oclRun(LocalSize(1), GlobalSize(num_threads))(
+        makeArray(3)(sxx)(sxy)(syy) |>
+        map(
+          map(drop(bd_w-vecw) >> take(w - 2*(bd_w-vecw))) >>
+          drop(bd_h-1) >> take(h - 2*(bd_h-1)) >>
+          map(asVectorAligned(vecw))
+        ) >>
+        transpose >> map(transpose) >>
+        slide(3)(1) >> mapGlobal(
+          transpose >> map(transpose) >>
+          map(map(dotSeqUWV(binomialWeightsV))) >>
+          oclRotateValues(AddressSpace.Private)(3)(mapSeqUnroll(id)) >> iterateStream(
+            transpose >> map(shuffle) >>
+            toPrivateFun(mapSeqUnroll(dotSeqUWV(binomialWeightsH))) >>
+            letf(fun(s => {
+              val sxx = s `@` lidx(0, 3)
+              val sxy = s `@` lidx(1, 3)
+              val syy = s `@` lidx(2, 3)
+              val det = sxx * syy - sxy * sxy
+              val trace = sxx + syy
+              det - vectorFromScalar(kappa) * trace * trace
+            }))
+          ) >> asScalar
+        )
+    ))))
+  // Tiled variant: tile_x by tile_y output tiles mapped with mapWorkGroup / mapLocal, 2D 3x3 dot product per output.
+  val tile: ToBeTyped[Expr] = {
+    val tile_x_in = tile_x + 2 // input tile width: tile_x outputs plus one extra column per side for the 3-wide window
+    val tile_y_in = tile_y + 2 // input tile height: tile_y outputs plus one extra row per side
+    depFun(hFrom(tile_y), (h: Nat) =>
+    depFun(wFrom(tile_x), (w: Nat) => fun(
+      (h`.`w`.`f32) ->: (h`.`w`.`f32) ->: (h`.`w`.`f32) ->: f32
+        ->: ((h - 2*bd_h)`.`(w - 2*bd_w)`.`f32)
+    )((sxx, sxy, syy, kappa) =>
+      oclRun(LocalSize((tile_x, tile_y)), GlobalSize((w - 2*bd_w, h - 2*bd_h)))(
+        makeArray(3)(sxx)(sxy)(syy) |>
+        transpose >> map(transpose) >>
+        map(drop(bd_w-1) >> take(w - 2*(bd_w-1)) >> slide(tile_x_in)(tile_x)) >>
+        drop(bd_h-1) >> take(h - 2*(bd_h-1)) >> slide(tile_y_in)(tile_y) >>
+        map(transpose) >>
+        map(map(
+          map(slide(3)(1)) >> slide(3)(1) >> map(transpose)
+        )) >>
+        mapWorkGroup(1)(mapWorkGroup(0)(
+          mapLocal(1)(mapLocal(0)(
+            map(transpose) >> transpose >>
+            toPrivateFun(mapSeqUnroll(fun(nbh =>
+              dotSeqU(join(binomialWeights2d))(join(nbh))
+            ))) >>
+            letf(fun(s => {
+              val sxx = s `@` lidx(0, 3)
+              val sxy = s `@` lidx(1, 3)
+              val syy = s `@` lidx(2, 3)
+              val det = sxx * syy - sxy * sxy
+              val trace = sxx + syy
+              det - kappa * trace * trace
+            }))
+          ))
+        )) >> map(transpose) >> join >> map(join) // flatten the grid of tiles back into a 2D image
+    ))))
+  }
+  // Tiled and vectorised variant: as tile, but on vecw-wide vectors, with launch sizes in x divided by vecw.
+  val tileVec: ToBeTyped[Expr] = {
+    val tile_x_in = tile_x + 2*vecw // input tile width: tile_x outputs plus vecw extra columns per side
+    val tile_y_in = tile_y + 2
+    depFun(hFrom(tile_y), (h: Nat) =>
+    depFun(wFrom(tile_x), (w: Nat) => fun(
+      (h`.`w`.`f32) ->: (h`.`w`.`f32) ->: (h`.`w`.`f32) ->: f32
+        ->: ((h - 2*bd_h)`.`(w - 2*bd_w)`.`f32)
+    )((sxx, sxy, syy, kappa) =>
+      oclRun(LocalSize((tile_x / vecw, tile_y)), GlobalSize(((w - 2*bd_w) / vecw, h - 2*bd_h)))(
+        makeArray(3)(sxx)(sxy)(syy) |>
+        map(
+          map(drop(bd_w-vecw) >> take(w - 2*(bd_w-vecw)) >> slide(tile_x_in)(tile_x)) >>
+          drop(bd_h-1) >> take(h - 2*(bd_h-1)) >> slide(tile_y_in)(tile_y) >>
+          map(transpose) >>
+          map(map(
+            map(asVectorAligned(vecw) >> slide(3)(1)) >> slide(3)(1) >> map(transpose)
+          ))
+        ) >>
+        transpose >> map(transpose >> map(transpose >> map(transpose))) >>
+        mapWorkGroup(1)(mapWorkGroup(0)(
+          mapLocal(1)(mapLocal(0)(
+            toPrivateFun(mapSeqUnroll(fun(nbh =>
+              dotSeqUWV(join(binomialWeights2d))(join(map(shuffle)(nbh)))
+            ))) >>
+            letf(fun(s => {
+              val sxx = s `@` lidx(0, 3)
+              val sxy = s `@` lidx(1, 3)
+              val syy = s `@` lidx(2, 3)
+              val det = sxx * syy - sxy * sxy
+              val trace = sxx + syy
+              det - vectorFromScalar(kappa) * trace * trace
+            }))
+          ))
+        )) >> map(transpose) >> join >> map(join >> asScalar)
+    ))))
+  }
+}
diff --git a/src/main/scala/apps/harrisCornerDetection2/coarsity.scala b/src/main/scala/apps/harrisCornerDetection2/coarsity.scala
new file mode 100644
index 000000000..310057677
--- /dev/null
+++ b/src/main/scala/apps/harrisCornerDetection2/coarsity.scala
@@ -0,0 +1,161 @@
+package apps.harrisCornerDetection2
+
+import rise.core._
+import rise.core.DSL._
+import rise.core.primitives.{id => _, _}
+import Type._
+import rise.core.types._
+import rise.core.types.DataType._
+import rise.openCL.DSL._
+import shine.OpenCL.{GlobalSize, LocalSize}
+
+object coarsity { // coarsity formula (det - kappa * trace^2) applied element-wise to three same-sized images, in several variants
+  def check(module: shine.OpenCL.Module, h: Int, w: Int, kappa: Float): Unit = { // builds a C++ driver comparing the kernel with coarsity_gold
+    val main = s"""
+#include "src/main/scala/apps/harrisCornerDetection2/common.cpp"
+
+int main(int argc, char** argv) {
+  Context ctx = createDefaultContext();
+  size_t bytes = $h * $w * sizeof(float);
+  Buffer input_sxx = createBuffer(ctx, bytes, HOST_WRITE | HOST_READ | DEVICE_READ);
+  Buffer input_sxy = createBuffer(ctx, bytes, HOST_WRITE | HOST_READ | DEVICE_READ);
+  Buffer input_syy = createBuffer(ctx, bytes, HOST_WRITE | HOST_READ | DEVICE_READ);
+  Buffer output = createBuffer(ctx, bytes, HOST_READ | HOST_WRITE | DEVICE_WRITE);
+
+  float* out_gold = (float*) malloc(bytes);
+
+  std::random_device rand_d;
+  std::default_random_engine rand_e(rand_d());
+  // bigger range results in higher output differences
+  std::uniform_real_distribution<float> dist(0, 200);
+
+  float* in_sxx = (float*) hostBufferSync(ctx, input_sxx, bytes, HOST_WRITE | HOST_READ);
+  float* in_sxy = (float*) hostBufferSync(ctx, input_sxy, bytes, HOST_WRITE | HOST_READ);
+  float* in_syy = (float*) hostBufferSync(ctx, input_syy, bytes, HOST_WRITE | HOST_READ);
+  for (int y = 0; y < $h; y++) {
+    for (int x = 0; x < $w; x++) {
+      in_sxx[y*$w + x] = dist(rand_e);
+      in_sxy[y*$w + x] = dist(rand_e);
+      in_syy[y*$w + x] = dist(rand_e);
+    }
+  }
+
+  coarsity_gold(out_gold, $h, $w, in_sxx, in_sxy, in_syy, $kappa);
+
+  foo_init_run(ctx, output, $h, $w, input_sxx, input_sxy, input_syy, $kappa);
+
+  ErrorStats errors;
+  init_error_stats(&errors);
+  float* out = (float*) hostBufferSync(ctx, output, bytes, HOST_READ);
+  accumulate_error_stats(&errors, out, out_gold, $h, $w);
+  finish_error_stats(&errors, 0.01, 0.0001);
+
+  free(out_gold);
+  destroyBuffer(ctx, input_sxx);
+  destroyBuffer(ctx, input_sxy);
+  destroyBuffer(ctx, input_syy);
+  destroyBuffer(ctx, output);
+  destroyContext(ctx);
+  return EXIT_SUCCESS;
+}
+"""
+    util.ExecuteOpenCL.using_cpp(main, module, "one_copy") // NOTE(review): presumably compiles and runs the driver with the module — confirm in util.ExecuteOpenCL
+  }
+  // Baseline variant on scalars: rows through mapGlobal, elements through mapSeq. NOTE(review): a def, while the other variants are vals.
+  def base: ToBeTyped[Expr] =
+    depFun((h: Nat, w: Nat) => fun(
+      (h`.`w`.`f32) ->: (h`.`w`.`f32) ->: (h`.`w`.`f32) ->: f32 ->: (h`.`w`.`f32)
+    )((sxx, sxy, syy, kappa) =>
+      oclRun(LocalSize(1), GlobalSize(num_threads))(
+        zip(sxx)(zip(sxy)(syy)) |> mapGlobal(fun(s => // s is one row triple: (sxx row, (sxy row, syy row))
+          zip(s._1)(zip(s._2._1)(s._2._2)) |>
+            mapSeq(fun(s => { // s is one element triple: (sxx, (sxy, syy))
+              val sxx = fst(s)
+              val sxy = fst(snd(s))
+              val syy = snd(snd(s))
+              val det = sxx * syy - sxy * sxy
+              val trace = sxx + syy
+              det - kappa * trace * trace
+            }))
+        ))
+    )))
+  // Same as base, but each row is viewed as vecw-wide vectors (asVectorAligned) and turned back with asScalar.
+  val vec: ToBeTyped[Expr] =
+    depFun((h: Nat, w: Nat) => fun(
+      (h`.`w`.`f32) ->: (h`.`w`.`f32) ->: (h`.`w`.`f32) ->: f32 ->: (h`.`w`.`f32)
+    )((sxx, sxy, syy, kappa) =>
+      oclRun(LocalSize(1), GlobalSize(num_threads))(
+        zip(sxx)(zip(sxy)(syy)) |> mapGlobal(fun(s =>
+          zip(asVectorAligned(vecw)(s._1))(zip(asVectorAligned(vecw)(s._2._1))(asVectorAligned(vecw)(s._2._2))) |>
+          mapSeq(fun(s => {
+            val sxx = fst(s)
+            val sxy = fst(snd(s))
+            val syy = snd(snd(s))
+            val det = sxx * syy - sxy * sxy
+            val trace = sxx + syy
+            det - vectorFromScalar(kappa) * trace * trace // kappa is broadcast to a vector to match the vector operands
+          })) >>
+          asScalar
+        ))
+    )))
+  // Tiled variant. NOTE(review): the body is identical to tileVec (it already uses asVectorAligned(vecw)); only the launch sizes differ — confirm intended.
+  val tile: ToBeTyped[Expr] = {
+    val tile_x_in = tile_x // no extra border: input tiles have the same size as output tiles
+    val tile_y_in = tile_y
+    depFun(hFrom(tile_y), (h: Nat) =>
+    depFun(wFrom(tile_x), (w: Nat) => fun(
+      (h`.`w`.`f32) ->: (h`.`w`.`f32) ->: (h`.`w`.`f32) ->: f32 ->: (h`.`w`.`f32)
+    )((sxx, sxy, syy, kappa) =>
+      oclRun(LocalSize((tile_x, tile_y)), GlobalSize((w, h)))(
+        zip2D(sxx, zip2D(sxy, syy)) |>
+        map(slide(tile_x_in)(tile_x)) |>
+        slide(tile_y_in)(tile_y) |>
+        map(transpose) |>
+        mapWorkGroup(1)(mapWorkGroup(0)(
+          mapLocal(1)(fun(s =>
+            zip(asVectorAligned(vecw)(unzip(s)._1))(
+              zip(asVectorAligned(vecw)(unzip(unzip(s)._2)._1))(
+                asVectorAligned(vecw)(unzip(unzip(s)._2)._2))) |>
+            mapLocal(0)(fun(s => {
+              val sxx = fst(s)
+              val sxy = fst(snd(s))
+              val syy = snd(snd(s))
+              val det = sxx * syy - sxy * sxy
+              val trace = sxx + syy
+              det - vectorFromScalar(kappa) * trace * trace
+            }))
+          ))
+        )) >> map(transpose) >> join >> map(join >> asScalar)
+      ))))
+  }
+  // Tiled and vectorised variant: launch sizes in x are divided by vecw to match the vecw-wide vectors.
+  val tileVec: ToBeTyped[Expr] = {
+    val tile_x_in = tile_x // no extra border: input tiles have the same size as output tiles
+    val tile_y_in = tile_y
+    depFun(hFrom(tile_y), (h: Nat) =>
+    depFun(wFrom(tile_x), (w: Nat) => fun(
+      (h`.`w`.`f32) ->: (h`.`w`.`f32) ->: (h`.`w`.`f32) ->: f32 ->: (h`.`w`.`f32)
+    )((sxx, sxy, syy, kappa) =>
+      oclRun(LocalSize((tile_x / vecw, tile_y)), GlobalSize((w / vecw, h)))(
+        zip2D(sxx, zip2D(sxy, syy)) |>
+        map(slide(tile_x_in)(tile_x)) |>
+        slide(tile_y_in)(tile_y) |>
+        map(transpose) |>
+        mapWorkGroup(1)(mapWorkGroup(0)(
+          mapLocal(1)(fun(s =>
+            zip(asVectorAligned(vecw)(unzip(s)._1))(
+              zip(asVectorAligned(vecw)(unzip(unzip(s)._2)._1))(
+                asVectorAligned(vecw)(unzip(unzip(s)._2)._2))) |>
+            mapLocal(0)(fun(s => {
+              val sxx = fst(s)
+              val sxy = fst(snd(s))
+              val syy = snd(snd(s))
+              val det = sxx * syy - sxy * sxy
+              val trace = sxx + syy
+              det - vectorFromScalar(kappa) * trace * trace
+            }))
+          ))
+        )) >> map(transpose) >> join >> map(join >> asScalar)
+    ))))
+  }
+}
diff --git a/src/main/scala/apps/harrisCornerDetection2/common.cpp b/src/main/scala/apps/harrisCornerDetection2/common.cpp
new file mode 100644
index 000000000..77642f0c0
--- /dev/null
+++ b/src/main/scala/apps/harrisCornerDetection2/common.cpp
@@ -0,0 +1,145 @@
+#include <stdlib.h>
+#include <stdio.h>
+#include <random>
+#include <cmath>
+#include <algorithm>
+#include <vector>
+
+extern "C" { // the OpenCL runtime header below is included with C linkage
+#include "ocl/ocl.h"
+AccessFlags operator|(AccessFlags a, AccessFlags b) { // combines two AccessFlags by OR-ing their integer values
+    return static_cast<AccessFlags>(static_cast<int>(a) | static_cast<int>(b)); }
+} // NOTE(review): the operator is also defined inside the extern "C" block — confirm that is intended
+
+// TODO: pass these in from Scala? (the Scala drivers use their own bd_h / bd_w to size the output)
+const int bd_h = 16; // rows skipped at the top and at the bottom of the input by conv3x3_gold
+const int bd_w = 32; // columns skipped at the left and at the right of the input by conv3x3_gold
+
+struct ErrorStats { // running comparison statistics, filled by accumulate_error_stats
+    float min_val;  // smallest reference value seen (second array passed to accumulate_error_stats)
+    float max_val;  // largest reference value seen
+    double min;     // smallest absolute difference seen
+    double max;     // largest absolute difference seen
+    double max_mse; // largest mean squared error over all accumulate_error_stats calls
+};
+
+void init_error_stats(ErrorStats* es) { // reset *es so that the first accumulated sample replaces every bound
+    es->min_val = INFINITY;  // <cmath> macro instead of the constant division 1.f / 0.f
+    es->max_val = -INFINITY;
+    es->min = INFINITY;      // absolute differences are >= 0, so min starts at +inf ...
+    es->max = 0.f;           // ... and max at 0
+    es->max_mse = 0.f;
+}
+
+void accumulate_error_stats(ErrorStats* es, float* a, float* b, int h, int w) { // a: values under test, b: reference, both h x w row-major
+    double square_sum = 0.f;
+    for (int y = 0; y < h; y++) {
+        for (int x = 0; x < w; x++) {
+            es->min_val = std::min(es->min_val, b[y*w + x]); // the value range is tracked on the reference only
+            es->max_val = std::max(es->max_val, b[y*w + x]);
+            double delta = a[y*w + x] - b[y*w + x];
+            double d_abs = std::abs(delta); // std:: overload for double; unqualified abs may pick int abs(int) and truncate
+            es->min = std::min(es->min, d_abs);
+            es->max = std::max(es->max, d_abs);
+            square_sum += d_abs * d_abs;
+        }
+    }
+    es->max_mse = std::max(es->max_mse, square_sum / (h * w)); // keep the worst mean squared error across calls
+}
+
+void finish_error_stats(ErrorStats* es, float tolerated_per_pixel, float tolerated_mse) {
+    // Always report the accumulated error range before deciding the verdict.
+    fprintf(stderr, "errors: [%.4lf - %.4lf] with %.4lf MSE\n", es->min, es->max, es->max_mse);
+    const bool pixel_exceeded = es->max > tolerated_per_pixel;
+    const bool mse_exceeded = es->max_mse > tolerated_mse;
+    if (!pixel_exceeded && !mse_exceeded) { return; }
+    fprintf(stderr, "maximum tolerated error: %.4f per pixel and %.4f MSE\n", tolerated_per_pixel, tolerated_mse);
+    fprintf(stderr, "value range: [%.4f - %.4f]\n", es->min_val, es->max_val);
+    exit(EXIT_FAILURE); // out of tolerance: terminate the whole driver with a failure status
+}
+
+void conv3x3_gold(float* out,                 // (h - 2*bd_h) x (w - 2*bd_w) results, row-major
+                  int h, int w,               // size of the input image
+                  const float* in,            // h x w input, row-major
+                  const float* weights)       // 9 weights, row-major 3x3
+{
+    for (int y = 0; y < (h - 2*bd_h); y++) {
+        int r0 = (y + bd_h - 1) * w; // offsets of the input rows above, at and below the output pixel
+        int r1 = (y + bd_h) * w;
+        int r2 = (y + bd_h + 1) * w;
+        for (int x = 0; x < (w - 2*bd_w); x++) {
+            int c0 = x + (bd_w - 1); // input columns left of, at and right of the output pixel
+            int c1 = x + bd_w;
+            int c2 = x + (bd_w + 1);
+            out[y*(w - 2*bd_w)+x] = (
+                weights[0]*in[r0+c0] + weights[1]*in[r0+c1] + weights[2]*in[r0+c2] +
+                weights[3]*in[r1+c0] + weights[4]*in[r1+c1] + weights[5]*in[r1+c2] +
+                weights[6]*in[r2+c0] + weights[7]*in[r2+c1] + weights[8]*in[r2+c2]
+            );
+        }
+    }
+}
+
+void sobelX_gold(float* out, int h, int w, const float* in)
+{
+    // Sobel X weights, row-major and already divided by 8;
+    // conv3x3_gold applies them to the interior of `in`.
+    const float kernel[9] = {
+        -0.125f, 0.0f, 0.125f,
+        -0.250f, 0.0f, 0.250f,
+        -0.125f, 0.0f, 0.125f
+    };
+    conv3x3_gold(out, h, w, in, kernel);
+}
+
+void sobelY_gold(float* out, int h, int w, const float* in)
+{
+    // Sobel Y weights, row-major and already divided by 8;
+    // conv3x3_gold applies them to the interior of `in`.
+    const float kernel[9] = {
+        -0.125f, -0.250f, -0.125f,
+         0.000f,  0.000f,  0.000f,
+         0.125f,  0.250f,  0.125f
+    };
+    conv3x3_gold(out, h, w, in, kernel);
+}
+
+void binomial_gold(float* out, int h, int w, const float* in)
+{
+    // 3x3 binomial weights, row-major and already divided by 16;
+    // conv3x3_gold applies them to the interior of `in`.
+    const float kernel[9] = {
+        0.0625f, 0.125f, 0.0625f,
+        0.1250f, 0.250f, 0.1250f,
+        0.0625f, 0.125f, 0.0625f
+    };
+    conv3x3_gold(out, h, w, in, kernel);
+}
+
+void mul_gold(float* out, int h, int w, const float* a, const float* b)
+{
+    // Element-wise product of two h x w row-major images.
+    for (int row = 0; row < h; ++row) {
+        const int base = row * w; // offset of the first element of this row
+        for (int col = 0; col < w; ++col) {
+            const int i = base + col;
+            out[i] = a[i] * b[i];
+        }
+    }
+}
+
+void coarsity_gold(float* out, int h, int w,
+                   const float* sxx, const float* sxy, const float* syy,
+                   float kappa)
+{
+    // Per pixel: det(S) - kappa * trace(S)^2 with S = [[sxx, sxy], [sxy, syy]].
+    for (int row = 0; row < h; ++row) {
+        for (int col = 0; col < w; ++col) {
+            const int i = row * w + col;
+            const float xx = sxx[i], xy = sxy[i], yy = syy[i];
+            const float det = xx * yy - xy * xy;
+            const float trace = xx + yy;
+            out[i] = det - kappa * trace * trace;
+        }
+    }
+}
diff --git a/src/main/scala/apps/harrisCornerDetection2/convolutions.scala b/src/main/scala/apps/harrisCornerDetection2/convolutions.scala
new file mode 100644
index 000000000..40b4ba415
--- /dev/null
+++ b/src/main/scala/apps/harrisCornerDetection2/convolutions.scala
@@ -0,0 +1,160 @@
+package apps.harrisCornerDetection2
+
+import rise.core._
+import rise.core.DSL._
+import rise.core.primitives.{id => _, _}
+import Type._
+import rise.core.types._
+import rise.core.types.DataType._
+import rise.core.DSL.Type._
+import rise.openCL.DSL._
+import rise.openCL.primitives.oclRotateValues
+import shine.OpenCL.{GlobalSize, LocalSize}
+
+object convolutions { // 3x3 convolution kernels parameterised by their weights, plus a shared checking driver
+  def check(prelude: String, module: shine.OpenCL.Module, h: Int, w: Int): Unit = { // prelude must define the gold(o, i) and generated(ctx, o, i) macros used below
+    val main = s"""
+${prelude}
+
+int main(int argc, char** argv) {
+  Context ctx = createDefaultContext();
+  size_t in_bytes = $h * $w * sizeof(float);
+  size_t out_bytes = ${h - 2*bd_h} * ${w - 2*bd_w} * sizeof(float);
+  Buffer input = createBuffer(ctx, in_bytes, HOST_WRITE | HOST_READ | DEVICE_READ);
+  Buffer output = createBuffer(ctx, out_bytes, HOST_READ | HOST_WRITE | DEVICE_WRITE);
+
+  float* out_gold = (float*) malloc(out_bytes);
+
+  std::random_device rand_d;
+  std::default_random_engine rand_e(rand_d());
+  // bigger range results in higher output differences
+  std::uniform_real_distribution<float> dist(0, 200);
+
+  float* in = (float*) hostBufferSync(ctx, input, in_bytes, HOST_WRITE | HOST_READ);
+  for (int y = 0; y < $h; y++) {
+    for (int x = 0; x < $w; x++) {
+      in[y*$w + x] = dist(rand_e);
+    }
+  }
+
+  gold(out_gold, in);
+
+  generated(ctx, output, input);
+
+  ErrorStats errors;
+  init_error_stats(&errors);
+  float* out = (float*) hostBufferSync(ctx, output, out_bytes, HOST_READ);
+  accumulate_error_stats(&errors, out, out_gold, ${h - 2*bd_h}, ${w - 2*bd_w});
+  finish_error_stats(&errors, 0.01, 0.0001);
+
+  free(out_gold);
+  destroyBuffer(ctx, input);
+  destroyBuffer(ctx, output);
+  destroyContext(ctx);
+  return EXIT_SUCCESS;
+}
+"""
+    util.ExecuteOpenCL.using_cpp(main, module, "one_copy") // NOTE(review): presumably compiles and runs the driver with the module — confirm in util.ExecuteOpenCL
+  }
+  // Baseline variant: outer dimension through mapGlobal, inner through mapSeq, 2D 3x3 dot product with weights2d.
+  def base(weights2d: ToBeTyped[Expr]): ToBeTyped[Expr] =
+    depFun(hFrom(3), (h: Nat) =>
+    depFun(wFrom(12), (w: Nat) => fun(
+      (h`.`w`.`f32) ->: ((h - 2*bd_h)`.`(w-2*bd_w)`.`f32)
+    )(input =>
+      oclRun(LocalSize(1), GlobalSize(num_threads))(
+        input |>
+        map(drop(bd_w-1) >> take(w - 2*(bd_w-1)) >> slide(3)(1)) >> // keep one extra column per side, then 3-wide windows
+        drop(bd_h-1) >> take(h - 2*(bd_h-1)) >> slide(3)(1) >> // keep one extra row per side, then 3-high windows
+        map(transpose) >>
+        mapGlobal(mapSeq(fun(nbh =>
+          dotSeqU(join(weights2d))(join(nbh))
+        )))
+    ))))
+  // Line-vectorised variant: separable filter (weightsV then weightsH) on vecw-wide vectors, vertical pass written with toGlobal.
+  def lineVec(weightsV: ToBeTyped[Expr], weightsH: ToBeTyped[Expr]): ToBeTyped[Expr] =
+    depFun(hFrom(3), (h: Nat) =>
+    depFun(wFrom(12), (w: Nat) => fun(
+      (h`.`w`.`f32) ->: ((h - 2*bd_h)`.`(w - 2*bd_w)`.`f32)
+    )(input =>
+      oclRun(LocalSize(1), GlobalSize(num_threads))(
+        input |>
+        map(drop(bd_w-vecw) >> take(w - 2*(bd_w-vecw))) >> // keep a vecw-wide margin of columns on each side
+        drop(bd_h-1) >> take(h - 2*(bd_h-1)) >>
+        map(asVectorAligned(vecw)) >> slide(3)(1) >> mapGlobal(
+          transpose >>
+          mapSeq(dotSeqUWV(weightsV)) >>
+          toGlobal >>
+          // toLocal >>
+          slide(3)(1) >>
+          mapSeq(shuffle >> dotSeqUWV(weightsH)) >>
+          asScalar
+        )
+    ))))
+  // Same pipeline as lineVec, except the vertical results go through oclRotateValues(AddressSpace.Private) + iterateStream instead of toGlobal.
+  def rotvVec(weightsV: ToBeTyped[Expr], weightsH: ToBeTyped[Expr]): ToBeTyped[Expr] =
+    depFun(hFrom(3), (h: Nat) =>
+    depFun(wFrom(12), (w: Nat) => fun(
+      (h`.`w`.`f32) ->: ((h - 2*bd_h)`.`(w - 2*bd_w)`.`f32)
+    )(input =>
+      oclRun(LocalSize(1), GlobalSize(num_threads))(
+        input |>
+        map(drop(bd_w-vecw) >> take(w - 2*(bd_w-vecw))) >>
+        drop(bd_h-1) >> take(h - 2*(bd_h-1)) >>
+        map(asVectorAligned(vecw)) >> slide(3)(1) >> mapGlobal(
+          transpose >>
+          map(dotSeqUWV(weightsV)) >>
+          oclRotateValues(AddressSpace.Private)(3)(id) >> iterateStream(
+            shuffle >> dotSeqUWV(weightsH)
+          ) >>
+          asScalar
+        )
+    ))))
+  // Tiled variant: tile_x by tile_y output tiles mapped with mapWorkGroup / mapLocal, 2D 3x3 dot product per output.
+  def tile(weights2d: ToBeTyped[Expr]): ToBeTyped[Expr] = {
+    val tile_x_in = tile_x + 2 // input tile width: tile_x outputs plus one extra column per side for the 3-wide window
+    val tile_y_in = tile_y + 2 // input tile height: tile_y outputs plus one extra row per side
+    depFun(hFrom(tile_y), (h: Nat) =>
+    depFun(wFrom(tile_x), (w: Nat) => fun(
+      (h`.`w`.`f32) ->: ((h - 2*bd_h)`.`(w - 2*bd_w)`.`f32)
+    )(input =>
+      oclRun(LocalSize((tile_x, tile_y)), GlobalSize(((w - 2*bd_w), h - 2*bd_h)))(
+        input |>
+        map(drop(bd_w-1) >> take(w - (2*bd_w-2)) >> slide(tile_x_in)(tile_x)) >> // same extent as w - 2*(bd_w-1) in the other variants
+          drop(bd_h-1) >> take(h - (2*bd_h-2)) >> slide(tile_y_in)(tile_y) >>
+          map(transpose) >>
+          map(map(
+            map(slide(3)(1)) >> slide(3)(1) >> map(transpose)
+          )) >>
+          mapWorkGroup(1)(mapWorkGroup(0)(
+            mapLocal(1)(mapLocal(0)(fun(nbh =>
+              dotSeqU(join(weights2d))(join(nbh))
+            )))
+          )) >> map(transpose) >> join >> map(join) // flatten the grid of tiles back into a 2D image
+    ))))
+  }
+  // Tiled and vectorised variant: as tile, but on vecw-wide vectors, with launch sizes in x divided by vecw.
+  def tileVec(weights2d: ToBeTyped[Expr]): ToBeTyped[Expr] = {
+    val tile_x_in = tile_x + 2*vecw // input tile width: tile_x outputs plus vecw extra columns per side
+    val tile_y_in = tile_y + 2
+    depFun(hFrom(tile_y), (h: Nat) =>
+    depFun(wFrom(tile_x), (w: Nat) => fun(
+      (h`.`w`.`f32) ->: ((h - 2*bd_h)`.`(w - 2*bd_w)`.`f32)
+    )(input =>
+      oclRun(LocalSize((tile_x / vecw, tile_y)), GlobalSize(((w - 2*bd_w) / vecw, h - 2*bd_h)))(
+        input |>
+        map(drop(bd_w-vecw) >> take(w - 2*(bd_w-vecw)) >> slide(tile_x_in)(tile_x)) >>
+        drop(bd_h-1) >> take(h - 2*(bd_h-1)) >> slide(tile_y_in)(tile_y) >>
+        map(transpose) >>
+        map(map(
+          map(asVectorAligned(vecw) >> slide(3)(1)) >> slide(3)(1) >> map(transpose)
+        )) >>
+        mapWorkGroup(1)(mapWorkGroup(0)(
+          mapLocal(1)(mapLocal(0)(fun(nbh =>
+            dotSeqUWV(join(weights2d))(join(map(shuffle)(nbh)))
+          )))
+        )) >> map(transpose) >> join >> map(join >> asScalar)
+      ))
+    ))
+  }
+}
diff --git a/src/main/scala/apps/harrisCornerDetection2/generateCode.scala b/src/main/scala/apps/harrisCornerDetection2/generateCode.scala
new file mode 100644
index 000000000..3f5b0baa8
--- /dev/null
+++ b/src/main/scala/apps/harrisCornerDetection2/generateCode.scala
@@ -0,0 +1,110 @@
+package apps.harrisCornerDetection2
+
+/** Generates the hosted OpenCL code of every kernel variant and runs its checking driver. */
+object generateCode {
+  // Image size and kappa shared by all kernels and checking drivers.
+  val H = 1024
+  val W = 2048
+  val kappa = 0.04f
+  // Directory receiving one generated `<name>.c` file per kernel.
+  private val outputDir = "/tmp/harris"
+
+  /** Checks a single-input convolution module against the C++ reference function `goldFn`
+    * from common.cpp, by defining the `gold` and `generated` macros used by convolutions.check.
+    */
+  private def checkConvolution(goldFn: String, m: shine.OpenCL.Module): Unit = {
+    val prelude = s"""
+#include "src/main/scala/apps/harrisCornerDetection2/common.cpp"
+#define gold(o, i) ${goldFn}(o, $H, $W, i)
+#define generated(ctx, o, i) foo_init_run(ctx, o, $H, $W, i)
+"""
+    convolutions.check(prelude, m, H, W)
+  }
+
+  /** Checks a binomial convolution module against binomial_gold. */
+  def checkBinomial(m: shine.OpenCL.Module): Unit = checkConvolution("binomial_gold", m)
+
+  /** Checks a Sobel X convolution module against sobelX_gold. */
+  def checkSobelX(m: shine.OpenCL.Module): Unit = checkConvolution("sobelX_gold", m)
+
+  /** Checks a Sobel Y convolution module against sobelY_gold. */
+  def checkSobelY(m: shine.OpenCL.Module): Unit = checkConvolution("sobelY_gold", m)
+
+  /** For every kernel below: generate the hosted OpenCL module, write its source
+    * to `outputDir` and run the associated check. */
+  def main(args: Array[String]): Unit = {
+    val kernels = Seq[(String, rise.core.DSL.ToBeTyped[rise.core.Expr], shine.OpenCL.Module => Unit)](
+      ("binomial-base", convolutions.base(binomialWeights2d), checkBinomial),
+      ("binomial-line-vec", convolutions.lineVec(binomialWeightsV, binomialWeightsH), checkBinomial),
+      ("binomial-rotv-vec", convolutions.rotvVec(binomialWeightsV, binomialWeightsH), checkBinomial),
+      ("binomial-tile", convolutions.tile(binomialWeights2d), checkBinomial),
+      ("binomial-tile-vec", convolutions.tileVec(binomialWeights2d), checkBinomial),
+
+      ("sobelX-base", convolutions.base(sobelXWeights2d), checkSobelX),
+      ("sobelX-line-vec", convolutions.lineVec(sobelXWeightsV, sobelXWeightsH), checkSobelX),
+      ("sobelX-rotv-vec", convolutions.rotvVec(sobelXWeightsV, sobelXWeightsH), checkSobelX),
+      ("sobelX-tile", convolutions.tile(sobelXWeights2d), checkSobelX),
+      ("sobelX-tile-vec", convolutions.tileVec(sobelXWeights2d), checkSobelX),
+
+      ("sobelY-base", convolutions.base(sobelYWeights2d), checkSobelY),
+      ("sobelY-line-vec", convolutions.lineVec(sobelYWeightsV, sobelYWeightsH), checkSobelY),
+      ("sobelY-rotv-vec", convolutions.rotvVec(sobelYWeightsV, sobelYWeightsH), checkSobelY),
+      ("sobelY-tile", convolutions.tile(sobelYWeights2d), checkSobelY),
+      ("sobelY-tile-vec", convolutions.tileVec(sobelYWeights2d), checkSobelY),
+
+      ("mul-base", mul.base, mul.check(_, H, W)),
+      ("mul-vec", mul.vec, mul.check(_, H, W)),
+      ("mul-tile", mul.tile, mul.check(_, H, W)),
+      ("mul-tileVec", mul.tileVec, mul.check(_, H, W)),
+
+      ("coarsity-base", coarsity.base, coarsity.check(_, H, W, kappa)),
+      ("coarsity-vec", coarsity.vec, coarsity.check(_, H, W, kappa)),
+      ("coarsity-tile", coarsity.tile, coarsity.check(_, H, W, kappa)),
+      ("coarsity-tileVec", coarsity.tileVec, coarsity.check(_, H, W, kappa)),
+
+      ////
+
+      ("sobelXYMul-base", sobelXYMul.base, sobelXYMul.check(_, H, W)),
+      ("sobelXYMul-line-vec", sobelXYMul.lineVec, sobelXYMul.check(_, H, W)),
+      ("sobelXYMul-rotv-vec", sobelXYMul.rotvVec, sobelXYMul.check(_, H, W)),
+      ("sobelXYMul-tile", sobelXYMul.tile, sobelXYMul.check(_, H, W)),
+      ("sobelXYMul-tile-vec", sobelXYMul.tileVec, sobelXYMul.check(_, H, W)),
+
+      ("binomialCoarsity-base", binomialCoarsity.base, binomialCoarsity.check(_, H, W, kappa)),
+      ("binomialCoarsity-line-vec", binomialCoarsity.lineVec, binomialCoarsity.check(_, H, W, kappa)),
+      ("binomialCoarsity-rotv-vec", binomialCoarsity.rotvVec, binomialCoarsity.check(_, H, W, kappa)),
+      ("binomialCoarsity-tile", binomialCoarsity.tile, binomialCoarsity.check(_, H, W, kappa)),
+      ("binomialCoarsity-tile-vec", binomialCoarsity.tileVec, binomialCoarsity.check(_, H, W, kappa)),
+
+      ////
+
+      ("sobelXY-base", sobelXY.base, sobelXY.check(_, H, W)),
+      ("sobelXY-line-vec", sobelXY.lineVec, sobelXY.check(_, H, W)),
+      ("sobelXY-rotv-vec", sobelXY.rotvVec, sobelXY.check(_, H, W)),
+      ("sobelXY-tile", sobelXY.tile, sobelXY.check(_, H, W)),
+      ("sobelXY-tile-vec", sobelXY.tileVec, sobelXY.check(_, H, W)),
+
+      ("mulBinomialCoarsity-base", mulBinomialCoarsity.base, mulBinomialCoarsity.check(_, H, W, kappa)),
+      ("mulBinomialCoarsity-line-vec", mulBinomialCoarsity.lineVec, mulBinomialCoarsity.check(_, H, W, kappa)),
+      ("mulBinomialCoarsity-rotv-vec", mulBinomialCoarsity.rotvVec, mulBinomialCoarsity.check(_, H, W, kappa)),
+      ("mulBinomialCoarsity-tile", mulBinomialCoarsity.tile, mulBinomialCoarsity.check(_, H, W, kappa)),
+      ("mulBinomialCoarsity-tile-vec", mulBinomialCoarsity.tileVec, mulBinomialCoarsity.check(_, H, W, kappa)),
+
+      ////
+
+      // TODO
+    )
+
+    java.nio.file.Files.createDirectories(java.nio.file.Paths.get(outputDir))
+
+    for ((name, prog, check) <- kernels) {
+      println(name)
+      // val p: rise.core.Expr = rise.core.DSL.toBeTyped(
+      //   rise.eqsat.Expr.toNamedUnique(rise.eqsat.Expr.fromNamed(prog.toUntypedExpr)))
+      val m = util.gen.opencl.hosted.fromExpr(prog)
+      val c = util.gen.opencl.hosted.asString(m)
+      util.writeToPath(s"$outputDir/${name}.c", c)
+      check(m)
+    }
+  }
+}
diff --git a/src/main/scala/apps/harrisCornerDetection2/harris.scala b/src/main/scala/apps/harrisCornerDetection2/harris.scala
new file mode 100644
index 000000000..db90f1045
--- /dev/null
+++ b/src/main/scala/apps/harrisCornerDetection2/harris.scala
@@ -0,0 +1,17 @@
+package apps.harrisCornerDetection2
+
+import rise.core.DSL.Type._
+import rise.core.DSL._
+import rise.core._
+import rise.core.primitives.{id => _, _}
+import rise.core.types.DataType._
+import rise.core.types._
+import rise.openCL.DSL._
+import rise.openCL.primitives.oclRotateValues
+import shine.OpenCL.{GlobalSize, LocalSize}
+
+object harris { // placeholder: no kernels are defined here yet, only the TODOs below
+  // TODO: tile / tileVec
+  // TODO: h1ModRotvVec
+  // TODO: h2ModRotvVec
+}
diff --git a/src/main/scala/apps/harrisCornerDetection2/mul.scala b/src/main/scala/apps/harrisCornerDetection2/mul.scala
new file mode 100644
index 000000000..ffb799089
--- /dev/null
+++ b/src/main/scala/apps/harrisCornerDetection2/mul.scala
@@ -0,0 +1,125 @@
+package apps.harrisCornerDetection2
+
+import rise.core.DSL.Type._
+import rise.core.DSL._
+import rise.core._
+import rise.core.primitives.{id => _, _}
+import rise.core.types.DataType._
+import rise.core.types._
+import rise.openCL.DSL._
+import shine.OpenCL.{GlobalSize, LocalSize}
+
+object mul {
+  def check(module: shine.OpenCL.Module, h: Int, w: Int): Unit = { // builds a C++ driver: random h*w inputs in [0, 200), mul_gold reference, foo_init_run, error stats
+    val main = s"""
+#include "src/main/scala/apps/harrisCornerDetection2/common.cpp"
+
+int main(int argc, char** argv) {
+  Context ctx = createDefaultContext();
+  size_t bytes = $h * $w * sizeof(float);
+  Buffer input_a = createBuffer(ctx, bytes, HOST_WRITE | HOST_READ | DEVICE_READ);
+  Buffer input_b = createBuffer(ctx, bytes, HOST_WRITE | HOST_READ | DEVICE_READ);
+  Buffer output = createBuffer(ctx, bytes, HOST_READ | HOST_WRITE | DEVICE_WRITE);
+
+  float* out_gold = (float*) malloc(bytes);
+
+  std::random_device rand_d;
+  std::default_random_engine rand_e(rand_d());
+  // bigger range results in higher output differences
+  std::uniform_real_distribution<float> dist(0, 200);
+
+  float* in_a = (float*) hostBufferSync(ctx, input_a, bytes, HOST_WRITE | HOST_READ);
+  float* in_b = (float*) hostBufferSync(ctx, input_b, bytes, HOST_WRITE | HOST_READ);
+  for (int y = 0; y < $h; y++) {
+    for (int x = 0; x < $w; x++) {
+      in_a[y*$w + x] = dist(rand_e);
+      in_b[y*$w + x] = dist(rand_e);
+    }
+  }
+
+  mul_gold(out_gold, $h, $w, in_a, in_b);
+
+  foo_init_run(ctx, output, $h, $w, input_a, input_b);
+
+  ErrorStats errors;
+  init_error_stats(&errors);
+  float* out = (float*) hostBufferSync(ctx, output, bytes, HOST_READ);
+  accumulate_error_stats(&errors, out, out_gold, $h, $w);
+  finish_error_stats(&errors, 0.01, 0.0001);
+
+  free(out_gold);
+  destroyBuffer(ctx, input_a);
+  destroyBuffer(ctx, input_b);
+  destroyBuffer(ctx, output);
+  destroyContext(ctx);
+  return EXIT_SUCCESS;
+}
+"""
+    util.ExecuteOpenCL.using_cpp(main, module, "one_copy") // hands the driver source and the module to ExecuteOpenCL (presumably compiles and runs them; see util.ExecuteOpenCL)
+  }
+
+  def base: ToBeTyped[Expr] = // element-wise product of two h x w f32 images, rows distributed with mapGlobal
+    depFun((h: Nat, w: Nat) => fun(
+      (h`.`w`.`f32) ->: (h`.`w`.`f32) ->: (h`.`w`.`f32)
+    )((a, b) =>
+      oclRun(LocalSize(1), GlobalSize(num_threads))( // launched with local size 1 and global size num_threads
+        zip(a)(b) |> mapGlobal(fun(ab => // ab: one row of a paired with the matching row of b
+          zip(ab._1)(ab._2) |>
+          mapSeq(mulT) // mulT multiplies the two components of each pair
+        ))
+    )))
+
+  val vec: ToBeTyped[Expr] = // same computation as `base`, but each row is processed as vectors of width vecw
+    depFun((h: Nat, w: Nat) => fun(
+      (h`.`w`.`f32) ->: (h`.`w`.`f32) ->: (h`.`w`.`f32)
+    )((a, b) =>
+      oclRun(LocalSize(1), GlobalSize(num_threads))( // launched with local size 1 and global size num_threads
+        zip(a)(b) |> mapGlobal(fun(ab => // ab: one row of a paired with the matching row of b
+          zip(asVectorAligned(vecw)(ab._1))(asVectorAligned(vecw)(ab._2)) |> // both rows viewed as vecw-wide vectors
+          mapSeq(mulT) >> // vector-wise product
+          asScalar // back to a row of scalars
+        ))
+    )))
+
+  val tile: ToBeTyped[Expr] = { // element-wise product computed per tile_y x tile_x tile (work-group per tile, work-item per element)
+    val tile_x_in = tile_x // window size equals the step below, so tiles do not overlap
+    val tile_y_in = tile_y
+    depFun(hFrom(tile_y), (h: Nat) =>
+    depFun(wFrom(tile_x), (w: Nat) => fun(
+      (h`.`w`.`f32) ->: (h`.`w`.`f32) ->: (h`.`w`.`f32)
+    )((a, b) =>
+      oclRun(LocalSize((tile_x, tile_y)), GlobalSize((w, h)))( // one work-item per pixel
+        zip2D(a, b) |>
+        map(slide(tile_x_in)(tile_x)) |> // split every row into tiles along x
+        slide(tile_y_in)(tile_y) |> // group rows into tiles along y
+        map(transpose) |> // bring the two tile indices to the front
+        mapWorkGroup(1)(mapWorkGroup(0)(
+          mapLocal(1)(
+            mapLocal(0)(mulT)
+          )
+        )) >> map(transpose) >> join >> map(join) // undo the tiling to get back an h x w image
+    ))))
+  }
+
+  val tileVec: ToBeTyped[Expr] = { // tiled like `tile`, with each tile row processed as vecw-wide vectors
+    val tile_x_in = tile_x // window size equals the step below, so tiles do not overlap
+    val tile_y_in = tile_y
+    depFun(hFrom(tile_y), (h: Nat) =>
+    depFun(wFrom(tile_x), (w: Nat) => fun(
+      (h`.`w`.`f32) ->: (h`.`w`.`f32) ->: (h`.`w`.`f32)
+    )((a, b) =>
+      oclRun(LocalSize((tile_x / vecw, tile_y)), GlobalSize((w / vecw, h)))( // x sizes are divided by vecw: one work-item per vector
+        zip2D(a, b) |>
+        map(slide(tile_x_in)(tile_x)) |> // split every row into tiles along x
+        slide(tile_y_in)(tile_y) |> // group rows into tiles along y
+        map(transpose) |>
+        mapWorkGroup(1)(mapWorkGroup(0)(
+          mapLocal(1)(fun(ab => // ab: one tile row of (a, b) pairs
+            zip(asVectorAligned(vecw)(unzip(ab)._1))( // unzip the pairs, vectorize each side, zip again
+              asVectorAligned(vecw)(unzip(ab)._2)) |>
+              mapLocal(0)(mulT)
+          ))
+        )) >> map(transpose) >> join >> map(join >> asScalar) // undo the tiling and return to scalars
+    ))))
+  }
+}
diff --git a/src/main/scala/apps/harrisCornerDetection2/mulBinomialCoarsity.scala b/src/main/scala/apps/harrisCornerDetection2/mulBinomialCoarsity.scala
new file mode 100644
index 000000000..9cb21844c
--- /dev/null
+++ b/src/main/scala/apps/harrisCornerDetection2/mulBinomialCoarsity.scala
@@ -0,0 +1,273 @@
+package apps.harrisCornerDetection2
+
+import rise.core.DSL.Type._
+import rise.core.DSL._
+import rise.core._
+import rise.core.primitives.{id => _, _}
+import rise.core.types.DataType._
+import rise.core.types._
+import rise.openCL.DSL._
+import rise.openCL.primitives.oclRotateValues
+import shine.OpenCL.{GlobalSize, LocalSize}
+
+object mulBinomialCoarsity {
+  def check(module: shine.OpenCL.Module, h: Int, w: Int, kappa: Float): Unit = { // builds a C++ driver: random ix/iy in [0, 50), gold = mul -> binomial -> coarsity, compared with foo_init_run output
+    val main = s"""
+#include "src/main/scala/apps/harrisCornerDetection2/common.cpp"
+
+int main(int argc, char** argv) {
+  Context ctx = createDefaultContext();
+  size_t in_bytes = $h * $w * sizeof(float);
+  size_t out_bytes = ${h - 2*bd_h} * ${w - 2*bd_w} * sizeof(float);
+  Buffer input_ix = createBuffer(ctx, in_bytes, HOST_WRITE | HOST_READ | DEVICE_READ);
+  Buffer input_iy = createBuffer(ctx, in_bytes, HOST_WRITE | HOST_READ | DEVICE_READ);
+  Buffer output = createBuffer(ctx, out_bytes, HOST_READ | HOST_WRITE | DEVICE_WRITE);
+
+  float* ixx_gold = (float*) malloc(in_bytes);
+  float* ixy_gold = (float*) malloc(in_bytes);
+  float* iyy_gold = (float*) malloc(in_bytes);
+  float* sxx_gold = (float*) malloc(out_bytes);
+  float* sxy_gold = (float*) malloc(out_bytes);
+  float* syy_gold = (float*) malloc(out_bytes);
+  float* out_gold = (float*) malloc(out_bytes);
+
+  std::random_device rand_d;
+  std::default_random_engine rand_e(rand_d());
+  // bigger range results in higher output differences
+  std::uniform_real_distribution<float> dist(0, 50);
+
+  float* in_ix = (float*) hostBufferSync(ctx, input_ix, in_bytes, HOST_WRITE | HOST_READ);
+  float* in_iy = (float*) hostBufferSync(ctx, input_iy, in_bytes, HOST_WRITE | HOST_READ);
+  for (int y = 0; y < $h; y++) {
+    for (int x = 0; x < $w; x++) {
+      in_ix[y*$w + x] = dist(rand_e);
+      in_iy[y*$w + x] = dist(rand_e);
+    }
+  }
+
+  mul_gold(ixx_gold, $h, $w, in_ix, in_ix);
+  mul_gold(ixy_gold, $h, $w, in_ix, in_iy);
+  mul_gold(iyy_gold, $h, $w, in_iy, in_iy);
+  binomial_gold(sxx_gold, $h, $w, ixx_gold);
+  binomial_gold(sxy_gold, $h, $w, ixy_gold);
+  binomial_gold(syy_gold, $h, $w, iyy_gold);
+  coarsity_gold(out_gold, ${h - 2*bd_h}, ${w - 2*bd_w}, sxx_gold, sxy_gold, syy_gold, $kappa);
+
+  foo_init_run(ctx, output, $h, $w, input_ix, input_iy, $kappa);
+
+  ErrorStats errors;
+  init_error_stats(&errors);
+  float* out = (float*) hostBufferSync(ctx, output, out_bytes, HOST_READ);
+  accumulate_error_stats(&errors, out, out_gold, ${h - 2*bd_h}, ${w - 2*bd_w});
+  finish_error_stats(&errors, 5.0, 0.01);
+
+  free(ixx_gold); free(sxx_gold);
+  free(ixy_gold); free(sxy_gold);
+  free(iyy_gold); free(syy_gold);
+  free(out_gold);
+  destroyBuffer(ctx, input_ix);
+  destroyBuffer(ctx, input_iy);
+  destroyBuffer(ctx, output);
+  destroyContext(ctx);
+  return EXIT_SUCCESS;
+}
+"""
+    util.ExecuteOpenCL.using_cpp(main, module, "one_copy") // the driver now frees all seven malloc'd gold buffers (ixx/ixy/iyy_gold were leaked before)
+  }
+
+  val base: ToBeTyped[Expr] = // fused kernel: per-pixel products -> 3x3 binomial weighted sums -> det - kappa * trace^2
+    depFun(hFrom(3), (h: Nat) =>
+    depFun(wFrom(12), (w: Nat) => fun(
+      (h`.`w`.`f32) ->: (h`.`w`.`f32) ->: f32
+        ->: ((h - 2*bd_h)`.`(w - 2*bd_w)`.`f32)
+    )((ix, iy, kappa) =>
+      oclRun(LocalSize(1), GlobalSize(num_threads))(
+        makeArray(2)(ix)(iy) |>
+        transpose >> map(transpose) >> // 2.h.w -> h.w.2: every pixel now holds [ix, iy]
+        map(map(fun(ixiy => {
+          val ix = ixiy `@` lidx(0, 2)
+          val iy = ixiy `@` lidx(1, 2)
+          makeArray(3)(ix * ix)(ix * iy)(iy * iy) // the three products per pixel
+        }))) >>
+        map(drop(bd_w-1) >> take(w - 2*(bd_w-1)) >> slide(3)(1)) >> // crop columns to the output width + 2, then windows of 3
+        drop(bd_h-1) >> take(h - 2*(bd_h-1)) >> slide(3)(1) >> // same along rows
+        map(transpose) >>
+        mapGlobal(mapSeq(
+          map(transpose) >> transpose >> // 3.3.3 neighbourhood reordered so the product index comes first
+          toPrivateFun(mapSeqUnroll(fun(nbh =>
+            dotSeqU(join(binomialWeights2d))(join(nbh)) // weighted sum of the flattened 3x3 neighbourhood
+          ))) >>
+          letf(fun(s => {
+            val sxx = s `@` lidx(0, 3)
+            val sxy = s `@` lidx(1, 3)
+            val syy = s `@` lidx(2, 3)
+            val det = sxx * syy - sxy * sxy
+            val trace = sxx + syy
+            det - kappa * trace * trace
+          }))
+        ))
+    ))))
+
+  val lineVec: ToBeTyped[Expr] = // vectorized variant: weights V applied over 3 rows into a toGlobal line, then weights H over shuffled vectors
+    depFun(hFrom(3), (h: Nat) =>
+    depFun(wFrom(12), (w: Nat) => fun(
+      (h`.`w`.`f32) ->: (h`.`w`.`f32) ->: f32
+        ->: ((h - 2*bd_h)`.`(w - 2*bd_w)`.`f32)
+    )((ix, iy, kappa) =>
+      oclRun(LocalSize(1), GlobalSize(num_threads))(
+        makeArray(2)(ix)(iy) |>
+        map(
+          map(drop(bd_w-vecw) >> take(w - 2*(bd_w-vecw))) >> // keep one extra vector (vecw columns) on each side of the output
+          drop(bd_h-1) >> take(h - 2*(bd_h-1)) >> // keep one extra row above and below the output
+          map(asVectorAligned(vecw))
+        ) >>
+        transpose >> map(transpose) >> // every vector position now holds [ix, iy]
+        map(map(fun(ixiy => {
+          val ix = ixiy `@` lidx(0, 2)
+          val iy = ixiy `@` lidx(1, 2)
+          makeArray(3)(ix * ix)(ix * iy)(iy * iy) // the three products, computed on vectors
+        }))) >>
+        slide(3)(1) >> mapGlobal( // windows of 3 consecutive rows
+          transpose >> map(transpose) >>
+          mapSeq(mapSeqUnroll(dotSeqUWV(binomialWeightsV))) >> // first pass: combine the 3 rows
+          toGlobal >> // intermediate line stored in global memory
+          slide(3)(1) >> // windows of 3 consecutive vectors
+          mapSeq(
+            transpose >> map(shuffle) >> // shuffle builds the 3 vectors shifted by -1, 0, +1 elements
+            toPrivateFun(mapSeqUnroll(dotSeqUWV(binomialWeightsH))) >> // second pass: combine the 3 shifted vectors
+            letf(fun(s => {
+              val sxx = s `@` lidx(0, 3)
+              val sxy = s `@` lidx(1, 3)
+              val syy = s `@` lidx(2, 3)
+              val det = sxx * syy - sxy * sxy
+              val trace = sxx + syy
+              det - vectorFromScalar(kappa) * trace * trace // kappa is broadcast to a vector
+            }))
+          ) >> asScalar
+      )
+    ))))
+
+  val rotvVec: ToBeTyped[Expr] = // like lineVec, but the intermediate line goes through oclRotateValues + iterateStream instead of toGlobal + slide
+    depFun(hFrom(3), (h: Nat) =>
+    depFun(wFrom(12), (w: Nat) => fun(
+      (h`.`w`.`f32) ->: (h`.`w`.`f32) ->: f32
+        ->: ((h - 2*bd_h)`.`(w - 2*bd_w)`.`f32)
+    )((ix, iy, kappa) =>
+      oclRun(LocalSize(1), GlobalSize(num_threads))(
+        makeArray(2)(ix)(iy) |>
+        map(
+          map(drop(bd_w-vecw) >> take(w - 2*(bd_w-vecw))) >> // keep one extra vector (vecw columns) on each side of the output
+          drop(bd_h-1) >> take(h - 2*(bd_h-1)) >> // keep one extra row above and below the output
+          map(asVectorAligned(vecw))
+        ) >>
+        transpose >> map(transpose) >> // every vector position now holds [ix, iy]
+        map(map(fun(ixiy => {
+          val ix = ixiy `@` lidx(0, 2)
+          val iy = ixiy `@` lidx(1, 2)
+          makeArray(3)(ix * ix)(ix * iy)(iy * iy) // the three products, computed on vectors
+        }))) >>
+        slide(3)(1) >> mapGlobal( // windows of 3 consecutive rows
+          transpose >> map(transpose) >>
+          map(map(dotSeqUWV(binomialWeightsV))) >> // first pass: combine the 3 rows
+          oclRotateValues(AddressSpace.Private)(3)(mapSeqUnroll(id)) >> iterateStream( // window of 3 values, private address space
+            transpose >> map(shuffle) >> // shuffle builds the 3 vectors shifted by -1, 0, +1 elements
+            toPrivateFun(mapSeqUnroll(dotSeqUWV(binomialWeightsH))) >> // second pass: combine the 3 shifted vectors
+            letf(fun(s => {
+              val sxx = s `@` lidx(0, 3)
+              val sxy = s `@` lidx(1, 3)
+              val syy = s `@` lidx(2, 3)
+              val det = sxx * syy - sxy * sxy
+              val trace = sxx + syy
+              det - vectorFromScalar(kappa) * trace * trace // kappa is broadcast to a vector
+            }))
+          ) >> asScalar
+      )
+    ))))
+
+  val tile: ToBeTyped[Expr] = { // tiled variant of `base`: one work-group per tile, one work-item per output pixel
+    val tile_x_in = tile_x + 2 // input tiles are 2 wider than the step, so neighbouring tiles overlap by 2 columns
+    val tile_y_in = tile_y + 2 // and by 2 rows
+    depFun(hFrom(tile_y), (h: Nat) =>
+    depFun(wFrom(tile_x), (w: Nat) => fun(
+      (h`.`w`.`f32) ->: (h`.`w`.`f32) ->: f32
+        ->: ((h - 2*bd_h)`.`(w - 2*bd_w)`.`f32)
+    )((ix, iy, kappa) =>
+      oclRun(LocalSize((tile_x, tile_y)), GlobalSize((w - 2*bd_w, h - 2*bd_h)))( // global size equals the output size
+        makeArray(2)(ix)(iy) |>
+        transpose >> map(transpose) >> // every pixel now holds [ix, iy]
+        map(map(fun(ixiy => {
+          val ix = ixiy `@` lidx(0, 2)
+          val iy = ixiy `@` lidx(1, 2)
+          makeArray(3)(ix * ix)(ix * iy)(iy * iy) // the three products per pixel
+        }))) >>
+        map(drop(bd_w-1) >> take(w - 2*(bd_w-1)) >> slide(tile_x_in)(tile_x)) >> // crop columns, then cut into overlapping tiles
+        drop(bd_h-1) >> take(h - 2*(bd_h-1)) >> slide(tile_y_in)(tile_y) >> // same along rows
+        map(transpose) >>
+        map(map(
+          map(slide(3)(1)) >> slide(3)(1) >> map(transpose) // 3x3 neighbourhoods inside each tile
+        )) >>
+        mapWorkGroup(1)(mapWorkGroup(0)(
+          mapLocal(1)(mapLocal(0)(
+            map(transpose) >> transpose >> // reorder so the product index comes first
+            toPrivateFun(mapSeqUnroll(fun(nbh =>
+              dotSeqU(join(binomialWeights2d))(join(nbh)) // weighted sum of the flattened 3x3 neighbourhood
+            ))) >>
+            letf(fun(s => {
+              val sxx = s `@` lidx(0, 3)
+              val sxy = s `@` lidx(1, 3)
+              val syy = s `@` lidx(2, 3)
+              val det = sxx * syy - sxy * sxy
+              val trace = sxx + syy
+              det - kappa * trace * trace
+            }))
+          ))
+        )) >> map(transpose) >> join >> map(join) // undo the tiling
+    ))))
+  }
+
+  val tileVec: ToBeTyped[Expr] = { // tiled and vectorized variant: one work-item per output vector
+    val tile_vx = tile_x / vecw // tile width counted in vectors
+    val tile_vx_in = tile_vx + 2 // input tiles carry 2 extra vectors along x
+    val tile_y_in = tile_y + 2 // and 2 extra rows along y
+    depFun(hFrom(tile_y), (h: Nat) =>
+    depFun(wFrom(tile_x), (w: Nat) => fun(
+      (h`.`w`.`f32) ->: (h`.`w`.`f32) ->: f32
+        ->: ((h - 2*bd_h)`.`(w - 2*bd_w)`.`f32)
+    )((ix, iy, kappa) =>
+      oclRun(LocalSize((tile_x / vecw, tile_y)), GlobalSize(((w - 2*bd_w) / vecw, h - 2*bd_h)))( // x sizes divided by vecw
+        makeArray(2)(ix)(iy) |>
+        map(
+          map(drop(bd_w-vecw) >> take(w - 2*(bd_w-vecw)) >> asVectorAligned(vecw)) >> // keep one extra vector per side, then vectorize
+          drop(bd_h-1) >> take(h - 2*(bd_h-1)) // keep one extra row above and below the output
+        ) >>
+        transpose >> map(transpose) >> // every vector position now holds [ix, iy]
+        map(map(fun(ixiy => {
+          val ix = ixiy `@` lidx(0, 2)
+          val iy = ixiy `@` lidx(1, 2)
+          makeArray(3)(ix * ix)(ix * iy)(iy * iy) // the three products, computed on vectors
+        }))) >>
+        map(slide(tile_vx_in)(tile_vx)) >> // overlapping tiles along x (in vectors)
+        slide(tile_y_in)(tile_y) >> // overlapping tiles along y
+        map(transpose) >>
+        map(map(
+          map(slide(3)(1)) >> slide(3)(1) >> map(transpose) // 3x3 neighbourhoods of vectors inside each tile
+        )) >>
+        mapWorkGroup(1)(mapWorkGroup(0)(
+          mapLocal(1)(mapLocal(0)(
+            map(transpose) >> transpose >> // reorder so the product index comes first
+            toPrivateFun(mapSeqUnroll(fun(nbh =>
+              dotSeqUWV(join(binomialWeights2d))(join(map(shuffle)(nbh))) // shuffle each row of 3 vectors, then weighted sum
+            ))) >>
+            letf(fun(s => {
+              val sxx = s `@` lidx(0, 3)
+              val sxy = s `@` lidx(1, 3)
+              val syy = s `@` lidx(2, 3)
+              val det = sxx * syy - sxy * sxy
+              val trace = sxx + syy
+              det - vectorFromScalar(kappa) * trace * trace // kappa is broadcast to a vector
+            }))
+          ))
+        )) >> map(transpose) >> join >> map(join >> asScalar) // undo the tiling and return to scalars
+    ))))
+  }
+}
diff --git a/src/main/scala/apps/harrisCornerDetection2/package.scala b/src/main/scala/apps/harrisCornerDetection2/package.scala
new file mode 100644
index 000000000..97f0fb3e8
--- /dev/null
+++ b/src/main/scala/apps/harrisCornerDetection2/package.scala
@@ -0,0 +1,58 @@
+package apps
+
+import rise.core._
+import rise.core.DSL._
+import rise.core.primitives._
+import rise.core.types._
+import rise.openCL.primitives._
+import rise.core.DSL.HighLevelConstructs.zipND
+
+/** This version of Harris follows from the following paper:
+  * https://dl.acm.org/doi/abs/10.1145/2568058.2568067
+  *
+  * Compared to Halide's version:
+  * - it starts from grayscale images instead of color images
+  * - it uses a binomial filter instead of a box filter
+  *
+  * The algorithm is simplified:
+  * - there is no padding and the output is smaller than the input.
+  */
+package object harrisCornerDetection2 {
+  val num_threads = 4 // global size used by the non-tiled kernels of this package
+  val vecw = 8 // vector width used by the vectorized kernels
+  val bd_h = 16 // kernel outputs have 2*bd_h fewer rows than their input
+  val bd_w = 32 // kernel outputs have 2*bd_w fewer columns than their input
+  val tile_x = 32 // tile width used by the tiled kernels
+  val tile_y = 8 // tile height used by the tiled kernels
+
+  val hFrom = (n: Int) => // range of admissible heights: starts at n, step 8, no upper bound
+    arithexpr.arithmetic.RangeAdd(n, arithexpr.arithmetic.PosInf, 8)
+  val wFrom = (n: Int) => // range of admissible widths: starts at n (previously hard-coded to 12, ignoring n), step 32
+    arithexpr.arithmetic.RangeAdd(n, arithexpr.arithmetic.PosInf, 32)
+
+  val id: ToBeTyped[Expr] = fun(x => x)
+  val mulT: ToBeTyped[Expr] = fun(x => fst(x) * snd(x)) // product of the two components of a pair
+  val zip2D: ToBeTyped[Expr] = zipND(2)
+  val dotSeqU: ToBeTyped[Expr] = fun(a => fun(b => // dot product of a and b, accumulator starts at 0.0f
+    zip(a)(b) |> map(mulT) |> oclReduceSeqUnroll(AddressSpace.Private)(add)(lf32(0.0f))
+  ))
+  val dotSeqUWV: ToBeTyped[Expr] = fun(weights => fun(vectors => // like dotSeqU, with scalar weights broadcast to vectors
+    zip(map(vectorFromScalar)(weights))(vectors) |>
+      map(mulT) |> oclReduceSeqUnroll(AddressSpace.Private)(add)(vectorFromScalar(lf32(0.0f)))
+  ))
+
+  val shuffle = // from 3 consecutive vectors, builds the 3 vectors shifted by -1, 0 and +1 elements around the middle one
+    asScalar >> drop(vecw-1) >> take(vecw+2) >> slide(vecw)(1) >> join >> asVector(vecw)
+
+  val binomialWeights2d = apps.separableConvolution2D.binomialWeights2d
+  val binomialWeightsH = apps.separableConvolution2D.binomialWeightsH
+  val binomialWeightsV = apps.separableConvolution2D.binomialWeightsV
+
+  val sobelXWeights2d = apps.separableConvolution2D.sobelXWeights2d
+  val sobelXWeightsH = apps.separableConvolution2D.sobelXWeightsH
+  val sobelXWeightsV = apps.separableConvolution2D.sobelXWeightsV
+
+  val sobelYWeights2d = apps.separableConvolution2D.sobelYWeights2d
+  val sobelYWeightsH = apps.separableConvolution2D.sobelYWeightsH
+  val sobelYWeightsV = apps.separableConvolution2D.sobelYWeightsV
+}
diff --git a/src/main/scala/apps/harrisCornerDetection2/sobelXY.scala b/src/main/scala/apps/harrisCornerDetection2/sobelXY.scala
new file mode 100644
index 000000000..25def5660
--- /dev/null
+++ b/src/main/scala/apps/harrisCornerDetection2/sobelXY.scala
@@ -0,0 +1,180 @@
+package apps.harrisCornerDetection2
+
+import rise.core.DSL.Type._
+import rise.core.DSL._
+import rise.core._
+import rise.core.primitives.{id => _, _}
+import rise.core.types.DataType._
+import rise.core.types._
+import rise.openCL.DSL._
+import rise.openCL.primitives.oclRotateValues
+import shine.OpenCL.{GlobalSize, LocalSize}
+
+object sobelXY {
+  def check(module: shine.OpenCL.Module, h: Int, w: Int): Unit = { // builds a C++ driver: random h*w input in [0, 200), sobelX/sobelY gold planes compared with the two output planes
+    val main = s"""
+#include "src/main/scala/apps/harrisCornerDetection2/common.cpp"
+
+int main(int argc, char** argv) {
+  Context ctx = createDefaultContext();
+  size_t in_bytes = $h * $w * sizeof(float);
+  size_t out_h = ${h - 2*bd_h};
+  size_t out_w = ${w - 2*bd_w};
+  size_t out_bytes = out_h * out_w * sizeof(float);
+  Buffer input = createBuffer(ctx, in_bytes, HOST_WRITE | HOST_READ | DEVICE_READ);
+  Buffer output = createBuffer(ctx, 2 * out_bytes, HOST_READ | HOST_WRITE | DEVICE_WRITE);
+
+  float* out_ix_gold = (float*) malloc(out_bytes);
+  float* out_iy_gold = (float*) malloc(out_bytes);
+
+  std::random_device rand_d;
+  std::default_random_engine rand_e(rand_d());
+  // bigger range results in higher output differences
+  std::uniform_real_distribution<float> dist(0, 200);
+
+  float* in = (float*) hostBufferSync(ctx, input, in_bytes, HOST_WRITE | HOST_READ);
+  for (int y = 0; y < $h; y++) {
+    for (int x = 0; x < $w; x++) {
+      in[y*$w + x] = dist(rand_e);
+    }
+  }
+
+  sobelX_gold(out_ix_gold, $h, $w, in);
+  sobelY_gold(out_iy_gold, $h, $w, in);
+
+  foo_init_run(ctx, output, $h, $w, input);
+
+  ErrorStats errors;
+  init_error_stats(&errors);
+  float* out = (float*) hostBufferSync(ctx, output, 2 * out_bytes, HOST_READ);
+  accumulate_error_stats(&errors, out, out_ix_gold, out_h, out_w);
+  accumulate_error_stats(&errors, &out[out_h*out_w], out_iy_gold, out_h, out_w);
+  finish_error_stats(&errors, 0.01, 0.0001);
+
+  free(out_ix_gold);
+  free(out_iy_gold);
+  destroyBuffer(ctx, input);
+  destroyBuffer(ctx, output);
+  destroyContext(ctx);
+  return EXIT_SUCCESS;
+}
+"""
+    util.ExecuteOpenCL.using_cpp(main, module, "one_copy") // hands the driver source and the module to ExecuteOpenCL (presumably compiles and runs them; see util.ExecuteOpenCL)
+  }
+
+  val base: ToBeTyped[Expr] = // applies the sobelX and sobelY 3x3 weights to every neighbourhood; output holds 2 planes
+    depFun(hFrom(3), (h: Nat) =>
+    depFun(wFrom(12), (w: Nat) => fun(
+      (h`.`w`.`f32) ->: (2`.`(h - 2*bd_h)`.`(w - 2*bd_w)`.`f32)
+    )(input =>
+      oclRun(LocalSize(1), GlobalSize(num_threads))(
+        input |>
+        map(drop(bd_w-1) >> take(w - 2*(bd_w-1)) >> slide(3)(1)) >> // crop columns to the output width + 2, then windows of 3
+        drop(bd_h-1) >> take(h - 2*(bd_h-1)) >> slide(3)(1) >> // same along rows
+        map(transpose) >>
+        mapGlobal(mapSeq(fun(nbh =>
+          makeArray(2)(sobelXWeights2d)(sobelYWeights2d) |>
+          mapSeqUnroll(fun(ws => dotSeqU(join(ws))(join(nbh)))) // one weighted sum per weight set
+        ))) >> map(transpose) >> transpose // [row][col][2] -> [2][row][col]
+    ))))
+
+  val lineVec: ToBeTyped[Expr] = // vectorized two-pass variant: V weights over 3 rows into a toGlobal line, then H weights over shuffled vectors
+    depFun(hFrom(3), (h: Nat) =>
+    depFun(wFrom(12), (w: Nat) => fun(
+      (h`.`w`.`f32) ->: (2`.`(h - 2*bd_h)`.`(w - 2*bd_w)`.`f32)
+    )(input =>
+      oclRun(LocalSize(1), GlobalSize(num_threads))(
+        input |>
+        map(drop(bd_w-vecw) >> take(w - 2*(bd_w-vecw))) >> // keep one extra vector (vecw columns) on each side of the output
+        drop(bd_h-1) >> take(h - 2*(bd_h-1)) >> // keep one extra row above and below the output
+        map(asVectorAligned(vecw)) >> slide(3)(1) >> mapGlobal( // windows of 3 consecutive rows of vectors
+          transpose >>
+          mapSeq(fun(vNbh =>
+            makeArray(2)(sobelXWeightsV)(sobelYWeightsV) |>
+            mapSeqUnroll(fun(ws => dotSeqUWV(ws)(vNbh))) // first pass, once per weight set
+          )) >>
+          toGlobal >> // intermediate line stored in global memory
+          slide(3)(1) >> // windows of 3 consecutive vectors
+          mapSeq(
+            transpose >> map(shuffle) >> // shuffle builds the 3 vectors shifted by -1, 0, +1 elements
+            zip(makeArray(2)(sobelXWeightsH)(sobelYWeightsH)) >> // pair each weight set with its own intermediate values
+            mapSeqUnroll(fun(hWsNbh =>
+              dotSeqUWV(hWsNbh._1)(hWsNbh._2)
+            ))
+          ) >> transpose >> map(asScalar)
+      ) >> transpose // bring the plane index (size 2) to the front
+    ))))
+
+  val rotvVec: ToBeTyped[Expr] = // like lineVec, but the intermediate line goes through oclRotateValues + iterateStream instead of toGlobal + slide
+    depFun(hFrom(3), (h: Nat) =>
+    depFun(wFrom(12), (w: Nat) => fun(
+      (h`.`w`.`f32) ->: (2`.`(h - 2*bd_h)`.`(w - 2*bd_w)`.`f32)
+    )(input =>
+      oclRun(LocalSize(1), GlobalSize(num_threads))(
+        input |>
+        map(drop(bd_w-vecw) >> take(w - 2*(bd_w-vecw))) >> // keep one extra vector (vecw columns) on each side of the output
+        drop(bd_h-1) >> take(h - 2*(bd_h-1)) >> // keep one extra row above and below the output
+        map(asVectorAligned(vecw)) >> slide(3)(1) >> mapGlobal( // windows of 3 consecutive rows of vectors
+        transpose >>
+          map(fun(vNbh =>
+            makeArray(2)(sobelXWeightsV)(sobelYWeightsV) |>
+            map(fun(ws => dotSeqUWV(ws)(vNbh))) // first pass, once per weight set
+          )) >>
+          oclRotateValues(AddressSpace.Private)(3)(mapSeqUnroll(id)) >> iterateStream( // window of 3 values, private address space
+            transpose >> map(shuffle) >> // shuffle builds the 3 vectors shifted by -1, 0, +1 elements
+            zip(makeArray(2)(sobelXWeightsH)(sobelYWeightsH)) >> // pair each weight set with its own intermediate values
+            mapSeqUnroll(fun(hWsNbh => dotSeqUWV(hWsNbh._1)(hWsNbh._2)))
+          ) >> transpose >> map(asScalar)
+      ) >> transpose // bring the plane index (size 2) to the front
+    ))))
+
+  val tile: ToBeTyped[Expr] = { // tiled variant of `base`: one work-group per tile, one work-item per output pixel
+    val tile_x_in = tile_x + 2 // input tiles are 2 wider than the step, so neighbouring tiles overlap by 2 columns
+    val tile_y_in = tile_y + 2 // and by 2 rows
+    depFun(hFrom(tile_y), (h: Nat) =>
+    depFun(wFrom(tile_x), (w: Nat) => fun(
+      (h`.`w`.`f32) ->: (2`.`(h - 2*bd_h)`.`(w - 2*bd_w)`.`f32)
+    )(input =>
+      oclRun(LocalSize((tile_x, tile_y)), GlobalSize(((w - 2*bd_w), h - 2*bd_h)))( // global size equals the output size
+        input |>
+        map(drop(bd_w-1) >> take(w - 2*(bd_w-1)) >> slide(tile_x_in)(tile_x)) >> // crop columns, then cut into overlapping tiles
+        drop(bd_h-1) >> take(h - 2*(bd_h-1)) >> slide(tile_y_in)(tile_y) >> // same along rows
+        map(transpose) >>
+        map(map(
+          map(slide(3)(1)) >> slide(3)(1) >> map(transpose) // 3x3 neighbourhoods inside each tile
+        )) >>
+        mapWorkGroup(1)(mapWorkGroup(0)(
+          mapLocal(1)(mapLocal(0)(fun(nbh =>
+            makeArray(2)(sobelXWeights2d)(sobelYWeights2d) |>
+            mapSeqUnroll(fun(ws => dotSeqU(join(ws))(join(nbh)))) // one weighted sum per weight set
+          ))) // ty.tx.2.f
+        )) >> map(transpose) >> join >> map(join) >> // undo the tiling
+        map(transpose) >> transpose // [row][col][2] -> [2][row][col]
+    ))))
+  }
+
+  val tileVec: ToBeTyped[Expr] = { // tiled and vectorized variant: one work-item per output vector
+    val tile_x_in = tile_x + 2*vecw // input tiles carry 2*vecw extra columns (one extra vector per side)
+    val tile_y_in = tile_y + 2 // and 2 extra rows
+    depFun(hFrom(tile_y), (h: Nat) =>
+    depFun(wFrom(tile_x), (w: Nat) => fun(
+      (h`.`w`.`f32) ->: (2`.`(h - 2*bd_h)`.`(w - 2*bd_w)`.`f32)
+    )(input =>
+      oclRun(LocalSize((tile_x / vecw, tile_y)), GlobalSize(((w - 2*bd_w) / vecw, h - 2*bd_h)))( // x sizes divided by vecw
+        input |>
+        map(drop(bd_w-vecw) >> take(w - 2*(bd_w-vecw)) >> slide(tile_x_in)(tile_x)) >> // crop columns, then cut into overlapping tiles
+        drop(bd_h-1) >> take(h - 2*(bd_h-1)) >> slide(tile_y_in)(tile_y) >> // same along rows
+        map(transpose) >>
+        map(map(
+          map(asVectorAligned(vecw) >> slide(3)(1)) >> slide(3)(1) >> map(transpose) // vectorize tile rows, then 3x3 neighbourhoods of vectors
+        )) >>
+        mapWorkGroup(1)(mapWorkGroup(0)(
+          mapLocal(1)(mapLocal(0)(fun(nbh =>
+            makeArray(2)(sobelXWeights2d)(sobelYWeights2d) |>
+            mapSeqUnroll(fun(ws => dotSeqUWV(join(ws))(join(map(shuffle)(nbh))))) // shuffle each row of 3 vectors, then weighted sum
+          )))
+        )) >> map(transpose) >> join >> map(join) >> // undo the tiling
+        map(transpose >> map(asScalar)) >> transpose // back to scalars, plane index (size 2) to the front
+    ))))
+  }
+}
diff --git a/src/main/scala/apps/harrisCornerDetection2/sobelXYMul.scala b/src/main/scala/apps/harrisCornerDetection2/sobelXYMul.scala
new file mode 100644
index 000000000..9050db4d8
--- /dev/null
+++ b/src/main/scala/apps/harrisCornerDetection2/sobelXYMul.scala
@@ -0,0 +1,217 @@
+package apps.harrisCornerDetection2
+
+import rise.core.DSL.Type._
+import rise.core.DSL._
+import rise.core._
+import rise.core.primitives.{id => _, _}
+import rise.core.types.DataType._
+import rise.core.types._
+import rise.openCL.DSL._
+import rise.openCL.primitives.oclRotateValues
+import shine.OpenCL.{GlobalSize, LocalSize}
+
+object sobelXYMul {
+  def check(module: shine.OpenCL.Module, h: Int, w: Int): Unit = { // builds a C++ driver: random h*w input in [0, 200), gold = sobelX/sobelY then mul, compared with the three output planes
+    val main = s"""
+#include "src/main/scala/apps/harrisCornerDetection2/common.cpp"
+
+int main(int argc, char** argv) {
+  Context ctx = createDefaultContext();
+  size_t in_bytes = $h * $w * sizeof(float);
+  size_t out_h = ${h - 2*bd_h};
+  size_t out_w = ${w - 2*bd_w};
+  size_t out_bytes = out_h * out_w * sizeof(float);
+  Buffer input = createBuffer(ctx, in_bytes, HOST_WRITE | HOST_READ | DEVICE_READ);
+  Buffer output = createBuffer(ctx, 3 * out_bytes, HOST_READ | HOST_WRITE | DEVICE_WRITE);
+
+  float* ix_gold = (float*) malloc(out_bytes);
+  float* iy_gold = (float*) malloc(out_bytes);
+  float* out_ixx_gold = (float*) malloc(out_bytes);
+  float* out_ixy_gold = (float*) malloc(out_bytes);
+  float* out_iyy_gold = (float*) malloc(out_bytes);
+
+  std::random_device rand_d;
+  std::default_random_engine rand_e(rand_d());
+  // bigger range results in higher output differences
+  std::uniform_real_distribution<float> dist(0, 200);
+
+  float* in = (float*) hostBufferSync(ctx, input, in_bytes, HOST_WRITE | HOST_READ);
+  for (int y = 0; y < $h; y++) {
+    for (int x = 0; x < $w; x++) {
+      in[y*$w + x] = dist(rand_e);
+    }
+  }
+
+  sobelX_gold(ix_gold, $h, $w, in);
+  sobelY_gold(iy_gold, $h, $w, in);
+  mul_gold(out_ixx_gold, out_h, out_w, ix_gold, ix_gold);
+  mul_gold(out_ixy_gold, out_h, out_w, ix_gold, iy_gold);
+  mul_gold(out_iyy_gold, out_h, out_w, iy_gold, iy_gold);
+
+  foo_init_run(ctx, output, $h, $w, input);
+
+  ErrorStats errors;
+  init_error_stats(&errors);
+  float* out = (float*) hostBufferSync(ctx, output, 3 * out_bytes, HOST_READ);
+  accumulate_error_stats(&errors, out, out_ixx_gold, out_h, out_w);
+  accumulate_error_stats(&errors, &out[out_h * out_w], out_ixy_gold, out_h, out_w);
+  accumulate_error_stats(&errors, &out[2 * out_h * out_w], out_iyy_gold, out_h, out_w);
+  finish_error_stats(&errors, 0.01, 0.0001);
+
+  free(ix_gold);
+  free(iy_gold);
+  free(out_ixx_gold);
+  free(out_ixy_gold);
+  free(out_iyy_gold);
+  destroyBuffer(ctx, input);
+  destroyBuffer(ctx, output);
+  destroyContext(ctx);
+  return EXIT_SUCCESS;
+}
+"""
+    util.ExecuteOpenCL.using_cpp(main, module, "one_copy") // hands the driver source and the module to ExecuteOpenCL (presumably compiles and runs them; see util.ExecuteOpenCL)
+  }
+
+  val base: ToBeTyped[Expr] = // sobelX/sobelY weighted sums per pixel, immediately turned into the products ix*ix, ix*iy, iy*iy (3 planes)
+    depFun(hFrom(3), (h: Nat) =>
+    depFun(wFrom(12), (w: Nat) => fun(
+      (h`.`w`.`f32) ->: (3`.`(h - 2*bd_h)`.`(w - 2*bd_w)`.`f32)
+    )(input =>
+      oclRun(LocalSize(1), GlobalSize(num_threads))(
+        input |>
+        map(drop(bd_w-1) >> take(w - 2*(bd_w-1)) >> slide(3)(1)) >> // crop columns to the output width + 2, then windows of 3
+        drop(bd_h-1) >> take(h - 2*(bd_h-1)) >> slide(3)(1) >> // same along rows
+        map(transpose) >>
+        mapGlobal(mapSeq(fun(nbh =>
+          makeArray(2)(sobelXWeights2d)(sobelYWeights2d) |>
+          toPrivateFun(mapSeqUnroll(fun(ws => dotSeqU(join(ws))(join(nbh))))) |> // [ix, iy] for this pixel
+          letf(fun(ixiy => {
+            val ix = ixiy `@` lidx(0, 2)
+            val iy = ixiy `@` lidx(1, 2)
+            makeArray(3)(ix * ix)(ix * iy)(iy * iy) |> mapSeqUnroll(id) // write out the three products
+          }))
+        ))) >> map(transpose) >> transpose // [row][col][3] -> [3][row][col]
+    ))))
+
+  val lineVec: ToBeTyped[Expr] = // vectorized two-pass sobel (V weights into a toGlobal line, then H weights) followed by the three products
+    depFun(hFrom(3), (h: Nat) =>
+    depFun(wFrom(12), (w: Nat) => fun(
+      (h`.`w`.`f32) ->: (3`.`(h - 2*bd_h)`.`(w - 2*bd_w)`.`f32)
+    )(input =>
+      oclRun(LocalSize(1), GlobalSize(num_threads))(
+        input |>
+        map(drop(bd_w-vecw) >> take(w - 2*(bd_w-vecw))) >> // keep one extra vector (vecw columns) on each side of the output
+        drop(bd_h-1) >> take(h - 2*(bd_h-1)) >> // keep one extra row above and below the output
+        map(asVectorAligned(vecw)) >> slide(3)(1) >> mapGlobal( // windows of 3 consecutive rows of vectors
+          transpose >>
+          mapSeq(fun(vNbh =>
+            makeArray(2)(sobelXWeightsV)(sobelYWeightsV) |>
+            mapSeqUnroll(fun(ws => dotSeqUWV(ws)(vNbh))) // first pass, once per weight set
+          )) >>
+          toGlobal >> // intermediate line stored in global memory
+          slide(3)(1) >> // windows of 3 consecutive vectors
+          mapSeq(
+            transpose >> map(shuffle) >> // shuffle builds the 3 vectors shifted by -1, 0, +1 elements
+            zip(makeArray(2)(sobelXWeightsH)(sobelYWeightsH)) >> // pair each weight set with its own intermediate values
+            toPrivateFun(mapSeqUnroll(fun(hWsNbh =>
+              dotSeqUWV(hWsNbh._1)(hWsNbh._2)
+            ))) >>
+            letf(fun(ixiy => {
+              val ix = ixiy `@` lidx(0, 2)
+              val iy = ixiy `@` lidx(1, 2)
+              makeArray(3)(ix * ix)(ix * iy)(iy * iy) |> mapSeqUnroll(id) // write out the three products
+            }))
+          ) >> transpose >> map(asScalar)
+      ) >> transpose // bring the plane index (size 3) to the front
+    ))))
+
+  val rotvVec: ToBeTyped[Expr] = // like lineVec, but the intermediate line goes through oclRotateValues + iterateStream instead of toGlobal + slide
+    depFun(hFrom(3), (h: Nat) =>
+    depFun(wFrom(12), (w: Nat) => fun(
+      (h`.`w`.`f32) ->: (3`.`(h - 2*bd_h)`.`(w - 2*bd_w)`.`f32)
+    )(input =>
+      oclRun(LocalSize(1), GlobalSize(num_threads))(
+        input |>
+        map(drop(bd_w-vecw) >> take(w - 2*(bd_w-vecw))) >> // keep one extra vector (vecw columns) on each side of the output
+        drop(bd_h-1) >> take(h - 2*(bd_h-1)) >> // keep one extra row above and below the output
+        map(asVectorAligned(vecw)) >> slide(3)(1) >> mapGlobal( // windows of 3 consecutive rows of vectors
+        transpose >>
+          map(fun(vNbh =>
+            makeArray(2)(sobelXWeightsV)(sobelYWeightsV) |>
+            map(fun(ws => dotSeqUWV(ws)(vNbh))) // first pass, once per weight set
+          )) >>
+          oclRotateValues(AddressSpace.Private)(3)(mapSeqUnroll(id)) >> iterateStream( // window of 3 values, private address space
+            transpose >> map(shuffle) >> // shuffle builds the 3 vectors shifted by -1, 0, +1 elements
+            zip(makeArray(2)(sobelXWeightsH)(sobelYWeightsH)) >> // pair each weight set with its own intermediate values
+            toPrivateFun(mapSeqUnroll(fun(hWsNbh =>
+              dotSeqUWV(hWsNbh._1)(hWsNbh._2)
+            ))) >>
+            letf(fun(ixiy => {
+              val ix = ixiy `@` lidx(0, 2)
+              val iy = ixiy `@` lidx(1, 2)
+              makeArray(3)(ix * ix)(ix * iy)(iy * iy) |> mapSeqUnroll(id) // write out the three products
+            }))
+          ) >> transpose >> map(asScalar)
+      ) >> transpose // bring the plane index (size 3) to the front
+    ))))
+
+  val tile: ToBeTyped[Expr] = { // tiled variant of `base`: one work-group per tile, one work-item per output pixel
+    val tile_x_in = tile_x + 2 // input tiles are 2 wider than the step, so neighbouring tiles overlap by 2 columns
+    val tile_y_in = tile_y + 2 // and by 2 rows
+    depFun(hFrom(tile_y), (h: Nat) =>
+    depFun(wFrom(tile_x), (w: Nat) => fun(
+      (h`.`w`.`f32) ->: (3`.`(h - 2*bd_h)`.`(w - 2*bd_w)`.`f32)
+    )(input =>
+      oclRun(LocalSize((tile_x, tile_y)), GlobalSize(((w - 2*bd_w), h - 2*bd_h)))( // global size equals the output size
+        input |>
+        map(drop(bd_w-1) >> take(w - 2*(bd_w-1)) >> slide(tile_x_in)(tile_x)) >> // crop columns, then cut into overlapping tiles
+        drop(bd_h-1) >> take(h - 2*(bd_h-1)) >> slide(tile_y_in)(tile_y) >> // same along rows
+        map(transpose) >>
+        map(map(
+          map(slide(3)(1)) >> slide(3)(1) >> map(transpose) // 3x3 neighbourhoods inside each tile
+        )) >>
+        mapWorkGroup(1)(mapWorkGroup(0)(
+          mapLocal(1)(mapLocal(0)(fun(nbh =>
+            makeArray(2)(sobelXWeights2d)(sobelYWeights2d) |>
+            toPrivateFun(mapSeqUnroll(fun(ws => dotSeqU(join(ws))(join(nbh))))) |> // [ix, iy] for this pixel
+            letf(fun(ixiy => {
+              val ix = ixiy `@` lidx(0, 2)
+              val iy = ixiy `@` lidx(1, 2)
+              makeArray(3)(ix * ix)(ix * iy)(iy * iy) |> mapSeqUnroll(id) // write out the three products
+            }))
+          )))
+        )) >> map(transpose) >> join >> map(join) >> // undo the tiling
+        map(transpose) >> transpose // [row][col][3] -> [3][row][col]
+    ))))
+  }
+
+  val tileVec: ToBeTyped[Expr] = { // tiled and vectorized variant: one work-item per output vector
+    val tile_x_in = tile_x + 2*vecw // input tiles carry 2*vecw extra columns (one extra vector per side)
+    val tile_y_in = tile_y + 2 // and 2 extra rows
+    depFun(hFrom(tile_y), (h: Nat) =>
+    depFun(wFrom(tile_x), (w: Nat) => fun(
+      (h`.`w`.`f32) ->: (3`.`(h - 2*bd_h)`.`(w - 2*bd_w)`.`f32)
+    )(input =>
+      oclRun(LocalSize((tile_x / vecw, tile_y)), GlobalSize(((w - 2*bd_w) / vecw, h - 2*bd_h)))( // x sizes divided by vecw
+        input |>
+        map(drop(bd_w-vecw) >> take(w - 2*(bd_w-vecw)) >> slide(tile_x_in)(tile_x)) >> // crop columns, then cut into overlapping tiles
+        drop(bd_h-1) >> take(h - 2*(bd_h-1)) >> slide(tile_y_in)(tile_y) >> // same along rows
+        map(transpose) >>
+        map(map(
+          map(asVectorAligned(vecw) >> slide(3)(1)) >> slide(3)(1) >> map(transpose) // vectorize tile rows, then 3x3 neighbourhoods of vectors
+        )) >>
+        mapWorkGroup(1)(mapWorkGroup(0)(
+          mapLocal(1)(mapLocal(0)(fun(nbh =>
+            makeArray(2)(sobelXWeights2d)(sobelYWeights2d) |>
+            toPrivateFun(mapSeqUnroll(fun(ws => dotSeqUWV(join(ws))(join(map(shuffle)(nbh)))))) |> // [ix, iy] vectors for this position
+            letf(fun(ixiy => {
+              val ix = ixiy `@` lidx(0, 2)
+              val iy = ixiy `@` lidx(1, 2)
+              makeArray(3)(ix * ix)(ix * iy)(iy * iy) |> mapSeqUnroll(id) // write out the three products
+            }))
+          )))
+        )) >> map(transpose) >> join >> map(join) >> // undo the tiling
+        map(transpose >> map(asScalar)) >> transpose // back to scalars, plane index (size 3) to the front
+    ))))
+  }
+}
diff --git a/src/main/scala/apps/harrisCornerDetectionHalide.scala b/src/main/scala/apps/harrisCornerDetectionHalide.scala
index ea61ad7ed..034b45452 100644
--- a/src/main/scala/apps/harrisCornerDetectionHalide.scala
+++ b/src/main/scala/apps/harrisCornerDetectionHalide.scala
@@ -8,6 +8,8 @@ import rise.core.types._
 import rise.core.types.DataType._
 import HighLevelConstructs._
 
+// This version of Harris uses Halide's version as reference
+// used in the CGO'21 paper: https://ieeexplore.ieee.org/abstract/document/9370337/
 // in Halide: https://github.com/halide/Halide/blob/e8acdea/apps/harris
 // in PolyMage: https://bitbucket.org/udayb/polymage/src/e28327c/sandbox/apps/python/img_proc/harris
 // FIXME: PolyMage's algorithm is different
diff --git a/src/main/scala/shine/C/AST/Printer.scala b/src/main/scala/shine/C/AST/Printer.scala
index 972567744..665770c98 100644
--- a/src/main/scala/shine/C/AST/Printer.scala
+++ b/src/main/scala/shine/C/AST/Printer.scala
@@ -43,6 +43,8 @@ trait Printer {
 
 object Printer {
   def apply(n: Node): String = (new CPrinter).printNode(n)
+  // Renders only the prototype of `f`: its signature followed by ";".
+  def declFun(f: FunDecl): String = new CPrinter().declareFunSig(f)
 }
 
 class CPrinter extends Printer {
@@ -67,6 +69,12 @@ class CPrinter extends Printer {
     case s: StructTypeDecl => printStructTypeDecl(s)
   }
 
+  /** Prints the signature of `f` followed by ";" and returns the current content of `sb`. */
+  def declareFunSig(f: FunDecl): String = {
+    printFunSig(f); print(";")
+    sb.toString()
+  }
+
   override def printExpr(e: Expr, parenthesize: Boolean): Unit = e match {
     case a: Assignment =>
       printMaybe(parenthesize)(
@@ -116,8 +124,7 @@ class CPrinter extends Printer {
       print(";")
   }
 
-  // Decls
-  private def printFunDecl(f: FunDecl): Unit = {
+  def printFunSig(f: FunDecl): Unit = {
     print(typeName(f.returnType))
     print(s" ${f.name}(")
     f.params.foreach(p => {
@@ -125,7 +132,11 @@ class CPrinter extends Printer {
       if (!p.eq(f.params.last)) print(", ")
     })
     print(")")
+  }
 
+  // Decls
+  private def printFunDecl(f: FunDecl): Unit = { // prints a full definition: the signature, then the body statement
+    printFunSig(f)
     printStmt(f.body)
   }
 
diff --git a/src/main/scala/shine/DPIA/fromRise.scala b/src/main/scala/shine/DPIA/fromRise.scala
index 2944bf635..c081791ff 100644
--- a/src/main/scala/shine/DPIA/fromRise.scala
+++ b/src/main/scala/shine/DPIA/fromRise.scala
@@ -2,7 +2,7 @@ package shine.DPIA
 
 import elevate.core.strategies.Traversable
 import elevate.core.strategies.basic.normalize
-import rise.core.types.{AddressSpaceKind, DataKind, DataType, NatKind, NatToNatKind, NatToNatLambda, read, write}
+import rise.core.types.{AddressSpaceKind, DataKind, DataType, NatKind, NatToNatKind, NatToNatLambda, TypeIdentifier, TypePlaceholder, read, write}
 import rise.core.DSL.Type._
 import rise.core.types.DataType._
 import rise.elevate.Rise
@@ -12,7 +12,10 @@ import rise.{core => r}
 import shine.DPIA.Phrases._
 import shine.DPIA.Types._
 import shine.DPIA.primitives.functional._
+import util.monads
 
+import scala.annotation.tailrec
+import scala.collection.immutable
 import scala.collection.mutable
 
 object fromRise {
@@ -22,8 +25,128 @@ object fromRise {
       throw new Exception(s"expression is not in closed form: $expr\n\n with type ${expr.t}\n free vars: $fV\n free type vars: $fT\n\n")
     }
     val bnfExpr = normalize(ev).apply(betaReduction)(expr).get
-    val rwMap = inferAccess(bnfExpr)
-    expression(bnfExpr, rwMap)
+    val nExpr = normalizeEqualNats(bnfExpr)
+    val rwMap = inferAccess(nExpr)
+    expression(nExpr, rwMap)
+  }
+
+  // Rewrites nats that must be equal to one common representative before access inference and translation.
+  // NOTE: this is required because unify(nat1, nat2) can succeed while normalize(nat1) != normalize(nat2)
+  private def normalizeEqualNats(e: r.Expr): r.Expr = {
+    // inspired by the union-find algorithm: maps a nat to a nat that it is assumed to be equal to
+    val map = mutable.Map[Nat, Nat]()
+
+    @tailrec
+    def getBest(n: Nat): Nat = map.get(n) match { // representative currently chosen for n
+      case Some(n2) if !(n == n2) =>
+        val n3 = map.getOrElse(n2, n2)
+        map(n) = n3 // shortcut: n now points one step further along its chain
+        getBest(n3)
+      case _ => n // n is unmapped or mapped to itself
+    }
+
+    r.traverse.traverse(e, new r.traverse.PureTraversal { // pass 1: only records equalities, its result is discarded
+      override def expr: r.Expr => util.monads.Pure[r.Expr] = { e =>
+        e match {
+          case r.App(f, arg) => // the argument type must agree with f's input type, the result type with its output type
+            val ft = f.t.asInstanceOf[rt.FunType[_ <: rt.ExprType, _ <: rt.ExprType]]
+            sameType(arg.t, ft.inT)
+            sameType(e.t, ft.outT)
+          case _ => ()
+        }
+        super.expr(e)
+      }
+
+      private def sameType(a: rt.ExprType, b: rt.ExprType): Unit = { // walks a and b in lockstep and calls sameNat on matching nats
+        def unwrapb(f: PartialFunction[rt.ExprType, Unit]): Unit = { // runs f on b; throws when b does not have the shape f expects
+          f.lift(b) match {
+            case Some(()) => ()
+            case None => throw new Exception(s"Unexpected type for $b")
+          }
+        }
+        a match {
+          case TypePlaceholder | TypeIdentifier(_) =>
+            throw new Exception("this should not happen")
+          case rt.FunType(inT, outT) => unwrapb {
+            case rt.FunType(inT2, outT2) =>
+              sameType(inT, inT2)
+              sameType(outT, outT2)
+          }
+          case rt.DepFunType(kind, x, t) => unwrapb {
+            case rt.DepFunType(kind2, x2, t2) =>
+              assert(kind == kind2)
+              assert(x == x2)
+              sameType(t, t2)
+          }
+          case dataType: DataType => dataType match {
+            case DataTypeIdentifier(_) => ()
+            case _: ScalarType => ()
+            case DataType.NatType => ()
+            case OpaqueType(_) => ()
+            case VectorType(size, elemType) => unwrapb {
+              case VectorType(size2, elemType2) =>
+                sameNat(size, size2)
+                sameType(elemType, elemType2)
+            }
+            case IndexType(size) => unwrapb {
+              case IndexType(size2) =>
+                sameNat(size, size2)
+            }
+            case PairType(dt1, dt2) => unwrapb {
+              case PairType(dt12, dt22) =>
+                sameType(dt1, dt12)
+                sameType(dt2, dt22)
+            }
+            case FragmentType(_, _, d3, dataType, _, _) => unwrapb { // NOTE(review): rows and columns are not compared, confirm this is intended
+              case FragmentType(_, _, d32, dataType2, _, _) =>
+                sameNat(d3, d32)
+                sameType(dataType, dataType2)
+            }
+            case ManagedBufferType(dt) => unwrapb {
+              case ManagedBufferType(dt2) =>
+                sameType(dt, dt2)
+            }
+            case DepPairType(kind, x, t) => unwrapb {
+              case DepPairType(kind2, x2, t2) =>
+                assert(kind == kind2)
+                assert(x == x2)
+                sameType(t, t2)
+            }
+            case _: NatToDataApply => ??? // not implemented: throws NotImplementedError
+            case ArrayType(size, elemType) => unwrapb {
+              case ArrayType(size2, elemType2) =>
+                sameNat(size, size2)
+                sameType(elemType, elemType2)
+            }
+            case DepArrayType(_, _) => ??? // not implemented: throws NotImplementedError
+          }
+        }
+      }
+
+      private def sameNat(a: Nat, b: Nat): Unit = { // records that a and b have to be the same nat
+        val bestA = getBest(a)
+        val bestB = getBest(b)
+        if (bestA != bestB) {
+          println(s"WARNING: $bestA != $bestB")
+          val best = if (natSize(bestA) < natSize(bestB)) { bestA } else { bestB } // keep the smaller of the two (ties: bestB)
+          println(s" --> assuming they are equal and using $best")
+          map(bestA) = best
+          map(bestB) = best
+        }
+      }
+    })
+
+    r.traverse.traverse(e, new r.traverse.PureTraversal { // pass 2: replaces each nat handed to `nat` by its representative
+      override def nat: Nat => monads.Pure[Nat] = n => return_(getBest(n))
+    })
+  }
+
+  // Size of `n`, measured as the number of times ArithExpr.visit invokes its callback for it.
+  // sameNat uses it to decide which of two nats is kept as the representative.
+  private def natSize(n: Nat): Int = {
+    var visited = 0
+    arithexpr.arithmetic.ArithExpr.visit(n, _ => visited += 1)
+    visited
   }
 
   def expression(
diff --git a/src/main/scala/shine/OpenCL/Compilation/HostCodeGenerator.scala b/src/main/scala/shine/OpenCL/Compilation/HostCodeGenerator.scala
index 857cec97e..52ef0701b 100644
--- a/src/main/scala/shine/OpenCL/Compilation/HostCodeGenerator.scala
+++ b/src/main/scala/shine/OpenCL/Compilation/HostCodeGenerator.scala
@@ -52,7 +52,7 @@ case class HostCodeGenerator(override val decls: C.Compilation.CodeGenerator.Dec
     }
     val temporaries = calledKernel.paramKinds.zip(calledKernel.code.params).flatMap { case (pk, p) =>
       if (pk.kind == ParamKind.Kind.temporary) {
-        Some((pk.typ, p.t.asInstanceOf[shine.OpenCL.AST.PointerType].a))
+        Some((pk.typ, p.t.asInstanceOf[shine.OpenCL.AST.PointerType].a, "m" + p.name))
       } else {
         None
       }
@@ -67,6 +67,29 @@ case class HostCodeGenerator(override val decls: C.Compilation.CodeGenerator.Dec
           case _ => None
         }
       }
+      // TODO: could optimize temporary buffer creation/deletion
+      val createTmp = temporaries.zipWithIndex.flatMap { // host statements emitted before the kernel launch, one group per global temporary
+        case ((dt, AddressSpace.Global, name), i) => Seq(
+          C.AST.DeclStmt(C.AST.VarDecl(name, typ(ManagedBufferType(dt)), Some( // declares `name`, initialised with createBuffer(ctx, <size of dt>, <access flags>)
+            C.AST.FunCall(C.AST.DeclRef("createBuffer"), Seq(
+              C.AST.DeclRef("ctx"),
+              bufferSize(dt),
+              C.AST.Literal(accessToString(DEVICE_READ | DEVICE_WRITE))
+            ))
+          ))),
+          deviceBufferSync(s"tb${i}", C.AST.DeclRef(name), dt, DEVICE_READ | DEVICE_WRITE) // tb<i> is the name the matching kernel argument refers to
+        )
+        case _ => Seq() // temporaries in other address spaces produce no statements here
+      }
+      val destroyTmp = temporaries.flatMap { // host statements emitted after the kernel launch: destroyBuffer(ctx, name) per global temporary
+        case (dt, AddressSpace.Global, name) => Seq(
+          C.AST.ExprStmt(C.AST.FunCall(C.AST.DeclRef("destroyBuffer"), Seq(
+            C.AST.DeclRef("ctx"),
+            C.AST.DeclRef(name)
+          )))
+        )
+        case _ => Seq()
+      }
       val ndRangeTy = C.AST.ArrayType(C.AST.Type.usize, Some(3), true)
       val declGlobalSize = C.AST.DeclStmt(C.AST.VarDecl("global_size", ndRangeTy, Some(
         ArrayLiteral(ndRangeTy, NDRangeToAST(globalSize))
@@ -81,9 +104,12 @@ case class HostCodeGenerator(override val decls: C.Compilation.CodeGenerator.Dec
           ((args zip argsC).zipWithIndex.map { case ((arg, argC), i) =>
             kernelArg(i + 1, arg.t.dataType, argC)
           } ++ temporaries.zipWithIndex.map {
-            case ((dt, AddressSpace.Local), i) =>
+            case ((_, AddressSpace.Private, _), _) =>
+              throw new Exception("temporary kernel argument cannot live in private memory")
+            case ((dt, AddressSpace.Local, _), i) =>
               kernelLocalArg(i + 1 + args.size, dt)
-            case ((_, a), _) => throw new Exception(s"codegen is not implemented for temporaries in $a")
+            case ((dt, AddressSpace.Global, name), i) =>
+              kernelArg(i + 1 + args.size, dt, C.AST.DeclRef(s"tb${i}"))
           })
         )
       )))
@@ -99,8 +125,8 @@ case class HostCodeGenerator(override val decls: C.Compilation.CodeGenerator.Dec
         C.AST.DeclRef("args")
       )))
       C.AST.Block(
-        Seq(outputSync) ++ argSyncs ++
-        Seq(declGlobalSize, declLocalSize, declArgs, launchKernel)
+        Seq(outputSync) ++ argSyncs ++ createTmp ++
+        Seq(declGlobalSize, declLocalSize, declArgs, launchKernel) ++ destroyTmp
       )
     }))
   }
@@ -196,11 +222,14 @@ case class HostCodeGenerator(override val decls: C.Compilation.CodeGenerator.Dec
     }
   }
 
+  // TODO: use arith expr simplification here
   private def bufferSize(dt: DataType): Expr =
     dt match {
       case ManagedBufferType(dt) => bufferSize(dt)
-      case _: ScalarType | _: IndexType | _: VectorType | NatType =>
+      case _: ScalarType | _: IndexType | NatType =>
         C.AST.Literal(s"sizeof(${typ(dt)})")
+      case v: VectorType =>
+        C.AST.BinaryExpr(C.AST.ArithmeticExpr(v.size), BinaryOperator.*, bufferSize(v.elemType))
       case PairType(fst, snd) =>
         C.AST.BinaryExpr(bufferSize(fst), BinaryOperator.+, bufferSize(snd))
       case a: DataType.ArrayType =>
diff --git a/src/main/scala/shine/OpenCL/Compilation/Passes/HoistMemoryAllocations.scala b/src/main/scala/shine/OpenCL/Compilation/Passes/HoistMemoryAllocations.scala
index d123e97bb..914a65e24 100644
--- a/src/main/scala/shine/OpenCL/Compilation/Passes/HoistMemoryAllocations.scala
+++ b/src/main/scala/shine/OpenCL/Compilation/Passes/HoistMemoryAllocations.scala
@@ -80,9 +80,11 @@ object HoistMemoryAllocations {
                 parallelismLevel match {
                   case OpenCL.Local | OpenCL.Sequential =>
                     performRewrite(oldVariable, oldBody, i, n)
+                  case OpenCL.Global =>
+                    throw new Exception("hoisting local memory outside of global parallelism is not implemented")
                   case OpenCL.WorkGroup => // do not perform the substitution
                     (oldVariable, oldBody)
-                  case OpenCL.Global | OpenCL.Warp | OpenCL.Lane =>
+                  case OpenCL.Warp | OpenCL.Lane =>
                     throw new Exception("This should not happen")
                 }
               case AddressSpace.Private | AddressSpace.Constant | AddressSpaceIdentifier(_) =>
diff --git a/src/main/scala/shine/OpenCL/Module.scala b/src/main/scala/shine/OpenCL/Module.scala
index 0b00744f7..b54b890d7 100644
--- a/src/main/scala/shine/OpenCL/Module.scala
+++ b/src/main/scala/shine/OpenCL/Module.scala
@@ -30,6 +30,26 @@ object Module {
        |${util.gen.c.function.asString(m.hostCode)}
        |""".stripMargin
 
+  def translateToHeaderAndSource(m: Module): (String, String) = // returns (header text, source text) for the module's host code
+    (s"""
+      |#ifdef __cplusplus
+      |extern "C"
+      |{
+      |#endif
+      |${m.hostCode.includes.map(_.toString).mkString("\n")}
+      |${m.hostCode.decls.map(C.AST.Printer(_)).mkString("\n")}
+      |${m.hostCode.functions.map(f => C.AST.Printer.declFun(f.code)).mkString("\n")}
+      |#ifdef __cplusplus
+      |}
+      |#endif
+      |""".stripMargin, // header: includes, declarations and host function prototypes, wrapped in extern "C" when compiled as C++
+      s"""
+       |${m.kernels.map(kernelSource).mkString("\n")}
+       |#define loadKernel(ctx, id)\\
+       |  loadKernelFromSource(ctx, #id, id##_source, sizeof(id##_source) - 1)
+       |${util.gen.c.function.asString(m.hostCode)}
+       |""".stripMargin) // source: one kernelSource entry per kernel, the loadKernel macro, then the host code
+
   def dumpToDirectory(dir: java.io.File)(m: Module): Unit = {
     util.writeToPath(s"${dir.getAbsolutePath}/host.c",
       s"""#define loadKernel(ctx, ident) loadKernelFromFile(ctx, #ident, #ident ".cl")
diff --git a/src/main/scala/util/ExecuteOpenCL.scala b/src/main/scala/util/ExecuteOpenCL.scala
index 5d8f7c3ef..ff4300584 100644
--- a/src/main/scala/util/ExecuteOpenCL.scala
+++ b/src/main/scala/util/ExecuteOpenCL.scala
@@ -11,7 +11,7 @@ object ExecuteOpenCL {
   val platformPath = "runtime/ocl/"
   val executorHeadersPath = "lib/executor/lib/Executor/include/"
   val libs = "-lm -lOpenCL"
-  val includes = s"-I$runtimePath -I$executorHeadersPath"
+  val includes = s"-I$runtimePath -I$executorHeadersPath -I."
   val libDirs: String = tryToFindOpenCLLibDir()
 
   def tryToFindOpenCLLibDir(): String = {
@@ -45,7 +45,7 @@ object ExecuteOpenCL {
         s"""#include "host.c"
            |$mainSource""".stripMargin)
       val sources = s"$mainPath $platformPath/buffer_$buffer_impl.c $platformPath/ocl.c"
-      (s"clang -O2 $sources $includes -o $binPath $libDirs $libs -Wno-parentheses-equality" !!)
+      (s"clang -O2 $sources $includes -o $binPath $libDirs $libs -Wno-parentheses-equality" !!)
       (Process(s"$binPath", new java.io.File(genDir.getAbsolutePath)) !!)
     } catch {
       case e: Throwable =>
@@ -71,5 +71,31 @@ object ExecuteOpenCL {
         throw Exception(s"execution failed: $e")
     }
   }
+
+  @throws[Exception] // any failure (temp files, compilation, linking, execution) is logged and rethrown wrapped
+  def using_cpp(main: String, module: shine.OpenCL.Module, buffer_impl: String): String = {
+    try { // compiles `module` as C, links it with `main` compiled as C++, and returns the output of the binary
+      val (m_h, m_c) = shine.OpenCL.Module.translateToHeaderAndSource(module)
+      val module_hdr = writeToTempFile("code-", ".h", m_h).getAbsolutePath
+      val module_src = writeToTempFile("code-", ".c", m_c).getAbsolutePath
+      val main_src = writeToTempFile("code-", ".cpp", main).getAbsolutePath
+      val sources = Seq(module_src, s"$platformPath/buffer_$buffer_impl.c", s"$platformPath/ocl.c")
+      val bin = createTempFile("bin-", "").getAbsolutePath
+      try {
+        val objs = sources.map { s =>
+          val obj = s.stripSuffix(".c") + ".o"
+          if (!new java.io.File(obj).exists()) // an object that already exists next to its source is reused
+            (s"clang -c -O2 $s $includes -o $obj -Wno-parentheses-equality" !!)
+          obj
+        }.mkString(" ")
+        (s"clang++ -O2 $main_src $objs -include $module_hdr $includes -o $bin $libDirs $libs -Wno-parentheses-equality" !!)
+      } finally new java.io.File(module_src.stripSuffix(".c") + ".o").delete() // per-call object: removed even when the build fails
+      (s"$bin" !!)
+    } catch {
+      case e: Throwable =>
+        Console.err.println(s"execution failed: $e")
+        throw Exception(s"execution failed: $e")
+    }
+  }
 }
 
diff --git a/src/test/scala/shine/host.scala b/src/test/scala/shine/host.scala
index 7d88420b6..18cb580df 100644
--- a/src/test/scala/shine/host.scala
+++ b/src/test/scala/shine/host.scala
@@ -163,4 +163,22 @@ int main(int argc, char** argv) {
     findDeviceBufferSyncRead(1, hostCode)
     checkOutput(m)
   }
+
+  test("global memory") { // hosted kernel whose intermediate result goes through toGlobal
+    val e = depFun((n: Nat) => fun((n`.`i32) ->: (n`.`i32))(in =>
+      oclRun(LocalSize(16), GlobalSize(n))(
+        in |> split(16) |> mapWorkGroup(0)( // chunks of 16 elements, one chunk per work-group iteration
+          mapLocal(0)(add(li32(1))) >> // first step: adds 1 to every element
+          toGlobal >> // presumably materialises the intermediate array in global memory (TODO confirm)
+          mapLocal(0)(add(li32(2))) // second step: adds 2 to every element
+        ) |> join
+      )
+    ))
+    val m = gen.opencl.hosted.fromExpr(e)
+    val hostCode = gen.c.function.asString(m.hostCode)
+    // logger.debug(hostCode)
+    findDeviceBufferSyncWrite(1, hostCode) // NOTE(review): first argument is presumably the expected number of matches, confirm against the helper
+    findDeviceBufferSyncRead(1, hostCode)
+    checkOutput(m)
+  }
 }