diff --git a/.gitignore b/.gitignore index 6a98303bb..d61a9bf82 100644 --- a/.gitignore +++ b/.gitignore @@ -17,3 +17,5 @@ modules.xml *.pdf *.gz *.sc + +*.o diff --git a/src/main/scala/apps/harrisCornerDetection.scala b/src/main/scala/apps/harrisCornerDetection.scala index 80d426c65..9f5803b4e 100644 --- a/src/main/scala/apps/harrisCornerDetection.scala +++ b/src/main/scala/apps/harrisCornerDetection.scala @@ -12,6 +12,13 @@ import shine.OpenCL.KernelExecutor._ import scala.reflect.ClassTag +/** This version of Harris follows from the following paper: + * https://dl.acm.org/doi/abs/10.1145/2568058.2568067 + * + * Compared to Halide's version: + * - it starts from grayscale images instead of color images + * - it uses a binomial filter instead of a box filter + */ object harrisCornerDetection { private val C2D = separableConvolution2D private val id = C2D.id diff --git a/src/main/scala/apps/harrisCornerDetection2/binomialCoarsity.scala b/src/main/scala/apps/harrisCornerDetection2/binomialCoarsity.scala new file mode 100644 index 000000000..eac19963f --- /dev/null +++ b/src/main/scala/apps/harrisCornerDetection2/binomialCoarsity.scala @@ -0,0 +1,242 @@ +package apps.harrisCornerDetection2 + +import rise.core._ +import rise.core.DSL._ +import rise.core.primitives.{id => _, _} +import Type._ +import rise.core.types._ +import rise.core.types.DataType._ +import rise.openCL.DSL._ +import rise.openCL.primitives.oclRotateValues +import shine.OpenCL.{GlobalSize, LocalSize} + +object binomialCoarsity { + def check(module: shine.OpenCL.Module, h: Int, w: Int, kappa: Float): Unit = { + val main = s""" +#include "src/main/scala/apps/harrisCornerDetection2/common.cpp" + +int main(int argc, char** argv) { + Context ctx = createDefaultContext(); + size_t in_bytes = $h * $w * sizeof(float); + size_t out_bytes = ${h - 2*bd_h} * ${w - 2*bd_w} * sizeof(float); + Buffer input_ixx = createBuffer(ctx, in_bytes, HOST_WRITE | HOST_READ | DEVICE_READ); + Buffer input_ixy = createBuffer(ctx, 
in_bytes, HOST_WRITE | HOST_READ | DEVICE_READ); + Buffer input_iyy = createBuffer(ctx, in_bytes, HOST_WRITE | HOST_READ | DEVICE_READ); + Buffer output = createBuffer(ctx, out_bytes, HOST_READ | HOST_WRITE | DEVICE_WRITE); + + float* sxx_gold = (float*) malloc(out_bytes); + float* sxy_gold = (float*) malloc(out_bytes); + float* syy_gold = (float*) malloc(out_bytes); + float* out_gold = (float*) malloc(out_bytes); + + std::random_device rand_d; + std::default_random_engine rand_e(rand_d()); + // bigger range results in higher output differences + std::uniform_real_distribution dist(0, 200); + + float* in_ixx = (float*) hostBufferSync(ctx, input_ixx, in_bytes, HOST_WRITE | HOST_READ); + float* in_ixy = (float*) hostBufferSync(ctx, input_ixy, in_bytes, HOST_WRITE | HOST_READ); + float* in_iyy = (float*) hostBufferSync(ctx, input_iyy, in_bytes, HOST_WRITE | HOST_READ); + for (int y = 0; y < $h; y++) { + for (int x = 0; x < $w; x++) { + in_ixx[y*$w + x] = dist(rand_e); + in_ixy[y*$w + x] = dist(rand_e); + in_iyy[y*$w + x] = dist(rand_e); + } + } + + binomial_gold(sxx_gold, $h, $w, in_ixx); + binomial_gold(sxy_gold, $h, $w, in_ixy); + binomial_gold(syy_gold, $h, $w, in_iyy); + coarsity_gold(out_gold, ${h - 2*bd_h}, ${w - 2*bd_w}, sxx_gold, sxy_gold, syy_gold, $kappa); + + foo_init_run(ctx, output, $h, $w, input_ixx, input_ixy, input_iyy, $kappa); + + ErrorStats errors; + init_error_stats(&errors); + float* out = (float*) hostBufferSync(ctx, output, out_bytes, HOST_READ); + accumulate_error_stats(&errors, out, out_gold, ${h - 2*bd_h}, ${w - 2*bd_w}); + finish_error_stats(&errors, 0.05, 0.0001); + + free(sxx_gold); + free(sxy_gold); + free(syy_gold); + free(out_gold); + destroyBuffer(ctx, input_ixx); + destroyBuffer(ctx, input_ixy); + destroyBuffer(ctx, input_iyy); + destroyBuffer(ctx, output); + destroyContext(ctx); + return EXIT_SUCCESS; +} +""" + util.ExecuteOpenCL.using_cpp(main, module, "one_copy") + } + + val base: ToBeTyped[Expr] = + depFun(hFrom(3), (h: Nat) => + 
depFun(wFrom(12), (w: Nat) => fun( + (h`.`w`.`f32) ->: (h`.`w`.`f32) ->: (h`.`w`.`f32) ->: f32 + ->: ((h - 2*bd_h)`.`(w - 2*bd_w)`.`f32) + )((sxx, sxy, syy, kappa) => + oclRun(LocalSize(1), GlobalSize(num_threads))( + makeArray(3)(sxx)(sxy)(syy) |> + transpose >> map(transpose) >> + map(drop(bd_w-1) >> take(w - 2*(bd_w-1)) >> slide(3)(1)) >> + drop(bd_h-1) >> take(h - 2*(bd_h-1)) >> slide(3)(1) >> + map(transpose) >> + mapGlobal(mapSeq( + map(transpose) >> transpose >> + toPrivateFun(mapSeqUnroll(fun(nbh => + dotSeqU(join(binomialWeights2d))(join(nbh)) + ))) >> + letf(fun(s => { + val sxx = s `@` lidx(0, 3) + val sxy = s `@` lidx(1, 3) + val syy = s `@` lidx(2, 3) + val det = sxx * syy - sxy * sxy + val trace = sxx + syy + det - kappa * trace * trace + })) + )) + )))) + + val lineVec: ToBeTyped[Expr] = + depFun(hFrom(3), (h: Nat) => + depFun(wFrom(12), (w: Nat) => fun( + (h`.`w`.`f32) ->: (h`.`w`.`f32) ->: (h`.`w`.`f32) ->: f32 + ->: ((h - 2*bd_h)`.`(w - 2*bd_w)`.`f32) + )((sxx, sxy, syy, kappa) => + oclRun(LocalSize(1), GlobalSize(num_threads))( + makeArray(3)(sxx)(sxy)(syy) |> + map( + map(drop(bd_w-vecw) >> take(w - 2*(bd_w-vecw))) >> + drop(bd_h-1) >> take(h - 2*(bd_h-1)) >> + map(asVectorAligned(vecw)) + ) >> + transpose >> map(transpose) >> + slide(3)(1) >> mapGlobal( + transpose >> map(transpose) >> + mapSeq(mapSeqUnroll(dotSeqUWV(binomialWeightsV))) >> + toGlobal >> + slide(3)(1) >> + mapSeq( + transpose >> map(shuffle) >> + toPrivateFun(mapSeqUnroll(dotSeqUWV(binomialWeightsH))) >> + letf(fun(s => { + val sxx = s `@` lidx(0, 3) + val sxy = s `@` lidx(1, 3) + val syy = s `@` lidx(2, 3) + val det = sxx * syy - sxy * sxy + val trace = sxx + syy + det - vectorFromScalar(kappa) * trace * trace + })) + ) >> asScalar + ) + )))) + + val rotvVec: ToBeTyped[Expr] = + depFun(hFrom(3), (h: Nat) => + depFun(wFrom(12), (w: Nat) => fun( + (h`.`w`.`f32) ->: (h`.`w`.`f32) ->: (h`.`w`.`f32) + ->: f32 ->: ((h - 2*bd_h)`.`(w - 2*bd_w)`.`f32) + )((sxx, sxy, syy, kappa) => + 
oclRun(LocalSize(1), GlobalSize(num_threads))( + makeArray(3)(sxx)(sxy)(syy) |> + map( + map(drop(bd_w-vecw) >> take(w - 2*(bd_w-vecw))) >> + drop(bd_h-1) >> take(h - 2*(bd_h-1)) >> + map(asVectorAligned(vecw)) + ) >> + transpose >> map(transpose) >> + slide(3)(1) >> mapGlobal( + transpose >> map(transpose) >> + map(map(dotSeqUWV(binomialWeightsV))) >> + oclRotateValues(AddressSpace.Private)(3)(mapSeqUnroll(id)) >> iterateStream( + transpose >> map(shuffle) >> + toPrivateFun(mapSeqUnroll(dotSeqUWV(binomialWeightsH))) >> + letf(fun(s => { + val sxx = s `@` lidx(0, 3) + val sxy = s `@` lidx(1, 3) + val syy = s `@` lidx(2, 3) + val det = sxx * syy - sxy * sxy + val trace = sxx + syy + det - vectorFromScalar(kappa) * trace * trace + })) + ) >> asScalar + ) + )))) + + val tile: ToBeTyped[Expr] = { + val tile_x_in = tile_x + 2 + val tile_y_in = tile_y + 2 + depFun(hFrom(tile_y), (h: Nat) => + depFun(wFrom(tile_x), (w: Nat) => fun( + (h`.`w`.`f32) ->: (h`.`w`.`f32) ->: (h`.`w`.`f32) ->: f32 + ->: ((h - 2*bd_h)`.`(w - 2*bd_w)`.`f32) + )((sxx, sxy, syy, kappa) => + oclRun(LocalSize((tile_x, tile_y)), GlobalSize((w - 2*bd_w, h - 2*bd_h)))( + makeArray(3)(sxx)(sxy)(syy) |> + transpose >> map(transpose) >> + map(drop(bd_w-1) >> take(w - 2*(bd_w-1)) >> slide(tile_x_in)(tile_x)) >> + drop(bd_h-1) >> take(h - 2*(bd_h-1)) >> slide(tile_y_in)(tile_y) >> + map(transpose) >> + map(map( + map(slide(3)(1)) >> slide(3)(1) >> map(transpose) + )) >> + mapWorkGroup(1)(mapWorkGroup(0)( + mapLocal(1)(mapLocal(0)( + map(transpose) >> transpose >> + toPrivateFun(mapSeqUnroll(fun(nbh => + dotSeqU(join(binomialWeights2d))(join(nbh)) + ))) >> + letf(fun(s => { + val sxx = s `@` lidx(0, 3) + val sxy = s `@` lidx(1, 3) + val syy = s `@` lidx(2, 3) + val det = sxx * syy - sxy * sxy + val trace = sxx + syy + det - kappa * trace * trace + })) + )) + )) >> map(transpose) >> join >> map(join) + )))) + } + + val tileVec: ToBeTyped[Expr] = { + val tile_x_in = tile_x + 2*vecw + val tile_y_in = tile_y + 2 + 
depFun(hFrom(tile_y), (h: Nat) => + depFun(wFrom(tile_x), (w: Nat) => fun( + (h`.`w`.`f32) ->: (h`.`w`.`f32) ->: (h`.`w`.`f32) ->: f32 + ->: ((h - 2*bd_h)`.`(w - 2*bd_w)`.`f32) + )((sxx, sxy, syy, kappa) => + oclRun(LocalSize((tile_x / vecw, tile_y)), GlobalSize(((w - 2*bd_w) / vecw, h - 2*bd_h)))( + makeArray(3)(sxx)(sxy)(syy) |> + map( + map(drop(bd_w-vecw) >> take(w - 2*(bd_w-vecw)) >> slide(tile_x_in)(tile_x)) >> + drop(bd_h-1) >> take(h - 2*(bd_h-1)) >> slide(tile_y_in)(tile_y) >> + map(transpose) >> + map(map( + map(asVectorAligned(vecw) >> slide(3)(1)) >> slide(3)(1) >> map(transpose) + )) + ) >> + transpose >> map(transpose >> map(transpose >> map(transpose))) >> + mapWorkGroup(1)(mapWorkGroup(0)( + mapLocal(1)(mapLocal(0)( + toPrivateFun(mapSeqUnroll(fun(nbh => + dotSeqUWV(join(binomialWeights2d))(join(map(shuffle)(nbh))) + ))) >> + letf(fun(s => { + val sxx = s `@` lidx(0, 3) + val sxy = s `@` lidx(1, 3) + val syy = s `@` lidx(2, 3) + val det = sxx * syy - sxy * sxy + val trace = sxx + syy + det - vectorFromScalar(kappa) * trace * trace + })) + )) + )) >> map(transpose) >> join >> map(join >> asScalar) + )))) + } +} diff --git a/src/main/scala/apps/harrisCornerDetection2/coarsity.scala b/src/main/scala/apps/harrisCornerDetection2/coarsity.scala new file mode 100644 index 000000000..310057677 --- /dev/null +++ b/src/main/scala/apps/harrisCornerDetection2/coarsity.scala @@ -0,0 +1,161 @@ +package apps.harrisCornerDetection2 + +import rise.core._ +import rise.core.DSL._ +import rise.core.primitives.{id => _, _} +import Type._ +import rise.core.types._ +import rise.core.types.DataType._ +import rise.openCL.DSL._ +import shine.OpenCL.{GlobalSize, LocalSize} + +object coarsity { + def check(module: shine.OpenCL.Module, h: Int, w: Int, kappa: Float): Unit = { + val main = s""" +#include "src/main/scala/apps/harrisCornerDetection2/common.cpp" + +int main(int argc, char** argv) { + Context ctx = createDefaultContext(); + size_t bytes = $h * $w * sizeof(float); + 
Buffer input_sxx = createBuffer(ctx, bytes, HOST_WRITE | HOST_READ | DEVICE_READ); + Buffer input_sxy = createBuffer(ctx, bytes, HOST_WRITE | HOST_READ | DEVICE_READ); + Buffer input_syy = createBuffer(ctx, bytes, HOST_WRITE | HOST_READ | DEVICE_READ); + Buffer output = createBuffer(ctx, bytes, HOST_READ | HOST_WRITE | DEVICE_WRITE); + + float* out_gold = (float*) malloc(bytes); + + std::random_device rand_d; + std::default_random_engine rand_e(rand_d()); + // bigger range results in higher output differences + std::uniform_real_distribution dist(0, 200); + + float* in_sxx = (float*) hostBufferSync(ctx, input_sxx, bytes, HOST_WRITE | HOST_READ); + float* in_sxy = (float*) hostBufferSync(ctx, input_sxy, bytes, HOST_WRITE | HOST_READ); + float* in_syy = (float*) hostBufferSync(ctx, input_syy, bytes, HOST_WRITE | HOST_READ); + for (int y = 0; y < $h; y++) { + for (int x = 0; x < $w; x++) { + in_sxx[y*$w + x] = dist(rand_e); + in_sxy[y*$w + x] = dist(rand_e); + in_syy[y*$w + x] = dist(rand_e); + } + } + + coarsity_gold(out_gold, $h, $w, in_sxx, in_sxy, in_syy, $kappa); + + foo_init_run(ctx, output, $h, $w, input_sxx, input_sxy, input_syy, $kappa); + + ErrorStats errors; + init_error_stats(&errors); + float* out = (float*) hostBufferSync(ctx, output, bytes, HOST_READ); + accumulate_error_stats(&errors, out, out_gold, $h, $w); + finish_error_stats(&errors, 0.01, 0.0001); + + free(out_gold); + destroyBuffer(ctx, input_sxx); + destroyBuffer(ctx, input_sxy); + destroyBuffer(ctx, input_syy); + destroyBuffer(ctx, output); + destroyContext(ctx); + return EXIT_SUCCESS; +} +""" + util.ExecuteOpenCL.using_cpp(main, module, "one_copy") + } + + def base: ToBeTyped[Expr] = + depFun((h: Nat, w: Nat) => fun( + (h`.`w`.`f32) ->: (h`.`w`.`f32) ->: (h`.`w`.`f32) ->: f32 ->: (h`.`w`.`f32) + )((sxx, sxy, syy, kappa) => + oclRun(LocalSize(1), GlobalSize(num_threads))( + zip(sxx)(zip(sxy)(syy)) |> mapGlobal(fun(s => + zip(s._1)(zip(s._2._1)(s._2._2)) |> + mapSeq(fun(s => { + val sxx = fst(s) 
+            val sxy = fst(snd(s))
+            val syy = snd(snd(s))
+            val det = sxx * syy - sxy * sxy
+            val trace = sxx + syy
+            det - kappa * trace * trace
+          }))
+        ))
+      )))
+
+  /** Vectorised variant of the coarsity kernel: each row is reinterpreted as
+    * vectors of `vecw` lanes, `det - kappa * trace^2` is evaluated per vector
+    * (kappa broadcast with `vectorFromScalar`), and the row is turned back
+    * into scalars with `asScalar`.
+    */
+  val vec: ToBeTyped[Expr] =
+    depFun((h: Nat, w: Nat) => fun(
+      (h`.`w`.`f32) ->: (h`.`w`.`f32) ->: (h`.`w`.`f32) ->: f32 ->: (h`.`w`.`f32)
+    )((sxx, sxy, syy, kappa) =>
+      oclRun(LocalSize(1), GlobalSize(num_threads))(
+        zip(sxx)(zip(sxy)(syy)) |> mapGlobal(fun(s =>
+          zip(asVectorAligned(vecw)(s._1))(zip(asVectorAligned(vecw)(s._2._1))(asVectorAligned(vecw)(s._2._2))) |>
+          mapSeq(fun(s => {
+            val sxx = fst(s)
+            val sxy = fst(snd(s))
+            val syy = snd(snd(s))
+            val det = sxx * syy - sxy * sxy
+            val trace = sxx + syy
+            det - vectorFromScalar(kappa) * trace * trace
+          })) >>
+          asScalar
+        ))
+      )))
+
+  /** Tiled scalar variant: the image is cut into `tile_y` x `tile_x` tiles,
+    * one work-group per tile and one work-item per pixel, which is what the
+    * `LocalSize((tile_x, tile_y))` / `GlobalSize((w, h))` launch provides.
+    *
+    * This used to carry the vectorised body of `tileVec` verbatim, so only
+    * `tile_x / vecw` work-items per tile row had work and the variant was a
+    * duplicate of `tileVec` rather than its scalar counterpart. The shape now
+    * mirrors `mul.tile`; the computed values are unchanged.
+    */
+  val tile: ToBeTyped[Expr] = {
+    val tile_x_in = tile_x
+    val tile_y_in = tile_y
+    depFun(hFrom(tile_y), (h: Nat) =>
+    depFun(wFrom(tile_x), (w: Nat) => fun(
+      (h`.`w`.`f32) ->: (h`.`w`.`f32) ->: (h`.`w`.`f32) ->: f32 ->: (h`.`w`.`f32)
+    )((sxx, sxy, syy, kappa) =>
+      oclRun(LocalSize((tile_x, tile_y)), GlobalSize((w, h)))(
+        zip2D(sxx, zip2D(sxy, syy)) |>
+        map(slide(tile_x_in)(tile_x)) |>
+        slide(tile_y_in)(tile_y) |>
+        map(transpose) |>
+        mapWorkGroup(1)(mapWorkGroup(0)(
+          // one scalar (sxx, (sxy, syy)) element per work-item
+          mapLocal(1)(mapLocal(0)(fun(s => {
+            val sxx = fst(s)
+            val sxy = fst(snd(s))
+            val syy = snd(snd(s))
+            val det = sxx * syy - sxy * sxy
+            val trace = sxx + syy
+            det - kappa * trace * trace
+          })))
+        )) >> map(transpose) >> join >> map(join)
+      ))))
+  }
+
+  /** Tiled and vectorised variant: same tiling as `tile`, but each tile row
+    * is processed as vectors of `vecw` lanes, hence the launch sizes divided
+    * by `vecw` in dimension 0.
+    */
+  val tileVec: ToBeTyped[Expr] = {
+    val tile_x_in = tile_x
+    val tile_y_in = tile_y
+    depFun(hFrom(tile_y), (h: Nat) =>
+    depFun(wFrom(tile_x), (w: Nat) => fun(
+      (h`.`w`.`f32) ->: (h`.`w`.`f32) ->: (h`.`w`.`f32) ->: f32 ->: (h`.`w`.`f32)
+    )((sxx, sxy, syy, kappa) =>
+      oclRun(LocalSize((tile_x / vecw, tile_y)),
GlobalSize((w / vecw, h)))( + zip2D(sxx, zip2D(sxy, syy)) |> + map(slide(tile_x_in)(tile_x)) |> + slide(tile_y_in)(tile_y) |> + map(transpose) |> + mapWorkGroup(1)(mapWorkGroup(0)( + mapLocal(1)(fun(s => + zip(asVectorAligned(vecw)(unzip(s)._1))( + zip(asVectorAligned(vecw)(unzip(unzip(s)._2)._1))( + asVectorAligned(vecw)(unzip(unzip(s)._2)._2))) |> + mapLocal(0)(fun(s => { + val sxx = fst(s) + val sxy = fst(snd(s)) + val syy = snd(snd(s)) + val det = sxx * syy - sxy * sxy + val trace = sxx + syy + det - vectorFromScalar(kappa) * trace * trace + })) + )) + )) >> map(transpose) >> join >> map(join >> asScalar) + )))) + } +} diff --git a/src/main/scala/apps/harrisCornerDetection2/common.cpp b/src/main/scala/apps/harrisCornerDetection2/common.cpp new file mode 100644 index 000000000..77642f0c0 --- /dev/null +++ b/src/main/scala/apps/harrisCornerDetection2/common.cpp @@ -0,0 +1,145 @@ +#include +#include +#include +#include +#include +#include + +extern "C" { +#include "ocl/ocl.h" +AccessFlags operator|(AccessFlags a, AccessFlags b) { + return static_cast(static_cast(a) | static_cast(b)); } +} + +// TODO: pass these in from Scala? 
+const int bd_h = 16;
+const int bd_w = 32;
+
+// Running error statistics between a computed image and its reference.
+struct ErrorStats {
+  float min_val;   // smallest reference value seen
+  float max_val;   // largest reference value seen
+  double min;      // smallest absolute per-pixel difference
+  double max;      // largest absolute per-pixel difference
+  double max_mse;  // largest mean squared error over the accumulated images
+};
+
+// Resets `es` so that the first accumulated image defines every extremum.
+void init_error_stats(ErrorStats* es) {
+  es->min_val = 1.f / 0.f;   // +infinity
+  es->max_val = -1.f / 0.f;  // -infinity
+  es->min = 1.f / 0.f;
+  es->max = 0.f;
+  es->max_mse = 0.f;
+}
+
+// Folds one h x w image pair into `es`: `a` is the computed output, `b` the
+// reference (its values feed min_val / max_val).
+void accumulate_error_stats(ErrorStats* es, float* a, float* b, int h, int w) {
+  double square_sum = 0.f;
+  for (int y = 0; y < h; y++) {
+    for (int x = 0; x < w; x++) {
+      es->min_val = std::min(es->min_val, b[y*w + x]);
+      es->max_val = std::max(es->max_val, b[y*w + x]);
+      double delta = a[y*w + x] - b[y*w + x];
+      // std::abs selects the floating-point overload; an unqualified abs()
+      // can bind to the C `int abs(int)` and truncate every error below 1.0
+      // to zero, which would make the tolerance check pass vacuously.
+      double d_abs = std::abs(delta);
+      // NaN never wins a std::min / std::max comparison, so it would go
+      // unnoticed; count it as an infinite error instead.
+      if (d_abs != d_abs) {
+        d_abs = 1.0 / 0.0;
+      }
+      es->min = std::min(es->min, d_abs);
+      es->max = std::max(es->max, d_abs);
+      square_sum += d_abs * d_abs;
+    }
+  }
+  es->max_mse = std::max(es->max_mse, square_sum / (h * w));
+}
+
+// Prints the accumulated error range and MSE to stderr and terminates the
+// process with EXIT_FAILURE when either tolerance is exceeded.
+void finish_error_stats(ErrorStats* es, float tolerated_per_pixel, float tolerated_mse) {
+  fprintf(stderr, "errors: [%.4lf - %.4lf] with %.4lf MSE\n",
+    es->min, es->max, es->max_mse);
+  if (es->max > tolerated_per_pixel || es->max_mse > tolerated_mse) {
+    fprintf(stderr, "maximum tolerated error: %.4f per pixel and %.4f MSE\n",
+      tolerated_per_pixel, tolerated_mse);
+    fprintf(stderr, "value range: [%.4f - %.4f]\n", es->min_val, es->max_val);
+    exit(EXIT_FAILURE);
+  }
+}
+
+// Reference 3x3 convolution. `in` is a full h x w image; `out` is the
+// (h - 2*bd_h) x (w - 2*bd_w) interior, i.e. a border of bd_h rows and bd_w
+// columns is skipped on each side. `weights` holds the 9 taps in row-major
+// order.
+void conv3x3_gold(float* out,
+  int h, int w,
+  const float* in,
+  const float* weights)
+{
+  for (int y = 0; y < (h - 2*bd_h); y++) {
+    // offsets of the three input rows under the stencil
+    int r0 = (y + bd_h - 1) * w;
+    int r1 = (y + bd_h) * w;
+    int r2 = (y + bd_h + 1) * w;
+    for (int x = 0; x < (w - 2*bd_w); x++) {
+      // the three input columns under the stencil
+      int c0 = x + (bd_w - 1);
+      int c1 = x + bd_w;
+      int c2 = x + (bd_w + 1);
+      out[y*(w - 2*bd_w)+x] = (
+        weights[0]*in[r0+c0] + weights[1]*in[r0+c1] + weights[2]*in[r0+c2] +
+        weights[3]*in[r1+c0] + weights[4]*in[r1+c1] + weights[5]*in[r1+c2] +
+        weights[6]*in[r2+c0] + weights[7]*in[r2+c1] + weights[8]*in[r2+c2]
+      );
+    }
+  }
+}
+
+void sobelX_gold(float* out,
+  int h, int w,
+  const float* in)
+{
+  float
weights[9] = { + -1.f/8.f, 0.f, 1.f/8.f, + -2.f/8.f, 0.f, 2.f/8.f, + -1.f/8.f, 0.f, 1.f/8.f + }; + conv3x3_gold(out, h, w, in, weights); +} + +void sobelY_gold(float* out, + int h, int w, + const float* in) +{ + float weights[9] = { + -1.f/8.f, -2.f/8.f, -1.f/8.f, + 0.f/8.f, 0.f/8.f, 0.f/8.f, + 1.f/8.f, 2.f/8.f, 1.f/8.f + }; + conv3x3_gold(out, h, w, in, weights); +} + +void binomial_gold(float* out, + int h, int w, + const float* in) +{ + float weights[9] = { + 1.f/16.f, 2.f/16.f, 1.f/16.f, + 2.f/16.f, 4.f/16.f, 2.f/16.f, + 1.f/16.f, 2.f/16.f, 1.f/16.f + }; + conv3x3_gold(out, h, w, in, weights); +} + +void mul_gold(float* out, + int h, int w, + const float* a, + const float* b) +{ + for (int y = 0; y < h; y++) { + for (int x = 0; x < w; x++) { + out[y*w + x] = a[y*w + x] * b[y*w + x]; + } + } +} + +void coarsity_gold(float* out, + int h, int w, + const float* sxx, + const float* sxy, + const float* syy, + float kappa) +{ + for (int y = 0; y < h; y++) { + for (int x = 0; x < w; x++) { + float det = sxx[y*w + x] * syy[y*w + x] - sxy[y*w + x] * sxy[y*w + x]; + float trace = sxx[y*w + x] + syy[y*w + x]; + out[y*w + x] = det - kappa * trace * trace; + } + } +} diff --git a/src/main/scala/apps/harrisCornerDetection2/convolutions.scala b/src/main/scala/apps/harrisCornerDetection2/convolutions.scala new file mode 100644 index 000000000..40b4ba415 --- /dev/null +++ b/src/main/scala/apps/harrisCornerDetection2/convolutions.scala @@ -0,0 +1,160 @@ +package apps.harrisCornerDetection2 + +import rise.core._ +import rise.core.DSL._ +import rise.core.primitives.{id => _, _} +import Type._ +import rise.core.types._ +import rise.core.types.DataType._ +import rise.core.DSL.Type._ +import rise.openCL.DSL._ +import rise.openCL.primitives.oclRotateValues +import shine.OpenCL.{GlobalSize, LocalSize} + +object convolutions { + def check(prelude: String, module: shine.OpenCL.Module, h: Int, w: Int): Unit = { + val main = s""" +${prelude} + +int main(int argc, char** argv) { + Context 
ctx = createDefaultContext(); + size_t in_bytes = $h * $w * sizeof(float); + size_t out_bytes = ${h - 2*bd_h} * ${w - 2*bd_w} * sizeof(float); + Buffer input = createBuffer(ctx, in_bytes, HOST_WRITE | HOST_READ | DEVICE_READ); + Buffer output = createBuffer(ctx, out_bytes, HOST_READ | HOST_WRITE | DEVICE_WRITE); + + float* out_gold = (float*) malloc(out_bytes); + + std::random_device rand_d; + std::default_random_engine rand_e(rand_d()); + // bigger range results in higher output differences + std::uniform_real_distribution dist(0, 200); + + float* in = (float*) hostBufferSync(ctx, input, in_bytes, HOST_WRITE | HOST_READ); + for (int y = 0; y < $h; y++) { + for (int x = 0; x < $w; x++) { + in[y*$w + x] = dist(rand_e); + } + } + + gold(out_gold, in); + + generated(ctx, output, input); + + ErrorStats errors; + init_error_stats(&errors); + float* out = (float*) hostBufferSync(ctx, output, out_bytes, HOST_READ); + accumulate_error_stats(&errors, out, out_gold, ${h - 2*bd_h}, ${w - 2*bd_w}); + finish_error_stats(&errors, 0.01, 0.0001); + + free(out_gold); + destroyBuffer(ctx, input); + destroyBuffer(ctx, output); + destroyContext(ctx); + return EXIT_SUCCESS; +} +""" + util.ExecuteOpenCL.using_cpp(main, module, "one_copy") + } + + def base(weights2d: ToBeTyped[Expr]): ToBeTyped[Expr] = + depFun(hFrom(3), (h: Nat) => + depFun(wFrom(12), (w: Nat) => fun( + (h`.`w`.`f32) ->: ((h - 2*bd_h)`.`(w-2*bd_w)`.`f32) + )(input => + oclRun(LocalSize(1), GlobalSize(num_threads))( + input |> + map(drop(bd_w-1) >> take(w - 2*(bd_w-1)) >> slide(3)(1)) >> + drop(bd_h-1) >> take(h - 2*(bd_h-1)) >> slide(3)(1) >> + map(transpose) >> + mapGlobal(mapSeq(fun(nbh => + dotSeqU(join(weights2d))(join(nbh)) + ))) + )))) + + def lineVec(weightsV: ToBeTyped[Expr], weightsH: ToBeTyped[Expr]): ToBeTyped[Expr] = + depFun(hFrom(3), (h: Nat) => + depFun(wFrom(12), (w: Nat) => fun( + (h`.`w`.`f32) ->: ((h - 2*bd_h)`.`(w - 2*bd_w)`.`f32) + )(input => + oclRun(LocalSize(1), GlobalSize(num_threads))( + input 
|> + map(drop(bd_w-vecw) >> take(w - 2*(bd_w-vecw))) >> + drop(bd_h-1) >> take(h - 2*(bd_h-1)) >> + map(asVectorAligned(vecw)) >> slide(3)(1) >> mapGlobal( + transpose >> + mapSeq(dotSeqUWV(weightsV)) >> + toGlobal >> + // toLocal >> + slide(3)(1) >> + mapSeq(shuffle >> dotSeqUWV(weightsH)) >> + asScalar + ) + )))) + + def rotvVec(weightsV: ToBeTyped[Expr], weightsH: ToBeTyped[Expr]): ToBeTyped[Expr] = + depFun(hFrom(3), (h: Nat) => + depFun(wFrom(12), (w: Nat) => fun( + (h`.`w`.`f32) ->: ((h - 2*bd_h)`.`(w - 2*bd_w)`.`f32) + )(input => + oclRun(LocalSize(1), GlobalSize(num_threads))( + input |> + map(drop(bd_w-vecw) >> take(w - 2*(bd_w-vecw))) >> + drop(bd_h-1) >> take(h - 2*(bd_h-1)) >> + map(asVectorAligned(vecw)) >> slide(3)(1) >> mapGlobal( + transpose >> + map(dotSeqUWV(weightsV)) >> + oclRotateValues(AddressSpace.Private)(3)(id) >> iterateStream( + shuffle >> dotSeqUWV(weightsH) + ) >> + asScalar + ) + )))) + + def tile(weights2d: ToBeTyped[Expr]): ToBeTyped[Expr] = { + val tile_x_in = tile_x + 2 + val tile_y_in = tile_y + 2 + depFun(hFrom(tile_y), (h: Nat) => + depFun(wFrom(tile_x), (w: Nat) => fun( + (h`.`w`.`f32) ->: ((h - 2*bd_h)`.`(w - 2*bd_w)`.`f32) + )(input => + oclRun(LocalSize((tile_x, tile_y)), GlobalSize(((w - 2*bd_w), h - 2*bd_h)))( + input |> + map(drop(bd_w-1) >> take(w - (2*bd_w-2)) >> slide(tile_x_in)(tile_x)) >> + drop(bd_h-1) >> take(h - (2*bd_h-2)) >> slide(tile_y_in)(tile_y) >> + map(transpose) >> + map(map( + map(slide(3)(1)) >> slide(3)(1) >> map(transpose) + )) >> + mapWorkGroup(1)(mapWorkGroup(0)( + mapLocal(1)(mapLocal(0)(fun(nbh => + dotSeqU(join(weights2d))(join(nbh)) + ))) + )) >> map(transpose) >> join >> map(join) + )))) + } + + def tileVec(weights2d: ToBeTyped[Expr]): ToBeTyped[Expr] = { + val tile_x_in = tile_x + 2*vecw + val tile_y_in = tile_y + 2 + depFun(hFrom(tile_y), (h: Nat) => + depFun(wFrom(tile_x), (w: Nat) => fun( + (h`.`w`.`f32) ->: ((h - 2*bd_h)`.`(w - 2*bd_w)`.`f32) + )(input => + oclRun(LocalSize((tile_x / vecw, 
tile_y)), GlobalSize(((w - 2*bd_w) / vecw, h - 2*bd_h)))( + input |> + map(drop(bd_w-vecw) >> take(w - 2*(bd_w-vecw)) >> slide(tile_x_in)(tile_x)) >> + drop(bd_h-1) >> take(h - 2*(bd_h-1)) >> slide(tile_y_in)(tile_y) >> + map(transpose) >> + map(map( + map(asVectorAligned(vecw) >> slide(3)(1)) >> slide(3)(1) >> map(transpose) + )) >> + mapWorkGroup(1)(mapWorkGroup(0)( + mapLocal(1)(mapLocal(0)(fun(nbh => + dotSeqUWV(join(weights2d))(join(map(shuffle)(nbh))) + ))) + )) >> map(transpose) >> join >> map(join >> asScalar) + )) + )) + } +} diff --git a/src/main/scala/apps/harrisCornerDetection2/generateCode.scala b/src/main/scala/apps/harrisCornerDetection2/generateCode.scala new file mode 100644 index 000000000..3f5b0baa8 --- /dev/null +++ b/src/main/scala/apps/harrisCornerDetection2/generateCode.scala @@ -0,0 +1,110 @@ +package apps.harrisCornerDetection2 + +object generateCode { + val H = 1024 + val W = 2048 + val kappa = 0.04f + + def checkBinomial(m: shine.OpenCL.Module): Unit = { + val prelude = s""" +#include "src/main/scala/apps/harrisCornerDetection2/common.cpp" +#define gold(o, i) binomial_gold(o, $H, $W, i) +#define generated(ctx, o, i) foo_init_run(ctx, o, $H, $W, i) +""" + convolutions.check(prelude, m, H, W) + } + + def checkSobelX(m: shine.OpenCL.Module): Unit = { + val prelude = s""" +#include "src/main/scala/apps/harrisCornerDetection2/common.cpp" +#define gold(o, i) sobelX_gold(o, $H, $W, i) +#define generated(ctx, o, i) foo_init_run(ctx, o, $H, $W, i) +""" + convolutions.check(prelude, m, H, W) + } + + def checkSobelY(m: shine.OpenCL.Module): Unit = { + val prelude = s""" +#include "src/main/scala/apps/harrisCornerDetection2/common.cpp" +#define gold(o, i) sobelY_gold(o, $H, $W, i) +#define generated(ctx, o, i) foo_init_run(ctx, o, $H, $W, i) +""" + convolutions.check(prelude, m, H, W) + } + + def main(args: Array[String]): Unit = { + val kernels = Seq[(String, rise.core.DSL.ToBeTyped[rise.core.Expr], shine.OpenCL.Module => Unit)]( + ("binomial-base", 
convolutions.base(binomialWeights2d), checkBinomial), + ("binomial-line-vec", convolutions.lineVec(binomialWeightsV, binomialWeightsH), checkBinomial), + ("binomial-rotv-vec", convolutions.rotvVec(binomialWeightsV, binomialWeightsH), checkBinomial), + ("binomial-tile", convolutions.tile(binomialWeights2d), checkBinomial), + ("binomial-tile-vec", convolutions.tileVec(binomialWeights2d), checkBinomial), + + ("sobelX-base", convolutions.base(sobelXWeights2d), checkSobelX), + ("sobelX-line-vec", convolutions.lineVec(sobelXWeightsV, sobelXWeightsH), checkSobelX), + ("sobelX-rotv-vec", convolutions.rotvVec(sobelXWeightsV, sobelXWeightsH), checkSobelX), + ("sobelX-tile", convolutions.tile(sobelXWeights2d), checkSobelX), + ("sobelX-tile-vec", convolutions.tileVec(sobelXWeights2d), checkSobelX), + + ("sobelY-base", convolutions.base(sobelYWeights2d), checkSobelY), + ("sobelY-line-vec", convolutions.lineVec(sobelYWeightsV, sobelYWeightsH), checkSobelY), + ("sobelY-rotv-vec", convolutions.rotvVec(sobelYWeightsV, sobelYWeightsH), checkSobelY), + ("sobelY-tile", convolutions.tile(sobelYWeights2d), checkSobelY), + ("sobelY-tile-vec", convolutions.tileVec(sobelYWeights2d), checkSobelY), + + ("mul-base", mul.base, mul.check(_, H, W)), + ("mul-vec", mul.vec, mul.check(_, H, W)), + ("mul-tile", mul.tile, mul.check(_, H, W)), + ("mul-tileVec", mul.tileVec, mul.check(_, H, W)), + + ("coarsity-base", coarsity.base, coarsity.check(_, H, W, kappa)), + ("coarsity-vec", coarsity.vec, coarsity.check(_, H, W, kappa)), + ("coarsity-tile", coarsity.tile, coarsity.check(_, H, W, kappa)), + ("coarsity-tileVec", coarsity.tileVec, coarsity.check(_, H, W, kappa)), + + //// + + ("sobelXYMul-base", sobelXYMul.base, sobelXYMul.check(_, H, W)), + ("sobelXYMul-line-vec", sobelXYMul.lineVec, sobelXYMul.check(_, H, W)), + ("sobelXYMul-rotv-vec", sobelXYMul.rotvVec, sobelXYMul.check(_, H, W)), + ("sobelXYMul-tile", sobelXYMul.tile, sobelXYMul.check(_, H, W)), + ("sobelXYMul-tile-vec", sobelXYMul.tileVec, 
sobelXYMul.check(_, H, W)), + + ("binomialCoarsity-base", binomialCoarsity.base, binomialCoarsity.check(_, H, W, kappa)), + ("binomialCoarsity-line-vec", binomialCoarsity.lineVec, binomialCoarsity.check(_, H, W, kappa)), + ("binomialCoarsity-rotv-vec", binomialCoarsity.rotvVec, binomialCoarsity.check(_, H, W, kappa)), + ("binomialCoarsity-tile", binomialCoarsity.tile, binomialCoarsity.check(_, H, W, kappa)), + ("binomialCoarsity-tile-vec", binomialCoarsity.tileVec, binomialCoarsity.check(_, H, W, kappa)), + + //// + + ("sobelXY-base", sobelXY.base, sobelXY.check(_, H, W)), + ("sobelXY-line-vec", sobelXY.lineVec, sobelXY.check(_, H, W)), + ("sobelXY-rotv-vec", sobelXY.rotvVec, sobelXY.check(_, H, W)), + ("sobelXY-tile", sobelXY.tile, sobelXY.check(_, H, W)), + ("sobelXY-tile-vec", sobelXY.tileVec, sobelXY.check(_, H, W)), + + ("mulBinomialCoarsity-base", mulBinomialCoarsity.base, mulBinomialCoarsity.check(_, H, W, kappa)), + ("mulBinomialCoarsity-line-vec", mulBinomialCoarsity.lineVec, mulBinomialCoarsity.check(_, H, W, kappa)), + ("mulBinomialCoarsity-rotv-vec", mulBinomialCoarsity.rotvVec, mulBinomialCoarsity.check(_, H, W, kappa)), + ("mulBinomialCoarsity-tile", mulBinomialCoarsity.tile, mulBinomialCoarsity.check(_, H, W, kappa)), + ("mulBinomialCoarsity-tile-vec", mulBinomialCoarsity.tileVec, mulBinomialCoarsity.check(_, H, W, kappa)), + + //// + + // TODO + ) + + java.nio.file.Files.createDirectories(java.nio.file.Paths.get("/tmp/harris/")) + + for ((name, prog, check) <- kernels) { + println(name) + // val p: rise.core.Expr = rise.core.DSL.toBeTyped( + // rise.eqsat.Expr.toNamedUnique(rise.eqsat.Expr.fromNamed(prog.toUntypedExpr))) + val m = util.gen.opencl.hosted.fromExpr(prog) + val c = util.gen.opencl.hosted.asString(m) + util.writeToPath(s"/tmp/harris/${name}.c", c) + check(m) + } + } +} diff --git a/src/main/scala/apps/harrisCornerDetection2/harris.scala b/src/main/scala/apps/harrisCornerDetection2/harris.scala new file mode 100644 index 
000000000..db90f1045 --- /dev/null +++ b/src/main/scala/apps/harrisCornerDetection2/harris.scala @@ -0,0 +1,17 @@ +package apps.harrisCornerDetection2 + +import rise.core.DSL.Type._ +import rise.core.DSL._ +import rise.core._ +import rise.core.primitives.{id => _, _} +import rise.core.types.DataType._ +import rise.core.types._ +import rise.openCL.DSL._ +import rise.openCL.primitives.oclRotateValues +import shine.OpenCL.{GlobalSize, LocalSize} + +object harris { + // TODO: tile / tileVec + // TODO: h1ModRotvVec + // TODO: h2ModRotvVec +} diff --git a/src/main/scala/apps/harrisCornerDetection2/mul.scala b/src/main/scala/apps/harrisCornerDetection2/mul.scala new file mode 100644 index 000000000..ffb799089 --- /dev/null +++ b/src/main/scala/apps/harrisCornerDetection2/mul.scala @@ -0,0 +1,125 @@ +package apps.harrisCornerDetection2 + +import rise.core.DSL.Type._ +import rise.core.DSL._ +import rise.core._ +import rise.core.primitives.{id => _, _} +import rise.core.types.DataType._ +import rise.core.types._ +import rise.openCL.DSL._ +import shine.OpenCL.{GlobalSize, LocalSize} + +object mul { + def check(module: shine.OpenCL.Module, h: Int, w: Int): Unit = { + val main = s""" +#include "src/main/scala/apps/harrisCornerDetection2/common.cpp" + +int main(int argc, char** argv) { + Context ctx = createDefaultContext(); + size_t bytes = $h * $w * sizeof(float); + Buffer input_a = createBuffer(ctx, bytes, HOST_WRITE | HOST_READ | DEVICE_READ); + Buffer input_b = createBuffer(ctx, bytes, HOST_WRITE | HOST_READ | DEVICE_READ); + Buffer output = createBuffer(ctx, bytes, HOST_READ | HOST_WRITE | DEVICE_WRITE); + + float* out_gold = (float*) malloc(bytes); + + std::random_device rand_d; + std::default_random_engine rand_e(rand_d()); + // bigger range results in higher output differences + std::uniform_real_distribution dist(0, 200); + + float* in_a = (float*) hostBufferSync(ctx, input_a, bytes, HOST_WRITE | HOST_READ); + float* in_b = (float*) hostBufferSync(ctx, input_b, bytes, 
HOST_WRITE | HOST_READ); + for (int y = 0; y < $h; y++) { + for (int x = 0; x < $w; x++) { + in_a[y*$w + x] = dist(rand_e); + in_b[y*$w + x] = dist(rand_e); + } + } + + mul_gold(out_gold, $h, $w, in_a, in_b); + + foo_init_run(ctx, output, $h, $w, input_a, input_b); + + ErrorStats errors; + init_error_stats(&errors); + float* out = (float*) hostBufferSync(ctx, output, bytes, HOST_READ); + accumulate_error_stats(&errors, out, out_gold, $h, $w); + finish_error_stats(&errors, 0.01, 0.0001); + + free(out_gold); + destroyBuffer(ctx, input_a); + destroyBuffer(ctx, input_b); + destroyBuffer(ctx, output); + destroyContext(ctx); + return EXIT_SUCCESS; +} +""" + util.ExecuteOpenCL.using_cpp(main, module, "one_copy") + } + + def base: ToBeTyped[Expr] = + depFun((h: Nat, w: Nat) => fun( + (h`.`w`.`f32) ->: (h`.`w`.`f32) ->: (h`.`w`.`f32) + )((a, b) => + oclRun(LocalSize(1), GlobalSize(num_threads))( + zip(a)(b) |> mapGlobal(fun(ab => + zip(ab._1)(ab._2) |> + mapSeq(mulT) + )) + ))) + + val vec: ToBeTyped[Expr] = + depFun((h: Nat, w: Nat) => fun( + (h`.`w`.`f32) ->: (h`.`w`.`f32) ->: (h`.`w`.`f32) + )((a, b) => + oclRun(LocalSize(1), GlobalSize(num_threads))( + zip(a)(b) |> mapGlobal(fun(ab => + zip(asVectorAligned(vecw)(ab._1))(asVectorAligned(vecw)(ab._2)) |> + mapSeq(mulT) >> + asScalar + )) + ))) + + val tile: ToBeTyped[Expr] = { + val tile_x_in = tile_x + val tile_y_in = tile_y + depFun(hFrom(tile_y), (h: Nat) => + depFun(wFrom(tile_x), (w: Nat) => fun( + (h`.`w`.`f32) ->: (h`.`w`.`f32) ->: (h`.`w`.`f32) + )((a, b) => + oclRun(LocalSize((tile_x, tile_y)), GlobalSize((w, h)))( + zip2D(a, b) |> + map(slide(tile_x_in)(tile_x)) |> + slide(tile_y_in)(tile_y) |> + map(transpose) |> + mapWorkGroup(1)(mapWorkGroup(0)( + mapLocal(1)( + mapLocal(0)(mulT) + ) + )) >> map(transpose) >> join >> map(join) + )))) + } + + val tileVec: ToBeTyped[Expr] = { + val tile_x_in = tile_x + val tile_y_in = tile_y + depFun(hFrom(tile_y), (h: Nat) => + depFun(wFrom(tile_x), (w: Nat) => fun( + 
(h`.`w`.`f32) ->: (h`.`w`.`f32) ->: (h`.`w`.`f32) + )((a, b) => + oclRun(LocalSize((tile_x / vecw, tile_y)), GlobalSize((w / vecw, h)))( + zip2D(a, b) |> + map(slide(tile_x_in)(tile_x)) |> + slide(tile_y_in)(tile_y) |> + map(transpose) |> + mapWorkGroup(1)(mapWorkGroup(0)( + mapLocal(1)(fun(ab => + zip(asVectorAligned(vecw)(unzip(ab)._1))( + asVectorAligned(vecw)(unzip(ab)._2)) |> + mapLocal(0)(mulT) + )) + )) >> map(transpose) >> join >> map(join >> asScalar) + )))) + } +} diff --git a/src/main/scala/apps/harrisCornerDetection2/mulBinomialCoarsity.scala b/src/main/scala/apps/harrisCornerDetection2/mulBinomialCoarsity.scala new file mode 100644 index 000000000..9cb21844c --- /dev/null +++ b/src/main/scala/apps/harrisCornerDetection2/mulBinomialCoarsity.scala @@ -0,0 +1,273 @@ +package apps.harrisCornerDetection2 + +import rise.core.DSL.Type._ +import rise.core.DSL._ +import rise.core._ +import rise.core.primitives.{id => _, _} +import rise.core.types.DataType._ +import rise.core.types._ +import rise.openCL.DSL._ +import rise.openCL.primitives.oclRotateValues +import shine.OpenCL.{GlobalSize, LocalSize} + +object mulBinomialCoarsity { + def check(module: shine.OpenCL.Module, h: Int, w: Int, kappa: Float): Unit = { + val main = s""" +#include "src/main/scala/apps/harrisCornerDetection2/common.cpp" + +int main(int argc, char** argv) { + Context ctx = createDefaultContext(); + size_t in_bytes = $h * $w * sizeof(float); + size_t out_bytes = ${h - 2*bd_h} * ${w - 2*bd_w} * sizeof(float); + Buffer input_ix = createBuffer(ctx, in_bytes, HOST_WRITE | HOST_READ | DEVICE_READ); + Buffer input_iy = createBuffer(ctx, in_bytes, HOST_WRITE | HOST_READ | DEVICE_READ); + Buffer output = createBuffer(ctx, out_bytes, HOST_READ | HOST_WRITE | DEVICE_WRITE); + + float* ixx_gold = (float*) malloc(in_bytes); + float* ixy_gold = (float*) malloc(in_bytes); + float* iyy_gold = (float*) malloc(in_bytes); + float* sxx_gold = (float*) malloc(out_bytes); + float* sxy_gold = (float*) 
malloc(out_bytes); + float* syy_gold = (float*) malloc(out_bytes); + float* out_gold = (float*) malloc(out_bytes); + + std::random_device rand_d; + std::default_random_engine rand_e(rand_d()); + // bigger range results in higher output differences + std::uniform_real_distribution<float> dist(0, 50); + + float* in_ix = (float*) hostBufferSync(ctx, input_ix, in_bytes, HOST_WRITE | HOST_READ); + float* in_iy = (float*) hostBufferSync(ctx, input_iy, in_bytes, HOST_WRITE | HOST_READ); + for (int y = 0; y < $h; y++) { + for (int x = 0; x < $w; x++) { + in_ix[y*$w + x] = dist(rand_e); + in_iy[y*$w + x] = dist(rand_e); + } + } + + mul_gold(ixx_gold, $h, $w, in_ix, in_ix); + mul_gold(ixy_gold, $h, $w, in_ix, in_iy); + mul_gold(iyy_gold, $h, $w, in_iy, in_iy); + binomial_gold(sxx_gold, $h, $w, ixx_gold); + binomial_gold(sxy_gold, $h, $w, ixy_gold); + binomial_gold(syy_gold, $h, $w, iyy_gold); + coarsity_gold(out_gold, ${h - 2*bd_h}, ${w - 2*bd_w}, sxx_gold, sxy_gold, syy_gold, $kappa); + + foo_init_run(ctx, output, $h, $w, input_ix, input_iy, $kappa); + + ErrorStats errors; + init_error_stats(&errors); + float* out = (float*) hostBufferSync(ctx, output, out_bytes, HOST_READ); + accumulate_error_stats(&errors, out, out_gold, ${h - 2*bd_h}, ${w - 2*bd_w}); + finish_error_stats(&errors, 5.0, 0.01); + + free(ixx_gold); + free(ixy_gold); + free(iyy_gold); + free(sxx_gold); + free(sxy_gold); + free(syy_gold); + free(out_gold); + destroyBuffer(ctx, input_ix); + destroyBuffer(ctx, input_iy); + destroyBuffer(ctx, output); + destroyContext(ctx); + return EXIT_SUCCESS; +} +""" + // the harness above names the distribution's element type explicitly and frees all seven gold buffers it mallocs + util.ExecuteOpenCL.using_cpp(main, module, "one_copy") + } + + /** Fused mul + binomial + coarsity: forms `ix * ix`, `ix * iy` and `iy * iy` per pixel, + * applies the `binomialWeights2d` dot product to each 3x3 neighbourhood and combines the three sums + * as `det - kappa * trace * trace`. The outer dimension goes through `mapGlobal`, the inner one through `mapSeq`. + */ + val base: ToBeTyped[Expr] = + depFun(hFrom(3), (h: Nat) => + depFun(wFrom(12), (w: Nat) => fun( + (h`.`w`.`f32) ->: (h`.`w`.`f32) ->: f32 + ->: ((h - 2*bd_h)`.`(w - 2*bd_w)`.`f32) + )((ix, iy, kappa) => + oclRun(LocalSize(1), GlobalSize(num_threads))( + makeArray(2)(ix)(iy) |> + transpose >> map(transpose) >> + map(map(fun(ixiy => { + val ix = ixiy `@` lidx(0, 2) + val iy = ixiy `@` lidx(1, 2) + 
makeArray(3)(ix * ix)(ix * iy)(iy * iy) + }))) >> + map(drop(bd_w-1) >> take(w - 2*(bd_w-1)) >> slide(3)(1)) >> + drop(bd_h-1) >> take(h - 2*(bd_h-1)) >> slide(3)(1) >> + map(transpose) >> + mapGlobal(mapSeq( + map(transpose) >> transpose >> + toPrivateFun(mapSeqUnroll(fun(nbh => + dotSeqU(join(binomialWeights2d))(join(nbh)) + ))) >> + letf(fun(s => { + val sxx = s `@` lidx(0, 3) + val sxy = s `@` lidx(1, 3) + val syy = s `@` lidx(2, 3) + val det = sxx * syy - sxy * sxy + val trace = sxx + syy + det - kappa * trace * trace + })) + )) + )))) + + val lineVec: ToBeTyped[Expr] = + depFun(hFrom(3), (h: Nat) => + depFun(wFrom(12), (w: Nat) => fun( + (h`.`w`.`f32) ->: (h`.`w`.`f32) ->: f32 + ->: ((h - 2*bd_h)`.`(w - 2*bd_w)`.`f32) + )((ix, iy, kappa) => + oclRun(LocalSize(1), GlobalSize(num_threads))( + makeArray(2)(ix)(iy) |> + map( + map(drop(bd_w-vecw) >> take(w - 2*(bd_w-vecw))) >> + drop(bd_h-1) >> take(h - 2*(bd_h-1)) >> + map(asVectorAligned(vecw)) + ) >> + transpose >> map(transpose) >> + map(map(fun(ixiy => { + val ix = ixiy `@` lidx(0, 2) + val iy = ixiy `@` lidx(1, 2) + makeArray(3)(ix * ix)(ix * iy)(iy * iy) + }))) >> + slide(3)(1) >> mapGlobal( + transpose >> map(transpose) >> + mapSeq(mapSeqUnroll(dotSeqUWV(binomialWeightsV))) >> + toGlobal >> + slide(3)(1) >> + mapSeq( + transpose >> map(shuffle) >> + toPrivateFun(mapSeqUnroll(dotSeqUWV(binomialWeightsH))) >> + letf(fun(s => { + val sxx = s `@` lidx(0, 3) + val sxy = s `@` lidx(1, 3) + val syy = s `@` lidx(2, 3) + val det = sxx * syy - sxy * sxy + val trace = sxx + syy + det - vectorFromScalar(kappa) * trace * trace + })) + ) >> asScalar + ) + )))) + + val rotvVec: ToBeTyped[Expr] = + depFun(hFrom(3), (h: Nat) => + depFun(wFrom(12), (w: Nat) => fun( + (h`.`w`.`f32) ->: (h`.`w`.`f32) ->: f32 + ->: ((h - 2*bd_h)`.`(w - 2*bd_w)`.`f32) + )((ix, iy, kappa) => + oclRun(LocalSize(1), GlobalSize(num_threads))( + makeArray(2)(ix)(iy) |> + map( + map(drop(bd_w-vecw) >> take(w - 2*(bd_w-vecw))) >> + drop(bd_h-1) >> 
take(h - 2*(bd_h-1)) >> + map(asVectorAligned(vecw)) + ) >> + transpose >> map(transpose) >> + map(map(fun(ixiy => { + val ix = ixiy `@` lidx(0, 2) + val iy = ixiy `@` lidx(1, 2) + makeArray(3)(ix * ix)(ix * iy)(iy * iy) + }))) >> + slide(3)(1) >> mapGlobal( + transpose >> map(transpose) >> + map(map(dotSeqUWV(binomialWeightsV))) >> + oclRotateValues(AddressSpace.Private)(3)(mapSeqUnroll(id)) >> iterateStream( + transpose >> map(shuffle) >> + toPrivateFun(mapSeqUnroll(dotSeqUWV(binomialWeightsH))) >> + letf(fun(s => { + val sxx = s `@` lidx(0, 3) + val sxy = s `@` lidx(1, 3) + val syy = s `@` lidx(2, 3) + val det = sxx * syy - sxy * sxy + val trace = sxx + syy + det - vectorFromScalar(kappa) * trace * trace + })) + ) >> asScalar + ) + )))) + + val tile: ToBeTyped[Expr] = { + val tile_x_in = tile_x + 2 + val tile_y_in = tile_y + 2 + depFun(hFrom(tile_y), (h: Nat) => + depFun(wFrom(tile_x), (w: Nat) => fun( + (h`.`w`.`f32) ->: (h`.`w`.`f32) ->: f32 + ->: ((h - 2*bd_h)`.`(w - 2*bd_w)`.`f32) + )((ix, iy, kappa) => + oclRun(LocalSize((tile_x, tile_y)), GlobalSize((w - 2*bd_w, h - 2*bd_h)))( + makeArray(2)(ix)(iy) |> + transpose >> map(transpose) >> + map(map(fun(ixiy => { + val ix = ixiy `@` lidx(0, 2) + val iy = ixiy `@` lidx(1, 2) + makeArray(3)(ix * ix)(ix * iy)(iy * iy) + }))) >> + map(drop(bd_w-1) >> take(w - 2*(bd_w-1)) >> slide(tile_x_in)(tile_x)) >> + drop(bd_h-1) >> take(h - 2*(bd_h-1)) >> slide(tile_y_in)(tile_y) >> + map(transpose) >> + map(map( + map(slide(3)(1)) >> slide(3)(1) >> map(transpose) + )) >> + mapWorkGroup(1)(mapWorkGroup(0)( + mapLocal(1)(mapLocal(0)( + map(transpose) >> transpose >> + toPrivateFun(mapSeqUnroll(fun(nbh => + dotSeqU(join(binomialWeights2d))(join(nbh)) + ))) >> + letf(fun(s => { + val sxx = s `@` lidx(0, 3) + val sxy = s `@` lidx(1, 3) + val syy = s `@` lidx(2, 3) + val det = sxx * syy - sxy * sxy + val trace = sxx + syy + det - kappa * trace * trace + })) + )) + )) >> map(transpose) >> join >> map(join) + )))) + } + + val 
tileVec: ToBeTyped[Expr] = { + val tile_vx = tile_x / vecw + val tile_vx_in = tile_vx + 2 + val tile_y_in = tile_y + 2 + depFun(hFrom(tile_y), (h: Nat) => + depFun(wFrom(tile_x), (w: Nat) => fun( + (h`.`w`.`f32) ->: (h`.`w`.`f32) ->: f32 + ->: ((h - 2*bd_h)`.`(w - 2*bd_w)`.`f32) + )((ix, iy, kappa) => + oclRun(LocalSize((tile_x / vecw, tile_y)), GlobalSize(((w - 2*bd_w) / vecw, h - 2*bd_h)))( + makeArray(2)(ix)(iy) |> + map( + map(drop(bd_w-vecw) >> take(w - 2*(bd_w-vecw)) >> asVectorAligned(vecw)) >> + drop(bd_h-1) >> take(h - 2*(bd_h-1)) + ) >> + transpose >> map(transpose) >> + map(map(fun(ixiy => { + val ix = ixiy `@` lidx(0, 2) + val iy = ixiy `@` lidx(1, 2) + makeArray(3)(ix * ix)(ix * iy)(iy * iy) + }))) >> + map(slide(tile_vx_in)(tile_vx)) >> + slide(tile_y_in)(tile_y) >> + map(transpose) >> + map(map( + map(slide(3)(1)) >> slide(3)(1) >> map(transpose) + )) >> + mapWorkGroup(1)(mapWorkGroup(0)( + mapLocal(1)(mapLocal(0)( + map(transpose) >> transpose >> + toPrivateFun(mapSeqUnroll(fun(nbh => + dotSeqUWV(join(binomialWeights2d))(join(map(shuffle)(nbh))) + ))) >> + letf(fun(s => { + val sxx = s `@` lidx(0, 3) + val sxy = s `@` lidx(1, 3) + val syy = s `@` lidx(2, 3) + val det = sxx * syy - sxy * sxy + val trace = sxx + syy + det - vectorFromScalar(kappa) * trace * trace + })) + )) + )) >> map(transpose) >> join >> map(join >> asScalar) + )))) + } +} diff --git a/src/main/scala/apps/harrisCornerDetection2/package.scala b/src/main/scala/apps/harrisCornerDetection2/package.scala new file mode 100644 index 000000000..97f0fb3e8 --- /dev/null +++ b/src/main/scala/apps/harrisCornerDetection2/package.scala @@ -0,0 +1,58 @@ +package apps + +import rise.core._ +import rise.core.DSL._ +import rise.core.primitives._ +import rise.core.types._ +import rise.openCL.primitives._ +import rise.core.DSL.HighLevelConstructs.zipND + +/** This version of Harris follows from the following paper: + * https://dl.acm.org/doi/abs/10.1145/2568058.2568067 + * + * Compared to Halide's 
version: + * - it starts from grayscale images instead of color images + * - it uses a binomial filter instead of a box filter + * + * The algorithm is simplified: + * - there is no padding and the output is smaller than the input. + */ +package object harrisCornerDetection2 { + val num_threads = 4 + val vecw = 8 + val bd_h = 16 + val bd_w = 32 + val tile_x = 32 + val tile_y = 8 + + // Nat ranges used to declare the image height: RangeAdd(n, PosInf, 8) + val hFrom = (n: Int) => + arithexpr.arithmetic.RangeAdd(n, arithexpr.arithmetic.PosInf, 8) + // Nat ranges used to declare the image width: RangeAdd(n, PosInf, 32); + // the first argument is the caller-supplied `n` (as in hFrom), not a fixed constant + val wFrom = (n: Int) => + arithexpr.arithmetic.RangeAdd(n, arithexpr.arithmetic.PosInf, 32) + + val id: ToBeTyped[Expr] = fun(x => x) + val mulT: ToBeTyped[Expr] = fun(x => fst(x) * snd(x)) + val zip2D: ToBeTyped[Expr] = zipND(2) + val dotSeqU: ToBeTyped[Expr] = fun(a => fun(b => + zip(a)(b) |> map(mulT) |> oclReduceSeqUnroll(AddressSpace.Private)(add)(lf32(0.0f)) + )) + val dotSeqUWV: ToBeTyped[Expr] = fun(weights => fun(vectors => + zip(map(vectorFromScalar)(weights))(vectors) |> + map(mulT) |> oclReduceSeqUnroll(AddressSpace.Private)(add)(vectorFromScalar(lf32(0.0f))) + )) + + val shuffle = + asScalar >> drop(vecw-1) >> take(vecw+2) >> slide(vecw)(1) >> join >> asVector(vecw) + + val binomialWeights2d = apps.separableConvolution2D.binomialWeights2d + val binomialWeightsH = apps.separableConvolution2D.binomialWeightsH + val binomialWeightsV = apps.separableConvolution2D.binomialWeightsV + + val sobelXWeights2d = apps.separableConvolution2D.sobelXWeights2d + val sobelXWeightsH = apps.separableConvolution2D.sobelXWeightsH + val sobelXWeightsV = apps.separableConvolution2D.sobelXWeightsV + + val sobelYWeights2d = apps.separableConvolution2D.sobelYWeights2d + val sobelYWeightsH = apps.separableConvolution2D.sobelYWeightsH + val sobelYWeightsV = apps.separableConvolution2D.sobelYWeightsV +} diff --git a/src/main/scala/apps/harrisCornerDetection2/sobelXY.scala b/src/main/scala/apps/harrisCornerDetection2/sobelXY.scala new file mode 100644 index 000000000..25def5660 --- /dev/null +++ 
b/src/main/scala/apps/harrisCornerDetection2/sobelXY.scala @@ -0,0 +1,180 @@ +package apps.harrisCornerDetection2 + +import rise.core.DSL.Type._ +import rise.core.DSL._ +import rise.core._ +import rise.core.primitives.{id => _, _} +import rise.core.types.DataType._ +import rise.core.types._ +import rise.openCL.DSL._ +import rise.openCL.primitives.oclRotateValues +import shine.OpenCL.{GlobalSize, LocalSize} + +object sobelXY { + /** Builds a C++ harness (the interpolated string below) and hands it to `util.ExecuteOpenCL.using_cpp` together with `module`. + * The harness fills one `h` x `w` float input with random values, computes references with `sobelX_gold` and `sobelY_gold`, + * calls `foo_init_run` and compares the two halves of the output buffer against them through the error-stats helpers. + * The random distribution spells out its element type (`float`) because its bounds are int literals. + */ + def check(module: shine.OpenCL.Module, h: Int, w: Int): Unit = { + val main = s""" +#include "src/main/scala/apps/harrisCornerDetection2/common.cpp" + +int main(int argc, char** argv) { + Context ctx = createDefaultContext(); + size_t in_bytes = $h * $w * sizeof(float); + size_t out_h = ${h - 2*bd_h}; + size_t out_w = ${w - 2*bd_w}; + size_t out_bytes = out_h * out_w * sizeof(float); + Buffer input = createBuffer(ctx, in_bytes, HOST_WRITE | HOST_READ | DEVICE_READ); + Buffer output = createBuffer(ctx, 2 * out_bytes, HOST_READ | HOST_WRITE | DEVICE_WRITE); + + float* out_ix_gold = (float*) malloc(out_bytes); + float* out_iy_gold = (float*) malloc(out_bytes); + + std::random_device rand_d; + std::default_random_engine rand_e(rand_d()); + // bigger range results in higher output differences + std::uniform_real_distribution<float> dist(0, 200); + + float* in = (float*) hostBufferSync(ctx, input, in_bytes, HOST_WRITE | HOST_READ); + for (int y = 0; y < $h; y++) { + for (int x = 0; x < $w; x++) { + in[y*$w + x] = dist(rand_e); + } + } + + sobelX_gold(out_ix_gold, $h, $w, in); + sobelY_gold(out_iy_gold, $h, $w, in); + + foo_init_run(ctx, output, $h, $w, input); + + ErrorStats errors; + init_error_stats(&errors); + float* out = (float*) hostBufferSync(ctx, output, 2 * out_bytes, HOST_READ); + accumulate_error_stats(&errors, out, out_ix_gold, out_h, out_w); + accumulate_error_stats(&errors, &out[out_h*out_w], out_iy_gold, out_h, out_w); + finish_error_stats(&errors, 0.01, 0.0001); + + free(out_ix_gold); + free(out_iy_gold); + destroyBuffer(ctx, input); + 
destroyBuffer(ctx, output); + destroyContext(ctx); + return EXIT_SUCCESS; +} +""" + util.ExecuteOpenCL.using_cpp(main, module, "one_copy") + } + + val base: ToBeTyped[Expr] = + depFun(hFrom(3), (h: Nat) => + depFun(wFrom(12), (w: Nat) => fun( + (h`.`w`.`f32) ->: (2`.`(h - 2*bd_h)`.`(w - 2*bd_w)`.`f32) + )(input => + oclRun(LocalSize(1), GlobalSize(num_threads))( + input |> + map(drop(bd_w-1) >> take(w - 2*(bd_w-1)) >> slide(3)(1)) >> + drop(bd_h-1) >> take(h - 2*(bd_h-1)) >> slide(3)(1) >> + map(transpose) >> + mapGlobal(mapSeq(fun(nbh => + makeArray(2)(sobelXWeights2d)(sobelYWeights2d) |> + mapSeqUnroll(fun(ws => dotSeqU(join(ws))(join(nbh)))) + ))) >> map(transpose) >> transpose + )))) + + val lineVec: ToBeTyped[Expr] = + depFun(hFrom(3), (h: Nat) => + depFun(wFrom(12), (w: Nat) => fun( + (h`.`w`.`f32) ->: (2`.`(h - 2*bd_h)`.`(w - 2*bd_w)`.`f32) + )(input => + oclRun(LocalSize(1), GlobalSize(num_threads))( + input |> + map(drop(bd_w-vecw) >> take(w - 2*(bd_w-vecw))) >> + drop(bd_h-1) >> take(h - 2*(bd_h-1)) >> + map(asVectorAligned(vecw)) >> slide(3)(1) >> mapGlobal( + transpose >> + mapSeq(fun(vNbh => + makeArray(2)(sobelXWeightsV)(sobelYWeightsV) |> + mapSeqUnroll(fun(ws => dotSeqUWV(ws)(vNbh))) + )) >> + toGlobal >> + slide(3)(1) >> + mapSeq( + transpose >> map(shuffle) >> + zip(makeArray(2)(sobelXWeightsH)(sobelYWeightsH)) >> + mapSeqUnroll(fun(hWsNbh => + dotSeqUWV(hWsNbh._1)(hWsNbh._2) + )) + ) >> transpose >> map(asScalar) + ) >> transpose + )))) + + val rotvVec: ToBeTyped[Expr] = + depFun(hFrom(3), (h: Nat) => + depFun(wFrom(12), (w: Nat) => fun( + (h`.`w`.`f32) ->: (2`.`(h - 2*bd_h)`.`(w - 2*bd_w)`.`f32) + )(input => + oclRun(LocalSize(1), GlobalSize(num_threads))( + input |> + map(drop(bd_w-vecw) >> take(w - 2*(bd_w-vecw))) >> + drop(bd_h-1) >> take(h - 2*(bd_h-1)) >> + map(asVectorAligned(vecw)) >> slide(3)(1) >> mapGlobal( + transpose >> + map(fun(vNbh => + makeArray(2)(sobelXWeightsV)(sobelYWeightsV) |> + map(fun(ws => dotSeqUWV(ws)(vNbh))) + )) >> + 
oclRotateValues(AddressSpace.Private)(3)(mapSeqUnroll(id)) >> iterateStream( + transpose >> map(shuffle) >> + zip(makeArray(2)(sobelXWeightsH)(sobelYWeightsH)) >> + mapSeqUnroll(fun(hWsNbh => dotSeqUWV(hWsNbh._1)(hWsNbh._2))) + ) >> transpose >> map(asScalar) + ) >> transpose + )))) + + val tile: ToBeTyped[Expr] = { + val tile_x_in = tile_x + 2 + val tile_y_in = tile_y + 2 + depFun(hFrom(tile_y), (h: Nat) => + depFun(wFrom(tile_x), (w: Nat) => fun( + (h`.`w`.`f32) ->: (2`.`(h - 2*bd_h)`.`(w - 2*bd_w)`.`f32) + )(input => + oclRun(LocalSize((tile_x, tile_y)), GlobalSize(((w - 2*bd_w), h - 2*bd_h)))( + input |> + map(drop(bd_w-1) >> take(w - 2*(bd_w-1)) >> slide(tile_x_in)(tile_x)) >> + drop(bd_h-1) >> take(h - 2*(bd_h-1)) >> slide(tile_y_in)(tile_y) >> + map(transpose) >> + map(map( + map(slide(3)(1)) >> slide(3)(1) >> map(transpose) + )) >> + mapWorkGroup(1)(mapWorkGroup(0)( + mapLocal(1)(mapLocal(0)(fun(nbh => + makeArray(2)(sobelXWeights2d)(sobelYWeights2d) |> + mapSeqUnroll(fun(ws => dotSeqU(join(ws))(join(nbh)))) + ))) // ty.tx.2.f + )) >> map(transpose) >> join >> map(join) >> + map(transpose) >> transpose + )))) + } + + val tileVec: ToBeTyped[Expr] = { + val tile_x_in = tile_x + 2*vecw + val tile_y_in = tile_y + 2 + depFun(hFrom(tile_y), (h: Nat) => + depFun(wFrom(tile_x), (w: Nat) => fun( + (h`.`w`.`f32) ->: (2`.`(h - 2*bd_h)`.`(w - 2*bd_w)`.`f32) + )(input => + oclRun(LocalSize((tile_x / vecw, tile_y)), GlobalSize(((w - 2*bd_w) / vecw, h - 2*bd_h)))( + input |> + map(drop(bd_w-vecw) >> take(w - 2*(bd_w-vecw)) >> slide(tile_x_in)(tile_x)) >> + drop(bd_h-1) >> take(h - 2*(bd_h-1)) >> slide(tile_y_in)(tile_y) >> + map(transpose) >> + map(map( + map(asVectorAligned(vecw) >> slide(3)(1)) >> slide(3)(1) >> map(transpose) + )) >> + mapWorkGroup(1)(mapWorkGroup(0)( + mapLocal(1)(mapLocal(0)(fun(nbh => + makeArray(2)(sobelXWeights2d)(sobelYWeights2d) |> + mapSeqUnroll(fun(ws => dotSeqUWV(join(ws))(join(map(shuffle)(nbh))))) + ))) + )) >> map(transpose) >> join >> 
map(join) >> + map(transpose >> map(asScalar)) >> transpose + )))) + } +} diff --git a/src/main/scala/apps/harrisCornerDetection2/sobelXYMul.scala b/src/main/scala/apps/harrisCornerDetection2/sobelXYMul.scala new file mode 100644 index 000000000..9050db4d8 --- /dev/null +++ b/src/main/scala/apps/harrisCornerDetection2/sobelXYMul.scala @@ -0,0 +1,217 @@ +package apps.harrisCornerDetection2 + +import rise.core.DSL.Type._ +import rise.core.DSL._ +import rise.core._ +import rise.core.primitives.{id => _, _} +import rise.core.types.DataType._ +import rise.core.types._ +import rise.openCL.DSL._ +import rise.openCL.primitives.oclRotateValues +import shine.OpenCL.{GlobalSize, LocalSize} + +object sobelXYMul { + /** Builds a C++ harness (the interpolated string below) and hands it to `util.ExecuteOpenCL.using_cpp` together with `module`. + * The harness fills one `h` x `w` float input with random values, computes references with `sobelX_gold`, `sobelY_gold` + * and `mul_gold`, calls `foo_init_run` and compares the three parts of the output buffer against them. + * The random distribution spells out its element type (`float`) because its bounds are int literals. + */ + def check(module: shine.OpenCL.Module, h: Int, w: Int): Unit = { + val main = s""" +#include "src/main/scala/apps/harrisCornerDetection2/common.cpp" + +int main(int argc, char** argv) { + Context ctx = createDefaultContext(); + size_t in_bytes = $h * $w * sizeof(float); + size_t out_h = ${h - 2*bd_h}; + size_t out_w = ${w - 2*bd_w}; + size_t out_bytes = out_h * out_w * sizeof(float); + Buffer input = createBuffer(ctx, in_bytes, HOST_WRITE | HOST_READ | DEVICE_READ); + Buffer output = createBuffer(ctx, 3 * out_bytes, HOST_READ | HOST_WRITE | DEVICE_WRITE); + + float* ix_gold = (float*) malloc(out_bytes); + float* iy_gold = (float*) malloc(out_bytes); + float* out_ixx_gold = (float*) malloc(out_bytes); + float* out_ixy_gold = (float*) malloc(out_bytes); + float* out_iyy_gold = (float*) malloc(out_bytes); + + std::random_device rand_d; + std::default_random_engine rand_e(rand_d()); + // bigger range results in higher output differences + std::uniform_real_distribution<float> dist(0, 200); + + float* in = (float*) hostBufferSync(ctx, input, in_bytes, HOST_WRITE | HOST_READ); + for (int y = 0; y < $h; y++) { + for (int x = 0; x < $w; x++) { + in[y*$w + x] = dist(rand_e); + } + } + + sobelX_gold(ix_gold, $h, $w, in); + sobelY_gold(iy_gold, $h, $w, in); + mul_gold(out_ixx_gold, 
out_h, out_w, ix_gold, ix_gold); + mul_gold(out_ixy_gold, out_h, out_w, ix_gold, iy_gold); + mul_gold(out_iyy_gold, out_h, out_w, iy_gold, iy_gold); + + foo_init_run(ctx, output, $h, $w, input); + + ErrorStats errors; + init_error_stats(&errors); + float* out = (float*) hostBufferSync(ctx, output, 3 * out_bytes, HOST_READ); + accumulate_error_stats(&errors, out, out_ixx_gold, out_h, out_w); + accumulate_error_stats(&errors, &out[out_h * out_w], out_ixy_gold, out_h, out_w); + accumulate_error_stats(&errors, &out[2 * out_h * out_w], out_iyy_gold, out_h, out_w); + finish_error_stats(&errors, 0.01, 0.0001); + + free(ix_gold); + free(iy_gold); + free(out_ixx_gold); + free(out_ixy_gold); + free(out_iyy_gold); + destroyBuffer(ctx, input); + destroyBuffer(ctx, output); + destroyContext(ctx); + return EXIT_SUCCESS; +} +""" + util.ExecuteOpenCL.using_cpp(main, module, "one_copy") + } + + val base: ToBeTyped[Expr] = + depFun(hFrom(3), (h: Nat) => + depFun(wFrom(12), (w: Nat) => fun( + (h`.`w`.`f32) ->: (3`.`(h - 2*bd_h)`.`(w - 2*bd_w)`.`f32) + )(input => + oclRun(LocalSize(1), GlobalSize(num_threads))( + input |> + map(drop(bd_w-1) >> take(w - 2*(bd_w-1)) >> slide(3)(1)) >> + drop(bd_h-1) >> take(h - 2*(bd_h-1)) >> slide(3)(1) >> + map(transpose) >> + mapGlobal(mapSeq(fun(nbh => + makeArray(2)(sobelXWeights2d)(sobelYWeights2d) |> + toPrivateFun(mapSeqUnroll(fun(ws => dotSeqU(join(ws))(join(nbh))))) |> + letf(fun(ixiy => { + val ix = ixiy `@` lidx(0, 2) + val iy = ixiy `@` lidx(1, 2) + makeArray(3)(ix * ix)(ix * iy)(iy * iy) |> mapSeqUnroll(id) + })) + ))) >> map(transpose) >> transpose + )))) + + val lineVec: ToBeTyped[Expr] = + depFun(hFrom(3), (h: Nat) => + depFun(wFrom(12), (w: Nat) => fun( + (h`.`w`.`f32) ->: (3`.`(h - 2*bd_h)`.`(w - 2*bd_w)`.`f32) + )(input => + oclRun(LocalSize(1), GlobalSize(num_threads))( + input |> + map(drop(bd_w-vecw) >> take(w - 2*(bd_w-vecw))) >> + drop(bd_h-1) >> take(h - 2*(bd_h-1)) >> + map(asVectorAligned(vecw)) >> slide(3)(1) >> mapGlobal( + 
transpose >> + mapSeq(fun(vNbh => + makeArray(2)(sobelXWeightsV)(sobelYWeightsV) |> + mapSeqUnroll(fun(ws => dotSeqUWV(ws)(vNbh))) + )) >> + toGlobal >> + slide(3)(1) >> + mapSeq( + transpose >> map(shuffle) >> + zip(makeArray(2)(sobelXWeightsH)(sobelYWeightsH)) >> + toPrivateFun(mapSeqUnroll(fun(hWsNbh => + dotSeqUWV(hWsNbh._1)(hWsNbh._2) + ))) >> + letf(fun(ixiy => { + val ix = ixiy `@` lidx(0, 2) + val iy = ixiy `@` lidx(1, 2) + makeArray(3)(ix * ix)(ix * iy)(iy * iy) |> mapSeqUnroll(id) + })) + ) >> transpose >> map(asScalar) + ) >> transpose + )))) + + val rotvVec: ToBeTyped[Expr] = + depFun(hFrom(3), (h: Nat) => + depFun(wFrom(12), (w: Nat) => fun( + (h`.`w`.`f32) ->: (3`.`(h - 2*bd_h)`.`(w - 2*bd_w)`.`f32) + )(input => + oclRun(LocalSize(1), GlobalSize(num_threads))( + input |> + map(drop(bd_w-vecw) >> take(w - 2*(bd_w-vecw))) >> + drop(bd_h-1) >> take(h - 2*(bd_h-1)) >> + map(asVectorAligned(vecw)) >> slide(3)(1) >> mapGlobal( + transpose >> + map(fun(vNbh => + makeArray(2)(sobelXWeightsV)(sobelYWeightsV) |> + map(fun(ws => dotSeqUWV(ws)(vNbh))) + )) >> + oclRotateValues(AddressSpace.Private)(3)(mapSeqUnroll(id)) >> iterateStream( + transpose >> map(shuffle) >> + zip(makeArray(2)(sobelXWeightsH)(sobelYWeightsH)) >> + toPrivateFun(mapSeqUnroll(fun(hWsNbh => + dotSeqUWV(hWsNbh._1)(hWsNbh._2) + ))) >> + letf(fun(ixiy => { + val ix = ixiy `@` lidx(0, 2) + val iy = ixiy `@` lidx(1, 2) + makeArray(3)(ix * ix)(ix * iy)(iy * iy) |> mapSeqUnroll(id) + })) + ) >> transpose >> map(asScalar) + ) >> transpose + )))) + + val tile: ToBeTyped[Expr] = { + val tile_x_in = tile_x + 2 + val tile_y_in = tile_y + 2 + depFun(hFrom(tile_y), (h: Nat) => + depFun(wFrom(tile_x), (w: Nat) => fun( + (h`.`w`.`f32) ->: (3`.`(h - 2*bd_h)`.`(w - 2*bd_w)`.`f32) + )(input => + oclRun(LocalSize((tile_x, tile_y)), GlobalSize(((w - 2*bd_w), h - 2*bd_h)))( + input |> + map(drop(bd_w-1) >> take(w - 2*(bd_w-1)) >> slide(tile_x_in)(tile_x)) >> + drop(bd_h-1) >> take(h - 2*(bd_h-1)) >> 
slide(tile_y_in)(tile_y) >> + map(transpose) >> + map(map( + map(slide(3)(1)) >> slide(3)(1) >> map(transpose) + )) >> + mapWorkGroup(1)(mapWorkGroup(0)( + mapLocal(1)(mapLocal(0)(fun(nbh => + makeArray(2)(sobelXWeights2d)(sobelYWeights2d) |> + toPrivateFun(mapSeqUnroll(fun(ws => dotSeqU(join(ws))(join(nbh))))) |> + letf(fun(ixiy => { + val ix = ixiy `@` lidx(0, 2) + val iy = ixiy `@` lidx(1, 2) + makeArray(3)(ix * ix)(ix * iy)(iy * iy) |> mapSeqUnroll(id) + })) + ))) + )) >> map(transpose) >> join >> map(join) >> + map(transpose) >> transpose + )))) + } + + val tileVec: ToBeTyped[Expr] = { + val tile_x_in = tile_x + 2*vecw + val tile_y_in = tile_y + 2 + depFun(hFrom(tile_y), (h: Nat) => + depFun(wFrom(tile_x), (w: Nat) => fun( + (h`.`w`.`f32) ->: (3`.`(h - 2*bd_h)`.`(w - 2*bd_w)`.`f32) + )(input => + oclRun(LocalSize((tile_x / vecw, tile_y)), GlobalSize(((w - 2*bd_w) / vecw, h - 2*bd_h)))( + input |> + map(drop(bd_w-vecw) >> take(w - 2*(bd_w-vecw)) >> slide(tile_x_in)(tile_x)) >> + drop(bd_h-1) >> take(h - 2*(bd_h-1)) >> slide(tile_y_in)(tile_y) >> + map(transpose) >> + map(map( + map(asVectorAligned(vecw) >> slide(3)(1)) >> slide(3)(1) >> map(transpose) + )) >> + mapWorkGroup(1)(mapWorkGroup(0)( + mapLocal(1)(mapLocal(0)(fun(nbh => + makeArray(2)(sobelXWeights2d)(sobelYWeights2d) |> + toPrivateFun(mapSeqUnroll(fun(ws => dotSeqUWV(join(ws))(join(map(shuffle)(nbh)))))) |> + letf(fun(ixiy => { + val ix = ixiy `@` lidx(0, 2) + val iy = ixiy `@` lidx(1, 2) + makeArray(3)(ix * ix)(ix * iy)(iy * iy) |> mapSeqUnroll(id) + })) + ))) + )) >> map(transpose) >> join >> map(join) >> + map(transpose >> map(asScalar)) >> transpose + )))) + } +} diff --git a/src/main/scala/apps/harrisCornerDetectionHalide.scala b/src/main/scala/apps/harrisCornerDetectionHalide.scala index ea61ad7ed..034b45452 100644 --- a/src/main/scala/apps/harrisCornerDetectionHalide.scala +++ b/src/main/scala/apps/harrisCornerDetectionHalide.scala @@ -8,6 +8,8 @@ import rise.core.types._ import 
rise.core.types.DataType._ import HighLevelConstructs._ +// This version of Harris uses Halide's version as reference +// used in the CGO'21 paper: https://ieeexplore.ieee.org/abstract/document/9370337/ // in Halide: https://github.com/halide/Halide/blob/e8acdea/apps/harris // in PolyMage: https://bitbucket.org/udayb/polymage/src/e28327c/sandbox/apps/python/img_proc/harris // FIXME: PolyMage's algorithm is different diff --git a/src/main/scala/shine/C/AST/Printer.scala b/src/main/scala/shine/C/AST/Printer.scala index 972567744..665770c98 100644 --- a/src/main/scala/shine/C/AST/Printer.scala +++ b/src/main/scala/shine/C/AST/Printer.scala @@ -43,6 +43,8 @@ trait Printer { object Printer { def apply(n: Node): String = (new CPrinter).printNode(n) + def declFun(f: FunDecl): String = + (new CPrinter).declareFunSig(f) } class CPrinter extends Printer { @@ -67,6 +69,12 @@ class CPrinter extends Printer { case s: StructTypeDecl => printStructTypeDecl(s) } + def declareFunSig(f: FunDecl): String = { + printFunSig(f) + print(";") + sb.toString() + } + override def printExpr(e: Expr, parenthesize: Boolean): Unit = e match { case a: Assignment => printMaybe(parenthesize)( @@ -116,8 +124,7 @@ class CPrinter extends Printer { print(";") } - // Decls - private def printFunDecl(f: FunDecl): Unit = { + def printFunSig(f: FunDecl): Unit = { print(typeName(f.returnType)) print(s" ${f.name}(") f.params.foreach(p => { @@ -125,7 +132,11 @@ class CPrinter extends Printer { if (!p.eq(f.params.last)) print(", ") }) print(")") + } + // Decls + private def printFunDecl(f: FunDecl): Unit = { + printFunSig(f) printStmt(f.body) } diff --git a/src/main/scala/shine/DPIA/fromRise.scala b/src/main/scala/shine/DPIA/fromRise.scala index 2944bf635..c081791ff 100644 --- a/src/main/scala/shine/DPIA/fromRise.scala +++ b/src/main/scala/shine/DPIA/fromRise.scala @@ -2,7 +2,7 @@ package shine.DPIA import elevate.core.strategies.Traversable import elevate.core.strategies.basic.normalize -import 
rise.core.types.{AddressSpaceKind, DataKind, DataType, NatKind, NatToNatKind, NatToNatLambda, read, write} +import rise.core.types.{AddressSpaceKind, DataKind, DataType, NatKind, NatToNatKind, NatToNatLambda, TypeIdentifier, TypePlaceholder, read, write} import rise.core.DSL.Type._ import rise.core.types.DataType._ import rise.elevate.Rise @@ -12,7 +12,10 @@ import rise.{core => r} import shine.DPIA.Phrases._ import shine.DPIA.Types._ import shine.DPIA.primitives.functional._ +import util.monads +import scala.annotation.tailrec +import scala.collection.immutable import scala.collection.mutable object fromRise { @@ -22,8 +25,128 @@ object fromRise { throw new Exception(s"expression is not in closed form: $expr\n\n with type ${expr.t}\n free vars: $fV\n free type vars: $fT\n\n") } val bnfExpr = normalize(ev).apply(betaReduction)(expr).get - val rwMap = inferAccess(bnfExpr) - expression(bnfExpr, rwMap) + val nExpr = normalizeEqualNats(bnfExpr) + val rwMap = inferAccess(nExpr) + expression(nExpr, rwMap) + } + + // NOTE: this is required because unify(nat1, nat2) can succeed while + // normalize(nat1) != normalize(nat2) + // A first traversal records, for every application, which nats of the argument/result types + // must coincide with those of the function type; a second traversal replaces each nat by its chosen representative. + private def normalizeEqualNats(e: r.Expr): r.Expr = { + // inspired from union find algorithm + // the map itself is mutated in place, the reference never changes + val map = mutable.Map[Nat, Nat]() + + // follows the recorded links from `n` to its representative, shortening the chain on the way + @tailrec + def getBest(n: Nat): Nat = { + val n2 = map.getOrElse(n, n) + if (n == n2) { n } else { + val n3 = map.getOrElse(n2, n2) + map(n) = n3 + getBest(n3) + } + } + + r.traverse.traverse(e, new r.traverse.PureTraversal { + override def expr: r.Expr => util.monads.Pure[r.Expr] = { e => + e match { + case r.App(f, arg) => + val ft = f.t.asInstanceOf[rt.FunType[_ <: rt.ExprType, _ <: rt.ExprType]] + sameType(arg.t, ft.inT) + sameType(e.t, ft.outT) + case _ => () + } + super.expr(e) + } + + private def sameType(a: rt.ExprType, b: rt.ExprType): Unit = { + def unwrapb(f: PartialFunction[rt.ExprType, Unit]): Unit = { + f.lift(b) match { + case Some(()) => () + case None => throw new Exception(s"Unexpected type for $b") + } + } 
+ a match { + case TypePlaceholder | TypeIdentifier(_) => + throw new Exception("this should not happen") + case rt.FunType(inT, outT) => unwrapb { + case rt.FunType(inT2, outT2) => + sameType(inT, inT2) + sameType(outT, outT2) + } + case rt.DepFunType(kind, x, t) => unwrapb { + case rt.DepFunType(kind2, x2, t2) => + assert(kind == kind2) + assert(x == x2) + sameType(t, t2) + } + case dataType: DataType => dataType match { + case DataTypeIdentifier(_) => () + case scalarType: ScalarType => () + case DataType.NatType => () + case OpaqueType(_) => () + case VectorType(size, elemType) => unwrapb { + case VectorType(size2, elemType2) => + sameNat(size, size2) + sameType(elemType, elemType2) + } + case IndexType(size) => unwrapb { + case IndexType(size2) => + sameNat(size, size2) + } + case PairType(dt1, dt2) => unwrapb { + case PairType(dt12, dt22) => + sameType(dt1, dt12) + sameType(dt2, dt22) + } + case FragmentType(rows, columns, d3, dataType, fragmentKind, layout) => unwrapb { + case FragmentType(_, _, d32, dataType2, _, _) => + sameNat(d3, d32) + sameType(dataType, dataType2) + } + case ManagedBufferType(dt) => unwrapb { + case ManagedBufferType(dt2) => + sameType(dt, dt2) + } + case DepPairType(kind, x, t) => unwrapb { + case DepPairType(kind2, x2, t2) => + assert(kind == kind2) + assert(x == x2) + sameType(t, t2) + } + case apply: NatToDataApply => ??? + case ArrayType(size, elemType) => unwrapb { + case ArrayType(size2, elemType2) => + sameNat(size, size2) + sameType(elemType, elemType2) + } + case DepArrayType(size, fdt) => ??? 
+ } + } + } + + private def sameNat(a: Nat, b: Nat): Unit = { + val bestA = getBest(a) + val bestB = getBest(b) + if (bestA != bestB) { + println(s"WARNING: $bestA != $bestB") + val best = if (natSize(bestA) < natSize(bestB)) { bestA } else { bestB } + println(s" --> assuming they are equal and using $best") + map(bestA) = best + map(bestB) = best + } + } + }) + + r.traverse.traverse(e, new r.traverse.PureTraversal { + override def nat: Nat => monads.Pure[Nat] = n => return_(getBest(n)) + }) + } + + private def natSize(n: Nat): Int = { + var i = 0 + arithexpr.arithmetic.ArithExpr.visit(n, { + _ => i += 1 + }) + i } def expression( diff --git a/src/main/scala/shine/OpenCL/Compilation/HostCodeGenerator.scala b/src/main/scala/shine/OpenCL/Compilation/HostCodeGenerator.scala index 857cec97e..52ef0701b 100644 --- a/src/main/scala/shine/OpenCL/Compilation/HostCodeGenerator.scala +++ b/src/main/scala/shine/OpenCL/Compilation/HostCodeGenerator.scala @@ -52,7 +52,7 @@ case class HostCodeGenerator(override val decls: C.Compilation.CodeGenerator.Dec } val temporaries = calledKernel.paramKinds.zip(calledKernel.code.params).flatMap { case (pk, p) => if (pk.kind == ParamKind.Kind.temporary) { - Some((pk.typ, p.t.asInstanceOf[shine.OpenCL.AST.PointerType].a)) + Some((pk.typ, p.t.asInstanceOf[shine.OpenCL.AST.PointerType].a, "m" + p.name)) } else { None } @@ -67,6 +67,29 @@ case class HostCodeGenerator(override val decls: C.Compilation.CodeGenerator.Dec case _ => None } } + // TODO: could optimize temporary buffer creation/deletion + val createTmp = temporaries.zipWithIndex.flatMap { + case ((dt, AddressSpace.Global, name), i) => Seq( + C.AST.DeclStmt(C.AST.VarDecl(name, typ(ManagedBufferType(dt)), Some( + C.AST.FunCall(C.AST.DeclRef("createBuffer"), Seq( + C.AST.DeclRef("ctx"), + bufferSize(dt), + C.AST.Literal(accessToString(DEVICE_READ | DEVICE_WRITE)) + )) + ))), + deviceBufferSync(s"tb${i}", C.AST.DeclRef(name), dt, DEVICE_READ | DEVICE_WRITE) + ) + case _ => Seq() + } + val 
destroyTmp = temporaries.flatMap { + case (dt, AddressSpace.Global, name) => Seq( + C.AST.ExprStmt(C.AST.FunCall(C.AST.DeclRef("destroyBuffer"), Seq( + C.AST.DeclRef("ctx"), + C.AST.DeclRef(name) + ))) + ) + case _ => Seq() + } val ndRangeTy = C.AST.ArrayType(C.AST.Type.usize, Some(3), true) val declGlobalSize = C.AST.DeclStmt(C.AST.VarDecl("global_size", ndRangeTy, Some( ArrayLiteral(ndRangeTy, NDRangeToAST(globalSize)) @@ -81,9 +104,12 @@ case class HostCodeGenerator(override val decls: C.Compilation.CodeGenerator.Dec ((args zip argsC).zipWithIndex.map { case ((arg, argC), i) => kernelArg(i + 1, arg.t.dataType, argC) } ++ temporaries.zipWithIndex.map { - case ((dt, AddressSpace.Local), i) => + case ((_, AddressSpace.Private, _), _) => + throw new Exception("temporary kernel argument cannot live in private memory") + case ((dt, AddressSpace.Local, _), i) => kernelLocalArg(i + 1 + args.size, dt) - case ((_, a), _) => throw new Exception(s"codegen is not implemented for temporaries in $a") + case ((dt, AddressSpace.Global, name), i) => + kernelArg(i + 1 + args.size, dt, C.AST.DeclRef(s"tb${i}")) }) ) ))) @@ -99,8 +125,8 @@ case class HostCodeGenerator(override val decls: C.Compilation.CodeGenerator.Dec C.AST.DeclRef("args") ))) C.AST.Block( - Seq(outputSync) ++ argSyncs ++ - Seq(declGlobalSize, declLocalSize, declArgs, launchKernel) + Seq(outputSync) ++ argSyncs ++ createTmp ++ + Seq(declGlobalSize, declLocalSize, declArgs, launchKernel) ++ destroyTmp ) })) } @@ -196,11 +222,14 @@ case class HostCodeGenerator(override val decls: C.Compilation.CodeGenerator.Dec } } + // TODO: use arith expr simplification here private def bufferSize(dt: DataType): Expr = dt match { case ManagedBufferType(dt) => bufferSize(dt) - case _: ScalarType | _: IndexType | _: VectorType | NatType => + case _: ScalarType | _: IndexType | NatType => C.AST.Literal(s"sizeof(${typ(dt)})") + case v: VectorType => + C.AST.BinaryExpr(C.AST.ArithmeticExpr(v.size), BinaryOperator.*, 
bufferSize(v.elemType)) case PairType(fst, snd) => C.AST.BinaryExpr(bufferSize(fst), BinaryOperator.+, bufferSize(snd)) case a: DataType.ArrayType => diff --git a/src/main/scala/shine/OpenCL/Compilation/Passes/HoistMemoryAllocations.scala b/src/main/scala/shine/OpenCL/Compilation/Passes/HoistMemoryAllocations.scala index d123e97bb..914a65e24 100644 --- a/src/main/scala/shine/OpenCL/Compilation/Passes/HoistMemoryAllocations.scala +++ b/src/main/scala/shine/OpenCL/Compilation/Passes/HoistMemoryAllocations.scala @@ -80,9 +80,11 @@ object HoistMemoryAllocations { parallelismLevel match { case OpenCL.Local | OpenCL.Sequential => performRewrite(oldVariable, oldBody, i, n) + case OpenCL.Global => + throw new Exception("hoisting local memory outside of global parallelism is not implemented") case OpenCL.WorkGroup => // do not perform the substitution (oldVariable, oldBody) - case OpenCL.Global | OpenCL.Warp | OpenCL.Lane => + case OpenCL.Warp | OpenCL.Lane => throw new Exception("This should not happen") } case AddressSpace.Private | AddressSpace.Constant | AddressSpaceIdentifier(_) => diff --git a/src/main/scala/shine/OpenCL/Module.scala b/src/main/scala/shine/OpenCL/Module.scala index 0b00744f7..b54b890d7 100644 --- a/src/main/scala/shine/OpenCL/Module.scala +++ b/src/main/scala/shine/OpenCL/Module.scala @@ -30,6 +30,26 @@ object Module { |${util.gen.c.function.asString(m.hostCode)} |""".stripMargin + def translateToHeaderAndSource(m: Module): (String, String) = + (s""" + |#ifdef __cplusplus + |extern "C" + |{ + |#endif + |${m.hostCode.includes.map(_.toString).mkString("\n")} + |${m.hostCode.decls.map(C.AST.Printer(_)).mkString("\n")} + |${m.hostCode.functions.map(f => C.AST.Printer.declFun(f.code)).mkString("\n")} + |#ifdef __cplusplus + |} + |#endif + |""".stripMargin, + s""" + |${m.kernels.map(kernelSource).mkString("\n")} + |#define loadKernel(ctx, id)\\ + | loadKernelFromSource(ctx, #id, id##_source, sizeof(id##_source) - 1) + 
|${util.gen.c.function.asString(m.hostCode)} + |""".stripMargin) + def dumpToDirectory(dir: java.io.File)(m: Module): Unit = { util.writeToPath(s"${dir.getAbsolutePath}/host.c", s"""#define loadKernel(ctx, ident) loadKernelFromFile(ctx, #ident, #ident ".cl") diff --git a/src/main/scala/util/ExecuteOpenCL.scala b/src/main/scala/util/ExecuteOpenCL.scala index 5d8f7c3ef..ff4300584 100644 --- a/src/main/scala/util/ExecuteOpenCL.scala +++ b/src/main/scala/util/ExecuteOpenCL.scala @@ -11,7 +11,7 @@ object ExecuteOpenCL { val platformPath = "runtime/ocl/" val executorHeadersPath = "lib/executor/lib/Executor/include/" val libs = "-lm -lOpenCL" - val includes = s"-I$runtimePath -I$executorHeadersPath" + val includes = s"-I$runtimePath -I$executorHeadersPath -I." val libDirs: String = tryToFindOpenCLLibDir() def tryToFindOpenCLLibDir(): String = { @@ -45,7 +45,7 @@ object ExecuteOpenCL { s"""#include "host.c" |$mainSource""".stripMargin) val sources = s"$mainPath $platformPath/buffer_$buffer_impl.c $platformPath/ocl.c" - (s"clang -O2 $sources $includes -o $binPath $libDirs $libs -Wno-parentheses-equality" !!) + (s"clang -O2 $sources $includes -o $binPath $libDirs $libs -Wno-parentheses-equality ." !!) (Process(s"$binPath", new java.io.File(genDir.getAbsolutePath)) !!) 
} catch { case e: Throwable => @@ -71,5 +71,31 @@ object ExecuteOpenCL { throw Exception(s"execution failed: $e") } } + + @throws[Exception] + def using_cpp(main: String, module: shine.OpenCL.Module, buffer_impl: String): String = { + try { + val (m_h, m_c) = shine.OpenCL.Module.translateToHeaderAndSource(module) + val module_hdr = writeToTempFile("code-", ".h", m_h).getAbsolutePath + val module_src = writeToTempFile("code-", ".c", m_c).getAbsolutePath + val main_src = writeToTempFile("code-", ".cpp", main).getAbsolutePath + val sources = Seq(module_src, s"$platformPath/buffer_$buffer_impl.c", s"$platformPath/ocl.c") + val objs = sources.map(s => { + val obj = s.stripSuffix(".c") + ".o" + if (!(new java.io.File(obj)).exists()) { + (s"clang -c -O2 $s $includes -o $obj -Wno-parentheses-equality" !!) + } + obj + }).mkString(" ") + val bin = createTempFile("bin-", "").getAbsolutePath + (s"clang++ -O2 $main_src $objs -include $module_hdr $includes -o $bin $libDirs $libs -Wno-parentheses-equality" !!) + (new java.io.File(module_src.stripSuffix(".c") + ".o")).delete() + (s"$bin" !!) 
+ } catch { + case e: Throwable => + Console.err.println(s"execution failed: $e") + throw Exception(s"execution failed: $e") + } + } } diff --git a/src/test/scala/shine/host.scala b/src/test/scala/shine/host.scala index 7d88420b6..18cb580df 100644 --- a/src/test/scala/shine/host.scala +++ b/src/test/scala/shine/host.scala @@ -163,4 +163,22 @@ int main(int argc, char** argv) { findDeviceBufferSyncRead(1, hostCode) checkOutput(m) } + + test("global memory") { + val e = depFun((n: Nat) => fun((n`.`i32) ->: (n`.`i32))(in => + oclRun(LocalSize(16), GlobalSize(n))( + in |> split(16) |> mapWorkGroup(0)( + mapLocal(0)(add(li32(1))) >> + toGlobal >> + mapLocal(0)(add(li32(2))) + ) |> join + ) + )) + val m = gen.opencl.hosted.fromExpr(e) + val hostCode = gen.c.function.asString(m.hostCode) + // logger.debug(hostCode) + findDeviceBufferSyncWrite(1, hostCode) + findDeviceBufferSyncRead(1, hostCode) + checkOutput(m) + } }