Releases: chenghuaWang/nncv
Testing on more models and Aten bug fixes.
Tests
- MobileNetV3. Using -target [Native/AdvX86]
- SqueezeNet. Using -target [Native/AdvX86]
- Transformer block written in aten-lang. Code
- Conv2d 3x3 written in aten-lang. Code
New features
- math package support in aten-lang. See Code
- Turn arith ops into Affine Exprs when processing affine.for's induction value.
- New targets (see the usage sketch below):
  - Native: using only MLIR's builtin passes
  - AdvX86: using scf.for loops with dispatch info, and splitting for loops to avoid vector.mask
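As a rough usage sketch, the new targets are selected with -target. The flags mirror the v1.0 x86 commands further down this page, and res18.mlir is just a placeholder for whatever model file you feed nncv-c:
# select one of the new backends; the other flags follow the v1.0 example later on this page
nncv-c -warp-c-interface -target Native res18.mlir -o optimizedRes18.mlir
nncv-c -warp-c-interface -target AdvX86 res18.mlir -o optimizedRes18.mlir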
Full Changelog: v1.0...v1.1
DL model compilation support for the x86 target
The nncv compiler now supports DL model compilation for the x86 target.
Currently, nncv provides a very simple lowering pipeline: it basically applies tiling and vectorization to linalg ops. This vectorization path currently only supports CPUs with the AVX2 feature.
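Since the vectorizer requires AVX2, it is worth checking the host CPU first. On Linux, a quick check (independent of nncv) is:
# verify that the host CPU advertises AVX2 before using the vectorizing pipeline
grep -o -m 1 avx2 /proc/cpuinfo || echo "AVX2 not available on this CPU"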
If you want to compile a DL model for the CPU target (without parallelism), you can use the commands below to generate an object file:
# lower and optimize the model for the single-threaded CPU target
nncv-c -warp-c-interface -target HostWoParallel res18.mlir -o optimizedRes18.mlir
# translate the LLVM-dialect MLIR to LLVM IR
mlir-translate -mlir-to-llvmir optimizedRes18.mlir -o res18.ll
# compile the LLVM IR to a native object file
llc -filetype=object res18.ll -o libres18.o
If you want to enable multi-threading on the CPU target, use the HostWParallel option instead:
# lower and optimize the model with multi-threading enabled
nncv-c -warp-c-interface -target HostWParallel res18.mlir -o optimizedRes18.mlir
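The remaining translation and object-file steps are presumably the same as in the HostWoParallel flow above; they are repeated here for completeness:
# same translation and compilation steps as the single-threaded flow
mlir-translate -mlir-to-llvmir optimizedRes18.mlir -o res18.ll
llc -filetype=object res18.ll -o libres18.o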
nncv's aten now supports polyhedral optimization
What's new
- Polyhedral model support for aten ir.
For now, nncv's aten IR can be transformed using the polyhedral model provided by polymer. A simple example is given below.
The nncv compiler performs a three-stage lowering (aten-lang --> aten dialect --> mlir's dialects --> llvm ir) and uses LLVM's JIT to execute the result. Aten-lang provides a pfor (parallel-for) mechanism, which lowers all pfor scopes to affine.for in mlir. For example:
@package = "main";
import "io";
func matmul(lhs Tensor<512, 512, float32>, rhs Tensor<512, 512, float32>, dst Tensor<512, 512, float32>) {
  pfor(/*lower bound, set axis name*/i := 0; /*upper bound*/512; /*step*/ 1) {
    pfor (j := 0; 512; 1) {
      pfor (k := 0; 512; 1) {
        dst[i, j] = dst[i, j] + lhs[i, k] * rhs[k, j]; // do
      };
    };
  };
};
func main() -> void {
  var lhs Tensor<512, 512, float32>;
  var rhs Tensor<512, 512, float32>;
  var dst Tensor<512, 512, float32>;
  matmul(lhs, rhs, dst);
  io.print(dst);
};
After lowering to aten IR in MLIR, we get:
module @__main {
  Aten.func private @matmul(%arg0: memref<512x512xf32>, %arg1: memref<512x512xf32>, %arg2: memref<512x512xf32>) {
    affine.for %arg3 = 0 to 512 {
      affine.for %arg4 = 0 to 512 {
        affine.for %arg5 = 0 to 512 {
          %0 = memref.load %arg2[%arg3, %arg4] : memref<512x512xf32>
          %1 = memref.load %arg0[%arg3, %arg5] : memref<512x512xf32>
          %2 = memref.load %arg1[%arg5, %arg4] : memref<512x512xf32>
          %3 = Aten.binop(mul, %1, %2) : f32
          %4 = Aten.binop(add, %0, %3) : f32
          memref.store %4, %arg2[%arg3, %arg4] : memref<512x512xf32>
        }
      }
    }
    Aten.return
  }
  Aten.func private @main() {
    %alloc = memref.alloc() : memref<512x512xf32>
    %alloc_0 = memref.alloc() : memref<512x512xf32>
    %alloc_1 = memref.alloc() : memref<512x512xf32>
    Aten.call @matmul(%alloc, %alloc_0, %alloc_1) : (memref<512x512xf32>, memref<512x512xf32>, memref<512x512xf32>) -> ()
    Aten.return
  }
}
Then, lowering all aten IR to MLIR's builtin dialects gives:
module @__main {
  func.func private @matmul(%arg0: memref<512x512xf32>, %arg1: memref<512x512xf32>, %arg2: memref<512x512xf32>) {
    affine.for %arg3 = 0 to 512 {
      affine.for %arg4 = 0 to 512 {
        affine.for %arg5 = 0 to 512 {
          %0 = memref.load %arg2[%arg3, %arg4] : memref<512x512xf32>
          %1 = memref.load %arg0[%arg3, %arg5] : memref<512x512xf32>
          %2 = memref.load %arg1[%arg5, %arg4] : memref<512x512xf32>
          %3 = arith.mulf %1, %2 : f32
          %4 = arith.addf %0, %3 : f32
          memref.store %4, %arg2[%arg3, %arg4] : memref<512x512xf32>
        }
      }
    }
    return
  }
  func.func private @main() {
    %alloc = memref.alloc() : memref<512x512xf32>
    %alloc_0 = memref.alloc() : memref<512x512xf32>
    %alloc_1 = memref.alloc() : memref<512x512xf32>
    call @matmul(%alloc, %alloc_0, %alloc_1) : (memref<512x512xf32>, memref<512x512xf32>, memref<512x512xf32>) -> ()
    return
  }
}
nncv will then try to use polymer to optimize all affine loops (memref load/store ops are raised to affine ops when necessary). After optimization, we get:
#map = affine_map<(d0) -> (d0 * 32)>
#map1 = affine_map<(d0) -> (d0 * 32 + 32)>
module @__main {
  func.func private @S0(%arg0: memref<512x512xf32>, %arg1: index, %arg2: index, %arg3: memref<512x512xf32>, %arg4: index, %arg5: memref<512x512xf32>) attributes {scop.stmt} {
    %0 = affine.load %arg0[symbol(%arg1), symbol(%arg2)] : memref<512x512xf32>
    %1 = affine.load %arg5[symbol(%arg1), symbol(%arg4)] : memref<512x512xf32>
    %2 = affine.load %arg3[symbol(%arg4), symbol(%arg2)] : memref<512x512xf32>
    %3 = arith.mulf %1, %2 : f32
    %4 = arith.addf %0, %3 : f32
    affine.store %4, %arg0[symbol(%arg1), symbol(%arg2)] : memref<512x512xf32>
    return
  }
  func.func private @matmul(%arg0: memref<512x512xf32>, %arg1: memref<512x512xf32>, %arg2: memref<512x512xf32>) {
    affine.for %arg3 = 0 to 16 {
      affine.for %arg4 = 0 to 16 {
        affine.for %arg5 = 0 to 16 {
          affine.for %arg6 = #map(%arg3) to #map1(%arg3) {
            affine.for %arg7 = #map(%arg5) to #map1(%arg5) {
              affine.for %arg8 = #map(%arg4) to #map1(%arg4) {
                func.call @S0(%arg2, %arg6, %arg8, %arg1, %arg7, %arg0) : (memref<512x512xf32>, index, index, memref<512x512xf32>, index, memref<512x512xf32>) -> ()
              }
            }
          }
        }
      }
    }
    return
  }
  func.func private @main() {
    %alloc = memref.alloc() : memref<512x512xf32>
    %alloc_0 = memref.alloc() : memref<512x512xf32>
    %alloc_1 = memref.alloc() : memref<512x512xf32>
    call @matmul(%alloc, %alloc_0, %alloc_1) : (memref<512x512xf32>, memref<512x512xf32>, memref<512x512xf32>) -> ()
    memref.dealloc %alloc_1 : memref<512x512xf32>
    memref.dealloc %alloc_0 : memref<512x512xf32>
    memref.dealloc %alloc : memref<512x512xf32>
    return
  }
}
Finally, nncv's lowering pipeline lowers the MLIR to LLVM IR. More examples can be found in the test directory.
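As a rough sketch of what this final stage corresponds to when driven by hand with the upstream MLIR/LLVM tools (exact pass names depend on your MLIR version, and matmul.mlir is a hypothetical file holding the optimized module above; nncv itself runs this through LLVM's JIT):
# lower the affine/func-level module to the LLVM dialect, then translate to LLVM IR
mlir-opt matmul.mlir -lower-affine -convert-scf-to-cf -convert-cf-to-llvm \
  -convert-arith-to-llvm -finalize-memref-to-llvm -convert-func-to-llvm \
  -reconcile-unrealized-casts -o matmul_llvm.mlir
mlir-translate -mlir-to-llvmir matmul_llvm.mlir -o matmul.ll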
nncv lang's frontend is almost done.
- Add more test cases.
- Aten-lang lowering to MLIR is almost done.
Case 1: Parallel For Loops
pfor.aten
/**
* author: chenghua.wang (chenghua.wang.edu@gmail.com)
* brief: test aten-lang frontend code generation
*/
@package = "main";
func matmul(lhs Tensor<512, 512, float32>, rhs Tensor<512, 512, float32>, dst Tensor<512, 512, float32>) {
  pfor(/*lower bound, set axis name*/i := 0; /*upper bound*/512; /*step*/ 1) {
    pfor (j := 0; 512; 1) {
      pfor (k := 0; 512; 1) {
        dst[i, j] = dst[i, j] + lhs[i, k] * rhs[k, j]; // do
      };
    };
  };
};
func main() -> void {
  var lhs Tensor<512, 512, float32>;
  var rhs Tensor<512, 512, float32>;
  var dst Tensor<512, 512, float32>;
  matmul(lhs, rhs, dst);
};
Lowering to Pfor.air:
module @__main {
  Aten.func private @matmul(%arg0: memref<512x512xf32>, %arg1: memref<512x512xf32>, %arg2: memref<512x512xf32>) {
    affine.for %arg3 = 0 to 512 {
      affine.for %arg4 = 0 to 512 {
        affine.for %arg5 = 0 to 512 {
          %0 = memref.load %arg2[%arg3, %arg4] : memref<512x512xf32>
          %1 = memref.load %arg0[%arg3, %arg5] : memref<512x512xf32>
          %2 = memref.load %arg1[%arg5, %arg4] : memref<512x512xf32>
          %3 = Aten.binop(mul, %1, %2) : f32
          %4 = Aten.binop(add, %0, %3) : f32
          memref.store %4, %arg2[%arg3, %arg4] : memref<512x512xf32>
        }
      }
    }
    Aten.return
  }
  Aten.func private @main() {
    %alloc = memref.alloc() : memref<512x512xf32>
    %alloc_0 = memref.alloc() : memref<512x512xf32>
    %alloc_1 = memref.alloc() : memref<512x512xf32>
    Aten.call @matmul(%alloc, %alloc_0, %alloc_1) : (memref<512x512xf32>, memref<512x512xf32>, memref<512x512xf32>) -> ()
    Aten.return
  }
}
Lowering to mlir:
module @__main {
  func.func private @matmul(%arg0: memref<512x512xf32>, %arg1: memref<512x512xf32>, %arg2: memref<512x512xf32>) {
    affine.for %arg3 = 0 to 512 {
      affine.for %arg4 = 0 to 512 {
        affine.for %arg5 = 0 to 512 {
          %0 = memref.load %arg2[%arg3, %arg4] : memref<512x512xf32>
          %1 = memref.load %arg0[%arg3, %arg5] : memref<512x512xf32>
          %2 = memref.load %arg1[%arg5, %arg4] : memref<512x512xf32>
          %3 = arith.mulf %1, %2 : f32
          %4 = arith.addf %0, %3 : f32
          memref.store %4, %arg2[%arg3, %arg4] : memref<512x512xf32>
        }
      }
    }
    return
  }
  func.func private @main() {
    %alloc = memref.alloc() : memref<512x512xf32>
    %alloc_0 = memref.alloc() : memref<512x512xf32>
    %alloc_1 = memref.alloc() : memref<512x512xf32>
    call @matmul(%alloc, %alloc_0, %alloc_1) : (memref<512x512xf32>, memref<512x512xf32>, memref<512x512xf32>) -> ()
    return
  }
}
Case 2: External Function Call
FuncCall.aten
@package = "main";
func _lib_nncv_do_something(Tensor<1, 1, float32>);
pub func add(a int32, b int32) -> int32 {
  return a + b;
};
func main() {
  res := add(8, 8);
  var t Tensor<1, 1, float32>;
  _lib_nncv_do_something(t);
};
Lowering to FuncCall.air:
module @__main {
  Aten.func private @_lib_nncv_do_something(memref<1x1xf32>)
  Aten.func public @add(%arg0: !Aten.int<s, 32>, %arg1: !Aten.int<s, 32>) -> !Aten.int<s, 32> {
    %0 = Aten.binop(add, %arg0, %arg1) : !Aten.int<s, 32>
    Aten.return %0 : !Aten.int<s, 32>
  }
  Aten.func private @main() {
    %0 = Aten.const(#Aten.int<8> : !Aten.int<s, 32>) : !Aten.int<s, 32>
    %1 = Aten.call @add(%0, %0) : (!Aten.int<s, 32>, !Aten.int<s, 32>) -> !Aten.int<s, 32>
    %2 = Aten.alloca !Aten.int<s, 32>, aten.ptr <!Aten.int<s, 32>>, ["res"] {alignment = 4 : i64}
    Aten.store %1, %2 : !Aten.int<s, 32>, aten.ptr <!Aten.int<s, 32>>
    %alloc = memref.alloc() : memref<1x1xf32>
    Aten.call @_lib_nncv_do_something(%alloc) : (memref<1x1xf32>) -> ()
    Aten.return
  }
}
Lowering to mlir:
module @__main {
  func.func private @_lib_nncv_do_something(memref<1x1xf32>)
  func.func @add(%arg0: i32, %arg1: i32) -> i32 {
    %0 = arith.addi %arg0, %arg1 : i32
    return %0 : i32
  }
  func.func private @main() {
    %c8_i32 = arith.constant 8 : i32
    %0 = call @add(%c8_i32, %c8_i32) : (i32, i32) -> i32
    %alloca = memref.alloca() {alignment = 4 : i64} : memref<i32>
    memref.store %0, %alloca[] : memref<i32>
    %alloc = memref.alloc() : memref<1x1xf32>
    call @_lib_nncv_do_something(%alloc) : (memref<1x1xf32>) -> ()
    return
  }
}
Check ./test for more examples.
v1.0-beta.2 release.
Freeze src2 for backup.
- Add new GPU Lowering Pipeline.
- Add NNCV Frontend and Graph IR/Transforms.
- Add Graph Level Optimization.
v1.0-beta.1 release.
Freeze src1 for backup.