Releases: chenghuaWang/nncv
Testing on more models and Aten bug fixes.
Tests
- MobileNetV3. Using -target [Native/AdvX86]
- SqueezeNet. Using -target [Native/AdvX86]
- Transformer block written in aten-lang. Code
- Conv2d 3x3 written in aten-lang. Code
New features
- math package support in aten-lang. See Code
- Turn arith ops into Affine Exprs when processing affine.for's induction value.
- New targets (see the usage sketch below):
  - Native: using only MLIR's builtin passes
  - AdvX86: using scf.for loops with dispatch info, and splitting for loops to avoid vector.mask
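As a rough usage sketch, the new targets are selected with -target. The flags mirror the v1.0 x86 commands further down this page, and res18.mlir is just a placeholder for whatever model file you feed nncv-c:
# select one of the new backends; the other flags follow the v1.0 example later on this page
nncv-c -warp-c-interface -target Native res18.mlir -o optimizedRes18.mlir
nncv-c -warp-c-interface -target AdvX86 res18.mlir -o optimizedRes18.mlir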
Full Changelog: v1.0...v1.1
DL model compilation support for the x86 target
The nncv compiler now supports DL model compilation for the x86 target.
Currently, nncv provides a very simple lowering pipeline: it basically applies tiling and vectorization to linalg ops. This vectorization path currently only supports CPUs with the AVX2 feature.
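Since the vectorizer requires AVX2, it is worth checking the host CPU first. On Linux, a quick check (independent of nncv) is:
# verify that the host CPU advertises AVX2 before using the vectorizing pipeline
grep -o -m 1 avx2 /proc/cpuinfo || echo "AVX2 not available on this CPU"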
If you want to compile a DL model for the CPU target (without parallelism), you can use the commands below to generate an object file:
# lower and optimize the model for the single-threaded CPU target
nncv-c -warp-c-interface -target HostWoParallel res18.mlir -o optimizedRes18.mlir
# translate the LLVM-dialect MLIR to LLVM IR
mlir-translate -mlir-to-llvmir optimizedRes18.mlir -o res18.ll
# compile the LLVM IR to a native object file
llc -filetype=object res18.ll -o libres18.o
If you want to enable multi-threading on the CPU target, use the HostWParallel option instead:
# lower and optimize the model with multi-threading enabled
nncv-c -warp-c-interface -target HostWParallel res18.mlir -o optimizedRes18.mlir
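The remaining translation and object-file steps are presumably the same as in the HostWoParallel flow above; they are repeated here for completeness:
# same translation and compilation steps as the single-threaded flow
mlir-translate -mlir-to-llvmir optimizedRes18.mlir -o res18.ll
llc -filetype=object res18.ll -o libres18.o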
nncv's aten now supports polyhedral optimization
What's new
- Polyhedral model support for aten ir.
For now, nncv's aten IR can be transformed using the polyhedral model provided by polymer. A simple example is given below.
The nncv compiler performs a three-stage lowering (aten-lang --> aten dialect --> mlir's dialects --> llvm ir) and uses LLVM's JIT to execute the result. Aten-lang provides a pfor (parallel-for) mechanism, which lowers all pfor scopes to affine.for in mlir. For example:
@package = "main";
import "io";
func matmul(lhs Tensor<512, 512, float32>, rhs Tensor<512, 512, float32>, dst Tensor<512, 512, float32>) {
  pfor(/*lower bound, set axis name*/i := 0; /*upper bound*/512; /*step*/ 1) {
    pfor (j := 0; 512; 1) {
      pfor (k := 0; 512; 1) {
        dst[i, j] = dst[i, j] + lhs[i, k] * rhs[k, j]; // do
      };
    };
  };
};
func main() -> void {
  var lhs Tensor<512, 512, float32>;
  var rhs Tensor<512, 512, float32>;
  var dst Tensor<512, 512, float32>;
  matmul(lhs, rhs, dst);
  io.print(dst);
};
After lowering to aten IR in MLIR, we get:
module @__main {
  Aten.func private @matmul(%arg0: memref<512x512xf32>, %arg1: memref<512x512xf32>, %arg2: memref<512x512xf32>) {
    affine.for %arg3 = 0 to 512 {
      affine.for %arg4 = 0 to 512 {
        affine.for %arg5 = 0 to 512 {
          %0 = memref.load %arg2[%arg3, %arg4] : memref<512x512xf32>
          %1 = memref.load %arg0[%arg3, %arg5] : memref<512x512xf32>
          %2 = memref.load %arg1[%arg5, %arg4] : memref<512x512xf32>
          %3 = Aten.binop(mul, %1, %2) : f32
          %4 = Aten.binop(add, %0, %3) : f32
          memref.store %4, %arg2[%arg3, %arg4] : memref<512x512xf32>
        }
      }
    }
    Aten.return
  }
  Aten.func private @main() {
    %alloc = memref.alloc() : memref<512x512xf32>
    %alloc_0 = memref.alloc() : memref<512x512xf32>
    %alloc_1 = memref.alloc() : memref<512x512xf32>
    Aten.call @matmul(%alloc, %alloc_0, %alloc_1) : (memref<512x512xf32>, memref<512x512xf32>, memref<512x512xf32>) -> ()
    Aten.return
  }
}
Then, lowering all aten IR to MLIR's builtin dialects gives:
module @__main {
  func.func private @matmul(%arg0: memref<512x512xf32>, %arg1: memref<512x512xf32>, %arg2: memref<512x512xf32>) {
    affine.for %arg3 = 0 to 512 {
      affine.for %arg4 = 0 to 512 {
        affine.for %arg5 = 0 to 512 {
          %0 = memref.load %arg2[%arg3, %arg4] : memref<512x512xf32>
          %1 = memref.load %arg0[%arg3, %arg5] : memref<512x512xf32>
          %2 = memref.load %arg1[%arg5, %arg4] : memref<512x512xf32>
          %3 = arith.mulf %1, %2 : f32
          %4 = arith.addf %0, %3 : f32
          memref.store %4, %arg2[%arg3, %arg4] : memref<512x512xf32>
        }
      }
    }
    return
  }
  func.func private @main() {
    %alloc = memref.alloc() : memref<512x512xf32>
    %alloc_0 = memref.alloc() : memref<512x512xf32>
    %alloc_1 = memref.alloc() : memref<512x512xf32>
    call @matmul(%alloc, %alloc_0, %alloc_1) : (memref<512x512xf32>, memref<512x512xf32>, memref<512x512xf32>) -> ()
    return
  }
}
nncv will then try to use polymer to optimize all affine loops (memref load/store ops are raised to affine ops when necessary). After optimization, we get:
#map = affine_map<(d0) -> (d0 * 32)>
#map1 = affine_map<(d0) -> (d0 * 32 + 32)>
module @__main {
  func.func private @S0(%arg0: memref<512x512xf32>, %arg1: index, %arg2: index, %arg3: memref<512x512xf32>, %arg4: index, %arg5: memref<512x512xf32>) attributes {scop.stmt} {
    %0 = affine.load %arg0[symbol(%arg1), symbol(%arg2)] : memref<512x512xf32>
    %1 = affine.load %arg5[symbol(%arg1), symbol(%arg4)] : memref<512x512xf32>
    %2 = affine.load %arg3[symbol(%arg4), symbol(%arg2)] : memref<512x512xf32>
    %3 = arith.mulf %1, %2 : f32
    %4 = arith.addf %0, %3 : f32
    affine.store %4, %arg0[symbol(%arg1), symbol(%arg2)] : memref<512x512xf32>
    return
  }
  func.func private @matmul(%arg0: memref<512x512xf32>, %arg1: memref<512x512xf32>, %arg2: memref<512x512xf32>) {
    affine.for %arg3 = 0 to 16 {
      affine.for %arg4 = 0 to 16 {
        affine.for %arg5 = 0 to 16 {
          affine.for %arg6 = #map(%arg3) to #map1(%arg3) {
            affine.for %arg7 = #map(%arg5) to #map1(%arg5) {
              affine.for %arg8 = #map(%arg4) to #map1(%arg4) {
                func.call @S0(%arg2, %arg6, %arg8, %arg1, %arg7, %arg0) : (memref<512x512xf32>, index, index, memref<512x512xf32>, index, memref<512x512xf32>) -> ()
              }
            }
          }
        }
      }
    }
    return
  }
  func.func private @main() {
    %alloc = memref.alloc() : memref<512x512xf32>
    %alloc_0 = memref.alloc() : memref<512x512xf32>
    %alloc_1 = memref.alloc() : memref<512x512xf32>
    call @matmul(%alloc, %alloc_0, %alloc_1) : (memref<512x512xf32>, memref<512x512xf32>, memref<512x512xf32>) -> ()
    memref.dealloc %alloc_1 : memref<512x512xf32>
    memref.dealloc %alloc_0 : memref<512x512xf32>
    memref.dealloc %alloc : memref<512x512xf32>
    return
  }
}
Finally, nncv's lowering pipeline lowers the MLIR to LLVM IR. More examples can be found in the test directory.
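As a rough sketch of what this final stage corresponds to when driven by hand with the upstream MLIR/LLVM tools (exact pass names depend on your MLIR version, and matmul.mlir is a hypothetical file holding the optimized module above; nncv itself runs this through LLVM's JIT):
# lower the affine/func-level module to the LLVM dialect, then translate to LLVM IR
mlir-opt matmul.mlir -lower-affine -convert-scf-to-cf -convert-cf-to-llvm \
  -convert-arith-to-llvm -finalize-memref-to-llvm -convert-func-to-llvm \
  -reconcile-unrealized-casts -o matmul_llvm.mlir
mlir-translate -mlir-to-llvmir matmul_llvm.mlir -o matmul.ll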
nncv lang's frontend is almost done.
- Add more test cases.
- Aten-lang lowering to MLIR is almost done.
Case 1: Parallel For Loops
pfor.aten
/**
* author: chenghua.wang (chenghua.wang.edu@gmail.com)
* brief: test aten-lang frontend code generation
*/
@package = "main";
func matmul(lhs Tensor<512, 512, float32>, rhs Tensor<512, 512, float32>, dst Tensor<512, 512, float32>) {
  pfor(/*lower bound, set axis name*/i := 0; /*upper bound*/512; /*step*/ 1) {
    pfor (j := 0; 512; 1) {
      pfor (k := 0; 512; 1) {
        dst[i, j] = dst[i, j] + lhs[i, k] * rhs[k, j]; // do
      };
    };
  };
};
func main() -> void {
  var lhs Tensor<512, 512, float32>;
  var rhs Tensor<512, 512, float32>;
  var dst Tensor<512, 512, float32>;
  matmul(lhs, rhs, dst);
};
Lowering to Pfor.air:
module @__main {
  Aten.func private @matmul(%arg0: memref<512x512xf32>, %arg1: memref<512x512xf32>, %arg2: memref<512x512xf32>) {
    affine.for %arg3 = 0 to 512 {
      affine.for %arg4 = 0 to 512 {
        affine.for %arg5 = 0 to 512 {
          %0 = memref.load %arg2[%arg3, %arg4] : memref<512x512xf32>
          %1 = memref.load %arg0[%arg3, %arg5] : memref<512x512xf32>
          %2 = memref.load %arg1[%arg5, %arg4] : memref<512x512xf32>
          %3 = Aten.binop(mul, %1, %2) : f32
          %4 = Aten.binop(add, %0, %3) : f32
          memref.store %4, %arg2[%arg3, %arg4] : memref<512x512xf32>
        }
      }
    }
    Aten.return
  }
  Aten.func private @main() {
    %alloc = memref.alloc() : memref<512x512xf32>
    %alloc_0 = memref.alloc() : memref<512x512xf32>
    %alloc_1 = memref.alloc() : memref<512x512xf32>
    Aten.call @matmul(%alloc, %alloc_0, %alloc_1) : (memref<512x512xf32>, memref<512x512xf32>, memref<512x512xf32>) -> ()
    Aten.return
  }
}
Lowering to mlir:
module @__main {
  func.func private @matmul(%arg0: memref<512x512xf32>, %arg1: memref<512x512xf32>, %arg2: memref<512x512xf32>) {
    affine.for %arg3 = 0 to 512 {
      affine.for %arg4 = 0 to 512 {
        affine.for %arg5 = 0 to 512 {
          %0 = memref.load %arg2[%arg3, %arg4] : memref<512x512xf32>
          %1 = memref.load %arg0[%arg3, %arg5] : memref<512x512xf32>
          %2 = memref.load %arg1[%arg5, %arg4] : memref<512x512xf32>
          %3 = arith.mulf %1, %2 : f32
          %4 = arith.addf %0, %3 : f32
          memref.store %4, %arg2[%arg3, %arg4] : memref<512x512xf32>
        }
      }
    }
    return
  }
  func.func private @main() {
    %alloc = memref.alloc() : memref<512x512xf32>
    %alloc_0 = memref.alloc() : memref<512x512xf32>
    %alloc_1 = memref.alloc() : memref<512x512xf32>
    call @matmul(%alloc, %alloc_0, %alloc_1) : (memref<512x512xf32>, memref<512x512xf32>, memref<512x512xf32>) -> ()
    return
  }
}
Case 2: External Function Call
FuncCall.aten
@package = "main";
func _lib_nncv_do_something(Tensor<1, 1, float32>);
pub func add(a int32, b int32) -> int32 {
  return a + b;
};
func main() {
  res := add(8, 8);
  var t Tensor<1, 1, float32>;
  _lib_nncv_do_something(t);
};
Lowering to FuncCall.air:
module @__main {
  Aten.func private @_lib_nncv_do_something(memref<1x1xf32>)
  Aten.func public @add(%arg0: !Aten.int<s, 32>, %arg1: !Aten.int<s, 32>) -> !Aten.int<s, 32> {
    %0 = Aten.binop(add, %arg0, %arg1) : !Aten.int<s, 32>
    Aten.return %0 : !Aten.int<s, 32>
  }
  Aten.func private @main() {
    %0 = Aten.const(#Aten.int<8> : !Aten.int<s, 32>) : !Aten.int<s, 32>
    %1 = Aten.call @add(%0, %0) : (!Aten.int<s, 32>, !Aten.int<s, 32>) -> !Aten.int<s, 32>
    %2 = Aten.alloca !Aten.int<s, 32>, aten.ptr <!Aten.int<s, 32>>, ["res"] {alignment = 4 : i64}
    Aten.store %1, %2 : !Aten.int<s, 32>, aten.ptr <!Aten.int<s, 32>>
    %alloc = memref.alloc() : memref<1x1xf32>
    Aten.call @_lib_nncv_do_something(%alloc) : (memref<1x1xf32>) -> ()
    Aten.return
  }
}
Lowering to mlir:
module @__main {
  func.func private @_lib_nncv_do_something(memref<1x1xf32>)
  func.func @add(%arg0: i32, %arg1: i32) -> i32 {
    %0 = arith.addi %arg0, %arg1 : i32
    return %0 : i32
  }
  func.func private @main() {
    %c8_i32 = arith.constant 8 : i32
    %0 = call @add(%c8_i32, %c8_i32) : (i32, i32) -> i32
    %alloca = memref.alloca() {alignment = 4 : i64} : memref<i32>
    memref.store %0, %alloca[] : memref<i32>
    %alloc = memref.alloc() : memref<1x1xf32>
    call @_lib_nncv_do_something(%alloc) : (memref<1x1xf32>) -> ()
    return
  }
}
Check ./test for more examples.
v1.0-beta.2 release.
Freeze src2 for backup.
- Add new GPU Lowering Pipeline.
- Add NNCV Frontend and Graph IR/Transforms.
- Add Graph Level Optimization.
v1.0-beta.1 release.
Freeze src1 for backup.