esl-epfl · davideschiavone · Mar 1, 2024 · Feb 27, 2024 · Feb 29, 2024 · Feb 29, 2024
diff --git a/sw/applications/example_matmul/gen_stimuly.py b/sw/applications/example_matmul/gen_stimuly.py
@@ -1,27 +1,21 @@
 #!/usr/bin/env python
 
+## Copyright 2024 EPFL
+## Solderpad Hardware License, Version 2.1, see LICENSE.md for details.
+## SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1
+
 import sys
 import random
-
-# Copyright 2017 ETH Zurich and University of Bologna.
-# Copyright and related rights are licensed under the Solderpad Hardware
-# License, Version 0.51 (the License); you may not use this file except in
-# compliance with the License.  You may obtain a copy of the License at
-# http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
-# or agreed to in writing, software, hardware and materials distributed under
-# this License is distributed on an AS IS BASIS, WITHOUT WARRANTIES OR
-# CONDITIONS OF ANY KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations under the License.
+import numpy as np
 
 def write_arr(f, name, arr, ctype, size):
     f.write("const " + ctype + " " + name + "[] = {\n")
-    i = 1
-    for v in arr:
-        if i % size == 0:
-            f.write('%d,\n' % (v))
-        else:
-            f.write('%d,' % (v))
-        i+=1
+
+    for row in arr:
+        for elem in row[:-1]:
+            f.write('%d,' % (elem))
+        f.write('%d,\n' % (row[-1]))
+
     f.write('};\n\n')
     return
 
@@ -33,33 +27,23 @@ def write_arr(f, name, arr, ctype, size):
 f.write('// This file is automatically generated\n')
 
 
-SIZE = 16
+SIZE  = 16
 RANGE = 4
 
 m_a   = []
 m_b   = []
 m_exp = []
 
-for i in range(0,SIZE):
-    for j in range(0,SIZE):
-        a = random.randint(-RANGE, RANGE-1)
-        b = random.randint(-RANGE, RANGE-1)
-
-        m_a.append(a)
-        m_b.append(b)
-
-for i in range(0,SIZE):
-    for j in range(0,SIZE):
-        r = 0
-
-        for k in range (0,SIZE):
-            r = r + m_a[i * SIZE + k] * m_b[k * SIZE + j]
-
-        m_exp.append(r)
+# Generate random integers in the half-open range [-RANGE, RANGE) for A and B (values fit in int8_t)
+m_a = np.random.randint(-RANGE, RANGE, size=(SIZE, SIZE), dtype=np.int32)
+m_b = np.random.randint(-RANGE, RANGE, size=(SIZE, SIZE), dtype=np.int32)
+m_exp = np.zeros((SIZE, SIZE), dtype=np.int32)
 
+# Compute the expected result m_exp = A x B
+m_exp = np.matmul(m_a,m_b)
 
 write_arr(f, 'm_a',   m_a, 'int8_t', SIZE)
-write_arr(f, 'm_b_transposed',   m_b, 'int8_t', SIZE)
+write_arr(f, 'm_b',   m_b, 'int8_t', SIZE)
 write_arr(f, 'm_exp', m_exp, 'int32_t', SIZE)
 
 f.write('#define SIZE %d\n' % SIZE)

diff --git a/sw/applications/example_matmul/main.c b/sw/applications/example_matmul/main.c
@@ -20,25 +20,46 @@
     #define PRINTF(...)
 #endif
 
-void __attribute__ ((noinline)) matrixMul8(int8_t *  A, int8_t *  Bt, int32_t *  C, int N);
+void __attribute__ ((noinline)) matrixMul8_blocksize(int8_t *  A, int8_t *  B, int32_t *  C, int N);
+
+void __attribute__ ((noinline)) matrixMul8_tiled(int8_t *  A, int8_t *  B, int32_t *  C, int N);
 
 uint32_t check_results(int32_t * C, int N);
 
 int32_t m_c[SIZE*SIZE];
 
+#define BLOCK_SIZE 4
+
+// Macros yielding the address of element (i,j) of A, B and C (row-major, row stride SIZE)
+#define A(i,j) &A[i*SIZE+j]
+#define B(i,j) &B[i*SIZE+j]
+#define C(i,j) &C[i*SIZE+j]
+
+#define HIGHEST_PERF
+
 int main()
 {
 
     uint32_t errors = 0;
     unsigned int instr, cycles;
 
+    for(int i =0;i<SIZE;i++) {
+        for(int j =0;j<SIZE;j++) {
+            m_c[i*SIZE+j] = 0;
+        }
+    }
+
     //enable mcycle csr
     CSR_CLEAR_BITS(CSR_REG_MCOUNTINHIBIT, 0x1);
 
     CSR_WRITE(CSR_REG_MCYCLE, 0);
 
+#ifdef HIGHEST_PERF
+    matrixMul8_blocksize(m_a, m_b, m_c, SIZE);
+#else
     //execute the kernel
-    matrixMul8(m_a, m_b_transposed, m_c, SIZE);
+    matrixMul8_tiled(m_a, m_b, m_c, SIZE);
+#endif
 
     CSR_READ(CSR_REG_MCYCLE, &cycles);
 
@@ -48,19 +69,44 @@ int main()
     return errors;
 }
 
-void __attribute__ ((noinline)) matrixMul8(int8_t *  A, int8_t *  Bt, int32_t *  C, int N)
+void __attribute__ ((noinline)) matrixMul8_blocksize(int8_t *  A, int8_t *  B, int32_t *  C, int N)
 {
+
     for(int i = 0; i < N; i++) {
         for(int j = 0; j < N; j++) {
             int32_t acc = 0;
             for(int k = 0; k < N; k++) {
-                acc+= A[i*N+k] * Bt[k*N+j];
+                acc+= A[i*SIZE+k] * B[k*SIZE+j];
             }
-            C[i*N+j] = acc;
+            C[i*SIZE+j] += acc;
         }
     }
+
+}
+
+
+// Recursive tiled multiply: accumulates C += A * B on N x N blocks; N must reach BLOCK_SIZE by repeated halving
+void __attribute__ ((noinline)) matrixMul8_tiled(int8_t* A, int8_t* B, int32_t* C, int N) {
+    // use the elementary function
+    if (N == BLOCK_SIZE) {
+        matrixMul8_blocksize(A, B, C, N);
+    }
+    //split the matrices into four blocks each
+    else {
+        N = N >> 1; // Half the size
+        // Multiply the blocks and add them to the corresponding blocks of C
+        matrixMul8_tiled(A(0, 0), B(0, 0), C(0, 0), N); // C_00 += A_00 * B_00
+        matrixMul8_tiled(A(0, N), B(N, 0), C(0, 0), N); // C_00 += A_01 * B_10
+        matrixMul8_tiled(A(0, 0), B(0, N), C(0, N), N); // C_01 += A_00 * B_01
+        matrixMul8_tiled(A(0, N), B(N, N), C(0, N), N); // C_01 += A_01 * B_11
+        matrixMul8_tiled(A(N, 0), B(0, 0), C(N, 0), N); // C_10 += A_10 * B_00
+        matrixMul8_tiled(A(N, N), B(N, 0), C(N, 0), N); // C_10 += A_11 * B_10
+        matrixMul8_tiled(A(N, 0), B(0, N), C(N, N), N); // C_11 += A_10 * B_01
+        matrixMul8_tiled(A(N, N), B(N, N), C(N, N), N); // C_11 += A_11 * B_11
+    }
 }
 
+
 uint32_t check_results(int32_t * C, int N)
 {
     // check

diff --git a/sw/applications/example_matmul/matrixMul8.h b/sw/applications/example_matmul/matrixMul8.h
@@ -2,60 +2,60 @@
 #define _MATMUL8_
 // This file is automatically generated
 const int8_t m_a[] = {
--2,3,-1,-1,-3,2,0,2,-2,-1,-3,-3,1,3,2,1,
-1,-3,1,-4,-1,2,1,0,-4,-1,-1,-3,-2,-1,0,-2,
-1,0,-1,2,1,3,2,-3,0,3,-1,3,-2,3,2,2,
--3,1,0,0,1,0,-4,-4,0,-2,-3,-2,-1,-1,3,2,
--2,0,-1,-3,-4,-1,-3,0,3,-1,2,-1,0,-2,1,1,
-0,-2,-4,1,1,-4,1,1,0,-2,0,-4,-4,-1,2,-4,
--2,-1,-2,-4,0,-2,2,3,-2,2,-2,-1,2,3,1,-3,
--1,-2,3,-4,0,3,-4,-3,1,-4,0,-1,-2,0,1,-1,
-1,-3,0,1,-3,-3,0,-2,2,1,-3,1,1,-4,3,0,
-2,1,1,2,-1,-4,3,-3,-3,1,-2,-1,1,3,-2,3,
--1,-1,-1,-1,-3,1,-2,3,-1,0,-2,3,1,3,-3,-4,
--1,-1,-1,-1,-2,3,-3,3,-3,-1,-4,-2,-3,3,0,-2,
-2,3,-4,0,3,-4,-4,3,3,2,-1,-2,0,-1,-4,-4,
-3,-1,-1,-3,2,-2,3,1,-2,2,3,0,-2,-2,-2,0,
--4,1,2,2,3,-2,2,1,-2,0,-4,2,-2,-2,3,-2,
-1,3,-2,0,1,3,2,-3,-4,0,0,3,3,-1,3,3,
+-4,-2,-3,3,3,-1,-4,-3,1,-2,-2,2,-2,-4,-2,-1,
+0,2,0,-2,-3,3,-2,3,-1,3,0,2,3,1,-3,-2,
+0,2,2,3,-2,-2,-3,-2,0,-3,0,-4,2,-3,-3,1,
+3,-4,1,1,-1,0,-3,0,1,3,-1,-2,2,3,-4,0,
+3,-1,2,-4,-1,-2,0,-3,-3,0,-2,-4,-2,1,0,-2,
+-3,0,-2,2,-1,2,2,2,3,1,0,-2,-1,-4,3,-3,
+1,3,2,-3,-4,-3,-2,-1,-2,3,1,-3,2,-4,0,0,
+-2,0,3,-1,3,-4,0,-2,2,3,-2,-2,-1,-4,-3,1,
+-4,-2,-4,-1,-2,-4,-4,-2,-4,-4,-3,0,-2,-4,2,-4,
+1,-1,-3,-2,-1,-2,-4,0,2,3,1,-3,-3,1,3,1,
+-3,0,1,3,-3,1,3,2,0,-2,1,-4,-3,0,1,-4,
+3,0,-1,-3,2,-1,-1,1,-1,3,2,-4,2,2,0,-2,
+-4,2,2,-2,2,-4,-3,-3,-3,-3,-2,-3,3,0,3,0,
+2,3,-2,0,3,1,1,-3,3,0,1,-4,-4,-1,0,1,
+0,0,1,3,-3,3,3,1,-3,1,1,2,1,-2,-2,2,
+-1,2,-2,1,0,-2,-3,-1,0,0,2,0,3,3,-4,-4,
 };
 
-const int8_t m_b_transposed[] = {
--1,-2,-1,3,-1,-3,2,-3,1,0,2,1,2,-4,1,-1,
-2,-4,2,3,-4,-3,3,-2,2,1,-1,-2,-4,-4,-4,2,
--4,-4,-4,1,-4,0,1,0,-2,1,-4,2,-3,2,-3,-4,
-2,2,-4,0,-2,-4,-3,-3,-3,-4,-3,-3,-3,2,1,0,
--2,1,-2,-4,2,3,1,-4,2,2,2,-2,-2,-4,-1,-3,
-3,2,3,2,3,0,2,-2,-3,1,1,3,3,2,2,2,
--3,-4,-2,1,-1,2,3,-2,-3,0,-1,-2,3,-3,2,-4,
-1,-2,-1,3,-1,-3,-4,2,-4,3,2,-4,1,1,-2,-2,
--2,2,3,-3,0,2,-1,1,2,-2,-2,3,-2,-2,-3,-4,
--1,-3,-3,1,3,3,3,1,3,-4,0,-4,-2,-3,0,3,
--1,1,-4,0,-1,-2,-3,2,1,1,-4,2,1,0,1,-1,
--2,-4,0,-3,-1,2,0,-1,2,-3,-3,-4,0,-3,-3,-2,
-3,1,2,3,2,-3,1,2,2,-4,-4,3,-2,3,-4,1,
--3,-1,-3,-1,-2,3,-2,-4,2,-4,-1,0,-2,1,-1,2,
--1,0,-2,-3,3,3,0,-1,3,2,3,-3,3,1,-1,3,
-3,3,-1,-3,-3,2,-3,0,2,-2,1,-2,3,1,0,-4,
+const int8_t m_b[] = {
+2,1,2,1,0,3,-1,-1,3,3,3,-2,0,-3,-1,3,
+-3,0,-3,0,2,-2,3,0,-2,-1,2,-3,2,-2,0,-2,
+-4,0,1,3,-3,-3,-2,2,3,0,-1,2,3,2,-2,-4,
+3,3,-2,3,2,2,-2,-1,-3,-3,-1,-3,3,-2,2,-2,
+0,1,0,-3,-1,-3,-1,0,-1,3,3,1,-1,-1,2,2,
+-2,-4,-1,-1,1,1,2,-1,2,3,0,3,2,0,-4,3,
+-1,-1,-3,2,-4,3,3,-1,3,1,-3,-2,1,1,2,3,
+0,2,0,1,1,-3,-4,-4,2,-2,1,0,3,2,0,-1,
+0,1,1,-1,2,-1,0,2,-1,-3,3,0,-2,-4,2,3,
+1,-4,2,-4,0,1,3,-4,-2,-3,-2,-2,-1,3,-2,-3,
+1,0,2,3,0,1,1,-1,-3,-1,3,-3,-3,-2,1,-4,
+-3,2,-4,1,3,2,3,-2,1,1,3,0,-2,-2,-1,-1,
+1,-3,-1,1,-2,2,-1,-4,3,0,-3,-4,-3,1,-1,2,
+-1,-2,3,-4,0,1,1,1,-2,1,-4,3,0,1,2,1,
+1,3,-1,-4,2,0,-1,-1,-4,-3,-2,-4,-1,-4,3,2,
+0,1,0,1,3,-4,-1,2,1,-1,3,-2,-2,-2,-1,-1,
 };
 
 const int32_t m_exp[] = {
-33,0,23,29,-4,-7,0,-3,-11,8,25,-4,8,35,-6,44,
--7,-4,7,25,23,5,22,9,-34,50,40,27,43,18,42,16,
--14,-10,-23,-30,8,47,18,-49,22,-39,5,-34,12,-26,14,10,
-23,44,26,-39,11,22,0,-4,25,12,25,11,-8,25,-3,25,
-17,28,36,-7,4,-4,-22,53,17,13,-2,32,12,23,-9,8,
--15,20,-8,-13,20,-3,-17,1,-18,43,45,-13,15,-9,39,13,
--16,-28,3,16,30,24,12,14,5,5,23,-19,3,1,-7,31,
--11,31,31,-23,12,21,1,8,1,40,12,66,13,32,5,4,
--2,6,11,-16,20,8,8,25,12,-18,3,-5,12,6,0,-2,
--8,-23,-35,10,-41,0,12,-27,10,-44,-15,-18,-16,-9,7,-7,
-2,-21,25,29,5,-15,-8,13,-20,-17,-10,4,-15,19,-12,30,
-17,7,20,21,13,-2,-11,-6,-35,26,49,-1,13,37,21,49,
-9,1,29,20,8,-36,0,11,19,5,18,-7,-56,-48,-22,14,
--24,-27,-24,9,2,5,15,6,0,26,17,-16,27,-48,26,-26,
--20,-33,-19,-25,2,24,14,-17,-17,22,12,-61,-18,-10,-14,-6,
-31,-8,7,0,12,5,34,-29,27,-12,4,-24,28,-14,-1,22,
+15,30,-21,1,25,-11,-8,24,-29,-1,32,23,-10,-15,9,-4,
+-20,-44,1,-1,3,12,19,-41,25,4,-10,11,7,36,-44,-17,
+13,10,3,50,-5,-17,-31,28,5,-12,9,-14,14,-3,-8,-28,
+23,-31,50,-6,-12,14,-20,-5,19,6,-17,24,-4,29,-22,-1,
+0,-17,35,-16,-40,5,-2,28,15,29,-28,27,10,27,-7,8,
+15,8,-21,-5,6,6,0,-18,-23,-38,-11,-19,16,-8,16,14,
+3,-18,14,16,-15,-9,1,-8,6,-23,-5,-36,-6,20,-27,-43,
+-4,-1,9,3,-23,-39,-5,24,5,-13,15,6,-4,21,-3,-26,
+20,35,-18,-6,6,3,-17,18,-28,-4,-14,16,-2,8,15,-4,
+31,7,45,-49,28,-14,-8,10,-49,-31,11,-6,-24,-17,14,-3,
+3,8,-5,21,-19,8,-10,8,-17,-23,-39,10,49,16,22,-9,
+21,-30,44,-32,-25,5,-5,-22,-5,10,-12,-8,-15,19,1,8,
+-7,8,-2,-18,-18,-38,-23,28,-20,-3,-29,-4,-9,9,17,-11,
+10,4,11,-18,8,-13,16,34,-29,6,36,-5,0,-37,18,22,
+-7,-13,-29,48,-2,22,13,-26,31,1,-8,-18,20,16,-30,-23,
+8,-20,13,0,-5,20,10,-10,-22,2,-9,4,-12,14,6,-18,
 };
 
 #define SIZE 16