diff --git a/documentation/tutorial_fpl_2022/01-introduction/Exercise1/bambu.sh b/documentation/tutorial_fpl_2022/01-introduction/Exercise1/bambu.sh
new file mode 100755
index 000000000..34be96f47
--- /dev/null
+++ b/documentation/tutorial_fpl_2022/01-introduction/Exercise1/bambu.sh
@@ -0,0 +1,11 @@
+# Synthesize and simulate icrc1 with bambu inside a fresh ./icrc1 directory
+# (created under the current working directory). Extra command-line
+# arguments are forwarded to bambu through "$@"; all output is copied to
+# icrc1/icrc1.log.
+# Without pipefail the pipeline below would return tee's status and hide a
+# bambu failure from the caller.
+set -o pipefail
+script=$(readlink -e "$0")
+root_dir=$(dirname "$script")
+rm -rf icrc1
+mkdir -p icrc1
+# Stop here rather than run bambu in the wrong directory.
+cd icrc1 || exit 1
+echo "#synthesis of icrc1"
+bambu ../icrc.c --top-fname=icrc1 \
+ --generate-tb=../test_icrc1.xml --simulator=VERILATOR --simulate \
+ -v2 --print-dot --pretty-print=a.c "$@" |& tee icrc1.log
\ No newline at end of file
diff --git a/documentation/tutorial_fpl_2022/01-introduction/Exercise1/icrc.c b/documentation/tutorial_fpl_2022/01-introduction/Exercise1/icrc.c
new file mode 100644
index 000000000..8852b50a1
--- /dev/null
+++ b/documentation/tutorial_fpl_2022/01-introduction/Exercise1/icrc.c
@@ -0,0 +1,14 @@
+/*
+ * Fold one input byte into a 16-bit CRC value.
+ *
+ * crc   - CRC accumulated so far.
+ * onech - next input byte; it is XORed into the upper 8 bits of crc.
+ *
+ * Returns the updated value after eight shift/XOR steps. The XOR constant
+ * 4129 is 0x1021 (commonly used as the CRC-16/CCITT generator polynomial).
+ */
+unsigned short icrc1(unsigned short crc, unsigned char onech)
+{
+ int i;
+ /* << binds tighter than ^, so this is crc ^ (onech << 8). */
+ unsigned short ans=(crc^onech << 8);
+
+ for (i=0;i<8;i++) {
+  if (ans & 0x8000) {
+   /*
+    * Plain shift instead of "ans = (ans <<= 1) ^ 4129": the old form
+    * modified ans twice in one expression without sequencing.
+    * The assignment truncates the result back to 16 bits.
+    */
+   ans = (ans << 1) ^ 4129;
+  } else {
+   ans <<= 1;
+  }
+ }
+ return ans;
+}
diff --git a/documentation/tutorial_fpl_2022/01-introduction/Exercise2/solution/minmax.c b/documentation/tutorial_fpl_2022/01-introduction/Exercise2/solution/minmax.c
new file mode 100644
index 000000000..2058b7576
--- /dev/null
+++ b/documentation/tutorial_fpl_2022/01-introduction/Exercise2/solution/minmax.c
@@ -0,0 +1,19 @@
+/*
+ * Find the largest and smallest value in an integer array.
+ *
+ * input        - array to scan; input[0] is read unconditionally, so it must
+ *                point to at least one readable element.
+ * num_elements - number of elements of input to examine.
+ * max          - receives the largest value found.
+ * min          - receives the smallest value found.
+ */
+void min_max(int * input, int num_elements, int * max, int * min)
+{
+ int local_max = input[0];
+ int local_min = input[0];
+ int i = 0;
+ for(i = 0; i < num_elements; i++)
+ {
+  if(input[i] > local_max)
+  {
+   local_max = input[i];
+  }
+  /* A value above the running maximum cannot also be below the minimum. */
+  else if(input[i] < local_min)
+  {
+   local_min = input[i];
+  }
+ }
+ /* Results are written only once, after the whole array has been scanned. */
+ *min = local_min;
+ *max = local_max;
+}
diff --git a/documentation/tutorial_fpl_2022/01-introduction/Exercise2/solution/synthesize.sh b/documentation/tutorial_fpl_2022/01-introduction/Exercise2/solution/synthesize.sh
new file mode 100755
index 000000000..48f30b583
--- /dev/null
+++ b/documentation/tutorial_fpl_2022/01-introduction/Exercise2/solution/synthesize.sh
@@ -0,0 +1,2 @@
+# Run bambu on minmax.c with the testbench input testbench.xml and simulate.
+# Extra command-line arguments are forwarded through "$@"; all output is
+# copied to log.txt.
+# Without pipefail the pipeline would return tee's status and hide a bambu
+# failure from the caller.
+set -o pipefail
+bambu minmax.c --generate-tb=testbench.xml --simulate "$@" |& tee log.txt
diff --git a/documentation/tutorial_fpl_2022/01-introduction/Exercise2/solution/testbench.xml b/documentation/tutorial_fpl_2022/01-introduction/Exercise2/solution/testbench.xml
new file mode 100644
index 000000000..3781cfbc4
--- /dev/null
+++ b/documentation/tutorial_fpl_2022/01-introduction/Exercise2/solution/testbench.xml
@@ -0,0 +1,7 @@
diff --git a/documentation/tutorial_fpl_2022/01-introduction/Exercise3/bambu.sh b/documentation/tutorial_fpl_2022/01-introduction/Exercise3/bambu.sh
new file mode 100644
index 000000000..d8574c1d5
--- /dev/null
+++ b/documentation/tutorial_fpl_2022/01-introduction/Exercise3/bambu.sh
@@ -0,0 +1,2 @@
+# Run bambu on the LLVM IR file matmul.ll with main_kernel as top function,
+# using test.xml as testbench input and Verilator as simulator.
+# Extra command-line arguments are forwarded through "$@"; all output is
+# copied to log.txt.
+# Without pipefail the pipeline would return tee's status and hide a bambu
+# failure from the caller.
+set -o pipefail
+bambu matmul.ll --top-fname=main_kernel --generate-tb=test.xml --simulate --simulator=VERILATOR --compiler=I386_CLANG12 "$@" |& tee log.txt
diff --git a/documentation/tutorial_fpl_2022/01-introduction/Exercise3/matmul.ll b/documentation/tutorial_fpl_2022/01-introduction/Exercise3/matmul.ll
new file mode 100644
index 000000000..b23ebcf11
--- /dev/null
+++ b/documentation/tutorial_fpl_2022/01-introduction/Exercise3/matmul.ll
@@ -0,0 +1,637 @@
+; Module header: module ID and source filename are both "LLVMDialectModule".
+; ModuleID = 'LLVMDialectModule'
+source_filename = "LLVMDialectModule"
+; External prototypes for malloc and free (declarations only, no bodies here).
+declare i8* @malloc(i64)
+declare void @free(i8*)
+; main_kernel(%0, %1, %2): all three arguments are noalias float pointers.
+; The body is straight-line code in five phases:
+;   1. copy %0[0..15] into stack buffer %4
+;   2. copy %1[0..7]  into stack buffer %53
+;   3. copy %2[0..7]  into stack buffer %78
+;   4. add four products of %4 and %53 elements to each of %78[0..7]
+;   5. store %78[0..7] back through %2[0..7]
+; NOTE(review): the index pattern in phase 4 is consistent with a row-major
+; 4x4 (%4) times 4x2 (%53) product accumulated into a 4x2 result (%78);
+; this is inferred from the indices only - confirm against the MLIR source.
+define void @main_kernel(float* noalias %0, float* noalias %1, float* noalias %2) !dbg !3 {
+ ; Phase 1: stack buffer %4, elements 0..15 filled from %0[0..15].
+ %4 = alloca float, i64 ptrtoint (float* getelementptr (float, float* null, i64 16) to i64), align 4, !dbg !7
+ %5 = getelementptr float, float* %0, i64 0, !dbg !9
+ %6 = load float, float* %5, align 4, !dbg !10
+ %7 = getelementptr float, float* %4, i64 0, !dbg !11
+ store float %6, float* %7, align 4, !dbg !12
+ %8 = getelementptr float, float* %0, i64 1, !dbg !13
+ %9 = load float, float* %8, align 4, !dbg !14
+ %10 = getelementptr float, float* %4, i64 1, !dbg !15
+ store float %9, float* %10, align 4, !dbg !16
+ %11 = getelementptr float, float* %0, i64 2, !dbg !17
+ %12 = load float, float* %11, align 4, !dbg !18
+ %13 = getelementptr float, float* %4, i64 2, !dbg !19
+ store float %12, float* %13, align 4, !dbg !20
+ %14 = getelementptr float, float* %0, i64 3, !dbg !21
+ %15 = load float, float* %14, align 4, !dbg !22
+ %16 = getelementptr float, float* %4, i64 3, !dbg !23
+ store float %15, float* %16, align 4, !dbg !24
+ %17 = getelementptr float, float* %0, i64 4, !dbg !25
+ %18 = load float, float* %17, align 4, !dbg !26
+ %19 = getelementptr float, float* %4, i64 4, !dbg !27
+ store float %18, float* %19, align 4, !dbg !28
+ %20 = getelementptr float, float* %0, i64 5, !dbg !29
+ %21 = load float, float* %20, align 4, !dbg !30
+ %22 = getelementptr float, float* %4, i64 5, !dbg !31
+ store float %21, float* %22, align 4, !dbg !32
+ %23 = getelementptr float, float* %0, i64 6, !dbg !33
+ %24 = load float, float* %23, align 4, !dbg !34
+ %25 = getelementptr float, float* %4, i64 6, !dbg !35
+ store float %24, float* %25, align 4, !dbg !36
+ %26 = getelementptr float, float* %0, i64 7, !dbg !37
+ %27 = load float, float* %26, align 4, !dbg !38
+ %28 = getelementptr float, float* %4, i64 7, !dbg !39
+ store float %27, float* %28, align 4, !dbg !40
+ %29 = getelementptr float, float* %0, i64 8, !dbg !41
+ %30 = load float, float* %29, align 4, !dbg !42
+ %31 = getelementptr float, float* %4, i64 8, !dbg !43
+ store float %30, float* %31, align 4, !dbg !44
+ %32 = getelementptr float, float* %0, i64 9, !dbg !45
+ %33 = load float, float* %32, align 4, !dbg !46
+ %34 = getelementptr float, float* %4, i64 9, !dbg !47
+ store float %33, float* %34, align 4, !dbg !48
+ %35 = getelementptr float, float* %0, i64 10, !dbg !49
+ %36 = load float, float* %35, align 4, !dbg !50
+ %37 = getelementptr float, float* %4, i64 10, !dbg !51
+ store float %36, float* %37, align 4, !dbg !52
+ %38 = getelementptr float, float* %0, i64 11, !dbg !53
+ %39 = load float, float* %38, align 4, !dbg !54
+ %40 = getelementptr float, float* %4, i64 11, !dbg !55
+ store float %39, float* %40, align 4, !dbg !56
+ %41 = getelementptr float, float* %0, i64 12, !dbg !57
+ %42 = load float, float* %41, align 4, !dbg !58
+ %43 = getelementptr float, float* %4, i64 12, !dbg !59
+ store float %42, float* %43, align 4, !dbg !60
+ %44 = getelementptr float, float* %0, i64 13, !dbg !61
+ %45 = load float, float* %44, align 4, !dbg !62
+ %46 = getelementptr float, float* %4, i64 13, !dbg !63
+ store float %45, float* %46, align 4, !dbg !64
+ %47 = getelementptr float, float* %0, i64 14, !dbg !65
+ %48 = load float, float* %47, align 4, !dbg !66
+ %49 = getelementptr float, float* %4, i64 14, !dbg !67
+ store float %48, float* %49, align 4, !dbg !68
+ %50 = getelementptr float, float* %0, i64 15, !dbg !69
+ %51 = load float, float* %50, align 4, !dbg !70
+ %52 = getelementptr float, float* %4, i64 15, !dbg !71
+ store float %51, float* %52, align 4, !dbg !72
+ ; Phase 2: stack buffer %53, elements 0..7 filled from %1[0..7].
+ %53 = alloca float, i64 ptrtoint (float* getelementptr (float, float* null, i64 8) to i64), align 4, !dbg !73
+ %54 = getelementptr float, float* %1, i64 0, !dbg !74
+ %55 = load float, float* %54, align 4, !dbg !75
+ %56 = getelementptr float, float* %53, i64 0, !dbg !76
+ store float %55, float* %56, align 4, !dbg !77
+ %57 = getelementptr float, float* %1, i64 1, !dbg !78
+ %58 = load float, float* %57, align 4, !dbg !79
+ %59 = getelementptr float, float* %53, i64 1, !dbg !80
+ store float %58, float* %59, align 4, !dbg !81
+ %60 = getelementptr float, float* %1, i64 2, !dbg !82
+ %61 = load float, float* %60, align 4, !dbg !83
+ %62 = getelementptr float, float* %53, i64 2, !dbg !84
+ store float %61, float* %62, align 4, !dbg !85
+ %63 = getelementptr float, float* %1, i64 3, !dbg !86
+ %64 = load float, float* %63, align 4, !dbg !87
+ %65 = getelementptr float, float* %53, i64 3, !dbg !88
+ store float %64, float* %65, align 4, !dbg !89
+ %66 = getelementptr float, float* %1, i64 4, !dbg !90
+ %67 = load float, float* %66, align 4, !dbg !91
+ %68 = getelementptr float, float* %53, i64 4, !dbg !92
+ store float %67, float* %68, align 4, !dbg !93
+ %69 = getelementptr float, float* %1, i64 5, !dbg !94
+ %70 = load float, float* %69, align 4, !dbg !95
+ %71 = getelementptr float, float* %53, i64 5, !dbg !96
+ store float %70, float* %71, align 4, !dbg !97
+ %72 = getelementptr float, float* %1, i64 6, !dbg !98
+ %73 = load float, float* %72, align 4, !dbg !99
+ %74 = getelementptr float, float* %53, i64 6, !dbg !100
+ store float %73, float* %74, align 4, !dbg !101
+ %75 = getelementptr float, float* %1, i64 7, !dbg !102
+ %76 = load float, float* %75, align 4, !dbg !103
+ %77 = getelementptr float, float* %53, i64 7, !dbg !104
+ store float %76, float* %77, align 4, !dbg !105
+ ; Phase 3: stack buffer %78, elements 0..7 filled from %2[0..7].
+ %78 = alloca float, i64 ptrtoint (float* getelementptr (float, float* null, i64 8) to i64), align 4, !dbg !106
+ %79 = getelementptr float, float* %2, i64 0, !dbg !107
+ %80 = load float, float* %79, align 4, !dbg !108
+ %81 = getelementptr float, float* %78, i64 0, !dbg !109
+ store float %80, float* %81, align 4, !dbg !110
+ %82 = getelementptr float, float* %2, i64 1, !dbg !111
+ %83 = load float, float* %82, align 4, !dbg !112
+ %84 = getelementptr float, float* %78, i64 1, !dbg !113
+ store float %83, float* %84, align 4, !dbg !114
+ %85 = getelementptr float, float* %2, i64 2, !dbg !115
+ %86 = load float, float* %85, align 4, !dbg !116
+ %87 = getelementptr float, float* %78, i64 2, !dbg !117
+ store float %86, float* %87, align 4, !dbg !118
+ %88 = getelementptr float, float* %2, i64 3, !dbg !119
+ %89 = load float, float* %88, align 4, !dbg !120
+ %90 = getelementptr float, float* %78, i64 3, !dbg !121
+ store float %89, float* %90, align 4, !dbg !122
+ %91 = getelementptr float, float* %2, i64 4, !dbg !123
+ %92 = load float, float* %91, align 4, !dbg !124
+ %93 = getelementptr float, float* %78, i64 4, !dbg !125
+ store float %92, float* %93, align 4, !dbg !126
+ %94 = getelementptr float, float* %2, i64 5, !dbg !127
+ %95 = load float, float* %94, align 4, !dbg !128
+ %96 = getelementptr float, float* %78, i64 5, !dbg !129
+ store float %95, float* %96, align 4, !dbg !130
+ %97 = getelementptr float, float* %2, i64 6, !dbg !131
+ %98 = load float, float* %97, align 4, !dbg !132
+ %99 = getelementptr float, float* %78, i64 6, !dbg !133
+ store float %98, float* %99, align 4, !dbg !134
+ %100 = getelementptr float, float* %2, i64 7, !dbg !135
+ %101 = load float, float* %100, align 4, !dbg !136
+ %102 = getelementptr float, float* %78, i64 7, !dbg !137
+ store float %101, float* %102, align 4, !dbg !138
+ ; Phase 4, %78[0]: add %4[0]*%53[0], %4[1]*%53[2], %4[2]*%53[4], %4[3]*%53[6] in that order.
+ %103 = getelementptr float, float* %4, i64 0, !dbg !139
+ %104 = load float, float* %103, align 4, !dbg !140
+ %105 = getelementptr float, float* %53, i64 0, !dbg !141
+ %106 = load float, float* %105, align 4, !dbg !142
+ %107 = getelementptr float, float* %78, i64 0, !dbg !143
+ %108 = load float, float* %107, align 4, !dbg !144
+ %109 = fmul float %104, %106, !dbg !145
+ %110 = fadd float %108, %109, !dbg !146
+ %111 = getelementptr float, float* %4, i64 1, !dbg !147
+ %112 = load float, float* %111, align 4, !dbg !148
+ %113 = getelementptr float, float* %53, i64 2, !dbg !149
+ %114 = load float, float* %113, align 4, !dbg !150
+ %115 = fmul float %112, %114, !dbg !151
+ %116 = fadd float %110, %115, !dbg !152
+ %117 = getelementptr float, float* %4, i64 2, !dbg !153
+ %118 = load float, float* %117, align 4, !dbg !154
+ %119 = getelementptr float, float* %53, i64 4, !dbg !155
+ %120 = load float, float* %119, align 4, !dbg !156
+ %121 = fmul float %118, %120, !dbg !157
+ %122 = fadd float %116, %121, !dbg !158
+ %123 = getelementptr float, float* %4, i64 3, !dbg !159
+ %124 = load float, float* %123, align 4, !dbg !160
+ %125 = getelementptr float, float* %53, i64 6, !dbg !161
+ %126 = load float, float* %125, align 4, !dbg !162
+ %127 = fmul float %124, %126, !dbg !163
+ %128 = fadd float %122, %127, !dbg !164
+ %129 = getelementptr float, float* %78, i64 0, !dbg !165
+ store float %128, float* %129, align 4, !dbg !166
+ ; Phase 4, %78[1]: add %4[0..3] times %53[1], %53[3], %53[5], %53[7]; the %4 loads above are reused.
+ %130 = getelementptr float, float* %53, i64 1, !dbg !167
+ %131 = load float, float* %130, align 4, !dbg !168
+ %132 = getelementptr float, float* %78, i64 1, !dbg !169
+ %133 = load float, float* %132, align 4, !dbg !170
+ %134 = fmul float %104, %131, !dbg !171
+ %135 = fadd float %133, %134, !dbg !172
+ %136 = getelementptr float, float* %53, i64 3, !dbg !173
+ %137 = load float, float* %136, align 4, !dbg !174
+ %138 = fmul float %112, %137, !dbg !175
+ %139 = fadd float %135, %138, !dbg !176
+ %140 = getelementptr float, float* %53, i64 5, !dbg !177
+ %141 = load float, float* %140, align 4, !dbg !178
+ %142 = fmul float %118, %141, !dbg !179
+ %143 = fadd float %139, %142, !dbg !180
+ %144 = getelementptr float, float* %53, i64 7, !dbg !181
+ %145 = load float, float* %144, align 4, !dbg !182
+ %146 = fmul float %124, %145, !dbg !183
+ %147 = fadd float %143, %146, !dbg !184
+ %148 = getelementptr float, float* %78, i64 1, !dbg !185
+ store float %147, float* %148, align 4, !dbg !186
+ ; Phase 4, %78[2]: add %4[4..7] times the already loaded %53[0], %53[2], %53[4], %53[6].
+ %149 = getelementptr float, float* %4, i64 4, !dbg !187
+ %150 = load float, float* %149, align 4, !dbg !188
+ %151 = getelementptr float, float* %78, i64 2, !dbg !189
+ %152 = load float, float* %151, align 4, !dbg !190
+ %153 = fmul float %150, %106, !dbg !191
+ %154 = fadd float %152, %153, !dbg !192
+ %155 = getelementptr float, float* %4, i64 5, !dbg !193
+ %156 = load float, float* %155, align 4, !dbg !194
+ %157 = fmul float %156, %114, !dbg !195
+ %158 = fadd float %154, %157, !dbg !196
+ %159 = getelementptr float, float* %4, i64 6, !dbg !197
+ %160 = load float, float* %159, align 4, !dbg !198
+ %161 = fmul float %160, %120, !dbg !199
+ %162 = fadd float %158, %161, !dbg !200
+ %163 = getelementptr float, float* %4, i64 7, !dbg !201
+ %164 = load float, float* %163, align 4, !dbg !202
+ %165 = fmul float %164, %126, !dbg !203
+ %166 = fadd float %162, %165, !dbg !204
+ %167 = getelementptr float, float* %78, i64 2, !dbg !205
+ store float %166, float* %167, align 4, !dbg !206
+ ; Phase 4, %78[3]: add %4[4..7] times the already loaded %53[1], %53[3], %53[5], %53[7].
+ %168 = getelementptr float, float* %78, i64 3, !dbg !207
+ %169 = load float, float* %168, align 4, !dbg !208
+ %170 = fmul float %150, %131, !dbg !209
+ %171 = fadd float %169, %170, !dbg !210
+ %172 = fmul float %156, %137, !dbg !211
+ %173 = fadd float %171, %172, !dbg !212
+ %174 = fmul float %160, %141, !dbg !213
+ %175 = fadd float %173, %174, !dbg !214
+ %176 = fmul float %164, %145, !dbg !215
+ %177 = fadd float %175, %176, !dbg !216
+ %178 = getelementptr float, float* %78, i64 3, !dbg !217
+ store float %177, float* %178, align 4, !dbg !218
+ ; Phase 4, %78[4]: add %4[8..11] times the already loaded %53[0], %53[2], %53[4], %53[6].
+ %179 = getelementptr float, float* %4, i64 8, !dbg !219
+ %180 = load float, float* %179, align 4, !dbg !220
+ %181 = getelementptr float, float* %78, i64 4, !dbg !221
+ %182 = load float, float* %181, align 4, !dbg !222
+ %183 = fmul float %180, %106, !dbg !223
+ %184 = fadd float %182, %183, !dbg !224
+ %185 = getelementptr float, float* %4, i64 9, !dbg !225
+ %186 = load float, float* %185, align 4, !dbg !226
+ %187 = fmul float %186, %114, !dbg !227
+ %188 = fadd float %184, %187, !dbg !228
+ %189 = getelementptr float, float* %4, i64 10, !dbg !229
+ %190 = load float, float* %189, align 4, !dbg !230
+ %191 = fmul float %190, %120, !dbg !231
+ %192 = fadd float %188, %191, !dbg !232
+ %193 = getelementptr float, float* %4, i64 11, !dbg !233
+ %194 = load float, float* %193, align 4, !dbg !234
+ %195 = fmul float %194, %126, !dbg !235
+ %196 = fadd float %192, %195, !dbg !236
+ %197 = getelementptr float, float* %78, i64 4, !dbg !237
+ store float %196, float* %197, align 4, !dbg !238
+ ; Phase 4, %78[5]: add %4[8..11] times the already loaded %53[1], %53[3], %53[5], %53[7].
+ %198 = getelementptr float, float* %78, i64 5, !dbg !239
+ %199 = load float, float* %198, align 4, !dbg !240
+ %200 = fmul float %180, %131, !dbg !241
+ %201 = fadd float %199, %200, !dbg !242
+ %202 = fmul float %186, %137, !dbg !243
+ %203 = fadd float %201, %202, !dbg !244
+ %204 = fmul float %190, %141, !dbg !245
+ %205 = fadd float %203, %204, !dbg !246
+ %206 = fmul float %194, %145, !dbg !247
+ %207 = fadd float %205, %206, !dbg !248
+ %208 = getelementptr float, float* %78, i64 5, !dbg !249
+ store float %207, float* %208, align 4, !dbg !250
+ ; Phase 4, %78[6]: add %4[12..15] times the already loaded %53[0], %53[2], %53[4], %53[6].
+ %209 = getelementptr float, float* %4, i64 12, !dbg !251
+ %210 = load float, float* %209, align 4, !dbg !252
+ %211 = getelementptr float, float* %78, i64 6, !dbg !253
+ %212 = load float, float* %211, align 4, !dbg !254
+ %213 = fmul float %210, %106, !dbg !255
+ %214 = fadd float %212, %213, !dbg !256
+ %215 = getelementptr float, float* %4, i64 13, !dbg !257
+ %216 = load float, float* %215, align 4, !dbg !258
+ %217 = fmul float %216, %114, !dbg !259
+ %218 = fadd float %214, %217, !dbg !260
+ %219 = getelementptr float, float* %4, i64 14, !dbg !261
+ %220 = load float, float* %219, align 4, !dbg !262
+ %221 = fmul float %220, %120, !dbg !263
+ %222 = fadd float %218, %221, !dbg !264
+ %223 = getelementptr float, float* %4, i64 15, !dbg !265
+ %224 = load float, float* %223, align 4, !dbg !266
+ %225 = fmul float %224, %126, !dbg !267
+ %226 = fadd float %222, %225, !dbg !268
+ %227 = getelementptr float, float* %78, i64 6, !dbg !269
+ store float %226, float* %227, align 4, !dbg !270
+ ; Phase 4, %78[7]: add %4[12..15] times the already loaded %53[1], %53[3], %53[5], %53[7].
+ %228 = getelementptr float, float* %78, i64 7, !dbg !271
+ %229 = load float, float* %228, align 4, !dbg !272
+ %230 = fmul float %210, %131, !dbg !273
+ %231 = fadd float %229, %230, !dbg !274
+ %232 = fmul float %216, %137, !dbg !275
+ %233 = fadd float %231, %232, !dbg !276
+ %234 = fmul float %220, %141, !dbg !277
+ %235 = fadd float %233, %234, !dbg !278
+ %236 = fmul float %224, %145, !dbg !279
+ %237 = fadd float %235, %236, !dbg !280
+ %238 = getelementptr float, float* %78, i64 7, !dbg !281
+ store float %237, float* %238, align 4, !dbg !282
+ ; Phase 5: store %78[0..7] back through %2[0..7].
+ %239 = getelementptr float, float* %78, i64 0, !dbg !283
+ %240 = load float, float* %239, align 4, !dbg !284
+ %241 = getelementptr float, float* %2, i64 0, !dbg !285
+ store float %240, float* %241, align 4, !dbg !286
+ %242 = getelementptr float, float* %78, i64 1, !dbg !287
+ %243 = load float, float* %242, align 4, !dbg !288
+ %244 = getelementptr float, float* %2, i64 1, !dbg !289
+ store float %243, float* %244, align 4, !dbg !290
+ %245 = getelementptr float, float* %78, i64 2, !dbg !291
+ %246 = load float, float* %245, align 4, !dbg !292
+ %247 = getelementptr float, float* %2, i64 2, !dbg !293
+ store float %246, float* %247, align 4, !dbg !294
+ %248 = getelementptr float, float* %78, i64 3, !dbg !295
+ %249 = load float, float* %248, align 4, !dbg !296
+ %250 = getelementptr float, float* %2, i64 3, !dbg !297
+ store float %249, float* %250, align 4, !dbg !298
+ %251 = getelementptr float, float* %78, i64 4, !dbg !299
+ %252 = load float, float* %251, align 4, !dbg !300
+ %253 = getelementptr float, float* %2, i64 4, !dbg !301
+ store float %252, float* %253, align 4, !dbg !302
+ %254 = getelementptr float, float* %78, i64 5, !dbg !303
+ %255 = load float, float* %254, align 4, !dbg !304
+ %256 = getelementptr float, float* %2, i64 5, !dbg !305
+ store float %255, float* %256, align 4, !dbg !306
+ %257 = getelementptr float, float* %78, i64 6, !dbg !307
+ %258 = load float, float* %257, align 4, !dbg !308
+ %259 = getelementptr float, float* %2, i64 6, !dbg !309
+ store float %258, float* %259, align 4, !dbg !310
+ %260 = getelementptr float, float* %78, i64 7, !dbg !311
+ %261 = load float, float* %260, align 4, !dbg !312
+ %262 = getelementptr float, float* %2, i64 7, !dbg !313
+ store float %261, float* %262, align 4, !dbg !314
+ ret void, !dbg !315
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2}
+!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "mlir", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
+!1 = !DIFile(filename: "LLVMDialectModule", directory: "/")
+!2 = !{i32 2, !"Debug Info Version", i32 3}
+!3 = distinct !DISubprogram(name: "main_kernel", linkageName: "main_kernel", scope: null, file: !4, line: 2, type: !5, scopeLine: 2, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !6)
+!4 = !DIFile(filename: "output/04optimized.mlir", directory: "/files0/extended/bohm747/Development/soda/soda-opt/docs/tutorials/dataflow2022")
+!5 = !DISubroutineType(types: !6)
+!6 = !{}
+!7 = !DILocation(line: 11, column: 10, scope: !8)
+!8 = !DILexicalBlockFile(scope: !3, file: !4, discriminator: 0)
+!9 = !DILocation(line: 18, column: 11, scope: !8)
+!10 = !DILocation(line: 19, column: 11, scope: !8)
+!11 = !DILocation(line: 26, column: 11, scope: !8)
+!12 = !DILocation(line: 27, column: 5, scope: !8)
+!13 = !DILocation(line: 34, column: 11, scope: !8)
+!14 = !DILocation(line: 35, column: 11, scope: !8)
+!15 = !DILocation(line: 42, column: 11, scope: !8)
+!16 = !DILocation(line: 43, column: 5, scope: !8)
+!17 = !DILocation(line: 50, column: 11, scope: !8)
+!18 = !DILocation(line: 51, column: 11, scope: !8)
+!19 = !DILocation(line: 58, column: 11, scope: !8)
+!20 = !DILocation(line: 59, column: 5, scope: !8)
+!21 = !DILocation(line: 66, column: 11, scope: !8)
+!22 = !DILocation(line: 67, column: 11, scope: !8)
+!23 = !DILocation(line: 74, column: 11, scope: !8)
+!24 = !DILocation(line: 75, column: 5, scope: !8)
+!25 = !DILocation(line: 82, column: 11, scope: !8)
+!26 = !DILocation(line: 83, column: 11, scope: !8)
+!27 = !DILocation(line: 90, column: 11, scope: !8)
+!28 = !DILocation(line: 91, column: 5, scope: !8)
+!29 = !DILocation(line: 98, column: 11, scope: !8)
+!30 = !DILocation(line: 99, column: 11, scope: !8)
+!31 = !DILocation(line: 106, column: 11, scope: !8)
+!32 = !DILocation(line: 107, column: 5, scope: !8)
+!33 = !DILocation(line: 114, column: 12, scope: !8)
+!34 = !DILocation(line: 115, column: 12, scope: !8)
+!35 = !DILocation(line: 122, column: 12, scope: !8)
+!36 = !DILocation(line: 123, column: 5, scope: !8)
+!37 = !DILocation(line: 130, column: 12, scope: !8)
+!38 = !DILocation(line: 131, column: 12, scope: !8)
+!39 = !DILocation(line: 138, column: 12, scope: !8)
+!40 = !DILocation(line: 139, column: 5, scope: !8)
+!41 = !DILocation(line: 146, column: 12, scope: !8)
+!42 = !DILocation(line: 147, column: 12, scope: !8)
+!43 = !DILocation(line: 154, column: 12, scope: !8)
+!44 = !DILocation(line: 155, column: 5, scope: !8)
+!45 = !DILocation(line: 162, column: 12, scope: !8)
+!46 = !DILocation(line: 163, column: 12, scope: !8)
+!47 = !DILocation(line: 170, column: 12, scope: !8)
+!48 = !DILocation(line: 171, column: 5, scope: !8)
+!49 = !DILocation(line: 178, column: 12, scope: !8)
+!50 = !DILocation(line: 179, column: 12, scope: !8)
+!51 = !DILocation(line: 186, column: 12, scope: !8)
+!52 = !DILocation(line: 187, column: 5, scope: !8)
+!53 = !DILocation(line: 194, column: 12, scope: !8)
+!54 = !DILocation(line: 195, column: 12, scope: !8)
+!55 = !DILocation(line: 202, column: 12, scope: !8)
+!56 = !DILocation(line: 203, column: 5, scope: !8)
+!57 = !DILocation(line: 210, column: 12, scope: !8)
+!58 = !DILocation(line: 211, column: 12, scope: !8)
+!59 = !DILocation(line: 218, column: 12, scope: !8)
+!60 = !DILocation(line: 219, column: 5, scope: !8)
+!61 = !DILocation(line: 226, column: 12, scope: !8)
+!62 = !DILocation(line: 227, column: 12, scope: !8)
+!63 = !DILocation(line: 234, column: 12, scope: !8)
+!64 = !DILocation(line: 235, column: 5, scope: !8)
+!65 = !DILocation(line: 242, column: 12, scope: !8)
+!66 = !DILocation(line: 243, column: 12, scope: !8)
+!67 = !DILocation(line: 250, column: 12, scope: !8)
+!68 = !DILocation(line: 251, column: 5, scope: !8)
+!69 = !DILocation(line: 258, column: 12, scope: !8)
+!70 = !DILocation(line: 259, column: 12, scope: !8)
+!71 = !DILocation(line: 266, column: 12, scope: !8)
+!72 = !DILocation(line: 267, column: 5, scope: !8)
+!73 = !DILocation(line: 272, column: 12, scope: !8)
+!74 = !DILocation(line: 279, column: 12, scope: !8)
+!75 = !DILocation(line: 280, column: 12, scope: !8)
+!76 = !DILocation(line: 287, column: 12, scope: !8)
+!77 = !DILocation(line: 288, column: 5, scope: !8)
+!78 = !DILocation(line: 295, column: 12, scope: !8)
+!79 = !DILocation(line: 296, column: 12, scope: !8)
+!80 = !DILocation(line: 303, column: 12, scope: !8)
+!81 = !DILocation(line: 304, column: 5, scope: !8)
+!82 = !DILocation(line: 311, column: 12, scope: !8)
+!83 = !DILocation(line: 312, column: 12, scope: !8)
+!84 = !DILocation(line: 319, column: 12, scope: !8)
+!85 = !DILocation(line: 320, column: 5, scope: !8)
+!86 = !DILocation(line: 327, column: 12, scope: !8)
+!87 = !DILocation(line: 328, column: 12, scope: !8)
+!88 = !DILocation(line: 335, column: 12, scope: !8)
+!89 = !DILocation(line: 336, column: 5, scope: !8)
+!90 = !DILocation(line: 343, column: 12, scope: !8)
+!91 = !DILocation(line: 344, column: 12, scope: !8)
+!92 = !DILocation(line: 351, column: 12, scope: !8)
+!93 = !DILocation(line: 352, column: 5, scope: !8)
+!94 = !DILocation(line: 359, column: 12, scope: !8)
+!95 = !DILocation(line: 360, column: 12, scope: !8)
+!96 = !DILocation(line: 367, column: 12, scope: !8)
+!97 = !DILocation(line: 368, column: 5, scope: !8)
+!98 = !DILocation(line: 375, column: 12, scope: !8)
+!99 = !DILocation(line: 376, column: 12, scope: !8)
+!100 = !DILocation(line: 383, column: 12, scope: !8)
+!101 = !DILocation(line: 384, column: 5, scope: !8)
+!102 = !DILocation(line: 391, column: 12, scope: !8)
+!103 = !DILocation(line: 392, column: 12, scope: !8)
+!104 = !DILocation(line: 399, column: 12, scope: !8)
+!105 = !DILocation(line: 400, column: 5, scope: !8)
+!106 = !DILocation(line: 405, column: 12, scope: !8)
+!107 = !DILocation(line: 412, column: 12, scope: !8)
+!108 = !DILocation(line: 413, column: 12, scope: !8)
+!109 = !DILocation(line: 420, column: 12, scope: !8)
+!110 = !DILocation(line: 421, column: 5, scope: !8)
+!111 = !DILocation(line: 428, column: 12, scope: !8)
+!112 = !DILocation(line: 429, column: 12, scope: !8)
+!113 = !DILocation(line: 436, column: 12, scope: !8)
+!114 = !DILocation(line: 437, column: 5, scope: !8)
+!115 = !DILocation(line: 444, column: 12, scope: !8)
+!116 = !DILocation(line: 445, column: 12, scope: !8)
+!117 = !DILocation(line: 452, column: 12, scope: !8)
+!118 = !DILocation(line: 453, column: 5, scope: !8)
+!119 = !DILocation(line: 460, column: 12, scope: !8)
+!120 = !DILocation(line: 461, column: 12, scope: !8)
+!121 = !DILocation(line: 468, column: 12, scope: !8)
+!122 = !DILocation(line: 469, column: 5, scope: !8)
+!123 = !DILocation(line: 476, column: 12, scope: !8)
+!124 = !DILocation(line: 477, column: 12, scope: !8)
+!125 = !DILocation(line: 484, column: 12, scope: !8)
+!126 = !DILocation(line: 485, column: 5, scope: !8)
+!127 = !DILocation(line: 492, column: 12, scope: !8)
+!128 = !DILocation(line: 493, column: 12, scope: !8)
+!129 = !DILocation(line: 500, column: 12, scope: !8)
+!130 = !DILocation(line: 501, column: 5, scope: !8)
+!131 = !DILocation(line: 508, column: 12, scope: !8)
+!132 = !DILocation(line: 509, column: 12, scope: !8)
+!133 = !DILocation(line: 516, column: 12, scope: !8)
+!134 = !DILocation(line: 517, column: 5, scope: !8)
+!135 = !DILocation(line: 524, column: 12, scope: !8)
+!136 = !DILocation(line: 525, column: 12, scope: !8)
+!137 = !DILocation(line: 532, column: 12, scope: !8)
+!138 = !DILocation(line: 533, column: 5, scope: !8)
+!139 = !DILocation(line: 540, column: 12, scope: !8)
+!140 = !DILocation(line: 541, column: 12, scope: !8)
+!141 = !DILocation(line: 548, column: 12, scope: !8)
+!142 = !DILocation(line: 549, column: 12, scope: !8)
+!143 = !DILocation(line: 556, column: 12, scope: !8)
+!144 = !DILocation(line: 557, column: 12, scope: !8)
+!145 = !DILocation(line: 558, column: 12, scope: !8)
+!146 = !DILocation(line: 559, column: 12, scope: !8)
+!147 = !DILocation(line: 566, column: 12, scope: !8)
+!148 = !DILocation(line: 567, column: 12, scope: !8)
+!149 = !DILocation(line: 574, column: 12, scope: !8)
+!150 = !DILocation(line: 575, column: 12, scope: !8)
+!151 = !DILocation(line: 576, column: 12, scope: !8)
+!152 = !DILocation(line: 577, column: 12, scope: !8)
+!153 = !DILocation(line: 584, column: 12, scope: !8)
+!154 = !DILocation(line: 585, column: 12, scope: !8)
+!155 = !DILocation(line: 592, column: 12, scope: !8)
+!156 = !DILocation(line: 593, column: 12, scope: !8)
+!157 = !DILocation(line: 594, column: 12, scope: !8)
+!158 = !DILocation(line: 595, column: 12, scope: !8)
+!159 = !DILocation(line: 602, column: 12, scope: !8)
+!160 = !DILocation(line: 603, column: 12, scope: !8)
+!161 = !DILocation(line: 610, column: 12, scope: !8)
+!162 = !DILocation(line: 611, column: 12, scope: !8)
+!163 = !DILocation(line: 612, column: 12, scope: !8)
+!164 = !DILocation(line: 613, column: 12, scope: !8)
+!165 = !DILocation(line: 620, column: 12, scope: !8)
+!166 = !DILocation(line: 621, column: 5, scope: !8)
+!167 = !DILocation(line: 628, column: 12, scope: !8)
+!168 = !DILocation(line: 629, column: 12, scope: !8)
+!169 = !DILocation(line: 636, column: 12, scope: !8)
+!170 = !DILocation(line: 637, column: 12, scope: !8)
+!171 = !DILocation(line: 638, column: 12, scope: !8)
+!172 = !DILocation(line: 639, column: 12, scope: !8)
+!173 = !DILocation(line: 646, column: 12, scope: !8)
+!174 = !DILocation(line: 647, column: 12, scope: !8)
+!175 = !DILocation(line: 648, column: 12, scope: !8)
+!176 = !DILocation(line: 649, column: 12, scope: !8)
+!177 = !DILocation(line: 656, column: 12, scope: !8)
+!178 = !DILocation(line: 657, column: 12, scope: !8)
+!179 = !DILocation(line: 658, column: 12, scope: !8)
+!180 = !DILocation(line: 659, column: 12, scope: !8)
+!181 = !DILocation(line: 666, column: 12, scope: !8)
+!182 = !DILocation(line: 667, column: 12, scope: !8)
+!183 = !DILocation(line: 668, column: 12, scope: !8)
+!184 = !DILocation(line: 669, column: 12, scope: !8)
+!185 = !DILocation(line: 676, column: 12, scope: !8)
+!186 = !DILocation(line: 677, column: 5, scope: !8)
+!187 = !DILocation(line: 684, column: 12, scope: !8)
+!188 = !DILocation(line: 685, column: 12, scope: !8)
+!189 = !DILocation(line: 692, column: 12, scope: !8)
+!190 = !DILocation(line: 693, column: 12, scope: !8)
+!191 = !DILocation(line: 694, column: 12, scope: !8)
+!192 = !DILocation(line: 695, column: 12, scope: !8)
+!193 = !DILocation(line: 702, column: 12, scope: !8)
+!194 = !DILocation(line: 703, column: 12, scope: !8)
+!195 = !DILocation(line: 704, column: 12, scope: !8)
+!196 = !DILocation(line: 705, column: 12, scope: !8)
+!197 = !DILocation(line: 712, column: 12, scope: !8)
+!198 = !DILocation(line: 713, column: 12, scope: !8)
+!199 = !DILocation(line: 714, column: 12, scope: !8)
+!200 = !DILocation(line: 715, column: 12, scope: !8)
+!201 = !DILocation(line: 722, column: 12, scope: !8)
+!202 = !DILocation(line: 723, column: 12, scope: !8)
+!203 = !DILocation(line: 724, column: 12, scope: !8)
+!204 = !DILocation(line: 725, column: 12, scope: !8)
+!205 = !DILocation(line: 732, column: 12, scope: !8)
+!206 = !DILocation(line: 733, column: 5, scope: !8)
+!207 = !DILocation(line: 740, column: 12, scope: !8)
+!208 = !DILocation(line: 741, column: 12, scope: !8)
+!209 = !DILocation(line: 742, column: 12, scope: !8)
+!210 = !DILocation(line: 743, column: 12, scope: !8)
+!211 = !DILocation(line: 744, column: 12, scope: !8)
+!212 = !DILocation(line: 745, column: 12, scope: !8)
+!213 = !DILocation(line: 746, column: 12, scope: !8)
+!214 = !DILocation(line: 747, column: 12, scope: !8)
+!215 = !DILocation(line: 748, column: 12, scope: !8)
+!216 = !DILocation(line: 749, column: 12, scope: !8)
+!217 = !DILocation(line: 756, column: 12, scope: !8)
+!218 = !DILocation(line: 757, column: 5, scope: !8)
+!219 = !DILocation(line: 764, column: 12, scope: !8)
+!220 = !DILocation(line: 765, column: 12, scope: !8)
+!221 = !DILocation(line: 772, column: 12, scope: !8)
+!222 = !DILocation(line: 773, column: 12, scope: !8)
+!223 = !DILocation(line: 774, column: 12, scope: !8)
+!224 = !DILocation(line: 775, column: 12, scope: !8)
+!225 = !DILocation(line: 782, column: 12, scope: !8)
+!226 = !DILocation(line: 783, column: 12, scope: !8)
+!227 = !DILocation(line: 784, column: 12, scope: !8)
+!228 = !DILocation(line: 785, column: 12, scope: !8)
+!229 = !DILocation(line: 792, column: 12, scope: !8)
+!230 = !DILocation(line: 793, column: 12, scope: !8)
+!231 = !DILocation(line: 794, column: 12, scope: !8)
+!232 = !DILocation(line: 795, column: 12, scope: !8)
+!233 = !DILocation(line: 802, column: 12, scope: !8)
+!234 = !DILocation(line: 803, column: 12, scope: !8)
+!235 = !DILocation(line: 804, column: 12, scope: !8)
+!236 = !DILocation(line: 805, column: 12, scope: !8)
+!237 = !DILocation(line: 812, column: 12, scope: !8)
+!238 = !DILocation(line: 813, column: 5, scope: !8)
+!239 = !DILocation(line: 820, column: 12, scope: !8)
+!240 = !DILocation(line: 821, column: 12, scope: !8)
+!241 = !DILocation(line: 822, column: 12, scope: !8)
+!242 = !DILocation(line: 823, column: 12, scope: !8)
+!243 = !DILocation(line: 824, column: 12, scope: !8)
+!244 = !DILocation(line: 825, column: 12, scope: !8)
+!245 = !DILocation(line: 826, column: 12, scope: !8)
+!246 = !DILocation(line: 827, column: 12, scope: !8)
+!247 = !DILocation(line: 828, column: 12, scope: !8)
+!248 = !DILocation(line: 829, column: 12, scope: !8)
+!249 = !DILocation(line: 836, column: 12, scope: !8)
+!250 = !DILocation(line: 837, column: 5, scope: !8)
+!251 = !DILocation(line: 844, column: 12, scope: !8)
+!252 = !DILocation(line: 845, column: 12, scope: !8)
+!253 = !DILocation(line: 852, column: 12, scope: !8)
+!254 = !DILocation(line: 853, column: 12, scope: !8)
+!255 = !DILocation(line: 854, column: 12, scope: !8)
+!256 = !DILocation(line: 855, column: 12, scope: !8)
+!257 = !DILocation(line: 862, column: 12, scope: !8)
+!258 = !DILocation(line: 863, column: 12, scope: !8)
+!259 = !DILocation(line: 864, column: 12, scope: !8)
+!260 = !DILocation(line: 865, column: 12, scope: !8)
+!261 = !DILocation(line: 872, column: 12, scope: !8)
+!262 = !DILocation(line: 873, column: 12, scope: !8)
+!263 = !DILocation(line: 874, column: 12, scope: !8)
+!264 = !DILocation(line: 875, column: 12, scope: !8)
+!265 = !DILocation(line: 882, column: 12, scope: !8)
+!266 = !DILocation(line: 883, column: 12, scope: !8)
+!267 = !DILocation(line: 884, column: 12, scope: !8)
+!268 = !DILocation(line: 885, column: 12, scope: !8)
+!269 = !DILocation(line: 892, column: 12, scope: !8)
+!270 = !DILocation(line: 893, column: 5, scope: !8)
+!271 = !DILocation(line: 900, column: 12, scope: !8)
+!272 = !DILocation(line: 901, column: 12, scope: !8)
+!273 = !DILocation(line: 902, column: 12, scope: !8)
+!274 = !DILocation(line: 903, column: 12, scope: !8)
+!275 = !DILocation(line: 904, column: 12, scope: !8)
+!276 = !DILocation(line: 905, column: 12, scope: !8)
+!277 = !DILocation(line: 906, column: 12, scope: !8)
+!278 = !DILocation(line: 907, column: 12, scope: !8)
+!279 = !DILocation(line: 908, column: 12, scope: !8)
+!280 = !DILocation(line: 909, column: 12, scope: !8)
+!281 = !DILocation(line: 916, column: 12, scope: !8)
+!282 = !DILocation(line: 917, column: 5, scope: !8)
+!283 = !DILocation(line: 924, column: 12, scope: !8)
+!284 = !DILocation(line: 925, column: 12, scope: !8)
+!285 = !DILocation(line: 932, column: 12, scope: !8)
+!286 = !DILocation(line: 933, column: 5, scope: !8)
+!287 = !DILocation(line: 940, column: 12, scope: !8)
+!288 = !DILocation(line: 941, column: 12, scope: !8)
+!289 = !DILocation(line: 948, column: 12, scope: !8)
+!290 = !DILocation(line: 949, column: 5, scope: !8)
+!291 = !DILocation(line: 956, column: 12, scope: !8)
+!292 = !DILocation(line: 957, column: 12, scope: !8)
+!293 = !DILocation(line: 964, column: 12, scope: !8)
+!294 = !DILocation(line: 965, column: 5, scope: !8)
+!295 = !DILocation(line: 972, column: 12, scope: !8)
+!296 = !DILocation(line: 973, column: 12, scope: !8)
+!297 = !DILocation(line: 980, column: 12, scope: !8)
+!298 = !DILocation(line: 981, column: 5, scope: !8)
+!299 = !DILocation(line: 988, column: 12, scope: !8)
+!300 = !DILocation(line: 989, column: 12, scope: !8)
+!301 = !DILocation(line: 996, column: 12, scope: !8)
+!302 = !DILocation(line: 997, column: 5, scope: !8)
+!303 = !DILocation(line: 1004, column: 12, scope: !8)
+!304 = !DILocation(line: 1005, column: 12, scope: !8)
+!305 = !DILocation(line: 1012, column: 12, scope: !8)
+!306 = !DILocation(line: 1013, column: 5, scope: !8)
+!307 = !DILocation(line: 1020, column: 12, scope: !8)
+!308 = !DILocation(line: 1021, column: 12, scope: !8)
+!309 = !DILocation(line: 1028, column: 12, scope: !8)
+!310 = !DILocation(line: 1029, column: 5, scope: !8)
+!311 = !DILocation(line: 1036, column: 12, scope: !8)
+!312 = !DILocation(line: 1037, column: 12, scope: !8)
+!313 = !DILocation(line: 1044, column: 12, scope: !8)
+!314 = !DILocation(line: 1045, column: 5, scope: !8)
+!315 = !DILocation(line: 1046, column: 5, scope: !8)
diff --git a/documentation/tutorial_fpl_2022/01-introduction/Exercise3/test.xml b/documentation/tutorial_fpl_2022/01-introduction/Exercise3/test.xml
new file mode 100644
index 000000000..56aea2e94
--- /dev/null
+++ b/documentation/tutorial_fpl_2022/01-introduction/Exercise3/test.xml
@@ -0,0 +1,8 @@
diff --git a/documentation/tutorial_fpl_2022/01-introduction/Exercise4/bambu.sh b/documentation/tutorial_fpl_2022/01-introduction/Exercise4/bambu.sh
new file mode 100644
index 000000000..a9c45b8a0
--- /dev/null
+++ b/documentation/tutorial_fpl_2022/01-introduction/Exercise4/bambu.sh
@@ -0,0 +1,2 @@
+# Synthesize funcA from proxies.c; extra command-line arguments are forwarded
+# to bambu, and stdout plus stderr are mirrored into log.txt.
+bambu proxies.c --top-fname=funcA "$@" 2>&1 | tee log.txt
diff --git a/documentation/tutorial_fpl_2022/01-introduction/Exercise4/proxies.c b/documentation/tutorial_fpl_2022/01-introduction/Exercise4/proxies.c
new file mode 100644
index 000000000..cc89ba13e
--- /dev/null
+++ b/documentation/tutorial_fpl_2022/01-introduction/Exercise4/proxies.c
@@ -0,0 +1,18 @@
+/* Returns a[0]^2 + a[1]^2. The noinline attribute keeps it a separate function. */
+int __attribute__ ((noinline)) funcC(int a[2]){
+  const int first = a[0];
+  const int second = a[1];
+  return first * first + second * second;
+}
+/* Adds 1 to both elements of a in place, then returns funcC of the updated array. */
+int __attribute__ ((noinline)) funcB(int a[2]){
+  a[0] += 1;
+  a[1] += 1;
+  return funcC(a);
+}
+/* Top function: evaluates funcC on {0,1}, then funcB on the same array
+   (which funcB modifies), and returns the sum of both results. */
+int funcA(){
+  int a[2] = {0, 1};
+  const int before = funcC(a);
+  const int after = funcB(a);
+  return before + after;
+}
\ No newline at end of file
diff --git a/documentation/tutorial_fpl_2022/01-introduction/Exercise5/LUdecomposition.c b/documentation/tutorial_fpl_2022/01-introduction/Exercise5/LUdecomposition.c
new file mode 100644
index 000000000..a20e6b9f7
--- /dev/null
+++ b/documentation/tutorial_fpl_2022/01-introduction/Exercise5/LUdecomposition.c
@@ -0,0 +1,270 @@
+// int Upper_Triangular_Solve(float *U, float *B, float x[], int n) //
+// //
+// Description: //
+// This routine solves the linear equation Ux = B, where U is an n x n //
+// upper triangular matrix. (The subdiagonal part of the matrix is //
+// not addressed.) //
+// The algorithm follows: //
+// x[n-1] = B[n-1]/U[n-1][n-1], and //
+// x[i] = [B[i] - (U[i][i+1] * x[i+1] + ... + U[i][n-1] * x[n-1])] //
+// / U[i][i], //
+// for i = n-2, ..., 0. //
+// //
+// Arguments: //
+// float *U Pointer to the first element of the upper triangular //
+// matrix. //
+// float *B Pointer to the column vector, (n x 1) matrix, B. //
+// float *x Pointer to the column vector, (n x 1) matrix, x. //
+// int n The number of rows or columns of the matrix U. //
+// //
+// Return Values: //
+// 0 Success //
+// -1 Failure - The matrix U is singular. //
+// //
+// Example: //
+// #define N //
+// float A[N][N], B[N], x[N]; //
+// //
+// (your code to create matrix A and column vector B) //
+// err = Upper_Triangular_Solve(&A[0][0], B, x, n); //
+// if (err < 0) printf(" Matrix A is singular\n"); //
+// else printf(" The solution is \n"); //
+// ... //
+// //
+int Upper_Triangular_Solve(float *U, float B[], float x[], int n)
+{
+   int row, col;
+
+   // Back substitution: handle the rows of U from the last one up to the first.
+   for (row = n - 1; row >= 0; row--) {
+      float *u_row = U + row * n;
+
+      // A zero diagonal entry means U is singular.
+      if (u_row[row] == 0.0) return -1;
+
+      // x[row] is updated in place; Doolittle_LU_Solve passes the same
+      // array for B and x, so B[row] is read before anything is written.
+      x[row] = B[row];
+      for (col = row + 1; col < n; col++) {
+         x[row] -= x[col] * u_row[col];
+      }
+      x[row] /= u_row[row];
+   }
+   return 0;
+}
+// void Unit_Lower_Triangular_Solve(float *L, float *B, float x[], int n) //
+// //
+// Description: //
+// This routine solves the linear equation Lx = B, where L is an n x n //
+// unit lower triangular matrix. (Only the subdiagonal part of the matrix//
+// is addressed.) The diagonal is assumed to consist of 1's and is not //
+// addressed. //
+// The algorithm follows: //
+// x[0] = B[0], and //
+// x[i] = B[i] - (L[i][0] * x[0] + ... + L[i][i-1] * x[i-1]), //
+// for i = 1, ..., n-1. //
+// //
+// Arguments: //
+// float *L Pointer to the first element of the unit lower triangular //
+// matrix. //
+// float *B Pointer to the column vector, (n x 1) matrix, B. //
+// float *x Pointer to the column vector, (n x 1) matrix, x. //
+// int n The number of rows or columns of the matrix L. //
+// //
+// Return Values: //
+// void //
+// //
+// Example: //
+// #define N //
+// float A[N][N], B[N], x[N]; //
+// //
+// (your code to create matrix A and column vector B) //
+// Unit_Lower_Triangular_Solve(&A[0][0], B, x, n); //
+// printf(" The solution is \n"); //
+// ... //
+// //
+void Unit_Lower_Triangular_Solve(float *L, float B[], float x[], int n)
+{
+   int row, col;
+
+   // An empty system has nothing to solve; returning here also avoids
+   // reading B[0] and writing x[0] when the vectors have no elements.
+   if (n <= 0) return;
+
+   // Forward substitution for Lx = B. The diagonal of L is taken to be 1,
+   // so only the entries left of the diagonal are read and no division
+   // is required.
+   x[0] = B[0];
+   for (row = 1; row < n; row++) {
+      float *l_row = L + row * n;
+      x[row] = B[row];
+      for (col = 0; col < row; col++) x[row] -= x[col] * l_row[col];
+   }
+}
+// int Doolittle_LU_Decomposition(float *A, int n) //
+// //
+// Description: //
+// This routine uses Doolittle's method to decompose the n x n matrix A //
+// into a unit lower triangular matrix L and an upper triangular matrix U //
+// such that A = LU. //
+// The matrices L and U replace the matrix A so that the original matrix //
+// A is destroyed. //
+// Note! In Doolittle's method the diagonal elements of L are 1 and are //
+// not stored. //
+// Note! The determinant of A is the product of the diagonal elements //
+// of U. (det A = det L * det U = det U). //
+// This routine is suitable for those classes of matrices which when //
+// performing Gaussian elimination do not need to undergo partial //
+// pivoting, e.g. positive definite symmetric matrices, diagonally //
+// dominant band matrices, etc. //
+// For the more general case in which partial pivoting is needed use //
+// Doolittle_LU_Decomposition_with_Pivoting. //
+// The LU decomposition is convenient when one needs to solve the linear //
+// equation Ax = B for the vector x while the matrix A is fixed and the //
+// vector B is varied. The routine for solving the linear system Ax = B //
+// after performing the LU decomposition for A is Doolittle_LU_Solve //
+// (see below). //
+// //
+// The Doolittle method is given by evaluating, in order, the following //
+// pair of expressions for k = 0, ... , n-1: //
+// U[k][j] = A[k][j] - (L[k][0]*U[0][j] + ... + L[k][k-1]*U[k-1][j]) //
+// for j = k, k+1, ... , n-1 //
+// L[i][k] = (A[i][k] - (L[i][0]*U[0][k] + . + L[i][k-1]*U[k-1][k])) //
+// / U[k][k] //
+// for i = k+1, ... , n-1. //
+// The matrix U forms the upper triangular matrix, and the matrix L //
+// forms the lower triangular matrix. //
+// //
+// Arguments: //
+// float *A Pointer to the first element of the matrix A[n][n]. //
+// int n The number of rows or columns of the matrix A. //
+// //
+// Return Values: //
+// 0 Success //
+// -1 Failure - The matrix A is singular. //
+// //
+// Example: //
+// #define N //
+// float A[N][N]; //
+// //
+// (your code to initialize the matrix A) //
+// //
+// err = Doolittle_LU_Decomposition(&A[0][0], N); //
+// if (err < 0) printf(" Matrix A is singular\n"); //
+// else { printf(" The LU decomposition of A is \n"); //
+// ... //
+// //
+int Doolittle_LU_Decomposition(float *A, int n)
+{
+   int row, col, k, p;
+
+   // For each pivot index k = 0, ..., n-1: first complete row k of U,
+   // then, if the pivot is nonzero, column k of L. Both factors overwrite A.
+   for (k = 0; k < n; k++) {
+      float *pivot_row = A + k * n;
+
+      // Row k of U: remove the contributions of the rows computed so far.
+      for (col = k; col < n; col++) {
+         for (p = 0; p < k; p++)
+            pivot_row[col] -= pivot_row[p] * A[p * n + col];
+      }
+
+      // A zero pivot is reported as failure.
+      if (pivot_row[k] == 0.0) return -1;
+
+      // Column k of L: eliminate, then divide by the pivot.
+      for (row = k + 1; row < n; row++) {
+         float *cur_row = A + row * n;
+         for (p = 0; p < k; p++)
+            cur_row[k] -= cur_row[p] * A[p * n + k];
+         cur_row[k] /= pivot_row[k];
+      }
+   }
+   return 0;
+}
+// int Doolittle_LU_Solve(float *LU, float *B, float *x, int n) //
+// //
+// Description: //
+// This routine uses Doolittle's method to solve the linear equation //
+// Ax = B. This routine is called after the matrix A has been decomposed //
+// into a product of a unit lower triangular matrix L and an upper //
+// triangular matrix U without pivoting. The argument LU is a pointer to //
+// the matrix the subdiagonal part of which is L and the superdiagonal //
+// together with the diagonal part is U. (The diagonal part of L is 1 and //
+// is not stored.) The matrix A = LU. //
+// The solution proceeds by solving the linear equation Ly = B for y and //
+// subsequently solving the linear equation Ux = y for x. //
+// //
+// Arguments: //
+// float *LU Pointer to the first element of the matrix whose elements //
+// form the lower and upper triangular matrix factors of A. //
+// float *B Pointer to the column vector, (n x 1) matrix, B //
+// float *x Solution to the equation Ax = B. //
+// int n The number of rows or columns of the matrix LU. //
+// //
+// Return Values: //
+// 0 Success //
+// -1 Failure - The matrix A is singular. //
+// //
+// Example: //
+// #define N //
+// float A[N][N], B[N], x[N]; //
+// //
+// (your code to create matrix A and column vector B) //
+// err = Doolittle_LU_Decomposition(&A[0][0], N); //
+// if (err < 0) printf(" Matrix A is singular\n"); //
+// else { //
+// err = Doolittle_LU_Solve(&A[0][0], B, x, n); //
+// if (err < 0) printf(" Matrix A is singular\n"); //
+// else printf(" The solution is \n"); //
+// ... //
+// } //
+// //
+int Doolittle_LU_Solve(float *LU, float B[], float x[], int n)
+{
+   int status;
+
+   // Forward pass: solve Ly = B with the unit lower triangular factor;
+   // the intermediate vector y is left in x.
+   Unit_Lower_Triangular_Solve(LU, B, x, n);
+
+   // Backward pass: solve Ux = y in place (x is both input and output)
+   // and hand its status back to the caller.
+   status = Upper_Triangular_Solve(LU, x, x, n);
+   return status;
+}
+// Solves four 4x4 systems against the factored matrix LU, one per group of
+// four consecutive values of I, and stores solution number `col` in column
+// `col` of the row-major matrix invA. Returns 0 on success or the first
+// non-zero status reported by Doolittle_LU_Solve.
+// NOTE(review): I presumably holds the 4x4 identity matrix - confirm against the caller.
+int invertMatrix(float *LU, float *invA, float *I)
+{
+   float column[4];
+   int col, row;
+
+   for (col = 0; col < 4; ++col) {
+      const int status = Doolittle_LU_Solve(LU, I + col * 4, column, 4);
+      if (status != 0) return status;
+
+      for (row = 0; row < 4; ++row)
+         invA[row * 4 + col] = column[row];
+   }
+   return 0;
+}
+//float A[4][4] = {{1, 1, 1, 1}, {1, 4, 2, 3}, {1, 2, 1, 2}, {1, 1, 1, 0}};
+//float invA[4][4]= {{0, 0, 0, 0}, {0, 0, 0, 0}, {0, 0, 0, 0}, {0, 0, 0, 0}};
+// Top function for synthesis: factors the 4x4 matrix A in place, solves
+// A x = b, then fills invA via invertMatrix. Returns 0 on success or the
+// status of the first step that fails.
+int fun(float *A, float *invA, float *b, float *x, float *I)
+{
+   int res = Doolittle_LU_Decomposition(A, 4);
+   if (res == 0) {
+      // A now holds the packed LU factors; reuse them for the solve.
+      res = Doolittle_LU_Solve(A, b, x, 4);
+   }
+   if (res == 0) {
+      res = invertMatrix(A, invA, I);
+   }
+   return res;
+}
diff --git a/documentation/tutorial_fpl_2022/01-introduction/Exercise5/bambu.sh b/documentation/tutorial_fpl_2022/01-introduction/Exercise5/bambu.sh
new file mode 100755
index 000000000..f37d97ff8
--- /dev/null
+++ b/documentation/tutorial_fpl_2022/01-introduction/Exercise5/bambu.sh
@@ -0,0 +1,12 @@
+# Resolve the directory that holds this script so the inputs are found
+# regardless of the caller's working directory (quoted: paths may contain spaces).
+script=$(readlink -e "$0")
+root_dir=$(dirname "$script")
+# Start from a clean output directory and run the whole flow inside it.
+rm -rf ludecomp
+mkdir -p ludecomp
+cd ludecomp || exit 1
+echo "#synthesis of fun"
+bambu "$root_dir/LUdecomposition.c" --top-fname=fun \
+ -O1 \
+ --generate-tb="$root_dir/test.xml" --simulate --simulator=VERILATOR \
+ -v2 --print-dot "$@" |& tee log.txt
diff --git a/documentation/tutorial_fpl_2022/01-introduction/Exercise5/test.xml b/documentation/tutorial_fpl_2022/01-introduction/Exercise5/test.xml
new file mode 100644
index 000000000..500454cc7
--- /dev/null
+++ b/documentation/tutorial_fpl_2022/01-introduction/Exercise5/test.xml
@@ -0,0 +1,4 @@
diff --git a/documentation/tutorial_fpl_2022/01-introduction/Exercise5a/bambu.sh b/documentation/tutorial_fpl_2022/01-introduction/Exercise5a/bambu.sh
new file mode 100644
index 000000000..11ecb2ac9
--- /dev/null
+++ b/documentation/tutorial_fpl_2022/01-introduction/Exercise5a/bambu.sh
@@ -0,0 +1,3 @@
+# Synthesize and simulate helm_naive from helm.c. The command must start with
+# the plain word "bambu": a leading "!" glued to it is not a valid command name.
+bambu helm.c --top-fname=helm_naive -Icommon.h --simulate --simulator=VERILATOR --generate-tb=test.xml --compiler=I386_CLANG6
\ No newline at end of file
diff --git a/documentation/tutorial_fpl_2022/01-introduction/Exercise5a/common.h b/documentation/tutorial_fpl_2022/01-introduction/Exercise5a/common.h
new file mode 100644
index 000000000..1381fda68
--- /dev/null
+++ b/documentation/tutorial_fpl_2022/01-introduction/Exercise5a/common.h
@@ -0,0 +1,46 @@
+#pragma once
+typedef float real_t;
+/* Returns a calloc'ed, zero-filled array of `size` real_t values, or NULL
+   when the allocation fails. The buffer is not freed here. */
+real_t* make_empty(size_t size)
+{
+    real_t* zeros = calloc(size, sizeof *zeros);
+    return zeros;
+}
+/* Returns a new array of `size` values, each computed as
+   2 * (random() / RAND_MAX) - 1, or NULL if the allocation fails. */
+real_t* make_random(size_t size)
+{
+    real_t* values = make_empty(size);
+    if (values == NULL) return NULL;
+
+    for (size_t idx = 0; idx < size; ++idx) {
+        values[idx] = ((real_t)random() / RAND_MAX) * (real_t)(2) - (real_t)(1);
+    }
+    return values;
+}
+/* Returns a newly allocated copy of the first `size` elements of data,
+   or NULL if the allocation fails. */
+real_t* make_copy(const real_t* data, size_t size)
+{
+    real_t* duplicate = make_empty(size);
+    if (duplicate != NULL) {
+        memcpy(duplicate, data, size * sizeof *duplicate);
+    }
+    return duplicate;
+}
+/* Mean squared error between the first `size` elements of a and b:
+   the sum of squared differences divided by size. */
+real_t mse(const real_t* a, const real_t* b, size_t size)
+{
+    real_t sum_sq = 0;
+    for (size_t idx = 0; idx < size; ++idx) {
+        real_t diff = (a[idx] - b[idx]);
+        sum_sq += diff * diff;
+    }
+    return sum_sq / (real_t)(size);
+}
diff --git a/documentation/tutorial_fpl_2022/01-introduction/Exercise5a/helm.c b/documentation/tutorial_fpl_2022/01-introduction/Exercise5a/helm.c
new file mode 100644
index 000000000..4eb45d408
--- /dev/null
+++ b/documentation/tutorial_fpl_2022/01-introduction/Exercise5a/helm.c
@@ -0,0 +1,146 @@
+#include "common.h"
+#pragma GCC diagnostic ignored "-Wincompatible-pointer-types"
+const size_t P = 11;
+// Reference implementation: builds r from w, L, d and u as the sum of four
+// terms, each computed in its own full sweep over all (x, y, z) in [0, P)^3.
+void helm_naive(
+ real_t w[P],
+ real_t L[P][P],
+ real_t d[4],
+ real_t u[P][P][P],
+ real_t r[P][P][P]
+ // Term 0 initialises r: d[0] * w[x] * w[y] * w[z] * u[x][y][z].
+ for (size_t x = 0; x < P; ++x)
+ for (size_t y = 0; y < P; ++y)
+ for (size_t z = 0; z < P; ++z) {
+ r[x][y][z] = d[0] * w[x] * w[y] * w[z] * u[x][y][z];
+ }
+ // Term 1: contraction over the first index of u with row x of L, scaled by d[1].
+ for (size_t x = 0; x < P; ++x)
+ for (size_t y = 0; y < P; ++y)
+ for (size_t z = 0; z < P; ++z) {
+ real_t accu = 0;
+ for (size_t k = 0; k < P; ++k) {
+ accu += L[x][k] * w[y] * w[z] * u[k][y][z];
+ }
+ r[x][y][z] += d[1] * accu;
+ }
+ // Term 2: contraction over the second index of u with row y of L, scaled by d[2].
+ for (size_t x = 0; x < P; ++x)
+ for (size_t y = 0; y < P; ++y)
+ for (size_t z = 0; z < P; ++z) {
+ real_t accu = 0;
+ for (size_t k = 0; k < P; ++k) {
+ accu += w[x] * L[y][k] * w[z] * u[x][k][z];
+ }
+ r[x][y][z] += d[2] * accu;
+ }
+ // Term 3: contraction over the third index of u with row z of L, scaled by d[3].
+ for (size_t x = 0; x < P; ++x)
+ for (size_t y = 0; y < P; ++y)
+ for (size_t z = 0; z < P; ++z) {
+ real_t accu = 0;
+ for (size_t k = 0; k < P; ++k) {
+ accu += w[x] * w[y] * L[z][k] * u[x][y][k];
+ }
+ r[x][y][z] += d[3] * accu;
+ }
+// Factored variant of the same computation: the weighted input is cached in
+// M_u and the weights are folded into L_hat, so the three contraction sweeps
+// only multiply L_hat by M_u. L_hat and M_u are scratch outputs written here.
+void helm_factor_impl(
+ real_t w[P],
+ real_t L[P][P],
+ real_t d[4],
+ real_t u[P][P][P],
+ real_t L_hat[P][P],
+ real_t M_u[P][P][P],
+ real_t r[P][P][P]
+ // M_u[x][y][z] = w[x] * w[y] * w[z] * u[x][y][z]; r starts as d[0] * M_u.
+ for (size_t x = 0; x < P; ++x)
+ for (size_t y = 0; y < P; ++y)
+ for (size_t z = 0; z < P; ++z) {
+ real_t M_u_xyz = w[x] * w[y] * w[z] * u[x][y][z];
+ M_u[x][y][z] = M_u_xyz;
+ r[x][y][z] = M_u_xyz * d[0];
+ }
+ // L_hat[i][j] = L[i][j] / w[j]. NOTE(review): w[j] is not checked for zero here.
+ for (size_t i = 0; i < P; ++i)
+ for (size_t j = 0; j < P; ++j) {
+ L_hat[i][j] = L[i][j] / w[j];
+ }
+ // Contraction over the first index of M_u, scaled by d[1].
+ for (size_t x = 0; x < P; ++x)
+ for (size_t y = 0; y < P; ++y)
+ for (size_t z = 0; z < P; ++z) {
+ real_t accu = 0;
+ for (size_t k = 0; k < P; ++k) {
+ accu += L_hat[x][k] * M_u[k][y][z];
+ }
+ r[x][y][z] += d[1] * accu;
+ }
+ // Contraction over the second index of M_u, scaled by d[2].
+ for (size_t x = 0; x < P; ++x)
+ for (size_t y = 0; y < P; ++y)
+ for (size_t z = 0; z < P; ++z) {
+ real_t accu = 0;
+ for (size_t k = 0; k < P; ++k) {
+ accu += L_hat[y][k] * M_u[x][k][z];
+ }
+ r[x][y][z] += d[2] * accu;
+ }
+ // Contraction over the third index of M_u, scaled by d[3].
+ for (size_t x = 0; x < P; ++x)
+ for (size_t y = 0; y < P; ++y)
+ for (size_t z = 0; z < P; ++z) {
+ real_t accu = 0;
+ for (size_t k = 0; k < P; ++k) {
+ accu += L_hat[z][k] * M_u[x][y][k];
+ }
+ r[x][y][z] += d[3] * accu;
+ }
+// Same interface as helm_naive: allocates the scratch buffers L_hat (P*P
+// values) and M_u (P*P*P values), runs helm_factor_impl and releases the
+// buffers again. If either allocation fails, r is left untouched.
+void helm_factor(
+    real_t w[P],
+    real_t L[P][P],
+    real_t d[4],
+    real_t u[P][P][P],
+    real_t r[P][P][P]
+)
+{
+    real_t* L_hat = make_empty(P*P);
+    real_t* M_u = make_empty(P*P*P);
+
+    // Run the kernel only when both scratch buffers exist; it writes
+    // through both pointers unconditionally.
+    if (L_hat != NULL && M_u != NULL) {
+        helm_factor_impl(w, L, d, u, L_hat, M_u, r);
+    }
+
+    // free(NULL) is a no-op, so this also covers a partial allocation failure.
+    free(M_u);
+    free(L_hat);
+}
+// Self-check driver: fills w, L, d and u with seeded pseudo-random data, runs
+// helm_naive and helm_factor on the same inputs and prints the mean squared
+// error between their results. Returns EXIT_FAILURE if an allocation fails.
+int main(int argc, const char* argv[])
+{
+    (void)argc;
+    (void)argv;
+
+    // Fixed seed; the make_random calls stay in this order so the generated
+    // inputs are the same on every run.
+    srandom(0xDEADBEEF);
+    real_t* w = make_random(P);
+    real_t* L = make_random(P*P);
+    real_t* d = make_random(4);
+    real_t* u = make_random(P*P*P);
+    real_t* r1 = make_empty(P*P*P);
+    real_t* r2 = make_empty(P*P*P);
+
+    int status = EXIT_FAILURE;
+    if (w && L && d && u && r1 && r2) {
+        helm_naive(w, L, d, u, r1);
+        helm_factor(w, L, d, u, r2);
+        real_t mse2 = mse(r1, r2, P*P*P);
+        printf("mse2 = %G\n", mse2);
+        status = EXIT_SUCCESS;
+    }
+
+    free(r2);
+    free(r1);
+    free(u);
+    free(d);
+    free(L);
+    free(w);
+    return status;
+}
diff --git a/documentation/tutorial_fpl_2022/01-introduction/Exercise5a/test.xml b/documentation/tutorial_fpl_2022/01-introduction/Exercise5a/test.xml
new file mode 100644
index 000000000..88c970fa1
--- /dev/null
+++ b/documentation/tutorial_fpl_2022/01-introduction/Exercise5a/test.xml
@@ -0,0 +1,10 @@
diff --git a/documentation/tutorial_fpl_2022/01-introduction/Exercise6/README.txt b/documentation/tutorial_fpl_2022/01-introduction/Exercise6/README.txt
new file mode 100644
index 000000000..bdaccd406
--- /dev/null
+++ b/documentation/tutorial_fpl_2022/01-introduction/Exercise6/README.txt
@@ -0,0 +1,24 @@
+Simple example showing how to integrate and verify existing IPs together with functions written in C that receive structs passed by pointer.
+Below is a short description of the files:
+top.c: file to be compiled/synthesized by bambu.
+module_lib.h: header that declares the interfaces to existing Verilog IPs.
+module_lib.xml: XML file that describes interfaces of existing Verilog IPs.
+module1.v: verilog of an existing synthesizable IP.
+module1.c: C stub used to emulate the module1 IP in C.
+module2.v: verilog of an existing synthesizable IP.
+module2.c: C stub used to emulate the module2 IP in C.
+printer1.v: verilog of an existing non-synthesizable IP.
+printer1.c: C stub used to emulate the printer1 IP in C.
+printer2.v: verilog of an existing non-synthesizable IP.
+printer2.c: C stub used to emulate the printer2 IP in C.
+main_test.c: C testbench
+constraints_STD.xml: resource constraint file passed to bambu to generate a Verilog design with just 1 my_ip module.
+test.xml: XML file describing the testbench inputs. It is empty since we use the main_test.c as testbench generator.
+bambu.sh: synthesis and simulation script. It requires Vivado RTL and Verilator to work properly.
+All C/H files were validated using the "gcc -c" command.
+A C executable can be created with this command: "gcc -o ip_test main_test.c top.c module1.c module2.c printer1.c printer2.c"
diff --git a/documentation/tutorial_fpl_2022/01-introduction/Exercise6/bambu.sh b/documentation/tutorial_fpl_2022/01-introduction/Exercise6/bambu.sh
new file mode 100755
index 000000000..2365638eb
--- /dev/null
+++ b/documentation/tutorial_fpl_2022/01-introduction/Exercise6/bambu.sh
@@ -0,0 +1,16 @@
+# Resolve the directory that holds this script so the inputs are found
+# regardless of the caller's working directory (quoted: paths may contain spaces).
+script=$(readlink -e "$0")
+root_dir=$(dirname "$script")
+# Start from a clean output directory and run the whole flow inside it.
+rm -rf hls
+mkdir -p hls
+cd hls || exit 1
+echo "# integrating IP simulation"
+bambu "$root_dir/main_test.c" "$root_dir/top.c" --top-fname=main --top-rtldesign-name=my_ip \
+ --C-no-parse="$root_dir/module1.c,$root_dir/module2.c,$root_dir/printer1.c,$root_dir/printer2.c" \
+ --file-input-data="$root_dir/module1.v,$root_dir/module2.v,$root_dir/printer1.v,$root_dir/printer2.v" \
+ "$root_dir/module_lib.xml" "$root_dir/constraints_STD.xml" \
+ --experimental-setup=BAMBU -O3 \
+ --no-iob --memory-allocation-policy=ALL_BRAM \
+ --generate-tb="$root_dir/test.xml" --simulate --simulator=VERILATOR \
+ --print-dot -v4 "$@" |& tee log.txt
diff --git a/documentation/tutorial_fpl_2022/01-introduction/Exercise6/constraints_STD.xml b/documentation/tutorial_fpl_2022/01-introduction/Exercise6/constraints_STD.xml
new file mode 100644
index 000000000..bd4f51938
--- /dev/null
+++ b/documentation/tutorial_fpl_2022/01-introduction/Exercise6/constraints_STD.xml
@@ -0,0 +1,6 @@
diff --git a/documentation/tutorial_fpl_2022/01-introduction/Exercise6/main_test.c b/documentation/tutorial_fpl_2022/01-introduction/Exercise6/main_test.c
new file mode 100644
index 000000000..fdc441e97
--- /dev/null
+++ b/documentation/tutorial_fpl_2022/01-introduction/Exercise6/main_test.c
@@ -0,0 +1,22 @@
+#include "module_lib.h"
+extern void __builtin_bambu_time_start();
+extern void __builtin_bambu_time_stop();
+/* Testbench entry point: calls my_ip with first argument 0, 1, 2 and 3,
+   always with the same two parameters, between the bambu timing markers. */
+int main()
+{
+   uint32_t param1 = 10;
+   uint32_t param2 = 10 << 16;
+   int selector;
+
+   __builtin_bambu_time_start();
+   for (selector = 0; selector < 4; ++selector)
+      my_ip(selector, param1, param2);
+   __builtin_bambu_time_stop();
+   return 0;
+}
diff --git a/documentation/tutorial_fpl_2022/01-introduction/Exercise6/module1.c b/documentation/tutorial_fpl_2022/01-introduction/Exercise6/module1.c
new file mode 100644
index 000000000..21ba71f5f
--- /dev/null
+++ b/documentation/tutorial_fpl_2022/01-introduction/Exercise6/module1.c
@@ -0,0 +1,8 @@
+#include "module_lib.h"
+/* C stub emulating the module1 IP: fills *outputs from the two inputs. */
+void module1(uint32_t input1, uint16_t input2, module1_output_t *outputs)
+{
+   /* Widen to 64 bits before multiplying/adding: without the cast both
+      operations are carried out on 32 bits and can wrap, whereas module1.v
+      produces 64-bit values for output1 and output2. */
+   outputs->output1 = (uint64_t)input1 * input2;
+   outputs->output2 = (uint64_t)input1 + input2;
+   /* Bitwise complement plus one, i.e. the negation of input2 truncated to
+      the width of output3. */
+   outputs->output3 = (~input2) + 1;
+   /* input2 placed in both the low and the high 16 bits. */
+   outputs->output4 = input2 | (((uint32_t)input2)<<16);
+}
diff --git a/documentation/tutorial_fpl_2022/01-introduction/Exercise6/module1.v b/documentation/tutorial_fpl_2022/01-introduction/Exercise6/module1.v
new file mode 100644
index 000000000..32ffa7810
--- /dev/null
+++ b/documentation/tutorial_fpl_2022/01-introduction/Exercise6/module1.v
@@ -0,0 +1,226 @@
+// Datapath of the module1 IP: registers the results computed from input1 and
+// input2 in a first stage and copies them to the output ports in a second
+// stage. done_port follows start_port through the same two stages.
+// Reset is synchronous and active low (checked as !reset on the clock edge).
+module module1_IP
+ (input wire clock,
+ input wire reset,
+ input wire start_port,
+ output reg done_port,
+ input wire [31:0] input1,
+ input wire [15:0] input2,
+ output reg [63:0] output1,
+ output reg [63:0] output2,
+ output reg [15:0] output3,
+ output reg [31:0] output4);
+ // First-stage registers.
+ reg done_port_reg;
+ reg [63:0] output1_reg;
+ reg [63:0] output2_reg;
+ reg [15:0] output3_reg;
+ reg [31:0] output4_reg;
+ //----------------------------------------------------------------
+ // Simulate processing on input
+ //----------------------------------------------------------------
+ always @(posedge clock) begin
+ if (!reset) begin
+ done_port_reg <= 0;
+ output1_reg <= 0;
+ output2_reg <= 0;
+ output3_reg <= 0;
+ output4_reg <= 0;
+ end
+ else begin
+ done_port_reg <= start_port;
+ // Product and zero-extended sum of the inputs, held in 64-bit registers.
+ output1_reg <= input1 * input2;
+ output2_reg <= {32'd0, input1} + {48'd0, input2};
+ // Bitwise complement of input2 plus one, kept on 16 bits.
+ output3_reg <= (~input2) + 1;
+ // input2 repeated in the upper and lower 16 bits.
+ output4_reg <= {input2, input2};
+ end
+ end
+ //----------------------------------------------------------------
+ // Outputs, two cycle latency
+ //----------------------------------------------------------------
+ always @(posedge clock) begin
+ if (!reset) begin
+ done_port <= 0;
+ output1 <= 0;
+ output2 <= 0;
+ output3 <= 0;
+ output4 <= 0;
+ end
+ else begin
+ done_port <= done_port_reg;
+ output1 <= output1_reg;
+ output2 <= output2_reg;
+ output3 <= output3_reg;
+ output4 <= output4_reg;
+ end
+ end
+module module1 (clock, reset, start_port, input1, input2, outputs, done_port, Min_oe_ram, Mout_oe_ram, Min_we_ram, Mout_we_ram, Min_addr_ram, Mout_addr_ram, M_Rdata_ram, Min_Wdata_ram, Mout_Wdata_ram, Min_data_ram_size, Mout_data_ram_size, M_DataRdy);
+ parameter BITSIZE_outputs=1, BITSIZE_Min_addr_ram=1, BITSIZE_Mout_addr_ram=1, BITSIZE_M_Rdata_ram=8, BITSIZE_Min_Wdata_ram=8, BITSIZE_Mout_Wdata_ram=8, BITSIZE_Min_data_ram_size=1, BITSIZE_Mout_data_ram_size=1;
+ // IN
+ input clock;
+ input reset;
+ input start_port;
+ input [31:0] input1;
+ input [15:0] input2;
+ input [BITSIZE_outputs-1:0] outputs;
+ input Min_oe_ram;
+ input Min_we_ram;
+ input [BITSIZE_Min_addr_ram-1:0] Min_addr_ram;
+ input [BITSIZE_M_Rdata_ram-1:0] M_Rdata_ram;
+ input [BITSIZE_Min_Wdata_ram-1:0] Min_Wdata_ram;
+ input [BITSIZE_Min_data_ram_size-1:0] Min_data_ram_size;
+ input M_DataRdy;
+ // OUT
+ output done_port;
+ output Mout_oe_ram;
+ output Mout_we_ram;
+ output [BITSIZE_Mout_addr_ram-1:0] Mout_addr_ram;
+ output [BITSIZE_Mout_Wdata_ram-1:0] Mout_Wdata_ram;
+ output [BITSIZE_Mout_data_ram_size-1:0] Mout_data_ram_size;
+ wire [63:0] output1_int;
+ wire [63:0] output2_int;
+ wire [15:0] output3_int;
+ wire [31:0] output4_int;
+ reg [63:0] output1_reg;
+ reg [63:0] output2_reg;
+ reg [15:0] output3_reg;
+ reg [31:0] output4_reg;
+ reg done_port;
+ wire done_port_my_ip;
+ wire start_port_fsm;
+ reg start_port_memstore;
+ wire done_port_memstore;
+ reg [63:0] data_int;
+ reg [BITSIZE_outputs-1:0] addr_int;
+ reg [6:0] size_int;
+ reg Min_oe_ram_int;
+ reg Min_we_ram_int;
+ reg [BITSIZE_Min_addr_ram-1:0] Min_addr_ram_int;
+ reg [BITSIZE_Min_Wdata_ram-1:0] Min_Wdata_ram_int;
+ reg [BITSIZE_Min_data_ram_size-1:0] Min_data_ram_size_int;
+ parameter [2:0] S_0 = 3'd0,
+ S_1 = 3'd1,
+ S_2 = 3'd2,
+ S_3 = 3'd3,
+ S_4 = 3'd4;
+ reg [2:0] _present_state=S_0, _next_state;
+ module1_IP my_module1_IP (.done_port(done_port_my_ip), .clock(clock), .reset(reset), .start_port(start_port), .input1(input1), .input2(input2), .output1(output1_int), .output2(output2_int), .output3(output3_int), .output4(output4_int));
+ assign start_port_fsm = done_port_my_ip;
+ __builtin_memstore #(.BITSIZE_data(64), .BITSIZE_addr(BITSIZE_outputs), .BITSIZE_size(7), .BITSIZE_Min_addr_ram(BITSIZE_Min_addr_ram), .BITSIZE_Mout_addr_ram(BITSIZE_Mout_addr_ram), .BITSIZE_M_Rdata_ram(BITSIZE_M_Rdata_ram), .BITSIZE_Min_Wdata_ram(BITSIZE_Min_Wdata_ram), .BITSIZE_Mout_Wdata_ram(BITSIZE_Mout_Wdata_ram), .BITSIZE_Min_data_ram_size(BITSIZE_Min_data_ram_size), .BITSIZE_Mout_data_ram_size(BITSIZE_Mout_data_ram_size)) my__builtin_memstore (.clock(clock), .reset(reset), .start_port(start_port_memstore), .data(data_int), .addr(addr_int), .size(size_int), .done_port(done_port_memstore), .Min_oe_ram(Min_oe_ram_int), .Mout_oe_ram(Mout_oe_ram), .Min_we_ram(Min_we_ram_int), .Mout_we_ram(Mout_we_ram), .Min_addr_ram(Min_addr_ram_int), .Mout_addr_ram(Mout_addr_ram), .M_Rdata_ram(M_Rdata_ram), .Min_Wdata_ram(Min_Wdata_ram_int), .Mout_Wdata_ram(Mout_Wdata_ram), .Min_data_ram_size(Min_data_ram_size_int), .Mout_data_ram_size(Mout_data_ram_size), .M_DataRdy(M_DataRdy));
+ // FSM state register: returns to S_0 while reset is low, otherwise takes
+ // the next state on every rising clock edge.
+ always @(posedge clock or negedge reset)
+ if (!reset)
+ begin
+ _present_state <= S_0;
+ end
+ else
+ _present_state <= _next_state;
+ // Result capture: latches the four IP outputs in the cycle in which the IP
+ // raises done_port_my_ip, so they stay available while the FSM stores them.
+ always @(posedge clock or negedge reset)
+ if (!reset)
+ begin
+ output1_reg <= 0;
+ output2_reg <= 0;
+ output3_reg <= 0;
+ output4_reg <= 0;
+ end
+ else if(done_port_my_ip == 1'b1)
+ begin
+ output1_reg <= output1_int;
+ output2_reg <= output2_int;
+ output3_reg <= output3_int;
+ output4_reg <= output4_int;
+ end
+ // Combinational next-state and output logic. Once the IP reports done, the
+ // four captured results are handed to __builtin_memstore one after the
+ // other, at increasing offsets from the address given by `outputs`.
+ // done_port is raised when the last store completes.
+ always @(*)
+ begin
+   // Defaults: idle, no store request, incoming memory bus signals forwarded unchanged.
+   _next_state=S_0;
+   done_port=1'b0;
+   start_port_memstore=1'b0;
+   addr_int=0;
+   data_int=0;
+   size_int=0;
+   Min_oe_ram_int=Min_oe_ram;
+   Min_we_ram_int=Min_we_ram;
+   Min_data_ram_size_int=Min_data_ram_size;
+   Min_Wdata_ram_int=Min_Wdata_ram;
+   Min_addr_ram_int=Min_addr_ram;
+   case (_present_state)
+     // Wait until the IP has finished.
+     S_0 :
+       if(start_port_fsm != 1'b1)
+       begin
+         _next_state=S_0;
+       end
+       else
+       begin
+         _next_state=S_1;
+       end
+     // Store output1 (64 bits) at the base address.
+     S_1 :
+       begin
+         _next_state=S_1;
+         start_port_memstore=1'b1;
+         addr_int=outputs;
+         data_int=output1_reg;
+         size_int=64;
+         if(done_port_memstore)
+         begin
+           _next_state=S_2;
+         end
+       end
+     // Store output2 (64 bits) at offset 64/8.
+     S_2 :
+       begin
+         _next_state=S_2;
+         start_port_memstore=1'b1;
+         addr_int=outputs+64/8;
+         data_int=output2_reg;
+         size_int=64;
+         if(done_port_memstore)
+         begin
+           _next_state=S_3;
+         end
+       end
+     // Store output3 at offset (64+64)/8. output3_reg is 16 bits wide, so
+     // the store size must be 16; a size of 15 would drop its top bit.
+     S_3 :
+       begin
+         _next_state=S_3;
+         start_port_memstore=1'b1;
+         addr_int=outputs+(64+64)/8;
+         data_int=output3_reg;
+         size_int=16;
+         if(done_port_memstore)
+         begin
+           _next_state=S_4;
+         end
+       end
+     // Store output4 (32 bits) at offset (64+64+32)/8, then signal completion.
+     S_4 :
+       begin
+         _next_state=S_4;
+         start_port_memstore=1'b1;
+         addr_int=outputs+(64+64+32)/8;
+         data_int=output4_reg;
+         size_int=32;
+         if(done_port_memstore)
+         begin
+           _next_state=S_0;
+           done_port=1'b1;
+         end
+       end
+   endcase
+ end
diff --git a/documentation/tutorial_fpl_2022/01-introduction/Exercise6/module2.c b/documentation/tutorial_fpl_2022/01-introduction/Exercise6/module2.c
new file mode 100644
index 000000000..890e8444a
--- /dev/null
+++ b/documentation/tutorial_fpl_2022/01-introduction/Exercise6/module2.c
@@ -0,0 +1,7 @@
+#include "module_lib.h"
+/* Software model of the module2 IP: derives the three output fields from input1
+ and writes them through outputs. */
+void module2(uint32_t input1, module2_output_t *outputs)
+ /* Widen before multiplying so the full 64-bit product is kept; a plain
+ input1 * input1 is a 32-bit multiply and would drop the upper half, unlike
+ module2.v where the product is assigned to a 64-bit register. */
+ outputs->output1 = (uint64_t)input1 * input1;
+ /* input1 replicated in both 32-bit halves of the 64-bit word. */
+ outputs->output2 = input1 | (((uint64_t)input1)<<32);
+ /* Low 16 bits of input1. */
+ outputs->output3 = (uint16_t)input1;
diff --git a/documentation/tutorial_fpl_2022/01-introduction/Exercise6/module2.v b/documentation/tutorial_fpl_2022/01-introduction/Exercise6/module2.v
new file mode 100644
index 000000000..4d42e2395
--- /dev/null
+++ b/documentation/tutorial_fpl_2022/01-introduction/Exercise6/module2.v
@@ -0,0 +1,201 @@
+// Behavioural model of the module2 IP core.
+// Computes output1 = input1 * input1, output2 = {input1, input1} and
+// output3 = input1[15:0] through two register stages, so results and done_port
+// appear two clock edges after start_port. Reset is active low and sampled on
+// the rising clock edge.
+module module2_IP
+ (input wire clock,
+ input wire reset,
+ input wire start_port,
+ output reg done_port,
+ input wire [31:0] input1,
+ output reg [63:0] output1,
+ output reg [63:0] output2,
+ output reg [15:0] output3);
+ // First pipeline stage: delayed start flag and computed results.
+ reg done_port_reg;
+ reg [63:0] output1_reg;
+ reg [63:0] output2_reg;
+ reg [15:0] output3_reg;
+ //----------------------------------------------------------------
+ // Simulate processing on input
+ //----------------------------------------------------------------
+ always @(posedge clock) begin
+ if (!reset) begin
+ done_port_reg <= 0;
+ output1_reg <= 0;
+ output2_reg <= 0;
+ output3_reg <= 0;
+ end
+ else begin
+ // start_port is delayed by one cycle to become the stage-1 done flag.
+ done_port_reg <= start_port;
+ // The assignment target is 64 bits wide.
+ output1_reg <= input1 * input1;
+ output2_reg <= {input1, input1};
+ output3_reg <= input1[15:0];
+ end
+ end
+ //----------------------------------------------------------------
+ // Outputs, two cycle latency
+ //----------------------------------------------------------------
+ always @(posedge clock) begin
+ if (!reset) begin
+ done_port <= 0;
+ output1 <= 0;
+ output2 <= 0;
+ output3 <= 0;
+ end
+ else begin
+ // Second stage: forward the stage-1 registers to the module outputs.
+ done_port <= done_port_reg;
+ output1 <= output1_reg;
+ output2 <= output2_reg;
+ output3 <= output3_reg;
+ end
+ end
+module module2 (clock, reset, start_port, input1, outputs, done_port, Min_oe_ram, Mout_oe_ram, Min_we_ram, Mout_we_ram, Min_addr_ram, Mout_addr_ram, M_Rdata_ram, Min_Wdata_ram, Mout_Wdata_ram, Min_data_ram_size, Mout_data_ram_size, M_DataRdy);
+ parameter BITSIZE_outputs=1, BITSIZE_Min_addr_ram=1, BITSIZE_Mout_addr_ram=1, BITSIZE_M_Rdata_ram=8, BITSIZE_Min_Wdata_ram=8, BITSIZE_Mout_Wdata_ram=8, BITSIZE_Min_data_ram_size=1, BITSIZE_Mout_data_ram_size=1;
+ // IN
+ input clock;
+ input reset;
+ input start_port;
+ input [31:0] input1;
+ input [BITSIZE_outputs-1:0] outputs;
+ input Min_oe_ram;
+ input Min_we_ram;
+ input [BITSIZE_Min_addr_ram-1:0] Min_addr_ram;
+ input [BITSIZE_M_Rdata_ram-1:0] M_Rdata_ram;
+ input [BITSIZE_Min_Wdata_ram-1:0] Min_Wdata_ram;
+ input [BITSIZE_Min_data_ram_size-1:0] Min_data_ram_size;
+ input M_DataRdy;
+ // OUT
+ output done_port;
+ output Mout_oe_ram;
+ output Mout_we_ram;
+ output [BITSIZE_Mout_addr_ram-1:0] Mout_addr_ram;
+ output [BITSIZE_Mout_Wdata_ram-1:0] Mout_Wdata_ram;
+ output [BITSIZE_Mout_data_ram_size-1:0] Mout_data_ram_size;
+ wire [63:0] output1_int;
+ wire [63:0] output2_int;
+ wire [15:0] output3_int;
+ reg [63:0] output1_reg;
+ reg [63:0] output2_reg;
+ reg [15:0] output3_reg;
+ reg done_port;
+ wire done_port_my_ip;
+ wire start_port_fsm;
+ reg start_port_memstore;
+ wire done_port_memstore;
+ reg [63:0] data_int;
+ reg [BITSIZE_outputs-1:0] addr_int;
+ reg [6:0] size_int;
+ reg Min_oe_ram_int;
+ reg Min_we_ram_int;
+ reg [BITSIZE_Min_addr_ram-1:0] Min_addr_ram_int;
+ reg [BITSIZE_Min_Wdata_ram-1:0] Min_Wdata_ram_int;
+ reg [BITSIZE_Min_data_ram_size-1:0] Min_data_ram_size_int;
+ parameter [1:0] S_0 = 2'd0,
+ S_1 = 2'd1,
+ S_2 = 2'd2,
+ S_3 = 2'd3;
+ reg [1:0] _present_state=S_0, _next_state;
+ module2_IP my_module2_IP (.done_port(done_port_my_ip), .clock(clock), .reset(reset), .start_port(start_port), .input1(input1), .output1(output1_int), .output2(output2_int), .output3(output3_int));
+ assign start_port_fsm = done_port_my_ip;
+ __builtin_memstore #(.BITSIZE_data(64), .BITSIZE_addr(BITSIZE_outputs), .BITSIZE_size(7), .BITSIZE_Min_addr_ram(BITSIZE_Min_addr_ram), .BITSIZE_Mout_addr_ram(BITSIZE_Mout_addr_ram), .BITSIZE_M_Rdata_ram(BITSIZE_M_Rdata_ram), .BITSIZE_Min_Wdata_ram(BITSIZE_Min_Wdata_ram), .BITSIZE_Mout_Wdata_ram(BITSIZE_Mout_Wdata_ram), .BITSIZE_Min_data_ram_size(BITSIZE_Min_data_ram_size), .BITSIZE_Mout_data_ram_size(BITSIZE_Mout_data_ram_size)) my__builtin_memstore (.clock(clock), .reset(reset), .start_port(start_port_memstore), .data(data_int), .addr(addr_int), .size(size_int), .done_port(done_port_memstore), .Min_oe_ram(Min_oe_ram_int), .Mout_oe_ram(Mout_oe_ram), .Min_we_ram(Min_we_ram_int), .Mout_we_ram(Mout_we_ram), .Min_addr_ram(Min_addr_ram_int), .Mout_addr_ram(Mout_addr_ram), .M_Rdata_ram(M_Rdata_ram), .Min_Wdata_ram(Min_Wdata_ram_int), .Mout_Wdata_ram(Mout_Wdata_ram), .Min_data_ram_size(Min_data_ram_size_int), .Mout_data_ram_size(Mout_data_ram_size), .M_DataRdy(M_DataRdy));
+ always @(posedge clock or negedge reset)
+ if (!reset)
+ begin
+ _present_state <= S_0;
+ end
+ else
+ _present_state <= _next_state;
+ always @(posedge clock or negedge reset)
+ if (!reset)
+ begin
+ output1_reg <= 0;
+ output2_reg <= 0;
+ output3_reg <= 0;
+ end
+ else if(done_port_my_ip == 1'b1)
+ begin
+ output1_reg <= output1_int;
+ output2_reg <= output2_int;
+ output3_reg <= output3_int;
+ end
+ always @(*)
+ begin
+ _next_state=S_0;
+ done_port=1'b0;
+ start_port_memstore=1'b0;
+ addr_int=0;
+ data_int=0;
+ size_int=0;
+ Min_oe_ram_int=Min_oe_ram;
+ Min_we_ram_int=Min_we_ram;
+ Min_data_ram_size_int=Min_data_ram_size;
+ Min_Wdata_ram_int=Min_Wdata_ram;
+ Min_addr_ram_int=Min_addr_ram;
+ case (_present_state)
+ S_0 :
+ if(start_port_fsm != 1'b1)
+ begin
+ _next_state=S_0;
+ end
+ else
+ begin
+ _next_state=S_1;
+ end
+ S_1 :
+ begin
+ _next_state=S_1;
+ start_port_memstore=1'b1;
+ addr_int=outputs;
+ data_int=output1_reg;
+ size_int=64;
+ if(done_port_memstore)
+ begin
+ _next_state=S_2;
+ end
+ end
+ S_2 :
+ begin
+ _next_state=S_2;
+ start_port_memstore=1'b1;
+ addr_int=outputs+64/8;
+ data_int=output2_reg;
+ size_int=64;
+ if(done_port_memstore)
+ begin
+ _next_state=S_3;
+ end
+ end
+ S_3 :
+ begin
+ _next_state=S_3;
+ start_port_memstore=1'b1;
+ addr_int=outputs+(64+64)/8;
+ size_int=16;
+ data_int=output3_reg;
+ if(done_port_memstore)
+ begin
+ _next_state=S_0;
+ done_port=1'b1;
+ end
+ end
+ endcase
+ end
diff --git a/documentation/tutorial_fpl_2022/01-introduction/Exercise6/module_lib.h b/documentation/tutorial_fpl_2022/01-introduction/Exercise6/module_lib.h
new file mode 100644
index 000000000..6bff2ed40
--- /dev/null
+++ b/documentation/tutorial_fpl_2022/01-introduction/Exercise6/module_lib.h
@@ -0,0 +1,32 @@
+#ifndef MODULE_LIB_H
+#define MODULE_LIB_H
+typedef struct {
+ uint64_t output1;
+ uint64_t output2;
+ uint16_t output3;
+ uint32_t output4;
+} module1_output_t;
+extern void module1(uint32_t input1, uint16_t input2, module1_output_t *outputs);
+typedef struct {
+ uint64_t output1;
+ uint64_t output2;
+ uint16_t output3;
+} module2_output_t;
+extern void module2(uint32_t input1, module2_output_t *outputs);
+extern void printer1(uint64_t value1, uint64_t value2, uint16_t value3, uint32_t value4);
+extern void printer2(uint64_t value1, uint64_t value2, uint16_t value3);
+extern void my_ip(uint8_t command, uint32_t param1, uint32_t param2);
diff --git a/documentation/tutorial_fpl_2022/01-introduction/Exercise6/module_lib.xml b/documentation/tutorial_fpl_2022/01-introduction/Exercise6/module_lib.xml
new file mode 100644
index 000000000..edd567d75
--- /dev/null
+++ b/documentation/tutorial_fpl_2022/01-introduction/Exercise6/module_lib.xml
@@ -0,0 +1,262 @@
+ module_lib
+ module1
+ Module 1 IP
+ foo
+ foo
+ foo
+ |
+ module2
+ Module 2 IP
+ foo
+ foo
+ foo
+ |
+ printer2
+ Printer 2 IP
+ foo
+ foo
+ foo
+ |
+ printer1
+ Printer 1 IP
+ foo
+ foo
+ foo
+ |
diff --git a/documentation/tutorial_fpl_2022/01-introduction/Exercise6/printer1.c b/documentation/tutorial_fpl_2022/01-introduction/Exercise6/printer1.c
new file mode 100644
index 000000000..49fd269f7
--- /dev/null
+++ b/documentation/tutorial_fpl_2022/01-introduction/Exercise6/printer1.c
@@ -0,0 +1,6 @@
+#include "module_lib.h"
+/* Software model of the printer1 IP: prints the four values in hexadecimal on
+ one line prefixed with "printer1: ". */
+void printer1(uint64_t value1, uint64_t value2, uint16_t value3, uint32_t value4)
+ /* Cast each argument to the exact type its conversion specifier expects
+ (%llx -> unsigned long long, %x -> unsigned int); uint64_t is not
+ unsigned long long on every target. */
+ printf("printer1: %llx %llx %x %x\n", (unsigned long long)value1, (unsigned long long)value2, (unsigned int)value3, (unsigned int)value4);
diff --git a/documentation/tutorial_fpl_2022/01-introduction/Exercise6/printer1.v b/documentation/tutorial_fpl_2022/01-introduction/Exercise6/printer1.v
new file mode 100644
index 000000000..b572577e6
--- /dev/null
+++ b/documentation/tutorial_fpl_2022/01-introduction/Exercise6/printer1.v
@@ -0,0 +1,45 @@
+// Simulation model of the printer1 IP.
+// start_port is delayed through two registers to produce done_port; on the clock
+// edge where the first-stage flag is high the four input values are printed with
+// $display. Reset is active low and sampled on the rising clock edge.
+module printer1
+ (input wire clock,
+ input wire reset,
+ input wire start_port,
+ output reg done_port,
+ input wire [63:0] value1,
+ input wire [63:0] value2,
+ input wire [15:0] value3,
+ input wire [31:0] value4);
+ // start_port delayed by one cycle.
+ reg done_port_reg;
+ //----------------------------------------------------------------
+ // Simulate processing on input
+ //----------------------------------------------------------------
+ always @(posedge clock) begin
+ if (!reset) begin
+ done_port_reg <= 0;
+ end
+ else begin
+ done_port_reg <= start_port;
+ end
+ end
+ //----------------------------------------------------------------
+ // Outputs, two cycle latency
+ //----------------------------------------------------------------
+ always @(posedge clock) begin
+ if (!reset) begin
+ done_port <= 0;
+ end
+ else begin
+ done_port <= done_port_reg;
+ // Print on the same edge that raises done_port; the values are sampled
+ // as they are at that edge.
+ if (done_port_reg) begin
+ $display("printer1: %h %h %h %h", value1, value2, value3, value4);
+ end
+ end
+ end
diff --git a/documentation/tutorial_fpl_2022/01-introduction/Exercise6/printer2.c b/documentation/tutorial_fpl_2022/01-introduction/Exercise6/printer2.c
new file mode 100644
index 000000000..a73c63f62
--- /dev/null
+++ b/documentation/tutorial_fpl_2022/01-introduction/Exercise6/printer2.c
@@ -0,0 +1,6 @@
+#include "module_lib.h"
+/* Software model of the printer2 IP: prints the three values in hexadecimal on
+ one line prefixed with "printer2: ". */
+void printer2(uint64_t value1, uint64_t value2, uint16_t value3)
+ /* Cast each argument to the exact type its conversion specifier expects
+ (%llx -> unsigned long long, %x -> unsigned int); uint64_t is not
+ unsigned long long on every target. */
+ printf("printer2: %llx %llx %x\n", (unsigned long long)value1, (unsigned long long)value2, (unsigned int)value3);
diff --git a/documentation/tutorial_fpl_2022/01-introduction/Exercise6/printer2.v b/documentation/tutorial_fpl_2022/01-introduction/Exercise6/printer2.v
new file mode 100644
index 000000000..15fc5bdd0
--- /dev/null
+++ b/documentation/tutorial_fpl_2022/01-introduction/Exercise6/printer2.v
@@ -0,0 +1,44 @@
+// Simulation model of the printer2 IP.
+// start_port is delayed through two registers to produce done_port; on the clock
+// edge where the first-stage flag is high the three input values are printed with
+// $display. Reset is active low and sampled on the rising clock edge.
+module printer2
+ (input wire clock,
+ input wire reset,
+ input wire start_port,
+ output reg done_port,
+ input wire [63:0] value1,
+ input wire [63:0] value2,
+ input wire [15:0] value3);
+ // start_port delayed by one cycle.
+ reg done_port_reg;
+ //----------------------------------------------------------------
+ // Simulate processing on input
+ //----------------------------------------------------------------
+ always @(posedge clock) begin
+ if (!reset) begin
+ done_port_reg <= 0;
+ end
+ else begin
+ done_port_reg <= start_port;
+ end
+ end
+ //----------------------------------------------------------------
+ // Outputs, two cycle latency
+ //----------------------------------------------------------------
+ always @(posedge clock) begin
+ if (!reset) begin
+ done_port <= 0;
+ end
+ else begin
+ done_port <= done_port_reg;
+ // Print on the same edge that raises done_port; the values are sampled
+ // as they are at that edge.
+ if (done_port_reg) begin
+ $display("printer2: %h %h %h", value1, value2, value3);
+ end
+ end
+ end
diff --git a/documentation/tutorial_fpl_2022/01-introduction/Exercise6/test.xml b/documentation/tutorial_fpl_2022/01-introduction/Exercise6/test.xml
new file mode 100644
index 000000000..6a8f8acff
--- /dev/null
+++ b/documentation/tutorial_fpl_2022/01-introduction/Exercise6/test.xml
@@ -0,0 +1,4 @@
diff --git a/documentation/tutorial_fpl_2022/01-introduction/Exercise6/top.c b/documentation/tutorial_fpl_2022/01-introduction/Exercise6/top.c
new file mode 100644
index 000000000..46fa1c11a
--- /dev/null
+++ b/documentation/tutorial_fpl_2022/01-introduction/Exercise6/top.c
@@ -0,0 +1,23 @@
+#include "module_lib.h"
+/* Top-level command dispatcher.
+ command 0: run module1 on param1 and the upper 16 bits of param2.
+ command 1: run module2 on param1.
+ command 2: print the stored module1 result with printer1.
+ command 3: print the stored module2 result with printer2.
+ Any other command does nothing. */
+void my_ip(uint8_t command, uint32_t param1, uint32_t param2) {
+ /* static: results written by commands 0/1 must survive until a later call
+ issues the matching print command (2/3). */
+ static module1_output_t module1_output;
+ static module2_output_t module2_output;
+ switch(command) {
+ case 0:
+ /* module1 takes a 16-bit second input; the high half of param2 is used. */
+ module1(param1, param2 >> 16, &module1_output);
+ break;
+ case 1:
+ module2(param1, &module2_output);
+ break;
+ case 2:
+ printer1(module1_output.output1, module1_output.output2, module1_output.output3, module1_output.output4);
+ break;
+ case 3:
+ printer2(module2_output.output1, module2_output.output2, module2_output.output3);
+ break;
+ default:
+ break;
+ }
diff --git a/documentation/tutorial_fpl_2022/01-introduction/Exercise7/aggregate.h b/documentation/tutorial_fpl_2022/01-introduction/Exercise7/aggregate.h
new file mode 100644
index 000000000..2d0e9085f
--- /dev/null
+++ b/documentation/tutorial_fpl_2022/01-introduction/Exercise7/aggregate.h
@@ -0,0 +1,16 @@
+#ifndef AGGREGATE_H
+#define AGGREGATE_H
+struct aggregate
+ float a0;
+ float a1;
+ float a2;
+ float a3;
+ float a4;
+ float a5;
+ float a6;
+ float a7;
+#endif /* AGGREGATE_H */
diff --git a/documentation/tutorial_fpl_2022/01-introduction/Exercise7/bambu.sh b/documentation/tutorial_fpl_2022/01-introduction/Exercise7/bambu.sh
new file mode 100755
index 000000000..7219597b6
--- /dev/null
+++ b/documentation/tutorial_fpl_2022/01-introduction/Exercise7/bambu.sh
@@ -0,0 +1,12 @@
+# Resolve the directory containing this script so the sources are found
+# regardless of the caller's working directory. Expansions are quoted so that
+# paths containing spaces survive word splitting.
+script=$(readlink -e "$0")
+root_dir=$(dirname "$script")
+# Start from a clean output directory; stop if it cannot be entered, otherwise
+# bambu would write its results into the caller's directory.
+rm -rf hls
+mkdir hls
+cd hls || exit 1
+echo "#simulation of qsort"
+# Synthesize and simulate the qsort example; extra arguments are passed on to bambu.
+bambu "$root_dir/test.c" "$root_dir/less.c" "$root_dir/qsort.c" --top-fname=test \
+ -Os --no-iob \
+ --generate-tb="$root_dir/test.xml" --simulate \
+ -v2 --print-dot --pretty-print=a.c "$@" |& tee log.txt
diff --git a/documentation/tutorial_fpl_2022/01-introduction/Exercise7/less.c b/documentation/tutorial_fpl_2022/01-introduction/Exercise7/less.c
new file mode 100644
index 000000000..6b5a20dfd
--- /dev/null
+++ b/documentation/tutorial_fpl_2022/01-introduction/Exercise7/less.c
@@ -0,0 +1,31 @@
+#include "aggregate.h"
+/* Three-way comparator for _quicksort. Orders two struct aggregate records by
+ the sum of their eight float fields: returns 0 when the difference of the
+ sums is exactly zero, -1 when a's sum is smaller, and 1 otherwise.
+ The third argument is not used. */
+int less (const void * a, const void * b, void * notUsed)
+ /* The records are only read, so keep the const qualifier of the arguments
+ instead of casting it away. */
+ const struct aggregate * aPtr = (const struct aggregate *)a;
+ const struct aggregate * bPtr = (const struct aggregate *)b;
+ float aSum = aPtr->a0 +
+ aPtr->a1 +
+ aPtr->a2 +
+ aPtr->a3 +
+ aPtr->a4 +
+ aPtr->a5 +
+ aPtr->a6 +
+ aPtr->a7;
+ float bSum = bPtr->a0 +
+ bPtr->a1 +
+ bPtr->a2 +
+ bPtr->a3 +
+ bPtr->a4 +
+ bPtr->a5 +
+ bPtr->a6 +
+ bPtr->a7;
+ /* Both tests go through a subtraction, exactly as before. */
+ int equal = (bSum - aSum) == 0;
+ if (equal) return 0;
+ int lt = (aSum - bSum) < 0;
+ return lt ? -1 : 1;
diff --git a/documentation/tutorial_fpl_2022/01-introduction/Exercise7/qsort.c b/documentation/tutorial_fpl_2022/01-introduction/Exercise7/qsort.c
new file mode 100644
index 000000000..64e5a1a81
--- /dev/null
+++ b/documentation/tutorial_fpl_2022/01-introduction/Exercise7/qsort.c
@@ -0,0 +1,247 @@
+/* Copyright (C) 1991-2014 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Written by Douglas C. Schmidt (schmidt@ics.uci.edu).
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ Lesser General Public License for more details.
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ . */
+/* If you consider tuning this algorithm, you should consult first:
+ Engineering a sort function; Jon Bentley and M. Douglas McIlroy;
+ Software - Practice and Experience; Vol. 23 (11), 1249-1265, 1993. */
+/* Byte-wise swap two items of size SIZE. */
+#define SWAP(a, b, size) \
+ do \
+ { \
+ size_t __size = (size); \
+ char *__a = (a), *__b = (b); \
+ do \
+ { \
+ char __tmp = *__a; \
+ *__a++ = *__b; \
+ *__b++ = __tmp; \
+ } while (--__size > 0); \
+ } while (0)
+/* Discontinue quicksort algorithm when partition gets below this size.
+ This particular magic number was chosen to work best on a Sun 4/260. */
+#define MAX_THRESH 4
+/* Stack node declarations used to store unfulfilled partition obligations. */
+typedef struct
+ {
+ char *lo;
+ char *hi;
+ } stack_node;
+/* The next 4 #defines implement a very fast in-line stack abstraction. */
+/* The stack needs log (total_elements) entries (we could even subtract
+ log(MAX_THRESH)). Since total_elements has type size_t, we get as
+ upper bound for log (total_elements):
+ bits per byte (CHAR_BIT) * sizeof(size_t). */
+#define STACK_SIZE (CHAR_BIT * sizeof(size_t))
+#define PUSH(low, high) ((void) ((top->lo = (low)), (top->hi = (high)), ++top))
+#define POP(low, high) ((void) (--top, (low = top->lo), (high = top->hi)))
+#define STACK_NOT_EMPTY (stack < top)
+/* Order size using quicksort. This implementation incorporates
+ four optimizations discussed in Sedgewick:
+ 1. Non-recursive, using an explicit stack of pointers that store the
+ next array partition to sort. To save time, this maximum amount
+ of space required to store an array of SIZE_MAX is allocated on the
+ stack. Assuming a 32-bit (64 bit) integer for size_t, this needs
+ only 32 * sizeof(stack_node) == 256 bytes (for 64 bit: 1024 bytes).
+ Pretty cheap, actually.
+ 2. Choose the pivot element using a median-of-three decision tree.
+ This reduces the probability of selecting a bad pivot value and
+ eliminates certain extraneous comparisons.
+ 3. Only quicksorts TOTAL_ELEMS / MAX_THRESH partitions, leaving
+ insertion sort to order the MAX_THRESH items within each partition.
+ This is a big win, since insertion sort is faster for small, mostly
+ sorted array segments.
+ 4. The larger of the two sub-partitions is always pushed onto the
+ stack first, with the algorithm then concentrating on the
+ smaller partition. This *guarantees* no more than log (total_elems)
+ stack size is needed (actually O(1) in this case)! */
+_quicksort (void *const pbase, size_t total_elems, size_t size,
+ int (*cmp)(const void *, const void *, void *), void *arg)
+ char *base_ptr = (char *) pbase;
+ const size_t max_thresh = MAX_THRESH * size;
+ if (total_elems == 0)
+ /* Avoid lossage with unsigned arithmetic below. */
+ return;
+ if (total_elems > MAX_THRESH)
+ {
+ char *lo = base_ptr;
+ char *hi = &lo[size * (total_elems - 1)];
+ stack_node stack[STACK_SIZE];
+ stack_node *top = stack;
+ {
+ char *left_ptr;
+ char *right_ptr;
+ /* Select median value from among LO, MID, and HI. Rearrange
+ LO and HI so the three values are sorted. This lowers the
+ probability of picking a pathological pivot value and
+ skips a comparison for both the LEFT_PTR and RIGHT_PTR in
+ the while loops. */
+ char *mid = lo + size * ((hi - lo) / size >> 1);
+ if ((*cmp) ((void *) mid, (void *) lo, arg) < 0)
+ SWAP (mid, lo, size);
+ if ((*cmp) ((void *) hi, (void *) mid, arg) < 0)
+ SWAP (mid, hi, size);
+ else
+ goto jump_over;
+ if ((*cmp) ((void *) mid, (void *) lo, arg) < 0)
+ SWAP (mid, lo, size);
+ jump_over:;
+ left_ptr = lo + size;
+ right_ptr = hi - size;
+ /* Here's the famous ``collapse the walls'' section of quicksort.
+ Gotta like those tight inner loops! They are the main reason
+ that this algorithm runs much faster than others. */
+ do
+ {
+ while ((*cmp) ((void *) left_ptr, (void *) mid, arg) < 0)
+ left_ptr += size;
+ while ((*cmp) ((void *) mid, (void *) right_ptr, arg) < 0)
+ right_ptr -= size;
+ if (left_ptr < right_ptr)
+ {
+ SWAP (left_ptr, right_ptr, size);
+ if (mid == left_ptr)
+ mid = right_ptr;
+ else if (mid == right_ptr)
+ mid = left_ptr;
+ left_ptr += size;
+ right_ptr -= size;
+ }
+ else if (left_ptr == right_ptr)
+ {
+ left_ptr += size;
+ right_ptr -= size;
+ break;
+ }
+ }
+ while (left_ptr <= right_ptr);
+ /* Set up pointers for next iteration. First determine whether
+ left and right partitions are below the threshold size. If so,
+ ignore one or both. Otherwise, push the larger partition's
+ bounds on the stack and continue sorting the smaller one. */
+ if ((size_t) (right_ptr - lo) <= max_thresh)
+ {
+ if ((size_t) (hi - left_ptr) <= max_thresh)
+ /* Ignore both small partitions. */
+ POP (lo, hi);
+ else
+ /* Ignore small left partition. */
+ lo = left_ptr;
+ }
+ else if ((size_t) (hi - left_ptr) <= max_thresh)
+ /* Ignore small right partition. */
+ hi = right_ptr;
+ else if ((right_ptr - lo) > (hi - left_ptr))
+ {
+ /* Push larger left partition indices. */
+ PUSH (lo, right_ptr);
+ lo = left_ptr;
+ }
+ else
+ {
+ /* Push larger right partition indices. */
+ PUSH (left_ptr, hi);
+ hi = right_ptr;
+ }
+ }
+ }
+ /* Once the BASE_PTR array is partially sorted by quicksort the rest
+ is completely sorted using insertion sort, since this is efficient
+ for partitions below MAX_THRESH size. BASE_PTR points to the beginning
+ of the array to sort, and END_PTR points at the very last element in
+ the array (*not* one beyond it!). */
+#define min(x, y) ((x) < (y) ? (x) : (y))
+ {
+ char *const end_ptr = &base_ptr[size * (total_elems - 1)];
+ char *tmp_ptr = base_ptr;
+ char *thresh = min(end_ptr, base_ptr + max_thresh);
+ char *run_ptr;
+ /* Find smallest element in first threshold and place it at the
+ array's beginning. This is the smallest array element,
+ and the operation speeds up insertion sort's inner loop. */
+ for (run_ptr = tmp_ptr + size; run_ptr <= thresh; run_ptr += size)
+ if ((*cmp) ((void *) run_ptr, (void *) tmp_ptr, arg) < 0)
+ tmp_ptr = run_ptr;
+ if (tmp_ptr != base_ptr)
+ SWAP (tmp_ptr, base_ptr, size);
+ /* Insertion sort, running from left-hand-side up to right-hand-side. */
+ run_ptr = base_ptr + size;
+ while ((run_ptr += size) <= end_ptr)
+ {
+ tmp_ptr = run_ptr - size;
+ while ((*cmp) ((void *) run_ptr, (void *) tmp_ptr, arg) < 0)
+ tmp_ptr -= size;
+ tmp_ptr += size;
+ if (tmp_ptr != run_ptr)
+ {
+ char *trav;
+ trav = run_ptr + size;
+ while (--trav >= run_ptr)
+ {
+ char c = *trav;
+ char *hi, *lo;
+ for (hi = lo = trav; (lo -= size) >= tmp_ptr; hi = lo)
+ *hi = *lo;
+ *hi = c;
+ }
+ }
+ }
+ }
diff --git a/documentation/tutorial_fpl_2022/01-introduction/Exercise7/test.c b/documentation/tutorial_fpl_2022/01-introduction/Exercise7/test.c
new file mode 100644
index 000000000..f968a2897
--- /dev/null
+++ b/documentation/tutorial_fpl_2022/01-introduction/Exercise7/test.c
@@ -0,0 +1,17 @@
+typedef unsigned int size_t;
+typedef int (*__compar_d_fn_t)(void *, void *, void *);
+#include "aggregate.h"
+//#include "qsort.c"
+/* Sorting routine defined in qsort.c. */
+_quicksort (void *const pbase, size_t total_elems, size_t size,
+ int (*cmp)(const void *, const void *, void *), void *arg);
+//#include "less.c"
+/* Comparator defined in less.c. The prototype carries the same const
+ qualifiers as the definition and as the cmp parameter of _quicksort, so the
+ function pointer passed below has a compatible type. */
+int less (const void * a, const void * b, void * notUsed);
+/* Top function: sorts the buffer at pbase as an array of struct aggregate.
+ total_elems counts floats; it is converted to a number of whole records. */
+void test(float * const pbase, size_t total_elems)
+ _quicksort(pbase, (sizeof(float) * total_elems) / sizeof(struct aggregate), sizeof(struct aggregate), less , (void *)0);
diff --git a/documentation/tutorial_fpl_2022/01-introduction/Exercise7/test.xml b/documentation/tutorial_fpl_2022/01-introduction/Exercise7/test.xml
new file mode 100644
index 000000000..a98af4c69
--- /dev/null
+++ b/documentation/tutorial_fpl_2022/01-introduction/Exercise7/test.xml
@@ -0,0 +1,4 @@
diff --git a/documentation/tutorial_fpl_2022/01-introduction/Exercise8/Keccak.c b/documentation/tutorial_fpl_2022/01-introduction/Exercise8/Keccak.c
new file mode 100644
index 000000000..cbc8cc651
--- /dev/null
+++ b/documentation/tutorial_fpl_2022/01-introduction/Exercise8/Keccak.c
@@ -0,0 +1,142 @@
+ * The Keccak sponge function, designed by Guido Bertoni, Joan Daemen,
+ * Michaël Peeters and Gilles Van Assche. For more information, feedback or
+ * questions, please refer to our website: http://keccak.noekeon.org/
+ * Implementation by the designers,
+ * hereby denoted as "the implementer".
+ * To the extent possible under law, the implementer has waived all copyright
+ * and related or neighboring rights to the source code in this file.
+ * http://creativecommons.org/publicdomain/zero/1.0/
+ *
+ */
+typedef unsigned char UINT8;
+typedef unsigned long long int UINT64;
+#define nrRounds 24
+#define GET_KRC_VAL(index) (KeccakRoundConstants[index])
+static UINT64 KeccakRoundConstants[nrRounds] = {
+ 0x0000000000000001ULL,
+ 0x0000000000008082ULL,
+ 0x800000000000808aULL,
+ 0x8000000080008000ULL,
+ 0x000000000000808bULL,
+ 0x0000000080000001ULL,
+ 0x8000000080008081ULL,
+ 0x8000000000008009ULL,
+ 0x000000000000008aULL,
+ 0x0000000000000088ULL,
+ 0x0000000080008009ULL,
+ 0x000000008000000aULL,
+ 0x000000008000808bULL,
+ 0x800000000000008bULL,
+ 0x8000000000008089ULL,
+ 0x8000000000008003ULL,
+ 0x8000000000008002ULL,
+ 0x8000000000000080ULL,
+ 0x000000000000800aULL,
+ 0x800000008000000aULL,
+ 0x8000000080008081ULL,
+ 0x8000000000008080ULL,
+ 0x0000000080000001ULL,
+ 0x8000000080008008ULL
+#define nrLanes 25
+static unsigned char KeccakRhoOffsets[nrLanes] = {
+ 0,
+ 1,
+ 62,
+ 28,
+ 27,
+ 36,
+ 44,
+ 6,
+ 55,
+ 20,
+ 3,
+ 10,
+ 43,
+ 25,
+ 39,
+ 41,
+ 45,
+ 15,
+ 21,
+ 8,
+ 18,
+ 2,
+ 61,
+ 56,
+ 14
+#define index(x, y) (((x)%5)+5*((y)%5))
+#define ROL64(a, offset) ((offset != 0) ? ((((UINT64)a) << offset) ^ (((UINT64)a) >> (64-offset))) : a)
+void theta(UINT64 *A)
+ unsigned int x, y;
+ UINT64 C[5], D[5];
+ for(x=0; x<5; x++) {
+ C[x] = 0;
+ for(y=0; y<5; y++)
+ C[x] ^= A[index(x, y)];
+ }
+ for(x=0; x<5; x++)
+ D[x] = ROL64(C[(x+1)%5], 1) ^ C[(x+4)%5];
+ for(x=0; x<5; x++)
+ for(y=0; y<5; y++)
+ A[index(x, y)] ^= D[x];
+void rho(UINT64 *A)
+ unsigned int x, y;
+ for(x=0; x<5; x++) for(y=0; y<5; y++)
+ A[index(x, y)] = ROL64(A[index(x, y)], KeccakRhoOffsets[index(x, y)]);
+void pi(UINT64 *A)
+ unsigned int x, y;
+ UINT64 tempA[25];
+ for(x=0; x<5; x++) for(y=0; y<5; y++)
+ tempA[index(x, y)] = A[index(x, y)];
+ for(x=0; x<5; x++) for(y=0; y<5; y++)
+ A[index(0*x+1*y, 2*x+3*y)] = tempA[index(x, y)];
+void chi(UINT64 *A)
+ unsigned int x, y;
+ UINT64 C[5];
+ for(y=0; y<5; y++) {
+ for(x=0; x<5; x++)
+ C[x] = A[index(x, y)] ^ ((~A[index(x+1, y)]) & A[index(x+2, y)]);
+ for(x=0; x<5; x++)
+ A[index(x, y)] = C[x];
+ }
+void iota(UINT64 *A, unsigned int indexRound)
+ A[index(0, 0)] ^= GET_KRC_VAL(indexRound);
+void kekka_coproc(UINT64 A[25])
+ unsigned int i;
+ for(i=0;i
diff --git a/documentation/tutorial_fpl_2022/01-introduction/Exercise9/bambu.sh b/documentation/tutorial_fpl_2022/01-introduction/Exercise9/bambu.sh
new file mode 100755
index 000000000..a67cdc80f
--- /dev/null
+++ b/documentation/tutorial_fpl_2022/01-introduction/Exercise9/bambu.sh
@@ -0,0 +1,13 @@
+# Resolve the directory containing this script so the sources are found
+# regardless of the caller's working directory. Expansions are quoted so that
+# paths containing spaces survive word splitting.
+script=$(readlink -e "$0")
+root_dir=$(dirname "$script")
+# Start from a clean output directory; stop if it cannot be entered, otherwise
+# bambu would write its results into the caller's directory.
+rm -rf search
+mkdir -p search
+cd search || exit 1
+echo "#simulation of search function"
+# Synthesize and simulate the tree search example; extra arguments are passed on to bambu.
+bambu "$root_dir/tree.c" --top-fname=main --top-rtldesign-name=search \
+ -O3 --experimental-setup=BAMBU \
+ --generate-tb="$root_dir/test_search.xml" --simulator=ICARUS --simulate \
+ --print-dot -v2 "$@" |& tee log.txt
\ No newline at end of file
diff --git a/documentation/tutorial_fpl_2022/01-introduction/Exercise9/tree.c b/documentation/tutorial_fpl_2022/01-introduction/Exercise9/tree.c
new file mode 100644
index 000000000..088ea68c2
--- /dev/null
+++ b/documentation/tutorial_fpl_2022/01-introduction/Exercise9/tree.c
@@ -0,0 +1,323 @@
+#define MAX_NUMBER_OF_NODES 255
+extern void __builtin_bambu_time_start();
+extern void __builtin_bambu_time_stop();
+/* stack data structure */
+struct stack
+ void *data;
+ struct stack *next;
+typedef struct stack node_stack;
+/* Auxiliary memory stack allocation utilities */
+static node_stack StaticPoolStack[MAX_NUMBER_OF_NODES];
+static node_stack* head_stack_free_list;
+void push_stack_free_list(node_stack ** head, node_stack * new_node)
+ new_node->data = 0;
+ new_node->next = *head;
+ *head = new_node;
+node_stack* pop_stack_free_list(node_stack ** head)
+ node_stack* retval = 0;
+ node_stack * next_node = NULL;
+ if (*head == NULL)
+ return NULL;
+ next_node = (*head)->next;
+ retval = *head;
+ *head = next_node;
+ return retval;
+void init_stack_free_list()
+ int index;
+ for(index=0; index < MAX_NUMBER_OF_NODES; ++index)
+ push_stack_free_list(&head_stack_free_list, &StaticPoolStack[index]);
+/* Stack related functions */
+void push(node_stack** head, void *t)
+ node_stack* temp = pop_stack_free_list(&head_stack_free_list);
+ assert(temp);
+ temp->data = t;
+ temp->next = (*head);
+ *head= temp;
+_Bool isEmpty(node_stack *head)
+ return (head == NULL)? 1 : 0;
+void *pop(node_stack** head)
+ void *res;
+ node_stack *top;
+ assert(!isEmpty(*head));
+ top = *head;
+ res = top->data;
+ *head = top->next;
+ push_stack_free_list(&head_stack_free_list, top);
+ return res;
+void* top(node_stack* head)
+ return head->data;
+/* binary tree data structure */
+struct bin_tree {
+ int data;
+ struct bin_tree * right, * left;
+typedef struct bin_tree node_tree;
+/* Auxiliary memory tree allocation utilities */
+static node_tree StaticPoolTree[MAX_NUMBER_OF_NODES];
+static node_tree* head_tree_free_list;
+void push_tree_free_list(node_tree ** head, node_tree * new_node)
+ new_node->data = 0;
+ new_node->left = *head;
+ new_node->right = 0;
+ *head = new_node;
+node_tree* pop_tree_free_list(node_tree ** head)
+ node_tree* retval = 0;
+ node_tree * next_node = NULL;
+ if (*head == NULL)
+ return NULL;
+ next_node = (*head)->left;
+ retval = *head;
+ *head = next_node;
+ return retval;
+void init_tree_free_list()
+ int index;
+ for(index=0; index < MAX_NUMBER_OF_NODES; ++index)
+ push_tree_free_list(&head_tree_free_list, &StaticPoolTree[index]);
+/* binary tree functions */
+void insert(node_tree ** tree, int val)
+ node_tree *temp = NULL;
+ if(!(*tree))
+ {
+ temp = pop_tree_free_list(&head_tree_free_list);
+ assert(temp);
+ temp->left = temp->right = NULL;
+ temp->data = val;
+ *tree = temp;
+ return;
+ }
+ if(val < (*tree)->data)
+ {
+ insert(&(*tree)->left, val);
+ }
+ else if(val > (*tree)->data)
+ {
+ insert(&(*tree)->right, val);
+ }
+void print_preorder(node_tree * root)
+ if (root)
+ {
+ node_tree *current;
+ node_stack *s = NULL;
+ push(&s, root);
+ while (!isEmpty(s))
+ {
+ current = pop(&s);
+ printf ("%d\n", current->data);
+ if (current->right)
+ push(&s, current->right);
+ if (current->left)
+ push(&s, current->left);
+ }
+ }
+/* Iterative function for inorder binary tree print */
+void print_inorder(node_tree *root)
+ node_tree *current = root;
+ node_stack *s = NULL;
+ _Bool done = 0;
+ while (!done)
+ {
+ if(current != NULL)
+ {
+ push(&s, current);
+ current = current->left;
+ }
+ else
+ {
+ if (!isEmpty(s))
+ {
+ current = pop(&s);
+ printf("%d\n", current->data);
+ current = current->right;
+ }
+ else
+ done = 1;
+ }
+ }
+void print_postorder(node_tree * root)
+ if (root)
+ {
+ node_tree *prev=NULL;
+ node_stack *s = NULL;
+ push(&s, root);
+ while (!isEmpty(s)) {
+ node_tree *curr = top(s);
+ if (!prev || prev->left == curr || prev->right == curr) {
+ if (curr->left)
+ push(&s, curr->left);
+ else if (curr->right)
+ push(&s, curr->right);
+ } else if (curr->left == prev) {
+ if (curr->right)
+ push(&s, curr->right);
+ } else {
+ printf("%d\n", curr->data);
+ pop(&s);
+ }
+ prev = curr;
+ }
+ }
+void deltree(node_tree * root)
+ if (root)
+ {
+ node_tree *prev=NULL;
+ node_stack *s = NULL;
+ push(&s, root);
+ while (!isEmpty(s)) {
+ node_tree *curr = top(s);
+ if (!prev || prev->left == curr || prev->right == curr) {
+ if (curr->left)
+ push(&s, curr->left);
+ else if (curr->right)
+ push(&s, curr->right);
+ } else if (curr->left == prev) {
+ if (curr->right)
+ push(&s, curr->right);
+ } else {
+ push_tree_free_list(&head_tree_free_list, curr);
+ pop(&s);
+ }
+ prev = curr;
+ }
+ }
+node_tree* __attribute__ ((noinline)) search(node_tree * tree, int val)
+ if(tree == NULL|| tree->data == val)
+ return tree;
+ if (tree->data < val)
+ return search(tree->right, val);
+ else
+ return search(tree->left, val);
+int main()
+ node_tree *root;
+ node_tree *tmp;
+ //int i;
+ init_tree_free_list();
+ init_stack_free_list();
+ root = NULL;
+ /* Inserting nodes into tree */
+ insert(&root, 9);
+ insert(&root, 4);
+ insert(&root, 15);
+ insert(&root, 6);
+ insert(&root, 12);
+ insert(&root, 17);
+ insert(&root, 2);
+ /* Printing nodes of tree */
+ printf("Pre Order Display\n");
+ print_preorder(root);
+ printf("In Order Display\n");
+ print_inorder(root);
+ printf("Post Order Display\n");
+ print_postorder(root);
+ /* Search node into tree */
+ __builtin_bambu_time_start();
+ tmp = search(root, 4);
+ __builtin_bambu_time_stop();
+ if (tmp)
+ {
+ printf("Searched node=%d\n", tmp->data);
+ }
+ else
+ {
+ printf("Data Not found in tree.\n");
+ }
+ /* Search node into tree */
+ __builtin_bambu_time_start();
+ tmp = search(root, 6);
+ __builtin_bambu_time_stop();
+ if (tmp)
+ {
+ printf("Second searched node=%d\n", tmp->data);
+ }
+ else
+ {
+ printf("Data Not found in tree.\n");
+ }
+ /* Deleting all nodes of tree */
+ deltree(root);
+ return 0;
diff --git a/documentation/tutorial_fpl_2022/03-optimizations/Exercise1/README b/documentation/tutorial_fpl_2022/03-optimizations/Exercise1/README
new file mode 100644
index 000000000..ebe8c2298
--- /dev/null
+++ b/documentation/tutorial_fpl_2022/03-optimizations/Exercise1/README
@@ -0,0 +1,5 @@
+Evaluate the effects of GCC optimizations on the cycle count of the adpcm benchmark:
+- Different optimization levels
+- Vectorization
+- Different inlining choices
diff --git a/documentation/tutorial_fpl_2022/03-optimizations/Exercise1/adpcm.c b/documentation/tutorial_fpl_2022/03-optimizations/Exercise1/adpcm.c
new file mode 100755
index 000000000..613b1bdb5
--- /dev/null
+++ b/documentation/tutorial_fpl_2022/03-optimizations/Exercise1/adpcm.c
@@ -0,0 +1,882 @@
+| CHStone : a suite of benchmark programs for C-based High-Level Synthesis |
+| ======================================================================== |
+| |
+| * Collected and Modified : Y. Hara, H. Tomiyama, S. Honda, |
+| H. Takada and K. Ishii |
+| Nagoya University, Japan |
+| |
+| * Remark : |
+| 1. This source code is modified to unify the formats of the benchmark |
+| programs in CHStone. |
+| 2. Test vectors are added for CHStone. |
+| 3. If "main_result" is 0 at the end of the program, the program is |
+| correctly executed. |
+| 4. Please follow the copyright of each benchmark program. |
+/* */
+/* SNU-RT Benchmark Suite for Worst Case Timing Analysis */
+/* ===================================================== */
+/* Collected and Modified by S.-S. Lim */
+/* sslim@archi.snu.ac.kr */
+/* Real-Time Research Group */
+/* Seoul National University */
+/* */
+/* */
+/* < Features > - restrictions for our experimental environment */
+/* */
+/* 1. Completely structured. */
+/* - There are no unconditional jumps. */
+/* - There are no exit from loop bodies. */
+/* (There are no 'break' or 'return' in loop bodies) */
+/* 2. No 'switch' statements. */
+/* 3. No 'do..while' statements. */
+/* 4. Expressions are restricted. */
+/* - There are no multiple expressions joined by 'or', */
+/* 'and' operations. */
+/* 5. No library calls. */
+/* - All the functions needed are implemented in the */
+/* source file. */
+/* */
+/* */
+/* */
+/* FILE: adpcm.c */
+/* SOURCE : C Algorithms for Real-Time DSP by P. M. Embree */
+/* */
+/* */
+/* CCITT G.722 ADPCM (Adaptive Differential Pulse Code Modulation) */
+/* algorithm. */
+/* 16 kHz sample rate data is stored in the array test_data[SIZE]. */
+/* Results are stored in the arrays compressed[SIZE] and result[SIZE]. */
+/* Execution time is determined by the constant SIZE (this header cites */
+/* a default value of 2000; this file defines SIZE as 100). */
+/* */
+/* REMARK : */
+/* */
+/* */
+/* */
+int encode (int, int);
+void decode (int);
+int filtez (int *bpl, int *dlt);
+void upzero (int dlt, int *dlti, int *bli);
+int filtep (int rlt1, int al1, int rlt2, int al2);
+int quantl (int el, int detl);
+int logscl (int il, int nbl);
+int scalel (int nbl, int shift_constant);
+int uppol2 (int al1, int al2, int plt, int plt1, int plt2);
+int uppol1 (int al1, int apl2, int plt, int plt1);
+int logsch (int ih, int nbh);
+void reset ();
+/* G722 C code */
+/* variables for the transmit quadrature mirror filter */
+int tqmf[24];
+/* QMF filter coefficients:
+scaled by a factor of 4 compared to the G.722 CCITT recommendation */
+const int h[24] = {
+ 12, -44, -44, 212, 48, -624, 128, 1448,
+ -840, -3220, 3804, 15504, 15504, 3804, -3220, -840,
+ 1448, 128, -624, 48, 212, -44, -44, 12
+int xl, xh;
+/* variables for receive quadrature mirror filter here */
+int accumc[11], accumd[11];
+/* outputs of decode() */
+int xout1, xout2;
+int xs, xd;
+/* variables for encoder (hi and lo) here */
+int il, szl, spl, sl, el;
+const int qq4_code4_table[16] = {
+ 0, -20456, -12896, -8968, -6288, -4240, -2584, -1200,
+ 20456, 12896, 8968, 6288, 4240, 2584, 1200, 0
+const int qq6_code6_table[64] = {
+ -136, -136, -136, -136, -24808, -21904, -19008, -16704,
+ -14984, -13512, -12280, -11192, -10232, -9360, -8576, -7856,
+ -7192, -6576, -6000, -5456, -4944, -4464, -4008, -3576,
+ -3168, -2776, -2400, -2032, -1688, -1360, -1040, -728,
+ 24808, 21904, 19008, 16704, 14984, 13512, 12280, 11192,
+ 10232, 9360, 8576, 7856, 7192, 6576, 6000, 5456,
+ 4944, 4464, 4008, 3576, 3168, 2776, 2400, 2032,
+ 1688, 1360, 1040, 728, 432, 136, -432, -136
+int delay_bpl[6];
+int delay_dltx[6];
+const int wl_code_table[16] = {
+ -60, 3042, 1198, 538, 334, 172, 58, -30,
+ 3042, 1198, 538, 334, 172, 58, -30, -60
+const int ilb_table[32] = {
+ 2048, 2093, 2139, 2186, 2233, 2282, 2332, 2383,
+ 2435, 2489, 2543, 2599, 2656, 2714, 2774, 2834,
+ 2896, 2960, 3025, 3091, 3158, 3228, 3298, 3371,
+ 3444, 3520, 3597, 3676, 3756, 3838, 3922, 4008
+int nbl; /* delay line */
+int al1, al2;
+int plt, plt1, plt2;
+int dlt;
+int rlt, rlt1, rlt2;
+/* decision levels - pre-multiplied by 8, 0 to indicate end */
+const int decis_levl[30] = {
+ 280, 576, 880, 1200, 1520, 1864, 2208, 2584,
+ 2960, 3376, 3784, 4240, 4696, 5200, 5712, 6288,
+ 6864, 7520, 8184, 8968, 9752, 10712, 11664, 12896,
+ 14120, 15840, 17560, 20456, 23352, 32767
+int detl;
+/* quantization table 31 long to make quantl look-up easier,
+last entry is for mil=30 case when wd is max */
+const int quant26bt_pos[31] = {
+ 61, 60, 59, 58, 57, 56, 55, 54,
+ 53, 52, 51, 50, 49, 48, 47, 46,
+ 45, 44, 43, 42, 41, 40, 39, 38,
+ 37, 36, 35, 34, 33, 32, 32
+/* quantization table 31 long to make quantl look-up easier,
+last entry is for mil=30 case when wd is max */
+const int quant26bt_neg[31] = {
+ 63, 62, 31, 30, 29, 28, 27, 26,
+ 25, 24, 23, 22, 21, 20, 19, 18,
+ 17, 16, 15, 14, 13, 12, 11, 10,
+ 9, 8, 7, 6, 5, 4, 4
+int deth;
+int sh; /* this comes from adaptive predictor */
+int eh;
+const int qq2_code2_table[4] = {
+ -7408, -1616, 7408, 1616
+const int wh_code_table[4] = {
+ 798, -214, 798, -214
+int dh, ih;
+int nbh, szh;
+int sph, ph, yh, rh;
+int delay_dhx[6];
+int delay_bph[6];
+int ah1, ah2;
+int ph1, ph2;
+int rh1, rh2;
+/* variables for decoder here */
+int ilr, rl;
+int dec_deth, dec_detl, dec_dlt;
+int dec_del_bpl[6];
+int dec_del_dltx[6];
+int dec_plt, dec_plt1, dec_plt2;
+int dec_szl, dec_spl, dec_sl;
+int dec_rlt1, dec_rlt2, dec_rlt;
+int dec_al1, dec_al2;
+int dl;
+int dec_nbl, dec_dh, dec_nbh;
+/* variables used in filtez */
+int dec_del_bph[6];
+int dec_del_dhx[6];
+int dec_szh;
+/* variables used in filtep */
+int dec_rh1, dec_rh2;
+int dec_ah1, dec_ah2;
+int dec_ph, dec_sph;
+int dec_sh;
+int dec_ph1, dec_ph2;
+/* G722 encode function two ints in, one 8 bit output */
+/* put input samples in xin1 = first value, xin2 = second value */
+/* returns il and ih stored together */
+abs (int n) /* File-local absolute value helper: returns n when n >= 0, otherwise -n.
+ * NOTE(review): -n overflows when n is INT_MIN; callers are assumed never to pass it - confirm. */
+ int m;
+ if (n >= 0)
+ m = n;
+ else
+ m = -n;
+ return m;
+encode (int xin1, int xin2) /* Encode one pair of input samples into a single code word.
+ * The QMF outputs xl/xh are computed from the current contents of the tqmf delay line, and
+ * only afterwards are xin1 and xin2 stored into it. The low and high sub-bands are quantized
+ * into the globals il and ih, the global predictor and scale-factor state is updated for the
+ * next call, and il | (ih << 6) is returned. */
+ int i;
+ const int *h_ptr;
+ int *tqmf_ptr, *tqmf_ptr1;
+ long int xa, xb;
+ int decis;
+/* transmit quadrature mirror filters implemented here */
+ h_ptr = h;
+ tqmf_ptr = tqmf;
+ xa = (long) (*tqmf_ptr++) * (*h_ptr++);
+ xb = (long) (*tqmf_ptr++) * (*h_ptr++);
+/* main multiply accumulate loop for samples and coefficients */
+ for (i = 0; i < 10; i++)
+ {
+ xa += (long) (*tqmf_ptr++) * (*h_ptr++);
+ xb += (long) (*tqmf_ptr++) * (*h_ptr++);
+ }
+/* final mult/accumulate */
+ xa += (long) (*tqmf_ptr++) * (*h_ptr++);
+ xb += (long) (*tqmf_ptr) * (*h_ptr++);
+/* update delay line tqmf: shift every sample up by two slots, then store the new pair in tqmf[1] and tqmf[0].
+ * NOTE(review): after the 22 copies tqmf_ptr1 has been decremented to one element before tqmf;
+ * it is never dereferenced there, but forming that pointer is formally undefined in ISO C. */
+ tqmf_ptr1 = tqmf_ptr - 2;
+ for (i = 0; i < 22; i++)
+ *tqmf_ptr-- = *tqmf_ptr1--;
+ *tqmf_ptr-- = xin1;
+ *tqmf_ptr = xin2;
+/* scale outputs */
+ xl = (xa + xb) >> 15;
+ xh = (xa - xb) >> 15;
+/* end of quadrature mirror filter code */
+/* starting with lower sub band encoder */
+/* filtez - compute predictor output section - zero section */
+ szl = filtez (delay_bpl, delay_dltx);
+/* filtep - compute predictor output signal (pole section) */
+ spl = filtep (rlt1, al1, rlt2, al2);
+/* compute the predictor output value in the lower sub_band encoder */
+ sl = szl + spl;
+ el = xl - sl;
+/* quantl: quantize the difference signal */
+ il = quantl (el, detl);
+/* computes quantized difference signal */
+/* for invqbl, truncate by 2 lsbs, so mode = 3 */
+ dlt = ((long) detl * qq4_code4_table[il >> 2]) >> 15;
+/* logscl: updates logarithmic quant. scale factor in low sub band */
+ nbl = logscl (il, nbl);
+/* scalel: compute the quantizer scale factor in the lower sub band */
+/* calling parameters nbl and 8 (constant such that scalel can be scaleh) */
+ detl = scalel (nbl, 8);
+/* parrec - simple addition to compute reconstructed signal for adaptive pred */
+ plt = dlt + szl;
+/* upzero: update zero section predictor coefficients (sixth order)*/
+/* calling parameters: dlt, dlt1, dlt2, ..., dlt6 from dlt */
+/* bpli (linear buffer in which all six values are delayed) */
+/* return params: updated bpli, delayed dltx */
+ upzero (dlt, delay_dltx, delay_bpl);
+/* uppol2- update second predictor coefficient apl2 and delay it as al2 */
+/* calling parameters: al1, al2, plt, plt1, plt2 */
+ al2 = uppol2 (al1, al2, plt, plt1, plt2);
+/* uppol1 :update first predictor coefficient apl1 and delay it as al1 */
+/* calling parameters: al1, apl2, plt, plt1 */
+ al1 = uppol1 (al1, al2, plt, plt1);
+/* recons : compute reconstructed signal for adaptive predictor */
+ rlt = sl + dlt;
+/* done with lower sub_band encoder; now implement delays for next time*/
+ rlt2 = rlt1;
+ rlt1 = rlt;
+ plt2 = plt1;
+ plt1 = plt;
+/* high band encode */
+ szh = filtez (delay_bph, delay_dhx);
+ sph = filtep (rh1, ah1, rh2, ah2);
+/* predic: sh = sph + szh */
+ sh = sph + szh;
+/* subtra: eh = xh - sh */
+ eh = xh - sh;
+/* quanth - quantization of difference signal for higher sub-band */
+/* quanth: in-place for speed params: eh, deth (has init. value) */
+ if (eh >= 0)
+ {
+ ih = 3; /* 2,3 are pos codes */
+ }
+ else
+ {
+ ih = 1; /* 0,1 are neg codes */
+ }
+ decis = (564L * (long) deth) >> 12L;
+ if (abs (eh) > decis)
+ ih--; /* mih = 2 case */
+/* compute the quantized difference signal, higher sub-band*/
+ dh = ((long) deth * qq2_code2_table[ih]) >> 15L;
+/* logsch: update logarithmic quantizer scale factor in hi sub-band*/
+ nbh = logsch (ih, nbh);
+/* note : scalel and scaleh use same code, different parameters */
+ deth = scalel (nbh, 10);
+/* parrec - add pole predictor output to quantized diff. signal */
+ ph = dh + szh;
+/* upzero: update zero section predictor coefficients (sixth order) */
+/* calling parameters: dh, dhi, bphi */
+/* return params: updated bphi, delayed dhx */
+ upzero (dh, delay_dhx, delay_bph);
+/* uppol2: update second predictor coef aph2 and delay as ah2 */
+/* calling params: ah1, ah2, ph, ph1, ph2 */
+ ah2 = uppol2 (ah1, ah2, ph, ph1, ph2);
+/* uppol1: update first predictor coef. aph2 and delay it as ah1 */
+ ah1 = uppol1 (ah1, ah2, ph, ph1);
+/* recons for higher sub-band */
+ yh = sh + dh;
+/* done with higher sub-band encoder, now Delay for next time */
+ rh2 = rh1;
+ rh1 = yh;
+ ph2 = ph1;
+ ph1 = ph;
+/* multiplex ih and il to get signals together */
+ return (il | (ih << 6));
+/* decode function, result in xout1 and xout2.
+ * The low six bits of input become ilr and the remaining bits become ih; the decoder-side
+ * predictor and scale-factor globals (dec_*) are updated for the next call. */
+decode (int input)
+ int i;
+ long int xa1, xa2; /* qmf accumulators */
+ const int *h_ptr;
+ int *ac_ptr, *ac_ptr1, *ad_ptr, *ad_ptr1;
+/* split transmitted word from input into ilr and ih */
+ ilr = input & 0x3f;
+ ih = input >> 6;
+/* filtez: compute predictor output for zero section */
+ dec_szl = filtez (dec_del_bpl, dec_del_dltx);
+/* filtep: compute predictor output signal for pole section */
+ dec_spl = filtep (dec_rlt1, dec_al1, dec_rlt2, dec_al2);
+ dec_sl = dec_spl + dec_szl;
+/* compute quantized difference signal for adaptive predic */
+ dec_dlt = ((long) dec_detl * qq4_code4_table[ilr >> 2]) >> 15;
+/* compute quantized difference signal for decoder output.
+ * NOTE(review): the table is indexed with the global il (last written by encode), not with
+ * the ilr decoded above - confirm this is intended before changing it, since the expected
+ * values in test_result may depend on it. */
+ dl = ((long) dec_detl * qq6_code6_table[il]) >> 15;
+ rl = dl + dec_sl;
+/* logscl: quantizer scale factor adaptation in the lower sub-band */
+ dec_nbl = logscl (ilr, dec_nbl);
+/* scalel: computes quantizer scale factor in the lower sub band */
+ dec_detl = scalel (dec_nbl, 8);
+/* parrec - add pole predictor output to quantized diff. signal */
+/* for partially reconstructed signal */
+ dec_plt = dec_dlt + dec_szl;
+/* upzero: update zero section predictor coefficients */
+ upzero (dec_dlt, dec_del_dltx, dec_del_bpl);
+/* uppol2: update second predictor coefficient apl2 and delay it as al2 */
+ dec_al2 = uppol2 (dec_al1, dec_al2, dec_plt, dec_plt1, dec_plt2);
+/* uppol1: update first predictor coef. (pole section) */
+ dec_al1 = uppol1 (dec_al1, dec_al2, dec_plt, dec_plt1);
+/* recons : compute reconstructed signal for adaptive predictor */
+ dec_rlt = dec_sl + dec_dlt;
+/* done with lower sub band decoder, implement delays for next time */
+ dec_rlt2 = dec_rlt1;
+ dec_rlt1 = dec_rlt;
+ dec_plt2 = dec_plt1;
+ dec_plt1 = dec_plt;
+/* filtez: compute predictor output for zero section */
+ dec_szh = filtez (dec_del_bph, dec_del_dhx);
+/* filtep: compute predictor output signal for pole section */
+ dec_sph = filtep (dec_rh1, dec_ah1, dec_rh2, dec_ah2);
+/* predic:compute the predictor output value in the higher sub_band decoder */
+ dec_sh = dec_sph + dec_szh;
+/* in-place compute the quantized difference signal */
+ dec_dh = ((long) dec_deth * qq2_code2_table[ih]) >> 15L;
+/* logsch: update logarithmic quantizer scale factor in hi sub band */
+ dec_nbh = logsch (ih, dec_nbh);
+/* scalel: compute the quantizer scale factor in the higher sub band */
+ dec_deth = scalel (dec_nbh, 10);
+/* parrec: compute partially reconstructed signal */
+ dec_ph = dec_dh + dec_szh;
+/* upzero: update zero section predictor coefficients */
+ upzero (dec_dh, dec_del_dhx, dec_del_bph);
+/* uppol2: update second predictor coefficient aph2 and delay it as ah2 */
+ dec_ah2 = uppol2 (dec_ah1, dec_ah2, dec_ph, dec_ph1, dec_ph2);
+/* uppol1: update first predictor coef. (pole section) */
+ dec_ah1 = uppol1 (dec_ah1, dec_ah2, dec_ph, dec_ph1);
+/* recons : compute reconstructed signal for adaptive predictor */
+ rh = dec_sh + dec_dh;
+/* done with high band decode, implementing delays for next time here */
+ dec_rh2 = dec_rh1;
+ dec_rh1 = rh;
+ dec_ph2 = dec_ph1;
+ dec_ph1 = dec_ph;
+/* end of higher sub_band decoder */
+/* end with receive quadrature mirror filters */
+ xd = rl - rh;
+ xs = rl + rh;
+/* receive quadrature mirror filters implemented here */
+ h_ptr = h;
+ ac_ptr = accumc;
+ ad_ptr = accumd;
+ xa1 = (long) xd *(*h_ptr++);
+ xa2 = (long) xs *(*h_ptr++);
+/* main multiply accumulate loop for samples and coefficients */
+ for (i = 0; i < 10; i++)
+ {
+ xa1 += (long) (*ac_ptr++) * (*h_ptr++);
+ xa2 += (long) (*ad_ptr++) * (*h_ptr++);
+ }
+/* final mult/accumulate */
+ xa1 += (long) (*ac_ptr) * (*h_ptr++);
+ xa2 += (long) (*ad_ptr) * (*h_ptr++);
+/* scale by 2^14 */
+ xout1 = xa1 >> 14;
+ xout2 = xa2 >> 14;
+/* update delay lines: shift accumc/accumd up by one slot, then store xd/xs in slot 0.
+ * NOTE(review): after the 10 copies ac_ptr1 and ad_ptr1 have been decremented to one element
+ * before their arrays; they are never dereferenced there, but forming such a pointer is
+ * formally undefined in ISO C. */
+ ac_ptr1 = ac_ptr - 1;
+ ad_ptr1 = ad_ptr - 1;
+ for (i = 0; i < 10; i++)
+ {
+ *ac_ptr-- = *ac_ptr1--;
+ *ad_ptr-- = *ad_ptr1--;
+ }
+ *ac_ptr = xd;
+ *ad_ptr = xs;
+/* clear all storage locations: scale factors return to their minimum values, and every
+ * predictor coefficient, delay element and filter delay line of both the encoder and the
+ * decoder is set to zero */
+reset ()
+ int i;
+ detl = dec_detl = 32; /* reset to min scale factor */
+ deth = dec_deth = 8;
+ nbl = al1 = al2 = plt1 = plt2 = rlt1 = rlt2 = 0;
+ nbh = ah1 = ah2 = ph1 = ph2 = rh1 = rh2 = 0;
+ dec_nbl = dec_al1 = dec_al2 = dec_plt1 = dec_plt2 = dec_rlt1 = dec_rlt2 = 0;
+ dec_nbh = dec_ah1 = dec_ah2 = dec_ph1 = dec_ph2 = dec_rh1 = dec_rh2 = 0;
+ for (i = 0; i < 6; i++) /* delayed difference signals, encoder and decoder */
+ {
+ delay_dltx[i] = 0;
+ delay_dhx[i] = 0;
+ dec_del_dltx[i] = 0;
+ dec_del_dhx[i] = 0;
+ }
+ for (i = 0; i < 6; i++) /* zero-section predictor coefficients, encoder and decoder */
+ {
+ delay_bpl[i] = 0;
+ delay_bph[i] = 0;
+ dec_del_bpl[i] = 0;
+ dec_del_bph[i] = 0;
+ }
+ for (i = 0; i < 24; i++)
+ tqmf[i] = 0; // covers all 24 entries of tqmf; NOTE(review): an earlier bound was presumably i<23 - confirm
+ for (i = 0; i < 11; i++) /* receive QMF delay lines */
+ {
+ accumc[i] = 0;
+ accumd[i] = 0;
+ }
+/* filtez - compute predictor output signal (zero section) */
+/* input: bpl1-6 and dlt1-6, output: szl.
+ * Sums the six products bpl[i] * dlt[i] in a long accumulator and returns the sum
+ * shifted right by 14, converted to int. Both pointers must address at least six ints. */
+filtez (int *bpl, int *dlt)
+ int i;
+ long int zl;
+ zl = (long) (*bpl++) * (*dlt++); /* first tap initializes the accumulator */
+ for (i = 1; i < 6; i++) /* remaining five taps */
+ zl += (long) (*bpl++) * (*dlt++);
+ return ((int) (zl >> 14)); /* x2 here */
+/* filtep - compute predictor output signal (pole section) */
+/* input rlt1-2 and al1-2, output spl.
+ * Returns (al1 * 2*rlt1 + al2 * 2*rlt2) >> 15, converted to int. */
+filtep (int rlt1, int al1, int rlt2, int al2)
+ long int pl, pl2;
+ pl = 2 * rlt1; /* the doubling is done in int before being widened to long */
+ pl = (long) al1 *pl;
+ pl2 = 2 * rlt2;
+ pl += (long) al2 *pl2;
+ return ((int) (pl >> 15));
+/* quantl - quantize the difference signal in the lower sub-band.
+ * el is the difference signal and detl the current scale factor; the return value is the
+ * code read from quant26bt_pos (el >= 0) or quant26bt_neg (el < 0). */
+quantl (int el, int detl)
+ int ril, mil;
+ long int wd, decis;
+/* abs of difference signal */
+ wd = abs (el);
+/* determine mil based on decision levels and detl gain */
+ for (mil = 0; mil < 30; mil++)
+ {
+ decis = (decis_levl[mil] * (long) detl) >> 15L;
+ if (wd <= decis) /* first scaled decision level that is not below wd */
+ break;
+ }
+/* if the loop ran to mil=30, wd exceeded every decision level; entry 30 of the 31-entry tables covers that case */
+ if (el >= 0)
+ ril = quant26bt_pos[mil];
+ else
+ ril = quant26bt_neg[mil];
+ return (ril);
+/* logscl - update log quantizer scale factor in lower sub-band */
+/* note that nbl is passed and returned.
+ * The new value is (nbl * 127 >> 7) + wl_code_table[il >> 2], clamped to the range 0..18432. */
+logscl (int il, int nbl)
+ long int wd;
+ wd = ((long) nbl * 127L) >> 7L; /* leak factor 127/128 */
+ nbl = (int) wd + wl_code_table[il >> 2]; /* the two low bits of il are dropped to index the 16-entry table */
+ if (nbl < 0)
+ nbl = 0;
+ if (nbl > 18432)
+ nbl = 18432;
+ return (nbl);
+/* scalel: compute quantizer scale factor in lower or upper sub-band.
+ * nbl is the log scale factor; shift_constant is 8 for the lower band and 10 for the upper
+ * band at the visible call sites. Bits 6..10 of nbl select an ilb_table entry, which is
+ * shifted right by (shift_constant + 1 - (nbl >> 11)) and then left by 3.
+ * With the clamps applied in logscl (18432) and logsch (22528), the right-shift amount at the
+ * visible call sites stays >= 0. */
+scalel (int nbl, int shift_constant)
+ int wd1, wd2, wd3;
+ wd1 = (nbl >> 6) & 31; /* index into the 32-entry ilb_table */
+ wd2 = nbl >> 11;
+ wd3 = ilb_table[wd1] >> (shift_constant + 1 - wd2);
+ return (wd3 << 3);
+/* upzero - inputs: dlt, dlti[0-5], bli[0-5], outputs: updated bli[0-5] */
+/* also implements delay of bli and update of dlti from dlt.
+ * Each bli[i] is first scaled by 255/256; when dlt is non-zero, +128 or -128 is then added
+ * depending on whether dlt and dlti[i] have the same sign. Finally dlti is shifted by one
+ * position and dlt is stored in dlti[0]. */
+upzero (int dlt, int *dlti, int *bli)
+ int i, wd2, wd3;
+/*if dlt is zero, then no sum into bli */
+ if (dlt == 0)
+ {
+ for (i = 0; i < 6; i++)
+ {
+ bli[i] = (int) ((255L * bli[i]) >> 8L); /* leak factor of 255/256 */
+ }
+ }
+ else
+ {
+ for (i = 0; i < 6; i++)
+ {
+ if ((long) dlt * dlti[i] >= 0) /* same sign (or dlti[i] is zero) */
+ wd2 = 128;
+ else
+ wd2 = -128;
+ wd3 = (int) ((255L * bli[i]) >> 8L); /* leak factor of 255/256 */
+ bli[i] = wd2 + wd3;
+ }
+ }
+/* implement delay line for dlt */
+ dlti[5] = dlti[4];
+ dlti[4] = dlti[3];
+ dlti[3] = dlti[2];
+ dlti[2] = dlti[1];
+ dlti[1] = dlti[0];
+ dlti[0] = dlt;
+/* uppol2 - update second predictor coefficient (pole section) */
+/* inputs: al1, al2, plt, plt1, plt2. outputs: apl2.
+ * The result is built from three terms - (+/-)4*al1 >> 7, (+/-)128 and al2 * 127 >> 7 - with
+ * the signs chosen from the sign agreement of plt with plt1 and plt2, and is then clamped
+ * to the range -12288..12288. */
+uppol2 (int al1, int al2, int plt, int plt1, int plt2)
+ long int wd2, wd4;
+ int apl2;
+ wd2 = 4L * (long) al1;
+ if ((long) plt * plt1 >= 0L)
+ wd2 = -wd2; /* check same sign */
+ wd2 = wd2 >> 7; /* gain of 1/128 */
+ if ((long) plt * plt2 >= 0L)
+ {
+ wd4 = wd2 + 128; /* same sign case */
+ }
+ else
+ {
+ wd4 = wd2 - 128;
+ }
+ apl2 = wd4 + (127L * (long) al2 >> 7L); /* leak factor of 127/128 */
+/* apl2 is limited to +-.75 */
+ if (apl2 > 12288)
+ apl2 = 12288;
+ if (apl2 < -12288)
+ apl2 = -12288;
+ return (apl2);
+/* uppol1 - update first predictor coefficient (pole section) */
+/* inputs: al1, apl2, plt, plt1. outputs: apl1.
+ * apl1 is (al1 * 255 >> 8) plus 192 when plt and plt1 agree in sign, minus 192 otherwise,
+ * and is then clamped to the range -(15360 - apl2)..(15360 - apl2). */
+uppol1 (int al1, int apl2, int plt, int plt1)
+ long int wd2;
+ int wd3, apl1;
+ wd2 = ((long) al1 * 255L) >> 8L; /* leak factor of 255/256 */
+ if ((long) plt * plt1 >= 0L)
+ {
+ apl1 = (int) wd2 + 192; /* same sign case */
+ }
+ else
+ {
+ apl1 = (int) wd2 - 192;
+ }
+/* note: wd3= .9375-.75 is always positive */
+ wd3 = 15360 - apl2; /* limit value */
+ if (apl1 > wd3)
+ apl1 = wd3;
+ if (apl1 < -wd3)
+ apl1 = -wd3;
+ return (apl1);
+/* logsch - update log quantizer scale factor in higher sub-band */
+/* note that nbh is passed and returned.
+ * The new value is (nbh * 127 >> 7) + wh_code_table[ih], clamped to the range 0..22528. */
+logsch (int ih, int nbh)
+ int wd;
+ wd = ((long) nbh * 127L) >> 7L; /* leak factor 127/128 */
+ nbh = wd + wh_code_table[ih]; /* ih indexes the 4-entry table directly */
+ if (nbh < 0)
+ nbh = 0;
+ if (nbh > 22528)
+ nbh = 22528;
+ return (nbh);
+| * Test Vectors (added for CHStone) |
+| test_data : input data |
+| test_compressed : expected output data for "encode" |
+| test_result : expected output data for "decode" |
+#define SIZE 100
+#define IN_END 100
+const int test_data[SIZE] = {
+ 0x44, 0x44, 0x44, 0x44, 0x44,
+ 0x44, 0x44, 0x44, 0x44, 0x44,
+ 0x44, 0x44, 0x44, 0x44, 0x44,
+ 0x44, 0x44, 0x43, 0x43, 0x43,
+ 0x43, 0x43, 0x43, 0x43, 0x42,
+ 0x42, 0x42, 0x42, 0x42, 0x42,
+ 0x41, 0x41, 0x41, 0x41, 0x41,
+ 0x40, 0x40, 0x40, 0x40, 0x40,
+ 0x40, 0x40, 0x40, 0x3f, 0x3f,
+ 0x3f, 0x3f, 0x3f, 0x3e, 0x3e,
+ 0x3e, 0x3e, 0x3e, 0x3e, 0x3d,
+ 0x3d, 0x3d, 0x3d, 0x3d, 0x3d,
+ 0x3c, 0x3c, 0x3c, 0x3c, 0x3c,
+ 0x3c, 0x3c, 0x3c, 0x3c, 0x3b,
+ 0x3b, 0x3b, 0x3b, 0x3b, 0x3b,
+ 0x3b, 0x3b, 0x3b, 0x3b, 0x3b,
+ 0x3b, 0x3b, 0x3b, 0x3b, 0x3b,
+ 0x3b, 0x3b, 0x3b, 0x3b, 0x3b,
+ 0x3b, 0x3b, 0x3c, 0x3c, 0x3c,
+ 0x3c, 0x3c, 0x3c, 0x3c, 0x3c
+int compressed[SIZE], result[SIZE];
+const int test_compressed[SIZE] = {
+ 0xfd, 0xde, 0x77, 0xba, 0xf2,
+ 0x90, 0x20, 0xa0, 0xec, 0xed,
+ 0xef, 0xf1, 0xf3, 0xf4, 0xf5,
+ 0xf5, 0xf5, 0xf5, 0xf6, 0xf6,
+ 0xf6, 0xf7, 0xf8, 0xf7, 0xf8,
+ 0xf7, 0xf9, 0xf8, 0xf7, 0xf9,
+ 0xf8, 0xf8, 0xf6, 0xf8, 0xf8,
+ 0xf7, 0xf9, 0xf9, 0xf9, 0xf8,
+ 0xf7, 0xfa, 0xf8, 0xf8, 0xf7,
+ 0xfb, 0xfa, 0xf9, 0xf8, 0xf8
+const int test_result[SIZE] = {
+ 0, 0xffffffff, 0xffffffff, 0, 0,
+ 0xffffffff, 0, 0, 0xffffffff, 0xffffffff,
+ 0, 0, 0x1, 0x1, 0,
+ 0xfffffffe, 0xffffffff, 0xfffffffe, 0, 0xfffffffc,
+ 0x1, 0x1, 0x1, 0xfffffffb, 0x2,
+ 0x2, 0x3, 0xb, 0x14, 0x14,
+ 0x16, 0x18, 0x20, 0x21, 0x26,
+ 0x27, 0x2e, 0x2f, 0x33, 0x32,
+ 0x35, 0x33, 0x36, 0x34, 0x37,
+ 0x34, 0x37, 0x35, 0x38, 0x36,
+ 0x39, 0x38, 0x3b, 0x3a, 0x3f,
+ 0x3f, 0x40, 0x3a, 0x3d, 0x3e,
+ 0x41, 0x3c, 0x3e, 0x3f, 0x42,
+ 0x3e, 0x3b, 0x37, 0x3b, 0x3e,
+ 0x41, 0x3b, 0x3b, 0x3a, 0x3b,
+ 0x36, 0x39, 0x3b, 0x3f, 0x3c,
+ 0x3b, 0x37, 0x3b, 0x3d, 0x41,
+ 0x3d, 0x3e, 0x3c, 0x3e, 0x3b,
+ 0x3a, 0x37, 0x3b, 0x3e, 0x41,
+ 0x3c, 0x3b, 0x39, 0x3a, 0x36
+adpcm_main () /* Benchmark body: resets all codec state, encodes test_data two samples at a
+ * time into compressed[], then decodes each code word back into two entries of result[]. */
+ int i, j;
+/* reset, initialize required memory */
+ reset ();
+ j = 10; /* NOTE(review): j is not read anywhere in the visible body */
+ for (i = 0; i < IN_END; i += 2)
+ {
+ compressed[i / 2] = encode (test_data[i], test_data[i + 1]); /* one code word per input pair */
+ }
+ for (i = 0; i < IN_END; i += 2)
+ {
+ decode (compressed[i / 2]); /* decode() leaves its two output samples in xout1 and xout2 */
+ result[i] = xout1;
+ result[i + 1] = xout2;
+ }
+main () /* Self-check driver: runs adpcm_main(), counts the entries of compressed[] and result[]
+ * that differ from test_compressed[] and test_result[], prints that count and returns it
+ * (0 means every value matched). */
+ int i;
+ int main_result;
+ main_result = 0;
+ adpcm_main ();
+ for (i = 0; i < IN_END / 2; i++) /* only the first IN_END / 2 code words are produced and checked */
+ {
+ if (compressed[i] != test_compressed[i])
+ {
+ main_result += 1;
+ }
+ }
+ for (i = 0; i < IN_END; i++)
+ {
+ if (result[i] != test_result[i])
+ {
+ main_result += 1;
+ }
+ }
+ printf ("%d\n", main_result); /* NOTE(review): no #include <stdio.h> is visible in this excerpt - confirm printf is declared */
+ return main_result;
+ }
diff --git a/documentation/tutorial_fpl_2022/03-optimizations/Exercise1/hint.sh b/documentation/tutorial_fpl_2022/03-optimizations/Exercise1/hint.sh
new file mode 100755
index 000000000..3bb28d85f
--- /dev/null
+++ b/documentation/tutorial_fpl_2022/03-optimizations/Exercise1/hint.sh
@@ -0,0 +1,4 @@
+# Locate adpcm.c next to this script, then synthesize and simulate it at -O0; extra arguments are forwarded to bambu and all output is mirrored to log.txt.
+abs_script=$(readlink -e "$0")
+dir_script=$(dirname "$abs_script")
+bambu "$dir_script/adpcm.c" -O0 --simulate "$@" |& tee log.txt
diff --git a/documentation/tutorial_fpl_2022/03-optimizations/Exercise1/solution/adpcm.c b/documentation/tutorial_fpl_2022/03-optimizations/Exercise1/solution/adpcm.c
new file mode 100755
index 000000000..613b1bdb5
--- /dev/null
+++ b/documentation/tutorial_fpl_2022/03-optimizations/Exercise1/solution/adpcm.c
@@ -0,0 +1,882 @@
+| CHStone : a suite of benchmark programs for C-based High-Level Synthesis |
+| ======================================================================== |
+| |
+| * Collected and Modified : Y. Hara, H. Tomiyama, S. Honda, |
+| H. Takada and K. Ishii |
+| Nagoya University, Japan |
+| |
+| * Remark : |
+| 1. This source code is modified to unify the formats of the benchmark |
+| programs in CHStone. |
+| 2. Test vectors are added for CHStone. |
+| 3. If "main_result" is 0 at the end of the program, the program is |
+| correctly executed. |
+| 4. Please follow the copyright of each benchmark program. |
+/* */
+/* SNU-RT Benchmark Suite for Worst Case Timing Analysis */
+/* ===================================================== */
+/* Collected and Modified by S.-S. Lim */
+/* sslim@archi.snu.ac.kr */
+/* Real-Time Research Group */
+/* Seoul National University */
+/* */
+/* */
+/* < Features > - restrictions for our experimental environment */
+/* */
+/* 1. Completely structured. */
+/* - There are no unconditional jumps. */
+/* - There are no exit from loop bodies. */
+/* (There are no 'break' or 'return' in loop bodies) */
+/* 2. No 'switch' statements. */
+/* 3. No 'do..while' statements. */
+/* 4. Expressions are restricted. */
+/* - There are no multiple expressions joined by 'or', */
+/* 'and' operations. */
+/* 5. No library calls. */
+/* - All the functions needed are implemented in the */
+/* source file. */
+/* */
+/* */
+/* */
+/* FILE: adpcm.c */
+/* SOURCE : C Algorithms for Real-Time DSP by P. M. Embree */
+/* */
+/* */
+/* CCITT G.722 ADPCM (Adaptive Differential Pulse Code Modulation) */
+/* algorithm. */
+/* 16 kHz sample rate data is stored in the array test_data[SIZE]. */
+/* Results are stored in the arrays compressed[SIZE] and result[SIZE]. */
+/* Execution time is determined by the constant SIZE (this header cites */
+/* a default value of 2000). */
+/* */
+/* REMARK : */
+/* */
+/* */
+/* */
+int encode (int, int);
+void decode (int);
+int filtez (int *bpl, int *dlt);
+void upzero (int dlt, int *dlti, int *bli);
+int filtep (int rlt1, int al1, int rlt2, int al2);
+int quantl (int el, int detl);
+int logscl (int il, int nbl);
+int scalel (int nbl, int shift_constant);
+int uppol2 (int al1, int al2, int plt, int plt1, int plt2);
+int uppol1 (int al1, int apl2, int plt, int plt1);
+int logsch (int ih, int nbh);
+void reset ();
+/* G722 C code */
+/* variables for the transmit quadrature mirror filter */
+int tqmf[24];
+/* QMF filter coefficients:
+scaled by a factor of 4 compared to the G.722 CCITT recommendation */
+const int h[24] = {
+ 12, -44, -44, 212, 48, -624, 128, 1448,
+ -840, -3220, 3804, 15504, 15504, 3804, -3220, -840,
+ 1448, 128, -624, 48, 212, -44, -44, 12
+int xl, xh;
+/* variables for receive quadrature mirror filter here */
+int accumc[11], accumd[11];
+/* outputs of decode() */
+int xout1, xout2;
+int xs, xd;
+/* variables for encoder (hi and lo) here */
+int il, szl, spl, sl, el;
+const int qq4_code4_table[16] = {
+ 0, -20456, -12896, -8968, -6288, -4240, -2584, -1200,
+ 20456, 12896, 8968, 6288, 4240, 2584, 1200, 0
+const int qq6_code6_table[64] = {
+ -136, -136, -136, -136, -24808, -21904, -19008, -16704,
+ -14984, -13512, -12280, -11192, -10232, -9360, -8576, -7856,
+ -7192, -6576, -6000, -5456, -4944, -4464, -4008, -3576,
+ -3168, -2776, -2400, -2032, -1688, -1360, -1040, -728,
+ 24808, 21904, 19008, 16704, 14984, 13512, 12280, 11192,
+ 10232, 9360, 8576, 7856, 7192, 6576, 6000, 5456,
+ 4944, 4464, 4008, 3576, 3168, 2776, 2400, 2032,
+ 1688, 1360, 1040, 728, 432, 136, -432, -136
+int delay_bpl[6];
+int delay_dltx[6];
+const int wl_code_table[16] = {
+ -60, 3042, 1198, 538, 334, 172, 58, -30,
+ 3042, 1198, 538, 334, 172, 58, -30, -60
+const int ilb_table[32] = {
+ 2048, 2093, 2139, 2186, 2233, 2282, 2332, 2383,
+ 2435, 2489, 2543, 2599, 2656, 2714, 2774, 2834,
+ 2896, 2960, 3025, 3091, 3158, 3228, 3298, 3371,
+ 3444, 3520, 3597, 3676, 3756, 3838, 3922, 4008
+int nbl; /* delay line */
+int al1, al2;
+int plt, plt1, plt2;
+int dlt;
+int rlt, rlt1, rlt2;
+/* decision levels - pre-multiplied by 8, 0 to indicate end */
+const int decis_levl[30] = {
+ 280, 576, 880, 1200, 1520, 1864, 2208, 2584,
+ 2960, 3376, 3784, 4240, 4696, 5200, 5712, 6288,
+ 6864, 7520, 8184, 8968, 9752, 10712, 11664, 12896,
+ 14120, 15840, 17560, 20456, 23352, 32767
+int detl;
+/* quantization table 31 long to make quantl look-up easier,
+last entry is for mil=30 case when wd is max */
+const int quant26bt_pos[31] = {
+ 61, 60, 59, 58, 57, 56, 55, 54,
+ 53, 52, 51, 50, 49, 48, 47, 46,
+ 45, 44, 43, 42, 41, 40, 39, 38,
+ 37, 36, 35, 34, 33, 32, 32
+/* quantization table 31 long to make quantl look-up easier,
+last entry is for mil=30 case when wd is max */
+const int quant26bt_neg[31] = {
+ 63, 62, 31, 30, 29, 28, 27, 26,
+ 25, 24, 23, 22, 21, 20, 19, 18,
+ 17, 16, 15, 14, 13, 12, 11, 10,
+ 9, 8, 7, 6, 5, 4, 4
+int deth;
+int sh; /* this comes from adaptive predictor */
+int eh;
+const int qq2_code2_table[4] = {
+ -7408, -1616, 7408, 1616
+const int wh_code_table[4] = {
+ 798, -214, 798, -214
+int dh, ih;
+int nbh, szh;
+int sph, ph, yh, rh;
+int delay_dhx[6];
+int delay_bph[6];
+int ah1, ah2;
+int ph1, ph2;
+int rh1, rh2;
+/* variables for decoder here */
+int ilr, rl;
+int dec_deth, dec_detl, dec_dlt;
+int dec_del_bpl[6];
+int dec_del_dltx[6];
+int dec_plt, dec_plt1, dec_plt2;
+int dec_szl, dec_spl, dec_sl;
+int dec_rlt1, dec_rlt2, dec_rlt;
+int dec_al1, dec_al2;
+int dl;
+int dec_nbl, dec_dh, dec_nbh;
+/* variables used in filtez */
+int dec_del_bph[6];
+int dec_del_dhx[6];
+int dec_szh;
+/* variables used in filtep */
+int dec_rh1, dec_rh2;
+int dec_ah1, dec_ah2;
+int dec_ph, dec_sph;
+int dec_sh;
+int dec_ph1, dec_ph2;
+/* G722 encode function two ints in, one 8 bit output */
+/* put input samples in xin1 = first value, xin2 = second value */
+/* returns il and ih stored together */
+abs (int n) /* File-local absolute value helper: returns n when n >= 0, otherwise -n.
+ * NOTE(review): -n overflows when n is INT_MIN; callers are assumed never to pass it - confirm. */
+ int m;
+ if (n >= 0)
+ m = n;
+ else
+ m = -n;
+ return m;
+encode (int xin1, int xin2)
+ int i;
+ const int *h_ptr;
+ int *tqmf_ptr, *tqmf_ptr1;
+ long int xa, xb;
+ int decis;
+/* transmit quadrature mirror filters implemented here */
+ h_ptr = h;
+ tqmf_ptr = tqmf;
+ xa = (long) (*tqmf_ptr++) * (*h_ptr++);
+ xb = (long) (*tqmf_ptr++) * (*h_ptr++);
+/* main multiply accumulate loop for samples and coefficients */
+ for (i = 0; i < 10; i++)
+ {
+ xa += (long) (*tqmf_ptr++) * (*h_ptr++);
+ xb += (long) (*tqmf_ptr++) * (*h_ptr++);
+ }
+/* final mult/accumulate */
+ xa += (long) (*tqmf_ptr++) * (*h_ptr++);
+ xb += (long) (*tqmf_ptr) * (*h_ptr++);
+/* update delay line tqmf */
+ tqmf_ptr1 = tqmf_ptr - 2;
+ for (i = 0; i < 22; i++)
+ *tqmf_ptr-- = *tqmf_ptr1--;
+ *tqmf_ptr-- = xin1;
+ *tqmf_ptr = xin2;
+/* scale outputs */
+ xl = (xa + xb) >> 15;
+ xh = (xa - xb) >> 15;
+/* end of quadrature mirror filter code */
+/* starting with lower sub band encoder */
+/* filtez - compute predictor output section - zero section */
+ szl = filtez (delay_bpl, delay_dltx);
+/* filtep - compute predictor output signal (pole section) */
+ spl = filtep (rlt1, al1, rlt2, al2);
+/* compute the predictor output value in the lower sub_band encoder */
+ sl = szl + spl;
+ el = xl - sl;
+/* quantl: quantize the difference signal */
+ il = quantl (el, detl);
+/* computes quantized difference signal */
+/* for invqbl, truncate by 2 lsbs, so mode = 3 */
+ dlt = ((long) detl * qq4_code4_table[il >> 2]) >> 15;
+/* logscl: updates logarithmic quant. scale factor in low sub band */
+ nbl = logscl (il, nbl);
+/* scalel: compute the quantizer scale factor in the lower sub band */
+/* calling parameters nbl and 8 (constant such that scalel can be scaleh) */
+ detl = scalel (nbl, 8);
+/* parrec - simple addition to compute recontructed signal for adaptive pred */
+ plt = dlt + szl;
+/* upzero: update zero section predictor coefficients (sixth order)*/
+/* calling parameters: dlt, dlt1, dlt2, ..., dlt6 from dlt */
+/* bpli (linear_buffer in which all six values are delayed */
+/* return params: updated bpli, delayed dltx */
+ upzero (dlt, delay_dltx, delay_bpl);
+/* uppol2- update second predictor coefficient apl2 and delay it as al2 */
+/* calling parameters: al1, al2, plt, plt1, plt2 */
+ al2 = uppol2 (al1, al2, plt, plt1, plt2);
+/* uppol1 :update first predictor coefficient apl1 and delay it as al1 */
+/* calling parameters: al1, apl2, plt, plt1 */
+ al1 = uppol1 (al1, al2, plt, plt1);
+/* recons : compute recontructed signal for adaptive predictor */
+ rlt = sl + dlt;
+/* done with lower sub_band encoder; now implement delays for next time*/
+ rlt2 = rlt1;
+ rlt1 = rlt;
+ plt2 = plt1;
+ plt1 = plt;
+/* high band encode */
+ szh = filtez (delay_bph, delay_dhx);
+ sph = filtep (rh1, ah1, rh2, ah2);
+/* predic: sh = sph + szh */
+ sh = sph + szh;
+/* subtra: eh = xh - sh */
+ eh = xh - sh;
+/* quanth - quantization of difference signal for higher sub-band */
+/* quanth: in-place for speed params: eh, deth (has init. value) */
+ if (eh >= 0)
+ {
+ ih = 3; /* 2,3 are pos codes */
+ }
+ else
+ {
+ ih = 1; /* 0,1 are neg codes */
+ }
+ decis = (564L * (long) deth) >> 12L;
+ if (abs (eh) > decis)
+ ih--; /* mih = 2 case */
+/* compute the quantized difference signal, higher sub-band*/
+ dh = ((long) deth * qq2_code2_table[ih]) >> 15L;
+/* logsch: update logarithmic quantizer scale factor in hi sub-band*/
+ nbh = logsch (ih, nbh);
+/* note : scalel and scaleh use same code, different parameters */
+ deth = scalel (nbh, 10);
+/* parrec - add pole predictor output to quantized diff. signal */
+ ph = dh + szh;
+/* upzero: update zero section predictor coefficients (sixth order) */
+/* calling parameters: dh, dhi, bphi */
+/* return params: updated bphi, delayed dhx */
+ upzero (dh, delay_dhx, delay_bph);
+/* uppol2: update second predictor coef aph2 and delay as ah2 */
+/* calling params: ah1, ah2, ph, ph1, ph2 */
+ ah2 = uppol2 (ah1, ah2, ph, ph1, ph2);
+/* uppol1: update first predictor coef. aph1 and delay it as ah1 */
+ ah1 = uppol1 (ah1, ah2, ph, ph1);
+/* recons for higher sub-band */
+ yh = sh + dh;
+/* done with higher sub-band encoder, now Delay for next time */
+ rh2 = rh1;
+ rh1 = yh;
+ ph2 = ph1;
+ ph1 = ph;
+/* multiplex ih and il to get signals together */
+ return (il | (ih << 6));
+/* decode function: splits the received word into the low-band code ilr
+ * (bits 0-5) and the high-band code ih (the remaining upper bits), runs both
+ * sub-band decoders, and leaves the two output samples in xout1 and xout2 */
+decode (int input)
+ int i;
+ long int xa1, xa2; /* qmf accumulators */
+ const int *h_ptr;
+ int *ac_ptr, *ac_ptr1, *ad_ptr, *ad_ptr1;
+/* split transmitted word from input into ilr and ih */
+ ilr = input & 0x3f;
+ ih = input >> 6;
+/* filtez: compute predictor output for zero section */
+ dec_szl = filtez (dec_del_bpl, dec_del_dltx);
+/* filtep: compute predictor output signal for pole section */
+ dec_spl = filtep (dec_rlt1, dec_al1, dec_rlt2, dec_al2);
+ dec_sl = dec_spl + dec_szl;
+/* compute quantized difference signal for adaptive predic */
+ dec_dlt = ((long) dec_detl * qq4_code4_table[ilr >> 2]) >> 15;
+/* compute quantized difference signal for decoder output
+ * NOTE(review): the table index used here is il, which decode never assigns
+ * (the code extracted above is ilr) - confirm this is intended */
+ dl = ((long) dec_detl * qq6_code6_table[il]) >> 15;
+ rl = dl + dec_sl;
+/* logscl: quantizer scale factor adaptation in the lower sub-band */
+ dec_nbl = logscl (ilr, dec_nbl);
+/* scalel: computes quantizer scale factor in the lower sub band */
+ dec_detl = scalel (dec_nbl, 8);
+/* parrec - add pole predictor output to quantized diff. signal */
+/* for partially reconstructed signal */
+ dec_plt = dec_dlt + dec_szl;
+/* upzero: update zero section predictor coefficients */
+ upzero (dec_dlt, dec_del_dltx, dec_del_bpl);
+/* uppol2: update second predictor coefficient apl2 and delay it as al2 */
+ dec_al2 = uppol2 (dec_al1, dec_al2, dec_plt, dec_plt1, dec_plt2);
+/* uppol1: update first predictor coef. (pole section) */
+ dec_al1 = uppol1 (dec_al1, dec_al2, dec_plt, dec_plt1);
+/* recons : compute reconstructed signal for adaptive predictor */
+ dec_rlt = dec_sl + dec_dlt;
+/* done with lower sub band decoder, implement delays for next time */
+ dec_rlt2 = dec_rlt1;
+ dec_rlt1 = dec_rlt;
+ dec_plt2 = dec_plt1;
+ dec_plt1 = dec_plt;
+/* filtez: compute predictor output for zero section */
+ dec_szh = filtez (dec_del_bph, dec_del_dhx);
+/* filtep: compute predictor output signal for pole section */
+ dec_sph = filtep (dec_rh1, dec_ah1, dec_rh2, dec_ah2);
+/* predic:compute the predictor output value in the higher sub_band decoder */
+ dec_sh = dec_sph + dec_szh;
+/* in-place compute the quantized difference signal */
+ dec_dh = ((long) dec_deth * qq2_code2_table[ih]) >> 15L;
+/* logsch: update logarithmic quantizer scale factor in hi sub band */
+ dec_nbh = logsch (ih, dec_nbh);
+/* scalel: compute the quantizer scale factor in the higher sub band */
+ dec_deth = scalel (dec_nbh, 10);
+/* parrec: compute partially reconstructed signal */
+ dec_ph = dec_dh + dec_szh;
+/* upzero: update zero section predictor coefficients */
+ upzero (dec_dh, dec_del_dhx, dec_del_bph);
+/* uppol2: update second predictor coefficient aph2 and delay it as ah2 */
+ dec_ah2 = uppol2 (dec_ah1, dec_ah2, dec_ph, dec_ph1, dec_ph2);
+/* uppol1: update first predictor coef. (pole section) */
+ dec_ah1 = uppol1 (dec_ah1, dec_ah2, dec_ph, dec_ph1);
+/* recons : compute reconstructed signal for adaptive predictor */
+ rh = dec_sh + dec_dh;
+/* done with high band decode, implementing delays for next time here */
+ dec_rh2 = dec_rh1;
+ dec_rh1 = rh;
+ dec_ph2 = dec_ph1;
+ dec_ph1 = dec_ph;
+/* end of higher sub_band decoder */
+/* end with receive quadrature mirror filters */
+ xd = rl - rh;
+ xs = rl + rh;
+/* receive quadrature mirror filters implemented here */
+ h_ptr = h;
+ ac_ptr = accumc;
+ ad_ptr = accumd;
+ xa1 = (long) xd *(*h_ptr++);
+ xa2 = (long) xs *(*h_ptr++);
+/* main multiply accumulate loop for samples and coefficients */
+ for (i = 0; i < 10; i++)
+ {
+ xa1 += (long) (*ac_ptr++) * (*h_ptr++);
+ xa2 += (long) (*ad_ptr++) * (*h_ptr++);
+ }
+/* final mult/accumulate */
+ xa1 += (long) (*ac_ptr) * (*h_ptr++);
+ xa2 += (long) (*ad_ptr) * (*h_ptr++);
+/* scale by 2^14 */
+ xout1 = xa1 >> 14;
+ xout2 = xa2 >> 14;
+/* update delay lines: move elements 0..9 of accumc and accumd up to 1..10,
+ * then store the new xd and xs in element 0 */
+ ac_ptr1 = ac_ptr - 1;
+ ad_ptr1 = ad_ptr - 1;
+ for (i = 0; i < 10; i++)
+ {
+ *ac_ptr-- = *ac_ptr1--;
+ *ad_ptr-- = *ad_ptr1--;
+ }
+ *ac_ptr = xd;
+ *ad_ptr = xs;
+/* clear all storage locations: the four scale factors get their reset values
+ * (32 and 8); every predictor state, delay line and filter buffer written
+ * below is set to 0 */
+reset ()
+ int i;
+ detl = dec_detl = 32; /* reset to min scale factor */
+ deth = dec_deth = 8;
+ nbl = al1 = al2 = plt1 = plt2 = rlt1 = rlt2 = 0;
+ nbh = ah1 = ah2 = ph1 = ph2 = rh1 = rh2 = 0;
+ dec_nbl = dec_al1 = dec_al2 = dec_plt1 = dec_plt2 = dec_rlt1 = dec_rlt2 = 0;
+ dec_nbh = dec_ah1 = dec_ah2 = dec_ph1 = dec_ph2 = dec_rh1 = dec_rh2 = 0;
+ for (i = 0; i < 6; i++)
+ {
+ delay_dltx[i] = 0;
+ delay_dhx[i] = 0;
+ dec_del_dltx[i] = 0;
+ dec_del_dhx[i] = 0;
+ }
+ for (i = 0; i < 6; i++)
+ {
+ delay_bpl[i] = 0;
+ delay_bph[i] = 0;
+ dec_del_bpl[i] = 0;
+ dec_del_bph[i] = 0;
+ }
+ for (i = 0; i < 24; i++)
+ tqmf[i] = 0; // i<23
+ for (i = 0; i < 11; i++)
+ {
+ accumc[i] = 0;
+ accumd[i] = 0;
+ }
+/* filtez - compute predictor output signal (zero section) */
+/* input: bpl1-6 and dlt1-6 (six elements are read through each pointer),
+ * output: szl = sum of the six bpl*dlt products, shifted right by 14 */
+filtez (int *bpl, int *dlt)
+ int i;
+ long int zl;
+ zl = (long) (*bpl++) * (*dlt++);
+ for (i = 1; i < 6; i++)
+ zl += (long) (*bpl++) * (*dlt++);
+ return ((int) (zl >> 14)); /* x2 here */
+/* filtep - compute predictor output signal (pole section) */
+/* input rlt1-2 and al1-2,
+ * output spl = (al1 * 2*rlt1 + al2 * 2*rlt2) >> 15 */
+filtep (int rlt1, int al1, int rlt2, int al2)
+ long int pl, pl2;
+ pl = 2 * rlt1;
+ pl = (long) al1 *pl;
+ pl2 = 2 * rlt2;
+ pl += (long) al2 *pl2;
+ return ((int) (pl >> 15));
+/* quantl - quantize the difference signal in the lower sub-band:
+ * finds the first decision level (scaled by detl) that abs(el) does not
+ * exceed and returns the matching code from quant26bt_pos (el >= 0) or
+ * quant26bt_neg (el < 0) */
+quantl (int el, int detl)
+ int ril, mil;
+ long int wd, decis;
+/* abs of difference signal */
+ wd = abs (el);
+/* determine mil based on decision levels and detl gain */
+ for (mil = 0; mil < 30; mil++)
+ {
+ decis = (decis_levl[mil] * (long) detl) >> 15L;
+ if (wd <= decis)
+ break;
+ }
+/* if mil=30 then wd is greater than all decision levels */
+ if (el >= 0)
+ ril = quant26bt_pos[mil];
+ else
+ ril = quant26bt_neg[mil];
+ return (ril);
+/* logscl - update log quantizer scale factor in lower sub-band */
+/* note that nbl is passed and returned; the result is clamped to [0, 18432] */
+logscl (int il, int nbl)
+ long int wd;
+ wd = ((long) nbl * 127L) >> 7L; /* leak factor 127/128 */
+ nbl = (int) wd + wl_code_table[il >> 2]; /* table is indexed by il with its 2 lsbs dropped */
+ if (nbl < 0)
+ nbl = 0;
+ if (nbl > 18432)
+ nbl = 18432;
+ return (nbl);
+/* scalel: compute quantizer scale factor in lower or upper sub-band:
+ * bits 6-10 of nbl select an ilb_table entry, which is shifted right by
+ * (shift_constant + 1 - (nbl >> 11)) and then left by 3 */
+scalel (int nbl, int shift_constant)
+ int wd1, wd2, wd3;
+ wd1 = (nbl >> 6) & 31;
+ wd2 = nbl >> 11;
+ wd3 = ilb_table[wd1] >> (shift_constant + 1 - wd2);
+ return (wd3 << 3);
+/* upzero - inputs: dlt, dlti[0-5], bli[0-5], outputs: updated bli[0-5] */
+/* also implements delay of bli and update of dlti from dlt */
+upzero (int dlt, int *dlti, int *bli)
+ int i, wd2, wd3;
+/* if dlt is zero, then no sum into bli: only the leak is applied */
+ if (dlt == 0)
+ {
+ for (i = 0; i < 6; i++)
+ {
+ bli[i] = (int) ((255L * bli[i]) >> 8L); /* leak factor of 255/256 */
+ }
+ }
+ else
+ {
+ for (i = 0; i < 6; i++)
+ {
+ if ((long) dlt * dlti[i] >= 0) /* non-negative product: step of +128, otherwise -128 */
+ wd2 = 128;
+ else
+ wd2 = -128;
+ wd3 = (int) ((255L * bli[i]) >> 8L); /* leak factor of 255/256 */
+ bli[i] = wd2 + wd3;
+ }
+ }
+/* implement delay line for dlt: shift dlti[0-4] into dlti[1-5], newest value in dlti[0] */
+ dlti[5] = dlti[4];
+ dlti[4] = dlti[3];
+ dlti[3] = dlti[2];
+ dlti[2] = dlti[1];
+ dlti[1] = dlti[0];
+ dlti[0] = dlt;
+/* uppol2 - update second predictor coefficient (pole section) */
+/* inputs: al1, al2, plt, plt1, plt2. outputs: apl2, clamped to [-12288, 12288] */
+uppol2 (int al1, int al2, int plt, int plt1, int plt2)
+ long int wd2, wd4;
+ int apl2;
+ wd2 = 4L * (long) al1;
+ if ((long) plt * plt1 >= 0L)
+ wd2 = -wd2; /* check same sign: negate when the plt*plt1 product is non-negative */
+ wd2 = wd2 >> 7; /* gain of 1/128 */
+ if ((long) plt * plt2 >= 0L)
+ {
+ wd4 = wd2 + 128; /* same sign case */
+ }
+ else
+ {
+ wd4 = wd2 - 128;
+ }
+ apl2 = wd4 + (127L * (long) al2 >> 7L); /* leak factor of 127/128 */
+/* apl2 is limited to +-.75 (+-12288) */
+ if (apl2 > 12288)
+ apl2 = 12288;
+ if (apl2 < -12288)
+ apl2 = -12288;
+ return (apl2);
+/* uppol1 - update first predictor coefficient (pole section) */
+/* inputs: al1, apl2, plt, plt1. outputs: apl1, clamped to +-(15360 - apl2) */
+uppol1 (int al1, int apl2, int plt, int plt1)
+ long int wd2;
+ int wd3, apl1;
+ wd2 = ((long) al1 * 255L) >> 8L; /* leak factor of 255/256 */
+ if ((long) plt * plt1 >= 0L)
+ {
+ apl1 = (int) wd2 + 192; /* same sign case */
+ }
+ else
+ {
+ apl1 = (int) wd2 - 192;
+ }
+/* note: wd3= .9375-.75 is always positive */
+ wd3 = 15360 - apl2; /* limit value */
+ if (apl1 > wd3)
+ apl1 = wd3;
+ if (apl1 < -wd3)
+ apl1 = -wd3;
+ return (apl1);
+/* logsch - update log quantizer scale factor in higher sub-band */
+/* note that nbh is passed and returned; the result is clamped to [0, 22528] */
+logsch (int ih, int nbh)
+ int wd;
+ wd = ((long) nbh * 127L) >> 7L; /* leak factor 127/128 */
+ nbh = wd + wh_code_table[ih]; /* table is indexed directly by ih */
+ if (nbh < 0)
+ nbh = 0;
+ if (nbh > 22528)
+ nbh = 22528;
+ return (nbh);
+| * Test Vectors (added for CHStone) |
+| test_data : input data |
+| test_compressed : expected output data for "encode" |
+| test_result : expected output data for "decode" |
+#define SIZE 100
+#define IN_END 100
+const int test_data[SIZE] = {
+ 0x44, 0x44, 0x44, 0x44, 0x44,
+ 0x44, 0x44, 0x44, 0x44, 0x44,
+ 0x44, 0x44, 0x44, 0x44, 0x44,
+ 0x44, 0x44, 0x43, 0x43, 0x43,
+ 0x43, 0x43, 0x43, 0x43, 0x42,
+ 0x42, 0x42, 0x42, 0x42, 0x42,
+ 0x41, 0x41, 0x41, 0x41, 0x41,
+ 0x40, 0x40, 0x40, 0x40, 0x40,
+ 0x40, 0x40, 0x40, 0x3f, 0x3f,
+ 0x3f, 0x3f, 0x3f, 0x3e, 0x3e,
+ 0x3e, 0x3e, 0x3e, 0x3e, 0x3d,
+ 0x3d, 0x3d, 0x3d, 0x3d, 0x3d,
+ 0x3c, 0x3c, 0x3c, 0x3c, 0x3c,
+ 0x3c, 0x3c, 0x3c, 0x3c, 0x3b,
+ 0x3b, 0x3b, 0x3b, 0x3b, 0x3b,
+ 0x3b, 0x3b, 0x3b, 0x3b, 0x3b,
+ 0x3b, 0x3b, 0x3b, 0x3b, 0x3b,
+ 0x3b, 0x3b, 0x3b, 0x3b, 0x3b,
+ 0x3b, 0x3b, 0x3c, 0x3c, 0x3c,
+ 0x3c, 0x3c, 0x3c, 0x3c, 0x3c
+int compressed[SIZE], result[SIZE];
+const int test_compressed[SIZE] = {
+ 0xfd, 0xde, 0x77, 0xba, 0xf2,
+ 0x90, 0x20, 0xa0, 0xec, 0xed,
+ 0xef, 0xf1, 0xf3, 0xf4, 0xf5,
+ 0xf5, 0xf5, 0xf5, 0xf6, 0xf6,
+ 0xf6, 0xf7, 0xf8, 0xf7, 0xf8,
+ 0xf7, 0xf9, 0xf8, 0xf7, 0xf9,
+ 0xf8, 0xf8, 0xf6, 0xf8, 0xf8,
+ 0xf7, 0xf9, 0xf9, 0xf9, 0xf8,
+ 0xf7, 0xfa, 0xf8, 0xf8, 0xf7,
+ 0xfb, 0xfa, 0xf9, 0xf8, 0xf8
+const int test_result[SIZE] = {
+ 0, 0xffffffff, 0xffffffff, 0, 0,
+ 0xffffffff, 0, 0, 0xffffffff, 0xffffffff,
+ 0, 0, 0x1, 0x1, 0,
+ 0xfffffffe, 0xffffffff, 0xfffffffe, 0, 0xfffffffc,
+ 0x1, 0x1, 0x1, 0xfffffffb, 0x2,
+ 0x2, 0x3, 0xb, 0x14, 0x14,
+ 0x16, 0x18, 0x20, 0x21, 0x26,
+ 0x27, 0x2e, 0x2f, 0x33, 0x32,
+ 0x35, 0x33, 0x36, 0x34, 0x37,
+ 0x34, 0x37, 0x35, 0x38, 0x36,
+ 0x39, 0x38, 0x3b, 0x3a, 0x3f,
+ 0x3f, 0x40, 0x3a, 0x3d, 0x3e,
+ 0x41, 0x3c, 0x3e, 0x3f, 0x42,
+ 0x3e, 0x3b, 0x37, 0x3b, 0x3e,
+ 0x41, 0x3b, 0x3b, 0x3a, 0x3b,
+ 0x36, 0x39, 0x3b, 0x3f, 0x3c,
+ 0x3b, 0x37, 0x3b, 0x3d, 0x41,
+ 0x3d, 0x3e, 0x3c, 0x3e, 0x3b,
+ 0x3a, 0x37, 0x3b, 0x3e, 0x41,
+ 0x3c, 0x3b, 0x39, 0x3a, 0x36
+adpcm_main ()
+ int i, j;
+/* reset, initialize required memory */
+ reset ();
+ j = 10; /* j is not read again in this function */
+ for (i = 0; i < IN_END; i += 2) /* encode each pair of input samples into one word of compressed[] */
+ {
+ compressed[i / 2] = encode (test_data[i], test_data[i + 1]);
+ }
+ for (i = 0; i < IN_END; i += 2) /* decode each word back into two samples of result[] */
+ {
+ decode (compressed[i / 2]);
+ result[i] = xout1;
+ result[i + 1] = xout2;
+ }
+main ()
+ int i;
+ int main_result; /* number of mismatches against the expected vectors */
+ main_result = 0;
+ adpcm_main ();
+ for (i = 0; i < IN_END / 2; i++) /* check the encoder output */
+ {
+ if (compressed[i] != test_compressed[i])
+ {
+ main_result += 1;
+ }
+ }
+ for (i = 0; i < IN_END; i++) /* check the decoder output */
+ {
+ if (result[i] != test_result[i])
+ {
+ main_result += 1;
+ }
+ }
+ printf ("%d\n", main_result);
+ return main_result; /* 0 when every value matched */
+ }
diff --git a/documentation/tutorial_fpl_2022/03-optimizations/Exercise1/solution/adpcm.csv b/documentation/tutorial_fpl_2022/03-optimizations/Exercise1/solution/adpcm.csv
new file mode 100644
index 000000000..7139cb2f5
--- /dev/null
+++ b/documentation/tutorial_fpl_2022/03-optimizations/Exercise1/solution/adpcm.csv
@@ -0,0 +1,25 @@
+Benchmark, CYCLES, HLS_execution_time,
+CLANG11:adpcm_O0:main_0, 23643,43.0299999999999999989,
+CLANG11:adpcm_O1:main_0, 23643,43.8199999999999999997,
+CLANG11:adpcm_O2:main_0, 9651,61.9900000000000000015,
+CLANG11:adpcm_O3:main_0, 8855,70.1500000000000000014,
+CLANG11:adpcm_Os:main_0, 21593,49.4599999999999999992,
+CLANG6:adpcm_O0:main_0, 23393,44.1500000000000000014,
+CLANG6:adpcm_O1:main_0, 23393,42.9500000000000000007,
+CLANG6:adpcm_O2:main_0, 17392,62.4700000000000000011,
+CLANG6:adpcm_O3:main_0, 17392,62.5200000000000000004,
+CLANG6:adpcm_Os:main_0, 21543,54.5099999999999999985,
+GCC49:adpcm_O0:main_0, 33429,23.0499999999999999993,
+GCC49:adpcm_O1:main_0, 24547,18.7199999999999999994,
+GCC49:adpcm_O2:main_0, 24043,43.2599999999999999985,
+GCC49:adpcm_O3:main_0, 10429,76.4499999999999999972,
+GCC49:adpcm_O3_inline:main_0, 7503,99.5800000000000000017,
+GCC49:adpcm_O3_vectorize:main_0, 6995,49.3100000000000000012,
+GCC49:adpcm_Os:main_0, 24847,25.2099999999999999992,
+GCC7:adpcm_O0:main_0, 34015,15.8599999999999999997,
+GCC7:adpcm_O1:main_0, 24933,16.9500000000000000007,
+GCC7:adpcm_O2:main_0, 22526,40.7200000000000000011,
+GCC7:adpcm_O3:main_0, 8345,51.0699999999999999997,
+GCC7:adpcm_O3_inline:main_0, 5441,59.3400000000000000001,
+GCC7:adpcm_O3_vectorize:main_0, 8765,32.6100000000000000006,
+GCC7:adpcm_Os:main_0, 25033,27.2199999999999999994,
diff --git a/documentation/tutorial_fpl_2022/03-optimizations/Exercise1/solution/adpcm_sdc.csv b/documentation/tutorial_fpl_2022/03-optimizations/Exercise1/solution/adpcm_sdc.csv
new file mode 100644
index 000000000..fdf820d8c
--- /dev/null
+++ b/documentation/tutorial_fpl_2022/03-optimizations/Exercise1/solution/adpcm_sdc.csv
@@ -0,0 +1,28 @@
+Benchmark, CYCLES, HLS_execution_time,
+CLANG11:adpcm_O0:main_0, 23693,92.9400000000000000022,
+CLANG11:adpcm_O1:main_0, 23693,91.7799999999999999989,
+CLANG11:adpcm_O2:main_0, 10071,147.809999999999999998,
+CLANG11:adpcm_O3:main_0, 8719,183.690000000000000002,
+CLANG11:adpcm_O3_inline:main_0, 8719,183.960000000000000006,
+CLANG11:adpcm_Os:main_0, 22063,97.5100000000000000019,
+CLANG6:adpcm_O0:main_0, 23443,91.2099999999999999992,
+CLANG6:adpcm_O1:main_0, 23443,93.6999999999999999972,
+CLANG6:adpcm_O2:main_0, 17804,124.870000000000000002,
+CLANG6:adpcm_O3:main_0, 17804,129.610000000000000001,
+CLANG6:adpcm_O3_inline:main_0, 17804,127.849999999999999999,
+CLANG6:adpcm_O3_vectorize:main_0, 17804,126.870000000000000002,
+CLANG6:adpcm_Os:main_0, 22013,105.059999999999999998,
+GCC49:adpcm_O0:main_0, 33479,64.3799999999999999975,
+GCC49:adpcm_O1:main_0, 24297,57.0900000000000000001,
+GCC49:adpcm_O2:main_0, 22863,83.5299999999999999989,
+GCC49:adpcm_O3:main_0, 9149,175.929999999999999993,
+GCC49:adpcm_O3_inline:main_0, 5356,210.619999999999999996,
+GCC49:adpcm_O3_vectorize:main_0, 6135,110.809999999999999998,
+GCC49:adpcm_Os:main_0, 24397,68.4499999999999999972,
+GCC7:adpcm_O0:main_0, 32979,46.5,
+GCC7:adpcm_O1:main_0, 24297,47.0299999999999999989,
+GCC7:adpcm_O2:main_0, 21513,80.2399999999999999981,
+GCC7:adpcm_O3:main_0, 7653,152.220000000000000001,
+GCC7:adpcm_O3_inline:main_0, 5003,136.25,
+GCC7:adpcm_O3_vectorize:main_0, 8235,97.4300000000000000003,
+GCC7:adpcm_Os:main_0, 24397,58.2000000000000000007,
diff --git a/documentation/tutorial_fpl_2022/03-optimizations/Exercise1/solution/list b/documentation/tutorial_fpl_2022/03-optimizations/Exercise1/solution/list
new file mode 100644
index 000000000..9a26f25da
--- /dev/null
+++ b/documentation/tutorial_fpl_2022/03-optimizations/Exercise1/solution/list
@@ -0,0 +1,7 @@
+adpcm.c --benchmark-name=adpcm_O0 -O0
+adpcm.c --benchmark-name=adpcm_O1 -O1
+adpcm.c --benchmark-name=adpcm_O2 -O2
+adpcm.c --benchmark-name=adpcm_O3 -O3
+adpcm.c --benchmark-name=adpcm_O3_inline -O3 -finline-limit=1000000
+adpcm.c --benchmark-name=adpcm_O3_vectorize -O3 -ftree-vectorize
+adpcm.c --benchmark-name=adpcm_Os -Os
diff --git a/documentation/tutorial_fpl_2022/03-optimizations/Exercise1/solution/synthesize.sh b/documentation/tutorial_fpl_2022/03-optimizations/Exercise1/solution/synthesize.sh
new file mode 100755
index 000000000..8d9bc09cc
--- /dev/null
+++ b/documentation/tutorial_fpl_2022/03-optimizations/Exercise1/solution/synthesize.sh
@@ -0,0 +1,9 @@
+abs_script=$(readlink -e "$0") # canonical path of this script; quoted so paths with spaces survive
+dir_script=$(dirname "$abs_script") # directory that holds this script and its benchmark list
+"$dir_script"/../../test_panda.py --tool=bambu --bambu=bambu --spider=spider \
+ --args="--configuration-name=GCC49 --compiler=I386_GCC49" \
+ --args="--configuration-name=GCC7 --compiler=I386_GCC7" \
+ --args="--configuration-name=CLANG6 --compiler=I386_CLANG6" \
+ --args="--configuration-name=CLANG11 --compiler=I386_CLANG11" \
+ -c=--simulate -b"$dir_script" -l"$dir_script/list" "$@"
diff --git a/documentation/tutorial_fpl_2022/03-optimizations/Exercise3/README b/documentation/tutorial_fpl_2022/03-optimizations/Exercise3/README
new file mode 100644
index 000000000..cb138e2fb
--- /dev/null
+++ b/documentation/tutorial_fpl_2022/03-optimizations/Exercise3/README
@@ -0,0 +1 @@
+Evaluate how using different integer division implementations affects the number of cycles of the dfdiv algorithm when targeting Zynq at 66MHz
diff --git a/documentation/tutorial_fpl_2022/03-optimizations/Exercise3/SPARC-GCC.h b/documentation/tutorial_fpl_2022/03-optimizations/Exercise3/SPARC-GCC.h
new file mode 100755
index 000000000..523e274f6
--- /dev/null
+++ b/documentation/tutorial_fpl_2022/03-optimizations/Exercise3/SPARC-GCC.h
@@ -0,0 +1,88 @@
+| CHStone : a suite of benchmark programs for C-based High-Level Synthesis |
+| ======================================================================== |
+| |
+| * Collected and Modified : Y. Hara, H. Tomiyama, S. Honda, |
+| H. Takada and K. Ishii |
+| Nagoya University, Japan |
+| |
+| * Remark : |
+| 1. This source code is modified to unify the formats of the benchmark |
+| programs in CHStone. |
+| 2. Test vectors are added for CHStone. |
+| 3. If "main_result" is 0 at the end of the program, the program is |
+| correctly executed. |
+| 4. Please follow the copyright of each benchmark program. |
+This C header file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
+Package, Release 2b.
+Written by John R. Hauser. This work was made possible in part by the
+International Computer Science Institute, located at Suite 600, 1947 Center
+Street, Berkeley, California 94704. Funding was partially provided by the
+National Science Foundation under grant MIP-9311980. The original version
+of this code was written as part of a project to build a fixed-point vector
+processor in collaboration with the University of California at Berkeley,
+overseen by Profs. Nelson Morgan and John Wawrzynek. More information
+is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
+THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has
+INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
+Derivative works are acceptable, even for commercial purposes, so long as
+(1) the source code for the derivative work includes prominent notice that
+the work is derivative, and (2) the source code includes prominent notice with
+these four paragraphs for those parts of this code that are retained.
+| Each of the following `typedef's defines the most convenient type that holds
+| integers of at least as many bits as specified. For example, `uint8' should
+| be the most convenient type that can hold unsigned integers of as many as
+| 8 bits. The `flag' type must be able to hold either a 0 or 1. For most
+| implementations of C, `flag', `uint8', and `int8' should all be `typedef'ed
+| to the same as `int'.
+typedef int flag;
+typedef int int8;
+typedef int int16;
+| Each of the following `typedef's defines a type that holds integers
+| of _exactly_ the number of bits specified. For instance, for most
+| implementation of C, `bits16' and `sbits16' should be `typedef'ed to
+| `unsigned short int' and `signed short int' (or `short int'), respectively.
+typedef unsigned short int bits16;
+typedef unsigned int bits32;
+typedef unsigned long long int bits64;
+typedef signed long long int sbits64;
+| The `LIT64' macro takes as its argument a textual integer literal and
+| if necessary ``marks'' the literal as having a 64-bit integer type.
+| For example, the GNU C Compiler (`gcc') requires that 64-bit literals be
+| appended with the letters `LL' standing for `long long', which is `gcc's
+| name for the 64-bit integer type. Some compilers may allow `LIT64' to be
+| defined as the identity macro: `#define LIT64( a ) a'.
+#define LIT64( a ) a##LL
+| The macro `INLINE' can be used before functions that should be inlined. If
+| a compiler does not support explicit inlining, this macro should be defined
+| to be `static'.
+#define INLINE
diff --git a/documentation/tutorial_fpl_2022/03-optimizations/Exercise3/dfdiv.c b/documentation/tutorial_fpl_2022/03-optimizations/Exercise3/dfdiv.c
new file mode 100755
index 000000000..7fd9823bd
--- /dev/null
+++ b/documentation/tutorial_fpl_2022/03-optimizations/Exercise3/dfdiv.c
@@ -0,0 +1,159 @@
+| CHStone : a suite of benchmark programs for C-based High-Level Synthesis |
+| ======================================================================== |
+| |
+| * Collected and Modified : Y. Hara, H. Tomiyama, S. Honda, |
+| H. Takada and K. Ishii |
+| Nagoya University, Japan |
+| |
+| * Remark : |
+| 1. This source code is modified to unify the formats of the benchmark |
+| programs in CHStone. |
+| 2. Test vectors are added for CHStone. |
+| 3. If "main_result" is 0 at the end of the program, the program is |
+| correctly executed. |
+| 4. Please follow the copyright of each benchmark program. |
+ * Copyright (C) 2008
+ * Y. Hara, H. Tomiyama, S. Honda, H. Takada and K. Ishii
+ * Nagoya University, Japan
+ * All rights reserved.
+ *
+ * Disclaimer of Warranty
+ *
+ * These software programs are available to the user without any license fee or
+ * royalty on an "as is" basis. The authors disclaims any and all warranties,
+ * whether express, implied, or statuary, including any implied warranties or
+ * merchantability or of fitness for a particular purpose. In no event shall the
+ * copyright-holder be liable for any incidental, punitive, or consequential damages
+ * of any kind whatsoever arising from the use of these programs. This disclaimer
+ * of warranty extends to the user of these programs and user's customers, employees,
+ * agents, transferees, successors, and assigns.
+ *
+ */
+#include "softfloat.c"
+ullong_to_double (unsigned long long x)
+ union /* overlays a double and an unsigned long long on the same storage */
+ {
+ double d;
+ unsigned long long ll;
+ } t;
+ t.ll = x; /* store the raw 64-bit pattern */
+ return t.d; /* read the same bits back as a double */
+| * Test Vectors (added for CHStone) |
+| a_input, b_input : input data |
+| z_output : expected output data |
+#define N 22
+const float64 a_input[N] = {
+ 0x7FFF000000000000ULL, /* nan */
+ 0x7FF0000000000000ULL, /* inf */
+ 0x7FF0000000000000ULL, /* inf */
+ 0x7FF0000000000000ULL, /* inf */
+ 0x3FF0000000000000ULL, /* 1.0 */
+ 0x3FF0000000000000ULL, /* 1.0 */
+ 0x0000000000000000ULL, /* 0.0 */
+ 0x3FF0000000000000ULL, /* 1.0 */
+ 0x0000000000000000ULL, /* 0.0 */
+ 0x8000000000000000ULL, /* -0.0 */
+ 0x4008000000000000ULL, /* 3.0 */
+ 0xC008000000000000ULL, /* -3.0 */
+ 0x4008000000000000ULL, /* 3.0 */
+ 0xC008000000000000ULL, /* -3.0 */
+ 0x4000000000000000ULL, /* 2.0 */
+ 0xC000000000000000ULL, /* -2.0 */
+ 0x4000000000000000ULL, /* 2.0 */
+ 0xC000000000000000ULL, /* -2.0 */
+ 0x3FF0000000000000ULL, /* 1.0 */
+ 0xBFF0000000000000ULL, /* -1.0 */
+ 0x3FF0000000000000ULL, /* 1.0 */
+ 0xBFF0000000000000ULL /* -1.0 */
+const float64 b_input[N] = {
+ 0x3FF0000000000000ULL, /* 1.0 */
+ 0x7FF8000000000000ULL, /* nan */
+ 0x7FF0000000000000ULL, /* inf */
+ 0x3FF0000000000000ULL, /* 1.0 */
+ 0x7FF8000000000000ULL, /* nan */
+ 0x7FF0000000000000ULL, /* inf */
+ 0x0000000000000000ULL, /* 0.0 */
+ 0x0000000000000000ULL, /* 0.0 */
+ 0x3FF0000000000000ULL, /* 1.0 */
+ 0x3FF0000000000000ULL, /* 1.0 */
+ 0x4000000000000000ULL, /* 2.0 */
+ 0x4000000000000000ULL, /* 2.0 */
+ 0xC000000000000000ULL, /* -2.0 */
+ 0xC000000000000000ULL, /* -2.0 */
+ 0x4010000000000000ULL, /* 4.0 */
+ 0x4010000000000000ULL, /* 4.0 */
+ 0xC010000000000000ULL, /* -4.0 */
+ 0xC010000000000000ULL, /* -4.0 */
+ 0x3FF8000000000000ULL, /* 1.5 */
+ 0x3FF8000000000000ULL, /* 1.5 */
+ 0xBFF8000000000000ULL, /* -1.5 */
+ 0xBFF8000000000000ULL /* -1.5 */
+const float64 z_output[N] = {
+ 0x7FFF000000000000ULL, /* nan */
+ 0x7FF8000000000000ULL, /* nan */
+ 0x7FF0000000000000ULL, /* inf */
+ 0x7FF8000000000000ULL, /* nan */
+ 0x0000000000000000ULL, /* 0.0 */
+ 0x7FF0000000000000ULL, /* inf */
+ 0x0000000000000000ULL, /* 0.0 */
+ 0x8000000000000000ULL, /* -0.0 */
+ 0x3FF8000000000000ULL, /* 1.5 */
+ 0xBFF8000000000000ULL, /* -1.5 */
+ 0xBFF8000000000000ULL, /* -1.5 */
+ 0x3FF8000000000000ULL, /* 1.5 */
+ 0x3FE0000000000000ULL, /* 0.5 */
+ 0xBFE0000000000000ULL, /* -0.5 */
+ 0xBFE0000000000000ULL, /* -0.5 */
+ 0x3FE0000000000000ULL, /* 0.5 */
+ 0x3FE5555555555555ULL, /* 0.666667 */
+ 0xBFE5555555555555ULL, /* -0.666667 */
+ 0xBFE5555555555555ULL, /* -0.666667 */
+ 0x3FE5555555555555ULL /* 0.666667 */
+main ()
+ int main_result; /* number of results that differ from z_output */
+ int i;
+ float64 x1, x2;
+ main_result = 0;
+ for (i = 0; i < N; i++)
+ {
+ float64 result;
+ x1 = a_input[i];
+ x2 = b_input[i];
+ result = float64_div (x1, x2);
+ main_result += (result != z_output[i]); /* compares the raw float64 values */
+ printf
+ ("a_input=%016llx b_input=%016llx expected=%016llx output=%016llx (%lf)\n",
+ a_input[i], b_input[i], z_output[i], result,
+ ullong_to_double (result));
+ }
+ printf ("%d\n", main_result);
+ return main_result; /* 0 when every quotient matched */
+ }
diff --git a/documentation/tutorial_fpl_2022/03-optimizations/Exercise3/hint.txt b/documentation/tutorial_fpl_2022/03-optimizations/Exercise3/hint.txt
new file mode 100644
index 000000000..c68ac9a75
--- /dev/null
+++ b/documentation/tutorial_fpl_2022/03-optimizations/Exercise3/hint.txt
@@ -0,0 +1 @@
diff --git a/documentation/tutorial_fpl_2022/03-optimizations/Exercise3/milieu.h b/documentation/tutorial_fpl_2022/03-optimizations/Exercise3/milieu.h
new file mode 100755
index 000000000..4d92d5e05
--- /dev/null
+++ b/documentation/tutorial_fpl_2022/03-optimizations/Exercise3/milieu.h
@@ -0,0 +1,53 @@
+| CHStone : a suite of benchmark programs for C-based High-Level Synthesis |
+| ======================================================================== |
+| |
+| * Collected and Modified : Y. Hara, H. Tomiyama, S. Honda, |
+| H. Takada and K. Ishii |
+| Nagoya University, Japan |
+| |
+| * Remark : |
+| 1. This source code is modified to unify the formats of the benchmark |
+| programs in CHStone. |
+| 2. Test vectors are added for CHStone. |
+| 3. If "main_result" is 0 at the end of the program, the program is |
+| correctly executed. |
+| 4. Please follow the copyright of each benchmark program. |
+This C header file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
+Package, Release 2b.
+Written by John R. Hauser. This work was made possible in part by the
+International Computer Science Institute, located at Suite 600, 1947 Center
+Street, Berkeley, California 94704. Funding was partially provided by the
+National Science Foundation under grant MIP-9311980. The original version
+of this code was written as part of a project to build a fixed-point vector
+processor in collaboration with the University of California at Berkeley,
+overseen by Profs. Nelson Morgan and John Wawrzynek. More information
+is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
+THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has
+INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
+Derivative works are acceptable, even for commercial purposes, so long as
+(1) the source code for the derivative work includes prominent notice that
+the work is derivative, and (2) the source code includes prominent notice with
+these four paragraphs for those parts of this code that are retained.
+| Include common integer types and flags.
+#include "SPARC-GCC.h"
diff --git a/documentation/tutorial_fpl_2022/03-optimizations/Exercise3/softfloat-macros b/documentation/tutorial_fpl_2022/03-optimizations/Exercise3/softfloat-macros
new file mode 100755
index 000000000..a735f741e
--- /dev/null
+++ b/documentation/tutorial_fpl_2022/03-optimizations/Exercise3/softfloat-macros
@@ -0,0 +1,247 @@
+| CHStone : a suite of benchmark programs for C-based High-Level Synthesis |
+| ======================================================================== |
+| |
+| * Collected and Modified : Y. Hara, H. Tomiyama, S. Honda, |
+| H. Takada and K. Ishii |
+| Nagoya University, Japan |
+| |
+| * Remark : |
+| 1. This source code is modified to unify the formats of the benchmark |
+| programs in CHStone. |
+| 2. Test vectors are added for CHStone. |
+| 3. If "main_result" is 0 at the end of the program, the program is |
+| correctly executed. |
+| 4. Please follow the copyright of each benchmark program. |
+This C source fragment is part of the SoftFloat IEC/IEEE Floating-point
+Arithmetic Package, Release 2b.
+Written by John R. Hauser. This work was made possible in part by the
+International Computer Science Institute, located at Suite 600, 1947 Center
+Street, Berkeley, California 94704. Funding was partially provided by the
+National Science Foundation under grant MIP-9311980. The original version
+of this code was written as part of a project to build a fixed-point vector
+processor in collaboration with the University of California at Berkeley,
+overseen by Profs. Nelson Morgan and John Wawrzynek. More information
+is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
+THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has
+INSTITUTE (possibly via similar legal notice) AGAINST ALL LOSSES, COSTS, OR
+Derivative works are acceptable, even for commercial purposes, so long as
+(1) the source code for the derivative work includes prominent notice that
+the work is derivative, and (2) the source code includes prominent notice with
+these four paragraphs for those parts of this code that are retained.
+| Shifts `a' right by the number of bits given in `count'. If any nonzero
+| bits are shifted off, they are ``jammed'' into the least significant bit of
+| the result by setting the least significant bit to 1. The value of `count'
+| can be arbitrarily large; in particular, if `count' is 64 or greater, the
+| result will be either 0 or 1, depending on whether `a' is zero or nonzero.
+| The result is stored in the location pointed to by `zPtr'.
+INLINE void
+shift64RightJamming (bits64 a, int16 count, bits64 * zPtr)
+ bits64 z;
+ if (count == 0)
+ {
+ /* No shift: nothing is discarded, so no sticky bit is needed. */
+ z = a;
+ }
+ else if (count < 64)
+ {
+ /* For count in 1..63, (-count) & 63 equals 64 - count, so the left shift */
+ /* keeps exactly the bits that the right shift throws away; their "!= 0" */
+ /* test is OR-ed into bit 0 as the sticky ("jammed") bit. */
+ /* NOTE(review): a negative count would also reach this branch; the */
+ /* caller visible in roundAndPackFloat64 passes -zExp with zExp < 0. */
+ z = (a >> count) | ((a << ((-count) & 63)) != 0);
+ }
+ else
+ {
+ /* Every bit of `a' is shifted out: only the sticky bit remains. */
+ z = (a != 0);
+ }
+ *zPtr = z;
+| Adds the 128-bit value formed by concatenating `a0' and `a1' to the 128-bit
+| value formed by concatenating `b0' and `b1'. Addition is modulo 2^128, so
+| any carry out is lost. The result is broken into two 64-bit pieces which
+| are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
+INLINE void
+add128 (bits64 a0, bits64 a1, bits64 b0, bits64 b1, bits64 * z0Ptr,
+ bits64 * z1Ptr)
+ bits64 z1;
+ /* Low halves first; bits64 is unsigned, so the sum wraps modulo 2^64. */
+ z1 = a1 + b1;
+ *z1Ptr = z1;
+ /* (z1 < a1) is 1 exactly when the low sum wrapped: the carry into the */
+ /* high half. */
+ *z0Ptr = a0 + b0 + (z1 < a1);
+| Subtracts the 128-bit value formed by concatenating `b0' and `b1' from the
+| 128-bit value formed by concatenating `a0' and `a1'. Subtraction is modulo
+| 2^128, so any borrow out (carry out) is lost. The result is broken into two
+| 64-bit pieces which are stored at the locations pointed to by `z0Ptr' and
+| `z1Ptr'.
+INLINE void
+sub128 (bits64 a0, bits64 a1, bits64 b0, bits64 b1, bits64 * z0Ptr,
+ bits64 * z1Ptr)
+ /* Low halves: unsigned subtraction wraps modulo 2^64. */
+ *z1Ptr = a1 - b1;
+ /* (a1 < b1) is the borrow out of the low half. a1 and b1 are by-value */
+ /* copies, so the store above cannot have changed them. */
+ *z0Ptr = a0 - b0 - (a1 < b1);
+| Multiplies `a' by `b' to obtain a 128-bit product. The product is broken
+| into two 64-bit pieces which are stored at the locations pointed to by
+| `z0Ptr' and `z1Ptr'.
+INLINE void
+mul64To128 (bits64 a, bits64 b, bits64 * z0Ptr, bits64 * z1Ptr)
+ bits32 aHigh, aLow, bHigh, bLow;
+ bits64 z0, zMiddleA, zMiddleB, z1;
+ /* Split both operands into 32-bit halves; assigning a bits64 to a bits32 */
+ /* keeps the low 32 bits. */
+ aLow = a;
+ aHigh = a >> 32;
+ bLow = b;
+ bHigh = b >> 32;
+ /* Four 32x32 -> 64-bit partial products (schoolbook multiplication). */
+ z1 = ((bits64) aLow) * bLow;
+ zMiddleA = ((bits64) aLow) * bHigh;
+ zMiddleB = ((bits64) aHigh) * bLow;
+ z0 = ((bits64) aHigh) * bHigh;
+ /* Add the two cross products. (zMiddleA < zMiddleB) detects a wrap of */
+ /* that 64-bit sum; the cross terms have weight 2^32, so the lost carry */
+ /* belongs at bit 32 of the high word. */
+ zMiddleA += zMiddleB;
+ z0 += (((bits64) (zMiddleA < zMiddleB)) << 32) + (zMiddleA >> 32);
+ /* The low 32 bits of the cross sum land in the upper half of the low */
+ /* word; (z1 < zMiddleA) is the carry from that addition into z0. */
+ zMiddleA <<= 32;
+ z1 += zMiddleA;
+ z0 += (z1 < zMiddleA);
+ *z1Ptr = z1;
+ *z0Ptr = z0;
+| Returns an approximation to the 64-bit integer quotient obtained by dividing
+| `b' into the 128-bit value formed by concatenating `a0' and `a1'. The
+| divisor `b' must be at least 2^63. If q is the exact quotient truncated
+| toward zero, the approximation returned lies between q and q + 2 inclusive.
+| If the exact quotient q is larger than 64 bits, the maximum positive 64-bit
+| unsigned integer is returned.
+static bits64
+estimateDiv128To64 (bits64 a0, bits64 a1, bits64 b)
+ bits64 b0, b1;
+ bits64 rem0, rem1, term0, term1;
+ bits64 z;
+ /* a0 >= b means the quotient of (a0:a1) / b does not fit in 64 bits: */
+ /* saturate to the largest 64-bit value, as the contract above promises. */
+ /* Without this early return b0 would only be assigned on the overflow */
+ /* path and be read uninitialised on every normal call. */
+ if (b <= a0)
+ return LIT64 (0xFFFFFFFFFFFFFFFF);
+ /* b0 is the upper 32 bits of the divisor. The documented precondition */
+ /* b >= 2^63 makes it nonzero, so the divisions by b0 below are safe. */
+ b0 = b >> 32;
+ /* Estimate the upper 32 quotient bits from a0 / b0, clamping when that */
+ /* partial quotient would not fit in 32 bits. */
+ z = (b0 << 32 <= a0) ? LIT64 (0xFFFFFFFF00000000) : (a0 / b0) << 32;
+ /* Remainder of the estimate: (a0:a1) - b * z as a 128-bit value. */
+ mul64To128 (b, z, &term0, &term1);
+ sub128 (a0, a1, term0, term1, &rem0, &rem1);
+ /* A negative remainder means the estimate was too large: step the upper */
+ /* quotient half down by one (2^32 in z) and add b * 2^32, expressed as */
+ /* the 128-bit pair (b >> 32 : b << 32), back into the remainder. */
+ while (((sbits64) rem0) < 0)
+ {
+ z -= LIT64 (0x100000000);
+ b1 = b << 32;
+ add128 (rem0, rem1, b0, b1, &rem0, &rem1);
+ }
+ /* Shift the remainder left by 32 bits and estimate the lower 32 quotient */
+ /* bits the same way, again clamping to the 32-bit maximum. */
+ rem0 = (rem0 << 32) | (rem1 >> 32);
+ z |= (b0 << 32 <= rem0) ? 0xFFFFFFFF : rem0 / b0;
+ return z;
+| Returns the number of leading 0 bits before the most-significant 1 bit of
+| `a'. If `a' is zero, 32 is returned.
+static int8
+countLeadingZeros32 (bits32 a)
+ /* Leading-zero count of every 8-bit value, indexed by that value */
+ /* (entry 0 is 8, entries 128..255 are 0). */
+ static const int8 countLeadingZerosHigh[256] = {
+ 8, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4,
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+ };
+ int8 shiftCount;
+ shiftCount = 0;
+ /* Upper 16 bits all zero: count them and move the low half up. */
+ if (a < 0x10000)
+ {
+ shiftCount += 16;
+ a <<= 16;
+ }
+ /* Top byte zero: count it and move the next byte up. */
+ if (a < 0x1000000)
+ {
+ shiftCount += 8;
+ a <<= 8;
+ }
+ /* The table resolves the top byte; for a == 0 this adds 8, giving 32. */
+ shiftCount += countLeadingZerosHigh[a >> 24];
+ return shiftCount;
+| Returns the number of leading 0 bits before the most-significant 1 bit of
+| `a'. If `a' is zero, 64 is returned.
+static int8
+countLeadingZeros64 (bits64 a)
+ int8 shiftCount;
+ shiftCount = 0;
+ if (a < ((bits64) 1) << 32)
+ {
+ /* Upper word is zero: count its 32 bits; the low word is examined below. */
+ shiftCount += 32;
+ }
+ else
+ {
+ /* Upper word is nonzero: move it down so it is the word examined below. */
+ a >>= 32;
+ }
+ /* `a' now fits in 32 bits, so passing it as a bits32 argument loses nothing. */
+ shiftCount += countLeadingZeros32 (a);
+ return shiftCount;
diff --git a/documentation/tutorial_fpl_2022/03-optimizations/Exercise3/softfloat-specialize b/documentation/tutorial_fpl_2022/03-optimizations/Exercise3/softfloat-specialize
new file mode 100755
index 000000000..3c5105928
--- /dev/null
+++ b/documentation/tutorial_fpl_2022/03-optimizations/Exercise3/softfloat-specialize
@@ -0,0 +1,123 @@
+| CHStone : a suite of benchmark programs for C-based High-Level Synthesis |
+| ======================================================================== |
+| |
+| * Collected and Modified : Y. Hara, H. Tomiyama, S. Honda, |
+| H. Takada and K. Ishii |
+| Nagoya University, Japan |
+| |
+| * Remark : |
+| 1. This source code is modified to unify the formats of the benchmark |
+| programs in CHStone. |
+| 2. Test vectors are added for CHStone. |
+| 3. If "main_result" is 0 at the end of the program, the program is |
+| correctly executed. |
+| 4. Please follow the copyright of each benchmark program. |
+This C source fragment is part of the SoftFloat IEC/IEEE Floating-point
+Arithmetic Package, Release 2b.
+Written by John R. Hauser. This work was made possible in part by the
+International Computer Science Institute, located at Suite 600, 1947 Center
+Street, Berkeley, California 94704. Funding was partially provided by the
+National Science Foundation under grant MIP-9311980. The original version
+of this code was written as part of a project to build a fixed-point vector
+processor in collaboration with the University of California at Berkeley,
+overseen by Profs. Nelson Morgan and John Wawrzynek. More information
+is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
+THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has
+INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
+Derivative works are acceptable, even for commercial purposes, so long as
+(1) the source code for the derivative work includes prominent notice that
+the work is derivative, and (2) the source code includes prominent notice with
+these four paragraphs for those parts of this code that are retained.
+| Underflow tininess-detection mode, statically initialized to default value.
+| (The declaration in `softfloat.h' must match the `int8' type here.)
+/* A macro rather than a variable here, so the comparison made against it in */
+/* roundAndPackFloat64 involves two constants. */
+#define float_detect_tininess float_tininess_before_rounding
+| Raises the exceptions specified by `flags'. Floating-point traps can be
+| defined here if desired. It is currently not possible for such a trap
+| to substitute a result value. If traps are not implemented, this routine
+| should be simply `float_exception_flags |= flags;'.
+float_raise (int8 flags)
+ /* OR the new flags into the global flag word; this routine never clears any. */
+ float_exception_flags |= flags;
+| The pattern for a default generated double-precision NaN.
+/* Bit 63 (sign) clear, all lower 63 bits (exponent and fraction) set. */
+#define float64_default_nan LIT64( 0x7FFFFFFFFFFFFFFF )
+| Returns 1 if the double-precision floating-point value `a' is a NaN;
+| otherwise returns 0.
+float64_is_nan (float64 a)
+ /* a << 1 drops the sign bit. 0xFFE0000000000000 is infinity (exponent all */
+ /* ones, fraction zero) shifted the same way, so anything strictly greater */
+ /* has an all-ones exponent and a nonzero fraction. */
+ return (LIT64 (0xFFE0000000000000) < (bits64) (a << 1));
+| Returns 1 if the double-precision floating-point value `a' is a signaling
+| NaN; otherwise returns 0.
+float64_is_signaling_nan (float64 a)
+ /* Bits 62..51 must equal 0xFFE: exponent bits 62..52 all ones and bit 51, */
+ /* the top fraction bit, clear. The second test requires at least one of */
+ /* the remaining 51 fraction bits to be set (otherwise it is an infinity). */
+ return (((a >> 51) & 0xFFF) == 0xFFE) && (a & LIT64 (0x0007FFFFFFFFFFFF));
+| Takes two double-precision floating-point values `a' and `b', one of which
+| is a NaN, and returns the appropriate NaN result. If either `a' or `b' is a
+| signaling NaN, the invalid exception is raised.
+static float64
+propagateFloat64NaN (float64 a, float64 b)
+ flag aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN;
+ /* Classify both operands before they are modified below. aIsNaN is */
+ /* computed but does not take part in the selection at the end. */
+ aIsNaN = float64_is_nan (a);
+ aIsSignalingNaN = float64_is_signaling_nan (a);
+ bIsNaN = float64_is_nan (b);
+ bIsSignalingNaN = float64_is_signaling_nan (b);
+ /* Set bit 51 (the top fraction bit) in both, so whichever operand is */
+ /* returned no longer matches the signaling-NaN pattern. */
+ a |= LIT64 (0x0008000000000000);
+ b |= LIT64 (0x0008000000000000);
+ if (aIsSignalingNaN | bIsSignalingNaN)
+ float_raise (float_flag_invalid);
+ /* Priority: signaling b, then signaling a, then b if it is a NaN, else a. */
+ return bIsSignalingNaN ? b : aIsSignalingNaN ? a : bIsNaN ? b : a;
diff --git a/documentation/tutorial_fpl_2022/03-optimizations/Exercise3/softfloat.c b/documentation/tutorial_fpl_2022/03-optimizations/Exercise3/softfloat.c
new file mode 100755
index 000000000..8604da331
--- /dev/null
+++ b/documentation/tutorial_fpl_2022/03-optimizations/Exercise3/softfloat.c
@@ -0,0 +1,316 @@
+| CHStone : a suite of benchmark programs for C-based High-Level Synthesis |
+| ======================================================================== |
+| |
+| * Collected and Modified : Y. Hara, H. Tomiyama, S. Honda, |
+| H. Takada and K. Ishii |
+| Nagoya University, Japan |
+| |
+| * Remark : |
+| 1. This source code is modified to unify the formats of the benchmark |
+| programs in CHStone. |
+| 2. Test vectors are added for CHStone. |
+| 3. If "main_result" is 0 at the end of the program, the program is |
+| correctly executed. |
+| 4. Please follow the copyright of each benchmark program. |
+This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
+Package, Release 2b.
+Written by John R. Hauser. This work was made possible in part by the
+International Computer Science Institute, located at Suite 600, 1947 Center
+Street, Berkeley, California 94704. Funding was partially provided by the
+National Science Foundation under grant MIP-9311980. The original version
+of this code was written as part of a project to build a fixed-point vector
+processor in collaboration with the University of California at Berkeley,
+overseen by Profs. Nelson Morgan and John Wawrzynek. More information
+is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
+THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has
+INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
+Derivative works are acceptable, even for commercial purposes, so long as
+(1) the source code for the derivative work includes prominent notice that
+the work is derivative, and (2) the source code includes prominent notice with
+these four paragraphs for those parts of this code that are retained.
+#include "milieu.h"
+#include "softfloat.h"
+| Floating-point rounding mode, extended double-precision rounding precision,
+| and exception flags.
+/* Rounding mode read by roundAndPackFloat64; one of the float_round_* values. */
+int8 float_rounding_mode = float_round_nearest_even;
+/* Accumulated float_flag_* bits; OR-ed into by float_raise and by */
+/* roundAndPackFloat64. */
+int8 float_exception_flags = 0;
+| Primitive arithmetic functions, including multi-word arithmetic, and
+| division and square root approximations. (Can be specialized to target if
+| desired.)
+#include "softfloat-macros"
+| Functions and definitions to determine: (1) whether tininess for underflow
+| is detected before or after rounding by default, (2) what (if anything)
+| happens when exceptions are raised, (3) how signaling NaNs are distinguished
+| from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
+| are propagated from function inputs to output. These details are target-
+| specific.
+#include "softfloat-specialize"
+| Returns the fraction bits of the double-precision floating-point value `a'.
+INLINE bits64
+extractFloat64Frac (float64 a)
+ /* The mask keeps bits 51..0. */
+ return a & LIT64 (0x000FFFFFFFFFFFFF);
+| Returns the exponent bits of the double-precision floating-point value `a'.
+INLINE int16
+extractFloat64Exp (float64 a)
+ /* Bits 62..52 moved down to bit 0; the 11-bit mask removes the sign bit. */
+ return (a >> 52) & 0x7FF;
+| Returns the sign bit of the double-precision floating-point value `a'.
+INLINE flag
+extractFloat64Sign (float64 a)
+ /* float64 is unsigned, so the shift leaves just bit 63 as 0 or 1. */
+ return a >> 63;
+| Normalizes the subnormal double-precision floating-point value represented
+| by the denormalized significand `aSig'. The normalized exponent and
+| significand are stored at the locations pointed to by `zExpPtr' and
+| `zSigPtr', respectively.
+static void
+normalizeFloat64Subnormal (bits64 aSig, int16 * zExpPtr, bits64 * zSigPtr)
+ int8 shiftCount;
+ /* A leading-zero count of 11 puts the top set bit at bit 52, so */
+ /* (count - 11) is the left shift that moves the leading 1 there. */
+ shiftCount = countLeadingZeros64 (aSig) - 11;
+ *zSigPtr = aSig << shiftCount;
+ /* The exponent is lowered by the same amount, starting from 1. */
+ *zExpPtr = 1 - shiftCount;
+| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
+| double-precision floating-point value, returning the result. After being
+| shifted into the proper positions, the three fields are simply added
+| together to form the result. This means that any integer portion of `zSig'
+| will be added into the exponent. Since a properly normalized significand
+| will have an integer portion equal to 1, the `zExp' input should be 1 less
+| than the desired result exponent whenever `zSig' is a complete, normalized
+| significand.
+INLINE float64
+packFloat64 (flag zSign, int16 zExp, bits64 zSig)
+ /* The fields are added, not OR-ed: a bit 52 set in zSig carries into the */
+ /* exponent field. */
+ return (((bits64) zSign) << 63) + (((bits64) zExp) << 52) + zSig;
+| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
+| and significand `zSig', and returns the proper double-precision floating-
+| point value corresponding to the abstract input. Ordinarily, the abstract
+| value is simply rounded and packed into the double-precision format, with
+| the inexact exception raised if the abstract input cannot be represented
+| exactly. However, if the abstract value is too large, the overflow and
+| inexact exceptions are raised and an infinity or maximal finite value is
+| returned. If the abstract value is too small, the input value is rounded
+| to a subnormal number, and the underflow and inexact exceptions are raised
+| if the abstract input cannot be represented exactly as a subnormal double-
+| precision floating-point number.
+| The input significand `zSig' has its binary point between bits 62
+| and 61, which is 10 bits to the left of the usual location. This shifted
+| significand must be normalized or smaller. If `zSig' is not normalized,
+| `zExp' must be 0; in that case, the result returned is a subnormal number,
+| and it must not require rounding. In the usual case that `zSig' is
+| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
+| The handling of underflow and overflow follows the IEC/IEEE Standard for
+| Binary Floating-Point Arithmetic.
+static float64
+roundAndPackFloat64 (flag zSign, int16 zExp, bits64 zSig)
+ int8 roundingMode;
+ flag roundNearestEven, isTiny;
+ int16 roundIncrement, roundBits;
+ roundingMode = float_rounding_mode;
+ roundNearestEven = (roundingMode == float_round_nearest_even);
+ /* The low 10 bits of zSig are dropped by the final ">> 10"; 0x200 is half */
+ /* of that unit, i.e. the round-to-nearest bias. */
+ roundIncrement = 0x200;
+ if (!roundNearestEven)
+ {
+ if (roundingMode == float_round_to_zero)
+ {
+ /* Truncate: no bias. */
+ roundIncrement = 0;
+ }
+ else
+ {
+ /* Directed modes: a bias of 0x3FF bumps the magnitude whenever any */
+ /* dropped bit is set. It is cleared again when the requested */
+ /* direction is toward zero for this sign (negative with round-up, */
+ /* positive with round-down). */
+ roundIncrement = 0x3FF;
+ if (zSign)
+ {
+ if (roundingMode == float_round_up)
+ roundIncrement = 0;
+ }
+ else
+ {
+ if (roundingMode == float_round_down)
+ roundIncrement = 0;
+ }
+ }
+ }
+ roundBits = zSig & 0x3FF;
+ /* The cast to unsigned bits16 turns a negative zExp into a large value, */
+ /* so this one test catches both zExp >= 0x7FD and zExp < 0. */
+ if (0x7FD <= (bits16) zExp)
+ {
+ /* Overflow: exponent too large, or exactly 0x7FD with the rounding */
+ /* addition carrying into bit 63 of the significand. */
+ if ((0x7FD < zExp)
+ || ((zExp == 0x7FD) && ((sbits64) (zSig + roundIncrement) < 0)))
+ {
+ float_raise (float_flag_overflow | float_flag_inexact);
+ /* Infinity, or one bit pattern below it (the largest finite value) */
+ /* when the active rounding adds no bias. */
+ return packFloat64 (zSign, 0x7FF, 0) - (roundIncrement == 0);
+ }
+ if (zExp < 0)
+ {
+ /* Subnormal result: decide tininess, then shift the significand right */
+ /* by -zExp with sticky-bit jamming and restart from exponent 0. */
+ isTiny = (float_detect_tininess == float_tininess_before_rounding)
+ || (zExp < -1)
+ || (zSig + roundIncrement < LIT64 (0x8000000000000000));
+ shift64RightJamming (zSig, -zExp, &zSig);
+ zExp = 0;
+ roundBits = zSig & 0x3FF;
+ if (isTiny && roundBits)
+ float_raise (float_flag_underflow);
+ }
+ }
+ /* Any nonzero dropped bit makes the result inexact. */
+ if (roundBits)
+ float_exception_flags |= float_flag_inexact;
+ zSig = (zSig + roundIncrement) >> 10;
+ /* Exact tie (dropped bits == 0x200) under nearest-even: clear bit 0 so the */
+ /* result is even. */
+ zSig &= ~(((roundBits ^ 0x200) == 0) & roundNearestEven);
+ /* A zero significand is packed with a zero exponent. */
+ if (zSig == 0)
+ zExp = 0;
+ return packFloat64 (zSign, zExp, zSig);
+| Returns the result of dividing the double-precision floating-point value `a'
+| by the corresponding value `b'. The operation is performed according to
+| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
+float64_div (float64 a, float64 b)
+ flag aSign, bSign, zSign;
+ int16 aExp, bExp, zExp;
+ bits64 aSig, bSig, zSig;
+ bits64 rem0, rem1, term0, term1;
+ /* Unpack both operands into fraction, exponent and sign. */
+ aSig = extractFloat64Frac (a);
+ aExp = extractFloat64Exp (a);
+ aSign = extractFloat64Sign (a);
+ bSig = extractFloat64Frac (b);
+ bExp = extractFloat64Exp (b);
+ bSign = extractFloat64Sign (b);
+ /* The result sign is the XOR of the operand signs. */
+ zSign = aSign ^ bSign;
+ /* a is a NaN or an infinity. */
+ if (aExp == 0x7FF)
+ {
+ if (aSig)
+ return propagateFloat64NaN (a, b);
+ if (bExp == 0x7FF)
+ {
+ if (bSig)
+ return propagateFloat64NaN (a, b);
+ /* inf / inf: invalid operation, default NaN. */
+ float_raise (float_flag_invalid);
+ return float64_default_nan;
+ }
+ /* inf / finite: signed infinity. */
+ return packFloat64 (zSign, 0x7FF, 0);
+ }
+ /* b is a NaN or an infinity (a is finite here). */
+ if (bExp == 0x7FF)
+ {
+ if (bSig)
+ return propagateFloat64NaN (a, b);
+ /* finite / inf: signed zero. */
+ return packFloat64 (zSign, 0, 0);
+ }
+ /* b is zero or subnormal. */
+ if (bExp == 0)
+ {
+ if (bSig == 0)
+ {
+ if ((aExp | aSig) == 0)
+ {
+ /* 0 / 0: invalid operation, default NaN. */
+ float_raise (float_flag_invalid);
+ return float64_default_nan;
+ }
+ /* nonzero / 0: divide-by-zero flag, signed infinity. */
+ float_raise (float_flag_divbyzero);
+ return packFloat64 (zSign, 0x7FF, 0);
+ }
+ normalizeFloat64Subnormal (bSig, &bExp, &bSig);
+ }
+ /* a is zero or subnormal. */
+ if (aExp == 0)
+ {
+ if (aSig == 0)
+ return packFloat64 (zSign, 0, 0);
+ normalizeFloat64Subnormal (aSig, &aExp, &aSig);
+ }
+ zExp = aExp - bExp + 0x3FD;
+ /* OR in bit 52 as the leading 1, then place it at bit 62 for the dividend */
+ /* and at bit 63 for the divisor. */
+ aSig = (aSig | LIT64 (0x0010000000000000)) << 10;
+ bSig = (bSig | LIT64 (0x0010000000000000)) << 11;
+ /* Halve the dividend and bump the exponent when 2 * aSig >= bSig. */
+ /* Afterwards aSig < bSig, so the divide below does not take its */
+ /* saturating path. */
+ if (bSig <= (aSig + aSig))
+ {
+ aSig >>= 1;
+ ++zExp;
+ }
+ zSig = estimateDiv128To64 (aSig, 0, bSig);
+ /* The estimate may exceed the true quotient by up to 2. Only when its low */
+ /* 9 bits are 0, 1 or 2 is the exact remainder computed, the quotient */
+ /* stepped down until that remainder is non-negative, and a nonzero */
+ /* remainder recorded as a sticky bit in bit 0. */
+ if ((zSig & 0x1FF) <= 2)
+ {
+ mul64To128 (bSig, zSig, &term0, &term1);
+ sub128 (aSig, 0, term0, term1, &rem0, &rem1);
+ while ((sbits64) rem0 < 0)
+ {
+ --zSig;
+ add128 (rem0, rem1, 0, bSig, &rem0, &rem1);
+ }
+ zSig |= (rem1 != 0);
+ }
+ return roundAndPackFloat64 (zSign, zExp, zSig);
diff --git a/documentation/tutorial_fpl_2022/03-optimizations/Exercise3/softfloat.h b/documentation/tutorial_fpl_2022/03-optimizations/Exercise3/softfloat.h
new file mode 100755
index 000000000..6d075ca15
--- /dev/null
+++ b/documentation/tutorial_fpl_2022/03-optimizations/Exercise3/softfloat.h
@@ -0,0 +1,77 @@
+| CHStone : a suite of benchmark programs for C-based High-Level Synthesis |
+| ======================================================================== |
+| |
+| * Collected and Modified : Y. Hara, H. Tomiyama, S. Honda, |
+| H. Takada and K. Ishii |
+| Nagoya University, Japan |
+| |
+| * Remark : |
+| 1. This source code is modified to unify the formats of the benchmark |
+| programs in CHStone. |
+| 2. Test vectors are added for CHStone. |
+| 3. If "main_result" is 0 at the end of the program, the program is |
+| correctly executed. |
+| 4. Please follow the copyright of each benchmark program. |
+This C header file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
+Package, Release 2b.
+Written by John R. Hauser. This work was made possible in part by the
+International Computer Science Institute, located at Suite 600, 1947 Center
+Street, Berkeley, California 94704. Funding was partially provided by the
+National Science Foundation under grant MIP-9311980. The original version
+of this code was written as part of a project to build a fixed-point vector
+processor in collaboration with the University of California at Berkeley,
+overseen by Profs. Nelson Morgan and John Wawrzynek. More information
+is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
+THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has
+INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
+Derivative works are acceptable, even for commercial purposes, so long as
+(1) the source code for the derivative work includes prominent notice that
+the work is derivative, and (2) the source code includes prominent notice with
+these four paragraphs for those parts of this code that are retained.
+| Software IEC/IEEE floating-point types.
+/* Floating-point values are carried as raw bit patterns in unsigned integers. */
+/* NOTE(review): assumes unsigned int is 32 bits and unsigned long long is */
+/* 64 bits on the target; confirm. */
+typedef unsigned int float32;
+typedef unsigned long long float64;
+| Software IEC/IEEE floating-point underflow tininess-detection mode.
+/* Values compared against float_detect_tininess in roundAndPackFloat64. */
+#define float_tininess_after_rounding 0
+#define float_tininess_before_rounding 1
+| Software IEC/IEEE floating-point rounding mode.
+/* Values held in float_rounding_mode; mutually exclusive, not bit flags. */
+#define float_round_nearest_even 0
+#define float_round_to_zero 1
+#define float_round_up 2
+#define float_round_down 3
+| Software IEC/IEEE floating-point exception flags.
+/* One distinct bit per exception, so several flags can be OR-ed together */
+/* into float_exception_flags. */
+#define float_flag_inexact 1
+#define float_flag_divbyzero 2
+#define float_flag_underflow 4
+#define float_flag_overflow 8
+#define float_flag_invalid 16
diff --git a/documentation/tutorial_fpl_2022/03-optimizations/Exercise3/solution/SPARC-GCC.h b/documentation/tutorial_fpl_2022/03-optimizations/Exercise3/solution/SPARC-GCC.h
new file mode 100755
index 000000000..523e274f6
--- /dev/null
+++ b/documentation/tutorial_fpl_2022/03-optimizations/Exercise3/solution/SPARC-GCC.h
@@ -0,0 +1,88 @@
+| CHStone : a suite of benchmark programs for C-based High-Level Synthesis |
+| ======================================================================== |
+| |
+| * Collected and Modified : Y. Hara, H. Tomiyama, S. Honda, |
+| H. Takada and K. Ishii |
+| Nagoya University, Japan |
+| |
+| * Remark : |
+| 1. This source code is modified to unify the formats of the benchmark |
+| programs in CHStone. |
+| 2. Test vectors are added for CHStone. |
+| 3. If "main_result" is 0 at the end of the program, the program is |
+| correctly executed. |
+| 4. Please follow the copyright of each benchmark program. |
+This C header file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
+Package, Release 2b.
+Written by John R. Hauser. This work was made possible in part by the
+International Computer Science Institute, located at Suite 600, 1947 Center
+Street, Berkeley, California 94704. Funding was partially provided by the
+National Science Foundation under grant MIP-9311980. The original version
+of this code was written as part of a project to build a fixed-point vector
+processor in collaboration with the University of California at Berkeley,
+overseen by Profs. Nelson Morgan and John Wawrzynek. More information
+is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
+THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has
+INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
+Derivative works are acceptable, even for commercial purposes, so long as
+(1) the source code for the derivative work includes prominent notice that
+the work is derivative, and (2) the source code includes prominent notice with
+these four paragraphs for those parts of this code that are retained.
+| Each of the following `typedef's defines the most convenient type that holds
+| integers of at least as many bits as specified. For example, `uint8' should
+| be the most convenient type that can hold unsigned integers of as many as
+| 8 bits. The `flag' type must be able to hold either a 0 or 1. For most
+| implementations of C, `flag', `uint8', and `int8' should all be `typedef'ed
+| to the same as `int'.
+/* All three "at least N bits" types are plain int in this configuration. */
+typedef int flag;
+typedef int int8;
+typedef int int16;
+| Each of the following `typedef's defines a type that holds integers
+| of _exactly_ the number of bits specified. For instance, for most
+| implementation of C, `bits16' and `sbits16' should be `typedef'ed to
+| `unsigned short int' and `signed short int' (or `short int'), respectively.
+/* NOTE(review): exact widths rely on 16-bit short, 32-bit int and 64-bit */
+/* long long on the target; confirm for the toolchain in use. */
+typedef unsigned short int bits16;
+typedef unsigned int bits32;
+typedef unsigned long long int bits64;
+typedef signed long long int sbits64;
+| The `LIT64' macro takes as its argument a textual integer literal and
+| if necessary ``marks'' the literal as having a 64-bit integer type.
+| For example, the GNU C Compiler (`gcc') requires that 64-bit literals be
+| appended with the letters `LL' standing for `long long', which is `gcc's
+| name for the 64-bit integer type. Some compilers may allow `LIT64' to be
+| defined as the identity macro: `#define LIT64( a ) a'.
+/* Token-pastes an LL suffix onto the literal. In standard C a hexadecimal */
+/* literal too large for long long takes type unsigned long long, which is */
+/* what patterns such as 0xFFE0000000000000 rely on. */
+#define LIT64( a ) a##LL
+| The macro `INLINE' can be used before functions that should be inlined. If
+| a compiler does not support explicit inlining, this macro should be defined
+| to be `static'.
+/* Expands to nothing here, so functions marked INLINE are ordinary */
+/* non-static definitions. */
+#define INLINE
diff --git a/documentation/tutorial_fpl_2022/03-optimizations/Exercise3/solution/dfdiv.c b/documentation/tutorial_fpl_2022/03-optimizations/Exercise3/solution/dfdiv.c
new file mode 100755
index 000000000..7fd9823bd
--- /dev/null
+++ b/documentation/tutorial_fpl_2022/03-optimizations/Exercise3/solution/dfdiv.c
@@ -0,0 +1,159 @@
+| CHStone : a suite of benchmark programs for C-based High-Level Synthesis |
+| ======================================================================== |
+| |
+| * Collected and Modified : Y. Hara, H. Tomiyama, S. Honda, |
+| H. Takada and K. Ishii |
+| Nagoya University, Japan |
+| |
+| * Remark : |
+| 1. This source code is modified to unify the formats of the benchmark |
+| programs in CHStone. |
+| 2. Test vectors are added for CHStone. |
+| 3. If "main_result" is 0 at the end of the program, the program is |
+| correctly executed. |
+| 4. Please follow the copyright of each benchmark program. |
+ * Copyright (C) 2008
+ * Y. Hara, H. Tomiyama, S. Honda, H. Takada and K. Ishii
+ * Nagoya University, Japan
+ * All rights reserved.
+ *
+ * Disclaimer of Warranty
+ *
+ * These software programs are available to the user without any license fee or
+ * royalty on an "as is" basis. The authors disclaims any and all warranties,
+ * whether express, implied, or statuary, including any implied warranties or
+ * merchantability or of fitness for a particular purpose. In no event shall the
+ * copyright-holder be liable for any incidental, punitive, or consequential damages
+ * of any kind whatsoever arising from the use of these programs. This disclaimer
+ * of warranty extends to the user of these programs and user's customers, employees,
+ * agents, transferees, successors, and assigns.
+ *
+ */
+#include "softfloat.c"
+/* Reinterprets a 64-bit pattern as a double by writing one union member and */
+/* reading the other; no numeric conversion takes place. Used by main only */
+/* to print results. */
+ullong_to_double (unsigned long long x)
+ union
+ {
+ double d;
+ unsigned long long ll;
+ } t;
+ t.ll = x;
+ return t.d;
+| * Test Vectors (added for CHStone) |
+| a_input, b_input : input data |
+| z_output : expected output data |
+/* Number of test vectors; the three arrays below are indexed in parallel. */
+#define N 22
+/* Dividends (first argument of float64_div), as raw 64-bit patterns. */
+const float64 a_input[N] = {
+ 0x7FFF000000000000ULL, /* nan */
+ 0x7FF0000000000000ULL, /* inf */
+ 0x7FF0000000000000ULL, /* inf */
+ 0x7FF0000000000000ULL, /* inf */
+ 0x3FF0000000000000ULL, /* 1.0 */
+ 0x3FF0000000000000ULL, /* 1.0 */
+ 0x0000000000000000ULL, /* 0.0 */
+ 0x3FF0000000000000ULL, /* 1.0 */
+ 0x0000000000000000ULL, /* 0.0 */
+ 0x8000000000000000ULL, /* -0.0 */
+ 0x4008000000000000ULL, /* 3.0 */
+ 0xC008000000000000ULL, /* -3.0 */
+ 0x4008000000000000ULL, /* 3.0 */
+ 0xC008000000000000ULL, /* -3.0 */
+ 0x4000000000000000ULL, /* 2.0 */
+ 0xC000000000000000ULL, /* -2.0 */
+ 0x4000000000000000ULL, /* 2.0 */
+ 0xC000000000000000ULL, /* -2.0 */
+ 0x3FF0000000000000ULL, /* 1.0 */
+ 0xBFF0000000000000ULL, /* -1.0 */
+ 0x3FF0000000000000ULL, /* 1.0 */
+ 0xBFF0000000000000ULL /* -1.0 */
+/* Divisors (second argument of float64_div), as raw 64-bit patterns. */
+const float64 b_input[N] = {
+ 0x3FF0000000000000ULL, /* 1.0 */
+ 0x7FF8000000000000ULL, /* nan */
+ 0x7FF0000000000000ULL, /* inf */
+ 0x3FF0000000000000ULL, /* 1.0 */
+ 0x7FF8000000000000ULL, /* nan */
+ 0x7FF0000000000000ULL, /* inf */
+ 0x0000000000000000ULL, /* 0.0 */
+ 0x0000000000000000ULL, /* 0.0 */
+ 0x3FF0000000000000ULL, /* 1.0 */
+ 0x3FF0000000000000ULL, /* 1.0 */
+ 0x4000000000000000ULL, /* 2.0 */
+ 0x4000000000000000ULL, /* 2.0 */
+ 0xC000000000000000ULL, /* -2.0 */
+ 0xC000000000000000ULL, /* -2.0 */
+ 0x4010000000000000ULL, /* 4.0 */
+ 0x4010000000000000ULL, /* 4.0 */
+ 0xC010000000000000ULL, /* -4.0 */
+ 0xC010000000000000ULL, /* -4.0 */
+ 0x3FF8000000000000ULL, /* 1.5 */
+ 0x3FF8000000000000ULL, /* 1.5 */
+ 0xBFF8000000000000ULL, /* -1.5 */
+ 0xBFF8000000000000ULL /* -1.5 */
+/* Expected float64_div results, index-aligned with a_input and b_input. */
+/* All N entries must be present: a missing initialiser shifts every later */
+/* expectation onto the wrong input pair. */
+const float64 z_output[N] = {
+ 0x7FFF000000000000ULL, /* nan (nan / 1.0: the NaN operand is propagated) */
+ 0x7FF8000000000000ULL, /* nan (inf / nan) */
+ 0x7FFFFFFFFFFFFFFFULL, /* nan (inf / inf: invalid, default NaN) */
+ 0x7FF0000000000000ULL, /* inf */
+ 0x7FF8000000000000ULL, /* nan (1.0 / nan) */
+ 0x0000000000000000ULL, /* 0.0 */
+ 0x7FFFFFFFFFFFFFFFULL, /* nan (0.0 / 0.0: invalid, default NaN) */
+ 0x7FF0000000000000ULL, /* inf (1.0 / 0.0: divide by zero) */
+ 0x0000000000000000ULL, /* 0.0 */
+ 0x8000000000000000ULL, /* -0.0 */
+ 0x3FF8000000000000ULL, /* 1.5 */
+ 0xBFF8000000000000ULL, /* -1.5 */
+ 0xBFF8000000000000ULL, /* -1.5 */
+ 0x3FF8000000000000ULL, /* 1.5 */
+ 0x3FE0000000000000ULL, /* 0.5 */
+ 0xBFE0000000000000ULL, /* -0.5 */
+ 0xBFE0000000000000ULL, /* -0.5 */
+ 0x3FE0000000000000ULL, /* 0.5 */
+ 0x3FE5555555555555ULL, /* 0.666667 */
+ 0xBFE5555555555555ULL, /* -0.666667 */
+ 0xBFE5555555555555ULL, /* -0.666667 */
+ 0x3FE5555555555555ULL /* 0.666667 */
+/* Self-checking driver: divides every (a_input[i], b_input[i]) pair and */
+/* counts the results that differ from z_output[i]. Returns that count, so */
+/* 0 means every vector matched. */
+main ()
+ int main_result;
+ int i;
+ float64 x1, x2;
+ main_result = 0;
+ for (i = 0; i < N; i++)
+ {
+ float64 result;
+ x1 = a_input[i];
+ x2 = b_input[i];
+ result = float64_div (x1, x2);
+ /* Bit-exact comparison of the raw patterns; each mismatch adds 1. */
+ main_result += (result != z_output[i]);
+ /* Inputs, expectation and result in hex, plus the result shown as a */
+ /* double for readability. */
+ printf
+ ("a_input=%016llx b_input=%016llx expected=%016llx output=%016llx (%lf)\n",
+ a_input[i], b_input[i], z_output[i], result,
+ ullong_to_double (result));
+ }
+ printf ("%d\n", main_result);
+ return main_result;
+ }
diff --git a/documentation/tutorial_fpl_2022/03-optimizations/Exercise3/solution/dfdiv.csv b/documentation/tutorial_fpl_2022/03-optimizations/Exercise3/solution/dfdiv.csv
new file mode 100644
index 000000000..daedfee6a
--- /dev/null
+++ b/documentation/tutorial_fpl_2022/03-optimizations/Exercise3/solution/dfdiv.csv
@@ -0,0 +1,6 @@
+Benchmark, CYCLES, HLS_execution_time,
+GCC49:dfdiv_NR:main_0, 825,44.9199999999999999983,
+GCC49:dfdiv_as:main_0, 841,30.1399999999999999994,
+GCC49:dfdiv_none:main_0, 1777,37.5,
+GCC49:dfdiv_nr1:main_0, 1849,41.1800000000000000003,
+GCC49:dfdiv_nr2:main_0, 1105,43.119999999999999999,
diff --git a/documentation/tutorial_fpl_2022/03-optimizations/Exercise3/solution/list b/documentation/tutorial_fpl_2022/03-optimizations/Exercise3/solution/list
new file mode 100644
index 000000000..a46f62bf6
--- /dev/null
+++ b/documentation/tutorial_fpl_2022/03-optimizations/Exercise3/solution/list
@@ -0,0 +1,5 @@
+dfdiv.c --benchmark-name=dfdiv_none --hls-div=none
+dfdiv.c --benchmark-name=dfdiv_nr1 --hls-div=nr1
+dfdiv.c --benchmark-name=dfdiv_nr2 --hls-div=nr2
+dfdiv.c --benchmark-name=dfdiv_NR --hls-div=NR
+dfdiv.c --benchmark-name=dfdiv_as --hls-div=as
diff --git a/documentation/tutorial_fpl_2022/03-optimizations/Exercise3/solution/milieu.h b/documentation/tutorial_fpl_2022/03-optimizations/Exercise3/solution/milieu.h
new file mode 100755
index 000000000..4d92d5e05
--- /dev/null
+++ b/documentation/tutorial_fpl_2022/03-optimizations/Exercise3/solution/milieu.h
@@ -0,0 +1,53 @@
+| CHStone : a suite of benchmark programs for C-based High-Level Synthesis |
+| ======================================================================== |
+| |
+| * Collected and Modified : Y. Hara, H. Tomiyama, S. Honda, |
+| H. Takada and K. Ishii |
+| Nagoya University, Japan |
+| |
+| * Remark : |
+| 1. This source code is modified to unify the formats of the benchmark |
+| programs in CHStone. |
+| 2. Test vectors are added for CHStone. |
+| 3. If "main_result" is 0 at the end of the program, the program is |
+| correctly executed. |
+| 4. Please follow the copyright of each benchmark program. |
+This C header file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
+Package, Release 2b.
+Written by John R. Hauser. This work was made possible in part by the
+International Computer Science Institute, located at Suite 600, 1947 Center
+Street, Berkeley, California 94704. Funding was partially provided by the
+National Science Foundation under grant MIP-9311980. The original version
+of this code was written as part of a project to build a fixed-point vector
+processor in collaboration with the University of California at Berkeley,
+overseen by Profs. Nelson Morgan and John Wawrzynek. More information
+is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
+THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has
+INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
+Derivative works are acceptable, even for commercial purposes, so long as
+(1) the source code for the derivative work includes prominent notice that
+the work is derivative, and (2) the source code includes prominent notice with
+these four paragraphs for those parts of this code that are retained.
+| Include common integer types and flags.
+#include "SPARC-GCC.h"
diff --git a/documentation/tutorial_fpl_2022/03-optimizations/Exercise3/solution/softfloat-macros b/documentation/tutorial_fpl_2022/03-optimizations/Exercise3/solution/softfloat-macros
new file mode 100755
index 000000000..a735f741e
--- /dev/null
+++ b/documentation/tutorial_fpl_2022/03-optimizations/Exercise3/solution/softfloat-macros
@@ -0,0 +1,247 @@
+| CHStone : a suite of benchmark programs for C-based High-Level Synthesis |
+| ======================================================================== |
+| |
+| * Collected and Modified : Y. Hara, H. Tomiyama, S. Honda, |
+| H. Takada and K. Ishii |
+| Nagoya University, Japan |
+| |
+| * Remark : |
+| 1. This source code is modified to unify the formats of the benchmark |
+| programs in CHStone. |
+| 2. Test vectors are added for CHStone. |
+| 3. If "main_result" is 0 at the end of the program, the program is |
+| correctly executed. |
+| 4. Please follow the copyright of each benchmark program. |
+This C source fragment is part of the SoftFloat IEC/IEEE Floating-point
+Arithmetic Package, Release 2b.
+Written by John R. Hauser. This work was made possible in part by the
+International Computer Science Institute, located at Suite 600, 1947 Center
+Street, Berkeley, California 94704. Funding was partially provided by the
+National Science Foundation under grant MIP-9311980. The original version
+of this code was written as part of a project to build a fixed-point vector
+processor in collaboration with the University of California at Berkeley,
+overseen by Profs. Nelson Morgan and John Wawrzynek. More information
+is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
+THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has
+INSTITUTE (possibly via similar legal notice) AGAINST ALL LOSSES, COSTS, OR
+Derivative works are acceptable, even for commercial purposes, so long as
+(1) the source code for the derivative work includes prominent notice that
+the work is derivative, and (2) the source code includes prominent notice with
+these four paragraphs for those parts of this code that are retained.
+| Shifts `a' right by the number of bits given in `count'. If any nonzero
+| bits are shifted off, they are ``jammed'' into the least significant bit of
+| the result by setting the least significant bit to 1. The value of `count'
+| can be arbitrarily large; in particular, if `count' is greater than 64, the
+| result will be either 0 or 1, depending on whether `a' is zero or nonzero.
+| The result is stored in the location pointed to by `zPtr'.
+INLINE void
+shift64RightJamming (bits64 a, int16 count, bits64 * zPtr)
+{
+  bits64 jammed;
+
+  if (count == 0)
+    {
+      /* Nothing is shifted out, so there is nothing to jam.  */
+      jammed = a;
+    }
+  else if (count >= 64)
+    {
+      /* Every bit is shifted out: only the sticky bit survives.  */
+      jammed = (a != 0);
+    }
+  else
+    {
+      /* (-count) & 63 equals 64 - count here, so the left shift keeps
+         exactly the bits that the right shift discards.  */
+      jammed = (a >> count) | ((a << ((-count) & 63)) != 0);
+    }
+  *zPtr = jammed;
+}
+| Adds the 128-bit value formed by concatenating `a0' and `a1' to the 128-bit
+| value formed by concatenating `b0' and `b1'. Addition is modulo 2^128, so
+| any carry out is lost. The result is broken into two 64-bit pieces which
+| are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
+INLINE void
+add128 (bits64 a0, bits64 a1, bits64 b0, bits64 b1, bits64 * z0Ptr,
+        bits64 * z1Ptr)
+{
+  bits64 lowSum = a1 + b1;
+  /* The low word wrapped around iff the sum is smaller than an addend.  */
+  bits64 carry = (lowSum < a1);
+
+  *z1Ptr = lowSum;
+  *z0Ptr = a0 + b0 + carry;
+}
+| Subtracts the 128-bit value formed by concatenating `b0' and `b1' from the
+| 128-bit value formed by concatenating `a0' and `a1'. Subtraction is modulo
+| 2^128, so any borrow out (carry out) is lost. The result is broken into two
+| 64-bit pieces which are stored at the locations pointed to by `z0Ptr' and
+| `z1Ptr'.
+INLINE void
+sub128 (bits64 a0, bits64 a1, bits64 b0, bits64 b1, bits64 * z0Ptr,
+        bits64 * z1Ptr)
+{
+  /* The low-word subtraction borrows from the high word iff a1 < b1.  */
+  bits64 borrow = (a1 < b1);
+
+  *z1Ptr = a1 - b1;
+  *z0Ptr = a0 - b0 - borrow;
+}
+| Multiplies `a' by `b' to obtain a 128-bit product. The product is broken
+| into two 64-bit pieces which are stored at the locations pointed to by
+| `z0Ptr' and `z1Ptr'.
+INLINE void
+mul64To128 (bits64 a, bits64 b, bits64 * z0Ptr, bits64 * z1Ptr)
+ bits32 aHigh, aLow, bHigh, bLow;
+ bits64 z0, zMiddleA, zMiddleB, z1;
+ /* Split both operands into halves; the assignments to bits32 locals keep
+    only the low part (assumes bits32 is a 32-bit unsigned type declared
+    elsewhere -- TODO confirm). */
+ aLow = a;
+ aHigh = a >> 32;
+ bLow = b;
+ bHigh = b >> 32;
+ /* Four partial products: low*low, the two cross terms, high*high. */
+ z1 = ((bits64) aLow) * bLow;
+ zMiddleA = ((bits64) aLow) * bHigh;
+ zMiddleB = ((bits64) aHigh) * bLow;
+ z0 = ((bits64) aHigh) * bHigh;
+ /* Sum the cross terms; afterwards (zMiddleA < zMiddleB) is 1 exactly when
+    that 64-bit addition wrapped around. */
+ zMiddleA += zMiddleB;
+ /* The wrapped carry is worth 2^32 in the high word; the upper half of the
+    cross sum also belongs to the high word. */
+ z0 += (((bits64) (zMiddleA < zMiddleB)) << 32) + (zMiddleA >> 32);
+ /* The lower half of the cross sum goes into the low word; propagate the
+    carry of that addition into the high word. */
+ zMiddleA <<= 32;
+ z1 += zMiddleA;
+ z0 += (z1 < zMiddleA);
+ *z1Ptr = z1;
+ *z0Ptr = z0;
+| Returns an approximation to the 64-bit integer quotient obtained by dividing
+| `b' into the 128-bit value formed by concatenating `a0' and `a1'. The
+| divisor `b' must be at least 2^63. If q is the exact quotient truncated
+| toward zero, the approximation returned lies between q and q + 2 inclusive.
+| If the exact quotient q is larger than 64 bits, the maximum positive 64-bit
+| unsigned integer is returned.
+static bits64
+estimateDiv128To64 (bits64 a0, bits64 a1, bits64 b)
+{
+  bits64 b0, b1;
+  bits64 rem0, rem1, term0, term1;
+  bits64 z;
+
+  /* The quotient does not fit in 64 bits: saturate as documented.  Without
+     this early return the `if' would guard the assignment of b0 below and
+     leave b0 uninitialised on the normal path.  */
+  if (b <= a0)
+    return LIT64 (0xFFFFFFFFFFFFFFFF);
+  b0 = b >> 32;
+  /* Upper 32 quotient bits, estimated from a0 and the high half of b.  */
+  z = (b0 << 32 <= a0) ? LIT64 (0xFFFFFFFF00000000) : (a0 / b0) << 32;
+  mul64To128 (b, z, &term0, &term1);
+  sub128 (a0, a1, term0, term1, &rem0, &rem1);
+  /* A negative remainder means the estimate was too large: lower the upper
+     quotient half by one and add b * 2^32 back until it is non-negative.  */
+  while (((sbits64) rem0) < 0)
+    {
+      z -= LIT64 (0x100000000);
+      b1 = b << 32;
+      add128 (rem0, rem1, b0, b1, &rem0, &rem1);
+    }
+  /* Lower 32 quotient bits, estimated the same way from the remainder.  */
+  rem0 = (rem0 << 32) | (rem1 >> 32);
+  z |= (b0 << 32 <= rem0) ? 0xFFFFFFFF : rem0 / b0;
+  return z;
+}
+| Returns the number of leading 0 bits before the most-significant 1 bit of
+| `a'. If `a' is zero, 32 is returned.
+static int8
+countLeadingZeros32 (bits32 a)
+ /* countLeadingZerosHigh[v] is the number of leading zero bits of the 8-bit
+    value v (8 for v == 0). */
+ static const int8 countLeadingZerosHigh[256] = {
+ 8, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4,
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+ };
+ int8 shiftCount;
+ shiftCount = 0;
+ /* Upper 16 bits are all zero: count them and move the lower half up. */
+ if (a < 0x10000)
+ {
+ shiftCount += 16;
+ a <<= 16;
+ }
+ /* Top byte is zero: count it and move the next byte into the top. */
+ if (a < 0x1000000)
+ {
+ shiftCount += 8;
+ a <<= 8;
+ }
+ /* The top byte now decides the rest; the table gives 8 when it is zero,
+    which makes the total 32 for a == 0. */
+ shiftCount += countLeadingZerosHigh[a >> 24];
+ return shiftCount;
+| Returns the number of leading 0 bits before the most-significant 1 bit of
+| `a'. If `a' is zero, 64 is returned.
+static int8
+countLeadingZeros64 (bits64 a)
+{
+  int8 zeros = 0;
+
+  if (a >= ((bits64) 1) << 32)
+    {
+      /* A bit is set in the upper word: inspect that word.  */
+      a >>= 32;
+    }
+  else
+    {
+      /* Upper word is empty: 32 zeros plus whatever the lower word has.  */
+      zeros += 32;
+    }
+  zeros += countLeadingZeros32 (a);
+  return zeros;
+}
diff --git a/documentation/tutorial_fpl_2022/03-optimizations/Exercise3/solution/softfloat-specialize b/documentation/tutorial_fpl_2022/03-optimizations/Exercise3/solution/softfloat-specialize
new file mode 100755
index 000000000..3c5105928
--- /dev/null
+++ b/documentation/tutorial_fpl_2022/03-optimizations/Exercise3/solution/softfloat-specialize
@@ -0,0 +1,123 @@
+| CHStone : a suite of benchmark programs for C-based High-Level Synthesis |
+| ======================================================================== |
+| |
+| * Collected and Modified : Y. Hara, H. Tomiyama, S. Honda, |
+| H. Takada and K. Ishii |
+| Nagoya University, Japan |
+| |
+| * Remark : |
+| 1. This source code is modified to unify the formats of the benchmark |
+| programs in CHStone. |
+| 2. Test vectors are added for CHStone. |
+| 3. If "main_result" is 0 at the end of the program, the program is |
+| correctly executed. |
+| 4. Please follow the copyright of each benchmark program. |
+This C source fragment is part of the SoftFloat IEC/IEEE Floating-point
+Arithmetic Package, Release 2b.
+Written by John R. Hauser. This work was made possible in part by the
+International Computer Science Institute, located at Suite 600, 1947 Center
+Street, Berkeley, California 94704. Funding was partially provided by the
+National Science Foundation under grant MIP-9311980. The original version
+of this code was written as part of a project to build a fixed-point vector
+processor in collaboration with the University of California at Berkeley,
+overseen by Profs. Nelson Morgan and John Wawrzynek. More information
+is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
+THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has
+INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
+Derivative works are acceptable, even for commercial purposes, so long as
+(1) the source code for the derivative work includes prominent notice that
+the work is derivative, and (2) the source code includes prominent notice with
+these four paragraphs for those parts of this code that are retained.
+| Underflow tininess-detection mode, statically initialized to default value.
+| (The declaration in `softfloat.h' must match the `int8' type here.)
+#define float_detect_tininess float_tininess_before_rounding
+| Raises the exceptions specified by `flags'. Floating-point traps can be
+| defined here if desired. It is currently not possible for such a trap
+| to substitute a result value. If traps are not implemented, this routine
+| should be simply `float_exception_flags |= flags;'.
+void
+float_raise (int8 flags)
+{
+  /* No trap support: just accumulate the flags in the global mask.  */
+  float_exception_flags |= flags;
+}
+| The pattern for a default generated double-precision NaN.
+#define float64_default_nan LIT64( 0x7FFFFFFFFFFFFFFF )
+| Returns 1 if the double-precision floating-point value `a' is a NaN;
+| otherwise returns 0.
+flag
+float64_is_nan (float64 a)
+{
+  /* Shifting out the sign bit leaves exponent and fraction; the value is a
+     NaN iff that exceeds the all-ones-exponent, zero-fraction pattern.  */
+  return (LIT64 (0xFFE0000000000000) < (bits64) (a << 1));
+}
+| Returns 1 if the double-precision floating-point value `a' is a signaling
+| NaN; otherwise returns 0.
+flag
+float64_is_signaling_nan (float64 a)
+{
+  /* Bits 62..51 equal to 0xFFE means an all-ones exponent with bit 51
+     clear; the remaining fraction bits must not all be zero.  */
+  return (((a >> 51) & 0xFFF) == 0xFFE) && (a & LIT64 (0x0007FFFFFFFFFFFF));
+}
+| Takes two double-precision floating-point values `a' and `b', one of which
+| is a NaN, and returns the appropriate NaN result. If either `a' or `b' is a
+| signaling NaN, the invalid exception is raised.
+static float64
+propagateFloat64NaN (float64 a, float64 b)
+{
+  /* Classify the operands before their bit patterns are modified.  */
+  flag aSignals = float64_is_signaling_nan (a);
+  flag bIsNaN = float64_is_nan (b);
+  flag bSignals = float64_is_signaling_nan (b);
+
+  /* Set bit 51 in both operands so whichever one is returned has it set.  */
+  a |= LIT64 (0x0008000000000000);
+  b |= LIT64 (0x0008000000000000);
+  if (aSignals | bSignals)
+    float_raise (float_flag_invalid);
+
+  /* Priority: signaling b, signaling a, then b if it is a NaN, else a.  */
+  if (bSignals)
+    return b;
+  if (aSignals)
+    return a;
+  return bIsNaN ? b : a;
+}
diff --git a/documentation/tutorial_fpl_2022/03-optimizations/Exercise3/solution/softfloat.c b/documentation/tutorial_fpl_2022/03-optimizations/Exercise3/solution/softfloat.c
new file mode 100755
index 000000000..8604da331
--- /dev/null
+++ b/documentation/tutorial_fpl_2022/03-optimizations/Exercise3/solution/softfloat.c
@@ -0,0 +1,316 @@
+| CHStone : a suite of benchmark programs for C-based High-Level Synthesis |
+| ======================================================================== |
+| |
+| * Collected and Modified : Y. Hara, H. Tomiyama, S. Honda, |
+| H. Takada and K. Ishii |
+| Nagoya University, Japan |
+| |
+| * Remark : |
+| 1. This source code is modified to unify the formats of the benchmark |
+| programs in CHStone. |
+| 2. Test vectors are added for CHStone. |
+| 3. If "main_result" is 0 at the end of the program, the program is |
+| correctly executed. |
+| 4. Please follow the copyright of each benchmark program. |
+This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
+Package, Release 2b.
+Written by John R. Hauser. This work was made possible in part by the
+International Computer Science Institute, located at Suite 600, 1947 Center
+Street, Berkeley, California 94704. Funding was partially provided by the
+National Science Foundation under grant MIP-9311980. The original version
+of this code was written as part of a project to build a fixed-point vector
+processor in collaboration with the University of California at Berkeley,
+overseen by Profs. Nelson Morgan and John Wawrzynek. More information
+is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
+THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has
+INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
+Derivative works are acceptable, even for commercial purposes, so long as
+(1) the source code for the derivative work includes prominent notice that
+the work is derivative, and (2) the source code includes prominent notice with
+these four paragraphs for those parts of this code that are retained.
+#include "milieu.h"
+#include "softfloat.h"
+| Floating-point rounding mode, extended double-precision rounding precision,
+| and exception flags.
+int8 float_rounding_mode = float_round_nearest_even;
+int8 float_exception_flags = 0;
+| Primitive arithmetic functions, including multi-word arithmetic, and
+| division and square root approximations. (Can be specialized to target if
+| desired.)
+#include "softfloat-macros"
+| Functions and definitions to determine: (1) whether tininess for underflow
+| is detected before or after rounding by default, (2) what (if anything)
+| happens when exceptions are raised, (3) how signaling NaNs are distinguished
+| from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
+| are propagated from function inputs to output. These details are target-
+| specific.
+#include "softfloat-specialize"
+| Returns the fraction bits of the double-precision floating-point value `a'.
+INLINE bits64
+extractFloat64Frac (float64 a)
+{
+  /* The fraction occupies the low 52 bits of the encoding.  */
+  const bits64 fracMask = LIT64 (0x000FFFFFFFFFFFFF);
+
+  return a & fracMask;
+}
+| Returns the exponent bits of the double-precision floating-point value `a'.
+INLINE int16
+extractFloat64Exp (float64 a)
+{
+  /* The 11 exponent bits sit directly above the 52 fraction bits.  */
+  int16 expField = (a >> 52) & 0x7FF;
+
+  return expField;
+}
+| Returns the sign bit of the double-precision floating-point value `a'.
+INLINE flag
+extractFloat64Sign (float64 a)
+{
+  /* The sign is the most significant bit of the 64-bit encoding.  */
+  flag signBit = a >> 63;
+
+  return signBit;
+}
+| Normalizes the subnormal double-precision floating-point value represented
+| by the denormalized significand `aSig'. The normalized exponent and
+| significand are stored at the locations pointed to by `zExpPtr' and
+| `zSigPtr', respectively.
+static void
+normalizeFloat64Subnormal (bits64 aSig, int16 * zExpPtr, bits64 * zSigPtr)
+{
+  /* Leaving 11 leading zeros puts the most significant set bit at
+     position 52, just above the 52-bit fraction field.  */
+  int8 leftShift = countLeadingZeros64 (aSig) - 11;
+
+  *zSigPtr = aSig << leftShift;
+  *zExpPtr = 1 - leftShift;
+}
+| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
+| double-precision floating-point value, returning the result. After being
+| shifted into the proper positions, the three fields are simply added
+| together to form the result. This means that any integer portion of `zSig'
+| will be added into the exponent. Since a properly normalized significand
+| will have an integer portion equal to 1, the `zExp' input should be 1 less
+| than the desired result exponent whenever `zSig' is a complete, normalized
+| significand.
+INLINE float64
+packFloat64 (flag zSign, int16 zExp, bits64 zSig)
+{
+  bits64 signField = ((bits64) zSign) << 63;
+  bits64 expField = ((bits64) zExp) << 52;
+
+  /* Added rather than OR-ed on purpose: an integer bit in zSig carries
+     into the exponent field.  */
+  return signField + expField + zSig;
+}
+| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
+| and significand `zSig', and returns the proper double-precision floating-
+| point value corresponding to the abstract input. Ordinarily, the abstract
+| value is simply rounded and packed into the double-precision format, with
+| the inexact exception raised if the abstract input cannot be represented
+| exactly. However, if the abstract value is too large, the overflow and
+| inexact exceptions are raised and an infinity or maximal finite value is
+| returned. If the abstract value is too small, the input value is rounded
+| to a subnormal number, and the underflow and inexact exceptions are raised
+| if the abstract input cannot be represented exactly as a subnormal double-
+| precision floating-point number.
+| The input significand `zSig' has its binary point between bits 62
+| and 61, which is 10 bits to the left of the usual location. This shifted
+| significand must be normalized or smaller. If `zSig' is not normalized,
+| `zExp' must be 0; in that case, the result returned is a subnormal number,
+| and it must not require rounding. In the usual case that `zSig' is
+| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
+| The handling of underflow and overflow follows the IEC/IEEE Standard for
+| Binary Floating-Point Arithmetic.
+static float64
+roundAndPackFloat64 (flag zSign, int16 zExp, bits64 zSig)
+ int8 roundingMode;
+ flag roundNearestEven, isTiny;
+ int16 roundIncrement, roundBits;
+ /* Choose the value added to the 10 low bits of zSig before they are
+    dropped: 0x200 (half) for nearest-even, 0 to truncate, 0x3FF to round
+    away from zero. */
+ roundingMode = float_rounding_mode;
+ roundNearestEven = (roundingMode == float_round_nearest_even);
+ roundIncrement = 0x200;
+ if (!roundNearestEven)
+ {
+ if (roundingMode == float_round_to_zero)
+ {
+ roundIncrement = 0;
+ }
+ else
+ {
+ roundIncrement = 0x3FF;
+ /* Directed rounding: truncate when the direction points toward zero
+    for this sign (round-up on a negative value, round-down on a
+    non-negative one). */
+ if (zSign)
+ {
+ if (roundingMode == float_round_up)
+ roundIncrement = 0;
+ }
+ else
+ {
+ if (roundingMode == float_round_down)
+ roundIncrement = 0;
+ }
+ }
+ }
+ /* The 10 bits that will be discarded by the final shift. */
+ roundBits = zSig & 0x3FF;
+ /* One test for both exceptional ranges: presumably bits16 is an unsigned
+    16-bit type, so a negative zExp also compares >= 0x7FD -- TODO confirm
+    against the type header. */
+ if (0x7FD <= (bits16) zExp)
+ {
+ if ((0x7FD < zExp)
+ || ((zExp == 0x7FD) && ((sbits64) (zSig + roundIncrement) < 0)))
+ {
+ /* Overflow: the packed infinity pattern, minus one (largest finite
+    value) when the rounding increment is 0. */
+ float_raise (float_flag_overflow | float_flag_inexact);
+ return packFloat64 (zSign, 0x7FF, 0) - (roundIncrement == 0);
+ }
+ if (zExp < 0)
+ {
+ /* Subnormal result: shift the significand right by -zExp, keeping the
+    lost bits as a sticky bit, and recompute the round bits. */
+ isTiny = (float_detect_tininess == float_tininess_before_rounding)
+ || (zExp < -1)
+ || (zSig + roundIncrement < LIT64 (0x8000000000000000));
+ shift64RightJamming (zSig, -zExp, &zSig);
+ zExp = 0;
+ roundBits = zSig & 0x3FF;
+ if (isTiny && roundBits)
+ float_raise (float_flag_underflow);
+ }
+ }
+ if (roundBits)
+ float_exception_flags |= float_flag_inexact;
+ zSig = (zSig + roundIncrement) >> 10;
+ /* Exact tie (round bits == 0x200) under nearest-even: clear the least
+    significant bit so the result is even. */
+ zSig &= ~(((roundBits ^ 0x200) == 0) & roundNearestEven);
+ /* A zero significand is packed with a zero exponent field. */
+ if (zSig == 0)
+ zExp = 0;
+ return packFloat64 (zSign, zExp, zSig);
+| Returns the result of dividing the double-precision floating-point value `a'
+| by the corresponding value `b'. The operation is performed according to
+| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
+float64
+float64_div (float64 a, float64 b)
+{
+  flag aSign, bSign, zSign;
+  int16 aExp, bExp, zExp;
+  bits64 aSig, bSig, zSig;
+  bits64 rem0, rem1, term0, term1;
+
+  aSig = extractFloat64Frac (a);
+  aExp = extractFloat64Exp (a);
+  aSign = extractFloat64Sign (a);
+  bSig = extractFloat64Frac (b);
+  bExp = extractFloat64Exp (b);
+  bSign = extractFloat64Sign (b);
+  zSign = aSign ^ bSign;
+  /* a has an all-ones exponent: NaN (fraction != 0) or infinity.  */
+  if (aExp == 0x7FF)
+    {
+      if (aSig)
+        return propagateFloat64NaN (a, b);
+      if (bExp == 0x7FF)
+        {
+          if (bSig)
+            return propagateFloat64NaN (a, b);
+          /* infinity / infinity */
+          float_raise (float_flag_invalid);
+          return float64_default_nan;
+        }
+      return packFloat64 (zSign, 0x7FF, 0);
+    }
+  /* b is NaN or infinity while a is finite.  */
+  if (bExp == 0x7FF)
+    {
+      if (bSig)
+        return propagateFloat64NaN (a, b);
+      return packFloat64 (zSign, 0, 0);
+    }
+  /* b is zero or subnormal.  */
+  if (bExp == 0)
+    {
+      if (bSig == 0)
+        {
+          if ((aExp | aSig) == 0)
+            {
+              /* zero / zero */
+              float_raise (float_flag_invalid);
+              return float64_default_nan;
+            }
+          float_raise (float_flag_divbyzero);
+          return packFloat64 (zSign, 0x7FF, 0);
+        }
+      normalizeFloat64Subnormal (bSig, &bExp, &bSig);
+    }
+  /* a is zero or subnormal.  */
+  if (aExp == 0)
+    {
+      if (aSig == 0)
+        return packFloat64 (zSign, 0, 0);
+      normalizeFloat64Subnormal (aSig, &aExp, &aSig);
+    }
+  zExp = aExp - bExp + 0x3FD;
+  /* Insert the implicit leading one and left-align both significands.  */
+  aSig = (aSig | LIT64 (0x0010000000000000)) << 10;
+  bSig = (bSig | LIT64 (0x0010000000000000)) << 11;
+  if (bSig <= (aSig + aSig))
+    {
+      aSig >>= 1;
+      ++zExp;
+    }
+  zSig = estimateDiv128To64 (aSig, 0, bSig);
+  /* The estimate may exceed the true quotient by up to 2, so when the low
+     bits are that close to zero compute the exact remainder, correct the
+     quotient, and record any nonzero remainder as a sticky bit.  */
+  if ((zSig & 0x1FF) <= 2)
+    {
+      mul64To128 (bSig, zSig, &term0, &term1);
+      sub128 (aSig, 0, term0, term1, &rem0, &rem1);
+      while ((sbits64) rem0 < 0)
+        {
+          --zSig;
+          add128 (rem0, rem1, 0, bSig, &rem0, &rem1);
+        }
+      zSig |= (rem1 != 0);
+    }
+  return roundAndPackFloat64 (zSign, zExp, zSig);
+}
diff --git a/documentation/tutorial_fpl_2022/03-optimizations/Exercise3/solution/softfloat.h b/documentation/tutorial_fpl_2022/03-optimizations/Exercise3/solution/softfloat.h
new file mode 100755
index 000000000..6d075ca15
--- /dev/null
+++ b/documentation/tutorial_fpl_2022/03-optimizations/Exercise3/solution/softfloat.h
@@ -0,0 +1,77 @@
+| CHStone : a suite of benchmark programs for C-based High-Level Synthesis |
+| ======================================================================== |
+| |
+| * Collected and Modified : Y. Hara, H. Tomiyama, S. Honda, |
+| H. Takada and K. Ishii |
+| Nagoya University, Japan |
+| |
+| * Remark : |
+| 1. This source code is modified to unify the formats of the benchmark |
+| programs in CHStone. |
+| 2. Test vectors are added for CHStone. |
+| 3. If "main_result" is 0 at the end of the program, the program is |
+| correctly executed. |
+| 4. Please follow the copyright of each benchmark program. |
+This C header file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
+Package, Release 2b.
+Written by John R. Hauser. This work was made possible in part by the
+International Computer Science Institute, located at Suite 600, 1947 Center
+Street, Berkeley, California 94704. Funding was partially provided by the
+National Science Foundation under grant MIP-9311980. The original version
+of this code was written as part of a project to build a fixed-point vector
+processor in collaboration with the University of California at Berkeley,
+overseen by Profs. Nelson Morgan and John Wawrzynek. More information
+is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
+THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has
+INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
+Derivative works are acceptable, even for commercial purposes, so long as
+(1) the source code for the derivative work includes prominent notice that
+the work is derivative, and (2) the source code includes prominent notice with
+these four paragraphs for those parts of this code that are retained.
+| Software IEC/IEEE floating-point types.
+typedef unsigned int float32;
+typedef unsigned long long float64;
+| Software IEC/IEEE floating-point underflow tininess-detection mode.
+#define float_tininess_after_rounding 0
+#define float_tininess_before_rounding 1
+| Software IEC/IEEE floating-point rounding mode.
+#define float_round_nearest_even 0
+#define float_round_to_zero 1
+#define float_round_up 2
+#define float_round_down 3
+| Software IEC/IEEE floating-point exception flags.
+#define float_flag_inexact 1
+#define float_flag_divbyzero 2
+#define float_flag_underflow 4
+#define float_flag_overflow 8
+#define float_flag_invalid 16
diff --git a/documentation/tutorial_fpl_2022/03-optimizations/Exercise3/solution/synthesize.sh b/documentation/tutorial_fpl_2022/03-optimizations/Exercise3/solution/synthesize.sh
new file mode 100755
index 000000000..5180fef4b
--- /dev/null
+++ b/documentation/tutorial_fpl_2022/03-optimizations/Exercise3/solution/synthesize.sh
@@ -0,0 +1,6 @@
+# Resolve the directory holding this script so it can be run from anywhere.
+# Expansions are quoted so paths containing spaces survive word splitting.
+abs_script=$(readlink -e "$0")
+dir_script=$(dirname "$abs_script")
+# Run every configuration in ./list through test_panda.py; extra
+# command-line arguments are forwarded unchanged.
+"$dir_script"/../../test_panda.py --tool=bambu --bambu=bambu --spider=spider \
+   --args="--configuration-name=GCC49 --compiler=I386_GCC49" \
+   -c=--simulate -b"$dir_script" -l"$dir_script/list" "$@"
diff --git a/documentation/tutorial_fpl_2022/03-optimizations/Exercise4/module.c b/documentation/tutorial_fpl_2022/03-optimizations/Exercise4/module.c
new file mode 100644
index 000000000..0d0c6140c
--- /dev/null
+++ b/documentation/tutorial_fpl_2022/03-optimizations/Exercise4/module.c
@@ -0,0 +1,4 @@
+/* Returns the product of a and b.  */
+long long func_replace(long long a, long long b)
+{
+   long long product = a * b;
+
+   return product;
+}
\ No newline at end of file
diff --git a/documentation/tutorial_fpl_2022/03-optimizations/Exercise5/README b/documentation/tutorial_fpl_2022/03-optimizations/Exercise5/README
new file mode 100644
index 000000000..955068302
--- /dev/null
+++ b/documentation/tutorial_fpl_2022/03-optimizations/Exercise5/README
@@ -0,0 +1,5 @@
+1. Generate the module implementing the following formula (single precision and double precision):
+gamma = acos((a**2+b**2-c**2)/(2*a*b))
+2. Identify the combination of softfloat operations and libm that yields the best performance.
diff --git a/documentation/tutorial_fpl_2022/03-optimizations/Exercise5/hint.sh b/documentation/tutorial_fpl_2022/03-optimizations/Exercise5/hint.sh
new file mode 100755
index 000000000..244aa73b5
--- /dev/null
+++ b/documentation/tutorial_fpl_2022/03-optimizations/Exercise5/hint.sh
@@ -0,0 +1,7 @@
+# Resolve the directory holding this script; expansions are quoted so that
+# paths containing spaces are not split into several words.
+abs_script=$(readlink -e "$0")
+dir_script=$(dirname "$abs_script")
+# Synthesize and simulate awesome_math on one test vector; the output is
+# shown on the terminal and copied to log.txt.
+bambu "$dir_script/module.c" --top-fname=awesome_math \
+   -O3 -lm --speculative-sdc-scheduling --libm-std-rounding --soft-float \
+   --simulate --generate-tb="a=3.0,b=4.0,c=5.0" \
+   "$@" |& tee log.txt
diff --git a/documentation/tutorial_fpl_2022/03-optimizations/Exercise5/hint.txt b/documentation/tutorial_fpl_2022/03-optimizations/Exercise5/hint.txt
new file mode 100644
index 000000000..783879ebf
--- /dev/null
+++ b/documentation/tutorial_fpl_2022/03-optimizations/Exercise5/hint.txt
@@ -0,0 +1,4 @@
+Use the following parameters:
diff --git a/documentation/tutorial_fpl_2022/03-optimizations/Exercise5/solution/list b/documentation/tutorial_fpl_2022/03-optimizations/Exercise5/solution/list
new file mode 100644
index 000000000..b2a51a80b
--- /dev/null
+++ b/documentation/tutorial_fpl_2022/03-optimizations/Exercise5/solution/list
@@ -0,0 +1,4 @@
+module.c --benchmark-name=std_rounding_softfloat --libm-std-rounding --soft-float
+module.c --benchmark-name=std_rounding_soft-fp --libm-std-rounding --soft-fp
+module.c --benchmark-name=faith_rounding_softfloat --soft-float
+module.c --benchmark-name=faith_rounding_soft-fp --soft-fp
diff --git a/documentation/tutorial_fpl_2022/03-optimizations/Exercise5/solution/module.c b/documentation/tutorial_fpl_2022/03-optimizations/Exercise5/solution/module.c
new file mode 100644
index 000000000..266c15765
--- /dev/null
+++ b/documentation/tutorial_fpl_2022/03-optimizations/Exercise5/solution/module.c
@@ -0,0 +1,24 @@
+#include <math.h>
+
+/* Precision selection: -DFP_SINGLE switches the module to float and the
+   single-precision libm entry points.  */
+#ifdef FP_SINGLE
+#define FP_TYPE float
+#define ACOS(a) acosf(a)
+#else
+#define FP_TYPE double
+#define ACOS(a) acos(a)
+#endif
+
+/* Squaring strategy: -DMULT_SQUARE uses a plain multiplication, otherwise
+   the libm power function is called.  The argument is parenthesized so an
+   expression such as SQUARE(x + 1) expands correctly.  */
+#ifdef MULT_SQUARE
+#define SQUARE(a) ((a) * (a))
+#else
+#ifdef FP_SINGLE
+#define SQUARE(a) powf((a), 2)
+#else
+#define SQUARE(a) pow((a), 2)
+#endif
+#endif
+
+/* Evaluates acos((a^2 + b^2 - c^2) / (2ab)) for the three inputs.  */
+FP_TYPE awesome_math(FP_TYPE a, FP_TYPE b, FP_TYPE c)
+{
+   return ACOS((SQUARE(a) + SQUARE(b) - SQUARE(c)) / (2 * a * b));
+}
diff --git a/documentation/tutorial_fpl_2022/03-optimizations/Exercise5/solution/synthesize.sh b/documentation/tutorial_fpl_2022/03-optimizations/Exercise5/solution/synthesize.sh
new file mode 100755
index 000000000..07ba2bbd5
--- /dev/null
+++ b/documentation/tutorial_fpl_2022/03-optimizations/Exercise5/solution/synthesize.sh
@@ -0,0 +1,11 @@
+abs_script=$(readlink -e $0)
+dir_script=$(dirname $abs_script)
+$dir_script/../../test_panda.py --tool=bambu --bambu=bambu --spider=spider \
+ --args="--configuration-name=pow_square " \
+ --args="--configuration-name=mult_square -DMULT_SQUARE" \
+ --args="--configuration-name=single_pow_square -DFP_SINGLE" \
+ --args="--configuration-name=single_mult_square -DFP_SINGLE -DMULT_SQUARE" \
+ -c=--simulate -c=-lm -c=--generate-tb=$dir_script/testbench.xml -c=--speculative-sdc-scheduling \
+ -c=--top-fname=awesome_math \
+ -b$dir_script -l$dir_script/list "$@"
diff --git a/documentation/tutorial_fpl_2022/03-optimizations/Exercise5/solution/testbench.xml b/documentation/tutorial_fpl_2022/03-optimizations/Exercise5/solution/testbench.xml
new file mode 100644
index 000000000..63bd95293
--- /dev/null
+++ b/documentation/tutorial_fpl_2022/03-optimizations/Exercise5/solution/testbench.xml
@@ -0,0 +1,5 @@
diff --git a/documentation/tutorial_fpl_2022/03-optimizations/Exercise6/module.c b/documentation/tutorial_fpl_2022/03-optimizations/Exercise6/module.c
new file mode 100644
index 000000000..881227549
--- /dev/null
+++ b/documentation/tutorial_fpl_2022/03-optimizations/Exercise6/module.c
@@ -0,0 +1,4 @@
+/* Single-precision multiply-add: returns a * b + c.  Kept as one
+   expression so the compiler sees the same operation as before.  */
+float user_fp(float a, float b, float c)
+{
+   return (a * b) + c;
+}
\ No newline at end of file
diff --git a/documentation/tutorial_fpl_2022/03-optimizations/test_panda.py b/documentation/tutorial_fpl_2022/03-optimizations/test_panda.py
new file mode 100755
index 000000000..d486f8b83
--- /dev/null
+++ b/documentation/tutorial_fpl_2022/03-optimizations/test_panda.py
@@ -0,0 +1,961 @@
+import argparse
+import datetime
+import distutils.spawn
+import logging
+import os
+import re
+import shlex
+import shutil
+import signal
+import subprocess
+import sys
+import threading
+import xml.dom.minidom
+from collections import deque
+#Index of the next benchmark line to hand out; execute_tests reads and increments it
+line_index = 0
+#Set by execute_tests when a run returns non-zero and --stop or --returnfail is given
+failure = False
+def positive_integer(value):
+    """argparse type: convert value with int() and accept it only when it is > 0.
+
+    Raises argparse.ArgumentTypeError for zero or negative numbers; a value that
+    int() cannot convert raises the exception coming from int() itself.
+    """
+    parsed = int(value)
+    if parsed > 0:
+        return parsed
+    raise argparse.ArgumentTypeError("%s must be a positive integer" % value)
+class StoreOrUpdateMin(argparse.Action):
+    """argparse action that keeps the minimum of the values given for an option.
+
+    The first value handled by an action instance is stored as-is; every later
+    value is combined with the value currently in the namespace using min().
+    The "first value" state lives on the action instance, so it is kept across
+    repeated parse_args() calls on the same parser.
+    """
+    #Class-level default; shadowed by an instance attribute after the first call
+    first_parsed = True
+    def __call__(self, parser, namespace, values, option_string=None):
+        if self.first_parsed:
+            self.first_parsed = False
+            setattr(namespace, self.dest, values)
+        else:
+            #Read the current value through self.dest instead of a hard-coded
+            #attribute name, so the action also works for options other than -j
+            setattr(namespace, self.dest, min(getattr(namespace, self.dest), values))
+#Return children of a process
+def GetChildren(parent_pid):
+    """Return the set of PIDs (as int) that ps reports as children of parent_pid.
+
+    ps is run directly (no intermediate shell) and its output is read in text
+    mode, so the parsing works both where pipes yield str and where they yield
+    bytes by default. Blank lines and surrounding whitespace are ignored.
+    """
+    ps_command = subprocess.Popen(["ps", "-o", "pid", "--ppid", "%d" % parent_pid, "--noheaders"], stdout=subprocess.PIPE, universal_newlines=True)
+    #communicate() reads the whole output and waits for ps to terminate
+    ps_output = ps_command.communicate()[0]
+    return set(int(pid_str) for pid_str in ps_output.split())
+#Kill a process, then kill its children
+def kill_proc_tree(pid):
+    #The children are listed before pid is killed; each one is then handled recursively
+    descendants = GetChildren(pid)
+    os.kill(pid, signal.SIGKILL)
+    for descendant in descendants:
+        kill_proc_tree(descendant)
+#Process benchmark in list
+#Repeatedly claims the next line of the file named_list through the shared
+#line_index counter, runs the tool on it in the directory returned by
+#ComputeDirectory and stores output, return value and results there.
+#thread_index selects this caller's slot in the global children list
+#(presumably one slot per worker thread -- confirm against the caller).
+def execute_tests(named_list,thread_index):
+ global passed_benchmark
+ global total_benchmark
+ global line_index
+ global children
+ global failure
+ lines = open(named_list).readlines()
+ #Claim the next unprocessed line index under lock
+ with lock:
+ local_index = line_index
+ line_index += 1
+ #Stop when the list is exhausted, or after a failure when --stop is set
+ while local_index < len(lines) and not (failure and args.stop):
+ cwd = ComputeDirectory(lines[local_index])
+ #Remove a <tool>_failed_output file already present in the run directory
+ failed_output_file_name = os.path.join(cwd, args.tool + "_failed_output")
+ if os.path.exists(failed_output_file_name):
+ os.remove(failed_output_file_name)
+ tool_return_value_file_name = os.path.join(cwd, args.tool + "_return_value")
+ #With --restart, a run whose stored return value is "0" is counted as passed and skipped
+ if args.restart and os.path.exists(os.path.join(cwd, args.tool + "_return_value")):
+ tool_return_value_file = open(tool_return_value_file_name, "r")
+ return_value = tool_return_value_file.read()
+ tool_return_value_file.close()
+ if return_value == "0":
+ with lock:
+ total_benchmark += 1
+ passed_benchmark += 1
+ logging.info(" SKIPPING --- OVERALL: " + str(passed_benchmark) + " passed, " + str(total_benchmark-passed_benchmark) + " failed, " + str(len(lines)-total_benchmark) + " queued --- " + lines[local_index].replace("\\", ""))
+ local_index = line_index
+ line_index += 1
+ continue
+ #Start from a clean HLS_output directory
+ HLS_output_directory = os.path.join(cwd, "HLS_output")
+ if os.path.exists(HLS_output_directory):
+ shutil.rmtree(HLS_output_directory)
+ output_file_name = os.path.join(cwd, args.tool + "_execution_output")
+ output_file = open(output_file_name, "w")
+ #Arguments come from the list line; when it starts with a double quote the first and last characters are dropped
+ local_args = lines[local_index]
+ if local_args[0] == "\"":
+ local_args = local_args[1:-1]
+ #For tools other than bambu and zebu, tokens containing --benchmark-name are not passed on
+ if args.tool != "bambu" and args.tool != "zebu":
+ tokens = shlex.split(lines[local_index])
+ args_without_benchmark_name = ""
+ for token in tokens:
+ if token.find("--benchmark-name") == -1:
+ args_without_benchmark_name += token + " "
+ local_args = args_without_benchmark_name
+ #The command applies the ulimit options and runs the tool under timeout
+ local_command = "ulimit " + args.ulimit + "; exec timeout " + args.timeout + " " + tool_exe
+ local_command = local_command + " " + local_args
+ #Record the command at the top of the execution output
+ output_file.write("#" * 80 + "\n")
+ output_file.write("cd " + cwd + "; ")
+ output_file.write(local_command + "\n")
+ output_file.write("#" * 80 + "\n")
+ output_file.flush()
+ #-1 stays as return value when the process is not started or wait() raises
+ return_value = -1
+ with lock_creation_destruction:
+ if not (failure and args.stop):
+ children[thread_index] = subprocess.Popen(local_command, stderr=output_file, stdout=output_file, cwd=cwd, shell=True, executable="/bin/bash")
+ try:
+ return_value = children[thread_index].wait()
+ except:
+ pass
+ #Record the failure and, with --stop, kill every child that is still running
+ with lock_creation_destruction:
+ if return_value != 0 and (args.stop or args.returnfail):
+ failure = True
+ if failure and args.stop:
+ for local_thread_index in range(n_jobs):
+ if children[local_thread_index] != None:
+ if children[local_thread_index].poll() == None:
+ try:
+ kill_proc_tree(children[local_thread_index].pid)
+ except OSError:
+ pass
+ #Persist output, return value and the original list line in the run directory
+ os.fsync(output_file.fileno())
+ output_file.close()
+ tool_return_value_file = open(tool_return_value_file_name, "w")
+ tool_return_value_file.write(str(return_value))
+ tool_return_value_file.close()
+ args_file = open(os.path.join(cwd, "args"), "w")
+ args_file.write(lines[local_index])
+ args_file.close()
+ #On success, copy CYCLES and CLOCK_SLACK (when present) from the results XML into <tool>_results
+ if return_value == 0 and os.path.exists(os.path.join(cwd, args.tool + "_results_0.xml")):
+ tool_results_file_name = os.path.join(cwd, args.tool + "_results")
+ tool_results_file = open(tool_results_file_name, "w")
+ tool_results_string = ""
+ xml_document = xml.dom.minidom.parse(os.path.join(cwd, args.tool + "_results_0.xml"))
+ if len(xml_document.getElementsByTagName("CYCLES")) > 0:
+ cycles_tag = xml_document.getElementsByTagName("CYCLES")[0]
+ tool_results_string = tool_results_string + cycles_tag.attributes["value"].value + " CYCLES"
+ if len(xml_document.getElementsByTagName("CLOCK_SLACK")) > 0:
+ slack_tag = xml_document.getElementsByTagName("CLOCK_SLACK")[0]
+ tool_results_string = tool_results_string + " *** " + slack_tag.attributes["value"].value + "ns"
+ tool_results_file.write(tool_results_string)
+ tool_results_file.close()
+ #NOTE(review): -9 presumably identifies a run killed with SIGKILL by kill_proc_tree -- confirm
+ if not (failure and args.stop) or (return_value != -9 and return_value != 0):
+ if return_value != 0:
+ shutil.copy(output_file_name, str(os.path.join(os.path.dirname(output_file_name), args.tool + "_failed_output")))
+ with lock:
+ total_benchmark += 1
+ if return_value == 0:
+ passed_benchmark += 1
+ #Cleanup removes sub-directories and every file except return value, execution output, results XML and args
+ if not args.no_clean:
+ for sub in os.listdir(cwd):
+ if os.path.isdir(os.path.join(cwd, sub)):
+ shutil.rmtree(os.path.join(cwd, sub))
+ else:
+ if sub != args.tool + "_return_value" and sub != args.tool + "_execution_output" and sub != args.tool + "_results_0.xml" and sub != "args":
+ os.remove(os.path.join(cwd, sub))
+ if os.path.exists(os.path.join(cwd, args.tool + "_results_0.xml")):
+ logging.info(" SUCCESS (" + tool_results_string + ") --- OVERALL: " + str(passed_benchmark) + " passed, " + str(total_benchmark-passed_benchmark) + " failed, " + str(len(lines)-total_benchmark) + " queued --- " + lines[local_index].replace("\\", ""))
+ else:
+ logging.info(" SUCCESS --- OVERALL: " + str(passed_benchmark) + " passed, " + str(total_benchmark-passed_benchmark) + " failed, " + str(len(lines)-total_benchmark) + " queued --- " + lines[local_index].replace("\\", ""))
+ #Return value 124 is logged as a timeout, 153 as file size limit exceeded
+ elif return_value == 124:
+ logging.info(" FAILURE (Timeout) --- OVERALL: " + str(passed_benchmark) + " passed, " + str(total_benchmark-passed_benchmark) + " failed, " + str(len(lines)-total_benchmark) + " queued --- " + lines[local_index].replace("\\", ""))
+ elif return_value == 153:
+ logging.info(" FAILURE (File size limit exceeded) --- OVERALL: " + str(passed_benchmark) + " passed, " + str(total_benchmark-passed_benchmark) + " failed, " + str(len(lines)-total_benchmark) + " queued --- " + lines[local_index].replace("\\", ""))
+ else:
+ logging.info(" FAILURE --- OVERALL: " + str(passed_benchmark) + " passed, " + str(total_benchmark-passed_benchmark) + " failed, " + str(len(lines)-total_benchmark) + " queued --- " + lines[local_index].replace("\\", ""))
+ #Claim the next line index
+ with lock:
+ local_index = line_index
+ line_index += 1
+#Computing relative path
+def ComputeDirectory(line):
+    """Return abs_path/<configuration name>/<benchmark name> for a list line.
+
+    The line is split with shlex; the names are taken from the tokens that
+    contain --configuration-name and --benchmark-name, by cutting off as many
+    leading characters as the respective "--option=" prefix is long. A name
+    whose option is absent stays empty; the last matching token wins.
+    """
+    configuration_prefix = "--configuration-name="
+    benchmark_prefix = "--benchmark-name="
+    configuration_name = ""
+    benchmark_name = ""
+    for token in shlex.split(line):
+        if "--configuration-name" in token:
+            configuration_name = token[len(configuration_prefix):]
+        if "--benchmark-name" in token:
+            benchmark_name = token[len(benchmark_prefix):]
+    return os.path.join(abs_path, configuration_name, benchmark_name)
+#Search c files
+def SearchCFiles(directory):
+    """Return the set of paths of the C/C++ source files found below directory.
+
+    Sub-directories are visited recursively (one log line per visited
+    directory); a file is selected when its name ends with one of the
+    suffixes listed in source_suffixes.
+    """
+    logging.info(" Looking for file in " + str(directory))
+    source_suffixes = (".c", ".C", ".CPP", ".cpp", ".cxx", ".cc", ".c++")
+    found = set()
+    for entry in os.listdir(directory):
+        entry_path = os.path.join(directory, entry)
+        if os.path.isdir(entry_path):
+            found |= SearchCFiles(entry_path)
+        elif entry.endswith(source_suffixes):
+            found.add(entry_path)
+    return found
+#Collecting results
+#Walks directory recursively. For a directory that is not a leaf it concatenates
+#the <tool>_failed_output files of its sub-directories and writes a "report" file
+#with one SUCCESS/FAILURE entry per benchmark sub-directory. For bambu it also
+#passes the results XML files of the benchmarks in named_list to spider.
+def CollectResults(directory):
+ #Skip if this is a leaf directory
+ if os.path.exists(os.path.join(directory, args.tool + "_return_value")) or os.listdir(directory) == []:
+ return
+ #Sub-directories to visit, in sorted order, skipping panda-temp and HLS_output
+ subdirs = [s for s in sorted(os.listdir(directory)) if os.path.isdir(os.path.join(directory,s)) and s != "panda-temp" and s != "HLS_output"]
+ for subdir in subdirs:
+ CollectResults(os.path.join(directory, subdir))
+ #Aggregate the failed outputs of the sub-directories
+ tool_failed_output = open(os.path.join(directory, args.tool + "_failed_output"), "w")
+ for subdir in subdirs:
+ if os.path.exists(os.path.join(directory, subdir, args.tool + "_failed_output")):
+ tool_failed_output.write(open(os.path.join(directory, subdir, args.tool + "_failed_output")).read())
+ if os.path.exists(os.path.join(directory, subdir, args.tool + "_execution_output")):
+ tool_failed_output.write("\n")
+ tool_failed_output.write("\n")
+ tool_failed_output.write("\n")
+ tool_failed_output.close()
+ #Write the report of this directory
+ report_file = open(os.path.join(directory, "report"), "w")
+ for subdir in subdirs:
+ if os.path.exists(os.path.join(directory, subdir, args.tool + "_return_value")):
+ return_value_file_name = os.path.join(directory, subdir, args.tool + "_return_value")
+ return_value_file = open(return_value_file_name)
+ return_value = return_value_file.read()
+ return_value_file.close()
+ #The first line of the args file is reported, with the benchmarks root prefix removed
+ args_file = open(os.path.join(directory, subdir, "args"))
+ command_args = args_file.readlines()[0]
+ command_args = command_args.replace(abs_benchmarks_root + "/", "")
+ args_file.close()
+ if return_value == "0":
+ tool_results_file_name = os.path.join(directory, subdir, args.tool + "_results")
+ if os.path.exists(tool_results_file_name):
+ report_file.write("SUCCESS (" + open(tool_results_file_name).read() + " cycles) " + command_args.replace("\\", ""))
+ else:
+ report_file.write("SUCCESS: " + command_args.replace("\\", ""))
+ else:
+ if return_value == "124":
+ report_file.write("FAILURE(Timeout): " + command_args.replace("\\", ""))
+ else:
+ report_file.write("FAILURE: " + command_args.replace("\\", ""))
+ report_file.write("\n")
+ #A sub-directory without a return value file contributes its own report, if it has one
+ elif os.path.exists(os.path.join(directory, subdir, "report")):
+ local_report_file = open(os.path.join(directory, subdir, "report"))
+ report_file.write(local_report_file.read())
+ local_report_file.close()
+ report_file.close()
+ #For bambu, the results XML of every benchmark in named_list becomes a spider argument
+ if args.tool == "bambu":
+ local_args = ""
+ named_list_name = os.path.join(abs_path, "named_list")
+ lines = open(named_list_name).readlines()
+ for line in lines:
+ local_dir = ComputeDirectory(line)
+ if os.path.exists(os.path.join(local_dir, args.tool + "_results_0.xml")):
+ local_args = local_args + " " + os.path.join(local_dir, args.tool + "_results_0.xml")
+ if len(local_args) > 0:
+ #Generate experimental setup xml
+ experimental_setup_file_name = os.path.join(abs_path, "experimental_setup.xml")
+ temp_list = open(experimental_setup_file_name, "w")
+ #The output of "<tool> --version" is stored in bambu_version and read back
+ bambu_version_file_name = os.path.join(abs_path, "bambu_version")
+ bambu_version_file = open(bambu_version_file_name, "w")
+ bambu_version_command = [tool_exe]
+ bambu_version_command.extend(shlex.split("--version"))
+ subprocess.call(bambu_version_command, stdout=bambu_version_file)
+ bambu_version_file.close()
+ bambu_version_file = open(bambu_version_file_name, "r")
+ #The version string is the second-to-last line of that output
+ bambu_version = bambu_version_file.readlines()[-2].rstrip()
+ bambu_version_file.close()
+ if args.commonargs != None:
+ bambu_arguments = ' '.join(' '.join(map(str,l)) for l in args.commonargs)
+ else:
+ bambu_arguments = ""
+ #NOTE(review): the literals written below are whitespace only, and bambu_version, bambu_arguments and line never reach the file; the markup looks truncated -- confirm against the upstream script
+ temp_list.write("\n")
+ temp_list.write("\n")
+ temp_list.write(" \n")
+ temp_list.write(" \n")
+ temp_list.write(" \n")
+ temp_list.write(" \n")
+ temp_list.write(" \n")
+ reordered_list_name = os.path.join(abs_path, "reordered_list")
+ reordered_list = open(reordered_list_name, "r")
+ for line in reordered_list.readlines():
+ temp_list.write(" \n")
+ temp_list.write(" \n")
+ temp_list.write("\n")
+ temp_list.close();
+ local_args = local_args + " " + experimental_setup_file_name
+ #The spider style is used as given when that path exists, otherwise it is resolved relative to the spider executable
+ if os.path.exists(args.spider_style) :
+ local_args = local_args + " " + args.spider_style + " " + table
+ else:
+ local_args = local_args + " " + os.path.join(os.path.dirname(spider), args.spider_style) + " " + table
+# logging.info(" Executing " + spider + " " + local_args)
+ logging.info(" Executing " + spider)
+ local_command = [spider]
+ local_command.extend(shlex.split(local_args))
+ return_value = subprocess.call(local_command)
+ logging.info("Collected results of " + directory)
+#Create Junit Body
+#Walks directory recursively and writes to ju_file (an already open, writable
+#file object) the entries for every sub-directory holding a <tool>_return_value file.
+#NOTE(review): every literal passed to ju_file.write below is whitespace only, so
+#return_value, command_args and failed_counter never reach the output; the markup
+#looks truncated -- confirm against the upstream script.
+def CreateJunitBody(directory,ju_file):
+ #Skip if this is a leaf directory
+ if os.path.exists(os.path.join(directory, args.tool + "_return_value")) or os.listdir(directory) == []:
+ return
+ subdirs = [s for s in sorted(os.listdir(directory)) if os.path.isdir(os.path.join(directory,s)) and s != "panda-temp" and s != "HLS_output"]
+ #print_testsuite becomes True when at least one sub-directory holds a return value file
+ print_testsuite = False
+ for subdir in subdirs:
+ if os.path.exists(os.path.join(directory, subdir, args.tool + "_return_value")):
+ print_testsuite = True
+ CreateJunitBody(os.path.join(directory, subdir),ju_file)
+ #The failure count is read from abs_path/failed_counter, defaulting to "0"; the file object is not closed here
+ failed_counter_file_name = os.path.join(abs_path, "failed_counter")
+ failed_counter = "0"
+ if os.path.exists(failed_counter_file_name):
+ failed_counter_file = open(failed_counter_file_name)
+ failed_counter = failed_counter_file.read()
+ if print_testsuite and len(subdirs) > 0:
+ ju_file.write(" \n")
+ for subdir in subdirs:
+ if os.path.exists(os.path.join(directory, subdir, args.tool + "_return_value")):
+ return_value_file_name = os.path.join(directory, subdir, args.tool + "_return_value")
+ return_value_file = open(return_value_file_name)
+ return_value = return_value_file.read()
+ return_value_file.close()
+ #The first line of the args file, with the benchmarks root prefix removed
+ args_file = open(os.path.join(directory, subdir, "args"))
+ command_args = args_file.readlines()[0]
+ command_args = command_args.replace(abs_benchmarks_root + "/", "")
+ args_file.close()
+ #"0" is handled as success, "124" separately from the other non-zero values
+ if return_value == "0":
+ ju_file.write(" \n")
+ else:
+ if return_value == "124":
+ ju_file.write(" \n")
+ ju_file.write(" \n")
+ ju_file.write(" \n")
+ ju_file.write("\n")
+ ju_file.write(" \n")
+ ju_file.write(" \n")
+ ju_file.write("\n")
+ ju_file.write(" \n")
+ ju_file.write(" \n")
+ if print_testsuite and len(subdirs) > 0:
+ ju_file.write(" \n")
+#Create PerfPublisher Body
+#Walks directory recursively and writes to pp_file (an already open, writable
+#file object) the entries for every sub-directory holding a <tool>_return_value
+#file; for successful runs the metrics are read from <tool>_results_0.xml.
+#NOTE(review): every literal passed to pp_file.write below is whitespace only, so
+#none of the collected values reach the output; the markup looks truncated --
+#confirm against the upstream script.
+def CreatePerfPublisherBody(directory,pp_file):
+ #Skip if this is a leaf directory
+ if os.path.exists(os.path.join(directory, args.tool + "_return_value")) or os.listdir(directory) == []:
+ return
+ subdirs = [s for s in sorted(os.listdir(directory)) if os.path.isdir(os.path.join(directory,s)) and s != "panda-temp" and s != "HLS_output"]
+ #print_testsuite is assigned here but not read again in this function
+ print_testsuite = False
+ for subdir in subdirs:
+ if os.path.exists(os.path.join(directory, subdir, args.tool + "_return_value")):
+ print_testsuite = True
+ CreatePerfPublisherBody(os.path.join(directory, subdir),pp_file)
+ for subdir in subdirs:
+ if os.path.exists(os.path.join(directory, subdir, args.tool + "_return_value")):
+ pp_file.write(" \n")
+ pp_file.write(" \n")
+ return_value_file_name = os.path.join(directory, subdir, args.tool + "_return_value")
+ return_value_file = open(return_value_file_name)
+ return_value = return_value_file.read()
+ return_value_file.close()
+ #The first line of the args file, with the benchmarks root prefix removed
+ args_file = open(os.path.join(directory, subdir, "args"))
+ command_args = args_file.readlines()[0]
+ command_args = command_args.replace(abs_benchmarks_root + "/", "")
+ args_file.close()
+ if return_value == "0":
+ pp_file.write(" \n")
+ #Every metric defaults to the empty string, meaning "not present in the results XML"
+ cycles_tag = ""
+ areatime_tag = ""
+ slice_tag = ""
+ sliceluts_tag = ""
+ registers_tag = ""
+ dsps_tag = ""
+ brams_tag = ""
+ period_tag = ""
+ #dsps_tag is initialised a second time here
+ dsps_tag = ""
+ slack_tag = ""
+ frequency_tag = ""
+ HLS_execution_time_tag = ""
+ #Each metric is the "value" attribute of the first element with the given tag name
+ if os.path.exists(os.path.join(directory, subdir, args.tool + "_results_0.xml")):
+ xml_document = xml.dom.minidom.parse(os.path.join(directory, subdir, args.tool + "_results_0.xml"))
+ if len(xml_document.getElementsByTagName("CYCLES")) > 0:
+ cycles_tag = str(xml_document.getElementsByTagName("CYCLES")[0].attributes["value"].value)
+ if len(xml_document.getElementsByTagName("AREAxTIME")) > 0:
+ areatime_tag = str(xml_document.getElementsByTagName("AREAxTIME")[0].attributes["value"].value)
+ if len(xml_document.getElementsByTagName("SLICE")) > 0:
+ slice_tag = str(xml_document.getElementsByTagName("SLICE")[0].attributes["value"].value)
+ if len(xml_document.getElementsByTagName("SLICE_LUTS")) > 0:
+ sliceluts_tag = str(xml_document.getElementsByTagName("SLICE_LUTS")[0].attributes["value"].value)
+ if len(xml_document.getElementsByTagName("REGISTERS")) > 0:
+ registers_tag = str(xml_document.getElementsByTagName("REGISTERS")[0].attributes["value"].value)
+ if len(xml_document.getElementsByTagName("DSPS")) > 0:
+ dsps_tag = str(xml_document.getElementsByTagName("DSPS")[0].attributes["value"].value)
+ if len(xml_document.getElementsByTagName("BRAMS")) > 0:
+ brams_tag = str(xml_document.getElementsByTagName("BRAMS")[0].attributes["value"].value)
+ if len(xml_document.getElementsByTagName("PERIOD")) > 0:
+ period_tag = str(xml_document.getElementsByTagName("PERIOD")[0].attributes["value"].value)
+ if len(xml_document.getElementsByTagName("CLOCK_SLACK")) > 0:
+ slack_tag = str(xml_document.getElementsByTagName("CLOCK_SLACK")[0].attributes["value"].value)
+ if len(xml_document.getElementsByTagName("FREQUENCY")) > 0:
+ frequency_tag = str(xml_document.getElementsByTagName("FREQUENCY")[0].attributes["value"].value)
+ if len(xml_document.getElementsByTagName("HLS_execution_time")) > 0:
+ HLS_execution_time_tag = str(xml_document.getElementsByTagName("HLS_execution_time")[0].attributes["value"].value)
+ #One write per metric that was found
+ if cycles_tag != "":
+ pp_file.write(" \n")
+ if HLS_execution_time_tag != "":
+ pp_file.write(" \n")
+ #dsps_tag is tested twice in the condition below
+ if areatime_tag != "" or slice_tag != "" or sliceluts_tag != "" or registers_tag != "" or dsps_tag != "" or brams_tag != "" or period_tag != "" or dsps_tag != "" or slack_tag != "" or frequency_tag != "":
+ pp_file.write(" \n")
+ if areatime_tag != "":
+ pp_file.write(" \n")
+ if slice_tag != "":
+ pp_file.write(" \n")
+ if sliceluts_tag != "":
+ pp_file.write(" \n")
+ if registers_tag != "":
+ pp_file.write(" \n")
+ if dsps_tag != "":
+ pp_file.write(" \n")
+ if brams_tag != "":
+ pp_file.write(" \n")
+ if period_tag != "":
+ pp_file.write(" \n")
+ if slack_tag != "":
+ pp_file.write(" \n")
+ if frequency_tag != "":
+ pp_file.write(" \n")
+ pp_file.write(" \n")
+ else:
+ #"124" is handled separately from the other non-zero return values
+ if return_value == "124":
+ pp_file.write(" \n")
+ else:
+ pp_file.write(" \n")
+ pp_file.write(" \n")
+ pp_file.write(" \n")
+#Command line definition; an argument starting with @ names a file from which further arguments are read
+parser = argparse.ArgumentParser(description="Performs panda tests", fromfile_prefix_chars='@')
+parser.add_argument("files", help="The files to be tested: they can be configuration files, directories containing benchmarks or source code files.", nargs='*', action="append")
+parser.add_argument('-l', "--benchmarks_list", help="The file containing the list of tests to be performed", nargs='*', action="append")
+parser.add_argument('-b', "--benchmarks_root", help="The directory containing benchmarks")
+parser.add_argument('-o', "--output", help="The directory where output files we be put (default=\"output\")", default="output")
+#-j uses StoreOrUpdateMin: when it is given several times the smallest value is kept
+parser.add_argument('-j', help="The number of jobs which execute the benchmarks (default=\"1\")", default=1, type=positive_integer, action=StoreOrUpdateMin)
+parser.add_argument("--bambu", help="The bambu executable (default=/opt/panda/bin/bambu)", default="/opt/panda/bin/bambu")
+parser.add_argument("--spider", help="The spider executable (default=/opt/panda/bin/spider)", default="/opt/panda/bin/spider")
+parser.add_argument("--spider-style", help="The spider table style relative to the spider executable (default=../lib/latex_format_bambu_results.xml)", default="../lib/latex_format_bambu_results.xml")
+parser.add_argument("--zebu", help="The zebu executable (default=/opt/panda/bin/zebu)", default="/opt/panda/bin/zebu")
+parser.add_argument('-t', "--timeout", help="Timeout for tool execution (default=60m)", default="60m")
+#--args and --commonargs may be repeated; each occurrence is appended as a list of values
+parser.add_argument('-a', "--args", help="A set of arguments to be passed to the tool", nargs='*', action='append')
+parser.add_argument('-c', "--commonargs", help="A set of arguments to be passed to the tool", nargs='*', action='append')
+parser.add_argument("--table", help="Print the results in tex format", default="results.tex")
+parser.add_argument("--tool", help="The tool to be tested", default="bambu")
+parser.add_argument("--ulimit", help="The ulimit options", default="-f 2097152 -v 8388608 -s 16384")
+parser.add_argument("--stop", help="Stop the execution on first error (default=false)", default=False, action="store_true")
+parser.add_argument("--returnfail", help="Return FAILURE in case at least one test fails (default=false)", default=False, action="store_true")
+parser.add_argument("--mail", help="Send a mail with the result")
+parser.add_argument("--name", help="Set the name of this regression (default=Bambu regression)", nargs='*', action='append')
+parser.add_argument("--no-clean", help="Do not clean produced files", default=False, action="store_true")
+parser.add_argument("--restart", help="Restart last execution (default=false)", default=False, action="store_true")
+parser.add_argument("--script", help="Set the bash script in the generated tex", default="")
+parser.add_argument("--junitdir", help="Set the JUnit directory", default="")
+parser.add_argument("--perfpublisherdir", help="Set the PerfPublisher directory", default="")
+#First parse: gives the positional arguments and the job count; a second parse follows once configuration files are marked with @
+args = parser.parse_args()
+n_jobs = args.j # set this, because it will be overwritten by the parse of modified_argv
+logging.basicConfig(level=logging.INFO,format='%(levelname)s: %(message)s')
+#The absolute path of current script
+abs_script = os.path.abspath(sys.argv[0])
+#Expand configuration files
+#A positional argument that is neither a C/C++ source file name nor an existing
+#directory (as given, under the benchmarks root or next to this script) is
+#prefixed with @, so that the second parse reads its content as arguments.
+#All other arguments are copied unchanged.
+modified_argv = []
+for arg in sys.argv[1:]:
+ if arg in args.files[0]:
+ #.c/c++ file
+ if (arg[-2:] == ".c") or (arg[-2:] == ".C") or (arg[-4:] == ".CPP") or (arg[-4:] == ".cpp") or (arg[-4:] == ".cxx") or (arg[-3:] == ".cc") or (arg[-4:] == ".c++"):
+ modified_argv.append(arg)
+ #Check if it is a directory
+ elif os.path.exists(arg) and os.path.isdir(arg):
+ modified_argv.append(arg)
+ elif args.benchmarks_root != None and os.path.exists(os.path.join(os.path.abspath(args.benchmarks_root), arg)) and os.path.isdir(os.path.join(os.path.abspath(args.benchmarks_root), arg)):
+ modified_argv.append(arg)
+ elif os.path.exists(os.path.join(os.path.dirname(abs_script), arg)) and os.path.isdir(os.path.join(os.path.dirname(abs_script), arg)):
+ modified_argv.append(arg)
+ else:
+ modified_argv.append("@" + arg)
+ else:
+ modified_argv.append(arg)
+#Second parse, with the configuration files expanded
+args = parser.parse_args(modified_argv)
+#The absolute path of current script
+abs_script = os.path.abspath(sys.argv[0])
+#The table to be produced
+table = os.path.abspath(args.table)
+#Check if output directory exists, if yes abort
+if os.path.exists(args.output) and not args.restart:
+ logging.error("Output directory " + args.output + " already exists. Please remove it or specify a different one with -o")
+ sys.exit(1)
+#Check if JUnit dir exist
+if args.junitdir != "" and not os.path.exists(args.junitdir):
+ os.mkdir(args.junitdir)
+#Check if PerfPublisher dir exist
+if args.perfpublisherdir != "" and not os.path.exists(args.perfpublisherdir):
+ os.mkdir(args.perfpublisherdir)
+#compute JUnit file name
+#The first Junit_report<N>.xml (N = 0, 1, ...) that is not an existing file is used
+junit_file_name = ""
+if args.junitdir != "":
+ junit_index = 0
+ junit_file_name = os.path.abspath(os.path.join(args.junitdir, "Junit_report"+str(junit_index)+".xml"))
+ while os.path.isfile(junit_file_name) :
+ junit_index = junit_index + 1
+ junit_file_name = os.path.abspath(os.path.join(args.junitdir, "Junit_report"+str(junit_index)+".xml"))
+#compute PerfPublisher file name
+#Same scheme: the first PerfPublisher_report<N>.xml that is not an existing file
+perfpublisher_name = ""
+perfpublisher_file_name = ""
+if args.perfpublisherdir != "":
+ perfpublisher_index = 0
+ perfpublisher_name = "PerfPublisher_report"+str(perfpublisher_index)
+ perfpublisher_file_name = os.path.abspath(os.path.join(args.perfpublisherdir, perfpublisher_name+".xml"))
+ while os.path.isfile(perfpublisher_file_name) :
+ perfpublisher_index = perfpublisher_index + 1
+ perfpublisher_name = "PerfPublisher_report"+str(perfpublisher_index)
+ perfpublisher_file_name = os.path.abspath(os.path.join(args.perfpublisherdir, perfpublisher_name+".xml"))
+#Create the folder and enter in it
+abs_path = os.path.abspath(args.output)
+#--restart is dropped when the output directory does not exist
+if args.restart:
+ if not os.path.exists(abs_path):
+ args.restart = False
+if not args.restart:
+ os.mkdir(abs_path)
+#Skipping if all benchmarks already pass
+#Applies only when failed_counter holds "0" and neither a JUnit nor a PerfPublisher directory was requested
+if args.restart:
+ failed_counter_file_name = os.path.join(abs_path, "failed_counter")
+ if os.path.exists(failed_counter_file_name):
+ failed_counter_file = open(failed_counter_file_name)
+ failed_counter = failed_counter_file.read()
+ if failed_counter == "0" and args.junitdir == "" and args.perfpublisherdir == "":
+ logging.info("Already pass")
+ sys.exit(0)
+#Check tool executable
+#For bambu and zebu the path given on the command line is used when it is an
+#executable file; otherwise the directories in PATH are searched.
+#NOTE(review): the PATH loops do not stop at the first match, so the last matching directory wins -- confirm this is intended.
+tool_exe = ""
+if args.tool == "bambu":
+ if os.path.isfile(args.bambu) and os.access(args.bambu, os.X_OK):
+ tool_exe = args.bambu
+ else:
+ #Check bambu in the path
+ for path in os.environ["PATH"].split(os.pathsep):
+ exe_file = os.path.join(path, "bambu")
+ if os.path.isfile(exe_file) and os.access(exe_file, os.X_OK):
+ tool_exe = exe_file
+ if tool_exe == "":
+ #NOTE(review): compared with "opt/panda/bin/bambu" (no leading slash) while the --bambu default is "/opt/panda/bin/bambu", so the default value also takes this branch -- confirm
+ if args.bambu != "opt/panda/bin/bambu":
+ if not os.path.isfile(args.bambu):
+ logging.error(args.bambu + " does not exist")
+ else:
+ logging.error(args.bambu + " is not an executable")
+ else:
+ logging.error("bambu not found")
+ sys.exit(1)
+ logging.info("Bambu found: " + tool_exe)
+elif args.tool == "zebu":
+ if os.path.isfile(args.zebu) and os.access(args.zebu, os.X_OK):
+ tool_exe = args.zebu
+ else:
+ #Check zebu in the path
+ for path in os.environ["PATH"].split(os.pathsep):
+ exe_file = os.path.join(path, "zebu")
+ if os.path.isfile(exe_file) and os.access(exe_file, os.X_OK):
+ tool_exe = exe_file
+ if tool_exe == "":
+ #NOTE(review): same missing leading slash as in the bambu branch -- confirm
+ if args.zebu != "opt/panda/bin/zebu":
+ if not os.path.isfile(args.zebu):
+ logging.error(args.zebu + " does not exist")
+ else:
+ logging.error(args.zebu + " is not an executable")
+ else:
+ logging.error("zebu not found")
+ sys.exit(1)
+ logging.info("Zebu found: " + tool_exe)
+#NOTE(review): the lines below look like the branch for any other tool (the tool name itself is looked up as an executable); an "else:" appears to be missing before them -- confirm against the upstream script
+ tool_exe = args.tool
+ if distutils.spawn.find_executable(tool_exe) == None:
+ logging.error(tool_exe + " not found")
+ sys.exit(1)
+#Resolve the benchmarks root: an absolute path is used as given, a relative one
+#is looked up under the parent of the current directory and then three levels
+#above the directory of this script.
+#NOTE(review): abs_configuration_dir is not assigned anywhere in the visible part of the script, and an "else:" appears to be missing after its use (as written, os.path.isabs would receive None) -- confirm against the upstream script
+if args.benchmarks_root is None:
+ abs_benchmarks_root = abs_configuration_dir
+ if os.path.isabs(args.benchmarks_root):
+ abs_benchmarks_root = os.path.abspath(args.benchmarks_root)
+ else:
+ if os.path.exists(os.path.join(os.path.abspath(".."), args.benchmarks_root)):
+ abs_benchmarks_root = os.path.join(os.path.abspath(".."), args.benchmarks_root)
+ else:
+ if os.path.exists(os.path.join(os.path.abspath(os.path.join(os.path.dirname(abs_script), "../../..")), args.benchmarks_root)):
+ abs_benchmarks_root = os.path.join(os.path.abspath(os.path.join(os.path.dirname(abs_script), "../../..")), args.benchmarks_root)
+ else:
+ logging.error(args.benchmarks_root + " not found")
+ sys.exit(1)
+#Check spider executable
+spider = ""
+if os.path.isfile(args.spider) and os.access(args.spider, os.X_OK):
+ spider = args.spider
+#NOTE(review): an "else:" appears to be missing here; as written the PATH search below can replace the path given with --spider -- confirm
+ #Check spider in the path
+ for path in os.environ["PATH"].split(os.pathsep):
+ exe_file = os.path.join(path, "spider")
+ if os.path.isfile(exe_file) and os.access(exe_file, os.X_OK):
+ spider = exe_file
+if spider == "":
+ #NOTE(review): compared with "opt/panda/bin/spider" (no leading slash) while the --spider default is "/opt/panda/bin/spider" -- confirm
+ if args.spider != "opt/panda/bin/spider":
+ if not os.path.isfile(args.spider):
+ logging.error(args.spider + " does not exist")
+ else:
+ logging.error(args.spider + " is not an executable")
+ else:
+ logging.error("spider not found")
+ sys.exit(1)
+#NOTE(review): the benchmarks root resolution below repeats the one before the spider check line for line -- confirm whether both copies are needed
+if args.benchmarks_root is None:
+ abs_benchmarks_root = abs_configuration_dir
+ if os.path.isabs(args.benchmarks_root):
+ abs_benchmarks_root = os.path.abspath(args.benchmarks_root)
+ else:
+ if os.path.exists(os.path.join(os.path.abspath(".."), args.benchmarks_root)):
+ abs_benchmarks_root = os.path.join(os.path.abspath(".."), args.benchmarks_root)
+ else:
+ if os.path.exists(os.path.join(os.path.abspath(os.path.join(os.path.dirname(abs_script), "../../..")), args.benchmarks_root)):
+ abs_benchmarks_root = os.path.join(os.path.abspath(os.path.join(os.path.dirname(abs_script), "../../..")), args.benchmarks_root)
+ else:
+ logging.error(args.benchmarks_root + " not found")
+ sys.exit(1)
+mutt = None
+logging.info("Spider found: " + spider)
+#Check mutt executable
+#mutt is looked up only when --mail is given; the script exits if it is not found in PATH
+if args.mail != None:
+ #Check mutt in the path
+ for path in os.environ["PATH"].split(os.pathsep):
+ exe_file = os.path.join(path, "mutt")
+ if os.path.isfile(exe_file) and os.access(exe_file, os.X_OK):
+ mutt = exe_file
+ if mutt == None:
+ logging.error("mutt not found")
+ sys.exit(1)
+if not args.restart:
+ #Check if file lists exist
+ abs_lists = []
+ if args.benchmarks_list != None:
+ for relative_list in args.benchmarks_list:
+ #First look in the current directory
+ if os.path.exists(os.path.abspath("../" + relative_list[0])):
+ abs_lists.append(os.path.abspath("../" + relative_list[0]))
+ #Then look in script directory
+ elif os.path.exists(os.path.join(os.path.dirname(abs_script), relative_list[0])):
+ abs_lists.append(os.path.join(os.path.dirname(abs_script), relative_list[0]))
+ #Then look in configuration directory
+ elif os.path.exists(os.path.join(abs_configuration_dir, relative_list[0])):
+ abs_lists.append(os.path.join(abs_configuration_dir, relative_list[0]))
+ #Then look in benchmarks root
+ elif os.path.exists(os.path.join(abs_benchmarks_root, relative_list[0])):
+ abs_lists.append(os.path.join(abs_benchmarks_root, relative_list[0]))
+ else:
+ logging.error(relative_list[0] + " does not exist")
+ sys.exit(1)
+ files_list = []
+ #Create temp list with arg
+ for arg in args.files[0]:
+ #.c/.c++ file
+ if (arg[-2:] == ".c") or (arg[-2:] == ".C") or (arg[-4:] == ".CPP") or (arg[-4:] == ".cpp") or (arg[-4:] == ".cxx") or (arg[-3:] == ".cc") or (arg[-4:] == ".c++"):
+ files_list.append(arg)
+ #Check if it is a directory
+ elif os.path.exists(arg) and os.path.isdir(arg):
+ files_list.append(arg)
+ elif os.path.exists(os.path.join(os.path.dirname(abs_script), arg)) and os.path.isdir(os.path.join(os.path.dirname(abs_script), arg)):
+ files_list.append(arg)
+ elif os.path.exists(os.path.join(abs_benchmarks_root, arg)) and os.path.isdir(os.path.join(abs_benchmarks_root, arg)):
+ files_list.append(os.path.join(abs_benchmarks_root, arg))
+ if args.benchmarks_list == None and len(files_list) == 0 and (args.tool == "bambu" or args.tool == "zebu"):
+ logging.error("Benchmarks not found")
+ sys.exit(2)
+ if len(files_list) > 0:
+ temp_list = open(os.path.join(abs_path, "temp_list"), "w")
+ for element in files_list:
+ temp_list.write(element)
+ temp_list.close()
+ abs_lists.append(os.path.join(abs_path, "temp_list"))
+ #Reordering elements in each row
+ reordered_list_name = os.path.join(abs_path, "reordered_list")
+ reordered_list = open(reordered_list_name, "w")
+ logging.info("Preparing benchmark list")
+ logging.info(" Reordering arguments")
+ for abs_list in abs_lists:
+ list_file = open(abs_list)
+ lines = list_file.readlines()
+ list_file.close()
+ for line in lines:
+ if line.strip() == "":
+ continue
+ if line[0] =='#':
+ continue
+ if args.tool != "bambu" and args.tool != "zebu":
+ reordered_list.write(line)
+ continue
+ tokens = shlex.split(line)
+ parameters = list()
+ #Flag used to ad-hoc manage --param arg
+ follow_param = False
+ for token in tokens:
+ if token[0] == '-':
+ parameters.append(re.escape(token))
+ if token.find("--param") != -1:
+ follow_param = True;
+ else:
+ follow_param = False;
+ else:
+ if follow_param == True:
+ parameters.append(re.escape(token))
+ else:
+ reordered_list.write(token + " ")
+ follow_param = False;
+ for parameter in parameters:
+ reordered_list.write(re.escape(parameter) + " ")
+ reordered_list.write("\n")
+ reordered_list.close()
+ #Expanding directory
+ expanded_list_name = os.path.join(abs_path, "expanded_list")
+ expanded_list = open(expanded_list_name, "w")
+ logging.info(" Expanding directory")
+ lines = open(reordered_list_name).readlines()
+ for line in lines:
+ if line.strip() == "":
+ continue
+ tokens = shlex.split(line)
+ if args.tool == "bambu" or args.tool == "zebu":
+ if(tokens[0][0] != '/'):
+ first_parameter = os.path.join(abs_benchmarks_root, tokens[0])
+ else:
+ first_parameter = tokens[0]
+ else:
+ first_parameter = tokens[0].replace("BENCHMARKS\_ROOT", abs_benchmarks_root)
+ other_parameters = tokens[1:len(tokens)]
+ if not os.path.exists(first_parameter) and (args.tool == "bambu" or args.tool == "zebu"):
+ logging.error(first_parameter + " does not exist")
+ sys.exit(1)
+ #Check if it is a directory or a file
+ if os.path.isdir(first_parameter):
+ logging.info(" " + tokens[0])
+ c_files = SearchCFiles(first_parameter)
+ c_files = sorted(c_files)
+ for c_file in c_files:
+ expanded_list.write(c_file)
+ for other_parameter in other_parameters:
+ expanded_list.write(" " + other_parameter.replace("BENCHMARKS\_ROOT", abs_benchmarks_root))
+ expanded_list.write("\n")
+ else:
+ expanded_list.write(first_parameter)
+ for other_parameter in other_parameters:
+ if ((other_parameter[-2:] == ".c") or (other_parameter[-2:] == ".C") or (other_parameter[-4:] == ".CPP") or (other_parameter[-4:] == ".cpp") or (other_parameter[-4:] == ".cxx") or (other_parameter[-3:] == ".cc") or (other_parameter[-4:] == ".c++") or other_parameter[-4:] == ".xml") and other_parameter[0] != '\\':
+ if other_parameter[0] == '/':
+ expanded_list.write(" " + other_parameter)
+ else:
+ expanded_list.write(" " + os.path.join(abs_benchmarks_root, other_parameter))
+ else:
+ expanded_list.write(" " + other_parameter.replace("BENCHMARKS\_ROOT", abs_benchmarks_root).replace("BENCHMARKS_ROOT", abs_benchmarks_root))
+ expanded_list.write("\n")
+ expanded_list.close()
+ #Adding parameters
+ logging.info(" Considering all tool arguments")
+ arg_lists = args.args
+ if not arg_lists:
+ arg_lists = [("")]
+ arged_list_name = os.path.join(abs_path, "arged_list")
+ arged_list = open(arged_list_name, "w")
+ lines = open(expanded_list_name).readlines()
+ for arg_list in arg_lists:
+ for line in lines:
+ arged_list.write(line.rstrip())
+ if len(arg_list) > 0:
+ arg = arg_list[0]
+ if arg[0] == "\"":
+ arg = arg[1:-1]
+ arged_list.write(" " + arg)
+ if args.commonargs != None and len(args.commonargs) > 0:
+ for commonarg in args.commonargs:
+ arged_list.write(" " + commonarg[0].replace("#", " "))
+ arged_list.write("\n")
+ arged_list.close()
+ #Name of benchmarks
+ full_names = set()
+ #Adding benchmark name
+ logging.info(" Adding benchmark name")
+ named_list_name = os.path.join(abs_path, "named_list")
+ named_list = open(named_list_name, "w")
+ lines = open(arged_list_name).readlines()
+ for line in lines:
+ named_list.write(line.rstrip())
+ #Retrieve configuration name and benchmark name
+ configuration_name = ""
+ benchmark_name = ""
+ tokens = shlex.split(line)
+ for token in tokens:
+ if token.find("--configuration-name") != -1:
+ configuration_name = token[len("--configuration-name="):]
+ if token.find("--benchmark-name") != -1:
+ benchmark_name = token[len("--benchmark-name="):]
+ if benchmark_name == "":
+ if args.tool != "bambu" and args.tool != "zebu":
+ logging.error("Missing benchmark name")
+ sys.exit(1)
+ benchmark_name = os.path.basename(line.split()[0])[:-2]
+ named_list.write(" --benchmark-name=" + benchmark_name)
+ full_name = configuration_name + ":" + benchmark_name
+ logging.info(" " + full_name)
+ if full_name in full_names:
+ logging.error("Duplicated configuration name - benchmark name: " + full_name)
+ sys.exit(1)
+ full_names.add(full_name)
+ named_list.write("\n")
+ named_list.close()
+ #Generating folder
+ logging.info(" Generating output directories")
+ lines = open(named_list_name).readlines()
+ for line in lines:
+ new_dir = ComputeDirectory(line)
+ logging.info(" Creating directory " + new_dir)
+ os.makedirs(new_dir)
+ logging.info(" Skipping generation of lists and directories")
+ named_list_name = os.path.join(abs_path, "named_list")
+ if not os.path.exists(named_list_name):
+ logging.error("List of previous run not found")
+ sys.exit(1)
+#Create threads
+logging.info(" Launching tool")
+lock = threading.RLock()
+lock_creation_destruction = threading.RLock()
+passed_benchmark = 0
+total_benchmark = 0
+threads = []
+children = [None] * n_jobs
+for thread_index in range(n_jobs):
+ threads.insert(thread_index, threading.Thread(target=execute_tests, args=(named_list_name, thread_index)))
+ threads[thread_index].daemon=True
+ threads[thread_index].start()
+ #Wait threads
+ for thread_index in range(n_jobs):
+ while threads[thread_index].isAlive():
+ threads[thread_index].join(100)
+except KeyboardInterrupt:
+ logging.error("SIGINT received")
+ failure = True
+ for local_thread_index in range(n_jobs):
+ if children[local_thread_index] != None:
+ if children[local_thread_index].poll() == None:
+ try:
+ kill_proc_tree(children[local_thread_index].pid)
+ except OSError:
+ pass
+ sys.exit(1)
+#Collect results
+#In case, it create the JUnit file
+if args.junitdir != "":
+ junit_file = open(junit_file_name, "w")
+ junit_file.write("\n")
+ junit_file.write("\n")
+ CreateJunitBody(abs_path,junit_file)
+ junit_file.write("\n")
+#In case, it create the PerfPublisher file
+if args.perfpublisherdir != "":
+ perfpublisher_file = open(perfpublisher_file_name, "w")
+ perfpublisher_file.write("\n")
+ perfpublisher_file.write("\n")
+ CreatePerfPublisherBody(abs_path,perfpublisher_file)
+ perfpublisher_file.write("\n")
+#Prepare final report
+if args.tool == "bambu" or args.tool == "zebu":
+ report_file_name = os.path.join(abs_path, "report")
+ report_file = open(report_file_name)
+ lines = report_file.readlines()
+ report_file.close()
+ report_file = open(report_file_name, "w")
+ command = [tool_exe, "--version"]
+ subprocess.call(command, stderr=report_file, stdout=report_file)
+ report_file.write("SYSTEM INFORMATION:\n")
+ report_file.flush()
+ command = ["lsb_release", "-a"]
+ subprocess.call(command, stderr=report_file, stdout=report_file)
+ report_file.write("\n")
+ report_file.write("CURRENT TIME:\n")
+ report_file.write(str(datetime.datetime.now())+ "\n\n")
+ report_file.write("PASSED TESTS:\n")
+ report_file.write(str(passed_benchmark) + "/" + str(total_benchmark) + "\n\n")
+ failed_counter_file_name = os.path.join(abs_path, "failed_counter")
+ failed_counter_file = open(failed_counter_file_name, "w")
+ failed_counter_file.write(str(total_benchmark - passed_benchmark))
+ failed_counter_file.close()
+ for line in lines:
+ report_file.write(line)
+ report_file.close()
+if args.mail != None:
+ outcome = ""
+ if args.stop:
+ if failure:
+ outcome = "FAILURE"
+ else:
+ outcome = "SUCCESS"
+ else:
+ outcome = str(passed_benchmark) + "/" + str(total_benchmark)
+ full_name = ""
+ if len(args.name):
+ for name in args.name:
+ full_name = full_name + name[0]
+ else:
+ full_name = "Bambu"
+ local_command = "cat " + report_file_name + " | mutt -s \"" + full_name + ": " + outcome + "\" " + args.mail
+ subprocess.call(local_command, shell=True)
+if failure:
+ sys.exit(1)
diff --git a/documentation/tutorial_fpl_2022/04-axi/Exercise1/bambu.sh b/documentation/tutorial_fpl_2022/04-axi/Exercise1/bambu.sh
new file mode 100755
index 000000000..c4bdb214e
--- /dev/null
+++ b/documentation/tutorial_fpl_2022/04-axi/Exercise1/bambu.sh
@@ -0,0 +1,11 @@
+script=$(readlink -e $0)
+root_dir=$(dirname $script)
+rm -rf read
+mkdir -p read
+cd read
+echo "#simulating read"
+bambu ../read.c --top-fname=read \
+ --generate-tb=../test_read.xml --simulator=VERILATOR --simulate \
+ -v4 --generate-interface=INFER --compiler=I386_CLANG6 "$@" |& tee read.log
\ No newline at end of file
diff --git a/documentation/tutorial_fpl_2022/04-axi/Exercise1/read.c b/documentation/tutorial_fpl_2022/04-axi/Exercise1/read.c
new file mode 100644
index 000000000..15bfeb4d4
--- /dev/null
+++ b/documentation/tutorial_fpl_2022/04-axi/Exercise1/read.c
@@ -0,0 +1,6 @@
+/*
+ * Return the short int value that data points to.
+ * The pragma below tags the data parameter as "m_axi direct".
+ */
+#pragma HLS_interface data m_axi direct
+short int read(short int* data)
+{
+   return *(data);
+}
\ No newline at end of file
diff --git a/documentation/tutorial_fpl_2022/04-axi/Exercise1/test_read.xml b/documentation/tutorial_fpl_2022/04-axi/Exercise1/test_read.xml
new file mode 100644
index 000000000..41d2b7747
--- /dev/null
+++ b/documentation/tutorial_fpl_2022/04-axi/Exercise1/test_read.xml
@@ -0,0 +1,4 @@
\ No newline at end of file
diff --git a/documentation/tutorial_fpl_2022/04-axi/Exercise2/bambu.sh b/documentation/tutorial_fpl_2022/04-axi/Exercise2/bambu.sh
new file mode 100755
index 000000000..02705a7e4
--- /dev/null
+++ b/documentation/tutorial_fpl_2022/04-axi/Exercise2/bambu.sh
@@ -0,0 +1,11 @@
+script=$(readlink -e $0)
+root_dir=$(dirname $script)
+rm -rf sum
+mkdir -p sum
+cd sum
+echo "#simulating sum"
+bambu ../sum.c --top-fname=sum \
+ --generate-tb=../test_sum.xml --simulator=VERILATOR --simulate \
+ -v4 --generate-interface=INFER --compiler=I386_CLANG6 "$@" |& tee sum.log
\ No newline at end of file
diff --git a/documentation/tutorial_fpl_2022/04-axi/Exercise2/sum.c b/documentation/tutorial_fpl_2022/04-axi/Exercise2/sum.c
new file mode 100644
index 000000000..a985ac74b
--- /dev/null
+++ b/documentation/tutorial_fpl_2022/04-axi/Exercise2/sum.c
@@ -0,0 +1,14 @@
+/*
+ * Return the sum of the first *n elements of v.
+ * The pragmas below tag both pointer parameters as "m_axi direct".
+ * The loop condition dereferences n each time it is evaluated.
+ */
+#pragma HLS_interface v m_axi direct
+#pragma HLS_interface n m_axi direct
+int sum(int* v, unsigned* n)
+{
+   int sum = 0;
+   for(unsigned i = 0; i < *(n); i++)
+   {
+      sum += v[i];
+   }
+   return sum;
+}
\ No newline at end of file
diff --git a/documentation/tutorial_fpl_2022/04-axi/Exercise2/test_sum.xml b/documentation/tutorial_fpl_2022/04-axi/Exercise2/test_sum.xml
new file mode 100644
index 000000000..857a03ac1
--- /dev/null
+++ b/documentation/tutorial_fpl_2022/04-axi/Exercise2/test_sum.xml
@@ -0,0 +1,4 @@
\ No newline at end of file
diff --git a/documentation/tutorial_fpl_2022/04-axi/Exercise3/bambu.sh b/documentation/tutorial_fpl_2022/04-axi/Exercise3/bambu.sh
new file mode 100755
index 000000000..c5e5e0773
--- /dev/null
+++ b/documentation/tutorial_fpl_2022/04-axi/Exercise3/bambu.sh
@@ -0,0 +1,11 @@
+script=$(readlink -e $0)
+root_dir=$(dirname $script)
+rm -rf maxNumbers
+mkdir -p maxNumbers
+cd maxNumbers
+echo "#simulating maxNumbers"
+bambu ../maxNumbers.c --top-fname=maxNumbers \
+ --generate-tb=../test_maxNumbers.xml --simulator=VERILATOR --simulate \
+ -v4 --generate-interface=INFER --compiler=I386_CLANG6 "$@" |& tee maxNumbers.log
\ No newline at end of file
diff --git a/documentation/tutorial_fpl_2022/04-axi/Exercise3/maxNumbers.c b/documentation/tutorial_fpl_2022/04-axi/Exercise3/maxNumbers.c
new file mode 100644
index 000000000..17b105f9e
--- /dev/null
+++ b/documentation/tutorial_fpl_2022/04-axi/Exercise3/maxNumbers.c
@@ -0,0 +1,21 @@
+/*
+ * Store the largest of the first *n_ptr elements of a into *res.
+ * The element count is read once from *n_ptr into a local variable.
+ * When the count is 0 the array is not read and *res receives the most
+ * negative 32-bit int value as a sentinel.
+ * The pragmas below tag a, n_ptr and res as "m_axi direct".
+ */
+#pragma HLS_interface a m_axi direct
+#pragma HLS_interface n_ptr m_axi direct
+#pragma HLS_interface res m_axi direct
+void maxNumbers(int* a, unsigned int* n_ptr, int* res)
+{
+   unsigned i;
+   int result;
+   unsigned int n = *n_ptr;
+   if(n == 0)
+   {
+      /* Spelled as -2147483647 - 1 because 1 << 31 overflows a signed int,
+       * which is undefined behaviour in C. */
+      *res = -2147483647 - 1;
+      return;
+   }
+   result = a[0];
+   for(i = 1; i < n; ++i)
+   {
+      result = result < a[i] ? a[i] : result;
+   }
+   *res = result;
+}
\ No newline at end of file
diff --git a/documentation/tutorial_fpl_2022/04-axi/Exercise3/test_maxNumbers.xml b/documentation/tutorial_fpl_2022/04-axi/Exercise3/test_maxNumbers.xml
new file mode 100644
index 000000000..f254d9d30
--- /dev/null
+++ b/documentation/tutorial_fpl_2022/04-axi/Exercise3/test_maxNumbers.xml
@@ -0,0 +1,5 @@
\ No newline at end of file
diff --git a/documentation/tutorial_fpl_2022/README.md b/documentation/tutorial_fpl_2022/README.md
new file mode 100644
index 000000000..a93e6861d
--- /dev/null
+++ b/documentation/tutorial_fpl_2022/README.md
@@ -0,0 +1 @@
+[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/ferrandi/PandA-bambu/blob/main/documentation/tutorial_fpl_2022/bambu.ipynb)
diff --git a/documentation/tutorial_fpl_2022/bambu.ipynb b/documentation/tutorial_fpl_2022/bambu.ipynb
new file mode 100644
index 000000000..3ddc0ca1d
--- /dev/null
+++ b/documentation/tutorial_fpl_2022/bambu.ipynb
@@ -0,0 +1,1142 @@
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "V_2WzZoHkY8D"
+ },
+ "source": [
+ "# **Initial setup**\n",
+ "\n",
+ "Install Bambu and required packages:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "fM6c5RAukY8J"
+ },
+ "outputs": [],
+ "source": [
+ "!echo \"deb http://ppa.launchpad.net/git-core/ppa/ubuntu $(cat /etc/os-release | grep UBUNTU_CODENAME | sed 's/.*=//g') main\" >> /etc/apt/sources.list.d/git-core.list\n",
+ "!apt-key adv --keyserver keyserver.ubuntu.com --recv-keys A1715D88E1DF1F24\n",
+ "!apt-get update\n",
+ "!apt-get install -y --no-install-recommends build-essential ca-certificates gcc-multilib git iverilog verilator wget\n",
+ "!wget https://release.bambuhls.eu/appimage/bambu-fpl22.AppImage\n",
+ "!chmod +x bambu-*.AppImage\n",
+ "!ln -sf $PWD/bambu-*.AppImage /bin/bambu\n",
+ "!ln -sf $PWD/bambu-*.AppImage /bin/spider\n",
+ "!ln -sf $PWD/bambu-*.AppImage /bin/tree-panda-gcc\n",
+ "!ln -sf $PWD/bambu-*.AppImage /bin/clang-12\n",
+ "!git clone --depth 1 --filter=blob:none --sparse https://github.com/ferrandi/PandA-bambu.git\n",
+ "%cd PandA-bambu\n",
+ "!git sparse-checkout set documentation/tutorial_fpl_2022 \n",
+ "%cd ..\n",
+ "!mv PandA-bambu/documentation/tutorial_fpl_2022/ bambu-tutorial"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "1K0z3rWNkY8N"
+ },
+ "source": [
+ "# **Productive HLS with Bambu**\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "4mnSaH9BkY8N"
+ },
+ "source": [
+ "## **Exercise 1**\n",
+ "\n",
+ "Have a look at the C code in /content/bambu-tutorial/01-introduction/Exercise1/icrc.c\n",
+ "\n",
+ "Launch bambu:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "8qyCr3u4kY8P"
+ },
+ "outputs": [],
+ "source": [
+ "%cd /content/bambu-tutorial/01-introduction/Exercise1\n",
+ "!bambu icrc.c --top-fname=icrc1"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "m6Gxt2YLkY8Q"
+ },
+ "source": [
+ "Inspect the generated Verilog file in the explorer tab on the left (icrc1.v)\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "uuKpPqQLkY8R"
+ },
+ "source": [
+ "Take a brief look at the available Bambu options:\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "kq5_j96FkY8S"
+ },
+ "outputs": [],
+ "source": [
+ "!bambu --help"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "ewJCDjrpkY8S"
+ },
+ "source": [
+ "Modify the command line to change the amount of debug information displayed, and generate VHDL instead of Verilog code:\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "41_XU34tkY8T"
+ },
+ "outputs": [],
+ "source": [
+ "!bambu icrc.c --top-fname=icrc1 -wH"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "Ll1qN_GfkY8U"
+ },
+ "source": [
+ "## **Exercise 2**\n",
+ "\n",
+ "We remain on the same input C code as before, let's add co-simulation:\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "mQetLrjPkY8V"
+ },
+ "outputs": [],
+ "source": [
+ "%cd /content/bambu-tutorial/01-introduction/Exercise1\n",
+ "!bambu icrc.c --top-fname=icrc1 --simulate --simulator=VERILATOR"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "OcEj7Mv3kY8V"
+ },
+ "source": [
+ "We did not specify any input values. Inspect what Bambu generated automatically:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "ZjdchGrHkY8W"
+ },
+ "outputs": [],
+ "source": [
+ "!cat test.xml"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "g4mAX0LzkY8W"
+ },
+ "source": [
+ "You can find the actual testbench in HLS_output/simulation."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "qVqDccKlkY8X"
+ },
+ "source": [
+ "## **Exercise 3**\n",
+ "\n",
+ "Implement and synthesize a module that returns the minimum and maximum value in an array of integers with arbitrary size.\n",
+ "\n",
+ "Write the input C code starting from this snippet:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "zV1oZ1RfkY8X"
+ },
+ "outputs": [],
+ "source": [
+ "%%writefile /content/bambu-tutorial/01-introduction/Exercise2/minmax.c\n",
+ "void min_max(int input[10], int* out_max)\n",
+ "{\n",
+ " int local_max = input[0];\n",
+ " int i = 0;\n",
+ " for(i = 0; i < 10; i++)\n",
+ " {\n",
+ " if(input[i] > local_max)\n",
+ " {\n",
+ " local_max = input[i];\n",
+ " }\n",
+ " }\n",
+ " *out_max = local_max;\n",
+ "}"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "gG5f1mdzkY8Y"
+ },
+ "source": [
+ "Write a testbench to test arrays with different elements and different sizes.\n",
+ "\n",
+ "Start from the XML snippet below **(parameter names need to correspond to function arguments in your code)**:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "gs5Hch82kY8Y"
+ },
+ "outputs": [],
+ "source": [
+ "%%writefile /content/bambu-tutorial/01-introduction/Exercise2/testbench.xml\n",
+ "\n",
+ "\n",
+ " \n",
+ " \n",
+ ""
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "c-RVspt3kY8Z"
+ },
+ "source": [
+ "Synthesize with Bambu and simulate with Verilator **(double check the command line if you changed file/function names)**:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "f_T0r66AkY8Z"
+ },
+ "outputs": [],
+ "source": [
+ "%cd /content/bambu-tutorial/01-introduction/Exercise2/\n",
+ "!bambu minmax.c --top-fname=min_max --generate-tb=testbench.xml --simulate --simulator=VERILATOR"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "UzvbptSYkY8a"
+ },
+ "source": [
+ "What happens if you pass an array with a different number of elements than what is specified in num_elements? **(remember to fix the XML file afterwards, we will need it again)**"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "r1Bp4LYckY8b"
+ },
+ "source": [
+ "## **Exercise 4**\n",
+ "\n",
+ "Bambu can synthesize accelerators described in an LLVM IR through the Clang frontend.\n",
+ "\n",
+ "Synthesize /content/bambu-tutorial/01-introduction/Exercise3/matmul.ll, which contains a matrix multiplication kernel generated by [soda-opt](https://gitlab.pnnl.gov/sodalite/soda-opt):"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "rDZvVlgVkY8b"
+ },
+ "outputs": [],
+ "source": [
+ "%cd /content/bambu-tutorial/01-introduction/Exercise3/\n",
+ "!bambu matmul.ll --top-fname=main_kernel --generate-tb=test.xml --simulate --simulator=VERILATOR --compiler=I386_CLANG12"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "w1GgyDUfkY8b"
+ },
+ "source": [
+ "Note: kernels generated by soda-opt require at least Clang 10."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "dD2n8XFRkY8c"
+ },
+ "source": [
+ "## **Exercise 5**\n",
+ "\n",
+ "Let's go back to the C code that finds the minimum and maximum in an array of numbers, and compare performance across different target platforms and frequencies.\n",
+ "\n",
+ "Start from the given command and modify the options appropriately to test the following combinations:\n",
+ "\n",
+ "\n",
+ "* nx1h140tsp (NG-LARGE) – 66MHz\n",
+ "* nx1h35S (NG-MEDIUM) - 50MHz\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "DGB_zYb0kY8c"
+ },
+ "outputs": [],
+ "source": [
+ "%cd /content/bambu-tutorial/01-introduction/Exercise2\n",
+ "!bambu minmax.c --top-fname=min_max --device-name=nx1h35S --clock-period=20 --no-iob --simulate --simulator=VERILATOR --generate-tb=testbench.xml"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "tvRAqSe1kY8c"
+ },
+ "source": [
+ "Look also at the different simulation and synthesis scripts generated by Bambu."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "cLgC0cxVkY8c"
+ },
+ "source": [
+ "## **Exercise 6**\n",
+ "\n",
+ "Ask Bambu to print a C version of its internal IR and all relevant graphs:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "KC56EVOrkY8d"
+ },
+ "outputs": [],
+ "source": [
+ "!bambu minmax.c --top-fname=min_max --pretty-print=out.c --print-dot"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "aMVwFt8okY8d"
+ },
+ "source": [
+ "Look at /content/bambu-tutorial/01-introduction/Exercise2/out.c and then print the FSM graph:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "g32enpA0kY8d"
+ },
+ "outputs": [],
+ "source": [
+ "from graphviz import Source\n",
+ "Source.from_file('HLS_output/dot/min_max/fsm.dot')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "FlSS4Ynj27Gb"
+ },
+ "outputs": [],
+ "source": [
+ "from graphviz import Source\n",
+ "Source.from_file('HLS_output/dot/min_max/HLS_STGraph.dot')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "dLZBqaNqwdcU"
+ },
+ "source": [
+ "## **Other examples**\n",
+ "\n",
+ "You can play around with a set of other examples that contain different applications and showcase different features of Bambu.\n",
+ "\n",
+ " - /content/bambu-tutorial/01-introduction/Exercise4: Function Proxy\n",
+ " - /content/bambu-tutorial/04-simd/Exercise1: SIMD optimization\n",
+ " - /content/bambu-tutorial/01-introduction/Exercise5: LU decomposition\n",
+ " - /content/bambu-tutorial/01-introduction/Exercise6: integration of IPs written in Verilog\n",
+ " - /content/bambu-tutorial/01-introduction/Exercise7: sorting algorithm\n",
+ " - /content/bambu-tutorial/01-introduction/Exercise8: cryptographic core\n",
+ " - /content/bambu-tutorial/01-introduction/Exercise9: search and insertion in a binary tree\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "IlQDB6nqHqz0"
+ },
+ "source": [
+ "# **Optimizations**"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "dWWTy4TGZLfk"
+ },
+ "source": [
+ "## **Exercise 1** \n",
+ "\n",
+ "Modify Bambu options to evaluate the effect of:\n",
+ "\n",
+ "\n",
+ "* different levels of optimization (-O0, -O1, -O2, -O3, -Os)\n",
+ "* vectorization (-ftree-vectorize)\n",
+ "* inlining (-finline-limit=100000)\n",
+ "* different frontend compilers (--compiler={I386_GCC49|I386_GCC7|I386_CLANG6|I386_CLANG12})\n",
+ "\n",
+ "#### **ADPCM from CHStone benchmark suite**\n",
+ "Adaptive Differential Pulse-Code Modulation is an algorithm used to perform audio compression (mainly in telephony). It is part of the CHStone benchmark suite for C-based HLS tools.\n",
+ "* Yuko Hara, Hiroyuki Tomiyama, Shinya Honda and Hiroaki Takada, \"Proposal and Quantitative Analysis of the CHStone Benchmark Program Suite for Practical C-based High-level Synthesis\", *Journal of Information Processing*, Vol. 17, pp.242-254, (2009)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "YOXiFqzSIDR9"
+ },
+ "outputs": [],
+ "source": [
+ "%cd /content/bambu-tutorial/03-optimizations/Exercise1/\n",
+ "!bambu adpcm.c -O0 --simulate"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "mJOV50V4IiE2"
+ },
+ "source": [
+ "## **Exercise 2** \n",
+ "\n",
+ "Use the command that yielded the best result in Exercise 1 and verify if SDC scheduling can introduce further improvements.\n",
+ "\n",
+ "* -s or --speculative-sdc-scheduling"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "Qi_ZpIr1IzZ7"
+ },
+ "outputs": [],
+ "source": [
+ "%cd /content/bambu-tutorial/03-optimizations/Exercise1/\n",
+ "!bambu adpcm.c -O0 --simulate"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "StGBkKaJJEfr"
+ },
+ "source": [
+ "## **Exercise 3**\n",
+ "\n",
+ "Modify Bambu options to evaluate the effect of different integer division implementations.\n",
+ "\n",
+ "--hls-div=\n",
+ "* none - use a HDL based pipelined restoring division\n",
+ "* nr1 - use a C-based non-restoring division with unrolling factor equal to 1 (default)\n",
+ "* nr2 - use a C-based non-restoring division with unrolling factor equal to 2\n",
+ "* NR - use a C-based Newton-Raphson division\n",
+ "* as - use a C-based align divisor shift dividend method\n",
+ "\n",
+ "#### **FPDiv from CHStone**\n",
+ "Soft floating-point division implementation from the CHStone benchmark suite for C-based HLS.\n",
+ "* Yuko Hara, Hiroyuki Tomiyama, Shinya Honda and Hiroaki Takada, \"Proposal and Quantitative Analysis of the CHStone Benchmark Program Suite for Practical C-based High-level Synthesis\", *Journal of Information Processing*, Vol. 17, pp.242-254, (2009).\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "bnEJ4nwuJLfo"
+ },
+ "outputs": [],
+ "source": [
+ "%cd /content/bambu-tutorial/03-optimizations/Exercise3/\n",
+ "!bambu dfdiv.c --simulate --clock-period=15 --hls-div=none"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "elFjSeb2oq9s"
+ },
+ "source": [
+ "## **Exercise 4**\n",
+ "Not all target devices may expose support for the synthesis of the whole set of Bambu IR instructions. If this is the case, Bambu takes care of replacing unsupported IR instructions with function calls which implement the same functionality using supported functional units only.\n",
+ "\n",
+ "As an example, NanoXplore nx1h140tsp FPGA board does not support 64x64 integer multiplication, thus Bambu will replace this simple IR instruction with a function call to a different multiplier implementation which exploits smaller multipliers to build the final result."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "Q21rEJY2o-i3"
+ },
+ "outputs": [],
+ "source": [
+ "%%writefile /content/bambu-tutorial/03-optimizations/Exercise4/module.c\n",
+ "long long func_replace(long long a, long long b)\n",
+ "{\n",
+ " return a * b;\n",
+ "}"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "YEYtnEJe_FS0"
+ },
+ "source": [
+ "Make sure you run the above cell before launching Bambu in the following.\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "oL1QqXero2pR"
+ },
+ "outputs": [],
+ "source": [
+ "%cd /content/bambu-tutorial/03-optimizations/Exercise4/\n",
+ "!bambu module.c --top-fname=func_replace --device-name=nx1h140tsp --generate-tb=a=4,b=5 --simulate --panda-parameter=function-opt=0 --print-dot"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "UjV9dfwz_O9e"
+ },
+ "source": [
+ "Once the synthesis has completed it is possible to have a look at the initial and final call graph. In the former no function calls are present, while in the latter a __umul64 function call has been added. This function implements the 64x64 integer multiplication by means of smaller 32x32 multipliers."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "Jgv1TnykqT2V"
+ },
+ "outputs": [],
+ "source": [
+ "from graphviz import Source\n",
+ "Source.from_file('HLS_output/dot/call_graph.dot')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "bSBnpIUKqV_6"
+ },
+ "outputs": [],
+ "source": [
+ "from graphviz import Source\n",
+ "Source.from_file('HLS_output/dot/call_graph_final.dot')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "wiVlnuPy_wQO"
+ },
+ "source": [
+ "Also it is possible to have a look at the __umul64 FSM graph to see how the 64x64 multiplication logic is implemented."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "g9i03s7Uqa1w"
+ },
+ "outputs": [],
+ "source": [
+ "from graphviz import Source\n",
+ "Source.from_file('HLS_output/dot/__umul64/fsm.dot')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "YSRwNv1o2Jqx"
+ },
+ "source": [
+ "## **Exercise 5** \n",
+ "Bambu provides complete support for floating-point arithmetic and all libm functions.\n",
+ "In the following you can define any arbitrary floating-point computation and take a look at the generated design structure.\n",
+ "\n",
+ "As an example, try to write a C implementation that computes the following:\n",
+ "\n",
+ "# $awesome\\_math(a,b,c) = acos(\\frac{a^2+b^2-c^2}{2ab})$\n",
+ "\n",
+ "Experiment with single and double precision data types, different softfloat and libm implementations offered by bambu.\n",
+ "\n",
+ "Start by editing this code and then try different bambu options:\n",
+ "* Different floating-point arithmetic implementations (--softfloat, --soft-fp, --flopoco)\n",
+ "* Different libm implementations (--libm-std-rounding)\n",
+ "* Different square implementation (pow, simple multiplication)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "LuhiArbj6XnA"
+ },
+ "outputs": [],
+ "source": [
+ "%%writefile /content/bambu-tutorial/03-optimizations/Exercise5/module.c\n",
+ "#include <math.h>\n",
+ "float awesome_math(float a, float b, float c)\n",
+ "{\n",
+ " return a * b + c;\n",
+ "}"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "4dstvQzcCt9z"
+ },
+ "source": [
+ "Make sure you run the above cell after you write the C implementation inside, so that the file is updated, then launch Bambu to perform the synthesis."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "_Rg4Gthy2vDm"
+ },
+ "outputs": [],
+ "source": [
+ "%cd /content/bambu-tutorial/03-optimizations/Exercise5/\n",
+ "!bambu module.c -O3 -lm --simulate --top-fname=awesome_math --generate-tb=\"a=3.0,b=4.0,c=5.0\" --panda-parameter=function-opt=0 --print-dot"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "Rlg6keEyDARF"
+ },
+ "source": [
+ "After the synthesis has completed it is possible to observe how the floating-point operations have been converted to function calls to the internal Bambu arithmetic cores and libm implementation."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "r9Lplti4nD-H"
+ },
+ "outputs": [],
+ "source": [
+ "from graphviz import Source\n",
+ "Source.from_file('HLS_output/dot/call_graph_final.dot')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "wUcQsiLvn_nF"
+ },
+ "outputs": [],
+ "source": [
+ "from graphviz import Source\n",
+ "Source.from_file('HLS_output/dot/__float_adde8m23b_127nih/fsm.dot')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "xoE9JUqsDa4Z"
+ },
+ "source": [
+ "## **Exercise 6**\n",
+ "Further support for custom floating-point arithmetic is also available in Bambu.\n",
+ "\n",
+ "The user may define custom floating-point types through the command line API and the HLS engine will take care of the conversions and implementation starting from standard float/double types defined in the C implementation.\n",
+ "\n",
+ "```\n",
+ "--fp-subnormal\n",
+ " Enable the soft-based implementation of floating-point operations with\n",
+ " subnormals support.\n",
+ "\n",
+ "--fp-exception-mode=\n",
+ " Set the soft-based exception handling mode:\n",
+ " ieee - IEEE754 standard exceptions (default)\n",
+ " saturation - Inf is replaced with max value, Nan becomes undefined behaviour\n",
+ " overflow - Inf and Nan results in undefined behaviour\n",
+ "\n",
+ "--fp-rounding-mode=\n",
+ " Set the soft-based rounding handling mode:\n",
+ " nearest_even - IEEE754 standard rounding mode (default)\n",
+ " truncate - No rounding is applied\n",
+ "\n",
+ "--fp-format=<func_name>*e<exp_bits>m<frac_bits>b<exp_bias><rnd_mode><exc_mode><?spec><?sign>\n",
+ " Define arbitrary precision floating-point format by function (use comma separated\n",
+ " list for multiple definitions). (i.e.: e8m23b-127nihs represent IEEE754 single precision FP)\n",
+ " func_name - Set arbitrary floating-point format for a specific function (using\n",
+ " @ symbol here will resolve to the top function)\n",
+ " (Arbitrary floating-point format will apply to specified function\n",
+ " only, use --propagate-fp-format to extend it to called functions)\n",
+ " exp_bits - Number of bits used by the exponent\n",
+ " frac_bits - Number of bits used by the fractional value\n",
+ " exp_bias - Bias applied to the unsigned value represented by the exponent bits\n",
+ " rnd_mode - Rounding mode (exclusive option):\n",
+ " n - nearest_even: IEEE754 standard rounding mode\n",
+ " t - truncate : no rounding is applied\n",
+ " exc_mode - Exception mode (exclusive option):\n",
+ " i - ieee : IEEE754 standard exceptions\n",
+ " a - saturation: Inf is replaced with max value, Nan becomes undefined behaviour\n",
+ " o - overflow : Inf and Nan results in undefined behaviour\n",
+ " spec - Floating-point specialization string (multiple choice):\n",
+ " h - hidden one: IEEE754 standard representation with hidden one\n",
+ " s - subnormals: IEEE754 subnormal numbers\n",
+ " sign - Static sign representation (exclusive option):\n",
+ " - IEEE754 dynamic sign is used if omitted\n",
+ " 1 - all values are considered as negative numbers\n",
+ " 0 - all values are considered as positive numbers\n",
+ "\n",
+ "--fp-format=inline-math\n",
+ " The \"inline-math\" flag may be added to fp-format option to force floating-point\n",
+ " arithmetic operators always inline policy\n",
+ "\n",
+ "--fp-format=inline-conversion\n",
+ " The \"inline-conversion\" flag may be added to fp-format option to force floating-point\n",
+ " conversion operators always inline policy\n",
+ "\n",
+ "--fp-format-interface\n",
+ " User-defined floating-point format is applied to top interface signature if required\n",
+ " (default modifies top function body only)\n",
+ "\n",
+ "--fp-format-propagate\n",
+ " Propagate user-defined floating-point format to called function when possible\n",
+ "```"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "6JnEKX_4EFJ2"
+ },
+ "outputs": [],
+ "source": [
+ "%%writefile /content/bambu-tutorial/03-optimizations/Exercise6/module.c\n",
+ "float user_fp(float a, float b, float c)\n",
+ "{\n",
+ " return a * b + c;\n",
+ "}"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "tmQE2zMsEOps"
+ },
+ "source": [
+ "As an example, the above code may be synthesized as is, with standard single-precision floating-point arithmetic, or with float replaced by a half-precision encoding (i.e., a 16-bit floating-point format) using the following command."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "qGJlenxDEpOA"
+ },
+ "outputs": [],
+ "source": [
+ "%cd /content/bambu-tutorial/03-optimizations/Exercise6/\n",
+ "!bambu module.c -O3 -lm --simulate --top-fname=user_fp --fp-format=user_fp*e5m10b-16nih --fp-format-interface --generate-tb=\"a=3.0,b=4.0,c=5.0\" --print-dot"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "KtnrCuAbFsdw"
+ },
+ "outputs": [],
+ "source": [
+ "from graphviz import Source\n",
+ "Source.from_file('HLS_output/dot/call_graph_final.dot')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "W7Xm7x00kY8k"
+ },
+ "source": [
+ "# AXI"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "astRZQCPkY8k"
+ },
+ "source": [
+ "## **Exercise 1**\n",
+ "Start by writing a C function called read that simply reads a number from an AXI bus and returns the value that is retrieved from the bus.\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "8WWTClIgkY8k"
+ },
+ "outputs": [],
+ "source": [
+ "%%writefile /content/bambu-tutorial/04-axi/Exercise1/module.c\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "k6gr5V0OkY8l"
+ },
+ "source": [
+ "Now add the interface inference flag (`--generate-interface=INFER`) to the bambu command and execute it."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "DPtMp3i9kY8l"
+ },
+ "outputs": [],
+ "source": [
+ "%cd /content/bambu-tutorial/04-axi/Exercise1/\n",
+ "!bambu module.c --top-fname=read --compiler=I386_CLANG6"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "vlliLe-skY8l"
+ },
+ "source": [
+ "Open the generated Verilog file and look for the top module, called read. Notice the presence of the AXI signals and how their size matches the size of the data.\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "E2Skz8FskY8m"
+ },
+ "outputs": [],
+ "source": [
+ "%cd /content/bambu-tutorial/04-axi/Exercise1/\n",
+ "!cat read.v"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "_bARuIvZkY8m"
+ },
+ "source": [
+ "Finally, launch the simulation and check that everything works properly."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "F35QAjj-kY8m"
+ },
+ "outputs": [],
+ "source": [
+ "%cd /content/bambu-tutorial/04-axi/Exercise1/\n",
+ "!bambu module.c --top-fname=read --compiler=I386_CLANG6 --generate-interface=INFER --generate-tb=\"data={96}\" --simulator=VERILATOR --simulate -v4"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "C_GmMHuokY8m"
+ },
+ "source": [
+ "## **Exercise 2**"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "RQWFRIg4kY8n"
+ },
+ "source": [
+ "Consider the following code, which adds up all n elements of a vector v. Edit the code so that both the number of elements and the elements of the vector are read from an external memory through an AXI bus."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "4QUMMgpnkY8n"
+ },
+ "outputs": [],
+ "source": [
+ "%%writefile /content/bambu-tutorial/04-axi/Exercise2/module.c\n",
+ "\n",
+ "int sum(int* v, unsigned* n)\n",
+ "{\n",
+ " int sum = 0;\n",
+ "\n",
+ " for(unsigned i = 0; i < *(n); i++)\n",
+ " {\n",
+ " sum += v[i];\n",
+ " }\n",
+ "\n",
+ " return sum;\n",
+ "}"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Let's also write a test file."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "%%writefile /content/bambu-tutorial/04-axi/Exercise2/test.xml\n",
+ "\n",
+ "\n",
+ " \n",
+ ""
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "XFRxYuA5kY8n"
+ },
+ "source": [
+ "Launch bambu and simulate the execution."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "j6_SFQLbkY8n"
+ },
+ "outputs": [],
+ "source": [
+ "%cd /content/bambu-tutorial/04-axi/Exercise2/\n",
+ "!bambu module.c --top-fname=sum --compiler=I386_CLANG6 --generate-interface=INFER --generate-tb=test.xml --simulator=VERILATOR --simulate -v4"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "X9fgghWdkY8n"
+ },
+ "source": [
+ "## **Exercise 3**"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "QbhpuMj1kY8n"
+ },
+ "source": [
+ "Let's consider the following code, that computes the maximum among the elements of a vector. We want to read the number of elements of the vector and its data from an AXI bus, however, instead of returning the result, we then want to write the result to an external memory available over a different AXI bus. In order for bambu to generate the module according to our needs, we will need to provide additional information through \"bundle\", an optional parameter of the pragma directive.\n",
+ "With the addition of the optional parameter, the directive becomes:\n",
+ "\n",
+ "`#pragma HLS_interface <variable_name> m_axi direct bundle=<bundle_name>`\n",
+ "\n",
+ "By associating different variables to the same bundle name, we are telling bambu that they will use the same bus. When different names are used, bambu will generate a bus for each bundle.\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "hay4ndWJkY8o"
+ },
+ "outputs": [],
+ "source": [
+ "%%writefile /content/bambu-tutorial/04-axi/Exercise3/module.c\n",
+ "\n",
+ "void maxNumbers(int* a, unsigned int* n_ptr, int* res)\n",
+ "\n",
+ "{\n",
+ " unsigned i;\n",
+ " int result;\n",
+ " unsigned int n = *n_ptr;\n",
+ "\n",
+ " if(n == 0)\n",
+ " {\n",
+ " *res = (int)(1 << 31);\n",
+ " return;\n",
+ " }\n",
+ " result = a[0];\n",
+ " for(i = 1; i < n; ++i)\n",
+ " result = result < a[i] ? a[i] : result;\n",
+ " *res = result;\n",
+ "}"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "%%writefile /content/bambu-tutorial/04-axi/Exercise3/test.xml\n",
+ "\n",
+ "\n",
+ " \n",
+ ""
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "3Z5AxO4jkY8o"
+ },
+ "source": [
+ "Once again, we can run bambu with the same command and perform a simulation."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "3vtw_MkckY8o"
+ },
+ "outputs": [],
+ "source": [
+ "%cd /content/bambu-tutorial/04-axi/Exercise3/\n",
+ "!bambu module.c --top-fname=maxNumbers --compiler=I386_CLANG6 --generate-interface=INFER --generate-tb=test.xml --simulator=VERILATOR --simulate -v4"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "MRgKzu0VkY8o"
+ },
+ "source": [
+ "If we open the module definition, we can actually check that two AXI buses are defined and used."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "bld4NJSlkY8o"
+ },
+ "outputs": [],
+ "source": [
+ "%cd /content/bambu-tutorial/04-axi/Exercise3/\n",
+ "!cat maxNumbers.v"
+ ]
+ }
+ ],
+ "metadata": {
+ "colab": {
+ "collapsed_sections": [],
+ "provenance": []
+ },
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.4"
+ },
+ "vscode": {
+ "interpreter": {
+ "hash": "916dbcbb3f70747c44a77c7bcd40155683ae19c65e1c03b4aa3499c5328201f1"
+ }
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0