/*
 * NOTE(review): this span is the start of a git patch whose line breaks were
 * lost. The two C sources it carries are restored below as real code; the
 * patch headers and the shell script that shared the span are kept in
 * comments (line breaks restored) so that no text is dropped.
 *
 * diff --git a/documentation/tutorial_fpl_2022/01-introduction/Exercise1/bambu.sh b/documentation/tutorial_fpl_2022/01-introduction/Exercise1/bambu.sh
 * new file mode 100755
 * index 000000000..34be96f47
 * --- /dev/null
 * +++ b/documentation/tutorial_fpl_2022/01-introduction/Exercise1/bambu.sh
 * @@ -0,0 +1,11 @@
 * +#!/bin/bash
 * +script=$(readlink -e $0)
 * +root_dir=$(dirname $script)
 * +
 * +rm -rf icrc1
 * +mkdir -p icrc1
 * +cd icrc1
 * +echo "#synthesis of icrc1"
 * +bambu ../icrc.c --top-fname=icrc1 [line continuation]
 * +   --generate-tb=../test_icrc1.xml --simulator=VERILATOR --simulate [line continuation]
 * +   -v2 --print-dot --pretty-print=a.c "$@" |& tee icrc1.log
 * [patch marker: No newline at end of file]
 *
 * diff --git a/documentation/tutorial_fpl_2022/01-introduction/Exercise1/icrc.c b/documentation/tutorial_fpl_2022/01-introduction/Exercise1/icrc.c
 * new file mode 100644
 * index 000000000..8852b50a1
 * --- /dev/null
 * +++ b/documentation/tutorial_fpl_2022/01-introduction/Exercise1/icrc.c
 * @@ -0,0 +1,14 @@
 */

/*
 * Advance a 16-bit CRC by one input byte.
 *
 * The byte is XORed into the high half of `crc`, then eight shift steps are
 * performed, most-significant bit first: whenever bit 15 is set before the
 * shift, the shifted value is XORed with 4129 (0x1021).
 *
 * crc    running CRC value before this byte
 * onech  byte to fold in
 * return the updated 16-bit CRC
 */
unsigned short icrc1(unsigned short crc, unsigned char onech)
{
   int i;
   /* `<<` binds tighter than `^`; the parentheses only make that explicit. */
   unsigned short ans = (unsigned short)(crc ^ (onech << 8));

   for (i = 0; i < 8; i++) {
      if (ans & 0x8000) {
         /*
          * The shift is computed as a plain expression: the previous form,
          * `ans = (ans <<= 1) ^ 4129`, wrote `ans` twice without sequencing,
          * which is undefined behaviour in C. The cast drops the bit that
          * was shifted out of the 16-bit value.
          */
         ans = (unsigned short)((ans << 1) ^ 4129);
      } else {
         ans = (unsigned short)(ans << 1);
      }
   }
   return ans;
}

/*
 * diff --git a/documentation/tutorial_fpl_2022/01-introduction/Exercise2/solution/minmax.c b/documentation/tutorial_fpl_2022/01-introduction/Exercise2/solution/minmax.c
 * new file mode 100644
 * index 000000000..2058b7576
 * --- /dev/null
 * +++ b/documentation/tutorial_fpl_2022/01-introduction/Exercise2/solution/minmax.c
 * @@ -0,0 +1,19 @@
 */

/*
 * Find the largest and smallest value in input[0 .. num_elements-1].
 *
 * input         array to scan; input[0] is read unconditionally, so the
 *               caller must supply at least one element
 * num_elements  number of elements to examine
 * max           receives the largest value seen
 * min           receives the smallest value seen
 */
void min_max(int * input, int num_elements, int * max, int * min)
{
   int local_max = input[0];
   int local_min = input[0];
   int i = 0;
   for(i = 0; i < num_elements; i++)
   {
      if(input[i] > local_max)
      {
         local_max = input[i];
      }
      /* A value above the current maximum cannot also be below the minimum,
         so the second comparison is only needed on the else path. */
      else if(input[i] < local_min)
      {
         local_min = input[i];
      }
   }
   *min = local_min;
   *max = local_max;
}

/*
 * Start of the next patch header; its remainder follows this span:
 * diff --git a/documentation/tutorial_fpl_2022/01-introduction/Exercise2/solution/synthesize.sh
 */
b/documentation/tutorial_fpl_2022/01-introduction/Exercise2/solution/synthesize.sh new file mode 100755 index 000000000..48f30b583 --- /dev/null +++ b/documentation/tutorial_fpl_2022/01-introduction/Exercise2/solution/synthesize.sh @@ -0,0 +1,2 @@ +#!/bin/bash +bambu minmax.c --generate-tb=testbench.xml --simulate "$@" |& tee log.txt diff --git a/documentation/tutorial_fpl_2022/01-introduction/Exercise2/solution/testbench.xml b/documentation/tutorial_fpl_2022/01-introduction/Exercise2/solution/testbench.xml new file mode 100644 index 000000000..3781cfbc4 --- /dev/null +++ b/documentation/tutorial_fpl_2022/01-introduction/Exercise2/solution/testbench.xml @@ -0,0 +1,7 @@ + + + + + + + diff --git a/documentation/tutorial_fpl_2022/01-introduction/Exercise3/bambu.sh b/documentation/tutorial_fpl_2022/01-introduction/Exercise3/bambu.sh new file mode 100644 index 000000000..d8574c1d5 --- /dev/null +++ b/documentation/tutorial_fpl_2022/01-introduction/Exercise3/bambu.sh @@ -0,0 +1,2 @@ +#!/bin/bash +bambu matmul.ll --top-fname=main_kernel --generate-tb=test.xml --simulate --simulator=VERILATOR --compiler=I386_CLANG12 "$@" |& tee log.txt diff --git a/documentation/tutorial_fpl_2022/01-introduction/Exercise3/matmul.ll b/documentation/tutorial_fpl_2022/01-introduction/Exercise3/matmul.ll new file mode 100644 index 000000000..b23ebcf11 --- /dev/null +++ b/documentation/tutorial_fpl_2022/01-introduction/Exercise3/matmul.ll @@ -0,0 +1,637 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" + +declare i8* @malloc(i64) + +declare void @free(i8*) + +define void @main_kernel(float* noalias %0, float* noalias %1, float* noalias %2) !dbg !3 { + %4 = alloca float, i64 ptrtoint (float* getelementptr (float, float* null, i64 16) to i64), align 4, !dbg !7 + %5 = getelementptr float, float* %0, i64 0, !dbg !9 + %6 = load float, float* %5, align 4, !dbg !10 + %7 = getelementptr float, float* %4, i64 0, !dbg !11 + store float %6, float* %7, align 4, !dbg !12 + %8 = 
getelementptr float, float* %0, i64 1, !dbg !13 + %9 = load float, float* %8, align 4, !dbg !14 + %10 = getelementptr float, float* %4, i64 1, !dbg !15 + store float %9, float* %10, align 4, !dbg !16 + %11 = getelementptr float, float* %0, i64 2, !dbg !17 + %12 = load float, float* %11, align 4, !dbg !18 + %13 = getelementptr float, float* %4, i64 2, !dbg !19 + store float %12, float* %13, align 4, !dbg !20 + %14 = getelementptr float, float* %0, i64 3, !dbg !21 + %15 = load float, float* %14, align 4, !dbg !22 + %16 = getelementptr float, float* %4, i64 3, !dbg !23 + store float %15, float* %16, align 4, !dbg !24 + %17 = getelementptr float, float* %0, i64 4, !dbg !25 + %18 = load float, float* %17, align 4, !dbg !26 + %19 = getelementptr float, float* %4, i64 4, !dbg !27 + store float %18, float* %19, align 4, !dbg !28 + %20 = getelementptr float, float* %0, i64 5, !dbg !29 + %21 = load float, float* %20, align 4, !dbg !30 + %22 = getelementptr float, float* %4, i64 5, !dbg !31 + store float %21, float* %22, align 4, !dbg !32 + %23 = getelementptr float, float* %0, i64 6, !dbg !33 + %24 = load float, float* %23, align 4, !dbg !34 + %25 = getelementptr float, float* %4, i64 6, !dbg !35 + store float %24, float* %25, align 4, !dbg !36 + %26 = getelementptr float, float* %0, i64 7, !dbg !37 + %27 = load float, float* %26, align 4, !dbg !38 + %28 = getelementptr float, float* %4, i64 7, !dbg !39 + store float %27, float* %28, align 4, !dbg !40 + %29 = getelementptr float, float* %0, i64 8, !dbg !41 + %30 = load float, float* %29, align 4, !dbg !42 + %31 = getelementptr float, float* %4, i64 8, !dbg !43 + store float %30, float* %31, align 4, !dbg !44 + %32 = getelementptr float, float* %0, i64 9, !dbg !45 + %33 = load float, float* %32, align 4, !dbg !46 + %34 = getelementptr float, float* %4, i64 9, !dbg !47 + store float %33, float* %34, align 4, !dbg !48 + %35 = getelementptr float, float* %0, i64 10, !dbg !49 + %36 = load float, float* %35, align 4, !dbg !50 + 
%37 = getelementptr float, float* %4, i64 10, !dbg !51 + store float %36, float* %37, align 4, !dbg !52 + %38 = getelementptr float, float* %0, i64 11, !dbg !53 + %39 = load float, float* %38, align 4, !dbg !54 + %40 = getelementptr float, float* %4, i64 11, !dbg !55 + store float %39, float* %40, align 4, !dbg !56 + %41 = getelementptr float, float* %0, i64 12, !dbg !57 + %42 = load float, float* %41, align 4, !dbg !58 + %43 = getelementptr float, float* %4, i64 12, !dbg !59 + store float %42, float* %43, align 4, !dbg !60 + %44 = getelementptr float, float* %0, i64 13, !dbg !61 + %45 = load float, float* %44, align 4, !dbg !62 + %46 = getelementptr float, float* %4, i64 13, !dbg !63 + store float %45, float* %46, align 4, !dbg !64 + %47 = getelementptr float, float* %0, i64 14, !dbg !65 + %48 = load float, float* %47, align 4, !dbg !66 + %49 = getelementptr float, float* %4, i64 14, !dbg !67 + store float %48, float* %49, align 4, !dbg !68 + %50 = getelementptr float, float* %0, i64 15, !dbg !69 + %51 = load float, float* %50, align 4, !dbg !70 + %52 = getelementptr float, float* %4, i64 15, !dbg !71 + store float %51, float* %52, align 4, !dbg !72 + %53 = alloca float, i64 ptrtoint (float* getelementptr (float, float* null, i64 8) to i64), align 4, !dbg !73 + %54 = getelementptr float, float* %1, i64 0, !dbg !74 + %55 = load float, float* %54, align 4, !dbg !75 + %56 = getelementptr float, float* %53, i64 0, !dbg !76 + store float %55, float* %56, align 4, !dbg !77 + %57 = getelementptr float, float* %1, i64 1, !dbg !78 + %58 = load float, float* %57, align 4, !dbg !79 + %59 = getelementptr float, float* %53, i64 1, !dbg !80 + store float %58, float* %59, align 4, !dbg !81 + %60 = getelementptr float, float* %1, i64 2, !dbg !82 + %61 = load float, float* %60, align 4, !dbg !83 + %62 = getelementptr float, float* %53, i64 2, !dbg !84 + store float %61, float* %62, align 4, !dbg !85 + %63 = getelementptr float, float* %1, i64 3, !dbg !86 + %64 = load float, float* 
%63, align 4, !dbg !87 + %65 = getelementptr float, float* %53, i64 3, !dbg !88 + store float %64, float* %65, align 4, !dbg !89 + %66 = getelementptr float, float* %1, i64 4, !dbg !90 + %67 = load float, float* %66, align 4, !dbg !91 + %68 = getelementptr float, float* %53, i64 4, !dbg !92 + store float %67, float* %68, align 4, !dbg !93 + %69 = getelementptr float, float* %1, i64 5, !dbg !94 + %70 = load float, float* %69, align 4, !dbg !95 + %71 = getelementptr float, float* %53, i64 5, !dbg !96 + store float %70, float* %71, align 4, !dbg !97 + %72 = getelementptr float, float* %1, i64 6, !dbg !98 + %73 = load float, float* %72, align 4, !dbg !99 + %74 = getelementptr float, float* %53, i64 6, !dbg !100 + store float %73, float* %74, align 4, !dbg !101 + %75 = getelementptr float, float* %1, i64 7, !dbg !102 + %76 = load float, float* %75, align 4, !dbg !103 + %77 = getelementptr float, float* %53, i64 7, !dbg !104 + store float %76, float* %77, align 4, !dbg !105 + %78 = alloca float, i64 ptrtoint (float* getelementptr (float, float* null, i64 8) to i64), align 4, !dbg !106 + %79 = getelementptr float, float* %2, i64 0, !dbg !107 + %80 = load float, float* %79, align 4, !dbg !108 + %81 = getelementptr float, float* %78, i64 0, !dbg !109 + store float %80, float* %81, align 4, !dbg !110 + %82 = getelementptr float, float* %2, i64 1, !dbg !111 + %83 = load float, float* %82, align 4, !dbg !112 + %84 = getelementptr float, float* %78, i64 1, !dbg !113 + store float %83, float* %84, align 4, !dbg !114 + %85 = getelementptr float, float* %2, i64 2, !dbg !115 + %86 = load float, float* %85, align 4, !dbg !116 + %87 = getelementptr float, float* %78, i64 2, !dbg !117 + store float %86, float* %87, align 4, !dbg !118 + %88 = getelementptr float, float* %2, i64 3, !dbg !119 + %89 = load float, float* %88, align 4, !dbg !120 + %90 = getelementptr float, float* %78, i64 3, !dbg !121 + store float %89, float* %90, align 4, !dbg !122 + %91 = getelementptr float, float* %2, 
i64 4, !dbg !123 + %92 = load float, float* %91, align 4, !dbg !124 + %93 = getelementptr float, float* %78, i64 4, !dbg !125 + store float %92, float* %93, align 4, !dbg !126 + %94 = getelementptr float, float* %2, i64 5, !dbg !127 + %95 = load float, float* %94, align 4, !dbg !128 + %96 = getelementptr float, float* %78, i64 5, !dbg !129 + store float %95, float* %96, align 4, !dbg !130 + %97 = getelementptr float, float* %2, i64 6, !dbg !131 + %98 = load float, float* %97, align 4, !dbg !132 + %99 = getelementptr float, float* %78, i64 6, !dbg !133 + store float %98, float* %99, align 4, !dbg !134 + %100 = getelementptr float, float* %2, i64 7, !dbg !135 + %101 = load float, float* %100, align 4, !dbg !136 + %102 = getelementptr float, float* %78, i64 7, !dbg !137 + store float %101, float* %102, align 4, !dbg !138 + %103 = getelementptr float, float* %4, i64 0, !dbg !139 + %104 = load float, float* %103, align 4, !dbg !140 + %105 = getelementptr float, float* %53, i64 0, !dbg !141 + %106 = load float, float* %105, align 4, !dbg !142 + %107 = getelementptr float, float* %78, i64 0, !dbg !143 + %108 = load float, float* %107, align 4, !dbg !144 + %109 = fmul float %104, %106, !dbg !145 + %110 = fadd float %108, %109, !dbg !146 + %111 = getelementptr float, float* %4, i64 1, !dbg !147 + %112 = load float, float* %111, align 4, !dbg !148 + %113 = getelementptr float, float* %53, i64 2, !dbg !149 + %114 = load float, float* %113, align 4, !dbg !150 + %115 = fmul float %112, %114, !dbg !151 + %116 = fadd float %110, %115, !dbg !152 + %117 = getelementptr float, float* %4, i64 2, !dbg !153 + %118 = load float, float* %117, align 4, !dbg !154 + %119 = getelementptr float, float* %53, i64 4, !dbg !155 + %120 = load float, float* %119, align 4, !dbg !156 + %121 = fmul float %118, %120, !dbg !157 + %122 = fadd float %116, %121, !dbg !158 + %123 = getelementptr float, float* %4, i64 3, !dbg !159 + %124 = load float, float* %123, align 4, !dbg !160 + %125 = getelementptr 
float, float* %53, i64 6, !dbg !161 + %126 = load float, float* %125, align 4, !dbg !162 + %127 = fmul float %124, %126, !dbg !163 + %128 = fadd float %122, %127, !dbg !164 + %129 = getelementptr float, float* %78, i64 0, !dbg !165 + store float %128, float* %129, align 4, !dbg !166 + %130 = getelementptr float, float* %53, i64 1, !dbg !167 + %131 = load float, float* %130, align 4, !dbg !168 + %132 = getelementptr float, float* %78, i64 1, !dbg !169 + %133 = load float, float* %132, align 4, !dbg !170 + %134 = fmul float %104, %131, !dbg !171 + %135 = fadd float %133, %134, !dbg !172 + %136 = getelementptr float, float* %53, i64 3, !dbg !173 + %137 = load float, float* %136, align 4, !dbg !174 + %138 = fmul float %112, %137, !dbg !175 + %139 = fadd float %135, %138, !dbg !176 + %140 = getelementptr float, float* %53, i64 5, !dbg !177 + %141 = load float, float* %140, align 4, !dbg !178 + %142 = fmul float %118, %141, !dbg !179 + %143 = fadd float %139, %142, !dbg !180 + %144 = getelementptr float, float* %53, i64 7, !dbg !181 + %145 = load float, float* %144, align 4, !dbg !182 + %146 = fmul float %124, %145, !dbg !183 + %147 = fadd float %143, %146, !dbg !184 + %148 = getelementptr float, float* %78, i64 1, !dbg !185 + store float %147, float* %148, align 4, !dbg !186 + %149 = getelementptr float, float* %4, i64 4, !dbg !187 + %150 = load float, float* %149, align 4, !dbg !188 + %151 = getelementptr float, float* %78, i64 2, !dbg !189 + %152 = load float, float* %151, align 4, !dbg !190 + %153 = fmul float %150, %106, !dbg !191 + %154 = fadd float %152, %153, !dbg !192 + %155 = getelementptr float, float* %4, i64 5, !dbg !193 + %156 = load float, float* %155, align 4, !dbg !194 + %157 = fmul float %156, %114, !dbg !195 + %158 = fadd float %154, %157, !dbg !196 + %159 = getelementptr float, float* %4, i64 6, !dbg !197 + %160 = load float, float* %159, align 4, !dbg !198 + %161 = fmul float %160, %120, !dbg !199 + %162 = fadd float %158, %161, !dbg !200 + %163 = 
getelementptr float, float* %4, i64 7, !dbg !201 + %164 = load float, float* %163, align 4, !dbg !202 + %165 = fmul float %164, %126, !dbg !203 + %166 = fadd float %162, %165, !dbg !204 + %167 = getelementptr float, float* %78, i64 2, !dbg !205 + store float %166, float* %167, align 4, !dbg !206 + %168 = getelementptr float, float* %78, i64 3, !dbg !207 + %169 = load float, float* %168, align 4, !dbg !208 + %170 = fmul float %150, %131, !dbg !209 + %171 = fadd float %169, %170, !dbg !210 + %172 = fmul float %156, %137, !dbg !211 + %173 = fadd float %171, %172, !dbg !212 + %174 = fmul float %160, %141, !dbg !213 + %175 = fadd float %173, %174, !dbg !214 + %176 = fmul float %164, %145, !dbg !215 + %177 = fadd float %175, %176, !dbg !216 + %178 = getelementptr float, float* %78, i64 3, !dbg !217 + store float %177, float* %178, align 4, !dbg !218 + %179 = getelementptr float, float* %4, i64 8, !dbg !219 + %180 = load float, float* %179, align 4, !dbg !220 + %181 = getelementptr float, float* %78, i64 4, !dbg !221 + %182 = load float, float* %181, align 4, !dbg !222 + %183 = fmul float %180, %106, !dbg !223 + %184 = fadd float %182, %183, !dbg !224 + %185 = getelementptr float, float* %4, i64 9, !dbg !225 + %186 = load float, float* %185, align 4, !dbg !226 + %187 = fmul float %186, %114, !dbg !227 + %188 = fadd float %184, %187, !dbg !228 + %189 = getelementptr float, float* %4, i64 10, !dbg !229 + %190 = load float, float* %189, align 4, !dbg !230 + %191 = fmul float %190, %120, !dbg !231 + %192 = fadd float %188, %191, !dbg !232 + %193 = getelementptr float, float* %4, i64 11, !dbg !233 + %194 = load float, float* %193, align 4, !dbg !234 + %195 = fmul float %194, %126, !dbg !235 + %196 = fadd float %192, %195, !dbg !236 + %197 = getelementptr float, float* %78, i64 4, !dbg !237 + store float %196, float* %197, align 4, !dbg !238 + %198 = getelementptr float, float* %78, i64 5, !dbg !239 + %199 = load float, float* %198, align 4, !dbg !240 + %200 = fmul float %180, 
%131, !dbg !241 + %201 = fadd float %199, %200, !dbg !242 + %202 = fmul float %186, %137, !dbg !243 + %203 = fadd float %201, %202, !dbg !244 + %204 = fmul float %190, %141, !dbg !245 + %205 = fadd float %203, %204, !dbg !246 + %206 = fmul float %194, %145, !dbg !247 + %207 = fadd float %205, %206, !dbg !248 + %208 = getelementptr float, float* %78, i64 5, !dbg !249 + store float %207, float* %208, align 4, !dbg !250 + %209 = getelementptr float, float* %4, i64 12, !dbg !251 + %210 = load float, float* %209, align 4, !dbg !252 + %211 = getelementptr float, float* %78, i64 6, !dbg !253 + %212 = load float, float* %211, align 4, !dbg !254 + %213 = fmul float %210, %106, !dbg !255 + %214 = fadd float %212, %213, !dbg !256 + %215 = getelementptr float, float* %4, i64 13, !dbg !257 + %216 = load float, float* %215, align 4, !dbg !258 + %217 = fmul float %216, %114, !dbg !259 + %218 = fadd float %214, %217, !dbg !260 + %219 = getelementptr float, float* %4, i64 14, !dbg !261 + %220 = load float, float* %219, align 4, !dbg !262 + %221 = fmul float %220, %120, !dbg !263 + %222 = fadd float %218, %221, !dbg !264 + %223 = getelementptr float, float* %4, i64 15, !dbg !265 + %224 = load float, float* %223, align 4, !dbg !266 + %225 = fmul float %224, %126, !dbg !267 + %226 = fadd float %222, %225, !dbg !268 + %227 = getelementptr float, float* %78, i64 6, !dbg !269 + store float %226, float* %227, align 4, !dbg !270 + %228 = getelementptr float, float* %78, i64 7, !dbg !271 + %229 = load float, float* %228, align 4, !dbg !272 + %230 = fmul float %210, %131, !dbg !273 + %231 = fadd float %229, %230, !dbg !274 + %232 = fmul float %216, %137, !dbg !275 + %233 = fadd float %231, %232, !dbg !276 + %234 = fmul float %220, %141, !dbg !277 + %235 = fadd float %233, %234, !dbg !278 + %236 = fmul float %224, %145, !dbg !279 + %237 = fadd float %235, %236, !dbg !280 + %238 = getelementptr float, float* %78, i64 7, !dbg !281 + store float %237, float* %238, align 4, !dbg !282 + %239 = 
getelementptr float, float* %78, i64 0, !dbg !283 + %240 = load float, float* %239, align 4, !dbg !284 + %241 = getelementptr float, float* %2, i64 0, !dbg !285 + store float %240, float* %241, align 4, !dbg !286 + %242 = getelementptr float, float* %78, i64 1, !dbg !287 + %243 = load float, float* %242, align 4, !dbg !288 + %244 = getelementptr float, float* %2, i64 1, !dbg !289 + store float %243, float* %244, align 4, !dbg !290 + %245 = getelementptr float, float* %78, i64 2, !dbg !291 + %246 = load float, float* %245, align 4, !dbg !292 + %247 = getelementptr float, float* %2, i64 2, !dbg !293 + store float %246, float* %247, align 4, !dbg !294 + %248 = getelementptr float, float* %78, i64 3, !dbg !295 + %249 = load float, float* %248, align 4, !dbg !296 + %250 = getelementptr float, float* %2, i64 3, !dbg !297 + store float %249, float* %250, align 4, !dbg !298 + %251 = getelementptr float, float* %78, i64 4, !dbg !299 + %252 = load float, float* %251, align 4, !dbg !300 + %253 = getelementptr float, float* %2, i64 4, !dbg !301 + store float %252, float* %253, align 4, !dbg !302 + %254 = getelementptr float, float* %78, i64 5, !dbg !303 + %255 = load float, float* %254, align 4, !dbg !304 + %256 = getelementptr float, float* %2, i64 5, !dbg !305 + store float %255, float* %256, align 4, !dbg !306 + %257 = getelementptr float, float* %78, i64 6, !dbg !307 + %258 = load float, float* %257, align 4, !dbg !308 + %259 = getelementptr float, float* %2, i64 6, !dbg !309 + store float %258, float* %259, align 4, !dbg !310 + %260 = getelementptr float, float* %78, i64 7, !dbg !311 + %261 = load float, float* %260, align 4, !dbg !312 + %262 = getelementptr float, float* %2, i64 7, !dbg !313 + store float %261, float* %262, align 4, !dbg !314 + ret void, !dbg !315 +} + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "mlir", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug) +!1 = 
!DIFile(filename: "LLVMDialectModule", directory: "/") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = distinct !DISubprogram(name: "main_kernel", linkageName: "main_kernel", scope: null, file: !4, line: 2, type: !5, scopeLine: 2, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !6) +!4 = !DIFile(filename: "output/04optimized.mlir", directory: "/files0/extended/bohm747/Development/soda/soda-opt/docs/tutorials/dataflow2022") +!5 = !DISubroutineType(types: !6) +!6 = !{} +!7 = !DILocation(line: 11, column: 10, scope: !8) +!8 = !DILexicalBlockFile(scope: !3, file: !4, discriminator: 0) +!9 = !DILocation(line: 18, column: 11, scope: !8) +!10 = !DILocation(line: 19, column: 11, scope: !8) +!11 = !DILocation(line: 26, column: 11, scope: !8) +!12 = !DILocation(line: 27, column: 5, scope: !8) +!13 = !DILocation(line: 34, column: 11, scope: !8) +!14 = !DILocation(line: 35, column: 11, scope: !8) +!15 = !DILocation(line: 42, column: 11, scope: !8) +!16 = !DILocation(line: 43, column: 5, scope: !8) +!17 = !DILocation(line: 50, column: 11, scope: !8) +!18 = !DILocation(line: 51, column: 11, scope: !8) +!19 = !DILocation(line: 58, column: 11, scope: !8) +!20 = !DILocation(line: 59, column: 5, scope: !8) +!21 = !DILocation(line: 66, column: 11, scope: !8) +!22 = !DILocation(line: 67, column: 11, scope: !8) +!23 = !DILocation(line: 74, column: 11, scope: !8) +!24 = !DILocation(line: 75, column: 5, scope: !8) +!25 = !DILocation(line: 82, column: 11, scope: !8) +!26 = !DILocation(line: 83, column: 11, scope: !8) +!27 = !DILocation(line: 90, column: 11, scope: !8) +!28 = !DILocation(line: 91, column: 5, scope: !8) +!29 = !DILocation(line: 98, column: 11, scope: !8) +!30 = !DILocation(line: 99, column: 11, scope: !8) +!31 = !DILocation(line: 106, column: 11, scope: !8) +!32 = !DILocation(line: 107, column: 5, scope: !8) +!33 = !DILocation(line: 114, column: 12, scope: !8) +!34 = !DILocation(line: 115, column: 12, scope: !8) +!35 = !DILocation(line: 122, 
column: 12, scope: !8) +!36 = !DILocation(line: 123, column: 5, scope: !8) +!37 = !DILocation(line: 130, column: 12, scope: !8) +!38 = !DILocation(line: 131, column: 12, scope: !8) +!39 = !DILocation(line: 138, column: 12, scope: !8) +!40 = !DILocation(line: 139, column: 5, scope: !8) +!41 = !DILocation(line: 146, column: 12, scope: !8) +!42 = !DILocation(line: 147, column: 12, scope: !8) +!43 = !DILocation(line: 154, column: 12, scope: !8) +!44 = !DILocation(line: 155, column: 5, scope: !8) +!45 = !DILocation(line: 162, column: 12, scope: !8) +!46 = !DILocation(line: 163, column: 12, scope: !8) +!47 = !DILocation(line: 170, column: 12, scope: !8) +!48 = !DILocation(line: 171, column: 5, scope: !8) +!49 = !DILocation(line: 178, column: 12, scope: !8) +!50 = !DILocation(line: 179, column: 12, scope: !8) +!51 = !DILocation(line: 186, column: 12, scope: !8) +!52 = !DILocation(line: 187, column: 5, scope: !8) +!53 = !DILocation(line: 194, column: 12, scope: !8) +!54 = !DILocation(line: 195, column: 12, scope: !8) +!55 = !DILocation(line: 202, column: 12, scope: !8) +!56 = !DILocation(line: 203, column: 5, scope: !8) +!57 = !DILocation(line: 210, column: 12, scope: !8) +!58 = !DILocation(line: 211, column: 12, scope: !8) +!59 = !DILocation(line: 218, column: 12, scope: !8) +!60 = !DILocation(line: 219, column: 5, scope: !8) +!61 = !DILocation(line: 226, column: 12, scope: !8) +!62 = !DILocation(line: 227, column: 12, scope: !8) +!63 = !DILocation(line: 234, column: 12, scope: !8) +!64 = !DILocation(line: 235, column: 5, scope: !8) +!65 = !DILocation(line: 242, column: 12, scope: !8) +!66 = !DILocation(line: 243, column: 12, scope: !8) +!67 = !DILocation(line: 250, column: 12, scope: !8) +!68 = !DILocation(line: 251, column: 5, scope: !8) +!69 = !DILocation(line: 258, column: 12, scope: !8) +!70 = !DILocation(line: 259, column: 12, scope: !8) +!71 = !DILocation(line: 266, column: 12, scope: !8) +!72 = !DILocation(line: 267, column: 5, scope: !8) +!73 = !DILocation(line: 
272, column: 12, scope: !8) +!74 = !DILocation(line: 279, column: 12, scope: !8) +!75 = !DILocation(line: 280, column: 12, scope: !8) +!76 = !DILocation(line: 287, column: 12, scope: !8) +!77 = !DILocation(line: 288, column: 5, scope: !8) +!78 = !DILocation(line: 295, column: 12, scope: !8) +!79 = !DILocation(line: 296, column: 12, scope: !8) +!80 = !DILocation(line: 303, column: 12, scope: !8) +!81 = !DILocation(line: 304, column: 5, scope: !8) +!82 = !DILocation(line: 311, column: 12, scope: !8) +!83 = !DILocation(line: 312, column: 12, scope: !8) +!84 = !DILocation(line: 319, column: 12, scope: !8) +!85 = !DILocation(line: 320, column: 5, scope: !8) +!86 = !DILocation(line: 327, column: 12, scope: !8) +!87 = !DILocation(line: 328, column: 12, scope: !8) +!88 = !DILocation(line: 335, column: 12, scope: !8) +!89 = !DILocation(line: 336, column: 5, scope: !8) +!90 = !DILocation(line: 343, column: 12, scope: !8) +!91 = !DILocation(line: 344, column: 12, scope: !8) +!92 = !DILocation(line: 351, column: 12, scope: !8) +!93 = !DILocation(line: 352, column: 5, scope: !8) +!94 = !DILocation(line: 359, column: 12, scope: !8) +!95 = !DILocation(line: 360, column: 12, scope: !8) +!96 = !DILocation(line: 367, column: 12, scope: !8) +!97 = !DILocation(line: 368, column: 5, scope: !8) +!98 = !DILocation(line: 375, column: 12, scope: !8) +!99 = !DILocation(line: 376, column: 12, scope: !8) +!100 = !DILocation(line: 383, column: 12, scope: !8) +!101 = !DILocation(line: 384, column: 5, scope: !8) +!102 = !DILocation(line: 391, column: 12, scope: !8) +!103 = !DILocation(line: 392, column: 12, scope: !8) +!104 = !DILocation(line: 399, column: 12, scope: !8) +!105 = !DILocation(line: 400, column: 5, scope: !8) +!106 = !DILocation(line: 405, column: 12, scope: !8) +!107 = !DILocation(line: 412, column: 12, scope: !8) +!108 = !DILocation(line: 413, column: 12, scope: !8) +!109 = !DILocation(line: 420, column: 12, scope: !8) +!110 = !DILocation(line: 421, column: 5, scope: !8) +!111 = 
!DILocation(line: 428, column: 12, scope: !8) +!112 = !DILocation(line: 429, column: 12, scope: !8) +!113 = !DILocation(line: 436, column: 12, scope: !8) +!114 = !DILocation(line: 437, column: 5, scope: !8) +!115 = !DILocation(line: 444, column: 12, scope: !8) +!116 = !DILocation(line: 445, column: 12, scope: !8) +!117 = !DILocation(line: 452, column: 12, scope: !8) +!118 = !DILocation(line: 453, column: 5, scope: !8) +!119 = !DILocation(line: 460, column: 12, scope: !8) +!120 = !DILocation(line: 461, column: 12, scope: !8) +!121 = !DILocation(line: 468, column: 12, scope: !8) +!122 = !DILocation(line: 469, column: 5, scope: !8) +!123 = !DILocation(line: 476, column: 12, scope: !8) +!124 = !DILocation(line: 477, column: 12, scope: !8) +!125 = !DILocation(line: 484, column: 12, scope: !8) +!126 = !DILocation(line: 485, column: 5, scope: !8) +!127 = !DILocation(line: 492, column: 12, scope: !8) +!128 = !DILocation(line: 493, column: 12, scope: !8) +!129 = !DILocation(line: 500, column: 12, scope: !8) +!130 = !DILocation(line: 501, column: 5, scope: !8) +!131 = !DILocation(line: 508, column: 12, scope: !8) +!132 = !DILocation(line: 509, column: 12, scope: !8) +!133 = !DILocation(line: 516, column: 12, scope: !8) +!134 = !DILocation(line: 517, column: 5, scope: !8) +!135 = !DILocation(line: 524, column: 12, scope: !8) +!136 = !DILocation(line: 525, column: 12, scope: !8) +!137 = !DILocation(line: 532, column: 12, scope: !8) +!138 = !DILocation(line: 533, column: 5, scope: !8) +!139 = !DILocation(line: 540, column: 12, scope: !8) +!140 = !DILocation(line: 541, column: 12, scope: !8) +!141 = !DILocation(line: 548, column: 12, scope: !8) +!142 = !DILocation(line: 549, column: 12, scope: !8) +!143 = !DILocation(line: 556, column: 12, scope: !8) +!144 = !DILocation(line: 557, column: 12, scope: !8) +!145 = !DILocation(line: 558, column: 12, scope: !8) +!146 = !DILocation(line: 559, column: 12, scope: !8) +!147 = !DILocation(line: 566, column: 12, scope: !8) +!148 = 
!DILocation(line: 567, column: 12, scope: !8) +!149 = !DILocation(line: 574, column: 12, scope: !8) +!150 = !DILocation(line: 575, column: 12, scope: !8) +!151 = !DILocation(line: 576, column: 12, scope: !8) +!152 = !DILocation(line: 577, column: 12, scope: !8) +!153 = !DILocation(line: 584, column: 12, scope: !8) +!154 = !DILocation(line: 585, column: 12, scope: !8) +!155 = !DILocation(line: 592, column: 12, scope: !8) +!156 = !DILocation(line: 593, column: 12, scope: !8) +!157 = !DILocation(line: 594, column: 12, scope: !8) +!158 = !DILocation(line: 595, column: 12, scope: !8) +!159 = !DILocation(line: 602, column: 12, scope: !8) +!160 = !DILocation(line: 603, column: 12, scope: !8) +!161 = !DILocation(line: 610, column: 12, scope: !8) +!162 = !DILocation(line: 611, column: 12, scope: !8) +!163 = !DILocation(line: 612, column: 12, scope: !8) +!164 = !DILocation(line: 613, column: 12, scope: !8) +!165 = !DILocation(line: 620, column: 12, scope: !8) +!166 = !DILocation(line: 621, column: 5, scope: !8) +!167 = !DILocation(line: 628, column: 12, scope: !8) +!168 = !DILocation(line: 629, column: 12, scope: !8) +!169 = !DILocation(line: 636, column: 12, scope: !8) +!170 = !DILocation(line: 637, column: 12, scope: !8) +!171 = !DILocation(line: 638, column: 12, scope: !8) +!172 = !DILocation(line: 639, column: 12, scope: !8) +!173 = !DILocation(line: 646, column: 12, scope: !8) +!174 = !DILocation(line: 647, column: 12, scope: !8) +!175 = !DILocation(line: 648, column: 12, scope: !8) +!176 = !DILocation(line: 649, column: 12, scope: !8) +!177 = !DILocation(line: 656, column: 12, scope: !8) +!178 = !DILocation(line: 657, column: 12, scope: !8) +!179 = !DILocation(line: 658, column: 12, scope: !8) +!180 = !DILocation(line: 659, column: 12, scope: !8) +!181 = !DILocation(line: 666, column: 12, scope: !8) +!182 = !DILocation(line: 667, column: 12, scope: !8) +!183 = !DILocation(line: 668, column: 12, scope: !8) +!184 = !DILocation(line: 669, column: 12, scope: !8) +!185 = 
!DILocation(line: 676, column: 12, scope: !8) +!186 = !DILocation(line: 677, column: 5, scope: !8) +!187 = !DILocation(line: 684, column: 12, scope: !8) +!188 = !DILocation(line: 685, column: 12, scope: !8) +!189 = !DILocation(line: 692, column: 12, scope: !8) +!190 = !DILocation(line: 693, column: 12, scope: !8) +!191 = !DILocation(line: 694, column: 12, scope: !8) +!192 = !DILocation(line: 695, column: 12, scope: !8) +!193 = !DILocation(line: 702, column: 12, scope: !8) +!194 = !DILocation(line: 703, column: 12, scope: !8) +!195 = !DILocation(line: 704, column: 12, scope: !8) +!196 = !DILocation(line: 705, column: 12, scope: !8) +!197 = !DILocation(line: 712, column: 12, scope: !8) +!198 = !DILocation(line: 713, column: 12, scope: !8) +!199 = !DILocation(line: 714, column: 12, scope: !8) +!200 = !DILocation(line: 715, column: 12, scope: !8) +!201 = !DILocation(line: 722, column: 12, scope: !8) +!202 = !DILocation(line: 723, column: 12, scope: !8) +!203 = !DILocation(line: 724, column: 12, scope: !8) +!204 = !DILocation(line: 725, column: 12, scope: !8) +!205 = !DILocation(line: 732, column: 12, scope: !8) +!206 = !DILocation(line: 733, column: 5, scope: !8) +!207 = !DILocation(line: 740, column: 12, scope: !8) +!208 = !DILocation(line: 741, column: 12, scope: !8) +!209 = !DILocation(line: 742, column: 12, scope: !8) +!210 = !DILocation(line: 743, column: 12, scope: !8) +!211 = !DILocation(line: 744, column: 12, scope: !8) +!212 = !DILocation(line: 745, column: 12, scope: !8) +!213 = !DILocation(line: 746, column: 12, scope: !8) +!214 = !DILocation(line: 747, column: 12, scope: !8) +!215 = !DILocation(line: 748, column: 12, scope: !8) +!216 = !DILocation(line: 749, column: 12, scope: !8) +!217 = !DILocation(line: 756, column: 12, scope: !8) +!218 = !DILocation(line: 757, column: 5, scope: !8) +!219 = !DILocation(line: 764, column: 12, scope: !8) +!220 = !DILocation(line: 765, column: 12, scope: !8) +!221 = !DILocation(line: 772, column: 12, scope: !8) +!222 = 
!DILocation(line: 773, column: 12, scope: !8) +!223 = !DILocation(line: 774, column: 12, scope: !8) +!224 = !DILocation(line: 775, column: 12, scope: !8) +!225 = !DILocation(line: 782, column: 12, scope: !8) +!226 = !DILocation(line: 783, column: 12, scope: !8) +!227 = !DILocation(line: 784, column: 12, scope: !8) +!228 = !DILocation(line: 785, column: 12, scope: !8) +!229 = !DILocation(line: 792, column: 12, scope: !8) +!230 = !DILocation(line: 793, column: 12, scope: !8) +!231 = !DILocation(line: 794, column: 12, scope: !8) +!232 = !DILocation(line: 795, column: 12, scope: !8) +!233 = !DILocation(line: 802, column: 12, scope: !8) +!234 = !DILocation(line: 803, column: 12, scope: !8) +!235 = !DILocation(line: 804, column: 12, scope: !8) +!236 = !DILocation(line: 805, column: 12, scope: !8) +!237 = !DILocation(line: 812, column: 12, scope: !8) +!238 = !DILocation(line: 813, column: 5, scope: !8) +!239 = !DILocation(line: 820, column: 12, scope: !8) +!240 = !DILocation(line: 821, column: 12, scope: !8) +!241 = !DILocation(line: 822, column: 12, scope: !8) +!242 = !DILocation(line: 823, column: 12, scope: !8) +!243 = !DILocation(line: 824, column: 12, scope: !8) +!244 = !DILocation(line: 825, column: 12, scope: !8) +!245 = !DILocation(line: 826, column: 12, scope: !8) +!246 = !DILocation(line: 827, column: 12, scope: !8) +!247 = !DILocation(line: 828, column: 12, scope: !8) +!248 = !DILocation(line: 829, column: 12, scope: !8) +!249 = !DILocation(line: 836, column: 12, scope: !8) +!250 = !DILocation(line: 837, column: 5, scope: !8) +!251 = !DILocation(line: 844, column: 12, scope: !8) +!252 = !DILocation(line: 845, column: 12, scope: !8) +!253 = !DILocation(line: 852, column: 12, scope: !8) +!254 = !DILocation(line: 853, column: 12, scope: !8) +!255 = !DILocation(line: 854, column: 12, scope: !8) +!256 = !DILocation(line: 855, column: 12, scope: !8) +!257 = !DILocation(line: 862, column: 12, scope: !8) +!258 = !DILocation(line: 863, column: 12, scope: !8) +!259 = 
!DILocation(line: 864, column: 12, scope: !8) +!260 = !DILocation(line: 865, column: 12, scope: !8) +!261 = !DILocation(line: 872, column: 12, scope: !8) +!262 = !DILocation(line: 873, column: 12, scope: !8) +!263 = !DILocation(line: 874, column: 12, scope: !8) +!264 = !DILocation(line: 875, column: 12, scope: !8) +!265 = !DILocation(line: 882, column: 12, scope: !8) +!266 = !DILocation(line: 883, column: 12, scope: !8) +!267 = !DILocation(line: 884, column: 12, scope: !8) +!268 = !DILocation(line: 885, column: 12, scope: !8) +!269 = !DILocation(line: 892, column: 12, scope: !8) +!270 = !DILocation(line: 893, column: 5, scope: !8) +!271 = !DILocation(line: 900, column: 12, scope: !8) +!272 = !DILocation(line: 901, column: 12, scope: !8) +!273 = !DILocation(line: 902, column: 12, scope: !8) +!274 = !DILocation(line: 903, column: 12, scope: !8) +!275 = !DILocation(line: 904, column: 12, scope: !8) +!276 = !DILocation(line: 905, column: 12, scope: !8) +!277 = !DILocation(line: 906, column: 12, scope: !8) +!278 = !DILocation(line: 907, column: 12, scope: !8) +!279 = !DILocation(line: 908, column: 12, scope: !8) +!280 = !DILocation(line: 909, column: 12, scope: !8) +!281 = !DILocation(line: 916, column: 12, scope: !8) +!282 = !DILocation(line: 917, column: 5, scope: !8) +!283 = !DILocation(line: 924, column: 12, scope: !8) +!284 = !DILocation(line: 925, column: 12, scope: !8) +!285 = !DILocation(line: 932, column: 12, scope: !8) +!286 = !DILocation(line: 933, column: 5, scope: !8) +!287 = !DILocation(line: 940, column: 12, scope: !8) +!288 = !DILocation(line: 941, column: 12, scope: !8) +!289 = !DILocation(line: 948, column: 12, scope: !8) +!290 = !DILocation(line: 949, column: 5, scope: !8) +!291 = !DILocation(line: 956, column: 12, scope: !8) +!292 = !DILocation(line: 957, column: 12, scope: !8) +!293 = !DILocation(line: 964, column: 12, scope: !8) +!294 = !DILocation(line: 965, column: 5, scope: !8) +!295 = !DILocation(line: 972, column: 12, scope: !8) +!296 = 
!DILocation(line: 973, column: 12, scope: !8) +!297 = !DILocation(line: 980, column: 12, scope: !8) +!298 = !DILocation(line: 981, column: 5, scope: !8) +!299 = !DILocation(line: 988, column: 12, scope: !8) +!300 = !DILocation(line: 989, column: 12, scope: !8) +!301 = !DILocation(line: 996, column: 12, scope: !8) +!302 = !DILocation(line: 997, column: 5, scope: !8) +!303 = !DILocation(line: 1004, column: 12, scope: !8) +!304 = !DILocation(line: 1005, column: 12, scope: !8) +!305 = !DILocation(line: 1012, column: 12, scope: !8) +!306 = !DILocation(line: 1013, column: 5, scope: !8) +!307 = !DILocation(line: 1020, column: 12, scope: !8) +!308 = !DILocation(line: 1021, column: 12, scope: !8) +!309 = !DILocation(line: 1028, column: 12, scope: !8) +!310 = !DILocation(line: 1029, column: 5, scope: !8) +!311 = !DILocation(line: 1036, column: 12, scope: !8) +!312 = !DILocation(line: 1037, column: 12, scope: !8) +!313 = !DILocation(line: 1044, column: 12, scope: !8) +!314 = !DILocation(line: 1045, column: 5, scope: !8) +!315 = !DILocation(line: 1046, column: 5, scope: !8) diff --git a/documentation/tutorial_fpl_2022/01-introduction/Exercise3/test.xml b/documentation/tutorial_fpl_2022/01-introduction/Exercise3/test.xml new file mode 100644 index 000000000..56aea2e94 --- /dev/null +++ b/documentation/tutorial_fpl_2022/01-introduction/Exercise3/test.xml @@ -0,0 +1,8 @@ + + + + diff --git a/documentation/tutorial_fpl_2022/01-introduction/Exercise4/bambu.sh b/documentation/tutorial_fpl_2022/01-introduction/Exercise4/bambu.sh new file mode 100644 index 000000000..a9c45b8a0 --- /dev/null +++ b/documentation/tutorial_fpl_2022/01-introduction/Exercise4/bambu.sh @@ -0,0 +1,2 @@ +#!/bin/bash +bambu proxies.c --top-fname=funcA "$@" |& tee log.txt diff --git a/documentation/tutorial_fpl_2022/01-introduction/Exercise4/proxies.c b/documentation/tutorial_fpl_2022/01-introduction/Exercise4/proxies.c new file mode 100644 index 000000000..cc89ba13e --- /dev/null +++ 
b/documentation/tutorial_fpl_2022/01-introduction/Exercise4/proxies.c @@ -0,0 +1,18 @@ +int __attribute__ ((noinline)) funcC(int a[2]){ + return a[0] * a[0] + a[1] * a[1]; +} + +int __attribute__ ((noinline)) funcB(int a[2]){ + int i; + for(i=0; i<2; i++) + a[i] = a[i] + 1; + return funcC(a); +} + +int funcA(){ + int temp1, temp2; + int a[2] = {0,1}; + temp1 = funcC(a); + temp2 = funcB(a); + return temp1 + temp2; +} \ No newline at end of file diff --git a/documentation/tutorial_fpl_2022/01-introduction/Exercise5/LUdecomposition.c b/documentation/tutorial_fpl_2022/01-introduction/Exercise5/LUdecomposition.c new file mode 100644 index 000000000..a20e6b9f7 --- /dev/null +++ b/documentation/tutorial_fpl_2022/01-introduction/Exercise5/LUdecomposition.c @@ -0,0 +1,270 @@ +//////////////////////////////////////////////////////////////////////////////// +// int Upper_Triangular_Solve(float *U, float *B, float x[], int n) // +// // +// Description: // +// This routine solves the linear equation Ux = B, where U is an n x n // +// upper triangular matrix. (The subdiagonal part of the matrix is // +// not addressed.) // +// The algorithm follows: // +// x[n-1] = B[n-1]/U[n-1][n-1], and // +// x[i] = [B[i] - (U[i][i+1] * x[i+1] + ... + U[i][n-1] * x[n-1])] // +// / U[i][i], // +// for i = n-2, ..., 0. // +// // +// Arguments: // +// float *U Pointer to the first element of the upper triangular // +// matrix. // +// float *B Pointer to the column vector, (n x 1) matrix, B. // +// float *x Pointer to the column vector, (n x 1) matrix, x. // +// int n The number of rows or columns of the matrix U. // +// // +// Return Values: // +// 0 Success // +// -1 Failure - The matrix U is singular. 
// +// // +// Example: // +// #define N // +// float A[N][N], B[N], x[N]; // +// // +// (your code to create matrix A and column vector B) // +// err = Upper_Triangular_Solve(&A[0][0], B, x, n); // +// if (err < 0) printf(" Matrix A is singular\n"); // +// else printf(" The solution is \n"); // +// ... // +//////////////////////////////////////////////////////////////////////////////// +// // +int Upper_Triangular_Solve(float *U, float B[], float x[], int n) +{ + int i, k; + +// Solve the linear equation Ux = B for x, where U is an upper +// triangular matrix. + + for (k = n-1, U += n * (n - 1); k >= 0; U -= n, k--) { + if (*(U + k) == 0.0) return -1; // The matrix U is singular + x[k] = B[k]; + for (i = k + 1; i < n; i++) x[k] -= x[i] * *(U + i); + x[k] /= *(U + k); + } + + return 0; +} + +//////////////////////////////////////////////////////////////////////////////// +// void Unit_Lower_Triangular_Solve(float *L, float *B, float x[], int n) // +// // +// Description: // +// This routine solves the linear equation Lx = B, where L is an n x n // +// unit lower triangular matrix. (Only the subdiagonal part of the matrix// +// is addressed.) The diagonal is assumed to consist of 1's and is not // +// addressed. // +// The algorithm follows: // +// x[0] = B[0], and // +// x[i] = B[i] - (L[i][0] * x[0] + ... + L[i][i-1] * x[i-1]), // +// for i = 1, ..., n-1. // +// // +// Arguments: // +// float *L Pointer to the first element of the unit lower triangular // +// matrix. // +// float *B Pointer to the column vector, (n x 1) matrix, B. // +// float *x Pointer to the column vector, (n x 1) matrix, x. // +// int n The number of rows or columns of the matrix L. // +// // +// Return Values: // +// void // +// // +// Example: // +// #define N // +// float A[N][N], B[N], x[N]; // +// // +// (your code to create matrix A and column vector B) // +// Unit_Lower_Triangular_Solve(&A[0][0], B, x, n); // +// printf(" The solution is \n"); // +// ... 
// +//////////////////////////////////////////////////////////////////////////////// +// // +void Unit_Lower_Triangular_Solve(float *L, float B[], float x[], int n) +{ + int i, k; + +// Solve the linear equation Lx = B for x, where L is a unit lower +// triangular matrix. + + x[0] = B[0]; + for (k = 1, L += n; k < n; L += n, k++) + for (i = 0, x[k] = B[k]; i < k; i++) x[k] -= x[i] * *(L + i); +} + +//////////////////////////////////////////////////////////////////////////////// +// int Doolittle_LU_Decomposition(float *A, int n) // +// // +// Description: // +// This routine uses Doolittle's method to decompose the n x n matrix A // +// into a unit lower triangular matrix L and an upper triangular matrix U // +// such that A = LU. // +// The matrices L and U replace the matrix A so that the original matrix // +// A is destroyed. // +// Note! In Doolittle's method the diagonal elements of L are 1 and are // +// not stored. // +// Note! The determinant of A is the product of the diagonal elements // +// of U. (det A = det L * det U = det U). // +// This routine is suitable for those classes of matrices which when // +// performing Gaussian elimination do not need to undergo partial // +// pivoting, e.g. positive definite symmetric matrices, diagonally // +// dominant band matrices, etc. // +// For the more general case in which partial pivoting is needed use // +// Doolittle_LU_Decomposition_with_Pivoting. // +// The LU decomposition is convenient when one needs to solve the linear // +// equation Ax = B for the vector x while the matrix A is fixed and the // +// vector B is varied. The routine for solving the linear system Ax = B // +// after performing the LU decomposition for A is Doolittle_LU_Solve // +// (see below). // +// // +// The Doolittle method is given by evaluating, in order, the following // +// pair of expressions for k = 0, ... , n-1: // +// U[k][j] = A[k][j] - (L[k][0]*U[0][j] + ... + L[k][k-1]*U[k-1][j]) // +// for j = k, k+1, ... 
, n-1 // +// L[i][k] = (A[i][k] - (L[i][0]*U[0][k] + . + L[i][k-1]*U[k-1][k])) // +// / U[k][k] // +// for i = k+1, ... , n-1. // +// The matrix U forms the upper triangular matrix, and the matrix L // +// forms the lower triangular matrix. // +// // +// Arguments: // +// float *A Pointer to the first element of the matrix A[n][n]. // +// int n The number of rows or columns of the matrix A. // +// // +// Return Values: // +// 0 Success // +// -1 Failure - The matrix A is singular. // +// // +// Example: // +// #define N // +// float A[N][N]; // +// // +// (your code to intialize the matrix A) // +// // +// err = Doolittle_LU_Decomposition(&A[0][0], N); // +// if (err < 0) printf(" Matrix A is singular\n"); // +// else { printf(" The LU decomposition of A is \n"); // +// ... // +//////////////////////////////////////////////////////////////////////////////// +// // +int Doolittle_LU_Decomposition(float *A, int n) +{ + int i, j, k, p; + float *p_k, *p_row, *p_col; + +// For each row and column, k = 0, ..., n-1, +// find the upper triangular matrix elements for row k +// and if the matrix is non-singular (nonzero diagonal element). +// find the lower triangular matrix elements for column k. + + for (k = 0, p_k = A; k < n; p_k += n, k++) { + for (j = k; j < n; j++) { + for (p = 0, p_col = A; p < k; p_col += n, p++) + *(p_k + j) -= *(p_k + p) * *(p_col + j); + } + if ( *(p_k + k) == 0.0 ) return -1; + for (i = k+1, p_row = p_k + n; i < n; p_row += n, i++) { + for (p = 0, p_col = A; p < k; p_col += n, p++) + *(p_row + k) -= *(p_row + p) * *(p_col + k); + *(p_row + k) /= *(p_k + k); + } + } + return 0; +} + + +//////////////////////////////////////////////////////////////////////////////// +// int Doolittle_LU_Solve(float *LU, float *B, float *x, int n) // +// // +// Description: // +// This routine uses Doolittle's method to solve the linear equation // +// Ax = B. 
This routine is called after the matrix A has been decomposed // +// into a product of a unit lower triangular matrix L and an upper // +// triangular matrix U without pivoting. The argument LU is a pointer to // +// the matrix the subdiagonal part of which is L and the superdiagonal // +// together with the diagonal part is U. (The diagonal part of L is 1 and // +// is not stored.) The matrix A = LU. // +// The solution proceeds by solving the linear equation Ly = B for y and // +// subsequently solving the linear equation Ux = y for x. // +// // +// Arguments: // +// float *LU Pointer to the first element of the matrix whose elements // +// form the lower and upper triangular matrix factors of A. // +// float *B Pointer to the column vector, (n x 1) matrix, B // +// float *x Solution to the equation Ax = B. // +// int n The number of rows or columns of the matrix LU. // +// // +// Return Values: // +// 0 Success // +// -1 Failure - The matrix A is singular. // +// // +// Example: // +// #define N // +// float A[N][N], B[N], x[N]; // +// // +// (your code to create matrix A and column vector B) // +// err = Doolittle_LU_Decomposition(&A[0][0], N); // +// if (err < 0) printf(" Matrix A is singular\n"); // +// else { // +// err = Doolittle_LU_Solve(&A[0][0], B, x, n); // +// if (err < 0) printf(" Matrix A is singular\n"); // +// else printf(" The solution is \n"); // +// ... // +// } // +//////////////////////////////////////////////////////////////////////////////// +// // +int Doolittle_LU_Solve(float *LU, float B[], float x[], int n) +{ + +// Solve the linear equation Lx = B for x, where L is a lower +// triangular matrix with an implied 1 along the diagonal. + + Unit_Lower_Triangular_Solve(LU, B, x, n); + +// Solve the linear equation Ux = y, where y is the solution +// obtained above of Lx = B and U is an upper triangular matrix. 
+ + return Upper_Triangular_Solve(LU, x, x, n); +} + +int invertMatrix(float *LU, float *invA, float *I) +{ + int i, j; + // float I[4][4] = {{1, 0, 0, 0}, {0, 1, 0, 0}, {0, 0, 1, 0}, {0, 0, 0, 1}}; + float resultColumn[4]; + + for (i = 0; i < 4; ++i) + { + int res = Doolittle_LU_Solve(LU, I + i*4, resultColumn, 4); + + if (res != 0) return res; + for (j = 0; j < 4; ++j) + *(invA + i + j * 4) = resultColumn[j]; + } + + return 0; +} + +//float A[4][4] = {{1, 1, 1, 1}, {1, 4, 2, 3}, {1, 2, 1, 2}, {1, 1, 1, 0}}; +//float invA[4][4]= {{0, 0, 0, 0}, {0, 0, 0, 0}, {0, 0, 0, 0}, {0, 0, 0, 0}}; + +int fun(float *A, float *invA, float *b, float *x, float *I) +{ + int res = Doolittle_LU_Decomposition((float *)A, 4); + + if (res != 0) return res; + + // float b[4] = {63, 105, 48, 186}; + // float x[4]; + + res = Doolittle_LU_Solve((float *)A, b, x, 4); + + if (res != 0) return res; + + res = invertMatrix((float *)A, (float *)invA, I); + + return res; +} diff --git a/documentation/tutorial_fpl_2022/01-introduction/Exercise5/bambu.sh b/documentation/tutorial_fpl_2022/01-introduction/Exercise5/bambu.sh new file mode 100755 index 000000000..f37d97ff8 --- /dev/null +++ b/documentation/tutorial_fpl_2022/01-introduction/Exercise5/bambu.sh @@ -0,0 +1,12 @@ +#!/bin/bash +script=$(readlink -e $0) +root_dir=$(dirname $script) + +rm -rf ludecomp +mkdir -p ludecomp +cd ludecomp +echo "#synthesis of fun" +bambu $root_dir/LUdecomposition.c --top-fname=fun \ + -O1 \ + --generate-tb=$root_dir/test.xml --simulate --simulator=VERILATOR \ + -v2 --print-dot "$@" |& tee log.txt diff --git a/documentation/tutorial_fpl_2022/01-introduction/Exercise5/test.xml b/documentation/tutorial_fpl_2022/01-introduction/Exercise5/test.xml new file mode 100644 index 000000000..500454cc7 --- /dev/null +++ b/documentation/tutorial_fpl_2022/01-introduction/Exercise5/test.xml @@ -0,0 +1,4 @@ + + + + diff --git a/documentation/tutorial_fpl_2022/01-introduction/Exercise5a/bambu.sh 
b/documentation/tutorial_fpl_2022/01-introduction/Exercise5a/bambu.sh new file mode 100644 index 000000000..11ecb2ac9 --- /dev/null +++ b/documentation/tutorial_fpl_2022/01-introduction/Exercise5a/bambu.sh @@ -0,0 +1,3 @@ +#!/bin/bash +# Synthesize helm_naive from helm.c and simulate it with Verilator, using test.xml as the testbench. +bambu helm.c --top-fname=helm_naive -Icommon.h --simulate --simulator=VERILATOR --generate-tb=test.xml --compiler=I386_CLANG6 \ No newline at end of file diff --git a/documentation/tutorial_fpl_2022/01-introduction/Exercise5a/common.h b/documentation/tutorial_fpl_2022/01-introduction/Exercise5a/common.h new file mode 100644 index 000000000..1381fda68 --- /dev/null +++ b/documentation/tutorial_fpl_2022/01-introduction/Exercise5a/common.h @@ -0,0 +1,46 @@ +#pragma once + +#include +#include +#include +#include + +typedef float real_t; + +real_t* make_empty(size_t size) +{ + return (real_t*)calloc(size, sizeof(real_t)); +} + +real_t* make_random(size_t size) +{ + real_t* result = make_empty(size); + if (!result) return NULL; + + real_t* end = result + size; + for (real_t* ptr = result; ptr != end; ++ptr) { + *ptr = ((real_t)random() / RAND_MAX) * (real_t)(2) - (real_t)(1); + } + + return result; +} + +real_t* make_copy(const real_t* data, size_t size) +{ + real_t* result = make_empty(size); + if (!result) return NULL; + + memcpy(result, data, size*sizeof(real_t)); + return result; +} + +real_t mse(const real_t* a, const real_t* b, size_t size) +{ + real_t accu = 0; + const real_t* a_end = a + size; + for (; a != a_end; ++a,++b) { + real_t err = (*a - *b); + accu += err * err; + } + return accu / (real_t)(size); +} diff --git a/documentation/tutorial_fpl_2022/01-introduction/Exercise5a/helm.c b/documentation/tutorial_fpl_2022/01-introduction/Exercise5a/helm.c new file mode 100644 index 000000000..4eb45d408 --- /dev/null +++ b/documentation/tutorial_fpl_2022/01-introduction/Exercise5a/helm.c @@ -0,0 +1,146 @@ +#include "common.h" + +#pragma GCC diagnostic ignored "-Wincompatible-pointer-types" + +const size_t P = 11; + +void
helm_naive( + real_t w[P], + real_t L[P][P], + real_t d[4], + real_t u[P][P][P], + real_t r[P][P][P] +) +{ + for (size_t x = 0; x < P; ++x) + for (size_t y = 0; y < P; ++y) + for (size_t z = 0; z < P; ++z) { + r[x][y][z] = d[0] * w[x] * w[y] * w[z] * u[x][y][z]; + } + + for (size_t x = 0; x < P; ++x) + for (size_t y = 0; y < P; ++y) + for (size_t z = 0; z < P; ++z) { + real_t accu = 0; + for (size_t k = 0; k < P; ++k) { + accu += L[x][k] * w[y] * w[z] * u[k][y][z]; + } + r[x][y][z] += d[1] * accu; + } + + for (size_t x = 0; x < P; ++x) + for (size_t y = 0; y < P; ++y) + for (size_t z = 0; z < P; ++z) { + real_t accu = 0; + for (size_t k = 0; k < P; ++k) { + accu += w[x] * L[y][k] * w[z] * u[x][k][z]; + } + r[x][y][z] += d[2] * accu; + } + + for (size_t x = 0; x < P; ++x) + for (size_t y = 0; y < P; ++y) + for (size_t z = 0; z < P; ++z) { + real_t accu = 0; + for (size_t k = 0; k < P; ++k) { + accu += w[x] * w[y] * L[z][k] * u[x][y][k]; + } + r[x][y][z] += d[3] * accu; + } +} + +void helm_factor_impl( + real_t w[P], + real_t L[P][P], + real_t d[4], + real_t u[P][P][P], + real_t L_hat[P][P], + real_t M_u[P][P][P], + real_t r[P][P][P] +) +{ + for (size_t x = 0; x < P; ++x) + for (size_t y = 0; y < P; ++y) + for (size_t z = 0; z < P; ++z) { + real_t M_u_xyz = w[x] * w[y] * w[z] * u[x][y][z]; + M_u[x][y][z] = M_u_xyz; + r[x][y][z] = M_u_xyz * d[0]; + } + + for (size_t i = 0; i < P; ++i) + for (size_t j = 0; j < P; ++j) { + L_hat[i][j] = L[i][j] / w[j]; + } + + for (size_t x = 0; x < P; ++x) + for (size_t y = 0; y < P; ++y) + for (size_t z = 0; z < P; ++z) { + real_t accu = 0; + for (size_t k = 0; k < P; ++k) { + accu += L_hat[x][k] * M_u[k][y][z]; + } + r[x][y][z] += d[1] * accu; + } + + for (size_t x = 0; x < P; ++x) + for (size_t y = 0; y < P; ++y) + for (size_t z = 0; z < P; ++z) { + real_t accu = 0; + for (size_t k = 0; k < P; ++k) { + accu += L_hat[y][k] * M_u[x][k][z]; + } + r[x][y][z] += d[2] * accu; + } + + for (size_t x = 0; x < P; ++x) + for (size_t y = 0; y < 
P; ++y) + for (size_t z = 0; z < P; ++z) { + real_t accu = 0; + for (size_t k = 0; k < P; ++k) { + accu += L_hat[z][k] * M_u[x][y][k]; + } + r[x][y][z] += d[3] * accu; + } +} + +/* Wrapper around helm_factor_impl that allocates and releases the L_hat and M_u scratch buffers itself. */ +void helm_factor( + real_t w[P], + real_t L[P][P], + real_t d[4], + real_t u[P][P][P], + real_t r[P][P][P] +) +{ + real_t* L_hat = make_empty(P*P); + real_t* M_u = make_empty(P*P*P); + + /* Run the kernel only if both buffers were allocated; otherwise r is left untouched. */ + if (L_hat != NULL && M_u != NULL) { + helm_factor_impl(w, L, d, u, L_hat, M_u, r); + } + + /* Release the scratch buffers so repeated calls do not leak; free(NULL) is a no-op. */ + free(M_u); + free(L_hat); +} + +int main(int argc, const char* argv[]) +{ + srandom(0xDEADBEEF); + + real_t* w = make_random(P); + real_t* L = make_random(P*P); + real_t* d = make_random(4); + real_t* u = make_random(P*P*P); + + real_t* r1 = make_empty(P*P*P); + helm_naive(w, L, d, u, r1); + + real_t* r2 = make_empty(P*P*P); + helm_factor(w, L, d, u, r2); + real_t mse2 = mse(r1, r2, P*P*P); + printf("mse2 = %G\n", mse2); + + return EXIT_SUCCESS; +} diff --git a/documentation/tutorial_fpl_2022/01-introduction/Exercise5a/test.xml b/documentation/tutorial_fpl_2022/01-introduction/Exercise5a/test.xml new file mode 100644 index 000000000..88c970fa1 --- /dev/null +++ b/documentation/tutorial_fpl_2022/01-introduction/Exercise5a/test.xml @@ -0,0 +1,10 @@ + + + + diff --git a/documentation/tutorial_fpl_2022/01-introduction/Exercise6/README.txt b/documentation/tutorial_fpl_2022/01-introduction/Exercise6/README.txt new file mode 100644 index 000000000..bdaccd406 --- /dev/null +++ b/documentation/tutorial_fpl_2022/01-introduction/Exercise6/README.txt @@ -0,0 +1,24 @@ +Simple example describing how to integrate and verify existing IP with functions written in C that receives structs passed by pointers. + +Hereafter a small description of files +-------------------------------------- + +top.c: file to be compiled/synthesized by bambu. +module_lib.h: header that declares the interfaces to existing Verilog IPs. +module_lib.xml: XML file that describes interfaces of existing Verilog IPs. +module1.v: verilog of an existing synthesizable IP.
+module1.c: C stub used to emulate the module1 IP in C. +module2.v: verilog of an existing synthesizable IP. +module2.c: C stub used to emulate the module2 IP in C. +printer1.v: verilog of an existing non-synthesizable IP. +printer1.c: C stub used to emulate the printer1 IP in C. +printer2.v: verilog of an existing non-synthesizable IP. +printer2.c: C stub used to emulate the printer2 IP in C. +main_test.c: C testbench. +constraints_STD.xml: resource constraint file passed to bambu to generate a Verilog design with just 1 my_ip module. +test.xml: XML file describing the testbench inputs. It is empty since we use the main_test.c as testbench generator. +bambu.sh: synthesis and simulation script. It requires Vivado RTL and Verilator to work properly. + +All C/H files were validated using the "gcc -c" command. +A C executable can be created with this command: "gcc -o ip_test main_test.c top.c module1.c module2.c printer1.c printer2.c" + diff --git a/documentation/tutorial_fpl_2022/01-introduction/Exercise6/bambu.sh b/documentation/tutorial_fpl_2022/01-introduction/Exercise6/bambu.sh new file mode 100755 index 000000000..2365638eb --- /dev/null +++ b/documentation/tutorial_fpl_2022/01-introduction/Exercise6/bambu.sh @@ -0,0 +1,16 @@ +#!/bin/bash +script=$(readlink -e $0) +root_dir=$(dirname $script) + +rm -rf hls +mkdir -p hls +cd hls +echo "# integrating IP simulation" +bambu $root_dir/main_test.c $root_dir/top.c --top-fname=main --top-rtldesign-name=my_ip \ + --C-no-parse=$root_dir/module1.c,$root_dir/module2.c,$root_dir/printer1.c,$root_dir/printer2.c \ + --file-input-data=$root_dir/module1.v,$root_dir/module2.v,$root_dir/printer1.v,$root_dir/printer2.v \ + $root_dir/module_lib.xml $root_dir/constraints_STD.xml \ + --experimental-setup=BAMBU -O3 \ + --no-iob --memory-allocation-policy=ALL_BRAM \ + --generate-tb=$root_dir/test.xml --simulate --simulator=VERILATOR \ + --print-dot -v4 "$@" |& tee log.txt diff --git
a/documentation/tutorial_fpl_2022/01-introduction/Exercise6/constraints_STD.xml b/documentation/tutorial_fpl_2022/01-introduction/Exercise6/constraints_STD.xml new file mode 100644 index 000000000..bd4f51938 --- /dev/null +++ b/documentation/tutorial_fpl_2022/01-introduction/Exercise6/constraints_STD.xml @@ -0,0 +1,6 @@ + + + + + + diff --git a/documentation/tutorial_fpl_2022/01-introduction/Exercise6/main_test.c b/documentation/tutorial_fpl_2022/01-introduction/Exercise6/main_test.c new file mode 100644 index 000000000..fdc441e97 --- /dev/null +++ b/documentation/tutorial_fpl_2022/01-introduction/Exercise6/main_test.c @@ -0,0 +1,22 @@ +#include "module_lib.h" +#ifdef BAMBU_PROFILING +extern void __builtin_bambu_time_start(); +extern void __builtin_bambu_time_stop(); +#endif + +int main() +{ + uint32_t param1=10; + uint32_t param2=10<<16; +#ifdef BAMBU_PROFILING + __builtin_bambu_time_start(); +#endif + my_ip(0, param1, param2); + my_ip(1, param1, param2); + my_ip(2, param1, param2); + my_ip(3, param1, param2); +#ifdef BAMBU_PROFILING + __builtin_bambu_time_stop(); +#endif + return 0; +} diff --git a/documentation/tutorial_fpl_2022/01-introduction/Exercise6/module1.c b/documentation/tutorial_fpl_2022/01-introduction/Exercise6/module1.c new file mode 100644 index 000000000..21ba71f5f --- /dev/null +++ b/documentation/tutorial_fpl_2022/01-introduction/Exercise6/module1.c @@ -0,0 +1,8 @@ +#include "module_lib.h" +void module1(uint32_t input1, uint16_t input2, module1_output_t *outputs) +{ + outputs->output1 = (uint64_t)input1 * input2; /* widen first: module1_IP computes this product into a 64-bit register */ + outputs->output2 = (uint64_t)input1 + input2; /* widen first: module1_IP zero-extends both operands to 64 bits before adding */ + outputs->output3 = (~input2) + 1; + outputs->output4 = input2 | (((uint32_t)input2)<<16); +} diff --git a/documentation/tutorial_fpl_2022/01-introduction/Exercise6/module1.v b/documentation/tutorial_fpl_2022/01-introduction/Exercise6/module1.v new file mode 100644 index 000000000..32ffa7810 --- /dev/null +++ b/documentation/tutorial_fpl_2022/01-introduction/Exercise6/module1.v @@ -0,0 +1,226 @@ +module
module1_IP + (input wire clock, + input wire reset, + + input wire start_port, + output reg done_port, + + input wire [31:0] input1, + input wire [15:0] input2, + + output reg [63:0] output1, + output reg [63:0] output2, + output reg [15:0] output3, + output reg [31:0] output4); + + reg done_port_reg; + reg [63:0] output1_reg; + reg [63:0] output2_reg; + reg [15:0] output3_reg; + reg [31:0] output4_reg; + + + //---------------------------------------------------------------- + // Simulate processing on input + //---------------------------------------------------------------- + + always @(posedge clock) begin + if (!reset) begin + done_port_reg <= 0; + output1_reg <= 0; + output2_reg <= 0; + output3_reg <= 0; + output4_reg <= 0; + end + else begin + done_port_reg <= start_port; + output1_reg <= input1 * input2; + output2_reg <= {32'd0, input1} + {48'd0, input2}; + output3_reg <= (~input2) + 1; + output4_reg <= {input2, input2}; + end + end + + + //---------------------------------------------------------------- + // Outputs, two cycle latency + //---------------------------------------------------------------- + + always @(posedge clock) begin + if (!reset) begin + done_port <= 0; + output1 <= 0; + output2 <= 0; + output3 <= 0; + output4 <= 0; + end + else begin + done_port <= done_port_reg; + output1 <= output1_reg; + output2 <= output2_reg; + output3 <= output3_reg; + output4 <= output4_reg; + end + end + +endmodule + +module module1 (clock, reset, start_port, input1, input2, outputs, done_port, Min_oe_ram, Mout_oe_ram, Min_we_ram, Mout_we_ram, Min_addr_ram, Mout_addr_ram, M_Rdata_ram, Min_Wdata_ram, Mout_Wdata_ram, Min_data_ram_size, Mout_data_ram_size, M_DataRdy); + parameter BITSIZE_outputs=1, BITSIZE_Min_addr_ram=1, BITSIZE_Mout_addr_ram=1, BITSIZE_M_Rdata_ram=8, BITSIZE_Min_Wdata_ram=8, BITSIZE_Mout_Wdata_ram=8, BITSIZE_Min_data_ram_size=1, BITSIZE_Mout_data_ram_size=1; + // IN + input clock; + input reset; + input start_port; + input [31:0] input1; + input 
[15:0] input2; + input [BITSIZE_outputs-1:0] outputs; + input Min_oe_ram; + input Min_we_ram; + input [BITSIZE_Min_addr_ram-1:0] Min_addr_ram; + input [BITSIZE_M_Rdata_ram-1:0] M_Rdata_ram; + input [BITSIZE_Min_Wdata_ram-1:0] Min_Wdata_ram; + input [BITSIZE_Min_data_ram_size-1:0] Min_data_ram_size; + input M_DataRdy; + // OUT + output done_port; + output Mout_oe_ram; + output Mout_we_ram; + output [BITSIZE_Mout_addr_ram-1:0] Mout_addr_ram; + output [BITSIZE_Mout_Wdata_ram-1:0] Mout_Wdata_ram; + output [BITSIZE_Mout_data_ram_size-1:0] Mout_data_ram_size; + + wire [63:0] output1_int; + wire [63:0] output2_int; + wire [15:0] output3_int; + wire [31:0] output4_int; + reg [63:0] output1_reg; + reg [63:0] output2_reg; + reg [15:0] output3_reg; + reg [31:0] output4_reg; + + reg done_port; + wire done_port_my_ip; + wire start_port_fsm; + reg start_port_memstore; + wire done_port_memstore; + reg [63:0] data_int; + reg [BITSIZE_outputs-1:0] addr_int; + reg [6:0] size_int; + + reg Min_oe_ram_int; + reg Min_we_ram_int; + reg [BITSIZE_Min_addr_ram-1:0] Min_addr_ram_int; + reg [BITSIZE_Min_Wdata_ram-1:0] Min_Wdata_ram_int; + reg [BITSIZE_Min_data_ram_size-1:0] Min_data_ram_size_int; + parameter [2:0] S_0 = 3'd0, + S_1 = 3'd1, + S_2 = 3'd2, + S_3 = 3'd3, + S_4 = 3'd4; + reg [2:0] _present_state=S_0, _next_state; + + module1_IP my_module1_IP (.done_port(done_port_my_ip), .clock(clock), .reset(reset), .start_port(start_port), .input1(input1), .input2(input2), .output1(output1_int), .output2(output2_int), .output3(output3_int), .output4(output4_int)); + assign start_port_fsm = done_port_my_ip; + + __builtin_memstore #(.BITSIZE_data(64), .BITSIZE_addr(BITSIZE_outputs), .BITSIZE_size(7), .BITSIZE_Min_addr_ram(BITSIZE_Min_addr_ram), .BITSIZE_Mout_addr_ram(BITSIZE_Mout_addr_ram), .BITSIZE_M_Rdata_ram(BITSIZE_M_Rdata_ram), .BITSIZE_Min_Wdata_ram(BITSIZE_Min_Wdata_ram), .BITSIZE_Mout_Wdata_ram(BITSIZE_Mout_Wdata_ram), .BITSIZE_Min_data_ram_size(BITSIZE_Min_data_ram_size), 
.BITSIZE_Mout_data_ram_size(BITSIZE_Mout_data_ram_size)) my__builtin_memstore (.clock(clock), .reset(reset), .start_port(start_port_memstore), .data(data_int), .addr(addr_int), .size(size_int), .done_port(done_port_memstore), .Min_oe_ram(Min_oe_ram_int), .Mout_oe_ram(Mout_oe_ram), .Min_we_ram(Min_we_ram_int), .Mout_we_ram(Mout_we_ram), .Min_addr_ram(Min_addr_ram_int), .Mout_addr_ram(Mout_addr_ram), .M_Rdata_ram(M_Rdata_ram), .Min_Wdata_ram(Min_Wdata_ram_int), .Mout_Wdata_ram(Mout_Wdata_ram), .Min_data_ram_size(Min_data_ram_size_int), .Mout_data_ram_size(Mout_data_ram_size), .M_DataRdy(M_DataRdy)); + + always @(posedge clock or negedge reset) + if (!reset) + begin + _present_state <= S_0; + end + else + _present_state <= _next_state; + + always @(posedge clock or negedge reset) + if (!reset) + begin + output1_reg <= 0; + output2_reg <= 0; + output3_reg <= 0; + output4_reg <= 0; + end + else if(done_port_my_ip == 1'b1) + begin + output1_reg <= output1_int; + output2_reg <= output2_int; + output3_reg <= output3_int; + output4_reg <= output4_int; + end + + always @(*) + begin + _next_state=S_0; + done_port=1'b0; + start_port_memstore=1'b0; + addr_int=0; + data_int=0; + size_int=0; + Min_oe_ram_int=Min_oe_ram; + Min_we_ram_int=Min_we_ram; + Min_data_ram_size_int=Min_data_ram_size; + Min_Wdata_ram_int=Min_Wdata_ram; + Min_addr_ram_int=Min_addr_ram; + case (_present_state) + S_0 : + if(start_port_fsm != 1'b1) + begin + _next_state=S_0; + end + else + begin + _next_state=S_1; + end + S_1 : + begin + _next_state=S_1; + start_port_memstore=1'b1; + addr_int=outputs; + data_int=output1_reg; + size_int=64; + if(done_port_memstore) + begin + _next_state=S_2; + end + end + S_2 : + begin + _next_state=S_2; + start_port_memstore=1'b1; + addr_int=outputs+64/8; + data_int=output2_reg; + size_int=64; + if(done_port_memstore) + begin + _next_state=S_3; + end + end + S_3 : + begin + _next_state=S_3; + start_port_memstore=1'b1; + addr_int=outputs+(64+64)/8; + data_int=output3_reg; + 
size_int=16; // output3_reg is 16 bits wide: store all of it + if(done_port_memstore) + begin + _next_state=S_4; + end + end + S_4 : + begin + _next_state=S_4; + start_port_memstore=1'b1; + addr_int=outputs+(64+64+32)/8; + data_int=output4_reg; + size_int=32; + if(done_port_memstore) + begin + _next_state=S_0; + done_port=1'b1; + end + end + endcase + end +endmodule + diff --git a/documentation/tutorial_fpl_2022/01-introduction/Exercise6/module2.c b/documentation/tutorial_fpl_2022/01-introduction/Exercise6/module2.c new file mode 100644 index 000000000..890e8444a --- /dev/null +++ b/documentation/tutorial_fpl_2022/01-introduction/Exercise6/module2.c @@ -0,0 +1,7 @@ +#include "module_lib.h" +void module2(uint32_t input1, module2_output_t *outputs) +{ + outputs->output1 = (uint64_t)input1 * input1; /* widen first: module2_IP computes this product into a 64-bit register */ + outputs->output2 = input1 | (((uint64_t)input1)<<32); + outputs->output3 = (uint16_t)input1; +} diff --git a/documentation/tutorial_fpl_2022/01-introduction/Exercise6/module2.v b/documentation/tutorial_fpl_2022/01-introduction/Exercise6/module2.v new file mode 100644 index 000000000..4d42e2395 --- /dev/null +++ b/documentation/tutorial_fpl_2022/01-introduction/Exercise6/module2.v @@ -0,0 +1,201 @@ +module module2_IP + (input wire clock, + input wire reset, + + input wire start_port, + output reg done_port, + + input wire [31:0] input1, + + output reg [63:0] output1, + output reg [63:0] output2, + output reg [15:0] output3); + + reg done_port_reg; + reg [63:0] output1_reg; + reg [63:0] output2_reg; + reg [15:0] output3_reg; + + + //---------------------------------------------------------------- + // Simulate processing on input + //---------------------------------------------------------------- + + always @(posedge clock) begin + if (!reset) begin + done_port_reg <= 0; + output1_reg <= 0; + output2_reg <= 0; + output3_reg <= 0; + end + else begin + done_port_reg <= start_port; + output1_reg <= input1 * input1; + output2_reg <= {input1, input1}; + output3_reg <= input1[15:0]; + end + end + + +
//---------------------------------------------------------------- + // Outputs, two cycle latency + //---------------------------------------------------------------- + + always @(posedge clock) begin + if (!reset) begin + done_port <= 0; + output1 <= 0; + output2 <= 0; + output3 <= 0; + end + else begin + done_port <= done_port_reg; + output1 <= output1_reg; + output2 <= output2_reg; + output3 <= output3_reg; + end + end + +endmodule + +module module2 (clock, reset, start_port, input1, outputs, done_port, Min_oe_ram, Mout_oe_ram, Min_we_ram, Mout_we_ram, Min_addr_ram, Mout_addr_ram, M_Rdata_ram, Min_Wdata_ram, Mout_Wdata_ram, Min_data_ram_size, Mout_data_ram_size, M_DataRdy); + parameter BITSIZE_outputs=1, BITSIZE_Min_addr_ram=1, BITSIZE_Mout_addr_ram=1, BITSIZE_M_Rdata_ram=8, BITSIZE_Min_Wdata_ram=8, BITSIZE_Mout_Wdata_ram=8, BITSIZE_Min_data_ram_size=1, BITSIZE_Mout_data_ram_size=1; + // IN + input clock; + input reset; + input start_port; + input [31:0] input1; + input [BITSIZE_outputs-1:0] outputs; + input Min_oe_ram; + input Min_we_ram; + input [BITSIZE_Min_addr_ram-1:0] Min_addr_ram; + input [BITSIZE_M_Rdata_ram-1:0] M_Rdata_ram; + input [BITSIZE_Min_Wdata_ram-1:0] Min_Wdata_ram; + input [BITSIZE_Min_data_ram_size-1:0] Min_data_ram_size; + input M_DataRdy; + // OUT + output done_port; + output Mout_oe_ram; + output Mout_we_ram; + output [BITSIZE_Mout_addr_ram-1:0] Mout_addr_ram; + output [BITSIZE_Mout_Wdata_ram-1:0] Mout_Wdata_ram; + output [BITSIZE_Mout_data_ram_size-1:0] Mout_data_ram_size; + + wire [63:0] output1_int; + wire [63:0] output2_int; + wire [15:0] output3_int; + reg [63:0] output1_reg; + reg [63:0] output2_reg; + reg [15:0] output3_reg; + + reg done_port; + wire done_port_my_ip; + wire start_port_fsm; + reg start_port_memstore; + wire done_port_memstore; + reg [63:0] data_int; + reg [BITSIZE_outputs-1:0] addr_int; + reg [6:0] size_int; + + reg Min_oe_ram_int; + reg Min_we_ram_int; + reg [BITSIZE_Min_addr_ram-1:0] Min_addr_ram_int; + reg 
[BITSIZE_Min_Wdata_ram-1:0] Min_Wdata_ram_int; + reg [BITSIZE_Min_data_ram_size-1:0] Min_data_ram_size_int; + parameter [1:0] S_0 = 2'd0, + S_1 = 2'd1, + S_2 = 2'd2, + S_3 = 2'd3; + reg [1:0] _present_state=S_0, _next_state; + + module2_IP my_module2_IP (.done_port(done_port_my_ip), .clock(clock), .reset(reset), .start_port(start_port), .input1(input1), .output1(output1_int), .output2(output2_int), .output3(output3_int)); + assign start_port_fsm = done_port_my_ip; + + __builtin_memstore #(.BITSIZE_data(64), .BITSIZE_addr(BITSIZE_outputs), .BITSIZE_size(7), .BITSIZE_Min_addr_ram(BITSIZE_Min_addr_ram), .BITSIZE_Mout_addr_ram(BITSIZE_Mout_addr_ram), .BITSIZE_M_Rdata_ram(BITSIZE_M_Rdata_ram), .BITSIZE_Min_Wdata_ram(BITSIZE_Min_Wdata_ram), .BITSIZE_Mout_Wdata_ram(BITSIZE_Mout_Wdata_ram), .BITSIZE_Min_data_ram_size(BITSIZE_Min_data_ram_size), .BITSIZE_Mout_data_ram_size(BITSIZE_Mout_data_ram_size)) my__builtin_memstore (.clock(clock), .reset(reset), .start_port(start_port_memstore), .data(data_int), .addr(addr_int), .size(size_int), .done_port(done_port_memstore), .Min_oe_ram(Min_oe_ram_int), .Mout_oe_ram(Mout_oe_ram), .Min_we_ram(Min_we_ram_int), .Mout_we_ram(Mout_we_ram), .Min_addr_ram(Min_addr_ram_int), .Mout_addr_ram(Mout_addr_ram), .M_Rdata_ram(M_Rdata_ram), .Min_Wdata_ram(Min_Wdata_ram_int), .Mout_Wdata_ram(Mout_Wdata_ram), .Min_data_ram_size(Min_data_ram_size_int), .Mout_data_ram_size(Mout_data_ram_size), .M_DataRdy(M_DataRdy)); + + always @(posedge clock or negedge reset) + if (!reset) + begin + _present_state <= S_0; + end + else + _present_state <= _next_state; + + always @(posedge clock or negedge reset) + if (!reset) + begin + output1_reg <= 0; + output2_reg <= 0; + output3_reg <= 0; + end + else if(done_port_my_ip == 1'b1) + begin + output1_reg <= output1_int; + output2_reg <= output2_int; + output3_reg <= output3_int; + end + + always @(*) + begin + _next_state=S_0; + done_port=1'b0; + start_port_memstore=1'b0; + addr_int=0; + data_int=0; + size_int=0; + 
Min_oe_ram_int=Min_oe_ram; + Min_we_ram_int=Min_we_ram; + Min_data_ram_size_int=Min_data_ram_size; + Min_Wdata_ram_int=Min_Wdata_ram; + Min_addr_ram_int=Min_addr_ram; + case (_present_state) + S_0 : + if(start_port_fsm != 1'b1) + begin + _next_state=S_0; + end + else + begin + _next_state=S_1; + end + S_1 : + begin + _next_state=S_1; + start_port_memstore=1'b1; + addr_int=outputs; + data_int=output1_reg; + size_int=64; + if(done_port_memstore) + begin + _next_state=S_2; + end + end + S_2 : + begin + _next_state=S_2; + start_port_memstore=1'b1; + addr_int=outputs+64/8; + data_int=output2_reg; + size_int=64; + if(done_port_memstore) + begin + _next_state=S_3; + end + end + S_3 : + begin + _next_state=S_3; + start_port_memstore=1'b1; + addr_int=outputs+(64+64)/8; + size_int=16; + data_int=output3_reg; + if(done_port_memstore) + begin + _next_state=S_0; + done_port=1'b1; + end + end + endcase + end +endmodule + diff --git a/documentation/tutorial_fpl_2022/01-introduction/Exercise6/module_lib.h b/documentation/tutorial_fpl_2022/01-introduction/Exercise6/module_lib.h new file mode 100644 index 000000000..6bff2ed40 --- /dev/null +++ b/documentation/tutorial_fpl_2022/01-introduction/Exercise6/module_lib.h @@ -0,0 +1,32 @@ +#ifndef MODULE_LIB_H +#define MODULE_LIB_H + +#include + +typedef struct { + uint64_t output1; + uint64_t output2; + uint16_t output3; + uint32_t output4; +} module1_output_t; + +extern void module1(uint32_t input1, uint16_t input2, module1_output_t *outputs); + + + +typedef struct { + uint64_t output1; + uint64_t output2; + uint16_t output3; +} module2_output_t; + +extern void module2(uint32_t input1, module2_output_t *outputs); + + +extern void printer1(uint64_t value1, uint64_t value2, uint16_t value3, uint32_t value4); + +extern void printer2(uint64_t value1, uint64_t value2, uint16_t value3); + +extern void my_ip(uint8_t command, uint32_t param1, uint32_t param2); + +#endif diff --git 
a/documentation/tutorial_fpl_2022/01-introduction/Exercise6/module_lib.xml b/documentation/tutorial_fpl_2022/01-introduction/Exercise6/module_lib.xml new file mode 100644 index 000000000..edd567d75 --- /dev/null +++ b/documentation/tutorial_fpl_2022/01-introduction/Exercise6/module_lib.xml @@ -0,0 +1,262 @@ + + + + module_lib + + module1 + + MEM_ACC_11,MEM_ACC_N1 + + + Module 1 IP + foo + foo + foo + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + module2 + MEM_ACC_11,MEM_ACC_N1 + + + + Module 2 IP + foo + foo + foo + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + printer2 + + + + Printer 2 IP + foo + foo + foo + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + printer1 + + + + Printer 1 IP + foo + foo + foo + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/documentation/tutorial_fpl_2022/01-introduction/Exercise6/printer1.c b/documentation/tutorial_fpl_2022/01-introduction/Exercise6/printer1.c new file mode 100644 index 000000000..49fd269f7 --- /dev/null +++ b/documentation/tutorial_fpl_2022/01-introduction/Exercise6/printer1.c @@ -0,0 +1,6 @@ +#include "module_lib.h" +#include +void printer1(uint64_t value1, uint64_t value2, uint16_t value3, uint32_t value4) +{ + printf("printer1: %llx %llx %x %x\n", value1, value2, value3, value4); +} diff --git a/documentation/tutorial_fpl_2022/01-introduction/Exercise6/printer1.v b/documentation/tutorial_fpl_2022/01-introduction/Exercise6/printer1.v new file mode 100644 index 000000000..b572577e6 --- /dev/null +++ b/documentation/tutorial_fpl_2022/01-introduction/Exercise6/printer1.v @@ -0,0 +1,45 @@ +module printer1 + (input wire clock, + input wire reset, + + input wire start_port, + output reg done_port, + + input wire [63:0] value1, + input wire [63:0] value2, + 
input wire [15:0] value3, + input wire [31:0] value4); + + reg done_port_reg; + + //---------------------------------------------------------------- + // Simulate processing on input + //---------------------------------------------------------------- + + always @(posedge clock) begin + if (!reset) begin + done_port_reg <= 0; + end + else begin + done_port_reg <= start_port; + end + end + + + //---------------------------------------------------------------- + // Outputs, two cycle latency + //---------------------------------------------------------------- + + always @(posedge clock) begin + if (!reset) begin + done_port <= 0; + end + else begin + done_port <= done_port_reg; + if (done_port_reg) begin + $display("printer1: %h %h %h %h", value1, value2, value3, value4); + end + end + end + +endmodule diff --git a/documentation/tutorial_fpl_2022/01-introduction/Exercise6/printer2.c b/documentation/tutorial_fpl_2022/01-introduction/Exercise6/printer2.c new file mode 100644 index 000000000..a73c63f62 --- /dev/null +++ b/documentation/tutorial_fpl_2022/01-introduction/Exercise6/printer2.c @@ -0,0 +1,6 @@ +#include "module_lib.h" +#include +void printer2(uint64_t value1, uint64_t value2, uint16_t value3) +{ + printf("printer2: %llx %llx %x\n", value1, value2, value3); +} diff --git a/documentation/tutorial_fpl_2022/01-introduction/Exercise6/printer2.v b/documentation/tutorial_fpl_2022/01-introduction/Exercise6/printer2.v new file mode 100644 index 000000000..15fc5bdd0 --- /dev/null +++ b/documentation/tutorial_fpl_2022/01-introduction/Exercise6/printer2.v @@ -0,0 +1,44 @@ +module printer2 + (input wire clock, + input wire reset, + + input wire start_port, + output reg done_port, + + input wire [63:0] value1, + input wire [63:0] value2, + input wire [15:0] value3); + + reg done_port_reg; + + //---------------------------------------------------------------- + // Simulate processing on input + //---------------------------------------------------------------- + + always 
@(posedge clock) begin + if (!reset) begin + done_port_reg <= 0; + end + else begin + done_port_reg <= start_port; + end + end + + + //---------------------------------------------------------------- + // Outputs, two cycle latency + //---------------------------------------------------------------- + + always @(posedge clock) begin + if (!reset) begin + done_port <= 0; + end + else begin + done_port <= done_port_reg; + if (done_port_reg) begin + $display("printer2: %h %h %h", value1, value2, value3); + end + end + end + +endmodule diff --git a/documentation/tutorial_fpl_2022/01-introduction/Exercise6/test.xml b/documentation/tutorial_fpl_2022/01-introduction/Exercise6/test.xml new file mode 100644 index 000000000..6a8f8acff --- /dev/null +++ b/documentation/tutorial_fpl_2022/01-introduction/Exercise6/test.xml @@ -0,0 +1,4 @@ + + + + diff --git a/documentation/tutorial_fpl_2022/01-introduction/Exercise6/top.c b/documentation/tutorial_fpl_2022/01-introduction/Exercise6/top.c new file mode 100644 index 000000000..46fa1c11a --- /dev/null +++ b/documentation/tutorial_fpl_2022/01-introduction/Exercise6/top.c @@ -0,0 +1,23 @@ +#include "module_lib.h" + +void my_ip(uint8_t command, uint32_t param1, uint32_t param2) { + static module1_output_t module1_output; + static module2_output_t module2_output; + + switch(command) { + case 0: + module1(param1, param2 >> 16, &module1_output); + break; + case 1: + module2(param1, &module2_output); + break; + case 2: + printer1(module1_output.output1, module1_output.output2, module1_output.output3, module1_output.output4); + break; + case 3: + printer2(module2_output.output1, module2_output.output2, module2_output.output3); + break; + default: + break; + } +} diff --git a/documentation/tutorial_fpl_2022/01-introduction/Exercise7/aggregate.h b/documentation/tutorial_fpl_2022/01-introduction/Exercise7/aggregate.h new file mode 100644 index 000000000..2d0e9085f --- /dev/null +++ 
b/documentation/tutorial_fpl_2022/01-introduction/Exercise7/aggregate.h @@ -0,0 +1,16 @@ +#ifndef AGGREGATE_H +#define AGGREGATE_H + +struct aggregate +{ + float a0; + float a1; + float a2; + float a3; + float a4; + float a5; + float a6; + float a7; +}; + +#endif /* AGGREGATE_H */ diff --git a/documentation/tutorial_fpl_2022/01-introduction/Exercise7/bambu.sh b/documentation/tutorial_fpl_2022/01-introduction/Exercise7/bambu.sh new file mode 100755 index 000000000..7219597b6 --- /dev/null +++ b/documentation/tutorial_fpl_2022/01-introduction/Exercise7/bambu.sh @@ -0,0 +1,12 @@ +#!/bin/bash +script=$(readlink -e $0) +root_dir=$(dirname $script) + +rm -rf hls +mkdir hls +cd hls +echo "#simulation of qsort" +bambu $root_dir/test.c $root_dir/less.c $root_dir/qsort.c --top-fname=test \ + -Os --no-iob \ + --generate-tb=$root_dir/test.xml --simulate \ + -v2 --print-dot --pretty-print=a.c "$@" |& tee log.txt diff --git a/documentation/tutorial_fpl_2022/01-introduction/Exercise7/less.c b/documentation/tutorial_fpl_2022/01-introduction/Exercise7/less.c new file mode 100644 index 000000000..6b5a20dfd --- /dev/null +++ b/documentation/tutorial_fpl_2022/01-introduction/Exercise7/less.c @@ -0,0 +1,31 @@ +#include "aggregate.h" + + +int less (const void * a, const void * b, void * notUsed) +{ + struct aggregate * aPtr = (struct aggregate *)a; + struct aggregate * bPtr = (struct aggregate *)b; + + float aSum = aPtr->a0 + + aPtr->a1 + + aPtr->a2 + + aPtr->a3 + + aPtr->a4 + + aPtr->a5 + + aPtr->a6 + + aPtr->a7; + float bSum = bPtr->a0 + + bPtr->a1 + + bPtr->a2 + + bPtr->a3 + + bPtr->a4 + + bPtr->a5 + + bPtr->a6 + + bPtr->a7; + int equal = (bSum - aSum) == 0; + if (equal) return 0; + + int lt = (aSum - bSum) < 0; + return lt ? 
-1 : 1; +} + diff --git a/documentation/tutorial_fpl_2022/01-introduction/Exercise7/qsort.c b/documentation/tutorial_fpl_2022/01-introduction/Exercise7/qsort.c new file mode 100644 index 000000000..64e5a1a81 --- /dev/null +++ b/documentation/tutorial_fpl_2022/01-introduction/Exercise7/qsort.c @@ -0,0 +1,247 @@ +/* Copyright (C) 1991-2014 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Written by Douglas C. Schmidt (schmidt@ics.uci.edu). + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +/* If you consider tuning this algorithm, you should consult first: + Engineering a sort function; Jon Bentley and M. Douglas McIlroy; + Software - Practice and Experience; Vol. 23 (11), 1249-1265, 1993. */ + +#include +#include + +/* Byte-wise swap two items of size SIZE. */ +#define SWAP(a, b, size) \ + do \ + { \ + size_t __size = (size); \ + char *__a = (a), *__b = (b); \ + do \ + { \ + char __tmp = *__a; \ + *__a++ = *__b; \ + *__b++ = __tmp; \ + } while (--__size > 0); \ + } while (0) + +/* Discontinue quicksort algorithm when partition gets below this size. + This particular magic number was chosen to work best on a Sun 4/260. */ +#define MAX_THRESH 4 + +/* Stack node declarations used to store unfulfilled partition obligations. 
*/ +typedef struct + { + char *lo; + char *hi; + } stack_node; + +/* The next 4 #defines implement a very fast in-line stack abstraction. */ +/* The stack needs log (total_elements) entries (we could even subtract + log(MAX_THRESH)). Since total_elements has type size_t, we get as + upper bound for log (total_elements): + bits per byte (CHAR_BIT) * sizeof(size_t). */ +#define STACK_SIZE (CHAR_BIT * sizeof(size_t)) +#define PUSH(low, high) ((void) ((top->lo = (low)), (top->hi = (high)), ++top)) +#define POP(low, high) ((void) (--top, (low = top->lo), (high = top->hi))) +#define STACK_NOT_EMPTY (stack < top) + + +/* Order size using quicksort. This implementation incorporates + four optimizations discussed in Sedgewick: + + 1. Non-recursive, using an explicit stack of pointer that store the + next array partition to sort. To save time, this maximum amount + of space required to store an array of SIZE_MAX is allocated on the + stack. Assuming a 32-bit (64 bit) integer for size_t, this needs + only 32 * sizeof(stack_node) == 256 bytes (for 64 bit: 1024 bytes). + Pretty cheap, actually. + + 2. Chose the pivot element using a median-of-three decision tree. + This reduces the probability of selecting a bad pivot value and + eliminates certain extraneous comparisons. + + 3. Only quicksorts TOTAL_ELEMS / MAX_THRESH partitions, leaving + insertion sort to order the MAX_THRESH items within each partition. + This is a big win, since insertion sort is faster for small, mostly + sorted array segments. + + 4. The larger of the two sub-partitions is always pushed onto the + stack first, with the algorithm then concentrating on the + smaller partition. This *guarantees* no more than log (total_elems) + stack size is needed (actually O(1) in this case)! 
*/ + +void +_quicksort (void *const pbase, size_t total_elems, size_t size, + int (*cmp)(const void *, const void *, void *), void *arg) +{ + char *base_ptr = (char *) pbase; + + const size_t max_thresh = MAX_THRESH * size; + + if (total_elems == 0) + /* Avoid lossage with unsigned arithmetic below. */ + return; + + if (total_elems > MAX_THRESH) + { + char *lo = base_ptr; + char *hi = &lo[size * (total_elems - 1)]; + stack_node stack[STACK_SIZE]; + stack_node *top = stack; + + PUSH (NULL, NULL); + + while (STACK_NOT_EMPTY) + { + char *left_ptr; + char *right_ptr; + + /* Select median value from among LO, MID, and HI. Rearrange + LO and HI so the three values are sorted. This lowers the + probability of picking a pathological pivot value and + skips a comparison for both the LEFT_PTR and RIGHT_PTR in + the while loops. */ + + char *mid = lo + size * ((hi - lo) / size >> 1); + + if ((*cmp) ((void *) mid, (void *) lo, arg) < 0) + SWAP (mid, lo, size); + if ((*cmp) ((void *) hi, (void *) mid, arg) < 0) + SWAP (mid, hi, size); + else + goto jump_over; + if ((*cmp) ((void *) mid, (void *) lo, arg) < 0) + SWAP (mid, lo, size); + jump_over:; + + left_ptr = lo + size; + right_ptr = hi - size; + + /* Here's the famous ``collapse the walls'' section of quicksort. + Gotta like those tight inner loops! They are the main reason + that this algorithm runs much faster than others. */ + do + { + while ((*cmp) ((void *) left_ptr, (void *) mid, arg) < 0) + left_ptr += size; + + while ((*cmp) ((void *) mid, (void *) right_ptr, arg) < 0) + right_ptr -= size; + + if (left_ptr < right_ptr) + { + SWAP (left_ptr, right_ptr, size); + if (mid == left_ptr) + mid = right_ptr; + else if (mid == right_ptr) + mid = left_ptr; + left_ptr += size; + right_ptr -= size; + } + else if (left_ptr == right_ptr) + { + left_ptr += size; + right_ptr -= size; + break; + } + } + while (left_ptr <= right_ptr); + + /* Set up pointers for next iteration. 
First determine whether + left and right partitions are below the threshold size. If so, + ignore one or both. Otherwise, push the larger partition's + bounds on the stack and continue sorting the smaller one. */ + + if ((size_t) (right_ptr - lo) <= max_thresh) + { + if ((size_t) (hi - left_ptr) <= max_thresh) + /* Ignore both small partitions. */ + POP (lo, hi); + else + /* Ignore small left partition. */ + lo = left_ptr; + } + else if ((size_t) (hi - left_ptr) <= max_thresh) + /* Ignore small right partition. */ + hi = right_ptr; + else if ((right_ptr - lo) > (hi - left_ptr)) + { + /* Push larger left partition indices. */ + PUSH (lo, right_ptr); + lo = left_ptr; + } + else + { + /* Push larger right partition indices. */ + PUSH (left_ptr, hi); + hi = right_ptr; + } + } + } + + /* Once the BASE_PTR array is partially sorted by quicksort the rest + is completely sorted using insertion sort, since this is efficient + for partitions below MAX_THRESH size. BASE_PTR points to the beginning + of the array to sort, and END_PTR points at the very last element in + the array (*not* one beyond it!). */ + +#define min(x, y) ((x) < (y) ? (x) : (y)) + + { + char *const end_ptr = &base_ptr[size * (total_elems - 1)]; + char *tmp_ptr = base_ptr; + char *thresh = min(end_ptr, base_ptr + max_thresh); + char *run_ptr; + + /* Find smallest element in first threshold and place it at the + array's beginning. This is the smallest array element, + and the operation speeds up insertion sort's inner loop. */ + + for (run_ptr = tmp_ptr + size; run_ptr <= thresh; run_ptr += size) + if ((*cmp) ((void *) run_ptr, (void *) tmp_ptr, arg) < 0) + tmp_ptr = run_ptr; + + if (tmp_ptr != base_ptr) + SWAP (tmp_ptr, base_ptr, size); + + /* Insertion sort, running from left-hand-side up to right-hand-side. 
*/ + + run_ptr = base_ptr + size; + while ((run_ptr += size) <= end_ptr) + { + tmp_ptr = run_ptr - size; + while ((*cmp) ((void *) run_ptr, (void *) tmp_ptr, arg) < 0) + tmp_ptr -= size; + + tmp_ptr += size; + if (tmp_ptr != run_ptr) + { + char *trav; + + trav = run_ptr + size; + while (--trav >= run_ptr) + { + char c = *trav; + char *hi, *lo; + + for (hi = lo = trav; (lo -= size) >= tmp_ptr; hi = lo) + *hi = *lo; + *hi = c; + } + } + } + } +} diff --git a/documentation/tutorial_fpl_2022/01-introduction/Exercise7/test.c b/documentation/tutorial_fpl_2022/01-introduction/Exercise7/test.c new file mode 100644 index 000000000..f968a2897 --- /dev/null +++ b/documentation/tutorial_fpl_2022/01-introduction/Exercise7/test.c @@ -0,0 +1,17 @@ +typedef unsigned int size_t; + +typedef int (*__compar_d_fn_t)(void *, void *, void *); + +#include "aggregate.h" + +//#include "qsort.c" +void +_quicksort (void *const pbase, size_t total_elems, size_t size, + int (*cmp)(const void *, const void *, void *), void *arg); +//#include "less.c" +int less (void * a, void * b, void * notUsed); + +void test(float * const pbase, size_t total_elems) +{ + _quicksort(pbase, (sizeof(float) * total_elems) / sizeof(struct aggregate), sizeof(struct aggregate), less , (void *)0); +} diff --git a/documentation/tutorial_fpl_2022/01-introduction/Exercise7/test.xml b/documentation/tutorial_fpl_2022/01-introduction/Exercise7/test.xml new file mode 100644 index 000000000..a98af4c69 --- /dev/null +++ b/documentation/tutorial_fpl_2022/01-introduction/Exercise7/test.xml @@ -0,0 +1,4 @@ + + + + diff --git a/documentation/tutorial_fpl_2022/01-introduction/Exercise8/Keccak.c b/documentation/tutorial_fpl_2022/01-introduction/Exercise8/Keccak.c new file mode 100644 index 000000000..cbc8cc651 --- /dev/null +++ b/documentation/tutorial_fpl_2022/01-introduction/Exercise8/Keccak.c @@ -0,0 +1,142 @@ +/* + * The Keccak sponge function, designed by Guido Bertoni, Joan Daemen, + * Michaël Peeters and Gilles Van Assche. 
For more information, feedback or + * questions, please refer to our website: http://keccak.noekeon.org/ + * Implementation by the designers, + * hereby denoted as "the implementer". + * To the extent possible under law, the implementer has waived all copyright + * and related or neighboring rights to the source code in this file. + * http://creativecommons.org/publicdomain/zero/1.0/ + * + */ +typedef unsigned char UINT8; +typedef unsigned long long int UINT64; +#define nrRounds 24 + +#define GET_KRC_VAL(index) (KeccakRoundConstants[index]) + +static UINT64 KeccakRoundConstants[nrRounds] = { + 0x0000000000000001ULL, + 0x0000000000008082ULL, + 0x800000000000808aULL, + 0x8000000080008000ULL, + 0x000000000000808bULL, + 0x0000000080000001ULL, + 0x8000000080008081ULL, + 0x8000000000008009ULL, + 0x000000000000008aULL, + 0x0000000000000088ULL, + 0x0000000080008009ULL, + 0x000000008000000aULL, + 0x000000008000808bULL, + 0x800000000000008bULL, + 0x8000000000008089ULL, + 0x8000000000008003ULL, + 0x8000000000008002ULL, + 0x8000000000000080ULL, + 0x000000000000800aULL, + 0x800000008000000aULL, + 0x8000000080008081ULL, + 0x8000000000008080ULL, + 0x0000000080000001ULL, + 0x8000000080008008ULL +}; + +#define nrLanes 25 +static unsigned char KeccakRhoOffsets[nrLanes] = { + 0, + 1, + 62, + 28, + 27, + 36, + 44, + 6, + 55, + 20, + 3, + 10, + 43, + 25, + 39, + 41, + 45, + 15, + 21, + 8, + 18, + 2, + 61, + 56, + 14 +}; + +#define index(x, y) (((x)%5)+5*((y)%5)) +#define ROL64(a, offset) ((offset != 0) ? 
((((UINT64)a) << offset) ^ (((UINT64)a) >> (64-offset))) : a) + +void theta(UINT64 *A) +{ + unsigned int x, y; + UINT64 C[5], D[5]; + + for(x=0; x<5; x++) { + C[x] = 0; + for(y=0; y<5; y++) + C[x] ^= A[index(x, y)]; + } + for(x=0; x<5; x++) + D[x] = ROL64(C[(x+1)%5], 1) ^ C[(x+4)%5]; + for(x=0; x<5; x++) + for(y=0; y<5; y++) + A[index(x, y)] ^= D[x]; +} + +void rho(UINT64 *A) +{ + unsigned int x, y; + + for(x=0; x<5; x++) for(y=0; y<5; y++) + A[index(x, y)] = ROL64(A[index(x, y)], KeccakRhoOffsets[index(x, y)]); +} + +void pi(UINT64 *A) +{ + unsigned int x, y; + UINT64 tempA[25]; + + for(x=0; x<5; x++) for(y=0; y<5; y++) + tempA[index(x, y)] = A[index(x, y)]; + for(x=0; x<5; x++) for(y=0; y<5; y++) + A[index(0*x+1*y, 2*x+3*y)] = tempA[index(x, y)]; +} + +void chi(UINT64 *A) +{ + unsigned int x, y; + UINT64 C[5]; + + for(y=0; y<5; y++) { + for(x=0; x<5; x++) + C[x] = A[index(x, y)] ^ ((~A[index(x+1, y)]) & A[index(x+2, y)]); + for(x=0; x<5; x++) + A[index(x, y)] = C[x]; + } +} + +void iota(UINT64 *A, unsigned int indexRound) +{ + A[index(0, 0)] ^= GET_KRC_VAL(indexRound); +} + + +void kekka_coproc(UINT64 A[25]) +{ + unsigned int i; + for(i=0;i + + + diff --git a/documentation/tutorial_fpl_2022/01-introduction/Exercise9/bambu.sh b/documentation/tutorial_fpl_2022/01-introduction/Exercise9/bambu.sh new file mode 100755 index 000000000..a67cdc80f --- /dev/null +++ b/documentation/tutorial_fpl_2022/01-introduction/Exercise9/bambu.sh @@ -0,0 +1,13 @@ +#!/bin/bash +script=$(readlink -e $0) +root_dir=$(dirname $script) + +rm -rf search +mkdir -p search +cd search +echo "#simulation of search function" +bambu $root_dir/tree.c --top-fname=main --top-rtldesign-name=search \ + -DNDEBUG -DBAMBU_PROFILING \ + -O3 --experimental-setup=BAMBU \ + --generate-tb=$root_dir/test_search.xml --simulator=ICARUS --simulate \ + --print-dot -v2 "$@" |& tee log.txt \ No newline at end of file diff --git a/documentation/tutorial_fpl_2022/01-introduction/Exercise9/tree.c 
b/documentation/tutorial_fpl_2022/01-introduction/Exercise9/tree.c new file mode 100644 index 000000000..088ea68c2 --- /dev/null +++ b/documentation/tutorial_fpl_2022/01-introduction/Exercise9/tree.c @@ -0,0 +1,323 @@ +#include +#include +#include + +#define MAX_NUMBER_OF_NODES 255 +extern void __builtin_bambu_time_start(); +extern void __builtin_bambu_time_stop(); + +/* stack data structure */ +struct stack +{ + void *data; + struct stack *next; +}; + +typedef struct stack node_stack; + +/* Auxiliary memory stack allocation utilities */ +static node_stack StaticPoolStack[MAX_NUMBER_OF_NODES]; +static node_stack* head_stack_free_list; + +void push_stack_free_list(node_stack ** head, node_stack * new_node) +{ + new_node->data = 0; + new_node->next = *head; + *head = new_node; +} +node_stack* pop_stack_free_list(node_stack ** head) +{ + node_stack* retval = 0; + node_stack * next_node = NULL; + + if (*head == NULL) + return NULL; + + next_node = (*head)->next; + retval = *head; + *head = next_node; + + return retval; +} +void init_stack_free_list() +{ + int index; + for(index=0; index < MAX_NUMBER_OF_NODES; ++index) + push_stack_free_list(&head_stack_free_list, &StaticPoolStack[index]); +} + +/* Stack related functions */ +void push(node_stack** head, void *t) +{ + node_stack* temp = pop_stack_free_list(&head_stack_free_list); + assert(temp); + temp->data = t; + temp->next = (*head); + *head= temp; +} +_Bool isEmpty(node_stack *head) +{ + return (head == NULL)? 
1 : 0; +} +void *pop(node_stack** head) +{ + void *res; + node_stack *top; + + assert(!isEmpty(*head)); + top = *head; + res = top->data; + *head = top->next; + push_stack_free_list(&head_stack_free_list, top); + return res; +} + +void* top(node_stack* head) +{ + return head->data; +} + +/* binary tree data structure */ +struct bin_tree { + int data; + struct bin_tree * right, * left; +}; +typedef struct bin_tree node_tree; + +/* Auxiliary memory tree allocation utilities */ +static node_tree StaticPoolTree[MAX_NUMBER_OF_NODES]; +static node_tree* head_tree_free_list; + +void push_tree_free_list(node_tree ** head, node_tree * new_node) +{ + new_node->data = 0; + new_node->left = *head; + new_node->right = 0; + *head = new_node; +} +node_tree* pop_tree_free_list(node_tree ** head) +{ + node_tree* retval = 0; + node_tree * next_node = NULL; + + if (*head == NULL) + return NULL; + + next_node = (*head)->left; + retval = *head; + *head = next_node; + + return retval; +} +void init_tree_free_list() +{ + int index; + for(index=0; index < MAX_NUMBER_OF_NODES; ++index) + push_tree_free_list(&head_tree_free_list, &StaticPoolTree[index]); +} + + +/* binary tree functions */ +void insert(node_tree ** tree, int val) +{ + node_tree *temp = NULL; + if(!(*tree)) + { + temp = pop_tree_free_list(&head_tree_free_list); + assert(temp); + temp->left = temp->right = NULL; + temp->data = val; + *tree = temp; + return; + } + if(val < (*tree)->data) + { + insert(&(*tree)->left, val); + } + else if(val > (*tree)->data) + { + insert(&(*tree)->right, val); + } +} + +void print_preorder(node_tree * root) +{ + if (root) + { + node_tree *current; + node_stack *s = NULL; + push(&s, root); + + while (!isEmpty(s)) + { + current = pop(&s); + printf ("%d\n", current->data); + + if (current->right) + push(&s, current->right); + if (current->left) + push(&s, current->left); + } + } +} + +/* Iterative function for inorder binary tree print */ +void print_inorder(node_tree *root) +{ + node_tree *current 
= root; + node_stack *s = NULL; + _Bool done = 0; + + while (!done) + { + if(current != NULL) + { + push(&s, current); + current = current->left; + } + else + { + if (!isEmpty(s)) + { + current = pop(&s); + printf("%d\n", current->data); + current = current->right; + } + else + done = 1; + } + } +} + + +void print_postorder(node_tree * root) +{ + if (root) + { + node_tree *prev=NULL; + node_stack *s = NULL; + push(&s, root); + + while (!isEmpty(s)) { + node_tree *curr = top(s); + if (!prev || prev->left == curr || prev->right == curr) { + if (curr->left) + push(&s, curr->left); + else if (curr->right) + push(&s, curr->right); + } else if (curr->left == prev) { + if (curr->right) + push(&s, curr->right); + } else { + printf("%d\n", curr->data); + pop(&s); + } + prev = curr; + } + } +} + +void deltree(node_tree * root) +{ + if (root) + { + node_tree *prev=NULL; + node_stack *s = NULL; + push(&s, root); + + while (!isEmpty(s)) { + node_tree *curr = top(s); + if (!prev || prev->left == curr || prev->right == curr) { + if (curr->left) + push(&s, curr->left); + else if (curr->right) + push(&s, curr->right); + } else if (curr->left == prev) { + if (curr->right) + push(&s, curr->right); + } else { + push_tree_free_list(&head_tree_free_list, curr); + pop(&s); + } + prev = curr; + } + } +} + +node_tree* __attribute__ ((noinline)) search(node_tree * tree, int val) +{ + if(tree == NULL|| tree->data == val) + return tree; + + if (tree->data < val) + return search(tree->right, val); + else + return search(tree->left, val); +} + +int main() +{ + node_tree *root; + node_tree *tmp; + //int i; + init_tree_free_list(); + init_stack_free_list(); + + root = NULL; + /* Inserting nodes into tree */ + insert(&root, 9); + insert(&root, 4); + insert(&root, 15); + insert(&root, 6); + insert(&root, 12); + insert(&root, 17); + insert(&root, 2); + + /* Printing nodes of tree */ + printf("Pre Order Display\n"); + print_preorder(root); + + printf("In Order Display\n"); + print_inorder(root); + + 
printf("Post Order Display\n"); + print_postorder(root); + + /* Search node into tree */ +#ifdef BAMBU_PROFILING + __builtin_bambu_time_start(); +#endif + tmp = search(root, 4); +#ifdef BAMBU_PROFILING + __builtin_bambu_time_stop(); +#endif + if (tmp) + { + printf("Searched node=%d\n", tmp->data); + } + else + { + printf("Data Not found in tree.\n"); + } + + /* Search node into tree */ +#ifdef BAMBU_PROFILING + __builtin_bambu_time_start(); +#endif + tmp = search(root, 6); +#ifdef BAMBU_PROFILING + __builtin_bambu_time_stop(); +#endif + if (tmp) + { + printf("Second searched node=%d\n", tmp->data); + } + else + { + printf("Data Not found in tree.\n"); + } + + /* Deleting all nodes of tree */ + deltree(root); + return 0; +} diff --git a/documentation/tutorial_fpl_2022/03-optimizations/Exercise1/README b/documentation/tutorial_fpl_2022/03-optimizations/Exercise1/README new file mode 100644 index 000000000..ebe8c2298 --- /dev/null +++ b/documentation/tutorial_fpl_2022/03-optimizations/Exercise1/README @@ -0,0 +1,5 @@ +Evaluate the effects of GCC optimizations on the cycle count of the adpcm benchmark: +- Different optimization levels +- Vectorization +- Different inlining settings + diff --git a/documentation/tutorial_fpl_2022/03-optimizations/Exercise1/adpcm.c b/documentation/tutorial_fpl_2022/03-optimizations/Exercise1/adpcm.c new file mode 100755 index 000000000..613b1bdb5 --- /dev/null +++ b/documentation/tutorial_fpl_2022/03-optimizations/Exercise1/adpcm.c @@ -0,0 +1,882 @@ +/* ++--------------------------------------------------------------------------+ +| CHStone : a suite of benchmark programs for C-based High-Level Synthesis | +| ======================================================================== | +| | +| * Collected and Modified : Y. Hara, H. Tomiyama, S. Honda, | +| H. Takada and K. Ishii | +| Nagoya University, Japan | +| | +| * Remark : | +| 1. This source code is modified to unify the formats of the benchmark | +| programs in CHStone. | +| 2. 
Test vectors are added for CHStone. | +| 3. If "main_result" is 0 at the end of the program, the program is | +| correctly executed. | +| 4. Please follow the copyright of each benchmark program. | ++--------------------------------------------------------------------------+ +*/ +/*************************************************************************/ +/* */ +/* SNU-RT Benchmark Suite for Worst Case Timing Analysis */ +/* ===================================================== */ +/* Collected and Modified by S.-S. Lim */ +/* sslim@archi.snu.ac.kr */ +/* Real-Time Research Group */ +/* Seoul National University */ +/* */ +/* */ +/* < Features > - restrictions for our experimental environment */ +/* */ +/* 1. Completely structured. */ +/* - There are no unconditional jumps. */ +/* - There are no exit from loop bodies. */ +/* (There are no 'break' or 'return' in loop bodies) */ +/* 2. No 'switch' statements. */ +/* 3. No 'do..while' statements. */ +/* 4. Expressions are restricted. */ +/* - There are no multiple expressions joined by 'or', */ +/* 'and' operations. */ +/* 5. No library calls. */ +/* - All the functions needed are implemented in the */ +/* source file. */ +/* */ +/* */ +/*************************************************************************/ +/* */ +/* FILE: adpcm.c */ +/* SOURCE : C Algorithms for Real-Time DSP by P. M. Embree */ +/* */ +/* DESCRIPTION : */ +/* */ +/* CCITT G.722 ADPCM (Adaptive Differential Pulse Code Modulation) */ +/* algorithm. */ +/* 16khz sample rate data is stored in the array test_data[SIZE]. */ +/* Results are stored in the array compressed[SIZE] and result[SIZE].*/ +/* Execution time is determined by the constant SIZE (default value */ +/* is 2000). 
*/ +/* */ +/* REMARK : */ +/* */ +/* EXECUTION TIME : */ +/* */ +/* */ +/*************************************************************************/ +#include <stdio.h> + +int encode (int, int); +void decode (int); +int filtez (int *bpl, int *dlt); +void upzero (int dlt, int *dlti, int *bli); +int filtep (int rlt1, int al1, int rlt2, int al2); +int quantl (int el, int detl); +int logscl (int il, int nbl); +int scalel (int nbl, int shift_constant); +int uppol2 (int al1, int al2, int plt, int plt1, int plt2); +int uppol1 (int al1, int apl2, int plt, int plt1); +int logsch (int ih, int nbh); +void reset (); + +/* G722 C code */ + +/* variables for transmit quadrature mirror filter here */ +int tqmf[24]; + +/* QMF filter coefficients: +scaled by a factor of 4 compared to G722 CCITT recommendation */ +const int h[24] = { + 12, -44, -44, 212, 48, -624, 128, 1448, + -840, -3220, 3804, 15504, 15504, 3804, -3220, -840, + 1448, 128, -624, 48, 212, -44, -44, 12 +}; + +int xl, xh; + +/* variables for receive quadrature mirror filter here */ +int accumc[11], accumd[11]; + +/* outputs of decode() */ +int xout1, xout2; + +int xs, xd; + +/* variables for encoder (hi and lo) here */ + +int il, szl, spl, sl, el; + +const int qq4_code4_table[16] = { + 0, -20456, -12896, -8968, -6288, -4240, -2584, -1200, + 20456, 12896, 8968, 6288, 4240, 2584, 1200, 0 +}; + + +const int qq6_code6_table[64] = { + -136, -136, -136, -136, -24808, -21904, -19008, -16704, + -14984, -13512, -12280, -11192, -10232, -9360, -8576, -7856, + -7192, -6576, -6000, -5456, -4944, -4464, -4008, -3576, + -3168, -2776, -2400, -2032, -1688, -1360, -1040, -728, + 24808, 21904, 19008, 16704, 14984, 13512, 12280, 11192, + 10232, 9360, 8576, 7856, 7192, 6576, 6000, 5456, + 4944, 4464, 4008, 3576, 3168, 2776, 2400, 2032, + 1688, 1360, 1040, 728, 432, 136, -432, -136 +}; + +int delay_bpl[6]; + +int delay_dltx[6]; + +const int wl_code_table[16] = { + -60, 3042, 1198, 538, 334, 172, 58, -30, + 3042, 1198, 538, 334, 172, 58, -30, -60 +};
+ +const int ilb_table[32] = { + 2048, 2093, 2139, 2186, 2233, 2282, 2332, 2383, + 2435, 2489, 2543, 2599, 2656, 2714, 2774, 2834, + 2896, 2960, 3025, 3091, 3158, 3228, 3298, 3371, + 3444, 3520, 3597, 3676, 3756, 3838, 3922, 4008 +}; + +int nbl; /* delay line */ +int al1, al2; +int plt, plt1, plt2; +int dlt; +int rlt, rlt1, rlt2; + +/* decision levels - pre-multiplied by 8, 0 to indicate end */ +const int decis_levl[30] = { + 280, 576, 880, 1200, 1520, 1864, 2208, 2584, + 2960, 3376, 3784, 4240, 4696, 5200, 5712, 6288, + 6864, 7520, 8184, 8968, 9752, 10712, 11664, 12896, + 14120, 15840, 17560, 20456, 23352, 32767 +}; + +int detl; + +/* quantization table 31 long to make quantl look-up easier, +last entry is for mil=30 case when wd is max */ +const int quant26bt_pos[31] = { + 61, 60, 59, 58, 57, 56, 55, 54, + 53, 52, 51, 50, 49, 48, 47, 46, + 45, 44, 43, 42, 41, 40, 39, 38, + 37, 36, 35, 34, 33, 32, 32 +}; + +/* quantization table 31 long to make quantl look-up easier, +last entry is for mil=30 case when wd is max */ +const int quant26bt_neg[31] = { + 63, 62, 31, 30, 29, 28, 27, 26, + 25, 24, 23, 22, 21, 20, 19, 18, + 17, 16, 15, 14, 13, 12, 11, 10, + 9, 8, 7, 6, 5, 4, 4 +}; + + +int deth; +int sh; /* this comes from adaptive predictor */ +int eh; + +const int qq2_code2_table[4] = { + -7408, -1616, 7408, 1616 +}; + +const int wh_code_table[4] = { + 798, -214, 798, -214 +}; + + +int dh, ih; +int nbh, szh; +int sph, ph, yh, rh; + +int delay_dhx[6]; + +int delay_bph[6]; + +int ah1, ah2; +int ph1, ph2; +int rh1, rh2; + +/* variables for decoder here */ +int ilr, rl; +int dec_deth, dec_detl, dec_dlt; + +int dec_del_bpl[6]; + +int dec_del_dltx[6]; + +int dec_plt, dec_plt1, dec_plt2; +int dec_szl, dec_spl, dec_sl; +int dec_rlt1, dec_rlt2, dec_rlt; +int dec_al1, dec_al2; +int dl; +int dec_nbl, dec_dh, dec_nbh; + +/* variables used in filtez */ +int dec_del_bph[6]; + +int dec_del_dhx[6]; + +int dec_szh; +/* variables used in filtep */ +int dec_rh1, dec_rh2; +int dec_ah1, 
dec_ah2; +int dec_ph, dec_sph; + +int dec_sh; + +int dec_ph1, dec_ph2; + +/* G722 encode function two ints in, one 8 bit output */ + +/* put input samples in xin1 = first value, xin2 = second value */ +/* returns il and ih stored together */ + +int +abs (int n) +{ + int m; + + if (n >= 0) + m = n; + else + m = -n; + return m; +} + +int +encode (int xin1, int xin2) +{ + int i; + const int *h_ptr; + int *tqmf_ptr, *tqmf_ptr1; + long int xa, xb; + int decis; + +/* transmit quadrature mirror filters implemented here */ + h_ptr = h; + tqmf_ptr = tqmf; + xa = (long) (*tqmf_ptr++) * (*h_ptr++); + xb = (long) (*tqmf_ptr++) * (*h_ptr++); +/* main multiply accumulate loop for samples and coefficients */ + for (i = 0; i < 10; i++) + { + xa += (long) (*tqmf_ptr++) * (*h_ptr++); + xb += (long) (*tqmf_ptr++) * (*h_ptr++); + } +/* final mult/accumulate */ + xa += (long) (*tqmf_ptr++) * (*h_ptr++); + xb += (long) (*tqmf_ptr) * (*h_ptr++); + +/* update delay line tqmf */ + tqmf_ptr1 = tqmf_ptr - 2; + for (i = 0; i < 22; i++) + *tqmf_ptr-- = *tqmf_ptr1--; + *tqmf_ptr-- = xin1; + *tqmf_ptr = xin2; + +/* scale outputs */ + xl = (xa + xb) >> 15; + xh = (xa - xb) >> 15; + +/* end of quadrature mirror filter code */ + +/* starting with lower sub band encoder */ + +/* filtez - compute predictor output section - zero section */ + szl = filtez (delay_bpl, delay_dltx); + +/* filtep - compute predictor output signal (pole section) */ + spl = filtep (rlt1, al1, rlt2, al2); + +/* compute the predictor output value in the lower sub_band encoder */ + sl = szl + spl; + el = xl - sl; + +/* quantl: quantize the difference signal */ + il = quantl (el, detl); + +/* computes quantized difference signal */ +/* for invqbl, truncate by 2 lsbs, so mode = 3 */ + dlt = ((long) detl * qq4_code4_table[il >> 2]) >> 15; + +/* logscl: updates logarithmic quant. 
scale factor in low sub band */ + nbl = logscl (il, nbl); + +/* scalel: compute the quantizer scale factor in the lower sub band */ +/* calling parameters nbl and 8 (constant such that scalel can be scaleh) */ + detl = scalel (nbl, 8); + +/* parrec - simple addition to compute recontructed signal for adaptive pred */ + plt = dlt + szl; + +/* upzero: update zero section predictor coefficients (sixth order)*/ +/* calling parameters: dlt, dlt1, dlt2, ..., dlt6 from dlt */ +/* bpli (linear_buffer in which all six values are delayed */ +/* return params: updated bpli, delayed dltx */ + upzero (dlt, delay_dltx, delay_bpl); + +/* uppol2- update second predictor coefficient apl2 and delay it as al2 */ +/* calling parameters: al1, al2, plt, plt1, plt2 */ + al2 = uppol2 (al1, al2, plt, plt1, plt2); + +/* uppol1 :update first predictor coefficient apl1 and delay it as al1 */ +/* calling parameters: al1, apl2, plt, plt1 */ + al1 = uppol1 (al1, al2, plt, plt1); + +/* recons : compute recontructed signal for adaptive predictor */ + rlt = sl + dlt; + +/* done with lower sub_band encoder; now implement delays for next time*/ + rlt2 = rlt1; + rlt1 = rlt; + plt2 = plt1; + plt1 = plt; + +/* high band encode */ + + szh = filtez (delay_bph, delay_dhx); + + sph = filtep (rh1, ah1, rh2, ah2); + +/* predic: sh = sph + szh */ + sh = sph + szh; +/* subtra: eh = xh - sh */ + eh = xh - sh; + +/* quanth - quantization of difference signal for higher sub-band */ +/* quanth: in-place for speed params: eh, deth (has init. 
value) */ + if (eh >= 0) + { + ih = 3; /* 2,3 are pos codes */ + } + else + { + ih = 1; /* 0,1 are neg codes */ + } + decis = (564L * (long) deth) >> 12L; + if (abs (eh) > decis) + ih--; /* mih = 2 case */ + +/* compute the quantized difference signal, higher sub-band*/ + dh = ((long) deth * qq2_code2_table[ih]) >> 15L; + +/* logsch: update logarithmic quantizer scale factor in hi sub-band*/ + nbh = logsch (ih, nbh); + +/* note : scalel and scaleh use same code, different parameters */ + deth = scalel (nbh, 10); + +/* parrec - add pole predictor output to quantized diff. signal */ + ph = dh + szh; + +/* upzero: update zero section predictor coefficients (sixth order) */ +/* calling parameters: dh, dhi, bphi */ +/* return params: updated bphi, delayed dhx */ + upzero (dh, delay_dhx, delay_bph); + +/* uppol2: update second predictor coef aph2 and delay as ah2 */ +/* calling params: ah1, ah2, ph, ph1, ph2 */ + ah2 = uppol2 (ah1, ah2, ph, ph1, ph2); + +/* uppol1: update first predictor coef. 
aph2 and delay it as ah1 */ + ah1 = uppol1 (ah1, ah2, ph, ph1); + +/* recons for higher sub-band */ + yh = sh + dh; + +/* done with higher sub-band encoder, now Delay for next time */ + rh2 = rh1; + rh1 = yh; + ph2 = ph1; + ph1 = ph; + +/* multiplex ih and il to get signals together */ + return (il | (ih << 6)); +} + +/* decode function, result in xout1 and xout2 */ + +void +decode (int input) +{ + int i; + long int xa1, xa2; /* qmf accumulators */ + const int *h_ptr; + int *ac_ptr, *ac_ptr1, *ad_ptr, *ad_ptr1; + +/* split transmitted word from input into ilr and ih */ + ilr = input & 0x3f; + ih = input >> 6; + +/* LOWER SUB_BAND DECODER */ + +/* filtez: compute predictor output for zero section */ + dec_szl = filtez (dec_del_bpl, dec_del_dltx); + +/* filtep: compute predictor output signal for pole section */ + dec_spl = filtep (dec_rlt1, dec_al1, dec_rlt2, dec_al2); + + dec_sl = dec_spl + dec_szl; + +/* compute quantized difference signal for adaptive predic */ + dec_dlt = ((long) dec_detl * qq4_code4_table[ilr >> 2]) >> 15; + +/* compute quantized difference signal for decoder output */ + dl = ((long) dec_detl * qq6_code6_table[il]) >> 15; + + rl = dl + dec_sl; + +/* logscl: quantizer scale factor adaptation in the lower sub-band */ + dec_nbl = logscl (ilr, dec_nbl); + +/* scalel: computes quantizer scale factor in the lower sub band */ + dec_detl = scalel (dec_nbl, 8); + +/* parrec - add pole predictor output to quantized diff. signal */ +/* for partially reconstructed signal */ + dec_plt = dec_dlt + dec_szl; + +/* upzero: update zero section predictor coefficients */ + upzero (dec_dlt, dec_del_dltx, dec_del_bpl); + +/* uppol2: update second predictor coefficient apl2 and delay it as al2 */ + dec_al2 = uppol2 (dec_al1, dec_al2, dec_plt, dec_plt1, dec_plt2); + +/* uppol1: update first predictor coef. 
(pole setion) */ + dec_al1 = uppol1 (dec_al1, dec_al2, dec_plt, dec_plt1); + +/* recons : compute recontructed signal for adaptive predictor */ + dec_rlt = dec_sl + dec_dlt; + +/* done with lower sub band decoder, implement delays for next time */ + dec_rlt2 = dec_rlt1; + dec_rlt1 = dec_rlt; + dec_plt2 = dec_plt1; + dec_plt1 = dec_plt; + +/* HIGH SUB-BAND DECODER */ + +/* filtez: compute predictor output for zero section */ + dec_szh = filtez (dec_del_bph, dec_del_dhx); + +/* filtep: compute predictor output signal for pole section */ + dec_sph = filtep (dec_rh1, dec_ah1, dec_rh2, dec_ah2); + +/* predic:compute the predictor output value in the higher sub_band decoder */ + dec_sh = dec_sph + dec_szh; + +/* in-place compute the quantized difference signal */ + dec_dh = ((long) dec_deth * qq2_code2_table[ih]) >> 15L; + +/* logsch: update logarithmic quantizer scale factor in hi sub band */ + dec_nbh = logsch (ih, dec_nbh); + +/* scalel: compute the quantizer scale factor in the higher sub band */ + dec_deth = scalel (dec_nbh, 10); + +/* parrec: compute partially recontructed signal */ + dec_ph = dec_dh + dec_szh; + +/* upzero: update zero section predictor coefficients */ + upzero (dec_dh, dec_del_dhx, dec_del_bph); + +/* uppol2: update second predictor coefficient aph2 and delay it as ah2 */ + dec_ah2 = uppol2 (dec_ah1, dec_ah2, dec_ph, dec_ph1, dec_ph2); + +/* uppol1: update first predictor coef. 
(pole setion) */ + dec_ah1 = uppol1 (dec_ah1, dec_ah2, dec_ph, dec_ph1); + +/* recons : compute recontructed signal for adaptive predictor */ + rh = dec_sh + dec_dh; + +/* done with high band decode, implementing delays for next time here */ + dec_rh2 = dec_rh1; + dec_rh1 = rh; + dec_ph2 = dec_ph1; + dec_ph1 = dec_ph; + +/* end of higher sub_band decoder */ + +/* end with receive quadrature mirror filters */ + xd = rl - rh; + xs = rl + rh; + +/* receive quadrature mirror filters implemented here */ + h_ptr = h; + ac_ptr = accumc; + ad_ptr = accumd; + xa1 = (long) xd *(*h_ptr++); + xa2 = (long) xs *(*h_ptr++); +/* main multiply accumulate loop for samples and coefficients */ + for (i = 0; i < 10; i++) + { + xa1 += (long) (*ac_ptr++) * (*h_ptr++); + xa2 += (long) (*ad_ptr++) * (*h_ptr++); + } +/* final mult/accumulate */ + xa1 += (long) (*ac_ptr) * (*h_ptr++); + xa2 += (long) (*ad_ptr) * (*h_ptr++); + +/* scale by 2^14 */ + xout1 = xa1 >> 14; + xout2 = xa2 >> 14; + +/* update delay lines */ + ac_ptr1 = ac_ptr - 1; + ad_ptr1 = ad_ptr - 1; + for (i = 0; i < 10; i++) + { + *ac_ptr-- = *ac_ptr1--; + *ad_ptr-- = *ad_ptr1--; + } + *ac_ptr = xd; + *ad_ptr = xs; +} + +/* clear all storage locations */ + +void +reset () +{ + int i; + + detl = dec_detl = 32; /* reset to min scale factor */ + deth = dec_deth = 8; + nbl = al1 = al2 = plt1 = plt2 = rlt1 = rlt2 = 0; + nbh = ah1 = ah2 = ph1 = ph2 = rh1 = rh2 = 0; + dec_nbl = dec_al1 = dec_al2 = dec_plt1 = dec_plt2 = dec_rlt1 = dec_rlt2 = 0; + dec_nbh = dec_ah1 = dec_ah2 = dec_ph1 = dec_ph2 = dec_rh1 = dec_rh2 = 0; + + for (i = 0; i < 6; i++) + { + delay_dltx[i] = 0; + delay_dhx[i] = 0; + dec_del_dltx[i] = 0; + dec_del_dhx[i] = 0; + } + + for (i = 0; i < 6; i++) + { + delay_bpl[i] = 0; + delay_bph[i] = 0; + dec_del_bpl[i] = 0; + dec_del_bph[i] = 0; + } + + for (i = 0; i < 24; i++) + tqmf[i] = 0; // i<23 + + for (i = 0; i < 11; i++) + { + accumc[i] = 0; + accumd[i] = 0; + } +} + +/* filtez - compute predictor output signal (zero 
section) */ +/* input: bpl (six coefficients) and dlt (six delayed samples); output: szl = (sum of bpl[i]*dlt[i], i=0..5) >> 14 */ + +int +filtez (int *bpl, int *dlt) +{ + int i; + long int zl; + zl = (long) (*bpl++) * (*dlt++); + for (i = 1; i < 6; i++) + zl += (long) (*bpl++) * (*dlt++); + + return ((int) (zl >> 14)); /* x2 here */ +} + +/* filtep - compute predictor output signal (pole section) */ +/* input rlt1-2 and al1-2, output spl = (al1*2*rlt1 + al2*2*rlt2) >> 15 */ + +int +filtep (int rlt1, int al1, int rlt2, int al2) +{ + long int pl, pl2; + pl = 2 * rlt1; + pl = (long) al1 *pl; + pl2 = 2 * rlt2; + pl += (long) al2 *pl2; + return ((int) (pl >> 15)); +} + +/* quantl - quantize the difference signal in the lower sub-band; returns the code read from quant26bt_pos (el >= 0) or quant26bt_neg (el < 0) */ +int +quantl (int el, int detl) +{ + int ril, mil; + long int wd, decis; + +/* abs of difference signal */ + wd = abs (el); +/* determine mil based on decision levels and detl gain */ + for (mil = 0; mil < 30; mil++) + { + decis = (decis_levl[mil] * (long) detl) >> 15L; + if (wd <= decis) + break; + } +/* if mil=30 the loop never broke: wd is greater than all decision levels */ + if (el >= 0) + ril = quant26bt_pos[mil]; + else + ril = quant26bt_neg[mil]; + return (ril); +} + +/* logscl - update log quantizer scale factor in lower sub-band */ +/* note that nbl is passed and returned, clamped to 0..18432 */ + +int +logscl (int il, int nbl) +{ + long int wd; + wd = ((long) nbl * 127L) >> 7L; /* leak factor 127/128 */ + nbl = (int) wd + wl_code_table[il >> 2]; + if (nbl < 0) + nbl = 0; + if (nbl > 18432) + nbl = 18432; + return (nbl); +} + +/* scalel: compute quantizer scale factor in lower or upper sub-band*/ +/* wd1 (bits 6-10 of nbl) selects the ilb_table entry, wd2 (nbl >> 11) reduces the right shift; the result is multiplied by 8 */ +int +scalel (int nbl, int shift_constant) +{ + int wd1, wd2, wd3; + wd1 = (nbl >> 6) & 31; + wd2 = nbl >> 11; + wd3 = ilb_table[wd1] >> (shift_constant + 1 - wd2); + return (wd3 << 3); +} + +/* upzero - inputs: dlt, dlti[0-5], bli[0-5], outputs: updated bli[0-5] */ +/* each bli[i] is leaked by 255/256 and, when dlt != 0, moved by +128 or -128 according to the sign of dlt*dlti[i]; then dlti is shifted by one and dlt is stored in dlti[0] */ + +void +upzero (int dlt, int *dlti, int *bli) +{ + int i, wd2, wd3; +/*if dlt is zero, then no sum into bli */ + if (dlt == 0) + { + for (i = 0; i < 6;
i++) + { + bli[i] = (int) ((255L * bli[i]) >> 8L); /* leak factor of 255/256 */ + } + } + else + { + for (i = 0; i < 6; i++) + { + if ((long) dlt * dlti[i] >= 0) + wd2 = 128; + else + wd2 = -128; + wd3 = (int) ((255L * bli[i]) >> 8L); /* leak factor of 255/256 */ + bli[i] = wd2 + wd3; + } + } +/* implement delay line for dlt: oldest sample dlti[5] is dropped */ + dlti[5] = dlti[4]; + dlti[4] = dlti[3]; + dlti[3] = dlti[2]; + dlti[2] = dlti[1]; + dlti[1] = dlti[0]; + dlti[0] = dlt; +} + +/* uppol2 - update second predictor coefficient (pole section) */ +/* inputs: al1, al2, plt, plt1, plt2. outputs: apl2, clamped to -12288..12288 */ + +int +uppol2 (int al1, int al2, int plt, int plt1, int plt2) +{ + long int wd2, wd4; + int apl2; + wd2 = 4L * (long) al1; + if ((long) plt * plt1 >= 0L) + wd2 = -wd2; /* check same sign */ + wd2 = wd2 >> 7; /* gain of 1/128 */ + if ((long) plt * plt2 >= 0L) + { + wd4 = wd2 + 128; /* same sign case */ + } + else + { + wd4 = wd2 - 128; + } + apl2 = wd4 + (127L * (long) al2 >> 7L); /* leak factor of 127/128 */ + +/* apl2 is limited to +-.75 */ + if (apl2 > 12288) + apl2 = 12288; + if (apl2 < -12288) + apl2 = -12288; + return (apl2); +} + +/* uppol1 - update first predictor coefficient (pole section) */ +/* inputs: al1, apl2, plt, plt1.
outputs: apl1 */ + +int +uppol1 (int al1, int apl2, int plt, int plt1) +{ + long int wd2; + int wd3, apl1; + wd2 = ((long) al1 * 255L) >> 8L; /* leak factor of 255/256 */ + if ((long) plt * plt1 >= 0L) + { + apl1 = (int) wd2 + 192; /* same sign case */ + } + else + { + apl1 = (int) wd2 - 192; + } +/* note: wd3= .9375-.75 is always positive */ + wd3 = 15360 - apl2; /* limit value */ + if (apl1 > wd3) + apl1 = wd3; + if (apl1 < -wd3) + apl1 = -wd3; + return (apl1); +} + +/* logsch - update log quantizer scale factor in higher sub-band */ +/* note that nbh is passed and returned */ + +int +logsch (int ih, int nbh) +{ + int wd; + wd = ((long) nbh * 127L) >> 7L; /* leak factor 127/128 */ + nbh = wd + wh_code_table[ih]; + if (nbh < 0) + nbh = 0; + if (nbh > 22528) + nbh = 22528; + return (nbh); +} + +/* ++--------------------------------------------------------------------------+ +| * Test Vectors (added for CHStone) | +| test_data : input data | +| test_compressed : expected output data for "encode" | +| test_result : expected output data for "decode" | ++--------------------------------------------------------------------------+ +*/ + +#define SIZE 100 +#define IN_END 100 + +const int test_data[SIZE] = { + 0x44, 0x44, 0x44, 0x44, 0x44, + 0x44, 0x44, 0x44, 0x44, 0x44, + 0x44, 0x44, 0x44, 0x44, 0x44, + 0x44, 0x44, 0x43, 0x43, 0x43, + 0x43, 0x43, 0x43, 0x43, 0x42, + 0x42, 0x42, 0x42, 0x42, 0x42, + 0x41, 0x41, 0x41, 0x41, 0x41, + 0x40, 0x40, 0x40, 0x40, 0x40, + 0x40, 0x40, 0x40, 0x3f, 0x3f, + 0x3f, 0x3f, 0x3f, 0x3e, 0x3e, + 0x3e, 0x3e, 0x3e, 0x3e, 0x3d, + 0x3d, 0x3d, 0x3d, 0x3d, 0x3d, + 0x3c, 0x3c, 0x3c, 0x3c, 0x3c, + 0x3c, 0x3c, 0x3c, 0x3c, 0x3b, + 0x3b, 0x3b, 0x3b, 0x3b, 0x3b, + 0x3b, 0x3b, 0x3b, 0x3b, 0x3b, + 0x3b, 0x3b, 0x3b, 0x3b, 0x3b, + 0x3b, 0x3b, 0x3b, 0x3b, 0x3b, + 0x3b, 0x3b, 0x3c, 0x3c, 0x3c, + 0x3c, 0x3c, 0x3c, 0x3c, 0x3c +}; +int compressed[SIZE], result[SIZE]; +const int test_compressed[SIZE] = { + 0xfd, 0xde, 0x77, 0xba, 0xf2, + 0x90, 0x20, 0xa0, 0xec, 
0xed, + 0xef, 0xf1, 0xf3, 0xf4, 0xf5, + 0xf5, 0xf5, 0xf5, 0xf6, 0xf6, + 0xf6, 0xf7, 0xf8, 0xf7, 0xf8, + 0xf7, 0xf9, 0xf8, 0xf7, 0xf9, + 0xf8, 0xf8, 0xf6, 0xf8, 0xf8, + 0xf7, 0xf9, 0xf9, 0xf9, 0xf8, + 0xf7, 0xfa, 0xf8, 0xf8, 0xf7, + 0xfb, 0xfa, 0xf9, 0xf8, 0xf8 +}; +const int test_result[SIZE] = { + 0, 0xffffffff, 0xffffffff, 0, 0, + 0xffffffff, 0, 0, 0xffffffff, 0xffffffff, + 0, 0, 0x1, 0x1, 0, + 0xfffffffe, 0xffffffff, 0xfffffffe, 0, 0xfffffffc, + 0x1, 0x1, 0x1, 0xfffffffb, 0x2, + 0x2, 0x3, 0xb, 0x14, 0x14, + 0x16, 0x18, 0x20, 0x21, 0x26, + 0x27, 0x2e, 0x2f, 0x33, 0x32, + 0x35, 0x33, 0x36, 0x34, 0x37, + 0x34, 0x37, 0x35, 0x38, 0x36, + 0x39, 0x38, 0x3b, 0x3a, 0x3f, + 0x3f, 0x40, 0x3a, 0x3d, 0x3e, + 0x41, 0x3c, 0x3e, 0x3f, 0x42, + 0x3e, 0x3b, 0x37, 0x3b, 0x3e, + 0x41, 0x3b, 0x3b, 0x3a, 0x3b, + 0x36, 0x39, 0x3b, 0x3f, 0x3c, + 0x3b, 0x37, 0x3b, 0x3d, 0x41, + 0x3d, 0x3e, 0x3c, 0x3e, 0x3b, + 0x3a, 0x37, 0x3b, 0x3e, 0x41, + 0x3c, 0x3b, 0x39, 0x3a, 0x36 +}; + +void +adpcm_main () +{ + int i, j; + +/* reset, initialize required memory */ + reset (); + + j = 10; + + for (i = 0; i < IN_END; i += 2) + { + compressed[i / 2] = encode (test_data[i], test_data[i + 1]); + } + for (i = 0; i < IN_END; i += 2) + { + decode (compressed[i / 2]); + result[i] = xout1; + result[i + 1] = xout2; + } +} + +int +main () +{ + int i; + int main_result; + + main_result = 0; + adpcm_main (); + for (i = 0; i < IN_END / 2; i++) + { + if (compressed[i] != test_compressed[i]) + { + main_result += 1; + } + } + for (i = 0; i < IN_END; i++) + { + if (result[i] != test_result[i]) + { + main_result += 1; + } + } + printf ("%d\n", main_result); + return main_result; + } diff --git a/documentation/tutorial_fpl_2022/03-optimizations/Exercise1/hint.sh b/documentation/tutorial_fpl_2022/03-optimizations/Exercise1/hint.sh new file mode 100755 index 000000000..3bb28d85f --- /dev/null +++ b/documentation/tutorial_fpl_2022/03-optimizations/Exercise1/hint.sh @@ -0,0 +1,4 @@ +#!/bin/bash +abs_script=$(readlink -e $0) 
+dir_script=$(dirname $abs_script) +bambu $dir_script/adpcm.c -O0 --simulate "$@" |& tee log.txt diff --git a/documentation/tutorial_fpl_2022/03-optimizations/Exercise1/solution/adpcm.c b/documentation/tutorial_fpl_2022/03-optimizations/Exercise1/solution/adpcm.c new file mode 100755 index 000000000..613b1bdb5 --- /dev/null +++ b/documentation/tutorial_fpl_2022/03-optimizations/Exercise1/solution/adpcm.c @@ -0,0 +1,882 @@ +/* ++--------------------------------------------------------------------------+ +| CHStone : a suite of benchmark programs for C-based High-Level Synthesis | +| ======================================================================== | +| | +| * Collected and Modified : Y. Hara, H. Tomiyama, S. Honda, | +| H. Takada and K. Ishii | +| Nagoya University, Japan | +| | +| * Remark : | +| 1. This source code is modified to unify the formats of the benchmark | +| programs in CHStone. | +| 2. Test vectors are added for CHStone. | +| 3. If "main_result" is 0 at the end of the program, the program is | +| correctly executed. | +| 4. Please follow the copyright of each benchmark program. | ++--------------------------------------------------------------------------+ +*/ +/*************************************************************************/ +/* */ +/* SNU-RT Benchmark Suite for Worst Case Timing Analysis */ +/* ===================================================== */ +/* Collected and Modified by S.-S. Lim */ +/* sslim@archi.snu.ac.kr */ +/* Real-Time Research Group */ +/* Seoul National University */ +/* */ +/* */ +/* < Features > - restrictions for our experimental environment */ +/* */ +/* 1. Completely structured. */ +/* - There are no unconditional jumps. */ +/* - There are no exit from loop bodies. */ +/* (There are no 'break' or 'return' in loop bodies) */ +/* 2. No 'switch' statements. */ +/* 3. No 'do..while' statements. */ +/* 4. Expressions are restricted. */ +/* - There are no multiple expressions joined by 'or', */ +/* 'and' operations. 
*/ +/* 5. No library calls. */ +/* - All the functions needed are implemented in the */ +/* source file. */ +/* */ +/* */ +/*************************************************************************/ +/* */ +/* FILE: adpcm.c */ +/* SOURCE : C Algorithms for Real-Time DSP by P. M. Embree */ +/* */ +/* DESCRIPTION : */ +/* */ +/* CCITT G.722 ADPCM (Adaptive Differential Pulse Code Modulation) */ +/* algorithm. */ +/* 16khz sample rate data is stored in the array test_data[SIZE]. */ +/* Results are stored in the array compressed[SIZE] and result[SIZE].*/ +/* Execution time is determined by the constant SIZE (default value */ +/* is 2000). */ +/* */ +/* REMARK : */ +/* */ +/* EXECUTION TIME : */ +/* */ +/* */ +/*************************************************************************/ +#include <stdio.h> + +int encode (int, int); +void decode (int); +int filtez (int *bpl, int *dlt); +void upzero (int dlt, int *dlti, int *bli); +int filtep (int rlt1, int al1, int rlt2, int al2); +int quantl (int el, int detl); +int logscl (int il, int nbl); +int scalel (int nbl, int shift_constant); +int uppol2 (int al1, int al2, int plt, int plt1, int plt2); +int uppol1 (int al1, int apl2, int plt, int plt1); +int logsch (int ih, int nbh); +void reset (); + +/* G722 C code */ + +/* variables for transmit quadrature mirror filter here */ +int tqmf[24]; + +/* QMF filter coefficients: +scaled by a factor of 4 compared to G722 CCITT recommendation */ +const int h[24] = { + 12, -44, -44, 212, 48, -624, 128, 1448, + -840, -3220, 3804, 15504, 15504, 3804, -3220, -840, + 1448, 128, -624, 48, 212, -44, -44, 12 +}; + +int xl, xh; + +/* variables for receive quadrature mirror filter here */ +int accumc[11], accumd[11]; + +/* outputs of decode() */ +int xout1, xout2; + +int xs, xd; + +/* variables for encoder (hi and lo) here */ + +int il, szl, spl, sl, el; + +const int qq4_code4_table[16] = { + 0, -20456, -12896, -8968, -6288, -4240, -2584, -1200, + 20456, 12896, 8968, 6288, 4240, 2584, 1200, 0 +}; + +
+const int qq6_code6_table[64] = { + -136, -136, -136, -136, -24808, -21904, -19008, -16704, + -14984, -13512, -12280, -11192, -10232, -9360, -8576, -7856, + -7192, -6576, -6000, -5456, -4944, -4464, -4008, -3576, + -3168, -2776, -2400, -2032, -1688, -1360, -1040, -728, + 24808, 21904, 19008, 16704, 14984, 13512, 12280, 11192, + 10232, 9360, 8576, 7856, 7192, 6576, 6000, 5456, + 4944, 4464, 4008, 3576, 3168, 2776, 2400, 2032, + 1688, 1360, 1040, 728, 432, 136, -432, -136 +}; + +int delay_bpl[6]; + +int delay_dltx[6]; + +const int wl_code_table[16] = { + -60, 3042, 1198, 538, 334, 172, 58, -30, + 3042, 1198, 538, 334, 172, 58, -30, -60 +}; + +const int ilb_table[32] = { + 2048, 2093, 2139, 2186, 2233, 2282, 2332, 2383, + 2435, 2489, 2543, 2599, 2656, 2714, 2774, 2834, + 2896, 2960, 3025, 3091, 3158, 3228, 3298, 3371, + 3444, 3520, 3597, 3676, 3756, 3838, 3922, 4008 +}; + +int nbl; /* delay line */ +int al1, al2; +int plt, plt1, plt2; +int dlt; +int rlt, rlt1, rlt2; + +/* decision levels - pre-multiplied by 8, 0 to indicate end */ +const int decis_levl[30] = { + 280, 576, 880, 1200, 1520, 1864, 2208, 2584, + 2960, 3376, 3784, 4240, 4696, 5200, 5712, 6288, + 6864, 7520, 8184, 8968, 9752, 10712, 11664, 12896, + 14120, 15840, 17560, 20456, 23352, 32767 +}; + +int detl; + +/* quantization table 31 long to make quantl look-up easier, +last entry is for mil=30 case when wd is max */ +const int quant26bt_pos[31] = { + 61, 60, 59, 58, 57, 56, 55, 54, + 53, 52, 51, 50, 49, 48, 47, 46, + 45, 44, 43, 42, 41, 40, 39, 38, + 37, 36, 35, 34, 33, 32, 32 +}; + +/* quantization table 31 long to make quantl look-up easier, +last entry is for mil=30 case when wd is max */ +const int quant26bt_neg[31] = { + 63, 62, 31, 30, 29, 28, 27, 26, + 25, 24, 23, 22, 21, 20, 19, 18, + 17, 16, 15, 14, 13, 12, 11, 10, + 9, 8, 7, 6, 5, 4, 4 +}; + + +int deth; +int sh; /* this comes from adaptive predictor */ +int eh; + +const int qq2_code2_table[4] = { + -7408, -1616, 7408, 1616 +}; + +const int 
wh_code_table[4] = { + 798, -214, 798, -214 +}; + + +int dh, ih; +int nbh, szh; +int sph, ph, yh, rh; + +int delay_dhx[6]; + +int delay_bph[6]; + +int ah1, ah2; +int ph1, ph2; +int rh1, rh2; + +/* variables for decoder here */ +int ilr, rl; +int dec_deth, dec_detl, dec_dlt; + +int dec_del_bpl[6]; + +int dec_del_dltx[6]; + +int dec_plt, dec_plt1, dec_plt2; +int dec_szl, dec_spl, dec_sl; +int dec_rlt1, dec_rlt2, dec_rlt; +int dec_al1, dec_al2; +int dl; +int dec_nbl, dec_dh, dec_nbh; + +/* variables used in filtez */ +int dec_del_bph[6]; + +int dec_del_dhx[6]; + +int dec_szh; +/* variables used in filtep */ +int dec_rh1, dec_rh2; +int dec_ah1, dec_ah2; +int dec_ph, dec_sph; + +int dec_sh; + +int dec_ph1, dec_ph2; + +/* G722 encode function two ints in, one 8 bit output */ + +/* put input samples in xin1 = first value, xin2 = second value */ +/* returns il and ih stored together */ + +int +abs (int n) +{ + int m; + + if (n >= 0) + m = n; + else + m = -n; + return m; +} + +int +encode (int xin1, int xin2) +{ + int i; + const int *h_ptr; + int *tqmf_ptr, *tqmf_ptr1; + long int xa, xb; + int decis; + +/* transmit quadrature mirror filters implemented here */ + h_ptr = h; + tqmf_ptr = tqmf; + xa = (long) (*tqmf_ptr++) * (*h_ptr++); + xb = (long) (*tqmf_ptr++) * (*h_ptr++); +/* main multiply accumulate loop for samples and coefficients */ + for (i = 0; i < 10; i++) + { + xa += (long) (*tqmf_ptr++) * (*h_ptr++); + xb += (long) (*tqmf_ptr++) * (*h_ptr++); + } +/* final mult/accumulate */ + xa += (long) (*tqmf_ptr++) * (*h_ptr++); + xb += (long) (*tqmf_ptr) * (*h_ptr++); + +/* update delay line tqmf */ + tqmf_ptr1 = tqmf_ptr - 2; + for (i = 0; i < 22; i++) + *tqmf_ptr-- = *tqmf_ptr1--; + *tqmf_ptr-- = xin1; + *tqmf_ptr = xin2; + +/* scale outputs */ + xl = (xa + xb) >> 15; + xh = (xa - xb) >> 15; + +/* end of quadrature mirror filter code */ + +/* starting with lower sub band encoder */ + +/* filtez - compute predictor output section - zero section */ + szl = filtez 
(delay_bpl, delay_dltx); + +/* filtep - compute predictor output signal (pole section) */ + spl = filtep (rlt1, al1, rlt2, al2); + +/* compute the predictor output value in the lower sub_band encoder */ + sl = szl + spl; + el = xl - sl; + +/* quantl: quantize the difference signal */ + il = quantl (el, detl); + +/* computes quantized difference signal */ +/* for invqbl, truncate by 2 lsbs, so mode = 3 */ + dlt = ((long) detl * qq4_code4_table[il >> 2]) >> 15; + +/* logscl: updates logarithmic quant. scale factor in low sub band */ + nbl = logscl (il, nbl); + +/* scalel: compute the quantizer scale factor in the lower sub band */ +/* calling parameters nbl and 8 (constant such that scalel can be scaleh) */ + detl = scalel (nbl, 8); + +/* parrec - simple addition to compute recontructed signal for adaptive pred */ + plt = dlt + szl; + +/* upzero: update zero section predictor coefficients (sixth order)*/ +/* calling parameters: dlt, dlt1, dlt2, ..., dlt6 from dlt */ +/* bpli (linear_buffer in which all six values are delayed */ +/* return params: updated bpli, delayed dltx */ + upzero (dlt, delay_dltx, delay_bpl); + +/* uppol2- update second predictor coefficient apl2 and delay it as al2 */ +/* calling parameters: al1, al2, plt, plt1, plt2 */ + al2 = uppol2 (al1, al2, plt, plt1, plt2); + +/* uppol1 :update first predictor coefficient apl1 and delay it as al1 */ +/* calling parameters: al1, apl2, plt, plt1 */ + al1 = uppol1 (al1, al2, plt, plt1); + +/* recons : compute recontructed signal for adaptive predictor */ + rlt = sl + dlt; + +/* done with lower sub_band encoder; now implement delays for next time*/ + rlt2 = rlt1; + rlt1 = rlt; + plt2 = plt1; + plt1 = plt; + +/* high band encode */ + + szh = filtez (delay_bph, delay_dhx); + + sph = filtep (rh1, ah1, rh2, ah2); + +/* predic: sh = sph + szh */ + sh = sph + szh; +/* subtra: eh = xh - sh */ + eh = xh - sh; + +/* quanth - quantization of difference signal for higher sub-band */ +/* quanth: in-place for speed 
params: eh, deth (has init. value) */ + if (eh >= 0) + { + ih = 3; /* 2,3 are pos codes */ + } + else + { + ih = 1; /* 0,1 are neg codes */ + } + decis = (564L * (long) deth) >> 12L; + if (abs (eh) > decis) + ih--; /* mih = 2 case */ + +/* compute the quantized difference signal, higher sub-band*/ + dh = ((long) deth * qq2_code2_table[ih]) >> 15L; + +/* logsch: update logarithmic quantizer scale factor in hi sub-band*/ + nbh = logsch (ih, nbh); + +/* note : scalel and scaleh use same code, different parameters */ + deth = scalel (nbh, 10); + +/* parrec - add pole predictor output to quantized diff. signal */ + ph = dh + szh; + +/* upzero: update zero section predictor coefficients (sixth order) */ +/* calling parameters: dh, dhi, bphi */ +/* return params: updated bphi, delayed dhx */ + upzero (dh, delay_dhx, delay_bph); + +/* uppol2: update second predictor coef aph2 and delay as ah2 */ +/* calling params: ah1, ah2, ph, ph1, ph2 */ + ah2 = uppol2 (ah1, ah2, ph, ph1, ph2); + +/* uppol1: update first predictor coef. 
aph2 and delay it as ah1 */ + ah1 = uppol1 (ah1, ah2, ph, ph1); + +/* recons for higher sub-band */ + yh = sh + dh; + +/* done with higher sub-band encoder, now Delay for next time */ + rh2 = rh1; + rh1 = yh; + ph2 = ph1; + ph1 = ph; + +/* multiplex ih and il to get signals together */ + return (il | (ih << 6)); +} + +/* decode function, result in xout1 and xout2 */ + +void +decode (int input) +{ + int i; + long int xa1, xa2; /* qmf accumulators */ + const int *h_ptr; + int *ac_ptr, *ac_ptr1, *ad_ptr, *ad_ptr1; + +/* split transmitted word from input into ilr and ih */ + ilr = input & 0x3f; + ih = input >> 6; + +/* LOWER SUB_BAND DECODER */ + +/* filtez: compute predictor output for zero section */ + dec_szl = filtez (dec_del_bpl, dec_del_dltx); + +/* filtep: compute predictor output signal for pole section */ + dec_spl = filtep (dec_rlt1, dec_al1, dec_rlt2, dec_al2); + + dec_sl = dec_spl + dec_szl; + +/* compute quantized difference signal for adaptive predic */ + dec_dlt = ((long) dec_detl * qq4_code4_table[ilr >> 2]) >> 15; + +/* compute quantized difference signal for decoder output */ + dl = ((long) dec_detl * qq6_code6_table[il]) >> 15; + + rl = dl + dec_sl; + +/* logscl: quantizer scale factor adaptation in the lower sub-band */ + dec_nbl = logscl (ilr, dec_nbl); + +/* scalel: computes quantizer scale factor in the lower sub band */ + dec_detl = scalel (dec_nbl, 8); + +/* parrec - add pole predictor output to quantized diff. signal */ +/* for partially reconstructed signal */ + dec_plt = dec_dlt + dec_szl; + +/* upzero: update zero section predictor coefficients */ + upzero (dec_dlt, dec_del_dltx, dec_del_bpl); + +/* uppol2: update second predictor coefficient apl2 and delay it as al2 */ + dec_al2 = uppol2 (dec_al1, dec_al2, dec_plt, dec_plt1, dec_plt2); + +/* uppol1: update first predictor coef. 
(pole setion) */ + dec_al1 = uppol1 (dec_al1, dec_al2, dec_plt, dec_plt1); + +/* recons : compute recontructed signal for adaptive predictor */ + dec_rlt = dec_sl + dec_dlt; + +/* done with lower sub band decoder, implement delays for next time */ + dec_rlt2 = dec_rlt1; + dec_rlt1 = dec_rlt; + dec_plt2 = dec_plt1; + dec_plt1 = dec_plt; + +/* HIGH SUB-BAND DECODER */ + +/* filtez: compute predictor output for zero section */ + dec_szh = filtez (dec_del_bph, dec_del_dhx); + +/* filtep: compute predictor output signal for pole section */ + dec_sph = filtep (dec_rh1, dec_ah1, dec_rh2, dec_ah2); + +/* predic:compute the predictor output value in the higher sub_band decoder */ + dec_sh = dec_sph + dec_szh; + +/* in-place compute the quantized difference signal */ + dec_dh = ((long) dec_deth * qq2_code2_table[ih]) >> 15L; + +/* logsch: update logarithmic quantizer scale factor in hi sub band */ + dec_nbh = logsch (ih, dec_nbh); + +/* scalel: compute the quantizer scale factor in the higher sub band */ + dec_deth = scalel (dec_nbh, 10); + +/* parrec: compute partially recontructed signal */ + dec_ph = dec_dh + dec_szh; + +/* upzero: update zero section predictor coefficients */ + upzero (dec_dh, dec_del_dhx, dec_del_bph); + +/* uppol2: update second predictor coefficient aph2 and delay it as ah2 */ + dec_ah2 = uppol2 (dec_ah1, dec_ah2, dec_ph, dec_ph1, dec_ph2); + +/* uppol1: update first predictor coef. 
(pole setion) */ + dec_ah1 = uppol1 (dec_ah1, dec_ah2, dec_ph, dec_ph1); + +/* recons : compute recontructed signal for adaptive predictor */ + rh = dec_sh + dec_dh; + +/* done with high band decode, implementing delays for next time here */ + dec_rh2 = dec_rh1; + dec_rh1 = rh; + dec_ph2 = dec_ph1; + dec_ph1 = dec_ph; + +/* end of higher sub_band decoder */ + +/* end with receive quadrature mirror filters */ + xd = rl - rh; + xs = rl + rh; + +/* receive quadrature mirror filters implemented here */ + h_ptr = h; + ac_ptr = accumc; + ad_ptr = accumd; + xa1 = (long) xd *(*h_ptr++); + xa2 = (long) xs *(*h_ptr++); +/* main multiply accumulate loop for samples and coefficients */ + for (i = 0; i < 10; i++) + { + xa1 += (long) (*ac_ptr++) * (*h_ptr++); + xa2 += (long) (*ad_ptr++) * (*h_ptr++); + } +/* final mult/accumulate */ + xa1 += (long) (*ac_ptr) * (*h_ptr++); + xa2 += (long) (*ad_ptr) * (*h_ptr++); + +/* scale by 2^14 */ + xout1 = xa1 >> 14; + xout2 = xa2 >> 14; + +/* update delay lines */ + ac_ptr1 = ac_ptr - 1; + ad_ptr1 = ad_ptr - 1; + for (i = 0; i < 10; i++) + { + *ac_ptr-- = *ac_ptr1--; + *ad_ptr-- = *ad_ptr1--; + } + *ac_ptr = xd; + *ad_ptr = xs; +} + +/* clear all storage locations */ + +void +reset () +{ + int i; + + detl = dec_detl = 32; /* reset to min scale factor */ + deth = dec_deth = 8; + nbl = al1 = al2 = plt1 = plt2 = rlt1 = rlt2 = 0; + nbh = ah1 = ah2 = ph1 = ph2 = rh1 = rh2 = 0; + dec_nbl = dec_al1 = dec_al2 = dec_plt1 = dec_plt2 = dec_rlt1 = dec_rlt2 = 0; + dec_nbh = dec_ah1 = dec_ah2 = dec_ph1 = dec_ph2 = dec_rh1 = dec_rh2 = 0; + + for (i = 0; i < 6; i++) + { + delay_dltx[i] = 0; + delay_dhx[i] = 0; + dec_del_dltx[i] = 0; + dec_del_dhx[i] = 0; + } + + for (i = 0; i < 6; i++) + { + delay_bpl[i] = 0; + delay_bph[i] = 0; + dec_del_bpl[i] = 0; + dec_del_bph[i] = 0; + } + + for (i = 0; i < 24; i++) + tqmf[i] = 0; // i<23 + + for (i = 0; i < 11; i++) + { + accumc[i] = 0; + accumd[i] = 0; + } +} + +/* filtez - compute predictor output signal (zero 
section) */ +/* input: bpl1-6 and dlt1-6, output: szl */ + +int +filtez (int *bpl, int *dlt) +{ + int i; + long int zl; + zl = (long) (*bpl++) * (*dlt++); + for (i = 1; i < 6; i++) + zl += (long) (*bpl++) * (*dlt++); + + return ((int) (zl >> 14)); /* x2 here */ +} + +/* filtep - compute predictor output signal (pole section) */ +/* input rlt1-2 and al1-2, output spl */ + +int +filtep (int rlt1, int al1, int rlt2, int al2) +{ + long int pl, pl2; + pl = 2 * rlt1; + pl = (long) al1 *pl; + pl2 = 2 * rlt2; + pl += (long) al2 *pl2; + return ((int) (pl >> 15)); +} + +/* quantl - quantize the difference signal in the lower sub-band */ +int +quantl (int el, int detl) +{ + int ril, mil; + long int wd, decis; + +/* abs of difference signal */ + wd = abs (el); +/* determine mil based on decision levels and detl gain */ + for (mil = 0; mil < 30; mil++) + { + decis = (decis_levl[mil] * (long) detl) >> 15L; + if (wd <= decis) + break; + } +/* if mil=30 then wd is less than all decision levels */ + if (el >= 0) + ril = quant26bt_pos[mil]; + else + ril = quant26bt_neg[mil]; + return (ril); +} + +/* logscl - update log quantizer scale factor in lower sub-band */ +/* note that nbl is passed and returned */ + +int +logscl (int il, int nbl) +{ + long int wd; + wd = ((long) nbl * 127L) >> 7L; /* leak factor 127/128 */ + nbl = (int) wd + wl_code_table[il >> 2]; + if (nbl < 0) + nbl = 0; + if (nbl > 18432) + nbl = 18432; + return (nbl); +} + +/* scalel: compute quantizer scale factor in lower or upper sub-band*/ + +int +scalel (int nbl, int shift_constant) +{ + int wd1, wd2, wd3; + wd1 = (nbl >> 6) & 31; + wd2 = nbl >> 11; + wd3 = ilb_table[wd1] >> (shift_constant + 1 - wd2); + return (wd3 << 3); +} + +/* upzero - inputs: dlt, dlti[0-5], bli[0-5], outputs: updated bli[0-5] */ +/* also implements delay of bli and update of dlti from dlt */ + +void +upzero (int dlt, int *dlti, int *bli) +{ + int i, wd2, wd3; +/*if dlt is zero, then no sum into bli */ + if (dlt == 0) + { + for (i = 0; i < 6; 
i++) + { + bli[i] = (int) ((255L * bli[i]) >> 8L); /* leak factor of 255/256 */ + } + } + else + { + for (i = 0; i < 6; i++) + { + if ((long) dlt * dlti[i] >= 0) + wd2 = 128; + else + wd2 = -128; + wd3 = (int) ((255L * bli[i]) >> 8L); /* leak factor of 255/256 */ + bli[i] = wd2 + wd3; + } + } +/* implement delay line for dlt */ + dlti[5] = dlti[4]; + dlti[4] = dlti[3]; + dlti[3] = dlti[2]; + dlti[2] = dlti[1]; + dlti[1] = dlti[0]; + dlti[0] = dlt; +} + +/* uppol2 - update second predictor coefficient (pole section) */ +/* inputs: al1, al2, plt, plt1, plt2. outputs: apl2 */ + +int +uppol2 (int al1, int al2, int plt, int plt1, int plt2) +{ + long int wd2, wd4; + int apl2; + wd2 = 4L * (long) al1; + if ((long) plt * plt1 >= 0L) + wd2 = -wd2; /* check same sign */ + wd2 = wd2 >> 7; /* gain of 1/128 */ + if ((long) plt * plt2 >= 0L) + { + wd4 = wd2 + 128; /* same sign case */ + } + else + { + wd4 = wd2 - 128; + } + apl2 = wd4 + (127L * (long) al2 >> 7L); /* leak factor of 127/128 */ + +/* apl2 is limited to +-.75 */ + if (apl2 > 12288) + apl2 = 12288; + if (apl2 < -12288) + apl2 = -12288; + return (apl2); +} + +/* uppol1 - update first predictor coefficient (pole section) */ +/* inputs: al1, apl2, plt, plt1. 
outputs: apl1 */ + +int +uppol1 (int al1, int apl2, int plt, int plt1) +{ + long int wd2; + int wd3, apl1; + wd2 = ((long) al1 * 255L) >> 8L; /* leak factor of 255/256 */ + if ((long) plt * plt1 >= 0L) + { + apl1 = (int) wd2 + 192; /* same sign case */ + } + else + { + apl1 = (int) wd2 - 192; + } +/* note: wd3= .9375-.75 is always positive */ + wd3 = 15360 - apl2; /* limit value */ + if (apl1 > wd3) + apl1 = wd3; + if (apl1 < -wd3) + apl1 = -wd3; + return (apl1); +} + +/* logsch - update log quantizer scale factor in higher sub-band */ +/* note that nbh is passed and returned */ + +int +logsch (int ih, int nbh) +{ + int wd; + wd = ((long) nbh * 127L) >> 7L; /* leak factor 127/128 */ + nbh = wd + wh_code_table[ih]; + if (nbh < 0) + nbh = 0; + if (nbh > 22528) + nbh = 22528; + return (nbh); +} + +/* ++--------------------------------------------------------------------------+ +| * Test Vectors (added for CHStone) | +| test_data : input data | +| test_compressed : expected output data for "encode" | +| test_result : expected output data for "decode" | ++--------------------------------------------------------------------------+ +*/ + +#define SIZE 100 +#define IN_END 100 + +const int test_data[SIZE] = { + 0x44, 0x44, 0x44, 0x44, 0x44, + 0x44, 0x44, 0x44, 0x44, 0x44, + 0x44, 0x44, 0x44, 0x44, 0x44, + 0x44, 0x44, 0x43, 0x43, 0x43, + 0x43, 0x43, 0x43, 0x43, 0x42, + 0x42, 0x42, 0x42, 0x42, 0x42, + 0x41, 0x41, 0x41, 0x41, 0x41, + 0x40, 0x40, 0x40, 0x40, 0x40, + 0x40, 0x40, 0x40, 0x3f, 0x3f, + 0x3f, 0x3f, 0x3f, 0x3e, 0x3e, + 0x3e, 0x3e, 0x3e, 0x3e, 0x3d, + 0x3d, 0x3d, 0x3d, 0x3d, 0x3d, + 0x3c, 0x3c, 0x3c, 0x3c, 0x3c, + 0x3c, 0x3c, 0x3c, 0x3c, 0x3b, + 0x3b, 0x3b, 0x3b, 0x3b, 0x3b, + 0x3b, 0x3b, 0x3b, 0x3b, 0x3b, + 0x3b, 0x3b, 0x3b, 0x3b, 0x3b, + 0x3b, 0x3b, 0x3b, 0x3b, 0x3b, + 0x3b, 0x3b, 0x3c, 0x3c, 0x3c, + 0x3c, 0x3c, 0x3c, 0x3c, 0x3c +}; +int compressed[SIZE], result[SIZE]; +const int test_compressed[SIZE] = { + 0xfd, 0xde, 0x77, 0xba, 0xf2, + 0x90, 0x20, 0xa0, 0xec, 
0xed, + 0xef, 0xf1, 0xf3, 0xf4, 0xf5, + 0xf5, 0xf5, 0xf5, 0xf6, 0xf6, + 0xf6, 0xf7, 0xf8, 0xf7, 0xf8, + 0xf7, 0xf9, 0xf8, 0xf7, 0xf9, + 0xf8, 0xf8, 0xf6, 0xf8, 0xf8, + 0xf7, 0xf9, 0xf9, 0xf9, 0xf8, + 0xf7, 0xfa, 0xf8, 0xf8, 0xf7, + 0xfb, 0xfa, 0xf9, 0xf8, 0xf8 +}; +const int test_result[SIZE] = { + 0, 0xffffffff, 0xffffffff, 0, 0, + 0xffffffff, 0, 0, 0xffffffff, 0xffffffff, + 0, 0, 0x1, 0x1, 0, + 0xfffffffe, 0xffffffff, 0xfffffffe, 0, 0xfffffffc, + 0x1, 0x1, 0x1, 0xfffffffb, 0x2, + 0x2, 0x3, 0xb, 0x14, 0x14, + 0x16, 0x18, 0x20, 0x21, 0x26, + 0x27, 0x2e, 0x2f, 0x33, 0x32, + 0x35, 0x33, 0x36, 0x34, 0x37, + 0x34, 0x37, 0x35, 0x38, 0x36, + 0x39, 0x38, 0x3b, 0x3a, 0x3f, + 0x3f, 0x40, 0x3a, 0x3d, 0x3e, + 0x41, 0x3c, 0x3e, 0x3f, 0x42, + 0x3e, 0x3b, 0x37, 0x3b, 0x3e, + 0x41, 0x3b, 0x3b, 0x3a, 0x3b, + 0x36, 0x39, 0x3b, 0x3f, 0x3c, + 0x3b, 0x37, 0x3b, 0x3d, 0x41, + 0x3d, 0x3e, 0x3c, 0x3e, 0x3b, + 0x3a, 0x37, 0x3b, 0x3e, 0x41, + 0x3c, 0x3b, 0x39, 0x3a, 0x36 +}; + +void +adpcm_main () +{ + int i, j; + +/* reset, initialize required memory */ + reset (); + + j = 10; + + for (i = 0; i < IN_END; i += 2) + { + compressed[i / 2] = encode (test_data[i], test_data[i + 1]); + } + for (i = 0; i < IN_END; i += 2) + { + decode (compressed[i / 2]); + result[i] = xout1; + result[i + 1] = xout2; + } +} + +int +main () +{ + int i; + int main_result; + + main_result = 0; + adpcm_main (); + for (i = 0; i < IN_END / 2; i++) + { + if (compressed[i] != test_compressed[i]) + { + main_result += 1; + } + } + for (i = 0; i < IN_END; i++) + { + if (result[i] != test_result[i]) + { + main_result += 1; + } + } + printf ("%d\n", main_result); + return main_result; + } diff --git a/documentation/tutorial_fpl_2022/03-optimizations/Exercise1/solution/adpcm.csv b/documentation/tutorial_fpl_2022/03-optimizations/Exercise1/solution/adpcm.csv new file mode 100644 index 000000000..7139cb2f5 --- /dev/null +++ b/documentation/tutorial_fpl_2022/03-optimizations/Exercise1/solution/adpcm.csv @@ -0,0 +1,25 @@ 
+Benchmark, CYCLES, HLS_execution_time, +CLANG11:adpcm_O0:main_0, 23643,43.0299999999999999989, +CLANG11:adpcm_O1:main_0, 23643,43.8199999999999999997, +CLANG11:adpcm_O2:main_0, 9651,61.9900000000000000015, +CLANG11:adpcm_O3:main_0, 8855,70.1500000000000000014, +CLANG11:adpcm_Os:main_0, 21593,49.4599999999999999992, +CLANG6:adpcm_O0:main_0, 23393,44.1500000000000000014, +CLANG6:adpcm_O1:main_0, 23393,42.9500000000000000007, +CLANG6:adpcm_O2:main_0, 17392,62.4700000000000000011, +CLANG6:adpcm_O3:main_0, 17392,62.5200000000000000004, +CLANG6:adpcm_Os:main_0, 21543,54.5099999999999999985, +GCC49:adpcm_O0:main_0, 33429,23.0499999999999999993, +GCC49:adpcm_O1:main_0, 24547,18.7199999999999999994, +GCC49:adpcm_O2:main_0, 24043,43.2599999999999999985, +GCC49:adpcm_O3:main_0, 10429,76.4499999999999999972, +GCC49:adpcm_O3_inline:main_0, 7503,99.5800000000000000017, +GCC49:adpcm_O3_vectorize:main_0, 6995,49.3100000000000000012, +GCC49:adpcm_Os:main_0, 24847,25.2099999999999999992, +GCC7:adpcm_O0:main_0, 34015,15.8599999999999999997, +GCC7:adpcm_O1:main_0, 24933,16.9500000000000000007, +GCC7:adpcm_O2:main_0, 22526,40.7200000000000000011, +GCC7:adpcm_O3:main_0, 8345,51.0699999999999999997, +GCC7:adpcm_O3_inline:main_0, 5441,59.3400000000000000001, +GCC7:adpcm_O3_vectorize:main_0, 8765,32.6100000000000000006, +GCC7:adpcm_Os:main_0, 25033,27.2199999999999999994, diff --git a/documentation/tutorial_fpl_2022/03-optimizations/Exercise1/solution/adpcm_sdc.csv b/documentation/tutorial_fpl_2022/03-optimizations/Exercise1/solution/adpcm_sdc.csv new file mode 100644 index 000000000..fdf820d8c --- /dev/null +++ b/documentation/tutorial_fpl_2022/03-optimizations/Exercise1/solution/adpcm_sdc.csv @@ -0,0 +1,28 @@ +Benchmark, CYCLES, HLS_execution_time, +CLANG11:adpcm_O0:main_0, 23693,92.9400000000000000022, +CLANG11:adpcm_O1:main_0, 23693,91.7799999999999999989, +CLANG11:adpcm_O2:main_0, 10071,147.809999999999999998, +CLANG11:adpcm_O3:main_0, 8719,183.690000000000000002, 
+CLANG11:adpcm_O3_inline:main_0, 8719,183.960000000000000006, +CLANG11:adpcm_Os:main_0, 22063,97.5100000000000000019, +CLANG6:adpcm_O0:main_0, 23443,91.2099999999999999992, +CLANG6:adpcm_O1:main_0, 23443,93.6999999999999999972, +CLANG6:adpcm_O2:main_0, 17804,124.870000000000000002, +CLANG6:adpcm_O3:main_0, 17804,129.610000000000000001, +CLANG6:adpcm_O3_inline:main_0, 17804,127.849999999999999999, +CLANG6:adpcm_O3_vectorize:main_0, 17804,126.870000000000000002, +CLANG6:adpcm_Os:main_0, 22013,105.059999999999999998, +GCC49:adpcm_O0:main_0, 33479,64.3799999999999999975, +GCC49:adpcm_O1:main_0, 24297,57.0900000000000000001, +GCC49:adpcm_O2:main_0, 22863,83.5299999999999999989, +GCC49:adpcm_O3:main_0, 9149,175.929999999999999993, +GCC49:adpcm_O3_inline:main_0, 5356,210.619999999999999996, +GCC49:adpcm_O3_vectorize:main_0, 6135,110.809999999999999998, +GCC49:adpcm_Os:main_0, 24397,68.4499999999999999972, +GCC7:adpcm_O0:main_0, 32979,46.5, +GCC7:adpcm_O1:main_0, 24297,47.0299999999999999989, +GCC7:adpcm_O2:main_0, 21513,80.2399999999999999981, +GCC7:adpcm_O3:main_0, 7653,152.220000000000000001, +GCC7:adpcm_O3_inline:main_0, 5003,136.25, +GCC7:adpcm_O3_vectorize:main_0, 8235,97.4300000000000000003, +GCC7:adpcm_Os:main_0, 24397,58.2000000000000000007, diff --git a/documentation/tutorial_fpl_2022/03-optimizations/Exercise1/solution/list b/documentation/tutorial_fpl_2022/03-optimizations/Exercise1/solution/list new file mode 100644 index 000000000..9a26f25da --- /dev/null +++ b/documentation/tutorial_fpl_2022/03-optimizations/Exercise1/solution/list @@ -0,0 +1,7 @@ +adpcm.c --benchmark-name=adpcm_O0 -O0 +adpcm.c --benchmark-name=adpcm_O1 -O1 +adpcm.c --benchmark-name=adpcm_O2 -O2 +adpcm.c --benchmark-name=adpcm_O3 -O3 +adpcm.c --benchmark-name=adpcm_O3_inline -O3 -finline-limit=1000000 +adpcm.c --benchmark-name=adpcm_O3_vectorize -O3 -ftree-vectorize +adpcm.c --benchmark-name=adpcm_Os -Os diff --git 
a/documentation/tutorial_fpl_2022/03-optimizations/Exercise1/solution/synthesize.sh b/documentation/tutorial_fpl_2022/03-optimizations/Exercise1/solution/synthesize.sh new file mode 100755 index 000000000..8d9bc09cc --- /dev/null +++ b/documentation/tutorial_fpl_2022/03-optimizations/Exercise1/solution/synthesize.sh @@ -0,0 +1,9 @@ +#!/bin/bash +abs_script=$(readlink -e $0) +dir_script=$(dirname $abs_script) +$dir_script/../../test_panda.py --tool=bambu --bambu=bambu --spider=spider \ + --args="--configuration-name=GCC49 --compiler=I386_GCC49" \ + --args="--configuration-name=GCC7 --compiler=I386_GCC7" \ + --args="--configuration-name=CLANG6 --compiler=I386_CLANG6" \ + --args="--configuration-name=CLANG11 --compiler=I386_CLANG11" \ + -c=--simulate -b$dir_script -l$dir_script/list "$@" diff --git a/documentation/tutorial_fpl_2022/03-optimizations/Exercise3/README b/documentation/tutorial_fpl_2022/03-optimizations/Exercise3/README new file mode 100644 index 000000000..cb138e2fb --- /dev/null +++ b/documentation/tutorial_fpl_2022/03-optimizations/Exercise3/README @@ -0,0 +1 @@ +Evaluate the effects on the number of cycles in using different integer division implementations on the dfdiv algorithm targeting Zynq and 66MHz diff --git a/documentation/tutorial_fpl_2022/03-optimizations/Exercise3/SPARC-GCC.h b/documentation/tutorial_fpl_2022/03-optimizations/Exercise3/SPARC-GCC.h new file mode 100755 index 000000000..523e274f6 --- /dev/null +++ b/documentation/tutorial_fpl_2022/03-optimizations/Exercise3/SPARC-GCC.h @@ -0,0 +1,88 @@ +/* ++--------------------------------------------------------------------------+ +| CHStone : a suite of benchmark programs for C-based High-Level Synthesis | +| ======================================================================== | +| | +| * Collected and Modified : Y. Hara, H. Tomiyama, S. Honda, | +| H. Takada and K. Ishii | +| Nagoya University, Japan | +| | +| * Remark : | +| 1. 
This source code is modified to unify the formats of the benchmark | +| programs in CHStone. | +| 2. Test vectors are added for CHStone. | +| 3. If "main_result" is 0 at the end of the program, the program is | +| correctly executed. | +| 4. Please follow the copyright of each benchmark program. | ++--------------------------------------------------------------------------+ +*/ +/*============================================================================ + +This C header file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic +Package, Release 2b. + +Written by John R. Hauser. This work was made possible in part by the +International Computer Science Institute, located at Suite 600, 1947 Center +Street, Berkeley, California 94704. Funding was partially provided by the +National Science Foundation under grant MIP-9311980. The original version +of this code was written as part of a project to build a fixed-point vector +processor in collaboration with the University of California at Berkeley, +overseen by Profs. Nelson Morgan and John Wawrzynek. More information +is available through the Web page `http://www.cs.berkeley.edu/~jhauser/ +arithmetic/SoftFloat.html'. + +THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has +been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES +RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS +AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES, +COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE +EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE +INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR +OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE. 
+ +Derivative works are acceptable, even for commercial purposes, so long as +(1) the source code for the derivative work includes prominent notice that +the work is derivative, and (2) the source code includes prominent notice with +these four paragraphs for those parts of this code that are retained. + +=============================================================================*/ + +/*---------------------------------------------------------------------------- +| Each of the following `typedef's defines the most convenient type that holds +| integers of at least as many bits as specified. For example, `uint8' should +| be the most convenient type that can hold unsigned integers of as many as +| 8 bits. The `flag' type must be able to hold either a 0 or 1. For most +| implementations of C, `flag', `uint8', and `int8' should all be `typedef'ed +| to the same as `int'. +*----------------------------------------------------------------------------*/ +typedef int flag; +typedef int int8; +typedef int int16; + +/*---------------------------------------------------------------------------- +| Each of the following `typedef's defines a type that holds integers +| of _exactly_ the number of bits specified. For instance, for most +| implementation of C, `bits16' and `sbits16' should be `typedef'ed to +| `unsigned short int' and `signed short int' (or `short int'), respectively. +*----------------------------------------------------------------------------*/ +typedef unsigned short int bits16; +typedef unsigned int bits32; +typedef unsigned long long int bits64; +typedef signed long long int sbits64; + +/*---------------------------------------------------------------------------- +| The `LIT64' macro takes as its argument a textual integer literal and +| if necessary ``marks'' the literal as having a 64-bit integer type. 
+| For example, the GNU C Compiler (`gcc') requires that 64-bit literals be +| appended with the letters `LL' standing for `long long', which is `gcc's +| name for the 64-bit integer type. Some compilers may allow `LIT64' to be +| defined as the identity macro: `#define LIT64( a ) a'. +*----------------------------------------------------------------------------*/ +#define LIT64( a ) a##LL + +/*---------------------------------------------------------------------------- +| The macro `INLINE' can be used before functions that should be inlined. If +| a compiler does not support explicit inlining, this macro should be defined +| to be `static'. +*----------------------------------------------------------------------------*/ +#define INLINE diff --git a/documentation/tutorial_fpl_2022/03-optimizations/Exercise3/dfdiv.c b/documentation/tutorial_fpl_2022/03-optimizations/Exercise3/dfdiv.c new file mode 100755 index 000000000..7fd9823bd --- /dev/null +++ b/documentation/tutorial_fpl_2022/03-optimizations/Exercise3/dfdiv.c @@ -0,0 +1,159 @@ +/* ++--------------------------------------------------------------------------+ +| CHStone : a suite of benchmark programs for C-based High-Level Synthesis | +| ======================================================================== | +| | +| * Collected and Modified : Y. Hara, H. Tomiyama, S. Honda, | +| H. Takada and K. Ishii | +| Nagoya University, Japan | +| | +| * Remark : | +| 1. This source code is modified to unify the formats of the benchmark | +| programs in CHStone. | +| 2. Test vectors are added for CHStone. | +| 3. If "main_result" is 0 at the end of the program, the program is | +| correctly executed. | +| 4. Please follow the copyright of each benchmark program. | ++--------------------------------------------------------------------------+ +*/ +/* + * Copyright (C) 2008 + * Y. Hara, H. Tomiyama, S. Honda, H. Takada and K. Ishii + * Nagoya University, Japan + * All rights reserved. 
+ * + * Disclaimer of Warranty + * + * These software programs are available to the user without any license fee or + * royalty on an "as is" basis. The authors disclaims any and all warranties, + * whether express, implied, or statuary, including any implied warranties or + * merchantability or of fitness for a particular purpose. In no event shall the + * copyright-holder be liable for any incidental, punitive, or consequential damages + * of any kind whatsoever arising from the use of these programs. This disclaimer + * of warranty extends to the user of these programs and user's customers, employees, + * agents, transferees, successors, and assigns. + * + */ +#include +#include "softfloat.c" + +double +ullong_to_double (unsigned long long x) +{ + union + { + double d; + unsigned long long ll; + } t; + + t.ll = x; + return t.d; +} + +/* ++--------------------------------------------------------------------------+ +| * Test Vectors (added for CHStone) | +| a_input, b_input : input data | +| z_output : expected output data | ++--------------------------------------------------------------------------+ +*/ +#define N 22 + +const float64 a_input[N] = { + 0x7FFF000000000000ULL, /* nan */ + 0x7FF0000000000000ULL, /* inf */ + 0x7FF0000000000000ULL, /* inf */ + 0x7FF0000000000000ULL, /* inf */ + 0x3FF0000000000000ULL, /* 1.0 */ + 0x3FF0000000000000ULL, /* 1.0 */ + 0x0000000000000000ULL, /* 0.0 */ + 0x3FF0000000000000ULL, /* 1.0 */ + 0x0000000000000000ULL, /* 0.0 */ + 0x8000000000000000ULL, /* -0.0 */ + 0x4008000000000000ULL, /* 3.0 */ + 0xC008000000000000ULL, /* -3.0 */ + 0x4008000000000000ULL, /* 3.0 */ + 0xC008000000000000ULL, /* -3.0 */ + 0x4000000000000000ULL, /* 2.0 */ + 0xC000000000000000ULL, /* -2.0 */ + 0x4000000000000000ULL, /* 2.0 */ + 0xC000000000000000ULL, /* -2.0 */ + 0x3FF0000000000000ULL, /* 1.0 */ + 0xBFF0000000000000ULL, /* -1.0 */ + 0x3FF0000000000000ULL, /* 1.0 */ + 0xBFF0000000000000ULL /* -1.0 */ +}; + +const float64 b_input[N] = { + 
0x3FF0000000000000ULL, /* 1.0 */ + 0x7FF8000000000000ULL, /* nan */ + 0x7FF0000000000000ULL, /* inf */ + 0x3FF0000000000000ULL, /* 1.0 */ + 0x7FF8000000000000ULL, /* nan */ + 0x7FF0000000000000ULL, /* inf */ + 0x0000000000000000ULL, /* 0.0 */ + 0x0000000000000000ULL, /* 0.0 */ + 0x3FF0000000000000ULL, /* 1.0 */ + 0x3FF0000000000000ULL, /* 1.0 */ + 0x4000000000000000ULL, /* 2.0 */ + 0x4000000000000000ULL, /* 2.0 */ + 0xC000000000000000ULL, /* 2.0 */ + 0xC000000000000000ULL, /* -2.0 */ + 0x4010000000000000ULL, /* 4.0 */ + 0x4010000000000000ULL, /* 4.0 */ + 0xC010000000000000ULL, /* -4.0 */ + 0xC010000000000000ULL, /* -4.0 */ + 0x3FF8000000000000ULL, /* 1.5 */ + 0x3FF8000000000000ULL, /* 1.5 */ + 0xBFF8000000000000ULL, /* -1.5 */ + 0xBFF8000000000000ULL /* -1.5 */ +}; + +const float64 z_output[N] = { + 0x7FFF000000000000ULL, /* nan */ + 0x7FF8000000000000ULL, /* nan */ + 0x7FFFFFFFFFFFFFFFULL, /* nan */ + 0x7FF0000000000000ULL, /* inf */ + 0x7FF8000000000000ULL, /* nan */ + 0x0000000000000000ULL, /* 0.0 */ + 0x7FFFFFFFFFFFFFFFULL, /* nan */ + 0x7FF0000000000000ULL, /* inf */ + 0x0000000000000000ULL, /* 0.0 */ + 0x8000000000000000ULL, /* -0.0 */ + 0x3FF8000000000000ULL, /* 1.5 */ + 0xBFF8000000000000ULL, /* -1.5 */ + 0xBFF8000000000000ULL, /* 1.5 */ + 0x3FF8000000000000ULL, /* -1.5 */ + 0x3FE0000000000000ULL, /* 0.5 */ + 0xBFE0000000000000ULL, /* 5.0 */ + 0xBFE0000000000000ULL, /* -5.0 */ + 0x3FE0000000000000ULL, /* 0.5 */ + 0x3FE5555555555555ULL, /* 0.666667 */ + 0xBFE5555555555555ULL, /* -0.666667 */ + 0xBFE5555555555555ULL, /* -0.666667 */ + 0x3FE5555555555555ULL /* 0.666667 */ +}; + +int +main () +{ + int main_result; + int i; + float64 x1, x2; + main_result = 0; + for (i = 0; i < N; i++) + { + float64 result; + x1 = a_input[i]; + x2 = b_input[i]; + result = float64_div (x1, x2); + main_result += (result != z_output[i]); + + printf + ("a_input=%016llx b_input=%016llx expected=%016llx output=%016llx (%lf)\n", + a_input[i], b_input[i], z_output[i], result, + 
ullong_to_double (result)); + } + printf ("%d\n", main_result); + return main_result; + } diff --git a/documentation/tutorial_fpl_2022/03-optimizations/Exercise3/hint.txt b/documentation/tutorial_fpl_2022/03-optimizations/Exercise3/hint.txt new file mode 100644 index 000000000..c68ac9a75 --- /dev/null +++ b/documentation/tutorial_fpl_2022/03-optimizations/Exercise3/hint.txt @@ -0,0 +1 @@ +--hls-div= diff --git a/documentation/tutorial_fpl_2022/03-optimizations/Exercise3/milieu.h b/documentation/tutorial_fpl_2022/03-optimizations/Exercise3/milieu.h new file mode 100755 index 000000000..4d92d5e05 --- /dev/null +++ b/documentation/tutorial_fpl_2022/03-optimizations/Exercise3/milieu.h @@ -0,0 +1,53 @@ +/* ++--------------------------------------------------------------------------+ +| CHStone : a suite of benchmark programs for C-based High-Level Synthesis | +| ======================================================================== | +| | +| * Collected and Modified : Y. Hara, H. Tomiyama, S. Honda, | +| H. Takada and K. Ishii | +| Nagoya University, Japan | +| | +| * Remark : | +| 1. This source code is modified to unify the formats of the benchmark | +| programs in CHStone. | +| 2. Test vectors are added for CHStone. | +| 3. If "main_result" is 0 at the end of the program, the program is | +| correctly executed. | +| 4. Please follow the copyright of each benchmark program. | ++--------------------------------------------------------------------------+ +*/ +/*============================================================================ + +This C header file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic +Package, Release 2b. + +Written by John R. Hauser. This work was made possible in part by the +International Computer Science Institute, located at Suite 600, 1947 Center +Street, Berkeley, California 94704. Funding was partially provided by the +National Science Foundation under grant MIP-9311980. 
The original version +of this code was written as part of a project to build a fixed-point vector +processor in collaboration with the University of California at Berkeley, +overseen by Profs. Nelson Morgan and John Wawrzynek. More information +is available through the Web page `http://www.cs.berkeley.edu/~jhauser/ +arithmetic/SoftFloat.html'. + +THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has +been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES +RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS +AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES, +COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE +EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE +INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR +OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE. + +Derivative works are acceptable, even for commercial purposes, so long as +(1) the source code for the derivative work includes prominent notice that +the work is derivative, and (2) the source code includes prominent notice with +these four paragraphs for those parts of this code that are retained. + +=============================================================================*/ + +/*---------------------------------------------------------------------------- +| Include common integer types and flags. 
+*----------------------------------------------------------------------------*/ +#include "SPARC-GCC.h" diff --git a/documentation/tutorial_fpl_2022/03-optimizations/Exercise3/softfloat-macros b/documentation/tutorial_fpl_2022/03-optimizations/Exercise3/softfloat-macros new file mode 100755 index 000000000..a735f741e --- /dev/null +++ b/documentation/tutorial_fpl_2022/03-optimizations/Exercise3/softfloat-macros @@ -0,0 +1,247 @@ +/* ++--------------------------------------------------------------------------+ +| CHStone : a suite of benchmark programs for C-based High-Level Synthesis | +| ======================================================================== | +| | +| * Collected and Modified : Y. Hara, H. Tomiyama, S. Honda, | +| H. Takada and K. Ishii | +| Nagoya University, Japan | +| | +| * Remark : | +| 1. This source code is modified to unify the formats of the benchmark | +| programs in CHStone. | +| 2. Test vectors are added for CHStone. | +| 3. If "main_result" is 0 at the end of the program, the program is | +| correctly executed. | +| 4. Please follow the copyright of each benchmark program. | ++--------------------------------------------------------------------------+ +*/ +/*============================================================================ + +This C source fragment is part of the SoftFloat IEC/IEEE Floating-point +Arithmetic Package, Release 2b. + +Written by John R. Hauser. This work was made possible in part by the +International Computer Science Institute, located at Suite 600, 1947 Center +Street, Berkeley, California 94704. Funding was partially provided by the +National Science Foundation under grant MIP-9311980. The original version +of this code was written as part of a project to build a fixed-point vector +processor in collaboration with the University of California at Berkeley, +overseen by Profs. Nelson Morgan and John Wawrzynek. 
More information +is available through the Web page `http://www.cs.berkeley.edu/~jhauser/ +arithmetic/SoftFloat.html'. + +THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has +been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES +RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS +AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES, +COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE +EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE +INSTITUTE (possibly via similar legal notice) AGAINST ALL LOSSES, COSTS, OR +OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE. + +Derivative works are acceptable, even for commercial purposes, so long as +(1) the source code for the derivative work includes prominent notice that +the work is derivative, and (2) the source code includes prominent notice with +these four paragraphs for those parts of this code that are retained. + +=============================================================================*/ + +/*---------------------------------------------------------------------------- +| Shifts `a' right by the number of bits given in `count'. If any nonzero +| bits are shifted off, they are ``jammed'' into the least significant bit of +| the result by setting the least significant bit to 1. The value of `count' +| can be arbitrarily large; in particular, if `count' is greater than 64, the +| result will be either 0 or 1, depending on whether `a' is zero or nonzero. +| The result is stored in the location pointed to by `zPtr'.
+*----------------------------------------------------------------------------*/ + +INLINE void +shift64RightJamming (bits64 a, int16 count, bits64 * zPtr) +{ + bits64 z; + + if (count == 0) + { + z = a; + } + else if (count < 64) + { + z = (a >> count) | ((a << ((-count) & 63)) != 0); + } + else + { + z = (a != 0); + } + *zPtr = z; + +} + +/*---------------------------------------------------------------------------- +| Adds the 128-bit value formed by concatenating `a0' and `a1' to the 128-bit +| value formed by concatenating `b0' and `b1'. Addition is modulo 2^128, so +| any carry out is lost. The result is broken into two 64-bit pieces which +| are stored at the locations pointed to by `z0Ptr' and `z1Ptr'. +*----------------------------------------------------------------------------*/ + +INLINE void +add128 (bits64 a0, bits64 a1, bits64 b0, bits64 b1, bits64 * z0Ptr, + bits64 * z1Ptr) +{ + bits64 z1; + + z1 = a1 + b1; + *z1Ptr = z1; + *z0Ptr = a0 + b0 + (z1 < a1); + +} + +/*---------------------------------------------------------------------------- +| Subtracts the 128-bit value formed by concatenating `b0' and `b1' from the +| 128-bit value formed by concatenating `a0' and `a1'. Subtraction is modulo +| 2^128, so any borrow out (carry out) is lost. The result is broken into two +| 64-bit pieces which are stored at the locations pointed to by `z0Ptr' and +| `z1Ptr'. +*----------------------------------------------------------------------------*/ + +INLINE void +sub128 (bits64 a0, bits64 a1, bits64 b0, bits64 b1, bits64 * z0Ptr, + bits64 * z1Ptr) +{ + + *z1Ptr = a1 - b1; + *z0Ptr = a0 - b0 - (a1 < b1); + +} + +/*---------------------------------------------------------------------------- +| Multiplies `a' by `b' to obtain a 128-bit product. The product is broken +| into two 64-bit pieces which are stored at the locations pointed to by +| `z0Ptr' and `z1Ptr'. 
+*----------------------------------------------------------------------------*/
+
+INLINE void
+mul64To128 (bits64 a, bits64 b, bits64 * z0Ptr, bits64 * z1Ptr)
+{
+  bits32 aHigh, aLow, bHigh, bLow;
+  bits64 z0, zMiddleA, zMiddleB, z1;
+
+  /* Split both operands into halves; assigning to the `bits32' variables
+     keeps only the low-order part of each value. */
+  aLow = a;
+  aHigh = a >> 32;
+  bLow = b;
+  bHigh = b >> 32;
+  /* Four partial products, each operand widened to 64 bits first. */
+  z1 = ((bits64) aLow) * bLow;
+  zMiddleA = ((bits64) aLow) * bHigh;
+  zMiddleB = ((bits64) aHigh) * bLow;
+  z0 = ((bits64) aHigh) * bHigh;
+  /* Sum the two middle products; if that sum wrapped, the lost carry is
+     worth 2^32 in the upper result word. */
+  zMiddleA += zMiddleB;
+  z0 += (((bits64) (zMiddleA < zMiddleB)) << 32) + (zMiddleA >> 32);
+  /* The low half of the middle sum belongs in the top half of z1; a wrap of
+     that addition carries one into z0. */
+  zMiddleA <<= 32;
+  z1 += zMiddleA;
+  z0 += (z1 < zMiddleA);
+  /* z0 is the upper 64 bits of the product, z1 the lower 64 bits. */
+  *z1Ptr = z1;
+  *z0Ptr = z0;
+
+}
+
+/*----------------------------------------------------------------------------
+| Returns an approximation to the 64-bit integer quotient obtained by dividing
+| `b' into the 128-bit value formed by concatenating `a0' and `a1'. The
+| divisor `b' must be at least 2^63. If q is the exact quotient truncated
+| toward zero, the approximation returned lies between q and q + 2 inclusive.
+| If the exact quotient q is larger than 64 bits, the maximum positive 64-bit
+| unsigned integer is returned.
+*----------------------------------------------------------------------------*/
+
+static bits64
+estimateDiv128To64 (bits64 a0, bits64 a1, bits64 b)
+{
+  bits64 b0, b1;
+  bits64 rem0, rem1, term0, term1;
+  bits64 z;
+
+  /* Saturated result for the case the header comment describes as the
+     quotient not fitting in 64 bits. */
+  if (b <= a0)
+    return LIT64 (0xFFFFFFFFFFFFFFFF);
+  /* b0 is the upper 32 bits of the divisor. */
+  b0 = b >> 32;
+  /* Estimate of the upper 32 quotient bits, saturated to all ones when
+     a0 / b0 would not fit in 32 bits. */
+  z = (b0 << 32 <= a0) ? LIT64 (0xFFFFFFFF00000000) : (a0 / b0) << 32;
+  /* Remainder for this estimate: (a0:a1) - b * z. */
+  mul64To128 (b, z, &term0, &term1);
+  sub128 (a0, a1, term0, term1, &rem0, &rem1);
+  /* A negative remainder means the estimate was too large: lower it by
+     2^32 and add b * 2^32 (held as the pair b0:b1) back each time. */
+  while (((sbits64) rem0) < 0)
+    {
+      z -= LIT64 (0x100000000);
+      b1 = b << 32;
+      add128 (rem0, rem1, b0, b1, &rem0, &rem1);
+    }
+  /* Take the middle 64 bits of the remainder and estimate the lower 32
+     quotient bits from them in the same way. */
+  rem0 = (rem0 << 32) | (rem1 >> 32);
+  z |= (b0 << 32 <= rem0) ? 0xFFFFFFFF : rem0 / b0;
+  return z;
+
+}
+
+/*----------------------------------------------------------------------------
+| Returns the number of leading 0 bits before the most-significant 1 bit of
+| `a'.
If `a' is zero, 32 is returned. +*----------------------------------------------------------------------------*/ + +static int8 +countLeadingZeros32 (bits32 a) +{ + static const int8 countLeadingZerosHigh[256] = { + 8, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + }; + int8 shiftCount; + + shiftCount = 0; + if (a < 0x10000) + { + shiftCount += 16; + a <<= 16; + } + if (a < 0x1000000) + { + shiftCount += 8; + a <<= 8; + } + shiftCount += countLeadingZerosHigh[a >> 24]; + return shiftCount; + +} + +/*---------------------------------------------------------------------------- +| Returns the number of leading 0 bits before the most-significant 1 bit of +| `a'. If `a' is zero, 64 is returned. 
+*----------------------------------------------------------------------------*/ + +static int8 +countLeadingZeros64 (bits64 a) +{ + int8 shiftCount; + + shiftCount = 0; + if (a < ((bits64) 1) << 32) + { + shiftCount += 32; + } + else + { + a >>= 32; + } + shiftCount += countLeadingZeros32 (a); + return shiftCount; + +} diff --git a/documentation/tutorial_fpl_2022/03-optimizations/Exercise3/softfloat-specialize b/documentation/tutorial_fpl_2022/03-optimizations/Exercise3/softfloat-specialize new file mode 100755 index 000000000..3c5105928 --- /dev/null +++ b/documentation/tutorial_fpl_2022/03-optimizations/Exercise3/softfloat-specialize @@ -0,0 +1,123 @@ +/* ++--------------------------------------------------------------------------+ +| CHStone : a suite of benchmark programs for C-based High-Level Synthesis | +| ======================================================================== | +| | +| * Collected and Modified : Y. Hara, H. Tomiyama, S. Honda, | +| H. Takada and K. Ishii | +| Nagoya University, Japan | +| | +| * Remark : | +| 1. This source code is modified to unify the formats of the benchmark | +| programs in CHStone. | +| 2. Test vectors are added for CHStone. | +| 3. If "main_result" is 0 at the end of the program, the program is | +| correctly executed. | +| 4. Please follow the copyright of each benchmark program. | ++--------------------------------------------------------------------------+ +*/ +/*============================================================================ + +This C source fragment is part of the SoftFloat IEC/IEEE Floating-point +Arithmetic Package, Release 2b. + +Written by John R. Hauser. This work was made possible in part by the +International Computer Science Institute, located at Suite 600, 1947 Center +Street, Berkeley, California 94704. Funding was partially provided by the +National Science Foundation under grant MIP-9311980. 
The original version
+of this code was written as part of a project to build a fixed-point vector
+processor in collaboration with the University of California at Berkeley,
+overseen by Profs. Nelson Morgan and John Wawrzynek. More information
+is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
+arithmetic/SoftFloat.html'.
+
+THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has
+been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
+RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
+AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
+COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
+EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
+INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
+OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
+
+Derivative works are acceptable, even for commercial purposes, so long as
+(1) the source code for the derivative work includes prominent notice that
+the work is derivative, and (2) the source code includes prominent notice with
+these four paragraphs for those parts of this code that are retained.
+
+=============================================================================*/
+
+/*----------------------------------------------------------------------------
+| Underflow tininess-detection mode. Here it is a preprocessor macro that
+| expands to `float_tininess_before_rounding', so the mode is fixed when the
+| code is compiled.
+| NOTE(review): the wording this comment used to carry (``statically
+| initialized to default value'', and a declaration in `softfloat.h' that
+| must match the `int8' type) describes a variable rather than a macro;
+| confirm that no such declaration is expected by other files.
+*----------------------------------------------------------------------------*/
+#define float_detect_tininess float_tininess_before_rounding
+
+/*----------------------------------------------------------------------------
+| Raises the exceptions specified by `flags'. Floating-point traps can be
+| defined here if desired. It is currently not possible for such a trap
+| to substitute a result value.
If traps are not implemented, this routine +| should be simply `float_exception_flags |= flags;'. +*----------------------------------------------------------------------------*/ + +void +float_raise (int8 flags) +{ + float_exception_flags |= flags; + +} + + +/*---------------------------------------------------------------------------- +| The pattern for a default generated double-precision NaN. +*----------------------------------------------------------------------------*/ +#define float64_default_nan LIT64( 0x7FFFFFFFFFFFFFFF ) + +/*---------------------------------------------------------------------------- +| Returns 1 if the double-precision floating-point value `a' is a NaN; +| otherwise returns 0. +*----------------------------------------------------------------------------*/ + +flag +float64_is_nan (float64 a) +{ + + return (LIT64 (0xFFE0000000000000) < (bits64) (a << 1)); + +} + +/*---------------------------------------------------------------------------- +| Returns 1 if the double-precision floating-point value `a' is a signaling +| NaN; otherwise returns 0. +*----------------------------------------------------------------------------*/ + +flag +float64_is_signaling_nan (float64 a) +{ + + return (((a >> 51) & 0xFFF) == 0xFFE) && (a & LIT64 (0x0007FFFFFFFFFFFF)); + +} + +/*---------------------------------------------------------------------------- +| Takes two double-precision floating-point values `a' and `b', one of which +| is a NaN, and returns the appropriate NaN result. If either `a' or `b' is a +| signaling NaN, the invalid exception is raised. 
+*----------------------------------------------------------------------------*/ + +static float64 +propagateFloat64NaN (float64 a, float64 b) +{ + flag aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN; + + aIsNaN = float64_is_nan (a); + aIsSignalingNaN = float64_is_signaling_nan (a); + bIsNaN = float64_is_nan (b); + bIsSignalingNaN = float64_is_signaling_nan (b); + a |= LIT64 (0x0008000000000000); + b |= LIT64 (0x0008000000000000); + if (aIsSignalingNaN | bIsSignalingNaN) + float_raise (float_flag_invalid); + return bIsSignalingNaN ? b : aIsSignalingNaN ? a : bIsNaN ? b : a; + +} diff --git a/documentation/tutorial_fpl_2022/03-optimizations/Exercise3/softfloat.c b/documentation/tutorial_fpl_2022/03-optimizations/Exercise3/softfloat.c new file mode 100755 index 000000000..8604da331 --- /dev/null +++ b/documentation/tutorial_fpl_2022/03-optimizations/Exercise3/softfloat.c @@ -0,0 +1,316 @@ +/* ++--------------------------------------------------------------------------+ +| CHStone : a suite of benchmark programs for C-based High-Level Synthesis | +| ======================================================================== | +| | +| * Collected and Modified : Y. Hara, H. Tomiyama, S. Honda, | +| H. Takada and K. Ishii | +| Nagoya University, Japan | +| | +| * Remark : | +| 1. This source code is modified to unify the formats of the benchmark | +| programs in CHStone. | +| 2. Test vectors are added for CHStone. | +| 3. If "main_result" is 0 at the end of the program, the program is | +| correctly executed. | +| 4. Please follow the copyright of each benchmark program. | ++--------------------------------------------------------------------------+ +*/ +/*============================================================================ + +This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic +Package, Release 2b. + +Written by John R. Hauser. 
This work was made possible in part by the +International Computer Science Institute, located at Suite 600, 1947 Center +Street, Berkeley, California 94704. Funding was partially provided by the +National Science Foundation under grant MIP-9311980. The original version +of this code was written as part of a project to build a fixed-point vector +processor in collaboration with the University of California at Berkeley, +overseen by Profs. Nelson Morgan and John Wawrzynek. More information +is available through the Web page `http://www.cs.berkeley.edu/~jhauser/ +arithmetic/SoftFloat.html'. + +THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has +been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES +RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS +AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES, +COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE +EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE +INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR +OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE. + +Derivative works are acceptable, even for commercial purposes, so long as +(1) the source code for the derivative work includes prominent notice that +the work is derivative, and (2) the source code includes prominent notice with +these four paragraphs for those parts of this code that are retained. + +=============================================================================*/ + +#include "milieu.h" +#include "softfloat.h" + +/*---------------------------------------------------------------------------- +| Floating-point rounding mode, extended double-precision rounding precision, +| and exception flags. 
+*----------------------------------------------------------------------------*/ +int8 float_rounding_mode = float_round_nearest_even; +int8 float_exception_flags = 0; + +/*---------------------------------------------------------------------------- +| Primitive arithmetic functions, including multi-word arithmetic, and +| division and square root approximations. (Can be specialized to target if +| desired.) +*----------------------------------------------------------------------------*/ +#include "softfloat-macros" + +/*---------------------------------------------------------------------------- +| Functions and definitions to determine: (1) whether tininess for underflow +| is detected before or after rounding by default, (2) what (if anything) +| happens when exceptions are raised, (3) how signaling NaNs are distinguished +| from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs +| are propagated from function inputs to output. These details are target- +| specific. +*----------------------------------------------------------------------------*/ +#include "softfloat-specialize" + +/*---------------------------------------------------------------------------- +| Returns the fraction bits of the double-precision floating-point value `a'. +*----------------------------------------------------------------------------*/ + +INLINE bits64 +extractFloat64Frac (float64 a) +{ + + return a & LIT64 (0x000FFFFFFFFFFFFF); + +} + +/*---------------------------------------------------------------------------- +| Returns the exponent bits of the double-precision floating-point value `a'. +*----------------------------------------------------------------------------*/ + +INLINE int16 +extractFloat64Exp (float64 a) +{ + + return (a >> 52) & 0x7FF; + +} + +/*---------------------------------------------------------------------------- +| Returns the sign bit of the double-precision floating-point value `a'. 
+*----------------------------------------------------------------------------*/ + +INLINE flag +extractFloat64Sign (float64 a) +{ + + return a >> 63; + +} + +/*---------------------------------------------------------------------------- +| Normalizes the subnormal double-precision floating-point value represented +| by the denormalized significand `aSig'. The normalized exponent and +| significand are stored at the locations pointed to by `zExpPtr' and +| `zSigPtr', respectively. +*----------------------------------------------------------------------------*/ + +static void +normalizeFloat64Subnormal (bits64 aSig, int16 * zExpPtr, bits64 * zSigPtr) +{ + int8 shiftCount; + + shiftCount = countLeadingZeros64 (aSig) - 11; + *zSigPtr = aSig << shiftCount; + *zExpPtr = 1 - shiftCount; + +} + +/*---------------------------------------------------------------------------- +| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a +| double-precision floating-point value, returning the result. After being +| shifted into the proper positions, the three fields are simply added +| together to form the result. This means that any integer portion of `zSig' +| will be added into the exponent. Since a properly normalized significand +| will have an integer portion equal to 1, the `zExp' input should be 1 less +| than the desired result exponent whenever `zSig' is a complete, normalized +| significand. +*----------------------------------------------------------------------------*/ + +INLINE float64 +packFloat64 (flag zSign, int16 zExp, bits64 zSig) +{ + + return (((bits64) zSign) << 63) + (((bits64) zExp) << 52) + zSig; + +} + +/*---------------------------------------------------------------------------- +| Takes an abstract floating-point value having sign `zSign', exponent `zExp', +| and significand `zSig', and returns the proper double-precision floating- +| point value corresponding to the abstract input. 
Ordinarily, the abstract +| value is simply rounded and packed into the double-precision format, with +| the inexact exception raised if the abstract input cannot be represented +| exactly. However, if the abstract value is too large, the overflow and +| inexact exceptions are raised and an infinity or maximal finite value is +| returned. If the abstract value is too small, the input value is rounded +| to a subnormal number, and the underflow and inexact exceptions are raised +| if the abstract input cannot be represented exactly as a subnormal double- +| precision floating-point number. +| The input significand `zSig' has its binary point between bits 62 +| and 61, which is 10 bits to the left of the usual location. This shifted +| significand must be normalized or smaller. If `zSig' is not normalized, +| `zExp' must be 0; in that case, the result returned is a subnormal number, +| and it must not require rounding. In the usual case that `zSig' is +| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. +| The handling of underflow and overflow follows the IEC/IEEE Standard for +| Binary Floating-Point Arithmetic. 
+*----------------------------------------------------------------------------*/
+
+static float64
+roundAndPackFloat64 (flag zSign, int16 zExp, bits64 zSig)
+{
+  int8 roundingMode;
+  flag roundNearestEven, isTiny;
+  int16 roundIncrement, roundBits;
+
+  /* Choose the value added before the low 10 bits of `zSig' are dropped:
+     0x200 (half of the dropped range) for round-to-nearest, 0 to truncate,
+     0x3FF to round the magnitude up. */
+  roundingMode = float_rounding_mode;
+  roundNearestEven = (roundingMode == float_round_nearest_even);
+  roundIncrement = 0x200;
+  if (!roundNearestEven)
+    {
+      if (roundingMode == float_round_to_zero)
+        {
+          roundIncrement = 0;
+        }
+      else
+        {
+          /* Directed rounding: truncate the magnitude when the rounding
+             direction points toward zero for this sign. */
+          roundIncrement = 0x3FF;
+          if (zSign)
+            {
+              if (roundingMode == float_round_up)
+                roundIncrement = 0;
+            }
+          else
+            {
+              if (roundingMode == float_round_down)
+                roundIncrement = 0;
+            }
+        }
+    }
+  /* The 10 low bits that the final shift will discard. */
+  roundBits = zSig & 0x3FF;
+  /* The cast to the unsigned `bits16' type makes a negative `zExp' compare
+     as a large value, so this one test catches both out-of-range ends. */
+  if (0x7FD <= (bits16) zExp)
+    {
+      if ((0x7FD < zExp)
+          || ((zExp == 0x7FD) && ((sbits64) (zSig + roundIncrement) < 0)))
+        {
+          /* Overflow: the result is infinity, or the encoding just below it
+             when the increment is 0 (the subtraction of 1). */
+          float_raise (float_flag_overflow | float_flag_inexact);
+          return packFloat64 (zSign, 0x7FF, 0) - (roundIncrement == 0);
+        }
+      if (zExp < 0)
+        {
+          /* Too small for a normal number: shift the significand right by
+             -zExp, jamming lost bits into the lowest bit, and continue with
+             exponent 0. */
+          isTiny = (float_detect_tininess == float_tininess_before_rounding)
+            || (zExp < -1)
+            || (zSig + roundIncrement < LIT64 (0x8000000000000000));
+          shift64RightJamming (zSig, -zExp, &zSig);
+          zExp = 0;
+          roundBits = zSig & 0x3FF;
+          /* Underflow is raised only when the tiny result is also inexact. */
+          if (isTiny && roundBits)
+            float_raise (float_flag_underflow);
+        }
+    }
+  /* The inexact flag is set directly here, not through float_raise. */
+  if (roundBits)
+    float_exception_flags |= float_flag_inexact;
+  zSig = (zSig + roundIncrement) >> 10;
+  /* Exact tie in round-to-nearest-even (dropped bits equal 0x200): clear
+     the lowest result bit so the result is even. */
+  zSig &= ~(((roundBits ^ 0x200) == 0) & roundNearestEven);
+  /* A significand of zero is packed with a zero exponent. */
+  if (zSig == 0)
+    zExp = 0;
+  return packFloat64 (zSign, zExp, zSig);
+
+}
+
+/*----------------------------------------------------------------------------
+| Returns the result of dividing the double-precision floating-point value `a'
+| by the corresponding value `b'. The operation is performed according to
+| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
+*----------------------------------------------------------------------------*/
+
+float64
+float64_div (float64 a, float64 b)
+{
+  flag aSign, bSign, zSign;
+  int16 aExp, bExp, zExp;
+  bits64 aSig, bSig, zSig;
+  bits64 rem0, rem1, term0, term1;
+
+  /* Unpack both operands; the sign of the quotient is the XOR of the
+     operand signs. */
+  aSig = extractFloat64Frac (a);
+  aExp = extractFloat64Exp (a);
+  aSign = extractFloat64Sign (a);
+  bSig = extractFloat64Frac (b);
+  bExp = extractFloat64Exp (b);
+  bSign = extractFloat64Sign (b);
+  zSign = aSign ^ bSign;
+  /* `a' has an all-ones exponent: it is a NaN or an infinity. */
+  if (aExp == 0x7FF)
+    {
+      if (aSig)
+        return propagateFloat64NaN (a, b);
+      if (bExp == 0x7FF)
+        {
+          if (bSig)
+            return propagateFloat64NaN (a, b);
+          /* infinity / infinity: invalid, default NaN. */
+          float_raise (float_flag_invalid);
+          return float64_default_nan;
+        }
+      /* infinity / anything else: signed infinity. */
+      return packFloat64 (zSign, 0x7FF, 0);
+    }
+  /* `b' is a NaN or an infinity while `a' is neither. */
+  if (bExp == 0x7FF)
+    {
+      if (bSig)
+        return propagateFloat64NaN (a, b);
+      /* a / infinity: signed zero. */
+      return packFloat64 (zSign, 0, 0);
+    }
+  /* `b' is zero or subnormal. */
+  if (bExp == 0)
+    {
+      if (bSig == 0)
+        {
+          if ((aExp | aSig) == 0)
+            {
+              /* 0 / 0: invalid, default NaN. */
+              float_raise (float_flag_invalid);
+              return float64_default_nan;
+            }
+          /* nonzero / 0: divide-by-zero, signed infinity. */
+          float_raise (float_flag_divbyzero);
+          return packFloat64 (zSign, 0x7FF, 0);
+        }
+      normalizeFloat64Subnormal (bSig, &bExp, &bSig);
+    }
+  /* `a' is zero or subnormal. */
+  if (aExp == 0)
+    {
+      if (aSig == 0)
+        return packFloat64 (zSign, 0, 0);
+      normalizeFloat64Subnormal (aSig, &aExp, &aSig);
+    }
+  zExp = aExp - bExp + 0x3FD;
+  /* Set bit 52 (the implicit leading one) on both significands and move
+     them up: the dividend by 10 places, the divisor by 11. */
+  aSig = (aSig | LIT64 (0x0010000000000000)) << 10;
+  bSig = (bSig | LIT64 (0x0010000000000000)) << 11;
+  /* NOTE(review): presumably this keeps the quotient in the normalized
+     range roundAndPackFloat64 expects -- confirm. */
+  if (bSig <= (aSig + aSig))
+    {
+      aSig >>= 1;
+      ++zExp;
+    }
+  zSig = estimateDiv128To64 (aSig, 0, bSig);
+  /* When the low 9 bits of the estimate are 0, 1 or 2, compute the true
+     remainder, step the quotient down while that remainder is negative, and
+     record a nonzero low remainder word in the lowest quotient bit. */
+  if ((zSig & 0x1FF) <= 2)
+    {
+      mul64To128 (bSig, zSig, &term0, &term1);
+      sub128 (aSig, 0, term0, term1, &rem0, &rem1);
+      while ((sbits64) rem0 < 0)
+        {
+          --zSig;
+          add128 (rem0, rem1, 0, bSig, &rem0, &rem1);
+        }
+      zSig |= (rem1 != 0);
+    }
+  return roundAndPackFloat64 (zSign, zExp, zSig);
+
+}
diff --git a/documentation/tutorial_fpl_2022/03-optimizations/Exercise3/softfloat.h b/documentation/tutorial_fpl_2022/03-optimizations/Exercise3/softfloat.h
new file mode 100755
index 000000000..6d075ca15
---
/dev/null +++ b/documentation/tutorial_fpl_2022/03-optimizations/Exercise3/softfloat.h @@ -0,0 +1,77 @@ +/* ++--------------------------------------------------------------------------+ +| CHStone : a suite of benchmark programs for C-based High-Level Synthesis | +| ======================================================================== | +| | +| * Collected and Modified : Y. Hara, H. Tomiyama, S. Honda, | +| H. Takada and K. Ishii | +| Nagoya University, Japan | +| | +| * Remark : | +| 1. This source code is modified to unify the formats of the benchmark | +| programs in CHStone. | +| 2. Test vectors are added for CHStone. | +| 3. If "main_result" is 0 at the end of the program, the program is | +| correctly executed. | +| 4. Please follow the copyright of each benchmark program. | ++--------------------------------------------------------------------------+ +*/ +/*============================================================================ + +This C header file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic +Package, Release 2b. + +Written by John R. Hauser. This work was made possible in part by the +International Computer Science Institute, located at Suite 600, 1947 Center +Street, Berkeley, California 94704. Funding was partially provided by the +National Science Foundation under grant MIP-9311980. The original version +of this code was written as part of a project to build a fixed-point vector +processor in collaboration with the University of California at Berkeley, +overseen by Profs. Nelson Morgan and John Wawrzynek. More information +is available through the Web page `http://www.cs.berkeley.edu/~jhauser/ +arithmetic/SoftFloat.html'. + +THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has +been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES +RESULT IN INCORRECT BEHAVIOR. 
USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
+AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
+COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
+EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
+INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
+OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
+
+Derivative works are acceptable, even for commercial purposes, so long as
+(1) the source code for the derivative work includes prominent notice that
+the work is derivative, and (2) the source code includes prominent notice with
+these four paragraphs for those parts of this code that are retained.
+
+=============================================================================*/
+
+/*----------------------------------------------------------------------------
+| Software IEC/IEEE floating-point types. Both are plain unsigned integer
+| types; the routines in `softfloat.c' take the sign, exponent and fraction
+| fields of a `float64' apart with shifts and masks.
+*----------------------------------------------------------------------------*/
+typedef unsigned int float32;
+typedef unsigned long long float64;
+
+/*----------------------------------------------------------------------------
+| Software IEC/IEEE floating-point underflow tininess-detection mode.
+| `roundAndPackFloat64' compares `float_detect_tininess' against
+| `float_tininess_before_rounding'.
+*----------------------------------------------------------------------------*/
+#define float_tininess_after_rounding 0
+#define float_tininess_before_rounding 1
+
+/*----------------------------------------------------------------------------
+| Software IEC/IEEE floating-point rounding mode. These are the values that
+| `roundAndPackFloat64' compares `float_rounding_mode' against.
+*----------------------------------------------------------------------------*/
+#define float_round_nearest_even 0
+#define float_round_to_zero 1
+#define float_round_up 2
+#define float_round_down 3
+
+/*----------------------------------------------------------------------------
+| Software IEC/IEEE floating-point exception flags.
+*----------------------------------------------------------------------------*/
+/* Each value is a distinct single bit, so several flags can be combined
+   with `|' and accumulated in one integer. */
+#define float_flag_inexact 1
+#define float_flag_divbyzero 2
+#define float_flag_underflow 4
+#define float_flag_overflow 8
+#define float_flag_invalid 16
diff --git a/documentation/tutorial_fpl_2022/03-optimizations/Exercise3/solution/SPARC-GCC.h b/documentation/tutorial_fpl_2022/03-optimizations/Exercise3/solution/SPARC-GCC.h
new file mode 100755
index 000000000..523e274f6
--- /dev/null
+++ b/documentation/tutorial_fpl_2022/03-optimizations/Exercise3/solution/SPARC-GCC.h
@@ -0,0 +1,88 @@
+/*
++--------------------------------------------------------------------------+
+| CHStone : a suite of benchmark programs for C-based High-Level Synthesis |
+| ======================================================================== |
+| |
+| * Collected and Modified : Y. Hara, H. Tomiyama, S. Honda, |
+| H. Takada and K. Ishii |
+| Nagoya University, Japan |
+| |
+| * Remark : |
+| 1. This source code is modified to unify the formats of the benchmark |
+| programs in CHStone. |
+| 2. Test vectors are added for CHStone. |
+| 3. If "main_result" is 0 at the end of the program, the program is |
+| correctly executed. |
+| 4. Please follow the copyright of each benchmark program. |
++--------------------------------------------------------------------------+
+*/
+/*============================================================================
+
+This C header file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
+Package, Release 2b.
+
+Written by John R. Hauser. This work was made possible in part by the
+International Computer Science Institute, located at Suite 600, 1947 Center
+Street, Berkeley, California 94704. Funding was partially provided by the
+National Science Foundation under grant MIP-9311980.
The original version +of this code was written as part of a project to build a fixed-point vector +processor in collaboration with the University of California at Berkeley, +overseen by Profs. Nelson Morgan and John Wawrzynek. More information +is available through the Web page `http://www.cs.berkeley.edu/~jhauser/ +arithmetic/SoftFloat.html'. + +THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has +been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES +RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS +AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES, +COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE +EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE +INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR +OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE. + +Derivative works are acceptable, even for commercial purposes, so long as +(1) the source code for the derivative work includes prominent notice that +the work is derivative, and (2) the source code includes prominent notice with +these four paragraphs for those parts of this code that are retained. + +=============================================================================*/ + +/*---------------------------------------------------------------------------- +| Each of the following `typedef's defines the most convenient type that holds +| integers of at least as many bits as specified. For example, `uint8' should +| be the most convenient type that can hold unsigned integers of as many as +| 8 bits. The `flag' type must be able to hold either a 0 or 1. For most +| implementations of C, `flag', `uint8', and `int8' should all be `typedef'ed +| to the same as `int'. 
+*----------------------------------------------------------------------------*/ +typedef int flag; +typedef int int8; +typedef int int16; + +/*---------------------------------------------------------------------------- +| Each of the following `typedef's defines a type that holds integers +| of _exactly_ the number of bits specified. For instance, for most +| implementation of C, `bits16' and `sbits16' should be `typedef'ed to +| `unsigned short int' and `signed short int' (or `short int'), respectively. +*----------------------------------------------------------------------------*/ +typedef unsigned short int bits16; +typedef unsigned int bits32; +typedef unsigned long long int bits64; +typedef signed long long int sbits64; + +/*---------------------------------------------------------------------------- +| The `LIT64' macro takes as its argument a textual integer literal and +| if necessary ``marks'' the literal as having a 64-bit integer type. +| For example, the GNU C Compiler (`gcc') requires that 64-bit literals be +| appended with the letters `LL' standing for `long long', which is `gcc's +| name for the 64-bit integer type. Some compilers may allow `LIT64' to be +| defined as the identity macro: `#define LIT64( a ) a'. +*----------------------------------------------------------------------------*/ +#define LIT64( a ) a##LL + +/*---------------------------------------------------------------------------- +| The macro `INLINE' can be used before functions that should be inlined. If +| a compiler does not support explicit inlining, this macro should be defined +| to be `static'. 
+*----------------------------------------------------------------------------*/ +#define INLINE diff --git a/documentation/tutorial_fpl_2022/03-optimizations/Exercise3/solution/dfdiv.c b/documentation/tutorial_fpl_2022/03-optimizations/Exercise3/solution/dfdiv.c new file mode 100755 index 000000000..7fd9823bd --- /dev/null +++ b/documentation/tutorial_fpl_2022/03-optimizations/Exercise3/solution/dfdiv.c @@ -0,0 +1,159 @@ +/* ++--------------------------------------------------------------------------+ +| CHStone : a suite of benchmark programs for C-based High-Level Synthesis | +| ======================================================================== | +| | +| * Collected and Modified : Y. Hara, H. Tomiyama, S. Honda, | +| H. Takada and K. Ishii | +| Nagoya University, Japan | +| | +| * Remark : | +| 1. This source code is modified to unify the formats of the benchmark | +| programs in CHStone. | +| 2. Test vectors are added for CHStone. | +| 3. If "main_result" is 0 at the end of the program, the program is | +| correctly executed. | +| 4. Please follow the copyright of each benchmark program. | ++--------------------------------------------------------------------------+ +*/ +/* + * Copyright (C) 2008 + * Y. Hara, H. Tomiyama, S. Honda, H. Takada and K. Ishii + * Nagoya University, Japan + * All rights reserved. + * + * Disclaimer of Warranty + * + * These software programs are available to the user without any license fee or + * royalty on an "as is" basis. The authors disclaims any and all warranties, + * whether express, implied, or statuary, including any implied warranties or + * merchantability or of fitness for a particular purpose. In no event shall the + * copyright-holder be liable for any incidental, punitive, or consequential damages + * of any kind whatsoever arising from the use of these programs. 
This disclaimer + * of warranty extends to the user of these programs and user's customers, employees, + * agents, transferees, successors, and assigns. + * + */ +#include /* NOTE(review): the header name is missing on this line, most likely lost in extraction; printf is called further down, so it was presumably stdio.h (confirm against the upstream file) */ +#include "softfloat.c" +/* Returns the double whose object representation is the 64-bit pattern x: x is stored through one union member and read back through the other, so no numeric conversion takes place. */ +double +ullong_to_double (unsigned long long x) +{ + union + { + double d; + unsigned long long ll; + } t; + + t.ll = x; + return t.d; +} + +/* ++--------------------------------------------------------------------------+ +| * Test Vectors (added for CHStone) | +| a_input, b_input : input data | +| z_output : expected output data | ++--------------------------------------------------------------------------+ +*/ +#define N 22 + +const float64 a_input[N] = { + 0x7FFF000000000000ULL, /* nan */ + 0x7FF0000000000000ULL, /* inf */ + 0x7FF0000000000000ULL, /* inf */ + 0x7FF0000000000000ULL, /* inf */ + 0x3FF0000000000000ULL, /* 1.0 */ + 0x3FF0000000000000ULL, /* 1.0 */ + 0x0000000000000000ULL, /* 0.0 */ + 0x3FF0000000000000ULL, /* 1.0 */ + 0x0000000000000000ULL, /* 0.0 */ + 0x8000000000000000ULL, /* -0.0 */ + 0x4008000000000000ULL, /* 3.0 */ + 0xC008000000000000ULL, /* -3.0 */ + 0x4008000000000000ULL, /* 3.0 */ + 0xC008000000000000ULL, /* -3.0 */ + 0x4000000000000000ULL, /* 2.0 */ + 0xC000000000000000ULL, /* -2.0 */ + 0x4000000000000000ULL, /* 2.0 */ + 0xC000000000000000ULL, /* -2.0 */ + 0x3FF0000000000000ULL, /* 1.0 */ + 0xBFF0000000000000ULL, /* -1.0 */ + 0x3FF0000000000000ULL, /* 1.0 */ + 0xBFF0000000000000ULL /* -1.0 */ +}; + +const float64 b_input[N] = { + 0x3FF0000000000000ULL, /* 1.0 */ + 0x7FF8000000000000ULL, /* nan */ + 0x7FF0000000000000ULL, /* inf */ + 0x3FF0000000000000ULL, /* 1.0 */ + 0x7FF8000000000000ULL, /* nan */ + 0x7FF0000000000000ULL, /* inf */ + 0x0000000000000000ULL, /* 0.0 */ + 0x0000000000000000ULL, /* 0.0 */ + 0x3FF0000000000000ULL, /* 1.0 */ + 0x3FF0000000000000ULL, /* 1.0 */ + 0x4000000000000000ULL, /* 2.0 */ + 0x4000000000000000ULL, /* 2.0 */ + 0xC000000000000000ULL, /* -2.0 */ + 0xC000000000000000ULL, /* -2.0 */ + 0x4010000000000000ULL, /* 4.0 */
+ 0x4010000000000000ULL, /* 4.0 */ + 0xC010000000000000ULL, /* -4.0 */ + 0xC010000000000000ULL, /* -4.0 */ + 0x3FF8000000000000ULL, /* 1.5 */ + 0x3FF8000000000000ULL, /* 1.5 */ + 0xBFF8000000000000ULL, /* -1.5 */ + 0xBFF8000000000000ULL /* -1.5 */ +}; + +const float64 z_output[N] = { + 0x7FFF000000000000ULL, /* nan */ + 0x7FF8000000000000ULL, /* nan */ + 0x7FFFFFFFFFFFFFFFULL, /* nan */ + 0x7FF0000000000000ULL, /* inf */ + 0x7FF8000000000000ULL, /* nan */ + 0x0000000000000000ULL, /* 0.0 */ + 0x7FFFFFFFFFFFFFFFULL, /* nan */ + 0x7FF0000000000000ULL, /* inf */ + 0x0000000000000000ULL, /* 0.0 */ + 0x8000000000000000ULL, /* -0.0 */ + 0x3FF8000000000000ULL, /* 1.5 */ + 0xBFF8000000000000ULL, /* -1.5 */ + 0xBFF8000000000000ULL, /* -1.5 */ + 0x3FF8000000000000ULL, /* 1.5 */ + 0x3FE0000000000000ULL, /* 0.5 */ + 0xBFE0000000000000ULL, /* -0.5 */ + 0xBFE0000000000000ULL, /* -0.5 */ + 0x3FE0000000000000ULL, /* 0.5 */ + 0x3FE5555555555555ULL, /* 0.666667 */ + 0xBFE5555555555555ULL, /* -0.666667 */ + 0xBFE5555555555555ULL, /* -0.666667 */ + 0x3FE5555555555555ULL /* 0.666667 */ +}; +/* Runs float64_div on every (a_input[i], b_input[i]) pair, prints the operands together with the expected and actual bit patterns, and returns the number of results that differ from z_output[i] (0 when all match). */ +int +main () +{ + int main_result; + int i; + float64 x1, x2; + main_result = 0; + for (i = 0; i < N; i++) + { + float64 result; + x1 = a_input[i]; + x2 = b_input[i]; + result = float64_div (x1, x2); + main_result += (result != z_output[i]); /* comparison is on raw bit patterns, so NaN results must match the expected pattern exactly */ + + printf + ("a_input=%016llx b_input=%016llx expected=%016llx output=%016llx (%lf)\n", + a_input[i], b_input[i], z_output[i], result, + ullong_to_double (result)); + } + printf ("%d\n", main_result); + return main_result; + } diff --git a/documentation/tutorial_fpl_2022/03-optimizations/Exercise3/solution/dfdiv.csv b/documentation/tutorial_fpl_2022/03-optimizations/Exercise3/solution/dfdiv.csv new file mode 100644 index 000000000..daedfee6a --- /dev/null +++ b/documentation/tutorial_fpl_2022/03-optimizations/Exercise3/solution/dfdiv.csv @@ -0,0 +1,6 @@ +Benchmark, CYCLES, HLS_execution_time, +GCC49:dfdiv_NR:main_0, 825,44.9199999999999999983,
+GCC49:dfdiv_as:main_0, 841,30.1399999999999999994, +GCC49:dfdiv_none:main_0, 1777,37.5, +GCC49:dfdiv_nr1:main_0, 1849,41.1800000000000000003, +GCC49:dfdiv_nr2:main_0, 1105,43.119999999999999999, diff --git a/documentation/tutorial_fpl_2022/03-optimizations/Exercise3/solution/list b/documentation/tutorial_fpl_2022/03-optimizations/Exercise3/solution/list new file mode 100644 index 000000000..a46f62bf6 --- /dev/null +++ b/documentation/tutorial_fpl_2022/03-optimizations/Exercise3/solution/list @@ -0,0 +1,5 @@ +dfdiv.c --benchmark-name=dfdiv_none --hls-div=none +dfdiv.c --benchmark-name=dfdiv_nr1 --hls-div=nr1 +dfdiv.c --benchmark-name=dfdiv_nr2 --hls-div=nr2 +dfdiv.c --benchmark-name=dfdiv_NR --hls-div=NR +dfdiv.c --benchmark-name=dfdiv_as --hls-div=as diff --git a/documentation/tutorial_fpl_2022/03-optimizations/Exercise3/solution/milieu.h b/documentation/tutorial_fpl_2022/03-optimizations/Exercise3/solution/milieu.h new file mode 100755 index 000000000..4d92d5e05 --- /dev/null +++ b/documentation/tutorial_fpl_2022/03-optimizations/Exercise3/solution/milieu.h @@ -0,0 +1,53 @@ +/* ++--------------------------------------------------------------------------+ +| CHStone : a suite of benchmark programs for C-based High-Level Synthesis | +| ======================================================================== | +| | +| * Collected and Modified : Y. Hara, H. Tomiyama, S. Honda, | +| H. Takada and K. Ishii | +| Nagoya University, Japan | +| | +| * Remark : | +| 1. This source code is modified to unify the formats of the benchmark | +| programs in CHStone. | +| 2. Test vectors are added for CHStone. | +| 3. If "main_result" is 0 at the end of the program, the program is | +| correctly executed. | +| 4. Please follow the copyright of each benchmark program. 
| ++--------------------------------------------------------------------------+ +*/ +/*============================================================================ + +This C header file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic +Package, Release 2b. + +Written by John R. Hauser. This work was made possible in part by the +International Computer Science Institute, located at Suite 600, 1947 Center +Street, Berkeley, California 94704. Funding was partially provided by the +National Science Foundation under grant MIP-9311980. The original version +of this code was written as part of a project to build a fixed-point vector +processor in collaboration with the University of California at Berkeley, +overseen by Profs. Nelson Morgan and John Wawrzynek. More information +is available through the Web page `http://www.cs.berkeley.edu/~jhauser/ +arithmetic/SoftFloat.html'. + +THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has +been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES +RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS +AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES, +COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE +EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE +INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR +OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE. + +Derivative works are acceptable, even for commercial purposes, so long as +(1) the source code for the derivative work includes prominent notice that +the work is derivative, and (2) the source code includes prominent notice with +these four paragraphs for those parts of this code that are retained. 
+ +=============================================================================*/ + +/*---------------------------------------------------------------------------- +| Include common integer types and flags. +*----------------------------------------------------------------------------*/ +#include "SPARC-GCC.h" diff --git a/documentation/tutorial_fpl_2022/03-optimizations/Exercise3/solution/softfloat-macros b/documentation/tutorial_fpl_2022/03-optimizations/Exercise3/solution/softfloat-macros new file mode 100755 index 000000000..a735f741e --- /dev/null +++ b/documentation/tutorial_fpl_2022/03-optimizations/Exercise3/solution/softfloat-macros @@ -0,0 +1,247 @@ +/* ++--------------------------------------------------------------------------+ +| CHStone : a suite of benchmark programs for C-based High-Level Synthesis | +| ======================================================================== | +| | +| * Collected and Modified : Y. Hara, H. Tomiyama, S. Honda, | +| H. Takada and K. Ishii | +| Nagoya University, Japan | +| | +| * Remark : | +| 1. This source code is modified to unify the formats of the benchmark | +| programs in CHStone. | +| 2. Test vectors are added for CHStone. | +| 3. If "main_result" is 0 at the end of the program, the program is | +| correctly executed. | +| 4. Please follow the copyright of each benchmark program. | ++--------------------------------------------------------------------------+ +*/ +/*============================================================================ + +This C source fragment is part of the SoftFloat IEC/IEEE Floating-point +Arithmetic Package, Release 2b. + +Written by John R. Hauser. This work was made possible in part by the +International Computer Science Institute, located at Suite 600, 1947 Center +Street, Berkeley, California 94704. Funding was partially provided by the +National Science Foundation under grant MIP-9311980. 
The original version +of this code was written as part of a project to build a fixed-point vector +processor in collaboration with the University of California at Berkeley, +overseen by Profs. Nelson Morgan and John Wawrzynek. More information +is available through the Web page `http://www.cs.berkeley.edu/~jhauser/ +arithmetic/SoftFloat.html. + +THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has +been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES +RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS +AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES, +COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE +EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE +INSTITUTE (possibly via similar legal notice) AGAINST ALL LOSSES, COSTS, OR +OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE. + +Derivative works are acceptable, even for commercial purposes, so long as +(1) the source code for the derivative work includes prominent notice that +the work is derivative, and (2) the source code includes prominent notice with +these four paragraphs for those parts of this code that are retained. + +=============================================================================*/ + +/*---------------------------------------------------------------------------- +| Shifts `a' right by the number of bits given in `count'. If any nonzero +| bits are shifted off, they are ``jammed'' into the least significant bit of +| the result by setting the least significant bit to 1. The value of `count' +| can be arbitrarily large; in particular, if `count' is greater than 64, the +| result will be either 0 or 1, depending on whether `a' is zero or nonzero. +| The result is stored in the location pointed to by `zPtr'. 
+*----------------------------------------------------------------------------*/ + +INLINE void +shift64RightJamming (bits64 a, int16 count, bits64 * zPtr) +{ + bits64 z; + + if (count == 0) + { + z = a; + } + else if (count < 64) /* NOTE(review): a negative count also takes this branch and would shift by a negative amount; presumably callers never pass one (confirm) */ + { + z = (a >> count) | ((a << ((-count) & 63)) != 0); /* for 0 < count < 64, (-count) & 63 is 64 - count, so the left shift keeps exactly the bits that the right shift discards; if any of them is set, bit 0 of z is forced to 1 */ + } + else + { + z = (a != 0); /* count >= 64: every bit of a is discarded, only the jammed bit can remain */ + } + *zPtr = z; + +} + +/*---------------------------------------------------------------------------- +| Adds the 128-bit value formed by concatenating `a0' and `a1' to the 128-bit +| value formed by concatenating `b0' and `b1'. Addition is modulo 2^128, so +| any carry out is lost. The result is broken into two 64-bit pieces which +| are stored at the locations pointed to by `z0Ptr' and `z1Ptr'. +*----------------------------------------------------------------------------*/ + +INLINE void +add128 (bits64 a0, bits64 a1, bits64 b0, bits64 b1, bits64 * z0Ptr, + bits64 * z1Ptr) +{ + bits64 z1; + + z1 = a1 + b1; + *z1Ptr = z1; + *z0Ptr = a0 + b0 + (z1 < a1); /* z1 < a1 holds exactly when the low-word sum wrapped, i.e. it is the carry into the high word */ + +} + +/*---------------------------------------------------------------------------- +| Subtracts the 128-bit value formed by concatenating `b0' and `b1' from the +| 128-bit value formed by concatenating `a0' and `a1'. Subtraction is modulo +| 2^128, so any borrow out (carry out) is lost. The result is broken into two +| 64-bit pieces which are stored at the locations pointed to by `z0Ptr' and +| `z1Ptr'. +*----------------------------------------------------------------------------*/ + +INLINE void +sub128 (bits64 a0, bits64 a1, bits64 b0, bits64 b1, bits64 * z0Ptr, + bits64 * z1Ptr) +{ + + *z1Ptr = a1 - b1; + *z0Ptr = a0 - b0 - (a1 < b1); /* a1 < b1 holds exactly when the low-word difference wraps, i.e. it is the borrow taken from the high word */ + +} + +/*---------------------------------------------------------------------------- +| Multiplies `a' by `b' to obtain a 128-bit product. The product is broken +| into two 64-bit pieces which are stored at the locations pointed to by +| `z0Ptr' and `z1Ptr'.
+*----------------------------------------------------------------------------*/ + +INLINE void +mul64To128 (bits64 a, bits64 b, bits64 * z0Ptr, bits64 * z1Ptr) +{ + bits32 aHigh, aLow, bHigh, bLow; + bits64 z0, zMiddleA, zMiddleB, z1; + + aLow = a; /* assignment to bits32 keeps only the low 32 bits */ + aHigh = a >> 32; + bLow = b; + bHigh = b >> 32; + z1 = ((bits64) aLow) * bLow; /* four 32x32 partial products, each widened to 64 bits before multiplying */ + zMiddleA = ((bits64) aLow) * bHigh; + zMiddleB = ((bits64) aHigh) * bLow; + z0 = ((bits64) aHigh) * bHigh; + zMiddleA += zMiddleB; /* this sum may wrap; the wrap is recovered below as (zMiddleA < zMiddleB) */ + z0 += (((bits64) (zMiddleA < zMiddleB)) << 32) + (zMiddleA >> 32); /* high word receives the carry of the middle sum (weight 2^32 within z0) and the upper half of the middle sum */ + zMiddleA <<= 32; /* lower half of the middle sum, aligned to the upper half of the low word */ + z1 += zMiddleA; + z0 += (z1 < zMiddleA); /* carry out of the low word */ + *z1Ptr = z1; + *z0Ptr = z0; + +} + +/*---------------------------------------------------------------------------- +| Returns an approximation to the 64-bit integer quotient obtained by dividing +| `b' into the 128-bit value formed by concatenating `a0' and `a1'. The +| divisor `b' must be at least 2^63. If q is the exact quotient truncated +| toward zero, the approximation returned lies between q and q + 2 inclusive. +| If the exact quotient q is larger than 64 bits, the maximum positive 64-bit +| unsigned integer is returned. +*----------------------------------------------------------------------------*/ + +static bits64 +estimateDiv128To64 (bits64 a0, bits64 a1, bits64 b) +{ + bits64 b0, b1; + bits64 rem0, rem1, term0, term1; + bits64 z; + + if (b <= a0) /* quotient would need more than 64 bits: return the saturated value */ + return LIT64 (0xFFFFFFFFFFFFFFFF); + b0 = b >> 32; + z = (b0 << 32 <= a0) ? LIT64 (0xFFFFFFFF00000000) : (a0 / b0) << 32; /* upper 32 quotient bits, estimated from the high word of the dividend and the upper half of the divisor, clamped to 0xFFFFFFFF */ + mul64To128 (b, z, &term0, &term1); + sub128 (a0, a1, term0, term1, &rem0, &rem1); /* remainder left by the upper-half estimate */ + while (((sbits64) rem0) < 0) /* remainder reads as negative: the estimate was too large */ + { + z -= LIT64 (0x100000000); /* lower the upper-half estimate by one */ + b1 = b << 32; + add128 (rem0, rem1, b0, b1, &rem0, &rem1); /* b0:b1 is b shifted left by 32 as a 128-bit value; adding it compensates for the step above */ + } + rem0 = (rem0 << 32) | (rem1 >> 32); /* take the middle 64 bits of the remainder as the next dividend */ + z |= (b0 << 32 <= rem0) ? 0xFFFFFFFF : rem0 / b0; /* lower 32 quotient bits, estimated the same way; no correction step follows, which matches the stated error bound */ + return z; + +} + +/*---------------------------------------------------------------------------- +| Returns the number of leading 0 bits before the most-significant 1 bit of +| `a'.
If `a' is zero, 32 is returned. +*----------------------------------------------------------------------------*/ + +static int8 +countLeadingZeros32 (bits32 a) +{ + static const int8 countLeadingZerosHigh[256] = { /* entry i is the number of leading zero bits of the 8-bit value i (8 for i equal to 0) */ + 8, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + }; + int8 shiftCount; + + shiftCount = 0; + if (a < 0x10000) /* upper 16 bits are all zero */ + { + shiftCount += 16; + a <<= 16; /* bring the lower half up so the next test looks at it */ + } + if (a < 0x1000000) /* top byte is zero */ + { + shiftCount += 8; + a <<= 8; + } + shiftCount += countLeadingZerosHigh[a >> 24]; /* finish with a table lookup on the top byte; for a equal to 0 this adds 8, giving 32 in total */ + return shiftCount; + +} + +/*---------------------------------------------------------------------------- +| Returns the number of leading 0 bits before the most-significant 1 bit of +| `a'. If `a' is zero, 64 is returned.
+*----------------------------------------------------------------------------*/ + +static int8 +countLeadingZeros64 (bits64 a) +{ + int8 shiftCount; + + shiftCount = 0; + if (a < ((bits64) 1) << 32) /* upper 32 bits are all zero: count them and examine the lower word */ + { + shiftCount += 32; + } + else + { + a >>= 32; /* otherwise examine the upper word */ + } + shiftCount += countLeadingZeros32 (a); /* the argument is converted to bits32, so only the low 32 bits of a are inspected */ + return shiftCount; + +} diff --git a/documentation/tutorial_fpl_2022/03-optimizations/Exercise3/solution/softfloat-specialize b/documentation/tutorial_fpl_2022/03-optimizations/Exercise3/solution/softfloat-specialize new file mode 100755 index 000000000..3c5105928 --- /dev/null +++ b/documentation/tutorial_fpl_2022/03-optimizations/Exercise3/solution/softfloat-specialize @@ -0,0 +1,123 @@ +/* ++--------------------------------------------------------------------------+ +| CHStone : a suite of benchmark programs for C-based High-Level Synthesis | +| ======================================================================== | +| | +| * Collected and Modified : Y. Hara, H. Tomiyama, S. Honda, | +| H. Takada and K. Ishii | +| Nagoya University, Japan | +| | +| * Remark : | +| 1. This source code is modified to unify the formats of the benchmark | +| programs in CHStone. | +| 2. Test vectors are added for CHStone. | +| 3. If "main_result" is 0 at the end of the program, the program is | +| correctly executed. | +| 4. Please follow the copyright of each benchmark program. | ++--------------------------------------------------------------------------+ +*/ +/*============================================================================ + +This C source fragment is part of the SoftFloat IEC/IEEE Floating-point +Arithmetic Package, Release 2b. + +Written by John R. Hauser. This work was made possible in part by the +International Computer Science Institute, located at Suite 600, 1947 Center +Street, Berkeley, California 94704. Funding was partially provided by the +National Science Foundation under grant MIP-9311980.
The original version +of this code was written as part of a project to build a fixed-point vector +processor in collaboration with the University of California at Berkeley, +overseen by Profs. Nelson Morgan and John Wawrzynek. More information +is available through the Web page `http://www.cs.berkeley.edu/~jhauser/ +arithmetic/SoftFloat.html. + +THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has +been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES +RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS +AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES, +COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE +EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE +INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR +OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE. + +Derivative works are acceptable, even for commercial purposes, so long as +(1) the source code for the derivative work includes prominent notice that +the work is derivative, and (2) the source code includes prominent notice with +these four paragraphs for those parts of this code that are retained. + +=============================================================================*/ + +/*---------------------------------------------------------------------------- +| Underflow tininess-detection mode, statically initialized to default value. +| (The declaration in `softfloat.h' must match the `int8' type here.) +*----------------------------------------------------------------------------*/ +#define float_detect_tininess float_tininess_before_rounding + +/*---------------------------------------------------------------------------- +| Raises the exceptions specified by `flags'. Floating-point traps can be +| defined here if desired. It is currently not possible for such a trap +| to substitute a result value. 
If traps are not implemented, this routine +| should be simply `float_exception_flags |= flags;'. +*----------------------------------------------------------------------------*/ + +void +float_raise (int8 flags) +{ + float_exception_flags |= flags; + +} + + +/*---------------------------------------------------------------------------- +| The pattern for a default generated double-precision NaN. +*----------------------------------------------------------------------------*/ +#define float64_default_nan LIT64( 0x7FFFFFFFFFFFFFFF ) + +/*---------------------------------------------------------------------------- +| Returns 1 if the double-precision floating-point value `a' is a NaN; +| otherwise returns 0. +*----------------------------------------------------------------------------*/ + +flag +float64_is_nan (float64 a) +{ + + return (LIT64 (0xFFE0000000000000) < (bits64) (a << 1)); /* a << 1 discards the sign bit; the shifted value exceeds 0xFFE0000000000000 only when all 11 exponent bits are set and at least one fraction bit is set */ + +} + +/*---------------------------------------------------------------------------- +| Returns 1 if the double-precision floating-point value `a' is a signaling +| NaN; otherwise returns 0. +*----------------------------------------------------------------------------*/ + +flag +float64_is_signaling_nan (float64 a) +{ + + return (((a >> 51) & 0xFFF) == 0xFFE) && (a & LIT64 (0x0007FFFFFFFFFFFF)); /* bits 62..51 equal to 0xFFE means exponent all ones with bit 51 (the top fraction bit) clear; the second operand requires some lower fraction bit to be set, which excludes infinity */ + +} + +/*---------------------------------------------------------------------------- +| Takes two double-precision floating-point values `a' and `b', one of which +| is a NaN, and returns the appropriate NaN result. If either `a' or `b' is a +| signaling NaN, the invalid exception is raised.
+*----------------------------------------------------------------------------*/ + +static float64 +propagateFloat64NaN (float64 a, float64 b) +{ + flag aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN; + + aIsNaN = float64_is_nan (a); /* assigned but never read in this function */ + aIsSignalingNaN = float64_is_signaling_nan (a); + bIsNaN = float64_is_nan (b); + bIsSignalingNaN = float64_is_signaling_nan (b); + a |= LIT64 (0x0008000000000000); /* set bit 51 on both operands after classifying them, so whichever one is returned no longer satisfies float64_is_signaling_nan */ + b |= LIT64 (0x0008000000000000); + if (aIsSignalingNaN | bIsSignalingNaN) + float_raise (float_flag_invalid); + return bIsSignalingNaN ? b : aIsSignalingNaN ? a : bIsNaN ? b : a; /* priority: signaling b, then signaling a, then b if it is any NaN, otherwise a */ + +} diff --git a/documentation/tutorial_fpl_2022/03-optimizations/Exercise3/solution/softfloat.c b/documentation/tutorial_fpl_2022/03-optimizations/Exercise3/solution/softfloat.c new file mode 100755 index 000000000..8604da331 --- /dev/null +++ b/documentation/tutorial_fpl_2022/03-optimizations/Exercise3/solution/softfloat.c @@ -0,0 +1,316 @@ +/* ++--------------------------------------------------------------------------+ +| CHStone : a suite of benchmark programs for C-based High-Level Synthesis | +| ======================================================================== | +| | +| * Collected and Modified : Y. Hara, H. Tomiyama, S. Honda, | +| H. Takada and K. Ishii | +| Nagoya University, Japan | +| | +| * Remark : | +| 1. This source code is modified to unify the formats of the benchmark | +| programs in CHStone. | +| 2. Test vectors are added for CHStone. | +| 3. If "main_result" is 0 at the end of the program, the program is | +| correctly executed. | +| 4. Please follow the copyright of each benchmark program. | ++--------------------------------------------------------------------------+ +*/ +/*============================================================================ + +This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic +Package, Release 2b. + +Written by John R. Hauser.
This work was made possible in part by the +International Computer Science Institute, located at Suite 600, 1947 Center +Street, Berkeley, California 94704. Funding was partially provided by the +National Science Foundation under grant MIP-9311980. The original version +of this code was written as part of a project to build a fixed-point vector +processor in collaboration with the University of California at Berkeley, +overseen by Profs. Nelson Morgan and John Wawrzynek. More information +is available through the Web page `http://www.cs.berkeley.edu/~jhauser/ +arithmetic/SoftFloat.html'. + +THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has +been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES +RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS +AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES, +COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE +EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE +INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR +OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE. + +Derivative works are acceptable, even for commercial purposes, so long as +(1) the source code for the derivative work includes prominent notice that +the work is derivative, and (2) the source code includes prominent notice with +these four paragraphs for those parts of this code that are retained. + +=============================================================================*/ + +#include "milieu.h" +#include "softfloat.h" + +/*---------------------------------------------------------------------------- +| Floating-point rounding mode, extended double-precision rounding precision, +| and exception flags. 
+*----------------------------------------------------------------------------*/ +int8 float_rounding_mode = float_round_nearest_even; +int8 float_exception_flags = 0; /* float_raise ORs raised flags into this variable; nothing in the included fragments clears it */ + +/*---------------------------------------------------------------------------- +| Primitive arithmetic functions, including multi-word arithmetic, and +| division and square root approximations. (Can be specialized to target if +| desired.) +*----------------------------------------------------------------------------*/ +#include "softfloat-macros" + +/*---------------------------------------------------------------------------- +| Functions and definitions to determine: (1) whether tininess for underflow +| is detected before or after rounding by default, (2) what (if anything) +| happens when exceptions are raised, (3) how signaling NaNs are distinguished +| from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs +| are propagated from function inputs to output. These details are target- +| specific. +*----------------------------------------------------------------------------*/ +#include "softfloat-specialize" + +/*---------------------------------------------------------------------------- +| Returns the fraction bits of the double-precision floating-point value `a'. +*----------------------------------------------------------------------------*/ + +INLINE bits64 +extractFloat64Frac (float64 a) +{ + + return a & LIT64 (0x000FFFFFFFFFFFFF); /* bits 51..0 */ + +} + +/*---------------------------------------------------------------------------- +| Returns the exponent bits of the double-precision floating-point value `a'. +*----------------------------------------------------------------------------*/ + +INLINE int16 +extractFloat64Exp (float64 a) +{ + + return (a >> 52) & 0x7FF; /* bits 62..52, returned as the raw biased field */ + +} + +/*---------------------------------------------------------------------------- +| Returns the sign bit of the double-precision floating-point value `a'.
+*----------------------------------------------------------------------------*/ + +INLINE flag +extractFloat64Sign (float64 a) +{ + + return a >> 63; + +} + +/*---------------------------------------------------------------------------- +| Normalizes the subnormal double-precision floating-point value represented +| by the denormalized significand `aSig'. The normalized exponent and +| significand are stored at the locations pointed to by `zExpPtr' and +| `zSigPtr', respectively. +*----------------------------------------------------------------------------*/ + +static void +normalizeFloat64Subnormal (bits64 aSig, int16 * zExpPtr, bits64 * zSigPtr) +{ + int8 shiftCount; + + shiftCount = countLeadingZeros64 (aSig) - 11; + *zSigPtr = aSig << shiftCount; + *zExpPtr = 1 - shiftCount; + +} + +/*---------------------------------------------------------------------------- +| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a +| double-precision floating-point value, returning the result. After being +| shifted into the proper positions, the three fields are simply added +| together to form the result. This means that any integer portion of `zSig' +| will be added into the exponent. Since a properly normalized significand +| will have an integer portion equal to 1, the `zExp' input should be 1 less +| than the desired result exponent whenever `zSig' is a complete, normalized +| significand. +*----------------------------------------------------------------------------*/ + +INLINE float64 +packFloat64 (flag zSign, int16 zExp, bits64 zSig) +{ + + return (((bits64) zSign) << 63) + (((bits64) zExp) << 52) + zSig; + +} + +/*---------------------------------------------------------------------------- +| Takes an abstract floating-point value having sign `zSign', exponent `zExp', +| and significand `zSig', and returns the proper double-precision floating- +| point value corresponding to the abstract input. 
Ordinarily, the abstract +| value is simply rounded and packed into the double-precision format, with +| the inexact exception raised if the abstract input cannot be represented +| exactly. However, if the abstract value is too large, the overflow and +| inexact exceptions are raised and an infinity or maximal finite value is +| returned. If the abstract value is too small, the input value is rounded +| to a subnormal number, and the underflow and inexact exceptions are raised +| if the abstract input cannot be represented exactly as a subnormal double- +| precision floating-point number. +| The input significand `zSig' has its binary point between bits 62 +| and 61, which is 10 bits to the left of the usual location. This shifted +| significand must be normalized or smaller. If `zSig' is not normalized, +| `zExp' must be 0; in that case, the result returned is a subnormal number, +| and it must not require rounding. In the usual case that `zSig' is +| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. +| The handling of underflow and overflow follows the IEC/IEEE Standard for +| Binary Floating-Point Arithmetic. 
+*----------------------------------------------------------------------------*/ + +static float64 +roundAndPackFloat64 (flag zSign, int16 zExp, bits64 zSig) +{ + int8 roundingMode; + flag roundNearestEven, isTiny; + int16 roundIncrement, roundBits; + + roundingMode = float_rounding_mode; + roundNearestEven = (roundingMode == float_round_nearest_even); + roundIncrement = 0x200; + if (!roundNearestEven) + { + if (roundingMode == float_round_to_zero) + { + roundIncrement = 0; + } + else + { + roundIncrement = 0x3FF; + if (zSign) + { + if (roundingMode == float_round_up) + roundIncrement = 0; + } + else + { + if (roundingMode == float_round_down) + roundIncrement = 0; + } + } + } + roundBits = zSig & 0x3FF; + if (0x7FD <= (bits16) zExp) + { + if ((0x7FD < zExp) + || ((zExp == 0x7FD) && ((sbits64) (zSig + roundIncrement) < 0))) + { + float_raise (float_flag_overflow | float_flag_inexact); + return packFloat64 (zSign, 0x7FF, 0) - (roundIncrement == 0); + } + if (zExp < 0) + { + isTiny = (float_detect_tininess == float_tininess_before_rounding) + || (zExp < -1) + || (zSig + roundIncrement < LIT64 (0x8000000000000000)); + shift64RightJamming (zSig, -zExp, &zSig); + zExp = 0; + roundBits = zSig & 0x3FF; + if (isTiny && roundBits) + float_raise (float_flag_underflow); + } + } + if (roundBits) + float_exception_flags |= float_flag_inexact; + zSig = (zSig + roundIncrement) >> 10; + zSig &= ~(((roundBits ^ 0x200) == 0) & roundNearestEven); + if (zSig == 0) + zExp = 0; + return packFloat64 (zSign, zExp, zSig); + +} + +/*---------------------------------------------------------------------------- +| Returns the result of dividing the double-precision floating-point value `a' +| by the corresponding value `b'. The operation is performed according to +| the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
+*----------------------------------------------------------------------------*/ + +float64 +float64_div (float64 a, float64 b) +{ + flag aSign, bSign, zSign; + int16 aExp, bExp, zExp; + bits64 aSig, bSig, zSig; + bits64 rem0, rem1, term0, term1; + + aSig = extractFloat64Frac (a); + aExp = extractFloat64Exp (a); + aSign = extractFloat64Sign (a); + bSig = extractFloat64Frac (b); + bExp = extractFloat64Exp (b); + bSign = extractFloat64Sign (b); + zSign = aSign ^ bSign; + if (aExp == 0x7FF) + { + if (aSig) + return propagateFloat64NaN (a, b); + if (bExp == 0x7FF) + { + if (bSig) + return propagateFloat64NaN (a, b); + float_raise (float_flag_invalid); + return float64_default_nan; + } + return packFloat64 (zSign, 0x7FF, 0); + } + if (bExp == 0x7FF) + { + if (bSig) + return propagateFloat64NaN (a, b); + return packFloat64 (zSign, 0, 0); + } + if (bExp == 0) + { + if (bSig == 0) + { + if ((aExp | aSig) == 0) + { + float_raise (float_flag_invalid); + return float64_default_nan; + } + float_raise (float_flag_divbyzero); + return packFloat64 (zSign, 0x7FF, 0); + } + normalizeFloat64Subnormal (bSig, &bExp, &bSig); + } + if (aExp == 0) + { + if (aSig == 0) + return packFloat64 (zSign, 0, 0); + normalizeFloat64Subnormal (aSig, &aExp, &aSig); + } + zExp = aExp - bExp + 0x3FD; + aSig = (aSig | LIT64 (0x0010000000000000)) << 10; + bSig = (bSig | LIT64 (0x0010000000000000)) << 11; + if (bSig <= (aSig + aSig)) + { + aSig >>= 1; + ++zExp; + } + zSig = estimateDiv128To64 (aSig, 0, bSig); + if ((zSig & 0x1FF) <= 2) + { + mul64To128 (bSig, zSig, &term0, &term1); + sub128 (aSig, 0, term0, term1, &rem0, &rem1); + while ((sbits64) rem0 < 0) + { + --zSig; + add128 (rem0, rem1, 0, bSig, &rem0, &rem1); + } + zSig |= (rem1 != 0); + } + return roundAndPackFloat64 (zSign, zExp, zSig); + +} diff --git a/documentation/tutorial_fpl_2022/03-optimizations/Exercise3/solution/softfloat.h b/documentation/tutorial_fpl_2022/03-optimizations/Exercise3/solution/softfloat.h new file mode 100755 index 
000000000..6d075ca15 --- /dev/null +++ b/documentation/tutorial_fpl_2022/03-optimizations/Exercise3/solution/softfloat.h @@ -0,0 +1,77 @@ +/* ++--------------------------------------------------------------------------+ +| CHStone : a suite of benchmark programs for C-based High-Level Synthesis | +| ======================================================================== | +| | +| * Collected and Modified : Y. Hara, H. Tomiyama, S. Honda, | +| H. Takada and K. Ishii | +| Nagoya University, Japan | +| | +| * Remark : | +| 1. This source code is modified to unify the formats of the benchmark | +| programs in CHStone. | +| 2. Test vectors are added for CHStone. | +| 3. If "main_result" is 0 at the end of the program, the program is | +| correctly executed. | +| 4. Please follow the copyright of each benchmark program. | ++--------------------------------------------------------------------------+ +*/ +/*============================================================================ + +This C header file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic +Package, Release 2b. + +Written by John R. Hauser. This work was made possible in part by the +International Computer Science Institute, located at Suite 600, 1947 Center +Street, Berkeley, California 94704. Funding was partially provided by the +National Science Foundation under grant MIP-9311980. The original version +of this code was written as part of a project to build a fixed-point vector +processor in collaboration with the University of California at Berkeley, +overseen by Profs. Nelson Morgan and John Wawrzynek. More information +is available through the Web page `http://www.cs.berkeley.edu/~jhauser/ +arithmetic/SoftFloat.html'. + +THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has +been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES +RESULT IN INCORRECT BEHAVIOR. 
USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS +AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES, +COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE +EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE +INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR +OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE. + +Derivative works are acceptable, even for commercial purposes, so long as +(1) the source code for the derivative work includes prominent notice that +the work is derivative, and (2) the source code includes prominent notice with +these four paragraphs for those parts of this code that are retained. + +=============================================================================*/ + +/*---------------------------------------------------------------------------- +| Software IEC/IEEE floating-point types. +*----------------------------------------------------------------------------*/ +typedef unsigned int float32; +typedef unsigned long long float64; + +/*---------------------------------------------------------------------------- +| Software IEC/IEEE floating-point underflow tininess-detection mode. +*----------------------------------------------------------------------------*/ +#define float_tininess_after_rounding 0 +#define float_tininess_before_rounding 1 + +/*---------------------------------------------------------------------------- +| Software IEC/IEEE floating-point rounding mode. +*----------------------------------------------------------------------------*/ +#define float_round_nearest_even 0 +#define float_round_to_zero 1 +#define float_round_up 2 +#define float_round_down 3 + +/*---------------------------------------------------------------------------- +| Software IEC/IEEE floating-point exception flags. 
+*----------------------------------------------------------------------------*/ +#define float_flag_inexact 1 +#define float_flag_divbyzero 2 +#define float_flag_underflow 4 +#define float_flag_overflow 8 +#define float_flag_invalid 16 diff --git a/documentation/tutorial_fpl_2022/03-optimizations/Exercise3/solution/synthesize.sh b/documentation/tutorial_fpl_2022/03-optimizations/Exercise3/solution/synthesize.sh new file mode 100755 index 000000000..5180fef4b --- /dev/null +++ b/documentation/tutorial_fpl_2022/03-optimizations/Exercise3/solution/synthesize.sh @@ -0,0 +1,6 @@ +#!/bin/bash +abs_script=$(readlink -e $0) +dir_script=$(dirname $abs_script) +$dir_script/../../test_panda.py --tool=bambu --bambu=bambu --spider=spider \ + --args="--configuration-name=GCC49 --compiler=I386_GCC49" \ + -c=--simulate -b$dir_script -l$dir_script/list "$@" diff --git a/documentation/tutorial_fpl_2022/03-optimizations/Exercise4/module.c b/documentation/tutorial_fpl_2022/03-optimizations/Exercise4/module.c new file mode 100644 index 000000000..0d0c6140c --- /dev/null +++ b/documentation/tutorial_fpl_2022/03-optimizations/Exercise4/module.c @@ -0,0 +1,4 @@ +long long func_replace(long long a, long long b) +{ + return a * b; +} \ No newline at end of file diff --git a/documentation/tutorial_fpl_2022/03-optimizations/Exercise5/README b/documentation/tutorial_fpl_2022/03-optimizations/Exercise5/README new file mode 100644 index 000000000..955068302 --- /dev/null +++ b/documentation/tutorial_fpl_2022/03-optimizations/Exercise5/README @@ -0,0 +1,5 @@ +1. Generate a module implementing the following formula (in both single and double precision): + +gamma = acos((a**2+b**2-c**2)/(2*a*b)) + +2. Identify the combination of softfloat ops and libm that produces the best performance.
diff --git a/documentation/tutorial_fpl_2022/03-optimizations/Exercise5/hint.sh b/documentation/tutorial_fpl_2022/03-optimizations/Exercise5/hint.sh new file mode 100755 index 000000000..244aa73b5 --- /dev/null +++ b/documentation/tutorial_fpl_2022/03-optimizations/Exercise5/hint.sh @@ -0,0 +1,7 @@ +#!/bin/bash +abs_script=$(readlink -e $0) +dir_script=$(dirname $abs_script) +bambu $dir_script/module.c --top-fname=awesome_math \ + -O3 -lm --speculative-sdc-scheduling --libm-std-rounding --soft-float \ + --simulate --generate-tb="a=3.0,b=4.0,c=5.0" \ + "$@" |& tee log.txt diff --git a/documentation/tutorial_fpl_2022/03-optimizations/Exercise5/hint.txt b/documentation/tutorial_fpl_2022/03-optimizations/Exercise5/hint.txt new file mode 100644 index 000000000..783879ebf --- /dev/null +++ b/documentation/tutorial_fpl_2022/03-optimizations/Exercise5/hint.txt @@ -0,0 +1,4 @@ +Use the following parameters: +--libm-std-rounding +--soft-float +... diff --git a/documentation/tutorial_fpl_2022/03-optimizations/Exercise5/solution/list b/documentation/tutorial_fpl_2022/03-optimizations/Exercise5/solution/list new file mode 100644 index 000000000..b2a51a80b --- /dev/null +++ b/documentation/tutorial_fpl_2022/03-optimizations/Exercise5/solution/list @@ -0,0 +1,4 @@ +module.c --benchmark-name=std_rounding_softfloat --libm-std-rounding --soft-float +module.c --benchmark-name=std_rounding_soft-fp --libm-std-rounding --soft-fp +module.c --benchmark-name=faith_rounding_softfloat --soft-float +module.c --benchmark-name=faith_rounding_soft-fp --soft-fp diff --git a/documentation/tutorial_fpl_2022/03-optimizations/Exercise5/solution/module.c b/documentation/tutorial_fpl_2022/03-optimizations/Exercise5/solution/module.c new file mode 100644 index 000000000..266c15765 --- /dev/null +++ b/documentation/tutorial_fpl_2022/03-optimizations/Exercise5/solution/module.c @@ -0,0 +1,24 @@ +#include <math.h> + +#ifdef FP_SINGLE +#define FP_TYPE float +#define ACOS(a) acosf(a) +#else +#define FP_TYPE double
+#define ACOS(a) acos(a) +#endif + +#ifdef MULT_SQUARE +#define SQUARE(a) (a*a) +#else +#ifdef FP_SINGLE +#define SQUARE(a) powf(a,2) +#else +#define SQUARE(a) pow(a,2) +#endif +#endif + +FP_TYPE awesome_math(FP_TYPE a, FP_TYPE b, FP_TYPE c) +{ + return ACOS((SQUARE(a) + SQUARE(b) - SQUARE(c))/(2*a*b)); +} diff --git a/documentation/tutorial_fpl_2022/03-optimizations/Exercise5/solution/synthesize.sh b/documentation/tutorial_fpl_2022/03-optimizations/Exercise5/solution/synthesize.sh new file mode 100755 index 000000000..07ba2bbd5 --- /dev/null +++ b/documentation/tutorial_fpl_2022/03-optimizations/Exercise5/solution/synthesize.sh @@ -0,0 +1,11 @@ +#!/bin/bash +abs_script=$(readlink -e $0) +dir_script=$(dirname $abs_script) +$dir_script/../../test_panda.py --tool=bambu --bambu=bambu --spider=spider \ + --args="--configuration-name=pow_square " \ + --args="--configuration-name=mult_square -DMULT_SQUARE" \ + --args="--configuration-name=single_pow_square -DFP_SINGLE" \ + --args="--configuration-name=single_mult_square -DFP_SINGLE -DMULT_SQUARE" \ + -c=--simulate -c=-lm -c=--generate-tb=$dir_script/testbench.xml -c=--speculative-sdc-scheduling \ + -c=--top-fname=awesome_math \ + -b$dir_script -l$dir_script/list "$@" diff --git a/documentation/tutorial_fpl_2022/03-optimizations/Exercise5/solution/testbench.xml b/documentation/tutorial_fpl_2022/03-optimizations/Exercise5/solution/testbench.xml new file mode 100644 index 000000000..63bd95293 --- /dev/null +++ b/documentation/tutorial_fpl_2022/03-optimizations/Exercise5/solution/testbench.xml @@ -0,0 +1,5 @@ + + + + + diff --git a/documentation/tutorial_fpl_2022/03-optimizations/Exercise6/module.c b/documentation/tutorial_fpl_2022/03-optimizations/Exercise6/module.c new file mode 100644 index 000000000..881227549 --- /dev/null +++ b/documentation/tutorial_fpl_2022/03-optimizations/Exercise6/module.c @@ -0,0 +1,4 @@ +float user_fp(float a, float b, float c) +{ + return a * b + c; +} \ No newline at end of file diff --git 
a/documentation/tutorial_fpl_2022/03-optimizations/test_panda.py b/documentation/tutorial_fpl_2022/03-optimizations/test_panda.py new file mode 100755 index 000000000..d486f8b83 --- /dev/null +++ b/documentation/tutorial_fpl_2022/03-optimizations/test_panda.py @@ -0,0 +1,961 @@ +#!/usr/bin/python + +import argparse +import datetime +import distutils.spawn +import logging +import os +import re +import shlex +import shutil +import signal +import subprocess +import sys +import threading +import xml.dom.minidom +from collections import deque + +line_index = 0 +failure = False + +def positive_integer(value): + pos_int = int(value) + if pos_int <= 0: + raise argparse.ArgumentTypeError("%s must be a positive integer" % value) + return pos_int + +class StoreOrUpdateMin(argparse.Action): + first_parsed = True + def __call__(self, parser, namespace, values, option_string=None): + if self.first_parsed == True : + self.first_parsed = False + setattr(namespace, self.dest, values) + else : + setattr(namespace, self.dest, min(namespace.j, values)) + + +#Return children of a process +def GetChildren(parent_pid): + ret = set() + ps_command = subprocess.Popen("ps -o pid --ppid %d --noheaders" % parent_pid, shell=True, stdout=subprocess.PIPE) + ps_output = ps_command.stdout.read() + ps_command.wait() + for pid_str in ps_output.split("\n")[:-1]: + ret.add(int(pid_str)) + return ret + +#Kill a process than kill its children +def kill_proc_tree(pid): + children = GetChildren(pid) + os.kill(pid, signal.SIGKILL) + for child in children: + kill_proc_tree(child) +#Process benchmark in list +def execute_tests(named_list,thread_index): + global passed_benchmark + global total_benchmark + global line_index + global children + global failure + lines = open(named_list).readlines() + with lock: + local_index = line_index + line_index += 1 + while local_index < len(lines) and not (failure and args.stop): + cwd = ComputeDirectory(lines[local_index]) + failed_output_file_name = os.path.join(cwd, 
args.tool + "_failed_output") + if os.path.exists(failed_output_file_name): + os.remove(failed_output_file_name) + tool_return_value_file_name = os.path.join(cwd, args.tool + "_return_value") + if args.restart and os.path.exists(os.path.join(cwd, args.tool + "_return_value")): + tool_return_value_file = open(tool_return_value_file_name, "r") + return_value = tool_return_value_file.read() + tool_return_value_file.close() + if return_value == "0": + with lock: + total_benchmark += 1 + passed_benchmark += 1 + logging.info(" SKIPPING --- OVERALL: " + str(passed_benchmark) + " passed, " + str(total_benchmark-passed_benchmark) + " failed, " + str(len(lines)-total_benchmark) + " queued --- " + lines[local_index].replace("\\", "")) + local_index = line_index + line_index += 1 + continue + HLS_output_directory = os.path.join(cwd, "HLS_output") + if os.path.exists(HLS_output_directory): + shutil.rmtree(HLS_output_directory) + output_file_name = os.path.join(cwd, args.tool + "_execution_output") + output_file = open(output_file_name, "w") + local_args = lines[local_index] + if local_args[0] == "\"": + local_args = local_args[1:-1] + if args.tool != "bambu" and args.tool != "zebu": + tokens = shlex.split(lines[local_index]) + args_without_benchmark_name = "" + for token in tokens: + if token.find("--benchmark-name") == -1: + args_without_benchmark_name += token + " " + local_args = args_without_benchmark_name + local_command = "ulimit " + args.ulimit + "; exec timeout " + args.timeout + " " + tool_exe + local_command = local_command + " " + local_args + output_file.write("#" * 80 + "\n") + output_file.write("cd " + cwd + "; ") + output_file.write(local_command + "\n") + output_file.write("#" * 80 + "\n") + output_file.flush() + return_value = -1 + with lock_creation_destruction: + if not (failure and args.stop): + children[thread_index] = subprocess.Popen(local_command, stderr=output_file, stdout=output_file, cwd=cwd, shell=True, executable="/bin/bash") + try: + return_value = 
children[thread_index].wait() + except: + pass + with lock_creation_destruction: + if return_value != 0 and (args.stop or args.returnfail): + failure = True + if failure and args.stop: + for local_thread_index in range(n_jobs): + if children[local_thread_index] != None: + if children[local_thread_index].poll() == None: + try: + kill_proc_tree(children[local_thread_index].pid) + except OSError: + pass + os.fsync(output_file.fileno()) + output_file.close() + tool_return_value_file = open(tool_return_value_file_name, "w") + tool_return_value_file.write(str(return_value)) + tool_return_value_file.close() + args_file = open(os.path.join(cwd, "args"), "w") + args_file.write(lines[local_index]) + args_file.close() + if return_value == 0 and os.path.exists(os.path.join(cwd, args.tool + "_results_0.xml")): + tool_results_file_name = os.path.join(cwd, args.tool + "_results") + tool_results_file = open(tool_results_file_name, "w") + tool_results_string = "" + xml_document = xml.dom.minidom.parse(os.path.join(cwd, args.tool + "_results_0.xml")) + if len(xml_document.getElementsByTagName("CYCLES")) > 0: + cycles_tag = xml_document.getElementsByTagName("CYCLES")[0] + tool_results_string = tool_results_string + cycles_tag.attributes["value"].value + " CYCLES" + if len(xml_document.getElementsByTagName("CLOCK_SLACK")) > 0: + slack_tag = xml_document.getElementsByTagName("CLOCK_SLACK")[0] + tool_results_string = tool_results_string + " *** " + slack_tag.attributes["value"].value + "ns" + tool_results_file.write(tool_results_string) + tool_results_file.close() + if not (failure and args.stop) or (return_value != -9 and return_value != 0): + if return_value != 0: + shutil.copy(output_file_name, str(os.path.join(os.path.dirname(output_file_name), args.tool + "_failed_output"))) + with lock: + total_benchmark += 1 + if return_value == 0: + passed_benchmark += 1 + if not args.no_clean: + for sub in os.listdir(cwd): + if os.path.isdir(os.path.join(cwd, sub)): + 
shutil.rmtree(os.path.join(cwd, sub)) + else: + if sub != args.tool + "_return_value" and sub != args.tool + "_execution_output" and sub != args.tool + "_results_0.xml" and sub != "args": + os.remove(os.path.join(cwd, sub)) + if os.path.exists(os.path.join(cwd, args.tool + "_results_0.xml")): + logging.info(" SUCCESS (" + tool_results_string + ") --- OVERALL: " + str(passed_benchmark) + " passed, " + str(total_benchmark-passed_benchmark) + " failed, " + str(len(lines)-total_benchmark) + " queued --- " + lines[local_index].replace("\\", "")) + else: + logging.info(" SUCCESS --- OVERALL: " + str(passed_benchmark) + " passed, " + str(total_benchmark-passed_benchmark) + " failed, " + str(len(lines)-total_benchmark) + " queued --- " + lines[local_index].replace("\\", "")) + elif return_value == 124: + logging.info(" FAILURE (Timeout) --- OVERALL: " + str(passed_benchmark) + " passed, " + str(total_benchmark-passed_benchmark) + " failed, " + str(len(lines)-total_benchmark) + " queued --- " + lines[local_index].replace("\\", "")) + elif return_value == 153: + logging.info(" FAILURE (File size limit exceeded) --- OVERALL: " + str(passed_benchmark) + " passed, " + str(total_benchmark-passed_benchmark) + " failed, " + str(len(lines)-total_benchmark) + " queued --- " + lines[local_index].replace("\\", "")) + else: + logging.info(" FAILURE --- OVERALL: " + str(passed_benchmark) + " passed, " + str(total_benchmark-passed_benchmark) + " failed, " + str(len(lines)-total_benchmark) + " queued --- " + lines[local_index].replace("\\", "")) + with lock: + local_index = line_index + line_index += 1 + +#Computing relative path +def ComputeDirectory(line): + configuration_name = "" + benchmark_name = "" + tokens = shlex.split(line) + for token in tokens: + if token.find("--configuration-name") != -1: + configuration_name = token[len("--configuration-name="):] + if token.find("--benchmark-name") != -1: + benchmark_name = token[len("--benchmark-name="):] + new_dir = os.path.join(abs_path, 
configuration_name, benchmark_name) + return new_dir + +#Search c files +def SearchCFiles(directory): + logging.info(" Looking for file in " + str(directory)) + files = set() + for element in os.listdir(directory): + if os.path.isdir(os.path.join(directory, element)): + files = files.union(SearchCFiles(os.path.join(directory, element))) + elif (element[-2:] == ".c") or (element[-2:] == ".C") or (element[-4:] == ".CPP") or (element[-4:] == ".cpp") or (element[-4:] == ".cxx") or (element[-3:] == ".cc") or (element[-4:] == ".c++"): + files.add(os.path.join(directory, element)) + return files + +#Collecting results +def CollectResults(directory): + #Skip if this is a leaf directory + if os.path.exists(os.path.join(directory, args.tool + "_return_value")) or os.listdir(directory) == []: + return + subdirs = [s for s in sorted(os.listdir(directory)) if os.path.isdir(os.path.join(directory,s)) and s != "panda-temp" and s != "HLS_output"] + for subdir in subdirs: + CollectResults(os.path.join(directory, subdir)) + tool_failed_output = open(os.path.join(directory, args.tool + "_failed_output"), "w") + for subdir in subdirs: + if os.path.exists(os.path.join(directory, subdir, args.tool + "_failed_output")): + tool_failed_output.write(open(os.path.join(directory, subdir, args.tool + "_failed_output")).read()) + if os.path.exists(os.path.join(directory, subdir, args.tool + "_execution_output")): + tool_failed_output.write("\n") + tool_failed_output.write("\n") + tool_failed_output.write("\n") + tool_failed_output.close() + report_file = open(os.path.join(directory, "report"), "w") + for subdir in subdirs: + if os.path.exists(os.path.join(directory, subdir, args.tool + "_return_value")): + return_value_file_name = os.path.join(directory, subdir, args.tool + "_return_value") + return_value_file = open(return_value_file_name) + return_value = return_value_file.read() + return_value_file.close() + args_file = open(os.path.join(directory, subdir, "args")) + command_args = 
args_file.readlines()[0] + command_args = command_args.replace(abs_benchmarks_root + "/", "") + args_file.close() + if return_value == "0": + tool_results_file_name = os.path.join(directory, subdir, args.tool + "_results") + if os.path.exists(tool_results_file_name): + report_file.write("SUCCESS (" + open(tool_results_file_name).read() + " cycles) " + command_args.replace("\\", "")) + else: + report_file.write("SUCCESS: " + command_args.replace("\\", "")) + else: + if return_value == "124": + report_file.write("FAILURE(Timeout): " + command_args.replace("\\", "")) + else: + report_file.write("FAILURE: " + command_args.replace("\\", "")) + report_file.write("\n") + elif os.path.exists(os.path.join(directory, subdir, "report")): + local_report_file = open(os.path.join(directory, subdir, "report")) + report_file.write(local_report_file.read()) + local_report_file.close() + report_file.close() + if args.tool == "bambu": + local_args = "" + named_list_name = os.path.join(abs_path, "named_list") + lines = open(named_list_name).readlines() + for line in lines: + local_dir = ComputeDirectory(line) + if os.path.exists(os.path.join(local_dir, args.tool + "_results_0.xml")): + local_args = local_args + " " + os.path.join(local_dir, args.tool + "_results_0.xml") + if len(local_args) > 0: + #Generate experimental setup xml + experimental_setup_file_name = os.path.join(abs_path, "experimental_setup.xml") + temp_list = open(experimental_setup_file_name, "w") + bambu_version_file_name = os.path.join(abs_path, "bambu_version") + bambu_version_file = open(bambu_version_file_name, "w") + bambu_version_command = [tool_exe] + bambu_version_command.extend(shlex.split("--version")) + subprocess.call(bambu_version_command, stdout=bambu_version_file) + bambu_version_file.close() + bambu_version_file = open(bambu_version_file_name, "r") + bambu_version = bambu_version_file.readlines()[-2].rstrip() + bambu_version_file.close() + if args.commonargs != None: + bambu_arguments = ' '.join(' 
'.join(map(str,l)) for l in args.commonargs) + else: + bambu_arguments = "" + temp_list.write("\n") + temp_list.write("\n") + temp_list.write(" \n") + temp_list.write(" \n") + temp_list.write("