Skip to content

Commit

Permalink
#485 first part, don't parallelize setZero
Browse files Browse the repository at this point in the history
  • Loading branch information
mratsim committed Jan 3, 2021
1 parent d8d986d commit 69efb08
Show file tree
Hide file tree
Showing 2 changed files with 13 additions and 9 deletions.
21 changes: 13 additions & 8 deletions src/arraymancer/laser/tensor/initialization.nim
Original file line number Diff line number Diff line change
Expand Up @@ -166,14 +166,19 @@ proc setZero*[T](t: var Tensor[T], check_contiguous: static bool = true) =
when not (T is KnownSupportsCopyMem):
t.storage.raw_buffer.reset()
t.storage.raw_buffer.setLen(t.size)
else:
omp_parallel_chunks(
t.size, chunk_offset, chunk_size,
OMP_MEMORY_BOUND_GRAIN_SIZE * 4):
zeroMem(
t.unsafe_raw_offset[chunk_offset].addr,
chunk_size * sizeof(T)
)
else: # If setZero or newTensor is used inside an OpenMP parallel region,
      # parallelizing this zeroing would kill performance.
# omp_parallel_chunks(
# t.size, chunk_offset, chunk_size,
# OMP_MEMORY_BOUND_GRAIN_SIZE * 4):
# zeroMem(
# t.unsafe_raw_offset[chunk_offset].addr,
# chunk_size * sizeof(T)
# )
zeroMem(
t.unsafe_raw_offset[0].addr,
t.size * sizeof(T)
)

proc newTensor*[T](shape: varargs[int]): Tensor[T] =
var size: int
Expand Down
1 change: 0 additions & 1 deletion src/arraymancer/nn_primitives/nnp_linear.nim
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@ proc linear*[T](input, weight: Tensor[T], bias: Tensor[T], output: var Tensor[T]
# - bias tensor shape [1, out_features]
# Output does not need to be initialized to 0 or the proper shape, data will be overwritten
# Output is: Y = x * W.transpose + b

output = input * weight.transpose # TODO: with the transpose the non-matching rows and cols is confusing
output +.= bias

Expand Down

0 comments on commit 69efb08

Please sign in to comment.