From b483dda58796cb6a6984c818d0cf2f2bf501ce0e Mon Sep 17 00:00:00 2001
From: Andrey Malyshev <elvin.nnov@gmail.com>
Date: Mon, 12 Apr 2021 12:58:19 +0300
Subject: [PATCH] Fix Metal accuracy problem caused by <dtype>3 vectors usage

Taking the float3 datatype as an example:
Using the float3 data type to load data concurrently into a dense array shared
between all threads in a Metal threadgroup can lead to a data race between threads.
The float3 datatype has a size and alignment equal to 16 bytes, while the kernel
assumes it copies 12 bytes to arbitrary, unaligned locations.
Using the packed_float3 datatype solves the issue.
---
 src/target/source/codegen_metal.cc | 11 +++++++++++
 1 file changed, 11 insertions(+)
diff --git a/src/target/source/codegen_metal.cc b/src/target/source/codegen_metal.cc
index c95d578df686..270d81f218ea 100644
--- a/src/target/source/codegen_metal.cc
+++ b/src/target/source/codegen_metal.cc
@@ -178,6 +178,17 @@ void CodeGenMetal::PrintType(DataType t, std::ostream& os) {  // NOLINT(*)
   }
   bool fail = false;
   if (t.is_float()) {
+    // Care is needed with the size and alignment of half3/float3 because the TIR representation
+    // might not be aware of Metal half3/float3 details and can treat them as just three elements,
+    // while the size and alignment of half3/float3 are one element larger (half3 - 8 bytes,
+    // float3 - 16 bytes).
+    // Example of a problematic pattern: threads concurrently filling a packed threadgroup array
+    // using float3 elements can lead to a data race and wrong data in the shared array.
+    // packed_(half3/float3) are exactly the datatypes that hold 3 elements with per-element
+    // alignment.
+    if (lanes == 3) {
+      os << "packed_";
+    }
     switch (t.bits()) {
       case 16:
         os << "half";