DynamoRIO · AssadHashmi · Jul 3, 2019 · Jun 28, 2019 · Jun 28, 2019 · Jun 28, 2019
diff --git a/core/arch/aarch64/codec.txt b/core/arch/aarch64/codec.txt
@@ -756,7 +756,7 @@ x0110111xxxxxxxxxxxxxxxxxxxxxxxx  tbnz   tbz
 0x001100000000000010xxxxxxxxxxxx  st1    memvm : vmsz vt0 vt1 vt2 vt3
 0x001100000000000100xxxxxxxxxxxx  st3    memvm : vmsz vt0 vt1 vt2
 0x001100000000000110xxxxxxxxxxxx  st1    memvm : vmsz vt0 vt1 vt2
-0x001100000000000111xxxxxxxxxxxx  st1    memvm : vmsz vt0
+0x001100000000000111xxxxxxxxxxxx  st1    memvm : vt0 vmsz
 0x001100000000001000xxxxxxxxxxxx  st2    memvm : vmsz vt0 vt1
 0x001100000000001010xxxxxxxxxxxx  st1    memvm : vmsz vt0 vt1
 

diff --git a/core/arch/aarch64/instr.c b/core/arch/aarch64/instr.c
@@ -335,6 +335,12 @@ reg_is_gpr(reg_id_t reg)
     return (DR_REG_X0 <= reg && reg <= DR_REG_WSP);
 }
 
+bool
+reg_is_simd(reg_id_t reg)
+{
+    return (DR_REG_Q0 <= reg && reg <= DR_REG_B31);
+}
+
 bool
 reg_is_opmask(reg_id_t reg)
 {

diff --git a/core/arch/aarch64/instr_create.h b/core/arch/aarch64/instr_create.h
@@ -1796,6 +1796,33 @@
 #define INSTR_CREATE_fnmsub_scalar(dc, Rd, Rm, Rn, Ra) \
     instr_create_1dst_3src(dc, OP_fnmsub, Rd, Rm, Rn, Ra)
 
+/* Advanced SIMD (NEON) memory instructions */
+
+/**
+ * Creates an Advanced SIMD (NEON) LD1 instruction to load multiple
+ * single element structures to one vector register, e.g. LD1 {V0.4H},[X0]
+ * \param dc      The void * dcontext used to allocate memory for the instr_t.
+ * \param q       The destination vector register operand.
+ * \param r       The source memory operand.
+ * \param s       The size of the vector element.
+ */
+#define INSTR_CREATE_ld1_multi_1(dc, q, r, s) instr_create_1dst_2src(dc, OP_ld1, q, r, s)
+
+/**
+ * Creates an Advanced SIMD (NEON) ST1 instruction to store multiple
+ * single element structures from one vector register, e.g. ST1 {V1.2S},[X1]
+ * \param dc      The void * dcontext used to allocate memory for the instr_t.
+ * \param r       The destination memory operand.
+ * \param q       The source vector register operand.
+ * \param s       The size of the vector element.
+ */
+#define INSTR_CREATE_st1_multi_1(dc, r, q, s) instr_create_1dst_2src(dc, OP_st1, r, q, s)
+
+/* TODO: Remaining advanced SIMD (NEON) memory instructions:
+#define INSTR_CREATE_ld2/3/4_multi_2/3/4()
+#define INSTR_CREATE_ld1/2/3/4_single()
+and st1 equivalents including post-index variants */
+
 /* -------- SVE bitwise logical operations (predicated) ---------------- */
 
 /**

diff --git a/suite/tests/api/dis-a64.txt b/suite/tests/api/dis-a64.txt
@@ -74,7 +74,7 @@
 0b9f13ff : add    wzr, wzr, wzr, asr #4   : add    %wzr %wzr asr $0x04 -> %wzr
 0c0007ff : st4    {v31.4h, v0.4h, v1.4h, v2.4h}, [sp]: st4    $0x01 %d31 %d0 %d1 %d2 -> (%sp)[32byte]
 0c0067ff : st1    {v31.4h, v0.4h, v1.4h}, [sp]: st1    $0x01 %d31 %d0 %d1 -> (%sp)[24byte]
-0c0077ff : st1    {v31.4h}, [sp]          : st1    $0x01 %d31 -> (%sp)[8byte]
+0c0077ff : st1    {v31.4h}, [sp]          : st1    %d31 $0x01 -> (%sp)[8byte]
 0c00a7ff : st1    {v31.4h, v0.4h}, [sp]   : st1    $0x01 %d31 %d0 -> (%sp)[16byte]
 0c4027ff : ld1    {v31.4h, v0.4h, v1.4h, v2.4h}, [sp]: ld1    (%sp)[32byte] $0x01 -> %d31 %d0 %d1 %d2
 0c4047ff : ld3    {v31.4h, v0.4h, v1.4h}, [sp]: ld3    (%sp)[24byte] $0x01 -> %d31 %d0 %d1

diff --git a/suite/tests/api/ir_aarch64.c b/suite/tests/api/ir_aarch64.c
@@ -2480,6 +2480,142 @@ test_asimdsame(void *dc)
     test_instr_encoding(dc, OP_bif, instr);
 }
 
+static void
+test_asimd_mem(void *dc)
+{
+    byte *pc;
+    instr_t *instr;
+
+    /* Advanced SIMD memory (multiple structures) */
+
+    /* Load multiple 1-element structures (to 1, 2, 3 or 4 registers)
+       Naming convention based on official mnemonics:
+       INSTR_CREATE_ld1_multi_<n>() where <n> is 1, 2, 3 or 4
+
+       LD1 { <Vt>.<T> }, [<Xn|SP>]
+       LD1 { <Vt>.<T>, <Vt2>.<T> }, [<Xn|SP>]
+       LD1 { <Vt>.<T>, <Vt2>.<T>, <Vt3>.<T> }, [<Xn|SP>]
+       LD1 { <Vt>.<T>, <Vt2>.<T>, <Vt3>.<T>, <Vt4>.<T> }, [<Xn|SP>]
+
+       <T> is one of 8B, 16B, 4H, 8H, 2S, 4S, 1D, 2D */
+
+    /* LD1 { <Vt>.8B }, [<Xn|SP>] */
+    instr = INSTR_CREATE_ld1_multi_1(
+        dc, opnd_create_reg(DR_REG_Q0),
+        opnd_create_base_disp_aarch64(DR_REG_X0, DR_REG_NULL, 0, false, 0, 0, OPSZ_8),
+        OPND_CREATE_BYTE());
+    test_instr_encoding(dc, OP_ld1, instr);
+
+    /* LD1 { <Vt>.16B }, [<Xn|SP>] */
+    instr = INSTR_CREATE_ld1_multi_1(
+        dc, opnd_create_reg(DR_REG_Q0),
+        opnd_create_base_disp_aarch64(DR_REG_X0, DR_REG_NULL, 0, false, 0, 0, OPSZ_16),
+        OPND_CREATE_BYTE());
+    test_instr_encoding(dc, OP_ld1, instr);
+
+    /* LD1 { <Vt>.4H }, [<Xn|SP>] */
+    instr = INSTR_CREATE_ld1_multi_1(
+        dc, opnd_create_reg(DR_REG_Q0),
+        opnd_create_base_disp_aarch64(DR_REG_X0, DR_REG_NULL, 0, false, 0, 0, OPSZ_4),
+        OPND_CREATE_HALF());
+    test_instr_encoding(dc, OP_ld1, instr);
+
+    /* LD1 { <Vt>.8H }, [<Xn|SP>] */
+    instr = INSTR_CREATE_ld1_multi_1(
+        dc, opnd_create_reg(DR_REG_Q0),
+        opnd_create_base_disp_aarch64(DR_REG_X0, DR_REG_NULL, 0, false, 0, 0, OPSZ_8),
+        OPND_CREATE_HALF());
+    test_instr_encoding(dc, OP_ld1, instr);
+
+    /* LD1 { <Vt>.2S }, [<Xn|SP>] */
+    instr = INSTR_CREATE_ld1_multi_1(
+        dc, opnd_create_reg(DR_REG_Q0),
+        opnd_create_base_disp_aarch64(DR_REG_X0, DR_REG_NULL, 0, false, 0, 0, OPSZ_2),
+        OPND_CREATE_SINGLE());
+    test_instr_encoding(dc, OP_ld1, instr);
+
+    /* LD1 { <Vt>.4S }, [<Xn|SP>] */
+    instr = INSTR_CREATE_ld1_multi_1(
+        dc, opnd_create_reg(DR_REG_Q0),
+        opnd_create_base_disp_aarch64(DR_REG_X0, DR_REG_NULL, 0, false, 0, 0, OPSZ_4),
+        OPND_CREATE_SINGLE());
+    test_instr_encoding(dc, OP_ld1, instr);
+
+    /* LD1 { <Vt>.1D }, [<Xn|SP>] */
+    instr = INSTR_CREATE_ld1_multi_1(
+        dc, opnd_create_reg(DR_REG_Q0),
+        opnd_create_base_disp_aarch64(DR_REG_X0, DR_REG_NULL, 0, false, 0, 0, OPSZ_1),
+        OPND_CREATE_DOUBLE());
+    test_instr_encoding(dc, OP_ld1, instr);
+
+    /* LD1 { <Vt>.2D }, [<Xn|SP>] */
+    instr = INSTR_CREATE_ld1_multi_1(
+        dc, opnd_create_reg(DR_REG_Q0),
+        opnd_create_base_disp_aarch64(DR_REG_X0, DR_REG_NULL, 0, false, 0, 0, OPSZ_2),
+        OPND_CREATE_DOUBLE());
+    test_instr_encoding(dc, OP_ld1, instr);
+
+    /* Store multiple 1-element structures (to 1, 2, 3 or 4 registers)
+       Naming convention based on official mnemonics:
+       INSTR_CREATE_st1_multi_<n>() where <n> is 1, 2, 3 or 4
+
+       ST1 { <Vt>.<T> }, [<Xn|SP>]
+       ST1 { <Vt>.<T>, <Vt2>.<T> }, [<Xn|SP>]
+       ST1 { <Vt>.<T>, <Vt2>.<T>, <Vt3>.<T> }, [<Xn|SP>]
+       ST1 { <Vt>.<T>, <Vt2>.<T>, <Vt3>.<T>, <Vt4>.<T> }, [<Xn|SP>]
+
+       <T> is one of 8B, 16B, 4H, 8H, 2S, 4S, 1D, 2D */
+
+    /* ST1 { <Vt>.8B }, [<Xn|SP>] */
+    instr = INSTR_CREATE_st1_multi_1(
+        dc, opnd_create_base_disp_aarch64(DR_REG_X0, DR_REG_NULL, 0, false, 0, 0, OPSZ_8),
+        opnd_create_reg(DR_REG_Q0), OPND_CREATE_BYTE());
+    test_instr_encoding(dc, OP_st1, instr);
+
+    /* ST1 { <Vt>.16B }, [<Xn|SP>] */
+    instr = INSTR_CREATE_st1_multi_1(
+        dc,
+        opnd_create_base_disp_aarch64(DR_REG_X0, DR_REG_NULL, 0, false, 0, 0, OPSZ_16),
+        opnd_create_reg(DR_REG_Q0), OPND_CREATE_BYTE());
+    test_instr_encoding(dc, OP_st1, instr);
+
+    /* ST1 { <Vt>.4H }, [<Xn|SP>] */
+    instr = INSTR_CREATE_st1_multi_1(
+        dc, opnd_create_base_disp_aarch64(DR_REG_X0, DR_REG_NULL, 0, false, 0, 0, OPSZ_4),
+        opnd_create_reg(DR_REG_Q0), OPND_CREATE_HALF());
+    test_instr_encoding(dc, OP_st1, instr);
+
+    /* ST1 { <Vt>.8H }, [<Xn|SP>] */
+    instr = INSTR_CREATE_st1_multi_1(
+        dc, opnd_create_base_disp_aarch64(DR_REG_X0, DR_REG_NULL, 0, false, 0, 0, OPSZ_8),
+        opnd_create_reg(DR_REG_Q0), OPND_CREATE_HALF());
+    test_instr_encoding(dc, OP_st1, instr);
+
+    /* ST1 { <Vt>.2S }, [<Xn|SP>] */
+    instr = INSTR_CREATE_st1_multi_1(
+        dc, opnd_create_base_disp_aarch64(DR_REG_X0, DR_REG_NULL, 0, false, 0, 0, OPSZ_2),
+        opnd_create_reg(DR_REG_Q0), OPND_CREATE_SINGLE());
+    test_instr_encoding(dc, OP_st1, instr);
+
+    /* ST1 { <Vt>.4S }, [<Xn|SP>] */
+    instr = INSTR_CREATE_st1_multi_1(
+        dc, opnd_create_base_disp_aarch64(DR_REG_X0, DR_REG_NULL, 0, false, 0, 0, OPSZ_4),
+        opnd_create_reg(DR_REG_Q0), OPND_CREATE_SINGLE());
+    test_instr_encoding(dc, OP_st1, instr);
+
+    /* ST1 { <Vt>.1D }, [<Xn|SP>] */
+    instr = INSTR_CREATE_st1_multi_1(
+        dc, opnd_create_base_disp_aarch64(DR_REG_X0, DR_REG_NULL, 0, false, 0, 0, OPSZ_1),
+        opnd_create_reg(DR_REG_Q0), OPND_CREATE_DOUBLE());
+    test_instr_encoding(dc, OP_st1, instr);
+
+    /* ST1 { <Vt>.2D }, [<Xn|SP>] */
+    instr = INSTR_CREATE_st1_multi_1(
+        dc, opnd_create_base_disp_aarch64(DR_REG_X0, DR_REG_NULL, 0, false, 0, 0, OPSZ_2),
+        opnd_create_reg(DR_REG_Q0), OPND_CREATE_DOUBLE());
+    test_instr_encoding(dc, OP_st1, instr);
+}
+
 static void
 test_floatdp1(void *dc)
 {
@@ -3854,6 +3990,9 @@ main(int argc, char *argv[])
     test_asimdsame(dcontext);
     print("test_asimdsame complete\n");
 
+    test_asimd_mem(dcontext);
+    print("test_asimd_mem complete\n");
+
     test_floatdp1(dcontext);
     print("test_floatdp1 complete\n");
 

diff --git a/suite/tests/api/ir_aarch64.expect b/suite/tests/api/ir_aarch64.expect
@@ -481,6 +481,23 @@ bit    %q21 %q12 -> %q12
 bif    %d3 %d3 -> %d20
 bif    %q3 %q3 -> %q20
 test_asimdsame complete
+ld1    (%x0)[8byte] $0x00 -> %q0
+ld1    (%x0)[16byte] $0x00 -> %q0
+ld1    (%x0)[4byte] $0x01 -> %q0
+ld1    (%x0)[8byte] $0x01 -> %q0
+ld1    (%x0)[2byte] $0x02 -> %q0
+ld1    (%x0)[4byte] $0x02 -> %q0
+ld1    (%x0)[1byte] $0x03 -> %q0
+ld1    (%x0)[2byte] $0x03 -> %q0
+st1    %q0 $0x00 -> (%x0)[8byte]
+st1    %q0 $0x00 -> (%x0)[16byte]
+st1    %q0 $0x01 -> (%x0)[4byte]
+st1    %q0 $0x01 -> (%x0)[8byte]
+st1    %q0 $0x02 -> (%x0)[2byte]
+st1    %q0 $0x02 -> (%x0)[4byte]
+st1    %q0 $0x03 -> (%x0)[1byte]
+st1    %q0 $0x03 -> (%x0)[2byte]
+test_asimd_mem complete
 fmov   %d27 -> %d2
 fmov   %s27 -> %s2
 fmov   %h27 -> %h2