From 89c8fe5dfd8328df9cfc352438457883e2b94b45 Mon Sep 17 00:00:00 2001
From: Javier Setoain <jsetoain@users.noreply.github.com>
Date: Thu, 30 May 2024 12:21:40 +0200
Subject: [PATCH] [aievec] Add new shuffle ops (#1516)

---
 .../aie/Dialect/AIEVec/IR/AIEVecAttributes.td |  96 +++++++++
 .../aie/Dialect/AIEVec/IR/AIEVecDialect.td    |   5 +-
 include/aie/Dialect/AIEVec/IR/AIEVecOps.h     |   6 +-
 include/aie/Dialect/AIEVec/IR/AIEVecOps.td    | 156 +++++++++++++-
 .../AIEVec/IR/AIEVecTypeConstraints.td        |   2 +
 include/aie/Dialect/AIEVec/IR/AIEVecTypes.h   |   2 +-
 include/aie/Dialect/AIEVec/IR/CMakeLists.txt  |  10 +-
 lib/Dialect/AIEVec/IR/AIEVecOps.cpp           | 122 ++++++++++-
 lib/Dialect/AIEVec/IR/CMakeLists.txt          |   1 +
 .../AIEVec/Transforms/AIEVectorize.cpp        |  10 +-
 .../Transforms/FoldMulAddChainToConvOp.cpp    |   9 +-
 .../AIEVecToCpp/TranslateAIEVecToCpp.cpp      |  34 ++-
 .../VectorToAIEVec/test-conv-op-i8-init.mlir  |   2 +-
 .../VectorToAIEVec/test-conv-op-i8.mlir       |   2 +-
 test/aievec/conv2d_i8_after_polygeist.mlir    |   2 +-
 test/aievec/conv2d_uij_i8_noinit_aie-ml.mlir  |   2 +-
 test/dialect/AIEVec/invalid.mlir              |  27 +++
 test/dialect/AIEVec/roundtrip.mlir            | 204 ++++++++++++++++++
 18 files changed, 664 insertions(+), 28 deletions(-)
 create mode 100644 include/aie/Dialect/AIEVec/IR/AIEVecAttributes.td

diff --git a/include/aie/Dialect/AIEVec/IR/AIEVecAttributes.td b/include/aie/Dialect/AIEVec/IR/AIEVecAttributes.td
new file mode 100644
index 0000000000..87f2e4f4f2
--- /dev/null
+++ b/include/aie/Dialect/AIEVec/IR/AIEVecAttributes.td
@@ -0,0 +1,96 @@
+//===- AIEVecAttributes.td - AIE vector attributes def. ----*- tablegen -*-====//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates
+//
+//===----------------------------------------------------------------------===//
+// Defines AIE vector attributes.
+//===----------------------------------------------------------------------===//
+
+#ifndef AIE_DIALECT_AIEVEC_IR_AIEVECATTRIBUTES_TD
+#define AIE_DIALECT_AIEVEC_IR_AIEVECATTRIBUTES_TD
+
+include "aie/Dialect/AIEVec/IR/AIEVecDialect.td"
+include "mlir/IR/EnumAttr.td"
+
+// Shuffle modes for shuffle ops.
+def SHUFFLE_MODE_T8_64X2_LO   : I32EnumAttrCase<"T8_64X2_LO",    0, "t8_64x2_lo">;
+def SHUFFLE_MODE_T8_64X2_HI   : I32EnumAttrCase<"T8_64X2_HI",    1, "t8_64x2_hi">;
+def SHUFFLE_MODE_T16_32X2_LO  : I32EnumAttrCase<"T16_32X2_LO",   2, "t16_32x2_lo">;
+def SHUFFLE_MODE_T16_32X2_HI  : I32EnumAttrCase<"T16_32X2_HI",   3, "t16_32x2_hi">;
+def SHUFFLE_MODE_T32_16X2_LO  : I32EnumAttrCase<"T32_16X2_LO",   4, "t32_16x2_lo">;
+def SHUFFLE_MODE_T32_16X2_HI  : I32EnumAttrCase<"T32_16X2_HI",   5, "t32_16x2_hi">;
+def SHUFFLE_MODE_T64_8X2_LO   : I32EnumAttrCase<"T64_8X2_LO",    6, "t64_8x2_lo">;
+def SHUFFLE_MODE_T64_8X2_HI   : I32EnumAttrCase<"T64_8X2_HI",    7, "t64_8x2_hi">;
+def SHUFFLE_MODE_T128_4X2_LO  : I32EnumAttrCase<"T128_4X2_LO",   8, "t128_4x2_lo">;
+def SHUFFLE_MODE_T128_4X2_HI  : I32EnumAttrCase<"T128_4X2_HI",   9, "t128_4x2_hi">;
+def SHUFFLE_MODE_T256_2X2_LO  : I32EnumAttrCase<"T256_2X2_LO",  10, "t256_2x2_lo">;
+def SHUFFLE_MODE_T256_2X2_HI  : I32EnumAttrCase<"T256_2X2_HI",  11, "t256_2x2_hi">;
+def SHUFFLE_MODE_T128_2X4_LO  : I32EnumAttrCase<"T128_2X4_LO",  12, "t128_2x4_lo">;
+def SHUFFLE_MODE_T128_2X4_HI  : I32EnumAttrCase<"T128_2X4_HI",  13, "t128_2x4_hi">;
+def SHUFFLE_MODE_T64_2X8_LO   : I32EnumAttrCase<"T64_2X8_LO",   14, "t64_2x8_lo">;
+def SHUFFLE_MODE_T64_2X8_HI   : I32EnumAttrCase<"T64_2X8_HI",   15, "t64_2x8_hi">;
+def SHUFFLE_MODE_T32_2X16_LO  : I32EnumAttrCase<"T32_2X16_LO",  16, "t32_2x16_lo">;
+def SHUFFLE_MODE_T32_2X16_HI  : I32EnumAttrCase<"T32_2X16_HI",  17, "t32_2x16_hi">;
+def SHUFFLE_MODE_T16_2X32_LO  : I32EnumAttrCase<"T16_2X32_LO",  18, "t16_2x32_lo">;
+def SHUFFLE_MODE_T16_2X32_HI  : I32EnumAttrCase<"T16_2X32_HI",  19, "t16_2x32_hi">;
+def SHUFFLE_MODE_T8_2X64_LO   : I32EnumAttrCase<"T8_2X64_LO",   20, "t8_2x64_lo">;
+def SHUFFLE_MODE_T8_2X64_HI   : I32EnumAttrCase<"T8_2X64_HI",   21, "t8_2x64_hi">;
+def SHUFFLE_MODE_T512_1X2_LO  : I32EnumAttrCase<"T512_1X2_LO",  22, "t512_1x2_lo">;
+def SHUFFLE_MODE_T512_1X2_HI  : I32EnumAttrCase<"T512_1X2_HI",  23, "t512_1x2_hi">;
+def SHUFFLE_MODE_T16_16X4_LO  : I32EnumAttrCase<"T16_16X4_LO",  24, "t16_16x4_lo">;
+def SHUFFLE_MODE_T16_16X4_HI  : I32EnumAttrCase<"T16_16X4_HI",  25, "t16_16x4_hi">;
+def SHUFFLE_MODE_T16_4X16_LO  : I32EnumAttrCase<"T16_4X16_LO",  26, "t16_4x16_lo">;
+def SHUFFLE_MODE_T16_4X16_HI  : I32EnumAttrCase<"T16_4X16_HI",  27, "t16_4x16_hi">;
+def SHUFFLE_MODE_T16_8X4      : I32EnumAttrCase<"T16_8X4",      28, "t16_8x4">;
+def SHUFFLE_MODE_T16_4X8      : I32EnumAttrCase<"T16_4X8",      29, "t16_4x8">;
+def SHUFFLE_MODE_T32_8X4_LO   : I32EnumAttrCase<"T32_8X4_LO",   30, "t32_8x4_lo">;
+def SHUFFLE_MODE_T32_8X4_HI   : I32EnumAttrCase<"T32_8X4_HI",   31, "t32_8x4_hi">;
+def SHUFFLE_MODE_T32_4X8_LO   : I32EnumAttrCase<"T32_4X8_LO",   32, "t32_4x8_lo">;
+def SHUFFLE_MODE_T32_4X8_HI   : I32EnumAttrCase<"T32_4X8_HI",   33, "t32_4x8_hi">;
+def SHUFFLE_MODE_T32_4X4      : I32EnumAttrCase<"T32_4X4",      34, "t32_4x4">;
+def SHUFFLE_MODE_T8_8X8       : I32EnumAttrCase<"T8_8X8",       35, "t8_8x8">;
+def SHUFFLE_MODE_T8_16X4      : I32EnumAttrCase<"T8_16X4",      36, "t8_16x4">;
+def SHUFFLE_MODE_T8_4X16      : I32EnumAttrCase<"T8_4X16",      37, "t8_4x16">;
+def SHUFFLE_MODE_T16_1X2_flip : I32EnumAttrCase<"T16_1X2_flip", 38, "t16_1x2_flip">;
+def SHUFFLE_MODE_T16_4X4      : I32EnumAttrCase<"T16_4X4",      39, "t16_4x4">;
+def SHUFFLE_MODE_T16_4X2      : I32EnumAttrCase<"T16_4X2",      40, "t16_4x2">;
+def SHUFFLE_MODE_T16_2X4      : I32EnumAttrCase<"T16_2X4",      41, "t16_2x4">;
+def SHUFFLE_MODE_T16_8X2      : I32EnumAttrCase<"T16_8X2",      42, "t16_8x2">;
+def SHUFFLE_MODE_T16_2X8      : I32EnumAttrCase<"T16_2X8",      43, "t16_2x8">;
+def SHUFFLE_MODE_T16_16X2     : I32EnumAttrCase<"T16_16X2",     44, "t16_16x2">;
+def SHUFFLE_MODE_T16_2X16     : I32EnumAttrCase<"T16_2X16",     45, "t16_2x16">;
+def SHUFFLE_MODE_T8_8X4       : I32EnumAttrCase<"T8_8X4",       46, "t8_8x4">;
+def SHUFFLE_MODE_T8_4X8       : I32EnumAttrCase<"T8_4X8",       47, "t8_4x8">;
+
+def ShuffleMode : I32EnumAttr<
+    "ShuffleMode",
+    "Shuffle mode for AIEVec shuffle operations",
+    [SHUFFLE_MODE_T8_64X2_LO, SHUFFLE_MODE_T8_64X2_HI, SHUFFLE_MODE_T16_32X2_LO,
+     SHUFFLE_MODE_T16_32X2_HI, SHUFFLE_MODE_T32_16X2_LO, SHUFFLE_MODE_T32_16X2_HI,
+     SHUFFLE_MODE_T64_8X2_LO, SHUFFLE_MODE_T64_8X2_HI, SHUFFLE_MODE_T128_4X2_LO,
+     SHUFFLE_MODE_T128_4X2_HI, SHUFFLE_MODE_T256_2X2_LO, SHUFFLE_MODE_T256_2X2_HI,
+     SHUFFLE_MODE_T128_2X4_LO, SHUFFLE_MODE_T128_2X4_HI, SHUFFLE_MODE_T64_2X8_LO,
+     SHUFFLE_MODE_T64_2X8_HI, SHUFFLE_MODE_T32_2X16_LO, SHUFFLE_MODE_T32_2X16_HI,
+     SHUFFLE_MODE_T16_2X32_LO, SHUFFLE_MODE_T16_2X32_HI, SHUFFLE_MODE_T8_2X64_LO,
+     SHUFFLE_MODE_T8_2X64_HI, SHUFFLE_MODE_T512_1X2_LO, SHUFFLE_MODE_T512_1X2_HI,
+     SHUFFLE_MODE_T16_16X4_LO, SHUFFLE_MODE_T16_16X4_HI, SHUFFLE_MODE_T16_4X16_LO,
+     SHUFFLE_MODE_T16_4X16_HI, SHUFFLE_MODE_T16_8X4, SHUFFLE_MODE_T16_4X8,
+     SHUFFLE_MODE_T32_8X4_LO, SHUFFLE_MODE_T32_8X4_HI, SHUFFLE_MODE_T32_4X8_LO,
+     SHUFFLE_MODE_T32_4X8_HI, SHUFFLE_MODE_T32_4X4, SHUFFLE_MODE_T8_8X8,
+     SHUFFLE_MODE_T8_16X4, SHUFFLE_MODE_T8_4X16, SHUFFLE_MODE_T16_1X2_flip,
+     SHUFFLE_MODE_T16_4X4, SHUFFLE_MODE_T16_4X2, SHUFFLE_MODE_T16_2X4,
+     SHUFFLE_MODE_T16_8X2, SHUFFLE_MODE_T16_2X8, SHUFFLE_MODE_T16_16X2,
+     SHUFFLE_MODE_T16_2X16, SHUFFLE_MODE_T8_8X4, SHUFFLE_MODE_T8_4X8]> {
+  let cppNamespace = "::xilinx::aievec";
+  let genSpecializedAttr = 0;
+}
+
+def AIEVec_ShuffleModeAttr : EnumAttr<AIEVec_Dialect, ShuffleMode, "mode"> {
+  let assemblyFormat = "`[` $value `]`";
+}
+
+#endif // AIE_DIALECT_AIEVEC_IR_AIEVECATTRIBUTES_TD
diff --git a/include/aie/Dialect/AIEVec/IR/AIEVecDialect.td b/include/aie/Dialect/AIEVec/IR/AIEVecDialect.td
index 4741fdba97..569995ec63 100644
--- a/include/aie/Dialect/AIEVec/IR/AIEVecDialect.td
+++ b/include/aie/Dialect/AIEVec/IR/AIEVecDialect.td
@@ -4,7 +4,7 @@
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
-// (c) Copyright 2022 Xilinx Inc.
+// (c) Copyright 2022-2024 Advanced Micro Devices, Inc. or its affiliates
 //
 //===----------------------------------------------------------------------===//
 // Defines AIE vector dialect.
@@ -19,6 +19,9 @@ def AIEVec_Dialect : Dialect {
   let name = "aievec";
   let summary = "Types and operations for AIE vector dialect";
   let cppNamespace = "::xilinx::aievec";
+
+  let useDefaultAttributePrinterParser = 1;
+
   let extraClassDeclaration = [{
     void registerTypes();
   }];
diff --git a/include/aie/Dialect/AIEVec/IR/AIEVecOps.h b/include/aie/Dialect/AIEVec/IR/AIEVecOps.h
index b1a26c3c43..1bffeaff19 100644
--- a/include/aie/Dialect/AIEVec/IR/AIEVecOps.h
+++ b/include/aie/Dialect/AIEVec/IR/AIEVecOps.h
@@ -4,7 +4,7 @@
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
-// (c) Copyright 2022 Xilinx Inc.
+// (c) Copyright 2022-2024 Advanced Micro Devices, Inc. or its affiliates
 //
 //===----------------------------------------------------------------------===//
 // This file defines the AIE vector dialect and the operations.
@@ -17,6 +17,10 @@
 #include "mlir/Interfaces/InferTypeOpInterface.h"
 #include "mlir/Interfaces/SideEffectInterfaces.h"
 
+#include "aie/Dialect/AIEVec/IR/AIEVecEnums.h.inc"
+#define GET_ATTRDEF_CLASSES
+#include "aie/Dialect/AIEVec/IR/AIEVecAttributes.h.inc"
+
 #include "AIEVecDialect.h"
 
 #define GET_OP_CLASSES
diff --git a/include/aie/Dialect/AIEVec/IR/AIEVecOps.td b/include/aie/Dialect/AIEVec/IR/AIEVecOps.td
index 3861dc32d6..73a5ef80b7 100644
--- a/include/aie/Dialect/AIEVec/IR/AIEVecOps.td
+++ b/include/aie/Dialect/AIEVec/IR/AIEVecOps.td
@@ -4,7 +4,7 @@
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
-// (c) Copyright 2023 AMD Inc.
+// (c) Copyright 2023-2024 Advanced Micro Devices, Inc. or its affiliates
 //
 //===----------------------------------------------------------------------===//
 // Defines AIE vector operations.
@@ -14,6 +14,7 @@
 #define AIEVEC_OPS
 
 include "aie/Dialect/AIE/IR/AIEAttrs.td"
+include "aie/Dialect/AIEVec/IR/AIEVecAttributes.td"
 include "aie/Dialect/AIEVec/IR/AIEVecTypes.td"
 include "aie/Dialect/AIEVec/IR/AIEVecTypeConstraints.td"
 
@@ -587,8 +588,8 @@ def AIEVec_ShiftOp:
   }];
 }
 
-def AIEVec_ShuffleOp:
-  AIEVec_Op<"shuffle", [
+def AIEVec_LegacyShuffleOp:
+  AIEVec_Op<"legacyshuffle", [
     Pure
   ]>,
   Arguments<(ins AnyVector:$source,
@@ -872,4 +873,153 @@ def AIEVec_MatMulOp:
   let hasVerifier = 0;
 }
 
+def AIEVec_ShuffleOp : AIEVec_Op<"shuffle",
+    [Pure, AllTypesMatch<["lhs", "result"]>,
+     OptionalTypesMatchWith<"result and rhs have the same type", "result", "rhs",
+                            "::llvm::cast<Type>($_self)">]>,
+  Arguments<(ins VectorOfBitWidthAndElementTypes<
+                      512, [I8, I16, I32, I64, I128, I256,
+                            I512, BF16, F32]>:$lhs,
+                 Optional<VectorOfBitWidthAndElementTypes<
+                      512, [I8, I16, I32, I64, I128, I256,
+                            I512, BF16, F32]>>:$rhs,
+                 AIEVec_ShuffleModeAttr:$mode)>,
+  Results<(outs AnyVector:$result)> {
+  let summary = "AIE2 shuffle";
+  let description = [{
+    AMD AIEv2-specific vector shuffle. It performs a shuffle of the elements of
+    1 or 2 input vectors using the specified shuffle mode. The shuffle mode is
+    specified as:
+
+      `t<width>_<r>x<c>(_(hi|lo))?`
+
+    where `<width>` is the bitwidth of the vector element type, `<r>` and `<c>`
+    are the number of rows and columns that will be transposed to perform the
+    shuffle, and, for modes that require two 512-bit vectors, `hi` and `lo`
+    indicate which part of the resulting extended 1024-bit vector will be
+    assembled and returned.
+
+    E.g.: `t32_4x8` would take two 512-bit vectors, `lhs` and `rhs`, with 16
+    elements of 32 bits each. The resulting vector would contain either the
+    least (`lo`) or most (`hi`) significant 16 elements of the 32 element vector
+    that would result from selecting, out of the concatenated vectors `lhs:rhs`,
+    8 blocks of 4 elements, each block taking one of every 8 elements starting
+    from the block index.
+
+    That is, for two `vector<16xi32>` operands containing:
+    ```
+    lhs = [0,   1,  2,  3, ..., 15]
+    rhs = [16, 17, 18, 19, ..., 31]
+    ```
+
+    The first 8 blocks would be:
+    ```
+    b0 = [0,  8, 16, 24]
+    b1 = [1,  9, 17, 25]
+    b2 = [2, 10, 18, 26]
+    b3 = [3, 11, 19, 27]
+       ...
+    b7 = [7, 15, 23, 31]
+    ```
+
+    `t32_4x8_lo` would return the first four blocks:
+    ```
+    result = [0, 8, 16, 24, 1, 9, 17, 25, ..., 3, 11, 19, 27]
+    ```
+
+    And `t32_4x8_hi` would return the last four blocks:
+    ```
+    result = [4, 12, 20, 28, 5, 13, 21, 29, ..., 7, 15, 23, 31]
+    ```
+
+    It can be seen as a flattened 4x8 matrix, split in two 16-element halves,
+    being transposed to an 8x4 arrangement. In the example above:
+
+    ```
+    lhs = [ 0,  1,  2,  3,  4,  5,  6,  7]
+          [ 8,  9, 10, 11, 12, 13, 14, 15]
+    rhs = [16, 17, 18, 19, 20, 21, 22, 23]
+          [24, 25, 26, 27, 28, 29, 30, 31]
+    ```
+
+    Would result in:
+    ```
+    t32_4x8_lo = [0,  8, 16, 24]
+                 [1,  9, 17, 25]
+                 [2, 10, 18, 26]
+                 [3, 11, 19, 27]
+    t32_4x8_hi = [4, 12, 20, 28]
+                 [5, 13, 21, 29]
+                 [6, 14, 22, 30]
+                 [7, 15, 23, 31]
+    ```
+
+    A special mode, `t16_1x2_flip`, swaps each pair of elements in a vector with
+    32 16-bit elements. E.g.:
+    ```
+    lhs = [0, 1, 2, 3, ..., 28, 29, 30, 31]
+    ```
+    Would result in:
+    ```
+    t16_1x2_flip = [1, 0, 3, 2, ..., 29, 28, 31, 30]
+    ```
+
+    The list of supported shuffle modes, required operands, and associated
+    vector types are the following:
+
+         Shuffle Mode       | Operands           | Types Supported
+        :------------------:|:------------------:|:------------------:
+         t8_8x4             | `lhs`              | `vector<64xi8>`
+         t8_4x8             | ^                  | ^
+         t8_8x8             | ^                  | ^
+         t8_16x4            | ^                  | ^
+         t8_4x16            | ^                  | ^
+         t8_64x2_lo         | `lhs` & `rhs`      | ^
+         t8_64x2_hi         | ^                  | ^
+         t8_2x64_lo         | ^                  | ^
+         t8_2x64_hi         | ^                  | ^
+         t16_4x2            | `lhs`              | `vector<32xi16>` or `vector<32xbf16>`
+         t16_2x4            | ^                  | ^
+         t16_4x4            | ^                  | ^
+         t16_8x2            | ^                  | ^
+         t16_2x8            | ^                  | ^
+         t16_8x4            | ^                  | ^
+         t16_4x8            | ^                  | ^
+         t16_16x2           | ^                  | ^
+         t16_2x16           | ^                  | ^
+         t16_1x2_flip       | ^                  | ^
+         t16_32x2_lo        | `lhs` & `rhs`      | ^
+         t16_32x2_hi        | ^                  | ^
+         t16_2x32_lo        | ^                  | ^
+         t16_2x32_hi        | ^                  | ^
+         t16_16x4_lo        | ^                  | ^
+         t16_16x4_hi        | ^                  | ^
+         t16_4x16_lo        | ^                  | ^
+         t16_4x16_hi        | ^                  | ^
+         t32_4x4            | `lhs`              | `vector<16xi32>` or `vector<16xf32>`
+         t32_16x2_lo        | `lhs` & `rhs`      | ^
+         t32_16x2_hi        | ^                  | ^
+         t32_2x16_lo        | ^                  | ^
+         t32_2x16_hi        | ^                  | ^
+         t32_8x4_lo         | ^                  | ^
+         t32_8x4_hi         | ^                  | ^
+         t32_4x8_lo         | ^                  | ^
+         t32_4x8_hi         | ^                  | ^
+         t64_8x2_lo         | ^                  | `vector<8xi64>`
+         t64_8x2_hi         | ^                  | ^
+         t64_2x8_lo         | ^                  | ^
+         t64_2x8_hi         | ^                  | ^
+         t128_4x2_lo        | ^                  | `vector<4xi128>`
+         t128_4x2_hi        | ^                  | ^
+         t128_2x4_lo        | ^                  | ^
+         t128_2x4_hi        | ^                  | ^
+         t256_2x2_lo        | ^                  | `vector<2xi256>`
+         t256_2x2_hi        | ^                  | ^
+         t512_1x2_lo        | ^                  | `vector<1xi512>`
+         t512_1x2_hi        | ^                  | ^
+  }];
+  let assemblyFormat = [{$lhs (`,` $rhs^)? $mode attr-dict `:` type($result)}];
+  let hasVerifier = 1;
+}
+
 #endif // AIEVEC_OPS
diff --git a/include/aie/Dialect/AIEVec/IR/AIEVecTypeConstraints.td b/include/aie/Dialect/AIEVec/IR/AIEVecTypeConstraints.td
index cf22124fc6..52ba841596 100644
--- a/include/aie/Dialect/AIEVec/IR/AIEVecTypeConstraints.td
+++ b/include/aie/Dialect/AIEVec/IR/AIEVecTypeConstraints.td
@@ -17,6 +17,8 @@ include "mlir/IR/BuiltinTypes.td"
 include "mlir/IR/OpBase.td"
 
 def I4  : I<4>;
+def I256 : I<256>;
+def I512 : I<512>;
 
 class TypeShape<string name> :
   StrFunc<"cast<::mlir::ShapedType>($" # name # ").getShape()">;
diff --git a/include/aie/Dialect/AIEVec/IR/AIEVecTypes.h b/include/aie/Dialect/AIEVec/IR/AIEVecTypes.h
index f69a9a77fc..698c6bd584 100644
--- a/include/aie/Dialect/AIEVec/IR/AIEVecTypes.h
+++ b/include/aie/Dialect/AIEVec/IR/AIEVecTypes.h
@@ -4,7 +4,7 @@
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
-// (c) Copyright 2022 Xilinx Inc.
+// (c) Copyright 2022 Advanced Micro Devices, Inc. or its affiliates
 //
 //===----------------------------------------------------------------------===//
 
diff --git a/include/aie/Dialect/AIEVec/IR/CMakeLists.txt b/include/aie/Dialect/AIEVec/IR/CMakeLists.txt
index 2d3bd90b90..91e3475555 100644
--- a/include/aie/Dialect/AIEVec/IR/CMakeLists.txt
+++ b/include/aie/Dialect/AIEVec/IR/CMakeLists.txt
@@ -3,8 +3,16 @@
 # See https://llvm.org/LICENSE.txt for license information.
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 #
-# (c) Copyright 2022 Xilinx Inc.
+# (c) Copyright 2022-2024 Advanced Micro Devices, Inc. or its affiliates
 
 add_mlir_dialect(AIEVecOps aievec)
 add_mlir_doc(AIEVecOps AIEVecDialect ./ -gen-dialect-doc -dialect=aievec)
 
+# Add AIEVec attributes
+set(LLVM_TARGET_DEFINITIONS AIEVecAttributes.td)
+mlir_tablegen(AIEVecEnums.h.inc -gen-enum-decls)
+mlir_tablegen(AIEVecEnums.cpp.inc -gen-enum-defs)
+mlir_tablegen(AIEVecAttributes.h.inc -gen-attrdef-decls)
+mlir_tablegen(AIEVecAttributes.cpp.inc -gen-attrdef-defs)
+add_public_tablegen_target(MLIRAIEVecAttributesIncGen)
+add_dependencies(mlir-generic-headers MLIRAIEVecAttributesIncGen)
\ No newline at end of file
diff --git a/lib/Dialect/AIEVec/IR/AIEVecOps.cpp b/lib/Dialect/AIEVec/IR/AIEVecOps.cpp
index b0055be060..572f1f7b61 100644
--- a/lib/Dialect/AIEVec/IR/AIEVecOps.cpp
+++ b/lib/Dialect/AIEVec/IR/AIEVecOps.cpp
@@ -4,25 +4,28 @@
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
-// (c) Copyright 2022 Xilinx Inc.
+// (c) Copyright 2022-2024 Advanced Micro Devices, Inc. or its affiliates
 //
 //===----------------------------------------------------------------------===//
 // This file implements AIE vector op printing, pasing, and verification.
 //===----------------------------------------------------------------------===//
 
-#include "aie/Dialect/AIEVec/IR/AIEVecOps.h"
-#include "aie/Dialect/AIEVec/AIEVecUtils.h"
-
 #include "mlir/Dialect/LLVMIR/LLVMTypes.h"
+#include "mlir/IR/DialectImplementation.h"
 #include "mlir/IR/OpDefinition.h"
 #include "mlir/IR/TypeUtilities.h"
 #include "mlir/Transforms/FoldUtils.h"
+#include "llvm/ADT/TypeSwitch.h"
+
+#include "aie/Dialect/AIEVec/AIEVecUtils.h"
+#include "aie/Dialect/AIEVec/IR/AIEVecOps.h"
 
 using namespace llvm;
 using namespace mlir;
 using namespace xilinx;
 using namespace xilinx::aievec;
 
+#include "aie/Dialect/AIEVec/IR/AIEVecEnums.cpp.inc"
 #include "aie/Dialect/AIEVec/IR/AIEVecOpsDialect.cpp.inc"
 
 //===----------------------------------------------------------------------===//
@@ -31,6 +34,10 @@ using namespace xilinx::aievec;
 
 void AIEVecDialect::initialize() {
   registerTypes();
+  addAttributes<
+#define GET_ATTRDEF_LIST
+#include "aie/Dialect/AIEVec/IR/AIEVecAttributes.cpp.inc"
+      >();
   addOperations<
 #define GET_OP_LIST
 #include "aie/Dialect/AIEVec/IR/AIEVecOps.cpp.inc"
@@ -1620,8 +1627,105 @@ ParseResult ShiftOp::parse(OpAsmParser &parser, OperationState &result) {
 // ShuffleOp
 //===----------------------------------------------------------------------===//
 
+// Verifies that the shuffle mode is compatible with the operands: the presence
+// of `rhs` and the element bit width of `lhs` must match what the mode expects.
+LogicalResult ShuffleOp::verify() {
+  unsigned modeBitWidth = 0u; // Defensive: set by every valid mode below.
+  bool requireRhs = true;
+  auto mode = getMode();
+  switch (mode) {
+  case ShuffleMode::T8_8X8:  // 35
+  case ShuffleMode::T8_16X4: // 36
+  case ShuffleMode::T8_4X16: // 37
+  case ShuffleMode::T8_8X4:  // 46
+  case ShuffleMode::T8_4X8:  // 47
+    requireRhs = false;
+    LLVM_FALLTHROUGH;
+  case ShuffleMode::T8_64X2_LO: //  0
+  case ShuffleMode::T8_64X2_HI: //  1
+  case ShuffleMode::T8_2X64_LO: // 20
+  case ShuffleMode::T8_2X64_HI: // 21
+    modeBitWidth = 8u;
+    break;
+  case ShuffleMode::T16_8X4:      // 28
+  case ShuffleMode::T16_4X8:      // 29
+  case ShuffleMode::T16_1X2_flip: // 38
+  case ShuffleMode::T16_4X4:      // 39
+  case ShuffleMode::T16_4X2:      // 40
+  case ShuffleMode::T16_2X4:      // 41
+  case ShuffleMode::T16_8X2:      // 42
+  case ShuffleMode::T16_2X8:      // 43
+  case ShuffleMode::T16_16X2:     // 44
+  case ShuffleMode::T16_2X16:     // 45
+    requireRhs = false;
+    LLVM_FALLTHROUGH;
+  case ShuffleMode::T16_32X2_LO: //  2
+  case ShuffleMode::T16_32X2_HI: //  3
+  case ShuffleMode::T16_2X32_LO: // 18
+  case ShuffleMode::T16_2X32_HI: // 19
+  case ShuffleMode::T16_16X4_LO: // 24
+  case ShuffleMode::T16_16X4_HI: // 25
+  case ShuffleMode::T16_4X16_LO: // 26
+  case ShuffleMode::T16_4X16_HI: // 27
+    modeBitWidth = 16u;
+    break;
+  case ShuffleMode::T32_4X4: // 34
+    requireRhs = false;
+    LLVM_FALLTHROUGH;
+  case ShuffleMode::T32_16X2_LO: //  4
+  case ShuffleMode::T32_16X2_HI: //  5
+  case ShuffleMode::T32_2X16_LO: // 16
+  case ShuffleMode::T32_2X16_HI: // 17
+  case ShuffleMode::T32_8X4_LO:  // 30
+  case ShuffleMode::T32_8X4_HI:  // 31
+  case ShuffleMode::T32_4X8_LO:  // 32
+  case ShuffleMode::T32_4X8_HI:  // 33
+    modeBitWidth = 32u;
+    break;
+  case ShuffleMode::T64_8X2_LO: //  6
+  case ShuffleMode::T64_8X2_HI: //  7
+  case ShuffleMode::T64_2X8_LO: // 14
+  case ShuffleMode::T64_2X8_HI: // 15
+    modeBitWidth = 64u;
+    break;
+  case ShuffleMode::T128_4X2_LO: //  8
+  case ShuffleMode::T128_4X2_HI: //  9
+  case ShuffleMode::T128_2X4_LO: // 12
+  case ShuffleMode::T128_2X4_HI: // 13
+    modeBitWidth = 128u;
+    break;
+  case ShuffleMode::T256_2X2_LO: // 10
+  case ShuffleMode::T256_2X2_HI: // 11
+    modeBitWidth = 256u;
+    break;
+  case ShuffleMode::T512_1X2_LO: // 22
+  case ShuffleMode::T512_1X2_HI: // 23
+    modeBitWidth = 512u;
+    break;
+  }
+
+  // Verify number of operands
+  if (requireRhs && !getRhs())
+    return emitError() << "shuffle mode '" << stringifyEnum(mode)
+                       << "' requires a second operand";
+
+  if (!requireRhs && getRhs())
+    return emitError() << "shuffle mode '" << stringifyEnum(mode)
+                       << "' does not admit a second operand";
+
+  // Verify vector element type
+  auto elemBitWidth =
+      cast<VectorType>(getLhs().getType()).getElementTypeBitWidth();
+  if (modeBitWidth != elemBitWidth)
+    return emitError() << "shuffle mode '" << stringifyEnum(mode)
+                       << "' requires vectors of " << modeBitWidth
+                       << "-bit elements";
+
+  return success();
+}
+
 // Print out Shuffle op.
-void ShuffleOp::print(OpAsmPrinter &p) {
+void LegacyShuffleOp::print(OpAsmPrinter &p) {
   // Print the source vector
   p << " " << getSource();
 
@@ -1633,7 +1737,7 @@ void ShuffleOp::print(OpAsmPrinter &p) {
 }
 
 // Verify Shuffle op.
-LogicalResult ShuffleOp::verify() {
+LogicalResult LegacyShuffleOp::verify() {
   // Verify the types
   VectorType sourceType = llvm::dyn_cast<VectorType>(getSource().getType());
   VectorType resultType = llvm::dyn_cast<VectorType>(getResult().getType());
@@ -1660,7 +1764,8 @@ LogicalResult ShuffleOp::verify() {
 }
 
 // Parse Shuffle op.
-ParseResult ShuffleOp::parse(OpAsmParser &parser, OperationState &result) {
+ParseResult LegacyShuffleOp::parse(OpAsmParser &parser,
+                                   OperationState &result) {
   llvm::SMLoc typesLoc;
   SmallVector<Type, 2> types;
   OpAsmParser::UnresolvedOperand source;
@@ -1886,5 +1991,8 @@ ParseResult FMAConvOp::parse(OpAsmParser &parser, OperationState &result) {
   return parseMulFMAConvOp(parser, result, true);
 }
 
+#define GET_ATTRDEF_CLASSES
+#include "aie/Dialect/AIEVec/IR/AIEVecAttributes.cpp.inc"
+
 #define GET_OP_CLASSES
 #include "aie/Dialect/AIEVec/IR/AIEVecOps.cpp.inc"
diff --git a/lib/Dialect/AIEVec/IR/CMakeLists.txt b/lib/Dialect/AIEVec/IR/CMakeLists.txt
index 632851b995..dc10d46704 100644
--- a/lib/Dialect/AIEVec/IR/CMakeLists.txt
+++ b/lib/Dialect/AIEVec/IR/CMakeLists.txt
@@ -14,6 +14,7 @@ add_mlir_dialect_library(MLIRAIEVecDialect
 
   DEPENDS
   MLIRAIEVecOpsIncGen
+  MLIRAIEVecAttributesIncGen
 
   LINK_LIBS PUBLIC
   MLIRIR
diff --git a/lib/Dialect/AIEVec/Transforms/AIEVectorize.cpp b/lib/Dialect/AIEVec/Transforms/AIEVectorize.cpp
index 19da8bf6f2..07c51cf5cb 100644
--- a/lib/Dialect/AIEVec/Transforms/AIEVectorize.cpp
+++ b/lib/Dialect/AIEVec/Transforms/AIEVectorize.cpp
@@ -739,9 +739,9 @@ static aievec::ShiftOp generateShiftOp(Value lhs, Value rhs, int32_t shiftBytes,
   return shiftOp;
 }
 
-static aievec::ShuffleOp generateShuffleOp(Value source, VectState *state,
-                                           Location loc, unsigned mode,
-                                           VectorType resType = nullptr) {
+static aievec::LegacyShuffleOp generateShuffleOp(Value source, VectState *state,
+                                                 Location loc, unsigned mode,
+                                                 VectorType resType = nullptr) {
   auto vecType = cast<VectorType>(source.getType());
 
   if (!resType) {
@@ -750,8 +750,8 @@ static aievec::ShuffleOp generateShuffleOp(Value source, VectState *state,
     resType = createVectorType(lanes, scalarType);
   }
 
-  auto shuffleOp =
-      state->builder.create<aievec::ShuffleOp>(loc, resType, source, mode);
+  auto shuffleOp = state->builder.create<aievec::LegacyShuffleOp>(loc, resType,
+                                                                  source, mode);
 
   return shuffleOp;
 }
diff --git a/lib/Dialect/AIEVec/Transforms/FoldMulAddChainToConvOp.cpp b/lib/Dialect/AIEVec/Transforms/FoldMulAddChainToConvOp.cpp
index a458b4367c..942eda66d0 100644
--- a/lib/Dialect/AIEVec/Transforms/FoldMulAddChainToConvOp.cpp
+++ b/lib/Dialect/AIEVec/Transforms/FoldMulAddChainToConvOp.cpp
@@ -447,10 +447,11 @@ struct FoldMulAddChainToConvOpPattern
                 .getResult();
       // If the filter has duplicate elements, pack them.
       if (group.bcastDist == 2)
-        grpRhs =
-            rewriter
-                .create<aievec::ShuffleOp>(loc, signalVecTy, grpRhs, /*mode=*/0)
-                .getResult();
+        // NOTE: This shuffle mode works for `vector<64xi8>`
+        grpRhs = rewriter
+                     .create<aievec::ShuffleOp>(loc, signalVecTy, grpRhs,
+                                                grpRhs, ShuffleMode::T8_64X2_LO)
+                     .getResult();
       // If the first element of the filter to be used is not 0, shift the
       // filter to align the first element to the beginning.
       if (group.bcastShift) {
diff --git a/lib/Targets/AIEVecToCpp/TranslateAIEVecToCpp.cpp b/lib/Targets/AIEVecToCpp/TranslateAIEVecToCpp.cpp
index df35d3ba88..fd71fb79bd 100644
--- a/lib/Targets/AIEVecToCpp/TranslateAIEVecToCpp.cpp
+++ b/lib/Targets/AIEVecToCpp/TranslateAIEVecToCpp.cpp
@@ -1041,6 +1041,37 @@ static LogicalResult printOperation(CppEmitter &emitter,
 // Generate the shuffle intrinsic
 static LogicalResult printOperation(CppEmitter &emitter,
                                     aievec::ShuffleOp shuffleOp) {
+  Value lhs = shuffleOp.getLhs();
+  Value rhs = shuffleOp.getRhs();
+  aievec::ShuffleMode mode = shuffleOp.getMode();
+
+  raw_indented_ostream &os = emitter.ostream();
+
+  // Generate the initialization for the result
+  if (failed(emitter.emitAssignPrefix(*shuffleOp)))
+    return failure();
+
+  os << "shuffle";
+  os << "(";
+  if (!emitter.hasValueInScope(lhs))
+    return failure();
+  os << emitter.getOrCreateName(lhs);
+  os << ", ";
+  if (rhs) {
+    if (!emitter.hasValueInScope(rhs))
+      return failure();
+    os << emitter.getOrCreateName(rhs);
+    os << ", ";
+  }
+  os << "eShuffleMode::shuffle_T" << stringifyEnum(mode).substr(1);
+  os << ")";
+
+  return success();
+}
+
+// Generate the shuffle intrinsic
+static LogicalResult printOperation(CppEmitter &emitter,
+                                    aievec::LegacyShuffleOp shuffleOp) {
   Value source = shuffleOp.getSource();
   unsigned mode = shuffleOp.getMode();
 
@@ -3233,7 +3264,8 @@ LogicalResult CppEmitter::emitOperation(Operation &op, bool trailingSemicolon) {
                 SelectOp, SRSOp, SubOp, SubElemOp, UPDOp, UPSOp, FMAElemOp,
                 MulElemOp, BroadcastOp, BroadcastScalarOp, MulConvOp, FMAConvOp,
                 ShiftOp, ShuffleOp, CastOp, MinOp, MaxOp, NegOp, CmpOp, SelOp,
-                ExtElemOp, BxorOp, BnegOp, BandOp, BorOp, UnpackOp, MatMulOp>(
+                ExtElemOp, BxorOp, BnegOp, BandOp, BorOp, UnpackOp, MatMulOp,
+                LegacyShuffleOp>(
               [&](auto op) { return printOperation(*this, op); })
           .Default([&](Operation *) {
             return op.emitOpError("unable to find printer for op");
diff --git a/test/Conversion/VectorToAIEVec/test-conv-op-i8-init.mlir b/test/Conversion/VectorToAIEVec/test-conv-op-i8-init.mlir
index 3701ae2051..27950f04ef 100644
--- a/test/Conversion/VectorToAIEVec/test-conv-op-i8-init.mlir
+++ b/test/Conversion/VectorToAIEVec/test-conv-op-i8-init.mlir
@@ -36,7 +36,7 @@ func.func @conv2d(%arg0: memref<18x288xi8>, %arg1: memref<48xi8>, %arg2: memref<
 //       CHECK:    %[[C0:.*]] = arith.constant 0 : index
 //       CHECK:    %[[T0:.*]] = aievec.upd %[[A1]][%[[C0]]] {index = 0 : i8, offset = 0 : i32} : memref<48xi8>, vector<32xi8>
 //       CHECK:    %[[T1:.*]] = aievec.concat %[[T0]], %[[T0]] : vector<32xi8>, vector<64xi8>
-//       CHECK:    %[[T2:.*]] = aievec.shuffle %[[T1]] {mode = 0 : i32} : vector<64xi8>, vector<64xi8>
+//       CHECK:    %[[T2:.*]] = aievec.shuffle %[[T1]], %[[T1]] [t8_64x2_lo] : vector<64xi8>
 //       CHECK:    affine.for %[[I:.*]] = 0 to 16 {
 //       CHECK:      affine.for %[[J:.*]] = 0 to 256 step 32 {
 //       CHECK:        %[[T3:.*]] = aievec.upd %[[A2]][%[[I]], %[[J]]] {index = 0 : i8, offset = 0 : i32} : memref<16x256xi8>, vector<32xi8>
diff --git a/test/Conversion/VectorToAIEVec/test-conv-op-i8.mlir b/test/Conversion/VectorToAIEVec/test-conv-op-i8.mlir
index 1f920f4f38..d0db180f3a 100644
--- a/test/Conversion/VectorToAIEVec/test-conv-op-i8.mlir
+++ b/test/Conversion/VectorToAIEVec/test-conv-op-i8.mlir
@@ -34,7 +34,7 @@ func.func @conv2d(%arg0: memref<18x288xi8>, %arg1: memref<48xi8>, %arg2: memref<
 //       CHECK:    %[[C0:.*]] = arith.constant 0 : index
 //       CHECK:    %[[T0:.*]] = aievec.upd %[[A1]][%[[C0]]] {index = 0 : i8, offset = 0 : i32} : memref<48xi8>, vector<32xi8>
 //       CHECK:    %[[T1:.*]] = aievec.concat %[[T0]], %[[T0]] : vector<32xi8>, vector<64xi8>
-//       CHECK:    %[[T2:.*]] = aievec.shuffle %[[T1]] {mode = 0 : i32} : vector<64xi8>, vector<64xi8>
+//       CHECK:    %[[T2:.*]] = aievec.shuffle %[[T1]], %[[T1]] [t8_64x2_lo] : vector<64xi8>
 //       CHECK:    affine.for %[[I:.*]] = 0 to 16 {
 //       CHECK:      affine.for %[[J:.*]] = 0 to 256 step 32 {
 //       CHECK:        %[[T3:.*]] = aievec.upd %[[A0]][%[[I]], %[[J]]] {index = 0 : i8, offset = 0 : i32} : memref<18x288xi8>, vector<64xi8>
diff --git a/test/aievec/conv2d_i8_after_polygeist.mlir b/test/aievec/conv2d_i8_after_polygeist.mlir
index de4949bff5..eaa5e40451 100644
--- a/test/aievec/conv2d_i8_after_polygeist.mlir
+++ b/test/aievec/conv2d_i8_after_polygeist.mlir
@@ -38,7 +38,7 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<"dlti.endianness"
 //      CHECK:    %[[C16:.*]] = arith.constant 16 : index
 //      CHECK:    %[[C0:.*]] = arith.constant 0 : index
 //      CHECK:    %[[T0:.*]] = aievec.upd %[[A1]][%[[C0:.*]]] {index = 0 : i8, offset = 0 : i32} : memref<?xi8>, vector<64xi8>
-//      CHECK:    %[[T1:.*]] = aievec.shuffle %[[T0:.*]] {mode = 0 : i32} : vector<64xi8>, vector<64xi8>
+//      CHECK:    %[[T1:.*]] = aievec.legacyshuffle %[[T0:.*]] {mode = 0 : i32} : vector<64xi8>, vector<64xi8>
 //      CHECK:    scf.for %[[A3:.*]] = %[[C0:.*]] to %[[C16:.*]] step %[[C1:.*]] {
 //      CHECK:      scf.for %[[A4:.*]] = %[[C0:.*]] to %[[C256:.*]] step %[[C32:.*]] {
 //      CHECK:        %[[T2:.*]] = aievec.upd %[[A0]][%[[A3:.*]], %[[A4:.*]]] {index = 0 : i8, offset = 0 : i32} : memref<?x288xi8>, vector<64xi8>
diff --git a/test/aievec/conv2d_uij_i8_noinit_aie-ml.mlir b/test/aievec/conv2d_uij_i8_noinit_aie-ml.mlir
index f6e7667ad5..7e5f8d90a8 100644
--- a/test/aievec/conv2d_uij_i8_noinit_aie-ml.mlir
+++ b/test/aievec/conv2d_uij_i8_noinit_aie-ml.mlir
@@ -80,7 +80,7 @@ func.func @conv2d (%A: memref<18x288xi8>, %B: memref<48xi8>, %C: memref<16x256xi
 //      CHECK:    %[[C8:.*]] = arith.constant 8 : i32
 //      CHECK:    %[[C0:.*]] = arith.constant 0 : index
 //      CHECK:    %[[T0:.*]] = aievec.upd %[[A1]][%[[C0]]] {index = 0 : i8, offset = 0 : i32} : memref<48xi8>, vector<64xi8>
-//      CHECK:    %[[T1:.*]] = aievec.shuffle %[[T0]] {mode = 0 : i32} : vector<64xi8>, vector<64xi8>
+//      CHECK:    %[[T1:.*]] = aievec.legacyshuffle %[[T0]] {mode = 0 : i32} : vector<64xi8>, vector<64xi8>
 //      CHECK:    %[[T2:.*]] = aievec.shift %[[T1]], %[[T1]], %[[C8]] {isAcc = false} : vector<64xi8>, vector<64xi8>, i32, vector<64xi8>
 //      CHECK:    %[[T3:.*]] = aievec.shift %[[T1]], %[[T1]], %[[C16_i32]] {isAcc = false} : vector<64xi8>, vector<64xi8>, i32, vector<64xi8>
 //      CHECK:    scf.for %[[A3:.*]] = %[[C0]] to %[[C16]] step %[[C1]] {
diff --git a/test/dialect/AIEVec/invalid.mlir b/test/dialect/AIEVec/invalid.mlir
index 05b274cac2..1333670e7b 100644
--- a/test/dialect/AIEVec/invalid.mlir
+++ b/test/dialect/AIEVec/invalid.mlir
@@ -37,3 +37,30 @@ func.func @invalidAccumulatorType(%A : vector<2x4xi16>, %B : vector<4x8xi16>,
                                   into vector<2x8xi32>
   return %0 : vector<2x8xi32>
 }
+
+// -----
+
+func.func @invalidShuffleModeElementType(%v : vector<32xi16>)
+            -> vector<32xi16> {
+  // expected-error @+1 {{shuffle mode 't32_4x4' requires vectors of 32-bit elements}}
+  %r = aievec.shuffle %v [t32_4x4] : vector<32xi16>
+  return %r : vector<32xi16>
+}
+
+// -----
+
+func.func @invalidShuffleModeExtraOperand(%v : vector<32xi16>)
+            -> vector<32xi16> {
+  // expected-error @+1 {{shuffle mode 't16_4x8' does not admit a second operand}}
+  %r = aievec.shuffle %v, %v [t16_4x8] : vector<32xi16>
+  return %r : vector<32xi16>
+}
+
+// -----
+
+func.func @invalidShuffleModeMissingOperand(%v : vector<32xi16>)
+            -> vector<32xi16> {
+  // expected-error @+1 {{shuffle mode 't16_16x4_lo' requires a second operand}}
+  %r = aievec.shuffle %v [t16_16x4_lo] : vector<32xi16>
+  return %r : vector<32xi16>
+}
diff --git a/test/dialect/AIEVec/roundtrip.mlir b/test/dialect/AIEVec/roundtrip.mlir
index c1e88056ed..c7c20011cb 100644
--- a/test/dialect/AIEVec/roundtrip.mlir
+++ b/test/dialect/AIEVec/roundtrip.mlir
@@ -157,3 +157,207 @@ func.func @matmul_bf16(%A : vector<4x8xbf16>, %B : vector<8x4xbf16>,
                                   into vector<4x4xf32>
   return %0 : vector<4x4xf32>
 }
+
+// -----
+
+func.func @shuffle_i8(%v : vector<64xi8>) -> vector<64xi8> {
+  // CHECK: aievec.shuffle %{{.*}} [t8_8x8]
+  %0 = aievec.shuffle %v [t8_8x8] : vector<64xi8>
+  // CHECK: aievec.shuffle %{{.*}} [t8_16x4]
+  %1 = aievec.shuffle %0 [t8_16x4] : vector<64xi8>
+  // CHECK: aievec.shuffle %{{.*}} [t8_4x16]
+  %2 = aievec.shuffle %1 [t8_4x16] : vector<64xi8>
+  // CHECK: aievec.shuffle %{{.*}} [t8_8x4]
+  %3 = aievec.shuffle %2 [t8_8x4] : vector<64xi8>
+  // CHECK: aievec.shuffle %{{.*}} [t8_4x8]
+  %4 = aievec.shuffle %3 [t8_4x8] : vector<64xi8>
+  // CHECK: aievec.shuffle %{{.*}}, %{{.*}} [t8_64x2_lo]
+  %5 = aievec.shuffle %v, %4 [t8_64x2_lo] : vector<64xi8>
+  // CHECK: aievec.shuffle %{{.*}}, %{{.*}} [t8_64x2_hi]
+  %6 = aievec.shuffle %5, %v [t8_64x2_hi] : vector<64xi8>
+  // CHECK: aievec.shuffle %{{.*}}, %{{.*}} [t8_2x64_lo]
+  %7 = aievec.shuffle %v, %6 [t8_2x64_lo] : vector<64xi8>
+  // CHECK: aievec.shuffle %{{.*}}, %{{.*}} [t8_2x64_hi]
+  %8 = aievec.shuffle %7, %v [t8_2x64_hi] : vector<64xi8>
+  return %8 : vector<64xi8>
+}
+
+// -----
+
+func.func @shuffle_i16(%v : vector<32xi16>) -> vector<32xi16> {
+  // CHECK: aievec.shuffle %{{.*}} [t16_8x4]
+  %0 = aievec.shuffle %v [t16_8x4] : vector<32xi16>
+  // CHECK: aievec.shuffle %{{.*}} [t16_4x8]
+  %1 = aievec.shuffle %0 [t16_4x8] : vector<32xi16>
+  // CHECK: aievec.shuffle %{{.*}} [t16_1x2_flip]
+  %2 = aievec.shuffle %1 [t16_1x2_flip] : vector<32xi16>
+  // CHECK: aievec.shuffle %{{.*}} [t16_4x4]
+  %3 = aievec.shuffle %2 [t16_4x4] : vector<32xi16>
+  // CHECK: aievec.shuffle %{{.*}} [t16_4x2]
+  %4 = aievec.shuffle %3 [t16_4x2] : vector<32xi16>
+  // CHECK: aievec.shuffle %{{.*}} [t16_2x4]
+  %5 = aievec.shuffle %4 [t16_2x4] : vector<32xi16>
+  // CHECK: aievec.shuffle %{{.*}} [t16_8x2]
+  %6 = aievec.shuffle %5 [t16_8x2] : vector<32xi16>
+  // CHECK: aievec.shuffle %{{.*}} [t16_2x8]
+  %7 = aievec.shuffle %6 [t16_2x8] : vector<32xi16>
+  // CHECK: aievec.shuffle %{{.*}} [t16_16x2]
+  %8 = aievec.shuffle %7 [t16_16x2] : vector<32xi16>
+  // CHECK: aievec.shuffle %{{.*}} [t16_2x16]
+  %9 = aievec.shuffle %8 [t16_2x16] : vector<32xi16>
+  // CHECK: aievec.shuffle %{{.*}}, %{{.*}} [t16_32x2_lo]
+  %10 = aievec.shuffle %v, %9 [t16_32x2_lo] : vector<32xi16>
+  // CHECK: aievec.shuffle %{{.*}}, %{{.*}} [t16_32x2_hi]
+  %11 = aievec.shuffle %10, %v [t16_32x2_hi] : vector<32xi16>
+  // CHECK: aievec.shuffle %{{.*}}, %{{.*}} [t16_2x32_lo]
+  %12 = aievec.shuffle %v, %11 [t16_2x32_lo] : vector<32xi16>
+  // CHECK: aievec.shuffle %{{.*}}, %{{.*}} [t16_2x32_hi]
+  %13 = aievec.shuffle %12, %v [t16_2x32_hi] : vector<32xi16>
+  // CHECK: aievec.shuffle %{{.*}}, %{{.*}} [t16_16x4_lo]
+  %14 = aievec.shuffle %v, %13 [t16_16x4_lo] : vector<32xi16>
+  // CHECK: aievec.shuffle %{{.*}}, %{{.*}} [t16_16x4_hi]
+  %15 = aievec.shuffle %14, %v [t16_16x4_hi] : vector<32xi16>
+  // CHECK: aievec.shuffle %{{.*}}, %{{.*}} [t16_4x16_lo]
+  %16 = aievec.shuffle %v, %15 [t16_4x16_lo] : vector<32xi16>
+  // CHECK: aievec.shuffle %{{.*}}, %{{.*}} [t16_4x16_hi]
+  %17 = aievec.shuffle %16, %v [t16_4x16_hi] : vector<32xi16>
+  return %17 : vector<32xi16>
+}
+
+// -----
+
+func.func @shuffle_bf16(%v : vector<32xbf16>) -> vector<32xbf16> {
+  // CHECK: aievec.shuffle %{{.*}} [t16_8x4]
+  %0 = aievec.shuffle %v [t16_8x4] : vector<32xbf16>
+  // CHECK: aievec.shuffle %{{.*}} [t16_4x8]
+  %1 = aievec.shuffle %0 [t16_4x8] : vector<32xbf16>
+  // CHECK: aievec.shuffle %{{.*}} [t16_1x2_flip]
+  %2 = aievec.shuffle %1 [t16_1x2_flip] : vector<32xbf16>
+  // CHECK: aievec.shuffle %{{.*}} [t16_4x4]
+  %3 = aievec.shuffle %2 [t16_4x4] : vector<32xbf16>
+  // CHECK: aievec.shuffle %{{.*}} [t16_4x2]
+  %4 = aievec.shuffle %3 [t16_4x2] : vector<32xbf16>
+  // CHECK: aievec.shuffle %{{.*}} [t16_2x4]
+  %5 = aievec.shuffle %4 [t16_2x4] : vector<32xbf16>
+  // CHECK: aievec.shuffle %{{.*}} [t16_8x2]
+  %6 = aievec.shuffle %5 [t16_8x2] : vector<32xbf16>
+  // CHECK: aievec.shuffle %{{.*}} [t16_2x8]
+  %7 = aievec.shuffle %6 [t16_2x8] : vector<32xbf16>
+  // CHECK: aievec.shuffle %{{.*}} [t16_16x2]
+  %8 = aievec.shuffle %7 [t16_16x2] : vector<32xbf16>
+  // CHECK: aievec.shuffle %{{.*}} [t16_2x16]
+  %9 = aievec.shuffle %8 [t16_2x16] : vector<32xbf16>
+  // CHECK: aievec.shuffle %{{.*}}, %{{.*}} [t16_32x2_lo]
+  %10 = aievec.shuffle %v, %9 [t16_32x2_lo] : vector<32xbf16>
+  // CHECK: aievec.shuffle %{{.*}}, %{{.*}} [t16_32x2_hi]
+  %11 = aievec.shuffle %10, %v [t16_32x2_hi] : vector<32xbf16>
+  // CHECK: aievec.shuffle %{{.*}}, %{{.*}} [t16_2x32_lo]
+  %12 = aievec.shuffle %v, %11 [t16_2x32_lo] : vector<32xbf16>
+  // CHECK: aievec.shuffle %{{.*}}, %{{.*}} [t16_2x32_hi]
+  %13 = aievec.shuffle %12, %v [t16_2x32_hi] : vector<32xbf16>
+  // CHECK: aievec.shuffle %{{.*}}, %{{.*}} [t16_16x4_lo]
+  %14 = aievec.shuffle %v, %13 [t16_16x4_lo] : vector<32xbf16>
+  // CHECK: aievec.shuffle %{{.*}}, %{{.*}} [t16_16x4_hi]
+  %15 = aievec.shuffle %14, %v [t16_16x4_hi] : vector<32xbf16>
+  // CHECK: aievec.shuffle %{{.*}}, %{{.*}} [t16_4x16_lo]
+  %16 = aievec.shuffle %v, %15 [t16_4x16_lo] : vector<32xbf16>
+  // CHECK: aievec.shuffle %{{.*}}, %{{.*}} [t16_4x16_hi]
+  %17 = aievec.shuffle %16, %v [t16_4x16_hi] : vector<32xbf16>
+  return %17 : vector<32xbf16>
+}
+
+// -----
+
+func.func @shuffle_i32(%v : vector<16xi32>) -> vector<16xi32> {
+  // CHECK: aievec.shuffle %{{.*}} [t32_4x4]
+  %0 = aievec.shuffle %v [t32_4x4] : vector<16xi32>
+  // CHECK: aievec.shuffle %{{.*}}, %{{.*}} [t32_16x2_lo]
+  %1 = aievec.shuffle %0, %v [t32_16x2_lo] : vector<16xi32>
+  // CHECK: aievec.shuffle %{{.*}}, %{{.*}} [t32_16x2_hi]
+  %2 = aievec.shuffle %v, %1 [t32_16x2_hi] : vector<16xi32>
+  // CHECK: aievec.shuffle %{{.*}}, %{{.*}} [t32_2x16_lo]
+  %3 = aievec.shuffle %2, %v [t32_2x16_lo] : vector<16xi32>
+  // CHECK: aievec.shuffle %{{.*}}, %{{.*}} [t32_2x16_hi]
+  %4 = aievec.shuffle %v, %3 [t32_2x16_hi] : vector<16xi32>
+  // CHECK: aievec.shuffle %{{.*}}, %{{.*}} [t32_8x4_lo]
+  %5 = aievec.shuffle %4, %v [t32_8x4_lo] : vector<16xi32>
+  // CHECK: aievec.shuffle %{{.*}}, %{{.*}} [t32_8x4_hi]
+  %6 = aievec.shuffle %v, %5 [t32_8x4_hi] : vector<16xi32>
+  // CHECK: aievec.shuffle %{{.*}}, %{{.*}} [t32_4x8_lo]
+  %7 = aievec.shuffle %6, %v [t32_4x8_lo] : vector<16xi32>
+  // CHECK: aievec.shuffle %{{.*}}, %{{.*}} [t32_4x8_hi]
+  %8 = aievec.shuffle %v, %7 [t32_4x8_hi] : vector<16xi32>
+  return %8 : vector<16xi32>
+}
+
+// -----
+
+func.func @shuffle_f32(%v : vector<16xf32>) -> vector<16xf32> {
+  // CHECK: aievec.shuffle %{{.*}} [t32_4x4]
+  %0 = aievec.shuffle %v [t32_4x4] : vector<16xf32>
+  // CHECK: aievec.shuffle %{{.*}}, %{{.*}} [t32_16x2_lo]
+  %1 = aievec.shuffle %0, %v [t32_16x2_lo] : vector<16xf32>
+  // CHECK: aievec.shuffle %{{.*}}, %{{.*}} [t32_16x2_hi]
+  %2 = aievec.shuffle %v, %1 [t32_16x2_hi] : vector<16xf32>
+  // CHECK: aievec.shuffle %{{.*}}, %{{.*}} [t32_2x16_lo]
+  %3 = aievec.shuffle %2, %v [t32_2x16_lo] : vector<16xf32>
+  // CHECK: aievec.shuffle %{{.*}}, %{{.*}} [t32_2x16_hi]
+  %4 = aievec.shuffle %v, %3 [t32_2x16_hi] : vector<16xf32>
+  // CHECK: aievec.shuffle %{{.*}}, %{{.*}} [t32_8x4_lo]
+  %5 = aievec.shuffle %4, %v [t32_8x4_lo] : vector<16xf32>
+  // CHECK: aievec.shuffle %{{.*}}, %{{.*}} [t32_8x4_hi]
+  %6 = aievec.shuffle %v, %5 [t32_8x4_hi] : vector<16xf32>
+  // CHECK: aievec.shuffle %{{.*}}, %{{.*}} [t32_4x8_lo]
+  %7 = aievec.shuffle %6, %v [t32_4x8_lo] : vector<16xf32>
+  // CHECK: aievec.shuffle %{{.*}}, %{{.*}} [t32_4x8_hi]
+  %8 = aievec.shuffle %v, %7 [t32_4x8_hi] : vector<16xf32>
+  return %8 : vector<16xf32>
+}
+
+// -----
+
+func.func @shuffle_i64(%v : vector<8xi64>) -> vector<8xi64> {
+  // CHECK: aievec.shuffle %{{.*}}, %{{.*}} [t64_8x2_lo]
+  %0 = aievec.shuffle %v, %v [t64_8x2_lo] : vector<8xi64>
+  // CHECK: aievec.shuffle %{{.*}}, %{{.*}} [t64_8x2_hi]
+  %1 = aievec.shuffle %0, %v [t64_8x2_hi] : vector<8xi64>
+  // CHECK: aievec.shuffle %{{.*}}, %{{.*}} [t64_2x8_lo]
+  %2 = aievec.shuffle %v, %1 [t64_2x8_lo] : vector<8xi64>
+  // CHECK: aievec.shuffle %{{.*}}, %{{.*}} [t64_2x8_hi]
+  %3 = aievec.shuffle %2, %v [t64_2x8_hi] : vector<8xi64>
+  return %3 : vector<8xi64>
+}
+
+// -----
+
+func.func @shuffle_i128(%v : vector<4xi128>) -> vector<4xi128> {
+  // CHECK: aievec.shuffle %{{.*}}, %{{.*}} [t128_4x2_lo]
+  %0 = aievec.shuffle %v, %v [t128_4x2_lo] : vector<4xi128>
+  // CHECK: aievec.shuffle %{{.*}}, %{{.*}} [t128_4x2_hi]
+  %1 = aievec.shuffle %0, %v [t128_4x2_hi] : vector<4xi128>
+  // CHECK: aievec.shuffle %{{.*}}, %{{.*}} [t128_2x4_lo]
+  %2 = aievec.shuffle %v, %1 [t128_2x4_lo] : vector<4xi128>
+  // CHECK: aievec.shuffle %{{.*}}, %{{.*}} [t128_2x4_hi]
+  %3 = aievec.shuffle %2, %v [t128_2x4_hi] : vector<4xi128>
+  return %3 : vector<4xi128>
+}
+
+// -----
+
+func.func @shuffle_i256(%v : vector<2xi256>) -> vector<2xi256> {
+  // CHECK: aievec.shuffle %{{.*}}, %{{.*}} [t256_2x2_lo]
+  %0 = aievec.shuffle %v, %v [t256_2x2_lo] : vector<2xi256>
+  // CHECK: aievec.shuffle %{{.*}}, %{{.*}} [t256_2x2_hi]
+  %1 = aievec.shuffle %0, %v [t256_2x2_hi] : vector<2xi256>
+  return %1 : vector<2xi256>
+}
+
+// -----
+
+func.func @shuffle_i512(%v : vector<1xi512>) -> vector<1xi512> {
+  // CHECK: aievec.shuffle %{{.*}}, %{{.*}} [t512_1x2_lo]
+  %0 = aievec.shuffle %v, %v [t512_1x2_lo] : vector<1xi512>
+  // CHECK: aievec.shuffle %{{.*}}, %{{.*}} [t512_1x2_hi]
+  %1 = aievec.shuffle %0, %v [t512_1x2_hi] : vector<1xi512>
+  return %1 : vector<1xi512>
+}