diff --git a/CMakeLists.txt b/CMakeLists.txt
index a34af1e3a2..c977f4056b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -180,6 +180,8 @@ find_library(ELF_LIB elf)
 cmake_dependent_option(AIE_ENABLE_AIRBIN
   "Enables emitting AIRBIN ELF binaries." OFF "ELF_LIB" OFF)
 
+# MSVC only: link the static runtime library (the Debug variant in Debug builds).
+set(CMAKE_MSVC_RUNTIME_LIBRARY "MultiThreaded$<$<CONFIG:Debug>:Debug>")
 
 add_flag_if_supported("-Werror=sign-compare" WERROR_SIGN_COMPARE)
 add_flag_if_supported("-Werror=unused" WERROR_USED)
diff --git a/aie_runtime_lib/AIE/aiesim/CMakeLists.txt b/aie_runtime_lib/AIE/aiesim/CMakeLists.txt
index b62f624bad..ac39f4e417 100644
--- a/aie_runtime_lib/AIE/aiesim/CMakeLists.txt
+++ b/aie_runtime_lib/AIE/aiesim/CMakeLists.txt
@@ -7,7 +7,6 @@
 
 
 set(INSTALLS
-    Makefile
     genwrapper_for_ps.cpp)
 
 
diff --git a/aie_runtime_lib/AIE/aiesim/Makefile b/aie_runtime_lib/AIE/aiesim/Makefile
deleted file mode 100644
index 722af0b6b6..0000000000
--- a/aie_runtime_lib/AIE/aiesim/Makefile
+++ /dev/null
@@ -1,55 +0,0 @@
-# Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
-# # SPDX-License-Identifier: MIT
-
-#
-# From one diretory up, you can invoke the simulator by typing
-# > make -C sim
-#
-# If you want to change the host source file, you can redefine host:
-# > make -C sim host=../yourhost.cpp
-#
-# Note: The host file location is relative to the <sim> folder or can 
-#       be an absolute path
-#
-
-MLIR_AIE_INSTALL = $(dir $(shell which aie-opt))/..
-
-ifeq ($(host),) 
-host:=../../test.cpp
-endif
-ifeq ($(MLIR_AIE_SRC_DIR),) 
-MLIR_AIE_SRC_DIR:=. 
-endif
-ifeq ($(XILINX_VITIS_AIETOOLS),)
-XILINX_VITIS_AIETOOLS = $(dir $(shell which aiesimulator))/..
-endif
-
-MLIR_AIE_PROJ = $(notdir $(patsubst %/,%,$(dir $(patsubst %/,%,$(dir $(abspath $(lastword $(MAKEFILE_LIST))))))))
-
-.PHONY: all link sim clean
-.NOTPARALLEL:
-all: sim
-
-CC_ENV := (export LD_LIBRARY_PATH=${XILINX_VITIS_AIETOOLS}/lib/lnx64.o:$(LD_LIBRARY_PATH))
-CC := "${XILINX_VITIS_AIETOOLS}/tps/lnx64/gcc/bin/g++" 
-CC_ARGS := -fPIC -fpermissive -c -std=c++17 -D__AIEARCH__=10 -DAIE_OPTION_SCALAR_FLOAT_ON_VECTOR -Wno-deprecated-declarations -DSC_INCLUDE_DYNAMIC_PROCESSES -D__AIESIM__ -D__PS_INIT_AIE__ -DXAIE_DEBUG -Og -flto -D main\(...\)=ps_main\(...\) -I${XILINX_VITIS_AIETOOLS}/include -I${XILINX_VITIS_AIETOOLS}/include/drivers/aiengine -I${XILINX_HLS}/include -I${XILINX_VITIS_AIETOOLS}/tps/lnx64/gcc/include/c++/8.3.0 -I${XILINX_VITIS_AIETOOLS}/tps/lnx64/gcc/include/c++/8.3.0/backward -I${XILINX_VITIS_AIETOOLS}/tps/lnx64/gcc/include/c++/8.3.0/x86_64-pc-linux-gnu -I${XILINX_VITIS_AIETOOLS}/data/osci_systemc/include -I. -I$(MLIR_AIE_SRC_DIR) -I${XILINX_VITIS_AIETOOLS}/include/xtlm/include -I${XILINX_VITIS_AIETOOLS}/include/common_cpp/common_cpp_v1_0/include -I${MLIR_AIE_INSTALL}/runtime_lib/x86_64/test_lib/include  -I../../ -I../
-
-ps/test.o: $(host)
-	$(CC_ENV);$(CC) $(CC_ARGS) -o $@ $<
-
-ps/test_library.o: ${MLIR_AIE_INSTALL}/../runtime_lib/test_lib/test_library.cpp
-	$(CC_ENV);$(CC) $(CC_ARGS) -o $@ $<
-
-ps/genwrapper_for_ps.o: ps/genwrapper_for_ps.cpp
-	$(CC_ENV);$(CC) $(CC_ARGS) -o $@ $<
-
-ps/ps.so: ps/genwrapper_for_ps.o ps/test.o ps/test_library.o $(eval PATH:=$(XILINX_VITIS_AIETOOLS)/tps/lnx64/gcc/bin/:$(PATH))
-	(${XILINX_VITIS_AIETOOLS}/tps/lnx64/gcc/bin/g++ -o "ps/ps.so" ps/genwrapper_for_ps.o ps/test.o ps/test_library.o -Wl,--as-needed -shared -lxaiengine -lxioutils -ladf_api -lsystemc -lxtlm -flto -L ${XILINX_VITIS_AIETOOLS}/lib/lnx64.o -L${XILINX_VITIS_AIETOOLS}/data/osci_systemc/lib/lnx64)
-
-link: ps/ps.so
-
-sim: ps/ps.so
-	cd ../..; aiesimulator --pkg-dir=./$(MLIR_AIE_PROJ)/sim --dump-vcd foo
-
-clean:
-	(rm -rf ps/*.o ps/*.so *.log aiesimulator_output *vcd)
diff --git a/aie_runtime_lib/AIE2/aiesim/CMakeLists.txt b/aie_runtime_lib/AIE2/aiesim/CMakeLists.txt
index f47215010e..977fdccc7c 100644
--- a/aie_runtime_lib/AIE2/aiesim/CMakeLists.txt
+++ b/aie_runtime_lib/AIE2/aiesim/CMakeLists.txt
@@ -7,7 +7,6 @@
 
 
 set(INSTALLS
-    Makefile
     genwrapper_for_ps.cpp)
 
 
diff --git a/aie_runtime_lib/AIE2/aiesim/Makefile b/aie_runtime_lib/AIE2/aiesim/Makefile
deleted file mode 100644
index 1e7c476b7a..0000000000
--- a/aie_runtime_lib/AIE2/aiesim/Makefile
+++ /dev/null
@@ -1,55 +0,0 @@
-# Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
-# # SPDX-License-Identifier: MIT
-
-#
-# From one diretory up, you can invoke the simulator by typing
-# > make -C sim
-#
-# If you want to change the host source file, you can redefine host:
-# > make -C sim host=../yourhost.cpp
-#
-# Note: The host file location is relative to the <sim> folder or can 
-#       be an absolute path
-#
-
-MLIR_AIE_INSTALL = $(dir $(shell which aie-opt))/..
-
-ifeq ($(host),) 
-host:=../../test.cpp
-endif
-ifeq ($(MLIR_AIE_SRC_DIR),) 
-MLIR_AIE_SRC_DIR:=. 
-endif
-ifeq ($(XILINX_VITIS_AIETOOLS),)
-XILINX_VITIS_AIETOOLS = $(dir $(shell which aiesimulator))/..
-endif
-
-MLIR_AIE_PROJ = $(notdir $(patsubst %/,%,$(dir $(patsubst %/,%,$(dir $(abspath $(lastword $(MAKEFILE_LIST))))))))
-
-.PHONY: all link sim clean
-.NOTPARALLEL:
-all: sim
-
-CC_ENV := (export LD_LIBRARY_PATH=${XILINX_VITIS_AIETOOLS}/lib/lnx64.o:$(LD_LIBRARY_PATH))
-CC := "${XILINX_VITIS_AIETOOLS}/tps/lnx64/gcc/bin/g++" 
-CC_ARGS := -fPIC -fpermissive -c -std=c++17 -D__AIEARCH__=20 -DAIE_OPTION_SCALAR_FLOAT_ON_VECTOR -DAIE2_FP32_EMULATION_ACCURACY_FAST -Wno-deprecated-declarations -DSC_INCLUDE_DYNAMIC_PROCESSES -D__AIESIM__ -D__PS_INIT_AIE__ -DXAIE_DEBUG -Og -flto -D main\(...\)=ps_main\(...\) -I${XILINX_VITIS_AIETOOLS}/include -I${XILINX_VITIS_AIETOOLS}/include/drivers/aiengine -I${XILINX_HLS}/include -I${XILINX_VITIS_AIETOOLS}/tps/lnx64/gcc/include/c++/8.3.0 -I${XILINX_VITIS_AIETOOLS}/tps/lnx64/gcc/include/c++/8.3.0/backward -I${XILINX_VITIS_AIETOOLS}/tps/lnx64/gcc/include/c++/8.3.0/x86_64-pc-linux-gnu -I${XILINX_VITIS_AIETOOLS}/data/osci_systemc/include -I. -I$(MLIR_AIE_SRC_DIR) -I${XILINX_VITIS_AIETOOLS}/include/xtlm/include -I${XILINX_VITIS_AIETOOLS}/include/common_cpp/common_cpp_v1_0/include -I${MLIR_AIE_INSTALL}/runtime_lib/x86_64/test_lib/include  -I../../ -I../
-
-ps/test.o: $(host)
-	$(CC_ENV);$(CC) $(CC_ARGS) -o $@ $<
-
-ps/test_library.o: ${MLIR_AIE_INSTALL}/../runtime_lib/test_lib/test_library.cpp
-	$(CC_ENV);$(CC) $(CC_ARGS) -o $@ $<
-
-ps/genwrapper_for_ps.o: ps/genwrapper_for_ps.cpp 
-	$(CC_ENV);$(CC) $(CC_ARGS) -o $@ $<
-
-ps/ps.so: ps/genwrapper_for_ps.o ps/test.o ps/test_library.o $(eval PATH:=$(XILINX_VITIS_AIETOOLS)/tps/lnx64/gcc/bin/:$(PATH))
-	(${XILINX_VITIS_AIETOOLS}/tps/lnx64/gcc/bin/g++ -o "ps/ps.so" ps/genwrapper_for_ps.o ps/test.o ps/test_library.o -Wl,--as-needed -shared -lxaiengine -lxioutils -ladf_api -lsystemc -lxtlm -flto -L ${XILINX_VITIS_AIETOOLS}/lib/lnx64.o -L${XILINX_VITIS_AIETOOLS}/data/osci_systemc/lib/lnx64)
-
-link: ps/ps.so
-
-sim: ps/ps.so
-	cd ../..; aiesimulator --pkg-dir=./$(MLIR_AIE_PROJ)/sim --dump-vcd foo
-
-clean:
-	(rm -rf ps/*.o ps/*.so *.log aiesimulator_output *vcd)
diff --git a/cmake/toolchainFiles/toolchain_x86_64.cmake b/cmake/toolchainFiles/toolchain_x86_64.cmake
index 6961a860f6..b0fe79758f 100644
--- a/cmake/toolchainFiles/toolchain_x86_64.cmake
+++ b/cmake/toolchainFiles/toolchain_x86_64.cmake
@@ -1,4 +1,7 @@
-# Copyright (C) 2018-2022, Xilinx Inc. All rights reserved.
-# Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
-# SPDX-License-Identifier: MIT
+#
+# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+# (c) Copyright 2018-2024 Advanced Micro Devices, Inc. or its affiliates
 
diff --git a/docs/Building.md b/docs/Building.md
index c32ffc09e8..07b237eb09 100644
--- a/docs/Building.md
+++ b/docs/Building.md
@@ -118,6 +118,11 @@ and llvm.
     source utils/env_setup.sh <mlir-aie>/install <llvm dir>/install
     ```
 
+Note that when coming back to this install with a fresh environment, you must rerun the `utils/env_setup.sh` script to set up your environment, and also activate the Python virtual environment using the following command:
+```
+source sandbox/bin/activate
+```
+
 ## Building on X86 targetting the VCK5000
 
 In order to build and run on PCIe cards, you first have to build and install the aie-rt library. We chose to install the library in /opt/xaiengine but it is not required for the tools to be installed there. Just ensure that when building mlir-aie and mlir-air, that you point to the directory in which the aie-rt library was installed.
@@ -147,6 +152,8 @@ Then, set `${ROCM_ROOT}` to the ROCm install from the previous path. Then, run t
 
 The PCIe AIR runtime requires the use of the [AIR PCIe kernel driver](https://github.com/Xilinx/ROCm-air-platforms/tree/main/driver). The driver directory in the [ROCm-air-platforms](https://github.com/Xilinx/ROCm-air-platforms) repository contains documentation on how to compile and load the AIR PCIe kernel driver.
 
+After this is complete, refer back to Step 5 of `Building on X86 for mlir-aie development` to set up the rest of your environment.
+
 ### Sysroot
 Since the AIE tools are cross-compiling, in order to actually compile code, we need a 'sysroot' directory,
 containing an ARM rootfs.  This rootfs must match what will be available in the runtime environment.
diff --git a/lib/Dialect/AIE/Transforms/AIEObjectFifoStatefulTransform.cpp b/lib/Dialect/AIE/Transforms/AIEObjectFifoStatefulTransform.cpp
index 806275a8e9..00a9a89137 100644
--- a/lib/Dialect/AIE/Transforms/AIEObjectFifoStatefulTransform.cpp
+++ b/lib/Dialect/AIE/Transforms/AIEObjectFifoStatefulTransform.cpp
@@ -13,7 +13,7 @@
 #include "aie/Dialect/AIE/IR/AIEDialect.h"
 #include "aie/Dialect/AIE/Transforms/AIEPasses.h"
 
-#include "mlir/Analysis/SliceAnalysis.h"
+#include "mlir/Analysis/TopologicalSortUtils.h"
 #include "mlir/Dialect/Arith/IR/Arith.h"
 #include "mlir/Dialect/MemRef/IR/MemRef.h"
 #include "mlir/Dialect/SCF/IR/SCF.h"
diff --git a/lib/Dialect/AIEVec/Transforms/VectorToVectorConversions.cpp b/lib/Dialect/AIEVec/Transforms/VectorToVectorConversions.cpp
index 42cf201824..0ab77bcde7 100644
--- a/lib/Dialect/AIEVec/Transforms/VectorToVectorConversions.cpp
+++ b/lib/Dialect/AIEVec/Transforms/VectorToVectorConversions.cpp
@@ -4,7 +4,7 @@
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
-// (c) Copyright 2023, Advanced Micro Devices, Inc.
+// (c) Copyright 2023-2024 Advanced Micro Devices, Inc.
 //
 //===----------------------------------------------------------------------===//
 // This file contains conversions and rewrites to the Vector dialect to make
@@ -39,6 +39,55 @@ using namespace xilinx::aievec;
 //================== Common AIE canonicalization analysis ====================//
 //============================================================================//
 
+// Returns true if `op` is an additive contraction whose three innermost
+// iterators are (parallel, parallel, reduction) and whose innermost indexing
+// maps are A:(m, k), B:(n, k), C:(m, n), i.e. a gemm with `rhs` transposed.
+static bool isGemmBTransposedContractionOp(vector::ContractionOp op) {
+  if (op.getKind() != vector::CombiningKind::ADD)
+    return false;
+
+  // The accumulator is a scalar when every dimension is reduced; that is
+  // never gemm-like, and it must not reach a `cast<ShapedType>`.
+  auto accTy = dyn_cast<ShapedType>(op.getAccType());
+  if (!accTy || op.getLhsType().getRank() < 2 ||
+      op.getRhsType().getRank() < 2 || accTy.getRank() < 2)
+    return false;
+
+  // Check that the innermost iterators match gemm-like iterators
+  SmallVector<vector::IteratorType> iterators = op.getIteratorTypesArray();
+  if (iterators.size() < 3)
+    return false;
+  auto innerMostIterators =
+      SmallVector<vector::IteratorType>(iterators.end() - 3, iterators.end());
+  if (vector::IteratorType::parallel != innerMostIterators[0] ||
+      vector::IteratorType::parallel != innerMostIterators[1] ||
+      vector::IteratorType::reduction != innerMostIterators[2])
+    return false;
+
+  // Keep the two innermost results of each operand's indexing map. Each map
+  // is sliced from its own end, so operands of different ranks are handled.
+  SmallVector<AffineMap, 4> indexingMaps(op.getIndexingMapsArray());
+  auto innerLhsMap = indexingMaps[0].getMinorSubMap(2);
+  auto innerRhsMap = indexingMaps[1].getMinorSubMap(2);
+  auto innerAccMap = indexingMaps[2].getMinorSubMap(2);
+
+  // Check whether they conform to a "transposed B" gemm
+  auto ctx = op.getContext();
+  auto mmAidxMap =
+      AffineMap::getPermutationMap(ArrayRef<unsigned>{1, 0, 2}, ctx)
+          .dropResults(0);
+  auto mmBidxMap =
+      AffineMap::getPermutationMap(ArrayRef<unsigned>{0, 1, 2}, ctx)
+          .dropResults(0);
+  auto mmCidxMap =
+      AffineMap::getPermutationMap(ArrayRef<unsigned>{2, 0, 1}, ctx)
+          .dropResults(0);
+  int64_t numOuterMostDims = indexingMaps[0].getNumDims() - 3;
+  return innerLhsMap == mmAidxMap.shiftDims(numOuterMostDims) &&
+         innerRhsMap == mmBidxMap.shiftDims(numOuterMostDims) &&
+         innerAccMap == mmCidxMap.shiftDims(numOuterMostDims);
+}
+
 //============================================================================//
 //============ Common AIE canonicalization conversion patterns ===============//
 //============================================================================//
@@ -411,6 +460,107 @@ struct FlattenMultDimTransferWritePattern
   }
 };
 
+// Rewrites a contraction accepted by `isGemmBTransposedContractionOp` so that
+// the implicit transposition of its `rhs` operand becomes an explicit
+// `vector.transpose`. If `rhs` comes from a widening op (`extf`/`extsi`/
+// `extui`), the transpose is hoisted above it and the result is re-widened.
+struct ExtractTransposeFromContractionOp
+    : public OpConversionPattern<vector::ContractionOp> {
+  using OpConversionPattern<vector::ContractionOp>::OpConversionPattern;
+  // Returns `vecTy` with its two innermost dimensions swapped.
+  static VectorType getTransposedVectorType(VectorType vecTy) {
+    SmallVector<int64_t> shape{vecTy.getShape()};
+    auto nDim = shape.size();
+    int64_t dimNm1 = shape[nDim - 1];
+    shape[nDim - 1] = shape[nDim - 2];
+    shape[nDim - 2] = dimNm1;
+    auto elemTy = vecTy.getElementType();
+    return VectorType::get(shape, elemTy);
+  }
+
+  LogicalResult
+  matchAndRewrite(vector::ContractionOp contractOp, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    if (!isGemmBTransposedContractionOp(contractOp))
+      return failure();
+
+    Location loc = contractOp.getLoc();
+    auto ctx = rewriter.getContext();
+
+    Value rhsVal = adaptor.getRhs();
+    VectorType rhsVecTy = contractOp.getRhsType();
+    Type rhsElemTy = rhsVecTy.getElementType();
+    // If rhs is produced by an ext op, transpose that op's input instead.
+    bool doExtF = false, doExtSI = false, doExtUI = false;
+    if (auto extfRhsOp = rhsVal.getDefiningOp<arith::ExtFOp>()) {
+      rhsVal = extfRhsOp.getIn();
+      rhsVecTy = cast<VectorType>(rhsVal.getType());
+      doExtF = true;
+    } else if (auto extsiRhsOp = rhsVal.getDefiningOp<arith::ExtSIOp>()) {
+      rhsVal = extsiRhsOp.getIn();
+      rhsVecTy = cast<VectorType>(rhsVal.getType());
+      doExtSI = true;
+    } else if (auto extuiRhsOp = rhsVal.getDefiningOp<arith::ExtUIOp>()) {
+      rhsVal = extuiRhsOp.getIn();
+      rhsVecTy = cast<VectorType>(rhsVal.getType());
+      doExtUI = true;
+    }
+    // Build the permutation that swaps the two innermost rhs dimensions.
+    int64_t nDim = rhsVecTy.getShape().size();
+    SmallVector<int64_t> rhsPermutation;
+    for (int64_t i = 0; i < nDim - 2; i++)
+      rhsPermutation.push_back(i);
+    rhsPermutation.push_back(nDim - 1);
+    rhsPermutation.push_back(nDim - 2);
+    auto transpRhsVecTy = getTransposedVectorType(rhsVecTy);
+    rhsVal = rewriter
+                 .create<vector::TransposeOp>(loc, transpRhsVecTy, rhsVal,
+                                              rhsPermutation)
+                 .getResult();
+    // Re-create the widening op, if any, on the transposed value.
+    if (doExtF)
+      rhsVal =
+          rewriter
+              .create<arith::ExtFOp>(
+                  loc, VectorType::get(transpRhsVecTy.getShape(), rhsElemTy),
+                  rhsVal)
+              .getOut();
+    if (doExtSI)
+      rhsVal =
+          rewriter
+              .create<arith::ExtSIOp>(
+                  loc, VectorType::get(transpRhsVecTy.getShape(), rhsElemTy),
+                  rhsVal)
+              .getOut();
+    if (doExtUI)
+      rhsVal =
+          rewriter
+              .create<arith::ExtUIOp>(
+                  loc, VectorType::get(transpRhsVecTy.getShape(), rhsElemTy),
+                  rhsVal)
+              .getOut();
+
+    SmallVector<AffineMap, 4> oldIdxMaps(contractOp.getIndexingMapsArray());
+    // Compose the rhs map with a permutation swapping the last two dims.
+    nDim = oldIdxMaps[1].getNumDims();
+    SmallVector<int64_t> innerDimPerm;
+    for (int64_t i = 0; i < nDim - 2; i++)
+      innerDimPerm.push_back(i);
+    innerDimPerm.push_back(nDim - 1);
+    innerDimPerm.push_back(nDim - 2);
+    auto transpPermMap = AffineMap::getPermutationMap(innerDimPerm, ctx);
+
+    auto newIdxMaps = rewriter.getAffineMapArrayAttr(
+        {oldIdxMaps[0], oldIdxMaps[1].compose(transpPermMap), oldIdxMaps[2]});
+
+    rewriter.replaceOpWithNewOp<vector::ContractionOp>(
+        contractOp, contractOp.getResult().getType(), adaptor.getLhs(), rhsVal,
+        adaptor.getAcc(), newIdxMaps, contractOp.getIteratorTypes());
+
+    return success();
+  }
+};
+
 //============================================================================//
 //============ AIEML canonicalization conversion patterns ===============//
 //============================================================================//
@@ -470,6 +620,10 @@ static void configureAIEMLCanonicalizeLegalizations(ConversionTarget &target,
       [](vector::TransferWriteOp op) {
         return cast<VectorType>(op.getVector().getType()).getRank() < 2;
       });
+  target.addDynamicallyLegalOp<vector::ContractionOp>(
+      [](vector::ContractionOp op) {
+        return !isGemmBTransposedContractionOp(op);
+      });
 }
 
 static void
@@ -477,8 +631,9 @@ populateAIEMLCanonicalizeConversionPatterns(RewritePatternSet &patterns,
                                             TargetBackend backend) {
   patterns.add<SplitUnalignedTransferReadPattern>(patterns.getContext(), 1024,
                                                   256);
-  patterns.add<FlattenMultDimTransferReadPattern,
-               FlattenMultDimTransferWritePattern>(patterns.getContext());
+  patterns
+      .add<ExtractTransposeFromContractionOp, FlattenMultDimTransferReadPattern,
+           FlattenMultDimTransferWritePattern>(patterns.getContext());
 }
 
 //============================================================================//
diff --git a/programming_examples/ml/resnet/layers_conv2_x/aie.mlir b/programming_examples/ml/resnet/layers_conv2_x/aie.mlir
index 103cbbbcbe..236fe53140 100755
--- a/programming_examples/ml/resnet/layers_conv2_x/aie.mlir
+++ b/programming_examples/ml/resnet/layers_conv2_x/aie.mlir
@@ -6,7 +6,7 @@
 //===----------------------------------------------------------------------===//
 
 module {
-aie.device(npu) {
+aie.device(npu1_3col) {
 
   //shim
   %tile00 = aie.tile(0, 0)
diff --git a/programming_examples/ml/resnet/layers_conv2_x/aie2.py b/programming_examples/ml/resnet/layers_conv2_x/aie2.py
index 79ee207026..94f5888512 100755
--- a/programming_examples/ml/resnet/layers_conv2_x/aie2.py
+++ b/programming_examples/ml/resnet/layers_conv2_x/aie2.py
@@ -580,7 +580,7 @@ def core_body():
 
                 @core(cores[i][1], "conv2dk3.o")
                 def core_body():
-                    scale = 11
+                    scale = 1
                     for _ in for_(sys.maxsize):
 
                         # acquire weights and rtps once
@@ -697,7 +697,7 @@ def core_body():
 
                 @core(cores[i][3], "conv2dk3.o")
                 def core_body():
-                    scale = 11
+                    scale = 1
                     for _ in for_(sys.maxsize):
 
                         # acquire weights and rtps once
@@ -927,33 +927,24 @@ def core_body():
             )
             def sequence(inputFromL3, weightsFromL3, outputToL3):
 
-                # for c, col in enumerate(rtp_name):
-                #     for r, row in enumerate(col):
-                #         NpuWriteRTPOp(row, col=c, row=r + 2, index=0, value=1)  # scale
-
-                # NpuWriteRTPOp("rtpComputeTile05", col=0, row=4, index=1, value=0)
-                # NpuWriteRTPOp("rtpComputeTile05", col=0, row=4, index=2, value=1)
-
-                # NpuWriteRTPOp("rtpComputeTile13", col=1, row=3, index=1, value=0)
-
-                # NpuWriteRTPOp("rtpComputeTile24", col=2, row=4, index=1, value=0)
-
-                # #     # write RTP parameters
-                # npuWriteRTPOp(
-                #     "rtpComputeTile02", col=0, row=2, index=0, value=1
-                # )  # scale
-                # npuWriteRTPOp(
-                #     "rtpComputeTile03", col=0, row=3, index=0, value=1
-                # )  # scale
-                # npuWriteRTPOp(
-                #     "rtpComputeTile05", col=0, row=5, index=0, value=1
-                # )  # scale
-                # npuWriteRTPOp(
-                #     "rtpComputeTile04", col=0, row=4, index=0, value=1
-                # )  # scale: conv1x1 with the same scale as the input so we match the scaling factor of output after conv1x1 and the initial input
-                # npuWriteRTPOp(
-                #     "rtpComputeTile04", col=0, row=4, index=1, value=0
-                # )  # skip_scale
+                NpuWriteRTPOp("rtpComputeTile02", col=0, row=2, index=0, value=1)
+                NpuWriteRTPOp("rtpComputeTile03", col=0, row=3, index=0, value=1)
+                NpuWriteRTPOp("rtpComputeTile04", col=0, row=5, index=0, value=1)
+                NpuWriteRTPOp("rtpComputeTile05", col=0, row=4, index=0, value=1)
+                NpuWriteRTPOp("rtpComputeTile05", col=0, row=4, index=1, value=0)
+                NpuWriteRTPOp("rtpComputeTile05", col=0, row=4, index=2, value=1)
+
+                NpuWriteRTPOp("rtpComputeTile15", col=1, row=5, index=0, value=1)
+                NpuWriteRTPOp("rtpComputeTile14", col=1, row=4, index=0, value=1)
+                NpuWriteRTPOp("rtpComputeTile12", col=1, row=2, index=0, value=1)
+                NpuWriteRTPOp("rtpComputeTile13", col=1, row=3, index=0, value=1)
+                NpuWriteRTPOp("rtpComputeTile13", col=1, row=3, index=1, value=0)
+
+                NpuWriteRTPOp("rtpComputeTile22", col=2, row=2, index=0, value=1)
+                NpuWriteRTPOp("rtpComputeTile23", col=2, row=3, index=0, value=1)
+                NpuWriteRTPOp("rtpComputeTile25", col=2, row=5, index=0, value=1)
+                NpuWriteRTPOp("rtpComputeTile24", col=2, row=4, index=0, value=1)
+                NpuWriteRTPOp("rtpComputeTile24", col=2, row=4, index=1, value=0)
 
                 npu_dma_memcpy_nd(
                     metadata="act1_00_02_01",
diff --git a/programming_examples/ml/resnet/layers_conv2_x/run_makefile.lit b/programming_examples/ml/resnet/layers_conv2_x/run_makefile.lit
index c6bf5b4886..6097345491 100644
--- a/programming_examples/ml/resnet/layers_conv2_x/run_makefile.lit
+++ b/programming_examples/ml/resnet/layers_conv2_x/run_makefile.lit
@@ -1,8 +1,7 @@
 // (c) Copyright 2024 Advanced Micro Devices, Inc.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
-// This test is disabled due to random failures on gihub CI
-// REQUIRES: ryzen_ai, chess, torch, has_random_failures
+// REQUIRES: ryzen_ai, chess, torch
 //
 // RUN: make -f %S/Makefile clean
 // RUN: make -f %S/Makefile 
diff --git a/python/compiler/aiecc/cl_arguments.py b/python/compiler/aiecc/cl_arguments.py
index cbfb5edcd3..4473cb54ca 100644
--- a/python/compiler/aiecc/cl_arguments.py
+++ b/python/compiler/aiecc/cl_arguments.py
@@ -239,6 +239,12 @@ def parse_args(args=None):
         const=True,
         help="Generate xclbin",
     )
+    parser.add_argument(
+        "--xclbin-input",
+        dest="xclbin_input",
+        default=None,
+        help="Generate kernel into existing xclbin file",
+    )
     parser.add_argument(
         "--link_against_hsa",
         dest="link_against_hsa",
diff --git a/python/compiler/aiecc/main.py b/python/compiler/aiecc/main.py
index 62dd87190b..be72ee3363 100644
--- a/python/compiler/aiecc/main.py
+++ b/python/compiler/aiecc/main.py
@@ -21,6 +21,7 @@
 import tempfile
 from textwrap import dedent
 import time
+import uuid
 
 from aie.extras.runtime.passes import Pipeline
 
@@ -207,7 +208,8 @@ def emit_partition(mlir_module_str, kernel_id="0x901", start_columns=None):
         else:
             start_columns = list(range(1, 6 - num_cols))
 
-    uuid = random.randint(2222, 9999)
+    # Generate a uuid
+    pdi_uuid = uuid.uuid4()
     return {
         "aie_partition": {
             "name": "QoS",
@@ -220,7 +222,7 @@ def emit_partition(mlir_module_str, kernel_id="0x901", start_columns=None):
             },
             "PDIs": [
                 {
-                    "uuid": "00000000-0000-0000-0000-00000000" + str(uuid),
+                    "uuid": str(pdi_uuid),
                     "file_name": "./design.pdi",
                     "cdo_groups": [
                         {
@@ -589,7 +591,25 @@ async def process_xclbin_gen(self):
 
         # fmt: off
         await self.do_call(task, ["bootgen", "-arch", "versal", "-image", self.prepend_tmp("design.bif"), "-o", self.prepend_tmp("design.pdi"), "-w"])
-        await self.do_call(task, ["xclbinutil", "--add-replace-section", "MEM_TOPOLOGY:JSON:" + self.prepend_tmp("mem_topology.json"), "--add-kernel", self.prepend_tmp("kernels.json"), "--add-replace-section", "AIE_PARTITION:JSON:" + self.prepend_tmp("aie_partition.json"), "--force", "--output", opts.xclbin_name])
+        if opts.xclbin_input:
+            await self.do_call(task, ["xclbinutil",
+                                      "--dump-section", "AIE_PARTITION:JSON:" + self.prepend_tmp("aie_input_partition.json"),
+                                      "--force", "--input", opts.xclbin_input])
+            with open(self.prepend_tmp("aie_input_partition.json")) as f:
+                input_partition = json.load(f)
+            with open(self.prepend_tmp("aie_partition.json")) as f:
+                new_partition = json.load(f)
+            input_partition["aie_partition"]["PDIs"].append(new_partition["aie_partition"]["PDIs"][0])
+            with open(self.prepend_tmp("aie_partition.json"), "w") as f:
+                json.dump(input_partition, f, indent=2)
+            flag = ['--input', opts.xclbin_input]
+        else:
+            flag = ["--add-replace-section", "MEM_TOPOLOGY:JSON:" + self.prepend_tmp("mem_topology.json")]
+
+        await self.do_call(task, ["xclbinutil"] + flag +
+                                 ["--add-kernel", self.prepend_tmp("kernels.json"),
+                                  "--add-replace-section", "AIE_PARTITION:JSON:" + self.prepend_tmp("aie_partition.json"),
+                                  "--force", "--output", opts.xclbin_name])
         # fmt: on
 
     async def process_host_cgen(self, aie_target, file_with_addresses):
@@ -801,7 +821,6 @@ def make_sim_dir(x):
             "test_lib",
             "include",
         )
-        sim_makefile = os.path.join(runtime_simlib_path, "Makefile")
         sim_genwrapper = os.path.join(runtime_simlib_path, "genwrapper_for_ps.cpp")
         file_physical = self.prepend_tmp("input_physical.mlir")
         memory_allocator = os.path.join(
@@ -893,8 +912,6 @@ def make_sim_dir(x):
                 ],
             )
         )
-        processes.append(self.do_call(task, ["cp", sim_makefile, sim_dir]))
-        processes.append(self.do_call(task, ["cp", sim_genwrapper, sim_ps_dir]))
         processes.append(
             self.do_call(
                 task,
@@ -905,7 +922,7 @@ def make_sim_dir(x):
                     "-shared",
                     "-o",
                     os.path.join(sim_ps_dir, "ps.so"),
-                    os.path.join(runtime_simlib_path, "genwrapper_for_ps.cpp"),
+                    sim_genwrapper,
                     *aie_target_defines(aie_target),
                     *host_opts,
                     *sim_cc_args,
diff --git a/test/dialect/AIEVec/precanonicalization-aieml.mlir b/test/dialect/AIEVec/precanonicalization-aieml.mlir
index e4e1004a0c..14b557b49b 100644
--- a/test/dialect/AIEVec/precanonicalization-aieml.mlir
+++ b/test/dialect/AIEVec/precanonicalization-aieml.mlir
@@ -85,3 +85,84 @@ func.func @multidim_vector_transfer(%in : memref<64x64x32x8xbf16>,
   return
 }
 
+//
+// -----
+//
+
+// CHECK: #[[IDXMAPA:.*]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d2, d0, d3, d5)>
+// CHECK: #[[IDXMAPB:.*]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d2, d1, d5, d4)>
+// CHECK: #[[IDXMAPC:.*]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d0, d3, d4)>
+#map1 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d2, d0, d3, d5)>
+#map2 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d2, d1, d4, d5)>
+#map3 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d0, d3, d4)>
+
+// CHECK-LABEL: func.func @vector_contract_permuted_b(
+// CHECK-SAME: %[[VA:[a-zA-Z0-9]+]]: vector<1x1x4x8xbf16>,
+// CHECK-SAME: %[[VB:[a-zA-Z0-9]+]]: vector<1x1x4x8xbf16>,
+// CHECK-SAME: %[[VC:[a-zA-Z0-9]+]]: vector<1x1x4x4xf32>
+func.func @vector_contract_permuted_b(%A : vector<1x1x4x8xbf16>,
+                                      %B : vector<1x1x4x8xbf16>,
+                                      %C : vector<1x1x4x4xf32>)
+                                    -> vector<1x1x4x4xf32> {
+  // CHECK: %[[TRB:.*]] = vector.transpose %[[VB]], [0, 1, 3, 2] :
+  // CHECK-SAME:                 vector<1x1x4x8xbf16> to vector<1x1x8x4xbf16>
+  // CHECK: %[[RES:.*]] = vector.contract {
+  // CHECK-SAME:    indexing_maps = [#[[IDXMAPA]], #[[IDXMAPB]], #[[IDXMAPC]]],
+  // CHECK-SAME:    iterator_types = ["parallel", "parallel", "reduction",
+  // CHECK-SAME:                      "parallel", "parallel", "reduction"],
+  // CHECK-SAME:    kind = #vector.kind<add>}
+  // CHECK-SAME:    %[[VA]], %[[TRB]], %[[VC]] :
+  // CHECK-SAME:          vector<1x1x4x8xbf16>, vector<1x1x8x4xbf16>
+  // CHECK-SAME:          into vector<1x1x4x4xf32>
+  %res = vector.contract {
+              indexing_maps = [#map1, #map2, #map3],
+              iterator_types = ["parallel", "parallel", "reduction",
+                                "parallel", "parallel", "reduction"],
+              kind = #vector.kind<add>} %A, %B, %C :
+              vector<1x1x4x8xbf16>, vector<1x1x4x8xbf16> into vector<1x1x4x4xf32>
+  return %res : vector<1x1x4x4xf32>
+}
+
+//
+// -----
+// Same contraction with extf-widened inputs: transpose precedes the rhs extf.
+
+// CHECK: #[[IDXMAPA:.*]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d2, d0, d3, d5)>
+// CHECK: #[[IDXMAPB:.*]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d2, d1, d5, d4)>
+// CHECK: #[[IDXMAPC:.*]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d0, d3, d4)>
+#map1 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d2, d0, d3, d5)>
+#map2 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d2, d1, d4, d5)>
+#map3 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d0, d3, d4)>
+
+// CHECK-LABEL: func.func @vector_contract_permuted_b_with_extf(
+// CHECK-SAME: %[[VA:[a-zA-Z0-9]+]]: vector<1x1x4x8xbf16>,
+// CHECK-SAME: %[[VB:[a-zA-Z0-9]+]]: vector<1x1x4x8xbf16>,
+// CHECK-SAME: %[[VC:[a-zA-Z0-9]+]]: vector<1x1x4x4xf32>
+func.func @vector_contract_permuted_b_with_extf(%A : vector<1x1x4x8xbf16>,
+                                                %B : vector<1x1x4x8xbf16>,
+                                                %C : vector<1x1x4x4xf32>)
+                                              -> vector<1x1x4x4xf32> {
+  // CHECK: %[[LHS:.*]] = arith.extf %[[VA]] :
+  // CHECK-SAME:                 vector<1x1x4x8xbf16> to vector<1x1x4x8xf32>
+  // CHECK: %[[TRB:.*]] = vector.transpose %[[VB]], [0, 1, 3, 2] :
+  // CHECK-SAME:                 vector<1x1x4x8xbf16> to vector<1x1x8x4xbf16>
+  // CHECK: %[[RHS:.*]] = arith.extf %[[TRB]] :
+  // CHECK-SAME:                 vector<1x1x8x4xbf16> to vector<1x1x8x4xf32>
+  // CHECK: %[[RES:.*]] = vector.contract {
+  // CHECK-SAME:    indexing_maps = [#[[IDXMAPA]], #[[IDXMAPB]], #[[IDXMAPC]]],
+  // CHECK-SAME:    iterator_types = ["parallel", "parallel", "reduction",
+  // CHECK-SAME:                      "parallel", "parallel", "reduction"],
+  // CHECK-SAME:    kind = #vector.kind<add>}
+  // CHECK-SAME:    %[[LHS]], %[[RHS]], %[[VC]] :
+  // CHECK-SAME:          vector<1x1x4x8xf32>, vector<1x1x8x4xf32>
+  // CHECK-SAME:          into vector<1x1x4x4xf32>
+  %lhs = arith.extf %A : vector<1x1x4x8xbf16> to vector<1x1x4x8xf32>
+  %rhs = arith.extf %B : vector<1x1x4x8xbf16> to vector<1x1x4x8xf32>
+  %res = vector.contract {
+              indexing_maps = [#map1, #map2, #map3],
+              iterator_types = ["parallel", "parallel", "reduction",
+                                "parallel", "parallel", "reduction"],
+              kind = #vector.kind<add>} %lhs, %rhs, %C :
+              vector<1x1x4x8xf32>, vector<1x1x4x8xf32> into vector<1x1x4x4xf32>
+  return %res : vector<1x1x4x4xf32>
+}
\ No newline at end of file
diff --git a/test/npu-xrt/add_one_two/aie1.mlir b/test/npu-xrt/add_one_two/aie1.mlir
new file mode 100644
index 0000000000..676dda4305
--- /dev/null
+++ b/test/npu-xrt/add_one_two/aie1.mlir
@@ -0,0 +1,53 @@
+//===- aie1.mlir -----------------------------------------------*- MLIR -*-===//
+//
+// Copyright (C) 2023, Advanced Micro Devices, Inc.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+module {
+  aie.device(npu1_1col) {
+    %t00 = aie.tile(0, 0)
+    %t01 = aie.tile(0, 1)
+    %t02 = aie.tile(0, 2)
+    // Input: 16xi32 objects (0,0)->(0,1), linked onward as 8xi32 objects to (0,2).
+    aie.objectfifo @objFifo_in0(%t00, {%t01}, 2 : i32) : !aie.objectfifo<memref<16xi32>>
+    aie.objectfifo @objFifo_in1(%t01, {%t02}, 2 : i32) : !aie.objectfifo<memref<8xi32>>
+    aie.objectfifo.link [@objFifo_in0] -> [@objFifo_in1] ()
+    // Output: 8xi32 objects (0,2)->(0,1), linked onward as 16xi32 objects to (0,0).
+    aie.objectfifo @objFifo_out1(%t02, {%t01}, 2 : i32) : !aie.objectfifo<memref<8xi32>>
+    aie.objectfifo @objFifo_out0(%t01, {%t00}, 2 : i32) : !aie.objectfifo<memref<16xi32>>
+    aie.objectfifo.link [@objFifo_out1] -> [@objFifo_out0] ()
+    // Core on tile (0,2): 8 outer steps, each adding 1 to all 8 elements of an object.
+    aie.core(%t02) {
+      %c8 = arith.constant 8 : index
+      %c0 = arith.constant 0 : index
+      %c1 = arith.constant 1 : index
+      %c1_32 = arith.constant 1 : i32
+      // Each step acquires one input and one output object and releases both.
+      scf.for %steps = %c0 to %c8 step %c1 {
+        %subview0 = aie.objectfifo.acquire @objFifo_in1(Consume, 1) : !aie.objectfifosubview<memref<8xi32>>
+        %elem0 = aie.objectfifo.subview.access %subview0[0] : !aie.objectfifosubview<memref<8xi32>> -> memref<8xi32>
+        %subview1 = aie.objectfifo.acquire @objFifo_out1(Produce, 1) : !aie.objectfifosubview<memref<8xi32>>
+        %elem1 = aie.objectfifo.subview.access %subview1[0] : !aie.objectfifosubview<memref<8xi32>> -> memref<8xi32>
+        scf.for %arg3 = %c0 to %c8 step %c1 {
+            %0 = memref.load %elem0[%arg3] : memref<8xi32>
+            %1 = arith.addi %0, %c1_32 : i32
+            memref.store %1, %elem1[%arg3] : memref<8xi32>
+        }
+        aie.objectfifo.release @objFifo_in1(Consume, 1)
+        aie.objectfifo.release @objFifo_out1(Produce, 1)
+      }
+      aie.end
+    }
+    func.func @sequence(%in : memref<64xi32>, %buf : memref<32xi32>, %out : memref<64xi32>) {
+      %c0 = arith.constant 0 : i64
+      %c1 = arith.constant 1 : i64
+      %c64 = arith.constant 64 : i64
+      aiex.npu.dma_memcpy_nd (0, 0, %out[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c64][%c0,%c0,%c0]) { metadata = @objFifo_out0, id = 1 : i64 } : memref<64xi32>
+      aiex.npu.dma_memcpy_nd (0, 0, %in[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c64][%c0,%c0,%c0]) { metadata = @objFifo_in0, id = 0 : i64 } : memref<64xi32>
+      aiex.npu.sync { column = 0 : i32, row = 0 : i32, direction = 0 : i32, channel = 0 : i32, column_num = 1 : i32, row_num = 1 : i32 }
+      return
+    }
+  }
+}
diff --git a/test/npu-xrt/add_one_two/aie2.mlir b/test/npu-xrt/add_one_two/aie2.mlir
new file mode 100644
index 0000000000..75f1f9beb3
--- /dev/null
+++ b/test/npu-xrt/add_one_two/aie2.mlir
@@ -0,0 +1,53 @@
+//===- aie2.mlir -----------------------------------------------*- MLIR -*-===//
+//
+// Copyright (C) 2023, Advanced Micro Devices, Inc.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+module {
+  aie.device(npu1_1col) {
+    %t00 = aie.tile(0, 0)
+    %t01 = aie.tile(0, 1)
+    %t02 = aie.tile(0, 2)
+    // Input: 16xi32 objects (0,0)->(0,1), linked onward as 8xi32 objects to (0,2).
+    aie.objectfifo @objFifo_in0(%t00, {%t01}, 2 : i32) : !aie.objectfifo<memref<16xi32>>
+    aie.objectfifo @objFifo_in1(%t01, {%t02}, 2 : i32) : !aie.objectfifo<memref<8xi32>>
+    aie.objectfifo.link [@objFifo_in0] -> [@objFifo_in1] ()
+    // Output: 8xi32 objects (0,2)->(0,1), linked onward as 16xi32 objects to (0,0).
+    aie.objectfifo @objFifo_out1(%t02, {%t01}, 2 : i32) : !aie.objectfifo<memref<8xi32>>
+    aie.objectfifo @objFifo_out0(%t01, {%t00}, 2 : i32) : !aie.objectfifo<memref<16xi32>>
+    aie.objectfifo.link [@objFifo_out1] -> [@objFifo_out0] ()
+    // Core on tile (0,2): 8 outer steps, each adding 2 to all 8 elements of an object.
+    aie.core(%t02) {
+      %c8 = arith.constant 8 : index
+      %c0 = arith.constant 0 : index
+      %c1 = arith.constant 1 : index
+      %c2_32 = arith.constant 2 : i32
+      // Each step acquires one input and one output object and releases both.
+      scf.for %steps = %c0 to %c8 step %c1 {
+        %subview0 = aie.objectfifo.acquire @objFifo_in1(Consume, 1) : !aie.objectfifosubview<memref<8xi32>>
+        %elem0 = aie.objectfifo.subview.access %subview0[0] : !aie.objectfifosubview<memref<8xi32>> -> memref<8xi32>
+        %subview1 = aie.objectfifo.acquire @objFifo_out1(Produce, 1) : !aie.objectfifosubview<memref<8xi32>>
+        %elem1 = aie.objectfifo.subview.access %subview1[0] : !aie.objectfifosubview<memref<8xi32>> -> memref<8xi32>
+        scf.for %arg3 = %c0 to %c8 step %c1 {
+            %0 = memref.load %elem0[%arg3] : memref<8xi32>
+            %1 = arith.addi %0, %c2_32 : i32
+            memref.store %1, %elem1[%arg3] : memref<8xi32>
+        }
+        aie.objectfifo.release @objFifo_in1(Consume, 1)
+        aie.objectfifo.release @objFifo_out1(Produce, 1)
+      }
+      aie.end
+    }
+    func.func @sequence(%in : memref<64xi32>, %buf : memref<32xi32>, %out : memref<64xi32>) {
+      %c0 = arith.constant 0 : i64
+      %c1 = arith.constant 1 : i64
+      %c64 = arith.constant 64 : i64
+      aiex.npu.dma_memcpy_nd (0, 0, %out[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c64][%c0,%c0,%c0]) { metadata = @objFifo_out0, id = 1 : i64 } : memref<64xi32>
+      aiex.npu.dma_memcpy_nd (0, 0, %in[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c64][%c0,%c0,%c0]) { metadata = @objFifo_in0, id = 0 : i64 } : memref<64xi32>
+      aiex.npu.sync { column = 0 : i32, row = 0 : i32, direction = 0 : i32, channel = 0 : i32, column_num = 1 : i32, row_num = 1 : i32 }
+      return
+    }
+  }
+}
diff --git a/test/npu-xrt/add_one_two/run.lit b/test/npu-xrt/add_one_two/run.lit
new file mode 100644
index 0000000000..60fb9dbf43
--- /dev/null
+++ b/test/npu-xrt/add_one_two/run.lit
@@ -0,0 +1,11 @@
+// (c) Copyright 2023 Advanced Micro Devices, Inc.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// REQUIRES: ryzen_ai
+// Builds add_one.xclbin from aie1.mlir, then add_two.xclbin from aie2.mlir with add_one.xclbin as input.
+// RUN: %python aiecc.py --xclbin-kernel-name=ADDONE --xclbin-kernel-id=0x901 --xclbin-instance-name=ADDONEINST --no-aiesim --aie-generate-cdo --aie-generate-npu --no-compile-host --xclbin-name=add_one.xclbin --npu-insts-name=insts.txt %S/aie1.mlir
+// RUN: %python aiecc.py --xclbin-kernel-name=ADDTWO --xclbin-kernel-id=0x902 --xclbin-instance-name=ADDTWOINST --no-aiesim --aie-generate-cdo --aie-generate-npu --no-compile-host --xclbin-input=add_one.xclbin --xclbin-name=add_two.xclbin --npu-insts-name=insts.txt %S/aie2.mlir
+// RUN: clang %S/test.cpp -o test.exe -std=c++11 -Wall %xrt_flags -lrt -lstdc++ -lboost_program_options -lboost_filesystem
+// RUN: %run_on_npu ./test.exe -x add_two.xclbin -i insts.txt | FileCheck %s
+// CHECK: PASS!
+
diff --git a/test/npu-xrt/add_one_two/test.cpp b/test/npu-xrt/add_one_two/test.cpp
new file mode 100644
index 0000000000..0f2a5c3c7e
--- /dev/null
+++ b/test/npu-xrt/add_one_two/test.cpp
@@ -0,0 +1,221 @@
+//===- test.cpp -------------------------------------------------*- C++ -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// Copyright (C) 2023, Advanced Micro Devices, Inc.
+//
+//===----------------------------------------------------------------------===//
+
+#include <boost/program_options.hpp>
+#include <cstdint>
+#include <fstream>
+#include <iostream>
+#include <sstream>
+#include <string>
+#include <vector>
+
+#include "xrt/xrt_bo.h"
+#include "xrt/xrt_device.h"
+#include "xrt/xrt_kernel.h"
+
+constexpr int IN_SIZE = 64;
+constexpr int OUT_SIZE = 64;
+
+namespace po = boost::program_options;
+
+// Throws std::runtime_error unless option `name` was supplied and its value
+// is a path that can be opened for reading.
+void check_arg_file_exists(po::variables_map &vm_in, std::string name) {
+  if (vm_in.count(name) == 0)
+    throw std::runtime_error("Error: no " + name + " file was provided\n");
+  const std::string path = vm_in[name].as<std::string>();
+  std::ifstream probe(path);
+  if (probe)
+    return;
+  throw std::runtime_error("The " + name + " file " + path +
+                           " does not exist.\n");
+}
+
+// Parses the leading token of every line of `instr_path` as a hexadecimal
+// 32-bit word; throws std::runtime_error as soon as a line fails to parse.
+std::vector<uint32_t> load_instr_sequence(std::string instr_path) {
+  std::vector<uint32_t> words;
+  std::ifstream input(instr_path);
+  for (std::string text; std::getline(input, text);) {
+    std::istringstream parser(text);
+    uint32_t word;
+    if ((parser >> std::hex >> word).fail())
+      throw std::runtime_error("Unable to parse instruction file\n");
+    words.push_back(word);
+  }
+  return words;
+}
+
+// Runs kernel ADDONE, feeds its output to kernel ADDTWO, and checks the result.
+int main(int argc, const char *argv[]) {
+
+  // Program arguments parsing
+  po::options_description desc("Allowed options");
+  desc.add_options()("help,h", "produce help message")(
+      "xclbin,x", po::value<std::string>()->required(),
+      "the input xclbin path")("verbosity,v",
+                               po::value<int>()->default_value(0),
+                               "the verbosity of the output")(
+      "instr,i", po::value<std::string>()->required(),
+      "path of file containing userspace instructions to be sent to the LX6");
+  po::variables_map vm;
+
+  try {
+    po::store(po::parse_command_line(argc, argv, desc), vm);
+    po::notify(vm);
+
+    if (vm.count("help")) {
+      std::cout << desc << "\n";
+      return 1;
+    }
+  } catch (const std::exception &ex) {
+    std::cerr << ex.what() << "\n\n";
+    std::cerr << "Usage:\n" << desc << "\n";
+    return 1;
+  }
+
+  check_arg_file_exists(vm, "xclbin");
+  check_arg_file_exists(vm, "instr");
+
+  std::vector<uint32_t> instr_v =
+      load_instr_sequence(vm["instr"].as<std::string>());
+
+  int verbosity = vm["verbosity"].as<int>();
+  if (verbosity >= 1)
+    std::cout << "Sequence instr count: " << instr_v.size() << "\n";
+
+  // Start the XRT test code
+  // Get a device handle
+  unsigned int device_index = 0;
+  auto device = xrt::device(device_index);
+
+  // Load the xclbin
+  if (verbosity >= 1)
+    std::cout << "Loading xclbin: " << vm["xclbin"].as<std::string>() << "\n";
+  auto xclbin = xrt::xclbin(vm["xclbin"].as<std::string>());
+
+  // Get the kernel names from the xclbin. A missing kernel raises an error
+  // here rather than dereferencing a past-the-end iterator.
+  auto xkernels = xclbin.get_kernels();
+  auto kernel_name = [&xkernels](const std::string &wanted) -> std::string {
+    for (auto &k : xkernels) {
+      auto name = k.get_name();
+      std::cout << "Name: " << name << std::endl;
+      if (name == wanted)
+        return name;
+    }
+    throw std::runtime_error("Kernel " + wanted + " not found in xclbin\n");
+  };
+  auto kernelName0 = kernel_name("ADDONE");
+  auto kernelName1 = kernel_name("ADDTWO");
+
+  if (verbosity >= 1)
+    std::cout << "Registering xclbin: " << vm["xclbin"].as<std::string>()
+              << "\n";
+
+  device.register_xclbin(xclbin);
+
+  // get a hardware context
+  if (verbosity >= 1)
+    std::cout << "Getting hardware context.\n";
+  xrt::hw_context context(device, xclbin.get_uuid());
+
+  // get a kernel handle
+  if (verbosity >= 1)
+    std::cout << "Getting handle to kernels: " << kernelName0 << " and "
+              << kernelName1 << "\n";
+
+  auto kernel0 = xrt::kernel(context, kernelName0);
+
+  auto bo0_instr = xrt::bo(device, instr_v.size() * sizeof(int),
+                           XCL_BO_FLAGS_CACHEABLE, kernel0.group_id(0));
+  auto bo0_inA = xrt::bo(device, IN_SIZE * sizeof(int32_t),
+                         XRT_BO_FLAGS_HOST_ONLY, kernel0.group_id(2));
+  auto bo0_inB = xrt::bo(device, IN_SIZE * sizeof(int32_t),
+                         XRT_BO_FLAGS_HOST_ONLY, kernel0.group_id(3));
+  auto bo0_out = xrt::bo(device, OUT_SIZE * sizeof(int32_t),
+                         XRT_BO_FLAGS_HOST_ONLY, kernel0.group_id(4));
+
+  auto kernel1 = xrt::kernel(context, kernelName1);
+
+  auto bo1_instr = xrt::bo(device, instr_v.size() * sizeof(int),
+                           XCL_BO_FLAGS_CACHEABLE, kernel1.group_id(0));
+  auto bo1_inA = xrt::bo(device, IN_SIZE * sizeof(int32_t),
+                         XRT_BO_FLAGS_HOST_ONLY, kernel1.group_id(2));
+  auto bo1_inB = xrt::bo(device, IN_SIZE * sizeof(int32_t),
+                         XRT_BO_FLAGS_HOST_ONLY, kernel1.group_id(3));
+  auto bo1_out = xrt::bo(device, OUT_SIZE * sizeof(int32_t),
+                         XRT_BO_FLAGS_HOST_ONLY, kernel1.group_id(4));
+
+  if (verbosity >= 1)
+    std::cout << "Writing data into buffer objects.\n";
+
+  uint32_t *bufInA = bo0_inA.map<uint32_t *>();
+  std::vector<uint32_t> srcVecA;
+  for (int i = 0; i < IN_SIZE; i++)
+    srcVecA.push_back(i + 1);
+  memcpy(bufInA, srcVecA.data(), (srcVecA.size() * sizeof(uint32_t)));
+
+  void *bufInstr = bo0_instr.map<void *>();
+  memcpy(bufInstr, instr_v.data(), instr_v.size() * sizeof(int));
+
+  bo0_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+  bo0_inA.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+
+  if (verbosity >= 1)
+    std::cout << "Running Kernel 0.\n";
+
+  auto run0 = kernel0(bo0_instr, instr_v.size(), bo0_inA, bo0_inB, bo0_out);
+  run0.wait();
+
+  bo0_out.sync(XCL_BO_SYNC_BO_FROM_DEVICE);
+  uint32_t *bufOut = bo0_out.map<uint32_t *>();
+
+  // same instructions as kernel0
+  bufInstr = bo1_instr.map<void *>();
+  memcpy(bufInstr, instr_v.data(), instr_v.size() * sizeof(int));
+  bo1_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+
+  // copy kernel0 output to kernel1 input
+  bufInA = bo1_inA.map<uint32_t *>();
+  memcpy(bufInA, bufOut, IN_SIZE * sizeof(uint32_t));
+  bo1_inA.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+
+  if (verbosity >= 1)
+    std::cout << "Running Kernel 1.\n";
+  auto run1 = kernel1(bo1_instr, instr_v.size(), bo1_inA, bo1_inB, bo1_out);
+  run1.wait();
+
+  bo1_out.sync(XCL_BO_SYNC_BO_FROM_DEVICE);
+  bufOut = bo1_out.map<uint32_t *>();
+
+  int errors = 0;
+
+  // Input element i holds i + 1; kernel 0 adds 1 and kernel 1 then adds 2.
+  for (int i = 0; i < OUT_SIZE; i++) {
+    uint32_t ref = (i + 1) + 1 + 2;
+    if (*(bufOut + i) != ref) {
+      std::cout << "Error in output " << *(bufOut + i) << " != " << ref
+                << std::endl;
+      errors++;
+    } else {
+      std::cout << "Correct output " << *(bufOut + i) << " == " << ref
+                << std::endl;
+    }
+  }
+
+  if (!errors) {
+    std::cout << "\nPASS!\n\n";
+    return 0;
+  } else {
+    std::cout << "\nfailed.\n\n";
+    return 1;
+  }
+}
diff --git a/tools/aie2xclbin/CMakeLists.txt b/tools/aie2xclbin/CMakeLists.txt
index b6056b18c9..cd8afac255 100644
--- a/tools/aie2xclbin/CMakeLists.txt
+++ b/tools/aie2xclbin/CMakeLists.txt
@@ -21,6 +21,12 @@ get_property(dialect_libs GLOBAL PROPERTY MLIR_DIALECT_LIBS)
 configure_file(configure.h.in configure.h)
 target_include_directories(aie2xclbin PRIVATE "${CMAKE_CURRENT_BINARY_DIR}")
 
+if(WIN32) # same platform split as the _WIN32 guard in XCLBinGen.cpp
+  set(UUID "rpcrt4") # Windows RPC runtime; provides UuidCreate/UuidToStringA
+else()
+  find_library(UUID uuid REQUIRED)
+endif()
+
 target_link_libraries(aie2xclbin
   ${dialect_libs}
   MLIRParser
@@ -37,7 +43,8 @@ target_link_libraries(aie2xclbin
   AIEX
   AIEXTransforms
   MLIRAIEVecDialect
-  MLIRXLLVMDialect)
+  MLIRXLLVMDialect
+  ${UUID})
 
 install(TARGETS aie2xclbin
   EXPORT AIE2XCLBIN
diff --git a/tools/aie2xclbin/XCLBinGen.cpp b/tools/aie2xclbin/XCLBinGen.cpp
index a45ea8bd07..a75e06662e 100644
--- a/tools/aie2xclbin/XCLBinGen.cpp
+++ b/tools/aie2xclbin/XCLBinGen.cpp
@@ -45,7 +45,13 @@
 #include <unordered_map>
 
 #ifdef _WIN32
+#include "windows.h"
+// For UUID stuff
+#include "rpcdce.h"
+
 #define setenv(name, var, ignore) _putenv_s(name, var)
+#else
+#include <uuid/uuid.h>
 #endif
 
 using namespace llvm;
@@ -126,6 +132,31 @@ void xilinx::findVitis(XCLBinGenConfig &TK) {
   }
 }
 
+// Returns a newly generated UUID as text ("" if Windows conversion fails).
+static std::string getUUIDString() {
+  std::string val;
+#ifdef _WIN32
+  // The RPC calls write through these pointers, so they need real storage.
+  UUID uuid = {};
+  RPC_CSTR uuidstring = nullptr;
+  if (UuidCreate(&uuid) != RPC_S_OK)
+    errs() << "Failed to create UUID\n";
+  if (UuidToStringA(&uuid, &uuidstring) != RPC_S_OK || !uuidstring) {
+    errs() << "Failed to convert UUID to string\n";
+    return val;
+  }
+  val = std::string(reinterpret_cast<char *>(uuidstring));
+  if (RpcStringFreeA(&uuidstring) != RPC_S_OK)
+    errs() << "Failed to free UUID string\n";
+#else
+  uuid_t binuuid;
+  uuid_generate_random(binuuid);
+  char uuid[37]; // 36 characters plus the terminating NUL
+  uuid_unparse_lower(binuuid, uuid);
+  val = std::string(uuid);
+#endif
+  return val;
+}
 static void addAIELoweringPasses(OpPassManager &pm) {
   pm.addPass(createLowerAffinePass());
   pm.addPass(AIE::createAIECanonicalizeDevicePass());
@@ -453,6 +484,7 @@ static LogicalResult generateXCLBin(MLIRContext *context, ModuleOp moduleOp,
     if (!aiePartitionJsonOut)
       return moduleOp.emitOpError(errorMessage);
 
+    std::string uuid_str = getUUIDString();
     std::string aie_partition_json_data = R"(
       {
         "aie_partition": {
@@ -468,7 +500,7 @@ static LogicalResult generateXCLBin(MLIRContext *context, ModuleOp moduleOp,
           },
           "PDIs": [
             {
-              "uuid": "00000000-0000-0000-0000-000000008025",
+              "uuid": ")" + uuid_str + R"(",
               "file_name": "./design.pdi",
               "cdo_groups": [
                 {
diff --git a/utils/clone-llvm.sh b/utils/clone-llvm.sh
index bfd3dae037..498b36a19a 100755
--- a/utils/clone-llvm.sh
+++ b/utils/clone-llvm.sh
@@ -13,8 +13,8 @@
 ##===----------------------------------------------------------------------===##
 
 # The LLVM commit to use.
-LLVM_PROJECT_COMMIT=8a71284cb9463a90fab0d9e8edbeb5d879531e32
-DATETIME=2024051512
+LLVM_PROJECT_COMMIT=25b65be43df56c1b7bea3fe2596fb36c2788d7af
+DATETIME=2024052220
 WHEEL_VERSION=19.0.0.$DATETIME+${LLVM_PROJECT_COMMIT:0:8}
 
 ############################################################################################