diff --git a/aie_kernels/mm.cc b/aie_kernels/mm.cc
index 90d06b7657..eba9367388 100644
--- a/aie_kernels/mm.cc
+++ b/aie_kernels/mm.cc
@@ -22,6 +22,8 @@
 
 #include <aie_api/aie.hpp>
 
+#include "zero.cc"
+
 template <typename T_in, typename T_out, unsigned rowA, unsigned colA,
           unsigned colB, unsigned r, unsigned s, unsigned t>
 void matmul_vectorized(const T_in *__restrict pA, unsigned offsetA,
@@ -273,6 +275,12 @@ extern "C" {
       64, 64, 64>(a_in, offsetA, b_in, offsetB, c_out, offsetC);              \
   }
 
-combos(matmul_vectorized_c_func)
+#define zero_vectorized_c_func(ctype_in, mlir_type_in, ctype_out,             \
+                               mlir_type_out, r, s, t)                        \
+  void zero_##mlir_type_out(ctype_out *c_out, unsigned offsetC) {             \
+    zero_vectorized<ctype_out, 64, 64, 32>(c_out, offsetC);                   \
+  }
+
+combos(matmul_vectorized_c_func) combos(zero_vectorized_c_func)
 
 } // extern "C"
diff --git a/aie_kernels/zero.cc b/aie_kernels/zero.cc
new file mode 100644
index 0000000000..67223d1f30
--- /dev/null
+++ b/aie_kernels/zero.cc
@@ -0,0 +1,37 @@
+//===- zero.cc --------------------------------------------------*- C++ -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// Copyright (C) 2024, Advanced Micro Devices, Inc.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ZERO_CC
+#define ZERO_CC
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <type_traits>
+
+// Zero the M x N output tile that starts at pC + offsetC.
+// T: element type; M, N: tile dimensions; r: vector width used by
+// aie::store_v. Assumes the aie:: API (aie_api/aie.hpp) is already in
+// scope at the point of inclusion (mm.cc includes it first).
+template <typename T, int M, int N, int r>
+void zero_vectorized(T *__restrict pC, unsigned offsetC) {
+  const aie::vector<T, r> zeros = aie::zeros<T, r>();
+  T *__restrict pC1 = pC + offsetC;
+  const T *__restrict c_end = pC1 + M * N;
+  // <= so an exactly-full final vector is stored vectorized, not scalar.
+  for (; pC1 + r <= c_end; pC1 += r) {
+    aie::store_v(pC1, zeros);
+  }
+  // Do a scalar write for any remainder not divisible by vector instruction
+  // size r
+  for (; pC1 < c_end; pC1++) {
+    *pC1 = 0;
+  }
+}
+
+#endif