From 7781c4979c84152c3f86ef202c2bc61ff154ed88 Mon Sep 17 00:00:00 2001
From: Steven Toribio <34755817+turbotoribio@users.noreply.github.com>
Date: Wed, 12 Jul 2023 13:35:09 -0700
Subject: [PATCH] Delay Op (#2117)

`port c++ delay op to open source in tflm_signal`

- Port the delay op and its corresponding tests to the new open-source location for C++

BUG=[b/289296081](https://b.corp.google.com/issues/289296081)
---
 python/tflite_micro/python_ops_resolver.cc    |   1 +
 signal/micro/kernels/BUILD                    |  26 ++
 signal/micro/kernels/delay.cc                 | 154 ++++++++
 .../delay_flexbuffers_generated_data.cc       |  29 ++
 .../delay_flexbuffers_generated_data.h        |  25 ++
 signal/micro/kernels/delay_test.cc            | 341 ++++++++++++++++++
 tensorflow/lite/micro/kernels/Makefile.inc    |   5 +
 tensorflow/lite/micro/kernels/micro_ops.h     |   1 +
 .../lite/micro/micro_mutable_op_resolver.h    |   5 +
 tensorflow/lite/micro/tools/make/Makefile     |   1 +
 10 files changed, 588 insertions(+)
 create mode 100644 signal/micro/kernels/delay.cc
 create mode 100644 signal/micro/kernels/delay_flexbuffers_generated_data.cc
 create mode 100644 signal/micro/kernels/delay_flexbuffers_generated_data.h
 create mode 100644 signal/micro/kernels/delay_test.cc

diff --git a/python/tflite_micro/python_ops_resolver.cc b/python/tflite_micro/python_ops_resolver.cc
index ed4320ea241..6c63f9d76b2 100644
--- a/python/tflite_micro/python_ops_resolver.cc
+++ b/python/tflite_micro/python_ops_resolver.cc
@@ -39,6 +39,7 @@ PythonOpsResolver::PythonOpsResolver() {
   AddConv2D();
   AddCos();
   AddCumSum();
+  AddDelay();
   AddDepthToSpace();
   AddDepthwiseConv2D();
   AddDequantize();
diff --git a/signal/micro/kernels/BUILD b/signal/micro/kernels/BUILD
index aea77f61627..f3ec739f425 100644
--- a/signal/micro/kernels/BUILD
+++ b/signal/micro/kernels/BUILD
@@ -8,6 +8,7 @@ package(licenses = ["notice"])
 cc_library(
     name = "register_signal_ops",
     srcs = [
+        "delay.cc",
         "framer.cc",
         "overlap_add.cc",
         "rfft.cc",
@@ -141,3 +142,28 @@ cc_test(
         "//tensorflow/lite/micro/testing:micro_test",
     ],
 )
+
+cc_library(
+    name = "delay_flexbuffers_generated_data",
+    srcs = [
+        "delay_flexbuffers_generated_data.cc",
+    ],
+    hdrs = [
+        "delay_flexbuffers_generated_data.h",
+    ],
+)
+
+cc_test(
+    name = "delay_test",
+    srcs = [
+        "delay_test.cc",
+    ],
+    deps = [
+        ":delay_flexbuffers_generated_data",
+        "//tensorflow/lite/c:common",
+        "//tensorflow/lite/micro:op_resolvers",
+        "//tensorflow/lite/micro:test_helpers",
+        "//tensorflow/lite/micro/kernels:kernel_runner",
+        "//tensorflow/lite/micro/testing:micro_test",
+    ],
+)
diff --git a/signal/micro/kernels/delay.cc b/signal/micro/kernels/delay.cc
new file mode 100644
index 00000000000..155e198729d
--- /dev/null
+++ b/signal/micro/kernels/delay.cc
@@ -0,0 +1,154 @@
+/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <stdint.h>
+
+#include "signal/src/circular_buffer.h"
+#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/micro/flatbuffer_utils.h"
+#include "tensorflow/lite/micro/kernels/kernel_util.h"
+#include "tensorflow/lite/micro/memory_helpers.h"
+#include "tensorflow/lite/micro/micro_context.h"
+#include "tensorflow/lite/micro/micro_utils.h"
+
+namespace tflite {
+namespace {
+
+constexpr int kInputTensor = 0;   // Index of the op's only input.
+constexpr int kOutputTensor = 0;  // Index of the op's only output.
+
+// Indices into the vector held by the flexbuffer passed to Init().
+// The trailing comment on each index names the parameter it selects.
+// Elements in the vector are ordered alphabetically by parameter name.
+constexpr int kDelayLengthIndex = 0;  // 'delay_length'
+
+struct TFLMSignalFrontendDelayParams {  // Per-node state (Init + Prepare).
+  int32_t frame_size;    // Innermost input dimension, set in Prepare.
+  int32_t delay_length;  // Read from the init flexbuffer in Init.
+  int32_t outer_dims;    // Input flat size / frame_size, set in Prepare.
+
+  int8_t** state_buffers;  // One per outer dim: circular buffer storage.
+  tflm_signal::CircularBuffer** circular_buffers;  // One per outer dim.
+};
+
+// Allocates the op state and loads 'delay_length' from the init flexbuffer.
+void* Init(TfLiteContext* context, const char* buffer, size_t length) {
+  void* raw_state = context->AllocatePersistentBuffer(
+      context, sizeof(TFLMSignalFrontendDelayParams));
+  if (raw_state == nullptr) {
+    return nullptr;  // Allocation failed.
+  }
+  auto* state = static_cast<TFLMSignalFrontendDelayParams*>(raw_state);
+  const auto* options = reinterpret_cast<const uint8_t*>(buffer);
+  FlexbufferWrapper wrapper(options, length);
+  state->delay_length = wrapper.ElementAsInt32(kDelayLengthIndex);
+  return state;
+}
+
+// Checks the int16 input/output, derives the frame layout from the input
+// shape, and allocates one zero-primed circular buffer per outer-dim row.
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  TF_LITE_ENSURE_EQ(context, NumInputs(node), 1);
+  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+  MicroContext* micro_context = GetMicroContext(context);
+  TfLiteTensor* input =
+      micro_context->AllocateTempInputTensor(node, kInputTensor);
+  TF_LITE_ENSURE(context, input != nullptr);
+  TfLiteTensor* output =
+      micro_context->AllocateTempOutputTensor(node, kOutputTensor);
+  TF_LITE_ENSURE(context, output != nullptr);
+  TF_LITE_ENSURE_TYPES_EQ(context, input->type, kTfLiteInt16);
+  TF_LITE_ENSURE_TYPES_EQ(context, output->type, kTfLiteInt16);
+  auto* params = static_cast<TFLMSignalFrontendDelayParams*>(node->user_data);
+  TF_LITE_ENSURE(context, params != nullptr);
+
+  RuntimeShape input_shape = GetTensorShape(input);
+  int innermost_dim = input_shape.Dims(input_shape.DimensionsCount() - 1);
+  // A zero-sized innermost dimension would make the division below invalid.
+  TF_LITE_ENSURE(context, innermost_dim > 0);
+  params->outer_dims = input_shape.FlatSize() / innermost_dim;
+  params->frame_size = innermost_dim;
+  params->state_buffers =
+      static_cast<int8_t**>(context->AllocatePersistentBuffer(
+          context, params->outer_dims * sizeof(int8_t*)));
+  TF_LITE_ENSURE(context, params->state_buffers != nullptr);
+  params->circular_buffers = static_cast<tflm_signal::CircularBuffer**>(
+      context->AllocatePersistentBuffer(
+          context, params->outer_dims * sizeof(tflm_signal::CircularBuffer*)));
+  TF_LITE_ENSURE(context, params->circular_buffers != nullptr);
+
+  // Every row needs room for one incoming frame plus the configured delay.
+  const size_t capacity = params->frame_size + params->delay_length;
+  const size_t state_len = tflm_signal::CircularBufferGetNeededMemory(capacity);
+  for (int i = 0; i < params->outer_dims; i++) {
+    params->state_buffers[i] = static_cast<int8_t*>(
+        context->AllocatePersistentBuffer(context, state_len));
+    TF_LITE_ENSURE(context, params->state_buffers[i] != nullptr);
+    params->circular_buffers[i] = tflm_signal::CircularBufferInit(
+        capacity, params->state_buffers[i], state_len);
+    // Start with delay_length zeros queued ahead of the first real frame.
+    tflm_signal::CircularBufferWriteZeros(params->circular_buffers[i],
+                                          params->delay_length);
+  }
+  micro_context->DeallocateTempTfLiteTensor(input);
+  micro_context->DeallocateTempTfLiteTensor(output);
+  return kTfLiteOk;
+}
+
+// Runs every row's frame through its circular buffer: the new frame is
+// written, frame_size samples are read into the output, then discarded.
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  const auto* state =
+      reinterpret_cast<TFLMSignalFrontendDelayParams*>(node->user_data);
+  const int16_t* src = micro::GetTensorData<int16_t>(
+      micro::GetEvalInput(context, node, kInputTensor));
+  int16_t* dst = micro::GetTensorData<int16_t>(
+      micro::GetEvalOutput(context, node, kOutputTensor));
+  const int frame_size = state->frame_size;
+
+  // Row r occupies [r * frame_size, (r + 1) * frame_size) in both tensors.
+  for (int row = 0; row < state->outer_dims; ++row) {
+    tflm_signal::CircularBuffer* ring = state->circular_buffers[row];
+    const int offset = row * frame_size;
+    // Write the new frame, read a frame back out, then drop what was read.
+    tflm_signal::CircularBufferWrite(ring, src + offset, frame_size);
+    tflm_signal::CircularBufferGet(ring, frame_size, dst + offset);
+    tflm_signal::CircularBufferDiscard(ring, frame_size);
+  }
+
+  return kTfLiteOk;
+}
+
+void Reset(TfLiteContext* context, void* buffer) {
+  const auto* state = static_cast<TFLMSignalFrontendDelayParams*>(buffer);
+  for (int row = 0; row < state->outer_dims; ++row) {  // Reset, then re-prime.
+    tflm_signal::CircularBuffer* ring = state->circular_buffers[row];
+    tflm_signal::CircularBufferReset(ring);
+    tflm_signal::CircularBufferWriteZeros(ring, state->delay_length);
+  }
+}
+
+}  // namespace
+
+namespace tflm_signal {
+TFLMRegistration* Register_DELAY() {  // Shared, lazily built registration.
+  static TFLMRegistration delay_registration =
+      micro::RegisterOp(Init, Prepare, Eval, nullptr, Reset);
+  return &delay_registration;
+}
+}  // namespace tflm_signal
+
+}  // namespace tflite
diff --git a/signal/micro/kernels/delay_flexbuffers_generated_data.cc b/signal/micro/kernels/delay_flexbuffers_generated_data.cc
new file mode 100644
index 00000000000..756b7ac1738
--- /dev/null
+++ b/signal/micro/kernels/delay_flexbuffers_generated_data.cc
@@ -0,0 +1,29 @@
+/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// This file is generated. See:
+// tensorflow/lite/micro/kernels/test_data_generation/README.md
+
+#include "signal/micro/kernels/delay_flexbuffers_generated_data.h"
+
+const int g_gen_data_size_3_delay = 23;
+const unsigned char g_gen_data_3_delay[] = {
+    0x64, 0x65, 0x6c, 0x61, 0x79, 0x5f, 0x6c, 0x65, 0x6e, 0x67, 0x74, 0x68,
+    0x00, 0x01, 0x0e, 0x01, 0x01, 0x01, 0x03, 0x04, 0x02, 0x24, 0x01,
+};
+const int g_gen_data_size_5_delay = 23;
+const unsigned char g_gen_data_5_delay[] = {
+    0x64, 0x65, 0x6c, 0x61, 0x79, 0x5f, 0x6c, 0x65, 0x6e, 0x67, 0x74, 0x68,
+    0x00, 0x01, 0x0e, 0x01, 0x01, 0x01, 0x05, 0x04, 0x02, 0x24, 0x01,
+};
diff --git a/signal/micro/kernels/delay_flexbuffers_generated_data.h b/signal/micro/kernels/delay_flexbuffers_generated_data.h
new file mode 100644
index 00000000000..c79273ea1a0
--- /dev/null
+++ b/signal/micro/kernels/delay_flexbuffers_generated_data.h
@@ -0,0 +1,25 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef SIGNAL_MICRO_KERNELS_DELAY_FLEXBUFFERS_GENERATED_DATA_H_
+#define SIGNAL_MICRO_KERNELS_DELAY_FLEXBUFFERS_GENERATED_DATA_H_
+
+extern const int g_gen_data_size_3_delay;
+extern const unsigned char g_gen_data_3_delay[];
+
+extern const int g_gen_data_size_5_delay;
+extern const unsigned char g_gen_data_5_delay[];
+
+#endif  // SIGNAL_MICRO_KERNELS_DELAY_FLEXBUFFERS_GENERATED_DATA_H_
diff --git a/signal/micro/kernels/delay_test.cc b/signal/micro/kernels/delay_test.cc
new file mode 100644
index 00000000000..e6fdeb91231
--- /dev/null
+++ b/signal/micro/kernels/delay_test.cc
@@ -0,0 +1,341 @@
+/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <cstdint>
+#include <cstring>
+
+#include "signal/micro/kernels/delay_flexbuffers_generated_data.h"
+#include "tensorflow/lite/micro/kernels/kernel_runner.h"
+#include "tensorflow/lite/micro/test_helpers.h"
+#include "tensorflow/lite/micro/testing/micro_test.h"
+
+namespace tflite {
+namespace {
+
+constexpr int kInputsSize = 1;   // Number of input tensors given to the op.
+constexpr int kOutputsSize = 1;  // Number of output tensors given to the op.
+constexpr int kTensorsSize = kInputsSize + kOutputsSize;
+
+class DelayKernelRunner {  // Test fixture: DELAY op with 1 input, 1 output.
+ public:
+  DelayKernelRunner(int* input_dims_data, int16_t* input_data,
+                    int* output_dims_data, int16_t* output_data)
+      : tensors_{testing::CreateTensor(
+                     input_data, testing::IntArrayFromInts(input_dims_data)),
+                 testing::CreateTensor(
+                     output_data, testing::IntArrayFromInts(output_dims_data))},
+        inputs_array_{testing::IntArrayFromInts(inputs_array_data_)},
+        outputs_array_{testing::IntArrayFromInts(outputs_array_data_)},
+        kernel_runner_{*registration_, tensors_,       kTensorsSize,
+                       inputs_array_,  outputs_array_, nullptr} {}
+
+  micro::KernelRunner& kernel_runner() { return kernel_runner_; }
+
+ private:
+  int inputs_array_data_[kInputsSize + 1] = {kInputsSize, 0};
+  int outputs_array_data_[kOutputsSize + 1] = {kOutputsSize, 1};
+  TfLiteTensor tensors_[kTensorsSize] = {};  // [0] = input, [1] = output.
+  TfLiteIntArray* inputs_array_ = nullptr;
+  TfLiteIntArray* outputs_array_ = nullptr;
+
+  TFLMRegistration* registration_ = tflm_signal::Register_DELAY();
+  micro::KernelRunner kernel_runner_;  // Last: built from the members above.
+};
+
+void TestDelayInvoke(const int16_t* input_data, int16_t* output_data,
+                     const int16_t* golden, int input_size, int input_num,
+                     micro::KernelRunner* runner, int16_t* input_buffer) {
+  for (int frame = 0; frame < input_num; ++frame) {
+    const int base = frame * input_size;  // Start of this frame's samples.
+    memcpy(input_buffer, input_data + base, sizeof(int16_t) * input_size);
+    TF_LITE_MICRO_EXPECT_EQ(runner->Invoke(), kTfLiteOk);
+    for (int k = 0; k < input_size; ++k) {
+      TF_LITE_MICRO_EXPECT_EQ(golden[base + k], output_data[k]);
+    }
+  }
+}
+
+// Prepares the op once, then checks every invocation against `golden`.
+void TestDelay(int* input_dims_data, const int16_t* input_data,
+               int* output_dims_data, int16_t* output_data,
+               const int16_t* golden, int input_size, int input_num,
+               const unsigned char* flexbuffers_data,
+               const unsigned int flexbuffers_data_size,
+               int16_t* input_buffer) {
+  DelayKernelRunner delay_runner(input_dims_data, input_buffer,
+                                 output_dims_data, output_data);
+  micro::KernelRunner& runner = delay_runner.kernel_runner();
+
+  // TfLite takes the raw init bytes as char*, while the flexbuffer data is
+  // unsigned char*. Casting right before the call keeps the compiler from
+  // warning about the mismatch.
+  const char* init_data = reinterpret_cast<const char*>(flexbuffers_data);
+  TF_LITE_MICRO_EXPECT_EQ(
+      runner.InitAndPrepare(init_data, flexbuffers_data_size), kTfLiteOk);
+  TestDelayInvoke(input_data, output_data, golden, input_size, input_num,
+                  &runner, input_buffer);
+}
+// TestDelayReset() runs the given inputs through the op twice, with a Reset()
+// in between; its main purpose is to test Delay's Reset functionality. To
+// only check that the op's output matches a set of golden values for an
+// input, use TestDelay() instead.
+void TestDelayReset(int* input_dims_data, const int16_t* input_data,
+                    int* output_dims_data, int16_t* output_data,
+                    const int16_t* golden, int input_size, int input_num,
+                    const unsigned char* flexbuffers_data,
+                    const unsigned int flexbuffers_data_size,
+                    int16_t* input_buffer) {
+  DelayKernelRunner delay_runner(input_dims_data, input_buffer,
+                                 output_dims_data, output_data);
+  micro::KernelRunner& runner = delay_runner.kernel_runner();
+
+  // TfLite takes the raw init bytes as char*, while the flexbuffer data is
+  // unsigned char*. Casting right before the call keeps the compiler from
+  // warning about the mismatch.
+  const char* init_data = reinterpret_cast<const char*>(flexbuffers_data);
+  TF_LITE_MICRO_EXPECT_EQ(
+      runner.InitAndPrepare(init_data, flexbuffers_data_size), kTfLiteOk);
+  TestDelayInvoke(input_data, output_data, golden, input_size, input_num,
+                  &runner, input_buffer);
+  // The second pass is checked against the same golden values as the first.
+  runner.Reset();
+  TestDelayInvoke(input_data, output_data, golden, input_size, input_num,
+                  &runner, input_buffer);
+}
+
+}  // namespace
+}  // namespace tflite
+
+TF_LITE_MICRO_TESTS_BEGIN
+
+TF_LITE_MICRO_TEST(DelayTestSingleDimDelayLessThanFrameSize) {
+  const int kInputSize = 8;  // Samples per invocation.
+  const int kInputNum = 2;   // Number of invocations.
+  int input_shape[] = {1, kInputSize};
+  int output_shape[] = {1, kInputSize};
+  // Staging buffer used as the op's input tensor data.
+  int16_t input_buffer[kInputSize];
+  // All input frames, back to back; one frame is copied in per invocation.
+  const int16_t input[kInputNum * kInputSize] = {
+      0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8,
+      0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+  };
+  int16_t output[kInputNum * kInputSize] = {0};
+  const int16_t golden[kInputNum * kInputSize] = {0x0, 0x0, 0x0, 0x1, 0x2, 0x3,
+                                                  0x4, 0x5, 0x6, 0x7, 0x8, 0x0,
+                                                  0x0, 0x0, 0x0, 0x0};
+  tflite::TestDelay(input_shape, input, output_shape, output, golden,
+                    kInputSize, kInputNum, g_gen_data_3_delay,
+                    g_gen_data_size_3_delay, input_buffer);
+}
+
+TF_LITE_MICRO_TEST(DelayTestSingleDimDelayGreaterThanFrameSize) {
+  const int kInputSize = 3;  // Samples per invocation.
+  const int kInputNum = 3;   // Number of invocations.
+  int input_shape[] = {1, kInputSize};
+  int output_shape[] = {1, kInputSize};
+  // Staging buffer used as the op's input tensor data.
+  int16_t input_buffer[kInputSize];
+  // All input frames, back to back; one frame is copied in per invocation.
+  const int16_t input[kInputNum * kInputSize] = {
+      0x1, 0x2, 0x3, 0x4, 0x0, 0x0, 0x0, 0x0, 0x0,
+  };
+  int16_t output[kInputNum * kInputSize] = {0};
+  const int16_t golden[kInputNum * kInputSize] = {
+      0x0, 0x0, 0x0, 0x0, 0x0, 0x1, 0x2, 0x3, 0x4,
+  };
+  tflite::TestDelay(input_shape, input, output_shape, output, golden,
+                    kInputSize, kInputNum, g_gen_data_5_delay,
+                    g_gen_data_size_5_delay, input_buffer);
+}
+
+TF_LITE_MICRO_TEST(DelayTestMultiDimDelayLessThanFrameSize) {
+  const int kInputSize = 16;  // Samples per invocation (4 rows of 4).
+  const int kInputNum = 2;    // Number of invocations.
+  int input_shape[] = {2, 4, 4};
+  int output_shape[] = {2, 4, 4};
+  // Staging buffer used as the op's input tensor data.
+  int16_t input_buffer[kInputSize];
+  // The op will be invoked 2 times (Input X, X=0,1).
+  // For each invocation, the input's shape is (4, 4), flattened here.
+  // Before each invocation, that invocation's input is copied to input_buffer.
+  const int16_t input[kInputNum * kInputSize] = {
+      0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xA, 0xB,
+      0xC, 0xD, 0xE, 0xF, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+      0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+  };
+  int16_t output[kInputNum * kInputSize] = {0};
+  // For each invocation, we expect the following output (Output X, X=0,1).
+  // Each time, the output's shape is (4, 4), flattened here one row at a time.
+  const int16_t golden[kInputNum * kInputSize] = {
+      // Output 0
+      0x0,  // Row 0
+      0x0,
+      0x0,
+      0x1,
+      0x0,  // Row 1
+      0x0,
+      0x0,
+      0x5,
+      0x0,  // Row 2
+      0x0,
+      0x0,
+      0x9,
+      0x0,  // Row 3
+      0x0,
+      0x0,
+      0xD,
+      // Output 1
+      0x2,  // Row 0
+      0x3,
+      0x4,
+      0x0,
+      0x6,  // Row 1
+      0x7,
+      0x8,
+      0x0,
+      0xA,  // Row 2
+      0xB,
+      0xC,
+      0x0,
+      0xE,  // Row 3
+      0xF,
+      0x0,
+      0x0,
+  };
+  tflite::TestDelay(input_shape, input, output_shape, output, golden,
+                    kInputSize, kInputNum, g_gen_data_3_delay,
+                    g_gen_data_size_3_delay, input_buffer);
+}
+
+TF_LITE_MICRO_TEST(DelayTestMultiDimDelayGreaterThanFrameSize) {
+  const int kInputSize = 16;  // Samples per invocation (4 rows of 4).
+  const int kInputNum = 3;    // Number of invocations.
+  int input_shape[] = {2, 4, 4};
+  int output_shape[] = {2, 4, 4};
+  // Staging buffer used as the op's input tensor data.
+  int16_t input_buffer[kInputSize];
+  // The op will be invoked 3 times (Input X, X=0,1,2).
+  // For each invocation, the input's shape is (4, 4), flattened here.
+  // Before each invocation, that invocation's input is copied to input_buffer.
+  const int16_t input[kInputNum * kInputSize] = {
+      0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xA, 0xB, 0xC,
+      0xD, 0xE, 0xF, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+      0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+      0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+  };
+  int16_t output[kInputNum * kInputSize] = {0};
+  // For each invocation, we expect the following output (Output X, X=0,1,2).
+  // Each time, the output's shape is (4, 4), flattened here one row at a time.
+  const int16_t golden[kInputNum * kInputSize] = {
+      // Output 0
+      0x0,  // Row 0
+      0x0,
+      0x0,
+      0x0,
+      0x0,  // Row 1
+      0x0,
+      0x0,
+      0x0,
+      0x0,  // Row 2
+      0x0,
+      0x0,
+      0x0,
+      0x0,  // Row 3
+      0x0,
+      0x0,
+      0x0,
+      // Output 1
+      0x0,  // Row 0
+      0x1,
+      0x2,
+      0x3,
+      0x0,  // Row 1
+      0x5,
+      0x6,
+      0x7,
+      0x0,  // Row 2
+      0x9,
+      0xA,
+      0xB,
+      0x0,  // Row 3
+      0xD,
+      0xE,
+      0xF,
+      // Output 2
+      0x4,  // Row 0
+      0x0,
+      0x0,
+      0x0,
+      0x8,  // Row 1
+      0x0,
+      0x0,
+      0x0,
+      0xC,  // Row 2
+      0x0,
+      0x0,
+      0x0,
+      0x0,  // Row 3
+      0x0,
+      0x0,
+      0x0,
+  };
+  tflite::TestDelay(input_shape, input, output_shape, output, golden,
+                    kInputSize, kInputNum, g_gen_data_5_delay,
+                    g_gen_data_size_5_delay, input_buffer);
+}
+
+TF_LITE_MICRO_TEST(DelayTestResetSingleDimDelayLessThanFrameSize) {
+  const int kInputSize = 8;  // Samples per invocation.
+  const int kInputNum = 2;   // Invocations per pass (before/after Reset).
+  int input_shape[] = {1, kInputSize};
+  int output_shape[] = {1, kInputSize};
+  // Staging buffer used as the op's input tensor data.
+  int16_t input_buffer[kInputSize];
+  // All input frames, back to back; one frame is copied in per invocation.
+  const int16_t input[kInputNum * kInputSize] = {
+      0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8,
+      0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+  };
+  int16_t output[kInputNum * kInputSize] = {0};
+  const int16_t golden[kInputNum * kInputSize] = {0x0, 0x0, 0x0, 0x1, 0x2, 0x3,
+                                                  0x4, 0x5, 0x6, 0x7, 0x8, 0x0,
+                                                  0x0, 0x0, 0x0, 0x0};
+  tflite::TestDelayReset(input_shape, input, output_shape, output, golden,
+                         kInputSize, kInputNum, g_gen_data_3_delay,
+                         g_gen_data_size_3_delay, input_buffer);
+}
+
+TF_LITE_MICRO_TEST(DelayTestResetSingleDimDelayGreaterThanFrameSize) {
+  const int kInputSize = 3;  // Samples per invocation.
+  const int kInputNum = 3;   // Invocations per pass (before/after Reset).
+  int input_shape[] = {1, kInputSize};
+  int output_shape[] = {1, kInputSize};
+  // Staging buffer used as the op's input tensor data.
+  int16_t input_buffer[kInputSize];
+  // All input frames, back to back; one frame is copied in per invocation.
+  const int16_t input[kInputNum * kInputSize] = {
+      0x1, 0x2, 0x3, 0x4, 0x0, 0x0, 0x0, 0x0, 0x0,
+  };
+  int16_t output[kInputNum * kInputSize] = {0};
+  const int16_t golden[kInputNum * kInputSize] = {
+      0x0, 0x0, 0x0, 0x0, 0x0, 0x1, 0x2, 0x3, 0x4,
+  };
+  tflite::TestDelayReset(input_shape, input, output_shape, output, golden,
+                         kInputSize, kInputNum, g_gen_data_5_delay,
+                         g_gen_data_size_5_delay, input_buffer);
+}
+
+TF_LITE_MICRO_TESTS_END
diff --git a/tensorflow/lite/micro/kernels/Makefile.inc b/tensorflow/lite/micro/kernels/Makefile.inc
index 96fb8291870..5f449b2a97c 100644
--- a/tensorflow/lite/micro/kernels/Makefile.inc
+++ b/tensorflow/lite/micro/kernels/Makefile.inc
@@ -48,6 +48,11 @@ $(eval $(call microlite_test,unidirectional_sequence_lstm_test,\
   $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/testdata/lstm_test_data.cc,\
   $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/testdata/lstm_test_data.h))
 
+$(eval $(call microlite_test,kernel_signal_delay_test,\
+  $(TENSORFLOW_ROOT)signal/micro/kernels/delay_test.cc \
+  $(TENSORFLOW_ROOT)signal/micro/kernels/delay_flexbuffers_generated_data.cc, \
+  $(TENSORFLOW_ROOT)signal/micro/kernels/delay_flexbuffers_generated_data.h))
+
 $(eval $(call microlite_test,kernel_signal_fft_test,\
   $(TENSORFLOW_ROOT)signal/micro/kernels/fft_test.cc \
   $(TENSORFLOW_ROOT)signal/micro/kernels/fft_flexbuffers_generated_data.cc \
diff --git a/tensorflow/lite/micro/kernels/micro_ops.h b/tensorflow/lite/micro/kernels/micro_ops.h
index bfc21611a9d..15a1146d6e3 100644
--- a/tensorflow/lite/micro/kernels/micro_ops.h
+++ b/tensorflow/lite/micro/kernels/micro_ops.h
@@ -134,6 +134,7 @@ TFLMRegistration Register_ZEROS_LIKE();
 
 // TODO(b/160234179): Change custom OPs to also return by value.
 namespace tflm_signal {
+TFLMRegistration* Register_DELAY();
 TFLMRegistration* Register_FRAMER();
 TFLMRegistration* Register_OVERLAP_ADD();
 TFLMRegistration* Register_WINDOW();
diff --git a/tensorflow/lite/micro/micro_mutable_op_resolver.h b/tensorflow/lite/micro/micro_mutable_op_resolver.h
index c472b55d6c5..76f04c39a63 100644
--- a/tensorflow/lite/micro/micro_mutable_op_resolver.h
+++ b/tensorflow/lite/micro/micro_mutable_op_resolver.h
@@ -194,6 +194,11 @@ class MicroMutableOpResolver : public MicroOpResolver {
                       ParseCumsum);
   }
 
+  TfLiteStatus AddDelay() {  // Registers the tflm_signal delay custom op.
+    // TODO(b/286250473): change back name to "Delay" and remove namespace
+    return AddCustom("SignalDelay", tflite::tflm_signal::Register_DELAY());
+  }
+
   TfLiteStatus AddDepthToSpace() {
     return AddBuiltin(BuiltinOperator_DEPTH_TO_SPACE,
                       tflite::Register_DEPTH_TO_SPACE(), ParseDepthToSpace);
diff --git a/tensorflow/lite/micro/tools/make/Makefile b/tensorflow/lite/micro/tools/make/Makefile
index 2cb056d8f23..1da2b85c81c 100644
--- a/tensorflow/lite/micro/tools/make/Makefile
+++ b/tensorflow/lite/micro/tools/make/Makefile
@@ -312,6 +312,7 @@ $(TENSORFLOW_ROOT)tensorflow/lite/micro/memory_planner/linear_memory_planner_tes
 $(TENSORFLOW_ROOT)tensorflow/lite/micro/memory_planner/non_persistent_buffer_planner_shim_test.cc
 
 MICROLITE_CC_KERNEL_SRCS := \
+$(TENSORFLOW_ROOT)signal/micro/kernels/delay.cc \
 $(TENSORFLOW_ROOT)signal/micro/kernels/framer.cc \
 $(TENSORFLOW_ROOT)signal/micro/kernels/rfft.cc \
 $(TENSORFLOW_ROOT)signal/micro/kernels/overlap_add.cc \