commaai · mitchellgoffpc · Oct 18, 2024 · Oct 18, 2024 · Oct 18, 2024 · Oct 18, 2024
diff --git a/release/release_files.py b/release/release_files.py
@@ -55,7 +55,7 @@
   "tools/joystick/",
   "tools/longitudinal_maneuvers/",
 
-  "tinygrad_repo/openpilot/compile2.py",
+  "tinygrad_repo/examples/openpilot/compile3.py",
   "tinygrad_repo/extra/onnx.py",
   "tinygrad_repo/extra/onnx_ops.py",
   "tinygrad_repo/extra/thneed.py",

diff --git a/selfdrive/modeld/SConscript b/selfdrive/modeld/SConscript
@@ -13,15 +13,6 @@ common_src = [
   "transforms/transform.cc",
 ]
 
-thneed_src_common = [
-  "thneed/thneed_common.cc",
-  "thneed/serialize.cc",
-]
-
-thneed_src_qcom = thneed_src_common + ["thneed/thneed_qcom2.cc"]
-thneed_src_pc = thneed_src_common + ["thneed/thneed_pc.cc"]
-thneed_src = thneed_src_qcom if arch == "larch64" else thneed_src_pc
-
 # SNPE except on Mac and ARM Linux
 snpe_lib = []
 if arch != "Darwin" and arch != "aarch64":
@@ -59,20 +50,18 @@ fn = File("models/supercombo").abspath
 cmd = f'python3 {Dir("#selfdrive/modeld").abspath}/get_model_metadata.py {fn}.onnx'
 lenv.Command(fn + "_metadata.pkl", [fn + ".onnx"] + tinygrad_files, cmd)
 
-# Build thneed model
-if arch == "larch64" or GetOption('pc_thneed'):
-  tinygrad_opts = []
-  if not GetOption('pc_thneed'):
-    # use FLOAT16 on device for speed + don't cache the CL kernels for space
-    tinygrad_opts += ["FLOAT16=1", "PYOPENCL_NO_CACHE=1"]
-  cmd = f"cd {Dir('#').abspath}/tinygrad_repo && " + ' '.join(tinygrad_opts) + f" python3 openpilot/compile2.py {fn}.onnx {fn}.thneed"
-
-  lenv.Command(fn + ".thneed", [fn + ".onnx"] + tinygrad_files, cmd)
+# Compile tinygrad model
+# TODO: this is all super hacky (PYTHONPATH and the device flag are passed as env prefixes on the compile command)
+pythonpath_string = 'PYTHONPATH="${PYTHONPATH}:' + env.Dir("#tinygrad_repo").abspath + '"'
+if arch == 'larch64':
+  device_string = 'QCOM=1'
+elif arch == 'Darwin' or arch == 'aarch64':
+  device_string = 'CLANG=1 IMAGE=0'
+else:
+  device_string = 'GPU=1'
 
-  fn_dm = File("models/dmonitoring_model").abspath
-  cmd = f"cd {Dir('#').abspath}/tinygrad_repo && " + ' '.join(tinygrad_opts) + f" python3 openpilot/compile2.py {fn_dm}.onnx {fn_dm}.thneed"
-  lenv.Command(fn_dm + ".thneed", [fn_dm + ".onnx"] + tinygrad_files, cmd)
+for model_name in ['supercombo', 'dmonitoring_model']:
+  fn = File(f"models/{model_name}").abspath
+  cmd = f'{pythonpath_string} {device_string} python3 {Dir("#tinygrad_repo").abspath}/examples/openpilot/compile3.py {fn}.onnx {fn}_tinygrad.pkl'
+  lenv.Command(fn + "_tinygrad.pkl", [fn + ".onnx"] + tinygrad_files, cmd)
 
-  thneed_lib = env.SharedLibrary('thneed', thneed_src, LIBS=[gpucommon, common, 'OpenCL', 'dl'])
-  thneedmodel_lib = env.Library('thneedmodel', ['runners/thneedmodel.cc'])
-  lenvCython.Program('runners/thneedmodel_pyx.so', 'runners/thneedmodel_pyx.pyx', LIBS=envCython["LIBS"]+[thneedmodel_lib, thneed_lib, gpucommon, common, 'dl', 'OpenCL'])
diff --git a/selfdrive/modeld/dmonitoringmodeld b/selfdrive/modeld/dmonitoringmodeld
@@ -1,10 +1,4 @@
 #!/usr/bin/env bash
 
 DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" >/dev/null && pwd)"
-cd "$DIR/../../"
-
-if [ -f "$DIR/libthneed.so" ]; then
-  export LD_PRELOAD="$DIR/libthneed.so"
-fi
-
 exec "$DIR/dmonitoringmodeld.py" "$@"
diff --git a/selfdrive/modeld/dmonitoringmodeld.py b/selfdrive/modeld/dmonitoringmodeld.py
@@ -1,8 +1,16 @@
 #!/usr/bin/env python3
 import os
+from openpilot.system.hardware import TICI
+# TODO: this is a hack (the backend env var is set here, before the tinygrad imports below)
+if TICI:
+  GPU_BACKEND = 'QCOM'
+else:
+  GPU_BACKEND = 'GPU'
+os.environ[GPU_BACKEND] = '1'
 import gc
 import math
 import time
+import pickle
 import ctypes
 import numpy as np
 from pathlib import Path
@@ -14,9 +22,11 @@
 from openpilot.common.swaglog import cloudlog
 from openpilot.common.params import Params
 from openpilot.common.realtime import set_realtime_priority
-from openpilot.selfdrive.modeld.runners import ModelRunner, Runtime
-from openpilot.selfdrive.modeld.models.commonmodel_pyx import CLContext
+from openpilot.selfdrive.modeld.models.commonmodel_pyx import CLContext, cl_from_visionbuf
 from openpilot.selfdrive.modeld.parse_model_outputs import sigmoid
+from tinygrad.tensor import Tensor
+from tinygrad.helpers import to_mv, mv_address
+from tinygrad.dtype import dtypes
 
 CALIB_LEN = 3
 MODEL_WIDTH = 1440
@@ -26,9 +36,7 @@
 
 PROCESS_NAME = "selfdrive.modeld.dmonitoringmodeld"
 SEND_RAW_PRED = os.getenv('SEND_RAW_PRED')
-MODEL_PATHS = {
-  ModelRunner.THNEED: Path(__file__).parent / 'models/dmonitoring_model.thneed',
-  ModelRunner.ONNX: Path(__file__).parent / 'models/dmonitoring_model.onnx'}
+MODEL_PKL_PATH = Path(__file__).parent / 'models/dmonitoring_model_tinygrad.pkl'
 
 class DriverStateResult(ctypes.Structure):
   _fields_ = [
@@ -59,33 +67,36 @@ class DMonitoringModelResult(ctypes.Structure):
 class ModelState:
   inputs: dict[str, np.ndarray]
   output: np.ndarray
-  model: ModelRunner
 
   def __init__(self, cl_ctx):
     assert ctypes.sizeof(DMonitoringModelResult) == OUTPUT_SIZE * ctypes.sizeof(ctypes.c_float)
-    self.output = np.zeros(OUTPUT_SIZE, dtype=np.float32)
-    self.inputs = {
-      'input_img': np.zeros(MODEL_HEIGHT * MODEL_WIDTH, dtype=np.uint8),
-      'calib': np.zeros(CALIB_LEN, dtype=np.float32)}
+    self.numpy_inputs = {'calib': np.zeros((1, CALIB_LEN), dtype=np.float32)}
+    # TODO: this only works on some backends, verified on CLANG
+    self.tensor_inputs = {k: Tensor.from_blob(mv_address(v), v.shape, dtype=dtypes.float) for k, v in self.numpy_inputs.items()}
+    self.tensor_inputs['input_img'] = None
 
-    self.model = ModelRunner(MODEL_PATHS, self.output, Runtime.GPU, False, cl_ctx)
-    self.model.addInput("input_img", None)
-    self.model.addInput("calib", self.inputs['calib'])
 
-  def run(self, buf:VisionBuf, calib:np.ndarray) -> tuple[np.ndarray, float]:
-    self.inputs['calib'][:] = calib
+    with open(MODEL_PKL_PATH, "rb") as f:
+      self.model_run = pickle.load(f)
 
-    v_offset = buf.height - MODEL_HEIGHT
-    h_offset = (buf.width - MODEL_WIDTH) // 2
-    buf_data = buf.data.reshape(-1, buf.stride)
-    input_data = self.inputs['input_img'].reshape(MODEL_HEIGHT, MODEL_WIDTH)
-    input_data[:] = buf_data[v_offset:v_offset+MODEL_HEIGHT, h_offset:h_offset+MODEL_WIDTH]
+  def run(self, buf:VisionBuf, calib:np.ndarray) -> tuple[np.ndarray, float]:
+    self.numpy_inputs['calib'][0,:] = calib
 
-    self.model.setInputBuffer("input_img", self.inputs['input_img'].view(np.float32))
     t1 = time.perf_counter()
-    self.model.execute()
+    if TICI:
+      if self.tensor_inputs['input_img'] is None:
+        input_img_cl = cl_from_visionbuf(buf)
+        cl_buf_desc_ptr = to_mv(input_img_cl.mem_address, 8).cast('Q')[0]
+        rawbuf_ptr = to_mv(cl_buf_desc_ptr, 0x100).cast('Q')[20] # offset 0xA0 is a raw gpu pointer.
+        self.tensor_inputs['input_img'] = Tensor.from_blob(rawbuf_ptr, (1, buf.height * 3 // 2, buf.width), dtype=dtypes.uint8, device='QCOM')
+    else:
+      self.tensor_inputs = {k: Tensor(v) for k,v in self.numpy_inputs.items()}
+      self.tensor_inputs['input_img'] = Tensor(buf.data).reshape((1,buf.height * 3 // 2,buf.width))
+
+    output = self.model_run(**self.tensor_inputs)['outputs'].numpy().flatten()
+
     t2 = time.perf_counter()
-    return self.output, t2 - t1
+    return output, t2 - t1
 
 
 def fill_driver_state(msg, ds_result: DriverStateResult):
@@ -155,8 +166,6 @@ def main():
     t2 = time.perf_counter()
 
     pm.send("driverStateV2", get_driverstate_packet(model_output, vipc_client.frame_id, vipc_client.timestamp_sof, t2 - t1, gpu_execution_time))
-    # print("dmonitoring process: %.2fms, from last %.2fms\n" % (t2 - t1, t1 - last))
-    # last = t1
 
 
 if __name__ == "__main__":

diff --git a/selfdrive/modeld/modeld.py b/selfdrive/modeld/modeld.py
@@ -1,5 +1,12 @@
 #!/usr/bin/env python3
 import os
+from openpilot.system.hardware import TICI
+# TODO: this is a hack (the backend env var is set here, before the tinygrad imports below)
+if TICI:
+  GPU_BACKEND = 'QCOM'
+else:
+  GPU_BACKEND = 'GPU'
+os.environ[GPU_BACKEND] = '1'
 import time
 import pickle
 import numpy as np
@@ -18,21 +25,25 @@
 from openpilot.common.transformations.model import get_warp_matrix
 from openpilot.system import sentry
 from openpilot.selfdrive.controls.lib.desire_helper import DesireHelper
-from openpilot.selfdrive.modeld.runners import ModelRunner, Runtime
 from openpilot.selfdrive.modeld.parse_model_outputs import Parser
 from openpilot.selfdrive.modeld.fill_model_msg import fill_model_msg, fill_pose_msg, PublishState
 from openpilot.selfdrive.modeld.constants import ModelConstants
 from openpilot.selfdrive.modeld.models.commonmodel_pyx import ModelFrame, CLContext
 
+from tinygrad.tensor import Tensor
+from tinygrad.dtype import dtypes
+from tinygrad.helpers import to_mv, mv_address
+Tensor.manual_seed(1337)
+Tensor.no_grad = True
+
 PROCESS_NAME = "selfdrive.modeld.modeld"
 SEND_RAW_PRED = os.getenv('SEND_RAW_PRED')
 
-MODEL_PATHS = {
-  ModelRunner.THNEED: Path(__file__).parent / 'models/supercombo.thneed',
-  ModelRunner.ONNX: Path(__file__).parent / 'models/supercombo.onnx'}
-
+MODEL_PATH = Path(__file__).parent / 'models/supercombo.onnx'
+MODEL_PKL_PATH = Path(__file__).parent / 'models/supercombo_tinygrad.pkl'
 METADATA_PATH = Path(__file__).parent / 'models/supercombo_metadata.pkl'
 
+IMG_INPUT_SHAPE = (1, 12, 128, 256)
 
 class FrameMeta:
   frame_id: int = 0
@@ -49,7 +60,6 @@ class ModelState:
   inputs: dict[str, np.ndarray]
   output: np.ndarray
   prev_desire: np.ndarray  # for tracking the rising edge of the pulse
-  model: ModelRunner
 
   def __init__(self, context: CLContext):
     self.frame = ModelFrame(context)
@@ -60,13 +70,15 @@ def __init__(self, context: CLContext):
     self.prev_desired_curv_20hz = np.zeros((ModelConstants.FULL_HISTORY_BUFFER_LEN + 1, ModelConstants.PREV_DESIRED_CURV_LEN), dtype=np.float32)
 
     # img buffers are managed in openCL transform code
-    self.inputs = {
-      'desire': np.zeros(ModelConstants.DESIRE_LEN * (ModelConstants.HISTORY_BUFFER_LEN+1), dtype=np.float32),
-      'traffic_convention': np.zeros(ModelConstants.TRAFFIC_CONVENTION_LEN, dtype=np.float32),
-      'lateral_control_params': np.zeros(ModelConstants.LATERAL_CONTROL_PARAMS_LEN, dtype=np.float32),
-      'prev_desired_curv': np.zeros(ModelConstants.PREV_DESIRED_CURV_LEN * (ModelConstants.HISTORY_BUFFER_LEN+1), dtype=np.float32),
-      'features_buffer': np.zeros(ModelConstants.HISTORY_BUFFER_LEN * ModelConstants.FEATURE_LEN, dtype=np.float32),
+    self.numpy_inputs = {
+      'desire': np.zeros((1, (ModelConstants.HISTORY_BUFFER_LEN+1), ModelConstants.DESIRE_LEN), dtype=np.float32),
+      'traffic_convention': np.zeros((1, ModelConstants.TRAFFIC_CONVENTION_LEN), dtype=np.float32),
+      'lateral_control_params': np.zeros((1, ModelConstants.LATERAL_CONTROL_PARAMS_LEN), dtype=np.float32),
+      'prev_desired_curv': np.zeros((1,(ModelConstants.HISTORY_BUFFER_LEN+1), ModelConstants.PREV_DESIRED_CURV_LEN), dtype=np.float32),
+      'features_buffer': np.zeros((1, ModelConstants.HISTORY_BUFFER_LEN,  ModelConstants.FEATURE_LEN), dtype=np.float32),
     }
+    # TODO: this only works on some backends, verified on CLANG
+    self.tensor_inputs = {k: Tensor.from_blob(mv_address(v), v.shape, dtype=dtypes.float) for k, v in self.numpy_inputs.items()} # type: ignore
 
     with open(METADATA_PATH, 'rb') as f:
       model_metadata = pickle.load(f)
@@ -76,11 +88,8 @@ def __init__(self, context: CLContext):
     self.output = np.zeros(net_output_size, dtype=np.float32)
     self.parser = Parser()
 
-    self.model = ModelRunner(MODEL_PATHS, self.output, Runtime.GPU, False, context)
-    self.model.addInput("input_imgs", None)
-    self.model.addInput("big_input_imgs", None)
-    for k,v in self.inputs.items():
-      self.model.addInput(k, v)
+    with open(MODEL_PKL_PATH, "rb") as f:
+      self.model_run = pickle.load(f)
 
   def slice_outputs(self, model_outputs: np.ndarray) -> dict[str, np.ndarray]:
     parsed_model_outputs = {k: model_outputs[np.newaxis, v] for k,v in self.output_slices.items()}
@@ -97,18 +106,31 @@ def run(self, buf: VisionBuf, wbuf: VisionBuf, transform: np.ndarray, transform_
 
     self.desire_20Hz[:-1] = self.desire_20Hz[1:]
     self.desire_20Hz[-1] = new_desire
-    self.inputs['desire'][:] = self.desire_20Hz.reshape((25,4,-1)).max(axis=1).flatten()
-
-    self.inputs['traffic_convention'][:] = inputs['traffic_convention']
-    self.inputs['lateral_control_params'][:] = inputs['lateral_control_params']
-
-    self.model.setInputBuffer("input_imgs", self.frame.prepare(buf, transform.flatten(), self.model.getCLBuffer("input_imgs")))
-    self.model.setInputBuffer("big_input_imgs", self.wide_frame.prepare(wbuf, transform_wide.flatten(), self.model.getCLBuffer("big_input_imgs")))
+    self.numpy_inputs['desire'][:] = self.desire_20Hz.reshape((1,25,4,-1)).max(axis=2)
+
+    self.numpy_inputs['traffic_convention'][:] = inputs['traffic_convention']
+    self.numpy_inputs['lateral_control_params'][:] = inputs['lateral_control_params']
+    input_imgs_cl = self.frame.prepare(buf, transform.flatten())
+    big_input_imgs_cl = self.wide_frame.prepare(wbuf, transform_wide.flatten())
+
+    if TICI:
+      # The image tensors are backed by OpenCL memory, so they only need to be initialized once
+      if 'input_imgs' not in self.tensor_inputs:
+        cl_buf_desc_ptr = to_mv(input_imgs_cl.mem_address, 8).cast('Q')[0]
+        big_cl_buf_desc_ptr = to_mv(big_input_imgs_cl.mem_address, 8).cast('Q')[0]
+        rawbuf_ptr = to_mv(cl_buf_desc_ptr, 0x100).cast('Q')[20] # offset 0xA0 is a raw gpu pointer.
+        big_rawbuf_ptr = to_mv(big_cl_buf_desc_ptr, 0x100).cast('Q')[20] # offset 0xA0 is a raw gpu pointer.
+        self.tensor_inputs['input_imgs'] = Tensor.from_blob(rawbuf_ptr, IMG_INPUT_SHAPE, dtype=dtypes.uint8, device='QCOM')
+        self.tensor_inputs['big_input_imgs'] = Tensor.from_blob(big_rawbuf_ptr, IMG_INPUT_SHAPE, dtype=dtypes.uint8, device='QCOM')
+    else:
+      self.tensor_inputs = {k: Tensor(v) for k,v in self.numpy_inputs.items()}
+      self.tensor_inputs['input_imgs'] = Tensor(self.frame.buffer_from_cl(input_imgs_cl)).reshape(IMG_INPUT_SHAPE)
+      self.tensor_inputs['big_input_imgs'] = Tensor(self.wide_frame.buffer_from_cl(big_input_imgs_cl)).reshape(IMG_INPUT_SHAPE)
 
     if prepare_only:
       return None
 
-    self.model.execute()
+    self.output = self.model_run(**self.tensor_inputs)['outputs'].numpy().flatten()
     outputs = self.parser.parse_outputs(self.slice_outputs(self.output))
 
     self.full_features_20Hz[:-1] = self.full_features_20Hz[1:]
@@ -118,9 +140,9 @@ def run(self, buf: VisionBuf, wbuf: VisionBuf, transform: np.ndarray, transform_
     self.prev_desired_curv_20hz[-1] = outputs['desired_curvature'][0, :]
 
     idxs = np.arange(-4,-100,-4)[::-1]
-    self.inputs['features_buffer'][:] = self.full_features_20Hz[idxs].flatten()
+    self.numpy_inputs['features_buffer'][:] = self.full_features_20Hz[idxs]
     # TODO model only uses last value now, once that changes we need to input strided action history buffer
-    self.inputs['prev_desired_curv'][-ModelConstants.PREV_DESIRED_CURV_LEN:] = 0. * self.prev_desired_curv_20hz[-4, :]
+    self.numpy_inputs['prev_desired_curv'][-ModelConstants.PREV_DESIRED_CURV_LEN:] = 0. * self.prev_desired_curv_20hz[-4, :]
     return outputs
 
 
@@ -189,7 +211,7 @@ def main(demo=False):
   cloudlog.info("modeld got CarParams: %s", CP.carName)
 
   # TODO this needs more thought, use .2s extra for now to estimate other delays
-  steer_delay = CP.steerActuatorDelay + .2
+  steer_delay =  .2
 
   DH = DesireHelper()
 

diff --git a/selfdrive/modeld/models/commonmodel.cc b/selfdrive/modeld/models/commonmodel.cc
@@ -8,6 +8,7 @@
 
 ModelFrame::ModelFrame(cl_device_id device_id, cl_context context) {
   input_frames = std::make_unique<uint8_t[]>(buf_size);
+  input_frames_cl = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_READ_WRITE, buf_size, NULL, &err));
 
   q = CL_CHECK_ERR(clCreateCommandQueue(context, device_id, 0, &err));
   y_cl = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_READ_WRITE, MODEL_WIDTH * MODEL_HEIGHT, NULL, &err));
@@ -22,7 +23,7 @@ ModelFrame::ModelFrame(cl_device_id device_id, cl_context context) {
   loadyuv_init(&loadyuv, context, device_id, MODEL_WIDTH, MODEL_HEIGHT);
 }
 
-uint8_t* ModelFrame::prepare(cl_mem yuv_cl, int frame_width, int frame_height, int frame_stride, int frame_uv_offset, const mat3 &projection, cl_mem *output) {
+cl_mem* ModelFrame::prepare(cl_mem yuv_cl, int frame_width, int frame_height, int frame_stride, int frame_uv_offset, const mat3 &projection) {
   transform_queue(&this->transform, q,
                 yuv_cl, frame_width, frame_height, frame_stride, frame_uv_offset,
                 y_cl, u_cl, v_cl, MODEL_WIDTH, MODEL_HEIGHT, projection);
@@ -31,19 +32,19 @@ uint8_t* ModelFrame::prepare(cl_mem yuv_cl, int frame_width, int frame_height, i
     CL_CHECK(clEnqueueCopyBuffer(q, img_buffer_20hz_cl, img_buffer_20hz_cl, (i+1)*frame_size_bytes, i*frame_size_bytes, frame_size_bytes, 0, nullptr, nullptr));
   }
   loadyuv_queue(&loadyuv, q, y_cl, u_cl, v_cl, last_img_cl);
-  if (output == NULL) {
-    CL_CHECK(clEnqueueReadBuffer(q, img_buffer_20hz_cl, CL_TRUE, 0, frame_size_bytes, &input_frames[0], 0, nullptr, nullptr));
-    CL_CHECK(clEnqueueReadBuffer(q, last_img_cl, CL_TRUE, 0, frame_size_bytes, &input_frames[MODEL_FRAME_SIZE], 0, nullptr, nullptr));
-    clFinish(q);
-    return &input_frames[0];
-  } else {
-    copy_queue(&loadyuv, q, img_buffer_20hz_cl, *output, 0, 0, frame_size_bytes);
-    copy_queue(&loadyuv, q, last_img_cl, *output, 0, frame_size_bytes, frame_size_bytes);
-
-    // NOTE: Since thneed is using a different command queue, this clFinish is needed to ensure the image is ready.
-    clFinish(q);
-    return NULL;
-  }
+
+  copy_queue(&loadyuv, q, img_buffer_20hz_cl, input_frames_cl, 0, 0, frame_size_bytes);
+  copy_queue(&loadyuv, q, last_img_cl, input_frames_cl, 0, frame_size_bytes, frame_size_bytes);
+
+  // NOTE: the consumer of input_frames_cl (formerly thneed) may be using a different command queue, so this clFinish is needed to ensure the image is ready.
+  clFinish(q);
+  return &input_frames_cl;
+}
+
+uint8_t* ModelFrame::buffer_from_cl(cl_mem *in_frames) {
+  CL_CHECK(clEnqueueReadBuffer(q, *in_frames, CL_TRUE, 0, MODEL_FRAME_SIZE * 2 * sizeof(uint8_t), &input_frames[0], 0, nullptr, nullptr));
+  clFinish(q);
+  return &input_frames[0];
 }
 
 ModelFrame::~ModelFrame() {

diff --git a/selfdrive/modeld/models/commonmodel.h b/selfdrive/modeld/models/commonmodel.h
@@ -20,7 +20,8 @@ class ModelFrame {
 public:
   ModelFrame(cl_device_id device_id, cl_context context);
   ~ModelFrame();
-  uint8_t* prepare(cl_mem yuv_cl, int width, int height, int frame_stride, int frame_uv_offset, const mat3& transform, cl_mem *output);
+  cl_mem* prepare(cl_mem yuv_cl, int width, int height, int frame_stride, int frame_uv_offset, const mat3& transform);
+  uint8_t* buffer_from_cl(cl_mem *in_frames);
 
   const int MODEL_WIDTH = 512;
   const int MODEL_HEIGHT = 256;
@@ -32,7 +33,7 @@ class ModelFrame {
   Transform transform;
   LoadYUVState loadyuv;
   cl_command_queue q;
-  cl_mem y_cl, u_cl, v_cl, img_buffer_20hz_cl, last_img_cl;
+  cl_mem y_cl, u_cl, v_cl, img_buffer_20hz_cl, last_img_cl, input_frames_cl;
   cl_buffer_region region;
   std::unique_ptr<uint8_t[]> input_frames;
-};
+};