Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Replace ThneedModel with TinygradModel #33532

Draft
wants to merge 17 commits into
base: master
Choose a base branch
from
2 changes: 1 addition & 1 deletion release/release_files.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@
"tools/joystick/",
"tools/longitudinal_maneuvers/",

"tinygrad_repo/openpilot/compile2.py",
"tinygrad_repo/examples/openpilot/compile3.py",
"tinygrad_repo/extra/onnx.py",
"tinygrad_repo/extra/onnx_ops.py",
"tinygrad_repo/extra/thneed.py",
Expand Down
37 changes: 13 additions & 24 deletions selfdrive/modeld/SConscript
Original file line number Diff line number Diff line change
Expand Up @@ -13,15 +13,6 @@ common_src = [
"transforms/transform.cc",
]

thneed_src_common = [
"thneed/thneed_common.cc",
"thneed/serialize.cc",
]

thneed_src_qcom = thneed_src_common + ["thneed/thneed_qcom2.cc"]
thneed_src_pc = thneed_src_common + ["thneed/thneed_pc.cc"]
thneed_src = thneed_src_qcom if arch == "larch64" else thneed_src_pc

# SNPE except on Mac and ARM Linux
snpe_lib = []
if arch != "Darwin" and arch != "aarch64":
Expand Down Expand Up @@ -59,20 +50,18 @@ fn = File("models/supercombo").abspath
cmd = f'python3 {Dir("#selfdrive/modeld").abspath}/get_model_metadata.py {fn}.onnx'
lenv.Command(fn + "_metadata.pkl", [fn + ".onnx"] + tinygrad_files, cmd)

# Build thneed model
if arch == "larch64" or GetOption('pc_thneed'):
tinygrad_opts = []
if not GetOption('pc_thneed'):
# use FLOAT16 on device for speed + don't cache the CL kernels for space
tinygrad_opts += ["FLOAT16=1", "PYOPENCL_NO_CACHE=1"]
cmd = f"cd {Dir('#').abspath}/tinygrad_repo && " + ' '.join(tinygrad_opts) + f" python3 openpilot/compile2.py {fn}.onnx {fn}.thneed"

lenv.Command(fn + ".thneed", [fn + ".onnx"] + tinygrad_files, cmd)
# Compile tinygrad model
# TODO this is all super hacky
pythonpath_string = 'PYTHONPATH="${PYTHONPATH}:' + env.Dir("#tinygrad_repo").abspath + '"'
if arch == 'larch64':
device_string = 'QCOM=1'
elif arch == 'Darwin' or arch == 'aarch64':
device_string = 'CLANG=1 IMAGE=0'
else:
device_string = 'GPU=1'

fn_dm = File("models/dmonitoring_model").abspath
cmd = f"cd {Dir('#').abspath}/tinygrad_repo && " + ' '.join(tinygrad_opts) + f" python3 openpilot/compile2.py {fn_dm}.onnx {fn_dm}.thneed"
lenv.Command(fn_dm + ".thneed", [fn_dm + ".onnx"] + tinygrad_files, cmd)
for model_name in ['supercombo', 'dmonitoring_model']:
fn = File(f"models/{model_name}").abspath
cmd = f'{pythonpath_string} {device_string} python3 {Dir("#tinygrad_repo").abspath}/examples/openpilot/compile3.py {fn}.onnx {fn}_tinygrad.pkl'
lenv.Command(fn + "_tinygrad.pkl", [fn + ".onnx"] + tinygrad_files, cmd)

thneed_lib = env.SharedLibrary('thneed', thneed_src, LIBS=[gpucommon, common, 'OpenCL', 'dl'])
thneedmodel_lib = env.Library('thneedmodel', ['runners/thneedmodel.cc'])
lenvCython.Program('runners/thneedmodel_pyx.so', 'runners/thneedmodel_pyx.pyx', LIBS=envCython["LIBS"]+[thneedmodel_lib, thneed_lib, gpucommon, common, 'dl', 'OpenCL'])
6 changes: 0 additions & 6 deletions selfdrive/modeld/dmonitoringmodeld
Original file line number Diff line number Diff line change
@@ -1,10 +1,4 @@
#!/usr/bin/env bash

DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" >/dev/null && pwd)"
cd "$DIR/../../"

if [ -f "$DIR/libthneed.so" ]; then
export LD_PRELOAD="$DIR/libthneed.so"
fi

exec "$DIR/dmonitoringmodeld.py" "$@"
59 changes: 34 additions & 25 deletions selfdrive/modeld/dmonitoringmodeld.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,16 @@
#!/usr/bin/env python3
import os
from openpilot.system.hardware import TICI
## TODO this is hack
if TICI:
GPU_BACKEND = 'QCOM'
else:
GPU_BACKEND = 'GPU'
os.environ[GPU_BACKEND] = '1'
import gc
import math
import time
import pickle
import ctypes
import numpy as np
from pathlib import Path
Expand All @@ -14,9 +22,11 @@
from openpilot.common.swaglog import cloudlog
from openpilot.common.params import Params
from openpilot.common.realtime import set_realtime_priority
from openpilot.selfdrive.modeld.runners import ModelRunner, Runtime
from openpilot.selfdrive.modeld.models.commonmodel_pyx import CLContext
from openpilot.selfdrive.modeld.models.commonmodel_pyx import CLContext, cl_from_visionbuf
from openpilot.selfdrive.modeld.parse_model_outputs import sigmoid
from tinygrad.tensor import Tensor
from tinygrad.helpers import to_mv, mv_address
from tinygrad.dtype import dtypes

CALIB_LEN = 3
MODEL_WIDTH = 1440
Expand All @@ -26,9 +36,7 @@

PROCESS_NAME = "selfdrive.modeld.dmonitoringmodeld"
SEND_RAW_PRED = os.getenv('SEND_RAW_PRED')
MODEL_PATHS = {
ModelRunner.THNEED: Path(__file__).parent / 'models/dmonitoring_model.thneed',
ModelRunner.ONNX: Path(__file__).parent / 'models/dmonitoring_model.onnx'}
MODEL_PKL_PATH = Path(__file__).parent / 'models/dmonitoring_model_tinygrad.pkl'

class DriverStateResult(ctypes.Structure):
_fields_ = [
Expand Down Expand Up @@ -59,33 +67,36 @@ class DMonitoringModelResult(ctypes.Structure):
class ModelState:
inputs: dict[str, np.ndarray]
output: np.ndarray
model: ModelRunner

def __init__(self, cl_ctx):
assert ctypes.sizeof(DMonitoringModelResult) == OUTPUT_SIZE * ctypes.sizeof(ctypes.c_float)
self.output = np.zeros(OUTPUT_SIZE, dtype=np.float32)
self.inputs = {
'input_img': np.zeros(MODEL_HEIGHT * MODEL_WIDTH, dtype=np.uint8),
'calib': np.zeros(CALIB_LEN, dtype=np.float32)}
self.numpy_inputs = {'calib': np.zeros((1, CALIB_LEN), dtype=np.float32)}
#TODO this only works on some backends, verified on CLANG
self.tensor_inputs = {k: Tensor.from_blob(mv_address(v), v.shape, dtype=dtypes.float) for k, v in self.numpy_inputs.items()}
self.tensor_inputs['input_img'] = None

self.model = ModelRunner(MODEL_PATHS, self.output, Runtime.GPU, False, cl_ctx)
self.model.addInput("input_img", None)
self.model.addInput("calib", self.inputs['calib'])

def run(self, buf:VisionBuf, calib:np.ndarray) -> tuple[np.ndarray, float]:
self.inputs['calib'][:] = calib
with open(MODEL_PKL_PATH, "rb") as f:
self.model_run = pickle.load(f)

v_offset = buf.height - MODEL_HEIGHT
h_offset = (buf.width - MODEL_WIDTH) // 2
buf_data = buf.data.reshape(-1, buf.stride)
input_data = self.inputs['input_img'].reshape(MODEL_HEIGHT, MODEL_WIDTH)
input_data[:] = buf_data[v_offset:v_offset+MODEL_HEIGHT, h_offset:h_offset+MODEL_WIDTH]
def run(self, buf:VisionBuf, calib:np.ndarray) -> tuple[np.ndarray, float]:
self.numpy_inputs['calib'][0,:] = calib

self.model.setInputBuffer("input_img", self.inputs['input_img'].view(np.float32))
t1 = time.perf_counter()
self.model.execute()
if TICI:
if self.tensor_inputs['input_img'] is None:
input_img_cl = cl_from_visionbuf(buf)
cl_buf_desc_ptr = to_mv(input_img_cl.mem_address, 8).cast('Q')[0]
rawbuf_ptr = to_mv(cl_buf_desc_ptr, 0x100).cast('Q')[20] # offset 0xA0 is a raw gpu pointer.
self.tensor_inputs['input_img'] = Tensor.from_blob(rawbuf_ptr, (1, buf.height * 3 // 2, buf.width), dtype=dtypes.uint8, device='QCOM')
else:
self.tensor_inputs = {k: Tensor(v) for k,v in self.numpy_inputs.items()}
self.tensor_inputs['input_img'] = Tensor(buf.data).reshape((1,buf.height * 3 // 2,buf.width))

output = self.model_run(**self.tensor_inputs)['outputs'].numpy().flatten()

t2 = time.perf_counter()
return self.output, t2 - t1
return output, t2 - t1


def fill_driver_state(msg, ds_result: DriverStateResult):
Expand Down Expand Up @@ -155,8 +166,6 @@ def main():
t2 = time.perf_counter()

pm.send("driverStateV2", get_driverstate_packet(model_output, vipc_client.frame_id, vipc_client.timestamp_sof, t2 - t1, gpu_execution_time))
# print("dmonitoring process: %.2fms, from last %.2fms\n" % (t2 - t1, t1 - last))
# last = t1


if __name__ == "__main__":
Expand Down
78 changes: 50 additions & 28 deletions selfdrive/modeld/modeld.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,12 @@
#!/usr/bin/env python3
import os
from openpilot.system.hardware import TICI
## TODO this is hack
if TICI:
GPU_BACKEND = 'QCOM'
else:
GPU_BACKEND = 'GPU'
os.environ[GPU_BACKEND] = '1'
import time
import pickle
import numpy as np
Expand All @@ -18,21 +25,25 @@
from openpilot.common.transformations.model import get_warp_matrix
from openpilot.system import sentry
from openpilot.selfdrive.controls.lib.desire_helper import DesireHelper
from openpilot.selfdrive.modeld.runners import ModelRunner, Runtime
from openpilot.selfdrive.modeld.parse_model_outputs import Parser
from openpilot.selfdrive.modeld.fill_model_msg import fill_model_msg, fill_pose_msg, PublishState
from openpilot.selfdrive.modeld.constants import ModelConstants
from openpilot.selfdrive.modeld.models.commonmodel_pyx import ModelFrame, CLContext

from tinygrad.tensor import Tensor
from tinygrad.dtype import dtypes
from tinygrad.helpers import to_mv, mv_address
Tensor.manual_seed(1337)
Tensor.no_grad = True

PROCESS_NAME = "selfdrive.modeld.modeld"
SEND_RAW_PRED = os.getenv('SEND_RAW_PRED')

MODEL_PATHS = {
ModelRunner.THNEED: Path(__file__).parent / 'models/supercombo.thneed',
ModelRunner.ONNX: Path(__file__).parent / 'models/supercombo.onnx'}

MODEL_PATH = Path(__file__).parent / 'models/supercombo.onnx'
MODEL_PKL_PATH = Path(__file__).parent / 'models/supercombo_tinygrad.pkl'
METADATA_PATH = Path(__file__).parent / 'models/supercombo_metadata.pkl'

IMG_INPUT_SHAPE = (1, 12, 128, 256)

class FrameMeta:
frame_id: int = 0
Expand All @@ -49,7 +60,6 @@ class ModelState:
inputs: dict[str, np.ndarray]
output: np.ndarray
prev_desire: np.ndarray # for tracking the rising edge of the pulse
model: ModelRunner

def __init__(self, context: CLContext):
self.frame = ModelFrame(context)
Expand All @@ -60,13 +70,15 @@ def __init__(self, context: CLContext):
self.prev_desired_curv_20hz = np.zeros((ModelConstants.FULL_HISTORY_BUFFER_LEN + 1, ModelConstants.PREV_DESIRED_CURV_LEN), dtype=np.float32)

# img buffers are managed in openCL transform code
self.inputs = {
'desire': np.zeros(ModelConstants.DESIRE_LEN * (ModelConstants.HISTORY_BUFFER_LEN+1), dtype=np.float32),
'traffic_convention': np.zeros(ModelConstants.TRAFFIC_CONVENTION_LEN, dtype=np.float32),
'lateral_control_params': np.zeros(ModelConstants.LATERAL_CONTROL_PARAMS_LEN, dtype=np.float32),
'prev_desired_curv': np.zeros(ModelConstants.PREV_DESIRED_CURV_LEN * (ModelConstants.HISTORY_BUFFER_LEN+1), dtype=np.float32),
'features_buffer': np.zeros(ModelConstants.HISTORY_BUFFER_LEN * ModelConstants.FEATURE_LEN, dtype=np.float32),
self.numpy_inputs = {
'desire': np.zeros((1, (ModelConstants.HISTORY_BUFFER_LEN+1), ModelConstants.DESIRE_LEN), dtype=np.float32),
'traffic_convention': np.zeros((1, ModelConstants.TRAFFIC_CONVENTION_LEN), dtype=np.float32),
'lateral_control_params': np.zeros((1, ModelConstants.LATERAL_CONTROL_PARAMS_LEN), dtype=np.float32),
'prev_desired_curv': np.zeros((1,(ModelConstants.HISTORY_BUFFER_LEN+1), ModelConstants.PREV_DESIRED_CURV_LEN), dtype=np.float32),
'features_buffer': np.zeros((1, ModelConstants.HISTORY_BUFFER_LEN, ModelConstants.FEATURE_LEN), dtype=np.float32),
}
#TODO this only works on some backends, verified on CLANG
self.tensor_inputs = {k: Tensor.from_blob(mv_address(v), v.shape, dtype=dtypes.float) for k, v in self.numpy_inputs.items()} # type: ignore

with open(METADATA_PATH, 'rb') as f:
model_metadata = pickle.load(f)
Expand All @@ -76,11 +88,8 @@ def __init__(self, context: CLContext):
self.output = np.zeros(net_output_size, dtype=np.float32)
self.parser = Parser()

self.model = ModelRunner(MODEL_PATHS, self.output, Runtime.GPU, False, context)
self.model.addInput("input_imgs", None)
self.model.addInput("big_input_imgs", None)
for k,v in self.inputs.items():
self.model.addInput(k, v)
with open(MODEL_PKL_PATH, "rb") as f:
self.model_run = pickle.load(f)

def slice_outputs(self, model_outputs: np.ndarray) -> dict[str, np.ndarray]:
parsed_model_outputs = {k: model_outputs[np.newaxis, v] for k,v in self.output_slices.items()}
Expand All @@ -97,18 +106,31 @@ def run(self, buf: VisionBuf, wbuf: VisionBuf, transform: np.ndarray, transform_

self.desire_20Hz[:-1] = self.desire_20Hz[1:]
self.desire_20Hz[-1] = new_desire
self.inputs['desire'][:] = self.desire_20Hz.reshape((25,4,-1)).max(axis=1).flatten()

self.inputs['traffic_convention'][:] = inputs['traffic_convention']
self.inputs['lateral_control_params'][:] = inputs['lateral_control_params']

self.model.setInputBuffer("input_imgs", self.frame.prepare(buf, transform.flatten(), self.model.getCLBuffer("input_imgs")))
self.model.setInputBuffer("big_input_imgs", self.wide_frame.prepare(wbuf, transform_wide.flatten(), self.model.getCLBuffer("big_input_imgs")))
self.numpy_inputs['desire'][:] = self.desire_20Hz.reshape((1,25,4,-1)).max(axis=2)

self.numpy_inputs['traffic_convention'][:] = inputs['traffic_convention']
self.numpy_inputs['lateral_control_params'][:] = inputs['lateral_control_params']
input_imgs_cl = self.frame.prepare(buf, transform.flatten())
big_input_imgs_cl = self.wide_frame.prepare(wbuf, transform_wide.flatten())

if TICI:
# The imgs tensors are backed by opencl memory, only need init once
if 'input_imgs' not in self.tensor_inputs:
cl_buf_desc_ptr = to_mv(input_imgs_cl.mem_address, 8).cast('Q')[0]
big_cl_buf_desc_ptr = to_mv(big_input_imgs_cl.mem_address, 8).cast('Q')[0]
rawbuf_ptr = to_mv(cl_buf_desc_ptr, 0x100).cast('Q')[20] # offset 0xA0 is a raw gpu pointer.
big_rawbuf_ptr = to_mv(big_cl_buf_desc_ptr, 0x100).cast('Q')[20] # offset 0xA0 is a raw gpu pointer.
self.tensor_inputs['input_imgs'] = Tensor.from_blob(rawbuf_ptr, IMG_INPUT_SHAPE, dtype=dtypes.uint8, device='QCOM')
self.tensor_inputs['big_input_imgs'] = Tensor.from_blob(big_rawbuf_ptr, IMG_INPUT_SHAPE, dtype=dtypes.uint8, device='QCOM')
else:
self.tensor_inputs = {k: Tensor(v) for k,v in self.numpy_inputs.items()}
self.tensor_inputs['input_imgs'] = Tensor(self.frame.buffer_from_cl(input_imgs_cl)).reshape(IMG_INPUT_SHAPE)
self.tensor_inputs['big_input_imgs'] = Tensor(self.wide_frame.buffer_from_cl(big_input_imgs_cl)).reshape(IMG_INPUT_SHAPE)

if prepare_only:
return None

self.model.execute()
self.output = self.model_run(**self.tensor_inputs)['outputs'].numpy().flatten()
outputs = self.parser.parse_outputs(self.slice_outputs(self.output))

self.full_features_20Hz[:-1] = self.full_features_20Hz[1:]
Expand All @@ -118,9 +140,9 @@ def run(self, buf: VisionBuf, wbuf: VisionBuf, transform: np.ndarray, transform_
self.prev_desired_curv_20hz[-1] = outputs['desired_curvature'][0, :]

idxs = np.arange(-4,-100,-4)[::-1]
self.inputs['features_buffer'][:] = self.full_features_20Hz[idxs].flatten()
self.numpy_inputs['features_buffer'][:] = self.full_features_20Hz[idxs]
# TODO model only uses last value now, once that changes we need to input strided action history buffer
self.inputs['prev_desired_curv'][-ModelConstants.PREV_DESIRED_CURV_LEN:] = 0. * self.prev_desired_curv_20hz[-4, :]
self.numpy_inputs['prev_desired_curv'][-ModelConstants.PREV_DESIRED_CURV_LEN:] = 0. * self.prev_desired_curv_20hz[-4, :]
return outputs


Expand Down Expand Up @@ -189,7 +211,7 @@ def main(demo=False):
cloudlog.info("modeld got CarParams: %s", CP.carName)

# TODO this needs more thought, use .2s extra for now to estimate other delays
steer_delay = CP.steerActuatorDelay + .2
steer_delay = .2

DH = DesireHelper()

Expand Down
29 changes: 15 additions & 14 deletions selfdrive/modeld/models/commonmodel.cc
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

ModelFrame::ModelFrame(cl_device_id device_id, cl_context context) {
input_frames = std::make_unique<uint8_t[]>(buf_size);
input_frames_cl = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_READ_WRITE, buf_size, NULL, &err));

q = CL_CHECK_ERR(clCreateCommandQueue(context, device_id, 0, &err));
y_cl = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_READ_WRITE, MODEL_WIDTH * MODEL_HEIGHT, NULL, &err));
Expand All @@ -22,7 +23,7 @@ ModelFrame::ModelFrame(cl_device_id device_id, cl_context context) {
loadyuv_init(&loadyuv, context, device_id, MODEL_WIDTH, MODEL_HEIGHT);
}

uint8_t* ModelFrame::prepare(cl_mem yuv_cl, int frame_width, int frame_height, int frame_stride, int frame_uv_offset, const mat3 &projection, cl_mem *output) {
cl_mem* ModelFrame::prepare(cl_mem yuv_cl, int frame_width, int frame_height, int frame_stride, int frame_uv_offset, const mat3 &projection) {
transform_queue(&this->transform, q,
yuv_cl, frame_width, frame_height, frame_stride, frame_uv_offset,
y_cl, u_cl, v_cl, MODEL_WIDTH, MODEL_HEIGHT, projection);
Expand All @@ -31,19 +32,19 @@ uint8_t* ModelFrame::prepare(cl_mem yuv_cl, int frame_width, int frame_height, i
CL_CHECK(clEnqueueCopyBuffer(q, img_buffer_20hz_cl, img_buffer_20hz_cl, (i+1)*frame_size_bytes, i*frame_size_bytes, frame_size_bytes, 0, nullptr, nullptr));
}
loadyuv_queue(&loadyuv, q, y_cl, u_cl, v_cl, last_img_cl);
if (output == NULL) {
CL_CHECK(clEnqueueReadBuffer(q, img_buffer_20hz_cl, CL_TRUE, 0, frame_size_bytes, &input_frames[0], 0, nullptr, nullptr));
CL_CHECK(clEnqueueReadBuffer(q, last_img_cl, CL_TRUE, 0, frame_size_bytes, &input_frames[MODEL_FRAME_SIZE], 0, nullptr, nullptr));
clFinish(q);
return &input_frames[0];
} else {
copy_queue(&loadyuv, q, img_buffer_20hz_cl, *output, 0, 0, frame_size_bytes);
copy_queue(&loadyuv, q, last_img_cl, *output, 0, frame_size_bytes, frame_size_bytes);

// NOTE: Since thneed is using a different command queue, this clFinish is needed to ensure the image is ready.
clFinish(q);
return NULL;
}

copy_queue(&loadyuv, q, img_buffer_20hz_cl, input_frames_cl, 0, 0, frame_size_bytes);
copy_queue(&loadyuv, q, last_img_cl, input_frames_cl, 0, frame_size_bytes, frame_size_bytes);

// NOTE: Since thneed is using a different command queue, this clFinish is needed to ensure the image is ready.
clFinish(q);
return &input_frames_cl;
}

uint8_t* ModelFrame::buffer_from_cl(cl_mem *in_frames) {
CL_CHECK(clEnqueueReadBuffer(q, *in_frames, CL_TRUE, 0, MODEL_FRAME_SIZE * 2 * sizeof(uint8_t), &input_frames[0], 0, nullptr, nullptr));
clFinish(q);
return &input_frames[0];
}

ModelFrame::~ModelFrame() {
Expand Down
7 changes: 4 additions & 3 deletions selfdrive/modeld/models/commonmodel.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,8 @@ class ModelFrame {
public:
ModelFrame(cl_device_id device_id, cl_context context);
~ModelFrame();
uint8_t* prepare(cl_mem yuv_cl, int width, int height, int frame_stride, int frame_uv_offset, const mat3& transform, cl_mem *output);
cl_mem* prepare(cl_mem yuv_cl, int width, int height, int frame_stride, int frame_uv_offset, const mat3& transform);
uint8_t* buffer_from_cl(cl_mem *in_frames);

const int MODEL_WIDTH = 512;
const int MODEL_HEIGHT = 256;
Expand All @@ -32,7 +33,7 @@ class ModelFrame {
Transform transform;
LoadYUVState loadyuv;
cl_command_queue q;
cl_mem y_cl, u_cl, v_cl, img_buffer_20hz_cl, last_img_cl;
cl_mem y_cl, u_cl, v_cl, img_buffer_20hz_cl, last_img_cl, input_frames_cl;
cl_buffer_region region;
std::unique_ptr<uint8_t[]> input_frames;
};
};
Loading
Loading