From e5ba1c8c0b0d6a5cc482b54a1cfdaf67a3341254 Mon Sep 17 00:00:00 2001 From: Weifeng Liu Date: Thu, 29 Aug 2024 02:55:44 +0000 Subject: [PATCH] Use shadow buffers for dGPU VF + iGPU (backing virtio-GPU) output To get best performance we must guarantee that scan-out buffers used for composition in surfaceflinger reside in GPU local memory, but importing these buffers into virtio-GPU will migrate the buffers from local memory to system memory, which will highly impact the performance. To avoid migration of these client-composited buffers, allocate a shadow buffer for each of them and import the shadow buffers into virtio-GPU for scanning-out. Right before atomic commit, leverage GPU blit engine to copy content to shadow buffer. Use shadow buffers only when feature ALLOW_P2P of virtio-GPU is not present and dGPU exists. There are several GPU instructions to blit memory: - XY_FAST_COPY_BLT (BSpec: 47982), - XY_SRC_COPY_BLT (BSpec: 48002), - XY_BLOCK_COPY_BLT (BSpec: 3678). By experiment, XY_FAST_COPY is much faster than the other two instructions. Tracked-On: OAM-124182 Signed-off-by: Weifeng Liu --- Android.bp | 10 +- bufferinfo/BufferInfo.h | 11 + bufferinfo/BufferInfoMapperMetadata.cpp | 10 + compositor/LayerData.h | 2 + drm/DrmAtomicStateManager.cpp | 21 +- drm/DrmFbImporter.cpp | 17 +- drm/DrmPlane.cpp | 5 +- hwc2_device/HwcLayer.cpp | 27 +- utils/i915_prelim.h | 118 +++++ utils/intel_blit.cpp | 659 ++++++++++++++++++++++++ utils/intel_blit.h | 48 ++ 11 files changed, 915 insertions(+), 13 deletions(-) create mode 100644 utils/i915_prelim.h create mode 100644 utils/intel_blit.cpp create mode 100644 utils/intel_blit.h diff --git a/Android.bp b/Android.bp index cb3ac78..8d6f251 100644 --- a/Android.bp +++ b/Android.bp @@ -18,10 +18,18 @@ cc_library_static { name: "libdrmhwc_utils", - srcs: ["utils/Worker.cpp"], + srcs: [ + "utils/Worker.cpp", + "utils/intel_blit.cpp" + ], include_dirs: ["vendor/intel/external/drm-hwcomposer"], + shared_libs: [ + "libdrm", + "libutils", + ], + cflags: [ "-Wall", "-Werror", diff --git a/bufferinfo/BufferInfo.h b/bufferinfo/BufferInfo.h index d5a8bcb..339f349 100644 --- a/bufferinfo/BufferInfo.h +++ b/bufferinfo/BufferInfo.h @@ -17,6 +17,7 @@ #pragma once #include +#include "utils/intel_blit.h" constexpr int kBufferMaxPlanes = 4; @@ -50,6 +51,16 @@ struct BufferInfo { /* sizes[] is used only by mapper@4 metadata getter for internal purposes */ uint32_t sizes[kBufferMaxPlanes]; int prime_fds[kBufferMaxPlanes]; + uint32_t prime_buffer_handles[kBufferMaxPlanes]; + bool use_shadow_fds; + struct intel_info info; + /* + * Shadow buffers in system memory. We will blit content of prime_fds to + * shadow_fds right before atomic commit and use the shadow buffers as frame + * buffers. 
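+ * Only these shadow buffers are imported into virtio-GPU for scan-out, so
+ * the client-composited buffers themselves stay in dGPU local memory and
+ * are never migrated.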
+ **/ + int shadow_fds[kBufferMaxPlanes]; + uint32_t shadow_buffer_handles[kBufferMaxPlanes]; uint64_t modifiers[kBufferMaxPlanes]; BufferColorSpace color_space; diff --git a/bufferinfo/BufferInfoMapperMetadata.cpp b/bufferinfo/BufferInfoMapperMetadata.cpp index 72b20f3..4895edf 100644 --- a/bufferinfo/BufferInfoMapperMetadata.cpp +++ b/bufferinfo/BufferInfoMapperMetadata.cpp @@ -28,6 +28,7 @@ #include #include "utils/log.h" +#include "utils/intel_blit.h" namespace android { @@ -81,6 +82,15 @@ BufferInfoMapperMetadata::GetFds(buffer_handle_t handle, BufferInfo *bo) { ALOGE("Invalid prime fd"); return android::BAD_VALUE; } + + int dgpu_fd = intel_dgpu_fd(); + if (dgpu_fd >= 0) { + int ret = drmPrimeFDToHandle(dgpu_fd, bo->prime_fds[i], &bo->prime_buffer_handles[i]); + if (ret) { + ALOGE("Cannot convert prime fd to handle\n"); + return android::BAD_VALUE; + } + } } return 0; diff --git a/compositor/LayerData.h b/compositor/LayerData.h index d04514d..62cde48 100644 --- a/compositor/LayerData.h +++ b/compositor/LayerData.h @@ -69,6 +69,7 @@ struct LayerData { clonned.fb = fb; clonned.pi = pi; clonned.acquire_fence = std::move(acquire_fence); + clonned.blit_fence = std::move(blit_fence); return clonned; } @@ -76,6 +77,7 @@ struct LayerData { std::shared_ptr fb; PresentInfo pi; UniqueFd acquire_fence; + UniqueFd blit_fence; }; } // namespace android diff --git a/drm/DrmAtomicStateManager.cpp b/drm/DrmAtomicStateManager.cpp index 7e7870c..f143ca1 100644 --- a/drm/DrmAtomicStateManager.cpp +++ b/drm/DrmAtomicStateManager.cpp @@ -14,6 +14,8 @@ * limitations under the License. */ +#include +#include #undef NDEBUG /* Required for assert to work */ #define ATRACE_TAG ATRACE_TAG_GRAPHICS @@ -26,6 +28,7 @@ #include #include #include +#include "utils/intel_blit.h" #include #include @@ -105,7 +108,6 @@ auto DrmAtomicStateManager::CommitFrame(AtomicCommitArgs &args) -> int { auto unused_planes = new_frame_state.used_planes; bool has_hdr_layer = false; - if (args.composition) { new_frame_state.used_planes.clear(); @@ -113,6 +115,23 @@ auto DrmAtomicStateManager::CommitFrame(AtomicCommitArgs &args) -> int { DrmPlane *plane = joining.plane->Get(); LayerData &layer = joining.layer; + if (layer.bi->use_shadow_fds) { + int ret = 0; + int out_handle; + // Use any tiling mode other than linear suffers from corrupted images. + uint32_t tiling = I915_TILING_NONE; + // TODO: handle multi-plane buffer + ret = intel_blit(&layer.bi->info, layer.bi->shadow_buffer_handles[0], + layer.bi->prime_buffer_handles[0], + layer.bi->pitches[0], 4, tiling, + layer.bi->width, layer.bi->height, + layer.acquire_fence.Get(), &out_handle); + if (ret) { + ALOGE("failed to blit scan-out buffer\n"); + } + layer.blit_fence = android::UniqueFd(out_handle); + } + if (layer.bi->color_space >= BufferColorSpace::kItuRec2020) { has_hdr_layer = true; } diff --git a/drm/DrmFbImporter.cpp b/drm/DrmFbImporter.cpp index 6189bd6..00f98fb 100644 --- a/drm/DrmFbImporter.cpp +++ b/drm/DrmFbImporter.cpp @@ -45,15 +45,15 @@ auto DrmFbIdHandle::CreateInstance(BufferInfo *bo, GemHandle first_gem_handle, local->gem_handles_[0] = first_gem_handle; int32_t err = 0; + int *fds = bo->use_shadow_fds ? 
bo->shadow_fds : bo->prime_fds; /* Framebuffer object creation require gem handle for every used plane */ for (size_t i = 1; i < local->gem_handles_.size(); i++) { - if (bo->prime_fds[i] > 0) { - if (bo->prime_fds[i] != bo->prime_fds[0]) { - err = drmPrimeFDToHandle(drm.GetFd(), bo->prime_fds[i], + if (fds[i] > 0) { + if (fds[i] != fds[0]) { + err = drmPrimeFDToHandle(drm.GetFd(), fds[i], &local->gem_handles_.at(i)); if (err != 0) { - ALOGE("failed to import prime fd %d errno=%d", bo->prime_fds[i], - errno); + ALOGE("failed to import prime fd %d errno=%d", fds[i], errno); } } else { local->gem_handles_.at(i) = local->gem_handles_[0]; @@ -129,11 +129,12 @@ auto DrmFbImporter::GetOrCreateFbId(BufferInfo *bo) -> std::shared_ptr { /* Lookup DrmFbIdHandle in cache first. First handle serves as a cache key. */ GemHandle first_handle = 0; - int32_t err = drmPrimeFDToHandle(drm_->GetFd(), bo->prime_fds[0], - &first_handle); + int *fds = bo->use_shadow_fds ? bo->shadow_fds : bo->prime_fds; + + int32_t err = drmPrimeFDToHandle(drm_->GetFd(), fds[0], &first_handle); if (err != 0) { - ALOGE("Failed to import prime fd %d ret=%d", bo->prime_fds[0], err); + ALOGE("Failed to import prime fd %d ret=%d", fds[0], err); return {}; } diff --git a/drm/DrmPlane.cpp b/drm/DrmPlane.cpp index 1fb54e7..81b5fcb 100644 --- a/drm/DrmPlane.cpp +++ b/drm/DrmPlane.cpp @@ -240,8 +240,9 @@ auto DrmPlane::AtomicSetState(drmModeAtomicReq &pset, LayerData &layer, } } - if (layer.acquire_fence && - !in_fence_fd_property_.AtomicSet(pset, layer.acquire_fence.Get())) { + int fence = layer.bi->use_shadow_fds ? layer.blit_fence.Get() : layer.acquire_fence.Get(); + if (fence > 0 && + !in_fence_fd_property_.AtomicSet(pset, fence)) { return -EINVAL; } diff --git a/hwc2_device/HwcLayer.cpp b/hwc2_device/HwcLayer.cpp index 453af60..69bfae9 100644 --- a/hwc2_device/HwcLayer.cpp +++ b/hwc2_device/HwcLayer.cpp @@ -14,6 +14,7 @@ * limitations under the License. 
*/ +#include #define LOG_TAG "hwc-layer" #include "HwcLayer.h" @@ -21,6 +22,7 @@ #include "HwcDisplay.h" #include "bufferinfo/BufferInfoGetter.h" #include "utils/log.h" +#include "utils/intel_blit.h" namespace android { @@ -252,6 +254,29 @@ void HwcLayer::ImportFb() { return; } + int kms_fd = parent_->GetPipe().device->GetFd(); + layer_data_.bi->use_shadow_fds = (intel_dgpu_fd() >= 0) && !virtio_gpu_allow_p2p(kms_fd); + if (layer_data_.bi->use_shadow_fds) { + uint32_t handle; + int ret = intel_create_buffer(layer_data_.bi->width, layer_data_.bi->height, + layer_data_.bi->format, layer_data_.bi->modifiers[0], + &handle); + ALOGI("create shadow buffer, modifier=0x%lx\n", (unsigned long) layer_data_.bi->modifiers[0]); + if (ret) { + ALOGE("Failed to create shadow buffer\n"); + layer_data_.bi->use_shadow_fds = false; + } else { + layer_data_.bi->shadow_buffer_handles[0] = handle; + ret = drmPrimeHandleToFD(intel_dgpu_fd(), handle, 0, &layer_data_.bi->shadow_fds[0]); + if (ret) { + ALOGE("Failed to export shadow buffer\n"); + layer_data_.bi->use_shadow_fds = false; + drmCloseBufferHandle(intel_dgpu_fd(), handle); + } + intel_blit_init(&layer_data_.bi->info); + } + } + layer_data_ .fb = parent_->GetPipe().device->GetDrmFbImporter().GetOrCreateFbId( &layer_data_.bi.value()); @@ -357,4 +382,4 @@ void HwcLayer::SwChainClearCache() { swchain_reassembled_ = false; } -} // namespace android \ No newline at end of file +} // namespace android diff --git a/utils/i915_prelim.h b/utils/i915_prelim.h new file mode 100644 index 0000000..7a94358 --- /dev/null +++ b/utils/i915_prelim.h @@ -0,0 +1,118 @@ +/* + * Copyright 2017 The Chromium OS Authors. All rights reserved. + * Use of this source code is governed by a BSD-style license that can be + * found in the LICENSE file. + */ +#ifndef I915_PRELIM +#define I915_PRELIM + +#include + +#define PRELIM_DRM_I915_QUERY (1 << 16) +#define PRELIM_DRM_I915_QUERY_MEMORY_REGIONS (PRELIM_DRM_I915_QUERY | 4) +#define PRELIM_I915_OBJECT_PARAM (1ull << 48) +#define PRELIM_I915_PARAM_MEMORY_REGIONS ((1 << 16) | 0x1) +#define PRELIM_I915_USER_EXT (1 << 16) +#define PRELIM_I915_GEM_CREATE_EXT_SETPARAM (PRELIM_I915_USER_EXT | 1) +#define PRELIM_DRM_IOCTL_I915_GEM_CREATE_EXT DRM_IOWR(DRM_COMMAND_BASE + DRM_I915_GEM_CREATE, struct prelim_drm_i915_gem_create_ext) + +#define prelim_drm_i915_gem_memory_class_instance drm_i915_gem_memory_class_instance +struct prelim_drm_i915_gem_object_param { + /* Object handle (0 for I915_GEM_CREATE_EXT_SETPARAM) */ + __u32 handle; + + /* Data pointer size */ + __u32 size; + +/* + * PRELIM_I915_OBJECT_PARAM: + * + * Select object namespace for the param. + */ +#define PRELIM_I915_OBJECT_PARAM (1ull << 48) + +/* + * PRELIM_I915_PARAM_MEMORY_REGIONS: + * + * Set the data pointer with the desired set of placements in priority + * order(each entry must be unique and supported by the device), as an array of + * prelim_drm_i915_gem_memory_class_instance, or an equivalent layout of class:instance + * pair encodings. See PRELIM_DRM_I915_QUERY_MEMORY_REGIONS for how to query the + * supported regions. 
+ * + * Note that this requires the PRELIM_I915_OBJECT_PARAM namespace: + * .param = PRELIM_I915_OBJECT_PARAM | PRELIM_I915_PARAM_MEMORY_REGIONS + */ +#define PRELIM_I915_PARAM_MEMORY_REGIONS ((1 << 16) | 0x1) + __u64 param; + + /* Data value or pointer */ + __u64 data; +}; + +struct prelim_drm_i915_gem_create_ext_setparam { + struct i915_user_extension base; + struct prelim_drm_i915_gem_object_param param; +}; + +/** + * struct prelim_drm_i915_memory_region_info + * + * Describes one region as known to the driver. + */ +struct prelim_drm_i915_memory_region_info { + /** class:instance pair encoding */ + struct drm_i915_gem_memory_class_instance region; + + /** MBZ */ + __u32 rsvd0; + + /** MBZ */ + __u64 caps; + + /** MBZ */ + __u64 flags; + + /** Memory probed by the driver (-1 = unknown) */ + __u64 probed_size; + + /** Estimate of memory remaining (-1 = unknown) */ + __u64 unallocated_size; + + /** MBZ */ + __u64 rsvd1[8]; +}; + +struct prelim_drm_i915_query_memory_regions { + /** @num_regions: Number of supported regions */ + __u32 num_regions; + + /** @rsvd: MBZ */ + __u32 rsvd[3]; + + /** @regions: Info about each supported region */ + struct prelim_drm_i915_memory_region_info regions[]; +}; + + +struct prelim_drm_i915_gem_create_ext { + + /** + * Requested size for the object. + * + * The (page-aligned) allocated size for the object will be returned. + */ + __u64 size; + /** + * Returned handle for the object. + * + * Object handles are nonzero. + */ + __u32 handle; + __u32 pad; +#define PRELIM_I915_GEM_CREATE_EXT_SETPARAM (PRELIM_I915_USER_EXT | 1) +#define PRELIM_I915_GEM_CREATE_EXT_FLAGS_UNKNOWN \ + (~PRELIM_I915_GEM_CREATE_EXT_SETPARAM) + __u64 extensions; +}; +#endif diff --git a/utils/intel_blit.cpp b/utils/intel_blit.cpp new file mode 100644 index 0000000..f595623 --- /dev/null +++ b/utils/intel_blit.cpp @@ -0,0 +1,659 @@ +/* + * Copyright (C) 2024 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "intel_blit.h" + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include "i915_prelim.h" + +#include + +#define ARRAY_SIZE(a) (sizeof(a) / sizeof(a[0])) + +#define MI_NOOP (0) +#define MI_FLUSH_DW (0x26 << 23) +#define MI_BATCH_BUFFER_END (0x0a << 23) + +#define XY_SRC_COPY_BLT_CMD ((0x2 << 29) | (0x53 << 22) | 8) // gen >> 8 +#define XY_SRC_COPY_BLT_WRITE_ALPHA (1 << 21) +#define XY_SRC_COPY_BLT_WRITE_RGB (1 << 20) +#define XY_SRC_COPY_BLT_SRC_TILED (1 << 15) +#define XY_SRC_COPY_BLT_DST_TILED (1 << 11) + +#define XY_TILE_LINEAR 0 +#define XY_TILE_X 1 +#define XY_TILE_4 2 +#define XY_TILE_64 3 + +// GEN 125 +#define HALIGN_16 0 +#define HALIGN_32 1 +#define HALIGN_64 2 +#define HALIGN_128 3 +#define VALIGN_4 1 +#define VALIGN_8 2 +#define VALIGN_16 3 + +#define PAGE_SHIFT 12 + +// BSpec: 21523 +#define XY_BLOCK_COPY_BLT_CMD ((0x2 << 29) | (0x41 << 22) | (0x14)) + +// BSpec: 47982 +#define XY_FAST_COPY_BLT_CMD ((0x2 << 29) | (0x42 << 22) | (0x8)) + +static void batch_reset(struct intel_info *info) { + info->cur = info->vaddr; +} + +static int batch_create(struct intel_info *info) { + struct drm_i915_gem_create create; + struct drm_i915_gem_mmap mmap_arg; + int ret = 0; + memset(&create, 0, sizeof(create)); + memset(&mmap_arg, 0, sizeof(mmap_arg)); + create.size = info->size; + ret = ioctl(info->fd, DRM_IOCTL_I915_GEM_CREATE, &create); + if (ret < 0) { + ALOGE("failed to create buffer\n"); + info->batch_handle = 0; + return ret; + } + info->batch_handle = create.handle; + mmap_arg.handle = create.handle; + mmap_arg.size = info->size; + ret = ioctl(info->fd, DRM_IOCTL_I915_GEM_MMAP, &mmap_arg); + if (ret < 0) { + drmCloseBufferHandle(info->fd, info->batch_handle); + info->batch_handle = 0; + ALOGE("buffer map failure\n"); + return ret; + } + info->vaddr = (uint32_t *)mmap_arg.addr_ptr; + batch_reset(info); + return ret; +} + +__attribute__((unused)) +static int batch_count(struct intel_info *info) { + return info->cur - info->vaddr; +} + +static void batch_dword(struct intel_info *info, uint32_t dword) { + *info->cur++ = dword; +} + +static void batch_destroy(struct intel_info *info) { + if (info->batch_handle) { + drmCloseBufferHandle(info->fd, info->batch_handle); + info->batch_handle = 0; + } +} + +static int batch_init(struct intel_info *info) { + int ret; + info->size = 4096; + info->fd = intel_dgpu_fd(); + ret = batch_create(info); + return ret; +} + +static int batch_submit(struct intel_info *info, uint32_t src, uint32_t dst, + uint64_t src_offset, uint64_t dst_offset, + uint32_t in_fence_handle, uint32_t out_fence_handle) { + int ret; + struct drm_i915_gem_exec_object2 obj[3]; + struct drm_i915_gem_execbuffer2 execbuf; + struct drm_i915_gem_exec_fence __attribute__((unused)) fence_array[2] = { + { + .handle = out_fence_handle, + .flags = I915_EXEC_FENCE_SIGNAL, + }, + { + .handle = in_fence_handle, + .flags = I915_EXEC_FENCE_WAIT, + }, + }; + memset(obj, 0, sizeof(obj)); + obj[0].handle = dst; + obj[0].offset = dst_offset; + obj[0].flags = EXEC_OBJECT_PINNED | EXEC_OBJECT_WRITE; + + obj[1].handle = src; + obj[1].offset = src_offset; + obj[1].flags = EXEC_OBJECT_PINNED; + + obj[2].handle = info->batch_handle; + obj[2].offset = 0; + obj[2].flags = EXEC_OBJECT_PINNED; + + memset(&execbuf, 0, sizeof(execbuf)); + execbuf.buffers_ptr = (__u64)&obj; + execbuf.buffer_count = ARRAY_SIZE(obj); + execbuf.flags = I915_EXEC_BLT; + execbuf.flags |= I915_EXEC_NO_RELOC; + execbuf.flags |= 
I915_EXEC_FENCE_ARRAY; + execbuf.cliprects_ptr = (__u64)(fence_array); + execbuf.num_cliprects = ARRAY_SIZE(fence_array) - (in_fence_handle == 0 ? 1 : 0); + ret = ioctl(info->fd, DRM_IOCTL_I915_GEM_EXECBUFFER2_WR, &execbuf); + if (ret < 0) { + ALOGE("submit batchbuffer failure, errno:%d\n", errno); + return -1; + } + // struct drm_i915_gem_wait wait; + // memset(&wait, 0, sizeof(wait)); + // wait.bo_handle = info->batch_handle; + // wait.timeout_ns = 1000 * 1000 * 1000; + // ioctl(info->fd, DRM_IOCTL_I915_GEM_WAIT, &wait); + + batch_reset(info); + return 0; +} + +__attribute__((unused)) +static int emit_fast_blit_commands(struct intel_info *info, + uint32_t stride, uint32_t bpp, + __attribute__((unused)) uint32_t tiling, + __attribute__((unused)) uint16_t width, uint16_t height, + uint64_t src_offset, uint64_t dst_offset) { + uint32_t cmd, br13; + if (!info->init) { + ALOGE("Blitter is not initialized\n"); + return -1; + } + + cmd = XY_FAST_COPY_BLT_CMD; + br13 = 0; + uint32_t size = stride * height; + switch (bpp) { + case 1: + break; + case 2: + br13 |= (1 << 24); + break; + case 4: + br13 |= (1 << 24) | (1 << 25); + break; + default: + ALOGE("unknown bpp (%u)\n", bpp); + return -1; + } + assert (tiling == I915_TILING_NONE); + + batch_dword(info, cmd); + batch_dword(info, br13 | PAGE_SIZE); + batch_dword(info, 0); + batch_dword(info, ((size >> PAGE_SHIFT) << 16) | (PAGE_SIZE / 4)); + batch_dword(info, dst_offset); + batch_dword(info, dst_offset >> 32); + + batch_dword(info, 0); + batch_dword(info, PAGE_SIZE); + batch_dword(info, src_offset); + batch_dword(info, src_offset >> 32); + + batch_dword(info, MI_FLUSH_DW | 2); + batch_dword(info, 0); + batch_dword(info, 0); + batch_dword(info, 0); + batch_dword(info, MI_BATCH_BUFFER_END); + + return 0; +} + +__attribute__((unused)) +static int emit_src_blit_commands(struct intel_info *info, + uint32_t stride, uint32_t bpp, + uint32_t tiling, + uint16_t width, uint16_t height, + uint64_t src_offset, uint64_t dst_offset) { + uint32_t cmd, br13, pitch; + if (!info->init) { + ALOGE("Blitter is not initialized\n"); + return -1; + } + + cmd = XY_SRC_COPY_BLT_CMD; + br13 = 0xcc << 16; + pitch = stride; + switch (bpp) { + case 1: + break; + case 2: + br13 |= (1 << 24); + break; + case 4: + br13 |= (1 << 24) | (1 << 25); + cmd |= XY_SRC_COPY_BLT_WRITE_ALPHA | XY_SRC_COPY_BLT_WRITE_RGB; + break; + default: + ALOGE("unknown bpp (%u)\n", bpp); + return -1; + } + if (tiling != I915_TILING_NONE) { + pitch >>= 3; + cmd |= XY_SRC_COPY_BLT_DST_TILED; + cmd |= XY_SRC_COPY_BLT_SRC_TILED; + } + batch_dword(info, cmd); + batch_dword(info, br13 | (pitch & 0xffff)); + batch_dword(info, 0); + batch_dword(info, (height << 16) | width); + batch_dword(info, dst_offset); + batch_dword(info, dst_offset >> 32); + + batch_dword(info, 0); + batch_dword(info, (pitch & 0xffff)); + batch_dword(info, src_offset); + batch_dword(info, src_offset >> 32); + + batch_dword(info, MI_FLUSH_DW | 2); + batch_dword(info, 0); + batch_dword(info, 0); + batch_dword(info, 0); + batch_dword(info, MI_BATCH_BUFFER_END); + + return 0; +} + +static uint32_t tiling_to_xy_block_tiling(uint32_t tiling) { + switch (tiling) { + case I915_TILING_4: + return XY_TILE_4; + case I915_TILING_X: + return XY_TILE_X; + case I915_TILING_NONE: + return XY_TILE_LINEAR; + default: + ALOGE("Invalid tiling for XY_BLOCK_COPY_BLT"); + } + return XY_TILE_LINEAR; +} + +// For some reson unknown to me, BLOCK_BLIT command is much slower than +// SRC_BLIT. 
So we prefer the latter one in spite of the fact that SRC_BLIT +// will be remvoed in GPUs in future generations. +__attribute__((unused)) +static int emit_block_blit_commands(struct intel_info *info, + uint32_t stride, uint32_t bpp, + uint32_t tiling, + uint16_t width, uint16_t height, + uint64_t src_offset, uint64_t dst_offset) { + uint32_t cmd, pitch; + uint32_t color_depth; + if (!info->init) { + return -1; + } + + switch (bpp) { + case 1: + color_depth = 0b00; + break; + case 2: + color_depth = 0b01; + break; + case 4: + color_depth = 0b10; + break; + case 8: + color_depth = 0b11; + break; + default: + ALOGE("unknown bpp (%u)\n", bpp); + return -1; + } + cmd = XY_BLOCK_COPY_BLT_CMD | (color_depth << 19); + pitch = stride; + if (tiling != I915_TILING_NONE) { + pitch >>= 2; + } + batch_dword(info, cmd); + batch_dword(info, (tiling_to_xy_block_tiling(tiling) << 30) | (info->mocs.blitter_dst << 21) | (pitch & 0xffff)); + batch_dword(info, 0); // dst y1 (top) x1 (left) + batch_dword(info, (height << 16) | width); // dst y2 (bottom) x2 (right) + // 4 + batch_dword(info, dst_offset); + batch_dword(info, dst_offset >> 32); + batch_dword(info, (0x1 << 31)); // system memory + batch_dword(info, 0); // src y1 (top) x1 (left) + // 8 + batch_dword(info, (tiling_to_xy_block_tiling(tiling) << 30) | (info->mocs.blitter_src << 21) | (pitch & 0xffff)); + batch_dword(info, src_offset); + batch_dword(info, src_offset >> 32); + batch_dword(info, (0x0 << 31)); // local memory + // 12 + batch_dword(info, 0); + batch_dword(info, 0); + batch_dword(info, 0); + batch_dword(info, 0); + // 16 + batch_dword(info, (0x1 << 29) | ((width - 1) << 14) | (height - 1)); + batch_dword(info, pitch << 4); // Q Pitch can be zero? + batch_dword(info, (VALIGN_4 << 3) | (HALIGN_32)); + batch_dword(info, (0x1 << 29) | ((width - 1) << 14) | (height - 1)); + // 20 + batch_dword(info, pitch << 4); // Q Pitch can be zero? 
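+  // The remaining surface-description dwords mirror the block above; the
+  // batch then ends with MI_FLUSH_DW and MI_BATCH_BUFFER_END.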
+ batch_dword(info, (VALIGN_4 << 3) | (HALIGN_32)); + + batch_dword(info, MI_FLUSH_DW | 2); + batch_dword(info, 0); + batch_dword(info, 0); + batch_dword(info, 0); + batch_dword(info, MI_BATCH_BUFFER_END); + + return 0; +} + +int intel_blit(struct intel_info *info, uint32_t dst, uint32_t src, + uint32_t stride, uint32_t bpp, uint32_t tiling, uint16_t width, + uint16_t height, int in_fence, int *out_fence) { + uint32_t in_fence_handle = 0; + uint32_t out_fence_handle = 0; + const uint64_t kSrcOffset = 16 * 1024 * 1024; + const uint64_t kDstOffset = 256 * 1024 * 1024; + int ret; + + ret = drmSyncobjCreate(info->fd, 0, &out_fence_handle); + if (ret) { + ALOGE("failed to create sync object\n"); + goto out; + } + + if (in_fence >= 0) { + ret = drmSyncobjCreate(info->fd, 0, &in_fence_handle); + if (ret) { + ALOGE("%s:%u: failed to create syncobj\n", __func__, __LINE__); + goto out; + } + ret = drmSyncobjImportSyncFile(info->fd, in_fence_handle, in_fence); + if (ret) { + ALOGE("failed to import syncobj (fd=%d)\n", in_fence); + goto out; + } + } + + // ret = emit_src_blit_commands(info, stride, bpp, tiling, width, height, kSrcOffset, kDstOffset); + ret = emit_fast_blit_commands(info, stride, bpp, tiling, width, height, kSrcOffset, kDstOffset); + // ret = emit_block_blit_commands(info, stride, bpp, tiling, width, height, kSrcOffset, kDstOffset); + if (ret) { + ALOGE("failed to fill commands\n"); + goto out; + } + + ret = batch_submit(info, src, dst, kSrcOffset, kDstOffset, in_fence_handle, out_fence_handle); + if (ret) { + ALOGE("failed to submit batch\n"); + goto out; + } + ret = drmSyncobjExportSyncFile(info->fd, out_fence_handle, out_fence); + if (ret) { + ALOGE("failed to export syncobj (handle=%u)\n", out_fence_handle); + goto out; + } +out: + if (in_fence_handle) { + drmSyncobjDestroy(info->fd, in_fence_handle); + } + if (out_fence_handle) { + drmSyncobjDestroy(info->fd, out_fence_handle); + } + return ret; +} + +int intel_blit_destroy(struct intel_info *info) { + if (info->init) { + batch_destroy(info); + info->init = 0; + } + return 0; +} + +int intel_blit_init(struct intel_info *info) { + memset(info, 0, sizeof(*info)); + batch_init(info); + info->init = 1; + info->mocs.blitter_dst = 2 << 1; + info->mocs.blitter_src = 2 << 1; + ALOGV("gpubilit init success\n"); + return 0; +} + +#define ALIGN(value, alignment) ((value + alignment - 1) & ~(alignment - 1)) + +struct iris_memregion { + struct drm_i915_gem_memory_class_instance region; + uint64_t size; +}; + +struct i915_device { + bool initialized; + bool has_local_mem; + struct iris_memregion vram, sys; +}; + +static struct i915_device dev = { + .initialized = false, +}; + +static inline void +intel_gem_add_ext(__u64 *ptr, uint32_t ext_name, struct i915_user_extension *ext) { + __u64 *iter = ptr; + while (*iter != 0) { + iter = (__u64 *) &((struct i915_user_extension *)(uintptr_t)*iter)->next_extension; + } + ext->name = ext_name; + *iter = (uintptr_t) ext; +} + + +static void prelim_i915_bo_update_meminfo(struct i915_device *i915_dev, + const struct prelim_drm_i915_query_memory_regions *meminfo) { + i915_dev->has_local_mem = false; + for (uint32_t i = 0; i < meminfo->num_regions; i++) { + const struct prelim_drm_i915_memory_region_info *mem = &meminfo->regions[i]; + switch (mem->region.memory_class) { + case I915_MEMORY_CLASS_SYSTEM: + i915_dev->sys.region = mem->region; + i915_dev->sys.size = mem->probed_size; + break; + case I915_MEMORY_CLASS_DEVICE: + i915_dev->vram.region = mem->region; + i915_dev->vram.size = mem->probed_size; + 
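+      // A device-local region with a non-zero probed size is what
+      // intel_dgpu_fd() later uses to identify the dGPU.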
i915_dev->has_local_mem = i915_dev->vram.size > 0; + break; + default: + break; + } + } +} + +static int intel_update_meminfo(int fd) { + if (dev.initialized) { + return 0; + } + + struct prelim_drm_i915_query_memory_regions *meminfo = nullptr; + + struct drm_i915_query_item item = { + .query_id = PRELIM_DRM_I915_QUERY_MEMORY_REGIONS, + }; + + struct drm_i915_query query = { + .num_items = 1, + .items_ptr = (uintptr_t)&item, + }; + int ret = drmIoctl(fd, DRM_IOCTL_I915_QUERY, &query); + if (ret < 0) { + ALOGE("Failed to query PRELIM_DRM_I915_QUERY_MEMORY_REGIONS\n"); + return -1; + } + if (item.length <= 0) { + return -1; + } + + meminfo = static_cast(calloc(1, item.length)); + if (!meminfo) { + ALOGE("Out of memory\n"); + return -1; + } + item.data_ptr = (uintptr_t)meminfo; + ret = drmIoctl(fd, DRM_IOCTL_I915_QUERY, &query); + if (ret < 0 || item.length <= 0) { + free(meminfo); + ALOGE("%s:%d DRM_IOCTL_I915_QUERY error\n", __FUNCTION__, __LINE__); + return -1; + } + prelim_i915_bo_update_meminfo(&dev, meminfo); + dev.initialized = true; + free(meminfo); + return 0; +} + +int intel_dgpu_fd() { + static int temp, fd = -1; + char device_path[32]; + if (fd >= 0) + return fd; + for (int i = 0; i < 64; ++i) { + sprintf(device_path, "/dev/dri/renderD%u", 128 + i); + temp = open(device_path, O_RDWR | O_CLOEXEC); + if (temp < 0) { + return temp; + } + drmVersionPtr version = drmGetVersion(temp); + if (strncmp(version->name, "i915", version->name_len)) { + continue; + } + intel_update_meminfo(temp); + if (dev.has_local_mem) { + fd = temp; + break; + } + } + return fd; +} + +int intel_create_buffer(uint32_t width, uint32_t height, + __attribute__((unused)) uint32_t format, + uint64_t modifier, uint32_t *out_handle) { + assert(out_handle != nullptr); + int fd = intel_dgpu_fd(); + uint32_t total_size; + uint32_t tiling = I915_TILING_NONE; + uint32_t horizontal_alignment = 64; + uint32_t vertical_alignment = 4; + const uint32_t bpp = 4; + uint32_t aligned_height, stride = width * bpp; + + switch (modifier) { + case DRM_FORMAT_MOD_LINEAR: + tiling = I915_TILING_NONE; + break; + case I915_FORMAT_MOD_X_TILED: + tiling = I915_TILING_X; + break; + case I915_FORMAT_MOD_Y_TILED: + case I915_FORMAT_MOD_Y_TILED_CCS: + case I915_FORMAT_MOD_Yf_TILED: + case I915_FORMAT_MOD_Yf_TILED_CCS: + tiling = I915_TILING_Y; + break; + case I915_FORMAT_MOD_4_TILED: + tiling = I915_TILING_4; + break; + } + switch (tiling) { + default: + case I915_TILING_NONE: + /* + * The Intel GPU doesn't need any alignment in linear mode, + * but libva requires the allocation stride to be aligned to + * 16 bytes and height to 4 rows. Further, we round up the + * horizontal alignment so that row start on a cache line (64 + * bytes). 
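+     * For example (illustrative numbers), a 1920x1080 ARGB8888 linear
+     * buffer ends up with stride = ALIGN(1920 * 4, 64) = 7680 bytes,
+     * aligned_height = ALIGN(1080, 4) = 1080 rows, and a GEM object of
+     * ALIGN(7680 * 1080, 0x10000) = 8323072 bytes.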
+ */ + horizontal_alignment = 64; + vertical_alignment = 4; + break; + + case I915_TILING_X: + horizontal_alignment = 512; + vertical_alignment = 8; + break; + + case I915_TILING_Y: + horizontal_alignment = 128; + vertical_alignment = 32; + break; + + case I915_TILING_4: + horizontal_alignment = 128; + vertical_alignment = 32; + break; + } + aligned_height = ALIGN(height, vertical_alignment); + stride = ALIGN(stride, horizontal_alignment); + total_size = aligned_height * stride; + + struct drm_i915_gem_create_ext gem_create_ext = { + .size = ALIGN(total_size, 0x10000), + }; + struct drm_i915_gem_memory_class_instance regions[2]; + struct drm_i915_gem_create_ext_memory_regions ext_regions = { + .base = {.name = I915_GEM_CREATE_EXT_MEMORY_REGIONS}, + .num_regions = 0, + .regions = (uintptr_t)regions, + }; + regions[ext_regions.num_regions++] = dev.vram.region; + regions[ext_regions.num_regions++] = dev.sys.region; + intel_gem_add_ext(&gem_create_ext.extensions, + I915_GEM_CREATE_EXT_MEMORY_REGIONS, + &ext_regions.base); + int ret = drmIoctl(fd, DRM_IOCTL_I915_GEM_CREATE_EXT, &gem_create_ext); + if (ret) { + ALOGE("drv: DRM_IOCTL_I915_GEM_CREATE_EXT failed (size=%llu)\n", + gem_create_ext.size); + return -errno; + } + *out_handle = gem_create_ext.handle; + return 0; +} + +#define VIRTGPU_PARAM_ALLOW_P2P 12 + +bool virtio_gpu_allow_p2p(int virtgpu_fd) { + struct drm_virtgpu_getparam get_param = { 0, 0 }; + uint64_t value = 0; + get_param.param = VIRTGPU_PARAM_ALLOW_P2P; + get_param.value = (__u64) &value; + int ret = drmIoctl(virtgpu_fd, DRM_IOCTL_VIRTGPU_GETPARAM, &get_param); + if (ret || value != 1) { + return false; + } + return true; +} diff --git a/utils/intel_blit.h b/utils/intel_blit.h new file mode 100644 index 0000000..18c1169 --- /dev/null +++ b/utils/intel_blit.h @@ -0,0 +1,48 @@ +/* + * Copyright (C) 2024 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __INTEL_BLIT_H__ +#define __INTEL_BLIT_H__ + +#include +#include + +#define I915_TILING_4 9 + +struct intel_info { + int fd; + uint32_t batch_handle; + uint32_t *vaddr; + uint32_t *cur; + uint64_t size; + int init; + struct { + uint32_t blitter_src; + uint32_t blitter_dst; + } mocs; +}; + +int intel_blit_destroy(struct intel_info *info); +int intel_blit_init(struct intel_info *info); +int intel_blit(struct intel_info *info, uint32_t dst, uint32_t src, + uint32_t stride, uint32_t bpp, uint32_t tiling, uint16_t width, + uint16_t height, int in_fence, int *out_fence); +int intel_create_buffer(uint32_t width, uint32_t height, uint32_t format, + uint64_t modifier, uint32_t *out_handle); +int intel_dgpu_fd(); +bool virtio_gpu_allow_p2p(int virtgpu_fd); + +#endif // __INTEL_BLIT_H__