From e5ba1c8c0b0d6a5cc482b54a1cfdaf67a3341254 Mon Sep 17 00:00:00 2001 From: Weifeng Liu Date: Thu, 29 Aug 2024 02:55:44 +0000 Subject: [PATCH] Use shadow buffers for dGPU VF + iGPU (backing virtio-GPU) output To get best performance we must guarantee that scan-out buffers used for composition in surfaceflinger reside in GPU local memory, but importing these buffers into virtio-GPU will migrate the buffers from local memory to system memory, which will highly impact the performance. To avoid migration of these client-composited buffers, allocate a shadow buffer for each of them and import the shadow buffers into virtio-GPU for scanning-out. Right before atomic commit, leverage GPU blit engine to copy content to shadow buffer. Use shadow buffers only when feature ALLOW_P2P of virtio-GPU is not present and dGPU exists. There are several GPU instructions to blit memory: - XY_FAST_COPY_BLT (BSpec: 47982), - XY_SRC_COPY_BLT (BSpec: 48002), - XY_BLOCK_COPY_BLT (BSpec: 3678). By experiment, XY_FAST_COPY is much faster than the other two instructions. Tracked-On: OAM-124182 Signed-off-by: Weifeng Liu --- Android.bp | 10 +- bufferinfo/BufferInfo.h | 11 + bufferinfo/BufferInfoMapperMetadata.cpp | 10 + compositor/LayerData.h | 2 + drm/DrmAtomicStateManager.cpp | 21 +- drm/DrmFbImporter.cpp | 17 +- drm/DrmPlane.cpp | 5 +- hwc2_device/HwcLayer.cpp | 27 +- utils/i915_prelim.h | 118 +++++ utils/intel_blit.cpp | 659 ++++++++++++++++++++++++ utils/intel_blit.h | 48 ++ 11 files changed, 915 insertions(+), 13 deletions(-) create mode 100644 utils/i915_prelim.h create mode 100644 utils/intel_blit.cpp create mode 100644 utils/intel_blit.h diff --git a/Android.bp b/Android.bp index cb3ac78..8d6f251 100644 --- a/Android.bp +++ b/Android.bp @@ -18,10 +18,18 @@ cc_library_static { name: "libdrmhwc_utils", - srcs: ["utils/Worker.cpp"], + srcs: [ + "utils/Worker.cpp", + "utils/intel_blit.cpp" + ], include_dirs: ["vendor/intel/external/drm-hwcomposer"], + shared_libs: [ + "libdrm", + "libutils", + ], + cflags: [ "-Wall", "-Werror", diff --git a/bufferinfo/BufferInfo.h b/bufferinfo/BufferInfo.h index d5a8bcb..339f349 100644 --- a/bufferinfo/BufferInfo.h +++ b/bufferinfo/BufferInfo.h @@ -17,6 +17,7 @@ #pragma once #include +#include "utils/intel_blit.h" constexpr int kBufferMaxPlanes = 4; @@ -50,6 +51,16 @@ struct BufferInfo { /* sizes[] is used only by mapper@4 metadata getter for internal purposes */ uint32_t sizes[kBufferMaxPlanes]; int prime_fds[kBufferMaxPlanes]; + uint32_t prime_buffer_handles[kBufferMaxPlanes]; + bool use_shadow_fds; + struct intel_info info; + /* + * Shadow buffers in system memory. We will blit content of prime_fds to + * shadow_fds right before atomic commit and use the shadow buffers as frame + * buffers. 
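+ * Only these shadow buffers are imported into virtio-GPU for scan-out, so
+ * the client-composited buffers themselves stay in dGPU local memory and
+ * are never migrated.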
+ **/ + int shadow_fds[kBufferMaxPlanes]; + uint32_t shadow_buffer_handles[kBufferMaxPlanes]; uint64_t modifiers[kBufferMaxPlanes]; BufferColorSpace color_space; diff --git a/bufferinfo/BufferInfoMapperMetadata.cpp b/bufferinfo/BufferInfoMapperMetadata.cpp index 72b20f3..4895edf 100644 --- a/bufferinfo/BufferInfoMapperMetadata.cpp +++ b/bufferinfo/BufferInfoMapperMetadata.cpp @@ -28,6 +28,7 @@ #include #include "utils/log.h" +#include "utils/intel_blit.h" namespace android { @@ -81,6 +82,15 @@ BufferInfoMapperMetadata::GetFds(buffer_handle_t handle, BufferInfo *bo) { ALOGE("Invalid prime fd"); return android::BAD_VALUE; } + + int dgpu_fd = intel_dgpu_fd(); + if (dgpu_fd >= 0) { + int ret = drmPrimeFDToHandle(dgpu_fd, bo->prime_fds[i], &bo->prime_buffer_handles[i]); + if (ret) { + ALOGE("Cannot convert prime fd to handle\n"); + return android::BAD_VALUE; + } + } } return 0; diff --git a/compositor/LayerData.h b/compositor/LayerData.h index d04514d..62cde48 100644 --- a/compositor/LayerData.h +++ b/compositor/LayerData.h @@ -69,6 +69,7 @@ struct LayerData { clonned.fb = fb; clonned.pi = pi; clonned.acquire_fence = std::move(acquire_fence); + clonned.blit_fence = std::move(blit_fence); return clonned; } @@ -76,6 +77,7 @@ struct LayerData { std::shared_ptr fb; PresentInfo pi; UniqueFd acquire_fence; + UniqueFd blit_fence; }; } // namespace android diff --git a/drm/DrmAtomicStateManager.cpp b/drm/DrmAtomicStateManager.cpp index 7e7870c..f143ca1 100644 --- a/drm/DrmAtomicStateManager.cpp +++ b/drm/DrmAtomicStateManager.cpp @@ -14,6 +14,8 @@ * limitations under the License. */ +#include +#include #undef NDEBUG /* Required for assert to work */ #define ATRACE_TAG ATRACE_TAG_GRAPHICS @@ -26,6 +28,7 @@ #include #include #include +#include "utils/intel_blit.h" #include #include @@ -105,7 +108,6 @@ auto DrmAtomicStateManager::CommitFrame(AtomicCommitArgs &args) -> int { auto unused_planes = new_frame_state.used_planes; bool has_hdr_layer = false; - if (args.composition) { new_frame_state.used_planes.clear(); @@ -113,6 +115,23 @@ auto DrmAtomicStateManager::CommitFrame(AtomicCommitArgs &args) -> int { DrmPlane *plane = joining.plane->Get(); LayerData &layer = joining.layer; + if (layer.bi->use_shadow_fds) { + int ret = 0; + int out_handle; + // Use any tiling mode other than linear suffers from corrupted images. + uint32_t tiling = I915_TILING_NONE; + // TODO: handle multi-plane buffer + ret = intel_blit(&layer.bi->info, layer.bi->shadow_buffer_handles[0], + layer.bi->prime_buffer_handles[0], + layer.bi->pitches[0], 4, tiling, + layer.bi->width, layer.bi->height, + layer.acquire_fence.Get(), &out_handle); + if (ret) { + ALOGE("failed to blit scan-out buffer\n"); + } + layer.blit_fence = android::UniqueFd(out_handle); + } + if (layer.bi->color_space >= BufferColorSpace::kItuRec2020) { has_hdr_layer = true; } diff --git a/drm/DrmFbImporter.cpp b/drm/DrmFbImporter.cpp index 6189bd6..00f98fb 100644 --- a/drm/DrmFbImporter.cpp +++ b/drm/DrmFbImporter.cpp @@ -45,15 +45,15 @@ auto DrmFbIdHandle::CreateInstance(BufferInfo *bo, GemHandle first_gem_handle, local->gem_handles_[0] = first_gem_handle; int32_t err = 0; + int *fds = bo->use_shadow_fds ? 
bo->shadow_fds : bo->prime_fds; /* Framebuffer object creation require gem handle for every used plane */ for (size_t i = 1; i < local->gem_handles_.size(); i++) { - if (bo->prime_fds[i] > 0) { - if (bo->prime_fds[i] != bo->prime_fds[0]) { - err = drmPrimeFDToHandle(drm.GetFd(), bo->prime_fds[i], + if (fds[i] > 0) { + if (fds[i] != fds[0]) { + err = drmPrimeFDToHandle(drm.GetFd(), fds[i], &local->gem_handles_.at(i)); if (err != 0) { - ALOGE("failed to import prime fd %d errno=%d", bo->prime_fds[i], - errno); + ALOGE("failed to import prime fd %d errno=%d", fds[i], errno); } } else { local->gem_handles_.at(i) = local->gem_handles_[0]; @@ -129,11 +129,12 @@ auto DrmFbImporter::GetOrCreateFbId(BufferInfo *bo) -> std::shared_ptr { /* Lookup DrmFbIdHandle in cache first. First handle serves as a cache key. */ GemHandle first_handle = 0; - int32_t err = drmPrimeFDToHandle(drm_->GetFd(), bo->prime_fds[0], - &first_handle); + int *fds = bo->use_shadow_fds ? bo->shadow_fds : bo->prime_fds; + + int32_t err = drmPrimeFDToHandle(drm_->GetFd(), fds[0], &first_handle); if (err != 0) { - ALOGE("Failed to import prime fd %d ret=%d", bo->prime_fds[0], err); + ALOGE("Failed to import prime fd %d ret=%d", fds[0], err); return {}; } diff --git a/drm/DrmPlane.cpp b/drm/DrmPlane.cpp index 1fb54e7..81b5fcb 100644 --- a/drm/DrmPlane.cpp +++ b/drm/DrmPlane.cpp @@ -240,8 +240,9 @@ auto DrmPlane::AtomicSetState(drmModeAtomicReq &pset, LayerData &layer, } } - if (layer.acquire_fence && - !in_fence_fd_property_.AtomicSet(pset, layer.acquire_fence.Get())) { + int fence = layer.bi->use_shadow_fds ? layer.blit_fence.Get() : layer.acquire_fence.Get(); + if (fence > 0 && + !in_fence_fd_property_.AtomicSet(pset, fence)) { return -EINVAL; } diff --git a/hwc2_device/HwcLayer.cpp b/hwc2_device/HwcLayer.cpp index 453af60..69bfae9 100644 --- a/hwc2_device/HwcLayer.cpp +++ b/hwc2_device/HwcLayer.cpp @@ -14,6 +14,7 @@ * limitations under the License. 
*/ +#include #define LOG_TAG "hwc-layer" #include "HwcLayer.h" @@ -21,6 +22,7 @@ #include "HwcDisplay.h" #include "bufferinfo/BufferInfoGetter.h" #include "utils/log.h" +#include "utils/intel_blit.h" namespace android { @@ -252,6 +254,29 @@ void HwcLayer::ImportFb() { return; } + int kms_fd = parent_->GetPipe().device->GetFd(); + layer_data_.bi->use_shadow_fds = (intel_dgpu_fd() >= 0) && !virtio_gpu_allow_p2p(kms_fd); + if (layer_data_.bi->use_shadow_fds) { + uint32_t handle; + int ret = intel_create_buffer(layer_data_.bi->width, layer_data_.bi->height, + layer_data_.bi->format, layer_data_.bi->modifiers[0], + &handle); + ALOGI("create shadow buffer, modifier=0x%lx\n", (unsigned long) layer_data_.bi->modifiers[0]); + if (ret) { + ALOGE("Failed to create shadow buffer\n"); + layer_data_.bi->use_shadow_fds = false; + } else { + layer_data_.bi->shadow_buffer_handles[0] = handle; + ret = drmPrimeHandleToFD(intel_dgpu_fd(), handle, 0, &layer_data_.bi->shadow_fds[0]); + if (ret) { + ALOGE("Failed to export shadow buffer\n"); + layer_data_.bi->use_shadow_fds = false; + drmCloseBufferHandle(intel_dgpu_fd(), handle); + } + intel_blit_init(&layer_data_.bi->info); + } + } + layer_data_ .fb = parent_->GetPipe().device->GetDrmFbImporter().GetOrCreateFbId( &layer_data_.bi.value()); @@ -357,4 +382,4 @@ void HwcLayer::SwChainClearCache() { swchain_reassembled_ = false; } -} // namespace android \ No newline at end of file +} // namespace android diff --git a/utils/i915_prelim.h b/utils/i915_prelim.h new file mode 100644 index 0000000..7a94358 --- /dev/null +++ b/utils/i915_prelim.h @@ -0,0 +1,118 @@ +/* + * Copyright 2017 The Chromium OS Authors. All rights reserved. + * Use of this source code is governed by a BSD-style license that can be + * found in the LICENSE file. + */ +#ifndef I915_PRELIM +#define I915_PRELIM + +#include + +#define PRELIM_DRM_I915_QUERY (1 << 16) +#define PRELIM_DRM_I915_QUERY_MEMORY_REGIONS (PRELIM_DRM_I915_QUERY | 4) +#define PRELIM_I915_OBJECT_PARAM (1ull << 48) +#define PRELIM_I915_PARAM_MEMORY_REGIONS ((1 << 16) | 0x1) +#define PRELIM_I915_USER_EXT (1 << 16) +#define PRELIM_I915_GEM_CREATE_EXT_SETPARAM (PRELIM_I915_USER_EXT | 1) +#define PRELIM_DRM_IOCTL_I915_GEM_CREATE_EXT DRM_IOWR(DRM_COMMAND_BASE + DRM_I915_GEM_CREATE, struct prelim_drm_i915_gem_create_ext) + +#define prelim_drm_i915_gem_memory_class_instance drm_i915_gem_memory_class_instance +struct prelim_drm_i915_gem_object_param { + /* Object handle (0 for I915_GEM_CREATE_EXT_SETPARAM) */ + __u32 handle; + + /* Data pointer size */ + __u32 size; + +/* + * PRELIM_I915_OBJECT_PARAM: + * + * Select object namespace for the param. + */ +#define PRELIM_I915_OBJECT_PARAM (1ull << 48) + +/* + * PRELIM_I915_PARAM_MEMORY_REGIONS: + * + * Set the data pointer with the desired set of placements in priority + * order(each entry must be unique and supported by the device), as an array of + * prelim_drm_i915_gem_memory_class_instance, or an equivalent layout of class:instance + * pair encodings. See PRELIM_DRM_I915_QUERY_MEMORY_REGIONS for how to query the + * supported regions. 
+ * + * Note that this requires the PRELIM_I915_OBJECT_PARAM namespace: + * .param = PRELIM_I915_OBJECT_PARAM | PRELIM_I915_PARAM_MEMORY_REGIONS + */ +#define PRELIM_I915_PARAM_MEMORY_REGIONS ((1 << 16) | 0x1) + __u64 param; + + /* Data value or pointer */ + __u64 data; +}; + +struct prelim_drm_i915_gem_create_ext_setparam { + struct i915_user_extension base; + struct prelim_drm_i915_gem_object_param param; +}; + +/** + * struct prelim_drm_i915_memory_region_info + * + * Describes one region as known to the driver. + */ +struct prelim_drm_i915_memory_region_info { + /** class:instance pair encoding */ + struct drm_i915_gem_memory_class_instance region; + + /** MBZ */ + __u32 rsvd0; + + /** MBZ */ + __u64 caps; + + /** MBZ */ + __u64 flags; + + /** Memory probed by the driver (-1 = unknown) */ + __u64 probed_size; + + /** Estimate of memory remaining (-1 = unknown) */ + __u64 unallocated_size; + + /** MBZ */ + __u64 rsvd1[8]; +}; + +struct prelim_drm_i915_query_memory_regions { + /** @num_regions: Number of supported regions */ + __u32 num_regions; + + /** @rsvd: MBZ */ + __u32 rsvd[3]; + + /** @regions: Info about each supported region */ + struct prelim_drm_i915_memory_region_info regions[]; +}; + + +struct prelim_drm_i915_gem_create_ext { + + /** + * Requested size for the object. + * + * The (page-aligned) allocated size for the object will be returned. + */ + __u64 size; + /** + * Returned handle for the object. + * + * Object handles are nonzero. + */ + __u32 handle; + __u32 pad; +#define PRELIM_I915_GEM_CREATE_EXT_SETPARAM (PRELIM_I915_USER_EXT | 1) +#define PRELIM_I915_GEM_CREATE_EXT_FLAGS_UNKNOWN \ + (~PRELIM_I915_GEM_CREATE_EXT_SETPARAM) + __u64 extensions; +}; +#endif diff --git a/utils/intel_blit.cpp b/utils/intel_blit.cpp new file mode 100644 index 0000000..f595623 --- /dev/null +++ b/utils/intel_blit.cpp @@ -0,0 +1,659 @@ +/* + * Copyright (C) 2024 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "intel_blit.h" + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include "i915_prelim.h" + +#include + +#define ARRAY_SIZE(a) (sizeof(a) / sizeof(a[0])) + +#define MI_NOOP (0) +#define MI_FLUSH_DW (0x26 << 23) +#define MI_BATCH_BUFFER_END (0x0a << 23) + +#define XY_SRC_COPY_BLT_CMD ((0x2 << 29) | (0x53 << 22) | 8) // gen >> 8 +#define XY_SRC_COPY_BLT_WRITE_ALPHA (1 << 21) +#define XY_SRC_COPY_BLT_WRITE_RGB (1 << 20) +#define XY_SRC_COPY_BLT_SRC_TILED (1 << 15) +#define XY_SRC_COPY_BLT_DST_TILED (1 << 11) + +#define XY_TILE_LINEAR 0 +#define XY_TILE_X 1 +#define XY_TILE_4 2 +#define XY_TILE_64 3 + +// GEN 125 +#define HALIGN_16 0 +#define HALIGN_32 1 +#define HALIGN_64 2 +#define HALIGN_128 3 +#define VALIGN_4 1 +#define VALIGN_8 2 +#define VALIGN_16 3 + +#define PAGE_SHIFT 12 + +// BSpec: 21523 +#define XY_BLOCK_COPY_BLT_CMD ((0x2 << 29) | (0x41 << 22) | (0x14)) + +// BSpec: 47982 +#define XY_FAST_COPY_BLT_CMD ((0x2 << 29) | (0x42 << 22) | (0x8)) + +static void batch_reset(struct intel_info *info) { + info->cur = info->vaddr; +} + +static int batch_create(struct intel_info *info) { + struct drm_i915_gem_create create; + struct drm_i915_gem_mmap mmap_arg; + int ret = 0; + memset(&create, 0, sizeof(create)); + memset(&mmap_arg, 0, sizeof(mmap_arg)); + create.size = info->size; + ret = ioctl(info->fd, DRM_IOCTL_I915_GEM_CREATE, &create); + if (ret < 0) { + ALOGE("failed to create buffer\n"); + info->batch_handle = 0; + return ret; + } + info->batch_handle = create.handle; + mmap_arg.handle = create.handle; + mmap_arg.size = info->size; + ret = ioctl(info->fd, DRM_IOCTL_I915_GEM_MMAP, &mmap_arg); + if (ret < 0) { + drmCloseBufferHandle(info->fd, info->batch_handle); + info->batch_handle = 0; + ALOGE("buffer map failure\n"); + return ret; + } + info->vaddr = (uint32_t *)mmap_arg.addr_ptr; + batch_reset(info); + return ret; +} + +__attribute__((unused)) +static int batch_count(struct intel_info *info) { + return info->cur - info->vaddr; +} + +static void batch_dword(struct intel_info *info, uint32_t dword) { + *info->cur++ = dword; +} + +static void batch_destroy(struct intel_info *info) { + if (info->batch_handle) { + drmCloseBufferHandle(info->fd, info->batch_handle); + info->batch_handle = 0; + } +} + +static int batch_init(struct intel_info *info) { + int ret; + info->size = 4096; + info->fd = intel_dgpu_fd(); + ret = batch_create(info); + return ret; +} + +static int batch_submit(struct intel_info *info, uint32_t src, uint32_t dst, + uint64_t src_offset, uint64_t dst_offset, + uint32_t in_fence_handle, uint32_t out_fence_handle) { + int ret; + struct drm_i915_gem_exec_object2 obj[3]; + struct drm_i915_gem_execbuffer2 execbuf; + struct drm_i915_gem_exec_fence __attribute__((unused)) fence_array[2] = { + { + .handle = out_fence_handle, + .flags = I915_EXEC_FENCE_SIGNAL, + }, + { + .handle = in_fence_handle, + .flags = I915_EXEC_FENCE_WAIT, + }, + }; + memset(obj, 0, sizeof(obj)); + obj[0].handle = dst; + obj[0].offset = dst_offset; + obj[0].flags = EXEC_OBJECT_PINNED | EXEC_OBJECT_WRITE; + + obj[1].handle = src; + obj[1].offset = src_offset; + obj[1].flags = EXEC_OBJECT_PINNED; + + obj[2].handle = info->batch_handle; + obj[2].offset = 0; + obj[2].flags = EXEC_OBJECT_PINNED; + + memset(&execbuf, 0, sizeof(execbuf)); + execbuf.buffers_ptr = (__u64)&obj; + execbuf.buffer_count = ARRAY_SIZE(obj); + execbuf.flags = I915_EXEC_BLT; + execbuf.flags |= I915_EXEC_NO_RELOC; + execbuf.flags |= 
I915_EXEC_FENCE_ARRAY; + execbuf.cliprects_ptr = (__u64)(fence_array); + execbuf.num_cliprects = ARRAY_SIZE(fence_array) - (in_fence_handle == 0 ? 1 : 0); + ret = ioctl(info->fd, DRM_IOCTL_I915_GEM_EXECBUFFER2_WR, &execbuf); + if (ret < 0) { + ALOGE("submit batchbuffer failure, errno:%d\n", errno); + return -1; + } + // struct drm_i915_gem_wait wait; + // memset(&wait, 0, sizeof(wait)); + // wait.bo_handle = info->batch_handle; + // wait.timeout_ns = 1000 * 1000 * 1000; + // ioctl(info->fd, DRM_IOCTL_I915_GEM_WAIT, &wait); + + batch_reset(info); + return 0; +} + +__attribute__((unused)) +static int emit_fast_blit_commands(struct intel_info *info, + uint32_t stride, uint32_t bpp, + __attribute__((unused)) uint32_t tiling, + __attribute__((unused)) uint16_t width, uint16_t height, + uint64_t src_offset, uint64_t dst_offset) { + uint32_t cmd, br13; + if (!info->init) { + ALOGE("Blitter is not initialized\n"); + return -1; + } + + cmd = XY_FAST_COPY_BLT_CMD; + br13 = 0; + uint32_t size = stride * height; + switch (bpp) { + case 1: + break; + case 2: + br13 |= (1 << 24); + break; + case 4: + br13 |= (1 << 24) | (1 << 25); + break; + default: + ALOGE("unknown bpp (%u)\n", bpp); + return -1; + } + assert (tiling == I915_TILING_NONE); + + batch_dword(info, cmd); + batch_dword(info, br13 | PAGE_SIZE); + batch_dword(info, 0); + batch_dword(info, ((size >> PAGE_SHIFT) << 16) | (PAGE_SIZE / 4)); + batch_dword(info, dst_offset); + batch_dword(info, dst_offset >> 32); + + batch_dword(info, 0); + batch_dword(info, PAGE_SIZE); + batch_dword(info, src_offset); + batch_dword(info, src_offset >> 32); + + batch_dword(info, MI_FLUSH_DW | 2); + batch_dword(info, 0); + batch_dword(info, 0); + batch_dword(info, 0); + batch_dword(info, MI_BATCH_BUFFER_END); + + return 0; +} + +__attribute__((unused)) +static int emit_src_blit_commands(struct intel_info *info, + uint32_t stride, uint32_t bpp, + uint32_t tiling, + uint16_t width, uint16_t height, + uint64_t src_offset, uint64_t dst_offset) { + uint32_t cmd, br13, pitch; + if (!info->init) { + ALOGE("Blitter is not initialized\n"); + return -1; + } + + cmd = XY_SRC_COPY_BLT_CMD; + br13 = 0xcc << 16; + pitch = stride; + switch (bpp) { + case 1: + break; + case 2: + br13 |= (1 << 24); + break; + case 4: + br13 |= (1 << 24) | (1 << 25); + cmd |= XY_SRC_COPY_BLT_WRITE_ALPHA | XY_SRC_COPY_BLT_WRITE_RGB; + break; + default: + ALOGE("unknown bpp (%u)\n", bpp); + return -1; + } + if (tiling != I915_TILING_NONE) { + pitch >>= 3; + cmd |= XY_SRC_COPY_BLT_DST_TILED; + cmd |= XY_SRC_COPY_BLT_SRC_TILED; + } + batch_dword(info, cmd); + batch_dword(info, br13 | (pitch & 0xffff)); + batch_dword(info, 0); + batch_dword(info, (height << 16) | width); + batch_dword(info, dst_offset); + batch_dword(info, dst_offset >> 32); + + batch_dword(info, 0); + batch_dword(info, (pitch & 0xffff)); + batch_dword(info, src_offset); + batch_dword(info, src_offset >> 32); + + batch_dword(info, MI_FLUSH_DW | 2); + batch_dword(info, 0); + batch_dword(info, 0); + batch_dword(info, 0); + batch_dword(info, MI_BATCH_BUFFER_END); + + return 0; +} + +static uint32_t tiling_to_xy_block_tiling(uint32_t tiling) { + switch (tiling) { + case I915_TILING_4: + return XY_TILE_4; + case I915_TILING_X: + return XY_TILE_X; + case I915_TILING_NONE: + return XY_TILE_LINEAR; + default: + ALOGE("Invalid tiling for XY_BLOCK_COPY_BLT"); + } + return XY_TILE_LINEAR; +} + +// For some reson unknown to me, BLOCK_BLIT command is much slower than +// SRC_BLIT. 
So we prefer the latter one in spite of the fact that SRC_BLIT +// will be remvoed in GPUs in future generations. +__attribute__((unused)) +static int emit_block_blit_commands(struct intel_info *info, + uint32_t stride, uint32_t bpp, + uint32_t tiling, + uint16_t width, uint16_t height, + uint64_t src_offset, uint64_t dst_offset) { + uint32_t cmd, pitch; + uint32_t color_depth; + if (!info->init) { + return -1; + } + + switch (bpp) { + case 1: + color_depth = 0b00; + break; + case 2: + color_depth = 0b01; + break; + case 4: + color_depth = 0b10; + break; + case 8: + color_depth = 0b11; + break; + default: + ALOGE("unknown bpp (%u)\n", bpp); + return -1; + } + cmd = XY_BLOCK_COPY_BLT_CMD | (color_depth << 19); + pitch = stride; + if (tiling != I915_TILING_NONE) { + pitch >>= 2; + } + batch_dword(info, cmd); + batch_dword(info, (tiling_to_xy_block_tiling(tiling) << 30) | (info->mocs.blitter_dst << 21) | (pitch & 0xffff)); + batch_dword(info, 0); // dst y1 (top) x1 (left) + batch_dword(info, (height << 16) | width); // dst y2 (bottom) x2 (right) + // 4 + batch_dword(info, dst_offset); + batch_dword(info, dst_offset >> 32); + batch_dword(info, (0x1 << 31)); // system memory + batch_dword(info, 0); // src y1 (top) x1 (left) + // 8 + batch_dword(info, (tiling_to_xy_block_tiling(tiling) << 30) | (info->mocs.blitter_src << 21) | (pitch & 0xffff)); + batch_dword(info, src_offset); + batch_dword(info, src_offset >> 32); + batch_dword(info, (0x0 << 31)); // local memory + // 12 + batch_dword(info, 0); + batch_dword(info, 0); + batch_dword(info, 0); + batch_dword(info, 0); + // 16 + batch_dword(info, (0x1 << 29) | ((width - 1) << 14) | (height - 1)); + batch_dword(info, pitch << 4); // Q Pitch can be zero? + batch_dword(info, (VALIGN_4 << 3) | (HALIGN_32)); + batch_dword(info, (0x1 << 29) | ((width - 1) << 14) | (height - 1)); + // 20 + batch_dword(info, pitch << 4); // Q Pitch can be zero? 
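+  // The remaining surface-description dwords mirror the block above; the
+  // batch then ends with MI_FLUSH_DW and MI_BATCH_BUFFER_END.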
+ batch_dword(info, (VALIGN_4 << 3) | (HALIGN_32)); + + batch_dword(info, MI_FLUSH_DW | 2); + batch_dword(info, 0); + batch_dword(info, 0); + batch_dword(info, 0); + batch_dword(info, MI_BATCH_BUFFER_END); + + return 0; +} + +int intel_blit(struct intel_info *info, uint32_t dst, uint32_t src, + uint32_t stride, uint32_t bpp, uint32_t tiling, uint16_t width, + uint16_t height, int in_fence, int *out_fence) { + uint32_t in_fence_handle = 0; + uint32_t out_fence_handle = 0; + const uint64_t kSrcOffset = 16 * 1024 * 1024; + const uint64_t kDstOffset = 256 * 1024 * 1024; + int ret; + + ret = drmSyncobjCreate(info->fd, 0, &out_fence_handle); + if (ret) { + ALOGE("failed to create sync object\n"); + goto out; + } + + if (in_fence >= 0) { + ret = drmSyncobjCreate(info->fd, 0, &in_fence_handle); + if (ret) { + ALOGE("%s:%u: failed to create syncobj\n", __func__, __LINE__); + goto out; + } + ret = drmSyncobjImportSyncFile(info->fd, in_fence_handle, in_fence); + if (ret) { + ALOGE("failed to import syncobj (fd=%d)\n", in_fence); + goto out; + } + } + + // ret = emit_src_blit_commands(info, stride, bpp, tiling, width, height, kSrcOffset, kDstOffset); + ret = emit_fast_blit_commands(info, stride, bpp, tiling, width, height, kSrcOffset, kDstOffset); + // ret = emit_block_blit_commands(info, stride, bpp, tiling, width, height, kSrcOffset, kDstOffset); + if (ret) { + ALOGE("failed to fill commands\n"); + goto out; + } + + ret = batch_submit(info, src, dst, kSrcOffset, kDstOffset, in_fence_handle, out_fence_handle); + if (ret) { + ALOGE("failed to submit batch\n"); + goto out; + } + ret = drmSyncobjExportSyncFile(info->fd, out_fence_handle, out_fence); + if (ret) { + ALOGE("failed to export syncobj (handle=%u)\n", out_fence_handle); + goto out; + } +out: + if (in_fence_handle) { + drmSyncobjDestroy(info->fd, in_fence_handle); + } + if (out_fence_handle) { + drmSyncobjDestroy(info->fd, out_fence_handle); + } + return ret; +} + +int intel_blit_destroy(struct intel_info *info) { + if (info->init) { + batch_destroy(info); + info->init = 0; + } + return 0; +} + +int intel_blit_init(struct intel_info *info) { + memset(info, 0, sizeof(*info)); + batch_init(info); + info->init = 1; + info->mocs.blitter_dst = 2 << 1; + info->mocs.blitter_src = 2 << 1; + ALOGV("gpubilit init success\n"); + return 0; +} + +#define ALIGN(value, alignment) ((value + alignment - 1) & ~(alignment - 1)) + +struct iris_memregion { + struct drm_i915_gem_memory_class_instance region; + uint64_t size; +}; + +struct i915_device { + bool initialized; + bool has_local_mem; + struct iris_memregion vram, sys; +}; + +static struct i915_device dev = { + .initialized = false, +}; + +static inline void +intel_gem_add_ext(__u64 *ptr, uint32_t ext_name, struct i915_user_extension *ext) { + __u64 *iter = ptr; + while (*iter != 0) { + iter = (__u64 *) &((struct i915_user_extension *)(uintptr_t)*iter)->next_extension; + } + ext->name = ext_name; + *iter = (uintptr_t) ext; +} + + +static void prelim_i915_bo_update_meminfo(struct i915_device *i915_dev, + const struct prelim_drm_i915_query_memory_regions *meminfo) { + i915_dev->has_local_mem = false; + for (uint32_t i = 0; i < meminfo->num_regions; i++) { + const struct prelim_drm_i915_memory_region_info *mem = &meminfo->regions[i]; + switch (mem->region.memory_class) { + case I915_MEMORY_CLASS_SYSTEM: + i915_dev->sys.region = mem->region; + i915_dev->sys.size = mem->probed_size; + break; + case I915_MEMORY_CLASS_DEVICE: + i915_dev->vram.region = mem->region; + i915_dev->vram.size = mem->probed_size; + 
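+      // A device-local region with a non-zero probed size is what
+      // intel_dgpu_fd() later uses to identify the dGPU.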
i915_dev->has_local_mem = i915_dev->vram.size > 0; + break; + default: + break; + } + } +} + +static int intel_update_meminfo(int fd) { + if (dev.initialized) { + return 0; + } + + struct prelim_drm_i915_query_memory_regions *meminfo = nullptr; + + struct drm_i915_query_item item = { + .query_id = PRELIM_DRM_I915_QUERY_MEMORY_REGIONS, + }; + + struct drm_i915_query query = { + .num_items = 1, + .items_ptr = (uintptr_t)&item, + }; + int ret = drmIoctl(fd, DRM_IOCTL_I915_QUERY, &query); + if (ret < 0) { + ALOGE("Failed to query PRELIM_DRM_I915_QUERY_MEMORY_REGIONS\n"); + return -1; + } + if (item.length <= 0) { + return -1; + } + + meminfo = static_cast(calloc(1, item.length)); + if (!meminfo) { + ALOGE("Out of memory\n"); + return -1; + } + item.data_ptr = (uintptr_t)meminfo; + ret = drmIoctl(fd, DRM_IOCTL_I915_QUERY, &query); + if (ret < 0 || item.length <= 0) { + free(meminfo); + ALOGE("%s:%d DRM_IOCTL_I915_QUERY error\n", __FUNCTION__, __LINE__); + return -1; + } + prelim_i915_bo_update_meminfo(&dev, meminfo); + dev.initialized = true; + free(meminfo); + return 0; +} + +int intel_dgpu_fd() { + static int temp, fd = -1; + char device_path[32]; + if (fd >= 0) + return fd; + for (int i = 0; i < 64; ++i) { + sprintf(device_path, "/dev/dri/renderD%u", 128 + i); + temp = open(device_path, O_RDWR | O_CLOEXEC); + if (temp < 0) { + return temp; + } + drmVersionPtr version = drmGetVersion(temp); + if (strncmp(version->name, "i915", version->name_len)) { + continue; + } + intel_update_meminfo(temp); + if (dev.has_local_mem) { + fd = temp; + break; + } + } + return fd; +} + +int intel_create_buffer(uint32_t width, uint32_t height, + __attribute__((unused)) uint32_t format, + uint64_t modifier, uint32_t *out_handle) { + assert(out_handle != nullptr); + int fd = intel_dgpu_fd(); + uint32_t total_size; + uint32_t tiling = I915_TILING_NONE; + uint32_t horizontal_alignment = 64; + uint32_t vertical_alignment = 4; + const uint32_t bpp = 4; + uint32_t aligned_height, stride = width * bpp; + + switch (modifier) { + case DRM_FORMAT_MOD_LINEAR: + tiling = I915_TILING_NONE; + break; + case I915_FORMAT_MOD_X_TILED: + tiling = I915_TILING_X; + break; + case I915_FORMAT_MOD_Y_TILED: + case I915_FORMAT_MOD_Y_TILED_CCS: + case I915_FORMAT_MOD_Yf_TILED: + case I915_FORMAT_MOD_Yf_TILED_CCS: + tiling = I915_TILING_Y; + break; + case I915_FORMAT_MOD_4_TILED: + tiling = I915_TILING_4; + break; + } + switch (tiling) { + default: + case I915_TILING_NONE: + /* + * The Intel GPU doesn't need any alignment in linear mode, + * but libva requires the allocation stride to be aligned to + * 16 bytes and height to 4 rows. Further, we round up the + * horizontal alignment so that row start on a cache line (64 + * bytes). 
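+     * For example (illustrative numbers), a 1920x1080 ARGB8888 linear
+     * buffer ends up with stride = ALIGN(1920 * 4, 64) = 7680 bytes,
+     * aligned_height = ALIGN(1080, 4) = 1080 rows, and a GEM object of
+     * ALIGN(7680 * 1080, 0x10000) = 8323072 bytes.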
+ */ + horizontal_alignment = 64; + vertical_alignment = 4; + break; + + case I915_TILING_X: + horizontal_alignment = 512; + vertical_alignment = 8; + break; + + case I915_TILING_Y: + horizontal_alignment = 128; + vertical_alignment = 32; + break; + + case I915_TILING_4: + horizontal_alignment = 128; + vertical_alignment = 32; + break; + } + aligned_height = ALIGN(height, vertical_alignment); + stride = ALIGN(stride, horizontal_alignment); + total_size = aligned_height * stride; + + struct drm_i915_gem_create_ext gem_create_ext = { + .size = ALIGN(total_size, 0x10000), + }; + struct drm_i915_gem_memory_class_instance regions[2]; + struct drm_i915_gem_create_ext_memory_regions ext_regions = { + .base = {.name = I915_GEM_CREATE_EXT_MEMORY_REGIONS}, + .num_regions = 0, + .regions = (uintptr_t)regions, + }; + regions[ext_regions.num_regions++] = dev.vram.region; + regions[ext_regions.num_regions++] = dev.sys.region; + intel_gem_add_ext(&gem_create_ext.extensions, + I915_GEM_CREATE_EXT_MEMORY_REGIONS, + &ext_regions.base); + int ret = drmIoctl(fd, DRM_IOCTL_I915_GEM_CREATE_EXT, &gem_create_ext); + if (ret) { + ALOGE("drv: DRM_IOCTL_I915_GEM_CREATE_EXT failed (size=%llu)\n", + gem_create_ext.size); + return -errno; + } + *out_handle = gem_create_ext.handle; + return 0; +} + +#define VIRTGPU_PARAM_ALLOW_P2P 12 + +bool virtio_gpu_allow_p2p(int virtgpu_fd) { + struct drm_virtgpu_getparam get_param = { 0, 0 }; + uint64_t value = 0; + get_param.param = VIRTGPU_PARAM_ALLOW_P2P; + get_param.value = (__u64) &value; + int ret = drmIoctl(virtgpu_fd, DRM_IOCTL_VIRTGPU_GETPARAM, &get_param); + if (ret || value != 1) { + return false; + } + return true; +} diff --git a/utils/intel_blit.h b/utils/intel_blit.h new file mode 100644 index 0000000..18c1169 --- /dev/null +++ b/utils/intel_blit.h @@ -0,0 +1,48 @@ +/* + * Copyright (C) 2024 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __INTEL_BLIT_H__ +#define __INTEL_BLIT_H__ + +#include +#include + +#define I915_TILING_4 9 + +struct intel_info { + int fd; + uint32_t batch_handle; + uint32_t *vaddr; + uint32_t *cur; + uint64_t size; + int init; + struct { + uint32_t blitter_src; + uint32_t blitter_dst; + } mocs; +}; + +int intel_blit_destroy(struct intel_info *info); +int intel_blit_init(struct intel_info *info); +int intel_blit(struct intel_info *info, uint32_t dst, uint32_t src, + uint32_t stride, uint32_t bpp, uint32_t tiling, uint16_t width, + uint16_t height, int in_fence, int *out_fence); +int intel_create_buffer(uint32_t width, uint32_t height, uint32_t format, + uint64_t modifier, uint32_t *out_handle); +int intel_dgpu_fd(); +bool virtio_gpu_allow_p2p(int virtgpu_fd); + +#endif // __INTEL_BLIT_H__