diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3db2fc3fed..15dd57cd5e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -541,7 +541,7 @@ if(CONFIG_AV1_ENCODER)
                                     ${LIBBROTLICOMMON_LIBRARIES})
       target_include_directories(aom PRIVATE ${LIBJXL_INCLUDE_DIRS})
     else()
-      pkg_check_modules(LIBJXL REQUIRED libjxl)
+      pkg_check_modules(LIBJXL REQUIRED libjxl libjxl_threads)
       target_link_libraries(aom PRIVATE ${LIBJXL_LDFLAGS} ${LIBJXL_LIBRARIES})
       target_include_directories(aom PRIVATE ${LIBJXL_INCLUDE_DIRS})
       if(LIBJXL_CFLAGS)
diff --git a/aom/aomcx.h b/aom/aomcx.h
index 9c7402a6ec..9ffeccc153 100644
--- a/aom/aomcx.h
+++ b/aom/aomcx.h
@@ -1547,6 +1547,7 @@ typedef enum {
   AOM_TUNE_VMAF_MAX_GAIN = 6,
   AOM_TUNE_VMAF_NEG_MAX_GAIN = 7,
   AOM_TUNE_BUTTERAUGLI = 8,
+  AOM_TUNE_IMAGE_PERCEPTUAL_QUALITY = 9,
 } aom_tune_metric;
 
 /*!\brief Distortion metric to use for RD optimization.
diff --git a/aom_dsp/butteraugli.c b/aom_dsp/butteraugli.c
index 038efcd313..55b69831b5 100644
--- a/aom_dsp/butteraugli.c
+++ b/aom_dsp/butteraugli.c
@@ -11,9 +11,11 @@
 
 #include <assert.h>
 #include <jxl/butteraugli.h>
+#include <jxl/thread_parallel_runner.h>
 
 #include "aom_dsp/butteraugli.h"
 #include "aom_mem/aom_mem.h"
+#include "aom_ports/mem.h"
 #include "third_party/libyuv/include/libyuv/convert_argb.h"
 
 int aom_calc_butteraugli(const YV12_BUFFER_CONFIG *source,
@@ -21,7 +23,7 @@ int aom_calc_butteraugli(const YV12_BUFFER_CONFIG *source,
                          aom_matrix_coefficients_t matrix_coefficients,
                          aom_color_range_t color_range, float *dist_map) {
   (void)bit_depth;
-  assert(bit_depth == 8);
+  assert(bit_depth <= 10);
   const int width = source->y_crop_width;
   const int height = source->y_crop_height;
   const int ss_x = source->subsampling_x;
@@ -46,30 +48,54 @@ int aom_calc_butteraugli(const YV12_BUFFER_CONFIG *source,
     return 0;
   }
 
   if (ss_x == 1 && ss_y == 1) {
-    I420ToARGBMatrix(source->y_buffer, source->y_stride, source->u_buffer,
-                     source->uv_stride, source->v_buffer, source->uv_stride,
-                     src_argb, stride_argb, yuv_constants, width, height);
-    I420ToARGBMatrix(distorted->y_buffer, distorted->y_stride,
-                     distorted->u_buffer, distorted->uv_stride,
-                     distorted->v_buffer, distorted->uv_stride, distorted_argb,
-                     stride_argb, yuv_constants, width, height);
+    if (bit_depth == 8) {
+      I420ToARGBMatrix(source->y_buffer, source->y_stride, source->u_buffer,
+                      source->uv_stride, source->v_buffer, source->uv_stride,
+                      src_argb, stride_argb, yuv_constants, width, height);
+      I420ToARGBMatrix(distorted->y_buffer, distorted->y_stride,
+                      distorted->u_buffer, distorted->uv_stride,
+                      distorted->v_buffer, distorted->uv_stride, distorted_argb,
+                      stride_argb, yuv_constants, width, height);
+    } else {
+      I010ToARGBMatrix(CONVERT_TO_SHORTPTR(source->y_buffer), source->y_stride,
+                      CONVERT_TO_SHORTPTR(source->u_buffer), source->uv_stride,
+                      CONVERT_TO_SHORTPTR(source->v_buffer), source->uv_stride,
+                      src_argb, stride_argb, yuv_constants, width, height);
+      I010ToARGBMatrix(CONVERT_TO_SHORTPTR(distorted->y_buffer), distorted->y_stride,
+                      CONVERT_TO_SHORTPTR(distorted->u_buffer), distorted->uv_stride,
+                      CONVERT_TO_SHORTPTR(distorted->v_buffer), distorted->uv_stride,
+                      distorted_argb, stride_argb, yuv_constants, width, height);
+    }
   } else if (ss_x == 1 && ss_y == 0) {
-    I422ToARGBMatrix(source->y_buffer, source->y_stride, source->u_buffer,
-                     source->uv_stride, source->v_buffer, source->uv_stride,
-                     src_argb, stride_argb, yuv_constants, width, height);
-    I422ToARGBMatrix(distorted->y_buffer, distorted->y_stride,
-                     distorted->u_buffer, distorted->uv_stride,
-                     distorted->v_buffer, distorted->uv_stride, distorted_argb,
-                     stride_argb, yuv_constants, width, height);
+    if (bit_depth == 8) {
+      I422ToARGBMatrix(source->y_buffer, source->y_stride, source->u_buffer,
+                      source->uv_stride, source->v_buffer, source->uv_stride,
+                      src_argb, stride_argb, yuv_constants, width, height);
+      I422ToARGBMatrix(distorted->y_buffer, distorted->y_stride,
+                      distorted->u_buffer, distorted->uv_stride,
+                      distorted->v_buffer, distorted->uv_stride, distorted_argb,
+                      stride_argb, yuv_constants, width, height);
+    } else {
+      I210ToARGBMatrix(CONVERT_TO_SHORTPTR(source->y_buffer), source->y_stride,
+                      CONVERT_TO_SHORTPTR(source->u_buffer), source->uv_stride,
+                      CONVERT_TO_SHORTPTR(source->v_buffer), source->uv_stride,
+                      src_argb, stride_argb, yuv_constants, width, height);
+      I210ToARGBMatrix(CONVERT_TO_SHORTPTR(distorted->y_buffer), distorted->y_stride,
+                      CONVERT_TO_SHORTPTR(distorted->u_buffer), distorted->uv_stride,
+                      CONVERT_TO_SHORTPTR(distorted->v_buffer), distorted->uv_stride,
+                      distorted_argb, stride_argb, yuv_constants, width, height);
+    }
-  } else if (ss_x == 0 && ss_y == 0) {
+  } else if (ss_x == 0 && ss_y == 0 && bit_depth == 8) {
+    // High-bitdepth 4:4:4 has no converter here; it takes the final branch,
+    // which frees both ARGB buffers instead of leaking them.
     I444ToARGBMatrix(source->y_buffer, source->y_stride, source->u_buffer,
                      source->uv_stride, source->v_buffer, source->uv_stride,
                      src_argb, stride_argb, yuv_constants, width, height);
     I444ToARGBMatrix(distorted->y_buffer, distorted->y_stride,
                      distorted->u_buffer, distorted->uv_stride,
                      distorted->v_buffer, distorted->uv_stride, distorted_argb,
                      stride_argb, yuv_constants, width, height);
   } else {
     aom_free(src_argb);
     aom_free(distorted_argb);
@@ -77,8 +103,15 @@ int aom_calc_butteraugli(const YV12_BUFFER_CONFIG *source,
   }
 
+  // The libyuv *ToARGBMatrix() converters above write 8-bit ARGB (4 bytes
+  // per pixel, matching stride_argb) for 8-bit and 10-bit input alike, so the
+  // buffers are always described to butteraugli as JXL_TYPE_UINT8.
   JxlPixelFormat pixel_format = { 4, JXL_TYPE_UINT8, JXL_NATIVE_ENDIAN, 0 };
   JxlButteraugliApi *api = JxlButteraugliApiCreate(NULL);
-  JxlButteraugliApiSetHFAsymmetry(api, 0.8f);
+  // NOTE(review): confirm that `runner` is released with
+  // JxlThreadParallelRunnerDestroy() on every exit path after this point.
+  void *runner = JxlThreadParallelRunnerCreate(NULL, 6);
+  JxlButteraugliApiSetParallelRunner(api, JxlThreadParallelRunner, runner);
+  JxlButteraugliApiSetHFAsymmetry(api, 0.5f);
 
   JxlButteraugliResult *result = JxlButteraugliCompute(
       api, width, height, &pixel_format, src_argb, buffer_size, &pixel_format,
diff --git a/av1/arg_defs.c b/av1/arg_defs.c
index 882b03aa07..219f4c1dcc 100644
--- a/av1/arg_defs.c
+++ b/av1/arg_defs.c
@@ -47,6 +47,7 @@ static const struct arg_enum_list tuning_enum[] = {
   { "vmaf", AOM_TUNE_VMAF_MAX_GAIN },
   { "vmaf_neg", AOM_TUNE_VMAF_NEG_MAX_GAIN },
   { "butteraugli", AOM_TUNE_BUTTERAUGLI },
+  { "image_perceptual_quality", AOM_TUNE_IMAGE_PERCEPTUAL_QUALITY },
   { NULL, 0 }
 };
 
@@ -535,8 +536,9 @@ const av1_codec_arg_definitions_t g_av1_codec_arg_defs = {
       ARG_DEF(NULL, "deltaq-mode", 1,
               "Delta qindex mode (0: off, 1: deltaq objective (default), "
               "2: deltaq placeholder, 3: key frame visual quality, 4: user "
-              "rating based visual quality optimization). "
-              "Currently this requires enable-tpl-model as a prerequisite."),
+              "rating based visual quality optimization, \n"
+              "                                        5: HDR deltaq optimization). "
+              "Currently, deltaq-mode=1 and 2 require enable-tpl-model as a prerequisite."),
   .deltaq_strength = ARG_DEF(NULL, "deltaq-strength", 1,
                              "Deltaq strength for"
                              " --deltaq-mode=4 (%)"),
diff --git a/av1/av1_cx_iface.c b/av1/av1_cx_iface.c
index 9a2b9858fe..b93e56492e 100644
--- a/av1/av1_cx_iface.c
+++ b/av1/av1_cx_iface.c
@@ -798,7 +798,8 @@ static aom_codec_err_t validate_config(aom_codec_alg_priv_t *ctx,
   }
 #endif
 
-  RANGE_CHECK(extra_cfg, tuning, AOM_TUNE_PSNR, AOM_TUNE_BUTTERAUGLI);
+  RANGE_CHECK(extra_cfg, tuning, AOM_TUNE_PSNR,
+              AOM_TUNE_IMAGE_PERCEPTUAL_QUALITY);
 
   RANGE_CHECK(extra_cfg, dist_metric, AOM_DIST_METRIC_PSNR,
               AOM_DIST_METRIC_QM_PSNR);
@@ -878,9 +879,6 @@ static aom_codec_err_t validate_img(aom_codec_alg_priv_t *ctx,
 
 #if CONFIG_TUNE_BUTTERAUGLI
   if (ctx->extra_cfg.tuning == AOM_TUNE_BUTTERAUGLI) {
-    if (img->bit_depth > 8) {
-      ERROR("Only 8 bit depth images supported in tune=butteraugli mode.");
-    }
     if (img->mc != 0 && img->mc != AOM_CICP_MC_BT_709 &&
         img->mc != AOM_CICP_MC_BT_601 && img->mc != AOM_CICP_MC_BT_470_B_G) {
       ERROR(
diff --git a/av1/encoder/allintra_vis.c b/av1/encoder/allintra_vis.c
index a8fb2f5ffc..42e4d03744 100644
--- a/av1/encoder/allintra_vis.c
+++ b/av1/encoder/allintra_vis.c
@@ -366,6 +366,140 @@ static void automatic_intra_tools_off(AV1_COMP *cpi,
   }
 }
 
+// Compute the "mean subtracted contrast normalized coefficients (MSCN)",
+// defined in the following paper:
+// "No-Reference Image Quality Assessment in the Spatial Domain",
+// DOI: 10.1109/TIP.2012.2214050
+//
+// The MSCN coefficients reflect normalized signal information regardless
+// of pixel intensity; they can be thought of as a contrast-enhanced image map.
+//
+// mscn_map holds frame_width * frame_height doubles (row stride frame_width)
+// and is left untouched if it is NULL or the scratch buffer allocation fails.
+// NOTE(review): the luma plane is read as 8-bit samples; confirm that this
+// path is never reached with a high-bitdepth source buffer.
+static void build_mscn_map(AV1_COMP *cpi, double *mscn_map) {
+  const uint8_t *buffer = cpi->source->y_buffer;
+  const int buf_stride = cpi->source->y_stride;
+  const int frame_width = cpi->frame_info.frame_width;
+  const int frame_height = cpi->frame_info.frame_height;
+  const int half_win = 3;
+  const int win_size = 2 * half_win + 1;
+  // h = round(fspecial('gaussian', 7, 3.0) * 1000)
+  static const int gauss_kernel[] = { 11, 15, 18, 19, 18, 15, 11, 15, 20, 23,
+                                      25, 23, 20, 15, 18, 23, 27, 29, 27, 23,
+                                      18, 19, 25, 29, 31, 29, 25, 19, 18, 23,
+                                      27, 29, 27, 23, 18, 15, 20, 23, 25, 23,
+                                      20, 15, 11, 15, 18, 19, 18, 15, 11 };
+  if (mscn_map == NULL) return;
+  double *mean_map =
+      aom_calloc((size_t)frame_width * frame_height, sizeof(*mean_map));
+  if (mean_map == NULL) return;
+  // Pass 1: Gaussian-weighted local mean; taps outside the frame are skipped.
+  for (int row = 0; row < frame_height; ++row) {
+    for (int col = 0; col < frame_width; ++col) {
+      double weighted_sum = 0;
+      int count = 0;
+      for (int ky = 0; ky < win_size; ++ky) {
+        const int y = row + ky - half_win;
+        if (y < 0 || y >= frame_height) continue;
+        for (int kx = 0; kx < win_size; ++kx) {
+          const int x = col + kx - half_win;
+          if (x < 0 || x >= frame_width) continue;
+          const int weight = gauss_kernel[ky * win_size + kx];
+          weighted_sum += buffer[y * buf_stride + x] * weight;
+          count += weight;
+        }
+      }
+      mean_map[row * frame_width + col] = weighted_sum / count;
+    }
+  }
+  // Pass 2: weighted local deviation around that mean, then normalization.
+  for (int row = 0; row < frame_height; ++row) {
+    for (int col = 0; col < frame_width; ++col) {
+      const double mean = mean_map[row * frame_width + col];
+      double weighted_sum = 0;
+      double count = 0;
+      for (int ky = 0; ky < win_size; ++ky) {
+        const int y = row + ky - half_win;
+        if (y < 0 || y >= frame_height) continue;
+        for (int kx = 0; kx < win_size; ++kx) {
+          const int x = col + kx - half_win;
+          if (x < 0 || x >= frame_width) continue;
+          const double weight = gauss_kernel[ky * win_size + kx];
+          const double diff = buffer[y * buf_stride + x] - mean;
+          weighted_sum += weight * diff * diff;
+          count += weight;
+        }
+      }
+      const double sigma = sqrt(weighted_sum / count);
+      mscn_map[row * frame_width + col] =
+          (buffer[row * buf_stride + col] - mean) / (sigma + 1.0);
+    }
+  }
+  aom_free(mean_map);
+}
+
+// beta (= cpi->norm_wiener_variance / sb_wiener_var) is the scaling factor
+// that determines the quantizer used for a super block,
+// used in "av1_get_sbq_perceptual_ai()".
+// Its lower bound is determined by the "min_max_scale" which prevents using
+// a large quantizer that quantizes all transform coefficients from non-zero
+// to zero.
+// Its upper bound is determined in this function, with the help of the
+// global_mscn_contrast, which measures the complexity contrast between the
+// most difficult and the most plain block. The result lies in [2.0, 6.0].
+static double get_dynamic_range(AV1_COMP *const cpi, const int sb_step) {
+  const AV1_COMMON *const cm = &cpi->common;
+  const int frame_width = cpi->frame_info.frame_width;
+  const int frame_height = cpi->frame_info.frame_height;
+  // sb_step counts mi units (it is not a BLOCK_SIZE): convert it to pixels.
+  const int block_pix = sb_step * MI_SIZE;
+  double *mscn_map =
+      aom_calloc((size_t)frame_width * frame_height, sizeof(*mscn_map));
+  // Without the map, fall back to the smallest bound this function returns.
+  if (mscn_map == NULL) return 2.0;
+  build_mscn_map(cpi, mscn_map);
+  double max_block_mscn = 0.0;
+  double min_block_mscn = 1000.0;
+  for (int mi_row = 0; mi_row < cm->mi_params.mi_rows; mi_row += sb_step) {
+    for (int mi_col = 0; mi_col < cm->mi_params.mi_cols; mi_col += sb_step) {
+      const int row_start = mi_row * MI_SIZE;
+      const int col_start = mi_col * MI_SIZE;
+      const int row_end = AOMMIN(row_start + block_pix, frame_height);
+      const int col_end = AOMMIN(col_start + block_pix, frame_width);
+      if (row_start >= row_end || col_start >= col_end) continue;
+      double block_sum_mscn = 0.0;
+      for (int r = row_start; r < row_end; ++r) {
+        for (int c = col_start; c < col_end; ++c) {
+          block_sum_mscn += fabs(mscn_map[r * frame_width + c]);
+        }
+      }
+      const double block_avg_mscn =
+          block_sum_mscn / ((row_end - row_start) * (col_end - col_start));
+      max_block_mscn = AOMMAX(block_avg_mscn, max_block_mscn);
+      min_block_mscn = AOMMIN(block_avg_mscn, min_block_mscn);
+    }
+  }
+  aom_free(mscn_map);
+  double global_mscn_contrast = max_block_mscn / (min_block_mscn + 0.01);
+  global_mscn_contrast = AOMMIN(global_mscn_contrast, 20.0);
+  double min_beta = 1000.0;
+  for (int mi_row = 0; mi_row < cm->mi_params.mi_rows; mi_row += sb_step) {
+    for (int mi_col = 0; mi_col < cm->mi_params.mi_cols; mi_col += sb_step) {
+      const int sb_wiener_var =
+          get_var_perceptual_ai(cpi, cm->seq_params->sb_size, mi_row, mi_col);
+      double beta = (double)cpi->norm_wiener_variance / sb_wiener_var;
+      const double min_max_scale = AOMMAX(
+          1.0, get_max_scale(cpi, cm->seq_params->sb_size, mi_row, mi_col));
+      beta = 1.0 / AOMMIN(1.0 / beta, min_max_scale);
+      min_beta = AOMMIN(beta, min_beta);
+    }
+  }
+  const double max_beta = min_beta * global_mscn_contrast;
+  return AOMMAX(AOMMIN(max_beta, 6.0), 2.0);
+}
+
 void av1_set_mb_wiener_variance(AV1_COMP *cpi) {
   AV1_COMMON *const cm = &cpi->common;
   uint8_t *buffer = cpi->source->y_buffer;
@@ -556,6 +690,8 @@ void av1_set_mb_wiener_variance(AV1_COMP *cpi) {
       pick_norm_factor_and_block_size(cpi, &norm_block_size);
   const int norm_step = mi_size_wide[norm_block_size];
 
+  cpi->dynamic_range_upper_bound = get_dynamic_range(cpi, norm_step);
+
   double sb_wiener_log = 0;
   double sb_count = 0;
   for (int its_cnt = 0; its_cnt < 2; ++its_cnt) {
@@ -570,8 +706,7 @@ void av1_set_mb_wiener_variance(AV1_COMP *cpi) {
         double min_max_scale = AOMMAX(
             1.0, get_max_scale(cpi, cm->seq_params->sb_size, mi_row, mi_col));
         beta = 1.0 / AOMMIN(1.0 / beta, min_max_scale);
-        beta = AOMMIN(beta, 4);
-        beta = AOMMAX(beta, 0.25);
+        beta = AOMMIN(beta, cpi->dynamic_range_upper_bound);
 
         sb_wiener_var = (int)(cpi->norm_wiener_variance / beta);
 
@@ -600,10 +735,8 @@ int av1_get_sbq_perceptual_ai(AV1_COMP *const cpi, BLOCK_SIZE bsize, int mi_row,
   double beta = (double)cpi->norm_wiener_variance / sb_wiener_var;
   double min_max_scale = AOMMAX(1.0, get_max_scale(cpi, bsize, mi_row, mi_col));
   beta = 1.0 / AOMMIN(1.0 / beta, min_max_scale);
+  beta = AOMMIN(beta, cpi->dynamic_range_upper_bound);
 
-  // Cap beta such that the delta q value is not much far away from the base q.
-  beta = AOMMIN(beta, 4);
-  beta = AOMMAX(beta, 0.25);
   offset = av1_get_deltaq_offset(cm->seq_params->bit_depth, base_qindex, beta);
   const DeltaQInfo *const delta_q_info = &cm->delta_q_info;
   offset = AOMMIN(offset, delta_q_info->delta_q_res * 20 - 1);
diff --git a/av1/encoder/av1_quantize.c b/av1/encoder/av1_quantize.c
index e876fd3b31..8dbd76fa11 100644
--- a/av1/encoder/av1_quantize.c
+++ b/av1/encoder/av1_quantize.c
@@ -770,7 +770,8 @@ static int adjust_hdr_cb_deltaq(int base_qindex) {
   const double dcbQP = CHROMA_CB_QP_SCALE * chromaQp * QP_SCALE_FACTOR;
   int dqpCb = (int)(dcbQP + (dcbQP < 0 ? -0.5 : 0.5));
   dqpCb = AOMMIN(0, dqpCb);
-  dqpCb = (int)CLIP(dqpCb, -12 * QP_SCALE_FACTOR, 12 * QP_SCALE_FACTOR);
+  // NOTE(review): confirm CHROMA_DQP_MAX fits the codable delta-q range.
+  dqpCb = (int)CLIP(dqpCb, -CHROMA_DQP_MAX, CHROMA_DQP_MAX);
   return dqpCb;
 }
 
@@ -780,7 +781,8 @@ static int adjust_hdr_cr_deltaq(int base_qindex) {
   const double dcrQP = CHROMA_CR_QP_SCALE * chromaQp * QP_SCALE_FACTOR;
   int dqpCr = (int)(dcrQP + (dcrQP < 0 ? -0.5 : 0.5));
   dqpCr = AOMMIN(0, dqpCr);
-  dqpCr = (int)CLIP(dqpCr, -12 * QP_SCALE_FACTOR, 12 * QP_SCALE_FACTOR);
+  // NOTE(review): confirm CHROMA_DQP_MAX fits the codable delta-q range.
+  dqpCr = (int)CLIP(dqpCr, -CHROMA_DQP_MAX, CHROMA_DQP_MAX);
   return dqpCr;
 }
 
diff --git a/av1/encoder/encodeframe_utils.c b/av1/encoder/encodeframe_utils.c
index be74763182..45f021bfa7 100644
--- a/av1/encoder/encodeframe_utils.c
+++ b/av1/encoder/encodeframe_utils.c
@@ -35,7 +35,8 @@ void av1_set_ssim_rdmult(const AV1_COMP *const cpi, int *errorperbit,
   double num_of_mi = 0.0;
   double geom_mean_of_scale = 0.0;
 
-  assert(cpi->oxcf.tune_cfg.tuning == AOM_TUNE_SSIM);
+  assert(cpi->oxcf.tune_cfg.tuning == AOM_TUNE_SSIM ||
+         cpi->oxcf.tune_cfg.tuning == AOM_TUNE_IMAGE_PERCEPTUAL_QUALITY);
 
   for (row = mi_row / num_mi_w;
        row < num_rows && row < mi_row / num_mi_w + num_brows; ++row) {
diff --git a/av1/encoder/encoder.c b/av1/encoder/encoder.c
index 23e379f6fc..c1df6bde98 100644
--- a/av1/encoder/encoder.c
+++ b/av1/encoder/encoder.c
@@ -2566,9 +2566,18 @@ static int encode_with_recode_loop(AV1_COMP *cpi, size_t *size, uint8_t *dest) {
 #endif
 
 #if !CONFIG_RD_COMMAND
-  // Determine whether to use screen content tools using two fast encoding.
-  if (!cpi->sf.hl_sf.disable_extra_sc_testing)
+  if (cpi->oxcf.tune_cfg.content == AOM_CONTENT_PSY) {
+    // Screen content optimizations are bad for Psy tuning,
+    // disable them and avoid the extra testing to speed us up.
+    FeatureFlags *const features = &cm->features;
+    features->allow_screen_content_tools = 0;
+    features->allow_intrabc = 0;
+    cpi->use_screen_content_tools = 0;
+    cpi->is_screen_content_type = 0;
+  } else if (!cpi->sf.hl_sf.disable_extra_sc_testing) {
+    // Determine whether to use screen content tools using two fast encoding.
     av1_determine_sc_tools_with_encoding(cpi, q);
+  }
 #endif  // !CONFIG_RD_COMMAND
 
 #if CONFIG_TUNE_VMAF
@@ -3458,7 +3467,8 @@ static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size,
     }
   }
 
-  if (oxcf->tune_cfg.tuning == AOM_TUNE_SSIM) {
+  if (oxcf->tune_cfg.tuning == AOM_TUNE_SSIM ||
+      oxcf->tune_cfg.tuning == AOM_TUNE_IMAGE_PERCEPTUAL_QUALITY) {
     av1_set_mb_ssim_rdmult_scaling(cpi);
   }
 
diff --git a/av1/encoder/encoder.h b/av1/encoder/encoder.h
index aeb277a823..9d46cb633d 100644
--- a/av1/encoder/encoder.h
+++ b/av1/encoder/encoder.h
@@ -88,11 +88,12 @@ extern "C" {
 #define TF_LOOKAHEAD_IDX_THR 7
 
 #define HDR_QP_LEVELS 10
-#define CHROMA_CB_QP_SCALE 1.04
-#define CHROMA_CR_QP_SCALE 1.04
+#define CHROMA_CB_QP_SCALE 1.39
+#define CHROMA_CR_QP_SCALE 1.39
 #define CHROMA_QP_SCALE -0.46
 #define CHROMA_QP_OFFSET 9.26
 #define QP_SCALE_FACTOR 2.0
+#define CHROMA_DQP_MAX 80
 #define DISABLE_HDR_LUMA_DELTAQ 1
 
 // Rational number with an int64 numerator
@@ -3296,6 +3297,12 @@ typedef struct AV1_COMP {
    */
   int64_t norm_wiener_variance;
 
+  /*!
+   * The upper bound that determines the minimum allowed q for a super block
+   * in all intra mode, deltaq-mode=3.
+   */
+  double dynamic_range_upper_bound;
+
   /*!
    * Buffer to store delta-q values for delta-q mode 4.
    */
diff --git a/av1/encoder/encoder_utils.c b/av1/encoder/encoder_utils.c
index b6ac2d9e7a..ec7a9fb762 100644
--- a/av1/encoder/encoder_utils.c
+++ b/av1/encoder/encoder_utils.c
@@ -1274,7 +1274,7 @@ void av1_set_mb_ssim_rdmult_scaling(AV1_COMP *cpi) {
   // Loop through each 16x16 block.
   for (int row = 0; row < num_rows; ++row) {
     for (int col = 0; col < num_cols; ++col) {
-      double var = 0.0, num_of_var = 0.0;
+      double var = 0.0, num_of_var = 0.0, var_log = 0.0;
       const int index = row * num_cols + col;
 
       // Loop through each 8x8 block.
@@ -1291,31 +1291,89 @@ void av1_set_mb_ssim_rdmult_scaling(AV1_COMP *cpi) {
           buf.buf = y_buffer + row_offset_y * y_stride + col_offset_y;
           buf.stride = y_stride;
 
+          double blk_var;
           if (use_hbd) {
-            var += av1_high_get_sby_perpixel_variance(cpi, &buf, BLOCK_8X8,
-                                                      xd->bd);
+            blk_var = av1_high_get_sby_perpixel_variance(cpi, &buf, BLOCK_8X8,
+                                                         xd->bd);
           } else {
-            var += av1_get_sby_perpixel_variance(cpi, &buf, BLOCK_8X8);
+            blk_var = av1_get_sby_perpixel_variance(cpi, &buf, BLOCK_8X8);
           }
 
+          var_log += log(AOMMAX(blk_var, 1));
+          var += blk_var;
           num_of_var += 1.0;
         }
       }
-      var = var / num_of_var;
 
-      // Curve fitting with an exponential model on all 16x16 blocks from the
-      // midres dataset.
-      var = 67.035434 * (1 - exp(-0.0021489 * var)) + 17.492222;
+      if (cpi->oxcf.tune_cfg.tuning == AOM_TUNE_IMAGE_PERCEPTUAL_QUALITY) {
+        var = exp(var_log / num_of_var);
+        const int cq_level = cpi->oxcf.rc_cfg.cq_level;
+        const double hq_level = 30 * 4;
+        const double delta =
+            cq_level < hq_level
+                ? 2.0 * (double)(hq_level - cq_level) / hq_level
+                : 10.0 * (double)(cq_level - hq_level) / (MAXQ - hq_level);
+        // Curve fitting with an exponential model on user rating dataset.
+        var = 39.126 * (1 - exp(-0.0009413 * var)) + 1.236 + delta;
+      } else {
+        var = var / num_of_var;
+        // Curve fitting with an exponential model on all 16x16 blocks from the
+        // midres dataset.
+        var = 67.035434 * (1 - exp(-0.0021489 * var)) + 17.492222;
+      }
       cpi->ssim_rdmult_scaling_factors[index] = var;
       log_sum += log(var);
     }
   }
-  log_sum = exp(log_sum / (double)(num_rows * num_cols));
 
-  for (int row = 0; row < num_rows; ++row) {
-    for (int col = 0; col < num_cols; ++col) {
-      const int index = row * num_cols + col;
-      cpi->ssim_rdmult_scaling_factors[index] /= log_sum;
+  if (cpi->oxcf.tune_cfg.tuning == AOM_TUNE_IMAGE_PERCEPTUAL_QUALITY &&
+      cpi->oxcf.q_cfg.deltaq_mode != NO_DELTA_Q) {
+    const int sb_size = cpi->common.seq_params->sb_size;
+    const int num_mi_w_sb = mi_size_wide[sb_size];
+    const int num_mi_h_sb = mi_size_high[sb_size];
+    const int num_cols_sb =
+        (mi_params->mi_cols + num_mi_w_sb - 1) / num_mi_w_sb;
+    const int num_rows_sb =
+        (mi_params->mi_rows + num_mi_h_sb - 1) / num_mi_h_sb;
+    const int num_blk_w = num_mi_w_sb / num_mi_w;
+    const int num_blk_h = num_mi_h_sb / num_mi_h;
+    assert(num_blk_w * num_mi_w == num_mi_w_sb);
+    assert(num_blk_h * num_mi_h == num_mi_h_sb);
+
+    for (int row = 0; row < num_rows_sb; ++row) {
+      for (int col = 0; col < num_cols_sb; ++col) {
+        double log_sum_sb = 0.0;
+        double blk_count = 0.0;
+        for (int blk_row = row * num_blk_h;
+             blk_row < (row + 1) * num_blk_h && blk_row < num_rows; ++blk_row) {
+          for (int blk_col = col * num_blk_w;
+               blk_col < (col + 1) * num_blk_w && blk_col < num_cols;
+               ++blk_col) {
+            const int index = blk_row * num_cols + blk_col;
+            log_sum_sb += log(cpi->ssim_rdmult_scaling_factors[index]);
+            blk_count += 1.0;
+          }
+        }
+        log_sum_sb = exp(log_sum_sb / blk_count);
+        for (int blk_row = row * num_blk_h;
+             blk_row < (row + 1) * num_blk_h && blk_row < num_rows; ++blk_row) {
+          for (int blk_col = col * num_blk_w;
+               blk_col < (col + 1) * num_blk_w && blk_col < num_cols;
+               ++blk_col) {
+            const int index = blk_row * num_cols + blk_col;
+            cpi->ssim_rdmult_scaling_factors[index] /= log_sum_sb;
+          }
+        }
+      }
+    }
+  } else {
+    log_sum = exp(log_sum / (double)(num_rows * num_cols));
+
+    for (int row = 0; row < num_rows; ++row) {
+      for (int col = 0; col < num_cols; ++col) {
+        const int index = row * num_cols + col;
+        cpi->ssim_rdmult_scaling_factors[index] /= log_sum;
+      }
     }
   }
 }
diff --git a/av1/encoder/lookahead.h b/av1/encoder/lookahead.h
index c9e1c9a52b..6d75e4b987 100644
--- a/av1/encoder/lookahead.h
+++ b/av1/encoder/lookahead.h
@@ -25,8 +25,8 @@ extern "C" {
 #endif
 
 /*!\cond */
-#define MAX_LAG_BUFFERS 48
-#define MAX_LAP_BUFFERS 48
+#define MAX_LAG_BUFFERS 120
+#define MAX_LAP_BUFFERS 120
 #define MAX_TOTAL_BUFFERS (MAX_LAG_BUFFERS + MAX_LAP_BUFFERS)
 #define LAP_LAG_IN_FRAMES 17
 
diff --git a/av1/encoder/partition_search.c b/av1/encoder/partition_search.c
index 1e0a539a5b..7914946180 100644
--- a/av1/encoder/partition_search.c
+++ b/av1/encoder/partition_search.c
@@ -642,7 +642,8 @@ static void setup_block_rdmult(const AV1_COMP *const cpi, MACROBLOCK *const x,
   }
 #endif  // !CONFIG_REALTIME_ONLY
 
-  if (cpi->oxcf.tune_cfg.tuning == AOM_TUNE_SSIM) {
+  if (cpi->oxcf.tune_cfg.tuning == AOM_TUNE_SSIM ||
+      cpi->oxcf.tune_cfg.tuning == AOM_TUNE_IMAGE_PERCEPTUAL_QUALITY) {
     av1_set_ssim_rdmult(cpi, &x->errorperbit, bsize, mi_row, mi_col,
                         &x->rdmult);
   }
diff --git a/av1/encoder/speed_features.c b/av1/encoder/speed_features.c
index c6b056ff4e..25e9df9d1f 100644
--- a/av1/encoder/speed_features.c
+++ b/av1/encoder/speed_features.c
@@ -950,8 +950,6 @@ static void set_good_speed_features_framesize_independent(
   if (speed >= 2) {
     sf->hl_sf.recode_loop = ALLOW_RECODE_KFARFGF;
 
-    sf->fp_sf.skip_motion_search_threshold = 25;
-
     sf->gm_sf.disable_gm_search_based_on_stats = 1;
 
     sf->part_sf.reuse_best_prediction_for_part_ab =
@@ -1139,8 +1137,6 @@ static void set_good_speed_features_framesize_independent(
   }
 
   if (speed >= 5) {
-    sf->fp_sf.reduce_mv_step_param = 4;
-
     sf->part_sf.simple_motion_search_prune_agg =
         allow_screen_content_tools ? SIMPLE_AGG_LVL0 : SIMPLE_AGG_LVL3;
     sf->part_sf.ext_partition_eval_thresh =
@@ -1170,7 +1166,6 @@ static void set_good_speed_features_framesize_independent(
 
     sf->winner_mode_sf.dc_blk_pred_level = 1;
 
-    sf->fp_sf.disable_recon = 1;
   }
 
   if (speed >= 6) {
@@ -1209,7 +1204,6 @@ static void set_good_speed_features_framesize_independent(
     sf->winner_mode_sf.dc_blk_pred_level = 2;
     sf->winner_mode_sf.multi_winner_mode_type = MULTI_WINNER_MODE_OFF;
 
-    sf->fp_sf.skip_zeromv_motion_search = 1;
   }
 }
 
diff --git a/av1/encoder/tune_butteraugli.c b/av1/encoder/tune_butteraugli.c
index 80a0fc27c3..4a226067f8 100644
--- a/av1/encoder/tune_butteraugli.c
+++ b/av1/encoder/tune_butteraugli.c
@@ -18,6 +18,7 @@
 #include "av1/encoder/encoder_utils.h"
 #include "av1/encoder/extend.h"
 #include "av1/encoder/var_based_part.h"
+#include "aom_ports/mem.h"
 
 static const int resize_factor = 2;
 
@@ -56,57 +57,115 @@ static void set_mb_butteraugli_rdmult_scaling(AV1_COMP *cpi,
   double log_sum = 0.0;
   double blk_count = 0.0;
 
-  // Loop through each block.
-  for (int row = 0; row < num_rows; ++row) {
-    for (int col = 0; col < num_cols; ++col) {
-      const int index = row * num_cols + col;
-      const int y_start = row * block_h;
-      const int x_start = col * block_w;
-      float dbutteraugli = 0.0f;
-      float dmse = 0.0f;
-      float px_count = 0.0f;
-
-      // Loop through each pixel.
-      for (int y = y_start; y < y_start + block_h && y < height; y++) {
-        for (int x = x_start; x < x_start + block_w && x < width; x++) {
-          dbutteraugli += powf(diffmap[y * width + x], 12.0f);
-          float px_diff = source->y_buffer[y * source->y_stride + x] -
-                          recon->y_buffer[y * recon->y_stride + x];
-          dmse += px_diff * px_diff;
-          px_count += 1.0f;
+
+  if (cm->seq_params->use_highbitdepth)
+  {
+    // Loop through each block.
+    for (int row = 0; row < num_rows; ++row) {
+      for (int col = 0; col < num_cols; ++col) {
+        const int index = row * num_cols + col;
+        const int y_start = row * block_h;
+        const int x_start = col * block_w;
+        float dbutteraugli = 0.0f;
+        float dmse = 0.0f;
+        float px_count = 0.0f;
+
+        // Loop through each pixel.
+        for (int y = y_start; y < y_start + block_h && y < height; y++) {
+          for (int x = x_start; x < x_start + block_w && x < width; x++) {
+            dbutteraugli += powf(diffmap[y * width + x], 12.0f);
+            float px_diff = CONVERT_TO_SHORTPTR(source->y_buffer)[y * source->y_stride + x] -
+                            CONVERT_TO_SHORTPTR(recon->y_buffer)[y * recon->y_stride + x];
+            dmse += px_diff * px_diff;
+            px_count += 1.0f;
+          }
         }
-      }
-      const int y_end = AOMMIN((y_start >> ss_y) + (block_h >> ss_y),
-                               (height + ss_y) >> ss_y);
-      for (int y = y_start >> ss_y; y < y_end; y++) {
-        const int x_end = AOMMIN((x_start >> ss_x) + (block_w >> ss_x),
-                                 (width + ss_x) >> ss_x);
-        for (int x = x_start >> ss_x; x < x_end; x++) {
-          const int src_px_index = y * source->uv_stride + x;
-          const int recon_px_index = y * recon->uv_stride + x;
-          const float px_diff_u = (float)(source->u_buffer[src_px_index] -
-                                          recon->u_buffer[recon_px_index]);
-          const float px_diff_v = (float)(source->v_buffer[src_px_index] -
-                                          recon->v_buffer[recon_px_index]);
-          dmse += px_diff_u * px_diff_u + px_diff_v * px_diff_v;
-          px_count += 2.0f;
+        const int y_end = AOMMIN((y_start >> ss_y) + (block_h >> ss_y),
+                                (height + ss_y) >> ss_y);
+        for (int y = y_start >> ss_y; y < y_end; y++) {
+          const int x_end = AOMMIN((x_start >> ss_x) + (block_w >> ss_x),
+                                  (width + ss_x) >> ss_x);
+          for (int x = x_start >> ss_x; x < x_end; x++) {
+            const int src_px_index = y * source->uv_stride + x;
+            const int recon_px_index = y * recon->uv_stride + x;
+            const float px_diff_u = (float)(CONVERT_TO_SHORTPTR(source->u_buffer)[src_px_index] -
+                                            CONVERT_TO_SHORTPTR(recon->u_buffer)[recon_px_index]);
+            const float px_diff_v = (float)(CONVERT_TO_SHORTPTR(source->v_buffer)[src_px_index] -
+                                            CONVERT_TO_SHORTPTR(recon->v_buffer)[recon_px_index]);
+            dmse += px_diff_u * px_diff_u + px_diff_v * px_diff_v;
+            px_count += 2.0f;
+          }
+        }
+
+        dbutteraugli = powf(dbutteraugli, 1.0f / 12.0f);
+        dmse = dmse / px_count;
+        const float eps = 0.01f;
+        double weight;
+        if (dbutteraugli < eps || dmse < eps) {
+          weight = -1.0;
+        } else {
+          blk_count += 1.0;
+          weight = dmse / dbutteraugli;
+          weight = AOMMIN(weight, 5.0);
+          weight += K;
+          log_sum += log(weight);
         }
+        cpi->butteraugli_info.rdmult_scaling_factors[index] = weight;
       }
+    }
+  } else {
+    // Loop through each block.
+    for (int row = 0; row < num_rows; ++row) {
+      for (int col = 0; col < num_cols; ++col) {
+        const int index = row * num_cols + col;
+        const int y_start = row * block_h;
+        const int x_start = col * block_w;
+        float dbutteraugli = 0.0f;
+        float dmse = 0.0f;
+        float px_count = 0.0f;
+
+        // Loop through each pixel.
+        for (int y = y_start; y < y_start + block_h && y < height; y++) {
+          for (int x = x_start; x < x_start + block_w && x < width; x++) {
+            dbutteraugli += powf(diffmap[y * width + x], 12.0f);
+            float px_diff = source->y_buffer[y * source->y_stride + x] -
+                            recon->y_buffer[y * recon->y_stride + x];
+            dmse += px_diff * px_diff;
+            px_count += 1.0f;
+          }
+        }
+        const int y_end = AOMMIN((y_start >> ss_y) + (block_h >> ss_y),
+                                (height + ss_y) >> ss_y);
+        for (int y = y_start >> ss_y; y < y_end; y++) {
+          const int x_end = AOMMIN((x_start >> ss_x) + (block_w >> ss_x),
+                                  (width + ss_x) >> ss_x);
+          for (int x = x_start >> ss_x; x < x_end; x++) {
+            const int src_px_index = y * source->uv_stride + x;
+            const int recon_px_index = y * recon->uv_stride + x;
+            const float px_diff_u = (float)(source->u_buffer[src_px_index] -
+                                            recon->u_buffer[recon_px_index]);
+            const float px_diff_v = (float)(source->v_buffer[src_px_index] -
+                                            recon->v_buffer[recon_px_index]);
+            dmse += px_diff_u * px_diff_u + px_diff_v * px_diff_v;
+            px_count += 2.0f;
+          }
+        }
 
-      dbutteraugli = powf(dbutteraugli, 1.0f / 12.0f);
-      dmse = dmse / px_count;
-      const float eps = 0.01f;
-      double weight;
-      if (dbutteraugli < eps || dmse < eps) {
-        weight = -1.0;
-      } else {
-        blk_count += 1.0;
-        weight = dmse / dbutteraugli;
-        weight = AOMMIN(weight, 5.0);
-        weight += K;
-        log_sum += log(weight);
+        dbutteraugli = powf(dbutteraugli, 1.0f / 12.0f);
+        dmse = dmse / px_count;
+        const float eps = 0.01f;
+        double weight;
+        if (dbutteraugli < eps || dmse < eps) {
+          weight = -1.0;
+        } else {
+          blk_count += 1.0;
+          weight = dmse / dbutteraugli;
+          weight = AOMMIN(weight, 5.0);
+          weight += K;
+          log_sum += log(weight);
+        }
+        cpi->butteraugli_info.rdmult_scaling_factors[index] = weight;
       }
-      cpi->butteraugli_info.rdmult_scaling_factors[index] = weight;
     }
   }
   // Geometric average of the weights.
@@ -164,7 +223,7 @@ void av1_set_butteraugli_rdmult(const AV1_COMP *cpi, MACROBLOCK *x,
   av1_set_error_per_bit(&x->errorperbit, *rdmult);
 }
 
-static void copy_plane(const uint8_t *src, int src_stride, uint8_t *dst,
+static void copy_plane_lowbd(const uint8_t *src, int src_stride, uint8_t *dst,
                        int dst_stride, int w, int h) {
   for (int row = 0; row < h; row++) {
     memcpy(dst, src, w);
@@ -173,29 +232,66 @@ static void copy_plane(const uint8_t *src, int src_stride, uint8_t *dst,
   }
 }
 
-static void copy_img(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst,
+static void copy_img_lowbd(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst,
                      int width, int height) {
-  copy_plane(src->y_buffer, src->y_stride, dst->y_buffer, dst->y_stride, width,
+  copy_plane_lowbd(src->y_buffer, src->y_stride, dst->y_buffer, dst->y_stride, width,
              height);
   const int width_uv = (width + src->subsampling_x) >> src->subsampling_x;
   const int height_uv = (height + src->subsampling_y) >> src->subsampling_y;
-  copy_plane(src->u_buffer, src->uv_stride, dst->u_buffer, dst->uv_stride,
+  copy_plane_lowbd(src->u_buffer, src->uv_stride, dst->u_buffer, dst->uv_stride,
              width_uv, height_uv);
-  copy_plane(src->v_buffer, src->uv_stride, dst->v_buffer, dst->uv_stride,
+  copy_plane_lowbd(src->v_buffer, src->uv_stride, dst->v_buffer, dst->uv_stride,
              width_uv, height_uv);
 }
+static void zero_plane_lowbd(uint8_t *dst, int dst_stride, int h) {
+  for (int row = 0; row < h; row++) {
+    memset(dst, 0, dst_stride);
+    dst += dst_stride;
+  }
+}
+
+static void zero_img_lowbd(YV12_BUFFER_CONFIG *dst) {
+  zero_plane_lowbd(dst->y_buffer, dst->y_stride, dst->y_height);
+  zero_plane_lowbd(dst->u_buffer, dst->uv_stride, dst->uv_height);
+  zero_plane_lowbd(dst->v_buffer, dst->uv_stride, dst->uv_height);
+}
+
+
+
 
-static void zero_plane(uint8_t *dst, int dst_stride, int h) {
+// Copies a w x h block of 16-bit samples; w and both strides are in samples.
+static void copy_plane_highbd(const uint16_t *src, int src_stride,
+                              uint16_t *dst, int dst_stride, int w, int h) {
+  for (int row = 0; row < h; row++) {
+    memcpy(dst, src, w * sizeof(*dst));  // memcpy takes a byte count
+    src += src_stride;
+    dst += dst_stride;
+  }
+}
+
+static void copy_img_highbd(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst,
+                     int width, int height) {
+  copy_plane_highbd(CONVERT_TO_SHORTPTR(src->y_buffer), src->y_stride, CONVERT_TO_SHORTPTR(dst->y_buffer), dst->y_stride, width,
+             height);
+  const int width_uv = (width + src->subsampling_x) >> src->subsampling_x;
+  const int height_uv = (height + src->subsampling_y) >> src->subsampling_y;
+  copy_plane_highbd(CONVERT_TO_SHORTPTR(src->u_buffer), src->uv_stride, CONVERT_TO_SHORTPTR(dst->u_buffer), dst->uv_stride,
+             width_uv, height_uv);
+  copy_plane_highbd(CONVERT_TO_SHORTPTR(src->v_buffer), src->uv_stride, CONVERT_TO_SHORTPTR(dst->v_buffer), dst->uv_stride,
+             width_uv, height_uv);
+}
+
+static void zero_plane_highbd(uint16_t *dst, int dst_stride, int h) {
   for (int row = 0; row < h; row++) {
-    memset(dst, 0, dst_stride);
+    memset(dst, 0, dst_stride * sizeof(*dst));  // stride counts samples
     dst += dst_stride;
   }
 }
 
-static void zero_img(YV12_BUFFER_CONFIG *dst) {
-  zero_plane(dst->y_buffer, dst->y_stride, dst->y_height);
-  zero_plane(dst->u_buffer, dst->uv_stride, dst->uv_height);
-  zero_plane(dst->v_buffer, dst->uv_stride, dst->uv_height);
+static void zero_img_highbd(YV12_BUFFER_CONFIG *dst) {
+  zero_plane_highbd(CONVERT_TO_SHORTPTR(dst->y_buffer), dst->y_stride, dst->y_height);
+  zero_plane_highbd(CONVERT_TO_SHORTPTR(dst->u_buffer), dst->uv_stride, dst->uv_height);
+  zero_plane_highbd(CONVERT_TO_SHORTPTR(dst->v_buffer), dst->uv_stride, dst->uv_height);
 }
 
 void av1_setup_butteraugli_source(AV1_COMP *cpi) {
@@ -223,9 +319,15 @@ void av1_setup_butteraugli_source(AV1_COMP *cpi) {
   av1_resize_and_extend_frame_nonnormative(cpi->source, resized_dst, bit_depth,
                                            av1_num_planes(cm));
 
-  zero_img(cpi->source);
-  copy_img(resized_dst, cpi->source, width / resize_factor,
-           height / resize_factor);
+  if (cm->seq_params->use_highbitdepth) {
+    zero_img_highbd(cpi->source);
+    copy_img_highbd(resized_dst, cpi->source, width / resize_factor,
+            height / resize_factor);
+  } else {
+    zero_img_lowbd(cpi->source);
+    copy_img_lowbd(resized_dst, cpi->source, width / resize_factor,
+            height / resize_factor);
+  }
 }
 
 void av1_setup_butteraugli_rdmult_and_restore_source(AV1_COMP *cpi, double K) {
@@ -242,8 +344,14 @@ void av1_setup_butteraugli_rdmult_and_restore_source(AV1_COMP *cpi, double K) {
       &resized_recon, width / resize_factor, height / resize_factor, ss_x, ss_y,
       cm->seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels,
       cm->features.byte_alignment);
-  copy_img(&cpi->common.cur_frame->buf, &resized_recon, width / resize_factor,
-           height / resize_factor);
+
+  if (cm->seq_params->use_highbitdepth) {
+    copy_img_highbd(&cpi->common.cur_frame->buf, &resized_recon, width / resize_factor,
+            height / resize_factor);
+  } else {
+    copy_img_lowbd(&cpi->common.cur_frame->buf, &resized_recon, width / resize_factor,
+            height / resize_factor);
+  }
 
   set_mb_butteraugli_rdmult_scaling(cpi, &cpi->butteraugli_info.resized_source,
                                     &resized_recon, K);
@@ -262,13 +370,15 @@ void av1_setup_butteraugli_rdmult(AV1_COMP *cpi) {
   av1_set_frame_size(cpi, cm->superres_upscaled_width,
                      cm->superres_upscaled_height);
 
-  cpi->source =
-      av1_scale_if_required(cm, cpi->unscaled_source, &cpi->scaled_source,
-                            cm->features.interp_filter, 0, false, false);
+  cpi->source = av1_realloc_and_scale_if_required(
+      cm, cpi->unscaled_source, &cpi->scaled_source, cm->features.interp_filter,
+      0, false, false, cpi->oxcf.border_in_pixels,
+      cpi->oxcf.tool_cfg.enable_global_motion);
   if (cpi->unscaled_last_source != NULL) {
-    cpi->last_source = av1_scale_if_required(
+    cpi->last_source = av1_realloc_and_scale_if_required(
         cm, cpi->unscaled_last_source, &cpi->scaled_last_source,
-        cm->features.interp_filter, 0, false, false);
+        cm->features.interp_filter, 0, false, false, cpi->oxcf.border_in_pixels,
+        cpi->oxcf.tool_cfg.enable_global_motion);
   }
 
   av1_setup_butteraugli_source(cpi);
@@ -295,7 +405,7 @@ void av1_setup_butteraugli_rdmult(AV1_COMP *cpi) {
   // cpi->sf.part_sf.fixed_partition_size = BLOCK_32X32;
 
   av1_set_quantizer(cpi, q_cfg->qm_minlevel, q_cfg->qm_maxlevel, q_index,
-                    q_cfg->enable_chroma_deltaq);
+                    q_cfg->enable_chroma_deltaq, q_cfg->enable_hdr_deltaq);
   av1_set_speed_features_qindex_dependent(cpi, oxcf->speed);
   if (q_cfg->deltaq_mode != NO_DELTA_Q || q_cfg->enable_chroma_deltaq)
     av1_init_quantizer(&cpi->enc_quant_dequant_params, &cm->quant_params,
diff --git a/av1/encoder/txb_rdopt.c b/av1/encoder/txb_rdopt.c
index 77bc3cd298..e3fe72a0d8 100644
--- a/av1/encoder/txb_rdopt.c
+++ b/av1/encoder/txb_rdopt.c
@@ -241,10 +241,11 @@ static AOM_FORCE_INLINE void update_coeff_eob(
 static INLINE void update_skip(int *accu_rate, int64_t accu_dist, int *eob,
                                int nz_num, int *nz_ci, int64_t rdmult,
                                int skip_cost, int non_skip_cost,
-                               tran_low_t *qcoeff, tran_low_t *dqcoeff) {
+                               tran_low_t *qcoeff, tran_low_t *dqcoeff,
+                               int sharpness) {
   const int64_t rd = RDCOST(rdmult, *accu_rate + non_skip_cost, accu_dist);
   const int64_t rd_new_eob = RDCOST(rdmult, skip_cost, 0);
-  if (rd_new_eob < rd) {
+  if (rd_new_eob < rd && sharpness == 0) {
     for (int i = 0; i < nz_num; ++i) {
       const int ci = nz_ci[i];
       qcoeff[ci] = 0;
@@ -329,7 +330,7 @@ int av1_optimize_txb(const struct AV1_COMP *cpi, MACROBLOCK *x, int plane,
   const LV_MAP_EOB_COST *txb_eob_costs =
       &coeff_costs->eob_costs[eob_multi_size][plane_type];
 
-  const int rshift = 2;
+  const int rshift = sharpness + 2;
 
   const int64_t rdmult =
       (((int64_t)x->rdmult *
@@ -395,9 +396,9 @@ int av1_optimize_txb(const struct AV1_COMP *cpi, MACROBLOCK *x, int plane,
     default: assert(false);
   }
 
-  if (si == -1 && nz_num <= max_nz_num && sharpness == 0) {
+  if (si == -1 && nz_num <= max_nz_num) {
     update_skip(&accu_rate, accu_dist, &eob, nz_num, nz_ci, rdmult, skip_cost,
-                non_skip_cost, qcoeff, dqcoeff);
+                non_skip_cost, qcoeff, dqcoeff, sharpness);
   }
 
 #define UPDATE_COEFF_SIMPLE_CASE(tx_class_literal)                             \