diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUPromoteMatmulOperands.cpp b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUPromoteMatmulOperands.cpp
index d5a4a280e83a..dd498fad50e8 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUPromoteMatmulOperands.cpp
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUPromoteMatmulOperands.cpp
@@ -44,6 +44,15 @@ void promoteOperand(OpBuilder &builder, Operation *op, unsigned index) {
   Value operand = op->getOperand(index);
 
   if (auto producer = operand.getDefiningOp()) {
+    // Skip promotion of fills.
+    if (isa<linalg::FillOp>(producer)) {
+      return;
+    }
+    if (auto generic = dyn_cast<linalg::GenericOp>(&*producer)) {
+      if (linalg::isaFillOpInterface(generic)) {
+        return;
+      }
+    }
     setLoweringConfig(producer, IREE::GPU::DerivedThreadConfigAttr::get(
                                     builder.getContext()));
     return;
diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_promote_matmul_operands.mlir b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_promote_matmul_operands.mlir
index 5ec02698451a..f05cf7b1890b 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_promote_matmul_operands.mlir
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_promote_matmul_operands.mlir
@@ -64,3 +64,21 @@ func.func @lhs_only_matmul(%a: tensor<32x1024xf32>, %b: tensor<1024x128xf32>) -> tensor<32x128xf32> {
 // CHECK-SAME: %[[B:[A-Za-z0-9]+]]: tensor<1024x128xf32>
 // CHECK-DAG: %[[PA:.+]] = linalg.copy {{.*}} ins(%[[A]] : tensor<32x1024xf32>)
 // CHECK: linalg.generic {{.*}} ins(%[[PA]], %[[B]] : tensor<32x1024xf32>, tensor<1024x128xf32>)
+
+// -----
+
+#lowering_config = #iree_gpu.lowering_config<{promote_operands = [0]}>
+
+func.func @no_promote_fill(%b: tensor<128x128xf32>) -> tensor<4x128xf32> {
+  %cst = arith.constant 0.000000e+00 : f32
+  %empty = tensor.empty() : tensor<4x128xf32>
+  %fill = linalg.fill ins(%cst : f32) outs(%empty : tensor<4x128xf32>) -> tensor<4x128xf32>
+  %mm = linalg.matmul {lowering_config = #lowering_config}
+    ins(%fill, %b : tensor<4x128xf32>, tensor<128x128xf32>) outs(%fill : tensor<4x128xf32>) -> tensor<4x128xf32>
+  return %mm : tensor<4x128xf32>
+}
+
+// Verify that fills are not promoted.
+// CHECK-LABEL: func.func @no_promote_fill
+// CHECK-NOT: iree_gpu.derived_thread_config
+// CHECK: return