From a59c93980d97f6216917415ae25f3ac88e64cbb4 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 28 Aug 2024 12:23:28 +0100 Subject: [PATCH] Add transparency and >8bits images support to AVIF decoder (#8613) --- test/test_image.py | 60 +++++++++++++++++++ torchvision/csrc/io/image/cpu/decode_avif.cpp | 43 +++++++++---- torchvision/csrc/io/image/cpu/decode_avif.h | 5 +- .../csrc/io/image/cpu/decode_image.cpp | 2 +- torchvision/csrc/io/image/image.cpp | 3 +- torchvision/io/image.py | 23 ++++++- 6 files changed, 122 insertions(+), 14 deletions(-) diff --git a/test/test_image.py b/test/test_image.py index 8bc18ccf26b..f1fe70135fe 100644 --- a/test/test_image.py +++ b/test/test_image.py @@ -928,5 +928,65 @@ def test_decode_avif(decode_fun, scripted): assert img[None].is_contiguous(memory_format=torch.channels_last) +@pytest.mark.xfail(reason="AVIF support not enabled yet.") +# Note: decode_image fails because some of these files have a (valid) signature +# we don't recognize. We should probably use libmagic.... +# @pytest.mark.parametrize("decode_fun", (_decode_avif, decode_image)) +@pytest.mark.parametrize("decode_fun", (_decode_avif,)) +@pytest.mark.parametrize("scripted", (False, True)) +@pytest.mark.parametrize( + "mode, pil_mode", + ( + (ImageReadMode.RGB, "RGB"), + (ImageReadMode.RGB_ALPHA, "RGBA"), + (ImageReadMode.UNCHANGED, None), + ), +) +@pytest.mark.parametrize("filename", Path("/home/nicolashug/dev/libavif/tests/data/").glob("*.avif")) +def test_decode_avif_against_pil(decode_fun, scripted, mode, pil_mode, filename): + if "reversed_dimg_order" in str(filename): + # Pillow properly decodes this one, but we don't (order of parts of the + # image is wrong). This is due to a bug that was recently fixed in + # libavif. 
Hopefully this test will end up passing soon with a new + # libavif version https://github.com/AOMediaCodec/libavif/issues/2311 + pytest.xfail() + import pillow_avif # noqa + + encoded_bytes = read_file(filename) + if scripted: + decode_fun = torch.jit.script(decode_fun) + try: + img = decode_fun(encoded_bytes, mode=mode) + except RuntimeError as e: + if any( + s in str(e) + for s in ("BMFF parsing failed", "avifDecoderParse failed: ", "file contains more than one image") + ): + pytest.skip(reason="Expected failure, that's OK") + else: + raise e + assert img[None].is_contiguous(memory_format=torch.channels_last) + if mode == ImageReadMode.RGB: + assert img.shape[0] == 3 + if mode == ImageReadMode.RGB_ALPHA: + assert img.shape[0] == 4 + if img.dtype == torch.uint16: + img = F.to_dtype(img, dtype=torch.uint8, scale=True) + + from_pil = F.pil_to_tensor(Image.open(filename).convert(pil_mode)) + if False: + from torchvision.utils import make_grid + + g = make_grid([img, from_pil]) + F.to_pil_image(g).save((f"/home/nicolashug/out_images/{filename.name}.{pil_mode}.png")) + if mode != ImageReadMode.RGB: + # We don't compare against PIL for RGB because results look pretty + # different on RGBA images (other images are fine). The result on + # torchvision basically just plainly ignores the alpha channel, resulting + # in transparent pixels looking dark. PIL seems to be using a sort of + # k-nn thing, looking at the output. Take a look at the resulting images. 
+ torch.testing.assert_close(img, from_pil, rtol=0, atol=3) + + if __name__ == "__main__": pytest.main([__file__]) diff --git a/torchvision/csrc/io/image/cpu/decode_avif.cpp b/torchvision/csrc/io/image/cpu/decode_avif.cpp index ec136743806..5752f04a448 100644 --- a/torchvision/csrc/io/image/cpu/decode_avif.cpp +++ b/torchvision/csrc/io/image/cpu/decode_avif.cpp @@ -8,7 +8,9 @@ namespace vision { namespace image { #if !AVIF_FOUND -torch::Tensor decode_avif(const torch::Tensor& data) { +torch::Tensor decode_avif( + const torch::Tensor& encoded_data, + ImageReadMode mode) { TORCH_CHECK( false, "decode_avif: torchvision not compiled with libavif support"); } @@ -23,7 +25,9 @@ struct UniquePtrDeleter { }; using DecoderPtr = std::unique_ptr; -torch::Tensor decode_avif(const torch::Tensor& encoded_data) { +torch::Tensor decode_avif( + const torch::Tensor& encoded_data, + ImageReadMode mode) { // This is based on // https://github.com/AOMediaCodec/libavif/blob/main/examples/avif_example_decode_memory.c // Refer there for more detail about what each function does, and which @@ -58,9 +62,6 @@ torch::Tensor decode_avif(const torch::Tensor& encoded_data) { avifResultToString(result)); TORCH_CHECK( decoder->imageCount == 1, "Avif file contains more than one image"); - TORCH_CHECK( - decoder->image->depth <= 8, - "avif images with bitdepth > 8 are not supported"); result = avifDecoderNextImage(decoder.get()); TORCH_CHECK( @@ -68,14 +69,36 @@ torch::Tensor decode_avif(const torch::Tensor& encoded_data) { "avifDecoderNextImage failed:", avifResultToString(result)); - auto out = torch::empty( - {decoder->image->height, decoder->image->width, 3}, torch::kUInt8); - avifRGBImage rgb; memset(&rgb, 0, sizeof(rgb)); avifRGBImageSetDefaults(&rgb, decoder->image); - rgb.format = AVIF_RGB_FORMAT_RGB; - rgb.pixels = out.data_ptr(); + + // images encoded as 10 or 12 bits will be decoded as uint16. The rest are + // decoded as uint8. 
+ auto use_uint8 = (decoder->image->depth <= 8); + rgb.depth = use_uint8 ? 8 : 16; + + if (mode != IMAGE_READ_MODE_UNCHANGED && mode != IMAGE_READ_MODE_RGB && + mode != IMAGE_READ_MODE_RGB_ALPHA) { + // Other modes aren't supported, but we don't error or even warn because we + // have generic entry points like decode_image which may support all modes, + // it just depends on the underlying decoder. + mode = IMAGE_READ_MODE_UNCHANGED; + } + + // If return_rgb is false it means we return rgba - nothing else. + auto return_rgb = + (mode == IMAGE_READ_MODE_RGB || + (mode == IMAGE_READ_MODE_UNCHANGED && !decoder->alphaPresent)); + + auto num_channels = return_rgb ? 3 : 4; + rgb.format = return_rgb ? AVIF_RGB_FORMAT_RGB : AVIF_RGB_FORMAT_RGBA; + rgb.ignoreAlpha = return_rgb ? AVIF_TRUE : AVIF_FALSE; + + auto out = torch::empty( + {rgb.height, rgb.width, num_channels}, + use_uint8 ? torch::kUInt8 : at::kUInt16); + rgb.pixels = (uint8_t*)out.data_ptr(); rgb.rowBytes = rgb.width * avifRGBImagePixelSize(&rgb); result = avifImageYUVToRGB(decoder->image, &rgb); diff --git a/torchvision/csrc/io/image/cpu/decode_avif.h b/torchvision/csrc/io/image/cpu/decode_avif.h index 269bce52197..0510c2104e5 100644 --- a/torchvision/csrc/io/image/cpu/decode_avif.h +++ b/torchvision/csrc/io/image/cpu/decode_avif.h @@ -1,11 +1,14 @@ #pragma once #include +#include "../image_read_mode.h" namespace vision { namespace image { -C10_EXPORT torch::Tensor decode_avif(const torch::Tensor& data); +C10_EXPORT torch::Tensor decode_avif( + const torch::Tensor& encoded_data, + ImageReadMode mode = IMAGE_READ_MODE_UNCHANGED); } // namespace image } // namespace vision diff --git a/torchvision/csrc/io/image/cpu/decode_image.cpp b/torchvision/csrc/io/image/cpu/decode_image.cpp index 0bc9d4396a5..e5a421b7287 100644 --- a/torchvision/csrc/io/image/cpu/decode_image.cpp +++ b/torchvision/csrc/io/image/cpu/decode_image.cpp @@ -58,7 +58,7 @@ torch::Tensor decode_image( 0x66, 0x74, 0x79, 0x70, 0x61, 0x76, 0x69, 
0x66}; // == "ftypavif" TORCH_CHECK(data.numel() >= 12, err_msg); if ((memcmp(avif_signature, datap + 4, 8) == 0)) { - return decode_avif(data); + return decode_avif(data, mode); } const uint8_t webp_signature_begin[4] = {0x52, 0x49, 0x46, 0x46}; // == "RIFF" diff --git a/torchvision/csrc/io/image/image.cpp b/torchvision/csrc/io/image/image.cpp index 091e1d21302..a777d19d3bd 100644 --- a/torchvision/csrc/io/image/image.cpp +++ b/torchvision/csrc/io/image/image.cpp @@ -23,7 +23,8 @@ static auto registry = &decode_jpeg) .op("image::decode_webp(Tensor encoded_data, int mode) -> Tensor", &decode_webp) - .op("image::decode_avif", &decode_avif) + .op("image::decode_avif(Tensor encoded_data, int mode) -> Tensor", + &decode_avif) .op("image::encode_jpeg", &encode_jpeg) .op("image::read_file", &read_file) .op("image::write_file", &write_file) diff --git a/torchvision/io/image.py b/torchvision/io/image.py index df3e44ab713..e169c0a4f7a 100644 --- a/torchvision/io/image.py +++ b/torchvision/io/image.py @@ -394,7 +394,28 @@ def decode_webp( def _decode_avif( input: torch.Tensor, + mode: ImageReadMode = ImageReadMode.UNCHANGED, ) -> torch.Tensor: + """ + Decode an AVIF image into a 3 dimensional RGB[A] Tensor. + + The values of the output tensor are in uint8 in [0, 255] for most images. If + the image has a bit-depth of more than 8, then the output tensor is uint16 + in [0, 65535]. Since uint16 support is limited in pytorch, we recommend + calling :func:`torchvision.transforms.v2.functional.to_dtype()` with + ``scale=True`` after this function to convert the decoded image into a uint8 + or float tensor. + + Args: + input (Tensor[1]): a one dimensional contiguous uint8 tensor containing + the raw bytes of the AVIF image. + mode (ImageReadMode): The read mode used for optionally + converting the image color space. Default: ``ImageReadMode.UNCHANGED``. + Other supported values are ``ImageReadMode.RGB`` and ``ImageReadMode.RGB_ALPHA``. 
+ + Returns: + Decoded image (Tensor[image_channels, image_height, image_width]) + """ if not torch.jit.is_scripting() and not torch.jit.is_tracing(): _log_api_usage_once(decode_webp) - return torch.ops.image.decode_avif(input) + return torch.ops.image.decode_avif(input, mode.value)