From a59c93980d97f6216917415ae25f3ac88e64cbb4 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 28 Aug 2024 12:23:28 +0100 Subject: [PATCH] Add transparency and >8bits images support to AVIF decoder (#8613) --- test/test_image.py | 60 +++++++++++++++++++ torchvision/csrc/io/image/cpu/decode_avif.cpp | 43 +++++++++---- torchvision/csrc/io/image/cpu/decode_avif.h | 5 +- .../csrc/io/image/cpu/decode_image.cpp | 2 +- torchvision/csrc/io/image/image.cpp | 3 +- torchvision/io/image.py | 23 ++++++- 6 files changed, 122 insertions(+), 14 deletions(-) diff --git a/test/test_image.py b/test/test_image.py index 8bc18ccf26b..f1fe70135fe 100644 --- a/test/test_image.py +++ b/test/test_image.py @@ -928,5 +928,65 @@ def test_decode_avif(decode_fun, scripted): assert img[None].is_contiguous(memory_format=torch.channels_last) +@pytest.mark.xfail(reason="AVIF support not enabled yet.") +# Note: decode_image fails because some of these files have a (valid) signature +# we don't recognize. We should probably use libmagic.... +# @pytest.mark.parametrize("decode_fun", (_decode_avif, decode_image)) +@pytest.mark.parametrize("decode_fun", (_decode_avif,)) +@pytest.mark.parametrize("scripted", (False, True)) +@pytest.mark.parametrize( + "mode, pil_mode", + ( + (ImageReadMode.RGB, "RGB"), + (ImageReadMode.RGB_ALPHA, "RGBA"), + (ImageReadMode.UNCHANGED, None), + ), +) +@pytest.mark.parametrize("filename", Path("/home/nicolashug/dev/libavif/tests/data/").glob("*.avif")) +def test_decode_avif_against_pil(decode_fun, scripted, mode, pil_mode, filename): + if "reversed_dimg_order" in str(filename): + # Pillow properly decodes this one, but we don't (order of parts of the + # image is wrong). This is due to a bug that was recently fixed in + # libavif. 
Hopefully this test will end up passing soon with a new + # libavif version https://github.com/AOMediaCodec/libavif/issues/2311 + pytest.xfail() + import pillow_avif # noqa + + encoded_bytes = read_file(filename) + if scripted: + decode_fun = torch.jit.script(decode_fun) + try: + img = decode_fun(encoded_bytes, mode=mode) + except RuntimeError as e: + if any( + s in str(e) + for s in ("BMFF parsing failed", "avifDecoderParse failed: ", "file contains more than one image") + ): + pytest.skip(reason="Expected failure, that's OK") + else: + raise e + assert img[None].is_contiguous(memory_format=torch.channels_last) + if mode == ImageReadMode.RGB: + assert img.shape[0] == 3 + if mode == ImageReadMode.RGB_ALPHA: + assert img.shape[0] == 4 + if img.dtype == torch.uint16: + img = F.to_dtype(img, dtype=torch.uint8, scale=True) + + from_pil = F.pil_to_tensor(Image.open(filename).convert(pil_mode)) + if False: + from torchvision.utils import make_grid + + g = make_grid([img, from_pil]) + F.to_pil_image(g).save((f"/home/nicolashug/out_images/{filename.name}.{pil_mode}.png")) + if mode != ImageReadMode.RGB: + # We don't compare against PIL for RGB because results look pretty + # different on RGBA images (other images are fine). The result on + # torchvision basically just plainly ignores the alpha channel, resulting + # in transparent pixels looking dark. PIL seems to be using a sort of + # k-nn thing, looking at the output. Take a look at the resulting images. 
+ torch.testing.assert_close(img, from_pil, rtol=0, atol=3) + + if __name__ == "__main__": pytest.main([__file__]) diff --git a/torchvision/csrc/io/image/cpu/decode_avif.cpp b/torchvision/csrc/io/image/cpu/decode_avif.cpp index ec136743806..5752f04a448 100644 --- a/torchvision/csrc/io/image/cpu/decode_avif.cpp +++ b/torchvision/csrc/io/image/cpu/decode_avif.cpp @@ -8,7 +8,9 @@ namespace vision { namespace image { #if !AVIF_FOUND -torch::Tensor decode_avif(const torch::Tensor& data) { +torch::Tensor decode_avif( + const torch::Tensor& encoded_data, + ImageReadMode mode) { TORCH_CHECK( false, "decode_avif: torchvision not compiled with libavif support"); } @@ -23,7 +25,9 @@ struct UniquePtrDeleter { }; using DecoderPtr = std::unique_ptr; -torch::Tensor decode_avif(const torch::Tensor& encoded_data) { +torch::Tensor decode_avif( + const torch::Tensor& encoded_data, + ImageReadMode mode) { // This is based on // https://github.com/AOMediaCodec/libavif/blob/main/examples/avif_example_decode_memory.c // Refer there for more detail about what each function does, and which @@ -58,9 +62,6 @@ torch::Tensor decode_avif(const torch::Tensor& encoded_data) { avifResultToString(result)); TORCH_CHECK( decoder->imageCount == 1, "Avif file contains more than one image"); - TORCH_CHECK( - decoder->image->depth <= 8, - "avif images with bitdepth > 8 are not supported"); result = avifDecoderNextImage(decoder.get()); TORCH_CHECK( @@ -68,14 +69,36 @@ torch::Tensor decode_avif(const torch::Tensor& encoded_data) { "avifDecoderNextImage failed:", avifResultToString(result)); - auto out = torch::empty( - {decoder->image->height, decoder->image->width, 3}, torch::kUInt8); - avifRGBImage rgb; memset(&rgb, 0, sizeof(rgb)); avifRGBImageSetDefaults(&rgb, decoder->image); - rgb.format = AVIF_RGB_FORMAT_RGB; - rgb.pixels = out.data_ptr(); + + // images encoded as 10 or 12 bits will be decoded as uint16. The rest are + // decoded as uint8. 
+ auto use_uint8 = (decoder->image->depth <= 8); + rgb.depth = use_uint8 ? 8 : 16; + + if (mode != IMAGE_READ_MODE_UNCHANGED && mode != IMAGE_READ_MODE_RGB && + mode != IMAGE_READ_MODE_RGB_ALPHA) { + // Other modes aren't supported, but we don't error or even warn because we + // have generic entry points like decode_image which may support all modes, + // it just depends on the underlying decoder. + mode = IMAGE_READ_MODE_UNCHANGED; + } + + // If return_rgb is false it means we return rgba - nothing else. + auto return_rgb = + (mode == IMAGE_READ_MODE_RGB || + (mode == IMAGE_READ_MODE_UNCHANGED && !decoder->alphaPresent)); + + auto num_channels = return_rgb ? 3 : 4; + rgb.format = return_rgb ? AVIF_RGB_FORMAT_RGB : AVIF_RGB_FORMAT_RGBA; + rgb.ignoreAlpha = return_rgb ? AVIF_TRUE : AVIF_FALSE; + + auto out = torch::empty( + {rgb.height, rgb.width, num_channels}, + use_uint8 ? torch::kUInt8 : at::kUInt16); + rgb.pixels = (uint8_t*)out.data_ptr(); rgb.rowBytes = rgb.width * avifRGBImagePixelSize(&rgb); result = avifImageYUVToRGB(decoder->image, &rgb); diff --git a/torchvision/csrc/io/image/cpu/decode_avif.h b/torchvision/csrc/io/image/cpu/decode_avif.h index 269bce52197..0510c2104e5 100644 --- a/torchvision/csrc/io/image/cpu/decode_avif.h +++ b/torchvision/csrc/io/image/cpu/decode_avif.h @@ -1,11 +1,14 @@ #pragma once #include +#include "../image_read_mode.h" namespace vision { namespace image { -C10_EXPORT torch::Tensor decode_avif(const torch::Tensor& data); +C10_EXPORT torch::Tensor decode_avif( + const torch::Tensor& encoded_data, + ImageReadMode mode = IMAGE_READ_MODE_UNCHANGED); } // namespace image } // namespace vision diff --git a/torchvision/csrc/io/image/cpu/decode_image.cpp b/torchvision/csrc/io/image/cpu/decode_image.cpp index 0bc9d4396a5..e5a421b7287 100644 --- a/torchvision/csrc/io/image/cpu/decode_image.cpp +++ b/torchvision/csrc/io/image/cpu/decode_image.cpp @@ -58,7 +58,7 @@ torch::Tensor decode_image( 0x66, 0x74, 0x79, 0x70, 0x61, 0x76, 0x69, 
0x66}; // == "ftypavif" TORCH_CHECK(data.numel() >= 12, err_msg); if ((memcmp(avif_signature, datap + 4, 8) == 0)) { - return decode_avif(data); + return decode_avif(data, mode); } const uint8_t webp_signature_begin[4] = {0x52, 0x49, 0x46, 0x46}; // == "RIFF" diff --git a/torchvision/csrc/io/image/image.cpp b/torchvision/csrc/io/image/image.cpp index 091e1d21302..a777d19d3bd 100644 --- a/torchvision/csrc/io/image/image.cpp +++ b/torchvision/csrc/io/image/image.cpp @@ -23,7 +23,8 @@ static auto registry = &decode_jpeg) .op("image::decode_webp(Tensor encoded_data, int mode) -> Tensor", &decode_webp) - .op("image::decode_avif", &decode_avif) + .op("image::decode_avif(Tensor encoded_data, int mode) -> Tensor", + &decode_avif) .op("image::encode_jpeg", &encode_jpeg) .op("image::read_file", &read_file) .op("image::write_file", &write_file) diff --git a/torchvision/io/image.py b/torchvision/io/image.py index df3e44ab713..e169c0a4f7a 100644 --- a/torchvision/io/image.py +++ b/torchvision/io/image.py @@ -394,7 +394,28 @@ def decode_webp( def _decode_avif( input: torch.Tensor, + mode: ImageReadMode = ImageReadMode.UNCHANGED, ) -> torch.Tensor: + """ + Decode an AVIF image into a 3 dimensional RGB[A] Tensor. + + The values of the output tensor are in uint8 in [0, 255] for most images. If + the image has a bit-depth of more than 8, then the output tensor is uint16 + in [0, 65535]. Since uint16 support is limited in pytorch, we recommend + calling :func:`torchvision.transforms.v2.functional.to_dtype()` with + ``scale=True`` after this function to convert the decoded image into a uint8 + or float tensor. + + Args: + input (Tensor[1]): a one dimensional contiguous uint8 tensor containing + the raw bytes of the AVIF image. + mode (ImageReadMode): The read mode used for optionally + converting the image color space. Default: ``ImageReadMode.UNCHANGED``. + Other supported values are ``ImageReadMode.RGB`` and ``ImageReadMode.RGB_ALPHA``. 
+ + Returns: + Decoded image (Tensor[image_channels, image_height, image_width]) + """ if not torch.jit.is_scripting() and not torch.jit.is_tracing(): _log_api_usage_once(decode_webp) - return torch.ops.image.decode_avif(input) + return torch.ops.image.decode_avif(input, mode.value)