Skip to content

Commit

Permalink
fix: use new barcode normalization
Browse files Browse the repository at this point in the history
- change the way we split barcodes to create directory hierarchy
- use images subdomain for both images and OCRs
  • Loading branch information
raphael0202 committed Oct 4, 2024
1 parent 15252d7 commit b49b362
Show file tree
Hide file tree
Showing 3 changed files with 142 additions and 11 deletions.
43 changes: 34 additions & 9 deletions openfoodfacts/images.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import logging
import re
from pathlib import Path
from typing import List, Optional, Tuple, Union
from urllib.parse import urlparse
Expand All @@ -12,7 +11,6 @@
logger = logging.getLogger(__name__)


BARCODE_PATH_REGEX = re.compile(r"^(...)(...)(...)(.*)$")
# Base URL of the public Open Food Facts S3 bucket
AWS_S3_BASE_URL = "https://openfoodfacts-images.s3.eu-west-3.amazonaws.com/data"

Expand Down Expand Up @@ -43,15 +41,21 @@ def split_barcode(barcode: str) -> List[str]:
if not barcode.isdigit():
raise ValueError(f"unknown barcode format: {barcode}")

match = BARCODE_PATH_REGEX.fullmatch(barcode)
# Pad the barcode with leading zeros so that it has at least 13 digits
barcode = barcode.zfill(13)
# Split the first 9 digits of the barcode into 3 groups of 3 digits to
# get the first 3 folder names
splits = [barcode[0:3], barcode[3:6], barcode[6:9], barcode[9:13]]

splits = [x for x in match.groups() if x] if match else [barcode]
# use the rest of the barcode as the last folder name
if len(barcode) > 13:
splits.append(barcode[13:])

if org_id is not None:
# For the pro platform only, images and OCRs belonging to an org
# are stored in a folder named after the org for all its products, ex:
# https://images.pro.openfoodfacts.org/images/products/org-lea-nature/330/713/080/3004/1.jpg
splits.append(org_id)
splits.insert(0, org_id)

return splits

Expand Down Expand Up @@ -103,9 +107,8 @@ def generate_json_ocr_url(
Environment.org
:return: the generated JSON URL
"""
return (
URLBuilder.static(flavor, environment)
+ f"/images/products{generate_json_ocr_path(code, image_id)}"
return URLBuilder.image_url(
flavor, environment, generate_json_ocr_path(code, image_id)
)


Expand Down Expand Up @@ -141,6 +144,17 @@ def extract_barcode_from_url(url: str) -> Optional[str]:


def extract_barcode_from_path(path: str) -> Optional[str]:
"""Extract a product barcode from an image/OCR path.
The barcode is normalized using the following rules:
- all leading zeros are stripped
- if the barcode has 8 digits or fewer, it is left-padded with zeros up to
8 digits
- if the barcode has more than 8 digits but fewer than 13 digits, it is
left-padded with zeros up to 13 digits
- if the barcode has 13 digits or more, it is returned as is
"""
barcode = ""

for parent in Path(path).parents:
Expand All @@ -149,7 +163,18 @@ def extract_barcode_from_path(path: str) -> Optional[str]:
else:
break

return barcode or None
# Strip leading zeros
barcode = barcode.lstrip("0")

if not barcode:
return None

if len(barcode) <= 8:
barcode = barcode.zfill(8)
return barcode

barcode = barcode.zfill(13)
return barcode


def extract_source_from_url(url: str) -> str:
Expand Down
2 changes: 1 addition & 1 deletion openfoodfacts/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,7 @@ def static(flavor: Flavor, environment: Environment) -> str:
@staticmethod
def image_url(flavor: Flavor, environment: Environment, image_path: str) -> str:
prefix = URLBuilder._get_url(
prefix="static", tld=environment.value, base_domain=flavor.get_base_domain()
prefix="images", tld=environment.value, base_domain=flavor.get_base_domain()
)
return prefix + f"/images/products{image_path}"

Expand Down
108 changes: 107 additions & 1 deletion tests/test_images.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,13 @@

import pytest

from openfoodfacts.images import extract_barcode_from_url, extract_source_from_url
from openfoodfacts.images import (
extract_barcode_from_url,
extract_source_from_url,
generate_image_url,
generate_json_ocr_url,
)
from openfoodfacts.types import Environment, Flavor


@pytest.mark.parametrize(
Expand Down Expand Up @@ -43,3 +49,103 @@ def test_get_barcode_from_url(url: str, output: Optional[str]):
)
def test_get_source_from_url(url: str, output: str):
assert extract_source_from_url(url) == output


@pytest.mark.parametrize(
    "code,image_id,flavor,environment,expected",
    [
        (
            "5410126726954",
            "1",
            Flavor.off,
            Environment.org,
            "https://images.openfoodfacts.org/images/products/541/012/672/6954/1.jpg",
        ),
        (
            "6539",
            "1",
            Flavor.off,
            Environment.org,
            "https://images.openfoodfacts.org/images/products/000/000/000/6539/1.jpg",
        ),
        (
            "12458465",
            "2.400",
            Flavor.obf,
            Environment.net,
            "https://images.openbeautyfacts.net/images/products/000/001/245/8465/2.400.jpg",
        ),
        (
            "org-lea-nature/5410126726954",
            "1",
            Flavor.off_pro,
            Environment.org,
            "https://images.pro.openfoodfacts.org/images/products/org-lea-nature/541/012/672/6954/1.jpg",
        ),
    ],
)
def test_generate_image_url(
    code: str,
    image_id: str,
    flavor: Flavor,
    environment: Environment,
    expected: str,
) -> None:
    """Check the image URL built for a product code and image ID.

    The cases cover a 13-digit barcode, barcodes shorter than 13 digits
    (whose expected path is zero-padded), a resized image ID ("2.400") on
    another flavor/environment, and an org-prefixed code on the pro
    platform.
    """
    generated_url = generate_image_url(code, image_id, flavor, environment)
    assert generated_url == expected


@pytest.mark.parametrize(
    "code,image_id,flavor,environment,expected",
    [
        (
            "5410126726954",
            "1",
            Flavor.off,
            Environment.org,
            "https://images.openfoodfacts.org/images/products/541/012/672/6954/1.json",
        ),
        (
            "6539",
            "1",
            Flavor.off,
            Environment.org,
            "https://images.openfoodfacts.org/images/products/000/000/000/6539/1.json",
        ),
        (
            "org-lea-nature/5410126726954",
            "1",
            Flavor.off_pro,
            Environment.org,
            "https://images.pro.openfoodfacts.org/images/products/org-lea-nature/541/012/672/6954/1.json",
        ),
    ],
)
def test_generate_json_ocr_url(
    code: str,
    image_id: str,
    flavor: Flavor,
    environment: Environment,
    expected: str,
) -> None:
    """Check the JSON OCR URL built for a product code and image ID.

    The cases cover a 13-digit barcode, a 4-digit barcode (whose expected
    path is zero-padded) and an org-prefixed code on the pro platform; every
    expected URL is on the ``images`` subdomain and ends in ``.json``.
    """
    generated_url = generate_json_ocr_url(code, image_id, flavor, environment)
    assert generated_url == expected


@pytest.mark.parametrize(
    "url,expected",
    [
        (
            "https://world.openfoodfacts.org/images/products/541/012/672/6954/1.jpg",
            "5410126726954",
        ),
        (
            "https://world.openbeautyfacts.net/images/products/000/000/001/6954/1.jpg",
            "00016954",
        ),
        (
            "https://world.openbeautyfacts.net/images/products/000/009/121/6954/1.jpg",
            "91216954",
        ),
        (
            "https://world.openbeautyfacts.net/images/products/000/019/121/6954/1.jpg",
            "0000191216954",
        ),
        (
            "https://world.openbeautyfacts.net/images/products/343/919/121/6954/1.jpg",
            "3439191216954",
        ),
        (
            "https://world.openbeautyfacts.net/images/products/343/919/121/6954862052/1.jpg",
            "3439191216954862052",
        ),
    ],
)
def test_extract_barcode_from_url(url: str, expected: str) -> None:
    """Check the normalized barcode recovered from an image URL.

    The cases cover zero-padded paths whose barcode comes back with 8
    digits, a 9-digit barcode that comes back padded to 13 digits, a plain
    13-digit barcode, and a barcode longer than 13 digits that is returned
    whole.
    """
    extracted_barcode = extract_barcode_from_url(url)
    assert extracted_barcode == expected

0 comments on commit b49b362

Please sign in to comment.