From 5bfaaad68ceabea2fbc9c8a445390064e6084989 Mon Sep 17 00:00:00 2001 From: tiago Date: Thu, 31 Oct 2024 16:20:43 +0000 Subject: [PATCH] (fix): refactored scryfall model --- src/mtg/migrations/0001_scryfall_data.py | 8 +-- src/mtg/models.py | 26 +++---- src/mtg/services.py | 44 ++++++++---- src/mtg/tasks.py | 87 +++++++++++++++++------- 4 files changed, 112 insertions(+), 53 deletions(-) diff --git a/src/mtg/migrations/0001_scryfall_data.py b/src/mtg/migrations/0001_scryfall_data.py index 4065dab..a7f8162 100644 --- a/src/mtg/migrations/0001_scryfall_data.py +++ b/src/mtg/migrations/0001_scryfall_data.py @@ -1,4 +1,4 @@ -# Generated by Django 5.1.2 on 2024-10-29 17:34 +# Generated by Django 5.1.2 on 2024-10-31 15:37 from django.db import migrations, models @@ -23,10 +23,8 @@ class Migration(migrations.Migration): ), ("obs", models.TextField(blank=True, verbose_name="Observations")), ("active", models.BooleanField(default=True, verbose_name="active")), - ( - "cardmarket_id", - models.PositiveIntegerField(primary_key=True, serialize=False), - ), + ("id", models.UUIDField(primary_key=True, serialize=False)), + ("cardmarket_id", models.PositiveIntegerField()), ("oracle_id", models.CharField(max_length=128, null=True)), ("name", models.CharField(max_length=256, null=True)), ("mana_cost", models.CharField(blank=True, max_length=128, null=True)), diff --git a/src/mtg/models.py b/src/mtg/models.py index dbd5edd..e10e8c7 100644 --- a/src/mtg/models.py +++ b/src/mtg/models.py @@ -2,23 +2,23 @@ from lib.models import BaseAbstractModel - -class ScryfallCardManager(models.Manager): - # Taken from https://github.com/baronvonvaderham/django-mtg-card-catalog - - def get_or_create_card(self, card_data): - """Fetch or create a card based on the provided data dictionary.""" - card, created = self.update_or_create( - cardmarket_id=card_data["cardmarket_id"], - defaults=card_data, - ) - return created, card +# class ScryfallCardManager(models.Manager): +# # Taken from https://github.com/baronvonvaderham/django-mtg-card-catalog +# +# def get_or_create_card(self, card_data): +# """Fetch or create a card based on the provided data dictionary.""" +# card, created = self.update_or_create( +# id=card_data["id"], +# defaults=card_data, +# ) +# return created, card class ScryfallCard(BaseAbstractModel): """Class to contain a local version of the scryfall data to limit the need for external API calls.""" - cardmarket_id = models.PositiveIntegerField(blank=False, primary_key=True) + id = models.UUIDField(primary_key=True, editable=True) + cardmarket_id = models.PositiveIntegerField() oracle_id = models.CharField(max_length=128, null=True) # NOQA nosemgrep name = models.CharField(max_length=256, null=True) # nosemgrep mana_cost = models.CharField(max_length=128, blank=True, null=True) # NOQA nosemgrep @@ -33,7 +33,7 @@ class ScryfallCard(BaseAbstractModel): image_small = models.URLField(blank=True, null=True) # NOQA nosemgrep image_normal = models.URLField(blank=True, null=True) # NOQA nosemgrep - objects = ScryfallCardManager() + # objects = ScryfallCardManager() class Meta: indexes = [models.Index(fields=['cardmarket_id'], name='idx_scryfallcard_cm_id')] diff --git a/src/mtg/services.py b/src/mtg/services.py index 2f960e1..7e153bd 100644 --- a/src/mtg/services.py +++ b/src/mtg/services.py @@ -1,6 +1,8 @@ +import json import unicodedata import requests +from tqdm.auto import tqdm from .constants import BASIC_TYPES, SCRYFALL_BULK_DATA_URL from .models import ScryfallCard @@ -25,7 +27,7 @@ def process_card_types(card_data): continue # If there is a ' - ', that means we have subtypes to the right, supertypes to the left - if ' — ' in type_line: + if ' - ' in type_line: main_types, subtypes = type_line.split(' - ') else: main_types, subtypes = type_line, None @@ -45,12 +47,23 @@ def scryfall_download_bulk_data(): response = requests.get(SCRYFALL_BULK_DATA_URL, timeout=10) response.raise_for_status() # Raise an error for bad responses url = response.json() + + # Find bulk data url url = next(item for item in url['data'] if item['type'] == 'default_cards') url = url['download_uri'] - response = requests.get(url, timeout=10) + # Download in chunks + response = requests.get(url, timeout=10, stream=True) response.raise_for_status() - return response.json() + + total_size = int(response.headers.get('Content-Length', 0)) if 'Content-Length' in response.headers else None + with tqdm(total=total_size, unit='B', unit_scale=True, desc='Downloading') as progress_bar: + json_data = [] + for chunk in response.iter_content(chunk_size=8192): + json_data.append(chunk) + progress_bar.update(len(chunk)) + + return json.loads(b''.join(json_data)) def scryfall_process_data(data): @@ -65,13 +78,19 @@ def scryfall_transform_card_data(raw_card_data): """Convert raw Scryfall data to model-compatible format, applying constants-based filters and transformations.""" # Skipping unwanted stuff + skipping_ids = {'90f17b85-a866-48e8-aae0-55330109550e'} if not raw_card_data.get('cardmarket_id'): return None if raw_card_data.get('name').split(' ')[0] in BASIC_TYPES: return None if '(' in raw_card_data.get('name'): return None + if raw_card_data.get('id') in skipping_ids: + return None + scryfall_id = raw_card_data.get('id') + oracle_id = raw_card_data.get('oracle_id') + cardmarket_id = raw_card_data.get('cardmarket_id') card_name = raw_card_data.get('name', '') card_name = ''.join(c for c in unicodedata.normalize('NFD', card_name) if unicodedata.category(c) != 'Mn') card_types, card_subtypes = process_card_types(raw_card_data) @@ -82,7 +101,7 @@ def scryfall_transform_card_data(raw_card_data): image_small = None image_normal = None color_identity = raw_card_data.get('color_identity') - cardmarket_id = raw_card_data.get('cardmarket_id') + cmc = raw_card_data.get('cmc') # Split cards if ' // ' in card_name: @@ -115,15 +134,16 @@ def scryfall_transform_card_data(raw_card_data): image_normal = image_uris.get('image_normal' if 'image_uris' in raw_card_data else 'normal', None) transformed_data = { - 'oracle_id': raw_card_data.get('oracle_id'), + 'id': scryfall_id, + 'oracle_id': oracle_id, 'name': card_name, - 'mana_cost': mana_cost, - 'cmc': raw_card_data.get('cmc'), - 'types': card_types, - 'subtypes': card_subtypes, - 'colors': list(colors), - 'color_identity': color_identity, - 'oracle_text': oracle_text, + 'mana_cost': json.dumps(mana_cost), + 'cmc': cmc, + 'types': json.dumps(card_types), + 'subtypes': json.dumps(card_subtypes), + 'colors': json.dumps(list(colors)), + 'color_identity': json.dumps(color_identity), + 'oracle_text': json.dumps(oracle_text), 'cardmarket_id': cardmarket_id, 'image_small': image_small, 'image_normal': image_normal, diff --git a/src/mtg/tasks.py b/src/mtg/tasks.py index f90cedd..722d747 100644 --- a/src/mtg/tasks.py +++ b/src/mtg/tasks.py @@ -1,41 +1,82 @@ -from celery import group from celery.utils.log import get_task_logger +from django.utils import timezone from tqdm.auto import tqdm from cm_prices.celery import app from mtg.models import ScryfallCard -from mtg.services import ( - scryfall_download_bulk_data, - scryfall_save_card, - scryfall_transform_card_data, -) +from mtg.services import scryfall_download_bulk_data, scryfall_transform_card_data logger = get_task_logger('tasks.common') @app.task(name='sync_scryfall_task') def sync_scryfall(*args, **kwargs): - """Run scryfall update bulk task.""" - + """Run Scryfall update bulk task.""" logger.info('BEGINNING SCRYFALL SYNC TASK') + scryfall_data = scryfall_download_bulk_data() - if kwargs.get('test'): - scryfall_data = scryfall_data[:2] - load_tasks = [] + new_cards = [] + existing_cards = [] + existing_card_ids = set(str(card_id) for card_id in ScryfallCard.objects.values_list('id', flat=True)) + for raw_card_data in tqdm(scryfall_data, unit='card'): - card = scryfall_transform_card_data(raw_card_data) - if card: - if not ScryfallCard.objects.filter(cardmarket_id=card.get('cardmarket_id')).exists(): - load_tasks.append(get_or_create_scryfall_card.s(card)) - task_group = group(load_tasks) - task_group.apply() + card_data = scryfall_transform_card_data(raw_card_data) + if card_data: + # Check if the card already exists by cardmarket_id + if card_data['id'] in existing_card_ids: + existing_cards.append(ScryfallCard(**card_data)) + else: + timestamp = timezone.now() + card_data['date_updated'] = timestamp + card_data['date_created'] = timestamp + new_cards.append(ScryfallCard(**card_data)) + + # Field list for bulk_update + fields_to_update = [ + 'oracle_id', + 'name', + 'mana_cost', + 'cmc', + 'types', + 'subtypes', + 'colors', + 'color_identity', + 'oracle_text', + 'image_small', + 'image_normal', + 'legalities', + 'cardmarket_id', + ] + + # Bulk create and update + if new_cards: + ScryfallCard.objects.bulk_create(new_cards) + logger.info('%d new cards inserted.', len(new_cards)) + if existing_cards: + bulk_update_if_changed(existing_cards, fields_to_update) + logger.info('SCRYFALL SYNC TASK COMPLETE!') -@app.task(name='get_or_create_scryfall_card') -def get_or_create_scryfall_card(card_data): - """Create card in local scryfall model.""" +def bulk_update_if_changed(update_cards, fields): + """Bulk update only cards that are different.""" + # Create a mapping of cardmarket_id to existing card data + scryfall_ids = [card.id for card in update_cards] + existing_cards = {str(card.id): card for card in ScryfallCard.objects.filter(id__in=scryfall_ids)} + + cards_to_update = [] + + for update_card in update_cards: + existing_card = existing_cards.get(update_card.id) + # Compare fields to see if there are changes + has_changes = any(getattr(existing_card, field) != getattr(update_card, field) for field in fields) + + if has_changes: + update_card.date_updated = timezone.now() + cards_to_update.append(update_card) - created, card = scryfall_save_card(card_data) - if created: - logger.info('Created new Scryfall card: %s', card.name) + # Perform the bulk update only if there are changes + if cards_to_update: + update_fields = fields + ['date_updated'] + ScryfallCard.objects.bulk_update(cards_to_update, update_fields) + logger.info('Updated %d cards.', len(cards_to_update))