diff --git a/cl/alerts/management/commands/cl_send_scheduled_alerts.py b/cl/alerts/management/commands/cl_send_scheduled_alerts.py index 14511fc3ac..f912d9d20a 100644 --- a/cl/alerts/management/commands/cl_send_scheduled_alerts.py +++ b/cl/alerts/management/commands/cl_send_scheduled_alerts.py @@ -3,6 +3,7 @@ from typing import DefaultDict import waffle +from django.http import QueryDict from django.utils.timezone import now from cl.alerts.models import ( @@ -11,8 +12,9 @@ ScheduledAlertHit, ) from cl.alerts.tasks import send_search_alert_emails -from cl.alerts.utils import InvalidDateError +from cl.alerts.utils import InvalidDateError, override_alert_query from cl.lib.command_utils import VerboseCommand, logger +from cl.search.models import SEARCH_TYPES from cl.stats.utils import tally_stat DAYS_TO_DELETE = 90 @@ -28,6 +30,28 @@ def json_date_parser(dct): return dct +def get_cut_off_date(rate: str, d: datetime.date) -> datetime.date | None: + """Given a rate of dly, wly or mly and a date, returns the date after for + building a daterange filter. + :param rate: The alert rate to send Alerts. + :param d: The date alerts are run. + :return: The cut-off date or None. + """ + cut_off_date = None + if rate == Alert.DAILY: + cut_off_date = d + elif rate == Alert.WEEKLY: + cut_off_date = d - datetime.timedelta(days=7) + elif rate == Alert.MONTHLY: + # Get the first of the month of the previous month regardless of the + # current date + early_last_month = d - datetime.timedelta(days=28) + cut_off_date = datetime.datetime( + early_last_month.year, early_last_month.month, 1 + ).date() + return cut_off_date + + def query_and_send_alerts_by_rate(rate: str) -> None: """Query and send alerts per user. @@ -62,6 +86,12 @@ def query_and_send_alerts_by_rate(rate: str) -> None: documents.append(json_date_parser(result.document_content)) alerts_to_update.append(alert.pk) + + # Override order_by to show the latest items when clicking the + # "View Full Results" button. + cut_off_date = get_cut_off_date(rate, now_time.date()) + qd = override_alert_query(alert, cut_off_date) + alert.query_run = qd.urlencode() # type: ignore hits.append( ( alert, diff --git a/cl/alerts/tasks.py b/cl/alerts/tasks.py index 384cdbcec2..fa1aeed46e 100644 --- a/cl/alerts/tasks.py +++ b/cl/alerts/tasks.py @@ -17,7 +17,12 @@ ) from cl.alerts.models import Alert, DocketAlert, ScheduledAlertHit -from cl.alerts.utils import percolate_document, user_has_donated_enough +from cl.alerts.utils import ( + alert_hits_limit_reached, + override_alert_query, + percolate_document, + user_has_donated_enough, +) from cl.api.models import WebhookEventType from cl.api.tasks import ( send_docket_alert_webhook_events, @@ -33,9 +38,9 @@ from cl.recap.constants import COURT_TIMEZONES from cl.search.constants import ALERTS_HL_TAG from cl.search.documents import AudioPercolator -from cl.search.models import Docket, DocketEntry +from cl.search.models import SEARCH_TYPES, Docket, DocketEntry from cl.search.types import ( - ESDocumentType, + ESDocumentClassType, PercolatorResponseType, SaveDocumentResponseType, SearchAlertHitType, @@ -477,7 +482,10 @@ def send_search_alert_emails( continue alert_user: UserProfile.user = User.objects.get(pk=user_id) - context = {"hits": hits} + context = { + "hits": hits, + "hits_limit": settings.SCHEDULED_ALERT_HITS_LIMIT, + } txt = txt_template.render(context) html = html_template.render(context) msg = EmailMultiAlternatives( @@ -527,6 +535,11 @@ def process_percolator_response(response: PercolatorResponseType) -> None: ALERTS_HL_TAG, ) + # Override order_by to show the latest items when clicking the + # "View Full Results" button. + qd = override_alert_query(alert_triggered) + alert_triggered.query_run = qd.urlencode() # type: ignore + # Compose RT hit to send. hits = [ ( @@ -554,6 +567,12 @@ def process_percolator_response(response: PercolatorResponseType) -> None: else: # Schedule DAILY, WEEKLY and MONTHLY Alerts + if alert_hits_limit_reached( + alert_triggered.pk, alert_triggered.user.pk + ): + # Skip storing hits for this alert-user combination because + # the SCHEDULED_ALERT_HITS_LIMIT has been reached. + continue scheduled_hits_to_create.append( ScheduledAlertHit( user=alert_triggered.user, @@ -690,7 +709,7 @@ def index_alert_document( ignore_result=True, ) def remove_doc_from_es_index( - self: Task, es_document: ESDocumentType, instance_id: int + self: Task, es_document: ESDocumentClassType, instance_id: int ) -> None: """Remove a document from an Elasticsearch index. diff --git a/cl/alerts/templates/alert_email_es.html b/cl/alerts/templates/alert_email_es.html index 91bc38e48b..4e05888dc9 100644 --- a/cl/alerts/templates/alert_email_es.html +++ b/cl/alerts/templates/alert_email_es.html @@ -24,10 +24,10 @@

- Your {{alert.get_rate_display|lower}} {% if type == 'o' %}opinion{% elif type == 'oa' %}oral argument{% endif %} alert — {{alert.name}} — had {{num_results}} hit{{results|pluralize}}: + Your {{alert.get_rate_display|lower}} {% if type == 'o' %}opinion{% elif type == 'oa' %}oral argument{% endif %} alert — {{alert.name}} — had {{num_results}}{% if num_results >= hits_limit %}+{% endif %} hit{{results|pluralize}}:

- View Full Results / Edit this Alert
+ View Full Results / Edit this Alert
Disable this Alert (one click)

{% endif %} diff --git a/cl/alerts/templates/alert_email_es.txt b/cl/alerts/templates/alert_email_es.txt index 60f9975f07..b94b0db1fa 100644 --- a/cl/alerts/templates/alert_email_es.txt +++ b/cl/alerts/templates/alert_email_es.txt @@ -10,9 +10,9 @@ CourtListener.com We have news regarding your alerts at CourtListener.com ------------------------------------------------------- -{% for alert, type, results, num_results in hits %}{% for result in results %}{% if forloop.first %}Your {{alert.get_rate_display|lower}} {% if type == 'o' %}opinion{% elif type == 'oa' %}oral argument{% endif %} alert -- {{alert.name}} -- had {{num_results}} hit{{results|pluralize}}: +{% for alert, type, results, num_results in hits %}{% for result in results %}{% if forloop.first %}Your {{alert.get_rate_display|lower}} {% if type == 'o' %}opinion{% elif type == 'oa' %}oral argument{% endif %} alert -- {{alert.name}} -- had {{num_results}}{% if num_results >= hits_limit %}+{% endif %} hit{{results|pluralize}}: ------------------------------------------------------- -Edit / Delete this Alert: https://www.courtlistener.com/alert/edit/{{ alert.pk }}/ +View Full Results / Edit this Alert: https://www.courtlistener.com/?{{ alert.query_run|safe }}&edit_alert={{ alert.pk }}/ Disable this Alert (one click): https://www.courtlistener.com{% url "disable_alert" alert.secret_key %}{% endif %} {{forloop.counter}}. {{ result.caseName|safe|striptags }} ({% if result.court_id != 'scotus' %}{{ result.court_citation_string|striptags }} {% endif %}{% if type == 'o' %}{{ result.dateFiled|date:"Y" }}{% elif type == 'oa' %}{{ result.dateArgued|date:"Y" }}{% endif %}) diff --git a/cl/alerts/tests.py b/cl/alerts/tests.py index 99369492ac..e0275e8d4d 100644 --- a/cl/alerts/tests.py +++ b/cl/alerts/tests.py @@ -1,9 +1,11 @@ import json from collections import defaultdict -from datetime import timedelta +from datetime import datetime, timedelta from unittest import mock +from urllib.parse import parse_qs, urlparse import time_machine +from django.conf import settings from django.contrib.auth.hashers import make_password from django.contrib.auth.models import User from django.core import mail @@ -11,6 +13,7 @@ from django.test import Client, override_settings from django.urls import reverse from django.utils.timezone import now +from lxml import html from rest_framework.status import ( HTTP_200_OK, HTTP_201_CREATED, @@ -24,6 +27,7 @@ from cl.alerts.factories import AlertFactory, DocketAlertWithParentsFactory from cl.alerts.management.commands.cl_send_scheduled_alerts import ( DAYS_TO_DELETE, + get_cut_off_date, ) from cl.alerts.management.commands.handle_old_docket_alerts import ( build_user_report, @@ -588,10 +592,17 @@ def setUpTestData(cls): cluster__precedential_status=PRECEDENTIAL_STATUS.PUBLISHED, cluster__date_filed=now() - timedelta(hours=5), ) - cls.dly_oral_argument = AudioWithParentsFactory.create( - case_name="Dly Test OA", - docket__date_argued=now() - timedelta(hours=5), - ) + with mock.patch( + "cl.scrapers.tasks.microservice", + side_effect=lambda *args, **kwargs: MockResponse(200, b"10"), + ), mock.patch( + "cl.lib.es_signal_processor.avoid_es_audio_indexing", + side_effect=lambda x, y, z: False, + ): + cls.dly_oral_argument = AudioWithParentsFactory.create( + case_name="Dly Test OA", + docket__date_argued=now() - timedelta(hours=5), + ) cls.wly_opinion = OpinionWithParentsFactory.create( cluster__precedential_status=PRECEDENTIAL_STATUS.PUBLISHED, @@ -1515,6 +1526,10 @@ def test_get_docket_notes_and_tags_by_user(self) -> None: @override_switch("oa-es-alerts-active", active=True) +@mock.patch( + "cl.lib.es_signal_processor.avoid_es_audio_indexing", + side_effect=lambda x, y, z: False, +) class SearchAlertsOAESTests(ESIndexTestCase, TestCase): """Test ES Search Alerts""" @@ -1591,7 +1606,7 @@ def tearDownClass(cls): Audio.objects.all().delete() super().tearDownClass() - def test_alert_frequency_estimation(self): + def test_alert_frequency_estimation(self, mock_abort_audio): """Test alert frequency ES API endpoint.""" search_params = { @@ -1625,7 +1640,7 @@ def test_alert_frequency_estimation(self): self.assertEqual(r.json()["count"], 1) rt_oral_argument.delete() - def test_send_oa_search_alert_webhooks(self): + def test_send_oa_search_alert_webhooks(self, mock_abort_audio): """Can we send RT OA search alerts?""" with mock.patch( @@ -1691,6 +1706,13 @@ def test_send_oa_search_alert_webhooks(self): self.assertIn("19-5735", html_content) self.assertIn("RT", html_content) + # Confirm that order_by is overridden in the 'View Full Results' URL by + # dateArgued+desc. + view_results_url = html.fromstring(str(html_content)).xpath( + '//a[text()="View Full Results / Edit this Alert"]/@href' + ) + self.assertIn("order_by=dateArgued+desc", view_results_url[0]) + # The right alert type template is used. self.assertIn("oral argument", html_content) @@ -1727,7 +1749,7 @@ def test_send_oa_search_alert_webhooks(self): webhook_events.delete() rt_oral_argument.delete() - def test_send_alert_on_document_creation(self): + def test_send_alert_on_document_creation(self, mock_abort_audio): """Avoid sending Search Alerts on document updates.""" with mock.patch( "cl.api.webhooks.requests.post", @@ -1778,7 +1800,7 @@ def test_send_alert_on_document_creation(self): self.assertEqual(len(webhook_events), 4) rt_oral_argument.delete() - def test_es_alert_update_and_delete(self): + def test_es_alert_update_and_delete(self, mock_abort_audio): """Can we update and delete an alert, and expect these changes to be properly reflected in Elasticsearch?""" @@ -1846,10 +1868,6 @@ def send_alerts_by_rate_and_confirm_assertions( # Confirm Alert date_last_hit is updated. search_alert.refresh_from_db() - if previous_date: - self.assertEqual(search_alert.date_last_hit, previous_date) - else: - self.assertEqual(search_alert.date_last_hit, mock_date) # One OA search alert email should be sent self.assertEqual(len(mail.outbox), alert_count) @@ -1859,6 +1877,36 @@ def send_alerts_by_rate_and_confirm_assertions( # The right alert type template is used. self.assertIn("oral argument", text_content) + # Extract HTML version. + html_content = None + for content, content_type in mail.outbox[alert_count - 1].alternatives: + if content_type == "text/html": + html_content = content + break + + # Confirm that order_by is overridden in the 'View Full Results' + # URL by dateArgued+desc. + view_results_url = html.fromstring(str(html_content)).xpath( + '//a[text()="View Full Results / Edit this Alert"]/@href' + ) + + parsed_url = urlparse(view_results_url[0]) + params = parse_qs(parsed_url.query) + self.assertEqual("dateArgued desc", params["order_by"][0]) + + if previous_date: + self.assertEqual(search_alert.date_last_hit, previous_date) + else: + self.assertEqual(search_alert.date_last_hit, mock_date) + + # Confirm that argued_after is properly set in the + # 'View Full Results' URL. + cut_off_date = get_cut_off_date(rate, mock_date.date()) + date_in_query = datetime.strptime( + params["argued_after"][0], "%m/%d/%Y" + ).date() + self.assertEqual(cut_off_date, date_in_query) + # Confirm stored alerts instances status is set to SENT. scheduled_hits = ScheduledAlertHit.objects.filter(alert__rate=rate) for scheduled_hit in scheduled_hits: @@ -1866,7 +1914,7 @@ def send_alerts_by_rate_and_confirm_assertions( scheduled_hit.hit_status, SCHEDULED_ALERT_HIT_STATUS.SENT ) - def test_send_alert_multiple_alert_rates(self): + def test_send_alert_multiple_alert_rates(self, mock_abort_audio): """Confirm dly, wly and mly alerts are properly stored and sent according to their periodicity. """ @@ -1972,7 +2020,7 @@ def test_send_alert_multiple_alert_rates(self): @mock.patch( "cl.alerts.management.commands.cl_send_scheduled_alerts.logger" ) - def test_group_alerts_and_hits(self, mock_logger): + def test_group_alerts_and_hits(self, mock_logger, mock_abort_audio): """""" with mock.patch( "cl.api.webhooks.requests.post", @@ -2075,7 +2123,7 @@ def test_group_alerts_and_hits(self, mock_logger): rt_oral_argument_3.delete() @override_settings(PERCOLATOR_PAGE_SIZE=5) - def test_send_multiple_rt_alerts(self): + def test_send_multiple_rt_alerts(self, mock_abort_audio): """Confirm all RT alerts are properly sent if the percolator response contains more than PERCOLATOR_PAGE_SIZE results. So additional requests are performed in order to retrieve all the available results. @@ -2167,7 +2215,7 @@ def test_send_multiple_rt_alerts(self): alert.delete() @override_settings(PERCOLATOR_PAGE_SIZE=5) - def test_batched_alerts_match_documents_ingestion(self): + def test_batched_alerts_match_documents_ingestion(self, mock_abort_audio): """Confirm that batched alerts are properly stored according to document ingestion when percolated in real time. """ @@ -2251,7 +2299,7 @@ def test_batched_alerts_match_documents_ingestion(self): alert.delete() @override_settings(PERCOLATOR_PAGE_SIZE=5) - def test_percolate_document_in_batches(self): + def test_percolate_document_in_batches(self, mock_abort_audio): """Confirm when getting alerts in batches and an alert previously retrieved is updated during this process. It's not returned again. """ @@ -2308,7 +2356,9 @@ def test_percolate_document_in_batches(self): self.assertNotIn(result.id, ids_in_results) ids_in_results.append(result.id) - def test_avoid_sending_or_scheduling_disabled_alerts(self): + def test_avoid_sending_or_scheduling_disabled_alerts( + self, mock_abort_audio + ): """Can we avoid sending or_scheduling disabled search alerts?.""" rt_alert_disabled = AlertFactory( @@ -2351,7 +2401,7 @@ def test_avoid_sending_or_scheduling_disabled_alerts(self): dly_alert_disabled.delete() oral_argument.delete() - def test_avoid_re_sending_scheduled_sent_alerts(self): + def test_avoid_re_sending_scheduled_sent_alerts(self, mock_abort_audio): """Can we prevent re-sending scheduled alerts that have already been sent?""" @@ -2494,6 +2544,87 @@ def test_avoid_re_sending_scheduled_sent_alerts(self): oral_argument_2.delete() oral_argument_3.delete() + @override_settings(SCHEDULED_ALERT_HITS_LIMIT=3) + def test_scheduled_hits_limit(self, mock_abort_audio): + """Confirm we only store the max number of alert hits set in settings.""" + + alert_1 = AlertFactory( + user=self.user_profile.user, + rate=Alert.DAILY, + name="Test Alert OA Daily 1", + query="q=USA+vs+Bank+&type=oa", + ) + alert_2 = AlertFactory( + user=self.user_profile.user, + rate=Alert.DAILY, + name="Test Alert OA Daily 2", + query="q=Texas+vs+Corp&type=oa", + ) + alert_3 = AlertFactory( + user=self.user_profile_2.user, + rate=Alert.DAILY, + name="Test Alert OA Daily 3", + query="q=Texas+vs+Corp&type=oa", + ) + + oa_created = [] + for i in range(4): + oral_argument = AudioWithParentsFactory.create( + case_name="USA vs Bank", + docket__court=self.court_1, + docket__date_argued=now().date(), + docket__docket_number="19-5735", + ) + oa_created.append(oral_argument) + + if i in (1, 2): + # Only schedule two hits for this one. + oral_argument_2 = AudioWithParentsFactory.create( + case_name="Texas vs Corp", + docket__court=self.court_1, + docket__date_argued=now().date(), + docket__docket_number="20-5030", + ) + oa_created.append(oral_argument_2) + + # Call dly command + call_command("cl_send_scheduled_alerts", rate=Alert.DAILY) + + # Two emails should be sent, one for user_profile and one for user_profile_2 + self.assertEqual(len(mail.outbox), 2) + + # Confirm emails contains the hits+ count. + self.assertIn( + f"had {settings.SCHEDULED_ALERT_HITS_LIMIT}+ hits", + mail.outbox[0].body, + ) + # No "+" if hits do not reach the SCHEDULED_ALERT_HITS_LIMIT. + self.assertIn( + f"had 2 hits", + mail.outbox[1].body, + ) + + # Confirm each email contains the max number of hits for each alert. + self.assertEqual( + mail.outbox[0].body.count("USA vs Bank"), + settings.SCHEDULED_ALERT_HITS_LIMIT, + ) + self.assertEqual( + mail.outbox[0].body.count("Texas vs Corp"), + 2, + ) + self.assertEqual( + mail.outbox[1].body.count("Texas vs Corp"), + 2, + ) + + for oa in oa_created: + oa.delete() + + alert_1.delete() + alert_2.delete() + alert_3.delete() + @override_settings(ELASTICSEARCH_DISABLED=True) class SearchAlertsIndexingCommandTests(ESIndexTestCase, TestCase): diff --git a/cl/alerts/utils.py b/cl/alerts/utils.py index f9f958bc80..6eaa5206f1 100644 --- a/cl/alerts/utils.py +++ b/cl/alerts/utils.py @@ -1,10 +1,17 @@ from dataclasses import dataclass +from datetime import date from django.conf import settings +from django.http import QueryDict from elasticsearch_dsl import Q, Search from elasticsearch_dsl.response import Response -from cl.alerts.models import Alert, DocketAlert +from cl.alerts.models import ( + SCHEDULED_ALERT_HIT_STATUS, + Alert, + DocketAlert, + ScheduledAlertHit, +) from cl.lib.command_utils import logger from cl.lib.elasticsearch_utils import add_es_highlighting from cl.search.documents import AudioPercolator @@ -107,3 +114,50 @@ def user_has_donated_enough( ) return False return True + + +def override_alert_query( + alert: Alert, cut_off_date: date | None = None +) -> QueryDict: + """Override the query parameters for a given alert based on its type and an + optional cut-off date. + + :param alert: The Alert object for which the query will be overridden. + :param cut_off_date: An optional date used to set a threshold in the query. + :return: A QueryDict object containing the modified query parameters. + """ + + qd = QueryDict(alert.query.encode(), mutable=True) + if alert.alert_type == SEARCH_TYPES.ORAL_ARGUMENT: + qd["order_by"] = "dateArgued desc" + if cut_off_date: + qd["argued_after"] = cut_off_date.strftime("%m/%d/%Y") + else: + qd["order_by"] = "dateFiled desc" + if cut_off_date: + qd["filed_after"] = cut_off_date.strftime("%m/%d/%Y") + + return qd + + +def alert_hits_limit_reached(alert_pk: int, user_pk: int) -> bool: + """Check if the alert hits limit has been reached for a specific alert-user + combination. + + :param alert_pk: The alert_id. + :param user_pk: The user_id. + :return: True if the limit has been reached, otherwise False. + """ + + stored_hits = ScheduledAlertHit.objects.filter( + alert_id=alert_pk, + user_id=user_pk, + hit_status=SCHEDULED_ALERT_HIT_STATUS.SCHEDULED, + ) + hits_count = stored_hits.count() + if hits_count >= settings.SCHEDULED_ALERT_HITS_LIMIT: + logger.info( + f"Skipping hit for Alert ID: {alert_pk}, there are {hits_count} hits stored for this alert." + ) + return True + return False diff --git a/cl/audio/tests.py b/cl/audio/tests.py index e570621cba..95106336f5 100644 --- a/cl/audio/tests.py +++ b/cl/audio/tests.py @@ -1,3 +1,5 @@ +from unittest import mock + from django.urls import reverse from lxml import etree @@ -24,24 +26,28 @@ def setUpTestData(cls) -> None: jurisdiction="F", citation_string="Appeals. CA8.", ) - cls.audio = AudioWithParentsFactory.create( - docket=DocketFactory(court=cls.court_1), - local_path_mp3__data=ONE_SECOND_MP3_BYTES, - local_path_original_file__data=ONE_SECOND_MP3_BYTES, - duration=1, - ) - AudioWithParentsFactory.create( - docket=cls.audio.docket, - local_path_mp3__data=SMALL_WAV_BYTES, - local_path_original_file__data=SMALL_WAV_BYTES, - duration=0, - ) - AudioWithParentsFactory.create( - docket=DocketFactory(court=cls.court_2), - local_path_mp3__data=SMALL_WAV_BYTES, - local_path_original_file__data=SMALL_WAV_BYTES, - duration=5, - ) + with mock.patch( + "cl.lib.es_signal_processor.avoid_es_audio_indexing", + side_effect=lambda x, y, z: False, + ): + cls.audio = AudioWithParentsFactory.create( + docket=DocketFactory(court=cls.court_1), + local_path_mp3__data=ONE_SECOND_MP3_BYTES, + local_path_original_file__data=ONE_SECOND_MP3_BYTES, + duration=1, + ) + AudioWithParentsFactory.create( + docket=cls.audio.docket, + local_path_mp3__data=SMALL_WAV_BYTES, + local_path_original_file__data=SMALL_WAV_BYTES, + duration=0, + ) + AudioWithParentsFactory.create( + docket=DocketFactory(court=cls.court_2), + local_path_mp3__data=SMALL_WAV_BYTES, + local_path_original_file__data=SMALL_WAV_BYTES, + duration=5, + ) def test_do_jurisdiction_podcasts_have_good_content(self) -> None: """Can we simply load a jurisdiction podcast page?""" diff --git a/cl/lib/es_signal_processor.py b/cl/lib/es_signal_processor.py index e9331fe15f..3cf2d03e75 100644 --- a/cl/lib/es_signal_processor.py +++ b/cl/lib/es_signal_processor.py @@ -9,14 +9,19 @@ remove_doc_from_es_index, send_or_schedule_alerts, ) +from cl.audio.models import Audio from cl.lib.elasticsearch_utils import elasticsearch_enabled from cl.search.documents import AudioDocument from cl.search.tasks import save_document_in_es, update_document_in_es -from cl.search.types import ESDocumentType, ESModelType +from cl.search.types import ( + ESDocumentClassType, + ESDocumentInstanceType, + ESModelType, +) def updated_fields( - instance: ESModelType, es_document: ESDocumentType + instance: ESModelType, es_document: ESDocumentClassType ) -> list[str]: """Look for changes in the tracked fields of an instance. :param instance: The instance to check for changed fields. @@ -60,7 +65,7 @@ def get_fields_to_update( def document_fields_to_update( - main_doc: ESDocumentType, + main_doc: ESDocumentInstanceType, main_object: ESModelType, field_list: list[str], instance: ESModelType, @@ -96,8 +101,8 @@ def document_fields_to_update( def get_or_create_doc( - es_document: ESDocumentType, instance: ESModelType -) -> ESDocumentType | None: + es_document: ESDocumentClassType, instance: ESModelType +) -> ESDocumentInstanceType | None: """Get or create a document in Elasticsearch. :param es_document: The Elasticsearch document type. :param instance: The instance of the document to get or create. @@ -113,7 +118,7 @@ def get_or_create_doc( def update_es_documents( main_model: ESModelType, - es_document: ESDocumentType, + es_document: ESDocumentClassType, instance: ESModelType, created: bool, mapping_fields: dict, @@ -155,7 +160,7 @@ def update_es_documents( def update_remove_m2m_documents( main_model: ESModelType, - es_document: ESDocumentType, + es_document: ESDocumentClassType, instance: ESModelType, mapping_fields: dict, affected_field: str, @@ -186,7 +191,7 @@ def update_remove_m2m_documents( def update_m2m_field_in_es_document( instance: ESModelType, - es_document: ESDocumentType, + es_document: ESDocumentClassType, affected_field: str, ) -> None: """Update a single field created using a many-to-many relationship. @@ -205,7 +210,7 @@ def update_m2m_field_in_es_document( def update_reverse_related_documents( main_model: ESModelType, - es_document: ESDocumentType, + es_document: ESDocumentClassType, instance: ESModelType, query_string: str, affected_fields: list[str], @@ -235,6 +240,35 @@ def update_reverse_related_documents( ) +def avoid_es_audio_indexing( + instance: ESModelType, + es_document: ESDocumentClassType, + update_fields: list[str] | None, +): + """Check conditions to abort Elasticsearch indexing for Audio instances. + Avoid indexing for Audio instances which their mp3 file has not been + processed yet by process_audio_file. + + :param instance: The Audio instance to evaluate for Elasticsearch indexing. + :param es_document: The Elasticsearch document class. + :param update_fields: List of fields being updated, or None. + :return: True if indexing should be avoided, False otherwise. + """ + + if ( + type(instance) == Audio + and not es_document.exists(instance.pk) + and ( + not update_fields + or (update_fields and "processing_complete" not in update_fields) + ) + ): + # Avoid indexing Audio instances that haven't been previously indexed + # in ES and for which 'processing_complete' is not present in update_fields. + return True + return False + + class ESSignalProcessor(object): """Custom signal processor for Elasticsearch documents. It is responsible for managing the Elasticsearch index after certain events happen, such as @@ -299,7 +333,14 @@ def connect_signals(models, handler, signal_to_uid_mapping, weak=False): ) @elasticsearch_enabled - def handle_save(self, sender, instance=None, created=False, **kwargs): + def handle_save( + self, + sender, + instance=None, + created=False, + update_fields=None, + **kwargs, + ): """Receiver function that gets called after an object instance is saved""" mapping_fields = self.documents_model_mapping["save"][sender] if not created: @@ -311,6 +352,13 @@ def handle_save(self, sender, instance=None, created=False, **kwargs): mapping_fields, ) if not mapping_fields: + if avoid_es_audio_indexing( + instance, self.es_document, update_fields + ): + # This check is required to avoid indexing and triggering + # search alerts for Audio instances whose MP3 files have not + # yet been processed by process_audio_file. + return None chain( save_document_in_es.si(instance, self.es_document), send_or_schedule_alerts.s(self.es_document._index._name), diff --git a/cl/scrapers/management/commands/cl_scrape_oral_arguments.py b/cl/scrapers/management/commands/cl_scrape_oral_arguments.py index f78fde0032..6f68227e1f 100644 --- a/cl/scrapers/management/commands/cl_scrape_oral_arguments.py +++ b/cl/scrapers/management/commands/cl_scrape_oral_arguments.py @@ -172,9 +172,7 @@ def scrape_court( index=False, backscrape=backscrape, ) - process_audio_file.apply_async( - (audio_file.pk,), countdown=random.randint(0, 3600) - ) + process_audio_file.delay(audio_file.pk) logger.info( "Successfully added audio file {pk}: {name}".format( diff --git a/cl/scrapers/tasks.py b/cl/scrapers/tasks.py index 186d2101fe..b5434aaddb 100644 --- a/cl/scrapers/tasks.py +++ b/cl/scrapers/tasks.py @@ -368,7 +368,13 @@ def process_audio_file(self, pk) -> None: ).text ) audio_obj.processing_complete = True - audio_obj.save() + audio_obj.save( + update_fields=[ + "duration", + "local_path_mp3", + "processing_complete", + ] + ) @app.task( diff --git a/cl/scrapers/tests.py b/cl/scrapers/tests.py index 4f7dfafe54..6d27b8c871 100644 --- a/cl/scrapers/tests.py +++ b/cl/scrapers/tests.py @@ -8,8 +8,14 @@ from django.core.files.base import ContentFile from django.utils.timezone import now +from cl.alerts.factories import AlertFactory +from cl.alerts.models import Alert +from cl.api.factories import WebhookFactory +from cl.api.models import WebhookEvent, WebhookEventType from cl.audio.factories import AudioWithParentsFactory from cl.audio.models import Audio +from cl.donate.factories import DonationFactory +from cl.donate.models import Donation from cl.lib.microservice_utils import microservice from cl.scrapers.DupChecker import DupChecker from cl.scrapers.management.commands import ( @@ -24,14 +30,34 @@ from cl.search.factories import CourtFactory, DocketFactory from cl.search.models import Court, Docket, Opinion from cl.settings import MEDIA_ROOT -from cl.tests.cases import SimpleTestCase, TestCase +from cl.tests.cases import ESIndexTestCase, SimpleTestCase, TestCase from cl.tests.fixtures import ONE_SECOND_MP3_BYTES, SMALL_WAV_BYTES +from cl.users.factories import UserProfileWithParentsFactory -class ScraperIngestionTest(TestCase): +class ScraperIngestionTest(ESIndexTestCase, TestCase): @classmethod def setUpTestData(cls) -> None: cls.court = CourtFactory(id="test", jurisdiction="F") + cls.user_profile = UserProfileWithParentsFactory() + cls.donation = DonationFactory( + donor=cls.user_profile.user, + amount=20, + status=Donation.PROCESSED, + send_annual_reminder=True, + ) + cls.webhook_enabled = WebhookFactory( + user=cls.user_profile.user, + event_type=WebhookEventType.SEARCH_ALERT, + url="https://example.com/", + enabled=True, + ) + cls.search_alert_oa = AlertFactory( + user=cls.user_profile.user, + rate=Alert.DAILY, + name="Test Alert OA", + query="type=oa", + ) def test_extension(self): r = async_to_sync(microservice)( @@ -128,6 +154,32 @@ def test_ingest_oral_arguments(self) -> None: d_1.refresh_from_db() self.assertEqual(d_1.case_name, "Jeremy v. Julian") + # Confirm that OA Search Alerts are properly triggered after an OA is + # scraped and its MP3 file is processed. + # Two webhook events should be sent, both of them to user_profile user + webhook_events = WebhookEvent.objects.all() + self.assertEqual(len(webhook_events), 2) + + cases_names = ["Jeremy v. Julian", "Ander v. Leo"] + for webhook_sent in webhook_events: + self.assertEqual( + webhook_sent.webhook.user, + self.user_profile.user, + ) + content = webhook_sent.content + # Check if the webhook event payload is correct. + self.assertEqual( + content["webhook"]["event_type"], + WebhookEventType.SEARCH_ALERT, + ) + self.assertIn( + content["payload"]["results"][0]["caseName"], cases_names + ) + self.assertIsNotNone(content["payload"]["results"][0]["duration"]) + self.assertIsNotNone( + content["payload"]["results"][0]["local_path"] + ) + def test_parsing_xml_opinion_site_to_site_object(self) -> None: """Does a basic parse of a site reveal the right number of items?""" site = test_opinion_scraper.Site().parse() diff --git a/cl/search/tasks.py b/cl/search/tasks.py index 7f229bfca0..c145da47b7 100644 --- a/cl/search/tasks.py +++ b/cl/search/tasks.py @@ -19,7 +19,8 @@ from cl.search.documents import AudioDocument from cl.search.models import Docket, OpinionCluster, RECAPDocument from cl.search.types import ( - ESDocumentType, + ESDocumentClassType, + ESDocumentInstanceType, ESModelType, SaveDocumentResponseType, ) @@ -179,9 +180,12 @@ def delete_items(items, app_label, force_commit=False): interval_start=5, ) def save_document_in_es( - self: Task, instance: ESModelType, es_document: ESDocumentType + self: Task, + instance: ESModelType, + es_document: ESDocumentClassType, ) -> SaveDocumentResponseType | None: """Save a document in Elasticsearch using a provided callable. + :param self: The celery task :param instance: The instance of the document to save. :param es_document: A Elasticsearch DSL document. @@ -218,7 +222,7 @@ def save_document_in_es( ) def update_document_in_es( self: Task, - es_document: ESDocumentType, + es_document: ESDocumentInstanceType, fields_values_to_update: dict[str, Any], ) -> None: """Update a document in Elasticsearch. diff --git a/cl/search/tests/tests_es_oral_arguments.py b/cl/search/tests/tests_es_oral_arguments.py index a7e9a845fe..77bf1a0cab 100644 --- a/cl/search/tests/tests_es_oral_arguments.py +++ b/cl/search/tests/tests_es_oral_arguments.py @@ -1,4 +1,5 @@ import datetime +from unittest import mock from django.core.cache import cache from django.db import transaction @@ -1106,7 +1107,11 @@ def test_oa_results_api_fields_es(self) -> None: msg=f"Key {key} not found in the result object.", ) - def test_oa_results_pagination(self) -> None: + @mock.patch( + "cl.lib.es_signal_processor.avoid_es_audio_indexing", + side_effect=lambda x, y, z: False, + ) + def test_oa_results_pagination(self, mock_abort_audio) -> None: created_audios = [] audios_to_create = 20 for i in range(audios_to_create): @@ -1705,7 +1710,11 @@ def test_stemming_disable_search(self) -> None: self.assertEqual(actual, expected) self.assertIn("Freedom of", r.content.decode()) - def test_keep_in_sync_related_OA_objects(self) -> None: + @mock.patch( + "cl.lib.es_signal_processor.avoid_es_audio_indexing", + side_effect=lambda x, y, z: False, + ) + def test_keep_in_sync_related_OA_objects(self, mock_abort_audio) -> None: """Test Audio documents are updated when related objects change.""" with transaction.atomic(): docket_5 = DocketFactory.create( diff --git a/cl/search/types.py b/cl/search/types.py index 9ff09547de..0fe330fbc7 100644 --- a/cl/search/types.py +++ b/cl/search/types.py @@ -1,4 +1,4 @@ -from typing import Any, Union +from typing import Any, Type, Union from elasticsearch_dsl.response import Hit @@ -27,10 +27,18 @@ ParentheticalGroup, Audio, ] -ESDocumentType = Union[ + +ESDocumentInstanceType = Union[ AudioDocument, ParentheticalGroupDocument, AudioPercolator ] +ESDocumentClassType = Union[ + Type[AudioDocument], + Type[ParentheticalGroupDocument], + Type[AudioPercolator], +] + + ESDictDocument = dict[str, Any] PercolatorResponseType = tuple[list[Hit], ESDictDocument] diff --git a/cl/settings/third_party/elasticsearch.py b/cl/settings/third_party/elasticsearch.py index 64f797ea62..2aedf9a003 100644 --- a/cl/settings/third_party/elasticsearch.py +++ b/cl/settings/third_party/elasticsearch.py @@ -146,3 +146,8 @@ # Percolator batch size for Alerts # #################################### PERCOLATOR_PAGE_SIZE = 100 + +################################################### +# The maximum number of scheduled hits per alert. # +################################################### +SCHEDULED_ALERT_HITS_LIMIT = 30