diff --git a/cl/alerts/management/commands/cl_send_scheduled_alerts.py b/cl/alerts/management/commands/cl_send_scheduled_alerts.py
index 14511fc3ac..f912d9d20a 100644
--- a/cl/alerts/management/commands/cl_send_scheduled_alerts.py
+++ b/cl/alerts/management/commands/cl_send_scheduled_alerts.py
@@ -3,6 +3,7 @@
from typing import DefaultDict
import waffle
+from django.http import QueryDict
from django.utils.timezone import now
from cl.alerts.models import (
@@ -11,8 +12,9 @@
ScheduledAlertHit,
)
from cl.alerts.tasks import send_search_alert_emails
-from cl.alerts.utils import InvalidDateError
+from cl.alerts.utils import InvalidDateError, override_alert_query
from cl.lib.command_utils import VerboseCommand, logger
+from cl.search.models import SEARCH_TYPES
from cl.stats.utils import tally_stat
DAYS_TO_DELETE = 90
@@ -28,6 +30,28 @@ def json_date_parser(dct):
return dct
+def get_cut_off_date(rate: str, d: datetime.date) -> datetime.date | None:
+ """Given a rate of dly, wly or mly and a date, returns the date after for
+ building a daterange filter.
+ :param rate: The alert rate to send Alerts.
+ :param d: The date alerts are run.
+ :return: The cut-off date or None.
+ """
+ cut_off_date = None
+ if rate == Alert.DAILY:
+ cut_off_date = d
+ elif rate == Alert.WEEKLY:
+ cut_off_date = d - datetime.timedelta(days=7)
+ elif rate == Alert.MONTHLY:
+ # Get the first of the month of the previous month regardless of the
+ # current date
+        early_last_month = d - datetime.timedelta(days=d.day)
+        cut_off_date = datetime.date(
+            early_last_month.year, early_last_month.month, 1
+        )
+ return cut_off_date
+
+
def query_and_send_alerts_by_rate(rate: str) -> None:
"""Query and send alerts per user.
@@ -62,6 +86,12 @@ def query_and_send_alerts_by_rate(rate: str) -> None:
documents.append(json_date_parser(result.document_content))
alerts_to_update.append(alert.pk)
+
+ # Override order_by to show the latest items when clicking the
+ # "View Full Results" button.
+ cut_off_date = get_cut_off_date(rate, now_time.date())
+ qd = override_alert_query(alert, cut_off_date)
+ alert.query_run = qd.urlencode() # type: ignore
hits.append(
(
alert,
diff --git a/cl/alerts/tasks.py b/cl/alerts/tasks.py
index 384cdbcec2..fa1aeed46e 100644
--- a/cl/alerts/tasks.py
+++ b/cl/alerts/tasks.py
@@ -17,7 +17,12 @@
)
from cl.alerts.models import Alert, DocketAlert, ScheduledAlertHit
-from cl.alerts.utils import percolate_document, user_has_donated_enough
+from cl.alerts.utils import (
+ alert_hits_limit_reached,
+ override_alert_query,
+ percolate_document,
+ user_has_donated_enough,
+)
from cl.api.models import WebhookEventType
from cl.api.tasks import (
send_docket_alert_webhook_events,
@@ -33,9 +38,9 @@
from cl.recap.constants import COURT_TIMEZONES
from cl.search.constants import ALERTS_HL_TAG
from cl.search.documents import AudioPercolator
-from cl.search.models import Docket, DocketEntry
+from cl.search.models import SEARCH_TYPES, Docket, DocketEntry
from cl.search.types import (
- ESDocumentType,
+ ESDocumentClassType,
PercolatorResponseType,
SaveDocumentResponseType,
SearchAlertHitType,
@@ -477,7 +482,10 @@ def send_search_alert_emails(
continue
alert_user: UserProfile.user = User.objects.get(pk=user_id)
- context = {"hits": hits}
+ context = {
+ "hits": hits,
+ "hits_limit": settings.SCHEDULED_ALERT_HITS_LIMIT,
+ }
txt = txt_template.render(context)
html = html_template.render(context)
msg = EmailMultiAlternatives(
@@ -527,6 +535,11 @@ def process_percolator_response(response: PercolatorResponseType) -> None:
ALERTS_HL_TAG,
)
+ # Override order_by to show the latest items when clicking the
+ # "View Full Results" button.
+ qd = override_alert_query(alert_triggered)
+ alert_triggered.query_run = qd.urlencode() # type: ignore
+
# Compose RT hit to send.
hits = [
(
@@ -554,6 +567,12 @@ def process_percolator_response(response: PercolatorResponseType) -> None:
else:
# Schedule DAILY, WEEKLY and MONTHLY Alerts
+ if alert_hits_limit_reached(
+ alert_triggered.pk, alert_triggered.user.pk
+ ):
+ # Skip storing hits for this alert-user combination because
+ # the SCHEDULED_ALERT_HITS_LIMIT has been reached.
+ continue
scheduled_hits_to_create.append(
ScheduledAlertHit(
user=alert_triggered.user,
@@ -690,7 +709,7 @@ def index_alert_document(
ignore_result=True,
)
def remove_doc_from_es_index(
- self: Task, es_document: ESDocumentType, instance_id: int
+ self: Task, es_document: ESDocumentClassType, instance_id: int
) -> None:
"""Remove a document from an Elasticsearch index.
diff --git a/cl/alerts/templates/alert_email_es.html b/cl/alerts/templates/alert_email_es.html
index 91bc38e48b..4e05888dc9 100644
--- a/cl/alerts/templates/alert_email_es.html
+++ b/cl/alerts/templates/alert_email_es.html
@@ -24,10 +24,10 @@
- Your {{alert.get_rate_display|lower}} {% if type == 'o' %}opinion{% elif type == 'oa' %}oral argument{% endif %} alert — {{alert.name}} — had {{num_results}} hit{{results|pluralize}}:
+ Your {{alert.get_rate_display|lower}} {% if type == 'o' %}opinion{% elif type == 'oa' %}oral argument{% endif %} alert — {{alert.name}} — had {{num_results}}{% if num_results >= hits_limit %}+{% endif %} hit{{results|pluralize}}:
- View Full Results / Edit this Alert
+ View Full Results / Edit this Alert
Disable this Alert (one click)
{% endif %}
diff --git a/cl/alerts/templates/alert_email_es.txt b/cl/alerts/templates/alert_email_es.txt
index 60f9975f07..b94b0db1fa 100644
--- a/cl/alerts/templates/alert_email_es.txt
+++ b/cl/alerts/templates/alert_email_es.txt
@@ -10,9 +10,9 @@ CourtListener.com
We have news regarding your alerts at CourtListener.com
-------------------------------------------------------
-{% for alert, type, results, num_results in hits %}{% for result in results %}{% if forloop.first %}Your {{alert.get_rate_display|lower}} {% if type == 'o' %}opinion{% elif type == 'oa' %}oral argument{% endif %} alert -- {{alert.name}} -- had {{num_results}} hit{{results|pluralize}}:
+{% for alert, type, results, num_results in hits %}{% for result in results %}{% if forloop.first %}Your {{alert.get_rate_display|lower}} {% if type == 'o' %}opinion{% elif type == 'oa' %}oral argument{% endif %} alert -- {{alert.name}} -- had {{num_results}}{% if num_results >= hits_limit %}+{% endif %} hit{{results|pluralize}}:
-------------------------------------------------------
-Edit / Delete this Alert: https://www.courtlistener.com/alert/edit/{{ alert.pk }}/
+View Full Results / Edit this Alert: https://www.courtlistener.com/?{{ alert.query_run|safe }}&edit_alert={{ alert.pk }}
Disable this Alert (one click): https://www.courtlistener.com{% url "disable_alert" alert.secret_key %}{% endif %}
{{forloop.counter}}. {{ result.caseName|safe|striptags }} ({% if result.court_id != 'scotus' %}{{ result.court_citation_string|striptags }} {% endif %}{% if type == 'o' %}{{ result.dateFiled|date:"Y" }}{% elif type == 'oa' %}{{ result.dateArgued|date:"Y" }}{% endif %})
diff --git a/cl/alerts/tests.py b/cl/alerts/tests.py
index 99369492ac..e0275e8d4d 100644
--- a/cl/alerts/tests.py
+++ b/cl/alerts/tests.py
@@ -1,9 +1,11 @@
import json
from collections import defaultdict
-from datetime import timedelta
+from datetime import datetime, timedelta
from unittest import mock
+from urllib.parse import parse_qs, urlparse
import time_machine
+from django.conf import settings
from django.contrib.auth.hashers import make_password
from django.contrib.auth.models import User
from django.core import mail
@@ -11,6 +13,7 @@
from django.test import Client, override_settings
from django.urls import reverse
from django.utils.timezone import now
+from lxml import html
from rest_framework.status import (
HTTP_200_OK,
HTTP_201_CREATED,
@@ -24,6 +27,7 @@
from cl.alerts.factories import AlertFactory, DocketAlertWithParentsFactory
from cl.alerts.management.commands.cl_send_scheduled_alerts import (
DAYS_TO_DELETE,
+ get_cut_off_date,
)
from cl.alerts.management.commands.handle_old_docket_alerts import (
build_user_report,
@@ -588,10 +592,17 @@ def setUpTestData(cls):
cluster__precedential_status=PRECEDENTIAL_STATUS.PUBLISHED,
cluster__date_filed=now() - timedelta(hours=5),
)
- cls.dly_oral_argument = AudioWithParentsFactory.create(
- case_name="Dly Test OA",
- docket__date_argued=now() - timedelta(hours=5),
- )
+ with mock.patch(
+ "cl.scrapers.tasks.microservice",
+ side_effect=lambda *args, **kwargs: MockResponse(200, b"10"),
+ ), mock.patch(
+ "cl.lib.es_signal_processor.avoid_es_audio_indexing",
+ side_effect=lambda x, y, z: False,
+ ):
+ cls.dly_oral_argument = AudioWithParentsFactory.create(
+ case_name="Dly Test OA",
+ docket__date_argued=now() - timedelta(hours=5),
+ )
cls.wly_opinion = OpinionWithParentsFactory.create(
cluster__precedential_status=PRECEDENTIAL_STATUS.PUBLISHED,
@@ -1515,6 +1526,10 @@ def test_get_docket_notes_and_tags_by_user(self) -> None:
@override_switch("oa-es-alerts-active", active=True)
+@mock.patch(
+ "cl.lib.es_signal_processor.avoid_es_audio_indexing",
+ side_effect=lambda x, y, z: False,
+)
class SearchAlertsOAESTests(ESIndexTestCase, TestCase):
"""Test ES Search Alerts"""
@@ -1591,7 +1606,7 @@ def tearDownClass(cls):
Audio.objects.all().delete()
super().tearDownClass()
- def test_alert_frequency_estimation(self):
+ def test_alert_frequency_estimation(self, mock_abort_audio):
"""Test alert frequency ES API endpoint."""
search_params = {
@@ -1625,7 +1640,7 @@ def test_alert_frequency_estimation(self):
self.assertEqual(r.json()["count"], 1)
rt_oral_argument.delete()
- def test_send_oa_search_alert_webhooks(self):
+ def test_send_oa_search_alert_webhooks(self, mock_abort_audio):
"""Can we send RT OA search alerts?"""
with mock.patch(
@@ -1691,6 +1706,13 @@ def test_send_oa_search_alert_webhooks(self):
self.assertIn("19-5735", html_content)
self.assertIn("RT", html_content)
+ # Confirm that order_by is overridden in the 'View Full Results' URL by
+ # dateArgued+desc.
+ view_results_url = html.fromstring(str(html_content)).xpath(
+ '//a[text()="View Full Results / Edit this Alert"]/@href'
+ )
+ self.assertIn("order_by=dateArgued+desc", view_results_url[0])
+
# The right alert type template is used.
self.assertIn("oral argument", html_content)
@@ -1727,7 +1749,7 @@ def test_send_oa_search_alert_webhooks(self):
webhook_events.delete()
rt_oral_argument.delete()
- def test_send_alert_on_document_creation(self):
+ def test_send_alert_on_document_creation(self, mock_abort_audio):
"""Avoid sending Search Alerts on document updates."""
with mock.patch(
"cl.api.webhooks.requests.post",
@@ -1778,7 +1800,7 @@ def test_send_alert_on_document_creation(self):
self.assertEqual(len(webhook_events), 4)
rt_oral_argument.delete()
- def test_es_alert_update_and_delete(self):
+ def test_es_alert_update_and_delete(self, mock_abort_audio):
"""Can we update and delete an alert, and expect these changes to be
properly reflected in Elasticsearch?"""
@@ -1846,10 +1868,6 @@ def send_alerts_by_rate_and_confirm_assertions(
# Confirm Alert date_last_hit is updated.
search_alert.refresh_from_db()
- if previous_date:
- self.assertEqual(search_alert.date_last_hit, previous_date)
- else:
- self.assertEqual(search_alert.date_last_hit, mock_date)
# One OA search alert email should be sent
self.assertEqual(len(mail.outbox), alert_count)
@@ -1859,6 +1877,36 @@ def send_alerts_by_rate_and_confirm_assertions(
# The right alert type template is used.
self.assertIn("oral argument", text_content)
+ # Extract HTML version.
+ html_content = None
+ for content, content_type in mail.outbox[alert_count - 1].alternatives:
+ if content_type == "text/html":
+ html_content = content
+ break
+
+ # Confirm that order_by is overridden in the 'View Full Results'
+ # URL by dateArgued+desc.
+ view_results_url = html.fromstring(str(html_content)).xpath(
+ '//a[text()="View Full Results / Edit this Alert"]/@href'
+ )
+
+ parsed_url = urlparse(view_results_url[0])
+ params = parse_qs(parsed_url.query)
+ self.assertEqual("dateArgued desc", params["order_by"][0])
+
+ if previous_date:
+ self.assertEqual(search_alert.date_last_hit, previous_date)
+ else:
+ self.assertEqual(search_alert.date_last_hit, mock_date)
+
+ # Confirm that argued_after is properly set in the
+ # 'View Full Results' URL.
+ cut_off_date = get_cut_off_date(rate, mock_date.date())
+ date_in_query = datetime.strptime(
+ params["argued_after"][0], "%m/%d/%Y"
+ ).date()
+ self.assertEqual(cut_off_date, date_in_query)
+
# Confirm stored alerts instances status is set to SENT.
scheduled_hits = ScheduledAlertHit.objects.filter(alert__rate=rate)
for scheduled_hit in scheduled_hits:
@@ -1866,7 +1914,7 @@ def send_alerts_by_rate_and_confirm_assertions(
scheduled_hit.hit_status, SCHEDULED_ALERT_HIT_STATUS.SENT
)
- def test_send_alert_multiple_alert_rates(self):
+ def test_send_alert_multiple_alert_rates(self, mock_abort_audio):
"""Confirm dly, wly and mly alerts are properly stored and sent
according to their periodicity.
"""
@@ -1972,7 +2020,7 @@ def test_send_alert_multiple_alert_rates(self):
@mock.patch(
"cl.alerts.management.commands.cl_send_scheduled_alerts.logger"
)
- def test_group_alerts_and_hits(self, mock_logger):
+ def test_group_alerts_and_hits(self, mock_logger, mock_abort_audio):
""""""
with mock.patch(
"cl.api.webhooks.requests.post",
@@ -2075,7 +2123,7 @@ def test_group_alerts_and_hits(self, mock_logger):
rt_oral_argument_3.delete()
@override_settings(PERCOLATOR_PAGE_SIZE=5)
- def test_send_multiple_rt_alerts(self):
+ def test_send_multiple_rt_alerts(self, mock_abort_audio):
"""Confirm all RT alerts are properly sent if the percolator response
contains more than PERCOLATOR_PAGE_SIZE results. So additional
requests are performed in order to retrieve all the available results.
@@ -2167,7 +2215,7 @@ def test_send_multiple_rt_alerts(self):
alert.delete()
@override_settings(PERCOLATOR_PAGE_SIZE=5)
- def test_batched_alerts_match_documents_ingestion(self):
+ def test_batched_alerts_match_documents_ingestion(self, mock_abort_audio):
"""Confirm that batched alerts are properly stored according to
document ingestion when percolated in real time.
"""
@@ -2251,7 +2299,7 @@ def test_batched_alerts_match_documents_ingestion(self):
alert.delete()
@override_settings(PERCOLATOR_PAGE_SIZE=5)
- def test_percolate_document_in_batches(self):
+ def test_percolate_document_in_batches(self, mock_abort_audio):
"""Confirm when getting alerts in batches and an alert previously
retrieved is updated during this process. It's not returned again.
"""
@@ -2308,7 +2356,9 @@ def test_percolate_document_in_batches(self):
self.assertNotIn(result.id, ids_in_results)
ids_in_results.append(result.id)
- def test_avoid_sending_or_scheduling_disabled_alerts(self):
+ def test_avoid_sending_or_scheduling_disabled_alerts(
+ self, mock_abort_audio
+ ):
"""Can we avoid sending or_scheduling disabled search alerts?."""
rt_alert_disabled = AlertFactory(
@@ -2351,7 +2401,7 @@ def test_avoid_sending_or_scheduling_disabled_alerts(self):
dly_alert_disabled.delete()
oral_argument.delete()
- def test_avoid_re_sending_scheduled_sent_alerts(self):
+ def test_avoid_re_sending_scheduled_sent_alerts(self, mock_abort_audio):
"""Can we prevent re-sending scheduled alerts that have already been
sent?"""
@@ -2494,6 +2544,87 @@ def test_avoid_re_sending_scheduled_sent_alerts(self):
oral_argument_2.delete()
oral_argument_3.delete()
+ @override_settings(SCHEDULED_ALERT_HITS_LIMIT=3)
+ def test_scheduled_hits_limit(self, mock_abort_audio):
+ """Confirm we only store the max number of alert hits set in settings."""
+
+ alert_1 = AlertFactory(
+ user=self.user_profile.user,
+ rate=Alert.DAILY,
+ name="Test Alert OA Daily 1",
+ query="q=USA+vs+Bank+&type=oa",
+ )
+ alert_2 = AlertFactory(
+ user=self.user_profile.user,
+ rate=Alert.DAILY,
+ name="Test Alert OA Daily 2",
+ query="q=Texas+vs+Corp&type=oa",
+ )
+ alert_3 = AlertFactory(
+ user=self.user_profile_2.user,
+ rate=Alert.DAILY,
+ name="Test Alert OA Daily 3",
+ query="q=Texas+vs+Corp&type=oa",
+ )
+
+ oa_created = []
+ for i in range(4):
+ oral_argument = AudioWithParentsFactory.create(
+ case_name="USA vs Bank",
+ docket__court=self.court_1,
+ docket__date_argued=now().date(),
+ docket__docket_number="19-5735",
+ )
+ oa_created.append(oral_argument)
+
+ if i in (1, 2):
+ # Only schedule two hits for this one.
+ oral_argument_2 = AudioWithParentsFactory.create(
+ case_name="Texas vs Corp",
+ docket__court=self.court_1,
+ docket__date_argued=now().date(),
+ docket__docket_number="20-5030",
+ )
+ oa_created.append(oral_argument_2)
+
+ # Call dly command
+ call_command("cl_send_scheduled_alerts", rate=Alert.DAILY)
+
+ # Two emails should be sent, one for user_profile and one for user_profile_2
+ self.assertEqual(len(mail.outbox), 2)
+
+ # Confirm emails contains the hits+ count.
+ self.assertIn(
+ f"had {settings.SCHEDULED_ALERT_HITS_LIMIT}+ hits",
+ mail.outbox[0].body,
+ )
+ # No "+" if hits do not reach the SCHEDULED_ALERT_HITS_LIMIT.
+ self.assertIn(
+ f"had 2 hits",
+ mail.outbox[1].body,
+ )
+
+ # Confirm each email contains the max number of hits for each alert.
+ self.assertEqual(
+ mail.outbox[0].body.count("USA vs Bank"),
+ settings.SCHEDULED_ALERT_HITS_LIMIT,
+ )
+ self.assertEqual(
+ mail.outbox[0].body.count("Texas vs Corp"),
+ 2,
+ )
+ self.assertEqual(
+ mail.outbox[1].body.count("Texas vs Corp"),
+ 2,
+ )
+
+ for oa in oa_created:
+ oa.delete()
+
+ alert_1.delete()
+ alert_2.delete()
+ alert_3.delete()
+
@override_settings(ELASTICSEARCH_DISABLED=True)
class SearchAlertsIndexingCommandTests(ESIndexTestCase, TestCase):
diff --git a/cl/alerts/utils.py b/cl/alerts/utils.py
index f9f958bc80..6eaa5206f1 100644
--- a/cl/alerts/utils.py
+++ b/cl/alerts/utils.py
@@ -1,10 +1,17 @@
from dataclasses import dataclass
+from datetime import date
from django.conf import settings
+from django.http import QueryDict
from elasticsearch_dsl import Q, Search
from elasticsearch_dsl.response import Response
-from cl.alerts.models import Alert, DocketAlert
+from cl.alerts.models import (
+ SCHEDULED_ALERT_HIT_STATUS,
+ Alert,
+ DocketAlert,
+ ScheduledAlertHit,
+)
from cl.lib.command_utils import logger
from cl.lib.elasticsearch_utils import add_es_highlighting
from cl.search.documents import AudioPercolator
@@ -107,3 +114,50 @@ def user_has_donated_enough(
)
return False
return True
+
+
+def override_alert_query(
+ alert: Alert, cut_off_date: date | None = None
+) -> QueryDict:
+ """Override the query parameters for a given alert based on its type and an
+ optional cut-off date.
+
+ :param alert: The Alert object for which the query will be overridden.
+ :param cut_off_date: An optional date used to set a threshold in the query.
+ :return: A QueryDict object containing the modified query parameters.
+ """
+
+ qd = QueryDict(alert.query.encode(), mutable=True)
+ if alert.alert_type == SEARCH_TYPES.ORAL_ARGUMENT:
+ qd["order_by"] = "dateArgued desc"
+ if cut_off_date:
+ qd["argued_after"] = cut_off_date.strftime("%m/%d/%Y")
+ else:
+ qd["order_by"] = "dateFiled desc"
+ if cut_off_date:
+ qd["filed_after"] = cut_off_date.strftime("%m/%d/%Y")
+
+ return qd
+
+
+def alert_hits_limit_reached(alert_pk: int, user_pk: int) -> bool:
+ """Check if the alert hits limit has been reached for a specific alert-user
+ combination.
+
+ :param alert_pk: The alert_id.
+ :param user_pk: The user_id.
+ :return: True if the limit has been reached, otherwise False.
+ """
+
+ stored_hits = ScheduledAlertHit.objects.filter(
+ alert_id=alert_pk,
+ user_id=user_pk,
+ hit_status=SCHEDULED_ALERT_HIT_STATUS.SCHEDULED,
+ )
+ hits_count = stored_hits.count()
+ if hits_count >= settings.SCHEDULED_ALERT_HITS_LIMIT:
+ logger.info(
+ f"Skipping hit for Alert ID: {alert_pk}, there are {hits_count} hits stored for this alert."
+ )
+ return True
+ return False
diff --git a/cl/audio/tests.py b/cl/audio/tests.py
index e570621cba..95106336f5 100644
--- a/cl/audio/tests.py
+++ b/cl/audio/tests.py
@@ -1,3 +1,5 @@
+from unittest import mock
+
from django.urls import reverse
from lxml import etree
@@ -24,24 +26,28 @@ def setUpTestData(cls) -> None:
jurisdiction="F",
citation_string="Appeals. CA8.",
)
- cls.audio = AudioWithParentsFactory.create(
- docket=DocketFactory(court=cls.court_1),
- local_path_mp3__data=ONE_SECOND_MP3_BYTES,
- local_path_original_file__data=ONE_SECOND_MP3_BYTES,
- duration=1,
- )
- AudioWithParentsFactory.create(
- docket=cls.audio.docket,
- local_path_mp3__data=SMALL_WAV_BYTES,
- local_path_original_file__data=SMALL_WAV_BYTES,
- duration=0,
- )
- AudioWithParentsFactory.create(
- docket=DocketFactory(court=cls.court_2),
- local_path_mp3__data=SMALL_WAV_BYTES,
- local_path_original_file__data=SMALL_WAV_BYTES,
- duration=5,
- )
+ with mock.patch(
+ "cl.lib.es_signal_processor.avoid_es_audio_indexing",
+ side_effect=lambda x, y, z: False,
+ ):
+ cls.audio = AudioWithParentsFactory.create(
+ docket=DocketFactory(court=cls.court_1),
+ local_path_mp3__data=ONE_SECOND_MP3_BYTES,
+ local_path_original_file__data=ONE_SECOND_MP3_BYTES,
+ duration=1,
+ )
+ AudioWithParentsFactory.create(
+ docket=cls.audio.docket,
+ local_path_mp3__data=SMALL_WAV_BYTES,
+ local_path_original_file__data=SMALL_WAV_BYTES,
+ duration=0,
+ )
+ AudioWithParentsFactory.create(
+ docket=DocketFactory(court=cls.court_2),
+ local_path_mp3__data=SMALL_WAV_BYTES,
+ local_path_original_file__data=SMALL_WAV_BYTES,
+ duration=5,
+ )
def test_do_jurisdiction_podcasts_have_good_content(self) -> None:
"""Can we simply load a jurisdiction podcast page?"""
diff --git a/cl/lib/es_signal_processor.py b/cl/lib/es_signal_processor.py
index e9331fe15f..3cf2d03e75 100644
--- a/cl/lib/es_signal_processor.py
+++ b/cl/lib/es_signal_processor.py
@@ -9,14 +9,19 @@
remove_doc_from_es_index,
send_or_schedule_alerts,
)
+from cl.audio.models import Audio
from cl.lib.elasticsearch_utils import elasticsearch_enabled
from cl.search.documents import AudioDocument
from cl.search.tasks import save_document_in_es, update_document_in_es
-from cl.search.types import ESDocumentType, ESModelType
+from cl.search.types import (
+ ESDocumentClassType,
+ ESDocumentInstanceType,
+ ESModelType,
+)
def updated_fields(
- instance: ESModelType, es_document: ESDocumentType
+ instance: ESModelType, es_document: ESDocumentClassType
) -> list[str]:
"""Look for changes in the tracked fields of an instance.
:param instance: The instance to check for changed fields.
@@ -60,7 +65,7 @@ def get_fields_to_update(
def document_fields_to_update(
- main_doc: ESDocumentType,
+ main_doc: ESDocumentInstanceType,
main_object: ESModelType,
field_list: list[str],
instance: ESModelType,
@@ -96,8 +101,8 @@ def document_fields_to_update(
def get_or_create_doc(
- es_document: ESDocumentType, instance: ESModelType
-) -> ESDocumentType | None:
+ es_document: ESDocumentClassType, instance: ESModelType
+) -> ESDocumentInstanceType | None:
"""Get or create a document in Elasticsearch.
:param es_document: The Elasticsearch document type.
:param instance: The instance of the document to get or create.
@@ -113,7 +118,7 @@ def get_or_create_doc(
def update_es_documents(
main_model: ESModelType,
- es_document: ESDocumentType,
+ es_document: ESDocumentClassType,
instance: ESModelType,
created: bool,
mapping_fields: dict,
@@ -155,7 +160,7 @@ def update_es_documents(
def update_remove_m2m_documents(
main_model: ESModelType,
- es_document: ESDocumentType,
+ es_document: ESDocumentClassType,
instance: ESModelType,
mapping_fields: dict,
affected_field: str,
@@ -186,7 +191,7 @@ def update_remove_m2m_documents(
def update_m2m_field_in_es_document(
instance: ESModelType,
- es_document: ESDocumentType,
+ es_document: ESDocumentClassType,
affected_field: str,
) -> None:
"""Update a single field created using a many-to-many relationship.
@@ -205,7 +210,7 @@ def update_m2m_field_in_es_document(
def update_reverse_related_documents(
main_model: ESModelType,
- es_document: ESDocumentType,
+ es_document: ESDocumentClassType,
instance: ESModelType,
query_string: str,
affected_fields: list[str],
@@ -235,6 +240,35 @@ def update_reverse_related_documents(
)
+def avoid_es_audio_indexing(
+ instance: ESModelType,
+ es_document: ESDocumentClassType,
+ update_fields: list[str] | None,
+):
+ """Check conditions to abort Elasticsearch indexing for Audio instances.
+    Avoid indexing Audio instances whose MP3 file has not yet been
+    processed by process_audio_file.
+
+ :param instance: The Audio instance to evaluate for Elasticsearch indexing.
+ :param es_document: The Elasticsearch document class.
+ :param update_fields: List of fields being updated, or None.
+ :return: True if indexing should be avoided, False otherwise.
+ """
+
+ if (
+        isinstance(instance, Audio)
+ and not es_document.exists(instance.pk)
+ and (
+ not update_fields
+            or "processing_complete" not in update_fields
+ )
+ ):
+ # Avoid indexing Audio instances that haven't been previously indexed
+ # in ES and for which 'processing_complete' is not present in update_fields.
+ return True
+ return False
+
+
class ESSignalProcessor(object):
"""Custom signal processor for Elasticsearch documents. It is responsible
for managing the Elasticsearch index after certain events happen, such as
@@ -299,7 +333,14 @@ def connect_signals(models, handler, signal_to_uid_mapping, weak=False):
)
@elasticsearch_enabled
- def handle_save(self, sender, instance=None, created=False, **kwargs):
+ def handle_save(
+ self,
+ sender,
+ instance=None,
+ created=False,
+ update_fields=None,
+ **kwargs,
+ ):
"""Receiver function that gets called after an object instance is saved"""
mapping_fields = self.documents_model_mapping["save"][sender]
if not created:
@@ -311,6 +352,13 @@ def handle_save(self, sender, instance=None, created=False, **kwargs):
mapping_fields,
)
if not mapping_fields:
+ if avoid_es_audio_indexing(
+ instance, self.es_document, update_fields
+ ):
+ # This check is required to avoid indexing and triggering
+ # search alerts for Audio instances whose MP3 files have not
+ # yet been processed by process_audio_file.
+ return None
chain(
save_document_in_es.si(instance, self.es_document),
send_or_schedule_alerts.s(self.es_document._index._name),
diff --git a/cl/scrapers/management/commands/cl_scrape_oral_arguments.py b/cl/scrapers/management/commands/cl_scrape_oral_arguments.py
index f78fde0032..6f68227e1f 100644
--- a/cl/scrapers/management/commands/cl_scrape_oral_arguments.py
+++ b/cl/scrapers/management/commands/cl_scrape_oral_arguments.py
@@ -172,9 +172,7 @@ def scrape_court(
index=False,
backscrape=backscrape,
)
- process_audio_file.apply_async(
- (audio_file.pk,), countdown=random.randint(0, 3600)
- )
+ process_audio_file.delay(audio_file.pk)
logger.info(
"Successfully added audio file {pk}: {name}".format(
diff --git a/cl/scrapers/tasks.py b/cl/scrapers/tasks.py
index 186d2101fe..b5434aaddb 100644
--- a/cl/scrapers/tasks.py
+++ b/cl/scrapers/tasks.py
@@ -368,7 +368,13 @@ def process_audio_file(self, pk) -> None:
).text
)
audio_obj.processing_complete = True
- audio_obj.save()
+ audio_obj.save(
+ update_fields=[
+ "duration",
+ "local_path_mp3",
+ "processing_complete",
+ ]
+ )
@app.task(
diff --git a/cl/scrapers/tests.py b/cl/scrapers/tests.py
index 4f7dfafe54..6d27b8c871 100644
--- a/cl/scrapers/tests.py
+++ b/cl/scrapers/tests.py
@@ -8,8 +8,14 @@
from django.core.files.base import ContentFile
from django.utils.timezone import now
+from cl.alerts.factories import AlertFactory
+from cl.alerts.models import Alert
+from cl.api.factories import WebhookFactory
+from cl.api.models import WebhookEvent, WebhookEventType
from cl.audio.factories import AudioWithParentsFactory
from cl.audio.models import Audio
+from cl.donate.factories import DonationFactory
+from cl.donate.models import Donation
from cl.lib.microservice_utils import microservice
from cl.scrapers.DupChecker import DupChecker
from cl.scrapers.management.commands import (
@@ -24,14 +30,34 @@
from cl.search.factories import CourtFactory, DocketFactory
from cl.search.models import Court, Docket, Opinion
from cl.settings import MEDIA_ROOT
-from cl.tests.cases import SimpleTestCase, TestCase
+from cl.tests.cases import ESIndexTestCase, SimpleTestCase, TestCase
from cl.tests.fixtures import ONE_SECOND_MP3_BYTES, SMALL_WAV_BYTES
+from cl.users.factories import UserProfileWithParentsFactory
-class ScraperIngestionTest(TestCase):
+class ScraperIngestionTest(ESIndexTestCase, TestCase):
@classmethod
def setUpTestData(cls) -> None:
cls.court = CourtFactory(id="test", jurisdiction="F")
+ cls.user_profile = UserProfileWithParentsFactory()
+ cls.donation = DonationFactory(
+ donor=cls.user_profile.user,
+ amount=20,
+ status=Donation.PROCESSED,
+ send_annual_reminder=True,
+ )
+ cls.webhook_enabled = WebhookFactory(
+ user=cls.user_profile.user,
+ event_type=WebhookEventType.SEARCH_ALERT,
+ url="https://example.com/",
+ enabled=True,
+ )
+ cls.search_alert_oa = AlertFactory(
+ user=cls.user_profile.user,
+ rate=Alert.DAILY,
+ name="Test Alert OA",
+ query="type=oa",
+ )
def test_extension(self):
r = async_to_sync(microservice)(
@@ -128,6 +154,32 @@ def test_ingest_oral_arguments(self) -> None:
d_1.refresh_from_db()
self.assertEqual(d_1.case_name, "Jeremy v. Julian")
+ # Confirm that OA Search Alerts are properly triggered after an OA is
+ # scraped and its MP3 file is processed.
+ # Two webhook events should be sent, both of them to user_profile user
+ webhook_events = WebhookEvent.objects.all()
+ self.assertEqual(len(webhook_events), 2)
+
+ cases_names = ["Jeremy v. Julian", "Ander v. Leo"]
+ for webhook_sent in webhook_events:
+ self.assertEqual(
+ webhook_sent.webhook.user,
+ self.user_profile.user,
+ )
+ content = webhook_sent.content
+ # Check if the webhook event payload is correct.
+ self.assertEqual(
+ content["webhook"]["event_type"],
+ WebhookEventType.SEARCH_ALERT,
+ )
+ self.assertIn(
+ content["payload"]["results"][0]["caseName"], cases_names
+ )
+ self.assertIsNotNone(content["payload"]["results"][0]["duration"])
+ self.assertIsNotNone(
+ content["payload"]["results"][0]["local_path"]
+ )
+
def test_parsing_xml_opinion_site_to_site_object(self) -> None:
"""Does a basic parse of a site reveal the right number of items?"""
site = test_opinion_scraper.Site().parse()
diff --git a/cl/search/tasks.py b/cl/search/tasks.py
index 7f229bfca0..c145da47b7 100644
--- a/cl/search/tasks.py
+++ b/cl/search/tasks.py
@@ -19,7 +19,8 @@
from cl.search.documents import AudioDocument
from cl.search.models import Docket, OpinionCluster, RECAPDocument
from cl.search.types import (
- ESDocumentType,
+ ESDocumentClassType,
+ ESDocumentInstanceType,
ESModelType,
SaveDocumentResponseType,
)
@@ -179,9 +180,12 @@ def delete_items(items, app_label, force_commit=False):
interval_start=5,
)
def save_document_in_es(
- self: Task, instance: ESModelType, es_document: ESDocumentType
+ self: Task,
+ instance: ESModelType,
+ es_document: ESDocumentClassType,
) -> SaveDocumentResponseType | None:
"""Save a document in Elasticsearch using a provided callable.
+
:param self: The celery task
:param instance: The instance of the document to save.
:param es_document: A Elasticsearch DSL document.
@@ -218,7 +222,7 @@ def save_document_in_es(
)
def update_document_in_es(
self: Task,
- es_document: ESDocumentType,
+ es_document: ESDocumentInstanceType,
fields_values_to_update: dict[str, Any],
) -> None:
"""Update a document in Elasticsearch.
diff --git a/cl/search/tests/tests_es_oral_arguments.py b/cl/search/tests/tests_es_oral_arguments.py
index a7e9a845fe..77bf1a0cab 100644
--- a/cl/search/tests/tests_es_oral_arguments.py
+++ b/cl/search/tests/tests_es_oral_arguments.py
@@ -1,4 +1,5 @@
import datetime
+from unittest import mock
from django.core.cache import cache
from django.db import transaction
@@ -1106,7 +1107,11 @@ def test_oa_results_api_fields_es(self) -> None:
msg=f"Key {key} not found in the result object.",
)
- def test_oa_results_pagination(self) -> None:
+ @mock.patch(
+ "cl.lib.es_signal_processor.avoid_es_audio_indexing",
+ side_effect=lambda x, y, z: False,
+ )
+ def test_oa_results_pagination(self, mock_abort_audio) -> None:
created_audios = []
audios_to_create = 20
for i in range(audios_to_create):
@@ -1705,7 +1710,11 @@ def test_stemming_disable_search(self) -> None:
self.assertEqual(actual, expected)
self.assertIn("Freedom of", r.content.decode())
- def test_keep_in_sync_related_OA_objects(self) -> None:
+ @mock.patch(
+ "cl.lib.es_signal_processor.avoid_es_audio_indexing",
+ side_effect=lambda x, y, z: False,
+ )
+ def test_keep_in_sync_related_OA_objects(self, mock_abort_audio) -> None:
"""Test Audio documents are updated when related objects change."""
with transaction.atomic():
docket_5 = DocketFactory.create(
diff --git a/cl/search/types.py b/cl/search/types.py
index 9ff09547de..0fe330fbc7 100644
--- a/cl/search/types.py
+++ b/cl/search/types.py
@@ -1,4 +1,4 @@
-from typing import Any, Union
+from typing import Any, Type, Union
from elasticsearch_dsl.response import Hit
@@ -27,10 +27,18 @@
ParentheticalGroup,
Audio,
]
-ESDocumentType = Union[
+
+ESDocumentInstanceType = Union[
AudioDocument, ParentheticalGroupDocument, AudioPercolator
]
+ESDocumentClassType = Union[
+ Type[AudioDocument],
+ Type[ParentheticalGroupDocument],
+ Type[AudioPercolator],
+]
+
+
ESDictDocument = dict[str, Any]
PercolatorResponseType = tuple[list[Hit], ESDictDocument]
diff --git a/cl/settings/third_party/elasticsearch.py b/cl/settings/third_party/elasticsearch.py
index 64f797ea62..2aedf9a003 100644
--- a/cl/settings/third_party/elasticsearch.py
+++ b/cl/settings/third_party/elasticsearch.py
@@ -146,3 +146,8 @@
# Percolator batch size for Alerts #
####################################
PERCOLATOR_PAGE_SIZE = 100
+
+###################################################
+# The maximum number of scheduled hits per alert. #
+###################################################
+SCHEDULED_ALERT_HITS_LIMIT = 30