Commit

Merge branch 'main' into update-cl-xml-from-cap
jtmst authored Oct 28, 2024
2 parents 931f2cd + 5ac88cd commit ebf53e8
Showing 38 changed files with 3,686 additions and 566 deletions.
5 changes: 5 additions & 0 deletions .github/workflows/docker-build.yml
@@ -121,6 +121,11 @@ jobs:
- name: Watch cl-retry-webhooks rollout status
run: kubectl rollout status -n ${{ env.EKS_NAMESPACE }} deployment/cl-retry-webhooks

- name: Rollout cl-send-rt-percolator-alerts
run: kubectl set image -n ${{ env.EKS_NAMESPACE }} deployment/cl-send-rt-percolator-alerts cl-send-rt-percolator-alerts=freelawproject/courtlistener:${{ steps.vars.outputs.sha_short }}-prod
- name: Watch cl-send-rt-percolator-alerts rollout status
run: kubectl rollout status -n ${{ env.EKS_NAMESPACE }} deployment/cl-send-rt-percolator-alerts

- name: Rollout cl-es-sweep-indexer
run: kubectl set image -n ${{ env.EKS_NAMESPACE }} deployment/cl-es-sweep-indexer sweep-indexer=freelawproject/courtlistener:${{ steps.vars.outputs.sha_short }}-prod
- name: Watch cl-es-sweep-indexer rollout status
59 changes: 44 additions & 15 deletions .github/workflows/tests.yml
@@ -11,6 +11,39 @@ on:
jobs:
build:
runs-on: ubuntu-latest
steps:
- name: Check out CourtListener
uses: actions/checkout@v4
with:
path: courtlistener

# Build docker image
- name: Set up docker Buildx
uses: docker/setup-buildx-action@v3
- name: Build docker image
uses: docker/build-push-action@v6
with:
context: ./courtlistener
file: ./courtlistener/docker/django/Dockerfile
push: false # This image is for testing only
tags: courtlistener:latest
outputs: type=docker,dest=/tmp/courtlistener.tar
build-args: |
BUILD_ENV=dev
cache-from: type=gha
cache-to: type=gha,mode=max

- name: Upload artifact
uses: actions/upload-artifact@v4
with:
name: courtlistener
path: /tmp/courtlistener.tar
compression-level: 0
retention-days: 1

test:
runs-on: ubuntu-latest
needs: build
strategy:
fail-fast: false
matrix:
@@ -50,30 +83,25 @@ jobs:
- name: Echo github actor name for debugging
run: echo ${{ github.actor }}

# Build and cache docker images so tests are always run on the latest
# dependencies
- name: Set up docker Buildx
uses: docker/setup-buildx-action@v3
with:
driver-opts: network=host
- name: Prebuild docker images
uses: docker/build-push-action@v6
with:
context: ./courtlistener
file: ./courtlistener/docker/django/Dockerfile
load: true
build-args: |
BUILD_ENV=dev
cache-from: type=gha
cache-to: type=gha,mode=max

# Prepare Docker images
- name: Pull docker images
working-directory: courtlistener/docker/courtlistener
run: docker compose pull --quiet --ignore-buildable
- name: Build docker images
- name: Download courtlistener image
uses: actions/download-artifact@v4
with:
name: courtlistener
path: /tmp
- name: Load courtlistener image
run: docker load --input /tmp/courtlistener.tar
- name: Build cl-postgresql images # TODO: replace with an off-the-shelf image. Until then, build now for later steps.
working-directory: courtlistener/docker/courtlistener
run: docker compose build
run: docker compose build cl-postgresql
- name: List docker images
run: docker image ls -a --no-trunc

@@ -82,7 +110,8 @@
run: docker network create -d bridge --attachable cl_net_overlay
- name: Start docker containers
working-directory: courtlistener/docker/courtlistener
run: docker compose -f docker-compose.yml -f docker-compose.tmpfs.yml up -d --no-build --pull=never
run: > # Don't build; use the image loaded from the build job, injected by merging an overriding compose config
docker compose -f docker-compose.yml -f docker-compose.tmpfs.yml -f <(echo 'services: { cl-django: { image: "courtlistener" }, cl-celery: { image: "courtlistener" } }') up -d --no-build --pull=never
- name: List docker containers
run: docker ps -a --no-trunc
- name: Show the docker startup logs
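
Note: the inline override passed via process substitution in the "Start docker containers" step above is equivalent to a small compose override file along these lines (filename hypothetical; the service and image names come verbatim from the echoed YAML):

# docker-compose.images.yml (hypothetical name): same effect as the <(echo ...) override
services:
  cl-django:
    image: "courtlistener"
  cl-celery:
    image: "courtlistener"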
10 changes: 8 additions & 2 deletions cl/alerts/management/commands/cl_send_recap_alerts.py
@@ -7,6 +7,7 @@
import pytz
from asgiref.sync import async_to_sync
from django.contrib.auth.models import User
from django.contrib.contenttypes.models import ContentType
from django.http import QueryDict
from django.utils import timezone
from elasticsearch import Elasticsearch
@@ -21,8 +22,8 @@
from cl.alerts.utils import (
TaskCompletionStatus,
add_document_hit_to_alert_set,
alert_hits_limit_reached,
has_document_alert_hit_been_triggered,
scheduled_alert_hits_limit_reached,
)
from cl.api.models import WebhookEventType
from cl.api.tasks import send_search_alert_webhook_es
@@ -625,6 +626,9 @@ def query_and_schedule_alerts(
"""

alert_users = User.objects.filter(alerts__rate=rate).distinct()
docket_content_type = ContentType.objects.get(
app_label="search", model="docket"
)
for user in alert_users:
alerts = user.alerts.filter(rate=rate, alert_type=SEARCH_TYPES.RECAP)
logger.info(f"Running '{rate}' alerts for user '{user}': {alerts}")
@@ -644,7 +648,7 @@
continue
for hit in results_to_send:
# Schedule DAILY, WEEKLY and MONTHLY Alerts
if alert_hits_limit_reached(alert.pk, user.pk):
if scheduled_alert_hits_limit_reached(alert.pk, user.pk):
# Skip storing hits for this alert-user combination because
# the SCHEDULED_ALERT_HITS_LIMIT has been reached.
continue
@@ -662,6 +666,8 @@
user=user,
alert=alert,
document_content=hit_copy.to_dict(),
content_type=docket_content_type,
object_id=hit_copy.docket_id,
)
)
# Send webhooks
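
The new content_type/object_id kwargs follow Django's generic-relations pattern, and resolving the Docket ContentType once before the user loop avoids a per-hit database query. A minimal sketch of the model shape this implies; the field definitions are assumptions, not shown in this diff:

from django.contrib.contenttypes.fields import GenericForeignKey
from django.contrib.contenttypes.models import ContentType
from django.db import models

class ScheduledAlertHit(models.Model):
    # Assumed fields: the kwargs above suggest a generic foreign key
    # pointing each hit at its search.Docket row.
    content_type = models.ForeignKey(ContentType, on_delete=models.CASCADE)
    object_id = models.PositiveIntegerField()
    related_object = GenericForeignKey("content_type", "object_id")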
36 changes: 36 additions & 0 deletions cl/alerts/management/commands/cl_send_rt_percolator_alerts.py
@@ -0,0 +1,36 @@
import time

from cl.alerts.management.commands.cl_send_scheduled_alerts import (
query_and_send_alerts_by_rate,
)
from cl.alerts.models import Alert
from cl.lib.command_utils import VerboseCommand


class Command(VerboseCommand):
help = """Send real-time alerts scheduled by the Percolator every 5 minutes.
This process is performed to accumulate alerts that can be grouped into a
single email if they belong to the same user. """

def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.options = {}

def add_arguments(self, parser):
parser.add_argument(
"--testing-mode",
action="store_true",
help="Use this flag for testing purposes.",
)

def handle(self, *args, **options):
super().handle(*args, **options)
testing_mode = options.get("testing_mode", False)
while True:
query_and_send_alerts_by_rate(Alert.REAL_TIME)
if testing_mode:
# Perform only 1 iteration for testing purposes.
break

# Wait for 5 minutes.
time.sleep(300)
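
Since --testing-mode breaks out of the loop after a single pass, the command can be exercised synchronously; a sketch using Django's standard call_command (invocation assumed, not part of this diff):

from django.core.management import call_command

# Runs one iteration of query_and_send_alerts_by_rate(Alert.REAL_TIME), then returns.
call_command("cl_send_rt_percolator_alerts", testing_mode=True)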
130 changes: 86 additions & 44 deletions cl/alerts/management/commands/cl_send_scheduled_alerts.py
@@ -1,9 +1,10 @@
import datetime
from collections import defaultdict
from typing import DefaultDict
from typing import Any, DefaultDict

import waffle
from asgiref.sync import async_to_sync
from django.conf import settings

from cl.alerts.models import (
SCHEDULED_ALERT_HIT_STATUS,
@@ -13,6 +14,8 @@
from cl.alerts.tasks import send_search_alert_emails
from cl.alerts.utils import InvalidDateError, override_alert_query
from cl.lib.command_utils import VerboseCommand, logger
from cl.search.models import SEARCH_TYPES
from cl.search.types import ESDictDocument
from cl.stats.utils import tally_stat

DAYS_TO_DELETE = 90
@@ -50,6 +53,30 @@ def get_cut_off_date(rate: str, d: datetime.date) -> datetime.date | None:
return cut_off_date


def merge_alert_child_documents(
documents: list[ESDictDocument],
) -> ESDictDocument:
"""Merge multiple child hits within the same main document.
:param documents: A list of document hits.
:return: The main document dictionary with its child documents merged.
"""

main_document = documents[0].copy()
child_docs: list[ESDictDocument] = []
for doc in documents:
if "child_docs" in doc:
child_docs.extend(doc["child_docs"])
if len(child_docs) >= settings.RECAP_CHILD_HITS_PER_RESULT:
# Nested child limit reached. Set child_remaining to True to show
# the "View Additional Hits" button.
main_document["child_remaining"] = True
break

if child_docs:
main_document["child_docs"] = child_docs
return main_document
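
A toy walk-through of the merge, pretending RECAP_CHILD_HITS_PER_RESULT is 2 (values hypothetical):

docs = [
    {"docket_id": 1, "child_docs": [{"id": 10}]},
    {"docket_id": 1, "child_docs": [{"id": 11}, {"id": 12}]},
]
merged = merge_alert_child_documents(docs)
# merged["child_docs"] == [{"id": 10}, {"id": 11}, {"id": 12}]
# merged["child_remaining"] is True: the second extend reached 3 >= 2 and broke the loop.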


def query_and_send_alerts_by_rate(rate: str) -> None:
"""Query and send alerts per user.
@@ -59,58 +86,77 @@ def query_and_send_alerts_by_rate(rate: str) -> None:

alerts_sent_count = 0
now_time = datetime.datetime.now()
alerts_to_update = []
scheduled_hits_rate = ScheduledAlertHit.objects.filter(
alert__rate=rate, hit_status=SCHEDULED_ALERT_HIT_STATUS.SCHEDULED
).select_related("user", "alert")

# Create a nested dictionary structure to hold the groups.
grouped_hits: DefaultDict[
int, DefaultDict[Alert, list[ScheduledAlertHit]]
] = defaultdict(lambda: defaultdict(list))

# Group scheduled hits by User and Alert.
for hit in scheduled_hits_rate:
user_id = hit.user.pk
alert = hit.alert
grouped_hits[user_id][alert].append(hit)

for user_id, alerts in grouped_hits.items():
# Get unique alert users with scheduled alert hits
user_ids = (
ScheduledAlertHit.objects.filter(
alert__rate=rate, hit_status=SCHEDULED_ALERT_HIT_STATUS.SCHEDULED
)
.values_list("user", flat=True)
.distinct()
)

for user_id in user_ids:
# Query ScheduledAlertHits for every user.
scheduled_hits = ScheduledAlertHit.objects.filter(
user_id=user_id,
alert__rate=rate,
hit_status=SCHEDULED_ALERT_HIT_STATUS.SCHEDULED,
).select_related("user", "alert")

# Group scheduled hits by Alert and the main_doc_id
grouped_hits: DefaultDict[
Alert, DefaultDict[int, list[dict[str, Any]]]
] = defaultdict(lambda: defaultdict(list))
alerts_to_update = set()
for hit in scheduled_hits:
alert = hit.alert
doc_content = json_date_parser(hit.document_content)
match hit.alert.alert_type:
case SEARCH_TYPES.RECAP:
main_doc_id = doc_content.get("docket_id")
case SEARCH_TYPES.ORAL_ARGUMENT:
main_doc_id = doc_content.get("id")
case _:
# Alert type not supported.
continue
grouped_hits[alert][main_doc_id].append(doc_content)
alerts_to_update.add(alert.pk)

# Merge child documents with the same main_doc_id if the document dict
# contains the child_docs key.
merged_hits: DefaultDict[Alert, list[dict[str, Any]]] = defaultdict(
list
)
for alert, document_groups in grouped_hits.items():
for documents in document_groups.values():
merged_hits[alert].append(
merge_alert_child_documents(documents)
)

hits = []
for alert, results in alerts.items():
for alert, documents in merged_hits.items():
search_type = alert.alert_type
documents = []
for result in results:
documents.append(json_date_parser(result.document_content))

alerts_to_update.append(alert.pk)

# Override order_by to show the latest items when clicking the
# "View Full Results" button.
cut_off_date = get_cut_off_date(rate, now_time.date())
qd = override_alert_query(alert, cut_off_date)
alert.query_run = qd.urlencode() # type: ignore
hits.append(
(
alert,
search_type,
documents,
len(documents),
)
)
hits.append((alert, search_type, documents, len(documents)))

if hits:
send_search_alert_emails.delay(
[(user_id, hits)], scheduled_alert=True
)
alerts_sent_count += 1

# Update Alert's date_last_hit in bulk.
Alert.objects.filter(id__in=alerts_to_update).update(
date_last_hit=now_time
)
# Update Alert's date_last_hit in bulk for this user's alerts
Alert.objects.filter(id__in=alerts_to_update).update(
date_last_hit=now_time
)

# Update Scheduled alert hits status to "SENT".
scheduled_hits_rate.update(hit_status=SCHEDULED_ALERT_HIT_STATUS.SENT)
# Update Scheduled alert hits status to "SENT" for this user
scheduled_hits.update(hit_status=SCHEDULED_ALERT_HIT_STATUS.SENT)

# Remove old Scheduled alert hits sent, daily.
if rate == Alert.DAILY:
@@ -124,16 +170,12 @@ def query_and_send_alerts_by_rate(rate: str) -> None:


def send_scheduled_alerts(rate: str) -> None:
if rate == Alert.DAILY:
query_and_send_alerts_by_rate(Alert.DAILY)
elif rate == Alert.WEEKLY:
query_and_send_alerts_by_rate(Alert.WEEKLY)
elif rate == Alert.MONTHLY:
if rate == Alert.MONTHLY:
if datetime.date.today().day > 28:
raise InvalidDateError(
"Monthly alerts cannot be run on the 29th, 30th or 31st."
)
query_and_send_alerts_by_rate(Alert.MONTHLY)
query_and_send_alerts_by_rate(rate)
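
Reconstructed from query_and_send_alerts_by_rate above, the per-user payload handed to the email task has this shape (variable names illustrative):

# One (user_id, hits) pair per user; each hit is an
# (alert, search_type, merged_documents, doc_count) tuple.
payload = [(user_id, [(alert, alert.alert_type, documents, len(documents))])]
send_search_alert_emails.delay(payload, scheduled_alert=True)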


def delete_old_scheduled_alerts() -> int: