Date: Wed, 3 Jul 2024 19:27:29 -0600
Subject: [PATCH 17/33] fix(alerts): Fix should_docket_hit_be_included date
comparison
- Fixed email templates
- Refactored retrieve_task_info
---
.../commands/cl_send_recap_alerts.py | 98 ++++++++++++-------
cl/alerts/tasks.py | 2 +
cl/alerts/templates/alert_email_es.html | 5 +-
cl/alerts/templates/alert_email_es.txt | 4 +-
cl/alerts/tests/tests_recap_alerts.py | 2 +-
5 files changed, 71 insertions(+), 40 deletions(-)
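Note on the fix: docket.date_modified is stored in UTC, so calling .date() on it
yields the UTC calendar day, while the sweep's query date is a local (PDT)
midnight. Near midnight the two can disagree, so the comparison now localizes
date_modified via dt_as_local_date and compares it against the
"alert_sweep:query_date" stored in Redis (rather than timezone.now(), which
would drift if the command is resumed on a later day). A minimal, illustrative
sketch of the pitfall using only the stdlib (not the project's helpers):

    import datetime
    from zoneinfo import ZoneInfo

    # A docket modified at 03:00 UTC on July 4th...
    date_modified = datetime.datetime(2024, 7, 4, 3, 0, tzinfo=ZoneInfo("UTC"))

    # ...is still July 3rd in the local (PDT) zone the sweep runs in.
    utc_date = date_modified.date()
    local_date = date_modified.astimezone(ZoneInfo("America/Los_Angeles")).date()

    assert utc_date == datetime.date(2024, 7, 4)
    assert local_date == datetime.date(2024, 7, 3)
    # Comparing utc_date against a local query date would misclassify this
    # docket; both sides of the comparison must use the same local zone.
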
diff --git a/cl/alerts/management/commands/cl_send_recap_alerts.py b/cl/alerts/management/commands/cl_send_recap_alerts.py
index ce5ebc0f1f..e9ec48e2d8 100644
--- a/cl/alerts/management/commands/cl_send_recap_alerts.py
+++ b/cl/alerts/management/commands/cl_send_recap_alerts.py
@@ -27,6 +27,7 @@
from cl.api.models import WebhookEventType
from cl.api.tasks import send_es_search_alert_webhook
from cl.lib.command_utils import VerboseCommand, logger
+from cl.lib.date_time import dt_as_local_date
from cl.lib.elasticsearch_utils import do_es_sweep_alert_query
from cl.lib.redis_utils import get_redis_interface
from cl.search.documents import DocketDocument, RECAPSweepDocument
@@ -76,12 +77,10 @@ def compute_estimated_remaining_time(
return initial_wait
start_time = datetime.datetime.fromtimestamp(start_time_millis / 1000.0)
+ time_now = datetime.datetime.now()
estimated_time_remaining = max(
datetime.timedelta(
- seconds=(
- (datetime.datetime.now() - start_time).total_seconds()
- / created
- )
+ seconds=((time_now - start_time).total_seconds() / created)
* (total - created)
).total_seconds(),
initial_wait,
@@ -90,6 +89,31 @@ def compute_estimated_remaining_time(
return estimated_time_remaining
+def retrieve_task_info(task_info: dict[str, Any]) -> dict[str, Any]:
+ """Retrieve task information from the given task dict.
+
+ :param task_info: A dictionary containing the task status information.
+ :return: A dictionary with the task completion status, created documents
+ count, total documents count, and the task start time in milliseconds.
+ Default values are returned if task_info is not valid.
+ """
+
+ if task_info:
+ status = task_info["task"]["status"]
+ return {
+ "completed": task_info["completed"],
+ "created": status["created"],
+ "total": status["total"],
+ "start_time_millis": task_info["task"]["start_time_in_millis"],
+ }
+ return {
+ "completed": False,
+ "created": 0,
+ "total": 0,
+ "start_time_millis": None,
+ }
+
+
def index_daily_recap_documents(
r: Redis,
source_index_name: str,
@@ -110,6 +134,9 @@ def index_daily_recap_documents(
"""
if r.exists("alert_sweep:re_index_completed"):
+ logger.info(
+ "The re-index task has been completed and will be omitted."
+ )
# The re-indexing has been completed for the day. Abort it and proceed
# with sending alerts.
return 0
@@ -127,6 +154,7 @@ def index_daily_recap_documents(
# If "alert_sweep:query_date" already exists get it from Redis.
local_midnight_str: str = str(r.get("alert_sweep:query_date"))
local_midnight = datetime.datetime.fromisoformat(local_midnight_str)
+ logger.info(f"Resuming re-indexing process for date: {local_midnight}")
es = connections.get_connection()
# Convert the local (PDT) midnight time to UTC
@@ -222,58 +250,50 @@ def index_daily_recap_documents(
# Store the task ID in Redis
task_id = response["task"]
r.set("alert_sweep:task_id", task_id)
+ logger.info(f"Re-indexing task scheduled ID: {task_id}")
else:
task_id = r.get("alert_sweep:task_id")
+ logger.info(f"Resuming re-index task ID: {task_id}")
initial_wait = 0.01 if testing else 60.0
time.sleep(initial_wait)
- task_info = get_task_status(task_id, es)
- if task_info:
- status = task_info["task"]["status"]
- created = status["created"]
- total = status["total"]
- start_time_millis = task_info["task"]["start_time_in_millis"]
- else:
- task_info["completed"] = False
- created = 0
- total = 0
- start_time_millis = None
-
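+ # Normalize the task status into a dict with safe defaults so the loop
+ # below can rely on its keys even when the task info is unavailable.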
+ get_task_info = retrieve_task_info(get_task_status(task_id, es))
iterations_count = 0
estimated_time_remaining = compute_estimated_remaining_time(
- initial_wait, start_time_millis, created, total
+ initial_wait,
+ get_task_info["start_time_millis"],
+ get_task_info["created"],
+ get_task_info["total"],
)
- while not task_info["completed"]:
+ while not get_task_info["completed"]:
logger.info(
- f"Task progress: {created}/{total} documents. Estimated time to"
- f" finish: {estimated_time_remaining}."
+ f"Task progress: {get_task_info['created']}/{get_task_info['total']} documents. "
+ f"Estimated time to finish: {estimated_time_remaining} seconds."
)
task_info = get_task_status(task_id, es)
+ get_task_info = retrieve_task_info(task_info)
time.sleep(estimated_time_remaining)
- if task_info and not task_info["completed"]:
- status = task_info["task"]["status"]
- start_time_millis = task_info["task"]["start_time_in_millis"]
- created = status["created"]
- total = status["total"]
- if total and created:
- estimated_time_remaining = compute_estimated_remaining_time(
- initial_wait, start_time_millis, created, total
- )
+ if task_info and not get_task_info["completed"]:
+ estimated_time_remaining = compute_estimated_remaining_time(
+ initial_wait,
+ get_task_info["start_time_millis"],
+ get_task_info["created"],
+ get_task_info["total"],
+ )
if not task_info:
iterations_count += 1
if iterations_count > 10:
logger.error(
"Re_index alert sweep index task has failed: %s/%s",
- created,
- total,
+ get_task_info["created"],
+ get_task_info["total"],
)
break
- r.delete("alert_sweep:query_date")
r.delete("alert_sweep:task_id")
if not testing:
r.set("alert_sweep:re_index_completed", 1, ex=3600 * 12)
- return total
+ return get_task_info["total"]
def should_docket_hit_be_included(
@@ -290,9 +310,16 @@ def should_docket_hit_be_included(
docket = Docket.objects.filter(id=docket_id).only("date_modified").first()
if not docket:
return False
- date_modified = docket.date_modified.date()
if not has_document_alert_hit_been_triggered(r, alert_id, "d", docket_id):
- if date_modified == timezone.now().date():
+ local_midnight_localized = timezone.localtime(
+ timezone.make_aware(
+ datetime.datetime.fromisoformat(
+ str(r.get("alert_sweep:query_date"))
+ )
+ )
+ )
+ date_modified_localized = dt_as_local_date(docket.date_modified)
+ if date_modified_localized == local_midnight_localized.date():
return True
return False
@@ -541,3 +568,4 @@ def handle(self, *args, **options):
query_and_schedule_alerts(r, Alert.WEEKLY)
query_and_schedule_alerts(r, Alert.MONTHLY)
r.delete("alert_sweep:re_index_completed")
+ r.delete("alert_sweep:query_date")
diff --git a/cl/alerts/tasks.py b/cl/alerts/tasks.py
index 885ac9b413..f7b004bc54 100644
--- a/cl/alerts/tasks.py
+++ b/cl/alerts/tasks.py
@@ -475,6 +475,8 @@ def send_search_alert_emails(
:param email_alerts_to_send: A list of two tuples containing the user to
whom the alerts should be sent. A list of tuples containing the Search
Alert, (Alert, search type, documents, and number of documents)
+ :param scheduled_alert: A boolean indicating whether this alert has been
+ scheduled.
:return: None
"""
diff --git a/cl/alerts/templates/alert_email_es.html b/cl/alerts/templates/alert_email_es.html
index 2b15d540bd..dc2f797268 100644
--- a/cl/alerts/templates/alert_email_es.html
+++ b/cl/alerts/templates/alert_email_es.html
@@ -36,7 +36,7 @@
diff --git a/cl/alerts/templates/alert_email_es.txt b/cl/alerts/templates/alert_email_es.txt
index b836b10caa..2b7ec3b569 100644
--- a/cl/alerts/templates/alert_email_es.txt
+++ b/cl/alerts/templates/alert_email_es.txt
@@ -15,7 +15,7 @@ We have news regarding your alerts at CourtListener.com
View Full Results / Edit this Alert: https://www.courtlistener.com/?{{ alert.query_run|safe }}&edit_alert={{ alert.pk }}
Disable this Alert (one click): https://www.courtlistener.com{% url "disable_alert" alert.secret_key %}{% endif %}
-{{forloop.counter}}. {{ result.caseName|render_string_or_list|safe|striptags }} ({% if result.court_id != 'scotus' %}{{ result.court_citation_string|render_string_or_list|striptags }} {% endif %}{% if type == 'o' %}{{ result.dateFiled|date:"Y" }}{% elif type == 'oa' %}{{ result.dateArgued|date:"Y" }}{% endif %})
+{{forloop.counter}}. {{ result.caseName|render_string_or_list|safe|striptags }} ({% if result.court_id != 'scotus' %}{{ result.court_citation_string|render_string_or_list|striptags }} {% endif %}{% if type == 'o' or type == 'r' %}{{ result.dateFiled|date:"Y" }}{% elif type == 'oa' %}{{ result.dateArgued|date:"Y" }}{% endif %})
{% if type == 'oa' %}{% if result.dateArgued %}Date Argued: {{ result.dateArgued|date:"F jS, Y" }}{% else %}Date Argued: Unknown Date {% endif %}{% if result.docketNumber %} | Docket Number: {{ result.docketNumber|render_string_or_list|safe|striptags }}{% endif %} | Duration: {{ result.duration|naturalduration }}{% if result.judge %} | Judge: {{ result.judge|render_string_or_list|safe|striptags|underscore_to_space }}{% endif %}{% endif %}
{% if type == 'o' or type == 'oa' %}{% if result|get_highlight:"text" %}...{{ result|get_highlight:"text"|safe|striptags|underscore_to_space|compress_whitespace }}...{% endif %}{% endif %}
{% if type == 'r' %}{% if result.dateFiled %}Date Filed: {{ result.dateFiled|date:"F jS, Y" }}{% else %}Date Filed: Unknown Date {% endif %}{% if result.docketNumber %} | Docket Number: {{ result.docketNumber|render_string_or_list|safe|striptags }}{% endif %}
@@ -24,7 +24,7 @@ Disable this Alert (one click): https://www.courtlistener.com{% url "disable_ale
{% if doc.plain_text %}{% contains_highlights doc.plain_text.0 True as highlighted %}{% if highlighted %}...{% endif %}{{ doc.plain_text|render_string_or_list|safe|striptags|underscore_to_space }}...{% endif %}
View this document on our site: https://www.courtlistener.com{% if doc.absolute_url %}{{ doc.absolute_url }}{% else %}{{ result.docket_absolute_url }}#minute-entry-{{ doc.docket_entry_id }}{% endif %}
{% endwith %}{% endfor %}
-{% if result.child_remaining %}{% extract_q_value alert.query_run as q_value %}View Additional Results for this Case: https://www.courtlistener.com/?type={{ type|urlencode }}&q={% if q_value %}({{ q_value|urlencode }})%20AND%20{% endif %}docket_id%3A{{ result.docket_id|urlencode }}{% endif %}
+{% if result.child_docs and result.child_remaining %}{% extract_q_value alert.query_run as q_value %}View Additional Results for this Case: https://www.courtlistener.com/?type={{ type|urlencode }}&q={% if q_value %}({{ q_value|urlencode }})%20AND%20{% endif %}docket_id%3A{{ result.docket_id|urlencode }}{% endif %}
{% endif %}~~~~~
- View this item on our site: https://www.courtlistener.com{% if type == 'r' %}{{result.docket_absolute_url}}{% else %}{{result.absolute_url}}{% endif %}
{% if result.download_url %} - Download original from the court: {{result.download_url}}
diff --git a/cl/alerts/tests/tests_recap_alerts.py b/cl/alerts/tests/tests_recap_alerts.py
index 22f647b7d9..cf9ea6b145 100644
--- a/cl/alerts/tests/tests_recap_alerts.py
+++ b/cl/alerts/tests/tests_recap_alerts.py
@@ -1333,7 +1333,7 @@ def test_alert_frequency_estimation(self):
# RECAPDocument filed today that belongs to a docket filed outside
# the estimation range.
- date_outside_range = now() - datetime.timedelta(days=101)
+ date_outside_range = now() - datetime.timedelta(days=102)
alert_de = DocketEntryWithParentsFactory(
docket=DocketFactory(
court=self.court,
From 5077e01c81318ecf6bb726fc256161f54a636614 Mon Sep 17 00:00:00 2001
From: Alberto Islas
Date: Tue, 9 Jul 2024 20:02:46 -0600
Subject: [PATCH 18/33] fix(alerts): Changed approach to filter out
cross-object hits by using extra Docket-only and RD-only queries.
---
.../commands/cl_send_recap_alerts.py | 279 +++++++++++++-----
.../commands/clean_up_search_alerts.py | 2 +-
cl/alerts/tests/tests_recap_alerts.py | 274 +++++++++++++----
cl/alerts/utils.py | 21 --
cl/lib/elasticsearch_utils.py | 176 ++++++-----
cl/search/documents.py | 21 +-
cl/search/tests/tests_es_oral_arguments.py | 2 +-
cl/search/tests/tests_es_person.py | 4 +-
8 files changed, 547 insertions(+), 232 deletions(-)
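Note on the approach: instead of inspecting the query string for RECAPDocument
fields (the removed query_includes_rd_field helper), the sweep now executes the
alert query three ways in one multi-search request: the main cross-object
query, a Docket-only variant, and a RECAPDocument-only variant against a new
recap_document_sweep index. The IDs returned by the two extra queries tell us
which side of the query each main hit actually matched. A simplified,
illustrative sketch of that classification (the real code also checks
highlights and previously triggered hits):

    def classify_sweep_hits(main_hits, docket_only_hits, rd_only_hits):
        """Yield (docket_id, matched_rds) pairs describing each alert hit."""
        docket_ids = {int(h["docket_id"]) for h in docket_only_hits}
        rd_ids = {int(h["id"]) for h in rd_only_hits}
        for hit in main_hits:
            matched_rds = [rd for rd in hit["child_docs"] if rd["id"] in rd_ids]
            if hit["docket_id"] in docket_ids and matched_rds:
                yield hit["docket_id"], matched_rds  # cross-object hit
            elif hit["docket_id"] in docket_ids:
                yield hit["docket_id"], []  # Docket-only hit
            elif matched_rds:
                yield hit["docket_id"], matched_rds  # RECAPDocument-only hit
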
diff --git a/cl/alerts/management/commands/cl_send_recap_alerts.py b/cl/alerts/management/commands/cl_send_recap_alerts.py
index e9ec48e2d8..763651c985 100644
--- a/cl/alerts/management/commands/cl_send_recap_alerts.py
+++ b/cl/alerts/management/commands/cl_send_recap_alerts.py
@@ -12,7 +12,8 @@
from elasticsearch import Elasticsearch
from elasticsearch.exceptions import RequestError, TransportError
from elasticsearch_dsl import connections
-from elasticsearch_dsl.response import Hit
+from elasticsearch_dsl.response import Hit, Response
+from elasticsearch_dsl.utils import AttrList
from redis import Redis
from cl.alerts.models import Alert, ScheduledAlertHit
@@ -21,7 +22,6 @@
add_document_hit_to_alert_set,
alert_hits_limit_reached,
has_document_alert_hit_been_triggered,
- query_includes_rd_field,
recap_document_hl_matched,
)
from cl.api.models import WebhookEventType
@@ -30,7 +30,11 @@
from cl.lib.date_time import dt_as_local_date
from cl.lib.elasticsearch_utils import do_es_sweep_alert_query
from cl.lib.redis_utils import get_redis_interface
-from cl.search.documents import DocketDocument, RECAPSweepDocument
+from cl.search.documents import (
+ DocketDocument,
+ ESRECAPSweepDocument,
+ RECAPSweepDocument,
+)
from cl.search.exception import (
BadProximityQuery,
UnbalancedParenthesesQuery,
@@ -117,8 +121,9 @@ def retrieve_task_info(task_info: dict[str, Any]) -> dict[str, Any]:
def index_daily_recap_documents(
r: Redis,
source_index_name: str,
- target_index: Type[RECAPSweepDocument],
+ target_index: Type[RECAPSweepDocument] | Type[ESRECAPSweepDocument],
testing: bool = False,
+ only_rd: bool = False,
) -> int:
"""Index Dockets added/modified during the day and all their RECAPDocuments
and RECAPDocuments added/modified during the day and their parent Dockets.
@@ -130,6 +135,8 @@ def index_daily_recap_documents(
:param target_index: The target Elasticsearch index to which documents will
be re-indexed.
:param testing: Boolean flag for testing mode.
+ :param only_rd: Whether to reindex only RECAPDocuments into the
+ ESRECAPSweepDocument index.
:return: The total number of documents re-indexed.
"""
@@ -166,14 +173,31 @@ def index_daily_recap_documents(
today_datetime_iso = local_midnight_utc.isoformat().replace("+00:00", "Z")
next_day_utc_iso = next_day_utc.isoformat().replace("+00:00", "Z")
# Re Index API query.
- query = {
- "bool": {
- "should": [
- # Dockets added/modified today
- {
- "bool": {
- "must": [
- {
+ query = (
+ {
+ "bool": {
+ "should": [
+ # Dockets added/modified today
+ {
+ "bool": {
+ "must": [
+ {
+ "range": {
+ "timestamp": {
+ "gte": today_datetime_iso,
+ "lt": next_day_utc_iso,
+ }
+ }
+ },
+ {"term": {"docket_child": "docket"}},
+ ]
+ }
+ },
+ # RECAPDocuments with parents added/modified today
+ {
+ "has_parent": {
+ "parent_type": "docket",
+ "query": {
"range": {
"timestamp": {
"gte": today_datetime_iso,
@@ -181,29 +205,29 @@ def index_daily_recap_documents(
}
}
},
- {"term": {"docket_child": "docket"}},
- ]
- }
- },
- # RECAPDocuments with parents added/modified today
- {
- "has_parent": {
- "parent_type": "docket",
- "query": {
- "range": {
- "timestamp": {
- "gte": today_datetime_iso,
- "lt": next_day_utc_iso,
- }
- }
- },
- }
- },
- # RECAPDocuments added/modified today
- {
- "bool": {
- "must": [
- {
+ }
+ },
+ # RECAPDocuments added/modified today
+ {
+ "bool": {
+ "must": [
+ {
+ "range": {
+ "timestamp": {
+ "gte": today_datetime_iso,
+ "lt": next_day_utc_iso,
+ }
+ }
+ },
+ {"term": {"docket_child": "recap_document"}},
+ ]
+ }
+ },
+ # Dockets that are parents of RECAPDocuments added/modified today
+ {
+ "has_child": {
+ "type": "recap_document",
+ "query": {
"range": {
"timestamp": {
"gte": today_datetime_iso,
@@ -211,27 +235,49 @@ def index_daily_recap_documents(
}
}
},
- {"term": {"docket_child": "recap_document"}},
- ]
- }
- },
- # Dockets that are parents of RECAPDocuments added/modified today
- {
- "has_child": {
- "type": "recap_document",
- "query": {
- "range": {
- "timestamp": {
- "gte": today_datetime_iso,
- "lt": next_day_utc_iso,
+ }
+ },
+ ]
+ }
+ }
+ if not only_rd
+ else {
+ "bool": {
+ "should": [
+ # RECAPDocuments with parents added/modified today
+ {
+ "has_parent": {
+ "parent_type": "docket",
+ "query": {
+ "range": {
+ "timestamp": {
+ "gte": today_datetime_iso,
+ "lt": next_day_utc_iso,
+ }
}
- }
- },
- }
- },
- ]
+ },
+ }
+ },
+ # RECAPDocuments added/modified today
+ {
+ "bool": {
+ "must": [
+ {
+ "range": {
+ "timestamp": {
+ "gte": today_datetime_iso,
+ "lt": next_day_utc_iso,
+ }
+ }
+ },
+ {"term": {"docket_child": "recap_document"}},
+ ]
+ }
+ },
+ ]
+ }
}
- }
+ )
if not r.exists("alert_sweep:task_id"):
# Remove the index from the previous day and create a new one.
@@ -241,12 +287,39 @@ def index_daily_recap_documents(
# In case of a failure, store the task_id in Redis so the command
# can be resumed.
- response = es.reindex(
- source={"index": source_index_name, "query": query},
- dest={"index": target_index_name},
- wait_for_completion=False,
- refresh=True,
- )
+ params = {
+ "source": {"index": source_index_name, "query": query},
+ "dest": {"index": target_index_name},
+ "wait_for_completion": False,
+ "refresh": True,
+ }
+ if only_rd:
+ # Re-index only RECAPDocument fields into the ESRECAPSweepDocument
+ # index
+ params["script"] = {
+ "source": """
+ def fields = [
+ 'id',
+ 'docket_entry_id',
+ 'description',
+ 'entry_number',
+ 'entry_date_filed',
+ 'short_description',
+ 'document_type',
+ 'document_number',
+ 'pacer_doc_id',
+ 'plain_text',
+ 'attachment_number',
+ 'is_available',
+ 'page_count',
+ 'filepath_local',
+ 'absolute_url',
+ 'cites'
+ ];
+ ctx._source.keySet().retainAll(fields);
+ """
+ }
+ response = es.reindex(**params)
# Store the task ID in Redis
task_id = response["task"]
r.set("alert_sweep:task_id", task_id)
@@ -291,8 +364,6 @@ def index_daily_recap_documents(
break
r.delete("alert_sweep:task_id")
- if not testing:
- r.set("alert_sweep:re_index_completed", 1, ex=3600 * 12)
return get_task_info["total"]
@@ -324,12 +395,20 @@ def should_docket_hit_be_included(
return False
-def filter_rd_alert_hits(r: Redis, alert_id: int, rd_hits, check_rd_hl=False):
+def filter_rd_alert_hits(
+ r: Redis,
+ alert_id: int,
+ rd_hits: AttrList,
+ rd_ids: list[int],
+ check_rd_hl=False,
+):
"""Filter RECAP document hits based on specified conditions.
:param r: The Redis interface.
:param alert_id: The ID of the alert.
- :param rd_hits: A list of RECAP document hits to be processed.
+ :param rd_hits: A list of RECAPDocument hits to be processed.
+ :param rd_ids: A list of RECAPDocument IDs that matched the
+ RECAPDocument-only query.
:param check_rd_hl: A boolean indicating whether to check if the RECAP
document hit matched RD HLs.
:return: A list of RECAP document hits that meet all specified conditions.
@@ -343,7 +422,10 @@ def filter_rd_alert_hits(r: Redis, alert_id: int, rd_hits, check_rd_hl=False):
)
]
if check_rd_hl:
- conditions.append(recap_document_hl_matched(rd_hit))
+ if not recap_document_hl_matched(rd_hit):
+ # If the RECAPDocument hit didn't match any HL, check whether it
+ # should be included because it matched the RECAPDocument-only query.
+ conditions.append(rd_hit["_source"]["id"] in rd_ids)
if all(conditions):
rds_to_send.append(rd_hit)
add_document_hit_to_alert_set(
@@ -354,11 +436,13 @@ def filter_rd_alert_hits(r: Redis, alert_id: int, rd_hits, check_rd_hl=False):
def query_alerts(
search_params: QueryDict,
-) -> tuple[list[Hit] | None, int | None]:
+) -> tuple[list[Hit] | None, Response | None, Response | None]:
try:
search_query = RECAPSweepDocument.search()
+ child_search_query = ESRECAPSweepDocument.search()
return do_es_sweep_alert_query(
search_query,
+ child_search_query,
search_params,
)
except (
@@ -371,35 +455,52 @@ def query_alerts(
):
traceback.print_exc()
logger.info(f"Search for this alert failed: {search_params}\n")
- return None, None
+ return None, None, None
def process_alert_hits(
- r: Redis, results: list[Hit], search_params: QueryDict, alert_id: int
+ r: Redis,
+ results: list[Hit],
+ parent_results: Response | None,
+ child_results: Response | None,
+ alert_id: int,
) -> list[Hit]:
"""Process alert hits by filtering and prepare the results to send based
on alert conditions.
:param r: The Redis instance.
:param results: A list of Hit objects containing search results.
- :param search_params: Query parameters used for the search.
+ :param parent_results: The ES Response for the docket-only query.
+ :param child_results: The ES Response for the RECAPDocument-only query.
:param alert_id: The ID of the alert being processed.
:return: A list of Hit objects that are filtered and prepared to be sent.
"""
- includes_rd_fields = query_includes_rd_field(search_params)
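+ # IDs matched by the Docket-only and RECAPDocument-only queries, used
+ # below to tell which side of a cross-object query each hit matched.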
+ docket_hits = parent_results.hits if parent_results else []
+ docket_ids = [int(d.docket_id) for d in docket_hits]
+
+ rd_hits = child_results.hits if child_results else []
+ rd_ids = [int(r.id) for r in rd_hits]
results_to_send = []
if len(results) > 0:
for hit in results:
- if not includes_rd_fields:
+ if hit.docket_id in docket_ids:
# Possible Docket-only alert
rds_to_send = filter_rd_alert_hits(
- r, alert_id, hit["child_docs"], check_rd_hl=True
+ r, alert_id, hit["child_docs"], rd_ids, check_rd_hl=True
)
if rds_to_send:
# Cross-object query
hit["child_docs"] = rds_to_send
results_to_send.append(hit)
+ if should_docket_hit_be_included(
+ r, alert_id, hit.docket_id
+ ):
+ add_document_hit_to_alert_set(
+ r, alert_id, "d", hit.docket_id
+ )
+
+ # Docket-only alert
elif should_docket_hit_be_included(r, alert_id, hit.docket_id):
# Docket-only alert
hit["child_docs"] = []
@@ -407,10 +508,11 @@ def process_alert_hits(
add_document_hit_to_alert_set(
r, alert_id, "d", hit.docket_id
)
+
else:
# RECAP-only alerts or cross-object alerts
rds_to_send = filter_rd_alert_hits(
- r, alert_id, hit["child_docs"]
+ r, alert_id, hit["child_docs"], rd_ids
)
if rds_to_send:
# Cross-object alert
@@ -456,13 +558,19 @@ def query_and_send_alerts(r: Redis, rate: str) -> None:
alerts_to_update = []
for alert in alerts:
search_params = QueryDict(alert.query.encode(), mutable=True)
- results, _ = query_alerts(search_params)
+ results, parent_results, child_results = query_alerts(
+ search_params
+ )
if not results:
continue
alerts_to_update.append(alert.pk)
search_type = search_params.get("type", SEARCH_TYPES.RECAP)
results_to_send = process_alert_hits(
- r, results, search_params, alert.pk
+ r,
+ results,
+ parent_results,
+ child_results,
+ alert.pk,
)
if results_to_send:
hits.append(
@@ -500,11 +608,18 @@ def query_and_schedule_alerts(r: Redis, rate: str):
scheduled_hits_to_create = []
for alert in alerts:
search_params = QueryDict(alert.query.encode(), mutable=True)
- results, _ = query_alerts(search_params)
+ results, parent_results, child_results = query_alerts(
+ search_params
+ )
if not results:
continue
+
results_to_send = process_alert_hits(
- r, results, search_params, alert.pk
+ r,
+ results,
+ parent_results,
+ child_results,
+ alert.pk,
)
if results_to_send:
for hit in results_to_send:
@@ -563,6 +678,16 @@ def handle(self, *args, **options):
RECAPSweepDocument,
testing=testing_mode,
)
+ index_daily_recap_documents(
+ r,
+ DocketDocument._index._name,
+ ESRECAPSweepDocument,
+ testing=testing_mode,
+ only_rd=True,
+ )
+ if not testing_mode:
+ r.set("alert_sweep:re_index_completed", 1, ex=3600 * 12)
+
query_and_send_alerts(r, Alert.REAL_TIME)
query_and_send_alerts(r, Alert.DAILY)
query_and_schedule_alerts(r, Alert.WEEKLY)
diff --git a/cl/alerts/management/commands/clean_up_search_alerts.py b/cl/alerts/management/commands/clean_up_search_alerts.py
index b00d7128a3..cf1ceb2f54 100644
--- a/cl/alerts/management/commands/clean_up_search_alerts.py
+++ b/cl/alerts/management/commands/clean_up_search_alerts.py
@@ -75,7 +75,7 @@ def validate_queries_syntax(options: OptionsType) -> None:
if search_form.is_valid():
cd = search_form.cleaned_data
try:
- s, _ = build_es_base_query(search_query, cd)
+ s, _, _ = build_es_base_query(search_query, cd)
s = s.extra(size=0)
s.execute().to_dict()
# Waiting between requests to avoid hammering ES too quickly.
diff --git a/cl/alerts/tests/tests_recap_alerts.py b/cl/alerts/tests/tests_recap_alerts.py
index cf9ea6b145..1fd573c8fb 100644
--- a/cl/alerts/tests/tests_recap_alerts.py
+++ b/cl/alerts/tests/tests_recap_alerts.py
@@ -17,7 +17,7 @@
index_daily_recap_documents,
)
from cl.alerts.models import SEARCH_TYPES, Alert, ScheduledAlertHit
-from cl.alerts.utils import query_includes_rd_field, recap_document_hl_matched
+from cl.alerts.utils import recap_document_hl_matched
from cl.api.factories import WebhookFactory
from cl.api.models import WebhookEvent, WebhookEventType
from cl.donate.models import NeonMembership
@@ -78,6 +78,7 @@ def setUp(self):
self.r = get_redis_interface("CACHE")
self.r.delete("alert_sweep:query_date")
self.r.delete("alert_sweep:task_id")
+ self.r.delete("alert_hits:")
@staticmethod
def get_html_content_from_email(email_content):
@@ -257,7 +258,10 @@ async def test_recap_document_hl_matched(self) -> None:
"q": '"401 Civil"',
}
search_query = RECAPSweepDocument.search()
- results, total_hits = await sync_to_async(do_es_sweep_alert_query)(
+ results, parent_results, _ = await sync_to_async(
+ do_es_sweep_alert_query
+ )(
+ search_query,
search_query,
search_params,
)
@@ -272,7 +276,10 @@ async def test_recap_document_hl_matched(self) -> None:
"q": '"Mauris iaculis, leo sit amet hendrerit vehicula"',
}
search_query = RECAPSweepDocument.search()
- results, total_hits = await sync_to_async(do_es_sweep_alert_query)(
+ results, parent_results, _ = await sync_to_async(
+ do_es_sweep_alert_query
+ )(
+ search_query,
search_query,
search_params,
)
@@ -287,7 +294,10 @@ async def test_recap_document_hl_matched(self) -> None:
"q": "SUBPOENAS SERVED OFF Mauris iaculis",
}
search_query = RECAPSweepDocument.search()
- results, total_hits = await sync_to_async(do_es_sweep_alert_query)(
+ results, parent_results, _ = await sync_to_async(
+ do_es_sweep_alert_query
+ )(
+ search_query,
search_query,
search_params,
)
@@ -296,58 +306,6 @@ async def test_recap_document_hl_matched(self) -> None:
rd_field_matched = recap_document_hl_matched(rd)
self.assertEqual(rd_field_matched, True)
- async def test_query_includes_rd_field(self) -> None:
- """Test query_includes_rd_field method that checks if a query
- includes any indexed fields in the query string or filters specific to
- RECAP Documents.
- """
-
- # Docket-only query
- search_params = {
- "type": SEARCH_TYPES.RECAP,
- "q": '"401 Civil"',
- }
- self.assertEqual(query_includes_rd_field(search_params), False)
-
- # RECAPDocument-only query
- search_params = {
- "type": SEARCH_TYPES.RECAP,
- "q": 'description:"lorem ipsum"',
- }
- self.assertEqual(query_includes_rd_field(search_params), True)
-
- # Cross-object query
- search_params = {
- "type": SEARCH_TYPES.RECAP,
- "q": 'case_name:"American v." description:"lorem ipsum"',
- }
- self.assertEqual(query_includes_rd_field(search_params), True)
-
- # Docket-only query
- search_params = {
- "type": SEARCH_TYPES.RECAP,
- "q": "",
- "case_name": "SUBPOENAS",
- }
- self.assertEqual(query_includes_rd_field(search_params), False)
-
- # RECAPDocument-only query
- search_params = {
- "type": SEARCH_TYPES.RECAP,
- "q": "",
- "description": "Lorem",
- }
- self.assertEqual(query_includes_rd_field(search_params), True)
-
- # Cross-object query
- search_params = {
- "type": SEARCH_TYPES.RECAP,
- "q": "",
- "case_name": "SUBPOENAS",
- "document_number": 1,
- }
- self.assertEqual(query_includes_rd_field(search_params), True)
-
def test_filter_recap_alerts_to_send(self) -> None:
"""Test filter RECAP alerts that met the conditions to be sent:
- RECAP type alert.
@@ -907,6 +865,210 @@ def test_filter_out_alerts_to_send_by_query_and_hits(self) -> None:
docket.delete()
+ def test_special_cross_object_alerts(self) -> None:
+ """This test confirms that hits are properly filtered out or included
+ in alerts for special cross-object alerts that can match a
+ Docket-only hit and/or a Docket plus RDs simultaneously in the same
+ hit. These cases include queries that use an OR clause combining
+ Docket and RD fields, or a text query that can match a Docket and an
+ RD field simultaneously.
+ """
+
+ # The following test confirms that an alert with a query that can match
+ # a Docket or RECAPDocuments simultaneously is properly filtered.
+ cross_object_alert_d_or_rd_field = AlertFactory(
+ user=self.user_profile.user,
+ rate=Alert.REAL_TIME,
+ name="Test Alert Cross-object query",
+ query=f"q=docket_id:{self.de.docket.pk} OR pacer_doc_id:{self.rd_2.pacer_doc_id}&type=r",
+ )
+ with mock.patch(
+ "cl.api.webhooks.requests.post",
+ side_effect=lambda *args, **kwargs: MockResponse(
+ 200, mock_raw=True
+ ),
+ ), time_machine.travel(self.mock_date, tick=False):
+ call_command("cl_send_recap_alerts", testing_mode=True)
+
+ # A new alert should be triggered containing a Docket-only hit and a
+ # Docket with the nested RD matched.
+ self.assertEqual(
+ len(mail.outbox), 1, msg="Outgoing emails don't match."
+ )
+ html_content = self.get_html_content_from_email(mail.outbox[0])
+ self._confirm_number_of_alerts(html_content, 1)
+ # This hit should display only the Docket matched by its ID;
+ # no RECAPDocument should be matched.
+ self._assert_child_hits_content(
+ html_content,
+ cross_object_alert_d_or_rd_field.name,
+ self.de.docket.case_name,
+ [],
+ )
+
+ # This hit should display the rd_2 nested below its parent docket.
+ self._assert_child_hits_content(
+ html_content,
+ cross_object_alert_d_or_rd_field.name,
+ self.de_1.docket.case_name,
+ [self.rd_2.description],
+ )
+
+ # Assert email text version:
+ txt_email = mail.outbox[0].body
+ self.assertIn(cross_object_alert_d_or_rd_field.name, txt_email)
+ self.assertIn(self.rd_2.description, txt_email)
+
+ # The following test confirms that a cross-object text query alert
+ # matches documents according to trigger conditions such as the indexed
+ # date and previous triggers by the same document.
+ two_days_before = self.mock_date - datetime.timedelta(days=2)
+ mock_two_days_before = two_days_before.replace(hour=5)
+ with time_machine.travel(mock_two_days_before, tick=False):
+ docket = DocketFactory(
+ court=self.court,
+ case_name="United States of America",
+ docket_number="1:21-bk-1009",
+ source=Docket.RECAP,
+ )
+
+ with time_machine.travel(
+ self.mock_date, tick=False
+ ), self.captureOnCommitCallbacks(execute=True):
+ alert_de = DocketEntryWithParentsFactory(
+ docket=docket,
+ entry_number=1,
+ date_filed=datetime.date(2024, 8, 19),
+ description="MOTION for Leave to File",
+ )
+ rd_3 = RECAPDocumentFactory(
+ docket_entry=alert_de,
+ description="Motion to File New",
+ document_number="2",
+ pacer_doc_id="018036652875",
+ plain_text="United states Lorem",
+ )
+
+ docket_2 = DocketFactory(
+ court=self.court,
+ case_name="United States of America vs Lorem",
+ docket_number="1:21-bk-1008",
+ source=Docket.RECAP,
+ )
+
+ cross_object_alert_text = AlertFactory(
+ user=self.user_profile.user,
+ rate=Alert.REAL_TIME,
+ name="Test Alert Cross-object query",
+ query='q="United states"&type=r',
+ )
+ with mock.patch(
+ "cl.api.webhooks.requests.post",
+ side_effect=lambda *args, **kwargs: MockResponse(
+ 200, mock_raw=True
+ ),
+ ), time_machine.travel(self.mock_date, tick=False):
+ call_command("cl_send_recap_alerts", testing_mode=True)
+
+ # A new alert should be triggered containing two hits: one matched by
+ # rd_3's plain text and another matched by docket_2's case_name.
+ self.assertEqual(
+ len(mail.outbox), 2, msg="Outgoing emails don't match."
+ )
+ html_content = self.get_html_content_from_email(mail.outbox[1])
+ # rd_3 should appear nested in this hit.
+ self._confirm_number_of_alerts(html_content, 1)
+ self._assert_child_hits_content(
+ html_content,
+ cross_object_alert_text.name,
+ docket.case_name,
+ [rd_3.description],
+ )
+ # The docket_2 hit shouldn't contain RDs.
+ self._assert_child_hits_content(
+ html_content,
+ cross_object_alert_text.name,
+ docket_2.case_name,
+ [],
+ )
+
+ # Modify the docket today:
+ with time_machine.travel(
+ self.mock_date, tick=False
+ ), self.captureOnCommitCallbacks(execute=True):
+ docket.cause = "405 Civil"
+ docket.save()
+
+ # Trigger the alert again:
+ with mock.patch(
+ "cl.api.webhooks.requests.post",
+ side_effect=lambda *args, **kwargs: MockResponse(
+ 200, mock_raw=True
+ ),
+ ), time_machine.travel(self.mock_date, tick=False):
+ call_command("cl_send_recap_alerts", testing_mode=True)
+
+ # A new alert should be triggered containing the docket as a hit with
+ # no nested RDs.
+ html_content = self.get_html_content_from_email(mail.outbox[2])
+ self.assertEqual(
+ len(mail.outbox), 3, msg="Outgoing emails don't match."
+ )
+ self._assert_child_hits_content(
+ html_content,
+ cross_object_alert_text.name,
+ docket.case_name,
+ [],
+ )
+
+ # Trigger alert again:
+ with mock.patch(
+ "cl.api.webhooks.requests.post",
+ side_effect=lambda *args, **kwargs: MockResponse(
+ 200, mock_raw=True
+ ),
+ ), time_machine.travel(self.mock_date, tick=False):
+ call_command("cl_send_recap_alerts", testing_mode=True)
+
+ # No new alerts should be triggered.
+ self.assertEqual(
+ len(mail.outbox), 3, msg="Outgoing emails don't match."
+ )
+
+ # This test confirms that we're able to trigger cross-object alerts
+ # that include an OR clause and match documents that belong to the
+ # same case.
+ cross_object_alert_d_or_rd_field_same_case = AlertFactory(
+ user=self.user_profile.user,
+ rate=Alert.REAL_TIME,
+ name="Test Alert Cross-object query",
+ query=f"q=docket_id:{self.de.docket.pk} OR pacer_doc_id:{self.rd.pacer_doc_id}&type=r",
+ )
+ with mock.patch(
+ "cl.api.webhooks.requests.post",
+ side_effect=lambda *args, **kwargs: MockResponse(
+ 200, mock_raw=True
+ ),
+ ), time_machine.travel(self.mock_date, tick=False):
+ call_command("cl_send_recap_alerts", testing_mode=True)
+
+ # A new alert should be triggered, containing the RD document nested below
+ # its parent docket.
+ html_content = self.get_html_content_from_email(mail.outbox[3])
+ self.assertEqual(
+ len(mail.outbox), 4, msg="Outgoing emails don't match."
+ )
+ self._confirm_number_of_alerts(html_content, 1)
+ self._assert_child_hits_content(
+ html_content,
+ cross_object_alert_d_or_rd_field_same_case.name,
+ self.de.docket.case_name,
+ [self.rd.description],
+ )
+
+ docket.delete()
+ docket_2.delete()
+
def test_limit_alert_case_child_hits(self) -> None:
"""Test limit case child hits up to 5 and display the "View additional
results for this Case" button.
diff --git a/cl/alerts/utils.py b/cl/alerts/utils.py
index 44277a04a2..6ad589a913 100644
--- a/cl/alerts/utils.py
+++ b/cl/alerts/utils.py
@@ -169,27 +169,6 @@ def recap_document_hl_matched(rd_hit: Hit) -> bool:
return False
-def query_includes_rd_field(query_params: CleanData) -> bool:
- """Determine whether the query includes any indexed fields in the query
- string or filters specific to RECAP Documents.
-
- :param query_params: The query parameters.
- :return: True if any recap document fields or filters are included in the
- query, otherwise False.
- """
-
- query_string = query_params.get("q", "")
- for rd_field in recap_document_indexed_fields:
- if f"{rd_field}:" in query_string:
- return True
-
- for rd_filter in recap_document_filters:
- if query_params.get(rd_filter, ""):
- return True
-
- return False
-
-
def make_alert_set_key(alert_id: int, document_type: str) -> str:
"""Generate a Redis key for storing alert hits.
diff --git a/cl/lib/elasticsearch_utils.py b/cl/lib/elasticsearch_utils.py
index 72c982deca..f22b61de47 100644
--- a/cl/lib/elasticsearch_utils.py
+++ b/cl/lib/elasticsearch_utils.py
@@ -1070,7 +1070,7 @@ def build_es_base_query(
child_highlighting: bool = True,
api_version: Literal["v3", "v4"] | None = None,
alerts: bool = False,
-) -> tuple[Search, QueryString | None]:
+) -> tuple[Search, QueryString | None, QueryString | None]:
"""Builds filters and fulltext_query based on the given cleaned
data and returns an elasticsearch query.
@@ -1079,14 +1079,15 @@ def build_es_base_query(
:param child_highlighting: Whether highlighting should be enabled in child docs.
:param api_version: Optional, the request API version.
:param alerts: If highlighting is being applied to search Alerts hits.
- :return: A two-tuple, the Elasticsearch search query object and an ES
- QueryString for child documents, or None if there is no need to query
- child documents.
+ :return: A three-tuple: the Elasticsearch search query object, an ES
+ QueryString for child documents (or None if there is no need to query
+ child documents), and a QueryString for parent documents (or None).
"""
main_query = None
string_query = None
- join_query = None
+ child_docs_query = None
+ parent_query = None
filters = []
plain_doc = False
match cd["type"]:
@@ -1131,12 +1132,14 @@ def build_es_base_query(
],
)
)
- main_query, join_query = build_full_join_es_queries(
- cd,
- child_query_fields,
- parent_query_fields,
- child_highlighting=child_highlighting,
- api_version=api_version,
+ main_query, child_docs_query, parent_query = (
+ build_full_join_es_queries(
+ cd,
+ child_query_fields,
+ parent_query_fields,
+ child_highlighting=child_highlighting,
+ api_version=api_version,
+ )
)
case (
@@ -1167,13 +1170,15 @@ def build_es_base_query(
],
)
)
- main_query, join_query = build_full_join_es_queries(
- cd,
- child_query_fields,
- parent_query_fields,
- child_highlighting=child_highlighting,
- api_version=api_version,
- alerts=alerts,
+ main_query, child_docs_query, parent_query = (
+ build_full_join_es_queries(
+ cd,
+ child_query_fields,
+ parent_query_fields,
+ child_highlighting=child_highlighting,
+ api_version=api_version,
+ alerts=alerts,
+ )
)
case SEARCH_TYPES.OPINION:
@@ -1207,13 +1212,15 @@ def build_es_base_query(
],
)
)
- main_query, join_query = build_full_join_es_queries(
- cd,
- child_query_fields,
- parent_query_fields,
- mlt_query,
- child_highlighting=child_highlighting,
- api_version=api_version,
+ main_query, child_docs_query, parent_query = (
+ build_full_join_es_queries(
+ cd,
+ child_query_fields,
+ parent_query_fields,
+ mlt_query,
+ child_highlighting=child_highlighting,
+ api_version=api_version,
+ )
)
if not any([filters, string_query, main_query]):
@@ -1222,7 +1229,7 @@ def build_es_base_query(
match_all_query = get_match_all_query(
cd, search_query, api_version, child_highlighting
)
- return match_all_query, join_query
+ return match_all_query, child_docs_query, parent_query
if plain_doc:
# Combine the filters and string query for plain documents like Oral
@@ -1231,7 +1238,7 @@ def build_es_base_query(
cd, filters, string_query, api_version
)
- return search_query.query(main_query), join_query
+ return search_query.query(main_query), child_docs_query, parent_query
def build_has_parent_parties_query(
@@ -1261,7 +1268,7 @@ def build_has_parent_parties_query(
def build_child_docs_query(
- join_query: QueryString | None,
+ child_docs_query: QueryString | None,
cd: CleanData,
exclude_docs_for_empty_field: str = "",
) -> QueryString:
@@ -1271,7 +1278,7 @@ def build_child_docs_query(
to retrieve child documents directly, such as in the Opinions Feed,
RECAP Feed, RECAP Documents count query, and V4 RECAP_DOCUMENT Search API.
- :param join_query: Existing Elasticsearch QueryString object or None
+ :param child_docs_query: Existing Elasticsearch QueryString object or None
:param cd: The user input CleanedData
:param exclude_docs_for_empty_field: Field that should not be empty for a
document to be included
@@ -1289,7 +1296,7 @@ def build_child_docs_query(
]
parties_has_parent_query = build_has_parent_parties_query(parties_filters)
- if not join_query:
+ if not child_docs_query:
# Match all query case.
if not exclude_docs_for_empty_field:
if cd["type"] == SEARCH_TYPES.OPINION:
@@ -1311,7 +1318,7 @@ def build_child_docs_query(
filters.append(child_query_recap)
return Q("bool", filter=filters)
- query_dict = join_query.to_dict()
+ query_dict = child_docs_query.to_dict()
if "filter" in query_dict["bool"]:
existing_filter = query_dict["bool"]["filter"]
if cd["type"] == SEARCH_TYPES.OPINION:
@@ -1373,7 +1380,7 @@ def get_facet_dict_for_search_query(
"""
cd["just_facets_query"] = True
- search_query, _ = build_es_base_query(search_query, cd)
+ search_query, _, _ = build_es_base_query(search_query, cd)
search_query.aggs.bucket("status", A("terms", field="status.raw"))
search_query = search_query.extra(size=0)
response = search_query.execute()
@@ -1395,7 +1402,7 @@ def build_es_main_query(
applicable.
"""
search_query_base = search_query
- search_query, join_query = build_es_base_query(search_query, cd)
+ search_query, child_docs_query, _ = build_es_base_query(search_query, cd)
top_hits_limit = 5
child_docs_count_query = None
match cd["type"]:
@@ -1413,7 +1420,9 @@ def build_es_main_query(
top_hits_limit,
)
case SEARCH_TYPES.RECAP | SEARCH_TYPES.DOCKETS:
- child_docs_count_query = build_child_docs_query(join_query, cd)
+ child_docs_count_query = build_child_docs_query(
+ child_docs_query, cd
+ )
if child_docs_count_query:
# Get the total RECAP Documents count.
child_docs_count_query = search_query_base.query(
@@ -2214,13 +2223,13 @@ def build_search_feed_query(
hl_field = "text"
if cd["type"] == SEARCH_TYPES.RECAP:
hl_field = "plain_text"
- s, join_query = build_es_base_query(search_query, cd)
+ s, child_docs_query, _ = build_es_base_query(search_query, cd)
if jurisdiction or cd["type"] == SEARCH_TYPES.RECAP:
# An Opinion Jurisdiction feed or RECAP Search displays child documents
# Eliminate items that lack the ordering field and apply highlighting
# to create a snippet for the plain_text or text fields.
s = build_child_docs_query(
- join_query,
+ child_docs_query,
cd=cd,
exclude_docs_for_empty_field=exclude_docs_for_empty_field,
)
@@ -2336,7 +2345,7 @@ def build_full_join_es_queries(
child_highlighting: bool = True,
api_version: Literal["v3", "v4"] | None = None,
alerts: bool = False,
-) -> tuple[QueryString | list, QueryString | None]:
+) -> tuple[QueryString | list, QueryString | None, QueryString | None]:
"""Build a complete Elasticsearch query with both parent and child document
conditions.
@@ -2347,7 +2356,8 @@ def build_full_join_es_queries(
:param child_highlighting: Whether highlighting should be enabled in child docs.
:param api_version: Optional, the request API version.
:param alerts: If highlighting is being applied to search Alerts hits.
- :return: An Elasticsearch QueryString object.
+ :return: A three-tuple: the main join query, the child documents query, and
+ the parent documents query.
"""
q_should = []
@@ -2363,7 +2373,8 @@ def build_full_join_es_queries(
case SEARCH_TYPES.PEOPLE:
child_type = "position"
- join_query = None
+ child_docs_query = None
+ parent_query = None
if cd["type"] in [
SEARCH_TYPES.RECAP,
SEARCH_TYPES.DOCKETS,
@@ -2419,18 +2430,18 @@ def build_full_join_es_queries(
case [], []:
pass
case [], _:
- join_query = Q(
+ child_docs_query = Q(
"bool",
should=child_text_query,
minimum_should_match=1,
)
case _, []:
- join_query = Q(
+ child_docs_query = Q(
"bool",
filter=child_filters,
)
case _, _:
- join_query = Q(
+ child_docs_query = Q(
"bool",
filter=child_filters,
should=child_text_query,
@@ -2446,7 +2457,7 @@ def build_full_join_es_queries(
(child_highlighting, cd["type"]), {}
)
has_child_query = build_has_child_query(
- join_query,
+ child_docs_query,
child_type,
query_hits_limit,
hl_fields,
@@ -2525,9 +2536,9 @@ def build_full_join_es_queries(
q_should.append(parent_query)
if not q_should:
- return [], join_query
+ return [], child_docs_query, parent_query
- final_query = apply_custom_score_to_main_query(
+ main_join_query = apply_custom_score_to_main_query(
cd,
Q(
"bool",
@@ -2535,10 +2546,7 @@ def build_full_join_es_queries(
),
api_version,
)
- return (
- final_query,
- join_query,
- )
+ return (main_join_query, child_docs_query, parent_query)
def limit_inner_hits(
@@ -2859,10 +2867,8 @@ def do_es_api_query(
child documents.
"""
- child_docs_query = None
-
try:
- s, join_query = build_es_base_query(
+ s, child_docs_query, _ = build_es_base_query(
search_query, cd, cd["highlight"], api_version
)
except (
@@ -2881,7 +2887,7 @@ def do_es_api_query(
# Note that in V3 Case Law Search, opinions are collapsed by cluster_id
# meaning that only one result per cluster is shown.
s = build_child_docs_query(
- join_query,
+ child_docs_query,
cd=cd,
)
main_query = search_query.query(s)
@@ -2917,7 +2923,7 @@ def do_es_api_query(
)
else:
child_docs_query = build_child_docs_query(
- join_query,
+ child_docs_query,
cd=cd,
)
# Build query params for the ES V4 Search API endpoints.
@@ -3032,7 +3038,7 @@ def do_es_alert_estimation_query(
days=int(day_count)
)
cd[before_field] = None
- estimation_query, _ = build_es_base_query(search_query, cd)
+ estimation_query, _, _ = build_es_base_query(search_query, cd)
if cd["type"] == SEARCH_TYPES.RECAP:
# The RECAP estimation query consists of two requests: one to estimate
@@ -3054,8 +3060,8 @@ def do_es_alert_estimation_query(
multi_search = multi_search.add(main_doc_count_query)
# Build RECAPDocuments count query.
- _, join_query = build_es_base_query(search_query, cd)
- child_docs_count_query = build_child_docs_query(join_query, cd)
+ _, child_docs_query, _ = build_es_base_query(search_query, cd)
+ child_docs_count_query = build_child_docs_query(child_docs_query, cd)
child_total = 0
if child_docs_count_query:
child_docs_count_query = search_query.query(child_docs_count_query)
@@ -3077,11 +3083,14 @@ def do_es_alert_estimation_query(
def do_es_sweep_alert_query(
search_query: Search,
+ child_search_query: Search,
cd: CleanData,
-) -> tuple[list[Hit] | None, int | None]:
+) -> tuple[list[Hit] | None, Response | None, Response | None]:
"""Build an ES query for its use in the daily RECAP sweep index.
:param search_query: Elasticsearch DSL Search object.
+ :param child_search_query: The Elasticsearch DSL Search object used to
+ perform the child-only query.
:param cd: The query CleanedData
:return: A two-tuple, the Elasticsearch search query object and an ES
Query for child documents, or None if there is no need to query
@@ -3092,29 +3101,54 @@ def do_es_sweep_alert_query(
if search_form.is_valid():
cd = search_form.cleaned_data
else:
- return None, None
-
- total_hits = None
-
- s, _ = build_es_base_query(search_query, cd, True, alerts=True)
+ return None, None, None
+ s, child_query, parent_query = build_es_base_query(
+ search_query, cd, True, alerts=True
+ )
main_query = add_es_highlighting(s, cd, alerts=True)
main_query = main_query.sort(build_sort_results(cd))
main_query = main_query.extra(
from_=0, size=settings.SCHEDULED_ALERT_HITS_LIMIT
)
- results = main_query.execute()
- if results:
- total_hits = results.hits.total.value
-
- limit_inner_hits({}, results, cd["type"])
- set_results_highlights(results, cd["type"])
- for result in results:
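+ # Execute the main query plus the optional parent-only and child-only
+ # queries in a single multi-search round trip.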
+ multi_search = MultiSearch()
+ multi_search = multi_search.add(main_query)
+ if parent_query:
+ parent_search = search_query.query(parent_query)
+ parent_search = parent_search.extra(
+ from_=0, size=settings.SCHEDULED_ALERT_HITS_LIMIT
+ )
+ parent_search = parent_search.source(includes=["docket_id"])
+ multi_search = multi_search.add(parent_search)
+
+ if child_query:
+ child_search = child_search_query.query(child_query)
+ child_search = child_search.extra(
+ from_=0,
+ size=settings.SCHEDULED_ALERT_HITS_LIMIT
+ * settings.RECAP_CHILD_HITS_PER_RESULT,
+ )
+ child_search = child_search.source(includes=["id"])
+ multi_search = multi_search.add(child_search)
+
+ responses = multi_search.execute()
+ main_results = responses[0]
+ rd_results = None
+ docket_results = None
+ if parent_query:
+ docket_results = responses[1]
+ if child_query:
+ rd_results = responses[2]
+
+ limit_inner_hits({}, main_results, cd["type"])
+ set_results_highlights(main_results, cd["type"])
+
+ for result in main_results:
child_result_objects = []
if hasattr(result, "child_docs"):
for child_doc in result.child_docs:
child_result_objects.append(child_doc.to_dict())
result["child_docs"] = child_result_objects
- return results, total_hits
+ return main_results, docket_results, rd_results
diff --git a/cl/search/documents.py b/cl/search/documents.py
index 378dbb9477..cedc638170 100644
--- a/cl/search/documents.py
+++ b/cl/search/documents.py
@@ -4,6 +4,7 @@
from django.http import QueryDict
from django.utils.html import escape, strip_tags
from django_elasticsearch_dsl import Document, fields
+from elasticsearch_dsl import Document as DSLDocument
from cl.alerts.models import Alert
from cl.audio.models import Audio
@@ -364,7 +365,7 @@ def prepare_percolator_query(self, instance):
cd = search_form.cleaned_data
search_query = AudioDocument.search()
- query, _ = build_es_base_query(search_query, cd)
+ query, _, _ = build_es_base_query(search_query, cd)
return query.to_dict()["query"]
@@ -961,8 +962,7 @@ def prepare_timestamp(self, instance):
return datetime.utcnow()
-@recap_index.document
-class ESRECAPDocument(DocketBaseDocument):
+class ESRECAPBaseDocument(DSLDocument):
id = fields.IntegerField(attr="pk")
docket_entry_id = fields.IntegerField(attr="docket_entry.pk")
description = fields.TextField(
@@ -1030,6 +1030,10 @@ class ESRECAPDocument(DocketBaseDocument):
fields.IntegerField(multi=True),
)
+
+@recap_index.document
+class ESRECAPDocument(DocketBaseDocument, ESRECAPBaseDocument):
+
class Django:
model = RECAPDocument
ignore_signals = True
@@ -1837,3 +1841,14 @@ class Index:
"number_of_replicas": settings.ELASTICSEARCH_RECAP_NUMBER_OF_REPLICAS,
"analysis": settings.ELASTICSEARCH_DSL["analysis"],
}
+
+
+class ESRECAPSweepDocument(ESRECAPBaseDocument):
+
+ class Index:
+ name = "recap_document_sweep"
+ settings = {
+ "number_of_shards": settings.ELASTICSEARCH_RECAP_NUMBER_OF_SHARDS,
+ "number_of_replicas": settings.ELASTICSEARCH_RECAP_NUMBER_OF_REPLICAS,
+ "analysis": settings.ELASTICSEARCH_DSL["analysis"],
+ }
diff --git a/cl/search/tests/tests_es_oral_arguments.py b/cl/search/tests/tests_es_oral_arguments.py
index f47453b7bc..a8f2db4541 100644
--- a/cl/search/tests/tests_es_oral_arguments.py
+++ b/cl/search/tests/tests_es_oral_arguments.py
@@ -984,7 +984,7 @@ def confirm_query_matched(response, query_id) -> bool:
@staticmethod
def save_percolator_query(cd):
search_query = AudioDocument.search()
- query, _ = build_es_base_query(search_query, cd)
+ query, _, _ = build_es_base_query(search_query, cd)
query_dict = query.to_dict()["query"]
percolator_query = AudioPercolator(
percolator_query=query_dict, rate=Alert.REAL_TIME
diff --git a/cl/search/tests/tests_es_person.py b/cl/search/tests/tests_es_person.py
index 0eb72bfe96..ef994b9a50 100644
--- a/cl/search/tests/tests_es_person.py
+++ b/cl/search/tests/tests_es_person.py
@@ -1342,7 +1342,7 @@ def test_has_child_filters(self) -> None:
"type": SEARCH_TYPES.PEOPLE,
}
s = PersonDocument.search()
- main_query, _ = build_es_base_query(s, cd)
+ main_query, _, _ = build_es_base_query(s, cd)
self.assertEqual(main_query.count(), 2)
# Query by parent field dob_state and child field selection_method.
@@ -1352,7 +1352,7 @@ def test_has_child_filters(self) -> None:
"type": SEARCH_TYPES.PEOPLE,
}
s = PersonDocument.search()
- main_query, _ = build_es_base_query(s, cd)
+ main_query, _, _ = build_es_base_query(s, cd)
self.assertEqual(main_query.count(), 1)
position_5.delete()
From a4e4e62250184b1de9b754cceff5cf60deb19723 Mon Sep 17 00:00:00 2001
From: Alberto Islas
Date: Wed, 10 Jul 2024 16:42:27 -0600
Subject: [PATCH 19/33] fix(alerts): Added more tests related to filtering
cross-object hits.
- Fixed issues and improved command resumability
---
.../commands/cl_send_recap_alerts.py | 115 +++++---
cl/alerts/tests/tests_recap_alerts.py | 265 ++++++++++++++----
2 files changed, 290 insertions(+), 90 deletions(-)
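Note on resumability: each expensive stage of the sweep now records its own
completion key in Redis ("alert_sweep:main_re_index_completed" and
"alert_sweep:rd_re_index_completed"), and the stored "alert_sweep:query_date"
is only cleared once alerts are sent, so a re-run after a crash skips finished
stages and keeps querying the original date. A generic sketch of the
checkpoint pattern, assuming a redis-py style client (helper names here are
illustrative):

    from typing import Callable
    from redis import Redis

    def run_stage(r: Redis, key: str, stage: Callable[[], None]) -> None:
        """Run `stage` unless a prior run already completed it."""
        if r.exists(key):
            return  # finished in an earlier run; skip on resume
        stage()
        # Expire after 12 hours so the next day's sweep starts fresh.
        r.set(key, 1, ex=3600 * 12)

    # Usage sketch:
    # run_stage(r, "alert_sweep:main_re_index_completed", main_reindex)
    # run_stage(r, "alert_sweep:rd_re_index_completed", rd_reindex)
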
diff --git a/cl/alerts/management/commands/cl_send_recap_alerts.py b/cl/alerts/management/commands/cl_send_recap_alerts.py
index 763651c985..7f05ce4291 100644
--- a/cl/alerts/management/commands/cl_send_recap_alerts.py
+++ b/cl/alerts/management/commands/cl_send_recap_alerts.py
@@ -2,7 +2,7 @@
import datetime
import time
import traceback
-from typing import Any, Type
+from typing import Any, Literal, Type
import pytz
from asgiref.sync import async_to_sync
@@ -140,12 +140,21 @@ def index_daily_recap_documents(
:return: The total number of documents re-indexed.
"""
- if r.exists("alert_sweep:re_index_completed"):
+ if r.exists("alert_sweep:main_re_index_completed"):
logger.info(
- "The re-index task has been completed and will be omitted."
+ "The main re-index task has been completed and will be omitted."
)
- # The re-indexing has been completed for the day. Abort it and proceed
- # with sending alerts.
+ # The main re-indexing has been completed for the day. Abort it and
+ # proceed with the RECAPDocument re-index.
+ return 0
+
+ if r.exists("alert_sweep:rd_re_index_completed"):
+ logger.info(
+ "The RECAPDocument only re-index task has been completed and will "
+ "be omitted."
+ )
+ # The RECAPDocument re-indexing has been completed for the day. Abort
+ # it and proceed with sending alerts.
return 0
if not r.exists("alert_sweep:query_date"):
@@ -368,7 +377,7 @@ def fields = [
def should_docket_hit_be_included(
- r: Redis, alert_id: int, docket_id: int
+ r: Redis, alert_id: int, docket_id: int, query_date: datetime.date
) -> bool:
"""Determine if a Docket alert should be triggered based on its
date_modified and if the docket has triggered the alert previously.
@@ -376,21 +385,19 @@ def should_docket_hit_be_included(
:param r: The Redis interface.
:param alert_id: The ID of the alert.
:param docket_id: The ID of the docket.
+ :param query_date: The daily re_index query date.
:return: True if the Docket alert should be triggered, False otherwise.
"""
docket = Docket.objects.filter(id=docket_id).only("date_modified").first()
if not docket:
return False
if not has_document_alert_hit_been_triggered(r, alert_id, "d", docket_id):
- local_midnight_localized = timezone.localtime(
- timezone.make_aware(
- datetime.datetime.fromisoformat(
- str(r.get("alert_sweep:query_date"))
- )
- )
- )
+ # Confirm the docket was modified during the day we're sending alerts
+ # for, to avoid triggering docket-only alerts merely because related
+ # RECAPDocuments were indexed that day; RDs also index docket fields,
+ # which could otherwise trigger docket-only alerts.
date_modified_localized = dt_as_local_date(docket.date_modified)
- if date_modified_localized == local_midnight_localized.date():
+ if date_modified_localized == query_date:
return True
return False
@@ -464,6 +471,7 @@ def process_alert_hits(
parent_results: Response | None,
child_results: Response | None,
alert_id: int,
+ query_date: datetime.date,
) -> list[Hit]:
"""Process alert hits by filtering and prepare the results to send based
on alert conditions.
@@ -473,9 +481,9 @@ def process_alert_hits(
:param parent_results: The ES Response for the docket-only query.
:param child_results: The ES Response for the RECAPDocument-only query.
:param alert_id: The ID of the alert being processed.
+ :param query_date: The daily re_index query date.
:return: A list of Hit objects that are filtered and prepared to be sent.
"""
-
docket_hits = parent_results.hits if parent_results else []
docket_ids = [int(d.docket_id) for d in docket_hits]
@@ -490,32 +498,31 @@ def process_alert_hits(
r, alert_id, hit["child_docs"], rd_ids, check_rd_hl=True
)
if rds_to_send:
- # Cross-object query
+ # Docket OR RECAPDocument alert.
hit["child_docs"] = rds_to_send
results_to_send.append(hit)
if should_docket_hit_be_included(
- r, alert_id, hit.docket_id
+ r, alert_id, hit.docket_id, query_date
):
add_document_hit_to_alert_set(
r, alert_id, "d", hit.docket_id
)
- # Docket-only alert
- elif should_docket_hit_be_included(r, alert_id, hit.docket_id):
+ elif should_docket_hit_be_included(
+ r, alert_id, hit.docket_id, query_date
+ ):
# Docket-only alert
hit["child_docs"] = []
results_to_send.append(hit)
add_document_hit_to_alert_set(
r, alert_id, "d", hit.docket_id
)
-
else:
- # RECAP-only alerts or cross-object alerts
+ # RECAPDocument-only alerts or cross-object alerts
rds_to_send = filter_rd_alert_hits(
r, alert_id, hit["child_docs"], rd_ids
)
if rds_to_send:
- # Cross-object alert
hit["child_docs"] = rds_to_send
results_to_send.append(hit)
return results_to_send
@@ -541,7 +548,18 @@ def send_search_alert_webhooks(
)
-def query_and_send_alerts(r: Redis, rate: str) -> None:
+def query_and_send_alerts(
+ r: Redis, rate: Literal["rt", "dly"], query_date: datetime.date
+) -> None:
+ """Query the sweep index and send alerts based on the specified rate
+ and date.
+
+ :param r: The Redis interface.
+ :param rate: The rate at which to query alerts.
+ :param query_date: The daily re_index query date.
+ :return: None.
+ """
+
alert_users: UserProfile.user = User.objects.filter(
alerts__rate=rate
).distinct()
@@ -566,11 +584,7 @@ def query_and_send_alerts(r: Redis, rate: str) -> None:
alerts_to_update.append(alert.pk)
search_type = search_params.get("type", SEARCH_TYPES.RECAP)
results_to_send = process_alert_hits(
- r,
- results,
- parent_results,
- child_results,
- alert.pk,
+ r, results, parent_results, child_results, alert.pk, query_date
)
if results_to_send:
hits.append(
@@ -600,7 +614,18 @@ def query_and_send_alerts(r: Redis, rate: str) -> None:
logger.info(f"Sent {alerts_sent_count} {rate} email alerts.")
-def query_and_schedule_alerts(r: Redis, rate: str):
+def query_and_schedule_alerts(
+ r: Redis, rate: Literal["wly", "mly"], query_date: datetime.date
+) -> None:
+ """Query the sweep index and schedule alerts based on the specified rate
+ and date.
+
+ :param r: The Redis interface.
+ :param rate: The rate at which to query alerts.
+ :param query_date: The daily re_index query date.
+ :return: None.
+ """
+
alert_users = User.objects.filter(alerts__rate=rate).distinct()
for user in alert_users:
alerts = user.alerts.filter(rate=rate, alert_type=SEARCH_TYPES.RECAP)
@@ -615,11 +640,7 @@ def query_and_schedule_alerts(r: Redis, rate: str):
continue
results_to_send = process_alert_hits(
- r,
- results,
- parent_results,
- child_results,
- alert.pk,
+ r, results, parent_results, child_results, alert.pk, query_date
)
if results_to_send:
for hit in results_to_send:
@@ -678,6 +699,10 @@ def handle(self, *args, **options):
RECAPSweepDocument,
testing=testing_mode,
)
+ if not testing_mode:
+ # Set the main_re_index_completed key so the main re_index task
+ # can be skipped when resuming after a failure.
+ r.set("alert_sweep:main_re_index_completed", 1, ex=3600 * 12)
index_daily_recap_documents(
r,
DocketDocument._index._name,
@@ -686,11 +711,21 @@ def handle(self, *args, **options):
only_rd=True,
)
if not testing_mode:
- r.set("alert_sweep:re_index_completed", 1, ex=3600 * 12)
+ # Set the rd_re_index_completed key so the RECAPDocument re_index
+ # task can be skipped when resuming after a failure.
+ r.set("alert_sweep:rd_re_index_completed", 1, ex=3600 * 12)
- query_and_send_alerts(r, Alert.REAL_TIME)
- query_and_send_alerts(r, Alert.DAILY)
- query_and_schedule_alerts(r, Alert.WEEKLY)
- query_and_schedule_alerts(r, Alert.MONTHLY)
- r.delete("alert_sweep:re_index_completed")
+ query_date = timezone.localtime(
+ timezone.make_aware(
+ datetime.datetime.fromisoformat(
+ str(r.get("alert_sweep:query_date"))
+ )
+ )
+ ).date()
+ query_and_send_alerts(r, Alert.REAL_TIME, query_date)
+ query_and_send_alerts(r, Alert.DAILY, query_date)
+ query_and_schedule_alerts(r, Alert.WEEKLY, query_date)
+ query_and_schedule_alerts(r, Alert.MONTHLY, query_date)
+ r.delete("alert_sweep:main_re_index_completed")
+ r.delete("alert_sweep:rd_re_index_completed")
r.delete("alert_sweep:query_date")
diff --git a/cl/alerts/tests/tests_recap_alerts.py b/cl/alerts/tests/tests_recap_alerts.py
index 1fd573c8fb..a661347975 100644
--- a/cl/alerts/tests/tests_recap_alerts.py
+++ b/cl/alerts/tests/tests_recap_alerts.py
@@ -78,7 +78,9 @@ def setUp(self):
self.r = get_redis_interface("CACHE")
self.r.delete("alert_sweep:query_date")
self.r.delete("alert_sweep:task_id")
- self.r.delete("alert_hits:")
+ keys = self.r.keys("alert_hits:*")
+ if keys:
+ self.r.delete(*keys)
@staticmethod
def get_html_content_from_email(email_content):
@@ -141,9 +143,7 @@ def _count_alert_hits_and_child_hits(
self.assertTrue(
alert_element, msg=f"Not alert with title {alert_title} found."
)
-
alert_cases = self._extract_cases_from_alert(tree, alert_title)
-
self.assertEqual(
len(alert_cases),
expected_hits,
@@ -152,21 +152,23 @@ def _count_alert_hits_and_child_hits(
% (alert_title, expected_hits, len(alert_cases)),
)
if case_title:
- child_hit_count = 0
for case in alert_cases:
- case_text = " ".join(case.xpath(".//text()")).strip()
+ child_hit_count = 0
+ case_text = " ".join(
+ [element.strip() for element in case.xpath(".//text()")]
+ )
if case_title in case_text:
child_hit_count = len(
case.xpath("following-sibling::ul[1]/li/a")
)
-
- self.assertEqual(
- child_hit_count,
- expected_child_hits,
- msg="Did not get the right number of child hits for the case %s. "
- "Expected: %s - Got: %s\n\n"
- % (case_title, expected_child_hits, child_hit_count),
- )
+ self.assertEqual(
+ child_hit_count,
+ expected_child_hits,
+ msg="Did not get the right number of child hits for the case %s. "
+ "Expected: %s - Got: %s\n\n"
+ % (case_title, expected_child_hits, child_hit_count),
+ )
+ break
def _assert_child_hits_content(
self,
@@ -865,17 +867,17 @@ def test_filter_out_alerts_to_send_by_query_and_hits(self) -> None:
docket.delete()
- def test_special_cross_object_alerts(self) -> None:
+ def test_special_cross_object_alerts_or_clause(self) -> None:
"""This test confirms that hits are properly filtered out or included
in alerts for special cross-object alerts that can match either a
Docket-only hit and/or Docket + RDs simultaneously in the same hit.
These cases include queries that use an OR clause combining
- Docket field + RD fields or a text query that can match a Docket and
- RD field simultaneously.
+ Docket field + RD fields.
"""
# The following test confirms that an alert with a query that can match
- # a Docket or RECAPDocuments simultaneously is properly filtered.
+ # a Docket or RECAPDocuments from different cases simultaneously is
+ # properly filtered.
cross_object_alert_d_or_rd_field = AlertFactory(
user=self.user_profile.user,
rate=Alert.REAL_TIME,
@@ -897,34 +899,88 @@ def test_special_cross_object_alerts(self) -> None:
)
html_content = self.get_html_content_from_email(mail.outbox[0])
self._confirm_number_of_alerts(html_content, 1)
+
# This hit should only display the Docket matched by its ID,
# no RECAPDocument should be matched.
- self._assert_child_hits_content(
+ self._count_alert_hits_and_child_hits(
html_content,
cross_object_alert_d_or_rd_field.name,
+ 2,
self.de.docket.case_name,
- [],
+ 0,
)
-
- # This hit should display the rd_2 nested below its parent docket.
+ # The second hit should display the rd_2 nested below its parent docket.
self._assert_child_hits_content(
html_content,
cross_object_alert_d_or_rd_field.name,
self.de_1.docket.case_name,
[self.rd_2.description],
)
-
# Assert email text version:
txt_email = mail.outbox[0].body
self.assertIn(cross_object_alert_d_or_rd_field.name, txt_email)
self.assertIn(self.rd_2.description, txt_email)
+ # This test confirms that we're able to trigger cross-object alerts
+ # that include an OR clause and match documents that belong to the
+ # same case.
+ cross_object_alert_d_or_rd_field_same_case = AlertFactory(
+ user=self.user_profile.user,
+ rate=Alert.REAL_TIME,
+ name="Test Alert Cross-object query",
+ query=f"q=docket_id:{self.de.docket.pk} OR pacer_doc_id:{self.rd.pacer_doc_id}&type=r",
+ )
+ with mock.patch(
+ "cl.api.webhooks.requests.post",
+ side_effect=lambda *args, **kwargs: MockResponse(
+ 200, mock_raw=True
+ ),
+ ), time_machine.travel(self.mock_date, tick=False):
+ call_command("cl_send_recap_alerts", testing_mode=True)
+
+ # A new alert should be triggered, containing the RD document nested
+ # below its parent docket.
+ self.assertEqual(
+ len(mail.outbox), 2, msg="Outgoing emails don't match."
+ )
+ html_content = self.get_html_content_from_email(mail.outbox[1])
+ self._confirm_number_of_alerts(html_content, 1)
+ self._count_alert_hits_and_child_hits(
+ html_content,
+ cross_object_alert_d_or_rd_field.name,
+ 1,
+ self.de.docket.case_name,
+ 1,
+ )
+ self._assert_child_hits_content(
+ html_content,
+ cross_object_alert_d_or_rd_field_same_case.name,
+ self.de.docket.case_name,
+ [self.rd.description],
+ )
+
+ def test_special_cross_object_alerts_text_query(self) -> None:
+ """This test confirms that hits are properly filtered out or included
+ in alerts for special cross-object alerts that can match either a
+ Docket-only hit and/or Docket + RDs simultaneously in the same hit.
+ These cases include queries that use a text query that can match a
+ Docket and RD field simultaneously.
+ """
+
# This test confirms a text query cross-object alert matches documents
# according to trigger conditions like indexed date and previous triggers
# by the same document.
+ cross_object_alert_text = AlertFactory(
+ user=self.user_profile.user,
+ rate=Alert.REAL_TIME,
+ name="Test Alert Cross-object text query",
+ query=f'q="United states"&type=r',
+ )
two_days_before = self.mock_date - datetime.timedelta(days=2)
mock_two_days_before = two_days_before.replace(hour=5)
- with time_machine.travel(mock_two_days_before, tick=False):
+ with time_machine.travel(
+ mock_two_days_before, tick=False
+ ), self.captureOnCommitCallbacks(execute=True):
docket = DocketFactory(
court=self.court,
case_name="United States of America",
@@ -932,6 +988,22 @@ def test_special_cross_object_alerts(self) -> None:
source=Docket.RECAP,
)
+ with mock.patch(
+ "cl.api.webhooks.requests.post",
+ side_effect=lambda *args, **kwargs: MockResponse(
+ 200, mock_raw=True
+ ),
+ ), time_machine.travel(self.mock_date, tick=False):
+ call_command("cl_send_recap_alerts", testing_mode=True)
+
+ # No alert should be triggered since the matched docket was not
+ # modified during the current day.
+ self.assertEqual(
+ len(mail.outbox), 0, msg="Outgoing emails don't match."
+ )
+
+ # Index new documents that match cross_object_alert_text: an RD and
+ # an empty docket.
with time_machine.travel(
self.mock_date, tick=False
), self.captureOnCommitCallbacks(execute=True):
@@ -951,17 +1023,11 @@ def test_special_cross_object_alerts(self) -> None:
docket_2 = DocketFactory(
court=self.court,
- case_name="United States of America vs Lorem",
+ case_name="United States vs Lorem",
docket_number="1:21-bk-1008",
source=Docket.RECAP,
)
- cross_object_alert_text = AlertFactory(
- user=self.user_profile.user,
- rate=Alert.REAL_TIME,
- name="Test Alert Cross-object query",
- query=f'q="United states"&type=r',
- )
with mock.patch(
"cl.api.webhooks.requests.post",
side_effect=lambda *args, **kwargs: MockResponse(
@@ -970,14 +1036,21 @@ def test_special_cross_object_alerts(self) -> None:
), time_machine.travel(self.mock_date, tick=False):
call_command("cl_send_recap_alerts", testing_mode=True)
- # A new alert should be triggered containing two hits. One matched by
+ # An alert should be triggered containing two hits. One matched by
# the rd_3 plain text description and one matched by docket_2 case_name
self.assertEqual(
- len(mail.outbox), 2, msg="Outgoing emails don't match."
+ len(mail.outbox), 1, msg="Outgoing emails don't match."
)
- html_content = self.get_html_content_from_email(mail.outbox[1])
- # rd_3 should appear nested in this hit.
+ html_content = self.get_html_content_from_email(mail.outbox[0])
self._confirm_number_of_alerts(html_content, 1)
+ self._count_alert_hits_and_child_hits(
+ html_content,
+ cross_object_alert_text.name,
+ 2,
+ docket.case_name,
+ 1,
+ )
+ # rd_3 should appear nested in this hit.
self._assert_child_hits_content(
html_content,
cross_object_alert_text.name,
@@ -991,8 +1064,7 @@ def test_special_cross_object_alerts(self) -> None:
docket_2.case_name,
[],
)
-
- # Modify the docket today:
+ # Modify 1:21-bk-1009 docket today:
with time_machine.travel(
self.mock_date, tick=False
), self.captureOnCommitCallbacks(execute=True):
@@ -1008,11 +1080,19 @@ def test_special_cross_object_alerts(self) -> None:
), time_machine.travel(self.mock_date, tick=False):
call_command("cl_send_recap_alerts", testing_mode=True)
- # A new alert should be triggered containing docket as a hit with no
- # nested RDs.
- html_content = self.get_html_content_from_email(mail.outbox[2])
+ # A new alert should be triggered containing the docket as a hit with
+ # no nested RDs.
+ html_content = self.get_html_content_from_email(mail.outbox[1])
self.assertEqual(
- len(mail.outbox), 3, msg="Outgoing emails don't match."
+ len(mail.outbox), 2, msg="Outgoing emails don't match."
+ )
+ self._confirm_number_of_alerts(html_content, 1)
+ self._count_alert_hits_and_child_hits(
+ html_content,
+ cross_object_alert_text.name,
+ 1,
+ docket.case_name,
+ 0,
)
self._assert_child_hits_content(
html_content,
@@ -1029,20 +1109,41 @@ def test_special_cross_object_alerts(self) -> None:
),
), time_machine.travel(self.mock_date, tick=False):
call_command("cl_send_recap_alerts", testing_mode=True)
-
# No new alerts should be triggered.
self.assertEqual(
- len(mail.outbox), 3, msg="Outgoing emails don't match."
+ len(mail.outbox), 2, msg="Outgoing emails don't match."
)
+ # Index new RECAPDocuments under alert_de; only rd_5 matches
+ # cross_object_alert_text.
+ with time_machine.travel(
+ self.mock_date, tick=False
+ ), self.captureOnCommitCallbacks(execute=True):
+ rd_4 = RECAPDocumentFactory(
+ docket_entry=alert_de,
+ description="Hearing new",
+ document_number="3",
+ pacer_doc_id="0180366528790",
+ plain_text="Lorem ipsum",
+ )
+ rd_5 = RECAPDocumentFactory(
+ docket_entry=alert_de,
+ description="Hearing new 2",
+ document_number="4",
+ pacer_doc_id="018026657750",
+ plain_text="United states of america plain text",
+ )
+
# This test confirms that we're able to trigger cross-object alerts
- # that include an OR clause and match documents that belong to the
- # same case.
- cross_object_alert_d_or_rd_field_same_case = AlertFactory(
+ # that include an OR clause and a cross-object text query.
+ cross_object_alert_d_or_rd_field_text_query = AlertFactory(
user=self.user_profile.user,
rate=Alert.REAL_TIME,
- name="Test Alert Cross-object query",
- query=f"q=docket_id:{self.de.docket.pk} OR pacer_doc_id:{self.rd.pacer_doc_id}&type=r",
+ name="Test Alert Cross-object query combined.",
+ query=f"q=docket_id:{self.de.docket.pk} OR "
+ f"pacer_doc_id:{self.rd.pacer_doc_id} OR "
+ f'("United States of America" OR '
+ f"pacer_doc_id:{rd_3.pacer_doc_id})&type=r",
)
with mock.patch(
"cl.api.webhooks.requests.post",
@@ -1054,16 +1155,80 @@ def test_special_cross_object_alerts(self) -> None:
# A new alert should be triggered, containing the RD document nested below
# its parent docket.
+ html_content = self.get_html_content_from_email(mail.outbox[2])
+ self.assertEqual(
+ len(mail.outbox), 3, msg="Outgoing emails don't match."
+ )
+ # The email contains two alerts: one for cross_object_alert_text
+ # triggered by the newly added rd_5, and one for cross_object_alert_d_or_rd_field_text_query.
+ self._confirm_number_of_alerts(html_content, 2)
+ self._count_alert_hits_and_child_hits(
+ html_content,
+ cross_object_alert_text.name,
+ 1,
+ docket.case_name,
+ 1,
+ )
+ # The cross_object_alert_d_or_rd_field_text_query alert contains two
+ # hits. The first one matches "docket", with rd_3 and rd_5 nested below
+ # due to the OR clause in the text query, and the second hit matches
+ # self.de.docket and self.rd.
+ self._count_alert_hits_and_child_hits(
+ html_content,
+ cross_object_alert_d_or_rd_field_text_query.name,
+ 2,
+ docket.case_name,
+ 2,
+ )
+ self._assert_child_hits_content(
+ html_content,
+ cross_object_alert_d_or_rd_field_text_query.name,
+ docket.case_name,
+ [rd_3.description, rd_5.description],
+ )
+ self._assert_child_hits_content(
+ html_content,
+ cross_object_alert_d_or_rd_field_text_query.name,
+ self.de.docket.case_name,
+ [self.rd.description],
+ )
+
+ # This test confirms that hits are properly filtered when using AND in
+ # the text query.
+ cross_object_alert_d_or_rd_field_text_query_and = AlertFactory(
+ user=self.user_profile.user,
+ rate=Alert.REAL_TIME,
+ name="Test Alert Cross-object query combined.",
+ query=f'q=("United States of America" AND '
+ f"pacer_doc_id:{rd_3.pacer_doc_id})&type=r",
+ )
+ with mock.patch(
+ "cl.api.webhooks.requests.post",
+ side_effect=lambda *args, **kwargs: MockResponse(
+ 200, mock_raw=True
+ ),
+ ), time_machine.travel(self.mock_date, tick=False):
+ call_command("cl_send_recap_alerts", testing_mode=True)
+
+ # A new alert should be triggered, containing rd_3 document nested below
+ # its parent docket.
html_content = self.get_html_content_from_email(mail.outbox[3])
self.assertEqual(
len(mail.outbox), 4, msg="Outgoing emails don't match."
)
self._confirm_number_of_alerts(html_content, 1)
+ self._count_alert_hits_and_child_hits(
+ html_content,
+ cross_object_alert_d_or_rd_field_text_query_and.name,
+ 1,
+ docket.case_name,
+ 1,
+ )
self._assert_child_hits_content(
html_content,
- cross_object_alert_d_or_rd_field_same_case.name,
- self.de.docket.case_name,
- [self.rd.description],
+ cross_object_alert_d_or_rd_field_text_query_and.name,
+ docket.case_name,
+ [rd_3.description],
)
docket.delete()
From b56f2354e43c74267811fd1f1567b2e6ae2cdb7d Mon Sep 17 00:00:00 2001
From: Alberto Islas
Date: Thu, 25 Jul 2024 14:51:55 -0500
Subject: [PATCH 20/33] fix(alerts): Restore send_es_search_alert_webhook to
avoid conflicts with tasks already scheduled in the queue
- This can be removed after tasks in the queue have been processed.
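For context, a minimal sketch of the failure mode this restore avoids, assuming
Celery's default name-based task dispatch (illustrative registrations only; the
real tasks live in cl/api/tasks.py):

    from celery import Celery

    app = Celery("cl")

    # Queued messages reference tasks by name. If a deploy renames a task,
    # messages enqueued by old code raise NotRegistered on new workers.
    # Keeping the old name registered lets the queue drain safely.
    @app.task(name="send_search_alert_webhook_es")
    def send_search_alert_webhook_es(results, webhook_pk, alert_pk):
        ...  # new implementation

    @app.task(name="send_es_search_alert_webhook")
    def send_es_search_alert_webhook(results, webhook_pk, alert):
        ...  # legacy signature, kept until queued tasks are processed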
---
.../commands/cl_send_recap_alerts.py | 4 +-
cl/alerts/tasks.py | 4 +-
cl/api/tasks.py | 42 +++++++++++++++++++
3 files changed, 46 insertions(+), 4 deletions(-)
diff --git a/cl/alerts/management/commands/cl_send_recap_alerts.py b/cl/alerts/management/commands/cl_send_recap_alerts.py
index 7f05ce4291..193823cb22 100644
--- a/cl/alerts/management/commands/cl_send_recap_alerts.py
+++ b/cl/alerts/management/commands/cl_send_recap_alerts.py
@@ -25,7 +25,7 @@
recap_document_hl_matched,
)
from cl.api.models import WebhookEventType
-from cl.api.tasks import send_es_search_alert_webhook
+from cl.api.tasks import send_search_alert_webhook_es
from cl.lib.command_utils import VerboseCommand, logger
from cl.lib.date_time import dt_as_local_date
from cl.lib.elasticsearch_utils import do_es_sweep_alert_query
@@ -543,7 +543,7 @@ def send_search_alert_webhooks(
event_type=WebhookEventType.SEARCH_ALERT, enabled=True
)
for user_webhook in user_webhooks:
- send_es_search_alert_webhook.delay(
+ send_search_alert_webhook_es.delay(
results_to_send, user_webhook.pk, alert_id
)
diff --git a/cl/alerts/tasks.py b/cl/alerts/tasks.py
index f7b004bc54..e3aad0a071 100644
--- a/cl/alerts/tasks.py
+++ b/cl/alerts/tasks.py
@@ -25,7 +25,7 @@
from cl.api.models import WebhookEventType
from cl.api.tasks import (
send_docket_alert_webhook_events,
- send_es_search_alert_webhook,
+ send_search_alert_webhook_es,
)
from cl.celery_init import app
from cl.custom_filters.templatetags.text_filters import best_case_name
@@ -458,7 +458,7 @@ def send_webhook_alert_hits(
event_type=WebhookEventType.SEARCH_ALERT, enabled=True
)
for user_webhook in user_webhooks:
- send_es_search_alert_webhook.delay(
+ send_search_alert_webhook_es.delay(
documents,
user_webhook.pk,
alert.pk,
diff --git a/cl/api/tasks.py b/cl/api/tasks.py
index a0d6112444..7f0b8d2cdd 100644
--- a/cl/api/tasks.py
+++ b/cl/api/tasks.py
@@ -82,8 +82,50 @@ def send_docket_alert_webhook_events(
send_webhook_event(webhook_event, json_bytes)
+# TODO: Remove after scheduled OA alerts have been processed.
@app.task()
def send_es_search_alert_webhook(
+ results: list[dict[str, Any]],
+ webhook_pk: int,
+ alert: Alert,
+) -> None:
+ """Send a search alert webhook event containing search results from a
+ search alert object.
+
+ :param results: The search results returned by SOLR for this alert.
+ :param webhook_pk: The webhook endpoint ID object to send the event to.
+ :param alert: The search alert object.
+ """
+
+ webhook = Webhook.objects.get(pk=webhook_pk)
+ serialized_alert = SearchAlertSerializerModel(alert).data
+ es_results = []
+ for result in results:
+ result["snippet"] = result["text"]
+ es_results.append(ResultObject(initial=result))
+ serialized_results = V3OAESResultSerializer(es_results, many=True).data
+
+ post_content = {
+ "webhook": generate_webhook_key_content(webhook),
+ "payload": {
+ "results": serialized_results,
+ "alert": serialized_alert,
+ },
+ }
+ renderer = JSONRenderer()
+ json_bytes = renderer.render(
+ post_content,
+ accepted_media_type="application/json;",
+ )
+ webhook_event = WebhookEvent.objects.create(
+ webhook=webhook,
+ content=post_content,
+ )
+ send_webhook_event(webhook_event, json_bytes)
+
+
+@app.task()
+def send_search_alert_webhook_es(
results: list[dict[str, Any]] | list[Hit],
webhook_pk: int,
alert_pk: int,
From d1026640871d3bcca5774cca102dc466c905c406 Mon Sep 17 00:00:00 2001
From: Alberto Islas
Date: Mon, 29 Jul 2024 10:13:13 -0500
Subject: [PATCH 21/33] fix(alerts): Fixed MLY alerts test failing when run
after the 28th
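The pinned date works around the monthly-rate constraint; a sketch of the
assumed rule (hypothetical helper; the real guard, if any, lives in
cl_send_scheduled_alerts, outside this diff):

    import datetime

    def monthly_send_allowed(today: datetime.date) -> bool:
        # Every month has at least 28 days, so dispatching monthly alerts
        # only on day <= 28 guarantees no month is ever skipped. A test run
        # on the 29th-31st would fail unless time is frozen to a valid day.
        return today.day <= 28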
---
cl/alerts/tests/tests_recap_alerts.py | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/cl/alerts/tests/tests_recap_alerts.py b/cl/alerts/tests/tests_recap_alerts.py
index a661347975..3a067858d2 100644
--- a/cl/alerts/tests/tests_recap_alerts.py
+++ b/cl/alerts/tests/tests_recap_alerts.py
@@ -1608,7 +1608,9 @@ def test_schedule_wly_and_mly_recap_alerts(self) -> None:
self.assertIn(self.rd.description, txt_email)
# Send scheduled Monthly alerts and check assertions.
- call_command("cl_send_scheduled_alerts", rate=Alert.MONTHLY)
+ current_date = now().replace(day=28, hour=0)
+ with time_machine.travel(current_date, tick=False):
+ call_command("cl_send_scheduled_alerts", rate=Alert.MONTHLY)
self.assertEqual(
len(mail.outbox), 2, msg="Outgoing emails don't match."
)
From 57b6df7ac2e46514ca729f4d528ec7a46149fe82 Mon Sep 17 00:00:00 2001
From: Alberto Islas
Date: Thu, 26 Sep 2024 16:55:26 -0600
Subject: [PATCH 22/33] fix(alerts): Fixed merge conflicts and adjusted test
according to the new RECAP_CHILD_HITS_PER_RESULT value
---
cl/alerts/tests/tests_recap_alerts.py | 10 +++++-----
1 file changed, 5 insertions(+), 5 deletions(-)
diff --git a/cl/alerts/tests/tests_recap_alerts.py b/cl/alerts/tests/tests_recap_alerts.py
index 3a067858d2..b339a10ef7 100644
--- a/cl/alerts/tests/tests_recap_alerts.py
+++ b/cl/alerts/tests/tests_recap_alerts.py
@@ -1247,15 +1247,15 @@ def test_limit_alert_case_child_hits(self) -> None:
description="MOTION for Leave to File Amicus Curiae Lorem Served",
)
rd_descriptions = []
- for i in range(6):
+ for i in range(4):
rd = RECAPDocumentFactory(
docket_entry=alert_de,
description=f"Motion to File {i+1}",
document_number=f"{i+1}",
pacer_doc_id=f"018036652436{i+1}",
)
- if i < 5:
- # Omit the last alert to compare. Only up to 5 should be
+ if i < 3:
+ # Omit the last RD from the comparison. Only up to 3 should be
# included in the case.
rd_descriptions.append(rd.description)
@@ -1286,13 +1286,13 @@ def test_limit_alert_case_child_hits(self) -> None:
html_content = self.get_html_content_from_email(mail.outbox[0])
self.assertIn(recap_only_alert.name, html_content)
self._confirm_number_of_alerts(html_content, 1)
- # The case alert should contain up to 5 child hits.
+ # The case alert should contain up to 3 child hits.
self._count_alert_hits_and_child_hits(
html_content,
recap_only_alert.name,
1,
self.de.docket.case_name,
- 5,
+ 3,
)
self._assert_child_hits_content(
html_content,
From b35ef0aad49b230ad98735c4631986f7facd0069 Mon Sep 17 00:00:00 2001
From: Alberto Islas
Date: Thu, 26 Sep 2024 18:36:31 -0600
Subject: [PATCH 23/33] fix(elasticsearch): Fixed failing test due to
build_full_join_es_queries returning values change
---
cl/lib/elasticsearch_utils.py | 22 ++++++++++++++--------
1 file changed, 14 insertions(+), 8 deletions(-)
diff --git a/cl/lib/elasticsearch_utils.py b/cl/lib/elasticsearch_utils.py
index 34aceeeac8..df17f5e1dc 100644
--- a/cl/lib/elasticsearch_utils.py
+++ b/cl/lib/elasticsearch_utils.py
@@ -1197,15 +1197,21 @@ def build_es_base_query(
mlt_query = async_to_sync(build_more_like_this_query)(
cluster_pks
)
- main_query, join_query = build_full_join_es_queries(
- cd,
- {"opinion": []},
- [],
- mlt_query,
- child_highlighting=False,
- api_version=api_version,
+ main_query, child_docs_query, parent_query = (
+ build_full_join_es_queries(
+ cd,
+ {"opinion": []},
+ [],
+ mlt_query,
+ child_highlighting=False,
+ api_version=api_version,
+ )
+ )
+ return (
+ search_query.query(main_query),
+ child_docs_query,
+ parent_query,
)
- return search_query.query(main_query), join_query
opinion_search_fields = SEARCH_OPINION_QUERY_FIELDS
child_fields = opinion_search_fields.copy()
From 8902aa0c226c2aac1cc2ce82fe0c76205569cdde Mon Sep 17 00:00:00 2001
From: Alberto Islas
Date: Thu, 26 Sep 2024 18:46:29 -0600
Subject: [PATCH 24/33] fix(alerts): Removed recap_document_hl_matched as we no
longer rely on HL to filter out RD hits
---
.../commands/cl_send_recap_alerts.py | 22 +++---
cl/alerts/tests/tests_recap_alerts.py | 68 -------------------
cl/alerts/utils.py | 22 ------
3 files changed, 12 insertions(+), 100 deletions(-)
diff --git a/cl/alerts/management/commands/cl_send_recap_alerts.py b/cl/alerts/management/commands/cl_send_recap_alerts.py
index 193823cb22..8640ddc70a 100644
--- a/cl/alerts/management/commands/cl_send_recap_alerts.py
+++ b/cl/alerts/management/commands/cl_send_recap_alerts.py
@@ -22,7 +22,6 @@
add_document_hit_to_alert_set,
alert_hits_limit_reached,
has_document_alert_hit_been_triggered,
- recap_document_hl_matched,
)
from cl.api.models import WebhookEventType
from cl.api.tasks import send_search_alert_webhook_es
@@ -407,7 +406,7 @@ def filter_rd_alert_hits(
alert_id: int,
rd_hits: AttrList,
rd_ids: list[int],
- check_rd_hl=False,
+ check_rd_matched=False,
):
"""Filter RECAP document hits based on specified conditions.
@@ -416,8 +415,8 @@ def filter_rd_alert_hits(
:param rd_hits: A list of RECAPDocument hits to be processed.
:param rd_ids: A list of RECAPDocument IDs that matched the RECAPDocument
only query.
- :param check_rd_hl: A boolean indicating whether to check if the RECAP
- document hit matched RD HLs.
+ :param check_rd_matched: A boolean indicating whether to check if the RECAP
+ document hit from the main query also matches the RECAPDocument-only query.
:return: A list of RECAP document hits that meet all specified conditions.
"""
@@ -428,11 +427,10 @@ def filter_rd_alert_hits(
r, alert_id, "r", rd_hit["_source"]["id"]
)
]
- if check_rd_hl:
- if not recap_document_hl_matched(rd_hit):
- # If the RECAPDocument hit didn't match any HL. Check if it should be included
- # due to it matched the RECAPDocument only query.
- conditions.append(rd_hit["_source"]["id"] in rd_ids)
+ if check_rd_matched:
+ # Add a condition to check if the RD hit is within the RD IDs returned
+ # by the RECAPDocument-only query.
+ conditions.append(rd_hit["_source"]["id"] in rd_ids)
if all(conditions):
rds_to_send.append(rd_hit)
add_document_hit_to_alert_set(
@@ -495,7 +493,11 @@ def process_alert_hits(
if hit.docket_id in docket_ids:
# Possible Docket-only alert
rds_to_send = filter_rd_alert_hits(
- r, alert_id, hit["child_docs"], rd_ids, check_rd_hl=True
+ r,
+ alert_id,
+ hit["child_docs"],
+ rd_ids,
+ check_rd_matched=True,
)
if rds_to_send:
# Docket OR RECAPDocument alert.
diff --git a/cl/alerts/tests/tests_recap_alerts.py b/cl/alerts/tests/tests_recap_alerts.py
index b339a10ef7..8e57da973a 100644
--- a/cl/alerts/tests/tests_recap_alerts.py
+++ b/cl/alerts/tests/tests_recap_alerts.py
@@ -17,7 +17,6 @@
index_daily_recap_documents,
)
from cl.alerts.models import SEARCH_TYPES, Alert, ScheduledAlertHit
-from cl.alerts.utils import recap_document_hl_matched
from cl.api.factories import WebhookFactory
from cl.api.models import WebhookEvent, WebhookEventType
from cl.donate.models import NeonMembership
@@ -241,73 +240,6 @@ def _count_webhook_hits_and_child_hits(
% case_title,
)
- async def test_recap_document_hl_matched(self) -> None:
- """Test recap_document_hl_matched method that determines weather a hit
- contains RECAPDocument HL fields."""
-
- # Index base document factories.
- with time_machine.travel(self.mock_date, tick=False):
- index_daily_recap_documents(
- self.r,
- DocketDocument._index._name,
- RECAPSweepDocument,
- testing=True,
- )
-
- # Docket-only query
- search_params = {
- "type": SEARCH_TYPES.RECAP,
- "q": '"401 Civil"',
- }
- search_query = RECAPSweepDocument.search()
- results, parent_results, _ = await sync_to_async(
- do_es_sweep_alert_query
- )(
- search_query,
- search_query,
- search_params,
- )
- docket_result = results[0]
- for rd in docket_result["child_docs"]:
- rd_field_matched = recap_document_hl_matched(rd)
- self.assertEqual(rd_field_matched, False)
-
- # RECAPDocument-only query
- search_params = {
- "type": SEARCH_TYPES.RECAP,
- "q": '"Mauris iaculis, leo sit amet hendrerit vehicula"',
- }
- search_query = RECAPSweepDocument.search()
- results, parent_results, _ = await sync_to_async(
- do_es_sweep_alert_query
- )(
- search_query,
- search_query,
- search_params,
- )
- docket_result = results[0]
- for rd in docket_result["child_docs"]:
- rd_field_matched = recap_document_hl_matched(rd)
- self.assertEqual(rd_field_matched, True)
-
- # Cross-object query
- search_params = {
- "type": SEARCH_TYPES.RECAP,
- "q": "SUBPOENAS SERVED OFF Mauris iaculis",
- }
- search_query = RECAPSweepDocument.search()
- results, parent_results, _ = await sync_to_async(
- do_es_sweep_alert_query
- )(
- search_query,
- search_query,
- search_params,
- )
- docket_result = results[0]
- for rd in docket_result["child_docs"]:
- rd_field_matched = recap_document_hl_matched(rd)
- self.assertEqual(rd_field_matched, True)
-
def test_filter_recap_alerts_to_send(self) -> None:
"""Test filter RECAP alerts that met the conditions to be sent:
- RECAP type alert.
diff --git a/cl/alerts/utils.py b/cl/alerts/utils.py
index 6ad589a913..d98984abff 100644
--- a/cl/alerts/utils.py
+++ b/cl/alerts/utils.py
@@ -147,28 +147,6 @@ def alert_hits_limit_reached(alert_pk: int, user_pk: int) -> bool:
return False
-def recap_document_hl_matched(rd_hit: Hit) -> bool:
- """Determine whether HL matched a RECAPDocument text field.
-
- :param rd_hit: The ES hit.
- :return: True if the hit matched a RECAPDocument field. Otherwise, False.
- """
-
- matched_rd_hl: set[str] = set()
- rd_hl_fields = set(SEARCH_RECAP_CHILD_HL_FIELDS.keys())
- if hasattr(rd_hit, "highlight"):
- highlights = rd_hit.highlight.to_dict()
- matched_rd_hl.update(
- hl_key
- for hl_key, hl_value in highlights.items()
- for hl in hl_value
- if f"<{ALERTS_HL_TAG}>" in hl
- )
- if matched_rd_hl and matched_rd_hl.issubset(rd_hl_fields):
- return True
- return False
-
-
def make_alert_set_key(alert_id: int, document_type: str) -> str:
"""Generate a Redis key for storing alert hits.
From 4babf5d3b2d0e1ec45b9604e8f532f1345ebc526 Mon Sep 17 00:00:00 2001
From: Eduardo Rosendo
Date: Fri, 18 Oct 2024 13:26:43 -0400
Subject: [PATCH 25/33] feat(custom filter): Refactor alerts_supported method
for better readability
---
cl/custom_filters/templatetags/extras.py | 11 +++--------
1 file changed, 3 insertions(+), 8 deletions(-)
diff --git a/cl/custom_filters/templatetags/extras.py b/cl/custom_filters/templatetags/extras.py
index 90395d9356..39d535b2df 100644
--- a/cl/custom_filters/templatetags/extras.py
+++ b/cl/custom_filters/templatetags/extras.py
@@ -291,14 +291,9 @@ def alerts_supported(context: RequestContext, search_type: str) -> str:
"""
request = context["request"]
- return (
- search_type == SEARCH_TYPES.OPINION
- or search_type == SEARCH_TYPES.ORAL_ARGUMENT
- or (
- search_type == SEARCH_TYPES.RECAP
- and waffle.flag_is_active(request, "recap-alerts-active")
- )
- )
+ if search_type == SEARCH_TYPES.RECAP:
+ return waffle.flag_is_active(request, "recap-alerts-active")
+ return search_type in (SEARCH_TYPES.OPINION, SEARCH_TYPES.ORAL_ARGUMENT)
@register.filter
From a0085cce32242d78e8aed8db537bbb52c3517404 Mon Sep 17 00:00:00 2001
From: Eduardo Rosendo
Date: Fri, 18 Oct 2024 13:33:35 -0400
Subject: [PATCH 26/33] refactor(alerts): Cleaned up unused imports in utils.py
---
cl/alerts/utils.py | 7 -------
1 file changed, 7 deletions(-)
diff --git a/cl/alerts/utils.py b/cl/alerts/utils.py
index d98984abff..66047e5fe9 100644
--- a/cl/alerts/utils.py
+++ b/cl/alerts/utils.py
@@ -15,13 +15,6 @@
)
from cl.lib.command_utils import logger
from cl.lib.elasticsearch_utils import add_es_highlighting
-from cl.lib.types import CleanData
-from cl.search.constants import (
- ALERTS_HL_TAG,
- SEARCH_RECAP_CHILD_HL_FIELDS,
- recap_document_filters,
- recap_document_indexed_fields,
-)
from cl.search.documents import AudioPercolator
from cl.search.models import SEARCH_TYPES, Docket
From 5f12c3037c3a3110fde28763fff6965b4090264f Mon Sep 17 00:00:00 2001
From: Eduardo Rosendo
Date: Fri, 18 Oct 2024 13:36:08 -0400
Subject: [PATCH 27/33] refactor(search): Clean up unused constants
---
cl/search/constants.py | 23 -----------------------
1 file changed, 23 deletions(-)
diff --git a/cl/search/constants.py b/cl/search/constants.py
index 0efe9848bb..333dfbca6c 100644
--- a/cl/search/constants.py
+++ b/cl/search/constants.py
@@ -306,29 +306,6 @@
Opinion.TRIAL_COURT: "trial-court-document",
}
-recap_document_indexed_fields = [
- "id",
- "docket_entry_id",
- "description",
- "entry_number",
- "entry_date_filed",
- "short_description",
- "document_type",
- "document_number",
- "pacer_doc_id",
- "plain_text",
- "attachment_number",
- "is_available",
- "page_count",
- "cites",
-]
-
-recap_document_filters = [
- "available_only",
- "description",
- "document_number",
- "attachment_number",
-]
cardinality_query_unique_ids = {
SEARCH_TYPES.RECAP: "docket_id",
From 5fb177f83e8db2743a4d4eb6ab62acb82319597c Mon Sep 17 00:00:00 2001
From: Eduardo Rosendo
Date: Fri, 18 Oct 2024 13:41:50 -0400
Subject: [PATCH 28/33] refactor(alerts): Replaces Type import with built-in
alternative
---
cl/alerts/management/commands/cl_send_recap_alerts.py | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/cl/alerts/management/commands/cl_send_recap_alerts.py b/cl/alerts/management/commands/cl_send_recap_alerts.py
index 8640ddc70a..9589a7e8a6 100644
--- a/cl/alerts/management/commands/cl_send_recap_alerts.py
+++ b/cl/alerts/management/commands/cl_send_recap_alerts.py
@@ -2,7 +2,7 @@
import datetime
import time
import traceback
-from typing import Any, Literal, Type
+from typing import Any, Literal
import pytz
from asgiref.sync import async_to_sync
@@ -120,7 +120,7 @@ def retrieve_task_info(task_info: dict[str, Any]) -> dict[str, Any]:
def index_daily_recap_documents(
r: Redis,
source_index_name: str,
- target_index: Type[RECAPSweepDocument] | Type[ESRECAPSweepDocument],
+ target_index: type[RECAPSweepDocument] | type[ESRECAPSweepDocument],
testing: bool = False,
only_rd: bool = False,
) -> int:
From 78955f1817110635af55690533fa1ffff2458968 Mon Sep 17 00:00:00 2001
From: Eduardo Rosendo
Date: Fri, 18 Oct 2024 13:49:04 -0400
Subject: [PATCH 29/33] refactor(search): Removes unused argument from index
command
---
.../management/commands/cl_index_parent_and_child_docs.py | 5 -----
1 file changed, 5 deletions(-)
diff --git a/cl/search/management/commands/cl_index_parent_and_child_docs.py b/cl/search/management/commands/cl_index_parent_and_child_docs.py
index 57cdf390fc..366d9fe13e 100644
--- a/cl/search/management/commands/cl_index_parent_and_child_docs.py
+++ b/cl/search/management/commands/cl_index_parent_and_child_docs.py
@@ -342,11 +342,6 @@ def add_arguments(self, parser):
action="store_true",
help="Use this flag to only index documents missing in the index.",
)
- parser.add_argument(
- "--sweep-index",
- action="store_true",
- help="Whether to perform an indexing for the sweep index.",
- )
def handle(self, *args, **options):
super().handle(*args, **options)
From da72292caa66abaff484a6eecce13c71729ce2d4 Mon Sep 17 00:00:00 2001
From: Eduardo Rosendo
Date: Fri, 18 Oct 2024 14:03:55 -0400
Subject: [PATCH 30/33] feat(alert): Implements early returns in recap alert
command
---
.../commands/cl_send_recap_alerts.py | 76 ++++++++++---------
1 file changed, 39 insertions(+), 37 deletions(-)
diff --git a/cl/alerts/management/commands/cl_send_recap_alerts.py b/cl/alerts/management/commands/cl_send_recap_alerts.py
index 9589a7e8a6..6b6ce13f29 100644
--- a/cl/alerts/management/commands/cl_send_recap_alerts.py
+++ b/cl/alerts/management/commands/cl_send_recap_alerts.py
@@ -588,21 +588,22 @@ def query_and_send_alerts(
results_to_send = process_alert_hits(
r, results, parent_results, child_results, alert.pk, query_date
)
- if results_to_send:
- hits.append(
- [
- alert,
- search_type,
- results_to_send,
- len(results_to_send),
- ]
- )
- alert.query_run = search_params.urlencode() # type: ignore
- alert.date_last_hit = timezone.now()
- alert.save()
+ if not results_to_send:
+ continue
+ hits.append(
+ [
+ alert,
+ search_type,
+ results_to_send,
+ len(results_to_send),
+ ]
+ )
+ alert.query_run = search_params.urlencode() # type: ignore
+ alert.date_last_hit = timezone.now()
+ alert.save()
- # Send webhooks
- send_search_alert_webhooks(user, results_to_send, alert.pk)
+ # Send webhooks
+ send_search_alert_webhooks(user, results_to_send, alert.pk)
if hits:
send_search_alert_emails.delay([(user.pk, hits)])
@@ -644,31 +645,32 @@ def query_and_schedule_alerts(
results_to_send = process_alert_hits(
r, results, parent_results, child_results, alert.pk, query_date
)
- if results_to_send:
- for hit in results_to_send:
- # Schedule DAILY, WEEKLY and MONTHLY Alerts
- if alert_hits_limit_reached(alert.pk, user.pk):
- # Skip storing hits for this alert-user combination because
- # the SCHEDULED_ALERT_HITS_LIMIT has been reached.
- continue
-
- child_result_objects = []
- hit_copy = copy.deepcopy(hit)
- if hasattr(hit_copy, "child_docs"):
- for child_doc in hit_copy.child_docs:
- child_result_objects.append(
- child_doc["_source"].to_dict()
- )
- hit_copy["child_docs"] = child_result_objects
- scheduled_hits_to_create.append(
- ScheduledAlertHit(
- user=user,
- alert=alert,
- document_content=hit_copy.to_dict(),
+ if not results_to_send:
+ continue
+ for hit in results_to_send:
+ # Schedule DAILY, WEEKLY and MONTHLY Alerts
+ if alert_hits_limit_reached(alert.pk, user.pk):
+ # Skip storing hits for this alert-user combination because
+ # the SCHEDULED_ALERT_HITS_LIMIT has been reached.
+ continue
+
+ child_result_objects = []
+ hit_copy = copy.deepcopy(hit)
+ if hasattr(hit_copy, "child_docs"):
+ for child_doc in hit_copy.child_docs:
+ child_result_objects.append(
+ child_doc["_source"].to_dict()
)
+ hit_copy["child_docs"] = child_result_objects
+ scheduled_hits_to_create.append(
+ ScheduledAlertHit(
+ user=user,
+ alert=alert,
+ document_content=hit_copy.to_dict(),
)
- # Send webhooks
- send_search_alert_webhooks(user, results_to_send, alert.pk)
+ )
+ # Send webhooks
+ send_search_alert_webhooks(user, results_to_send, alert.pk)
# Create scheduled WEEKLY and MONTHLY Alerts in bulk.
if scheduled_hits_to_create:
From 0b62dca8f3d2eafaea996bd8ff66e0bee1baf198 Mon Sep 17 00:00:00 2001
From: Eduardo Rosendo
Date: Fri, 18 Oct 2024 15:22:32 -0400
Subject: [PATCH 31/33] feat(alerts): Adds TaskCompletionStatus dataclass for
tracking task progress
This commit introduces a new dataclass to store and manage information about running Elasticsearch tasks. The dataclass includes properties for the task completion status, the created and total document counts, and the task start time in milliseconds.
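A minimal sketch of the dataclass and how a _tasks API response maps onto it,
plus the remaining-time arithmetic it feeds (response shape and formula taken
from this diff; values illustrative):

    from dataclasses import dataclass

    @dataclass
    class TaskCompletionStatus:
        completed: bool = False
        created: int = 0
        total: int = 0
        start_time_millis: int | None = None

    task_info = {
        "completed": False,
        "task": {
            "status": {"created": 500, "total": 2000},
            "start_time_in_millis": 1_721_000_000_000,
        },
    }
    status = TaskCompletionStatus(
        completed=task_info["completed"],
        created=task_info["task"]["status"]["created"],
        total=task_info["task"]["status"]["total"],
        start_time_millis=task_info["task"]["start_time_in_millis"],
    )

    # Estimated seconds remaining: elapsed seconds per created doc, times
    # the docs still to go. With 120s elapsed: 120 / 500 * 1500 = 360.0
    elapsed = 120.0
    remaining = (elapsed / status.created) * (status.total - status.created)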
---
.../commands/cl_send_recap_alerts.py | 78 +++++++++----------
cl/alerts/utils.py | 8 ++
2 files changed, 45 insertions(+), 41 deletions(-)
diff --git a/cl/alerts/management/commands/cl_send_recap_alerts.py b/cl/alerts/management/commands/cl_send_recap_alerts.py
index 6b6ce13f29..65d88bf7b6 100644
--- a/cl/alerts/management/commands/cl_send_recap_alerts.py
+++ b/cl/alerts/management/commands/cl_send_recap_alerts.py
@@ -19,6 +19,7 @@
from cl.alerts.models import Alert, ScheduledAlertHit
from cl.alerts.tasks import send_search_alert_emails
from cl.alerts.utils import (
+ TaskCompletionStatus,
add_document_hit_to_alert_set,
alert_hits_limit_reached,
has_document_alert_hit_been_triggered,
@@ -64,27 +65,34 @@ def get_task_status(task_id: str, es: Elasticsearch) -> dict[str, Any]:
def compute_estimated_remaining_time(
- initial_wait: float, start_time_millis: int, created: int, total: int
+ initial_wait: float, task_status: TaskCompletionStatus
) -> float:
"""Compute the estimated remaining time for the re_index task to complete.
:param initial_wait: The default wait time in seconds.
- :param start_time_millis: The start time in milliseconds epoch.
- :param created: The number of items created so far.
- :param total: The total number of items to be created.
+ :param task_status: An instance of `TaskCompletionStatus` containing task
+ information.
:return: The estimated remaining time in seconds. If the start time,
created, or total are invalid, the initial default time is returned.
"""
- if start_time_millis is None or not created or not total:
+ if (
+ task_status.start_time_millis is None
+ or not task_status.created
+ or not task_status.total
+ ):
return initial_wait
- start_time = datetime.datetime.fromtimestamp(start_time_millis / 1000.0)
+ start_time = datetime.datetime.fromtimestamp(
+ task_status.start_time_millis / 1000.0
+ )
time_now = datetime.datetime.now()
estimated_time_remaining = max(
datetime.timedelta(
- seconds=((time_now - start_time).total_seconds() / created)
- * (total - created)
+ seconds=(
+ (time_now - start_time).total_seconds() / task_status.created
+ )
+ * (task_status.total - task_status.created)
).total_seconds(),
initial_wait,
)
@@ -92,29 +100,23 @@ def compute_estimated_remaining_time(
return estimated_time_remaining
-def retrieve_task_info(task_info: dict[str, Any]) -> dict[str, Any]:
+def retrieve_task_info(task_info: dict[str, Any]) -> TaskCompletionStatus:
"""Retrieve task information from the given task dict.
:param task_info: A dictionary containing the task status information.
- :return: A dictionary with the task completion status, created documents
- count, total documents count, and the task start time in milliseconds.
- Retrieve default values in case task_info is not valid.
+ :return: A `TaskCompletionStatus` object representing the extracted task
+ information.
"""
if task_info:
status = task_info["task"]["status"]
- return {
- "completed": task_info["completed"],
- "created": status["created"],
- "total": status["total"],
- "start_time_millis": task_info["task"]["start_time_in_millis"],
- }
- return {
- "completed": False,
- "created": 0,
- "total": 0,
- "start_time_millis": None,
- }
+ return TaskCompletionStatus(
+ completed=task_info["completed"],
+ created=status["created"],
+ total=status["total"],
+ start_time_millis=task_info["task"]["start_time_in_millis"],
+ )
+ return TaskCompletionStatus()
def index_daily_recap_documents(
@@ -338,41 +340,35 @@ def fields = [
initial_wait = 0.01 if testing else 60.0
time.sleep(initial_wait)
- get_task_info = retrieve_task_info(get_task_status(task_id, es))
+ task_info = retrieve_task_info(get_task_status(task_id, es))
iterations_count = 0
estimated_time_remaining = compute_estimated_remaining_time(
- initial_wait,
- get_task_info["start_time_millis"],
- get_task_info["created"],
- get_task_info["total"],
+ initial_wait, task_info
)
- while not get_task_info["completed"]:
+ while not task_info.completed:
logger.info(
- f"Task progress: {get_task_info['created']}/{get_task_info['total']} documents. "
+ f"Task progress: {task_info.created}/{task_info.total} documents. "
f"Estimated time to finish: {estimated_time_remaining} seconds."
)
- task_info = get_task_status(task_id, es)
- get_task_info = retrieve_task_info(task_info)
+ task_status = get_task_status(task_id, es)
+ task_info = retrieve_task_info(task_status)
time.sleep(estimated_time_remaining)
- if task_info and not get_task_info["completed"]:
+ if task_info and not task_info.completed:
estimated_time_remaining = compute_estimated_remaining_time(
- initial_wait,
- get_task_info["start_time_millis"],
- get_task_info["created"],
- get_task_info["total"],
+ initial_wait, task_info
)
if not task_info:
iterations_count += 1
if iterations_count > 10:
logger.error(
"Re_index alert sweep index task has failed: %s/%s",
- get_task_info["created"],
- get_task_info["total"],
+ task_info.created,
+ task_info.total,
)
break
r.delete("alert_sweep:task_id")
- return get_task_info["total"]
+ return task_info.total
def should_docket_hit_be_included(
diff --git a/cl/alerts/utils.py b/cl/alerts/utils.py
index 66047e5fe9..4b97fc7ba8 100644
--- a/cl/alerts/utils.py
+++ b/cl/alerts/utils.py
@@ -25,6 +25,14 @@ class DocketAlertReportObject:
docket: Docket
+@dataclass
+class TaskCompletionStatus:
+ completed: bool = False
+ created: int = 0
+ total: int = 0
+ start_time_millis: int | None = None
+
+
class OldAlertReport:
def __init__(self):
self.old_alerts = []
From 3b153d2e4a02bbc22f13c017ea56877ad8b60fbe Mon Sep 17 00:00:00 2001
From: Eduardo Rosendo
Date: Fri, 18 Oct 2024 16:24:47 -0400
Subject: [PATCH 32/33] feat(lib): Introduces EsMainQueries Dataclass
Adds a new dataclass to encapsulate common Elasticsearch queries used throughout the codebase. This centralizes query definitions, making them easier to maintain and update.
Updates the `build_es_base_query` method to return an instance of `EsMainQueries` instead of a tuple. This ensures consistent query structure and simplifies future modifications.
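A before/after sketch of a typical call site (names taken from this diff):

    # Before: positional tuple unpacking, easy to misorder.
    s, child_docs_query, parent_query = build_es_base_query(search_query, cd)

    # After: named access through the dataclass.
    es_queries = build_es_base_query(search_query, cd)
    s = es_queries.search_query
    parent_query = es_queries.parent_query
    child_query = es_queries.child_query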
---
.../commands/cl_send_recap_alerts.py | 5 +-
.../commands/clean_up_search_alerts.py | 3 +-
.../commands/ready_mix_cases_project.py | 3 +-
cl/lib/elasticsearch_utils.py | 63 ++++++++++++-------
cl/lib/types.py | 9 +++
cl/search/documents.py | 4 +-
cl/search/tests/tests_es_oral_arguments.py | 5 +-
cl/search/tests/tests_es_person.py | 6 +-
8 files changed, 65 insertions(+), 33 deletions(-)
diff --git a/cl/alerts/management/commands/cl_send_recap_alerts.py b/cl/alerts/management/commands/cl_send_recap_alerts.py
index 65d88bf7b6..a38ff91f8c 100644
--- a/cl/alerts/management/commands/cl_send_recap_alerts.py
+++ b/cl/alerts/management/commands/cl_send_recap_alerts.py
@@ -564,9 +564,8 @@ def query_and_send_alerts(
alerts_sent_count = 0
now_time = datetime.datetime.now()
for user in alert_users:
- if rate == Alert.REAL_TIME:
- if not user.profile.is_member:
- continue
+ if rate == Alert.REAL_TIME and not user.profile.is_member:
+ continue
alerts = user.alerts.filter(rate=rate, alert_type=SEARCH_TYPES.RECAP)
logger.info(f"Running alerts for user '{user}': {alerts}")
diff --git a/cl/alerts/management/commands/clean_up_search_alerts.py b/cl/alerts/management/commands/clean_up_search_alerts.py
index cf1ceb2f54..df8a28b2d6 100644
--- a/cl/alerts/management/commands/clean_up_search_alerts.py
+++ b/cl/alerts/management/commands/clean_up_search_alerts.py
@@ -75,7 +75,8 @@ def validate_queries_syntax(options: OptionsType) -> None:
if search_form.is_valid():
cd = search_form.cleaned_data
try:
- s, _, _ = build_es_base_query(search_query, cd)
+ es_queries = build_es_base_query(search_query, cd)
+ s = es_queries.search_query
s = s.extra(size=0)
s.execute().to_dict()
# Waiting between requests to avoid hammering ES too quickly.
diff --git a/cl/corpus_importer/management/commands/ready_mix_cases_project.py b/cl/corpus_importer/management/commands/ready_mix_cases_project.py
index eabb93f4a4..32c9db7ae3 100644
--- a/cl/corpus_importer/management/commands/ready_mix_cases_project.py
+++ b/cl/corpus_importer/management/commands/ready_mix_cases_project.py
@@ -198,7 +198,8 @@ def query_results_in_es(options):
}
search_query = DocketDocument.search()
- s, _ = build_es_base_query(search_query, cd)
+ es_queries = build_es_base_query(search_query, cd)
+ s = es_queries.search_query
s = s.extra(size=options["results_size"])
response = s.execute().to_dict()
extracted_data = [
diff --git a/cl/lib/elasticsearch_utils.py b/cl/lib/elasticsearch_utils.py
index d0d8db41b1..c12b9c0058 100644
--- a/cl/lib/elasticsearch_utils.py
+++ b/cl/lib/elasticsearch_utils.py
@@ -35,6 +35,7 @@
ApiPositionMapping,
BasePositionMapping,
CleanData,
+ EsMainQueries,
ESRangeQueryParams,
)
from cl.lib.utils import (
@@ -1114,18 +1115,20 @@ def build_es_base_query(
child_highlighting: bool = True,
api_version: Literal["v3", "v4"] | None = None,
alerts: bool = False,
-) -> tuple[Search, QueryString | None, QueryString | None]:
+) -> EsMainQueries:
"""Builds filters and fulltext_query based on the given cleaned
data and returns an elasticsearch query.
:param search_query: The Elasticsearch search query object.
:param cd: The cleaned data object containing the query and filters.
- :param child_highlighting: Whether highlighting should be enabled in child docs.
+ :param child_highlighting: Whether highlighting should be enabled in child
+ docs.
:param api_version: Optional, the request API version.
:param alerts: If highlighting is being applied to search Alerts hits.
- :return: A three-tuple, the Elasticsearch search query object and an ES
- QueryString for child documents or None if there is no need to query
- child documents and a QueryString for parent documents or None.
+ :return: An `EsMainQueries` object containing the Elasticsearch search
+ query object, an ES QueryString for child documents (or None if there is
+ no need to query child documents), and a QueryString for parent documents
+ (or None).
"""
main_query = None
@@ -1244,10 +1247,10 @@ def build_es_base_query(
api_version=api_version,
)
)
- return (
- search_query.query(main_query),
- child_docs_query,
- parent_query,
+ return EsMainQueries(
+ search_query=search_query.query(main_query),
+ parent_query=parent_query,
+ child_query=child_docs_query,
)
opinion_search_fields = SEARCH_OPINION_QUERY_FIELDS
@@ -1291,7 +1294,11 @@ def build_es_base_query(
match_all_query = get_match_all_query(
cd, search_query, api_version, child_highlighting
)
- return match_all_query, child_docs_query, parent_query
+ return EsMainQueries(
+ search_query=match_all_query,
+ parent_query=parent_query,
+ child_query=child_docs_query,
+ )
if plain_doc:
# Combine the filters and string query for plain documents like Oral
@@ -1300,7 +1307,11 @@ def build_es_base_query(
cd, filters, string_query, api_version
)
- return search_query.query(main_query), child_docs_query, parent_query
+ return EsMainQueries(
+ search_query=search_query.query(main_query),
+ parent_query=parent_query,
+ child_query=child_docs_query,
+ )
def build_has_parent_parties_query(
@@ -1442,7 +1453,8 @@ def get_facet_dict_for_search_query(
"""
cd["just_facets_query"] = True
- search_query, _, _ = build_es_base_query(search_query, cd)
+ es_queries = build_es_base_query(search_query, cd)
+ search_query = es_queries.search_query
search_query.aggs.bucket("status", A("terms", field="status.raw"))
search_query = search_query.extra(size=0)
response = search_query.execute()
@@ -1464,7 +1476,9 @@ def build_es_main_query(
applicable.
"""
search_query_base = search_query
- search_query, child_docs_query, _ = build_es_base_query(search_query, cd)
+ es_queries = build_es_base_query(search_query, cd)
+ search_query = es_queries.search_query
+ child_docs_query = es_queries.child_query
top_hits_limit = 5
child_docs_count_query = None
match cd["type"]:
@@ -2390,7 +2404,9 @@ def build_search_feed_query(
hl_field = "text"
if cd["type"] == SEARCH_TYPES.RECAP:
hl_field = "plain_text"
- s, child_docs_query, _ = build_es_base_query(search_query, cd)
+ es_queries = build_es_base_query(search_query, cd)
+ s = es_queries.search_query
+ child_docs_query = es_queries.child_query
if jurisdiction or cd["type"] == SEARCH_TYPES.RECAP:
# An Opinion Jurisdiction feed or RECAP Search displays child documents
# Eliminate items that lack the ordering field and apply highlighting
@@ -2952,9 +2968,11 @@ def do_es_api_query(
"""
try:
- s, child_docs_query, _ = build_es_base_query(
+ es_queries = build_es_base_query(
search_query, cd, cd["highlight"], api_version
)
+ s = es_queries.search_query
+ child_docs_query = es_queries.child_query
except (
UnbalancedParenthesesQuery,
UnbalancedQuotesQuery,
@@ -3122,8 +3140,8 @@ def do_es_alert_estimation_query(
days=int(day_count)
)
cd[before_field] = None
- estimation_query, _, _ = build_es_base_query(search_query, cd)
-
+ es_queries = build_es_base_query(search_query, cd)
+ estimation_query = es_queries.search_query
if cd["type"] == SEARCH_TYPES.RECAP:
# The RECAP estimation query consists of two requests: one to estimate
# Docket hits and one to estimate RECAPDocument hits.
@@ -3144,7 +3162,8 @@ def do_es_alert_estimation_query(
multi_search = multi_search.add(main_doc_count_query)
# Build RECAPDocuments count query.
- _, child_docs_query, _ = build_es_base_query(search_query, cd)
+ es_queries = build_es_base_query(search_query, cd)
+ child_docs_query = es_queries.child_query
child_docs_count_query = build_child_docs_query(child_docs_query, cd)
child_total = 0
if child_docs_count_query:
@@ -3186,10 +3205,10 @@ def do_es_sweep_alert_query(
cd = search_form.cleaned_data
else:
return None, None, None
-
- s, child_query, parent_query = build_es_base_query(
- search_query, cd, True, alerts=True
- )
+ es_queries = build_es_base_query(search_query, cd, True, alerts=True)
+ s = es_queries.search_query
+ parent_query = es_queries.parent_query
+ child_query = es_queries.child_query
main_query = add_es_highlighting(s, cd, alerts=True)
main_query = main_query.sort(build_sort_results(cd))
main_query = main_query.extra(
diff --git a/cl/lib/types.py b/cl/lib/types.py
index 82d6131b5e..ff257574e9 100644
--- a/cl/lib/types.py
+++ b/cl/lib/types.py
@@ -4,6 +4,8 @@
from typing import Any, Callable, Dict, List, NotRequired, TypedDict, Union
from django.http import HttpRequest
+from django_elasticsearch_dsl.search import Search
+from elasticsearch_dsl.query import QueryString
from cl.users.models import User
@@ -190,6 +192,13 @@ def get_db_to_dataclass_map(self):
return self.__db_to_dataclass_map
+@dataclass
+class EsMainQueries:
+ search_query: Search
+ parent_query: QueryString | None = None
+ child_query: QueryString | None = None
+
+
@dataclass
class ApiPositionMapping(BasePositionMapping):
position_type_dict: defaultdict[int, list[str]] = field(
diff --git a/cl/search/documents.py b/cl/search/documents.py
index b62293ac90..59d8327875 100644
--- a/cl/search/documents.py
+++ b/cl/search/documents.py
@@ -365,8 +365,8 @@ def prepare_percolator_query(self, instance):
cd = search_form.cleaned_data
search_query = AudioDocument.search()
- query, _, _ = build_es_base_query(search_query, cd)
- return query.to_dict()["query"]
+ es_queries = build_es_base_query(search_query, cd)
+ return es_queries.search_query.to_dict()["query"]
class ES_CHILD_ID:
diff --git a/cl/search/tests/tests_es_oral_arguments.py b/cl/search/tests/tests_es_oral_arguments.py
index 3ca921704d..5878299362 100644
--- a/cl/search/tests/tests_es_oral_arguments.py
+++ b/cl/search/tests/tests_es_oral_arguments.py
@@ -984,8 +984,9 @@ def confirm_query_matched(response, query_id) -> bool:
@staticmethod
def save_percolator_query(cd):
search_query = AudioDocument.search()
- query, _, _ = build_es_base_query(search_query, cd)
- query_dict = query.to_dict()["query"]
+ es_queries = build_es_base_query(search_query, cd)
+ search_query = es_queries.search_query
+ query_dict = search_query.to_dict()["query"]
percolator_query = AudioPercolator(
percolator_query=query_dict, rate=Alert.REAL_TIME
)
diff --git a/cl/search/tests/tests_es_person.py b/cl/search/tests/tests_es_person.py
index 5f6a195849..12a0b7f1fb 100644
--- a/cl/search/tests/tests_es_person.py
+++ b/cl/search/tests/tests_es_person.py
@@ -1342,7 +1342,8 @@ def test_has_child_filters(self) -> None:
"type": SEARCH_TYPES.PEOPLE,
}
s = PersonDocument.search()
- main_query, _, _ = build_es_base_query(s, cd)
+ es_queries = build_es_base_query(s, cd)
+ main_query = es_queries.search_query
self.assertEqual(main_query.count(), 2)
# Query by parent field dob_state and child field selection_method.
@@ -1352,7 +1353,8 @@ def test_has_child_filters(self) -> None:
"type": SEARCH_TYPES.PEOPLE,
}
s = PersonDocument.search()
- main_query, _, _ = build_es_base_query(s, cd)
+ es_queries = build_es_base_query(s, cd)
+ main_query = es_queries.search_query
self.assertEqual(main_query.count(), 1)
position_5.delete()
From eb8874e718ae18cddd700e2e2c93e53d99c24849 Mon Sep 17 00:00:00 2001
From: ttys0dev <126845556+ttys0dev@users.noreply.github.com>
Date: Sat, 5 Oct 2024 16:42:09 -0600
Subject: [PATCH 33/33] Fix check for restricted attachments
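A sketch of which attachment rows the new guard keeps or skips (field names
from this diff; values illustrative):

    attachments = [
        {"attachment_number": 3, "page_count": None},  # restricted: skipped
        {"attachment_number": 0, "page_count": None},  # main RD: kept
        {"attachment_number": 1, "page_count": 12},    # normal: kept
    ]
    kept = [
        a for a in attachments
        if not (
            "page_count" in a
            and a["page_count"] is None
            and a["attachment_number"] != 0
        )
    ]
    assert [a["attachment_number"] for a in kept] == [0, 1]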
---
cl/recap/mergers.py | 9 +++++++++
1 file changed, 9 insertions(+)
diff --git a/cl/recap/mergers.py b/cl/recap/mergers.py
index 6eccba198e..80244e7865 100644
--- a/cl/recap/mergers.py
+++ b/cl/recap/mergers.py
@@ -1792,6 +1792,15 @@ async def merge_attachment_page_data(
if not all(sanity_checks):
continue
+ # page_count is missing on some restricted docs (see Juriscraper).
+ # Attachment 0 may lack a page_count since it is the main RD.
+ if (
+ "page_count" in attachment
+ and attachment["page_count"] is None
+ and attachment["attachment_number"] != 0
+ ):
+ continue
+
# Appellate entries with attachments don't have a main RD, transform it
# to an attachment. In ACMS attachment pages, all the documents use the
# same pacer_doc_id, so we need to make sure only one is matched to the