From 3e4f269c6230b8d5954cd922e32090e5866caf76 Mon Sep 17 00:00:00 2001 From: Alberto Islas Date: Thu, 20 Jun 2024 19:22:43 -0600 Subject: [PATCH 01/33] fix(elasticsearch): Test RECAP nested index reliability --- cl/lib/elasticsearch_utils.py | 286 ++++++++++++- cl/search/api_serializers.py | 15 + cl/search/constants.py | 5 + cl/search/documents.py | 404 ++++++++++++++++++ cl/search/es_indices.py | 9 + .../cl_index_parent_and_child_docs.py | 7 + cl/search/tasks.py | 6 +- cl/search/tests/tests_es_recap.py | 69 ++- 8 files changed, 785 insertions(+), 16 deletions(-) diff --git a/cl/lib/elasticsearch_utils.py b/cl/lib/elasticsearch_utils.py index 24d49257f7..7f407441aa 100644 --- a/cl/lib/elasticsearch_utils.py +++ b/cl/lib/elasticsearch_utils.py @@ -4,6 +4,7 @@ import re import time import traceback +from collections import defaultdict from copy import deepcopy from dataclasses import fields from functools import reduce, wraps @@ -68,6 +69,7 @@ SEARCH_RECAP_CHILD_HL_FIELDS, SEARCH_RECAP_CHILD_QUERY_FIELDS, SEARCH_RECAP_HL_FIELDS, + SEARCH_RECAP_NESTED_CHILD_QUERY_FIELDS, SEARCH_RECAP_PARENT_QUERY_FIELDS, api_child_highlight_map, ) @@ -1066,6 +1068,7 @@ def build_es_base_query( cd: CleanData, child_highlighting: bool = True, api_version: Literal["v3", "v4"] | None = None, + nested_query: bool = False, ) -> tuple[Search, QueryString | None]: """Builds filters and fulltext_query based on the given cleaned data and returns an elasticsearch query. @@ -1074,6 +1077,7 @@ def build_es_base_query( :param cd: The cleaned data object containing the query and filters. :param child_highlighting: Whether highlighting should be enabled in child docs. :param api_version: Optional, the request API version. + :param nested_query: Whether to perform a nested query. :return: A two-tuple, the Elasticsearch search query object and an ES QueryString for child documents, or None if there is no need to query child documents. @@ -1151,6 +1155,15 @@ def build_es_base_query( ], ) ) + nested_child_fields = SEARCH_RECAP_NESTED_CHILD_QUERY_FIELDS.copy() + nested_child_fields.extend( + add_fields_boosting( + cd, + [ + "description", + ], + ) + ) child_query_fields = {"recap_document": child_fields} parent_query_fields = SEARCH_RECAP_PARENT_QUERY_FIELDS.copy() parent_query_fields.extend( @@ -1162,13 +1175,22 @@ def build_es_base_query( ], ) ) - main_query, join_query = build_full_join_es_queries( - cd, - child_query_fields, - parent_query_fields, - child_highlighting=child_highlighting, - api_version=api_version, - ) + + if nested_query: + main_query, _ = build_full_nested_es_queries( + cd, + nested_child_fields, + parent_query_fields, + ) + else: + main_query, join_query = build_full_join_es_queries( + cd, + child_query_fields, + parent_query_fields, + child_highlighting=child_highlighting, + api_version=api_version, + ) + case SEARCH_TYPES.OPINION: str_query = cd.get("q", "") related_match = RELATED_PATTERN.search(str_query) @@ -1984,11 +2006,14 @@ def fetch_es_results( return [], 0, error, None, None -def build_has_child_filters(cd: CleanData) -> list[QueryString]: +def build_has_child_filters( + cd: CleanData, nested_query=False +) -> list[QueryString]: """Builds Elasticsearch 'has_child' filters based on the given child type and CleanData. :param cd: The user input CleanedData. + :param nested_query: Whether to perform a nested query. :return: A list of QueryString objects containing the 'has_child' filters. 
""" @@ -2022,22 +2047,36 @@ def build_has_child_filters(cd: CleanData) -> list[QueryString]: attachment_number = cd.get("attachment_number", "") if available_only: + field = ( + "is_available" + if not nested_query + else "documents.is_available" + ) queries_list.extend( build_term_query( - "is_available", + field, available_only, ) ) if description: - queries_list.extend(build_text_filter("description", description)) + field = ( + "description" if not nested_query else "documents.description" + ) + queries_list.extend(build_text_filter(field, description)) if document_number: - queries_list.extend( - build_term_query("document_number", document_number) + field = ( + "document_number" + if not nested_query + else "documents.document_number" ) + queries_list.extend(build_term_query(field, document_number)) if attachment_number: - queries_list.extend( - build_term_query("attachment_number", attachment_number) + field = ( + "attachment_number" + if not nested_query + else "documents.attachment_number" ) + queries_list.extend(build_term_query(field, attachment_number)) return queries_list @@ -3014,3 +3053,222 @@ def do_es_alert_estimation_query( estimation_query, _ = build_es_base_query(search_query, cd) return estimation_query.count() + + +def build_nested_child_query( + query: QueryString | str, + child_type: str, + child_hits_limit: int, + highlighting_fields: dict[str, int] | None = None, +) -> QueryString: + """Build a nested query. + + :param query: The Elasticsearch query string or QueryString object. + :param child_type: The type of the child document. + :param child_hits_limit: The maximum number of child hits to be returned. + :param highlighting_fields: List of fields to highlight in child docs. + :return: The 'has_child' query. + """ + + highlight_options, fields_to_exclude = build_highlights_dict( + highlighting_fields, SEARCH_HL_TAG + ) + inner_hits = { + "name": f"filter_query_inner_{child_type}", + "size": child_hits_limit, + "_source": { + "excludes": fields_to_exclude, + }, + } + if highlight_options: + inner_hits["highlight"] = highlight_options + + return Q( + "nested", + path="documents", + score_mode="max", + query=query, + inner_hits=inner_hits, + ) + + +def build_full_nested_es_queries( + cd: CleanData, + child_query_fields: list[str], + parent_query_fields: list[str], +) -> tuple[QueryString | list, QueryString | None]: + """Build a complete Elasticsearch query with both parent and nested + documents conditions. + + :param cd: The query CleanedData + :param child_query_fields: A dictionary mapping child fields document type. + :param parent_query_fields: A list of fields for the parent document. + :return: An Elasticsearch QueryString object. + """ + + q_should = [] + child_query = None + if cd["type"] in [ + SEARCH_TYPES.RECAP, + SEARCH_TYPES.DOCKETS, + SEARCH_TYPES.RECAP_DOCUMENT, + SEARCH_TYPES.OPINION, + SEARCH_TYPES.PEOPLE, + ]: + # Build child filters. + child_filters = build_has_child_filters(cd, nested_query=True) + # Copy the original child_filters before appending parent fields. + # For its use later in the parent filters. + child_filters_original = deepcopy(child_filters) + # Build child text query. + child_fields = [f"documents.{field}" for field in child_query_fields] + child_text_query = build_fulltext_query( + child_fields, cd.get("q", ""), only_queries=True + ) + + # Build parent filters. 
+        parent_filters = build_join_es_filters(cd)
+
+        # Build the child query based on child_filters and child_text_query.
+        match child_filters, child_text_query:
+            case [], []:
+                pass
+            case [], _:
+                child_query = Q(
+                    "bool",
+                    should=child_text_query,
+                    minimum_should_match=1,
+                )
+            case _, []:
+                child_query = Q(
+                    "bool",
+                    filter=child_filters,
+                )
+            case _, _:
+                child_query = Q(
+                    "bool",
+                    filter=child_filters,
+                    should=child_text_query,
+                    minimum_should_match=1,
+                )
+
+        _, query_hits_limit = get_child_top_hits_limit(cd, cd["type"])
+        has_child_query = None
+        if child_text_query or child_filters:
+            hl_fields = api_child_highlight_map.get((True, cd["type"]), {})
+            has_child_query = build_nested_child_query(
+                child_query,
+                "recap_document",
+                query_hits_limit,
+                hl_fields,
+            )
+
+        if has_child_query:
+            q_should.append(has_child_query)
+
+        # Build the parent filter and text queries.
+        string_query = build_fulltext_query(
+            parent_query_fields, cd.get("q", ""), only_queries=True
+        )
+
+        # If child filters are set, add a nested query as a filter to the
+        # parent query to exclude results without matching children.
+        if child_filters_original:
+            parent_filters.append(
+                Q(
+                    "nested",
+                    path="documents",
+                    score_mode="max",
+                    query=Q("bool", filter=child_filters_original),
+                )
+            )
+        parent_query = None
+        match parent_filters, string_query:
+            case [], []:
+                pass
+            case [], _:
+                parent_query = Q(
+                    "bool",
+                    should=string_query,
+                    minimum_should_match=1,
+                )
+            case _, []:
+                parent_query = Q(
+                    "bool",
+                    filter=parent_filters,
+                )
+            case _, _:
+                parent_query = Q(
+                    "bool",
+                    filter=parent_filters,
+                    should=string_query,
+                    minimum_should_match=1,
+                )
+        if parent_query:
+            q_should.append(parent_query)
+
+    if not q_should:
+        return [], child_query
+
+    final_query = Q(
+        "bool",
+        should=q_should,
+    )
+    return (
+        final_query,
+        child_query,
+    )
+
+
+def do_es_sweep_nested_query(
+    search_query: Search,
+    cd: CleanData,
+) -> tuple[list[defaultdict] | None, int | None]:
+    """Build and execute an ES query for the daily RECAP sweep index.
+
+    :param search_query: Elasticsearch DSL Search object.
+    :param cd: The query CleanedData.
+    :return: A two-tuple: the ES results and the total number of hits, or
+    (None, None) if the search form did not validate.
+ """ + + search_form = SearchForm(cd, is_es_form=True) + if search_form.is_valid(): + cd = search_form.cleaned_data + else: + return None, None + + hits = None + try: + s, _ = build_es_base_query( + search_query, + cd, + True, + nested_query=True, + ) + except ( + UnbalancedParenthesesQuery, + UnbalancedQuotesQuery, + BadProximityQuery, + ) as e: + raise ElasticBadRequestError(detail=e.message) + main_query = add_es_highlighting(s, cd, highlighting=True) + main_query = main_query.extra(from_=0, size=30) + results = main_query.execute() + if results: + hits = results.hits.total.value + + limit_inner_hits({}, results, cd["type"]) + set_results_highlights(results, cd["type"]) + + for result in results: + child_result_objects = [] + if hasattr(result, "child_docs"): + for child_doc in result.child_docs: + child_result_objects.append( + defaultdict(lambda: None, child_doc["_source"].to_dict()) + ) + result["child_docs"] = child_result_objects + + return results, hits diff --git a/cl/search/api_serializers.py b/cl/search/api_serializers.py index 1f9cbb7d75..7c72dd5a2d 100644 --- a/cl/search/api_serializers.py +++ b/cl/search/api_serializers.py @@ -731,3 +731,18 @@ class Meta: "pacer_doc_id", "trustee_str", ) + + +class RECAPNestedResultSerializer( + RECAPMetaMixin, BaseDocketESResultSerializer +): + """The serializer class for RECAP search type results.""" + + recap_documents = BaseRECAPDocumentESResultSerializer( + many=True, read_only=True, source="child_docs" + ) + + class Meta(BaseDocketESResultSerializer.Meta): + exclude = BaseDocketESResultSerializer.Meta.exclude + ( + "docket_absolute_url", + ) diff --git a/cl/search/constants.py b/cl/search/constants.py index 55012fa9f0..91b5cb93f0 100644 --- a/cl/search/constants.py +++ b/cl/search/constants.py @@ -96,6 +96,11 @@ "chapter", "trustee_str", ] +SEARCH_RECAP_NESTED_CHILD_QUERY_FIELDS = [ + "short_description", + "plain_text", + "document_type", +] SEARCH_OPINION_QUERY_FIELDS = [ "court", "court_id", diff --git a/cl/search/documents.py b/cl/search/documents.py index d7b18f9472..5fada32689 100644 --- a/cl/search/documents.py +++ b/cl/search/documents.py @@ -29,12 +29,14 @@ parenthetical_group_index, people_db_index, recap_index, + recap_sweep_index, ) from cl.search.forms import SearchForm from cl.search.models import ( BankruptcyInformation, Citation, Docket, + DocketEntry, Opinion, OpinionCluster, ParentheticalGroup, @@ -1826,3 +1828,405 @@ def prepare_non_participating_judge_ids(self, instance): def prepare_cluster_child(self, instance): return "opinion_cluster" + + +@recap_sweep_index.document +class RECAPNestedDocument(Document): + # Docket Fields + docket_id = fields.IntegerField(attr="pk") + caseName = fields.TextField( + analyzer="text_en_splitting_cl", + term_vector="with_positions_offsets", + fields={ + "exact": fields.TextField( + analyzer="english_exact", + search_analyzer="search_analyzer_exact", + term_vector="with_positions_offsets", + ), + }, + search_analyzer="search_analyzer", + ) + case_name_full = fields.TextField( + attr="case_name_full", + analyzer="text_en_splitting_cl", + fields={ + "exact": fields.TextField( + attr="case_name_full", + analyzer="english_exact", + search_analyzer="search_analyzer_exact", + ), + }, + search_analyzer="search_analyzer", + ) + docketNumber = fields.TextField( + attr="docket_number", + analyzer="text_en_splitting_cl", + term_vector="with_positions_offsets", + fields={ + "exact": fields.TextField( + attr="docket_number", + analyzer="english_exact", + term_vector="with_positions_offsets", + 
search_analyzer="search_analyzer_exact", + ), + }, + search_analyzer="search_analyzer", + ) + suitNature = fields.TextField( + attr="nature_of_suit", + analyzer="text_en_splitting_cl", + term_vector="with_positions_offsets", + fields={ + "exact": fields.TextField( + attr="nature_of_suit", + analyzer="english_exact", + term_vector="with_positions_offsets", + search_analyzer="search_analyzer_exact", + ), + }, + search_analyzer="search_analyzer", + ) + cause = fields.TextField( + attr="cause", + analyzer="text_en_splitting_cl", + term_vector="with_positions_offsets", + fields={ + "exact": fields.TextField( + attr="cause", + analyzer="english_exact", + term_vector="with_positions_offsets", + search_analyzer="search_analyzer_exact", + ), + }, + search_analyzer="search_analyzer", + ) + juryDemand = fields.TextField( + attr="jury_demand", + analyzer="text_en_splitting_cl", + term_vector="with_positions_offsets", + fields={ + "exact": fields.TextField( + attr="jury_demand", + analyzer="english_exact", + term_vector="with_positions_offsets", + search_analyzer="search_analyzer_exact", + ), + }, + search_analyzer="search_analyzer", + ) + jurisdictionType = fields.TextField( + attr="jurisdiction_type", + analyzer="text_en_splitting_cl", + fields={ + "exact": fields.TextField( + attr="jurisdiction_type", + analyzer="english_exact", + search_analyzer="search_analyzer_exact", + ), + }, + search_analyzer="search_analyzer", + ) + dateArgued = fields.DateField(attr="date_argued") + dateFiled = fields.DateField(attr="date_filed") + dateTerminated = fields.DateField(attr="date_terminated") + assignedTo = fields.TextField( + analyzer="text_en_splitting_cl", + term_vector="with_positions_offsets", + fields={ + "exact": fields.TextField( + analyzer="english_exact", + term_vector="with_positions_offsets", + search_analyzer="search_analyzer_exact", + ), + }, + search_analyzer="search_analyzer", + ) + assigned_to_id = fields.KeywordField(attr="assigned_to.pk") + referredTo = fields.TextField( + analyzer="text_en_splitting_cl", + term_vector="with_positions_offsets", + fields={ + "exact": fields.TextField( + analyzer="english_exact", + term_vector="with_positions_offsets", + search_analyzer="search_analyzer_exact", + ), + }, + search_analyzer="search_analyzer", + ) + referred_to_id = fields.KeywordField(attr="referred_to.pk") + court = fields.TextField( + attr="court.full_name", + analyzer="text_en_splitting_cl", + fields={ + "exact": fields.TextField( + attr="court.full_name", + analyzer="english_exact", + search_analyzer="search_analyzer_exact", + ), + }, + search_analyzer="search_analyzer", + ) + court_id = fields.TextField( + attr="court.pk", + analyzer="text_en_splitting_cl", + fields={"raw": fields.KeywordField(attr="court.pk")}, + search_analyzer="search_analyzer", + ) + court_citation_string = fields.TextField( + attr="court.citation_string", + analyzer="text_en_splitting_cl", + search_analyzer="search_analyzer", + term_vector="with_positions_offsets", + ) + chapter = fields.TextField( + analyzer="text_en_splitting_cl", + search_analyzer="search_analyzer", + ) + trustee_str = fields.TextField( + analyzer="text_en_splitting_cl", + fields={ + "exact": fields.TextField( + analyzer="english_exact", + search_analyzer="search_analyzer_exact", + ), + }, + search_analyzer="search_analyzer", + ) + date_created = fields.DateField(attr="date_created") + pacer_case_id = fields.KeywordField(attr="pacer_case_id") + + # Parties + party_id = fields.ListField(fields.IntegerField(multi=True)) + party = fields.ListField( + 
fields.TextField( + analyzer="text_en_splitting_cl", + fields={ + "exact": fields.TextField( + analyzer="english_exact", + search_analyzer="search_analyzer_exact", + ), + }, + search_analyzer="search_analyzer", + multi=True, + ) + ) + attorney_id = fields.ListField(fields.IntegerField(multi=True)) + attorney = fields.ListField( + fields.TextField( + analyzer="text_en_splitting_cl", + fields={ + "exact": fields.TextField( + analyzer="english_exact", + search_analyzer="search_analyzer_exact", + ), + }, + search_analyzer="search_analyzer", + multi=True, + ) + ) + firm_id = fields.ListField(fields.IntegerField(multi=True)) + firm = fields.ListField( + fields.TextField( + analyzer="text_en_splitting_cl", + fields={ + "exact": fields.TextField( + analyzer="english_exact", + search_analyzer="search_analyzer_exact", + ), + }, + search_analyzer="search_analyzer", + multi=True, + ) + ) + + # RECAPDocument fields: + documents = fields.NestedField( + properties={ + "id": fields.IntegerField(attr="pk"), + "docket_entry_id": fields.IntegerField(attr="docket_entry.pk"), + "description": fields.TextField( + attr="docket_entry.description", + analyzer="text_en_splitting_cl", + term_vector="with_positions_offsets", + fields={ + "exact": fields.TextField( + attr="docket_entry.description", + term_vector="with_positions_offsets", + analyzer="english_exact", + search_analyzer="search_analyzer_exact", + ), + }, + search_analyzer="search_analyzer", + ), + "entry_number": fields.LongField(attr="docket_entry.entry_number"), + "entry_date_filed": fields.DateField( + attr="docket_entry.date_filed" + ), + "short_description": fields.TextField( + attr="description", + analyzer="text_en_splitting_cl", + term_vector="with_positions_offsets", + fields={ + "exact": fields.TextField( + attr="description", + analyzer="english_exact", + term_vector="with_positions_offsets", + search_analyzer="search_analyzer_exact", + ), + }, + search_analyzer="search_analyzer", + ), + "document_type": fields.TextField( + analyzer="text_en_splitting_cl", + term_vector="with_positions_offsets", + fields={ + "exact": fields.TextField( + analyzer="english_exact", + term_vector="with_positions_offsets", + search_analyzer="search_analyzer_exact", + ), + }, + search_analyzer="search_analyzer", + ), + "document_number": fields.LongField(), + "pacer_doc_id": fields.KeywordField(attr="pacer_doc_id"), + "plain_text": fields.TextField( + analyzer="text_en_splitting_cl", + term_vector="with_positions_offsets", + fields={ + "exact": fields.TextField( + analyzer="english_exact", + term_vector="with_positions_offsets", + search_analyzer="search_analyzer_exact", + ), + }, + search_analyzer="search_analyzer", + ), + "attachment_number": fields.IntegerField(attr="attachment_number"), + "is_available": fields.BooleanField(attr="is_available"), + "page_count": fields.IntegerField(attr="page_count"), + "filepath_local": fields.KeywordField(index=False), + "absolute_url": fields.KeywordField(index=False), + "cites": fields.ListField( + fields.IntegerField(multi=True), + ), + } + ) + + # Meta + timestamp = fields.DateField() + + class Django: + model = Docket + ignore_signals = True + + def prepare_caseName(self, instance): + return best_case_name(instance) + + def prepare_assignedTo(self, instance): + if instance.assigned_to: + return instance.assigned_to.name_full + elif instance.assigned_to_str: + return instance.assigned_to_str + + def prepare_referredTo(self, instance): + if instance.referred_to: + return instance.referred_to.name_full + elif 
instance.referred_to_str: + return instance.referred_to_str + + def prepare_chapter(self, instance): + if BankruptcyInformation.objects.filter(docket=instance).exists(): + return instance.bankruptcy_information.chapter + + def prepare_trustee_str(self, instance): + if BankruptcyInformation.objects.filter(docket=instance).exists(): + return instance.bankruptcy_information.trustee_str + + def prepare_docket_child(self, instance): + return "docket" + + def prepare_docket_absolute_url(self, instance): + return instance.get_absolute_url() + + def prepare_parties(self, instance): + out = { + "party_id": set(), + "party": set(), + "attorney_id": set(), + "attorney": set(), + "firm_id": set(), + "firm": set(), + } + + # Extract only required parties values. + party_values = instance.parties.values_list("pk", "name") + for pk, name in party_values.iterator(): + out["party_id"].add(pk) + out["party"].add(name) + + # Extract only required attorney values. + atty_values = ( + Attorney.objects.filter(roles__docket=instance) + .distinct() + .values_list("pk", "name") + ) + for pk, name in atty_values.iterator(): + out["attorney_id"].add(pk) + out["attorney"].add(name) + + # Extract only required firm values. + firms_values = ( + AttorneyOrganization.objects.filter( + attorney_organization_associations__docket=instance + ) + .distinct() + .values_list("pk", "name") + ) + for pk, name in firms_values.iterator(): + out["firm_id"].add(pk) + out["firm"].add(name) + + return out + + def prepare_documents(self, instance): + rds = RECAPDocument.objects.filter(docket_entry__docket=instance) + return [ + { + "id": rd.pk, + "docket_entry_id": rd.docket_entry_id, + "description": rd.docket_entry.description, + "entry_number": rd.docket_entry.entry_number, + "entry_date_filed": rd.docket_entry.date_filed, + "short_description": rd.description, + "document_type": rd.get_document_type_display(), + "document_number": rd.document_number or None, + "pacer_doc_id": rd.pacer_doc_id, + "plain_text": escape(rd.plain_text.translate(null_map)), + "attachment_number": rd.attachment_number, + "is_available": rd.is_available, + "page_count": rd.page_count, + "filepath_local": ( + rd.filepath_local.name if rd.filepath_local else None + ), + "absolute_url": rd.get_absolute_url(), + "cites": list( + rd.cited_opinions.all().values_list( + "cited_opinion_id", flat=True + ) + ), + } + for rd in rds + ] + + def prepare(self, instance): + data = super().prepare(instance) + parties_prepared = self.prepare_parties(instance) + data["party_id"] = list(parties_prepared["party_id"]) + data["party"] = list(parties_prepared["party"]) + data["attorney_id"] = list(parties_prepared["attorney_id"]) + data["attorney"] = list(parties_prepared["attorney"]) + data["firm_id"] = list(parties_prepared["firm_id"]) + data["firm"] = list(parties_prepared["firm"]) + return data diff --git a/cl/search/es_indices.py b/cl/search/es_indices.py index 717a6abee9..bf129f0704 100644 --- a/cl/search/es_indices.py +++ b/cl/search/es_indices.py @@ -53,3 +53,12 @@ number_of_replicas=settings.ELASTICSEARCH_OPINION_NUMBER_OF_REPLICAS, analysis=settings.ELASTICSEARCH_DSL["analysis"], ) + + +# Define RECAP Nested elasticsearch index +recap_sweep_index = Index("recap_sweep") +recap_sweep_index.settings( + number_of_shards=settings.ELASTICSEARCH_RECAP_NUMBER_OF_SHARDS, + number_of_replicas=settings.ELASTICSEARCH_RECAP_NUMBER_OF_REPLICAS, + analysis=settings.ELASTICSEARCH_DSL["analysis"], +) diff --git a/cl/search/management/commands/cl_index_parent_and_child_docs.py 
b/cl/search/management/commands/cl_index_parent_and_child_docs.py index abb528a3b0..e184187168 100644 --- a/cl/search/management/commands/cl_index_parent_and_child_docs.py +++ b/cl/search/management/commands/cl_index_parent_and_child_docs.py @@ -342,6 +342,11 @@ def add_arguments(self, parser): action="store_true", help="Use this flag to only index documents missing in the index.", ) + parser.add_argument( + "--nested", + action="store_true", + help="Whether to perform a indexing of Nested documents.", + ) def handle(self, *args, **options): super().handle(*args, **options) @@ -475,6 +480,7 @@ def process_queryset( pk_offset = self.options["pk_offset"] document_type = self.options.get("document_type", None) missing = self.options.get("missing", False) + nested = self.options.get("nested", False) fields_map = {} if event_doc_type == EventTable.DOCKET: fields_map = recap_document_field_mapping["save"][Docket][ @@ -535,6 +541,7 @@ def process_queryset( search_type, document_type, testing_mode=testing_mode, + nested=nested, ).set(queue=queue).apply_async() case "remove_parent_and_child_docs_by_query": remove_parent_and_child_docs_by_query.si( diff --git a/cl/search/tasks.py b/cl/search/tasks.py index df7d337f26..db1a0c3651 100644 --- a/cl/search/tasks.py +++ b/cl/search/tasks.py @@ -45,6 +45,7 @@ OpinionDocument, PersonDocument, PositionDocument, + RECAPNestedDocument, ) from cl.search.models import ( SEARCH_TYPES, @@ -1071,6 +1072,7 @@ def index_parent_or_child_docs( search_type: str, document_type: str | None, testing_mode: bool = False, + nested: bool = False, ) -> None: """Index parent or child documents in Elasticsearch. @@ -1089,7 +1091,9 @@ def index_parent_or_child_docs( child_instances = QuerySet() match search_type: case SEARCH_TYPES.RECAP: - parent_es_document = DocketDocument + parent_es_document = ( + RECAPNestedDocument if nested else DocketDocument + ) child_es_document = ESRECAPDocument child_id_property = "RECAP" if document_type == "parent": diff --git a/cl/search/tests/tests_es_recap.py b/cl/search/tests/tests_es_recap.py index 389193b204..15f651a475 100644 --- a/cl/search/tests/tests_es_recap.py +++ b/cl/search/tests/tests_es_recap.py @@ -19,6 +19,7 @@ from cl.lib.elasticsearch_utils import ( build_es_main_query, + do_es_sweep_nested_query, fetch_es_results, merge_unavailable_fields_on_parent_document, set_results_highlights, @@ -46,9 +47,15 @@ DocketESResultSerializer, RECAPDocumentESResultSerializer, RECAPESResultSerializer, + RECAPNestedResultSerializer, ) from cl.search.api_views import SearchV4ViewSet -from cl.search.documents import ES_CHILD_ID, DocketDocument, ESRECAPDocument +from cl.search.documents import ( + ES_CHILD_ID, + DocketDocument, + ESRECAPDocument, + RECAPNestedDocument, +) from cl.search.factories import ( BankruptcyInformationFactory, CourtFactory, @@ -6672,3 +6679,63 @@ def test_recap_history_table_indexing(self) -> None: ) if keys: self.r.delete(*keys) + + +class RECAPSearchNestedIndexTest( + RECAPSearchAPICommonTests, ESIndexTestCase, TestCase +): + """ + RECAP Nested Index Tests + """ + + version_api = "v4" + skip_common_tests = False + + @classmethod + def setUpTestData(cls): + cls.rebuild_index("people_db.Person") + cls.rebuild_index("search.Docket") + cls.mock_date = now().replace(day=15, hour=0) + with time_machine.travel(cls.mock_date, tick=False): + super().setUpTestData() + call_command( + "cl_index_parent_and_child_docs", + search_type=SEARCH_TYPES.RECAP, + queue="celery", + pk_offset=0, + document_type="parent", + testing_mode=True, + 
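+                # The --nested flag routes indexing into RECAPNestedDocument,
+                # which writes to the recap_sweep index.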
nested=True, + ) + + async def _test_api_results_count( + self, params, expected_count, field_name + ): + + search_query = RECAPNestedDocument.search() + results, total_hits = await sync_to_async(do_es_sweep_nested_query)( + search_query, + params, + ) + results = RECAPNestedResultSerializer(results, many=True).data + got = len(results) + self.assertEqual( + got, + expected_count, + msg="Did not get the right number of search results in API with %s " + "filter applied.\n" + "Expected: %s\n" + " Got: %s\n\n" + "Params were: %s" % (field_name, expected_count, got, params), + ) + return results + + async def test_cross_object_string_query(self) -> None: + """Confirm a cross-object string query return the right results.""" + + search_params = { + "type": SEARCH_TYPES.RECAP, + "q": f"id:{self.rd_api.pk} cause:(401 Civil) juryDemand:Plaintiff short_description:(Order Letter) plain_text:(shown in the API)", + } + + await self._test_api_results_count(search_params, 1, "API fields") From 2955b0ba5d7426fd62671e6d00c044818f2773c2 Mon Sep 17 00:00:00 2001 From: Alberto Islas Date: Fri, 21 Jun 2024 20:35:54 -0600 Subject: [PATCH 02/33] fix(alerts): Changed sweep index approach to parent-child documents --- cl/lib/elasticsearch_utils.py | 291 +++----------- cl/lib/test_helpers.py | 62 ++- cl/search/api_serializers.py | 16 +- cl/search/constants.py | 33 +- cl/search/documents.py | 369 +----------------- cl/search/forms.py | 10 + .../cl_index_parent_and_child_docs.py | 13 +- cl/search/tasks.py | 20 +- cl/search/tests/tests_es_recap.py | 126 +++++- cl/tests/cases.py | 6 +- 10 files changed, 295 insertions(+), 651 deletions(-) diff --git a/cl/lib/elasticsearch_utils.py b/cl/lib/elasticsearch_utils.py index 7f407441aa..3280abc449 100644 --- a/cl/lib/elasticsearch_utils.py +++ b/cl/lib/elasticsearch_utils.py @@ -58,6 +58,8 @@ PEOPLE_ES_HL_FIELDS, PEOPLE_ES_HL_KEYWORD_FIELDS, RELATED_PATTERN, + SEARCH_ALERTS_DOCKET_HL_FIELDS, + SEARCH_ALERTS_DOCKET_KEYWORDS_HL_FIELDS, SEARCH_ALERTS_ORAL_ARGUMENT_ES_HL_FIELDS, SEARCH_HL_TAG, SEARCH_OPINION_HL_FIELDS, @@ -69,7 +71,6 @@ SEARCH_RECAP_CHILD_HL_FIELDS, SEARCH_RECAP_CHILD_QUERY_FIELDS, SEARCH_RECAP_HL_FIELDS, - SEARCH_RECAP_NESTED_CHILD_QUERY_FIELDS, SEARCH_RECAP_PARENT_QUERY_FIELDS, api_child_highlight_map, ) @@ -889,6 +890,7 @@ def build_has_child_query( order_by: tuple[str, str] | None = None, child_highlighting: bool = True, default_current_date: datetime.date | None = None, + alerts: bool = False, ) -> QueryString: """Build a 'has_child' query. @@ -901,6 +903,7 @@ def build_has_child_query( :param child_highlighting: Whether highlighting should be enabled in child docs. :param default_current_date: The default current date to use for computing a stable date score across pagination in the V4 Search API. + :param alerts: If highlighting is being applied to search Alerts hits. :return: The 'has_child' query. 
""" @@ -917,8 +920,9 @@ def build_has_child_query( default_current_date=default_current_date, ) + hl_tag = ALERTS_HL_TAG if alerts else SEARCH_HL_TAG highlight_options, fields_to_exclude = build_highlights_dict( - highlighting_fields, SEARCH_HL_TAG, child_highlighting + highlighting_fields, hl_tag, child_highlighting ) inner_hits = { @@ -1068,7 +1072,7 @@ def build_es_base_query( cd: CleanData, child_highlighting: bool = True, api_version: Literal["v3", "v4"] | None = None, - nested_query: bool = False, + alerts: bool = False, ) -> tuple[Search, QueryString | None]: """Builds filters and fulltext_query based on the given cleaned data and returns an elasticsearch query. @@ -1077,7 +1081,7 @@ def build_es_base_query( :param cd: The cleaned data object containing the query and filters. :param child_highlighting: Whether highlighting should be enabled in child docs. :param api_version: Optional, the request API version. - :param nested_query: Whether to perform a nested query. + :param alerts: If highlighting is being applied to search Alerts hits. :return: A two-tuple, the Elasticsearch search query object and an ES QueryString for child documents, or None if there is no need to query child documents. @@ -1155,15 +1159,6 @@ def build_es_base_query( ], ) ) - nested_child_fields = SEARCH_RECAP_NESTED_CHILD_QUERY_FIELDS.copy() - nested_child_fields.extend( - add_fields_boosting( - cd, - [ - "description", - ], - ) - ) child_query_fields = {"recap_document": child_fields} parent_query_fields = SEARCH_RECAP_PARENT_QUERY_FIELDS.copy() parent_query_fields.extend( @@ -1175,21 +1170,14 @@ def build_es_base_query( ], ) ) - - if nested_query: - main_query, _ = build_full_nested_es_queries( - cd, - nested_child_fields, - parent_query_fields, - ) - else: - main_query, join_query = build_full_join_es_queries( - cd, - child_query_fields, - parent_query_fields, - child_highlighting=child_highlighting, - api_version=api_version, - ) + main_query, join_query = build_full_join_es_queries( + cd, + child_query_fields, + parent_query_fields, + child_highlighting=child_highlighting, + api_version=api_version, + alerts=alerts, + ) case SEARCH_TYPES.OPINION: str_query = cd.get("q", "") @@ -1300,7 +1288,7 @@ def build_child_docs_query( query for query in parent_filters if isinstance(query, QueryString) - and query.fields[0] in ["party", "attorney"] + and query.fields[0] in ["party", "attorney", "firm"] ] parties_has_parent_query = build_has_parent_parties_query(parties_filters) @@ -1475,7 +1463,15 @@ def add_es_highlighting( highlighting_fields = PEOPLE_ES_HL_FIELDS highlighting_keyword_fields = PEOPLE_ES_HL_KEYWORD_FIELDS case SEARCH_TYPES.RECAP | SEARCH_TYPES.DOCKETS: - highlighting_fields = SEARCH_RECAP_HL_FIELDS + highlighting_fields = ( + SEARCH_ALERTS_DOCKET_HL_FIELDS + if alerts + else SEARCH_RECAP_HL_FIELDS + ) + if alerts: + highlighting_keyword_fields = ( + SEARCH_ALERTS_DOCKET_KEYWORDS_HL_FIELDS + ) case SEARCH_TYPES.OPINION: highlighting_fields = SEARCH_OPINION_HL_FIELDS @@ -2006,14 +2002,11 @@ def fetch_es_results( return [], 0, error, None, None -def build_has_child_filters( - cd: CleanData, nested_query=False -) -> list[QueryString]: +def build_has_child_filters(cd: CleanData) -> list[QueryString]: """Builds Elasticsearch 'has_child' filters based on the given child type and CleanData. :param cd: The user input CleanedData. - :param nested_query: Whether to perform a nested query. :return: A list of QueryString objects containing the 'has_child' filters. 
""" @@ -2047,36 +2040,22 @@ def build_has_child_filters( attachment_number = cd.get("attachment_number", "") if available_only: - field = ( - "is_available" - if not nested_query - else "documents.is_available" - ) queries_list.extend( build_term_query( - field, + "is_available", available_only, ) ) if description: - field = ( - "description" if not nested_query else "documents.description" - ) - queries_list.extend(build_text_filter(field, description)) + queries_list.extend(build_text_filter("description", description)) if document_number: - field = ( - "document_number" - if not nested_query - else "documents.document_number" + queries_list.extend( + build_term_query("document_number", document_number) ) - queries_list.extend(build_term_query(field, document_number)) if attachment_number: - field = ( - "attachment_number" - if not nested_query - else "documents.attachment_number" + queries_list.extend( + build_term_query("attachment_number", attachment_number) ) - queries_list.extend(build_term_query(field, attachment_number)) return queries_list @@ -2133,6 +2112,7 @@ def build_join_es_filters(cd: CleanData) -> List: *build_text_filter("referredTo", cd.get("referred_to", "")), *build_text_filter("party", cd.get("party_name", "")), *build_text_filter("attorney", cd.get("atty_name", "")), + *build_text_filter("firm", cd.get("firm_name", "")), *build_daterange_query( "dateFiled", cd.get("filed_before", ""), @@ -2357,6 +2337,7 @@ def build_full_join_es_queries( mlt_query: Query | None = None, child_highlighting: bool = True, api_version: Literal["v3", "v4"] | None = None, + alerts: bool = False, ) -> tuple[QueryString | list, QueryString | None]: """Build a complete Elasticsearch query with both parent and child document conditions. @@ -2367,6 +2348,7 @@ def build_full_join_es_queries( :param mlt_query: the More Like This Query object. :param child_highlighting: Whether highlighting should be enabled in child docs. :param api_version: Optional, the request API version. + :param alerts: If highlighting is being applied to search Alerts hits. :return: An Elasticsearch QueryString object. """ @@ -2411,7 +2393,7 @@ def build_full_join_es_queries( query for query in parent_filters if isinstance(query, QueryString) - and query.fields[0] in ["party", "attorney"] + and query.fields[0] in ["party", "attorney", "firm"] ] has_parent_parties_filter = build_has_parent_parties_query( parties_filters @@ -2425,7 +2407,7 @@ def build_full_join_es_queries( query for query in parent_filters if not isinstance(query, QueryString) - or query.fields[0] not in ["party", "attorney"] + or query.fields[0] not in ["party", "attorney", "firm"] ] ) if parties_filters: @@ -2473,6 +2455,7 @@ def build_full_join_es_queries( get_function_score_sorting_key(cd, api_version), child_highlighting=child_highlighting, default_current_date=cd.get("request_date"), + alerts=alerts, ) if parties_filters and not has_child_query: @@ -2487,6 +2470,7 @@ def build_full_join_es_queries( SEARCH_RECAP_CHILD_HL_FIELDS, get_function_score_sorting_key(cd, api_version), default_current_date=cd.get("request_date"), + alerts=alerts, ) if has_child_query: @@ -3055,175 +3039,10 @@ def do_es_alert_estimation_query( return estimation_query.count() -def build_nested_child_query( - query: QueryString | str, - child_type: str, - child_hits_limit: int, - highlighting_fields: dict[str, int] | None = None, -) -> QueryString: - """Build a nested query. - - :param query: The Elasticsearch query string or QueryString object. 
- :param child_type: The type of the child document. - :param child_hits_limit: The maximum number of child hits to be returned. - :param highlighting_fields: List of fields to highlight in child docs. - :return: The 'has_child' query. - """ - - highlight_options, fields_to_exclude = build_highlights_dict( - highlighting_fields, SEARCH_HL_TAG - ) - inner_hits = { - "name": f"filter_query_inner_{child_type}", - "size": child_hits_limit, - "_source": { - "excludes": fields_to_exclude, - }, - } - if highlight_options: - inner_hits["highlight"] = highlight_options - - return Q( - "nested", - path="documents", - score_mode="max", - query=query, - inner_hits=inner_hits, - ) - - -def build_full_nested_es_queries( - cd: CleanData, - child_query_fields: list[str], - parent_query_fields: list[str], -) -> tuple[QueryString | list, QueryString | None]: - """Build a complete Elasticsearch query with both parent and nested - documents conditions. - - :param cd: The query CleanedData - :param child_query_fields: A dictionary mapping child fields document type. - :param parent_query_fields: A list of fields for the parent document. - :return: An Elasticsearch QueryString object. - """ - - q_should = [] - child_query = None - if cd["type"] in [ - SEARCH_TYPES.RECAP, - SEARCH_TYPES.DOCKETS, - SEARCH_TYPES.RECAP_DOCUMENT, - SEARCH_TYPES.OPINION, - SEARCH_TYPES.PEOPLE, - ]: - # Build child filters. - child_filters = build_has_child_filters(cd, nested_query=True) - # Copy the original child_filters before appending parent fields. - # For its use later in the parent filters. - child_filters_original = deepcopy(child_filters) - # Build child text query. - child_fields = [f"documents.{field}" for field in child_query_fields] - child_text_query = build_fulltext_query( - child_fields, cd.get("q", ""), only_queries=True - ) - - # Build parent filters. - parent_filters = build_join_es_filters(cd) - - # Build the child query based on child_filters and child child_text_query - match child_filters, child_text_query: - case [], []: - pass - case [], _: - child_query = Q( - "bool", - should=child_text_query, - minimum_should_match=1, - ) - case _, []: - child_query = Q( - "bool", - filter=child_filters, - ) - case _, _: - child_query = Q( - "bool", - filter=child_filters, - should=child_text_query, - minimum_should_match=1, - ) - - _, query_hits_limit = get_child_top_hits_limit(cd, cd["type"]) - has_child_query = None - if child_text_query or child_filters: - hl_fields = api_child_highlight_map.get((True, cd["type"]), {}) - has_child_query = build_nested_child_query( - child_query, - "recap_document", - query_hits_limit, - hl_fields, - ) - - if has_child_query: - q_should.append(has_child_query) - - # Build the parent filter and text queries. - string_query = build_fulltext_query( - parent_query_fields, cd.get("q", ""), only_queries=True - ) - - # If child filters are set, add a nested query as a filter to the - # parent query to exclude results without matching children. 
-        if child_filters_original:
-            parent_filters.append(
-                Q(
-                    "nested",
-                    path="documents",
-                    score_mode="max",
-                    query=Q("bool", filter=child_filters_original),
-                )
-            )
-        parent_query = None
-        match parent_filters, string_query:
-            case [], []:
-                pass
-            case [], _:
-                parent_query = Q(
-                    "bool",
-                    should=string_query,
-                    minimum_should_match=1,
-                )
-            case _, []:
-                parent_query = Q(
-                    "bool",
-                    filter=parent_filters,
-                )
-            case _, _:
-                parent_query = Q(
-                    "bool",
-                    filter=parent_filters,
-                    should=string_query,
-                    minimum_should_match=1,
-                )
-        if parent_query:
-            q_should.append(parent_query)
-
-    if not q_should:
-        return [], child_query
-
-    final_query = Q(
-        "bool",
-        should=q_should,
-    )
-    return (
-        final_query,
-        child_query,
-    )
-
-
 def do_es_sweep_nested_query(
     search_query: Search,
     cd: CleanData,
-) -> tuple[list[defaultdict] | None, int | None]:
+) -> tuple[list[Hit] | None, int | None]:
     """Build and execute an ES query for the daily RECAP sweep index.
 
     :param search_query: Elasticsearch DSL Search object.
     :param cd: The query CleanedData.
     :return: A two-tuple: the ES results and the total number of hits, or
     (None, None) if the search form did not validate.
@@ -3241,19 +3060,15 @@ def do_es_sweep_nested_query(
 
     hits = None
     try:
-        s, _ = build_es_base_query(
-            search_query,
-            cd,
-            True,
-            nested_query=True,
-        )
+        s, _ = build_es_base_query(search_query, cd, True, alerts=True)
     except (
         UnbalancedParenthesesQuery,
        UnbalancedQuotesQuery,
         BadProximityQuery,
     ) as e:
         raise ElasticBadRequestError(detail=e.message)
-    main_query = add_es_highlighting(s, cd, highlighting=True)
+    main_query = add_es_highlighting(s, cd, alerts=True)
+    main_query = main_query.sort(build_sort_results(cd))
     main_query = main_query.extra(from_=0, size=30)
     results = main_query.execute()
     if results:
@@ -3272,3 +3087,21 @@ def do_es_sweep_nested_query(
         result["child_docs"] = child_result_objects
 
     return results, hits
+
+
+def docket_field_matched(hit: Hit) -> bool:
+    """Determine whether highlighting matched a Docket field.
+
+    :param hit: The ES hit.
+    :return: True if the hit matched a Docket field. Otherwise, False.
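+
+    A hit counts as a Docket-field match when every key in its highlight
+    dict belongs to the Docket-level highlight field sets, so a hit with
+    no highlights at all also counts as a match.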
+ """ + + plain_hl = set(SEARCH_ALERTS_DOCKET_KEYWORDS_HL_FIELDS) + vector_hl = set(SEARCH_ALERTS_DOCKET_HL_FIELDS.keys()) + docket_hl = set() + if hasattr(hit.meta, "highlight"): + highlights = hit.meta.highlight.to_dict() + docket_hl = set([hl for hl in highlights.keys()]) + if docket_hl.issubset(plain_hl.union(vector_hl)): + return True + return False diff --git a/cl/lib/test_helpers.py b/cl/lib/test_helpers.py index 69976430f1..d7e1c49ebc 100644 --- a/cl/lib/test_helpers.py +++ b/cl/lib/test_helpers.py @@ -278,15 +278,19 @@ def midnight_pt_test(d: datetime.date) -> datetime.datetime: ), "court_id": lambda x: x["result"].docket_entry.docket.court.pk, "dateArgued": lambda x: ( - ( - x["result"].docket_entry.docket.date_argued.isoformat() - if x.get("V4") - else midnight_pt_test( - x["result"].docket_entry.docket.date_argued - ).isoformat() + x["dateArgued"] + if x.get("dateArgued") + else ( + ( + x["result"].docket_entry.docket.date_argued.isoformat() + if x.get("V4") + else midnight_pt_test( + x["result"].docket_entry.docket.date_argued + ).isoformat() + ) + if x["result"].docket_entry.docket.date_argued + else None ) - if x["result"].docket_entry.docket.date_argued - else None ), "dateFiled": lambda x: ( ( @@ -315,7 +319,11 @@ def midnight_pt_test(d: datetime.date) -> datetime.datetime: if x.get("docketNumber") else x["result"].docket_entry.docket.docket_number ), - "docket_id": lambda x: x["result"].docket_entry.docket_id, + "docket_id": lambda x: ( + x["docket_id"] + if x.get("docket_id") + else x["result"].docket_entry.docket_id + ), "jurisdictionType": lambda x: x[ "result" ].docket_entry.docket.jurisdiction_type, @@ -348,10 +356,14 @@ def midnight_pt_test(d: datetime.date) -> datetime.datetime: recap_type_v4_api_keys = docket_api_common_keys.copy() recap_type_v4_api_keys.update( { - "attorney": lambda x: list( - DocketDocument().prepare_parties(x["result"].docket_entry.docket)[ - "attorney" - ] + "attorney": lambda x: ( + x["attorney"] + if x.get("attorney") + else list( + DocketDocument().prepare_parties( + x["result"].docket_entry.docket + )["attorney"] + ) ), "attorney_id": lambda x: list( DocketDocument().prepare_parties(x["result"].docket_entry.docket)[ @@ -371,10 +383,14 @@ def midnight_pt_test(d: datetime.date) -> datetime.datetime: "docket_absolute_url": lambda x: x[ "result" ].docket_entry.docket.get_absolute_url(), - "firm": lambda x: list( - DocketDocument().prepare_parties(x["result"].docket_entry.docket)[ - "firm" - ] + "firm": lambda x: ( + x["firm"] + if x.get("firm") + else list( + DocketDocument().prepare_parties( + x["result"].docket_entry.docket + )["firm"] + ) ), "firm_id": lambda x: list( DocketDocument().prepare_parties(x["result"].docket_entry.docket)[ @@ -386,10 +402,14 @@ def midnight_pt_test(d: datetime.date) -> datetime.datetime: if x["result"].docket_entry.docket.pacer_case_id else "" ), - "party": lambda x: list( - DocketDocument().prepare_parties(x["result"].docket_entry.docket)[ - "party" - ] + "party": lambda x: ( + x["party"] + if x.get("party") + else list( + DocketDocument().prepare_parties( + x["result"].docket_entry.docket + )["party"] + ) ), "party_id": lambda x: list( DocketDocument().prepare_parties(x["result"].docket_entry.docket)[ diff --git a/cl/search/api_serializers.py b/cl/search/api_serializers.py index 7c72dd5a2d..f5abc46b93 100644 --- a/cl/search/api_serializers.py +++ b/cl/search/api_serializers.py @@ -528,6 +528,7 @@ class BaseDocketESResultSerializer(DocumentSerializer): attorney = NoneToListField(read_only=True, required=False) 
firm_id = NoneToListField(read_only=True, required=False) firm = NoneToListField(read_only=True, required=False) + docket_id = HighlightedField(read_only=True) class Meta: document = DocketDocument @@ -731,18 +732,3 @@ class Meta: "pacer_doc_id", "trustee_str", ) - - -class RECAPNestedResultSerializer( - RECAPMetaMixin, BaseDocketESResultSerializer -): - """The serializer class for RECAP search type results.""" - - recap_documents = BaseRECAPDocumentESResultSerializer( - many=True, read_only=True, source="child_docs" - ) - - class Meta(BaseDocketESResultSerializer.Meta): - exclude = BaseDocketESResultSerializer.Meta.exclude + ( - "docket_absolute_url", - ) diff --git a/cl/search/constants.py b/cl/search/constants.py index 0dedfddc3c..0457e53799 100644 --- a/cl/search/constants.py +++ b/cl/search/constants.py @@ -96,11 +96,6 @@ "chapter", "trustee_str", ] -SEARCH_RECAP_NESTED_CHILD_QUERY_FIELDS = [ - "short_description", - "plain_text", - "document_type", -] SEARCH_OPINION_QUERY_FIELDS = [ "court", "court_id", @@ -161,6 +156,34 @@ "suitNature": 0, } +SEARCH_ALERTS_DOCKET_HL_FIELDS = { + "assignedTo": 0, + "caseName": 0, + "cause": 0, + "court_citation_string": 0, + "docketNumber": 0, + "juryDemand": 0, + "referredTo": 0, + "suitNature": 0, + "party": 0, + "attorney": 0, + "firm": 0, +} + +SEARCH_ALERTS_DOCKET_KEYWORDS_HL_FIELDS = { + "docket_id", + "court_id", + "firm_id", + "assigned_to_id", + "referred_to_id", + "dateFiled", + "dateArgued", + "dateTerminated", + "jurisdictionType", + "chapter", + "trustee_str", +} + SEARCH_OPINION_HL_FIELDS = { "caseName": 0, "citation": 0, diff --git a/cl/search/documents.py b/cl/search/documents.py index 5fada32689..957f03034e 100644 --- a/cl/search/documents.py +++ b/cl/search/documents.py @@ -36,7 +36,6 @@ BankruptcyInformation, Citation, Docket, - DocketEntry, Opinion, OpinionCluster, ParentheticalGroup, @@ -791,7 +790,7 @@ class DocketBaseDocument(Document): timestamp = fields.DateField() # Docket Fields - docket_id = fields.IntegerField(attr="pk") + docket_id = fields.KeywordField(attr="pk") caseName = fields.TextField( analyzer="text_en_splitting_cl", term_vector="with_positions_offsets", @@ -1831,179 +1830,15 @@ def prepare_cluster_child(self, instance): @recap_sweep_index.document -class RECAPNestedDocument(Document): - # Docket Fields - docket_id = fields.IntegerField(attr="pk") - caseName = fields.TextField( - analyzer="text_en_splitting_cl", - term_vector="with_positions_offsets", - fields={ - "exact": fields.TextField( - analyzer="english_exact", - search_analyzer="search_analyzer_exact", - term_vector="with_positions_offsets", - ), - }, - search_analyzer="search_analyzer", - ) - case_name_full = fields.TextField( - attr="case_name_full", - analyzer="text_en_splitting_cl", - fields={ - "exact": fields.TextField( - attr="case_name_full", - analyzer="english_exact", - search_analyzer="search_analyzer_exact", - ), - }, - search_analyzer="search_analyzer", - ) - docketNumber = fields.TextField( - attr="docket_number", - analyzer="text_en_splitting_cl", - term_vector="with_positions_offsets", - fields={ - "exact": fields.TextField( - attr="docket_number", - analyzer="english_exact", - term_vector="with_positions_offsets", - search_analyzer="search_analyzer_exact", - ), - }, - search_analyzer="search_analyzer", - ) - suitNature = fields.TextField( - attr="nature_of_suit", - analyzer="text_en_splitting_cl", - term_vector="with_positions_offsets", - fields={ - "exact": fields.TextField( - attr="nature_of_suit", - analyzer="english_exact", - 
term_vector="with_positions_offsets", - search_analyzer="search_analyzer_exact", - ), - }, - search_analyzer="search_analyzer", - ) - cause = fields.TextField( - attr="cause", - analyzer="text_en_splitting_cl", - term_vector="with_positions_offsets", - fields={ - "exact": fields.TextField( - attr="cause", - analyzer="english_exact", - term_vector="with_positions_offsets", - search_analyzer="search_analyzer_exact", - ), - }, - search_analyzer="search_analyzer", - ) - juryDemand = fields.TextField( - attr="jury_demand", - analyzer="text_en_splitting_cl", - term_vector="with_positions_offsets", - fields={ - "exact": fields.TextField( - attr="jury_demand", - analyzer="english_exact", - term_vector="with_positions_offsets", - search_analyzer="search_analyzer_exact", - ), - }, - search_analyzer="search_analyzer", - ) - jurisdictionType = fields.TextField( - attr="jurisdiction_type", - analyzer="text_en_splitting_cl", - fields={ - "exact": fields.TextField( - attr="jurisdiction_type", - analyzer="english_exact", - search_analyzer="search_analyzer_exact", - ), - }, - search_analyzer="search_analyzer", - ) - dateArgued = fields.DateField(attr="date_argued") - dateFiled = fields.DateField(attr="date_filed") - dateTerminated = fields.DateField(attr="date_terminated") - assignedTo = fields.TextField( - analyzer="text_en_splitting_cl", - term_vector="with_positions_offsets", - fields={ - "exact": fields.TextField( - analyzer="english_exact", - term_vector="with_positions_offsets", - search_analyzer="search_analyzer_exact", - ), - }, - search_analyzer="search_analyzer", - ) - assigned_to_id = fields.KeywordField(attr="assigned_to.pk") - referredTo = fields.TextField( - analyzer="text_en_splitting_cl", - term_vector="with_positions_offsets", - fields={ - "exact": fields.TextField( - analyzer="english_exact", - term_vector="with_positions_offsets", - search_analyzer="search_analyzer_exact", - ), - }, - search_analyzer="search_analyzer", - ) - referred_to_id = fields.KeywordField(attr="referred_to.pk") - court = fields.TextField( - attr="court.full_name", - analyzer="text_en_splitting_cl", - fields={ - "exact": fields.TextField( - attr="court.full_name", - analyzer="english_exact", - search_analyzer="search_analyzer_exact", - ), - }, - search_analyzer="search_analyzer", - ) - court_id = fields.TextField( - attr="court.pk", - analyzer="text_en_splitting_cl", - fields={"raw": fields.KeywordField(attr="court.pk")}, - search_analyzer="search_analyzer", - ) - court_citation_string = fields.TextField( - attr="court.citation_string", - analyzer="text_en_splitting_cl", - search_analyzer="search_analyzer", - term_vector="with_positions_offsets", - ) - chapter = fields.TextField( - analyzer="text_en_splitting_cl", - search_analyzer="search_analyzer", - ) - trustee_str = fields.TextField( - analyzer="text_en_splitting_cl", - fields={ - "exact": fields.TextField( - analyzer="english_exact", - search_analyzer="search_analyzer_exact", - ), - }, - search_analyzer="search_analyzer", - ) - date_created = fields.DateField(attr="date_created") - pacer_case_id = fields.KeywordField(attr="pacer_case_id") - - # Parties - party_id = fields.ListField(fields.IntegerField(multi=True)) +class DocketSweepDocument(DocketDocument): party = fields.ListField( fields.TextField( analyzer="text_en_splitting_cl", + term_vector="with_positions_offsets", fields={ "exact": fields.TextField( analyzer="english_exact", + term_vector="with_positions_offsets", search_analyzer="search_analyzer_exact", ), }, @@ -2011,13 +1846,14 @@ class 
RECAPNestedDocument(Document): multi=True, ) ) - attorney_id = fields.ListField(fields.IntegerField(multi=True)) attorney = fields.ListField( fields.TextField( analyzer="text_en_splitting_cl", + term_vector="with_positions_offsets", fields={ "exact": fields.TextField( analyzer="english_exact", + term_vector="with_positions_offsets", search_analyzer="search_analyzer_exact", ), }, @@ -2025,13 +1861,14 @@ class RECAPNestedDocument(Document): multi=True, ) ) - firm_id = fields.ListField(fields.IntegerField(multi=True)) firm = fields.ListField( fields.TextField( analyzer="text_en_splitting_cl", + term_vector="with_positions_offsets", fields={ "exact": fields.TextField( analyzer="english_exact", + term_vector="with_positions_offsets", search_analyzer="search_analyzer_exact", ), }, @@ -2040,193 +1877,13 @@ class RECAPNestedDocument(Document): ) ) - # RECAPDocument fields: - documents = fields.NestedField( - properties={ - "id": fields.IntegerField(attr="pk"), - "docket_entry_id": fields.IntegerField(attr="docket_entry.pk"), - "description": fields.TextField( - attr="docket_entry.description", - analyzer="text_en_splitting_cl", - term_vector="with_positions_offsets", - fields={ - "exact": fields.TextField( - attr="docket_entry.description", - term_vector="with_positions_offsets", - analyzer="english_exact", - search_analyzer="search_analyzer_exact", - ), - }, - search_analyzer="search_analyzer", - ), - "entry_number": fields.LongField(attr="docket_entry.entry_number"), - "entry_date_filed": fields.DateField( - attr="docket_entry.date_filed" - ), - "short_description": fields.TextField( - attr="description", - analyzer="text_en_splitting_cl", - term_vector="with_positions_offsets", - fields={ - "exact": fields.TextField( - attr="description", - analyzer="english_exact", - term_vector="with_positions_offsets", - search_analyzer="search_analyzer_exact", - ), - }, - search_analyzer="search_analyzer", - ), - "document_type": fields.TextField( - analyzer="text_en_splitting_cl", - term_vector="with_positions_offsets", - fields={ - "exact": fields.TextField( - analyzer="english_exact", - term_vector="with_positions_offsets", - search_analyzer="search_analyzer_exact", - ), - }, - search_analyzer="search_analyzer", - ), - "document_number": fields.LongField(), - "pacer_doc_id": fields.KeywordField(attr="pacer_doc_id"), - "plain_text": fields.TextField( - analyzer="text_en_splitting_cl", - term_vector="with_positions_offsets", - fields={ - "exact": fields.TextField( - analyzer="english_exact", - term_vector="with_positions_offsets", - search_analyzer="search_analyzer_exact", - ), - }, - search_analyzer="search_analyzer", - ), - "attachment_number": fields.IntegerField(attr="attachment_number"), - "is_available": fields.BooleanField(attr="is_available"), - "page_count": fields.IntegerField(attr="page_count"), - "filepath_local": fields.KeywordField(index=False), - "absolute_url": fields.KeywordField(index=False), - "cites": fields.ListField( - fields.IntegerField(multi=True), - ), - } - ) - - # Meta - timestamp = fields.DateField() - class Django: model = Docket ignore_signals = True - def prepare_caseName(self, instance): - return best_case_name(instance) - - def prepare_assignedTo(self, instance): - if instance.assigned_to: - return instance.assigned_to.name_full - elif instance.assigned_to_str: - return instance.assigned_to_str - - def prepare_referredTo(self, instance): - if instance.referred_to: - return instance.referred_to.name_full - elif instance.referred_to_str: - return instance.referred_to_str - - 
def prepare_chapter(self, instance): - if BankruptcyInformation.objects.filter(docket=instance).exists(): - return instance.bankruptcy_information.chapter - - def prepare_trustee_str(self, instance): - if BankruptcyInformation.objects.filter(docket=instance).exists(): - return instance.bankruptcy_information.trustee_str - - def prepare_docket_child(self, instance): - return "docket" - - def prepare_docket_absolute_url(self, instance): - return instance.get_absolute_url() - def prepare_parties(self, instance): - out = { - "party_id": set(), - "party": set(), - "attorney_id": set(), - "attorney": set(), - "firm_id": set(), - "firm": set(), - } - - # Extract only required parties values. - party_values = instance.parties.values_list("pk", "name") - for pk, name in party_values.iterator(): - out["party_id"].add(pk) - out["party"].add(name) - - # Extract only required attorney values. - atty_values = ( - Attorney.objects.filter(roles__docket=instance) - .distinct() - .values_list("pk", "name") - ) - for pk, name in atty_values.iterator(): - out["attorney_id"].add(pk) - out["attorney"].add(name) - - # Extract only required firm values. - firms_values = ( - AttorneyOrganization.objects.filter( - attorney_organization_associations__docket=instance - ) - .distinct() - .values_list("pk", "name") - ) - for pk, name in firms_values.iterator(): - out["firm_id"].add(pk) - out["firm"].add(name) - - return out - - def prepare_documents(self, instance): - rds = RECAPDocument.objects.filter(docket_entry__docket=instance) - return [ - { - "id": rd.pk, - "docket_entry_id": rd.docket_entry_id, - "description": rd.docket_entry.description, - "entry_number": rd.docket_entry.entry_number, - "entry_date_filed": rd.docket_entry.date_filed, - "short_description": rd.description, - "document_type": rd.get_document_type_display(), - "document_number": rd.document_number or None, - "pacer_doc_id": rd.pacer_doc_id, - "plain_text": escape(rd.plain_text.translate(null_map)), - "attachment_number": rd.attachment_number, - "is_available": rd.is_available, - "page_count": rd.page_count, - "filepath_local": ( - rd.filepath_local.name if rd.filepath_local else None - ), - "absolute_url": rd.get_absolute_url(), - "cites": list( - rd.cited_opinions.all().values_list( - "cited_opinion_id", flat=True - ) - ), - } - for rd in rds - ] - - def prepare(self, instance): - data = super().prepare(instance) - parties_prepared = self.prepare_parties(instance) - data["party_id"] = list(parties_prepared["party_id"]) - data["party"] = list(parties_prepared["party"]) - data["attorney_id"] = list(parties_prepared["attorney_id"]) - data["attorney"] = list(parties_prepared["attorney"]) - data["firm_id"] = list(parties_prepared["firm_id"]) - data["firm"] = list(parties_prepared["firm"]) - return data +@recap_sweep_index.document +class ESRECAPSweepDocument(ESRECAPDocument): + class Django: + model = RECAPDocument + ignore_signals = True diff --git a/cl/search/forms.py b/cl/search/forms.py index 7ff40a7f3e..7dcd91eaeb 100644 --- a/cl/search/forms.py +++ b/cl/search/forms.py @@ -221,6 +221,16 @@ class SearchForm(forms.Form): ), ) atty_name.as_str_types = [SEARCH_TYPES.RECAP] + firm_name = forms.CharField( + required=False, + label="Firm Name", + widget=forms.TextInput( + attrs={ + "class": "external-input form-control", + "autocomplete": "off", + }, + ), + ) # # Oral argument fields diff --git a/cl/search/management/commands/cl_index_parent_and_child_docs.py b/cl/search/management/commands/cl_index_parent_and_child_docs.py index 
e184187168..c4edec4ba0 100644 --- a/cl/search/management/commands/cl_index_parent_and_child_docs.py +++ b/cl/search/management/commands/cl_index_parent_and_child_docs.py @@ -343,9 +343,9 @@ def add_arguments(self, parser): help="Use this flag to only index documents missing in the index.", ) parser.add_argument( - "--nested", + "--sweep-index", action="store_true", - help="Whether to perform a indexing of Nested documents.", + help="Whether to perform an indexing for the sweep index.", ) def handle(self, *args, **options): @@ -480,7 +480,7 @@ def process_queryset( pk_offset = self.options["pk_offset"] document_type = self.options.get("document_type", None) missing = self.options.get("missing", False) - nested = self.options.get("nested", False) + sweep_index = self.options.get("sweep_index", False) fields_map = {} if event_doc_type == EventTable.DOCKET: fields_map = recap_document_field_mapping["save"][Docket][ @@ -532,7 +532,10 @@ def process_queryset( match task_to_use: case "index_parent_and_child_docs": index_parent_and_child_docs.si( - chunk, search_type, testing_mode=testing_mode + chunk, + search_type, + testing_mode=testing_mode, + sweep_index=sweep_index, ).set(queue=queue).apply_async() case "index_parent_or_child_docs": @@ -541,7 +544,7 @@ def process_queryset( search_type, document_type, testing_mode=testing_mode, - nested=nested, + sweep_index=sweep_index, ).set(queue=queue).apply_async() case "remove_parent_and_child_docs_by_query": remove_parent_and_child_docs_by_query.si( diff --git a/cl/search/tasks.py b/cl/search/tasks.py index db1a0c3651..38bb8cdbe1 100644 --- a/cl/search/tasks.py +++ b/cl/search/tasks.py @@ -40,12 +40,13 @@ ES_CHILD_ID, AudioDocument, DocketDocument, + DocketSweepDocument, ESRECAPDocument, + ESRECAPSweepDocument, OpinionClusterDocument, OpinionDocument, PersonDocument, PositionDocument, - RECAPNestedDocument, ) from cl.search.models import ( SEARCH_TYPES, @@ -967,6 +968,7 @@ def index_parent_and_child_docs( instance_ids: list[int], search_type: str, testing_mode: bool = False, + sweep_index: bool = False, ) -> None: """Index parent and child documents in Elasticsearch. @@ -986,8 +988,12 @@ def index_parent_and_child_docs( child_es_document = PositionDocument child_id_property = "POSITION" case SEARCH_TYPES.RECAP: - parent_es_document = DocketDocument - child_es_document = ESRECAPDocument + parent_es_document = ( + DocketSweepDocument if sweep_index else DocketDocument + ) + child_es_document = ( + ESRECAPSweepDocument if sweep_index else ESRECAPDocument + ) child_id_property = "RECAP" case SEARCH_TYPES.OPINION: parent_es_document = OpinionClusterDocument @@ -1072,7 +1078,7 @@ def index_parent_or_child_docs( search_type: str, document_type: str | None, testing_mode: bool = False, - nested: bool = False, + sweep_index: bool = False, ) -> None: """Index parent or child documents in Elasticsearch. 
@@ -1092,9 +1098,11 @@ def index_parent_or_child_docs( match search_type: case SEARCH_TYPES.RECAP: parent_es_document = ( - RECAPNestedDocument if nested else DocketDocument + DocketSweepDocument if sweep_index else DocketDocument + ) + child_es_document = ( + ESRECAPSweepDocument if sweep_index else ESRECAPDocument ) - child_es_document = ESRECAPDocument child_id_property = "RECAP" if document_type == "parent": parent_instances = Docket.objects.filter(pk__in=instance_ids) diff --git a/cl/search/tests/tests_es_recap.py b/cl/search/tests/tests_es_recap.py index 15f651a475..2cc46a3842 100644 --- a/cl/search/tests/tests_es_recap.py +++ b/cl/search/tests/tests_es_recap.py @@ -20,6 +20,7 @@ from cl.lib.elasticsearch_utils import ( build_es_main_query, do_es_sweep_nested_query, + docket_field_matched, fetch_es_results, merge_unavailable_fields_on_parent_document, set_results_highlights, @@ -47,14 +48,13 @@ DocketESResultSerializer, RECAPDocumentESResultSerializer, RECAPESResultSerializer, - RECAPNestedResultSerializer, ) from cl.search.api_views import SearchV4ViewSet from cl.search.documents import ( ES_CHILD_ID, DocketDocument, + DocketSweepDocument, ESRECAPDocument, - RECAPNestedDocument, ) from cl.search.factories import ( BankruptcyInformationFactory, @@ -6681,11 +6681,11 @@ def test_recap_history_table_indexing(self) -> None: self.r.delete(*keys) -class RECAPSearchNestedIndexTest( - RECAPSearchAPICommonTests, ESIndexTestCase, TestCase +class RECAPSearchSweepIndexTest( + RECAPSearchAPICommonTests, ESIndexTestCase, TestCase, V4SearchAPIAssertions ): """ - RECAP Nested Index Tests + RECAP Sweep Index Tests """ version_api = "v4" @@ -6703,21 +6703,20 @@ def setUpTestData(cls): search_type=SEARCH_TYPES.RECAP, queue="celery", pk_offset=0, - document_type="parent", testing_mode=True, - nested=True, + sweep_index=True, ) async def _test_api_results_count( self, params, expected_count, field_name ): - search_query = RECAPNestedDocument.search() + search_query = DocketSweepDocument.search() results, total_hits = await sync_to_async(do_es_sweep_nested_query)( search_query, params, ) - results = RECAPNestedResultSerializer(results, many=True).data + results = RECAPESResultSerializer(results, many=True).data got = len(results) self.assertEqual( got, @@ -6730,12 +6729,113 @@ async def _test_api_results_count( ) return results - async def test_cross_object_string_query(self) -> None: - """Confirm a cross-object string query return the right results.""" + async def test_cross_object_string_query_and_hl(self) -> None: + """Confirm a cross-object string query return the right results and + highlighting is properly applied. + """ + + # Docket-only query HL + search_params = { + "type": SEARCH_TYPES.RECAP, + "q": f"id:{self.rd_api.pk} cause:(401 Civil) " + f"court_citation_string:Appeals juryDemand:Plaintiff " + f"docket_id:{self.rd_api.docket_entry.docket.pk} " + f"dateArgued:[2022-05-19T00:00:00Z TO 2022-05-21T00:00:00Z]", + "assigned_to": "George", + "referred_to": "George", + "case_name": "America vs API", + "docket_number": "1:24-bk-0000", + "nature_of_suit": "569", + "party_name": "Defendant John Doe", + "atty_name": "John Doe", + "firm_name": "Associates America", + } + + # RECAP Search type HL disabled. 
+ r = await self._test_api_results_count(search_params, 1, "API fields") + keys_count = len(r[0]) + self.assertEqual(keys_count, len(recap_type_v4_api_keys)) + rd_keys_count = len(r[0]["recap_documents"][0]) + self.assertEqual(rd_keys_count, len(recap_document_v4_api_keys)) + + content_to_compare = { + "result": self.rd_api, + "V4": True, + "assignedTo": "George Doe II", + "caseName": "America vs API Lorem", + "cause": "401 Civil", + "court_citation_string": "Appeals. CA9.", + "docketNumber": "1:24-bk-0000", + "juryDemand": "Plaintiff", + "referredTo": "George Doe II", + "suitNature": "569", + "party": [ + "Defendant John Doe" + ], + "firm": ["Associates America"], + "attorney": ["John Doe"], + "docket_id": f"{self.rd_api.docket_entry.docket.pk}", + "dateArgued": f"2022-05-19", + } + await self._test_api_fields_content( + r, + content_to_compare, + recap_type_v4_api_keys, + recap_document_v4_api_keys, + v4_recap_meta_keys, + ) + + search_params = { + "type": SEARCH_TYPES.RECAP, + "order_by": "dateFiled desc", + } + # Match all query RECAP Search type HL enabled, get snippet from ES. + with override_settings(NO_MATCH_HL_SIZE=50): + r = await self._test_api_results_count( + search_params, 5, "API fields" + ) + content_to_compare = { + "result": self.rd_2, + "snippet": "Mauris iaculis, leo sit amet hendrerit vehicula, Maecenas", + "V4": True, + } + await self._test_api_fields_content( + r, + content_to_compare, + recap_type_v4_api_keys, + recap_document_v4_api_keys, + v4_recap_meta_keys, + ) + + async def test_query_matched_docket_field(self) -> None: + # Docket-only query HL search_params = { "type": SEARCH_TYPES.RECAP, - "q": f"id:{self.rd_api.pk} cause:(401 Civil) juryDemand:Plaintiff short_description:(Order Letter) plain_text:(shown in the API)", + "q": f"id:{self.rd_api.pk} cause:(401 Civil) " + f"court_citation_string:Appeals juryDemand:Plaintiff " + f"docket_id:{self.rd_api.docket_entry.docket.pk} ", } - await self._test_api_results_count(search_params, 1, "API fields") + search_query = DocketSweepDocument.search() + results, total_hits = await sync_to_async(do_es_sweep_nested_query)( + search_query, + search_params, + ) + d_field_matched = docket_field_matched(results[0]) + self.assertEqual(d_field_matched, True) + + # RECAPDocument-only query HL + search_params = { + "type": SEARCH_TYPES.RECAP, + "q": f"id:{self.rd_api.pk} short_description:(Order Letter) plain_text:(shown in the API)", + "description": "MOTION for Leave", + "document_number": "2", + } + search_query = DocketSweepDocument.search() + results, total_hits = await sync_to_async(do_es_sweep_nested_query)( + search_query, + search_params, + ) + d_field_matched = docket_field_matched(results[0]) + self.assertEqual(d_field_matched, False) diff --git a/cl/tests/cases.py b/cl/tests/cases.py index b7bfd2777a..42d025e918 100644 --- a/cl/tests/cases.py +++ b/cl/tests/cases.py @@ -10,6 +10,7 @@ from django_elasticsearch_dsl.registries import registry from lxml import etree from rest_framework.test import APITestCase +from rest_framework.utils.serializer_helpers import ReturnList from cl.lib.redis_utils import get_redis_interface from cl.search.models import SEARCH_TYPES @@ -267,7 +268,10 @@ async def _test_api_fields_content( get_expected_value, ) in fields_to_compare.items(): with self.subTest(field=field): - parent_document = api_response.data["results"][0] + if isinstance(api_response, ReturnList): + parent_document = api_response[0] + else: + parent_document = api_response.data["results"][0] actual_value = 
parent_document.get(field) if field in ["recap_documents", "opinions", "positions"]: child_document = actual_value[0] From 9307b77554b5ed6b090d3ab3503daf5a46492985 Mon Sep 17 00:00:00 2001 From: Alberto Islas Date: Mon, 24 Jun 2024 20:47:20 -0600 Subject: [PATCH 03/33] fix(alerts): Added cl_send_recap_alerts command - Test filter out queries and hits based on fields that matched. --- .../commands/cl_send_recap_alerts.py | 112 +++++++++++ cl/alerts/tests/__init__.py | 0 cl/alerts/{ => tests}/tests.py | 0 cl/alerts/tests/tests_recap_alerts.py | 190 ++++++++++++++++++ cl/alerts/utils.py | 53 ++++- cl/lib/elasticsearch_utils.py | 58 +----- cl/lib/test_helpers.py | 62 ++---- cl/search/api_serializers.py | 1 - cl/search/constants.py | 53 +++-- cl/search/documents.py | 48 +---- cl/search/tests/tests_es_recap.py | 169 +--------------- 11 files changed, 412 insertions(+), 334 deletions(-) create mode 100644 cl/alerts/management/commands/cl_send_recap_alerts.py create mode 100644 cl/alerts/tests/__init__.py rename cl/alerts/{ => tests}/tests.py (100%) create mode 100644 cl/alerts/tests/tests_recap_alerts.py diff --git a/cl/alerts/management/commands/cl_send_recap_alerts.py b/cl/alerts/management/commands/cl_send_recap_alerts.py new file mode 100644 index 0000000000..cefefb1cfb --- /dev/null +++ b/cl/alerts/management/commands/cl_send_recap_alerts.py @@ -0,0 +1,112 @@ +import traceback +import datetime + +from asgiref.sync import async_to_sync +from django.contrib.auth.models import User +from django.http import QueryDict +from django.utils.timezone import now +from elasticsearch.exceptions import RequestError, TransportError + +from cl.lib.command_utils import VerboseCommand, logger +from cl.lib.elasticsearch_utils import do_es_sweep_alert_query +from cl.search.documents import DocketSweepDocument +from cl.search.models import SEARCH_TYPES +from cl.stats.utils import tally_stat +from cl.alerts.tasks import send_search_alert_emails +from cl.alerts.models import Alert +from cl.search.exception import ( + BadProximityQuery, + UnbalancedParenthesesQuery, + UnbalancedQuotesQuery, +) +from cl.alerts.utils import recap_document_hl_matched, query_includes_rd_field + + +def index_daily_recap_documents(): + # TODO implement + pass + +def has_rd_hit_been_triggered(): + # TODO implement + return False + +def has_docket_hit_been_triggered(): + # TODO implement + return True + +def query_and_send_alerts(rate): + alert_users = User.objects.filter(alerts__rate=rate).distinct() + alerts_sent_count = 0 + now_time = datetime.datetime.now() + for user in alert_users: + alerts = user.alerts.filter(rate=rate) + logger.info(f"Running alerts for user '{user}': {alerts}") + + hits = [] + alerts_to_update = [] + for alert in alerts: + search_params = QueryDict(alert.query.encode(), mutable=True) + includes_rd_fields = query_includes_rd_field(search_params) + + try: + search_query = DocketSweepDocument.search() + results, total_hits = do_es_sweep_alert_query( + search_query, + search_params, + ) + except (UnbalancedParenthesesQuery, + UnbalancedQuotesQuery, + BadProximityQuery,TransportError, ConnectionError, RequestError): + traceback.print_exc() + logger.info( + f"Search for this alert failed: {alert.query}\n" + ) + continue + + alerts_to_update.append(alert.pk) + if len(results) > 0: + search_type = search_params.get("type", SEARCH_TYPES.OPINION) + results_to_send = [] + for hit in results: + if not includes_rd_fields: + rds_to_send = [rd_hit for rd_hit in hit["child_docs"] + if not recap_document_hl_matched( + rd_hit) and 
not has_rd_hit_been_triggered()] + if rds_to_send: + hit["child_docs"] = rds_to_send + results_to_send.append(hit) + + hits.append( + [alert, search_type, results_to_send, len(results_to_send)] + ) + alert.query_run = search_params.urlencode() + alert.date_last_hit = now() + alert.save() + + if hits: + send_search_alert_emails.delay([(user.pk, hits)]) + alerts_sent_count += 1 + + # Update Alert's date_last_hit in bulk. + Alert.objects.filter(id__in=alerts_to_update).update( + date_last_hit=now_time + ) + async_to_sync(tally_stat)(f"alerts.sent.{rate}", inc=alerts_sent_count) + logger.info(f"Sent {alerts_sent_count} {rate} email alerts.") + + +def query_and_schedule_wly_and_mly_alerts(): + # TODO implement + pass + + +class Command(VerboseCommand): + help = "Send RECAP Search Alerts." + + def handle(self, *args, **options): + super().handle(*args, **options) + + index_daily_recap_documents() + query_and_send_alerts(Alert.REAL_TIME) + query_and_send_alerts(Alert.DAILY) + query_and_schedule_wly_and_mly_alerts() diff --git a/cl/alerts/tests/__init__.py b/cl/alerts/tests/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/cl/alerts/tests.py b/cl/alerts/tests/tests.py similarity index 100% rename from cl/alerts/tests.py rename to cl/alerts/tests/tests.py diff --git a/cl/alerts/tests/tests_recap_alerts.py b/cl/alerts/tests/tests_recap_alerts.py new file mode 100644 index 0000000000..4b00408d2c --- /dev/null +++ b/cl/alerts/tests/tests_recap_alerts.py @@ -0,0 +1,190 @@ +from unittest import mock + +import time_machine +from asgiref.sync import sync_to_async +from django.core import mail +from django.core.management import call_command +from django.utils.timezone import now + +from cl.alerts.factories import AlertFactory +from cl.alerts.models import SEARCH_TYPES, Alert +from cl.alerts.utils import query_includes_rd_field, recap_document_hl_matched +from cl.api.factories import WebhookFactory +from cl.api.models import WebhookEventType +from cl.donate.models import NeonMembership +from cl.lib.elasticsearch_utils import do_es_sweep_alert_query +from cl.lib.test_helpers import RECAPSearchTestCase +from cl.search.documents import DocketSweepDocument +from cl.tests.cases import ESIndexTestCase, TestCase +from cl.tests.utils import MockResponse +from cl.users.factories import UserProfileWithParentsFactory + + +class RECAPAlertsSweepIndexTest( + RECAPSearchTestCase, ESIndexTestCase, TestCase +): + """ + RECAP Alerts Sweep Index Tests + """ + + @classmethod + def setUpTestData(cls): + cls.rebuild_index("people_db.Person") + cls.rebuild_index("search.Docket") + cls.mock_date = now().replace(day=15, hour=0) + with time_machine.travel(cls.mock_date, tick=False): + super().setUpTestData() + call_command( + "cl_index_parent_and_child_docs", + search_type=SEARCH_TYPES.RECAP, + queue="celery", + pk_offset=0, + testing_mode=True, + sweep_index=True, + ) + + cls.user_profile = UserProfileWithParentsFactory() + NeonMembership.objects.create( + level=NeonMembership.LEGACY, user=cls.user_profile.user + ) + cls.webhook_enabled = WebhookFactory( + user=cls.user_profile.user, + event_type=WebhookEventType.SEARCH_ALERT, + url="https://example.com/", + enabled=True, + ) + cls.search_alert = AlertFactory( + user=cls.user_profile.user, + rate=Alert.REAL_TIME, + name="Test Alert Docket Only", + query='q="401 Civil"&type=r', + ) + cls.search_alert_2 = AlertFactory( + user=cls.user_profile.user, + rate=Alert.REAL_TIME, + name="Test Alert RECAP Only", + query='q="Mauris iaculis, leo sit amet hendrerit 
vehicula"&type=r', + ) + cls.search_alert_3 = AlertFactory( + user=cls.user_profile.user, + rate=Alert.DAILY, + name="Test Cross object", + query="q=SUBPOENAS SERVED OFF Mauris iaculis&type=r", + ) + + async def test_recap_document_hl_matched(self) -> None: + """Test recap_document_hl_matched method that determines weather a hit + contains RECAPDocument HL fields.""" + # Docket-only query + search_params = { + "type": SEARCH_TYPES.RECAP, + "q": '"401 Civil"', + } + search_query = DocketSweepDocument.search() + results, total_hits = await sync_to_async(do_es_sweep_alert_query)( + search_query, + search_params, + ) + docket_result = results[0] + for rd in docket_result["child_docs"]: + rd_field_matched = recap_document_hl_matched(rd) + self.assertEqual(rd_field_matched, False) + + # RECAPDocument-only query + search_params = { + "type": SEARCH_TYPES.RECAP, + "q": '"Mauris iaculis, leo sit amet hendrerit vehicula"', + } + search_query = DocketSweepDocument.search() + results, total_hits = await sync_to_async(do_es_sweep_alert_query)( + search_query, + search_params, + ) + docket_result = results[0] + for rd in docket_result["child_docs"]: + rd_field_matched = recap_document_hl_matched(rd) + self.assertEqual(rd_field_matched, True) + + # Cross-object query + search_params = { + "type": SEARCH_TYPES.RECAP, + "q": "SUBPOENAS SERVED OFF Mauris iaculis", + } + search_query = DocketSweepDocument.search() + results, total_hits = await sync_to_async(do_es_sweep_alert_query)( + search_query, + search_params, + ) + docket_result = results[0] + for rd in docket_result["child_docs"]: + rd_field_matched = recap_document_hl_matched(rd) + self.assertEqual(rd_field_matched, True) + + async def test_query_includes_rd_field(self) -> None: + """Test query_includes_rd_field method that checks if a query + includes any indexed fields in the query string or filters specific to + RECAP Documents. + """ + + # Docket-only query + search_params = { + "type": SEARCH_TYPES.RECAP, + "q": '"401 Civil"', + } + self.assertEqual(query_includes_rd_field(search_params), False) + + # RECAPDocument-only query + search_params = { + "type": SEARCH_TYPES.RECAP, + "q": 'description:"lorem ipsum"', + } + self.assertEqual(query_includes_rd_field(search_params), True) + + # Cross-object query + search_params = { + "type": SEARCH_TYPES.RECAP, + "q": 'case_name:"American v." description:"lorem ipsum"', + } + self.assertEqual(query_includes_rd_field(search_params), True) + + # Docket-only query + search_params = { + "type": SEARCH_TYPES.RECAP, + "q": "", + "case_name": "SUBPOENAS", + } + self.assertEqual(query_includes_rd_field(search_params), False) + + # RECAPDocument-only query + search_params = { + "type": SEARCH_TYPES.RECAP, + "q": "", + "description": "Lorem", + } + self.assertEqual(query_includes_rd_field(search_params), True) + + # Cross-object query + search_params = { + "type": SEARCH_TYPES.RECAP, + "q": "", + "case_name": "SUBPOENAS", + "document_number": 1, + } + self.assertEqual(query_includes_rd_field(search_params), True) + + def test_filter_out_alerts_to_send(self) -> None: + """Test RECAP alerts hit can be properly filtered out according to + their query and hits matched conditions. + """ + + with mock.patch( + "cl.api.webhooks.requests.post", + side_effect=lambda *args, **kwargs: MockResponse( + 200, mock_raw=True + ), + ), time_machine.travel(self.mock_date, tick=False): + call_command("cl_send_recap_alerts") + + self.assertEqual( + len(mail.outbox), 2, msg="Outgoing emails don't match." 
+ ) diff --git a/cl/alerts/utils.py b/cl/alerts/utils.py index d287b3627a..408ae13da0 100644 --- a/cl/alerts/utils.py +++ b/cl/alerts/utils.py @@ -4,7 +4,7 @@ from django.conf import settings from django.http import QueryDict from elasticsearch_dsl import Q, Search -from elasticsearch_dsl.response import Response +from elasticsearch_dsl.response import Hit, Response from cl.alerts.models import ( SCHEDULED_ALERT_HIT_STATUS, @@ -14,9 +14,15 @@ ) from cl.lib.command_utils import logger from cl.lib.elasticsearch_utils import add_es_highlighting +from cl.lib.types import CleanData +from cl.search.constants import ( + ALERTS_HL_TAG, + SEARCH_RECAP_CHILD_HL_FIELDS, + recap_document_filters, + recap_document_indexed_fields, +) from cl.search.documents import AudioPercolator from cl.search.models import SEARCH_TYPES, Docket -from cl.users.models import UserProfile @dataclass @@ -138,3 +144,46 @@ def alert_hits_limit_reached(alert_pk: int, user_pk: int) -> bool: ) return True return False + + +def recap_document_hl_matched(rd_hit: Hit) -> bool: + """Determine whether HL matched a RECAPDocument text field. + + :param rd_hit: The ES hit. + :return: True if the hit matched a RECAPDocument field. Otherwise, False. + """ + + matched_rd_hl = set() + rd_hl_fields = set(SEARCH_RECAP_CHILD_HL_FIELDS.keys()) + if hasattr(rd_hit, "highlight"): + highlights = rd_hit.highlight.to_dict() + matched_rd_hl.update( + hl_key + for hl_key, hl_value in highlights.items() + for hl in hl_value + if f"<{ALERTS_HL_TAG}>" in hl + ) + if matched_rd_hl and matched_rd_hl.issubset(rd_hl_fields): + return True + return False + + +def query_includes_rd_field(query_params: CleanData) -> bool: + """Determine whether the query includes any indexed fields in the query + string or filters specific to RECAP Documents. + + :param query_params: The query parameters. + :return: True if any recap document fields or filters are included in the + query, otherwise False. 
+ """ + + query_string = query_params.get("q", "") + for rd_field in recap_document_indexed_fields: + if f"{rd_field}:" in query_string: + return True + + for rd_filter in recap_document_filters: + if query_params.get(rd_filter, ""): + return True + + return False diff --git a/cl/lib/elasticsearch_utils.py b/cl/lib/elasticsearch_utils.py index 3280abc449..b4eab7c64b 100644 --- a/cl/lib/elasticsearch_utils.py +++ b/cl/lib/elasticsearch_utils.py @@ -58,8 +58,6 @@ PEOPLE_ES_HL_FIELDS, PEOPLE_ES_HL_KEYWORD_FIELDS, RELATED_PATTERN, - SEARCH_ALERTS_DOCKET_HL_FIELDS, - SEARCH_ALERTS_DOCKET_KEYWORDS_HL_FIELDS, SEARCH_ALERTS_ORAL_ARGUMENT_ES_HL_FIELDS, SEARCH_HL_TAG, SEARCH_OPINION_HL_FIELDS, @@ -1288,7 +1286,7 @@ def build_child_docs_query( query for query in parent_filters if isinstance(query, QueryString) - and query.fields[0] in ["party", "attorney", "firm"] + and query.fields[0] in ["party", "attorney"] ] parties_has_parent_query = build_has_parent_parties_query(parties_filters) @@ -1463,15 +1461,7 @@ def add_es_highlighting( highlighting_fields = PEOPLE_ES_HL_FIELDS highlighting_keyword_fields = PEOPLE_ES_HL_KEYWORD_FIELDS case SEARCH_TYPES.RECAP | SEARCH_TYPES.DOCKETS: - highlighting_fields = ( - SEARCH_ALERTS_DOCKET_HL_FIELDS - if alerts - else SEARCH_RECAP_HL_FIELDS - ) - if alerts: - highlighting_keyword_fields = ( - SEARCH_ALERTS_DOCKET_KEYWORDS_HL_FIELDS - ) + highlighting_fields = SEARCH_RECAP_HL_FIELDS case SEARCH_TYPES.OPINION: highlighting_fields = SEARCH_OPINION_HL_FIELDS @@ -2112,7 +2102,6 @@ def build_join_es_filters(cd: CleanData) -> List: *build_text_filter("referredTo", cd.get("referred_to", "")), *build_text_filter("party", cd.get("party_name", "")), *build_text_filter("attorney", cd.get("atty_name", "")), - *build_text_filter("firm", cd.get("firm_name", "")), *build_daterange_query( "dateFiled", cd.get("filed_before", ""), @@ -2393,7 +2382,7 @@ def build_full_join_es_queries( query for query in parent_filters if isinstance(query, QueryString) - and query.fields[0] in ["party", "attorney", "firm"] + and query.fields[0] in ["party", "attorney"] ] has_parent_parties_filter = build_has_parent_parties_query( parties_filters @@ -3039,7 +3028,7 @@ def do_es_alert_estimation_query( return estimation_query.count() -def do_es_sweep_nested_query( +def do_es_sweep_alert_query( search_query: Search, cd: CleanData, ) -> tuple[list[Hit] | None, int | None]: @@ -3058,21 +3047,16 @@ def do_es_sweep_nested_query( else: return None, None - hits = None - try: - s, _ = build_es_base_query(search_query, cd, True, alerts=True) - except ( - UnbalancedParenthesesQuery, - UnbalancedQuotesQuery, - BadProximityQuery, - ) as e: - raise ElasticBadRequestError(detail=e.message) + total_hits = None + + s, _ = build_es_base_query(search_query, cd, True, alerts=True) + main_query = add_es_highlighting(s, cd, alerts=True) main_query = main_query.sort(build_sort_results(cd)) main_query = main_query.extra(from_=0, size=30) results = main_query.execute() if results: - hits = results.hits.total.value + total_hits = results.hits.total.value limit_inner_hits({}, results, cd["type"]) set_results_highlights(results, cd["type"]) @@ -3081,27 +3065,7 @@ def do_es_sweep_nested_query( child_result_objects = [] if hasattr(result, "child_docs"): for child_doc in result.child_docs: - child_result_objects.append( - defaultdict(lambda: None, child_doc["_source"].to_dict()) - ) + child_result_objects.append(child_doc.to_dict()) result["child_docs"] = child_result_objects - return results, hits - - -def docket_field_matched(hit: 
Hit) -> bool: - """Determine whether HL matched a Docket field. - - :param hit: The ES hit. - :return: True if the hit matched a Docket field. Otherwise, False. - """ - - plain_hl = set(SEARCH_ALERTS_DOCKET_KEYWORDS_HL_FIELDS) - vector_hl = set(SEARCH_ALERTS_DOCKET_HL_FIELDS.keys()) - docket_hl = set() - if hasattr(hit.meta, "highlight"): - highlights = hit.meta.highlight.to_dict() - docket_hl = set([hl for hl in highlights.keys()]) - if docket_hl.issubset(plain_hl.union(vector_hl)): - return True - return False + return results, total_hits diff --git a/cl/lib/test_helpers.py b/cl/lib/test_helpers.py index d7e1c49ebc..69976430f1 100644 --- a/cl/lib/test_helpers.py +++ b/cl/lib/test_helpers.py @@ -278,19 +278,15 @@ def midnight_pt_test(d: datetime.date) -> datetime.datetime: ), "court_id": lambda x: x["result"].docket_entry.docket.court.pk, "dateArgued": lambda x: ( - x["dateArgued"] - if x.get("dateArgued") - else ( - ( - x["result"].docket_entry.docket.date_argued.isoformat() - if x.get("V4") - else midnight_pt_test( - x["result"].docket_entry.docket.date_argued - ).isoformat() - ) - if x["result"].docket_entry.docket.date_argued - else None + ( + x["result"].docket_entry.docket.date_argued.isoformat() + if x.get("V4") + else midnight_pt_test( + x["result"].docket_entry.docket.date_argued + ).isoformat() ) + if x["result"].docket_entry.docket.date_argued + else None ), "dateFiled": lambda x: ( ( @@ -319,11 +315,7 @@ def midnight_pt_test(d: datetime.date) -> datetime.datetime: if x.get("docketNumber") else x["result"].docket_entry.docket.docket_number ), - "docket_id": lambda x: ( - x["docket_id"] - if x.get("docket_id") - else x["result"].docket_entry.docket_id - ), + "docket_id": lambda x: x["result"].docket_entry.docket_id, "jurisdictionType": lambda x: x[ "result" ].docket_entry.docket.jurisdiction_type, @@ -356,14 +348,10 @@ def midnight_pt_test(d: datetime.date) -> datetime.datetime: recap_type_v4_api_keys = docket_api_common_keys.copy() recap_type_v4_api_keys.update( { - "attorney": lambda x: ( - x["attorney"] - if x.get("attorney") - else list( - DocketDocument().prepare_parties( - x["result"].docket_entry.docket - )["attorney"] - ) + "attorney": lambda x: list( + DocketDocument().prepare_parties(x["result"].docket_entry.docket)[ + "attorney" + ] ), "attorney_id": lambda x: list( DocketDocument().prepare_parties(x["result"].docket_entry.docket)[ @@ -383,14 +371,10 @@ def midnight_pt_test(d: datetime.date) -> datetime.datetime: "docket_absolute_url": lambda x: x[ "result" ].docket_entry.docket.get_absolute_url(), - "firm": lambda x: ( - x["firm"] - if x.get("firm") - else list( - DocketDocument().prepare_parties( - x["result"].docket_entry.docket - )["firm"] - ) + "firm": lambda x: list( + DocketDocument().prepare_parties(x["result"].docket_entry.docket)[ + "firm" + ] ), "firm_id": lambda x: list( DocketDocument().prepare_parties(x["result"].docket_entry.docket)[ @@ -402,14 +386,10 @@ def midnight_pt_test(d: datetime.date) -> datetime.datetime: if x["result"].docket_entry.docket.pacer_case_id else "" ), - "party": lambda x: ( - x["party"] - if x.get("party") - else list( - DocketDocument().prepare_parties( - x["result"].docket_entry.docket - )["party"] - ) + "party": lambda x: list( + DocketDocument().prepare_parties(x["result"].docket_entry.docket)[ + "party" + ] ), "party_id": lambda x: list( DocketDocument().prepare_parties(x["result"].docket_entry.docket)[ diff --git a/cl/search/api_serializers.py b/cl/search/api_serializers.py index f5abc46b93..1f9cbb7d75 100644 --- 
a/cl/search/api_serializers.py +++ b/cl/search/api_serializers.py @@ -528,7 +528,6 @@ class BaseDocketESResultSerializer(DocumentSerializer): attorney = NoneToListField(read_only=True, required=False) firm_id = NoneToListField(read_only=True, required=False) firm = NoneToListField(read_only=True, required=False) - docket_id = HighlightedField(read_only=True) class Meta: document = DocketDocument diff --git a/cl/search/constants.py b/cl/search/constants.py index 0457e53799..2d0d011962 100644 --- a/cl/search/constants.py +++ b/cl/search/constants.py @@ -155,35 +155,6 @@ "referredTo": 0, "suitNature": 0, } - -SEARCH_ALERTS_DOCKET_HL_FIELDS = { - "assignedTo": 0, - "caseName": 0, - "cause": 0, - "court_citation_string": 0, - "docketNumber": 0, - "juryDemand": 0, - "referredTo": 0, - "suitNature": 0, - "party": 0, - "attorney": 0, - "firm": 0, -} - -SEARCH_ALERTS_DOCKET_KEYWORDS_HL_FIELDS = { - "docket_id", - "court_id", - "firm_id", - "assigned_to_id", - "referred_to_id", - "dateFiled", - "dateArgued", - "dateTerminated", - "jurisdictionType", - "chapter", - "trustee_str", -} - SEARCH_OPINION_HL_FIELDS = { "caseName": 0, "citation": 0, @@ -321,3 +292,27 @@ Opinion.ON_MOTION_TO_STRIKE: "on-motion-to-strike", Opinion.TRIAL_COURT: "trial-court-document", } + +recap_document_indexed_fields = [ + "id", + "docket_entry_id", + "description", + "entry_number", + "entry_date_filed", + "short_description", + "document_type", + "document_number", + "pacer_doc_id", + "plain_text", + "attachment_number", + "is_available", + "page_count", + "cites", +] + +recap_document_filters = [ + "available_only", + "description", + "document_number", + "attachment_number", +] diff --git a/cl/search/documents.py b/cl/search/documents.py index 957f03034e..d64f4eb724 100644 --- a/cl/search/documents.py +++ b/cl/search/documents.py @@ -790,7 +790,7 @@ class DocketBaseDocument(Document): timestamp = fields.DateField() # Docket Fields - docket_id = fields.KeywordField(attr="pk") + docket_id = fields.IntegerField(attr="pk") caseName = fields.TextField( analyzer="text_en_splitting_cl", term_vector="with_positions_offsets", @@ -1831,51 +1831,6 @@ def prepare_cluster_child(self, instance): @recap_sweep_index.document class DocketSweepDocument(DocketDocument): - party = fields.ListField( - fields.TextField( - analyzer="text_en_splitting_cl", - term_vector="with_positions_offsets", - fields={ - "exact": fields.TextField( - analyzer="english_exact", - term_vector="with_positions_offsets", - search_analyzer="search_analyzer_exact", - ), - }, - search_analyzer="search_analyzer", - multi=True, - ) - ) - attorney = fields.ListField( - fields.TextField( - analyzer="text_en_splitting_cl", - term_vector="with_positions_offsets", - fields={ - "exact": fields.TextField( - analyzer="english_exact", - term_vector="with_positions_offsets", - search_analyzer="search_analyzer_exact", - ), - }, - search_analyzer="search_analyzer", - multi=True, - ) - ) - firm = fields.ListField( - fields.TextField( - analyzer="text_en_splitting_cl", - term_vector="with_positions_offsets", - fields={ - "exact": fields.TextField( - analyzer="english_exact", - term_vector="with_positions_offsets", - search_analyzer="search_analyzer_exact", - ), - }, - search_analyzer="search_analyzer", - multi=True, - ) - ) class Django: model = Docket @@ -1884,6 +1839,7 @@ class Django: @recap_sweep_index.document class ESRECAPSweepDocument(ESRECAPDocument): + class Django: model = RECAPDocument ignore_signals = True diff --git a/cl/search/tests/tests_es_recap.py 
b/cl/search/tests/tests_es_recap.py index 2cc46a3842..389193b204 100644 --- a/cl/search/tests/tests_es_recap.py +++ b/cl/search/tests/tests_es_recap.py @@ -19,8 +19,6 @@ from cl.lib.elasticsearch_utils import ( build_es_main_query, - do_es_sweep_nested_query, - docket_field_matched, fetch_es_results, merge_unavailable_fields_on_parent_document, set_results_highlights, @@ -50,12 +48,7 @@ RECAPESResultSerializer, ) from cl.search.api_views import SearchV4ViewSet -from cl.search.documents import ( - ES_CHILD_ID, - DocketDocument, - DocketSweepDocument, - ESRECAPDocument, -) +from cl.search.documents import ES_CHILD_ID, DocketDocument, ESRECAPDocument from cl.search.factories import ( BankruptcyInformationFactory, CourtFactory, @@ -6679,163 +6672,3 @@ def test_recap_history_table_indexing(self) -> None: ) if keys: self.r.delete(*keys) - - -class RECAPSearchSweepIndexTest( - RECAPSearchAPICommonTests, ESIndexTestCase, TestCase, V4SearchAPIAssertions -): - """ - RECAP Sweep Index Tests - """ - - version_api = "v4" - skip_common_tests = False - - @classmethod - def setUpTestData(cls): - cls.rebuild_index("people_db.Person") - cls.rebuild_index("search.Docket") - cls.mock_date = now().replace(day=15, hour=0) - with time_machine.travel(cls.mock_date, tick=False): - super().setUpTestData() - call_command( - "cl_index_parent_and_child_docs", - search_type=SEARCH_TYPES.RECAP, - queue="celery", - pk_offset=0, - testing_mode=True, - sweep_index=True, - ) - - async def _test_api_results_count( - self, params, expected_count, field_name - ): - - search_query = DocketSweepDocument.search() - results, total_hits = await sync_to_async(do_es_sweep_nested_query)( - search_query, - params, - ) - results = RECAPESResultSerializer(results, many=True).data - got = len(results) - self.assertEqual( - got, - expected_count, - msg="Did not get the right number of search results in API with %s " - "filter applied.\n" - "Expected: %s\n" - " Got: %s\n\n" - "Params were: %s" % (field_name, expected_count, got, params), - ) - return results - - async def test_cross_object_string_query_and_hl(self) -> None: - """Confirm a cross-object string query return the right results and - highlighting is properly applied. - """ - - # Docket-only query HL - search_params = { - "type": SEARCH_TYPES.RECAP, - "q": f"id:{self.rd_api.pk} cause:(401 Civil) " - f"court_citation_string:Appeals juryDemand:Plaintiff " - f"docket_id:{self.rd_api.docket_entry.docket.pk} " - f"dateArgued:[2022-05-19T00:00:00Z TO 2022-05-21T00:00:00Z]", - "assigned_to": "George", - "referred_to": "George", - "case_name": "America vs API", - "docket_number": "1:24-bk-0000", - "nature_of_suit": "569", - "party_name": "Defendant John Doe", - "atty_name": "John Doe", - "firm_name": "Associates America", - } - - # RECAP Search type HL disabled. - r = await self._test_api_results_count(search_params, 1, "API fields") - keys_count = len(r[0]) - self.assertEqual(keys_count, len(recap_type_v4_api_keys)) - rd_keys_count = len(r[0]["recap_documents"][0]) - self.assertEqual(rd_keys_count, len(recap_document_v4_api_keys)) - - content_to_compare = { - "result": self.rd_api, - "V4": True, - "assignedTo": "George Doe II", - "caseName": "America vs API Lorem", - "cause": "401 Civil", - "court_citation_string": "Appeals. 
CA9.", - "docketNumber": "1:24-bk-0000", - "juryDemand": "Plaintiff", - "referredTo": "George Doe II", - "suitNature": "569", - "party": [ - "Defendant John Doe" - ], - "firm": ["Associates America"], - "attorney": ["John Doe"], - "docket_id": f"{self.rd_api.docket_entry.docket.pk}", - "dateArgued": f"2022-05-19", - } - await self._test_api_fields_content( - r, - content_to_compare, - recap_type_v4_api_keys, - recap_document_v4_api_keys, - v4_recap_meta_keys, - ) - - search_params = { - "type": SEARCH_TYPES.RECAP, - "order_by": "dateFiled desc", - } - # Match all query RECAP Search type HL enabled, get snippet from ES. - with override_settings(NO_MATCH_HL_SIZE=50): - r = await self._test_api_results_count( - search_params, 5, "API fields" - ) - content_to_compare = { - "result": self.rd_2, - "snippet": "Mauris iaculis, leo sit amet hendrerit vehicula, Maecenas", - "V4": True, - } - await self._test_api_fields_content( - r, - content_to_compare, - recap_type_v4_api_keys, - recap_document_v4_api_keys, - v4_recap_meta_keys, - ) - - async def test_query_matched_docket_field(self) -> None: - - # Docket-only query HL - search_params = { - "type": SEARCH_TYPES.RECAP, - "q": f"id:{self.rd_api.pk} cause:(401 Civil) " - f"court_citation_string:Appeals juryDemand:Plaintiff " - f"docket_id:{self.rd_api.docket_entry.docket.pk} ", - } - - search_query = DocketSweepDocument.search() - results, total_hits = await sync_to_async(do_es_sweep_nested_query)( - search_query, - search_params, - ) - d_field_matched = docket_field_matched(results[0]) - self.assertEqual(d_field_matched, True) - - # RECAPDocument-only query HL - search_params = { - "type": SEARCH_TYPES.RECAP, - "q": f"id:{self.rd_api.pk} short_description:(Order Letter) plain_text:(shown in the API)", - "description": "MOTION for Leave", - "document_number": "2", - } - search_query = DocketSweepDocument.search() - results, total_hits = await sync_to_async(do_es_sweep_nested_query)( - search_query, - search_params, - ) - d_field_matched = docket_field_matched(results[0]) - self.assertEqual(d_field_matched, False) From 9b4e1c1aa58c92f7e195b64fc866295288874311 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 25 Jun 2024 02:48:20 +0000 Subject: [PATCH 04/33] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../commands/cl_send_recap_alerts.py | 39 ++++++++++++------- 1 file changed, 24 insertions(+), 15 deletions(-) diff --git a/cl/alerts/management/commands/cl_send_recap_alerts.py b/cl/alerts/management/commands/cl_send_recap_alerts.py index cefefb1cfb..3399a6801f 100644 --- a/cl/alerts/management/commands/cl_send_recap_alerts.py +++ b/cl/alerts/management/commands/cl_send_recap_alerts.py @@ -1,5 +1,5 @@ -import traceback import datetime +import traceback from asgiref.sync import async_to_sync from django.contrib.auth.models import User @@ -7,33 +7,36 @@ from django.utils.timezone import now from elasticsearch.exceptions import RequestError, TransportError +from cl.alerts.models import Alert +from cl.alerts.tasks import send_search_alert_emails +from cl.alerts.utils import query_includes_rd_field, recap_document_hl_matched from cl.lib.command_utils import VerboseCommand, logger from cl.lib.elasticsearch_utils import do_es_sweep_alert_query from cl.search.documents import DocketSweepDocument -from cl.search.models import SEARCH_TYPES -from cl.stats.utils import tally_stat -from cl.alerts.tasks import send_search_alert_emails -from 
cl.alerts.models import Alert from cl.search.exception import ( BadProximityQuery, UnbalancedParenthesesQuery, UnbalancedQuotesQuery, ) -from cl.alerts.utils import recap_document_hl_matched, query_includes_rd_field +from cl.search.models import SEARCH_TYPES +from cl.stats.utils import tally_stat def index_daily_recap_documents(): # TODO implement pass + def has_rd_hit_been_triggered(): # TODO implement return False + def has_docket_hit_been_triggered(): # TODO implement return True + def query_and_send_alerts(rate): alert_users = User.objects.filter(alerts__rate=rate).distinct() alerts_sent_count = 0 @@ -54,13 +57,16 @@ def query_and_send_alerts(rate): search_query, search_params, ) - except (UnbalancedParenthesesQuery, - UnbalancedQuotesQuery, - BadProximityQuery,TransportError, ConnectionError, RequestError): + except ( + UnbalancedParenthesesQuery, + UnbalancedQuotesQuery, + BadProximityQuery, + TransportError, + ConnectionError, + RequestError, + ): traceback.print_exc() - logger.info( - f"Search for this alert failed: {alert.query}\n" - ) + logger.info(f"Search for this alert failed: {alert.query}\n") continue alerts_to_update.append(alert.pk) @@ -69,9 +75,12 @@ def query_and_send_alerts(rate): results_to_send = [] for hit in results: if not includes_rd_fields: - rds_to_send = [rd_hit for rd_hit in hit["child_docs"] - if not recap_document_hl_matched( - rd_hit) and not has_rd_hit_been_triggered()] + rds_to_send = [ + rd_hit + for rd_hit in hit["child_docs"] + if not recap_document_hl_matched(rd_hit) + and not has_rd_hit_been_triggered() + ] if rds_to_send: hit["child_docs"] = rds_to_send results_to_send.append(hit) From 8b537f0b8fd4f272d6ce6439672e7b6c29bf5be8 Mon Sep 17 00:00:00 2001 From: Alberto Islas Date: Wed, 26 Jun 2024 20:34:19 -0600 Subject: [PATCH 05/33] fix(alerts): Implemented filtering of RECAP alerts hits for the sweep index command. 
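A rough sketch of the per-hit filtering flow this command now applies
(helper names are the ones introduced in this patch; the wrapper name
`filter_hit_for_alert`, the Redis connection `r`, and the parsed alert
are hypothetical glue added here for illustration only):

    # Sketch only, not the final implementation: decide what, if anything,
    # to send for one ES hit of a RECAP alert.
    def filter_hit_for_alert(r, alert, hit, includes_rd_fields):
        if not includes_rd_fields:
            # Possible Docket-only alert: keep RDs whose HLs matched RD
            # fields and that haven't triggered this alert before.
            rds = filter_rd_alert_hits(
                r, alert.pk, hit["child_docs"], check_rd_hl=True
            )
            if rds:
                # Cross-object hit.
                hit["child_docs"] = rds
                return hit
            if should_docket_hit_be_included(r, alert.pk, hit.docket_id):
                # Docket-only hit: send it without child documents.
                hit["child_docs"] = []
                add_document_hit_to_alert_set(r, alert.pk, "d", hit.docket_id)
                return hit
            return None
        # RECAP-only or cross-object alert: keep only RDs that haven't
        # triggered this alert before.
        rds = filter_rd_alert_hits(r, alert.pk, hit["child_docs"])
        if rds:
            hit["child_docs"] = rds
            return hit
        return None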
--- .../commands/cl_send_recap_alerts.py | 116 ++++++-- cl/alerts/tests/tests_recap_alerts.py | 263 ++++++++++++++++-- cl/alerts/utils.py | 42 +++ 3 files changed, 377 insertions(+), 44 deletions(-) diff --git a/cl/alerts/management/commands/cl_send_recap_alerts.py b/cl/alerts/management/commands/cl_send_recap_alerts.py index 3399a6801f..537626bc17 100644 --- a/cl/alerts/management/commands/cl_send_recap_alerts.py +++ b/cl/alerts/management/commands/cl_send_recap_alerts.py @@ -6,19 +6,26 @@ from django.http import QueryDict from django.utils.timezone import now from elasticsearch.exceptions import RequestError, TransportError +from redis import Redis from cl.alerts.models import Alert from cl.alerts.tasks import send_search_alert_emails -from cl.alerts.utils import query_includes_rd_field, recap_document_hl_matched +from cl.alerts.utils import ( + add_document_hit_to_alert_set, + has_document_alert_hit_been_triggered, + query_includes_rd_field, + recap_document_hl_matched, +) from cl.lib.command_utils import VerboseCommand, logger from cl.lib.elasticsearch_utils import do_es_sweep_alert_query +from cl.lib.redis_utils import get_redis_interface from cl.search.documents import DocketSweepDocument from cl.search.exception import ( BadProximityQuery, UnbalancedParenthesesQuery, UnbalancedQuotesQuery, ) -from cl.search.models import SEARCH_TYPES +from cl.search.models import SEARCH_TYPES, Docket from cl.stats.utils import tally_stat @@ -27,22 +34,65 @@ def index_daily_recap_documents(): pass -def has_rd_hit_been_triggered(): - # TODO implement +def should_docket_hit_be_included( + r: Redis, alert_id: int, docket_id: int +) -> bool: + """Determine if a Docket alert should be triggered based on its + date_modified and if the docket has triggered the alert previously. + + :param r: The Redis interface. + :param alert_id: The ID of the alert. + :param docket_id: The ID of the docket. + :return: True if the Docket alert should be triggered, False otherwise. + """ + docket = Docket.objects.filter(id=docket_id).only("date_modified").first() + if not docket: + return False + date_modified = docket.date_modified.date() + if not has_document_alert_hit_been_triggered(r, alert_id, "d", docket_id): + if date_modified == now().date(): + return True return False -def has_docket_hit_been_triggered(): - # TODO implement - return True +def filter_rd_alert_hits(r, alert_id, rd_hits, check_rd_hl=False): + """Filter RECAP document hits based on specified conditions. + + :param r: The Redis interface. + :param alert_id: The ID of the alert. + :param rd_hits: A list of RECAP document hits to be processed. + :param check_rd_hl: A boolean indicating whether to check if the RECAP + document hit matched RD HLs. + :return: A list of RECAP document hits that meet all specified conditions. 
+ """ + + rds_to_send = [] + for rd_hit in rd_hits: + conditions = [ + not has_document_alert_hit_been_triggered( + r, alert_id, "r", rd_hit["_source"]["id"] + ) + ] + if check_rd_hl: + conditions.append(recap_document_hl_matched(rd_hit)) + if all(conditions): + rds_to_send.append(rd_hit) + add_document_hit_to_alert_set( + r, alert_id, "r", rd_hit["_source"]["id"] + ) + return rds_to_send def query_and_send_alerts(rate): + r = get_redis_interface("CACHE") alert_users = User.objects.filter(alerts__rate=rate).distinct() alerts_sent_count = 0 now_time = datetime.datetime.now() for user in alert_users: - alerts = user.alerts.filter(rate=rate) + if rate == Alert.REAL_TIME: + if not user.profile.is_member: + continue + alerts = user.alerts.filter(rate=rate, alert_type=SEARCH_TYPES.RECAP) logger.info(f"Running alerts for user '{user}': {alerts}") hits = [] @@ -50,7 +100,6 @@ def query_and_send_alerts(rate): for alert in alerts: search_params = QueryDict(alert.query.encode(), mutable=True) includes_rd_fields = query_includes_rd_field(search_params) - try: search_query = DocketSweepDocument.search() results, total_hits = do_es_sweep_alert_query( @@ -71,26 +120,48 @@ def query_and_send_alerts(rate): alerts_to_update.append(alert.pk) if len(results) > 0: - search_type = search_params.get("type", SEARCH_TYPES.OPINION) + search_type = search_params.get("type", SEARCH_TYPES.RECAP) results_to_send = [] for hit in results: if not includes_rd_fields: - rds_to_send = [ - rd_hit - for rd_hit in hit["child_docs"] - if not recap_document_hl_matched(rd_hit) - and not has_rd_hit_been_triggered() - ] + # Possible Docket-only query + rds_to_send = filter_rd_alert_hits( + r, alert.pk, hit["child_docs"], check_rd_hl=True + ) if rds_to_send: + # Cross-object query + hit["child_docs"] = rds_to_send + results_to_send.append(hit) + elif should_docket_hit_be_included( + r, alert.pk, hit.docket_id + ): + hit["child_docs"] = [] + results_to_send.append(hit) + add_document_hit_to_alert_set( + r, alert.pk, "d", hit.docket_id + ) + else: + # RECAP-only alerts or cross-object alerts + rds_to_send = filter_rd_alert_hits( + r, alert.pk, hit["child_docs"] + ) + if rds_to_send: + # Cross-object query hit["child_docs"] = rds_to_send results_to_send.append(hit) - hits.append( - [alert, search_type, results_to_send, len(results_to_send)] - ) - alert.query_run = search_params.urlencode() - alert.date_last_hit = now() - alert.save() + if results_to_send: + hits.append( + [ + alert, + search_type, + results_to_send, + len(results_to_send), + ] + ) + alert.query_run = search_params.urlencode() + alert.date_last_hit = now() + alert.save() if hits: send_search_alert_emails.delay([(user.pk, hits)]) @@ -114,7 +185,6 @@ class Command(VerboseCommand): def handle(self, *args, **options): super().handle(*args, **options) - index_daily_recap_documents() query_and_send_alerts(Alert.REAL_TIME) query_and_send_alerts(Alert.DAILY) diff --git a/cl/alerts/tests/tests_recap_alerts.py b/cl/alerts/tests/tests_recap_alerts.py index 4b00408d2c..d3b345f36e 100644 --- a/cl/alerts/tests/tests_recap_alerts.py +++ b/cl/alerts/tests/tests_recap_alerts.py @@ -1,3 +1,4 @@ +import datetime from unittest import mock import time_machine @@ -15,6 +16,12 @@ from cl.lib.elasticsearch_utils import do_es_sweep_alert_query from cl.lib.test_helpers import RECAPSearchTestCase from cl.search.documents import DocketSweepDocument +from cl.search.factories import ( + DocketEntryWithParentsFactory, + DocketFactory, + RECAPDocumentFactory, +) +from cl.search.models import Docket 
 from cl.tests.cases import ESIndexTestCase, TestCase
 from cl.tests.utils import MockResponse
 from cl.users.factories import UserProfileWithParentsFactory
@@ -31,7 +38,7 @@ class RECAPAlertsSweepIndexTest(
     def setUpTestData(cls):
         cls.rebuild_index("people_db.Person")
         cls.rebuild_index("search.Docket")
-        cls.mock_date = now().replace(day=15, hour=0)
+        cls.mock_date = now()
         with time_machine.travel(cls.mock_date, tick=False):
             super().setUpTestData()
             call_command(
@@ -47,30 +54,26 @@ def setUpTestData(cls):
         NeonMembership.objects.create(
             level=NeonMembership.LEGACY, user=cls.user_profile.user
         )
+        cls.user_profile_2 = UserProfileWithParentsFactory()
+        NeonMembership.objects.create(
+            level=NeonMembership.LEGACY, user=cls.user_profile_2.user
+        )
+        cls.user_profile_no_member = UserProfileWithParentsFactory()
         cls.webhook_enabled = WebhookFactory(
             user=cls.user_profile.user,
             event_type=WebhookEventType.SEARCH_ALERT,
             url="https://example.com/",
             enabled=True,
         )
-        cls.search_alert = AlertFactory(
-            user=cls.user_profile.user,
-            rate=Alert.REAL_TIME,
-            name="Test Alert Docket Only",
-            query='q="401 Civil"&type=r',
-        )
-        cls.search_alert_2 = AlertFactory(
-            user=cls.user_profile.user,
-            rate=Alert.REAL_TIME,
-            name="Test Alert RECAP Only",
-            query='q="Mauris iaculis, leo sit amet hendrerit vehicula"&type=r',
-        )
-        cls.search_alert_3 = AlertFactory(
-            user=cls.user_profile.user,
-            rate=Alert.DAILY,
-            name="Test Cross object",
-            query="q=SUBPOENAS SERVED OFF Mauris iaculis&type=r",
-        )
+
+    @staticmethod
+    def get_html_content_from_email(email_content):
+        html_content = None
+        for content, content_type in email_content.alternatives:
+            if content_type == "text/html":
+                html_content = content
+                break
+        return html_content

     async def test_recap_document_hl_matched(self) -> None:
         """Test recap_document_hl_matched method that determines whether a hit
@@ -172,11 +175,83 @@ async def test_query_includes_rd_field(self) -> None:
         }
         self.assertEqual(query_includes_rd_field(search_params), True)

-    def test_filter_out_alerts_to_send(self) -> None:
-        """Test RECAP alerts hit can be properly filtered out according to
+    def test_filter_recap_alerts_to_send(self) -> None:
+        """Test filtering RECAP alerts that meet the conditions to be sent:
+        - RECAP type alert.
+        - RT or DLY rate.
+        - For RT rate, the user must have an active membership.
+        """
+
+        rt_recap_alert = AlertFactory(
+            user=self.user_profile.user,
+            rate=Alert.REAL_TIME,
+            name="Test RT RECAP Alert",
+            query='q="401 Civil"&type=r',
+        )
+        dly_recap_alert = AlertFactory(
+            user=self.user_profile.user,
+            rate=Alert.DAILY,
+            name="Test DLY RECAP Alert",
+            query='q="401 Civil"&type=r',
+        )
+        AlertFactory(
+            user=self.user_profile_2.user,
+            rate=Alert.REAL_TIME,
+            name="Test RT Opinion Alert",
+            query='q="401 Civil"',
+        )
+        AlertFactory(
+            user=self.user_profile_no_member.user,
+            rate=Alert.REAL_TIME,
+            name="Test RT RECAP Alert no Member",
+            query='q="401 Civil"&type=r',
+        )
+
+        with mock.patch(
+            "cl.api.webhooks.requests.post",
+            side_effect=lambda *args, **kwargs: MockResponse(
+                200, mock_raw=True
+            ),
+        ), time_machine.travel(self.mock_date, tick=False):
+            call_command("cl_send_recap_alerts")
+
+        # Only the RECAP RT alert for a member and the RECAP DLY alert are sent.
+        self.assertEqual(
+            len(mail.outbox), 2, msg="Outgoing emails don't match."
+ ) + html_content = self.get_html_content_from_email(mail.outbox[0]) + self.assertIn(rt_recap_alert.name, html_content) + + html_content = self.get_html_content_from_email(mail.outbox[1]) + self.assertIn(dly_recap_alert.name, html_content) + + def test_filter_out_alerts_to_send_by_query_and_hits(self) -> None: + """Test RECAP alerts can be properly filtered out according to their query and hits matched conditions. + + - Docket-only Alerts should be triggered only if the Docket was + modified on the day. This prevents sending Alerts due to related + RDs added on the same day which can match the query due to parent + fields indexed into the RDs. + - The Docket or RD shouldn’t have triggered the alert previously. + - RECAP-only Alerts should only include RDs that have not triggered the + same alert previously. If there are no hits after filtering RDs, + don’t send the alert. + - Cross-object queries should only include RDs that have not triggered + the same alert previously. If there are no hits after filtering RDs, + don’t send the alert. + + Assert the content structure accordingly. """ + # This docket-only alert, matches a Docket added today. + + docket_only_alert = AlertFactory( + user=self.user_profile.user, + rate=Alert.REAL_TIME, + name="Test Alert Docket Only", + query='q="401 Civil"&type=r', + ) with mock.patch( "cl.api.webhooks.requests.post", side_effect=lambda *args, **kwargs: MockResponse( @@ -185,6 +260,152 @@ def test_filter_out_alerts_to_send(self) -> None: ), time_machine.travel(self.mock_date, tick=False): call_command("cl_send_recap_alerts") + self.assertEqual( + len(mail.outbox), 1, msg="Outgoing emails don't match." + ) + html_content = self.get_html_content_from_email(mail.outbox[0]) + self.assertIn(docket_only_alert.name, html_content) + + # This test shouldn't match the Docket-only alert when the RD is added + # today since its parent Docket was not modified today. + AlertFactory( + user=self.user_profile_2.user, + rate=Alert.REAL_TIME, + name="Test Alert Docket Only Not Triggered", + query='q="405 Civil"&type=r', + ) + one_day_before = now() - datetime.timedelta(days=1) + mock_date = one_day_before.replace(hour=5) + with time_machine.travel(mock_date, tick=False): + docket = DocketFactory( + court=self.court, + case_name="SUBPOENAS SERVED CASE", + case_name_full="Jackson & Sons Holdings vs. Bank", + docket_number="1:21-bk-1234", + nature_of_suit="440", + source=Docket.RECAP, + cause="405 Civil", + jurisdiction_type="'U.S. Government Defendant", + jury_demand="1,000,000", + ) + + mock_date = now().replace(hour=5) + with time_machine.travel(mock_date, tick=False): + de = DocketEntryWithParentsFactory( + docket=docket, + entry_number=1, + date_filed=datetime.date(2024, 8, 19), + description="MOTION for Leave to File Amicus Curiae Lorem Served", + ) + rd = RECAPDocumentFactory( + docket_entry=de, + description="Motion to File", + document_number="1", + is_available=True, + page_count=5, + pacer_doc_id="018036652436", + plain_text="plain text for 018036652436", + ) + + call_command( + "cl_index_parent_and_child_docs", + search_type=SEARCH_TYPES.RECAP, + queue="celery", + pk_offset=0, + testing_mode=True, + sweep_index=True, + ) + + with mock.patch( + "cl.api.webhooks.requests.post", + side_effect=lambda *args, **kwargs: MockResponse( + 200, mock_raw=True + ), + ), time_machine.travel(self.mock_date, tick=False): + call_command("cl_send_recap_alerts") + # No new alert should be triggered. + self.assertEqual( + len(mail.outbox), 1, msg="Outgoing emails don't match." 
+        )
+
+        recap_only_alert = AlertFactory(
+            user=self.user_profile.user,
+            rate=Alert.REAL_TIME,
+            name="Test Alert RECAP Only",
+            query='q="plain text for 018036652436"&type=r',
+        )
+        with mock.patch(
+            "cl.api.webhooks.requests.post",
+            side_effect=lambda *args, **kwargs: MockResponse(
+                200, mock_raw=True
+            ),
+        ), time_machine.travel(self.mock_date, tick=False):
+            call_command("cl_send_recap_alerts")
+        # One new alert should be triggered.
         self.assertEqual(
             len(mail.outbox), 2, msg="Outgoing emails don't match."
         )
+
+        # Trigger the alert again.
+        with mock.patch(
+            "cl.api.webhooks.requests.post",
+            side_effect=lambda *args, **kwargs: MockResponse(
+                200, mock_raw=True
+            ),
+        ), time_machine.travel(self.mock_date, tick=False):
+            call_command("cl_send_recap_alerts")
+        # No new alert should be triggered.
+        self.assertEqual(
+            len(mail.outbox), 2, msg="Outgoing emails don't match."
+        )
+
+        # Create a new RD for the same DocketEntry.
+        rd = RECAPDocumentFactory(
+            docket_entry=de,
+            description="Motion to File 2",
+            document_number="2",
+            is_available=True,
+            page_count=3,
+            pacer_doc_id="018036652436",
+            plain_text="plain text for 018036652436",
+        )
+        call_command(
+            "cl_index_parent_and_child_docs",
+            search_type=SEARCH_TYPES.RECAP,
+            queue="celery",
+            pk_offset=0,
+            testing_mode=True,
+            sweep_index=True,
+        )
+
+        with mock.patch(
+            "cl.api.webhooks.requests.post",
+            side_effect=lambda *args, **kwargs: MockResponse(
+                200, mock_raw=True
+            ),
+        ), time_machine.travel(self.mock_date, tick=False):
+            call_command("cl_send_recap_alerts")
+
+        # A new alert should be triggered containing only the new RD created.
+        self.assertEqual(
+            len(mail.outbox), 3, msg="Outgoing emails don't match."
+        )
+
+        recap_only_alert_2 = AlertFactory(
+            user=self.user_profile.user,
+            rate=Alert.REAL_TIME,
+            name="Test Alert RECAP Only Docket Entry",
+            query=f"q=docket_entry_id:{de.pk}&type=r",
+        )
+        with mock.patch(
+            "cl.api.webhooks.requests.post",
+            side_effect=lambda *args, **kwargs: MockResponse(
+                200, mock_raw=True
+            ),
+        ), time_machine.travel(self.mock_date, tick=False):
+            call_command("cl_send_recap_alerts")
+
+        # A new alert should be triggered containing two RDs.
+        self.assertEqual(
+            len(mail.outbox), 4, msg="Outgoing emails don't match."
+        )
diff --git a/cl/alerts/utils.py b/cl/alerts/utils.py
index 408ae13da0..677898a688 100644
--- a/cl/alerts/utils.py
+++ b/cl/alerts/utils.py
@@ -5,6 +5,7 @@
 from django.http import QueryDict
 from elasticsearch_dsl import Q, Search
 from elasticsearch_dsl.response import Hit, Response
+from redis import Redis

 from cl.alerts.models import (
     SCHEDULED_ALERT_HIT_STATUS,
@@ -187,3 +188,44 @@ def query_includes_rd_field(query_params: CleanData) -> bool:
             return True

     return False
+
+
+def make_alert_set_key(alert_id: int, document_type: str) -> str:
+    """Generate a Redis key for storing alert hits.
+
+    :param alert_id: The ID of the alert.
+    :param document_type: The type of document associated with the alert.
+    :return: A Redis key string in the format "alert_hits:{alert_id}.{document_type}".
+    """
+    return f"alert_hits:{alert_id}.{document_type}"
+
+
+def add_document_hit_to_alert_set(
+    r: Redis, alert_id: int, document_type: str, document_id: int
+) -> None:
+    """Add a document ID to the Redis SET associated with an alert ID.
+
+    :param r: Redis client instance.
+    :param alert_id: The alert identifier.
+    :param document_type: The type of document associated with the alert.
+    :param document_id: The document identifier to add.
+ :return: None + """ + alert_key = make_alert_set_key(alert_id, document_type) + r.sadd(alert_key, document_id) + + +def has_document_alert_hit_been_triggered( + r: Redis, alert_id: int, document_type: str, document_id: int +) -> bool: + """Check if a document ID is a member of the Redis SET associated with an + alert ID. + + :param r: Redis client instance. + :param alert_id: The alert identifier. + :param document_type: The type of document associated with the alert. + :param document_id: The docket identifier to check. + :return: True if the docket ID is a member of the set, False otherwise. + """ + alert_key = make_alert_set_key(alert_id, document_type) + return r.sismember(alert_key, document_id) From c1232ec8ae11f86a6c8209061d9fe80d5c9f62bb Mon Sep 17 00:00:00 2001 From: Alberto Islas Date: Thu, 27 Jun 2024 20:22:03 -0600 Subject: [PATCH 06/33] fix(alerts): Updated ES alert email templates to support RECAP Alerts. - Added tests to assert nested child documents in case alerts. --- .../commands/cl_send_recap_alerts.py | 8 +- cl/alerts/templates/alert_email_es.html | 71 +++- cl/alerts/templates/alert_email_es.txt | 13 +- cl/alerts/tests/tests_recap_alerts.py | 305 +++++++++++++++++- cl/alerts/utils.py | 2 +- cl/custom_filters/templatetags/extras.py | 20 +- 6 files changed, 379 insertions(+), 40 deletions(-) diff --git a/cl/alerts/management/commands/cl_send_recap_alerts.py b/cl/alerts/management/commands/cl_send_recap_alerts.py index 537626bc17..fd8d3a13c9 100644 --- a/cl/alerts/management/commands/cl_send_recap_alerts.py +++ b/cl/alerts/management/commands/cl_send_recap_alerts.py @@ -124,7 +124,9 @@ def query_and_send_alerts(rate): results_to_send = [] for hit in results: if not includes_rd_fields: - # Possible Docket-only query + # Possible Docket-only alert + # TODO important to keep the original ES child structure to preserve HLs. + # Maybe we can merge HL after filtering them? rds_to_send = filter_rd_alert_hits( r, alert.pk, hit["child_docs"], check_rd_hl=True ) @@ -135,6 +137,7 @@ def query_and_send_alerts(rate): elif should_docket_hit_be_included( r, alert.pk, hit.docket_id ): + # Docket-only alert hit["child_docs"] = [] results_to_send.append(hit) add_document_hit_to_alert_set( @@ -146,7 +149,7 @@ def query_and_send_alerts(rate): r, alert.pk, hit["child_docs"] ) if rds_to_send: - # Cross-object query + # Cross-object alert hit["child_docs"] = rds_to_send results_to_send.append(hit) @@ -162,7 +165,6 @@ def query_and_send_alerts(rate): alert.query_run = search_params.urlencode() alert.date_last_hit = now() alert.save() - if hits: send_search_alert_emails.delay([(user.pk, hits)]) alerts_sent_count += 1 diff --git a/cl/alerts/templates/alert_email_es.html b/cl/alerts/templates/alert_email_es.html index 36ccee5ccc..804f33a0bb 100644 --- a/cl/alerts/templates/alert_email_es.html +++ b/cl/alerts/templates/alert_email_es.html @@ -25,7 +25,7 @@

- Your {{alert.get_rate_display|lower}} {% if type == 'o' %}opinion{% elif type == 'oa' %}oral argument{% endif %} alert — {{alert.name}} — had {{num_results}}{% if num_results >= hits_limit %}+{% endif %} hit{{results|pluralize}}: + Your {{alert.get_rate_display|lower}} {% if type == 'o' %}opinion{% elif type == 'oa' %}oral argument{% elif type == 'r' %}RECAP{% endif %} alert — {{alert.name}} — had {{num_results}}{% if num_results >= hits_limit %}+{% endif %} hit{{results|pluralize}}:

View Full Results / Edit this Alert
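As an aside, the `type` switch these template hunks introduce can be exercised in isolation; a minimal sketch using a standalone Django engine (the inline template string and context values are illustrative stand-ins, not the real email context, which renders `{{alert.get_rate_display|lower}}` through CourtListener's normal Django setup):

    import django
    from django.conf import settings
    from django.template import Context, Template

    # Standalone engine configuration, for illustration only.
    settings.configure(
        TEMPLATES=[{"BACKEND": "django.template.backends.django.DjangoTemplates"}]
    )
    django.setup()

    # Same branch structure the patch adds to alert_email_es.html.
    tpl = Template(
        "Your {{ rate }} {% if type == 'o' %}opinion"
        "{% elif type == 'oa' %}oral argument"
        "{% elif type == 'r' %}RECAP{% endif %} alert"
    )
    print(tpl.render(Context({"rate": "real time", "type": "r"})))
    # -> Your real time RECAP alert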
@@ -34,28 +34,65 @@

- + {{ forloop.counter }}. {{ result|get_highlight:"caseName"|safe }} - ({% if result.court_id != 'scotus' %}{{ result|get_highlight:"court_citation_string"|nbsp|safe }} {% endif %}{% if type == 'o' %}{{ result.dateFiled|date:"Y" }}{% elif type == 'oa' %}{{ result.dateArgued|date:"Y" }}{% endif %}) + ({% if result.court_id != 'scotus' %}{{ result|get_highlight:"court_citation_string"|nbsp|safe }} {% endif %}{% if type == 'o' %}{{ result.dateFiled|date:"Y" }}{% elif type == 'oa' %}{{ result.dateArgued|date:"Y" }}{% elif type == 'r' %}{{ result.dateFiled|date:"Y" }}{% endif %})

-

- - View original: - - {% if result.download_url %} - - From the court + {% if type == 'r' %} + {% if result.docketNumber %} + Docket Number: + {{ result|get_highlight:"docketNumber"|safe }} + {% endif %} +
+ Date Filed: + {% if result.dateFiled %} + {{ result.dateFiled|date:"F jS, Y" }} + {% else %} + Unknown Date + {% endif %} +

+ {% if result.child_remaining %} + {% extract_q_value alert.query_run as q_value %} + + View Additional Results for this Case + + {% endif %} + {% else %} +

+ + View original: + + {% if result.download_url %} + + From the court + +   |   + {% endif %} + {% if result.local_path %} + {# Provide link to S3. #} + + Our backup -   |   {% endif %} - {% if result.local_path %} - {# Provide link to S3. #} - - Our backup - +

{% endif %} -

{% if type == 'oa' %}

Date Argued: diff --git a/cl/alerts/templates/alert_email_es.txt b/cl/alerts/templates/alert_email_es.txt index db8363e713..f4aa763cec 100644 --- a/cl/alerts/templates/alert_email_es.txt +++ b/cl/alerts/templates/alert_email_es.txt @@ -10,16 +10,23 @@ CourtListener.com We have news regarding your alerts at CourtListener.com ------------------------------------------------------- -{% for alert, type, results, num_results in hits %}{% for result in results %}{% if forloop.first %}Your {{alert.get_rate_display|lower}} {% if type == 'o' %}opinion{% elif type == 'oa' %}oral argument{% endif %} alert -- {{alert.name}} -- had {{num_results}}{% if num_results >= hits_limit %}+{% endif %} hit{{results|pluralize}}: +{% for alert, type, results, num_results in hits %}{% for result in results %}{% if forloop.first %}Your {{alert.get_rate_display|lower}} {% if type == 'o' %}opinion{% elif type == 'oa' %}oral argument{% elif type == 'r' %}RECAP{% endif %} alert -- {{alert.name}} -- had {{num_results}}{% if num_results >= hits_limit %}+{% endif %} hit{{results|pluralize}}: ------------------------------------------------------- View Full Results / Edit this Alert: https://www.courtlistener.com/?{{ alert.query_run|safe }}&edit_alert={{ alert.pk }} Disable this Alert (one click): https://www.courtlistener.com{% url "disable_alert" alert.secret_key %}{% endif %} {{forloop.counter}}. {{ result.caseName|render_string_or_list|safe|striptags }} ({% if result.court_id != 'scotus' %}{{ result.court_citation_string|render_string_or_list|striptags }} {% endif %}{% if type == 'o' %}{{ result.dateFiled|date:"Y" }}{% elif type == 'oa' %}{{ result.dateArgued|date:"Y" }}{% endif %}) {% if type == 'oa' %}{% if result.dateArgued %}Date Argued: {{ result.dateArgued|date:"F jS, Y" }}{% else %}Date Argued: Unknown Date {% endif %}{% if result.docketNumber %} | Docket Number: {{ result.docketNumber|render_string_or_list|safe|striptags }}{% endif %} | Duration: {{ result.duration|naturalduration }}{% if result.judge %} | Judge: {{ result.judge|render_string_or_list|safe|striptags|underscore_to_space }}{% endif %}{% endif %} -{% if type == 'o' or type == 'oa' %}{% if result|get_highlight:"text" %}...{{ result|get_highlight:"text"|safe|striptags|underscore_to_space|compress_whitespace }}...{% endif %} +{% if type == 'o' or type == 'oa' %}{% if result|get_highlight:"text" %}...{{ result|get_highlight:"text"|safe|striptags|underscore_to_space|compress_whitespace }}...{% endif %}{% endif %} +{% if type == 'r' %}{% if result.dateFiled %}Date Filed: {{ result.dateFiled|date:"F jS, Y" }}{% else %}Date Filed: Unknown Date {% endif %}{% if result.docketNumber %} | Docket Number: {{ result.docketNumber|render_string_or_list|safe|striptags }}{% endif %} +{% for doc in result.child_docs %}{% with doc=doc|get_attrdict:"_source" %} - {% if doc.short_description %}{{ doc.short_description|render_string_or_list|safe|striptags }} - {% endif %}Document #{% if doc.document_number %}{{ doc.document_number }}{% endif %}{% if doc.attachment_number %}, Attachment #{{ doc.attachment_number }}{% endif %} + {% if doc.description %}Description: {{ doc.description|render_string_or_list|safe|striptags }}{% endif %} + {% if doc.plain_text %}{% contains_highlights doc.plain_text.0 True as highlighted %}{% if highlighted %}...{% endif %}{{ doc.plain_text|render_string_or_list|safe|striptags|underscore_to_space }}...{% endif %} + View this document on our site: https://www.courtlistener.com{% if doc.absolute_url %}{{ doc.absolute_url }}{% else %}{{ 
result.docket_absolute_url }}#minute-entry-{{ doc.docket_entry_id }}{% endif %} +{% endwith %}{% endfor %} +{% if result.child_remaining %}{% extract_q_value alert.query_run as q_value %}View Additional Results for this Case: https://www.courtlistener.com/?type={{ type|urlencode }}&q={% if q_value %}({{ q_value|urlencode }})%20AND%20{% endif %}docket_id%3A{{ result.docket_id|urlencode }}{% endif %} {% endif %}~~~~~ - - View this item on our site: https://www.courtlistener.com{{result.absolute_url}} + - View this item on our site: https://www.courtlistener.com{% if type == 'r' %}{{result.docket_absolute_url}}{% else %}{{result.absolute_url}}{% endif %} {% if result.download_url %} - Download original from the court: {{result.download_url}} {% endif %}{% if result.local_path %} - Download the original from our backup: https://storage.courtlistener.com/{{ result.local_path }}{% endif %}{% endfor %} diff --git a/cl/alerts/tests/tests_recap_alerts.py b/cl/alerts/tests/tests_recap_alerts.py index d3b345f36e..8927be1f12 100644 --- a/cl/alerts/tests/tests_recap_alerts.py +++ b/cl/alerts/tests/tests_recap_alerts.py @@ -6,6 +6,7 @@ from django.core import mail from django.core.management import call_command from django.utils.timezone import now +from lxml import html from cl.alerts.factories import AlertFactory from cl.alerts.models import SEARCH_TYPES, Alert @@ -75,6 +76,108 @@ def get_html_content_from_email(email_content): break return html_content + def _confirm_number_of_alerts(self, html_content, expected_count): + """Test the number of alerts included in the email alert.""" + tree = html.fromstring(html_content) + got = len(tree.xpath("//h2")) + + self.assertEqual( + got, + expected_count, + msg="Did not get the right number of alerts in the email. " + "Expected: %s - Got: %s\n\n" % (expected_count, got), + ) + + def _count_alert_hits_and_child_hits( + self, + html_content, + alert_title, + expected_hits, + case_title, + expected_child_hits, + ): + """Confirm the following assertions for the email alert: + - An specific alert is included in the email alert. + - The specified alert contains the expected number of hits. + - The specified case contains the expected number of child hits. + """ + tree = html.fromstring(html_content) + alert_element = tree.xpath(f"//h2[contains(text(), '{alert_title}')]") + self.assertTrue( + alert_element, msg=f"Not alert with title {alert_title} found." + ) + + # Find the corresponding case_title under the alert_element + alert_index = tree.xpath("//h2").index(alert_element[0]) + alert_cases = tree.xpath( + f"//h2[{alert_index + 1}]/following-sibling::h3" + ) + self.assertEqual( + len(alert_cases), + expected_hits, + msg="Did not get the right number of hits for the alert %s. " + "Expected: %s - Got: %s\n\n" + % (alert_title, expected_hits, len(alert_cases)), + ) + if case_title: + child_hit_count = 0 + for case in alert_cases: + case_text = " ".join(case.xpath(".//text()")).strip() + if case_title in case_text: + child_hit_count = len( + case.xpath("following-sibling::ul[1]/li/a") + ) + + self.assertEqual( + child_hit_count, + expected_child_hits, + msg="Did not get the right number of child hits for the case %s. " + "Expected: %s - Got: %s\n\n" + % (case_title, expected_child_hits, child_hit_count), + ) + + def _assert_child_hits_content( + self, + html_content, + alert_title, + case_title, + expected_child_descriptions, + ): + """Confirm the child hits in a case are the expected ones, comparing + their descriptions. 
+ """ + tree = html.fromstring(html_content) + alert_element = tree.xpath(f"//h2[contains(text(), '{alert_title}')]") + # Find the corresponding case_title under the alert_element + alert_index = tree.xpath("//h2").index(alert_element[0]) + alert_cases = tree.xpath( + f"//h2[{alert_index + 1}]/following-sibling::h3" + ) + + def extract_child_descriptions(case_item): + child_documents = case_item.xpath("//ul/li") + results = [] + for li in child_documents: + a_tag = li.xpath(".//a")[0] + full_text = a_tag.text_content() + first_part = full_text.split("\u2014")[0].strip() + results.append(first_part) + + return results + + child_descriptions = set() + for case in alert_cases: + case_text = "".join(case.xpath(".//text()")).strip() + if case_title in case_text: + child_descriptions = set(extract_child_descriptions(case)) + break + + self.assertEqual( + child_descriptions, + set(expected_child_descriptions), + msg=f"Child hits didn't match for case {case_title}", + ) + async def test_recap_document_hl_matched(self) -> None: """Test recap_document_hl_matched method that determines weather a hit contains RECAPDocument HL fields.""" @@ -244,8 +347,7 @@ def test_filter_out_alerts_to_send_by_query_and_hits(self) -> None: Assert the content structure accordingly. """ - # This docket-only alert, matches a Docket added today. - + # This docket-only alert matches a Docket ingested today. docket_only_alert = AlertFactory( user=self.user_profile.user, rate=Alert.REAL_TIME, @@ -265,15 +367,29 @@ def test_filter_out_alerts_to_send_by_query_and_hits(self) -> None: ) html_content = self.get_html_content_from_email(mail.outbox[0]) self.assertIn(docket_only_alert.name, html_content) + self._confirm_number_of_alerts(html_content, 1) + # The docket-only alert doesn't contain any nested child hits. + self._count_alert_hits_and_child_hits( + html_content, + docket_only_alert.name, + 1, + self.de.docket.case_name, + 0, + ) + + # Assert email text version: + txt_email = mail.outbox[0].body + self.assertIn(docket_only_alert.name, txt_email) - # This test shouldn't match the Docket-only alert when the RD is added - # today since its parent Docket was not modified today. + # The following test shouldn't match the Docket-only alert when the RD + # is added today since its parent Docket was not modified today. AlertFactory( user=self.user_profile_2.user, rate=Alert.REAL_TIME, name="Test Alert Docket Only Not Triggered", query='q="405 Civil"&type=r', ) + # Simulate docket is ingested a day before. one_day_before = now() - datetime.timedelta(days=1) mock_date = one_day_before.replace(hour=5) with time_machine.travel(mock_date, tick=False): @@ -289,16 +405,17 @@ def test_filter_out_alerts_to_send_by_query_and_hits(self) -> None: jury_demand="1,000,000", ) + # Its related RD is ingested today. 
mock_date = now().replace(hour=5) with time_machine.travel(mock_date, tick=False): - de = DocketEntryWithParentsFactory( + alert_de = DocketEntryWithParentsFactory( docket=docket, entry_number=1, date_filed=datetime.date(2024, 8, 19), description="MOTION for Leave to File Amicus Curiae Lorem Served", ) rd = RECAPDocumentFactory( - docket_entry=de, + docket_entry=alert_de, description="Motion to File", document_number="1", is_available=True, @@ -306,7 +423,6 @@ def test_filter_out_alerts_to_send_by_query_and_hits(self) -> None: pacer_doc_id="018036652436", plain_text="plain text for 018036652436", ) - call_command( "cl_index_parent_and_child_docs", search_type=SEARCH_TYPES.RECAP, @@ -323,11 +439,12 @@ def test_filter_out_alerts_to_send_by_query_and_hits(self) -> None: ), ), time_machine.travel(self.mock_date, tick=False): call_command("cl_send_recap_alerts") - # No new alert should be triggered. + # The RD ingestion's shouldn't match the docket-only alert. self.assertEqual( len(mail.outbox), 1, msg="Outgoing emails don't match." ) + # Test a RECAP-only alert query. recap_only_alert = AlertFactory( user=self.user_profile.user, rate=Alert.REAL_TIME, @@ -345,8 +462,29 @@ def test_filter_out_alerts_to_send_by_query_and_hits(self) -> None: self.assertEqual( len(mail.outbox), 2, msg="Outgoing emails don't match." ) + html_content = self.get_html_content_from_email(mail.outbox[1]) + self._confirm_number_of_alerts(html_content, 1) + # Only one child hit should be included in the case within the alert. + self._count_alert_hits_and_child_hits( + html_content, + recap_only_alert.name, + 1, + alert_de.docket.case_name, + 1, + ) + self._assert_child_hits_content( + html_content, + recap_only_alert.name, + alert_de.docket.case_name, + [rd.description], + ) + # Assert email text version: + txt_email = mail.outbox[1].body + self.assertIn(recap_only_alert.name, txt_email) + self.assertIn(rd.description, txt_email) - # Trigger the alert again. + # Trigger the same alert again to confirm that no new alert is + # triggered because previous hits have already triggered the same alert with mock.patch( "cl.api.webhooks.requests.post", side_effect=lambda *args, **kwargs: MockResponse( @@ -359,9 +497,10 @@ def test_filter_out_alerts_to_send_by_query_and_hits(self) -> None: len(mail.outbox), 2, msg="Outgoing emails don't match." ) - # Create a new RD for the same DocketEntry. - rd = RECAPDocumentFactory( - docket_entry=de, + # Create a new RD for the same DocketEntry to confirm this new RD is + # properly included in the alert email. + rd_2 = RECAPDocumentFactory( + docket_entry=alert_de, description="Motion to File 2", document_number="2", is_available=True, @@ -390,12 +529,22 @@ def test_filter_out_alerts_to_send_by_query_and_hits(self) -> None: self.assertEqual( len(mail.outbox), 3, msg="Outgoing emails don't match." ) + html_content = self.get_html_content_from_email(mail.outbox[2]) + self._confirm_number_of_alerts(html_content, 1) + self._assert_child_hits_content( + html_content, + recap_only_alert.name, + alert_de.docket.case_name, + [rd_2.description], + ) + # The following test confirms that hits previously matched with other + # alerts can match a different alert. 
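That alert-scoped dedup comes from the per-alert Redis set keys added in `cl/alerts/utils.py` (`alert_hits:{alert_id}.{document_type}`): membership is checked before a hit is sent and the hit is recorded afterwards, and because the alert ID is part of the key, the same document can still fire a different alert. A minimal sketch of that bookkeeping, assuming a local Redis instance and hypothetical IDs (the `"r"` document type for RECAPDocuments mirrors the `"d"` used for dockets and is illustrative here):

    from redis import Redis

    # Assumption: a local Redis on the default port; outside tests this
    # client comes from get_redis_interface("CACHE").
    r = Redis(host="localhost", port=6379, db=0)
    alert_a, alert_b, rd_pk = 1, 2, 42

    def key(alert_id: int, document_type: str) -> str:
        # Mirrors make_alert_set_key from cl/alerts/utils.py.
        return f"alert_hits:{alert_id}.{document_type}"

    # First run of alert_a: the document has not fired yet, so it is sent
    # and then recorded in the alert's set.
    assert not r.sismember(key(alert_a, "r"), rd_pk)
    r.sadd(key(alert_a, "r"), rd_pk)

    # Second run of alert_a: the membership check filters the document out,
    # so no new email goes out.
    assert r.sismember(key(alert_a, "r"), rd_pk)

    # alert_b keeps its own set, so the same document can still trigger it,
    # which is what the recap_only_alert_2 flow below verifies end to end.
    assert not r.sismember(key(alert_b, "r"), rd_pk)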
recap_only_alert_2 = AlertFactory( user=self.user_profile.user, rate=Alert.REAL_TIME, name="Test Alert RECAP Only Docket Entry", - query=f"q=docket_entry_id:{de.pk}&type=r", + query=f"q=docket_entry_id:{alert_de.pk}&type=r", ) with mock.patch( "cl.api.webhooks.requests.post", @@ -405,7 +554,135 @@ def test_filter_out_alerts_to_send_by_query_and_hits(self) -> None: ), time_machine.travel(self.mock_date, tick=False): call_command("cl_send_recap_alerts") - # A new alert should be triggered containing two RDs. + # A new alert should be triggered containing two RDs (rd and rd_2) self.assertEqual( len(mail.outbox), 4, msg="Outgoing emails don't match." ) + html_content = self.get_html_content_from_email(mail.outbox[3]) + self._confirm_number_of_alerts(html_content, 1) + self._assert_child_hits_content( + html_content, + recap_only_alert_2.name, + alert_de.docket.case_name, + [rd.description, rd_2.description], + ) + # Assert email text version: + txt_email = mail.outbox[3].body + self.assertIn(recap_only_alert.name, txt_email) + self.assertIn(rd.description, txt_email) + self.assertIn(rd_2.description, txt_email) + + # The following test confirms that a cross-object alert is properly + # matched and triggered + recap_only_alert_3 = AlertFactory( + user=self.user_profile.user, + rate=Alert.REAL_TIME, + name="Test Alert Cross-object query", + query=f'q="Motion to File 2"&docket_number={docket.docket_number}&type=r', + ) + with mock.patch( + "cl.api.webhooks.requests.post", + side_effect=lambda *args, **kwargs: MockResponse( + 200, mock_raw=True + ), + ), time_machine.travel(self.mock_date, tick=False): + call_command("cl_send_recap_alerts") + + # A new alert should be triggered containing one RD (rd_2) + self.assertEqual( + len(mail.outbox), 5, msg="Outgoing emails don't match." + ) + html_content = self.get_html_content_from_email(mail.outbox[4]) + self._confirm_number_of_alerts(html_content, 1) + self._assert_child_hits_content( + html_content, + recap_only_alert_3.name, + alert_de.docket.case_name, + [rd_2.description], + ) + # Assert email text version: + txt_email = mail.outbox[4].body + self.assertIn(recap_only_alert_3.name, txt_email) + self.assertIn(rd_2.description, txt_email) + + def test_limit_alert_case_child_hits(self) -> None: + """Test limit case child hits up to 5 and display the "View additional + results for this Case" button. + """ + + mock_date = now().replace(hour=5) + with time_machine.travel(mock_date, tick=False): + alert_de = DocketEntryWithParentsFactory( + docket=self.de.docket, + entry_number=1, + date_filed=datetime.date(2024, 8, 19), + description="MOTION for Leave to File Amicus Curiae Lorem Served", + ) + rd_descriptions = [] + for i in range(6): + rd = RECAPDocumentFactory( + docket_entry=alert_de, + description=f"Motion to File {i+1}", + document_number=f"{i+1}", + pacer_doc_id=f"018036652436{i+1}", + ) + if i < 5: + # Omit the last alert to compare. Only up to 5 should be + # included in the case. 
+                    rd_descriptions.append(rd.description)
+
+            call_command(
+                "cl_index_parent_and_child_docs",
+                search_type=SEARCH_TYPES.RECAP,
+                queue="celery",
+                pk_offset=0,
+                testing_mode=True,
+                sweep_index=True,
+            )
+        recap_only_alert = AlertFactory(
+            user=self.user_profile.user,
+            rate=Alert.REAL_TIME,
+            name="Test Alert RECAP Only Docket Entry",
+            query=f"q=docket_entry_id:{alert_de.pk}&type=r",
+        )
+        with mock.patch(
+            "cl.api.webhooks.requests.post",
+            side_effect=lambda *args, **kwargs: MockResponse(
+                200, mock_raw=True
+            ),
+        ), time_machine.travel(self.mock_date, tick=False):
+            call_command("cl_send_recap_alerts")
+
+        self.assertEqual(
+            len(mail.outbox), 1, msg="Outgoing emails don't match."
+        )
+        html_content = self.get_html_content_from_email(mail.outbox[0])
+        self.assertIn(recap_only_alert.name, html_content)
+        self._confirm_number_of_alerts(html_content, 1)
+        # The docket-only alert doesn't contain any nested child hits.
+        self._count_alert_hits_and_child_hits(
+            html_content,
+            recap_only_alert.name,
+            1,
+            self.de.docket.case_name,
+            5,
+        )
+        self._assert_child_hits_content(
+            html_content,
+            recap_only_alert.name,
+            alert_de.docket.case_name,
+            rd_descriptions,
+        )
+        # Assert the View more results button is present in the alert.
+        self.assertIn("View Additional Results for this Case", html_content)
+
+        # Assert email text version:
+        txt_email = mail.outbox[0].body
+        self.assertIn(recap_only_alert.name, txt_email)
+        for description in rd_descriptions:
+            with self.subTest(
+                description=description, msg="Plain text descriptions"
+            ):
+                self.assertIn(description, txt_email)
+
+        self.assertIn("View Additional Results for this Case", txt_email)
diff --git a/cl/alerts/utils.py b/cl/alerts/utils.py
index 677898a688..44277a04a2 100644
--- a/cl/alerts/utils.py
+++ b/cl/alerts/utils.py
@@ -154,7 +154,7 @@ def recap_document_hl_matched(rd_hit: Hit) -> bool:
     :return: True if the hit matched a RECAPDocument field. Otherwise, False.
     """
 
-    matched_rd_hl = set()
+    matched_rd_hl: set[str] = set()
     rd_hl_fields = set(SEARCH_RECAP_CHILD_HL_FIELDS.keys())
     if hasattr(rd_hit, "highlight"):
         highlights = rd_hit.highlight.to_dict()
diff --git a/cl/custom_filters/templatetags/extras.py b/cl/custom_filters/templatetags/extras.py
index 40d2813cda..53cc1a2b71 100644
--- a/cl/custom_filters/templatetags/extras.py
+++ b/cl/custom_filters/templatetags/extras.py
@@ -1,5 +1,6 @@
 import random
 import re
+import urllib.parse
 
 from django import template
 from django.core.exceptions import ValidationError
@@ -10,6 +11,7 @@
 from django.utils.safestring import SafeString, mark_safe
 from elasticsearch_dsl import AttrDict, AttrList
 
+from cl.search.constants import ALERTS_HL_TAG, SEARCH_HL_TAG
 from cl.search.models import Docket, DocketEntry
 
 register = template.Library()
@@ -198,13 +200,15 @@ def citation(obj) -> SafeString:
 
 
 @register.simple_tag
-def contains_highlights(content: str) -> bool:
+def contains_highlights(content: str, alert: bool = False) -> bool:
     """Check if a given string contains the mark tag used in highlights.
 
     :param content: The input string to check.
+    :param alert: Whether this tag is being used in the alert template.
    :return: True if the mark highlight tag is found, otherwise False.
    """
-    pattern = r"<mark>.*?</mark>"
+    hl_tag = ALERTS_HL_TAG if alert else SEARCH_HL_TAG
+    pattern = rf"<{hl_tag}>.*?</{hl_tag}>"
matches = re.findall(pattern, content) return bool(matches) @@ -243,3 +247,15 @@ def get_highlight(result: AttrDict | dict[str, any], field: str) -> any: original_value = result.get(field, "") return render_string_or_list(hl_value) if hl_value else original_value + + +@register.simple_tag +def extract_q_value(query: str) -> str: + """Extract the value of the "q" parameter from a URL-encoded query string. + + :param query: The URL-encoded query string. + :return: The value of the "q" parameter or an empty string if "q" is not found. + """ + + parsed_query = urllib.parse.parse_qs(query) + return parsed_query.get("q", [""])[0] From 51c7bb65f1f962e315c337825831ec2e405b3f0b Mon Sep 17 00:00:00 2001 From: Alberto Islas Date: Fri, 28 Jun 2024 15:43:55 -0600 Subject: [PATCH 07/33] fix(alerts): Group alerts and case hits limit --- .../commands/cl_send_recap_alerts.py | 2 - cl/alerts/tests/tests_recap_alerts.py | 214 ++++++++++++++++-- cl/lib/elasticsearch_utils.py | 4 +- 3 files changed, 201 insertions(+), 19 deletions(-) diff --git a/cl/alerts/management/commands/cl_send_recap_alerts.py b/cl/alerts/management/commands/cl_send_recap_alerts.py index fd8d3a13c9..bdaae5bfb1 100644 --- a/cl/alerts/management/commands/cl_send_recap_alerts.py +++ b/cl/alerts/management/commands/cl_send_recap_alerts.py @@ -125,8 +125,6 @@ def query_and_send_alerts(rate): for hit in results: if not includes_rd_fields: # Possible Docket-only alert - # TODO important to keep the original ES child structure to preserve HLs. - # Maybe we can merge HL after filtering them? rds_to_send = filter_rd_alert_hits( r, alert.pk, hit["child_docs"], check_rd_hl=True ) diff --git a/cl/alerts/tests/tests_recap_alerts.py b/cl/alerts/tests/tests_recap_alerts.py index 8927be1f12..32eac189ec 100644 --- a/cl/alerts/tests/tests_recap_alerts.py +++ b/cl/alerts/tests/tests_recap_alerts.py @@ -5,6 +5,7 @@ from asgiref.sync import sync_to_async from django.core import mail from django.core.management import call_command +from django.test.utils import override_settings from django.utils.timezone import now from lxml import html @@ -88,6 +89,28 @@ def _confirm_number_of_alerts(self, html_content, expected_count): "Expected: %s - Got: %s\n\n" % (expected_count, got), ) + @staticmethod + def _extract_cases_from_alert(html_tree, alert_title): + """Extract the case elements (h3) under a specific alert (h2) from the + HTML tree. + """ + alert_element = html_tree.xpath( + f"//h2[contains(text(), '{alert_title}')]" + ) + h2_elements = html_tree.xpath("//h2") + alert_index = h2_elements.index(alert_element[0]) + # Find the
<h3> elements between this <h2> and the next <h2>
+ if alert_index + 1 < len(h2_elements): + next_alert_element = h2_elements[alert_index + 1] + alert_cases = html_tree.xpath( + f"//h2[contains(text(), '{alert_title}')]/following-sibling::*[following-sibling::h2[1] = '{next_alert_element.text}'][self::h3]" + ) + else: + alert_cases = html_tree.xpath( + f"//h2[contains(text(), '{alert_title}')]/following-sibling::h3" + ) + return alert_cases + def _count_alert_hits_and_child_hits( self, html_content, @@ -107,11 +130,8 @@ def _count_alert_hits_and_child_hits( alert_element, msg=f"Not alert with title {alert_title} found." ) - # Find the corresponding case_title under the alert_element - alert_index = tree.xpath("//h2").index(alert_element[0]) - alert_cases = tree.xpath( - f"//h2[{alert_index + 1}]/following-sibling::h3" - ) + alert_cases = self._extract_cases_from_alert(tree, alert_title) + self.assertEqual( len(alert_cases), expected_hits, @@ -149,13 +169,10 @@ def _assert_child_hits_content( tree = html.fromstring(html_content) alert_element = tree.xpath(f"//h2[contains(text(), '{alert_title}')]") # Find the corresponding case_title under the alert_element - alert_index = tree.xpath("//h2").index(alert_element[0]) - alert_cases = tree.xpath( - f"//h2[{alert_index + 1}]/following-sibling::h3" - ) + alert_cases = self._extract_cases_from_alert(tree, alert_title) def extract_child_descriptions(case_item): - child_documents = case_item.xpath("//ul/li") + child_documents = case_item.xpath("./following-sibling::ul[1]/li") results = [] for li in child_documents: a_tag = li.xpath(".//a")[0] @@ -175,7 +192,7 @@ def extract_child_descriptions(case_item): self.assertEqual( child_descriptions, set(expected_child_descriptions), - msg=f"Child hits didn't match for case {case_title}", + msg=f"Child hits didn't match for case {case_title}, Got {child_descriptions}, Expected: {expected_child_descriptions} ", ) async def test_recap_document_hl_matched(self) -> None: @@ -574,7 +591,7 @@ def test_filter_out_alerts_to_send_by_query_and_hits(self) -> None: # The following test confirms that a cross-object alert is properly # matched and triggered - recap_only_alert_3 = AlertFactory( + cross_object_alert = AlertFactory( user=self.user_profile.user, rate=Alert.REAL_TIME, name="Test Alert Cross-object query", @@ -596,13 +613,13 @@ def test_filter_out_alerts_to_send_by_query_and_hits(self) -> None: self._confirm_number_of_alerts(html_content, 1) self._assert_child_hits_content( html_content, - recap_only_alert_3.name, + cross_object_alert.name, alert_de.docket.case_name, [rd_2.description], ) # Assert email text version: txt_email = mail.outbox[4].body - self.assertIn(recap_only_alert_3.name, txt_email) + self.assertIn(cross_object_alert.name, txt_email) self.assertIn(rd_2.description, txt_email) def test_limit_alert_case_child_hits(self) -> None: @@ -659,7 +676,7 @@ def test_limit_alert_case_child_hits(self) -> None: html_content = self.get_html_content_from_email(mail.outbox[0]) self.assertIn(recap_only_alert.name, html_content) self._confirm_number_of_alerts(html_content, 1) - # The docket-only alert doesn't contain any nested child hits. + # The case alert should contain up to 5 child hits. 
self._count_alert_hits_and_child_hits( html_content, recap_only_alert.name, @@ -683,6 +700,171 @@ def test_limit_alert_case_child_hits(self) -> None: with self.subTest( description=description, msg="Plain text descriptions" ): - self.assertIn(description, txt_email) + self.assertIn( + description, + txt_email, + msg="RECAPDocument wasn't found in the email content.", + ) self.assertIn("View Additional Results for this Case", txt_email) + + @override_settings(SCHEDULED_ALERT_HITS_LIMIT=3) + def test_multiple_alerts_email_hits_limit_per_alert(self) -> None: + """Test multiple alerts can be grouped in an email and hits within an + alert are limited to SCHEDULED_ALERT_HITS_LIMIT (3) hits. + """ + + docket = DocketFactory( + court=self.court, + case_name=f"SUBPOENAS SERVED CASE", + docket_number=f"1:21-bk-123", + source=Docket.RECAP, + cause="410 Civil", + ) + for i in range(3): + DocketFactory( + court=self.court, + case_name=f"SUBPOENAS SERVED CASE {i}", + docket_number=f"1:21-bk-123{i}", + source=Docket.RECAP, + cause="410 Civil", + ) + + alert_de = DocketEntryWithParentsFactory( + docket=docket, + entry_number=1, + date_filed=datetime.date(2024, 8, 19), + description="MOTION for Leave to File Amicus Curiae Lorem Served", + ) + rd = RECAPDocumentFactory( + docket_entry=alert_de, + description="Motion to File", + document_number="1", + pacer_doc_id="018036652439", + ) + rd_2 = RECAPDocumentFactory( + docket_entry=alert_de, + description="Motion to File 2", + document_number="2", + pacer_doc_id="018036652440", + plain_text= "plain text lorem" + ) + + docket_only_alert = AlertFactory( + user=self.user_profile.user, + rate=Alert.REAL_TIME, + name="Test Alert Docket Only", + query='q="410 Civil"&type=r', + ) + recap_only_alert = AlertFactory( + user=self.user_profile.user, + rate=Alert.REAL_TIME, + name="Test Alert RECAP Only Docket Entry", + query=f"q=docket_entry_id:{alert_de.pk}&type=r", + ) + cross_object_alert_with_hl = AlertFactory( + user=self.user_profile.user, + rate=Alert.REAL_TIME, + name="Test Alert Cross-object", + query=f'q="File Amicus Curiae" AND "Motion to File 2" AND ' + f'"plain text lorem" AND "410 Civil" AND ' + f'id:{rd_2.pk}&docket_number={docket.docket_number}' + f'&case_name="{docket.case_name}"&type=r', + ) + call_command( + "cl_index_parent_and_child_docs", + search_type=SEARCH_TYPES.RECAP, + queue="celery", + pk_offset=0, + testing_mode=True, + sweep_index=True, + ) + with mock.patch( + "cl.api.webhooks.requests.post", + side_effect=lambda *args, **kwargs: MockResponse( + 200, mock_raw=True + ), + ), time_machine.travel(self.mock_date, tick=False): + call_command("cl_send_recap_alerts") + + self.assertEqual( + len(mail.outbox), 1, msg="Outgoing emails don't match." + ) + + # Assert docket-only alert. + html_content = self.get_html_content_from_email(mail.outbox[0]) + self.assertIn(docket_only_alert.name, html_content) + self._confirm_number_of_alerts(html_content, 3) + # The docket-only alert doesn't contain any nested child hits. + self._count_alert_hits_and_child_hits( + html_content, + docket_only_alert.name, + 3, + self.de.docket.case_name, + 0, + ) + + # Assert RECAP-only alert. + self.assertIn(recap_only_alert.name, html_content) + # The recap-only alert contain 2 child hits. 
+ self._count_alert_hits_and_child_hits( + html_content, + recap_only_alert.name, + 1, + alert_de.docket.case_name, + 2, + ) + self._assert_child_hits_content( + html_content, + recap_only_alert.name, + alert_de.docket.case_name, + [rd.description, rd_2.description], + ) + + # Assert Cross-object alert. + self.assertIn(recap_only_alert.name, html_content) + # The cross-object alert only contain 1 child hit. + self._count_alert_hits_and_child_hits( + html_content, + cross_object_alert_with_hl.name, + 1, + alert_de.docket.case_name, + 1, + ) + self._assert_child_hits_content( + html_content, + cross_object_alert_with_hl.name, + alert_de.docket.case_name, + [rd_2.description], + ) + + # Assert HL in the cross_object_alert_with_hl + self.assertIn(f"{docket.case_name}", html_content) + self.assertEqual(html_content.count(f"{docket.case_name}"), 1) + self.assertIn(f"{docket.docket_number}", html_content) + self.assertEqual( + html_content.count(f"{docket.docket_number}"), 1) + self.assertIn(f"{rd_2.plain_text}", html_content) + self.assertEqual( + html_content.count(f"{rd_2.plain_text}"), 1) + self.assertIn(f"{rd_2.description}", html_content) + self.assertEqual( + html_content.count(f"{rd_2.description}"), 1) + self.assertIn("File Amicus Curiae", html_content) + self.assertEqual( + html_content.count("File Amicus Curiae"), 1) + + # Assert email text version: + txt_email = mail.outbox[0].body + self.assertIn(recap_only_alert.name, txt_email) + self.assertIn(docket_only_alert.name, txt_email) + self.assertIn(cross_object_alert_with_hl.name, txt_email) + for description in [rd.description, rd_2.description]: + with self.subTest( + description=description, msg="Plain text descriptions" + ): + self.assertIn( + description, + txt_email, + msg="RECAPDocument wasn't found in the email content.", + ) diff --git a/cl/lib/elasticsearch_utils.py b/cl/lib/elasticsearch_utils.py index b4eab7c64b..9f6447c0e1 100644 --- a/cl/lib/elasticsearch_utils.py +++ b/cl/lib/elasticsearch_utils.py @@ -3053,7 +3053,9 @@ def do_es_sweep_alert_query( main_query = add_es_highlighting(s, cd, alerts=True) main_query = main_query.sort(build_sort_results(cd)) - main_query = main_query.extra(from_=0, size=30) + main_query = main_query.extra( + from_=0, size=settings.SCHEDULED_ALERT_HITS_LIMIT + ) results = main_query.execute() if results: total_hits = results.hits.total.value From 7fc3298e7f3d290ac2aefd87f299df082fc3235e Mon Sep 17 00:00:00 2001 From: Alberto Islas Date: Fri, 28 Jun 2024 19:21:45 -0600 Subject: [PATCH 08/33] fix(alerts): Trigger RECAP search alerts webhooks --- .../commands/cl_send_recap_alerts.py | 13 +++ cl/alerts/tasks.py | 2 +- cl/alerts/tests/tests_recap_alerts.py | 100 +++++++++++++++--- cl/api/tasks.py | 47 ++++++-- cl/lib/elasticsearch_utils.py | 1 - 5 files changed, 139 insertions(+), 24 deletions(-) diff --git a/cl/alerts/management/commands/cl_send_recap_alerts.py b/cl/alerts/management/commands/cl_send_recap_alerts.py index bdaae5bfb1..d4a533c69d 100644 --- a/cl/alerts/management/commands/cl_send_recap_alerts.py +++ b/cl/alerts/management/commands/cl_send_recap_alerts.py @@ -16,6 +16,8 @@ query_includes_rd_field, recap_document_hl_matched, ) +from cl.api.models import WebhookEventType +from cl.api.tasks import send_es_search_alert_webhook from cl.lib.command_utils import VerboseCommand, logger from cl.lib.elasticsearch_utils import do_es_sweep_alert_query from cl.lib.redis_utils import get_redis_interface @@ -163,6 +165,17 @@ def query_and_send_alerts(rate): alert.query_run = search_params.urlencode() 
alert.date_last_hit = now() alert.save() + + # Send webhook event if the user has a SEARCH_ALERT + # endpoint enabled. + user_webhooks = user.webhooks.filter( + event_type=WebhookEventType.SEARCH_ALERT, enabled=True + ) + for user_webhook in user_webhooks: + send_es_search_alert_webhook.delay( + results_to_send, user_webhook.pk, alert.pk + ) + if hits: send_search_alert_emails.delay([(user.pk, hits)]) alerts_sent_count += 1 diff --git a/cl/alerts/tasks.py b/cl/alerts/tasks.py index da43d8155d..037fe22b4c 100644 --- a/cl/alerts/tasks.py +++ b/cl/alerts/tasks.py @@ -461,7 +461,7 @@ def send_webhook_alert_hits( send_es_search_alert_webhook.delay( documents, user_webhook.pk, - alert, + alert.pk, ) diff --git a/cl/alerts/tests/tests_recap_alerts.py b/cl/alerts/tests/tests_recap_alerts.py index 32eac189ec..d910c67272 100644 --- a/cl/alerts/tests/tests_recap_alerts.py +++ b/cl/alerts/tests/tests_recap_alerts.py @@ -6,6 +6,7 @@ from django.core import mail from django.core.management import call_command from django.test.utils import override_settings +from django.utils.html import strip_tags from django.utils.timezone import now from lxml import html @@ -13,7 +14,7 @@ from cl.alerts.models import SEARCH_TYPES, Alert from cl.alerts.utils import query_includes_rd_field, recap_document_hl_matched from cl.api.factories import WebhookFactory -from cl.api.models import WebhookEventType +from cl.api.models import WebhookEvent, WebhookEventType from cl.donate.models import NeonMembership from cl.lib.elasticsearch_utils import do_es_sweep_alert_query from cl.lib.test_helpers import RECAPSearchTestCase @@ -195,6 +196,38 @@ def extract_child_descriptions(case_item): msg=f"Child hits didn't match for case {case_title}, Got {child_descriptions}, Expected: {expected_child_descriptions} ", ) + def _count_webhook_hits_and_child_hits( + self, + webhooks, + alert_title, + expected_hits, + case_title, + expected_child_hits, + ): + """Confirm the following assertions for the search alert webhook: + - An specific alert webhook was triggered. + - The specified alert contains the expected number of hits. + - The specified case contains the expected number of child hits. + """ + + for webhook in webhooks: + if webhook["payload"]["alert"]["name"] == alert_title: + webhook_cases = webhook["payload"]["results"] + self.assertEqual( + len(webhook_cases), + expected_hits, + msg=f"Did not get the right number of hits for the alert %s. " + % alert_title, + ) + for case in webhook["payload"]["results"]: + if case_title == strip_tags(case["caseName"]): + self.assertEqual( + len(case["recap_documents"]), + expected_child_hits, + msg=f"Did not get the right number of child documents for the case %s. 
" + % case_title, + ) + async def test_recap_document_hl_matched(self) -> None: """Test recap_document_hl_matched method that determines weather a hit contains RECAPDocument HL fields.""" @@ -747,7 +780,7 @@ def test_multiple_alerts_email_hits_limit_per_alert(self) -> None: description="Motion to File 2", document_number="2", pacer_doc_id="018036652440", - plain_text= "plain text lorem" + plain_text="plain text lorem", ) docket_only_alert = AlertFactory( @@ -767,10 +800,20 @@ def test_multiple_alerts_email_hits_limit_per_alert(self) -> None: rate=Alert.REAL_TIME, name="Test Alert Cross-object", query=f'q="File Amicus Curiae" AND "Motion to File 2" AND ' - f'"plain text lorem" AND "410 Civil" AND ' - f'id:{rd_2.pk}&docket_number={docket.docket_number}' - f'&case_name="{docket.case_name}"&type=r', + f'"plain text lorem" AND "410 Civil" AND ' + f"id:{rd_2.pk}&docket_number={docket.docket_number}" + f'&case_name="{docket.case_name}"&type=r', ) + AlertFactory( + user=self.user_profile_2.user, + rate=Alert.REAL_TIME, + name="Test Alert Cross-object", + query=f'q="File Amicus Curiae" AND "Motion to File 2" AND ' + f'"plain text lorem" AND "410 Civil" AND ' + f"id:{rd_2.pk}&docket_number={docket.docket_number}" + f'&case_name="{docket.case_name}"&type=r', + ) + call_command( "cl_index_parent_and_child_docs", search_type=SEARCH_TYPES.RECAP, @@ -788,8 +831,14 @@ def test_multiple_alerts_email_hits_limit_per_alert(self) -> None: call_command("cl_send_recap_alerts") self.assertEqual( - len(mail.outbox), 1, msg="Outgoing emails don't match." + len(mail.outbox), 2, msg="Outgoing emails don't match." + ) + + # Assert webhooks. + webhook_events = WebhookEvent.objects.all().values_list( + "content", flat=True ) + self.assertEqual(len(webhook_events), 3) # Assert docket-only alert. 
html_content = self.get_html_content_from_email(mail.outbox[0]) @@ -800,7 +849,14 @@ def test_multiple_alerts_email_hits_limit_per_alert(self) -> None: html_content, docket_only_alert.name, 3, - self.de.docket.case_name, + docket.case_name, + 0, + ) + self._count_webhook_hits_and_child_hits( + list(webhook_events), + docket_only_alert.name, + 3, + docket.case_name, 0, ) @@ -814,6 +870,13 @@ def test_multiple_alerts_email_hits_limit_per_alert(self) -> None: alert_de.docket.case_name, 2, ) + self._count_webhook_hits_and_child_hits( + list(webhook_events), + recap_only_alert.name, + 1, + alert_de.docket.case_name, + 2, + ) self._assert_child_hits_content( html_content, recap_only_alert.name, @@ -831,6 +894,13 @@ def test_multiple_alerts_email_hits_limit_per_alert(self) -> None: alert_de.docket.case_name, 1, ) + self._count_webhook_hits_and_child_hits( + list(webhook_events), + cross_object_alert_with_hl.name, + 1, + alert_de.docket.case_name, + 1, + ) self._assert_child_hits_content( html_content, cross_object_alert_with_hl.name, @@ -840,19 +910,25 @@ def test_multiple_alerts_email_hits_limit_per_alert(self) -> None: # Assert HL in the cross_object_alert_with_hl self.assertIn(f"{docket.case_name}", html_content) - self.assertEqual(html_content.count(f"{docket.case_name}"), 1) + self.assertEqual( + html_content.count(f"{docket.case_name}"), 1 + ) self.assertIn(f"{docket.docket_number}", html_content) self.assertEqual( - html_content.count(f"{docket.docket_number}"), 1) + html_content.count(f"{docket.docket_number}"), 1 + ) self.assertIn(f"{rd_2.plain_text}", html_content) self.assertEqual( - html_content.count(f"{rd_2.plain_text}"), 1) + html_content.count(f"{rd_2.plain_text}"), 1 + ) self.assertIn(f"{rd_2.description}", html_content) self.assertEqual( - html_content.count(f"{rd_2.description}"), 1) + html_content.count(f"{rd_2.description}"), 1 + ) self.assertIn("File Amicus Curiae", html_content) self.assertEqual( - html_content.count("File Amicus Curiae"), 1) + html_content.count("File Amicus Curiae"), 1 + ) # Assert email text version: txt_email = mail.outbox[0].body diff --git a/cl/api/tasks.py b/cl/api/tasks.py index b70420fe95..a0d6112444 100644 --- a/cl/api/tasks.py +++ b/cl/api/tasks.py @@ -1,6 +1,8 @@ import json +from collections import defaultdict from typing import Any +from elasticsearch_dsl.response import Hit from rest_framework.renderers import JSONRenderer from cl.alerts.api_serializers import SearchAlertSerializerModel @@ -10,9 +12,12 @@ from cl.api.webhooks import send_webhook_event from cl.celery_init import app from cl.corpus_importer.api_serializers import DocketEntrySerializer -from cl.search.api_serializers import V3OAESResultSerializer +from cl.search.api_serializers import ( + RECAPESResultSerializer, + V3OAESResultSerializer, +) from cl.search.api_utils import ResultObject -from cl.search.models import DocketEntry +from cl.search.models import SEARCH_TYPES, DocketEntry @app.task() @@ -79,25 +84,47 @@ def send_docket_alert_webhook_events( @app.task() def send_es_search_alert_webhook( - results: list[dict[str, Any]], + results: list[dict[str, Any]] | list[Hit], webhook_pk: int, - alert: Alert, + alert_pk: int, ) -> None: """Send a search alert webhook event containing search results from a search alert object. :param results: The search results returned by SOLR for this alert. :param webhook_pk: The webhook endpoint ID object to send the event to. - :param alert: The search alert object. + :param alert_pk: The search alert ID. 
""" webhook = Webhook.objects.get(pk=webhook_pk) + alert = Alert.objects.get(pk=alert_pk) serialized_alert = SearchAlertSerializerModel(alert).data - es_results = [] - for result in results: - result["snippet"] = result["text"] - es_results.append(ResultObject(initial=result)) - serialized_results = V3OAESResultSerializer(es_results, many=True).data + match alert.alert_type: + case SEARCH_TYPES.ORAL_ARGUMENT: + es_results = [] + for result in results: + result["snippet"] = result["text"] + es_results.append(ResultObject(initial=result)) + serialized_results = V3OAESResultSerializer( + es_results, many=True + ).data + case SEARCH_TYPES.RECAP: + for result in results: + child_result_objects = [] + if hasattr(result, "child_docs"): + for child_doc in result.child_docs: + child_result_objects.append( + defaultdict( + lambda: None, child_doc["_source"].to_dict() + ) + ) + result["child_docs"] = child_result_objects + serialized_results = RECAPESResultSerializer( + results, many=True + ).data + case _: + # No implemented alert type. + return None post_content = { "webhook": generate_webhook_key_content(webhook), diff --git a/cl/lib/elasticsearch_utils.py b/cl/lib/elasticsearch_utils.py index 9f6447c0e1..b51d149ff6 100644 --- a/cl/lib/elasticsearch_utils.py +++ b/cl/lib/elasticsearch_utils.py @@ -4,7 +4,6 @@ import re import time import traceback -from collections import defaultdict from copy import deepcopy from dataclasses import fields from functools import reduce, wraps From b5016ba122caed7de1d207622db3fefcd30fadda Mon Sep 17 00:00:00 2001 From: Alberto Islas Date: Fri, 28 Jun 2024 21:07:55 -0600 Subject: [PATCH 09/33] fix(alerts): Schedule wly and mly RECAP Search Alerts --- .../commands/cl_send_recap_alerts.py | 244 ++++++++++++------ cl/alerts/tests/tests_recap_alerts.py | 59 ++++- 2 files changed, 221 insertions(+), 82 deletions(-) diff --git a/cl/alerts/management/commands/cl_send_recap_alerts.py b/cl/alerts/management/commands/cl_send_recap_alerts.py index d4a533c69d..a00d4cf148 100644 --- a/cl/alerts/management/commands/cl_send_recap_alerts.py +++ b/cl/alerts/management/commands/cl_send_recap_alerts.py @@ -1,3 +1,4 @@ +import copy import datetime import traceback @@ -6,12 +7,14 @@ from django.http import QueryDict from django.utils.timezone import now from elasticsearch.exceptions import RequestError, TransportError +from elasticsearch_dsl.response import Hit from redis import Redis -from cl.alerts.models import Alert +from cl.alerts.models import Alert, ScheduledAlertHit from cl.alerts.tasks import send_search_alert_emails from cl.alerts.utils import ( add_document_hit_to_alert_set, + alert_hits_limit_reached, has_document_alert_hit_been_triggered, query_includes_rd_field, recap_document_hl_matched, @@ -29,6 +32,7 @@ ) from cl.search.models import SEARCH_TYPES, Docket from cl.stats.utils import tally_stat +from cl.users.models import UserProfile def index_daily_recap_documents(): @@ -57,7 +61,7 @@ def should_docket_hit_be_included( return False -def filter_rd_alert_hits(r, alert_id, rd_hits, check_rd_hl=False): +def filter_rd_alert_hits(r: Redis, alert_id: int, rd_hits, check_rd_hl=False): """Filter RECAP document hits based on specified conditions. :param r: The Redis interface. 
@@ -85,9 +89,98 @@ def filter_rd_alert_hits(r, alert_id, rd_hits, check_rd_hl=False): return rds_to_send -def query_and_send_alerts(rate): +def query_alerts( + search_params: QueryDict, +) -> tuple[list[Hit] | None, int | None]: + try: + search_query = DocketSweepDocument.search() + return do_es_sweep_alert_query( + search_query, + search_params, + ) + except ( + UnbalancedParenthesesQuery, + UnbalancedQuotesQuery, + BadProximityQuery, + TransportError, + ConnectionError, + RequestError, + ): + traceback.print_exc() + logger.info(f"Search for this alert failed: {search_params}\n") + return None, None + + +def process_alert_hits( + r: Redis, results: list[Hit], search_params: QueryDict, alert_id: int +) -> list[Hit]: + """Process alert hits by filtering and prepare the results to send based + on alert conditions. + + :param r: The Redis instance. + :param results: A list of Hit objects containing search results. + :param search_params: Query parameters used for the search. + :param alert_id: The ID of the alert being processed. + :return: A list of Hit objects that are filtered and prepared to be sent. + """ + + includes_rd_fields = query_includes_rd_field(search_params) + results_to_send = [] + if len(results) > 0: + for hit in results: + if not includes_rd_fields: + # Possible Docket-only alert + rds_to_send = filter_rd_alert_hits( + r, alert_id, hit["child_docs"], check_rd_hl=True + ) + if rds_to_send: + # Cross-object query + hit["child_docs"] = rds_to_send + results_to_send.append(hit) + elif should_docket_hit_be_included(r, alert_id, hit.docket_id): + # Docket-only alert + hit["child_docs"] = [] + results_to_send.append(hit) + add_document_hit_to_alert_set( + r, alert_id, "d", hit.docket_id + ) + else: + # RECAP-only alerts or cross-object alerts + rds_to_send = filter_rd_alert_hits( + r, alert_id, hit["child_docs"] + ) + if rds_to_send: + # Cross-object alert + hit["child_docs"] = rds_to_send + results_to_send.append(hit) + return results_to_send + + +def send_search_alert_webhooks( + user: UserProfile.user, results_to_send: list[Hit], alert_id: int +) -> None: + """Send webhook events for search alerts if the user has SEARCH_ALERT + endpoints enabled. + + :param user: The user object whose webhooks need to be checked. + :param results_to_send: A list of Hit objects that contain the search + results to be sent. + :param alert_id: The Alert ID to be sent in the webhook. 
+ """ + user_webhooks = user.webhooks.filter( + event_type=WebhookEventType.SEARCH_ALERT, enabled=True + ) + for user_webhook in user_webhooks: + send_es_search_alert_webhook.delay( + results_to_send, user_webhook.pk, alert_id + ) + + +def query_and_send_alerts(rate: str) -> None: r = get_redis_interface("CACHE") - alert_users = User.objects.filter(alerts__rate=rate).distinct() + alert_users: UserProfile.user = User.objects.filter( + alerts__rate=rate + ).distinct() alerts_sent_count = 0 now_time = datetime.datetime.now() for user in alert_users: @@ -101,80 +194,29 @@ def query_and_send_alerts(rate): alerts_to_update = [] for alert in alerts: search_params = QueryDict(alert.query.encode(), mutable=True) - includes_rd_fields = query_includes_rd_field(search_params) - try: - search_query = DocketSweepDocument.search() - results, total_hits = do_es_sweep_alert_query( - search_query, - search_params, - ) - except ( - UnbalancedParenthesesQuery, - UnbalancedQuotesQuery, - BadProximityQuery, - TransportError, - ConnectionError, - RequestError, - ): - traceback.print_exc() - logger.info(f"Search for this alert failed: {alert.query}\n") + results, _ = query_alerts(search_params) + if not results: continue - alerts_to_update.append(alert.pk) - if len(results) > 0: - search_type = search_params.get("type", SEARCH_TYPES.RECAP) - results_to_send = [] - for hit in results: - if not includes_rd_fields: - # Possible Docket-only alert - rds_to_send = filter_rd_alert_hits( - r, alert.pk, hit["child_docs"], check_rd_hl=True - ) - if rds_to_send: - # Cross-object query - hit["child_docs"] = rds_to_send - results_to_send.append(hit) - elif should_docket_hit_be_included( - r, alert.pk, hit.docket_id - ): - # Docket-only alert - hit["child_docs"] = [] - results_to_send.append(hit) - add_document_hit_to_alert_set( - r, alert.pk, "d", hit.docket_id - ) - else: - # RECAP-only alerts or cross-object alerts - rds_to_send = filter_rd_alert_hits( - r, alert.pk, hit["child_docs"] - ) - if rds_to_send: - # Cross-object alert - hit["child_docs"] = rds_to_send - results_to_send.append(hit) - - if results_to_send: - hits.append( - [ - alert, - search_type, - results_to_send, - len(results_to_send), - ] - ) - alert.query_run = search_params.urlencode() - alert.date_last_hit = now() - alert.save() - - # Send webhook event if the user has a SEARCH_ALERT - # endpoint enabled. 
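For context on the loop above: each alert's query is persisted as a URL-encoded string, so every run re-parses it into a mutable `QueryDict` before querying ES, and `urlencode()` round-trips it back into `alert.query_run`. A standalone sketch of that round-trip (the stored query value is hypothetical, and `settings.configure()` merely stands in for the command's normal Django setup):

    from django.conf import settings

    settings.configure()  # QueryDict needs settings when used standalone

    from django.http import QueryDict

    stored = 'q="401 Civil"&type=r'  # hypothetical Alert.query value
    params = QueryDict(stored.encode(), mutable=True)
    assert params.get("type") == "r"
    assert params.get("q") == '"401 Civil"'
    params["order_by"] = "score desc"  # mutable=True permits normalization
    print(params.urlencode())  # percent-encoded form saved to query_run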
- user_webhooks = user.webhooks.filter( - event_type=WebhookEventType.SEARCH_ALERT, enabled=True - ) - for user_webhook in user_webhooks: - send_es_search_alert_webhook.delay( - results_to_send, user_webhook.pk, alert.pk - ) + search_type = search_params.get("type", SEARCH_TYPES.RECAP) + results_to_send = process_alert_hits( + r, results, search_params, alert.pk + ) + if results_to_send: + hits.append( + [ + alert, + search_type, + results_to_send, + len(results_to_send), + ] + ) + alert.query_run = search_params.urlencode() # type: ignore + alert.date_last_hit = now() + alert.save() + + # Send webhooks + send_search_alert_webhooks(user, results_to_send, alert.pk) if hits: send_search_alert_emails.delay([(user.pk, hits)]) @@ -188,9 +230,50 @@ def query_and_send_alerts(rate): logger.info(f"Sent {alerts_sent_count} {rate} email alerts.") -def query_and_schedule_wly_and_mly_alerts(): - # TODO implement - pass +def query_and_schedule_alerts(rate: str): + r = get_redis_interface("CACHE") + alert_users = User.objects.filter(alerts__rate=rate).distinct() + for user in alert_users: + alerts = user.alerts.filter(rate=rate, alert_type=SEARCH_TYPES.RECAP) + logger.info(f"Running '{rate}' alerts for user '{user}': {alerts}") + scheduled_hits_to_create = [] + for alert in alerts: + search_params = QueryDict(alert.query.encode(), mutable=True) + results, _ = query_alerts(search_params) + if not results: + continue + results_to_send = process_alert_hits( + r, results, search_params, alert.pk + ) + if results_to_send: + for hit in results_to_send: + # Schedule DAILY, WEEKLY and MONTHLY Alerts + if alert_hits_limit_reached(alert.pk, user.pk): + # Skip storing hits for this alert-user combination because + # the SCHEDULED_ALERT_HITS_LIMIT has been reached. + continue + + child_result_objects = [] + hit_copy = copy.deepcopy(hit) + if hasattr(hit_copy, "child_docs"): + for child_doc in hit_copy.child_docs: + child_result_objects.append( + child_doc["_source"].to_dict() + ) + hit_copy["child_docs"] = child_result_objects + scheduled_hits_to_create.append( + ScheduledAlertHit( + user=user, + alert=alert, + document_content=hit_copy.to_dict(), + ) + ) + # Send webhooks + send_search_alert_webhooks(user, results_to_send, alert.pk) + + # Create scheduled WEEKLY and MONTHLY Alerts in bulk. 
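One note on the rows collected above: `document_content` must hold plain, JSON-serializable data, which is why each hit is deep-copied and its `child_docs` are rebuilt from the nested `_source` AttrDicts before `to_dict()` is called. A small data-only sketch of that flattening (the hit shape is illustrative; real hits come from the ES sweep query and carry many more fields):

    import copy

    # Stand-in for an elasticsearch_dsl Hit already converted to a dict.
    hit = {
        "docket_id": 101,
        "caseName": "SUBPOENAS SERVED CASE",
        "child_docs": [{"_source": {"description": "Motion to File"}}],
    }
    hit_copy = copy.deepcopy(hit)
    hit_copy["child_docs"] = [
        child["_source"] for child in hit_copy["child_docs"]
    ]
    # hit_copy is now safe to store in ScheduledAlertHit.document_content;
    # the bulk_create below then persists all collected rows in one query.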
+ if scheduled_hits_to_create: + ScheduledAlertHit.objects.bulk_create(scheduled_hits_to_create) class Command(VerboseCommand): @@ -201,4 +284,5 @@ def handle(self, *args, **options): index_daily_recap_documents() query_and_send_alerts(Alert.REAL_TIME) query_and_send_alerts(Alert.DAILY) - query_and_schedule_wly_and_mly_alerts() + query_and_schedule_alerts(Alert.WEEKLY) + query_and_schedule_alerts(Alert.MONTHLY) diff --git a/cl/alerts/tests/tests_recap_alerts.py b/cl/alerts/tests/tests_recap_alerts.py index d910c67272..d8e203edee 100644 --- a/cl/alerts/tests/tests_recap_alerts.py +++ b/cl/alerts/tests/tests_recap_alerts.py @@ -11,7 +11,7 @@ from lxml import html from cl.alerts.factories import AlertFactory -from cl.alerts.models import SEARCH_TYPES, Alert +from cl.alerts.models import SEARCH_TYPES, Alert, ScheduledAlertHit from cl.alerts.utils import query_includes_rd_field, recap_document_hl_matched from cl.api.factories import WebhookFactory from cl.api.models import WebhookEvent, WebhookEventType @@ -350,7 +350,7 @@ def test_filter_recap_alerts_to_send(self) -> None: AlertFactory( user=self.user_profile_2.user, rate=Alert.REAL_TIME, - name="Test RT Opinion Alert", + name="Test RT RECAP Alert", query='q="401 Civil"', ) AlertFactory( @@ -944,3 +944,58 @@ def test_multiple_alerts_email_hits_limit_per_alert(self) -> None: txt_email, msg="RECAPDocument wasn't found in the email content.", ) + + def test_schedule_wly_and_mly_recap_alerts(self) -> None: + """Test Weekly and Monthly RECAP Search Alerts are scheduled daily + before being sent later. + """ + + docket_only_alert = AlertFactory( + user=self.user_profile.user, + rate=Alert.WEEKLY, + name="Test Alert Docket Only", + query='q="401 Civil"&type=r', + ) + recap_only_alert = AlertFactory( + user=self.user_profile.user, + rate=Alert.MONTHLY, + name="Test Alert RECAP Only Docket Entry", + query=f"q=docket_entry_id:{self.de.pk}&type=r", + ) + cross_object_alert_with_hl = AlertFactory( + user=self.user_profile.user, + rate=Alert.WEEKLY, + name="Test Alert Cross-object", + query=f'q="401 Civil" id:{self.rd.pk}&type=r', + ) + with mock.patch( + "cl.api.webhooks.requests.post", + side_effect=lambda *args, **kwargs: MockResponse( + 200, mock_raw=True + ), + ), time_machine.travel(self.mock_date, tick=False): + call_command("cl_send_recap_alerts") + + self.assertEqual( + len(mail.outbox), 0, msg="Outgoing emails don't match." + ) + schedule_alerts = ScheduledAlertHit.objects.all() + self.assertEqual(schedule_alerts.count(), 3) + + # Assert webhooks. + webhook_events = WebhookEvent.objects.all().values_list( + "content", flat=True + ) + self.assertEqual(len(webhook_events), 3) + + # Send Weekly alerts and check assertions. + call_command("cl_send_scheduled_alerts", rate=Alert.WEEKLY) + self.assertEqual( + len(mail.outbox), 1, msg="Outgoing emails don't match." + ) + + # Send Monthly alerts and check assertions. + call_command("cl_send_scheduled_alerts", rate=Alert.MONTHLY) + self.assertEqual( + len(mail.outbox), 2, msg="Outgoing emails don't match." 
+ ) From 4a128bf1a3800df58b9db2f4f13765e0685a318e Mon Sep 17 00:00:00 2001 From: Alberto Islas Date: Mon, 1 Jul 2024 20:25:54 -0600 Subject: [PATCH 10/33] fix(alerts): Copy documents from the main index to the sweep index using the Re Index API --- .../commands/cl_send_recap_alerts.py | 237 ++++++++++++++++-- cl/alerts/tests/tests_recap_alerts.py | 198 +++++++++++++-- cl/search/documents.py | 25 +- cl/search/es_indices.py | 9 - .../cl_index_parent_and_child_docs.py | 3 - cl/search/tasks.py | 18 +- 6 files changed, 414 insertions(+), 76 deletions(-) diff --git a/cl/alerts/management/commands/cl_send_recap_alerts.py b/cl/alerts/management/commands/cl_send_recap_alerts.py index a00d4cf148..ec88eed5cc 100644 --- a/cl/alerts/management/commands/cl_send_recap_alerts.py +++ b/cl/alerts/management/commands/cl_send_recap_alerts.py @@ -1,12 +1,17 @@ import copy import datetime +import time import traceback +from typing import Any +import pytz from asgiref.sync import async_to_sync from django.contrib.auth.models import User from django.http import QueryDict -from django.utils.timezone import now +from django.utils import timezone +from elasticsearch import Elasticsearch from elasticsearch.exceptions import RequestError, TransportError +from elasticsearch_dsl import connections from elasticsearch_dsl.response import Hit from redis import Redis @@ -24,7 +29,7 @@ from cl.lib.command_utils import VerboseCommand, logger from cl.lib.elasticsearch_utils import do_es_sweep_alert_query from cl.lib.redis_utils import get_redis_interface -from cl.search.documents import DocketSweepDocument +from cl.search.documents import DocketDocument, DocketSweepDocument from cl.search.exception import ( BadProximityQuery, UnbalancedParenthesesQuery, @@ -35,9 +40,197 @@ from cl.users.models import UserProfile -def index_daily_recap_documents(): - # TODO implement - pass +def get_task_status(task_id: str, es: Elasticsearch) -> dict[str, Any]: + """Fetch the status of a task from Elasticsearch. + + :param task_id: The ID of the task to fetch the status for. + :param es: The Elasticsearch client instance. + :return: The status of the task if successful, or an empty dictionary if + an error occurs. + """ + try: + return es.tasks.get(task_id=task_id) + except ( + TransportError, + ConnectionError, + RequestError, + ) as e: + logger.error("Error getting sweep alert index task status: %s", e) + return {} + + +def index_daily_recap_documents( + r: Redis, source_index: str, target_index: str, testing: bool = False +) -> int: + """Index Dockets added/modified during the day and all their RECAPDocuments + and RECAPDocuments added/modified during the day and their parent Dockets. + It uses the ES re_index API, + + :param r: Redis client instance. + :param source_index: The source Elasticsearch index from which documents + will be queried. + :param target_index: The target Elasticsearch index to which documents will + be re-indexed. + :param testing: Boolean flag for testing mode. + :return: The total number of documents re-indexed. + """ + + if not r.exists("alert_sweep:query_date"): + # In case of a failure, store the date when alerts should be queried in + # Redis, so the command can be resumed. + local_now = timezone.localtime().replace(tzinfo=None) + local_midnight = local_now.replace( + hour=0, minute=0, second=0, microsecond=0 + ) + r.set("alert_sweep:query_date", local_midnight.isoformat()) + + else: + # If "alert_sweep:query_date" already exists get it from Redis. 
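+        # (Re-reading the stored date lets a resumed run keep querying the
+        # original window instead of shifting to the current day.)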
+ local_midnight_str: str = str(r.get("alert_sweep:query_date")) + local_midnight = datetime.datetime.fromisoformat(local_midnight_str) + + es = connections.get_connection() + # Convert the local (PDT) midnight time to UTC + local_timezone = pytz.timezone(timezone.get_current_timezone_name()) + local_midnight_localized = local_timezone.localize(local_midnight) + local_midnight_utc = local_midnight_localized.astimezone(pytz.utc) + next_day_utc = local_midnight_utc + datetime.timedelta(days=1) + + today_datetime_iso = local_midnight_utc.isoformat().replace("+00:00", "Z") + next_day_utc_iso = next_day_utc.isoformat().replace("+00:00", "Z") + + # Re Index API query. + query = { + "bool": { + "should": [ + # Dockets added/modified today + { + "bool": { + "must": [ + { + "range": { + "timestamp": { + "gte": today_datetime_iso, + "lt": next_day_utc_iso, + } + } + }, + {"term": {"docket_child": "docket"}}, + ] + } + }, + # RECAPDocuments with parents added/modified today + { + "has_parent": { + "parent_type": "docket", + "query": { + "range": { + "timestamp": { + "gte": today_datetime_iso, + "lt": next_day_utc_iso, + } + } + }, + } + }, + # RECAPDocuments added/modified today + { + "bool": { + "must": [ + { + "range": { + "timestamp": { + "gte": today_datetime_iso, + "lt": next_day_utc_iso, + } + } + }, + {"term": {"docket_child": "recap_document"}}, + ] + } + }, + # Dockets that are parents of RECAPDocuments added/modified today + { + "has_child": { + "type": "recap_document", + "query": { + "range": { + "timestamp": { + "gte": today_datetime_iso, + "lt": next_day_utc_iso, + } + } + }, + } + }, + ] + } + } + + if not r.exists("alert_sweep:task_id"): + # In case of a failure, store the task_id in Redis so the command + # can be resumed. + response = es.reindex( + source={"index": source_index, "query": query}, + dest={"index": target_index}, + wait_for_completion=False, + refresh=True, + ) + # Store the task ID in Redis + task_id = response["task"] + r.set("alert_sweep:task_id", task_id) + else: + task_id = r.get("alert_sweep:task_id") + + estimated_time_remaining = 0.1 if testing else 60 + time.sleep(estimated_time_remaining) + task_info = get_task_status(task_id, es) + if task_info: + status = task_info["task"]["status"] + created = status["created"] + total = status["total"] + else: + task_info["completed"] = False + created = 0 + total = 0 + + iterations_count = 0 + while not task_info["completed"]: + logger.info( + f"Task progress: {created}/{total} documents. Estimated time to" + f" finish: {estimated_time_remaining}." 
+ ) + task_info = get_task_status(task_id, es) + time.sleep(estimated_time_remaining) + if task_info and not task_info["completed"]: + status = task_info["task"]["status"] + start_time_millis = task_info["task"]["start_time_in_millis"] + start_time = datetime.datetime.fromtimestamp( + start_time_millis / 1000.0 + ) + created = status["created"] + total = status["total"] + if total and created: + estimated_time_remaining = datetime.timedelta( + seconds=( + (datetime.datetime.now() - start_time).total_seconds() + / created + ) + * (total - created) + ).total_seconds() + if not task_info: + iterations_count += 1 + if iterations_count > 10: + logger.error( + "Re_index alert sweep index task has failed: %s/%s", + created, + total, + ) + break + + r.delete("alert_sweep:query_date") + r.delete("alert_sweep:task_id") + return total def should_docket_hit_be_included( @@ -56,7 +249,7 @@ def should_docket_hit_be_included( return False date_modified = docket.date_modified.date() if not has_document_alert_hit_been_triggered(r, alert_id, "d", docket_id): - if date_modified == now().date(): + if date_modified == timezone.now().date(): return True return False @@ -176,8 +369,7 @@ def send_search_alert_webhooks( ) -def query_and_send_alerts(rate: str) -> None: - r = get_redis_interface("CACHE") +def query_and_send_alerts(r: Redis, rate: str) -> None: alert_users: UserProfile.user = User.objects.filter( alerts__rate=rate ).distinct() @@ -212,7 +404,7 @@ def query_and_send_alerts(rate: str) -> None: ] ) alert.query_run = search_params.urlencode() # type: ignore - alert.date_last_hit = now() + alert.date_last_hit = timezone.now() alert.save() # Send webhooks @@ -230,8 +422,7 @@ def query_and_send_alerts(rate: str) -> None: logger.info(f"Sent {alerts_sent_count} {rate} email alerts.") -def query_and_schedule_alerts(rate: str): - r = get_redis_interface("CACHE") +def query_and_schedule_alerts(r: Redis, rate: str): alert_users = User.objects.filter(alerts__rate=rate).distinct() for user in alert_users: alerts = user.alerts.filter(rate=rate, alert_type=SEARCH_TYPES.RECAP) @@ -279,10 +470,24 @@ def query_and_schedule_alerts(rate: str): class Command(VerboseCommand): help = "Send RECAP Search Alerts." 
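+
+    # index_daily_recap_documents stores "alert_sweep:query_date" and
+    # "alert_sweep:task_id" in Redis, so an interrupted run of this command
+    # can resume the same day's sweep instead of starting over.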
+ def add_arguments(self, parser): + parser.add_argument( + "--testing-mode", + action="store_true", + help="Use this flag for testing purposes.", + ) + def handle(self, *args, **options): super().handle(*args, **options) - index_daily_recap_documents() - query_and_send_alerts(Alert.REAL_TIME) - query_and_send_alerts(Alert.DAILY) - query_and_schedule_alerts(Alert.WEEKLY) - query_and_schedule_alerts(Alert.MONTHLY) + testing_mode = options.get("testing_mode", False) + r = get_redis_interface("CACHE") + index_daily_recap_documents( + r, + DocketDocument._index._name, + DocketSweepDocument._index._name, + testing=testing_mode, + ) + query_and_send_alerts(r, Alert.REAL_TIME) + query_and_send_alerts(r, Alert.DAILY) + query_and_schedule_alerts(r, Alert.WEEKLY) + query_and_schedule_alerts(r, Alert.MONTHLY) diff --git a/cl/alerts/tests/tests_recap_alerts.py b/cl/alerts/tests/tests_recap_alerts.py index d8e203edee..80cd0889be 100644 --- a/cl/alerts/tests/tests_recap_alerts.py +++ b/cl/alerts/tests/tests_recap_alerts.py @@ -8,17 +8,22 @@ from django.test.utils import override_settings from django.utils.html import strip_tags from django.utils.timezone import now +from elasticsearch_dsl import Q from lxml import html from cl.alerts.factories import AlertFactory +from cl.alerts.management.commands.cl_send_recap_alerts import ( + index_daily_recap_documents, +) from cl.alerts.models import SEARCH_TYPES, Alert, ScheduledAlertHit from cl.alerts.utils import query_includes_rd_field, recap_document_hl_matched from cl.api.factories import WebhookFactory from cl.api.models import WebhookEvent, WebhookEventType from cl.donate.models import NeonMembership from cl.lib.elasticsearch_utils import do_es_sweep_alert_query +from cl.lib.redis_utils import get_redis_interface from cl.lib.test_helpers import RECAPSearchTestCase -from cl.search.documents import DocketSweepDocument +from cl.search.documents import DocketDocument, DocketSweepDocument from cl.search.factories import ( DocketEntryWithParentsFactory, DocketFactory, @@ -50,7 +55,6 @@ def setUpTestData(cls): queue="celery", pk_offset=0, testing_mode=True, - sweep_index=True, ) cls.user_profile = UserProfileWithParentsFactory() @@ -69,6 +73,10 @@ def setUpTestData(cls): enabled=True, ) + def setUp(self): + DocketSweepDocument._index.delete(ignore=404) + DocketSweepDocument.init() + @staticmethod def get_html_content_from_email(email_content): html_content = None @@ -366,7 +374,7 @@ def test_filter_recap_alerts_to_send(self) -> None: 200, mock_raw=True ), ), time_machine.travel(self.mock_date, tick=False): - call_command("cl_send_recap_alerts") + call_command("cl_send_recap_alerts", testing_mode=True) # Only the RECAP RT alert for a member and the RECAP DLY alert are sent. self.assertEqual( @@ -378,6 +386,152 @@ def test_filter_recap_alerts_to_send(self) -> None: html_content = self.get_html_content_from_email(mail.outbox[1]) self.assertIn(dly_recap_alert.name, html_content) + def test_index_daily_recap_documents(self) -> None: + """Test index_daily_recap_documents method over different documents + conditions. 
+ """ + r = get_redis_interface("CACHE") + recap_search = DocketDocument.search() + recap_dockets = recap_search.query(Q("match", docket_child="docket")) + self.assertEqual(recap_dockets.count(), 2) + + recap_documents = recap_search.query( + Q("match", docket_child="recap_document") + ) + self.assertEqual(recap_documents.count(), 3) + + sweep_search = DocketSweepDocument.search() + self.assertEqual( + sweep_search.count(), + 0, + msg="Wrong number of documents in the sweep index.", + ) + + # Index documents based Dockets changed today + all their + # RECAPDocuments indexed the same day. + with time_machine.travel(self.mock_date, tick=False): + documents_indexed = index_daily_recap_documents( + r, + DocketDocument._index._name, + DocketSweepDocument._index._name, + testing=True, + ) + self.assertEqual( + documents_indexed, 5, msg="Wrong number of documents indexed." + ) + + sweep_search = DocketSweepDocument.search() + dockets_sweep = sweep_search.query(Q("match", docket_child="docket")) + self.assertEqual(dockets_sweep.count(), 2) + + documents_sweep = sweep_search.query( + Q("match", docket_child="recap_document") + ) + self.assertEqual(documents_sweep.count(), 3) + + # Index Docket changed today + their RECAPDocuments indexed on + # previous days + with time_machine.travel(self.mock_date, tick=False): + docket = DocketFactory( + court=self.court, + case_name="SUBPOENAS SERVED CASE", + docket_number="1:21-bk-1234", + source=Docket.RECAP, + ) + call_command( + "cl_index_parent_and_child_docs", + search_type=SEARCH_TYPES.RECAP, + queue="celery", + pk_offset=0, + testing_mode=True, + ) + + # Its related RD is ingested two days before. + two_days_before = now() - datetime.timedelta(days=2) + mock_two_days_before = two_days_before.replace(hour=5) + with time_machine.travel(mock_two_days_before, tick=False): + alert_de = DocketEntryWithParentsFactory( + docket=docket, + entry_number=1, + date_filed=datetime.date(2024, 8, 19), + description="MOTION for Leave to File Amicus Curiae Lorem Served", + ) + rd = RECAPDocumentFactory( + docket_entry=alert_de, + description="Motion to File", + document_number="1", + is_available=True, + ) + call_command( + "cl_index_parent_and_child_docs", + search_type=SEARCH_TYPES.RECAP, + queue="celery", + pk_offset=0, + testing_mode=True, + ) + + # Run the indexer. + with time_machine.travel(self.mock_date, tick=False): + documents_indexed = index_daily_recap_documents( + r, + DocketDocument._index._name, + DocketSweepDocument._index._name, + testing=True, + ) + self.assertEqual( + documents_indexed, 7, msg="Wrong number of documents indexed." + ) + + # Index a RECAPDocument changed today including its parent Docket + # indexed on previous days. + with time_machine.travel(mock_two_days_before, tick=False): + docket_2 = DocketFactory( + court=self.court, + case_name="SUBPOENAS SERVED CASE OFF", + docket_number="1:21-bk-1250", + source=Docket.RECAP, + ) + call_command( + "cl_index_parent_and_child_docs", + search_type=SEARCH_TYPES.RECAP, + queue="celery", + pk_offset=0, + testing_mode=True, + ) + + # Its related RD is ingested today. 
+ with time_machine.travel(self.mock_date, tick=False): + alert_de_2 = DocketEntryWithParentsFactory( + docket=docket_2, + entry_number=1, + date_filed=datetime.date(2024, 8, 19), + description="MOTION for Leave to File Amicus Curiae Lorem Served", + ) + rd_2 = RECAPDocumentFactory( + docket_entry=alert_de_2, + description="Motion to File Lorem", + document_number="2", + ) + call_command( + "cl_index_parent_and_child_docs", + search_type=SEARCH_TYPES.RECAP, + queue="celery", + pk_offset=0, + testing_mode=True, + ) + + # Run the indexer. + with time_machine.travel(self.mock_date, tick=False): + documents_indexed = index_daily_recap_documents( + r, + DocketDocument._index._name, + DocketSweepDocument._index._name, + testing=True, + ) + self.assertEqual( + documents_indexed, 9, msg="Wrong number of documents indexed." + ) + def test_filter_out_alerts_to_send_by_query_and_hits(self) -> None: """Test RECAP alerts can be properly filtered out according to their query and hits matched conditions. @@ -410,7 +564,7 @@ def test_filter_out_alerts_to_send_by_query_and_hits(self) -> None: 200, mock_raw=True ), ), time_machine.travel(self.mock_date, tick=False): - call_command("cl_send_recap_alerts") + call_command("cl_send_recap_alerts", testing_mode=True) self.assertEqual( len(mail.outbox), 1, msg="Outgoing emails don't match." @@ -473,14 +627,13 @@ def test_filter_out_alerts_to_send_by_query_and_hits(self) -> None: pacer_doc_id="018036652436", plain_text="plain text for 018036652436", ) - call_command( - "cl_index_parent_and_child_docs", - search_type=SEARCH_TYPES.RECAP, - queue="celery", - pk_offset=0, - testing_mode=True, - sweep_index=True, - ) + call_command( + "cl_index_parent_and_child_docs", + search_type=SEARCH_TYPES.RECAP, + queue="celery", + pk_offset=0, + testing_mode=True, + ) with mock.patch( "cl.api.webhooks.requests.post", @@ -488,7 +641,7 @@ def test_filter_out_alerts_to_send_by_query_and_hits(self) -> None: 200, mock_raw=True ), ), time_machine.travel(self.mock_date, tick=False): - call_command("cl_send_recap_alerts") + call_command("cl_send_recap_alerts", testing_mode=True) # The RD ingestion's shouldn't match the docket-only alert. self.assertEqual( len(mail.outbox), 1, msg="Outgoing emails don't match." @@ -507,7 +660,7 @@ def test_filter_out_alerts_to_send_by_query_and_hits(self) -> None: 200, mock_raw=True ), ), time_machine.travel(self.mock_date, tick=False): - call_command("cl_send_recap_alerts") + call_command("cl_send_recap_alerts", testing_mode=True) # 1 New alert should be triggered. self.assertEqual( len(mail.outbox), 2, msg="Outgoing emails don't match." @@ -541,7 +694,7 @@ def test_filter_out_alerts_to_send_by_query_and_hits(self) -> None: 200, mock_raw=True ), ), time_machine.travel(self.mock_date, tick=False): - call_command("cl_send_recap_alerts") + call_command("cl_send_recap_alerts", testing_mode=True) # No new alert should be triggered. self.assertEqual( len(mail.outbox), 2, msg="Outgoing emails don't match." @@ -564,7 +717,6 @@ def test_filter_out_alerts_to_send_by_query_and_hits(self) -> None: queue="celery", pk_offset=0, testing_mode=True, - sweep_index=True, ) with mock.patch( @@ -573,7 +725,7 @@ def test_filter_out_alerts_to_send_by_query_and_hits(self) -> None: 200, mock_raw=True ), ), time_machine.travel(self.mock_date, tick=False): - call_command("cl_send_recap_alerts") + call_command("cl_send_recap_alerts", testing_mode=True) # A new alert should be triggered containing only the new RD created. 
self.assertEqual( @@ -602,7 +754,7 @@ def test_filter_out_alerts_to_send_by_query_and_hits(self) -> None: 200, mock_raw=True ), ), time_machine.travel(self.mock_date, tick=False): - call_command("cl_send_recap_alerts") + call_command("cl_send_recap_alerts", testing_mode=True) # A new alert should be triggered containing two RDs (rd and rd_2) self.assertEqual( @@ -636,7 +788,7 @@ def test_filter_out_alerts_to_send_by_query_and_hits(self) -> None: 200, mock_raw=True ), ), time_machine.travel(self.mock_date, tick=False): - call_command("cl_send_recap_alerts") + call_command("cl_send_recap_alerts", testing_mode=True) # A new alert should be triggered containing one RD (rd_2) self.assertEqual( @@ -687,7 +839,6 @@ def test_limit_alert_case_child_hits(self) -> None: queue="celery", pk_offset=0, testing_mode=True, - sweep_index=True, ) recap_only_alert = AlertFactory( user=self.user_profile.user, @@ -701,7 +852,7 @@ def test_limit_alert_case_child_hits(self) -> None: 200, mock_raw=True ), ), time_machine.travel(self.mock_date, tick=False): - call_command("cl_send_recap_alerts") + call_command("cl_send_recap_alerts", testing_mode=True) self.assertEqual( len(mail.outbox), 1, msg="Outgoing emails don't match." @@ -820,7 +971,6 @@ def test_multiple_alerts_email_hits_limit_per_alert(self) -> None: queue="celery", pk_offset=0, testing_mode=True, - sweep_index=True, ) with mock.patch( "cl.api.webhooks.requests.post", @@ -828,7 +978,7 @@ def test_multiple_alerts_email_hits_limit_per_alert(self) -> None: 200, mock_raw=True ), ), time_machine.travel(self.mock_date, tick=False): - call_command("cl_send_recap_alerts") + call_command("cl_send_recap_alerts", testing_mode=True) self.assertEqual( len(mail.outbox), 2, msg="Outgoing emails don't match." @@ -974,7 +1124,7 @@ def test_schedule_wly_and_mly_recap_alerts(self) -> None: 200, mock_raw=True ), ), time_machine.travel(self.mock_date, tick=False): - call_command("cl_send_recap_alerts") + call_command("cl_send_recap_alerts", testing_mode=True) self.assertEqual( len(mail.outbox), 0, msg="Outgoing emails don't match." 
diff --git a/cl/search/documents.py b/cl/search/documents.py index d64f4eb724..85c082ab25 100644 --- a/cl/search/documents.py +++ b/cl/search/documents.py @@ -1,5 +1,6 @@ from datetime import datetime +from django.conf import settings from django.http import QueryDict from django.utils.html import escape, strip_tags from django_elasticsearch_dsl import Document, fields @@ -29,7 +30,6 @@ parenthetical_group_index, people_db_index, recap_index, - recap_sweep_index, ) from cl.search.forms import SearchForm from cl.search.models import ( @@ -1829,17 +1829,22 @@ def prepare_cluster_child(self, instance): return "opinion_cluster" -@recap_sweep_index.document class DocketSweepDocument(DocketDocument): - - class Django: - model = Docket - ignore_signals = True + class Index: + name = "recap_sweep" + settings = { + "number_of_shards": settings.ELASTICSEARCH_RECAP_NUMBER_OF_SHARDS, + "number_of_replicas": settings.ELASTICSEARCH_RECAP_NUMBER_OF_REPLICAS, + "analysis": settings.ELASTICSEARCH_DSL["analysis"], + } -@recap_sweep_index.document class ESRECAPSweepDocument(ESRECAPDocument): - class Django: - model = RECAPDocument - ignore_signals = True + class Index: + name = "recap_sweep" + settings = { + "number_of_shards": settings.ELASTICSEARCH_RECAP_NUMBER_OF_SHARDS, + "number_of_replicas": settings.ELASTICSEARCH_RECAP_NUMBER_OF_REPLICAS, + "analysis": settings.ELASTICSEARCH_DSL["analysis"], + } diff --git a/cl/search/es_indices.py b/cl/search/es_indices.py index bf129f0704..717a6abee9 100644 --- a/cl/search/es_indices.py +++ b/cl/search/es_indices.py @@ -53,12 +53,3 @@ number_of_replicas=settings.ELASTICSEARCH_OPINION_NUMBER_OF_REPLICAS, analysis=settings.ELASTICSEARCH_DSL["analysis"], ) - - -# Define RECAP Nested elasticsearch index -recap_sweep_index = Index("recap_sweep") -recap_sweep_index.settings( - number_of_shards=settings.ELASTICSEARCH_RECAP_NUMBER_OF_SHARDS, - number_of_replicas=settings.ELASTICSEARCH_RECAP_NUMBER_OF_REPLICAS, - analysis=settings.ELASTICSEARCH_DSL["analysis"], -) diff --git a/cl/search/management/commands/cl_index_parent_and_child_docs.py b/cl/search/management/commands/cl_index_parent_and_child_docs.py index c4edec4ba0..57cdf390fc 100644 --- a/cl/search/management/commands/cl_index_parent_and_child_docs.py +++ b/cl/search/management/commands/cl_index_parent_and_child_docs.py @@ -480,7 +480,6 @@ def process_queryset( pk_offset = self.options["pk_offset"] document_type = self.options.get("document_type", None) missing = self.options.get("missing", False) - sweep_index = self.options.get("sweep_index", False) fields_map = {} if event_doc_type == EventTable.DOCKET: fields_map = recap_document_field_mapping["save"][Docket][ @@ -535,7 +534,6 @@ def process_queryset( chunk, search_type, testing_mode=testing_mode, - sweep_index=sweep_index, ).set(queue=queue).apply_async() case "index_parent_or_child_docs": @@ -544,7 +542,6 @@ def process_queryset( search_type, document_type, testing_mode=testing_mode, - sweep_index=sweep_index, ).set(queue=queue).apply_async() case "remove_parent_and_child_docs_by_query": remove_parent_and_child_docs_by_query.si( diff --git a/cl/search/tasks.py b/cl/search/tasks.py index 38bb8cdbe1..5039613578 100644 --- a/cl/search/tasks.py +++ b/cl/search/tasks.py @@ -968,7 +968,6 @@ def index_parent_and_child_docs( instance_ids: list[int], search_type: str, testing_mode: bool = False, - sweep_index: bool = False, ) -> None: """Index parent and child documents in Elasticsearch. 
@@ -988,12 +987,8 @@ def index_parent_and_child_docs( child_es_document = PositionDocument child_id_property = "POSITION" case SEARCH_TYPES.RECAP: - parent_es_document = ( - DocketSweepDocument if sweep_index else DocketDocument - ) - child_es_document = ( - ESRECAPSweepDocument if sweep_index else ESRECAPDocument - ) + parent_es_document = DocketDocument + child_es_document = ESRECAPDocument child_id_property = "RECAP" case SEARCH_TYPES.OPINION: parent_es_document = OpinionClusterDocument @@ -1078,7 +1073,6 @@ def index_parent_or_child_docs( search_type: str, document_type: str | None, testing_mode: bool = False, - sweep_index: bool = False, ) -> None: """Index parent or child documents in Elasticsearch. @@ -1097,12 +1091,8 @@ def index_parent_or_child_docs( child_instances = QuerySet() match search_type: case SEARCH_TYPES.RECAP: - parent_es_document = ( - DocketSweepDocument if sweep_index else DocketDocument - ) - child_es_document = ( - ESRECAPSweepDocument if sweep_index else ESRECAPDocument - ) + parent_es_document = DocketDocument + child_es_document = ESRECAPDocument child_id_property = "RECAP" if document_type == "parent": parent_instances = Docket.objects.filter(pk__in=instance_ids) From 3a4a456259c9ec4cb6286232cd3d75e544163d67 Mon Sep 17 00:00:00 2001 From: Alberto Islas Date: Tue, 2 Jul 2024 10:16:23 -0600 Subject: [PATCH 11/33] fix(alerts): Fixed RECAPSweepDocument index mapping - Enabled RECAP Search alerts UI behind a waffle. - Added alert frequency estimation for RECAP --- .../commands/cl_send_recap_alerts.py | 6 +- cl/alerts/tests/tests_recap_alerts.py | 95 +++++++++++++++---- cl/api/urls.py | 2 +- cl/api/views.py | 17 +++- cl/custom_filters/templatetags/extras.py | 26 ++++- cl/lib/elasticsearch_utils.py | 2 +- cl/search/documents.py | 13 +-- cl/search/tasks.py | 2 - cl/search/templates/search.html | 7 +- 9 files changed, 127 insertions(+), 43 deletions(-) diff --git a/cl/alerts/management/commands/cl_send_recap_alerts.py b/cl/alerts/management/commands/cl_send_recap_alerts.py index ec88eed5cc..864dddbf51 100644 --- a/cl/alerts/management/commands/cl_send_recap_alerts.py +++ b/cl/alerts/management/commands/cl_send_recap_alerts.py @@ -29,7 +29,7 @@ from cl.lib.command_utils import VerboseCommand, logger from cl.lib.elasticsearch_utils import do_es_sweep_alert_query from cl.lib.redis_utils import get_redis_interface -from cl.search.documents import DocketDocument, DocketSweepDocument +from cl.search.documents import DocketDocument, RECAPSweepDocument from cl.search.exception import ( BadProximityQuery, UnbalancedParenthesesQuery, @@ -286,7 +286,7 @@ def query_alerts( search_params: QueryDict, ) -> tuple[list[Hit] | None, int | None]: try: - search_query = DocketSweepDocument.search() + search_query = RECAPSweepDocument.search() return do_es_sweep_alert_query( search_query, search_params, @@ -484,7 +484,7 @@ def handle(self, *args, **options): index_daily_recap_documents( r, DocketDocument._index._name, - DocketSweepDocument._index._name, + RECAPSweepDocument._index._name, testing=testing_mode, ) query_and_send_alerts(r, Alert.REAL_TIME) diff --git a/cl/alerts/tests/tests_recap_alerts.py b/cl/alerts/tests/tests_recap_alerts.py index 80cd0889be..59b427841c 100644 --- a/cl/alerts/tests/tests_recap_alerts.py +++ b/cl/alerts/tests/tests_recap_alerts.py @@ -6,6 +6,7 @@ from django.core import mail from django.core.management import call_command from django.test.utils import override_settings +from django.urls import reverse from django.utils.html import strip_tags from 
django.utils.timezone import now from elasticsearch_dsl import Q @@ -23,7 +24,7 @@ from cl.lib.elasticsearch_utils import do_es_sweep_alert_query from cl.lib.redis_utils import get_redis_interface from cl.lib.test_helpers import RECAPSearchTestCase -from cl.search.documents import DocketDocument, DocketSweepDocument +from cl.search.documents import DocketDocument, RECAPSweepDocument from cl.search.factories import ( DocketEntryWithParentsFactory, DocketFactory, @@ -74,8 +75,8 @@ def setUpTestData(cls): ) def setUp(self): - DocketSweepDocument._index.delete(ignore=404) - DocketSweepDocument.init() + RECAPSweepDocument._index.delete(ignore=404) + RECAPSweepDocument.init() @staticmethod def get_html_content_from_email(email_content): @@ -239,12 +240,23 @@ def _count_webhook_hits_and_child_hits( async def test_recap_document_hl_matched(self) -> None: """Test recap_document_hl_matched method that determines weather a hit contains RECAPDocument HL fields.""" + + # Index base document factories. + r = get_redis_interface("CACHE") + with time_machine.travel(self.mock_date, tick=False): + index_daily_recap_documents( + r, + DocketDocument._index._name, + RECAPSweepDocument._index._name, + testing=True, + ) + # Docket-only query search_params = { "type": SEARCH_TYPES.RECAP, "q": '"401 Civil"', } - search_query = DocketSweepDocument.search() + search_query = RECAPSweepDocument.search() results, total_hits = await sync_to_async(do_es_sweep_alert_query)( search_query, search_params, @@ -259,7 +271,7 @@ async def test_recap_document_hl_matched(self) -> None: "type": SEARCH_TYPES.RECAP, "q": '"Mauris iaculis, leo sit amet hendrerit vehicula"', } - search_query = DocketSweepDocument.search() + search_query = RECAPSweepDocument.search() results, total_hits = await sync_to_async(do_es_sweep_alert_query)( search_query, search_params, @@ -274,7 +286,7 @@ async def test_recap_document_hl_matched(self) -> None: "type": SEARCH_TYPES.RECAP, "q": "SUBPOENAS SERVED OFF Mauris iaculis", } - search_query = DocketSweepDocument.search() + search_query = RECAPSweepDocument.search() results, total_hits = await sync_to_async(do_es_sweep_alert_query)( search_query, search_params, @@ -400,7 +412,7 @@ def test_index_daily_recap_documents(self) -> None: ) self.assertEqual(recap_documents.count(), 3) - sweep_search = DocketSweepDocument.search() + sweep_search = RECAPSweepDocument.search() self.assertEqual( sweep_search.count(), 0, @@ -413,14 +425,14 @@ def test_index_daily_recap_documents(self) -> None: documents_indexed = index_daily_recap_documents( r, DocketDocument._index._name, - DocketSweepDocument._index._name, + RECAPSweepDocument._index._name, testing=True, ) self.assertEqual( documents_indexed, 5, msg="Wrong number of documents indexed." 
) - sweep_search = DocketSweepDocument.search() + sweep_search = RECAPSweepDocument.search() dockets_sweep = sweep_search.query(Q("match", docket_child="docket")) self.assertEqual(dockets_sweep.count(), 2) @@ -475,7 +487,7 @@ def test_index_daily_recap_documents(self) -> None: documents_indexed = index_daily_recap_documents( r, DocketDocument._index._name, - DocketSweepDocument._index._name, + RECAPSweepDocument._index._name, testing=True, ) self.assertEqual( @@ -525,7 +537,7 @@ def test_index_daily_recap_documents(self) -> None: documents_indexed = index_daily_recap_documents( r, DocketDocument._index._name, - DocketSweepDocument._index._name, + RECAPSweepDocument._index._name, testing=True, ) self.assertEqual( @@ -807,6 +819,8 @@ def test_filter_out_alerts_to_send_by_query_and_hits(self) -> None: self.assertIn(cross_object_alert.name, txt_email) self.assertIn(rd_2.description, txt_email) + docket.delete() + def test_limit_alert_case_child_hits(self) -> None: """Test limit case child hits up to 5 and display the "View additional results for this Case" button. @@ -833,13 +847,13 @@ def test_limit_alert_case_child_hits(self) -> None: # included in the case. rd_descriptions.append(rd.description) - call_command( - "cl_index_parent_and_child_docs", - search_type=SEARCH_TYPES.RECAP, - queue="celery", - pk_offset=0, - testing_mode=True, - ) + call_command( + "cl_index_parent_and_child_docs", + search_type=SEARCH_TYPES.RECAP, + queue="celery", + pk_offset=0, + testing_mode=True, + ) recap_only_alert = AlertFactory( user=self.user_profile.user, rate=Alert.REAL_TIME, @@ -905,14 +919,16 @@ def test_multiple_alerts_email_hits_limit_per_alert(self) -> None: source=Docket.RECAP, cause="410 Civil", ) + dockets_created = [] for i in range(3): - DocketFactory( + docket_created = DocketFactory( court=self.court, case_name=f"SUBPOENAS SERVED CASE {i}", docket_number=f"1:21-bk-123{i}", source=Docket.RECAP, cause="410 Civil", ) + dockets_created.append(docket_created) alert_de = DocketEntryWithParentsFactory( docket=docket, @@ -1095,6 +1111,10 @@ def test_multiple_alerts_email_hits_limit_per_alert(self) -> None: msg="RECAPDocument wasn't found in the email content.", ) + docket.delete() + for d in dockets_created: + d.delete() + def test_schedule_wly_and_mly_recap_alerts(self) -> None: """Test Weekly and Monthly RECAP Search Alerts are scheduled daily before being sent later. @@ -1149,3 +1169,40 @@ def test_schedule_wly_and_mly_recap_alerts(self) -> None: self.assertEqual( len(mail.outbox), 2, msg="Outgoing emails don't match." 
) + + def test_alert_frequency_estimation(self): + """Test alert frequency ES API endpoint for RECAP Alerts.""" + + search_params = { + "type": SEARCH_TYPES.RECAP, + "q": "Frequency Test RECAP", + } + r = self.client.get( + reverse( + "alert_frequency", kwargs={"version": "4", "day_count": "100"} + ), + search_params, + ) + self.assertEqual(r.json()["count"], 0) + + mock_date = now().replace(day=1, hour=5) + with time_machine.travel( + mock_date, tick=False + ), self.captureOnCommitCallbacks(execute=True): + docket = DocketFactory( + court=self.court, + case_name="Frequency Test RECAP", + docket_number="1:21-bk-1240", + source=Docket.RECAP, + date_filed=now().date(), + ) + + r = self.client.get( + reverse( + "alert_frequency", kwargs={"version": "4", "day_count": "100"} + ), + search_params, + ) + self.assertEqual(r.json()["count"], 1) + + docket.delete() diff --git a/cl/api/urls.py b/cl/api/urls.py index 7413a287b4..ea8f0c67aa 100644 --- a/cl/api/urls.py +++ b/cl/api/urls.py @@ -319,7 +319,7 @@ name="coverage_data_opinions", ), re_path( - r"^api/rest/v(?P[123])/alert-frequency/(?P\d+)/$", + r"^api/rest/v(?P[1234])/alert-frequency/(?P\d+)/$", views.get_result_count, name="alert_frequency", ), diff --git a/cl/api/views.py b/cl/api/views.py index 86941007b0..1d95e93410 100644 --- a/cl/api/views.py +++ b/cl/api/views.py @@ -20,7 +20,11 @@ build_coverage_query, get_solr_interface, ) -from cl.search.documents import AudioDocument, OpinionClusterDocument +from cl.search.documents import ( + AudioDocument, + DocketDocument, + OpinionClusterDocument, +) from cl.search.forms import SearchForm from cl.search.models import SEARCH_TYPES, Citation, Court, OpinionCluster from cl.simple_pages.coverage_utils import build_chart_data @@ -271,7 +275,10 @@ async def get_result_count(request, version, day_count): es_flag_for_o = await sync_to_async(waffle.flag_is_active)( request, "o-es-active" ) - is_es_form = es_flag_for_oa or es_flag_for_o + es_flag_for_r = await sync_to_async(waffle.flag_is_active)( + request, "recap-alerts-active" + ) + is_es_form = es_flag_for_oa or es_flag_for_o or es_flag_for_r search_form = await sync_to_async(SearchForm)( request.GET.copy(), is_es_form=is_es_form ) @@ -296,6 +303,12 @@ async def get_result_count(request, version, day_count): total_query_results = await sync_to_async( do_es_alert_estimation_query )(search_query, cd, day_count) + case SEARCH_TYPES.RECAP if es_flag_for_r: + # Elasticsearch version for RECAP + search_query = DocketDocument.search() + total_query_results = await sync_to_async( + do_es_alert_estimation_query + )(search_query, cd, day_count) case _: @sync_to_async diff --git a/cl/custom_filters/templatetags/extras.py b/cl/custom_filters/templatetags/extras.py index 53cc1a2b71..4ce97f3e63 100644 --- a/cl/custom_filters/templatetags/extras.py +++ b/cl/custom_filters/templatetags/extras.py @@ -2,9 +2,11 @@ import re import urllib.parse +import waffle from django import template from django.core.exceptions import ValidationError from django.template import Context +from django.template.context import RequestContext from django.utils.formats import date_format from django.utils.html import format_html from django.utils.http import urlencode @@ -12,7 +14,7 @@ from elasticsearch_dsl import AttrDict, AttrList from cl.search.constants import ALERTS_HL_TAG, SEARCH_HL_TAG -from cl.search.models import Docket, DocketEntry +from cl.search.models import SEARCH_TYPES, Docket, DocketEntry register = template.Library() @@ -259,3 +261,25 @@ def extract_q_value(query: str) -> 
str:
     parsed_query = urllib.parse.parse_qs(query)
     return parsed_query.get("q", [""])[0]
+
+
+@register.simple_tag(takes_context=True)
+def alerts_supported(context: RequestContext, search_type: str) -> bool:
+    """Determine if search alerts are supported based on the search type and flag
+    status.
+
+    :param context: The template context, which includes the request, required
+    for the waffle flag.
+    :param search_type: The type of search being performed.
+    :return: True if alerts are supported, False otherwise.
+    """
+
+    request = context["request"]
+    return (
+        search_type == SEARCH_TYPES.OPINION
+        or search_type == SEARCH_TYPES.ORAL_ARGUMENT
+        or (
+            search_type == SEARCH_TYPES.RECAP
+            and waffle.flag_is_active(request, "recap-alerts-active")
+        )
+    )
diff --git a/cl/lib/elasticsearch_utils.py b/cl/lib/elasticsearch_utils.py
index b51d149ff6..5f24193886 100644
--- a/cl/lib/elasticsearch_utils.py
+++ b/cl/lib/elasticsearch_utils.py
@@ -3009,7 +3009,7 @@ def do_es_alert_estimation_query(
     """
 
     match cd["type"]:
-        case SEARCH_TYPES.OPINION:
+        case SEARCH_TYPES.OPINION | SEARCH_TYPES.RECAP:
             after_field = "filed_after"
             before_field = "filed_before"
         case SEARCH_TYPES.ORAL_ARGUMENT:
diff --git a/cl/search/documents.py b/cl/search/documents.py
index 85c082ab25..378dbb9477 100644
--- a/cl/search/documents.py
+++ b/cl/search/documents.py
@@ -1829,18 +1829,7 @@ def prepare_cluster_child(self, instance):
         return "opinion_cluster"
 
 
-class DocketSweepDocument(DocketDocument):
-    class Index:
-        name = "recap_sweep"
-        settings = {
-            "number_of_shards": settings.ELASTICSEARCH_RECAP_NUMBER_OF_SHARDS,
-            "number_of_replicas": settings.ELASTICSEARCH_RECAP_NUMBER_OF_REPLICAS,
-            "analysis": settings.ELASTICSEARCH_DSL["analysis"],
-        }
-
-
-class ESRECAPSweepDocument(ESRECAPDocument):
-
+class RECAPSweepDocument(DocketDocument, ESRECAPDocument):
     class Index:
         name = "recap_sweep"
         settings = {
diff --git a/cl/search/tasks.py b/cl/search/tasks.py
index 5039613578..df7d337f26 100644
--- a/cl/search/tasks.py
+++ b/cl/search/tasks.py
@@ -40,9 +40,7 @@
     ES_CHILD_ID,
     AudioDocument,
     DocketDocument,
-    DocketSweepDocument,
     ESRECAPDocument,
-    ESRECAPSweepDocument,
     OpinionClusterDocument,
     OpinionDocument,
     PersonDocument,
diff --git a/cl/search/templates/search.html b/cl/search/templates/search.html
index 05cc09a60c..a67dc6f271 100644
--- a/cl/search/templates/search.html
+++ b/cl/search/templates/search.html
@@ -1,6 +1,7 @@
 {% extends 'base.html' %}
 {% load humanize %}
 {% load text_filters %}
+{% load extras %}
 {% load static %}
 {% load waffle_tags %}
 
@@ -177,7 +178,8 @@ </div>
- {% if search_form.type.value == SEARCH_TYPES.OPINION or search_form.type.value == SEARCH_TYPES.ORAL_ARGUMENT %} + {% alerts_supported search_form.type.value as search_alerts_supported %} + {% if search_alerts_supported %} {% include "includes/alert_modal.html" %} {% endif %}
- {% if search_form.type.value == SEARCH_TYPES.OPINION or search_form.type.value == SEARCH_TYPES.ORAL_ARGUMENT %} + {% alerts_supported search_form.type.value as search_alerts_supported %} + {% if search_alerts_supported %} {% if not error and get_string %} Date: Tue, 2 Jul 2024 19:41:51 -0600 Subject: [PATCH 12/33] fix(alerts): Tweak RECAP Alert estimation query to consider both Dockets + RD hits - Fixed RECAP MLY and WLY scheduled alerts content. --- .../commands/cl_send_scheduled_alerts.py | 4 +- cl/alerts/tasks.py | 4 +- cl/alerts/templates/alert_email_es.html | 2 +- cl/alerts/templates/alert_email_es.txt | 2 +- cl/alerts/tests/tests_recap_alerts.py | 123 ++++++++++++++---- cl/custom_filters/templatetags/extras.py | 22 +++- cl/lib/elasticsearch_utils.py | 54 +++++++- .../templates/feeds/solr_desc_template.html | 2 +- .../templates/includes/pa_search_result.html | 4 +- .../templates/includes/search_result.html | 4 +- 10 files changed, 183 insertions(+), 38 deletions(-) diff --git a/cl/alerts/management/commands/cl_send_scheduled_alerts.py b/cl/alerts/management/commands/cl_send_scheduled_alerts.py index 8fefd675f3..e0b23c8420 100644 --- a/cl/alerts/management/commands/cl_send_scheduled_alerts.py +++ b/cl/alerts/management/commands/cl_send_scheduled_alerts.py @@ -99,7 +99,9 @@ def query_and_send_alerts_by_rate(rate: str) -> None: ) ) if hits: - send_search_alert_emails.delay([(user_id, hits)]) + send_search_alert_emails.delay( + [(user_id, hits)], scheduled_alert=True + ) alerts_sent_count += 1 # Update Alert's date_last_hit in bulk. diff --git a/cl/alerts/tasks.py b/cl/alerts/tasks.py index 037fe22b4c..885ac9b413 100644 --- a/cl/alerts/tasks.py +++ b/cl/alerts/tasks.py @@ -467,7 +467,8 @@ def send_webhook_alert_hits( @app.task(ignore_result=True) def send_search_alert_emails( - email_alerts_to_send: list[tuple[int, list[SearchAlertHitType]]] + email_alerts_to_send: list[tuple[int, list[SearchAlertHitType]]], + scheduled_alert: bool = False, ) -> None: """Send search alert emails for multiple users. @@ -491,6 +492,7 @@ def send_search_alert_emails( context = { "hits": hits, "hits_limit": settings.SCHEDULED_ALERT_HITS_LIMIT, + "scheduled_alert": scheduled_alert, } headers = {} query_string = "" diff --git a/cl/alerts/templates/alert_email_es.html b/cl/alerts/templates/alert_email_es.html index 804f33a0bb..2b15d540bd 100644 --- a/cl/alerts/templates/alert_email_es.html +++ b/cl/alerts/templates/alert_email_es.html @@ -53,7 +53,7 @@

{% if doc.short_description %}{{ doc.short_description|render_string_or_list|safe }} — {% endif %}Document #{% if doc.document_number %}{{ doc.document_number }}{% endif %}{% if doc.attachment_number %}, Attachment #{{ doc.attachment_number }}{% endif %} diff --git a/cl/alerts/templates/alert_email_es.txt b/cl/alerts/templates/alert_email_es.txt index f4aa763cec..b836b10caa 100644 --- a/cl/alerts/templates/alert_email_es.txt +++ b/cl/alerts/templates/alert_email_es.txt @@ -19,7 +19,7 @@ Disable this Alert (one click): https://www.courtlistener.com{% url "disable_ale {% if type == 'oa' %}{% if result.dateArgued %}Date Argued: {{ result.dateArgued|date:"F jS, Y" }}{% else %}Date Argued: Unknown Date {% endif %}{% if result.docketNumber %} | Docket Number: {{ result.docketNumber|render_string_or_list|safe|striptags }}{% endif %} | Duration: {{ result.duration|naturalduration }}{% if result.judge %} | Judge: {{ result.judge|render_string_or_list|safe|striptags|underscore_to_space }}{% endif %}{% endif %} {% if type == 'o' or type == 'oa' %}{% if result|get_highlight:"text" %}...{{ result|get_highlight:"text"|safe|striptags|underscore_to_space|compress_whitespace }}...{% endif %}{% endif %} {% if type == 'r' %}{% if result.dateFiled %}Date Filed: {{ result.dateFiled|date:"F jS, Y" }}{% else %}Date Filed: Unknown Date {% endif %}{% if result.docketNumber %} | Docket Number: {{ result.docketNumber|render_string_or_list|safe|striptags }}{% endif %} -{% for doc in result.child_docs %}{% with doc=doc|get_attrdict:"_source" %} - {% if doc.short_description %}{{ doc.short_description|render_string_or_list|safe|striptags }} - {% endif %}Document #{% if doc.document_number %}{{ doc.document_number }}{% endif %}{% if doc.attachment_number %}, Attachment #{{ doc.attachment_number }}{% endif %} +{% for doc in result.child_docs %}{% with doc=doc|get_es_doc_content:scheduled_alert %} - {% if doc.short_description %}{{ doc.short_description|render_string_or_list|safe|striptags }} - {% endif %}Document #{% if doc.document_number %}{{ doc.document_number }}{% endif %}{% if doc.attachment_number %}, Attachment #{{ doc.attachment_number }}{% endif %} {% if doc.description %}Description: {{ doc.description|render_string_or_list|safe|striptags }}{% endif %} {% if doc.plain_text %}{% contains_highlights doc.plain_text.0 True as highlighted %}{% if highlighted %}...{% endif %}{{ doc.plain_text|render_string_or_list|safe|striptags|underscore_to_space }}...{% endif %} View this document on our site: https://www.courtlistener.com{% if doc.absolute_url %}{{ doc.absolute_url }}{% else %}{{ result.docket_absolute_url }}#minute-entry-{{ doc.docket_entry_id }}{% endif %} diff --git a/cl/alerts/tests/tests_recap_alerts.py b/cl/alerts/tests/tests_recap_alerts.py index 59b427841c..b9ea921510 100644 --- a/cl/alerts/tests/tests_recap_alerts.py +++ b/cl/alerts/tests/tests_recap_alerts.py @@ -544,6 +544,9 @@ def test_index_daily_recap_documents(self) -> None: documents_indexed, 9, msg="Wrong number of documents indexed." ) + docket.delete() + docket_2.delete() + def test_filter_out_alerts_to_send_by_query_and_hits(self) -> None: """Test RECAP alerts can be properly filtered out according to their query and hits matched conditions. @@ -712,24 +715,25 @@ def test_filter_out_alerts_to_send_by_query_and_hits(self) -> None: len(mail.outbox), 2, msg="Outgoing emails don't match." ) - # Create a new RD for the same DocketEntry to confirm this new RD is - # properly included in the alert email. 
-        rd_2 = RECAPDocumentFactory(
-            docket_entry=alert_de,
-            description="Motion to File 2",
-            document_number="2",
-            is_available=True,
-            page_count=3,
-            pacer_doc_id="018036652436",
-            plain_text="plain text for 018036652436",
-        )
-        call_command(
-            "cl_index_parent_and_child_docs",
-            search_type=SEARCH_TYPES.RECAP,
-            queue="celery",
-            pk_offset=0,
-            testing_mode=True,
-        )
+        with time_machine.travel(mock_date, tick=False):
+            # Create a new RD for the same DocketEntry to confirm this new RD is
+            # properly included in the alert email.
+            rd_2 = RECAPDocumentFactory(
+                docket_entry=alert_de,
+                description="Motion to File 2",
+                document_number="2",
+                is_available=True,
+                page_count=3,
+                pacer_doc_id="018036652436",
+                plain_text="plain text for 018036652436",
+            )
+            call_command(
+                "cl_index_parent_and_child_docs",
+                search_type=SEARCH_TYPES.RECAP,
+                queue="celery",
+                pk_offset=0,
+                testing_mode=True,
+            )
 
         with mock.patch(
             "cl.api.webhooks.requests.post",
@@ -906,6 +910,8 @@ def test_limit_alert_case_child_hits(self) -> None:
 
         self.assertIn("View Additional Results for this Case", txt_email)
 
+        alert_de.delete()
+
     @override_settings(SCHEDULED_ALERT_HITS_LIMIT=3)
     def test_multiple_alerts_email_hits_limit_per_alert(self) -> None:
         """Test multiple alerts can be grouped in an email and hits within an
@@ -1146,29 +1152,77 @@ def test_schedule_wly_and_mly_recap_alerts(self) -> None:
         ), time_machine.travel(self.mock_date, tick=False):
             call_command("cl_send_recap_alerts", testing_mode=True)
 
+        # Weekly and monthly alerts are not sent right away but are scheduled as
+        # ScheduledAlertHit to be sent by the cl_send_scheduled_alerts command.
         self.assertEqual(
             len(mail.outbox), 0, msg="Outgoing emails don't match."
         )
         schedule_alerts = ScheduledAlertHit.objects.all()
         self.assertEqual(schedule_alerts.count(), 3)
 
-        # Assert webhooks.
+        # Webhooks are sent immediately as hits are matched.
         webhook_events = WebhookEvent.objects.all().values_list(
             "content", flat=True
         )
         self.assertEqual(len(webhook_events), 3)
 
-        # Send Weekly alerts and check assertions.
+        # Send scheduled Weekly alerts and check assertions.
         call_command("cl_send_scheduled_alerts", rate=Alert.WEEKLY)
         self.assertEqual(
             len(mail.outbox), 1, msg="Outgoing emails don't match."
         )
+        # Assert docket-only alert.
+        html_content = self.get_html_content_from_email(mail.outbox[0])
+        self._count_alert_hits_and_child_hits(
+            html_content,
+            docket_only_alert.name,
+            1,
+            self.de.docket.case_name,
+            0,
+        )
+        self._count_alert_hits_and_child_hits(
+            html_content,
+            cross_object_alert_with_hl.name,
+            1,
+            self.de.docket.case_name,
+            1,
+        )
+        self._assert_child_hits_content(
+            html_content,
+            cross_object_alert_with_hl.name,
+            self.de.docket.case_name,
+            [self.rd.description],
+        )
+        # Assert email text version:
+        txt_email = mail.outbox[0].body
+        self.assertIn(docket_only_alert.name, txt_email)
+        self.assertIn(cross_object_alert_with_hl.name, txt_email)
+        self.assertIn(self.rd.description, txt_email)
 
-        # Send Monthly alerts and check assertions.
+        # Send scheduled Monthly alerts and check assertions.
         call_command("cl_send_scheduled_alerts", rate=Alert.MONTHLY)
         self.assertEqual(
             len(mail.outbox), 2, msg="Outgoing emails don't match."
) + html_content = self.get_html_content_from_email(mail.outbox[1]) + self._count_alert_hits_and_child_hits( + html_content, + recap_only_alert.name, + 1, + self.de.docket.case_name, + 2, + ) + self._assert_child_hits_content( + html_content, + recap_only_alert.name, + self.de.docket.case_name, + [self.rd.description, self.rd_att.description], + ) + # Assert email text version: + txt_email = mail.outbox[1].body + self.assertIn(recap_only_alert.name, txt_email) + self.assertIn(self.rd.description, txt_email) + self.assertIn(self.rd_att.description, txt_email) def test_alert_frequency_estimation(self): """Test alert frequency ES API endpoint for RECAP Alerts.""" @@ -1189,6 +1243,7 @@ def test_alert_frequency_estimation(self): with time_machine.travel( mock_date, tick=False ), self.captureOnCommitCallbacks(execute=True): + # Docket filed today. docket = DocketFactory( court=self.court, case_name="Frequency Test RECAP", @@ -1197,12 +1252,36 @@ def test_alert_frequency_estimation(self): date_filed=now().date(), ) + # RECAPDocument filed today that belongs to a docket filed outside + # the estimation range. + date_outside_range = now() - datetime.timedelta(days=101) + alert_de = DocketEntryWithParentsFactory( + docket=DocketFactory( + court=self.court, + case_name="Frequency Test RECAP", + docket_number="1:21-bk-1245", + source=Docket.RECAP, + date_filed=date_outside_range.date(), + ), + entry_number=1, + date_filed=now().date(), + ) + RECAPDocumentFactory( + docket_entry=alert_de, + description="Frequency Test RECAP", + document_number="1", + pacer_doc_id="018036652450", + ) + r = self.client.get( reverse( "alert_frequency", kwargs={"version": "4", "day_count": "100"} ), search_params, ) - self.assertEqual(r.json()["count"], 1) + # 2 expected hits in the last 100 days. One docket filed today + one + # RECAPDocument filed today. + self.assertEqual(r.json()["count"], 2) docket.delete() + alert_de.docket.delete() diff --git a/cl/custom_filters/templatetags/extras.py b/cl/custom_filters/templatetags/extras.py index 4ce97f3e63..b67396e296 100644 --- a/cl/custom_filters/templatetags/extras.py +++ b/cl/custom_filters/templatetags/extras.py @@ -132,11 +132,25 @@ def random_int(a: int, b: int) -> int: @register.filter -def get_attrdict(mapping, key): - """Emulates the dictionary get for AttrDict objects. Useful when keys - have spaces or other punctuation.""" +def get_es_doc_content( + mapping: AttrDict | dict, scheduled_alert: bool = False +) -> AttrDict | dict | str: + """ + Returns the ES document content placed in the "_source" field if the + document is an AttrDict, or just returns the content if it's not necessary + to extract from "_source" such as in scheduled alerts where the content is + a dict. + + :param mapping: The AttrDict or dict instance to extract the content from. + :param scheduled_alert: A boolean indicating if the content belongs to a + scheduled alert where the content is already in place. + :return: The ES document content. 
+ """ + + if scheduled_alert: + return mapping try: - return mapping[key] + return mapping["_source"] except KeyError: return "" diff --git a/cl/lib/elasticsearch_utils.py b/cl/lib/elasticsearch_utils.py index 5f24193886..af7118fd04 100644 --- a/cl/lib/elasticsearch_utils.py +++ b/cl/lib/elasticsearch_utils.py @@ -135,8 +135,8 @@ def build_numeric_range_query( def build_daterange_query( field: str, - before: datetime.date, - after: datetime.date, + before: datetime.date | str, + after: datetime.date | str, relation: Literal["INTERSECTS", "CONTAINS", "WITHIN", None] = None, ) -> list[Range]: """Given field name and date range limits returns ElasticSearch range query or None @@ -1991,7 +1991,7 @@ def fetch_es_results( return [], 0, error, None, None -def build_has_child_filters(cd: CleanData) -> list[QueryString]: +def build_has_child_filters(cd: CleanData) -> list[QueryString | Range]: """Builds Elasticsearch 'has_child' filters based on the given child type and CleanData. @@ -2027,6 +2027,8 @@ def build_has_child_filters(cd: CleanData) -> list[QueryString]: description = cd.get("description", "") document_number = cd.get("document_number", "") attachment_number = cd.get("attachment_number", "") + entry_date_filed_after = cd.get("entry_date_filed_after", "") + entry_date_filed_before = cd.get("entry_date_filed_before", "") if available_only: queries_list.extend( @@ -2045,6 +2047,14 @@ def build_has_child_filters(cd: CleanData) -> list[QueryString]: queries_list.extend( build_term_query("attachment_number", attachment_number) ) + if entry_date_filed_after or entry_date_filed_before: + queries_list.extend( + build_daterange_query( + "entry_date_filed", + entry_date_filed_before, + entry_date_filed_after, + ) + ) return queries_list @@ -3024,6 +3034,44 @@ def do_es_alert_estimation_query( cd[before_field] = None estimation_query, _ = build_es_base_query(search_query, cd) + if cd["type"] == SEARCH_TYPES.RECAP: + # The RECAP estimation query consists of two requests: one to estimate + # Docket hits and one to estimate RECAPDocument hits. + del cd[after_field] + del cd[before_field] + cd["entry_date_filed_after"] = ( + datetime.date.today() - datetime.timedelta(days=int(day_count)) + ) + cd["entry_date_filed_before"] = None + + main_doc_count_query = clean_count_query(estimation_query) + main_doc_count_query = main_doc_count_query.extra( + size=0, track_total_hits=True + ) + + # Perform the two queries in a single request. + multi_search = MultiSearch() + multi_search = multi_search.add(main_doc_count_query) + + # Build RECAPDocuments count query. 
+ _, join_query = build_es_base_query(search_query, cd) + child_docs_count_query = build_child_docs_query(join_query, cd) + child_total = 0 + if child_docs_count_query: + child_docs_count_query = search_query.query(child_docs_count_query) + child_total_query = child_docs_count_query.extra( + size=0, track_total_hits=True + ) + multi_search = multi_search.add(child_total_query) + + responses = multi_search.execute() + parent_total = responses[0].hits.total.value + if child_docs_count_query: + child_doc_count_response = responses[1] + child_total = child_doc_count_response.hits.total.value + total_recap_estimation = parent_total + child_total + return total_recap_estimation + return estimation_query.count() diff --git a/cl/search/templates/feeds/solr_desc_template.html b/cl/search/templates/feeds/solr_desc_template.html index ce02928003..479d9fe4f5 100644 --- a/cl/search/templates/feeds/solr_desc_template.html +++ b/cl/search/templates/feeds/solr_desc_template.html @@ -7,7 +7,7 @@ {% else %} {% flag "o-es-active" %} {% if doc0.child_docs %} - {% with doc=doc0.child_docs.0|get_attrdict:"_source" %} + {% with doc=doc0.child_docs.0|get_es_doc_content %}

{{ doc.text|render_string_or_list|safe|truncatewords:"500" }}


{% endwith %} {% else %} diff --git a/cl/search/templates/includes/pa_search_result.html b/cl/search/templates/includes/pa_search_result.html index 822c359307..9c74505e3b 100644 --- a/cl/search/templates/includes/pa_search_result.html +++ b/cl/search/templates/includes/pa_search_result.html @@ -4,7 +4,7 @@ {% load humanize %} {% for result in results.object_list %} - {% with opinion=result.grouped_by_opinion_cluster_id.hits.hits.0|get_attrdict:"_source" %} + {% with opinion=result.grouped_by_opinion_cluster_id.hits.hits.0|get_es_doc_content %}

@@ -30,7 +30,7 @@

{% for parenthetical_group in result.grouped_by_opinion_cluster_id.hits.hits %}
- {% with pa_group=parenthetical_group|get_attrdict:"_source" %} + {% with pa_group=parenthetical_group|get_es_doc_content %}

{{ pa_group.representative_text|safe }} diff --git a/cl/search/templates/includes/search_result.html b/cl/search/templates/includes/search_result.html index 6776c98ad8..b099b772ae 100644 --- a/cl/search/templates/includes/search_result.html +++ b/cl/search/templates/includes/search_result.html @@ -163,7 +163,7 @@

{% endif %} {% for doc in result.child_docs %} - {% with doc=doc|get_attrdict:"_source" %} + {% with doc=doc|get_es_doc_content %}

{% if doc.short_description %}{{ doc.short_description|render_string_or_list|safe }} — {% endif %}Document #{% if doc.document_number %}{{ doc.document_number }}{% endif %}{% if doc.attachment_number %}, Attachment #{{ doc.attachment_number }}{% endif %} @@ -395,7 +395,7 @@

{% if type == SEARCH_TYPES.OPINION or type_override == SEARCH_TYPES.OPINION and simple == False %} {% for doc in result.child_docs %} - {% with doc=doc|get_attrdict:"_source" %} + {% with doc=doc|get_es_doc_content %}
{% if result.child_docs|length > 1 or doc.type != 'combined-opinion' %}

From ebf269d15efde0c94f562aaa059a355a8423e32b Mon Sep 17 00:00:00 2001 From: Alberto Islas Date: Tue, 2 Jul 2024 19:55:48 -0600 Subject: [PATCH 13/33] fix(elasticsearch): Fixed build_daterange_query type hint --- cl/lib/elasticsearch_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cl/lib/elasticsearch_utils.py b/cl/lib/elasticsearch_utils.py index af7118fd04..72c982deca 100644 --- a/cl/lib/elasticsearch_utils.py +++ b/cl/lib/elasticsearch_utils.py @@ -151,9 +151,9 @@ def build_daterange_query( params = {} if any([before, after]): - if hasattr(after, "strftime"): + if isinstance(after, datetime.date): params["gte"] = f"{after.isoformat()}T00:00:00Z" - if hasattr(before, "strftime"): + if isinstance(before, datetime.date): params["lte"] = f"{before.isoformat()}T23:59:59Z" if relation is not None: allowed_relations = ["INTERSECTS", "CONTAINS", "WITHIN"] From bffee6d2ad1a724a26b4d78b7c7552aa9df5854c Mon Sep 17 00:00:00 2001 From: Alberto Islas Date: Tue, 2 Jul 2024 21:14:53 -0600 Subject: [PATCH 14/33] fix(alerts): Fixed re_index task estimated remaining time compute --- .../commands/cl_send_recap_alerts.py | 53 ++++++++++++++----- cl/alerts/tests/tests_recap_alerts.py | 35 ++++++------ 2 files changed, 59 insertions(+), 29 deletions(-) diff --git a/cl/alerts/management/commands/cl_send_recap_alerts.py b/cl/alerts/management/commands/cl_send_recap_alerts.py index 864dddbf51..d11d6dbce1 100644 --- a/cl/alerts/management/commands/cl_send_recap_alerts.py +++ b/cl/alerts/management/commands/cl_send_recap_alerts.py @@ -59,6 +59,37 @@ def get_task_status(task_id: str, es: Elasticsearch) -> dict[str, Any]: return {} +def compute_estimated_remaining_time( + initial_wait: float, start_time_millis: int, created: int, total: int +) -> float: + """Compute the estimated remaining time for the re_index task to complete. + + :param initial_wait: The default wait time in seconds. + :param start_time_millis: The start time in milliseconds epoch. + :param created: The number of items created so far. + :param total: The total number of items to be created. + :return: The estimated remaining time in seconds. If the start time, + created, or total are invalid, the initial default time is returned. + """ + + if start_time_millis is None or not created or not total: + return initial_wait + + start_time = datetime.datetime.fromtimestamp(start_time_millis / 1000.0) + estimated_time_remaining = max( + datetime.timedelta( + seconds=( + (datetime.datetime.now() - start_time).total_seconds() + / created + ) + * (total - created) + ).total_seconds(), + initial_wait, + ) + + return estimated_time_remaining + + def index_daily_recap_documents( r: Redis, source_index: str, target_index: str, testing: bool = False ) -> int: @@ -182,19 +213,24 @@ def index_daily_recap_documents( else: task_id = r.get("alert_sweep:task_id") - estimated_time_remaining = 0.1 if testing else 60 - time.sleep(estimated_time_remaining) + initial_wait = 0.01 if testing else 60.0 + time.sleep(initial_wait) task_info = get_task_status(task_id, es) if task_info: status = task_info["task"]["status"] created = status["created"] total = status["total"] + start_time_millis = task_info["task"]["start_time_in_millis"] else: task_info["completed"] = False created = 0 total = 0 + start_time_millis = None iterations_count = 0 + estimated_time_remaining = compute_estimated_remaining_time( + initial_wait, start_time_millis, created, total + ) while not task_info["completed"]: logger.info( f"Task progress: {created}/{total} documents. 
Estimated time to" @@ -205,19 +241,12 @@ def index_daily_recap_documents( if task_info and not task_info["completed"]: status = task_info["task"]["status"] start_time_millis = task_info["task"]["start_time_in_millis"] - start_time = datetime.datetime.fromtimestamp( - start_time_millis / 1000.0 - ) created = status["created"] total = status["total"] if total and created: - estimated_time_remaining = datetime.timedelta( - seconds=( - (datetime.datetime.now() - start_time).total_seconds() - / created - ) - * (total - created) - ).total_seconds() + estimated_time_remaining = compute_estimated_remaining_time( + initial_wait, start_time_millis, created, total + ) if not task_info: iterations_count += 1 if iterations_count > 10: diff --git a/cl/alerts/tests/tests_recap_alerts.py b/cl/alerts/tests/tests_recap_alerts.py index b9ea921510..78cd52fe8b 100644 --- a/cl/alerts/tests/tests_recap_alerts.py +++ b/cl/alerts/tests/tests_recap_alerts.py @@ -75,6 +75,9 @@ def setUpTestData(cls): ) def setUp(self): + self.r = get_redis_interface("CACHE") + self.r.delete("alert_sweep:query_date") + self.r.delete("alert_sweep:task_id") RECAPSweepDocument._index.delete(ignore=404) RECAPSweepDocument.init() @@ -242,10 +245,9 @@ async def test_recap_document_hl_matched(self) -> None: contains RECAPDocument HL fields.""" # Index base document factories. - r = get_redis_interface("CACHE") with time_machine.travel(self.mock_date, tick=False): index_daily_recap_documents( - r, + self.r, DocketDocument._index._name, RECAPSweepDocument._index._name, testing=True, @@ -385,7 +387,7 @@ def test_filter_recap_alerts_to_send(self) -> None: side_effect=lambda *args, **kwargs: MockResponse( 200, mock_raw=True ), - ), time_machine.travel(self.mock_date, tick=False): + ): call_command("cl_send_recap_alerts", testing_mode=True) # Only the RECAP RT alert for a member and the RECAP DLY alert are sent. @@ -402,7 +404,6 @@ def test_index_daily_recap_documents(self) -> None: """Test index_daily_recap_documents method over different documents conditions. """ - r = get_redis_interface("CACHE") recap_search = DocketDocument.search() recap_dockets = recap_search.query(Q("match", docket_child="docket")) self.assertEqual(recap_dockets.count(), 2) @@ -423,7 +424,7 @@ def test_index_daily_recap_documents(self) -> None: # RECAPDocuments indexed the same day. with time_machine.travel(self.mock_date, tick=False): documents_indexed = index_daily_recap_documents( - r, + self.r, DocketDocument._index._name, RECAPSweepDocument._index._name, testing=True, @@ -485,7 +486,7 @@ def test_index_daily_recap_documents(self) -> None: # Run the indexer. with time_machine.travel(self.mock_date, tick=False): documents_indexed = index_daily_recap_documents( - r, + self.r, DocketDocument._index._name, RECAPSweepDocument._index._name, testing=True, @@ -535,7 +536,7 @@ def test_index_daily_recap_documents(self) -> None: # Run the indexer. 
with time_machine.travel(self.mock_date, tick=False): documents_indexed = index_daily_recap_documents( - r, + self.r, DocketDocument._index._name, RECAPSweepDocument._index._name, testing=True, @@ -578,7 +579,7 @@ def test_filter_out_alerts_to_send_by_query_and_hits(self) -> None: side_effect=lambda *args, **kwargs: MockResponse( 200, mock_raw=True ), - ), time_machine.travel(self.mock_date, tick=False): + ): call_command("cl_send_recap_alerts", testing_mode=True) self.assertEqual( @@ -655,7 +656,7 @@ def test_filter_out_alerts_to_send_by_query_and_hits(self) -> None: side_effect=lambda *args, **kwargs: MockResponse( 200, mock_raw=True ), - ), time_machine.travel(self.mock_date, tick=False): + ): call_command("cl_send_recap_alerts", testing_mode=True) # The RD ingestion's shouldn't match the docket-only alert. self.assertEqual( @@ -674,7 +675,7 @@ def test_filter_out_alerts_to_send_by_query_and_hits(self) -> None: side_effect=lambda *args, **kwargs: MockResponse( 200, mock_raw=True ), - ), time_machine.travel(self.mock_date, tick=False): + ): call_command("cl_send_recap_alerts", testing_mode=True) # 1 New alert should be triggered. self.assertEqual( @@ -708,7 +709,7 @@ def test_filter_out_alerts_to_send_by_query_and_hits(self) -> None: side_effect=lambda *args, **kwargs: MockResponse( 200, mock_raw=True ), - ), time_machine.travel(self.mock_date, tick=False): + ): call_command("cl_send_recap_alerts", testing_mode=True) # No new alert should be triggered. self.assertEqual( @@ -740,7 +741,7 @@ def test_filter_out_alerts_to_send_by_query_and_hits(self) -> None: side_effect=lambda *args, **kwargs: MockResponse( 200, mock_raw=True ), - ), time_machine.travel(self.mock_date, tick=False): + ): call_command("cl_send_recap_alerts", testing_mode=True) # A new alert should be triggered containing only the new RD created. 
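A note on the computation this patch introduces: the remaining time is a linear extrapolation, elapsed seconds per created document, multiplied by the documents still to be created, and floored at the initial wait. A minimal standalone sketch of the same arithmetic (stdlib only; the function name here is illustrative, not part of the patch):

import datetime


def estimate_remaining_seconds(
    initial_wait: float,
    start_time_millis: int | None,
    created: int,
    total: int,
) -> float:
    # Without a start time or any progress, fall back to the default wait.
    if start_time_millis is None or not created or not total:
        return initial_wait
    start_time = datetime.datetime.fromtimestamp(start_time_millis / 1000.0)
    elapsed = (datetime.datetime.now() - start_time).total_seconds()
    # Linear extrapolation: seconds per document so far, times documents left.
    return max(elapsed / created * (total - created), initial_wait)


# Example: 5,000 of 20,000 documents created in 120 seconds gives
# 120 / 5000 * 15000 = 360 seconds remaining.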
@@ -769,7 +770,7 @@ def test_filter_out_alerts_to_send_by_query_and_hits(self) -> None: side_effect=lambda *args, **kwargs: MockResponse( 200, mock_raw=True ), - ), time_machine.travel(self.mock_date, tick=False): + ): call_command("cl_send_recap_alerts", testing_mode=True) # A new alert should be triggered containing two RDs (rd and rd_2) @@ -803,7 +804,7 @@ def test_filter_out_alerts_to_send_by_query_and_hits(self) -> None: side_effect=lambda *args, **kwargs: MockResponse( 200, mock_raw=True ), - ), time_machine.travel(self.mock_date, tick=False): + ): call_command("cl_send_recap_alerts", testing_mode=True) # A new alert should be triggered containing one RD (rd_2) @@ -869,7 +870,7 @@ def test_limit_alert_case_child_hits(self) -> None: side_effect=lambda *args, **kwargs: MockResponse( 200, mock_raw=True ), - ), time_machine.travel(self.mock_date, tick=False): + ): call_command("cl_send_recap_alerts", testing_mode=True) self.assertEqual( @@ -999,7 +1000,7 @@ def test_multiple_alerts_email_hits_limit_per_alert(self) -> None: side_effect=lambda *args, **kwargs: MockResponse( 200, mock_raw=True ), - ), time_machine.travel(self.mock_date, tick=False): + ): call_command("cl_send_recap_alerts", testing_mode=True) self.assertEqual( @@ -1149,7 +1150,7 @@ def test_schedule_wly_and_mly_recap_alerts(self) -> None: side_effect=lambda *args, **kwargs: MockResponse( 200, mock_raw=True ), - ), time_machine.travel(self.mock_date, tick=False): + ): call_command("cl_send_recap_alerts", testing_mode=True) # Weekly and monthly alerts are not sent right away but are scheduled as From 847f0fd8e5912ab561a4eb4b74199176c52870bf Mon Sep 17 00:00:00 2001 From: Alberto Islas Date: Wed, 3 Jul 2024 11:12:10 -0600 Subject: [PATCH 15/33] fix(alerts): Handle creation and removal of the RECAP alerts sweep index. - Ensure document timestamps get updated on partial updates. --- .../commands/cl_send_recap_alerts.py | 37 +++-- cl/alerts/tests/tests_recap_alerts.py | 126 ++++++++++++++---- cl/search/tasks.py | 14 +- 3 files changed, 144 insertions(+), 33 deletions(-) diff --git a/cl/alerts/management/commands/cl_send_recap_alerts.py b/cl/alerts/management/commands/cl_send_recap_alerts.py index d11d6dbce1..ce5ebc0f1f 100644 --- a/cl/alerts/management/commands/cl_send_recap_alerts.py +++ b/cl/alerts/management/commands/cl_send_recap_alerts.py @@ -2,7 +2,7 @@ import datetime import time import traceback -from typing import Any +from typing import Any, Type import pytz from asgiref.sync import async_to_sync @@ -91,21 +91,29 @@ def compute_estimated_remaining_time( def index_daily_recap_documents( - r: Redis, source_index: str, target_index: str, testing: bool = False + r: Redis, + source_index_name: str, + target_index: Type[RECAPSweepDocument], + testing: bool = False, ) -> int: """Index Dockets added/modified during the day and all their RECAPDocuments and RECAPDocuments added/modified during the day and their parent Dockets. It uses the ES re_index API, :param r: Redis client instance. - :param source_index: The source Elasticsearch index from which documents - will be queried. + :param source_index_name: The source Elasticsearch index name from which + documents will be queried. :param target_index: The target Elasticsearch index to which documents will be re-indexed. :param testing: Boolean flag for testing mode. :return: The total number of documents re-indexed. """ + if r.exists("alert_sweep:re_index_completed"): + # The re-indexing has been completed for the day. Abort it and proceed + # with sending alerts. 
+ return 0 + if not r.exists("alert_sweep:query_date"): # In case of a failure, store the date when alerts should be queried in # Redis, so the command can be resumed. @@ -129,7 +137,6 @@ def index_daily_recap_documents( today_datetime_iso = local_midnight_utc.isoformat().replace("+00:00", "Z") next_day_utc_iso = next_day_utc.isoformat().replace("+00:00", "Z") - # Re Index API query. query = { "bool": { @@ -199,11 +206,16 @@ def index_daily_recap_documents( } if not r.exists("alert_sweep:task_id"): + # Remove the index from the previous day and create a new one. + target_index._index.delete(ignore=404) + target_index.init() + target_index_name = target_index._index._name + # In case of a failure, store the task_id in Redis so the command # can be resumed. response = es.reindex( - source={"index": source_index, "query": query}, - dest={"index": target_index}, + source={"index": source_index_name, "query": query}, + dest={"index": target_index_name}, wait_for_completion=False, refresh=True, ) @@ -259,6 +271,8 @@ def index_daily_recap_documents( r.delete("alert_sweep:query_date") r.delete("alert_sweep:task_id") + if not testing: + r.set("alert_sweep:re_index_completed", 1, ex=3600 * 12) return total @@ -497,6 +511,12 @@ def query_and_schedule_alerts(r: Redis, rate: str): class Command(VerboseCommand): + """Query and re-index (into the RECAP sweep index) all the RECAP content + that has changed during the current period, along with their related + documents. Then use the RECAP sweep index to query and send real-time and + daily RECAP alerts. Finally, schedule weekly and monthly RECAP alerts. + """ + help = "Send RECAP Search Alerts." def add_arguments(self, parser): @@ -513,10 +533,11 @@ def handle(self, *args, **options): index_daily_recap_documents( r, DocketDocument._index._name, - RECAPSweepDocument._index._name, + RECAPSweepDocument, testing=testing_mode, ) query_and_send_alerts(r, Alert.REAL_TIME) query_and_send_alerts(r, Alert.DAILY) query_and_schedule_alerts(r, Alert.WEEKLY) query_and_schedule_alerts(r, Alert.MONTHLY) + r.delete("alert_sweep:re_index_completed") diff --git a/cl/alerts/tests/tests_recap_alerts.py b/cl/alerts/tests/tests_recap_alerts.py index 78cd52fe8b..22f647b7d9 100644 --- a/cl/alerts/tests/tests_recap_alerts.py +++ b/cl/alerts/tests/tests_recap_alerts.py @@ -78,8 +78,6 @@ def setUp(self): self.r = get_redis_interface("CACHE") self.r.delete("alert_sweep:query_date") self.r.delete("alert_sweep:task_id") - RECAPSweepDocument._index.delete(ignore=404) - RECAPSweepDocument.init() @staticmethod def get_html_content_from_email(email_content): @@ -249,7 +247,7 @@ async def test_recap_document_hl_matched(self) -> None: index_daily_recap_documents( self.r, DocketDocument._index._name, - RECAPSweepDocument._index._name, + RECAPSweepDocument, testing=True, ) @@ -404,6 +402,8 @@ def test_index_daily_recap_documents(self) -> None: """Test index_daily_recap_documents method over different documents conditions. 
""" + RECAPSweepDocument._index.delete(ignore=404) + RECAPSweepDocument.init() recap_search = DocketDocument.search() recap_dockets = recap_search.query(Q("match", docket_child="docket")) self.assertEqual(recap_dockets.count(), 2) @@ -426,7 +426,7 @@ def test_index_daily_recap_documents(self) -> None: documents_indexed = index_daily_recap_documents( self.r, DocketDocument._index._name, - RECAPSweepDocument._index._name, + RECAPSweepDocument, testing=True, ) self.assertEqual( @@ -488,7 +488,7 @@ def test_index_daily_recap_documents(self) -> None: documents_indexed = index_daily_recap_documents( self.r, DocketDocument._index._name, - RECAPSweepDocument._index._name, + RECAPSweepDocument, testing=True, ) self.assertEqual( @@ -538,13 +538,96 @@ def test_index_daily_recap_documents(self) -> None: documents_indexed = index_daily_recap_documents( self.r, DocketDocument._index._name, - RECAPSweepDocument._index._name, + RECAPSweepDocument, + testing=True, + ) + self.assertEqual( + documents_indexed, 9, msg="Wrong number of documents indexed." + ) + + # Docket and RD created on previous days, will be used later to confirm + # documents got indexed into the sweep index after partial updates. + three_days_before = now() - datetime.timedelta(days=5) + mock_three_days_before = three_days_before.replace(hour=5) + with time_machine.travel( + mock_three_days_before, tick=False + ), self.captureOnCommitCallbacks(execute=True): + docket_old = DocketFactory( + court=self.court, + case_name="SUBPOENAS SERVED LOREM OFF", + docket_number="1:21-bk-1254", + source=Docket.RECAP, + ) + alert_de_old = DocketEntryWithParentsFactory( + docket=docket_old, + entry_number=1, + date_filed=datetime.date(2024, 8, 19), + description="MOTION for Leave to File Amicus Curiae Lorem Served", + ) + rd_old = RECAPDocumentFactory( + docket_entry=alert_de_old, + description="Motion to File", + document_number="1", + is_available=True, + ) + rd_old_2 = RECAPDocumentFactory( + docket_entry=alert_de_old, + description="Motion to File 2", + document_number="2", + is_available=True, + ) + + # Run the indexer. No new documents re_indexed. + with time_machine.travel(self.mock_date, tick=False): + documents_indexed = index_daily_recap_documents( + self.r, + DocketDocument._index._name, + RECAPSweepDocument, testing=True, ) self.assertEqual( documents_indexed, 9, msg="Wrong number of documents indexed." ) + # Update the documents today: + with time_machine.travel( + self.mock_date, tick=False + ), self.captureOnCommitCallbacks(execute=True): + rd_old_2.document_number = 3 + rd_old_2.save() + + # Run the indexer. No new documents re_indexed. + with time_machine.travel(self.mock_date, tick=False): + documents_indexed = index_daily_recap_documents( + self.r, + DocketDocument._index._name, + RECAPSweepDocument, + testing=True, + ) + self.assertEqual( + documents_indexed, 11, msg="Wrong number of documents indexed." + ) + + # Update the Docket today: + with time_machine.travel( + self.mock_date, tick=False + ), self.captureOnCommitCallbacks(execute=True): + docket_old.case_name = "SUBPOENAS SERVED LOREM OFF UPDATED" + docket_old.save() + + # Run the indexer. No new documents re_indexed. + with time_machine.travel(self.mock_date, tick=False): + documents_indexed = index_daily_recap_documents( + self.r, + DocketDocument._index._name, + RECAPSweepDocument, + testing=True, + ) + self.assertEqual( + documents_indexed, 12, msg="Wrong number of documents indexed." 
+ ) + + docket_old.delete() docket.delete() docket_2.delete() @@ -579,7 +662,7 @@ def test_filter_out_alerts_to_send_by_query_and_hits(self) -> None: side_effect=lambda *args, **kwargs: MockResponse( 200, mock_raw=True ), - ): + ), time_machine.travel(self.mock_date, tick=False): call_command("cl_send_recap_alerts", testing_mode=True) self.assertEqual( @@ -610,9 +693,8 @@ def test_filter_out_alerts_to_send_by_query_and_hits(self) -> None: query='q="405 Civil"&type=r', ) # Simulate docket is ingested a day before. - one_day_before = now() - datetime.timedelta(days=1) - mock_date = one_day_before.replace(hour=5) - with time_machine.travel(mock_date, tick=False): + one_day_before = self.mock_date - datetime.timedelta(days=1) + with time_machine.travel(one_day_before, tick=False): docket = DocketFactory( court=self.court, case_name="SUBPOENAS SERVED CASE", @@ -626,8 +708,7 @@ def test_filter_out_alerts_to_send_by_query_and_hits(self) -> None: ) # Its related RD is ingested today. - mock_date = now().replace(hour=5) - with time_machine.travel(mock_date, tick=False): + with time_machine.travel(self.mock_date, tick=False): alert_de = DocketEntryWithParentsFactory( docket=docket, entry_number=1, @@ -656,7 +737,7 @@ def test_filter_out_alerts_to_send_by_query_and_hits(self) -> None: side_effect=lambda *args, **kwargs: MockResponse( 200, mock_raw=True ), - ): + ), time_machine.travel(self.mock_date, tick=False): call_command("cl_send_recap_alerts", testing_mode=True) # The RD ingestion's shouldn't match the docket-only alert. self.assertEqual( @@ -675,7 +756,7 @@ def test_filter_out_alerts_to_send_by_query_and_hits(self) -> None: side_effect=lambda *args, **kwargs: MockResponse( 200, mock_raw=True ), - ): + ), time_machine.travel(self.mock_date, tick=False): call_command("cl_send_recap_alerts", testing_mode=True) # 1 New alert should be triggered. self.assertEqual( @@ -709,14 +790,14 @@ def test_filter_out_alerts_to_send_by_query_and_hits(self) -> None: side_effect=lambda *args, **kwargs: MockResponse( 200, mock_raw=True ), - ): + ), time_machine.travel(self.mock_date, tick=False): call_command("cl_send_recap_alerts", testing_mode=True) # No new alert should be triggered. self.assertEqual( len(mail.outbox), 2, msg="Outgoing emails don't match." ) - with time_machine.travel(mock_date, tick=False): + with time_machine.travel(self.mock_date, tick=False): # Create a new RD for the same DocketEntry to confirm this new RD is # properly included in the alert email. rd_2 = RECAPDocumentFactory( @@ -741,7 +822,7 @@ def test_filter_out_alerts_to_send_by_query_and_hits(self) -> None: side_effect=lambda *args, **kwargs: MockResponse( 200, mock_raw=True ), - ): + ), time_machine.travel(self.mock_date, tick=False): call_command("cl_send_recap_alerts", testing_mode=True) # A new alert should be triggered containing only the new RD created. 
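These hunks pin the command run to self.mock_date with time_machine, so the documents indexed by the test and the query window computed by the command agree on the same day. The pattern, reduced to its essentials (the frozen date shown here is illustrative):

import time_machine
from django.core.management import call_command

# Freeze "now" so timestamps written while the command runs fall on the same
# local day as the test fixtures indexed above.
with time_machine.travel("2024-07-03 05:00:00", tick=False):
    call_command("cl_send_recap_alerts", testing_mode=True)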
@@ -770,7 +851,7 @@ def test_filter_out_alerts_to_send_by_query_and_hits(self) -> None: side_effect=lambda *args, **kwargs: MockResponse( 200, mock_raw=True ), - ): + ), time_machine.travel(self.mock_date, tick=False): call_command("cl_send_recap_alerts", testing_mode=True) # A new alert should be triggered containing two RDs (rd and rd_2) @@ -804,7 +885,7 @@ def test_filter_out_alerts_to_send_by_query_and_hits(self) -> None: side_effect=lambda *args, **kwargs: MockResponse( 200, mock_raw=True ), - ): + ), time_machine.travel(self.mock_date, tick=False): call_command("cl_send_recap_alerts", testing_mode=True) # A new alert should be triggered containing one RD (rd_2) @@ -831,8 +912,7 @@ def test_limit_alert_case_child_hits(self) -> None: results for this Case" button. """ - mock_date = now().replace(hour=5) - with time_machine.travel(mock_date, tick=False): + with time_machine.travel(self.mock_date, tick=False): alert_de = DocketEntryWithParentsFactory( docket=self.de.docket, entry_number=1, @@ -1239,10 +1319,8 @@ def test_alert_frequency_estimation(self): search_params, ) self.assertEqual(r.json()["count"], 0) - - mock_date = now().replace(day=1, hour=5) with time_machine.travel( - mock_date, tick=False + self.mock_date, tick=False ), self.captureOnCommitCallbacks(execute=True): # Docket filed today. docket = DocketFactory( diff --git a/cl/search/tasks.py b/cl/search/tasks.py index df7d337f26..49115e192a 100644 --- a/cl/search/tasks.py +++ b/cl/search/tasks.py @@ -462,6 +462,13 @@ def document_fields_to_update( continue field_value = prepare_method(main_instance) fields_to_update[field] = field_value + + if fields_to_update: + # If fields to update, append the timestamp to be updated too. + prepare_timestamp = getattr(es_document(), f"prepare_timestamp", None) + if prepare_timestamp: + field_value = prepare_timestamp(main_instance) + fields_to_update["timestamp"] = field_value return fields_to_update @@ -762,9 +769,14 @@ def update_children_docs_by_query( # Build the UpdateByQuery script and execute it script_lines = [] params = {} + if fields_to_update: + # If there are fields to update include the timestamp field too. 
+ fields_to_update.append("timestamp") for field_to_update in fields_to_update: field_list = ( - fields_map[field_to_update] if fields_map else [field_to_update] + ["timestamp"] + if field_to_update == "timestamp" + else fields_map.get(field_to_update, [field_to_update]) ) for field_name in field_list: script_lines.append( From 4b324c9727f9399883dbdb214de8dd7cd3b1a3b9 Mon Sep 17 00:00:00 2001 From: Alberto Islas Date: Wed, 3 Jul 2024 12:38:28 -0600 Subject: [PATCH 16/33] fix(elasticsearch): Fixed tests related to timestamp updates --- cl/search/tasks.py | 6 +++++- cl/search/tests/tests.py | 13 ------------- cl/search/tests/tests_es_opinion.py | 10 ++++++---- 3 files changed, 11 insertions(+), 18 deletions(-) diff --git a/cl/search/tasks.py b/cl/search/tasks.py index 49115e192a..241ec87a2a 100644 --- a/cl/search/tasks.py +++ b/cl/search/tasks.py @@ -776,7 +776,11 @@ def update_children_docs_by_query( field_list = ( ["timestamp"] if field_to_update == "timestamp" - else fields_map.get(field_to_update, [field_to_update]) + else ( + fields_map[field_to_update] + if fields_map + else [field_to_update] + ) ) for field_name in field_list: script_lines.append( diff --git a/cl/search/tests/tests.py b/cl/search/tests/tests.py index 2dcd3a78ab..510bbd3510 100644 --- a/cl/search/tests/tests.py +++ b/cl/search/tests/tests.py @@ -2612,19 +2612,6 @@ def test_remove_opinions_by_timestamp(self, mock_logging_prefix): testing_mode=True, ) - with self.captureOnCommitCallbacks(execute=True): - # Trigger a change in opinion_1 to confirm the timestamp is not - # updated. - opinion_1.type = Opinion.UNANIMOUS - opinion_1.save() - - # The timestamp in opinion_1 remains the same as it was from 5 days ago - opinion_1_doc = OpinionClusterDocument.get( - ES_CHILD_ID(opinion_1.pk).OPINION - ) - self.assertEqual(opinion_1_doc.type, "unanimous-opinion") - self.assertEqual(opinion_1_doc.timestamp.date(), five_days_ago.date()) - # The timestamp in opinion_2 is updated to 2 days ago. 
opinion_2_doc = OpinionClusterDocument.get( ES_CHILD_ID(opinion_2.pk).OPINION diff --git a/cl/search/tests/tests_es_opinion.py b/cl/search/tests/tests_es_opinion.py index ac8593562d..4e8fafb005 100644 --- a/cl/search/tests/tests_es_opinion.py +++ b/cl/search/tests/tests_es_opinion.py @@ -626,8 +626,9 @@ def test_extract_snippet_from_db_highlight_disabled(self) -> None: prioritizing the different text fields available in the content when highlighting is disabled.""" - with self.captureOnCommitCallbacks(execute=True): - + with time_machine.travel( + self.mock_date, tick=False + ), self.captureOnCommitCallbacks(execute=True): c_2_opinion_1 = OpinionFactory.create( extracted_by_ocr=True, author=self.person_2, @@ -635,7 +636,6 @@ def test_extract_snippet_from_db_highlight_disabled(self) -> None: html_lawbox="html_lawbox & text from DB", cluster=self.opinion_cluster_2, ) - c_2_opinion_2 = OpinionFactory.create( extracted_by_ocr=True, author=self.person_2, @@ -710,7 +710,9 @@ def test_extract_snippet_from_db_highlight_disabled(self) -> None: ) self.assertEqual(expected_text, result_opinion["snippet"]) - with self.captureOnCommitCallbacks(execute=True): + with time_machine.travel( + self.mock_date, tick=False + ), self.captureOnCommitCallbacks(execute=True): c_2_opinion_1.delete() c_2_opinion_2.delete() c_2_opinion_3.delete() From 0d63080ec2a7d5d43cd67399286c3a7091b65d77 Mon Sep 17 00:00:00 2001 From: Alberto Islas Date: Wed, 3 Jul 2024 19:27:29 -0600 Subject: [PATCH 17/33] fix(alerts): Fix should_docket_hit_be_included date comparison - Fixed email templates - Refactored retrieve_task_info --- .../commands/cl_send_recap_alerts.py | 98 ++++++++++++------- cl/alerts/tasks.py | 2 + cl/alerts/templates/alert_email_es.html | 5 +- cl/alerts/templates/alert_email_es.txt | 4 +- cl/alerts/tests/tests_recap_alerts.py | 2 +- 5 files changed, 71 insertions(+), 40 deletions(-) diff --git a/cl/alerts/management/commands/cl_send_recap_alerts.py b/cl/alerts/management/commands/cl_send_recap_alerts.py index ce5ebc0f1f..e9ec48e2d8 100644 --- a/cl/alerts/management/commands/cl_send_recap_alerts.py +++ b/cl/alerts/management/commands/cl_send_recap_alerts.py @@ -27,6 +27,7 @@ from cl.api.models import WebhookEventType from cl.api.tasks import send_es_search_alert_webhook from cl.lib.command_utils import VerboseCommand, logger +from cl.lib.date_time import dt_as_local_date from cl.lib.elasticsearch_utils import do_es_sweep_alert_query from cl.lib.redis_utils import get_redis_interface from cl.search.documents import DocketDocument, RECAPSweepDocument @@ -76,12 +77,10 @@ def compute_estimated_remaining_time( return initial_wait start_time = datetime.datetime.fromtimestamp(start_time_millis / 1000.0) + time_now = datetime.datetime.now() estimated_time_remaining = max( datetime.timedelta( - seconds=( - (datetime.datetime.now() - start_time).total_seconds() - / created - ) + seconds=((time_now - start_time).total_seconds() / created) * (total - created) ).total_seconds(), initial_wait, @@ -90,6 +89,31 @@ def compute_estimated_remaining_time( return estimated_time_remaining +def retrieve_task_info(task_info: dict[str, Any]) -> dict[str, Any]: + """Retrieve task information from the given task dict. + + :param task_info: A dictionary containing the task status information. + :return: A dictionary with the task completion status, created documents + count, total documents count, and the task start time in milliseconds. + Retrieve default values in case task_info is not valid. 
+ """ + + if task_info: + status = task_info["task"]["status"] + return { + "completed": task_info["completed"], + "created": status["created"], + "total": status["total"], + "start_time_millis": task_info["task"]["start_time_in_millis"], + } + return { + "completed": False, + "created": 0, + "total": 0, + "start_time_millis": None, + } + + def index_daily_recap_documents( r: Redis, source_index_name: str, @@ -110,6 +134,9 @@ def index_daily_recap_documents( """ if r.exists("alert_sweep:re_index_completed"): + logger.info( + "The re-index task has been completed and will be omitted." + ) # The re-indexing has been completed for the day. Abort it and proceed # with sending alerts. return 0 @@ -127,6 +154,7 @@ def index_daily_recap_documents( # If "alert_sweep:query_date" already exists get it from Redis. local_midnight_str: str = str(r.get("alert_sweep:query_date")) local_midnight = datetime.datetime.fromisoformat(local_midnight_str) + logger.info(f"Resuming re-indexing process for date: {local_midnight}") es = connections.get_connection() # Convert the local (PDT) midnight time to UTC @@ -222,58 +250,50 @@ def index_daily_recap_documents( # Store the task ID in Redis task_id = response["task"] r.set("alert_sweep:task_id", task_id) + logger.info(f"Re-indexing task scheduled ID: {task_id}") else: task_id = r.get("alert_sweep:task_id") + logger.info(f"Resuming re-index task ID: {task_id}") initial_wait = 0.01 if testing else 60.0 time.sleep(initial_wait) - task_info = get_task_status(task_id, es) - if task_info: - status = task_info["task"]["status"] - created = status["created"] - total = status["total"] - start_time_millis = task_info["task"]["start_time_in_millis"] - else: - task_info["completed"] = False - created = 0 - total = 0 - start_time_millis = None - + get_task_info = retrieve_task_info(get_task_status(task_id, es)) iterations_count = 0 estimated_time_remaining = compute_estimated_remaining_time( - initial_wait, start_time_millis, created, total + initial_wait, + get_task_info["start_time_millis"], + get_task_info["created"], + get_task_info["total"], ) - while not task_info["completed"]: + while not get_task_info["completed"]: logger.info( - f"Task progress: {created}/{total} documents. Estimated time to" - f" finish: {estimated_time_remaining}." + f"Task progress: {get_task_info['created']}/{get_task_info['total']} documents. " + f"Estimated time to finish: {estimated_time_remaining} seconds." 
         )
         task_info = get_task_status(task_id, es)
+        get_task_info = retrieve_task_info(task_info)
         time.sleep(estimated_time_remaining)
-        if task_info and not task_info["completed"]:
-            status = task_info["task"]["status"]
-            start_time_millis = task_info["task"]["start_time_in_millis"]
-            created = status["created"]
-            total = status["total"]
-            if total and created:
-                estimated_time_remaining = compute_estimated_remaining_time(
-                    initial_wait, start_time_millis, created, total
-                )
+        if task_info and not get_task_info["completed"]:
+            estimated_time_remaining = compute_estimated_remaining_time(
+                initial_wait,
+                get_task_info["start_time_millis"],
+                get_task_info["created"],
+                get_task_info["total"],
+            )
         if not task_info:
             iterations_count += 1
             if iterations_count > 10:
                 logger.error(
                     "Re_index alert sweep index task has failed: %s/%s",
-                    created,
-                    total,
+                    get_task_info["created"],
+                    get_task_info["total"],
                 )
                 break
-    r.delete("alert_sweep:query_date")
     r.delete("alert_sweep:task_id")
     if not testing:
         r.set("alert_sweep:re_index_completed", 1, ex=3600 * 12)
-    return total
+    return get_task_info["total"]
 
 
 def should_docket_hit_be_included(
@@ -290,9 +310,16 @@ def should_docket_hit_be_included(
     docket = Docket.objects.filter(id=docket_id).only("date_modified").first()
     if not docket:
         return False
-    date_modified = docket.date_modified.date()
     if not has_document_alert_hit_been_triggered(r, alert_id, "d", docket_id):
-        if date_modified == timezone.now().date():
+        local_midnight_localized = timezone.localtime(
+            timezone.make_aware(
+                datetime.datetime.fromisoformat(
+                    str(r.get("alert_sweep:query_date"))
+                )
+            )
+        )
+        date_modified_localized = dt_as_local_date(docket.date_modified)
+        if date_modified_localized == local_midnight_localized.date():
             return True
     return False
 
@@ -541,3 +568,4 @@ def handle(self, *args, **options):
         query_and_schedule_alerts(r, Alert.WEEKLY)
         query_and_schedule_alerts(r, Alert.MONTHLY)
         r.delete("alert_sweep:re_index_completed")
+        r.delete("alert_sweep:query_date")
diff --git a/cl/alerts/tasks.py b/cl/alerts/tasks.py
index 885ac9b413..f7b004bc54 100644
--- a/cl/alerts/tasks.py
+++ b/cl/alerts/tasks.py
@@ -475,6 +475,8 @@ def send_search_alert_emails(
     :param email_alerts_to_send: A list of two tuples containing the user to
     whom the alerts should be sent. A list of tuples containing the Search
     Alert, (Alert, search type, documents, and number of documents)
+    :param scheduled_alert: A boolean indicating whether this alert has been
+    scheduled.
     :return: None
     """
diff --git a/cl/alerts/templates/alert_email_es.html b/cl/alerts/templates/alert_email_es.html
index 2b15d540bd..dc2f797268 100644
--- a/cl/alerts/templates/alert_email_es.html
+++ b/cl/alerts/templates/alert_email_es.html
@@ -36,7 +36,7 @@

{{ forloop.counter }}. {{ result|get_highlight:"caseName"|safe }} - ({% if result.court_id != 'scotus' %}{{ result|get_highlight:"court_citation_string"|nbsp|safe }} {% endif %}{% if type == 'o' %}{{ result.dateFiled|date:"Y" }}{% elif type == 'oa' %}{{ result.dateArgued|date:"Y" }}{% elif type == 'r' %}{{ result.dateFiled|date:"Y" }}{% endif %}) + ({% if result.court_id != 'scotus' %}{{ result|get_highlight:"court_citation_string"|nbsp|safe }} {% endif %}{% if type == 'o' or type == 'r' %}{{ result.dateFiled|date:"Y" }}{% elif type == 'oa' %}{{ result.dateArgued|date:"Y" }}{% endif %})

{% if type == 'r' %} @@ -68,11 +68,12 @@

View Additional Results for this Case +
{% endif %} {% else %}

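Earlier in this patch, should_docket_hit_be_included switched from comparing against timezone.now().date() to comparing local dates derived from the sweep query date stored in Redis. A rough sketch of that comparison inside a configured Django project; timezone.localdate stands in here for the dt_as_local_date helper the patch imports:

import datetime

from django.utils import timezone


def modified_on_query_date(
    date_modified: datetime.datetime, query_date_iso: str
) -> bool:
    # The sweep query date is stored in Redis as a naive ISO string; make it
    # aware in the default timezone before reducing it to a local date.
    local_midnight = timezone.localtime(
        timezone.make_aware(datetime.datetime.fromisoformat(query_date_iso))
    )
    # Compare local dates on both sides so dockets modified late in the local
    # day do not slip across the UTC date boundary.
    return timezone.localdate(date_modified) == local_midnight.date()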
diff --git a/cl/alerts/templates/alert_email_es.txt b/cl/alerts/templates/alert_email_es.txt index b836b10caa..2b7ec3b569 100644 --- a/cl/alerts/templates/alert_email_es.txt +++ b/cl/alerts/templates/alert_email_es.txt @@ -15,7 +15,7 @@ We have news regarding your alerts at CourtListener.com View Full Results / Edit this Alert: https://www.courtlistener.com/?{{ alert.query_run|safe }}&edit_alert={{ alert.pk }} Disable this Alert (one click): https://www.courtlistener.com{% url "disable_alert" alert.secret_key %}{% endif %} -{{forloop.counter}}. {{ result.caseName|render_string_or_list|safe|striptags }} ({% if result.court_id != 'scotus' %}{{ result.court_citation_string|render_string_or_list|striptags }} {% endif %}{% if type == 'o' %}{{ result.dateFiled|date:"Y" }}{% elif type == 'oa' %}{{ result.dateArgued|date:"Y" }}{% endif %}) +{{forloop.counter}}. {{ result.caseName|render_string_or_list|safe|striptags }} ({% if result.court_id != 'scotus' %}{{ result.court_citation_string|render_string_or_list|striptags }} {% endif %}{% if type == 'o' or type == 'r' %}{{ result.dateFiled|date:"Y" }}{% elif type == 'oa' %}{{ result.dateArgued|date:"Y" }}{% endif %}) {% if type == 'oa' %}{% if result.dateArgued %}Date Argued: {{ result.dateArgued|date:"F jS, Y" }}{% else %}Date Argued: Unknown Date {% endif %}{% if result.docketNumber %} | Docket Number: {{ result.docketNumber|render_string_or_list|safe|striptags }}{% endif %} | Duration: {{ result.duration|naturalduration }}{% if result.judge %} | Judge: {{ result.judge|render_string_or_list|safe|striptags|underscore_to_space }}{% endif %}{% endif %} {% if type == 'o' or type == 'oa' %}{% if result|get_highlight:"text" %}...{{ result|get_highlight:"text"|safe|striptags|underscore_to_space|compress_whitespace }}...{% endif %}{% endif %} {% if type == 'r' %}{% if result.dateFiled %}Date Filed: {{ result.dateFiled|date:"F jS, Y" }}{% else %}Date Filed: Unknown Date {% endif %}{% if result.docketNumber %} | Docket Number: {{ result.docketNumber|render_string_or_list|safe|striptags }}{% endif %} @@ -24,7 +24,7 @@ Disable this Alert (one click): https://www.courtlistener.com{% url "disable_ale {% if doc.plain_text %}{% contains_highlights doc.plain_text.0 True as highlighted %}{% if highlighted %}...{% endif %}{{ doc.plain_text|render_string_or_list|safe|striptags|underscore_to_space }}...{% endif %} View this document on our site: https://www.courtlistener.com{% if doc.absolute_url %}{{ doc.absolute_url }}{% else %}{{ result.docket_absolute_url }}#minute-entry-{{ doc.docket_entry_id }}{% endif %} {% endwith %}{% endfor %} -{% if result.child_remaining %}{% extract_q_value alert.query_run as q_value %}View Additional Results for this Case: https://www.courtlistener.com/?type={{ type|urlencode }}&q={% if q_value %}({{ q_value|urlencode }})%20AND%20{% endif %}docket_id%3A{{ result.docket_id|urlencode }}{% endif %} +{% if result.child_docs and result.child_remaining %}{% extract_q_value alert.query_run as q_value %}View Additional Results for this Case: https://www.courtlistener.com/?type={{ type|urlencode }}&q={% if q_value %}({{ q_value|urlencode }})%20AND%20{% endif %}docket_id%3A{{ result.docket_id|urlencode }}{% endif %} {% endif %}~~~~~ - View this item on our site: https://www.courtlistener.com{% if type == 'r' %}{{result.docket_absolute_url}}{% else %}{{result.absolute_url}}{% endif %} {% if result.download_url %} - Download original from the court: {{result.download_url}} diff --git a/cl/alerts/tests/tests_recap_alerts.py 
b/cl/alerts/tests/tests_recap_alerts.py index 22f647b7d9..cf9ea6b145 100644 --- a/cl/alerts/tests/tests_recap_alerts.py +++ b/cl/alerts/tests/tests_recap_alerts.py @@ -1333,7 +1333,7 @@ def test_alert_frequency_estimation(self): # RECAPDocument filed today that belongs to a docket filed outside # the estimation range. - date_outside_range = now() - datetime.timedelta(days=101) + date_outside_range = now() - datetime.timedelta(days=102) alert_de = DocketEntryWithParentsFactory( docket=DocketFactory( court=self.court, From 5077e01c81318ecf6bb726fc256161f54a636614 Mon Sep 17 00:00:00 2001 From: Alberto Islas Date: Tue, 9 Jul 2024 20:02:46 -0600 Subject: [PATCH 18/33] fix(alerts): Changed approach to filter out cross-object hits by using extra Docket-only and RD-only queries. --- .../commands/cl_send_recap_alerts.py | 279 +++++++++++++----- .../commands/clean_up_search_alerts.py | 2 +- cl/alerts/tests/tests_recap_alerts.py | 274 +++++++++++++---- cl/alerts/utils.py | 21 -- cl/lib/elasticsearch_utils.py | 176 ++++++----- cl/search/documents.py | 21 +- cl/search/tests/tests_es_oral_arguments.py | 2 +- cl/search/tests/tests_es_person.py | 4 +- 8 files changed, 547 insertions(+), 232 deletions(-) diff --git a/cl/alerts/management/commands/cl_send_recap_alerts.py b/cl/alerts/management/commands/cl_send_recap_alerts.py index e9ec48e2d8..763651c985 100644 --- a/cl/alerts/management/commands/cl_send_recap_alerts.py +++ b/cl/alerts/management/commands/cl_send_recap_alerts.py @@ -12,7 +12,8 @@ from elasticsearch import Elasticsearch from elasticsearch.exceptions import RequestError, TransportError from elasticsearch_dsl import connections -from elasticsearch_dsl.response import Hit +from elasticsearch_dsl.response import Hit, Response +from elasticsearch_dsl.utils import AttrList from redis import Redis from cl.alerts.models import Alert, ScheduledAlertHit @@ -21,7 +22,6 @@ add_document_hit_to_alert_set, alert_hits_limit_reached, has_document_alert_hit_been_triggered, - query_includes_rd_field, recap_document_hl_matched, ) from cl.api.models import WebhookEventType @@ -30,7 +30,11 @@ from cl.lib.date_time import dt_as_local_date from cl.lib.elasticsearch_utils import do_es_sweep_alert_query from cl.lib.redis_utils import get_redis_interface -from cl.search.documents import DocketDocument, RECAPSweepDocument +from cl.search.documents import ( + DocketDocument, + ESRECAPSweepDocument, + RECAPSweepDocument, +) from cl.search.exception import ( BadProximityQuery, UnbalancedParenthesesQuery, @@ -117,8 +121,9 @@ def retrieve_task_info(task_info: dict[str, Any]) -> dict[str, Any]: def index_daily_recap_documents( r: Redis, source_index_name: str, - target_index: Type[RECAPSweepDocument], + target_index: Type[RECAPSweepDocument] | Type[ESRECAPSweepDocument], testing: bool = False, + only_rd: bool = False, ) -> int: """Index Dockets added/modified during the day and all their RECAPDocuments and RECAPDocuments added/modified during the day and their parent Dockets. @@ -130,6 +135,8 @@ def index_daily_recap_documents( :param target_index: The target Elasticsearch index to which documents will be re-indexed. :param testing: Boolean flag for testing mode. + :param only_rd: Whether to reindex only RECAPDocuments into the + ESRECAPSweepDocument index. :return: The total number of documents re-indexed. 
""" @@ -166,14 +173,31 @@ def index_daily_recap_documents( today_datetime_iso = local_midnight_utc.isoformat().replace("+00:00", "Z") next_day_utc_iso = next_day_utc.isoformat().replace("+00:00", "Z") # Re Index API query. - query = { - "bool": { - "should": [ - # Dockets added/modified today - { - "bool": { - "must": [ - { + query = ( + { + "bool": { + "should": [ + # Dockets added/modified today + { + "bool": { + "must": [ + { + "range": { + "timestamp": { + "gte": today_datetime_iso, + "lt": next_day_utc_iso, + } + } + }, + {"term": {"docket_child": "docket"}}, + ] + } + }, + # RECAPDocuments with parents added/modified today + { + "has_parent": { + "parent_type": "docket", + "query": { "range": { "timestamp": { "gte": today_datetime_iso, @@ -181,29 +205,29 @@ def index_daily_recap_documents( } } }, - {"term": {"docket_child": "docket"}}, - ] - } - }, - # RECAPDocuments with parents added/modified today - { - "has_parent": { - "parent_type": "docket", - "query": { - "range": { - "timestamp": { - "gte": today_datetime_iso, - "lt": next_day_utc_iso, - } - } - }, - } - }, - # RECAPDocuments added/modified today - { - "bool": { - "must": [ - { + } + }, + # RECAPDocuments added/modified today + { + "bool": { + "must": [ + { + "range": { + "timestamp": { + "gte": today_datetime_iso, + "lt": next_day_utc_iso, + } + } + }, + {"term": {"docket_child": "recap_document"}}, + ] + } + }, + # Dockets that are parents of RECAPDocuments added/modified today + { + "has_child": { + "type": "recap_document", + "query": { "range": { "timestamp": { "gte": today_datetime_iso, @@ -211,27 +235,49 @@ def index_daily_recap_documents( } } }, - {"term": {"docket_child": "recap_document"}}, - ] - } - }, - # Dockets that are parents of RECAPDocuments added/modified today - { - "has_child": { - "type": "recap_document", - "query": { - "range": { - "timestamp": { - "gte": today_datetime_iso, - "lt": next_day_utc_iso, + } + }, + ] + } + } + if not only_rd + else { + "bool": { + "should": [ + # RECAPDocuments with parents added/modified today + { + "has_parent": { + "parent_type": "docket", + "query": { + "range": { + "timestamp": { + "gte": today_datetime_iso, + "lt": next_day_utc_iso, + } } - } - }, - } - }, - ] + }, + } + }, + # RECAPDocuments added/modified today + { + "bool": { + "must": [ + { + "range": { + "timestamp": { + "gte": today_datetime_iso, + "lt": next_day_utc_iso, + } + } + }, + {"term": {"docket_child": "recap_document"}}, + ] + } + }, + ] + } } - } + ) if not r.exists("alert_sweep:task_id"): # Remove the index from the previous day and create a new one. @@ -241,12 +287,39 @@ def index_daily_recap_documents( # In case of a failure, store the task_id in Redis so the command # can be resumed. 
-        response = es.reindex(
-            source={"index": source_index_name, "query": query},
-            dest={"index": target_index_name},
-            wait_for_completion=False,
-            refresh=True,
-        )
+        params = {
+            "source": {"index": source_index_name, "query": query},
+            "dest": {"index": target_index_name},
+            "wait_for_completion": False,
+            "refresh": True,
+        }
+        if only_rd:
+            # Re-index only RECAPDocument fields to the ESRECAPSweepDocument
+            # index.
+            params["script"] = {
+                "source": """
+                def fields = [
+                    'id',
+                    'docket_entry_id',
+                    'description',
+                    'entry_number',
+                    'entry_date_filed',
+                    'short_description',
+                    'document_type',
+                    'document_number',
+                    'pacer_doc_id',
+                    'plain_text',
+                    'attachment_number',
+                    'is_available',
+                    'page_count',
+                    'filepath_local',
+                    'absolute_url',
+                    'cites'
+                ];
+                ctx._source.keySet().retainAll(fields);
+                """
+            }
+        response = es.reindex(**params)
         # Store the task ID in Redis
         task_id = response["task"]
         r.set("alert_sweep:task_id", task_id)
@@ -291,8 +364,6 @@ def index_daily_recap_documents(
                 break
 
     r.delete("alert_sweep:task_id")
-    if not testing:
-        r.set("alert_sweep:re_index_completed", 1, ex=3600 * 12)
     return get_task_info["total"]
@@ -324,12 +395,20 @@ def should_docket_hit_be_included(
     return False
 
 
-def filter_rd_alert_hits(r: Redis, alert_id: int, rd_hits, check_rd_hl=False):
+def filter_rd_alert_hits(
+    r: Redis,
+    alert_id: int,
+    rd_hits: AttrList,
+    rd_ids: list[int],
+    check_rd_hl=False,
+):
     """Filter RECAP document hits based on specified conditions.
 
     :param r: The Redis interface.
     :param alert_id: The ID of the alert.
-    :param rd_hits: A list of RECAP document hits to be processed.
+    :param rd_hits: A list of RECAPDocument hits to be processed.
+    :param rd_ids: A list of RECAPDocument IDs that matched the
+    RECAPDocument-only query.
     :param check_rd_hl: A boolean indicating whether to check if the RECAP
     document hit matched RD HLs.
     :return: A list of RECAP document hits that meet all specified conditions.
@@ -343,7 +422,10 @@ def filter_rd_alert_hits(
             )
         ]
         if check_rd_hl:
-            conditions.append(recap_document_hl_matched(rd_hit))
+            if not recap_document_hl_matched(rd_hit):
+                # The hit didn't match any RECAPDocument HL; include it only
+                # if it matched the RECAPDocument-only query.
+                conditions.append(rd_hit["_source"]["id"] in rd_ids)
         if all(conditions):
             rds_to_send.append(rd_hit)
             add_document_hit_to_alert_set(
@@ -354,11 +436,13 @@
 def query_alerts(
     search_params: QueryDict,
-) -> tuple[list[Hit] | None, int | None]:
+) -> tuple[list[Hit] | None, Response | None, Response | None]:
     try:
         search_query = RECAPSweepDocument.search()
+        child_search_query = ESRECAPSweepDocument.search()
         return do_es_sweep_alert_query(
             search_query,
+            child_search_query,
             search_params,
         )
     except (
@@ -371,35 +455,52 @@ def query_alerts(
     ):
         traceback.print_exc()
         logger.info(f"Search for this alert failed: {search_params}\n")
-        return None, None
+        return None, None, None
 
 
 def process_alert_hits(
-    r: Redis, results: list[Hit], search_params: QueryDict, alert_id: int
+    r: Redis,
+    results: list[Hit],
+    parent_results: Response | None,
+    child_results: Response | None,
+    alert_id: int,
 ) -> list[Hit]:
     """Process alert hits by filtering and prepare the results to send based
     on alert conditions.
 
     :param r: The Redis instance.
     :param results: A list of Hit objects containing search results.
-    :param search_params: Query parameters used for the search.
+ :param parent_results: The ES Response for the docket-only query. + :param child_results: The ES Response for the RECAPDocument-only query. :param alert_id: The ID of the alert being processed. :return: A list of Hit objects that are filtered and prepared to be sent. """ - includes_rd_fields = query_includes_rd_field(search_params) + docket_hits = parent_results.hits if parent_results else [] + docket_ids = [int(d.docket_id) for d in docket_hits] + + rd_hits = child_results.hits if child_results else [] + rd_ids = [int(r.id) for r in rd_hits] results_to_send = [] if len(results) > 0: for hit in results: - if not includes_rd_fields: + if hit.docket_id in docket_ids: # Possible Docket-only alert rds_to_send = filter_rd_alert_hits( - r, alert_id, hit["child_docs"], check_rd_hl=True + r, alert_id, hit["child_docs"], rd_ids, check_rd_hl=True ) if rds_to_send: # Cross-object query hit["child_docs"] = rds_to_send results_to_send.append(hit) + if should_docket_hit_be_included( + r, alert_id, hit.docket_id + ): + add_document_hit_to_alert_set( + r, alert_id, "d", hit.docket_id + ) + + # Docket-only alert elif should_docket_hit_be_included(r, alert_id, hit.docket_id): # Docket-only alert hit["child_docs"] = [] @@ -407,10 +508,11 @@ def process_alert_hits( add_document_hit_to_alert_set( r, alert_id, "d", hit.docket_id ) + else: # RECAP-only alerts or cross-object alerts rds_to_send = filter_rd_alert_hits( - r, alert_id, hit["child_docs"] + r, alert_id, hit["child_docs"], rd_ids ) if rds_to_send: # Cross-object alert @@ -456,13 +558,19 @@ def query_and_send_alerts(r: Redis, rate: str) -> None: alerts_to_update = [] for alert in alerts: search_params = QueryDict(alert.query.encode(), mutable=True) - results, _ = query_alerts(search_params) + results, parent_results, child_results = query_alerts( + search_params + ) if not results: continue alerts_to_update.append(alert.pk) search_type = search_params.get("type", SEARCH_TYPES.RECAP) results_to_send = process_alert_hits( - r, results, search_params, alert.pk + r, + results, + parent_results, + child_results, + alert.pk, ) if results_to_send: hits.append( @@ -500,11 +608,18 @@ def query_and_schedule_alerts(r: Redis, rate: str): scheduled_hits_to_create = [] for alert in alerts: search_params = QueryDict(alert.query.encode(), mutable=True) - results, _ = query_alerts(search_params) + results, parent_results, child_results = query_alerts( + search_params + ) if not results: continue + results_to_send = process_alert_hits( - r, results, search_params, alert.pk + r, + results, + parent_results, + child_results, + alert.pk, ) if results_to_send: for hit in results_to_send: @@ -563,6 +678,16 @@ def handle(self, *args, **options): RECAPSweepDocument, testing=testing_mode, ) + index_daily_recap_documents( + r, + DocketDocument._index._name, + ESRECAPSweepDocument, + testing=testing_mode, + only_rd=True, + ) + if not testing_mode: + r.set("alert_sweep:re_index_completed", 1, ex=3600 * 12) + query_and_send_alerts(r, Alert.REAL_TIME) query_and_send_alerts(r, Alert.DAILY) query_and_schedule_alerts(r, Alert.WEEKLY) diff --git a/cl/alerts/management/commands/clean_up_search_alerts.py b/cl/alerts/management/commands/clean_up_search_alerts.py index b00d7128a3..cf1ceb2f54 100644 --- a/cl/alerts/management/commands/clean_up_search_alerts.py +++ b/cl/alerts/management/commands/clean_up_search_alerts.py @@ -75,7 +75,7 @@ def validate_queries_syntax(options: OptionsType) -> None: if search_form.is_valid(): cd = search_form.cleaned_data try: - s, _ = 
build_es_base_query(search_query, cd) + s, _, _ = build_es_base_query(search_query, cd) s = s.extra(size=0) s.execute().to_dict() # Waiting between requests to avoid hammering ES too quickly. diff --git a/cl/alerts/tests/tests_recap_alerts.py b/cl/alerts/tests/tests_recap_alerts.py index cf9ea6b145..1fd573c8fb 100644 --- a/cl/alerts/tests/tests_recap_alerts.py +++ b/cl/alerts/tests/tests_recap_alerts.py @@ -17,7 +17,7 @@ index_daily_recap_documents, ) from cl.alerts.models import SEARCH_TYPES, Alert, ScheduledAlertHit -from cl.alerts.utils import query_includes_rd_field, recap_document_hl_matched +from cl.alerts.utils import recap_document_hl_matched from cl.api.factories import WebhookFactory from cl.api.models import WebhookEvent, WebhookEventType from cl.donate.models import NeonMembership @@ -78,6 +78,7 @@ def setUp(self): self.r = get_redis_interface("CACHE") self.r.delete("alert_sweep:query_date") self.r.delete("alert_sweep:task_id") + self.r.delete("alert_hits:") @staticmethod def get_html_content_from_email(email_content): @@ -257,7 +258,10 @@ async def test_recap_document_hl_matched(self) -> None: "q": '"401 Civil"', } search_query = RECAPSweepDocument.search() - results, total_hits = await sync_to_async(do_es_sweep_alert_query)( + results, parent_results, _ = await sync_to_async( + do_es_sweep_alert_query + )( + search_query, search_query, search_params, ) @@ -272,7 +276,10 @@ async def test_recap_document_hl_matched(self) -> None: "q": '"Mauris iaculis, leo sit amet hendrerit vehicula"', } search_query = RECAPSweepDocument.search() - results, total_hits = await sync_to_async(do_es_sweep_alert_query)( + results, parent_results, _ = await sync_to_async( + do_es_sweep_alert_query + )( + search_query, search_query, search_params, ) @@ -287,7 +294,10 @@ async def test_recap_document_hl_matched(self) -> None: "q": "SUBPOENAS SERVED OFF Mauris iaculis", } search_query = RECAPSweepDocument.search() - results, total_hits = await sync_to_async(do_es_sweep_alert_query)( + results, parent_results, _ = await sync_to_async( + do_es_sweep_alert_query + )( + search_query, search_query, search_params, ) @@ -296,58 +306,6 @@ async def test_recap_document_hl_matched(self) -> None: rd_field_matched = recap_document_hl_matched(rd) self.assertEqual(rd_field_matched, True) - async def test_query_includes_rd_field(self) -> None: - """Test query_includes_rd_field method that checks if a query - includes any indexed fields in the query string or filters specific to - RECAP Documents. - """ - - # Docket-only query - search_params = { - "type": SEARCH_TYPES.RECAP, - "q": '"401 Civil"', - } - self.assertEqual(query_includes_rd_field(search_params), False) - - # RECAPDocument-only query - search_params = { - "type": SEARCH_TYPES.RECAP, - "q": 'description:"lorem ipsum"', - } - self.assertEqual(query_includes_rd_field(search_params), True) - - # Cross-object query - search_params = { - "type": SEARCH_TYPES.RECAP, - "q": 'case_name:"American v." 
description:"lorem ipsum"', - } - self.assertEqual(query_includes_rd_field(search_params), True) - - # Docket-only query - search_params = { - "type": SEARCH_TYPES.RECAP, - "q": "", - "case_name": "SUBPOENAS", - } - self.assertEqual(query_includes_rd_field(search_params), False) - - # RECAPDocument-only query - search_params = { - "type": SEARCH_TYPES.RECAP, - "q": "", - "description": "Lorem", - } - self.assertEqual(query_includes_rd_field(search_params), True) - - # Cross-object query - search_params = { - "type": SEARCH_TYPES.RECAP, - "q": "", - "case_name": "SUBPOENAS", - "document_number": 1, - } - self.assertEqual(query_includes_rd_field(search_params), True) - def test_filter_recap_alerts_to_send(self) -> None: """Test filter RECAP alerts that met the conditions to be sent: - RECAP type alert. @@ -907,6 +865,210 @@ def test_filter_out_alerts_to_send_by_query_and_hits(self) -> None: docket.delete() + def test_special_cross_object_alerts(self) -> None: + """This test confirms that hits are properly filtered out or included + in alerts for special cross-object alerts that can match either a + Docket-only hit and/or Docket + RDs simultaneously in the same hit. + These cases include queries that use an OR clause combining + Docket field + RD fields or a text query that can match a Docket and + RD field simultaneously. + """ + + # The following test confirms that an alert with a query that can match + # a Docket or RECAPDocuments simultaneously is properly filtered. + cross_object_alert_d_or_rd_field = AlertFactory( + user=self.user_profile.user, + rate=Alert.REAL_TIME, + name="Test Alert Cross-object query", + query=f"q=docket_id:{self.de.docket.pk} OR pacer_doc_id:{self.rd_2.pacer_doc_id}&type=r", + ) + with mock.patch( + "cl.api.webhooks.requests.post", + side_effect=lambda *args, **kwargs: MockResponse( + 200, mock_raw=True + ), + ), time_machine.travel(self.mock_date, tick=False): + call_command("cl_send_recap_alerts", testing_mode=True) + + # A new alert should be triggered containing a Docket-only hit and a + # Docket with the nested RD matched. + self.assertEqual( + len(mail.outbox), 1, msg="Outgoing emails don't match." + ) + html_content = self.get_html_content_from_email(mail.outbox[0]) + self._confirm_number_of_alerts(html_content, 1) + # This hit should only display the Docket matched by its ID, + # no RECAPDocument should be matched. + self._assert_child_hits_content( + html_content, + cross_object_alert_d_or_rd_field.name, + self.de.docket.case_name, + [], + ) + + # This hit should display the rd_2 nested below its parent docket. + self._assert_child_hits_content( + html_content, + cross_object_alert_d_or_rd_field.name, + self.de_1.docket.case_name, + [self.rd_2.description], + ) + + # Assert email text version: + txt_email = mail.outbox[0].body + self.assertIn(cross_object_alert_d_or_rd_field.name, txt_email) + self.assertIn(self.rd_2.description, txt_email) + + # This test confirms a text query cross-object alert matches documents + # according to trigger conditions like indexed date and previous triggers + # by the same document. 
+ two_days_before = self.mock_date - datetime.timedelta(days=2) + mock_two_days_before = two_days_before.replace(hour=5) + with time_machine.travel(mock_two_days_before, tick=False): + docket = DocketFactory( + court=self.court, + case_name="United States of America", + docket_number="1:21-bk-1009", + source=Docket.RECAP, + ) + + with time_machine.travel( + self.mock_date, tick=False + ), self.captureOnCommitCallbacks(execute=True): + alert_de = DocketEntryWithParentsFactory( + docket=docket, + entry_number=1, + date_filed=datetime.date(2024, 8, 19), + description="MOTION for Leave to File", + ) + rd_3 = RECAPDocumentFactory( + docket_entry=alert_de, + description="Motion to File New", + document_number="2", + pacer_doc_id="018036652875", + plain_text="United states Lorem", + ) + + docket_2 = DocketFactory( + court=self.court, + case_name="United States of America vs Lorem", + docket_number="1:21-bk-1008", + source=Docket.RECAP, + ) + + cross_object_alert_text = AlertFactory( + user=self.user_profile.user, + rate=Alert.REAL_TIME, + name="Test Alert Cross-object query", + query=f'q="United states"&type=r', + ) + with mock.patch( + "cl.api.webhooks.requests.post", + side_effect=lambda *args, **kwargs: MockResponse( + 200, mock_raw=True + ), + ), time_machine.travel(self.mock_date, tick=False): + call_command("cl_send_recap_alerts", testing_mode=True) + + # A new alert should be triggered containing two hits. One matched by + # the rd_3 plain text description and one matched by docket_2 case_name + self.assertEqual( + len(mail.outbox), 2, msg="Outgoing emails don't match." + ) + html_content = self.get_html_content_from_email(mail.outbox[1]) + # rd_3 should appear nested in this hit. + self._confirm_number_of_alerts(html_content, 1) + self._assert_child_hits_content( + html_content, + cross_object_alert_text.name, + docket.case_name, + [rd_3.description], + ) + # The docket_2 hit shouldn't contain RDs. + self._assert_child_hits_content( + html_content, + cross_object_alert_text.name, + docket_2.case_name, + [], + ) + + # Modify the docket today: + with time_machine.travel( + self.mock_date, tick=False + ), self.captureOnCommitCallbacks(execute=True): + docket.cause = "405 Civil" + docket.save() + + # Trigger the alert again: + with mock.patch( + "cl.api.webhooks.requests.post", + side_effect=lambda *args, **kwargs: MockResponse( + 200, mock_raw=True + ), + ), time_machine.travel(self.mock_date, tick=False): + call_command("cl_send_recap_alerts", testing_mode=True) + + # A new alert should be triggered containing docket as a hit with no + # nested RDs. + html_content = self.get_html_content_from_email(mail.outbox[2]) + self.assertEqual( + len(mail.outbox), 3, msg="Outgoing emails don't match." + ) + self._assert_child_hits_content( + html_content, + cross_object_alert_text.name, + docket.case_name, + [], + ) + + # Trigger alert again: + with mock.patch( + "cl.api.webhooks.requests.post", + side_effect=lambda *args, **kwargs: MockResponse( + 200, mock_raw=True + ), + ), time_machine.travel(self.mock_date, tick=False): + call_command("cl_send_recap_alerts", testing_mode=True) + + # No new alerts should be triggered. + self.assertEqual( + len(mail.outbox), 3, msg="Outgoing emails don't match." + ) + + # This test confirms that we're able to trigger cross-object alerts + # that include an OR clause and match documents that belong to the + # same case. 
+        cross_object_alert_d_or_rd_field_same_case = AlertFactory(
+            user=self.user_profile.user,
+            rate=Alert.REAL_TIME,
+            name="Test Alert Cross-object query",
+            query=f"q=docket_id:{self.de.docket.pk} OR pacer_doc_id:{self.rd.pacer_doc_id}&type=r",
+        )
+        with mock.patch(
+            "cl.api.webhooks.requests.post",
+            side_effect=lambda *args, **kwargs: MockResponse(
+                200, mock_raw=True
+            ),
+        ), time_machine.travel(self.mock_date, tick=False):
+            call_command("cl_send_recap_alerts", testing_mode=True)
+
+        # A new alert should be triggered, containing the RD document nested below
+        # its parent docket.
+        html_content = self.get_html_content_from_email(mail.outbox[3])
+        self.assertEqual(
+            len(mail.outbox), 4, msg="Outgoing emails don't match."
+        )
+        self._confirm_number_of_alerts(html_content, 1)
+        self._assert_child_hits_content(
+            html_content,
+            cross_object_alert_d_or_rd_field_same_case.name,
+            self.de.docket.case_name,
+            [self.rd.description],
+        )
+
+        docket.delete()
+        docket_2.delete()
+
     def test_limit_alert_case_child_hits(self) -> None:
         """Test limit case child hits up to 5 and display the
         "View additional results for this Case" button.
diff --git a/cl/alerts/utils.py b/cl/alerts/utils.py
index 44277a04a2..6ad589a913 100644
--- a/cl/alerts/utils.py
+++ b/cl/alerts/utils.py
@@ -169,27 +169,6 @@ def recap_document_hl_matched(rd_hit: Hit) -> bool:
     return False
 
 
-def query_includes_rd_field(query_params: CleanData) -> bool:
-    """Determine whether the query includes any indexed fields in the query
-    string or filters specific to RECAP Documents.
-
-    :param query_params: The query parameters.
-    :return: True if any recap document fields or filters are included in the
-    query, otherwise False.
-    """
-
-    query_string = query_params.get("q", "")
-    for rd_field in recap_document_indexed_fields:
-        if f"{rd_field}:" in query_string:
-            return True
-
-    for rd_filter in recap_document_filters:
-        if query_params.get(rd_filter, ""):
-            return True
-
-    return False
-
-
 def make_alert_set_key(alert_id: int, document_type: str) -> str:
     """Generate a Redis key for storing alert hits.
 
diff --git a/cl/lib/elasticsearch_utils.py b/cl/lib/elasticsearch_utils.py
index 72c982deca..f22b61de47 100644
--- a/cl/lib/elasticsearch_utils.py
+++ b/cl/lib/elasticsearch_utils.py
@@ -1070,7 +1070,7 @@ def build_es_base_query(
     child_highlighting: bool = True,
     api_version: Literal["v3", "v4"] | None = None,
     alerts: bool = False,
-) -> tuple[Search, QueryString | None]:
+) -> tuple[Search, QueryString | None, QueryString | None]:
     """Builds filters and fulltext_query based on the given cleaned data
     and returns an elasticsearch query.
 
@@ -1079,14 +1079,15 @@ def build_es_base_query(
     :param child_highlighting: Whether highlighting should be enabled in child docs.
     :param api_version: Optional, the request API version.
     :param alerts: If highlighting is being applied to search Alerts hits.
-    :return: A two-tuple, the Elasticsearch search query object and an ES
-    QueryString for child documents, or None if there is no need to query
-    child documents.
+    :return: A three-tuple: the Elasticsearch search query object, an ES
+    QueryString for child documents (or None if there is no need to query
+    child documents), and a QueryString for parent documents (or None).
""" main_query = None string_query = None - join_query = None + child_docs_query = None + parent_query = None filters = [] plain_doc = False match cd["type"]: @@ -1131,12 +1132,14 @@ def build_es_base_query( ], ) ) - main_query, join_query = build_full_join_es_queries( - cd, - child_query_fields, - parent_query_fields, - child_highlighting=child_highlighting, - api_version=api_version, + main_query, child_docs_query, parent_query = ( + build_full_join_es_queries( + cd, + child_query_fields, + parent_query_fields, + child_highlighting=child_highlighting, + api_version=api_version, + ) ) case ( @@ -1167,13 +1170,15 @@ def build_es_base_query( ], ) ) - main_query, join_query = build_full_join_es_queries( - cd, - child_query_fields, - parent_query_fields, - child_highlighting=child_highlighting, - api_version=api_version, - alerts=alerts, + main_query, child_docs_query, parent_query = ( + build_full_join_es_queries( + cd, + child_query_fields, + parent_query_fields, + child_highlighting=child_highlighting, + api_version=api_version, + alerts=alerts, + ) ) case SEARCH_TYPES.OPINION: @@ -1207,13 +1212,15 @@ def build_es_base_query( ], ) ) - main_query, join_query = build_full_join_es_queries( - cd, - child_query_fields, - parent_query_fields, - mlt_query, - child_highlighting=child_highlighting, - api_version=api_version, + main_query, child_docs_query, parent_query = ( + build_full_join_es_queries( + cd, + child_query_fields, + parent_query_fields, + mlt_query, + child_highlighting=child_highlighting, + api_version=api_version, + ) ) if not any([filters, string_query, main_query]): @@ -1222,7 +1229,7 @@ def build_es_base_query( match_all_query = get_match_all_query( cd, search_query, api_version, child_highlighting ) - return match_all_query, join_query + return match_all_query, child_docs_query, parent_query if plain_doc: # Combine the filters and string query for plain documents like Oral @@ -1231,7 +1238,7 @@ def build_es_base_query( cd, filters, string_query, api_version ) - return search_query.query(main_query), join_query + return search_query.query(main_query), child_docs_query, parent_query def build_has_parent_parties_query( @@ -1261,7 +1268,7 @@ def build_has_parent_parties_query( def build_child_docs_query( - join_query: QueryString | None, + child_docs_query: QueryString | None, cd: CleanData, exclude_docs_for_empty_field: str = "", ) -> QueryString: @@ -1271,7 +1278,7 @@ def build_child_docs_query( to retrieve child documents directly, such as in the Opinions Feed, RECAP Feed, RECAP Documents count query, and V4 RECAP_DOCUMENT Search API. - :param join_query: Existing Elasticsearch QueryString object or None + :param child_docs_query: Existing Elasticsearch QueryString object or None :param cd: The user input CleanedData :param exclude_docs_for_empty_field: Field that should not be empty for a document to be included @@ -1289,7 +1296,7 @@ def build_child_docs_query( ] parties_has_parent_query = build_has_parent_parties_query(parties_filters) - if not join_query: + if not child_docs_query: # Match all query case. 
if not exclude_docs_for_empty_field: if cd["type"] == SEARCH_TYPES.OPINION: @@ -1311,7 +1318,7 @@ def build_child_docs_query( filters.append(child_query_recap) return Q("bool", filter=filters) - query_dict = join_query.to_dict() + query_dict = child_docs_query.to_dict() if "filter" in query_dict["bool"]: existing_filter = query_dict["bool"]["filter"] if cd["type"] == SEARCH_TYPES.OPINION: @@ -1373,7 +1380,7 @@ def get_facet_dict_for_search_query( """ cd["just_facets_query"] = True - search_query, _ = build_es_base_query(search_query, cd) + search_query, _, _ = build_es_base_query(search_query, cd) search_query.aggs.bucket("status", A("terms", field="status.raw")) search_query = search_query.extra(size=0) response = search_query.execute() @@ -1395,7 +1402,7 @@ def build_es_main_query( applicable. """ search_query_base = search_query - search_query, join_query = build_es_base_query(search_query, cd) + search_query, child_docs_query, _ = build_es_base_query(search_query, cd) top_hits_limit = 5 child_docs_count_query = None match cd["type"]: @@ -1413,7 +1420,9 @@ def build_es_main_query( top_hits_limit, ) case SEARCH_TYPES.RECAP | SEARCH_TYPES.DOCKETS: - child_docs_count_query = build_child_docs_query(join_query, cd) + child_docs_count_query = build_child_docs_query( + child_docs_query, cd + ) if child_docs_count_query: # Get the total RECAP Documents count. child_docs_count_query = search_query_base.query( @@ -2214,13 +2223,13 @@ def build_search_feed_query( hl_field = "text" if cd["type"] == SEARCH_TYPES.RECAP: hl_field = "plain_text" - s, join_query = build_es_base_query(search_query, cd) + s, child_docs_query, _ = build_es_base_query(search_query, cd) if jurisdiction or cd["type"] == SEARCH_TYPES.RECAP: # An Opinion Jurisdiction feed or RECAP Search displays child documents # Eliminate items that lack the ordering field and apply highlighting # to create a snippet for the plain_text or text fields. s = build_child_docs_query( - join_query, + child_docs_query, cd=cd, exclude_docs_for_empty_field=exclude_docs_for_empty_field, ) @@ -2336,7 +2345,7 @@ def build_full_join_es_queries( child_highlighting: bool = True, api_version: Literal["v3", "v4"] | None = None, alerts: bool = False, -) -> tuple[QueryString | list, QueryString | None]: +) -> tuple[QueryString | list, QueryString | None, QueryString | None]: """Build a complete Elasticsearch query with both parent and child document conditions. @@ -2347,7 +2356,8 @@ def build_full_join_es_queries( :param child_highlighting: Whether highlighting should be enabled in child docs. :param api_version: Optional, the request API version. :param alerts: If highlighting is being applied to search Alerts hits. - :return: An Elasticsearch QueryString object. + :return: A three-tuple: the main join query, the child documents query, and + the parent documents query. 
""" q_should = [] @@ -2363,7 +2373,8 @@ def build_full_join_es_queries( case SEARCH_TYPES.PEOPLE: child_type = "position" - join_query = None + child_docs_query = None + parent_query = None if cd["type"] in [ SEARCH_TYPES.RECAP, SEARCH_TYPES.DOCKETS, @@ -2419,18 +2430,18 @@ def build_full_join_es_queries( case [], []: pass case [], _: - join_query = Q( + child_docs_query = Q( "bool", should=child_text_query, minimum_should_match=1, ) case _, []: - join_query = Q( + child_docs_query = Q( "bool", filter=child_filters, ) case _, _: - join_query = Q( + child_docs_query = Q( "bool", filter=child_filters, should=child_text_query, @@ -2446,7 +2457,7 @@ def build_full_join_es_queries( (child_highlighting, cd["type"]), {} ) has_child_query = build_has_child_query( - join_query, + child_docs_query, child_type, query_hits_limit, hl_fields, @@ -2525,9 +2536,9 @@ def build_full_join_es_queries( q_should.append(parent_query) if not q_should: - return [], join_query + return [], child_docs_query, parent_query - final_query = apply_custom_score_to_main_query( + main_join_query = apply_custom_score_to_main_query( cd, Q( "bool", @@ -2535,10 +2546,7 @@ def build_full_join_es_queries( ), api_version, ) - return ( - final_query, - join_query, - ) + return (main_join_query, child_docs_query, parent_query) def limit_inner_hits( @@ -2859,10 +2867,8 @@ def do_es_api_query( child documents. """ - child_docs_query = None - try: - s, join_query = build_es_base_query( + s, child_docs_query, _ = build_es_base_query( search_query, cd, cd["highlight"], api_version ) except ( @@ -2881,7 +2887,7 @@ def do_es_api_query( # Note that in V3 Case Law Search, opinions are collapsed by cluster_id # meaning that only one result per cluster is shown. s = build_child_docs_query( - join_query, + child_docs_query, cd=cd, ) main_query = search_query.query(s) @@ -2917,7 +2923,7 @@ def do_es_api_query( ) else: child_docs_query = build_child_docs_query( - join_query, + child_docs_query, cd=cd, ) # Build query params for the ES V4 Search API endpoints. @@ -3032,7 +3038,7 @@ def do_es_alert_estimation_query( days=int(day_count) ) cd[before_field] = None - estimation_query, _ = build_es_base_query(search_query, cd) + estimation_query, _, _ = build_es_base_query(search_query, cd) if cd["type"] == SEARCH_TYPES.RECAP: # The RECAP estimation query consists of two requests: one to estimate @@ -3054,8 +3060,8 @@ def do_es_alert_estimation_query( multi_search = multi_search.add(main_doc_count_query) # Build RECAPDocuments count query. - _, join_query = build_es_base_query(search_query, cd) - child_docs_count_query = build_child_docs_query(join_query, cd) + _, child_docs_query, _ = build_es_base_query(search_query, cd) + child_docs_count_query = build_child_docs_query(child_docs_query, cd) child_total = 0 if child_docs_count_query: child_docs_count_query = search_query.query(child_docs_count_query) @@ -3077,11 +3083,14 @@ def do_es_alert_estimation_query( def do_es_sweep_alert_query( search_query: Search, + child_search_query: Search, cd: CleanData, -) -> tuple[list[Hit] | None, int | None]: +) -> tuple[list[Hit] | None, Response | None, Response | None]: """Build an ES query for its use in the daily RECAP sweep index. :param search_query: Elasticsearch DSL Search object. + :param child_search_query: The Elasticsearch DSL search query to perform + the child-only query. 
:param cd: The query CleanedData :return: A two-tuple, the Elasticsearch search query object and an ES Query for child documents, or None if there is no need to query @@ -3092,29 +3101,54 @@ def do_es_sweep_alert_query( if search_form.is_valid(): cd = search_form.cleaned_data else: - return None, None - - total_hits = None - - s, _ = build_es_base_query(search_query, cd, True, alerts=True) + return None, None, None + s, child_query, parent_query = build_es_base_query( + search_query, cd, True, alerts=True + ) main_query = add_es_highlighting(s, cd, alerts=True) main_query = main_query.sort(build_sort_results(cd)) main_query = main_query.extra( from_=0, size=settings.SCHEDULED_ALERT_HITS_LIMIT ) - results = main_query.execute() - if results: - total_hits = results.hits.total.value - - limit_inner_hits({}, results, cd["type"]) - set_results_highlights(results, cd["type"]) - for result in results: + multi_search = MultiSearch() + multi_search = multi_search.add(main_query) + if parent_query: + parent_search = search_query.query(parent_query) + parent_search = parent_search.extra( + from_=0, size=settings.SCHEDULED_ALERT_HITS_LIMIT + ) + parent_search = parent_search.source(includes=["docket_id"]) + multi_search = multi_search.add(parent_search) + + if child_query: + child_search = child_search_query.query(child_query) + child_search = child_search.extra( + from_=0, + size=settings.SCHEDULED_ALERT_HITS_LIMIT + * settings.RECAP_CHILD_HITS_PER_RESULT, + ) + child_search = child_search.source(includes=["id"]) + multi_search = multi_search.add(child_search) + + responses = multi_search.execute() + main_results = responses[0] + rd_results = None + docket_results = None + if parent_query: + docket_results = responses[1] + if child_query: + rd_results = responses[2] + + limit_inner_hits({}, main_results, cd["type"]) + set_results_highlights(main_results, cd["type"]) + + for result in main_results: child_result_objects = [] if hasattr(result, "child_docs"): for child_doc in result.child_docs: child_result_objects.append(child_doc.to_dict()) result["child_docs"] = child_result_objects - return results, total_hits + return main_results, docket_results, rd_results diff --git a/cl/search/documents.py b/cl/search/documents.py index 378dbb9477..cedc638170 100644 --- a/cl/search/documents.py +++ b/cl/search/documents.py @@ -4,6 +4,7 @@ from django.http import QueryDict from django.utils.html import escape, strip_tags from django_elasticsearch_dsl import Document, fields +from elasticsearch_dsl import Document as DSLDocument from cl.alerts.models import Alert from cl.audio.models import Audio @@ -364,7 +365,7 @@ def prepare_percolator_query(self, instance): cd = search_form.cleaned_data search_query = AudioDocument.search() - query, _ = build_es_base_query(search_query, cd) + query, _, _ = build_es_base_query(search_query, cd) return query.to_dict()["query"] @@ -961,8 +962,7 @@ def prepare_timestamp(self, instance): return datetime.utcnow() -@recap_index.document -class ESRECAPDocument(DocketBaseDocument): +class ESRECAPBaseDocument(DSLDocument): id = fields.IntegerField(attr="pk") docket_entry_id = fields.IntegerField(attr="docket_entry.pk") description = fields.TextField( @@ -1030,6 +1030,10 @@ class ESRECAPDocument(DocketBaseDocument): fields.IntegerField(multi=True), ) + +@recap_index.document +class ESRECAPDocument(DocketBaseDocument, ESRECAPBaseDocument): + class Django: model = RECAPDocument ignore_signals = True @@ -1837,3 +1841,14 @@ class Index: "number_of_replicas": 
settings.ELASTICSEARCH_RECAP_NUMBER_OF_REPLICAS, "analysis": settings.ELASTICSEARCH_DSL["analysis"], } + + +class ESRECAPSweepDocument(ESRECAPBaseDocument): + + class Index: + name = "recap_document_sweep" + settings = { + "number_of_shards": settings.ELASTICSEARCH_RECAP_NUMBER_OF_SHARDS, + "number_of_replicas": settings.ELASTICSEARCH_RECAP_NUMBER_OF_REPLICAS, + "analysis": settings.ELASTICSEARCH_DSL["analysis"], + } diff --git a/cl/search/tests/tests_es_oral_arguments.py b/cl/search/tests/tests_es_oral_arguments.py index f47453b7bc..a8f2db4541 100644 --- a/cl/search/tests/tests_es_oral_arguments.py +++ b/cl/search/tests/tests_es_oral_arguments.py @@ -984,7 +984,7 @@ def confirm_query_matched(response, query_id) -> bool: @staticmethod def save_percolator_query(cd): search_query = AudioDocument.search() - query, _ = build_es_base_query(search_query, cd) + query, _, _ = build_es_base_query(search_query, cd) query_dict = query.to_dict()["query"] percolator_query = AudioPercolator( percolator_query=query_dict, rate=Alert.REAL_TIME diff --git a/cl/search/tests/tests_es_person.py b/cl/search/tests/tests_es_person.py index 0eb72bfe96..ef994b9a50 100644 --- a/cl/search/tests/tests_es_person.py +++ b/cl/search/tests/tests_es_person.py @@ -1342,7 +1342,7 @@ def test_has_child_filters(self) -> None: "type": SEARCH_TYPES.PEOPLE, } s = PersonDocument.search() - main_query, _ = build_es_base_query(s, cd) + main_query, _, _ = build_es_base_query(s, cd) self.assertEqual(main_query.count(), 2) # Query by parent field dob_state and child field selection_method. @@ -1352,7 +1352,7 @@ def test_has_child_filters(self) -> None: "type": SEARCH_TYPES.PEOPLE, } s = PersonDocument.search() - main_query, _ = build_es_base_query(s, cd) + main_query, _, _ = build_es_base_query(s, cd) self.assertEqual(main_query.count(), 1) position_5.delete() From a4e4e62250184b1de9b754cceff5cf60deb19723 Mon Sep 17 00:00:00 2001 From: Alberto Islas Date: Wed, 10 Jul 2024 16:42:27 -0600 Subject: [PATCH 19/33] fix(alerts): Added more tests related to filtering cross-object hits. - Fixed issues and improved command resumability --- .../commands/cl_send_recap_alerts.py | 115 +++++--- cl/alerts/tests/tests_recap_alerts.py | 265 ++++++++++++++---- 2 files changed, 290 insertions(+), 90 deletions(-) diff --git a/cl/alerts/management/commands/cl_send_recap_alerts.py b/cl/alerts/management/commands/cl_send_recap_alerts.py index 763651c985..7f05ce4291 100644 --- a/cl/alerts/management/commands/cl_send_recap_alerts.py +++ b/cl/alerts/management/commands/cl_send_recap_alerts.py @@ -2,7 +2,7 @@ import datetime import time import traceback -from typing import Any, Type +from typing import Any, Literal, Type import pytz from asgiref.sync import async_to_sync @@ -140,12 +140,21 @@ def index_daily_recap_documents( :return: The total number of documents re-indexed. """ - if r.exists("alert_sweep:re_index_completed"): + if r.exists("alert_sweep:main_re_index_completed"): logger.info( - "The re-index task has been completed and will be omitted." + "The main re-index task has been completed and will be omitted." ) - # The re-indexing has been completed for the day. Abort it and proceed - # with sending alerts. + # The main re-indexing has been completed for the day. Abort it and + # proceed with RECAPDocument re-index. + return 0 + + if r.exists("alert_sweep:rd_re_index_completed"): + logger.info( + "The RECAPDocument only re-index task has been completed and will " + "be omitted." + ) + # The RECAPDocument re-indexing has been completed for the day. 
Abort
+        # it and proceed with sending alerts.
+        return 0
 
     if not r.exists("alert_sweep:query_date"):
@@ -368,7 +377,7 @@ def fields = [
 
 
 def should_docket_hit_be_included(
-    r: Redis, alert_id: int, docket_id: int
+    r: Redis, alert_id: int, docket_id: int, query_date: datetime.date
 ) -> bool:
     """Determine if a Docket alert should be triggered based on its
     date_modified and if the docket has triggered the alert previously.
@@ -376,21 +385,19 @@
     :param r: The Redis interface.
     :param alert_id: The ID of the alert.
     :param docket_id: The ID of the docket.
+    :param query_date: The daily re_index query date.
     :return: True if the Docket alert should be triggered, False otherwise.
     """
     docket = Docket.objects.filter(id=docket_id).only("date_modified").first()
     if not docket:
         return False
    if not has_document_alert_hit_been_triggered(r, alert_id, "d", docket_id):
-        local_midnight_localized = timezone.localtime(
-            timezone.make_aware(
-                datetime.datetime.fromisoformat(
-                    str(r.get("alert_sweep:query_date"))
-                )
-            )
-        )
+        # Confirm the docket was modified on the day we're sending alerts.
+        # Otherwise, RECAPDocuments indexed during the day could trigger
+        # docket-only alerts, since RDs index several docket fields that
+        # docket-only queries can match.
         date_modified_localized = dt_as_local_date(docket.date_modified)
-        if date_modified_localized == local_midnight_localized.date():
+        if date_modified_localized == query_date:
             return True
     return False
 
@@ -464,6 +471,7 @@ def process_alert_hits(
     parent_results: Response | None,
     child_results: Response | None,
     alert_id: int,
+    query_date: datetime.date,
 ) -> list[Hit]:
     """Process alert hits by filtering and prepare the results to send
     based on alert conditions.
@@ -473,9 +481,9 @@
     :param parent_results: The ES Response for the docket-only query.
     :param child_results: The ES Response for the RECAPDocument-only query.
     :param alert_id: The ID of the alert being processed.
+    :param query_date: The daily re_index query date.
     :return: A list of Hit objects that are filtered and prepared to be sent.
     """
-
     docket_hits = parent_results.hits if parent_results else []
     docket_ids = [int(d.docket_id) for d in docket_hits]
 
@@ -490,32 +498,31 @@
                     r, alert_id, hit["child_docs"], rd_ids, check_rd_hl=True
                 )
                 if rds_to_send:
-                    # Cross-object query
+                    # Docket OR RECAPDocument alert.
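Once dt_as_local_date has localized the docket's date_modified, the gate
above is a plain date equality. A self-contained sketch (timezone handling
simplified):

    import datetime

    def modified_on_query_date(
        date_modified: datetime.datetime, query_date: datetime.date
    ) -> bool:
        # A docket-only hit fires only if the docket itself changed on the
        # day the sweep covers.
        return date_modified.date() == query_date

    assert modified_on_query_date(
        datetime.datetime(2024, 8, 19, 23, 50), datetime.date(2024, 8, 19)
    )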
hit["child_docs"] = rds_to_send results_to_send.append(hit) if should_docket_hit_be_included( - r, alert_id, hit.docket_id + r, alert_id, hit.docket_id, query_date ): add_document_hit_to_alert_set( r, alert_id, "d", hit.docket_id ) - # Docket-only alert - elif should_docket_hit_be_included(r, alert_id, hit.docket_id): + elif should_docket_hit_be_included( + r, alert_id, hit.docket_id, query_date + ): # Docket-only alert hit["child_docs"] = [] results_to_send.append(hit) add_document_hit_to_alert_set( r, alert_id, "d", hit.docket_id ) - else: - # RECAP-only alerts or cross-object alerts + # RECAPDocument-only alerts or cross-object alerts rds_to_send = filter_rd_alert_hits( r, alert_id, hit["child_docs"], rd_ids ) if rds_to_send: - # Cross-object alert hit["child_docs"] = rds_to_send results_to_send.append(hit) return results_to_send @@ -541,7 +548,18 @@ def send_search_alert_webhooks( ) -def query_and_send_alerts(r: Redis, rate: str) -> None: +def query_and_send_alerts( + r: Redis, rate: Literal["rt", "dly"], query_date: datetime.date +) -> None: + """Query the sweep index and send alerts based on the specified rate + and date. + + :param r: The Redis interface. + :param rate: The rate at which to query alerts. + :param query_date: The daily re_index query date. + :return: None. + """ + alert_users: UserProfile.user = User.objects.filter( alerts__rate=rate ).distinct() @@ -566,11 +584,7 @@ def query_and_send_alerts(r: Redis, rate: str) -> None: alerts_to_update.append(alert.pk) search_type = search_params.get("type", SEARCH_TYPES.RECAP) results_to_send = process_alert_hits( - r, - results, - parent_results, - child_results, - alert.pk, + r, results, parent_results, child_results, alert.pk, query_date ) if results_to_send: hits.append( @@ -600,7 +614,18 @@ def query_and_send_alerts(r: Redis, rate: str) -> None: logger.info(f"Sent {alerts_sent_count} {rate} email alerts.") -def query_and_schedule_alerts(r: Redis, rate: str): +def query_and_schedule_alerts( + r: Redis, rate: Literal["wly", "mly"], query_date: datetime.date +) -> None: + """Query the sweep index and schedule alerts based on the specified rate + and date. + + :param r: The Redis interface. + :param rate: The rate at which to query alerts. + :param query_date: The daily re_index query date. + :return: None. + """ + alert_users = User.objects.filter(alerts__rate=rate).distinct() for user in alert_users: alerts = user.alerts.filter(rate=rate, alert_type=SEARCH_TYPES.RECAP) @@ -615,11 +640,7 @@ def query_and_schedule_alerts(r: Redis, rate: str): continue results_to_send = process_alert_hits( - r, - results, - parent_results, - child_results, - alert.pk, + r, results, parent_results, child_results, alert.pk, query_date ) if results_to_send: for hit in results_to_send: @@ -678,6 +699,10 @@ def handle(self, *args, **options): RECAPSweepDocument, testing=testing_mode, ) + if not testing_mode: + # main_re_index_completed key so the main re_index task can be + # omitted in case of a failure. + r.set("alert_sweep:main_re_index_completed", 1, ex=3600 * 12) index_daily_recap_documents( r, DocketDocument._index._name, @@ -686,11 +711,21 @@ def handle(self, *args, **options): only_rd=True, ) if not testing_mode: - r.set("alert_sweep:re_index_completed", 1, ex=3600 * 12) + # rd_re_index_completed key so the RECAPDocument re_index task + # can be omitted in case of a failure. 
+ r.set("alert_sweep:rd_re_index_completed", 1, ex=3600 * 12) - query_and_send_alerts(r, Alert.REAL_TIME) - query_and_send_alerts(r, Alert.DAILY) - query_and_schedule_alerts(r, Alert.WEEKLY) - query_and_schedule_alerts(r, Alert.MONTHLY) - r.delete("alert_sweep:re_index_completed") + query_date = timezone.localtime( + timezone.make_aware( + datetime.datetime.fromisoformat( + str(r.get("alert_sweep:query_date")) + ) + ) + ).date() + query_and_send_alerts(r, Alert.REAL_TIME, query_date) + query_and_send_alerts(r, Alert.DAILY, query_date) + query_and_schedule_alerts(r, Alert.WEEKLY, query_date) + query_and_schedule_alerts(r, Alert.MONTHLY, query_date) + r.delete("alert_sweep:main_re_index_completed") + r.delete("alert_sweep:rd_re_index_completed") r.delete("alert_sweep:query_date") diff --git a/cl/alerts/tests/tests_recap_alerts.py b/cl/alerts/tests/tests_recap_alerts.py index 1fd573c8fb..a661347975 100644 --- a/cl/alerts/tests/tests_recap_alerts.py +++ b/cl/alerts/tests/tests_recap_alerts.py @@ -78,7 +78,9 @@ def setUp(self): self.r = get_redis_interface("CACHE") self.r.delete("alert_sweep:query_date") self.r.delete("alert_sweep:task_id") - self.r.delete("alert_hits:") + keys = self.r.keys("alert_hits:*") + if keys: + self.r.delete(*keys) @staticmethod def get_html_content_from_email(email_content): @@ -141,9 +143,7 @@ def _count_alert_hits_and_child_hits( self.assertTrue( alert_element, msg=f"Not alert with title {alert_title} found." ) - alert_cases = self._extract_cases_from_alert(tree, alert_title) - self.assertEqual( len(alert_cases), expected_hits, @@ -152,21 +152,23 @@ def _count_alert_hits_and_child_hits( % (alert_title, expected_hits, len(alert_cases)), ) if case_title: - child_hit_count = 0 for case in alert_cases: - case_text = " ".join(case.xpath(".//text()")).strip() + child_hit_count = 0 + case_text = " ".join( + [element.strip() for element in case.xpath(".//text()")] + ) if case_title in case_text: child_hit_count = len( case.xpath("following-sibling::ul[1]/li/a") ) - - self.assertEqual( - child_hit_count, - expected_child_hits, - msg="Did not get the right number of child hits for the case %s. " - "Expected: %s - Got: %s\n\n" - % (case_title, expected_child_hits, child_hit_count), - ) + self.assertEqual( + child_hit_count, + expected_child_hits, + msg="Did not get the right number of child hits for the case %s. " + "Expected: %s - Got: %s\n\n" + % (case_title, expected_child_hits, child_hit_count), + ) + break def _assert_child_hits_content( self, @@ -865,17 +867,17 @@ def test_filter_out_alerts_to_send_by_query_and_hits(self) -> None: docket.delete() - def test_special_cross_object_alerts(self) -> None: + def test_special_cross_object_alerts_or_clause(self) -> None: """This test confirms that hits are properly filtered out or included in alerts for special cross-object alerts that can match either a Docket-only hit and/or Docket + RDs simultaneously in the same hit. These cases include queries that use an OR clause combining - Docket field + RD fields or a text query that can match a Docket and - RD field simultaneously. + Docket field + RD fields. """ # The following test confirms that an alert with a query that can match - # a Docket or RECAPDocuments simultaneously is properly filtered. + # a Docket or RECAPDocuments from different cases simultaneously are + # properly filtered. 
cross_object_alert_d_or_rd_field = AlertFactory( user=self.user_profile.user, rate=Alert.REAL_TIME, @@ -897,34 +899,88 @@ def test_special_cross_object_alerts(self) -> None: ) html_content = self.get_html_content_from_email(mail.outbox[0]) self._confirm_number_of_alerts(html_content, 1) + # This hit should only display the Docket matched by its ID, # no RECAPDocument should be matched. - self._assert_child_hits_content( + self._count_alert_hits_and_child_hits( html_content, cross_object_alert_d_or_rd_field.name, + 2, self.de.docket.case_name, - [], + 0, ) - - # This hit should display the rd_2 nested below its parent docket. + # The second hit should display the rd_2 nested below its parent docket. self._assert_child_hits_content( html_content, cross_object_alert_d_or_rd_field.name, self.de_1.docket.case_name, [self.rd_2.description], ) - # Assert email text version: txt_email = mail.outbox[0].body self.assertIn(cross_object_alert_d_or_rd_field.name, txt_email) self.assertIn(self.rd_2.description, txt_email) + # This test confirms that we're able to trigger cross-object alerts + # that include an OR clause and match documents that belong to the + # same case. + cross_object_alert_d_or_rd_field_same_case = AlertFactory( + user=self.user_profile.user, + rate=Alert.REAL_TIME, + name="Test Alert Cross-object query", + query=f"q=docket_id:{self.de.docket.pk} OR pacer_doc_id:{self.rd.pacer_doc_id}&type=r", + ) + with mock.patch( + "cl.api.webhooks.requests.post", + side_effect=lambda *args, **kwargs: MockResponse( + 200, mock_raw=True + ), + ), time_machine.travel(self.mock_date, tick=False): + call_command("cl_send_recap_alerts", testing_mode=True) + + # A new alert should be triggered, containing the RD document nested + # below its parent docket. + self.assertEqual( + len(mail.outbox), 2, msg="Outgoing emails don't match." + ) + html_content = self.get_html_content_from_email(mail.outbox[1]) + self._confirm_number_of_alerts(html_content, 1) + self._count_alert_hits_and_child_hits( + html_content, + cross_object_alert_d_or_rd_field.name, + 1, + self.de.docket.case_name, + 1, + ) + self._assert_child_hits_content( + html_content, + cross_object_alert_d_or_rd_field_same_case.name, + self.de.docket.case_name, + [self.rd.description], + ) + + def test_special_cross_object_alerts_text_query(self) -> None: + """This test confirms that hits are properly filtered out or included + in alerts for special cross-object alerts that can match either a + Docket-only hit and/or Docket + RDs simultaneously in the same hit. + These cases include queries that use a text query that can match a + Docket and RD field simultaneously. + """ + # This test confirms a text query cross-object alert matches documents # according to trigger conditions like indexed date and previous triggers # by the same document. 
+ cross_object_alert_text = AlertFactory( + user=self.user_profile.user, + rate=Alert.REAL_TIME, + name="Test Alert Cross-object text query", + query=f'q="United states"&type=r', + ) two_days_before = self.mock_date - datetime.timedelta(days=2) mock_two_days_before = two_days_before.replace(hour=5) - with time_machine.travel(mock_two_days_before, tick=False): + with time_machine.travel( + mock_two_days_before, tick=False + ), self.captureOnCommitCallbacks(execute=True): docket = DocketFactory( court=self.court, case_name="United States of America", @@ -932,6 +988,22 @@ def test_special_cross_object_alerts(self) -> None: source=Docket.RECAP, ) + with mock.patch( + "cl.api.webhooks.requests.post", + side_effect=lambda *args, **kwargs: MockResponse( + 200, mock_raw=True + ), + ), time_machine.travel(self.mock_date, tick=False): + call_command("cl_send_recap_alerts", testing_mode=True) + + # No alert should be triggered since the matched docket was not + # modified during the current day. + self.assertEqual( + len(mail.outbox), 0, msg="Outgoing emails don't match." + ) + + # Index new documents that match cross_object_alert_text, an RD, and + # an empty docket. with time_machine.travel( self.mock_date, tick=False ), self.captureOnCommitCallbacks(execute=True): @@ -951,17 +1023,11 @@ def test_special_cross_object_alerts(self) -> None: docket_2 = DocketFactory( court=self.court, - case_name="United States of America vs Lorem", + case_name="United States vs Lorem", docket_number="1:21-bk-1008", source=Docket.RECAP, ) - cross_object_alert_text = AlertFactory( - user=self.user_profile.user, - rate=Alert.REAL_TIME, - name="Test Alert Cross-object query", - query=f'q="United states"&type=r', - ) with mock.patch( "cl.api.webhooks.requests.post", side_effect=lambda *args, **kwargs: MockResponse( @@ -970,14 +1036,21 @@ def test_special_cross_object_alerts(self) -> None: ), time_machine.travel(self.mock_date, tick=False): call_command("cl_send_recap_alerts", testing_mode=True) - # A new alert should be triggered containing two hits. One matched by + # An alert should be triggered containing two hits. One matched by # the rd_3 plain text description and one matched by docket_2 case_name self.assertEqual( - len(mail.outbox), 2, msg="Outgoing emails don't match." + len(mail.outbox), 1, msg="Outgoing emails don't match." ) - html_content = self.get_html_content_from_email(mail.outbox[1]) - # rd_3 should appear nested in this hit. + html_content = self.get_html_content_from_email(mail.outbox[0]) self._confirm_number_of_alerts(html_content, 1) + self._count_alert_hits_and_child_hits( + html_content, + cross_object_alert_text.name, + 2, + docket.case_name, + 1, + ) + # rd_3 should appear nested in this hit. self._assert_child_hits_content( html_content, cross_object_alert_text.name, @@ -991,8 +1064,7 @@ def test_special_cross_object_alerts(self) -> None: docket_2.case_name, [], ) - - # Modify the docket today: + # Modify 1:21-bk-1009 docket today: with time_machine.travel( self.mock_date, tick=False ), self.captureOnCommitCallbacks(execute=True): @@ -1008,11 +1080,19 @@ def test_special_cross_object_alerts(self) -> None: ), time_machine.travel(self.mock_date, tick=False): call_command("cl_send_recap_alerts", testing_mode=True) - # A new alert should be triggered containing docket as a hit with no - # nested RDs. - html_content = self.get_html_content_from_email(mail.outbox[2]) + # A new alert should be triggered containing the docket as a hit with + # no nested RDs. 
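These assertions are only deterministic because time_machine pins "now" for
both indexing and the sweep, so date-based trigger conditions can be tested
reliably. The mechanism in isolation:

    import datetime
    import time_machine

    # Everything inside the block sees the frozen clock, so documents
    # "indexed" here carry a predictable date relative to query_date.
    with time_machine.travel(datetime.datetime(2024, 8, 19, 12, 0), tick=False):
        assert datetime.date.today() == datetime.date(2024, 8, 19)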
+ html_content = self.get_html_content_from_email(mail.outbox[1]) self.assertEqual( - len(mail.outbox), 3, msg="Outgoing emails don't match." + len(mail.outbox), 2, msg="Outgoing emails don't match." + ) + self._confirm_number_of_alerts(html_content, 1) + self._count_alert_hits_and_child_hits( + html_content, + cross_object_alert_text.name, + 1, + docket.case_name, + 0, ) self._assert_child_hits_content( html_content, @@ -1029,20 +1109,41 @@ def test_special_cross_object_alerts(self) -> None: ), ), time_machine.travel(self.mock_date, tick=False): call_command("cl_send_recap_alerts", testing_mode=True) - # No new alerts should be triggered. self.assertEqual( - len(mail.outbox), 3, msg="Outgoing emails don't match." + len(mail.outbox), 2, msg="Outgoing emails don't match." ) + # Index new documents that match cross_object_alert_text, an RD, and + # an empty docket. + with time_machine.travel( + self.mock_date, tick=False + ), self.captureOnCommitCallbacks(execute=True): + rd_4 = RECAPDocumentFactory( + docket_entry=alert_de, + description="Hearing new", + document_number="3", + pacer_doc_id="0180366528790", + plain_text="Lorem ipsum", + ) + rd_5 = RECAPDocumentFactory( + docket_entry=alert_de, + description="Hearing new 2", + document_number="4", + pacer_doc_id="018026657750", + plain_text="United states of america plain text", + ) + # This test confirms that we're able to trigger cross-object alerts - # that include an OR clause and match documents that belong to the - # same case. - cross_object_alert_d_or_rd_field_same_case = AlertFactory( + # that include an OR clause and a cross-object text query. + cross_object_alert_d_or_rd_field_text_query = AlertFactory( user=self.user_profile.user, rate=Alert.REAL_TIME, - name="Test Alert Cross-object query", - query=f"q=docket_id:{self.de.docket.pk} OR pacer_doc_id:{self.rd.pacer_doc_id}&type=r", + name="Test Alert Cross-object query combined.", + query=f"q=docket_id:{self.de.docket.pk} OR " + f"pacer_doc_id:{self.rd.pacer_doc_id} OR " + f'("United States of America" OR ' + f"pacer_doc_id:{rd_3.pacer_doc_id})&type=r", ) with mock.patch( "cl.api.webhooks.requests.post", @@ -1054,16 +1155,80 @@ def test_special_cross_object_alerts(self) -> None: # A new alert should be triggered, containing the RD document nested below # its parent docket. + html_content = self.get_html_content_from_email(mail.outbox[2]) + self.assertEqual( + len(mail.outbox), 3, msg="Outgoing emails don't match." + ) + # The email contains two alerts: one for cross_object_alert_text + # triggered by the new rd_5 added, and one for cross_object_alert_d_or_rd_field_text_query. + self._confirm_number_of_alerts(html_content, 2) + self._count_alert_hits_and_child_hits( + html_content, + cross_object_alert_text.name, + 1, + docket.case_name, + 1, + ) + # The cross_object_alert_d_or_rd_field_text_query alert contains two + # hits. The first one matches "docket" and rd_3 and rd_5 nested below + # due to the OR clause in the text query, and the second hit matches + # self.de.docket and self.rd. 
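A hedged sketch of how a q parameter like this one, mixing quoted phrases
with OR groups, can be expressed as a single Elasticsearch query_string
query (the field list here is abbreviated and assumed):

    from elasticsearch_dsl import Q

    q = Q(
        "query_string",
        query='docket_id:123 OR ("United States of America" OR pacer_doc_id:0180366528790)',
        fields=["caseName", "plain_text"],
        default_operator="AND",
    )
    # The boolean structure lets one hit match on docket fields alone while
    # another matches only through RECAPDocument text.
    print(q.to_dict())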
+ self._count_alert_hits_and_child_hits( + html_content, + cross_object_alert_d_or_rd_field_text_query.name, + 2, + docket.case_name, + 2, + ) + self._assert_child_hits_content( + html_content, + cross_object_alert_d_or_rd_field_text_query.name, + docket.case_name, + [rd_3.description, rd_5.description], + ) + self._assert_child_hits_content( + html_content, + cross_object_alert_d_or_rd_field_text_query.name, + self.de.docket.case_name, + [self.rd.description], + ) + + # This test confirms that hits are properly filtered when using AND in + # the text query. + cross_object_alert_d_or_rd_field_text_query_and = AlertFactory( + user=self.user_profile.user, + rate=Alert.REAL_TIME, + name="Test Alert Cross-object query combined.", + query=f'q=("United States of America" AND ' + f"pacer_doc_id:{rd_3.pacer_doc_id})&type=r", + ) + with mock.patch( + "cl.api.webhooks.requests.post", + side_effect=lambda *args, **kwargs: MockResponse( + 200, mock_raw=True + ), + ), time_machine.travel(self.mock_date, tick=False): + call_command("cl_send_recap_alerts", testing_mode=True) + + # A new alert should be triggered, containing rd_3 document nested below + # its parent docket. html_content = self.get_html_content_from_email(mail.outbox[3]) self.assertEqual( len(mail.outbox), 4, msg="Outgoing emails don't match." ) self._confirm_number_of_alerts(html_content, 1) + self._count_alert_hits_and_child_hits( + html_content, + cross_object_alert_d_or_rd_field_text_query_and.name, + 1, + docket.case_name, + 1, + ) self._assert_child_hits_content( html_content, - cross_object_alert_d_or_rd_field_same_case.name, - self.de.docket.case_name, - [self.rd.description], + cross_object_alert_d_or_rd_field_text_query_and.name, + docket.case_name, + [rd_3.description], ) docket.delete() From b56f2354e43c74267811fd1f1567b2e6ae2cdb7d Mon Sep 17 00:00:00 2001 From: Alberto Islas Date: Thu, 25 Jul 2024 14:51:55 -0500 Subject: [PATCH 20/33] fix(alerts): Restore send_es_search_alert_webhook to avoid conflicts due to scheduled task - This can be removed after tasks in the queue have been processed. 
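A sketch of the queue-compatibility pattern this patch applies: Celery
resolves queued messages by task name, so the old name must stay registered
until in-flight messages drain. The delegation below is one illustrative
shape; the patch itself simply restores the old task body verbatim.

    from celery import Celery

    app = Celery("cl")

    @app.task(name="cl.api.tasks.send_es_search_alert_webhook")
    def send_es_search_alert_webhook(*args, **kwargs):
        # Messages enqueued before the rename still resolve to this name.
        return send_search_alert_webhook_es(*args, **kwargs)

    @app.task()
    def send_search_alert_webhook_es(results, webhook_pk, alert_pk):
        ...  # new implementation; new callers enqueue this task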
--- .../commands/cl_send_recap_alerts.py | 4 +- cl/alerts/tasks.py | 4 +- cl/api/tasks.py | 42 +++++++++++++++++++ 3 files changed, 46 insertions(+), 4 deletions(-) diff --git a/cl/alerts/management/commands/cl_send_recap_alerts.py b/cl/alerts/management/commands/cl_send_recap_alerts.py index 7f05ce4291..193823cb22 100644 --- a/cl/alerts/management/commands/cl_send_recap_alerts.py +++ b/cl/alerts/management/commands/cl_send_recap_alerts.py @@ -25,7 +25,7 @@ recap_document_hl_matched, ) from cl.api.models import WebhookEventType -from cl.api.tasks import send_es_search_alert_webhook +from cl.api.tasks import send_search_alert_webhook_es from cl.lib.command_utils import VerboseCommand, logger from cl.lib.date_time import dt_as_local_date from cl.lib.elasticsearch_utils import do_es_sweep_alert_query @@ -543,7 +543,7 @@ def send_search_alert_webhooks( event_type=WebhookEventType.SEARCH_ALERT, enabled=True ) for user_webhook in user_webhooks: - send_es_search_alert_webhook.delay( + send_search_alert_webhook_es.delay( results_to_send, user_webhook.pk, alert_id ) diff --git a/cl/alerts/tasks.py b/cl/alerts/tasks.py index f7b004bc54..e3aad0a071 100644 --- a/cl/alerts/tasks.py +++ b/cl/alerts/tasks.py @@ -25,7 +25,7 @@ from cl.api.models import WebhookEventType from cl.api.tasks import ( send_docket_alert_webhook_events, - send_es_search_alert_webhook, + send_search_alert_webhook_es, ) from cl.celery_init import app from cl.custom_filters.templatetags.text_filters import best_case_name @@ -458,7 +458,7 @@ def send_webhook_alert_hits( event_type=WebhookEventType.SEARCH_ALERT, enabled=True ) for user_webhook in user_webhooks: - send_es_search_alert_webhook.delay( + send_search_alert_webhook_es.delay( documents, user_webhook.pk, alert.pk, diff --git a/cl/api/tasks.py b/cl/api/tasks.py index a0d6112444..7f0b8d2cdd 100644 --- a/cl/api/tasks.py +++ b/cl/api/tasks.py @@ -82,8 +82,50 @@ def send_docket_alert_webhook_events( send_webhook_event(webhook_event, json_bytes) +# TODO: Remove after scheduled OA alerts have been processed. @app.task() def send_es_search_alert_webhook( + results: list[dict[str, Any]], + webhook_pk: int, + alert: Alert, +) -> None: + """Send a search alert webhook event containing search results from a + search alert object. + + :param results: The search results returned by SOLR for this alert. + :param webhook_pk: The webhook endpoint ID object to send the event to. + :param alert: The search alert object. 
+ """ + + webhook = Webhook.objects.get(pk=webhook_pk) + serialized_alert = SearchAlertSerializerModel(alert).data + es_results = [] + for result in results: + result["snippet"] = result["text"] + es_results.append(ResultObject(initial=result)) + serialized_results = V3OAESResultSerializer(es_results, many=True).data + + post_content = { + "webhook": generate_webhook_key_content(webhook), + "payload": { + "results": serialized_results, + "alert": serialized_alert, + }, + } + renderer = JSONRenderer() + json_bytes = renderer.render( + post_content, + accepted_media_type="application/json;", + ) + webhook_event = WebhookEvent.objects.create( + webhook=webhook, + content=post_content, + ) + send_webhook_event(webhook_event, json_bytes) + + +@app.task() +def send_search_alert_webhook_es( results: list[dict[str, Any]] | list[Hit], webhook_pk: int, alert_pk: int, From d1026640871d3bcca5774cca102dc466c905c406 Mon Sep 17 00:00:00 2001 From: Alberto Islas Date: Mon, 29 Jul 2024 10:13:13 -0500 Subject: [PATCH 21/33] fix(alerts): Fixed MLY alerts test can't be sent after the 28th --- cl/alerts/tests/tests_recap_alerts.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/cl/alerts/tests/tests_recap_alerts.py b/cl/alerts/tests/tests_recap_alerts.py index a661347975..3a067858d2 100644 --- a/cl/alerts/tests/tests_recap_alerts.py +++ b/cl/alerts/tests/tests_recap_alerts.py @@ -1608,7 +1608,9 @@ def test_schedule_wly_and_mly_recap_alerts(self) -> None: self.assertIn(self.rd.description, txt_email) # Send scheduled Monthly alerts and check assertions. - call_command("cl_send_scheduled_alerts", rate=Alert.MONTHLY) + current_date = now().replace(day=28, hour=0) + with time_machine.travel(current_date, tick=False): + call_command("cl_send_scheduled_alerts", rate=Alert.MONTHLY) self.assertEqual( len(mail.outbox), 2, msg="Outgoing emails don't match." ) From 57b6df7ac2e46514ca729f4d528ec7a46149fe82 Mon Sep 17 00:00:00 2001 From: Alberto Islas Date: Thu, 26 Sep 2024 16:55:26 -0600 Subject: [PATCH 22/33] fix(alerts): Fixed merge conflicts and adjust test accordingly new RECAP_CHILD_HITS_PER_RESULT value --- cl/alerts/tests/tests_recap_alerts.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/cl/alerts/tests/tests_recap_alerts.py b/cl/alerts/tests/tests_recap_alerts.py index 3a067858d2..b339a10ef7 100644 --- a/cl/alerts/tests/tests_recap_alerts.py +++ b/cl/alerts/tests/tests_recap_alerts.py @@ -1247,15 +1247,15 @@ def test_limit_alert_case_child_hits(self) -> None: description="MOTION for Leave to File Amicus Curiae Lorem Served", ) rd_descriptions = [] - for i in range(6): + for i in range(4): rd = RECAPDocumentFactory( docket_entry=alert_de, description=f"Motion to File {i+1}", document_number=f"{i+1}", pacer_doc_id=f"018036652436{i+1}", ) - if i < 5: - # Omit the last alert to compare. Only up to 5 should be + if i < 3: + # Omit the last alert to compare. Only up to 3 should be # included in the case. rd_descriptions.append(rd.description) @@ -1286,13 +1286,13 @@ def test_limit_alert_case_child_hits(self) -> None: html_content = self.get_html_content_from_email(mail.outbox[0]) self.assertIn(recap_only_alert.name, html_content) self._confirm_number_of_alerts(html_content, 1) - # The case alert should contain up to 5 child hits. + # The case alert should contain up to 3 child hits. 
self._count_alert_hits_and_child_hits( html_content, recap_only_alert.name, 1, self.de.docket.case_name, - 5, + 3, ) self._assert_child_hits_content( html_content, From b35ef0aad49b230ad98735c4631986f7facd0069 Mon Sep 17 00:00:00 2001 From: Alberto Islas Date: Thu, 26 Sep 2024 18:36:31 -0600 Subject: [PATCH 23/33] fix(elasticsearch): Fixed failing test due to build_full_join_es_queries returning values change --- cl/lib/elasticsearch_utils.py | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/cl/lib/elasticsearch_utils.py b/cl/lib/elasticsearch_utils.py index 34aceeeac8..df17f5e1dc 100644 --- a/cl/lib/elasticsearch_utils.py +++ b/cl/lib/elasticsearch_utils.py @@ -1197,15 +1197,21 @@ def build_es_base_query( mlt_query = async_to_sync(build_more_like_this_query)( cluster_pks ) - main_query, join_query = build_full_join_es_queries( - cd, - {"opinion": []}, - [], - mlt_query, - child_highlighting=False, - api_version=api_version, + main_query, child_docs_query, parent_query = ( + build_full_join_es_queries( + cd, + {"opinion": []}, + [], + mlt_query, + child_highlighting=False, + api_version=api_version, + ) + ) + return ( + search_query.query(main_query), + child_docs_query, + parent_query, ) - return search_query.query(main_query), join_query opinion_search_fields = SEARCH_OPINION_QUERY_FIELDS child_fields = opinion_search_fields.copy() From 8902aa0c226c2aac1cc2ce82fe0c76205569cdde Mon Sep 17 00:00:00 2001 From: Alberto Islas Date: Thu, 26 Sep 2024 18:46:29 -0600 Subject: [PATCH 24/33] fix(alerts): Removed recap_document_hl_matched as we no longer rely on HL to filter out RD hits --- .../commands/cl_send_recap_alerts.py | 22 +++--- cl/alerts/tests/tests_recap_alerts.py | 68 ------------------- cl/alerts/utils.py | 22 ------ 3 files changed, 12 insertions(+), 100 deletions(-) diff --git a/cl/alerts/management/commands/cl_send_recap_alerts.py b/cl/alerts/management/commands/cl_send_recap_alerts.py index 193823cb22..8640ddc70a 100644 --- a/cl/alerts/management/commands/cl_send_recap_alerts.py +++ b/cl/alerts/management/commands/cl_send_recap_alerts.py @@ -22,7 +22,6 @@ add_document_hit_to_alert_set, alert_hits_limit_reached, has_document_alert_hit_been_triggered, - recap_document_hl_matched, ) from cl.api.models import WebhookEventType from cl.api.tasks import send_search_alert_webhook_es @@ -407,7 +406,7 @@ def filter_rd_alert_hits( alert_id: int, rd_hits: AttrList, rd_ids: list[int], - check_rd_hl=False, + check_rd_matched=False, ): """Filter RECAP document hits based on specified conditions. @@ -416,8 +415,8 @@ def filter_rd_alert_hits( :param rd_hits: A list of RECAPDocument hits to be processed. :param rd_ids: A list of RECAPDocument IDs that matched the RECAPDocument only query. - :param check_rd_hl: A boolean indicating whether to check if the RECAP - document hit matched RD HLs. + :param check_rd_matched: A boolean indicating whether to check if the RECAP + document hit from the main query also matches the RECAPDocument-only query :return: A list of RECAP document hits that meet all specified conditions. """ @@ -428,11 +427,10 @@ def filter_rd_alert_hits( r, alert_id, "r", rd_hit["_source"]["id"] ) ] - if check_rd_hl: - if not recap_document_hl_matched(rd_hit): - # If the RECAPDocument hit didn't match any HL. Check if it should be included - # due to it matched the RECAPDocument only query. 
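The membership check that replaces the old highlight heuristic is plain set
filtering. A toy version, with hit shapes mimicking the ES response:

    rd_ids = {101, 102}  # IDs returned by the RECAPDocument-only query
    child_hits = [{"_source": {"id": 101}}, {"_source": {"id": 205}}]
    # Keep a main-query child hit only if the RD-only query also matched it.
    rds_to_send = [h for h in child_hits if h["_source"]["id"] in rd_ids]
    assert [h["_source"]["id"] for h in rds_to_send] == [101]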
- conditions.append(rd_hit["_source"]["id"] in rd_ids) + if check_rd_matched: + # Add condition to check if the RD hit is within the RD IDS returned + # by the RECAPDocument-only query. + conditions.append(rd_hit["_source"]["id"] in rd_ids) if all(conditions): rds_to_send.append(rd_hit) add_document_hit_to_alert_set( @@ -495,7 +493,11 @@ def process_alert_hits( if hit.docket_id in docket_ids: # Possible Docket-only alert rds_to_send = filter_rd_alert_hits( - r, alert_id, hit["child_docs"], rd_ids, check_rd_hl=True + r, + alert_id, + hit["child_docs"], + rd_ids, + check_rd_matched=True, ) if rds_to_send: # Docket OR RECAPDocument alert. diff --git a/cl/alerts/tests/tests_recap_alerts.py b/cl/alerts/tests/tests_recap_alerts.py index b339a10ef7..8e57da973a 100644 --- a/cl/alerts/tests/tests_recap_alerts.py +++ b/cl/alerts/tests/tests_recap_alerts.py @@ -17,7 +17,6 @@ index_daily_recap_documents, ) from cl.alerts.models import SEARCH_TYPES, Alert, ScheduledAlertHit -from cl.alerts.utils import recap_document_hl_matched from cl.api.factories import WebhookFactory from cl.api.models import WebhookEvent, WebhookEventType from cl.donate.models import NeonMembership @@ -241,73 +240,6 @@ def _count_webhook_hits_and_child_hits( % case_title, ) - async def test_recap_document_hl_matched(self) -> None: - """Test recap_document_hl_matched method that determines weather a hit - contains RECAPDocument HL fields.""" - - # Index base document factories. - with time_machine.travel(self.mock_date, tick=False): - index_daily_recap_documents( - self.r, - DocketDocument._index._name, - RECAPSweepDocument, - testing=True, - ) - - # Docket-only query - search_params = { - "type": SEARCH_TYPES.RECAP, - "q": '"401 Civil"', - } - search_query = RECAPSweepDocument.search() - results, parent_results, _ = await sync_to_async( - do_es_sweep_alert_query - )( - search_query, - search_query, - search_params, - ) - docket_result = results[0] - for rd in docket_result["child_docs"]: - rd_field_matched = recap_document_hl_matched(rd) - self.assertEqual(rd_field_matched, False) - - # RECAPDocument-only query - search_params = { - "type": SEARCH_TYPES.RECAP, - "q": '"Mauris iaculis, leo sit amet hendrerit vehicula"', - } - search_query = RECAPSweepDocument.search() - results, parent_results, _ = await sync_to_async( - do_es_sweep_alert_query - )( - search_query, - search_query, - search_params, - ) - docket_result = results[0] - for rd in docket_result["child_docs"]: - rd_field_matched = recap_document_hl_matched(rd) - self.assertEqual(rd_field_matched, True) - - # Cross-object query - search_params = { - "type": SEARCH_TYPES.RECAP, - "q": "SUBPOENAS SERVED OFF Mauris iaculis", - } - search_query = RECAPSweepDocument.search() - results, parent_results, _ = await sync_to_async( - do_es_sweep_alert_query - )( - search_query, - search_query, - search_params, - ) - docket_result = results[0] - for rd in docket_result["child_docs"]: - rd_field_matched = recap_document_hl_matched(rd) - self.assertEqual(rd_field_matched, True) - def test_filter_recap_alerts_to_send(self) -> None: """Test filter RECAP alerts that met the conditions to be sent: - RECAP type alert. diff --git a/cl/alerts/utils.py b/cl/alerts/utils.py index 6ad589a913..d98984abff 100644 --- a/cl/alerts/utils.py +++ b/cl/alerts/utils.py @@ -147,28 +147,6 @@ def alert_hits_limit_reached(alert_pk: int, user_pk: int) -> bool: return False -def recap_document_hl_matched(rd_hit: Hit) -> bool: - """Determine whether HL matched a RECAPDocument text field. 
- - :param rd_hit: The ES hit. - :return: True if the hit matched a RECAPDocument field. Otherwise, False. - """ - - matched_rd_hl: set[str] = set() - rd_hl_fields = set(SEARCH_RECAP_CHILD_HL_FIELDS.keys()) - if hasattr(rd_hit, "highlight"): - highlights = rd_hit.highlight.to_dict() - matched_rd_hl.update( - hl_key - for hl_key, hl_value in highlights.items() - for hl in hl_value - if f"<{ALERTS_HL_TAG}>" in hl - ) - if matched_rd_hl and matched_rd_hl.issubset(rd_hl_fields): - return True - return False - - def make_alert_set_key(alert_id: int, document_type: str) -> str: """Generate a Redis key for storing alert hits. From 4babf5d3b2d0e1ec45b9604e8f532f1345ebc526 Mon Sep 17 00:00:00 2001 From: Eduardo Rosendo Date: Fri, 18 Oct 2024 13:26:43 -0400 Subject: [PATCH 25/33] feat(custom filter): Refactor alerts_supported method for better readability --- cl/custom_filters/templatetags/extras.py | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/cl/custom_filters/templatetags/extras.py b/cl/custom_filters/templatetags/extras.py index 90395d9356..39d535b2df 100644 --- a/cl/custom_filters/templatetags/extras.py +++ b/cl/custom_filters/templatetags/extras.py @@ -291,14 +291,9 @@ def alerts_supported(context: RequestContext, search_type: str) -> str: """ request = context["request"] - return ( - search_type == SEARCH_TYPES.OPINION - or search_type == SEARCH_TYPES.ORAL_ARGUMENT - or ( - search_type == SEARCH_TYPES.RECAP - and waffle.flag_is_active(request, "recap-alerts-active") - ) - ) + if search_type == SEARCH_TYPES.RECAP: + return waffle.flag_is_active(request, "recap-alerts-active") + return search_type in (SEARCH_TYPES.OPINION, SEARCH_TYPES.ORAL_ARGUMENT) @register.filter From a0085cce32242d78e8aed8db537bbb52c3517404 Mon Sep 17 00:00:00 2001 From: Eduardo Rosendo Date: Fri, 18 Oct 2024 13:33:35 -0400 Subject: [PATCH 26/33] refactor(alerts): Cleaned up unused imports in utils.py --- cl/alerts/utils.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/cl/alerts/utils.py b/cl/alerts/utils.py index d98984abff..66047e5fe9 100644 --- a/cl/alerts/utils.py +++ b/cl/alerts/utils.py @@ -15,13 +15,6 @@ ) from cl.lib.command_utils import logger from cl.lib.elasticsearch_utils import add_es_highlighting -from cl.lib.types import CleanData -from cl.search.constants import ( - ALERTS_HL_TAG, - SEARCH_RECAP_CHILD_HL_FIELDS, - recap_document_filters, - recap_document_indexed_fields, -) from cl.search.documents import AudioPercolator from cl.search.models import SEARCH_TYPES, Docket From 5f12c3037c3a3110fde28763fff6965b4090264f Mon Sep 17 00:00:00 2001 From: Eduardo Rosendo Date: Fri, 18 Oct 2024 13:36:08 -0400 Subject: [PATCH 27/33] refactor(search): Cleanup unused constants --- cl/search/constants.py | 23 ----------------------- 1 file changed, 23 deletions(-) diff --git a/cl/search/constants.py b/cl/search/constants.py index 0efe9848bb..333dfbca6c 100644 --- a/cl/search/constants.py +++ b/cl/search/constants.py @@ -306,29 +306,6 @@ Opinion.TRIAL_COURT: "trial-court-document", } -recap_document_indexed_fields = [ - "id", - "docket_entry_id", - "description", - "entry_number", - "entry_date_filed", - "short_description", - "document_type", - "document_number", - "pacer_doc_id", - "plain_text", - "attachment_number", - "is_available", - "page_count", - "cites", -] - -recap_document_filters = [ - "available_only", - "description", - "document_number", - "attachment_number", -] cardinality_query_unique_ids = { SEARCH_TYPES.RECAP: "docket_id", From 
5fb177f83e8db2743a4d4eb6ab62acb82319597c Mon Sep 17 00:00:00 2001 From: Eduardo Rosendo Date: Fri, 18 Oct 2024 13:41:50 -0400 Subject: [PATCH 28/33] refactor(alerts): Replaces Type import with built-in alternative --- cl/alerts/management/commands/cl_send_recap_alerts.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cl/alerts/management/commands/cl_send_recap_alerts.py b/cl/alerts/management/commands/cl_send_recap_alerts.py index 8640ddc70a..9589a7e8a6 100644 --- a/cl/alerts/management/commands/cl_send_recap_alerts.py +++ b/cl/alerts/management/commands/cl_send_recap_alerts.py @@ -2,7 +2,7 @@ import datetime import time import traceback -from typing import Any, Literal, Type +from typing import Any, Literal import pytz from asgiref.sync import async_to_sync @@ -120,7 +120,7 @@ def retrieve_task_info(task_info: dict[str, Any]) -> dict[str, Any]: def index_daily_recap_documents( r: Redis, source_index_name: str, - target_index: Type[RECAPSweepDocument] | Type[ESRECAPSweepDocument], + target_index: type[RECAPSweepDocument] | type[ESRECAPSweepDocument], testing: bool = False, only_rd: bool = False, ) -> int: From 78955f1817110635af55690533fa1ffff2458968 Mon Sep 17 00:00:00 2001 From: Eduardo Rosendo Date: Fri, 18 Oct 2024 13:49:04 -0400 Subject: [PATCH 29/33] refactor(search): Removes unused argument from index command --- .../management/commands/cl_index_parent_and_child_docs.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/cl/search/management/commands/cl_index_parent_and_child_docs.py b/cl/search/management/commands/cl_index_parent_and_child_docs.py index 57cdf390fc..366d9fe13e 100644 --- a/cl/search/management/commands/cl_index_parent_and_child_docs.py +++ b/cl/search/management/commands/cl_index_parent_and_child_docs.py @@ -342,11 +342,6 @@ def add_arguments(self, parser): action="store_true", help="Use this flag to only index documents missing in the index.", ) - parser.add_argument( - "--sweep-index", - action="store_true", - help="Whether to perform an indexing for the sweep index.", - ) def handle(self, *args, **options): super().handle(*args, **options) From da72292caa66abaff484a6eecce13c71729ce2d4 Mon Sep 17 00:00:00 2001 From: Eduardo Rosendo Date: Fri, 18 Oct 2024 14:03:55 -0400 Subject: [PATCH 30/33] feat(alert): Implements early returns in recap alert command --- .../commands/cl_send_recap_alerts.py | 76 ++++++++++--------- 1 file changed, 39 insertions(+), 37 deletions(-) diff --git a/cl/alerts/management/commands/cl_send_recap_alerts.py b/cl/alerts/management/commands/cl_send_recap_alerts.py index 9589a7e8a6..6b6ce13f29 100644 --- a/cl/alerts/management/commands/cl_send_recap_alerts.py +++ b/cl/alerts/management/commands/cl_send_recap_alerts.py @@ -588,21 +588,22 @@ def query_and_send_alerts( results_to_send = process_alert_hits( r, results, parent_results, child_results, alert.pk, query_date ) - if results_to_send: - hits.append( - [ - alert, - search_type, - results_to_send, - len(results_to_send), - ] - ) - alert.query_run = search_params.urlencode() # type: ignore - alert.date_last_hit = timezone.now() - alert.save() + if not results_to_send: + continue + hits.append( + [ + alert, + search_type, + results_to_send, + len(results_to_send), + ] + ) + alert.query_run = search_params.urlencode() # type: ignore + alert.date_last_hit = timezone.now() + alert.save() - # Send webhooks - send_search_alert_webhooks(user, results_to_send, alert.pk) + # Send webhooks + send_search_alert_webhooks(user, results_to_send, alert.pk) if hits: 
send_search_alert_emails.delay([(user.pk, hits)]) @@ -644,31 +645,32 @@ def query_and_schedule_alerts( results_to_send = process_alert_hits( r, results, parent_results, child_results, alert.pk, query_date ) - if results_to_send: - for hit in results_to_send: - # Schedule DAILY, WEEKLY and MONTHLY Alerts - if alert_hits_limit_reached(alert.pk, user.pk): - # Skip storing hits for this alert-user combination because - # the SCHEDULED_ALERT_HITS_LIMIT has been reached. - continue - - child_result_objects = [] - hit_copy = copy.deepcopy(hit) - if hasattr(hit_copy, "child_docs"): - for child_doc in hit_copy.child_docs: - child_result_objects.append( - child_doc["_source"].to_dict() - ) - hit_copy["child_docs"] = child_result_objects - scheduled_hits_to_create.append( - ScheduledAlertHit( - user=user, - alert=alert, - document_content=hit_copy.to_dict(), + if not results_to_send: + continue + for hit in results_to_send: + # Schedule DAILY, WEEKLY and MONTHLY Alerts + if alert_hits_limit_reached(alert.pk, user.pk): + # Skip storing hits for this alert-user combination because + # the SCHEDULED_ALERT_HITS_LIMIT has been reached. + continue + + child_result_objects = [] + hit_copy = copy.deepcopy(hit) + if hasattr(hit_copy, "child_docs"): + for child_doc in hit_copy.child_docs: + child_result_objects.append( + child_doc["_source"].to_dict() ) + hit_copy["child_docs"] = child_result_objects + scheduled_hits_to_create.append( + ScheduledAlertHit( + user=user, + alert=alert, + document_content=hit_copy.to_dict(), ) - # Send webhooks - send_search_alert_webhooks(user, results_to_send, alert.pk) + ) + # Send webhooks + send_search_alert_webhooks(user, results_to_send, alert.pk) # Create scheduled WEEKLY and MONTHLY Alerts in bulk. if scheduled_hits_to_create: From 0b62dca8f3d2eafaea996bd8ff66e0bee1baf198 Mon Sep 17 00:00:00 2001 From: Eduardo Rosendo Date: Fri, 18 Oct 2024 15:22:32 -0400 Subject: [PATCH 31/33] feat(alerts): Adds TaskCompletionStatus dataclass for tracking task progress This commit introduces a new dataclass to store and manage information related to running Elasticsearch tasks. The dataclass includes properties for task completion status, created and total document counts. --- .../commands/cl_send_recap_alerts.py | 78 +++++++++---------- cl/alerts/utils.py | 8 ++ 2 files changed, 45 insertions(+), 41 deletions(-) diff --git a/cl/alerts/management/commands/cl_send_recap_alerts.py b/cl/alerts/management/commands/cl_send_recap_alerts.py index 6b6ce13f29..65d88bf7b6 100644 --- a/cl/alerts/management/commands/cl_send_recap_alerts.py +++ b/cl/alerts/management/commands/cl_send_recap_alerts.py @@ -19,6 +19,7 @@ from cl.alerts.models import Alert, ScheduledAlertHit from cl.alerts.tasks import send_search_alert_emails from cl.alerts.utils import ( + TaskCompletionStatus, add_document_hit_to_alert_set, alert_hits_limit_reached, has_document_alert_hit_been_triggered, @@ -64,27 +65,34 @@ def get_task_status(task_id: str, es: Elasticsearch) -> dict[str, Any]: def compute_estimated_remaining_time( - initial_wait: float, start_time_millis: int, created: int, total: int + initial_wait: float, task_status: TaskCompletionStatus ) -> float: """Compute the estimated remaining time for the re_index task to complete. :param initial_wait: The default wait time in seconds. - :param start_time_millis: The start time in milliseconds epoch. - :param created: The number of items created so far. - :param total: The total number of items to be created. 
+ :param task_status: An instance of `TaskCompletionStatus` containing task + information. :return: The estimated remaining time in seconds. If the start time, created, or total are invalid, the initial default time is returned. """ - if start_time_millis is None or not created or not total: + if ( + task_status.start_time_millis is None + or not task_status.created + or not task_status.total + ): return initial_wait - start_time = datetime.datetime.fromtimestamp(start_time_millis / 1000.0) + start_time = datetime.datetime.fromtimestamp( + task_status.start_time_millis / 1000.0 + ) time_now = datetime.datetime.now() estimated_time_remaining = max( datetime.timedelta( - seconds=((time_now - start_time).total_seconds() / created) - * (total - created) + seconds=( + (time_now - start_time).total_seconds() / task_status.created + ) + * (task_status.total - task_status.created) ).total_seconds(), initial_wait, ) @@ -92,29 +100,23 @@ def compute_estimated_remaining_time( return estimated_time_remaining -def retrieve_task_info(task_info: dict[str, Any]) -> dict[str, Any]: +def retrieve_task_info(task_info: dict[str, Any]) -> TaskCompletionStatus: """Retrieve task information from the given task dict. :param task_info: A dictionary containing the task status information. - :return: A dictionary with the task completion status, created documents - count, total documents count, and the task start time in milliseconds. - Retrieve default values in case task_info is not valid. + :return: A `TaskCompletionStatus` object representing the extracted task + information. """ if task_info: status = task_info["task"]["status"] - return { - "completed": task_info["completed"], - "created": status["created"], - "total": status["total"], - "start_time_millis": task_info["task"]["start_time_in_millis"], - } - return { - "completed": False, - "created": 0, - "total": 0, - "start_time_millis": None, - } + return TaskCompletionStatus( + completed=task_info["completed"], + created=status["created"], + total=status["total"], + start_time_millis=task_info["task"]["start_time_in_millis"], + ) + return TaskCompletionStatus() def index_daily_recap_documents( @@ -338,41 +340,35 @@ def fields = [ initial_wait = 0.01 if testing else 60.0 time.sleep(initial_wait) - get_task_info = retrieve_task_info(get_task_status(task_id, es)) + task_info = retrieve_task_info(get_task_status(task_id, es)) iterations_count = 0 estimated_time_remaining = compute_estimated_remaining_time( - initial_wait, - get_task_info["start_time_millis"], - get_task_info["created"], - get_task_info["total"], + initial_wait, task_info ) - while not get_task_info["completed"]: + while not task_info.completed: logger.info( - f"Task progress: {get_task_info['created']}/{get_task_info['total']} documents. " + f"Task progress: {task_info.created}/{task_info.total} documents. " f"Estimated time to finish: {estimated_time_remaining} seconds." 
         )
-        task_info = get_task_status(task_id, es)
-        get_task_info = retrieve_task_info(task_info)
+        task_status = get_task_status(task_id, es)
+        task_info = retrieve_task_info(task_status)
         time.sleep(estimated_time_remaining)
-        if task_info and not get_task_info["completed"]:
+        if task_info and not task_info.completed:
             estimated_time_remaining = compute_estimated_remaining_time(
-                initial_wait,
-                get_task_info["start_time_millis"],
-                get_task_info["created"],
-                get_task_info["total"],
+                initial_wait, task_info
             )
         if not task_info:
             iterations_count += 1
             if iterations_count > 10:
                 logger.error(
                     "Re_index alert sweep index task has failed: %s/%s",
-                    get_task_info["created"],
-                    get_task_info["total"],
+                    task_info.created,
+                    task_info.total,
                 )
                 break
 
     r.delete("alert_sweep:task_id")
-    return get_task_info["total"]
+    return task_info.total
 
 
 def should_docket_hit_be_included(
diff --git a/cl/alerts/utils.py b/cl/alerts/utils.py
index 66047e5fe9..4b97fc7ba8 100644
--- a/cl/alerts/utils.py
+++ b/cl/alerts/utils.py
@@ -25,6 +25,14 @@ class DocketAlertReportObject:
     docket: Docket
 
 
+@dataclass
+class TaskCompletionStatus:
+    completed: bool = False
+    created: int = 0
+    total: int = 0
+    start_time_millis: int | None = None
+
+
 class OldAlertReport:
     def __init__(self):
         self.old_alerts = []

From 3b153d2e4a02bbc22f13c017ea56877ad8b60fbe Mon Sep 17 00:00:00 2001
From: Eduardo Rosendo
Date: Fri, 18 Oct 2024 16:24:47 -0400
Subject: [PATCH 32/33] feat(lib): Introduces EsMainQueries Dataclass

Adds a new dataclass to encapsulate common Elasticsearch queries used
throughout the codebase. This centralizes query definitions, making it
easier to maintain and update them.

Updates the `build_es_base_query` function to return an instance of
`EsMainQueries` instead of a tuple. This ensures consistent query
structure and simplifies future modifications.
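
For reviewers, a minimal sketch of the new calling convention (the
`EsMainQueries` fields match the cl/lib/types.py hunk below; the
cleaned-data dict is illustrative only, not a prescribed input):

    from cl.lib.elasticsearch_utils import build_es_base_query
    from cl.search.documents import DocketDocument
    from cl.search.models import SEARCH_TYPES

    # Illustrative cleaned form data; any validated search form data
    # flows through build_es_base_query the same way.
    cd = {"type": SEARCH_TYPES.RECAP, "q": "securities fraud"}

    es_queries = build_es_base_query(DocketDocument.search(), cd)
    main_search = es_queries.search_query   # main ES Search object, always set
    parent_query = es_queries.parent_query  # QueryString | None
    child_query = es_queries.child_query    # QueryString | None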
--- .../commands/cl_send_recap_alerts.py | 5 +- .../commands/clean_up_search_alerts.py | 3 +- .../commands/ready_mix_cases_project.py | 3 +- cl/lib/elasticsearch_utils.py | 63 ++++++++++++------- cl/lib/types.py | 9 +++ cl/search/documents.py | 4 +- cl/search/tests/tests_es_oral_arguments.py | 5 +- cl/search/tests/tests_es_person.py | 6 +- 8 files changed, 65 insertions(+), 33 deletions(-) diff --git a/cl/alerts/management/commands/cl_send_recap_alerts.py b/cl/alerts/management/commands/cl_send_recap_alerts.py index 65d88bf7b6..a38ff91f8c 100644 --- a/cl/alerts/management/commands/cl_send_recap_alerts.py +++ b/cl/alerts/management/commands/cl_send_recap_alerts.py @@ -564,9 +564,8 @@ def query_and_send_alerts( alerts_sent_count = 0 now_time = datetime.datetime.now() for user in alert_users: - if rate == Alert.REAL_TIME: - if not user.profile.is_member: - continue + if rate == Alert.REAL_TIME and not user.profile.is_member: + continue alerts = user.alerts.filter(rate=rate, alert_type=SEARCH_TYPES.RECAP) logger.info(f"Running alerts for user '{user}': {alerts}") diff --git a/cl/alerts/management/commands/clean_up_search_alerts.py b/cl/alerts/management/commands/clean_up_search_alerts.py index cf1ceb2f54..df8a28b2d6 100644 --- a/cl/alerts/management/commands/clean_up_search_alerts.py +++ b/cl/alerts/management/commands/clean_up_search_alerts.py @@ -75,7 +75,8 @@ def validate_queries_syntax(options: OptionsType) -> None: if search_form.is_valid(): cd = search_form.cleaned_data try: - s, _, _ = build_es_base_query(search_query, cd) + es_queries = build_es_base_query(search_query, cd) + s = es_queries.search_query s = s.extra(size=0) s.execute().to_dict() # Waiting between requests to avoid hammering ES too quickly. diff --git a/cl/corpus_importer/management/commands/ready_mix_cases_project.py b/cl/corpus_importer/management/commands/ready_mix_cases_project.py index eabb93f4a4..32c9db7ae3 100644 --- a/cl/corpus_importer/management/commands/ready_mix_cases_project.py +++ b/cl/corpus_importer/management/commands/ready_mix_cases_project.py @@ -198,7 +198,8 @@ def query_results_in_es(options): } search_query = DocketDocument.search() - s, _ = build_es_base_query(search_query, cd) + es_queries = build_es_base_query(search_query, cd) + s = es_queries.search_query s = s.extra(size=options["results_size"]) response = s.execute().to_dict() extracted_data = [ diff --git a/cl/lib/elasticsearch_utils.py b/cl/lib/elasticsearch_utils.py index d0d8db41b1..c12b9c0058 100644 --- a/cl/lib/elasticsearch_utils.py +++ b/cl/lib/elasticsearch_utils.py @@ -35,6 +35,7 @@ ApiPositionMapping, BasePositionMapping, CleanData, + EsMainQueries, ESRangeQueryParams, ) from cl.lib.utils import ( @@ -1114,18 +1115,20 @@ def build_es_base_query( child_highlighting: bool = True, api_version: Literal["v3", "v4"] | None = None, alerts: bool = False, -) -> tuple[Search, QueryString | None, QueryString | None]: +) -> EsMainQueries: """Builds filters and fulltext_query based on the given cleaned data and returns an elasticsearch query. :param search_query: The Elasticsearch search query object. :param cd: The cleaned data object containing the query and filters. - :param child_highlighting: Whether highlighting should be enabled in child docs. + :param child_highlighting: Whether highlighting should be enabled in child + docs. :param api_version: Optional, the request API version. :param alerts: If highlighting is being applied to search Alerts hits. 
-    :return: A three-tuple, the Elasticsearch search query object and an ES
-    QueryString for child documents or None if there is no need to query
-    child documents and a QueryString for parent documents or None.
+    :return: An `EsMainQueries` object containing the main Elasticsearch
+    search query object, a QueryString for parent documents (or None), and
+    a QueryString for child documents (or None when there is no need to
+    query child documents).
     """
 
     main_query = None
@@ -1244,10 +1247,10 @@ def build_es_base_query(
                     api_version=api_version,
                 )
             )
-            return (
-                search_query.query(main_query),
-                child_docs_query,
-                parent_query,
+            return EsMainQueries(
+                search_query=search_query.query(main_query),
+                parent_query=parent_query,
+                child_query=child_docs_query,
             )
 
         opinion_search_fields = SEARCH_OPINION_QUERY_FIELDS
@@ -1291,7 +1294,11 @@ def build_es_base_query(
         match_all_query = get_match_all_query(
             cd, search_query, api_version, child_highlighting
         )
-        return match_all_query, child_docs_query, parent_query
+        return EsMainQueries(
+            search_query=match_all_query,
+            parent_query=parent_query,
+            child_query=child_docs_query,
+        )
 
     if plain_doc:
         # Combine the filters and string query for plain documents like Oral
@@ -1300,7 +1307,11 @@ def build_es_base_query(
             cd, filters, string_query, api_version
         )
 
-    return search_query.query(main_query), child_docs_query, parent_query
+    return EsMainQueries(
+        search_query=search_query.query(main_query),
+        parent_query=parent_query,
+        child_query=child_docs_query,
+    )
 
 
 def build_has_parent_parties_query(
@@ -1442,7 +1453,8 @@ def get_facet_dict_for_search_query(
     """
     cd["just_facets_query"] = True
-    search_query, _, _ = build_es_base_query(search_query, cd)
+    es_queries = build_es_base_query(search_query, cd)
+    search_query = es_queries.search_query
     search_query.aggs.bucket("status", A("terms", field="status.raw"))
     search_query = search_query.extra(size=0)
    response = search_query.execute()
@@ -1464,7 +1476,9 @@ def build_es_main_query(
     applicable.
""" search_query_base = search_query - search_query, child_docs_query, _ = build_es_base_query(search_query, cd) + es_queries = build_es_base_query(search_query, cd) + search_query = es_queries.search_query + child_docs_query = es_queries.child_query top_hits_limit = 5 child_docs_count_query = None match cd["type"]: @@ -2390,7 +2404,9 @@ def build_search_feed_query( hl_field = "text" if cd["type"] == SEARCH_TYPES.RECAP: hl_field = "plain_text" - s, child_docs_query, _ = build_es_base_query(search_query, cd) + es_queries = build_es_base_query(search_query, cd) + s = es_queries.search_query + child_docs_query = es_queries.child_query if jurisdiction or cd["type"] == SEARCH_TYPES.RECAP: # An Opinion Jurisdiction feed or RECAP Search displays child documents # Eliminate items that lack the ordering field and apply highlighting @@ -2952,9 +2968,11 @@ def do_es_api_query( """ try: - s, child_docs_query, _ = build_es_base_query( + es_queries = build_es_base_query( search_query, cd, cd["highlight"], api_version ) + s = es_queries.search_query + child_docs_query = es_queries.child_query except ( UnbalancedParenthesesQuery, UnbalancedQuotesQuery, @@ -3122,8 +3140,8 @@ def do_es_alert_estimation_query( days=int(day_count) ) cd[before_field] = None - estimation_query, _, _ = build_es_base_query(search_query, cd) - + es_queries = build_es_base_query(search_query, cd) + estimation_query = es_queries.search_query if cd["type"] == SEARCH_TYPES.RECAP: # The RECAP estimation query consists of two requests: one to estimate # Docket hits and one to estimate RECAPDocument hits. @@ -3144,7 +3162,8 @@ def do_es_alert_estimation_query( multi_search = multi_search.add(main_doc_count_query) # Build RECAPDocuments count query. - _, child_docs_query, _ = build_es_base_query(search_query, cd) + es_queries = build_es_base_query(search_query, cd) + child_docs_query = es_queries.child_query child_docs_count_query = build_child_docs_query(child_docs_query, cd) child_total = 0 if child_docs_count_query: @@ -3186,10 +3205,10 @@ def do_es_sweep_alert_query( cd = search_form.cleaned_data else: return None, None, None - - s, child_query, parent_query = build_es_base_query( - search_query, cd, True, alerts=True - ) + es_queries = build_es_base_query(search_query, cd, True, alerts=True) + s = es_queries.search_query + parent_query = es_queries.parent_query + child_query = es_queries.child_query main_query = add_es_highlighting(s, cd, alerts=True) main_query = main_query.sort(build_sort_results(cd)) main_query = main_query.extra( diff --git a/cl/lib/types.py b/cl/lib/types.py index 82d6131b5e..ff257574e9 100644 --- a/cl/lib/types.py +++ b/cl/lib/types.py @@ -4,6 +4,8 @@ from typing import Any, Callable, Dict, List, NotRequired, TypedDict, Union from django.http import HttpRequest +from django_elasticsearch_dsl.search import Search +from elasticsearch_dsl.query import QueryString from cl.users.models import User @@ -190,6 +192,13 @@ def get_db_to_dataclass_map(self): return self.__db_to_dataclass_map +@dataclass +class EsMainQueries: + search_query: Search + parent_query: QueryString | None = None + child_query: QueryString | None = None + + @dataclass class ApiPositionMapping(BasePositionMapping): position_type_dict: defaultdict[int, list[str]] = field( diff --git a/cl/search/documents.py b/cl/search/documents.py index b62293ac90..59d8327875 100644 --- a/cl/search/documents.py +++ b/cl/search/documents.py @@ -365,8 +365,8 @@ def prepare_percolator_query(self, instance): cd = search_form.cleaned_data search_query = 
AudioDocument.search() - query, _, _ = build_es_base_query(search_query, cd) - return query.to_dict()["query"] + es_queries = build_es_base_query(search_query, cd) + return es_queries.search_query.to_dict()["query"] class ES_CHILD_ID: diff --git a/cl/search/tests/tests_es_oral_arguments.py b/cl/search/tests/tests_es_oral_arguments.py index 3ca921704d..5878299362 100644 --- a/cl/search/tests/tests_es_oral_arguments.py +++ b/cl/search/tests/tests_es_oral_arguments.py @@ -984,8 +984,9 @@ def confirm_query_matched(response, query_id) -> bool: @staticmethod def save_percolator_query(cd): search_query = AudioDocument.search() - query, _, _ = build_es_base_query(search_query, cd) - query_dict = query.to_dict()["query"] + es_queries = build_es_base_query(search_query, cd) + search_query = es_queries.search_query + query_dict = search_query.to_dict()["query"] percolator_query = AudioPercolator( percolator_query=query_dict, rate=Alert.REAL_TIME ) diff --git a/cl/search/tests/tests_es_person.py b/cl/search/tests/tests_es_person.py index 5f6a195849..12a0b7f1fb 100644 --- a/cl/search/tests/tests_es_person.py +++ b/cl/search/tests/tests_es_person.py @@ -1342,7 +1342,8 @@ def test_has_child_filters(self) -> None: "type": SEARCH_TYPES.PEOPLE, } s = PersonDocument.search() - main_query, _, _ = build_es_base_query(s, cd) + es_queries = build_es_base_query(s, cd) + main_query = es_queries.search_query self.assertEqual(main_query.count(), 2) # Query by parent field dob_state and child field selection_method. @@ -1352,7 +1353,8 @@ def test_has_child_filters(self) -> None: "type": SEARCH_TYPES.PEOPLE, } s = PersonDocument.search() - main_query, _, _ = build_es_base_query(s, cd) + es_queries = build_es_base_query(s, cd) + main_query = es_queries.search_query self.assertEqual(main_query.count(), 1) position_5.delete() From eb8874e718ae18cddd700e2e2c93e53d99c24849 Mon Sep 17 00:00:00 2001 From: ttys0dev <126845556+ttys0dev@users.noreply.github.com> Date: Sat, 5 Oct 2024 16:42:09 -0600 Subject: [PATCH 33/33] Fix check for restricted attachments --- cl/recap/mergers.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/cl/recap/mergers.py b/cl/recap/mergers.py index 6eccba198e..80244e7865 100644 --- a/cl/recap/mergers.py +++ b/cl/recap/mergers.py @@ -1792,6 +1792,15 @@ async def merge_attachment_page_data( if not all(sanity_checks): continue + # Missing on some restricted docs (see Juriscraper) + # Attachment 0 may not have page count since it is the main rd. + if ( + "page_count" in attachment + and attachment["page_count"] is None + and attachment["attachment_number"] != 0 + ): + continue + # Appellate entries with attachments don't have a main RD, transform it # to an attachment. In ACMS attachment pages, all the documents use the # same pacer_doc_id, so we need to make sure only one is matched to the