Skip to content

Commit

Permalink
Merge pull request #3070 from ss108/recap-parse
Browse files Browse the repository at this point in the history
Recap parse
  • Loading branch information
mlissner authored Sep 8, 2023
2 parents 951308b + 6b763f5 commit 53285f4
Show file tree
Hide file tree
Showing 3 changed files with 125 additions and 2 deletions.
8 changes: 6 additions & 2 deletions cl/scrapers/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -262,7 +262,7 @@ async def extract_recap_pdf_base(
if not is_iter(pks):
pks = [pks]

processed = []
processed: List[int] = []
for pk in pks:
rd = await RECAPDocument.objects.aget(pk=pk)
if check_if_needed and not rd.needs_extraction:
Expand Down Expand Up @@ -305,7 +305,11 @@ async def extract_recap_pdf_base(

rd.plain_text, _ = anonymize(content)
# Do not do indexing here. Creates race condition in celery.
await rd.asave(index=False, do_extraction=False)
await rd.asave(
index=False,
do_extraction=False,
update_fields=["ocr_status", "plain_text"],
)
processed.append(pk)

return processed
Expand Down
35 changes: 35 additions & 0 deletions cl/search/signals.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,10 @@
from django.db.models.signals import post_save
from django.dispatch import receiver

from cl.audio.models import Audio
from cl.citations.tasks import (
find_citations_and_parantheticals_for_recap_documents,
)
from cl.lib.es_signal_processor import ESSignalProcessor
from cl.search.documents import AudioDocument, ParentheticalGroupDocument
from cl.search.models import (
Expand All @@ -9,6 +15,7 @@
OpinionsCited,
Parenthetical,
ParentheticalGroup,
RECAPDocument,
)

# This field mapping is used to define which fields should be updated in the
Expand Down Expand Up @@ -130,3 +137,31 @@
AudioDocument,
oa_field_mapping,
)


@receiver(
    post_save,
    sender=RECAPDocument,
    dispatch_uid="handle_recap_doc_change_uid",
)
def handle_recap_doc_change(
    sender, instance: RECAPDocument, update_fields=None, **kwargs
):
    """Enqueue citation parsing when a RECAPDocument's text has been saved.

    For now this receiver only schedules the task that parses
    RECAPDocuments for caselaw citations. More functionality can live here
    later; some logic currently in RECAPDocument.save() may be easier to
    maintain and test as part of this receiver.

    :param sender: The model class that sent the signal.
    :param instance: The RECAPDocument that was just saved.
    :param update_fields: The ``update_fields`` given to ``save()``, or None
        when the caller did not restrict the save to specific fields.
    :param kwargs: Remaining ``post_save`` signal arguments (unused).
    :return: None
    """
    # PDF text processing writes its result to plain_text, so a save that
    # explicitly names plain_text is the cue that there is new text to
    # parse. Saves that do not name it are ignored.
    if update_fields is None or "plain_text" not in update_fields:
        return

    # The task filters on ocr_status as well, but checking here keeps
    # unnecessary items from clogging the task queue.
    parseable_statuses = (
        RECAPDocument.OCR_COMPLETE,
        RECAPDocument.OCR_UNNECESSARY,
    )
    if instance.ocr_status not in parseable_statuses:
        return

    # NOTE(review): ``args`` is the list ``[pk]``, so the task receives the
    # bare pk as its single positional argument. If the task expects a list
    # of ids, this should be ``args=([instance.pk],)`` -- confirm against
    # the task signature (and update the matching test assertion).
    find_citations_and_parantheticals_for_recap_documents.apply_async(
        args=[instance.pk]
    )
84 changes: 84 additions & 0 deletions cl/search/tests/test_search_signals.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
from dataclasses import dataclass
from typing import List
from unittest.mock import Mock, patch

from django.db.models.signals import post_save

from cl.search.factories import (
DocketEntryWithParentsFactory,
RECAPDocumentFactory,
)
from cl.search.models import RECAPDocument
from cl.search.signals import handle_recap_doc_change
from cl.tests.cases import SimpleTestCase


# Test that event hits the receiver function
class RECAPDocumentSignalTests(SimpleTestCase):
    """Check that saving a RECAPDocument emits ``post_save`` to receivers."""

    # Must match the dispatch_uid used when handle_recap_doc_change is
    # registered; a receiver connected with a dispatch_uid can only be
    # disconnected by that same uid.
    HANDLER_DISPATCH_UID = "handle_recap_doc_change_uid"

    def setUp(self):
        """Swap the real receiver for a mock and restore it afterwards."""
        # Disconnect by dispatch_uid: passing only the receiver function
        # does not match a registration that was made with a dispatch_uid,
        # so the real handler would silently stay connected.
        post_save.disconnect(
            sender=RECAPDocument,
            dispatch_uid=self.HANDLER_DISPATCH_UID,
        )
        # Reconnect the real handler once the test finishes so other tests
        # are not affected by this one.
        self.addCleanup(
            post_save.connect,
            handle_recap_doc_change,
            sender=RECAPDocument,
            dispatch_uid=self.HANDLER_DISPATCH_UID,
        )

        self.mock_receiver = Mock()
        post_save.connect(self.mock_receiver, sender=RECAPDocument)
        # Do not leave the mock listening after the test is done.
        self.addCleanup(
            post_save.disconnect,
            self.mock_receiver,
            sender=RECAPDocument,
        )

    def test_recapdoc_save_emit_signal(self):
        """An explicit save with update_fields reaches connected receivers."""
        recap_doc = RECAPDocumentFactory.create(
            plain_text="In Fisher v. SD Protection Inc., 948 F.3d 593 (2d Cir. 2020), the Second Circuit held that in the context of settlement of FLSA and NYLL cases, which must be approved by the trial court in accordance with Cheeks v. Freeport Pancake House, Inc., 796 F.3d 199 (2d Cir. 2015), the district court abused its discretion in limiting the amount of recoverable fees to a percentage of the recovery by the successful plaintiffs. But also: sdjnfdsjnk. Fisher, 948 F.3d at 597.",
            ocr_status=RECAPDocument.OCR_UNNECESSARY,
            docket_entry=DocketEntryWithParentsFactory(),
        )

        # Creating the document already fires post_save; clear the mock so
        # the assertion below reflects only the save under test.
        self.mock_receiver.reset_mock()

        recap_doc.save(update_fields=["ocr_status", "plain_text"])
        self.assertTrue(self.mock_receiver.called)


@dataclass
class ReceiverTestCase:
    """One scenario for the table-driven receiver test below."""

    # Value handed to ``save(update_fields=...)``; None means a plain save.
    update_fields: List[str] | None
    # OCR status the document is created with.
    # NOTE(review): OCR_STATUSES looks like a choices collection rather than
    # a type -- confirm, and consider annotating with the status value type.
    ocr_status: RECAPDocument.OCR_STATUSES
    # Whether the citation-parsing task should be enqueued for this scenario.
    expect_enqueue: bool


class RECAPDocumentReceiverTests(SimpleTestCase):
    """Table-driven checks of when the receiver enqueues citation parsing."""

    def test_receiver_enqueues_task(self):
        """Run each scenario and verify whether the task was enqueued."""
        scenarios: List[ReceiverTestCase] = [
            # Enqueued: relevant fields updated and ocr_status qualifies.
            ReceiverTestCase(
                update_fields=["plain_text", "ocr_status"],
                ocr_status=RECAPDocument.OCR_UNNECESSARY,
                expect_enqueue=True,
            ),
            # Not enqueued: ocr_status does not qualify.
            ReceiverTestCase(
                update_fields=["plain_text"],
                ocr_status=RECAPDocument.OCR_FAILED,
                expect_enqueue=False,
            ),
            # Not enqueued: no update_fields, even though ocr_status
            # qualifies.
            ReceiverTestCase(
                update_fields=None,
                ocr_status=RECAPDocument.OCR_COMPLETE,
                expect_enqueue=False,
            ),
            # Not enqueued: update_fields holds nothing relevant.
            ReceiverTestCase(
                update_fields=["document_type"],
                ocr_status=RECAPDocument.OCR_COMPLETE,
                expect_enqueue=False,
            ),
        ]

        # Dotted path of the task's apply_async, patched for each scenario.
        apply_async_path = "cl.citations.tasks.find_citations_and_parantheticals_for_recap_documents.apply_async"

        for scenario in scenarios:
            with self.subTest(test_case=scenario), patch(
                apply_async_path
            ) as enqueue_mock:
                # The document is created inside the patch so that the
                # post_save fired by creation is covered by the mock too.
                doc = RECAPDocumentFactory.create(
                    plain_text='"During the whole of his trip down town and return[,] Cornish had been ill, the journey being marked by frequent interruptions necessitated by the condition of his stomach and bowels. People v. Molineux, 168 NY 264, 275-276 (N.Y. 1901)."',
                    ocr_status=scenario.ocr_status,
                    docket_entry=DocketEntryWithParentsFactory(),
                )

                doc.save(update_fields=scenario.update_fields)

                if not scenario.expect_enqueue:
                    enqueue_mock.assert_not_called()
                    continue

                # ``args`` here is a plain one-element list, matching what
                # the receiver passes.
                enqueue_mock.assert_called_once_with(args=[doc.pk])

0 comments on commit 53285f4

Please sign in to comment.