Skip to content

Commit

Permalink
Merge branch 'main' into workflow-isolate-build
Browse files Browse the repository at this point in the history
  • Loading branch information
mlissner authored Oct 25, 2024
2 parents 503b2ca + 767d836 commit fce8771
Show file tree
Hide file tree
Showing 5 changed files with 109 additions and 13 deletions.
52 changes: 49 additions & 3 deletions cl/audio/factories.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from factory import Faker
from factory import Faker, post_generation
from factory.django import DjangoModelFactory, FileField
from factory.fuzzy import FuzzyChoice

Expand All @@ -15,8 +15,54 @@ class Meta:
case_name = Faker("case_name")
sha1 = Faker("sha1")
download_url = Faker("url")
local_path_mp3 = FileField(upload_to="/tmp/audio")
local_path_original_file = FileField(upload_to="/tmp/audio/")

@classmethod
def _create(cls, model_class, *args, **kwargs):
    """Instantiate and save ``model_class`` without indexing the object.

    :param model_class: The model class to instantiate.
    :param args: Positional arguments forwarded to the model constructor.
    :param kwargs: Keyword arguments forwarded to the model constructor.
    :return: The saved model instance.
    """
    instance = model_class(*args, **kwargs)
    # `index=False` is passed explicitly so the save does not index the
    # object in Solr. Once Solr is removed, this whole override can go away.
    save_options = {"index": False}
    instance.save(**save_options)
    return instance

"""
These hooks are necessary to make this factory compatible with the
`make_dev_command`. By delegating the file creation to the hooks, we prevent
the model from trying to use our storage settings when the field is not
explicitly requested.
"""

@post_generation
def local_path_mp3(self, create, extracted, **kwargs):
    """Set ``local_path_mp3`` only when a value or file options are given.

    :param create: Creation flag supplied by Factory Boy; not used here.
    :param extracted: A ready-made value for the field, if one was passed.
    :param kwargs: Extra options used to build a stub file (for example
        ``local_path_mp3__data=...`` in the factory call).
    """
    if not (extracted or kwargs):
        # Nothing was requested, so the field is left untouched.
        return

    # An explicitly supplied value takes precedence. Otherwise, fall back to
    # Factory Boy's `FileField`: its `evaluate` method is what computes the
    # value during object creation, and it only needs the extra dictionary
    # to create the stub django file.
    #
    # Learn more about FactoryBoy's `FileField` class:
    # https://github.com/FactoryBoy/factory_boy/blob/ac49fb40ec424276c3cd3ca0925ba99a626f05f7/factory/django.py#L249
    self.local_path_mp3 = extracted or FileField().evaluate(
        None, None, kwargs
    )

@post_generation
def local_path_original_file(self, create, extracted, **kwargs):
    """Set ``local_path_original_file`` only when explicitly requested.

    :param create: Creation flag supplied by Factory Boy; not used here.
    :param extracted: A ready-made value for the field, if one was passed.
    :param kwargs: Extra options used to build a stub file (for example
        ``local_path_original_file__data=...`` in the factory call).
    """
    if not (extracted or kwargs):
        # Nothing was requested, so the field is left untouched.
        return

    # An explicitly supplied value takes precedence; otherwise a stub django
    # file is built from the extra options via Factory Boy's `FileField`.
    stub_builder = FileField()
    self.local_path_original_file = extracted or stub_builder.evaluate(
        None, None, kwargs
    )

@classmethod
def _after_postgeneration(cls, instance, create, results=None):
    """Save the instance again if creating and at least one hook ran.

    :param instance: The object produced by the factory.
    :param create: Whether the object is being created.
    :param results: Results of the post-generation hooks, if any.
    """
    if not create or not results:
        return

    # Some post-generation hooks ran and may have modified the instance, so
    # write the two file fields back, still without indexing.
    file_fields = ["local_path_mp3", "local_path_original_file"]
    instance.save(index=False, update_fields=file_fields)


class AudioWithParentsFactory(AudioFactory, DocketParentMixin):
Expand Down
4 changes: 4 additions & 0 deletions cl/audio/tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -303,13 +303,17 @@ def setUpTestData(cls) -> None:
docket=DocketFactory(
court=cls.court_1, date_argued=datetime.date(2014, 8, 14)
),
local_path_mp3__data=b"\x10" * 10,
local_path_original_file__data=b"\x10" * 10,
duration=2000,
stt_status=Audio.STT_NEEDED,
)
cls.audio_to_be_retried = AudioFactory.create(
docket=DocketFactory(
court=cls.court_1, date_argued=datetime.date(2014, 8, 13)
),
local_path_mp3__data=b"\x10" * 10,
local_path_original_file__data=b"\x10" * 10,
duration=1000,
stt_status=Audio.STT_FAILED,
)
Expand Down
27 changes: 26 additions & 1 deletion cl/search/factories.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
RelatedFactory,
SelfAttribute,
SubFactory,
post_generation,
)
from factory.django import DjangoModelFactory, FileField
from factory.fuzzy import FuzzyChoice, FuzzyText
Expand Down Expand Up @@ -269,6 +270,7 @@ class DocketEntryReuseParentsFactory(
class DocketFactory(DjangoModelFactory):
class Meta:
model = Docket
skip_postgeneration_save = True

source = FuzzyChoice(Docket.SOURCE_CHOICES, getter=lambda c: c[0])
court = SubFactory(CourtFactory)
Expand All @@ -281,9 +283,32 @@ class Meta:
pacer_case_id = Faker("pyint", min_value=100_000, max_value=400_000)
docket_number = Faker("federal_district_docket_number")
slug = Faker("slug")
filepath_local = FileField(filename="docket.xml")
date_argued = Faker("date_object")

"""
This hook is necessary to make this factory compatible with the
`make_dev_command`. By delegating the file creation to the hook, we prevent
the model from trying to use our storage settings when the field is not
explicitly requested.
"""

@post_generation
def filepath_local(self, create, extracted, **kwargs):
    """Attach a stub file to an instance of this factory when requested.

    :param create: Whether the object is being created; when true, the
        ``filepath_local`` field is saved after the hook runs.
    :param extracted: A ready-made value for the field, if one was passed.
    :param kwargs: Extra options used to build a stub file.
    """
    if extracted or kwargs:
        # An explicitly supplied value takes precedence. Otherwise, fall back
        # to Factory Boy's `FileField`: its `evaluate` method is what
        # computes the value during object creation, and it only needs the
        # extra dictionary to create the stub django file.
        #
        # Learn more about FactoryBoy's `FileField` class:
        # https://github.com/FactoryBoy/factory_boy/blob/ac49fb40ec424276c3cd3ca0925ba99a626f05f7/factory/django.py#L249
        self.filepath_local = extracted or FileField().evaluate(
            None, None, kwargs
        )

    if not create:
        return
    self.save(update_fields=["filepath_local"])


class DocketWithChildrenFactory(DocketFactory):
clusters = RelatedFactory(
Expand Down
34 changes: 26 additions & 8 deletions cl/search/management/commands/import_harvard_pdfs.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

import boto3
from django.conf import settings
from django.core.files.base import ContentFile
from django.core.management.base import BaseCommand
from tqdm import tqdm

Expand Down Expand Up @@ -159,7 +160,12 @@ def process_crosswalk_file(self, crosswalk_file: str) -> None:
try:
cap_case_id = entry["cap_case_id"]
cl_cluster_id = entry["cl_cluster_id"]
pdf_path = entry["cap_path"].replace(".json", ".pdf")
json_path = entry["cap_path"]

# Construct the PDF path based on the JSON path
pdf_path = json_path.replace("cases", "case-pdfs").replace(
".json", ".pdf"
)

if pdf_path in self.processed_pdfs:
logger.info(f"Skipping already processed PDF: {pdf_path}")
Expand Down Expand Up @@ -240,6 +246,8 @@ def fetch_pdf_from_cap(self, pdf_path: str) -> Optional[bytes]:
logger.info(f"Fetching PDF from CAP: {pdf_path}")
logger.debug(f"Bucket name: {self.cap_bucket_name}")

pdf_path = pdf_path.lstrip("/")

if self.dry_run:
logger.info(f"Dry run: Would fetch PDF from {pdf_path}")
return b"Mock PDF content"
Expand Down Expand Up @@ -280,13 +288,23 @@ def store_pdf_in_cl(
storage = HarvardPDFStorage()
file_path = f"harvard_pdf/{cluster.pk}.pdf"
logger.debug(f"Saving file to: {file_path}")
storage.save(file_path, pdf_content)
logger.debug(f"File saved. Updating cluster {cluster.pk}")
cluster.filepath_pdf_harvard = file_path
cluster.save()
logger.info(
f"Cluster updated. filepath_pdf_harvard: {cluster.filepath_pdf_harvard}"
)

try:
content_file = ContentFile(pdf_content)

saved_path = storage.save(file_path, content_file)
logger.info(f"File saved successfully at: {saved_path}")

cluster.filepath_pdf_harvard = saved_path
cluster.save()
logger.info(
f"Cluster updated. filepath_pdf_harvard: {cluster.filepath_pdf_harvard}"
)
except Exception as e:
logger.error(
f"Error saving PDF for cluster {cluster.id}: {str(e)}",
exc_info=True,
)


if __name__ == "__main__":
Expand Down
5 changes: 4 additions & 1 deletion cl/search/tests/test_import_harvard_pdfs.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ def test_import_harvard_pdfs(
mock_s3 = MagicMock()
mock_boto3_client.return_value = mock_s3
mock_storage = MagicMock()
mock_storage.save.return_value = "mocked_saved_path.pdf"
mock_harvard_storage.return_value = mock_storage
mock_opinion_cluster_get.return_value = self.cluster
mock_tqdm.side_effect = (
Expand Down Expand Up @@ -115,4 +116,6 @@ def test_import_harvard_pdfs(

# Verify that the cluster's filepath_pdf_harvard field was updated
self.cluster.refresh_from_db()
self.assertIsNotNone(self.cluster.filepath_pdf_harvard)
self.assertEqual(
self.cluster.filepath_pdf_harvard, "mocked_saved_path.pdf"
)

0 comments on commit fce8771

Please sign in to comment.