Skip to content

Commit

Permalink
Merge pull request #4621 from jtmst/fix-harvard-pdf-import
Browse files: browse the repository at this point in the history
Fix path construction and PDF file handling
  • Loading branch information
quevon24 authored Oct 25, 2024
2 parents 8c77749 + fd91986 commit 767d836
Show file tree
Hide file tree
Showing 2 changed files with 30 additions and 9 deletions.
34 changes: 26 additions & 8 deletions cl/search/management/commands/import_harvard_pdfs.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

import boto3
from django.conf import settings
from django.core.files.base import ContentFile
from django.core.management.base import BaseCommand
from tqdm import tqdm

Expand Down Expand Up @@ -159,7 +160,12 @@ def process_crosswalk_file(self, crosswalk_file: str) -> None:
try:
cap_case_id = entry["cap_case_id"]
cl_cluster_id = entry["cl_cluster_id"]
pdf_path = entry["cap_path"].replace(".json", ".pdf")
json_path = entry["cap_path"]

# Construct the PDF path based on the JSON path
pdf_path = json_path.replace("cases", "case-pdfs").replace(
".json", ".pdf"
)

if pdf_path in self.processed_pdfs:
logger.info(f"Skipping already processed PDF: {pdf_path}")
Expand Down Expand Up @@ -240,6 +246,8 @@ def fetch_pdf_from_cap(self, pdf_path: str) -> Optional[bytes]:
logger.info(f"Fetching PDF from CAP: {pdf_path}")
logger.debug(f"Bucket name: {self.cap_bucket_name}")

pdf_path = pdf_path.lstrip("/")

if self.dry_run:
logger.info(f"Dry run: Would fetch PDF from {pdf_path}")
return b"Mock PDF content"
Expand Down Expand Up @@ -280,13 +288,23 @@ def store_pdf_in_cl(
storage = HarvardPDFStorage()
file_path = f"harvard_pdf/{cluster.pk}.pdf"
logger.debug(f"Saving file to: {file_path}")
storage.save(file_path, pdf_content)
logger.debug(f"File saved. Updating cluster {cluster.pk}")
cluster.filepath_pdf_harvard = file_path
cluster.save()
logger.info(
f"Cluster updated. filepath_pdf_harvard: {cluster.filepath_pdf_harvard}"
)

try:
content_file = ContentFile(pdf_content)

saved_path = storage.save(file_path, content_file)
logger.info(f"File saved successfully at: {saved_path}")

cluster.filepath_pdf_harvard = saved_path
cluster.save()
logger.info(
f"Cluster updated. filepath_pdf_harvard: {cluster.filepath_pdf_harvard}"
)
except Exception as e:
logger.error(
f"Error saving PDF for cluster {cluster.id}: {str(e)}",
exc_info=True,
)


if __name__ == "__main__":
Expand Down
5 changes: 4 additions & 1 deletion cl/search/tests/test_import_harvard_pdfs.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ def test_import_harvard_pdfs(
mock_s3 = MagicMock()
mock_boto3_client.return_value = mock_s3
mock_storage = MagicMock()
mock_storage.save.return_value = "mocked_saved_path.pdf"
mock_harvard_storage.return_value = mock_storage
mock_opinion_cluster_get.return_value = self.cluster
mock_tqdm.side_effect = (
Expand Down Expand Up @@ -115,4 +116,6 @@ def test_import_harvard_pdfs(

# Verify that the cluster's filepath_pdf_harvard field was updated
self.cluster.refresh_from_db()
self.assertIsNotNone(self.cluster.filepath_pdf_harvard)
self.assertEqual(
self.cluster.filepath_pdf_harvard, "mocked_saved_path.pdf"
)

0 comments on commit 767d836

Please sign in to comment.