Merge pull request #4621 from jtmst/fix-harvard-pdf-import

Fix path construction and PDF file handling
freelawproject · Oct 25, 2024 · 767d836
2 parents 8c77749 + fd91986
commit 767d836
Show file tree

Hide file tree

Showing 2 changed files with 30 additions and 9 deletions.
diff --git a/cl/search/management/commands/import_harvard_pdfs.py b/cl/search/management/commands/import_harvard_pdfs.py
@@ -6,6 +6,7 @@
 
 import boto3
 from django.conf import settings
+from django.core.files.base import ContentFile
 from django.core.management.base import BaseCommand
 from tqdm import tqdm
 
@@ -159,7 +160,12 @@ def process_crosswalk_file(self, crosswalk_file: str) -> None:
             try:
                 cap_case_id = entry["cap_case_id"]
                 cl_cluster_id = entry["cl_cluster_id"]
-                pdf_path = entry["cap_path"].replace(".json", ".pdf")
+                json_path = entry["cap_path"]
+
+                # Construct the PDF path based on the JSON path
+                pdf_path = json_path.replace("cases", "case-pdfs").replace(
+                    ".json", ".pdf"
+                )
 
                 if pdf_path in self.processed_pdfs:
                     logger.info(f"Skipping already processed PDF: {pdf_path}")
@@ -240,6 +246,8 @@ def fetch_pdf_from_cap(self, pdf_path: str) -> Optional[bytes]:
         logger.info(f"Fetching PDF from CAP: {pdf_path}")
         logger.debug(f"Bucket name: {self.cap_bucket_name}")
 
+        pdf_path = pdf_path.lstrip("/")
+
         if self.dry_run:
             logger.info(f"Dry run: Would fetch PDF from {pdf_path}")
             return b"Mock PDF content"
@@ -280,13 +288,23 @@ def store_pdf_in_cl(
         storage = HarvardPDFStorage()
         file_path = f"harvard_pdf/{cluster.pk}.pdf"
         logger.debug(f"Saving file to: {file_path}")
-        storage.save(file_path, pdf_content)
-        logger.debug(f"File saved. Updating cluster {cluster.pk}")
-        cluster.filepath_pdf_harvard = file_path
-        cluster.save()
-        logger.info(
-            f"Cluster updated. filepath_pdf_harvard: {cluster.filepath_pdf_harvard}"
-        )
+
+        try:
+            content_file = ContentFile(pdf_content)
+
+            saved_path = storage.save(file_path, content_file)
+            logger.info(f"File saved successfully at: {saved_path}")
+
+            cluster.filepath_pdf_harvard = saved_path
+            cluster.save()
+            logger.info(
+                f"Cluster updated. filepath_pdf_harvard: {cluster.filepath_pdf_harvard}"
+            )
+        except Exception as e:
+            logger.error(
+                f"Error saving PDF for cluster {cluster.id}: {str(e)}",
+                exc_info=True,
+            )
 
 
 if __name__ == "__main__":

diff --git a/cl/search/tests/test_import_harvard_pdfs.py b/cl/search/tests/test_import_harvard_pdfs.py
@@ -64,6 +64,7 @@ def test_import_harvard_pdfs(
         mock_s3 = MagicMock()
         mock_boto3_client.return_value = mock_s3
         mock_storage = MagicMock()
+        mock_storage.save.return_value = "mocked_saved_path.pdf"
         mock_harvard_storage.return_value = mock_storage
         mock_opinion_cluster_get.return_value = self.cluster
         mock_tqdm.side_effect = (
@@ -115,4 +116,6 @@ def test_import_harvard_pdfs(
 
         # Verify that the cluster's filepath_pdf_harvard field was updated
         self.cluster.refresh_from_db()
-        self.assertIsNotNone(self.cluster.filepath_pdf_harvard)
+        self.assertEqual(
+            self.cluster.filepath_pdf_harvard, "mocked_saved_path.pdf"
+        )