feat(harvard_merger): add harvard_id field to OpinionCluster

Harvard's Caselaw Access Project (CAP) has been sunset. Projects that hold existing references to CAP cases need a way to find the CL opinion cluster that corresponds to a given CAP case. This patch adds an indexed `harvard_id` column to `OpinionCluster` and adds the field to the `fields` of `OpinionClusterFilter`. To populate the column, it builds on the work done in #4284 and #4442 and extends `import_harvard_pdfs` with an `assign_cap_id` job that fills `harvard_id` from the CAP crosswalk files. Fixes: #4313
freelawproject · Oct 25, 2024 · cb77066 · cb77066
1 parent eb06405
commit cb77066
Show file tree

Hide file tree

Showing 7 changed files with 166 additions and 1 deletion.
diff --git a/cl/search/filters.py b/cl/search/filters.py
@@ -166,6 +166,7 @@ class Meta:
             "citation_count": INTEGER_LOOKUPS,
             "precedential_status": ["exact"],
             "date_blocked": DATE_LOOKUPS,
+            "harvard_id": ["exact"],
             "blocked": ["exact"],
         }
 

diff --git a/cl/search/management/commands/import_harvard_pdfs.py b/cl/search/management/commands/import_harvard_pdfs.py
@@ -54,7 +54,7 @@ def add_arguments(self, parser):
         parser.add_argument(
             "--job",
             type=str,
-            choices=["import_pdf"],
+            choices=["import_pdf", "assign_cap_id"],
             default="import_pdf",
             help="",
         )
@@ -242,6 +242,17 @@ def process_entry(
 
                 except OpinionCluster.DoesNotExist:
                     logger.info(f"Cluster not found for id: {cl_cluster_id}")
+
+            case "assign_cap_id":
+                try:
+                    cluster = OpinionCluster.objects.get(id=cl_cluster_id)
+                    cluster.harvard_id = cap_case_id
+                    if not self.dry_run:
+                        cluster.save()
+
+                except OpinionCluster.DoesNotExist:
+                    logger.info(f"Cluster not found for id: {cl_cluster_id}")
+
             case _:
                 raise Exception(f"Unknown job {self.job}")
 

diff --git a/cl/search/migrations/0037_add_harvard_id_to_opinioncluster.py b/cl/search/migrations/0037_add_harvard_id_to_opinioncluster.py
@@ -0,0 +1,69 @@
+# Generated by Django 5.1.2 on 2024-10-25 18:54
+
+import pgtrigger.compiler
+import pgtrigger.migrations
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+    """Add ``harvard_id`` to ``opinioncluster`` and ``opinionclusterevent``.
+
+    The ``update_update`` and ``delete_delete`` triggers on
+    ``opinioncluster`` are removed and then re-added so that their SQL
+    includes the new ``harvard_id`` column.
+    """
+
+    dependencies = [
+        ("search", "0036_add_searchquery"),
+    ]
+
+    operations = [
+        # Drop the existing triggers first; they are re-created at the end of
+        # this list with "harvard_id" added to their column lists.
+        pgtrigger.migrations.RemoveTrigger(
+            model_name="opinioncluster",
+            name="update_update",
+        ),
+        pgtrigger.migrations.RemoveTrigger(
+            model_name="opinioncluster",
+            name="delete_delete",
+        ),
+        # New indexed column on the main table.
+        # NOTE(review): the default is the int 0 on a CharField -- confirm
+        # this is the intended placeholder value.
+        migrations.AddField(
+            model_name="opinioncluster",
+            name="harvard_id",
+            field=models.CharField(
+                db_index=True,
+                default=0,
+                help_text="The ID of the item in the Caselaw Access Project (Harvard)",
+            ),
+        ),
+        # Same column on the event model; no db_index is set here.
+        migrations.AddField(
+            model_name="opinionclusterevent",
+            name="harvard_id",
+            field=models.CharField(
+                default=0,
+                help_text="The ID of the item in the Caselaw Access Project (Harvard)",
+            ),
+        ),
+        # AFTER UPDATE trigger: when any column named in `condition` differs
+        # between OLD and NEW, insert the OLD values into
+        # "search_opinionclusterevent" with pgh_label 'update'.
+        pgtrigger.migrations.AddTrigger(
+            model_name="opinioncluster",
+            trigger=pgtrigger.compiler.Trigger(
+                name="update_update",
+                sql=pgtrigger.compiler.UpsertTriggerSql(
+                    condition='WHEN (OLD."arguments" IS DISTINCT FROM (NEW."arguments") OR OLD."attorneys" IS DISTINCT FROM (NEW."attorneys") OR OLD."blocked" IS DISTINCT FROM (NEW."blocked") OR OLD."case_name" IS DISTINCT FROM (NEW."case_name") OR OLD."case_name_full" IS DISTINCT FROM (NEW."case_name_full") OR OLD."case_name_short" IS DISTINCT FROM (NEW."case_name_short") OR OLD."citation_count" IS DISTINCT FROM (NEW."citation_count") OR OLD."correction" IS DISTINCT FROM (NEW."correction") OR OLD."cross_reference" IS DISTINCT FROM (NEW."cross_reference") OR OLD."date_blocked" IS DISTINCT FROM (NEW."date_blocked") OR OLD."date_filed" IS DISTINCT FROM (NEW."date_filed") OR OLD."date_filed_is_approximate" IS DISTINCT FROM (NEW."date_filed_is_approximate") OR OLD."disposition" IS DISTINCT FROM (NEW."disposition") OR OLD."docket_id" IS DISTINCT FROM (NEW."docket_id") OR OLD."filepath_json_harvard" IS DISTINCT FROM (NEW."filepath_json_harvard") OR OLD."filepath_pdf_harvard" IS DISTINCT FROM (NEW."filepath_pdf_harvard") OR OLD."harvard_id" IS DISTINCT FROM (NEW."harvard_id") OR OLD."headmatter" IS DISTINCT FROM (NEW."headmatter") OR OLD."headnotes" IS DISTINCT FROM (NEW."headnotes") OR OLD."history" IS DISTINCT FROM (NEW."history") OR OLD."id" IS DISTINCT FROM (NEW."id") OR OLD."judges" IS DISTINCT FROM (NEW."judges") OR OLD."nature_of_suit" IS DISTINCT FROM (NEW."nature_of_suit") OR OLD."other_dates" IS DISTINCT FROM (NEW."other_dates") OR OLD."posture" IS DISTINCT FROM (NEW."posture") OR OLD."precedential_status" IS DISTINCT FROM (NEW."precedential_status") OR OLD."procedural_history" IS DISTINCT FROM (NEW."procedural_history") OR OLD."scdb_decision_direction" IS DISTINCT FROM (NEW."scdb_decision_direction") OR OLD."scdb_id" IS DISTINCT FROM (NEW."scdb_id") OR OLD."scdb_votes_majority" IS DISTINCT FROM (NEW."scdb_votes_majority") OR OLD."scdb_votes_minority" IS DISTINCT FROM (NEW."scdb_votes_minority") OR OLD."slug" IS DISTINCT FROM (NEW."slug") OR OLD."source" IS 
DISTINCT FROM (NEW."source") OR OLD."summary" IS DISTINCT FROM (NEW."summary") OR OLD."syllabus" IS DISTINCT FROM (NEW."syllabus"))',
+                    func='INSERT INTO "search_opinionclusterevent" ("arguments", "attorneys", "blocked", "case_name", "case_name_full", "case_name_short", "citation_count", "correction", "cross_reference", "date_blocked", "date_created", "date_filed", "date_filed_is_approximate", "date_modified", "disposition", "docket_id", "filepath_json_harvard", "filepath_pdf_harvard", "harvard_id", "headmatter", "headnotes", "history", "id", "judges", "nature_of_suit", "other_dates", "pgh_context_id", "pgh_created_at", "pgh_label", "pgh_obj_id", "posture", "precedential_status", "procedural_history", "scdb_decision_direction", "scdb_id", "scdb_votes_majority", "scdb_votes_minority", "slug", "source", "summary", "syllabus") VALUES (OLD."arguments", OLD."attorneys", OLD."blocked", OLD."case_name", OLD."case_name_full", OLD."case_name_short", OLD."citation_count", OLD."correction", OLD."cross_reference", OLD."date_blocked", OLD."date_created", OLD."date_filed", OLD."date_filed_is_approximate", OLD."date_modified", OLD."disposition", OLD."docket_id", OLD."filepath_json_harvard", OLD."filepath_pdf_harvard", OLD."harvard_id", OLD."headmatter", OLD."headnotes", OLD."history", OLD."id", OLD."judges", OLD."nature_of_suit", OLD."other_dates", _pgh_attach_context(), NOW(), \'update\', OLD."id", OLD."posture", OLD."precedential_status", OLD."procedural_history", OLD."scdb_decision_direction", OLD."scdb_id", OLD."scdb_votes_majority", OLD."scdb_votes_minority", OLD."slug", OLD."source", OLD."summary", OLD."syllabus"); RETURN NULL;',
+                    hash="bc20a56b13c375017e704a6e50efd44e5c060018",
+                    operation="UPDATE",
+                    pgid="pgtrigger_update_update_c83f1",
+                    table="search_opinioncluster",
+                    when="AFTER",
+                ),
+            ),
+        ),
+        # AFTER DELETE trigger: no condition is given; the OLD values are
+        # inserted into "search_opinionclusterevent" with pgh_label 'delete'.
+        pgtrigger.migrations.AddTrigger(
+            model_name="opinioncluster",
+            trigger=pgtrigger.compiler.Trigger(
+                name="delete_delete",
+                sql=pgtrigger.compiler.UpsertTriggerSql(
+                    func='INSERT INTO "search_opinionclusterevent" ("arguments", "attorneys", "blocked", "case_name", "case_name_full", "case_name_short", "citation_count", "correction", "cross_reference", "date_blocked", "date_created", "date_filed", "date_filed_is_approximate", "date_modified", "disposition", "docket_id", "filepath_json_harvard", "filepath_pdf_harvard", "harvard_id", "headmatter", "headnotes", "history", "id", "judges", "nature_of_suit", "other_dates", "pgh_context_id", "pgh_created_at", "pgh_label", "pgh_obj_id", "posture", "precedential_status", "procedural_history", "scdb_decision_direction", "scdb_id", "scdb_votes_majority", "scdb_votes_minority", "slug", "source", "summary", "syllabus") VALUES (OLD."arguments", OLD."attorneys", OLD."blocked", OLD."case_name", OLD."case_name_full", OLD."case_name_short", OLD."citation_count", OLD."correction", OLD."cross_reference", OLD."date_blocked", OLD."date_created", OLD."date_filed", OLD."date_filed_is_approximate", OLD."date_modified", OLD."disposition", OLD."docket_id", OLD."filepath_json_harvard", OLD."filepath_pdf_harvard", OLD."harvard_id", OLD."headmatter", OLD."headnotes", OLD."history", OLD."id", OLD."judges", OLD."nature_of_suit", OLD."other_dates", _pgh_attach_context(), NOW(), \'delete\', OLD."id", OLD."posture", OLD."precedential_status", OLD."procedural_history", OLD."scdb_decision_direction", OLD."scdb_id", OLD."scdb_votes_majority", OLD."scdb_votes_minority", OLD."slug", OLD."source", OLD."summary", OLD."syllabus"); RETURN NULL;',
+                    hash="93725d0e8785d341973cd6af46aa9b3e9aca1ec2",
+                    operation="DELETE",
+                    pgid="pgtrigger_delete_delete_a8516",
+                    table="search_opinioncluster",
+                    when="AFTER",
+                ),
+            ),
+        ),
+    ]
diff --git a/cl/search/migrations/0037_add_harvard_id_to_opinioncluster.sql b/cl/search/migrations/0037_add_harvard_id_to_opinioncluster.sql
@@ -0,0 +1,8 @@
+-- Adds the "harvard_id" column to the cluster table and its event table,
+-- then indexes it on the cluster table. All statements run in one transaction.
+BEGIN;
+-- The column is added as NOT NULL with a '0' default, and the default is
+-- dropped immediately afterwards.
+ALTER TABLE "search_opinioncluster" ADD COLUMN "harvard_id" varchar DEFAULT '0' NOT NULL;
+ALTER TABLE "search_opinioncluster" ALTER COLUMN "harvard_id" DROP DEFAULT;
+-- Same add-then-drop-default sequence for the event table.
+ALTER TABLE "search_opinionclusterevent" ADD COLUMN "harvard_id" varchar DEFAULT '0' NOT NULL;
+ALTER TABLE "search_opinionclusterevent" ALTER COLUMN "harvard_id" DROP DEFAULT;
+-- Two indexes on "search_opinioncluster" only: a default one and a
+-- varchar_pattern_ops one (the "_like" suffix suggests pattern-matching lookups).
+CREATE INDEX "search_opinioncluster_harvard_id_b7c3eb52" ON "search_opinioncluster" ("harvard_id");
+CREATE INDEX "search_opinioncluster_harvard_id_b7c3eb52_like" ON "search_opinioncluster" ("harvard_id" varchar_pattern_ops);
+COMMIT;
diff --git a/cl/search/migrations/0037_add_harvard_id_to_opinioncluster_customers.sql b/cl/search/migrations/0037_add_harvard_id_to_opinioncluster_customers.sql
@@ -0,0 +1,6 @@
+-- Adds and indexes "harvard_id" on "search_opinioncluster" only; this script
+-- contains no statements for "search_opinionclusterevent".
+BEGIN;
+-- The column is added as NOT NULL with a '0' default, and the default is
+-- dropped immediately afterwards.
+ALTER TABLE "search_opinioncluster" ADD COLUMN "harvard_id" varchar DEFAULT '0' NOT NULL;
+ALTER TABLE "search_opinioncluster" ALTER COLUMN "harvard_id" DROP DEFAULT;
+-- A default index plus a varchar_pattern_ops index on the new column.
+CREATE INDEX "search_opinioncluster_harvard_id_b7c3eb52" ON "search_opinioncluster" ("harvard_id");
+CREATE INDEX "search_opinioncluster_harvard_id_b7c3eb52_like" ON "search_opinioncluster" ("harvard_id" varchar_pattern_ops);
+COMMIT;
diff --git a/cl/search/models.py b/cl/search/models.py
@@ -2731,6 +2731,11 @@ class OpinionCluster(AbstractDateTimeModel):
         storage=IncrementingAWSMediaStorage(),
         blank=True,
     )
+    harvard_id = models.CharField(
+        help_text="The ID of the item in the Caselaw Access Project (Harvard)",
+        default=0,
+        db_index=True,
+    )
     arguments = models.TextField(
         help_text="The attorney(s) and legal arguments presented as HTML text. "
         "This is primarily seen in older opinions and can contain "

diff --git a/cl/search/tests/test_import_harvard_pdfs.py b/cl/search/tests/test_import_harvard_pdfs.py
@@ -119,3 +119,68 @@ def test_import_harvard_pdfs(
         self.assertEqual(
             self.cluster.filepath_pdf_harvard, "mocked_saved_path.pdf"
         )
+
+    # patch decorators are applied bottom-up, so the mock arguments below are
+    # listed in the reverse order of the decorators.
+    @patch("cl.search.management.commands.import_harvard_pdfs.tqdm")
+    @patch(
+        "cl.search.management.commands.import_harvard_pdfs.OpinionCluster.objects.get"
+    )
+    @patch(
+        "cl.search.management.commands.import_harvard_pdfs.HarvardPDFStorage"
+    )
+    @patch("cl.search.management.commands.import_harvard_pdfs.boto3.client")
+    @patch("cl.search.management.commands.import_harvard_pdfs.os.listdir")
+    @patch("cl.search.management.commands.import_harvard_pdfs.os.path.exists")
+    def test_assign_harvard_id(
+        self,
+        mock_exists,
+        mock_listdir,
+        mock_boto3_client,
+        mock_harvard_storage,
+        mock_opinion_cluster_get,
+        mock_tqdm,
+    ):
+        """Check that the ``assign_cap_id`` job sets ``harvard_id``.
+
+        Runs ``import_harvard_pdfs`` with ``job="assign_cap_id"`` against a
+        mocked crosswalk file holding one entry (``cap_case_id`` 1 mapped to
+        ``self.cluster``) and asserts that, after reloading from the
+        database, the cluster's ``harvard_id`` is the string ``"1"``.
+        Directory listing, path checks, boto3, the PDF storage class, tqdm,
+        ``OpinionCluster.objects.get`` and ``open`` are all mocked.
+        """
+        # Setup mocks
+        mock_listdir.return_value = ["test_crosswalk.json"]
+        # Only the crosswalk directory itself is reported as existing.
+        mock_exists.side_effect = lambda path: path in [
+            "/mocked_path/crosswalk_dir"
+        ]
+
+        mock_s3 = MagicMock()
+        mock_boto3_client.return_value = mock_s3
+        mock_storage = MagicMock()
+        mock_harvard_storage.return_value = mock_storage
+        # Any lookup through the patched objects.get returns the test cluster.
+        mock_opinion_cluster_get.return_value = self.cluster
+        mock_tqdm.side_effect = (
+            lambda x, *args, **kwargs: x
+        )  # Make tqdm a pass-through function
+
+        # Single crosswalk entry; cap_case_id is an int here, while the final
+        # assertion expects the string "1".
+        crosswalk_data = [
+            {
+                "cap_case_id": 1,
+                "cl_cluster_id": self.cluster.id,
+                "cap_path": "/test/path.json",
+            }
+        ]
+
+        # Mock file operations
+        m = mock_open(read_data=json.dumps(crosswalk_data))
+
+        # Mock crosswalk_dir
+        crosswalk_dir = "/mocked_path/crosswalk_dir"
+
+        # Verify crosswalk_dir exists
+        # NOTE(review): os.path.exists is patched by the decorator above, so
+        # this appears to exercise the mock's side_effect rather than the real
+        # filesystem -- confirm this check is still wanted.
+        self.assertTrue(
+            os.path.exists(crosswalk_dir),
+            f"Crosswalk directory does not exist: {crosswalk_dir}",
+        )
+
+        # open() is replaced so reads return the JSON crosswalk data above.
+        with patch("builtins.open", m):
+            call_command(
+                "import_harvard_pdfs",
+                crosswalk_dir=crosswalk_dir,
+                job="assign_cap_id",
+            )
+
+        # Verify that the cluster's harvard_id field was updated
+        self.cluster.refresh_from_db()
+        self.assertEqual(self.cluster.harvard_id, "1")