feat(scrapers.update_from_text): new command #4520

Open · wants to merge 8 commits into main
36 changes: 36 additions & 0 deletions cl/lib/command_utils.py
@@ -3,6 +3,8 @@

from django.core.management import BaseCommand, CommandError

from cl.lib.juriscraper_utils import get_module_by_court_id

logger = logging.getLogger(__name__)


@@ -22,6 +24,40 @@ def handle(self, *args, **options):
juriscraper_logger.setLevel(logging.DEBUG)


class ScraperCommand(VerboseCommand):
"""Base class for cl.scrapers commands that use Juriscraper

Implements the `--courts` argument used to look up a Site object
"""

# Passed to `get_module_by_court_id`; must be set by inheriting classes
juriscraper_module_type = ""

def add_arguments(self, parser):
parser.add_argument(
"--courts",
dest="court_id",
metavar="COURTID",
type=lambda s: (
s
if "." in s
else get_module_by_court_id(s, self.juriscraper_module_type)
),
required=True,
help=(
"The court(s) to scrape and extract. One of: "
"1. a Python module or package import path from the Juriscraper "
"library, e.g. 'juriscraper.opinions.united_states.federal_appellate.ca1' "
"or simply 'juriscraper.opinions' to do all opinions. "
"2. a court_id, used to look up the full module path. "
"An error will be raised if the court_id matches more than "
"one module path; in that case, use the full path."
),
)

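For illustration, the `type` callable above passes full module paths through untouched and only runs the lookup for bare court ids. A minimal sketch of the two cases (the ca1 path comes from the help text; `get_module_by_court_id` is added below in cl/lib/juriscraper_utils.py):

from cl.lib.juriscraper_utils import get_module_by_court_id

resolve = lambda s: (
    s if "." in s else get_module_by_court_id(s, "opinions")
)
# A dotted path is returned unchanged:
resolve("juriscraper.opinions.united_states.federal_appellate.ca1")
# A bare court_id is resolved to its full module path:
resolve("ca1")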

class CommandUtils:
"""A mixin to give some useful methods to sub classes."""

52 changes: 49 additions & 3 deletions cl/lib/juriscraper_utils.py
@@ -5,6 +5,12 @@
import juriscraper


def walk_juriscraper():
return pkgutil.walk_packages(
juriscraper.__path__, f"{juriscraper.__name__}."
)

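A note on this helper: `pkgutil.walk_packages` yields `(finder, name, ispkg)` tuples for every package and module under `juriscraper`, with `name` as a dotted path. Full scraper modules sit four dots deep, which is what the `count(".") != 4` check in `get_module_by_court_id` below relies on. A quick sketch:

for _, name, ispkg in walk_juriscraper():
    # name is e.g. "juriscraper.opinions" (a package) or
    # "juriscraper.opinions.united_states.federal_appellate.ca1" (a module)
    print(name, ispkg)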

def get_scraper_object_by_name(court_id: str, juriscraper_module: str = ""):
"""Identify and instantiate a Site() object given the name of a court

@@ -25,9 +31,7 @@ def get_scraper_object_by_name(court_id: str, juriscraper_module: str = ""):

return importlib.import_module(juriscraper_module).Site()

for _, full_module_path, _ in pkgutil.walk_packages(
juriscraper.__path__, f"{juriscraper.__name__}."
):
for _, full_module_path, _ in walk_juriscraper():
# Get the module name from the full path and trim
# any suffixes like _p, _u
module_name = full_module_path.rsplit(".", 1)[1].rsplit("_", 1)[0]
@@ -42,3 +46,45 @@ def get_scraper_object_by_name(court_id: str, juriscraper_module: str = ""):
# has been stripped off it. In any case, just ignore it when
# this happens.
continue


def get_module_by_court_id(court_id: str, module_type: str):
"""Given a `court_id` return a juriscraper module path

Some court_ids match multiple scraper modules, which forces the user
to use the full module path. For example, the modules "lactapp_1" and
"lactapp_5" share the court_id "lactapp", but scrape totally different
sites, and their Site objects are expected to have different
`extract_from_text` behavior

:param court_id: court id to look for
:param module_type: 'opinions' or 'oral_args'. Without this, some
court_ids could match both classes of scrapers

:raises: ValueError if there is no match or more than one match
:return: the full module path
"""
if module_type not in ["opinions", "oral_args"]:
raise ValueError(
"module_type has to be one of ['opinions', 'oral_args']"
)

matches = []
for _, module_string, _ in walk_juriscraper():
if module_string.count(".") != 4 or module_type not in module_string:
# Skip package folders and lib modules; skip the other module type
continue

module_court_id = module_string.rsplit(".", 1)[1].rsplit("_", 1)[0]
if module_court_id == court_id:
matches.append(module_string)

if len(matches) == 1:
return matches[0]
elif len(matches) == 0:
raise ValueError(f"'{court_id}' doesn't match any juriscraper module")
else:
raise ValueError(
f"'{court_id}' matches more than 1 juriscraper module. "
f"Use a full module path. Matches: '{matches}'"
)
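A usage sketch of the lookup and its failure modes, using the court ids from the docstring above (the exact ca1 return value assumes the standard Juriscraper layout):

get_module_by_court_id("ca1", "opinions")
# -> "juriscraper.opinions.united_states.federal_appellate.ca1"

get_module_by_court_id("lactapp", "opinions")
# raises ValueError: matched by both lactapp_1 and lactapp_5

get_module_by_court_id("ca1", "citations")
# raises ValueError: module_type has to be one of ['opinions', 'oral_args']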
@@ -24,6 +24,7 @@

class Command(cl_back_scrape_opinions.Command):
scrape_target_descr = "citations"
juriscraper_module_type = "opinions"

def scrape_court(
self,
20 changes: 4 additions & 16 deletions cl/scrapers/management/commands/cl_scrape_opinions.py
@@ -18,7 +18,7 @@

from cl.alerts.models import RealTimeQueue
from cl.citations.utils import map_reporter_db_cite_type
from cl.lib.command_utils import VerboseCommand, logger
from cl.lib.command_utils import ScraperCommand, logger
from cl.lib.crypto import sha1
from cl.lib.string_utils import trunc
from cl.people_db.lookup_utils import lookup_judges_by_messy_str
@@ -217,14 +217,16 @@ def save_everything(
)


class Command(VerboseCommand):
class Command(ScraperCommand):
help = "Runs the Juriscraper toolkit against one or many jurisdictions."
juriscraper_module_type = "opinions"
scrape_target_descr = "opinions" # for logging purposes

def __init__(self, stdout=None, stderr=None, no_color=False):
super().__init__(stdout=None, stderr=None, no_color=False)

def add_arguments(self, parser):
super().add_arguments(parser)
parser.add_argument(
"--daemon",
action="store_true",
@@ -246,20 +248,6 @@ def add_arguments(self, parser):
"is 30 minutes."
),
)
parser.add_argument(
"--courts",
type=str,
dest="court_id",
metavar="COURTID",
required=True,
help=(
"The court(s) to scrape and extract. This should be "
"in the form of a python module or package import "
"from the Juriscraper library, e.g. "
'"juriscraper.opinions.united_states.federal_appellate.ca1" '
'or simply "opinions" to do all opinions.'
),
)
parser.add_argument(
"--fullcrawl",
dest="full_crawl",
@@ -107,6 +107,7 @@ def make_objects(

class Command(cl_scrape_opinions.Command):
scrape_target_descr = "oral arguments"
juriscraper_module_type = "oral_args"

def ingest_a_case(
self,
198 changes: 198 additions & 0 deletions cl/scrapers/management/commands/update_from_text.py
@@ -0,0 +1,198 @@
import traceback
from datetime import datetime

from django.db import transaction

from cl.lib.command_utils import ScraperCommand, logger
from cl.scrapers.tasks import update_document_from_text
from cl.search.models import PRECEDENTIAL_STATUS, Opinion, OpinionCluster


def rerun_extract_from_text(
opinion: Opinion, juriscraper_module: str, stats: dict[str, int]
):
"""
Reruns `update_document_from_text` from the scraper flow, saving changes

`update_document_from_text` calls `Site.extract_from_text` and assigns
any changes to the proper objects, in place, but does not save them.
This method saves the objects that actually changed

:param opinion: the Opinion on which to apply extract_from_text
:param juriscraper_module: the scraper module path
:param stats: dict to accumulate counts for reporting. Modified in place

:return: None
"""
if not opinion.plain_text and not opinion.html:
# May be an opinion entirely from a merged corpus
# or an error during text extraction
logger.info(
"Opinion %s has no `plain_text` or `html` to extract from",
opinion.id,
)
stats["No text to extract from"] += 1
return

with transaction.atomic():
try:
changes = update_document_from_text(opinion, juriscraper_module)
except Exception:
# Probably a bad implementation of `extract_from_text`
logger.debug(
"`update_document_from_text` failed for opinion %s. Traceback: %s",
opinion.id,
traceback.format_exc(),
)
stats["Error"] += 1
return

if not changes:
logger.info("Did not get any metadata for opinion %s", opinion.id)
stats["No metadata extracted"] += 1
return

logger.info("Processing opinion %s", opinion.id)

# Check if changes exist before saving, to prevent unnecessary DB queries
if changes.get("Docket"):
opinion.cluster.docket.save()
logger.debug(
"Docket %s updated with data %s",
opinion.cluster.docket.id,
changes["Docket"],
)
stats["Docket"] += 1

if changes.get("OpinionCluster"):
opinion.cluster.save()
logger.debug(
"OpinionCluster %s updated with data %s",
opinion.cluster.id,
changes["OpinionCluster"],
)
stats["OpinionCluster"] += 1

if changes.get("Opinion"):
opinion.save()
logger.debug("Opinion updated with data %s", changes["Opinion"])
stats["Opinion"] += 1

if changes.get("Citation"):
if changes["Citation"].get("citation_created"):
logger.info(
"Citation created with data %s", changes["Citation"]
)
stats["Citation"] += 1
else:
logger.debug(
"Citation not created. Data %s", changes["Citation"]
)

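For reference, a hypothetical `changes` mapping as consumed above. The four top-level keys and the `citation_created` flag are the ones the function branches on; the field payloads are made-up examples, since `update_document_from_text` determines the real ones:

changes = {
    "Docket": {"docket_number": "23-1234"},  # hypothetical payload
    "OpinionCluster": {"date_filed": "2024-05-01"},  # hypothetical payload
    "Opinion": {"author_str": "Doe"},  # hypothetical payload
    "Citation": {"citation_created": True},  # flag checked above
}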

class Command(ScraperCommand):
help = """Updates objects by running Site.extract_from_text
over extracted content found on Opinion.plain_text or Opinion.html.

If `--opinion-ids` is used, other filters will be ignored.
If not, both date filters are required, to prevent unintentionally
reprocessing a whole court's dataset

Recommended use is to run over a sample of the target time period
and check that updates to Docket, OpinionCluster, Opinion and
Citation are as expected
"""
# For aggregate reporting at the end of the command
stats = {
"Docket": 0,
"OpinionCluster": 0,
"Opinion": 0,
"Citation": 0,
"No text to extract from": 0,
"No metadata extracted": 0,
"Error": 0,
}
juriscraper_module_type = "opinions"

def add_arguments(self, parser):
super().add_arguments(parser)
parser.add_argument(
"--opinion-ids",
nargs="+",
type=int,
help="""The Opinion ids to re-process.
May be more than one. If this argument is used,
other filters will be ignored""",
)
parser.add_argument(
"--date-filed-gte",
default="",
type=self.parse_input_date,
help=r"""A filter value in %Y-%m-%d or %Y/%m/%d format.
OpinionCluster.date_filed must be greater than or equal to this value""",
)
parser.add_argument(
"--date-filed-lte",
default="",
type=self.parse_input_date,
help=r"""A filter value in %Y-%m-%d or %Y/%m/%d format.
OpinionCluster.date_filed must be less than or equal to this value""",
)
parser.add_argument(
"--cluster-status",
default="",
choices=[value for value, name in PRECEDENTIAL_STATUS.NAMES],
help="""A value of OpinionCluster.precedential_status. To be
used for filtering the Opinions to be processed
""",
)

def handle(self, *args, **options):
super().handle(*args, **options)
juriscraper_module = options["court_id"]

if options["opinion_ids"]:
opinions = Opinion.objects.filter(id__in=options["opinion_ids"])
for op in opinions:
rerun_extract_from_text(op, juriscraper_module, self.stats)

logger.info("Modified objects counts: %s", self.stats)
return

if not (options["date_filed_gte"] and options["date_filed_lte"]):
raise ValueError(
"Both `--date-filed-gte` and `--date-filed-lte` must be provided"
)

court_id = juriscraper_module.split(".")[-1].split("_")[0]
query = {
"docket__court_id": court_id,
"date_filed__gte": options["date_filed_gte"],
"date_filed__lte": options["date_filed_lte"],
}

if options["cluster_status"]:
query["precedential_status"] = options["cluster_status"]

qs = OpinionCluster.objects.filter(**query).prefetch_related(
"sub_opinions"
)
for cluster in qs:
opinions = cluster.sub_opinions.all()
for op in opinions:
rerun_extract_from_text(op, juriscraper_module, self.stats)

logger.info("Modified objects counts: %s", self.stats)

def parse_input_date(self, date_string: str) -> datetime | str:
"""Parses a date string in accepted formats

:param date_string: the date string in "%Y/%m/%d" or "%Y-%m-%d"
:return: an empty string if the input was empty; or the parsed datetime
"""
parsed_date = ""
if "/" in date_string:
parsed_date = datetime.strptime(date_string, "%Y/%m/%d")
elif "-" in date_string:
parsed_date = datetime.strptime(date_string, "%Y-%m-%d")
return parsed_date
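Finally, a minimal usage sketch for the new command, equivalent to running it through manage.py. Option names come from `add_arguments` above; the `--courts` value goes through the court_id lookup added in cl/lib/command_utils.py, and "Published" is assumed to be a valid `precedential_status` choice:

from django.core.management import call_command

# Reprocess a six-month sample for one court; both date filters are
# required whenever --opinion-ids is not given.
call_command(
    "update_from_text",
    "--courts", "ca1",
    "--date-filed-gte", "2024/01/01",
    "--date-filed-lte", "2024/06/30",
    "--cluster-status", "Published",
)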