feat(scrapers.update_from_text): refactor from code review

- validate citation objects from `Site.extract_from_text`. Add tests for this - abstract --courts required argument for scrapers into ScraperCommand class also, made it more flexible - refactor cl_scrape_opinions; cl_scrape_oral_arguments to account for this - delete cl.scrapers.utils.extract_recap_documents which was generating a circular import. This function was not used anywhere
freelawproject · Oct 28, 2024 · d39abee · d39abee
1 parent 5baa691
commit d39abee
Show file tree

Hide file tree

Showing 9 changed files with 214 additions and 105 deletions.
diff --git a/cl/lib/command_utils.py b/cl/lib/command_utils.py
@@ -3,6 +3,8 @@
 
 from django.core.management import BaseCommand, CommandError
 
+from cl.lib.juriscraper_utils import get_module_by_court_id
+
 logger = logging.getLogger(__name__)
 
 
@@ -22,6 +24,40 @@ def handle(self, *args, **options):
             juriscraper_logger.setLevel(logging.DEBUG)
 
 
+class ScraperCommand(VerboseCommand):
+    """Base class for cl.scrapers commands that use Juriscraper
+
+    Implements the `--courts` argument to lookup for a Site object
+    """
+
+    # To be used on get_module_by_court_id
+    # Defined by inheriting classes
+    juriscraper_module_type = ""
+
+    def add_arguments(self, parser):
+        parser.add_argument(
+            "--courts",
+            dest="court_id",
+            metavar="COURTID",
+            type=lambda s: (
+                s
+                if "." in s
+                else get_module_by_court_id(s, self.juriscraper_module_type)
+            ),
+            required=True,
+            help=(
+                "The court(s) to scrape and extract. One of: "
+                "1. a python module or package import from the Juriscraper library, e.g."
+                "'juriscraper.opinions.united_states.federal_appellate.ca1' "
+                "or simply 'juriscraper.opinions' to do all opinions."
+                ""
+                "2. a court_id, to be used to lookup for a full module path"
+                "An error will be raised if the `court_id` matches more than "
+                "one module path. In that case, use the full path"
+            ),
+        )
+
+
 class CommandUtils:
     """A mixin to give some useful methods to sub classes."""
 

diff --git a/cl/lib/juriscraper_utils.py b/cl/lib/juriscraper_utils.py
@@ -5,6 +5,12 @@
 import juriscraper
 
 
+def walk_juriscraper():
+    return pkgutil.walk_packages(
+        juriscraper.__path__, f"{juriscraper.__name__}."
+    )
+
+
 def get_scraper_object_by_name(court_id: str, juriscraper_module: str = ""):
     """Identify and instantiate a Site() object given the name of a court
 
@@ -25,9 +31,7 @@ def get_scraper_object_by_name(court_id: str, juriscraper_module: str = ""):
 
         return importlib.import_module(juriscraper_module).Site()
 
-    for _, full_module_path, _ in pkgutil.walk_packages(
-        juriscraper.__path__, f"{juriscraper.__name__}."
-    ):
+    for _, full_module_path, _ in walk_juriscraper():
         # Get the module name from the full path and trim
         # any suffixes like _p, _u
         module_name = full_module_path.rsplit(".", 1)[1].rsplit("_", 1)[0]
@@ -42,3 +46,45 @@ def get_scraper_object_by_name(court_id: str, juriscraper_module: str = ""):
                 # has been stripped off it. In any case, just ignore it when
                 # this happens.
                 continue
+
+
+def get_module_by_court_id(court_id: str, module_type: str):
+    """Given a `court_id` return a juriscraper module path
+
+    Some court_ids match multiple scraper files. These will force the user
+    to use the full module path. For example, "lactapp_1" and "lactapp_5"
+    match the same `court_id`, but scrape totally different sites, and
+    their Site objects are expected to have different `extract_from_text`
+    behavior
+
+    :param court_id: court id to look for
+    :param module_type: 'opinions' or 'oral_args'. Without this, some
+        court_ids may match the 2 classes of scrapers
+
+    :raises: ValueError if there is no match or there is more than 1 match
+    :return: the full module path
+    """
+    if module_type not in ["opinions", "oral_args"]:
+        raise ValueError(
+            "module_type has to be one of ['opinions', 'oral_args']"
+        )
+
+    matches = []
+    for _, module_string, _ in walk_juriscraper():
+        if module_string.count(".") != 4 or module_type not in module_string:
+            # Skip folder and lib modules. Skip type
+            continue
+
+        module_court_id = module_string.rsplit(".", 1)[1].rsplit("_", 1)[0]
+        if module_court_id == court_id:
+            matches.append(module_string)
+
+    if len(matches) == 1:
+        return matches[0]
+    elif len(matches) == 0:
+        raise ValueError(f"'{court_id}' doesn't match any juriscraper module")
+    else:
+        raise ValueError(
+            f"'{court_id}' matches more than 1 juriscraper module."
+            f"Use a full module path. Matches: '{matches}'"
+        )
diff --git a/cl/scrapers/management/commands/cl_back_scrape_citations.py b/cl/scrapers/management/commands/cl_back_scrape_citations.py
@@ -24,6 +24,7 @@
 
 class Command(cl_back_scrape_opinions.Command):
     scrape_target_descr = "citations"
+    juriscraper_module_type = "opinions"
 
     def scrape_court(
         self,

diff --git a/cl/scrapers/management/commands/cl_scrape_opinions.py b/cl/scrapers/management/commands/cl_scrape_opinions.py
@@ -18,7 +18,7 @@
 
 from cl.alerts.models import RealTimeQueue
 from cl.citations.utils import map_reporter_db_cite_type
-from cl.lib.command_utils import VerboseCommand, logger
+from cl.lib.command_utils import ScraperCommand, logger
 from cl.lib.crypto import sha1
 from cl.lib.string_utils import trunc
 from cl.people_db.lookup_utils import lookup_judges_by_messy_str
@@ -217,14 +217,16 @@ def save_everything(
         )
 
 
-class Command(VerboseCommand):
+class Command(ScraperCommand):
     help = "Runs the Juriscraper toolkit against one or many jurisdictions."
+    juriscraper_module_type = "opinions"
     scrape_target_descr = "opinions"  # for logging purposes
 
     def __init__(self, stdout=None, stderr=None, no_color=False):
         super().__init__(stdout=None, stderr=None, no_color=False)
 
     def add_arguments(self, parser):
+        super().add_arguments(parser)
         parser.add_argument(
             "--daemon",
             action="store_true",
@@ -246,20 +248,6 @@ def add_arguments(self, parser):
                 "is 30 minutes."
             ),
         )
-        parser.add_argument(
-            "--courts",
-            type=str,
-            dest="court_id",
-            metavar="COURTID",
-            required=True,
-            help=(
-                "The court(s) to scrape and extract. This should be "
-                "in the form of a python module or package import "
-                "from the Juriscraper library, e.g. "
-                '"juriscraper.opinions.united_states.federal_appellate.ca1" '
-                'or simply "opinions" to do all opinions.'
-            ),
-        )
         parser.add_argument(
             "--fullcrawl",
             dest="full_crawl",

diff --git a/cl/scrapers/management/commands/cl_scrape_oral_arguments.py b/cl/scrapers/management/commands/cl_scrape_oral_arguments.py
@@ -107,6 +107,7 @@ def make_objects(
 
 class Command(cl_scrape_opinions.Command):
     scrape_target_descr = "oral arguments"
+    juriscraper_module_type = "oral_args"
 
     def ingest_a_case(
         self,

diff --git a/cl/scrapers/management/commands/update_from_text.py b/cl/scrapers/management/commands/update_from_text.py
@@ -1,8 +1,9 @@
+import traceback
 from datetime import datetime
 
 from django.db import transaction
 
-from cl.lib.command_utils import VerboseCommand, logger
+from cl.lib.command_utils import ScraperCommand, logger
 from cl.scrapers.tasks import update_document_from_text
 from cl.search.models import PRECEDENTIAL_STATUS, Opinion, OpinionCluster
 
@@ -23,15 +24,37 @@ def rerun_extract_from_text(
 
     :return None
     """
+    if not opinion.plain_text and not opinion.html:
+        # May be an opinion entirely from a merged corpus
+        # or an error during text extraction
+        logger.info(
+            "Opinion %s has no `plain_text` or `html` to extract from",
+            opinion.id,
+        )
+        stats["No text to extract from"] += 1
+        return
+
     with transaction.atomic():
-        changes = update_document_from_text(opinion, juriscraper_module)
+        try:
+            changes = update_document_from_text(opinion, juriscraper_module)
+        except:
+            # Probably a bad implementation of `extract_from_text`
+            logger.debug(
+                "`update_document_from_text` failed for opinion %s. Traceback: %s",
+                opinion.id,
+                traceback.format_exc(),
+            )
+            stats["Error"] += 1
+            return
+
         if not changes:
             logger.info("Did not get any metadata for opinion %s", opinion.id)
+            stats["No metadata extracted"] += 1
             return
 
         logger.info("Processing opinion %s", opinion.id)
 
-        # Check if changes exist before saving, to prevent unecessary DB queries
+        # Check if changes exist before saving, to prevent unnecessary DB queries
         if changes.get("Docket"):
             opinion.cluster.docket.save()
             logger.debug(
@@ -67,7 +90,7 @@ def rerun_extract_from_text(
                 )
 
 
-class Command(VerboseCommand):
+class Command(ScraperCommand):
     help = """Updates objects by running Site.extract_from_text
     over extracted content found on Opinion.plain_text or Opinion.html.
 
@@ -79,18 +102,20 @@ class Command(VerboseCommand):
     and check if updates over Docket, OpinionCluster, Opinion and
     Citation are as expected
     """
-    stats = {}  # assigned at the end of a command run, for testing
+    # For aggregate reporting at the end of the command
+    stats = {
+        "Docket": 0,
+        "OpinionCluster": 0,
+        "Opinion": 0,
+        "Citation": 0,
+        "No text to extract from": 0,
+        "No metadata extracted": 0,
+        "Error": 0,
+    }
+    juriscraper_module_type = "opinions"
 
     def add_arguments(self, parser):
-        parser.add_argument(
-            "--juriscraper-module",
-            help="""The Juriscraper file which contains the
-            `extract_from_text` method to be used. The `court_id`
-            will be deduced from this. Example:
-            juriscraper.opinions.united_states.federal_appellate.ca1
-            """,
-            required=True,
-        )
+        super().add_arguments(parser)
         parser.add_argument(
             "--opinion-ids",
             nargs="+",
@@ -100,15 +125,17 @@ def add_arguments(self, parser):
             other filters will be ignored""",
         )
         parser.add_argument(
-            "date-filed-gte",
+            "--date-filed-gte",
             default="",
-            help=r"""A filter value in %Y/%m/%d format.
+            type=self.parse_input_date,
+            help=r"""A filter value in %Y-%m-%d or %Y/%m/%d format.
             OpinionCluster.date_filed will have to be greater or equal""",
         )
         parser.add_argument(
-            "date-filed-lte",
+            "--date-filed-lte",
             default="",
-            help=r"""A filter value in %Y/%m/%d format.
+            type=self.parse_input_date,
+            help=r"""A filter value in %Y-%m-%d or %Y/%m/%d format.
             OpinionCluster.date_filed will have to be less or equal""",
         )
         parser.add_argument(
@@ -122,16 +149,14 @@ def add_arguments(self, parser):
 
     def handle(self, *args, **options):
         super().handle(*args, **options)
-        juriscraper_module = options["juriscraper_module"]
-        # For aggregate reporting
-        stats = {"Docket": 0, "OpinionCluster": 0, "Opinion": 0, "Citation": 0}
+        juriscraper_module = options["court_id"]
 
         if options["opinion_ids"]:
             opinions = Opinion.objects.filter(id__in=options["opinion_ids"])
             for op in opinions:
-                rerun_extract_from_text(op, juriscraper_module, stats)
+                rerun_extract_from_text(op, juriscraper_module, self.stats)
 
-            logger.info("Modified objects counts: %s", stats)
+            logger.info("Modified objects counts: %s", self.stats)
             return
 
         if not (options["date_filed_gte"] and options["date_filed_lte"]):
@@ -140,12 +165,10 @@ def handle(self, *args, **options):
             )
 
         court_id = juriscraper_module.split(".")[-1].split("_")[0]
-        gte_date = datetime.strptime(options["date_filed_gte"], "%Y/%m/%d")
-        lte_date = datetime.strptime(options["date_filed_lte"], "%Y/%m/%d")
         query = {
             "docket__court_id": court_id,
-            "date_filed__gte": gte_date,
-            "date_filed__lte": lte_date,
+            "date_filed__gte": options["date_filed_lte"],
+            "date_filed__lte": options["date_filed_gte"],
         }
 
         if options["cluster_status"]:
@@ -157,7 +180,19 @@ def handle(self, *args, **options):
         for cluster in qs:
             opinions = cluster.sub_opinions.all()
             for op in opinions:
-                rerun_extract_from_text(op, juriscraper_module, stats)
-
-        logger.info("Modified objects counts: %s", stats)
-        self.stats = stats
+                rerun_extract_from_text(op, juriscraper_module, self.stats)
+
+        logger.info("Modified objects counts: %s", self.stats)
+
+    def parse_input_date(self, date_string: str) -> datetime | str:
+        """Parses a date string in accepted formats
+
+        :param date_string: the date string in "%Y/%m/%d" or "%Y-%m-%d"
+        :return: an empty string if the input was empty; or the date object
+        """
+        parsed_date = ""
+        if "/" in date_string:
+            parsed_date = datetime.strptime(date_string, "%Y/%m/%d")
+        elif "-" in date_string:
+            parsed_date = datetime.strptime(date_string, "%Y-%m-%d")
+        return parsed_date
diff --git a/cl/scrapers/tasks.py b/cl/scrapers/tasks.py
@@ -30,6 +30,7 @@
 from cl.lib.string_utils import trunc
 from cl.lib.utils import is_iter
 from cl.recap.mergers import save_iquery_to_docket
+from cl.scrapers.utils import scraped_citation_object_is_valid
 from cl.search.models import Docket, Opinion, RECAPDocument
 
 logger = logging.getLogger(__name__)
@@ -71,8 +72,9 @@ def update_document_from_text(
             opinion.cluster.__dict__.update(data)
         elif model_name == "Citation":
             data["cluster_id"] = opinion.cluster_id
-            _, citation_created = ModelClass.objects.get_or_create(**data)
-            metadata_dict["Citation"]["created"] = citation_created
+            if scraped_citation_object_is_valid(data):
+                _, citation_created = ModelClass.objects.get_or_create(**data)
+                metadata_dict["Citation"]["created"] = citation_created
         elif model_name == "Opinion":
             opinion.__dict__.update(data)
         else: