From d871b4aae29ebad5be842df18e7bf93e85fec1fe Mon Sep 17 00:00:00 2001
From: Gianfranco Rossi
Date: Tue, 1 Oct 2024 11:09:28 -0500
Subject: [PATCH 1/3] feat(scrapers.update_from_text): new command

Helps solve: https://github.com/freelawproject/juriscraper/issues/858

- New command to re-run Site.extract_from_text over downloaded opinions
- Able to filter by Docket.court_id, OpinionCluster.date_filed and
  OpinionCluster.precedential_status
- Updates tasks.update_document_from_text to return information for
  logging purposes
- Updates test_opinion_scraper to get a Site.extract_from_text method
---
 .../management/commands/update_from_text.py   | 159 ++++++++++++++++++
 cl/scrapers/tasks.py                          |  12 +-
 .../test_assets/test_opinion_scraper.py      |  21 +++
 cl/scrapers/tests.py                          | 110 +++++++++++-
 4 files changed, 297 insertions(+), 5 deletions(-)
 create mode 100644 cl/scrapers/management/commands/update_from_text.py

diff --git a/cl/scrapers/management/commands/update_from_text.py b/cl/scrapers/management/commands/update_from_text.py
new file mode 100644
index 0000000000..77fe5966af
--- /dev/null
+++ b/cl/scrapers/management/commands/update_from_text.py
@@ -0,0 +1,159 @@
+from datetime import datetime
+
+from django.db import transaction
+
+from cl.lib.command_utils import VerboseCommand, logger
+from cl.scrapers.tasks import update_document_from_text
+from cl.search.models import PRECEDENTIAL_STATUS, Opinion, OpinionCluster
+
+
+def update_from_text(
+    opinion: Opinion, juriscraper_module: str, stats: dict[str, int]
+):
+    """Calls `update_document_from_text` as used in the scraper flow
+    and calls the corresponding model's .save()
+
+    :param opinion: the Opinion on which to apply extract_from_text
+    :param juriscraper_module: the scraper module path
+    :param stats: dict to accumulate counts for reporting. Modified in place
+
+    :return None
+    """
+    with transaction.atomic():
+        changes = update_document_from_text(opinion, juriscraper_module)
+        if not changes:
+            logger.info("Did not get any metadata for opinion %s", opinion.id)
+            return
+
+        logger.info("Processing opinion %s", opinion.id)
+
+        # Check if changes exist before saving, to prevent unecessary DB queries
+        if changes.get("Docket"):
+            opinion.cluster.docket.save()
+            logger.debug(
+                "Docket %s updated with data %s",
+                opinion.cluster.docket.id,
+                changes["Docket"],
+            )
+            stats["Docket"] += 1
+
+        if changes.get("OpinionCluster"):
+            opinion.cluster.save()
+            logger.debug(
+                "OpinionCluster %s updated with data %s",
+                opinion.cluster.id,
+                changes["OpinionCluster"],
+            )
+            stats["OpinionCluster"] += 1
+
+        if changes.get("Opinion"):
+            opinion.save()
+            logger.debug("Opinion updated with data %s", changes["Opinion"])
+            stats["Opinion"] += 1
+
+        if changes.get("Citation"):
+            if changes["Citation"].get("created"):
+                logger.info(
+                    "Citation created with data %s", changes["Citation"]
+                )
+                stats["Citation"] += 1
+            else:
+                logger.debug(
+                    "Citation not created. Data %s", changes["Citation"]
+                )
+
+
+class Command(VerboseCommand):
+    help = """Updates objects by running Site.extract_from_text
+    over extracted content found on Opinion.plain_text or Opinion.html.
+
+    If `--opinion-ids` is used, filters will be ignored.
+
+    If not, both date filters will be required, to prevent triggering
+    unwanted reprocessing of the whole court's dataset
+
+    Recommended use is to run over a sample of the target time period
+    and check if updates over Docket, OpinionCluster, Opinion and
+    Citation are as expected
+    """
+    stats = {}  # assigned at the end of a command run, for testing
+
+    def add_arguments(self, parser):
+        parser.add_argument(
+            "--juriscraper-module",
+            help="""The Juriscraper file which contains the
+            `extract_from_text` method to be used. The `court_id`
+            will be deduced from this. Example:
+            juriscraper.opinions.united_states.federal_appellate.ca1
+            """,
+            required=True,
+        )
+        parser.add_argument(
+            "--opinion-ids",
+            nargs="+",
+            type=int,
+            help="""The Opinion ids to re-process.
+            May be more than one. If this argument is used,
+            other filters will be ignored""",
+        )
+        parser.add_argument(
+            "date-filed-gte",
+            default="",
+            help=r"""A filter value in %Y/%m/%d format.
+            OpinionCluster.date_filed will have to be greater or equal""",
+        )
+        parser.add_argument(
+            "date-filed-lte",
+            default="",
+            help=r"""A filter value in %Y/%m/%d format.
+            OpinionCluster.date_filed will have to be less or equal""",
+        )
+        parser.add_argument(
+            "--cluster-status",
+            default="",
+            choices=[value for value, name in PRECEDENTIAL_STATUS.NAMES],
+            help="""A value of OpinionCluster.precedential_status. To be
+            used for filtering the Opinions to be processed
+            """,
+        )
+
+    def handle(self, *args, **options):
+        super().handle(*args, **options)
+        juriscraper_module = options["juriscraper_module"]
+        # For aggregate reporting
+        stats = {"Docket": 0, "OpinionCluster": 0, "Opinion": 0, "Citation": 0}
+
+        if options["opinion_ids"]:
+            opinions = Opinion.objects.filter(id__in=options["opinion_ids"])
+            for op in opinions:
+                update_from_text(op, juriscraper_module, stats)
+
+            logger.info("Modified objects counts: %s", stats)
+            return
+
+        if not (options["date_filed_gte"] and options["date_filed_lte"]):
+            raise ValueError(
+                "Both `date-filed-gte` and `date-filed-lte` arguments should have values"
+            )
+
+        court_id = juriscraper_module.split(".")[-1].split("_")[0]
+        gte_date = datetime.strptime(options["date_filed_gte"], "%Y/%m/%d")
+        lte_date = datetime.strptime(options["date_filed_lte"], "%Y/%m/%d")
+        query = {
+            "docket__court_id": court_id,
+            "date_filed__gte": gte_date,
+            "date_filed__lte": lte_date,
+        }
+
+        if options["cluster_status"]:
+            query["precedential_status"] = options["cluster_status"]
+
+        qs = OpinionCluster.objects.filter(**query).prefetch_related(
+            "sub_opinions"
+        )
+        for cluster in qs:
+            opinions = cluster.sub_opinions.all()
+            for op in opinions:
+                update_from_text(op, juriscraper_module, stats)
+
+        logger.info("Modified objects counts: %s", stats)
+        self.stats = stats
diff --git a/cl/scrapers/tasks.py b/cl/scrapers/tasks.py
index c60971c572..15500e94bb 100644
--- a/cl/scrapers/tasks.py
+++ b/cl/scrapers/tasks.py
@@ -39,7 +39,7 @@
 def update_document_from_text(
     opinion: Opinion, juriscraper_module: str = ""
-) -> None:
+) -> dict:
     """Extract additional metadata from document text
 
     We use this code with BIA decisions. Previously Tax.
@@ -54,12 +54,13 @@
 
     :param opinion: Opinion object
     :param juriscraper_module: full module to get Site object
-    :return: None
+    :return: the extracted data dictionary
     """
     court = opinion.cluster.docket.court.pk
     site = get_scraper_object_by_name(court, juriscraper_module)
     if site is None:
-        return
+        logger.debug("No site found %s", juriscraper_module)
+        return {}
 
     metadata_dict = site.extract_from_text(opinion.plain_text or opinion.html)
     for model_name, data in metadata_dict.items():
@@ -70,7 +71,8 @@
             opinion.cluster.__dict__.update(data)
         elif model_name == "Citation":
             data["cluster_id"] = opinion.cluster_id
-            ModelClass.objects.get_or_create(**data)
+            _, citation_created = ModelClass.objects.get_or_create(**data)
+            metadata_dict["Citation"]["created"] = citation_created
         elif model_name == "Opinion":
             opinion.__dict__.update(data)
         else:
@@ -78,6 +80,8 @@
                 f"Object type of {model_name} not yet supported."
             )
 
+    return metadata_dict
+
 
 @app.task(
     bind=True,
diff --git a/cl/scrapers/test_assets/test_opinion_scraper.py b/cl/scrapers/test_assets/test_opinion_scraper.py
index 508be0dfec..18a28d71de 100644
--- a/cl/scrapers/test_assets/test_opinion_scraper.py
+++ b/cl/scrapers/test_assets/test_opinion_scraper.py
@@ -1,3 +1,4 @@
+import re
 from datetime import datetime
 from os.path import join
@@ -53,3 +54,23 @@
     def _get_judges(self):
         path = "//judge/text()"
         return list(self.html.xpath(path))
+
+    def extract_from_text(self, scraped_text):
+        metadata = {}
+        docket_regex = r"Docket Number: (?P<docket>\d+-\d+)"
+        disposition_regex = r"Disposition: (?P<disposition>\w+)"
+        citation_regex = r"(?P<volume>20\d{2}) (?P<reporter>VT) (?P<page>\d+)"
+        if docket_match := re.search(docket_regex, scraped_text):
+            metadata["Docket"] = {
+                "docket_number": docket_match.group("docket")
+            }
+
+        if disposition_match := re.search(disposition_regex, scraped_text):
+            metadata["OpinionCluster"] = {
+                "disposition": disposition_match.group("disposition")
+            }
+
+        if citation_match := re.search(citation_regex, scraped_text):
+            metadata["Citation"] = {**citation_match.groupdict(), "type": 8}
+
+        return metadata
diff --git a/cl/scrapers/tests.py b/cl/scrapers/tests.py
index 375987426a..1d818d4f39 100644
--- a/cl/scrapers/tests.py
+++ b/cl/scrapers/tests.py
@@ -1,5 +1,5 @@
 import os
-from datetime import datetime, timedelta
+from datetime import date, datetime, timedelta
 from http import HTTPStatus
 from pathlib import Path
 from unittest import TestCase, mock
@@ -30,6 +30,7 @@
     cl_back_scrape_citations,
     cl_scrape_opinions,
     cl_scrape_oral_arguments,
+    update_from_text,
 )
 from cl.scrapers.models import UrlHash
 from cl.scrapers.tasks import extract_doc_content, process_audio_file
@@ -867,3 +868,110 @@
         self.assertEqual(
             docket, self.ca2_docket, "Should match using docket number core"
         )
+
+
+class UpdateFromTestCommandTest(TestCase):
+    """Test the input processing and DB querying for the command"""
+
+    def setUp(self):
+        self.vt = CourtFactory(id="vt")
+        self.sc = CourtFactory(id="sc")
+        self.docket_sc = DocketFactory(court=self.sc, docket_number="20")
+
+        # Different dates, status and courts to test command behaviour
+        self.opinion_2020 = OpinionFactory(
+            cluster=OpinionClusterFactory(
+                docket=DocketFactory(court=self.vt, docket_number="12"),
+                date_filed=date(2020, 6, 1),
+                precedential_status="Published",
+            ),
+            plain_text="""Docket Number: 2020-12
+            Disposition: Affirmed
+            2020 VT 11""",
+        )
+        self.opinion_2020_unpub = OpinionFactory(
+            cluster=OpinionClusterFactory(
+                docket=DocketFactory(court=self.vt, docket_number="13"),
+                date_filed=date(2020, 7, 1),
+                precedential_status="Unpublished",
+            ),
+            plain_text="Docket Number: 2020-13\nDisposition: Affirmed",
+        )
+
+        self.opinion_sc = OpinionFactory(
+            cluster=OpinionClusterFactory(
+                docket=self.docket_sc,
+                date_filed=date(2021, 6, 1),
+                precedential_status="Published",
+            ),
+            plain_text="Some text with no matches",
+            id=101,
+        )
+
+        self.opinion_2022 = OpinionFactory(
+            cluster=OpinionClusterFactory(
+                docket=DocketFactory(court=self.vt, docket_number="13"),
+                date_filed=date(2022, 6, 1),
+                precedential_status="Unpublished",
+            ),
+            id=100,
+            plain_text="Docket Number: 2022-13\n2022 VT 11",
+        )
+
+    def test_inputs(self):
+        """Do all command inputs work properly?"""
+
+        # will target a single opinion, for which extract_from_text
+        # extracts no metadata. No object should be updated
+        cmd = update_from_text.Command()
+        with mock.patch(
+            "cl.scrapers.tasks.get_scraper_object_by_name",
+            return_value=test_opinion_scraper.Site(),
+        ):
+            cmd.handle(juriscraper_module="somepath.sc", opinion_ids=[101])
+
+        self.assertFalse(
+            any(cmd.stats.values()), "No object should be modified"
+        )
+
+        # will target 1 opinion; there are 2 in the time period
+        # and 3 for the court
+        with mock.patch(
+            "cl.scrapers.tasks.get_scraper_object_by_name",
+            return_value=test_opinion_scraper.Site(),
+        ):
+            update_from_text.Command().handle(
+                juriscraper_module="somepath.vt",
+                opinion_ids=[],
+                date_filed_gte="2020/06/01",
+                date_filed_lte="2021/06/01",
+                cluster_status="Published",
+            )
+
+        # Test that objects were actually updated / created
+        self.assertEqual(
+            Citation.objects.filter(cluster=self.opinion_2020.cluster).count(),
+            1,
+            "There should be a single citation for this cluster",
+        )
+        self.opinion_2020.refresh_from_db()
+        self.opinion_2020.cluster.refresh_from_db()
+        self.opinion_2020.cluster.docket.refresh_from_db()
+        self.assertEqual(
+            self.opinion_2020.cluster.disposition,
+            "Affirmed",
+            "OpinionCluster.disposition was not updated",
+        )
+        self.assertEqual(
+            self.opinion_2020.cluster.docket.docket_number,
+            "2020-12",
+            "Docket.docket_number was not updated",
+        )
+
+        # Check that other objects in the time period and court
+        # were not modified, meaning the filter worked
+        self.assertEqual(
+            self.opinion_2020_unpub.cluster.docket.docket_number,
+            "13",
+            "Unpublished docket should not be modified",
+        )

From 5adce999146da5a1fe3dda1869073c836dffd68d Mon Sep 17 00:00:00 2001
From: Gianfranco Rossi
Date: Fri, 18 Oct 2024 14:05:44 -0500
Subject: [PATCH 2/3] refactor(scrapers.update_from_text): change function
 name and docstring

---
 .../management/commands/update_from_text.py | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/cl/scrapers/management/commands/update_from_text.py b/cl/scrapers/management/commands/update_from_text.py
index 77fe5966af..f1450c9c66 100644
--- a/cl/scrapers/management/commands/update_from_text.py
+++ b/cl/scrapers/management/commands/update_from_text.py
@@ -7,11 +7,15 @@
 from cl.search.models import PRECEDENTIAL_STATUS, Opinion, OpinionCluster
 
 
-def update_from_text(
+def rerun_extract_from_text(
     opinion: Opinion, juriscraper_module: str, stats: dict[str, int]
 ):
-    """Calls `update_document_from_text` as used in the scraper flow
-    and calls the corresponding model's .save()
+    """
+    Reruns `update_document_from_text` from the scraper flow, saving changes
+
+    `update_document_from_text` calls `Site.extract_from_text` and assigns
+    any changes to the proper objects, in place, but they are not saved.
+    This method saves the ones with actual changes
 
     :param opinion: the Opinion on which to apply extract_from_text
     :param juriscraper_module: the scraper module path
@@ -125,7 +129,7 @@
         if options["opinion_ids"]:
             opinions = Opinion.objects.filter(id__in=options["opinion_ids"])
             for op in opinions:
-                update_from_text(op, juriscraper_module, stats)
+                rerun_extract_from_text(op, juriscraper_module, stats)
 
             logger.info("Modified objects counts: %s", stats)
             return
@@ -153,7 +157,7 @@
         for cluster in qs:
             opinions = cluster.sub_opinions.all()
             for op in opinions:
-                update_from_text(op, juriscraper_module, stats)
+                rerun_extract_from_text(op, juriscraper_module, stats)
 
         logger.info("Modified objects counts: %s", stats)
         self.stats = stats

From 6a1fadb6cbfc0b03d82744658ff1d7e3bf2c3e5a Mon Sep 17 00:00:00 2001
From: Gianfranco Rossi
Date: Fri, 25 Oct 2024 21:09:56 -0500
Subject: [PATCH 3/3] feat(scrapers.update_from_text): refactor from code
 review

- validate citation objects from `Site.extract_from_text`. Add tests
  for this
- abstract the required `--courts` argument for scrapers into a
  ScraperCommand class; also made it more flexible
- refactor cl_scrape_opinions and cl_scrape_oral_arguments to account
  for this
- delete cl.scrapers.utils.extract_recap_documents, which was
  generating a circular import. This function was not used anywhere
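
Example usage (a sketch; assumes the standard Django manage.py entry
point, and that "vt" resolves to a single opinions scraper module):

    python manage.py update_from_text \
        --courts vt \
        --date-filed-gte 2020/01/01 \
        --date-filed-lte 2020/12/31 \
        --cluster-status Published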
---
 cl/lib/command_utils.py                       | 36 +++++++
 cl/lib/juriscraper_utils.py                   | 52 +++++++++-
 .../commands/cl_back_scrape_citations.py      |  1 +
 .../management/commands/cl_scrape_opinions.py | 20 +---
 .../commands/cl_scrape_oral_arguments.py      |  1 +
 .../management/commands/update_from_text.py   | 97 +++++++++++++------
 cl/scrapers/tasks.py                          |  6 +-
 cl/scrapers/tests.py                          | 27 +++++-
 cl/scrapers/utils.py                          | 83 ++++++----------
 9 files changed, 216 insertions(+), 107 deletions(-)

diff --git a/cl/lib/command_utils.py b/cl/lib/command_utils.py
index 2c3797f9f5..ee86463812 100644
--- a/cl/lib/command_utils.py
+++ b/cl/lib/command_utils.py
@@ -3,6 +3,8 @@
 
 from django.core.management import BaseCommand, CommandError
 
+from cl.lib.juriscraper_utils import get_module_by_court_id
+
 logger = logging.getLogger(__name__)
 
 
@@ -22,6 +24,40 @@
         juriscraper_logger.setLevel(logging.DEBUG)
 
 
+class ScraperCommand(VerboseCommand):
+    """Base class for cl.scrapers commands that use Juriscraper
+
+    Implements the `--courts` argument to look up a Site object
+    """
+
+    # To be used by get_module_by_court_id
+    # Defined by inheriting classes
+    juriscraper_module_type = ""
+
+    def add_arguments(self, parser):
+        parser.add_argument(
+            "--courts",
+            dest="court_id",
+            metavar="COURTID",
+            type=lambda s: (
+                s
+                if "." in s
+                else get_module_by_court_id(s, self.juriscraper_module_type)
+            ),
+            required=True,
+            help=(
+                "The court(s) to scrape and extract. One of: "
+                "1. a python module or package import from the Juriscraper "
+                "library, e.g. "
+                "'juriscraper.opinions.united_states.federal_appellate.ca1' "
+                "or simply 'juriscraper.opinions' to do all opinions; or "
+                "2. a court_id, used to look up the full module path. "
+                "An error will be raised if the `court_id` matches more than "
+                "one module path. In that case, use the full path"
+            ),
+        )
+
+
 class CommandUtils:
     """A mixin to give some useful methods to sub classes."""
 
diff --git a/cl/lib/juriscraper_utils.py b/cl/lib/juriscraper_utils.py
index ae8c090f41..2eb902352b 100644
--- a/cl/lib/juriscraper_utils.py
+++ b/cl/lib/juriscraper_utils.py
@@ -5,6 +5,12 @@
 import juriscraper
 
 
+def walk_juriscraper():
+    return pkgutil.walk_packages(
+        juriscraper.__path__, f"{juriscraper.__name__}."
+    )
+
+
 def get_scraper_object_by_name(court_id: str, juriscraper_module: str = ""):
     """Identify and instantiate a Site() object given the name of a court
 
@@ -25,9 +31,7 @@
 
         return importlib.import_module(juriscraper_module).Site()
 
-    for _, full_module_path, _ in pkgutil.walk_packages(
-        juriscraper.__path__, f"{juriscraper.__name__}."
-    ):
+    for _, full_module_path, _ in walk_juriscraper():
         # Get the module name from the full path and trim
         # any suffixes like _p, _u
         module_name = full_module_path.rsplit(".", 1)[1].rsplit("_", 1)[0]
@@ -42,3 +46,45 @@
         # has been stripped off it. In any case, just ignore it when
         # this happens.
         continue
+
+
+def get_module_by_court_id(court_id: str, module_type: str):
+    """Given a `court_id` return a juriscraper module path
+
+    Some court_ids match multiple scraper files. These will force the user
+    to use the full module path. For example, "lactapp_1" and "lactapp_5"
For example, "lactapp_1" and "lactapp_5" + match the same `court_id`, but scrape totally different sites, and + their Site objects are expected to have different `extract_from_text` + behavior + + :param court_id: court id to look for + :param module_type: 'opinions' or 'oral_args'. Without this, some + court_ids may match the 2 classes of scrapers + + :raises: ValueError if there is no match or there is more than 1 match + :return: the full module path + """ + if module_type not in ["opinions", "oral_args"]: + raise ValueError( + "module_type has to be one of ['opinions', 'oral_args']" + ) + + matches = [] + for _, module_string, _ in walk_juriscraper(): + if module_string.count(".") != 4 or module_type not in module_string: + # Skip folder and lib modules. Skip type + continue + + module_court_id = module_string.rsplit(".", 1)[1].rsplit("_", 1)[0] + if module_court_id == court_id: + matches.append(module_string) + + if len(matches) == 1: + return matches[0] + elif len(matches) == 0: + raise ValueError(f"'{court_id}' doesn't match any juriscraper module") + else: + raise ValueError( + f"'{court_id}' matches more than 1 juriscraper module." + f"Use a full module path. Matches: '{matches}'" + ) diff --git a/cl/scrapers/management/commands/cl_back_scrape_citations.py b/cl/scrapers/management/commands/cl_back_scrape_citations.py index b2da0a4581..a445df9438 100644 --- a/cl/scrapers/management/commands/cl_back_scrape_citations.py +++ b/cl/scrapers/management/commands/cl_back_scrape_citations.py @@ -24,6 +24,7 @@ class Command(cl_back_scrape_opinions.Command): scrape_target_descr = "citations" + juriscraper_module_type = "opinions" def scrape_court( self, diff --git a/cl/scrapers/management/commands/cl_scrape_opinions.py b/cl/scrapers/management/commands/cl_scrape_opinions.py index 67dac880ab..8fe42e893a 100644 --- a/cl/scrapers/management/commands/cl_scrape_opinions.py +++ b/cl/scrapers/management/commands/cl_scrape_opinions.py @@ -18,7 +18,7 @@ from cl.alerts.models import RealTimeQueue from cl.citations.utils import map_reporter_db_cite_type -from cl.lib.command_utils import VerboseCommand, logger +from cl.lib.command_utils import ScraperCommand, logger from cl.lib.crypto import sha1 from cl.lib.string_utils import trunc from cl.people_db.lookup_utils import lookup_judges_by_messy_str @@ -217,14 +217,16 @@ def save_everything( ) -class Command(VerboseCommand): +class Command(ScraperCommand): help = "Runs the Juriscraper toolkit against one or many jurisdictions." + juriscraper_module_type = "opinions" scrape_target_descr = "opinions" # for logging purposes def __init__(self, stdout=None, stderr=None, no_color=False): super().__init__(stdout=None, stderr=None, no_color=False) def add_arguments(self, parser): + super().add_arguments(parser) parser.add_argument( "--daemon", action="store_true", @@ -246,20 +248,6 @@ def add_arguments(self, parser): "is 30 minutes." ), ) - parser.add_argument( - "--courts", - type=str, - dest="court_id", - metavar="COURTID", - required=True, - help=( - "The court(s) to scrape and extract. This should be " - "in the form of a python module or package import " - "from the Juriscraper library, e.g. " - '"juriscraper.opinions.united_states.federal_appellate.ca1" ' - 'or simply "opinions" to do all opinions.' 
-            ),
-        )
         parser.add_argument(
             "--fullcrawl",
             dest="full_crawl",
diff --git a/cl/scrapers/management/commands/cl_scrape_oral_arguments.py b/cl/scrapers/management/commands/cl_scrape_oral_arguments.py
index ad284381f4..62377a98ec 100644
--- a/cl/scrapers/management/commands/cl_scrape_oral_arguments.py
+++ b/cl/scrapers/management/commands/cl_scrape_oral_arguments.py
@@ -107,6 +107,7 @@
 
 class Command(cl_scrape_opinions.Command):
     scrape_target_descr = "oral arguments"
+    juriscraper_module_type = "oral_args"
 
     def ingest_a_case(
         self,
diff --git a/cl/scrapers/management/commands/update_from_text.py b/cl/scrapers/management/commands/update_from_text.py
index f1450c9c66..0c7da06ef3 100644
--- a/cl/scrapers/management/commands/update_from_text.py
+++ b/cl/scrapers/management/commands/update_from_text.py
@@ -1,8 +1,9 @@
+import traceback
 from datetime import datetime
 
 from django.db import transaction
 
-from cl.lib.command_utils import VerboseCommand, logger
+from cl.lib.command_utils import ScraperCommand, logger
 from cl.scrapers.tasks import update_document_from_text
 from cl.search.models import PRECEDENTIAL_STATUS, Opinion, OpinionCluster
 
@@ -23,15 +24,37 @@
 
     :return None
     """
+    if not opinion.plain_text and not opinion.html:
+        # May be an opinion entirely from a merged corpus
+        # or an error during text extraction
+        logger.info(
+            "Opinion %s has no `plain_text` or `html` to extract from",
+            opinion.id,
+        )
+        stats["No text to extract from"] += 1
+        return
+
     with transaction.atomic():
-        changes = update_document_from_text(opinion, juriscraper_module)
+        try:
+            changes = update_document_from_text(opinion, juriscraper_module)
+        except Exception:
+            # Probably a bad implementation of `extract_from_text`
+            logger.debug(
+                "`update_document_from_text` failed for opinion %s. Traceback: %s",
+                opinion.id,
+                traceback.format_exc(),
+            )
+            stats["Error"] += 1
+            return
+
         if not changes:
             logger.info("Did not get any metadata for opinion %s", opinion.id)
+            stats["No metadata extracted"] += 1
             return
 
         logger.info("Processing opinion %s", opinion.id)
 
-        # Check if changes exist before saving, to prevent unecessary DB queries
+        # Check if changes exist before saving, to prevent unnecessary DB queries
         if changes.get("Docket"):
             opinion.cluster.docket.save()
             logger.debug(
@@ -67,7 +90,7 @@
     )
 
 
-class Command(VerboseCommand):
+class Command(ScraperCommand):
     help = """Updates objects by running Site.extract_from_text
     over extracted content found on Opinion.plain_text or Opinion.html.
 
@@ -79,18 +102,20 @@
     and check if updates over Docket, OpinionCluster, Opinion and
     Citation are as expected
     """
-    stats = {}  # assigned at the end of a command run, for testing
+    # For aggregate reporting at the end of the command
+    stats = {
+        "Docket": 0,
+        "OpinionCluster": 0,
+        "Opinion": 0,
+        "Citation": 0,
+        "No text to extract from": 0,
+        "No metadata extracted": 0,
+        "Error": 0,
+    }
+    juriscraper_module_type = "opinions"
 
     def add_arguments(self, parser):
-        parser.add_argument(
-            "--juriscraper-module",
-            help="""The Juriscraper file which contains the
-            `extract_from_text` method to be used. The `court_id`
-            will be deduced from this. Example:
-            juriscraper.opinions.united_states.federal_appellate.ca1
-            """,
-            required=True,
-        )
+        super().add_arguments(parser)
         parser.add_argument(
             "--opinion-ids",
             nargs="+",
             type=int,
             help="""The Opinion ids to re-process.
             May be more than one. If this argument is used,
             other filters will be ignored""",
         )
         parser.add_argument(
-            "date-filed-gte",
+            "--date-filed-gte",
             default="",
-            help=r"""A filter value in %Y/%m/%d format.
+            type=self.parse_input_date,
+            help=r"""A filter value in %Y-%m-%d or %Y/%m/%d format.
             OpinionCluster.date_filed will have to be greater or equal""",
         )
         parser.add_argument(
-            "date-filed-lte",
+            "--date-filed-lte",
             default="",
-            help=r"""A filter value in %Y/%m/%d format.
+            type=self.parse_input_date,
+            help=r"""A filter value in %Y-%m-%d or %Y/%m/%d format.
             OpinionCluster.date_filed will have to be less or equal""",
         )
         parser.add_argument(
@@ -122,16 +149,14 @@
 
     def handle(self, *args, **options):
         super().handle(*args, **options)
-        juriscraper_module = options["juriscraper_module"]
-        # For aggregate reporting
-        stats = {"Docket": 0, "OpinionCluster": 0, "Opinion": 0, "Citation": 0}
+        juriscraper_module = options["court_id"]
 
         if options["opinion_ids"]:
             opinions = Opinion.objects.filter(id__in=options["opinion_ids"])
             for op in opinions:
-                rerun_extract_from_text(op, juriscraper_module, stats)
+                rerun_extract_from_text(op, juriscraper_module, self.stats)
 
-            logger.info("Modified objects counts: %s", stats)
+            logger.info("Modified objects counts: %s", self.stats)
             return
 
         if not (options["date_filed_gte"] and options["date_filed_lte"]):
@@ -140,12 +165,10 @@
             )
 
         court_id = juriscraper_module.split(".")[-1].split("_")[0]
-        gte_date = datetime.strptime(options["date_filed_gte"], "%Y/%m/%d")
-        lte_date = datetime.strptime(options["date_filed_lte"], "%Y/%m/%d")
         query = {
             "docket__court_id": court_id,
-            "date_filed__gte": gte_date,
-            "date_filed__lte": lte_date,
+            "date_filed__gte": options["date_filed_gte"],
+            "date_filed__lte": options["date_filed_lte"],
         }
 
         if options["cluster_status"]:
@@ -157,7 +180,19 @@
         for cluster in qs:
             opinions = cluster.sub_opinions.all()
             for op in opinions:
-                rerun_extract_from_text(op, juriscraper_module, stats)
-
-        logger.info("Modified objects counts: %s", stats)
-        self.stats = stats
+                rerun_extract_from_text(op, juriscraper_module, self.stats)
+
+        logger.info("Modified objects counts: %s", self.stats)
+
+    def parse_input_date(self, date_string: str) -> datetime | str:
+        """Parses a date string in accepted formats
+
+        :param date_string: the date string in "%Y/%m/%d" or "%Y-%m-%d"
+        :return: an empty string if the input was empty; or the date object
+        """
+        parsed_date = ""
+        if "/" in date_string:
+            parsed_date = datetime.strptime(date_string, "%Y/%m/%d")
+        elif "-" in date_string:
+            parsed_date = datetime.strptime(date_string, "%Y-%m-%d")
+        return parsed_date
diff --git a/cl/scrapers/tasks.py b/cl/scrapers/tasks.py
index 15500e94bb..7bbc8bb40b 100644
--- a/cl/scrapers/tasks.py
+++ b/cl/scrapers/tasks.py
@@ -30,6 +30,7 @@
 from cl.lib.string_utils import trunc
 from cl.lib.utils import is_iter
 from cl.recap.mergers import save_iquery_to_docket
+from cl.scrapers.utils import scraped_citation_object_is_valid
 from cl.search.models import Docket, Opinion, RECAPDocument
 
 logger = logging.getLogger(__name__)
@@ -71,8 +72,9 @@
             opinion.cluster.__dict__.update(data)
         elif model_name == "Citation":
             data["cluster_id"] = opinion.cluster_id
-            _, citation_created = ModelClass.objects.get_or_create(**data)
-            metadata_dict["Citation"]["created"] = citation_created
+            if scraped_citation_object_is_valid(data):
+                _, citation_created = ModelClass.objects.get_or_create(**data)
+                metadata_dict["Citation"]["created"] = citation_created
         elif model_name == "Opinion":
             opinion.__dict__.update(data)
         else:
diff --git a/cl/scrapers/tests.py b/cl/scrapers/tests.py
index 95e1586a21..6bfad68d8e 100644
--- a/cl/scrapers/tests.py
+++ b/cl/scrapers/tests.py
@@ -41,6 +41,7 @@
     get_binary_content,
     get_existing_docket,
     get_extension,
+    scraped_citation_object_is_valid,
     update_or_create_docket,
 )
 from cl.search.factories import (
@@ -874,7 +875,7 @@
     )
 
 
-class UpdateFromTestCommandTest(TestCase):
+class UpdateFromTextCommandTest(TestCase):
     """Test the input processing and DB querying for the command"""
 
     def setUp(self):
@@ -932,7 +933,7 @@
             "cl.scrapers.tasks.get_scraper_object_by_name",
             return_value=test_opinion_scraper.Site(),
         ):
-            cmd.handle(juriscraper_module="somepath.sc", opinion_ids=[101])
+            cmd.handle(court_id="somepath.sc", opinion_ids=[101])
 
         self.assertFalse(
             any(cmd.stats.values()), "No object should be modified"
         )
@@ -945,7 +946,7 @@
             return_value=test_opinion_scraper.Site(),
         ):
             update_from_text.Command().handle(
-                juriscraper_module="somepath.vt",
+                court_id="somepath.vt",
                 opinion_ids=[],
                 date_filed_gte="2020/06/01",
                 date_filed_lte="2021/06/01",
@@ -979,3 +980,23 @@
             "13",
             "Unpublished docket should not be modified",
         )
+
+    def test_scraped_citation_object_is_valid(self):
+        """Can we validate Citation dicts got from `Site.extract_from_text`"""
+        bad_type = {"reporter": "WI", "type": Citation.FEDERAL}
+        self.assertFalse(
+            scraped_citation_object_is_valid(bad_type),
+            "Citation should be marked as invalid. Type does not match reporter",
+        )
+
+        bad_reporter = {"reporter": "Some text"}
+        self.assertFalse(
+            scraped_citation_object_is_valid(bad_reporter),
+            "Citation should be marked as invalid. Reporter does not exist",
+        )
+
+        valid_citation = {"reporter": "WI", "type": Citation.NEUTRAL}
+        self.assertTrue(
+            scraped_citation_object_is_valid(valid_citation),
+            "Citation object should be marked as valid",
+        )
diff --git a/cl/scrapers/utils.py b/cl/scrapers/utils.py
index 31134ce3d2..2203bbe2c1 100644
--- a/cl/scrapers/utils.py
+++ b/cl/scrapers/utils.py
@@ -1,5 +1,4 @@
 import os
-import sys
 from datetime import date
 from typing import Optional, Tuple
 from urllib.parse import urljoin
@@ -9,15 +8,16 @@
 from asgiref.sync import async_to_sync
 from courts_db import find_court_by_id, find_court_ids_by_name
 from django.conf import settings
-from django.db.models import Q, QuerySet
+from django.db.models import Q
 from juriscraper import AbstractSite
 from juriscraper.AbstractSite import logger
 from juriscraper.lib.test_utils import MockRequest
 from lxml import html
+from reporters_db import REPORTERS
 from requests import Response, Session
 
+from cl.citations.utils import map_reporter_db_cite_type
 from cl.corpus_importer.utils import winnow_case_name
-from cl.lib.celery_utils import CeleryThrottle
 from cl.lib.decorators import retry
 from cl.lib.microservice_utils import microservice
 from cl.recap.mergers import find_docket_object
@@ -26,8 +26,7 @@
     NoDownloadUrlError,
     UnexpectedContentTypeError,
 )
-from cl.scrapers.tasks import extract_recap_pdf
-from cl.search.models import Court, Docket, RECAPDocument
+from cl.search.models import Court, Docket
 
 
 def get_child_court(child_court_name: str, court_id: str) -> Optional[Court]:
@@ -242,53 +241,6 @@
     die_now = True
 
 
-def extract_recap_documents(
-    docs: QuerySet,
-    ocr_available: bool = True,
-    order_by: Optional[str] = None,
-    queue: Optional[str] = None,
-) -> None:
-    """Loop over RECAPDocuments and extract their contents. Use OCR if requested.
-
-    :param docs: A queryset containing the RECAPDocuments to be processed.
-    :type docs: Django Queryset
-    :param ocr_available: Whether OCR should be completed (True) or whether items
-    should simply be updated to have status OCR_NEEDED.
-    :type ocr_available: Bool
-    :param order_by: An optimization parameter. You may opt to order the
-    processing by 'small-first' or 'big-first'.
-    :type order_by: str
-    :param queue: The celery queue to send the content to.
-    :type queue: str
-    """
-    docs = docs.exclude(filepath_local="")
-    if ocr_available:
-        # We're doing OCR. Only work with those items that require it.
-        docs = docs.filter(ocr_status=RECAPDocument.OCR_NEEDED)
-    else:
-        # Focus on the items that we don't know if they need OCR.
-        docs = docs.filter(ocr_status=None)
-
-    if order_by is not None:
-        if order_by == "small-first":
-            docs = docs.order_by("page_count")
-        elif order_by == "big-first":
-            docs = docs.order_by("-page_count")
-
-    count = docs.count()
-    throttle = CeleryThrottle(queue_name=queue)
-    for i, pk in enumerate(docs.values_list("pk", flat=True)):
-        throttle.maybe_wait()
-        extract_recap_pdf.apply_async(
-            (pk, ocr_available), priority=5, queue=queue
-        )
-        if i % 1000 == 0:
-            msg = f"Sent {i + 1}/{count} tasks to celery so far."
-            logger.info(msg)
-            sys.stdout.write(f"\r{msg}")
-            sys.stdout.flush()
-
-
 def get_existing_docket(
     court_id: str, docket_number: str, appeal_from_str: str = ""
 ) -> Docket | None:
@@ -466,3 +418,30 @@
         setattr(docket, field, value)
 
     return docket
+
+
+def scraped_citation_object_is_valid(citation_object: dict) -> bool:
+    """Validate Citation objects from `Site.extract_from_text`
+
+    Check that the parsed `Citation.reporter` exists in reporters-db
+    and that the `Citation.type` matches the reporters-db type
+
+    :param citation_object: dict got from `Site.extract_from_text`
+    :return: True if the parsed reporter and type match with reporters-db;
+        False otherwise
+    """
+    parsed_reporter = citation_object["reporter"]
+    try:
+        reporter = REPORTERS[parsed_reporter]
+        mapped_type = map_reporter_db_cite_type(reporter[0].get("cite_type"))
+        if mapped_type == citation_object["type"]:
+            return True
+        logger.error(
+            "Citation.type '%s' from `extract_from_text` does not match reporters-db type '%s'",
+            citation_object["type"],
+            mapped_type,
+        )
+    except KeyError:
+        logger.error("Parsed reporter '%s' does not exist", parsed_reporter)
+
+    return False
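
Note on the return contract (a sketch, inferred from the patches above;
not part of the commits): after patch 3, `update_document_from_text`
returns the metadata dict produced by `Site.extract_from_text`, mutated
in place. For the test scraper and an opinion whose text contains
"Docket Number: 2020-12", "Disposition: Affirmed" and "2020 VT 11", the
caller would receive roughly:

    {
        "Docket": {"docket_number": "2020-12"},
        "OpinionCluster": {"disposition": "Affirmed"},
        "Citation": {
            "volume": "2020",
            "reporter": "VT",
            "page": "11",
            "type": 8,        # Citation.NEUTRAL, per the tests
            "cluster_id": 1,  # illustrative id, set before get_or_create
            "created": True,  # set only when the citation dict is valid
        },
    }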