From d871b4aae29ebad5be842df18e7bf93e85fec1fe Mon Sep 17 00:00:00 2001
From: Gianfranco Rossi
Date: Tue, 1 Oct 2024 11:09:28 -0500
Subject: [PATCH 1/3] feat(scrapers.update_from_text): new command

Helps solve: https://github.com/freelawproject/juriscraper/issues/858

- New command to re-run Site.extract_from_text over downloaded opinions
- Able to filter by Docket.court_id, OpinionCluster.date_filed and
  OpinionCluster.precedential_status
- Updates tasks.update_document_from_text to return information for
  logging purposes
- Updates test_opinion_scraper to get a Site.extract_from_text method
---
 .../management/commands/update_from_text.py   | 159 ++++++++++++++++++
 cl/scrapers/tasks.py                          |  12 +-
 .../test_assets/test_opinion_scraper.py      |  21 +++
 cl/scrapers/tests.py                          | 110 +++++++++++-
 4 files changed, 297 insertions(+), 5 deletions(-)
 create mode 100644 cl/scrapers/management/commands/update_from_text.py

diff --git a/cl/scrapers/management/commands/update_from_text.py b/cl/scrapers/management/commands/update_from_text.py
new file mode 100644
index 0000000000..77fe5966af
--- /dev/null
+++ b/cl/scrapers/management/commands/update_from_text.py
@@ -0,0 +1,159 @@
+from datetime import datetime
+
+from django.db import transaction
+
+from cl.lib.command_utils import VerboseCommand, logger
+from cl.scrapers.tasks import update_document_from_text
+from cl.search.models import PRECEDENTIAL_STATUS, Opinion, OpinionCluster
+
+
+def update_from_text(
+    opinion: Opinion, juriscraper_module: str, stats: dict[str, int]
+):
+    """Calls `update_document_from_text` as used in the scraper flow
+    and calls the corresponding model's .save()
+
+    :param opinion: the Opinion on which to apply extract_from_text
+    :param juriscraper_module: the scraper module path
+    :param stats: dict to accumulate counts for reporting. Modified in place
+
+    :return None
+    """
+    with transaction.atomic():
+        changes = update_document_from_text(opinion, juriscraper_module)
+        if not changes:
+            logger.info("Did not get any metadata for opinion %s", opinion.id)
+            return
+
+        logger.info("Processing opinion %s", opinion.id)
+
+        # Check if changes exist before saving, to prevent unecessary DB queries
+        if changes.get("Docket"):
+            opinion.cluster.docket.save()
+            logger.debug(
+                "Docket %s updated with data %s",
+                opinion.cluster.docket.id,
+                changes["Docket"],
+            )
+            stats["Docket"] += 1
+
+        if changes.get("OpinionCluster"):
+            opinion.cluster.save()
+            logger.debug(
+                "OpinionCluster %s updated with data %s",
+                opinion.cluster.id,
+                changes["OpinionCluster"],
+            )
+            stats["OpinionCluster"] += 1
+
+        if changes.get("Opinion"):
+            opinion.save()
+            logger.debug("Opinion updated with data %s", changes["Opinion"])
+            stats["Opinion"] += 1
+
+        if changes.get("Citation"):
+            if changes["Citation"].get("created"):
+                logger.info(
+                    "Citation created with data %s", changes["Citation"]
+                )
+                stats["Citation"] += 1
+            else:
+                logger.debug(
+                    "Citation not created. Data %s", changes["Citation"]
+                )
+
+
+class Command(VerboseCommand):
+    help = """Updates objects by running Site.extract_from_text
+    over extracted content found on Opinion.plain_text or Opinion.html.
+
+    If `--opinion-ids` is used, filters will be ignored.
+
+    If not, both date filters will be required, to prevent triggering
+    unwanted reprocessing of the whole court's dataset
+
+    Recommended use is to run over a sample of the target time period
+    and check if updates over Docket, OpinionCluster, Opinion and
+    Citation are as expected
+    """
+    stats = {}  # assigned at the end of a command run, for testing
+
+    def add_arguments(self, parser):
+        parser.add_argument(
+            "--juriscraper-module",
+            help="""The Juriscraper file which contains the
+            `extract_from_text` method to be used. The `court_id`
+            will be deduced from this. Example:
+            juriscraper.opinions.united_states.federal_appellate.ca1
+            """,
+            required=True,
+        )
+        parser.add_argument(
+            "--opinion-ids",
+            nargs="+",
+            type=int,
+            help="""The Opinion ids to re-process.
+            May be more than one. If this argument is used,
+            other filters will be ignored""",
+        )
+        parser.add_argument(
+            "date-filed-gte",
+            default="",
+            help=r"""A filter value in %Y/%m/%d format.
+            OpinionCluster.date_filed will have to be greater or equal""",
+        )
+        parser.add_argument(
+            "date-filed-lte",
+            default="",
+            help=r"""A filter value in %Y/%m/%d format.
+            OpinionCluster.date_filed will have to be less or equal""",
+        )
+        parser.add_argument(
+            "--cluster-status",
+            default="",
+            choices=[value for value, name in PRECEDENTIAL_STATUS.NAMES],
+            help="""A value of OpinionCluster.precedential_status. To be
+            used for filtering the Opinions to be processed
+            """,
+        )
+
+    def handle(self, *args, **options):
+        super().handle(*args, **options)
+        juriscraper_module = options["juriscraper_module"]
+        # For aggregate reporting
+        stats = {"Docket": 0, "OpinionCluster": 0, "Opinion": 0, "Citation": 0}
+
+        if options["opinion_ids"]:
+            opinions = Opinion.objects.filter(id__in=options["opinion_ids"])
+            for op in opinions:
+                update_from_text(op, juriscraper_module, stats)
+
+            logger.info("Modified objects counts: %s", stats)
+            return
+
+        if not (options["date_filed_gte"] and options["date_filed_lte"]):
+            raise ValueError(
+                "Both `date-filed-gte` and `date-filed-lte` arguments should have values"
+            )
+
+        court_id = juriscraper_module.split(".")[-1].split("_")[0]
+        gte_date = datetime.strptime(options["date_filed_gte"], "%Y/%m/%d")
+        lte_date = datetime.strptime(options["date_filed_lte"], "%Y/%m/%d")
+        query = {
+            "docket__court_id": court_id,
+            "date_filed__gte": gte_date,
+            "date_filed__lte": lte_date,
+        }
+
+        if options["cluster_status"]:
+            query["precedential_status"] = options["cluster_status"]
+
+        qs = OpinionCluster.objects.filter(**query).prefetch_related(
+            "sub_opinions"
+        )
+        for cluster in qs:
+            opinions = cluster.sub_opinions.all()
+            for op in opinions:
+                update_from_text(op, juriscraper_module, stats)
+
+        logger.info("Modified objects counts: %s", stats)
+        self.stats = stats
diff --git a/cl/scrapers/tasks.py b/cl/scrapers/tasks.py
index c60971c572..15500e94bb 100644
--- a/cl/scrapers/tasks.py
+++ b/cl/scrapers/tasks.py
@@ -39,7 +39,7 @@
 def update_document_from_text(
     opinion: Opinion, juriscraper_module: str = ""
-) -> None:
+) -> dict:
     """Extract additional metadata from document text
 
     We use this code with BIA decisions. Previously Tax.
@@ -54,12 +54,13 @@
 
     :param opinion: Opinion object
     :param juriscraper_module: full module to get Site object
-    :return: None
+    :return: the extracted data dictionary
     """
     court = opinion.cluster.docket.court.pk
     site = get_scraper_object_by_name(court, juriscraper_module)
     if site is None:
-        return
+        logger.debug("No site found %s", juriscraper_module)
+        return {}
 
     metadata_dict = site.extract_from_text(opinion.plain_text or opinion.html)
     for model_name, data in metadata_dict.items():
@@ -70,7 +71,8 @@
             opinion.cluster.__dict__.update(data)
         elif model_name == "Citation":
             data["cluster_id"] = opinion.cluster_id
-            ModelClass.objects.get_or_create(**data)
+            _, citation_created = ModelClass.objects.get_or_create(**data)
+            metadata_dict["Citation"]["created"] = citation_created
         elif model_name == "Opinion":
             opinion.__dict__.update(data)
         else:
@@ -78,6 +80,8 @@
                 f"Object type of {model_name} not yet supported."
             )
 
+    return metadata_dict
+
 
 @app.task(
     bind=True,
diff --git a/cl/scrapers/test_assets/test_opinion_scraper.py b/cl/scrapers/test_assets/test_opinion_scraper.py
index 508be0dfec..18a28d71de 100644
--- a/cl/scrapers/test_assets/test_opinion_scraper.py
+++ b/cl/scrapers/test_assets/test_opinion_scraper.py
@@ -1,3 +1,4 @@
+import re
 from datetime import datetime
 from os.path import join
@@ -53,3 +54,23 @@
     def _get_judges(self):
         path = "//judge/text()"
         return list(self.html.xpath(path))
+
+    def extract_from_text(self, scraped_text):
+        metadata = {}
+        docket_regex = r"Docket Number: (?P<docket>\d+-\d+)"
+        disposition_regex = r"Disposition: (?P<disposition>\w+)"
+        citation_regex = r"(?P<volume>20\d{2}) (?P<reporter>VT) (?P<page>\d+)"
+        if docket_match := re.search(docket_regex, scraped_text):
+            metadata["Docket"] = {
+                "docket_number": docket_match.group("docket")
+            }
+
+        if disposition_match := re.search(disposition_regex, scraped_text):
+            metadata["OpinionCluster"] = {
+                "disposition": disposition_match.group("disposition")
+            }
+
+        if citation_match := re.search(citation_regex, scraped_text):
+            metadata["Citation"] = {**citation_match.groupdict(), "type": 8}
+
+        return metadata
diff --git a/cl/scrapers/tests.py b/cl/scrapers/tests.py
index 375987426a..1d818d4f39 100644
--- a/cl/scrapers/tests.py
+++ b/cl/scrapers/tests.py
@@ -1,5 +1,5 @@
 import os
-from datetime import datetime, timedelta
+from datetime import date, datetime, timedelta
 from http import HTTPStatus
 from pathlib import Path
 from unittest import TestCase, mock
@@ -30,6 +30,7 @@
     cl_back_scrape_citations,
     cl_scrape_opinions,
     cl_scrape_oral_arguments,
+    update_from_text,
 )
 from cl.scrapers.models import UrlHash
 from cl.scrapers.tasks import extract_doc_content, process_audio_file
@@ -867,3 +868,110 @@
         self.assertEqual(
             docket, self.ca2_docket, "Should match using docket number core"
         )
+
+
+class UpdateFromTestCommandTest(TestCase):
+    """Test the input processing and DB querying for the command"""
+
+    def setUp(self):
+        self.vt = CourtFactory(id="vt")
+        self.sc = CourtFactory(id="sc")
+        self.docket_sc = DocketFactory(court=self.sc, docket_number="20")
+
+        # Different dates, status and courts to test command behaviour
+        self.opinion_2020 = OpinionFactory(
+            cluster=OpinionClusterFactory(
+                docket=DocketFactory(court=self.vt, docket_number="12"),
+                date_filed=date(2020, 6, 1),
+                precedential_status="Published",
+            ),
+            plain_text="""Docket Number: 2020-12
+            Disposition: Affirmed
+            2020 VT 11""",
+        )
+        self.opinion_2020_unpub = OpinionFactory(
+            cluster=OpinionClusterFactory(
+                docket=DocketFactory(court=self.vt, docket_number="13"),
+                date_filed=date(2020, 7, 1),
+                precedential_status="Unpublished",
+            ),
+            plain_text="Docket Number: 2020-13\nDisposition: Affirmed",
+        )
+
+        self.opinion_sc = OpinionFactory(
+            cluster=OpinionClusterFactory(
+                docket=self.docket_sc,
+                date_filed=date(2021, 6, 1),
+                precedential_status="Published",
+            ),
+            plain_text="Some text with no matches",
+            id=101,
+        )
+
+        self.opinion_2022 = OpinionFactory(
+            cluster=OpinionClusterFactory(
+                docket=DocketFactory(court=self.vt, docket_number="13"),
+                date_filed=date(2022, 6, 1),
+                precedential_status="Unpublished",
+            ),
+            id=100,
+            plain_text="Docket Number: 2022-13\n2022 VT 11",
+        )
+
+    def test_inputs(self):
+        """Do all command inputs work properly?"""
+
+        # will target a single opinion, for which extract_from_text
+        # extracts no metadata. No object should be updated
+        cmd = update_from_text.Command()
+        with mock.patch(
+            "cl.scrapers.tasks.get_scraper_object_by_name",
+            return_value=test_opinion_scraper.Site(),
+        ):
+            cmd.handle(juriscraper_module="somepath.sc", opinion_ids=[101])
+
+        self.assertFalse(
+            any(cmd.stats.values()), "No object should be modified"
+        )
+
+        # will target 1 opinion; there are 2 in the time period
+        # and 3 for the court
+        with mock.patch(
+            "cl.scrapers.tasks.get_scraper_object_by_name",
+            return_value=test_opinion_scraper.Site(),
+        ):
+            update_from_text.Command().handle(
+                juriscraper_module="somepath.vt",
+                opinion_ids=[],
+                date_filed_gte="2020/06/01",
+                date_filed_lte="2021/06/01",
+                cluster_status="Published",
+            )
+
+        # Test that objects were actually updated / created
+        self.assertEqual(
+            Citation.objects.filter(cluster=self.opinion_2020.cluster).count(),
+            1,
+            "There should be a single citation for this cluster",
+        )
+        self.opinion_2020.refresh_from_db()
+        self.opinion_2020.cluster.refresh_from_db()
+        self.opinion_2020.cluster.docket.refresh_from_db()
+        self.assertEqual(
+            self.opinion_2020.cluster.disposition,
+            "Affirmed",
+            "OpinionCluster.disposition was not updated",
+        )
+        self.assertEqual(
+            self.opinion_2020.cluster.docket.docket_number,
+            "2020-12",
+            "Docket.docket_number was not updated",
+        )
+
+        # Check that other objects in the time period and court
+        # were not modified, meaning the filter worked
+        self.assertEqual(
+            self.opinion_2020_unpub.cluster.docket.docket_number,
+            "13",
+            "Unpublished docket should not be modified",
+        )

From 5adce999146da5a1fe3dda1869073c836dffd68d Mon Sep 17 00:00:00 2001
From: Gianfranco Rossi
Date: Fri, 18 Oct 2024 14:05:44 -0500
Subject: [PATCH 2/3] refactor(scrapers.update_from_text): change function
 name and docstring

---
 .../management/commands/update_from_text.py | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/cl/scrapers/management/commands/update_from_text.py b/cl/scrapers/management/commands/update_from_text.py
index 77fe5966af..f1450c9c66 100644
--- a/cl/scrapers/management/commands/update_from_text.py
+++ b/cl/scrapers/management/commands/update_from_text.py
@@ -7,11 +7,15 @@
 from cl.search.models import PRECEDENTIAL_STATUS, Opinion, OpinionCluster
 
 
-def update_from_text(
+def rerun_extract_from_text(
     opinion: Opinion, juriscraper_module: str, stats: dict[str, int]
 ):
-    """Calls `update_document_from_text` as used in the scraper flow
-    and calls the corresponding model's .save()
+    """
+    Reruns `update_document_from_text` from the scraper flow, saving changes
+
+    `update_document_from_text` calls `Site.extract_from_text` and assigns
+    any changes to the proper objects, in place, but they are not saved.
+    This method saves the ones with actual changes
 
     :param opinion: the Opinion on which to apply extract_from_text
     :param juriscraper_module: the scraper module path
@@ -125,7 +129,7 @@
         if options["opinion_ids"]:
             opinions = Opinion.objects.filter(id__in=options["opinion_ids"])
             for op in opinions:
-                update_from_text(op, juriscraper_module, stats)
+                rerun_extract_from_text(op, juriscraper_module, stats)
 
             logger.info("Modified objects counts: %s", stats)
             return
@@ -153,7 +157,7 @@
         for cluster in qs:
             opinions = cluster.sub_opinions.all()
             for op in opinions:
-                update_from_text(op, juriscraper_module, stats)
+                rerun_extract_from_text(op, juriscraper_module, stats)
 
         logger.info("Modified objects counts: %s", stats)
         self.stats = stats

From 6a1fadb6cbfc0b03d82744658ff1d7e3bf2c3e5a Mon Sep 17 00:00:00 2001
From: Gianfranco Rossi
Date: Fri, 25 Oct 2024 21:09:56 -0500
Subject: [PATCH 3/3] feat(scrapers.update_from_text): refactor from code
 review

- validate citation objects from `Site.extract_from_text`. Add tests
  for this
- abstract the required `--courts` argument for scrapers into a
  ScraperCommand class; also made it more flexible
- refactor cl_scrape_opinions and cl_scrape_oral_arguments to account
  for this
- delete cl.scrapers.utils.extract_recap_documents, which was
  generating a circular import. This function was not used anywhere
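
Example usage (a sketch; assumes the standard Django manage.py entry
point, and that "vt" resolves to a single opinions scraper module):

    python manage.py update_from_text \
        --courts vt \
        --date-filed-gte 2020/01/01 \
        --date-filed-lte 2020/12/31 \
        --cluster-status Published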
---
 cl/lib/command_utils.py                       | 36 +++++++
 cl/lib/juriscraper_utils.py                   | 52 +++++++++-
 .../commands/cl_back_scrape_citations.py      |  1 +
 .../management/commands/cl_scrape_opinions.py | 20 +---
 .../commands/cl_scrape_oral_arguments.py      |  1 +
 .../management/commands/update_from_text.py   | 97 +++++++++++++------
 cl/scrapers/tasks.py                          |  6 +-
 cl/scrapers/tests.py                          | 27 +++++-
 cl/scrapers/utils.py                          | 83 ++++++----------
 9 files changed, 216 insertions(+), 107 deletions(-)

diff --git a/cl/lib/command_utils.py b/cl/lib/command_utils.py
index 2c3797f9f5..ee86463812 100644
--- a/cl/lib/command_utils.py
+++ b/cl/lib/command_utils.py
@@ -3,6 +3,8 @@
 
 from django.core.management import BaseCommand, CommandError
 
+from cl.lib.juriscraper_utils import get_module_by_court_id
+
 logger = logging.getLogger(__name__)
 
 
@@ -22,6 +24,40 @@
         juriscraper_logger.setLevel(logging.DEBUG)
 
 
+class ScraperCommand(VerboseCommand):
+    """Base class for cl.scrapers commands that use Juriscraper
+
+    Implements the `--courts` argument to look up a Site object
+    """
+
+    # To be used by get_module_by_court_id
+    # Defined by inheriting classes
+    juriscraper_module_type = ""
+
+    def add_arguments(self, parser):
+        parser.add_argument(
+            "--courts",
+            dest="court_id",
+            metavar="COURTID",
+            type=lambda s: (
+                s
+                if "." in s
+                else get_module_by_court_id(s, self.juriscraper_module_type)
+            ),
+            required=True,
+            help=(
+                "The court(s) to scrape and extract. One of: "
+                "1. a python module or package import from the Juriscraper "
+                "library, e.g. "
+                "'juriscraper.opinions.united_states.federal_appellate.ca1' "
+                "or simply 'juriscraper.opinions' to do all opinions; or "
+                "2. a court_id, used to look up the full module path. "
+                "An error will be raised if the `court_id` matches more than "
+                "one module path. In that case, use the full path"
+            ),
+        )
+
+
 class CommandUtils:
     """A mixin to give some useful methods to sub classes."""
 
diff --git a/cl/lib/juriscraper_utils.py b/cl/lib/juriscraper_utils.py
index ae8c090f41..2eb902352b 100644
--- a/cl/lib/juriscraper_utils.py
+++ b/cl/lib/juriscraper_utils.py
@@ -5,6 +5,12 @@
 import juriscraper
 
 
+def walk_juriscraper():
+    return pkgutil.walk_packages(
+        juriscraper.__path__, f"{juriscraper.__name__}."
+    )
+
+
 def get_scraper_object_by_name(court_id: str, juriscraper_module: str = ""):
     """Identify and instantiate a Site() object given the name of a court
 
@@ -25,9 +31,7 @@
 
         return importlib.import_module(juriscraper_module).Site()
 
-    for _, full_module_path, _ in pkgutil.walk_packages(
-        juriscraper.__path__, f"{juriscraper.__name__}."
-    ):
+    for _, full_module_path, _ in walk_juriscraper():
         # Get the module name from the full path and trim
         # any suffixes like _p, _u
         module_name = full_module_path.rsplit(".", 1)[1].rsplit("_", 1)[0]
@@ -42,3 +46,45 @@
         # has been stripped off it. In any case, just ignore it when
         # this happens.
         continue
+
+
+def get_module_by_court_id(court_id: str, module_type: str):
+    """Given a `court_id` return a juriscraper module path
+
+    Some court_ids match multiple scraper files. These will force the user
+    to use the full module path. For example, "lactapp_1" and "lactapp_5"
For example, "lactapp_1" and "lactapp_5" + match the same `court_id`, but scrape totally different sites, and + their Site objects are expected to have different `extract_from_text` + behavior + + :param court_id: court id to look for + :param module_type: 'opinions' or 'oral_args'. Without this, some + court_ids may match the 2 classes of scrapers + + :raises: ValueError if there is no match or there is more than 1 match + :return: the full module path + """ + if module_type not in ["opinions", "oral_args"]: + raise ValueError( + "module_type has to be one of ['opinions', 'oral_args']" + ) + + matches = [] + for _, module_string, _ in walk_juriscraper(): + if module_string.count(".") != 4 or module_type not in module_string: + # Skip folder and lib modules. Skip type + continue + + module_court_id = module_string.rsplit(".", 1)[1].rsplit("_", 1)[0] + if module_court_id == court_id: + matches.append(module_string) + + if len(matches) == 1: + return matches[0] + elif len(matches) == 0: + raise ValueError(f"'{court_id}' doesn't match any juriscraper module") + else: + raise ValueError( + f"'{court_id}' matches more than 1 juriscraper module." + f"Use a full module path. Matches: '{matches}'" + ) diff --git a/cl/scrapers/management/commands/cl_back_scrape_citations.py b/cl/scrapers/management/commands/cl_back_scrape_citations.py index b2da0a4581..a445df9438 100644 --- a/cl/scrapers/management/commands/cl_back_scrape_citations.py +++ b/cl/scrapers/management/commands/cl_back_scrape_citations.py @@ -24,6 +24,7 @@ class Command(cl_back_scrape_opinions.Command): scrape_target_descr = "citations" + juriscraper_module_type = "opinions" def scrape_court( self, diff --git a/cl/scrapers/management/commands/cl_scrape_opinions.py b/cl/scrapers/management/commands/cl_scrape_opinions.py index 67dac880ab..8fe42e893a 100644 --- a/cl/scrapers/management/commands/cl_scrape_opinions.py +++ b/cl/scrapers/management/commands/cl_scrape_opinions.py @@ -18,7 +18,7 @@ from cl.alerts.models import RealTimeQueue from cl.citations.utils import map_reporter_db_cite_type -from cl.lib.command_utils import VerboseCommand, logger +from cl.lib.command_utils import ScraperCommand, logger from cl.lib.crypto import sha1 from cl.lib.string_utils import trunc from cl.people_db.lookup_utils import lookup_judges_by_messy_str @@ -217,14 +217,16 @@ def save_everything( ) -class Command(VerboseCommand): +class Command(ScraperCommand): help = "Runs the Juriscraper toolkit against one or many jurisdictions." + juriscraper_module_type = "opinions" scrape_target_descr = "opinions" # for logging purposes def __init__(self, stdout=None, stderr=None, no_color=False): super().__init__(stdout=None, stderr=None, no_color=False) def add_arguments(self, parser): + super().add_arguments(parser) parser.add_argument( "--daemon", action="store_true", @@ -246,20 +248,6 @@ def add_arguments(self, parser): "is 30 minutes." ), ) - parser.add_argument( - "--courts", - type=str, - dest="court_id", - metavar="COURTID", - required=True, - help=( - "The court(s) to scrape and extract. This should be " - "in the form of a python module or package import " - "from the Juriscraper library, e.g. " - '"juriscraper.opinions.united_states.federal_appellate.ca1" ' - 'or simply "opinions" to do all opinions.' 
-            ),
-        )
         parser.add_argument(
             "--fullcrawl",
             dest="full_crawl",
diff --git a/cl/scrapers/management/commands/cl_scrape_oral_arguments.py b/cl/scrapers/management/commands/cl_scrape_oral_arguments.py
index ad284381f4..62377a98ec 100644
--- a/cl/scrapers/management/commands/cl_scrape_oral_arguments.py
+++ b/cl/scrapers/management/commands/cl_scrape_oral_arguments.py
@@ -107,6 +107,7 @@
 
 class Command(cl_scrape_opinions.Command):
     scrape_target_descr = "oral arguments"
+    juriscraper_module_type = "oral_args"
 
     def ingest_a_case(
         self,
diff --git a/cl/scrapers/management/commands/update_from_text.py b/cl/scrapers/management/commands/update_from_text.py
index f1450c9c66..0c7da06ef3 100644
--- a/cl/scrapers/management/commands/update_from_text.py
+++ b/cl/scrapers/management/commands/update_from_text.py
@@ -1,8 +1,9 @@
+import traceback
 from datetime import datetime
 
 from django.db import transaction
 
-from cl.lib.command_utils import VerboseCommand, logger
+from cl.lib.command_utils import ScraperCommand, logger
 from cl.scrapers.tasks import update_document_from_text
 from cl.search.models import PRECEDENTIAL_STATUS, Opinion, OpinionCluster
 
@@ -23,15 +24,37 @@
 
     :return None
     """
+    if not opinion.plain_text and not opinion.html:
+        # May be an opinion entirely from a merged corpus
+        # or an error during text extraction
+        logger.info(
+            "Opinion %s has no `plain_text` or `html` to extract from",
+            opinion.id,
+        )
+        stats["No text to extract from"] += 1
+        return
+
     with transaction.atomic():
-        changes = update_document_from_text(opinion, juriscraper_module)
+        try:
+            changes = update_document_from_text(opinion, juriscraper_module)
+        except Exception:
+            # Probably a bad implementation of `extract_from_text`
+            logger.debug(
+                "`update_document_from_text` failed for opinion %s. Traceback: %s",
+                opinion.id,
+                traceback.format_exc(),
+            )
+            stats["Error"] += 1
+            return
+
         if not changes:
             logger.info("Did not get any metadata for opinion %s", opinion.id)
+            stats["No metadata extracted"] += 1
             return
 
         logger.info("Processing opinion %s", opinion.id)
 
-        # Check if changes exist before saving, to prevent unecessary DB queries
+        # Check if changes exist before saving, to prevent unnecessary DB queries
         if changes.get("Docket"):
             opinion.cluster.docket.save()
             logger.debug(
@@ -67,7 +90,7 @@
     )
 
 
-class Command(VerboseCommand):
+class Command(ScraperCommand):
     help = """Updates objects by running Site.extract_from_text
     over extracted content found on Opinion.plain_text or Opinion.html.
 
@@ -79,18 +102,20 @@
     and check if updates over Docket, OpinionCluster, Opinion and
     Citation are as expected
     """
-    stats = {}  # assigned at the end of a command run, for testing
+    # For aggregate reporting at the end of the command
+    stats = {
+        "Docket": 0,
+        "OpinionCluster": 0,
+        "Opinion": 0,
+        "Citation": 0,
+        "No text to extract from": 0,
+        "No metadata extracted": 0,
+        "Error": 0,
+    }
+    juriscraper_module_type = "opinions"
 
     def add_arguments(self, parser):
-        parser.add_argument(
-            "--juriscraper-module",
-            help="""The Juriscraper file which contains the
-            `extract_from_text` method to be used. The `court_id`
-            will be deduced from this. Example:
-            juriscraper.opinions.united_states.federal_appellate.ca1
-            """,
-            required=True,
-        )
+        super().add_arguments(parser)
         parser.add_argument(
             "--opinion-ids",
             nargs="+",
             type=int,
             help="""The Opinion ids to re-process.
             May be more than one. If this argument is used,
             other filters will be ignored""",
         )
         parser.add_argument(
-            "date-filed-gte",
+            "--date-filed-gte",
             default="",
-            help=r"""A filter value in %Y/%m/%d format.
+            type=self.parse_input_date,
+            help=r"""A filter value in %Y-%m-%d or %Y/%m/%d format.
             OpinionCluster.date_filed will have to be greater or equal""",
         )
         parser.add_argument(
-            "date-filed-lte",
+            "--date-filed-lte",
             default="",
-            help=r"""A filter value in %Y/%m/%d format.
+            type=self.parse_input_date,
+            help=r"""A filter value in %Y-%m-%d or %Y/%m/%d format.
             OpinionCluster.date_filed will have to be less or equal""",
         )
         parser.add_argument(
@@ -122,16 +149,14 @@
 
     def handle(self, *args, **options):
         super().handle(*args, **options)
-        juriscraper_module = options["juriscraper_module"]
-        # For aggregate reporting
-        stats = {"Docket": 0, "OpinionCluster": 0, "Opinion": 0, "Citation": 0}
+        juriscraper_module = options["court_id"]
 
         if options["opinion_ids"]:
             opinions = Opinion.objects.filter(id__in=options["opinion_ids"])
             for op in opinions:
-                rerun_extract_from_text(op, juriscraper_module, stats)
+                rerun_extract_from_text(op, juriscraper_module, self.stats)
 
-            logger.info("Modified objects counts: %s", stats)
+            logger.info("Modified objects counts: %s", self.stats)
             return
 
         if not (options["date_filed_gte"] and options["date_filed_lte"]):
@@ -140,12 +165,10 @@
             )
 
         court_id = juriscraper_module.split(".")[-1].split("_")[0]
-        gte_date = datetime.strptime(options["date_filed_gte"], "%Y/%m/%d")
-        lte_date = datetime.strptime(options["date_filed_lte"], "%Y/%m/%d")
         query = {
             "docket__court_id": court_id,
-            "date_filed__gte": gte_date,
-            "date_filed__lte": lte_date,
+            "date_filed__gte": options["date_filed_gte"],
+            "date_filed__lte": options["date_filed_lte"],
         }
 
         if options["cluster_status"]:
@@ -157,7 +180,19 @@
         for cluster in qs:
             opinions = cluster.sub_opinions.all()
             for op in opinions:
-                rerun_extract_from_text(op, juriscraper_module, stats)
-
-        logger.info("Modified objects counts: %s", stats)
-        self.stats = stats
+                rerun_extract_from_text(op, juriscraper_module, self.stats)
+
+        logger.info("Modified objects counts: %s", self.stats)
+
+    def parse_input_date(self, date_string: str) -> datetime | str:
+        """Parses a date string in accepted formats
+
+        :param date_string: the date string in "%Y/%m/%d" or "%Y-%m-%d"
+        :return: an empty string if the input was empty; or the date object
+        """
+        parsed_date = ""
+        if "/" in date_string:
+            parsed_date = datetime.strptime(date_string, "%Y/%m/%d")
+        elif "-" in date_string:
+            parsed_date = datetime.strptime(date_string, "%Y-%m-%d")
+        return parsed_date
diff --git a/cl/scrapers/tasks.py b/cl/scrapers/tasks.py
index 15500e94bb..7bbc8bb40b 100644
--- a/cl/scrapers/tasks.py
+++ b/cl/scrapers/tasks.py
@@ -30,6 +30,7 @@
 from cl.lib.string_utils import trunc
 from cl.lib.utils import is_iter
 from cl.recap.mergers import save_iquery_to_docket
+from cl.scrapers.utils import scraped_citation_object_is_valid
 from cl.search.models import Docket, Opinion, RECAPDocument
 
 logger = logging.getLogger(__name__)
@@ -71,8 +72,9 @@
             opinion.cluster.__dict__.update(data)
         elif model_name == "Citation":
             data["cluster_id"] = opinion.cluster_id
-            _, citation_created = ModelClass.objects.get_or_create(**data)
-            metadata_dict["Citation"]["created"] = citation_created
+            if scraped_citation_object_is_valid(data):
+                _, citation_created = ModelClass.objects.get_or_create(**data)
+                metadata_dict["Citation"]["created"] = citation_created
         elif model_name == "Opinion":
             opinion.__dict__.update(data)
         else:
diff --git a/cl/scrapers/tests.py b/cl/scrapers/tests.py
index 95e1586a21..6bfad68d8e 100644
--- a/cl/scrapers/tests.py
+++ b/cl/scrapers/tests.py
@@ -41,6 +41,7 @@
     get_binary_content,
     get_existing_docket,
     get_extension,
+    scraped_citation_object_is_valid,
     update_or_create_docket,
 )
 from cl.search.factories import (
@@ -874,7 +875,7 @@
     )
 
 
-class UpdateFromTestCommandTest(TestCase):
+class UpdateFromTextCommandTest(TestCase):
     """Test the input processing and DB querying for the command"""
 
     def setUp(self):
@@ -932,7 +933,7 @@
             "cl.scrapers.tasks.get_scraper_object_by_name",
             return_value=test_opinion_scraper.Site(),
         ):
-            cmd.handle(juriscraper_module="somepath.sc", opinion_ids=[101])
+            cmd.handle(court_id="somepath.sc", opinion_ids=[101])
 
         self.assertFalse(
             any(cmd.stats.values()), "No object should be modified"
         )
@@ -945,7 +946,7 @@
             return_value=test_opinion_scraper.Site(),
         ):
             update_from_text.Command().handle(
-                juriscraper_module="somepath.vt",
+                court_id="somepath.vt",
                 opinion_ids=[],
                 date_filed_gte="2020/06/01",
                 date_filed_lte="2021/06/01",
@@ -979,3 +980,23 @@
             "13",
             "Unpublished docket should not be modified",
         )
+
+    def test_scraped_citation_object_is_valid(self):
+        """Can we validate Citation dicts got from `Site.extract_from_text`"""
+        bad_type = {"reporter": "WI", "type": Citation.FEDERAL}
+        self.assertFalse(
+            scraped_citation_object_is_valid(bad_type),
+            "Citation should be marked as invalid. Type does not match reporter",
+        )
+
+        bad_reporter = {"reporter": "Some text"}
+        self.assertFalse(
+            scraped_citation_object_is_valid(bad_reporter),
+            "Citation should be marked as invalid. Reporter does not exist",
+        )
+
+        valid_citation = {"reporter": "WI", "type": Citation.NEUTRAL}
+        self.assertTrue(
+            scraped_citation_object_is_valid(valid_citation),
+            "Citation object should be marked as valid",
+        )
diff --git a/cl/scrapers/utils.py b/cl/scrapers/utils.py
index 31134ce3d2..2203bbe2c1 100644
--- a/cl/scrapers/utils.py
+++ b/cl/scrapers/utils.py
@@ -1,5 +1,4 @@
 import os
-import sys
 from datetime import date
 from typing import Optional, Tuple
 from urllib.parse import urljoin
@@ -9,15 +8,16 @@
 from asgiref.sync import async_to_sync
 from courts_db import find_court_by_id, find_court_ids_by_name
 from django.conf import settings
-from django.db.models import Q, QuerySet
+from django.db.models import Q
 from juriscraper import AbstractSite
 from juriscraper.AbstractSite import logger
 from juriscraper.lib.test_utils import MockRequest
 from lxml import html
+from reporters_db import REPORTERS
 from requests import Response, Session
 
+from cl.citations.utils import map_reporter_db_cite_type
 from cl.corpus_importer.utils import winnow_case_name
-from cl.lib.celery_utils import CeleryThrottle
 from cl.lib.decorators import retry
 from cl.lib.microservice_utils import microservice
 from cl.recap.mergers import find_docket_object
@@ -26,8 +26,7 @@
     NoDownloadUrlError,
     UnexpectedContentTypeError,
 )
-from cl.scrapers.tasks import extract_recap_pdf
-from cl.search.models import Court, Docket, RECAPDocument
+from cl.search.models import Court, Docket
 
 
 def get_child_court(child_court_name: str, court_id: str) -> Optional[Court]:
@@ -242,53 +241,6 @@
     die_now = True
 
 
-def extract_recap_documents(
-    docs: QuerySet,
-    ocr_available: bool = True,
-    order_by: Optional[str] = None,
-    queue: Optional[str] = None,
-) -> None:
-    """Loop over RECAPDocuments and extract their contents. Use OCR if requested.
-
-    :param docs: A queryset containing the RECAPDocuments to be processed.
-    :type docs: Django Queryset
-    :param ocr_available: Whether OCR should be completed (True) or whether items
-    should simply be updated to have status OCR_NEEDED.
-    :type ocr_available: Bool
-    :param order_by: An optimization parameter. You may opt to order the
-    processing by 'small-first' or 'big-first'.
-    :type order_by: str
-    :param queue: The celery queue to send the content to.
-    :type queue: str
-    """
-    docs = docs.exclude(filepath_local="")
-    if ocr_available:
-        # We're doing OCR. Only work with those items that require it.
-        docs = docs.filter(ocr_status=RECAPDocument.OCR_NEEDED)
-    else:
-        # Focus on the items that we don't know if they need OCR.
-        docs = docs.filter(ocr_status=None)
-
-    if order_by is not None:
-        if order_by == "small-first":
-            docs = docs.order_by("page_count")
-        elif order_by == "big-first":
-            docs = docs.order_by("-page_count")
-
-    count = docs.count()
-    throttle = CeleryThrottle(queue_name=queue)
-    for i, pk in enumerate(docs.values_list("pk", flat=True)):
-        throttle.maybe_wait()
-        extract_recap_pdf.apply_async(
-            (pk, ocr_available), priority=5, queue=queue
-        )
-        if i % 1000 == 0:
-            msg = f"Sent {i + 1}/{count} tasks to celery so far."
-            logger.info(msg)
-            sys.stdout.write(f"\r{msg}")
-            sys.stdout.flush()
-
-
 def get_existing_docket(
     court_id: str, docket_number: str, appeal_from_str: str = ""
 ) -> Docket | None:
@@ -466,3 +418,30 @@
         setattr(docket, field, value)
 
     return docket
+
+
+def scraped_citation_object_is_valid(citation_object: dict) -> bool:
+    """Validate Citation objects from `Site.extract_from_text`
+
+    Check that the parsed `Citation.reporter` exists in reporters-db
+    and that the `Citation.type` matches the reporters-db type
+
+    :param citation_object: dict got from `Site.extract_from_text`
+    :return: True if the parsed reporter and type match with reporters-db;
+        False otherwise
+    """
+    parsed_reporter = citation_object["reporter"]
+    try:
+        reporter = REPORTERS[parsed_reporter]
+        mapped_type = map_reporter_db_cite_type(reporter[0].get("cite_type"))
+        if mapped_type == citation_object["type"]:
+            return True
+        logger.error(
+            "Citation.type '%s' from `extract_from_text` does not match reporters-db type '%s'",
+            citation_object["type"],
+            mapped_type,
+        )
+    except KeyError:
+        logger.error("Parsed reporter '%s' does not exist", parsed_reporter)
+
+    return False
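
Note on the return contract (a sketch, inferred from the patches above;
not part of the commits): after patch 3, `update_document_from_text`
returns the metadata dict produced by `Site.extract_from_text`, mutated
in place. For the test scraper and an opinion whose text contains
"Docket Number: 2020-12", "Disposition: Affirmed" and "2020 VT 11", the
caller would receive roughly:

    {
        "Docket": {"docket_number": "2020-12"},
        "OpinionCluster": {"disposition": "Affirmed"},
        "Citation": {
            "volume": "2020",
            "reporter": "VT",
            "page": "11",
            "type": 8,        # Citation.NEUTRAL, per the tests
            "cluster_id": 1,  # illustrative id, set before get_or_create
            "created": True,  # set only when the citation dict is valid
        },
    }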