diff --git a/cl/lib/command_utils.py b/cl/lib/command_utils.py
index 2c3797f9f5..ee86463812 100644
--- a/cl/lib/command_utils.py
+++ b/cl/lib/command_utils.py
@@ -3,6 +3,8 @@
 
 from django.core.management import BaseCommand, CommandError
 
+from cl.lib.juriscraper_utils import get_module_by_court_id
+
 logger = logging.getLogger(__name__)
 
 
@@ -22,6 +24,40 @@ def handle(self, *args, **options):
             juriscraper_logger.setLevel(logging.DEBUG)
 
 
+class ScraperCommand(VerboseCommand):
+    """Base class for cl.scrapers commands that use Juriscraper
+
+    Implements the `--courts` argument, which accepts either a full
+    Juriscraper module path or a court_id to look one up
+    """
+
+    # Passed to `get_module_by_court_id`.
+    # Must be defined by inheriting classes
+    juriscraper_module_type = ""
+
+    def add_arguments(self, parser):
+        parser.add_argument(
+            "--courts",
+            dest="court_id",
+            metavar="COURTID",
+            type=lambda s: (
+                s
+                if "." in s
+                else get_module_by_court_id(s, self.juriscraper_module_type)
+            ),
+            required=True,
+            help=(
+                "The court(s) to scrape and extract. One of: "
+                "1. a Python module or package import path from the "
+                "Juriscraper library, e.g. "
+                "'juriscraper.opinions.united_states.federal_appellate.ca1' "
+                "or simply 'juriscraper.opinions' to do all opinions. "
+                "2. a court_id, used to look up the full module path. "
+                "An error will be raised if the `court_id` matches more "
+                "than one module path. In that case, use the full path."
+            ),
+        )
+
+
 class CommandUtils:
     """A mixin to give some useful methods to sub classes."""
diff --git a/cl/lib/juriscraper_utils.py b/cl/lib/juriscraper_utils.py
index ae8c090f41..2eb902352b 100644
--- a/cl/lib/juriscraper_utils.py
+++ b/cl/lib/juriscraper_utils.py
@@ -5,6 +5,12 @@
 import juriscraper
 
 
+def walk_juriscraper():
+    return pkgutil.walk_packages(
+        juriscraper.__path__, f"{juriscraper.__name__}."
+    )
+
+
 def get_scraper_object_by_name(court_id: str, juriscraper_module: str = ""):
     """Identify and instantiate a Site() object given the name of a court
 
@@ -25,9 +31,7 @@ def get_scraper_object_by_name(court_id: str, juriscraper_module: str = ""):
 
         return importlib.import_module(juriscraper_module).Site()
 
-    for _, full_module_path, _ in pkgutil.walk_packages(
-        juriscraper.__path__, f"{juriscraper.__name__}."
-    ):
+    for _, full_module_path, _ in walk_juriscraper():
         # Get the module name from the full path and trim
         # any suffixes like _p, _u
         module_name = full_module_path.rsplit(".", 1)[1].rsplit("_", 1)[0]
@@ -42,3 +46,45 @@ def get_scraper_object_by_name(court_id: str, juriscraper_module: str = ""):
         # has been stripped off it. In any case, just ignore it when
         # this happens.
         continue
+
+
+def get_module_by_court_id(court_id: str, module_type: str):
+    """Given a `court_id`, return the full Juriscraper module path
+
+    Some court_ids match multiple scraper files, which forces the user
+    to pass the full module path. For example, the modules "lactapp_1"
+    and "lactapp_5" reduce to the same `court_id`, but scrape totally
+    different sites, and their Site objects are expected to have
+    different `extract_from_text` behavior
+
+    :param court_id: court id to look for
+    :param module_type: 'opinions' or 'oral_args'. Without this, some
+        court_ids may match both classes of scrapers
+
+    :raises ValueError: if there is no match, or there is more than 1 match
+    :return: the full module path
+    """
+    if module_type not in ["opinions", "oral_args"]:
+        raise ValueError(
+            "module_type has to be one of ['opinions', 'oral_args']"
+        )
+
+    matches = []
+    for _, module_string, _ in walk_juriscraper():
+        if module_string.count(".") != 4 or module_type not in module_string:
+            # Skip package and lib modules; skip the other module type
+            continue
+
+        module_court_id = module_string.rsplit(".", 1)[1].rsplit("_", 1)[0]
+        if module_court_id == court_id:
+            matches.append(module_string)
+
+    if len(matches) == 1:
+        return matches[0]
+    elif len(matches) == 0:
+        raise ValueError(f"'{court_id}' doesn't match any juriscraper module")
+    else:
+        raise ValueError(
+            f"'{court_id}' matches more than 1 juriscraper module. "
+            f"Use a full module path. Matches: '{matches}'"
+        )
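
Review note: a usage sketch of the new lookup helper. The `ca1` path is the example used in the help text above, and `lactapp` is the ambiguous case from the docstring; exact module paths depend on the installed Juriscraper version.

```python
from cl.lib.juriscraper_utils import get_module_by_court_id

# A unique court_id resolves to the full module path
get_module_by_court_id("ca1", "opinions")
# -> "juriscraper.opinions.united_states.federal_appellate.ca1"

# An ambiguous court_id raises, listing the matching modules
# (lactapp_1 and lactapp_5 both reduce to "lactapp")
get_module_by_court_id("lactapp", "opinions")  # ValueError

# An unsupported module_type fails fast
get_module_by_court_id("ca1", "recap")  # ValueError
```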
diff --git a/cl/scrapers/management/commands/cl_back_scrape_citations.py b/cl/scrapers/management/commands/cl_back_scrape_citations.py
index b2da0a4581..a445df9438 100644
--- a/cl/scrapers/management/commands/cl_back_scrape_citations.py
+++ b/cl/scrapers/management/commands/cl_back_scrape_citations.py
@@ -24,6 +24,7 @@
 
 class Command(cl_back_scrape_opinions.Command):
     scrape_target_descr = "citations"
+    juriscraper_module_type = "opinions"
 
     def scrape_court(
         self,
diff --git a/cl/scrapers/management/commands/cl_scrape_opinions.py b/cl/scrapers/management/commands/cl_scrape_opinions.py
index 67dac880ab..8fe42e893a 100644
--- a/cl/scrapers/management/commands/cl_scrape_opinions.py
+++ b/cl/scrapers/management/commands/cl_scrape_opinions.py
@@ -18,7 +18,7 @@
 
 from cl.alerts.models import RealTimeQueue
 from cl.citations.utils import map_reporter_db_cite_type
-from cl.lib.command_utils import VerboseCommand, logger
+from cl.lib.command_utils import ScraperCommand, logger
 from cl.lib.crypto import sha1
 from cl.lib.string_utils import trunc
 from cl.people_db.lookup_utils import lookup_judges_by_messy_str
@@ -217,14 +217,16 @@ def save_everything(
     )
 
 
-class Command(VerboseCommand):
+class Command(ScraperCommand):
     help = "Runs the Juriscraper toolkit against one or many jurisdictions."
+    juriscraper_module_type = "opinions"
     scrape_target_descr = "opinions"  # for logging purposes
 
     def __init__(self, stdout=None, stderr=None, no_color=False):
         super().__init__(stdout=None, stderr=None, no_color=False)
 
     def add_arguments(self, parser):
+        super().add_arguments(parser)
         parser.add_argument(
             "--daemon",
             action="store_true",
@@ -246,20 +248,6 @@ def add_arguments(self, parser):
                 "is 30 minutes."
             ),
         )
-        parser.add_argument(
-            "--courts",
-            type=str,
-            dest="court_id",
-            metavar="COURTID",
-            required=True,
-            help=(
-                "The court(s) to scrape and extract. This should be "
-                "in the form of a python module or package import "
-                "from the Juriscraper library, e.g. "
-                '"juriscraper.opinions.united_states.federal_appellate.ca1" '
-                'or simply "opinions" to do all opinions.'
-            ),
-        )
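
Review note: with the `--courts` argument moved into `ScraperCommand`, both invocation forms should parse. A minimal sketch using Django's command parser (assuming `ca1` resolves uniquely, as above):

```python
from cl.scrapers.management.commands.cl_scrape_opinions import Command

parser = Command().create_parser("manage.py", "cl_scrape_opinions")

# A bare court_id is resolved through get_module_by_court_id()
opts = parser.parse_args(["--courts", "ca1"])
assert opts.court_id.startswith("juriscraper.opinions")

# A full module path (anything containing a ".") passes through as-is
opts = parser.parse_args(
    ["--courts", "juriscraper.opinions.united_states.federal_appellate.ca1"]
)
assert opts.court_id.endswith(".ca1")
```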
         parser.add_argument(
             "--fullcrawl",
             dest="full_crawl",
diff --git a/cl/scrapers/management/commands/cl_scrape_oral_arguments.py b/cl/scrapers/management/commands/cl_scrape_oral_arguments.py
index ad284381f4..62377a98ec 100644
--- a/cl/scrapers/management/commands/cl_scrape_oral_arguments.py
+++ b/cl/scrapers/management/commands/cl_scrape_oral_arguments.py
@@ -107,6 +107,7 @@ def make_objects(
 
 class Command(cl_scrape_opinions.Command):
     scrape_target_descr = "oral arguments"
+    juriscraper_module_type = "oral_args"
 
     def ingest_a_case(
         self,
diff --git a/cl/scrapers/management/commands/update_from_text.py b/cl/scrapers/management/commands/update_from_text.py
index f1450c9c66..0c7da06ef3 100644
--- a/cl/scrapers/management/commands/update_from_text.py
+++ b/cl/scrapers/management/commands/update_from_text.py
@@ -1,8 +1,9 @@
+import traceback
 from datetime import datetime
 
 from django.db import transaction
 
-from cl.lib.command_utils import VerboseCommand, logger
+from cl.lib.command_utils import ScraperCommand, logger
 from cl.scrapers.tasks import update_document_from_text
 from cl.search.models import PRECEDENTIAL_STATUS, Opinion, OpinionCluster
 
@@ -23,15 +24,37 @@ def rerun_extract_from_text(
 
     :return None
     """
+    if not opinion.plain_text and not opinion.html:
+        # The opinion may come entirely from a merged corpus,
+        # or text extraction may have failed
+        logger.info(
+            "Opinion %s has no `plain_text` or `html` to extract from",
+            opinion.id,
+        )
+        stats["No text to extract from"] += 1
+        return
+
     with transaction.atomic():
-        changes = update_document_from_text(opinion, juriscraper_module)
+        try:
+            changes = update_document_from_text(opinion, juriscraper_module)
+        except Exception:
+            # Probably a bad implementation of `extract_from_text`
+            logger.debug(
+                "`update_document_from_text` failed for opinion %s. Traceback: %s",
+                opinion.id,
+                traceback.format_exc(),
+            )
+            stats["Error"] += 1
+            return
+
         if not changes:
             logger.info("Did not get any metadata for opinion %s", opinion.id)
+            stats["No metadata extracted"] += 1
             return
 
         logger.info("Processing opinion %s", opinion.id)
 
-        # Check if changes exist before saving, to prevent unecessary DB queries
+        # Check if changes exist before saving, to prevent unnecessary DB queries
         if changes.get("Docket"):
             opinion.cluster.docket.save()
             logger.debug(
@@ -67,7 +90,7 @@ def rerun_extract_from_text(
     )
 
 
-class Command(VerboseCommand):
+class Command(ScraperCommand):
     help = """Updates objects by running Site.extract_from_text
     over extracted content found on Opinion.plain_text or Opinion.html.
 
@@ -79,18 +102,20 @@ class Command(VerboseCommand):
     and check if updates over Docket, OpinionCluster, Opinion
     and Citation are as expected
     """
-    stats = {}  # assigned at the end of a command run, for testing
+    # For aggregate reporting at the end of the command
+    stats = {
+        "Docket": 0,
+        "OpinionCluster": 0,
+        "Opinion": 0,
+        "Citation": 0,
+        "No text to extract from": 0,
+        "No metadata extracted": 0,
+        "Error": 0,
+    }
+    juriscraper_module_type = "opinions"
 
     def add_arguments(self, parser):
-        parser.add_argument(
-            "--juriscraper-module",
-            help="""The Juriscraper file which contains the
-            `extract_from_text` method to be used. The `court_id`
-            will be deduced from this. Example:
-            juriscraper.opinions.united_states.federal_appellate.ca1
-            """,
-            required=True,
-        )
+        super().add_arguments(parser)
         parser.add_argument(
             "--opinion-ids",
             nargs="+",
@@ -100,15 +125,17 @@ def add_arguments(self, parser):
             other filters will be ignored""",
         )
         parser.add_argument(
-            "date-filed-gte",
+            "--date-filed-gte",
             default="",
-            help=r"""A filter value in %Y/%m/%d format.
+            type=self.parse_input_date,
+            help=r"""A filter value in %Y-%m-%d or %Y/%m/%d format.
             OpinionCluster.date_filed will have to be greater or equal""",
         )
         parser.add_argument(
-            "date-filed-lte",
+            "--date-filed-lte",
             default="",
-            help=r"""A filter value in %Y/%m/%d format.
+            type=self.parse_input_date,
+            help=r"""A filter value in %Y-%m-%d or %Y/%m/%d format.
             OpinionCluster.date_filed will have to be less or equal""",
         )
         parser.add_argument(
@@ -122,16 +149,14 @@ def add_arguments(self, parser):
 
     def handle(self, *args, **options):
         super().handle(*args, **options)
-        juriscraper_module = options["juriscraper_module"]
-        # For aggregate reporting
-        stats = {"Docket": 0, "OpinionCluster": 0, "Opinion": 0, "Citation": 0}
+        juriscraper_module = options["court_id"]
 
         if options["opinion_ids"]:
             opinions = Opinion.objects.filter(id__in=options["opinion_ids"])
             for op in opinions:
-                rerun_extract_from_text(op, juriscraper_module, stats)
+                rerun_extract_from_text(op, juriscraper_module, self.stats)
 
-            logger.info("Modified objects counts: %s", stats)
+            logger.info("Modified objects counts: %s", self.stats)
             return
 
         if not (options["date_filed_gte"] and options["date_filed_lte"]):
             raise ValueError(
                 "Both `--date-filed-gte` and `--date-filed-lte` should have values"
             )
 
@@ -140,12 +165,10 @@ def handle(self, *args, **options):
         court_id = juriscraper_module.split(".")[-1].split("_")[0]
-        gte_date = datetime.strptime(options["date_filed_gte"], "%Y/%m/%d")
-        lte_date = datetime.strptime(options["date_filed_lte"], "%Y/%m/%d")
 
         query = {
             "docket__court_id": court_id,
-            "date_filed__gte": gte_date,
-            "date_filed__lte": lte_date,
+            "date_filed__gte": options["date_filed_gte"],
+            "date_filed__lte": options["date_filed_lte"],
         }
 
         if options["cluster_status"]:
@@ -157,7 +180,19 @@ def handle(self, *args, **options):
         for cluster in qs:
             opinions = cluster.sub_opinions.all()
             for op in opinions:
-                rerun_extract_from_text(op, juriscraper_module, stats)
-
-        logger.info("Modified objects counts: %s", stats)
-        self.stats = stats
+                rerun_extract_from_text(op, juriscraper_module, self.stats)
+
+        logger.info("Modified objects counts: %s", self.stats)
+
+    def parse_input_date(self, date_string: str) -> datetime | str:
+        """Parse a date string in one of the accepted formats
+
+        :param date_string: the date string in "%Y/%m/%d" or "%Y-%m-%d"
+            format, or an empty string
+        :return: an empty string if the input was empty; the parsed
+            datetime otherwise
+        """
+        parsed_date = ""
+        if "/" in date_string:
+            parsed_date = datetime.strptime(date_string, "%Y/%m/%d")
+        elif "-" in date_string:
+            parsed_date = datetime.strptime(date_string, "%Y-%m-%d")
+        return parsed_date
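
Review note: a quick check of the new date handling. The empty-string default passes through `parse_input_date` unchanged, so the "both dates required" validation in `handle` still works:

```python
from datetime import datetime

from cl.scrapers.management.commands.update_from_text import Command

cmd = Command()
assert cmd.parse_input_date("2024/01/31") == datetime(2024, 1, 31)
assert cmd.parse_input_date("2024-01-31") == datetime(2024, 1, 31)
assert cmd.parse_input_date("") == ""  # empty default is preserved
```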
diff --git a/cl/scrapers/tasks.py b/cl/scrapers/tasks.py
index 15500e94bb..7bbc8bb40b 100644
--- a/cl/scrapers/tasks.py
+++ b/cl/scrapers/tasks.py
@@ -30,6 +30,7 @@
 from cl.lib.string_utils import trunc
 from cl.lib.utils import is_iter
 from cl.recap.mergers import save_iquery_to_docket
+from cl.scrapers.utils import scraped_citation_object_is_valid
 from cl.search.models import Docket, Opinion, RECAPDocument
 
 logger = logging.getLogger(__name__)
@@ -71,8 +72,9 @@ def update_document_from_text(
             opinion.cluster.__dict__.update(data)
         elif model_name == "Citation":
             data["cluster_id"] = opinion.cluster_id
-            _, citation_created = ModelClass.objects.get_or_create(**data)
-            metadata_dict["Citation"]["created"] = citation_created
+            if scraped_citation_object_is_valid(data):
+                _, citation_created = ModelClass.objects.get_or_create(**data)
+                metadata_dict["Citation"]["created"] = citation_created
         elif model_name == "Opinion":
             opinion.__dict__.update(data)
         else:
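
Review note: for context on the gate above, `update_document_from_text` consumes a dict keyed by model name, as returned by a scraper's `extract_from_text`. The field values below are illustrative only:

```python
from cl.search.models import Citation

# Only the "Citation" entry goes through scraped_citation_object_is_valid()
# before get_or_create(); invalid citations are skipped, with an error
# logged by the validator
changes = {
    "Docket": {"docket_number": "23-1234"},  # hypothetical values
    "OpinionCluster": {"judges": "Smith"},
    "Citation": {
        "volume": "2024",
        "reporter": "WI",
        "page": "16",
        "type": Citation.NEUTRAL,
    },
}
```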
diff --git a/cl/scrapers/tests.py b/cl/scrapers/tests.py
index 95e1586a21..960d8b8f9f 100644
--- a/cl/scrapers/tests.py
+++ b/cl/scrapers/tests.py
@@ -41,6 +41,7 @@
     get_binary_content,
     get_existing_docket,
     get_extension,
+    scraped_citation_object_is_valid,
     update_or_create_docket,
 )
 from cl.search.factories import (
@@ -874,7 +875,7 @@ def test_federal_jurisdictions(self):
     )
 
 
-class UpdateFromTestCommandTest(TestCase):
+class UpdateFromTextCommandTest(TestCase):
     """Test the input processing and DB querying for the command"""
 
     def setUp(self):
@@ -979,3 +980,23 @@ def test_inputs(self):
             "13",
             "Unpublished docket should not be modified",
         )
+
+    def test_scraped_citation_object_is_valid(self):
+        """Can we validate Citation dicts parsed by `Site.extract_from_text`?"""
+        bad_type = {"reporter": "WI", "type": Citation.FEDERAL}
+        self.assertFalse(
+            scraped_citation_object_is_valid(bad_type),
+            "Citation should be marked as invalid. Type does not match reporter",
+        )
+
+        bad_reporter = {"reporter": "Some text", "type": Citation.NEUTRAL}
+        self.assertFalse(
+            scraped_citation_object_is_valid(bad_reporter),
+            "Citation should be marked as invalid. Reporter does not exist",
+        )
+
+        valid_citation = {"reporter": "WI", "type": Citation.NEUTRAL}
+        self.assertTrue(
+            scraped_citation_object_is_valid(valid_citation),
+            "Citation object should be marked as valid",
+        )
diff --git a/cl/scrapers/utils.py b/cl/scrapers/utils.py
index 31134ce3d2..00c53e2981 100644
--- a/cl/scrapers/utils.py
+++ b/cl/scrapers/utils.py
@@ -14,8 +14,10 @@
 from juriscraper.AbstractSite import logger
 from juriscraper.lib.test_utils import MockRequest
 from lxml import html
+from reporters_db import REPORTERS
 from requests import Response, Session
 
+from cl.citations.utils import map_reporter_db_cite_type
 from cl.corpus_importer.utils import winnow_case_name
 from cl.lib.celery_utils import CeleryThrottle
 from cl.lib.decorators import retry
@@ -466,3 +468,30 @@ def update_or_create_docket(
         setattr(docket, field, value)
 
     return docket
+
+
+def scraped_citation_object_is_valid(citation_object: dict) -> bool:
+    """Validate a Citation dict parsed by `Site.extract_from_text`
+
+    Check that the parsed `Citation.reporter` exists in reporters-db,
+    and that the `Citation.type` matches the reporters-db cite type
+
+    :param citation_object: dict parsed by `Site.extract_from_text`
+    :return: True if the parsed reporter and type match reporters-db;
+        False otherwise
+    """
+    parsed_reporter = citation_object["reporter"]
+    try:
+        # An unknown reporter raises a KeyError here
+        reporter = REPORTERS[parsed_reporter]
+        mapped_type = map_reporter_db_cite_type(reporter[0].get("cite_type"))
+        if mapped_type == citation_object["type"]:
+            return True
+        logger.error(
+            "Citation.type '%s' from `extract_from_text` does not match "
+            "reporters-db type '%s' for reporter '%s'",
+            citation_object["type"],
+            mapped_type,
+            parsed_reporter,
+        )
+    except KeyError:
+        logger.error(
+            "Parsed reporter '%s' does not exist in reporters-db",
+            parsed_reporter,
+        )
+
+    return False
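
Review note: the validator leans on reporters-db, where each reporter abbreviation maps to a list of edition dicts whose `cite_type` drives the comparison. A sketch of the three outcomes, mirroring the new test:

```python
from cl.scrapers.utils import scraped_citation_object_is_valid
from cl.search.models import Citation

scraped_citation_object_is_valid(
    {"reporter": "WI", "type": Citation.NEUTRAL}
)  # True: "WI" maps to a neutral citation in reporters-db

scraped_citation_object_is_valid(
    {"reporter": "WI", "type": Citation.FEDERAL}
)  # False: type mismatch, logged as an error

scraped_citation_object_is_valid(
    {"reporter": "Some text", "type": Citation.NEUTRAL}
)  # False: unknown reporter raises KeyError internally
```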