feat(scrapers.update_from_text): new command #4520

Open · wants to merge 8 commits into main
36 changes: 36 additions & 0 deletions cl/lib/command_utils.py
@@ -3,6 +3,8 @@

from django.core.management import BaseCommand, CommandError

from cl.lib.juriscraper_utils import get_module_by_court_id

logger = logging.getLogger(__name__)


@@ -22,6 +24,40 @@ def handle(self, *args, **options):
juriscraper_logger.setLevel(logging.DEBUG)


class ScraperCommand(VerboseCommand):
"""Base class for cl.scrapers commands that use Juriscraper

Implements the `--courts` argument used to look up a Site object
"""

# Passed to `get_module_by_court_id`; must be set by inheriting classes
juriscraper_module_type = ""

def add_arguments(self, parser):
parser.add_argument(
"--courts",
dest="court_id",
metavar="COURTID",
type=lambda s: (
s
if "." in s
else get_module_by_court_id(s, self.juriscraper_module_type)
),
required=True,
help=(
"The court(s) to scrape and extract. One of: "
"1. a Python module or package import path from the Juriscraper "
"library, e.g. 'juriscraper.opinions.united_states.federal_appellate.ca1' "
"or simply 'juriscraper.opinions' to do all opinions. "
"2. a court_id, used to look up the full module path. "
"An error will be raised if the court_id matches more than "
"one module path; in that case, use the full path."
),
)

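For illustration, the `type` callable above passes full module paths through untouched and only runs the lookup for bare court ids. A minimal sketch of the two cases (the ca1 path comes from the help text; `get_module_by_court_id` is added below in cl/lib/juriscraper_utils.py):

from cl.lib.juriscraper_utils import get_module_by_court_id

resolve = lambda s: (
    s if "." in s else get_module_by_court_id(s, "opinions")
)
# A dotted path is returned unchanged:
resolve("juriscraper.opinions.united_states.federal_appellate.ca1")
# A bare court_id is resolved to its full module path:
resolve("ca1")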

class CommandUtils:
"""A mixin to give some useful methods to sub classes."""

52 changes: 49 additions & 3 deletions cl/lib/juriscraper_utils.py
@@ -5,6 +5,12 @@
import juriscraper


def walk_juriscraper():
return pkgutil.walk_packages(
juriscraper.__path__, f"{juriscraper.__name__}."
)

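A note on this helper: `pkgutil.walk_packages` yields `(finder, name, ispkg)` tuples for every package and module under `juriscraper`, with `name` as a dotted path. Full scraper modules sit four dots deep, which is what the `count(".") != 4` check in `get_module_by_court_id` below relies on. A quick sketch:

for _, name, ispkg in walk_juriscraper():
    # name is e.g. "juriscraper.opinions" (a package) or
    # "juriscraper.opinions.united_states.federal_appellate.ca1" (a module)
    print(name, ispkg)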

def get_scraper_object_by_name(court_id: str, juriscraper_module: str = ""):
"""Identify and instantiate a Site() object given the name of a court

@@ -25,9 +31,7 @@ def get_scraper_object_by_name(court_id: str, juriscraper_module: str = ""):

return importlib.import_module(juriscraper_module).Site()

for _, full_module_path, _ in pkgutil.walk_packages(
juriscraper.__path__, f"{juriscraper.__name__}."
):
for _, full_module_path, _ in walk_juriscraper():
# Get the module name from the full path and trim
# any suffixes like _p, _u
module_name = full_module_path.rsplit(".", 1)[1].rsplit("_", 1)[0]
@@ -42,3 +46,45 @@ def get_scraper_object_by_name(court_id: str, juriscraper_module: str = ""):
# has been stripped off it. In any case, just ignore it when
# this happens.
continue


def get_module_by_court_id(court_id: str, module_type: str):
"""Given a `court_id` return a juriscraper module path

Some court_ids match multiple scraper modules, which forces the user
to use the full module path. For example, the modules "lactapp_1" and
"lactapp_5" share the court_id "lactapp", but scrape totally different
sites, and their Site objects are expected to have different
`extract_from_text` behavior

:param court_id: court id to look for
:param module_type: 'opinions' or 'oral_args'. Without this, some
court_ids could match both classes of scrapers

:raises: ValueError if there is no match or more than one match
:return: the full module path
"""
if module_type not in ["opinions", "oral_args"]:
raise ValueError(
"module_type has to be one of ['opinions', 'oral_args']"
)

matches = []
for _, module_string, _ in walk_juriscraper():
if module_string.count(".") != 4 or module_type not in module_string:
# Skip package folders and lib modules; skip the other module type
continue

module_court_id = module_string.rsplit(".", 1)[1].rsplit("_", 1)[0]
if module_court_id == court_id:
matches.append(module_string)

if len(matches) == 1:
return matches[0]
elif len(matches) == 0:
raise ValueError(f"'{court_id}' doesn't match any juriscraper module")
else:
raise ValueError(
f"'{court_id}' matches more than 1 juriscraper module. "
f"Use a full module path. Matches: '{matches}'"
)
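A usage sketch of the lookup and its failure modes, using the court ids from the docstring above (the exact ca1 return value assumes the standard Juriscraper layout):

get_module_by_court_id("ca1", "opinions")
# -> "juriscraper.opinions.united_states.federal_appellate.ca1"

get_module_by_court_id("lactapp", "opinions")
# raises ValueError: matched by both lactapp_1 and lactapp_5

get_module_by_court_id("ca1", "citations")
# raises ValueError: module_type has to be one of ['opinions', 'oral_args']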
@@ -24,6 +24,7 @@

class Command(cl_back_scrape_opinions.Command):
scrape_target_descr = "citations"
juriscraper_module_type = "opinions"

def scrape_court(
self,
20 changes: 4 additions & 16 deletions cl/scrapers/management/commands/cl_scrape_opinions.py
@@ -18,7 +18,7 @@

from cl.alerts.models import RealTimeQueue
from cl.citations.utils import map_reporter_db_cite_type
from cl.lib.command_utils import VerboseCommand, logger
from cl.lib.command_utils import ScraperCommand, logger
from cl.lib.crypto import sha1
from cl.lib.string_utils import trunc
from cl.people_db.lookup_utils import lookup_judges_by_messy_str
@@ -217,14 +217,16 @@ def save_everything(
)


class Command(VerboseCommand):
class Command(ScraperCommand):
help = "Runs the Juriscraper toolkit against one or many jurisdictions."
juriscraper_module_type = "opinions"
scrape_target_descr = "opinions" # for logging purposes

def __init__(self, stdout=None, stderr=None, no_color=False):
super().__init__(stdout=None, stderr=None, no_color=False)

def add_arguments(self, parser):
super().add_arguments(parser)
parser.add_argument(
"--daemon",
action="store_true",
@@ -246,20 +248,6 @@ def add_arguments(self, parser):
"is 30 minutes."
),
)
parser.add_argument(
"--courts",
type=str,
dest="court_id",
metavar="COURTID",
required=True,
help=(
"The court(s) to scrape and extract. This should be "
"in the form of a python module or package import "
"from the Juriscraper library, e.g. "
'"juriscraper.opinions.united_states.federal_appellate.ca1" '
'or simply "opinions" to do all opinions.'
),
)
parser.add_argument(
"--fullcrawl",
dest="full_crawl",
@@ -107,6 +107,7 @@ def make_objects(

class Command(cl_scrape_opinions.Command):
scrape_target_descr = "oral arguments"
juriscraper_module_type = "oral_args"

def ingest_a_case(
self,
198 changes: 198 additions & 0 deletions cl/scrapers/management/commands/update_from_text.py
@@ -0,0 +1,198 @@
import traceback
from datetime import datetime

from django.db import transaction

from cl.lib.command_utils import ScraperCommand, logger
from cl.scrapers.tasks import update_document_from_text
from cl.search.models import PRECEDENTIAL_STATUS, Opinion, OpinionCluster


def rerun_extract_from_text(
opinion: Opinion, juriscraper_module: str, stats: dict[str, int]
):
"""
Reruns `update_document_from_text` from the scraper flow, saving changes

`update_document_from_text` calls `Site.extract_from_text` and assigns
any changes to the proper objects, in place, but does not save them.
This method saves the objects that actually changed

:param opinion: the Opinion on which to apply extract_from_text
:param juriscraper_module: the scraper module path
:param stats: dict to accumulate counts for reporting. Modified in place

:return: None
"""
if not opinion.plain_text and not opinion.html:
# May be an opinion entirely from a merged corpus
# or an error during text extraction
logger.info(
"Opinion %s has no `plain_text` or `html` to extract from",
opinion.id,
)
stats["No text to extract from"] += 1
return

with transaction.atomic():
try:
changes = update_document_from_text(opinion, juriscraper_module)
except Exception:
# Probably a bad implementation of `extract_from_text`
logger.debug(
"`update_document_from_text` failed for opinion %s. Traceback: %s",
opinion.id,
traceback.format_exc(),
)
stats["Error"] += 1
return

if not changes:
logger.info("Did not get any metadata for opinion %s", opinion.id)
stats["No metadata extracted"] += 1
return

logger.info("Processing opinion %s", opinion.id)

# Check if changes exist before saving, to prevent unnecessary DB queries
if changes.get("Docket"):
opinion.cluster.docket.save()
logger.debug(
"Docket %s updated with data %s",
opinion.cluster.docket.id,
changes["Docket"],
)
stats["Docket"] += 1

if changes.get("OpinionCluster"):
opinion.cluster.save()
logger.debug(
"OpinionCluster %s updated with data %s",
opinion.cluster.id,
changes["OpinionCluster"],
)
stats["OpinionCluster"] += 1

if changes.get("Opinion"):
opinion.save()
logger.debug("Opinion updated with data %s", changes["Opinion"])
stats["Opinion"] += 1

if changes.get("Citation"):
if changes["Citation"].get("citation_created"):
logger.info(
"Citation created with data %s", changes["Citation"]
)
stats["Citation"] += 1
else:
logger.debug(
"Citation not created. Data %s", changes["Citation"]
)

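For reference, a hypothetical `changes` mapping as consumed above. The four top-level keys and the `citation_created` flag are the ones the function branches on; the field payloads are made-up examples, since `update_document_from_text` determines the real ones:

changes = {
    "Docket": {"docket_number": "23-1234"},  # hypothetical payload
    "OpinionCluster": {"date_filed": "2024-05-01"},  # hypothetical payload
    "Opinion": {"author_str": "Doe"},  # hypothetical payload
    "Citation": {"citation_created": True},  # flag checked above
}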

class Command(ScraperCommand):
help = """Updates objects by running Site.extract_from_text
over extracted content found on Opinion.plain_text or Opinion.html.

If `--opinion-ids` is used, other filters will be ignored.
If not, both date filters are required, to prevent unintentionally
reprocessing a whole court's dataset

Recommended use is to run over a sample of the target time period
and check that updates to Docket, OpinionCluster, Opinion and
Citation are as expected
"""
# For aggregate reporting at the end of the command
stats = {
"Docket": 0,
"OpinionCluster": 0,
"Opinion": 0,
"Citation": 0,
"No text to extract from": 0,
"No metadata extracted": 0,
"Error": 0,
}
juriscraper_module_type = "opinions"

def add_arguments(self, parser):
super().add_arguments(parser)
parser.add_argument(
"--opinion-ids",
nargs="+",
type=int,
help="""The Opinion ids to re-process.
May be more than one. If this argument is used,
other filters will be ignored""",
)
parser.add_argument(
"--date-filed-gte",
default="",
type=self.parse_input_date,
help=r"""A filter value in %Y-%m-%d or %Y/%m/%d format.
OpinionCluster.date_filed must be greater than or equal to this value""",
)
parser.add_argument(
"--date-filed-lte",
default="",
type=self.parse_input_date,
help=r"""A filter value in %Y-%m-%d or %Y/%m/%d format.
OpinionCluster.date_filed must be less than or equal to this value""",
)
parser.add_argument(
"--cluster-status",
default="",
choices=[value for value, name in PRECEDENTIAL_STATUS.NAMES],
help="""A value of OpinionCluster.precedential_status. To be
used for filtering the Opinions to be processed
""",
)

def handle(self, *args, **options):
super().handle(*args, **options)
juriscraper_module = options["court_id"]

if options["opinion_ids"]:
opinions = Opinion.objects.filter(id__in=options["opinion_ids"])
for op in opinions:
rerun_extract_from_text(op, juriscraper_module, self.stats)

logger.info("Modified objects counts: %s", self.stats)
return

if not (options["date_filed_gte"] and options["date_filed_lte"]):
raise ValueError(
"Both `--date-filed-gte` and `--date-filed-lte` must be provided"
)

court_id = juriscraper_module.split(".")[-1].split("_")[0]
query = {
"docket__court_id": court_id,
"date_filed__gte": options["date_filed_gte"],
"date_filed__lte": options["date_filed_lte"],
}

if options["cluster_status"]:
query["precedential_status"] = options["cluster_status"]

qs = OpinionCluster.objects.filter(**query).prefetch_related(
"sub_opinions"
)
for cluster in qs:
opinions = cluster.sub_opinions.all()
for op in opinions:
rerun_extract_from_text(op, juriscraper_module, self.stats)

logger.info("Modified objects counts: %s", self.stats)

def parse_input_date(self, date_string: str) -> datetime | str:
"""Parses a date string in accepted formats

:param date_string: the date string in "%Y/%m/%d" or "%Y-%m-%d"
:return: an empty string if the input was empty; or the parsed datetime
"""
parsed_date = ""
if "/" in date_string:
parsed_date = datetime.strptime(date_string, "%Y/%m/%d")
elif "-" in date_string:
parsed_date = datetime.strptime(date_string, "%Y-%m-%d")
return parsed_date
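Finally, a minimal usage sketch for the new command, equivalent to running it through manage.py. Option names come from `add_arguments` above; the `--courts` value goes through the court_id lookup added in cl/lib/command_utils.py, and "Published" is assumed to be a valid `precedential_status` choice:

from django.core.management import call_command

# Reprocess a six-month sample for one court; both date filters are
# required whenever --opinion-ids is not given.
call_command(
    "update_from_text",
    "--courts", "ca1",
    "--date-filed-gte", "2024/01/01",
    "--date-filed-lte", "2024/06/30",
    "--cluster-status", "Published",
)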