Skip to content

Commit

Permalink
feat(scrapers.update_from_text): refactor from code review
Browse files Browse the repository at this point in the history
- validate citation objects from `Site.extract_from_text`. Add tests for this
- abstract --courts required argument for scrapers into ScraperCommand class
also, made it more flexible
- refactor cl_scrape_opinions; cl_scrape_oral_arguments to account for this
- delete cl.scrapers.utils.extract_recap_documents which was generating a
circular import. This function was not used anywhere
  • Loading branch information
grossir committed Oct 28, 2024
1 parent 5baa691 commit d39abee
Show file tree
Hide file tree
Showing 9 changed files with 214 additions and 105 deletions.
36 changes: 36 additions & 0 deletions cl/lib/command_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@

from django.core.management import BaseCommand, CommandError

from cl.lib.juriscraper_utils import get_module_by_court_id

logger = logging.getLogger(__name__)


Expand All @@ -22,6 +24,40 @@ def handle(self, *args, **options):
juriscraper_logger.setLevel(logging.DEBUG)


class ScraperCommand(VerboseCommand):
    """Base class for cl.scrapers commands that use Juriscraper

    Implements the required `--courts` argument. Its value is stored under
    `options["court_id"]` and is either the string passed by the user (when
    it contains a dot) or the module path returned by
    `get_module_by_court_id`
    """

    # To be used on get_module_by_court_id
    # Defined by inheriting classes
    juriscraper_module_type = ""

    def _resolve_court_id(self, value: str) -> str:
        """Convert the raw `--courts` value into a module path

        :param value: a dotted module / package path, or a bare court_id
        :return: `value` unchanged if it contains a dot; otherwise the
            result of `get_module_by_court_id`
        :raises argparse.ArgumentTypeError: if the court_id lookup raised
            a ValueError
        """
        # Imported locally so this class does not depend on the module's
        # top-level import block
        import argparse

        if "." in value:
            return value

        try:
            return get_module_by_court_id(value, self.juriscraper_module_type)
        except ValueError as exc:
            # argparse replaces a plain ValueError with a generic
            # "invalid ... value" message; ArgumentTypeError keeps the
            # explanatory text (no match / more than one match)
            raise argparse.ArgumentTypeError(str(exc)) from exc

    def add_arguments(self, parser):
        """Register the required `--courts` argument on `parser`"""
        parser.add_argument(
            "--courts",
            dest="court_id",
            metavar="COURTID",
            type=self._resolve_court_id,
            required=True,
            help=(
                "The court(s) to scrape and extract. One of: "
                "1. a python module or package import from the Juriscraper "
                "library, e.g. "
                "'juriscraper.opinions.united_states.federal_appellate.ca1' "
                "or simply 'juriscraper.opinions' to do all opinions. "
                "2. a court_id, to be used to look up a full module path. "
                "An error will be raised if the `court_id` matches more than "
                "one module path. In that case, use the full path"
            ),
        )


class CommandUtils:
"""A mixin to give some useful methods to sub classes."""

Expand Down
52 changes: 49 additions & 3 deletions cl/lib/juriscraper_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,12 @@
import juriscraper


def walk_juriscraper():
    """Walk the modules and packages found under the juriscraper package

    :return: the iterator created by `pkgutil.walk_packages` over
        `juriscraper.__path__`, with the package name plus a dot used as
        prefix for every yielded name
    """
    search_paths = juriscraper.__path__
    name_prefix = f"{juriscraper.__name__}."
    return pkgutil.walk_packages(search_paths, name_prefix)


def get_scraper_object_by_name(court_id: str, juriscraper_module: str = ""):
"""Identify and instantiate a Site() object given the name of a court
Expand All @@ -25,9 +31,7 @@ def get_scraper_object_by_name(court_id: str, juriscraper_module: str = ""):

return importlib.import_module(juriscraper_module).Site()

for _, full_module_path, _ in pkgutil.walk_packages(
juriscraper.__path__, f"{juriscraper.__name__}."
):
for _, full_module_path, _ in walk_juriscraper():
# Get the module name from the full path and trim
# any suffixes like _p, _u
module_name = full_module_path.rsplit(".", 1)[1].rsplit("_", 1)[0]
Expand All @@ -42,3 +46,45 @@ def get_scraper_object_by_name(court_id: str, juriscraper_module: str = ""):
# has been stripped off it. In any case, just ignore it when
# this happens.
continue


def get_module_by_court_id(court_id: str, module_type: str) -> str:
    """Given a `court_id` return a juriscraper module path

    Some court_ids match multiple scraper files. These will force the user
    to use the full module path. For example, "lactapp_1" and "lactapp_5"
    match the same `court_id`, but scrape totally different sites, and
    their Site objects are expected to have different `extract_from_text`
    behavior

    :param court_id: court id to look for
    :param module_type: 'opinions' or 'oral_args'. Without this, some
        court_ids may match the 2 classes of scrapers
    :raises: ValueError if `module_type` is not valid, if there is no match
        or if there is more than 1 match
    :return: the full module path
    """
    if module_type not in ["opinions", "oral_args"]:
        raise ValueError(
            "module_type has to be one of ['opinions', 'oral_args']"
        )

    matches = []
    for _, module_string, _ in walk_juriscraper():
        if module_string.count(".") != 4 or module_type not in module_string:
            # Skip folder and lib modules. Skip type
            continue

        # Last path component, with anything after its last underscore
        # trimmed, so "lactapp_1" is compared as "lactapp"
        module_court_id = module_string.rsplit(".", 1)[1].rsplit("_", 1)[0]
        if module_court_id == court_id:
            matches.append(module_string)

    if len(matches) == 1:
        return matches[0]

    if not matches:
        raise ValueError(f"'{court_id}' doesn't match any juriscraper module")

    # The two sentences need a separating space; the adjacent literals are
    # concatenated as written
    raise ValueError(
        f"'{court_id}' matches more than 1 juriscraper module. "
        f"Use a full module path. Matches: '{matches}'"
    )
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@

class Command(cl_back_scrape_opinions.Command):
scrape_target_descr = "citations"
juriscraper_module_type = "opinions"

def scrape_court(
self,
Expand Down
20 changes: 4 additions & 16 deletions cl/scrapers/management/commands/cl_scrape_opinions.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@

from cl.alerts.models import RealTimeQueue
from cl.citations.utils import map_reporter_db_cite_type
from cl.lib.command_utils import VerboseCommand, logger
from cl.lib.command_utils import ScraperCommand, logger
from cl.lib.crypto import sha1
from cl.lib.string_utils import trunc
from cl.people_db.lookup_utils import lookup_judges_by_messy_str
Expand Down Expand Up @@ -217,14 +217,16 @@ def save_everything(
)


class Command(VerboseCommand):
class Command(ScraperCommand):
help = "Runs the Juriscraper toolkit against one or many jurisdictions."
juriscraper_module_type = "opinions"
scrape_target_descr = "opinions" # for logging purposes

def __init__(self, stdout=None, stderr=None, no_color=False):
    """Initialize the command, forwarding the output options to the parent

    :param stdout: passed through to the parent class `__init__`
    :param stderr: passed through to the parent class `__init__`
    :param no_color: passed through to the parent class `__init__`
    """
    # Forward the values received from the caller. Passing literal
    # None / None / False here would silently discard them
    super().__init__(stdout=stdout, stderr=stderr, no_color=no_color)

def add_arguments(self, parser):
super().add_arguments(parser)
parser.add_argument(
"--daemon",
action="store_true",
Expand All @@ -246,20 +248,6 @@ def add_arguments(self, parser):
"is 30 minutes."
),
)
parser.add_argument(
"--courts",
type=str,
dest="court_id",
metavar="COURTID",
required=True,
help=(
"The court(s) to scrape and extract. This should be "
"in the form of a python module or package import "
"from the Juriscraper library, e.g. "
'"juriscraper.opinions.united_states.federal_appellate.ca1" '
'or simply "opinions" to do all opinions.'
),
)
parser.add_argument(
"--fullcrawl",
dest="full_crawl",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,7 @@ def make_objects(

class Command(cl_scrape_opinions.Command):
scrape_target_descr = "oral arguments"
juriscraper_module_type = "oral_args"

def ingest_a_case(
self,
Expand Down
97 changes: 66 additions & 31 deletions cl/scrapers/management/commands/update_from_text.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
import traceback
from datetime import datetime

from django.db import transaction

from cl.lib.command_utils import VerboseCommand, logger
from cl.lib.command_utils import ScraperCommand, logger
from cl.scrapers.tasks import update_document_from_text
from cl.search.models import PRECEDENTIAL_STATUS, Opinion, OpinionCluster

Expand All @@ -23,15 +24,37 @@ def rerun_extract_from_text(
:return None
"""
if not opinion.plain_text and not opinion.html:
# May be an opinion entirely from a merged corpus
# or an error during text extraction
logger.info(
"Opinion %s has no `plain_text` or `html` to extract from",
opinion.id,
)
stats["No text to extract from"] += 1
return

with transaction.atomic():
changes = update_document_from_text(opinion, juriscraper_module)
try:
changes = update_document_from_text(opinion, juriscraper_module)
except:
# Probably a bad implementation of `extract_from_text`
logger.debug(
"`update_document_from_text` failed for opinion %s. Traceback: %s",
opinion.id,
traceback.format_exc(),
)
stats["Error"] += 1
return

if not changes:
logger.info("Did not get any metadata for opinion %s", opinion.id)
stats["No metadata extracted"] += 1
return

logger.info("Processing opinion %s", opinion.id)

# Check if changes exist before saving, to prevent unecessary DB queries
# Check if changes exist before saving, to prevent unnecessary DB queries
if changes.get("Docket"):
opinion.cluster.docket.save()
logger.debug(
Expand Down Expand Up @@ -67,7 +90,7 @@ def rerun_extract_from_text(
)


class Command(VerboseCommand):
class Command(ScraperCommand):
help = """Updates objects by running Site.extract_from_text
over extracted content found on Opinion.plain_text or Opinion.html.
Expand All @@ -79,18 +102,20 @@ class Command(VerboseCommand):
and check if updates over Docket, OpinionCluster, Opinion and
Citation are as expected
"""
stats = {} # assigned at the end of a command run, for testing
# For aggregate reporting at the end of the command
stats = {
"Docket": 0,
"OpinionCluster": 0,
"Opinion": 0,
"Citation": 0,
"No text to extract from": 0,
"No metadata extracted": 0,
"Error": 0,
}
juriscraper_module_type = "opinions"

def add_arguments(self, parser):
parser.add_argument(
"--juriscraper-module",
help="""The Juriscraper file which contains the
`extract_from_text` method to be used. The `court_id`
will be deduced from this. Example:
juriscraper.opinions.united_states.federal_appellate.ca1
""",
required=True,
)
super().add_arguments(parser)
parser.add_argument(
"--opinion-ids",
nargs="+",
Expand All @@ -100,15 +125,17 @@ def add_arguments(self, parser):
other filters will be ignored""",
)
parser.add_argument(
"date-filed-gte",
"--date-filed-gte",
default="",
help=r"""A filter value in %Y/%m/%d format.
type=self.parse_input_date,
help=r"""A filter value in %Y-%m-%d or %Y/%m/%d format.
OpinionCluster.date_filed will have to be greater or equal""",
)
parser.add_argument(
"date-filed-lte",
"--date-filed-lte",
default="",
help=r"""A filter value in %Y/%m/%d format.
type=self.parse_input_date,
help=r"""A filter value in %Y-%m-%d or %Y/%m/%d format.
OpinionCluster.date_filed will have to be less or equal""",
)
parser.add_argument(
Expand All @@ -122,16 +149,14 @@ def add_arguments(self, parser):

def handle(self, *args, **options):
super().handle(*args, **options)
juriscraper_module = options["juriscraper_module"]
# For aggregate reporting
stats = {"Docket": 0, "OpinionCluster": 0, "Opinion": 0, "Citation": 0}
juriscraper_module = options["court_id"]

if options["opinion_ids"]:
opinions = Opinion.objects.filter(id__in=options["opinion_ids"])
for op in opinions:
rerun_extract_from_text(op, juriscraper_module, stats)
rerun_extract_from_text(op, juriscraper_module, self.stats)

logger.info("Modified objects counts: %s", stats)
logger.info("Modified objects counts: %s", self.stats)
return

if not (options["date_filed_gte"] and options["date_filed_lte"]):
Expand All @@ -140,12 +165,10 @@ def handle(self, *args, **options):
)

court_id = juriscraper_module.split(".")[-1].split("_")[0]
gte_date = datetime.strptime(options["date_filed_gte"], "%Y/%m/%d")
lte_date = datetime.strptime(options["date_filed_lte"], "%Y/%m/%d")
query = {
    "docket__court_id": court_id,
    "date_filed__gte": gte_date,
    "date_filed__lte": lte_date,
    # Each bound must read its own option: `gte` is the lower limit and
    # `lte` the upper limit. Crossing them makes any valid range match
    # nothing
    "date_filed__gte": options["date_filed_gte"],
    "date_filed__lte": options["date_filed_lte"],
}

if options["cluster_status"]:
Expand All @@ -157,7 +180,19 @@ def handle(self, *args, **options):
for cluster in qs:
opinions = cluster.sub_opinions.all()
for op in opinions:
rerun_extract_from_text(op, juriscraper_module, stats)

logger.info("Modified objects counts: %s", stats)
self.stats = stats
rerun_extract_from_text(op, juriscraper_module, self.stats)

logger.info("Modified objects counts: %s", self.stats)

def parse_input_date(self, date_string: str) -> datetime | str:
"""Parses a date string in accepted formats
:param date_string: the date string in "%Y/%m/%d" or "%Y-%m-%d"
:return: an empty string if the input was empty; or the date object
"""
parsed_date = ""
if "/" in date_string:
parsed_date = datetime.strptime(date_string, "%Y/%m/%d")
elif "-" in date_string:
parsed_date = datetime.strptime(date_string, "%Y-%m-%d")
return parsed_date
6 changes: 4 additions & 2 deletions cl/scrapers/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
from cl.lib.string_utils import trunc
from cl.lib.utils import is_iter
from cl.recap.mergers import save_iquery_to_docket
from cl.scrapers.utils import scraped_citation_object_is_valid
from cl.search.models import Docket, Opinion, RECAPDocument

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -71,8 +72,9 @@ def update_document_from_text(
opinion.cluster.__dict__.update(data)
elif model_name == "Citation":
data["cluster_id"] = opinion.cluster_id
_, citation_created = ModelClass.objects.get_or_create(**data)
metadata_dict["Citation"]["created"] = citation_created
if scraped_citation_object_is_valid(data):
_, citation_created = ModelClass.objects.get_or_create(**data)
metadata_dict["Citation"]["created"] = citation_created
elif model_name == "Opinion":
opinion.__dict__.update(data)
else:
Expand Down
Loading

0 comments on commit d39abee

Please sign in to comment.