Commit
Merge branch 'freelawproject:main' into 4507-pray-and-pay-API
v-anne authored Oct 18, 2024
2 parents 7d5482c + 8e4aac4 commit dc30884
Showing 4 changed files with 67 additions and 14 deletions.
6 changes: 5 additions & 1 deletion cl/corpus_importer/tasks.py
@@ -2231,7 +2231,11 @@ def update_rd_metadata(
item=rd,
)
if response.is_success:
rd.page_count = response.text
rd.page_count = int(response.text)

assert isinstance(
rd.page_count, (int, type(None))
), "page_count must be an int or None."

# Save and extract, skipping OCR.
rd.save()
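The hunk above coerces the page count from the raw HTTP response body to an int before the type assertion runs. A minimal, self-contained sketch of what this enforces, using stand-in objects (the names and shapes here are assumptions for illustration; the real code operates on a Django model row and an HTTP response):

    from types import SimpleNamespace

    # Stand-ins for the document row and response, not the actual classes.
    rd = SimpleNamespace(page_count=None)
    response = SimpleNamespace(is_success=True, text="12")

    if response.is_success:
        rd.page_count = int(response.text)  # was: rd.page_count = response.text

    assert isinstance(
        rd.page_count, (int, type(None))
    ), "page_count must be an int or None."
    print(rd.page_count + 1)  # 13; the old str value would raise TypeError here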
21 changes: 20 additions & 1 deletion cl/scrapers/management/commands/cl_back_scrape_opinions.py
@@ -1,3 +1,5 @@
import time

from juriscraper import AbstractSite
from juriscraper.AbstractSite import logger
from juriscraper.lib.importer import site_yielder
@@ -27,6 +29,14 @@ def add_backscraper_arguments(parser) -> None:
"imposes a limit of returned documents",
type=int,
)
parser.add_argument(
"--backscrape-wait",
type=int,
default=0,
help="Seconds to wait after consuming each element "
"of the backscrape iterable. Useful to avoid overloading"
" a target server when backscraping.",
)


class Command(cl_scrape_opinions.Command):
@@ -41,7 +51,7 @@ def parse_and_scrape_site(
) -> None:
"""Parse the site and scrape it using the backscraper
:param mod: The jusriscraper Site object to scrape
:param mod: The juriscraper Site object to scrape
:param options: argparse kwargs dictionary. May contain the following keys:
- full_crawl: Whether or not to do a full crawl (Ignored value)
- backscrape_start: string which may be a date, year, index, etc.
@@ -50,6 +60,8 @@ def parse_and_scrape_site(
- backscrape_end: end value for backscraper range
- days_interval: days between each (start, end) date pairs in the
Site.back_scrape_iterable
- backscrape_wait: Seconds to wait after consuming each element
of the backscrape iterable
:return: None
"""
@@ -66,5 +78,12 @@ def parse_and_scrape_site(
site.parse()
self.scrape_court(site, full_crawl=True)

if wait := options["backscrape_wait"]:
logger.info(
"Sleeping for %s seconds before continuing backscrape",
wait,
)
time.sleep(wait)

def save_everything(self, items, index=False, backscrape=True):
super().save_everything(items, index, backscrape)
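Both backscraper commands (this one and the variant below) now share the same throttle: consume one element of the backscrape iterable, scrape it, then sleep. A condensed sketch under assumed names (`backscrape_all`, `sites`, and `scrape_one` stand in for the command's `site_yielder(...)` loop and `self.scrape_court`):

    import logging
    import time

    logger = logging.getLogger(__name__)

    def backscrape_all(sites, scrape_one, wait: int = 0) -> None:
        """Scrape each site in the backscrape iterable, sleeping between items."""
        for site in sites:
            scrape_one(site)
            # Mirrors `if wait := options["backscrape_wait"]`: with the
            # default of 0, the branch is skipped and no sleep happens.
            if wait:
                logger.info(
                    "Sleeping for %s seconds before continuing backscrape", wait
                )
                time.sleep(wait)

Invoking the command with, e.g., --backscrape-wait 5 inserts a five-second pause between date ranges; the default of 0 preserves the old behavior.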
@@ -1,3 +1,5 @@
import time

from juriscraper.AbstractSite import logger
from juriscraper.lib.importer import site_yielder

@@ -26,3 +28,10 @@ def parse_and_scrape_site(self, mod, options: dict):
):
site.parse()
self.scrape_court(site, full_crawl=True, backscrape=True)

if wait := options["backscrape_wait"]:
logger.info(
"Sleeping for %s seconds before continuing backscrape",
wait,
)
time.sleep(wait)
45 changes: 33 additions & 12 deletions cl/search/management/commands/generate_cap_crosswalk.py
@@ -7,8 +7,6 @@
from botocore.exceptions import ClientError
from django.conf import settings
from django.core.management.base import BaseCommand
from eyecite.find import get_citations
from eyecite.models import FullCaseCitation
from tqdm import tqdm

from cl.lib.command_utils import CommandUtils
@@ -61,6 +59,11 @@ def add_arguments(self, parser):
help="Directory to save crosswalk files",
required=True,
)
parser.add_argument(
"--start-from-reporter",
type=str,
help="Process starting from this reporter slug",
)

def handle(self, *args: Any, **options: Any) -> None:
if options["verbose"]:
@@ -70,6 +73,7 @@ def handle(self, *args: Any, **options: Any) -> None:
self.single_reporter = options["reporter"]
self.single_volume = options["volume"]
self.output_dir = options["output_dir"]
self.start_from_reporter = options["start_from_reporter"]

if not os.path.exists(self.output_dir):
os.makedirs(self.output_dir)
@@ -111,6 +115,27 @@ def generate_complete_crosswalk(self) -> None:
r for r in reporters if r["short_name"] == self.single_reporter
]

if self.start_from_reporter:
reporter_item_index = next(
(
index
for index, item in enumerate(reporters)
if item["slug"] == self.start_from_reporter
),
None,
)
if reporter_item_index is not None:
logger.info(
f"Starting from reporter: {self.start_from_reporter}"
)
reporters = reporters[reporter_item_index:]
self.start_from_reporter = None
else:
# Invalid reporter slug
raise ValueError(
f"Invalid reporter slug to start from: {self.start_from_reporter}"
)

for i, reporter in enumerate(
tqdm(reporters, desc="Processing reporters")
):
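A worked sketch of the resume lookup above, on hypothetical data; it shows why the guard must be `is not None` rather than a bare truthiness check, since a slug matching the first reporter yields index 0:

    # Hypothetical two-reporter list; slugs are illustrative only.
    reporters = [{"slug": "us"}, {"slug": "wis-2d"}]
    start_from = "us"
    idx = next(
        (i for i, r in enumerate(reporters) if r["slug"] == start_from), None
    )
    if idx is not None:  # a bare `if idx:` would wrongly reject a match at index 0
        reporters = reporters[idx:]
    else:
        raise ValueError(f"Invalid reporter slug to start from: {start_from}")
    assert [r["slug"] for r in reporters] == ["us", "wis-2d"]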
@@ -164,10 +189,6 @@ def generate_crosswalk_for_reporter(
logger.info(
f"Match found: CAP ID {case_meta['id']} -> CL ID {cl_case.id}"
)
else:
logger.info(
f"No match found for CAP ID {case_meta['id']}"
)
else:
logger.warning(
f"Invalid case metadata for CAP ID {case_meta['id']}"
@@ -181,7 +202,7 @@
)

logger.info(
f"Processed {self.total_cases_processed} cases for {reporter_name}, found {self.total_matches_found} matches"
f"Processed {self.total_cases_processed} cases for {reporter_name}({reporter_slug}), found {self.total_matches_found} matches"
)

def fetch_volumes_for_reporter(self, reporter_slug: str) -> List[str]:
@@ -235,17 +256,17 @@ def find_matching_case(
cap_case_id = str(case_meta["id"])
page = str(case_meta["first_page"])

query = f"{reporter_slug}.{volume}/{page}.{cap_case_id}"
query = f"law.free.cap.{reporter_slug}.{volume}/{page}.{cap_case_id}.json"
logger.debug(f"Searching for: {query}")

# Exact match of the file path in this format, e.g.:
# law.free.cap.wis-2d.369/658.6776082.json
matching_cluster = OpinionCluster.objects.filter(
filepath_json_harvard__icontains=query
filepath_json_harvard=query
).first()

if matching_cluster:
logger.info(
f"Match found: CAP ID {cap_case_id} -> CL ID {matching_cluster.id}"
)
# Match found, return object
return matching_cluster
else:
logger.info(
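The final hunk hardens the lookup in two ways: the key now carries the full law.free.cap. prefix and .json suffix, and the queryset filters on an exact filepath_json_harvard value instead of an icontains substring scan. A sketch of the key construction, reusing the values from the example path in the code comment:

    # Values taken from the example in the code comment above.
    reporter_slug, volume, page, cap_case_id = "wis-2d", "369", "658", "6776082"
    query = f"law.free.cap.{reporter_slug}.{volume}/{page}.{cap_case_id}.json"
    assert query == "law.free.cap.wis-2d.369/658.6776082.json"

An exact equality filter can use a database index and cannot collide with partial-path matches, which the substring scan risked.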
