fix(or): update scraper to solve IndexError
Solves #1200

- Implement backscraper
- Collect citations
- Collect dispositions
- Update example files
- Collect unpublished opinions for orctapp
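A minimal backscrape driver sketch, assuming the usual juriscraper convention in which make_backscrape_iterable() populates site.back_scrape_iterable with (start, end) date tuples spanning first_opinion_date to today in days_interval steps; the exact keyword handling lives outside this diff:

from importlib import import_module

# `or` is a Python keyword, so the scraper module is loaded dynamically
oregon_module = import_module("juriscraper.opinions.united_states.state.or")

site = oregon_module.Site()
for date_range in site.back_scrape_iterable:
    # each item is a (start, end) tuple consumed by _download_backwards
    site._download_backwards(date_range)
    for case in site.cases:
        print(case["date"], case["docket"], case["name"])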
grossir committed Oct 10, 2024
1 parent 865a726 commit 8e16ded
Showing 8 changed files with 19,351 additions and 6,059 deletions.
151 changes: 147 additions & 4 deletions juriscraper/opinions/united_states/state/or.py
@@ -4,15 +4,158 @@
- 2023-11-18: Fixed and updated
"""

from juriscraper.opinions.united_states.state import orctapp
from datetime import datetime, timedelta

from juriscraper.AbstractSite import logger
from juriscraper.OpinionSiteLinear import OpinionSiteLinear


class Site(OpinionSiteLinear):
    court_code = "p17027coll3"
    detail_url = "https://ojd.contentdm.oclc.org/digital/bl/dmwebservices/index.php?q=dmQuery/{}/identi^{}^all^and/title!subjec!descri!dmrecord/title/1024/1/0/0/0/0/json"
    download_url = "https://ojd.contentdm.oclc.org/digital/api/collection/{}/id/{}/download"
    days_interval = 720
    # Earliest opinion as of development in Oct 2024
    first_opinion_date = datetime(2023, 4, 1)

class Site(orctapp.Site):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.court_id = self.__module__
        self.url = (
            "https://www.courts.oregon.gov/publications/sc/Pages/default.aspx"
        )
        self.status = "Published"
        self.court_code = "p17027coll3"
        self.make_backscrape_iterable(kwargs)

        # By default, scrape at most 10 days into the past.
        # Limiting regular scrapes matters here: this scraper makes
        # secondary requests, and the site loads all opinions going
        # back a year, which would generate many hits to the server
        # each time the hourly scraper is triggered.
        # These limits are overridden when backscraping.
        self.start_date = (datetime.today() - timedelta(10)).date()
        self.end_date = (datetime.today() + timedelta(1)).date()

    def _process_html(self):
        for date_header in self.html.xpath(
            "//h4[a[contains(@href, '/dated/')]]"
        ):
            date_string = date_header.text_content().strip()
            if not date_string:
                logger.info("Skipping section with no date string")
                continue

            date = datetime.strptime(date_string, "%m/%d/%Y").date()
            if date > self.end_date:
                # Opinions come in descending date order
                continue
            if date < self.start_date and not self.test_mode_enabled():
                logger.info(
                    "Date %s is out of range [%s to %s]",
                    date,
                    self.start_date,
                    self.end_date,
                )
                break

            self.process_a_date(date_header)

    def process_a_date(self, date_header) -> None:
        """Process a section defined by a date header
        :param date_header: the lxml element containing the date
        :return: None
        """
        date_string = date_header.text_content().strip()

        # orctapp has h5 tags which describe the status of the
        # opinions in the next ul
        for sibling in date_header.xpath("following-sibling::*"):
            if sibling.tag not in ["ul", "h5"]:
                # Time to jump to another date
                break

            if "orctapp" in self.court_id:
                if sibling.tag == "h5":
                    status = sibling.text_content().strip()
                    if status == "Precedential Opinions":
                        status = "Published"
                    elif status == "Nonprecedential Memorandum Opinions":
                        status = "Unpublished"
                    else:
                        status = "Unknown"
            else:
                status = "Published"

            for item in sibling.xpath("li"):
                # Ensure two links are present (skip Petitions
                # for Review rows)
                text = item.text_content().strip()
                anchors = item.xpath(".//a")
                if len(anchors) < 2:
                    logger.info("Skipping row without 2 links. Row: %s", text)
                    continue

                detail_url = anchors[0].xpath("./@href")[0]
                download_url, disposition = self.get_details(detail_url)
                if not download_url:
                    # Usually happens for
                    # "Miscellaneous Supreme Court Dispositions"
                    logger.info("No records for detail JSON")
                    continue

                name = text.split(")", 1)[-1]
                # Clean up names like:
                # "Knopp v. Griffin-Valade (Certified appeal accepted)"
                if "(" in name:
                    name, disposition = name.split("(", 1)
                    disposition = disposition.strip(")")

                self.cases.append(
                    {
                        "date": date_string,
                        "name": name,
                        "docket": anchors[1].text_content().strip(),
                        "url": download_url,
                        "citation": item.xpath("b/text()")[0].strip(),
                        "status": status,
                        "disposition": disposition,
                    }
                )

    def get_details(self, detail_url: str) -> tuple[str, str]:
        """Makes a request to get a case's details, including the download URL
        :param detail_url: the case detail page's URL
        :return: a tuple: (the PDF download URL, the disposition)
        """
        if self.test_mode_enabled():
            return "placeholder url", "placeholder disposition"

        identifier = detail_url.split("=")[-1]
        detail_url = self.detail_url.format(self.court_code, identifier)

        logger.info("Getting detail JSON from %s", detail_url)
        json = self.request["session"].get(detail_url).json()
        logger.debug(json)
        if not json.get("records"):
            return "", ""

        disposition = json["records"][0].get("descri") or ""
        download_url = self.download_url.format(
            self.court_code, json["records"][0]["pointer"]
        )
        return download_url, disposition

    def _download_backwards(self, dates: tuple) -> None:
        """The site loads the last couple of years of data by default,
        so it's not necessary to query the page in a special way to
        target data in those years; we only need to set the proper
        date limits. To backscrape older opinions, we would need to
        target another site.
        """
        self.start_date, self.end_date = dates
        logger.info("Backscraping for range %s %s", *dates)
        self.html = self._download()
        self._process_html()
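For reference, a sketch of the dmwebservices detail JSON that get_details() consumes; only the two fields the scraper reads are shown, and the values are invented placeholders inferred from the parsing code above:

detail_json = {
    "records": [
        {
            "pointer": 12345,      # record id, interpolated into download_url
            "descri": "Affirmed",  # disposition; may be absent or empty
        }
    ]
}

# Mirrors the guard and fallback in get_details(); the old code
# indexed records[0] unconditionally, the source of the IndexError
records = detail_json.get("records")
if records:
    disposition = records[0].get("descri") or ""
    pointer = records[0]["pointer"]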
59 changes: 7 additions & 52 deletions juriscraper/opinions/united_states/state/orctapp.py
@@ -7,62 +7,17 @@
- 2023-11-18: Created
"""

from juriscraper.DeferringList import DeferringList
from juriscraper.OpinionSiteLinear import OpinionSiteLinear
from importlib import import_module

# `or` is a Python reserved keyword; can't import the module as usual
oregon_module = import_module("juriscraper.opinions.united_states.state.or")


class Site(oregon_module.Site):
    court_code = "p17027coll5"

class Site(OpinionSiteLinear):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.court_id = self.__module__
        self.url = (
            "https://www.courts.oregon.gov/publications/coa/Pages/default.aspx"
        )
        self.cases = []
        self.status = "Published"
        self.court_code = "p17027coll5"

    def fetch_url_json(self, identifier):
        """Fetch the download URL for an opinion from the collection's JSON API"""
        url = f"https://ojd.contentdm.oclc.org/digital/bl/dmwebservices/index.php?q=dmQuery/{self.court_code}/identi^{identifier}^all^and/title!subjec!descri!dmrecord/title/1024/1/0/0/0/0/json"
        json = self.request["session"].get(url).json()
        return f"https://ojd.contentdm.oclc.org/digital/api/collection/{self.court_code}/id/{json['records'][0]['pointer']}/download"

    def _process_html(self):
        for header in self.html.xpath("//h4//a/parent::h4"):
            date_string = header.text_content().strip()
            if not date_string:
                continue
            ul = header.xpath("./following-sibling::ul")[0]
            for item in ul.xpath(".//li"):
                # Ensure two links are present (skip Petitions for Review rows)
                # see or_example_2.html
                anchors = item.xpath(".//a")
                if len(anchors) < 2:
                    continue
                text = item.text_content().strip()
                url = anchors[0].xpath("./@href")[0]
                docket = anchors[1].text_content().strip()
                name = text.split(")", 1)[-1]
                self.cases.append(
                    {
                        "date": date_string,
                        "name": name,
                        "docket": docket,
                        "url": url,
                    }
                )

    def _get_download_urls(self):
        """Get download urls
        :return: List URLs
        """

        def fetcher(case):
            if self.test_mode_enabled():
                return case["url"]

            return self.fetch_url_json(case["url"].split("=")[-1][:-4])

        return DeferringList(seed=self.cases, fetcher=fetcher)
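The dynamic import that orctapp.py now relies on exists because `or` is a Python keyword: a literal import of the module is a syntax error, while importlib resolves the dotted path from a string. A standalone sketch of the workaround:

from importlib import import_module

# import juriscraper.opinions.united_states.state.or  # SyntaxError
oregon_module = import_module("juriscraper.opinions.united_states.state.or")

# The module object works normally once loaded; orctapp subclasses its Site
assert hasattr(oregon_module, "Site")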