fix(or): update scraper to solve IndexError #1201

Merged: 2 commits, Oct 10, 2024

Changes from 1 commit
151 changes: 147 additions & 4 deletions juriscraper/opinions/united_states/state/or.py
@@ -4,15 +4,158 @@
 - 2023-11-18: Fixed and updated
 """
 
-from juriscraper.opinions.united_states.state import orctapp
+from datetime import datetime, timedelta
+
+from juriscraper.AbstractSite import logger
+from juriscraper.OpinionSiteLinear import OpinionSiteLinear
 
 
-class Site(orctapp.Site):
+class Site(OpinionSiteLinear):
+    court_code = "p17027coll3"
+    detail_url = "https://ojd.contentdm.oclc.org/digital/bl/dmwebservices/index.php?q=dmQuery/{}/identi^{}^all^and/title!subjec!descri!dmrecord/title/1024/1/0/0/0/0/json"
+    download_url = "https://ojd.contentdm.oclc.org/digital/api/collection/{}/id/{}/download"
+    days_interval = 720
+    # Earliest opinion as of development in Oct 2024
+    first_opinion_date = datetime(2023, 4, 1)
+
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
         self.court_id = self.__module__
         self.url = (
             "https://www.courts.oregon.gov/publications/sc/Pages/default.aspx"
         )
         self.status = "Published"
         self.court_code = "p17027coll3"
+        self.make_backscrape_iterable(kwargs)
+
+        # By default, scrape at most 10 days into the past.
+        # It's important to limit regular scrapes: this scraper makes
+        # secondary requests, and the site loads all opinions going
+        # back a year, which would generate many hits to the server
+        # each time the hourly scraper is triggered.
+        # These limits are modified when backscraping.
+        self.start_date = (datetime.today() - timedelta(10)).date()
+        self.end_date = (datetime.today() + timedelta(1)).date()

+    def _process_html(self):
+        for date_header in self.html.xpath(
+            "//h4[a[contains(@href, '/dated/')]]"
+        ):
+            date_string = date_header.text_content().strip()
+            if not date_string:
+                logger.info("Skipping section with no date string")
+                continue
+
+            date = datetime.strptime(date_string, "%m/%d/%Y").date()
+            if date > self.end_date:
+                # Opinions come in descending date order
+                continue
+            if date < self.start_date and not self.test_mode_enabled():
+                logger.info(
+                    "Date %s is out of range [%s to %s]",
+                    date,
+                    self.start_date,
+                    self.end_date,
+                )
+                break
+
+            self.process_a_date(date_header)

+    def process_a_date(self, date_header) -> None:
+        """Process a section defined by a date header
+
+        :param date_header: the lxml element containing the date
+        :return: None
+        """
+        date_string = date_header.text_content().strip()
+
+        # orctapp has h5 tags which describe the status of the
+        # opinions in the next ul
+        for sibling in date_header.xpath("following-sibling::*"):
+            if sibling.tag not in ["ul", "h5"]:
+                # Time to jump to another date
+                break
+
+            if "orctapp" in self.court_id:
+                if sibling.tag == "h5":
+                    status = sibling.text_content().strip()
+                    if status == "Precedential Opinions":
+                        status = "Published"
+                    elif status == "Nonprecedential Memorandum Opinions":
+                        status = "Unpublished"
+                    else:
+                        status = "Unknown"
+            else:
+                status = "Published"
+
+            for item in sibling.xpath("li"):
+                # Ensure two links are present (skip Petitions
+                # for Review rows)
+                text = item.text_content().strip()
+                anchors = item.xpath(".//a")
+                if len(anchors) < 2:
+                    logger.info("Skipping row without 2 links. Row: %s", text)
+                    continue
+
+                detail_url = anchors[0].xpath("./@href")[0]
+                download_url, disposition = self.get_details(detail_url)
+                if not download_url:
+                    # Usually happens for
+                    # "Miscellaneous Supreme Court Dispositions"
+                    logger.info("No records for detail JSON")
+                    continue
+
+                name = text.split(")", 1)[-1]
+                # Clean up names like:
+                # "Knopp v. Griffin-Valade (Certified appeal accepted)"
+                if "(" in name:
+                    name, disposition = name.split("(", 1)
+                    disposition = disposition.strip(")")
+
+                self.cases.append(
+                    {
+                        "date": date_string,
+                        "name": name,
+                        "docket": anchors[1].text_content().strip(),
+                        "url": download_url,
+                        "citation": item.xpath("b/text()")[0].strip(),
+                        "status": status,
+                        "disposition": disposition,
+                    }
+                )

+    def get_details(self, detail_url: str) -> tuple[str, str]:
+        """Makes a request to get a case's details, including the URL
+
+        :param detail_url: the case detail page's URL
+        :return: a tuple: (the PDF download URL, the disposition)
+        """
+        if self.test_mode_enabled():
+            return "placeholder url", "placeholder disposition"
+
+        identifier = detail_url.split("=")[-1]
+        detail_url = self.detail_url.format(self.court_code, identifier)
+
+        logger.info("Getting detail JSON from %s", detail_url)
+        json = self.request["session"].get(detail_url).json()
+        logger.debug(json)
+        if not json.get("records"):
+            return "", ""
+
+        disposition = json["records"][0].get("descri") or ""
+        download_url = self.download_url.format(
+            self.court_code, json["records"][0]["pointer"]
+        )
+        return download_url, disposition

+    def _download_backwards(self, dates: tuple) -> None:
+        """Set the date limits and scrape the page.
+
+        The site loads the last couple of years of data by default,
+        so it's not necessary to query the page in a special way to
+        target data in those years; we only need to set the proper
+        date limits. To backscrape older opinions, we would need to
+        target another site.
+        """
+        self.start_date, self.end_date = dates
+        logger.info("Backscraping for range %s %s", *dates)
+        self.html = self._download()
+        self._process_html()
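Note: the IndexError in the PR title came from indexing a "records" list that can be empty; the new get_details guards for that with json.get("records") before touching json["records"][0]. A minimal standalone sketch of the same guard pattern (the function name and signature here are illustrative, not part of the PR; the HTTP fetch is assumed to have already happened):

def build_download_url(detail_json: dict, court_code: str) -> str:
    """Return the PDF download URL, or "" when the detail query has no records.

    Rows such as "Miscellaneous Supreme Court Dispositions" return an
    empty "records" list; indexing it unconditionally is what raised
    the IndexError this PR fixes.
    """
    records = detail_json.get("records") or []
    if not records:
        return ""
    return (
        "https://ojd.contentdm.oclc.org/digital/api/collection/"
        f"{court_code}/id/{records[0]['pointer']}/download"
    )

# An empty detail response no longer raises:
assert build_download_url({"records": []}, "p17027coll3") == ""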
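Note: days_interval and first_opinion_date feed make_backscrape_iterable, which the shared base class uses to split the backscrape span into date windows that are handed, one tuple at a time, to _download_backwards. A rough sketch of that windowing, assuming typical fixed-interval behavior (this is not the shared helper's actual code):

from datetime import date, timedelta

def backscrape_windows(first_opinion_date: date, days_interval: int):
    """Yield (start, end) tuples covering first_opinion_date..today."""
    start = first_opinion_date
    today = date.today()
    while start < today:
        # Each window is at most days_interval days wide
        end = min(start + timedelta(days=days_interval), today)
        yield (start, end)
        start = end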
59 changes: 7 additions & 52 deletions juriscraper/opinions/united_states/state/orctapp.py
@@ -7,62 +7,17 @@
 - 2023-11-18: Created
 """
 
-from juriscraper.DeferringList import DeferringList
-from juriscraper.OpinionSiteLinear import OpinionSiteLinear
+from importlib import import_module
+
+# `or` is a python reserved keyword; can't import the module as usual
+oregon_module = import_module("juriscraper.opinions.united_states.state.or")
 
 
-class Site(OpinionSiteLinear):
+class Site(oregon_module.Site):
+    court_code = "p17027coll5"
+
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
         self.court_id = self.__module__
         self.url = (
             "https://www.courts.oregon.gov/publications/coa/Pages/default.aspx"
         )
-        self.cases = []
-        self.status = "Published"
-        self.court_code = "p17027coll5"
-
-    def fetch_url_json(self, identifier):
-        """"""
-        url = f"https://ojd.contentdm.oclc.org/digital/bl/dmwebservices/index.php?q=dmQuery/{self.court_code}/identi^{identifier}^all^and/title!subjec!descri!dmrecord/title/1024/1/0/0/0/0/json"
-        json = self.request["session"].get(url).json()
-        return f"https://ojd.contentdm.oclc.org/digital/api/collection/{self.court_code}/id/{json['records'][0]['pointer']}/download"
-
-    def _process_html(self):
-        for header in self.html.xpath("//h4//a/parent::h4"):
-            date_string = header.text_content().strip()
-            if not date_string:
-                continue
-            ul = header.xpath("./following-sibling::ul")[0]
-            for item in ul.xpath(".//li"):
-                # Ensure two links are present (skip Petitions for Review rows)
-                # see or_example_2.html
-                anchors = item.xpath(".//a")
-                if not (len(anchors) > 1):
-                    continue
-                text = item.text_content().strip()
-                url = anchors[0].xpath("./@href")[0]
-                docket = anchors[1].text_content().strip()
-                name = text.split(")", 1)[-1]
-                self.cases.append(
-                    {
-                        "date": date_string,
-                        "name": name,
-                        "docket": docket,
-                        "url": url,
-                    }
-                )
-
-    def _get_download_urls(self):
-        """Get download urls
-        :return: List URLs
-        """
-
-        def fetcher(case):
-            if self.test_mode_enabled():
-                return case["url"]
-
-            return self.fetch_url_json(case["url"].split("=")[-1][:-4])
-
-        return DeferringList(seed=self.cases, fetcher=fetcher)
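Note: the inheritance direction is now inverted: orctapp.py reuses the Site in or.py, and because `or` is a Python reserved keyword, plain import syntax cannot reach that module; import_module with the dotted path string is required. A short usage sketch of the same trick (the parse() call and cases attribute follow juriscraper's usual site interface and are assumed here, not shown in this diff):

from importlib import import_module

# "import juriscraper.opinions.united_states.state.or" is a SyntaxError,
# so the module must be loaded by its dotted path string instead
oregon = import_module("juriscraper.opinions.united_states.state.or")

site = oregon.Site()
site.parse()  # assumed entry point; see juriscraper's docs
print(len(site.cases), "cases scraped")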