feat(lactapp_5): new scraper for Louisiana Court of Appeals Fifth Circuit #1212

Merged (1 commit) on Oct 18, 2024
4 changes: 4 additions & 0 deletions juriscraper/OpinionSite.py
@@ -38,6 +38,7 @@ def __init__(self, *args, **kwargs):
"per_curiam",
"types",
"other_dates",
"attorneys",
]
self._req_attrs = [
"case_dates",
@@ -134,6 +135,9 @@ def _get_per_curiam(self):
def _get_other_dates(self):
return None

def _get_attorneys(self):
return None

def extract_from_text(self, scraped_text):
"""Pass scraped text into function and return data as a dictionary

5 changes: 5 additions & 0 deletions juriscraper/OpinionSiteLinear.py
@@ -40,6 +40,7 @@ class OpinionSiteLinear(OpinionSite):
"type",
"joined_by",
"other_date",
"attorney",
}

def __init__(self, *args, **kwargs):
@@ -153,6 +154,10 @@ def _get_other_dates(self):
"""Goes into OpinionCluster.other_dates, type: string"""
return self._get_optional_field_by_id("other_date")

def _get_attorneys(self):
"""Goes into OpinionCluster.attorneys, type: string"""
return self._get_optional_field_by_id("attorney")

def _check_sanity(self):
super()._check_sanity()
# Check that all returned keys have the proper name to be used
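For context, a concrete scraper opts into the new field simply by adding an "attorney" key to each case dict; OpinionSiteLinear then exposes it through _get_attorneys(), returning None when the key is absent. A minimal sketch, assuming a hypothetical court site (the URL and markup are placeholders, not part of this PR):

    from juriscraper.OpinionSiteLinear import OpinionSiteLinear


    class Site(OpinionSiteLinear):
        def __init__(self, *args, **kwargs):
            super().__init__(*args, **kwargs)
            self.court_id = self.__module__
            self.url = "https://example-court.gov/opinions"  # placeholder URL
            self.status = "Published"

        def _process_html(self):
            for row in self.html.xpath("//table//tr[td]"):  # hypothetical markup
                self.cases.append({
                    "name": row.xpath("string(td[1])").strip(),
                    "date": row.xpath("string(td[2])").strip(),
                    "docket": row.xpath("string(td[3])").strip(),
                    "url": row.xpath("string(td[4]/a/@href)"),
                    # Optional field: surfaced by _get_attorneys() via
                    # _get_optional_field_by_id("attorney")
                    "attorney": row.xpath("string(td[5])").strip(),
                })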
28 changes: 28 additions & 0 deletions juriscraper/lib/date_utils.py
@@ -4,6 +4,7 @@
from datetime import date
from itertools import zip_longest
from math import ceil
from typing import Union

from dateutil.parser import parser, parserinfo
from dateutil.rrule import DAILY, rrule
@@ -150,3 +151,30 @@ def make_date_range_tuples(start, end, gap):
for d in rrule(DAILY, interval=gap, dtstart=end_start, until=end)
]
return list(zip_longest(start_dates, end_dates, fillvalue=end))


def unique_year_month(
date_list: list[Union[date, datetime.datetime, tuple[date]]],
) -> list[Union[date, datetime.datetime]]:
"""Takes a list of dates or date tuples, and reduces it
to date objects with unique year-month pairs

:param date_list: a list containing dates or tuples of dates;
the default make_backscrape_iterable returns date tuples
:return: a list with date objects of unique year-month pairs
"""
unique_list = []
seen_year_months = set()

for obj in date_list:
if isinstance(obj, date) or isinstance(obj, datetime.datetime):
obj = [obj]

for date_obj in obj:
ym = date_obj.strftime("%Y%m")
if ym in seen_year_months:
continue
seen_year_months.add(ym)
unique_list.append(date_obj)

return unique_list
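A quick illustration of the intended behavior (dates chosen arbitrarily):

    from datetime import date

    from juriscraper.lib.date_utils import unique_year_month

    # Tuples (as produced by the default make_backscrape_iterable) and plain
    # dates mix freely; only the first date seen per year-month survives.
    dates = [
        (date(2024, 1, 3), date(2024, 1, 31)),  # both fall in 2024-01
        date(2024, 1, 15),                      # 2024-01 again, dropped
        date(2024, 2, 1),
    ]
    print(unique_year_month(dates))
    # [datetime.date(2024, 1, 3), datetime.date(2024, 2, 1)]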
1 change: 1 addition & 0 deletions juriscraper/opinions/united_states/state/__init__.py
@@ -61,6 +61,7 @@
"kyctapp",
"la",
"lactapp_1",
"lactapp_5",
"mass",
"massappct",
"massappct_u",
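Once registered here, the new module can be exercised with juriscraper's usual sample-caller workflow (assuming the standard repository setup):

    python sample_caller.py -c juriscraper.opinions.united_states.state.lactapp_5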
133 changes: 133 additions & 0 deletions juriscraper/opinions/united_states/state/lactapp_5.py
@@ -0,0 +1,133 @@
import re
from datetime import date, datetime

from juriscraper.AbstractSite import logger
from juriscraper.lib.date_utils import unique_year_month
from juriscraper.lib.string_utils import titlecase
from juriscraper.OpinionSiteLinear import OpinionSiteLinear


class Site(OpinionSiteLinear):
id_to_case_mapper = {
"lblCaseTitle": "name",
"lblCaseNum": "docket",
"lblRulingJudge": "judge",
"lblDistrictCourtNo": "lower_court_number",
"lblLowerCourt": "lower_court",
"lblAttorney": "attorney",
}
first_opinion_date = datetime(1992, 1, 1)
days_interval = 28 # ensure a tick for each month
date_regex = re.compile(r"\d{2}/\d{2}/\d{4}")

def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.court_id = self.__module__
self.url = "https://www.fifthcircuit.org/searchopinions.aspx"
self.search_is_configured = False
self.parameters = {
"ctl00$cntBody$ctlOpinionSearch_Toggle$ddlSearchOptions": "2",
}
self.target_date = datetime.today()
self.make_backscrape_iterable(kwargs)
self.status = "Unknown"

def _process_html(self):
# We need to do a plain GET to get hidden inputs
# Then we can do our filtered request
if not self.test_mode_enabled():
self.method = "POST"

# We need to set the proper search filter the first time
if not self.search_is_configured:
self.update_hidden_inputs()
self.parameters["__EVENTTARGET"] = (
"ctl00$cntBody$ctlOpinionSearch_Toggle$ddlSearchOptions"
)
self.html = self._download()
self.search_is_configured = True

# Set the proper filters to get the actual data we want
self.update_date_filters()
self.update_hidden_inputs()
self.html = self._download()

count_xpath = "//*[@id='cntBody_ctlOpinionSearch_Toggle_lblRecordCnt']"
logger.info(self.html.xpath(count_xpath)[0].text_content().strip())

for row in self.html.xpath("//tr[.//a[contains(@id, 'HyperLink_')]]"):
fixed_values = {}
for id_part, key in self.id_to_case_mapper.items():
element = row.xpath(f".//*[contains(@id, '{id_part}')]")
if element:
fixed_values[key] = element[0].text_content().strip()

fixed_values["name"] = titlecase(fixed_values["name"])
if fixed_values.get("judge"):
fixed_values["judge"] = re.sub(
r"Hon\.[\s\n]+", "", fixed_values["judge"]
)

# Some cases have more than 1 opinion document (check example 2)
# Some cases have no links, they will be ignored by this loop
for anchor in row.xpath(".//a"):
# The opinion date is sometimes in the disposition text
disposition = ""
case_date = f"{self.target_date.year}/07/01"
date_filed_is_approximate = True
if disp_container := anchor.xpath("following-sibling::text()"):
disposition = disp_container[0].strip()

if date_match := self.date_regex.search(disposition):
case_date = date_match.group(0)
disposition = disposition.rsplit(" on ", 1)[0].strip(
" '"
)
date_filed_is_approximate = False

case = {
"url": anchor.get("href"),
"disposition": disposition,
"date": case_date,
"date_filed_is_approximate": date_filed_is_approximate,
**fixed_values,
}

self.cases.append(case)

def update_hidden_inputs(self) -> None:
"""Parse form values characteristic of aspx sites,
and put them in self.parameters for POST use
"""
for input in self.html.xpath('//input[@type="hidden"]'):
self.parameters[input.get("name")] = input.get("value", "")

def update_date_filters(self) -> None:
"""Set year and month values from `self.target_date`
into self.parameters for POST use
"""
logger.info(
"Scraping for year: %s - month: %s",
self.target_date.year,
self.target_date.month,
)
self.parameters = {
"ctl00$cntBody$ctlOpinionSearch_Toggle$ddlOpnMonth": str(
self.target_date.month
),
"ctl00$cntBody$ctlOpinionSearch_Toggle$ddlOpnYear": str(
self.target_date.year
),
"ctl00$cntBody$ctlOpinionSearch_Toggle$btnSearch": "Search",
}

def _download_backwards(self, target_date: date) -> None:
self.target_date = target_date
self.html = self._download()
self._process_html()

def make_backscrape_iterable(self, kwargs):
super().make_backscrape_iterable(kwargs)
self.back_scrape_iterable = unique_year_month(
self.back_scrape_iterable
)
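The two-request dance in _process_html follows the standard ASP.NET WebForms pattern: the server embeds per-session state in hidden inputs (__VIEWSTATE, __EVENTVALIDATION, and friends) that every subsequent POST must echo back, which is exactly what update_hidden_inputs() collects. A standalone sketch of that pattern using requests and lxml, independent of juriscraper's session handling (field names taken from the scraper above):

    import requests
    from lxml import html

    url = "https://www.fifthcircuit.org/searchopinions.aspx"
    session = requests.Session()

    # 1. Plain GET to harvest the hidden WebForms state.
    page = html.fromstring(session.get(url).text)
    data = {
        inp.get("name"): inp.get("value", "")
        for inp in page.xpath('//input[@type="hidden"]')
    }

    # 2. POST that state back along with the month/year search filters.
    data.update({
        "ctl00$cntBody$ctlOpinionSearch_Toggle$ddlOpnMonth": "1",
        "ctl00$cntBody$ctlOpinionSearch_Toggle$ddlOpnYear": "2024",
        "ctl00$cntBody$ctlOpinionSearch_Toggle$btnSearch": "Search",
    })
    results = html.fromstring(session.post(url, data=data).text)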
14 changes: 4 additions & 10 deletions juriscraper/opinions/united_states/state/sc.py
@@ -23,6 +23,7 @@
from typing import Dict, List, Tuple

from juriscraper.AbstractSite import logger
from juriscraper.lib.date_utils import unique_year_month
from juriscraper.OpinionSiteLinear import OpinionSiteLinear


@@ -80,16 +81,9 @@ def make_backscrape_iterable(
and replace the self.back_scrape_iterable
"""
super().make_backscrape_iterable(kwargs)
backscrape_iterable = []
seen_year_months = set()
for date_obj, _ in self.back_scrape_iterable:
ym = date_obj.strftime("%Y%m")
if ym in seen_year_months:
continue
seen_year_months.add(ym)
backscrape_iterable.append(date_obj)

self.back_scrape_iterable = backscrape_iterable
self.back_scrape_iterable = unique_year_month(
self.back_scrape_iterable
)

def _download_backwards(self, date_obj: date) -> None:
"""Downloads an older page, and parses it
38 changes: 16 additions & 22 deletions juriscraper/oral_args/united_states/federal_appellate/cadc.py
@@ -11,6 +11,7 @@
from urllib.parse import urljoin

from juriscraper.AbstractSite import logger
from juriscraper.lib.date_utils import unique_year_month
from juriscraper.OralArgumentSiteLinear import OralArgumentSiteLinear


@@ -56,16 +57,8 @@ def _process_html(self):
}
)

def _download_backwards(self, url: str) -> None:
logger.info("Backscraping URL '%s'", url)
self.url = url
self.html = self._download()
self._process_html()

def make_backscrape_iterable(self, kwargs: dict) -> None:
"""Use base function to generate a range, then pick
unique year-month combinations to build the backscrape
URLS, and save them to the self.back_scrape_iterable
def _download_backwards(self, target_date: date) -> None:
"""Download historical data

Note that this URL will work:
"https://media.cadc.uscourts.gov/recordings/bydate/2007/9"
@@ -74,16 +67,17 @@ def make_backscrape_iterable(self, kwargs: dict) -> None:

That's why the '%-m' formatter is needed
"""
super().make_backscrape_iterable(kwargs)
seen_year_months = set()
urls = []

for tupl in self.back_scrape_iterable:
for item in tupl:
ym = item.strftime("%Y/%-m")
if ym in seen_year_months:
continue
seen_year_months.add(ym)
urls.append(self.base_url.format(ym))
self.url = self.base_url.format(target_date.strftime("%Y/%-m"))
logger.info("Backscraping URL '%s'", self.url)
self.html = self._download()
self._process_html()

self.back_scrape_iterable = urls
def make_backscrape_iterable(self, kwargs: dict) -> None:
"""Use base function to generate a range, then pick
unique year-month combinations to build the backscrape
URLs
"""
super().make_backscrape_iterable(kwargs)
self.back_scrape_iterable = unique_year_month(
self.back_scrape_iterable
)
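The "%-m" detail called out in the docstring is easy to verify; note that "%-m" is a glibc/BSD strftime extension, so it works on Linux and macOS but is not supported on Windows:

    from datetime import date

    d = date(2007, 9, 1)
    print(d.strftime("%Y/%-m"))  # 2007/9  -- unpadded month, the URL form that works
    print(d.strftime("%Y/%m"))   # 2007/09 -- zero-padded form the docstring warns against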