Merge pull request #38 from datamade/feature/37-court-call-scrape
Add court call scrape
antidipyramid authored Mar 15, 2024
2 parents 60570d2 + a5675f9 commit aa956d2
Showing 6 changed files with 412 additions and 1 deletion.
116 changes: 116 additions & 0 deletions .github/workflows/court_calls.yml
@@ -0,0 +1,116 @@
name: Court call scrape

on:
workflow_dispatch:
schedule:
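# every day at 10:15 UTC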
- cron: '15 10 * * *'

jobs:
scrape:
name: Scrape court calls
runs-on: ubuntu-latest

steps:
- name: Set current date as env variable
run: echo "BEGIN_COURTS_RUN=$(date +'%s')" >> $GITHUB_ENV
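# date +'%s' records the scrape start time as a Unix timestamp (seconds since the epoch)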
- uses: actions/checkout@v3
- name: upgrade sqlite3
run: |
sudo apt-get update
sudo apt-get install sqlite3
- name: Install requirements
run: |
pip install -U pyopenssl cryptography
pip install -r requirements.txt
- name: Download latest database zip
uses: robinraju/release-downloader@v1.8
with:
latest: true
tag: "nightly"
fileName: "*.db.zip"

- name: Decrypt database
run: |
unzip -P '${{ secrets.CASE_DB_PW }}' cases.db.zip && rm cases.db.zip
- name: Scrape court calls
run: |
make -f Makefile.courtcalls all
- name: Setup database for upload
run: |
zip -P '${{ secrets.CASE_DB_PW }}' cases.db.zip cases.db
- name: Upload new release
uses: WebFreak001/deploy-nightly@v3.0.0
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
with:
upload_url: https://uploads.github.com/repos/datamade/court-scrapers/releases/131985702/assets{?name,label}
release_id: 131985702
asset_path: ./cases.db.zip
asset_name: cases.db.zip
asset_content_type: application/zip # required by GitHub API
max_releases: 7

- name: Keepalive
uses: gautamkrishnar/keepalive-workflow@v1
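# keepalive-workflow keeps GitHub from disabling this scheduled workflow after 60 days of repository inactivity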

deploy:
name: Deploy to Heroku
needs: scrape
runs-on: ubuntu-latest

env:
HEROKU_ORGANIZATION: ${{ secrets.HEROKU_ORG }}

steps:
- uses: actions/checkout@v3

- name: Install requirements
run: pip install -r requirements.txt

- name: Download latest database zip
uses: robinraju/release-downloader@v1.8
with:
latest: true
tag: "nightly"
fileName: "*.db.zip"

- name: Decrypt database
run: |
unzip -P '${{ secrets.CASE_DB_PW }}' cases.db.zip
- name: Install heroku-builds plugin
run: |
heroku plugins:install heroku-builds
- name: Login to Heroku CLI
uses: akhileshns/heroku-deploy@v3.12.14
with:
heroku_api_key: ${{ secrets.HEROKU_API_KEY }}
heroku_app_name: ""
heroku_email: ${{ secrets.HEROKU_EMAIL }}
justlogin: true
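# justlogin only authenticates the Heroku CLI; the actual deploy happens in the datasette publish step below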

- name: Install Datasette plugins
run: |
datasette install datasette-auth-passwords datasette-auth-tokens
- name: Get hashed Datasette password
run: |
# Store hash as an environment variable
hash=$(echo '${{ secrets.DATASETTE_INSTANCE_PW }}' \
| datasette hash-password --no-confirm); \
echo "hash=$hash" >> $GITHUB_ENV
- name: Deploy Datasette instance to Heroku
run: |
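# Publish cases.db as a Datasette app on Heroku, password-protected via datasette-auth-passwords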
datasette publish heroku cases.db \
-n court-scraper \
-m metadata.json \
--setting sql_time_limit_ms 60000 \
--install datasette-auth-passwords \
--plugin-secret datasette-auth-passwords root_password_hash '${{ env.hash }}'
2 changes: 1 addition & 1 deletion .github/workflows/nightly.yml
@@ -35,7 +35,7 @@ jobs:
run: |
unzip -P '${{ secrets.CASE_DB_PW }}' cases.db.zip && rm cases.db.zip
-  - name: Run scrape
+  - name: Run case scrape
run: |
echo $BEGIN_COURTS_RUN
make get_new_records
14 changes: 14 additions & 0 deletions Makefile.courtcalls
@@ -0,0 +1,14 @@
# Makefile for scraping court calls
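# Pipeline: scrapy writes court_calls.jl (JSON lines), jq slurps it into
# court_calls.json and flattens it to court_calls.csv, and sqlite3 loads the
# CSV into cases.db via scripts/import_court_calls.sql.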

.PHONY : all
all: court_calls.csv cases.db
cat $< | sqlite3 cases.db -init scripts/import_court_calls.sql -bail

court_calls.csv: court_calls.json
cat $^ | jq '.[] | [.["Case Number"], .["Division"], .["Plaintiff"], .["Defendant"], .["Court Date"], .["Room"], .["District"], .["Sequence #"], .["Time"], .hash] | @csv' -r > $@

court_calls.json: court_calls.jl
cat $^ | jq --slurp '.' > $@

court_calls.jl : cases.db
scrapy crawl courtcalls -s CLOSESPIDER_TIMEOUT=14400 -O $@
221 changes: 221 additions & 0 deletions courtscraper/spiders/court_calls.py
@@ -0,0 +1,221 @@
from datetime import datetime, timedelta

from scrapy import Spider, Request
from scrapy.http import FormRequest
from scrapy.exceptions import CloseSpider
from scrapy.spidermiddlewares.httperror import HttpError

from lxml import html

from scripts.hash import dict_hash


class CourtCallSpider(Spider):
name = "courtcalls"
url = "https://casesearch.cookcountyclerkofcourt.org/CourtCallSearch.aspx"

def __init__(self, **kwargs):
self.failures = set()
super().__init__(**kwargs)

def next_business_days(self, n):
"""Returns the dates of the next n business days."""

current_date = datetime.today()
count = 0
while count <= n:
yield f"{current_date.month}/{current_date.day}/{current_date.year}"

next_date = current_date + timedelta(days=1)
while next_date.weekday() > 4:
# Skip weekends
next_date += timedelta(days=1)

current_date = next_date
count += 1

def start_requests(self):
for date in self.next_business_days(5):
yield Request(
CourtCallSpider.url,
meta={
"zyte_api_automap": {
"httpResponseHeaders": True,
"browserHtml": True,
"actions": [
{
"action": "waitForSelector",
"selector": {
"type": "css",
"value": "#MainContent_rblSearchType_2",
},
"timeout": 5,
"onError": "return",
},
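# Select the third search-type radio button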
{
"action": "click",
"selector": {
"type": "css",
"value": "#MainContent_rblSearchType_2",
},
"onError": "return",
},
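# Wait for the court date input to appear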
{
"action": "waitForSelector",
"selector": {
"type": "css",
"value": "#MainContent_dtTxt",
},
"timeout": 5,
"onError": "return",
},
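# Choose the 'CV' division in the division dropdown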
{
"action": "select",
"selector": {
"type": "css",
"value": "#MainContent_ddlDivisionCode",
},
"values": ["CV"],
"onError": "return",
},
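# Type the target date into the date field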
{
"action": "type",
"selector": {
"type": "css",
"value": "#MainContent_dtTxt",
},
"text": date,
"onError": "return",
},
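# Submit the search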
{
"action": "click",
"selector": {
"type": "css",
"value": "#MainContent_btnSearch",
},
"onError": "return",
},
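# Wait for the results panel to render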
{
"action": "waitForSelector",
"selector": {
"type": "css",
"value": "#MainContent_pnlResults",
},
"timeout": 5,
"onError": "return",
},
],
},
"date": date,
"result_page_num": 1,
},
errback=self.handle_error,
callback=self.parse_results,
)

def has_page_num(self, n, response):
"""Check if there's an nth page of court call results."""

tree = html.fromstring(response.text)
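# Look for a link to page n in the second table on the results page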
page_table = tree.xpath("//table")[1]
next_page_link = page_table.xpath(f".//a[contains(@href,'Page${n}')]")
return bool(next_page_link)

def get_court_calls(self, response):
"""Returns the court calls found on a result page."""

tree = html.fromstring(response.text)
results_table = tree.xpath("//table[@id='MainContent_grdRecords']")[0]

no_results = results_table.xpath(
".//*[text()[contains(.,'No cases found matching your selected"
"criteria.')]]"
)
if no_results:
return

rows = results_table.xpath(".//tr")
headers = rows[0].xpath(".//a/text()")
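# rows[0] holds the column headers; the last row is excluded, and each data row's cells are zipped with the headers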
for row in rows[1:-1]:
cells = row.xpath(".//td/text()")
if cells:
yield dict(zip(headers, cells))

def extract_form(self, response, form_xpath):
"""
ASP.NET pages are essentially forms that store the data needed to send
POST requests in hidden form inputs on the page.
From https://www.trickster.dev/post/scraping-legacy-asp-net-site-with-
scrapy-a-real-example/
"""

form_data = dict()

for hidden_input in response.xpath(form_xpath).xpath(
".//input[@type='hidden']"
):
name = hidden_input.attrib.get("name")
if name is None:
continue
value = hidden_input.attrib.get("value")
if value is None:
value = ""

form_data[name] = value

return form_data

def get_page_n_form_data(self, n, response):
"""
Returns the form fields needed to send a POST request
for the nth page of court call results.
"""

form_data = self.extract_form(response, "//form[@id='ctl01']")
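# Emulate the grid's pager postback: target the records grid and request page n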
form_data["__EVENTTARGET"] = "ctl00$MainContent$grdRecords"
form_data["__EVENTARGUMENT"] = f"Page${n}"
return form_data

def parse_results(self, response):
results = self.get_court_calls(response)
if not results:
return

for court_call in results:
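# Attach a stable content hash, exported alongside the record's other fields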
court_call["hash"] = dict_hash(court_call)
yield court_call

# Request the next page of results
next_page_num = response.meta["result_page_num"] + 1
next_page_exists = self.has_page_num(next_page_num, response)
if not next_page_exists:
return

next_page_form_data = self.get_page_n_form_data(next_page_num, response)
yield FormRequest.from_response(
response,
meta={"result_page_num": next_page_num},
formxpath="//form[@id='ctl01']",
formdata=next_page_form_data,
callback=self.parse_results,
dont_click=True,
)

def _failing_responses(self, response):
self.failures.add(
f"{response.meta['date']} page {response.meta['result_page_num']}"
)

self.logger.info(f'failures: {", ".join(sorted(self.failures))}')

if len(self.failures) > 20:
raise CloseSpider("run of failures")

def handle_error(self, failure):
if failure.check(HttpError):
response = failure.value.response
if response.status in (404, 500):
self._failing_responses(response)
else:
self.logger.error(repr(failure))