Merge pull request #38 from datamade/feature/37-court-call-scrape
Add court call scrape
Showing 6 changed files with 412 additions and 1 deletion.
@@ -0,0 +1,116 @@
name: Court call scrape

on:
  workflow_dispatch:
  schedule:
    - cron: '15 10 * * *'

jobs:
  scrape:
    name: Scrape court calls
    runs-on: ubuntu-latest

    steps:
      - name: Set current date as env variable
        run: echo "BEGIN_COURTS_RUN=$(date +'%s')" >> $GITHUB_ENV
      - uses: actions/checkout@v3
      - name: upgrade sqlite3
        run: |
          sudo apt-get update
          sudo apt-get install sqlite3
      - name: Install requirements
        run: |
          pip install -U pyopenssl cryptography
          pip install -r requirements.txt
      - name: Download latest database zip
        uses: robinraju/release-downloader@v1.8
        with:
          latest: true
          tag: "nightly"
          fileName: "*.db.zip"

      - name: Decrypt database
        run: |
          unzip -P '${{ secrets.CASE_DB_PW }}' cases.db.zip && rm cases.db.zip
      - name: Scrape court calls
        run: |
          make -f Makefile.courtcalls all
      - name: Setup database for upload
        run: |
          zip -P '${{ secrets.CASE_DB_PW }}' cases.db.zip cases.db
      - name: Upload new release
        uses: WebFreak001/deploy-nightly@v3.0.0
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        with:
          upload_url: https://uploads.github.com/repos/datamade/court-scrapers/releases/131985702/assets{?name,label}
          release_id: 131985702
          asset_path: ./cases.db.zip
          asset_name: cases.db.zip
          asset_content_type: application/zip # required by GitHub API
          max_releases: 7

      - name: Keepalive
        uses: gautamkrishnar/keepalive-workflow@v1

  deploy:
    name: Deploy to Heroku
    needs: scrape
    runs-on: ubuntu-latest

    env:
      HEROKU_ORGANIZATION: ${{ secrets.HEROKU_ORG }}

    steps:
      - uses: actions/checkout@v3

      - name: Install requirements
        run: pip install -r requirements.txt

      - name: Download latest database zip
        uses: robinraju/release-downloader@v1.8
        with:
          latest: true
          tag: "nightly"
          fileName: "*.db.zip"

      - name: Decrypt database
        run: |
          unzip -P '${{ secrets.CASE_DB_PW }}' cases.db.zip
      - name: Install heroku-builds plugin
        run: |
          heroku plugins:install heroku-builds
      - name: Login to Heroku CLI
        uses: akhileshns/heroku-deploy@v3.12.14
        with:
          heroku_api_key: ${{ secrets.HEROKU_API_KEY }}
          heroku_app_name: ""
          heroku_email: ${{ secrets.HEROKU_EMAIL }}
          justlogin: true

      - name: Install Datasette plugins
        run: |
          datasette install datasette-auth-passwords datasette-auth-tokens
      - name: Get hashed Datasette password
        run: |
          # Store hash as an environment variable
          hash=$(echo '${{ secrets.DATASETTE_INSTANCE_PW }}' \
            | datasette hash-password --no-confirm); \
            echo "hash=$hash" >> $GITHUB_ENV
      - name: Deploy Datasette instance to Heroku
        run: |
          datasette publish heroku cases.db \
            -n court-scraper \
            -m metadata.json \
            --setting sql_time_limit_ms 60000 \
            --install datasette-auth-passwords \
            --plugin-secret datasette-auth-passwords root_password_hash '${{ env.hash }}'
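Both jobs decrypt the nightly `cases.db.zip` artifact with `unzip -P`, and the scrape job re-encrypts it with `zip -P` before uploading. For inspecting that artifact locally, a rough Python equivalent of the decrypt step might look like the sketch below; it is an illustration only, assuming the archive uses the traditional ZipCrypto scheme that `zip -P` produces, and the file name and password here are placeholders rather than the repository's secrets.

```python
import zipfile

ARCHIVE = "cases.db.zip"           # downloaded nightly release asset
PASSWORD = b"local-test-password"  # placeholder; the workflow uses secrets.CASE_DB_PW

# zipfile can decrypt legacy ZipCrypto archives, which is what `zip -P` creates
# by default, so no shelling out to unzip is needed for a local look.
with zipfile.ZipFile(ARCHIVE) as archive:
    archive.extract("cases.db", pwd=PASSWORD)
```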
@@ -0,0 +1,14 @@
# Makefile for scraping court calls

.PHONY : all
all: court_calls.csv cases.db
	cat $< | sqlite3 cases.db -init scripts/import_court_calls.sql -bail

court_calls.csv: court_calls.json
	cat $^ | jq '.[] | [.["Case Number"], .["Division"], .["Plaintiff"], .["Defendant"], .["Court Date"], .["Room"], .["District"], .["Sequence #"], .["Time"], .hash] | @csv' -r > $@

court_calls.json: court_calls.jl
	cat $^ | jq --slurp '.' > $@

court_calls.jl : cases.db
	scrapy crawl courtcalls -s CLOSESPIDER_TIMEOUT=14400 -O $@
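The `court_calls.csv` rule uses jq to flatten each scraped record into a fixed column order. A rough Python sketch of the same transformation is shown below, for illustration only; it assumes `court_calls.json` holds the slurped array of records with the field names the spider yields.

```python
import csv
import json

# Column order mirrors the jq expression in the Makefile.
FIELDS = [
    "Case Number", "Division", "Plaintiff", "Defendant", "Court Date",
    "Room", "District", "Sequence #", "Time", "hash",
]

with open("court_calls.json") as infile, \
        open("court_calls.csv", "w", newline="") as outfile:
    writer = csv.writer(outfile)
    for record in json.load(infile):
        # Missing values become empty cells, roughly matching jq's @csv output.
        writer.writerow([record.get(field, "") for field in FIELDS])
```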
@@ -0,0 +1,221 @@
from datetime import datetime, timedelta

from scrapy import Spider, Request
from scrapy.http import FormRequest
from scrapy.exceptions import CloseSpider
from scrapy.spidermiddlewares.httperror import HttpError

from lxml import html

from scripts.hash import dict_hash


class CourtCallSpider(Spider):
    name = "courtcalls"
    url = "https://casesearch.cookcountyclerkofcourt.org/CourtCallSearch.aspx"

    def __init__(self, **kwargs):
        self.failures = set()
        super().__init__(**kwargs)

    def next_business_days(self, n):
        """Yields today's date plus the next n business days."""

        current_date = datetime.today()
        count = 0
        while count <= n:
            yield f"{current_date.month}/{current_date.day}/{current_date.year}"

            next_date = current_date + timedelta(days=1)
            while next_date.weekday() > 4:
                # Skip weekends
                next_date += timedelta(days=1)

            current_date = next_date
            count += 1

    def start_requests(self):
        for date in self.next_business_days(5):
            yield Request(
                CourtCallSpider.url,
                meta={
                    "zyte_api_automap": {
                        "httpResponseHeaders": True,
                        "browserHtml": True,
                        "actions": [
                            # Browser actions executed by the Zyte API: wait for
                            # the search-type radio button and click it, choose
                            # the "CV" division, type the target date, click
                            # Search, and wait for the results panel to render.
                            {
                                "action": "waitForSelector",
                                "selector": {
                                    "type": "css",
                                    "value": "#MainContent_rblSearchType_2",
                                },
                                "timeout": 5,
                                "onError": "return",
                            },
                            {
                                "action": "click",
                                "selector": {
                                    "type": "css",
                                    "value": "#MainContent_rblSearchType_2",
                                },
                                "onError": "return",
                            },
                            {
                                "action": "waitForSelector",
                                "selector": {
                                    "type": "css",
                                    "value": "#MainContent_dtTxt",
                                },
                                "timeout": 5,
                                "onError": "return",
                            },
                            {
                                "action": "select",
                                "selector": {
                                    "type": "css",
                                    "value": "#MainContent_ddlDivisionCode",
                                },
                                "values": ["CV"],
                                "onError": "return",
                            },
                            {
                                "action": "type",
                                "selector": {
                                    "type": "css",
                                    "value": "#MainContent_dtTxt",
                                },
                                "text": date,
                                "onError": "return",
                            },
                            {
                                "action": "click",
                                "selector": {
                                    "type": "css",
                                    "value": "#MainContent_btnSearch",
                                },
                                "onError": "return",
                            },
                            {
                                "action": "waitForSelector",
                                "selector": {
                                    "type": "css",
                                    "value": "#MainContent_pnlResults",
                                },
                                "timeout": 5,
                                "onError": "return",
                            },
                        ],
                    },
                    "date": date,
                    "result_page_num": 1,
                },
                errback=self.handle_error,
                callback=self.parse_results,
            )

    def has_page_num(self, n, response):
        """Check if there's an nth page of court call results."""

        tree = html.fromstring(response.text)
        page_table = tree.xpath("//table")[1]
        next_page_link = page_table.xpath(f".//a[contains(@href,'Page${n}')]")
        return bool(next_page_link)

    def get_court_calls(self, response):
        """Returns the court calls found on a result page."""

        tree = html.fromstring(response.text)
        results_table = tree.xpath("//table[@id='MainContent_grdRecords']")[0]

        no_results = results_table.xpath(
            ".//*[text()[contains(.,'No cases found matching your selected "
            "criteria.')]]"
        )
        if no_results:
            return

        rows = results_table.xpath(".//tr")
        headers = rows[0].xpath(".//a/text()")
        for row in rows[1:-1]:
            cells = row.xpath(".//td/text()")
            if cells:
                yield dict(zip(headers, cells))

    def extract_form(self, response, form_xpath):
        """
        ASP.NET pages are essentially forms that store the data needed to send
        POST requests in hidden form inputs on the page.
        From https://www.trickster.dev/post/scraping-legacy-asp-net-site-with-scrapy-a-real-example/
        """

        form_data = dict()

        for hidden_input in response.xpath(form_xpath).xpath(
            ".//input[@type='hidden']"
        ):
            name = hidden_input.attrib.get("name")
            if name is None:
                continue
            value = hidden_input.attrib.get("value")
            if value is None:
                value = ""

            form_data[name] = value

        return form_data

    def get_page_n_form_data(self, n, response):
        """
        Returns the form fields needed to send a POST request
        for the nth page of court call results.
        """

        form_data = self.extract_form(response, "//form[@id='ctl01']")
        form_data["__EVENTTARGET"] = "ctl00$MainContent$grdRecords"
        form_data["__EVENTARGUMENT"] = f"Page${n}"
        return form_data

    def parse_results(self, response):
        # Materialize the generator so the empty-results check below works
        results = list(self.get_court_calls(response))
        if not results:
            return

        for court_call in results:
            court_call["hash"] = dict_hash(court_call)
            yield court_call

        # Request the next page of results
        next_page_num = response.meta["result_page_num"] + 1
        next_page_exists = self.has_page_num(next_page_num, response)
        if not next_page_exists:
            return

        next_page_form_data = self.get_page_n_form_data(next_page_num, response)
        yield FormRequest.from_response(
            response,
            meta={"result_page_num": next_page_num},
            formxpath="//form[@id='ctl01']",
            formdata=next_page_form_data,
            callback=self.parse_results,
            dont_click=True,
        )

    def _failing_responses(self, response):
        self.failures.add(
            f"{response.meta['date']} page {response.meta['result_page_num']}"
        )

        self.logger.info(f'failures: {", ".join(sorted(self.failures))}')

        if len(self.failures) > 20:
            raise CloseSpider("run of failures")

    def handle_error(self, failure):
        if failure.check(HttpError):
            response = failure.value.response
            if response.status in (404, 500):
                self._failing_responses(response)
        else:
            self.logger.error(repr(failure))
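`parse_results` stamps every record with `dict_hash` from the `scripts.hash` module, which is not part of this diff. A minimal sketch of what such a helper commonly looks like follows; it is an assumption for illustration, not the repository's actual implementation.

```python
import hashlib
import json


def dict_hash(record):
    """Return a stable digest of a dict, independent of key order."""
    # Serializing with sorted keys makes the same record always hash the same.
    canonical = json.dumps(record, sort_keys=True, default=str)
    return hashlib.md5(canonical.encode("utf-8")).hexdigest()
```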