Skip to content

Commit

Permalink
Merge pull request #30 from datamade/feature/run-rescrape
Browse files Browse the repository at this point in the history
Add rescrape action
  • Loading branch information
antidipyramid authored Feb 7, 2024
2 parents 9c6f348 + 515c5a8 commit b2ea52f
Show file tree
Hide file tree
Showing 2 changed files with 122 additions and 10 deletions.
116 changes: 116 additions & 0 deletions .github/workflows/rescrape.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
# Re-scrapes court cases on a schedule, publishes the refreshed,
# password-protected database as a release asset, and then redeploys the
# Datasette instance on Heroku from that asset.
name: Rescrape cases

on:
  workflow_dispatch:
  schedule:
    # Daily at 21:00 (GitHub evaluates cron schedules in UTC).
    - cron: '0 21 * * *'

jobs:
  scrape:
    name: Re-scrape cases
    runs-on: ubuntu-latest

    steps:
      - name: Set current date as env variable
        # Epoch seconds at the start of the run, exported to later steps.
        run: echo "BEGIN_RESCRAPE_RUN=$(date +'%s')" >> "$GITHUB_ENV"

      - uses: actions/checkout@v3

      - name: upgrade sqlite3
        run: |
          sudo apt-get update
          # -y: the runner is non-interactive, so apt must not prompt.
          sudo apt-get install -y sqlite3

      - name: Install requirements
        run: |
          pip install -U pyopenssl cryptography
          pip install -r requirements.txt

      - name: Download latest database zip
        uses: robinraju/release-downloader@v1.8
        with:
          latest: true
          tag: "nightly"
          fileName: "*.db.zip"

      - name: Decrypt database
        # The secret is passed through the environment rather than spliced
        # into the script text, so special characters in the password cannot
        # break or alter the command.
        env:
          CASE_DB_PW: ${{ secrets.CASE_DB_PW }}
        run: |
          unzip -P "$CASE_DB_PW" cases.db.zip && rm cases.db.zip

      - name: Run scrape
        run: |
          # Log the start timestamp recorded by the first step.
          # NOTE(review): this previously echoed BEGIN_COURTS_RUN, which this
          # workflow never sets; confirm Makefile.update does not expect a
          # variable under that name.
          echo "$BEGIN_RESCRAPE_RUN"
          make -f Makefile.update update_db

      - name: Setup database for upload
        env:
          CASE_DB_PW: ${{ secrets.CASE_DB_PW }}
        run: |
          zip -P "$CASE_DB_PW" cases.db.zip cases.db

      - name: Upload new release
        uses: WebFreak001/deploy-nightly@v3.0.0
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        with:
          upload_url: https://uploads.github.com/repos/datamade/court-scrapers/releases/131985702/assets{?name,label}
          release_id: 131985702
          asset_path: ./cases.db.zip
          asset_name: cases.db.zip
          asset_content_type: application/zip  # required by GitHub API
          max_releases: 7

      - name: Keepalive
        uses: gautamkrishnar/keepalive-workflow@v1

  deploy:
    name: Deploy to Heroku
    # Runs only after the scrape job has finished successfully.
    needs: scrape
    runs-on: ubuntu-latest

    env:
      HEROKU_ORGANIZATION: ${{ secrets.HEROKU_ORG }}

    steps:
      - uses: actions/checkout@v3

      - name: Install requirements
        run: pip install -r requirements.txt

      - name: Download latest database zip
        uses: robinraju/release-downloader@v1.8
        with:
          latest: true
          tag: "nightly"
          fileName: "*.db.zip"

      - name: Decrypt database
        env:
          CASE_DB_PW: ${{ secrets.CASE_DB_PW }}
        run: |
          unzip -P "$CASE_DB_PW" cases.db.zip

      - name: Install heroku-builds plugin
        run: |
          heroku plugins:install heroku-builds

      - name: Login to Heroku CLI
        uses: akhileshns/heroku-deploy@v3.12.14
        with:
          heroku_api_key: ${{ secrets.HEROKU_API_KEY }}
          heroku_app_name: ""
          heroku_email: ${{ secrets.HEROKU_EMAIL }}
          justlogin: true

      - name: Install Datasette plugins
        run: |
          datasette install datasette-auth-passwords datasette-auth-tokens

      - name: Get hashed Datasette password
        env:
          DATASETTE_INSTANCE_PW: ${{ secrets.DATASETTE_INSTANCE_PW }}
        run: |
          # Store hash as an environment variable
          hash=$(printf '%s\n' "$DATASETTE_INSTANCE_PW" \
            | datasette hash-password --no-confirm)
          echo "hash=$hash" >> "$GITHUB_ENV"

      - name: Deploy Datasette instance to Heroku
        run: |
          # "$hash" was exported through GITHUB_ENV by the previous step.
          datasette publish heroku cases.db \
            -n court-scraper \
            -m metadata.json \
            --install datasette-auth-passwords \
            --plugin-secret datasette-auth-passwords root_password_hash "$hash"
16 changes: 6 additions & 10 deletions Makefile.update
Original file line number Diff line number Diff line change
Expand Up @@ -27,26 +27,22 @@ rescraped_cases.db : attorney.csv defendant.csv plaintiff.csv court_case.csv eve
plaintiff case_number court_case case_number \
event case_number court_case case_number
sqlite-utils transform $@ defendant \
--rename _key order \
--drop _key \
--column-order case_number \
--column-order order \
--column-order defendant
sqlite-utils transform $@ attorney \
--rename _key order \
--drop _key \
--column-order case_number \
--column-order order \
--column-order attorney
sqlite-utils transform $@ event \
--rename _key order \
--drop _key \
--column-order case_number \
--column-order order \
--column-order date \
--column-order description \
--column-order comments
sqlite-utils transform $@ plaintiff \
--rename _key order \
--drop _key \
--column-order case_number \
--column-order order \
--column-order plaintiff
sqlite-utils convert $@ court_case filing_date 'r.parsedate(value)'
sqlite-utils convert $@ event date 'r.parsedate(value)'
Expand All @@ -71,10 +67,10 @@ rescraped_cases.json: rescraped_chancery_cases.jl rescraped_civil_cases.jl
cat $^ | jq --slurp '.' > $@

# Runs the `civil` Scrapy spider over the case numbers in the prerequisite
# CSV ($<) and writes the spider's output to the target ($@) via -O.
# CLOSESPIDER_TIMEOUT=7200 is a Scrapy setting that closes the spider after
# 7200 seconds (2 hours).
# NOTE(review): this is a rendered diff without +/- markers; the two recipe
# lines appear to be the old and new versions of one command, the second
# adding the timeout — confirm against Makefile.update.
rescraped_civil_cases.jl : to_rescrape.civil.csv
scrapy crawl civil -a case_numbers_file=$< -O $@
scrapy crawl civil -s CLOSESPIDER_TIMEOUT=7200 -a case_numbers_file=$< -O $@

# Runs the `chancery` Scrapy spider over the case numbers in the prerequisite
# CSV ($<) and writes the spider's output to the target ($@) via -O.
# CLOSESPIDER_TIMEOUT=7200 is a Scrapy setting that closes the spider after
# 7200 seconds (2 hours).
# NOTE(review): this is a rendered diff without +/- markers; the two recipe
# lines appear to be the old and new versions of one command, the second
# adding the timeout — confirm against Makefile.update.
rescraped_chancery_cases.jl : to_rescrape.chancery.csv
scrapy crawl chancery -a case_numbers_file=$< -O $@
scrapy crawl chancery -s CLOSESPIDER_TIMEOUT=7200 -a case_numbers_file=$< -O $@

TO_SCRAPE_QUERY=$(shell tail -n +6 scripts/to_scrape.sql)

Expand Down

0 comments on commit b2ea52f

Please sign in to comment.