diff --git a/.github/workflows/rescrape.yml b/.github/workflows/rescrape.yml
new file mode 100644
index 0000000..12c56c5
--- /dev/null
+++ b/.github/workflows/rescrape.yml
@@ -0,0 +1,131 @@
+# Re-scrape court cases on a daily schedule, upload the refreshed,
+# password-protected cases.db.zip as a release asset, then publish the
+# database as a Datasette instance on Heroku.
+name: Rescrape cases
+
+on:
+  workflow_dispatch:
+  schedule:
+    # Every day at 21:00 UTC.
+    - cron: '0 21 * * *'
+
+jobs:
+  scrape:
+    name: Re-scrape cases
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Set current date as env variable
+        run: echo "BEGIN_RESCRAPE_RUN=$(date +'%s')" >> "$GITHUB_ENV"
+      - uses: actions/checkout@v3
+      - name: upgrade sqlite3
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y sqlite3
+
+      - name: Install requirements
+        run: |
+          pip install -U pyopenssl cryptography
+          pip install -r requirements.txt
+
+      - name: Download latest database zip
+        uses: robinraju/release-downloader@v1.8
+        with:
+          latest: true
+          tag: "nightly"
+          fileName: "*.db.zip"
+
+      - name: Decrypt database
+        # The secret is passed through env rather than interpolated into the
+        # script, so a quote character in the password cannot break the command.
+        env:
+          CASE_DB_PW: ${{ secrets.CASE_DB_PW }}
+        run: |
+          unzip -P "$CASE_DB_PW" cases.db.zip && rm cases.db.zip
+
+      - name: Run scrape
+        run: |
+          # Log the start timestamp exported by the first step.
+          echo "$BEGIN_RESCRAPE_RUN"
+          make -f Makefile.update update_db
+
+      - name: Setup database for upload
+        env:
+          CASE_DB_PW: ${{ secrets.CASE_DB_PW }}
+        run: |
+          zip -P "$CASE_DB_PW" cases.db.zip cases.db
+
+      - name: Upload new release
+        uses: WebFreak001/deploy-nightly@v3.0.0
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        with:
+          upload_url: https://uploads.github.com/repos/datamade/court-scrapers/releases/131985702/assets{?name,label}
+          release_id: 131985702
+          asset_path: ./cases.db.zip
+          asset_name: cases.db.zip
+          asset_content_type: application/zip # required by GitHub API
+          max_releases: 7
+
+      - name: Keepalive
+        uses: gautamkrishnar/keepalive-workflow@v1
+
+  deploy:
+    name: Deploy to Heroku
+    needs: scrape
+    runs-on: ubuntu-latest
+
+    env:
+      HEROKU_ORGANIZATION: ${{ secrets.HEROKU_ORG }}
+
+    steps:
+      - uses: actions/checkout@v3
+
+      - name: Install requirements
+        run: pip install -r requirements.txt
+
+      - name: Download latest database zip
+        uses: robinraju/release-downloader@v1.8
+        with:
+          latest: true
+          tag: "nightly"
+          fileName: "*.db.zip"
+
+      - name: Decrypt database
+        env:
+          CASE_DB_PW: ${{ secrets.CASE_DB_PW }}
+        run: |
+          unzip -P "$CASE_DB_PW" cases.db.zip
+
+      - name: Install heroku-builds plugin
+        run: |
+          heroku plugins:install heroku-builds
+
+      - name: Login to Heroku CLI
+        uses: akhileshns/heroku-deploy@v3.12.14
+        with:
+          heroku_api_key: ${{ secrets.HEROKU_API_KEY }}
+          heroku_app_name: ""
+          heroku_email: ${{ secrets.HEROKU_EMAIL }}
+          justlogin: true
+
+      - name: Install Datasette plugins
+        run: |
+          datasette install datasette-auth-passwords datasette-auth-tokens
+
+      - name: Get hashed Datasette password
+        env:
+          DATASETTE_INSTANCE_PW: ${{ secrets.DATASETTE_INSTANCE_PW }}
+        run: |
+          # Store hash as an environment variable
+          hash=$(echo "$DATASETTE_INSTANCE_PW" \
+            | datasette hash-password --no-confirm); \
+          echo "hash=$hash" >> "$GITHUB_ENV"
+
+      - name: Deploy Datasette instance to Heroku
+        run: |
+          datasette publish heroku cases.db \
+            -n court-scraper \
+            -m metadata.json \
+            --install datasette-auth-passwords \
+            --plugin-secret datasette-auth-passwords root_password_hash '${{ env.hash }}'
diff --git a/Makefile.update b/Makefile.update
index 97e2a87..b378741 100644
--- a/Makefile.update
+++ b/Makefile.update
@@ -27,26 +27,22 @@ rescraped_cases.db : attorney.csv defendant.csv plaintiff.csv court_case.csv eve
 		plaintiff case_number court_case case_number \
 		event case_number court_case case_number
 	sqlite-utils transform $@ defendant \
-		--rename _key order \
+		--drop _key \
 		--column-order case_number \
-		--column-order order \
 		--column-order defendant
 	sqlite-utils transform $@ attorney \
-		--rename _key order \
+		--drop _key \
 		--column-order case_number \
-		--column-order order \
 		--column-order attorney
 	sqlite-utils transform $@ event \
-		--rename _key order \
+		--drop _key \
 		--column-order case_number \
-		--column-order order \
 		--column-order date \
 		--column-order description \
 		--column-order comments
 	sqlite-utils transform $@ plaintiff \
-		--rename _key order \
+		--drop _key \
 		--column-order case_number \
-		--column-order order \
 		--column-order plaintiff
 	sqlite-utils convert $@ court_case filing_date 'r.parsedate(value)'
 	sqlite-utils convert $@ event date 'r.parsedate(value)'
@@ -71,10 +67,10 @@ rescraped_cases.json: rescraped_chancery_cases.jl rescraped_civil_cases.jl
 	cat $^ | jq --slurp '.' > $@
 
 rescraped_civil_cases.jl : to_rescrape.civil.csv
-	scrapy crawl civil -a case_numbers_file=$< -O $@
+	scrapy crawl civil -s CLOSESPIDER_TIMEOUT=7200 -a case_numbers_file=$< -O $@
 
 rescraped_chancery_cases.jl : to_rescrape.chancery.csv
-	scrapy crawl chancery -a case_numbers_file=$< -O $@
+	scrapy crawl chancery -s CLOSESPIDER_TIMEOUT=7200 -a case_numbers_file=$< -O $@
 
 TO_SCRAPE_QUERY=$(shell tail -n +6 scripts/to_scrape.sql)
 