diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml deleted file mode 100644 index 7711495..0000000 --- a/.github/workflows/build.yml +++ /dev/null @@ -1,99 +0,0 @@ -name: Full case scrape - -on: - workflow_dispatch: -# schedule: -# - cron: '15 4 * * *' - -jobs: - # build: - # runs-on: ubuntu-latest - # - # steps: - # - name: Set current date as env variable - # run: echo "BEGIN_COURTS_RUN=$(date +'%s')" >> $GITHUB_ENV - # - uses: actions/checkout@v2 - # - name: upgrade sqlite3 - # run: | - # sudo add-apt-repository ppa:linuxgndu/sqlitebrowser-testing - # sudo apt-get update - # sudo apt-get install sqlite3 - # - # - name: Install requirements - # run: pip install -r requirements.txt - # - # - name: Build database - # run: | - # echo $BEGIN_COURTS_RUN - # make cases.db - # - # - name: Setup asset - # run: | - # zip -p ${{ secrets.CASE_DB_PW }} cases.db.zip cases.db - # - # - name: Release - # uses: WebFreak001/deploy-nightly@v3.0.0 - # env: - # GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # with: - # upload_url: https://uploads.github.com/repos/datamade/court-scrapers/releases/131985702/assets{?name,label} - # release_id: 131985702 - # asset_path: ./cases.db.zip - # asset_name: cases.db.zip - # asset_content_type: application/zip # required by GitHub API - # max_releases: 7 - # - name: keepalive - # uses: gautamkrishnar/keepalive-workflow@v1 - - deploy: - runs-on: ubuntu-latest - - env: - HEROKU_ORGANIZATION: ${{ secrets.HEROKU_ORG }} - - steps: - - uses: actions/checkout@v3 - - - name: Install requirements - run: pip install -r requirements.txt - - - name: Download latest database zip - uses: robinraju/release-downloader@v1.8 - with: - tag: "nightly" - fileName: "*.db.zip" - - - name: Decrypt database - run: | - unzip -P "${{ secrets.CASE_DB_PW }}" cases.db.zip - - - name: Install heroku-builds plugin - run: | - heroku plugins:install heroku-builds - - - name: Login to Heroku CLI - uses: akhileshns/heroku-deploy@v3.12.14 - with: - heroku_api_key: ${{ secrets.HEROKU_API_KEY }} - heroku_app_name: "" - heroku_email: ${{ secrets.HEROKU_EMAIL }} - justlogin: true - - - name: Install Datasette plugins - run: | - datasette install datasette-auth-passwords datasette-auth-tokens - - - name: Get hashed Datasette password - run: | - # Store hash as an environment variable - hash=$(echo "${{ secrets.DATASETTE_INSTANCE_PW }}" \ - | datasette hash-password --no-confirm); \ - echo "hash=$hash" >> $GITHUB_ENV - - - name: Deploy Datasette instance to Heroku - run: | - datasette publish heroku cases.db \ - -n court-scraper \ - -m metadata.json \ - --install datasette-auth-passwords \ - --plugin-secret datasette-auth-passwords root_password_hash '${{ env.hash }}' diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml new file mode 100644 index 0000000..5f9775a --- /dev/null +++ b/.github/workflows/nightly.yml @@ -0,0 +1,116 @@ +name: Nightly case scrape + +on: + workflow_dispatch: + # schedule: + # - cron: '15 4 * * *' + +jobs: + scrape: + name: Scrape new cases + runs-on: ubuntu-latest + + steps: + - name: Set current date as env variable + run: echo "BEGIN_COURTS_RUN=$(date +'%s')" >> $GITHUB_ENV + - uses: actions/checkout@v3 + - name: upgrade sqlite3 + run: | + sudo apt-get update + sudo apt-get install sqlite3 + + - name: Install requirements + run: | + pip install -U pyopenssl cryptography + pip install -r requirements.txt + + - name: Download latest database zip + uses: robinraju/release-downloader@v1.8 + with: + latest: true + tag: "nightly" + fileName: "*.db.zip" + + - name: Decrypt database + run: | + unzip -P '${{ secrets.CASE_DB_PW }}' cases.db.zip && rm cases.db.zip + + - name: Run scrape + run: | + echo $BEGIN_COURTS_RUN + make get_new_records + + - name: Setup database for upload + run: | + zip -P '${{ secrets.CASE_DB_PW }}' cases.db.zip cases.db + + - name: Upload new release + uses: WebFreak001/deploy-nightly@v3.0.0 + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + with: + upload_url: https://uploads.github.com/repos/datamade/court-scrapers/releases/131985702/assets{?name,label} + release_id: 131985702 + asset_path: ./cases.db.zip + asset_name: cases.db.zip + asset_content_type: application/zip # required by GitHub API + max_releases: 7 + + - name: Keepalive + uses: gautamkrishnar/keepalive-workflow@v1 + + deploy: + name: Deploy to Heroku + needs: scrape + runs-on: ubuntu-latest + + env: + HEROKU_ORGANIZATION: ${{ secrets.HEROKU_ORG }} + + steps: + - uses: actions/checkout@v3 + + - name: Install requirements + run: pip install -r requirements.txt + + - name: Download latest database zip + uses: robinraju/release-downloader@v1.8 + with: + latest: true + tag: "nightly" + fileName: "*.db.zip" + + - name: Decrypt database + run: | + unzip -P '${{ secrets.CASE_DB_PW }}' cases.db.zip + + - name: Install heroku-builds plugin + run: | + heroku plugins:install heroku-builds + + - name: Login to Heroku CLI + uses: akhileshns/heroku-deploy@v3.12.14 + with: + heroku_api_key: ${{ secrets.HEROKU_API_KEY }} + heroku_app_name: "" + heroku_email: ${{ secrets.HEROKU_EMAIL }} + justlogin: true + + - name: Install Datasette plugins + run: | + datasette install datasette-auth-passwords datasette-auth-tokens + + - name: Get hashed Datasette password + run: | + # Store hash as an environment variable + hash=$(echo '${{ secrets.DATASETTE_INSTANCE_PW }}' \ + | datasette hash-password --no-confirm); \ + echo "hash=$hash" >> $GITHUB_ENV + + - name: Deploy Datasette instance to Heroku + run: | + datasette publish heroku cases.db \ + -n court-scraper \ + -m metadata.json \ + --install datasette-auth-passwords \ + --plugin-secret datasette-auth-passwords root_password_hash '${{ env.hash }}' diff --git a/Makefile b/Makefile index a007e90..0f1552d 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,3 @@ -.INTERMEDIATE: *.csv *.jl *.json - .PHONY: all all: upload @@ -43,9 +41,7 @@ new_plaintiffs.csv: cases.json new_defendants.csv: cases.json cat $^ | jq '.[] | . as $$p | .defendants[] | [., $$p.case_number] | @csv' -r > $@ -cases.json : civil-2.jl civil-3.jl civil-4.jl civil-5.jl \ - civil-6.jl civil-101.jl civil-104.jl civil-11.jl \ - civil-13.jl civil-14.jl civil-15.jl civil-17.jl chancery.jl +cases.json : chancery.jl cat $^ | sort | python scripts/remove_dupe_cases.py | jq --slurp '.' > $@ # Query parameterized by civil case subdivision diff --git a/courtscraper/spiders/__init__.py b/courtscraper/spiders/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/courtscraper/spiders/base.py b/courtscraper/spiders/base.py index c7c75a5..9bfc5f7 100644 --- a/courtscraper/spiders/base.py +++ b/courtscraper/spiders/base.py @@ -13,7 +13,7 @@ class UnsuccessfulAutomation(Exception): class CourtSpiderBase(ABC, Spider): def __init__( - self, division="2", year=2022, start=0, case_numbers_file=None, **kwargs + self, division="2", year=2023, start=0, case_numbers_file=None, **kwargs ): self.year = year self.misses = set() diff --git a/courtscraper/spiders/chancery.py b/courtscraper/spiders/chancery.py index 3cbe0fe..685f980 100644 --- a/courtscraper/spiders/chancery.py +++ b/courtscraper/spiders/chancery.py @@ -7,7 +7,7 @@ class ChancerySpider(CourtSpiderBase): name = "chancery" url = "https://casesearch.cookcountyclerkofcourt.org/CivilCaseSearchAPI.aspx" - def __init__(self, year=2022, **kwargs): + def __init__(self, year=2023, **kwargs): self.case_type = CASE_FORMAT super().__init__(**kwargs) diff --git a/courtscraper/spiders/civil.py b/courtscraper/spiders/civil.py index 2c79149..a248d76 100644 --- a/courtscraper/spiders/civil.py +++ b/courtscraper/spiders/civil.py @@ -7,7 +7,7 @@ class CivilSpider(CourtSpiderBase): name = "civil" url = "https://casesearch.cookcountyclerkofcourt.org/CivilCaseSearchAPI.aspx" - def __init__(self, division="2", year=2022, **kwargs): + def __init__(self, division="2", year=2023, **kwargs): self.case_type = DIVISIONS[division] super().__init__(**kwargs) diff --git a/requirements.txt b/requirements.txt index 74016b0..b07ad4a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,3 +4,4 @@ datasette csvs-to-sqlite sqlite-utils csvkit +sqlean.py \ No newline at end of file diff --git a/scripts/nightly_civil_start.sql b/scripts/nightly_civil_start.sql index 3f905a0..bfaab96 100644 --- a/scripts/nightly_civil_start.sql +++ b/scripts/nightly_civil_start.sql @@ -14,7 +14,7 @@ WITH serials AS ( court_case WHERE court = 'civil' - AND subdivision = ':subdivision' + AND subdivision = :subdivision /* noqa */ AND substr(case_number, 1, 4) = strftime('%Y', current_timestamp) )