Merge pull request #38 from datamade/feature/37-court-call-scrape
Add court call scrape
Showing 6 changed files with 412 additions and 1 deletion.
@@ -0,0 +1,116 @@
name: Court call scrape

on:
  workflow_dispatch:
  schedule:
    - cron: '15 10 * * *'

jobs:
  scrape:
    name: Scrape court calls
    runs-on: ubuntu-latest

    steps:
      - name: Set current date as env variable
        run: echo "BEGIN_COURTS_RUN=$(date +'%s')" >> $GITHUB_ENV
      - uses: actions/checkout@v3
      - name: upgrade sqlite3
        run: |
          sudo apt-get update
          sudo apt-get install sqlite3
      - name: Install requirements
        run: |
          pip install -U pyopenssl cryptography
          pip install -r requirements.txt
      - name: Download latest database zip
        uses: robinraju/release-downloader@v1.8
        with:
          latest: true
          tag: "nightly"
          fileName: "*.db.zip"

      - name: Decrypt database
        run: |
          unzip -P '${{ secrets.CASE_DB_PW }}' cases.db.zip && rm cases.db.zip
      - name: Scrape court calls
        run: |
          make -f Makefile.courtcalls all
      - name: Setup database for upload
        run: |
          zip -P '${{ secrets.CASE_DB_PW }}' cases.db.zip cases.db
      - name: Upload new release
        uses: WebFreak001/deploy-nightly@v3.0.0
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        with:
          upload_url: https://uploads.github.com/repos/datamade/court-scrapers/releases/131985702/assets{?name,label}
          release_id: 131985702
          asset_path: ./cases.db.zip
          asset_name: cases.db.zip
          asset_content_type: application/zip # required by GitHub API
          max_releases: 7

      - name: Keepalive
        uses: gautamkrishnar/keepalive-workflow@v1

  deploy:
    name: Deploy to Heroku
    needs: scrape
    runs-on: ubuntu-latest

    env:
      HEROKU_ORGANIZATION: ${{ secrets.HEROKU_ORG }}

    steps:
      - uses: actions/checkout@v3

      - name: Install requirements
        run: pip install -r requirements.txt

      - name: Download latest database zip
        uses: robinraju/release-downloader@v1.8
        with:
          latest: true
          tag: "nightly"
          fileName: "*.db.zip"

      - name: Decrypt database
        run: |
          unzip -P '${{ secrets.CASE_DB_PW }}' cases.db.zip
      - name: Install heroku-builds plugin
        run: |
          heroku plugins:install heroku-builds
      - name: Login to Heroku CLI
        uses: akhileshns/heroku-deploy@v3.12.14
        with:
          heroku_api_key: ${{ secrets.HEROKU_API_KEY }}
          heroku_app_name: ""
          heroku_email: ${{ secrets.HEROKU_EMAIL }}
          justlogin: true

      - name: Install Datasette plugins
        run: |
          datasette install datasette-auth-passwords datasette-auth-tokens
      - name: Get hashed Datasette password
        run: |
          # Store hash as an environment variable
          hash=$(echo '${{ secrets.DATASETTE_INSTANCE_PW }}' \
            | datasette hash-password --no-confirm); \
            echo "hash=$hash" >> $GITHUB_ENV
      - name: Deploy Datasette instance to Heroku
        run: |
          datasette publish heroku cases.db \
            -n court-scraper \
            -m metadata.json \
            --setting sql_time_limit_ms 60000 \
            --install datasette-auth-passwords \
            --plugin-secret datasette-auth-passwords root_password_hash '${{ env.hash }}'
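Both jobs decrypt the nightly `cases.db.zip` artifact with `unzip -P`, and the scrape job re-encrypts it with `zip -P` before uploading. For inspecting that artifact locally, a rough Python equivalent of the decrypt step might look like the sketch below; it is an illustration only, assuming the archive uses the traditional ZipCrypto scheme that `zip -P` produces, and the file name and password here are placeholders rather than the repository's secrets.

```python
import zipfile

ARCHIVE = "cases.db.zip"           # downloaded nightly release asset
PASSWORD = b"local-test-password"  # placeholder; the workflow uses secrets.CASE_DB_PW

# zipfile can decrypt legacy ZipCrypto archives, which is what `zip -P` creates
# by default, so no shelling out to unzip is needed for a local look.
with zipfile.ZipFile(ARCHIVE) as archive:
    archive.extract("cases.db", pwd=PASSWORD)
```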
@@ -0,0 +1,14 @@
# Makefile for scraping court calls

.PHONY : all
all: court_calls.csv cases.db
	cat $< | sqlite3 cases.db -init scripts/import_court_calls.sql -bail

court_calls.csv: court_calls.json
	cat $^ | jq '.[] | [.["Case Number"], .["Division"], .["Plaintiff"], .["Defendant"], .["Court Date"], .["Room"], .["District"], .["Sequence #"], .["Time"], .hash] | @csv' -r > $@

court_calls.json: court_calls.jl
	cat $^ | jq --slurp '.' > $@

court_calls.jl : cases.db
	scrapy crawl courtcalls -s CLOSESPIDER_TIMEOUT=14400 -O $@
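The `court_calls.csv` rule uses jq to flatten each scraped record into a fixed column order. A rough Python sketch of the same transformation is shown below, for illustration only; it assumes `court_calls.json` holds the slurped array of records with the field names the spider yields.

```python
import csv
import json

# Column order mirrors the jq expression in the Makefile.
FIELDS = [
    "Case Number", "Division", "Plaintiff", "Defendant", "Court Date",
    "Room", "District", "Sequence #", "Time", "hash",
]

with open("court_calls.json") as infile, \
        open("court_calls.csv", "w", newline="") as outfile:
    writer = csv.writer(outfile)
    for record in json.load(infile):
        # Missing values become empty cells, roughly matching jq's @csv output.
        writer.writerow([record.get(field, "") for field in FIELDS])
```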
@@ -0,0 +1,221 @@
from datetime import datetime, timedelta

from scrapy import Spider, Request
from scrapy.http import FormRequest
from scrapy.exceptions import CloseSpider
from scrapy.spidermiddlewares.httperror import HttpError

from lxml import html

from scripts.hash import dict_hash


class CourtCallSpider(Spider):
    name = "courtcalls"
    url = "https://casesearch.cookcountyclerkofcourt.org/CourtCallSearch.aspx"

    def __init__(self, **kwargs):
        self.failures = set()
        super().__init__(**kwargs)

    def next_business_days(self, n):
        """Yields today's date plus the next n business days."""

        current_date = datetime.today()
        count = 0
        while count <= n:
            yield f"{current_date.month}/{current_date.day}/{current_date.year}"

            next_date = current_date + timedelta(days=1)
            while next_date.weekday() > 4:
                # Skip weekends
                next_date += timedelta(days=1)

            current_date = next_date
            count += 1

    def start_requests(self):
        for date in self.next_business_days(5):
            yield Request(
                CourtCallSpider.url,
                meta={
                    "zyte_api_automap": {
                        "httpResponseHeaders": True,
                        "browserHtml": True,
                        "actions": [
                            # Browser actions executed by the Zyte API: wait for
                            # the search-type radio button and click it, choose
                            # the "CV" division, type the target date, click
                            # Search, and wait for the results panel to render.
                            {
                                "action": "waitForSelector",
                                "selector": {
                                    "type": "css",
                                    "value": "#MainContent_rblSearchType_2",
                                },
                                "timeout": 5,
                                "onError": "return",
                            },
                            {
                                "action": "click",
                                "selector": {
                                    "type": "css",
                                    "value": "#MainContent_rblSearchType_2",
                                },
                                "onError": "return",
                            },
                            {
                                "action": "waitForSelector",
                                "selector": {
                                    "type": "css",
                                    "value": "#MainContent_dtTxt",
                                },
                                "timeout": 5,
                                "onError": "return",
                            },
                            {
                                "action": "select",
                                "selector": {
                                    "type": "css",
                                    "value": "#MainContent_ddlDivisionCode",
                                },
                                "values": ["CV"],
                                "onError": "return",
                            },
                            {
                                "action": "type",
                                "selector": {
                                    "type": "css",
                                    "value": "#MainContent_dtTxt",
                                },
                                "text": date,
                                "onError": "return",
                            },
                            {
                                "action": "click",
                                "selector": {
                                    "type": "css",
                                    "value": "#MainContent_btnSearch",
                                },
                                "onError": "return",
                            },
                            {
                                "action": "waitForSelector",
                                "selector": {
                                    "type": "css",
                                    "value": "#MainContent_pnlResults",
                                },
                                "timeout": 5,
                                "onError": "return",
                            },
                        ],
                    },
                    "date": date,
                    "result_page_num": 1,
                },
                errback=self.handle_error,
                callback=self.parse_results,
            )

    def has_page_num(self, n, response):
        """Check if there's an nth page of court call results."""

        tree = html.fromstring(response.text)
        page_table = tree.xpath("//table")[1]
        next_page_link = page_table.xpath(f".//a[contains(@href,'Page${n}')]")
        return bool(next_page_link)

    def get_court_calls(self, response):
        """Returns the court calls found on a result page."""

        tree = html.fromstring(response.text)
        results_table = tree.xpath("//table[@id='MainContent_grdRecords']")[0]

        no_results = results_table.xpath(
            ".//*[text()[contains(.,'No cases found matching your selected "
            "criteria.')]]"
        )
        if no_results:
            return

        rows = results_table.xpath(".//tr")
        headers = rows[0].xpath(".//a/text()")
        for row in rows[1:-1]:
            cells = row.xpath(".//td/text()")
            if cells:
                yield dict(zip(headers, cells))

    def extract_form(self, response, form_xpath):
        """
        ASP.NET pages are essentially forms that store the data needed to send
        POST requests in hidden form inputs on the page.
        From https://www.trickster.dev/post/scraping-legacy-asp-net-site-with-scrapy-a-real-example/
        """

        form_data = dict()

        for hidden_input in response.xpath(form_xpath).xpath(
            ".//input[@type='hidden']"
        ):
            name = hidden_input.attrib.get("name")
            if name is None:
                continue
            value = hidden_input.attrib.get("value")
            if value is None:
                value = ""

            form_data[name] = value

        return form_data

    def get_page_n_form_data(self, n, response):
        """
        Returns the form fields needed to send a POST request
        for the nth page of court call results.
        """

        form_data = self.extract_form(response, "//form[@id='ctl01']")
        form_data["__EVENTTARGET"] = "ctl00$MainContent$grdRecords"
        form_data["__EVENTARGUMENT"] = f"Page${n}"
        return form_data

    def parse_results(self, response):
        # Materialize the generator so the empty-results check below works
        results = list(self.get_court_calls(response))
        if not results:
            return

        for court_call in results:
            court_call["hash"] = dict_hash(court_call)
            yield court_call

        # Request the next page of results
        next_page_num = response.meta["result_page_num"] + 1
        next_page_exists = self.has_page_num(next_page_num, response)
        if not next_page_exists:
            return

        next_page_form_data = self.get_page_n_form_data(next_page_num, response)
        yield FormRequest.from_response(
            response,
            meta={"result_page_num": next_page_num},
            formxpath="//form[@id='ctl01']",
            formdata=next_page_form_data,
            callback=self.parse_results,
            dont_click=True,
        )

    def _failing_responses(self, response):
        self.failures.add(
            f"{response.meta['date']} page {response.meta['result_page_num']}"
        )

        self.logger.info(f'failures: {", ".join(sorted(self.failures))}')

        if len(self.failures) > 20:
            raise CloseSpider("run of failures")

    def handle_error(self, failure):
        if failure.check(HttpError):
            response = failure.value.response
            if response.status in (404, 500):
                self._failing_responses(response)
        else:
            self.logger.error(repr(failure))
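`parse_results` stamps every record with `dict_hash` from the `scripts.hash` module, which is not part of this diff. A minimal sketch of what such a helper commonly looks like follows; it is an assumption for illustration, not the repository's actual implementation.

```python
import hashlib
import json


def dict_hash(record):
    """Return a stable digest of a dict, independent of key order."""
    # Serializing with sorted keys makes the same record always hash the same.
    canonical = json.dumps(record, sort_keys=True, default=str)
    return hashlib.md5(canonical.encode("utf-8")).hexdigest()
```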