start of new scraper cookie cutter #267

Draft · wants to merge 1 commit into main
docker/templates/new-scraper/cookiecutter.json (3 additions, 0 deletions)
@@ -0,0 +1,3 @@
{
  "module_name": "my_new_scraper"
}
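
The template so far only prompts for module_name; note that the docker-compose.yml template later in this diff also references app_name, local_settings, run_command, auto_migrate, migrate_command, postgis, pg_version, and pg_db, which would need entries here before that file renders. As a minimal sketch of rendering the template programmatically (the template path and context value are assumptions, not part of this PR):

    # A sketch using cookiecutter's Python API; path and context are assumptions.
    from cookiecutter.main import cookiecutter

    cookiecutter(
        'docker/templates/new-scraper',  # assumed path to this template
        no_input=True,                   # take defaults instead of prompting
        extra_context={'module_name': 'parcel_scraper'},  # hypothetical name
    )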
@@ -0,0 +1,19 @@
name: Publish Docker image
on:
  workflow_dispatch:
jobs:
  push_to_registry:
    name: Push Docker image to GitHub Packages
    runs-on: ubuntu-latest
    steps:
      - name: Login to GitHub Container Registry
        uses: docker/login-action@v1
        with:
          registry: ghcr.io
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}
      - name: Push to GitHub Packages
        uses: docker/build-push-action@v2
        with:
          push: true
          tags: ghcr.io/${{ github.repository }}:latest
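
Because the only trigger is workflow_dispatch, this workflow never runs automatically; it has to be started from the Actions tab or through the REST API. A sketch of the API trigger (OWNER/REPO, the workflow file name, and the token are assumptions; the template's file name isn't visible in this diff):

    # Hypothetical: fire the workflow_dispatch event via the GitHub REST API.
    import requests

    response = requests.post(
        'https://api.github.com/repos/OWNER/REPO/actions/workflows/publish.yml/dispatches',
        headers={
            'Accept': 'application/vnd.github+json',
            'Authorization': 'Bearer YOUR_TOKEN',  # needs workflow permissions
        },
        json={'ref': 'main'},  # branch to run the workflow on
    )
    response.raise_for_status()  # GitHub returns 204 No Content on success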
@@ -0,0 +1,37 @@
# Extend the base Python image
# See https://hub.docker.com/_/python for version options
# N.b., there are many options for Python images. We used the plain
# version number in the pilot. YMMV. See this post for a discussion of
# some options and their pros and cons:
# https://pythonspeed.com/articles/base-image-python-docker-images/
FROM python:3.7

# Give ourselves some credit
LABEL maintainer="DataMade <info@datamade.us>"

# Install any additional OS-level packages you need via apt-get. RUN statements
# add additional layers to your image, increasing its final size. Keep your
# image small by combining related commands into one RUN statement, e.g.,
#
# RUN apt-get update && \
# apt-get install -y python-pip
#
# Read more on Dockerfile best practices at the source:
# https://docs.docker.com/develop/develop-images/dockerfile_best-practices

# Inside the container, create an app directory and switch into it
RUN mkdir /app
WORKDIR /app

# Copy the requirements file into the app directory, and install them. Copy
# only the requirements file, so Docker can cache this build step. Otherwise,
# the requirements must be reinstalled every time you build the image after
# the app code changes. See this post for further discussion of strategies
# for building lean and efficient containers:
# https://blog.realkinetic.com/building-minimal-docker-containers-for-python-applications-37d0272c52f3
COPY ./requirements.txt /app/requirements.txt
RUN pip install --no-cache-dir -r requirements.txt

# Copy the contents of the current host directory (i.e., our app code) into
# the container.
COPY . /app
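
The build installs from requirements.txt, which isn't part of this diff. Judging from the scraper module and the pytest override below, a minimal version might look like this (the unpinned entries are placeholders):

    # requirements.txt (hypothetical contents, inferred from the imports below)
    scrapelib
    lxml
    pytest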
@@ -0,0 +1,2 @@
scrape:
	python -m {{cookiecutter.module_name}} scraped
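
The recipe runs the scraper package as a script and passes scraped as its positional output_dir argument (see the module's argparse setup at the bottom of this diff). With module_name left at its default, the rendered recipe amounts to:

    python -m my_new_scraper scraped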
@@ -0,0 +1,67 @@
version: '2.4'

services:
  app:
    image: {{cookiecutter.app_name}}
    container_name: {{cookiecutter.app_name}}
    build: .
    # Allow container to be attached to, e.g., to access the pdb shell
    stdin_open: true
    tty: true
    ports:
      # Map ports on your computer to ports on your container. This allows
      # you, e.g., to visit your containerized application in a browser on
      # your computer.
      - 8000:8000
    depends_on:
      postgres:
        condition: service_healthy
    volumes:
      # Mount the development directory as a volume into the container, so
      # Docker automatically recognizes your changes.
      - .:/app{% if cookiecutter.local_settings != 'None' %}
      - ${PWD}/{{cookiecutter.local_settings}}:/app/{{cookiecutter.local_settings|replace(".example", "")}}{% endif %}
    command: {{cookiecutter.run_command}}{% if cookiecutter.auto_migrate == 'True' %}

  migration:
    container_name: {{cookiecutter.app_name}}-migration
    image: {{cookiecutter.app_name}}:latest
    depends_on:
      # Declaring this dependency ensures that your application image is
      # built before migrations are run, and that your application and
      # migrations can be run from the same image, rather than creating
      # purpose-specific copies.
      - app
    volumes:
      # These should generally be the same as your application volumes.
      - .:/app{% if cookiecutter.local_settings != 'None' %}
      - ${PWD}/{{cookiecutter.local_settings}}:/app/{{cookiecutter.local_settings|replace(".example", "")}}{% endif %}
    command: {{cookiecutter.migrate_command}}
{% endif %}
  postgres:
    container_name: {{cookiecutter.app_name}}-postgres
    image: {% if cookiecutter.postgis == 'True' %}mdillon/postgis{% else %}postgres{% endif %}:{{cookiecutter.pg_version}}
    healthcheck:
      test: ["CMD-SHELL", "pg_isready -U postgres"]
      interval: 10s
      timeout: 5s
      retries: 5
    environment:
      # The default Postgres image exposes a number of environment variables
      # that allow you to configure the container's behavior, without writing
      # any additional code. Specify the name of your database, and any other
      # variables, here. https://hub.docker.com/_/postgres/#environment-variables
      - POSTGRES_DB={{cookiecutter.pg_db}}
      - POSTGRES_PASSWORD=postgres
    volumes:
      # By default, Postgres instantiates an anonymous volume. Use a named
      # one, so your data persists beyond the life of the container. See this
      # post for a discussion of the pitfalls of Postgres and anonymous
      # volumes: https://linuxhint.com/run_postgresql_docker_compose/
      - {{cookiecutter.app_name}}-db-data:/var/lib/postgresql/data
    ports:
      - 32001:5432

volumes:
  # Declare your named volume for Postgres.
  {{cookiecutter.app_name}}-db-data:
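
With the mapping above, the containerized Postgres is reachable from the host on port 32001, using the credentials set through the environment variables. A sketch of connecting from host-side Python (psycopg2 is an assumption, and the database name depends on what pg_db renders to):

    # Hypothetical host-side connection to the compose-managed Postgres.
    import psycopg2

    conn = psycopg2.connect(
        host='localhost',
        port=32001,            # host side of the 32001:5432 mapping
        dbname='my_database',  # whatever {{cookiecutter.pg_db}} renders to
        user='postgres',       # default user of the postgres image
        password='postgres',   # set via POSTGRES_PASSWORD above
    )
    with conn, conn.cursor() as cursor:
        cursor.execute('SELECT version()')
        print(cursor.fetchone()[0])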
@@ -0,0 +1,7 @@
version: '2.4'

services:
  app:
    # Don't restart the service when the command exits
    restart: "no"
    command: pytest -sxv
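
This override swaps the app service's command for pytest, so tests run inside the same image as the scraper. A sketch of the kind of test it would collect once scrape() has a concrete implementation, stubbing _spider so no network access is needed (the module path and markup are assumptions):

    # test_scraper.py (hypothetical)
    import lxml.html

    from my_new_scraper import Scraper  # assumes the default module_name

    class StubScraper(Scraper):
        def _spider(self):
            # Serve a canned page instead of hitting the network.
            yield lxml.html.fromstring('<html><body><p>fixture</p></body></html>')

    def test_scrape_yields_dicts():
        results = list(StubScraper().scrape())
        assert all(isinstance(result, dict) for result in results)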
@@ -0,0 +1,39 @@
import scrapelib
import lxml.html


class Scraper(scrapelib.Scraper):
    '''Rename me to something more descriptive'''

    def _spider(self):
        '''yield lxml.html pages'''
        ...

    def scrape(self):
        '''yield dictionaries of data'''
        for page in self._spider():
            ...


if __name__ == '__main__':
    import argparse
    import pathlib
    import json

    parser = argparse.ArgumentParser(description='Scrape your site')
    parser.add_argument('output_dir', type=pathlib.Path)

    args = parser.parse_args()

    # Create the output directory (e.g., `scraped`, per the Makefile) if needed
    args.output_dir.mkdir(parents=True, exist_ok=True)

    scraper = Scraper()

    # Iterate over the scrape() generator; the scraper object isn't iterable
    for result in scraper.scrape():
        result_id = result['id']
        file_name = f'{result_id}.json'
        file_path = args.output_dir / file_name
        # Open for writing, and pass json.dump the object first, file second
        with file_path.open('w') as f:
            json.dump(result, f)
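
The class above is deliberately skeletal: _spider fetches and yields parsed pages, and scrape turns those pages into dictionaries. A hypothetical concrete subclass, to show how the two hooks fit together (the URL and XPath are placeholders; the get method comes from scrapelib.Scraper, which layers caching, retries, and throttling on top of requests):

    # Hypothetical concrete implementation; URL and markup are placeholders.
    import lxml.html

    class ListingScraper(Scraper):
        def _spider(self):
            # scrapelib.Scraper.get() is a throttled, retrying requests.get()
            response = self.get('https://example.com/listings')
            yield lxml.html.fromstring(response.text)

        def scrape(self):
            for page in self._spider():
                for row in page.xpath('//li[@class="listing"]'):
                    yield {
                        'id': row.get('id'),
                        'name': row.text_content().strip(),
                    }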