start of new scraper cookie cutter #267

Draft · wants to merge 1 commit into main
docker/templates/new-scraper/cookiecutter.json (3 additions, 0 deletions)
@@ -0,0 +1,3 @@
{
  "module_name": "my_new_scraper"
}
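
The template so far only prompts for module_name; note that the docker-compose.yml template later in this diff also references app_name, local_settings, run_command, auto_migrate, migrate_command, postgis, pg_version, and pg_db, which would need entries here before that file renders. As a minimal sketch of rendering the template programmatically (the template path and context value are assumptions, not part of this PR):

    # A sketch using cookiecutter's Python API; path and context are assumptions.
    from cookiecutter.main import cookiecutter

    cookiecutter(
        'docker/templates/new-scraper',  # assumed path to this template
        no_input=True,                   # take defaults instead of prompting
        extra_context={'module_name': 'parcel_scraper'},  # hypothetical name
    )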
@@ -0,0 +1,19 @@
name: Publish Docker image
on:
  workflow_dispatch:
jobs:
  push_to_registry:
    name: Push Docker image to GitHub Packages
    runs-on: ubuntu-latest
    steps:
      - name: Login to GitHub Container Registry
        uses: docker/login-action@v1
        with:
          registry: ghcr.io
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}
      - name: Push to GitHub Packages
        uses: docker/build-push-action@v2
        with:
          push: true
          tags: ghcr.io/${{ github.repository }}:latest
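
Because the only trigger is workflow_dispatch, this workflow never runs automatically; it has to be started from the Actions tab or through the REST API. A sketch of the API trigger (OWNER/REPO, the workflow file name, and the token are assumptions; the template's file name isn't visible in this diff):

    # Hypothetical: fire the workflow_dispatch event via the GitHub REST API.
    import requests

    response = requests.post(
        'https://api.github.com/repos/OWNER/REPO/actions/workflows/publish.yml/dispatches',
        headers={
            'Accept': 'application/vnd.github+json',
            'Authorization': 'Bearer YOUR_TOKEN',  # needs workflow permissions
        },
        json={'ref': 'main'},  # branch to run the workflow on
    )
    response.raise_for_status()  # GitHub returns 204 No Content on success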
@@ -0,0 +1,37 @@
# Extend the base Python image
# See https://hub.docker.com/_/python for version options
# N.b., there are many options for Python images. We used the plain
# version number in the pilot. YMMV. See this post for a discussion of
# some options and their pros and cons:
# https://pythonspeed.com/articles/base-image-python-docker-images/
FROM python:3.7

# Give ourselves some credit
LABEL maintainer="DataMade <info@datamade.us>"

# Install any additional OS-level packages you need via apt-get. RUN statements
# add additional layers to your image, increasing its final size. Keep your
# image small by combining related commands into one RUN statement, e.g.,
#
# RUN apt-get update && \
# apt-get install -y python-pip
#
# Read more on Dockerfile best practices at the source:
# https://docs.docker.com/develop/develop-images/dockerfile_best-practices

# Inside the container, create an app directory and switch into it
RUN mkdir /app
WORKDIR /app

# Copy the requirements file into the app directory, and install them. Copy
# only the requirements file, so Docker can cache this build step. Otherwise,
# the requirements must be reinstalled every time you build the image after
# the app code changes. See this post for further discussion of strategies
# for building lean and efficient containers:
# https://blog.realkinetic.com/building-minimal-docker-containers-for-python-applications-37d0272c52f3
COPY ./requirements.txt /app/requirements.txt
RUN pip install --no-cache-dir -r requirements.txt

# Copy the contents of the current host directory (i.e., our app code) into
# the container.
COPY . /app
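
The build installs from requirements.txt, which isn't part of this diff. Judging from the scraper module and the pytest override below, a minimal version might look like this (the unpinned entries are placeholders):

    # requirements.txt (hypothetical contents, inferred from the imports below)
    scrapelib
    lxml
    pytest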
@@ -0,0 +1,2 @@
scrape:
	python -m {{cookiecutter.module_name}} scraped
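
The recipe runs the scraper package as a script and passes scraped as its positional output_dir argument (see the module's argparse setup at the bottom of this diff). With module_name left at its default, the rendered recipe amounts to:

    python -m my_new_scraper scraped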
@@ -0,0 +1,67 @@
version: '2.4'

services:
  app:
    image: {{cookiecutter.app_name}}
    container_name: {{cookiecutter.app_name}}
    build: .
    # Allow container to be attached to, e.g., to access the pdb shell
    stdin_open: true
    tty: true
    ports:
      # Map ports on your computer to ports on your container. This allows
      # you, e.g., to visit your containerized application in a browser on
      # your computer.
      - 8000:8000
    depends_on:
      postgres:
        condition: service_healthy
    volumes:
      # Mount the development directory as a volume into the container, so
      # Docker automatically recognizes your changes.
      - .:/app{% if cookiecutter.local_settings != 'None' %}
      - ${PWD}/{{cookiecutter.local_settings}}:/app/{{cookiecutter.local_settings|replace(".example", "")}}{% endif %}
    command: {{cookiecutter.run_command}}{% if cookiecutter.auto_migrate == 'True' %}

  migration:
    container_name: {{cookiecutter.app_name}}-migration
    image: {{cookiecutter.app_name}}:latest
    depends_on:
      # Declaring this dependency ensures that your application image is
      # built before migrations are run, and that your application and
      # migrations can be run from the same image, rather than creating
      # purpose-specific copies.
      - app
    volumes:
      # These should generally be the same as your application volumes.
      - .:/app{% if cookiecutter.local_settings != 'None' %}
      - ${PWD}/{{cookiecutter.local_settings}}:/app/{{cookiecutter.local_settings|replace(".example", "")}}{% endif %}
    command: {{cookiecutter.migrate_command}}
{% endif %}
  postgres:
    container_name: {{cookiecutter.app_name}}-postgres
    image: {% if cookiecutter.postgis == 'True' %}mdillon/postgis{% else %}postgres{% endif %}:{{cookiecutter.pg_version}}
    healthcheck:
      test: ["CMD-SHELL", "pg_isready -U postgres"]
      interval: 10s
      timeout: 5s
      retries: 5
    environment:
      # The default Postgres image exposes a number of environment variables
      # that allow you to configure the container's behavior, without writing
      # any additional code. Specify the name of your database, and any other
      # variables, here. https://hub.docker.com/_/postgres/#environment-variables
      - POSTGRES_DB={{cookiecutter.pg_db}}
      - POSTGRES_PASSWORD=postgres
    volumes:
      # By default, Postgres instantiates an anonymous volume. Use a named
      # one, so your data persists beyond the life of the container. See this
      # post for a discussion of the pitfalls of Postgres and anonymous
      # volumes: https://linuxhint.com/run_postgresql_docker_compose/
      - {{cookiecutter.app_name}}-db-data:/var/lib/postgresql/data
    ports:
      - 32001:5432

volumes:
  # Declare your named volume for Postgres.
  {{cookiecutter.app_name}}-db-data:
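
With the mapping above, the containerized Postgres is reachable from the host on port 32001, using the credentials set through the environment variables. A sketch of connecting from host-side Python (psycopg2 is an assumption, and the database name depends on what pg_db renders to):

    # Hypothetical host-side connection to the compose-managed Postgres.
    import psycopg2

    conn = psycopg2.connect(
        host='localhost',
        port=32001,            # host side of the 32001:5432 mapping
        dbname='my_database',  # whatever {{cookiecutter.pg_db}} renders to
        user='postgres',       # default user of the postgres image
        password='postgres',   # set via POSTGRES_PASSWORD above
    )
    with conn, conn.cursor() as cursor:
        cursor.execute('SELECT version()')
        print(cursor.fetchone()[0])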
@@ -0,0 +1,7 @@
version: '2.4'

services:
  app:
    # Don't restart the service when the command exits
    restart: "no"
    command: pytest -sxv
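
This override swaps the app service's command for pytest, so tests run inside the same image as the scraper. A sketch of the kind of test it would collect once scrape() has a concrete implementation, stubbing _spider so no network access is needed (the module path and markup are assumptions):

    # test_scraper.py (hypothetical)
    import lxml.html

    from my_new_scraper import Scraper  # assumes the default module_name

    class StubScraper(Scraper):
        def _spider(self):
            # Serve a canned page instead of hitting the network.
            yield lxml.html.fromstring('<html><body><p>fixture</p></body></html>')

    def test_scrape_yields_dicts():
        results = list(StubScraper().scrape())
        assert all(isinstance(result, dict) for result in results)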
@@ -0,0 +1,39 @@
import scrapelib
import lxml.html


class Scraper(scrapelib.Scraper):
    '''Rename me to something more descriptive'''

    def _spider(self):
        '''yield lxml.html pages'''
        ...

    def scrape(self):
        '''yield dictionaries of data'''
        for page in self._spider():
            ...


if __name__ == '__main__':
    import argparse
    import pathlib
    import json

    parser = argparse.ArgumentParser(description='Scrape your site')
    parser.add_argument('output_dir', type=pathlib.Path)

    args = parser.parse_args()

    # Create the output directory (e.g., `scraped`, per the Makefile) if needed
    args.output_dir.mkdir(parents=True, exist_ok=True)

    scraper = Scraper()

    # Iterate over the scrape() generator; the scraper object isn't iterable
    for result in scraper.scrape():
        result_id = result['id']
        file_name = f'{result_id}.json'
        file_path = args.output_dir / file_name
        # Open for writing, and pass json.dump the object first, file second
        with file_path.open('w') as f:
            json.dump(result, f)
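
The class above is deliberately skeletal: _spider fetches and yields parsed pages, and scrape turns those pages into dictionaries. A hypothetical concrete subclass, to show how the two hooks fit together (the URL and XPath are placeholders; the get method comes from scrapelib.Scraper, which layers caching, retries, and throttling on top of requests):

    # Hypothetical concrete implementation; URL and markup are placeholders.
    import lxml.html

    class ListingScraper(Scraper):
        def _spider(self):
            # scrapelib.Scraper.get() is a throttled, retrying requests.get()
            response = self.get('https://example.com/listings')
            yield lxml.html.fromstring(response.text)

        def scrape(self):
            for page in self._spider():
                for row in page.xpath('//li[@class="listing"]'):
                    yield {
                        'id': row.get('id'),
                        'name': row.text_content().strip(),
                    }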