Merge branch 'master' into patch-1

spotify · Oct 17, 2023 · ef960ad · ef960ad
2 parents b962a4c + 319ce20
commit ef960ad
Show file tree

Hide file tree

Showing 80 changed files with 2,033 additions and 250 deletions.
diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
@@ -4,7 +4,7 @@
 
 # These owners will be the default owners for everything in
 # the repo. Unless a later match takes precedence,
-* @dlstadther @Tarrasch @spotify/dataex
+* @dlstadther @spotify/dataex
 
 # Specific files, directories, paths, or file types can be
 # assigned more specifically.

diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml
@@ -0,0 +1,65 @@
+name: "CodeQL"
+
+on:
+  push:
+    branches: [ 'master' ]
+  pull_request:
+    # The branches below must be a subset of the branches above
+    branches: [ 'master' ]
+  schedule:
+    - cron: '29 18 * * 0'
+
+jobs:
+  analyze:
+    name: Analyze
+    runs-on: ubuntu-latest
+    permissions:
+      actions: read
+      contents: read
+      security-events: write
+
+    strategy:
+      fail-fast: false
+      matrix:
+        language: [ 'python', 'javascript' ]
+        # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python', 'ruby' ]
+        # Use only 'java' to analyze code written in Java, Kotlin or both
+        # Use only 'javascript' to analyze code written in JavaScript, TypeScript or both
+        # Learn more about CodeQL language support at https://aka.ms/codeql-docs/language-support
+
+    steps:
+    - name: Checkout repository
+      uses: actions/checkout@v3
+
+    # Initializes the CodeQL tools for scanning.
+    - name: Initialize CodeQL
+      uses: github/codeql-action/init@v2
+      with:
+        languages: ${{ matrix.language }}
+        # If you wish to specify custom queries, you can do so here or in a config file.
+        # By default, queries listed here will override any specified in a config file.
+        # Prefix the list here with "+" to use these queries and those in the config file.
+
+        # For more details on CodeQL's query packs, refer to: https://docs.github.com/en/code-security/code-scanning/automatically-scanning-your-code-for-vulnerabilities-and-errors/configuring-code-scanning#using-queries-in-ql-packs
+        # queries: security-extended,security-and-quality
+
+
+    # Autobuild attempts to build any compiled languages (C/C++, C#, Go, Java, or Swift).
+    # If this step fails, then you should remove it and run the build manually (see below)
+    - name: Autobuild
+      uses: github/codeql-action/autobuild@v2
+
+    # ℹ️ Command-line programs to run using the OS shell.
+    # 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun
+
+    #   If the Autobuild step above fails, remove it and uncomment the following three lines.
+    #   Modify them (or add more) to build your project; refer to the EXAMPLE below for guidance.
+
+    # - run: |
+    #     echo "Run, Build Application using script"
+    #     ./location_of_script_within_repo/buildscript.sh
+
+    - name: Perform CodeQL Analysis
+      uses: github/codeql-action/analyze@v2
+      with:
+        category: "/language:${{matrix.language}}"
diff --git a/.github/workflows/pythonbuild.yml b/.github/workflows/pythonbuild.yml
@@ -13,14 +13,16 @@ jobs:
     strategy:
       matrix:
         include:
-          - python-version: 3.6
+          - python-version: "3.6"
             tox-env: py36-core
-          - python-version: 3.7
+          - python-version: "3.7"
             tox-env: py37-core
-          - python-version: 3.8
+          - python-version: "3.8"
             tox-env: py38-core
-          - python-version: 3.9
+          - python-version: "3.9"
             tox-env: py39-core
+          - python-version: "3.10"
+            tox-env: py310-core
 
     steps:
       - uses: actions/checkout@v2
@@ -37,7 +39,7 @@ jobs:
           key: ${{ format('{0}-pip-{1}', runner.os, hashFiles('dev-requirements.txt', format('requirements{0}.txt', matrix.spark-version-suffix))) }}
       - name: Install dependencies
         run: |
-          python -m pip install --upgrade pip 'tox<3.0'
+          python -m pip install --upgrade pip 'tox<4.0'
       - name: Setup MySQL DB
         run: |
           sudo /etc/init.d/mysql start
@@ -49,7 +51,7 @@ jobs:
           TOXENV: ${{ matrix.tox-env }}
         run: tox
       - name: Codecov
-        env: 
+        env:
           COVERAGE_PROCESS_START: .coveragerc
         run: |
           pip install codecov
@@ -76,14 +78,16 @@ jobs:
     strategy:
       matrix:
         include:
-          - python-version: 3.6
+          - python-version: "3.6"
             tox-env: py36-postgres
-          - python-version: 3.7
+          - python-version: "3.7"
             tox-env: py37-postgres
-          - python-version: 3.8
+          - python-version: "3.8"
             tox-env: py38-postgres
-          - python-version: 3.9
+          - python-version: "3.9"
             tox-env: py39-postgres
+          - python-version: "3.10"
+            tox-env: py310-postgres
 
     steps:
       - uses: actions/checkout@v2
@@ -100,7 +104,7 @@ jobs:
           key: ${{ format('{0}-pip-{1}', runner.os, hashFiles('dev-requirements.txt', format('requirements{0}.txt', matrix.spark-version-suffix))) }}
       - name: Install dependencies
         run: |
-          python -m pip install --upgrade pip 'tox<3.0'
+          python -m pip install --upgrade pip 'tox<4.0'
       - name: Create PSQL database
         run: |
           PGPASSWORD=postgres psql -h localhost -p 5432 -c 'create database spotify;' -U postgres
@@ -109,7 +113,7 @@ jobs:
           TOXENV: ${{ matrix.tox-env }}
         run: tox
       - name: Codecov
-        env: 
+        env:
           COVERAGE_PROCESS_START: .coveragerc
         run: |
           pip install codecov
@@ -125,45 +129,54 @@ jobs:
     strategy:
       matrix:
         include:
-          - python-version: 3.6
+          - python-version: "3.6"
             tox-env: py36-aws
-          - python-version: 3.7
+          - python-version: "3.7"
             tox-env: py37-aws
-          - python-version: 3.8
+          - python-version: "3.8"
             tox-env: py38-aws
-          - python-version: 3.9
+          - python-version: "3.9"
             tox-env: py39-aws
+          - python-version: "3.10"
+            tox-env: py310-aws
 
-          - python-version: 3.6
+          - python-version: "3.6"
             tox-env: py36-unixsocket
             OVERRIDE_SKIP_CI_TESTS: True
-          - python-version: 3.7
+          - python-version: "3.7"
             tox-env: py37-unixsocket
             OVERRIDE_SKIP_CI_TESTS: True
-          - python-version: 3.8
+          - python-version: "3.8"
             tox-env: py38-unixsocket
             OVERRIDE_SKIP_CI_TESTS: True
-          - python-version: 3.9
+          - python-version: "3.9"
             tox-env: py39-unixsocket
             OVERRIDE_SKIP_CI_TESTS: True
+          - python-version: "3.10"
+            tox-env: py310-unixsocket
+            OVERRIDE_SKIP_CI_TESTS: True
 
-          - python-version: 3.6
+          - python-version: "3.6"
             tox-env: py36-apache
-          - python-version: 3.7
+          - python-version: "3.7"
             tox-env: py37-apache
-          - python-version: 3.8.9
+          - python-version: "3.8"
             tox-env: py38-apache
-          - python-version: 3.9.4
+          - python-version: "3.9"
             tox-env: py39-apache
-
-          - python-version: 3.6
+          - python-version: "3.10"
+            tox-env: py310-apache
+
+          - python-version: "3.6"
             tox-env: py36-azureblob
-          - python-version: 3.7
+          - python-version: "3.7"
             tox-env: py37-azureblob
-          - python-version: 3.8
+          - python-version: "3.8"
             tox-env: py38-azureblob
-          - python-version: 3.9
+          - python-version: "3.9"
             tox-env: py39-azureblob
+          - python-version: "3.10"
+            tox-env: py310-azureblob
 
 
           - python-version: 3.9
@@ -194,7 +207,7 @@ jobs:
         run: tox
       - name: Codecov
         if: ${{ matrix.tox-env != 'flake8' && matrix.tox-env != 'docs' }}
-        env: 
+        env:
           COVERAGE_PROCESS_START: .coveragerc
         run: |
           pip install codecov

diff --git a/README.rst b/README.rst
@@ -2,8 +2,8 @@
    :alt: Luigi Logo
    :align: center
 
-.. image:: https://img.shields.io/travis/spotify/luigi/master.svg?style=flat
-    :target: https://travis-ci.org/spotify/luigi
+.. image:: https://img.shields.io/endpoint.svg?url=https%3A%2F%2Factions-badge.atrox.dev%2Fspotify%2Fluigi%2Fbadge&label=build&logo=none&%3Fref%3Dmaster&style=flat
+    :target: https://actions-badge.atrox.dev/spotify/luigi/goto?ref=master
 
 .. image:: https://img.shields.io/codecov/c/github/spotify/luigi/master.svg?style=flat
     :target: https://codecov.io/gh/spotify/luigi?branch=master
@@ -14,7 +14,7 @@
 .. image:: https://img.shields.io/pypi/l/luigi.svg?style=flat
    :target: https://pypi.python.org/pypi/luigi
 
-Luigi is a Python (3.6, 3.7, 3.8, 3.9 tested) package that helps you build complex
+Luigi is a Python (3.6, 3.7, 3.8, 3.9, 3.10 tested) package that helps you build complex
 pipelines of batch jobs. It handles dependency resolution, workflow management,
 visualization, handling failures, command line integration, and much more.
 
@@ -100,7 +100,7 @@ Conceptually, Luigi is similar to `GNU
 Make <http://www.gnu.org/software/make/>`_ where you have certain tasks
 and these tasks in turn may have dependencies on other tasks. There are
 also some similarities to `Oozie <http://oozie.apache.org/>`_
-and `Azkaban <http://data.linkedin.com/opensource/azkaban>`_. One major
+and `Azkaban <https://azkaban.github.io/>`_. One major
 difference is that Luigi is not just built specifically for Hadoop, and
 it's easy to extend it with other kinds of tasks.
 

diff --git a/SECURITY.md b/SECURITY.md
@@ -0,0 +1,5 @@
+# Security Policy
+
+## Reporting a Vulnerability
+
+Please report sensitive security issues via Spotify's [bug-bounty program](https://hackerone.com/spotify) by following these [instructions](https://docs.hackerone.com/programs/security-page.html), rather than through GitHub.
diff --git a/catalog-info.yaml b/catalog-info.yaml
@@ -0,0 +1,7 @@
+apiVersion: backstage.io/v1alpha1
+kind: Component
+metadata:
+  name: luigi
+spec:
+  type: library
+  owner: dataex
diff --git a/doc/configuration.rst b/doc/configuration.rst
@@ -168,24 +168,6 @@ log_level
 logging_conf_file
   Location of the logging configuration file.
 
-max_shown_tasks
-  .. versionadded:: 1.0.20
-
-  The maximum number of tasks returned in a task_list api call. This
-  will restrict the number of tasks shown in task lists in the
-  visualiser. Small values can alleviate frozen browsers when there are
-  too many done tasks. This defaults to 100000 (one hundred thousand).
-
-max_graph_nodes
-  .. versionadded:: 2.0.0
-
-  The maximum number of nodes returned by a dep_graph or
-  inverse_dep_graph api call. Small values can greatly speed up graph
-  display in the visualiser by limiting the number of nodes shown. Some
-  of the nodes that are not sent to the visualiser will still show up as
-  dependencies of nodes that were sent. These nodes are given TRUNCATED
-  status.
-
 no_configure_logging
   If true, logging is not configured. Defaults to false.
 
@@ -303,14 +285,14 @@ wait_interval
   available jobs.
 
 wait_jitter
-  Size of jitter to add to the worker wait interval such that the multiple
-  workers do not ask the scheduler for another job at the same time.
+  Duration of jitter, in seconds, to add to the worker wait interval so that
+  multiple workers do not ask the scheduler for another job at the same time.
   Default: 5.0
 
 max_keep_alive_idle_duration
   .. versionadded:: 2.8.4
 
-  Maximum duration to keep worker alive while in idle state.
+  Maximum duration in seconds to keep worker alive while in idle state.
   Default: 0 (Indefinitely)
 
 max_reschedules
@@ -374,6 +356,15 @@ check_complete_on_run
   missing.
   Defaults to false.
 
+cache_task_completion
+  By default, Luigi task processes may check the completion status multiple
+  times per task, which is a safe way to avoid potential inconsistencies. For
+  tasks with many dynamic dependencies yielded in multiple stages, this can
+  become expensive, e.g. when the per-task completion check involves remote
+  resources. When set to true, completion checks are cached so that a task
+  found complete once is not checked again.
+  Defaults to false.
+
 
 [elasticsearch]
 ---------------
@@ -447,7 +438,7 @@ traceback_max_length
 Parameters controlling the contents of batch notifications sent from the
 scheduler
 
-email_interval
+email_interval_minutes
   Number of minutes between e-mail sends. Making this larger results in
   fewer, bigger e-mails.
   Defaults to 60.
@@ -789,6 +780,24 @@ disable_window
   scheduler forgets about disables that have occurred longer ago than
   this amount of time. Defaults to 3600 (1 hour).
 
+max_shown_tasks
+  .. versionadded:: 1.0.20
+
+  The maximum number of tasks returned in a task_list api call. This
+  will restrict the number of tasks shown in task lists in the
+  visualiser. Small values can alleviate frozen browsers when there are
+  too many done tasks. This defaults to 100000 (one hundred thousand).
+
+max_graph_nodes
+  .. versionadded:: 2.0.0
+
+  The maximum number of nodes returned by a dep_graph or
+  inverse_dep_graph api call. Small values can greatly speed up graph
+  display in the visualiser by limiting the number of nodes shown. Some
+  of the nodes that are not sent to the visualiser will still show up as
+  dependencies of nodes that were sent. These nodes are given TRUNCATED
+  status.
+
 record_task_history
   If true, stores task history in a database. Defaults to false.
 
@@ -836,7 +845,12 @@ metrics_collector
   Optional setting allowing Luigi to use a contribution to collect metrics
   about the pipeline to a third-party. By default this uses the default metric
   collector that acts as a shell and does nothing. The currently available
-  options are "datadog" and "prometheus".
+  options are "datadog", "prometheus" and "custom". If set to "custom",
+  'metrics_custom_import' must also be set.
+
+metrics_custom_import
+  Optional setting allowing Luigi to import a custom subclass of MetricsCollector
+  at runtime. The string should be formatted like "module.sub_module.ClassName".
 
 
 [sendgrid]