diff --git a/README_snakemake-utils.md b/README_snakemake-utils.md
new file mode 100644
index 0000000..730289f
--- /dev/null
+++ b/README_snakemake-utils.md
@@ -0,0 +1,72 @@
+# Snakemake Utilities / CUBI
+
+This repository contains a script to generate Snakemake profiles from a generic config and resource presets. Currently, the data in this repository support generating Snakemake profiles for the HHU cluster "HILBERT", and for local execution such as on a laptop.
+
+## Usage
+
+If necessary, create the Conda environment specified in `envs/smk_profile.yaml` to have the PyYAML package available.
+
+Run the script `set_profile.py --help` to display the command line help.
+
+Briefly, the mode of operation is as follows:
+
+1. specify the infrastructure (`-i`) you are targeting: `local` or `hilbert`
+2. if you plan on using Snakemake version 8.x, add `-smk8`, because starting with Snakemake 8.0, some options/commands have been deprecated or renamed
+3. for cluster execution, select a resource preset YAML file (`-r`) located in `profiles/<infrastructure>/resource_presets/<preset>.yaml`
+    - the preset equivalent to the Snakemake profile up to release/tag v1.0.0 of this repository is `mem-mb_walltime_wo-bonus.yaml`
+    - if you activated `-smk8`, make sure that you select a Snakemake 8.x adjusted YAML file located in `profiles/<infrastructure>/resource_presets/<preset>_smk8.yaml`
+4. specify the values to replace the placeholders as an ordered list (`-p`). The currently recognized placeholders are, in that order, the "project" name and the "anchor" name (context: bonus/priority points).
+5. specify the Snakemake working directory via `-w`; the profile will be copied to this folder. This file copying is done because Snakemake does not reasonably resolve paths to the files mentioned in the profile.
+    - if you generate several profiles, e.g., one with and one without using bonus points, you can also specify a suffix via `-s` that will be appended to the profile folder name.
+
+Having generated your execution profile, you can run `snakemake` as follows:
+
+```bash
+$ snakemake -d SNAKEMAKE-WORK-DIR/ --profile SNAKEMAKE-WORK-DIR/prf_<PROJECT>_<SUFFIX> [...]
+```
+
+As explained above, the `<SUFFIX>` part is optional.
+
+If you execute your workflow on an HPC cluster, the created profile folder includes a special config file `env.yaml`
+that contains information on available CPU cores and common (and maximal) memory configurations of the
+cluster compute nodes (= the job execution servers). Using that information requires loading this configuration
+file via the `--configfiles` parameter:
+
+```bash
+$ snakemake -d SNAKEMAKE-WORK-DIR/ \
+    --profile SNAKEMAKE-WORK-DIR/prf_<PROJECT>_<SUFFIX> \
+    --configfiles SNAKEMAKE-WORK-DIR/prf_<PROJECT>_<SUFFIX>/env.yaml \
+    [...]
+```
+
+Note that the CUBI Snakemake workflow template sets (low) default values for the available CPU cores, so it is
+strongly recommended to make use of the `env.yaml` configuration file.
+
+### Cluster logs
+
+Note that the `pbs-submit.py` script includes the option to create the required directories that are the destinations for `stdout` and `stderr` of the cluster jobs:
+
+```
+pbs-submit.py ++mkdirs log/cluster_jobs/err,log/cluster_jobs/out [...]
+```
+
+These directory names match what is then specified further down in the profile:
+
+```
+  -e log/cluster_jobs/err/{rule}.{jobid}.stderr
+  -o log/cluster_jobs/out/{rule}.{jobid}.stdout
+```
+
+If these folders do not exist at runtime, you'll receive PBS error notifications via e-mail.
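+
+### Complete example
+
+For illustration, a hypothetical end-to-end run. The project name `MYPROJECT`, the anchor name `MYANCHOR`, the suffix `bonus`, and all paths are stand-in values, not defaults shipped with this repository:
+
+```bash
+$ ./set_profile.py -i hilbert \
+    -r profiles/hilbert/resource_presets/mem-mb_time-hrs_w-bonus.yaml \
+    -p MYPROJECT MYANCHOR \
+    -w /path/to/SNAKEMAKE-WORK-DIR \
+    -s bonus
+$ snakemake -d /path/to/SNAKEMAKE-WORK-DIR/ \
+    --profile /path/to/SNAKEMAKE-WORK-DIR/prf_MYPROJECT_bonus \
+    --configfiles /path/to/SNAKEMAKE-WORK-DIR/prf_MYPROJECT_bonus/env.yaml \
+    [...]
+```
+
+With `env.yaml` loaded via `--configfiles`, workflow rules can read the cluster dimensions from Snakemake's `config` dictionary. A minimal sketch (the rule name and output path are made up):
+
+```python
+rule example:
+    output:
+        "results/example.txt"
+    # cpu_medium / memory_common_mb come from the profile's env.yaml;
+    # fall back to low defaults if that file was not loaded
+    threads: config.get("cpu_medium", 4)
+    resources:
+        mem_mb=config.get("memory_common_mb", 8192)
+    shell:
+        "env > {output}"
+```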
+
+## Contributors
+
+- HHU/CUBI source
+  - Developer: Peter Ebert
+- HHU source
+  - Developer: Lukas Rose
+- Original source
+  - Copyright: 2017 Snakemake-Profiles
+  - License: MIT
+  - Developer: gh#neilav
+  - URL: https://github.com/Snakemake-Profiles/pbs-torque
diff --git a/envs/conda/cubi_tools.yaml b/cubi-tools/prototypes/envs/conda/cubi_tools.yaml
similarity index 100%
rename from envs/conda/cubi_tools.yaml
rename to cubi-tools/prototypes/envs/conda/cubi_tools.yaml
diff --git a/cubi-tools/prototypes/envs/smk_profile.yaml b/cubi-tools/prototypes/envs/smk_profile.yaml
new file mode 100644
index 0000000..0d5a560
--- /dev/null
+++ b/cubi-tools/prototypes/envs/smk_profile.yaml
@@ -0,0 +1,10 @@
+name: smk_profile
+dependencies:
+  - Python=3.9.*
+  - pip
+  - mamba=0.25.0
+  - pyyaml=6.0
+  - semver=2.13.0
+  - pylint=2.14.5
+  - isort=5.10.1
+  - black=22.6.0
diff --git a/cubi-tools/prototypes/profiles/hilbert/base.yaml b/cubi-tools/prototypes/profiles/hilbert/base.yaml
new file mode 100644
index 0000000..09f1adf
--- /dev/null
+++ b/cubi-tools/prototypes/profiles/hilbert/base.yaml
@@ -0,0 +1,30 @@
+
+# Use custom submit script that, by default,
+# adds the Singularity envmodule to the jobscript
+# to be loaded before the Snakemake job execution.
+# To deactivate that behavior, add the parameter
+# ++no-singularity-module
+cluster: >-
+  pbs-submit.py ++mkdirs log/cluster_jobs/err,log/cluster_jobs/out
+  -e log/cluster_jobs/err/{rule}.{jobid}.stderr
+  -o log/cluster_jobs/out/{rule}.{jobid}.stdout
+  -N {jobid}_{rule}
+
+cluster-status: pbs-status.py
+cluster-cancel: qdel
+jobscript: pbs-jobscript.sh
+jobs: 100
+local-cores: 2
+max-jobs-per-second: 5
+immediate-submit: false
+max-status-checks-per-second: 10
+scheduler: ilp
+verbose: false
+reason: false
+latency-wait: 60
+keep-going: true
+keep-incomplete: false
+restart-times: 1
+rerun-incomplete: true
+nolock: true
+conda-frontend: mamba
diff --git a/cubi-tools/prototypes/profiles/hilbert/base_smk8.yaml b/cubi-tools/prototypes/profiles/hilbert/base_smk8.yaml
new file mode 100644
index 0000000..2f5179e
--- /dev/null
+++ b/cubi-tools/prototypes/profiles/hilbert/base_smk8.yaml
@@ -0,0 +1,29 @@
+
+# Use custom submit script that, by default,
+# adds the Singularity envmodule to the jobscript
+# to be loaded before the Snakemake job execution.
+# To deactivate that behavior, add the parameter
+# ++no-singularity-module
+cluster-generic-submit-cmd: >-
+  pbs-submit.py ++mkdirs log/cluster_jobs/err,log/cluster_jobs/out
+  -e log/cluster_jobs/err/{rule}.{jobid}.stderr
+  -o log/cluster_jobs/out/{rule}.{jobid}.stdout
+  -N {jobid}_{rule}
+
+cluster-generic-status-cmd: pbs-status.py
+cluster-generic-cancel-cmd: qdel
+jobscript: pbs-jobscript.sh
+jobs: 100
+local-cores: 2
+max-jobs-per-second: 5
+immediate-submit: false
+max-status-checks-per-second: 10
+scheduler: ilp
+verbose: false
+latency-wait: 60
+keep-going: true
+keep-incomplete: false
+restart-times: 1
+rerun-incomplete: true
+nolock: true
+conda-frontend: mamba
diff --git a/cubi-tools/prototypes/profiles/hilbert/cluster_utils/pbs-jobscript.sh b/cubi-tools/prototypes/profiles/hilbert/cluster_utils/pbs-jobscript.sh
new file mode 100755
index 0000000..84bfa8c
--- /dev/null
+++ b/cubi-tools/prototypes/profiles/hilbert/cluster_utils/pbs-jobscript.sh
@@ -0,0 +1,83 @@
+#!/bin/sh
+# properties = {properties}
+
+# If really needed for debugging, uncomment the following two lines:
+#echo "Will execute the following jobscript: "
+#cat $0
+
+# Will be inserted by pbs-submit.py
+# <insert-modules-here>
+
+# 2022-03-31
+# Properly set TMPDIR and change the default location
+# of SINGULARITY_CACHEDIR to the (node-local) temp storage.
+# At the time of writing, this deals with certain Singularity
+# problems when too many containers run in parallel and dump
+# their rootfs all to the same location on the /gpfs
+# (default: /gpfs/scratch/$USER/.singularity)
+# CAVEAT: the node-local temp storage is not monitored and
+# cannot be requested as a job resource, which increases
+# the risk of job failures because the node is running out
+# of temp storage.
+
+# As long as the node-local temp storage is not monitored
+# by PBS, track the info in the job output logs for
+# potential debugging purposes.
+
+echo "Execution host:"
+echo `uname -a`
+echo "Size of /tmp:"
+echo `df -h /tmp`
+
+# Unlikely: if a jobscript is not executed via
+# the cluster scheduler (PBS), it will nevertheless
+# create a temp directory, which needs to be
+# cleaned up after the job (no matter the job's exit status)
+TMPCLEANUP="MANUAL"
+
+if [[ -d $TMPDIR ]];
+then
+    echo "TMPDIR is set to: $TMPDIR"
+    TMPCLEANUP="AUTO"
+else
+    echo "No TMPDIR set"
+    TMPDIR=$(mktemp -d -t $USER-task-XXXXXXXX)
+    echo "TMPDIR set to: $TMPDIR"
+fi;
+
+# set and export all of these in case some tool dev doesn't know
+# how to properly request a temp file...
+export TEMP=$TMPDIR
+export TEMPDIR=$TMPDIR
+export TMP=$TMPDIR
+echo "Set env vars TEMP / TEMPDIR / TMP to $TMPDIR"
+export SINGULARITY_CACHEDIR=$TMPDIR/.singularity/cache
+export SINGULARITY_TMPDIR=$TMPDIR/.singularity/tmpdir
+echo "SINGULARITY_CACHEDIR set to $SINGULARITY_CACHEDIR"
+echo "SINGULARITY_TMPDIR set to $SINGULARITY_TMPDIR"
+
+{exec_job}
+
+# 2022-04-07 note: for Snakemake cluster jobs, this last
+# part of the jobscript is not triggered if a cluster
+# status command script is configured at the Snakemake
+# command line (or profile). If so, the Snakemake
+# command is extended with " && exit 0 || exit 1"
+# (see "executors.py"), presumably to ensure always
+# returning 0 or 1. In a cluster run, this seems
+# acceptable since the scheduler will take care of
+# cleaning up $TMPDIR.
+
+# Capture job's exit status before triggering
+# potential cleanup operations
+
+JOBEXIT=$?
+
+if [[ "$TMPCLEANUP" = "MANUAL" ]];
+then
+    echo "Deleting TMPDIR: $TMPDIR"
+    rm -rfd $TMPDIR
+fi;
+
+echo "Done - job exit status: $JOBEXIT"
+exit $JOBEXIT
diff --git a/cubi-tools/prototypes/profiles/hilbert/cluster_utils/pbs-status.py b/cubi-tools/prototypes/profiles/hilbert/cluster_utils/pbs-status.py
new file mode 100755
index 0000000..9343ecd
--- /dev/null
+++ b/cubi-tools/prototypes/profiles/hilbert/cluster_utils/pbs-status.py
@@ -0,0 +1,346 @@
+#!/usr/bin/env python3
+# This is a small script to allow Snakemake to query the PBSPro scheduler
+# for job status. Use with --cluster-status "python statuscommand.py".
+# Author: Lukas Rose
+# Maintainer for CUBI: Peter Ebert
+
+import sys
+import logging
+import argparse
+import subprocess
+import json
+from enum import Enum
+from logging import StreamHandler, FileHandler
+
+
+class JobState(Enum):
+    running = 0
+    success = 1
+    failed = -1
+
+
+# Source: PBS Professional 18.2, Administrator's Guide, Section 14.9: Job Exit Status Code
+PBS_EXIT_CODES = {
+    0: {
+        "name": "JOB_EXEC_OK",
+        "description": "Job execution was successful",
+        "state": JobState.success
+    },
+    -1: {
+        "name": "JOB_EXEC_FAIL1",
+        "description": "Job execution failed, before files, no retry",
+        "state": JobState.failed
+    },
+    -2: {
+        "name": "JOB_EXEC_FAIL2",
+        "description": "Job execution failed, after files, no retry",
+        "state": JobState.failed
+    },
+    -3: {
+        "name": "JOB_EXEC_RETRY",
+        "description": "Job execution failed, do retry",
+        "state": JobState.failed
+    },
+    -4: {
+        "name": "JOB_EXEC_INITABT",
+        "description": "Job aborted on MoM initialization",
+        "state": JobState.failed
+    },
+    -5: {
+        "name": "JOB_EXEC_INITRST",
+        "description": "Job aborted on MoM initialization, checkpoint, no migrate",
+        "state": JobState.failed
+    },
+    -6: {
+        "name": "JOB_EXEC_INITRMG",
+        "description": "Job aborted on MoM initialization, checkpoint, ok migrate",
+        "state": JobState.failed
+    },
+    -7: {
+        "name": "JOB_EXEC_BADRESRT",
+        "description": "Job restart failed",
+        "state": JobState.failed
+    },
+    -8: {
+        "name": "JOB_EXEC_GLOBUS_INIT_RETRY",
+        "description": "Globus can still send jobs to PBS, but PBS no longer supports sending jobs to Globus. No longer used. Initialization of Globus job failed; do retry",
+        "state": JobState.failed
+    },
+    -9: {
+        "name": "JOB_EXEC_GLOBUS_INIT_FAIL",
+        "description": "Globus can still send jobs to PBS, but PBS no longer supports sending jobs to Globus. No longer used. Initialization of Globus job failed; no retry",
+        "state": JobState.failed
+    },
+    -10: {
+        "name": "JOB_EXEC_FAILUID",
+        "description": "Invalid UID/GID for job",
+        "state": JobState.failed
+    },
+    -11: {
+        "name": "JOB_EXEC_RERUN",
+        "description": "Job was rerun",
+        "state": JobState.failed
+    },
+    -12: {
+        "name": "JOB_EXEC_CHKP",
+        "description": "Job was checkpointed and killed",
+        "state": JobState.failed
+    },
+    -13: {
+        "name": "JOB_EXEC_FAIL_PASSWORD",
+        "description": "Job failed due to a bad password",
+        "state": JobState.failed
+    },
+    -14: {
+        "name": "JOB_EXEC_RERUN_ON_SIS_FAIL",
+        "description": "Job was requeued (if rerunnable) or deleted (if not) due to a communication failure between Mother Superior and a Sister",
+        "state": JobState.failed
+    },
+    -15: {
+        "name": "JOB_EXEC_QUERST",
+        "description": "Requeue job for restart from checkpoint",
+        "state": JobState.failed
+    },
+    -16: {
+        "name": "JOB_EXEC_FAILHOOK_RERUN",
+        "description": "Job execution failed due to hook rejection; requeue for later retry",
+        "state": JobState.failed
+    },
+    -17: {
+        "name": "JOB_EXEC_FAILHOOK_DELETE",
+        "description": "Job execution failed due to hook rejection; delete the job at end",
+        "state": JobState.failed
+    },
+    -18: {
+        "name": "JOB_EXEC_HOOK_RERUN",
+        "description": "A hook requested for job to be requeued",
+        "state": JobState.failed
+    },
+    -19: {
+        "name": "JOB_EXEC_HOOK_DELETE",
+        "description": "A hook requested for job to be deleted",
+        "state": JobState.failed
+    },
+    -20: {
+        "name": "JOB_EXEC_RERUN_MS_FAIL",
+        "description": "Mother superior connection failed",
+        "state": JobState.failed
+    },
+    "failed": {
+        "name": "JOB_SCRIPT_FAILED",
+        "description": "The exit value of the jobscript was {exit_code}. Probably one of the commands failed",
+        "state": JobState.failed
+    },
+    "killed": {
+        "name": "JOB_SCRIPT_KILLED",
+        "description": "The job was killed with signal {signal}. See 'kill -l' for a list of signal names on your system",
+        "state": JobState.failed
+    },
+    "default": {
+        "name": "EXIT_UNKNOWN",
+        "description": "An unknown exit code was encountered: {exit_code}. Ask the system administrator for help",
+        "state": JobState.failed
+    }
+}
+
+# Source: PBS Professional 18.2, Reference Guide, 2.56.3: Options to qselect, Table 2-23: Job States
+PBS_JOB_STATES = {
+    "B": {
+        "name": "STARTED",
+        "description": "Job array has started execution",
+        "state": JobState.running
+    },
+    "E": {
+        "name": "EXITING",
+        "description": "Job is exiting",
+        "state": JobState.running
+    },
+    "H": {
+        "name": "HELD",
+        "description": "Job is held",
+        "state": JobState.running
+    },
+    "M": {
+        "name": "MOVED",
+        "description": "Job is moved",
+        "state": JobState.running
+    },
+    "Q": {
+        "name": "QUEUED",
+        "description": "Job is queued and waiting to start",
+        "state": JobState.running
+    },
+    "R": {
+        "name": "RUNNING",
+        "description": "Job is currently running",
+        "state": JobState.running
+    },
+    "S": {
+        "name": "SUSPENDED",
+        "description": "Job is suspended",
+        "state": JobState.running
+    },
+    "T": {
+        "name": "TRANSITING",
+        "description": "Job is transiting",
+        "state": JobState.running
+    },
+    "U": {
+        "name": "WAIT_USER",
+        "description": "Job suspended due to workstation user activity",
+        "state": JobState.running
+    },
+    "W": {
+        "name": "WAIT",
+        "description": "Job is waiting",
+        "state": JobState.running
+    },
+    "X": {
+        "name": "EXITED",
+        "description": "The eXited state. Subjobs only",
+        "state": JobState.running
+    },
+    "default": {
+        "name": "STATUS_UNKNOWN",
+        "description": "An unknown job status was encountered: {job_status}. Ask the system administrator for help",
+        "state": JobState.running
+    },
+    "unknown": {
+        "name": "JOB_UNKNOWN",
+        "description": "An unknown job ID was encountered: {job_id}. Make sure a job with the given ID exists.",
+        "state": JobState.failed
+    },
+    "error": {
+        "name": "UNKNOWN_ERROR",
+        "description": "An unknown error occurred while getting the job state for job {job_id}: {ex}. Please contact the system administrator.",
+        "state": JobState.failed
+    }
+}
+
+
+def query_qstat(job_id):
+    job_data = None
+    try:
+        qstat_result = subprocess.run(["qstat", "-f", "-x", "-F", "json", str(job_id)], stdout=subprocess.PIPE)
+        job_data = json.loads(qstat_result.stdout)
+    except Exception as ex:
+        logging.getLogger().error("An exception occurred when querying qstat: {ex}".format(ex=ex))
+    return job_data
+
+
+def decode_state_dict(state_dict, job_state, **kwargs):
+    state = state_dict[job_state]["state"]
+    name = state_dict[job_state]["name"]
+    description = state_dict[job_state]["description"].format(**kwargs)
+    return (state, name, description)
+
+
+def decode_job_status(job_id, job_data):
+    try:
+        if ("Jobs" in job_data and job_id in job_data["Jobs"]):
+            job_state = job_data["Jobs"][job_id]["job_state"]
+
+            if (job_state in PBS_JOB_STATES):
+                result = decode_state_dict(PBS_JOB_STATES, job_state)
+            elif (job_state == "F"):
+                exit_code = job_data["Jobs"][job_id]["Exit_status"]
+
+                if (exit_code in PBS_EXIT_CODES):
+                    result = decode_state_dict(PBS_EXIT_CODES, exit_code)
+                elif (1 <= exit_code < 128):
+                    result = decode_state_dict(PBS_EXIT_CODES, "failed", exit_code=exit_code)
+                elif (exit_code >= 128):
+                    result = decode_state_dict(PBS_EXIT_CODES, "killed", signal=exit_code % 128)
+                else:
+                    result = decode_state_dict(PBS_EXIT_CODES, "default", exit_code=exit_code)
+
+            else:
+                result = decode_state_dict(PBS_JOB_STATES, "default", job_status=job_state)
+        else:
+            result = decode_state_dict(PBS_JOB_STATES, "unknown", job_id=job_id)
+
+    except Exception as ex:
+        result = decode_state_dict(PBS_JOB_STATES, "error", job_id=job_id, ex=ex)
+
+    return result
+
+
+def parse_command_line():
+
+    parser = argparse.ArgumentParser(
+        description='Query the cluster job status for Snakemake',
+        add_help=True
+    )
+    parser.add_argument(
+        'job_id',
+        type=str
+    )
+    parser.add_argument(
+        '--log-stderr',
+        type=str,
+        choices=[member.name for member in JobState],
+        default='failed',
+        help='Print a message to stderr each time a job has this or a '
+             'more severe state. The default value of "failed" means: '
+             'Log all queries for jobs that have state "failed" to stderr'
+    )
+    parser.add_argument(
+        '--log-file',
+        type=str,
+        choices=[member.name for member in JobState],
+        default='success',
+        help='Print a message to --log-file-name each time a job has this or a '
+             'more severe state. E.g. the default value of "success" means: '
+             'Log all queries for jobs that have state "success" or something '
+             'more severe, e.g. "failed", but ignore jobs that are still "running".'
+    )
+    parser.add_argument(
+        '--log-file-name',
+        type=str,
+        default='clusterStatus.log',
+        help='Specify the filename where messages for states specified in '
+             '"--log-file" should be stored. Default: clusterStatus.log'
+    )
+
+    args = parser.parse_args()
+    return args
+
+
+if __name__ == "__main__":
+
+    args = parse_command_line()
+
+    # Map job states to log levels
+    jobStateLogLevels = {
+        'running': logging.DEBUG,
+        'success': logging.INFO,
+        'failed': logging.WARNING
+    }
+
+    # Setup logger
+    streamHandler = StreamHandler(stream=sys.stderr)
+    streamHandler.setLevel(jobStateLogLevels[args.log_stderr])
+
+    fileHandler = FileHandler(args.log_file_name)
+    fileHandler.setLevel(jobStateLogLevels[args.log_file])
+
+    logging.basicConfig(
+        level=logging.DEBUG,
+        format='[%(asctime)-15s] %(message)s',
+        handlers=[streamHandler, fileHandler]
+    )
+    logger = logging.getLogger()
+
+    job_data = query_qstat(args.job_id)
+    (state, exit_code_name, exit_code_description) = decode_job_status(args.job_id, job_data)
+
+    logger.log(
+        jobStateLogLevels[state.name], "State for {job_id}: {state}. Exit code: {exit_code_name} ({exit_code_description})".format(
+            job_id=args.job_id,
+            state=state.name,
+            exit_code_name=exit_code_name,
+            exit_code_description=exit_code_description
+        )
+    )
+
+    print(state.name)
diff --git a/cubi-tools/prototypes/profiles/hilbert/cluster_utils/pbs-submit.py b/cubi-tools/prototypes/profiles/hilbert/cluster_utils/pbs-submit.py
new file mode 100755
index 0000000..9eee132
--- /dev/null
+++ b/cubi-tools/prototypes/profiles/hilbert/cluster_utils/pbs-submit.py
@@ -0,0 +1,136 @@
+#!/usr/bin/env python3
+
+import os
+import sys
+import argparse
+import subprocess
+import re
+
+from snakemake.utils import read_job_properties
+
+
+def parse_args():
+
+    parser = argparse.ArgumentParser(prefix_chars='+', add_help=True)
+    parser.add_argument(
+        "++depend",
+        help="Space-separated list of IDs for jobs this job should depend on."
+    )
+    parser.add_argument(
+        "++no-singularity-module",
+        help="Do not add the default Singularity envmodule to the jobscript, i.e. "
+             "the Singularity envmodule will NOT be loaded prior to the "
+             "Snakemake job execution. Default: False",
+        action="store_true",
+        default=False
+    )
+    parser.add_argument(
+        "++verbose",
+        help="Print qsub command line to stderr before job submission. Default: False",
+        action="store_true",
+        default=False
+    )
+    parser.add_argument(
+        "++dry-run",
+        help="Do not execute anything - use for debugging. Default: False",
+        action="store_true",
+        default=False
+    )
+    parser.add_argument(
+        "++mkdirs",
+        help="Comma-separated list of directories to create to catch the stdout/stderr "
+             "output files of the cluster jobs",
+        default=""
+    )
+    parser.add_argument(
+        "qsub_args",
+        nargs="*",
+    )
+    parser.add_argument(
+        "jobscript",
+    )
+
+    args = parser.parse_args()
+    return args
+
+
+def main():
+
+    args = parse_args()
+
+    if (args.verbose):
+        print("pbs-submit.py received the following args:\n", args, file=sys.stderr)
+
+    if args.no_singularity_module:
+        if args.verbose:
+            print("NOT adding HPC envmodule 'Singularity' to jobscript", file=sys.stderr)
+        default_args = {"modules": []}
+    else:
+        if args.verbose:
+            print("Adding HPC envmodule 'Singularity' to jobscript", file=sys.stderr)
+        default_args = {"modules": ["Singularity"]}
+
+    if args.mkdirs:
+        for directory in args.mkdirs.split(","):
+            os.makedirs(directory, exist_ok=True)
+
+    try:
+        job_properties = read_job_properties(args.jobscript)
+    except Exception as e:
+        print("FATAL ERROR: could not read job properties:", e, file=sys.stderr)
+        raise
+
+    try:
+        default_args["modules"].extend(job_properties["cluster"]["modules"])
+    except KeyError as ke:
+        if args.verbose:
+            print("WARNING: could not merge clusterArgs because of key error:", ke, file=sys.stderr)
+    except Exception as e:
+        print("FATAL ERROR: could not read cluster modules:", e, file=sys.stderr)
+        raise
+
+    module_string = ""
+    for module in default_args["modules"]:
+        module_string += "module load {}\n".format(module)
+
+    try:
+        with open(args.jobscript, "r") as f:
+            jobscript_content = f.read()
+
+        # replace the marker line in the jobscript template
+        # with the "module load" commands assembled above
+        jobscript_content = re.sub('# <insert-modules-here>', module_string, jobscript_content)
+
+        if (args.verbose):
+            print("Modified jobscript:\n", jobscript_content, file=sys.stderr)
+
+        if not args.dry_run:
+            with open(args.jobscript, "w") as f:
+                f.write(jobscript_content)
+    except Exception as e:
+        print("FATAL ERROR: could not read or modify jobscript:", e, file=sys.stderr)
+        raise
+
+    depend_jobs = ""
+    if args.depend:
+        for m in args.depend.split(" "):
+            depend_jobs = depend_jobs + ":" + m
+        depend_jobs = " -W \"depend=afterok" + depend_jobs + "\""
+
+    cmd = "qsub {} {} {}".format(depend_jobs, " ".join(args.qsub_args), args.jobscript)
+
+    if (args.verbose):
+        print(f"Submitting jobscript {args.jobscript} to the cluster with qsub command:\n{cmd}\n", file=sys.stderr)
+
+    res = ""
+    if not args.dry_run:
+        try:
+            res = subprocess.run(cmd, check=True, shell=True, stdout=subprocess.PIPE)
+        except subprocess.CalledProcessError as e:
+            raise e
+        res = res.stdout.decode().strip()
+    print(res, file=sys.stdout)
+
+    return 0
+
+
+if __name__ == '__main__':
+    main()
diff --git a/cubi-tools/prototypes/profiles/hilbert/env.yaml b/cubi-tools/prototypes/profiles/hilbert/env.yaml
new file mode 100644
index 0000000..fd8f214
--- /dev/null
+++ b/cubi-tools/prototypes/profiles/hilbert/env.yaml
@@ -0,0 +1,11 @@
+
+cpu_low: 6
+cpu_medium: 12
+cpu_high: 24
+cpu_max: 72
+
+memory_max_mb: 3072000
+memory_max_gb: 3000
+
+memory_common_mb: 98304
+memory_common_gb: 96
diff --git a/cubi-tools/prototypes/profiles/hilbert/resource_presets/mem-mb_time-hrs_w-bonus.yaml b/cubi-tools/prototypes/profiles/hilbert/resource_presets/mem-mb_time-hrs_w-bonus.yaml
new file mode 100644
index 0000000..b2d1895
--- /dev/null
+++ b/cubi-tools/prototypes/profiles/hilbert/resource_presets/mem-mb_time-hrs_w-bonus.yaml
@@ -0,0 +1,16 @@
+cluster: >-
+  -A {resources.project}
+  -l select=1:ncpus={threads}:ngpus={resources.gpus}:mem={resources.mem_mb}M{resources.arch}
+  -l walltime={resources.time_hrs}:59:59
+  -l bonus={resources.bonus} {resources.anchor}
+default-resources:
+  - project=<project>
+  - mem_mb=1024
+  - time_hrs=1
+  - gpus=0
+  - bonus=1
+  - anchor="-l anchor=<anchor>"
+  - arch=""
+use-envmodules: true
+use-singularity: true
+use-conda: true
diff --git a/cubi-tools/prototypes/profiles/hilbert/resource_presets/mem-mb_time-hrs_w-bonus_smk8.yaml b/cubi-tools/prototypes/profiles/hilbert/resource_presets/mem-mb_time-hrs_w-bonus_smk8.yaml
new file mode 100644
index 0000000..4862403
--- /dev/null
+++ b/cubi-tools/prototypes/profiles/hilbert/resource_presets/mem-mb_time-hrs_w-bonus_smk8.yaml
@@ -0,0 +1,18 @@
+cluster-generic-submit-cmd: >-
+  -A {resources.project}
+  -l select=1:ncpus={threads}:ngpus={resources.gpus}:mem={resources.mem_mb}M{resources.arch}
+  -l walltime={resources.time_hrs}:59:59
+  -l bonus={resources.bonus} {resources.anchor}
+default-resources:
+  - project=<project>
+  - mem_mb=1024
+  - time_hrs=1
+  - gpus=0
+  - bonus=1
+  - anchor="-l anchor=<anchor>"
+  - arch=""
+software-deployment-method:
+  - conda
+  - apptainer
+  - env-modules
+
diff --git a/cubi-tools/prototypes/profiles/hilbert/resource_presets/mem-mb_time-hrs_wo-bonus.yaml b/cubi-tools/prototypes/profiles/hilbert/resource_presets/mem-mb_time-hrs_wo-bonus.yaml
new file mode 100644
index 0000000..f776b5a
--- /dev/null
+++ b/cubi-tools/prototypes/profiles/hilbert/resource_presets/mem-mb_time-hrs_wo-bonus.yaml
@@ -0,0 +1,16 @@
+cluster: >-
+  -A {resources.project}
+  -l select=1:ncpus={threads}:ngpus={resources.gpus}:mem={resources.mem_mb}M{resources.arch}
+  -l walltime={resources.time_hrs}:59:59
+  -l bonus={resources.bonus}
+default-resources:
+  - project=<project>
+  - mem_mb=1024
+  - time_hrs=1
+  - gpus=0
+  - bonus=0
+  - anchor=""
+  - arch=""
+use-envmodules: true
+use-singularity: true
+use-conda: true
diff --git a/cubi-tools/prototypes/profiles/hilbert/resource_presets/mem-mb_time-hrs_wo-bonus_smk8.yaml b/cubi-tools/prototypes/profiles/hilbert/resource_presets/mem-mb_time-hrs_wo-bonus_smk8.yaml
new file mode 100644
index 0000000..cf2595e
--- /dev/null
+++ b/cubi-tools/prototypes/profiles/hilbert/resource_presets/mem-mb_time-hrs_wo-bonus_smk8.yaml
@@ -0,0 +1,17 @@
+cluster-generic-submit-cmd: >-
+  -A {resources.project}
+  -l select=1:ncpus={threads}:ngpus={resources.gpus}:mem={resources.mem_mb}M{resources.arch}
+  -l walltime={resources.time_hrs}:59:59
+  -l bonus={resources.bonus}
+default-resources:
+  - project=<project>
+  - mem_mb=1024
+  - time_hrs=1
+  - gpus=0
+  - bonus=0
+  - anchor=""
+  - arch=""
+software-deployment-method:
+  - conda
+  - apptainer
+  - env-modules
diff --git a/cubi-tools/prototypes/profiles/hilbert/resource_presets/mem-mb_walltime_wo-bonus.yaml b/cubi-tools/prototypes/profiles/hilbert/resource_presets/mem-mb_walltime_wo-bonus.yaml
new file mode 100644
index 0000000..6a2a7b9
--- /dev/null
+++ b/cubi-tools/prototypes/profiles/hilbert/resource_presets/mem-mb_walltime_wo-bonus.yaml
@@ -0,0 +1,16 @@
+cluster: >-
+  -A {resources.project}
+  -l select=1:ncpus={threads}:ngpus={resources.gpus}:mem={resources.mem_mb}M{resources.arch}
+  -l walltime={resources.walltime}
+  -l bonus={resources.bonus}
+default-resources:
+  - project=<project>
+  - mem_mb=1024
+  - walltime="01:59:00"
+  - gpus=0
+  - bonus=0
+  - anchor=""
+  - arch=""
+use-envmodules: true
+use-singularity: true
+use-conda: true
diff --git a/cubi-tools/prototypes/profiles/hilbert/resource_presets/mem-mb_walltime_wo-bonus_smk8.yaml b/cubi-tools/prototypes/profiles/hilbert/resource_presets/mem-mb_walltime_wo-bonus_smk8.yaml
new file mode 100644
index 0000000..ce9f74d
--- /dev/null
+++ b/cubi-tools/prototypes/profiles/hilbert/resource_presets/mem-mb_walltime_wo-bonus_smk8.yaml
@@ -0,0 +1,17 @@
+cluster-generic-submit-cmd: >-
+  -A {resources.project}
+  -l select=1:ncpus={threads}:ngpus={resources.gpus}:mem={resources.mem_mb}M{resources.arch}
+  -l walltime={resources.walltime}
+  -l bonus={resources.bonus}
+default-resources:
+  - project=<project>
+  - mem_mb=1024
+  - walltime="01:59:00"
+  - gpus=0
+  - bonus=0
+  - anchor=""
+  - arch=""
+software-deployment-method:
+  - conda
+  - apptainer
+  - env-modules
diff --git a/cubi-tools/prototypes/profiles/hilbert/resource_presets/specific/verkko_w-bonus.yaml b/cubi-tools/prototypes/profiles/hilbert/resource_presets/specific/verkko_w-bonus.yaml
new file mode 100644
index 0000000..dc0f416
--- /dev/null
+++ b/cubi-tools/prototypes/profiles/hilbert/resource_presets/specific/verkko_w-bonus.yaml
@@ -0,0 +1,16 @@
+cluster: >-
+  -A {resources.project}
+  -l select=1:ncpus={threads}:ngpus={resources.gpus}:mem={resources.mem_gb}g{resources.arch}
+  -l walltime={resources.time_h}:59:59
+  -l bonus={resources.bonus} {resources.anchor}
+default-resources:
+  - project=<project>
+  - mem_gb=1
+  - time_h=1
+  - gpus=0
+  - bonus=1
+  - anchor="-l anchor=<anchor>"
+  - arch=""
+use-envmodules: true
+use-singularity: true
+use-conda: true
diff --git a/cubi-tools/prototypes/profiles/hilbert/resource_presets/specific/verkko_w-bonus_smk8.yaml b/cubi-tools/prototypes/profiles/hilbert/resource_presets/specific/verkko_w-bonus_smk8.yaml
new file mode 100644
index 0000000..5e2d4b4
--- /dev/null
+++ b/cubi-tools/prototypes/profiles/hilbert/resource_presets/specific/verkko_w-bonus_smk8.yaml
@@ -0,0 +1,17 @@
+cluster-generic-submit-cmd: >-
+  -A {resources.project}
+  -l select=1:ncpus={threads}:ngpus={resources.gpus}:mem={resources.mem_gb}g{resources.arch}
+  -l walltime={resources.time_h}:59:59
+  -l bonus={resources.bonus} {resources.anchor}
+default-resources:
+  - project=<project>
+  - mem_gb=1
+  - time_h=1
+  - gpus=0
+  - bonus=1
+  - anchor="-l anchor=<anchor>"
+  - arch=""
+software-deployment-method:
+  - conda
+  - apptainer
+  - env-modules
diff --git a/cubi-tools/prototypes/profiles/hilbert/resource_presets/specific/verkko_wo-bonus.yaml b/cubi-tools/prototypes/profiles/hilbert/resource_presets/specific/verkko_wo-bonus.yaml
new file mode 100644
index 0000000..7417730
--- /dev/null
+++ b/cubi-tools/prototypes/profiles/hilbert/resource_presets/specific/verkko_wo-bonus.yaml
@@ -0,0 +1,16 @@
+cluster: >-
+  -A {resources.project}
+  -l select=1:ncpus={threads}:ngpus={resources.gpus}:mem={resources.mem_gb}g{resources.arch}
+  -l walltime={resources.time_h}:59:59
+  -l bonus={resources.bonus}
+default-resources:
+  - project=<project>
+  - mem_gb=1
+  - time_h=1
+  - gpus=0
+  - bonus=1
+  - anchor=""
+  - arch=""
+use-envmodules: true
+use-singularity: true
+use-conda: true
diff --git a/cubi-tools/prototypes/profiles/hilbert/resource_presets/specific/verkko_wo-bonus_smk8.yaml b/cubi-tools/prototypes/profiles/hilbert/resource_presets/specific/verkko_wo-bonus_smk8.yaml
new file mode 100644
index 0000000..ca27970
--- /dev/null
+++ b/cubi-tools/prototypes/profiles/hilbert/resource_presets/specific/verkko_wo-bonus_smk8.yaml
@@ -0,0 +1,18 @@
+executor: cluster-generic
+cluster-generic-submit-cmd: >-
+  -A {resources.project}
+  -l select=1:ncpus={threads}:ngpus={resources.gpus}:mem={resources.mem_gb}g{resources.arch}
+  -l walltime={resources.time_h}:59:59
+  -l bonus={resources.bonus}
+default-resources:
+  - project=<project>
+  - mem_gb=1
+  - time_h=1
+  - gpus=0
+  - bonus=1
+  - anchor=""
+  - arch=""
+software-deployment-method:
+  - conda
+  - apptainer
+  - env-modules
diff --git a/cubi-tools/prototypes/profiles/local/base.yaml b/cubi-tools/prototypes/profiles/local/base.yaml
new file mode 100644
index 0000000..dad926c
--- /dev/null
+++ b/cubi-tools/prototypes/profiles/local/base.yaml
@@ -0,0 +1,13 @@
+cores:
+scheduler: ilp
+verbose: false
+reason: false
+latency-wait: 5
+keep-going: true
+keep-incomplete: false
+restart-times: 1
+rerun-incomplete: true
+nolock: true
+conda-frontend: mamba
+use-singularity: true
+use-conda: true
diff --git a/cubi-tools/prototypes/profiles/local/base_smk8.yaml b/cubi-tools/prototypes/profiles/local/base_smk8.yaml
new file mode 100644
index 0000000..78b9fa2
--- /dev/null
+++ b/cubi-tools/prototypes/profiles/local/base_smk8.yaml
@@ -0,0 +1,14 @@
+cores:
+scheduler: ilp
+verbose: false
+latency-wait: 5
+keep-going: true
+keep-incomplete: false
+restart-times: 1
+rerun-incomplete: true
+nolock: true
+conda-frontend: mamba
+software-deployment-method:
+  - conda
+  - apptainer
+
diff --git a/cubi-tools/prototypes/set_profile.py b/cubi-tools/prototypes/set_profile.py
new file mode 100755
index 0000000..813b984
--- /dev/null
+++ b/cubi-tools/prototypes/set_profile.py
@@ -0,0 +1,300 @@
+#!/usr/bin/env python
+
+import os
+import argparse as argp
+import pathlib as pl
+import multiprocessing as mp
+import re
+import shutil as sh
+import sys
+import subprocess
+
+import yaml
+
+
+__version__ = "3.0.0"
+
+
+def install_snakemake_executor_plugin(args):
+    """
+    If using the 'Snakemake 8' option, the pip plugin
+    'snakemake-executor-plugin-cluster-generic' needs to be installed.
+    """
+
+    if args.infrastructure == "local":
+        subprocess.check_call(
+            [
+                sys.executable,
+                "-m",
+                "pip",
+                "install",
+                "snakemake-executor-plugin-cluster-generic",
+            ]
+        )
+    else:
+        smk8_env = os.environ.copy()
+        smk8_env["PIP_CONFIG_FILE"] = "/software/python/pip.conf"
+        command = [
+            sys.executable,
+            "-m",
+            "pip",
+            "install",
+            "snakemake-executor-plugin-cluster-generic",
+        ]
+        subprocess.check_call(command, env=smk8_env)
+    return None
+
+
+def parse_args():
+    """
+    Collection of the various options of the 'set_profile.py' script.
+    """
+
+    parser = argp.ArgumentParser(add_help=True)
+
+    parser.add_argument(
+        "--version", "-v", action="version", version=f"%(prog)s v{__version__}"
+    )
+
+    parser.add_argument(
+        "--infrastructure",
+        "-i",
+        default="local",
+        choices=["local", "hilbert"],
+        dest="infrastructure",
+        help="Specify execution infrastructure: local [laptop] / hilbert",
+    )
+
+    parser.add_argument(
+        "--resource-preset",
+        "-r",
+        default=None,
+        dest="preset",
+        help="Specify resource preset for cluster infrastructures. "
+        "This option is ignored for local execution profiles. "
+        "For cluster profiles, state the path to the respective "
+        "YAML file under '<infrastructure>/resource_presets' that you want to use.",
+    )
+
+    parser.add_argument(
+        "--placeholders",
+        "-p",
+        default=[],
+        nargs="*",
+        dest="placeholders",
+        help="Specify placeholder replacements as space-separated list of VALUES. "
+        "Currently supported for cluster infrastructure: "
+        "VALUE-1 => project (qsub -A parameter) ; "
+        "VALUE-2 => anchor (qsub -l anchor= parameter)",
+    )
+
+    parser.add_argument(
+        "--snakemake-work-dir",
+        "-w",
+        type=lambda x: pl.Path(x).resolve(strict=False),
+        required=True,
+        dest="smk_work_dir",
+        help="Path to Snakemake (pipeline) working directory. Will be created "
+        "if it does not exist. Mandatory argument.",
+    )
+
+    parser.add_argument(
+        "--profile-suffix",
+        "-s",
+        type=str,
+        default="",
+        dest="suffix",
+        help="Append this suffix to the profile folder created in the Snakemake "
+        "working directory. Examples: (no suffix) wd/prf_PROJECT ; "
+        "(with suffix) wd/prf_PROJECT_suffix",
+    )
+
+    parser.add_argument(
+        "--snakemake_version_8",
+        "-smk8",
+        action="store_true",
+        default=False,
+        dest="smk_version",
+        help="Some options/commands have been changed or deprecated with "
+        "Snakemake 8. Activate this option to generate a profile with the "
+        "settings adjusted to Snakemake 8. By default, the profile targets "
+        "Snakemake 7.",
+    )
+
+    args = parser.parse_args()
+
+    if args.infrastructure != "local" and args.preset is None:
+        raise ValueError("You need to specify a resource preset for cluster profiles!")
+    return args
+
+
+def pprint_cluster_config(config_string):
+    """
+    Convenience only - improve readability
+    of cluster config string in dumped
+    YAML
+    """
+
+    prettified_string = ">-\n   "
+    for component in config_string.split():
+        if component.startswith("+"):
+            prettified_string += " " + component
+        elif component.startswith("-"):
+            prettified_string += "\n   " + component
+        else:
+            prettified_string += " " + component
+    return prettified_string
+
+
+def replace_placeholders(placeholders, config_dump):
+    """
+    Replace the placeholder tokens in the dumped profile config with the
+    user-supplied values (VALUE-1 => project (qsub -A parameter) and
+    VALUE-2 => anchor (qsub -l anchor= parameter))
+    """
+
+    for pname, pvalue in placeholders.items():
+        pattern = f"<{pname}>"
+        if re.search(pattern, config_dump) is not None:
+            sys.stdout.write(f"\nReplacing placeholder {pname} with value {pvalue}\n")
+            config_dump = config_dump.replace(pattern, pvalue)
+
+    # any token of the form <name> still left in the dump
+    # was not given a replacement value on the command line
+    missing_placeholders = re.search(r"<[a-z]+>", config_dump)
+    if missing_placeholders is not None:
+        missing = missing_placeholders.group(0)
+        raise ValueError(
+            "Error: placeholder not replaced by value. You need "
+            "to specify all concrete values via the '--placeholders' "
+            f"command line option: {missing}"
+        )
+    return config_dump
+
+
+def load_yaml(file_path):
+    """
+    Load a YAML file (base profile or resource preset)
+    """
+
+    with open(file_path, "rb") as yaml_file:
+        content = yaml.load(yaml_file, Loader=yaml.SafeLoader)
+    return content
+
+
+def load_base_profile(profile_root, smk8):
+    """
+    Load the base profile YAML file, depending on the Snakemake version
+    """
+
+    if not smk8:
+        base_profile = profile_root.joinpath("base.yaml").resolve(strict=True)
+        config = load_yaml(base_profile)
+    else:
+        base_profile = profile_root.joinpath("base_smk8.yaml").resolve(strict=True)
+        config = load_yaml(base_profile)
+    return config
+
+
+def prepare_local_profile(profile_root, smk8):
+    """
+    Prepare the local profile config.yaml
+    """
+
+    local_cpus = mp.cpu_count()
+    config = load_base_profile(profile_root, smk8)
+    config["cores"] = local_cpus
+    config_dump = yaml.dump(config)
+    return config_dump
+
+
+def prepare_cluster_profile(profile_root, smk8, rsrc_preset, placeholders):
+    """
+    Prepare the cluster profile config.yaml
+    """
+
+    copy_files = list(profile_root.joinpath("cluster_utils").glob("*"))
+    assert len(copy_files) == 3
+    # add environment config file
+    # that only exists for cluster environments
+    copy_files.append(profile_root.joinpath("env.yaml").resolve(strict=True))
+    assert len(copy_files) == 4
+    config = load_base_profile(profile_root, smk8)
+    preset = load_yaml(rsrc_preset)
+
+    for rsrc_key, rsrc_value in preset.items():
+        if rsrc_key not in config:
+            config[rsrc_key] = rsrc_value
+        else:
+            base_values = config[rsrc_key]
+            if isinstance(base_values, str):
+                assert isinstance(rsrc_value, str)
+                updated_values = base_values + " " + rsrc_value
+            elif isinstance(base_values, list):
+                assert isinstance(rsrc_value, list)
+                updated_values = base_values + rsrc_value
+            else:
+                raise ValueError(
+                    f"Cannot handle resource value: {rsrc_value} / {type(rsrc_value)}"
+                )
+            config[rsrc_key] = updated_values
+    # next: pretty print cluster config entry
+    # (Snakemake 7 profiles use the key 'cluster',
+    # Snakemake 8 profiles use 'cluster-generic-submit-cmd')
+    submit_key = "cluster-generic-submit-cmd" if smk8 else "cluster"
+    cluster_config_value = config[submit_key]
+    # temporary sentinel token, replaced below with the pretty-printed command
+    config[submit_key] = "<cluster_cmd>"
+    cluster_config_value = pprint_cluster_config(cluster_config_value)
+    config_dump = yaml.dump(config)
+    config_dump = config_dump.replace("<cluster_cmd>", cluster_config_value)
+
+    # replace placeholders with actual user-supplied values
+    config_dump = replace_placeholders(placeholders, config_dump)
+    return config_dump, copy_files
+
+
+def main():
+    """
+    Main function of the 'set_profile.py' script.
+    """
+
+    args = parse_args()
+    smk8 = args.smk_version
+
+    if smk8:
+        install_snakemake_executor_plugin(args)
+
+    profiles_dir = pl.Path(__file__).parent.joinpath("profiles").resolve(strict=True)
+    profile_root = profiles_dir.joinpath(args.infrastructure).resolve(strict=True)
+
+    known_placeholders = ["project", "anchor"]
+    placeholders = dict((k, v) for k, v in zip(known_placeholders, args.placeholders))
+
+    if args.infrastructure == "local":
+        profile_cfg = prepare_local_profile(profile_root, smk8)
+        copy_files = []
+        placeholders = {"project": "local"}
+    elif args.infrastructure in ["hilbert"]:
+        profile_cfg, copy_files = prepare_cluster_profile(
+            profile_root, smk8, args.preset, placeholders
+        )
+    else:
+        raise ValueError(f"Unknown execution infrastructure: {args.infrastructure}")
+
+    if args.suffix:
+        # assert should mainly catch quirky typos
+        assert args.suffix.isidentifier()
+        profile_dir_name = f"prf_{placeholders['project']}_{args.suffix}"
+        profile_dir = args.smk_work_dir.joinpath(profile_dir_name).resolve(strict=False)
+    else:
+        profile_dir_name = f"prf_{placeholders['project']}"
+        profile_dir = args.smk_work_dir.joinpath(profile_dir_name).resolve(strict=False)
+
+    profile_dir.mkdir(parents=True, exist_ok=True)
+    for utils_file in copy_files:
+        _ = sh.copy(utils_file, profile_dir)
+
+    with open(profile_dir.joinpath("config.yaml"), "w", encoding="ascii") as dump:
+        _ = dump.write(profile_cfg)
+
+    return 0
+
+
+if __name__ == "__main__":
+    main()
diff --git a/cubi-tools/prototypes/testworkflow/Snakefile b/cubi-tools/prototypes/testworkflow/Snakefile
new file mode 100644
index 0000000..4c162c0
--- /dev/null
+++ b/cubi-tools/prototypes/testworkflow/Snakefile
@@ -0,0 +1,75 @@
+
+IMAGE_PATH="/gpfs/project/projects/medbioinf/container/"
+
+localrules: all
+
+rule all:
+    input:
+        "results/env.txt",
+        "results/hpcmodule_env.txt",
+        "results/singularity_env.txt",
+        "results/singularity_and_hpcmodule_env.txt"
+    resources:
+        walltime = "00:00:30",
+
+rule env:
+    output:
+        "results/env.txt"
+    resources:
+        walltime = "00:01:00",
+    shell:
+        """
+        env > {output}
+        """
+
+rule hpcmodule_env:
+    output:
+        "results/hpcmodule_env.txt"
+    resources:
+        walltime = "00:05:00",
+    envmodules:
+        "Singularity"
+    shell:
+        """
+        env > {output}
+        """
+
+
+rule singularity_env:
+    """
+    Not explicit in Snakemake documentation [v6.13.1],
+    but Singularity container URIs can be specified
+    as local files (absolute path!), and Snakemake
+    won't complain.
+    """
+    output:
+        "results/singularity_env.txt"
+    resources:
+        walltime = "00:05:00",
+    singularity: IMAGE_PATH + "base.sif"
+    shell:
+        """
+        env > {output}
+        """
+
+
+rule singularity_and_hpcmodule_env:
+    """
+    Since Singularity is not by default available
+    in $PATH on HILBERT, and Snakemake does not support
+    activating envmodules before running a container
+    (see "shell::__new__()" in snakemake/shell.py),
+    the below fails because Singularity is not executable.
+    However, even if Singularity is available, the combination
+    of "--use-envmodules" and "--use-singularity" causes
+    a failure because "module purge && module load" are prepended
+    to the shell command, which are unknown in the container.
+ """ + output: + "results/singularity_and_hpcmodule_env.txt" + resources: + walltime = "00:05:00", + envmodules: "Singularity" + singularity: IMAGE_PATH + "base.sif" + shell: + "env > {output}" diff --git a/cubi-tools/prototypes/testworkflow/base.def b/cubi-tools/prototypes/testworkflow/base.def new file mode 100644 index 0000000..774886f --- /dev/null +++ b/cubi-tools/prototypes/testworkflow/base.def @@ -0,0 +1,10 @@ +Bootstrap: library +From: alpine:3.14 + +%post + apk --no-cache update + apk --no-cache add bash bash-completion + mkdir /payload + +%environment + export LC_ALL=C diff --git a/cubi-tools/prototypes/update_metadata.py b/cubi-tools/prototypes/update_metadata.py deleted file mode 100755 index 76daa42..0000000 --- a/cubi-tools/prototypes/update_metadata.py +++ /dev/null @@ -1,565 +0,0 @@ -#!/usr/bin/env python3 - -import pathlib -import sys -import subprocess as sp -import argparse as argp -import shutil -import hashlib -import toml - - -def main(): - """ - Main function of the 'update-metadata.py' script. - """ - - args = parse_command_line() - dryrun = args.dryrun - - # Is it a dry run? - if dryrun: - print("\nTHIS IS A DRY RUN!!") - - # check if project directory exist: - project_dir = pathlib.Path(args.project_dir).resolve() - if not project_dir.is_dir(): - raise FileNotFoundError(f"The project directory {project_dir} does not exist.") - print(f"Project directory set as: {project_dir}") - - ref_repo = args.ref_repo - source = args.source - external = args.external - - # Since using the online github repo directly to update the local metadata files - # is resulting in hitting an API rate limit fairly quickly a local copy is needed. - # The location of the template_metadata folder holding branch/version tag - # of interest is on the same level as the project directory - metadata_dir = pathlib.Path( - pathlib.Path(f"{project_dir}").resolve().parents[0], - "template-metadata-files", - ).resolve() - - # detect if its a external workflow - if external: - workflow_dir = external_repo(project_dir, external, dryrun) - print(f"\nExternal set as: {external}\n") - print( - "Metadata files will be updated in: " - f"{external_repo(project_dir, external, dryrun)}\n" - ) - else: - workflow_dir = external_repo(project_dir, external, dryrun) - print( - "\nMetadata files will be updated in: " - f"{external_repo(project_dir, external, dryrun)}\n" - ) - - # Clone the 'template-metadata-files' branch or version tag - # into a local folder if it exists - clone(workflow_dir, project_dir, ref_repo, source, metadata_dir, dryrun) - - # files that will be updated - files_to_update = [ - "CITATION.md", - "LICENSE", - ".editorconfig", - "pyproject.toml", - ] - - # Updating routine of the metadata files - for f in files_to_update: - if f == "pyproject.toml": - update_pyproject_toml(workflow_dir, metadata_dir, source, dryrun) - else: - print( - f"Comparing if local '{f}' differs from version in branch/version tag " - f"'{source}' in the 'template-metadata-files' repo" - ) - update_file(f, workflow_dir, metadata_dir, dryrun) - - # For 'git pull' you have to be in a branch of the template-metadata-files repo to - # merge with. If you previously chose a version tag to update from, 'git pull' will - # throw a waning message. 
This part will reset the repo to the main branch to avoid - # any warning message stemming from 'git pull' - command_reset = ["git", "checkout", "main", "-q"] - sp.run( - command_reset, - cwd=metadata_dir, - check=False, - ) - - print("\nUPDATE COMPLETED!") - - return None - - -def parse_command_line(): - """ - TODO: the 'project-dir' parameter is misleading; CUBI workflows - can also be updated with this script (or any other repo w/ metadata) - TODO: the 'source' parameter is misleading / the name too generic, - this should be something like "branch or tag", which is the intended meaning - - Collection of the various options of the 'update-metadata.py' script. - """ - parser = argp.ArgumentParser( - description="Add or update metadata files for your repository. " - "Example: python3 add-update-metadata.py --project-dir path/to/repo" - ) - parser.add_argument( - "--project-dir", - "-p", - type=pathlib.Path, - help="(Mandatory) Directory where metadata should be copied/updated.", - required=True, - ) - - DEFAULT_REF_REPO = ( - "https://github.com/core-unit-bioinformatics/template-metadata-files.git" - ) - parser.add_argument( - "--ref-repo", - type=str, - nargs="?", - default=DEFAULT_REF_REPO, - help=f"Reference/remote repository used to clone files. Default: {DEFAULT_REF_REPO}", - ) - parser.add_argument( - "--external", - "-e", - action="store_true", - default=False, - dest="external", - help="If False (default), metadata files are copied to the project_dir, " - "else to a subfolder (cubi). Default: False", - ) - parser.add_argument( - "--source", - "-s", - type=str, - nargs="?", - default="main", - help="Branch or version tag from which to update the files. Default: main", - ) - parser.add_argument( - "--dry-run", - "--dryrun", - "-d", - "-dry", - action="store_true", - default=False, - dest="dryrun", - help="Just report actions but do not execute them. Default: False", - ) - parser.add_argument( - "--version", - "-v", - action="version", - version=report_script_version(), - help="Displays version of this script.", - ) - # if no arguments are given, print help - if len(sys.argv) == 1: - parser.print_help() - parser.exit() # default is code 0 - args = parser.parse_args() - return args - - -def clone(workflow_dir, project_dir, ref_repo, source, metadata_dir, dryrun): - """ - Check if the 'template-metadata-files' repo is already parallel to the - project directory. If the 'template-metadata-files' repo exists this folder is - getting updated via a 'git pull --all' command. If that is not the case, - the 'template-metadata-files' repo will be cloned parallel to the project - directory unless the branch or version tag don't exist, - then an AssertionError will be called to stop the script. - """ - if dryrun: - if not metadata_dir.is_dir(): - raise NameError( - "The 'template-metadata-files' repo needs to be present in the " - f"parental folder of the project directory {project_dir}.\n" - "In a live run the 'template-metadata-files' repo would " - f"be created at {metadata_dir}.\n" - ) - else: - print( - "The requested branch/version tag (default: main) is present " - "and is getting updated via 'git pull -all' .\n" - ) - # TODO: full code duplication with section below - refactor! 
- command = [ - "git", - "pull", - "--all", - "-q", - ] - sp.run( - command, - cwd=metadata_dir, - check=False, - ) - command_checkout = ["git", "checkout", "".join({source}), "-q"] - checkout_cmd = sp.run( - command_checkout, - cwd=metadata_dir, - stderr=sp.PIPE, - check=False, - ) - # If the 'template-metadata-files' folder is not a Git repo - # an error message that contains the string 'fatal:' will be thrown - warning = "fatal:" - assert warning not in str(checkout_cmd.stderr.strip()), ( - "The folder 'template-metadata-files' is not a git repository! " - "For this script to work either delete the folder or move it!!" - ) - # If you try to clone a repo/branch/tag that doesn't exist - # Git will throw an error message that contains the string 'error:' - error = "error:" - assert error not in str( - checkout_cmd.stderr.strip() - ), f"The branch or version tag named '{source}' doesn't exist" - else: - if metadata_dir.is_dir(): - # TODO: see above - code dupliaction - command = [ - "git", - "pull", - "--all", - "-q", - ] - sp.run( - command, - cwd=metadata_dir, - check=False, - ) - command_checkout = ["git", "checkout", "".join({source}), "-q"] - checkout_cmd = sp.run( - command_checkout, - cwd=metadata_dir, - stderr=sp.PIPE, - check=False, - ) - # If the 'template-metadata-files' folder is not a Git repo - # an error message that contains the string 'fatal:' will be thrown - warning = "fatal:" - assert warning not in str(checkout_cmd.stderr.strip()), ( - "The folder 'template-metadata-files' is not a git repository! " - "For this script to work either delete the folder or move it!!" - ) - # If you try to clone a repo/branch/tag that doesn't exist - # Git will throw an error message that contains the string 'error:' - error = "error:" - assert error not in str( - checkout_cmd.stderr.strip() - ), f"The branch or version tag named '{source}' doesn't exist" - else: - command = [ - "git", - "clone", - "-q", - "-c advice.detachedHead=false", - ref_repo, - metadata_dir, - ] - clone_cmd = sp.run( - command, - stdout=sp.PIPE, - stderr=sp.PIPE, - cwd=workflow_dir, - check=False, - ) - # If the 'template-metadata-files' folder is not a Git repo - # an error message that contains the string 'fatal:' will be thrown - warning = "fatal:" - assert warning not in str(clone_cmd.stderr.strip()), ( - "The repository you entered or the branch or version tag " - f"named '{source}' doesn't exist" - ) - command_checkout = ["git", "checkout", "".join({source}), "-q"] - sp.run( - command_checkout, - cwd=metadata_dir, - check=False, - ) - return None - - -def get_local_checksum(workflow_dir, f): - """ - TODO: should be refactored, is quasi-identical to 'get_ref_checksum' - TODO: single-letter variable name is strongly discouraged - - The MD5 checksum for all metadata files in the - local project directory is determined. - """ - if workflow_dir.joinpath(f).is_file(): - with open(workflow_dir.joinpath(f), "rb") as local_file: - # read contents of the file - local_data = local_file.read() - # pipe contents of the file through - md5_local = hashlib.md5(local_data).hexdigest() - else: - md5_local = "" - return md5_local - - -def get_ref_checksum(metadata_dir, f): - """ - TODO: see TODOs in 'get_local_checksum' - - The MD5 checksum for all metadata files in the temp folder - for the desired branch or version tag is determined. 
- """ - with open(metadata_dir.joinpath(f), "rb") as ref_file: - # read contents of the file - ref_data = ref_file.read() - # pipe contents of the file through - md5_ref = hashlib.md5(ref_data).hexdigest() - return md5_ref - - -def update_file(f, workflow_dir, metadata_dir, dryrun): - """ - The MD5 checksum of the the local metadata file(s) and the metadata - file(s) in the desired branch or version tag are being compared. - If they differ a question to update for each different - metadata file pops up. If an update is requested it will be performed. - """ - if dryrun: - # TODO: full code duplication w/ section below - refactor! - local_sum = get_local_checksum(workflow_dir, f) - ref_sum = get_ref_checksum(metadata_dir, f) - if local_sum != ref_sum: - print(f"The versions of '{f}' differ!") - print(f"Local MD5 checksum: {local_sum}") - print(f"Remote MD5 checksum: {ref_sum}") - print(f"Update '{f}'(y/n)? y") - print(f"Dry run! '{f}' would be updated!") - else: - print(f"Dry run! '{f}' is up-to-date!") - else: - # TODO: see above - code dup - local_sum = get_local_checksum(workflow_dir, f) - ref_sum = get_ref_checksum(metadata_dir, f) - if local_sum != ref_sum: - print(f"The versions of '{f}' differ!") - print(f"Local MD5 checksum: {local_sum}") - print(f"Remote MD5 checksum: {ref_sum}") - answer_is_pos = user_response(f"Update '{f}'") - - if answer_is_pos: - shutil.copyfile(metadata_dir.joinpath(f), workflow_dir.joinpath(f)) - print(f"'{f}' was updated!") - else: - print(f"'{f}' was NOT updated!") - else: - print(f"'{f}' is up-to-date!") - return None - - -def update_pyproject_toml(workflow_dir, metadata_dir, source, dryrun): - """ - The 'pyproject.toml' is treated a little bit differently. First, there is - a check if the file even exists in the project directory. If that is not the - case it will be copied into that folder from the desired branch or version tag. - If the file is present it will be checked if the cubi.metadata.version - (and only that information!) differs between the local and the requested branch - or version tag version. If that is the case the cubi.metadata.version - is getting updated. - """ - if dryrun: - if not workflow_dir.joinpath("pyproject.toml").is_file(): - print( - "\nThere is no 'pyproject.toml' in your folder. " - "Do you want to add 'pyproject.toml'(y/n)? y" - "\nDry run! 'pyproject.toml' would have been added!" - ) - else: - # TODO: clear code duplication with section below - # TODO: logical swap of source/target between function call and return - comparison = get_metadata_versions(workflow_dir, metadata_dir) - # Just to clearly state which information/files are generated by the - # function 'get_metadata_versions(workflow_dir, metadata_dir)': - metadata_version = comparison[0] - workflow_version = comparison[1] - new_pyproject_toml = comparison[2] - - if metadata_version != workflow_version: - print( - "\nYou updated your local repo with the 'template-metadata-files' " - f"in branch/version tag '{source}'." - "\nDo you want to update the metadata files version in " - "'pyproject.toml'(y/n)? y" - ) - print( - "Dry run!\n" - "Metadata version in 'pyproject.toml' would have been updated from " - f"version '{workflow_version}' to version '{metadata_version}'!" - ) - else: - print( - "\nDry run! Metadata version in 'pyproject.toml' is up-to-date!\n" - ) - else: - if not workflow_dir.joinpath("pyproject.toml").is_file(): - answer_is_pos = user_response( - "There is no 'pyproject.toml' in your folder. 
Add 'pyproject.toml'" - ) - - if answer_is_pos: - shutil.copyfile( - metadata_dir.joinpath("pyproject.toml"), - workflow_dir.joinpath("pyproject.toml"), - ) - print("'pyproject.toml' was added!") - else: - print("'pyproject.toml' was NOT added!") - - else: - # TODO: see above TODOs - comparison = get_metadata_versions(workflow_dir, metadata_dir) - # Just to clearly state which information/files are generated by the - # function 'get_metadata_versions(workflow_dir, metadata_dir)': - metadata_version = comparison[0] - workflow_version = comparison[1] - new_pyproject_toml = comparison[2] - - if metadata_version != workflow_version: - answer_is_pos = user_response( - "\nYou updated your local repo with the 'template-metadata-files' " - f"in branch/version tag '{source}'." - "\nDo you want to update the metadata files version in " - "'pyproject.toml'" - ) - - if answer_is_pos: - with open( - pathlib.Path(workflow_dir, "pyproject.toml"), - "w", - encoding="utf-8", - ) as text_file: - text_file.write(toml.dumps(new_pyproject_toml, encoder=None)) - print( - f"Metadata version in 'pyproject.toml' was updated from version" - f" '{workflow_version}' to version '{metadata_version}'!" - ) - else: - print( - "'pyproject.toml' was NOT updated from version " - f"'{workflow_version}' to version '{metadata_version}'!" - ) - else: - print("\nMetadata version in 'pyproject.toml' is up-to-date!\n") - return None - - -def user_response(question, attempt=0): - """ - Function to evaluate the user response to the Yes or No question refarding updating - the metadata files. - """ - attempt += 1 - prompt = f"{question}? (y/n): " - answer = input(prompt).strip().lower() - pos = ["yes", "y", "yay"] - neg = ["no", "n", "nay"] - if attempt >= 3: - raise RuntimeError( - "You failed at least 3 times to answer a simple (y/n) question!" - ) - - if not (answer in pos or answer in neg): - print(f"That was a yes or no question, but you answered: {answer}") - return user_response(question, attempt) - return answer in pos - - -def get_metadata_versions(metadata_target, metadata_source): - """Read the metadata version strings in the respective - pyproject.toml files from the metadata source and target - directories. - - Args: - metadata_target (pathlib.Path): - The folder being processed / receiving the metadata update - metadata_source (pathlib.Path): - The source folder of the update process, i.e., that should - almost always refer to 'template-metadata-files' - - Returns: - str: Metadata version of the source - str: Metadata version of the target - dict: Target pyproject toml w/ updated metadata version - """ - # loading the pyproject.tomls: - source_pyproject = toml.load( - pathlib.Path(metadata_source, "pyproject.toml"), _dict=dict - ) - target_pyproject = toml.load( - pathlib.Path(metadata_target, "pyproject.toml"), _dict=dict - ) - # extracting the metadata versions: - source_version = source_pyproject["cubi"]["metadata"]["version"] - target_version = target_pyproject["cubi"]["metadata"]["version"] - # updating the metadata version in the workflow pyproject with the metadata version - # from the template-metadata-files 'source' pyproject: - target_pyproject["cubi"]["metadata"]["version"] = source_version - - return source_version, target_version, target_pyproject - - -def external_repo(project_dir, external, dryrun): - """ - TODO: this function (name) is not informative - TODO: the variable names are confusing; workflow and project exist - both in the CUBI context and are conceptually different. 
Here, it - looks like they are used interchangeably - - Function to create a cubi folder where the CUBI metadata files will be - copied/updated if the user stated that the project is from external. - """ - if dryrun: - if external: - workflow_dir = pathlib.Path(project_dir, "cubi") - else: - workflow_dir = project_dir - else: - if external: - workflow_dir = pathlib.Path(project_dir, "cubi") - workflow_dir.mkdir(parents=True, exist_ok=True) - else: - workflow_dir = project_dir - return workflow_dir - - -def find_cubi_tools_top_level(): - """Find the top-level folder of the cubi-tools - repository (starting from this script path). - """ - script_path = pathlib.Path(__file__).resolve(strict=True) - script_folder = script_path.parent - - cmd = ["git", "rev-parse", "--show-toplevel"] - repo_path = sp.check_output(cmd, cwd=script_folder).decode("utf-8").strip() - repo_path = pathlib.Path(repo_path) - return repo_path - - -def report_script_version(): - """ - Read out of the cubi-tools script version out of the 'pyproject.toml'. - """ - cubi_tools_repo = find_cubi_tools_top_level() - - toml_file = cubi_tools_repo.joinpath("pyproject.toml").resolve(strict=True) - - toml_file = toml.load(toml_file, _dict=dict) - version = toml_file["cubi"]["tools"]["script"][0]["version"] - return version - - -if __name__ == "__main__": - main()