diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
index 67b3a0197..4f059a6a5 100644
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -30,7 +30,7 @@ jobs:
       timeout-minutes: 20
       run: |
         PYTHONPATH=$PWD/gnomad_methods:$PWD/seqr-loading-pipelines \
-        coverage run -m pytest test --junitxml=test-execution.xml
+        coverage run -m pytest -n auto test --junitxml=test-execution.xml
         rc=$?
         coverage xml
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 4ecf9c1bd..183dda6f2 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -13,7 +13,7 @@ repos:
       - id: detect-private-key
       - id: debug-statements
       - id: check-added-large-files
-        exclude: '\.*.interval_list'
+        exclude: '\.*.interval_list|test/data/large_cohort/compressed_dirs/.*'
 
   - repo: https://github.com/populationgenomics/pre-commits
     rev: "v0.1.3"
diff --git a/cpg_workflows/jobs/vep.py b/cpg_workflows/jobs/vep.py
index ac1c8abf9..4cfc6115c 100644
--- a/cpg_workflows/jobs/vep.py
+++ b/cpg_workflows/jobs/vep.py
@@ -144,7 +144,6 @@ def vep_one(
     vep_image = image_path(f'vep_{vep_version}')
     vep_mount_path = to_path(reference_path(f'vep_{vep_version}_mount'))
     assert all([vep_image, vep_mount_path])
-    logging.info(f'Using VEP {vep_version}')
 
     j = b.new_job('VEP', (job_attrs or {}) | dict(tool=f'VEP {vep_version}'))
     j.image(vep_image)
diff --git a/cpg_workflows/large_cohort/dense_subset.py b/cpg_workflows/large_cohort/dense_subset.py
index 9fce78115..85f6a7555 100644
--- a/cpg_workflows/large_cohort/dense_subset.py
+++ b/cpg_workflows/large_cohort/dense_subset.py
@@ -7,10 +7,7 @@
 from cpg_workflows.utils import can_reuse
 
 
-def run(
-    vds_path: str,
-    out_dense_mt_path: str,
-) -> hl.MatrixTable:
+def run(vds_path: str, out_dense_mt_path: str) -> hl.MatrixTable:
     """
     Filter a sparse VariantDataset to a set of predetermined QC sites
     and return a dense MatrixTable with split multiallelics.
diff --git a/requirements-dev.txt b/requirements-dev.txt
index b60b99953..9bd5663bf 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -2,8 +2,9 @@ pre-commit
 pylint
 bump2version
 black
-pytest
+pytest>8
 pytest_mock
+pytest-xdist>=3.6.0
 mypy
 cpg-utils>=5.0.11
 tenacity
diff --git a/setup.py b/setup.py
index 177a24037..8033ea138 100644
--- a/setup.py
+++ b/setup.py
@@ -13,7 +13,7 @@
     license='MIT',
     packages=find_packages(),
     install_requires=[
-        'cpg-utils>=5.0.11',
+        'cpg-utils>=5.1.1',
        'cyvcf2==0.30.18',
         'analysis-runner>=2.43.3',
         'hail==0.2.132',  # Pin Hail at CPG's installed version
@@ -40,6 +40,7 @@
     extras_require={
         'test': [
             'pytest',
+            'pytest-xdist',
             'pytest-mock',
             'coverage',
         ],
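Note on the CI change above (context, not part of the patch): `-n auto` hands the collected tests to pytest-xdist, which spawns one worker process per detected CPU and distributes tests across them; the `pytest>8` / `pytest-xdist>=3.6.0` pins in requirements-dev.txt and the `pytest-xdist` entry in setup.py's `test` extra supply that plugin. One caveat worth checking: a bare `coverage run` measures the controller process, so worker coverage may need subprocess support (e.g. pytest-cov or `COVERAGE_PROCESS_START`). A rough local equivalent of the CI step, sketched under the assumption that the suite lives under `test/`:

```python
# Sketch only: mirror the CI invocation on a developer machine.
# Assumes pytest-xdist is installed; '-n auto' sizes the worker pool
# from the detected CPU count.
import subprocess
import sys


def run_suite_parallel() -> int:
    """Run the test suite across all cores and return pytest's exit code."""
    proc = subprocess.run(
        [sys.executable, '-m', 'pytest', '-n', 'auto', 'test'],
        check=False,
    )
    return proc.returncode


if __name__ == '__main__':
    raise SystemExit(run_suite_parallel())
```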
diff --git a/test/conftest.py b/test/conftest.py
index ecd892d4f..8fcaaac7c 100644
--- a/test/conftest.py
+++ b/test/conftest.py
@@ -10,10 +10,19 @@
 import cpg_workflows.metamist
 import cpg_workflows.stages.gatk_sv
 import cpg_workflows.workflow
+from cpg_utils.hail_batch import start_query_context
+
+
+@pytest.fixture(autouse=True, scope='session')
+def start_up_query():
+    """Start up the query backend once per session."""
+    start_query_context()
+    yield
 
 
 @pytest.fixture(autouse=True, scope='function')
 def pre_and_post_test():
+    # Set a dummy google cloud project to avoid errors when running tests
     # that use the google cloud.
     with mock.patch.dict(
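An implication of the new session-scoped fixture (an observation, not something the patch states): under pytest-xdist each worker is a separate process with its own pytest session, so `start_up_query` runs once per worker, not once per run. A minimal sketch demonstrating the per-worker behaviour, using the `PYTEST_XDIST_WORKER` environment variable that pytest-xdist sets in each worker:

```python
# Sketch: a session-scoped autouse fixture fires once per xdist worker
# process. Run with '-n 2' and the banner prints twice (gw0, gw1).
import os

import pytest


@pytest.fixture(autouse=True, scope='session')
def session_banner():
    worker = os.environ.get('PYTEST_XDIST_WORKER', 'main')
    print(f'session starting on worker {worker}')
    yield
```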
diff --git a/test/data/large_cohort/compressed_dirs/ancestry_sample_qc.ht.zip b/test/data/large_cohort/compressed_dirs/ancestry_sample_qc.ht.zip
new file mode 100644
index 000000000..d5b542f3d
Binary files /dev/null and b/test/data/large_cohort/compressed_dirs/ancestry_sample_qc.ht.zip differ
diff --git a/test/data/large_cohort/compressed_dirs/dense.mt.zip b/test/data/large_cohort/compressed_dirs/dense.mt.zip
new file mode 100644
index 000000000..9e4251c11
Binary files /dev/null and b/test/data/large_cohort/compressed_dirs/dense.mt.zip differ
diff --git a/test/data/large_cohort/compressed_dirs/eigenvalues.ht.zip b/test/data/large_cohort/compressed_dirs/eigenvalues.ht.zip
new file mode 100644
index 000000000..06bbd9fd6
Binary files /dev/null and b/test/data/large_cohort/compressed_dirs/eigenvalues.ht.zip differ
diff --git a/test/data/large_cohort/compressed_dirs/inferred_pop.ht.zip b/test/data/large_cohort/compressed_dirs/inferred_pop.ht.zip
new file mode 100644
index 000000000..2be6f8903
Binary files /dev/null and b/test/data/large_cohort/compressed_dirs/inferred_pop.ht.zip differ
diff --git a/test/data/large_cohort/compressed_dirs/loadings.ht.zip b/test/data/large_cohort/compressed_dirs/loadings.ht.zip
new file mode 100644
index 000000000..3d9f094a8
Binary files /dev/null and b/test/data/large_cohort/compressed_dirs/loadings.ht.zip differ
diff --git a/test/data/large_cohort/compressed_dirs/relateds_to_drop.ht.zip b/test/data/large_cohort/compressed_dirs/relateds_to_drop.ht.zip
new file mode 100644
index 000000000..770884fe9
Binary files /dev/null and b/test/data/large_cohort/compressed_dirs/relateds_to_drop.ht.zip differ
diff --git a/test/data/large_cohort/compressed_dirs/sample_qc.ht.zip b/test/data/large_cohort/compressed_dirs/sample_qc.ht.zip
new file mode 100644
index 000000000..3e7fcaf57
Binary files /dev/null and b/test/data/large_cohort/compressed_dirs/sample_qc.ht.zip differ
diff --git a/test/data/large_cohort/compressed_dirs/scores.ht.zip b/test/data/large_cohort/compressed_dirs/scores.ht.zip
new file mode 100644
index 000000000..33dd20eb8
Binary files /dev/null and b/test/data/large_cohort/compressed_dirs/scores.ht.zip differ
diff --git a/test/data/large_cohort/compressed_dirs/v01.vds.zip b/test/data/large_cohort/compressed_dirs/v01.vds.zip
new file mode 100644
index 000000000..b2237553a
Binary files /dev/null and b/test/data/large_cohort/compressed_dirs/v01.vds.zip differ
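Hail Tables, MatrixTables and VDSes are directories on disk, which is presumably why these intermediate outputs are committed as zip archives (and exempted from the `check-added-large-files` hook above), then extracted at test time. A hypothetical helper for regenerating one of these fixtures — the function name and paths are illustrative, not part of the patch:

```python
# Hypothetical fixture-builder: archive a Hail table/matrix-table directory
# with entry names relative to its parent, so ZipFile.extractall() recreates
# e.g. 'sample_qc.ht/...' under the extraction target.
from pathlib import Path
from zipfile import ZIP_DEFLATED, ZipFile


def compress_dir(src_dir: Path, out_zip: Path) -> None:
    """Zip src_dir, keeping entry names relative to src_dir's parent."""
    with ZipFile(out_zip, 'w', ZIP_DEFLATED) as zf:
        for item in sorted(src_dir.rglob('*')):
            zf.write(item, str(item.relative_to(src_dir.parent)))


# e.g. compress_dir(Path('sample_qc.ht'),
#                   Path('test/data/large_cohort/compressed_dirs/sample_qc.ht.zip'))
```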
- """ - conf = create_config( - tmp_path=tmp_path, - sequencing_type='genome', - gnomad_prefix=gnomad_prefix, - broad_prefix=broad_prefix, - ) - set_config( - conf, - tmp_path / 'config.toml', - merge_with=[DEFAULT_CONFIG, LARGE_COHORT_CONFIG], - ) - mocker.patch( - 'cpg_workflows.inputs.deprecated_create_cohort', - lambda: _mock_cohort(conf.workflow.dataset), - ) - # skip can_reuse, implicit skip of existence checks - mocker.patch('cpg_workflows.large_cohort.combiner.can_reuse', lambda x: False) - mocker.patch('cpg_workflows.large_cohort.relatedness.can_reuse', lambda x: False) - mocker.patch('cpg_workflows.large_cohort.site_only_vcf.can_reuse', lambda x: False) - - test_cohort: MultiCohort = cpg_workflows.inputs.deprecated_create_cohort() - gvcf_paths: list[str] = [str(sg.gvcf) for sg in test_cohort.get_sequencing_groups()] - - start_query_context() - - res_pref = tmp_path - vds_path = str(res_pref / 'v01.vds') - - # we're passing a specific minority of intervals here, to test that the combiner works on a timely test case - combiner.run( - output_vds_path=vds_path, - sequencing_type=conf['workflow']['sequencing_type'], - tmp_prefix=str(res_pref / 'tmp'), - genome_build=conf['references']['genome_build'], - gvcf_paths=gvcf_paths, - vds_paths=None, - specific_intervals=conf['large_cohort']['combiner']['intervals'], - ) +def test_combiner(mocker: MockFixture, tmp_path: Path): - sample_qc_ht_path = res_pref / 'sample_qc.ht' - sample_qc.run( - vds_path=str(vds_path), - out_sample_qc_ht_path=str(sample_qc_ht_path), - tmp_prefix=os.path.join(res_pref, 'tmp'), - ) + conf = create_config(tmp_path) + set_config( + conf, + tmp_path / 'config.toml', + merge_with=[DEFAULT_CONFIG, LARGE_COHORT_CONFIG], + ) - dense_mt_path = res_pref / 'dense.mt' - dense_subset.run( - vds_path=str(vds_path), - out_dense_mt_path=str(dense_mt_path), - ) + mocker.patch( + 'cpg_workflows.inputs.deprecated_create_cohort', + lambda: _mock_cohort(conf.workflow.dataset), + ) + # skip can_reuse, implicit skip of existence checks + mocker.patch('cpg_workflows.large_cohort.combiner.can_reuse', lambda x: False) - relateds_to_drop_ht_path = res_pref / 'relateds_to_drop.ht' - relatedness.run( - dense_mt_path=dense_mt_path, - sample_qc_ht_path=sample_qc_ht_path, - out_relatedness_ht_path=res_pref / 'relatedness.ht', - out_relateds_to_drop_ht_path=relateds_to_drop_ht_path, - tmp_prefix=res_pref / 'tmp', - ) + test_cohort: MultiCohort = cpg_workflows.inputs.deprecated_create_cohort() + gvcf_paths: list[str] = [str(sg.gvcf) for sg in test_cohort.get_sequencing_groups()] - scores_ht_path = res_pref / 'scores.ht' - eigenvalues_ht_path = res_pref / 'eigenvalues.ht' - loadings_ht_path = res_pref / 'loadings.ht' - inferred_pop_ht_path = res_pref / 'inferred_pop.ht' - ancestry_sample_qc_ht_path = res_pref / 'ancestry_sample_qc.ht' - - ancestry_pca.run( - dense_mt_path=dense_mt_path, - sample_qc_ht_path=sample_qc_ht_path, - relateds_to_drop_ht_path=relateds_to_drop_ht_path, - tmp_prefix=res_pref / 'tmp', - out_scores_ht_path=scores_ht_path, - out_eigenvalues_ht_path=eigenvalues_ht_path, - out_loadings_ht_path=loadings_ht_path, - out_inferred_pop_ht_path=inferred_pop_ht_path, - out_sample_qc_ht_path=ancestry_sample_qc_ht_path, - ) - ancestry_plots.run( - out_path_pattern=res_pref / 'plots' / '{scope}_pc{pci}_{pca_suffix}.{ext}', - sample_qc_ht_path=ancestry_sample_qc_ht_path, - scores_ht_path=scores_ht_path, - eigenvalues_ht_path=eigenvalues_ht_path, - loadings_ht_path=loadings_ht_path, - inferred_pop_ht_path=inferred_pop_ht_path, - 
+
+
+def test_sample_qc(mocker: MockFixture, tmp_path: Path):
+    conf = create_config(tmp_path)
+    set_config(
+        conf,
+        tmp_path / 'config.toml',
+        merge_with=[DEFAULT_CONFIG, LARGE_COHORT_CONFIG],
+    )
+
+    mocker.patch(
+        'cpg_workflows.inputs.deprecated_create_cohort',
+        lambda: _mock_cohort(conf.workflow.dataset),
+    )
+
+    # skip can_reuse, implicit skip of existence checks
+    mocker.patch('cpg_workflows.large_cohort.combiner.can_reuse', lambda x: False)
+
+    # extract the compressed VDS into a job temp location
+    decompress_into_job_tmp(tmp_path, [compressed_vds_path])
+
+    sample_qc_ht_path = tmp_path / 'sample_qc.ht'
+    sample_qc.run(
+        vds_path=str(os_path_join(tmp_path, 'v01.vds')),
+        out_sample_qc_ht_path=str(sample_qc_ht_path),
+        tmp_prefix=os_path_join(tmp_path, 'tmp'),
+    )
+
+    # confirm the expected output was written
+    assert exists_not_cached(sample_qc_ht_path)
+
+
+def test_densify_mt(tmp_path: Path):
+    conf = create_config(tmp_path)
+    set_config(
+        conf,
+        tmp_path / 'config.toml',
+        merge_with=[DEFAULT_CONFIG, LARGE_COHORT_CONFIG],
+    )
+
+    # extract the compressed VDS into a job temp location
+    decompress_into_job_tmp(tmp_path, [compressed_vds_path])
+
+    dense_mt_output_path = tmp_path / 'dense.mt'
+
+    # uses get_config()['references']['ancestry']['sites_table']
+    dense_subset.run(
+        vds_path=str(tmp_path / 'v01.vds'),
+        out_dense_mt_path=str(dense_mt_output_path),
+    )
+
+    # confirm the expected output was written
+    assert exists_not_cached(dense_mt_output_path)
+
+
+def test_relatedness(mocker: MockFixture, tmp_path: Path):
+    conf = create_config(tmp_path)
+    set_config(
+        conf,
+        tmp_path / 'config.toml',
+        merge_with=[DEFAULT_CONFIG, LARGE_COHORT_CONFIG],
+    )
+
+    # skip can_reuse, implicit skip of existence checks
+    mocker.patch('cpg_workflows.large_cohort.relatedness.can_reuse', lambda x: False)
+
+    # decompress the sample QC HT and dense MT
+    decompress_into_job_tmp(tmp_path, [compressed_sample_qc_ht_path, compressed_dense_mt_path])
+
+    relateds_to_drop_ht_path = tmp_path / 'relateds_to_drop.ht'
+    relatedness_ht_path = tmp_path / 'relatedness.ht'
+
+    # uses get_config()['large_cohort']['remove_failed_qc_pca'] and get_config()['large_cohort']['max_kin']
+    relatedness.run(
+        dense_mt_path=tmp_path / 'dense.mt',
+        sample_qc_ht_path=tmp_path / 'sample_qc.ht',
+        out_relatedness_ht_path=relatedness_ht_path,
+        out_relateds_to_drop_ht_path=relateds_to_drop_ht_path,
+        tmp_prefix=tmp_path / 'tmp',
+    )
+
+    # confirm the expected outputs were written
+    assert exists_not_cached(relateds_to_drop_ht_path)
+    assert exists_not_cached(relatedness_ht_path)
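Since the decompressed fixtures are ordinary Hail Tables once extracted, a stage test could also sanity-check its inputs before running. A minimal sketch; the helper name and row threshold are illustrative, not part of the patch:

```python
# Sketch: fail fast if a decompressed .ht fixture is empty or unreadable.
import hail as hl


def check_fixture_table(ht_path: str, min_rows: int = 1) -> None:
    """Read a decompressed Hail Table fixture and assert it is non-trivial."""
    ht = hl.read_table(ht_path)
    assert ht.count() >= min_rows, f'fixture {ht_path} looks empty'
```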
+
+
+def test_site_only(mocker: MockFixture, tmp_path: Path):
+
+    # skip can_reuse, implicit skip of existence checks
+    mocker.patch('cpg_workflows.large_cohort.site_only_vcf.can_reuse', lambda x: False)
+
+    # decompress the sample QC HT, VDS, and relateds to drop HT
+    decompress_into_job_tmp(
+        tmp_path,
+        [compressed_sample_qc_ht_path, compressed_vds_path, compressed_relateds_to_drop_ht_path],
+    )
+
+    siteonly_vcf_path = tmp_path / 'siteonly.vcf.bgz'
+    site_only_vcf.run(
+        vds_path=str(tmp_path / 'v01.vds'),
+        sample_qc_ht_path=str(tmp_path / 'sample_qc.ht'),
+        relateds_to_drop_ht_path=str(tmp_path / 'relateds_to_drop.ht'),
+        out_vcf_path=str(siteonly_vcf_path),
+        tmp_prefix=str(tmp_path / 'tmp'),
+    )
+
+    # confirm the expected output was written
+    assert exists_not_cached(siteonly_vcf_path)
+
+
+def test_ancestry(tmp_path: Path):
+    conf = create_config(tmp_path)
+    set_config(
+        conf,
+        tmp_path / 'config.toml',
+        merge_with=[DEFAULT_CONFIG, LARGE_COHORT_CONFIG],
+    )
+
+    scores_ht_path = tmp_path / 'scores.ht'
+    eigenvalues_ht_path = tmp_path / 'eigenvalues.ht'
+    loadings_ht_path = tmp_path / 'loadings.ht'
+    inferred_pop_ht_path = tmp_path / 'inferred_pop.ht'
+    ancestry_sample_qc_ht_path = tmp_path / 'ancestry_sample_qc.ht'
+
+    # decompress the dense MT, sample QC HT, and relateds to drop HT
+    decompress_into_job_tmp(
+        tmp_path,
+        [
+            compressed_dense_mt_path,
+            compressed_sample_qc_ht_path,
+            compressed_relateds_to_drop_ht_path,
+        ],
+    )
+
+    # uses get_config()['large_cohort']['min_pop_prob'] & get_config()['large_cohort'].get('pca_samples_to_remove', [])
+    ancestry_pca.run(
+        dense_mt_path=tmp_path / 'dense.mt',
+        sample_qc_ht_path=tmp_path / 'sample_qc.ht',
+        relateds_to_drop_ht_path=tmp_path / 'relateds_to_drop.ht',
+        tmp_prefix=tmp_path / 'tmp',
+        out_scores_ht_path=scores_ht_path,
+        out_eigenvalues_ht_path=eigenvalues_ht_path,
+        out_loadings_ht_path=loadings_ht_path,
+        out_inferred_pop_ht_path=inferred_pop_ht_path,
+        out_sample_qc_ht_path=ancestry_sample_qc_ht_path,
+    )
+
+    # confirm the expected outputs were written
+    for output in [
+        scores_ht_path,
+        eigenvalues_ht_path,
+        loadings_ht_path,
+        inferred_pop_ht_path,
+        ancestry_sample_qc_ht_path,
+    ]:
+        assert exists_not_cached(output)
+
+
+def test_ancestry_plots(tmp_path: Path):
+    conf = create_config(tmp_path)
+    set_config(
+        conf,
+        tmp_path / 'config.toml',
+        merge_with=[DEFAULT_CONFIG, LARGE_COHORT_CONFIG],
+    )
+
+    # decompress all the input tables for plotting ancestry
+    decompress_into_job_tmp(
+        tmp_path,
+        [
+            compressed_ancestry_sample_qc_ht_path,
+            compressed_scores_ht_path,
+            compressed_eigen_ht_path,
+            compressed_loadings_ht_path,
+            compressed_inferred_pop_ht_path,
+            compressed_relateds_to_drop_ht_path,
+        ],
+    )
+
+    # uses a few config entries
+    ancestry_plots.run(
+        out_path_pattern=tmp_path / 'plots' / '{scope}_pc{pci}_{pca_suffix}.{ext}',
+        sample_qc_ht_path=tmp_path / 'ancestry_sample_qc.ht',
+        scores_ht_path=tmp_path / 'scores.ht',
+        eigenvalues_ht_path=tmp_path / 'eigenvalues.ht',
+        loadings_ht_path=tmp_path / 'loadings.ht',
+        inferred_pop_ht_path=tmp_path / 'inferred_pop.ht',
+        relateds_to_drop_ht_path=tmp_path / 'relateds_to_drop.ht',
+    )
-        assert exists(vds_path)
-        assert exists(res_pref / 'plots' / 'dataset_pc1_hgdp_1kg_sites.html')
-        assert exists(siteonly_vcf_path)
+
+    # confirm the expected output was written
+    assert exists_not_cached(tmp_path / 'plots' / 'dataset_pc1_hgdp_1kg_sites.html')
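A practical upside of the split (worth noting, though the patch doesn't say so): each stage can now be iterated on in isolation, since its inputs come from the committed fixtures rather than from the previous stage's in-test output. For example, running only the relatedness test, serially for easier debugging (`-n 0` disables xdist workers):

```python
# Sketch: run a single stage test via pytest's Python API.
import sys

import pytest

if __name__ == '__main__':
    sys.exit(pytest.main(['-n', '0', 'test/test_large_cohort.py::test_relatedness']))
```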
""" +import shutil +import uuid +from os import makedirs from pathlib import Path from unittest.mock import mock_open @@ -174,11 +177,19 @@ def selective_mock_open(*args, **kwargs): return mock_open(read_data='')(*args, **kwargs) -def test_seqr_loader_dry(mocker: MockFixture, tmp_path): +def test_seqr_loader_dry(mocker: MockFixture): """ Test entire seqr-loader in a dry mode. """ - conf = TOML.format(directory=str(tmp_path)) + + # trying to generate a random tmp_path using pytest.tmp_path here fails on CI + # as the tmp_path is unbelievably long when running multiple parallel tests + tmp_path_string = f'test-seqr-loader-dry_tmp{uuid.uuid4().hex[:6]}' + makedirs(tmp_path_string, exist_ok=True) + + conf = TOML.format(directory=tmp_path_string) + tmp_path = Path(tmp_path_string) + set_config(conf, tmp_path / 'config.toml', merge_with=[DEFAULT_CONFIG, SEQR_LOADER_CONFIG]) mocker.patch('cpg_workflows.inputs.deprecated_create_cohort', _mock_cohort) @@ -219,3 +230,6 @@ def mock_create_analysis(*args, **kwargs) -> int: == len(get_multicohort().get_sequencing_groups()) + 1 ) assert get_batch().job_by_tool['gatk GenomicsDBImport']['job_n'] == 50 + + # if we got here, it's all good - clean up + shutil.rmtree(tmp_path_string)