diff --git a/data-pipeline/poetry.lock b/data-pipeline/poetry.lock index 37e3d0105..8083f8bfc 100644 --- a/data-pipeline/poetry.lock +++ b/data-pipeline/poetry.lock @@ -48,6 +48,22 @@ category = "dev" optional = false python-versions = "*" +[[package]] +name = "astroid" +version = "2.15.6" +description = "An abstract syntax tree for Python with inference support." +category = "dev" +optional = false +python-versions = ">=3.7.2" + +[package.dependencies] +lazy-object-proxy = ">=1.4.0" +typing-extensions = {version = ">=4.0.0", markers = "python_version < \"3.11\""} +wrapt = [ + {version = ">=1.11,<2", markers = "python_version < \"3.11\""}, + {version = ">=1.14,<2", markers = "python_version >= \"3.11\""}, +] + [[package]] name = "asttokens" version = "2.4.0" @@ -721,6 +737,20 @@ python-versions = "*" [package.dependencies] six = "*" +[[package]] +name = "isort" +version = "5.12.0" +description = "A Python utility / library to sort Python imports." +category = "dev" +optional = false +python-versions = ">=3.8.0" + +[package.extras] +colors = ["colorama (>=0.4.3)"] +pipfile-deprecated-finder = ["pip-shims (>=0.5.2)", "pipreqs", "requirementslib"] +plugins = ["setuptools"] +requirements-deprecated-finder = ["pip-api", "pipreqs"] + [[package]] name = "janus" version = "1.0.0" @@ -781,6 +811,14 @@ python-versions = "*" [package.dependencies] six = ">=1.13,<2.0" +[[package]] +name = "lazy-object-proxy" +version = "1.9.0" +description = "A fast and thorough lazy object proxy." +category = "dev" +optional = false +python-versions = ">=3.7" + [[package]] name = "loguru" version = "0.7.2" @@ -815,6 +853,14 @@ python-versions = ">=3.5" [package.dependencies] traitlets = "*" +[[package]] +name = "mccabe" +version = "0.7.0" +description = "McCabe checker, plugin for flake8" +category = "dev" +optional = false +python-versions = ">=3.6" + [[package]] name = "msal" version = "1.24.0" @@ -1012,6 +1058,18 @@ python-versions = ">=3.8" docs = ["furo", "olefile", "sphinx (>=2.4)", "sphinx-copybutton", "sphinx-inline-tabs", "sphinx-removed-in", "sphinxext-opengraph"] tests = ["check-manifest", "coverage", "defusedxml", "markdown2", "olefile", "packaging", "pyroma", "pytest", "pytest-cov", "pytest-timeout"] +[[package]] +name = "platformdirs" +version = "3.10.0" +description = "A small Python package for determining appropriate platform-specific dirs, e.g. a \"user data dir\"." +category = "dev" +optional = false +python-versions = ">=3.7" + +[package.extras] +docs = ["furo (>=2023.7.26)", "proselint (>=0.13)", "sphinx (>=7.1.1)", "sphinx-autodoc-typehints (>=1.24)"] +test = ["appdirs (==1.4.4)", "covdefaults (>=2.3)", "pytest (>=7.4)", "pytest-cov (>=4.1)", "pytest-mock (>=3.11.1)"] + [[package]] name = "plotly" version = "5.17.0" @@ -1167,6 +1225,32 @@ dev = ["coverage[toml] (==5.0.4)", "cryptography (>=3.4.0)", "pre-commit", "pyte docs = ["sphinx (>=4.5.0,<5.0.0)", "sphinx-rtd-theme", "zope.interface"] tests = ["coverage[toml] (==5.0.4)", "pytest (>=6.0.0,<7.0.0)"] +[[package]] +name = "pylint" +version = "2.17.5" +description = "python code static checker" +category = "dev" +optional = false +python-versions = ">=3.7.2" + +[package.dependencies] +astroid = ">=2.15.6,<=2.17.0-dev0" +colorama = {version = ">=0.4.5", markers = "sys_platform == \"win32\""} +dill = [ + {version = ">=0.2", markers = "python_version < \"3.11\""}, + {version = ">=0.3.6", markers = "python_version >= \"3.11\""}, +] +isort = ">=4.2.5,<6" +mccabe = ">=0.6,<0.8" +platformdirs = ">=2.2.0" +tomli = {version = ">=1.1.0", markers = "python_version < \"3.11\""} +tomlkit = ">=0.10.1" +typing-extensions = {version = ">=3.10.0", markers = "python_version < \"3.10\""} + +[package.extras] +spelling = ["pyenchant (>=3.2,<4.0)"] +testutils = ["gitpython (>3)"] + [[package]] name = "pyspark" version = "3.3.3" @@ -1405,6 +1489,14 @@ category = "dev" optional = false python-versions = ">=3.7" +[[package]] +name = "tomlkit" +version = "0.12.1" +description = "Style preserving TOML library" +category = "dev" +optional = false +python-versions = ">=3.7" + [[package]] name = "tornado" version = "6.3.3" @@ -1552,7 +1644,7 @@ multidict = ">=4.0" [metadata] lock-version = "1.1" python-versions = "^3.9" -content-hash = "29e11461c9eebb7069b1426d42894beaf919f882955e6c5d8665b0b20c4c4219" +content-hash = "0d8f91bff53e125b97b70ea71cf99c637323c13caa58489d462fedf7a9ad409a" [metadata.files] aiodns = [ @@ -1656,6 +1748,10 @@ appnope = [ {file = "appnope-0.1.3-py2.py3-none-any.whl", hash = "sha256:265a455292d0bd8a72453494fa24df5a11eb18373a60c7c0430889f22548605e"}, {file = "appnope-0.1.3.tar.gz", hash = "sha256:02bd91c4de869fbb1e1c50aafc4098827a7a54ab2f39d9dcba6c9547ed920e24"}, ] +astroid = [ + {file = "astroid-2.15.6-py3-none-any.whl", hash = "sha256:389656ca57b6108f939cf5d2f9a2a825a3be50ba9d589670f393236e0a03b91c"}, + {file = "astroid-2.15.6.tar.gz", hash = "sha256:903f024859b7c7687d7a7f3a3f73b17301f8e42dfd9cc9df9d4418172d3e2dbd"}, +] asttokens = [ {file = "asttokens-2.4.0-py2.py3-none-any.whl", hash = "sha256:cf8fc9e61a86461aa9fb161a14a0841a03c405fa829ac6b202670b3495d2ce69"}, {file = "asttokens-2.4.0.tar.gz", hash = "sha256:2e0171b991b2c959acc6c49318049236844a5da1d65ba2672c4880c1c894834e"}, @@ -2212,6 +2308,10 @@ isodate = [ {file = "isodate-0.6.1-py2.py3-none-any.whl", hash = "sha256:0751eece944162659049d35f4f549ed815792b38793f07cf73381c1c87cbed96"}, {file = "isodate-0.6.1.tar.gz", hash = "sha256:48c5881de7e8b0a0d648cb024c8062dc84e7b840ed81e864c7614fd3c127bde9"}, ] +isort = [ + {file = "isort-5.12.0-py3-none-any.whl", hash = "sha256:f84c2818376e66cf843d497486ea8fed8700b340f308f076c6fb1229dff318b6"}, + {file = "isort-5.12.0.tar.gz", hash = "sha256:8bef7dde241278824a6d83f44a544709b065191b95b6e50894bdc722fcba0504"}, +] janus = [ {file = "janus-1.0.0-py3-none-any.whl", hash = "sha256:2596ea5482711c1ee3ef2df6c290aaf370a13c55a007826e8f7c32d696d1d00a"}, {file = "janus-1.0.0.tar.gz", hash = "sha256:df976f2cdcfb034b147a2d51edfc34ff6bfb12d4e2643d3ad0e10de058cb1612"}, @@ -2232,6 +2332,44 @@ jproperties = [ {file = "jproperties-2.1.1-py2.py3-none-any.whl", hash = "sha256:4dfcd7cab56d9c79bce4453f7ca9ffbe0ff0574ddcf1c2a99a8646df60634664"}, {file = "jproperties-2.1.1.tar.gz", hash = "sha256:40b71124e8d257e8954899a91cd2d5c0f72e0f67f1b72048a5ba264567604f29"}, ] +lazy-object-proxy = [ + {file = "lazy-object-proxy-1.9.0.tar.gz", hash = "sha256:659fb5809fa4629b8a1ac5106f669cfc7bef26fbb389dda53b3e010d1ac4ebae"}, + {file = "lazy_object_proxy-1.9.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:b40387277b0ed2d0602b8293b94d7257e17d1479e257b4de114ea11a8cb7f2d7"}, + {file = "lazy_object_proxy-1.9.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e8c6cfb338b133fbdbc5cfaa10fe3c6aeea827db80c978dbd13bc9dd8526b7d4"}, + {file = "lazy_object_proxy-1.9.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:721532711daa7db0d8b779b0bb0318fa87af1c10d7fe5e52ef30f8eff254d0cd"}, + {file = "lazy_object_proxy-1.9.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:66a3de4a3ec06cd8af3f61b8e1ec67614fbb7c995d02fa224813cb7afefee701"}, + {file = "lazy_object_proxy-1.9.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:1aa3de4088c89a1b69f8ec0dcc169aa725b0ff017899ac568fe44ddc1396df46"}, + {file = "lazy_object_proxy-1.9.0-cp310-cp310-win32.whl", hash = "sha256:f0705c376533ed2a9e5e97aacdbfe04cecd71e0aa84c7c0595d02ef93b6e4455"}, + {file = "lazy_object_proxy-1.9.0-cp310-cp310-win_amd64.whl", hash = "sha256:ea806fd4c37bf7e7ad82537b0757999264d5f70c45468447bb2b91afdbe73a6e"}, + {file = "lazy_object_proxy-1.9.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:946d27deaff6cf8452ed0dba83ba38839a87f4f7a9732e8f9fd4107b21e6ff07"}, + {file = "lazy_object_proxy-1.9.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:79a31b086e7e68b24b99b23d57723ef7e2c6d81ed21007b6281ebcd1688acb0a"}, + {file = "lazy_object_proxy-1.9.0-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f699ac1c768270c9e384e4cbd268d6e67aebcfae6cd623b4d7c3bfde5a35db59"}, + {file = "lazy_object_proxy-1.9.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:bfb38f9ffb53b942f2b5954e0f610f1e721ccebe9cce9025a38c8ccf4a5183a4"}, + {file = "lazy_object_proxy-1.9.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:189bbd5d41ae7a498397287c408617fe5c48633e7755287b21d741f7db2706a9"}, + {file = "lazy_object_proxy-1.9.0-cp311-cp311-win32.whl", hash = "sha256:81fc4d08b062b535d95c9ea70dbe8a335c45c04029878e62d744bdced5141586"}, + {file = "lazy_object_proxy-1.9.0-cp311-cp311-win_amd64.whl", hash = "sha256:f2457189d8257dd41ae9b434ba33298aec198e30adf2dcdaaa3a28b9994f6adb"}, + {file = "lazy_object_proxy-1.9.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:d9e25ef10a39e8afe59a5c348a4dbf29b4868ab76269f81ce1674494e2565a6e"}, + {file = "lazy_object_proxy-1.9.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cbf9b082426036e19c6924a9ce90c740a9861e2bdc27a4834fd0a910742ac1e8"}, + {file = "lazy_object_proxy-1.9.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9f5fa4a61ce2438267163891961cfd5e32ec97a2c444e5b842d574251ade27d2"}, + {file = "lazy_object_proxy-1.9.0-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:8fa02eaab317b1e9e03f69aab1f91e120e7899b392c4fc19807a8278a07a97e8"}, + {file = "lazy_object_proxy-1.9.0-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:e7c21c95cae3c05c14aafffe2865bbd5e377cfc1348c4f7751d9dc9a48ca4bda"}, + {file = "lazy_object_proxy-1.9.0-cp37-cp37m-win32.whl", hash = "sha256:f12ad7126ae0c98d601a7ee504c1122bcef553d1d5e0c3bfa77b16b3968d2734"}, + {file = "lazy_object_proxy-1.9.0-cp37-cp37m-win_amd64.whl", hash = "sha256:edd20c5a55acb67c7ed471fa2b5fb66cb17f61430b7a6b9c3b4a1e40293b1671"}, + {file = "lazy_object_proxy-1.9.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:2d0daa332786cf3bb49e10dc6a17a52f6a8f9601b4cf5c295a4f85854d61de63"}, + {file = "lazy_object_proxy-1.9.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9cd077f3d04a58e83d04b20e334f678c2b0ff9879b9375ed107d5d07ff160171"}, + {file = "lazy_object_proxy-1.9.0-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:660c94ea760b3ce47d1855a30984c78327500493d396eac4dfd8bd82041b22be"}, + {file = "lazy_object_proxy-1.9.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:212774e4dfa851e74d393a2370871e174d7ff0ebc980907723bb67d25c8a7c30"}, + {file = "lazy_object_proxy-1.9.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:f0117049dd1d5635bbff65444496c90e0baa48ea405125c088e93d9cf4525b11"}, + {file = "lazy_object_proxy-1.9.0-cp38-cp38-win32.whl", hash = "sha256:0a891e4e41b54fd5b8313b96399f8b0e173bbbfc03c7631f01efbe29bb0bcf82"}, + {file = "lazy_object_proxy-1.9.0-cp38-cp38-win_amd64.whl", hash = "sha256:9990d8e71b9f6488e91ad25f322898c136b008d87bf852ff65391b004da5e17b"}, + {file = "lazy_object_proxy-1.9.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9e7551208b2aded9c1447453ee366f1c4070602b3d932ace044715d89666899b"}, + {file = "lazy_object_proxy-1.9.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5f83ac4d83ef0ab017683d715ed356e30dd48a93746309c8f3517e1287523ef4"}, + {file = "lazy_object_proxy-1.9.0-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7322c3d6f1766d4ef1e51a465f47955f1e8123caee67dd641e67d539a534d006"}, + {file = "lazy_object_proxy-1.9.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:18b78ec83edbbeb69efdc0e9c1cb41a3b1b1ed11ddd8ded602464c3fc6020494"}, + {file = "lazy_object_proxy-1.9.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:09763491ce220c0299688940f8dc2c5d05fd1f45af1e42e636b2e8b2303e4382"}, + {file = "lazy_object_proxy-1.9.0-cp39-cp39-win32.whl", hash = "sha256:9090d8e53235aa280fc9239a86ae3ea8ac58eff66a705fa6aa2ec4968b95c821"}, + {file = "lazy_object_proxy-1.9.0-cp39-cp39-win_amd64.whl", hash = "sha256:db1c1722726f47e10e0b5fdbf15ac3b8adb58c091d12b3ab713965795036985f"}, +] loguru = [ {file = "loguru-0.7.2-py3-none-any.whl", hash = "sha256:003d71e3d3ed35f0f8984898359d65b79e5b21943f78af86aa5491210429b8eb"}, {file = "loguru-0.7.2.tar.gz", hash = "sha256:e671a53522515f34fd406340ee968cb9ecafbc4b36c679da03c18fd8d0bd51ac"}, @@ -2292,6 +2430,10 @@ matplotlib-inline = [ {file = "matplotlib-inline-0.1.6.tar.gz", hash = "sha256:f887e5f10ba98e8d2b150ddcf4702c1e5f8b3a20005eb0f74bfdbd360ee6f304"}, {file = "matplotlib_inline-0.1.6-py3-none-any.whl", hash = "sha256:f1f41aab5328aa5aaea9b16d083b128102f8712542f819fe7e6a420ff581b311"}, ] +mccabe = [ + {file = "mccabe-0.7.0-py2.py3-none-any.whl", hash = "sha256:6c2d30ab6be0e4a46919781807b4f0d834ebdd6c6e3dca0bda5a15f863427b6e"}, + {file = "mccabe-0.7.0.tar.gz", hash = "sha256:348e0240c33b60bbdf4e523192ef919f28cb2c3d7d5c7794f74009290f236325"}, +] msal = [ {file = "msal-1.24.0-py2.py3-none-any.whl", hash = "sha256:a7f2f342b80ba3fe168218003b6798cc81b83c9745284bf63fb8d4ec8e2dbc50"}, {file = "msal-1.24.0.tar.gz", hash = "sha256:7d2ecdad41a5f73bb2b813f3061a4cf47c924621105a8ed137586fcb9d8f827e"}, @@ -2574,6 +2716,10 @@ Pillow = [ {file = "Pillow-10.0.1-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:d6caf3cd38449ec3cd8a68b375e0c6fe4b6fd04edb6c9766b55ef84a6e8ddf2d"}, {file = "Pillow-10.0.1.tar.gz", hash = "sha256:d72967b06be9300fed5cfbc8b5bafceec48bf7cdc7dab66b1d2549035287191d"}, ] +platformdirs = [ + {file = "platformdirs-3.10.0-py3-none-any.whl", hash = "sha256:d7c24979f292f916dc9cbf8648319032f551ea8c49a4c9bf2fb556a02070ec1d"}, + {file = "platformdirs-3.10.0.tar.gz", hash = "sha256:b45696dab2d7cc691a3226759c0d3b00c47c8b6e293d96f6436f733303f77f6d"}, +] plotly = [ {file = "plotly-5.17.0-py2.py3-none-any.whl", hash = "sha256:7c84cdf11da162423da957bb093287134f2d6f170eb9a74f1459f825892247c3"}, {file = "plotly-5.17.0.tar.gz", hash = "sha256:290d796bf7bab87aad184fe24b86096234c4c95dcca6ecbca02d02bdf17d3d97"}, @@ -2700,6 +2846,10 @@ PyJWT = [ {file = "PyJWT-2.8.0-py3-none-any.whl", hash = "sha256:59127c392cc44c2da5bb3192169a91f429924e17aff6534d70fdc02ab3e04320"}, {file = "PyJWT-2.8.0.tar.gz", hash = "sha256:57e28d156e3d5c10088e0c68abb90bfac3df82b40a71bd0daa20c65ccd5c23de"}, ] +pylint = [ + {file = "pylint-2.17.5-py3-none-any.whl", hash = "sha256:73995fb8216d3bed149c8d51bba25b2c52a8251a2c8ac846ec668ce38fab5413"}, + {file = "pylint-2.17.5.tar.gz", hash = "sha256:f7b601cbc06fef7e62a754e2b41294c2aa31f1cb659624b9a85bcba29eaf8252"}, +] pyspark = [ {file = "pyspark-3.3.3.tar.gz", hash = "sha256:384d2ad7090cd1db5b2d2ac497bda409d86ab3a27272833e1a27efadf45e4d2f"}, ] @@ -2944,6 +3094,10 @@ tomli = [ {file = "tomli-2.0.1-py3-none-any.whl", hash = "sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc"}, {file = "tomli-2.0.1.tar.gz", hash = "sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f"}, ] +tomlkit = [ + {file = "tomlkit-0.12.1-py3-none-any.whl", hash = "sha256:712cbd236609acc6a3e2e97253dfc52d4c2082982a88f61b640ecf0817eab899"}, + {file = "tomlkit-0.12.1.tar.gz", hash = "sha256:38e1ff8edb991273ec9f6181244a6a391ac30e9f5098e7535640ea6be97a7c86"}, +] tornado = [ {file = "tornado-6.3.3-cp38-abi3-macosx_10_9_universal2.whl", hash = "sha256:502fba735c84450974fec147340016ad928d29f1e91f49be168c0a4c18181e1d"}, {file = "tornado-6.3.3-cp38-abi3-macosx_10_9_x86_64.whl", hash = "sha256:805d507b1f588320c26f7f097108eb4023bbaa984d63176d1652e184ba24270a"}, diff --git a/data-pipeline/pyproject.toml b/data-pipeline/pyproject.toml index 82673f89c..502abf9ac 100644 --- a/data-pipeline/pyproject.toml +++ b/data-pipeline/pyproject.toml @@ -18,6 +18,7 @@ cattrs = "^23.1.2" hail = "^0.2.122" pytest = "^7.4.2" ipython = "^8.15.0" +pylint = "^2.17.5" [build-system] requires = ["poetry-core"] @@ -25,3 +26,22 @@ build-backend = "poetry.core.masonry.api" [tool.black] line-length = 120 + +[tool.pylint.basic] +# ds: frequently used name for a variable containing a Hail table +good-names = [ + "ds", +] + +[tool.pylint.messages_control] +disable = [ + # Ignore refactor and convention categories + "R", + "C", +] + +[tool.pylint.miscellaneous] +notes = [ + "FIXME", + "TODO", +] diff --git a/data-pipeline/schemas/gnomad_v4_variants/annotate_gnomad_v4_exome_transcript_consequences/mane_transcripts_path/mane_select_transcripts.ht.schema b/data-pipeline/schemas/gnomad_v4_variants/annotate_gnomad_v4_exome_transcript_consequences/mane_transcripts_path/mane_select_transcripts.ht.schema new file mode 100644 index 000000000..37028eb17 --- /dev/null +++ b/data-pipeline/schemas/gnomad_v4_variants/annotate_gnomad_v4_exome_transcript_consequences/mane_transcripts_path/mane_select_transcripts.ht.schema @@ -0,0 +1,14 @@ +---------------------------------------- +Global fields: + 'version': str +---------------------------------------- +Row fields: + 'gene_id': str + 'matched_gene_version': str + 'ensembl_id': str + 'ensembl_version': str + 'refseq_id': str + 'refseq_version': str +---------------------------------------- +Key: ['gene_id'] +---------------------------------------- \ No newline at end of file diff --git a/data-pipeline/schemas/gnomad_v4_variants/annotate_gnomad_v4_exome_transcript_consequences/output/gnomad_v4_variants_annotated_2.ht.schema b/data-pipeline/schemas/gnomad_v4_variants/annotate_gnomad_v4_exome_transcript_consequences/output/gnomad_v4_variants_annotated_2.ht.schema new file mode 100644 index 000000000..f32eebf40 --- /dev/null +++ b/data-pipeline/schemas/gnomad_v4_variants/annotate_gnomad_v4_exome_transcript_consequences/output/gnomad_v4_variants_annotated_2.ht.schema @@ -0,0 +1,297 @@ +---------------------------------------- +Global fields: + 'freq_meta': array> + 'freq_index_dict': dict + 'faf_meta': array> + 'faf_index_dict': dict + 'freq_sample_count': array + 'filtering_model': struct { + model_name: str, + score_name: str, + feature_medians: dict, + variants_by_strata: dict, + features_importance: dict, + features: array, + test_results: array, + rf_snv_cutoff: struct { + bin: float64, + min_score: float64 + }, + rf_indel_cutoff: struct { + bin: float64, + min_score: float64 + }, + inbreeding_cutoff: float64, + model_id: str + } + 'tool_versions': struct { + dbsnp_version: str, + cadd_version: str, + revel_version: str, + splicaai_version: str, + primateai_version: str, + pangolin_version: str, + vrs_version: str + } + 'vep_globals': struct { + vep_version: str, + vep_csq_header: str, + vep_help: str, + vep_config: str + } + 'age_distribution': struct { + bin_edges: array, + bin_freq: array, + n_smaller: int32, + n_larger: int32 + } + 'age_index_dict': dict + 'age_meta': array> + 'grpmax_index_dict': dict + 'grpmax_meta': array> + 'README': dict + 'gnomad_qc_repo': str + 'gnomad_methods_repo': str + 'mane_transcripts_version': str +---------------------------------------- +Row fields: + 'locus': locus + 'alleles': array + 'grpmax': array + 'rsids': set + 'rf': struct { + rf_positive_label: bool, + rf_negative_label: bool, + rf_label: str, + rf_train: bool, + rf_tp_probability: float64 + } + 'in_silico_predictors': struct { + cadd: struct { + phred: float32, + raw_score: float32, + has_duplicate: bool + }, + revel: struct { + revel_score: float64, + has_duplicate: bool + }, + splice_ai: struct { + splice_ai_score: float32, + splice_consequence: str, + has_duplicate: bool + }, + pangolin: struct { + pangolin_score: float64 + } + } + 'variant_id': str + 'colocated_variants': struct { + all: array, + non_ukb: array + } + 'gnomad': struct { + freq: struct { + all: struct { + ac: int32, + ac_raw: int32, + an: int32, + hemizygote_count: int32, + homozygote_count: int32, + populations: array + }, + non_ukb: struct { + ac: int32, + ac_raw: int32, + an: int32, + hemizygote_count: int32, + homozygote_count: int32, + populations: array + } + }, + faf95: struct { + popmax: float64, + popmax_population: str + }, + faf99: struct { + popmax: float64, + popmax_population: str + }, + age_distribution: struct { + het: array, + bin_freq: array, + n_smaller: int64, + n_larger: int64 + }>, + hom: array, + bin_freq: array, + n_smaller: int64, + n_larger: int64 + }> + }, + filters: set, + quality_metrics: struct { + allele_balance: struct { + alt_adj: struct { + bin_edges: array, + bin_freq: array, + n_smaller: int64, + n_larger: int64 + }, + alt_raw: struct { + bin_edges: array, + bin_freq: array, + n_smaller: int64, + n_larger: int64 + } + }, + genotype_depth: struct { + all_adj: struct { + bin_edges: array, + bin_freq: array, + n_smaller: int64, + n_larger: int64 + }, + all_raw: struct { + bin_edges: array, + bin_freq: array, + n_smaller: int64, + n_larger: int64 + }, + alt_adj: struct { + bin_edges: array, + bin_freq: array, + n_smaller: int64, + n_larger: int64 + }, + alt_raw: struct { + bin_edges: array, + bin_freq: array, + n_smaller: int64, + n_larger: int64 + } + }, + genotype_quality: struct { + all_adj: struct { + bin_edges: array, + bin_freq: array, + n_smaller: int64, + n_larger: int64 + }, + all_raw: struct { + bin_edges: array, + bin_freq: array, + n_smaller: int64, + n_larger: int64 + }, + alt_adj: struct { + bin_edges: array, + bin_freq: array, + n_smaller: int64, + n_larger: int64 + }, + alt_raw: struct { + bin_edges: array, + bin_freq: array, + n_smaller: int64, + n_larger: int64 + } + }, + site_quality_metrics: array + } + } + 'subsets': set + 'flags': set + 'coverage': struct { + exome: struct { + mean: float64, + median: int32, + over_1: float32, + over_5: float32, + over_10: float32, + over_15: float32, + over_20: float32, + over_25: float32, + over_30: float32, + over_50: float32, + over_100: float32 + }, + genome: struct { + mean: float64, + median: int32, + over_1: float32, + over_5: float32, + over_10: float32, + over_15: float32, + over_20: float32, + over_25: float32, + over_30: float32, + over_50: float32, + over_100: float32 + } + } + 'transcript_consequences': array, + domains: set, + gene_id: str, + gene_symbol: str, + hgvsc: str, + hgvsp: str, + is_canonical: bool, + lof_filter: str, + lof_flags: str, + lof: str, + major_consequence: str, + polyphen_prediction: str, + sift_prediction: str, + transcript_id: str, + transcript_version: str, + gene_version: str, + is_mane_select: bool, + is_mane_select_version: bool, + refseq_id: str, + refseq_version: str + }> +---------------------------------------- +Key: ['locus', 'alleles'] +---------------------------------------- \ No newline at end of file diff --git a/data-pipeline/schemas/gnomad_v4_variants/annotate_gnomad_v4_exome_transcript_consequences/output/gnomad_v4_variants_annotated_2.ht.schema.min b/data-pipeline/schemas/gnomad_v4_variants/annotate_gnomad_v4_exome_transcript_consequences/output/gnomad_v4_variants_annotated_2.ht.schema.min new file mode 100644 index 000000000..2c5eb8661 --- /dev/null +++ b/data-pipeline/schemas/gnomad_v4_variants/annotate_gnomad_v4_exome_transcript_consequences/output/gnomad_v4_variants_annotated_2.ht.schema.min @@ -0,0 +1,226 @@ +Row fields: + 'locus': locus + 'alleles': array + 'grpmax': array + 'rsids': set + 'rf': struct { + rf_positive_label: bool, + rf_negative_label: bool, + rf_label: str, + rf_train: bool, + rf_tp_probability: float64 + } + 'in_silico_predictors': struct { + cadd: struct { + phred: float32, + raw_score: float32, + has_duplicate: bool + }, + revel: struct { + revel_score: float64, + has_duplicate: bool + }, + splice_ai: struct { + splice_ai_score: float32, + splice_consequence: str, + has_duplicate: bool + }, + pangolin: struct { + pangolin_score: float64 + } + } + 'variant_id': str + 'colocated_variants': struct { + all: array, + non_ukb: array + } + 'gnomad': struct { + freq: struct { + all: struct { + ac: int32, + ac_raw: int32, + an: int32, + hemizygote_count: int32, + homozygote_count: int32, + populations: array + }, + non_ukb: struct { + ac: int32, + ac_raw: int32, + an: int32, + hemizygote_count: int32, + homozygote_count: int32, + populations: array + } + }, + faf95: struct { + popmax: float64, + popmax_population: str + }, + faf99: struct { + popmax: float64, + popmax_population: str + }, + age_distribution: struct { + het: array, + bin_freq: array, + n_smaller: int64, + n_larger: int64 + }>, + hom: array, + bin_freq: array, + n_smaller: int64, + n_larger: int64 + }> + }, + filters: set, + quality_metrics: struct { + allele_balance: struct { + alt_adj: struct { + bin_edges: array, + bin_freq: array, + n_smaller: int64, + n_larger: int64 + }, + alt_raw: struct { + bin_edges: array, + bin_freq: array, + n_smaller: int64, + n_larger: int64 + } + }, + genotype_depth: struct { + all_adj: struct { + bin_edges: array, + bin_freq: array, + n_smaller: int64, + n_larger: int64 + }, + all_raw: struct { + bin_edges: array, + bin_freq: array, + n_smaller: int64, + n_larger: int64 + }, + alt_adj: struct { + bin_edges: array, + bin_freq: array, + n_smaller: int64, + n_larger: int64 + }, + alt_raw: struct { + bin_edges: array, + bin_freq: array, + n_smaller: int64, + n_larger: int64 + } + }, + genotype_quality: struct { + all_adj: struct { + bin_edges: array, + bin_freq: array, + n_smaller: int64, + n_larger: int64 + }, + all_raw: struct { + bin_edges: array, + bin_freq: array, + n_smaller: int64, + n_larger: int64 + }, + alt_adj: struct { + bin_edges: array, + bin_freq: array, + n_smaller: int64, + n_larger: int64 + }, + alt_raw: struct { + bin_edges: array, + bin_freq: array, + n_smaller: int64, + n_larger: int64 + } + }, + site_quality_metrics: array + } + } + 'subsets': set + 'flags': set + 'coverage': struct { + exome: struct { + mean: float64, + median: int32, + over_1: float32, + over_5: float32, + over_10: float32, + over_15: float32, + over_20: float32, + over_25: float32, + over_30: float32, + over_50: float32, + over_100: float32 + }, + genome: struct { + mean: float64, + median: int32, + over_1: float32, + over_5: float32, + over_10: float32, + over_15: float32, + over_20: float32, + over_25: float32, + over_30: float32, + over_50: float32, + over_100: float32 + } + } + 'transcript_consequences': array, + domains: set, + gene_id: str, + gene_symbol: str, + hgvsc: str, + hgvsp: str, + is_canonical: bool, + lof_filter: str, + lof_flags: str, + lof: str, + major_consequence: str, + polyphen_prediction: str, + sift_prediction: str, + transcript_id: str, + transcript_version: str, + gene_version: str, + is_mane_select: bool, + is_mane_select_version: bool, + refseq_id: str, + refseq_version: str + }> +---------------------------------------- +Key: ['locus', 'alleles'] +---------------------------------------- diff --git a/data-pipeline/schemas/gnomad_v4_variants/annotate_gnomad_v4_exome_transcript_consequences/transcripts_path/transcripts_grch38_base.ht.schema b/data-pipeline/schemas/gnomad_v4_variants/annotate_gnomad_v4_exome_transcript_consequences/transcripts_path/transcripts_grch38_base.ht.schema new file mode 100644 index 000000000..633610b9d --- /dev/null +++ b/data-pipeline/schemas/gnomad_v4_variants/annotate_gnomad_v4_exome_transcript_consequences/transcripts_path/transcripts_grch38_base.ht.schema @@ -0,0 +1,97 @@ +---------------------------------------- +Global fields: + 'annotations': struct { + canonical_transcript: struct {}, + mane_select_transcript: struct { + version: str + } + } +---------------------------------------- +Row fields: + 'gene': struct { + interval: interval>, + gene_id: str, + gene_version: str, + gencode_symbol: str, + chrom: str, + strand: str, + start: int32, + stop: int32, + xstart: int64, + xstop: int64, + exons: array, + transcripts: array>, + transcript_id: str, + transcript_version: str, + gene_id: str, + gene_version: str, + chrom: str, + strand: str, + start: int32, + stop: int32, + xstart: int64, + xstop: int64, + exons: array, + reference_genome: str, + refseq_id: str, + refseq_version: str + }>, + hgnc_id: str, + symbol: str, + name: str, + previous_symbols: set, + alias_symbols: set, + omim_id: str, + ncbi_id: str, + symbol_upper_case: str, + search_terms: set, + reference_genome: str, + flags: set, + canonical_transcript_id: str, + mane_select_transcript: struct { + matched_gene_version: str, + ensembl_id: str, + ensembl_version: str, + refseq_id: str, + refseq_version: str + }, + preferred_transcript_id: str, + preferred_transcript_source: str + } + 'interval': interval> + 'transcript_id': str + 'transcript_version': str + 'gene_id': str + 'gene_version': str + 'chrom': str + 'strand': str + 'start': int32 + 'stop': int32 + 'xstart': int64 + 'xstop': int64 + 'exons': array + 'reference_genome': str + 'refseq_id': str + 'refseq_version': str +---------------------------------------- +Key: ['transcript_id'] +---------------------------------------- \ No newline at end of file diff --git a/data-pipeline/schemas/gnomad_v4_variants/annotate_gnomad_v4_exome_transcript_consequences/variants_path/gnomad_v4_exome_variants_annotated_1.ht.schema b/data-pipeline/schemas/gnomad_v4_variants/annotate_gnomad_v4_exome_transcript_consequences/variants_path/gnomad_v4_exome_variants_annotated_1.ht.schema new file mode 100644 index 000000000..2d79bcce0 --- /dev/null +++ b/data-pipeline/schemas/gnomad_v4_variants/annotate_gnomad_v4_exome_transcript_consequences/variants_path/gnomad_v4_exome_variants_annotated_1.ht.schema @@ -0,0 +1,362 @@ +---------------------------------------- +Global fields: + 'freq_meta': array> + 'freq_index_dict': dict + 'faf_meta': array> + 'faf_index_dict': dict + 'freq_sample_count': array + 'filtering_model': struct { + model_name: str, + score_name: str, + feature_medians: dict, + variants_by_strata: dict, + features_importance: dict, + features: array, + test_results: array, + rf_snv_cutoff: struct { + bin: float64, + min_score: float64 + }, + rf_indel_cutoff: struct { + bin: float64, + min_score: float64 + }, + inbreeding_cutoff: float64, + model_id: str + } + 'tool_versions': struct { + dbsnp_version: str, + cadd_version: str, + revel_version: str, + splicaai_version: str, + primateai_version: str, + pangolin_version: str, + vrs_version: str + } + 'vep_globals': struct { + vep_version: str, + vep_csq_header: str, + vep_help: str, + vep_config: str + } + 'age_distribution': struct { + bin_edges: array, + bin_freq: array, + n_smaller: int32, + n_larger: int32 + } + 'age_index_dict': dict + 'age_meta': array> + 'grpmax_index_dict': dict + 'grpmax_meta': array> + 'README': dict + 'gnomad_qc_repo': str + 'gnomad_methods_repo': str +---------------------------------------- +Row fields: + 'locus': locus + 'alleles': array + 'grpmax': array + 'rsids': set + 'vep': struct { + allele_string: str, + end: int32, + id: str, + input: str, + intergenic_consequences: array, + impact: str, + variant_allele: str + }>, + most_severe_consequence: str, + motif_feature_consequences: array, + high_inf_pos: str, + impact: str, + motif_feature_id: str, + motif_name: str, + motif_pos: int32, + motif_score_change: float64, + transcription_factors: array, + strand: int32, + variant_allele: str + }>, + regulatory_feature_consequences: array, + impact: str, + regulatory_feature_id: str, + variant_allele: str + }>, + seq_region_name: str, + start: int32, + strand: int32, + transcript_consequences: array, + distance: int32, + domains: array, + exon: str, + flags: str, + gene_id: str, + gene_pheno: int32, + gene_symbol: str, + gene_symbol_source: str, + hgnc_id: str, + hgvsc: str, + hgvsp: str, + hgvs_offset: int32, + impact: str, + intron: str, + lof: str, + lof_flags: str, + lof_filter: str, + lof_info: str, + mane_select: str, + mane_plus_clinical: str, + mirna: array, + polyphen_prediction: str, + polyphen_score: float64, + protein_end: int32, + protein_start: int32, + protein_id: str, + sift_prediction: str, + sift_score: float64, + source: str, + strand: int32, + transcript_id: str, + tsl: int32, + uniprot_isoform: array, + variant_allele: str + }>, + variant_class: str + } + 'rf': struct { + rf_positive_label: bool, + rf_negative_label: bool, + rf_label: str, + rf_train: bool, + rf_tp_probability: float64 + } + 'in_silico_predictors': struct { + cadd: struct { + phred: float32, + raw_score: float32, + has_duplicate: bool + }, + revel: struct { + revel_score: float64, + has_duplicate: bool + }, + splice_ai: struct { + splice_ai_score: float32, + splice_consequence: str, + has_duplicate: bool + }, + pangolin: struct { + pangolin_score: float64 + } + } + 'variant_id': str + 'colocated_variants': struct { + all: array, + non_ukb: array + } + 'gnomad': struct { + freq: struct { + all: struct { + ac: int32, + ac_raw: int32, + an: int32, + hemizygote_count: int32, + homozygote_count: int32, + populations: array + }, + non_ukb: struct { + ac: int32, + ac_raw: int32, + an: int32, + hemizygote_count: int32, + homozygote_count: int32, + populations: array + } + }, + faf95: struct { + popmax: float64, + popmax_population: str + }, + faf99: struct { + popmax: float64, + popmax_population: str + }, + age_distribution: struct { + het: array, + bin_freq: array, + n_smaller: int64, + n_larger: int64 + }>, + hom: array, + bin_freq: array, + n_smaller: int64, + n_larger: int64 + }> + }, + filters: set, + quality_metrics: struct { + allele_balance: struct { + alt_adj: struct { + bin_edges: array, + bin_freq: array, + n_smaller: int64, + n_larger: int64 + }, + alt_raw: struct { + bin_edges: array, + bin_freq: array, + n_smaller: int64, + n_larger: int64 + } + }, + genotype_depth: struct { + all_adj: struct { + bin_edges: array, + bin_freq: array, + n_smaller: int64, + n_larger: int64 + }, + all_raw: struct { + bin_edges: array, + bin_freq: array, + n_smaller: int64, + n_larger: int64 + }, + alt_adj: struct { + bin_edges: array, + bin_freq: array, + n_smaller: int64, + n_larger: int64 + }, + alt_raw: struct { + bin_edges: array, + bin_freq: array, + n_smaller: int64, + n_larger: int64 + } + }, + genotype_quality: struct { + all_adj: struct { + bin_edges: array, + bin_freq: array, + n_smaller: int64, + n_larger: int64 + }, + all_raw: struct { + bin_edges: array, + bin_freq: array, + n_smaller: int64, + n_larger: int64 + }, + alt_adj: struct { + bin_edges: array, + bin_freq: array, + n_smaller: int64, + n_larger: int64 + }, + alt_raw: struct { + bin_edges: array, + bin_freq: array, + n_smaller: int64, + n_larger: int64 + } + }, + site_quality_metrics: array + } + } + 'subsets': set + 'flags': set + 'coverage': struct { + exome: struct { + mean: float64, + median: int32, + over_1: float32, + over_5: float32, + over_10: float32, + over_15: float32, + over_20: float32, + over_25: float32, + over_30: float32, + over_50: float32, + over_100: float32 + }, + genome: struct { + mean: float64, + median: int32, + over_1: float32, + over_5: float32, + over_10: float32, + over_15: float32, + over_20: float32, + over_25: float32, + over_30: float32, + over_50: float32, + over_100: float32 + } + } +---------------------------------------- +Key: ['locus', 'alleles'] +---------------------------------------- \ No newline at end of file diff --git a/data-pipeline/src/data_pipeline/config.py b/data-pipeline/src/data_pipeline/config.py index 32d3ec636..2d5deaf8c 100644 --- a/data-pipeline/src/data_pipeline/config.py +++ b/data-pipeline/src/data_pipeline/config.py @@ -1,7 +1,6 @@ import os import attr from pathlib import Path -from hail import Optional DATA_ENV = os.getenv("DATA_ENV", "local") @@ -28,8 +27,8 @@ def create(cls, root=None, **kwargs): return cls(**dataset_paths) def make_local_folder(self): - if "gs://" not in self.data_path: - Path(self.data_path).mkdir(parents=True, exist_ok=True) + if "gs://" not in self.root: + Path(self.root).mkdir(parents=True, exist_ok=True) # @attr.define diff --git a/data-pipeline/src/data_pipeline/datasets/gnomad_v4/types/prepare_variants_step3.py b/data-pipeline/src/data_pipeline/datasets/gnomad_v4/types/prepare_variants_step3.py new file mode 100644 index 000000000..a70a977b9 --- /dev/null +++ b/data-pipeline/src/data_pipeline/datasets/gnomad_v4/types/prepare_variants_step3.py @@ -0,0 +1,58 @@ +from typing import List, Union, Set +import attr +from data_pipeline.datasets.gnomad_v4.types.prepare_variants_step2 import Coverage + +from data_pipeline.datasets.gnomad_v4.types.prepare_variants_step1 import ColocatedVariants, Gnomad + +from data_pipeline.datasets.gnomad_v4.types.initial_variant import ( + InSilicoPredictors, + Vep, + Domain, + TranscriptConsequence, + Rf, + InSilicoPredictors, + Grpmax, + Locus, +) + + +@attr.define +class TranscriptConsequence: + biotype: Union[str, None] + consequence_terms: list[str] + domains: Union[List[Domain], None] + gene_id: Union[str, None] + gene_symbol: Union[str, None] + hgvsc: Union[str, None] + hgvsp: Union[str, None] + is_canonical: bool + lof: Union[str, None] + lof_flags: Union[str, None] + lof_filter: Union[str, None] + lof_info: Union[str, None] + major_consequence: str + polyphen_prediction: Union[str, None] + sift_prediction: Union[str, None] + transcript_id: Union[str, None] + transcript_version: Union[str, None] + transcript_id: Union[str, None] + is_mane_select: bool + is_mane_select_version: bool + refseq_id: str + refseq_version: str + + +@attr.define +class Variant: + locus: Locus + alleles: list[str] + grpmax: List[Grpmax] + rsids: Union[Set[str], None] + rf: Rf + in_silico_predictors: InSilicoPredictors + variant_id: str + colocated_variants: ColocatedVariants + gnomad: Gnomad + subsets: set[str] + flags: set[str] + transcript_consequences: Union[List[TranscriptConsequence], None] diff --git a/data-pipeline/src/data_pipeline/pipeline.py b/data-pipeline/src/data_pipeline/pipeline.py index b395e3fc4..a1c42b934 100644 --- a/data-pipeline/src/data_pipeline/pipeline.py +++ b/data-pipeline/src/data_pipeline/pipeline.py @@ -114,8 +114,17 @@ class Task: @classmethod def create( - cls, name: str, task_function: str, output_path: str, inputs: Optional[dict] = {}, params: Optional[dict] = {} + cls, + name: str, + task_function: str, + output_path: str, + inputs: Optional[dict] = None, + params: Optional[dict] = None, ): + if inputs is None: + inputs = {} + if params is None: + params = {} return cls(name, task_function, output_path, inputs, params) def get_output_path(self): @@ -129,7 +138,6 @@ def get_inputs(self): paths.update({k: v.get_output_path()}) else: logger.info(v) - new_path = os.path.join(config.data_paths.root, v) paths.update({k: os.path.join(config.data_paths.root, v)}) return paths @@ -170,8 +178,17 @@ class Pipeline: _outputs: dict = {} def add_task( - self, name: str, task_function: str, output_path: str, inputs: Optional[dict] = {}, params: Optional[dict] = {} + self, + name: str, + task_function: str, + output_path: str, + inputs: Optional[dict] = None, + params: Optional[dict] = None, ): + if inputs is None: + inputs = {} + if params is None: + params = {} task = Task.create(name, task_function, output_path, inputs, params) self._tasks[name] = task return task diff --git a/data-pipeline/src/data_pipeline/pipelines/gnomad_v4_variants.py b/data-pipeline/src/data_pipeline/pipelines/gnomad_v4_variants.py index 573bc6550..f785dec6c 100644 --- a/data-pipeline/src/data_pipeline/pipelines/gnomad_v4_variants.py +++ b/data-pipeline/src/data_pipeline/pipelines/gnomad_v4_variants.py @@ -32,10 +32,10 @@ # ) pipeline.add_task( - "annotate_gnomad_v4_exome_variants", - annotate_variants, - "/gnomad_v4/gnomad_v4_exome_variants_annotated_1.ht", - { + name="annotate_gnomad_v4_exome_variants", + task_function=annotate_variants, + output_path="/gnomad_v4/gnomad_v4_exome_variants_annotated_1.ht", + inputs={ "variants_path": pipeline.get_task("prepare_gnomad_v4_exome_variants"), "exome_coverage_path": "tiny_datasets/gnomad_v4_exome_coverage.ht", "genome_coverage_path": "tiny_datasets/gnomad_v4_genome_coverage.ht", @@ -45,22 +45,24 @@ }, ) -# pipeline.add_task( -# "annotate_gnomad_v4_transcript_consequences", -# annotate_transcript_consequences, -# "/gnomad_v4/gnomad_v4_variants_annotated_2.ht", -# { -# "variants_path": pipeline.get_task("annotate_gnomad_v4_variants"), -# "transcripts_path": genes_pipeline.get_output("base_transcripts_grch38"), -# "mane_transcripts_path": genes_pipeline.get_output("mane_select_transcripts"), -# }, -# ) +pipeline.add_task( + name="annotate_gnomad_v4_exome_transcript_consequences", + task_function=annotate_transcript_consequences, + output_path="/gnomad_v4/gnomad_v4_variants_annotated_2.ht", + inputs={ + "variants_path": pipeline.get_task("annotate_gnomad_v4_exome_variants"), + "transcripts_path": "genes/transcripts_grch38_base.ht", + "mane_transcripts_path": "genes/mane_select_transcripts.ht" + # "transcripts_path": genes_pipeline.get_output("base_transcripts_grch38"), + # "mane_transcripts_path": genes_pipeline.get_output("mane_select_transcripts"), + }, +) ############################################### # Outputs ############################################### -# pipeline.set_outputs({"variants": "annotate_gnomad_v4_transcript_consequences"}) +pipeline.set_outputs({"variants": "annotate_gnomad_v4_exome_transcript_consequences"}) ############################################### # Run diff --git a/data-pipeline/tests/v4/test_inputs.py b/data-pipeline/tests/v4/test_inputs.py index 64987f32c..631e48ccd 100644 --- a/data-pipeline/tests/v4/test_inputs.py +++ b/data-pipeline/tests/v4/test_inputs.py @@ -12,6 +12,7 @@ from data_pipeline.datasets.gnomad_v4.types.initial_variant import InitialVariant from data_pipeline.datasets.gnomad_v4.types.prepare_variants_step1 import Variant as Step1Variant from data_pipeline.datasets.gnomad_v4.types.prepare_variants_step2 import Variant as Step2Variant +from data_pipeline.datasets.gnomad_v4.types.prepare_variants_step3 import Variant as Step3Variant step1_task = gnomad_v4_variant_pipeline.get_task("prepare_gnomad_v4_exome_variants") @@ -43,7 +44,6 @@ def test_globals_input_validation(): def test_validate_variant_input(): input_path = gnomad_v4_variant_pipeline.get_task("prepare_gnomad_v4_exome_variants").get_inputs()["input_path"] ht = hl.read_table(input_path) - # ht = ht.sample(0.1, seed=1234) result = ht_to_json(ht) [structure_attrs_fromdict(variant, InitialVariant) for variant in result] @@ -59,6 +59,14 @@ def test_validate_step1_output(): def test_validate_step2_output(): output_path = gnomad_v4_variant_pipeline.get_task("annotate_gnomad_v4_exome_variants").get_output_path() ht = hl.read_table(output_path) - # ht = ht.sample(0.1, seed=1234) result = ht_to_json(ht) [structure_attrs_fromdict(variant, Step2Variant) for variant in result] + + +def test_validate_step3_output(): + output_path = gnomad_v4_variant_pipeline.get_task( + "annotate_gnomad_v4_exome_transcript_consequences" + ).get_output_path() + ht = hl.read_table(output_path) + result = ht_to_json(ht) + [structure_attrs_fromdict(variant, Step3Variant) for variant in result]