-
Notifications
You must be signed in to change notification settings - Fork 0
/
Snakefile
154 lines (135 loc) · 7.45 KB
/
Snakefile
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
# MultiBLUP workflow for free amino acid traits in Arabidopsis thaliana
# Sarah Turner-Hissong
# Last updated: 18 September 2019
# Phenotype data: Angelovici et al 2013 Plant Cell
# Genotype data: Li et al. 2010; Horton et al. 2012
# Most steps follow MultiBLUP documentation at dougspeed.com/multiblup/
# CAVEATS (in no specific order)
# 1. Full workflow requires a substantial amount of disk space
# Recommendation is to run just one or a few of the random subsets to start with, then scale up
# 2. Requires phenotype/genotype data to be previously downloaded
# 3. Paths to input files are not generic and will need to be updated if applied to different data
# to execute workflow, use submit.sh file or run the following to include cluster config:
# snakemake --jobs N --rerun-incomplete --latency-wait 60 --cluster-config submit.json
# --cluster "sbatch -p {cluster.p} -o {cluster.o} --cpus-per-task {cluster.cpus-per-task}"
################################################################################
import numpy as np
# define some variables
# dictionary of pathways and MapMan bincodes that will be tested
# to use prespecified files with pathway info, set bincode == 0
# take care when specifying bincodes - want to make sure it is grepping the correct value
# worth double checking when you pull annotations using src/02_select_pathway_snps
# e.g. if you want all annotations in bin 8, just use '8', if you want annotations in '29.5.1',
# specify the end of the line using '29.5.1$' or it will also pull '29.5.11'
# to check this is working correctly in R:
# foo <- tibble(file = files) %>%
# separate(file, sep = "/", into = c("dir", "pathway", "filename"), remove = FALSE) %>%
# filter(!pathway %in% "old") %>% mutate(data = lapply(file, read.table, header = TRUE, colClasses = "character"))
# bar <- foo %>% group_by(pathway) %>% distinct(BINCODE, NAME)
PATHWAYS = {
"protein_aa_activation": "29.1",
"protein_synthesis": "29.2",
"protein_targeting": "29.3",
"protein_postrans": "29.4",
"protein_degradation": "(?!29.5.11)29.5",
"degradation_subtilases": "29.5.1$",
"degradation_autophagy": "29.5.2$",
"cys_protease": "29.5.3$",
"asp_protease": "29.5.4$",
"ser_protease": "29.5.5$",
"metallo_protease": "29.5.7$",
"degradation_AAA": "29.5.9$",
"degradation_ubiquitin": "29.5.11",
"protein_folding": "29.6",
"protein_glyco": "29.7",
"protein_assembly": "29.8",
"aa_transport": "34.3$",
"tca_cycle": "8",
"e_alt_oxidase": "9.4",
"isoprenoids": "16.1",
"phenylpropanoids": "16.2",
"N_containing": "16.4",
"S_containing": "16.5",
"flavonoids": "16.8",
"aa_synthesis": "13.1",
"aa_degradation": "13.2",
"glycolysis": "4",
"glycolysis_cystolic": "4.1",
"glycolysis_plastid": "4.2",
"glycolysis_other": "4.3",
"aa_syn_nobcat_haplo": "0",
"aa_deg_bcat_haplo": "0"
}
def bincode(wildcards):
bincode = PATHWAYS[wildcards.pathway]
return bincode
# list trait names
TRAIT = ['ala', 'arg', 'asp', 'gln' , 'glu', 'gly', 'his', 'ile', 'leu', 'lys','met',
'phe', 'pro', 'ser', 'thr', 'trp', 'tyr', 'val', 'total', 'ShikFam', 'AspFam',
'AspFam_Asp', 'SerFam', 'GluFam', 'GluFam_glu', 'BCAA', 'PyrFam', 'ala_t',
'arg_t', 'asp_t', 'gln_t', 'glu_t', 'gly_t', 'his_t', 'ile_t', 'leu_t',
'lys_t', 'met_t', 'phe_t', 'pro_t', 'ser_t', 'trp_t', 'tyr_t', 'val_t',
'ile_BCAA', 'leu_BCAA', 'val_BCAA', 'arg_GluFam', 'gln_GluFam', 'glu_GluFam',
'his_GluFam', 'pro_GluFam', 'phe_ShikFam', 'trp_ShikFam', 'tyr_ShikFam',
'gly_SerFam', 'ser_SerFam', 'ala_PyrFam', 'leu_PyrFam', 'val_PyrFam',
'asp_AspFam', 'ile_AspFam', 'lys_AspFam', 'met_AspFam', 'thr_AspFam']
# list range of random subsets to include - we used 5000
RANDOM = list(range(1,5001))
# index for null distribution gene group sampling
NULL = np.arange(1, 5001, 50).tolist()
# how many sets of cross validation - we used 5
CV = list(range(1,6))
# fold for each cross validation - we used 10 fold (total of 50 cross validations)
INDEX = list(range(1,11))
# path for ldak software
ldak = "../software/ldak5.linux"
################################################################################
# this is a pseudo-rule that collects the target files
# variable names refer to different rules in the workflow
# this is where you can specify wildcard ids for rules
rule all:
input:
# linkage disequilibrium adjusted weightings (not used for this workflow)
snp_weightings = "data/processed/sections/weights.short",
# principal components analysis to correct for population structure
# pca = "models/pca/ld_pruned.bed",
# pca_result = "data/raw/pheno_file_pcs",
# create test and training indexes for cross-validation
get_cv = expand("data/processed/cross_validation/cv{cv}.test{index}", cv = CV, index = INDEX),
get_cv2 = expand("data/processed/cross_validation/cv{cv}.train{index}", cv = CV, index = INDEX),
# kinships for GBLUP
gblup_kins = "models/gblup/kinships.grm.id",
# heritability estimates for GBLUP
gblup_h2 = expand("models/gblup_h2/{trait}.gblup.reml", trait = TRAIT),
# # GBLUP cross-validation
gblup_reml = expand("models/gblup/{trait}.cv{cv}.{index}.reml", trait = TRAIT, cv = CV, index = INDEX),
gblup_blup = expand("models/gblup/{trait}.cv{cv}.{index}.profile", cv = CV, index = INDEX, trait = TRAIT),
gblup_results = "reports/gblup.RData",
# # extract pathway information for MultiBLUP
# multiblup_pathways = expand("data/processed/pathways/{pathway}/list1", pathway = PATHWAYS.keys()),
# kinship matrices for MultiBLUP model
multiblup_kins = expand("data/processed/pathways/{pathway}/partition.list", pathway = PATHWAYS.keys()),
# heritability for partitions in MultiBLUP model
multiblup_h2 = expand("models/multiblup_h2/{pathway}/{trait}.multiblup.reml", pathway = PATHWAYS.keys(), trait = TRAIT),
# # cross-validation for MultiBLUP
multiblup_reml = expand("models/multiblup/{pathway}/{trait}.cv5.10.reml", pathway = PATHWAYS.keys(), trait = TRAIT),
multiblup_blup = expand("models/multiblup/{pathway}/{trait}.cv5.10.profile", pathway = PATHWAYS.keys(), trait = TRAIT),
# # summary of MultiBLUP results
multiblup_results = "reports/multiblup.RData",
# # null distribution
# null_sampling = "data/interim/null_group_sizes.txt",
# null_gene_groups = expand("data/processed/random_sets/null_{null}.txt", null = NULL),
# calc_kins_control = expand("data/processed/random_sets/c_{random}/partition.list", random = 5000),
null_pathway_sampling = expand("data/processed/random_sets_pathways/{pathway}/null_1000.txt", pathway = PATHWAYS.keys()),
calc_kins_null_pathways = expand("data/processed/random_sets_pathways/{pathway}/c_1000/partition.list", pathway = PATHWAYS.keys()),
# null_h2_pathways = expand("models/null_h2_pathways/{pathway}/c_1000/{trait}.h2.reml", pathway = PATHWAYS.keys(), trait = TRAIT),
null_summary_pathways = expand("reports/null_dist_results_pathways/{pathway}_null.csv", pathway = PATHWAYS.keys())
# reml_h2_control = expand("models/null_h2/c_{random}/{trait}.h2.reml", random = 5000, trait = TRAIT),
# null_results = "reports/lr_null_results.csv"
# include rule files with commands to run each step
include: "rules/common.smk"
include: "rules/prep_data.smk"
include: "rules/cross_validation.smk"
include: "rules/gblup.smk"
include: "rules/multiblup.smk"
include: "rules/null_distribution.smk"