-
Notifications
You must be signed in to change notification settings - Fork 0
/
print_param.cpp
189 lines (156 loc) · 11.1 KB
/
print_param.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
#include "print_param.h"
print_param::print_param(string file_Name)
{
/**
* * Constructor Function
* Assigns passed file location variable to the classes' private variable.
**/
cout << "Starting up Parameter Print" << endl
<< endl;
this->file_Name = file_Name;
}
void print_param::ingress()
{
/**
* Execution function prints the paramater file to the user defined location in *.json format.
**/
// string original = this->file_Name;
// for (size_t i = 1; i <= 22; i++)
// {
// stringstream ss;
// ss << std::setw(2) << std::setfill('0') << i;
// string file_Name = original + ss.str() + ".json";
/**
* Ensures first and foremost that the entered file name does not exist in the said location.
**/
if (filesystem::exists(file_Name) == 0)
{
cout << "Sample parameter file being printed: " << file_Name << endl;
fstream output;
output.open(file_Name, ios::out);
output << "{\n";
output << " # Cuda device\n"
<< " \"CUDA Device ID\":0,\n\n";
output << " # IO directory listings and universal file paths\n"
<< " # RECOMMENDS THE CREATION OF SEPARATE PARAMATER FILES WITH SEPARATE OUTPUT AND INTERMEDIATE PATHS FOR INDIVIDUAL PROJECTS\n"
<< " \"Input path\":\"VCF_indexed_folder\",\n"
<< " \"Output path\":\"Output_folder\",\n"
<< " \"Intermediate path\":\"Intermediate_folder\",\n\n";
output << " # Prometheus settings\n"
<< " # User is capable of determining the number of gene combinations and number of SNPs to be processed by CATE.\n"
<< " # When setting these parameters be mindful of system hardware availability.\n"
<< " # WARNING: Multi read CAN CAUSE A BOTTLENECK IF HARDWARE DOES NOT SUPPORT IT. NVMe DRIVES CONNECTED VIA PCIe BUS WILL SUPPORT MULTI READ. TYPICAL HDD DRIVES DO NOT.\n"
<< " \"Prometheus activate\":\"Yes\",\n"
<< " \"CPU cores\":25,\n"
<< " \"SNPs per time\":100000,\n"
<< " \"Number of genes\":1000,\n"
<< " \"Multi read\":\"Yes\",\n\n";
output << " # VCF sample details\n"
<< " \"Ploidy\":2,\n\n";
output << " # Protein information\n"
<< " # DNA INTERPRETATION SHOULD BE USED FOR START AND STOP CODONS\n"
<< " \"Start codon(s)\":\"ATG\",\n"
<< " \"Stop codon(s)\":\"TAA,TAG,TGA\",\n"
<< " \"Genetic code\":\"A|GCT,GCC,GCA,GCG;R|CGT,CGC,CGA,CGG,AGA,AGG;N|AAT,AAC;D|GAT,GAC;B|AAT,AAC,GAT,GAC;C|TGT,TGC;Q|CAA,CAG;E|GAA,GAG;Z|CAA,CAG,GAA,GAG;G|GGT,GGC,GGA,GGG;H|CAT,CAC;M|ATG;I|ATT,ATC,ATA;L|CTT,CTC,CTA,CTG,TTA,TTG;K|AAA,AAG;F|TTT,TTC;P|CCT,CCC,CCA,CCG;S|TCT,TCC,TCA,TCG,AGT,AGC;T|ACT,ACC,ACA,ACG;W|TGG;Y|TAT,TAC;V|GTT,GTC,GTA,GTG;X|TAA,TGA,TAG\",\n\n";
output << " # Neutrality and FST calculation mode\n"
<< " # Calculation mode can be FILE (to calculate the tests for predefined regions) or WINDOW\n"
<< " \"Calculation mode\":\"FILE\",\n\n";
output << " # WINDOW mode parameters\n"
<< " \"Window size\":10000,\n"
<< " \"Step size\":10000,\n\n";
output << " # FILE mode parameters\n"
<< " # Any tab-deliminated text based formats such as but not limited to (*.txt, *.csv)\n"
<< " \"Universal gene list\":\"sample.txt\n\n";
output << " # VCF split parameters\n"
<< " # The type of Split mode can be either CHR or CTSPLIT.\n"
<< " # The CHR mode will split the VCF file by chromosomes and extract the GT column only.\n"
<< " # The CTSPLIT mode will create CATE's proprietary file structure. For this the parent VCFs must only contain one chromosome's data and have only the GT column.\n"
<< " # Column numbers are non zero digits, i.e. Column numbers start with one.\n"
<< " \"Split mode\":\"CTSPLIT\"\n"
<< " \"CHR individual summary\":\"YES\"\n"
<< " \"Split cores\":10\n"
<< " \"Split SNPs per_time_CPU\":500000\n"
<< " \"Split SNPs per_time_GPU\":100000\n"
<< " \"Population file path\":\"sample_population_codes.tsv\",\n"
<< " \"Sample_ID Column number\":1,\n"
<< " \"Population_ID Column number\":6,\n"
<< " \"Reference allele count\":1,\n"
<< " \"Alternate allele count\":1,\n"
<< " \"SNP count per file\":10000,\n"
<< " \"MAF frequency\":\"0.05\"\n"
<< " \"Frequency logic\":\">\"\n\n";
output << " # FASTA split parameters\n"
<< " # REQUIRES SEPARATE INPUT, SINCE IT IS A SINGULAR INPUT\n"
<< " # IF ALL IS SET AS SEQUENCE THEN THE WHOLE FILE IS SPLIT. ELSE STATE THE SEQUENCE ID OF THE SEQUENCE THAT NEEDS TO BE SEPARATED\n"
<< " \"Sequence\":\"All\",\n"
<< " \"Raw FASTA file\":\"One.fasta\",\n\n";
output << " # FASTA merge parameters\n"
<< " # REQUIRES SEPARATE INPUT AND OUTPUT LOCATION\n"
<< " # ENSURE THE FASTA FILES HAVE THE APPROPRIATE EXTENSION: .fasta, .fna, .ffn, .faa, .frn, .fa\n"
<< " \"FASTA files folder\":\"reference_Split/\",\n"
<< " \"Merge FASTA path\":\"reference_Split/sample.fasta\",\n\n";
output << " # Extract genes parameters\n"
<< " # Extract gene FASTA sequences from REFERENCE FASTA file and outputs them as separate FASTA files\n"
<< " \"Reference genome ex\":\"reference/Human_1.fasta\",\n"
<< " \"Extract gene list\":\"universal\",\n\n";
output << " # GFF to Genes parameters\n"
<< " # REQUIRES SEPARATE INPUT, SINCE IT IS A SINGULAR INPUT\n"
<< " \"GFF file\":\"01.gff\",\n\n";
output << " # Haplotype INFO extract\n"
<< " # Extract haplotype information from VCFs\n"
<< " \"Reference genome hap\":\"reference/Human_1.fasta\",\n"
<< " \"Hap extract gene list\":\"universal\",\n\n";
output << " # All Neutrality tests\n"
" # Calculates all three neutrality tests (Tajima's D, Fay and Wu's and Fu and Li's) at once\n"
" # File created would have the extension *.nt (tab deliminated text file). File Name: CountryName_GeneListFileName.nt\n"
" \"Neutrality gene list\":\"universal\",\n\n";
output << " # Tajima's D\n"
<< " # File created would have the extension *.td (tab deliminated text file). File Name: CountryName_GeneListFileName.td\n"
<< " \"Tajima gene list\":\"universal\",\n\n";
output << " # Fu and Li\n"
<< " # File created would have the extension *.fl (tab deliminated text file). File Name: CountryName_GeneListFileName.fl\n"
<< " \"Fu and Li gene list\":\"universal\",\n\n";
output << " # Fay and Wu\n"
<< " # File created would have the extension *.fw (tab deliminated text file). File Name: CountryName_GeneListFileName.fw\n"
<< " \"Fay and Wu gene list\":\"universal\",\n\n";
output << " # McDonald–Kreitman\n"
<< " # REFERENCE GENOME FILE SHOULD BE THE SAME AS THAT USED TO GENERATE THE VCF FILE\n"
<< " # ALIGNMENT FILE SHOULD BE A PAIRWISE ALIGNMENT OF THE REFERENCE GENOME TO THE OUTGROUP GENOME. SHOULD BE IN .maf FORMAT\n"
<< " # Alignment can be gene wide (GENE mode) or chromosome (CHROM mode) wide.\n"
<< " # GENE mode would require each gene to be separately aligned and the location of each alignment file to be added as a third column to the gene list file.\n"
<< " # File created would have the extension *.mk (tab deliminated text file). File Name: CountryName_GeneListFileName.mk\n"
<< " # CHROM mode would require the outgroup species chromosome's genome to be aligned with the entire chromosome of the query species reference genome.\n"
<< " # File created would have the extension *.mk (tab deliminated text file). File Name: CountryName_GeneListFileName.mk\n"
<< " \"Alignment mode\":\"GENE\",\n"
<< " \"ORF known\":\"Yes\",\n"
<< " \"Reference genome mk\":\"reference/Human_1.fasta\",\n"
<< " \"Alignment file\":\"reference/results.maf\",\n"
<< " \"McDonald–Kreitman gene list\":\"universal\",\n\n";
output << " # Fst or Fixation Index\n"
" # Indexed population file should be a THREE column tab deliminated *.txt file with headings: Sample_name population_ID Super_population\n"
" # Ensure population_ID values are unique to each super population and do not overlap\n"
" \"Population index file path\":\"fst_pop/population.txt\",\n"
" \"Fst gene list\":\"universal\",\n"
" \"Population ID\":\"GWD,ACB,PJL\",\n\n";
output << " # EHH or Extended Haplotype Homozygosity\n"
<< " # Define the extended haplotype region specification mode by either using a separate gene list file (FILE mode) or state the standard displacement value (FIXED mode) in bases\n"
<< " # Degradation of EHH around a SNP can be calculated by SNP or BP mode.\n"
<< " \"Range mode\":\"FILE\",\n"
<< " \"EHH FILE path\":\"universal\",\n"
<< " \"FIXED mode\":\"+1000\"\n"
<< " \"SNP default count\":53\n"
<< " \"SNP BP displacement\":100000\n"
<< " \"EHH CPU cores\":10\n";
output
<< "}";
output.close();
cout << "Sample parameter has been printed: " << file_Name << endl;
}
else
{
/**
* ! Initialised if the parameter file already exists in the said location.
**/
cout << "ERROR: PARAMETER FILE " << file_Name << " ALREADY EXISTS." << endl;
}
}