-
Notifications
You must be signed in to change notification settings - Fork 5
/
BatchPredictChromAlignNet.py
180 lines (149 loc) · 9.3 KB
/
BatchPredictChromAlignNet.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
import pandas as pd
import numpy as np
import os
import time
import sys
from PredictChromAlignNet import prepareDataForPrediction, runPrediction
from parameters import prediction_options, batch_prediction_options
from modelDefinition import getModelVariant
from utilsData import calculateMetrics
"""
Runs the prediction script many times on one data set, over a range of model names,
model variants and repetitions.
Two arguments can be given from the command line when running the script. The first
argument is required and indicates the index of the data set to use. The second
argument is optional and can specify a particular repetition of the model to run.
Outputs:
ModelTests-On*.csv -- CSV file containing the metrics from each prediction
ModelTests-On*_Mean.csv -- CSV file containing summary metrics averaged across repetitions.
./Individual/* -- Individual prediction outcomes can be saved if 'save_individual_predictions'
is True. The folder can be changed by modifying 'individual_predictions_save_path'
"""
### Load parameters
# Which data sets, model repetitions, variants and model letter names to evaluate
data_paths = batch_prediction_options['data_paths']          # Locations of the data sets
model_repeats = batch_prediction_options['model_repeats']    # Which repetitions of the model were trained
model_variants = batch_prediction_options['model_variants']  # Model variants (defined in modelDefinition.py)
model_names = batch_prediction_options['model_names']        # Letter names indicating the training data sets used

# The first (required) command line argument selects the data set to predict on;
# it is an index into the list of data sets in data_options
dataset_number = int(sys.argv[1])
data_path = data_paths[dataset_number]

# Optional metrics, computed in addition to the true/false positive counts
calculate_f1_metric = prediction_options['calculate_f1_metric']                            # F1 score
calculate_auc_metric = prediction_options['calculate_auc_metric']                          # Area under the ROC curve
calculate_average_precision_metric = prediction_options['calculate_average_precision_metric']  # Average precision
calculate_metrics_for_components = prediction_options['calculate_metrics_for_components']  # Also score the subnetwork outputs
results_path = prediction_options['results_path']  # Path to save the summary output

# The optional second command line argument restricts the run to one model
# repetition; the repetition number is then prefixed to the save file name.
# Without it, predictions run over all repetitions.
save_names = batch_prediction_options['save_names']  # CSV file names for the per-prediction metrics
if len(sys.argv) > 2:
    repeat = int(sys.argv[2])
    save_name = os.path.join(results_path, 'rep{:02d}-{}'.format(repeat, save_names[dataset_number]))
    model_repeats = [repeat]
else:
    save_name = os.path.join(results_path, save_names[dataset_number])

# Optionally keep the individual prediction outputs as well
if batch_prediction_options['save_individual_predictions']:
    individual_predictions_save_path = batch_prediction_options['individual_predictions_save_path']
    # None means the main results folder, otherwise a subfolder of it
    results_path_individual = (results_path if individual_predictions_save_path is None
                               else os.path.join(results_path, individual_predictions_save_path))
    os.makedirs(results_path_individual, exist_ok = True)
### Check input
# Echo the data file being predicted on and where the results will be saved,
# so a log file can be sanity-checked mid-run.
# NOTE(review): the result accumulators (metrics_list, prediction_times_list,
# model_fullname_list) were also initialised here, but both branches of the
# 'continue_from_saved' block below unconditionally reassign them (and
# repetition_list was never initialised here at all), so those dead
# assignments have been removed.
print('Predicting for data: ')
print(data_path)
print('Results will be saved to: ')
print(save_name)
sys.stdout.flush() # Using manual flush to force the printing, for situations when we are checking the log file mid-calculation.
# Build the column names for the summary dataframe: one list for the overall
# prediction metrics and one per subnetwork output (mass, peak, chromatogram).
# Each optional metric flag appends the matching names to all four lists so
# they stay the same length.
metric_names = ['True Positives', 'False Positives - Ignore Neg Idx', 'False Positives']
mass_metric_names = ['TP-Mass', 'FP-IgnNeg-Mass', 'FP-Mass']
peak_metric_names = ['TP-Peak', 'FP-IgnNeg-Peak', 'FP-Peak']
chrom_metric_names = ['TP-Chrom', 'FP-IgnNeg-Chrom', 'FP-Chrom']
component_lists = (mass_metric_names, peak_metric_names, chrom_metric_names)
component_suffixes = ('Mass', 'Peak', 'Chrom')
if calculate_f1_metric:
    metric_names.extend(['Recall', 'Precision', 'F1'])
    for names, suffix in zip(component_lists, component_suffixes):
        names.extend(['Recall-' + suffix, 'Precision-' + suffix, 'F1-' + suffix])
if calculate_auc_metric:
    metric_names.append('AUC')
    for names, suffix in zip(component_lists, component_suffixes):
        names.append('AUC-' + suffix)
if calculate_average_precision_metric:
    metric_names.append('Average Precision')
    for names, suffix in zip(component_lists, component_suffixes):
        names.append('AP-' + suffix)
if calculate_metrics_for_components:
    metric_names.extend(mass_metric_names + peak_metric_names + chrom_metric_names)
# Length of one component's metric list, used later to pad NaNs into the peak
# slot for variants that skip the peak encoder
metric_len = len(peak_metric_names)
### Initialise the result accumulators, either by resuming from a previously
### saved summary file or starting empty
check_saved = batch_prediction_options['continue_from_saved'] and os.path.isfile(save_name)
if check_saved:
    # Resume: reload the rows already computed so finished model/repetition
    # combinations can be skipped in the prediction loop
    df = pd.read_csv(save_name, index_col = 0)
    metrics_list = list(df[metric_names].values)
    prediction_times_list = list(df['Prediction Times'])
    model_fullname_list = list(df['Model Name'])
    repetition_list = list(df['Repetition'])
else:
    # Fresh run: empty accumulators
    metrics_list, prediction_times_list = [], []
    model_fullname_list, repetition_list = [], []
### Predict
# The variant loop is first, so the data only need to be loaded once for each variant (some variants do not use the peak data, hence the reloading between variants)
for i in model_variants:
    print('\n\n') # This is just so the log file is nice
    print('===============\nFor model variant', i)
    # Get the correct model variant and extract the ignore_peak_profile attribute,
    # which controls whether the peak profile data is prepared and used
    chrom_align_model = getModelVariant(i)
    ignore_peak_profile = getattr(chrom_align_model, 'ignore_peak_profile')
    prediction_data, comparisons, info_df, peak_df_orig, peak_intensity = prepareDataForPrediction(data_path, ignore_peak_profile)
    for name in model_names:
        model_fullname = name + '-' + '{:02d}'.format(i) # Full name of the model - eg 'H-01'
        for repeat in model_repeats:
            # Check if this repetition has already been done (rows reloaded from
            # a previous partial run are present in df when check_saved is True)
            if check_saved:
                if ((df['Model Name'] == model_fullname) & (df['Repetition'] == repeat)).any():
                    continue
            # Combine the model name, variant and repetition to get the name of the model file to load
            model_file = "{}-{}-{:02d}-r{:02d}".format(batch_prediction_options['model_prefix'], name, i, repeat)
            print('---\nModel used: ', model_file)
            model_fullname_list.append(model_fullname)
            repetition_list.append(repeat)
            # Destination for this run's individual prediction output,
            # or None when individual outputs are not being kept
            if batch_prediction_options['save_individual_predictions']:
                predictions_save_name = '{}/{}_{}_Prediction.csv'.format(results_path_individual, model_file, batch_prediction_options['dataset_name'][dataset_number])
            else:
                predictions_save_name = None
            # Loads the saved model and runs the prediction
            predict_time = time.time()
            predictions = runPrediction(prediction_data, model_file, verbose = batch_prediction_options['verbose_prediction'],
                                        predictions_save_name = predictions_save_name, comparisons = comparisons)
            prediction_times_list.append(round((time.time() - predict_time)/60, 2)) # Prediction time in minutes
            # Get the metrics of the prediction results
            metrics = calculateMetrics(predictions, info_df, comparisons, calculate_f1 = calculate_f1_metric,
                                       calculate_auc = calculate_auc_metric, calculate_average_precision = calculate_average_precision_metric,
                                       calculate_for_components = calculate_metrics_for_components, print_metrics = True)
            # Modify the returned metrics to include nan values for peak encoder (so the columns line up between different model variants);
            # the slice insert at metric_len * 2 places the NaNs in the peak slot, after the overall and mass metrics
            if calculate_metrics_for_components and ignore_peak_profile:
                metrics[metric_len * 2 : metric_len * 2] = [np.nan] * metric_len
            metrics_list.append(metrics)
            sys.stdout.flush()
            # Rebuild the full summary dataframe from the accumulators and save it
            # after every prediction, so progress survives an interrupted run
            df = pd.DataFrame(metrics_list, columns = metric_names)
            df['Prediction Times'] = prediction_times_list
            df['Model Name'] = model_fullname_list
            df['Repetition'] = repetition_list
            df.to_csv(save_name) # Saves the summary output after every prediction
if len(sys.argv) == 2: # No selection of model repeat
    # Saves a mean output at the end (averaged over the model repetitions);
    # assumes save_name ends in '.csv' when building the '_Mean.csv' name
    df.groupby('Model Name').mean().to_csv(save_name[:-4] + '_Mean.csv')