v0.5: speed up load_ms by numba index; add deep_big cut before fast fdr; remove gc.collect()
YuAirLab committed Oct 15, 2024
1 parent 4701ea0 commit 87d3c97
Showing 19 changed files with 181 additions and 81 deletions.
4 changes: 3 additions & 1 deletion README.md
@@ -119,4 +119,6 @@ optimize the proteome profiling of diaPASEF mass spectrometry data**
 #### 0.3.0
 * FEAT: refactor code to speed up
 #### 0.4.0
-* FEAT: use model_ys; select locus with x == 1 or deep == 1
+* FEAT: use model_ys; select locus with x == 1 or deep == 1
+#### 0.5.0
+* FEAT: speed up load_ms by numba index; add deep_big cut before fast fdr
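The load_ms speed-up itself lands in beta_dia/dist/main.py, whose diff is not rendered below. As a hedged illustration only — every name here is hypothetical, not from the commit — a numba-compiled, CSR-style scan index of the kind the changelog hints at could look like this:

```python
import numpy as np
from numba import njit

@njit(nogil=True)
def build_scan_index(scan_ids, n_scans):
    # one counting pass over the peak table -> CSR-style offsets;
    # the peaks of scan i then live at rows offsets[i]:offsets[i + 1]
    # (assumes scan_ids is sorted, i.e. peaks arrive scan by scan)
    counts = np.zeros(n_scans + 1, dtype=np.int64)
    for s in scan_ids:
        counts[s + 1] += 1
    return np.cumsum(counts)

scan_ids = np.array([0, 0, 1, 2, 2, 2], dtype=np.int64)
print(build_scan_index(scan_ids, 3))  # [0 2 3 6]
```

Compiled indexing of this kind avoids per-scan Python-level filtering, which is the usual reason a numba index makes loading faster.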
2 changes: 1 addition & 1 deletion beta_dia/__init__.py
@@ -1 +1 @@
-__version__ = '0.4.0'
+__version__ = '0.5.0'
9 changes: 6 additions & 3 deletions beta_dia/calib.py
@@ -186,6 +186,7 @@ def update_info_im(df_tol, df_lib):
     return df_tol, df_lib


+@profile
 def update_info_mz(df_seed, ms):
     '''
     Calib m/z, update the measure m/z
@@ -210,7 +211,7 @@ def update_info_mz(df_seed, ms):
     frac = 0.1
     y_lowess = lowess(y, x, frac=frac)
     x_fit, y_fit = zip(*y_lowess)
-    x_fit, y_fit = np.array(x_fit), np.array(y_fit)
+    x_fit, y_fit = np.array(x_fit, dtype=np.float32), np.array(y_fit, dtype=np.float32)

     f = interp1d(x_fit, y_fit, kind='cubic', fill_value='extrapolate')

@@ -251,8 +252,10 @@ def update_info_mz(df_seed, ms):
         cycle_valid_lens2, all_push2, all_tof2, all_height2
     ) = ms_map

-    all_tof = f(all_tof).astype(np.float32)
-    all_tof2 = f(all_tof2).astype(np.float32)
+    all_tof = f(all_tof)
+    all_tof = all_tof.astype(np.float32)
+    all_tof2 = f(all_tof2)
+    all_tof2 = all_tof2.astype(np.float32)

     ms_map = (all_rt,
               cycle_valid_lens, all_push, all_tof, all_height,
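For context, the calibration pattern in this function — statsmodels lowess feeding a cubic interp1d with extrapolation — looks like this as standalone code. The data here are synthetic, and the float32 casts mirror the diff (lowess and interp1d both return float64):

```python
import numpy as np
from scipy.interpolate import interp1d
from statsmodels.nonparametric.smoothers_lowess import lowess

# toy calibration pairs: measured value -> value with a smooth systematic bias
x = np.linspace(300.0, 1200.0, 200).astype(np.float32)
y = x + 0.002 * np.sin(x / 100.0)  # synthetic smooth bias

y_lowess = lowess(y, x, frac=0.1)          # (n, 2) array: sorted x, fitted y
x_fit = y_lowess[:, 0].astype(np.float32)
y_fit = y_lowess[:, 1].astype(np.float32)

f = interp1d(x_fit, y_fit, kind='cubic', fill_value='extrapolate')

all_tof = np.array([350.5, 800.1, 1190.9], dtype=np.float32)
all_tof = f(all_tof)                       # interp1d promotes to float64
all_tof = all_tof.astype(np.float32)       # cast back, as the new hunk does
```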
17 changes: 11 additions & 6 deletions beta_dia/deepmap.py
@@ -304,24 +304,28 @@ def extract_maps(df_batch,

     # params
     if neutron_num == -1:
-        query_mz_ms1 = df_batch[['pr_mz_left', 'pr_mz_left']].values
+        query_mz_ms1 = df_batch['pr_mz_left'].values
+        query_mz_ms1 = np.tile(query_mz_ms1, (2, 1)).T
         query_mz_ms2 = np.array(df_batch['fg_mz_left'].values.tolist())
         query_mz_m = np.concatenate([query_mz_ms1, query_mz_ms2], axis=1)
         ms1_ion_num = 1
     elif neutron_num == 0:
-        query_mz_ms1 = df_batch[['pr_mz', 'pr_mz']].values
+        query_mz_ms1 = df_batch['pr_mz'].values
+        query_mz_ms1 = np.tile(query_mz_ms1, (2, 1)).T
         cols_center = ['fg_mz_' + str(i) for i in range(param_g.fg_num)]
         query_mz_ms2 = df_batch[cols_center].values
         query_mz_m = np.concatenate([query_mz_ms1, query_mz_ms2], axis=1)
         ms1_ion_num = 1
     elif neutron_num == 1:
-        query_mz_ms1 = df_batch[['pr_mz_1H', 'pr_mz_1H']].values
+        query_mz_ms1 = df_batch['pr_mz_1H'].values
+        query_mz_ms1 = np.tile(query_mz_ms1, (2, 1)).T
         cols_1H = ['fg_mz_1H_' + str(i) for i in range(param_g.fg_num)]
         query_mz_ms2 = df_batch[cols_1H].values
         query_mz_m = np.concatenate([query_mz_ms1, query_mz_ms2], axis=1)
         ms1_ion_num = 1
     elif neutron_num == 2:
-        query_mz_ms1 = df_batch[['pr_mz_2H', 'pr_mz_2H']].values
+        query_mz_ms1 = df_batch['pr_mz_2H'].values
+        query_mz_ms1 = np.tile(query_mz_ms1, (2, 1)).T
         cols_2H = ['fg_mz_2H_' + str(i) for i in range(param_g.fg_num)]
         query_mz_ms2 = df_batch[cols_2H].values
         query_mz_m = np.concatenate([query_mz_ms1, query_mz_ms2], axis=1)
@@ -463,7 +467,7 @@ def scoring_maps(
             feature = feature.numpy()
             feature_v.append(feature)

-    pred = torch.cat(pred_v)
+    pred = torch.cat(pred_v).to(dtype=torch.float32)  # torch autocast to 16
     if return_feature:
         feature = np.vstack(feature_v)
     else:
@@ -579,8 +583,9 @@ def extract_scoring_big(
         with torch.no_grad():
             with torch.cuda.amp.autocast():
                 feature, pred = model(maps_sub, valid_ion_nums)
+        torch.cuda.synchronize()
         pred = torch.softmax(pred, 1)
-        pred = pred[:, 1].cpu().numpy()
+        pred = pred[:, 1].cpu().numpy().astype(np.float32)
         feature = feature.cpu().numpy()
         pred_v.append(pred)
         feature_v.append(feature)
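Under torch.cuda.amp.autocast() the model output is typically float16, which is why both hunks above cast predictions back to float32 before NumPy sees them. A minimal sketch of the pattern — the model here is a hypothetical stand-in, and a CUDA device is required:

```python
import numpy as np
import torch

model = torch.nn.Linear(8, 2).cuda()      # stand-in for the deep_big model
maps_sub = torch.randn(4, 8, device='cuda')

with torch.no_grad():
    with torch.cuda.amp.autocast():
        pred = model(maps_sub)             # float16 under autocast
torch.cuda.synchronize()                   # wait for the async GPU kernels
pred = torch.softmax(pred, 1)
pred = pred[:, 1].cpu().numpy().astype(np.float32)
assert pred.dtype == np.float32
```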
4 changes: 2 additions & 2 deletions beta_dia/dist/main.py

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions beta_dia/dist/main_core.py

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion beta_dia/dist/pyarmor_runtime_000000/__init__.py
@@ -1,4 +1,4 @@
-# Pyarmor 8.5.11 (trial), 000000, 2024-09-30T22:26:20.124320
+# Pyarmor 8.5.11 (trial), 000000, 2024-10-15T17:35:23.857876
 def __pyarmor__():
     import platform
     import sys
Binary file not shown.
Binary file not shown.
17 changes: 11 additions & 6 deletions beta_dia/fdr.py
@@ -277,22 +277,27 @@ def cal_q_pr_first(df, batch_size, n_model, model_trained=None, scaler=None):
     else:
         X = scaler.transform(X)

-    # train on group_rank == 1
+    # train
     group_rank_max = df['group_rank'].max()
-    if group_rank_max > 1:
-        train_idx = df['group_rank'] == 1
+    if (model_trained is None) and (group_rank_max > 1):
+        decoy_deeps = df.loc[df['decoy'] == 1, 'score_big_deep_pre'].values
+        decoy_m, decoy_u = np.mean(decoy_deeps), np.std(decoy_deeps)
+        good_cut = decoy_m + 3 * decoy_u
+        logger.info(f'Training with good_big_score: {good_cut:.2f}')
+        train_idx = (df['group_rank'] == 1) & (df['score_big_deep_pre'] > good_cut)
         X_train = X[train_idx]
         y_train = y[train_idx]
     else:
         X_train = X
         y_train = y
+
     n_pos, n_neg = sum(y_train == 1), sum(y_train == 0)
     info = 'Training the model: {} pos, {} neg'.format(n_pos, n_neg)
     logger.info(info)

     # models
     if model_trained is None:
-        param = (20, 10, 5)
+        param = (25, 20, 15, 10, 5)
         mlps = [MLPClassifier(max_iter=1,
                               shuffle=True,
                               random_state=i,  # init weights and shuffle
@@ -304,7 +309,7 @@ def cal_q_pr_first(df, batch_size, n_model, model_trained=None, scaler=None):
         names = [f'mlp{i}' for i in range(n_model)]
         model = VotingClassifier(estimators=list(zip(names, mlps)),
                                  voting='soft',
-                                 n_jobs=1 if __debug__ else 12)
+                                 n_jobs=1 if __debug__ else n_model)
         model.fit(X_train, y_train)
         cscore = model.predict_proba(X)[:, 1]
     else:
@@ -318,7 +323,7 @@ def cal_q_pr_first(df, batch_size, n_model, model_trained=None, scaler=None):
     group_size_cumsum = np.concatenate([[0], np.cumsum(group_size)])
     group_rank = utils.cal_group_rank(df.cscore_pr.values, group_size_cumsum)
     df['group_rank'] = group_rank
-    df = df[df['group_rank'] == 1].reset_index(drop=True)
+    df = df.loc[group_rank == 1]

     df = cal_q_pr_core(df, score_col='cscore_pr')

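The new training-set selection is the "deep_big cut before fast fdr" from the commit title: among group_rank == 1 candidates, only those whose score_big_deep_pre clears a decoy-derived threshold (decoy mean plus three standard deviations) are used to fit the classifier. The same cut as a standalone sketch on toy scores:

```python
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
df = pd.DataFrame({
    'decoy': rng.integers(0, 2, 1000),
    'group_rank': rng.integers(1, 4, 1000),
    'score_big_deep_pre': rng.normal(0.3, 0.1, 1000),
})

decoy_deeps = df.loc[df['decoy'] == 1, 'score_big_deep_pre'].values
good_cut = np.mean(decoy_deeps) + 3 * np.std(decoy_deeps)

# train only on top-ranked candidates that clear the decoy-based cut
train_idx = (df['group_rank'] == 1) & (df['score_big_deep_pre'] > good_cut)
print(f'good_big_score cut {good_cut:.2f}: {train_idx.sum()} training rows')
```

Since decoys are false by construction, a target scoring three standard deviations above the decoy mean is very likely genuine, which keeps label noise out of the training set.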
2 changes: 1 addition & 1 deletion beta_dia/fxic.py
@@ -583,7 +583,7 @@ def screen_locus_by_deep(df_batch, locus_num, top_deep_q):
     '''
     group_size_cumsum = np.concatenate([[0], np.cumsum(locus_num)])
     group_rank_deep = utils.cal_group_rank(
-        df_batch['seek_score_deep'].values.astype(np.float32), group_size_cumsum
+        df_batch['seek_score_deep'].values, group_size_cumsum
     )
     group_rank_x = utils.cal_group_rank(
         df_batch['seek_score_sa_x_deep'].values, group_size_cumsum
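utils.cal_group_rank is not shown in this commit. Assuming it ranks scores in descending order within the contiguous groups delimited by group_size_cumsum (1 = best), a rough NumPy stand-in behaves like this:

```python
import numpy as np

def group_rank_numpy(scores, group_size_cumsum):
    # hypothetical stand-in for utils.cal_group_rank: rank descending
    # within each contiguous [lo, hi) block, 1 = best
    rank = np.empty(len(scores), dtype=np.int64)
    for lo, hi in zip(group_size_cumsum[:-1], group_size_cumsum[1:]):
        order = np.argsort(-scores[lo:hi])
        rank[lo:hi][order] = np.arange(1, hi - lo + 1)
    return rank

locus_num = np.array([3, 2])  # candidate loci per precursor
group_size_cumsum = np.concatenate([[0], np.cumsum(locus_num)])
scores = np.array([0.2, 0.9, 0.5, 0.1, 0.7], dtype=np.float32)
print(group_rank_numpy(scores, group_size_cumsum))  # [3 1 2 2 1]
```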
2 changes: 2 additions & 0 deletions beta_dia/library.py
@@ -274,6 +274,8 @@ def read_entry_worker(binary_data, block_positions, block_label, worker_i):
         'fg_num': fg_num_v,
     })
     assert sum(fg_loss_v) == 0, 'DIA-NN .speclib has fg_loss type!'
+    assert len(df) == len(df.drop_duplicates(['pr_id', 'pr_index']))
+    assert len(df) == df['pr_id'].nunique() == df['pr_index'].nunique()

     # unify to top-12,fg_anno code:y15_2 --> 2152
     fg_mz_v = np.array(fg_mz_v, dtype=np.float32)
Binary file modified beta_dia/pretrained/deepbig_ys.pt
Binary file not shown.
Binary file modified beta_dia/pretrained/deepcenter_ys.pt
Binary file not shown.
3 changes: 1 addition & 2 deletions beta_dia/quant.py
@@ -172,8 +172,7 @@ def quant_pr(df, ms):

         df_good.append(df_batch)

-    utils.release_gpu_scans(ms1_centroid)
-    utils.release_gpu_scans(ms2_centroid)
+    utils.release_gpu_scans(ms1_centroid, ms2_centroid)

     df = pd.concat(df_good, axis=0, ignore_index=True)

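This hunk, and the matching ones in refine.py and scoring.py below, change utils.release_gpu_scans from one call per container to a single varargs call. The helper's body is not part of this diff; a hypothetical sketch of the new signature, assuming a torch-style cache release:

```python
import torch

def release_gpu_scans(*scan_groups):
    # hypothetical sketch only: the real cleanup lives in beta_dia.utils,
    # which this commit does not show
    for scans in scan_groups:
        del scans  # drop this function's reference to each container
    torch.cuda.empty_cache()  # return cached blocks to the CUDA driver

# old: release_gpu_scans(ms1_centroid); release_gpu_scans(ms2_centroid)
# new: release_gpu_scans(ms1_centroid, ms2_centroid)
```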
3 changes: 1 addition & 2 deletions beta_dia/refine.py
@@ -160,8 +160,7 @@ def extract_map_by_compare(df_top, ms):
             param_g.tol_ppm,
         )
         mall_v.append(mall.cpu().numpy())
-    utils.release_gpu_scans(ms1_profile)
-    utils.release_gpu_scans(ms2_profile)
+    utils.release_gpu_scans(ms1_profile, ms2_profile)

     maps_center = np.vstack(maps_center_v)
     maps_big = np.vstack(maps_big_v)
123 changes: 93 additions & 30 deletions beta_dia/scoring.py
@@ -1,7 +1,7 @@
 import numpy as np
 import pandas as pd
 import torch
-from numba import cuda
+from numba import cuda, jit

 from beta_dia import deepmall
 from beta_dia import deepmap
@@ -34,6 +34,7 @@ def score_locus(df_target, ms, model_center, model_big):

     batch_n = param_g.batch_deep_big

+    # may split two loci that belong to one pr
     for batch_idx, df_batch in df_swath.groupby(df_swath.index // batch_n):
         df_batch = df_batch.reset_index(drop=True)
         # deep scores and deep features
@@ -97,18 +98,17 @@ def score_locus(df_target, ms, model_center, model_big):
             df_batch = scoring_center_im(df_batch, ims_v[1])
             # mz
             df_batch = scoring_center_mz(df_batch, mzs_v[1])
-            # competitive
-            df_batch = scoring_putatives(df_batch)
             # cross scores
             df_batch = scoring_by_cross(df_batch)

             df_good.append(df_batch)
-        utils.release_gpu_scans(ms1_profile)
-        utils.release_gpu_scans(ms2_profile)
-        utils.release_gpu_scans(ms1_centroid)
-        utils.release_gpu_scans(ms2_centroid)

+        utils.release_gpu_scans(
+            ms1_profile, ms2_profile, ms1_centroid, ms2_centroid
+        )

     df = pd.concat(df_good, axis=0, ignore_index=True)
+    df = scoring_putatives(df)  # competitive for two loci from a pr
     df = scoring_meta(df)  # meta scores
     return df

@@ -131,12 +131,12 @@ def scoring_by_deep(df_batch, scores_deep_v, x):
 @profile
 def scoring_by_ft(df_batch, features_deep_v, x):
     # x: ['pre', 'refine_p1', 'refine_p2']
-    features = [x for x in features_deep_v if x is not None]
-    features = np.concatenate(features, axis=1)
-
-    m = features.shape[-1]
-    columns = [f'score_ft_deep_{x}_{i}' for i in range(m)]
-    df_batch[columns] = features
+    owned = 0
+    for features in features_deep_v:
+        m = features.shape[-1]
+        columns = [f'score_ft_deep_{x}_{i}' for i in range(owned, owned + m)]
+        df_batch[columns] = features
+        owned += m

     return df_batch
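The rewritten scoring_by_ft assigns each feature block to its own slice of columns as it iterates, with the owned offset keeping the column numbering continuous, instead of concatenating all blocks into one matrix first. The same pattern on toy data:

```python
import numpy as np
import pandas as pd

df_batch = pd.DataFrame({'pr_id': ['a', 'b']})
features_deep_v = [np.ones((2, 3), dtype=np.float32),
                   np.zeros((2, 2), dtype=np.float32)]

x = 'pre'
owned = 0
for features in features_deep_v:
    m = features.shape[-1]
    columns = [f'score_ft_deep_{x}_{i}' for i in range(owned, owned + m)]
    df_batch[columns] = features
    owned += m

print(df_batch.columns.tolist())
# ['pr_id', 'score_ft_deep_pre_0', ... 'score_ft_deep_pre_4']
```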

@@ -511,27 +511,91 @@ def scoring_meta(df):
     return df


-def scoring_putatives(df_batch):
+@jit(nopython=True, nogil=True)
+def numba_scoring_putatives(groups, sa_v, center_v, big_v):
+    result_sa_max = np.empty(len(groups), dtype=sa_v.dtype)
+    result_sa_sum = np.empty(len(groups), dtype=sa_v.dtype)
+    result_center_max = np.empty(len(groups), dtype=sa_v.dtype)
+    result_center_sum = np.empty(len(groups), dtype=sa_v.dtype)
+    result_big_max = np.empty(len(groups), dtype=sa_v.dtype)
+    result_big_sum = np.empty(len(groups), dtype=sa_v.dtype)
+
+    current_group = groups[0]
+    sa_max = sa_v[0]
+    sa_sum = sa_v[0]
+    center_max = center_v[0]
+    center_sum = center_v[0]
+    big_max = big_v[0]
+    big_sum = big_v[0]
+
+    start_idx = 0
+
+    for i in range(1, len(groups)):
+        if groups[i] != current_group:
+            for j in range(start_idx, i):
+                result_sa_max[j] = sa_max
+                result_center_max[j] = center_max
+                result_big_max[j] = big_max
+                result_sa_sum[j] = sa_sum
+                result_center_sum[j] = center_sum
+                result_big_sum[j] = big_sum
+
+            current_group = groups[i]
+            sa_max = sa_v[i]
+            sa_sum = sa_v[i]
+            center_max = center_v[i]
+            center_sum = center_v[i]
+            big_max = big_v[i]
+            big_sum = big_v[i]
+            start_idx = i
+        else:
+            sa_max = max(sa_max, sa_v[i])
+            center_max = max(center_max, center_v[i])
+            big_max = max(big_max, big_v[i])
+
+            sa_sum += sa_v[i]
+            center_sum += center_v[i]
+            big_sum += big_v[i]
+
+    for j in range(start_idx, len(groups)):
+        result_sa_max[j] = sa_max
+        result_center_max[j] = center_max
+        result_big_max[j] = big_max
+        result_sa_sum[j] = sa_sum
+        result_center_sum[j] = center_sum
+        result_big_sum[j] = big_sum
+
+    return (result_sa_max, result_center_max, result_big_max,
+            result_sa_sum, result_center_sum, result_big_sum)
+
+
+@profile
+def scoring_putatives(df):
     '''
     If a pr has multiple candidate elution groups, calculate their bias:
     1) score_i - score_max
     2) np.log(score_i / score_sum)
     '''
-    for col in ['score_center_coelution',
-                'score_center_deep_pre',
-                'score_big_deep_pre']:
-
-        scores1 = df_batch[col]
+    a = 1e-7
+
+    pr_index_v = df['pr_index'].values
+    sa_v = df['score_center_coelution'].values
+    center_v = df['score_center_deep_pre'].values
+    big_v = df['score_big_deep_pre'].values
+    (sa_max_v, center_max_v, big_max_v,
+     sa_sum_v, center_sum_v, big_sum_v) = numba_scoring_putatives(
+        pr_index_v, sa_v, center_v, big_v
+    )
+    df['score_center_coelution_putative1'] = sa_v - sa_max_v
+    df['score_center_coelution_putative2'] = np.log(sa_v + a) / (sa_sum_v + a)

-        group_max = df_batch.groupby('pr_id')[col].transform('max')
-        scores2 = scores1 - group_max
-        df_batch[col + '_putative1'] = scores2
+    df['score_center_deep_pre_putative1'] = center_v - center_max_v
+    df['score_center_deep_pre_putative2'] = np.log(center_v + a) / (center_sum_v + a)

-        group_sum = df_batch.groupby('pr_id')[col].transform('sum')
-        scores2 = np.log((scores1 + 1e-7) / (group_sum + 1e-7))
-        df_batch[col + '_putative2'] = scores2
+    df['score_big_deep_pre_putative1'] = big_v - big_max_v
+    df['score_big_deep_pre_putative2'] = np.log(big_v + a) / (big_sum_v + a)

-    return df_batch
+    return df
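numba_scoring_putatives detects group boundaries with a single forward scan, so it assumes rows sharing a pr_index are contiguous (e.g. the frame is sorted or grouped by pr_index). A toy call of the function defined above:

```python
import numpy as np

groups = np.array([0, 0, 1, 1, 1], dtype=np.int64)  # contiguous pr_index
sa = np.array([0.2, 0.8, 0.1, 0.5, 0.4], dtype=np.float32)

(sa_max, _, _, sa_sum, _, _) = numba_scoring_putatives(groups, sa, sa, sa)
print(sa_max)  # [0.8 0.8 0.5 0.5 0.5]
print(sa_sum)  # ~[1.0 1.0 1.0 1.0 1.0]
```

Compared with the groupby-transform loop it replaces, the compiled single pass touches each row a constant number of times and avoids pandas group machinery entirely.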


def scoring_by_cross(df_batch, is_update=False):
@@ -637,10 +701,9 @@ def update_scores(df, ms, model_center, model_big, model_mall):

         df_good.append(df_batch)

-    utils.release_gpu_scans(ms1_profile)
-    utils.release_gpu_scans(ms2_profile)
-    utils.release_gpu_scans(ms1_centroid)
-    utils.release_gpu_scans(ms2_centroid)
+    utils.release_gpu_scans(
+        ms1_profile, ms2_profile, ms1_centroid, ms2_centroid
+    )

     df = pd.concat(df_good, axis=0, ignore_index=True)
     utils.cal_acc_recall(param_g.ws, df[df['decoy'] == 0], diann_q_pr=0.01)
