v0.5: speed up load_ms by numba index; add deep_big cut before fast fdr; remove gc.collect()
YuAirLab committed Oct 15, 2024
1 parent 4701ea0 commit 87d3c97
Showing 19 changed files with 181 additions and 81 deletions.
4 changes: 3 additions & 1 deletion README.md
@@ -119,4 +119,6 @@ optimize the proteome profiling of diaPASEF mass spectrometry data**
 #### 0.3.0
 * FEAT: refactor code to speed up
 #### 0.4.0
-* FEAT: use model_ys; select locus with x == 1 or deep == 1
+* FEAT: use model_ys; select locus with x == 1 or deep == 1
+#### 0.5.0
+* FEAT: speed up load_ms by numba index; add deep_big cut before fast fdr
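The load_ms speed-up itself lands in beta_dia/dist/main.py, whose diff is not rendered below. As a hedged illustration only — every name here is hypothetical, not from the commit — a numba-compiled, CSR-style scan index of the kind the changelog hints at could look like this:

```python
import numpy as np
from numba import njit

@njit(nogil=True)
def build_scan_index(scan_ids, n_scans):
    # one counting pass over the peak table -> CSR-style offsets;
    # the peaks of scan i then live at rows offsets[i]:offsets[i + 1]
    # (assumes scan_ids is sorted, i.e. peaks arrive scan by scan)
    counts = np.zeros(n_scans + 1, dtype=np.int64)
    for s in scan_ids:
        counts[s + 1] += 1
    return np.cumsum(counts)

scan_ids = np.array([0, 0, 1, 2, 2, 2], dtype=np.int64)
print(build_scan_index(scan_ids, 3))  # [0 2 3 6]
```

Compiled indexing of this kind avoids per-scan Python-level filtering, which is the usual reason a numba index makes loading faster.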
2 changes: 1 addition & 1 deletion beta_dia/__init__.py
@@ -1 +1 @@
-__version__ = '0.4.0'
+__version__ = '0.5.0'
9 changes: 6 additions & 3 deletions beta_dia/calib.py
@@ -186,6 +186,7 @@ def update_info_im(df_tol, df_lib):
     return df_tol, df_lib


+@profile
 def update_info_mz(df_seed, ms):
     '''
     Calib m/z, update the measure m/z
@@ -210,7 +211,7 @@ def update_info_mz(df_seed, ms):
     frac = 0.1
     y_lowess = lowess(y, x, frac=frac)
     x_fit, y_fit = zip(*y_lowess)
-    x_fit, y_fit = np.array(x_fit), np.array(y_fit)
+    x_fit, y_fit = np.array(x_fit, dtype=np.float32), np.array(y_fit, dtype=np.float32)

     f = interp1d(x_fit, y_fit, kind='cubic', fill_value='extrapolate')

@@ -251,8 +252,10 @@ def update_info_mz(df_seed, ms):
         cycle_valid_lens2, all_push2, all_tof2, all_height2
     ) = ms_map

-    all_tof = f(all_tof).astype(np.float32)
-    all_tof2 = f(all_tof2).astype(np.float32)
+    all_tof = f(all_tof)
+    all_tof = all_tof.astype(np.float32)
+    all_tof2 = f(all_tof2)
+    all_tof2 = all_tof2.astype(np.float32)

     ms_map = (all_rt,
               cycle_valid_lens, all_push, all_tof, all_height,
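For context, the calibration pattern in this function — statsmodels lowess feeding a cubic interp1d with extrapolation — looks like this as standalone code. The data here are synthetic, and the float32 casts mirror the diff (lowess and interp1d both return float64):

```python
import numpy as np
from scipy.interpolate import interp1d
from statsmodels.nonparametric.smoothers_lowess import lowess

# toy calibration pairs: measured value -> value with a smooth systematic bias
x = np.linspace(300.0, 1200.0, 200).astype(np.float32)
y = x + 0.002 * np.sin(x / 100.0)  # synthetic smooth bias

y_lowess = lowess(y, x, frac=0.1)          # (n, 2) array: sorted x, fitted y
x_fit = y_lowess[:, 0].astype(np.float32)
y_fit = y_lowess[:, 1].astype(np.float32)

f = interp1d(x_fit, y_fit, kind='cubic', fill_value='extrapolate')

all_tof = np.array([350.5, 800.1, 1190.9], dtype=np.float32)
all_tof = f(all_tof)                       # interp1d promotes to float64
all_tof = all_tof.astype(np.float32)       # cast back, as the new hunk does
```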
17 changes: 11 additions & 6 deletions beta_dia/deepmap.py
@@ -304,24 +304,28 @@ def extract_maps(df_batch,

     # params
     if neutron_num == -1:
-        query_mz_ms1 = df_batch[['pr_mz_left', 'pr_mz_left']].values
+        query_mz_ms1 = df_batch['pr_mz_left'].values
+        query_mz_ms1 = np.tile(query_mz_ms1, (2, 1)).T
         query_mz_ms2 = np.array(df_batch['fg_mz_left'].values.tolist())
         query_mz_m = np.concatenate([query_mz_ms1, query_mz_ms2], axis=1)
         ms1_ion_num = 1
     elif neutron_num == 0:
-        query_mz_ms1 = df_batch[['pr_mz', 'pr_mz']].values
+        query_mz_ms1 = df_batch['pr_mz'].values
+        query_mz_ms1 = np.tile(query_mz_ms1, (2, 1)).T
         cols_center = ['fg_mz_' + str(i) for i in range(param_g.fg_num)]
         query_mz_ms2 = df_batch[cols_center].values
         query_mz_m = np.concatenate([query_mz_ms1, query_mz_ms2], axis=1)
         ms1_ion_num = 1
     elif neutron_num == 1:
-        query_mz_ms1 = df_batch[['pr_mz_1H', 'pr_mz_1H']].values
+        query_mz_ms1 = df_batch['pr_mz_1H'].values
+        query_mz_ms1 = np.tile(query_mz_ms1, (2, 1)).T
         cols_1H = ['fg_mz_1H_' + str(i) for i in range(param_g.fg_num)]
         query_mz_ms2 = df_batch[cols_1H].values
         query_mz_m = np.concatenate([query_mz_ms1, query_mz_ms2], axis=1)
         ms1_ion_num = 1
     elif neutron_num == 2:
-        query_mz_ms1 = df_batch[['pr_mz_2H', 'pr_mz_2H']].values
+        query_mz_ms1 = df_batch['pr_mz_2H'].values
+        query_mz_ms1 = np.tile(query_mz_ms1, (2, 1)).T
         cols_2H = ['fg_mz_2H_' + str(i) for i in range(param_g.fg_num)]
         query_mz_ms2 = df_batch[cols_2H].values
         query_mz_m = np.concatenate([query_mz_ms1, query_mz_ms2], axis=1)
@@ -463,7 +467,7 @@ def scoring_maps(
             feature = feature.numpy()
             feature_v.append(feature)

-    pred = torch.cat(pred_v)
+    pred = torch.cat(pred_v).to(dtype=torch.float32)  # torch autocast to 16
     if return_feature:
         feature = np.vstack(feature_v)
     else:
@@ -579,8 +583,9 @@ def extract_scoring_big(
         with torch.no_grad():
             with torch.cuda.amp.autocast():
                 feature, pred = model(maps_sub, valid_ion_nums)
+        torch.cuda.synchronize()
         pred = torch.softmax(pred, 1)
-        pred = pred[:, 1].cpu().numpy()
+        pred = pred[:, 1].cpu().numpy().astype(np.float32)
         feature = feature.cpu().numpy()
         pred_v.append(pred)
         feature_v.append(feature)
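Under torch.cuda.amp.autocast() the model output is typically float16, which is why both hunks above cast predictions back to float32 before NumPy sees them. A minimal sketch of the pattern — the model here is a hypothetical stand-in, and a CUDA device is required:

```python
import numpy as np
import torch

model = torch.nn.Linear(8, 2).cuda()      # stand-in for the deep_big model
maps_sub = torch.randn(4, 8, device='cuda')

with torch.no_grad():
    with torch.cuda.amp.autocast():
        pred = model(maps_sub)             # float16 under autocast
torch.cuda.synchronize()                   # wait for the async GPU kernels
pred = torch.softmax(pred, 1)
pred = pred[:, 1].cpu().numpy().astype(np.float32)
assert pred.dtype == np.float32
```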
4 changes: 2 additions & 2 deletions beta_dia/dist/main.py

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions beta_dia/dist/main_core.py

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion beta_dia/dist/pyarmor_runtime_000000/__init__.py
@@ -1,4 +1,4 @@
-# Pyarmor 8.5.11 (trial), 000000, 2024-09-30T22:26:20.124320
+# Pyarmor 8.5.11 (trial), 000000, 2024-10-15T17:35:23.857876
 def __pyarmor__():
     import platform
     import sys
Binary file not shown.
Binary file not shown.
17 changes: 11 additions & 6 deletions beta_dia/fdr.py
@@ -277,22 +277,27 @@ def cal_q_pr_first(df, batch_size, n_model, model_trained=None, scaler=None):
     else:
         X = scaler.transform(X)

-    # train on group_rank == 1
+    # train
     group_rank_max = df['group_rank'].max()
-    if group_rank_max > 1:
-        train_idx = df['group_rank'] == 1
+    if (model_trained is None) and (group_rank_max > 1):
+        decoy_deeps = df.loc[df['decoy'] == 1, 'score_big_deep_pre'].values
+        decoy_m, decoy_u = np.mean(decoy_deeps), np.std(decoy_deeps)
+        good_cut = decoy_m + 3 * decoy_u
+        logger.info(f'Training with good_big_score: {good_cut:.2f}')
+        train_idx = (df['group_rank'] == 1) & (df['score_big_deep_pre'] > good_cut)
         X_train = X[train_idx]
         y_train = y[train_idx]
     else:
         X_train = X
         y_train = y
+
     n_pos, n_neg = sum(y_train == 1), sum(y_train == 0)
     info = 'Training the model: {} pos, {} neg'.format(n_pos, n_neg)
     logger.info(info)

     # models
     if model_trained is None:
-        param = (20, 10, 5)
+        param = (25, 20, 15, 10, 5)
         mlps = [MLPClassifier(max_iter=1,
                               shuffle=True,
                               random_state=i,  # init weights and shuffle
@@ -304,7 +309,7 @@ def cal_q_pr_first(df, batch_size, n_model, model_trained=None, scaler=None):
         names = [f'mlp{i}' for i in range(n_model)]
         model = VotingClassifier(estimators=list(zip(names, mlps)),
                                  voting='soft',
-                                 n_jobs=1 if __debug__ else 12)
+                                 n_jobs=1 if __debug__ else n_model)
         model.fit(X_train, y_train)
         cscore = model.predict_proba(X)[:, 1]
     else:
@@ -318,7 +323,7 @@ def cal_q_pr_first(df, batch_size, n_model, model_trained=None, scaler=None):
     group_size_cumsum = np.concatenate([[0], np.cumsum(group_size)])
     group_rank = utils.cal_group_rank(df.cscore_pr.values, group_size_cumsum)
     df['group_rank'] = group_rank
-    df = df[df['group_rank'] == 1].reset_index(drop=True)
+    df = df.loc[group_rank == 1]

     df = cal_q_pr_core(df, score_col='cscore_pr')

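The new training-set selection is the "deep_big cut before fast fdr" from the commit title: among group_rank == 1 candidates, only those whose score_big_deep_pre clears a decoy-derived threshold (decoy mean plus three standard deviations) are used to fit the classifier. The same cut as a standalone sketch on toy scores:

```python
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
df = pd.DataFrame({
    'decoy': rng.integers(0, 2, 1000),
    'group_rank': rng.integers(1, 4, 1000),
    'score_big_deep_pre': rng.normal(0.3, 0.1, 1000),
})

decoy_deeps = df.loc[df['decoy'] == 1, 'score_big_deep_pre'].values
good_cut = np.mean(decoy_deeps) + 3 * np.std(decoy_deeps)

# train only on top-ranked candidates that clear the decoy-based cut
train_idx = (df['group_rank'] == 1) & (df['score_big_deep_pre'] > good_cut)
print(f'good_big_score cut {good_cut:.2f}: {train_idx.sum()} training rows')
```

Since decoys are false by construction, a target scoring three standard deviations above the decoy mean is very likely genuine, which keeps label noise out of the training set.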
2 changes: 1 addition & 1 deletion beta_dia/fxic.py
@@ -583,7 +583,7 @@ def screen_locus_by_deep(df_batch, locus_num, top_deep_q):
     '''
     group_size_cumsum = np.concatenate([[0], np.cumsum(locus_num)])
     group_rank_deep = utils.cal_group_rank(
-        df_batch['seek_score_deep'].values.astype(np.float32), group_size_cumsum
+        df_batch['seek_score_deep'].values, group_size_cumsum
     )
     group_rank_x = utils.cal_group_rank(
         df_batch['seek_score_sa_x_deep'].values, group_size_cumsum
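utils.cal_group_rank is not shown in this commit. Assuming it ranks scores in descending order within the contiguous groups delimited by group_size_cumsum (1 = best), a rough NumPy stand-in behaves like this:

```python
import numpy as np

def group_rank_numpy(scores, group_size_cumsum):
    # hypothetical stand-in for utils.cal_group_rank: rank descending
    # within each contiguous [lo, hi) block, 1 = best
    rank = np.empty(len(scores), dtype=np.int64)
    for lo, hi in zip(group_size_cumsum[:-1], group_size_cumsum[1:]):
        order = np.argsort(-scores[lo:hi])
        rank[lo:hi][order] = np.arange(1, hi - lo + 1)
    return rank

locus_num = np.array([3, 2])  # candidate loci per precursor
group_size_cumsum = np.concatenate([[0], np.cumsum(locus_num)])
scores = np.array([0.2, 0.9, 0.5, 0.1, 0.7], dtype=np.float32)
print(group_rank_numpy(scores, group_size_cumsum))  # [3 1 2 2 1]
```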
2 changes: 2 additions & 0 deletions beta_dia/library.py
@@ -274,6 +274,8 @@ def read_entry_worker(binary_data, block_positions, block_label, worker_i):
         'fg_num': fg_num_v,
     })
     assert sum(fg_loss_v) == 0, 'DIA-NN .speclib has fg_loss type!'
+    assert len(df) == len(df.drop_duplicates(['pr_id', 'pr_index']))
+    assert len(df) == df['pr_id'].nunique() == df['pr_index'].nunique()

     # unify to top-12,fg_anno code:y15_2 --> 2152
     fg_mz_v = np.array(fg_mz_v, dtype=np.float32)
Binary file modified beta_dia/pretrained/deepbig_ys.pt
Binary file not shown.
Binary file modified beta_dia/pretrained/deepcenter_ys.pt
Binary file not shown.
3 changes: 1 addition & 2 deletions beta_dia/quant.py
@@ -172,8 +172,7 @@ def quant_pr(df, ms):

         df_good.append(df_batch)

-    utils.release_gpu_scans(ms1_centroid)
-    utils.release_gpu_scans(ms2_centroid)
+    utils.release_gpu_scans(ms1_centroid, ms2_centroid)

     df = pd.concat(df_good, axis=0, ignore_index=True)

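This hunk, and the matching ones in refine.py and scoring.py below, change utils.release_gpu_scans from one call per container to a single varargs call. The helper's body is not part of this diff; a hypothetical sketch of the new signature, assuming a torch-style cache release:

```python
import torch

def release_gpu_scans(*scan_groups):
    # hypothetical sketch only: the real cleanup lives in beta_dia.utils,
    # which this commit does not show
    for scans in scan_groups:
        del scans  # drop this function's reference to each container
    torch.cuda.empty_cache()  # return cached blocks to the CUDA driver

# old: release_gpu_scans(ms1_centroid); release_gpu_scans(ms2_centroid)
# new: release_gpu_scans(ms1_centroid, ms2_centroid)
```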
3 changes: 1 addition & 2 deletions beta_dia/refine.py
@@ -160,8 +160,7 @@ def extract_map_by_compare(df_top, ms):
             param_g.tol_ppm,
         )
         mall_v.append(mall.cpu().numpy())
-    utils.release_gpu_scans(ms1_profile)
-    utils.release_gpu_scans(ms2_profile)
+    utils.release_gpu_scans(ms1_profile, ms2_profile)

     maps_center = np.vstack(maps_center_v)
     maps_big = np.vstack(maps_big_v)
123 changes: 93 additions & 30 deletions beta_dia/scoring.py
@@ -1,7 +1,7 @@
 import numpy as np
 import pandas as pd
 import torch
-from numba import cuda
+from numba import cuda, jit

 from beta_dia import deepmall
 from beta_dia import deepmap
@@ -34,6 +34,7 @@ def score_locus(df_target, ms, model_center, model_big):

     batch_n = param_g.batch_deep_big

+    # may split two loci that belong to one pr
     for batch_idx, df_batch in df_swath.groupby(df_swath.index // batch_n):
         df_batch = df_batch.reset_index(drop=True)
         # deep scores and deep features
@@ -97,18 +98,17 @@ def score_locus(df_target, ms, model_center, model_big):
             df_batch = scoring_center_im(df_batch, ims_v[1])
             # mz
             df_batch = scoring_center_mz(df_batch, mzs_v[1])
-            # competitive
-            df_batch = scoring_putatives(df_batch)
             # cross scores
             df_batch = scoring_by_cross(df_batch)

             df_good.append(df_batch)
-        utils.release_gpu_scans(ms1_profile)
-        utils.release_gpu_scans(ms2_profile)
-        utils.release_gpu_scans(ms1_centroid)
-        utils.release_gpu_scans(ms2_centroid)

+        utils.release_gpu_scans(
+            ms1_profile, ms2_profile, ms1_centroid, ms2_centroid
+        )

     df = pd.concat(df_good, axis=0, ignore_index=True)
+    df = scoring_putatives(df)  # competitive for two loci from a pr
     df = scoring_meta(df)  # meta scores
     return df

@@ -131,12 +131,12 @@ def scoring_by_deep(df_batch, scores_deep_v, x):
 @profile
 def scoring_by_ft(df_batch, features_deep_v, x):
     # x: ['pre', 'refine_p1', 'refine_p2']
-    features = [x for x in features_deep_v if x is not None]
-    features = np.concatenate(features, axis=1)
-
-    m = features.shape[-1]
-    columns = [f'score_ft_deep_{x}_{i}' for i in range(m)]
-    df_batch[columns] = features
+    owned = 0
+    for features in features_deep_v:
+        m = features.shape[-1]
+        columns = [f'score_ft_deep_{x}_{i}' for i in range(owned, owned + m)]
+        df_batch[columns] = features
+        owned += m

     return df_batch
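The rewritten scoring_by_ft assigns each feature block to its own slice of columns as it iterates, with the owned offset keeping the column numbering continuous, instead of concatenating all blocks into one matrix first. The same pattern on toy data:

```python
import numpy as np
import pandas as pd

df_batch = pd.DataFrame({'pr_id': ['a', 'b']})
features_deep_v = [np.ones((2, 3), dtype=np.float32),
                   np.zeros((2, 2), dtype=np.float32)]

x = 'pre'
owned = 0
for features in features_deep_v:
    m = features.shape[-1]
    columns = [f'score_ft_deep_{x}_{i}' for i in range(owned, owned + m)]
    df_batch[columns] = features
    owned += m

print(df_batch.columns.tolist())
# ['pr_id', 'score_ft_deep_pre_0', ... 'score_ft_deep_pre_4']
```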

@@ -511,27 +511,91 @@ def scoring_meta(df):
     return df


-def scoring_putatives(df_batch):
+@jit(nopython=True, nogil=True)
+def numba_scoring_putatives(groups, sa_v, center_v, big_v):
+    result_sa_max = np.empty(len(groups), dtype=sa_v.dtype)
+    result_sa_sum = np.empty(len(groups), dtype=sa_v.dtype)
+    result_center_max = np.empty(len(groups), dtype=sa_v.dtype)
+    result_center_sum = np.empty(len(groups), dtype=sa_v.dtype)
+    result_big_max = np.empty(len(groups), dtype=sa_v.dtype)
+    result_big_sum = np.empty(len(groups), dtype=sa_v.dtype)
+
+    current_group = groups[0]
+    sa_max = sa_v[0]
+    sa_sum = sa_v[0]
+    center_max = center_v[0]
+    center_sum = center_v[0]
+    big_max = big_v[0]
+    big_sum = big_v[0]
+
+    start_idx = 0
+
+    for i in range(1, len(groups)):
+        if groups[i] != current_group:
+            for j in range(start_idx, i):
+                result_sa_max[j] = sa_max
+                result_center_max[j] = center_max
+                result_big_max[j] = big_max
+                result_sa_sum[j] = sa_sum
+                result_center_sum[j] = center_sum
+                result_big_sum[j] = big_sum
+
+            current_group = groups[i]
+            sa_max = sa_v[i]
+            sa_sum = sa_v[i]
+            center_max = center_v[i]
+            center_sum = center_v[i]
+            big_max = big_v[i]
+            big_sum = big_v[i]
+            start_idx = i
+        else:
+            sa_max = max(sa_max, sa_v[i])
+            center_max = max(center_max, center_v[i])
+            big_max = max(big_max, big_v[i])
+
+            sa_sum += sa_v[i]
+            center_sum += center_v[i]
+            big_sum += big_v[i]
+
+    for j in range(start_idx, len(groups)):
+        result_sa_max[j] = sa_max
+        result_center_max[j] = center_max
+        result_big_max[j] = big_max
+        result_sa_sum[j] = sa_sum
+        result_center_sum[j] = center_sum
+        result_big_sum[j] = big_sum
+
+    return (result_sa_max, result_center_max, result_big_max,
+            result_sa_sum, result_center_sum, result_big_sum)
+
+
+@profile
+def scoring_putatives(df):
     '''
     If a pr has multiple candidate elution groups, calculate their bias:
     1) score_i - score_max
     2) np.log(score_i / score_sum)
     '''
-    for col in ['score_center_coelution',
-                'score_center_deep_pre',
-                'score_big_deep_pre']:
-
-        scores1 = df_batch[col]
+    a = 1e-7
+
+    pr_index_v = df['pr_index'].values
+    sa_v = df['score_center_coelution'].values
+    center_v = df['score_center_deep_pre'].values
+    big_v = df['score_big_deep_pre'].values
+    (sa_max_v, center_max_v, big_max_v,
+     sa_sum_v, center_sum_v, big_sum_v) = numba_scoring_putatives(
+        pr_index_v, sa_v, center_v, big_v
+    )
+    df['score_center_coelution_putative1'] = sa_v - sa_max_v
+    df['score_center_coelution_putative2'] = np.log(sa_v + a) / (sa_sum_v + a)

-        group_max = df_batch.groupby('pr_id')[col].transform('max')
-        scores2 = scores1 - group_max
-        df_batch[col + '_putative1'] = scores2
+    df['score_center_deep_pre_putative1'] = center_v - center_max_v
+    df['score_center_deep_pre_putative2'] = np.log(center_v + a) / (center_sum_v + a)

-        group_sum = df_batch.groupby('pr_id')[col].transform('sum')
-        scores2 = np.log((scores1 + 1e-7) / (group_sum + 1e-7))
-        df_batch[col + '_putative2'] = scores2
+    df['score_big_deep_pre_putative1'] = big_v - big_max_v
+    df['score_big_deep_pre_putative2'] = np.log(big_v + a) / (big_sum_v + a)

-    return df_batch
+    return df
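numba_scoring_putatives detects group boundaries with a single forward scan, so it assumes rows sharing a pr_index are contiguous (e.g. the frame is sorted or grouped by pr_index). A toy call of the function defined above:

```python
import numpy as np

groups = np.array([0, 0, 1, 1, 1], dtype=np.int64)  # contiguous pr_index
sa = np.array([0.2, 0.8, 0.1, 0.5, 0.4], dtype=np.float32)

(sa_max, _, _, sa_sum, _, _) = numba_scoring_putatives(groups, sa, sa, sa)
print(sa_max)  # [0.8 0.8 0.5 0.5 0.5]
print(sa_sum)  # ~[1.0 1.0 1.0 1.0 1.0]
```

Compared with the groupby-transform loop it replaces, the compiled single pass touches each row a constant number of times and avoids pandas group machinery entirely.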


def scoring_by_cross(df_batch, is_update=False):
@@ -637,10 +701,9 @@ def update_scores(df, ms, model_center, model_big, model_mall):

         df_good.append(df_batch)

-    utils.release_gpu_scans(ms1_profile)
-    utils.release_gpu_scans(ms2_profile)
-    utils.release_gpu_scans(ms1_centroid)
-    utils.release_gpu_scans(ms2_centroid)
+    utils.release_gpu_scans(
+        ms1_profile, ms2_profile, ms1_centroid, ms2_centroid
+    )

     df = pd.concat(df_good, axis=0, ignore_index=True)
     utils.cal_acc_recall(param_g.ws, df[df['decoy'] == 0], diann_q_pr=0.01)
