Merge pull request #405 from MannLabs/develop
Develop 0.4.0
straussmaximilian authored Mar 11, 2022
2 parents 864cae5 + a17c5a3 commit a4a1155
Showing 56 changed files with 8,701 additions and 10,017 deletions.
2 changes: 1 addition & 1 deletion .bumpversion.cfg
@@ -1,5 +1,5 @@
[bumpversion]
-current_version = 0.3.33
+current_version = 0.4.0
commit = True
tag = False
parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\-(?P<release>[a-z]+)(?P<build>\d+))?
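Note: the parse pattern above can be sanity-checked in isolation. A minimal sketch (the version string is just an example):

    import re

    # same pattern as in .bumpversion.cfg
    pattern = r"(?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\-(?P<release>[a-z]+)(?P<build>\d+))?"
    m = re.fullmatch(pattern, "0.4.0")
    print(m.groupdict())  # {'major': '0', 'minor': '4', 'patch': '0', 'release': None, 'build': None}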
2 changes: 1 addition & 1 deletion alphapept/__init__.py
@@ -1,4 +1,4 @@
__version__ = "0.3.33"
__version__ = "0.4.0"

__requirements__ = {
"": "requirements/requirements.txt",
2 changes: 1 addition & 1 deletion alphapept/__version__.py
@@ -33,7 +33,7 @@
AUTHOR_EMAIL = "straussmaximilian@gmail.com"
COPYRIGHT = "Mann Labs"
BRANCH = "master"
VERSION_NO = "0.3.33"
VERSION_NO = "0.4.0"
MIN_PYTHON = "3.6"
MAX_PYTHON = "4"
AUDIENCE = "Developers"
3 changes: 3 additions & 0 deletions alphapept/_nbdev.py
@@ -130,6 +130,7 @@
"extract_bruker": "04_feature_finding.ipynb",
"convert_bruker": "04_feature_finding.ipynb",
"map_bruker": "04_feature_finding.ipynb",
"get_stats": "04_feature_finding.ipynb",
"find_features": "04_feature_finding.ipynb",
"replace_infs": "04_feature_finding.ipynb",
"map_ms2": "04_feature_finding.ipynb",
@@ -183,6 +184,7 @@
"transform": "07_recalibration.ipynb",
"kneighbors_calibration": "07_recalibration.ipynb",
"get_calibration": "07_recalibration.ipynb",
"chunks": "07_recalibration.ipynb",
"density_scatter": "07_recalibration.ipynb",
"save_fragment_calibration": "07_recalibration.ipynb",
"calibrate_fragments_nn": "07_recalibration.ipynb",
@@ -245,6 +247,7 @@
"get_summary": "11_interface.ipynb",
"parallel_execute": "11_interface.ipynb",
"bcolors": "11_interface.ipynb",
"is_port_in_use": "11_interface.ipynb",
"run_cli": "11_interface.ipynb",
"cli_overview": "11_interface.ipynb",
"cli_database": "11_interface.ipynb",
5 changes: 3 additions & 2 deletions alphapept/default_settings.yaml
@@ -5,7 +5,7 @@ workflow:
find_features: true
search_data: true
recalibrate_data: true
-align: true
+align: false
match: false
lfq_quantification: true
general:
@@ -87,11 +87,12 @@ calibration:
matching:
match_p_min: 0.05
match_d_min: 3
+match_group_tol: 0
isobaric_label:
label: None
reporter_frag_tolerance: 15
reporter_frag_tolerance_ppm: true
quantification:
max_lfq: true
lfq_ratio_min: 1
-mode: int_sum
+mode: ms1_int_sum
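Note: a minimal sketch of reading the updated defaults with PyYAML (path assumed relative to the repository root):

    import yaml

    with open("alphapept/default_settings.yaml") as f:
        settings = yaml.safe_load(f)

    print(settings["workflow"]["align"])            # now False by default
    print(settings["matching"]["match_group_tol"])  # new key, defaults to 0
    print(settings["quantification"]["mode"])       # renamed to 'ms1_int_sum'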
14 changes: 7 additions & 7 deletions alphapept/export.py
@@ -24,11 +24,11 @@ def remove_mods(sequence):
def ap_to_mq_sequence(sequence, mod_translation):
"""
Converts AlphaPept sequence format to MaxQuant Format
-returns naked_sequence, len_sequence, modifications_, mq_sequence
+returns sequence_naked, len_sequence, modifications_, mq_sequence
"""
# Add leading and trailing modification
-naked_sequence = remove_mods(sequence)
+sequence_naked = remove_mods(sequence)
parsed_sequence = parse(sequence)

mq_sequence = '_'
@@ -37,7 +37,7 @@ def ap_to_mq_sequence(sequence, mod_translation):

for idx, AA in enumerate(parsed_sequence):

-mq_sequence += naked_sequence[idx]
+mq_sequence += sequence_naked[idx]
if len(AA) != 1:
if mod_translation[AA] is not None:
if mod_translation[AA] in modifications:
@@ -68,9 +68,9 @@ def ap_to_mq_sequence(sequence, mod_translation):

mq_sequence += '_'

-n_AA = len(naked_sequence)
+n_AA = len(sequence_naked)

-return naked_sequence, n_AA, modifications_, mq_sequence
+return sequence_naked, n_AA, modifications_, mq_sequence


# Cell
@@ -100,9 +100,9 @@ def prepare_ap_results(ref_ap):

ref_ap['id'] = ref_ap.index

-naked_sequence, nAA, mq_modifications, mq_sequence = zip(*ref_ap['sequence'].apply(lambda x: ap_to_mq_sequence(x, mod_translation)))
+sequence_naked, nAA, mq_modifications, mq_sequence = zip(*ref_ap['sequence'].apply(lambda x: ap_to_mq_sequence(x, mod_translation)))

-ref_ap['naked_sequence'] = naked_sequence
+ref_ap['sequence_naked'] = sequence_naked
ref_ap['n_AA'] = nAA
ref_ap['mq_modifications'] = mq_modifications
ref_ap['mq_sequence'] = mq_sequence
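Note: this file only renames naked_sequence to sequence_naked, including the output column. For orientation, a stand-alone sketch of the naked-sequence idea — not the library code — assuming AlphaPept's lowercase-prefix modification notation (e.g. oxM for oxidized methionine):

    def strip_mods(sequence: str) -> str:
        # keep only uppercase residues; lowercase characters encode modifications
        return "".join(ch for ch in sequence if ch.isupper())

    assert strip_mods("oxMPEPTIDE") == "MPEPTIDE"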
72 changes: 53 additions & 19 deletions alphapept/feature_finding.py
@@ -9,7 +9,7 @@
'get_trails', 'plot_pattern', 'get_minpos', 'get_local_minima', 'is_local_minima', 'truncate',
'check_averagine', 'pattern_to_mz', 'cosine_averagine', 'int_list_to_array', 'mz_to_mass', 'M_PROTON',
'isolate_isotope_pattern', 'get_isotope_patterns', 'report_', 'feature_finder_report', 'extract_bruker',
-'convert_bruker', 'map_bruker', 'find_features', 'replace_infs', 'map_ms2']
+'convert_bruker', 'map_bruker', 'get_stats', 'find_features', 'replace_infs', 'map_ms2']

# Cell
import numpy as np
@@ -628,8 +628,8 @@ def hill_stats(idx:np.ndarray, hill_range:np.ndarray, hill_ptrs:np.ndarray, hill
int_ = int_data[idx_]
mz_ = mass_data[idx_]

-int_sum = np.sum(int_)
-int_area = np.abs(np.trapz(rt_[rt_idx[idx_]], int_)) #Area
+ms1_int_sum = np.sum(int_)
+ms1_int_area = np.abs(np.trapz(rt_[rt_idx[idx_]], int_)) #Area

rt_min = rt_[rt_idx[idx_]].min()
rt_max = rt_[rt_idx[idx_]].max()
@@ -657,8 +657,8 @@

stats[idx,0] = average_mz
stats[idx,1] = delta_m
-stats[idx,2] = int_sum
-stats[idx,3] = int_area
+stats[idx,2] = ms1_int_sum
+stats[idx,3] = ms1_int_area
stats[idx,4] = rt_min
stats[idx,5] = rt_max

@@ -1574,7 +1574,7 @@ def report_(idx:np.ndarray, isotope_charges:list, isotope_patterns:list, iso_idx
left_apex = np.abs(trace[:rt_apex_idx]-half_max).argmin()
right_apex = np.abs(trace[rt_apex_idx:]-half_max).argmin()+rt_apex_idx

-int_apex = trace_sum[rt_apex_idx]
+ms1_int_apex = trace_sum[rt_apex_idx]
fwhm = rt_range[right_apex] - rt_range[left_apex]

n_isotopes = len(pattern)
@@ -1602,10 +1602,10 @@
rt_start = rt_range[rt_min_idx]
rt_end = rt_range[rt_max_idx]

-int_area = np.abs(np.trapz(trace_sum[rt_min_idx:rt_max_idx], rt_range[rt_min_idx:rt_max_idx]))
-int_sum = trace_sum.sum()
+ms1_int_area = np.abs(np.trapz(trace_sum[rt_min_idx:rt_max_idx], rt_range[rt_min_idx:rt_max_idx]))
+ms1_int_sum = trace_sum.sum()

-results[idx,:] = np.array([mz, mz_std, mz_most_abundant, charge, rt_start, rt_apex, rt_end, fwhm, n_isotopes, mass, int_apex, int_area, int_sum])
+results[idx,:] = np.array([mz, mz_std, mz_most_abundant, charge, rt_start, rt_apex, rt_end, fwhm, n_isotopes, mass, ms1_int_apex, ms1_int_area, ms1_int_sum])

# Cell
import pandas as pd
@@ -1639,7 +1639,7 @@ def feature_finder_report(query_data:dict, isotope_patterns:list, isotope_charge

report_(range(len(isotope_charges)), isotope_charges, isotope_patterns, iso_idx, stats, sortindex_, hill_ptrs, hill_data, int_data, rt_, rt_idx, results, lookup_idx)

-df = pd.DataFrame(results, columns = ['mz','mz_std','mz_most_abundant','charge','rt_start','rt_apex','rt_end','fwhm','n_isotopes','mass','int_apex','int_area', 'int_sum'])
+df = pd.DataFrame(results, columns = ['mz','mz_std','mz_most_abundant','charge','rt_start','rt_apex','rt_end','fwhm','n_isotopes','mass','ms1_int_apex','ms1_int_area', 'ms1_int_sum'])

df.sort_values(['rt_start','mz'])
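Note: downstream code that consumed the old column names needs a rename. A hedged sketch with a toy frame:

    import pandas as pd

    old = pd.DataFrame({'int_apex': [1.0], 'int_area': [2.0], 'int_sum': [3.0]})
    new = old.rename(columns={'int_apex': 'ms1_int_apex',
                              'int_area': 'ms1_int_area',
                              'int_sum': 'ms1_int_sum'})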

@@ -1729,17 +1729,19 @@ def convert_bruker(feature_path:str)->pd.DataFrame:
"""
engine_featurefile = db.create_engine('sqlite:///{}'.format(feature_path))
feature_table = pd.read_sql_table('LcTimsMsFeature', engine_featurefile)

+feature_cluster_mapping = pd.read_sql_table('FeatureClusterMapping', engine_featurefile)
from .constants import mass_dict

M_PROTON = mass_dict['Proton']
feature_table['Mass'] = feature_table['MZ'].values * feature_table['Charge'].values - feature_table['Charge'].values*M_PROTON
-feature_table = feature_table.rename(columns={"MZ": "mz","Mass": "mass", "RT": "rt_apex", "RT_lower":"rt_start", "RT_upper":"rt_end", "Mobility": "mobility", "Mobility_lower": "mobility_lower", "Mobility_upper": "mobility_upper", "Charge":"charge","Intensity":'int_sum',"ClusterCount":'n_isotopes'})
+feature_table = feature_table.rename(columns={"MZ": "mz","Mass": "mass", "RT": "rt_apex", "RT_lower":"rt_start", "RT_upper":"rt_end", "Mobility": "mobility", "Mobility_lower": "mobility_lower", "Mobility_upper": "mobility_upper", "Charge":"charge","Intensity":'ms1_int_sum',"ClusterCount":'n_isotopes'})
feature_table['rt_apex'] = feature_table['rt_apex']/60
feature_table['rt_start'] = feature_table['rt_start']/60
feature_table['rt_end'] = feature_table['rt_end']/60

-return feature_table
+feature_cluster_mapping = feature_cluster_mapping.rename(columns={"FeatureId": "feature_id", "ClusterId": "cluster_id", "Monoisotopic": "monoisotopic", "Intensity": "ms1_int_sum"})
+
+return feature_table, feature_cluster_mapping


def map_bruker(feature_path:str, feature_table:pd.DataFrame, query_data:dict)->pd.DataFrame:
@@ -1800,6 +1802,29 @@ def map_bruker(feature_path:str, feature_table:pd.DataFrame, query_data:dict)->pd.DataFrame:

return features

+# Cell
+def get_stats(isotope_patterns, iso_idx, stats):
+    columns = ['mz_average','delta_m','int_sum','int_area','rt_min','rt_max']
+
+    stats_idx = np.zeros(iso_idx[-1], dtype=np.int64)
+    stats_map = np.zeros(iso_idx[-1], dtype=np.int64)
+
+    start_ = 0
+    end_ = 0
+
+    for idx in range(len(iso_idx)-1):
+        k = isotope_patterns[iso_idx[idx]:iso_idx[idx+1]]
+        end_ += len(k)
+        stats_idx[start_:end_] = k
+        stats_map[start_:end_] = idx
+        start_ = end_
+
+    k = pd.DataFrame(stats[stats_idx], columns=columns)
+
+    k['feature_id'] = stats_map
+
+    return k
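Note: the new get_stats helper flattens the per-feature hill indices into one long-format table, tagging each row with its feature_id. A toy invocation (arrays fabricated for shape only, assuming the function above is in scope):

    import numpy as np

    isotope_patterns = np.array([3, 1, 4, 0, 2])  # feature 0 -> hills [3, 1]; feature 1 -> hills [4, 0, 2]
    iso_idx = np.array([0, 2, 5])                 # feature boundaries into isotope_patterns
    stats = np.random.rand(5, 6)                  # one row per hill: mz_average .. rt_max

    k = get_stats(isotope_patterns, iso_idx, stats)
    print(k['feature_id'].tolist())               # [0, 0, 1, 1, 1]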

# Cell
import numpy as np

@@ -1861,6 +1886,8 @@ def find_features(to_process:tuple, callback:Union[Callable, None] = None, paral
ms_file = alphapept.io.MS_Data_File(out_file, is_read_only=False)
query_data = ms_file.read_DDA_query_data()

+feature_cluster_mapping = pd.DataFrame()

if not settings['workflow']["find_features"]:
features = query_data_to_features(query_data)
else:
@@ -1930,12 +1957,16 @@
lookup_idx_df = pd.DataFrame(lookup_idx, columns = ['isotope_pattern', 'isotope_pattern_hill'])
ms_file.write(lookup_idx_df, dataset_name="feature_table_idx")

+feature_cluster_mapping = get_stats(isotope_patterns, iso_idx, stats)


logging.info('Report complete.')

elif datatype == 'bruker':
logging.info('Feature finding on {}'.format(file_name))
feature_path = extract_bruker(file_name)
-feature_table = convert_bruker(feature_path)
+feature_table, feature_cluster_mapping = convert_bruker(feature_path)

logging.info('Bruker feature finder complete. Extracted {:,} features.'.format(len(feature_table)))

# Calculate additional params
@@ -1952,8 +1983,11 @@
else:
features = map_ms2(feature_table, query_data, **settings['features'])

+ms_file.write(feature_cluster_mapping, dataset_name="feature_cluster_mapping")

logging.info('Saving feature table.')
ms_file.write(feature_table, dataset_name="feature_table")

logging.info('Feature table saved to {}'.format(out_file))


@@ -2028,7 +2062,7 @@ def map_ms2(feature_table:pd.DataFrame, query_data:dict, map_mz_range:float = 1,
for i, key in enumerate(range_dict):
tree_points[:,i] = tree_points[:,i]/range_dict[key][1]

-matching_tree = KDTree(tree_points, metric="minkowski")
+matching_tree = KDTree(tree_points, metric="euclidean")
ref_points = np.array([query_data[range_dict[_][0]] / range_dict[_][1] for _ in range_dict]).T
ref_points = replace_infs(ref_points)
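Note: "minkowski" with scikit-learn's default p=2 is the same metric as "euclidean", so this change clarifies intent rather than altering results. A quick check with toy points:

    import numpy as np
    from sklearn.neighbors import KDTree

    pts = np.random.rand(100, 2)
    d1, _ = KDTree(pts, metric="minkowski").query(pts[:5], k=1)
    d2, _ = KDTree(pts, metric="euclidean").query(pts[:5], k=1)
    assert np.allclose(d1, d2)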

@@ -2047,7 +2081,7 @@ def map_ms2(feature_table:pd.DataFrame, query_data:dict, map_mz_range:float = 1,
ref_df['query_idx'] = ref_df.index
ref_df['feature_idx'] = idx[:,neighbor]

-for field in ['int_sum','int_apex','rt_start','rt_apex','rt_end','fwhm','mobility_lower','mobility_upper']:
+for field in ['ms1_int_sum','ms1_int_apex','rt_start','rt_apex','rt_end','fwhm','mobility_lower','mobility_upper']:
if field in feature_table.keys():
ref_df[field] = feature_table.iloc[idx[:,neighbor]][field].values

@@ -2062,7 +2096,7 @@ def map_ms2(feature_table:pd.DataFrame, query_data:dict, map_mz_range:float = 1,
_check &= mob_check

ref_matched |= _check
-ref_df['dist'] = dist[:,neighbor]
+ref_df['feature_dist'] = dist[:,neighbor]
ref_df = ref_df[_check]

all_df.append(ref_df)
@@ -2088,10 +2122,10 @@ def map_ms2(feature_table:pd.DataFrame, query_data:dict, map_mz_range:float = 1,
ref_df['mobility_matched'] = unmatched_ref['mobility']
ref_df['mobility_offset'] = np.nan

-for field in ['int_sum','int_apex','rt_start','rt_apex','rt_end','fwhm']:
+for field in ['ms1_int_sum','ms1_int_apex','rt_start','rt_apex','rt_end','fwhm']:
if field in feature_table.keys():
unmatched_ref[field] = np.nan
-unmatched_ref['dist'] = np.nan
+unmatched_ref['feature_dist'] = np.nan

all_df.append(unmatched_ref)

24 changes: 18 additions & 6 deletions alphapept/gui/experiment.py
@@ -151,6 +151,12 @@ def submit_experiment(recorder: dict):
st.info(
f"Filename will be: {escape_markdown(long_name)}. Click submit button to add to queue."
)

+if (recorder['workflow']['match']) | (recorder['workflow']['align']):
+    if len(recorder['experiment']['shortnames']) > 100:
+        st.warning('Performance Warning: More than 100 files are selected and matching / align is selected. '
+                   'Matching / Align could take a long time. If you experience issues please contact mstrauss@biochem.mpg.de')

if st.button("Submit"):
settings = load_settings_as_template(DEFAULT_SETTINGS_PATH)
for group in recorder:
@@ -287,7 +293,7 @@ def experiment():

file_df = file_df_from_files(raw_files, file_folder)
file_df["Fraction"] = [str(i+1) for i in range(len(file_df))]
-#file_df["Matching group"] = ""
+file_df["Matching group"] = [str(0)]*len(file_df)

gb = GridOptionsBuilder.from_dataframe(file_df)
gb.configure_default_column(
@@ -315,15 +321,23 @@
" \n- Creation date of file."
" \n- Size (GB): Size in GB of the file."
" \n- Shortname: Unique shortname for each file."
" \n- Fraction: Fraction of each file."
#" \n- Matching Group: Match-between-runs only among members of this group."
" \n- Fraction: Fraction of each file. Files of the same fraction will be scored together. If dataset is not fractionated leave as is."
" \n- Matching Group: Match-between-runs only among members of this group or neighboring groups. Leave as is if matching between all files."
)

shortnames = file_df_selected["Shortname"].values.tolist()
if len(shortnames) != len(set(shortnames)):
st.warning("Warning: Shortnames are not unique.")
error += 1

+try:
+    matching_groups = file_df_selected["Matching group"].values.astype('int').tolist()
+except:
+    matching_groups = [str(0)]*len(file_df)
+
+    st.warning("Warning: Matching groups contain non-integer values. Please only use integers (0,1,2...).")
+    error += 1
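Note: the integer check can be reproduced in isolation; a toy example of why .astype('int') raises on non-numeric labels:

    import pandas as pd

    pd.Series(["0", "1", "2"]).values.astype('int').tolist()  # [0, 1, 2]

    try:
        pd.Series(["0", "A"]).values.astype('int')
    except ValueError as err:
        print("rejected:", err)  # invalid literal for int() with base 10: 'A'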

fasta_files_home_dir = files_in_folder(FASTA_PATH, ".fasta")
fasta_files_home_dir = [
os.path.join(FASTA_PATH, _) for _ in fasta_files_home_dir
@@ -351,9 +365,7 @@
recorder["experiment"]["fractions"] = file_df_selected[
"Fraction"
].values.tolist()
-#recorder["experiment"]["matching_groups"] = file_df_selected[
-#    "Matching group"
-#].values.tolist()
+recorder["experiment"]["matching_groups"] = matching_groups

f_dict = file_df_selected.groupby('Fraction')['Filename'].unique().to_dict()
f_dict = {k: list(v) for k,v in f_dict.items()}
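Note: the fraction grouping above collapses filenames per fraction. A toy illustration:

    import pandas as pd

    df = pd.DataFrame({'Fraction': ['1', '1', '2'],
                       'Filename': ['a.raw', 'b.raw', 'c.raw']})
    f_dict = df.groupby('Fraction')['Filename'].unique().to_dict()
    f_dict = {k: list(v) for k, v in f_dict.items()}  # {'1': ['a.raw', 'b.raw'], '2': ['c.raw']}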
(Diff truncated: the remaining changed files of the 56 total are not shown.)