Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Some speedups in the table processing #25

Merged
merged 8 commits into from
Jan 22, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion directlfq/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@


__project__ = "directlfq"
__version__ = "0.2.15"
__version__ = "0.2.16"
__license__ = "Apache"
__description__ = "An open-source Python package of the AlphaPept ecosystem"
__author__ = "Mann Labs"
Expand Down
10 changes: 9 additions & 1 deletion directlfq/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,15 @@ def setup_logging():
logging.basicConfig(level=logging.INFO,
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')


##########################
LOG_PROCESSED_PROTEINS = True

def set_log_processed_proteins(log_processed_proteins = True):
    """Globally switch progress logging of processed lfq-objects on or off.

    Sets the module-level LOG_PROCESSED_PROTEINS flag that the processing
    code consults before emitting per-object log lines.
    """
    global LOG_PROCESSED_PROTEINS
    LOG_PROCESSED_PROTEINS = log_processed_proteins


##########################
PROTEIN_ID = 'protein'
QUANT_ID = 'ion'

Expand All @@ -22,3 +23,10 @@ def set_global_protein_and_ion_id(protein_id = 'protein', quant_id = 'ion'):
PROTEIN_ID = protein_id
QUANT_ID = quant_id

##########################
COMPILE_NORMALIZED_ION_TABLE = True

def set_compile_normalized_ion_table(compile_normalized_ion_table = True):
    """Globally enable/disable compilation of the normalized ion table.

    Sets the module-level COMPILE_NORMALIZED_ION_TABLE flag; when False,
    downstream code skips building and saving the (potentially large)
    normalized ion output table.
    """
    global COMPILE_NORMALIZED_ION_TABLE
    COMPILE_NORMALIZED_ION_TABLE = compile_normalized_ion_table

7 changes: 5 additions & 2 deletions directlfq/lfq_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@

def run_lfq(input_file, columns_to_add = [], selected_proteins_file :str = None, mq_protein_groups_txt = None, min_nonan = 1, input_type_to_use = None, maximum_number_of_quadratic_ions_to_use_per_protein = 10,
number_of_quadratic_samples = 50, num_cores = None, filename_suffix = "", deactivate_normalization = False, filter_dict = None, log_processed_proteins = True, protein_id = 'protein', quant_id = 'ion'
):
,compile_normalized_ion_table = True):
"""Run the directLFQ pipeline on a given input file. The input file is expected to contain ion intensities. The output is a table containing protein intensities.

Args:
Expand All @@ -38,6 +38,7 @@ def run_lfq(input_file, columns_to_add = [], selected_proteins_file :str = None
"""
config.set_global_protein_and_ion_id(protein_id=protein_id, quant_id=quant_id)
config.set_log_processed_proteins(log_processed_proteins=log_processed_proteins)
config.set_compile_normalized_ion_table(compile_normalized_ion_table= compile_normalized_ion_table)

LOGGER.info("Starting directLFQ analysis.")
input_file = prepare_input_filename(input_file)
Expand All @@ -62,7 +63,9 @@ def run_lfq(input_file, columns_to_add = [], selected_proteins_file :str = None
outfile_basename = get_outfile_basename(input_file, input_type_to_use, selected_proteins_file, deactivate_normalization,filename_suffix)
save_run_config(outfile_basename, locals())
save_protein_df(protein_df,outfile_basename)
save_ion_df(ion_df,outfile_basename)

if config.COMPILE_NORMALIZED_ION_TABLE:
save_ion_df(ion_df,outfile_basename)

LOGGER.info("Analysis finished!")

Expand Down
169 changes: 108 additions & 61 deletions directlfq/protein_intensity_estimation.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,30 +29,71 @@ def estimate_protein_intensities(normed_df, min_nonan, num_samples_quadratic, nu
"derives protein pseudointensities from between-sample normalized data"

allprots = list(normed_df.index.get_level_values(0).unique())
LOGGER.info(f"{len(allprots)} prots total")
LOGGER.info(f"{len(allprots)} lfq-groups total")

list_of_tuple_w_protein_profiles_and_shifted_peptides = get_list_of_tuple_w_protein_profiles_and_shifted_peptides(allprots, normed_df, num_samples_quadratic, min_nonan, num_cores)
protein_df = get_protein_dataframe_from_list_of_protein_profiles(allprots=allprots, list_of_tuple_w_protein_profiles_and_shifted_peptides=list_of_tuple_w_protein_profiles_and_shifted_peptides, normed_df= normed_df)
ion_df = get_ion_intensity_dataframe_from_list_of_shifted_peptides(list_of_tuple_w_protein_profiles_and_shifted_peptides, allprots)
if config.COMPILE_NORMALIZED_ION_TABLE:
ion_df = get_ion_intensity_dataframe_from_list_of_shifted_peptides(list_of_tuple_w_protein_profiles_and_shifted_peptides, allprots)
else:
ion_df = None

return protein_df, ion_df


def get_list_of_tuple_w_protein_profiles_and_shifted_peptides(allprots, normed_df, num_samples_quadratic, min_nonan, num_cores):
    """Compute (protein_profile, shifted_peptides) tuples for every lfq-group.

    NOTE(review): the diff view duplicated the pre-refactor call lines (old
    signatures passing allprots/normed_df directly); this body keeps only the
    new-revision calls that take the prepared input-specification tuples.

    Args:
        allprots: ordered list of protein/lfq-group identifiers.
        normed_df: between-sample normalized ion intensity DataFrame
            with a (protein, ion) MultiIndex.
        num_samples_quadratic: number of samples used for quadratic optimization.
        min_nonan: minimum number of non-NaN values required per ion.
        num_cores: worker-process count; None or >1 selects multiprocessing,
            an explicit value <= 1 forces sequential processing.

    Returns:
        list of (protein profile, shifted peptide intensities) tuples.
    """
    input_specification_tuplelist_idx__df__num_samples_quadratic__min_nonan = get_input_specification_tuplelist_idx__df__num_samples_quadratic__min_nonan(normed_df, allprots, num_samples_quadratic, min_nonan)

    # num_cores <= 1 is an explicit request for in-process sequential work;
    # None means "decide automatically", which uses the pool.
    if num_cores is not None and num_cores <= 1:
        list_of_tuple_w_protein_profiles_and_shifted_peptides = get_list_with_sequential_processing(input_specification_tuplelist_idx__df__num_samples_quadratic__min_nonan)
    else:
        list_of_tuple_w_protein_profiles_and_shifted_peptides = get_list_with_multiprocessing(input_specification_tuplelist_idx__df__num_samples_quadratic__min_nonan, num_cores)
    return list_of_tuple_w_protein_profiles_and_shifted_peptides

def get_list_with_sequential_processing(allprots, normed_df, num_samples_quadratic, min_nonan):
input_specification_tuplelist_idx__df__num_samples_quadratic__min_nonan = get_input_specification_tuplelist_idx__df__num_samples_quadratic__min_nonan(normed_df, allprots, num_samples_quadratic, min_nonan)

def get_input_specification_tuplelist_idx__df__num_samples_quadratic__min_nonan(normed_df, allprots, num_samples_quadratic, min_nonan):
    """Build the per-protein work items for calculate_peptide_and_protein_intensities.

    Pairs each per-protein sub-DataFrame with its running index and the shared
    processing parameters. The allprots argument is kept for interface
    compatibility but is not needed here (the split is derived from normed_df).

    Returns:
        zip iterator of (idx, sub_df, num_samples_quadratic, min_nonan) tuples.
    """
    per_protein_dfs = get_normed_dfs(normed_df)
    return zip(
        range(len(per_protein_dfs)),
        per_protein_dfs,
        itertools.repeat(num_samples_quadratic),
        itertools.repeat(min_nonan),
    )


def get_normed_dfs(normed_df):
    """Split the normalized ion table into one DataFrame per protein.

    Works on the underlying numpy arrays (instead of repeated .loc lookups)
    for speed: protein boundaries are located once, then each contiguous
    row range is sliced out and re-wrapped with its (protein, ion) index.

    Returns:
        list of per-protein DataFrames, in the table's original row order.
    """
    protein_names = normed_df.index.get_level_values(0).to_numpy()
    ion_names = normed_df.index.get_level_values(1).to_numpy()
    value_matrix = normed_df.to_numpy()
    switch_points = find_nameswitch_indices(protein_names)

    per_protein_dfs = []
    for group_idx in range(len(switch_points) - 1):
        per_protein_dfs.append(get_subdf(value_matrix, switch_points, group_idx, protein_names, ion_names))
    return per_protein_dfs


def find_nameswitch_indices(arr):
    """Return the boundary indices of runs of equal values in arr.

    The result starts with 0, contains every position where arr[i] differs
    from arr[i-1], and ends with len(arr) — so consecutive pairs of entries
    delimit one run each (suitable as slice bounds).
    """
    # Positions where a new run begins (compare each element with its predecessor).
    interior_boundaries = np.flatnonzero(arr[1:] != arr[:-1]) + 1
    # Prepend the start of the first run and append the one-past-the-end index.
    return np.concatenate(([0], interior_boundaries, [len(arr)]))


def get_subdf(normed_array, indices_of_proteinname_switch, idx, protein_names, ion_names):
    """Slice the rows of the idx-th protein out of the full intensity matrix.

    indices_of_proteinname_switch holds run boundaries (as produced by
    find_nameswitch_indices), so rows [boundary[idx], boundary[idx+1]) all
    belong to one protein. The slice is re-wrapped as a DataFrame with a
    (protein, ion) MultiIndex named per the configured ID columns.
    """
    row_lo = indices_of_proteinname_switch[idx]
    row_hi = indices_of_proteinname_switch[idx + 1]
    multi_index = pd.MultiIndex.from_arrays(
        [protein_names[row_lo:row_hi], ion_names[row_lo:row_hi]],
        names=[config.PROTEIN_ID, config.QUANT_ID],
    )
    return pd.DataFrame(normed_array[row_lo:row_hi], index=multi_index)




def get_list_with_sequential_processing(input_specification_tuplelist_idx__df__num_samples_quadratic__min_nonan):
    """Process all work items in this process, one after another.

    Each item is an (idx, peptide_intensity_df, num_samples_quadratic,
    min_nonan) tuple, unpacked into calculate_peptide_and_protein_intensities.
    """
    return [
        calculate_peptide_and_protein_intensities(*input_spec)
        for input_spec in input_specification_tuplelist_idx__df__num_samples_quadratic__min_nonan
    ]

def get_list_with_multiprocessing(allprots, normed_df, num_samples_quadratic, min_nonan, num_cores):
def get_list_with_multiprocessing(input_specification_tuplelist_idx__df__num_samples_quadratic__min_nonan, num_cores):
    """Fan the per-protein work items out over a multiprocessing pool.

    Args:
        input_specification_tuplelist_idx__df__num_samples_quadratic__min_nonan:
            iterable of (idx, peptide_intensity_df, num_samples_quadratic,
            min_nonan) tuples, starmapped into
            calculate_peptide_and_protein_intensities.
        num_cores: worker-process count passed to the pool helper.

    Returns:
        list of (protein profile, shifted peptides) tuples, in input order.
    """
    pool = get_configured_multiprocessing_pool(num_cores)
    try:
        list_of_tuple_w_protein_profiles_and_shifted_peptides = pool.starmap(
            calculate_peptide_and_protein_intensities,
            input_specification_tuplelist_idx__df__num_samples_quadratic__min_nonan,
        )
    finally:
        # Fix: always release and reap the workers — the original only called
        # close() on the success path (no join, and a worker exception leaked
        # the pool until garbage collection).
        pool.close()
        pool.join()
    return list_of_tuple_w_protein_profiles_and_shifted_peptides
Expand All @@ -66,65 +107,17 @@ def get_configured_multiprocessing_pool(num_cores):
LOGGER.info(f"using {pool._processes} processes")
return pool

def calculate_peptide_and_protein_intensities_from_list_of_peptide_intensity_dfs(idx, list_of_peptide_intensity_dfs, num_samples_quadratic, min_nonan):
    """Run calculate_peptide_and_protein_intensities on each DataFrame in the list.

    Fix: the loop body was a bare reference to the function (a statement with
    no effect) — it never called it and discarded everything. Now every
    DataFrame is actually processed and the results are collected.

    Returns:
        list of (protein profile, shifted peptides) results, one per input df.
    """
    return [
        calculate_peptide_and_protein_intensities(idx, peptide_intensity_df, num_samples_quadratic, min_nonan)
        for peptide_intensity_df in list_of_peptide_intensity_dfs
    ]

def get_input_specification_tuplelist_idx__df__num_samples_quadratic__min_nonan(normed_df, allprots, num_samples_quadratic, min_nonan):
list_of_normed_dfs = get_normed_dfs(normed_df, allprots)
return zip(range(len(list_of_normed_dfs)),list_of_normed_dfs, itertools.repeat(num_samples_quadratic), itertools.repeat(min_nonan))




def get_normed_dfs(normed_df, allprots):
list_of_normed_dfs = []
for protein in allprots:
peptide_intensity_df = pd.DataFrame(normed_df.loc[protein])#DataFrame definition to avoid pandas Series objects
list_of_normed_dfs.append(peptide_intensity_df)

return list_of_normed_dfs


def get_ion_intensity_dataframe_from_list_of_shifted_peptides(list_of_tuple_w_protein_profiles_and_shifted_peptides, allprots):
ion_ints = [x[1] for x in list_of_tuple_w_protein_profiles_and_shifted_peptides]
ion_ints = add_protein_names_to_ion_ints(ion_ints, allprots)
ion_df = 2**pd.concat(ion_ints)
ion_df = ion_df.replace(np.nan, 0)
return ion_df

def add_protein_names_to_ion_ints(ion_ints, allprots):
ion_ints = [add_protein_name_to_ion_df(ion_ints[idx], allprots[idx]) for idx in range(len(ion_ints))]
return ion_ints

def add_protein_name_to_ion_df(ion_df, protein):
ion_df[config.PROTEIN_ID] = protein
ion_df = ion_df.reset_index().set_index([config.PROTEIN_ID, config.QUANT_ID])
return ion_df


def get_protein_dataframe_from_list_of_protein_profiles(allprots, list_of_tuple_w_protein_profiles_and_shifted_peptides, normed_df):
index_list = []
profile_list = []

list_of_protein_profiles = [x[0] for x in list_of_tuple_w_protein_profiles_and_shifted_peptides]

for idx in range(len(allprots)):
if list_of_protein_profiles[idx] is None:
continue
index_list.append(allprots[idx])
profile_list.append(list_of_protein_profiles[idx])

index_for_protein_df = pd.Index(data=index_list, name=config.PROTEIN_ID)
protein_df = 2**pd.DataFrame(profile_list, index = index_for_protein_df, columns = normed_df.columns)
protein_df = protein_df.replace(np.nan, 0)
protein_df = protein_df.reset_index()
return protein_df


def calculate_peptide_and_protein_intensities(idx,peptide_intensity_df , num_samples_quadratic, min_nonan):
def calculate_peptide_and_protein_intensities(idx, peptide_intensity_df, num_samples_quadratic, min_nonan):
if len(peptide_intensity_df.index) > 1:
peptide_intensity_df = ProtvalCutter(peptide_intensity_df, maximum_df_length=100).get_dataframe()

if(idx%100 ==0) and config.LOG_PROCESSED_PROTEINS:
LOGGER.info(f"prot {idx}")
LOGGER.info(f"lfq-object {idx}")
summed_pepint = np.nansum(2**peptide_intensity_df)

if(peptide_intensity_df.shape[1]<2):
Expand Down Expand Up @@ -200,3 +193,57 @@ def get_dataframe(self):
def _get_shortened_dataframe(self):
    # Keep only the top-ranked rows, capped at the configured maximum length.
    top_rows = self._sorted_idx[:self._maximum_df_length]
    return self._protvals_df.loc[top_rows]




def get_ion_intensity_dataframe_from_list_of_shifted_peptides(list_of_tuple_w_protein_profiles_and_shifted_peptides, allprots):
    """Assemble the per-protein shifted ion tables into one linear-scale DataFrame.

    Collects names and values as plain lists/arrays and concatenates once
    (fast path, avoids many small pd.concat calls). Intensities are converted
    back from log2 space and NaNs become 0. Expects a non-empty input list.

    Fix: index the result with the configured config.PROTEIN_ID /
    config.QUANT_ID column names instead of the hardcoded "protein"/"ion" —
    consistent with get_protein_dataframe_from_list_of_protein_profiles and
    add_protein_name_to_ion_df, and correct when callers override the IDs via
    config.set_global_protein_and_ion_id.

    Args:
        list_of_tuple_w_protein_profiles_and_shifted_peptides: tuples whose
            second element is the shifted peptide (ion) DataFrame of a protein.
        allprots: protein names, aligned by position with the tuple list.

    Returns:
        DataFrame of linear-scale ion intensities with a (protein, ion) MultiIndex.
    """
    ion_names = []
    ion_vals = []
    protein_names = []
    column_names = list_of_tuple_w_protein_profiles_and_shifted_peptides[0][1].columns.tolist()
    for idx in range(len(list_of_tuple_w_protein_profiles_and_shifted_peptides)):
        protein_name = allprots[idx]
        shifted_peptide_df = list_of_tuple_w_protein_profiles_and_shifted_peptides[idx][1]
        ion_names += shifted_peptide_df.index.values.tolist()
        ion_vals.append(shifted_peptide_df.to_numpy())
        protein_names.extend([protein_name] * len(shifted_peptide_df.index))
    # Back-transform from log2 space; missing values are reported as 0 intensity.
    merged_ions = np.nan_to_num(2 ** np.concatenate(ion_vals))
    ion_df = pd.DataFrame(merged_ions)
    ion_df.columns = column_names
    ion_df[config.QUANT_ID] = ion_names
    ion_df[config.PROTEIN_ID] = protein_names
    ion_df = ion_df.set_index([config.PROTEIN_ID, config.QUANT_ID])
    return ion_df



def add_protein_names_to_ion_ints(ion_ints, allprots):
    """Attach the positionally matching protein name to every ion table.

    ion_ints and allprots are aligned by position; each ion DataFrame is
    re-indexed via add_protein_name_to_ion_df.
    """
    annotated_tables = []
    for position, ion_table in enumerate(ion_ints):
        annotated_tables.append(add_protein_name_to_ion_df(ion_table, allprots[position]))
    return annotated_tables

def add_protein_name_to_ion_df(ion_df, protein):
    """Tag ion_df with its protein and index it by (protein, ion).

    Adds the configured protein-ID column (mutating ion_df in place), then
    returns the frame re-indexed on the two configured ID columns.
    """
    ion_df[config.PROTEIN_ID] = protein
    reindexed = ion_df.reset_index().set_index([config.PROTEIN_ID, config.QUANT_ID])
    return reindexed


def get_protein_dataframe_from_list_of_protein_profiles(allprots, list_of_tuple_w_protein_profiles_and_shifted_peptides, normed_df):
    """Collect the non-empty protein profiles into a linear-scale protein table.

    Proteins whose profile is None are dropped. Profiles are back-transformed
    from log2 space, NaNs become 0, and the protein ID ends up as a regular
    column (reset_index) with sample columns taken from normed_df.

    Returns:
        DataFrame with one row per protein that produced a profile.
    """
    kept_proteins = []
    kept_profiles = []

    for position in range(len(allprots)):
        profile = list_of_tuple_w_protein_profiles_and_shifted_peptides[position][0]
        if profile is None:
            # Protein yielded no usable profile — exclude it from the output.
            continue
        kept_proteins.append(allprots[position])
        kept_profiles.append(profile)

    protein_index = pd.Index(data=kept_proteins, name=config.PROTEIN_ID)
    protein_df = 2 ** pd.DataFrame(kept_profiles, index=protein_index, columns=normed_df.columns)
    protein_df = protein_df.replace(np.nan, 0)
    return protein_df.reset_index()

2 changes: 1 addition & 1 deletion misc/bumpversion.cfg
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
[bumpversion]
current_version = 0.2.15
current_version = 0.2.16
commit = True
tag = False
parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\-(?P<release>[a-z]+)(?P<build>\d+))?
Expand Down
Loading
Loading