From 52e5c8deee4984bfece342d8777eebfc96a1a9f5 Mon Sep 17 00:00:00 2001 From: ammarcsj <70114795+ammarcsj@users.noreply.github.com> Date: Wed, 2 Oct 2024 15:55:05 +0200 Subject: [PATCH 1/5] recognize and handle parquet input --- directlfq/utils.py | 327 +++------------------------------------------ 1 file changed, 22 insertions(+), 305 deletions(-) diff --git a/directlfq/utils.py b/directlfq/utils.py index c5d3a30..ca042a1 100644 --- a/directlfq/utils.py +++ b/directlfq/utils.py @@ -1,33 +1,3 @@ -# AUTOGENERATED! DO NOT EDIT! File to edit: ../nbdev_nbs/04_utils.ipynb. - -# %% auto 0 -__all__ = ['get_samples_used_from_samplemap_file', 'get_samples_used_from_samplemap_df', 'get_all_samples_from_samplemap_df', - 'get_samplenames_from_input_df', 'filter_df_to_minrep', 'get_condpairname', 'get_quality_score_column', - 'make_dir_w_existcheck', 'get_results_plot_dir_condpair', 'get_middle_elem', 'get_nonna_array', - 'get_non_nas_from_pd_df', 'get_ionints_from_pd_df', 'invert_dictionary', 'get_z_from_p_empirical', - 'count_fraction_outliers_from_expected_fc', 'create_or_replace_folder', - 'add_mq_protein_group_ids_if_applicable_and_obtain_annotated_file', - 'load_input_file_and_de_duplicate_if_evidence', 'create_id_to_protein_df', - 'determine_id_column_from_input_df', 'annotate_mq_df', 'remove_ids_not_occurring_in_mq_df', - 'save_annotated_mq_df', 'add_columns_to_lfq_results_table', 'clean_input_filename_if_necessary', - 'get_protein_column_input_table', 'get_standard_columns_for_input_type', - 'filter_columns_to_existing_columns', 'show_diff', 'write_chunk_to_file', 'index_and_log_transform_input_df', - 'remove_allnan_rows_input_df', 'get_relevant_columns', 'get_relevant_columns_config_dict', - 'get_quant_ids_from_config_dict', 'get_sample_ids_from_config_dict', 'get_channel_ids_from_config_dict', - 'load_config', 'get_type2relevant_cols', 'filter_input', 'merge_protein_and_ion_cols', - 'merge_protein_cols_and_ion_dict', 'get_quantitative_columns', 'get_ionname_columns', - 'adapt_headers_on_extended_df', 'split_extend_df', 'add_merged_ionnames', - 'reformat_and_write_longtable_according_to_config', 'adapt_subtable', 'process_with_dask', - 'reshape_input_df', 'sort_and_add_columns', 'extend_sample_allcolumns_for_plexdia_case', - 'adapt_input_df_columns_in_case_of_plexDIA', 'extend_sampleID_column_for_plexDIA_case', - 'set_mtraq_reduced_ion_column_into_dataframe', 'remove_mtraq_modifications_from_ion_ids', 'is_plexDIA_table', - 'parse_channel_from_peptide_column', 'merge_sample_id_and_channels', 'merge_channel_and_sample_string', - 'reformat_and_write_wideformat_table', 'check_for_processed_runs_in_results_folder', 'import_data', - 'reformat_and_save_input_file', 'add_ion_protein_headers_if_applicable', 'get_input_type_and_config_dict', - 'get_original_file_from_aq_reformat', 'import_config_dict', 'load_samplemap', 'prepare_loaded_tables', - 'LongTableReformater', 'AcquisitionTableHandler', 'AcquisitionTableInfo', 'AcquisitionTableHeaders', - 'AcquisitionTableOutputPaths', 'AcquisitionTableReformater', 'AcquisitionTableHeaderFilter', - 'merge_acquisition_df_parameter_df'] # %% ../nbdev_nbs/04_utils.ipynb 2 import os @@ -42,7 +12,7 @@ #config.setup_logging() LOGGER = logging.getLogger(__name__) -# %% ../nbdev_nbs/04_utils.ipynb 5 + def get_samples_used_from_samplemap_file(samplemap_file, cond1, cond2): samplemap_df = load_samplemap(samplemap_file) return get_samples_used_from_samplemap_df(samplemap_df, cond1, cond2) @@ -139,16 +109,6 @@ def invert_dictionary(my_map): inv_map[v] = inv_map.get(v, []) 
+ [k] return inv_map -# %% ../nbdev_nbs/04_utils.ipynb 17 -import statistics - -def get_z_from_p_empirical(p_emp,p2z): - p_rounded = np.format_float_scientific(p_emp, 1) - if p_rounded in p2z: - return p2z.get(p_rounded) - z = statistics.NormalDist().inv_cdf(float(p_rounded)) - p2z[p_rounded] = z - return z # %% ../nbdev_nbs/04_utils.ipynb 18 def count_fraction_outliers_from_expected_fc(result_df, threshold, expected_log2fc): @@ -616,7 +576,7 @@ def reformat_and_write_longtable_according_to_config(input_file, outfile_name, c os.remove(outfile_name) relevant_cols = get_relevant_columns_config_dict(config_dict_for_type) - input_df_it = pd.read_csv(input_file, sep = sep, decimal=decimal, usecols = relevant_cols, encoding ='latin1', chunksize = chunksize) + input_df_it = read_file_with_pandas(input_file=input_file, sep=sep, decimal=decimal, usecols=relevant_cols, chunksize=chunksize) input_df_list = [] header = True for input_df_subset in input_df_it: @@ -775,7 +735,7 @@ def reformat_and_write_wideformat_table(peptides_tsv, outfile_name, config_dict) input_df.to_csv(outfile_name, sep = '\t', index = None) -# %% ../nbdev_nbs/04_utils.ipynb 42 + import os def check_for_processed_runs_in_results_folder(results_folder): contained_condpairs = [] @@ -787,7 +747,24 @@ def check_for_processed_runs_in_results_folder(results_folder): contained_condpairs.append(res_name) return contained_condpairs -# %% ../nbdev_nbs/04_utils.ipynb 44 + +def read_file_with_pandas(input_file, decimal='.', usecols=None, chunksize=None, nrows=None, sep = None): + filename = str(input_file) + if '.parquet' in filename: + return pd.read_parquet(input_file, columns=usecols, chunksize=chunksize, nrows=nrows) + else: + if sep is None: + if '.csv' in filename: + sep=',' + elif '.tsv' in filename: + sep='\t' + else: + sep='\t' + LOGGER.info(f"neither of the file extensions (.tsv, .csv) detected for file {input_file}! Trying with tab separation. In the case that it fails, please provide the correct file extension") + return pd.read_csv(input_file,sep=sep, decimal=decimal, usecols=usecols, encoding='latin1', chunksize=chunksize, nrows=nrows) + + + import pandas as pd import os import pathlib @@ -874,12 +851,8 @@ def get_input_type_and_config_dict(input_file, input_type_to_use = None): if '.txt' in filename: sep='\t' - if 'sep' not in locals(): - raise TypeError(f"neither of the file extensions (.tsv, .csv, .txt) detected for file {input_file}! Your filename has to contain one of these extensions. 
Please modify your file name accordingly.") - - - uploaded_data_columns = set(pd.read_csv(input_file, sep=sep, nrows=1, encoding ='latin1').columns) + uploaded_data_columns = set(read_file_with_pandas(input_file, nrows=1).columns) for input_type in type2relevant_columns.keys(): if (input_type_to_use is not None) and (input_type!=input_type_to_use): @@ -970,259 +943,3 @@ def __initialize_df_iterator__(self): def __write_reformatted_df_to_file__(reformatted_df, filepath ,write_header): reformatted_df.to_csv(filepath, header=write_header, mode='a', sep = "\t", index = None) -# %% ../nbdev_nbs/04_utils.ipynb 51 -import os -import re - -class AcquisitionTableHandler(): - def __init__(self, results_dir, samples): - self._table_infos = AcquisitionTableInfo(results_dir=results_dir) - self._header_infos = AcquisitionTableHeaders(self._table_infos) - self._samples = self.__reformat_samples_if_necessary(samples) - - def get_acquisition_info_df(self): - return self.__get_reformated_df__() - - def save_dataframe_as_new_acquisition_dataframe(self): - self._output_paths = AcquisitionTableOutputPaths(self._table_infos) - self.__remove_possible_pre_existing_ml_table__(self._output_paths.output_file_name) - df_reformater = AcquisitionTableReformater(table_infos = self._table_infos, header_infos=self._header_infos, samples = self._samples, dataframe_already_preformated=False) - df_reformater.reformat_and_save_acquisition_data_frame(self._output_paths.output_file_name) - - def update_ml_file_location_in_method_parameters_yaml(self): - method_params = load_method_parameters(self._table_infos._results_dir) - if self._output_paths == None: - raise Exception("output paths not initialized! This could be because no dataframe was saved before") - method_params[self._output_paths.ml_file_accession_in_yaml] = self._output_paths.output_file_name - save_dict_as_yaml(method_params, self._output_paths.method_parameters_yaml_path) - - def __get_reformated_df__(self): - df_reformater = AcquisitionTableReformater(table_infos = self._table_infos, header_infos=self._header_infos, samples = self._samples, dataframe_already_preformated=True) - df = df_reformater.reformat_and_load_acquisition_data_frame() - return df.convert_dtypes() - - def __reformat_samples_if_necessary(self, samples): - if "plexDIA" in self._table_infos._input_type: - return self.__get_plexDIA_samplenames__(samples) - else: - return samples - - def __get_plexDIA_samplenames__(self, samples): - new_samples = [] - for sample in samples: - new_samples.append(self.__get_samplename_without_mtraq_tag__(sample)) - return new_samples - - @staticmethod - def __get_samplename_without_mtraq_tag__(samplename): - pattern = "(.*)(_\(mTRAQ-n-.\))" - matched = re.match(pattern, samplename) - return matched.group(1) - - @staticmethod - def __remove_possible_pre_existing_ml_table__(output_file_name): - if os.path.exists(output_file_name): - os.remove(output_file_name) - LOGGER.info(f"removed pre existing {output_file_name}") - - -class AcquisitionTableInfo(): - def __init__(self, results_dir, sep = "\t", decimal = "."): - self._results_dir = results_dir - self._sep = sep - self._decimal = decimal - self._method_params_dict = load_method_parameters(results_dir) - self._input_file = self.__get_input_file__() - self._file_ending_of_formatted_table = ".ml_info_table.tsv" - self.already_formatted = self.__check_if_input_file_is_already_formatted__() - self._input_type, self._config_dict = self.__get_input_type_and_config_dict__() - self._sample_column = self.__get_sample_column__() 
- self.last_ion_level_to_use = self.__get_last_ion_level_to_use__() - - def __get_input_file__(self): - if self._method_params_dict.get('ml_input_file') is None: - return self.__get_location_of_original_file__() - else: - return self._method_params_dict.get('ml_input_file') - - def __check_if_input_file_is_already_formatted__(self): - if self._file_ending_of_formatted_table in self._input_file: - return True - else: - return False - - def __get_input_type_and_config_dict__(self): - if self.already_formatted: - original_file = self.__get_location_of_original_file__() - else: - original_file = self._input_file - input_type, config_dict, _ = get_input_type_and_config_dict(original_file) - return input_type, config_dict - - def __get_location_of_original_file__(self): - input_file = self._method_params_dict.get('input_file') - return self.__get_original_filename_from_input_file__(input_file) - - @staticmethod - def __get_original_filename_from_input_file__(input_file): - pattern = "(.*\.tsv|.*\.csv|.*\.txt)(\..*)(.aq_reformat.tsv)" - m = re.match(pattern=pattern, string=input_file) - if m: - return m.group(1) - else: - return input_file - - - def __get_sample_column__(self): - return self._config_dict.get("sample_ID") - - def __get_last_ion_level_to_use__(self): - return self._config_dict["ml_level"] - - - - - -class AcquisitionTableHeaders(): - def __init__(self, acquisition_table_info): - - self._table_info = acquisition_table_info - - self._ion_hierarchy = self.__get_ordered_ion_hierarchy__() - self._included_levelnames = self.__get_included_levelnames__() - self._ion_headers_grouped = self.__get_ion_headers_grouped__() - self._ion_headers = self.__get_ion_headers__() - self._numeric_headers = self.__get_numeric_headers__() - self._relevant_headers = self.__get_relevant_headers__() - - def __get_ordered_ion_hierarchy__(self): - ion_hierarchy = self._table_info._config_dict.get("ion_hierarchy") - hier_key = 'fragion' if 'fragion' in ion_hierarchy.keys() else list(ion_hierarchy.keys())[0] - ion_hierarchy_on_chosen_key = ion_hierarchy.get(hier_key) - return ion_hierarchy_on_chosen_key - - def __get_included_levelnames__(self): - levelnames = self.__get_all_levelnames__(self._ion_hierarchy) - last_ionlevel_idx = levelnames.index(self._table_info.last_ion_level_to_use) - return levelnames[:last_ionlevel_idx+1] - - @staticmethod - def __get_all_levelnames__(ion_hierarchy): - return ion_hierarchy.get('order') - - def __get_ion_headers_grouped__(self): - mapping_dict = self.__get_levelname_mapping_dict(self._ion_hierarchy) - return [mapping_dict.get(x) for x in self._included_levelnames]#on each level there can be multiple names, so it is a list of lists - - @staticmethod - def __get_levelname_mapping_dict(ion_hierarchy): - return ion_hierarchy.get('mapping') - - def __get_ion_headers__(self): - return list(itertools.chain(*self._ion_headers_grouped)) - - - def __get_relevant_headers__(self): - relevant_headers = self._numeric_headers+self._ion_headers + [self._table_info._sample_column] - return self.__remove_possible_none_values_from_list__(relevant_headers) - - @staticmethod - def __remove_possible_none_values_from_list__(list): - return [x for x in list if x is not None] - - def __get_numeric_headers__(self): - df_sample = pd.read_csv(self._table_info._input_file, sep = self._table_info._sep, decimal = self._table_info._decimal, encoding='latin1', nrows=3000) #sample 3000 rows from the df to assess the types of each row - df_sample = df_sample.replace({False: 0, True: 1}) - numeric_headers = 
list(df_sample.select_dtypes(include=np.number).columns) - numeric_headers = AcquisitionTableHeaderFilter().filter_numeric_headers_if_specified(input_type = self._table_info._input_type, numeric_headers = numeric_headers) - return numeric_headers - - -class AcquisitionTableOutputPaths(): - def __init__(self, table_info): - self._table_info = table_info - self.output_file_name = self.__get_output_file_name__() - self.method_parameters_yaml_path = self.__get_method_parameters_yaml_path__() - self.ml_file_accession_in_yaml = "ml_input_file" - - def __get_output_file_name__(self): - old_file_name = self._table_info._input_file - new_file_name = old_file_name+self._table_info._file_ending_of_formatted_table - return new_file_name - - def __get_method_parameters_yaml_path__(self): - return f"{self._table_info._results_dir}/aq_parameters.yaml" - - -class AcquisitionTableReformater(LongTableReformater): - def __init__(self, table_infos, header_infos, samples, dataframe_already_preformated = False): - - LongTableReformater.__init__(self, table_infos._input_file) - self._table_infos = table_infos - self._header_infos = header_infos - self._samples = samples - self._dataframe_already_preformated = dataframe_already_preformated - - #set the two functions that specify the explicit reformatting - self._reformatting_function = self.__reformatting_function__ - self._iterator_function = self.__initialize_iterator_with_specified_columns__ - - def __reformatting_function__(self, input_df_subset): - input_df_subset = input_df_subset.drop_duplicates() - input_df_subset = self.__filter_reformated_df_if_necessary__(input_df_subset) - if not self._dataframe_already_preformated: - input_df_subset = add_merged_ionnames(input_df_subset, self._header_infos._included_levelnames, self._header_infos._ion_headers_grouped, None, None) - return input_df_subset - - def __filter_reformated_df_if_necessary__(self, reformatted_df): - if 'spectronaut' in self._table_infos._input_type or 'diann' in self._table_infos._input_type: - return self.__filter_reformatted_dataframe_to_relevant_samples__(reformatted_df) - else: - return reformatted_df - - def __filter_reformatted_dataframe_to_relevant_samples__(self, input_df_subset): - return input_df_subset[[x in self._samples for x in input_df_subset[self._table_infos._sample_column]]] - - def __initialize_iterator_with_specified_columns__(self): - cols_to_use = self.__get_cols_to_use__() - return pd.read_csv(self._table_infos._input_file, sep = self._table_infos._sep, decimal=self._table_infos._decimal, usecols = cols_to_use, encoding ='latin1', chunksize=1000000) - - def __get_cols_to_use__(self): - cols_to_use = self._header_infos._relevant_headers - if self._dataframe_already_preformated: - return cols_to_use+[config.QUANT_ID] - else: - return cols_to_use - - - - -class AcquisitionTableHeaderFilter(): - def __init__(self): - self._spectronaut_header_filter = lambda x : (("EG." in x) | ("FG." 
in x)) and ("Global" not in x) - self._maxquant_header_filter = lambda x : ("Intensity" not in x) and ("Experiment" not in x) - - def filter_numeric_headers_if_specified(self, input_type, numeric_headers): - if 'spectronaut' in input_type: - return [x for x in numeric_headers if self._spectronaut_header_filter(x)] - elif 'maxquant' in input_type: - return [x for x in numeric_headers if self._maxquant_header_filter(x)] - else: - return numeric_headers - - - - -# %% ../nbdev_nbs/04_utils.ipynb 52 -def merge_acquisition_df_parameter_df(acquisition_df, parameter_df, groupby_merge_type = 'mean'): - """acquisition df contains details on the acquisition, parameter df are the parameters derived from the tree - """ - merged_df = parameter_df.merge(acquisition_df, how = 'left', on = config.QUANT_ID) - if groupby_merge_type == 'mean': - merged_df = merged_df.groupby(config.QUANT_ID).mean().reset_index() - if groupby_merge_type == 'min': - merged_df = merged_df.groupby(config.QUANT_ID).min().reset_index() - if groupby_merge_type == 'max': - merged_df = merged_df.groupby(config.QUANT_ID).max().reset_index() - merged_df = merged_df.dropna(axis=1, how='all') - return merged_df From cc7f58ea1052220b5f8a71c5a58a800f750d2150 Mon Sep 17 00:00:00 2001 From: ammarcsj <70114795+ammarcsj@users.noreply.github.com> Date: Thu, 10 Oct 2024 17:47:31 +0200 Subject: [PATCH 2/5] add parquet compatible pandas reader --- directlfq/utils_fileread.py | 50 +++++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) create mode 100644 directlfq/utils_fileread.py diff --git a/directlfq/utils_fileread.py b/directlfq/utils_fileread.py new file mode 100644 index 0000000..98a2e79 --- /dev/null +++ b/directlfq/utils_fileread.py @@ -0,0 +1,50 @@ +import os +import pathlib +import logging +import pyarrow.parquet +import pandas as pd + +if "__file__" in globals(): + INTABLE_CONFIG = os.path.join(pathlib.Path(__file__).parent.absolute(), "configs", "intable_config.yaml") #the yaml config is located one directory below the python library files + CONFIG_PATH = os.path.join(pathlib.Path(__file__).parent.absolute(), "configs") + +LOGGER = logging.getLogger(__name__) + + + + + +def read_file_with_pandas(input_file, decimal='.', usecols=None, chunksize=None, sep = None): + filename = str(input_file) + if '.parquet' in filename: + return read_parquet_file(input_file, usecols=usecols, chunksize=chunksize) + else: + if sep is None: + if '.csv' in filename: + sep=',' + elif '.tsv' in filename: + sep='\t' + else: + sep='\t' + LOGGER.info(f"neither of the file extensions (.tsv, .csv) detected for file {input_file}! Trying with tab separation. 
In the case that it fails, please provide the correct file extension") + return pd.read_csv(input_file,sep=sep, decimal=decimal, usecols=usecols, encoding='latin1', chunksize=chunksize) + + +def read_parquet_file(input_file, usecols=None, chunksize=None): + if chunksize is not None: + return read_parque_file_chunkwise(input_file, usecols=usecols, chunksize=chunksize) + else: + return pd.read_parquet(input_file, columns=usecols) + +def read_parque_file_chunkwise(input_file, usecols=None, chunksize=None): + parquet_file = pyarrow.parquet.ParquetFile(input_file) + for batch in parquet_file.iter_batches(columns=usecols, batch_size=chunksize): + yield batch.to_pandas() + + +def read_columns_from_file(file, sep="\t"): + if file.endswith(".parquet"): + parquet_file = pyarrow.parquet.ParquetFile(file) + return parquet_file.schema.names + else: + return pd.read_csv(file, sep=sep, nrows=1).columns.tolist() \ No newline at end of file From 9d37493dbc353752cb3ac0ea0a8810e475946a20 Mon Sep 17 00:00:00 2001 From: ammarcsj <70114795+ammarcsj@users.noreply.github.com> Date: Thu, 10 Oct 2024 17:47:48 +0200 Subject: [PATCH 3/5] use parquet compatible pandas readers --- directlfq/utils.py | 25 ++++++------------------- 1 file changed, 6 insertions(+), 19 deletions(-) diff --git a/directlfq/utils.py b/directlfq/utils.py index ca042a1..3590f97 100644 --- a/directlfq/utils.py +++ b/directlfq/utils.py @@ -1,5 +1,4 @@ -# %% ../nbdev_nbs/04_utils.ipynb 2 import os import pathlib if "__file__" in globals():#only run in the translated python file, as __file__ is not defined with ipython @@ -8,8 +7,8 @@ import logging import directlfq.config as config +import directlfq.utils_fileread as utils_fileread -#config.setup_logging() LOGGER = logging.getLogger(__name__) @@ -251,7 +250,7 @@ def get_standard_columns_for_input_type(input_type): return [] def filter_columns_to_existing_columns(columns, input_file): - existing_columns = pd.read_csv(input_file, sep='\t', nrows=1).columns + existing_columns = utils_fileread.read_columns_from_file(input_file) return [x for x in columns if x in existing_columns] @@ -576,7 +575,7 @@ def reformat_and_write_longtable_according_to_config(input_file, outfile_name, c os.remove(outfile_name) relevant_cols = get_relevant_columns_config_dict(config_dict_for_type) - input_df_it = read_file_with_pandas(input_file=input_file, sep=sep, decimal=decimal, usecols=relevant_cols, chunksize=chunksize) + input_df_it = utils_fileread.read_file_with_pandas(input_file=input_file, sep=sep, decimal=decimal, usecols=relevant_cols, chunksize=chunksize) input_df_list = [] header = True for input_df_subset in input_df_it: @@ -748,20 +747,6 @@ def check_for_processed_runs_in_results_folder(results_folder): return contained_condpairs -def read_file_with_pandas(input_file, decimal='.', usecols=None, chunksize=None, nrows=None, sep = None): - filename = str(input_file) - if '.parquet' in filename: - return pd.read_parquet(input_file, columns=usecols, chunksize=chunksize, nrows=nrows) - else: - if sep is None: - if '.csv' in filename: - sep=',' - elif '.tsv' in filename: - sep='\t' - else: - sep='\t' - LOGGER.info(f"neither of the file extensions (.tsv, .csv) detected for file {input_file}! Trying with tab separation. 
In the case that it fails, please provide the correct file extension") - return pd.read_csv(input_file,sep=sep, decimal=decimal, usecols=usecols, encoding='latin1', chunksize=chunksize, nrows=nrows) @@ -850,9 +835,11 @@ def get_input_type_and_config_dict(input_file, input_type_to_use = None): sep='\t' if '.txt' in filename: sep='\t' + else: + sep="\t" - uploaded_data_columns = set(read_file_with_pandas(input_file, nrows=1).columns) + uploaded_data_columns = utils_fileread.read_columns_from_file(input_file, sep=sep) for input_type in type2relevant_columns.keys(): if (input_type_to_use is not None) and (input_type!=input_type_to_use): From c18da0291b02e2cd73bc5ce30ba0f82189fecb47 Mon Sep 17 00:00:00 2001 From: ammarcsj <70114795+ammarcsj@users.noreply.github.com> Date: Thu, 10 Oct 2024 17:59:40 +0200 Subject: [PATCH 4/5] add parquet check --- ...un_pipeline_w_different_input_formats.ipynb | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/tests/quicktests/run_pipeline_w_different_input_formats.ipynb b/tests/quicktests/run_pipeline_w_different_input_formats.ipynb index 9a6d1f1..6962a37 100644 --- a/tests/quicktests/run_pipeline_w_different_input_formats.ipynb +++ b/tests/quicktests/run_pipeline_w_different_input_formats.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -17,6 +17,18 @@ "quicktest_folders = [quicktest_folder_mq_peptides, quicktest_folder_mq_evidence, quicktest_folder_diann, quicktest_folder_spectronaut]" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import directlfq.lfq_manager as lfq_manager\n", + "diann_quicktest_file_parquet = f\"{quicktest_folder_diann}/shortened_input.parquet\"\n", + "\n", + "lfq_manager.run_lfq(diann_quicktest_file_parquet, input_type_to_use='diann_precursor_ms1_and_ms2', num_cores=1)" + ] + }, { "cell_type": "code", "execution_count": null, @@ -27,13 +39,15 @@ "import directlfq.lfq_manager as lfq_manager\n", "import pandas as pd\n", "diann_quicktest_file = f\"{quicktest_folder_diann}/shortened_input.tsv\"\n", + "diann_quicktest_file_parquet = f\"{quicktest_folder_diann}/shortened_input.parquet\"\n", "filterdict_file = f\"{quicktest_folder_diann}/filterdict.yaml\"\n", "filter_dict={'protein_filt': {'param': 'Lib.PG.Q.Value', 'comparator': '<=', 'value': 0.01},\n", " 'peptide_filt' :{'param' : 'Lib.Q.Value','comparator': '<=', 'value': 0.01}}\n", "if __name__ == '__main__': \n", " lfq_manager.run_lfq(diann_quicktest_file, filter_dict=filterdict_file, num_cores=1)\n", " lfq_manager.run_lfq(diann_quicktest_file, input_type_to_use='diann_peptide_based_on_precursor_ms1_and_ms2', filter_dict=filter_dict)\n", - " lfq_manager.run_lfq(diann_quicktest_file, input_type_to_use='diann_precursor_ms1_and_ms2')\n" + " lfq_manager.run_lfq(diann_quicktest_file, input_type_to_use='diann_precursor_ms1_and_ms2')\n", + " lfq_manager.run_lfq(diann_quicktest_file_parquet, input_type_to_use='diann_precursor_ms1_and_ms2')\n" ] }, { From 1c0906616753a80e9cb5a225ec16effbd69b47e2 Mon Sep 17 00:00:00 2001 From: ammarcsj <70114795+ammarcsj@users.noreply.github.com> Date: Fri, 11 Oct 2024 11:30:24 +0200 Subject: [PATCH 5/5] add pyarrow dependency --- requirements/requirements.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/requirements/requirements.txt b/requirements/requirements.txt index e3a8ccd..b681010 100644 --- a/requirements/requirements.txt +++ 
b/requirements/requirements.txt @@ -5,4 +5,5 @@ dask==2023.1.0 numba==0.56.4 multiprocess==0.70.14 wget==3.2 -PyYAML==6.0.2 \ No newline at end of file +PyYAML==6.0.2 +pyarrow==17.0.0 \ No newline at end of file
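Usage sketch for the new reader (illustrative only; file names and chunk size are placeholders): when a chunksize is passed, utils_fileread.read_file_with_pandas returns an iterator of DataFrame chunks for both delimited text (pandas' chunked read_csv) and parquet (pyarrow.parquet.ParquetFile.iter_batches inside read_parque_file_chunkwise), so the consuming loop in reformat_and_write_longtable_according_to_config stays format-agnostic.

    import directlfq.utils_fileread as utils_fileread

    # delimited text: pandas returns a chunked reader, each iteration yields a DataFrame of up to 1000 rows
    for chunk in utils_fileread.read_file_with_pandas("shortened_input.tsv", sep="\t", chunksize=1000):
        print(chunk.shape)

    # parquet: the same loop works unchanged; sep is ignored on this code path
    for chunk in utils_fileread.read_file_with_pandas("shortened_input.parquet", chunksize=1000):
        print(chunk.shape)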
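The quicktest added in [PATCH 4/5] points lfq_manager.run_lfq at shortened_input.parquet next to the existing shortened_input.tsv, but the parquet file itself does not appear to be added by the series, so it has to be produced once. A minimal conversion sketch, assuming quicktest_folder_diann holds the same folder path the notebook defines (the value below is a placeholder):

    import pandas as pd

    quicktest_folder_diann = "<path to the DIA-NN quicktest folder>"  # placeholder, use the notebook's value
    df = pd.read_csv(f"{quicktest_folder_diann}/shortened_input.tsv", sep="\t")
    # to_parquet uses the pyarrow engine pinned in requirements.txt by [PATCH 5/5] (pyarrow==17.0.0)
    df.to_parquet(f"{quicktest_folder_diann}/shortened_input.parquet", index=False)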