diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 3a5546a..8612603 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -12,7 +12,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: ["3.8", "3.9"] + python-version: ["3.8", "3.9", "3.10"] steps: - uses: actions/checkout@v2 diff --git a/moonstone/analysis/differential_analysis.py b/moonstone/analysis/differential_analysis.py index 9387200..66ea936 100644 --- a/moonstone/analysis/differential_analysis.py +++ b/moonstone/analysis/differential_analysis.py @@ -78,7 +78,7 @@ def test_multiple_features(self, feature, test_to_use): list_ofgroups = [] for variable in variable_dic: list_ofgroups.append(variable_dic[variable][self.full_table.columns[family]]) - #test = self.tests_functions_used[test_to_use](*np.asarray(list_ofgroups)) + # test = self.tests_functions_used[test_to_use](*np.asarray(list_ofgroups)) test = self.tests_functions_used[test_to_use](*list_ofgroups) # works for kruskal and one way anova features.append(feature) taxons.append(self.full_table.columns[family]) diff --git a/moonstone/analysis/diversity/base.py b/moonstone/analysis/diversity/base.py index 97bfa1c..dc11c2b 100644 --- a/moonstone/analysis/diversity/base.py +++ b/moonstone/analysis/diversity/base.py @@ -1,12 +1,13 @@ import logging import re from abc import ABC, abstractmethod -import skbio +from numbers import Real from string import capwords from typing import Union import numpy as np import pandas as pd +import skbio from statsmodels.stats.multitest import multipletests from moonstone.analysis.statistical_test import statistical_test_groups_comparison @@ -16,6 +17,8 @@ from moonstone.plot.graphs.heatmap import HeatmapGraph from moonstone.plot.graphs.histogram import Histogram from moonstone.plot.graphs.violin import GroupViolinGraph, ViolinGraph +from moonstone.utils.dict_operations import merge_dict + logger = logging.getLogger(__name__) 
def _valid_pval_param(self, pval_to_compute, pval_to_display):
    """
    Validate the `pval_to_compute` / `pval_to_display` pair.

    Accepted values are listed in `choices`, from the widest set of comparisons ("all") to the
    narrowest (None). A p-value can only be displayed if it has been computed, so
    `pval_to_display` may not come before `pval_to_compute` in that list.

    Args:
        pval_to_compute: which comparisons to compute; an unknown value is replaced by "all"
            (a warning is logged).
        pval_to_display: which significant comparisons to draw; an unknown value is replaced by None
            (a warning is logged).

    Returns:
        The validated (pval_to_compute, pval_to_display) tuple.

    Raises:
        ValueError: if `pval_to_display` is valid but asks for more comparisons than `pval_to_compute`.
    """
    choices = [
        "all", "same group_col or group_col2 values", "same group_col values", None
    ]
    # position of each choice in the list above: the smaller, the more comparisons
    rank = {choice: position for position, choice in enumerate(choices)}

    if pval_to_compute not in choices:
        logger.warning("pval_to_compute='%s' not valid, set to default (all).", pval_to_compute)
        pval_to_compute = "all"

    if pval_to_display not in choices:
        logger.warning("pval_to_display='%s' not valid, set to default (None).", pval_to_display)
        pval_to_display = None
    elif rank[pval_to_display] < rank[pval_to_compute]:
        raise ValueError(
            "pval_to_display='{}' not valid, when pval_to_compute='{}'. "
            "pval_to_display should be set to: {}".format(
                pval_to_display, pval_to_compute, choices[rank[pval_to_compute]:]
            )
        )

    return pval_to_compute, pval_to_display


def _valid_correction_method_param(self, correction_method):
    if correction_method == "uncorrected":
        return None
    return correction_method


def _pval_selection(
    self, pval_series: pd.Series, groups: list,
    threshold: float = 0.05
) -> pd.Series:
    """
    To select the p-values to display: the significant p-values, meaning the p-values under a given
    threshold, comparing two groups that are both displayed.

    Args:
        pval_series: series of all the p-values computed (two index levels = the two groups compared).
        groups: list of groups displayed in graph. If None, no filtering on the groups is done.
        threshold: the significance threshold. It must be between 0 and 1. Default is 0.05.

    Returns:
        The filtered series, in its original order.
    """
    significant = pval_series[pval_series < threshold]
    if groups is None:
        return significant
    first = significant.index.get_level_values(0)
    second = significant.index.get_level_values(1)
    return significant[first.isin(groups) & second.isin(groups)]


def _pval_selection_with_group_col2(
    self, pval_series: pd.Series, final_groups: list,
    pval_to_compute: str, pval_to_display: str,
    threshold: float = 0.05
) -> pd.Series:
    """
    To select the p-values to display when the group_col2 argument is being used:
    the significant p-values (under `threshold`) comparing two displayed groups, optionally
    narrowed down to the type of comparison asked with `pval_to_display`.

    Args:
        pval_series: series of all the p-values computed.
        final_groups: list of all the combinations displayed in graph: "{group_col value} - {group_col2 value}".
        pval_to_compute: type of comparisons that have been computed.
        pval_to_display: type of comparisons to display.
        threshold: the significance threshold. It must be between 0 and 1. Default is 0.05.

    Returns:
        The filtered series, in its original order.
    """
    # Reminder:
    # 1) This method is called only if pval_to_display is not None.
    #    So pval_to_compute/pval_to_display =
    #    {"all", "same group_col or group_col2 values", "same group_col values"}
    # 2) Index values follow this pattern: "{group_col value} - {group_col2 value}"
    pval_series = self._pval_selection(pval_series, final_groups, threshold)

    # extra filtering is only needed when more comparisons were computed than should be displayed
    only_same_group_col = (
        pval_to_compute != "same group_col values"
        and pval_to_display == "same group_col values"
    )
    same_group_col_or_group_col2 = (
        pval_to_compute == "all"
        and pval_to_display == "same group_col or group_col2 values"
    )
    if not (only_same_group_col or same_group_col_or_group_col2):
        return pval_series

    first = pval_series.index.get_level_values(0)
    second = pval_series.index.get_level_values(1)
    # first part of the index values is the same = same group_col value
    same_group_col = (
        first.map(lambda name: name.split(" - ")[0])
        == second.map(lambda name: name.split(" - ")[0])
    )
    if only_same_group_col:
        return pval_series[same_group_col]

    # second part of the index values is the same = same group_col2 value
    same_group_col2 = (
        first.map(lambda name: name.split(" - ")[1])
        == second.map(lambda name: name.split(" - ")[1])
    )
    return pval_series[same_group_col | same_group_col2]


def _order_pval_series(
    self, pval_series: pd.Series, groups: list, **kwargs
) -> pd.Series:
    """
    To order a series of p-values following the order of the groups displayed in the graph.

    Inside each pair, the group displayed first is put as first member
    (e.g. "Group B - Group A" becomes "Group A - Group B"); the pairs are then sorted
    by first member, and then by second member.

    Args:
        pval_series: series of p-values; MultiIndex series whose 2 levels are the 2 groups compared.
        groups: ordered list of the groups displayed in graph.
        dic_gps (kwargs): optional dictionary {group: position}. Built from `groups` if not given.

    Returns:
        The ordered series; both index levels are ordered categoricals whose categories are `groups`.

    Raises:
        KeyError: if a group found in the index is missing from `dic_gps`.
    """
    dic_gps = kwargs.pop("dic_gps", None)
    if not dic_gps:
        dic_gps = {group: position for position, group in enumerate(groups)}

    names = pval_series.index.names  # default: ["Group1", "Group2"]
    frame = pval_series.reset_index()
    # reset_index puts the values in the last column: named 0 for an unnamed series,
    # or after the series otherwise (no assumption made on the name of the series)
    value_col = frame.columns[-1]

    firsts, seconds = [], []
    for first, second in zip(frame[names[0]], frame[names[1]]):
        if dic_gps[first] > dic_gps[second]:
            first, second = second, first  # the group displayed first becomes first member
        firsts.append(first)
        seconds.append(second)

    # ordered categoricals so that sorting follows the order of `groups` (and not alphabetical order)
    frame[names[0]] = pd.Categorical(firsts, categories=groups, ordered=True)
    frame[names[1]] = pd.Categorical(seconds, categories=groups, ordered=True)
    frame = frame.sort_values([names[0], names[1]])  # we sort by 1st member, and then 2nd member
    return frame.set_index([names[0], names[1]])[value_col]


def _generate_ordered_final_groups(
    self, metadata_df: pd.DataFrame, final_group_col: str, group_col: str, group_col2: str,
    groups: list, groups2: list
) -> list:
    """
    To order the values from final_group_col
    (e.g. the combined names of group_col and group_col2: "{group_col value} - {group_col2 value}")
    as they should be displayed in the graph:
    following first the order commanded by groups, and then the order commanded by groups2.

    Args:
        metadata_df: dataframe containing metadata and information to group the data.
        final_group_col: column generated from concatenating group_col and group_col2
            (e.g. "{group_col value} - {group_col2 value}")
        group_col: column from metadata_df used to group the data
        group_col2: column from metadata_df used to further divide the data
        groups: ordered list of groups from group_col to display in graph.
        groups2: ordered list of groups from group_col2 to display in graph.

    Returns:
        Ordered list of the final_group_col values to display. Combinations whose group_col
        (resp. group_col2) value is not listed in groups (resp. groups2) are left out.
    """
    # copy() to avoid raising SettingWithCopyWarning (and to leave metadata_df untouched)
    combinations = metadata_df.drop_duplicates(subset=[final_group_col]).copy()
    for column, order in ((group_col, groups), (group_col2, groups2)):
        if order:
            # values missing from `order` become NaN and are dropped right below
            combinations[column] = (
                combinations[column].astype("category").cat.set_categories(order, ordered=True)
            )
    combinations = combinations.dropna(how="any", subset=[group_col, group_col2])
    combinations = combinations.sort_values([group_col, group_col2])
    return list(combinations[final_group_col])


def _generate_shapes_annotations_lists(
    self, pval_series: pd.Series, groups: list, hgt_min: Real
) -> tuple:
    """
    To generate the shapes (brackets) and annotations (stars) representing significant p-values.
    Method for group_col only (not group_col2).

    Args:
        pval_series: series of the p-values to put on the graph (needs to be filtered beforehand
            so that it only contains significant p-values).
        groups: ordered list of groups displayed in graph.
        hgt_min: height above which the brackets are drawn (e.g. highest value displayed).

    Returns:
        (list of shapes, list of annotations, number of levels of brackets), the two lists being
        dictionaries of plotly layout properties.
    """
    # Overview of this method: we generate the shapes (1 bracket = 3 lines =
    # 1 going from Group1 to Group2 and 2 small lines to form the edges of the bracket)
    # and the annotations ('*' or '**' on top of the bracket).
    # To ensure that the brackets don't overlap, we keep a table named `level`
    # with a number of columns equal to the number of groups and an expandable number of rows.
    # 1 in a cell means that there is already a bracket at this location.
    dic_gps = {group: position for position, group in enumerate(groups)}

    # The p-values need to be ordered so that looking at the cell corresponding to the left edge
    # of the bracket is enough to determine if there is already a bracket there.
    # dic_gps has to be given as a keyword: _order_pval_series only accepts it through **kwargs.
    pval_series = self._order_pval_series(pval_series, groups, dic_gps=dic_gps)

    det = (hgt_min/15)  # unit of height used to space the brackets, proportional to the data

    hgt_min += det/2
    fontsize = int(12+det)
    linewidth = 0.5+0.15*det

    level = [[0] * len(groups)]
    list_shapes = []
    list_annotations = []

    for ind, val in pval_series.items():
        left_ind = dic_gps[ind[0]]
        right_ind = dic_gps[ind[1]]

        # lowest level where the left edge is free
        for i, row in enumerate(level):
            if row[left_ind] == 0:
                break
        else:
            # every level is taken: we need to add another one
            row = [0] * len(groups)
            level.append(row)
            i = len(level) - 1
        row[left_ind:right_ind+1] = [1]*(right_ind-left_ind+1)

        y_bracket = hgt_min+(i*det/2)
        list_shapes += [
            {'x0': left_ind, 'y0': y_bracket,
             'x1': right_ind, 'y1': y_bracket,
             'line': dict(width=linewidth)},  # from Group1 to Group2
            {'x0': left_ind, 'y0': y_bracket-0.15*det,
             'x1': left_ind, 'y1': y_bracket,
             'line': dict(width=linewidth)},  # left edge of the bracket
            {'x0': right_ind, 'y0': y_bracket-0.15*det,
             'x1': right_ind, 'y1': y_bracket,
             'line': dict(width=linewidth)},  # right edge of the bracket
        ]
        list_annotations.append({
            'text': '**' if val < 0.01 else '*', "font": dict(size=fontsize),
            'x': (left_ind+right_ind)/2,
            'y': y_bracket+0.15*det, 'showarrow': False,
        })

    return list_shapes, list_annotations, len(level)
'y0':hgt_min+i+0.35, 'line':dict(width=1)} + ] + if val < 0.01: + list_annotations += [{'text':'**','x':(dic_middle[ind[0]]+dic_middle[ind[1]])/2, + 'y':hgt_min+i+0.65, 'showarrow':False}] + else: + list_annotations += [{'text':'*','x':(dic_middle[ind[0]]+dic_middle[ind[1]])/2, + 'y':hgt_min+i+0.65, 'showarrow':False}] + y = 1 + break + if y == 0: + i += 1 + level += [[0] * (len(dic_supergps))] + level[i][dic_supergps[ind[0]]:dic_supergps[ind[1]]+1] = \ + [1]*(dic_supergps[ind[1]]-dic_supergps[ind[0]]+1) # or ind + list_shapes += [ + {'x0':dic_middle[ind[0]], 'y0':hgt_min+i+0.5, + 'x1':dic_middle[ind[1]], 'y1':hgt_min+i+0.5, 'line':dict(width=1)}, + {'x0':dic_middle[ind[0]], 'x1':dic_middle[ind[0]], 'y1':hgt_min+i+0.5, + #'y0':hgt_min, 'line':dict(width=1, dash="dot")}, + 'y0':hgt_min+i+0.35, 'line':dict(width=1)}, + {'x0':dic_middle[ind[1]], 'x1':dic_middle[ind[1]], 'y1':hgt_min+i+0.5, + #'y0':hgt_min, 'line':dict(width=1, dash="dot")} + 'y0':hgt_min+i+0.35, 'line':dict(width=1)} + ] + if val < 0.01: + list_annotations += [{'text':'**','x':(dic_middle[ind[0]]+dic_middle[ind[1]])/2, + 'y':hgt_min+i+0.65, 'showarrow':False}] + else: + list_annotations += [{'text':'*','x':(dic_middle[ind[0]]+dic_middle[ind[1]])/2, + 'y':hgt_min+i+0.65, 'showarrow':False}] + + return list_shapes, list_annotations, len(level) + """ + # for now, method above and below in different methods + # def _generate_dic_shapes_and_annotations( + # self, pval_series, dic_lev, hgt_min + # ): + # list_shapes={} + # dic_annotations={} + # for ind, val in pval_series.items(): + def _compute_pval_inside_subgroups( self, diversity_index_dataframe: pd.DataFrame, group_col: str, final_group_col: str, stats_test: str, correction_method: str, structure_pval: str, sym: bool - ): + ) -> pd.Series: pval = pd.Series([], dtype='float64') for g in diversity_index_dataframe[group_col].dropna().unique(): df_gp = diversity_index_dataframe[diversity_index_dataframe[group_col] == g] @@ -267,7 +599,7 @@ def 
analyse_groups( plotting_options: dict = None, stats_test: str = 'mann_whitney_u', correction_method: str = None, structure_pval: str = 'dataframe', sym: bool = True, - pval_to_compute: bool = 'all', + pval_to_compute: bool = 'all', pval_to_display: str = None, show_pval: bool = True, output_pval_file: str = False, **kwargs ) -> dict: @@ -280,7 +612,7 @@ def analyse_groups( :param groups: specifically select groups to display among group_col :param groups2: specifically select groups to display among group_col2 :param show: also visualize - :param show_pval: visualize p-values + :param show_pval: visualize p-values's heatmap :param output_file: file path to output your html graph :param make_graph: whether or not to make the graph :param plotting_options: plotly plotting_options @@ -293,10 +625,14 @@ def analyse_groups( :param pval_to_compute: if group_col2 used, problems of memory or in maximum recursion depth may occur. In this case, you may want to compute only p-values of specific comparisons. {"all" (default), None, "same group_col values", "same group_col or group_col2 values"} + :param pval_to_display: whether you want the significant pvalues displayed on the graph ("all") or not (None) + When group_col2 is used you may want to specify which type of comparisons you want to display the + significant pvalues of. Otherwise the graph can appear crowded by pvalues lines. 
+ {None (default), "all", "same group_col values", "same group_col or group_col2 values"} """ filtered_metadata_df = self._get_filtered_df_from_metadata(metadata_df) - pval_to_compute = self._valid_pval_param(pval_to_compute) + pval_to_compute, pval_to_display = self._valid_pval_param(pval_to_compute, pval_to_display) correction_method = self._valid_correction_method_param(correction_method) if group_col2: @@ -310,6 +646,13 @@ def analyse_groups( ) df = self._get_grouped_df(filtered_metadata_df[[group_col, group_col2, final_group_col]]) + if pval_to_display and (groups or groups2): + # list and sort all final_groups possibles respecting the order given by + # groups first and then by groups2 + final_groups = self._generate_ordered_final_groups( + df, final_group_col, group_col, group_col2, groups, groups2 + ) + if pval_to_compute == "all": pval = self._run_statistical_test_groups( df, final_group_col, stats_test, correction_method, structure_pval, sym @@ -328,6 +671,9 @@ def analyse_groups( ) ]) + if pval_to_display: + to_display = self._pval_selection_with_group_col2(pval, final_groups, pval_to_compute, pval_to_display) + else: df = self._get_grouped_df(filtered_metadata_df[group_col]) pval = self._run_statistical_test_groups( @@ -335,6 +681,32 @@ def analyse_groups( ) # pval is in the right structure to be returned + final_groups = groups # to remove from here later on + + if pval_to_display: + final_groups = groups + to_display = self._pval_selection(pval, groups) + + if pval_to_display and to_display.empty: # nothing to display + pval_to_display = None + + if pval_to_display: + hgt_min = df[self.DIVERSITY_INDEXES_NAME].max() # + 0.5 + list_shapes, list_annotations, nb_lev = self._generate_shapes_annotations_lists( + to_display, final_groups, hgt_min + ) + if not plotting_options: + plotting_options = {} + det = (hgt_min/15) + hgt_min += det/2 + # nblevels=to_display.shape[0] + plotting_options = merge_dict(plotting_options, { + 'layout': { + 'shapes': 
list_shapes, # we should had to previous list if there is one + 'annotations': list_annotations, # idem + } + }) + self.last_grouped_df = df self.report_data['analyse_groups'] = { 'pval': pval, diff --git a/moonstone/plot/counts.py b/moonstone/plot/counts.py index c31b364..74b753b 100644 --- a/moonstone/plot/counts.py +++ b/moonstone/plot/counts.py @@ -70,8 +70,8 @@ def plot_mean_distribution( mean_series = self.df.mean(axis=1) binned_mean = SeriesBinning(mean_series).binned_data - bar_fig = BarGraph(binned_mean, plotting_options, show=show, output_file=output_file) - bar_fig.plot_one_graph(plotting_options, show=show, output_file=output_file) + bar_fig = BarGraph(binned_mean) + bar_fig.plot_one_graph(plotting_options=plotting_options, show=show, output_file=output_file) class PlotTaxonomyCounts: diff --git a/moonstone/plot/graphs/base.py b/moonstone/plot/graphs/base.py index 5183fdc..df715ac 100644 --- a/moonstone/plot/graphs/base.py +++ b/moonstone/plot/graphs/base.py @@ -17,18 +17,9 @@ class BaseGraph(ABC): def __init__( self, data: Union[pd.Series, pd.DataFrame], - plotting_options: dict = None, - show: bool = True, - output_file: Union[bool, str] = False, ): """ :param data: data to plot - :param show: set to False if you don't want to show the plot - :param output_file: name of the output file - :param plotting_options: options of plotting that will override the default setup \n - [!] 
Make sure the value given to an argument is of the right type \n - options allowed : 'log': `bool` ; 'colorbar': `[str, List[str]]` ; - 'tickangle': `[int, float]` """ self.data = data @@ -236,7 +227,7 @@ def plot_one_graph( if groups: filtered_df = self.data[self.data[group_col].isin(groups)] filtered_df[group_col] = filtered_df[group_col].astype("category") - filtered_df[group_col].cat = filtered_df[group_col].cat.set_categories(groups) + filtered_df[group_col] = filtered_df[group_col].cat.set_categories(groups, ordered=True) filtered_df = filtered_df.sort_values([group_col]) else: filtered_df = copy.deepcopy(self.data) diff --git a/setup.py b/setup.py index 93e62f2..8ad867e 100644 --- a/setup.py +++ b/setup.py @@ -9,19 +9,19 @@ author='Kenzo-Hugo Hillion, Agnès Baud, Mariela Furstenheim, Sean Kennedy', author_email='kehillio@pasteur.fr', install_requires=[ - 'pandas==2.0.2', - 'matplotlib==3.3.0', - 'plotly==5.17.0', - 'statsmodels==0.14.0', - 'python-slugify==4.0.1', - 'pyaml==20.4.0', - 'numpy==1.24.3', - 'scikit-bio==0.5.9', - 'scikit-learn==1.3.1', - 'hdmedians==0.14.2', - 'cython==0.29.21', - 'scipy==1.9.0' + 'pandas>=2.0.2', + 'matplotlib>=3.3.0', + 'plotly>=5.17.0', + 'statsmodels>=0.14.0', + 'python-slugify>=4.0.1', + 'pyaml>=20.4.0', + 'numpy>=1.24.3', + 'scikit-bio>=0.5.9', + 'scikit-learn>=1.3.1', + 'hdmedians>=0.14.2', + 'cython>=0.29.21', + 'scipy>=1.9.0' ], packages=find_packages(), entry_points={'console_scripts': ['moonstone=moonstone.main:run']}, -) \ No newline at end of file +) diff --git a/tests/analysis/diversity/test_alpha.py b/tests/analysis/diversity/test_alpha.py index 0eb1ed4..a70821d 100644 --- a/tests/analysis/diversity/test_alpha.py +++ b/tests/analysis/diversity/test_alpha.py @@ -107,10 +107,21 @@ def test_invalid_correction_method_param(self): def test_invalid_pval_param(self): tested_object_instance = ShannonIndex(self.tested_object) with self.assertLogs('moonstone.analysis.diversity.base', level='WARNING') as log: - 
tested_object_instance._valid_pval_param("lalala") - self.assertEqual(len(log.output), 1) + tested_object_instance._valid_pval_param("lalala", "lilili") + self.assertEqual(len(log.output), 2) self.assertIn("WARNING:moonstone.analysis.diversity.base:pval_to_compute='lalala' not valid, \ set to default (all).", log.output) + self.assertIn("WARNING:moonstone.analysis.diversity.base:pval_to_display='lilili' not valid, \ +set to default (None).", log.output) + + def test_inconsistent_pval_to_diplay_param(self): + tested_object_instance = ShannonIndex(self.tested_object) + with self.assertRaises(ValueError) as cm: + tested_object_instance._valid_pval_param("same group_col or group_col2 values", "all") + the_exception = cm.exception + expected_msg = "pval_to_display='all' not valid, when pval_to_compute='same group_col or group_col2 values'. \ +pval_to_display should be set to: ['same group_col or group_col2 values', 'same group_col values', None]" + self.assertEqual(the_exception.__str__(), expected_msg) def test_analyse_groups_pval_to_compute_all(self): tested_object_instance = ShannonIndex(self.tested_object) @@ -199,6 +210,300 @@ def test_analyse_groups_pval_to_compute_same_group_col_or_group_col2_values(self ) pd.testing.assert_series_equal(output['pval'], expected_ser, check_dtype=False) + def test_pval_selection(self): + tested_object = pd.Series({ + ('A', 'B'): 0.03, + ('A', 'C'): 0.5, + ('A', 'D'): 0.0014, + ('B', 'C'): 0.2, + ('B', 'D'): 0.001, + ('C', 'D'): 0.00067, + }) + tested_object.index.names = ["Group1", "Group2"] + tested_object_instance = ShannonIndex(self.tested_object) + + expected_ser = pd.Series({ + ('A', 'B'): 0.03, + }) + expected_ser.index.names = ["Group1", "Group2"] + pd.testing.assert_series_equal( + tested_object_instance._pval_selection(tested_object, ['A', 'B', 'C']), + expected_ser + ) + + expected_ser = pd.Series({ + ('A', 'D'): 0.0014, + ('C', 'D'): 0.00067, + }) + expected_ser.index.names = ["Group1", "Group2"] + 
pd.testing.assert_series_equal( + tested_object_instance._pval_selection(tested_object, ['A', 'C', 'D']), + expected_ser + ) + + def test_pval_selection_with_group_col2(self): + # pval_to_compute = 'all' + tested_object = pd.Series({ + ('M - C', 'M - B'): 0.03, + ('M - A', 'F - A'): 0.5, + ('M - C', 'F - A'): 0.0034, + ('F - B', 'M - A'): 0.6, + ('F - C', 'F - B'): 0.0014, + ('F - B', 'M - B'): 0.2, + ('F - A', 'F - B'): 0.001, + ('M - A', 'M - B'): 0.00067, + ('F - A', 'F - C'): 0.0031, + ('M - C', 'F - C'): 0.0003, + ('M - C', 'F - B'): 0.0056, + ('M - A', 'M - C'): 0.89, + ('M - A', 'F - C'): 0.0006, + ('M - B', 'F - C'): 0.0043, + ('F - A', 'M - B'): 0.234, + }) + tested_object.index.names = ["Group1", "Group2"] + tested_object_instance = ShannonIndex(self.tested_object) + + groups = ['M - C', 'F - C', 'M - A', 'F - A', 'M - B', 'F - B'] + + # Case pval_to_display = 'all': only pval > 0.05 removed + pval_to_display = 'all' + expected_ser = pd.Series({ + ('M - C', 'M - B'): 0.03, + ('M - C', 'F - A'): 0.0034, + ('F - C', 'F - B'): 0.0014, + ('F - A', 'F - B'): 0.001, + ('M - A', 'M - B'): 0.00067, + ('F - A', 'F - C'): 0.0031, + ('M - C', 'F - C'): 0.0003, + ('M - C', 'F - B'): 0.0056, + ('M - A', 'F - C'): 0.0006, + ('M - B', 'F - C'): 0.0043, + }) + expected_ser.index.names = ["Group1", "Group2"] + res = tested_object_instance._pval_selection_with_group_col2( + tested_object, groups, 'all', pval_to_display + ) + pd.testing.assert_series_equal(res, expected_ser) + + # Case pval_to_display = 'same group_col or group_col2 values' + pval_to_display = 'same group_col or group_col2 values' + expected_ser = pd.Series({ + ('M - C', 'M - B'): 0.03, + ('F - C', 'F - B'): 0.0014, + ('F - A', 'F - B'): 0.001, + ('M - A', 'M - B'): 0.00067, + ('F - A', 'F - C'): 0.0031, + ('M - C', 'F - C'): 0.0003, + }) + expected_ser.index.names = ["Group1", "Group2"] + res = tested_object_instance._pval_selection_with_group_col2( + tested_object, groups, 'all', pval_to_display + ) + 
pd.testing.assert_series_equal(res, expected_ser) + + # Case pval_to_display = 'same group_col values' + pval_to_display = 'same group_col values' + expected_ser = pd.Series({ + ('M - C', 'M - B'): 0.03, + ('F - C', 'F - B'): 0.0014, + ('F - A', 'F - B'): 0.001, + ('M - A', 'M - B'): 0.00067, + ('F - A', 'F - C'): 0.0031, + }) + expected_ser.index.names = ["Group1", "Group2"] + res = tested_object_instance._pval_selection_with_group_col2( + tested_object, groups, 'all', pval_to_display + ) + pd.testing.assert_series_equal(res, expected_ser) + + # Case final_groups defined + groups = ['M - A', 'F - A', 'M - C', 'F - C'] + expected_ser = pd.Series({ + ('M - C', 'F - A'): 0.0034, + ('F - A', 'F - C'): 0.0031, + ('M - C', 'F - C'): 0.0003, + ('M - A', 'F - C'): 0.0006, + }) + expected_ser.index.names = ["Group1", "Group2"] + res = tested_object_instance._pval_selection_with_group_col2( + tested_object, groups, 'all', 'all' + ) + pd.testing.assert_series_equal(res, expected_ser) + + def test_pval_selection_with_group_col2_pval_to_compute_same_group_col_or_same_group_col2(self): + # pval_to_compute = 'same group_col or group_col2 values' + tested_object = pd.Series({ + ('M - C', 'M - B'): 0.03, + ('M - A', 'F - A'): 0.5, + ('F - C', 'F - B'): 0.0014, + ('F - B', 'M - B'): 0.2, + ('F - A', 'F - B'): 0.001, + ('M - A', 'M - B'): 0.00067, + ('F - A', 'F - C'): 0.0031, + ('F - C', 'M - C'): 0.0003, + ('M - A', 'M - C'): 0.89, + }) + tested_object.index.names = ["Group1", "Group2"] + tested_object_instance = ShannonIndex(self.tested_object) + + groups = ['M - C', 'F - C', 'M - A', 'F - A', 'M - B', 'F - B'] + + # Case pval_to_display = 'same group_col or group_col2 values': only pval > 0.05 removed + pval_to_display = 'same group_col or group_col2 values' + expected_ser = pd.Series({ + ('M - C', 'M - B'): 0.03, + ('F - C', 'F - B'): 0.0014, + ('F - A', 'F - B'): 0.001, + ('M - A', 'M - B'): 0.00067, + ('F - A', 'F - C'): 0.0031, + ('F - C', 'M - C'): 0.0003, + }) + 
expected_ser.index.names = ["Group1", "Group2"] + res = tested_object_instance._pval_selection_with_group_col2( + tested_object, groups, 'same group_col or group_col2 values', pval_to_display + ) + pd.testing.assert_series_equal(res, expected_ser) + + # Case pval_to_display = 'same group_col values' + pval_to_display = 'same group_col values' + expected_ser = pd.Series({ + ('M - C', 'M - B'): 0.03, + ('F - C', 'F - B'): 0.0014, + ('F - A', 'F - B'): 0.001, + ('M - A', 'M - B'): 0.00067, + ('F - A', 'F - C'): 0.0031, + }) + expected_ser.index.names = ["Group1", "Group2"] + res = tested_object_instance._pval_selection_with_group_col2( + tested_object, groups, 'same group_col or group_col2 values', pval_to_display + ) + pd.testing.assert_series_equal(res, expected_ser) + + # Case final_groups defined + groups = ['M - A', 'F - A', 'M - C', 'F - C'] + expected_ser = pd.Series({ + ('F - A', 'F - C'): 0.0031, + ('F - C', 'M - C'): 0.0003, + }) + expected_ser.index.names = ["Group1", "Group2"] + res = tested_object_instance._pval_selection_with_group_col2( + tested_object, groups, 'same group_col or group_col2 values', 'same group_col or group_col2 values' + ) + pd.testing.assert_series_equal(res, expected_ser) + + def test_pval_selection_with_group_col2_pval_to_compute_same_group_col(self): + # pval_to_compute = 'same group_col values' + tested_object = pd.Series({ + ('M - C', 'M - B'): 0.03, + ('F - C', 'F - B'): 0.0014, + ('F - A', 'F - B'): 0.001, + ('M - A', 'M - B'): 0.00067, + ('F - A', 'F - C'): 0.0031, + ('M - A', 'M - C'): 0.89, + }) + tested_object.index.names = ["Group1", "Group2"] + tested_object_instance = ShannonIndex(self.tested_object) + + groups = ['M - C', 'F - C', 'M - A', 'F - A', 'M - B', 'F - B'] + + # Case pval_to_display = 'same group_col values' + pval_to_display = 'same group_col values' + expected_ser = pd.Series({ + ('M - C', 'M - B'): 0.03, + ('F - C', 'F - B'): 0.0014, + ('F - A', 'F - B'): 0.001, + ('M - A', 'M - B'): 0.00067, + ('F - A', 'F 
- C'): 0.0031, + }) + expected_ser.index.names = ["Group1", "Group2"] + res = tested_object_instance._pval_selection_with_group_col2( + tested_object, groups, 'same group_col values', pval_to_display + ) + pd.testing.assert_series_equal(res, expected_ser) + + # Case final_groups defined + groups = ['M - A', 'F - A', 'M - C', 'F - C'] + expected_ser = pd.Series({ + ('F - A', 'F - C'): 0.0031, + }) + expected_ser.index.names = ["Group1", "Group2"] + res = tested_object_instance._pval_selection_with_group_col2( + tested_object, groups, 'same group_col values', pval_to_display + ) + pd.testing.assert_series_equal(res, expected_ser) + + def test_generate_ordered_final_groups(self): + tested_object = pd.DataFrame.from_dict({ + 'sex_Group': {'comp1': 'M - A', 'comp2': 'F - B', 'comp3': 'F - A', 'comp4': 'F - C', 'comp5': 'M - C', + 'comp6': 'M - B', 'comp7': 'M - A'}, + 'sex': {'comp1': 'M', 'comp2': 'F', 'comp3': 'F', 'comp4': 'F', 'comp5': 'M', 'comp6': 'M', 'comp7': 'M'}, + 'Group': {'comp1': 'A', 'comp2': 'B', 'comp3': 'A', 'comp4': 'C', 'comp5': 'C', 'comp6': 'B', 'comp7': 'A'}, + }) + groups_Group = ["C", "A", "B"] + groups_sex = ["M", "F"] + tested_object_instance = ShannonIndex(self.tested_object) + final_groups = tested_object_instance._generate_ordered_final_groups( + tested_object, final_group_col='sex_Group', group_col='Group', group_col2='sex', + groups=groups_Group, groups2=groups_sex + ) + self.assertListEqual(final_groups, ['M - C', 'F - C', 'M - A', 'F - A', 'M - B', 'F - B']) + # now testing other way around + final_groups = tested_object_instance._generate_ordered_final_groups( + tested_object, final_group_col='sex_Group', group_col='sex', group_col2='Group', + groups=groups_sex, groups2=groups_Group + ) + self.assertListEqual(final_groups, ['M - C', 'M - A', 'M - B', 'F - C', 'F - A', 'F - B']) + # testing if order of groups not given + final_groups = tested_object_instance._generate_ordered_final_groups( + tested_object, final_group_col='sex_Group', 
group_col='sex', group_col2='Group', + groups=None, groups2=groups_Group + ) + self.assertListEqual(final_groups, ['F - C', 'F - A', 'F - B', 'M - C', 'M - A', 'M - B']) + # testing if order of groups2 not given + final_groups = tested_object_instance._generate_ordered_final_groups( + tested_object, final_group_col='sex_Group', group_col='sex', group_col2='Group', + groups=groups_sex, groups2=None + ) + self.assertListEqual(final_groups, ['M - A', 'M - B', 'M - C', 'F - A', 'F - B', 'F - C']) + + def test_order_pval_series(self): + tested_object = pd.Series({ + ('M - C', 'M - B'): 0.03, + ('M - A', 'F - A'): 0.5, + ('F - C', 'F - B'): 0.0014, + ('M - B', 'F - B'): 0.2, + ('F - A', 'F - B'): 0.001, + ('M - A', 'M - B'): 0.00067, + ('F - A', 'F - C'): 0.0031, # should be reorganized as ('F - C', 'F - A') + ('M - C', 'F - C'): 0.0003, + ('M - A', 'M - C'): 0.89, # should be reorganized as ('M - C', 'M - A') + }) + tested_object.index.names = ["Group1", "Group2"] + groups = ['M - C', 'F - C', 'M - A', 'F - A', 'M - B', 'F - B'] + tested_object_instance = ShannonIndex(self.tested_object) + + level0 = pd.Categorical( + ['M - C', 'M - C', 'M - C', 'F - C', 'F - C', 'M - A', 'M - A', 'F - A', 'M - B'], + categories=['M - C', 'F - C', 'M - A', 'F - A', 'M - B', 'F - B'], + ordered=True, dtype='category' + ) + level1 = pd.Categorical( + ['F - C', 'M - A', 'M - B', 'F - A', 'F - B', 'F - A', 'M - B', 'F - B', 'F - B'], + categories=['M - C', 'F - C', 'M - A', 'F - A', 'M - B', 'F - B'], + ordered=True, dtype='category' + ) + data = [[0.0003, 0.89, 0.03, 0.0031, 0.0014, 0.5, 0.00067, 0.001, 0.2]] + expected_ser = pd.DataFrame( + data=data, + columns=pd.MultiIndex.from_arrays([level0, level1]) + ).T[0] + expected_ser.index.names = ["Group1", "Group2"] + + pd.testing.assert_series_equal( + tested_object_instance._order_pval_series(tested_object, groups), + expected_ser + ) + class TestSimpsonInverseIndex(TestCase): diff --git a/tests/plot/graphs/test_boxgraph.py 
b/tests/plot/graphs/test_boxgraph.py index f5aee35..43ffb13 100644 --- a/tests/plot/graphs/test_boxgraph.py +++ b/tests/plot/graphs/test_boxgraph.py @@ -125,8 +125,8 @@ def test_with_group_col2_without_groups(self): def test_with_group_col2_with_groups_and_groups2(self): tested_df = pd.DataFrame( [ - [1.0, "M", "A"], - [3.0, "F", "B"], + [1.0, "F", "B"], + [3.0, "F", "C"], [9.0, "M", "A"], [6.0, "M", "B"], [2.0, "F", "A"], @@ -135,7 +135,7 @@ def test_with_group_col2_with_groups_and_groups2(self): [6.0, "M", "B"], [8.0, "M", "C"], [5.0, "F", "C"], - [7.0, "M", "C"], + [7.0, "M", "A"], ], index=[ "sample1", "sample2", "sample3", "sample4", "sample5", @@ -144,12 +144,12 @@ def test_with_group_col2_with_groups_and_groups2(self): ], columns=["data", "sex", "group"], ) - groups = ["F", "M"] # change order + groups = ["M", "F"] # change order groups2 = ["A", "B"] # don't show group "C" (+ dictate order) - expected_x_gpA = ['F', 'F', 'M', 'M'] - expected_y_gpA = [2.0, 4.0, 1.0, 9.0] - expected_x_gpB = ['F', 'M', 'M', 'M'] - expected_y_gpB = [3.0, 6.0, 2.0, 6.0] + expected_x_gpA = ['M', 'M', 'F', 'F'] + expected_y_gpA = [9.0, 7.0, 2.0, 4.0] + expected_x_gpB = ['M', 'M', 'M', 'F'] + expected_y_gpB = [6.0, 2.0, 6.0, 1.0] plot = GroupBoxGraph(tested_df) tested_graph = plot.plot_one_graph( data_col="data", group_col="sex", group_col2="group",