From b99179afe86bec48c5b258f50cd197f63bdc0e72 Mon Sep 17 00:00:00 2001 From: Agnes BAUD Date: Tue, 4 Jul 2023 16:06:24 +0200 Subject: [PATCH 1/8] Draft pvalue display in graph --- moonstone/analysis/diversity/base.py | 313 ++++++++++++++++++++++++++- moonstone/plot/graphs/base.py | 9 - 2 files changed, 306 insertions(+), 16 deletions(-) diff --git a/moonstone/analysis/diversity/base.py b/moonstone/analysis/diversity/base.py index 3243194..a047001 100644 --- a/moonstone/analysis/diversity/base.py +++ b/moonstone/analysis/diversity/base.py @@ -1,11 +1,11 @@ import logging import re from abc import ABC, abstractmethod -import skbio +#import skbio from string import capwords from typing import Union -import numpy as np +#import numpy as np import pandas as pd from statsmodels.stats.multitest import multipletests @@ -16,6 +16,8 @@ from moonstone.plot.graphs.heatmap import HeatmapGraph from moonstone.plot.graphs.histogram import Histogram from moonstone.plot.graphs.violin import GroupViolinGraph, ViolinGraph +from moonstone.utils.dict_operations import merge_dict + logger = logging.getLogger(__name__) @@ -219,14 +221,28 @@ def _visualize_pvalue_matrix(self, pval: pd.DataFrame, output_pval_file: str): output_file=output_pval_file ) - def _valid_pval_param(self, pval_to_compute): + def _valid_pval_param(self, pval_to_compute, pval_to_display): choices = [ "all", "same group_col or group_col2 values", "same group_col values", None ] + dicpval = {} + for i in range(len(choices)): + dicpval[choices[i]]=i + if pval_to_compute not in choices: logger.warning("pval_to_compute='%s' not valid, set to default (all).", pval_to_compute) pval_to_compute = "all" - return pval_to_compute + + if pval_to_display not in choices: + logger.warning("pval_to_display='%s' not valid, set to default (None).", pval_to_display) + pval_to_display = None + elif dicpval[pval_to_display] if first part is the same = same group_col value + # -> if second part is the same = same group_col2 value + pval_series = pval_series[ + ( + pval_series.index.get_level_values(0).map(lambda x: x.split(" - ")[0]) == pval_series.index.get_level_values(1).map(lambda x: x.split(" - ")[0]) + ) | ( + pval_series.index.get_level_values(0).map(lambda x: x.split(" - ")[1]) == pval_series.index.get_level_values(1).map(lambda x: x.split(" - ")[1]) + )] + + return pval_series + + def _order_pval_series( + self, pval_series, groups, dic_gps + ): + # Reminder: pvalue series is a MultiIndex series -> 2 level of index are the 2 groups compared + # to order p-value series in a specific order dictated in dic_gps + # example: dic_gps = {"Group A": 0, "Group B": 1, "Group C": 3} + names = pval_series.index.names # default: ["Group1", "Group2"] + + # first we order p-value series index name to have the group that should come first as first member + pval_series = pval_series.reset_index() + for i in pval_series.index: + level0 = pval_series.loc[i][names[0]] + level1 = pval_series.loc[i][names[1]] + if dic_gps[level0] > dic_gps[level1]: + pval_series.loc[i, names[0]] = level1 # invert to have the Group that should be put first as first + pval_series.loc[i, names[1]] = level0 # in example: "Group B - Group A" becomes "Group A - Group B" + pval_series[names[0]] = pval_series[names[0]].astype("category") + pval_series[names[0]].cat.set_categories(groups, inplace=True) + pval_series[names[1]] = pval_series[names[1]].astype("category") + pval_series[names[1]].cat.set_categories(groups, inplace=True) + pval_series = pval_series.sort_values([names[0], names[1]]) # we sort by 1st member, and then 2nd member + pval_series = pval_series.set_index([names[0], names[1]]) + return pval_series[0] + + def _generate_shapes_annotations_lists( + self, pval_series, groups, hgt_min + ): + """ + To generate annotations to represent significant pvalues. Methods for group_col only (not group_col2) + + Args: + pval_series: pd.Series of the pvalue to put on the graph (need to be filtered beforehand so that it only contains significant pvalues) + """ + # Overview of this method: We're trying to generate the shapes (1 bracket = 3 lines = + # 1 going from Group1 to Group2 and 2 small lines to form the edge of the bracket) + # and the annotations. + # To ensure that the brackets don't overlap, we create a table/list named `level` + # with a number of columns equal to the number of groups and an expendable number of rows + # When we fill the list, we replace 0 to 1 meaning that there is now a bracket at this + # location and that we can't add another bracket on top of it, and that another level (row) + # need to be added to `level` + dic_gps = {} + for i in range(len(groups)): + dic_gps[groups[i]]=i + + # the pvalues need to be ordered so that looking at the cell corresponding to the left edge + # of the bracket that needs to be added is enough to determine if there is already a + # bracket there. + pval_series = self._order_pval_series(pval_series, groups, dic_gps) + + det = (hgt_min/15) + + hgt_min += det/2 + fontsize = int(12+det) + linewidth = 0.5+0.15*det + + level=[[0] * (len(groups))] + list_shapes = [] + list_annotations = [] + + for ind, val in pval_series.items(): + y = 0 + + left_ind = dic_gps[ind[0]] # -> could be directly ind for shapes but /!\ not for the annotations + right_ind = dic_gps[ind[1]] + + for i in range(len(level)): + if level[i][left_ind] == 0: + level[i][left_ind:right_ind+1]=[1]*(right_ind-left_ind+1) + list_shapes += [ + {'x0':left_ind, 'y0':hgt_min+(i*det/2), + 'x1':right_ind, 'y1':hgt_min+(i*det/2), 'line':dict(width=linewidth)}, # from Group1 to Group2 + {'x0':left_ind, 'y0':hgt_min+(i*det/2)-0.15*det, + 'x1':left_ind, 'y1':hgt_min+(i*det/2), 'line':dict(width=linewidth)}, # left edge of the bracket + {'x0':right_ind, 'y0':hgt_min+(i*det/2)-0.15*det, + 'x1':right_ind, 'y1':hgt_min+(i*det/2), 'line':dict(width=linewidth)} # right edge of the bracket + ] + if val < 0.01: + list_annotations += [{'text':'**', "font":dict(size=fontsize), + 'x':(left_ind+right_ind)/2, + 'y':hgt_min+(i*det/2)+0.15*det, 'showarrow':False}] + else: + list_annotations += [{'text':'*', "font":dict(size=fontsize), + 'x':(left_ind+right_ind)/2, + 'y':hgt_min+(i*det/2)+0.15*det, 'showarrow':False}] + y = 1 + break + if y == 0: + # we need to add another level + i += 1 + level += [[0] * (len(groups))] + level[i][left_ind:right_ind+1]=[1]*(right_ind-left_ind+1) # or ind + list_shapes += [ + {'x0':left_ind, 'y0':hgt_min+(i*det/2), + 'x1':right_ind, 'y1':hgt_min+(i*det/2), 'line':dict(width=linewidth)}, + {'x0':left_ind, 'y0':hgt_min+(i*det/2)-0.15*det, + 'x1':left_ind, 'y1':hgt_min+(i*det/2), 'line':dict(width=linewidth)}, + {'x0':right_ind, 'y0':hgt_min+(i*det/2)-0.15*det, + 'x1':right_ind, 'y1':hgt_min+(i*det/2), 'line':dict(width=linewidth)}] + if val < 0.01: + list_annotations += [{'text':'**', "font":dict(size=fontsize), + 'x':(left_ind+right_ind)/2, + 'y':hgt_min+(i*det/2)+0.15*det, 'showarrow':False}] + else: + list_annotations += [{'text':'*',"font":dict(size=fontsize), + 'x':(left_ind+right_ind)/2, + 'y':hgt_min+(i*det/2)+0.15*det, 'showarrow':False}] + + return list_shapes, list_annotations, len(level) + + """ + def _generate_shapes_annotations_lists_supergroup( + self, pval_series, groups, supergroups, hgt_min + ): + """ + #:param supergroups dictionary with supergroup edges + """ + dic_gps = {} + for i in range(len(groups)): + dic_gps[groups[i]]=i + + + list_shapes = [] + dic_middle = {} + supergroups_to_display = set(list(pval_series.index.get_level_values(0)) + list(pval_series.index.get_level_values(1))) + for i in supergroups_to_display: + if isinstance(supergroups[i], list): + list_shapes += [{'x0':supergroups[i][0], 'y0':hgt_min, 'x1':supergroups[i][1], 'y1':hgt_min, 'line':dict(width=1)}] + dic_middle[i] = (dic_gps[supergroups[i][0]] + dic_gps[supergroups[i][1]])/2 + else: + dic_middle[i] = dic_gps[supergroups[i]] + + dic_supergps = {} + i = 0 + for k, v in sorted(dic_middle.items(), key=lambda item: item[1]): + dic_supergps[k] = i + i+=1 + + level=[[0] * (len(dic_supergps))] + list_annotations = [] + for ind, val in pval_series.items(): + y = 0 + for i in range(len(level)): + if dic_middle[ind[0]] > dic_middle[ind[1]]: + ind = (ind[1], ind[0]) + if level[i][dic_supergps[ind[0]]] == 0: + level[i][dic_supergps[ind[0]]:dic_supergps[ind[1]]+1]=[1]*(dic_supergps[ind[1]]-dic_supergps[ind[0]]+1) # or ind + list_shapes += [ + {'x0':dic_middle[ind[0]], 'y0':hgt_min+i+0.5, + 'x1':dic_middle[ind[1]], 'y1':hgt_min+i+0.5, 'line':dict(width=1)}, + {'x0':dic_middle[ind[0]], 'x1':dic_middle[ind[0]], 'y1':hgt_min+i+0.5, + #'y0':hgt_min, 'line':dict(width=1, dash="dot")}, + 'y0':hgt_min+i+0.35, 'line':dict(width=1)}, + {'x0':dic_middle[ind[1]], 'x1':dic_middle[ind[1]], 'y1':hgt_min+i+0.5, + #'y0':hgt_min, 'line':dict(width=1, dash="dot")} + 'y0':hgt_min+i+0.35, 'line':dict(width=1)} + ] + if val < 0.01: + list_annotations += [{'text':'**','x':(dic_middle[ind[0]]+dic_middle[ind[1]])/2, + 'y':hgt_min+i+0.65, 'showarrow':False}] + else: + list_annotations += [{'text':'*','x':(dic_middle[ind[0]]+dic_middle[ind[1]])/2, + 'y':hgt_min+i+0.65, 'showarrow':False}] + y = 1 + break + if y == 0: + i += 1 + level += [[0] * (len(dic_supergps))] + level[i][dic_supergps[ind[0]]:dic_supergps[ind[1]]+1]=[1]*(dic_supergps[ind[1]]-dic_supergps[ind[0]]+1) # or ind + list_shapes += [ + {'x0':dic_middle[ind[0]], 'y0':hgt_min+i+0.5, + 'x1':dic_middle[ind[1]], 'y1':hgt_min+i+0.5, 'line':dict(width=1)}, + {'x0':dic_middle[ind[0]], 'x1':dic_middle[ind[0]], 'y1':hgt_min+i+0.5, + #'y0':hgt_min, 'line':dict(width=1, dash="dot")}, + 'y0':hgt_min+i+0.35, 'line':dict(width=1)}, + {'x0':dic_middle[ind[1]], 'x1':dic_middle[ind[1]], 'y1':hgt_min+i+0.5, + #'y0':hgt_min, 'line':dict(width=1, dash="dot")} + 'y0':hgt_min+i+0.35, 'line':dict(width=1)} + ] + if val < 0.01: + list_annotations += [{'text':'**','x':(dic_middle[ind[0]]+dic_middle[ind[1]])/2, + 'y':hgt_min+i+0.65, 'showarrow':False}] + else: + list_annotations += [{'text':'*','x':(dic_middle[ind[0]]+dic_middle[ind[1]])/2, + 'y':hgt_min+i+0.65, 'showarrow':False}] + + return list_shapes, list_annotations, len(level) + """ + # for now, method above and below in different methods + #def _generate_dic_shapes_and_annotations( + # self, pval_series, dic_lev, hgt_min + #): + # list_shapes={} + # dic_annotations={} + # for ind, val in pval_series.items(): + def _compute_pval_inside_subgroups( self, diversity_index_dataframe: pd.DataFrame, group_col: str, final_group_col: str, stats_test: str, correction_method: str, structure_pval: str, sym: bool @@ -264,7 +517,7 @@ def analyse_groups( plotting_options: dict = None, stats_test: str = 'mann_whitney_u', correction_method: str = None, structure_pval: str = 'dataframe', sym: bool = True, - pval_to_compute: bool = 'all', + pval_to_compute: bool = 'all', pval_to_display: str = None, show_pval: bool = True, output_pval_file: str = False, **kwargs ) -> dict: @@ -277,7 +530,7 @@ def analyse_groups( :param groups: specifically select groups to display among group_col :param groups2: specifically select groups to display among group_col2 :param show: also visualize - :param show_pval: visualize p-values + :param show_pval: visualize p-values's heatmap :param output_file: file path to output your html graph :param make_graph: whether or not to make the graph :param plotting_options: plotly plotting_options @@ -290,10 +543,14 @@ def analyse_groups( :param pval_to_compute: if group_col2 used, problems of memory or in maximum recursion depth may occur. In this case, you may want to compute only p-values of specific comparisons. {"all" (default), None, "same group_col values", "same group_col or group_col2 values"} + :param pval_to_display: whether you want the significant pvalues displayed on the graph ("all") or not (None) + When group_col2 is used you may want to specify which type of comparisons you want to display the + significant pvalues of. Otherwise the graph can appear crowded by pvalues lines. + {None (default), "all", "same group_col values", "same group_col or group_col2 values"} """ filtered_metadata_df = self._get_filtered_df_from_metadata(metadata_df) - pval_to_compute = self._valid_pval_param(pval_to_compute) + pval_to_compute, pval_to_display = self._valid_pval_param(pval_to_compute, pval_to_display) correction_method = self._valid_correction_method_param(correction_method) if group_col2: @@ -307,6 +564,19 @@ def analyse_groups( ) df = self._get_grouped_df(filtered_metadata_df[[group_col, group_col2, final_group_col]]) + #if pval_to_display: + if 1 == 1: + # listing and sorting all final_groups possibles respecting the order given by + # groups first and then by groups2 + t = df.drop_duplicates(subset=[final_group_col]) + t[group_col] = t[group_col].astype("category") + t[group_col].cat.set_categories(groups, inplace=True) + t[group_col2] = t[group_col2].astype("category") + t[group_col2].cat.set_categories(groups2, inplace=True) + t.dropna(how="any", subset=[group_col, group_col2], inplace=True) + t = t.sort_values([group_col, group_col2]) + final_groups = list(t[final_group_col]) + if pval_to_compute == "all": pval = self._run_statistical_test_groups( df, final_group_col, stats_test, correction_method, structure_pval, sym @@ -324,6 +594,9 @@ def analyse_groups( ) ) + if pval_to_display: + to_display = self._pval_selection_with_group_col2(pval, final_groups, pval_to_compute, pval_to_display) + else: df = self._get_grouped_df(filtered_metadata_df[group_col]) pval = self._run_statistical_test_groups( @@ -331,6 +604,32 @@ def analyse_groups( ) # pval is in the right structure to be returned + final_groups = groups # to remove from here later on + + if pval_to_display: + final_groups = groups + to_display = self._pval_selection(pval, groups) + + if pval_to_display and to_display.empty: # nothing to display + pval_to_display = None + + if pval_to_display: + hgt_min = df[self.DIVERSITY_INDEXES_NAME].max() #+ 0.5 + list_shapes, list_annotations, nb_lev = self._generate_shapes_annotations_lists( + to_display, final_groups, hgt_min + ) + if not plotting_options: + plotting_options = {} + det = (hgt_min/15) + hgt_min += det/2 + nblevels=to_display.shape[0] + plotting_options = merge_dict(plotting_options, { + 'layout': { + 'shapes': list_shapes, # we should had to previous list if there is one + 'annotations': list_annotations, # idem + } + }) + self.last_grouped_df = df self.report_data['analyse_groups'] = { 'pval': pval, diff --git a/moonstone/plot/graphs/base.py b/moonstone/plot/graphs/base.py index 20cedd3..7b79ca3 100644 --- a/moonstone/plot/graphs/base.py +++ b/moonstone/plot/graphs/base.py @@ -17,18 +17,9 @@ class BaseGraph(ABC): def __init__( self, data: Union[pd.Series, pd.DataFrame], - plotting_options: dict = None, - show: bool = True, - output_file: Union[bool, str] = False, ): """ :param data: data to plot - :param show: set to False if you don't want to show the plot - :param output_file: name of the output file - :param plotting_options: options of plotting that will override the default setup \n - [!] Make sure the value given to an argument is of the right type \n - options allowed : 'log': `bool` ; 'colorbar': `[str, List[str]]` ; - 'tickangle': `[int, float]` """ self.data = data From 36805c4507bd3de51046447bd107541729c5f2bf Mon Sep 17 00:00:00 2001 From: Agnes BAUD Date: Mon, 9 Oct 2023 16:39:47 +0200 Subject: [PATCH 2/8] add imports --- .github/workflows/python-package.yml | 14 +++++++------- moonstone/analysis/diversity/base.py | 4 ++-- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 9966157..4e52bcb 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -22,13 +22,13 @@ jobs: python-version: ${{ matrix.python-version }} - name: Install dependencies run: | - python -m pip3 install --upgrade pip3 - pip3 install flake8 pytest - pip3 install numpy==1.18.1 - pip3 install . - pip3 install odfpy # optional dependencies - pip3 install openpyxl # idem - pip3 install xlrd # idem + python -m pip install --upgrade pip + pip install flake8 pytest + pip install numpy==1.18.1 + pip install . + pip install odfpy # optional dependencies + pip install openpyxl # idem + pip install xlrd # idem - name: Lint with flake8 run: | # stop the build if there are Python syntax errors or undefined names diff --git a/moonstone/analysis/diversity/base.py b/moonstone/analysis/diversity/base.py index a047001..2b2af7f 100644 --- a/moonstone/analysis/diversity/base.py +++ b/moonstone/analysis/diversity/base.py @@ -1,11 +1,11 @@ import logging import re from abc import ABC, abstractmethod -#import skbio +import skbio from string import capwords from typing import Union -#import numpy as np +import numpy as np import pandas as pd from statsmodels.stats.multitest import multipletests From a9a5675bd7ca0d8bccbb2ddd70af88147564bdc6 Mon Sep 17 00:00:00 2001 From: Agnes BAUD Date: Tue, 24 Oct 2023 15:04:37 +0200 Subject: [PATCH 3/8] for python 3.9 and other python lib upgrade --- moonstone/analysis/diversity/base.py | 36 +++++++---- moonstone/plot/counts.py | 4 +- tests/analysis/diversity/test_alpha.py | 83 +++++++++++++++++++++++++- 3 files changed, 108 insertions(+), 15 deletions(-) diff --git a/moonstone/analysis/diversity/base.py b/moonstone/analysis/diversity/base.py index f36f3da..c318b88 100644 --- a/moonstone/analysis/diversity/base.py +++ b/moonstone/analysis/diversity/base.py @@ -294,13 +294,12 @@ def _pval_selection_with_group_col2( return pval_series def _order_pval_series( - self, pval_series, groups, dic_gps + self, pval_series: pd.Series, groups: list, dic_gps: dict ): # Reminder: pvalue series is a MultiIndex series -> 2 level of index are the 2 groups compared # to order p-value series in a specific order dictated in dic_gps # example: dic_gps = {"Group A": 0, "Group B": 1, "Group C": 3} names = pval_series.index.names # default: ["Group1", "Group2"] - # first we order p-value series index name to have the group that should come first as first member pval_series = pval_series.reset_index() for i in pval_series.index: @@ -310,13 +309,31 @@ def _order_pval_series( pval_series.loc[i, names[0]] = level1 # invert to have the Group that should be put first as first pval_series.loc[i, names[1]] = level0 # in example: "Group B - Group A" becomes "Group A - Group B" pval_series[names[0]] = pval_series[names[0]].astype("category") - pval_series[names[0]].cat.set_categories(groups, inplace=True) + pval_series[names[0]] = pval_series[names[0]].cat.set_categories(groups, ordered=True) pval_series[names[1]] = pval_series[names[1]].astype("category") - pval_series[names[1]].cat.set_categories(groups, inplace=True) + pval_series[names[1]] = pval_series[names[1]].cat.set_categories(groups, ordered=True) pval_series = pval_series.sort_values([names[0], names[1]]) # we sort by 1st member, and then 2nd member pval_series = pval_series.set_index([names[0], names[1]]) return pval_series[0] + def _generate_ordered_final_groups( + self, df: pd.DataFrame, final_group_col: str, group_col: str, group_col2: str, + groups: list, groups2: list + ): + # listing and sorting all final_groups possibles respecting the order given by + # groups first and then by groups2 + # NB: At least one of groups or groups2 need to not be None + t = df.drop_duplicates(subset=[final_group_col]).copy() # copy() to avoid raising SettingWithCopyWarning + if groups: + t[group_col] = t[group_col].astype("category") + t[group_col] = t[group_col].cat.set_categories(groups, ordered=True) + if groups2: + t[group_col2] = t[group_col2].astype("category") + t[group_col2] = t[group_col2].cat.set_categories(groups2, ordered=True) + t = t.dropna(how="any", subset=[group_col, group_col2]) + t = t.sort_values([group_col, group_col2]) + return list(t[final_group_col]) + def _generate_shapes_annotations_lists( self, pval_series, groups, hgt_min ): @@ -567,16 +584,15 @@ def analyse_groups( ) df = self._get_grouped_df(filtered_metadata_df[[group_col, group_col2, final_group_col]]) - #if pval_to_display: - if 1 == 1: + if pval_to_display and (groups or groups2): # listing and sorting all final_groups possibles respecting the order given by # groups first and then by groups2 t = df.drop_duplicates(subset=[final_group_col]) t[group_col] = t[group_col].astype("category") - t[group_col].cat.set_categories(groups, inplace=True) + t[group_col].cat = t[group_col].cat.set_categories(groups) t[group_col2] = t[group_col2].astype("category") - t[group_col2].cat.set_categories(groups2, inplace=True) - t.dropna(how="any", subset=[group_col, group_col2], inplace=True) + t[group_col].cat = t[group_col2].cat.set_categories(groups2) + t = t.dropna(how="any", subset=[group_col, group_col2]) t = t.sort_values([group_col, group_col2]) final_groups = list(t[final_group_col]) @@ -626,7 +642,7 @@ def analyse_groups( plotting_options = {} det = (hgt_min/15) hgt_min += det/2 - nblevels=to_display.shape[0] + #nblevels=to_display.shape[0] plotting_options = merge_dict(plotting_options, { 'layout': { 'shapes': list_shapes, # we should had to previous list if there is one diff --git a/moonstone/plot/counts.py b/moonstone/plot/counts.py index c31b364..74b753b 100644 --- a/moonstone/plot/counts.py +++ b/moonstone/plot/counts.py @@ -70,8 +70,8 @@ def plot_mean_distribution( mean_series = self.df.mean(axis=1) binned_mean = SeriesBinning(mean_series).binned_data - bar_fig = BarGraph(binned_mean, plotting_options, show=show, output_file=output_file) - bar_fig.plot_one_graph(plotting_options, show=show, output_file=output_file) + bar_fig = BarGraph(binned_mean) + bar_fig.plot_one_graph(plotting_options=plotting_options, show=show, output_file=output_file) class PlotTaxonomyCounts: diff --git a/tests/analysis/diversity/test_alpha.py b/tests/analysis/diversity/test_alpha.py index 0eb1ed4..57696ad 100644 --- a/tests/analysis/diversity/test_alpha.py +++ b/tests/analysis/diversity/test_alpha.py @@ -107,10 +107,12 @@ def test_invalid_correction_method_param(self): def test_invalid_pval_param(self): tested_object_instance = ShannonIndex(self.tested_object) with self.assertLogs('moonstone.analysis.diversity.base', level='WARNING') as log: - tested_object_instance._valid_pval_param("lalala") - self.assertEqual(len(log.output), 1) + tested_object_instance._valid_pval_param("lalala", "lilili") + self.assertEqual(len(log.output), 2) self.assertIn("WARNING:moonstone.analysis.diversity.base:pval_to_compute='lalala' not valid, \ set to default (all).", log.output) + self.assertIn("WARNING:moonstone.analysis.diversity.base:pval_to_display='lilili' not valid, \ +set to default (None).", log.output) def test_analyse_groups_pval_to_compute_all(self): tested_object_instance = ShannonIndex(self.tested_object) @@ -199,6 +201,81 @@ def test_analyse_groups_pval_to_compute_same_group_col_or_group_col2_values(self ) pd.testing.assert_series_equal(output['pval'], expected_ser, check_dtype=False) + def test_generate_ordered_final_groups(self): + tested_object = pd.DataFrame.from_dict({ + 'sex_Group': {'comp1': 'M - A', 'comp2': 'F - B', 'comp3': 'F - A', 'comp4': 'F - C', 'comp5': 'M - C', 'comp6': 'M - B', 'comp7': 'M - A'}, + 'sex': {'comp1': 'M', 'comp2': 'F', 'comp3': 'F', 'comp4': 'F', 'comp5': 'M', 'comp6': 'M', 'comp7': 'M'}, + 'Group': {'comp1': 'A', 'comp2': 'B', 'comp3': 'A', 'comp4': 'C', 'comp5': 'C', 'comp6': 'B', 'comp7': 'A'}, + }) + groups_Group=["C", "A", "B"] + groups_sex=["M", "F"] + tested_object_instance = ShannonIndex(self.tested_object) + final_groups = tested_object_instance._generate_ordered_final_groups( + tested_object, final_group_col='sex_Group', group_col='Group', group_col2='sex', + groups=groups_Group, groups2=groups_sex + ) + self.assertListEqual(final_groups, ['M - C', 'F - C', 'M - A', 'F - A', 'M - B', 'F - B']) + # now testing other way around + final_groups = tested_object_instance._generate_ordered_final_groups( + tested_object, final_group_col='sex_Group', group_col='sex', group_col2='Group', + groups=groups_sex, groups2=groups_Group + ) + self.assertListEqual(final_groups, ['M - C', 'M - A', 'M - B', 'F - C', 'F - A', 'F - B']) + # testing if order of groups not given + final_groups = tested_object_instance._generate_ordered_final_groups( + tested_object, final_group_col='sex_Group', group_col='sex', group_col2='Group', + groups=None, groups2=groups_Group + ) + self.assertListEqual(final_groups, ['F - C', 'F - A', 'F - B', 'M - C', 'M - A', 'M - B']) + # testing if order of groups2 not given + final_groups = tested_object_instance._generate_ordered_final_groups( + tested_object, final_group_col='sex_Group', group_col='sex', group_col2='Group', + groups=groups_sex, groups2=None + ) + self.assertListEqual(final_groups, ['M - A', 'M - B', 'M - C', 'F - A', 'F - B', 'F - C']) + + def test_order_pval_series(self): + tested_object = pd.Series({ + ('M - C', 'M - B'): 0.03, + ('M - A', 'F - A'): 0.5, + ('F - C', 'F - B'): 0.0014, + ('M - B', 'F - B'): 0.2, + ('F - A', 'F - B'): 0.001, + ('M - A', 'M - B'): 0.00067, + ('F - A', 'F - C'): 0.0031, + ('M - C', 'F - C'): 0.0003, + ('M - A', 'M - C'): 0.89, + }) + tested_object.index.names = ["Group1", "Group2"] + groups = ['M - C', 'F - C', 'M - A', 'F - A', 'M - B', 'F - B'] + tested_object_instance = ShannonIndex(self.tested_object) + + level0 = pd.Categorical( + ['M - C', 'M - C', 'M - C', 'F - C', 'F - C', 'M - A', 'M - A', 'F - A', 'M - B'], + categories=['M - C', 'F - C', 'M - A', 'F - A', 'M - B', 'F - B'], + ordered=True, dtype='category' + ) + level1 = pd.Categorical( + ['F - C', 'M - A', 'M - B', 'F - A', 'F - B', 'F - A', 'M - B', 'F - B', 'F - B'], + categories=['M - C', 'F - C', 'M - A', 'F - A', 'M - B', 'F - B'], + ordered=True, dtype='category' + ) + data = [[0.0003, 0.89, 0.03, 0.0031, 0.0014, 0.5, 0.00067, 0.001, 0.2]] + expected_ser = pd.DataFrame( + data=data, + columns=pd.MultiIndex.from_arrays([level0, level1]), + + ).T[0] + expected_ser.index.names = ["Group1", "Group2"] + + pd.testing.assert_series_equal( + tested_object_instance._order_pval_series( + tested_object, groups, + {'M - C': 0, 'F - C': 1, 'M - A': 2, 'F - A': 3, 'M - B': 4, 'F - B': 5} + ), + expected_ser + ) + class TestSimpsonInverseIndex(TestCase): @@ -352,4 +429,4 @@ def test_visualize(self): tree = TreeNode.read(StringIO( u'(((species1:0.25,species2:0.25):0.75,species3:1.0):0.5,(species4:0.5,species5:0.5):1.0)root;')) tested_object_instance = FaithsPhylogeneticDiversity(tested_object, tree) - tested_object_instance.visualize(show=False) + tested_object_instance.visualize(show=False) \ No newline at end of file From a150de868b6bf3620d93969285c6ad3e3bc941a1 Mon Sep 17 00:00:00 2001 From: Agnes BAUD Date: Mon, 30 Oct 2023 15:47:13 +0100 Subject: [PATCH 4/8] fix groups order --- moonstone/plot/graphs/base.py | 2 +- tests/plot/graphs/test_boxgraph.py | 16 ++++++++-------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/moonstone/plot/graphs/base.py b/moonstone/plot/graphs/base.py index 4a78d8b..df715ac 100644 --- a/moonstone/plot/graphs/base.py +++ b/moonstone/plot/graphs/base.py @@ -227,7 +227,7 @@ def plot_one_graph( if groups: filtered_df = self.data[self.data[group_col].isin(groups)] filtered_df[group_col] = filtered_df[group_col].astype("category") - filtered_df[group_col].cat = filtered_df[group_col].cat.set_categories(groups) + filtered_df[group_col] = filtered_df[group_col].cat.set_categories(groups, ordered=True) filtered_df = filtered_df.sort_values([group_col]) else: filtered_df = copy.deepcopy(self.data) diff --git a/tests/plot/graphs/test_boxgraph.py b/tests/plot/graphs/test_boxgraph.py index f5aee35..43ffb13 100644 --- a/tests/plot/graphs/test_boxgraph.py +++ b/tests/plot/graphs/test_boxgraph.py @@ -125,8 +125,8 @@ def test_with_group_col2_without_groups(self): def test_with_group_col2_with_groups_and_groups2(self): tested_df = pd.DataFrame( [ - [1.0, "M", "A"], - [3.0, "F", "B"], + [1.0, "F", "B"], + [3.0, "F", "C"], [9.0, "M", "A"], [6.0, "M", "B"], [2.0, "F", "A"], @@ -135,7 +135,7 @@ def test_with_group_col2_with_groups_and_groups2(self): [6.0, "M", "B"], [8.0, "M", "C"], [5.0, "F", "C"], - [7.0, "M", "C"], + [7.0, "M", "A"], ], index=[ "sample1", "sample2", "sample3", "sample4", "sample5", @@ -144,12 +144,12 @@ def test_with_group_col2_with_groups_and_groups2(self): ], columns=["data", "sex", "group"], ) - groups = ["F", "M"] # change order + groups = ["M", "F"] # change order groups2 = ["A", "B"] # don't show group "C" (+ dictate order) - expected_x_gpA = ['F', 'F', 'M', 'M'] - expected_y_gpA = [2.0, 4.0, 1.0, 9.0] - expected_x_gpB = ['F', 'M', 'M', 'M'] - expected_y_gpB = [3.0, 6.0, 2.0, 6.0] + expected_x_gpA = ['M', 'M', 'F', 'F'] + expected_y_gpA = [9.0, 7.0, 2.0, 4.0] + expected_x_gpB = ['M', 'M', 'M', 'F'] + expected_y_gpB = [6.0, 2.0, 6.0, 1.0] plot = GroupBoxGraph(tested_df) tested_graph = plot.plot_one_graph( data_col="data", group_col="sex", group_col2="group", From 5627831265fc9cd49cbc91b1b7132e2237c706fd Mon Sep 17 00:00:00 2001 From: Agnes BAUD Date: Fri, 10 Nov 2023 16:39:28 +0100 Subject: [PATCH 5/8] Try with python 3.10 --- .github/workflows/python-package.yml | 2 +- setup.py | 24 ++++++++++++------------ 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 3a5546a..8612603 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -12,7 +12,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: ["3.8", "3.9"] + python-version: ["3.8", "3.9", "3.10"] steps: - uses: actions/checkout@v2 diff --git a/setup.py b/setup.py index 93e62f2..72b773f 100644 --- a/setup.py +++ b/setup.py @@ -9,18 +9,18 @@ author='Kenzo-Hugo Hillion, Agnès Baud, Mariela Furstenheim, Sean Kennedy', author_email='kehillio@pasteur.fr', install_requires=[ - 'pandas==2.0.2', - 'matplotlib==3.3.0', - 'plotly==5.17.0', - 'statsmodels==0.14.0', - 'python-slugify==4.0.1', - 'pyaml==20.4.0', - 'numpy==1.24.3', - 'scikit-bio==0.5.9', - 'scikit-learn==1.3.1', - 'hdmedians==0.14.2', - 'cython==0.29.21', - 'scipy==1.9.0' + 'pandas>=2.0.2', + 'matplotlib>=3.3.0', + 'plotly>=5.17.0', + 'statsmodels>=0.14.0', + 'python-slugify>=4.0.1', + 'pyaml>=20.4.0', + 'numpy>=1.24.3', + 'scikit-bio>=0.5.9', + 'scikit-learn>=1.3.1', + 'hdmedians>=0.14.2', + 'cython>=0.29.21', + 'scipy>=1.9.0' ], packages=find_packages(), entry_points={'console_scripts': ['moonstone=moonstone.main:run']}, From 3d79f7800d5eb5723d415abe22e1fa53d01794dd Mon Sep 17 00:00:00 2001 From: Agnes BAUD Date: Mon, 13 Nov 2023 17:01:28 +0100 Subject: [PATCH 6/8] tests pval_selection --- moonstone/analysis/differential_analysis.py | 2 +- moonstone/analysis/diversity/base.py | 186 +++++++------- setup.py | 2 +- tests/analysis/diversity/test_alpha.py | 258 +++++++++++++++++++- 4 files changed, 346 insertions(+), 102 deletions(-) diff --git a/moonstone/analysis/differential_analysis.py b/moonstone/analysis/differential_analysis.py index 9387200..66ea936 100644 --- a/moonstone/analysis/differential_analysis.py +++ b/moonstone/analysis/differential_analysis.py @@ -78,7 +78,7 @@ def test_multiple_features(self, feature, test_to_use): list_ofgroups = [] for variable in variable_dic: list_ofgroups.append(variable_dic[variable][self.full_table.columns[family]]) - #test = self.tests_functions_used[test_to_use](*np.asarray(list_ofgroups)) + # test = self.tests_functions_used[test_to_use](*np.asarray(list_ofgroups)) test = self.tests_functions_used[test_to_use](*list_ofgroups) # works for kruskal and one way anova features.append(feature) taxons.append(self.full_table.columns[family]) diff --git a/moonstone/analysis/diversity/base.py b/moonstone/analysis/diversity/base.py index c318b88..20b4147 100644 --- a/moonstone/analysis/diversity/base.py +++ b/moonstone/analysis/diversity/base.py @@ -227,8 +227,8 @@ def _valid_pval_param(self, pval_to_compute, pval_to_display): ] dicpval = {} for i in range(len(choices)): - dicpval[choices[i]]=i - + dicpval[choices[i]] = i + if pval_to_compute not in choices: logger.warning("pval_to_compute='%s' not valid, set to default (all).", pval_to_compute) pval_to_compute = "all" @@ -236,12 +236,12 @@ def _valid_pval_param(self, pval_to_compute, pval_to_display): if pval_to_display not in choices: logger.warning("pval_to_display='%s' not valid, set to default (None).", pval_to_display) pval_to_display = None - elif dicpval[pval_to_display] if first part is the same = same group_col value - # -> if second part is the same = same group_col2 value + # -> if second part is the same = same group_col2 value pval_series = pval_series[ ( - pval_series.index.get_level_values(0).map(lambda x: x.split(" - ")[0]) == pval_series.index.get_level_values(1).map(lambda x: x.split(" - ")[0]) + pval_series.index.get_level_values(0).map(lambda x: x.split(" - ")[0]) + == pval_series.index.get_level_values(1).map(lambda x: x.split(" - ")[0]) ) | ( - pval_series.index.get_level_values(0).map(lambda x: x.split(" - ")[1]) == pval_series.index.get_level_values(1).map(lambda x: x.split(" - ")[1]) + pval_series.index.get_level_values(0).map(lambda x: x.split(" - ")[1]) + == pval_series.index.get_level_values(1).map(lambda x: x.split(" - ")[1]) )] return pval_series @@ -306,13 +310,13 @@ def _order_pval_series( level0 = pval_series.loc[i][names[0]] level1 = pval_series.loc[i][names[1]] if dic_gps[level0] > dic_gps[level1]: - pval_series.loc[i, names[0]] = level1 # invert to have the Group that should be put first as first - pval_series.loc[i, names[1]] = level0 # in example: "Group B - Group A" becomes "Group A - Group B" + pval_series.loc[i, names[0]] = level1 # invert to have the Group that should be put first as first + pval_series.loc[i, names[1]] = level0 # in example: "Group B - Group A" becomes "Group A - Group B" pval_series[names[0]] = pval_series[names[0]].astype("category") pval_series[names[0]] = pval_series[names[0]].cat.set_categories(groups, ordered=True) pval_series[names[1]] = pval_series[names[1]].astype("category") pval_series[names[1]] = pval_series[names[1]].cat.set_categories(groups, ordered=True) - pval_series = pval_series.sort_values([names[0], names[1]]) # we sort by 1st member, and then 2nd member + pval_series = pval_series.sort_values([names[0], names[1]]) # we sort by 1st member, and then 2nd member pval_series = pval_series.set_index([names[0], names[1]]) return pval_series[0] @@ -341,7 +345,8 @@ def _generate_shapes_annotations_lists( To generate annotations to represent significant pvalues. Methods for group_col only (not group_col2) Args: - pval_series: pd.Series of the pvalue to put on the graph (need to be filtered beforehand so that it only contains significant pvalues) + pval_series: pd.Series of the pvalue to put on the graph (need to be filtered beforehand + so that it only contains significant pvalues) """ # Overview of this method: We're trying to generate the shapes (1 bracket = 3 lines = # 1 going from Group1 to Group2 and 2 small lines to form the edge of the bracket) @@ -353,91 +358,96 @@ def _generate_shapes_annotations_lists( # need to be added to `level` dic_gps = {} for i in range(len(groups)): - dic_gps[groups[i]]=i - - # the pvalues need to be ordered so that looking at the cell corresponding to the left edge + dic_gps[groups[i]] = i + + # the pvalues need to be ordered so that looking at the cell corresponding to the left edge # of the bracket that needs to be added is enough to determine if there is already a # bracket there. pval_series = self._order_pval_series(pval_series, groups, dic_gps) - + det = (hgt_min/15) - + hgt_min += det/2 fontsize = int(12+det) linewidth = 0.5+0.15*det - - level=[[0] * (len(groups))] + + level = [[0] * (len(groups))] list_shapes = [] list_annotations = [] - + for ind, val in pval_series.items(): y = 0 - + left_ind = dic_gps[ind[0]] # -> could be directly ind for shapes but /!\ not for the annotations right_ind = dic_gps[ind[1]] - + for i in range(len(level)): if level[i][left_ind] == 0: - level[i][left_ind:right_ind+1]=[1]*(right_ind-left_ind+1) + level[i][left_ind:right_ind+1] = [1]*(right_ind-left_ind+1) list_shapes += [ - {'x0':left_ind, 'y0':hgt_min+(i*det/2), - 'x1':right_ind, 'y1':hgt_min+(i*det/2), 'line':dict(width=linewidth)}, # from Group1 to Group2 - {'x0':left_ind, 'y0':hgt_min+(i*det/2)-0.15*det, - 'x1':left_ind, 'y1':hgt_min+(i*det/2), 'line':dict(width=linewidth)}, # left edge of the bracket - {'x0':right_ind, 'y0':hgt_min+(i*det/2)-0.15*det, - 'x1':right_ind, 'y1':hgt_min+(i*det/2), 'line':dict(width=linewidth)} # right edge of the bracket + {'x0': left_ind, 'y0': hgt_min+(i*det/2), + 'x1': right_ind, 'y1': hgt_min+(i*det/2), + 'line': dict(width=linewidth)}, # from Group1 to Group2 + {'x0': left_ind, 'y0': hgt_min+(i*det/2)-0.15*det, + 'x1': left_ind, 'y1': hgt_min+(i*det/2), + 'line': dict(width=linewidth)}, # left edge of the bracket + {'x0': right_ind, 'y0': hgt_min+(i*det/2)-0.15*det, + 'x1': right_ind, 'y1': hgt_min+(i*det/2), + 'line': dict(width=linewidth)} # right edge of the bracket ] if val < 0.01: - list_annotations += [{'text':'**', "font":dict(size=fontsize), - 'x':(left_ind+right_ind)/2, - 'y':hgt_min+(i*det/2)+0.15*det, 'showarrow':False}] + list_annotations += [{'text': '**', "font": dict(size=fontsize), + 'x': (left_ind+right_ind)/2, + 'y': hgt_min+(i*det/2)+0.15*det, 'showarrow': False}] else: - list_annotations += [{'text':'*', "font":dict(size=fontsize), - 'x':(left_ind+right_ind)/2, - 'y':hgt_min+(i*det/2)+0.15*det, 'showarrow':False}] + list_annotations += [{'text': '*', "font": dict(size=fontsize), + 'x': (left_ind+right_ind)/2, + 'y': hgt_min+(i*det/2)+0.15*det, 'showarrow': False}] y = 1 break if y == 0: # we need to add another level i += 1 level += [[0] * (len(groups))] - level[i][left_ind:right_ind+1]=[1]*(right_ind-left_ind+1) # or ind + level[i][left_ind:right_ind+1] = [1]*(right_ind-left_ind+1) # or ind list_shapes += [ - {'x0':left_ind, 'y0':hgt_min+(i*det/2), - 'x1':right_ind, 'y1':hgt_min+(i*det/2), 'line':dict(width=linewidth)}, - {'x0':left_ind, 'y0':hgt_min+(i*det/2)-0.15*det, - 'x1':left_ind, 'y1':hgt_min+(i*det/2), 'line':dict(width=linewidth)}, - {'x0':right_ind, 'y0':hgt_min+(i*det/2)-0.15*det, - 'x1':right_ind, 'y1':hgt_min+(i*det/2), 'line':dict(width=linewidth)}] + {'x0': left_ind, 'y0': hgt_min+(i*det/2), + 'x1': right_ind, 'y1': hgt_min+(i*det/2), 'line': dict(width=linewidth)}, + {'x0': left_ind, 'y0': hgt_min+(i*det/2)-0.15*det, + 'x1': left_ind, 'y1': hgt_min+(i*det/2), 'line': dict(width=linewidth)}, + {'x0': right_ind, 'y0': hgt_min+(i*det/2)-0.15*det, + 'x1': right_ind, 'y1': hgt_min+(i*det/2), 'line': dict(width=linewidth)}] if val < 0.01: - list_annotations += [{'text':'**', "font":dict(size=fontsize), - 'x':(left_ind+right_ind)/2, - 'y':hgt_min+(i*det/2)+0.15*det, 'showarrow':False}] + list_annotations += [{'text': '**', "font": dict(size=fontsize), + 'x': (left_ind+right_ind)/2, + 'y': hgt_min+(i*det/2)+0.15*det, 'showarrow': False}] else: - list_annotations += [{'text':'*',"font":dict(size=fontsize), - 'x':(left_ind+right_ind)/2, - 'y':hgt_min+(i*det/2)+0.15*det, 'showarrow':False}] - + list_annotations += [{'text': '*', "font": dict(size=fontsize), + 'x': (left_ind+right_ind)/2, + 'y': hgt_min+(i*det/2)+0.15*det, 'showarrow': False}] + return list_shapes, list_annotations, len(level) """ def _generate_shapes_annotations_lists_supergroup( self, pval_series, groups, supergroups, hgt_min ): - """ - #:param supergroups dictionary with supergroup edges - """ + # :param supergroups dictionary with supergroup edges dic_gps = {} for i in range(len(groups)): dic_gps[groups[i]]=i - - + list_shapes = [] dic_middle = {} - supergroups_to_display = set(list(pval_series.index.get_level_values(0)) + list(pval_series.index.get_level_values(1))) + supergroups_to_display = set(list(pval_series.index.get_level_values(0)) \ + + list(pval_series.index.get_level_values(1))) for i in supergroups_to_display: if isinstance(supergroups[i], list): - list_shapes += [{'x0':supergroups[i][0], 'y0':hgt_min, 'x1':supergroups[i][1], 'y1':hgt_min, 'line':dict(width=1)}] + list_shapes += [{ + 'x0':supergroups[i][0], 'y0':hgt_min, + 'x1':supergroups[i][1], 'y1':hgt_min, + 'line':dict(width=1) + }] dic_middle[i] = (dic_gps[supergroups[i][0]] + dic_gps[supergroups[i][1]])/2 else: dic_middle[i] = dic_gps[supergroups[i]] @@ -447,7 +457,7 @@ def _generate_shapes_annotations_lists_supergroup( for k, v in sorted(dic_middle.items(), key=lambda item: item[1]): dic_supergps[k] = i i+=1 - + level=[[0] * (len(dic_supergps))] list_annotations = [] for ind, val in pval_series.items(): @@ -456,52 +466,54 @@ def _generate_shapes_annotations_lists_supergroup( if dic_middle[ind[0]] > dic_middle[ind[1]]: ind = (ind[1], ind[0]) if level[i][dic_supergps[ind[0]]] == 0: - level[i][dic_supergps[ind[0]]:dic_supergps[ind[1]]+1]=[1]*(dic_supergps[ind[1]]-dic_supergps[ind[0]]+1) # or ind + level[i][dic_supergps[ind[0]]:dic_supergps[ind[1]]+1] = \ + [1]*(dic_supergps[ind[1]]-dic_supergps[ind[0]]+1) # or ind list_shapes += [ - {'x0':dic_middle[ind[0]], 'y0':hgt_min+i+0.5, + {'x0':dic_middle[ind[0]], 'y0':hgt_min+i+0.5, 'x1':dic_middle[ind[1]], 'y1':hgt_min+i+0.5, 'line':dict(width=1)}, {'x0':dic_middle[ind[0]], 'x1':dic_middle[ind[0]], 'y1':hgt_min+i+0.5, #'y0':hgt_min, 'line':dict(width=1, dash="dot")}, 'y0':hgt_min+i+0.35, 'line':dict(width=1)}, - {'x0':dic_middle[ind[1]], 'x1':dic_middle[ind[1]], 'y1':hgt_min+i+0.5, + {'x0':dic_middle[ind[1]], 'x1':dic_middle[ind[1]], 'y1':hgt_min+i+0.5, #'y0':hgt_min, 'line':dict(width=1, dash="dot")} 'y0':hgt_min+i+0.35, 'line':dict(width=1)} ] if val < 0.01: - list_annotations += [{'text':'**','x':(dic_middle[ind[0]]+dic_middle[ind[1]])/2, + list_annotations += [{'text':'**','x':(dic_middle[ind[0]]+dic_middle[ind[1]])/2, 'y':hgt_min+i+0.65, 'showarrow':False}] else: - list_annotations += [{'text':'*','x':(dic_middle[ind[0]]+dic_middle[ind[1]])/2, + list_annotations += [{'text':'*','x':(dic_middle[ind[0]]+dic_middle[ind[1]])/2, 'y':hgt_min+i+0.65, 'showarrow':False}] y = 1 break if y == 0: i += 1 level += [[0] * (len(dic_supergps))] - level[i][dic_supergps[ind[0]]:dic_supergps[ind[1]]+1]=[1]*(dic_supergps[ind[1]]-dic_supergps[ind[0]]+1) # or ind + level[i][dic_supergps[ind[0]]:dic_supergps[ind[1]]+1] = \ + [1]*(dic_supergps[ind[1]]-dic_supergps[ind[0]]+1) # or ind list_shapes += [ - {'x0':dic_middle[ind[0]], 'y0':hgt_min+i+0.5, + {'x0':dic_middle[ind[0]], 'y0':hgt_min+i+0.5, 'x1':dic_middle[ind[1]], 'y1':hgt_min+i+0.5, 'line':dict(width=1)}, - {'x0':dic_middle[ind[0]], 'x1':dic_middle[ind[0]], 'y1':hgt_min+i+0.5, + {'x0':dic_middle[ind[0]], 'x1':dic_middle[ind[0]], 'y1':hgt_min+i+0.5, #'y0':hgt_min, 'line':dict(width=1, dash="dot")}, 'y0':hgt_min+i+0.35, 'line':dict(width=1)}, - {'x0':dic_middle[ind[1]], 'x1':dic_middle[ind[1]], 'y1':hgt_min+i+0.5, + {'x0':dic_middle[ind[1]], 'x1':dic_middle[ind[1]], 'y1':hgt_min+i+0.5, #'y0':hgt_min, 'line':dict(width=1, dash="dot")} 'y0':hgt_min+i+0.35, 'line':dict(width=1)} ] if val < 0.01: - list_annotations += [{'text':'**','x':(dic_middle[ind[0]]+dic_middle[ind[1]])/2, + list_annotations += [{'text':'**','x':(dic_middle[ind[0]]+dic_middle[ind[1]])/2, 'y':hgt_min+i+0.65, 'showarrow':False}] else: - list_annotations += [{'text':'*','x':(dic_middle[ind[0]]+dic_middle[ind[1]])/2, + list_annotations += [{'text':'*','x':(dic_middle[ind[0]]+dic_middle[ind[1]])/2, 'y':hgt_min+i+0.65, 'showarrow':False}] - + return list_shapes, list_annotations, len(level) """ # for now, method above and below in different methods - #def _generate_dic_shapes_and_annotations( + # def _generate_dic_shapes_and_annotations( # self, pval_series, dic_lev, hgt_min - #): + # ): # list_shapes={} # dic_annotations={} # for ind, val in pval_series.items(): @@ -625,16 +637,16 @@ def analyse_groups( # pval is in the right structure to be returned final_groups = groups # to remove from here later on - + if pval_to_display: final_groups = groups to_display = self._pval_selection(pval, groups) - + if pval_to_display and to_display.empty: # nothing to display pval_to_display = None if pval_to_display: - hgt_min = df[self.DIVERSITY_INDEXES_NAME].max() #+ 0.5 + hgt_min = df[self.DIVERSITY_INDEXES_NAME].max() # + 0.5 list_shapes, list_annotations, nb_lev = self._generate_shapes_annotations_lists( to_display, final_groups, hgt_min ) @@ -642,7 +654,7 @@ def analyse_groups( plotting_options = {} det = (hgt_min/15) hgt_min += det/2 - #nblevels=to_display.shape[0] + # nblevels=to_display.shape[0] plotting_options = merge_dict(plotting_options, { 'layout': { 'shapes': list_shapes, # we should had to previous list if there is one diff --git a/setup.py b/setup.py index 72b773f..8ad867e 100644 --- a/setup.py +++ b/setup.py @@ -24,4 +24,4 @@ ], packages=find_packages(), entry_points={'console_scripts': ['moonstone=moonstone.main:run']}, -) \ No newline at end of file +) diff --git a/tests/analysis/diversity/test_alpha.py b/tests/analysis/diversity/test_alpha.py index 57696ad..1643a16 100644 --- a/tests/analysis/diversity/test_alpha.py +++ b/tests/analysis/diversity/test_alpha.py @@ -114,6 +114,15 @@ def test_invalid_pval_param(self): self.assertIn("WARNING:moonstone.analysis.diversity.base:pval_to_display='lilili' not valid, \ set to default (None).", log.output) + def test_inconsistent_pval_to_diplay_param(self): + tested_object_instance = ShannonIndex(self.tested_object) + with self.assertRaises(ValueError) as cm: + tested_object_instance._valid_pval_param("same group_col or group_col2 values", "all") + the_exception = cm.exception + expected_msg = "pval_to_display='all' not valid, when pval_to_compute='same group_col or group_col2 values'. \ +pval_to_display should be set to: ['same group_col or group_col2 values', 'same group_col values', None]" + self.assertEqual(the_exception.__str__(), expected_msg) + def test_analyse_groups_pval_to_compute_all(self): tested_object_instance = ShannonIndex(self.tested_object) @@ -201,14 +210,237 @@ def test_analyse_groups_pval_to_compute_same_group_col_or_group_col2_values(self ) pd.testing.assert_series_equal(output['pval'], expected_ser, check_dtype=False) + def test_pval_selection(self): + tested_object = pd.Series({ + ('A', 'B'): 0.03, + ('A', 'C'): 0.5, + ('A', 'D'): 0.0014, + ('B', 'C'): 0.2, + ('B', 'D'): 0.001, + ('C', 'D'): 0.00067, + }) + tested_object.index.names = ["Group1", "Group2"] + tested_object_instance = ShannonIndex(self.tested_object) + + expected_ser = pd.Series({ + ('A', 'B'): 0.03, + }) + expected_ser.index.names = ["Group1", "Group2"] + pd.testing.assert_series_equal( + tested_object_instance._pval_selection(tested_object, ['A', 'B', 'C']), + expected_ser + ) + + expected_ser = pd.Series({ + ('A', 'D'): 0.0014, + ('C', 'D'): 0.00067, + }) + expected_ser.index.names = ["Group1", "Group2"] + pd.testing.assert_series_equal( + tested_object_instance._pval_selection(tested_object, ['A', 'C', 'D']), + expected_ser + ) + + def test_pval_selection_with_group_col2(self): + # pval_to_compute = 'all' + tested_object = pd.Series({ + ('M - C', 'M - B'): 0.03, + ('M - A', 'F - A'): 0.5, + ('M - C', 'F - A'): 0.0034, + ('F - B', 'M - A'): 0.6, + ('F - C', 'F - B'): 0.0014, + ('F - B', 'M - B'): 0.2, + ('F - A', 'F - B'): 0.001, + ('M - A', 'M - B'): 0.00067, + ('F - A', 'F - C'): 0.0031, + ('M - C', 'F - C'): 0.0003, + ('M - C', 'F - B'): 0.0056, + ('M - A', 'M - C'): 0.89, + ('M - A', 'F - C'): 0.0006, + ('M - B', 'F - C'): 0.0043, + ('F - A', 'M - B'): 0.234, + }) + tested_object.index.names = ["Group1", "Group2"] + tested_object_instance = ShannonIndex(self.tested_object) + + groups = ['M - C', 'F - C', 'M - A', 'F - A', 'M - B', 'F - B'] + + # Case pval_to_display = 'all': only pval > 0.05 removed + pval_to_display = 'all' + expected_ser = pd.Series({ + ('M - C', 'M - B'): 0.03, + ('M - C', 'F - A'): 0.0034, + ('F - C', 'F - B'): 0.0014, + ('F - A', 'F - B'): 0.001, + ('M - A', 'M - B'): 0.00067, + ('F - A', 'F - C'): 0.0031, + ('M - C', 'F - C'): 0.0003, + ('M - C', 'F - B'): 0.0056, + ('M - A', 'F - C'): 0.0006, + ('M - B', 'F - C'): 0.0043, + }) + expected_ser.index.names = ["Group1", "Group2"] + res = tested_object_instance._pval_selection_with_group_col2( + tested_object, groups, 'all', pval_to_display + ) + pd.testing.assert_series_equal(res, expected_ser) + + # Case pval_to_display = 'same group_col or group_col2 values' + pval_to_display = 'same group_col or group_col2 values' + expected_ser = pd.Series({ + ('M - C', 'M - B'): 0.03, + ('F - C', 'F - B'): 0.0014, + ('F - A', 'F - B'): 0.001, + ('M - A', 'M - B'): 0.00067, + ('F - A', 'F - C'): 0.0031, + ('M - C', 'F - C'): 0.0003, + }) + expected_ser.index.names = ["Group1", "Group2"] + res = tested_object_instance._pval_selection_with_group_col2( + tested_object, groups, 'all', pval_to_display + ) + pd.testing.assert_series_equal(res, expected_ser) + + # Case pval_to_display = 'same group_col values' + pval_to_display = 'same group_col values' + expected_ser = pd.Series({ + ('M - C', 'M - B'): 0.03, + ('F - C', 'F - B'): 0.0014, + ('F - A', 'F - B'): 0.001, + ('M - A', 'M - B'): 0.00067, + ('F - A', 'F - C'): 0.0031, + }) + expected_ser.index.names = ["Group1", "Group2"] + res = tested_object_instance._pval_selection_with_group_col2( + tested_object, groups, 'all', pval_to_display + ) + pd.testing.assert_series_equal(res, expected_ser) + + # Case final_groups defined + groups = ['M - A', 'F - A', 'M - C', 'F - C'] + expected_ser = pd.Series({ + ('M - C', 'F - A'): 0.0034, + ('F - A', 'F - C'): 0.0031, + ('M - C', 'F - C'): 0.0003, + ('M - A', 'F - C'): 0.0006, + }) + expected_ser.index.names = ["Group1", "Group2"] + res = tested_object_instance._pval_selection_with_group_col2( + tested_object, groups, 'all', 'all' + ) + pd.testing.assert_series_equal(res, expected_ser) + + def test_pval_selection_with_group_col2_pval_to_compute_same_group_col_or_same_group_col2(self): + # pval_to_compute = 'same group_col or group_col2 values' + tested_object = pd.Series({ + ('M - C', 'M - B'): 0.03, + ('M - A', 'F - A'): 0.5, + ('F - C', 'F - B'): 0.0014, + ('F - B', 'M - B'): 0.2, + ('F - A', 'F - B'): 0.001, + ('M - A', 'M - B'): 0.00067, + ('F - A', 'F - C'): 0.0031, + ('F - C', 'M - C'): 0.0003, + ('M - A', 'M - C'): 0.89, + }) + tested_object.index.names = ["Group1", "Group2"] + tested_object_instance = ShannonIndex(self.tested_object) + + groups = ['M - C', 'F - C', 'M - A', 'F - A', 'M - B', 'F - B'] + + # Case pval_to_display = 'same group_col or group_col2 values': only pval > 0.05 removed + pval_to_display = 'same group_col or group_col2 values' + expected_ser = pd.Series({ + ('M - C', 'M - B'): 0.03, + ('F - C', 'F - B'): 0.0014, + ('F - A', 'F - B'): 0.001, + ('M - A', 'M - B'): 0.00067, + ('F - A', 'F - C'): 0.0031, + ('F - C', 'M - C'): 0.0003, + }) + expected_ser.index.names = ["Group1", "Group2"] + res = tested_object_instance._pval_selection_with_group_col2( + tested_object, groups, 'same group_col or group_col2 values', pval_to_display + ) + pd.testing.assert_series_equal(res, expected_ser) + + # Case pval_to_display = 'same group_col values' + pval_to_display = 'same group_col values' + expected_ser = pd.Series({ + ('M - C', 'M - B'): 0.03, + ('F - C', 'F - B'): 0.0014, + ('F - A', 'F - B'): 0.001, + ('M - A', 'M - B'): 0.00067, + ('F - A', 'F - C'): 0.0031, + }) + expected_ser.index.names = ["Group1", "Group2"] + res = tested_object_instance._pval_selection_with_group_col2( + tested_object, groups, 'same group_col or group_col2 values', pval_to_display + ) + pd.testing.assert_series_equal(res, expected_ser) + + # Case final_groups defined + groups = ['M - A', 'F - A', 'M - C', 'F - C'] + expected_ser = pd.Series({ + ('F - A', 'F - C'): 0.0031, + ('F - C', 'M - C'): 0.0003, + }) + expected_ser.index.names = ["Group1", "Group2"] + res = tested_object_instance._pval_selection_with_group_col2( + tested_object, groups, 'same group_col or group_col2 values', 'same group_col or group_col2 values' + ) + pd.testing.assert_series_equal(res, expected_ser) + + def test_pval_selection_with_group_col2_pval_to_compute_same_group_col(self): + # pval_to_compute = 'same group_col values' + tested_object = pd.Series({ + ('M - C', 'M - B'): 0.03, + ('F - C', 'F - B'): 0.0014, + ('F - A', 'F - B'): 0.001, + ('M - A', 'M - B'): 0.00067, + ('F - A', 'F - C'): 0.0031, + ('M - A', 'M - C'): 0.89, + }) + tested_object.index.names = ["Group1", "Group2"] + tested_object_instance = ShannonIndex(self.tested_object) + + groups = ['M - C', 'F - C', 'M - A', 'F - A', 'M - B', 'F - B'] + + # Case pval_to_display = 'same group_col values' + pval_to_display = 'same group_col values' + expected_ser = pd.Series({ + ('M - C', 'M - B'): 0.03, + ('F - C', 'F - B'): 0.0014, + ('F - A', 'F - B'): 0.001, + ('M - A', 'M - B'): 0.00067, + ('F - A', 'F - C'): 0.0031, + }) + expected_ser.index.names = ["Group1", "Group2"] + res = tested_object_instance._pval_selection_with_group_col2( + tested_object, groups, 'same group_col values', pval_to_display + ) + pd.testing.assert_series_equal(res, expected_ser) + + # Case final_groups defined + groups = ['M - A', 'F - A', 'M - C', 'F - C'] + expected_ser = pd.Series({ + ('F - A', 'F - C'): 0.0031, + }) + expected_ser.index.names = ["Group1", "Group2"] + res = tested_object_instance._pval_selection_with_group_col2( + tested_object, groups, 'same group_col values', pval_to_display + ) + pd.testing.assert_series_equal(res, expected_ser) + def test_generate_ordered_final_groups(self): tested_object = pd.DataFrame.from_dict({ - 'sex_Group': {'comp1': 'M - A', 'comp2': 'F - B', 'comp3': 'F - A', 'comp4': 'F - C', 'comp5': 'M - C', 'comp6': 'M - B', 'comp7': 'M - A'}, - 'sex': {'comp1': 'M', 'comp2': 'F', 'comp3': 'F', 'comp4': 'F', 'comp5': 'M', 'comp6': 'M', 'comp7': 'M'}, + 'sex_Group': {'comp1': 'M - A', 'comp2': 'F - B', 'comp3': 'F - A', 'comp4': 'F - C', 'comp5': 'M - C', + 'comp6': 'M - B', 'comp7': 'M - A'}, + 'sex': {'comp1': 'M', 'comp2': 'F', 'comp3': 'F', 'comp4': 'F', 'comp5': 'M', 'comp6': 'M', 'comp7': 'M'}, 'Group': {'comp1': 'A', 'comp2': 'B', 'comp3': 'A', 'comp4': 'C', 'comp5': 'C', 'comp6': 'B', 'comp7': 'A'}, }) - groups_Group=["C", "A", "B"] - groups_sex=["M", "F"] + groups_Group = ["C", "A", "B"] + groups_sex = ["M", "F"] tested_object_instance = ShannonIndex(self.tested_object) final_groups = tested_object_instance._generate_ordered_final_groups( tested_object, final_group_col='sex_Group', group_col='Group', group_col2='sex', @@ -221,7 +453,7 @@ def test_generate_ordered_final_groups(self): groups=groups_sex, groups2=groups_Group ) self.assertListEqual(final_groups, ['M - C', 'M - A', 'M - B', 'F - C', 'F - A', 'F - B']) - # testing if order of groups not given + # testing if order of groups not given final_groups = tested_object_instance._generate_ordered_final_groups( tested_object, final_group_col='sex_Group', group_col='sex', group_col2='Group', groups=None, groups2=groups_Group @@ -232,7 +464,7 @@ def test_generate_ordered_final_groups(self): tested_object, final_group_col='sex_Group', group_col='sex', group_col2='Group', groups=groups_sex, groups2=None ) - self.assertListEqual(final_groups, ['M - A', 'M - B', 'M - C', 'F - A', 'F - B', 'F - C']) + self.assertListEqual(final_groups, ['M - A', 'M - B', 'M - C', 'F - A', 'F - B', 'F - C']) def test_order_pval_series(self): tested_object = pd.Series({ @@ -251,13 +483,13 @@ def test_order_pval_series(self): tested_object_instance = ShannonIndex(self.tested_object) level0 = pd.Categorical( - ['M - C', 'M - C', 'M - C', 'F - C', 'F - C', 'M - A', 'M - A', 'F - A', 'M - B'], - categories=['M - C', 'F - C', 'M - A', 'F - A', 'M - B', 'F - B'], + ['M - C', 'M - C', 'M - C', 'F - C', 'F - C', 'M - A', 'M - A', 'F - A', 'M - B'], + categories=['M - C', 'F - C', 'M - A', 'F - A', 'M - B', 'F - B'], ordered=True, dtype='category' ) level1 = pd.Categorical( - ['F - C', 'M - A', 'M - B', 'F - A', 'F - B', 'F - A', 'M - B', 'F - B', 'F - B'], - categories=['M - C', 'F - C', 'M - A', 'F - A', 'M - B', 'F - B'], + ['F - C', 'M - A', 'M - B', 'F - A', 'F - B', 'F - A', 'M - B', 'F - B', 'F - B'], + categories=['M - C', 'F - C', 'M - A', 'F - A', 'M - B', 'F - B'], ordered=True, dtype='category' ) data = [[0.0003, 0.89, 0.03, 0.0031, 0.0014, 0.5, 0.00067, 0.001, 0.2]] @@ -270,12 +502,12 @@ def test_order_pval_series(self): pd.testing.assert_series_equal( tested_object_instance._order_pval_series( - tested_object, groups, + tested_object, groups, {'M - C': 0, 'F - C': 1, 'M - A': 2, 'F - A': 3, 'M - B': 4, 'F - B': 5} ), expected_ser ) - + class TestSimpsonInverseIndex(TestCase): @@ -429,4 +661,4 @@ def test_visualize(self): tree = TreeNode.read(StringIO( u'(((species1:0.25,species2:0.25):0.75,species3:1.0):0.5,(species4:0.5,species5:0.5):1.0)root;')) tested_object_instance = FaithsPhylogeneticDiversity(tested_object, tree) - tested_object_instance.visualize(show=False) \ No newline at end of file + tested_object_instance.visualize(show=False) From 2567a48b508c1f8cc280c26120f78a6aa6932790 Mon Sep 17 00:00:00 2001 From: Agnes BAUD Date: Wed, 15 Nov 2023 16:10:59 +0100 Subject: [PATCH 7/8] docstring new methods --- moonstone/analysis/diversity/base.py | 103 ++++++++++++++++++------- tests/analysis/diversity/test_alpha.py | 13 ++-- 2 files changed, 78 insertions(+), 38 deletions(-) diff --git a/moonstone/analysis/diversity/base.py b/moonstone/analysis/diversity/base.py index 20b4147..fc2a010 100644 --- a/moonstone/analysis/diversity/base.py +++ b/moonstone/analysis/diversity/base.py @@ -1,12 +1,13 @@ import logging import re from abc import ABC, abstractmethod -import skbio +from numbers import Real from string import capwords from typing import Union import numpy as np import pandas as pd +import skbio from statsmodels.stats.multitest import multipletests from moonstone.analysis.statistical_test import statistical_test_groups_comparison @@ -253,10 +254,18 @@ def _valid_correction_method_param(self, correction_method): return correction_method def _pval_selection( - self, pval_series, groups - ): + self, pval_series: pd.Series, groups: list, + threshold: float = 0.05 + ) -> pd.Series: + """ + To select the p-values to display. The significant p-values, meaning the p-values under a given threshold, belonging to two groups diplayed - pval_series = pval_series[pval_series < 0.05] + Args: + pval_series: series of all the p-values computed. + groups: list of groups displayed in graph. + threshold: the significance threshold. It must be between between 0 and 1. Default is 0.05. + """ + pval_series = pval_series[pval_series < threshold] if groups is not None: pval_series = pval_series[( pval_series.index.get_level_values(0).isin(groups) & pval_series.index.get_level_values(1).isin(groups) @@ -264,15 +273,26 @@ def _pval_selection( return pval_series def _pval_selection_with_group_col2( - self, pval_series, final_groups, pval_to_compute, pval_to_display - ): + self, pval_series: pd.Series, final_groups: list, + pval_to_compute: str, pval_to_display: str, + threshold: float = 0.05 + ) -> pd.Series: + """ + To select the p-values to display when the group_col2 argument is being used. + The significant p-values, meaning the p-values under a given threshold, belonging to two groups diplayed + + Args: + pval_series: series of all the p-values computed. + final_groups: list of all the combinations displayed in graph: "{group_col value} - {group_col2 value}". + threshold: the significance threshold. It must be between between 0 and 1. Default is 0.05. + """ # Reminder: # 1) This method called only if pval_to_display is not None. # So pval_to_compute/pval_to_display = # {"all", "same group_col or group_col2 values", "same group_col values"} # 2) Index values follow this pattern: "{group_col value} - {group_col2 value}" - pval_series = self._pval_selection(pval_series, final_groups) + pval_series = self._pval_selection(pval_series, final_groups, threshold) if (pval_to_compute != "same group_col values" and pval_to_display == "same group_col values"): # we only have to check first part of index values @@ -298,8 +318,21 @@ def _pval_selection_with_group_col2( return pval_series def _order_pval_series( - self, pval_series: pd.Series, groups: list, dic_gps: dict - ): + self, pval_series: pd.Series, groups: list, **kwargs + ) -> pd.Series: + """ + To select the p-values to display when the group_col2 argument is being used. + The significant p-values, meaning the p-values under a given threshold, belonging to two groups diplayed + + Args: + pval_series: series of all the p-values computed. + groups: ordered list of the groups displayed in graph: "{group_col value} - {group_col2 value}". + """ + dic_gps = kwargs.pop("dic_gps", {}) + if not dic_gps: + for i in range(len(groups)): + dic_gps[groups[i]] = i + # Reminder: pvalue series is a MultiIndex series -> 2 level of index are the 2 groups compared # to order p-value series in a specific order dictated in dic_gps # example: dic_gps = {"Group A": 0, "Group B": 1, "Group C": 3} @@ -321,13 +354,27 @@ def _order_pval_series( return pval_series[0] def _generate_ordered_final_groups( - self, df: pd.DataFrame, final_group_col: str, group_col: str, group_col2: str, + self, metadata_df: pd.DataFrame, final_group_col: str, group_col: str, group_col2: str, groups: list, groups2: list - ): - # listing and sorting all final_groups possibles respecting the order given by - # groups first and then by groups2 - # NB: At least one of groups or groups2 need to not be None - t = df.drop_duplicates(subset=[final_group_col]).copy() # copy() to avoid raising SettingWithCopyWarning + ) -> list: + """ + To order the values from final_group_col + (e.g. the combined names of group_col and group_col2: "{group_col value} - {group_col2 value}") + as it should be displayed in the graph: + Following first the order commanded by groups, and then the order commanded by groups2 + + Args: + metadata_df: dataframe containing metadata and information to group the data. + final_group_col: column generated from concatening group_col and group_col2 + (e.g. "{group_col value} - {group_col2 value}") + group_col: column from metadata_df used to group the data + group_col2: column from metadata_df used to further divide the data + groups: ordered list of groups from group_col to display in graph. + groups2: ordered list of groups from group_col2 to display in graph. + """ + # This method is called if pval_to_display isn't None and if at least one of groups or groups2 isn't None + # It lists and sorts all final_groups possibles respecting the order given by groups first and then by groups2 + t = metadata_df.drop_duplicates(subset=[final_group_col]).copy() # copy() to avoid raising SettingWithCopyWarning if groups: t[group_col] = t[group_col].astype("category") t[group_col] = t[group_col].cat.set_categories(groups, ordered=True) @@ -339,14 +386,15 @@ def _generate_ordered_final_groups( return list(t[final_group_col]) def _generate_shapes_annotations_lists( - self, pval_series, groups, hgt_min + self, pval_series:pd.Series, groups: list, hgt_min: Real ): """ - To generate annotations to represent significant pvalues. Methods for group_col only (not group_col2) + To generate annotations to represent significant p-values. Methods for group_col only (not group_col2) Args: - pval_series: pd.Series of the pvalue to put on the graph (need to be filtered beforehand - so that it only contains significant pvalues) + pval_series: series of the p-values to put on the graph (need to be filtered beforehand + so that it only contains significant p-values). + groups: list of groups displayed in graph. """ # Overview of this method: We're trying to generate the shapes (1 bracket = 3 lines = # 1 going from Group1 to Group2 and 2 small lines to form the edge of the bracket) @@ -390,7 +438,7 @@ def _generate_shapes_annotations_lists( 'line': dict(width=linewidth)}, # from Group1 to Group2 {'x0': left_ind, 'y0': hgt_min+(i*det/2)-0.15*det, 'x1': left_ind, 'y1': hgt_min+(i*det/2), - 'line': dict(width=linewidth)}, # left edge of the bracket + 'line': dict(width=linewidth)}, # left edge of the bracket {'x0': right_ind, 'y0': hgt_min+(i*det/2)-0.15*det, 'x1': right_ind, 'y1': hgt_min+(i*det/2), 'line': dict(width=linewidth)} # right edge of the bracket @@ -521,7 +569,7 @@ def _generate_shapes_annotations_lists_supergroup( def _compute_pval_inside_subgroups( self, diversity_index_dataframe: pd.DataFrame, group_col: str, final_group_col: str, stats_test: str, correction_method: str, structure_pval: str, sym: bool - ): + ) -> pd.Series: pval = pd.Series([], dtype='float64') for g in diversity_index_dataframe[group_col].dropna().unique(): df_gp = diversity_index_dataframe[diversity_index_dataframe[group_col] == g] @@ -597,16 +645,11 @@ def analyse_groups( df = self._get_grouped_df(filtered_metadata_df[[group_col, group_col2, final_group_col]]) if pval_to_display and (groups or groups2): - # listing and sorting all final_groups possibles respecting the order given by + # list and sort all final_groups possibles respecting the order given by # groups first and then by groups2 - t = df.drop_duplicates(subset=[final_group_col]) - t[group_col] = t[group_col].astype("category") - t[group_col].cat = t[group_col].cat.set_categories(groups) - t[group_col2] = t[group_col2].astype("category") - t[group_col].cat = t[group_col2].cat.set_categories(groups2) - t = t.dropna(how="any", subset=[group_col, group_col2]) - t = t.sort_values([group_col, group_col2]) - final_groups = list(t[final_group_col]) + final_groups = self._generate_ordered_final_groups( + df, final_group_col, group_col, group_col2, groups, groups2 + ) if pval_to_compute == "all": pval = self._run_statistical_test_groups( diff --git a/tests/analysis/diversity/test_alpha.py b/tests/analysis/diversity/test_alpha.py index 1643a16..af899a2 100644 --- a/tests/analysis/diversity/test_alpha.py +++ b/tests/analysis/diversity/test_alpha.py @@ -1,3 +1,4 @@ +import re from unittest import TestCase from io import StringIO @@ -474,9 +475,9 @@ def test_order_pval_series(self): ('M - B', 'F - B'): 0.2, ('F - A', 'F - B'): 0.001, ('M - A', 'M - B'): 0.00067, - ('F - A', 'F - C'): 0.0031, + ('F - A', 'F - C'): 0.0031, # should be reorganized as ('F - C', 'F - A') ('M - C', 'F - C'): 0.0003, - ('M - A', 'M - C'): 0.89, + ('M - A', 'M - C'): 0.89, # should be reorganized as ('M - C', 'M - A') }) tested_object.index.names = ["Group1", "Group2"] groups = ['M - C', 'F - C', 'M - A', 'F - A', 'M - B', 'F - B'] @@ -495,16 +496,12 @@ def test_order_pval_series(self): data = [[0.0003, 0.89, 0.03, 0.0031, 0.0014, 0.5, 0.00067, 0.001, 0.2]] expected_ser = pd.DataFrame( data=data, - columns=pd.MultiIndex.from_arrays([level0, level1]), - + columns=pd.MultiIndex.from_arrays([level0, level1]) ).T[0] expected_ser.index.names = ["Group1", "Group2"] pd.testing.assert_series_equal( - tested_object_instance._order_pval_series( - tested_object, groups, - {'M - C': 0, 'F - C': 1, 'M - A': 2, 'F - A': 3, 'M - B': 4, 'F - B': 5} - ), + tested_object_instance._order_pval_series(tested_object, groups), expected_ser ) From f8392300fade033264a39b92b7fc7f6f6d2cba25 Mon Sep 17 00:00:00 2001 From: Agnes BAUD Date: Wed, 15 Nov 2023 16:15:11 +0100 Subject: [PATCH 8/8] flake8 --- moonstone/analysis/diversity/base.py | 12 +++++++----- tests/analysis/diversity/test_alpha.py | 1 - 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/moonstone/analysis/diversity/base.py b/moonstone/analysis/diversity/base.py index fc2a010..dc11c2b 100644 --- a/moonstone/analysis/diversity/base.py +++ b/moonstone/analysis/diversity/base.py @@ -258,7 +258,8 @@ def _pval_selection( threshold: float = 0.05 ) -> pd.Series: """ - To select the p-values to display. The significant p-values, meaning the p-values under a given threshold, belonging to two groups diplayed + To select the p-values to display. The significant p-values, meaning the p-values under a given threshold, + belonging to two groups diplayed. Args: pval_series: series of all the p-values computed. @@ -358,9 +359,9 @@ def _generate_ordered_final_groups( groups: list, groups2: list ) -> list: """ - To order the values from final_group_col + To order the values from final_group_col (e.g. the combined names of group_col and group_col2: "{group_col value} - {group_col2 value}") - as it should be displayed in the graph: + as it should be displayed in the graph: Following first the order commanded by groups, and then the order commanded by groups2 Args: @@ -374,7 +375,8 @@ def _generate_ordered_final_groups( """ # This method is called if pval_to_display isn't None and if at least one of groups or groups2 isn't None # It lists and sorts all final_groups possibles respecting the order given by groups first and then by groups2 - t = metadata_df.drop_duplicates(subset=[final_group_col]).copy() # copy() to avoid raising SettingWithCopyWarning + t = metadata_df.drop_duplicates(subset=[final_group_col])\ + .copy() # copy() to avoid raising SettingWithCopyWarning if groups: t[group_col] = t[group_col].astype("category") t[group_col] = t[group_col].cat.set_categories(groups, ordered=True) @@ -386,7 +388,7 @@ def _generate_ordered_final_groups( return list(t[final_group_col]) def _generate_shapes_annotations_lists( - self, pval_series:pd.Series, groups: list, hgt_min: Real + self, pval_series: pd.Series, groups: list, hgt_min: Real ): """ To generate annotations to represent significant p-values. Methods for group_col only (not group_col2) diff --git a/tests/analysis/diversity/test_alpha.py b/tests/analysis/diversity/test_alpha.py index af899a2..a70821d 100644 --- a/tests/analysis/diversity/test_alpha.py +++ b/tests/analysis/diversity/test_alpha.py @@ -1,4 +1,3 @@ -import re from unittest import TestCase from io import StringIO