From b99179afe86bec48c5b258f50cd197f63bdc0e72 Mon Sep 17 00:00:00 2001
From: Agnes BAUD <agnes.baud@pasteur.fr>
Date: Tue, 4 Jul 2023 16:06:24 +0200
Subject: [PATCH 1/8] Draft pvalue display in graph

---
 moonstone/analysis/diversity/base.py | 313 ++++++++++++++++++++++++++-
 moonstone/plot/graphs/base.py        |   9 -
 2 files changed, 306 insertions(+), 16 deletions(-)

diff --git a/moonstone/analysis/diversity/base.py b/moonstone/analysis/diversity/base.py
index 3243194..a047001 100644
--- a/moonstone/analysis/diversity/base.py
+++ b/moonstone/analysis/diversity/base.py
@@ -1,11 +1,11 @@
 import logging
 import re
 from abc import ABC, abstractmethod
-import skbio
+#import skbio
 from string import capwords
 from typing import Union
 
-import numpy as np
+#import numpy as np
 import pandas as pd
 from statsmodels.stats.multitest import multipletests
 
@@ -16,6 +16,8 @@
 from moonstone.plot.graphs.heatmap import HeatmapGraph
 from moonstone.plot.graphs.histogram import Histogram
 from moonstone.plot.graphs.violin import GroupViolinGraph, ViolinGraph
+from moonstone.utils.dict_operations import merge_dict
+
 
 logger = logging.getLogger(__name__)
 
@@ -219,14 +221,28 @@ def _visualize_pvalue_matrix(self, pval: pd.DataFrame, output_pval_file: str):
             output_file=output_pval_file
         )
 
-    def _valid_pval_param(self, pval_to_compute):
+    def _valid_pval_param(self, pval_to_compute, pval_to_display):
         choices = [
             "all", "same group_col or group_col2 values", "same group_col values", None
         ]
+        # choices are listed from the broadest set of comparisons to the narrowest one: the rank of
+        # pval_to_display has to be >= the rank of pval_to_compute (cannot display what is not computed)
+        rank = {choice: i for i, choice in enumerate(choices)}
+
         if pval_to_compute not in choices:
             logger.warning("pval_to_compute='%s' not valid, set to default (all).", pval_to_compute)
             pval_to_compute = "all"
-        return pval_to_compute
+
+        if pval_to_display not in choices:
+            logger.warning("pval_to_display='%s' not valid, set to default (None).", pval_to_display)
+            pval_to_display = None
+        elif rank[pval_to_display] < rank[pval_to_compute]:
+            raise ValueError("pval_to_display='{}' not valid, when pval_to_compute='{}'. \
+pval_to_display should be set to :{}".format(
+                pval_to_display, pval_to_compute, choices[rank[pval_to_compute]:]
+            ))
+
+        return pval_to_compute, pval_to_display
 
     def _valid_correction_method_param(self, correction_method):
         if correction_method == "uncorrected":
@@ -236,6 +252,243 @@ def _valid_correction_method_param(self, correction_method):
             return None
         return correction_method
 
+    def _pval_selection(
+        self, pval_series, groups, alpha=0.05
+    ):
+        """Keep the p-values < alpha and, if `groups` is given, only comparisons between two of these groups."""
+        pval_series = pval_series[pval_series < alpha]
+        if groups is not None:
+            level0 = pval_series.index.get_level_values(0)
+            level1 = pval_series.index.get_level_values(1)
+            pval_series = pval_series[level0.isin(groups) & level1.isin(groups)]
+        return pval_series
+    
+    def _pval_selection_with_group_col2(
+        self, pval_series, final_groups, pval_to_compute, pval_to_display
+    ):
+        """
+        Select the significant p-values to display on the graph when group_col2 is used.
+
+        Reminder:
+            1) Only called if pval_to_display is not None, so pval_to_compute and pval_to_display
+               are both in {"all", "same group_col or group_col2 values", "same group_col values"}
+            2) Index values follow this pattern: "{group_col value} - {group_col2 value}"
+        """
+        logger.debug("pval_to_compute=%s, pval_to_display=%s", pval_to_compute, pval_to_display)
+        pval_series = self._pval_selection(pval_series, final_groups)
+
+        def part(level, position):
+            # `position`-th part (group_col value or group_col2 value) of the index values of `level`
+            return pval_series.index.get_level_values(level).map(lambda x: x.split(" - ")[position])
+
+        # nothing more to filter when the computation was already restricted to the displayed comparisons
+        if (pval_to_compute != "same group_col values" and
+                pval_to_display == "same group_col values"):
+            # comparing the first part of the index values is enough
+            pval_series = pval_series[part(0, 0) == part(1, 0)]
+        elif (pval_to_compute == "all" and
+              pval_to_display == "same group_col or group_col2 values"):
+            # same first part = same group_col value ; same second part = same group_col2 value
+            pval_series = pval_series[(part(0, 0) == part(1, 0)) | (part(0, 1) == part(1, 1))]
+
+        return pval_series
+
+    def _order_pval_series(
+        self, pval_series, groups, dic_gps
+    ):
+        # Reminder: pvalue series is a MultiIndex series -> 2 level of index are the 2 groups compared
+        # to order p-value series in a specific order dictated in dic_gps
+        # example: dic_gps = {"Group A": 0, "Group B": 1, "Group C": 3}
+        names = pval_series.index.names     # default: ["Group1", "Group2"]
+
+        # first we order p-value series index name to have the group that should come first as first member
+        pval_series = pval_series.reset_index()
+        for i in pval_series.index:
+            level0 = pval_series.loc[i][names[0]]
+            level1 = pval_series.loc[i][names[1]]
+            if dic_gps[level0] > dic_gps[level1]:
+                pval_series.loc[i, names[0]] = level1       # invert to have the Group that should be put first as first
+                pval_series.loc[i, names[1]] = level0       # in example: "Group B - Group A" becomes "Group A - Group B"
+        pval_series[names[0]] = pval_series[names[0]].astype("category")
+        pval_series[names[0]].cat.set_categories(groups, inplace=True)
+        pval_series[names[1]] = pval_series[names[1]].astype("category")
+        pval_series[names[1]].cat.set_categories(groups, inplace=True)
+        pval_series = pval_series.sort_values([names[0], names[1]]) # we sort by 1st member, and then 2nd member
+        pval_series = pval_series.set_index([names[0], names[1]])
+        return pval_series[0]
+
+    def _generate_shapes_annotations_lists(
+        self, pval_series, groups, hgt_min
+    ):
+        """
+        Generate the shapes and annotations representing significant p-values. Method for group_col only (not group_col2).
+
+        Args:
+            pval_series: pd.Series of the p-values to put on the graph (needs to be filtered beforehand so that it only contains significant p-values)
+        """
+        # Overview of this method: we generate the shapes (1 bracket = 3 lines =
+        # 1 going from Group1 to Group2 and 2 small lines forming the edges of the bracket)
+        # and the annotations.
+        # To ensure that the brackets don't overlap, we create a table/list named `level`
+        # with a number of columns equal to the number of groups and an expandable number of rows.
+        # When we fill the list, we replace 0 by 1, meaning that there is now a bracket at this
+        # location, that we can't add another bracket on top of it, and that another level (row)
+        # needs to be added to `level`.
+        dic_gps = {}
+        for i in range(len(groups)):
+            dic_gps[groups[i]]=i        
+        
+        # the pvalues need to be ordered so that looking at the cell corresponding to the left edge 
+        # of the bracket that needs to be added is enough to determine if there is already a
+        # bracket there.
+        pval_series = self._order_pval_series(pval_series, groups, dic_gps)
+        
+        det = (hgt_min/15)
+        
+        hgt_min += det/2
+        fontsize = int(12+det)
+        linewidth = 0.5+0.15*det
+    
+        level=[[0] * (len(groups))]
+        list_shapes = []
+        list_annotations = []
+        
+        for ind, val in pval_series.items():
+            y = 0
+            
+            left_ind = dic_gps[ind[0]]  # -> could be directly ind for shapes but /!\ not for the annotations
+            right_ind = dic_gps[ind[1]]
+            
+            for i in range(len(level)):
+                if level[i][left_ind] == 0:
+                    level[i][left_ind:right_ind+1]=[1]*(right_ind-left_ind+1)
+                    list_shapes += [
+                        {'x0':left_ind, 'y0':hgt_min+(i*det/2), 
+                         'x1':right_ind, 'y1':hgt_min+(i*det/2), 'line':dict(width=linewidth)},   # from Group1 to Group2
+                        {'x0':left_ind, 'y0':hgt_min+(i*det/2)-0.15*det, 
+                         'x1':left_ind, 'y1':hgt_min+(i*det/2), 'line':dict(width=linewidth)},    # left edge of the bracket
+                        {'x0':right_ind, 'y0':hgt_min+(i*det/2)-0.15*det, 
+                         'x1':right_ind, 'y1':hgt_min+(i*det/2), 'line':dict(width=linewidth)}    # right edge of the bracket
+                    ]
+                    if val < 0.01:
+                        list_annotations += [{'text':'**', "font":dict(size=fontsize),
+                                              'x':(left_ind+right_ind)/2, 
+                                              'y':hgt_min+(i*det/2)+0.15*det, 'showarrow':False}]
+                    else:
+                        list_annotations += [{'text':'*', "font":dict(size=fontsize),
+                                              'x':(left_ind+right_ind)/2, 
+                                              'y':hgt_min+(i*det/2)+0.15*det, 'showarrow':False}]
+                    y = 1
+                    break
+            if y == 0:
+                # we need to add another level
+                i += 1
+                level += [[0] * (len(groups))]
+                level[i][left_ind:right_ind+1]=[1]*(right_ind-left_ind+1)  # or ind
+                list_shapes += [
+                    {'x0':left_ind, 'y0':hgt_min+(i*det/2), 
+                     'x1':right_ind, 'y1':hgt_min+(i*det/2), 'line':dict(width=linewidth)},
+                    {'x0':left_ind, 'y0':hgt_min+(i*det/2)-0.15*det, 
+                     'x1':left_ind, 'y1':hgt_min+(i*det/2), 'line':dict(width=linewidth)},
+                    {'x0':right_ind, 'y0':hgt_min+(i*det/2)-0.15*det, 
+                     'x1':right_ind, 'y1':hgt_min+(i*det/2), 'line':dict(width=linewidth)}]
+                if val < 0.01:
+                    list_annotations += [{'text':'**', "font":dict(size=fontsize),
+                                          'x':(left_ind+right_ind)/2, 
+                                          'y':hgt_min+(i*det/2)+0.15*det, 'showarrow':False}]
+                else:
+                    list_annotations += [{'text':'*',"font":dict(size=fontsize),
+                                          'x':(left_ind+right_ind)/2,
+                                          'y':hgt_min+(i*det/2)+0.15*det, 'showarrow':False}]
+        
+        return list_shapes, list_annotations, len(level)
+
+    """
+    def _generate_shapes_annotations_lists_supergroup(
+        self, pval_series, groups, supergroups, hgt_min
+    ):
+    """
+        #:param supergroups dictionary with supergroup edges
+    """
+        dic_gps = {}
+        for i in range(len(groups)):
+            dic_gps[groups[i]]=i
+    
+    
+        list_shapes = []
+        dic_middle = {}
+        supergroups_to_display = set(list(pval_series.index.get_level_values(0)) + list(pval_series.index.get_level_values(1)))
+        for i in supergroups_to_display:
+            if isinstance(supergroups[i], list):
+                list_shapes += [{'x0':supergroups[i][0], 'y0':hgt_min, 'x1':supergroups[i][1], 'y1':hgt_min, 'line':dict(width=1)}]                      
+                dic_middle[i] = (dic_gps[supergroups[i][0]] + dic_gps[supergroups[i][1]])/2
+            else:
+                dic_middle[i] = dic_gps[supergroups[i]]
+
+        dic_supergps = {}
+        i = 0
+        for k, v in sorted(dic_middle.items(), key=lambda item: item[1]):
+            dic_supergps[k] = i
+            i+=1
+                
+        level=[[0] * (len(dic_supergps))]
+        list_annotations = []
+        for ind, val in pval_series.items():
+            y = 0
+            for i in range(len(level)):
+                if dic_middle[ind[0]] > dic_middle[ind[1]]:
+                    ind = (ind[1], ind[0])
+                if level[i][dic_supergps[ind[0]]] == 0:
+                    level[i][dic_supergps[ind[0]]:dic_supergps[ind[1]]+1]=[1]*(dic_supergps[ind[1]]-dic_supergps[ind[0]]+1)  # or ind
+                    list_shapes += [
+                            {'x0':dic_middle[ind[0]], 'y0':hgt_min+i+0.5, 
+                             'x1':dic_middle[ind[1]], 'y1':hgt_min+i+0.5, 'line':dict(width=1)},
+                            {'x0':dic_middle[ind[0]], 'x1':dic_middle[ind[0]], 'y1':hgt_min+i+0.5,
+                             #'y0':hgt_min, 'line':dict(width=1, dash="dot")},
+                             'y0':hgt_min+i+0.35, 'line':dict(width=1)},
+                            {'x0':dic_middle[ind[1]], 'x1':dic_middle[ind[1]], 'y1':hgt_min+i+0.5, 
+                             #'y0':hgt_min, 'line':dict(width=1, dash="dot")}
+                             'y0':hgt_min+i+0.35, 'line':dict(width=1)}
+                                   ]
+                    if val < 0.01:
+                        list_annotations += [{'text':'**','x':(dic_middle[ind[0]]+dic_middle[ind[1]])/2, 
+                                              'y':hgt_min+i+0.65, 'showarrow':False}]
+                    else:
+                        list_annotations += [{'text':'*','x':(dic_middle[ind[0]]+dic_middle[ind[1]])/2, 
+                                              'y':hgt_min+i+0.65, 'showarrow':False}]
+                    y = 1
+                    break
+            if y == 0:
+                i += 1
+                level += [[0] * (len(dic_supergps))]
+                level[i][dic_supergps[ind[0]]:dic_supergps[ind[1]]+1]=[1]*(dic_supergps[ind[1]]-dic_supergps[ind[0]]+1)  # or ind
+                list_shapes += [
+                            {'x0':dic_middle[ind[0]], 'y0':hgt_min+i+0.5, 
+                             'x1':dic_middle[ind[1]], 'y1':hgt_min+i+0.5, 'line':dict(width=1)},
+                            {'x0':dic_middle[ind[0]], 'x1':dic_middle[ind[0]], 'y1':hgt_min+i+0.5, 
+                             #'y0':hgt_min, 'line':dict(width=1, dash="dot")},
+                             'y0':hgt_min+i+0.35, 'line':dict(width=1)},
+                            {'x0':dic_middle[ind[1]], 'x1':dic_middle[ind[1]], 'y1':hgt_min+i+0.5, 
+                             #'y0':hgt_min, 'line':dict(width=1, dash="dot")}
+                             'y0':hgt_min+i+0.35, 'line':dict(width=1)}
+                                ]
+                if val < 0.01:
+                    list_annotations += [{'text':'**','x':(dic_middle[ind[0]]+dic_middle[ind[1]])/2, 
+                                              'y':hgt_min+i+0.65, 'showarrow':False}]
+                else:
+                    list_annotations += [{'text':'*','x':(dic_middle[ind[0]]+dic_middle[ind[1]])/2, 
+                                              'y':hgt_min+i+0.65, 'showarrow':False}]
+        
+        return list_shapes, list_annotations, len(level)
+    """
+    # for now, method above and below in different methods
+    #def _generate_dic_shapes_and_annotations(
+    #    self, pval_series, dic_lev, hgt_min
+    #):   
+    #    list_shapes={}
+    #    dic_annotations={}
+    #    for ind, val in pval_series.items():
+
     def _compute_pval_inside_subgroups(
         self, diversity_index_dataframe: pd.DataFrame, group_col: str, final_group_col: str,
         stats_test: str, correction_method: str, structure_pval: str, sym: bool
@@ -264,7 +517,7 @@ def analyse_groups(
         plotting_options: dict = None,
         stats_test: str = 'mann_whitney_u', correction_method: str = None,
         structure_pval: str = 'dataframe', sym: bool = True,
-        pval_to_compute: bool = 'all',
+        pval_to_compute: bool = 'all', pval_to_display: str = None,
         show_pval: bool = True, output_pval_file: str = False,
         **kwargs
     ) -> dict:
@@ -277,7 +530,7 @@ def analyse_groups(
         :param groups: specifically select groups to display among group_col
         :param groups2: specifically select groups to display among group_col2
         :param show: also visualize
-        :param show_pval: visualize p-values
+        :param show_pval: visualize the p-values heatmap
         :param output_file: file path to output your html graph
         :param make_graph: whether or not to make the graph
         :param plotting_options: plotly plotting_options
@@ -290,10 +543,14 @@ def analyse_groups(
         :param pval_to_compute: if group_col2 used, problems of memory or in maximum recursion depth
           may occur. In this case, you may want to compute only p-values of specific comparisons.
           {"all" (default), None, "same group_col values", "same group_col or group_col2 values"}
+        :param pval_to_display: whether you want the significant pvalues displayed on the graph ("all") or not (None)
+          When group_col2 is used you may want to specify which type of comparisons you want to display the
+          significant pvalues of. Otherwise the graph can appear crowded by pvalues lines.
+          {None (default), "all", "same group_col values", "same group_col or group_col2 values"}
         """
         filtered_metadata_df = self._get_filtered_df_from_metadata(metadata_df)
 
-        pval_to_compute = self._valid_pval_param(pval_to_compute)
+        pval_to_compute, pval_to_display = self._valid_pval_param(pval_to_compute, pval_to_display)
         correction_method = self._valid_correction_method_param(correction_method)
 
         if group_col2:
@@ -307,6 +564,19 @@ def analyse_groups(
             )
             df = self._get_grouped_df(filtered_metadata_df[[group_col, group_col2, final_group_col]])
 
+            #if pval_to_display:
+            if 1 == 1:
+                # listing and sorting all final_groups possibles respecting the order given by
+                # groups first and then by groups2
+                t = df.drop_duplicates(subset=[final_group_col])
+                t[group_col] = t[group_col].astype("category")
+                t[group_col].cat.set_categories(groups, inplace=True)
+                t[group_col2] = t[group_col2].astype("category")
+                t[group_col2].cat.set_categories(groups2, inplace=True)
+                t.dropna(how="any", subset=[group_col, group_col2], inplace=True)
+                t = t.sort_values([group_col, group_col2])
+                final_groups = list(t[final_group_col])
+
             if pval_to_compute == "all":
                 pval = self._run_statistical_test_groups(
                     df, final_group_col, stats_test, correction_method, structure_pval, sym
@@ -324,6 +594,9 @@ def analyse_groups(
                         )
                     )
 
+            if pval_to_display:
+                to_display = self._pval_selection_with_group_col2(pval, final_groups, pval_to_compute, pval_to_display)
+
         else:
             df = self._get_grouped_df(filtered_metadata_df[group_col])
             pval = self._run_statistical_test_groups(
@@ -331,6 +604,32 @@ def analyse_groups(
                 )
             # pval is in the right structure to be returned
 
+            final_groups = groups   # to remove from here later on
+            
+            if pval_to_display:
+                final_groups = groups
+                to_display = self._pval_selection(pval, groups)
+                
+        if pval_to_display and to_display.empty:       # nothing to display
+            pval_to_display = None
+
+        if pval_to_display:
+            hgt_min = df[self.DIVERSITY_INDEXES_NAME].max() #+ 0.5
+            list_shapes, list_annotations, nb_lev = self._generate_shapes_annotations_lists(
+                to_display, final_groups, hgt_min
+            )
+            if not plotting_options:
+                plotting_options = {}
+            det = (hgt_min/15)
+            hgt_min += det/2
+            nblevels=to_display.shape[0]
+            plotting_options = merge_dict(plotting_options, {
+                'layout': {
+                    'shapes': list_shapes,            # we should had to previous list if there is one
+                    'annotations': list_annotations,  # idem
+                }
+            })
+
         self.last_grouped_df = df
         self.report_data['analyse_groups'] = {
             'pval': pval,
diff --git a/moonstone/plot/graphs/base.py b/moonstone/plot/graphs/base.py
index 20cedd3..7b79ca3 100644
--- a/moonstone/plot/graphs/base.py
+++ b/moonstone/plot/graphs/base.py
@@ -17,18 +17,9 @@ class BaseGraph(ABC):
     def __init__(
         self,
         data: Union[pd.Series, pd.DataFrame],
-        plotting_options: dict = None,
-        show: bool = True,
-        output_file: Union[bool, str] = False,
     ):
         """
         :param data: data to plot
-        :param show: set to False if you don't want to show the plot
-        :param output_file: name of the output file
-        :param plotting_options: options of plotting that will override the default setup \n
-                                 [!] Make sure the value given to an argument is of the right type \n
-                                 options allowed : 'log': `bool` ; 'colorbar': `[str, List[str]]` ;
-                                 'tickangle': `[int, float]`
         """
         self.data = data
 

From 36805c4507bd3de51046447bd107541729c5f2bf Mon Sep 17 00:00:00 2001
From: Agnes BAUD <agnes.baud@pasteur.fr>
Date: Mon, 9 Oct 2023 16:39:47 +0200
Subject: [PATCH 2/8] add imports

---
 .github/workflows/python-package.yml | 14 +++++++-------
 moonstone/analysis/diversity/base.py |  4 ++--
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
index 9966157..4e52bcb 100644
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -22,13 +22,13 @@ jobs:
         python-version: ${{ matrix.python-version }}
     - name: Install dependencies
       run: |
-        python -m pip3 install --upgrade pip3
-        pip3 install flake8 pytest
-        pip3 install numpy==1.18.1
-        pip3 install .
-        pip3 install odfpy     # optional dependencies
-        pip3 install openpyxl  # idem
-        pip3 install xlrd      # idem
+        python -m pip install --upgrade pip
+        pip install flake8 pytest
+        pip install numpy==1.18.1
+        pip install .
+        pip install odfpy     # optional dependencies
+        pip install openpyxl  # idem
+        pip install xlrd      # idem
     - name: Lint with flake8
       run: |
         # stop the build if there are Python syntax errors or undefined names
diff --git a/moonstone/analysis/diversity/base.py b/moonstone/analysis/diversity/base.py
index a047001..2b2af7f 100644
--- a/moonstone/analysis/diversity/base.py
+++ b/moonstone/analysis/diversity/base.py
@@ -1,11 +1,11 @@
 import logging
 import re
 from abc import ABC, abstractmethod
-#import skbio
+import skbio
 from string import capwords
 from typing import Union
 
-#import numpy as np
+import numpy as np
 import pandas as pd
 from statsmodels.stats.multitest import multipletests
 

From a9a5675bd7ca0d8bccbb2ddd70af88147564bdc6 Mon Sep 17 00:00:00 2001
From: Agnes BAUD <agnes.baud@pasteur.fr>
Date: Tue, 24 Oct 2023 15:04:37 +0200
Subject: [PATCH 3/8] for python 3.9 and other python lib upgrade

---
 moonstone/analysis/diversity/base.py   | 36 +++++++----
 moonstone/plot/counts.py               |  4 +-
 tests/analysis/diversity/test_alpha.py | 83 +++++++++++++++++++++++++-
 3 files changed, 108 insertions(+), 15 deletions(-)

diff --git a/moonstone/analysis/diversity/base.py b/moonstone/analysis/diversity/base.py
index f36f3da..c318b88 100644
--- a/moonstone/analysis/diversity/base.py
+++ b/moonstone/analysis/diversity/base.py
@@ -294,13 +294,12 @@ def _pval_selection_with_group_col2(
         return pval_series
 
     def _order_pval_series(
-        self, pval_series, groups, dic_gps
+        self, pval_series: pd.Series, groups: list, dic_gps: dict
     ):
         # Reminder: pvalue series is a MultiIndex series -> 2 level of index are the 2 groups compared
         # to order p-value series in a specific order dictated in dic_gps
         # example: dic_gps = {"Group A": 0, "Group B": 1, "Group C": 3}
         names = pval_series.index.names     # default: ["Group1", "Group2"]
-
         # first we order p-value series index name to have the group that should come first as first member
         pval_series = pval_series.reset_index()
         for i in pval_series.index:
@@ -310,13 +309,31 @@ def _order_pval_series(
                 pval_series.loc[i, names[0]] = level1       # invert to have the Group that should be put first as first
                 pval_series.loc[i, names[1]] = level0       # in example: "Group B - Group A" becomes "Group A - Group B"
         pval_series[names[0]] = pval_series[names[0]].astype("category")
-        pval_series[names[0]].cat.set_categories(groups, inplace=True)
+        pval_series[names[0]] = pval_series[names[0]].cat.set_categories(groups, ordered=True)
         pval_series[names[1]] = pval_series[names[1]].astype("category")
-        pval_series[names[1]].cat.set_categories(groups, inplace=True)
+        pval_series[names[1]] = pval_series[names[1]].cat.set_categories(groups, ordered=True)
         pval_series = pval_series.sort_values([names[0], names[1]]) # we sort by 1st member, and then 2nd member
         pval_series = pval_series.set_index([names[0], names[1]])
         return pval_series[0]
 
+    def _generate_ordered_final_groups(
+        self, df: pd.DataFrame, final_group_col: str, group_col: str, group_col2: str,
+        groups: list, groups2: list
+    ):
+        """
+        List all the possible final groups, ordered by `groups` first and then by `groups2`.
+        Rows whose group_col or group_col2 value is missing are dropped.
+        NB: At least one of groups or groups2 needs to not be None.
+        """
+        ordered = df.drop_duplicates(subset=[final_group_col]).copy()  # copy() avoids SettingWithCopyWarning
+        for col, order in ((group_col, groups), (group_col2, groups2)):
+            if order:
+                # categories follow the requested order; values absent from it become NaN (dropped below)
+                ordered[col] = ordered[col].astype("category").cat.set_categories(order, ordered=True)
+        ordered = ordered.dropna(how="any", subset=[group_col, group_col2])
+        ordered = ordered.sort_values([group_col, group_col2])
+        return list(ordered[final_group_col])
+
     def _generate_shapes_annotations_lists(
         self, pval_series, groups, hgt_min
     ):
@@ -567,16 +584,15 @@ def analyse_groups(
             )
             df = self._get_grouped_df(filtered_metadata_df[[group_col, group_col2, final_group_col]])
 
-            #if pval_to_display:
-            if 1 == 1:
+            if pval_to_display and (groups or groups2):
                 # listing and sorting all final_groups possibles respecting the order given by
                 # groups first and then by groups2
                 t = df.drop_duplicates(subset=[final_group_col])
                 t[group_col] = t[group_col].astype("category")
-                t[group_col].cat.set_categories(groups, inplace=True)
+                t[group_col] = t[group_col].cat.set_categories(groups, ordered=True) if groups else t[group_col]
                 t[group_col2] = t[group_col2].astype("category")
-                t[group_col2].cat.set_categories(groups2, inplace=True)
-                t.dropna(how="any", subset=[group_col, group_col2], inplace=True)
+                t[group_col2] = t[group_col2].cat.set_categories(groups2, ordered=True) if groups2 else t[group_col2]
+                t = t.dropna(how="any", subset=[group_col, group_col2])
                 t = t.sort_values([group_col, group_col2])
                 final_groups = list(t[final_group_col])
 
@@ -626,7 +642,7 @@ def analyse_groups(
                 plotting_options = {}
             det = (hgt_min/15)
             hgt_min += det/2
-            nblevels=to_display.shape[0]
+            #nblevels=to_display.shape[0]
             plotting_options = merge_dict(plotting_options, {
                 'layout': {
                     'shapes': list_shapes,            # we should had to previous list if there is one
diff --git a/moonstone/plot/counts.py b/moonstone/plot/counts.py
index c31b364..74b753b 100644
--- a/moonstone/plot/counts.py
+++ b/moonstone/plot/counts.py
@@ -70,8 +70,8 @@ def plot_mean_distribution(
 
         mean_series = self.df.mean(axis=1)
         binned_mean = SeriesBinning(mean_series).binned_data
-        bar_fig = BarGraph(binned_mean, plotting_options, show=show, output_file=output_file)
-        bar_fig.plot_one_graph(plotting_options, show=show, output_file=output_file)
+        bar_fig = BarGraph(binned_mean)
+        bar_fig.plot_one_graph(plotting_options=plotting_options, show=show, output_file=output_file)
 
 
 class PlotTaxonomyCounts:
diff --git a/tests/analysis/diversity/test_alpha.py b/tests/analysis/diversity/test_alpha.py
index 0eb1ed4..57696ad 100644
--- a/tests/analysis/diversity/test_alpha.py
+++ b/tests/analysis/diversity/test_alpha.py
@@ -107,10 +107,12 @@ def test_invalid_correction_method_param(self):
     def test_invalid_pval_param(self):
         tested_object_instance = ShannonIndex(self.tested_object)
         with self.assertLogs('moonstone.analysis.diversity.base', level='WARNING') as log:
-            tested_object_instance._valid_pval_param("lalala")
-            self.assertEqual(len(log.output), 1)
+            tested_object_instance._valid_pval_param("lalala", "lilili")
+            self.assertEqual(len(log.output), 2)
             self.assertIn("WARNING:moonstone.analysis.diversity.base:pval_to_compute='lalala' not valid, \
 set to default (all).", log.output)
+            self.assertIn("WARNING:moonstone.analysis.diversity.base:pval_to_display='lilili' not valid, \
+set to default (None).", log.output)
 
     def test_analyse_groups_pval_to_compute_all(self):
         tested_object_instance = ShannonIndex(self.tested_object)
@@ -199,6 +201,81 @@ def test_analyse_groups_pval_to_compute_same_group_col_or_group_col2_values(self
         )
         pd.testing.assert_series_equal(output['pval'], expected_ser, check_dtype=False)
 
+    def test_generate_ordered_final_groups(self):
+        tested_object = pd.DataFrame.from_dict({
+            'sex_Group': {'comp1': 'M - A', 'comp2': 'F - B', 'comp3': 'F - A', 'comp4': 'F - C', 'comp5': 'M - C', 'comp6': 'M - B', 'comp7': 'M - A'}, 
+            'sex': {'comp1': 'M', 'comp2': 'F', 'comp3': 'F', 'comp4': 'F', 'comp5': 'M', 'comp6': 'M', 'comp7': 'M'}, 
+            'Group': {'comp1': 'A', 'comp2': 'B', 'comp3': 'A', 'comp4': 'C', 'comp5': 'C', 'comp6': 'B', 'comp7': 'A'},
+        })
+        groups_Group=["C", "A", "B"]
+        groups_sex=["M", "F"]
+        tested_object_instance = ShannonIndex(self.tested_object)
+        final_groups = tested_object_instance._generate_ordered_final_groups(
+            tested_object, final_group_col='sex_Group', group_col='Group', group_col2='sex',
+            groups=groups_Group, groups2=groups_sex
+        )
+        self.assertListEqual(final_groups, ['M - C', 'F - C', 'M - A', 'F - A', 'M - B', 'F - B'])
+        # now testing other way around
+        final_groups = tested_object_instance._generate_ordered_final_groups(
+            tested_object, final_group_col='sex_Group', group_col='sex', group_col2='Group',
+            groups=groups_sex, groups2=groups_Group
+        )
+        self.assertListEqual(final_groups, ['M - C', 'M - A', 'M - B', 'F - C', 'F - A', 'F - B'])
+        # testing if order of groups not given        
+        final_groups = tested_object_instance._generate_ordered_final_groups(
+            tested_object, final_group_col='sex_Group', group_col='sex', group_col2='Group',
+            groups=None, groups2=groups_Group
+        )
+        self.assertListEqual(final_groups, ['F - C', 'F - A', 'F - B', 'M - C', 'M - A', 'M - B'])
+        # testing if order of groups2 not given
+        final_groups = tested_object_instance._generate_ordered_final_groups(
+            tested_object, final_group_col='sex_Group', group_col='sex', group_col2='Group',
+            groups=groups_sex, groups2=None
+        )
+        self.assertListEqual(final_groups, ['M - A', 'M - B', 'M - C', 'F - A', 'F - B', 'F - C'])        
+
+    def test_order_pval_series(self):
+        tested_object = pd.Series({
+            ('M - C', 'M - B'): 0.03,
+            ('M - A', 'F - A'): 0.5,
+            ('F - C', 'F - B'): 0.0014,
+            ('M - B', 'F - B'): 0.2,
+            ('F - A', 'F - B'): 0.001,
+            ('M - A', 'M - B'): 0.00067,
+            ('F - A', 'F - C'): 0.0031,
+            ('M - C', 'F - C'): 0.0003,
+            ('M - A', 'M - C'): 0.89,
+        })
+        tested_object.index.names = ["Group1", "Group2"]
+        groups = ['M - C', 'F - C', 'M - A', 'F - A', 'M - B', 'F - B']
+        tested_object_instance = ShannonIndex(self.tested_object)
+
+        level0 = pd.Categorical(
+            ['M - C', 'M - C', 'M - C', 'F - C', 'F - C', 'M - A', 'M - A', 'F - A', 'M - B'], 
+            categories=['M - C', 'F - C', 'M - A', 'F - A', 'M - B', 'F - B'], 
+            ordered=True, dtype='category'
+        )
+        level1 = pd.Categorical(
+            ['F - C', 'M - A', 'M - B', 'F - A', 'F - B', 'F - A', 'M - B', 'F - B', 'F - B'], 
+            categories=['M - C', 'F - C', 'M - A', 'F - A', 'M - B', 'F - B'], 
+            ordered=True, dtype='category'
+        )
+        data = [[0.0003, 0.89, 0.03, 0.0031, 0.0014, 0.5, 0.00067, 0.001, 0.2]]
+        expected_ser = pd.DataFrame(
+            data=data,
+            columns=pd.MultiIndex.from_arrays([level0, level1]),
+
+        ).T[0]
+        expected_ser.index.names = ["Group1", "Group2"]
+
+        pd.testing.assert_series_equal(
+            tested_object_instance._order_pval_series(
+                tested_object, groups, 
+                {'M - C': 0, 'F - C': 1, 'M - A': 2, 'F - A': 3, 'M - B': 4, 'F - B': 5}
+            ),
+            expected_ser
+        )
+        
 
 class TestSimpsonInverseIndex(TestCase):
 
@@ -352,4 +429,4 @@ def test_visualize(self):
         tree = TreeNode.read(StringIO(
             u'(((species1:0.25,species2:0.25):0.75,species3:1.0):0.5,(species4:0.5,species5:0.5):1.0)root;'))
         tested_object_instance = FaithsPhylogeneticDiversity(tested_object, tree)
-        tested_object_instance.visualize(show=False)
+        tested_object_instance.visualize(show=False)
\ No newline at end of file

From a150de868b6bf3620d93969285c6ad3e3bc941a1 Mon Sep 17 00:00:00 2001
From: Agnes BAUD <agnes.baud@pasteur.fr>
Date: Mon, 30 Oct 2023 15:47:13 +0100
Subject: [PATCH 4/8] fix groups order

---
 moonstone/plot/graphs/base.py      |  2 +-
 tests/plot/graphs/test_boxgraph.py | 16 ++++++++--------
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/moonstone/plot/graphs/base.py b/moonstone/plot/graphs/base.py
index 4a78d8b..df715ac 100644
--- a/moonstone/plot/graphs/base.py
+++ b/moonstone/plot/graphs/base.py
@@ -227,7 +227,7 @@ def plot_one_graph(
             if groups:
                 filtered_df = self.data[self.data[group_col].isin(groups)]
                 filtered_df[group_col] = filtered_df[group_col].astype("category")
-                filtered_df[group_col].cat = filtered_df[group_col].cat.set_categories(groups)
+                filtered_df[group_col] = filtered_df[group_col].cat.set_categories(groups, ordered=True)
                 filtered_df = filtered_df.sort_values([group_col])
             else:
                 filtered_df = copy.deepcopy(self.data)
diff --git a/tests/plot/graphs/test_boxgraph.py b/tests/plot/graphs/test_boxgraph.py
index f5aee35..43ffb13 100644
--- a/tests/plot/graphs/test_boxgraph.py
+++ b/tests/plot/graphs/test_boxgraph.py
@@ -125,8 +125,8 @@ def test_with_group_col2_without_groups(self):
     def test_with_group_col2_with_groups_and_groups2(self):
         tested_df = pd.DataFrame(
             [
-                [1.0, "M", "A"],
-                [3.0, "F", "B"],
+                [1.0, "F", "B"],
+                [3.0, "F", "C"],
                 [9.0, "M", "A"],
                 [6.0, "M", "B"],
                 [2.0, "F", "A"],
@@ -135,7 +135,7 @@ def test_with_group_col2_with_groups_and_groups2(self):
                 [6.0, "M", "B"],
                 [8.0, "M", "C"],
                 [5.0, "F", "C"],
-                [7.0, "M", "C"],
+                [7.0, "M", "A"],
             ],
             index=[
                 "sample1", "sample2", "sample3", "sample4", "sample5",
@@ -144,12 +144,12 @@ def test_with_group_col2_with_groups_and_groups2(self):
             ],
             columns=["data", "sex", "group"],
         )
-        groups = ["F", "M"]    # change order
+        groups = ["M", "F"]    # change order
         groups2 = ["A", "B"]   # don't show group "C" (+ dictate order)
-        expected_x_gpA = ['F', 'F', 'M', 'M']
-        expected_y_gpA = [2.0, 4.0, 1.0, 9.0]
-        expected_x_gpB = ['F', 'M', 'M', 'M']
-        expected_y_gpB = [3.0, 6.0, 2.0, 6.0]
+        expected_x_gpA = ['M', 'M', 'F', 'F']
+        expected_y_gpA = [9.0, 7.0, 2.0, 4.0]
+        expected_x_gpB = ['M', 'M', 'M', 'F']
+        expected_y_gpB = [6.0, 2.0, 6.0, 1.0]
         plot = GroupBoxGraph(tested_df)
         tested_graph = plot.plot_one_graph(
             data_col="data", group_col="sex", group_col2="group",

From 5627831265fc9cd49cbc91b1b7132e2237c706fd Mon Sep 17 00:00:00 2001
From: Agnes BAUD <agnes.baud@pasteur.fr>
Date: Fri, 10 Nov 2023 16:39:28 +0100
Subject: [PATCH 5/8] Try with python 3.10

---
 .github/workflows/python-package.yml |  2 +-
 setup.py                             | 24 ++++++++++++------------
 2 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
index 3a5546a..8612603 100644
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -12,7 +12,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: ["3.8", "3.9"]
+        python-version: ["3.8", "3.9", "3.10"]
 
     steps:
     - uses: actions/checkout@v2
diff --git a/setup.py b/setup.py
index 93e62f2..72b773f 100644
--- a/setup.py
+++ b/setup.py
@@ -9,18 +9,18 @@
     author='Kenzo-Hugo Hillion, Agnès Baud, Mariela Furstenheim, Sean Kennedy',
     author_email='kehillio@pasteur.fr',
     install_requires=[
-        'pandas==2.0.2',
-        'matplotlib==3.3.0',
-        'plotly==5.17.0',
-        'statsmodels==0.14.0',
-        'python-slugify==4.0.1',
-        'pyaml==20.4.0',
-        'numpy==1.24.3',
-        'scikit-bio==0.5.9',
-        'scikit-learn==1.3.1',
-        'hdmedians==0.14.2',
-        'cython==0.29.21',
-        'scipy==1.9.0'
+        'pandas>=2.0.2',
+        'matplotlib>=3.3.0',
+        'plotly>=5.17.0',
+        'statsmodels>=0.14.0',
+        'python-slugify>=4.0.1',
+        'pyaml>=20.4.0',
+        'numpy>=1.24.3',
+        'scikit-bio>=0.5.9',
+        'scikit-learn>=1.3.1',
+        'hdmedians>=0.14.2',
+        'cython>=0.29.21',
+        'scipy>=1.9.0'
     ],
     packages=find_packages(),
     entry_points={'console_scripts': ['moonstone=moonstone.main:run']},

From 3d79f7800d5eb5723d415abe22e1fa53d01794dd Mon Sep 17 00:00:00 2001
From: Agnes BAUD <agnes.baud@pasteur.fr>
Date: Mon, 13 Nov 2023 17:01:28 +0100
Subject: [PATCH 6/8] tests pval_selection

---
 moonstone/analysis/differential_analysis.py |   2 +-
 moonstone/analysis/diversity/base.py        | 186 +++++++-------
 setup.py                                    |   2 +-
 tests/analysis/diversity/test_alpha.py      | 258 +++++++++++++++++++-
 4 files changed, 346 insertions(+), 102 deletions(-)

diff --git a/moonstone/analysis/differential_analysis.py b/moonstone/analysis/differential_analysis.py
index 9387200..66ea936 100644
--- a/moonstone/analysis/differential_analysis.py
+++ b/moonstone/analysis/differential_analysis.py
@@ -78,7 +78,7 @@ def test_multiple_features(self, feature, test_to_use):
             list_ofgroups = []
             for variable in variable_dic:
                 list_ofgroups.append(variable_dic[variable][self.full_table.columns[family]])
-            #test = self.tests_functions_used[test_to_use](*np.asarray(list_ofgroups))
+            # test = self.tests_functions_used[test_to_use](*np.asarray(list_ofgroups))
             test = self.tests_functions_used[test_to_use](*list_ofgroups)  # works for kruskal and one way anova
             features.append(feature)
             taxons.append(self.full_table.columns[family])
diff --git a/moonstone/analysis/diversity/base.py b/moonstone/analysis/diversity/base.py
index c318b88..20b4147 100644
--- a/moonstone/analysis/diversity/base.py
+++ b/moonstone/analysis/diversity/base.py
@@ -227,8 +227,8 @@ def _valid_pval_param(self, pval_to_compute, pval_to_display):
         ]
         dicpval = {}
         for i in range(len(choices)):
-            dicpval[choices[i]]=i
-            
+            dicpval[choices[i]] = i
+
         if pval_to_compute not in choices:
             logger.warning("pval_to_compute='%s' not valid, set to default (all).", pval_to_compute)
             pval_to_compute = "all"
@@ -236,12 +236,12 @@ def _valid_pval_param(self, pval_to_compute, pval_to_display):
         if pval_to_display not in choices:
             logger.warning("pval_to_display='%s' not valid, set to default (None).", pval_to_display)
             pval_to_display = None
-        elif dicpval[pval_to_display]<dicpval[pval_to_compute]:
+        elif dicpval[pval_to_display] < dicpval[pval_to_compute]:
             raise ValueError("pval_to_display='{}' not valid, when pval_to_compute='{}'. \
-pval_to_display should be set to :{}".format(
+pval_to_display should be set to: {}".format(
                 pval_to_display, pval_to_compute, choices[dicpval[pval_to_compute]:]
             ))
-        
+
         return pval_to_compute, pval_to_display
 
     def _valid_correction_method_param(self, correction_method):
@@ -255,40 +255,44 @@ def _valid_correction_method_param(self, correction_method):
     def _pval_selection(
         self, pval_series, groups
     ):
-        
+
         pval_series = pval_series[pval_series < 0.05]
         if groups is not None:
-            pval_series = pval_series[
-                (pval_series.index.get_level_values(0).isin(groups) & pval_series.index.get_level_values(1).isin(groups))
-            ]
-        return pval_series     
-    
+            pval_series = pval_series[(
+                pval_series.index.get_level_values(0).isin(groups) & pval_series.index.get_level_values(1).isin(groups)
+            )]
+        return pval_series
+
     def _pval_selection_with_group_col2(
         self, pval_series, final_groups, pval_to_compute, pval_to_display
     ):
-        #Reminder: 
-        #    1) This method called only if pval_to_display is not None. So pval_to_compute/pval_to_display = {"all", "same group_col or group_col2 values", "same group_col values"}
+        # Reminder:
+        #    1) This method is called only if pval_to_display is not None.
+        #       So pval_to_compute/pval_to_display =
+        #           {"all", "same group_col or group_col2 values", "same group_col values"}
         #    2) Index values follow this pattern: "{group_col value} - {group_col2 value}"
-        print("pval_to_compute:", pval_to_compute)
-        print("pval_to_display:", pval_to_display)
+
         pval_series = self._pval_selection(pval_series, final_groups)
         if (pval_to_compute != "same group_col values" and
-            pval_to_display == "same group_col values"):
+                pval_to_display == "same group_col values"):
             # we only have to check first part of index values
             pval_series = pval_series[
                 (
-                    pval_series.index.get_level_values(0).map(lambda x: x.split(" - ")[0]) == pval_series.index.get_level_values(1).map(lambda x: x.split(" - ")[0])
+                    pval_series.index.get_level_values(0).map(lambda x: x.split(" - ")[0])
+                    == pval_series.index.get_level_values(1).map(lambda x: x.split(" - ")[0])
                 )
             ]
         elif (pval_to_compute == "all" and
               pval_to_display == "same group_col or group_col2 values"):
             # we compare both part of the index values -> if first part is the same = same group_col value
-                                                     # -> if second part is the same = same group_col2 value
+            #                                          -> if second part is the same = same group_col2 value
             pval_series = pval_series[
                 (
-                    pval_series.index.get_level_values(0).map(lambda x: x.split(" - ")[0]) == pval_series.index.get_level_values(1).map(lambda x: x.split(" - ")[0])
+                    pval_series.index.get_level_values(0).map(lambda x: x.split(" - ")[0])
+                    == pval_series.index.get_level_values(1).map(lambda x: x.split(" - ")[0])
                 ) | (
-                    pval_series.index.get_level_values(0).map(lambda x: x.split(" - ")[1]) == pval_series.index.get_level_values(1).map(lambda x: x.split(" - ")[1])    
+                    pval_series.index.get_level_values(0).map(lambda x: x.split(" - ")[1])
+                    == pval_series.index.get_level_values(1).map(lambda x: x.split(" - ")[1])
                 )]
 
         return pval_series
@@ -306,13 +310,13 @@ def _order_pval_series(
             level0 = pval_series.loc[i][names[0]]
             level1 = pval_series.loc[i][names[1]]
             if dic_gps[level0] > dic_gps[level1]:
-                pval_series.loc[i, names[0]] = level1       # invert to have the Group that should be put first as first
-                pval_series.loc[i, names[1]] = level0       # in example: "Group B - Group A" becomes "Group A - Group B"
+                pval_series.loc[i, names[0]] = level1      # swap so that the group ordered first comes first
+                pval_series.loc[i, names[1]] = level0      # e.g. "Group B - Group A" becomes "Group A - Group B"
         pval_series[names[0]] = pval_series[names[0]].astype("category")
         pval_series[names[0]] = pval_series[names[0]].cat.set_categories(groups, ordered=True)
         pval_series[names[1]] = pval_series[names[1]].astype("category")
         pval_series[names[1]] = pval_series[names[1]].cat.set_categories(groups, ordered=True)
-        pval_series = pval_series.sort_values([names[0], names[1]]) # we sort by 1st member, and then 2nd member
+        pval_series = pval_series.sort_values([names[0], names[1]])  # we sort by 1st member, and then 2nd member
         pval_series = pval_series.set_index([names[0], names[1]])
         return pval_series[0]
 
@@ -341,7 +345,8 @@ def _generate_shapes_annotations_lists(
         To generate annotations to represent significant pvalues. Methods for group_col only (not group_col2)
 
         Args:
-            pval_series: pd.Series of the pvalue to put on the graph (need to be filtered beforehand so that it only contains significant pvalues)
+            pval_series: pd.Series of the pvalues to put on the graph (needs to be filtered beforehand
+              so that it only contains significant pvalues)
         """
         # Overview of this method: We're trying to generate the shapes (1 bracket = 3 lines =
         # 1 going from Group1 to Group2 and 2 small lines to form the edge of the bracket)
@@ -353,91 +358,96 @@ def _generate_shapes_annotations_lists(
         # need to be added to `level`
         dic_gps = {}
         for i in range(len(groups)):
-            dic_gps[groups[i]]=i        
-        
-        # the pvalues need to be ordered so that looking at the cell corresponding to the left edge 
+            dic_gps[groups[i]] = i
+
+        # the pvalues need to be ordered so that looking at the cell corresponding to the left edge
         # of the bracket that needs to be added is enough to determine if there is already a
         # bracket there.
         pval_series = self._order_pval_series(pval_series, groups, dic_gps)
-        
+
         det = (hgt_min/15)
-        
+
         hgt_min += det/2
         fontsize = int(12+det)
         linewidth = 0.5+0.15*det
-    
-        level=[[0] * (len(groups))]
+
+        level = [[0] * (len(groups))]
         list_shapes = []
         list_annotations = []
-        
+
         for ind, val in pval_series.items():
             y = 0
-            
+
             left_ind = dic_gps[ind[0]]  # -> could be directly ind for shapes but /!\ not for the annotations
             right_ind = dic_gps[ind[1]]
-            
+
             for i in range(len(level)):
                 if level[i][left_ind] == 0:
-                    level[i][left_ind:right_ind+1]=[1]*(right_ind-left_ind+1)
+                    level[i][left_ind:right_ind+1] = [1]*(right_ind-left_ind+1)
                     list_shapes += [
-                        {'x0':left_ind, 'y0':hgt_min+(i*det/2), 
-                         'x1':right_ind, 'y1':hgt_min+(i*det/2), 'line':dict(width=linewidth)},   # from Group1 to Group2
-                        {'x0':left_ind, 'y0':hgt_min+(i*det/2)-0.15*det, 
-                         'x1':left_ind, 'y1':hgt_min+(i*det/2), 'line':dict(width=linewidth)},    # left edge of the bracket
-                        {'x0':right_ind, 'y0':hgt_min+(i*det/2)-0.15*det, 
-                         'x1':right_ind, 'y1':hgt_min+(i*det/2), 'line':dict(width=linewidth)}    # right edge of the bracket
+                        {'x0': left_ind, 'y0': hgt_min+(i*det/2),
+                         'x1': right_ind, 'y1': hgt_min+(i*det/2),
+                         'line': dict(width=linewidth)},   # from Group1 to Group2
+                        {'x0': left_ind, 'y0': hgt_min+(i*det/2)-0.15*det,
+                         'x1': left_ind, 'y1': hgt_min+(i*det/2),
+                         'line': dict(width=linewidth)},    # left edge of the bracket
+                        {'x0': right_ind, 'y0': hgt_min+(i*det/2)-0.15*det,
+                         'x1': right_ind, 'y1': hgt_min+(i*det/2),
+                         'line': dict(width=linewidth)}    # right edge of the bracket
                     ]
                     if val < 0.01:
-                        list_annotations += [{'text':'**', "font":dict(size=fontsize),
-                                              'x':(left_ind+right_ind)/2, 
-                                              'y':hgt_min+(i*det/2)+0.15*det, 'showarrow':False}]
+                        list_annotations += [{'text': '**', "font": dict(size=fontsize),
+                                              'x': (left_ind+right_ind)/2,
+                                              'y': hgt_min+(i*det/2)+0.15*det, 'showarrow': False}]
                     else:
-                        list_annotations += [{'text':'*', "font":dict(size=fontsize),
-                                              'x':(left_ind+right_ind)/2, 
-                                              'y':hgt_min+(i*det/2)+0.15*det, 'showarrow':False}]
+                        list_annotations += [{'text': '*', "font": dict(size=fontsize),
+                                              'x': (left_ind+right_ind)/2,
+                                              'y': hgt_min+(i*det/2)+0.15*det, 'showarrow': False}]
                     y = 1
                     break
             if y == 0:
                 # we need to add another level
                 i += 1
                 level += [[0] * (len(groups))]
-                level[i][left_ind:right_ind+1]=[1]*(right_ind-left_ind+1)  # or ind
+                level[i][left_ind:right_ind+1] = [1]*(right_ind-left_ind+1)  # or ind
                 list_shapes += [
-                    {'x0':left_ind, 'y0':hgt_min+(i*det/2), 
-                     'x1':right_ind, 'y1':hgt_min+(i*det/2), 'line':dict(width=linewidth)},
-                    {'x0':left_ind, 'y0':hgt_min+(i*det/2)-0.15*det, 
-                     'x1':left_ind, 'y1':hgt_min+(i*det/2), 'line':dict(width=linewidth)},
-                    {'x0':right_ind, 'y0':hgt_min+(i*det/2)-0.15*det, 
-                     'x1':right_ind, 'y1':hgt_min+(i*det/2), 'line':dict(width=linewidth)}]
+                    {'x0': left_ind, 'y0': hgt_min+(i*det/2),
+                     'x1': right_ind, 'y1': hgt_min+(i*det/2), 'line': dict(width=linewidth)},
+                    {'x0': left_ind, 'y0': hgt_min+(i*det/2)-0.15*det,
+                     'x1': left_ind, 'y1': hgt_min+(i*det/2), 'line': dict(width=linewidth)},
+                    {'x0': right_ind, 'y0': hgt_min+(i*det/2)-0.15*det,
+                     'x1': right_ind, 'y1': hgt_min+(i*det/2), 'line': dict(width=linewidth)}]
                 if val < 0.01:
-                    list_annotations += [{'text':'**', "font":dict(size=fontsize),
-                                          'x':(left_ind+right_ind)/2, 
-                                          'y':hgt_min+(i*det/2)+0.15*det, 'showarrow':False}]
+                    list_annotations += [{'text': '**', "font": dict(size=fontsize),
+                                          'x': (left_ind+right_ind)/2,
+                                          'y': hgt_min+(i*det/2)+0.15*det, 'showarrow': False}]
                 else:
-                    list_annotations += [{'text':'*',"font":dict(size=fontsize),
-                                          'x':(left_ind+right_ind)/2,
-                                          'y':hgt_min+(i*det/2)+0.15*det, 'showarrow':False}]
-        
+                    list_annotations += [{'text': '*', "font": dict(size=fontsize),
+                                          'x': (left_ind+right_ind)/2,
+                                          'y': hgt_min+(i*det/2)+0.15*det, 'showarrow': False}]
+
         return list_shapes, list_annotations, len(level)
 
     """
     def _generate_shapes_annotations_lists_supergroup(
         self, pval_series, groups, supergroups, hgt_min
     ):
-    """
-        #:param supergroups dictionary with supergroup edges
-    """
+    #   :param supergroups dictionary with supergroup edges
         dic_gps = {}
         for i in range(len(groups)):
             dic_gps[groups[i]]=i
-    
-    
+
         list_shapes = []
         dic_middle = {}
-        supergroups_to_display = set(list(pval_series.index.get_level_values(0)) + list(pval_series.index.get_level_values(1)))
+        supergroups_to_display = set(list(pval_series.index.get_level_values(0)) \
+            + list(pval_series.index.get_level_values(1)))
         for i in supergroups_to_display:
             if isinstance(supergroups[i], list):
-                list_shapes += [{'x0':supergroups[i][0], 'y0':hgt_min, 'x1':supergroups[i][1], 'y1':hgt_min, 'line':dict(width=1)}]                      
+                list_shapes += [{
+                    'x0':supergroups[i][0], 'y0':hgt_min,
+                    'x1':supergroups[i][1], 'y1':hgt_min,
+                    'line':dict(width=1)
+                }]
                 dic_middle[i] = (dic_gps[supergroups[i][0]] + dic_gps[supergroups[i][1]])/2
             else:
                 dic_middle[i] = dic_gps[supergroups[i]]
@@ -447,7 +457,7 @@ def _generate_shapes_annotations_lists_supergroup(
         for k, v in sorted(dic_middle.items(), key=lambda item: item[1]):
             dic_supergps[k] = i
             i+=1
-                
+
         level=[[0] * (len(dic_supergps))]
         list_annotations = []
         for ind, val in pval_series.items():
@@ -456,52 +466,54 @@ def _generate_shapes_annotations_lists_supergroup(
                 if dic_middle[ind[0]] > dic_middle[ind[1]]:
                     ind = (ind[1], ind[0])
                 if level[i][dic_supergps[ind[0]]] == 0:
-                    level[i][dic_supergps[ind[0]]:dic_supergps[ind[1]]+1]=[1]*(dic_supergps[ind[1]]-dic_supergps[ind[0]]+1)  # or ind
+                    level[i][dic_supergps[ind[0]]:dic_supergps[ind[1]]+1] = \
+                        [1]*(dic_supergps[ind[1]]-dic_supergps[ind[0]]+1)  # or ind
                     list_shapes += [
-                            {'x0':dic_middle[ind[0]], 'y0':hgt_min+i+0.5, 
+                            {'x0':dic_middle[ind[0]], 'y0':hgt_min+i+0.5,
                              'x1':dic_middle[ind[1]], 'y1':hgt_min+i+0.5, 'line':dict(width=1)},
                             {'x0':dic_middle[ind[0]], 'x1':dic_middle[ind[0]], 'y1':hgt_min+i+0.5,
                              #'y0':hgt_min, 'line':dict(width=1, dash="dot")},
                              'y0':hgt_min+i+0.35, 'line':dict(width=1)},
-                            {'x0':dic_middle[ind[1]], 'x1':dic_middle[ind[1]], 'y1':hgt_min+i+0.5, 
+                            {'x0':dic_middle[ind[1]], 'x1':dic_middle[ind[1]], 'y1':hgt_min+i+0.5,
                              #'y0':hgt_min, 'line':dict(width=1, dash="dot")}
                              'y0':hgt_min+i+0.35, 'line':dict(width=1)}
                                    ]
                     if val < 0.01:
-                        list_annotations += [{'text':'**','x':(dic_middle[ind[0]]+dic_middle[ind[1]])/2, 
+                        list_annotations += [{'text':'**','x':(dic_middle[ind[0]]+dic_middle[ind[1]])/2,
                                               'y':hgt_min+i+0.65, 'showarrow':False}]
                     else:
-                        list_annotations += [{'text':'*','x':(dic_middle[ind[0]]+dic_middle[ind[1]])/2, 
+                        list_annotations += [{'text':'*','x':(dic_middle[ind[0]]+dic_middle[ind[1]])/2,
                                               'y':hgt_min+i+0.65, 'showarrow':False}]
                     y = 1
                     break
             if y == 0:
                 i += 1
                 level += [[0] * (len(dic_supergps))]
-                level[i][dic_supergps[ind[0]]:dic_supergps[ind[1]]+1]=[1]*(dic_supergps[ind[1]]-dic_supergps[ind[0]]+1)  # or ind
+                level[i][dic_supergps[ind[0]]:dic_supergps[ind[1]]+1] = \
+                    [1]*(dic_supergps[ind[1]]-dic_supergps[ind[0]]+1)  # or ind
                 list_shapes += [
-                            {'x0':dic_middle[ind[0]], 'y0':hgt_min+i+0.5, 
+                            {'x0':dic_middle[ind[0]], 'y0':hgt_min+i+0.5,
                              'x1':dic_middle[ind[1]], 'y1':hgt_min+i+0.5, 'line':dict(width=1)},
-                            {'x0':dic_middle[ind[0]], 'x1':dic_middle[ind[0]], 'y1':hgt_min+i+0.5, 
+                            {'x0':dic_middle[ind[0]], 'x1':dic_middle[ind[0]], 'y1':hgt_min+i+0.5,
                              #'y0':hgt_min, 'line':dict(width=1, dash="dot")},
                              'y0':hgt_min+i+0.35, 'line':dict(width=1)},
-                            {'x0':dic_middle[ind[1]], 'x1':dic_middle[ind[1]], 'y1':hgt_min+i+0.5, 
+                            {'x0':dic_middle[ind[1]], 'x1':dic_middle[ind[1]], 'y1':hgt_min+i+0.5,
                              #'y0':hgt_min, 'line':dict(width=1, dash="dot")}
                              'y0':hgt_min+i+0.35, 'line':dict(width=1)}
                                 ]
                 if val < 0.01:
-                    list_annotations += [{'text':'**','x':(dic_middle[ind[0]]+dic_middle[ind[1]])/2, 
+                    list_annotations += [{'text':'**','x':(dic_middle[ind[0]]+dic_middle[ind[1]])/2,
                                               'y':hgt_min+i+0.65, 'showarrow':False}]
                 else:
-                    list_annotations += [{'text':'*','x':(dic_middle[ind[0]]+dic_middle[ind[1]])/2, 
+                    list_annotations += [{'text':'*','x':(dic_middle[ind[0]]+dic_middle[ind[1]])/2,
                                               'y':hgt_min+i+0.65, 'showarrow':False}]
-        
+
         return list_shapes, list_annotations, len(level)
     """
     # for now, method above and below in different methods
-    #def _generate_dic_shapes_and_annotations(
+    # def _generate_dic_shapes_and_annotations(
     #    self, pval_series, dic_lev, hgt_min
-    #):   
+    # ):
     #    list_shapes={}
     #    dic_annotations={}
     #    for ind, val in pval_series.items():
@@ -625,16 +637,16 @@ def analyse_groups(
             # pval is in the right structure to be returned
 
             final_groups = groups   # to remove from here later on
-            
+
             if pval_to_display:
                 final_groups = groups
                 to_display = self._pval_selection(pval, groups)
-                
+
         if pval_to_display and to_display.empty:       # nothing to display
             pval_to_display = None
 
         if pval_to_display:
-            hgt_min = df[self.DIVERSITY_INDEXES_NAME].max() #+ 0.5
+            hgt_min = df[self.DIVERSITY_INDEXES_NAME].max()  # + 0.5
             list_shapes, list_annotations, nb_lev = self._generate_shapes_annotations_lists(
                 to_display, final_groups, hgt_min
             )
@@ -642,7 +654,7 @@ def analyse_groups(
                 plotting_options = {}
             det = (hgt_min/15)
             hgt_min += det/2
-            #nblevels=to_display.shape[0]
+            # nblevels=to_display.shape[0]
             plotting_options = merge_dict(plotting_options, {
                 'layout': {
                     'shapes': list_shapes,            # we should had to previous list if there is one
diff --git a/setup.py b/setup.py
index 72b773f..8ad867e 100644
--- a/setup.py
+++ b/setup.py
@@ -24,4 +24,4 @@
     ],
     packages=find_packages(),
     entry_points={'console_scripts': ['moonstone=moonstone.main:run']},
-)
\ No newline at end of file
+)
diff --git a/tests/analysis/diversity/test_alpha.py b/tests/analysis/diversity/test_alpha.py
index 57696ad..1643a16 100644
--- a/tests/analysis/diversity/test_alpha.py
+++ b/tests/analysis/diversity/test_alpha.py
@@ -114,6 +114,15 @@ def test_invalid_pval_param(self):
             self.assertIn("WARNING:moonstone.analysis.diversity.base:pval_to_display='lilili' not valid, \
 set to default (None).", log.output)
 
+    def test_inconsistent_pval_to_diplay_param(self):
+        tested_object_instance = ShannonIndex(self.tested_object)
+        with self.assertRaises(ValueError) as cm:
+            tested_object_instance._valid_pval_param("same group_col or group_col2 values", "all")
+        the_exception = cm.exception
+        expected_msg = "pval_to_display='all' not valid, when pval_to_compute='same group_col or group_col2 values'. \
+pval_to_display should be set to: ['same group_col or group_col2 values', 'same group_col values', None]"
+        self.assertEqual(the_exception.__str__(), expected_msg)
+
     def test_analyse_groups_pval_to_compute_all(self):
         tested_object_instance = ShannonIndex(self.tested_object)
 
@@ -201,14 +210,237 @@ def test_analyse_groups_pval_to_compute_same_group_col_or_group_col2_values(self
         )
         pd.testing.assert_series_equal(output['pval'], expected_ser, check_dtype=False)
 
+    def test_pval_selection(self):
+        tested_object = pd.Series({
+            ('A', 'B'): 0.03,
+            ('A', 'C'): 0.5,
+            ('A', 'D'): 0.0014,
+            ('B', 'C'): 0.2,
+            ('B', 'D'): 0.001,
+            ('C', 'D'): 0.00067,
+        })
+        tested_object.index.names = ["Group1", "Group2"]
+        tested_object_instance = ShannonIndex(self.tested_object)
+
+        expected_ser = pd.Series({
+            ('A', 'B'): 0.03,
+        })
+        expected_ser.index.names = ["Group1", "Group2"]
+        pd.testing.assert_series_equal(
+            tested_object_instance._pval_selection(tested_object, ['A', 'B', 'C']),
+            expected_ser
+        )
+
+        expected_ser = pd.Series({
+            ('A', 'D'): 0.0014,
+            ('C', 'D'): 0.00067,
+        })
+        expected_ser.index.names = ["Group1", "Group2"]
+        pd.testing.assert_series_equal(
+            tested_object_instance._pval_selection(tested_object, ['A', 'C', 'D']),
+            expected_ser
+        )
+
+    def test_pval_selection_with_group_col2(self):
+        # pval_to_compute = 'all'
+        tested_object = pd.Series({
+            ('M - C', 'M - B'): 0.03,
+            ('M - A', 'F - A'): 0.5,
+            ('M - C', 'F - A'): 0.0034,
+            ('F - B', 'M - A'): 0.6,
+            ('F - C', 'F - B'): 0.0014,
+            ('F - B', 'M - B'): 0.2,
+            ('F - A', 'F - B'): 0.001,
+            ('M - A', 'M - B'): 0.00067,
+            ('F - A', 'F - C'): 0.0031,
+            ('M - C', 'F - C'): 0.0003,
+            ('M - C', 'F - B'): 0.0056,
+            ('M - A', 'M - C'): 0.89,
+            ('M - A', 'F - C'): 0.0006,
+            ('M - B', 'F - C'): 0.0043,
+            ('F - A', 'M - B'): 0.234,
+        })
+        tested_object.index.names = ["Group1", "Group2"]
+        tested_object_instance = ShannonIndex(self.tested_object)
+
+        groups = ['M - C', 'F - C', 'M - A', 'F - A', 'M - B', 'F - B']
+
+        # Case pval_to_display = 'all':  only pval > 0.05 removed
+        pval_to_display = 'all'
+        expected_ser = pd.Series({
+            ('M - C', 'M - B'): 0.03,
+            ('M - C', 'F - A'): 0.0034,
+            ('F - C', 'F - B'): 0.0014,
+            ('F - A', 'F - B'): 0.001,
+            ('M - A', 'M - B'): 0.00067,
+            ('F - A', 'F - C'): 0.0031,
+            ('M - C', 'F - C'): 0.0003,
+            ('M - C', 'F - B'): 0.0056,
+            ('M - A', 'F - C'): 0.0006,
+            ('M - B', 'F - C'): 0.0043,
+        })
+        expected_ser.index.names = ["Group1", "Group2"]
+        res = tested_object_instance._pval_selection_with_group_col2(
+            tested_object, groups, 'all', pval_to_display
+        )
+        pd.testing.assert_series_equal(res, expected_ser)
+
+        # Case pval_to_display = 'same group_col or group_col2 values'
+        pval_to_display = 'same group_col or group_col2 values'
+        expected_ser = pd.Series({
+            ('M - C', 'M - B'): 0.03,
+            ('F - C', 'F - B'): 0.0014,
+            ('F - A', 'F - B'): 0.001,
+            ('M - A', 'M - B'): 0.00067,
+            ('F - A', 'F - C'): 0.0031,
+            ('M - C', 'F - C'): 0.0003,
+        })
+        expected_ser.index.names = ["Group1", "Group2"]
+        res = tested_object_instance._pval_selection_with_group_col2(
+            tested_object, groups, 'all', pval_to_display
+        )
+        pd.testing.assert_series_equal(res, expected_ser)
+
+        # Case pval_to_display = 'same group_col values'
+        pval_to_display = 'same group_col values'
+        expected_ser = pd.Series({
+            ('M - C', 'M - B'): 0.03,
+            ('F - C', 'F - B'): 0.0014,
+            ('F - A', 'F - B'): 0.001,
+            ('M - A', 'M - B'): 0.00067,
+            ('F - A', 'F - C'): 0.0031,
+        })
+        expected_ser.index.names = ["Group1", "Group2"]
+        res = tested_object_instance._pval_selection_with_group_col2(
+            tested_object, groups, 'all', pval_to_display
+        )
+        pd.testing.assert_series_equal(res, expected_ser)
+
+        # Case final_groups defined
+        groups = ['M - A', 'F - A', 'M - C', 'F - C']
+        expected_ser = pd.Series({
+            ('M - C', 'F - A'): 0.0034,
+            ('F - A', 'F - C'): 0.0031,
+            ('M - C', 'F - C'): 0.0003,
+            ('M - A', 'F - C'): 0.0006,
+        })
+        expected_ser.index.names = ["Group1", "Group2"]
+        res = tested_object_instance._pval_selection_with_group_col2(
+            tested_object, groups, 'all', 'all'
+        )
+        pd.testing.assert_series_equal(res, expected_ser)
+
+    def test_pval_selection_with_group_col2_pval_to_compute_same_group_col_or_same_group_col2(self):
+        # pval_to_compute = 'same group_col or group_col2 values'
+        tested_object = pd.Series({
+            ('M - C', 'M - B'): 0.03,
+            ('M - A', 'F - A'): 0.5,
+            ('F - C', 'F - B'): 0.0014,
+            ('F - B', 'M - B'): 0.2,
+            ('F - A', 'F - B'): 0.001,
+            ('M - A', 'M - B'): 0.00067,
+            ('F - A', 'F - C'): 0.0031,
+            ('F - C', 'M - C'): 0.0003,
+            ('M - A', 'M - C'): 0.89,
+        })
+        tested_object.index.names = ["Group1", "Group2"]
+        tested_object_instance = ShannonIndex(self.tested_object)
+
+        groups = ['M - C', 'F - C', 'M - A', 'F - A', 'M - B', 'F - B']
+
+        # Case pval_to_display = 'same group_col or group_col2 values':  only pval > 0.05 removed
+        pval_to_display = 'same group_col or group_col2 values'
+        expected_ser = pd.Series({
+            ('M - C', 'M - B'): 0.03,
+            ('F - C', 'F - B'): 0.0014,
+            ('F - A', 'F - B'): 0.001,
+            ('M - A', 'M - B'): 0.00067,
+            ('F - A', 'F - C'): 0.0031,
+            ('F - C', 'M - C'): 0.0003,
+        })
+        expected_ser.index.names = ["Group1", "Group2"]
+        res = tested_object_instance._pval_selection_with_group_col2(
+            tested_object, groups, 'same group_col or group_col2 values', pval_to_display
+        )
+        pd.testing.assert_series_equal(res, expected_ser)
+
+        # Case pval_to_display = 'same group_col values'
+        pval_to_display = 'same group_col values'
+        expected_ser = pd.Series({
+            ('M - C', 'M - B'): 0.03,
+            ('F - C', 'F - B'): 0.0014,
+            ('F - A', 'F - B'): 0.001,
+            ('M - A', 'M - B'): 0.00067,
+            ('F - A', 'F - C'): 0.0031,
+        })
+        expected_ser.index.names = ["Group1", "Group2"]
+        res = tested_object_instance._pval_selection_with_group_col2(
+            tested_object, groups, 'same group_col or group_col2 values', pval_to_display
+        )
+        pd.testing.assert_series_equal(res, expected_ser)
+
+        # Case final_groups defined
+        groups = ['M - A', 'F - A', 'M - C', 'F - C']
+        expected_ser = pd.Series({
+            ('F - A', 'F - C'): 0.0031,
+            ('F - C', 'M - C'): 0.0003,
+        })
+        expected_ser.index.names = ["Group1", "Group2"]
+        res = tested_object_instance._pval_selection_with_group_col2(
+            tested_object, groups, 'same group_col or group_col2 values', 'same group_col or group_col2 values'
+        )
+        pd.testing.assert_series_equal(res, expected_ser)
+
+    def test_pval_selection_with_group_col2_pval_to_compute_same_group_col(self):
+        # pval_to_compute = 'same group_col values'
+        tested_object = pd.Series({
+            ('M - C', 'M - B'): 0.03,
+            ('F - C', 'F - B'): 0.0014,
+            ('F - A', 'F - B'): 0.001,
+            ('M - A', 'M - B'): 0.00067,
+            ('F - A', 'F - C'): 0.0031,
+            ('M - A', 'M - C'): 0.89,
+        })
+        tested_object.index.names = ["Group1", "Group2"]
+        tested_object_instance = ShannonIndex(self.tested_object)
+
+        groups = ['M - C', 'F - C', 'M - A', 'F - A', 'M - B', 'F - B']
+
+        # Case pval_to_display = 'same group_col values'
+        pval_to_display = 'same group_col values'
+        expected_ser = pd.Series({
+            ('M - C', 'M - B'): 0.03,
+            ('F - C', 'F - B'): 0.0014,
+            ('F - A', 'F - B'): 0.001,
+            ('M - A', 'M - B'): 0.00067,
+            ('F - A', 'F - C'): 0.0031,
+        })
+        expected_ser.index.names = ["Group1", "Group2"]
+        res = tested_object_instance._pval_selection_with_group_col2(
+            tested_object, groups, 'same group_col values', pval_to_display
+        )
+        pd.testing.assert_series_equal(res, expected_ser)
+
+        # Case final_groups defined
+        groups = ['M - A', 'F - A', 'M - C', 'F - C']
+        expected_ser = pd.Series({
+            ('F - A', 'F - C'): 0.0031,
+        })
+        expected_ser.index.names = ["Group1", "Group2"]
+        res = tested_object_instance._pval_selection_with_group_col2(
+            tested_object, groups, 'same group_col values', pval_to_display
+        )
+        pd.testing.assert_series_equal(res, expected_ser)
+
     def test_generate_ordered_final_groups(self):
         tested_object = pd.DataFrame.from_dict({
-            'sex_Group': {'comp1': 'M - A', 'comp2': 'F - B', 'comp3': 'F - A', 'comp4': 'F - C', 'comp5': 'M - C', 'comp6': 'M - B', 'comp7': 'M - A'}, 
-            'sex': {'comp1': 'M', 'comp2': 'F', 'comp3': 'F', 'comp4': 'F', 'comp5': 'M', 'comp6': 'M', 'comp7': 'M'}, 
+            'sex_Group': {'comp1': 'M - A', 'comp2': 'F - B', 'comp3': 'F - A', 'comp4': 'F - C', 'comp5': 'M - C',
+                          'comp6': 'M - B', 'comp7': 'M - A'},
+            'sex': {'comp1': 'M', 'comp2': 'F', 'comp3': 'F', 'comp4': 'F', 'comp5': 'M', 'comp6': 'M', 'comp7': 'M'},
             'Group': {'comp1': 'A', 'comp2': 'B', 'comp3': 'A', 'comp4': 'C', 'comp5': 'C', 'comp6': 'B', 'comp7': 'A'},
         })
-        groups_Group=["C", "A", "B"]
-        groups_sex=["M", "F"]
+        groups_Group = ["C", "A", "B"]
+        groups_sex = ["M", "F"]
         tested_object_instance = ShannonIndex(self.tested_object)
         final_groups = tested_object_instance._generate_ordered_final_groups(
             tested_object, final_group_col='sex_Group', group_col='Group', group_col2='sex',
@@ -221,7 +453,7 @@ def test_generate_ordered_final_groups(self):
             groups=groups_sex, groups2=groups_Group
         )
         self.assertListEqual(final_groups, ['M - C', 'M - A', 'M - B', 'F - C', 'F - A', 'F - B'])
-        # testing if order of groups not given        
+        # testing if order of groups not given
         final_groups = tested_object_instance._generate_ordered_final_groups(
             tested_object, final_group_col='sex_Group', group_col='sex', group_col2='Group',
             groups=None, groups2=groups_Group
@@ -232,7 +464,7 @@ def test_generate_ordered_final_groups(self):
             tested_object, final_group_col='sex_Group', group_col='sex', group_col2='Group',
             groups=groups_sex, groups2=None
         )
-        self.assertListEqual(final_groups, ['M - A', 'M - B', 'M - C', 'F - A', 'F - B', 'F - C'])        
+        self.assertListEqual(final_groups, ['M - A', 'M - B', 'M - C', 'F - A', 'F - B', 'F - C'])
 
     def test_order_pval_series(self):
         tested_object = pd.Series({
@@ -251,13 +483,13 @@ def test_order_pval_series(self):
         tested_object_instance = ShannonIndex(self.tested_object)
 
         level0 = pd.Categorical(
-            ['M - C', 'M - C', 'M - C', 'F - C', 'F - C', 'M - A', 'M - A', 'F - A', 'M - B'], 
-            categories=['M - C', 'F - C', 'M - A', 'F - A', 'M - B', 'F - B'], 
+            ['M - C', 'M - C', 'M - C', 'F - C', 'F - C', 'M - A', 'M - A', 'F - A', 'M - B'],
+            categories=['M - C', 'F - C', 'M - A', 'F - A', 'M - B', 'F - B'],
             ordered=True, dtype='category'
         )
         level1 = pd.Categorical(
-            ['F - C', 'M - A', 'M - B', 'F - A', 'F - B', 'F - A', 'M - B', 'F - B', 'F - B'], 
-            categories=['M - C', 'F - C', 'M - A', 'F - A', 'M - B', 'F - B'], 
+            ['F - C', 'M - A', 'M - B', 'F - A', 'F - B', 'F - A', 'M - B', 'F - B', 'F - B'],
+            categories=['M - C', 'F - C', 'M - A', 'F - A', 'M - B', 'F - B'],
             ordered=True, dtype='category'
         )
         data = [[0.0003, 0.89, 0.03, 0.0031, 0.0014, 0.5, 0.00067, 0.001, 0.2]]
@@ -270,12 +502,12 @@ def test_order_pval_series(self):
 
         pd.testing.assert_series_equal(
             tested_object_instance._order_pval_series(
-                tested_object, groups, 
+                tested_object, groups,
                 {'M - C': 0, 'F - C': 1, 'M - A': 2, 'F - A': 3, 'M - B': 4, 'F - B': 5}
             ),
             expected_ser
         )
-        
+
 
 class TestSimpsonInverseIndex(TestCase):
 
@@ -429,4 +661,4 @@ def test_visualize(self):
         tree = TreeNode.read(StringIO(
             u'(((species1:0.25,species2:0.25):0.75,species3:1.0):0.5,(species4:0.5,species5:0.5):1.0)root;'))
         tested_object_instance = FaithsPhylogeneticDiversity(tested_object, tree)
-        tested_object_instance.visualize(show=False)
\ No newline at end of file
+        tested_object_instance.visualize(show=False)

From 2567a48b508c1f8cc280c26120f78a6aa6932790 Mon Sep 17 00:00:00 2001
From: Agnes BAUD <agnes.baud@pasteur.fr>
Date: Wed, 15 Nov 2023 16:10:59 +0100
Subject: [PATCH 7/8] docstring new methods

---
 moonstone/analysis/diversity/base.py   | 103 ++++++++++++++++++-------
 tests/analysis/diversity/test_alpha.py |  13 ++--
 2 files changed, 78 insertions(+), 38 deletions(-)

diff --git a/moonstone/analysis/diversity/base.py b/moonstone/analysis/diversity/base.py
index 20b4147..fc2a010 100644
--- a/moonstone/analysis/diversity/base.py
+++ b/moonstone/analysis/diversity/base.py
@@ -1,12 +1,13 @@
 import logging
 import re
 from abc import ABC, abstractmethod
-import skbio
+from numbers import Real
 from string import capwords
 from typing import Union
 
 import numpy as np
 import pandas as pd
+import skbio
 from statsmodels.stats.multitest import multipletests
 
 from moonstone.analysis.statistical_test import statistical_test_groups_comparison
@@ -253,10 +254,18 @@ def _valid_correction_method_param(self, correction_method):
         return correction_method
 
     def _pval_selection(
-        self, pval_series, groups
-    ):
+        self, pval_series: pd.Series, groups: list,
+        threshold: float = 0.05
+    ) -> pd.Series:
+        """
+        To select the p-values to display. The significant p-values, meaning the p-values under a given threshold, belonging to two groups diplayed
 
-        pval_series = pval_series[pval_series < 0.05]
+        Args:
+            pval_series: series of all the p-values computed.
+            groups: list of groups displayed in graph.
+            threshold: the significance threshold. It must be between 0 and 1. Default is 0.05.
+        """
+        pval_series = pval_series[pval_series < threshold]
         if groups is not None:
             pval_series = pval_series[(
                 pval_series.index.get_level_values(0).isin(groups) & pval_series.index.get_level_values(1).isin(groups)
@@ -264,15 +273,26 @@ def _pval_selection(
         return pval_series
 
     def _pval_selection_with_group_col2(
-        self, pval_series, final_groups, pval_to_compute, pval_to_display
-    ):
+        self, pval_series: pd.Series, final_groups: list,
+        pval_to_compute: str, pval_to_display: str,
+        threshold: float = 0.05
+    ) -> pd.Series:
+        """
+        To select the p-values to display when the group_col2 argument is being used.
+        Only significant p-values (i.e. under the given threshold) comparing two displayed groups can be selected.
+
+        Args:
+            pval_series: series of all the p-values computed.
+            final_groups: list of all the combinations displayed in graph: "{group_col value} - {group_col2 value}".
+            threshold: the significance threshold. It must be between 0 and 1. Default is 0.05.
+        """
         # Reminder:
         #    1) This method called only if pval_to_display is not None.
         #       So pval_to_compute/pval_to_display =
         #           {"all", "same group_col or group_col2 values", "same group_col values"}
         #    2) Index values follow this pattern: "{group_col value} - {group_col2 value}"
 
-        pval_series = self._pval_selection(pval_series, final_groups)
+        pval_series = self._pval_selection(pval_series, final_groups, threshold)
         if (pval_to_compute != "same group_col values" and
                 pval_to_display == "same group_col values"):
             # we only have to check first part of index values
@@ -298,8 +318,21 @@ def _pval_selection_with_group_col2(
         return pval_series
 
     def _order_pval_series(
-        self, pval_series: pd.Series, groups: list, dic_gps: dict
-    ):
+        self, pval_series: pd.Series, groups: list, **kwargs
+    ) -> pd.Series:
+        """
+        To order the p-value series following the order dictated by dic_gps (by default, the order of groups).
+        Within each pair of the index, the group coming first in that order is put in first position.
+
+        Args:
+            pval_series: series of all the p-values computed.
+            groups: ordered list of the groups displayed in graph: "{group_col value} - {group_col2 value}".
+        """
+        dic_gps = kwargs.pop("dic_gps", {})
+        if not dic_gps:
+            for i in range(len(groups)):
+                dic_gps[groups[i]] = i
+
         # Reminder: pvalue series is a MultiIndex series -> 2 level of index are the 2 groups compared
         # to order p-value series in a specific order dictated in dic_gps
         # example: dic_gps = {"Group A": 0, "Group B": 1, "Group C": 3}
@@ -321,13 +354,27 @@ def _order_pval_series(
         return pval_series[0]
 
     def _generate_ordered_final_groups(
-        self, df: pd.DataFrame, final_group_col: str, group_col: str, group_col2: str,
+        self, metadata_df: pd.DataFrame, final_group_col: str, group_col: str, group_col2: str,
         groups: list, groups2: list
-    ):
-        # listing and sorting all final_groups possibles respecting the order given by
-        # groups first and then by groups2
-        # NB: At least one of groups or groups2 need to not be None
-        t = df.drop_duplicates(subset=[final_group_col]).copy()  # copy() to avoid raising SettingWithCopyWarning
+    ) -> list:
+        """
+        To order the values from final_group_col 
+        (e.g. the combined names of group_col and group_col2: "{group_col value} - {group_col2 value}")
+        as it should be displayed in the graph: 
+        Following first the order commanded by groups, and then the order commanded by groups2
+
+        Args:
+            metadata_df: dataframe containing metadata and information to group the data.
+            final_group_col: column generated from concatenating group_col and group_col2
+              (e.g. "{group_col value} - {group_col2 value}")
+            group_col: column from metadata_df used to group the data
+            group_col2: column from metadata_df used to further divide the data
+            groups: ordered list of groups from group_col to display in graph.
+            groups2: ordered list of groups from group_col2 to display in graph.
+        """
+        # This method is called if pval_to_display isn't None and if at least one of groups or groups2 isn't None
+        # It lists and sorts all final_groups possibles respecting the order given by groups first and then by groups2
+        t = metadata_df.drop_duplicates(subset=[final_group_col]).copy()  # copy() to avoid raising SettingWithCopyWarning
         if groups:
             t[group_col] = t[group_col].astype("category")
             t[group_col] = t[group_col].cat.set_categories(groups, ordered=True)
@@ -339,14 +386,15 @@ def _generate_ordered_final_groups(
         return list(t[final_group_col])
 
     def _generate_shapes_annotations_lists(
-        self, pval_series, groups, hgt_min
+        self, pval_series:pd.Series, groups: list, hgt_min: Real
     ):
         """
-        To generate annotations to represent significant pvalues. Methods for group_col only (not group_col2)
+        To generate annotations to represent significant p-values. Methods for group_col only (not group_col2)
 
         Args:
-            pval_series: pd.Series of the pvalue to put on the graph (need to be filtered beforehand
-              so that it only contains significant pvalues)
+            pval_series: series of the p-values to put on the graph (need to be filtered beforehand
+              so that it only contains significant p-values).
+            groups: list of groups displayed in graph.
         """
         # Overview of this method: We're trying to generate the shapes (1 bracket = 3 lines =
         # 1 going from Group1 to Group2 and 2 small lines to form the edge of the bracket)
@@ -390,7 +438,7 @@ def _generate_shapes_annotations_lists(
                          'line': dict(width=linewidth)},   # from Group1 to Group2
                         {'x0': left_ind, 'y0': hgt_min+(i*det/2)-0.15*det,
                          'x1': left_ind, 'y1': hgt_min+(i*det/2),
-                         'line': dict(width=linewidth)},    # left edge of the bracket
+                         'line': dict(width=linewidth)},   # left edge of the bracket
                         {'x0': right_ind, 'y0': hgt_min+(i*det/2)-0.15*det,
                          'x1': right_ind, 'y1': hgt_min+(i*det/2),
                          'line': dict(width=linewidth)}    # right edge of the bracket
@@ -521,7 +569,7 @@ def _generate_shapes_annotations_lists_supergroup(
     def _compute_pval_inside_subgroups(
         self, diversity_index_dataframe: pd.DataFrame, group_col: str, final_group_col: str,
         stats_test: str, correction_method: str, structure_pval: str, sym: bool
-    ):
+    ) -> pd.Series:
         pval = pd.Series([], dtype='float64')
         for g in diversity_index_dataframe[group_col].dropna().unique():
             df_gp = diversity_index_dataframe[diversity_index_dataframe[group_col] == g]
@@ -597,16 +645,11 @@ def analyse_groups(
             df = self._get_grouped_df(filtered_metadata_df[[group_col, group_col2, final_group_col]])
 
             if pval_to_display and (groups or groups2):
-                # listing and sorting all final_groups possibles respecting the order given by
+                # list and sort all possible final_groups respecting the order given by
                 # groups first and then by groups2
-                t = df.drop_duplicates(subset=[final_group_col])
-                t[group_col] = t[group_col].astype("category")
-                t[group_col].cat = t[group_col].cat.set_categories(groups)
-                t[group_col2] = t[group_col2].astype("category")
-                t[group_col].cat = t[group_col2].cat.set_categories(groups2)
-                t = t.dropna(how="any", subset=[group_col, group_col2])
-                t = t.sort_values([group_col, group_col2])
-                final_groups = list(t[final_group_col])
+                final_groups = self._generate_ordered_final_groups(
+                    df, final_group_col, group_col, group_col2, groups, groups2
+                )
 
             if pval_to_compute == "all":
                 pval = self._run_statistical_test_groups(
diff --git a/tests/analysis/diversity/test_alpha.py b/tests/analysis/diversity/test_alpha.py
index 1643a16..af899a2 100644
--- a/tests/analysis/diversity/test_alpha.py
+++ b/tests/analysis/diversity/test_alpha.py
@@ -1,3 +1,4 @@
+import re
 from unittest import TestCase
 
 from io import StringIO
@@ -474,9 +475,9 @@ def test_order_pval_series(self):
             ('M - B', 'F - B'): 0.2,
             ('F - A', 'F - B'): 0.001,
             ('M - A', 'M - B'): 0.00067,
-            ('F - A', 'F - C'): 0.0031,
+            ('F - A', 'F - C'): 0.0031,    # should be reorganized as ('F - C', 'F - A')
             ('M - C', 'F - C'): 0.0003,
-            ('M - A', 'M - C'): 0.89,
+            ('M - A', 'M - C'): 0.89,      # should be reorganized as ('M - C', 'M - A')
         })
         tested_object.index.names = ["Group1", "Group2"]
         groups = ['M - C', 'F - C', 'M - A', 'F - A', 'M - B', 'F - B']
@@ -495,16 +496,12 @@ def test_order_pval_series(self):
         data = [[0.0003, 0.89, 0.03, 0.0031, 0.0014, 0.5, 0.00067, 0.001, 0.2]]
         expected_ser = pd.DataFrame(
             data=data,
-            columns=pd.MultiIndex.from_arrays([level0, level1]),
-
+            columns=pd.MultiIndex.from_arrays([level0, level1])
         ).T[0]
         expected_ser.index.names = ["Group1", "Group2"]
 
         pd.testing.assert_series_equal(
-            tested_object_instance._order_pval_series(
-                tested_object, groups,
-                {'M - C': 0, 'F - C': 1, 'M - A': 2, 'F - A': 3, 'M - B': 4, 'F - B': 5}
-            ),
+            tested_object_instance._order_pval_series(tested_object, groups),
             expected_ser
         )
 

From f8392300fade033264a39b92b7fc7f6f6d2cba25 Mon Sep 17 00:00:00 2001
From: Agnes BAUD <agnes.baud@pasteur.fr>
Date: Wed, 15 Nov 2023 16:15:11 +0100
Subject: [PATCH 8/8] flake8

---
 moonstone/analysis/diversity/base.py   | 12 +++++++-----
 tests/analysis/diversity/test_alpha.py |  1 -
 2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/moonstone/analysis/diversity/base.py b/moonstone/analysis/diversity/base.py
index fc2a010..dc11c2b 100644
--- a/moonstone/analysis/diversity/base.py
+++ b/moonstone/analysis/diversity/base.py
@@ -258,7 +258,8 @@ def _pval_selection(
         threshold: float = 0.05
     ) -> pd.Series:
         """
-        To select the p-values to display. The significant p-values, meaning the p-values under a given threshold, belonging to two groups diplayed
+        To select the p-values to display: the significant p-values, meaning the p-values under a given threshold,
+        whose two compared groups are both displayed.
 
         Args:
             pval_series: series of all the p-values computed.
@@ -358,9 +359,9 @@ def _generate_ordered_final_groups(
         groups: list, groups2: list
     ) -> list:
         """
-        To order the values from final_group_col 
+        To order the values from final_group_col
         (e.g. the combined names of group_col and group_col2: "{group_col value} - {group_col2 value}")
-        as it should be displayed in the graph: 
+        as it should be displayed in the graph:
         Following first the order commanded by groups, and then the order commanded by groups2
 
         Args:
@@ -374,7 +375,8 @@ def _generate_ordered_final_groups(
         """
         # This method is called if pval_to_display isn't None and if at least one of groups or groups2 isn't None
         # It lists and sorts all final_groups possibles respecting the order given by groups first and then by groups2
-        t = metadata_df.drop_duplicates(subset=[final_group_col]).copy()  # copy() to avoid raising SettingWithCopyWarning
+        t = metadata_df.drop_duplicates(subset=[final_group_col])\
+            .copy()  # copy() to avoid raising SettingWithCopyWarning
         if groups:
             t[group_col] = t[group_col].astype("category")
             t[group_col] = t[group_col].cat.set_categories(groups, ordered=True)
@@ -386,7 +388,7 @@ def _generate_ordered_final_groups(
         return list(t[final_group_col])
 
     def _generate_shapes_annotations_lists(
-        self, pval_series:pd.Series, groups: list, hgt_min: Real
+        self, pval_series: pd.Series, groups: list, hgt_min: Real
     ):
         """
         To generate annotations to represent significant p-values. Methods for group_col only (not group_col2)
diff --git a/tests/analysis/diversity/test_alpha.py b/tests/analysis/diversity/test_alpha.py
index af899a2..a70821d 100644
--- a/tests/analysis/diversity/test_alpha.py
+++ b/tests/analysis/diversity/test_alpha.py
@@ -1,4 +1,3 @@
-import re
 from unittest import TestCase
 
 from io import StringIO