remove, refactor and simplify implementation

ing-bank · Mar 28, 2024 · 2cb8724 · 2cb8724
1 parent 37808de
commit 2cb8724
Show file tree

Hide file tree

Showing 12 changed files with 178,776 additions and 4,809 deletions.
diff --git a/LICENCE b/LICENCE
@@ -1,4 +1,4 @@
-Copyright (c) 2020 ING Bank N.V.
+Copyright (c) ING Bank N.V.
 
 Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
 

diff --git a/docs/tutorials/nb_custom_scoring.ipynb b/docs/tutorials/nb_custom_scoring.ipynb
diff --git a/docs/tutorials/nb_shap_feature_elimination.ipynb b/docs/tutorials/nb_shap_feature_elimination.ipynb
diff --git a/probatus/feature_elimination/feature_elimination.py b/probatus/feature_elimination/feature_elimination.py
diff --git a/probatus/interpret/model_interpret.py b/probatus/interpret/model_interpret.py
@@ -1,6 +1,7 @@
 import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
+from sklearn.metrics import get_scorer
 from shap import summary_plot
 from shap.plots._waterfall import waterfall_legacy
 
@@ -9,7 +10,6 @@
     BaseFitComputePlotClass,
     assure_list_of_strings,
     calculate_shap_importance,
-    get_single_scorer,
     preprocess_data,
     preprocess_labels,
     shap_calc,
@@ -86,7 +86,8 @@ def __init__(self, model, scoring="roc_auc", verbose=0, random_state=None):
                 reproducible results set it to an integer.
         """
         self.model = model
-        self.scorer = get_single_scorer(scoring)
+        self.scoring = scoring  # (str) name of the metric
+        self.scorer = get_scorer(scoring)
         self.verbose = verbose
         self.random_state = random_state
 
@@ -144,12 +145,12 @@ def fit(
             self.class_names = ["Negative Class", "Positive Class"]
 
         # Calculate Metrics
-        self.train_score = self.scorer.score(self.model, self.X_train, self.y_train)
-        self.test_score = self.scorer.score(self.model, self.X_test, self.y_test)
+        self.train_score = self.scorer(self.model, self.X_train, self.y_train)
+        self.test_score = self.scorer(self.model, self.X_test, self.y_test)
 
         self.results_text = (
-            f"Train {self.scorer.metric_name}: {np.round(self.train_score, 3)},\n"
-            f"Test {self.scorer.metric_name}: {np.round(self.test_score, 3)}."
+            f"Train {self.scoring}: {np.round(self.train_score, 3)},\n"
+            f"Test {self.scoring}: {np.round(self.test_score, 3)}."
         )
 
         (

diff --git a/probatus/sample_similarity/resemblance_model.py b/probatus/sample_similarity/resemblance_model.py
@@ -7,8 +7,9 @@
 from shap import summary_plot
 from sklearn.inspection import permutation_importance
 from sklearn.model_selection import train_test_split
+from sklearn.metrics import get_scorer
 
-from probatus.utils import BaseFitComputePlotClass, get_single_scorer, preprocess_data, preprocess_labels
+from probatus.utils import BaseFitComputePlotClass, preprocess_data, preprocess_labels
 from probatus.utils.shap_helpers import calculate_shap_importance, shap_calc
 
 
@@ -70,7 +71,8 @@ class is 'roc_auc'.
         self.n_jobs = n_jobs
         self.random_state = random_state
         self.verbose = verbose
-        self.scorer = get_single_scorer(scoring)
+        self.scoring = scoring  # (str) name of the metric
+        self.scorer = get_scorer(scoring)
 
     def _init_output_variables(self):
         """
@@ -151,20 +153,20 @@ def fit(self, X1, X2, column_names=None, class_names=None):
         )
         self.model.fit(self.X_train, self.y_train)
 
-        self.train_score = np.round(self.scorer.score(self.model, self.X_train, self.y_train), 3)
-        self.test_score = np.round(self.scorer.score(self.model, self.X_test, self.y_test), 3)
+        self.train_score = np.round(self.scorer(self.model, self.X_train, self.y_train), 3)
+        self.test_score = np.round(self.scorer(self.model, self.X_test, self.y_test), 3)
 
         self.results_text = (
-            f"Train {self.scorer.metric_name}: {np.round(self.train_score, 3)},\n"
-            f"Test {self.scorer.metric_name}: {np.round(self.test_score, 3)}."
+            f"Train {self.scoring}: {np.round(self.train_score, 3)},\n"
+            f"Test {self.scoring}: {np.round(self.test_score, 3)}."
         )
         if self.verbose > 1:
             logger.info(f"Finished model training: \n{self.results_text}")
 
         if self.verbose > 0:
             if self.train_score > self.test_score:
                 warnings.warn(
-                    f"Train {self.scorer.metric_name} > Test {self.scorer.metric_name}, which might indicate "
+                    f"Train {self.scoring} > Test {self.scoring}, which might indicate "
                     f"an overfit. \n Strong overfit might lead to misleading conclusions when analysing "
                     f"feature importance. Consider retraining with more regularization applied to the model."
                 )
@@ -384,7 +386,7 @@ def fit(self, X1, X2, column_names=None, class_names=None):
             self.model,
             self.X_test,
             self.y_test,
-            scoring=self.scorer.scorer,
+            scoring=self.scorer,
             n_repeats=self.iterations,
             n_jobs=self.n_jobs,
         )

diff --git a/probatus/utils/__init__.py b/probatus/utils/__init__.py
@@ -1,5 +1,4 @@
-from .exceptions import NotFittedError, UnsupportedModelError
-from .scoring import Scorer, get_scorers, get_single_scorer
+from .exceptions import NotFittedError
 from .arrayfuncs import (
     assure_pandas_df,
     assure_pandas_series,
@@ -12,10 +11,7 @@
 
 __all__ = [
     "NotFittedError",
-    "UnsupportedModelError",
-    "Scorer",
     "assure_pandas_df",
-    "get_scorers",
     "assure_list_of_strings",
     "shap_calc",
     "shap_to_df",
@@ -25,5 +21,4 @@
     "preprocess_labels",
     "BaseFitComputeClass",
     "BaseFitComputePlotClass",
-    "get_single_scorer",
 ]
diff --git a/probatus/utils/arrayfuncs.py b/probatus/utils/arrayfuncs.py
@@ -15,21 +15,15 @@ def assure_pandas_df(x, column_names=None):
         pandas DataFrame
     """
     if isinstance(x, pd.DataFrame):
-        # Check if column_names are passed correctly
         if column_names is not None:
             x.columns = column_names
-        return x
-    elif any(
-        [
-            isinstance(x, np.ndarray),
-            isinstance(x, pd.core.series.Series),
-            isinstance(x, list),
-        ]
-    ):
-        return pd.DataFrame(x, columns=column_names)
+    elif isinstance(x, (np.ndarray, pd.Series, list)):
+        x = pd.DataFrame(x, columns=column_names)
     else:
         raise TypeError("Please supply a list, numpy array, pandas Series or pandas DataFrame")
 
+    return x
+
 
 def assure_pandas_series(x, index=None):
     """
@@ -42,7 +36,7 @@ def assure_pandas_series(x, index=None):
         pandas Series
     """
     if isinstance(x, pd.Series):
-        if isinstance(index, list) or isinstance(index, np.ndarray):
+        if isinstance(index, (list, np.ndarray)):
             index = pd.Index(index)
         current_x_index = pd.Index(x.index.values)
         if current_x_index.equals(index):
@@ -55,7 +49,7 @@ def assure_pandas_series(x, index=None):
             # If indexes have different values, overwrite
             x.index = index
             return x
-    elif any([isinstance(x, np.ndarray), isinstance(x, list)]):
+    elif any([isinstance(x, (np.ndarray, list))]):
         return pd.Series(x, index=index)
     else:
         raise TypeError("Please supply a list, numpy array, pandas Series")
@@ -92,40 +86,36 @@ def preprocess_data(X, X_name=None, column_names=None, verbose=0):
         (pd.DataFrame):
             Preprocessed dataset.
     """
-    if X_name is None:
-        X_name = "X"
+    X_name = "X" if X_name is None else X_name
 
     # Make sure that X is a pd.DataFrame with correct column names
     X = assure_pandas_df(X, column_names=column_names)
 
-    # Warn if missing
-    columns_with_missing = [column for column in X.columns if X[column].isnull().values.any()]
-    if len(columns_with_missing) > 0:
-        if verbose > 0:
+    if verbose > 0:
+        # Warn if missing
+        columns_with_missing = X.columns[X.isnull().any()].tolist()
+        if columns_with_missing:
             warnings.warn(
                 f"The following variables in {X_name} contains missing values {columns_with_missing}. "
                 f"Make sure to impute missing or apply a model that handles them automatically."
             )
 
-    # Warn if categorical features and change to category
-    indices_categorical_features = [
-        column[0] for column in enumerate(X.dtypes) if column[1].name in ["category", "object"]
-    ]
-    categorical_features = list(X.columns[indices_categorical_features])
-
-    # Set categorical features type to category
-    if len(categorical_features) > 0:
-        if verbose > 0:
-            warnings.warn(
-                f"The following variables in {X_name} contains categorical variables: "
-                f"{categorical_features}. Make sure to use a model that handles them automatically or "
-                f"encode them into numerical variables."
-            )
+        # Warn if categorical features and change to category
+        categorical_features = X.select_dtypes(include=["category", "object"]).columns.tolist()
+        # Set categorical features type to category
+        if categorical_features:
+            if verbose > 0:
+                warnings.warn(
+                    f"The following variables in {X_name} contains categorical variables: "
+                    f"{categorical_features}. Make sure to use a model that handles them automatically or "
+                    f"encode them into numerical variables."
+                )
+
+    # Ensure category dtype, to enable models e.g. LightGBM, handle them automatically
+    object_columns = X.select_dtypes(include=["object"]).columns
+    if not object_columns.empty:
+        X[object_columns] = X[object_columns].astype("category")
 
-        # Ensure category dtype, to enable models e.g. LighGBM, handle them automatically
-        for categorical_feature in categorical_features:
-            if X[categorical_feature].dtype.name == "object":
-                X[categorical_feature] = X[categorical_feature].astype("category")
     return X, X.columns.tolist()
 
 
@@ -157,8 +147,7 @@ def preprocess_labels(y, y_name=None, index=None, verbose=0):
         (pd.Series):
             Labels in the form of pd.Series.
     """
-    if y_name is None:
-        y_name = "y"
+    y_name = "y" if y_name is None else y_name
 
     # Make sure that y is a series with correct index
     y = assure_pandas_series(y, index=index)

diff --git a/probatus/utils/exceptions.py b/probatus/utils/exceptions.py
@@ -8,16 +8,3 @@ def __init__(self, message):
         Init error.
         """
         self.message = message
-
-
-class UnsupportedModelError(Exception):
-    """
-    Error.
-    """
-
-    def __init__(self, message):
-        # TODO: Add this check for unsupported models to our implementations.
-        """
-        Init error.
-        """
-        self.message = message
diff --git a/probatus/utils/scoring.py b/probatus/utils/scoring.py