Skip to content

Commit

Permalink
remove, refactor and simplify implementation
Browse files Browse the repository at this point in the history
  • Loading branch information
Reinier Koops committed Mar 28, 2024
1 parent 37808de commit 2cb8724
Show file tree
Hide file tree
Showing 12 changed files with 178,776 additions and 4,809 deletions.
2 changes: 1 addition & 1 deletion LICENCE
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
Copyright (c) 2020 ING Bank N.V.
Copyright (c) ING Bank N.V.

Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

Expand Down
172 changes: 0 additions & 172 deletions docs/tutorials/nb_custom_scoring.ipynb

This file was deleted.

182,800 changes: 178,570 additions & 4,230 deletions docs/tutorials/nb_shap_feature_elimination.ipynb

Large diffs are not rendered by default.

282 changes: 128 additions & 154 deletions probatus/feature_elimination/feature_elimination.py

Large diffs are not rendered by default.

13 changes: 7 additions & 6 deletions probatus/interpret/model_interpret.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import get_scorer
from shap import summary_plot
from shap.plots._waterfall import waterfall_legacy

Expand All @@ -9,7 +10,6 @@
BaseFitComputePlotClass,
assure_list_of_strings,
calculate_shap_importance,
get_single_scorer,
preprocess_data,
preprocess_labels,
shap_calc,
Expand Down Expand Up @@ -86,7 +86,8 @@ def __init__(self, model, scoring="roc_auc", verbose=0, random_state=None):
reproducible results set it to an integer.
"""
self.model = model
self.scorer = get_single_scorer(scoring)
self.scoring = scoring # (str) name of the metric
self.scorer = get_scorer(scoring)
self.verbose = verbose
self.random_state = random_state

Expand Down Expand Up @@ -144,12 +145,12 @@ def fit(
self.class_names = ["Negative Class", "Positive Class"]

# Calculate Metrics
self.train_score = self.scorer.score(self.model, self.X_train, self.y_train)
self.test_score = self.scorer.score(self.model, self.X_test, self.y_test)
self.train_score = self.scorer(self.model, self.X_train, self.y_train)
self.test_score = self.scorer(self.model, self.X_test, self.y_test)

self.results_text = (
f"Train {self.scorer.metric_name}: {np.round(self.train_score, 3)},\n"
f"Test {self.scorer.metric_name}: {np.round(self.test_score, 3)}."
f"Train {self.scoring}: {np.round(self.train_score, 3)},\n"
f"Test {self.scoring}: {np.round(self.test_score, 3)}."
)

(
Expand Down
18 changes: 10 additions & 8 deletions probatus/sample_similarity/resemblance_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,9 @@
from shap import summary_plot
from sklearn.inspection import permutation_importance
from sklearn.model_selection import train_test_split
from sklearn.metrics import get_scorer

from probatus.utils import BaseFitComputePlotClass, get_single_scorer, preprocess_data, preprocess_labels
from probatus.utils import BaseFitComputePlotClass, preprocess_data, preprocess_labels
from probatus.utils.shap_helpers import calculate_shap_importance, shap_calc


Expand Down Expand Up @@ -70,7 +71,8 @@ class is 'roc_auc'.
self.n_jobs = n_jobs
self.random_state = random_state
self.verbose = verbose
self.scorer = get_single_scorer(scoring)
self.scoring = scoring # (str) name of the metric
self.scorer = get_scorer(scoring)

def _init_output_variables(self):
"""
Expand Down Expand Up @@ -151,20 +153,20 @@ def fit(self, X1, X2, column_names=None, class_names=None):
)
self.model.fit(self.X_train, self.y_train)

self.train_score = np.round(self.scorer.score(self.model, self.X_train, self.y_train), 3)
self.test_score = np.round(self.scorer.score(self.model, self.X_test, self.y_test), 3)
self.train_score = np.round(self.scorer(self.model, self.X_train, self.y_train), 3)
self.test_score = np.round(self.scorer(self.model, self.X_test, self.y_test), 3)

self.results_text = (
f"Train {self.scorer.metric_name}: {np.round(self.train_score, 3)},\n"
f"Test {self.scorer.metric_name}: {np.round(self.test_score, 3)}."
f"Train {self.scoring}: {np.round(self.train_score, 3)},\n"
f"Test {self.scoring}: {np.round(self.test_score, 3)}."
)
if self.verbose > 1:
logger.info(f"Finished model training: \n{self.results_text}")

if self.verbose > 0:
if self.train_score > self.test_score:
warnings.warn(
f"Train {self.scorer.metric_name} > Test {self.scorer.metric_name}, which might indicate "
f"Train {self.scoring} > Test {self.scoring}, which might indicate "
f"an overfit. \n Strong overfit might lead to misleading conclusions when analysing "
f"feature importance. Consider retraining with more regularization applied to the model."
)
Expand Down Expand Up @@ -384,7 +386,7 @@ def fit(self, X1, X2, column_names=None, class_names=None):
self.model,
self.X_test,
self.y_test,
scoring=self.scorer.scorer,
scoring=self.scorer,
n_repeats=self.iterations,
n_jobs=self.n_jobs,
)
Expand Down
7 changes: 1 addition & 6 deletions probatus/utils/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
from .exceptions import NotFittedError, UnsupportedModelError
from .scoring import Scorer, get_scorers, get_single_scorer
from .exceptions import NotFittedError
from .arrayfuncs import (
assure_pandas_df,
assure_pandas_series,
Expand All @@ -12,10 +11,7 @@

__all__ = [
"NotFittedError",
"UnsupportedModelError",
"Scorer",
"assure_pandas_df",
"get_scorers",
"assure_list_of_strings",
"shap_calc",
"shap_to_df",
Expand All @@ -25,5 +21,4 @@
"preprocess_labels",
"BaseFitComputeClass",
"BaseFitComputePlotClass",
"get_single_scorer",
]
65 changes: 27 additions & 38 deletions probatus/utils/arrayfuncs.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,21 +15,15 @@ def assure_pandas_df(x, column_names=None):
pandas DataFrame
"""
if isinstance(x, pd.DataFrame):
# Check if column_names are passed correctly
if column_names is not None:
x.columns = column_names
return x
elif any(
[
isinstance(x, np.ndarray),
isinstance(x, pd.core.series.Series),
isinstance(x, list),
]
):
return pd.DataFrame(x, columns=column_names)
elif isinstance(x, (np.ndarray, pd.Series, list)):
x = pd.DataFrame(x, columns=column_names)
else:
raise TypeError("Please supply a list, numpy array, pandas Series or pandas DataFrame")

return x


def assure_pandas_series(x, index=None):
"""
Expand All @@ -42,7 +36,7 @@ def assure_pandas_series(x, index=None):
pandas Series
"""
if isinstance(x, pd.Series):
if isinstance(index, list) or isinstance(index, np.ndarray):
if isinstance(index, (list, np.ndarray)):
index = pd.Index(index)
current_x_index = pd.Index(x.index.values)
if current_x_index.equals(index):
Expand All @@ -55,7 +49,7 @@ def assure_pandas_series(x, index=None):
# If indexes have different values, overwrite
x.index = index
return x
elif any([isinstance(x, np.ndarray), isinstance(x, list)]):
elif any([isinstance(x, (np.ndarray, list))]):
return pd.Series(x, index=index)
else:
raise TypeError("Please supply a list, numpy array, pandas Series")
Expand Down Expand Up @@ -92,40 +86,36 @@ def preprocess_data(X, X_name=None, column_names=None, verbose=0):
(pd.DataFrame):
Preprocessed dataset.
"""
if X_name is None:
X_name = "X"
X_name = "X" if X_name is None else X_name

# Make sure that X is a pd.DataFrame with correct column names
X = assure_pandas_df(X, column_names=column_names)

# Warn if missing
columns_with_missing = [column for column in X.columns if X[column].isnull().values.any()]
if len(columns_with_missing) > 0:
if verbose > 0:
if verbose > 0:
# Warn if missing
columns_with_missing = X.columns[X.isnull().any()].tolist()
if columns_with_missing:
warnings.warn(
f"The following variables in {X_name} contains missing values {columns_with_missing}. "
f"Make sure to impute missing or apply a model that handles them automatically."
)

# Warn if categorical features and change to category
indices_categorical_features = [
column[0] for column in enumerate(X.dtypes) if column[1].name in ["category", "object"]
]
categorical_features = list(X.columns[indices_categorical_features])

# Set categorical features type to category
if len(categorical_features) > 0:
if verbose > 0:
warnings.warn(
f"The following variables in {X_name} contains categorical variables: "
f"{categorical_features}. Make sure to use a model that handles them automatically or "
f"encode them into numerical variables."
)
# Warn if categorical features and change to category
categorical_features = X.select_dtypes(include=["category", "object"]).columns.tolist()
# Set categorical features type to category
if categorical_features:
if verbose > 0:
warnings.warn(
f"The following variables in {X_name} contains categorical variables: "
f"{categorical_features}. Make sure to use a model that handles them automatically or "
f"encode them into numerical variables."
)

# Ensure category dtype, to enable models, e.g. LightGBM, to handle them automatically
object_columns = X.select_dtypes(include=["object"]).columns
if not object_columns.empty:
X[object_columns] = X[object_columns].astype("category")

# Ensure category dtype, to enable models, e.g. LightGBM, to handle them automatically
for categorical_feature in categorical_features:
if X[categorical_feature].dtype.name == "object":
X[categorical_feature] = X[categorical_feature].astype("category")
return X, X.columns.tolist()


Expand Down Expand Up @@ -157,8 +147,7 @@ def preprocess_labels(y, y_name=None, index=None, verbose=0):
(pd.Series):
Labels in the form of pd.Series.
"""
if y_name is None:
y_name = "y"
y_name = "y" if y_name is None else y_name

# Make sure that y is a series with correct index
y = assure_pandas_series(y, index=index)
Expand Down
13 changes: 0 additions & 13 deletions probatus/utils/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,16 +8,3 @@ def __init__(self, message):
Init error.
"""
self.message = message


class UnsupportedModelError(Exception):
"""
Error.
"""

def __init__(self, message):
# TODO: Add this check for unsupported models to our implementations.
"""
Init error.
"""
self.message = message
127 changes: 0 additions & 127 deletions probatus/utils/scoring.py

This file was deleted.

Loading

0 comments on commit 2cb8724

Please sign in to comment.