From 6aa76a5371ef46c252240675f7610c4a36d5a2c0 Mon Sep 17 00:00:00 2001
From: Yahya
Date: Wed, 22 May 2019 20:23:28 +0100
Subject: [PATCH 01/11] generate synthetic categorical data first push

---
 examples/generate_data_categorical_example.py |  47 +++++
 pyod/test/test_data.py                        | 179 +++++++++++++++++-
 pyod/utils/data.py                            | 139 +++++++++++++-
 3 files changed, 355 insertions(+), 10 deletions(-)
 create mode 100644 examples/generate_data_categorical_example.py

diff --git a/examples/generate_data_categorical_example.py b/examples/generate_data_categorical_example.py
new file mode 100644
index 000000000..4bcdf0f1b
--- /dev/null
+++ b/examples/generate_data_categorical_example.py
@@ -0,0 +1,47 @@
+# -*- coding: utf-8 -*-
+"""Example of using and visualizing ``generate_data_categorical`` function.
+"""
+# Author: Yahya Almardeny
+# License: BSD 2 clause
+
+from __future__ import division
+from __future__ import print_function
+
+import os
+import sys
+import numpy as np
+import matplotlib.pyplot as plt
+# temporary solution for relative imports in case pyod is not installed
+# if pyod is installed, no need to use the following line
+
+sys.path.append(
+    os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
+
+from pyod.utils.data import generate_data_categorical
+
+
+if __name__ == "__main__":
+    contamination = 0.1  # percentage of outliers
+
+    # Generate sample categorical data
+    X_train, X_test, y_train, y_test = generate_data_categorical(n_train=200, n_test=50,
+                                                                 n_category_in=5, n_category_out=3,
+                                                                 n_informative=1, n_features=1,
+                                                                 contamination=contamination,
+                                                                 random_state=42)
+    # note that visualizing it can only be in 1 dimension!
+    cats = list(np.ravel(X_train))
+    labels = list(y_train)
+    fig, axs = plt.subplots(1, 2)
+    axs[0].bar(cats, labels)
+    axs[1].plot(cats, labels)
+    plt.title('Synthetic Categorical Train Data')
+    plt.show()
+
+    cats = list(np.ravel(X_test))
+    labels = list(y_test)
+    fig, axs = plt.subplots(1, 2)
+    axs[0].bar(cats, labels)
+    axs[1].plot(cats, labels)
+    plt.title('Synthetic Categorical Test Data')
+    plt.show()
diff --git a/pyod/test/test_data.py b/pyod/test/test_data.py
index 5c2293a30..ef2c29f36 100644
--- a/pyod/test/test_data.py
+++ b/pyod/test/test_data.py
@@ -17,6 +17,7 @@
 # temporary solution for relative imports in case pyod is not installed
 # if pyod is installed, no need to use the following line
 
+from pyod.utils.data import generate_data_categorical
 sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
 
@@ -141,8 +142,7 @@ def test_data_generate_cluster3(self):
 
     def test_data_generate_cluster5(self):
         with assert_raises(ValueError):
-            X_train, y_train, X_test, y_test = \
-                generate_data_clusters(n_train=self.n_train,
+            generate_data_clusters(n_train=self.n_train,
                                    n_test=self.n_test,
                                    n_features=3,
                                    n_clusters='e',
                                    contamination=self.contamination,
                                    random_state=self.random_state)
 
         with assert_raises(ValueError):
-            X_train, y_train, X_test, y_test = \
-                generate_data_clusters(n_train=self.n_train,
+            generate_data_clusters(n_train=self.n_train,
                                    n_test=self.n_test,
                                    n_features='e',
                                    contamination=self.contamination,
                                    random_state=self.random_state)
 
         with assert_raises(ValueError):
-            X_train, y_train, X_test, y_test = \
-                generate_data_clusters(n_train=self.n_train,
+            generate_data_clusters(n_train=self.n_train,
                                    n_test=self.n_test,
                                    n_features=3,
                                    contamination='e',
                                    random_state=self.random_state)
 
         with assert_raises(ValueError):
-            X_train, y_train, X_test, y_test = \
-                generate_data_clusters(n_train=self.n_train,
+            generate_data_clusters(n_train=self.n_train,
                                    n_test=self.n_test,
                                    n_features=3,
                                    contamination=self.contamination,
                                    random_state='e')
 
@@ -197,6 +194,172 @@ def test_data_generate_cluster6(self):
                 self.n_train + self.n_test)
         assert_allclose(self.contamination, out_perc, atol=0.01)
 
+    def test_data_generate_categorical(self):
+        X_train, X_test, y_train, y_test = \
+            generate_data_categorical(n_train=self.n_train,
+                                      n_test=self.n_test,
+                                      n_features=2,
+                                      contamination=self.contamination,
+                                      random_state=self.random_state)
+
+        assert_equal(y_train.shape[0], X_train.shape[0])
+        assert_equal(y_test.shape[0], X_test.shape[0])
+
+        assert_less_equal(self.n_train - X_train.shape[0], 1)
+        assert_equal(X_train.shape[1], 2)
+
+        assert_less_equal(self.n_test - X_test.shape[0], 1)
+        assert_equal(X_test.shape[1], 2)
+
+        out_perc = (np.sum(y_train) + np.sum(y_test)) / (
+                self.n_train + self.n_test)
+        assert_allclose(self.contamination, out_perc, atol=0.01)
+
+    def test_data_generate_categorical2(self):
+        X_train, X_test, y_train, y_test = \
+            generate_data_categorical(n_train=self.n_train,
+                                      n_test=self.n_test,
+                                      n_features=4,
+                                      contamination=self.contamination,
+                                      random_state=self.random_state)
+
+        assert_allclose(X_train.shape, (self.n_train, 4))
+        assert_allclose(X_test.shape, (self.n_test, 4))
+
+    def test_data_generate_categorical3(self):
+        X_train, X_test, y_train, y_test = \
+            generate_data_categorical(n_train=self.n_train,
+                                      n_test=self.n_test,
+                                      n_features=3,
+                                      contamination=self.contamination,
+                                      random_state=self.random_state)
+
+        X_train2, X_test2, y_train2, y_test2 = \
+            generate_data_categorical(n_train=self.n_train,
+                                      n_test=self.n_test,
+                                      n_features=3,
+                                      contamination=self.contamination,
+                                      random_state=self.random_state)
+
+        assert np.array_equal(X_train, X_train2)
+        assert np.array_equal(X_test, X_test2)
+        assert np.array_equal(y_train, y_train2)
+        assert np.array_equal(y_test, y_test2)
+
+    def test_data_generate_categorical5(self):
+        with assert_raises(ValueError):
+            generate_data_categorical(n_train=self.n_train, n_test=self.n_test,
+                                      n_category_in=5, n_category_out=3,
+                                      n_informative=1, n_features=1,
+                                      contamination=self.contamination,
+                                      random_state=-1)
+
+        with assert_raises(ValueError):
+            generate_data_categorical(n_train=0, n_test=self.n_test,
+                                      n_category_in=5, n_category_out=3,
+                                      n_informative=1, n_features=1,
+                                      contamination=self.contamination,
+                                      random_state=self.random_state)
+
+        with assert_raises(ValueError):
+            generate_data_categorical(n_train=self.n_train, n_test=-1,
+                                      n_category_in=5, n_category_out=3,
+                                      n_informative=1, n_features=1,
+                                      contamination=self.contamination,
+                                      random_state=self.random_state)
+
+        with assert_raises(ValueError):
+            generate_data_categorical(n_train='not int', n_test=self.n_test,
+                                      n_category_in=5, n_category_out=3,
+                                      n_informative=1, n_features=1,
+                                      contamination=self.contamination,
+                                      random_state=self.random_state)
+
+        with assert_raises(ValueError):
+            generate_data_categorical(n_train=self.n_train, n_test='not int',
+                                      n_category_in=5, n_category_out=3,
+                                      n_informative=1, n_features=1,
+                                      contamination=self.contamination,
+                                      random_state=self.random_state)
+
+        with assert_raises(ValueError):
+            generate_data_categorical(n_train=self.n_train, n_test=self.n_test,
+                                      n_category_in=5, n_category_out=3,
+                                      n_informative=1, n_features=0,
+                                      contamination=self.contamination,
+                                      random_state=self.random_state)
+
+        with assert_raises(ValueError):
+            generate_data_categorical(n_train=self.n_train, n_test=self.n_test,
+                                      n_category_in=5,
+                                      n_category_out=3,
+                                      n_informative=1, n_features='not int',
+                                      contamination=self.contamination,
+                                      random_state=self.random_state)
+        with assert_raises(ValueError):
+            generate_data_categorical(n_train=self.n_train, n_test=self.n_test,
+                                      n_category_in=5, n_category_out=3,
+                                      n_informative=-1, n_features=1,
+                                      contamination=self.contamination,
+                                      random_state=self.random_state)
+        with assert_raises(ValueError):
+            generate_data_categorical(n_train=self.n_train, n_test=self.n_test,
+                                      n_category_in=5, n_category_out=3,
+                                      n_informative='not int', n_features=1,
+                                      contamination=self.contamination,
+                                      random_state=self.random_state)
+        with assert_raises(ValueError):
+            generate_data_categorical(n_train=self.n_train, n_test=self.n_test,
+                                      n_category_in=5, n_category_out=3,
+                                      n_informative=1, n_features=1,
+                                      contamination=0.6,
+                                      random_state=self.random_state)
+        with assert_raises(ValueError):
+            generate_data_categorical(n_train=self.n_train, n_test=self.n_test,
+                                      n_category_in=5, n_category_out=3,
+                                      n_informative=1, n_features=1,
+                                      contamination='not float',
+                                      random_state=self.random_state)
+        with assert_raises(ValueError):
+            generate_data_categorical(n_train=self.n_train, n_test=self.n_test,
+                                      n_category_in=-1, n_category_out=3,
+                                      n_informative=1, n_features=1,
+                                      contamination=self.contamination,
+                                      random_state=self.random_state)
+        with assert_raises(ValueError):
+            generate_data_categorical(n_train=self.n_train, n_test=self.n_test,
+                                      n_category_in='not int', n_category_out=3,
+                                      n_informative=1, n_features=1,
+                                      contamination=self.contamination,
+                                      random_state=self.random_state)
+        with assert_raises(ValueError):
+            generate_data_categorical(n_train=self.n_train, n_test=self.n_test,
+                                      n_category_in=self.n_train+self.n_test+1,
+                                      n_category_out=3,
+                                      n_informative=1, n_features=1,
+                                      contamination=self.contamination,
+                                      random_state=self.random_state)
+
+        with assert_raises(ValueError):
+            generate_data_categorical(n_train=self.n_train, n_test=self.n_test,
+                                      n_category_in=5, n_category_out=-1,
+                                      n_informative=1, n_features=1,
+                                      contamination=self.contamination,
+                                      random_state=self.random_state)
+        with assert_raises(ValueError):
+            generate_data_categorical(n_train=self.n_train, n_test=self.n_test,
+                                      n_category_in=5, n_category_out='not int',
+                                      n_informative=1, n_features=1,
+                                      contamination=self.contamination,
+                                      random_state=self.random_state)
+        with assert_raises(ValueError):
+            generate_data_categorical(n_train=self.n_train, n_test=self.n_test,
+                                      n_category_in=5,
+                                      n_category_out=self.n_train+self.n_test+1,
+                                      n_informative=1, n_features=1,
+                                      contamination=self.contamination,
+                                      random_state=self.random_state)
+
     def test_evaluate_print(self):
         X_train, y_train, X_test, y_test = generate_data(
             n_train=self.n_train,
diff --git a/pyod/utils/data.py b/pyod/utils/data.py
index 324af5727..a8a5a2c5c 100644
--- a/pyod/utils/data.py
+++ b/pyod/utils/data.py
@@ -8,9 +8,8 @@
 from __future__ import division
 from __future__ import print_function
 
-import numpy as np
 from warnings import warn
-
+import numpy as np
 from sklearn.datasets import make_blobs
 from sklearn.model_selection import train_test_split
 from sklearn.utils import column_or_1d
@@ -515,3 +514,139 @@ def generate_data_clusters(n_train=1000, n_test=500, n_clusters=2,
     else:
         return train_test_split(X, y,
                                 test_size=n_test,
                                 random_state=random_state)
+
+
+def generate_data_categorical(n_train=1000, n_test=500, n_features=2,
+                              n_informative=2, n_category_in=2,
+                              n_category_out=2, contamination=0.1,
+                              random_state=None):
+
+    """Utility function to generate synthesized categorical data.
+
+    Parameters
+    ----------
+    n_train : int, (default=1000)
+        The number of training points to generate.
+
+    n_test : int, (default=500)
+        The number of test points to generate.
+
+    n_features : int, optional (default=2)
+        The number of features for each sample.
+
+    n_informative : int in (1, n_features), optional (default=2)
+        The number of informative features in the outlier points.
+        The higher the easier the outlier detection should be.
+        Note that n_informative should not be greater
+        than n_features.
+
+    n_category_in : int in (1, n_inliers), optional (default=2)
+        The number of categories in the inlier points.
+
+    n_category_out : int in (1, n_outliers), optional (default=2)
+        The number of categories in the outlier points.
+
+    contamination : float in (0., 0.5), optional (default=0.1)
+        The amount of contamination of the data set, i.e.
+        the proportion of outliers in the data set.
+
+    random_state : int, RandomState instance or None, optional (default=None)
+        If int, random_state is the seed used by the random number generator;
+        If RandomState instance, random_state is the random number generator;
+        If None, the random number generator is the RandomState instance used
+        by `np.random`.
+
+
+    Returns
+    -------
+    X_train : numpy array of shape (n_train, n_features)
+        Training data.
+
+    X_test : numpy array of shape (n_test, n_features)
+        Test data.
+
+    y_train : numpy array of shape (n_train,)
+        Training ground truth.
+
+    y_test : numpy array of shape (n_test,)
+        Test ground truth.
+    """
+
+    # initialize a random state and seeds for the instance
+    random_state = check_random_state(random_state)
+
+    if isinstance(n_train, int):
+        check_parameter(n_train, low=1, param_name='n_train')
+    else:
+        raise ValueError("n_train should be int, got %s" % n_train)
+
+    if isinstance(n_test, int):
+        check_parameter(n_test, low=0, param_name='n_test')
+    else:
+        raise ValueError("n_test should be int, got %s" % n_test)
+
+    if isinstance(n_features, int):
+        check_parameter(n_features, low=0, param_name='n_features')
+    else:
+        raise ValueError("n_features should be int, got %s" % n_features)
+
+    if isinstance(n_informative, int):
+        check_parameter(n_informative, low=0, high=n_features+1, param_name='n_informative')
+    else:
+        raise ValueError("n_informative should be int, got %s" % n_informative)
+
+    if isinstance(contamination, float):
+        check_parameter(contamination, low=0, high=0.5,
+                        param_name='contamination')
+    else:
+        raise ValueError("contamination should be float, got %s" % contamination)
+
+
+    # find the required number of outliers and inliers
+    n_samples = n_train + n_test
+    n_outliers = int(n_samples * contamination)
+    n_inliers = n_samples - n_outliers
+
+    if isinstance(n_category_in, int):
+        check_parameter(n_category_in, low=0, high=n_inliers+1, param_name='n_category_in')
+    else:
+        raise ValueError("n_category_in should be int, got %s" % n_category_in)
+
+    if isinstance(n_category_out, int):
+        check_parameter(n_category_out, low=0, high=n_outliers+1, param_name='n_category_out')
+    else:
+        raise ValueError("n_category_out should be int, got %s" % n_category_out)
+
+    # Encapsulated functions to generate features
+    def __f(f):
+        quot, rem = divmod(f - 1, 26)
+        return __f(quot) + chr(rem + ord('A')) if f != 0 else ''
+
+    # generate pool of features to be the base for naming the data points
+    features = []
+    for i in range(1, n_features + 1):
+        features.append(__f(i))
+
+    # find the required distributions of categories over inliers and outliers
+    temp_ = [int(n_inliers / n_category_in)] * (n_category_in - 1)
+    dist_in = temp_ + [int(n_inliers - sum(temp_))]
+    temp_ = [int(n_outliers / n_category_out)] * (n_category_out - 1)
+    dist_out = temp_ + [int(n_outliers - sum(temp_))]
+
+    # generate categorical data
+    X = []
+    count = 0
+    for f in features:
+        inliers = np.hstack([[f + str(i)] * dist_in[i] for i in range(n_category_in)])
+        if count < n_informative:
+            outliers = list(np.hstack(
+                [[f + str((n_category_in * 2) + i)] * dist_out[i] for i in range(n_category_out)]))
+        else:
+            outliers = list(inliers[random_state.randint(0, len(inliers), size=n_outliers)])
+        count += 1
+        X.append(list(inliers) + outliers)
+
+    return train_test_split(np.array(X).T,
+                            np.array(([0]*n_inliers) + ([1]*n_outliers)),
+                            test_size=n_test,
+                            random_state=random_state)

From 3fa370695d57cf44fd4b36bde52958fd37117e3b Mon Sep 17 00:00:00 2001
From: Yahya
Date: Wed, 22 May 2019 21:18:35 +0100
Subject: [PATCH 02/11] shuffled outliers

---
 pyod/utils/data.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/pyod/utils/data.py b/pyod/utils/data.py
index a8a5a2c5c..d7203432a 100644
--- a/pyod/utils/data.py
+++ b/pyod/utils/data.py
@@ -644,9 +644,15 @@ def __f(f):
         else:
             outliers = list(inliers[random_state.randint(0, len(inliers), size=n_outliers)])
         count += 1
+
         X.append(list(inliers) + outliers)
 
-    return train_test_split(np.array(X).T,
+    X = np.array(X).T
+    outliers_ = X[n_inliers:]
+    random_state.shuffle([random_state.shuffle(c) for c in outliers_])
+    X[n_inliers:] = outliers_
+
+    return train_test_split(X,
                             np.array(([0]*n_inliers) + ([1]*n_outliers)),
                             test_size=n_test,
                             random_state=random_state)

From 7b9aeb428c24a907de26ab1487e422cd6ba0843e Mon Sep 17 00:00:00 2001
From: Yahya
Date: Wed, 22 May 2019 21:30:01 +0100
Subject: [PATCH 03/11] revert added shuffle

---
 pyod/utils/data.py | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/pyod/utils/data.py b/pyod/utils/data.py
index d7203432a..bfad4268f 100644
--- a/pyod/utils/data.py
+++ b/pyod/utils/data.py
@@ -647,12 +647,7 @@ def __f(f):
 
         X.append(list(inliers) + outliers)
 
-    X = np.array(X).T
-    outliers_ = X[n_inliers:]
-    random_state.shuffle([random_state.shuffle(c) for c in outliers_])
-    X[n_inliers:] = outliers_
-
-    return train_test_split(X,
+    return train_test_split(np.array(X).T,
                             np.array(([0]*n_inliers) + ([1]*n_outliers)),
                             test_size=n_test,
                             random_state=random_state)

From 6e730bb6cc8aa8ad27e6c722305335c4c57f7c30 Mon Sep 17 00:00:00 2001
From: Yahya
Date: Thu, 23 May 2019 21:50:40 +0100
Subject: [PATCH 04/11] added shuffle inliers option to cause noise in distribution on demand

---
 examples/generate_data_categorical_example.py |  5 +++--
 pyod/test/test_data.py                        |  9 +++++++++
 pyod/utils/data.py                            | 10 +++++++++-
 3 files changed, 21 insertions(+), 3 deletions(-)

diff --git a/examples/generate_data_categorical_example.py b/examples/generate_data_categorical_example.py
index 4bcdf0f1b..cec4d5f89 100644
--- a/examples/generate_data_categorical_example.py
+++ b/examples/generate_data_categorical_example.py
@@ -25,10 +25,11 @@
 
     # Generate sample categorical data
     X_train, X_test, y_train, y_test = generate_data_categorical(n_train=200, n_test=50,
-                                                                 n_category_in=5, n_category_out=3,
+                                                                 n_category_in=8, n_category_out=5,
                                                                  n_informative=1, n_features=1,
                                                                  contamination=contamination,
-                                                                 random_state=42)
+                                                                 shuffle=True, random_state=42)
+
     # note that visualizing it can only be in 1 dimension!
     cats = list(np.ravel(X_train))
     labels = list(y_train)
diff --git a/pyod/test/test_data.py b/pyod/test/test_data.py
index ef2c29f36..d6a37456d 100644
--- a/pyod/test/test_data.py
+++ b/pyod/test/test_data.py
@@ -360,6 +360,15 @@ def test_data_generate_categorical5(self):
                                       contamination=self.contamination,
                                       random_state=self.random_state)
 
+        with assert_raises(ValueError):
+            generate_data_categorical(n_train=self.n_train, n_test=self.n_test,
+                                      n_category_in=5,
+                                      n_category_out=5,
+                                      n_informative=2, n_features=2,
+                                      contamination=self.contamination,
+                                      shuffle='not bool',
+                                      random_state=self.random_state)
+
     def test_evaluate_print(self):
         X_train, y_train, X_test, y_test = generate_data(
             n_train=self.n_train,
diff --git a/pyod/utils/data.py b/pyod/utils/data.py
index bfad4268f..4d098c496 100644
--- a/pyod/utils/data.py
+++ b/pyod/utils/data.py
@@ -519,7 +519,7 @@ def generate_data_clusters(n_train=1000, n_test=500, n_clusters=2,
 def generate_data_categorical(n_train=1000, n_test=500, n_features=2,
                               n_informative=2, n_category_in=2,
                               n_category_out=2, contamination=0.1,
-                              random_state=None):
+                              shuffle=True, random_state=None):
 
     """Utility function to generate synthesized categorical data.
 
@@ -550,6 +550,9 @@ def generate_data_categorical(n_train=1000, n_test=500, n_features=2,
         The amount of contamination of the data set, i.e.
         the proportion of outliers in the data set.
 
+    shuffle : bool, optional (default=True)
+        If True, inliers will be shuffled, which makes the distribution noisier.
+
     random_state : int, RandomState instance or None, optional (default=None)
         If int, random_state is the seed used by the random number generator;
         If RandomState instance, random_state is the random number generator;
@@ -601,6 +604,9 @@ def generate_data_categorical(n_train=1000, n_test=500, n_features=2,
     else:
         raise ValueError("contamination should be float, got %s" % contamination)
 
+    if not isinstance(shuffle, bool):
+        raise ValueError("shuffle should be bool, got %s" % shuffle)
+
 
     # find the required number of outliers and inliers
     n_samples = n_train + n_test
     n_outliers = int(n_samples * contamination)
@@ -638,6 +644,8 @@ def __f(f):
     count = 0
     for f in features:
         inliers = np.hstack([[f + str(i)] * dist_in[i] for i in range(n_category_in)])
+        if shuffle:
+            random_state.shuffle(inliers)
         if count < n_informative:
             outliers = list(np.hstack(
                 [[f + str((n_category_in * 2) + i)] * dist_out[i] for i in range(n_category_out)]))

From 3f1c2aaab6e1856fc14355a29750d342784af769 Mon Sep 17 00:00:00 2001
From: Gianluca Martino
Date: Tue, 29 Sep 2020 15:25:03 +0200
Subject: [PATCH 05/11] Fix for LSCP

---
 pyod/models/lscp.py | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/pyod/models/lscp.py b/pyod/models/lscp.py
index bfd31ea34..64bf4522d 100644
--- a/pyod/models/lscp.py
+++ b/pyod/models/lscp.py
@@ -341,10 +341,18 @@ def _get_local_region(self, X_test_norm):
         # keep nearby points which occur at least local_region_threshold times
         final_local_region_list = [[]] * X_test_norm.shape[0]
         for j in range(X_test_norm.shape[0]):
-            final_local_region_list[j] = [item for item, count in
-                                          collections.Counter(
-                                              local_region_list[j]).items() if
-                                          count > self.local_region_threshold]
+            tmp = [item for item, count in collections.Counter(
+                local_region_list[j]).items() if
+                   count > self.local_region_threshold]
+            decrease_value = 0
+            while len(tmp) < 2:
+                decrease_value = decrease_value + 1
+                assert decrease_value < self.local_region_threshold
+                tmp = [item for item, count in
+                       collections.Counter(local_region_list[j]).items() if
+                       count > (self.local_region_threshold - decrease_value)]
+
+            final_local_region_list[j] = tmp
 
         return final_local_region_list
 

From 44c0ad79e9c0daa6b76b85249d2f151a976e63ec Mon Sep 17 00:00:00 2001
From: Yue Zhao
Date: Tue, 13 Oct 2020 17:51:56 -0400
Subject: [PATCH 06/11] fix #239

---
 pyod/models/copod.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/pyod/models/copod.py b/pyod/models/copod.py
index bc59881df..235515cdb 100644
--- a/pyod/models/copod.py
+++ b/pyod/models/copod.py
@@ -79,9 +79,10 @@ def fit(self, X, y=None):
             Fitted estimator.
         """
         X = check_array(X)
-        self._set_n_classes(y=None)
+        self._set_n_classes(y)
         self.X_train = X
         self.decision_function(X)
+        return self
 
     def decision_function(self, X):
         """Predict raw anomaly score of X using the fitted detector.

From f536209f03394666d2f30217c4733bbf21f9565a Mon Sep 17 00:00:00 2001
From: Yue Zhao
Date: Tue, 13 Oct 2020 17:54:25 -0400
Subject: [PATCH 07/11] version update

---
 CHANGES.txt     | 1 +
 pyod/version.py | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index 856b86070..5f1ff623a 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -108,6 +108,7 @@ v<0.8.2>, <07/04/2020> -- Add a set of utility functions.
 v<0.8.2>, <08/30/2020> -- Add COPOD and MAD algorithm.
 v<0.8.3>, <09/01/2020> -- Make decision score consistent.
 v<0.8.3>, <09/19/2020> -- Add model persistence documentation (save and load).
+v<0.8.4>, <10/13/2020> -- Fix COPOD code inconsistency (issue #239).
 
 
 
diff --git a/pyod/version.py b/pyod/version.py
index 6aa4a51c5..b5e5b5354 100644
--- a/pyod/version.py
+++ b/pyod/version.py
@@ -20,4 +20,4 @@
 # Dev branch marker is: 'X.Y.dev' or 'X.Y.devN' where N is an integer.
 # 'X.Y.dev0' is the canonical version of 'X.Y.dev'
 #
-__version__ = '0.8.3'  # pragma: no cover
+__version__ = '0.8.4'  # pragma: no cover

From 2e756dd6c4e02c30074d86d3a4fd5c521f26b89e Mon Sep 17 00:00:00 2001
From: Yue Zhao
Date: Sat, 24 Oct 2020 22:30:46 -0400
Subject: [PATCH 08/11] version update

---
 CHANGES.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CHANGES.txt b/CHANGES.txt
index 5f1ff623a..2117ee70f 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -109,6 +109,7 @@ v<0.8.2>, <08/30/2020> -- Add COPOD and MAD algorithm.
 v<0.8.3>, <09/01/2020> -- Make decision score consistent.
 v<0.8.3>, <09/19/2020> -- Add model persistence documentation (save and load).
 v<0.8.4>, <10/13/2020> -- Fix COPOD code inconsistency (issue #239).
+v<0.8.4>, <10/24/2020> -- Fix LSCP minor bug (issue #180).
 
 
 

From b6ef5ed475d43e734cb1f2c81c120e8d6dc763e9 Mon Sep 17 00:00:00 2001
From: Yue Zhao
Date: Sun, 8 Nov 2020 11:43:36 -0500
Subject: [PATCH 09/11] update code to reflect the support of TF2

---
 CHANGES.txt                 |  1 +
 pyod/models/auto_encoder.py | 17 +++++++++++++----
 pyod/models/base_dl.py      | 27 +++++++++++++++++++++++++++
 pyod/models/gaal_base.py    | 14 +++++++++++---
 pyod/models/mo_gaal.py      | 15 +++++++++++----
 pyod/models/so_gaal.py      | 15 +++++++++++----
 pyod/models/vae.py          | 22 +++++++++++++++------
 pyod/utils/utility.py       |  3 +++
 8 files changed, 92 insertions(+), 22 deletions(-)
 create mode 100644 pyod/models/base_dl.py

diff --git a/CHANGES.txt b/CHANGES.txt
index 2117ee70f..bf42f9ab8 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -110,6 +110,7 @@ v<0.8.3>, <09/01/2020> -- Make decision score consistent.
 v<0.8.3>, <09/19/2020> -- Add model persistence documentation (save and load).
 v<0.8.4>, <10/13/2020> -- Fix COPOD code inconsistency (issue #239).
 v<0.8.4>, <10/24/2020> -- Fix LSCP minor bug (issue #180).
+v<0.8.4>, <11/02/2020> -- Add support for Tensorflow 2.
 
 
 
diff --git a/pyod/models/auto_encoder.py b/pyod/models/auto_encoder.py
index 23ab1cfdf..4d1bc5888 100644
--- a/pyod/models/auto_encoder.py
+++ b/pyod/models/auto_encoder.py
@@ -8,10 +8,6 @@ from __future__ import print_function
 
 import numpy as np
-from keras.models import Sequential
-from keras.layers import Dense, Dropout
-from keras.regularizers import l2
-from keras.losses import mean_squared_error
 from sklearn.preprocessing import StandardScaler
 from sklearn.utils import check_array
 from sklearn.utils.validation import check_is_fitted
@@ -20,6 +16,19 @@
 from ..utils.stat_models import pairwise_distances_no_broadcast
 from .base import BaseDetector
+from .base_dl import _get_tensorflow_version
+
+# if tensorflow 2, import from tf directly
+if _get_tensorflow_version() == 1:
+    from keras.models import Sequential
+    from keras.layers import Dense, Dropout
+    from keras.regularizers import l2
+    from keras.losses import mean_squared_error
+else:
+    from tensorflow.keras.models import Sequential
+    from tensorflow.keras.layers import Dense, Dropout
+    from tensorflow.keras.regularizers import l2
+    from tensorflow.keras.losses import mean_squared_error
 
 
 # noinspection PyUnresolvedReferences,PyPep8Naming,PyTypeChecker
diff --git a/pyod/models/base_dl.py b/pyod/models/base_dl.py
new file mode 100644
index 000000000..d6fcdd8a9
--- /dev/null
+++ b/pyod/models/base_dl.py
@@ -0,0 +1,27 @@
+# -*- coding: utf-8 -*-
+"""Base class for deep learning models
+"""
+# Author: Yue Zhao
+# License: BSD 2 clause
+
+from __future__ import division
+from __future__ import print_function
+
+import tensorflow
+
+def _get_tensorflow_version():  # pragma: no cover
+    """ Utility function to decide the version of tensorflow, which will
+    affect how to import keras models.
+
+    Returns
+    -------
+    tensorflow version : int
+
+    """
+
+    tf_version = str(tensorflow.__version__)
+    if int(tf_version.split(".")[0]) != 1 and int(
+            tf_version.split(".")[0]) != 2:
+        raise ValueError("tensorflow version error")
+
+    return int(tf_version.split(".")[0])
\ No newline at end of file
diff --git a/pyod/models/gaal_base.py b/pyod/models/gaal_base.py
index 12e179b2f..763d4c7d3 100644
--- a/pyod/models/gaal_base.py
+++ b/pyod/models/gaal_base.py
@@ -11,9 +11,17 @@
 
 import math
 
-import keras
-from keras.layers import Input, Dense
-from keras.models import Sequential, Model
+from .base_dl import _get_tensorflow_version
+
+# if tensorflow 2, import from tf directly
+if _get_tensorflow_version() == 1:
+    import keras
+    from keras.layers import Input, Dense
+    from keras.models import Sequential, Model
+else:
+    import tensorflow.keras as keras
+    from tensorflow.keras.layers import Input, Dense
+    from tensorflow.keras.models import Sequential, Model
 
 
 # TODO: create a base class for so_gaal and mo_gaal
diff --git a/pyod/models/mo_gaal.py b/pyod/models/mo_gaal.py
index d8f17e94b..25cf4ca0a 100644
--- a/pyod/models/mo_gaal.py
+++ b/pyod/models/mo_gaal.py
@@ -13,16 +13,23 @@
 
 import numpy as np
 
-from keras.layers import Input
-from keras.models import Model
-from keras.optimizers import SGD
-
 from sklearn.utils import check_array
 from sklearn.utils.validation import check_is_fitted
 
 from .base import BaseDetector
 from .gaal_base import create_discriminator
 from .gaal_base import create_generator
+from .base_dl import _get_tensorflow_version
+
+# if tensorflow 2, import from tf directly
+if _get_tensorflow_version() == 1:
+    from keras.layers import Input
+    from keras.models import Model
+    from keras.optimizers import SGD
+else:
+    from tensorflow.keras.layers import Input
+    from tensorflow.keras.models import Model
+    from tensorflow.keras.optimizers import SGD
 
 
 class MO_GAAL(BaseDetector):
diff --git a/pyod/models/so_gaal.py b/pyod/models/so_gaal.py
index 9dd3c8151..18b43f1ed 100644
--- a/pyod/models/so_gaal.py
+++ b/pyod/models/so_gaal.py
@@ -13,16 +13,23 @@
 
 import numpy as np
 
-from keras.layers import Input
-from keras.models import Model
-from keras.optimizers import SGD
-
 from sklearn.utils import check_array
 from sklearn.utils.validation import check_is_fitted
 
 from .base import BaseDetector
 from .gaal_base import create_discriminator
 from .gaal_base import create_generator
+from .base_dl import _get_tensorflow_version
+
+# if tensorflow 2, import from tf directly
+if _get_tensorflow_version() == 1:
+    from keras.layers import Input
+    from keras.models import Model
+    from keras.optimizers import SGD
+else:
+    from tensorflow.keras.layers import Input
+    from tensorflow.keras.models import Model
+    from tensorflow.keras.optimizers import SGD
 
 
 class SO_GAAL(BaseDetector):
diff --git a/pyod/models/vae.py b/pyod/models/vae.py
index fa612e8e2..95b6228fa 100644
--- a/pyod/models/vae.py
+++ b/pyod/models/vae.py
@@ -22,13 +22,6 @@
 
 import numpy as np
 
-from keras.models import Model
-from keras.layers import Lambda, Input, Dense, Dropout
-from keras.regularizers import l2
-from keras.losses import mse, binary_crossentropy
-from keras.utils import plot_model
-from keras import backend as K
-
 from sklearn.preprocessing import StandardScaler
 from sklearn.utils import check_array
 from sklearn.utils.validation import check_is_fitted
@@ -37,6 +30,21 @@
 from ..utils.stat_models import pairwise_distances_no_broadcast
 from .base import BaseDetector
+from .base_dl import _get_tensorflow_version
+
+# if tensorflow 2, import from tf directly
+if _get_tensorflow_version() == 1:
+    from keras.models import Model
+    from keras.layers import Lambda, Input, Dense, Dropout
+    from keras.regularizers import l2
+    from keras.losses import mse, binary_crossentropy
+    from keras import backend as K
+else:
+    from tensorflow.keras.models import Model
+    from tensorflow.keras.layers import Lambda, Input, Dense, Dropout
+    from tensorflow.keras.regularizers import l2
+    from tensorflow.keras.losses import mse, binary_crossentropy
+    from tensorflow.keras import backend as K
 
 
 class VAE(BaseDetector):
diff --git a/pyod/utils/utility.py b/pyod/utils/utility.py
index cc837f2ab..15009f3a7 100644
--- a/pyod/utils/utility.py
+++ b/pyod/utils/utility.py
@@ -279,6 +279,7 @@ def get_label_n(y, y_pred, n=None):
 
     return y_pred
 
+
 def get_intersection(lst1, lst2):
     """get the overlapping between two lists
 
@@ -321,6 +322,7 @@ def get_list_diff(li1, li2):
 
     return (list(set(li1) - set(li2)))
 
+
 def get_diff_elements(li1, li2):
     """get the elements in li1 but not li2, and vice versa
 
@@ -344,6 +346,7 @@ def get_diff_elements(li1, li2):
 
     return (list(set(li1) - set(li2)) + list(set(li2) - set(li1)))
 
+
 def argmaxn(value_list, n, order='desc'):
     """Return the index of top n elements in the list if order is set to
     'desc', otherwise return the index of n smallest ones.

From 6fd280c6209f7642b7a7274e71550bf998f38706 Mon Sep 17 00:00:00 2001
From: Yue Zhao
Date: Thu, 12 Nov 2020 18:36:58 -0500
Subject: [PATCH 10/11] fix #246

---
 pyod/models/auto_encoder.py |  2 +-
 pyod/models/vae.py          | 16 ++++++++--------
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/pyod/models/auto_encoder.py b/pyod/models/auto_encoder.py
index 4d1bc5888..1ed264dd9 100644
--- a/pyod/models/auto_encoder.py
+++ b/pyod/models/auto_encoder.py
@@ -87,7 +87,7 @@ class AutoEncoder(BaseDetector):
         - 1 = progress bar
         - 2 = one line per epoch.
 
-        For verbosity >= 1, model summary may be printed.
+        For verbose >= 1, model summary may be printed.
 
     random_state : random_state: int, RandomState instance or None, optional
         (default=None)
diff --git a/pyod/models/vae.py b/pyod/models/vae.py
index 95b6228fa..eb8bbaf4c 100644
--- a/pyod/models/vae.py
+++ b/pyod/models/vae.py
@@ -122,13 +122,13 @@ class VAE(BaseDetector):
         If True, apply standardization on the data.
 
     verbose : int, optional (default=1)
-        Verbosity mode.
+        Verbose mode.
 
         - 0 = silent
        - 1 = progress bar
        - 2 = one line per epoch.
 
-        For verbosity >= 1, model summary may be printed.
+        For verbose >= 1, model summary may be printed.
 
     random_state : random_state: int, RandomState instance or None, optional
         (default=None)
@@ -180,7 +180,7 @@ def __init__(self, encoder_neurons=None, decoder_neurons=None,
                  output_activation='sigmoid', loss=mse, optimizer='adam',
                  epochs=100, batch_size=32, dropout_rate=0.2,
                  l2_regularizer=0.1, validation_size=0.1, preprocessing=True,
-                 verbosity=1, random_state=None, contamination=0.1,
+                 verbose=1, random_state=None, contamination=0.1,
                  gamma=1.0, capacity=0.0):
         super(VAE, self).__init__(contamination=contamination)
         self.encoder_neurons = encoder_neurons
@@ -195,7 +195,7 @@ def __init__(self, encoder_neurons=None, decoder_neurons=None,
         self.l2_regularizer = l2_regularizer
         self.validation_size = validation_size
         self.preprocessing = preprocessing
-        self.verbosity = verbosity
+        self.verbose = verbose
         self.random_state = random_state
         self.latent_dim = latent_dim
         self.gamma = gamma
@@ -272,7 +272,7 @@ def _build_model(self):
                    [z_mean, z_log])
         # Instantiate encoder
         encoder = Model(inputs, [z_mean, z_log, z])
-        if self.verbosity >= 1:
+        if self.verbose >= 1:
             encoder.summary()
 
         # Build Decoder
@@ -289,7 +289,7 @@ def _build_model(self):
                          layer)
         # Instantiate decoder
         decoder = Model(latent_inputs, outputs)
-        if self.verbosity >= 1:
+        if self.verbose >= 1:
             decoder.summary()
         # Generate outputs
         outputs = decoder(encoder(inputs)[2])
@@ -298,7 +298,7 @@ def _build_model(self):
         vae = Model(inputs, outputs)
         vae.add_loss(self.vae_loss(inputs, outputs, z_mean, z_log))
         vae.compile(optimizer=self.optimizer)
-        if self.verbosity >= 1:
+        if self.verbose >= 1:
             vae.summary()
         return vae
 
@@ -343,7 +343,7 @@ def fit(self, X, y=None):
                                    batch_size=self.batch_size,
                                    shuffle=True,
                                    validation_split=self.validation_size,
-                                   verbose=self.verbosity).history
+                                   verbose=self.verbose).history
         # Predict on X itself and calculate the reconstruction error as
         # the outlier scores. Note that X_norm was shuffled and has to be recreated
         if self.preprocessing:

From 4b625994c7e48e83deb586f5cba254c922e1e1ba Mon Sep 17 00:00:00 2001
From: Yue Zhao
Date: Thu, 12 Nov 2020 18:41:14 -0500
Subject: [PATCH 11/11] improve categorical data generation.

---
 CHANGES.txt                                   |  1 +
 examples/generate_data_categorical_example.py | 13 +++++++------
 pyod/utils/data.py                            |  1 -
 3 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index bf42f9ab8..b104c5879 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -111,6 +111,7 @@ v<0.8.3>, <09/19/2020> -- Add model persistence documentation (save and load).
 v<0.8.4>, <10/13/2020> -- Fix COPOD code inconsistency (issue #239).
 v<0.8.4>, <10/24/2020> -- Fix LSCP minor bug (issue #180).
 v<0.8.4>, <11/02/2020> -- Add support for Tensorflow 2.
+v<0.8.4>, <11/12/2020> -- Merge PR #102 for categorical data generation.
 
 
 
diff --git a/examples/generate_data_categorical_example.py b/examples/generate_data_categorical_example.py
index cec4d5f89..7926eee78 100644
--- a/examples/generate_data_categorical_example.py
+++ b/examples/generate_data_categorical_example.py
@@ -11,6 +11,7 @@
 import sys
 import numpy as np
 import matplotlib.pyplot as plt
+
 # temporary solution for relative imports in case pyod is not installed
 # if pyod is installed, no need to use the following line
 
@@ -19,16 +20,16 @@ from pyod.utils.data import generate_data_categorical
 
-
 if __name__ == "__main__":
     contamination = 0.1  # percentage of outliers
 
     # Generate sample categorical data
-    X_train, X_test, y_train, y_test = generate_data_categorical(n_train=200, n_test=50,
-                                                                 n_category_in=8, n_category_out=5,
-                                                                 n_informative=1, n_features=1,
-                                                                 contamination=contamination,
-                                                                 shuffle=True, random_state=42)
+    X_train, X_test, y_train, y_test = generate_data_categorical \
+        (n_train=200, n_test=50,
+         n_category_in=8, n_category_out=5,
+         n_informative=1, n_features=1,
+         contamination=contamination,
+         shuffle=True, random_state=42)
 
     # note that visualizing it can only be in 1 dimension!
     cats = list(np.ravel(X_train))
diff --git a/pyod/utils/data.py b/pyod/utils/data.py
index 1d30f60ff..5588c1879 100644
--- a/pyod/utils/data.py
+++ b/pyod/utils/data.py
@@ -494,7 +494,6 @@ def generate_data_categorical(n_train=1000, n_test=500, n_features=2,
                               n_informative=2, n_category_in=2,
                               n_category_out=2, contamination=0.1,
                               shuffle=True, random_state=None):
-
     """Utility function to generate synthesized categorical data.
 
     Parameters
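
For quick reference, a minimal usage sketch of the generator this series adds, assuming a pyod checkout with these patches applied is importable. The shape and contamination checks mirror the assertions in test_data.py; note the train_test_split return order (X_train, X_test, y_train, y_test):

    import numpy as np

    from pyod.utils.data import generate_data_categorical

    # 1000 train + 500 test samples at 10% contamination -> 150 outliers total
    X_train, X_test, y_train, y_test = generate_data_categorical(
        n_train=1000, n_test=500, n_features=2, n_informative=2,
        n_category_in=5, n_category_out=3, contamination=0.1,
        shuffle=True, random_state=42)

    print(X_train.shape, X_test.shape)  # (1000, 2) (500, 2)

    # overall outlier proportion should match the requested contamination
    out_perc = (np.sum(y_train) + np.sum(y_test)) / (len(y_train) + len(y_test))
    print(out_perc)  # ~0.1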
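
The relaxation loop in PATCH 05 is the subtle part of the LSCP fix: it keeps the training indices that occur more than local_region_threshold times across the neighbor queries, then lowers the cutoff until at least two candidates survive. A standalone sketch of that logic under the assumption that it is lifted out of _get_local_region; the helper name local_region_candidates is hypothetical, not pyod API:

    import collections

    def local_region_candidates(indices, threshold):
        # keep items that occur more than `threshold` times
        tmp = [item for item, count in
               collections.Counter(indices).items() if count > threshold]
        decrease_value = 0
        while len(tmp) < 2:
            # relax the cutoff one step at a time, as in the patch; the
            # assert aborts before the cutoff would drop to zero
            decrease_value = decrease_value + 1
            assert decrease_value < threshold
            tmp = [item for item, count in
                   collections.Counter(indices).items()
                   if count > (threshold - decrease_value)]
        return tmp

    # index 0 occurs 3 times, index 1 twice, index 2 once; with threshold=2
    # only [0] survives, so the cutoff relaxes once and [0, 1] is returned
    print(local_region_candidates([0, 0, 0, 1, 1, 2], threshold=2))  # [0, 1]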