-
Notifications
You must be signed in to change notification settings - Fork 94
/
kernel_naive_bayes.py
316 lines (279 loc) · 10.6 KB
/
kernel_naive_bayes.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
"""Kernel Naive Bayes implementation by sklearn. For small data (< 200k rows)."""
import os
import datatable as dt
import numpy as np
import pandas as pd
from h2oaicore.models_main import MainModel
from h2oaicore.systemutils import config, physical_cores_count, loggerinfo
from sklearn.preprocessing import LabelEncoder
from h2oaicore.models import CustomModel
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier, KernelDensity
from sklearn.base import BaseEstimator, ClassifierMixin
class KDENaiveBayesClassifier(BaseEstimator, ClassifierMixin):
"""Bayesian generative classification based on Kernel Density Estimation
Reference: https://jakevdp.github.io/PythonDataScienceHandbook/05.13-kernel-density-estimation.html
Parameters
----------
bandwidth : float
the kernel bandwidth within each class
kernel : str
the kernel name, passed to KernelDensity - one of {'gaussian', 'tophat', 'epanechnikov', 'exponential', 'linear', 'cosine'}
"""
def __init__(self, bandwidth=1.0, kernel="gaussian", algorithm="auto"):
self.bandwidth = bandwidth
self.kernel = kernel
self.algorithm = algorithm
def fit(self, X, y):
self.classes_ = np.sort(np.unique(y))
training_sets = [X[y == yi] for yi in self.classes_]
self.models_ = [
KernelDensity(
bandwidth=self.bandwidth, kernel=self.kernel, algorithm=self.algorithm
).fit(Xi)
for Xi in training_sets
]
self.logpriors_ = [np.log(Xi.shape[0] / X.shape[0]) for Xi in training_sets]
return self
def predict_proba(self, X):
logprobs = np.array([model.score_samples(X) for model in self.models_]).T
result = np.exp(logprobs + self.logpriors_)
return result / result.sum(1, keepdims=True)
def predict(self, X):
return self.classes_[np.argmax(self.predict_proba(X), 1)]
class KernelNaiveBayesClassifier(CustomModel):
_regression = False
_binary = True
_multiclass = True
_parallel_task = True
_testing_can_skip_failure = False # ensure tested as if shouldn't fail
_known_bad_preds = True # makes all nans sometimes in predictions
_display_name = "Kernel Naive Bayes Classifier"
_description = "Kernel Naive Bayes Model based on sklearn. Not advised if the data is larger than 200K rows"
@staticmethod
def can_use(
accuracy,
interpretability,
train_shape=None,
test_shape=None,
valid_shape=None,
n_gpus=0,
num_classes=None,
**kwargs
):
if config.hard_asserts:
# for bigger data, too slow to test even with 1 iteration
use = (
train_shape is not None
and train_shape[0] * train_shape[1] < 1024 * 1024
or valid_shape is not None
and valid_shape[0] * valid_shape[1] < 1024 * 1024
)
# too slow for walmart with only 421k x 15 even with 10 neighbors
use &= train_shape is not None and train_shape[1] < 10
return use
else:
return True
def set_default_params(
self, accuracy=None, time_tolerance=None, interpretability=None, **kwargs
):
kwargs.pop("get_best", None)
self.mutate_params(
accuracy=accuracy,
time_tolerance=time_tolerance,
interpretability=interpretability,
get_best=True,
**kwargs
)
def mutate_params(
self,
accuracy=10,
time_tolerance=10,
interpretability=1,
get_best=False,
**kwargs
):
# Modify certain parameters for tuning
user_choice = config.recipe_dict.copy()
self.params = dict()
trial = kwargs.get("trial")
list_of_neibs = [1, 2, 5, 10, 50, 100, 150, 200]
bandwidths = [1e-2, 1e-1, 1, 10, 100]
kernels = [
"gaussian",
"tophat",
"epanechnikov",
"exponential",
"linear",
"cosine",
]
if config.recipe == "kaggle":
list_of_neibs.extend([250, 300])
if "GIT_HASH" in os.environ and config.hard_asserts:
list_of_neibs = [1]
self.params["bandwidth"] = MainModel.get_one(
bandwidths,
get_best=get_best,
best_type="first",
name="kernel bandwidth",
trial=trial,
user_choice=user_choice,
)
self.params["kernel"] = MainModel.get_one(
kernels,
get_best=get_best,
best_type="first",
name="kernel",
trial=trial,
user_choice=user_choice,
)
self.params["algorithm"] = MainModel.get_one(
["auto", "ball_tree", "kd_tree"],
get_best=get_best,
best_type="first",
name="algorithm",
trial=trial,
user_choice=user_choice,
)
self.params["standardize"] = MainModel.get_one(
[False, True],
get_best=get_best,
best_type="first",
name="standardize",
trial=trial,
user_choice=user_choice,
)
def transcribe_params(self, params=None, **kwargs):
"""
Fixups of params to avoid any conflicts not expressible easily for Optuna
Or system things only need to set at fit time
:param params:
:return:
"""
params_was_None = False
if params is None:
params = self.params # reference, so goes back into self.params
params_was_None = True
params.pop(
"standardize", None
) # internal parameter, not for actual underlying sklearn model
params.pop("n_jobs", None) # KernelDensity doesn't use n_jobs
if params_was_None:
# in case some function didn't preserve reference
self.params = params
return params # default is no transcription
def fit(
self,
X,
y,
sample_weight=None,
eval_set=None,
sample_weight_eval_set=None,
**kwargs
):
# system thing, doesn't need to be set in default or mutate, just at runtime in fit, into self.params so can see
self.params["n_jobs"] = self.params_base.get(
"n_jobs", max(1, physical_cores_count)
)
params = self.params.copy()
params = self.transcribe_params(params, train_shape=X.shape)
loggerinfo(
self.get_logger(**kwargs),
"%s fit params: %s" % (self.display_name, dict(params)),
)
loggerinfo(
self.get_logger(**kwargs),
"%s data: %s %s" % (self.display_name, X.shape, y.shape),
)
X = dt.Frame(X)
orig_cols = list(X.names)
model = KDENaiveBayesClassifier(**params)
lb = LabelEncoder()
lb.fit(self.labels)
y = lb.transform(y)
X = self.basic_impute(X)
X = X.to_numpy()
if self.params.get(
"standardize", False
): # self.params since params has it popped out
standard_scaler = StandardScaler()
X = standard_scaler.fit_transform(X)
else:
standard_scaler = None
model.fit(X, y)
importances = self.get_basic_importances(X, y)
self.set_model_properties(
model=(model, standard_scaler, self.min),
features=orig_cols,
importances=importances.tolist(), # abs(model.coef_[0])
iterations=0,
)
def basic_impute(self, X):
# scikit extra trees internally converts to np.float32 during all operations,
# so if float64 datatable, need to cast first, in case will be nan for float32
from h2oaicore.systemutils import update_precision
X = update_precision(
X,
data_type=np.float32,
override_with_data_type=True,
fixup_almost_numeric=True,
)
# Replace missing values with a value smaller than all observed values
if not hasattr(self, "min") or not isinstance(self.min, dict):
self.min = dict()
for col in X.names:
XX = X[:, col]
if col not in self.min:
self.min[col] = XX.min1()
if (
self.min[col] is None
or np.isnan(self.min[col])
or np.isinf(self.min[col])
):
self.min[col] = -1e10
else:
self.min[col] -= 1
XX.replace([None, np.inf, -np.inf], self.min[col])
X[:, col] = XX
assert X[dt.isna(dt.f[col]), col].nrows == 0
return X
def get_basic_importances(self, X, y):
# Just get feature importance using basic model so at least something to consume by genetic algorithm
if isinstance(X, dt.Frame):
X = X.to_numpy()
elif isinstance(X, pd.DataFrame):
X = X.values
# not in self, just for importances regardless of real model behavior
standard_scaler = StandardScaler()
X = standard_scaler.fit_transform(X)
from sklearn.linear_model import (
Ridge,
) # will be used to derive feature importances
feature_model = Ridge(alpha=1.0, random_state=self.random_state)
feature_model.fit(X, y)
return np.array(abs(feature_model.coef_))
def predict(self, X, **kwargs):
model_tuple, _, _, _ = self.get_model_properties()
if len(model_tuple) == 3:
model, standard_scaler, self.min = model_tuple
else:
# migration for old recipe version
model = model_tuple
standard_scaler = None
self.min = dict()
X = dt.Frame(X)
X = self.basic_impute(X)
X = X.to_numpy()
if standard_scaler is not None:
X = standard_scaler.transform(X)
pred_contribs = kwargs.get("pred_contribs", None)
output_margin = kwargs.get("output_margin", None)
if not pred_contribs:
if self.num_classes == 1:
preds = model.predict(X)
else:
preds = model.predict_proba(X)
# preds = (prob_pos - prob_pos.min()) / (prob_pos.max() - prob_pos.min())
return preds
else:
raise NotImplementedError("No Shapley for k-NB model")