-
Notifications
You must be signed in to change notification settings - Fork 0
/
Data.py
76 lines (58 loc) · 2.45 KB
/
Data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
import logging
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.manifold import TSNE
# Root logger shared by the whole module; DEBUG lets every record through.
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

# Stream handler whose records are rendered as
# "<timestamp> : <level name, padded/cut to 5 chars> : <message>".
form = logging.Formatter("%(asctime)s : %(levelname)-5.5s : %(message)s")
consoleHandler = logging.StreamHandler()
consoleHandler.setFormatter(form)
logger.addHandler(consoleHandler)
class preprocess():
    """Stateless collection of DataFrame preprocessing helpers.

    Every method logs what it is about to do through the module-level
    ``logger`` and returns the processed data.
    """

    def __init__(self):
        """Create the helper object; there is no state to initialise."""
        pass

    def toCat(self, df):
        """Turn every ``object``-dtype column of *df* into a ``category`` column.

        The columns are replaced in place on *df*; the same DataFrame is
        returned for convenience.
        """
        logger.info('\n turning object cols into category cols \n')
        for col in df.select_dtypes(include='object').columns:
            df[col] = df[col].astype('category')
        return df

    def oneHot(self, df):
        """One-hot encode every ``category``-dtype column of *df*.

        The dummy columns generated for a column ``col`` are named with the
        prefix ``is_<col>``. Returns the DataFrame built by
        ``pd.get_dummies``; *df* itself is not modified.
        """
        logger.info('\n creating oneHot columns \n')
        colList = list(df.select_dtypes(include='category'))
        # str() keeps the prefix valid when a column label is not a string.
        prefList = ["is_" + str(col) for col in colList]
        return pd.get_dummies(df, columns=colList, prefix=prefList)

    def scalerDf(self, df, rmColList=None):
        """Standardise the ``int64``/``float64`` columns of *df*.

        Parameters:
            df: DataFrame whose numeric columns are scaled in place.
            rmColList: column names to leave untouched; ``None`` means
                "exclude nothing".

        Returns:
            *df*, with the selected columns replaced by their
            ``StandardScaler`` output. When there is nothing to scale (no
            eligible column, or an empty frame) *df* is returned unchanged.
        """
        logger.info('\n running StandardScaler() on the data \n')
        excluded = () if rmColList is None else rmColList
        numericCols = list(df.select_dtypes(include=['int64', 'float64']))
        colList = [col for col in numericCols if col not in excluded]
        # StandardScaler raises on input with zero features or zero samples,
        # so skip the scaling step instead of crashing.
        if not colList or df.empty:
            return df
        df[colList] = StandardScaler().fit_transform(df[colList].values)
        return df

    def targetEnc(self, y):
        """Encode the target labels *y* as integers with ``LabelEncoder``."""
        logger.info('\n running LabelEncoder() on the target col \n')
        return LabelEncoder().fit_transform(y)

    def tsneFunc(self, df):
        """Project *df* onto 3 dimensions with t-SNE.

        Uses perplexity 40, 300 optimisation iterations and verbose output,
        and returns the embedding produced by ``TSNE.fit_transform``.
        """
        logger.info('\n dimensional reductionality using TSNE \n')
        settings = dict(n_components=3, verbose=1, perplexity=40)
        try:
            # Newer scikit-learn releases name the iteration cap ``max_iter``.
            tsne = TSNE(max_iter=300, **settings)
        except TypeError:
            # Older releases only know the original ``n_iter`` keyword.
            tsne = TSNE(n_iter=300, **settings)
        return tsne.fit_transform(df)