# -*- coding: utf-8 -*-
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
# %matplotlib inline
"""**Function to calculate error**"""
def err(y, yp):
    # Misclassification rate used for both training and testing errors:
    # round the raw scores and count disagreements with the true labels.
    yp_rounded = np.round(yp)
    return np.sum(y != yp_rounded) / len(y)
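# Quick check of the error function: with y = [1, -1, 1, -1] and raw scores
# yp = [0.9, -0.6, -0.2, -1.1], rounding gives [1, -1, -0, -1], so exactly
# one of the four predictions disagrees and the error is 0.25.
print(err(np.array([1, -1, 1, -1]), np.array([0.9, -0.6, -0.2, -1.1])))  # 0.25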
"""**Bagging Method**"""
class bagging_tree_classif:
    # Bagging ensemble of decision trees, with fit and predict methods.
    def bag_fit(self, x, y, n, m):
        # Fit n trees of max_depth m, each on a bootstrap sample of the
        # training data (sampled with replacement, same size as x).
        self.m_list = list()
        l = len(x)
        for idx in range(n):
            samp = np.random.choice(np.arange(l), size=l, replace=True)
            x_boot, y_boot = x[samp], y[samp]
            DT = DecisionTreeClassifier(max_depth=m)
            DT.fit(x_boot, y_boot)
            self.m_list.append(DT)
    def predic(self, X, y, n):
        # Majority vote of the n trees; labels are assumed to be in {-1, +1}.
        # te[t] records the error of the vote truncated to the first t + 1
        # trees, so it can be plotted against the number of rounds.
        votes, te = np.zeros(len(X)), list()
        for dt in self.m_list:
            votes = votes + dt.predict(X)
            te.append(err(y, np.where(votes >= 0, 1, -1)))
        re = np.where(votes >= 0, 1, -1)
        return re, te
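"""**Quick sanity check for bagging**"""
# A minimal sketch on synthetic data (the toy blobs below are illustrative
# only, not part of the datasets used later). With labels in {-1, +1}, the
# majority vote of the bootstrapped trees should separate two well-separated
# blobs almost perfectly, so the printed error should be at or near 0.0.
rng = np.random.RandomState(1)
X_toy = np.vstack([rng.randn(50, 2) + 2, rng.randn(50, 2) - 2])
y_toy = np.hstack([np.ones(50), -np.ones(50)]).astype(int)
bag_check = bagging_tree_classif()
bag_check.bag_fit(X_toy, y_toy, 25, 2)
pred_toy, _ = bag_check.predic(X_toy, y_toy, 25)
print(err(y_toy, pred_toy))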
"""**Boosting: Using AdaBoost**"""
class adaboost_ensemble:
    # AdaBoost ensemble of decision trees, with fit and predict methods.
    # Labels are assumed to be in {-1, +1}.
    def fit(self, X, y, N, m):
        we_coms, self.mod, tr_e = list(), list(), list()
        ll = len(y)
        var_we = np.ones(ll) / ll  # start from uniform sample weights
        for t in range(N):
            tree = DecisionTreeClassifier(max_depth=m)
            tree.fit(X, y, sample_weight=var_we)
            sc = tree.predict(X)
            self.mod.append(tree)
            tr_e.append(err(y, sc))  # unweighted training error of this tree
            # The coefficient uses the *weighted* error of the round.
            w_err = np.sum(var_we * (y != sc)) / np.sum(var_we)
            ada_wec = 1 / 2 * np.log((1 - w_err) / (w_err + 1e-10))
            # Up-weight misclassified points (y * sc = -1) and down-weight
            # correct ones, then renormalise the sample weights.
            var_we = var_we * np.exp(-ada_wec * y * sc)
            var_we = var_we / np.sum(var_we)
            we_coms.append(ada_wec)
        return we_coms
    def predic(self, X, y, N, we_coms):
        # Weighted vote sign(sum_t alpha_t * h_t(x)); te_e[t] records the
        # error of the vote truncated at round t, for plotting against the
        # number of rounds.
        te_e = list()
        scores_ada = np.zeros(len(X))
        for t in range(N):
            sc = self.mod[t].predict(X)
            scores_ada = scores_ada + sc * we_coms[t]
            te_e.append(err(y, np.where(scores_ada >= 0, 1, -1)))
        re = np.where(scores_ada >= 0, 1, -1)
        return re, te_e
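"""**Quick sanity check for AdaBoost**"""
# A minimal sketch on synthetic data (illustrative only). Each coefficient
# alpha_t = 0.5 * log((1 - err_t) / err_t) is positive whenever the round's
# weighted error is below 0.5; e.g. a weighted error of 0.25 gives
# alpha = 0.5 * log(3) ~ 0.55, so misclassified points have their weights
# multiplied by exp(alpha) ~ 1.73 versus exp(-alpha) ~ 0.58 for correct
# ones. On two well-separated blobs the printed training error should be
# at or near 0.0.
rng = np.random.RandomState(0)
X_toy = np.vstack([rng.randn(50, 2) + 2, rng.randn(50, 2) - 2])
y_toy = np.hstack([np.ones(50), -np.ones(50)]).astype(int)
ada_check = adaboost_ensemble()
coefs = ada_check.fit(X_toy, y_toy, 10, 1)
pred_toy, _ = ada_check.predic(X_toy, y_toy, 10, coefs)
print(err(y_toy, pred_toy))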
"""**Function for model fit, predict and plot results**"""
def graphs(X_train, X_test, y_train, y_test):
    # Run bagging and AdaBoost with decision tree classifiers, compute the
    # training and test errors, and plot the test error rate against the
    # number of rounds for each max_depth value.
    depth_list = [5, 10]  # max_depth values to try
    for md in depth_list:
        rounds_list = [200]  # base number of rounds
        for pos in rounds_list:
            rounds = pos + 10  # number of boosting/bagging rounds used
            plt.xlabel('Number of Rounds')
            plt.ylabel('Error in Testing')
            ada = adaboost_ensemble()  # build the AdaBoost ensemble
            aa = ada.fit(X_train, y_train, rounds, md)  # fit the model
            t1, t2 = ada.predic(X_train, y_train, rounds, aa)  # predictions on the training set
            ada_trainerr = err(y_train, t1)  # training error
            ada_pred, ada_te = ada.predic(X_test, y_test, rounds, aa)
            ada_finalerror = err(y_test, ada_pred)  # test error
            print("Final test error for AdaBoost:", ada_finalerror, "with a depth of", md)
            # test error rate vs number of rounds for AdaBoost at this max_depth
            plt.plot(list(range(rounds)), ada_te, label='AdaBoost test error', color='black')
            bag = bagging_tree_classif()  # build the bagging ensemble
            bag.bag_fit(X_train, y_train, rounds, md)  # fit the model
            be, te = bag.predic(X_train, y_train, rounds)  # predictions on the training set
            bag_trainerr = err(y_train, be)  # training error
            bag_ypred, bag_te = bag.predic(X_test, y_test, rounds)
            bag_finalerror = err(y_test, bag_ypred)  # test error
            print("Final test error for Bagging:", bag_finalerror, "with a depth of", md)
            # test error rate vs number of rounds for bagging at this max_depth
            plt.plot(list(range(rounds)), bag_te, label='Bagging test error', color='green')
            plt.legend()
            plt.show()
"""**Ada-Boost and Bagging on Letter Recognition data**"""
df = pd.read_csv('letter-recognition.data', header=None)
# Keep only the binary problem: letter 'C' vs letter 'G'.
c_vals = (df.iloc[:, 0] == 'C')
g_vals = (df.iloc[:, 0] == 'G')
df = df[c_vals | g_vals].copy()
# Encode the letter as a {-1, +1} label, as assumed by the ensembles.
df[0] = df[0].astype('category').cat.codes * 2 - 1
data = df.iloc[:, 1:].values
lab = df.iloc[:, 0].values
X_train, X_test, y_train, y_test = train_test_split(data, lab, test_size=0.3, random_state=17)
graphs(X_train, X_test, y_train, y_test)
"""**Ada-Boost and Bagging on spambase data**"""
df = pd.read_csv('spambase.data', header=None)
# Column names live in spambase.names after 33 lines of preamble.
cns = pd.read_csv('spambase.names', sep=':', skiprows=range(0, 33), header=None)
var_heads = list(cns[0])
var_heads.append('Spam')
df.columns = var_heads
# Map the 0/1 spam label to {-1, +1}, as assumed by the ensembles.
df['Spam'] = df['Spam'] * 2 - 1
v1 = df.drop(columns='Spam').values
v2 = df['Spam'].values
X_train, X_test, y_train, y_test = train_test_split(v1, v2, train_size=0.3, random_state=17)
graphs(X_train, X_test, y_train, y_test)
"""**Ada-Boost and Bagging on German credit data**"""
df = pd.read_csv('german.data', sep=' ', header=None)
# Label-encode every column so the trees receive numeric features.
df = df.apply(lambda col: LabelEncoder().fit_transform(col))
v1 = df.iloc[:, :20].values
# Map the encoded 0/1 credit label to {-1, +1}, as assumed by the ensembles.
v2 = LabelEncoder().fit_transform(df[20]) * 2 - 1
X_train, X_test, y_train, y_test = train_test_split(v1, v2, test_size=0.3, random_state=17)
graphs(X_train, X_test, y_train, y_test)