# entropy.py
import math
import pandas as pd
import numpy as np
def entropy2(train_data, classifier, base=2):
    """Shannon entropy of the binary `classifier` column (labels 1 and 0)."""
    count = [len(train_data[train_data[classifier] == 1]),
             len(train_data[train_data[classifier] == 0]),
             len(train_data)]
    prob_1 = count[0] / count[2] if count[2] else 0  # if dividing by zero, set probability to zero
    prob_2 = count[1] / count[2] if count[2] else 0
    # terms with zero probability contribute zero entropy; both terms use the same log base
    entropy = (-prob_1 * math.log(prob_1, base) if prob_1 else 0) \
        + (-prob_2 * math.log(prob_2, base) if prob_2 else 0)
    return entropy
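
# Worked example (illustrative only, not part of the original file): for a frame with a
# binary column 'y' holding six 1s and two 0s, the class probabilities are 0.75 and 0.25,
# so entropy2 returns -0.75*log2(0.75) - 0.25*log2(0.25), which is about 0.811 bits:
#   entropy2(pd.DataFrame({'y': [1]*6 + [0]*2}), 'y')  # about 0.811
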
def entropy_split(train_data, label, classifier, bins=100, base=2):
    """Scan candidate bin points over `label` and return, as a one-row DataFrame,
    the split with the highest information gain on the binary `classifier`."""
    total_entropy = entropy2(train_data, classifier, base)
    bin_points = list(np.linspace(
        math.floor(train_data[label].min()),
        math.ceil(train_data[label].max()),
        bins))
    x = []
    for bin_point in bin_points:
        # rows exactly equal to the candidate point fall in neither half
        upper, lower = train_data[train_data[label] > bin_point], train_data[train_data[label] < bin_point]
        count = [len(upper[upper[classifier] == 1]), len(upper[upper[classifier] == 0]), len(upper),
                 len(lower[lower[classifier] == 1]), len(lower[lower[classifier] == 0]), len(lower),
                 len(train_data)]
        prob_u1 = count[0] / count[2] if count[2] else 0  # if dividing by zero, set probability to zero
        prob_u2 = count[1] / count[2] if count[2] else 0
        prob_l1 = count[3] / count[5] if count[5] else 0
        prob_l2 = count[4] / count[5] if count[5] else 0
        # terms with zero probability contribute zero entropy
        entropy_u = (-prob_u1 * math.log(prob_u1, base) if prob_u1 else 0) + (-prob_u2 * math.log(prob_u2, base) if prob_u2 else 0)
        entropy_l = (-prob_l1 * math.log(prob_l1, base) if prob_l1 else 0) + (-prob_l2 * math.log(prob_l2, base) if prob_l2 else 0)
        # information gain: parent entropy minus the size-weighted entropies of the two halves
        entropy_gain = total_entropy - (entropy_u * count[2] / count[6] + entropy_l * count[5] / count[6])
        x.append([bin_point, entropy_gain, entropy_u, entropy_l, count[2], count[5]])
    results = pd.DataFrame(x, columns=['bin_point', 'entropy_gain', 'entropy_u', 'entropy_l', 'count_u', 'count_l'])
    # keep only the first row that reaches the maximum gain
    result = results[results['entropy_gain'] == results.entropy_gain.max()][:1]
    return result
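
# Illustrative usage (the column names 'x' and 'y' are assumptions for this example, not
# from the original file): on a frame where 'x' is continuous and the binary class flips
# around x = 5, the returned row's 'bin_point' lands near 5 and its 'entropy_gain' is
# close to the parent entropy:
#   df = pd.DataFrame({'x': np.linspace(0, 10, 200), 'y': (np.linspace(0, 10, 200) > 5).astype(int)})
#   entropy_split(df, label='x', classifier='y', bins=50)
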
def entropy_binning(train_data, label, classifier, split_bins=100, minimum_gain=0.0001):
    """Recursively split the `label` range at the bin points with the highest
    information gain on the binary `classifier`, stopping once the overall
    entropy gain of a split no longer exceeds `minimum_gain`. Returns the
    register of splits as a DataFrame."""
    total_entropy = entropy2(train_data, classifier=classifier)
    results = entropy_split(train_data, label=label, classifier=classifier, bins=split_bins)
    n = 0
    # register of candidate and used splits, with the entropy bookkeeping per step
    reg_entropy = pd.DataFrame({'splits': [0], 'split_side': [None], 'bin_point': [None],
                                'next_optimal_bin': None, 'bin_entropy_gain': 0,
                                'used': [0], 'entropy': total_entropy, 'entropy_gain': 1})
    # per-split cache of the data in each branch and the best split found inside it
    data = {n: {'data_u': train_data,
                'data_l': train_data,
                'entropy_u': results,
                'entropy_l': results,
                'used_split': 0}}
    # current leaf branches: count, entropy, which split produced them and on which side
    entropy = [[results['count_u'].values[0], results['entropy_u'].values[0], n, 'upper'],
               [results['count_l'].values[0], results['entropy_l'].values[0], n, 'lower']]
    entropy = pd.DataFrame(entropy, columns=['count', 'entropy', 'split', 'side'])
    prev_entropy = total_entropy
    total_count = len(train_data)
    entropy['prec'] = entropy['count'] / total_count
    now_entropy = np.sum(entropy['prec'] * entropy['entropy'])
    entropy_gain = prev_entropy - now_entropy
    while reg_entropy['entropy_gain'].min() > minimum_gain:
        if len(reg_entropy) < 2:
            # first split: use the optimal bin found on the full data set
            chosen_bin = results['bin_point'].values[0]
            reg_entropy['next_optimal_bin'] = chosen_bin
            n = n + 1
            data_upper = train_data.loc[train_data[label] > chosen_bin]
            data_lower = train_data.loc[train_data[label] < chosen_bin]
            entropy_u = entropy_split(data_upper, label=label, classifier=classifier)
            entropy_l = entropy_split(data_lower, label=label, classifier=classifier)
            data[n] = {'data_u': data_upper,
                       'data_l': data_lower,
                       'entropy_u': entropy_u,
                       'entropy_l': entropy_l,
                       'used_split': n}
        else:
            # next step
            n = n + 1
            # find the optimal bin which has not yet been used, based on the entropy gain in its region
            bin_option = reg_entropy[(reg_entropy.used == 0)]
            optimal_bin = bin_option[bin_option.bin_entropy_gain == bin_option.bin_entropy_gain.max()]
            # chosen parameters
            chosen_bin = optimal_bin['next_optimal_bin'].values[0]
            chosen_split = optimal_bin['splits'].values[0]
            chosen_split_side = optimal_bin['split_side'].values[0]
            # remove the branch that is being split in two from the entropy calculation
            entropy = entropy.drop(entropy[(entropy['split'] == chosen_split - 1) & (entropy['side'] == chosen_split_side)].index)
            # select the used branch to retrieve its data and register the two new sub-branches
            if chosen_split_side == 'upper':
                entropy_addu = pd.Series({'count': data[chosen_split]['entropy_u']['count_u'].values[0],
                                          'entropy': data[chosen_split]['entropy_u']['entropy_u'].values[0],
                                          'split': n - 1,
                                          'side': 'upper'})
                entropy_addl = pd.Series({'count': data[chosen_split]['entropy_u']['count_l'].values[0],
                                          'entropy': data[chosen_split]['entropy_u']['entropy_l'].values[0],
                                          'split': n - 1,
                                          'side': 'lower'})
                entropy = pd.concat([entropy, entropy_addu.to_frame().T, entropy_addl.to_frame().T], ignore_index=True)
                data_upper = data[chosen_split]['data_u'][data[chosen_split]['data_u'][label] > chosen_bin]
                data_lower = data[chosen_split]['data_u'][data[chosen_split]['data_u'][label] < chosen_bin]
                entropy_u = entropy_split(data_upper, label=label, classifier=classifier)
                entropy_l = entropy_split(data_lower, label=label, classifier=classifier)
                data[n] = {'data_u': data_upper,
                           'data_l': data_lower,
                           'entropy_u': entropy_u,
                           'entropy_l': entropy_l,
                           'used_split': [chosen_split]}
            else:
                entropy_addu = pd.Series({'count': data[chosen_split]['entropy_l']['count_u'].values[0],
                                          'entropy': data[chosen_split]['entropy_l']['entropy_u'].values[0],
                                          'split': n - 1,
                                          'side': 'upper'})
                entropy_addl = pd.Series({'count': data[chosen_split]['entropy_l']['count_l'].values[0],
                                          'entropy': data[chosen_split]['entropy_l']['entropy_l'].values[0],
                                          'split': n - 1,
                                          'side': 'lower'})
                entropy = pd.concat([entropy, entropy_addu.to_frame().T, entropy_addl.to_frame().T], ignore_index=True)
                data_upper = data[chosen_split]['data_l'][data[chosen_split]['data_l'][label] > chosen_bin]
                data_lower = data[chosen_split]['data_l'][data[chosen_split]['data_l'][label] < chosen_bin]
                entropy_u = entropy_split(data_upper, label=label, classifier=classifier)
                entropy_l = entropy_split(data_lower, label=label, classifier=classifier)
                data[n] = {'data_u': data_upper,
                           'data_l': data_lower,
                           'entropy_u': entropy_u,
                           'entropy_l': entropy_l,
                           'used_split': [chosen_split]}
        # recompute the size-weighted entropy over all current branches
        entropy['prec'] = entropy['count'] / total_count
        now_entropy = np.sum(entropy['prec'] * entropy['entropy'])
        entropy_gain = prev_entropy - now_entropy
        # add the results of this split to the register
        pd_entropy1 = [chosen_bin, n, entropy_u['bin_point'].values[0], 'upper', 0, now_entropy, entropy_gain, entropy_u['entropy_gain'].values[0]]
        pd_entropy2 = [chosen_bin, n, entropy_l['bin_point'].values[0], 'lower', 0, now_entropy, entropy_gain, entropy_l['entropy_gain'].values[0]]
        add_entropy = pd.DataFrame([pd_entropy1, pd_entropy2],
                                   columns=['bin_point', 'splits', 'next_optimal_bin', 'split_side', 'used', 'entropy', 'entropy_gain', 'bin_entropy_gain'])
        reg_entropy = pd.concat([reg_entropy, add_entropy], ignore_index=True)
        # mark the branch that was just split as used
        used_rule = chosen_bin == reg_entropy.next_optimal_bin
        reg_entropy.loc[used_rule, 'used'] = 1
        prev_entropy = now_entropy
    return reg_entropy
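
# Minimal smoke test (a sketch added for illustration; the synthetic columns 'x' and
# 'target', the distribution parameters, and the chosen minimum_gain are assumptions,
# not part of the original module). It draws two overlapping Gaussian groups, checks
# the overall entropy, finds the single best split, and then runs the recursive binning.
if __name__ == "__main__":
    rng = np.random.default_rng(0)
    demo = pd.DataFrame({
        "x": np.concatenate([rng.normal(3.0, 1.5, 300), rng.normal(7.0, 1.5, 300)]),
        "target": np.concatenate([np.zeros(300, dtype=int), np.ones(300, dtype=int)]),
    })
    print("overall entropy:", entropy2(demo, "target"))  # 1.0 for a balanced binary class
    print(entropy_split(demo, label="x", classifier="target", bins=50))  # best single cut, near x = 5
    # a relatively large minimum_gain keeps the example down to a handful of splits
    splits = entropy_binning(demo, label="x", classifier="target", split_bins=50, minimum_gain=0.05)
    print(splits[["splits", "split_side", "bin_point", "entropy", "entropy_gain"]])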