-
Notifications
You must be signed in to change notification settings - Fork 0
/
task_4(email_spam_detection).py
186 lines (111 loc) · 5.61 KB
/
task_4(email_spam_detection).py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
# -*- coding: utf-8 -*-
"""Task-4(email spam detection).ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1IPUDN4BsVxAiuFQToDc6VPjefRhgyBt8
Email Spam Detection Using Machine Learning
1. **Data Collection**
importing pandas library to read the data
"""
import pandas as pd
"""loading the dataset from Google Sheets
"""
data=pd.read_csv('https://docs.google.com/spreadsheets/d/1egtZgqs61fgCmI_eHxqPePYlEMAACDW8lEAjfC9cgi4/export?format=csv&gid=332598977')
"""Reading the data
This data consists of 2 columns - v1 which is the category column that labels the mail content as ham or spam; v2 is the column containing the mail/messages. For better understanding, the columns are renamed - v1-> Category,v2 -> Message
"""
data.rename(columns={'v1':'Category','v2':'Message'},inplace=True)
data.head(3)
data.tail(3)
"""2. **Data Organization**
"""
data.shape # gives information about the data i.e, total number of rows and columns
data.info()
"""From the above info, it shows that there is no null values present"""
data.describe() # gives the summary of the data
"""3. **Data Visualization**
importing necessary libraries for Data Visualization
"""
import seaborn as sns
import matplotlib.pyplot as plt
sns.countplot(data=data,x=data['Category'])
"""The above graph shows that number of ham mails is more than spam ones."""
data.groupby('Category')['Category'].count()
# Total number of ham mails = 4825
# Total number of spam mails = 747
#Total mails= 5572
Ham = (4825/5572)*100
print('Ham :-',Ham)
Spam=(747/5572)*100
print('Spam :-',Spam)
# Ham is rounded to 87 and Spam is rounded to 13 for more convinience
plt.pie((87,13),labels=['Ham','Spam'])
plt.title('Mails by Category')
plt.show()
"""4. **Data Preparation**
As the category data which is needed for detection has only string values, replacement by numbers is to be done.
0 for ham, 1 for spam
"""
data['Category']=data['Category'].replace({'ham':0,'spam':1})
"""Dividing the dataset into input and output columns, i.e input consists of independent variables (Message) and output has dependent variables(Category)."""
# input column
x= data['Message']
# output column
y=data['Category']
"""importing needed libraries"""
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2)
"""Feature extraction:-
For machine learning, using numerical data is more easy because process is faster. So, the string data is to be converted to numbers.
This is done using the method of Feature Extraction, and the module TfidfVectorizer.
"""
from sklearn.feature_extraction.text import TfidfVectorizer
new= TfidfVectorizer(min_df=1,stop_words='english',lowercase=True)
"""- min_df is for checking if the word's score is less than 1, can be ignored
- stop_words = unnecessary words to be ignored
- lowercase= words in lower case to be included for analysis
"""
x_train_new=new.fit_transform(x_train) # input train values implied with the feature extraction is trained for the model building
x_test_new=new.transform(x_test) # since its a test data, its not fitted.
"""5. **Model Building**
Since, the dataset is composed of dependent variable which falls under binary category, models like Logistic Regression, Decision Tree Classifier, Support Vector Classifier are used.
(1) Logistic Regression
"""
from sklearn.linear_model import LogisticRegression
model_1=LogisticRegression() # defining the model
model_1.fit(x_train_new,y_train) # training the data
"""Model evaluation for (1)"""
score_1=model_1.score(x_train_new,y_train)
print('Accuracy Score for Logistic Regression model with training data =',score_1)
score_2=model_1.score(x_test_new,y_test)
print('Accuracy Score for Logistic Regression model with test data =',score_2)
"""(2) DecisionTreeClassifier"""
from sklearn.tree import DecisionTreeClassifier
model_2=DecisionTreeClassifier()
model_2.fit(x_train_new,y_train)
"""Model Evaluation for (2)"""
s_1=model_2.score(x_train_new,y_train)
print('Accuracy Score for Decision Tree Classification model with training data =',s_1)
s_2=model_2.score(x_test_new,y_test)
print('Accuracy Score for Decision Tree Classification model with test data =',s_2)
"""(3) Support Vector Classifier"""
from sklearn.svm import SVC
model_3=SVC()
model_3.fit(x_train_new,y_train)
"""Model Evaluation for (3)"""
s_a=model_3.score(x_train_new,y_train)
print('Accuracy Score for Support Vector Classification model with training data =',s_a)
s_b=model_3.score(x_test_new,y_test)
print('Accuracy Score for Support Vector Classification model with test data =',s_b)
"""Classification Reports for all 3 models"""
from sklearn.metrics import classification_report # importing the library used for printing classification reports
p1=model_1.predict(x_test_new)
print(' 1. Classification Report for Model 1 - Logistic Regression:-' ,'\n ', classification_report(y_test,p1))
pred1=model_2.predict(x_test_new)
print('2. Classification Report for Model 2 - Decision Tree Classification:-' ,'\n ', classification_report(y_test,pred1))
pred_a=model_3.predict(x_test_new)
print('3. Classification Report for Model 3 - SVC:-' ,'\n ', classification_report(y_test,pred_a))
"""From the above classification reports of the 3 models used for training the machine in order to detect the spam mails, "model_3" i.e, the model built by using the algorithm Support Vector Classifier(SVC) is found to be the best one."""
import pickle # modeule used for loading and storing files
pickle.dump(model_3,open('/content/drive/MyDrive/ONE/Model_for_Spamdetection.pkl','wb'))
"""Hence, the model built using SVC alogrithm is saved."""