-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.py
72 lines (43 loc) · 1.53 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
import pandas as pd
from rake_nltk import Rake
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
# ---------------------------------------------------------------------------
# Data preparation: load the movie table, derive one text "bag of words" per
# movie, and compute the pairwise cosine similarity between those bags.
#
# Names consumed later in the file: ``df`` (indexed by Title, single column
# 'bag_of_words'), ``cosine_sim`` (square similarity matrix) and ``indices``
# (Series mapping row position -> title).
# ---------------------------------------------------------------------------


def _extract_keywords(plot):
    """Return the distinct words RAKE scores in the key phrases of ``plot``."""
    # A fresh Rake per plot, exactly as before, so no state is shared
    # between rows.
    rake = Rake()
    rake.extract_keywords_from_text(plot)
    return list(rake.get_word_degrees().keys())


def _as_text(value):
    """Render one cell as space-separated text.

    Strings are used unchanged; any other iterable (e.g. the keyword list)
    is joined with spaces. Joining a plain string would split it into single
    characters, which CountVectorizer's default token pattern throws away.
    """
    if isinstance(value, str):
        return value
    return ' '.join(value)


df = pd.read_csv('./movies.csv')
# Keep only the columns used as features; replace missing cells with '' so
# the text processing below never receives NaN.
df = df[['Title', 'Genre', 'Director', 'Actors', 'Plot']].fillna('')

# Assign the whole column at once: pandas does not guarantee that writes to
# the row yielded by DataFrame.iterrows() are reflected in the frame.
df['Key_words'] = df['Plot'].map(_extract_keywords)
df.drop(columns=['Plot'], inplace=True)
df.set_index('Title', inplace=True)

# Every remaining column contributes to the bag of words.
feature_columns = list(df.columns)
df['bag_of_words'] = [
    ' '.join(_as_text(value) for value in row)
    for row in df[feature_columns].itertuples(index=False, name=None)
]
df.drop(columns=feature_columns, inplace=True)

# Token counts per movie, then cosine similarity between every pair of movies.
count = CountVectorizer()
count_matrix = count.fit_transform(df['bag_of_words'])
cosine_sim = cosine_similarity(count_matrix, count_matrix)

# Position -> title lookup; positions line up with the rows of cosine_sim.
indices = pd.Series(df.index)
print(indices[:5])


def recommendations(title, cosine_sim = cosine_sim, top_n = 10):
    """Return the titles of the movies most similar to ``title``.

    Args:
        title: Movie title, matched exactly against the values of the
            module-level ``indices`` Series.
        cosine_sim: Square similarity matrix whose row order matches
            ``indices``. Defaults to the module-level matrix.
        top_n: Maximum number of titles to return (default 10).

    Returns:
        A list of at most ``top_n`` titles, most similar first. The queried
        movie itself is never included.

    Raises:
        IndexError: If ``title`` does not occur in ``indices``.
    """
    matches = indices[indices == title].index
    if len(matches) == 0:
        raise IndexError('title {!r} not found in the movie index'.format(title))
    idx = matches[0]

    score_series = pd.Series(cosine_sim[idx])
    # Remove the query row explicitly instead of assuming it sorts first:
    # another movie with a tied score could otherwise take position 0 and
    # leave the query inside the results.
    top_positions = score_series.drop(idx).nlargest(top_n).index
    return [indices.iloc[position] for position in top_positions]


# Demo run: print the titles recommended for 'Fargo'.
print(recommendations(title='Fargo'))