bow_sklearn.py
# -*- coding:utf-8 -*-
'''
-------------------------------------------------
   Description :  bow-sklearn
   Author      :  machinelp
   Date        :  2020-06-03
-------------------------------------------------
'''

import os
import jieba
import pickle
import numpy as np
from .stop_words import StopWords
from textmatch.config.config import cfg
from textmatch.utils.logging import logging
from textmatch.config.constant import Constant as const
from textmatch.models.model_base.model_base import ModelBase
from sklearn.feature_extraction.text import CountVectorizer

class Bow(ModelBase):
    def __init__(self,
                 dic_path=const.BOW_DIC_PATH,
                 bow_index_path=const.BOW_INDEX_PARH,
                 stop_word=StopWords):
        '''Bag-of-words model built on sklearn's CountVectorizer.'''
        self.dic_path = dic_path
        self.bow_index_path = bow_index_path
        # make sure the parent directories of both artifact paths exist
        for per_path in [self.dic_path, self.bow_index_path]:
            per_path = '/'.join(per_path.split('/')[:-1])
            if per_path and not os.path.exists(per_path):
                os.makedirs(per_path)
        self.stop_word = stop_word()
        # token_pattern '(?u)\\b\\w\\w*\\b' keeps single-character tokens
        # (sklearn's default '(?u)\\b\\w\\w+\\b' would drop them), which matters
        # for Chinese text segmented into short tokens
        self.vectorizer = CountVectorizer(stop_words=None, max_df=cfg.emb.MAX_DF, min_df=cfg.emb.MIN_DF,
                                          max_features=cfg.emb.MAX_FEATURES, token_pattern='(?u)\\b\\w\\w*\\b')
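
    # Usage note (illustrative, not from the original source): the default
    # artifact paths come from textmatch.config.constant; pass dic_path
    # explicitly, e.g. Bow(dic_path='./model/bow/dic.pkl'), to keep several
    # vocabularies side by side.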
    # init: load an existing vocabulary, or (re)build it from words_list
    def init(self, words_list=None, update=True):
        if (not os.path.exists(self.dic_path) or not os.path.exists(self.bow_index_path) or update) \
                and (words_list is not None):
            word_list = self._seg_word(words_list)
        if os.path.exists(self.dic_path) and not update:
            # reuse the CountVectorizer fitted and pickled by a previous run
            with open(self.dic_path, 'rb') as f:
                self.vectorizer = pickle.load(f)
        else:
            try:
                logging.info('[Bow] start building bow model.')
                if words_list is None:
                    logging.error('[Bow] words_list is None')
                else:
                    self._gen_model(word_list)
                    logging.info('[Bow] build bow model success.')
            except Exception as e:
                logging.error('[Bow] build bow model error, error info: {}'.format(e))
        if words_list is not None:
            # pre-compute a normalized bow vector for every candidate text;
            # predict() later takes dot products against this matrix
            self.words_list_pre = []
            for per_word in words_list:
                self.words_list_pre.append(self._normalize(self._predict(per_word))[0])
            self.words_list_pre = np.array(self.words_list_pre)
        return self
    # seg word: tokenize each text (jieba segmentation or per-character split)
    # and join the tokens with spaces, as CountVectorizer expects
    def _seg_word(self, words_list, jieba_flag=cfg.emb.JIEBA_FLAG, del_stopword=cfg.emb.DEL_STOPWORD):
        word_list = []
        for words in words_list:
            if words == '' or not isinstance(words, str):
                continue
            tokens = jieba.cut(words) if jieba_flag else words
            if del_stopword:
                word_list.append([word for word in self.stop_word.del_stopwords(tokens)])
            else:
                word_list.append([word for word in tokens])
        return [' '.join(word) for word in word_list]
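
    # Illustrative example (assuming jieba_flag=True, del_stopword=False):
    #   _seg_word(['我爱北京天安门']) -> ['我 爱 北京 天安门']
    # With jieba_flag=False each text is split into single characters instead.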
    def fit(self, word_list):
        # segment the raw texts, then fit the vocabulary on them
        word_list = self._seg_word(word_list)
        self._gen_model(word_list)
    # build dic: fit the CountVectorizer vocabulary and persist it with pickle
    def _gen_dic(self, word_list):
        dic = self.vectorizer.fit_transform(word_list)
        with open(self.dic_path, 'wb') as f:
            pickle.dump(self.vectorizer, f)
        return dic
    # build bow model: for bag-of-words this is just fitting the vocabulary
    def _gen_model(self, word_list):
        self._gen_dic(word_list)
    def _predict(self, words):
        # transform a single text into its raw term-count vector
        bow_embedding = self.vectorizer.transform(self._seg_word([words]))
        bow_embedding = bow_embedding.toarray().sum(axis=0)
        return bow_embedding[np.newaxis, :].astype(float)
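
    # Shape note (illustrative): for a fitted vocabulary of size V, _predict
    # returns a (1, V) float array of term counts for the single input text.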
    def predict(self, words):
        # similarity between `words` and every pre-indexed text, computed as a
        # dot product of normalized bow vectors (cosine similarity, assuming
        # ModelBase._normalize performs L2 normalization)
        pre = self._normalize(self._predict(words))
        return np.dot(self.words_list_pre, pre[0])
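

# Minimal usage sketch (an illustration, not part of the original module):
# build the model over a small corpus, then score a query against it. The
# texts below are invented; cfg/const values come from the textmatch config.
if __name__ == '__main__':
    words_list = ['我爱北京天安门', '今天天气不错', '北京欢迎你']  # hypothetical corpus
    bow = Bow()
    bow.init(words_list, update=True)   # fit vocabulary and pre-index the corpus
    scores = bow.predict('北京天安门')   # one similarity score per corpus text
    print(scores)                       # the first text should score highest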