-
Notifications
You must be signed in to change notification settings - Fork 2
/
demo_new_docs.py
96 lines (82 loc) · 2.78 KB
/
demo_new_docs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
#-*- coding: utf-8 -*-
"""
@author: yingwenjie
"""
import os
import sys
import string
import numpy as np
import scipy
import scipy.io
import jieba
import jieba.posseg as pseg
from jieba import analyse
from sklearn import feature_extraction
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
import re
from utils import *
def load_lines(path):
    """Return every line of the UTF-8 text file *path*, whitespace-stripped."""
    with open(path, encoding="utf-8") as f:
        return [line.strip() for line in f]


def build_doc_matrix(vocabulary, keyword_lists):
    """Build the binary word-by-document matrix for the query documents.

    Args:
        vocabulary: list of words; a word's position is its matrix row.
        keyword_lists: one iterable of keywords per query document.

    Returns:
        A float array of shape ``(len(vocabulary), len(keyword_lists))`` whose
        entry ``[r, c]`` is 1 when document ``c`` contains vocabulary word
        ``r`` and 0 otherwise. Keywords missing from the vocabulary are
        ignored.
    """
    # A dict gives O(1) lookups instead of scanning the vocabulary list for
    # every keyword. setdefault keeps the FIRST row of a duplicated word,
    # which is what list.index() returned.
    row_of = {}
    for row, word in enumerate(vocabulary):
        row_of.setdefault(word, row)

    doc_vec = np.zeros((len(vocabulary), len(keyword_lists)))
    for col, keywords in enumerate(keyword_lists):
        for word in keywords:
            row = row_of.get(word)
            if row is not None:
                doc_vec[row, col] = 1
    return doc_vec


def infer_hash_bits(logPX1_B1, logPX1_B0, doc_vec):
    """Infer the 0/1 hash bits of each document column in *doc_vec*.

    Args:
        logPX1_B1: array of shape ``(bits, vocabulary)`` loaded from arg.mat.
        logPX1_B0: array of the same shape loaded from arg.mat.
        doc_vec: binary matrix of shape ``(vocabulary, documents)``.

    Returns:
        An integer array of shape ``(bits, documents)`` holding 0 or 1.
    """
    log_ratio = np.dot(logPX1_B1, doc_vec) - np.dot(logPX1_B0, doc_vec)
    # The exponent is capped at 32 before exponentiating with base 2
    # (presumably to keep the odds well inside floating-point range).
    odds = np.power(2, np.minimum(log_ratio, 32))
    prob_one = odds / (1 + odds)
    # A bit is set only when the probability is strictly above one half;
    # an exact tie (e.g. a document with no known keywords) yields 0.
    return (prob_one > 0.5).astype(int)


def pack_bits(bit_column):
    """Pack a sequence of 0/1 values into an int, first element most significant.

    Pure integer arithmetic is used so codes longer than 53 bits are exact
    (accumulating the sum in floating point would round them).
    """
    code = 0
    for bit in bit_column:
        code = (code << 1) | int(bit)
    return code


def group_docs_by_code(codes, docs):
    """Map each hash code to the list of documents that carry it.

    ``codes[i]`` is the hash code of ``docs[i]``. Documents beyond
    ``len(codes)`` are ignored.

    Raises:
        ValueError: if there are fewer documents than codes.
    """
    if len(docs) < len(codes):
        raise ValueError(
            "expected at least %d training documents, got %d"
            % (len(codes), len(docs))
        )
    buckets = {}
    for code, doc in zip(codes, docs):
        buckets.setdefault(code, []).append(doc)
    return buckets


def find_neighbors(code, buckets, max_distance=1, limit=10):
    """Collect documents whose hash code is within *max_distance* bits of *code*.

    Buckets are visited in dictionary order; the search stops as soon as
    *limit* documents have been collected.

    Returns:
        A list of at most *limit* ``(bucket_code, document, distance)`` tuples.
    """
    hits = []
    if limit <= 0:
        return hits
    for bucket_code, docs in buckets.items():
        # Hamming distance: number of differing bits between the two codes.
        distance = bin(code ^ bucket_code).count('1')
        if distance > max_distance:
            continue
        for doc in docs:
            hits.append((bucket_code, doc, distance))
            if len(hits) >= limit:
                return hits
    return hits


def main(argv):
    """Hash each line of a query file and print training docs with close codes.

    ``argv[1]`` is the task name (data is read from ``./data/<task_name>``)
    and ``argv[2]`` is the path of the query file, one document per line.
    For every query line, up to 10 training documents whose hash code differs
    in at most one bit are printed as tab-separated
    ``query_code, query_text, match_code, match_text, distance``.
    """
    if len(argv) < 3:
        sys.exit("usage: python demo_new_docs.py <task_name> <query_file>")
    task_name = argv[1]
    filename = argv[2]
    task_dir = "./data/" + task_name

    arg = scipy.io.loadmat(task_dir + "/arg.mat")
    B_index = arg['B_index'][0, :].tolist()
    logPX1_B1 = arg['logPX1_B1']
    logPX1_B0 = arg['logPX1_B0']

    # NOTE(review): all text inputs are read as UTF-8 (the vocabulary file is
    # named arg_word.utf8); confirm the other two files use the same encoding.
    train_docs = load_lines(task_dir + "/seg_file_orgin")
    dics = load_lines(task_dir + "/arg_word.utf8")

    extract_tags = analyse.extract_tags
    test_docs = []
    test_seg_list = []
    with open(filename, encoding="utf-8") as f:
        for line in f:
            sample = line.strip('\n')
            test_docs.append(sample)
            # Represent each query by its top-5 jieba keywords.
            test_seg_list.append(extract_tags(sample, topK=5))

    doc_vec = build_doc_matrix(dics, test_seg_list)
    test_B = infer_hash_bits(logPX1_B1, logPX1_B0, doc_vec)
    buckets = group_docs_by_code(B_index, train_docs)

    for i, query in enumerate(test_docs):
        code = pack_bits(test_B[:, i])
        for bucket_code, doc, distance in find_neighbors(code, buckets):
            print("\t".join(
                (str(code), query, str(bucket_code), doc, str(distance))
            ))


if __name__ == "__main__":
    main(sys.argv)