forked from Bookworm-project/BookwormDB
-
Notifications
You must be signed in to change notification settings - Fork 0
/
WordsTableCreate.py
executable file
·49 lines (42 loc) · 1.53 KB
/
WordsTableCreate.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
#!/usr/bin/python
import os
import sys
import re
# Upper bound on the number of distinct words written to the word list.
maxDictionaryLength = 3000000


def count_words(textids_dir='../texts/textids',
                unigrams_dir='../texts/unigrams'):
    """Return a dict mapping each word to its total count over all texts.

    Every file in ``textids_dir`` is read line by line; the second
    tab-separated field of each line names a text.  The matching file
    ``<unigrams_dir>/<name>.txt`` is then read, each line holding a word
    and an integer count separated by a single space.

    Whitespace-only lines are skipped.  Any other malformed line still
    raises (IndexError / ValueError) so corrupt input is not hidden.
    """
    wordcounts = dict()
    for idfile in os.listdir(textids_dir):
        # 'with' closes each handle promptly instead of leaking it.
        with open(textids_dir + '/' + idfile, 'r') as ids:
            for line in ids:
                if not line.strip():
                    continue
                filename = line.split('\t')[1].rstrip('\n')
                with open(unigrams_dir + '/' + filename + '.txt') as reading:
                    for entry in reading:
                        if not entry.strip():
                            continue
                        fields = entry.split(' ')
                        word = fields[0]
                        # int() ignores the trailing newline by itself.
                        count = int(fields[1])
                        wordcounts[word] = wordcounts.get(word, 0) + count
    return wordcounts


def minimum_count_for_inclusion(wordcounts, max_length=maxDictionaryLength):
    """Return the smallest total count a word needs to be kept.

    The cutoff is found dynamically: words are grouped by their count and
    the groups are walked from the most frequent downwards.  As soon as
    adding a group would push the number of kept words above
    ``max_length``, that whole group (and everything rarer) is excluded.
    Returns 1 (keep everything) when all words fit.
    """
    # countcounts[c] == number of distinct words whose total count is c.
    countcounts = dict()
    for count in wordcounts.values():
        countcounts[count] = countcounts.get(count, 0) + 1

    kept = 0
    for count in sorted(countcounts, reverse=True):
        kept += countcounts[count]
        if kept > max_length:
            # Including this group overshoots the limit, so the cutoff
            # must sit just above it; using `count` itself would keep the
            # group and let the dictionary exceed max_length.
            return count + 1
    return 1


def rank_words(wordcounts, minimum_count=1):
    """Return [(wordid, word, count), ...] ordered by descending count.

    Words whose count is below ``minimum_count`` are dropped.  Word ids
    start at 1.  A filtered copy is built rather than deleting from
    ``wordcounts`` in place, because deleting from a dict while iterating
    over it raises RuntimeError on Python 3.
    """
    kept = dict((word, count) for word, count in wordcounts.items()
                if count >= minimum_count)
    ordered = sorted(kept, key=kept.get, reverse=True)
    return [(wordid, word, kept[word])
            for wordid, word in enumerate(ordered, 1)]


def write_wordlist(ranked, path='../texts/wordlist/wordlist.txt'):
    """Write one ``wordid<TAB>word<TAB>count`` line per ranked entry."""
    with open(path, 'w') as outfile:
        for wordid, word, count in ranked:
            outfile.write(str(wordid) + '\t' + word + '\t' + str(count) + '\n')


def main():
    """Build the word list from the unigram files and write it to disk."""
    wordcounts = count_words()
    cutoff = minimum_count_for_inclusion(wordcounts, maxDictionaryLength)
    write_wordlist(rank_words(wordcounts, cutoff))


if __name__ == '__main__':
    main()