subproject3.py
294 lines (209 loc) · 12.3 KB
import json
from collections import defaultdict
from nltk.stem import PorterStemmer
from utilities import (calc_postings_size, calc_dict_size, calc_percent_change, render_table)


def subproject_3():
"""
Read the index generated from `subproject1.py`, and perform various lossy compressions to it, saving
to additional output files and recording size data along the way. Display a table at the end. Finally,
run the query processor on the selected three queries again to see what has changed.
"""
# Read the naive index into memory
with open('output/1. naive_index.txt', 'rt') as f:
index = json.load(f)
# Calculate initial sizes
INITIAL_DICT_SIZE = calc_dict_size(index)
INITIAL_POSTINGS_SIZE = calc_postings_size(index)
# Remove numbers
print("\nRemoving numbers from the index")
index = remove_numbers(index)
# Calculate size data after removing numbers
NO_NUMS_DICT_SIZE = calc_dict_size(index)
NO_NUMS_POSTINGS_SIZE = calc_postings_size(index)
PCT_CHANGE_DICT_SIZE_NO_NUMS = calc_percent_change(NO_NUMS_DICT_SIZE, INITIAL_DICT_SIZE)
PCT_CHANGE_POSTINGS_SIZE_NO_NUMS = calc_percent_change(NO_NUMS_POSTINGS_SIZE, INITIAL_POSTINGS_SIZE)
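    # Note: calc_percent_change(new, baseline) from utilities.py is assumed to report the
    # relative change of `new` against `baseline`, i.e. roughly (new - baseline) / baseline * 100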
# Do case folding
print("\nCase-folding the index")
index = case_folding(index)
# Calculate size data after case-folding
CASE_FOLDING_DICT_SIZE = calc_dict_size(index)
CASE_FOLDING_POSTINGS_SIZE = calc_postings_size(index)
PCT_CHANGE_DICT_SIZE_CASE_FOLDING = calc_percent_change(CASE_FOLDING_DICT_SIZE, NO_NUMS_DICT_SIZE)
CML_CHANGE_DICT_SIZE_CASE_FOLDING = calc_percent_change(CASE_FOLDING_DICT_SIZE, INITIAL_DICT_SIZE)
PCT_CHANGE_POSTINGS_SIZE_CASE_FOLDING = calc_percent_change(CASE_FOLDING_POSTINGS_SIZE, NO_NUMS_POSTINGS_SIZE)
CML_CHANGE_POSTINGS_SIZE_CASE_FOLDING = calc_percent_change(CASE_FOLDING_POSTINGS_SIZE, INITIAL_POSTINGS_SIZE)
# Find most common stopwords, after case-folding and number removal
print("\nCreating stopwords list based on the 150 most common terms in the index")
create_stopwords(index)
# Remove 30 stopwords
print("\nRemoving the most common 30 stopwords from the index")
index30 = stopwords30(index)
# Calculate size data after removing 30 stopwords
STOPW30_DICT_SIZE = calc_dict_size(index30)
STOPW30_POSTINGS_SIZE = calc_postings_size(index30)
PCT_CHANGE_DICT_SIZE_30_STOPW = calc_percent_change(STOPW30_DICT_SIZE, CASE_FOLDING_DICT_SIZE)
CML_CHANGE_DICT_SIZE_30_STOPW = calc_percent_change(STOPW30_DICT_SIZE, INITIAL_DICT_SIZE)
PCT_CHANGE_POSTINGS_SIZE_30_STOPW = calc_percent_change(STOPW30_POSTINGS_SIZE, CASE_FOLDING_POSTINGS_SIZE)
CML_CHANGE_POSTINGS_SIZE_30_STOPW = calc_percent_change(STOPW30_POSTINGS_SIZE, INITIAL_POSTINGS_SIZE)
# Remove 150 stopwords
print("\nRemoving the most common 150 stopwords from the index")
index150 = stopwords150(index)
# Calculate size data after removing 150 stopwords
STOPW150_DICT_SIZE = calc_dict_size(index150)
STOPW150_POSTINGS_SIZE = calc_postings_size(index150)
PCT_CHANGE_DICT_SIZE_150_STOPW = calc_percent_change(STOPW150_DICT_SIZE, CASE_FOLDING_DICT_SIZE)
CML_CHANGE_DICT_SIZE_150_STOPW = calc_percent_change(STOPW150_DICT_SIZE, INITIAL_DICT_SIZE)
PCT_CHANGE_POSTINGS_SIZE_150_STOPW = calc_percent_change(STOPW150_POSTINGS_SIZE, CASE_FOLDING_POSTINGS_SIZE)
CML_CHANGE_POSTINGS_SIZE_150_STOPW = calc_percent_change(STOPW150_POSTINGS_SIZE, INITIAL_POSTINGS_SIZE)
# Stem
print("\nStemming the index")
index = stem(index150)
# Calculate size data after stemming
STEM_DICT_SIZE = calc_dict_size(index)
STEM_POSTINGS_SIZE = calc_postings_size(index)
PCT_CHANGE_DICT_SIZE_STEM = calc_percent_change(STEM_DICT_SIZE, STOPW150_DICT_SIZE)
CML_CHANGE_DICT_SIZE_STEM = calc_percent_change(STEM_DICT_SIZE, INITIAL_DICT_SIZE)
PCT_CHANGE_POSTINGS_SIZE_STEM = calc_percent_change(STEM_POSTINGS_SIZE, STOPW150_POSTINGS_SIZE)
CML_CHANGE_POSTINGS_SIZE_STEM = calc_percent_change(STEM_POSTINGS_SIZE, INITIAL_POSTINGS_SIZE)
# Render the table to the console, featuring all the computed data
render_table(CASE_FOLDING_DICT_SIZE, CASE_FOLDING_POSTINGS_SIZE, CML_CHANGE_DICT_SIZE_150_STOPW,
CML_CHANGE_DICT_SIZE_30_STOPW, CML_CHANGE_DICT_SIZE_CASE_FOLDING, CML_CHANGE_DICT_SIZE_STEM,
CML_CHANGE_POSTINGS_SIZE_150_STOPW, CML_CHANGE_POSTINGS_SIZE_30_STOPW,
CML_CHANGE_POSTINGS_SIZE_CASE_FOLDING, CML_CHANGE_POSTINGS_SIZE_STEM, INITIAL_DICT_SIZE,
INITIAL_POSTINGS_SIZE, NO_NUMS_DICT_SIZE, NO_NUMS_POSTINGS_SIZE, PCT_CHANGE_DICT_SIZE_150_STOPW,
PCT_CHANGE_DICT_SIZE_30_STOPW, PCT_CHANGE_DICT_SIZE_CASE_FOLDING, PCT_CHANGE_DICT_SIZE_NO_NUMS,
PCT_CHANGE_DICT_SIZE_STEM, PCT_CHANGE_POSTINGS_SIZE_150_STOPW, PCT_CHANGE_POSTINGS_SIZE_30_STOPW,
PCT_CHANGE_POSTINGS_SIZE_CASE_FOLDING, PCT_CHANGE_POSTINGS_SIZE_NO_NUMS, PCT_CHANGE_POSTINGS_SIZE_STEM,
STEM_DICT_SIZE, STEM_POSTINGS_SIZE, STOPW150_DICT_SIZE, STOPW150_POSTINGS_SIZE, STOPW30_DICT_SIZE,
STOPW30_POSTINGS_SIZE)


def remove_numbers(index: dict) -> dict:
"""
    Remove all numeric keys from the given index.
    Creates a new index, keeping only the items from the given index whose keys are non-numeric;
    the postings lists of the removed keys are discarded.
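    Example: {'2024': [1, 2], 'tree': [3]} -> {'tree': [3]}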
:param index: The index to remove items with numeric keys from
:return: A new index, based on the given index, without numeric keys
"""
# Create a new index based on the given index, keeping only non-numeric keys
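    # Note: str.isnumeric() is only True for strings made up entirely of numeric characters,
    # so mixed tokens such as '3.14' or 'x86' survive this filter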
new_index = {key: val for key, val in index.items() if not key.isnumeric()}
print("Saving to file: output/2. no_numbers_index.txt")
with open("output/2. no_numbers_index.txt", "wt") as f:
json.dump(new_index, f)
return new_index


def case_folding(index: dict) -> dict:
"""
Handle case-folding the keys of the given index.
    If two or more original keys case-fold to the same new key, their postings lists must be
    merged, keeping only unique postings and sorting the resulting list.
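    e.g. 'The': [2, 5] and 'the': [1, 2] merge into 'the': [1, 2, 5].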
The result should, itself, be sorted by key, alphabetically.
Creates a new index based on the given index to avoid errors.
:param index: The index to case-fold keys for
    :return: A new index, based on the given index, whose keys are case-folded versions of the given index's keys.
Should be smaller than the given index. Should not lose any postings. Should have unique
and sorted postings for each key.
"""
# Get all the keys of the given index
index_keys = list(index.keys())
# Create a new index to return
new_index = defaultdict(list)
    # Go through each of the given index's keys
for key in index_keys:
lowered = key.lower()
# Either populate or create a key based on the lower-cased version of this key.
# The key should be associated with the postings list of that key in the original index.
# If a postings list already exists, just append to it.
    # If it doesn't exist, the defaultdict's list for that key will be [], so it can still be appended to
new_index[lowered] += index[key]
# Sort the postings list for this key, and remove duplicates
new_index[lowered] = sorted(set(new_index[lowered]))
# Sort index by keys
new_index = dict(sorted(new_index.items()))
print("Saving to file: output/3. case_folded_index.txt")
with open("output/3. case_folded_index.txt", "wt") as f:
json.dump(new_index, f)
return new_index


def stopwords30(index: dict) -> dict:
"""
Create a new index, based on the given index, with 30 stopword keys removed.
Eliminates the keys corresponding to stopwords completely, not preserving their postings lists.
Creates a new index based on the given index to avoid errors.
    :param index: The index to remove 30 stopword keys from
:return: A new index, based on the given index, with all keys corresponding to 30 stopwords removed.
"""
# Get the first 30 stopwords from stopwords.txt. The file has the 150 most common words in the corpus
with open('stopwords.txt', 'rt') as f:
        STOPWORDS = {word.strip() for word in f.readlines()[:30]}
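    # (stopwords.txt is generated by create_stopwords() below, which subproject_3 runs first)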
# Create new index based on whether the keys of the old index are stopwords
new_index = {key: val for key, val in index.items() if key not in STOPWORDS}
print("Saving to file: output/4a. 30_stopwords_index.txt")
with open("output/4a. 30_stopwords_index.txt", "wt") as f:
json.dump(new_index, f)
return new_index


def stopwords150(index: dict) -> dict:
"""
Create a new index, based on the given index, with 150 stopword keys removed.
Eliminates the keys corresponding to stopwords completely, not preserving their postings lists.
Creates a new index based on the given index to avoid errors.
    :param index: The index to remove 150 stopword keys from
:return: A new index, based on the given index, with all keys corresponding to 150 stopwords removed.
"""
# Get all stopwords from stopwords.txt. The file has the 150 most common words in the corpus
with open('stopwords.txt', 'rt') as f:
        STOPWORDS = {word.strip() for word in f}
# Create new index based on whether the keys of the old index are stopwords
new_index = {key: val for key, val in index.items() if key not in STOPWORDS}
print("Saving to file: output/4b. 150_stopwords_index.txt")
with open("output/4b. 150_stopwords_index.txt", "wt") as f:
json.dump(new_index, f)
return new_index


def stem(index: dict) -> dict:
"""
Handle stemming the keys of the given index.
    If two or more original keys stem to the same new key, their postings lists must be merged,
    keeping only unique postings and sorting the resulting list.
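    e.g. 'running' and 'runs' both stem to 'run', so their postings lists merge under 'run'.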
The result should, itself, be sorted by key, alphabetically.
Creates a new index based on the given index to avoid errors.
:param index: The index to stem keys for
    :return: A new index, based on the given index, whose keys are stemmed versions of the given index's keys.
Should be smaller than the given index. Should not lose any postings. Should have unique
and sorted postings for each key.
"""
# Get all the keys of the given index
index_keys = list(index.keys())
# Create a new index to return
new_index = defaultdict(list)
    # Create the Porter stemmer
stemmer = PorterStemmer()
    # Go through each of the given index's keys
for key in index_keys:
stemmed = stemmer.stem(key)
# Either populate or create a key based on the stemmed version of this key.
# The key should be associated with the postings list of that key in the original index.
# If a postings list already exists, just append to it.
    # If it doesn't exist, the defaultdict's list for that key will be [], so it can still be appended to
new_index[stemmed] += index[key]
# Sort the postings list for this key, and remove duplicates
new_index[stemmed] = sorted(set(new_index[stemmed]))
# Sort index by keys
new_index = dict(sorted(new_index.items()))
print("Saving to file: output/5. stemmed_index.txt")
with open("output/5. stemmed_index.txt", "wt") as f:
json.dump(new_index, f)
return new_index


def create_stopwords(index: dict) -> None:
"""
Create a list of the 150 most common terms in the index.
    This needs to happen here so that the terms are as normalized as possible before the stopwords
    are needed for removal: it would be bad if 'the' and 'The' were different stopwords, for instance,
    so it is best done after case-folding. It must happen before stemming, though, as per the table
    this subproject emulates.
Finds the most common terms by seeing which terms have the longest postings list.
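    (A term's postings-list length is its document frequency: the number of documents containing it.)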
:param index: The index to find the most common terms of
"""
# Sort by most common dictionary terms. Inspired by https://stackoverflow.com/a/50863131
sorted_index = dict(sorted(index.items(), key=lambda x: len(x[1]), reverse=True))
# Get only the 150 most common tokens
most_common_tokens_150 = list(sorted_index.keys())[:150]
# Save them to a file
print("Saving to file: stopwords.txt")
with open('stopwords.txt', 'wt') as f:
        f.write('\n'.join(most_common_tokens_150))
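

# A minimal entry point, assuming this module is meant to be run directly as a script;
# the original file does not show one, and this guard is harmless if the module is only imported:
if __name__ == '__main__':
    subproject_3()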