-
Notifications
You must be signed in to change notification settings - Fork 5
/
ex - Counting Words
48 lines (36 loc) · 1.38 KB
/
ex - Counting Words
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
import re
"""Count words."""
def count_words(text):
    """Count how many times each unique word occurs in text.

    The text is lowercased, then every maximal run of alphanumeric
    characters is treated as one word. Whitespace, punctuation and
    underscores act as separators and never appear inside a word.

    Args:
        text: The string to analyse.

    Returns:
        A dict of { <word>: <count> } pairs, keyed by lowercase word.
        The dict is empty when text contains no alphanumeric characters.
    """
    # Extract the words directly rather than splitting on separators.
    # A separator pattern that is able to match the empty string makes
    # re.split (Python 3.7+) cut between every pair of letters, and any
    # split-based approach leaves '' entries at the edges of the list
    # that then have to be filtered out again.
    words = re.findall(r"[^\W_]+", text.lower())

    counts = {}  # dictionary of { <word>: <count> } pairs to return
    for word in words:
        counts[word] = counts.get(word, 0) + 1
    return counts
def test_run():
    """Print the most and least frequent words found in input.txt.

    Reads the whole of "input.txt", counts its words with count_words,
    orders the (word, count) pairs by descending count, and prints the
    first ten and the last ten pairs of that ordering as tab-separated
    tables.
    """
    with open("input.txt", "r") as handle:
        contents = handle.read()

    tally = count_words(contents)
    ranked = sorted(tally.items(), key=lambda item: item[1], reverse=True)

    def show(heading, rows):
        # One table: a heading line followed by "word<TAB>count" rows.
        print(heading)
        for word, count in rows:
            print("{}\t{}".format(word, count))

    show("10 most common words:\nWord\tCount", ranked[:10])
    show("\n10 least common words:\nWord\tCount", ranked[-10:])
# Run the demo only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    test_run()