word_search.py (forked from jldbc/gutenberg)
from sklearn.feature_extraction.text import TfidfVectorizer
import sys, re
import numpy as np
from random import random
from operator import add
import pymongo
from pymongo import MongoClient
import string
import os
client = MongoClient()
db = client.bookdb
posts = db.posts
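
# Illustrative assumption, not taken from the original repo: query() below reads
# fields named after individual words ("$word"), so documents in `posts` are
# presumably shaped roughly like the dict here, with one numeric (e.g. TF-IDF)
# score per word alongside the author/title fields it groups on. The concrete
# values and the "pg1342" id are made up purely for illustration.
EXAMPLE_DOC = {
    "_id": "pg1342",
    "author_id_0011": "Jane Austen",
    "title_id_0011": "Pride and Prejudice",
    "quick": 0.0021,
    "brown": 0.0004,
    "fox": 0.0013,
}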

def process_input(input_string):
    """
    Return the query words as MongoDB field references: ["$word1", "$word2", ..., "$wordn"].

    Next steps (see the sketch after this function):
    - Stem texts before putting them into the database, and then stem the query
      input in this function so that results are not thrown off by plurals, etc.
    - Remove stopwords from the query input as well; scores for stopwords shouldn't matter.
    - Add some simple rules that make this smarter, e.g. cut "books about" from queries.
    """
    # strip punctuation, then prefix each word with "$" so it can be used as a field reference
    input_string = input_string.translate(str.maketrans("", "", string.punctuation))
    words = input_string.split()  # now a list
    for i in range(len(words)):
        words[i] = "$" + words[i]
    return words
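
# Sketch of the "next steps" above, not part of the original script: one way to
# stem the query and drop stopwords before building the field references. It
# assumes NLTK is installed and its stopwords corpus has been downloaded via
# nltk.download('stopwords'); the same stemming would also have to be applied to
# the texts at indexing time so the per-word field names line up.
def process_input_stemmed(input_string):
    from nltk.stem import PorterStemmer    # imported inside the function so the
    from nltk.corpus import stopwords      # rest of the script runs without NLTK
    stemmer = PorterStemmer()
    stop = set(stopwords.words("english"))
    cleaned = input_string.translate(str.maketrans("", "", string.punctuation))
    words = [w.lower() for w in cleaned.split() if w.lower() not in stop]
    return ["$" + stemmer.stem(w) for w in words]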

def query(input_str, n_results):
    words = process_input(input_str)  # words to retrieve values for, in queryable format
    # mongo query: sums the per-word score fields for every document.
    # a word that is absent from a document contributes 0 to its total.
    results = posts.aggregate(
        [
            {
                "$group": {
                    # features to include in output
                    "_id": {"_id": "$_id", "author": "$author_id_0011", "title": "$title_id_0011"},
                    # inner $sum adds the word fields (format: ["$word1", "$word2", ..., "$wordn"])
                    # within each document; the outer $sum is the group accumulator
                    "totalAmount": {"$sum": {"$sum": words}}
                }
            },
            {
                "$sort": {
                    "totalAmount": -1  # sort descending
                }
            },
            {
                "$limit": n_results  # limit to this many results
            }
        ]
    )
    return results
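
# query() hands back the aggregation results as an iterable of dicts shaped by the
# $group stage above, roughly:
#   {"_id": {"_id": <doc id>, "author": <author>, "title": <title>}, "totalAmount": <summed score>}
# ordered from best match to worst.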

# example of this in action:
out = query("the quick brown fox jumped over the lazy dog", 20)
for i in out:
    print(i)
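
# Purely illustrative sketch, not part of the original script: the TfidfVectorizer
# import at the top suggests the per-word score fields are TF-IDF weights, so one
# hypothetical way to populate the `posts` collection might look like this. The
# `books` argument (a list of (author, title, text) tuples) and the build_index
# name are assumptions for illustration only.
def build_index(books):
    texts = [text for (_author, _title, text) in books]
    vec = TfidfVectorizer(stop_words="english")
    tfidf = vec.fit_transform(texts)       # sparse matrix: rows = books, columns = vocabulary terms
    vocab = vec.vocabulary_                # maps term -> column index
    for row, (author, title, _text) in enumerate(books):
        doc = {"author_id_0011": author, "title_id_0011": title}
        for term, col in vocab.items():
            score = tfidf[row, col]
            if score > 0:
                doc[term] = float(score)   # one numeric field per word, which is what query() sums
        posts.insert_one(doc)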