-
Notifications
You must be signed in to change notification settings - Fork 0
/
get_datasets.py
95 lines (83 loc) · 3.54 KB
/
get_datasets.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
import numpy as np
import sqlite3
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer
# Shared NLP normalizers used by lemmatize_stem(): a WordNet lemmatizer and
# an English Snowball stemmer (both from NLTK, imported above).
lemmer=WordNetLemmatizer()
stemmer = SnowballStemmer('english')
def convert_to_numpy_array(query_results):
    """Convert an iterable of row tuples (e.g. sqlite3 results) into a 2-D numpy array."""
    rows = [list(record) for record in query_results]
    return np.array(rows)
def get_data(types, attrs, test=0, needDescription = False):
    """Fetch route rows from routes.db as a numpy array.

    Column 0 of the result is ROUND(stars); the remaining columns are the
    selected attributes.

    Parameters
    ----------
    types : list[str] | None
        Route-type column names (e.g. "Trad", "Sport"); rows matching any
        of them are kept.  Also narrows the attribute list to a fixed short
        set unless `attrs` overrides it.
    attrs : list[str] | None
        Explicit column names to select after the rounded star rating.
    test : int
        Value of the `test` column to filter on (0 = training rows).
    needDescription : bool
        When True, restrict to rows with description_length > 0.
    """
    attr_clause = "latitude, longitude, saftey, difficulty, Trad, Ice, Sport, TR, Alpine, Snow, Mixed, Aid, Boulder, Other"
    # FIX: bind `test` as a query parameter instead of interpolating it
    # into the SQL string.
    where_clause = "test = ? AND stars IS NOT NULL"
    if types is not None:
        # NOTE: `types` entries become column names and cannot be bound as
        # parameters — callers must pass trusted identifiers only.
        type_filters = " OR ".join(" %s = 1" % t for t in types)
        where_clause += " AND (" + type_filters + ")"
        attr_clause = "latitude, longitude, saftey, difficulty, description_length"
    if attrs is not None:
        attr_clause = ", ".join(attrs)
    if needDescription:
        where_clause += " AND description_length > 0"
    query = "SELECT ROUND(stars), %s FROM routes WHERE %s" % (attr_clause, where_clause)
    conn = sqlite3.connect("routes.db")
    try:
        data = conn.cursor().execute(query, (test,)).fetchall()
    finally:
        # FIX: always close the connection, even when the query raises
        # (the original leaked the connection on error).
        conn.close()
    # Inline row conversion (same result as convert_to_numpy_array) so the
    # function stands alone.
    return np.array([list(row) for row in data])
def get_test_routes():
    """Return (features, stars) for the held-out rows (test = 1)."""
    rows = get_data(None, None, 1)
    labels = rows[:, 0]
    features = rows[:, 1:]
    return features, labels
def get_datasets(types=None, attrs=None):
    """Load training rows and return an 80/20 train/test split of (features, stars)."""
    rows = get_data(types, attrs)
    labels = rows[:, 0]
    features = rows[:, 1:]
    return train_test_split(features, labels, test_size=0.20)
def binary_stars(stars_train, stars_test):
    """Binarize star ratings in both splits: rating > 3 maps to 1, else 0."""
    def to_binary(ratings):
        # Strictly greater than 3 counts as a "good" route.
        return [1 if rating > 3 else 0 for rating in ratings]
    return to_binary(stars_train), to_binary(stars_test)
def lemmatize_stem(data):
    """Stem then lemmatize the text in column 1 of every row, mutating `data` in place.

    Returns the same object for convenience.
    """
    for row in data:
        # First pass: Snowball-stem each whitespace-split token.
        row[1] = ' '.join(stemmer.stem(token) for token in row[1].split(' '))
        # Second pass: WordNet-lemmatize the stemmed tokens.
        row[1] = ' '.join(lemmer.lemmatize(token) for token in row[1].split(' '))
    return data
def get_words(types):
    """Return an 80/20 train/test split over (description, integer star) pairs."""
    rows = get_data(types, attrs=["description"], needDescription = True)
    rows = lemmatize_stem(np.array(rows))
    # Ratings come back as strings like "4.0"; coerce via float to int.
    labels = [int(float(star)) for star in rows[:, 0]]
    descriptions = np.array([row[0] for row in rows[:, 1:]])
    return train_test_split(descriptions, labels, test_size=0.20)
def tfid(bag):
    """Apply TF-IDF weighting to a bag-of-words count matrix and return the result."""
    return TfidfTransformer().fit_transform(bag)
def get_test_bag_of_words(vocab):
    """Build TF-IDF features for the held-out (test = 1) route descriptions.

    Parameters
    ----------
    vocab : mapping or iterable of terms
        Fixed vocabulary so test-set columns line up with the training
        vocabulary.

    Returns
    -------
    (features, stars): dense TF-IDF matrix and list of integer star labels.
    """
    route_data = get_data(None, attrs=["description"], test=1, needDescription = True)
    # FIX: the original used `x[1] is not ""` — an identity comparison with
    # a literal, which is not guaranteed to detect empty strings (and warns
    # on CPython 3.8+).  Use equality instead.
    route_data = np.array([np.array(x) for x in route_data
                           if x[1] is not None and x[1] != ""])
    route_data = lemmatize_stem(route_data)
    # Ratings come back as strings like "4.0"; coerce via float to int.
    stars = [int(float(s)) for s in route_data[:, 0]]
    data = np.array([d[0] for d in route_data[:, 1:]])
    # Fixed vocabulary keeps column order identical to the training matrix.
    testVectorizer = CountVectorizer(vocabulary=vocab)
    counts = testVectorizer.fit_transform(data)
    weighted = tfid(counts)
    return weighted.toarray(), stars
def get_bag_of_words(types=None, max_features=100):
    """Build TF-IDF train/test matrices over route descriptions.

    Parameters
    ----------
    types : list[str] | None
        Route-type filter passed through to get_words().
    max_features : int
        Vocabulary size cap for the training CountVectorizer.

    Returns
    -------
    (X_train, X_test, y_train, y_test, vocab) where the matrices are dense
    TF-IDF arrays and vocab is the list of training terms.
    """
    vectorizer = CountVectorizer(stop_words="english", max_features=max_features)
    xtrain, xtest, ytrain, ytest = get_words(types)
    X = tfid(vectorizer.fit_transform(xtrain))
    # FIX: get_feature_names() was removed in scikit-learn 1.2; use the
    # replacement when available and fall back for older versions.
    if hasattr(vectorizer, "get_feature_names_out"):
        vocab = list(vectorizer.get_feature_names_out())
    else:
        vocab = vectorizer.get_feature_names()
    # Re-vectorize the test split with the training vocabulary so columns align.
    testVectorizer = CountVectorizer(vocabulary=vocab)
    X_test = tfid(testVectorizer.fit_transform(xtest))
    return X.toarray(), X_test.toarray(), ytrain, ytest, vocab