forked from alexvollmer/pci4r
-
Notifications
You must be signed in to change notification settings - Fork 0
/
filtering-with-tokenise.diff
157 lines (150 loc) · 4.75 KB
/
filtering-with-tokenise.diff
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
From 7b87830e7c31a0eade5c80f8aa8beee3ab97681a Mon Sep 17 00:00:00 2001
From: Roy Pardee <roy@kif.(none)>
Date: Sun, 29 Mar 2009 10:30:53 -0700
Subject: [PATCH] Added string tokenise stuff
---
lib/filtering.rb | 61 +++++++++++++++++++++++++++++++++++++++++++++++-
spec/filtering_spec.rb | 42 ++++++++++++++++++++++++++++++++-
2 files changed, 101 insertions(+), 2 deletions(-)
mode change 100644 => 100755 spec/filtering_spec.rb
diff --git a/lib/filtering.rb b/lib/filtering.rb
index e707f32..cd30b78 100644
--- a/lib/filtering.rb
+++ b/lib/filtering.rb
@@ -1,4 +1,5 @@
require "set"
+require "stemmer"
##
# This module provides a number of +Classifier+ classes that are instantiated,
@@ -516,4 +517,62 @@ module Filtering
[sum, 1.0].min
end
end
-end
\ No newline at end of file
+end
+
+class String
+
+ @@n_gram_length = 1
+
+ # These are copied uncritically from Lucas Carlson's Classifier library.
+ @@stop_words = %w(a again all along are also an and as at but by came can cant couldnt did didn didnt do doesnt dont ever first from have her here him how i if in into is isnt it itll just last least like most my new no not now of on or should sinc so some th than this that the their then those to told too true try until url us were when whether while with within yes you youll)
+
+ def self.n_gram_length=(newval)
+ @@n_gram_length = newval
+ end
+
+ def self.stop_words=(newval)
+ @@stop_words = newval
+ end
+ def self.stop_words
+ @@stop_words
+ end
+ def self.add_stop_words(adds)
+ @@stop_words += adds.map {|w| w.downcase}
+ @@stop_words.uniq!
+ end
+ def self.remove_stop_words(removes)
+ @@stop_words -= removes
+ end
+ def tokenise
+ # Doing some regex massaging.
+ # Replace single or double dashes w/a single space. Replace tildes w/spaces
+ text = gsub(/(-{1,2}|~)/, ' ').downcase
+
+ # Eat punctuation
+ text.gsub!(/[^\w\s]/, '')
+
+ # Change digits to placeholder
+ text.gsub!(/\d/, '#')
+
+ words = text.split.select {|w| w.length > 1 and not(@@stop_words.include?(w))}
+
+ tokens = []
+ 0.upto(words.length) do |i|
+ @@n_gram_length.downto(1) do |k|
+ chunk = words[i, k]
+ if chunk.length == k then
+ # TODO?: keep from crossing obvious sentence boundaries (e.g., test for periods in elements other than chunk[-1])
+ tokens << chunk.map do |w|
+ # puts(w)
+ w.stem
+ end.join('_')
+ end
+ end
+ end
+
+ return tokens # + words.map {|w| w.stem}
+
+ end
+
+end
+
diff --git a/spec/filtering_spec.rb b/spec/filtering_spec.rb
old mode 100644
new mode 100755
index 6276b47..f2a6c4c
--- a/spec/filtering_spec.rb
+++ b/spec/filtering_spec.rb
@@ -4,6 +4,40 @@ require File.join(File.dirname(__FILE__), "..", "lib", "filtering")
require "tempfile"
describe "Filtering" do
+ describe "tokenise" do
+ it "should ignore stop words" do
+ @words = "I told him to ignore most of these".tokenise
+ @words.should_not be_member("told".stem)
+ @words.should_not be_member("him".stem)
+ @words.should be_member("ignore".stem)
+ @words.should be_member("these".stem)
+ end
+
+ it "should honor added stop words" do
+ String.add_stop_words(%w(ignore all these))
+ @words = "I told him to ignore most of these".tokenise
+ @words.should_not be_member("told".stem)
+ @words.should_not be_member("him".stem)
+ @words.should_not be_member("ignore".stem)
+ @words.should_not be_member("these".stem)
+ end
+
+ it "should honor removed stop words" do
+ String.remove_stop_words(%w(dont not cant couldnt))
+ @words = "I don't want you trying to do things you can't".tokenise
+ @words.should be_member("dont".stem)
+ @words.should be_member("cant".stem)
+ end
+
+ it "should tokenise properly" do
+ str = "The fisherman fished a fish out of the fishing holes"
+ expected = %w(fisherman fish fish out fish hole)
+ str.tokenise.should == expected
+ String.n_gram_length = 2
+ expected = %w(fisherman_fish fisherman fish_fish fish fish_out fish out_fish out fish_hole fish hole)
+ str.tokenise.should == expected
+ end
+ end
describe "get_words" do
before(:each) do
@@ -130,6 +164,11 @@ describe "Filtering" do
@classifier.minimums[:bad] = 0.8
@classifier.classify("quick money").should == :good
end
+
+ it "should handle long documents w/out barfing" do
+ @classifier.classify("make penis fast " * 100000).should == :bad
+ end
+
end
describe "ActiveRecord persistence" do
@@ -161,5 +200,6 @@ describe "Filtering" do
@classifier.feature_count("dog", :good).should == 1.0
@classifier.feature_count("dog", :bad).should == 0.0
end
+
end
-end
\ No newline at end of file
+end
--
1.5.4.4