Merge pull request #7 from zgornel/latest
Speed improvements and removal of string search
zgornel authored Oct 30, 2018
2 parents dc69f9c + 6f4d6b7 commit b508bf1
Showing 8 changed files with 50 additions and 98 deletions.
42 changes: 12 additions & 30 deletions Manifest.toml
@@ -20,28 +20,28 @@ uuid = "a74b3585-a348-5f62-a45c-50e91977d574"
version = "0.5.1"

[[BufferedStreams]]
deps = ["Compat", "Pkg", "Test"]
deps = ["Compat", "Test"]
git-tree-sha1 = "5d55b9486590fdda5905c275bb21ce1f0754020f"
uuid = "e1450e63-4bb3-523b-b2a4-4ffa8c0fd77d"
version = "1.0.0"

[[CMake]]
deps = ["BinDeps", "Libdl", "Pkg", "Test"]
deps = ["BinDeps", "Libdl", "Test"]
git-tree-sha1 = "74853a75c26a4a73ac391ee26ee29ebeb5583d9f"
uuid = "631607c0-34d2-5d66-819e-eb0f9aa2061a"
version = "1.1.0"

[[CMakeWrapper]]
deps = ["BinDeps", "CMake", "Libdl", "Parameters", "Pkg", "Test"]
deps = ["BinDeps", "CMake", "Libdl", "Parameters", "Test"]
git-tree-sha1 = "2b43d451639984e3571951cc687b8509b0a86c6d"
uuid = "d5fb7624-851a-54ee-a528-d3f3bac0b4a0"
version = "0.2.2"

[[CodecZlib]]
deps = ["BinaryProvider", "Libdl", "Pkg", "Test", "TranscodingStreams"]
git-tree-sha1 = "83cb3d65c37ea1364c2d5bf7bcea41843ba645dc"
deps = ["BinaryProvider", "Libdl", "Test", "TranscodingStreams"]
git-tree-sha1 = "e3df104c84dfc108f0ca203fd7f5bbdc98641ae9"
uuid = "944b1d66-785c-5afd-91f1-9de20f533193"
version = "0.5.0"
version = "0.5.1"

[[Compat]]
deps = ["Base64", "Dates", "DelimitedFiles", "Distributed", "InteractiveUtils", "LibGit2", "Libdl", "LinearAlgebra", "Markdown", "Mmap", "Pkg", "Printf", "REPL", "Random", "Serialization", "SharedArrays", "Sockets", "SparseArrays", "Statistics", "Test", "UUIDs", "Unicode"]
@@ -57,12 +57,6 @@ uuid = "ade2ca70-3891-5945-98fb-dc099432e06a"
deps = ["Mmap"]
uuid = "8bb1440f-4735-579b-a4ab-409b98df4dab"

-[[Distances]]
-deps = ["LinearAlgebra", "Pkg", "Printf", "Random", "Statistics", "Test"]
-git-tree-sha1 = "2f38605722542f1c0a32dd2856fb529d8c226c69"
-uuid = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7"
-version = "0.7.3"

[[Distributed]]
deps = ["LinearAlgebra", "Random", "Serialization", "Sockets"]
uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b"
@@ -89,14 +83,8 @@ version = "0.7.0"
deps = ["LinearAlgebra", "Markdown"]
uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240"

-[[IterTools]]
-deps = ["Pkg", "SparseArrays", "Test"]
-git-tree-sha1 = "79246285c43602384e6f1943b3554042a3712056"
-uuid = "c8e1da08-722c-5040-9ed9-7db0dc04731e"
-version = "1.1.1"

[[JSON]]
deps = ["Dates", "Distributed", "Mmap", "Pkg", "Sockets", "Test", "Unicode"]
deps = ["Dates", "Distributed", "Mmap", "Sockets", "Test", "Unicode"]
git-tree-sha1 = "fec8e4d433072731466d37ed0061b3ba7f70eeb9"
uuid = "682c06a0-de6a-54ab-a142-c8b1cf79cde6"
version = "0.19.0"
@@ -108,13 +96,13 @@ uuid = "8ef0a80b-9436-5d2c-a485-80b904378c43"
version = "0.4.0"

[[LibCURL]]
deps = ["BinaryProvider", "Compat", "Libdl", "Pkg", "Printf"]
deps = ["BinaryProvider", "Compat", "Libdl", "Printf"]
git-tree-sha1 = "6339c87cb76923a3cf947fcd213cbc364355c9c9"
uuid = "b27032c2-a3e7-50c8-80cd-2d36dbcbfd21"
version = "0.4.1"

[[LibExpat]]
deps = ["Compat", "Pkg"]
deps = ["Compat"]
git-tree-sha1 = "fde352ec13479e2f90e57939da2440fb78c5e388"
uuid = "522f3ed2-3f36-55e3-b6df-e94fee9b0c07"
version = "0.5.0"
@@ -146,13 +134,13 @@ uuid = "d6f4376e-aef5-505a-96c1-9c027394607a"
uuid = "a63ad114-7e13-5084-954f-fe012c677804"

[[OrderedCollections]]
deps = ["Pkg", "Random", "Serialization", "Test"]
deps = ["Random", "Serialization", "Test"]
git-tree-sha1 = "85619a3f3e17bb4761fe1b1fd47f0e979f964d5b"
uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d"
version = "1.0.2"

[[Parameters]]
deps = ["Markdown", "OrderedCollections", "Pkg", "REPL", "Test"]
deps = ["Markdown", "OrderedCollections", "REPL", "Test"]
git-tree-sha1 = "40f540ec96e50c0b2b9efdb11b5e4d0c63f90923"
uuid = "d96e819e-fc66-5662-9728-84c9c7592b0a"
version = "0.10.1"
@@ -194,18 +182,12 @@ uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
deps = ["LinearAlgebra", "SparseArrays"]
uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"

-[[StringDistances]]
-deps = ["Distances", "IterTools", "Test"]
-git-tree-sha1 = "41fddd579b75e0cd0d1bbdb2d68a2a9cc588c164"
-uuid = "88034a9c-02f8-509d-84a9-84ec65e18404"
-version = "0.3.0"

[[Test]]
deps = ["Distributed", "InteractiveUtils", "Logging", "Random"]
uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40"

[[TranscodingStreams]]
deps = ["DelimitedFiles", "Pkg", "Random", "Test"]
deps = ["Pkg", "Random", "Test"]
git-tree-sha1 = "a34a2d588e2d2825602bf14a24216d5c8b0921ec"
uuid = "3bb67fe8-82b1-5028-8e26-92a6c54297fa"
version = "0.8.1"
3 changes: 1 addition & 2 deletions Project.toml
@@ -1,12 +1,11 @@
name = "ConceptnetNumberbatch"
uuid = "2d1d9008-b762-11e8-11f1-375fdd6dca71"
authors = ["Corneliu Cofaru <cornel@oxoaresearch.com>"]
version = "0.1.0"
version = "0.1.2"

[deps]
CodecZlib = "944b1d66-785c-5afd-91f1-9de20f533193"
HDF5 = "f67ccb44-e63f-5c2f-98bd-6dc0ccc4ba2f"
Languages = "8ef0a80b-9436-5d2c-a485-80b904378c43"
StringDistances = "88034a9c-02f8-509d-84a9-84ec65e18404"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
TranscodingStreams = "3bb67fe8-82b1-5028-8e26-92a6c54297fa"
9 changes: 4 additions & 5 deletions README.md
@@ -116,11 +116,10 @@ julia> doc = "embed this document containing X_#-s231 which cannot be embedded"

## Remarks

-- fast for retrieving embeddings of exact matches
-- fast for retrieving embeddings of wildcard matches (`xyzabcish` is matched to `######ish`)
-- fast document embedding
-- if neither exact or wildcard matches exist, retrieval can be based on string distances (slow, see `src/search.jl`)
- for another package handling word embeddings, check out [Embeddings.jl](https://github.com/JuliaText/Embeddings.jl)
- for the best speed, the `HDF5` version should be used
+- the API is very fast for retrieving embeddings of single word exact matches
+- it is also quite fast for retrieving embeddings of wildcard matches (`xyzabcish` is matched to `######ish`) and multiple word expressions of arbitrary length (provided these are embedded)
+- the document embedding is slower (scales with document length)


## Installation
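To make the updated Remarks concrete, here is a minimal usage sketch of the API after this change. It assumes `conceptnet` has already been loaded from a Numberbatch file (loading is not shown in this diff) and reuses the document string from the README example:

```julia
using ConceptnetNumberbatch

# Assumption: `conceptnet` was loaded beforehand; loading is not part of this diff.
doc = "embed this document containing X_#-s231 which cannot be embedded"

# Exact and wildcard matching only; the `search_mismatches` keyword is gone.
embedded_doc, missed = embed_document(conceptnet, doc,
                                      keep_size=true,
                                      max_compound_word_length=2,
                                      wildcard_matching=true)

# `embedded_doc` is a Matrix{Float64}; with keep_size=true it has one column
# per token of tokenize_for_conceptnet(doc), and `missed` marks the tokens
# that fell back to zero-vector columns.
```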
1 change: 0 additions & 1 deletion REQUIRE
@@ -3,4 +3,3 @@ TranscodingStreams
CodecZlib
HDF5
Languages
-StringDistances
1 change: 0 additions & 1 deletion src/ConceptnetNumberbatch.jl
@@ -24,7 +24,6 @@ using TranscodingStreams
using CodecZlib
using HDF5
using Languages
-using StringDistances

import Base: get, getindex, size, length, show, keys, values, in

2 changes: 1 addition & 1 deletion src/defaults.jl
@@ -51,4 +51,4 @@ const LANGUAGES = Dict(:en=>Languages.English(),
)

# Regular expression on which to split text into tokens
const DEFAULT_SPLITTER = r"(,|:|\\|\/|;|\.|\[|\]|\{|\}|\"|\"|\s+)"
const DEFAULT_SPLITTER = r"(,|\n|\r|\:|\\|\/|;|\.|\[|\]|\{|\}|\'|\`|\"|\"|\?|\!|\=|\~|\&|\s+)"
75 changes: 27 additions & 48 deletions src/document_embeddings.jl
@@ -1,7 +1,11 @@
"""
Fast tokenization function.
"""
-function tokenize_for_conceptnet(doc::AbstractString, splitter::Regex=DEFAULT_SPLITTER)
+tokenize_for_conceptnet(doc::Vector{S}, splitter::Regex=DEFAULT_SPLITTER
+                       ) where S<:AbstractString = begin doc end
+
+tokenize_for_conceptnet(doc::S, splitter::Regex=DEFAULT_SPLITTER
+                       ) where S<:AbstractString = begin
# First, split
tokens = strip.(split(doc, splitter))
# Filter out empty strings
@@ -20,9 +24,7 @@ function embed_document(conceptnet::ConceptNet,
compound_word_separator::String="_",
max_compound_word_length::Int=1,
wildcard_matching::Bool=false,
-search_mismatches::Symbol=:no,
-print_matched_words::Bool=false,
-distance=Levenshtein())
+print_matched_words::Bool=false)
# Split document into tokens and embed
return embed_document(conceptnet,
tokenize_for_conceptnet(document),
@@ -31,9 +33,7 @@ compound_word_separator=compound_word_separator,
compound_word_separator=compound_word_separator,
max_compound_word_length=max_compound_word_length,
wildcard_matching=wildcard_matching,
search_mismatches=search_mismatches,
print_matched_words=print_matched_words,
distance=distance)
print_matched_words=print_matched_words)
end

function embed_document(conceptnet::ConceptNet,
@@ -43,9 +43,8 @@ function embed_document(conceptnet::ConceptNet,
compound_word_separator::String="_",
max_compound_word_length::Int=1,
wildcard_matching::Bool=false,
-search_mismatches::Symbol=:no,
-print_matched_words::Bool=false,
-distance=Levenshtein()) where S<:AbstractString
+print_matched_words::Bool=false
+) where S<:AbstractString
# Initializations
embeddings = conceptnet.embeddings[language]
# Get positions of words that can be used for indexing (found)
@@ -69,31 +68,10 @@ function embed_document(conceptnet::ConceptNet,
not_found_positions = setdiff(1:length(document_tokens),
collect.(found_positions)...)
words_not_found = document_tokens[not_found_positions]
-if keep_size && !isempty(words_not_found) # keep_size has precendence
+if keep_size
for word in words_not_found
-if search_mismatches == :no
-# Insert not found words if exact matches are to be
-# returned only if a matrix of width equal to the
-# number of terms is to be returned
-push!(found_words, word)
-elseif search_mismatches == :brute_force
-match_word = ""
-distmin = Inf
-for dict_word in keys(embeddings)
-dist = evaluate(distance, word, dict_word)
-if dist < distmin
-distmin = dist
-match_word = dict_word
-end
-end
-push!(found_words, match_word)
-else
-@warn "The only supported approximate string matching" *
-" method is :brute_force. Use :no for skipping the" *
-" search; will not search."
-push!(found_words, word)
-end
-end
+push!(found_words, word) # the zero-vectors will be the
+end # last columns of the document matrix
end
# Return
if print_matched_words
@@ -139,12 +117,12 @@
# more_complicated,
# complicated]
function token_search(conceptnet::ConceptNet{L,K,V},
-tokens::S;
+tokens::Vector{S};
language::L=Languages.English(),
separator::String="_",
max_length::Int=3,
wildcard_matching::Bool=false) where
-{L<:Language, K, V, S<:AbstractVector{<:AbstractString}}
+{L<:Language, K, V, S<:AbstractString}
# Initializations
if wildcard_matching
# Build function that checks whether a token is found in conceptnet
@@ -163,21 +141,22 @@ function token_search(conceptnet::ConceptNet{L,K,V},
i = 1
j = n
while i <= n
-    token = join(tokens[i:j], separator, separator)
-    is_match = check_function(conceptnet, language, token, V())
-    if is_match && j-i+1 <= max_length
-        push!(found, i:j)
-        i = j + 1
-        j = n
-        continue
-    else
-        if i == j
-            j = n
-            i+= 1
-        else
-            j-= 1
-        end
-    end
+    if j-i+1 <= max_length
+        token = join(tokens[i:j], separator, separator)
+        is_match = check_function(conceptnet, language, token, V())
+        if is_match
+            push!(found, i:j)
+            i = j + 1
+            j = n
+            continue
+        end
+    end
+    if i == j
+        j = n
+        i+= 1
+    else
+        j-= 1
+    end
end
return found
end
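The restructured loop is where most of the advertised speed-up lives: the cheap `j-i+1 <= max_length` check now runs before the `join` and the dictionary lookup, so over-long spans are shrunk without paying for either, and the Levenshtein brute-force fallback (the string search of the commit title) is removed outright. Below is a self-contained sketch of the same greedy longest-match scan, with a plain `Set` of phrases standing in (hypothetically) for the ConceptNet embeddings table and its `check_function`:

```julia
# Sketch only: `vocabulary` is a toy stand-in for conceptnet.embeddings[language].
function token_search_sketch(vocabulary::Set{String},
                             tokens::Vector{String};
                             separator::String="_",
                             max_length::Int=3)
    found = UnitRange{Int}[]
    n = length(tokens)
    i = 1
    j = n
    while i <= n
        if j - i + 1 <= max_length              # cheap length check first,
            token = join(tokens[i:j], separator)
            if token in vocabulary              # lookup only for short spans
                push!(found, i:j)
                i = j + 1                       # restart right after the match
                j = n
                continue
            end
        end
        if i == j      # single unmatched token: advance the window
            j = n
            i += 1
        else           # shrink the candidate span from the right
            j -= 1
        end
    end
    return found
end

# Usage: "new_york" is a two-token compound phrase in the toy vocabulary.
vocab = Set(["new_york", "is", "big"])
token_search_sketch(vocab, ["new", "york", "is", "big"], max_length=2)
# -> [1:2, 3:3, 4:4]
```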
15 changes: 5 additions & 10 deletions test/runtests.jl
@@ -117,16 +117,14 @@ end
embedded_doc, missed = embed_document(conceptnet,
doc,
keep_size=false,
-max_compound_word_length=1,
-search_mismatches=:no)
+max_compound_word_length=1)
@test embedded_doc isa Matrix{Float64}
@test isempty(embedded_doc)
@test length(missed) == 3
embedded_doc, missed = embed_document(conceptnet,
doc,
keep_size=true,
-max_compound_word_length=1,
-search_mismatches=:no)
+max_compound_word_length=1)
@test embedded_doc isa Matrix{Float64}
@test size(embedded_doc, 2) == length(tokenize_for_conceptnet(doc))
@test length(missed) == 3
@@ -135,25 +133,22 @@
embedded_doc_2, missed = embed_document(conceptnet,
doc_2,
keep_size=false,
-max_compound_word_length=2,
-search_mismatches=:no)
+max_compound_word_length=2)
@test embedded_doc_2 isa Matrix{Float64}
@test isempty(embedded_doc_2)
@test length(missed) == length(tokenize_for_conceptnet(doc_2))
embedded_doc_2, missed = embed_document(conceptnet,
doc_2,
keep_size=true,
-max_compound_word_length=2,
-search_mismatches=:no)
+max_compound_word_length=2)
@test embedded_doc_2 isa Matrix{Float64}
@test size(embedded_doc_2, 2) == length(tokenize_for_conceptnet(doc_2))
@test length(missed) == length(tokenize_for_conceptnet(doc_2))
embedded_doc_2, missed = embed_document(conceptnet,
doc_2,
keep_size=true,
wildcard_matching=true,
-max_compound_word_length=2,
-search_mismatches=:no)
+max_compound_word_length=2)
@show missed
@test length(missed) == 0
end
