Merge pull request #7 from zgornel/latest
Speed improvements and removal of string search
zgornel authored Oct 30, 2018
2 parents dc69f9c + 6f4d6b7 commit b508bf1
Showing 8 changed files with 50 additions and 98 deletions.
42 changes: 12 additions & 30 deletions Manifest.toml
@@ -20,28 +20,28 @@ uuid = "a74b3585-a348-5f62-a45c-50e91977d574"
version = "0.5.1"

[[BufferedStreams]]
deps = ["Compat", "Pkg", "Test"]
deps = ["Compat", "Test"]
git-tree-sha1 = "5d55b9486590fdda5905c275bb21ce1f0754020f"
uuid = "e1450e63-4bb3-523b-b2a4-4ffa8c0fd77d"
version = "1.0.0"

[[CMake]]
deps = ["BinDeps", "Libdl", "Pkg", "Test"]
deps = ["BinDeps", "Libdl", "Test"]
git-tree-sha1 = "74853a75c26a4a73ac391ee26ee29ebeb5583d9f"
uuid = "631607c0-34d2-5d66-819e-eb0f9aa2061a"
version = "1.1.0"

[[CMakeWrapper]]
deps = ["BinDeps", "CMake", "Libdl", "Parameters", "Pkg", "Test"]
deps = ["BinDeps", "CMake", "Libdl", "Parameters", "Test"]
git-tree-sha1 = "2b43d451639984e3571951cc687b8509b0a86c6d"
uuid = "d5fb7624-851a-54ee-a528-d3f3bac0b4a0"
version = "0.2.2"

[[CodecZlib]]
deps = ["BinaryProvider", "Libdl", "Pkg", "Test", "TranscodingStreams"]
git-tree-sha1 = "83cb3d65c37ea1364c2d5bf7bcea41843ba645dc"
deps = ["BinaryProvider", "Libdl", "Test", "TranscodingStreams"]
git-tree-sha1 = "e3df104c84dfc108f0ca203fd7f5bbdc98641ae9"
uuid = "944b1d66-785c-5afd-91f1-9de20f533193"
version = "0.5.0"
version = "0.5.1"

[[Compat]]
deps = ["Base64", "Dates", "DelimitedFiles", "Distributed", "InteractiveUtils", "LibGit2", "Libdl", "LinearAlgebra", "Markdown", "Mmap", "Pkg", "Printf", "REPL", "Random", "Serialization", "SharedArrays", "Sockets", "SparseArrays", "Statistics", "Test", "UUIDs", "Unicode"]
@@ -57,12 +57,6 @@ uuid = "ade2ca70-3891-5945-98fb-dc099432e06a"
deps = ["Mmap"]
uuid = "8bb1440f-4735-579b-a4ab-409b98df4dab"

-[[Distances]]
-deps = ["LinearAlgebra", "Pkg", "Printf", "Random", "Statistics", "Test"]
-git-tree-sha1 = "2f38605722542f1c0a32dd2856fb529d8c226c69"
-uuid = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7"
-version = "0.7.3"

[[Distributed]]
deps = ["LinearAlgebra", "Random", "Serialization", "Sockets"]
uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b"
@@ -89,14 +83,8 @@ version = "0.7.0"
deps = ["LinearAlgebra", "Markdown"]
uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240"

-[[IterTools]]
-deps = ["Pkg", "SparseArrays", "Test"]
-git-tree-sha1 = "79246285c43602384e6f1943b3554042a3712056"
-uuid = "c8e1da08-722c-5040-9ed9-7db0dc04731e"
-version = "1.1.1"

[[JSON]]
deps = ["Dates", "Distributed", "Mmap", "Pkg", "Sockets", "Test", "Unicode"]
deps = ["Dates", "Distributed", "Mmap", "Sockets", "Test", "Unicode"]
git-tree-sha1 = "fec8e4d433072731466d37ed0061b3ba7f70eeb9"
uuid = "682c06a0-de6a-54ab-a142-c8b1cf79cde6"
version = "0.19.0"
@@ -108,13 +96,13 @@ uuid = "8ef0a80b-9436-5d2c-a485-80b904378c43"
version = "0.4.0"

[[LibCURL]]
deps = ["BinaryProvider", "Compat", "Libdl", "Pkg", "Printf"]
deps = ["BinaryProvider", "Compat", "Libdl", "Printf"]
git-tree-sha1 = "6339c87cb76923a3cf947fcd213cbc364355c9c9"
uuid = "b27032c2-a3e7-50c8-80cd-2d36dbcbfd21"
version = "0.4.1"

[[LibExpat]]
deps = ["Compat", "Pkg"]
deps = ["Compat"]
git-tree-sha1 = "fde352ec13479e2f90e57939da2440fb78c5e388"
uuid = "522f3ed2-3f36-55e3-b6df-e94fee9b0c07"
version = "0.5.0"
@@ -146,13 +134,13 @@ uuid = "d6f4376e-aef5-505a-96c1-9c027394607a"
uuid = "a63ad114-7e13-5084-954f-fe012c677804"

[[OrderedCollections]]
deps = ["Pkg", "Random", "Serialization", "Test"]
deps = ["Random", "Serialization", "Test"]
git-tree-sha1 = "85619a3f3e17bb4761fe1b1fd47f0e979f964d5b"
uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d"
version = "1.0.2"

[[Parameters]]
deps = ["Markdown", "OrderedCollections", "Pkg", "REPL", "Test"]
deps = ["Markdown", "OrderedCollections", "REPL", "Test"]
git-tree-sha1 = "40f540ec96e50c0b2b9efdb11b5e4d0c63f90923"
uuid = "d96e819e-fc66-5662-9728-84c9c7592b0a"
version = "0.10.1"
@@ -194,18 +182,12 @@ uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
deps = ["LinearAlgebra", "SparseArrays"]
uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"

-[[StringDistances]]
-deps = ["Distances", "IterTools", "Test"]
-git-tree-sha1 = "41fddd579b75e0cd0d1bbdb2d68a2a9cc588c164"
-uuid = "88034a9c-02f8-509d-84a9-84ec65e18404"
-version = "0.3.0"

[[Test]]
deps = ["Distributed", "InteractiveUtils", "Logging", "Random"]
uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40"

[[TranscodingStreams]]
deps = ["DelimitedFiles", "Pkg", "Random", "Test"]
deps = ["Pkg", "Random", "Test"]
git-tree-sha1 = "a34a2d588e2d2825602bf14a24216d5c8b0921ec"
uuid = "3bb67fe8-82b1-5028-8e26-92a6c54297fa"
version = "0.8.1"
3 changes: 1 addition & 2 deletions Project.toml
@@ -1,12 +1,11 @@
name = "ConceptnetNumberbatch"
uuid = "2d1d9008-b762-11e8-11f1-375fdd6dca71"
authors = ["Corneliu Cofaru <cornel@oxoaresearch.com>"]
version = "0.1.0"
version = "0.1.2"

[deps]
CodecZlib = "944b1d66-785c-5afd-91f1-9de20f533193"
HDF5 = "f67ccb44-e63f-5c2f-98bd-6dc0ccc4ba2f"
Languages = "8ef0a80b-9436-5d2c-a485-80b904378c43"
StringDistances = "88034a9c-02f8-509d-84a9-84ec65e18404"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
TranscodingStreams = "3bb67fe8-82b1-5028-8e26-92a6c54297fa"
9 changes: 4 additions & 5 deletions README.md
@@ -116,11 +116,10 @@ julia> doc = "embed this document containing X_#-s231 which cannot be embedded"

## Remarks

-- fast for retrieving embeddings of exact matches
-- fast for retrieving embeddings of wildcard matches (`xyzabcish` is matched to `######ish`)
-- fast document embedding
-- if neither exact or wildcard matches exist, retrieval can be based on string distances (slow, see `src/search.jl`)
- for another package handling word embeddings, check out [Embeddings.jl](https://github.com/JuliaText/Embeddings.jl)
- for the best speed, the `HDF5` version should be used
+- the API is very fast for retrieving embeddings of single word exact matches
+- it is also quite fast for retrieving embeddings of wildcard matches (`xyzabcish` is matched to `######ish`) and multiple word expressions of arbitrary length (provided these are embedded)
+- the document embedding is slower (scales with document length)


## Installation
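To make the updated Remarks concrete, here is a minimal usage sketch of the API after this change. It assumes `conceptnet` has already been loaded from a Numberbatch file (loading is not shown in this diff) and reuses the document string from the README example:

```julia
using ConceptnetNumberbatch

# Assumption: `conceptnet` was loaded beforehand; loading is not part of this diff.
doc = "embed this document containing X_#-s231 which cannot be embedded"

# Exact and wildcard matching only; the `search_mismatches` keyword is gone.
embedded_doc, missed = embed_document(conceptnet, doc,
                                      keep_size=true,
                                      max_compound_word_length=2,
                                      wildcard_matching=true)

# `embedded_doc` is a Matrix{Float64}; with keep_size=true it has one column
# per token of tokenize_for_conceptnet(doc), and `missed` marks the tokens
# that fell back to zero-vector columns.
```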
1 change: 0 additions & 1 deletion REQUIRE
@@ -3,4 +3,3 @@ TranscodingStreams
CodecZlib
HDF5
Languages
-StringDistances
1 change: 0 additions & 1 deletion src/ConceptnetNumberbatch.jl
@@ -24,7 +24,6 @@ using TranscodingStreams
using CodecZlib
using HDF5
using Languages
-using StringDistances

import Base: get, getindex, size, length, show, keys, values, in

2 changes: 1 addition & 1 deletion src/defaults.jl
@@ -51,4 +51,4 @@ const LANGUAGES = Dict(:en=>Languages.English(),
)

# Regular expression on which to split text into tokens
const DEFAULT_SPLITTER = r"(,|:|\\|\/|;|\.|\[|\]|\{|\}|\"|\"|\s+)"
const DEFAULT_SPLITTER = r"(,|\n|\r|\:|\\|\/|;|\.|\[|\]|\{|\}|\'|\`|\"|\"|\?|\!|\=|\~|\&|\s+)"
75 changes: 27 additions & 48 deletions src/document_embeddings.jl
@@ -1,7 +1,11 @@
"""
Fast tokenization function.
"""
-function tokenize_for_conceptnet(doc::AbstractString, splitter::Regex=DEFAULT_SPLITTER)
+tokenize_for_conceptnet(doc::Vector{S}, splitter::Regex=DEFAULT_SPLITTER
+                       ) where S<:AbstractString = begin doc end
+
+tokenize_for_conceptnet(doc::S, splitter::Regex=DEFAULT_SPLITTER
+                       ) where S<:AbstractString = begin
# First, split
tokens = strip.(split(doc, splitter))
# Filter out empty strings
@@ -20,9 +24,7 @@ function embed_document(conceptnet::ConceptNet,
compound_word_separator::String="_",
max_compound_word_length::Int=1,
wildcard_matching::Bool=false,
-search_mismatches::Symbol=:no,
-print_matched_words::Bool=false,
-distance=Levenshtein())
+print_matched_words::Bool=false)
# Split document into tokens and embed
return embed_document(conceptnet,
tokenize_for_conceptnet(document),
@@ -31,9 +33,7 @@ compound_word_separator=compound_word_separator,
compound_word_separator=compound_word_separator,
max_compound_word_length=max_compound_word_length,
wildcard_matching=wildcard_matching,
search_mismatches=search_mismatches,
print_matched_words=print_matched_words,
distance=distance)
print_matched_words=print_matched_words)
end

function embed_document(conceptnet::ConceptNet,
@@ -43,9 +43,8 @@ function embed_document(conceptnet::ConceptNet,
compound_word_separator::String="_",
max_compound_word_length::Int=1,
wildcard_matching::Bool=false,
-search_mismatches::Symbol=:no,
-print_matched_words::Bool=false,
-distance=Levenshtein()) where S<:AbstractString
+print_matched_words::Bool=false
+) where S<:AbstractString
# Initializations
embeddings = conceptnet.embeddings[language]
# Get positions of words that can be used for indexing (found)
@@ -69,31 +68,10 @@ function embed_document(conceptnet::ConceptNet,
not_found_positions = setdiff(1:length(document_tokens),
collect.(found_positions)...)
words_not_found = document_tokens[not_found_positions]
-if keep_size && !isempty(words_not_found) # keep_size has precendence
+if keep_size
for word in words_not_found
-if search_mismatches == :no
-# Insert not found words if exact matches are to be
-# returned only if a matrix of width equal to the
-# number of terms is to be returned
-push!(found_words, word)
-elseif search_mismatches == :brute_force
-match_word = ""
-distmin = Inf
-for dict_word in keys(embeddings)
-dist = evaluate(distance, word, dict_word)
-if dist < distmin
-distmin = dist
-match_word = dict_word
-end
-end
-push!(found_words, match_word)
-else
-@warn "The only supported approximate string matching" *
-" method is :brute_force. Use :no for skipping the" *
-" search; will not search."
-push!(found_words, word)
-end
-end
+push!(found_words, word) # the zero-vectors will be the
+end # last columns of the document matrix
end
# Return
if print_matched_words
@@ -139,12 +117,12 @@
# more_complicated,
# complicated]
function token_search(conceptnet::ConceptNet{L,K,V},
-tokens::S;
+tokens::Vector{S};
language::L=Languages.English(),
separator::String="_",
max_length::Int=3,
wildcard_matching::Bool=false) where
-{L<:Language, K, V, S<:AbstractVector{<:AbstractString}}
+{L<:Language, K, V, S<:AbstractString}
# Initializations
if wildcard_matching
# Build function that checks whether a token is found in conceptnet
@@ -163,21 +141,22 @@ function token_search(conceptnet::ConceptNet{L,K,V},
i = 1
j = n
while i <= n
-    token = join(tokens[i:j], separator, separator)
-    is_match = check_function(conceptnet, language, token, V())
-    if is_match && j-i+1 <= max_length
-        push!(found, i:j)
-        i = j + 1
-        j = n
-        continue
-    else
-        if i == j
-            j = n
-            i+= 1
-        else
-            j-= 1
-        end
-    end
+    if j-i+1 <= max_length
+        token = join(tokens[i:j], separator, separator)
+        is_match = check_function(conceptnet, language, token, V())
+        if is_match
+            push!(found, i:j)
+            i = j + 1
+            j = n
+            continue
+        end
+    end
+    if i == j
+        j = n
+        i+= 1
+    else
+        j-= 1
+    end
end
return found
end
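The restructured loop is where most of the advertised speed-up lives: the cheap `j-i+1 <= max_length` check now runs before the `join` and the dictionary lookup, so over-long spans are shrunk without paying for either, and the Levenshtein brute-force fallback (the string search of the commit title) is removed outright. Below is a self-contained sketch of the same greedy longest-match scan, with a plain `Set` of phrases standing in (hypothetically) for the ConceptNet embeddings table and its `check_function`:

```julia
# Sketch only: `vocabulary` is a toy stand-in for conceptnet.embeddings[language].
function token_search_sketch(vocabulary::Set{String},
                             tokens::Vector{String};
                             separator::String="_",
                             max_length::Int=3)
    found = UnitRange{Int}[]
    n = length(tokens)
    i = 1
    j = n
    while i <= n
        if j - i + 1 <= max_length              # cheap length check first,
            token = join(tokens[i:j], separator)
            if token in vocabulary              # lookup only for short spans
                push!(found, i:j)
                i = j + 1                       # restart right after the match
                j = n
                continue
            end
        end
        if i == j      # single unmatched token: advance the window
            j = n
            i += 1
        else           # shrink the candidate span from the right
            j -= 1
        end
    end
    return found
end

# Usage: "new_york" is a two-token compound phrase in the toy vocabulary.
vocab = Set(["new_york", "is", "big"])
token_search_sketch(vocab, ["new", "york", "is", "big"], max_length=2)
# -> [1:2, 3:3, 4:4]
```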
15 changes: 5 additions & 10 deletions test/runtests.jl
@@ -117,16 +117,14 @@ end
embedded_doc, missed = embed_document(conceptnet,
doc,
keep_size=false,
-max_compound_word_length=1,
-search_mismatches=:no)
+max_compound_word_length=1)
@test embedded_doc isa Matrix{Float64}
@test isempty(embedded_doc)
@test length(missed) == 3
embedded_doc, missed = embed_document(conceptnet,
doc,
keep_size=true,
-max_compound_word_length=1,
-search_mismatches=:no)
+max_compound_word_length=1)
@test embedded_doc isa Matrix{Float64}
@test size(embedded_doc, 2) == length(tokenize_for_conceptnet(doc))
@test length(missed) == 3
@@ -135,25 +133,22 @@
embedded_doc_2, missed = embed_document(conceptnet,
doc_2,
keep_size=false,
-max_compound_word_length=2,
-search_mismatches=:no)
+max_compound_word_length=2)
@test embedded_doc_2 isa Matrix{Float64}
@test isempty(embedded_doc_2)
@test length(missed) == length(tokenize_for_conceptnet(doc_2))
embedded_doc_2, missed = embed_document(conceptnet,
doc_2,
keep_size=true,
-max_compound_word_length=2,
-search_mismatches=:no)
+max_compound_word_length=2)
@test embedded_doc_2 isa Matrix{Float64}
@test size(embedded_doc_2, 2) == length(tokenize_for_conceptnet(doc_2))
@test length(missed) == length(tokenize_for_conceptnet(doc_2))
embedded_doc_2, missed = embed_document(conceptnet,
doc_2,
keep_size=true,
wildcard_matching=true,
-max_compound_word_length=2,
-search_mismatches=:no)
+max_compound_word_length=2)
@show missed
@test length(missed) == 0
end
