diff --git a/Manifest.toml b/Manifest.toml index 57b8b67..934324d 100644 --- a/Manifest.toml +++ b/Manifest.toml @@ -20,28 +20,28 @@ uuid = "a74b3585-a348-5f62-a45c-50e91977d574" version = "0.5.1" [[BufferedStreams]] -deps = ["Compat", "Pkg", "Test"] +deps = ["Compat", "Test"] git-tree-sha1 = "5d55b9486590fdda5905c275bb21ce1f0754020f" uuid = "e1450e63-4bb3-523b-b2a4-4ffa8c0fd77d" version = "1.0.0" [[CMake]] -deps = ["BinDeps", "Libdl", "Pkg", "Test"] +deps = ["BinDeps", "Libdl", "Test"] git-tree-sha1 = "74853a75c26a4a73ac391ee26ee29ebeb5583d9f" uuid = "631607c0-34d2-5d66-819e-eb0f9aa2061a" version = "1.1.0" [[CMakeWrapper]] -deps = ["BinDeps", "CMake", "Libdl", "Parameters", "Pkg", "Test"] +deps = ["BinDeps", "CMake", "Libdl", "Parameters", "Test"] git-tree-sha1 = "2b43d451639984e3571951cc687b8509b0a86c6d" uuid = "d5fb7624-851a-54ee-a528-d3f3bac0b4a0" version = "0.2.2" [[CodecZlib]] -deps = ["BinaryProvider", "Libdl", "Pkg", "Test", "TranscodingStreams"] -git-tree-sha1 = "83cb3d65c37ea1364c2d5bf7bcea41843ba645dc" +deps = ["BinaryProvider", "Libdl", "Test", "TranscodingStreams"] +git-tree-sha1 = "e3df104c84dfc108f0ca203fd7f5bbdc98641ae9" uuid = "944b1d66-785c-5afd-91f1-9de20f533193" -version = "0.5.0" +version = "0.5.1" [[Compat]] deps = ["Base64", "Dates", "DelimitedFiles", "Distributed", "InteractiveUtils", "LibGit2", "Libdl", "LinearAlgebra", "Markdown", "Mmap", "Pkg", "Printf", "REPL", "Random", "Serialization", "SharedArrays", "Sockets", "SparseArrays", "Statistics", "Test", "UUIDs", "Unicode"] @@ -57,12 +57,6 @@ uuid = "ade2ca70-3891-5945-98fb-dc099432e06a" deps = ["Mmap"] uuid = "8bb1440f-4735-579b-a4ab-409b98df4dab" -[[Distances]] -deps = ["LinearAlgebra", "Pkg", "Printf", "Random", "Statistics", "Test"] -git-tree-sha1 = "2f38605722542f1c0a32dd2856fb529d8c226c69" -uuid = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7" -version = "0.7.3" - [[Distributed]] deps = ["LinearAlgebra", "Random", "Serialization", "Sockets"] uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b" @@ -89,14 +83,8 @@ version = "0.7.0" deps = ["LinearAlgebra", "Markdown"] uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240" -[[IterTools]] -deps = ["Pkg", "SparseArrays", "Test"] -git-tree-sha1 = "79246285c43602384e6f1943b3554042a3712056" -uuid = "c8e1da08-722c-5040-9ed9-7db0dc04731e" -version = "1.1.1" - [[JSON]] -deps = ["Dates", "Distributed", "Mmap", "Pkg", "Sockets", "Test", "Unicode"] +deps = ["Dates", "Distributed", "Mmap", "Sockets", "Test", "Unicode"] git-tree-sha1 = "fec8e4d433072731466d37ed0061b3ba7f70eeb9" uuid = "682c06a0-de6a-54ab-a142-c8b1cf79cde6" version = "0.19.0" @@ -108,13 +96,13 @@ uuid = "8ef0a80b-9436-5d2c-a485-80b904378c43" version = "0.4.0" [[LibCURL]] -deps = ["BinaryProvider", "Compat", "Libdl", "Pkg", "Printf"] +deps = ["BinaryProvider", "Compat", "Libdl", "Printf"] git-tree-sha1 = "6339c87cb76923a3cf947fcd213cbc364355c9c9" uuid = "b27032c2-a3e7-50c8-80cd-2d36dbcbfd21" version = "0.4.1" [[LibExpat]] -deps = ["Compat", "Pkg"] +deps = ["Compat"] git-tree-sha1 = "fde352ec13479e2f90e57939da2440fb78c5e388" uuid = "522f3ed2-3f36-55e3-b6df-e94fee9b0c07" version = "0.5.0" @@ -146,13 +134,13 @@ uuid = "d6f4376e-aef5-505a-96c1-9c027394607a" uuid = "a63ad114-7e13-5084-954f-fe012c677804" [[OrderedCollections]] -deps = ["Pkg", "Random", "Serialization", "Test"] +deps = ["Random", "Serialization", "Test"] git-tree-sha1 = "85619a3f3e17bb4761fe1b1fd47f0e979f964d5b" uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d" version = "1.0.2" [[Parameters]] -deps = ["Markdown", "OrderedCollections", "Pkg", "REPL", "Test"] +deps = ["Markdown", "OrderedCollections", "REPL", "Test"] git-tree-sha1 = "40f540ec96e50c0b2b9efdb11b5e4d0c63f90923" uuid = "d96e819e-fc66-5662-9728-84c9c7592b0a" version = "0.10.1" @@ -194,18 +182,12 @@ uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" deps = ["LinearAlgebra", "SparseArrays"] uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" -[[StringDistances]] -deps = ["Distances", "IterTools", "Test"] -git-tree-sha1 = "41fddd579b75e0cd0d1bbdb2d68a2a9cc588c164" -uuid = "88034a9c-02f8-509d-84a9-84ec65e18404" -version = "0.3.0" - [[Test]] deps = ["Distributed", "InteractiveUtils", "Logging", "Random"] uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40" [[TranscodingStreams]] -deps = ["DelimitedFiles", "Pkg", "Random", "Test"] +deps = ["Pkg", "Random", "Test"] git-tree-sha1 = "a34a2d588e2d2825602bf14a24216d5c8b0921ec" uuid = "3bb67fe8-82b1-5028-8e26-92a6c54297fa" version = "0.8.1" diff --git a/Project.toml b/Project.toml index 88040fe..c49f0e1 100644 --- a/Project.toml +++ b/Project.toml @@ -1,12 +1,11 @@ name = "ConceptnetNumberbatch" uuid = "2d1d9008-b762-11e8-11f1-375fdd6dca71" authors = ["Corneliu Cofaru "] -version = "0.1.0" +version = "0.1.2" [deps] CodecZlib = "944b1d66-785c-5afd-91f1-9de20f533193" HDF5 = "f67ccb44-e63f-5c2f-98bd-6dc0ccc4ba2f" Languages = "8ef0a80b-9436-5d2c-a485-80b904378c43" -StringDistances = "88034a9c-02f8-509d-84a9-84ec65e18404" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" TranscodingStreams = "3bb67fe8-82b1-5028-8e26-92a6c54297fa" diff --git a/README.md b/README.md index 0ea2a97..d3bc98e 100644 --- a/README.md +++ b/README.md @@ -116,11 +116,10 @@ julia> doc = "embed this document containing X_#-s231 which cannot be embedded" ## Remarks - - fast for retrieving embeddings of exact matches - - fast for retrieving embeddings of wildcard matches (`xyzabcish` is matched to `######ish`) - - fast document embedding - - if neither exact or wildcard matches exist, retrieval can be based on string distances (slow, see `src/search.jl`) - - for another package handling word embeddings, check out [Embeddings.jl](https://github.com/JuliaText/Embeddings.jl) + - for the best speed, the `HDF5` version should be used + - the API is very fast for retrieving embeddings of single word exact matches + - it is also quite fast for retrieving embeddings of wildcard matches (`xyzabcish` is matched to `######ish`) and multiple word expressions of arbitrary length (provided these are embedded) + - the document embedding is slower (scales with document length) ## Installation diff --git a/REQUIRE b/REQUIRE index 4b31580..e215079 100644 --- a/REQUIRE +++ b/REQUIRE @@ -3,4 +3,3 @@ TranscodingStreams CodecZlib HDF5 Languages -StringDistances diff --git a/src/ConceptnetNumberbatch.jl b/src/ConceptnetNumberbatch.jl index a80651c..0e62941 100644 --- a/src/ConceptnetNumberbatch.jl +++ b/src/ConceptnetNumberbatch.jl @@ -24,7 +24,6 @@ using TranscodingStreams using CodecZlib using HDF5 using Languages -using StringDistances import Base: get, getindex, size, length, show, keys, values, in diff --git a/src/defaults.jl b/src/defaults.jl index 5be2f29..391f707 100644 --- a/src/defaults.jl +++ b/src/defaults.jl @@ -51,4 +51,4 @@ const LANGUAGES = Dict(:en=>Languages.English(), ) # Regular expression on which to split text into tokens -const DEFAULT_SPLITTER = r"(,|:|\\|\/|;|\.|\[|\]|\{|\}|\"|\"|\s+)" +const DEFAULT_SPLITTER = r"(,|\n|\r|\:|\\|\/|;|\.|\[|\]|\{|\}|\'|\`|\"|\"|\?|\!|\=|\~|\&|\s+)" diff --git a/src/document_embeddings.jl b/src/document_embeddings.jl index 0084fc0..de93af0 100644 --- a/src/document_embeddings.jl +++ b/src/document_embeddings.jl @@ -1,7 +1,11 @@ """ Fast tokenization function. """ -function tokenize_for_conceptnet(doc::AbstractString, splitter::Regex=DEFAULT_SPLITTER) +tokenize_for_conceptnet(doc::Vector{S}, splitter::Regex=DEFAULT_SPLITTER + ) where S<:AbstractString = begin doc end + +tokenize_for_conceptnet(doc::S, splitter::Regex=DEFAULT_SPLITTER + ) where S<:AbstractString = begin # First, split tokens = strip.(split(doc, splitter)) # Filter out empty strings @@ -20,9 +24,7 @@ function embed_document(conceptnet::ConceptNet, compound_word_separator::String="_", max_compound_word_length::Int=1, wildcard_matching::Bool=false, - search_mismatches::Symbol=:no, - print_matched_words::Bool=false, - distance=Levenshtein()) + print_matched_words::Bool=false) # Split document into tokens and embed return embed_document(conceptnet, tokenize_for_conceptnet(document), @@ -31,9 +33,7 @@ function embed_document(conceptnet::ConceptNet, compound_word_separator=compound_word_separator, max_compound_word_length=max_compound_word_length, wildcard_matching=wildcard_matching, - search_mismatches=search_mismatches, - print_matched_words=print_matched_words, - distance=distance) + print_matched_words=print_matched_words) end function embed_document(conceptnet::ConceptNet, @@ -43,9 +43,8 @@ function embed_document(conceptnet::ConceptNet, compound_word_separator::String="_", max_compound_word_length::Int=1, wildcard_matching::Bool=false, - search_mismatches::Symbol=:no, - print_matched_words::Bool=false, - distance=Levenshtein()) where S<:AbstractString + print_matched_words::Bool=false + ) where S<:AbstractString # Initializations embeddings = conceptnet.embeddings[language] # Get positions of words that can be used for indexing (found) @@ -69,31 +68,10 @@ function embed_document(conceptnet::ConceptNet, not_found_positions = setdiff(1:length(document_tokens), collect.(found_positions)...) words_not_found = document_tokens[not_found_positions] - if keep_size && !isempty(words_not_found) # keep_size has precendence + if keep_size for word in words_not_found - if search_mismatches == :no - # Insert not found words if exact matches are to be - # returned only if a matrix of width equal to the - # number of terms is to be returned - push!(found_words, word) - elseif search_mismatches == :brute_force - match_word = "" - distmin = Inf - for dict_word in keys(embeddings) - dist = evaluate(distance, word, dict_word) - if dist < distmin - distmin = dist - match_word = dict_word - end - end - push!(found_words, match_word) - else - @warn "The only supported approximate string matching" * - " method is :brute_force. Use :no for skipping the" * - " search; will not search." - push!(found_words, word) - end - end + push!(found_words, word) # the zero-vectors will be the + end # last columns of the document matrix end # Return if print_matched_words @@ -139,12 +117,12 @@ end # more_complicated, # complicated] function token_search(conceptnet::ConceptNet{L,K,V}, - tokens::S; + tokens::Vector{S}; language::L=Languages.English(), separator::String="_", max_length::Int=3, wildcard_matching::Bool=false) where - {L<:Language, K, V, S<:AbstractVector{<:AbstractString}} + {L<:Language, K, V, S<:AbstractString} # Initializations if wildcard_matching # Build function that checks whether a token is found in conceptnet @@ -163,21 +141,22 @@ function token_search(conceptnet::ConceptNet{L,K,V}, i = 1 j = n while i <= n - token = join(tokens[i:j], separator, separator) - is_match = check_function(conceptnet, language, token, V()) - if is_match && j-i+1 <= max_length - push!(found, i:j) - i = j + 1 - j = n - continue - else - if i == j + if j-i+1 <= max_length + token = join(tokens[i:j], separator, separator) + is_match = check_function(conceptnet, language, token, V()) + if is_match + push!(found, i:j) + i = j + 1 j = n - i+= 1 - else - j-= 1 + continue end end + if i == j + j = n + i+= 1 + else + j-= 1 + end end return found end diff --git a/test/runtests.jl b/test/runtests.jl index 30c79fd..34309cf 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -117,16 +117,14 @@ end embedded_doc, missed = embed_document(conceptnet, doc, keep_size=false, - max_compound_word_length=1, - search_mismatches=:no) + max_compound_word_length=1) @test embedded_doc isa Matrix{Float64} @test isempty(embedded_doc) @test length(missed) == 3 embedded_doc, missed = embed_document(conceptnet, doc, keep_size=true, - max_compound_word_length=1, - search_mismatches=:no) + max_compound_word_length=1) @test embedded_doc isa Matrix{Float64} @test size(embedded_doc, 2) == length(tokenize_for_conceptnet(doc)) @test length(missed) == 3 @@ -135,16 +133,14 @@ end embedded_doc_2, missed = embed_document(conceptnet, doc_2, keep_size=false, - max_compound_word_length=2, - search_mismatches=:no) + max_compound_word_length=2) @test embedded_doc_2 isa Matrix{Float64} @test isempty(embedded_doc_2) @test length(missed) == length(tokenize_for_conceptnet(doc_2)) embedded_doc_2, missed = embed_document(conceptnet, doc_2, keep_size=true, - max_compound_word_length=2, - search_mismatches=:no) + max_compound_word_length=2) @test embedded_doc_2 isa Matrix{Float64} @test size(embedded_doc_2, 2) == length(tokenize_for_conceptnet(doc_2)) @test length(missed) == length(tokenize_for_conceptnet(doc_2)) @@ -152,8 +148,7 @@ end doc_2, keep_size=true, wildcard_matching=true, - max_compound_word_length=2, - search_mismatches=:no) + max_compound_word_length=2) @show missed @test length(missed) == 0 end