diff --git a/Project.toml b/Project.toml
index 63a7c5e..84d9468 100644
--- a/Project.toml
+++ b/Project.toml
@@ -7,14 +7,17 @@ version = "0.1.0"
 CircularArrays = "7a955b69-7140-5f4e-a0ed-f168c5e2e749"
 DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
 OffsetArrays = "6fe1bfb0-de20-5000-8ca7-80f57d26f881"
-ProgressMeter = "92933f4c-e287-5a05-a399-4b506db050ca"
 
 [compat]
+CircularArrays = "1"
+DataStructures = "0.18"
+OffsetArrays = "1"
 julia = "1"
 
 [extras]
 Faker = "0efc519c-db33-5916-ab87-703215c3906f"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
+Suppressor = "fd094767-a336-5f1f-9728-57cf17d0bbfb"
 
 [targets]
-test = ["Test", "Faker"]
+test = ["Test", "Faker", "Suppressor"]
diff --git a/README.md b/README.md
index 9f07e30..900293d 100644
--- a/README.md
+++ b/README.md
@@ -17,6 +17,8 @@ This package is be particulary useful for natural language processing tasks whic
 - [X] Support for unicodes
 - [ ] Custom user defined feature generation methods
 - [ ] Mecab-based tokenizer support
+- [X] Support for building databases directly from text files
+- [ ] Support for persistent databases
 
 ## Suported String Similarity Measures
 
@@ -24,6 +26,7 @@ This package is be particulary useful for natural language processing tasks whic
 - [X] Jaccard coefficient
 - [X] Cosine coefficient
 - [X] Overlap coefficient
+- [X] Exact match
 
 ## Installation
 
diff --git a/docs/src/index.md b/docs/src/index.md
index ef9edaa..06d50b2 100644
--- a/docs/src/index.md
+++ b/docs/src/index.md
@@ -16,7 +16,7 @@ This package is be particulary useful for natural language processing tasks whic
 - [X] Support for unicodes
 - [ ] Custom user defined feature generation methods
 - [ ] Mecab-based tokenizer support
-- [ ] Support for building databases directly from text files
+- [X] Support for building databases directly from text files
 - [ ] Support for persistent databases
 
 ## Suported String Similarity Measures
@@ -64,6 +64,8 @@ push!(db, "fooo");
 
 # Convinient approach is to use an array of strings for multiple entries: `append!(db, ["foo", "bar", "fooo"]);`
 
+# OR: Build database from text files: `append!(db, "YOUR_FILE_NAME.txt");`
+
 # Retrieve the closest match(es)
 res = search(Dice(), db, "foo"; α=0.8, ranked=true)
 # 2-element Vector{Tuple{String, Float64}}:
@@ -72,7 +74,7 @@ res = search(Dice(), db, "foo"; α=0.8, ranked=true)
 
 # Describe a working database collection
 desc = describe_collection(db)
-# (total_collection = 3, avg_num_ngrams = 4.5, total_ngrams = 13)
+# (total_collection = 3, avg_size_ngrams = 4.5, total_ngrams = 13)
 ```
 
 ## TODO: Benchmarks
diff --git a/extras/benchmark_sim.jl b/extras/benchmark_sim.jl
new file mode 100644
index 0000000..82f9e9e
--- /dev/null
+++ b/extras/benchmark_sim.jl
@@ -0,0 +1,48 @@
+using SimString
+using Faker
+using BenchmarkTools
+using DataStructures
+
+################################# Benchmark Bulk addition #####################
+db = DictDB(CharacterNGrams(3, " "));
+Faker.seed(2020)
+@time fake_names = [string(Faker.first_name(), " ", Faker.last_name()) for i in 1:100_000];
+
+
+f(d, x) = append!(d, x)
+@time f(db, fake_names)
+
+
+
+################################ Simple Addition ###############################
+
+db = DictDB(CharacterNGrams(2, " "));
+push!(db, "foo");
+push!(db, "bar");
+push!(db, "fooo");
+
+f(x, c, s, a, r) = search(x, c, s; α=a, ranked=r)
+test = "foo";
+col = db;
+sim = Cosine();
+a = 0.8;
+r = true;
+
+f(Cosine(), db, "foo", 0.8, true)
+
+@btime f($sim, $col, $test, $a, $r)
+@btime search(Cosine(), db, "foo"; α=0.8, ranked=true)
+
+
+
+db2 = DictDB(CharacterNGrams(3, " "));
+append!(db2, ["foo", "bar", "fooo", "foor"]) # also works via multiple dispatch on a vector
+
+results = search(Cosine(), db, "foo"; α=0.8, ranked=true) # yet to be implemented
+
+bs = ["foo", "bar", "foo", "foo", "bar"]
+SimString.extract_features(CharacterNGrams(3, " "), "prepress")
+SimString.extract_features(WordNGrams(2, " ", " "), "You are a really really really cool dude.")
+
+db = DictDB(WordNGrams(2, " ", " "))
+push!(db, "You are a really really really cool dude.")
diff --git a/src/SimString.jl b/src/SimString.jl
index 8def7a0..12a0f45 100644
--- a/src/SimString.jl
+++ b/src/SimString.jl
@@ -2,7 +2,6 @@ module SimString
 
 import Base: push!, append!
 using DataStructures: DefaultOrderedDict, DefaultDict
-using ProgressMeter
 using CircularArrays
 using OffsetArrays
 
diff --git a/src/dictdb.jl b/src/dictdb.jl
index 7ae857c..f4570ec 100644
--- a/src/dictdb.jl
+++ b/src/dictdb.jl
@@ -87,6 +87,7 @@ Basic summary stats for the DB
 db = DictDB(CharacterNGrams(2, " "));
 append!(db, ["foo", "bar", "fooo"]);
 describe_collection(db)
+(total_collection = 3, avg_size_ngrams = 4.5, total_ngrams = 13)
 
 # Returns
 * NamedTuples: Summary stats for the DB
@@ -98,7 +99,7 @@ function describe_collection(db::DictDB)
 # Total number of strings in collection
 ∑ = length(db.string_collection)
 
-# Average number of ngram features
+# Average size of ngram features
 n = [x for x in keys(db.string_size_map)]
 μ = sum(n) / length(n)
 
@@ -108,7 +109,19 @@ for i in values(db.string_feature_map)
 total_ngrams += length(i)
 end
 
-return (total_collection = ∑, avg_num_ngrams = μ, total_ngrams = total_ngrams)
+return (total_collection = ∑, avg_size_ngrams = μ, total_ngrams = total_ngrams)
+end
+
+
+"""
+Pretty print summary stats for the DB
+"""
+function Base.show(io::IO, x::DictDB)
+    metrics = describe_collection(x)
+    println(io, "DictDB($(x.feature_extractor))")
+    println(io, "Total collection: ", metrics.total_collection)
+    println(io, "Average number of ngram features: ", metrics.avg_size_ngrams)
+    println(io, "Total number of ngram features: ", metrics.total_ngrams)
 end
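Note: the `Base.show` method added above is pinned down by the new "Test output from show" testset in `test/test04_search.jl` further below. For reference, a populated collection renders as follows (REPL-style sketch; the printed lines are copied from that test's expected output):

```julia
using SimString

db = DictDB(CharacterNGrams(2, " "));
append!(db, ["foo", "bar", "fooo"]);

show(db)
# DictDB(SimString.CharacterNGrams{Int64, String}(2, " "))
# Total collection: 3
# Average number of ngram features: 4.5
# Total number of ngram features: 13
```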
diff --git a/src/features.jl b/src/features.jl
index 2ebc0ca..8486b3b 100644
--- a/src/features.jl
+++ b/src/features.jl
@@ -99,8 +99,25 @@ end
 
 
 """
+    push!(db::AbstractSimStringDB, str::AbstractString)
+
 Add a new item to a new or existing collection of strings using the custom AbstractSimStringDB type.
+
+# Arguments:
+* `db`: AbstractSimStringDB - The collection of strings to add to
+* `str`: AbstractString - The string to add to the collection
+
+# Example:
+```julia
+db = DictDB(CharacterNGrams(2, " "));
+push!(db, "foo")
+push!(db, "bar")
+push!(db, "fooo")
+```
+
+# Returns:
+* `db`: AbstractSimStringDB - The collection of strings with the new string added
 """
 function push!(db::AbstractSimStringDB, str::AbstractString)
     # Extract features based on the specified feature extractor
@@ -125,11 +142,54 @@ end
 
 
 """
+    append!(db::AbstractSimStringDB, str::Vector)
+
 Add bulk items to a new or existing collection of strings using the custom AbstractSimStringDB type.
+
+# Arguments:
+* `db`: AbstractSimStringDB - The database to add the strings to
+* `str`: Vector of AbstractString - Vector/Array of strings to add to the database
+
+# Example:
+```julia
+db = DictDB(CharacterNGrams(2, " "));
+append!(db, ["foo", "bar", "fooo"]);
+```
+
+# Returns:
+* `db`: AbstractSimStringDB - The database with the new strings added
 """
 function append!(db::AbstractSimStringDB, str::Vector)
     @inbounds @simd for i in str
         push!(db, i)
     end
+end
+
+
+"""
+    append!(db::AbstractSimStringDB, file::AbstractString)
+
+Add bulk items to a new or existing collection of strings
+from a file using the custom AbstractSimStringDB type.
+
+# Arguments:
+* `db`: AbstractSimStringDB - The database to add the items to
+* `file`: AbstractString - Path to the file to read from
+
+# Example:
+```julia
+db = DictDB(CharacterNGrams(2, " "));
+append!(db, "./data/test.txt")
+```
+
+# Returns:
+* `db`: AbstractSimStringDB - The database with the items added
+"""
+function append!(db::AbstractSimStringDB, file::AbstractString)
+    open(file) do f
+        for line in eachline(f)
+            push!(db, line)
+        end
+    end
 end
\ No newline at end of file
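The new file-based `append!` above reads one entry per line via `eachline`. A minimal usage sketch, assuming a hypothetical newline-delimited file `words.txt` holding the same three entries as the existing docs example (so the search results below simply mirror the documented `Dice` example):

```julia
using SimString

# words.txt is assumed to contain, one per line:
#   foo
#   bar
#   fooo
db = DictDB(CharacterNGrams(2, " "));
append!(db, "words.txt")

res = search(Dice(), db, "foo"; α=0.8, ranked=true)
# 2-element Vector{Tuple{String, Float64}}:
#  ("foo", 1.0)
#  ("fooo", 0.8888888888888888)
```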
diff --git a/src/search.jl b/src/search.jl
index 29d6e2d..fa0ddb4 100644
--- a/src/search.jl
+++ b/src/search.jl
@@ -74,7 +74,7 @@ function overlap_join(db_collection::AbstractSimStringDB, features, τ, candidat
     results = String[]
 
     for (candidate, match_count) in candidate_match_counts
-        for i in (query_feature_length - τ + 1) : query_feature_length # TODO: Verify
+        for i in (query_feature_length - τ + 1) : query_feature_length
             if candidate in lookup_feature_set_by_size_feature(db_collection, candidate_size, features[i])
                 match_count += 1
             end
@@ -103,16 +103,16 @@ function search!(measure::AbstractSimilarityMeasure, db_collection::DictDB, quer
     features = extract_features(db_collection.feature_extractor, query)
 
     # Metadata from the generated features (length, min & max sizes)
-    length_of_features = length(features)
-    min_feature_size = minimum_feature_size(measure, length_of_features, α)
-    max_feature_size = maximum_feature_size(measure, db_collection, length_of_features, α)
+    # length_of_features = length(features)
+    # min_feature_size = minimum_feature_size(measure, length_of_features, α)
+    # max_feature_size = maximum_feature_size(measure, db_collection, length_of_features, α)
 
     results = String[]
 
     # Generate and return results from the potential candidate size pool
-    @inbounds for candidate_size in min_feature_size:max_feature_size
+    @inbounds for candidate_size in minimum_feature_size(measure, length(features), α) : maximum_feature_size(measure, db_collection, length(features), α)
         # Minimum overlap
-        τ = minimum_overlap(measure, length_of_features, candidate_size, α)
+        τ = minimum_overlap(measure, length(features), candidate_size, α)
 
         # Generate approximate candidates from the overlap join
         append!(results, overlap_join(db_collection, features, τ, candidate_size))
diff --git a/test/dummy_sents.txt b/test/dummy_sents.txt
new file mode 100644
index 0000000..b60b3c9
--- /dev/null
+++ b/test/dummy_sents.txt
@@ -0,0 +1,2 @@
+You are a really really really cool dude.
+Sometimes you are not really really cool tho
\ No newline at end of file
diff --git a/test/dummy_words.txt b/test/dummy_words.txt
new file mode 100644
index 0000000..d4b7bf6
--- /dev/null
+++ b/test/dummy_words.txt
@@ -0,0 +1,3 @@
+foo
+bar
+fooo
\ No newline at end of file
diff --git a/test/test01_dictdb.jl b/test/test01_dictdb.jl
index 6531a70..ddb7f43 100644
--- a/test/test01_dictdb.jl
+++ b/test/test01_dictdb.jl
@@ -15,8 +15,8 @@ using Test
 
     @test collect(keys(db.string_feature_map)) == [5, 6]
 
-    @test collect(values(db.string_feature_map[5])) == vcat( (repeat([Set(["foo"])], 5)), (repeat([Set(["bar"])], 5)) )
-    @test collect(values(db.string_feature_map[6])) == repeat([Set(["fooo"])], 6)
+    @test collect(values(db.string_feature_map[5])) == vcat((repeat([Set(["foo"])], 5)), (repeat([Set(["bar"])], 5)))
+    @test collect(values(db.string_feature_map[6])) == repeat([Set(["fooo"])], 6)
 end
 
 
@@ -41,10 +41,10 @@ end
 
     @test collect(keys(db.string_feature_map)) == [5, 6]
 
-    @test collect(values(db.string_feature_map[5])) == vcat( (repeat([Set(["foo"])], 5)), (repeat([Set(["bar"])], 5)) )
-    @test collect(values(db.string_feature_map[6])) == repeat([Set(["fooo"])], 6)
+    @test collect(values(db.string_feature_map[5])) == vcat((repeat([Set(["foo"])], 5)), (repeat([Set(["bar"])], 5)))
+    @test collect(values(db.string_feature_map[6])) == repeat([Set(["fooo"])], 6)
 
-    @test eltype(collect(keys(db.string_feature_map[5]))) == Tuple{String, Int64}
+    @test eltype(collect(keys(db.string_feature_map[5]))) == Tuple{String,Int64}
 end
 
 
@@ -59,19 +59,53 @@ end
     @test collect(values(db.string_feature_map[9]))[5] == Set(["You are a really really really cool dude.", "Sometimes you are not really really cool tho"])
     @test collect(values(db.string_feature_map[9]))[7] == Set(["You are a really really really cool dude.", "Sometimes you are not really really cool tho"])
 
-    @test eltype(collect(keys(db.string_feature_map[9]))) == Tuple{Tuple{String, String}, Int64}
+    @test eltype(collect(keys(db.string_feature_map[9]))) == Tuple{Tuple{String,String},Int64}
 end
 
 
 @testset "Test describe functionality" begin
-    db = DictDB(CharacterNGrams(2, " "));
-    append!(db, ["foo", "bar", "fooo"]);
+    db = DictDB(CharacterNGrams(2, " "))
+    append!(db, ["foo", "bar", "fooo"])
 
     # Interact with db
-    search(Dice(), db, "zep"; α=0.8, ranked=true)
+    search(Dice(), db, "zep"; α = 0.8, ranked = true)
+
+    @test describe_collection(db) == (total_collection = 3, avg_size_ngrams = 4.5, total_ngrams = 13)
+end
+
+
+@testset "Test bulk insertion from a file using CharacterNGrams" begin
+    db = DictDB(CharacterNGrams(3, " "))
+    append!(db, "dummy_words.txt")
+
+    @test db.string_collection == ["foo", "bar", "fooo"]
+    @test db.string_size_map[5] == Set(["bar", "foo"])
+    @test db.string_size_map[6] == Set(["fooo"])
+
+    @test collect(keys(db.string_feature_map)) == [5, 6]
+
+    @test collect(values(db.string_feature_map[5])) == vcat((repeat([Set(["foo"])], 5)), (repeat([Set(["bar"])], 5)))
+    @test collect(values(db.string_feature_map[6])) == repeat([Set(["fooo"])], 6)
+
+    @test eltype(collect(keys(db.string_feature_map[5]))) == Tuple{String,Int64}
+end
+
+
+
+@testset "Test bulk insertion from a file using WordNGrams" begin
+    db = DictDB(WordNGrams(2, " ", " "))
+    append!(db, "dummy_sents.txt")
+
+    @test db.string_collection == ["You are a really really really cool dude.", "Sometimes you are not really really cool tho"]
+    @test db.string_size_map[9] == Set(["You are a really really really cool dude.", "Sometimes you are not really really cool tho"])
+
+    @test collect(keys(db.string_feature_map)) == [9]
+    @test collect(values(db.string_feature_map[9]))[5] == Set(["You are a really really really cool dude.", "Sometimes you are not really really cool tho"])
+    @test collect(values(db.string_feature_map[9]))[7] == Set(["You are a really really really cool dude.", "Sometimes you are not really really cool tho"])
+
+    @test eltype(collect(keys(db.string_feature_map[9]))) == Tuple{Tuple{String,String},Int64}
 
-    @test describe_collection(db) == (total_collection = 3, avg_num_ngrams = 4.5, total_ngrams = 13)
 end
 
 
diff --git a/test/test04_search.jl b/test/test04_search.jl
index 14f93e0..8cdb283 100644
--- a/test/test04_search.jl
+++ b/test/test04_search.jl
@@ -2,6 +2,7 @@ module TestMeasures
 using SimString
 using Test
 using Faker
+using Suppressor
 
 
 @testset "Test Dice Search" begin
@@ -54,6 +55,7 @@ end
 end
 
 
+
 @testset "Test Micro Deep Dive Search" begin
     db = DictDB(CharacterNGrams(2, " "));
     append!(db, ["a", "ab", "abc", "abcd", "abcde"]);
@@ -76,6 +78,17 @@ end
 end
 
 
+@testset "Test output from show" begin
+    db = DictDB(CharacterNGrams(2, " "));
+    append!(db, ["foo", "bar", "fooo"]);
+
+    expected_out = "DictDB(SimString.CharacterNGrams{Int64, String}(2, \" \"))\nTotal collection: 3\nAverage number of ngram features: 4.5\nTotal number of ngram features: 13\n"
+    r = @capture_out show(db)
+    @test r == expected_out
+end
+
+
 
 end # module
\ No newline at end of file
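The same file-based workflow applies to word-level features, as exercised by the new `WordNGrams` testset above. A small sketch under the assumption of a hypothetical `sentences.txt` file shaped like `test/dummy_sents.txt` (the query string and α are arbitrary; exact scores depend on the chosen measure):

```julia
using SimString

# sentences.txt is assumed to hold one sentence per line, e.g. the two
# dummy sentences used in test/test01_dictdb.jl.
db = DictDB(WordNGrams(2, " ", " "))
append!(db, "sentences.txt")

describe_collection(db)
# NamedTuple of total_collection, avg_size_ngrams and total_ngrams

search(Cosine(), db, "you are not really cool"; α=0.5, ranked=true)
# Vector of (sentence, score) tuples for candidates scoring at least α
```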