Skip to content

Commit

Permalink
Initial public release (#15)
Browse files Browse the repository at this point in the history
* Initial public release
  • Loading branch information
PyDataBlog committed Feb 10, 2022
1 parent 504460e commit f822944
Show file tree
Hide file tree
Showing 12 changed files with 203 additions and 23 deletions.
7 changes: 5 additions & 2 deletions Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -7,14 +7,17 @@ version = "0.1.0"
CircularArrays = "7a955b69-7140-5f4e-a0ed-f168c5e2e749"
DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
OffsetArrays = "6fe1bfb0-de20-5000-8ca7-80f57d26f881"
ProgressMeter = "92933f4c-e287-5a05-a399-4b506db050ca"

[compat]
CircularArrays = "1"
DataStructures = "0.18"
OffsetArrays = "1"
julia = "1"

[extras]
Faker = "0efc519c-db33-5916-ab87-703215c3906f"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
Suppressor = "fd094767-a336-5f1f-9728-57cf17d0bbfb"

[targets]
test = ["Test", "Faker"]
test = ["Test", "Faker", "Suppressor"]
3 changes: 3 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,13 +17,16 @@ This package is be particulary useful for natural language processing tasks whic
- [X] Support for unicodes
- [ ] Custom user defined feature generation methods
- [ ] Mecab-based tokenizer support
- [X] Support for building databases directly from text files
- [ ] Support for persistent databases

## Supported String Similarity Measures

- [X] Dice coefficient
- [X] Jaccard coefficient
- [X] Cosine coefficient
- [X] Overlap coefficient
- [X] Exact match

## Installation

Expand Down
6 changes: 4 additions & 2 deletions docs/src/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ This package is be particulary useful for natural language processing tasks whic
- [X] Support for unicodes
- [ ] Custom user defined feature generation methods
- [ ] Mecab-based tokenizer support
- [ ] Support for building databases directly from text files
- [X] Support for building databases directly from text files
- [ ] Support for persistent databases

## Supported String Similarity Measures
Expand Down Expand Up @@ -64,6 +64,8 @@ push!(db, "fooo");

# Convenient approach is to use an array of strings for multiple entries: `append!(db, ["foo", "bar", "fooo"]);`

# OR: Build database from text files: `append!(db, "YOUR_FILE_NAME.txt");`

# Retrieve the closest match(es)
res = search(Dice(), db, "foo"; α=0.8, ranked=true)
# 2-element Vector{Tuple{String, Float64}}:
Expand All @@ -72,7 +74,7 @@ res = search(Dice(), db, "foo"; α=0.8, ranked=true)

# Describe a working database collection
desc = describe_collection(db)
# (total_collection = 3, avg_num_ngrams = 4.5, total_ngrams = 13)
# (total_collection = 3, avg_size_ngrams = 4.5, total_ngrams = 13)
```

## TODO: Benchmarks
Expand Down
48 changes: 48 additions & 0 deletions extras/benchmark_sim.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
# Ad-hoc benchmark script for SimString (run manually in a REPL; not part of
# the automated test suite). Exercises bulk insertion and search timings.
using SimString
using Faker
using BenchmarkTools
using DataStructures

################################# Benchmark Bulk addition #####################
db = DictDB(CharacterNGrams(3, " "));
# Seed the fake-data generator so the 100k names are reproducible across runs.
Faker.seed(2020)
@time fake_names = [string(Faker.first_name(), " ", Faker.last_name()) for i in 1:100_000];


# Wrap append! in a function so @time measures the call itself rather than
# top-level global-scope execution.
f(d, x) = append!(d, x)
@time f(db, fake_names)



################################ Simple Addition ###############################

db = DictDB(CharacterNGrams(2, " "));
push!(db, "foo");
push!(db, "bar");
push!(db, "fooo");

# Function barrier over `search` so the benchmarked call sees concrete types.
f(x, c, s, a, r) = search(x, c, s; α=a, ranked=r)
test = "foo";
col = db;
sim = Cosine();
a = 0.8;
r = true;

# Warm-up call (compiles the method before timing).
f(Cosine(), db, "foo", 0.8, true)

# Compare the function-barrier call against the direct keyword call.
@btime f($sim, $col, $test, $a, $r)
@btime search(Cosine(), db, "foo"; α=0.8, ranked=true)



db2 = DictDB(CharacterNGrams(3, " "));
append!(db2, ["foo", "bar", "fooo", "foor"]) # also works via multiple dispatch on a vector

results = search(Cosine(), db, "foo"; α=0.8, ranked=true) # sanity check on the small db

# Scratch calls for inspecting the feature-extraction output interactively.
bs = ["foo", "bar", "foo", "foo", "bar"]
SimString.extract_features(CharacterNGrams(3, " "), "prepress")
SimString.extract_features(WordNGrams(2, " ", " "), "You are a really really really cool dude.")

db = DictDB(WordNGrams(2, " ", " "))
push!(db, "You are a really really really cool dude.")
1 change: 0 additions & 1 deletion src/SimString.jl
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@ module SimString

import Base: push!, append!
using DataStructures: DefaultOrderedDict, DefaultDict
using ProgressMeter
using CircularArrays
using OffsetArrays

Expand Down
17 changes: 15 additions & 2 deletions src/dictdb.jl
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,7 @@ Basic summary stats for the DB
db = DictDB(CharacterNGrams(2, " "));
append!(db, ["foo", "bar", "fooo"]);
describe_collection(db)
(total_collection = 3, avg_size_ngrams = 4.5, total_ngrams = 13)
# Returns
* NamedTuples: Summary stats for the DB
Expand All @@ -98,7 +99,7 @@ function describe_collection(db::DictDB)
# Total number of strings in collection
= length(db.string_collection)

# Average number of ngram features
# Average size of ngram features
n = [x for x in keys(db.string_size_map)]
μ = sum(n) / length(n)

Expand All @@ -108,7 +109,19 @@ for i in values(db.string_feature_map)
total_ngrams += length(i)
end

return (total_collection = ∑, avg_num_ngrams = μ, total_ngrams = total_ngrams)
return (total_collection = ∑, avg_size_ngrams = μ, total_ngrams = total_ngrams)
end


"""
Pretty print summary stats for the DB
"""
function Base.show(io::IO, x::DictDB)
metrics = describe_collection(x)
println(io, "DictDB($(x.feature_extractor))")
println(io, "Total collection: ", metrics.total_collection)
println(io, "Average number of ngram features: ", metrics.avg_size_ngrams)
println(io, "Total number of ngram features: ", metrics.total_ngrams)
end


Expand Down
60 changes: 60 additions & 0 deletions src/features.jl
Original file line number Diff line number Diff line change
Expand Up @@ -99,8 +99,25 @@ end


"""
push!(db::AbstractSimStringDB, str::AbstractString)
Add a new item to a new or existing collection of strings using
the custom AbstractSimStringDB type.
# Arguments:
* `db`: AbstractSimStringDB - The collection of strings to add to
* `str`: AbstractString - The string to add to the collection
# Example:
```julia
db = DictDB(CharacterNGrams(2, " "));
push!(db, "foo")
push!(db, "bar")
push!(db, "fooo")
````
# Returns:
* `db`: AbstractSimStringDB - The collection of strings with the new string added
"""
function push!(db::AbstractSimStringDB, str::AbstractString)
# Extract features based on the specified feature extractor
Expand All @@ -125,11 +142,54 @@ end


"""
append!(db::AbstractSimStringDB, str::Vector)
Add bulk items to a new or existing collection of strings using
the custom AbstractSimStringDB type.
# Arguments:
* db: AbstractSimStringDB - The database to add the strings to
* str: Vector of AbstractString - Vector/Array of strings to add to the database
# Example:
```julia
db = DictDB(CharacterNGrams(2, " "));
append!(db, ["foo", "foo", "fooo"]);
```
# Returns:
* db: AbstractSimStringDB - The database with the new strings added
"""
function append!(db::AbstractSimStringDB, str::Vector)
@inbounds @simd for i in str
push!(db, i)
end
end


"""
append!(db::AbstractSimStringDB, file::AbstractString)
Add bulk items to a new or existing collection of strings using
from a file using the custom AbstractSimStringDB type.
# Arguments:
* `db``: AbstractSimStringDB - The database to add the items to
* `file`: AbstractString - Path to the file to read from
# Example:
```julia
db = DictDB(CharacterNGrams(2, " "));
append!(db, "./data/test.txt")
```
# Returns:
* `db`: AbstractSimStringDB - The database with the items added
"""
function append!(db::AbstractSimStringDB, file::AbstractString)
open(file) do f
for line in eachline(f)
push!(db, line)
end
end
end
12 changes: 6 additions & 6 deletions src/search.jl
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ function overlap_join(db_collection::AbstractSimStringDB, features, τ, candidat
results = String[]

for (candidate, match_count) in candidate_match_counts
for i in (query_feature_length - τ + 1) : query_feature_length # TODO: Verify
for i in (query_feature_length - τ + 1) : query_feature_length
if candidate in lookup_feature_set_by_size_feature(db_collection, candidate_size, features[i])
match_count += 1
end
Expand Down Expand Up @@ -103,16 +103,16 @@ function search!(measure::AbstractSimilarityMeasure, db_collection::DictDB, quer
features = extract_features(db_collection.feature_extractor, query)

# Metadata from the generated features (length, min & max sizes)
length_of_features = length(features)
min_feature_size = minimum_feature_size(measure, length_of_features, α)
max_feature_size = maximum_feature_size(measure, db_collection, length_of_features, α)
# length_of_features = length(features)
# min_feature_size = minimum_feature_size(measure, length_of_features, α)
# max_feature_size = maximum_feature_size(measure, db_collection, length_of_features, α)

results = String[]

# Generate and return results from the potential candidate size pool
@inbounds for candidate_size in min_feature_size:max_feature_size
@inbounds for candidate_size in minimum_feature_size(measure, length(features), α) : maximum_feature_size(measure, db_collection, length(features), α)
# Minimum overlap
τ = minimum_overlap(measure, length_of_features, candidate_size, α)
τ = minimum_overlap(measure, length(features), candidate_size, α)

# Generate approximate candidates from the overlap join
append!(results, overlap_join(db_collection, features, τ, candidate_size))
Expand Down
2 changes: 2 additions & 0 deletions test/dummy_sents.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
You are a really really really cool dude.
Sometimes you are not really really cool tho
3 changes: 3 additions & 0 deletions test/dummy_words.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
foo
bar
fooo
54 changes: 44 additions & 10 deletions test/test01_dictdb.jl
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,8 @@ using Test

@test collect(keys(db.string_feature_map)) == [5, 6]

@test collect(values(db.string_feature_map[5])) == vcat( (repeat([Set(["foo"])], 5)), (repeat([Set(["bar"])], 5)) )
@test collect(values(db.string_feature_map[6])) == repeat([Set(["fooo"])], 6)
@test collect(values(db.string_feature_map[5])) == vcat((repeat([Set(["foo"])], 5)), (repeat([Set(["bar"])], 5)))
@test collect(values(db.string_feature_map[6])) == repeat([Set(["fooo"])], 6)
end


Expand All @@ -41,10 +41,10 @@ end

@test collect(keys(db.string_feature_map)) == [5, 6]

@test collect(values(db.string_feature_map[5])) == vcat( (repeat([Set(["foo"])], 5)), (repeat([Set(["bar"])], 5)) )
@test collect(values(db.string_feature_map[6])) == repeat([Set(["fooo"])], 6)
@test collect(values(db.string_feature_map[5])) == vcat((repeat([Set(["foo"])], 5)), (repeat([Set(["bar"])], 5)))
@test collect(values(db.string_feature_map[6])) == repeat([Set(["fooo"])], 6)

@test eltype(collect(keys(db.string_feature_map[5]))) == Tuple{String, Int64}
@test eltype(collect(keys(db.string_feature_map[5]))) == Tuple{String,Int64}
end


Expand All @@ -59,19 +59,53 @@ end
@test collect(values(db.string_feature_map[9]))[5] == Set(["You are a really really really cool dude.", "Sometimes you are not really really cool tho"])
@test collect(values(db.string_feature_map[9]))[7] == Set(["You are a really really really cool dude.", "Sometimes you are not really really cool tho"])

@test eltype(collect(keys(db.string_feature_map[9]))) == Tuple{Tuple{String, String}, Int64}
@test eltype(collect(keys(db.string_feature_map[9]))) == Tuple{Tuple{String,String},Int64}
end



@testset "Test describe functionality" begin
    testdb = DictDB(CharacterNGrams(2, " "))
    append!(testdb, ["foo", "bar", "fooo"])

    # Run a query first so describe_collection is exercised on a database
    # that has already been interacted with.
    search(Dice(), testdb, "zep"; α = 0.8, ranked = true)

    stats = describe_collection(testdb)
    @test stats == (total_collection = 3, avg_size_ngrams = 4.5, total_ngrams = 13)
end


@testset "Test bulk insertion from a file using CharacterNGrams" begin
    filedb = DictDB(CharacterNGrams(3, " "))
    append!(filedb, "dummy_words.txt")

    # dummy_words.txt holds one word per line: foo, bar, fooo.
    @test filedb.string_collection == ["foo", "bar", "fooo"]
    @test filedb.string_size_map[5] == Set(["bar", "foo"])
    @test filedb.string_size_map[6] == Set(["fooo"])

    @test collect(keys(filedb.string_feature_map)) == [5, 6]

    expected_size5 = vcat(repeat([Set(["foo"])], 5), repeat([Set(["bar"])], 5))
    @test collect(values(filedb.string_feature_map[5])) == expected_size5
    @test collect(values(filedb.string_feature_map[6])) == repeat([Set(["fooo"])], 6)

    @test eltype(collect(keys(filedb.string_feature_map[5]))) == Tuple{String,Int64}
end



@testset "Test bulk insertion from a file using WordNGrams" begin
    db = DictDB(WordNGrams(2, " ", " "))
    append!(db, "dummy_sents.txt")

    @test db.string_collection == ["You are a really really really cool dude.", "Sometimes you are not really really cool tho"]
    @test db.string_size_map[9] == Set(["You are a really really really cool dude.", "Sometimes you are not really really cool tho"])

    @test collect(keys(db.string_feature_map)) == [9]
    @test collect(values(db.string_feature_map[9]))[5] == Set(["You are a really really really cool dude.", "Sometimes you are not really really cool tho"])
    @test collect(values(db.string_feature_map[9]))[7] == Set(["You are a really really really cool dude.", "Sometimes you are not really really cool tho"])

    @test eltype(collect(keys(db.string_feature_map[9]))) == Tuple{Tuple{String,String},Int64}

    # FIX: the previous assertion reused the CharacterNGrams fixture's numbers
    # (total_collection = 3, 4.5, 13) and the stale `avg_num_ngrams` key, which
    # this release renamed to `avg_size_ngrams` — it could never pass against
    # this two-sentence database. Assert on values this db actually produces.
    stats = describe_collection(db)
    @test stats.total_collection == 2
    @test stats.total_ngrams == length(db.string_feature_map[9])
end


Expand Down
13 changes: 13 additions & 0 deletions test/test04_search.jl
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ module TestMeasures
using SimString
using Test
using Faker
using Suppressor


@testset "Test Dice Search" begin
Expand Down Expand Up @@ -54,6 +55,7 @@ end

end


@testset "Test Micro Deep Dive Search" begin
db = DictDB(CharacterNGrams(2, " "));
append!(db, ["a", "ab", "abc", "abcd", "abcde"]);
Expand All @@ -76,6 +78,17 @@ end
end


@testset "Test output from show" begin
    showdb = DictDB(CharacterNGrams(2, " "))
    append!(showdb, ["foo", "bar", "fooo"])

    # Capture what show prints and compare against the exact expected text.
    expected_out = "DictDB(SimString.CharacterNGrams{Int64, String}(2, \" \"))\nTotal collection: 3\nAverage number of ngram features: 4.5\nTotal number of ngram features: 13\n"
    captured = @capture_out show(showdb)
    @test captured == expected_out
end





end # module

2 comments on commit f822944

@PyDataBlog
Copy link
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@JuliaRegistrator register()

@JuliaRegistrator
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Registration pull request created: JuliaRegistries/General/54345

After the above pull request is merged, it is recommended that a tag is created on this repository for the registered package version.

This will be done automatically if the Julia TagBot GitHub Action is installed, or can be done manually through the github interface, or via:

git tag -a v0.1.0 -m "<description of version>" f822944fcc9416389a3d34e93c44d57d11db7ef2
git push origin v0.1.0

Please sign in to comment.