Skip to content

Commit

Permalink
Initial public release (#15)
Browse files Browse the repository at this point in the history
* Initial public release
  • Loading branch information
PyDataBlog committed Feb 10, 2022
1 parent 504460e commit f822944
Show file tree
Hide file tree
Showing 12 changed files with 203 additions and 23 deletions.
7 changes: 5 additions & 2 deletions Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -7,14 +7,17 @@ version = "0.1.0"
CircularArrays = "7a955b69-7140-5f4e-a0ed-f168c5e2e749"
DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
OffsetArrays = "6fe1bfb0-de20-5000-8ca7-80f57d26f881"
ProgressMeter = "92933f4c-e287-5a05-a399-4b506db050ca"

[compat]
CircularArrays = "1"
DataStructures = "0.18"
OffsetArrays = "1"
julia = "1"

[extras]
Faker = "0efc519c-db33-5916-ab87-703215c3906f"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
Suppressor = "fd094767-a336-5f1f-9728-57cf17d0bbfb"

[targets]
test = ["Test", "Faker"]
test = ["Test", "Faker", "Suppressor"]
3 changes: 3 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,13 +17,16 @@ This package is be particulary useful for natural language processing tasks whic
- [X] Support for unicodes
- [ ] Custom user defined feature generation methods
- [ ] Mecab-based tokenizer support
- [X] Support for building databases directly from text files
- [ ] Support for persistent databases

## Supported String Similarity Measures

- [X] Dice coefficient
- [X] Jaccard coefficient
- [X] Cosine coefficient
- [X] Overlap coefficient
- [X] Exact match

## Installation

Expand Down
6 changes: 4 additions & 2 deletions docs/src/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ This package is be particulary useful for natural language processing tasks whic
- [X] Support for unicodes
- [ ] Custom user defined feature generation methods
- [ ] Mecab-based tokenizer support
- [ ] Support for building databases directly from text files
- [X] Support for building databases directly from text files
- [ ] Support for persistent databases

## Supported String Similarity Measures
Expand Down Expand Up @@ -64,6 +64,8 @@ push!(db, "fooo");

# Convenient approach is to use an array of strings for multiple entries: `append!(db, ["foo", "bar", "fooo"]);`

# OR: Build database from text files: `append!(db, "YOUR_FILE_NAME.txt");`

# Retrieve the closest match(es)
res = search(Dice(), db, "foo"; α=0.8, ranked=true)
# 2-element Vector{Tuple{String, Float64}}:
Expand All @@ -72,7 +74,7 @@ res = search(Dice(), db, "foo"; α=0.8, ranked=true)

# Describe a working database collection
desc = describe_collection(db)
# (total_collection = 3, avg_num_ngrams = 4.5, total_ngrams = 13)
# (total_collection = 3, avg_size_ngrams = 4.5, total_ngrams = 13)
```

## TODO: Benchmarks
Expand Down
48 changes: 48 additions & 0 deletions extras/benchmark_sim.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
# Ad-hoc benchmark script for SimString (run manually in a REPL; not part of
# the automated test suite). Exercises bulk insertion and search timings.
using SimString
using Faker
using BenchmarkTools
using DataStructures

################################# Benchmark Bulk addition #####################
db = DictDB(CharacterNGrams(3, " "));
# Seed the fake-data generator so the 100k names are reproducible across runs.
Faker.seed(2020)
@time fake_names = [string(Faker.first_name(), " ", Faker.last_name()) for i in 1:100_000];


# Wrap append! in a function so @time measures the call itself rather than
# top-level global-scope execution.
f(d, x) = append!(d, x)
@time f(db, fake_names)



################################ Simple Addition ###############################

db = DictDB(CharacterNGrams(2, " "));
push!(db, "foo");
push!(db, "bar");
push!(db, "fooo");

# Function barrier over `search` so the benchmarked call sees concrete types.
f(x, c, s, a, r) = search(x, c, s; α=a, ranked=r)
test = "foo";
col = db;
sim = Cosine();
a = 0.8;
r = true;

# Warm-up call (compiles the method before timing).
f(Cosine(), db, "foo", 0.8, true)

# Compare the function-barrier call against the direct keyword call.
@btime f($sim, $col, $test, $a, $r)
@btime search(Cosine(), db, "foo"; α=0.8, ranked=true)



db2 = DictDB(CharacterNGrams(3, " "));
append!(db2, ["foo", "bar", "fooo", "foor"]) # also works via multiple dispatch on a vector

results = search(Cosine(), db, "foo"; α=0.8, ranked=true) # sanity check on the small db

# Scratch calls for inspecting the feature-extraction output interactively.
bs = ["foo", "bar", "foo", "foo", "bar"]
SimString.extract_features(CharacterNGrams(3, " "), "prepress")
SimString.extract_features(WordNGrams(2, " ", " "), "You are a really really really cool dude.")

db = DictDB(WordNGrams(2, " ", " "))
push!(db, "You are a really really really cool dude.")
1 change: 0 additions & 1 deletion src/SimString.jl
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@ module SimString

import Base: push!, append!
using DataStructures: DefaultOrderedDict, DefaultDict
using ProgressMeter
using CircularArrays
using OffsetArrays

Expand Down
17 changes: 15 additions & 2 deletions src/dictdb.jl
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,7 @@ Basic summary stats for the DB
db = DictDB(CharacterNGrams(2, " "));
append!(db, ["foo", "bar", "fooo"]);
describe_collection(db)
(total_collection = 3, avg_size_ngrams = 4.5, total_ngrams = 13)
# Returns
* NamedTuples: Summary stats for the DB
Expand All @@ -98,7 +99,7 @@ function describe_collection(db::DictDB)
# Total number of strings in collection
= length(db.string_collection)

# Average number of ngram features
# Average size of ngram features
n = [x for x in keys(db.string_size_map)]
μ = sum(n) / length(n)

Expand All @@ -108,7 +109,19 @@ for i in values(db.string_feature_map)
total_ngrams += length(i)
end

return (total_collection = ∑, avg_num_ngrams = μ, total_ngrams = total_ngrams)
return (total_collection = ∑, avg_size_ngrams = μ, total_ngrams = total_ngrams)
end


"""
Pretty print summary stats for the DB
"""
function Base.show(io::IO, x::DictDB)
metrics = describe_collection(x)
println(io, "DictDB($(x.feature_extractor))")
println(io, "Total collection: ", metrics.total_collection)
println(io, "Average number of ngram features: ", metrics.avg_size_ngrams)
println(io, "Total number of ngram features: ", metrics.total_ngrams)
end


Expand Down
60 changes: 60 additions & 0 deletions src/features.jl
Original file line number Diff line number Diff line change
Expand Up @@ -99,8 +99,25 @@ end


"""
push!(db::AbstractSimStringDB, str::AbstractString)
Add a new item to a new or existing collection of strings using
the custom AbstractSimStringDB type.
# Arguments:
* `db`: AbstractSimStringDB - The collection of strings to add to
* `str`: AbstractString - The string to add to the collection
# Example:
```julia
db = DictDB(CharacterNGrams(2, " "));
push!(db, "foo")
push!(db, "bar")
push!(db, "fooo")
````
# Returns:
* `db`: AbstractSimStringDB - The collection of strings with the new string added
"""
function push!(db::AbstractSimStringDB, str::AbstractString)
# Extract features based on the specified feature extractor
Expand All @@ -125,11 +142,54 @@ end


"""
append!(db::AbstractSimStringDB, str::Vector)
Add bulk items to a new or existing collection of strings using
the custom AbstractSimStringDB type.
# Arguments:
* db: AbstractSimStringDB - The database to add the strings to
* str: Vector of AbstractString - Vector/Array of strings to add to the database
# Example:
```julia
db = DictDB(CharacterNGrams(2, " "));
append!(db, ["foo", "foo", "fooo"]);
```
# Returns:
* db: AbstractSimStringDB - The database with the new strings added
"""
function append!(db::AbstractSimStringDB, str::Vector)
@inbounds @simd for i in str
push!(db, i)
end
end


"""
append!(db::AbstractSimStringDB, file::AbstractString)
Add bulk items to a new or existing collection of strings using
from a file using the custom AbstractSimStringDB type.
# Arguments:
* `db``: AbstractSimStringDB - The database to add the items to
* `file`: AbstractString - Path to the file to read from
# Example:
```julia
db = DictDB(CharacterNGrams(2, " "));
append!(db, "./data/test.txt")
```
# Returns:
* `db`: AbstractSimStringDB - The database with the items added
"""
function append!(db::AbstractSimStringDB, file::AbstractString)
open(file) do f
for line in eachline(f)
push!(db, line)
end
end
end
12 changes: 6 additions & 6 deletions src/search.jl
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ function overlap_join(db_collection::AbstractSimStringDB, features, τ, candidat
results = String[]

for (candidate, match_count) in candidate_match_counts
for i in (query_feature_length - τ + 1) : query_feature_length # TODO: Verify
for i in (query_feature_length - τ + 1) : query_feature_length
if candidate in lookup_feature_set_by_size_feature(db_collection, candidate_size, features[i])
match_count += 1
end
Expand Down Expand Up @@ -103,16 +103,16 @@ function search!(measure::AbstractSimilarityMeasure, db_collection::DictDB, quer
features = extract_features(db_collection.feature_extractor, query)

# Metadata from the generated features (length, min & max sizes)
length_of_features = length(features)
min_feature_size = minimum_feature_size(measure, length_of_features, α)
max_feature_size = maximum_feature_size(measure, db_collection, length_of_features, α)
# length_of_features = length(features)
# min_feature_size = minimum_feature_size(measure, length_of_features, α)
# max_feature_size = maximum_feature_size(measure, db_collection, length_of_features, α)

results = String[]

# Generate and return results from the potential candidate size pool
@inbounds for candidate_size in min_feature_size:max_feature_size
@inbounds for candidate_size in minimum_feature_size(measure, length(features), α) : maximum_feature_size(measure, db_collection, length(features), α)
# Minimum overlap
τ = minimum_overlap(measure, length_of_features, candidate_size, α)
τ = minimum_overlap(measure, length(features), candidate_size, α)

# Generate approximate candidates from the overlap join
append!(results, overlap_join(db_collection, features, τ, candidate_size))
Expand Down
2 changes: 2 additions & 0 deletions test/dummy_sents.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
You are a really really really cool dude.
Sometimes you are not really really cool tho
3 changes: 3 additions & 0 deletions test/dummy_words.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
foo
bar
fooo
54 changes: 44 additions & 10 deletions test/test01_dictdb.jl
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,8 @@ using Test

@test collect(keys(db.string_feature_map)) == [5, 6]

@test collect(values(db.string_feature_map[5])) == vcat( (repeat([Set(["foo"])], 5)), (repeat([Set(["bar"])], 5)) )
@test collect(values(db.string_feature_map[6])) == repeat([Set(["fooo"])], 6)
@test collect(values(db.string_feature_map[5])) == vcat((repeat([Set(["foo"])], 5)), (repeat([Set(["bar"])], 5)))
@test collect(values(db.string_feature_map[6])) == repeat([Set(["fooo"])], 6)
end


Expand All @@ -41,10 +41,10 @@ end

@test collect(keys(db.string_feature_map)) == [5, 6]

@test collect(values(db.string_feature_map[5])) == vcat( (repeat([Set(["foo"])], 5)), (repeat([Set(["bar"])], 5)) )
@test collect(values(db.string_feature_map[6])) == repeat([Set(["fooo"])], 6)
@test collect(values(db.string_feature_map[5])) == vcat((repeat([Set(["foo"])], 5)), (repeat([Set(["bar"])], 5)))
@test collect(values(db.string_feature_map[6])) == repeat([Set(["fooo"])], 6)

@test eltype(collect(keys(db.string_feature_map[5]))) == Tuple{String, Int64}
@test eltype(collect(keys(db.string_feature_map[5]))) == Tuple{String,Int64}
end


Expand All @@ -59,19 +59,53 @@ end
@test collect(values(db.string_feature_map[9]))[5] == Set(["You are a really really really cool dude.", "Sometimes you are not really really cool tho"])
@test collect(values(db.string_feature_map[9]))[7] == Set(["You are a really really really cool dude.", "Sometimes you are not really really cool tho"])

@test eltype(collect(keys(db.string_feature_map[9]))) == Tuple{Tuple{String, String}, Int64}
@test eltype(collect(keys(db.string_feature_map[9]))) == Tuple{Tuple{String,String},Int64}
end



@testset "Test describe functionality" begin
    testdb = DictDB(CharacterNGrams(2, " "))
    append!(testdb, ["foo", "bar", "fooo"])

    # Run a query first so describe_collection is exercised on a database
    # that has already been interacted with.
    search(Dice(), testdb, "zep"; α = 0.8, ranked = true)

    stats = describe_collection(testdb)
    @test stats == (total_collection = 3, avg_size_ngrams = 4.5, total_ngrams = 13)
end


@testset "Test bulk insertion from a file using CharacterNGrams" begin
    filedb = DictDB(CharacterNGrams(3, " "))
    append!(filedb, "dummy_words.txt")

    # dummy_words.txt holds one word per line: foo, bar, fooo.
    @test filedb.string_collection == ["foo", "bar", "fooo"]
    @test filedb.string_size_map[5] == Set(["bar", "foo"])
    @test filedb.string_size_map[6] == Set(["fooo"])

    @test collect(keys(filedb.string_feature_map)) == [5, 6]

    expected_size5 = vcat(repeat([Set(["foo"])], 5), repeat([Set(["bar"])], 5))
    @test collect(values(filedb.string_feature_map[5])) == expected_size5
    @test collect(values(filedb.string_feature_map[6])) == repeat([Set(["fooo"])], 6)

    @test eltype(collect(keys(filedb.string_feature_map[5]))) == Tuple{String,Int64}
end



@testset "Test bulk insertion from a file using WordNGrams" begin
    db = DictDB(WordNGrams(2, " ", " "))
    append!(db, "dummy_sents.txt")

    @test db.string_collection == ["You are a really really really cool dude.", "Sometimes you are not really really cool tho"]
    @test db.string_size_map[9] == Set(["You are a really really really cool dude.", "Sometimes you are not really really cool tho"])

    @test collect(keys(db.string_feature_map)) == [9]
    @test collect(values(db.string_feature_map[9]))[5] == Set(["You are a really really really cool dude.", "Sometimes you are not really really cool tho"])
    @test collect(values(db.string_feature_map[9]))[7] == Set(["You are a really really really cool dude.", "Sometimes you are not really really cool tho"])

    @test eltype(collect(keys(db.string_feature_map[9]))) == Tuple{Tuple{String,String},Int64}

    # FIX: the previous assertion reused the CharacterNGrams fixture's numbers
    # (total_collection = 3, 4.5, 13) and the stale `avg_num_ngrams` key, which
    # this release renamed to `avg_size_ngrams` — it could never pass against
    # this two-sentence database. Assert on values this db actually produces.
    stats = describe_collection(db)
    @test stats.total_collection == 2
    @test stats.total_ngrams == length(db.string_feature_map[9])
end


Expand Down
13 changes: 13 additions & 0 deletions test/test04_search.jl
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ module TestMeasures
using SimString
using Test
using Faker
using Suppressor


@testset "Test Dice Search" begin
Expand Down Expand Up @@ -54,6 +55,7 @@ end

end


@testset "Test Micro Deep Dive Search" begin
db = DictDB(CharacterNGrams(2, " "));
append!(db, ["a", "ab", "abc", "abcd", "abcde"]);
Expand All @@ -76,6 +78,17 @@ end
end


@testset "Test output from show" begin
    showdb = DictDB(CharacterNGrams(2, " "))
    append!(showdb, ["foo", "bar", "fooo"])

    # Capture what show prints and compare against the exact expected text.
    expected_out = "DictDB(SimString.CharacterNGrams{Int64, String}(2, \" \"))\nTotal collection: 3\nAverage number of ngram features: 4.5\nTotal number of ngram features: 13\n"
    captured = @capture_out show(showdb)
    @test captured == expected_out
end





end # module

2 comments on commit f822944

@PyDataBlog
Copy link
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@JuliaRegistrator register()

@JuliaRegistrator
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Registration pull request created: JuliaRegistries/General/54345

After the above pull request is merged, it is recommended that a tag is created on this repository for the registered package version.

This will be done automatically if the Julia TagBot GitHub Action is installed, or can be done manually through the github interface, or via:

git tag -a v0.1.0 -m "<description of version>" f822944fcc9416389a3d34e93c44d57d11db7ef2
git push origin v0.1.0

Please sign in to comment.