Sparse DTVs, more immutable structs, updates
zgornel committed Feb 10, 2019
2 parents 7d2eef7 + 0a908ea commit 36046d2
Showing 11 changed files with 46 additions and 30 deletions.
12 changes: 7 additions & 5 deletions Manifest.toml
@@ -1,3 +1,5 @@
+# This file is machine-generated - editing it directly is not advised
+
[[Base64]]
uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f"

@@ -22,11 +24,11 @@ deps = ["Mmap"]
uuid = "8bb1440f-4735-579b-a4ab-409b98df4dab"

[[Distributed]]
-deps = ["LinearAlgebra", "Random", "Serialization", "Sockets"]
+deps = ["Random", "Serialization", "Sockets"]
uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b"

[[InteractiveUtils]]
-deps = ["LinearAlgebra", "Markdown"]
+deps = ["Markdown"]
uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240"

[[JSON]]
@@ -111,14 +113,14 @@ deps = ["Distributed", "InteractiveUtils", "Logging", "Random"]
uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40"

[[UUIDs]]
-deps = ["Random"]
+deps = ["Random", "SHA"]
uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4"

[[Unicode]]
uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5"

[[WordTokenizers]]
deps = ["Test"]
-git-tree-sha1 = "b8ae86c3c3902d50a606f7fbd98203f7d966067f"
+git-tree-sha1 = "e239c6cbc18248772b70595814a423e34fdba1ff"
uuid = "796a5d58-b03d-544a-977e-18100b691f6e"
-version = "0.2.0"
+version = "0.3.0"
5 changes: 5 additions & 0 deletions NEWS.md
@@ -1,5 +1,10 @@
## StringAnalysis Release Notes

+v0.3.4
+------
+- All forms of DTVs are sparse
+- DTMs, COOMs are immutable
+
v0.3.3
------
- Performance improvements
2 changes: 1 addition & 1 deletion Project.toml
@@ -1,7 +1,7 @@
name = "StringAnalysis"
uuid = "b66b7d2f-f536-51df-9f97-4dfb9d27c005"
authors = ["Corneliu Cofaru <cornel@oxoaresearch.com>"]
-version = "0.3.3"
+version = "0.3.4"

[deps]
BinaryProvider = "b99e7846-7c00-51b0-8f62-c81ae34c0232"
16 changes: 13 additions & 3 deletions docs/src/examples.md
@@ -148,15 +148,17 @@ One can verify the DTM dimensions with:
### Document Term Vectors (DTVs)
The individual rows of the DTM can also be generated iteratively whether a lexicon is present or not. If a lexicon is present, the `each_dtv` iterator allows the generation of the document vectors along with the control of the vector element type:
```@repl index
-for dv in each_dtv(crps, eltype=Int8)
+for dv in map(Vector, each_dtv(crps, eltype=Int8))
@show dv
end
```

Alternatively, the vectors can be generated using the [hash trick](https://en.wikipedia.org/wiki/Feature_hashing). This is a form of dimensionality reduction, as the `cardinality`, i.e. the output dimension, is much smaller than the dimension of the original DTM vectors, which equals the length of the lexicon. The `cardinality` is a keyword argument of the `Corpus` constructor. The hashed vector output type can be specified when building the iterator:
```@repl index
-for dv in each_hash_dtv(Corpus(documents(crps), cardinality=5), eltype=Int8)
-@show dv
+new_crps = Corpus(documents(crps), cardinality=7);
+hash_vectors = map(Vector, each_hash_dtv(new_crps, eltype=Int8));
+for hdv in hash_vectors
+@show hdv
end
```
One can construct a 'hashed' version of the DTM as well:
Expand All @@ -165,6 +167,14 @@ hash_dtm(Corpus(documents(crps), cardinality=5), Int8)
```
The default `Corpus` cardinality is specified by the constant `DEFAULT_CARDINALITY` present in `src/defaults.jl`.

!!! note

From version `v0.3.4`, all document vectors are instances of `SparseVector`. This affects
the output and performance of methods that directly employ DTVs, such as the
`embed_document` method. In certain cases, if speed is more important than memory
consumption, it may be useful to convert the vectors to a dense representation
beforehand, i.e. `dtv_dense = Vector(dtv_sparse)`.
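The conversion mentioned in the note can be sketched as follows. This is a minimal, hypothetical example: the corpus contents and the names `dtv_sparse`/`dtv_dense` are illustrative, and it assumes `dtv` accepts a document plus the corpus lexicon, per the `dtv` signature in `src/dtm.jl`:

```julia
using StringAnalysis

# Small illustrative corpus
crps = Corpus([StringDocument("a simple document"),
               StringDocument("yet another simple document")])
update_lexicon!(crps)

dtv_sparse = dtv(crps[1], lexicon(crps))  # a SparseVector as of v0.3.4
dtv_dense = Vector(dtv_sparse)            # dense copy: faster downstream, more memory
```

Converting once up front avoids repeated sparse-to-dense conversions when the same vector is embedded many times.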

### TF, TF-IDF, BM25
From the DTM, three more document-word statistics can be constructed: the [term frequency](https://en.wikipedia.org/wiki/Tf%E2%80%93idf#Term_frequency_2), the [tf-idf (term frequency - inverse document frequency)](https://en.wikipedia.org/wiki/Tf%E2%80%93idf#Term_frequency%E2%80%93Inverse_document_frequency) and [Okapi BM25](https://en.wikipedia.org/wiki/Okapi_BM25) using the `tf`, `tf!`, `tf_idf`, `tf_idf!`, `bm_25` and `bm_25!` functions respectively. Their usage is very similar yet there exist several approaches one can take to constructing the output.
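A rough sketch of the non-mutating variants described above (a hedged example: it assumes these statistics accept the sparse matrix stored in a `DocumentTermMatrix` directly; the corpus contents are illustrative):

```julia
using StringAnalysis

crps = Corpus([StringDocument("this is a document"),
               StringDocument("this is yet another document")])
update_lexicon!(crps)
m = DocumentTermMatrix(crps)

# Non-mutating variants return a newly allocated weighted matrix
tf_matrix    = tf(m.dtm)      # term frequency
tfidf_matrix = tf_idf(m.dtm)  # tf-idf
bm25_matrix  = bm_25(m.dtm)   # Okapi BM25
```

The `!` variants (`tf!`, `tf_idf!`, `bm_25!`) follow the usual Julia convention and write the result into a preallocated output instead.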

2 changes: 1 addition & 1 deletion src/coom.jl
@@ -59,7 +59,7 @@ the document or corpus
* `column_indices::OrderedDict{String, Int}` a map between the `terms` and the
columns of the co-occurrence matrix
"""
-mutable struct CooMatrix{T}
+struct CooMatrix{T}
coom::SparseMatrixCSC{T, Int}
terms::Vector{String}
column_indices::OrderedDict{String, Int}
14 changes: 7 additions & 7 deletions src/dtm.jl
@@ -9,7 +9,7 @@ the corpus associated with the DTM
* `row_indices::OrderedDict{String, Int}` a map between the `terms` and the
rows of the `dtm`
"""
-mutable struct DocumentTermMatrix{T}
+struct DocumentTermMatrix{T}
dtm::SparseMatrixCSC{T, Int}
terms::Vector{String}
row_indices::OrderedDict{String, Int}
@@ -155,7 +155,7 @@ function dtv(d, lex::OrderedDict{String, Int},
tokenizer::Symbol=DEFAULT_TOKENIZER,
lex_is_row_indices::Bool=false) where T<:Real
p = length(keys(lex))
-column = zeros(T, p)
+column = spzeros(T, p)
indices, values = dtm_entries(d, lex, eltype, tokenizer=tokenizer,
lex_is_row_indices=lex_is_row_indices)
column[indices] = values
@@ -237,7 +237,7 @@ function dtv_regex(d, lex::OrderedDict{String, Int},
tokenizer::Symbol=DEFAULT_TOKENIZER,
lex_is_row_indices::Bool=false) where T<:Real
p = length(keys(lex))
-column = zeros(T, p)
+column = spzeros(T, p)
indices, values = dtm_regex_entries(d, lex, eltype, tokenizer=tokenizer,
lex_is_row_indices=lex_is_row_indices)
column[indices] = values
@@ -254,7 +254,7 @@ using the hashing function `h`. `d` can be an `AbstractString` or an `AbstractDocument`
function hash_dtv(d, h::TextHashFunction, eltype::Type{T}=DEFAULT_DTM_TYPE;
tokenizer::Symbol=DEFAULT_TOKENIZER) where T<:Real
p = cardinality(h)
-res = zeros(T, p)
+res = spzeros(T, p)
ngs = ngrams(d, tokenizer=tokenizer)
for ng in keys(ngs)
res[index_hash(ng, h)] += ngs[ng]
@@ -280,7 +280,7 @@ function hash_dtm(crps::Corpus,
eltype::Type{T}=DEFAULT_DTM_TYPE;
tokenizer::Symbol=DEFAULT_TOKENIZER) where T<:Real
n, p = length(crps), cardinality(h)
-res = zeros(T, p, n)
+res = spzeros(T, p, n)
for (i, doc) in enumerate(crps)
res[:, i] = hash_dtv(doc, h, eltype, tokenizer=tokenizer)
end
@@ -293,7 +293,7 @@ hash_dtm(crps::Corpus, eltype::Type{T}=DEFAULT_DTM_TYPE;


# Produce entries for on-line analysis when DTM would not fit in memory
-mutable struct EachDTV{U, S<:AbstractString, T<:AbstractDocument}
+struct EachDTV{U, S<:AbstractString, T<:AbstractDocument}
corpus::Corpus{S,T}
row_indices::OrderedDict{String, Int}
tokenizer::Symbol
@@ -348,7 +348,7 @@ Base.show(io::IO, edt::EachDTV{U,S,T}) where {U,S,T} =
"$(length(edt)) elements of type $(eltype(edt)).")


-mutable struct EachHashDTV{U, S<:AbstractString, T<:AbstractDocument}
+struct EachHashDTV{U, S<:AbstractString, T<:AbstractDocument}
corpus::Corpus{S,T}
tokenizer::Symbol
function EachHashDTV{U,S,T}(corpus::Corpus{S,T}, tokenizer::Symbol=DEFAULT_TOKENIZER) where
2 changes: 1 addition & 1 deletion src/hash.jl
@@ -43,7 +43,7 @@ julia> doc = StringDocument("this is a text")
0.0
```
"""
-mutable struct TextHashFunction
+struct TextHashFunction
hash_function::Function
cardinality::Int
end
4 changes: 2 additions & 2 deletions src/rp.jl
@@ -225,12 +225,12 @@ embed_document(rpm::RPModel{S,T,A,H}, doc::AbstractDocument) where {S,T,A,H} =
embed_document(rpm::RPModel{S,T,A,H}, doc::AbstractString) where {S,T,A,H} =
embed_document(rpm, NGramDocument{S}(doc))

-embed_document(rpm::RPModel{S,T,A,H}, doc::Vector{S2}) where {S,T,A,H,S2<:AbstractString} =
+embed_document(rpm::RPModel{S,T,A,H}, doc::AbstractVector{S2}) where {S,T,A,H,S2<:AbstractString} =
embed_document(rpm, TokenDocument{S}(doc))

# Actual embedding function: takes as input the random projection model `rpm` and a document
# term vector `dtv`. Returns the representation of `dtv` in the embedding space.
-function embed_document(rpm::RPModel{S,T,A,H}, dtv::Vector{T}) where {S,T,A,H}
+function embed_document(rpm::RPModel{S,T,A,H}, dtv::AbstractVector{T}) where {S,T,A,H}
words_in_document = sum(dtv)
# Calculate document vector
if rpm.stats == :count
2 changes: 1 addition & 1 deletion test/corpus.jl
@@ -14,7 +14,7 @@

crps2 = Corpus([ngd, ngd])
update_inverse_index!(crps2)
-@test typeof(crps2) <: Corpus{<:SubString, <:NGramDocument}
+@test typeof(crps2) <: Corpus{<:String, <:NGramDocument}

documents(crps)

13 changes: 6 additions & 7 deletions test/dtm.jl
@@ -39,7 +39,7 @@ end
hash_dtv(crps[1], TextHashFunction())
v = hash_dtv(text(crps[1]), cardinality=25)
@test v == hash_dtv(crps[1], cardinality=25)
-@test v isa Vector{StringAnalysis.DEFAULT_DTM_TYPE}
+@test v isa SparseVector{StringAnalysis.DEFAULT_DTM_TYPE}
@test length(v) == 25

dtm1 = dtm(crps)
@@ -86,15 +86,14 @@ end
update_inverse_index!(crps)
m = DocumentTermMatrix(crps)
# Iteration interface tests
-i = 1
-for v in each_dtv(crps)
+T = Int8
+for (i,v) in enumerate(each_dtv(crps, eltype=T))
@test v == m.dtm[1:end,i]
-i+= 1
+i==1 && @test v isa SparseVector{T}
end
-i = 1
-for v in each_hash_dtv(crps)
+for (i,v) in enumerate(each_hash_dtv(crps, eltype=T))
@test v == hash_dtv(crps[i])
-i+= 1
+i==1 && @test v isa SparseVector{T}
end
# Indexing into the DTM
word = "This"
4 changes: 2 additions & 2 deletions test/preprocessing.jl
@@ -21,8 +21,8 @@
map(x->prepare!(x, strip_everything_stem), [sdoc, ndoc, tdoc, crps])
@test prepare(poem_no_1, strip_everything_stem) == "pin"
@test text(sdoc) == "pin"
-@test ngrams(ndoc) == Dict("tag"=>2,"pin"=>1,"hold"=>1,"thrill"=>1)
-@test string.(tokens(tdoc)) == ["pin", "tag", "hold", "tag", "thrill"]
+@test ngrams(ndoc) == Dict("tag"=>2,"pin "=>1,"hold"=>1,"thrill "=>1)
+@test tokens(tdoc) == ["pin ", "tag", "hold", "tag", "thrill "]
@test text(crps[1]) == "pin"
# Flag generation
v = [2, 5, 7]
