JuliaString · oxinabox · May 11, 2018 · May 9, 2018 · May 11, 2018 · May 11, 2018
diff --git a/.travis.yml b/.travis.yml
@@ -27,9 +27,9 @@ matrix:
 
 ## uncomment the following lines to override the default test script
 #script:
-# - julia -e 'Pkg.clone(pwd()); Pkg.build("StringInterning"); Pkg.test("StringInterning"; coverage=true)'
+# - julia -e 'Pkg.clone(pwd()); Pkg.build("InternedStrings"); Pkg.test("InternedStrings"; coverage=true)'
 after_success:
  # push coverage results to Coveralls
- - julia -e 'cd(Pkg.dir("StringInterning")); Pkg.add("Coverage"); using Coverage; Coveralls.submit(Coveralls.process_folder())'
+ # - julia -e 'cd(Pkg.dir("InternedStrings")); Pkg.add("Coverage"); using Coverage; Coveralls.submit(Coveralls.process_folder())'
  # push coverage results to Codecov
- - julia -e 'cd(Pkg.dir("StringInterning")); Pkg.add("Coverage"); using Coverage; Codecov.submit(Codecov.process_folder())'
+ - julia -e 'cd(Pkg.dir("InternedStrings")); Pkg.add("Coverage"); using Coverage; Codecov.submit(Codecov.process_folder())'
diff --git a/README.md b/README.md
@@ -5,6 +5,9 @@ For not having duplicate strings in memory.
 
 [![Build Status](https://travis-ci.org/JuliaString/InternedStrings.jl.svg?branch=master)](https://travis-ci.org/JuliaString/InternedStrings.jl)
 
+
+[![codecov.io](http://codecov.io/github/JuliaString/InternedStrings.jl/coverage.svg?branch=master)](http://codecov.io/github/JuliaString/InternedStrings.jl?branch=master)
+
 ## Usage
 
 `intern(s)` returns an interned string.
@@ -38,36 +41,39 @@ You might like to intern the strings from [Strs.jl](https://github.com/JuliaStri
 If your not familiar with the concept of string interning perhaps the following example will help.
 
 ```
+
 julia> using InternedStrings
 
 julia> a = "Gold"
 "Gold"
 
-julia> typeof(a), object_id(a) #This is the original reference
-(String, 0x2052f7ed641c9475)
+julia> typeof(a), pointer(a)
+(String, Ptr{UInt8} @0x00007fe604e93b18)
 
 julia> a = intern(a)
 "Gold"
 
-julia> typeof(a), object_id(a) # No change still same memory
-(String, 0x2052f7ed641c9475)
+julia> typeof(a), pointer(a) # No change still same memory
+(String, Ptr{UInt8} @0x00007fe604e93b18)
 
 julia> b = "Gold"
 "Gold"
 
-julia> typeof(b),object_id(b) # New memory, see different ID
-(String, 0x927fe26348e44a27)
+julia> typeof(b),pointer(b) # New memory, see different address
+(String, Ptr{UInt8} @0x00007fe5fae44444)
 
 julia> b = intern(b) # Replace it,
 "Gold"
 
-julia> typeof(b),object_id(b) # See it is same memory as for the original `a`
-(String, 0x2052f7ed641c9475)
+julia> typeof(b),pointer(b) # See it is same memory as for the original `a`
+(String, Ptr{UInt8} @0x00007fe604e93b18)
+#now the memory allocated to "b" at addr=0x00007fe5fae44444 can be garbage collected
 
- #now the memory allocated to "b" with id=0x927fe26348e44a27 can be garbage collected
+julia> pointer(intern("Gold")) # Same again
+Ptr{UInt8} @0x00007fe604e93b18
 
-julia> object_id(intern("Gold")) # Same again
-0x2052f7ed641c9475
+julia> pointer(intern(SubString("Golden",1,4))) # Substrings too
+Ptr{UInt8} @0x00007fe604e93b18
 ```
 
 
@@ -82,7 +88,7 @@ There is an issue though:
 How much are these tokens costing you in memory use?
 
 Originally you had say a 100MB (10⁸ bytes) text file (multiply this out as required).
-Which as a String took-up (10⁸ bytes + 1 pointer (4 bytes) + 1 length marker (4 bytes) + null terminating character (total 10⁸ + 9 bytes).
+Which as a String took up 10⁸ bytes + 1 pointer (4 or 8 bytes) + 1 length marker (4 or 8 bytes) + 1 null terminating character (total 10⁸ + 9, or 10⁸ + 17, bytes).
 To simplify the math lets say the average token length was 10 bytes.
 So you had 10⁷ tokens.
 
@@ -142,7 +148,7 @@ Once the last string with with that content goes out of scope (and is garbage co
 removing the copy in the interning pool will be handled automatically (it is a WeakRef, so won't keep it alive).
 
 
-Finally point **4:**.
+Final point **4:**.
 As I said before.
 The original 10⁸ byte document, with 10⁷ words probably only has about 50,000 (5×10⁴) unique words after cleaning.
 (Looking at real world data, the first 10⁷ tokens of wikipedia,

diff --git a/src/InternedStrings.jl b/src/InternedStrings.jl
@@ -1,12 +1,105 @@
+__precompile__(true)
+"""
+InternedStrings
+
+Copyright 2017-2018 Lyndon White (and other contributors)
+Licensed under MIT License, see LICENSE.md
+"""
 module InternedStrings
-using Base
 
 export @i_str, intern
 
-include("corefunctionality.jl")
+Base.@deprecate_binding(InternedString, String, true)
+
+@static if VERSION < v"0.7.0-DEV"
+ const Nothing = Void
+ const ht_keyindex2! = Base.ht_keyindex2
+ add_finalizer(fun::Function, obj) = Base.finalizer(obj, fun)
+else
+ using Base: ht_keyindex2!
+ const add_finalizer = Base.finalizer
+end
 
+########################
+# The pool/interning lookup core code
 
-Base.@deprecate_binding(InternedString, String, true)
-#InternedString(s)=intern(String(s))
+# This forces the type to be inferred (I don't know that the @noinline is required or even good)
+@noinline getvalue(::Type{K}, wk) where {K} = wk.value::K
+
+# NOTE: This code is carefully optimised.
+# Do not tweak it (for readability or otherwise) without benchmarking
+@inline function intern!(wkd::WeakKeyDict{K}, key)::K where {K}
+ lock(wkd.lock)
+ # hand positioning the locks and unlocks
+ # (rather than do block or try finally, seems to be faster)
+ index = ht_keyindex2!(wkd.ht, key) # returns index if present, or -index if not
+ # note hash of weakref is equal to the hash of value, so avoid constructing it if not required
+ if index > 0
+ # found it
+ @inbounds found_key = wkd.ht.keys[index]
+ unlock(wkd.lock)
+ return getvalue(K, found_key) # return the strong ref
+ else
+ # Not found, so add it,
+ # and mark it as a reference we track to delete!
+ kk::K = convert(K, key)
+ add_finalizer(wkd.finalizer, kk) # finalizer is set on the strong ref
+ @inbounds Base._setindex!(wkd.ht, nothing, WeakRef(kk), -index)
+ unlock(wkd.lock)
+ return kk # Return the strong ref
+ end
+end
+#####################################################
+# Setup for types
+
+const pool = Dict{DataType, WeakKeyDict}()
+
+@inline function get_pool(::Type{T})::WeakKeyDict{T, Nothing} where {T}
+ get!(pool, T) do
+ WeakKeyDict{T, Nothing}()
+ end
+end
+
+
+###################################
 
+"""
+ intern(s::T)
+
+Return a reference to an interned instance of `s`,
+adding it to the interning pool if it did not already exist.
+"""
+function intern(s::T)::T where {T}
+ intern(T, s)
+end
+
+"""
+ intern(::Type{T}, s)
+
+Intern `s` as if it were type `T`, converting it if required.
+Note that this will lead to unexpected behaviour if the type of `s` and `T`
+do not have equivalent equality and hash functions
+(i.e. this is not safe if `hash(s) != hash(convert(T, s))`).
+"""
+function intern(::Type{T}, s)::T where {T}
+ intern!(get_pool(T), s)
+end
+
+"""
+Substrings are interned as their parent string type
+"""
+function intern(substr::SubString{T})::T where {T}
+ intern(T, substr)
 end
+
+
+#############################
+
+
+macro i_str(s)
+ # This is done to get interpolation to work correctly
+ true_string_expr = esc(Meta.parse(string('"', unescape_string(s), '"')))
+ Expr(:call, intern, true_string_expr)
+end
+
+end # module InternedStrings
diff --git a/src/corefunctionality.jl b/src/corefunctionality.jl
diff --git a/test/all_kinds_of_types.jl b/test/all_kinds_of_types.jl
@@ -1,16 +1,11 @@
-using Base.Test
-using InternedStrings
-
 @testset "String" begin
  ex1 = intern("ex")
- @test ex1=="ex"
- @test !(ex1==="ex")
+ @test ex1 == "ex"
+ @test !addr_eq(ex1, "ex")
  ex2 = intern("ex")
- @test ex1===ex2
+ @test addr_eq(ex1, ex2)
  ex3 = intern(String, "ex")
- @test ex1===ex3
-
-
+ @test addr_eq(ex1, ex3)
 
  @testset "type inference" begin
  @test ex1 isa String
@@ -20,17 +15,15 @@ using InternedStrings
  end
 end
 
-
-
 @testset "SubString" begin
  aa1, bb1, cc1 = intern.(split("aa bb cc"))
  aa2, bb2, cc2 = intern.(split("aa bb cc"))
  aa3, bb3, cc3 = intern.(String, split("aa bb cc"))
 
- @test bb1=="bb"
- @test !(bb1==="bb")
- @test bb1===bb2
- @test bb1===bb3
+ @test bb1 == "bb"
+ @test !addr_eq(bb1, "bb")
+ @test addr_eq(bb1, bb2)
+ @test addr_eq(bb1, bb3)
 
  @testset "type inference" begin
  @test intern(split("aa bb cc")[1]) isa String
@@ -44,30 +37,26 @@ end
  using WeakRefStrings
  s1 = "ex"
  s2 = "ex"
- ex1 = @inferred intern(String, WeakRefString(Vector{UInt8}(s1)))
- @test ex1=="ex"
- @test !(ex1===s1)
+ ex1 = @inferred intern(String, WeakRefString(unsafe_wrap(Vector{UInt8}, s1)))
+ @test !addr_eq(ex1, s1)
  @test ex1 isa String
- ex2 = @inferred intern(String, WeakRefString(Vector{UInt8}(s2)))
- @test ex1===ex2
+ ex2 = @inferred intern(String, WeakRefString(unsafe_wrap(Vector{UInt8}, s2)))
+ @test addr_eq(ex1, ex2)
 end
 
-#== Uncomment when https://github.com/JuliaLang/julia/issues/26939 is fixed
-@testset "BigFloat" begin let
- pi1 = intern(BigFloat(π))
- @test pi1==BigFloat(π)
- @test !(pi1===BigFloat(π))
+# Enable when https://github.com/JuliaLang/julia/issues/26939 is fixed
+false && @testset "BigFloat" begin
+ let
+ pi1 = intern(BigFloat(π))
+ @test pi1 == BigFloat(π)
+ @test !addr_eq(pi1, BigFloat(π))
 
- pi2 = intern(BigFloat(π))
- @test pi1===pi2
+  pi2 = intern(BigFloat(π))
+  @test addr_eq(pi1, pi2)
 
- @testset "type inference" begin
- @test pi1 isa BigFloat
- @inferred intern(BigFloat(π))
+ @testset "type inference" begin
+ @test pi1 isa BigFloat
+ @inferred intern(BigFloat(π))
+ end
  end
-end end
-
-==#
-
-
-dicts = [WeakKeyDict()]
+end