Skip to content

Commit

Permalink
Merge pull request #13 from JuliaString/spj/v07update
Browse files Browse the repository at this point in the history
Update for v0.7 support
  • Loading branch information
oxinabox authored May 11, 2018
2 parents 335af70 + 481ba7d commit 34e9c24
Show file tree
Hide file tree
Showing 8 changed files with 188 additions and 189 deletions.
6 changes: 3 additions & 3 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -27,9 +27,9 @@ matrix:

## uncomment the following lines to override the default test script
#script:
# - julia -e 'Pkg.clone(pwd()); Pkg.build("StringInterning"); Pkg.test("StringInterning"; coverage=true)'
# - julia -e 'Pkg.clone(pwd()); Pkg.build("InternedStrings"); Pkg.test("InternedStrings"; coverage=true)'
after_success:
# push coverage results to Coveralls
- julia -e 'cd(Pkg.dir("StringInterning")); Pkg.add("Coverage"); using Coverage; Coveralls.submit(Coveralls.process_folder())'
# - julia -e 'cd(Pkg.dir("InternedStrings")); Pkg.add("Coverage"); using Coverage; Coveralls.submit(Coveralls.process_folder())'
# push coverage results to Codecov
- julia -e 'cd(Pkg.dir("StringInterning")); Pkg.add("Coverage"); using Coverage; Codecov.submit(Codecov.process_folder())'
- julia -e 'cd(Pkg.dir("InternedStrings")); Pkg.add("Coverage"); using Coverage; Codecov.submit(Codecov.process_folder())'
32 changes: 19 additions & 13 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,9 @@ For not having duplicate strings in memory.

[![Build Status](https://travis-ci.org/JuliaString/InternedStrings.jl.svg?branch=master)](https://travis-ci.org/JuliaString/InternedStrings.jl)


[![codecov.io](http://codecov.io/github/JuliaString/InternedStrings.jl/coverage.svg?branch=master)](http://codecov.io/github/JuliaString/InternedStrings.jl?branch=master)

## Usage

`intern(s)` returns an interned string.
Expand Down Expand Up @@ -38,36 +41,39 @@ You might like to intern the strings from [Strs.jl](https://github.com/JuliaStri
If you're not familiar with the concept of string interning, perhaps the following example will help.

```
julia> using InternedStrings
julia> a = "Gold"
"Gold"
julia> typeof(a), object_id(a) #This is the original reference
(String, 0x2052f7ed641c9475)
julia> typeof(a), pointer(a)
(String, Ptr{UInt8} @0x00007fe604e93b18)
julia> a = intern(a)
"Gold"
julia> typeof(a), object_id(a) # No change still same memory
(String, 0x2052f7ed641c9475)
julia> typeof(a), pointer(a) # No change still same memory
(String, Ptr{UInt8} @0x00007fe604e93b18)
julia> b = "Gold"
"Gold"
julia> typeof(b),object_id(b) # New memory, see different ID
(String, 0x927fe26348e44a27)
julia> typeof(b),pointer(b) # New memory, see different ID
(String, Ptr{UInt8} @0x00007fe5fae44444)
julia> b = intern(b) # Replace it,
"Gold"
julia> typeof(b),object_id(b) # See it is same memory as for the original `a`
(String, 0x2052f7ed641c9475)
julia> typeof(b),pointer(b) # See it is same memory as for the original `a`
(String, Ptr{UInt8} @0x00007fe604e93b18)
#now the memory allocated to "b" at addr=0x00007fe5fae44444 can be garbage collected
#now the memory allocated to "b" with id=0x927fe26348e44a27 can be garbage collected
julia> pointer(intern("Gold")) # Same again
Ptr{UInt8} @0x00007fe604e93b18
julia> object_id(intern("Gold")) # Same again
0x2052f7ed641c9475
julia> pointer(intern(SubString("Golden",1,4))) # Substrings too
Ptr{UInt8} @0x00007fe604e93b18
```


Expand All @@ -82,7 +88,7 @@ There is an issue though:
How much are these tokens costing you in memory use?

Originally you had say a 100MB (10⁸ bytes) text file (multiply this out as required).
Which as a String took-up (10⁸ bytes + 1 pointer (4 bytes) + 1 length marker (4 bytes) + null terminating character (total 10⁸ + 9 bytes).
As a `String`, that took up 10⁸ bytes + 1 pointer (4 or 8 bytes) + 1 length marker (4 or 8 bytes) + 1 null terminating character (a total of 10⁸ + 9, or 10⁸ + 17, bytes).
To simplify the math, let's say the average token length was 10 bytes.
So you had 10⁷ tokens.

Expand Down Expand Up @@ -142,7 +148,7 @@ Once the last string with with that content goes out of scope (and is garbage co
removing the copy in the interning pool will be handled automatically (it is a WeakRef, so won't keep it alive).


Finally point **4:**.
Final point **4:**.
As I said before.
The original 10⁸ byte document, with 10⁷ words probably only has about 50,000 (5×10⁴) unique words after cleaning.
(Looking at real world data, the first 10⁷ tokens of wikipedia,
Expand Down
101 changes: 97 additions & 4 deletions src/InternedStrings.jl
Original file line number Diff line number Diff line change
@@ -1,12 +1,105 @@
__precompile__(true)
"""
InternedStrings
Copyright 2017-2018 Lyndon White (and other contributors)
Licensed under MIT License, see LICENSE.md
"""
module InternedStrings
using Base

export @i_str, intern

include("corefunctionality.jl")
Base.@deprecate_binding(InternedString, String, true)

# Version shims: expose `Nothing`, `ht_keyindex2!` and `add_finalizer` under the
# same names whether the package is loaded on Julia 0.6 or on 0.7 and later.
@static if VERSION >= v"0.7.0-DEV"
    # 0.7+: the hash-table helper already has its `!` name, and `finalizer`
    # already takes the function as its first argument.
    using Base: ht_keyindex2!
    const add_finalizer = Base.finalizer
else
    # 0.6: alias the old names, and swap the argument order of `finalizer`
    # so callers can always pass the function first.
    const Nothing = Void
    const ht_keyindex2! = Base.ht_keyindex2
    add_finalizer(fun::Function, obj) = Base.finalizer(obj, fun)
end

########################
# The pool/interning lookup core code

Base.@deprecate_binding(InternedString, String, true)
#InternedString(s)=intern(String(s))
# Read the value held by `wk` and assert that it is a `K`, so the result type
# can be inferred by callers.
# (The `@noinline` is carried over unchanged; it is not known whether it is
# required, or even beneficial.)
@noinline function getvalue(::Type{K}, wk) where {K}
    return wk.value::K
end

# NOTE: This code is carefully optimised.
# Do not tweak it (for readability or otherwise) without benchmarking.
#
# Look `key` up in the interning pool `wkd` and return the pooled instance (a `K`).
# If no matching key is pooled yet, `key` is converted to `K`, the pool's finalizer
# is attached to the converted value, the value is stored in the pool under a
# `WeakRef`, and the converted value is returned.
#
# NOTE(review): the lock is released by hand on the two return paths only. If
# `ht_keyindex2!`, `convert` or `add_finalizer` throws, the lock appears to stay
# held. Confirm whether that is acceptable for callers.
@inline function intern!(wkd::WeakKeyDict{K}, key)::K where {K}
lock(wkd.lock)
# hand positioning the locks and unlocks
# (rather than a do block or try/finally; this seems to be faster)
index = ht_keyindex2!(wkd.ht, key) # returns index if present, or -index if not
# note: the hash of a WeakRef is equal to the hash of its value,
# so avoid constructing the WeakRef unless it is actually required
if index > 0
# found it: the slot holds the stored (weak) key for an equal value
@inbounds found_key = wkd.ht.keys[index]
unlock(wkd.lock)
return getvalue(K, found_key) # return the strong ref
else
# Not found, so add it,
# and mark it as a reference we track to delete!
kk::K = convert(K, key)
add_finalizer(wkd.finalizer, kk) # finalizer is set on the strong ref
# `-index` is the positive slot position reported by `ht_keyindex2!` for the missing key
@inbounds Base._setindex!(wkd.ht, nothing, WeakRef(kk), -index)
unlock(wkd.lock)
return kk # Return the strong ref
end
end
#####################################################
# Setup for types

# Registry of interning pools, keyed by the type being interned.
const pool = Dict{DataType, WeakKeyDict}()

# Fetch the interning pool for values of type `T`,
# creating and registering an empty one the first time `T` is requested.
@inline function get_pool(::Type{T})::WeakKeyDict{T, Nothing} where {T}
    return get!(() -> WeakKeyDict{T, Nothing}(), pool, T)
end


###################################

"""
    intern(s::T)

Return a reference to an interned instance of `s`.
If the interning pool does not yet hold such an instance, it is added first.
"""
function intern(s::T)::T where {T}
    return intern(T, s)
end

"""
    intern(::Type{T}, s)

Intern `s` as if it were of type `T`, converting it if required.

Note that this will lead to unexpected behaviour if the type of `s` and `T`
do not have equivalent equality and hash functions
(i.e. this is not safe if `hash(s) != hash(convert(T, s))`).
"""
function intern(::Type{T}, s)::T where {T}
    typed_pool = get_pool(T)
    return intern!(typed_pool, s)
end

"""
    intern(substr::SubString{T})

Intern a substring as its parent string type `T`.
"""
function intern(substr::SubString{T})::T where {T}
    return intern(T, substr)
end


#############################


macro i_str(s)
    # Rebuild an ordinary double-quoted literal from the macro's text and parse
    # it, so that interpolation works correctly; the parsed expression is
    # escaped so it is evaluated in the caller's scope.
    # NOTE(review): the text goes through `unescape_string` and is then parsed
    # again as a quoted literal, so a `"` produced by unescaping would end the
    # rebuilt literal early. Confirm the intended handling of escaped quotes.
    literal_source = string('"', unescape_string(s), '"')
    literal_expr = esc(Meta.parse(literal_source))
    return :($intern($literal_expr))
end

end # module InternedStrings
79 changes: 0 additions & 79 deletions src/corefunctionality.jl

This file was deleted.

61 changes: 25 additions & 36 deletions test/all_kinds_of_types.jl
Original file line number Diff line number Diff line change
@@ -1,16 +1,11 @@
using Base.Test
using InternedStrings

@testset "String" begin
ex1 = intern("ex")
@test ex1=="ex"
@test !(ex1==="ex")
@test ex1 == "ex"
@test !addr_eq(ex1, "ex")
ex2 = intern("ex")
@test ex1===ex2
@test addr_eq(ex1, ex2)
ex3 = intern(String, "ex")
@test ex1===ex3


@test addr_eq(ex1, ex3)

@testset "type inference" begin
@test ex1 isa String
Expand All @@ -20,17 +15,15 @@ using InternedStrings
end
end



@testset "SubString" begin
aa1, bb1, cc1 = intern.(split("aa bb cc"))
aa2, bb2, cc2 = intern.(split("aa bb cc"))
aa3, bb3, cc3 = intern.(String, split("aa bb cc"))

@test bb1=="bb"
@test !(bb1==="bb")
@test bb1===bb2
@test bb1===bb3
@test bb1 == "bb"
@test !addr_eq(bb1, "bb")
@test addr_eq(bb1, bb2)
@test addr_eq(bb1, bb3)

@testset "type inference" begin
@test intern(split("aa bb cc")[1]) isa String
Expand All @@ -44,30 +37,26 @@ end
using WeakRefStrings
s1 = "ex"
s2 = "ex"
ex1 = @inferred intern(String, WeakRefString(Vector{UInt8}(s1)))
@test ex1=="ex"
@test !(ex1===s1)
ex1 = @inferred intern(String, WeakRefString(unsafe_wrap(Vector{UInt8}, s1)))
@test !addr_eq(ex1, s1)
@test ex1 isa String
ex2 = @inferred intern(String, WeakRefString(Vector{UInt8}(s2)))
@test ex1===ex2
ex2 = @inferred intern(String, WeakRefString(unsafe_wrap(Vector{UInt8}, s2)))
@test addr_eq(ex1, ex2)
end

#== Uncomment when https://github.com/JuliaLang/julia/issues/26939 is fixed
@testset "BigFloat" begin let
pi1 = intern(BigFloat(π))
@test pi1==BigFloat(π)
@test !(pi1===BigFloat(π))
# Enable when https://github.com/JuliaLang/julia/issues/26939 is fixed
false && @testset "BigFloat" begin
let
pi1 = intern(BigFloat(π))
@test pi1 == BigFloat(π)
@test !addr_eq(pi1, BigFloat(π))

pi2 = intern(BigFloat(π))
@test pi1===pi2
pi2 = intern(BigFloat(π))
@test addr_eq(pi1, pi2)

@testset "type inference" begin
@test pi1 isa BigFloat
@inferred intern(BigFloat(π))
@testset "type inference" begin
@test pi1 isa BigFloat
@inferred intern(BigFloat(π))
end
end
end end
==#


dicts = [WeakKeyDict()]
end
Loading

0 comments on commit 34e9c24

Please sign in to comment.