Skip to content

Commit

Permalink
de-duplicate strings in serialization (#35056)
Browse files Browse the repository at this point in the history
fixes #35030
  • Loading branch information
JeffBezanson authored Mar 19, 2020
1 parent 64d31fe commit d33c5a5
Show file tree
Hide file tree
Showing 2 changed files with 21 additions and 2 deletions.
15 changes: 13 additions & 2 deletions stdlib/Serialization/src/Serialization.jl
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ Serializer(io::IO) = Serializer{typeof(io)}(io)

const n_int_literals = 33
const n_reserved_slots = 24
const n_reserved_tags = 11
const n_reserved_tags = 10

const TAGS = Any[
Symbol, Int8, UInt8, Int16, UInt16, Int32, UInt32, Int64, UInt64, Int128, UInt128,
Expand All @@ -57,6 +57,7 @@ const TAGS = Any[
Symbol, # FULL_GLOBALREF_TAG
Symbol, # HEADER_TAG
Symbol, # IDDICT_TAG
Symbol, # SHARED_REF_TAG
fill(Symbol, n_reserved_tags)...,

(), Bool, Any, Bottom, Core.TypeofBottom, Type, svec(), Tuple{}, false, true, nothing,
Expand All @@ -76,7 +77,7 @@ const TAGS = Any[

@assert length(TAGS) == 255

const ser_version = 9 # do not make changes without bumping the version #!
const ser_version = 10 # do not make changes without bumping the version #!

const NTAGS = length(TAGS)

Expand Down Expand Up @@ -135,6 +136,7 @@ const REF_OBJECT_TAG = Int32(o0+13)
# Tag values are expressed as offsets from `o0` (defined outside this excerpt).
const FULL_GLOBALREF_TAG = Int32(o0+14)
const HEADER_TAG = Int32(o0+15)
const IDDICT_TAG = Int32(o0+16)
# Emitted ahead of a String longer than 7 bytes. On read, the object that follows
# is stored in the deserializer's table under the next counter slot.
# NOTE(review): presumably this is what lets a later back-reference resolve to the
# same String instead of a fresh copy — confirm against `serialize_cycle`.
const SHARED_REF_TAG = Int32(o0+17)

"""
    writetag(s::IO, tag)

Write `tag` to `s` as a single `UInt8` and return `nothing`.

The conversion `UInt8(tag)` throws an `InexactError` when `tag` does not fit in a byte.
"""
function writetag(s::IO, tag)
    byte = UInt8(tag)
    write(s, byte)
    return nothing
end

Expand Down Expand Up @@ -284,6 +286,10 @@ end

function serialize(s::AbstractSerializer, ss::String)
len = sizeof(ss)
if len > 7
serialize_cycle(s, ss) && return
writetag(s.io, SHARED_REF_TAG)
end
if len <= 255
writetag(s.io, STRING_TAG)
write(s.io, UInt8(len))
Expand Down Expand Up @@ -802,6 +808,11 @@ function handle_deserialize(s::AbstractSerializer, b::Int32)
push!(s.pending_refs, slot)
t = deserialize(s)
return deserialize(s, t)
elseif b == SHARED_REF_TAG
slot = s.counter; s.counter += 1
obj = deserialize(s)
s.table[slot] = obj
return obj
elseif b == SYMBOL_TAG
return deserialize_symbol(s, Int(read(s.io, UInt8)::UInt8))
elseif b == SHORTINT64_TAG
Expand Down
8 changes: 8 additions & 0 deletions stdlib/Serialization/test/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -582,3 +582,11 @@ let d = IdDict([1] => 2, [3] => 4), io = IOBuffer()
@test Dict(d) == Dict(ds)
@test all([k in keys(ds) for k in keys(ds)])
end

# issue #35030, shared references to Strings
# A String that occurs twice in the serialized value should come back as one shared
# object rather than two independent copies.
let s = join(rand('a':'z', 1024)), io = IOBuffer()
    serialize(io, (s, s))
    seekstart(io)
    roundtrip = deserialize(io)
    # The contents must survive the round trip; a size check alone would also pass
    # for a corrupted or truncated result.
    @test roundtrip == (s, s)
    # With both slots sharing one String, the footprint stays below two full copies.
    @test Base.summarysize(roundtrip) < 2*sizeof(s)
end

2 comments on commit d33c5a5

@nanosoldier
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Executing the daily benchmark build, I will reply here when finished:

@nanosoldier runbenchmarks(ALL, isdaily = true)

@nanosoldier
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Your benchmark job has completed - possible performance regressions were detected. A full report can be found here. cc @ararslan

Please sign in to comment.