Skip to content

Commit

Permalink
[Serialization] Delta-encode consecutive SourceLocations in TypeLoc
Browse files Browse the repository at this point in the history
Much of the size of PCH/PCM files comes from stored SourceLocations.
These are encoded using (almost) their raw value, VBR-encoded. Absolute
SourceLocations can be relatively large numbers, so this commonly takes
20-30 bits per location.

We can reduce this by exploiting redundancy: many "nearby" SourceLocations are
stored differing only slightly and can be delta-encoded.
Random-access loading of AST nodes constrains how long these sequences
can be, but we can do it at least within a node that always gets
deserialized as an atomic unit.

TypeLoc is implemented in this patch as it's a relatively small change
that shows most of the API.
This saves ~3.5% of PCH size. I have local changes applying this technique
further that save another 3%; I think it's possible to get to 10% total.

Differential Revision: https://reviews.llvm.org/D125403
  • Loading branch information
sam-mccall committed May 19, 2022
1 parent bbc6834 commit 4df795b
Show file tree
Hide file tree
Showing 9 changed files with 426 additions and 135 deletions.
25 changes: 14 additions & 11 deletions clang/include/clang/Serialization/ASTReader.h
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
#include "clang/Serialization/ModuleFile.h"
#include "clang/Serialization/ModuleFileExtension.h"
#include "clang/Serialization/ModuleManager.h"
#include "clang/Serialization/SourceLocationEncoding.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
Expand Down Expand Up @@ -396,6 +397,8 @@ class ASTReader
using ModuleReverseIterator = ModuleManager::ModuleReverseIterator;

private:
using LocSeq = SourceLocationSequence;

/// The receiver of some callbacks invoked by ASTReader.
std::unique_ptr<ASTReaderListener> Listener;

Expand Down Expand Up @@ -2155,16 +2158,16 @@ class ASTReader

/// Read a source location from raw form and return it in its
/// originating module file's source location space.
SourceLocation
ReadUntranslatedSourceLocation(SourceLocation::UIntTy Raw) const {
return SourceLocation::getFromRawEncoding((Raw >> 1) |
(Raw << (8 * sizeof(Raw) - 1)));
SourceLocation ReadUntranslatedSourceLocation(SourceLocation::UIntTy Raw,
LocSeq *Seq = nullptr) const {
return SourceLocationEncoding::decode(Raw, Seq);
}

/// Read a source location from raw form.
SourceLocation ReadSourceLocation(ModuleFile &ModuleFile,
SourceLocation::UIntTy Raw) const {
SourceLocation Loc = ReadUntranslatedSourceLocation(Raw);
SourceLocation::UIntTy Raw,
LocSeq *Seq = nullptr) const {
SourceLocation Loc = ReadUntranslatedSourceLocation(Raw, Seq);
return TranslateSourceLocation(ModuleFile, Loc);
}

Expand All @@ -2184,14 +2187,14 @@ class ASTReader

/// Read a source location.
SourceLocation ReadSourceLocation(ModuleFile &ModuleFile,
const RecordDataImpl &Record,
unsigned &Idx) {
return ReadSourceLocation(ModuleFile, Record[Idx++]);
const RecordDataImpl &Record, unsigned &Idx,
LocSeq *Seq = nullptr) {
return ReadSourceLocation(ModuleFile, Record[Idx++], Seq);
}

/// Read a source range.
SourceRange ReadSourceRange(ModuleFile &F,
const RecordData &Record, unsigned &Idx);
SourceRange ReadSourceRange(ModuleFile &F, const RecordData &Record,
unsigned &Idx, LocSeq *Seq = nullptr);

// Read a string
static std::string ReadString(const RecordData &Record, unsigned &Idx);
Expand Down
13 changes: 7 additions & 6 deletions clang/include/clang/Serialization/ASTRecordReader.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
#include "clang/AST/AbstractBasicReader.h"
#include "clang/Lex/Token.h"
#include "clang/Serialization/ASTReader.h"
#include "clang/Serialization/SourceLocationEncoding.h"
#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/APSInt.h"
Expand All @@ -30,6 +31,7 @@ class OMPChildren;
class ASTRecordReader
: public serialization::DataStreamBasicReader<ASTRecordReader> {
using ModuleFile = serialization::ModuleFile;
using LocSeq = SourceLocationSequence;

ASTReader *Reader;
ModuleFile *F;
Expand Down Expand Up @@ -160,8 +162,7 @@ class ASTRecordReader
TypeSourceInfo *readTypeSourceInfo();

/// Reads the location information for a type.
void readTypeLoc(TypeLoc TL);

void readTypeLoc(TypeLoc TL, LocSeq *Seq = nullptr);

/// Map a local type ID within a given AST file to a global type ID.
serialization::TypeID getGlobalTypeID(unsigned LocalID) const {
Expand Down Expand Up @@ -271,13 +272,13 @@ class ASTRecordReader
void readOMPChildren(OMPChildren *Data);

/// Read a source location, advancing Idx.
SourceLocation readSourceLocation() {
return Reader->ReadSourceLocation(*F, Record, Idx);
SourceLocation readSourceLocation(LocSeq *Seq = nullptr) {
return Reader->ReadSourceLocation(*F, Record, Idx, Seq);
}

/// Read a source range, advancing Idx.
SourceRange readSourceRange() {
return Reader->ReadSourceRange(*F, Record, Idx);
SourceRange readSourceRange(LocSeq *Seq = nullptr) {
return Reader->ReadSourceRange(*F, Record, Idx, Seq);
}

/// Read an arbitrary constant value, advancing Idx.
Expand Down
13 changes: 8 additions & 5 deletions clang/include/clang/Serialization/ASTRecordWriter.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
#include "clang/AST/AbstractBasicWriter.h"
#include "clang/AST/OpenMPClause.h"
#include "clang/Serialization/ASTWriter.h"
#include "clang/Serialization/SourceLocationEncoding.h"

namespace clang {

Expand All @@ -25,6 +26,8 @@ class TypeLoc;
/// An object for streaming information to a record.
class ASTRecordWriter
: public serialization::DataStreamBasicWriter<ASTRecordWriter> {
using LocSeq = SourceLocationSequence;

ASTWriter *Writer;
ASTWriter::RecordDataImpl *Record;

Expand Down Expand Up @@ -131,16 +134,16 @@ class ASTRecordWriter
void AddFunctionDefinition(const FunctionDecl *FD);

/// Emit a source location.
void AddSourceLocation(SourceLocation Loc) {
return Writer->AddSourceLocation(Loc, *Record);
void AddSourceLocation(SourceLocation Loc, LocSeq *Seq = nullptr) {
return Writer->AddSourceLocation(Loc, *Record, Seq);
}
void writeSourceLocation(SourceLocation Loc) {
AddSourceLocation(Loc);
}

/// Emit a source range.
void AddSourceRange(SourceRange Range) {
return Writer->AddSourceRange(Range, *Record);
void AddSourceRange(SourceRange Range, LocSeq *Seq = nullptr) {
return Writer->AddSourceRange(Range, *Record, Seq);
}

void writeBool(bool Value) {
Expand Down Expand Up @@ -206,7 +209,7 @@ class ASTRecordWriter
void AddTypeSourceInfo(TypeSourceInfo *TInfo);

/// Emits source location information for a type. Does not emit the type.
void AddTypeLoc(TypeLoc TL);
void AddTypeLoc(TypeLoc TL, LocSeq *Seq = nullptr);

/// Emits a template argument location info.
void AddTemplateArgumentLocInfo(TemplateArgument::ArgKind Kind,
Expand Down
9 changes: 7 additions & 2 deletions clang/include/clang/Serialization/ASTWriter.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
#include "clang/Serialization/ASTBitCodes.h"
#include "clang/Serialization/ASTDeserializationListener.h"
#include "clang/Serialization/PCHContainerOperations.h"
#include "clang/Serialization/SourceLocationEncoding.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
Expand Down Expand Up @@ -104,6 +105,8 @@ class ASTWriter : public ASTDeserializationListener,
using TypeIdxMap = llvm::DenseMap<QualType, serialization::TypeIdx,
serialization::UnsafeQualTypeDenseMapInfo>;

using LocSeq = SourceLocationSequence;

/// The bitstream writer used to emit this precompiled header.
llvm::BitstreamWriter &Stream;

Expand Down Expand Up @@ -581,10 +584,12 @@ class ASTWriter : public ASTDeserializationListener,
RecordDataImpl &Record);

/// Emit a source location.
void AddSourceLocation(SourceLocation Loc, RecordDataImpl &Record);
void AddSourceLocation(SourceLocation Loc, RecordDataImpl &Record,
LocSeq *Seq = nullptr);

/// Emit a source range.
void AddSourceRange(SourceRange Range, RecordDataImpl &Record);
void AddSourceRange(SourceRange Range, RecordDataImpl &Record,
LocSeq *Seq = nullptr);

/// Emit a reference to an identifier.
void AddIdentifierRef(const IdentifierInfo *II, RecordDataImpl &Record);
Expand Down
162 changes: 162 additions & 0 deletions clang/include/clang/Serialization/SourceLocationEncoding.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,162 @@
//===--- SourceLocationEncoding.h - Small serialized locations --*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Source locations are stored pervasively in the AST, making up a third of
// the size of typical serialized files. Storing them efficiently is important.
//
// We use integers optimized by VBR-encoding, because:
// - when abbreviations cannot be used, VBR6 encoding is our only choice
// - in the worst case a SourceLocation can be ~any 32-bit number, but in
// practice they are highly predictable
//
// We encode the integer so that likely values encode as small numbers that
// turn into few VBR chunks:
// - the invalid sentinel location is a very common value: it encodes as 0
// - the "macro or not" bit is stored at the bottom of the integer
// (rather than at the top, as in memory), so macro locations can have
// small representations.
// - related locations (e.g. of a left and right paren pair) are usually
// similar, so when encoding a sequence of locations we store only
// differences between successive elements.
//
//===----------------------------------------------------------------------===//

#include "clang/Basic/SourceLocation.h"
#include <climits>
#include <cstdint>

#ifndef LLVM_CLANG_SERIALIZATION_SOURCELOCATIONENCODING_H
#define LLVM_CLANG_SERIALIZATION_SOURCELOCATIONENCODING_H

namespace clang {
class SourceLocationSequence;

/// Serialized encoding of SourceLocations without context.
/// Optimized to have small unsigned values (=> small after VBR encoding).
///
/// Macro locations have the top bit set, we rotate by one so it is the low bit.
class SourceLocationEncoding {
  using UIntTy = SourceLocation::UIntTy;
  /// Width in bits of a raw (in-memory) source location value.
  constexpr static unsigned UIntBits = CHAR_BIT * sizeof(UIntTy);

  /// Rotates \p Raw left by one bit, so the most significant bit of the
  /// in-memory value becomes the least significant bit of the stored value.
  static UIntTy encodeRaw(UIntTy Raw) {
    return (Raw << 1) | (Raw >> (UIntBits - 1));
  }
  /// Inverse of encodeRaw(): rotates \p Raw right by one bit.
  static UIntTy decodeRaw(UIntTy Raw) {
    return (Raw >> 1) | (Raw << (UIntBits - 1));
  }
  // SourceLocationSequence reuses the rotation helpers and UIntBits.
  friend SourceLocationSequence;

public:
  /// Returns the serialized form of \p Loc.
  /// When \p Seq is non-null the encoding is delegated to that sequence;
  /// otherwise the rotated raw encoding of \p Loc is returned.
  static uint64_t encode(SourceLocation Loc,
                         SourceLocationSequence *Seq = nullptr);
  /// Inverse of encode(). \p Seq must be used (or omitted) in the same way as
  /// it was when \p Encoded was produced.
  static SourceLocation decode(uint64_t Encoded,
                               SourceLocationSequence *Seq = nullptr);
};

/// Serialized encoding of a sequence of SourceLocations.
///
/// Optimized to produce small values when locations within the sequence are
/// similar. Each element can be delta-encoded against the last nonzero element.
///
/// Sequences should be started by creating a SourceLocationSequence::State,
/// and then passed around as SourceLocationSequence*. Example:
///
///   // establishes a sequence
///   void EmitTopLevelThing() {
///     SourceLocationSequence::State Seq;
///     EmitContainedThing(Seq);
///     EmitRecursiveThing(Seq);
///   }
///
///   // optionally part of a sequence
///   void EmitContainedThing(SourceLocationSequence *Seq = nullptr) {
///     Record.push_back(SourceLocationEncoding::encode(SomeLoc, Seq));
///   }
///
///   // establishes a sequence if there isn't one already
///   void EmitRecursiveThing(SourceLocationSequence *ParentSeq = nullptr) {
///     SourceLocationSequence::State Seq(ParentSeq);
///     Record.push_back(SourceLocationEncoding::encode(SomeLoc, Seq));
///     EmitRecursiveThing(Seq);
///   }
///
class SourceLocationSequence {
  using UIntTy = SourceLocation::UIntTy;
  using EncodedTy = uint64_t;
  constexpr static auto UIntBits = SourceLocationEncoding::UIntBits;
  static_assert(sizeof(EncodedTy) > sizeof(UIntTy), "Need one extra bit!");

  // Prev stores the rotated last nonzero location.
  // It is a reference: the storage is owned elsewhere (see State).
  UIntTy &Prev;

  // Zig-zag encoding turns small signed integers into small unsigned integers.
  // 0 => 0, -1 => 1, 1 => 2, -2 => 3, ...
  static UIntTy zigZag(UIntTy V) {
    // Build the mask in the unsigned type: shifting a plain `1` (a signed
    // int) into its sign bit is not portable before C++20.
    constexpr UIntTy SignBit = UIntTy(1) << (UIntBits - 1);
    UIntTy Sign = (V & SignBit) ? UIntTy(-1) : UIntTy(0);
    return Sign ^ (V << 1);
  }
  // Inverse of zigZag. The low bit selects an all-ones or all-zeros mask;
  // it is written as `0 - x` rather than unary minus on an unsigned value.
  static UIntTy zagZig(UIntTy V) { return (V >> 1) ^ (UIntTy(0) - (V & 1)); }

  SourceLocationSequence(UIntTy &Prev) : Prev(Prev) {}

  // Encodes one raw location and advances the sequence state.
  //  - 0 always encodes as 0 and leaves Prev untouched.
  //  - The first nonzero location is stored as its rotated absolute value.
  //  - Later nonzero locations are stored as 1 + zigzag(delta from Prev).
  EncodedTy encodeRaw(UIntTy Raw) {
    if (Raw == 0)
      return 0;
    UIntTy Rotated = SourceLocationEncoding::encodeRaw(Raw);
    if (Prev == 0)
      return Prev = Rotated;
    UIntTy Delta = Rotated - Prev;
    Prev = Rotated;
    // Exactly one 33 bit value is possible! (1 << 32).
    // This is because we have two representations of zero: trivial & relative.
    return 1 + EncodedTy{zigZag(Delta)};
  }

  // Decodes one value produced by encodeRaw and advances the sequence state.
  // Must be called in the same order as the values were encoded.
  UIntTy decodeRaw(EncodedTy Encoded) {
    if (Encoded == 0)
      return 0;
    // The narrowing conversions below are intentional: an absolute value is a
    // rotated UIntTy, and (Encoded - 1) is a zig-zagged UIntTy delta.
    if (Prev == 0)
      Prev = static_cast<UIntTy>(Encoded);
    else
      Prev += zagZig(static_cast<UIntTy>(Encoded - 1));
    return SourceLocationEncoding::decodeRaw(Prev);
  }

public:
  /// Decodes the next location of the sequence from \p Encoded.
  SourceLocation decode(EncodedTy Encoded) {
    return SourceLocation::getFromRawEncoding(decodeRaw(Encoded));
  }
  /// Encodes \p Loc as the next location of the sequence.
  EncodedTy encode(SourceLocation Loc) {
    return encodeRaw(Loc.getRawEncoding());
  }

  class State;
};

/// Root object of a SourceLocationSequence: it provides the storage for the
/// "previous location" and hands out a SourceLocationSequence bound to it.
class SourceLocationSequence::State {
  // Storage used when this State starts a fresh sequence; 0 means that no
  // nonzero location has been seen yet.
  UIntTy Prev = 0;
  // Bound either to our own Prev or to the parent's (see constructor).
  // NOTE(review): State is implicitly copy-constructible, and a copy's Seq
  // would keep referring to the storage the original was bound to - confirm
  // that State objects are never copied.
  SourceLocationSequence Seq;

public:
  // Without a parent, a new sequence is established on top of our own
  // storage. With a non-null Parent, this object simply joins the enclosing
  // sequence by sharing the parent's storage.
  State(SourceLocationSequence *Parent = nullptr)
      : Seq(Parent == nullptr ? Prev : Parent->Prev) {}

  // Lets a State be passed wherever a SourceLocationSequence* is expected,
  // so roots and propagated sequences are used in the same way.
  operator SourceLocationSequence *() { return &Seq; }
};

/// Serializes \p Loc, through \p Seq when one is supplied and as a standalone
/// rotated raw value otherwise.
inline uint64_t SourceLocationEncoding::encode(SourceLocation Loc,
                                               SourceLocationSequence *Seq) {
  // Part of a sequence: let the sequence encode against its own state.
  if (Seq)
    return Seq->encode(Loc);
  // No sequence: context-free encoding.
  return encodeRaw(Loc.getRawEncoding());
}
/// Deserializes \p Encoded, through \p Seq when one is supplied and as a
/// standalone rotated raw value otherwise.
inline SourceLocation
SourceLocationEncoding::decode(uint64_t Encoded, SourceLocationSequence *Seq) {
  // Part of a sequence: let the sequence decode against its own state.
  if (Seq)
    return Seq->decode(Encoded);
  // No sequence: undo the context-free rotation.
  return SourceLocation::getFromRawEncoding(decodeRaw(Encoded));
}

} // namespace clang
#endif
Loading

0 comments on commit 4df795b

Please sign in to comment.