Skip to content

Commit

Permalink
Auto merge of #5121 - Eh2406:string_interning, r=alexcrichton
Browse files Browse the repository at this point in the history
String interning

This builds on the work from #5118. This interns the strings in the part of resolver that gets cloned a lot.

In a test on #4810 (comment)
Before we got to 1700000 ticks in ~(63 to 67) sec from #5118
After we got to 1700000 ticks in ~(42 to 45) sec

The interning code itself would be much better with a `leak` function that converts a `String` to a `&'static str`. Something like:
```rust
pub fn leek(s: String) -> &'static str {
    let ptr = s.as_ptr();
    let len = s.len();
    mem::forget(s);
    unsafe {
        let slice = slice::from_raw_parts(ptr, len);
        str::from_utf8(slice).unwrap()
    }
}
```
but "there is no unsafe in Cargo", and I am not the best at unsafe. So I just `to_string` and lived with the extra copy. Is there a better way to hand out references?

I assumed that `InternedString::new` world start appearing in profile result, and that we would want `PackageId`, and `Summary`, Et Al. to store the `InternedString`. That is why I put the interner in a shared folder. So far it is just used in the resolver. It may make sense for a lot more of the Strings to be interned, but with the extra copy... I have not explored it yet.
  • Loading branch information
bors committed Mar 7, 2018
2 parents 879f483 + 98480e8 commit 8fde4e3
Show file tree
Hide file tree
Showing 5 changed files with 85 additions and 15 deletions.
1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ glob = "0.2"
hex = "0.3"
home = "0.3"
ignore = "0.4"
lazy_static = "1.0.0"
jobserver = "0.1.9"
lazycell = "0.6"
libc = "0.2"
Expand Down
68 changes: 68 additions & 0 deletions src/cargo/core/interning.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
use std::sync::RwLock;
use std::collections::HashSet;
use std::slice;
use std::str;
use std::mem;
use std::cmp::Ordering;
use std::ops::Deref;

pub fn leek(s: String) -> &'static str {
let boxed = s.into_boxed_str();
let ptr = boxed.as_ptr();
let len = boxed.len();
mem::forget(boxed);
unsafe {
let slice = slice::from_raw_parts(ptr, len);
str::from_utf8_unchecked(slice)
}
}

lazy_static! {
static ref STRING_CASHE: RwLock<HashSet<&'static str>> =
RwLock::new(HashSet::new());
}

#[derive(Eq, PartialEq, Hash, Clone, Copy)]
pub struct InternedString {
ptr: *const u8,
len: usize,
}

impl InternedString {
pub fn new(str: &str) -> InternedString {
let mut cache = STRING_CASHE.write().unwrap();
if let Some(&s) = cache.get(str) {
return InternedString { ptr: s.as_ptr(), len: s.len() };
}
let s = leek(str.to_string());
cache.insert(s);
InternedString { ptr: s.as_ptr(), len: s.len() }
}
}

impl Deref for InternedString {
type Target = str;

fn deref(&self) -> &'static str {
unsafe {
let slice = slice::from_raw_parts(self.ptr, self.len);
&str::from_utf8_unchecked(slice)
}
}
}

impl Ord for InternedString {
fn cmp(&self, other: &InternedString) -> Ordering {
let str: &str = &*self;
str.cmp(&*other)
}
}

impl PartialOrd for InternedString {
fn partial_cmp(&self, other: &InternedString) -> Option<Ordering> {
Some(self.cmp(other))
}
}

unsafe impl Send for InternedString {}
unsafe impl Sync for InternedString {}
1 change: 1 addition & 0 deletions src/cargo/core/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ pub mod resolver;
pub mod summary;
pub mod shell;
pub mod registry;
mod interning;
mod package_id_spec;
mod workspace;
mod features;
29 changes: 14 additions & 15 deletions src/cargo/core/resolver/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ use url::Url;

use core::{PackageId, Registry, SourceId, Summary, Dependency};
use core::PackageIdSpec;
use core::interning::InternedString;
use util::config::Config;
use util::Graph;
use util::errors::{CargoResult, CargoError};
Expand Down Expand Up @@ -343,8 +344,8 @@ struct Context {
// switch to persistent hash maps if we can at some point or otherwise
// make these much cheaper to clone in general.
activations: Activations,
resolve_features: HashMap<PackageId, HashSet<String>>,
links: HashMap<String, PackageId>,
resolve_features: HashMap<PackageId, HashSet<InternedString>>,
links: HashMap<InternedString, PackageId>,

// These are two cheaply-cloneable lists (O(1) clone) which are effectively
// hash maps but are built up as "construction lists". We'll iterate these
Expand All @@ -356,7 +357,7 @@ struct Context {
warnings: RcList<String>,
}

type Activations = HashMap<String, HashMap<SourceId, Rc<Vec<Summary>>>>;
type Activations = HashMap<InternedString, HashMap<SourceId, Rc<Vec<Summary>>>>;

/// Builds the list of all packages required to build the first argument.
pub fn resolve(summaries: &[(Summary, Method)],
Expand All @@ -382,7 +383,7 @@ pub fn resolve(summaries: &[(Summary, Method)],
metadata: BTreeMap::new(),
replacements: cx.resolve_replacements(),
features: cx.resolve_features.iter().map(|(k, v)| {
(k.clone(), v.clone())
(k.clone(), v.iter().map(|x| x.to_string()).collect())
}).collect(),
unused_patches: Vec::new(),
};
Expand Down Expand Up @@ -717,7 +718,7 @@ impl RemainingCandidates {
fn next(
&mut self,
prev_active: &[Summary],
links: &HashMap<String, PackageId>,
links: &HashMap<InternedString, PackageId>,
) -> Result<(Candidate, bool), HashMap<PackageId, ConflictReason>> {
// Filter the set of candidates based on the previously activated
// versions for this dependency. We can actually use a version if it
Expand All @@ -734,7 +735,7 @@ impl RemainingCandidates {
use std::mem::replace;
for (_, b) in self.remaining.by_ref() {
if let Some(link) = b.summary.links() {
if let Some(a) = links.get(link) {
if let Some(a) = links.get(&InternedString::new(link)) {
if a != b.summary.package_id() {
self.conflicting_prev_active
.entry(a.clone())
Expand Down Expand Up @@ -1304,14 +1305,14 @@ impl Context {
method: &Method) -> CargoResult<bool> {
let id = summary.package_id();
let prev = self.activations
.entry(id.name().to_string())
.entry(InternedString::new(id.name()))
.or_insert_with(HashMap::new)
.entry(id.source_id().clone())
.or_insert_with(||Rc::new(Vec::new()));
if !prev.iter().any(|c| c == summary) {
self.resolve_graph.push(GraphNode::Add(id.clone()));
if let Some(link) = summary.links() {
ensure!(self.links.insert(link.to_owned(), id.clone()).is_none(),
ensure!(self.links.insert(InternedString::new(link), id.clone()).is_none(),
"Attempting to resolve a with more then one crate with the links={}. \n\
This will not build as is. Consider rebuilding the .lock file.", link);
}
Expand All @@ -1332,8 +1333,8 @@ impl Context {
let has_default_feature = summary.features().contains_key("default");
Ok(match self.resolve_features.get(id) {
Some(prev) => {
features.iter().all(|f| prev.contains(f)) &&
(!use_default || prev.contains("default") ||
features.iter().all(|f| prev.contains(&InternedString::new(f))) &&
(!use_default || prev.contains(&InternedString::new("default")) ||
!has_default_feature)
}
None => features.is_empty() && (!use_default || !has_default_feature)
Expand Down Expand Up @@ -1367,14 +1368,14 @@ impl Context {
}

fn prev_active(&self, dep: &Dependency) -> &[Summary] {
self.activations.get(dep.name())
self.activations.get(&InternedString::new(dep.name()))
.and_then(|v| v.get(dep.source_id()))
.map(|v| &v[..])
.unwrap_or(&[])
}

fn is_active(&self, id: &PackageId) -> bool {
self.activations.get(id.name())
self.activations.get(&InternedString::new(id.name()))
.and_then(|v| v.get(id.source_id()))
.map(|v| v.iter().any(|s| s.package_id() == id))
.unwrap_or(false)
Expand Down Expand Up @@ -1448,9 +1449,7 @@ impl Context {
let set = self.resolve_features.entry(pkgid.clone())
.or_insert_with(HashSet::new);
for feature in reqs.used {
if !set.contains(feature) {
set.insert(feature.to_string());
}
set.insert(InternedString::new(feature));
}
}

Expand Down
1 change: 1 addition & 0 deletions src/cargo/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ extern crate home;
extern crate ignore;
extern crate jobserver;
extern crate lazycell;
#[macro_use] extern crate lazy_static;
extern crate libc;
extern crate libgit2_sys;
extern crate num_cpus;
Expand Down

0 comments on commit 8fde4e3

Please sign in to comment.