diff --git a/rust/load/src/bit_difference.rs b/rust/load/src/bit_difference.rs index 4fcbabc3cd0..bedeec9efa4 100644 --- a/rust/load/src/bit_difference.rs +++ b/rust/load/src/bit_difference.rs @@ -32,7 +32,7 @@ use guacamole::{FromGuacamole, Guacamole, Zipf}; use siphasher::sip::SipHasher24; use tracing::Instrument; -use crate::words::WORDS; +use crate::words::MANY_WORDS; use crate::{DataSet, GetQuery, KeySelector, QueryQuery, Skew, UpsertQuery}; const EMBEDDING_BYTES: usize = 128; @@ -101,7 +101,7 @@ impl Document { pub fn embedding(&self) -> Vec { let mut result = vec![]; let words = self.content.split_whitespace().collect::>(); - for word in WORDS.iter() { + for word in MANY_WORDS.iter() { if words.contains(word) { result.push(1.0); } else { @@ -114,7 +114,7 @@ impl Document { impl From<[u8; EMBEDDING_BYTES]> for Document { fn from(embedding: [u8; EMBEDDING_BYTES]) -> Document { - let document = WORDS + let document = MANY_WORDS .iter() .enumerate() .filter_map(|(idx, word)| { @@ -388,7 +388,7 @@ mod tests { #[test] fn constants() { - assert_eq!(EMBEDDING_SIZE, WORDS.len()); + assert_eq!(EMBEDDING_SIZE, MANY_WORDS.len()); } mod synthethic { diff --git a/rust/load/src/lib.rs b/rust/load/src/lib.rs index 907a1e71d7d..bc09c70da6b 100644 --- a/rust/load/src/lib.rs +++ b/rust/load/src/lib.rs @@ -306,6 +306,9 @@ pub enum WhereMixin { /// A raw metadata query simply copies the provided filter spec. #[serde(rename = "query")] Constant(serde_json::Value), + /// Search for a word from the provided set of words with skew. + #[serde(rename = "fts")] + FullTextSearch(Skew), /// The tiny stories workload. The way these collections were setup, there are three fields /// each of integer, float, and string. The integer fields are named i1, i2, and i3. The /// float fields are named f1, f2, and f3. The string fields are named s1, s2, and s3. @@ -325,6 +328,17 @@ impl WhereMixin { pub fn to_json(&self, guac: &mut Guacamole) -> serde_json::Value { match self { Self::Constant(query) => query.clone(), + Self::FullTextSearch(skew) => { + const WORDS: &[&str] = words::FEW_WORDS; + let word = match skew { + Skew::Uniform => WORDS[uniform(0, WORDS.len() as u64)(guac) as usize], + Skew::Zipf { theta } => { + let z = Zipf::from_alpha(WORDS.len() as u64, *theta); + WORDS[z.next(guac) as usize] + } + }; + serde_json::json!({"$contains": word.to_string()}) + } Self::TinyStories(mixin) => mixin.to_json(guac), Self::Select(select) => { let scale: f64 = any(guac); diff --git a/rust/load/src/words.rs b/rust/load/src/words.rs index f77f185c7b1..77f9023c224 100644 --- a/rust/load/src/words.rs +++ b/rust/load/src/words.rs @@ -1,4 +1,4 @@ -pub const WORDS: &[&str] = &[ +pub const MANY_WORDS: &[&str] = &[ "man’s", "sought", "touch", @@ -1024,3 +1024,1006 @@ pub const WORDS: &[&str] = &[ "and", "the", ]; + +pub const FEW_WORDS: &[&str] = &[ + "the", + "of", + "to", + "and", + "a", + "in", + "is", + "it", + "you", + "that", + "he", + "was", + "for", + "on", + "are", + "with", + "as", + "I", + "his", + "they", + "be", + "at", + "one", + "have", + "this", + "from", + "or", + "had", + "by", + "not", + "word", + "but", + "what", + "some", + "we", + "can", + "out", + "other", + "were", + "all", + "there", + "when", + "up", + "use", + "your", + "how", + "said", + "an", + "each", + "she", + "which", + "do", + "their", + "time", + "if", + "will", + "way", + "about", + "many", + "then", + "them", + "write", + "would", + "like", + "so", + "these", + "her", + "long", + "make", + "thing", + "see", + "him", + "two", + "has", + "look", + "more", + "day", + "could", + "go", + "come", + "did", + "number", + "sound", + "no", + "most", + "people", + "my", + "over", + "know", + "water", + "than", + "call", + "first", + "who", + "may", + "down", + "side", + "been", + "now", + "find", + "any", + "new", + "work", + "part", + "take", + "get", + "place", + "made", + "live", + "where", + "after", + "back", + "little", + "only", + "round", + "man", + "year", + "came", + "show", + "every", + "good", + "me", + "give", + "our", + "under", + "name", + "very", + "through", + "just", + "form", + "sentence", + "great", + "think", + "say", + "help", + "low", + "line", + "differ", + "turn", + "cause", + "much", + "mean", + "before", + "move", + "right", + "boy", + "old", + "too", + "same", + "tell", + "does", + "set", + "three", + "want", + "air", + "well", + "also", + "play", + "small", + "end", + "put", + "home", + "read", + "hand", + "port", + "large", + "spell", + "add", + "even", + "land", + "here", + "must", + "big", + "high", + "such", + "follow", + "act", + "why", + "ask", + "men", + "change", + "went", + "light", + "kind", + "off", + "need", + "house", + "picture", + "try", + "us", + "again", + "animal", + "point", + "mother", + "world", + "near", + "build", + "self", + "earth", + "father", + "head", + "stand", + "own", + "page", + "should", + "country", + "found", + "answer", + "school", + "grow", + "study", + "still", + "learn", + "plant", + "cover", + "food", + "sun", + "four", + "between", + "state", + "keep", + "eye", + "never", + "last", + "let", + "thought", + "city", + "tree", + "cross", + "farm", + "hard", + "start", + "might", + "story", + "saw", + "far", + "sea", + "draw", + "left", + "late", + "run", + "don't", + "while", + "press", + "close", + "night", + "real", + "life", + "few", + "north", + "open", + "seem", + "together", + "next", + "white", + "children", + "begin", + "got", + "walk", + "example", + "ease", + "paper", + "group", + "always", + "music", + "those", + "both", + "mark", + "often", + "letter", + "until", + "mile", + "river", + "car", + "feet", + "care", + "second", + "book", + "carry", + "took", + "science", + "eat", + "room", + "friend", + "began", + "idea", + "fish", + "mountain", + "stop", + "once", + "base", + "hear", + "horse", + "cut", + "sure", + "watch", + "color", + "face", + "wood", + "main", + "enough", + "plain", + "girl", + "usual", + "young", + "ready", + "above", + "ever", + "red", + "list", + "though", + "feel", + "talk", + "bird", + "soon", + "body", + "dog", + "family", + "direct", + "pose", + "leave", + "song", + "measure", + "door", + "product", + "black", + "short", + "numeral", + "class", + "wind", + "question", + "happen", + "complete", + "ship", + "area", + "half", + "rock", + "order", + "fire", + "south", + "problem", + "piece", + "told", + "knew", + "pass", + "since", + "top", + "whole", + "king", + "space", + "heard", + "best", + "hour", + "better", + "true", + "during", + "hundred", + "five", + "remember", + "step", + "early", + "hold", + "west", + "ground", + "interest", + "reach", + "fast", + "verb", + "sing", + "listen", + "six", + "table", + "travel", + "less", + "morning", + "ten", + "simple", + "several", + "vowel", + "toward", + "war", + "lay", + "against", + "pattern", + "slow", + "center", + "love", + "person", + "money", + "serve", + "appear", + "road", + "map", + "rain", + "rule", + "govern", + "pull", + "cold", + "notice", + "voice", + "unit", + "power", + "town", + "fine", + "certain", + "fly", + "fall", + "lead", + "cry", + "dark", + "machine", + "note", + "wait", + "plan", + "figure", + "star", + "box", + "noun", + "field", + "rest", + "correct", + "able", + "pound", + "done", + "beauty", + "drive", + "stood", + "contain", + "front", + "teach", + "week", + "final", + "gave", + "green", + "oh", + "quick", + "develop", + "ocean", + "warm", + "free", + "minute", + "strong", + "special", + "mind", + "behind", + "clear", + "tail", + "produce", + "fact", + "street", + "inch", + "multiply", + "nothing", + "course", + "stay", + "wheel", + "full", + "force", + "blue", + "object", + "decide", + "surface", + "deep", + "moon", + "island", + "foot", + "system", + "busy", + "test", + "record", + "boat", + "common", + "gold", + "possible", + "plane", + "stead", + "dry", + "wonder", + "laugh", + "thousand", + "ago", + "ran", + "check", + "game", + "shape", + "equate", + "hot", + "miss", + "brought", + "heat", + "snow", + "tire", + "bring", + "yes", + "distant", + "fill", + "east", + "paint", + "language", + "among", + "grand", + "ball", + "yet", + "wave", + "drop", + "heart", + "am", + "present", + "heavy", + "dance", + "engine", + "position", + "arm", + "wide", + "sail", + "material", + "size", + "vary", + "settle", + "speak", + "weight", + "general", + "ice", + "matter", + "circle", + "pair", + "include", + "divide", + "syllable", + "felt", + "perhaps", + "pick", + "sudden", + "count", + "square", + "reason", + "length", + "represent", + "art", + "subject", + "region", + "energy", + "hunt", + "probable", + "bed", + "brother", + "egg", + "ride", + "cell", + "believe", + "fraction", + "forest", + "sit", + "race", + "window", + "store", + "summer", + "train", + "sleep", + "prove", + "lone", + "leg", + "exercise", + "wall", + "catch", + "mount", + "wish", + "sky", + "board", + "joy", + "winter", + "sat", + "written", + "wild", + "instrument", + "kept", + "glass", + "grass", + "cow", + "job", + "edge", + "sign", + "visit", + "past", + "soft", + "fun", + "bright", + "gas", + "weather", + "month", + "million", + "bear", + "finish", + "happy", + "hope", + "flower", + "clothe", + "strange", + "gone", + "jump", + "baby", + "eight", + "village", + "meet", + "root", + "buy", + "raise", + "solve", + "metal", + "whether", + "push", + "seven", + "paragraph", + "third", + "shall", + "held", + "hair", + "describe", + "cook", + "floor", + "either", + "result", + "burn", + "hill", + "safe", + "cat", + "century", + "consider", + "type", + "law", + "bit", + "coast", + "copy", + "phrase", + "silent", + "tall", + "sand", + "soil", + "roll", + "temperature", + "finger", + "industry", + "value", + "fight", + "lie", + "beat", + "excite", + "natural", + "view", + "sense", + "ear", + "else", + "quite", + "broke", + "case", + "middle", + "kill", + "son", + "lake", + "moment", + "scale", + "loud", + "spring", + "observe", + "child", + "straight", + "consonant", + "nation", + "dictionary", + "milk", + "speed", + "method", + "organ", + "pay", + "age", + "section", + "dress", + "cloud", + "surprise", + "quiet", + "stone", + "tiny", + "climb", + "cool", + "design", + "poor", + "lot", + "experiment", + "bottom", + "key", + "iron", + "single", + "stick", + "flat", + "twenty", + "skin", + "smile", + "crease", + "hole", + "trade", + "melody", + "trip", + "office", + "receive", + "row", + "mouth", + "exact", + "symbol", + "die", + "least", + "trouble", + "shout", + "except", + "wrote", + "seed", + "tone", + "join", + "suggest", + "clean", + "break", + "lady", + "yard", + "rise", + "bad", + "blow", + "oil", + "blood", + "touch", + "grew", + "cent", + "mix", + "team", + "wire", + "cost", + "lost", + "brown", + "wear", + "garden", + "equal", + "sent", + "choose", + "fell", + "fit", + "flow", + "fair", + "bank", + "collect", + "save", + "control", + "decimal", + "gentle", + "woman", + "captain", + "practice", + "separate", + "difficult", + "doctor", + "please", + "protect", + "noon", + "whose", + "locate", + "ring", + "character", + "insect", + "caught", + "period", + "indicate", + "radio", + "spoke", + "atom", + "human", + "history", + "effect", + "electric", + "expect", + "crop", + "modern", + "element", + "hit", + "student", + "corner", + "party", + "supply", + "bone", + "rail", + "imagine", + "provide", + "agree", + "thus", + "capital", + "won't", + "chair", + "danger", + "fruit", + "rich", + "thick", + "soldier", + "process", + "operate", + "guess", + "necessary", + "sharp", + "wing", + "create", + "neighbor", + "wash", + "bat", + "rather", + "crowd", + "corn", + "compare", + "poem", + "string", + "bell", + "depend", + "meat", + "rub", + "tube", + "famous", + "dollar", + "stream", + "fear", + "sight", + "thin", + "triangle", + "planet", + "hurry", + "chief", + "colony", + "clock", + "mine", + "tie", + "enter", + "major", + "fresh", + "search", + "send", + "yellow", + "gun", + "allow", + "print", + "dead", + "spot", + "desert", + "suit", + "current", + "lift", + "rose", + "continue", + "block", + "chart", + "hat", + "sell", + "success", + "company", + "subtract", + "event", + "particular", + "deal", + "swim", + "term", + "opposite", + "wife", + "shoe", + "shoulder", + "spread", + "arrange", + "camp", + "invent", + "cotton", + "born", + "determine", + "quart", + "nine", + "truck", + "noise", + "level", + "chance", + "gather", + "shop", + "stretch", + "throw", + "shine", + "property", + "column", + "molecule", + "select", + "wrong", + "gray", + "repeat", + "require", + "broad", + "prepare", + "salt", + "nose", + "plural", + "anger", + "claim", + "continent", + "oxygen", + "sugar", + "death", + "pretty", + "skill", + "women", + "season", + "solution", + "magnet", + "silver", + "thank", + "branch", + "match", + "suffix", + "especially", + "fig", + "afraid", + "huge", + "sister", + "steel", + "discuss", + "forward", + "similar", + "guide", + "experience", + "score", + "apple", + "bought", + "led", + "pitch", + "coat", + "mass", + "card", + "band", + "rope", + "slip", + "win", + "dream", + "evening", + "condition", + "feed", + "tool", + "total", + "basic", + "smell", + "valley", + "nor", + "double", + "seat", + "arrive", + "master", + "track", + "parent", + "shore", + "division", + "sheet", + "substance", + "favor", + "connect", + "post", + "spend", + "chord", + "fat", + "glad", + "original", + "share", + "station", + "dad", + "bread", + "charge", + "proper", + "bar", + "offer", + "segment", + "slave", + "duck", + "instant", + "market", + "degree", + "populate", + "chick", + "dear", + "enemy", + "reply", + "drink", + "occur", + "support", + "speech", + "nature", + "range", + "steam", + "motion", + "path", + "liquid", + "log", + "meant", + "quotient", + "teeth", + "shell", + "neck", +]; diff --git a/rust/load/src/workloads.rs b/rust/load/src/workloads.rs index f51c199bcc7..06aca191c8f 100644 --- a/rust/load/src/workloads.rs +++ b/rust/load/src/workloads.rs @@ -22,9 +22,7 @@ pub fn all_workloads() -> HashMap { skew: Skew::Zipf { theta: 0.999 }, limit: Distribution::Constant(10), metadata: None, - document: Some(WhereMixin::Constant( - serde_json::json!({"$contains": "the"}), - )), + document: Some(WhereMixin::FullTextSearch(Skew::Zipf { theta: 0.99 })), }), ), ( @@ -56,9 +54,7 @@ pub fn all_workloads() -> HashMap { skew: Skew::Zipf { theta: 0.999 }, limit: Distribution::Constant(10), metadata: None, - document: Some(WhereMixin::Constant( - serde_json::json!({"$contains": "the"}), - )), + document: Some(WhereMixin::FullTextSearch(Skew::Zipf { theta: 0.99 })), }), ), ( @@ -83,9 +79,7 @@ pub fn all_workloads() -> HashMap { skew: Skew::Zipf { theta: 0.999 }, limit: Distribution::Constant(10), metadata: None, - document: Some(WhereMixin::Constant( - serde_json::json!({"$contains": "the"}), - )), + document: Some(WhereMixin::FullTextSearch(Skew::Zipf { theta: 0.99 })), }), ), (