Feat: adds TF-IDF (#169)
* feat: adds term frequency to index

* perf: moves parseInt function to more performant shorthand

* chore(lyra): initializes new branch

* feat(lyra): adds frequency map

* feat: wip

* feat(lyra): adds tokens number to frequency map

* feat(lyra): adds tuple-based term frequencies data

* feat(lyra): removes redundant data

* feat(lyra): work in progress on sorting

* refactor: removes dead code

* feat(lyra): adds tf-idf-based results sorting

* feat(lyra): completes tf-idf-based ranking

* feat(lyra): adds frequencies and tokenOccurrencies to load function

* feat(lyra): removes token frequencies and occurrencies on docs deletion

* refactor: minor refactors

* fix: fixes exports
micheleriva authored Nov 14, 2022
1 parent cc94cf3 commit b363a30
Showing 10 changed files with 323 additions and 164 deletions.
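In short: every indexed string property now tracks per-document term frequencies and index-wide token occurrence counts, and search results are ranked by a TF-IDF score instead of raw insertion order. A minimal sketch of the score as computed in this diff (the function name is illustrative, not part of Lyra's API; note that the IDF denominator here is the total number of occurrences of the term across the index, not the number of documents containing it):

```typescript
// Sketch of the TF-IDF score introduced by this commit.
function tfIdfScore(
  termCountInDoc: number,   // occurrences of the term in one document field
  tokensInDoc: number,      // total tokens in that field
  totalDocs: number,        // N = Object.keys(lyra.docs).length
  termOccurrencies: number, // occurrences of the term across the whole index
): number {
  const tf = termCountInDoc / tokensInDoc;               // term frequency
  const idf = Math.log10(totalDocs / termOccurrencies);  // inverse document frequency
  return tf * idf;
}
```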
2 changes: 1 addition & 1 deletion internals/index.ts
@@ -1,3 +1,3 @@
export { formatNanoseconds, getNanosecondsTime, intersectMany, includes } from "../src/utils";
export { formatNanoseconds, getNanosecondsTime, intersectTokenScores, includes } from "../src/utils";
export { boundedLevenshtein } from "../src/levenshtein";
export { tokenize } from "../src/tokenizer";
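`intersectMany` used to intersect plain arrays of document IDs; its replacement `intersectTokenScores` works on scored `[id, score]` tuples. The `src/utils` diff is not expanded on this page, so the following is only an assumed sketch of its contract — keep ids that appear in every per-token list and accumulate their scores — not the shipped code:

```typescript
type TokenScore = [string, number];

// Assumed behaviour: a document survives the intersection only if it matched
// every token, and its per-token TF-IDF scores are summed along the way.
function intersectTokenScores(arrays: TokenScore[][]): TokenScore[] {
  if (arrays.length === 0) return [];

  const hits = new Map<string, { count: number; score: number }>();

  for (const arr of arrays) {
    for (const [id, score] of arr) {
      const entry = hits.get(id) ?? { count: 0, score: 0 };
      entry.count++;
      entry.score += score;
      hits.set(id, entry);
    }
  }

  const result: TokenScore[] = [];
  for (const [id, { count, score }] of hits) {
    if (count === arrays.length) result.push([id, score]);
  }
  return result;
}
```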
154 changes: 127 additions & 27 deletions src/lyra.ts
@@ -8,11 +8,24 @@ import { create as createNode, Node } from "./prefix-tree/node";
import { find as trieFind, insert as trieInsert, removeDocumentByWord, Nodes } from "./prefix-tree/trie";
import { trackInsertion } from "./insertion-checker";
import { availableStopWords, stopWords } from "./tokenizer/stop-words";
import { intersectMany } from "./utils";
import { intersectTokenScores, insertSortedValue, sortTokenScorePredicate } from "./utils";

export type TokenScore = [string, number];
type Index = Record<string, Node>;
type TokenMap = Record<string, string[]>;
type TokenMap = Record<string, TokenScore[]>;
type IndexMap = Record<string, TokenMap>;
type FrequencyMap = {
[property: string]: {
[documentID: string]: {
[token: string]: number;
};
};
};
type TokenOccurrency = {
[property: string]: {
[token: string]: number;
};
};

export { formatNanoseconds } from "./utils";
export { tokenize } from "./tokenizer";
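To make the two new shapes concrete, this is hypothetical content for both maps after indexing a single document `{ title: "quick brown fox quick" }` with id `doc1` (illustrative data; assumes a tokenizer that simply lowercases and splits):

```typescript
const frequencies: FrequencyMap = {
  title: {
    doc1: { quick: 0.5, brown: 0.25, fox: 0.25 }, // term count / 4 tokens
  },
};

const tokenOccurrencies: TokenOccurrency = {
  title: { quick: 2, brown: 1, fox: 1 }, // totals across all indexed docs
};
```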
@@ -70,6 +83,8 @@ export type Data<S extends PropertiesSchema> = {
index: Index;
nodes: Nodes;
schema: S;
frequencies: FrequencyMap;
tokenOccurrencies: TokenOccurrency;
};

export interface Lyra<S extends PropertiesSchema> extends Data<S> {
@@ -78,6 +93,7 @@ export interface Lyra<S extends PropertiesSchema> extends Data<S> {
edge: boolean;
hooks: Hooks;
tokenizer?: TokenizerConfig;
frequencies: FrequencyMap;
}

export type InsertConfig = {
@@ -206,19 +222,20 @@ function recursiveCheckDocSchema<S extends PropertiesSchema>(
}

function recursiveTrieInsertion<S extends PropertiesSchema>(
index: Index,
nodes: Nodes,
lyra: Lyra<S>,
doc: ResolveSchema<S>,
id: string,
config: InsertConfig,
prefix = "",
tokenizerConfig: TokenizerConfigExec,
) {
const { index, nodes, frequencies, tokenOccurrencies } = lyra;

for (const key of Object.keys(doc)) {
const isNested = typeof doc[key] === "object";
const propName = `${prefix}${key}`;
if (isNested) {
recursiveTrieInsertion(index, nodes, doc[key] as ResolveSchema<S>, id, config, propName + ".", tokenizerConfig);
recursiveTrieInsertion(lyra, doc[key] as ResolveSchema<S>, id, config, propName + ".", tokenizerConfig);
}

if (typeof doc[key] === "string") {
@@ -227,7 +244,37 @@ function recursiveTrieInsertion<S extends PropertiesSchema>(
const requestedTrie = index[propName];
const tokens = tokenizerConfig.tokenizerFn(doc[key] as string, config.language, false, tokenizerConfig);

if (!(propName in frequencies)) {
frequencies[propName] = {};
}

if (!(propName in tokenOccurrencies)) {
tokenOccurrencies[propName] = {};
}

if (!(id in frequencies[propName])) {
frequencies[propName][id] = {};
}

for (const token of tokens) {
let tokenFrequency = 0;

for (const t of tokens) {
if (t === token) {
tokenFrequency++;
}
}

const tf = tokenFrequency / tokens.length;

frequencies[propName][id][token] = tf;

if (!(token in tokenOccurrencies[propName])) {
tokenOccurrencies[propName][token] = 0;
}

tokenOccurrencies[propName][token]++;

trieInsert(nodes, requestedTrie, token, id);
}
}
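Counting `tokenFrequency` with a nested loop makes insertion quadratic in the number of tokens per field. A single-pass sketch that yields the same `tf` values (an alternative shown for comparison, not what this commit ships):

```typescript
// Computes { token -> termFrequency } in one pass over the token list.
function termFrequencies(tokens: string[]): Record<string, number> {
  const counts = new Map<string, number>();
  for (const token of tokens) {
    counts.set(token, (counts.get(token) ?? 0) + 1);
  }

  const tf: Record<string, number> = {};
  for (const [token, count] of counts) {
    tf[token] = count / tokens.length;
  }
  return tf;
}
```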
@@ -262,12 +309,12 @@ function getDocumentIDsFromSearch<S extends PropertiesSchema>(
params: SearchParams<S> & { index: string },
): string[] {
const idx = lyra.index[params.index];

const searchResult = trieFind(lyra.nodes, idx, {
term: params.term,
exact: params.exact,
tolerance: params.tolerance,
});

const ids = new Set<string>();

for (const key in searchResult) {
@@ -322,6 +369,8 @@ export function create<S extends PropertiesSchema>(properties: Configuration<S>)
hooks: properties.hooks || {},
edge: properties.edge ?? false,
tokenizer: defaultTokenizerConfig(defaultLanguage, properties.tokenizer!),
frequencies: {},
tokenOccurrencies: {},
};

buildIndex(instance, properties.schema);
@@ -353,7 +402,7 @@ export function insert<S extends PropertiesSchema>(
assertDocSchema(doc, lyra.schema);

lyra.docs[id] = doc;
recursiveTrieInsertion(lyra.index, lyra.nodes, doc, id, config, undefined, lyra.tokenizer as TokenizerConfigExec);
recursiveTrieInsertion(lyra, doc, id, config, undefined, lyra.tokenizer as TokenizerConfigExec);
trackInsertion(lyra);

return { id };
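A hypothetical end-to-end use of the new ranking (API names as exported from `src/lyra.ts`; the expected ordering follows from the TF-IDF sketch above):

```typescript
const db = create({ schema: { title: "string" } });

insert(db, { title: "quick brown fox" }); // tf("quick") = 1/3
insert(db, { title: "quick fox" });       // tf("quick") = 1/2
insert(db, { title: "lazy dog" });
insert(db, { title: "sleepy cat" });

// Both matches contain "quick" once, but it makes up a larger share of the
// second title, so TF-IDF ranking should return "quick fox" first.
const results = search(db, { term: "quick" });
```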
@@ -384,7 +433,7 @@ export async function insertWithHooks<S extends PropertiesSchema>(
assertDocSchema(doc, lyra.schema);

lyra.docs[id] = doc;
recursiveTrieInsertion(lyra.index, lyra.nodes, doc, id, config, undefined, lyra.tokenizer as TokenizerConfigExec);
recursiveTrieInsertion(lyra, doc, id, config, undefined, lyra.tokenizer as TokenizerConfigExec);
trackInsertion(lyra);
if (lyra.hooks.afterInsert) {
await hookRunner.call(lyra, lyra.hooks.afterInsert, id);
@@ -461,25 +510,29 @@ export function remove<S extends PropertiesSchema>(lyra: Lyra<S>, docID: string)
}

const document = lyra.docs[docID] || ({} as Record<string, ResolveSchema<S>>);

const documentKeys = Object.keys(document || {});

for (let i = 0; i < documentKeys.length; i++) {
const documentKeysLength = documentKeys.length;
for (let i = 0; i < documentKeysLength; i++) {
const key = documentKeys[i];

const propertyType = lyra.schema[key];

if (propertyType === "string") {
const idx = lyra.index[key];
const tokens = lyra.tokenizer.tokenizerFn!(
const tokens: string[] = lyra.tokenizer.tokenizerFn!(
document[key] as string,
lyra.defaultLanguage,
false,
lyra.tokenizer!,
)!;

for (let k = 0; k < tokens.length; k++) {
const tokensLength = tokens.length;
for (let k = 0; k < tokensLength; k++) {
const token = tokens[k];
delete lyra.frequencies[key][docID];
lyra.tokenOccurrencies[key][token]--;

if (token && removeDocumentByWord(lyra.nodes, idx, token, docID)) {
throw new Error(ERRORS.CANT_DELETE_DOCUMENT(docID, key, token));
}
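Worth noting: `delete lyra.frequencies[key][docID]` sits inside the token loop, so it executes once per token, but only the first iteration has any effect (it removes the document's entire per-property frequency map). The `tokenOccurrencies` counter, by contrast, is decremented once per token occurrence, mirroring the per-occurrence increments done at insertion time.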
@@ -517,18 +570,21 @@ export function search<S extends PropertiesSchema>(
lyra.tokenizer = defaultTokenizerConfig(language);
}

const tokens = lyra.tokenizer.tokenizerFn!(params.term, language, false, lyra.tokenizer);
const indices = getIndices(lyra, params.properties);
const { limit = 10, offset = 0, exact = false } = params;
const results: RetrievedDoc<S>[] = new Array(limit);
const { limit = 10, offset = 0, exact = false, term, properties } = params;
const tokens = lyra.tokenizer.tokenizerFn!(term, language, false, lyra.tokenizer);
const indices = getIndices(lyra, properties);
const results: RetrievedDoc<S>[] = Array.from({
length: limit,
});

const timeStart = getNanosecondsTime();
// uniqueDocsIDs contains unique document IDs for all the tokens in all the indices.
const uniqueDocsIDs: Set<string> = new Set();
const uniqueDocsIDs: Map<string, number> = new Map();

// indexMap is an object containing all the indexes considered for the current search,
// and an array of doc IDs for each token in all the indices.
//
// Give the search term "quick brown fox" on the "description" index,
// Given the search term "quick brown fox" on the "description" index,
// indexMap will look like this:
//
// {
@@ -559,23 +615,60 @@
docsIntersection[index] = [];
}

const N = Object.keys(lyra.docs).length;

// Now it's time to loop over all the indices and get the document IDs for every single term
for (const index of indices) {
for (const term of tokens) {
const indexesLength = indices.length;
for (let i = 0; i < indexesLength; i++) {
const index = indices[i];
const lyraOccurrencies = lyra.tokenOccurrencies[index];
const lyraFrequencies = lyra.frequencies[index];

const tokensLength = tokens.length;
for (let j = 0; j < tokensLength; j++) {
const term = tokens[j];
const documentIDs = getDocumentIDsFromSearch(lyra, { ...params, index, term, exact });
indexMap[index][term].push(...documentIDs);
const termOccurrencies = lyraOccurrencies[term];
const orderedTFIDFList: TokenScore[] = [];

// Calculate TF-IDF value for each term, in each document, for each index.
// Then insert sorted results into orderedTFIDFList.
const documentIDsLength = documentIDs.length;
for (let k = 0; k < documentIDsLength; k++) {
const id = documentIDs[k];
const idf = Math.log10(N / termOccurrencies);
const tfIdf = idf * (lyraFrequencies?.[id]?.[term] ?? 0);

// @todo: we're now using binary search to insert the element in the right position.
// Maybe we can switch to sparse array insertion?
insertSortedValue(orderedTFIDFList, [id, tfIdf], sortTokenScorePredicate);
}

indexMap[index][term].push(...orderedTFIDFList);
}

const docIds = indexMap[index];
const vals = Object.values(docIds);
docsIntersection[index] = intersectMany(vals);
for (const id of Object.values(docsIntersection[index])) {
uniqueDocsIDs.add(id);
docsIntersection[index] = intersectTokenScores(vals);

const uniqueDocs = Object.values(docsIntersection[index]);
const uniqueDocsLength = uniqueDocs.length;
for (let i = 0; i < uniqueDocsLength; i++) {
const [id, tfIdfScore] = uniqueDocs[i];

if (uniqueDocsIDs.has(id)) {
const prevScore = uniqueDocsIDs.get(id)!;
uniqueDocsIDs.set(id, prevScore + tfIdfScore);
} else {
uniqueDocsIDs.set(id, tfIdfScore);
}
}
}

// Convert uniqueDocsIDs to array to access its elements by index
const uniqueDocsIDsArray = Array.from(uniqueDocsIDs);
// Get unique doc IDs from uniqueDocsIDs map, sorted by value.
const uniqueDocsIDsArray = Array.from(uniqueDocsIDs.entries())
.sort(sortTokenScorePredicate)
.map(([id]) => id);
const resultIDs: Set<string> = new Set();

// We already have the list of ALL the document IDs containing the search terms.
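The two helpers doing the heavy lifting here, `insertSortedValue` and `sortTokenScorePredicate`, live in `src/utils`, whose diff is not expanded on this page. Based on how they are called above (results kept in descending score order, with the `@todo` confirming binary-search insertion), a plausible sketch — an assumption, not the shipped utils code:

```typescript
type TokenScore = [string, number];

// Assumed comparator: orders [id, score] tuples by score, highest first.
function sortTokenScorePredicate(a: TokenScore, b: TokenScore): number {
  return b[1] - a[1];
}

// Assumed helper: binary-searches the insertion point so `arr` stays sorted
// according to `predicate`, then splices the element in.
function insertSortedValue(
  arr: TokenScore[],
  el: TokenScore,
  predicate: (a: TokenScore, b: TokenScore) => number,
): void {
  let low = 0;
  let high = arr.length;
  while (low < high) {
    const mid = (low + high) >>> 1;
    if (predicate(el, arr[mid]) > 0) {
      low = mid + 1; // el sorts after arr[mid]
    } else {
      high = mid;
    }
  }
  arr.splice(low, 0, el);
}
```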
@@ -613,10 +706,15 @@ export function save<S extends PropertiesSchema>(lyra: Lyra<S>): Data<S> {
docs: lyra.docs,
nodes: lyra.nodes,
schema: lyra.schema,
frequencies: lyra.frequencies,
tokenOccurrencies: lyra.tokenOccurrencies,
};
}

export function load<S extends PropertiesSchema>(lyra: Lyra<S>, { index, docs, nodes, schema }: Data<S>) {
export function load<S extends PropertiesSchema>(
lyra: Lyra<S>,
{ index, docs, nodes, schema, frequencies, tokenOccurrencies }: Data<S>,
) {
if (!lyra.edge) {
throw new Error(ERRORS.GETTER_SETTER_WORKS_ON_EDGE_ONLY("load"));
}
@@ -625,6 +723,8 @@ export function load<S extends PropertiesSchema>(lyra: Lyra<S>, { index, docs, n
lyra.docs = docs;
lyra.nodes = nodes;
lyra.schema = schema;
lyra.frequencies = frequencies;
lyra.tokenOccurrencies = tokenOccurrencies;
}

export function defaultTokenizerConfig(language: Language, tokenizerConfig: TokenizerConfig = {}): TokenizerConfigExec {
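Since `save` and `load` now round-trip the two new maps, TF-IDF ranking survives serialization. A hypothetical round-trip between two edge-enabled instances (edge mode being required by the guard in `load` above):

```typescript
const origin = create({ schema: { title: "string" }, edge: true });
insert(origin, { title: "quick brown fox" });

const data = save(origin); // now carries frequencies and tokenOccurrencies

const replica = create({ schema: { title: "string" }, edge: true });
load(replica, data); // restores both maps on the replica
```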
1 change: 0 additions & 1 deletion src/prefix-tree/node.ts
@@ -1,6 +1,5 @@
import type { Nullable } from "../types";
import { uniqueId } from "../utils";

export interface Node {
id: string;
key: string;
3 changes: 2 additions & 1 deletion src/prefix-tree/trie.ts
@@ -44,7 +44,8 @@ function findAllWords(nodes: Nodes, node: Node, output: FindResult, term: string
if (getOwnProperty(output, word) && docIDs.length) {
const docs = new Set(output[word]);

for (let i = 0; i < docIDs.length; i++) {
const docIDsLength = docIDs.length;
for (let i = 0; i < docIDsLength; i++) {
docs.add(docIDs[i]);
}
output[word] = Array.from(docs);
1 change: 1 addition & 0 deletions src/tokenizer/index.ts
@@ -9,6 +9,7 @@ export type Tokenizer = (
language: Language,
allowDuplicates: boolean,
tokenizerConfig: TokenizerConfig,
frequency?: boolean,
) => string[];

const splitRegex: Record<Language, RegExp> = {
(diffs for the remaining 5 changed files are not shown here)
