Feat: adds TF-IDF (#169)
* feat: adds term frequency to index

* perf: moves parseInt function to more performant shorthand

* chore(lyra): initializes new branch

* feat(lyra): adds frequency map

* feat: wip

* feat(lyra): adds tokens number to frequency map

* feat(lyra): adds tuple-based term frequencies data

* feat(lyra): removes redundant data

* feat(lyra): work in progress on sorting

* refactor: removes dead code

* feat(lyra): adds tf-idf-based results sorting

* feat(lyra): completes tf-idf-based ranking

* feat(lyra): adds frequencies and tokenOccurrencies to load function

* feat(lyra): removes token frequencies and occurrencies on docs deletion

* refactor: minor refactors

* fix: fixes exports
micheleriva authored Nov 14, 2022
1 parent cc94cf3 commit b363a30
Showing 10 changed files with 323 additions and 164 deletions.
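In short: every indexed string property now tracks per-document term frequencies and index-wide token occurrence counts, and search results are ranked by a TF-IDF score instead of raw insertion order. A minimal sketch of the score as computed in this diff (the function name is illustrative, not part of Lyra's API; note that the IDF denominator here is the total number of occurrences of the term across the index, not the number of documents containing it):

```typescript
// Sketch of the TF-IDF score introduced by this commit.
function tfIdfScore(
  termCountInDoc: number,   // occurrences of the term in one document field
  tokensInDoc: number,      // total tokens in that field
  totalDocs: number,        // N = Object.keys(lyra.docs).length
  termOccurrencies: number, // occurrences of the term across the whole index
): number {
  const tf = termCountInDoc / tokensInDoc;               // term frequency
  const idf = Math.log10(totalDocs / termOccurrencies);  // inverse document frequency
  return tf * idf;
}
```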
2 changes: 1 addition & 1 deletion internals/index.ts
@@ -1,3 +1,3 @@
export { formatNanoseconds, getNanosecondsTime, intersectMany, includes } from "../src/utils";
export { formatNanoseconds, getNanosecondsTime, intersectTokenScores, includes } from "../src/utils";
export { boundedLevenshtein } from "../src/levenshtein";
export { tokenize } from "../src/tokenizer";
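`intersectMany` used to intersect plain arrays of document IDs; its replacement `intersectTokenScores` works on scored `[id, score]` tuples. The `src/utils` diff is not expanded on this page, so the following is only an assumed sketch of its contract — keep ids that appear in every per-token list and accumulate their scores — not the shipped code:

```typescript
type TokenScore = [string, number];

// Assumed behaviour: a document survives the intersection only if it matched
// every token, and its per-token TF-IDF scores are summed along the way.
function intersectTokenScores(arrays: TokenScore[][]): TokenScore[] {
  if (arrays.length === 0) return [];

  const hits = new Map<string, { count: number; score: number }>();

  for (const arr of arrays) {
    for (const [id, score] of arr) {
      const entry = hits.get(id) ?? { count: 0, score: 0 };
      entry.count++;
      entry.score += score;
      hits.set(id, entry);
    }
  }

  const result: TokenScore[] = [];
  for (const [id, { count, score }] of hits) {
    if (count === arrays.length) result.push([id, score]);
  }
  return result;
}
```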
154 changes: 127 additions & 27 deletions src/lyra.ts
@@ -8,11 +8,24 @@ import { create as createNode, Node } from "./prefix-tree/node";
import { find as trieFind, insert as trieInsert, removeDocumentByWord, Nodes } from "./prefix-tree/trie";
import { trackInsertion } from "./insertion-checker";
import { availableStopWords, stopWords } from "./tokenizer/stop-words";
import { intersectMany } from "./utils";
import { intersectTokenScores, insertSortedValue, sortTokenScorePredicate } from "./utils";

export type TokenScore = [string, number];
type Index = Record<string, Node>;
type TokenMap = Record<string, string[]>;
type TokenMap = Record<string, TokenScore[]>;
type IndexMap = Record<string, TokenMap>;
type FrequencyMap = {
[property: string]: {
[documentID: string]: {
[token: string]: number;
};
};
};
type TokenOccurrency = {
[property: string]: {
[token: string]: number;
};
};

export { formatNanoseconds } from "./utils";
export { tokenize } from "./tokenizer";
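To make the two new shapes concrete, this is hypothetical content for both maps after indexing a single document `{ title: "quick brown fox quick" }` with id `doc1` (illustrative data; assumes a tokenizer that simply lowercases and splits):

```typescript
const frequencies: FrequencyMap = {
  title: {
    doc1: { quick: 0.5, brown: 0.25, fox: 0.25 }, // term count / 4 tokens
  },
};

const tokenOccurrencies: TokenOccurrency = {
  title: { quick: 2, brown: 1, fox: 1 }, // totals across all indexed docs
};
```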
@@ -70,6 +83,8 @@ export type Data<S extends PropertiesSchema> = {
index: Index;
nodes: Nodes;
schema: S;
frequencies: FrequencyMap;
tokenOccurrencies: TokenOccurrency;
};

export interface Lyra<S extends PropertiesSchema> extends Data<S> {
@@ -78,6 +93,7 @@ export interface Lyra<S extends PropertiesSchema> extends Data<S> {
edge: boolean;
hooks: Hooks;
tokenizer?: TokenizerConfig;
frequencies: FrequencyMap;
}

export type InsertConfig = {
@@ -206,19 +222,20 @@ function recursiveCheckDocSchema<S extends PropertiesSchema>(
}

function recursiveTrieInsertion<S extends PropertiesSchema>(
index: Index,
nodes: Nodes,
lyra: Lyra<S>,
doc: ResolveSchema<S>,
id: string,
config: InsertConfig,
prefix = "",
tokenizerConfig: TokenizerConfigExec,
) {
const { index, nodes, frequencies, tokenOccurrencies } = lyra;

for (const key of Object.keys(doc)) {
const isNested = typeof doc[key] === "object";
const propName = `${prefix}${key}`;
if (isNested) {
recursiveTrieInsertion(index, nodes, doc[key] as ResolveSchema<S>, id, config, propName + ".", tokenizerConfig);
recursiveTrieInsertion(lyra, doc[key] as ResolveSchema<S>, id, config, propName + ".", tokenizerConfig);
}

if (typeof doc[key] === "string") {
@@ -227,7 +244,37 @@ function recursiveTrieInsertion<S extends PropertiesSchema>(
const requestedTrie = index[propName];
const tokens = tokenizerConfig.tokenizerFn(doc[key] as string, config.language, false, tokenizerConfig);

if (!(propName in frequencies)) {
frequencies[propName] = {};
}

if (!(propName in tokenOccurrencies)) {
tokenOccurrencies[propName] = {};
}

if (!(id in frequencies[propName])) {
frequencies[propName][id] = {};
}

for (const token of tokens) {
let tokenFrequency = 0;

for (const t of tokens) {
if (t === token) {
tokenFrequency++;
}
}

const tf = tokenFrequency / tokens.length;

frequencies[propName][id][token] = tf;

if (!(token in tokenOccurrencies[propName])) {
tokenOccurrencies[propName][token] = 0;
}

tokenOccurrencies[propName][token]++;

trieInsert(nodes, requestedTrie, token, id);
}
}
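Counting `tokenFrequency` with a nested loop makes insertion quadratic in the number of tokens per field. A single-pass sketch that yields the same `tf` values (an alternative shown for comparison, not what this commit ships):

```typescript
// Computes { token -> termFrequency } in one pass over the token list.
function termFrequencies(tokens: string[]): Record<string, number> {
  const counts = new Map<string, number>();
  for (const token of tokens) {
    counts.set(token, (counts.get(token) ?? 0) + 1);
  }

  const tf: Record<string, number> = {};
  for (const [token, count] of counts) {
    tf[token] = count / tokens.length;
  }
  return tf;
}
```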
@@ -262,12 +309,12 @@ function getDocumentIDsFromSearch<S extends PropertiesSchema>(
params: SearchParams<S> & { index: string },
): string[] {
const idx = lyra.index[params.index];

const searchResult = trieFind(lyra.nodes, idx, {
term: params.term,
exact: params.exact,
tolerance: params.tolerance,
});

const ids = new Set<string>();

for (const key in searchResult) {
@@ -322,6 +369,8 @@ export function create<S extends PropertiesSchema>(properties: Configuration<S>)
hooks: properties.hooks || {},
edge: properties.edge ?? false,
tokenizer: defaultTokenizerConfig(defaultLanguage, properties.tokenizer!),
frequencies: {},
tokenOccurrencies: {},
};

buildIndex(instance, properties.schema);
@@ -353,7 +402,7 @@ export function insert<S extends PropertiesSchema>(
assertDocSchema(doc, lyra.schema);

lyra.docs[id] = doc;
recursiveTrieInsertion(lyra.index, lyra.nodes, doc, id, config, undefined, lyra.tokenizer as TokenizerConfigExec);
recursiveTrieInsertion(lyra, doc, id, config, undefined, lyra.tokenizer as TokenizerConfigExec);
trackInsertion(lyra);

return { id };
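A hypothetical end-to-end use of the new ranking (API names as exported from `src/lyra.ts`; the expected ordering follows from the TF-IDF sketch above):

```typescript
const db = create({ schema: { title: "string" } });

insert(db, { title: "quick brown fox" }); // tf("quick") = 1/3
insert(db, { title: "quick fox" });       // tf("quick") = 1/2
insert(db, { title: "lazy dog" });
insert(db, { title: "sleepy cat" });

// Both matches contain "quick" once, but it makes up a larger share of the
// second title, so TF-IDF ranking should return "quick fox" first.
const results = search(db, { term: "quick" });
```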
@@ -384,7 +433,7 @@ export async function insertWithHooks<S extends PropertiesSchema>(
assertDocSchema(doc, lyra.schema);

lyra.docs[id] = doc;
recursiveTrieInsertion(lyra.index, lyra.nodes, doc, id, config, undefined, lyra.tokenizer as TokenizerConfigExec);
recursiveTrieInsertion(lyra, doc, id, config, undefined, lyra.tokenizer as TokenizerConfigExec);
trackInsertion(lyra);
if (lyra.hooks.afterInsert) {
await hookRunner.call(lyra, lyra.hooks.afterInsert, id);
@@ -461,25 +510,29 @@ export function remove<S extends PropertiesSchema>(lyra: Lyra<S>, docID: string)
}

const document = lyra.docs[docID] || ({} as Record<string, ResolveSchema<S>>);

const documentKeys = Object.keys(document || {});

for (let i = 0; i < documentKeys.length; i++) {
const documentKeysLength = documentKeys.length;
for (let i = 0; i < documentKeysLength; i++) {
const key = documentKeys[i];

const propertyType = lyra.schema[key];

if (propertyType === "string") {
const idx = lyra.index[key];
const tokens = lyra.tokenizer.tokenizerFn!(
const tokens: string[] = lyra.tokenizer.tokenizerFn!(
document[key] as string,
lyra.defaultLanguage,
false,
lyra.tokenizer!,
)!;

for (let k = 0; k < tokens.length; k++) {
const tokensLength = tokens.length;
for (let k = 0; k < tokensLength; k++) {
const token = tokens[k];
delete lyra.frequencies[key][docID];
lyra.tokenOccurrencies[key][token]--;

if (token && removeDocumentByWord(lyra.nodes, idx, token, docID)) {
throw new Error(ERRORS.CANT_DELETE_DOCUMENT(docID, key, token));
}
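Worth noting: `delete lyra.frequencies[key][docID]` sits inside the token loop, so it executes once per token, but only the first iteration has any effect (it removes the document's entire per-property frequency map). The `tokenOccurrencies` counter, by contrast, is decremented once per token occurrence, mirroring the per-occurrence increments done at insertion time.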
@@ -517,18 +570,21 @@ export function search<S extends PropertiesSchema>(
lyra.tokenizer = defaultTokenizerConfig(language);
}

const tokens = lyra.tokenizer.tokenizerFn!(params.term, language, false, lyra.tokenizer);
const indices = getIndices(lyra, params.properties);
const { limit = 10, offset = 0, exact = false } = params;
const results: RetrievedDoc<S>[] = new Array(limit);
const { limit = 10, offset = 0, exact = false, term, properties } = params;
const tokens = lyra.tokenizer.tokenizerFn!(term, language, false, lyra.tokenizer);
const indices = getIndices(lyra, properties);
const results: RetrievedDoc<S>[] = Array.from({
length: limit,
});

const timeStart = getNanosecondsTime();
// uniqueDocsIDs contains unique document IDs for all the tokens in all the indices.
const uniqueDocsIDs: Set<string> = new Set();
const uniqueDocsIDs: Map<string, number> = new Map();

// indexMap is an object containing all the indexes considered for the current search,
// and an array of doc IDs for each token in all the indices.
//
// Give the search term "quick brown fox" on the "description" index,
// Given the search term "quick brown fox" on the "description" index,
// indexMap will look like this:
//
// {
@@ -559,23 +615,60 @@
docsIntersection[index] = [];
}

const N = Object.keys(lyra.docs).length;

// Now it's time to loop over all the indices and get the document IDs for every single term
for (const index of indices) {
for (const term of tokens) {
const indexesLength = indices.length;
for (let i = 0; i < indexesLength; i++) {
const index = indices[i];
const lyraOccurrencies = lyra.tokenOccurrencies[index];
const lyraFrequencies = lyra.frequencies[index];

const tokensLength = tokens.length;
for (let j = 0; j < tokensLength; j++) {
const term = tokens[j];
const documentIDs = getDocumentIDsFromSearch(lyra, { ...params, index, term, exact });
indexMap[index][term].push(...documentIDs);
const termOccurrencies = lyraOccurrencies[term];
const orderedTFIDFList: TokenScore[] = [];

// Calculate TF-IDF value for each term, in each document, for each index.
// Then insert sorted results into orderedTFIDFList.
const documentIDsLength = documentIDs.length;
for (let k = 0; k < documentIDsLength; k++) {
const id = documentIDs[k];
const idf = Math.log10(N / termOccurrencies);
const tfIdf = idf * (lyraFrequencies?.[id]?.[term] ?? 0);

// @todo: we're now using binary search to insert the element in the right position.
// Maybe we can switch to sparse array insertion?
insertSortedValue(orderedTFIDFList, [id, tfIdf], sortTokenScorePredicate);
}

indexMap[index][term].push(...orderedTFIDFList);
}

const docIds = indexMap[index];
const vals = Object.values(docIds);
docsIntersection[index] = intersectMany(vals);
for (const id of Object.values(docsIntersection[index])) {
uniqueDocsIDs.add(id);
docsIntersection[index] = intersectTokenScores(vals);

const uniqueDocs = Object.values(docsIntersection[index]);
const uniqueDocsLength = uniqueDocs.length;
for (let i = 0; i < uniqueDocsLength; i++) {
const [id, tfIdfScore] = uniqueDocs[i];

if (uniqueDocsIDs.has(id)) {
const prevScore = uniqueDocsIDs.get(id)!;
uniqueDocsIDs.set(id, prevScore + tfIdfScore);
} else {
uniqueDocsIDs.set(id, tfIdfScore);
}
}
}

// Convert uniqueDocsIDs to array to access its elements by index
const uniqueDocsIDsArray = Array.from(uniqueDocsIDs);
// Get unique doc IDs from uniqueDocsIDs map, sorted by value.
const uniqueDocsIDsArray = Array.from(uniqueDocsIDs.entries())
.sort(sortTokenScorePredicate)
.map(([id]) => id);
const resultIDs: Set<string> = new Set();

// We already have the list of ALL the document IDs containing the search terms.
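The two helpers doing the heavy lifting here, `insertSortedValue` and `sortTokenScorePredicate`, live in `src/utils`, whose diff is not expanded on this page. Based on how they are called above (results kept in descending score order, with the `@todo` confirming binary-search insertion), a plausible sketch — an assumption, not the shipped utils code:

```typescript
type TokenScore = [string, number];

// Assumed comparator: orders [id, score] tuples by score, highest first.
function sortTokenScorePredicate(a: TokenScore, b: TokenScore): number {
  return b[1] - a[1];
}

// Assumed helper: binary-searches the insertion point so `arr` stays sorted
// according to `predicate`, then splices the element in.
function insertSortedValue(
  arr: TokenScore[],
  el: TokenScore,
  predicate: (a: TokenScore, b: TokenScore) => number,
): void {
  let low = 0;
  let high = arr.length;
  while (low < high) {
    const mid = (low + high) >>> 1;
    if (predicate(el, arr[mid]) > 0) {
      low = mid + 1; // el sorts after arr[mid]
    } else {
      high = mid;
    }
  }
  arr.splice(low, 0, el);
}
```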
@@ -613,10 +706,15 @@ export function save<S extends PropertiesSchema>(lyra: Lyra<S>): Data<S> {
docs: lyra.docs,
nodes: lyra.nodes,
schema: lyra.schema,
frequencies: lyra.frequencies,
tokenOccurrencies: lyra.tokenOccurrencies,
};
}

export function load<S extends PropertiesSchema>(lyra: Lyra<S>, { index, docs, nodes, schema }: Data<S>) {
export function load<S extends PropertiesSchema>(
lyra: Lyra<S>,
{ index, docs, nodes, schema, frequencies, tokenOccurrencies }: Data<S>,
) {
if (!lyra.edge) {
throw new Error(ERRORS.GETTER_SETTER_WORKS_ON_EDGE_ONLY("load"));
}
@@ -625,6 +723,8 @@ export function load<S extends PropertiesSchema>(lyra: Lyra<S>, { index, docs, n
lyra.docs = docs;
lyra.nodes = nodes;
lyra.schema = schema;
lyra.frequencies = frequencies;
lyra.tokenOccurrencies = tokenOccurrencies;
}

export function defaultTokenizerConfig(language: Language, tokenizerConfig: TokenizerConfig = {}): TokenizerConfigExec {
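Since `save` and `load` now round-trip the two new maps, TF-IDF ranking survives serialization. A hypothetical round-trip between two edge-enabled instances (edge mode being required by the guard in `load` above):

```typescript
const origin = create({ schema: { title: "string" }, edge: true });
insert(origin, { title: "quick brown fox" });

const data = save(origin); // now carries frequencies and tokenOccurrencies

const replica = create({ schema: { title: "string" }, edge: true });
load(replica, data); // restores both maps on the replica
```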
1 change: 0 additions & 1 deletion src/prefix-tree/node.ts
@@ -1,6 +1,5 @@
import type { Nullable } from "../types";
import { uniqueId } from "../utils";

export interface Node {
id: string;
key: string;
3 changes: 2 additions & 1 deletion src/prefix-tree/trie.ts
@@ -44,7 +44,8 @@ function findAllWords(nodes: Nodes, node: Node, output: FindResult, term: string
if (getOwnProperty(output, word) && docIDs.length) {
const docs = new Set(output[word]);

for (let i = 0; i < docIDs.length; i++) {
const docIDsLength = docIDs.length;
for (let i = 0; i < docIDsLength; i++) {
docs.add(docIDs[i]);
}
output[word] = Array.from(docs);
1 change: 1 addition & 0 deletions src/tokenizer/index.ts
@@ -9,6 +9,7 @@ export type Tokenizer = (
language: Language,
allowDuplicates: boolean,
tokenizerConfig: TokenizerConfig,
frequency?: boolean,
) => string[];

const splitRegex: Record<Language, RegExp> = {
(diffs for the remaining 5 changed files are not shown here)
