Skip to content

Commit

Permalink
perf(orama): facets and hybrid search (#694)
Browse files Browse the repository at this point in the history
  • Loading branch information
ilteoood authored Apr 10, 2024
1 parent 2b856bb commit 54396e4
Show file tree
Hide file tree
Showing 2 changed files with 77 additions and 64 deletions.
78 changes: 45 additions & 33 deletions packages/orama/src/components/facets.ts
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,16 @@ import { getNested } from '../utils.js'

type FacetValue = string | boolean | number

/**
 * Comparator that orders `[value, count]` entries by ascending count.
 *
 * @param a - First `[value, count]` entry.
 * @param b - Second `[value, count]` entry.
 * @returns Negative when `a` has the smaller count, positive when it has the larger one, `0` on a tie.
 */
function sortAsc(a: [string, number], b: [string, number]) {
  const [, countA] = a
  const [, countB] = b
  return countA - countB
}

/**
 * Comparator that orders `[value, count]` entries by descending count.
 *
 * @param a - First `[value, count]` entry.
 * @param b - Second `[value, count]` entry.
 * @returns Negative when `a` has the larger count, positive when it has the smaller one, `0` on a tie.
 */
function sortDesc(a: [string, number], b: [string, number]) {
  const [, countA] = a
  const [, countB] = b
  return countB - countA
}

/**
 * Picks the comparator used to order `[value, count]` facet entries.
 *
 * @param order - Requested direction, compared case-insensitively; defaults to `'desc'`.
 * @returns `sortAsc` when `order` is `'asc'` in any casing, otherwise `sortDesc`.
 */
function sortingPredicateBuilder(order: FacetSorting = 'desc') {
  // Hand back the shared module-level comparators rather than allocating a
  // fresh closure on every call.
  return order.toLowerCase() === 'asc' ? sortAsc : sortDesc
}

export async function getFacets<T extends AnyOrama>(
Expand Down Expand Up @@ -67,30 +73,32 @@ export async function getFacets<T extends AnyOrama>(
switch (propertyType) {
case 'number': {
const ranges = (facetsConfig[facet] as NumberFacetDefinition).ranges
calculateNumberFacet(ranges, facetValues, facetValue as number)
calculateNumberFacetBuilder(ranges, facetValues)(facetValue as number)
break
}
case 'number[]': {
const alreadyInsertedValues = new Set<string>()
const ranges = (facetsConfig[facet] as NumberFacetDefinition).ranges
const calculateNumberFacet = calculateNumberFacetBuilder(ranges, facetValues, alreadyInsertedValues)
for (const v of facetValue as Array<number>) {
calculateNumberFacet(ranges, facetValues, v, alreadyInsertedValues)
calculateNumberFacet(v)
}
break
}
case 'boolean':
case 'enum':
case 'string': {
calculateBooleanStringOrEnumFacet(facetValues, facetValue as FacetValue, propertyType)
calculateBooleanStringOrEnumFacetBuilder(facetValues, propertyType)(facetValue as FacetValue)
break
}
case 'boolean[]':
case 'enum[]':
case 'string[]': {
const alreadyInsertedValues = new Set<string>()
const innerType = propertyType === 'boolean[]' ? 'boolean' : 'string'
const calculateBooleanStringOrEnumFacet = calculateBooleanStringOrEnumFacetBuilder(facetValues, innerType, alreadyInsertedValues)
for (const v of facetValue as Array<FacetValue>) {
calculateBooleanStringOrEnumFacet(facetValues, v, innerType, alreadyInsertedValues)
calculateBooleanStringOrEnumFacet(v)
}
break
}
Expand All @@ -102,15 +110,16 @@ export async function getFacets<T extends AnyOrama>(

// TODO: We are looping again over the same keys as the previous loop; should we create a single loop instead?
for (const facet of facetKeys) {
const currentFacet = facets[facet]
// Count the number of values for each facet
facets[facet].count = Object.keys(facets[facet].values).length
currentFacet.count = Object.keys(currentFacet.values).length
// Sort only string-based facets
if (properties[facet] === 'string') {
const stringFacetDefinition = facetsConfig[facet] as StringFacetDefinition
const sortingPredicate = sortingPredicateBuilder(stringFacetDefinition.sort)

facets[facet].values = Object.fromEntries(
Object.entries(facets[facet].values)
currentFacet.values = Object.fromEntries(
Object.entries(currentFacet.values)
.sort(sortingPredicate)
.slice(stringFacetDefinition.offset ?? 0, stringFacetDefinition.limit ?? 10)
)
Expand All @@ -120,41 +129,44 @@ export async function getFacets<T extends AnyOrama>(
return facets
}

/**
 * Builds a counter for number facets: the returned function increments the
 * count of every configured range that contains the value it is given.
 *
 * @param ranges - Range definitions; a value matches when `from <= value <= to` (both ends inclusive).
 * @param values - Accumulator keyed by `"<from>-<to>"`; mutated in place.
 * @param alreadyInsertedValues - Optional set of range keys that were already counted. When
 *   provided, each range is counted at most once across all calls sharing the set.
 * @returns A function that takes one numeric facet value and updates `values`.
 */
function calculateNumberFacetBuilder(
  ranges: NumberFacetDefinition['ranges'],
  values: Record<string, number>,
  alreadyInsertedValues?: Set<string>
) {
  return (facetValue: number) => {
    for (const range of ranges) {
      const value = `${range.from}-${range.to}`
      if (alreadyInsertedValues?.has(value)) {
        continue
      }

      if (facetValue >= range.from && facetValue <= range.to) {
        values[value] = (values[value] ?? 0) + 1
        // Record the range on every hit, including the very first one, so a
        // second value falling in the same range is not counted again.
        alreadyInsertedValues?.add(value)
      }
    }
  }
}

/**
 * Builds a counter for boolean, string and enum facets: the returned function
 * increments the count stored under the stringified facet value.
 *
 * @param values - Accumulator keyed by the stringified facet value; mutated in place.
 * @param propertyType - Decides the key used for a nullish facet value:
 *   `'false'` for `'boolean'`, the empty string otherwise.
 * @param alreadyInsertedValues - Optional set of keys that were already counted. When
 *   provided, each key is counted at most once across all calls sharing the set.
 * @returns A function that takes one facet value and updates `values`.
 */
function calculateBooleanStringOrEnumFacetBuilder(
  values: Record<string, number>,
  propertyType: 'string' | 'boolean' | 'enum',
  alreadyInsertedValues?: Set<string>
) {
  // Computed once per builder instead of once per counted value.
  const defaultValue = propertyType === 'boolean' ? 'false' : ''
  return (facetValue: FacetValue) => {
    const value = facetValue?.toString() ?? defaultValue
    if (alreadyInsertedValues?.has(value)) {
      return
    }
    // Read only own keys: facet values such as "constructor" or "toString"
    // would otherwise resolve to inherited Object.prototype members and
    // corrupt the count.
    const previous = Object.prototype.hasOwnProperty.call(values, value) ? (values[value] ?? 0) : 0
    values[value] = previous + 1
    alreadyInsertedValues?.add(value)
  }
}
63 changes: 32 additions & 31 deletions packages/orama/src/methods/search-hybrid.ts
Original file line number Diff line number Diff line change
Expand Up @@ -153,15 +153,16 @@ async function getFullTextSearchIDs<T extends AnyOrama, ResultDocument = TypedDo
}

if (properties && properties !== '*') {
const propertiesToSearchSet = new Set(propertiesToSearch)
const propertiesSet = new Set(properties as string[])

for (const prop of properties) {
// TODO: since propertiesToSearch.includes is repeated multiple times, maybe we should move it in a Set first?
if (!propertiesToSearch.includes(prop as string)) {
if (!propertiesToSearchSet.has(prop as string)) {
throw createError('UNKNOWN_INDEX', prop as string, propertiesToSearch.join(', '))
}
}

// TODO: since properties.includes is repeated multiple times, maybe we should move it in a Set first?
propertiesToSearch = propertiesToSearch.filter((prop: string) => (properties as string[]).includes(prop))
propertiesToSearch = propertiesToSearch.filter((prop: string) => propertiesSet.has(prop))
}

// Create the search context and the results
Expand Down Expand Up @@ -195,9 +196,10 @@ async function getFullTextSearchIDs<T extends AnyOrama, ResultDocument = TypedDo
safeArrayPush(context.indexMap[prop][term], scoreList)
}
} else {
context.indexMap[prop][''] = []
const indexMapContent = []
context.indexMap[prop][''] = indexMapContent
const scoreList = await orama.index.search(context, index, prop, '')
safeArrayPush(context.indexMap[prop][''], scoreList)
safeArrayPush(indexMapContent, scoreList)
}

const docIds = context.indexMap[prop]
Expand All @@ -209,11 +211,7 @@ async function getFullTextSearchIDs<T extends AnyOrama, ResultDocument = TypedDo
for (let i = 0; i < uniqueDocsLength; i++) {
const [id, score] = uniqueDocs[i]
const prevScore = context.uniqueDocsIDs[id]
if (prevScore) {
context.uniqueDocsIDs[id] = prevScore + score + 0.5
} else {
context.uniqueDocsIDs[id] = score
}
context.uniqueDocsIDs[id] = prevScore ? prevScore + score + 0.5 : score
}
}
} else if (tokens.length === 0 && term) {
Expand Down Expand Up @@ -263,52 +261,55 @@ export async function getVectorSearchIDs<T extends AnyOrama, ResultDocument = Ty
return minMaxScoreNormalization(uniqueIDs)
}

/**
 * Reads the score (second element) out of a token-score tuple.
 *
 * @param entry - Tuple whose second element is the score.
 * @returns The score stored in `entry`.
 */
function extractScore(entry: TokenScore) {
  return entry[1]
}

/**
 * Rescales scores by dividing each one by the largest score in `results`.
 *
 * @param results - `[id, score]` tuples to normalize; not mutated.
 * @returns A new array of `[id, score / maxScore]` tuples in the same order.
 *   An empty input yields an empty array.
 */
function minMaxScoreNormalization(results: TokenScore[]): TokenScore[] {
  // A linear scan avoids passing every score as a call argument to Math.max
  // (via spread or apply), which throws a RangeError on very large result
  // sets and needs a temporary array. Starting from -Infinity matches what
  // Math.max returns when called with no arguments.
  let maxScore = -Infinity
  const resultsLength = results.length
  for (let i = 0; i < resultsLength; i++) {
    maxScore = Math.max(maxScore, results[i][1])
  }
  return results.map(([id, score]) => [id, score / maxScore] as TokenScore)
}

/**
 * Expresses `score` as a fraction of `maxScore`.
 *
 * @param score - Raw score.
 * @param maxScore - Score to divide by.
 * @returns `score / maxScore`.
 */
function normalizeScore(score: number, maxScore: number) {
  const ratio = score / maxScore
  return ratio
}

/**
 * Builds a scoring function with the text and vector weights fixed, so callers
 * that score many results do not have to pass the weights on every call.
 *
 * @param textWeight - Multiplier applied to the text score.
 * @param vectorWeight - Multiplier applied to the vector score.
 * @returns A function computing `textScore * textWeight + vectorScore * vectorWeight`.
 */
function hybridScoreBuilder(textWeight: number, vectorWeight: number) {
  return (textScore: number, vectorScore: number) => textScore * textWeight + vectorScore * vectorWeight
}

function mergeAndRankResults(
textResults: TokenScore[],
vectorResults: TokenScore[],
query: string,
hybridWeights: HybridWeights | undefined
) {
const maxTextScore = Math.max(...textResults.map(([, score]) => score))
const maxVectorScore = Math.max(...vectorResults.map(([, score]) => score))
) {
// eslint-disable-next-line prefer-spread
const maxTextScore = Math.max.apply(Math, textResults.map(extractScore))
// eslint-disable-next-line prefer-spread
const maxVectorScore = Math.max.apply(Math, vectorResults.map(extractScore))
const hasHybridWeights = hybridWeights && hybridWeights.text && hybridWeights.vector

const { text: textWeight, vector: vectorWeight } = hasHybridWeights ? hybridWeights : getQueryWeights(query)
const mergedResults = new Map()

const textResultsLength = textResults.length
const hybridScore = hybridScoreBuilder(textWeight, vectorWeight)
for (let i = 0; i < textResultsLength; i++) {
const normalizedScore = normalizeScore(textResults[i][1], maxTextScore)
// ^ 1 here refers to "score"
const hybridScoreValue = hybridScore(normalizedScore, 0, textWeight, vectorWeight)
mergedResults.set(textResults[i][0], hybridScoreValue)
// ^ 0 here refers to "id"
const [id, score] = textResults[i]
const normalizedScore = normalizeScore(score, maxTextScore)
const hybridScoreValue = hybridScore(normalizedScore, 0)
mergedResults.set(id, hybridScoreValue)
}

const vectorResultsLength = vectorResults.length
for (let i = 0; i < vectorResultsLength; i++) {
const normalizedScore = normalizeScore(vectorResults[i][1], maxVectorScore)
// ^ 1 here refers to "score"
const resultId = vectorResults[i][0]
if (mergedResults.has(resultId)) {
let existingRes = mergedResults.get(resultId)
mergedResults.set(resultId, (existingRes += hybridScore(0, normalizedScore, textWeight, vectorWeight)))
} else {
mergedResults.set(resultId, hybridScore(0, normalizedScore, textWeight, vectorWeight))
}
const [resultId, score] = vectorResults[i]
const normalizedScore = normalizeScore(score, maxVectorScore)
const existingRes = mergedResults.get(resultId) ?? 0
mergedResults.set(resultId, existingRes + hybridScore(0, normalizedScore))
}

return [...mergedResults].sort((a, b) => b[1] - a[1])
Expand Down

0 comments on commit 54396e4

Please sign in to comment.