From f6593e69de6df3e85a39c048794a56c7eb842c4c Mon Sep 17 00:00:00 2001 From: Johannes Vogel <31311694+johannes-vogel@users.noreply.github.com> Date: Fri, 22 Nov 2024 13:22:36 +0100 Subject: [PATCH] feat: config options for fuzzy search (#898) Co-authored-by: Bob den Os <108393871+BobdenOs@users.noreply.github.com> --- db-service/lib/cql-functions.js | 2 +- db-service/lib/cqn4sql.js | 2 +- db-service/test/cqn4sql/search.test.js | 105 ++++++++++++++----------- hana/lib/cql-functions.js | 66 +++++++++++++++- hana/test/fuzzy.cds | 6 +- hana/test/fuzzy.test.js | 81 ++++++++++++++++--- test/bookshop/db/schema.cds | 3 +- test/compliance/SELECT.test.js | 4 +- 8 files changed, 203 insertions(+), 66 deletions(-) diff --git a/db-service/lib/cql-functions.js b/db-service/lib/cql-functions.js index 0165278e5..6a7206119 100644 --- a/db-service/lib/cql-functions.js +++ b/db-service/lib/cql-functions.js @@ -33,7 +33,7 @@ const StandardFunctions = { val = sub[2] || sub[3] || '' } arg.val = arg.__proto__.val = val - const refs = ref.list || [ref] + const refs = ref.list const { toString } = ref return '(' + refs.map(ref2 => this.contains(this.tolower(toString(ref2)), this.tolower(arg))).join(' or ') + ')' }, diff --git a/db-service/lib/cqn4sql.js b/db-service/lib/cqn4sql.js index 739597abf..57b68f7a6 100644 --- a/db-service/lib/cqn4sql.js +++ b/db-service/lib/cqn4sql.js @@ -2203,7 +2203,7 @@ function cqn4sql(originalQuery, model) { const searchFunc = { func: 'search', args: [ - searchIn.length > 1 ? { list: searchIn } : { ...searchIn[0] }, + { list: searchIn }, xpr.length === 1 && 'val' in xpr[0] ? xpr[0] : { xpr }, ], } diff --git a/db-service/test/cqn4sql/search.test.js b/db-service/test/cqn4sql/search.test.js index be44de4d0..7904150d7 100644 --- a/db-service/test/cqn4sql/search.test.js +++ b/db-service/test/cqn4sql/search.test.js @@ -16,9 +16,8 @@ describe('Replace attribute search by search predicate', () => { let res = cqn4sql(query, model) // single val is stored as val directly, not as expr with val - const expected = CQL`SELECT from bookshop.WithStructuredKey as wsk { - wsk.second - } where search(wsk.second, 'x')` + const expected = CQL`SELECT from bookshop.WithStructuredKey as wsk { wsk.second }` + expected.SELECT.where = [ {func: 'search', args: [{ list: [{ ref: ['wsk', 'second']}] }, {val: 'x'}]}] expect(JSON.parse(JSON.stringify(res))).to.deep.equal(expected) }) @@ -28,9 +27,8 @@ describe('Replace attribute search by search predicate', () => { query.SELECT.search = [{ val: 'x' }, 'or', { val: 'y' }] let res = cqn4sql(query, model) - const expected = CQL`SELECT from bookshop.WithStructuredKey as wsk { - wsk.second - } where search(wsk.second, ('x' OR 'y'))` + const expected = CQL`SELECT from bookshop.WithStructuredKey as wsk { wsk.second }` + expected.SELECT.where = [ {func: 'search', args: [{ list: [{ ref: ['wsk', 'second']}] }, {xpr: [{val: 'x'}, 'or', {val: 'y'}]}]}] expect(JSON.parse(JSON.stringify(res))).to.deep.equal(expected) }) @@ -109,16 +107,16 @@ describe('Replace attribute search by search predicate', () => { query.SELECT.search = [{ val: 'x' }, 'or', { val: 'y' }] let res = cqn4sql(query, model) - expect(JSON.parse(JSON.stringify(res))).to.deep.equal( - CQL` - SELECT from bookshop.Books as Books - left join bookshop.Authors as author on author.ID = Books.author_ID - left join bookshop.Books as books2 on books2.author_ID = author.ID - { - Books.ID, - books2.title as authorsBook - } where search(books2.title, ('x' OR 'y')) group by Books.title `, - ) + const expected = CQL` + SELECT from bookshop.Books as Books + left join bookshop.Authors as author on author.ID = Books.author_ID + left join bookshop.Books as books2 on books2.author_ID = author.ID + { + Books.ID, + books2.title as authorsBook + } where search(books2.title, ('x' OR 'y')) group by Books.title ` + expected.SELECT.where = [ {func: 'search', args: [{ list: [{ ref: ['books2', 'title']}] }, {xpr: [{val: 'x'}, 'or', {val: 'y'}]}]}] + expect(JSON.parse(JSON.stringify(res))).to.deep.equal(expected) }) it('Search on navigation', () => { let query = CQL`SELECT from bookshop.Authors:books { ID }` @@ -147,11 +145,12 @@ describe('Replace attribute search by search predicate', () => { .columns({ args: [{ ref: ['title'] }], as: 'firstInAlphabet', func: 'MIN' }) .groupBy('title') .search('Cat') - - expect(JSON.parse(JSON.stringify(cqn4sql(query, model)))).to.deep.equal(CQL` - SELECT from bookshop.Books as Books { - MIN(Books.title) as firstInAlphabet - } group by Books.title having search(MIN(Books.title), 'Cat')`) + const expected = CQL` + SELECT from bookshop.Books as Books { + MIN(Books.title) as firstInAlphabet + } group by Books.title having search(MIN(Books.title), 'Cat')` + expected.SELECT.having = [ {func: 'search', args: [{ list: [{func: 'MIN', args: [{ ref: ['Books', 'title']}]}] }, {val: 'Cat'}]}] + expect(JSON.parse(JSON.stringify(cqn4sql(query, model)))).to.deep.equal(expected) }) it('Ignore non string aggregates from being searched', () => { @@ -163,12 +162,13 @@ describe('Replace attribute search by search predicate', () => { ` query.SELECT.search = [{ val: 'x' }] - - expect(JSON.parse(JSON.stringify(cqn4sql(query, model)))).to.deep.equal(CQL` - SELECT from bookshop.Books as Books { - Books.title, - AVG(Books.stock) as searchRelevant, - } where search(Books.title, 'x') group by Books.title`) + const expected = CQL` + SELECT from bookshop.Books as Books { + Books.title, + AVG(Books.stock) as searchRelevant, + } where search(Books.title, 'x') group by Books.title` + expected.SELECT.where = [ {func: 'search', args: [{ list: [{ ref: ['Books', 'title']}] }, {val: 'x'}]}] + expect(JSON.parse(JSON.stringify(cqn4sql(query, model)))).to.deep.equal(expected) }) it('aggregations which are not of type string are not searched', () => { const query = CQL` @@ -197,12 +197,16 @@ describe('Replace attribute search by search predicate', () => { ` query.SELECT.search = [{ val: 'x' }] - - expect(JSON.parse(JSON.stringify(cqn4sql(query, model)))).to.deep.equal(CQL` - SELECT from bookshop.Books as Books { - Books.ID, - substring(Books.stock) as searchRelevantViaCast: cds.String, - } group by Books.title having search(substring(Books.stock), 'x')`) + const expected = CQL` + SELECT from bookshop.Books as Books { + Books.ID, + substring(Books.stock) as searchRelevantViaCast: cds.String, + } group by Books.title having search(substring(Books.stock), 'x')` + expected.SELECT.having = [ {func: 'search', args: [{ list: [{ + args: [ { ref: [ 'Books', 'stock' ] } ], + func: 'substring' + }] }, {val: 'x'}]}] + expect(JSON.parse(JSON.stringify(cqn4sql(query, model)))).to.deep.equal(expected) }) it('xpr is search relevant via cast', () => { // this aggregation is not relevant for search per default @@ -216,13 +220,21 @@ describe('Replace attribute search by search predicate', () => { ` query.SELECT.search = [{ val: 'x' }] - - expect(JSON.parse(JSON.stringify(cqn4sql(query, model)))).to.deep.equal(CQL` - SELECT from bookshop.Books as Books { - Books.ID, - ('very' + 'useful' + 'string') as searchRelevantViaCast: cds.String, - ('1' + '2' + '3') as notSearchRelevant: cds.Integer, - } group by Books.title having search(('very' + 'useful' + 'string'), 'x')`) + const expected = CQL` + SELECT from bookshop.Books as Books { + Books.ID, + ('very' + 'useful' + 'string') as searchRelevantViaCast: cds.String, + ('1' + '2' + '3') as notSearchRelevant: cds.Integer, + } group by Books.title` + expected.SELECT.having = [ {func: 'search', args: [{ list: [{ + xpr: [ + { val: 'very' }, + '+', + { val: 'useful' }, + '+', + { val: 'string' } + ] }] }, {val: 'x'}]}] + expect(JSON.parse(JSON.stringify(cqn4sql(query, model)))).to.deep.equal(expected) }) }) @@ -242,7 +254,8 @@ describe('search w/ path expressions', () => { { BooksSearchAuthorName.ID, BooksSearchAuthorName.title - } where search(author.lastName, 'x')` + }` + expected.SELECT.where = [ {func: 'search', args: [{ list: [{ref: ['author', 'lastName']}]}, {val: 'x'}]}] expect(JSON.parse(JSON.stringify(res))).to.deep.equal(expected) }) @@ -286,7 +299,8 @@ describe('search w/ path expressions', () => { { BookShelf.ID, BookShelf.genre - } where search((BookShelf.genre), 'Harry Plotter')` + }` + expected.SELECT.where = [ {func: 'search', args: [{ list: [{ref: ['BookShelf', 'genre']}]}, {val: 'Harry Plotter'}]}] expect(JSON.parse(JSON.stringify(res))).to.deep.equal(expected) }) }) @@ -316,11 +330,8 @@ describe('calculated elements', () => { query.SELECT.search = [{ val: 'x' }] let res = cqn4sql(query, model) - const expected = CQL` - SELECT from search.CalculatedAddressesWithoutAnno as Address - { - Address.ID - } where search((Address.city), 'x')` + const expected = CQL`SELECT from search.CalculatedAddressesWithoutAnno as Address { Address.ID }` + expected.SELECT.where = [ {func: 'search', args: [{ list: [{ref: ['Address', 'city']}]}, {val: 'x'}]}] expect(JSON.parse(JSON.stringify(res))).to.deep.equal(expected) }) }) diff --git a/hana/lib/cql-functions.js b/hana/lib/cql-functions.js index b78516553..0f7aa005e 100644 --- a/hana/lib/cql-functions.js +++ b/hana/lib/cql-functions.js @@ -24,10 +24,72 @@ const StandardFunctions = { contains: (...args) => args.length > 2 ? `CONTAINS(${args})` : `(CASE WHEN coalesce(locate(${args}),0)>0 THEN TRUE ELSE FALSE END)`, concat: (...args) => `(${args.map(a => (a.xpr ? `(${a})` : a)).join(' || ')})`, search: function (ref, arg) { + if (cds.env.hana.fuzzy === false) { + // REVISIT: remove once the protocol adapter only creates vals + arg = arg.xpr ? arg.xpr : arg + if (Array.isArray(arg)) arg = [{ val: arg.filter(a => a.val).map(a => a.val).join(' ') }] + else arg = [arg] + const searchTerms = arg[0].val + .match(/("")|("(?:[^"]|\\")*(?:[^\\]|\\\\)")|(\S*)/g) + .filter(el => el.length).map(el => `%${el.replace(/^\"|\"$/g, '').toLowerCase()}%`) + + const columns = ref.list + const xpr = [] + for (const s of searchTerms) { + const nestedXpr = [] + for (const c of columns) { + if (nestedXpr.length) nestedXpr.push('or') + nestedXpr.push({ func: 'lower', args: [c]}, 'like', {val: s}) + } + if (xpr.length) xpr.push('and') + xpr.push({xpr: nestedXpr}) + } + + const { toString } = ref + return `(CASE WHEN (${toString({ xpr })}) THEN TRUE ELSE FALSE END)` + } + + // fuzziness config + const fuzzyIndex = cds.env.hana?.fuzzy || 0.7 + + const csnElements = ref.list + // if column specific value is provided, the configuration has to be defined on column level + if (csnElements.some(e => e.element?.['@Search.ranking'] || e.element?.['@Search.fuzzinessThreshold'])) { + csnElements.forEach(e => { + let fuzzy = `FUZZY` + + // weighted search + const rank = e.element?.['@Search.ranking']?.['='] + switch(rank) { + case 'HIGH': + fuzzy += ' WEIGHT 0.8' + break + case 'LOW': + fuzzy += ' WEIGHT 0.3' + break + case 'MEDIUM': + case undefined: + fuzzy += ' WEIGHT 0.5' + break + default: throw new Error(`Invalid configuration ${rank} for @Search.ranking. HIGH, MEDIUM, LOW are supported values.`) + } + + // fuzziness + fuzzy+= ` MINIMAL TOKEN SCORE ${e.element?.['@Search.fuzzinessThreshold'] || fuzzyIndex} SIMILARITY CALCULATION MODE 'search'` + + // rewrite ref to xpr to mix in search config + // ensure in place modification to reuse .toString method that ensures quoting + e.xpr = [{ ref: e.ref }, fuzzy] + delete e.ref + }) + } else { + ref = `${ref} FUZZY MINIMAL TOKEN SCORE ${fuzzyIndex} SIMILARITY CALCULATION MODE 'search'` + } + // REVISIT: remove once the protocol adapter only creates vals if (Array.isArray(arg.xpr)) arg = { val: arg.xpr.filter(a => a.val).map(a => a.val).join(' ') } - // REVISIT: make this more configurable - return (`(CASE WHEN SCORE(${arg} IN ${ref} FUZZY MINIMAL TOKEN SCORE 0.7 SIMILARITY CALCULATION MODE 'search') > 0 THEN TRUE ELSE FALSE END)`) + + return (`(CASE WHEN SCORE(${arg} IN ${ref}) > 0 THEN TRUE ELSE FALSE END)`) }, // Date and Time Functions diff --git a/hana/test/fuzzy.cds b/hana/test/fuzzy.cds index 977c4458a..56e22edda 100644 --- a/hana/test/fuzzy.cds +++ b/hana/test/fuzzy.cds @@ -1 +1,5 @@ -using {sap.capire.bookshop.Books as Books} from '../../test/bookshop/db/schema.cds'; +using {sap.capire.bookshop.BooksAnnotated as BooksAnnotated} from '../../test/bookshop/db/schema.cds'; + +annotate BooksAnnotated with @cds.search: {title, descr, currency.code}; +annotate BooksAnnotated:title with @(Search.ranking: HIGH, Search.fuzzinessThreshold: 0.9); +annotate BooksAnnotated:descr with @(Search.ranking: LOW, Search.fuzzinessThreshold: 0.9); \ No newline at end of file diff --git a/hana/test/fuzzy.test.js b/hana/test/fuzzy.test.js index 857717c8f..a2ae4f103 100644 --- a/hana/test/fuzzy.test.js +++ b/hana/test/fuzzy.test.js @@ -1,19 +1,78 @@ const cds = require('../../test/cds') -describe('Fuzzy search', () => { +describe('search', () => { const { expect } = cds.test(__dirname, 'fuzzy.cds') - test('select', async () => { - const { Books } = cds.entities('sap.capire.bookshop') - const res = await SELECT.from(Books).where({ - func: 'contains', - args: [ - { list: [{ ref: ['title'] }, { ref: ['descr'] }] }, - { val: 'poem' }, - { func: 'FUZZY', args: [{ val: 0.8 }, { val: 'similarCalculationMode=searchCompare' }] } - ] + beforeEach (() => { + delete cds.env.hana.fuzzy + }) + + describe('fuzzy', () => { + test('default', async () => { + const { Books } = cds.entities('sap.capire.bookshop') + const cqn = SELECT.from(Books).search('"autobio"').columns('1') + const {sql} = cqn.toSQL() + expect(sql).to.include('FUZZY MINIMAL TOKEN SCORE 0.7') + const res = await cqn + expect(res.length).to.be(2) // Eleonora and Jane Eyre + }) + + //HCE returns different result than HXE + test.skip('multiple search terms', async () => { + const { Books } = cds.entities('sap.capire.bookshop') + const cqn = SELECT.from(Books).search('"autobio" "jane"').columns('1') + const {sql, values} = cqn.toSQL() + expect(sql).to.include('FUZZY MINIMAL TOKEN SCORE 0.7') + expect(values[0]).to.eq('"autobio" "jane"') // taken as is + const res = await cqn + expect(res.length).to.be(2) // Eleonora and Jane Eyre + }) + + test('global config', async () => { + cds.env.hana.fuzzy = 1 + const { Books } = cds.entities('sap.capire.bookshop') + const cqn = SELECT.from(Books).search('"autobio"').columns('1') + const {sql} = cqn.toSQL() + expect(sql).to.include('FUZZY MINIMAL TOKEN SCORE 1') + const res = await cqn + expect(res.length).to.be(2) // Eleonora and Jane Eyre }) - expect(res).to.have.property('length').to.be.eq(1) + test('annotations', async () => { + const { BooksAnnotated } = cds.entities('sap.capire.bookshop') + const cqn = SELECT.from(BooksAnnotated).search('"first-person"').columns('1') + const {sql} = cqn.toSQL() + expect(sql).to.include('title FUZZY WEIGHT 0.8 MINIMAL TOKEN SCORE 0.9') + expect(sql).to.include('code FUZZY WEIGHT 0.5 MINIMAL TOKEN SCORE 0.7') + expect(sql).to.include('descr FUZZY WEIGHT 0.3 MINIMAL TOKEN SCORE 0.9') + + const res = await cqn + expect(res.length).to.be(1) // jane eyre + }) + }) + + describe('like', () => { + beforeEach (() => cds.env.hana.fuzzy = false) + test('fallback - 1 search term', async () => { + const { Books } = cds.entities('sap.capire.bookshop') + const cqn = SELECT.from(Books).search('"autobio"').columns('1') + const {sql} = cqn.toSQL() + // 5 columns to be searched createdBy, modifiedBy, title, descr, currency_code + expect(sql.match(/(like)/g).length).to.be(5) + const res = await cqn + expect(res.length).to.be(2) // Eleonora and Jane Eyre + }) + + test('fallback - 2 search terms', async () => { + const { Books } = cds.entities('sap.capire.bookshop') + const cqn = SELECT.from(Books).search('"autobio"', '"Jane"').columns('1') + const {sql, values} = cqn.toSQL() + // 5 columns to be searched createdBy, modifiedBy, title, descr, currency_code + expect(sql.match(/(like)/g).length).to.be(10) + expect(values).to.include('%autobio%') + expect(values).to.include('%jane%') + const res = await cqn + expect(res.length).to.be(1) // Jane Eyre + }) }) }) \ No newline at end of file diff --git a/test/bookshop/db/schema.cds b/test/bookshop/db/schema.cds index f00593075..72931bd71 100644 --- a/test/bookshop/db/schema.cds +++ b/test/bookshop/db/schema.cds @@ -67,4 +67,5 @@ entity C : managed { B : Integer; toB : Composition of many B on toB.ID = $self.B; -} +}; +entity BooksAnnotated as projection on Books; \ No newline at end of file diff --git a/test/compliance/SELECT.test.js b/test/compliance/SELECT.test.js index 49625ded7..5083391fd 100644 --- a/test/compliance/SELECT.test.js +++ b/test/compliance/SELECT.test.js @@ -433,7 +433,7 @@ describe('SELECT', () => { // search tests don't check results as the search behavior is undefined test('search one column', async () => { const { string } = cds.entities('basic.literals') - const cqn = CQL`SELECT * FROM ${string} WHERE search((string),${'yes'})` + const cqn = SELECT.from(string).where([{func: 'search', args: [{list: [{ref: ['string']}]}, {val: 'yes'}]}]) await cds.run(cqn) }) @@ -994,7 +994,7 @@ describe('SELECT', () => { unified.scalar = [ // TODO: investigate search issue for nvarchar columns ...unified.ref.filter(ref => cds.builtin.types[ref.element?.type] === cds.builtin.types.LargeString).map(ref => { - return unified.string.map(val => ({ func: 'search', args: [ref, val] })) + return unified.string.map(val => ({ func: 'search', args: [{list:[ref]}, val] })) }).flat(), // ...unified.string.map(val => ({ func: 'search', args: [{ list: unified.ref.filter(stringRefs) }, val] })), ...unified.ref.filter(stringRefs).filter(noBooleanRefs).map(X => {