From f6593e69de6df3e85a39c048794a56c7eb842c4c Mon Sep 17 00:00:00 2001
From: Johannes Vogel <31311694+johannes-vogel@users.noreply.github.com>
Date: Fri, 22 Nov 2024 13:22:36 +0100
Subject: [PATCH] feat: config options for fuzzy search (#898)

Co-authored-by: Bob den Os <108393871+BobdenOs@users.noreply.github.com>
---
 db-service/lib/cql-functions.js        |   2 +-
 db-service/lib/cqn4sql.js              |   2 +-
 db-service/test/cqn4sql/search.test.js | 105 ++++++++++++++-----------
 hana/lib/cql-functions.js              |  66 +++++++++++++++-
 hana/test/fuzzy.cds                    |   6 +-
 hana/test/fuzzy.test.js                |  81 ++++++++++++++++---
 test/bookshop/db/schema.cds            |   3 +-
 test/compliance/SELECT.test.js         |   4 +-
 8 files changed, 203 insertions(+), 66 deletions(-)

diff --git a/db-service/lib/cql-functions.js b/db-service/lib/cql-functions.js
index 0165278e5..6a7206119 100644
--- a/db-service/lib/cql-functions.js
+++ b/db-service/lib/cql-functions.js
@@ -33,7 +33,7 @@ const StandardFunctions = {
       val = sub[2] || sub[3] || ''
     }
     arg.val = arg.__proto__.val = val
-    const refs = ref.list || [ref]
+    const refs = ref.list
     const { toString } = ref
     return '(' + refs.map(ref2 => this.contains(this.tolower(toString(ref2)), this.tolower(arg))).join(' or ') + ')'
   },
diff --git a/db-service/lib/cqn4sql.js b/db-service/lib/cqn4sql.js
index 739597abf..57b68f7a6 100644
--- a/db-service/lib/cqn4sql.js
+++ b/db-service/lib/cqn4sql.js
@@ -2203,7 +2203,7 @@ function cqn4sql(originalQuery, model) {
       const searchFunc = {
         func: 'search',
         args: [
-          searchIn.length > 1 ? { list: searchIn } : { ...searchIn[0] },
+          { list: searchIn },
           xpr.length === 1 && 'val' in xpr[0] ? xpr[0] : { xpr },
         ],
       }
diff --git a/db-service/test/cqn4sql/search.test.js b/db-service/test/cqn4sql/search.test.js
index be44de4d0..7904150d7 100644
--- a/db-service/test/cqn4sql/search.test.js
+++ b/db-service/test/cqn4sql/search.test.js
@@ -16,9 +16,8 @@ describe('Replace attribute search by search predicate', () => {
 
     let res = cqn4sql(query, model)
     // single val is stored as val directly, not as expr with val
-    const expected = CQL`SELECT from bookshop.WithStructuredKey as wsk {
-      wsk.second
-    } where search(wsk.second, 'x')`
+    const expected = CQL`SELECT from bookshop.WithStructuredKey as wsk { wsk.second }`
+    expected.SELECT.where = [ {func: 'search', args: [{ list: [{ ref: ['wsk', 'second']}] }, {val: 'x'}]}]
     expect(JSON.parse(JSON.stringify(res))).to.deep.equal(expected)
   })
 
@@ -28,9 +27,8 @@ describe('Replace attribute search by search predicate', () => {
     query.SELECT.search = [{ val: 'x' }, 'or', { val: 'y' }]
 
     let res = cqn4sql(query, model)
-    const expected = CQL`SELECT from bookshop.WithStructuredKey as wsk {
-      wsk.second
-    } where search(wsk.second, ('x' OR 'y'))`
+    const expected = CQL`SELECT from bookshop.WithStructuredKey as wsk { wsk.second }`
+    expected.SELECT.where = [ {func: 'search', args: [{ list: [{ ref: ['wsk', 'second']}] }, {xpr: [{val: 'x'}, 'or', {val: 'y'}]}]}]
     expect(JSON.parse(JSON.stringify(res))).to.deep.equal(expected)
   })
 
@@ -109,16 +107,16 @@ describe('Replace attribute search by search predicate', () => {
     query.SELECT.search = [{ val: 'x' }, 'or', { val: 'y' }]
 
     let res = cqn4sql(query, model)
-    expect(JSON.parse(JSON.stringify(res))).to.deep.equal(
-      CQL`
-      SELECT from bookshop.Books as Books
-        left join bookshop.Authors as author on author.ID = Books.author_ID
-        left join bookshop.Books as books2 on  books2.author_ID = author.ID
-      {
-        Books.ID,
-        books2.title as authorsBook
-      } where search(books2.title, ('x' OR 'y')) group by Books.title `,
-    )
+    const expected =       CQL`
+    SELECT from bookshop.Books as Books
+      left join bookshop.Authors as author on author.ID = Books.author_ID
+      left join bookshop.Books as books2 on  books2.author_ID = author.ID
+    {
+      Books.ID,
+      books2.title as authorsBook
+    } where search(books2.title, ('x' OR 'y')) group by Books.title `
+    expected.SELECT.where = [ {func: 'search', args: [{ list: [{ ref: ['books2', 'title']}] }, {xpr: [{val: 'x'}, 'or', {val: 'y'}]}]}]
+    expect(JSON.parse(JSON.stringify(res))).to.deep.equal(expected)
   })
   it('Search on navigation', () => {
     let query = CQL`SELECT from bookshop.Authors:books { ID }`
@@ -147,11 +145,12 @@ describe('Replace attribute search by search predicate', () => {
       .columns({ args: [{ ref: ['title'] }], as: 'firstInAlphabet', func: 'MIN' })
       .groupBy('title')
       .search('Cat')
-
-    expect(JSON.parse(JSON.stringify(cqn4sql(query, model)))).to.deep.equal(CQL`
-      SELECT from bookshop.Books as Books {
-        MIN(Books.title) as firstInAlphabet
-      } group by Books.title having search(MIN(Books.title), 'Cat')`)
+    const expected = CQL`
+    SELECT from bookshop.Books as Books {
+      MIN(Books.title) as firstInAlphabet
+    } group by Books.title having search(MIN(Books.title), 'Cat')`
+    expected.SELECT.having = [ {func: 'search', args: [{ list: [{func: 'MIN', args: [{ ref: ['Books', 'title']}]}] }, {val: 'Cat'}]}]
+    expect(JSON.parse(JSON.stringify(cqn4sql(query, model)))).to.deep.equal(expected)
   })
 
   it('Ignore non string aggregates from being searched', () => {
@@ -163,12 +162,13 @@ describe('Replace attribute search by search predicate', () => {
       `
 
     query.SELECT.search = [{ val: 'x' }]
-
-    expect(JSON.parse(JSON.stringify(cqn4sql(query, model)))).to.deep.equal(CQL`
-      SELECT from bookshop.Books as Books {
-        Books.title,
-        AVG(Books.stock) as searchRelevant,
-      } where search(Books.title, 'x') group by Books.title`)
+    const expected = CQL`
+    SELECT from bookshop.Books as Books {
+      Books.title,
+      AVG(Books.stock) as searchRelevant,
+    } where search(Books.title, 'x') group by Books.title`
+    expected.SELECT.where = [ {func: 'search', args: [{ list: [{ ref: ['Books', 'title']}] }, {val: 'x'}]}]
+    expect(JSON.parse(JSON.stringify(cqn4sql(query, model)))).to.deep.equal(expected)
   })
   it('aggregations which are not of type string are not searched', () => {
     const query = CQL`
@@ -197,12 +197,16 @@ describe('Replace attribute search by search predicate', () => {
       `
 
     query.SELECT.search = [{ val: 'x' }]
-
-    expect(JSON.parse(JSON.stringify(cqn4sql(query, model)))).to.deep.equal(CQL`
-      SELECT from bookshop.Books as Books {
-        Books.ID,
-        substring(Books.stock) as searchRelevantViaCast: cds.String,
-      } group by Books.title having search(substring(Books.stock), 'x')`)
+    const expected = CQL`
+    SELECT from bookshop.Books as Books {
+      Books.ID,
+      substring(Books.stock) as searchRelevantViaCast: cds.String,
+    } group by Books.title having search(substring(Books.stock), 'x')`
+    expected.SELECT.having = [ {func: 'search', args: [{ list: [{
+      args: [ { ref: [ 'Books', 'stock' ] } ],
+      func: 'substring'
+    }] }, {val: 'x'}]}]
+    expect(JSON.parse(JSON.stringify(cqn4sql(query, model)))).to.deep.equal(expected)
   })
   it('xpr is search relevant via cast', () => {
     // this aggregation is not relevant for search per default
@@ -216,13 +220,21 @@ describe('Replace attribute search by search predicate', () => {
       `
 
     query.SELECT.search = [{ val: 'x' }]
-
-    expect(JSON.parse(JSON.stringify(cqn4sql(query, model)))).to.deep.equal(CQL`
-      SELECT from bookshop.Books as Books {
-        Books.ID,
-        ('very' + 'useful' + 'string') as searchRelevantViaCast: cds.String,
-        ('1' + '2' + '3') as notSearchRelevant: cds.Integer,
-      } group by Books.title having search(('very' + 'useful' + 'string'), 'x')`)
+    const expected = CQL`
+    SELECT from bookshop.Books as Books {
+      Books.ID,
+      ('very' + 'useful' + 'string') as searchRelevantViaCast: cds.String,
+      ('1' + '2' + '3') as notSearchRelevant: cds.Integer,
+    } group by Books.title`
+    expected.SELECT.having = [ {func: 'search', args: [{ list: [{
+      xpr: [
+        { val: 'very' },
+        '+',
+        { val: 'useful' },
+        '+',
+        { val: 'string' }
+      ] }] }, {val: 'x'}]}]
+    expect(JSON.parse(JSON.stringify(cqn4sql(query, model)))).to.deep.equal(expected)
   })
 })
 
@@ -242,7 +254,8 @@ describe('search w/ path expressions', () => {
     {
       BooksSearchAuthorName.ID,
       BooksSearchAuthorName.title
-  } where search(author.lastName, 'x')`
+  }`
+    expected.SELECT.where = [ {func: 'search', args: [{ list: [{ref: ['author', 'lastName']}]}, {val: 'x'}]}]
     expect(JSON.parse(JSON.stringify(res))).to.deep.equal(expected)
   })
 
@@ -286,7 +299,8 @@ describe('search w/ path expressions', () => {
     {
       BookShelf.ID,
       BookShelf.genre
-  } where search((BookShelf.genre), 'Harry Plotter')`
+  }`
+    expected.SELECT.where = [ {func: 'search', args: [{ list: [{ref: ['BookShelf', 'genre']}]}, {val: 'Harry Plotter'}]}]
     expect(JSON.parse(JSON.stringify(res))).to.deep.equal(expected)
   })
 })
@@ -316,11 +330,8 @@ describe('calculated elements', () => {
     query.SELECT.search = [{ val: 'x' }]
 
     let res = cqn4sql(query, model)
-    const expected = CQL`
-    SELECT from search.CalculatedAddressesWithoutAnno as Address
-    {
-      Address.ID
-  } where search((Address.city), 'x')`
+    const expected = CQL`SELECT from search.CalculatedAddressesWithoutAnno as Address { Address.ID }`
+    expected.SELECT.where = [ {func: 'search', args: [{ list: [{ref: ['Address', 'city']}]}, {val: 'x'}]}]
     expect(JSON.parse(JSON.stringify(res))).to.deep.equal(expected)
   })
 })
diff --git a/hana/lib/cql-functions.js b/hana/lib/cql-functions.js
index b78516553..0f7aa005e 100644
--- a/hana/lib/cql-functions.js
+++ b/hana/lib/cql-functions.js
@@ -24,10 +24,72 @@ const StandardFunctions = {
   contains: (...args) => args.length > 2 ? `CONTAINS(${args})` : `(CASE WHEN coalesce(locate(${args}),0)>0 THEN TRUE ELSE FALSE END)`,
   concat: (...args) => `(${args.map(a => (a.xpr ? `(${a})` : a)).join(' || ')})`,
   search: function (ref, arg) {
+    // Renders the search(<columns>, <term>) predicate as a HANA boolean CASE expression.
+    // ref is expected to be a { list: [...] } of the searched columns, arg a { val } or { xpr } search term.
+    // cds.env.hana.fuzzy: false -> LIKE based fallback, truthy value -> global MINIMAL TOKEN SCORE, otherwise 0.7.
+    if (cds.env.hana?.fuzzy === false) {
+      // REVISIT: remove once the protocol adapter only creates vals
+      arg = arg.xpr ? arg.xpr : arg
+      if (Array.isArray(arg)) arg = [{ val: arg.filter(a => a.val).map(a => a.val).join(' ') }]
+      else arg = [arg]
+      // split into whitespace separated or double quoted terms, each turned into a lowercase %term% LIKE pattern
+      const searchTerms = arg[0].val
+          .match(/("")|("(?:[^"]|\\")*(?:[^\\]|\\\\)")|(\S*)/g)
+          .filter(el => el.length).map(el => `%${el.replace(/^"|"$/g, '').toLowerCase()}%`)
+      // every term has to match (and) in at least one of the columns (or)
+      const columns = ref.list
+      const xpr = []
+      for (const s of searchTerms) {
+        const nestedXpr = []
+        for (const c of columns) {
+          if (nestedXpr.length) nestedXpr.push('or')
+          nestedXpr.push({ func: 'lower', args: [c]}, 'like', {val: s})
+        }
+        if (xpr.length) xpr.push('and')
+        xpr.push({xpr: nestedXpr})
+      }
+
+      const { toString } = ref
+      return `(CASE WHEN (${toString({ xpr })}) THEN TRUE ELSE FALSE END)`
+    }
+
+    // fuzziness config: global minimal token score; NOTE: because of `||` a configured 0 also falls back to 0.7
+    const fuzzyIndex = cds.env.hana?.fuzzy || 0.7
+    const csnElements = ref.list
+    // if column specific value is provided, the configuration has to be defined on column level
+    if (csnElements.some(e => e.element?.['@Search.ranking'] || e.element?.['@Search.fuzzinessThreshold'])) {
+      csnElements.forEach(e => {
+        let fuzzy = `FUZZY`
+        // weighted search: @Search.ranking maps to a WEIGHT, columns without a ranking get the MEDIUM weight
+        const rank = e.element?.['@Search.ranking']?.['=']
+        switch(rank) {
+          case 'HIGH':
+            fuzzy += ' WEIGHT 0.8'
+            break
+          case 'LOW':
+            fuzzy += ' WEIGHT 0.3'
+            break
+          case 'MEDIUM':
+          case undefined:
+            fuzzy += ' WEIGHT 0.5'
+            break
+          default: throw new Error(`Invalid configuration ${rank} for @Search.ranking. HIGH, MEDIUM, LOW are supported values.`)
+        }
+        // fuzziness: a column level @Search.fuzzinessThreshold wins over the global score
+        fuzzy += ` MINIMAL TOKEN SCORE ${e.element?.['@Search.fuzzinessThreshold'] || fuzzyIndex} SIMILARITY CALCULATION MODE 'search'`
+        // rewrite ref to xpr to mix in search config
+        // ensure in place modification to reuse .toString method that ensures quoting
+        e.xpr = [{ ref: e.ref }, fuzzy]
+        delete e.ref
+      })
+    } else {
+      ref = `${ref} FUZZY MINIMAL TOKEN SCORE ${fuzzyIndex} SIMILARITY CALCULATION MODE 'search'`
+    }
+
     // REVISIT: remove once the protocol adapter only creates vals
     if (Array.isArray(arg.xpr)) arg = { val: arg.xpr.filter(a => a.val).map(a => a.val).join(' ') }
-    // REVISIT: make this more configurable
-    return (`(CASE WHEN SCORE(${arg} IN ${ref} FUZZY MINIMAL TOKEN SCORE 0.7 SIMILARITY CALCULATION MODE 'search') > 0 THEN TRUE ELSE FALSE END)`)
+
+    return (`(CASE WHEN SCORE(${arg} IN ${ref}) > 0 THEN TRUE ELSE FALSE END)`)
   },
 
   // Date and Time Functions
diff --git a/hana/test/fuzzy.cds b/hana/test/fuzzy.cds
index 977c4458a..56e22edda 100644
--- a/hana/test/fuzzy.cds
+++ b/hana/test/fuzzy.cds
@@ -1 +1,5 @@
-using {sap.capire.bookshop.Books as Books} from '../../test/bookshop/db/schema.cds';
+using {sap.capire.bookshop.BooksAnnotated as BooksAnnotated} from '../../test/bookshop/db/schema.cds';
+
+annotate BooksAnnotated with @cds.search: {title, descr, currency.code}; // columns searched on BooksAnnotated
+annotate BooksAnnotated:title with @(Search.ranking: HIGH, Search.fuzzinessThreshold: 0.9); // test expects WEIGHT 0.8 and MINIMAL TOKEN SCORE 0.9
+annotate BooksAnnotated:descr with @(Search.ranking: LOW, Search.fuzzinessThreshold: 0.9); // test expects WEIGHT 0.3 and MINIMAL TOKEN SCORE 0.9
\ No newline at end of file
diff --git a/hana/test/fuzzy.test.js b/hana/test/fuzzy.test.js
index 857717c8f..a2ae4f103 100644
--- a/hana/test/fuzzy.test.js
+++ b/hana/test/fuzzy.test.js
@@ -1,19 +1,78 @@
 const cds = require('../../test/cds')
 
-describe('Fuzzy search', () => {
+describe('search', () => {
   const { expect } = cds.test(__dirname, 'fuzzy.cds')
 
-  test('select', async () => {
-    const { Books } = cds.entities('sap.capire.bookshop')
-    const res = await SELECT.from(Books).where({
-      func: 'contains',
-      args: [
-        { list: [{ ref: ['title'] }, { ref: ['descr'] }] },
-        { val: 'poem' },
-        { func: 'FUZZY', args: [{ val: 0.8 }, { val: 'similarCalculationMode=searchCompare' }] }
-      ]
+  beforeEach (() => {
+    delete cds.env.hana.fuzzy // reset the global fuzzy config so every test starts from the default
+  })
+
+  describe('fuzzy', () => {
+    test('default', async () => {
+      const { Books } = cds.entities('sap.capire.bookshop')
+      const cqn = SELECT.from(Books).search('"autobio"').columns('1')
+      const {sql} = cqn.toSQL()
+      expect(sql).to.include('FUZZY MINIMAL TOKEN SCORE 0.7') // default score when cds.env.hana.fuzzy is not set
+      const res = await cqn
+      expect(res.length).to.be(2) // Eleonora and Jane Eyre
+    })
+    
+    // skipped: HCE returns a different result than HXE
+    test.skip('multiple search terms', async () => {
+      const { Books } = cds.entities('sap.capire.bookshop')
+      const cqn = SELECT.from(Books).search('"autobio" "jane"').columns('1')
+      const {sql, values} = cqn.toSQL()
+      expect(sql).to.include('FUZZY MINIMAL TOKEN SCORE 0.7')
+      expect(values[0]).to.eq('"autobio" "jane"') // the search term is passed on as is
+      const res = await cqn
+      expect(res.length).to.be(2) // Eleonora and Jane Eyre
+    })
+    
+    test('global config', async () => {
+      cds.env.hana.fuzzy = 1 // global override of the minimal token score
+      const { Books } = cds.entities('sap.capire.bookshop')
+      const cqn = SELECT.from(Books).search('"autobio"').columns('1')
+      const {sql} = cqn.toSQL()
+      expect(sql).to.include('FUZZY MINIMAL TOKEN SCORE 1')
+      const res = await cqn
+      expect(res.length).to.be(2) // Eleonora and Jane Eyre
     })
 
-    expect(res).to.have.property('length').to.be.eq(1)
+    test('annotations', async () => {
+      const { BooksAnnotated } = cds.entities('sap.capire.bookshop')
+      const cqn = SELECT.from(BooksAnnotated).search('"first-person"').columns('1')
+      const {sql} = cqn.toSQL()
+      expect(sql).to.include('title FUZZY WEIGHT 0.8 MINIMAL TOKEN SCORE 0.9') // Search.ranking HIGH, Search.fuzzinessThreshold 0.9
+      expect(sql).to.include('code FUZZY WEIGHT 0.5 MINIMAL TOKEN SCORE 0.7') // not annotated: MEDIUM weight and the default score
+      expect(sql).to.include('descr FUZZY WEIGHT 0.3 MINIMAL TOKEN SCORE 0.9') // Search.ranking LOW, Search.fuzzinessThreshold 0.9
+      
+      const res = await cqn
+      expect(res.length).to.be(1) // Jane Eyre
+    })
+  })
+
+  describe('like', () => {
+    beforeEach (() => cds.env.hana.fuzzy = false) // fuzzy: false switches search to the LIKE based fallback
+    test('fallback - 1 search term', async () => {
+      const { Books } = cds.entities('sap.capire.bookshop')
+      const cqn = SELECT.from(Books).search('"autobio"').columns('1')
+      const {sql} = cqn.toSQL()
+      // 5 columns to be searched: createdBy, modifiedBy, title, descr, currency_code
+      expect(sql.match(/(like)/g).length).to.be(5)
+      const res = await cqn
+      expect(res.length).to.be(2) // Eleonora and Jane Eyre
+    })
+    
+    test('fallback - 2 search terms', async () => {
+      const { Books } = cds.entities('sap.capire.bookshop')
+      const cqn = SELECT.from(Books).search('"autobio"', '"Jane"').columns('1')
+      const {sql, values} = cqn.toSQL()
+      // 5 columns to be searched: createdBy, modifiedBy, title, descr, currency_code
+      expect(sql.match(/(like)/g).length).to.be(10) // 2 search terms x 5 columns
+      expect(values).to.include('%autobio%') // terms are lowercased and wrapped in % wildcards
+      expect(values).to.include('%jane%')
+      const res = await cqn
+      expect(res.length).to.be(1) // Jane Eyre
+    })
   })
 })
\ No newline at end of file
diff --git a/test/bookshop/db/schema.cds b/test/bookshop/db/schema.cds
index f00593075..72931bd71 100644
--- a/test/bookshop/db/schema.cds
+++ b/test/bookshop/db/schema.cds
@@ -67,4 +67,5 @@ entity C : managed {
       B   : Integer;
       toB : Composition of many B
               on toB.ID = $self.B;
-}
+};
+entity BooksAnnotated as projection on Books; // projection that hana/test/fuzzy.cds annotates with @cds.search / @Search settings
\ No newline at end of file
diff --git a/test/compliance/SELECT.test.js b/test/compliance/SELECT.test.js
index 49625ded7..5083391fd 100644
--- a/test/compliance/SELECT.test.js
+++ b/test/compliance/SELECT.test.js
@@ -433,7 +433,7 @@ describe('SELECT', () => {
     // search tests don't check results as the search behavior is undefined
     test('search one column', async () => {
       const { string } = cds.entities('basic.literals')
-      const cqn = CQL`SELECT * FROM ${string} WHERE search((string),${'yes'})`
+      const cqn = SELECT.from(string).where([{func: 'search', args: [{list: [{ref: ['string']}]}, {val: 'yes'}]}]) // plain CQN: the single searched column is wrapped in a list
       await cds.run(cqn)
     })
 
@@ -994,7 +994,7 @@ describe('SELECT', () => {
     unified.scalar = [
       // TODO: investigate search issue for nvarchar columns
       ...unified.ref.filter(ref => cds.builtin.types[ref.element?.type] === cds.builtin.types.LargeString).map(ref => {
-        return unified.string.map(val => ({ func: 'search', args: [ref, val] }))
+        return unified.string.map(val => ({ func: 'search', args: [{list:[ref]}, val] }))
       }).flat(),
       // ...unified.string.map(val => ({ func: 'search', args: [{ list: unified.ref.filter(stringRefs) }, val] })),
       ...unified.ref.filter(stringRefs).filter(noBooleanRefs).map(X => {