diff --git a/README.md b/README.md index 4bd123e..e0c3cb5 100644 --- a/README.md +++ b/README.md @@ -24,6 +24,21 @@ transformSpaces('Hello world'); } ``` +### transformDiacritics(str) +- removes diacritics from *str* +- returns an object containing *str* without diacritics and a mapping whose entries give, for each consecutive segment, its length in the transformed and in the original string +``` +transformDiacritics('Straße'); +=> { + str: 'Strasse', + mapping: [ + { transformed: 4, original: 4 }, + { transformed: 2, original: 1 }, + { transformed: 1, original: 1 }, + ], + } +``` + ### backTransformPositions(positions, mapping) - returns the positions in the original string as an array ``` diff --git a/package.json b/package.json index 092e076..7c29d2e 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "srch", - "version": "1.2.1", + "version": "1.3.0", "description": "full text search", "main": "src/index.js", "scripts": { @@ -18,6 +18,26 @@ "nyc": "^10.0.0", "should": "^11.1.0" }, + "keywords": [ + "search", + "string", + "index", + "full text", + "mapping", + "lower case", + "diacritics", + "space" + ], "author": "Jennifer Gebske ", - "license": "GPL-3.0" + "license": "GPL-3.0", + "repository": { + "type": "git", + "url": "https://github.com/paperhive/srch.git" + }, + "bugs": { + "url": "https://github.com/paperhive/srch/issues" + }, + "dependencies": { + "diacritics": "^1.3.0" + } } diff --git a/src/index.js b/src/index.js index 71ad3ea..442ebb1 100644 --- a/src/index.js +++ b/src/index.js @@ -1,8 +1,12 @@ +const diacriticsMap = require('diacritics').diacriticsMap; + + // cf. 
https://github.com/sindresorhus/escape-string-regexp
 function escapeRegExpCharacters(str) {
   return str.replace(/[|\\{}()[\]^$+*?.]/g, '\\$&');
 }
 
+
 exports.findPositions = function findPositions(fullstr, searchstr) {
   const regExpSearchstr = new RegExp(escapeRegExpCharacters(searchstr), 'g');
   let match = [];
@@ -44,6 +48,54 @@ exports.transformSpaces = function transformSpaces(str) {
 };
 
 
+/**
+ * Replaces non-ASCII characters in `str` with their entries from
+ * `diacriticsMap` and records how segment lengths of the result relate to
+ * segment lengths of the input.
+ *
+ * @param {string} str - string to transform
+ * @returns {{str: string, mapping: Array<{transformed: number, original: number}>}}
+ *   the transformed string plus consecutive, non-empty segments; each entry
+ *   holds a segment's length in the transformed and in the original string
+ * @throws {Error} 'strings out of sync' if the tails after the last recorded
+ *   replacement differ in length
+ */
+exports.transformDiacritics = function transformDiacritics(str) {
+  const mapping = [];
+  let lastOffsetOriginal = 0;
+  let lastOffsetTransformed = 0;
+
+  // no `u` flag: every match is exactly one UTF-16 code unit
+  const transformedStr = str.replace(/[^\u0000-\u007e]/g, (match, offset) => {
+    const replacement = diacriticsMap[match] || match;
+
+    // only replacements longer than one character shift the offsets
+    if (replacement.length > 1) {
+      const remain = offset - lastOffsetOriginal;
+      // omit the unchanged segment when it is empty (replacement at the start
+      // of the string or directly after the previous one)
+      if (remain > 0) mapping.push({ transformed: remain, original: remain });
+      mapping.push({ transformed: replacement.length, original: match.length });
+
+      lastOffsetOriginal = offset + match.length;
+      lastOffsetTransformed += remain + replacement.length;
+    }
+
+    return replacement;
+  });
+
+  const lenRestTransformed = transformedStr.length - lastOffsetTransformed;
+  const lenRestOriginal = str.length - lastOffsetOriginal;
+
+  if (lenRestOriginal !== lenRestTransformed) throw new Error('strings out of sync');
+  if (lenRestOriginal > 0) {
+    mapping.push({ transformed: lenRestTransformed, original: lenRestOriginal });
+  }
+
+  return { str: transformedStr, mapping };
+};
+
+
 // assumes that elements in positions are sorted
 exports.backTransformPositions = function backTransformPositions(positions, mapping) {
   const transformedPositions = [];
@@ -132,7 +184,8 @@ function transform(transformations, str) {
 
 exports.SearchIndex = class SearchIndex {
   constructor(str) {
-    this.transformations = [exports.transformSpaces, transformLowercase];
+    this.transformations =
+      [exports.transformSpaces, exports.transformDiacritics, transformLowercase];
     const {transformedStr, mapping} = transform(this.transformations,
str); this.transformedStr = transformedStr; diff --git a/test/index.js b/test/index.js index 9079b13..a287e0b 100644 --- a/test/index.js +++ b/test/index.js @@ -23,6 +23,7 @@ describe('findPositions()', () => { }); }); + describe('transformSpaces', () => { it('should return input if no multiple spaces', () => { srch.transformSpaces('this is a test') @@ -47,6 +48,54 @@ describe('transformSpaces', () => { }); }); + +describe('transformDiacritics', () => { + it('should return input if no diacritics', () => { + srch.transformDiacritics('this is a test') + .should.eql({ + str: 'this is a test', + mapping: [{ transformed: 14, original: 14 }], + }); + }); + + it('should remove diacritics (mapping: original shorter than transformed)', () => { + srch.transformDiacritics('Straße') + .should.eql({ + str: 'Strasse', + mapping: [ + { transformed: 4, original: 4 }, + { transformed: 2, original: 1 }, + { transformed: 1, original: 1 }, + ], + }); + }); + + it('should remove diacritics of 2 words (mapping: original shorter than transformed)', () => { + srch.transformDiacritics('Spaß mit Soße') + .should.eql({ + str: 'Spass mit Sosse', + mapping: [ + { transformed: 3, original: 3 }, + { transformed: 2, original: 1 }, + { transformed: 7, original: 7 }, + { transformed: 2, original: 1 }, + { transformed: 1, original: 1 }, + ], + }); + }); + + it('should remove diacritics (mapping: original equal to transformed)', () => { + srch.transformDiacritics('Iлtèrnåtïonɑlíƶatï߀ԉ') + .should.eql({ + str: 'Internationalizati0n', + mapping: [ + { transformed: 20, original: 20 }, + ], + }); + }); +}); + + describe('backTransformPositions()', () => { it('should return the original positions', () => { // original: 'hällo world' @@ -64,6 +113,7 @@ describe('backTransformPositions()', () => { }); }); + describe('backTransformRange', () => { const transformations = [ {original: 3, transformed: 3, textObject: 1}, @@ -175,6 +225,7 @@ describe('backTransformRange', () => { }); }); + describe('SearchIndex', 
() => { it('should find ranges', () => { const index = new srch.SearchIndex('this is a test');