Skip to content

Commit

Permalink
Merge pull request #9 from paperhive/diacritics
Browse files Browse the repository at this point in the history
Add function to remove diacritics
  • Loading branch information
andrenarchy authored Jan 17, 2017
2 parents 5ed971e + b36176f commit a716290
Show file tree
Hide file tree
Showing 4 changed files with 128 additions and 3 deletions.
15 changes: 15 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,21 @@ transformSpaces('Hello world');
}
```

### transformDiacritics(str)
- removes diacritics from *str* (e.g. `ß` becomes `ss`)
- returns an object containing *str* without diacritics and a mapping that relates segment lengths of the transformed string to those of the original string
```
transformDiacritics('Straße');
=> {
str: 'Strasse',
mapping: [
{ transformed: 4, original: 4 },
{ transformed: 2, original: 1 },
{ transformed: 1, original: 1 },
],
}
```

### backTransformPositions(positions, mapping)
- returns the positions in the original string as an array
```
Expand Down
24 changes: 22 additions & 2 deletions package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "srch",
"version": "1.2.1",
"version": "1.3.0",
"description": "full text search",
"main": "src/index.js",
"scripts": {
Expand All @@ -18,6 +18,26 @@
"nyc": "^10.0.0",
"should": "^11.1.0"
},
"keywords": [
"search",
"string",
"index",
"full text",
"mapping",
"lower case",
"diacritics",
"space"
],
"author": "Jennifer Gebske <jennifer@paperhive.org>",
"license": "GPL-3.0"
"license": "GPL-3.0",
"repository": {
"type": "git",
"url": "https://github.com/paperhive/srch.git"
},
"bugs": {
"url": "https://github.com/paperhive/srch/issues"
},
"dependencies": {
"diacritics": "^1.3.0"
}
}
41 changes: 40 additions & 1 deletion src/index.js
Original file line number Diff line number Diff line change
@@ -1,8 +1,12 @@
const diacriticsMap = require('diacritics').diacriticsMap;


// cf. https://github.com/sindresorhus/escape-string-regexp
/**
 * Escapes every character of `str` that has a special meaning inside a
 * regular expression, so the result can be passed to `new RegExp()` and
 * matches the input literally.
 *
 * @param {string} str - text to escape
 * @returns {string} `str` with each special character prefixed by a backslash
 */
function escapeRegExpCharacters(str) {
  const specialCharacters = /[|\\{}()[\]^$+*?.]/g;
  const prefixWithBackslash = (character) => `\\${character}`;
  return str.replace(specialCharacters, prefixWithBackslash);
}


exports.findPositions = function findPositions(fullstr, searchstr) {
const regExpSearchstr = new RegExp(escapeRegExpCharacters(searchstr), 'g');
let match = [];
Expand Down Expand Up @@ -44,6 +48,40 @@ exports.transformSpaces = function transformSpaces(str) {
};


exports.transformDiacritics = function transformDiacritics(str) {
const mapping = [];
let lastOffsetOriginal = 0;
let lastOffsetTransformed = 0;
const transformedStr = str.replace(/[^\u0000-\u007e]/g, (match, offset) => {
const replacement = diacriticsMap[match] || match;

if (replacement.length > 1) {
const remain = offset - lastOffsetOriginal;
mapping.push({ transformed: remain, original: remain });
mapping.push({ transformed: replacement.length, original: match.length });

lastOffsetOriginal = offset + match.length;
lastOffsetTransformed += remain + replacement.length;
}

return replacement;
});

const lenRestTransformed = transformedStr.length - lastOffsetTransformed;
const lenRestOriginal = str.length - lastOffsetOriginal;

if (lenRestOriginal !== lenRestTransformed) throw new Error('strings out of sync');
if (lenRestOriginal) {
mapping.push({
transformed: transformedStr.length - lastOffsetTransformed,
original: str.length - lastOffsetOriginal,
});
}

return { str: transformedStr, mapping };
};


// assumes that elements in positions are sorted
exports.backTransformPositions = function backTransformPositions(positions, mapping) {
const transformedPositions = [];
Expand Down Expand Up @@ -132,7 +170,8 @@ function transform(transformations, str) {

exports.SearchIndex = class SearchIndex {
constructor(str) {
this.transformations = [exports.transformSpaces, transformLowercase];
this.transformations =
[exports.transformSpaces, exports.transformDiacritics, transformLowercase];

const {transformedStr, mapping} = transform(this.transformations, str);
this.transformedStr = transformedStr;
Expand Down
51 changes: 51 additions & 0 deletions test/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ describe('findPositions()', () => {
});
});


describe('transformSpaces', () => {
it('should return input if no multiple spaces', () => {
srch.transformSpaces('this is a test')
Expand All @@ -47,6 +48,54 @@ describe('transformSpaces', () => {
});
});


// transformDiacritics: every case lists an input and the exact
// { str, mapping } object the function is expected to return for it.
describe('transformDiacritics', () => {
  const cases = [
    {
      title: 'should return input if no diacritics',
      input: 'this is a test',
      expected: {
        str: 'this is a test',
        mapping: [{ transformed: 14, original: 14 }],
      },
    },
    {
      title: 'should remove diacritics (mapping: original shorter than transformed)',
      input: 'Straße',
      expected: {
        str: 'Strasse',
        mapping: [
          { transformed: 4, original: 4 },
          { transformed: 2, original: 1 },
          { transformed: 1, original: 1 },
        ],
      },
    },
    {
      title: 'should remove diacritics of 2 words (mapping: original shorter than transformed)',
      input: 'Spaß mit Soße',
      expected: {
        str: 'Spass mit Sosse',
        mapping: [
          { transformed: 3, original: 3 },
          { transformed: 2, original: 1 },
          { transformed: 7, original: 7 },
          { transformed: 2, original: 1 },
          { transformed: 1, original: 1 },
        ],
      },
    },
    {
      title: 'should remove diacritics (mapping: original equal to transformed)',
      input: 'Iлtèrnåtïonɑlíƶatï߀ԉ',
      expected: {
        str: 'Internationalizati0n',
        mapping: [
          { transformed: 20, original: 20 },
        ],
      },
    },
  ];

  cases.forEach(({ title, input, expected }) => {
    it(title, () => {
      srch.transformDiacritics(input)
        .should.eql(expected);
    });
  });
});


describe('backTransformPositions()', () => {
it('should return the original positions', () => {
// original: 'hällo world'
Expand All @@ -64,6 +113,7 @@ describe('backTransformPositions()', () => {
});
});


describe('backTransformRange', () => {
const transformations = [
{original: 3, transformed: 3, textObject: 1},
Expand Down Expand Up @@ -175,6 +225,7 @@ describe('backTransformRange', () => {
});
});


describe('SearchIndex', () => {
it('should find ranges', () => {
const index = new srch.SearchIndex('this is a test');
Expand Down

0 comments on commit a716290

Please sign in to comment.