Skip to content

Commit

Permalink
Speed up indexing, use stopwords, strip xml tags
Browse files Browse the repository at this point in the history
  • Loading branch information
Rik Smith-Unna committed Jun 28, 2017
1 parent 2f80bfe commit 03e0215
Show file tree
Hide file tree
Showing 5 changed files with 14 additions and 8 deletions.
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -79,3 +79,7 @@ jspm_packages

# local development examples
examples

# stupid files
yarn.lock
package-lock.json
Expand Down
5 changes: 3 additions & 2 deletions index.js
Original file line number Diff line number Diff line change
Expand Up @@ -50,9 +50,10 @@ function Yuno (opts, cb) {
var indexOpts = _.defaults(opts, {
indexPath: this.indexPath,
batchsize: 100,
fieldedSearch: false,
nGramLength: 1,
separator: ' ',
stopwords: []
separator: /[|' .,\-|(\n)]+/,
stopwords: require('stopword').en
})

searchIndex(indexOpts, (err, si) => {
Expand Down
9 changes: 5 additions & 4 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -34,24 +34,25 @@
"JSONStream": "^1.1.1",
"batch-stream": "^0.1.3",
"commander": "^2.9.0",
"jsonpath-plus": "^0.15.0",
"jsonpath-plus": "^0.16.0",
"level": "^1.6.0",
"level-batch-stream": "^1.3.1",
"level-write-stream": "^1.0.0",
"lodash": "^4.16.4",
"mkdirp": "^0.5.1",
"multi-write-stream": "^2.0.1",
"natural": "blahah/natural",
"path-exists": "^2.1.0",
"path-exists": "^3.0.0",
"pumpify": "^1.3.5",
"search-index": "^0.13.0",
"stopword": "^0.1.6",
"through2": "^2.0.1"
},
"devDependencies": {
"reuters-21578-json": "0.0.8",
"end-of-stream": "^1.1.0",
"reuters-21578-json": "0.0.8",
"rimraf": "^2.5.2",
"standard": "^6.0.8",
"standard": "^10.0.2",
"stream-array": "^1.1.2",
"tape": "^4.5.1",
"temporary": "0.0.8"
Expand Down
2 changes: 1 addition & 1 deletion preprocess/vector.js
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ function Vector (terms) {
}

/**
 * Remove XML/HTML-style tags and every non-word character from a term.
 *
 * Tags are removed BEFORE the non-word pass: the tag pattern depends on the
 * `<` and `>` delimiters, which the `\W+` pass would otherwise delete first,
 * leaving the tag names fused into the term (e.g. `<b>hello</b>` would
 * become `bhellob` rather than `hello`).
 *
 * @param {string} term - raw term, possibly containing markup or punctuation
 * @returns {string} the term reduced to word characters only ([A-Za-z0-9_])
 */
function stripPunctuation (term) {
  return term
    .replace(/<[^>]+>/g, '') // drop whole tags while their delimiters still exist
    .replace(/\W+/g, '') // then drop any remaining punctuation/whitespace
}

function stripTag (pair) {
Expand Down
2 changes: 1 addition & 1 deletion test/search.js
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ test('streaming search', function (t) {
var done = function (err) {
t.error(err, 'search completes without error')

t.equals(results.length, 20, 'correct number of hits')
t.equals(results.length, 55, 'correct number of hits')
rimraf(dbpath, {}, t.end)
}

Expand Down

0 comments on commit 03e0215

Please sign in to comment.