Skip to content

Commit

Permalink
cleanup of simplesort, added js comparisons, updated wiki md
Browse files Browse the repository at this point in the history
- cleaned up simplesort logic
- added simplified javascript comparisons $jgt, $jgte, $jlt, $jlte described in Query Examples wiki page
- updated tutorials folder with markdown versions of several modified wiki pages
- rebuilt minified
  • Loading branch information
obeliskos committed Feb 13, 2018
1 parent ead7793 commit bb011df
Show file tree
Hide file tree
Showing 4 changed files with 150 additions and 60 deletions.
4 changes: 2 additions & 2 deletions build/lokijs.min.js

Large diffs are not rendered by default.

122 changes: 77 additions & 45 deletions src/lokijs.js
Original file line number Diff line number Diff line change
Expand Up @@ -399,6 +399,7 @@
return aeqHelper(a, b);
},

// loki comparisons: return identical unindexed results as indexed comparisons
// $gt: greater-than comparison op; delegates the whole test to gtHelper(a, b, false)
// and returns its result unchanged.
// NOTE(review): the third argument presumably selects the strict (non-inclusive)
// form of the comparison - gtHelper is defined outside this view, confirm there.
$gt: function (a, b) {
return gtHelper(a, b, false);
},
Expand All @@ -415,6 +416,23 @@
return ltHelper(a, b, true);
},

// lightweight javascript comparisons
// $jgt: returns the result of the native javascript `a > b` expression directly.
// No loki helper is involved, so mixed-type operands follow javascript's own
// coercion rules for the `>` operator.
// NOTE(review): `a` is presumably the document's property value and `b` the
// value supplied in the query object - confirm against the caller.
$jgt: function (a, b) {
return a > b;
},

// $jgte: returns the result of the native javascript `a >= b` expression directly.
// No loki helper is involved, so mixed-type operands follow javascript's own
// coercion rules for the `>=` operator.
// NOTE(review): `a` is presumably the document's property value and `b` the
// value supplied in the query object - confirm against the caller.
$jgte: function (a, b) {
return a >= b;
},

// $jlt: returns the result of the native javascript `a < b` expression directly.
// No loki helper is involved, so mixed-type operands follow javascript's own
// coercion rules for the `<` operator.
// NOTE(review): `a` is presumably the document's property value and `b` the
// value supplied in the query object - confirm against the caller.
$jlt: function (a, b) {
return a < b;
},

// $jlte: returns the result of the native javascript `a <= b` expression directly.
// No loki helper is involved, so mixed-type operands follow javascript's own
// coercion rules for the `<=` operator.
// NOTE(review): `a` is presumably the document's property value and `b` the
// value supplied in the query object - confirm against the caller.
$jlte: function (a, b) {
return a <= b;
},

// ex : coll.find({'orderCount': {$between: [10, 50]}});
$between: function (a, vals) {
if (a === undefined || a === null) return false;
Expand Down Expand Up @@ -2973,60 +2991,28 @@
* var results = users.chain().simplesort('age').data();
*/
Resultset.prototype.simplesort = function (propname, options) {
var dc, frl, eff;
var eff,
dc = this.collection.data.length,
frl = this.filteredrows.length,
hasBinaryIndex = this.collection.binaryIndices.hasOwnProperty(propname);

if (typeof (options) === 'undefined') {
if (typeof (options) === 'undefined' || options === false) {
options = { desc: false };
}
if (options === true) {
options = { desc: true };
}
if (options === false) {
options = { desc: false };
}

// If already filtered, but we want to leverage binary index on sort.
// This will use custom array intersection algorithm.
if (!options.disableIndexIntersect && this.collection.binaryIndices.hasOwnProperty(propname) && this.filterInitialized) {

dc = this.collection.data.length;
frl = this.filteredrows.length;
eff = dc/frl;

// anything more than ratio of 10:1 (total documents/current results) should use old sort code path
// So we will only use array intersection if you have more than 10% of total docs in your current resultset.
if (eff <= 10 || options.forceIndexIntersect) {
var idx, len=this.filteredrows.length, fr=this.filteredrows;
var io = {};
// set up hashobject for simple 'inclusion test' with existing (filtered) results
for(idx=0; idx<len; idx++) {
io[fr[idx]] = true;
}
// grab full sorted binary index array
var pv = this.collection.binaryIndices[propname].values;

// filter by existing results
this.filteredrows = pv.filter(function(n) { return io[n]; });

if (options.desc) {
this.filteredrows.reverse();
}

// if nothing in filtered rows array...
if (frl === 0) {
// if the filter is initialized to be empty resultset, do nothing
if (this.filterInitialized) {
return this;
}
}

if (options.useJavascriptSorting) {
return this.sort(function(obj1, obj2) {
if (obj1[propname] === obj2[propname]) return 0;
if (obj1[propname] > obj2[propname]) return 1;
if (obj1[propname] < obj2[propname]) return -1;
});
}

// if this has no filters applied, just we need to populate filteredrows first
if (!this.filterInitialized && this.filteredrows.length === 0) {
// if we have a binary index and no other filters applied, we can use that instead of sorting (again)

// otherwise no filters applied implies all documents, so we need to populate filteredrows first

// if we have a binary index, we can just use that instead of sorting (again)
if (this.collection.binaryIndices.hasOwnProperty(propname)) {
// make sure index is up-to-date
this.collection.ensureIndex(propname);
Expand All @@ -3042,10 +3028,56 @@
}
// otherwise initialize array for sort below
else {
// build full document index (to be sorted subsequently)
this.filteredrows = this.collection.prepareFullDocIndex();
}
}
// otherwise we had results to begin with, see if we qualify for index intercept optimization
else {

// If already filtered, but we want to leverage binary index on sort.
// This will use custom array intersection algorithm.
if (!options.disableIndexIntersect && hasBinaryIndex) {

// calculate filter efficiency
eff = dc/frl;

// anything more than ratio of 10:1 (total documents/current results) should use old sort code path
// So we will only use array intersection if you have more than 10% of total docs in your current resultset.
if (eff <= 10 || options.forceIndexIntersect) {
var idx, fr=this.filteredrows;
var io = {};
// set up hashobject for simple 'inclusion test' with existing (filtered) results
for(idx=0; idx<frl; idx++) {
io[fr[idx]] = true;
}
// grab full sorted binary index array
var pv = this.collection.binaryIndices[propname].values;

// filter by existing results
this.filteredrows = pv.filter(function(n) { return io[n]; });

if (options.desc) {
this.filteredrows.reverse();
}

return this;
}
}
}

// at this point, we will not be able to leverage binary index so we will have to do an array sort

// if we have opted to use simplified javascript comparison function...
if (options.useJavascriptSorting) {
return this.sort(function(obj1, obj2) {
if (obj1[propname] === obj2[propname]) return 0;
if (obj1[propname] > obj2[propname]) return 1;
if (obj1[propname] < obj2[propname]) return -1;
});
}

// otherwise use loki sort which will return same results if column is indexed or not
var wrappedComparer =
(function (prop, desc, data) {
var val1, val2, arr;
Expand Down
60 changes: 47 additions & 13 deletions tutorials/Indexing and Query performance.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,14 @@
Loki.js has always been a fast, in-memory database solution. In fact, recent benchmarks indicate that its primary get() operation runs at about _1.4 million operations_ per second on a mid-range Core i5 running under node.js. The get() operation utilizes an auto generated '$loki' id column with its own auto generated binary index. If you wish to supply your own unique key, you can add a single unique index to the collection to be used along with the collection.by() method. This method is every bit as fast as using the built in $loki id. So out of the gate if you intend to do single object lookups you get this performance.

Example object lookup specifying your own unique index :
### Example lookup using autogenerated $loki column :
```javascript
var users = db.addCollection("users");
var resultObj = users.insert({username:"Heimdallr"});

// now that our object has been inserted, it will have a $loki property added onto it
var heimdallr = users.get(resultObj.$loki);
```
### Example object lookup specifying your own unique index :
```javascript
var users = db.addCollection("users", {
unique: ['username']
Expand All @@ -9,36 +17,62 @@ var users = db.addCollection("users", {
// after inserting records you might retrieve your record using coll.by()
var result = users.by("username", "Heimdallr");
```
Example lookup using autogenerated $loki column :
```javascript
var users = db.addCollection("users");
var resultObj = users.insert({username:"Heimdallr"});

// now that our object has been inserted, it will have a $loki property added onto it
var heimdallr = users.get(resultObj.$loki);
```
### 'Find' filtering
A more versatile way to query is to use collection.find() which accepts a mongo-style query object. If you do not index the column you are searching against, you can expect about 20k ops/sec under node.js (browser performance may vary but this serves as a good order of magnitude). For most purposes that is probably more performance than is needed, but you can now apply loki.js binary indexes on your object properties as well. Using the collection.ensureIndex(propertyName) method, you can create an index which can be used by various find() operations such as collection.find(). For our test benchmark, this increased performance to about _500k ops/sec_.

These binary indices can match multiple results or ranges and you might apply your index similar to in this example :
Binary indices can be used with range ops, returning multiple document results for the given property/range. If you have applied a binary index to a property, you can utilize that index by calling collection.find() with a query object containing that property. The find() ops which are able to utilize binary indices include $eq, $aeq, $lt, $lte, $gt, $gte, $between, $in.

> By default, if you have a binary index applied on a property and you insert a document containing a javascript Date object as a value for that property, loki will replace it with a serializable-safe epoch time format date (integer). This is to prevent the index from becoming corrupt if we were to save it as a Date and load it as a string (they should be sorted differently). If you do not intend to save your database (using entirely in memory), you may pass a 'serializableIndices:false' collection option during collection creation and we will not alter your dates.
### Binary index example :
```javascript
var coll = db.addCollection('users', {
indices: ['location']
});

// after inserting records you might use equality or range ops,
// such as this implicit $eq op :
// such as this implicit (strict) $eq op :
var results = coll.find({ location: 'Himinbjörg' });
```

'Where' filters (javascript filter functions) should be used sparingly if performance is of concern. It is unable to utilize indexes, so performance will be no better than an unindexed find, and depending on the complexity of your filter function even less so. Unindex queries and where filters always require a full array scan but they can be useful if thousands of ops/sec are sufficient or if used later in a query chain or dynamic view filter pipeline with less penalty.

### Loki Sorting and Ranges
> Native javascript provides == (abstract) equality, === (strict) equality, < (abstract) less than, > (abstract) greater than, etc. Javascript deals with a lot of mixed types, so you might want a numeric 4 to be (abstractly) equal to string '4'. If you wanted to test for 'less than' 4, javascript would by default coerce a string operand to a number for the comparison, so if you did not want strings to match you would have to test using 'typeof' or other type detection to manually filter out other types.
Loki would prefer to deal with pure clean data but has had to evolve to support various levels of 'dirty' data which are found in the wild. As such we have tried to adapt our concept of 'ranges' as it pertains to mixed types on properties so that they accommodate mixed types and provide similar find() results whether you are using a binary index or not.
- All values in loki are interpreted by loki as being 'less than', 'greater than', or 'equal' to any other value for the purposes of our find() op. This is different than javascript, so loki establishes a unified range ordering among two values.
- 4 is $aeq (abstractly equal to) '4', '3' is $lt 4, and 4 is $gte '3', so numbers and 'number-like' strings are ranged together as numbers in a loki find.
- 'Number-like' strings are kept in a different range (with the numbers) than 'non-number-like' strings, so 9999 will be $lt '111asdf'
- objects are all equal (unless you use dot notation to query an objects properties)
- Dates are sorted as numbers as their epoch time... those numbers are large so generally at the high end of number range
- $type op can be used to filter out results which do not match a specific javascript type
- $finite op can be used to filter out results which are either 'number-like' or 'non-number-like'

### 'Where' filters
'Where' filters (javascript filter functions) should be used sparingly if performance is of concern. They are unable to utilize indexes, so performance will be no better than an unindexed find, and depending on the complexity of your filter function even less so. Unindexed queries and where filters always require a full array scan but they can be useful if thousands of ops/sec are sufficient or if used later in a query chain or dynamic view filter pipeline with less penalty.

### Indexing in Query chains
The Resultset class introduced method chaining as an option for querying. You might use this method chaining to apply several find operations in succession or mix find(), where(), and sort() operations into a sequential chained pipe. For simplicity, an example of this might be (where users is a collection object) :

```javascript
users.chain().find(queryObj).where(queryFunc).simplesort('name').data();
```

Examining this statement, if queryObj (a mongo-style query object) were { 'age': { '$gt': 30 } }, then that age column would be best to apply an index on, and that find() chain operation should come first in the chain. In chained operations, only the first chained operation can utilize the indexes for filtering. If it filtered out a sufficient number of records, the impact of the (where) query function will be less. The overhead of maintaining the filtered result set reduces performance by about 20% over collection.find, but they enable much more versatility. In our benchmarks this is still about _400k ops/sec_.
Examining this statement, if queryObj (a mongo-style query object) were { 'age': { '$gt': 30 } }, then that age column would be best to apply an index on, and that find() chain operation should come first in the chain. In chained operations, only the first chained filter can utilize the indexes for filtering. If it filtered out a sufficient number of records, the impact of the (where) query function will be less. The overhead of maintaining the filtered result set reduces performance by about 20% over collection.find, but they enable much more versatility. In our benchmarks this is still about _400k ops/sec_.

### Indexes and sorting
When no filters are applied and a binary index exists on (for example) a 'name' property, the following can fully utilize the binary index :
```javascript
coll.chain().simplesort('name').data();
```
If filtering has occurred we will detect whether we can leverage the index within an 'index intersect' algorithm to speed up the sorting over a typical loki sort. This 'index intersect' algorithm will only be enabled if your resultset contains more than 10% of the total documents in the collection; otherwise a standard loki sort will be determined to be the faster method. The performance advantages of 'index intersect' are somewhat inversely proportional to your filter quality, so it leverages the binary index to help to reduce 'worst case' sorting penalties.

Loki sorting is used not just for sorting but also for building the binary indices. If you do not need its more unified sorting across mixed types, you might be able to shave additional milliseconds off your sort call by calling :
```javascript
coll.chain().simplesort('name', { useJavascriptSorting: true }).data();
```
In that case, if a binary index exists on that 'name' property, we will use the index intersect algorithm unless the resultset has 10% or less of the total document count, at which point we will fall back to javascript sorting on the property passed to simplesort. If you do not have a binary index applied on that property, we will always use javascript sorting when that option is passed.

## Dynamic View pipelines

Dynamic Views behave similarly to resultsets in that you want to utilize an index, your first filter must be applied using

Expand Down
24 changes: 24 additions & 0 deletions tutorials/Query Examples.md
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,30 @@ var results = coll.find({'age': {'$lt': 40}});
```javascript
var results = coll.find({'age': {'$lte': 40}});
```
>
> Note : the above $gt, $gte, $lt, and $lte ops use 'loki' sorting which provides a unified range across mixed types
> and which return the same results whether the property is indexed or not. This is needed for binary indexes and
> for guarantees of results equality between indexed and non-indexed comparisons.
>
> If you do not expect to utilize a binary index and you expect that simple javascript comparisons are acceptable,
> we provide the following ops which (due to their simplified comparisons) may provide more optimal execution speeds.
>
> **$jgt** - filter (using simplified javascript comparison) for docs with property greater than provided value
> ```javascript
> var results = coll.find({'age': {'$jgt': 40}});
> ```
> **$jgte** - filter (using simplified javascript comparison) for docs with property greater than or equal to provided value
> ```javascript
> var results = coll.find({'age': {'$jgte': 40}});
> ```
> **$jlt** - filter (using simplified javascript comparison) for docs with property less than provided value
> ```javascript
> var results = coll.find({'age': {'$jlt': 40}});
> ```
> **$jlte** - filter (using simplified javascript comparison) for docs with property less than or equal to provided value
> ```javascript
> var results = coll.find({'age': {'$jlte': 40}});
> ```
**$between** - filter for documents(s) with property between provided vals
```javascript
// match users with count value between 50 and 75
Expand Down

0 comments on commit bb011df

Please sign in to comment.