11import { mongo , MONGO_OPERATION_TIMEOUT_MS } from '@powersync/lib-service-mongodb' ;
22import { logger , ReplicationAssertionError , ServiceAssertionError } from '@powersync/lib-services-framework' ;
3- import { addChecksums , InternalOpId , isPartialChecksum , storage , utils } from '@powersync/service-core' ;
3+ import {
4+ addChecksums ,
5+ InternalOpId ,
6+ isPartialChecksum ,
7+ PopulateChecksumCacheResults ,
8+ storage ,
9+ utils
10+ } from '@powersync/service-core' ;
411
512import { PowerSyncMongo } from './db.js' ;
613import { BucketDataDocument , BucketDataKey , BucketStateDocument } from './models.js' ;
@@ -10,6 +17,7 @@ import { cacheKey } from './OperationBatch.js';
1017interface CurrentBucketState {
1118 /** Bucket name */
1219 bucket : string ;
20+
1321 /**
1422 * Rows seen in the bucket, with the last op_id of each.
1523 */
@@ -96,75 +104,56 @@ export class MongoCompactor {
96104 // We can make this more efficient later on by iterating
97105 // through the buckets in a single query.
98106 // That makes batching more tricky, so we leave for later.
99- await this . compactInternal ( bucket ) ;
107+ await this . compactSingleBucket ( bucket ) ;
100108 }
101109 } else {
102110 await this . compactDirtyBuckets ( ) ;
103111 }
104112 }
105113
106114 private async compactDirtyBuckets ( ) {
107- for await ( let buckets of this . iterateDirtyBuckets ( ) ) {
115+ while ( ! this . signal ?. aborted ) {
116+ // Process all buckets with 1 or more changes since last time
117+ const buckets = await this . dirtyBucketBatch ( { minBucketChanges : 1 } ) ;
118+ if ( buckets . length == 0 ) {
119+ // All done
120+ break ;
121+ }
108122 for ( let bucket of buckets ) {
109- await this . compactInternal ( bucket ) ;
123+ await this . compactSingleBucket ( bucket ) ;
110124 }
111125 }
112126 }
113127
114- private async compactInternal ( bucket : string | undefined ) {
128+ private async compactSingleBucket ( bucket : string ) {
115129 const idLimitBytes = this . idLimitBytes ;
116130
117- let currentState : CurrentBucketState | null = null ;
118-
119- let bucketLower : string | mongo . MinKey ;
120- let bucketUpper : string | mongo . MaxKey ;
131+ let currentState : CurrentBucketState = {
132+ bucket,
133+ seen : new Map ( ) ,
134+ trackingSize : 0 ,
135+ lastNotPut : null ,
136+ opsSincePut : 0 ,
121137
122- if ( bucket == null ) {
123- bucketLower = new mongo . MinKey ( ) ;
124- bucketUpper = new mongo . MaxKey ( ) ;
125- } else if ( bucket . includes ( '[' ) ) {
126- // Exact bucket name
127- bucketLower = bucket ;
128- bucketUpper = bucket ;
129- } else {
130- // Bucket definition name
131- bucketLower = `${ bucket } [` ;
132- bucketUpper = `${ bucket } [\uFFFF` ;
133- }
138+ checksum : 0 ,
139+ opCount : 0 ,
140+ opBytes : 0
141+ } ;
134142
135143 // Constant lower bound
136144 const lowerBound : BucketDataKey = {
137145 g : this . group_id ,
138- b : bucketLower as string ,
146+ b : bucket ,
139147 o : new mongo . MinKey ( ) as any
140148 } ;
141149
142150 // Upper bound is adjusted for each batch
143151 let upperBound : BucketDataKey = {
144152 g : this . group_id ,
145- b : bucketUpper as string ,
153+ b : bucket ,
146154 o : new mongo . MaxKey ( ) as any
147155 } ;
148156
149- const doneWithBucket = async ( ) => {
150- if ( currentState == null ) {
151- return ;
152- }
153- // Free memory before clearing bucket
154- currentState . seen . clear ( ) ;
155- if ( currentState . lastNotPut != null && currentState . opsSincePut >= 1 ) {
156- logger . info (
157- `Inserting CLEAR at ${ this . group_id } :${ currentState . bucket } :${ currentState . lastNotPut } to remove ${ currentState . opsSincePut } operations`
158- ) ;
159- // Need flush() before clear()
160- await this . flush ( ) ;
161- await this . clearBucket ( currentState ) ;
162- }
163-
164- // Do this _after_ clearBucket so that we have accurate counts.
165- this . updateBucketChecksums ( currentState ) ;
166- } ;
167-
168157 while ( ! this . signal ?. aborted ) {
169158 // Query one batch at a time, to avoid cursor timeouts
170159 const cursor = this . db . bucket_data . aggregate < BucketDataDocument & { size : number | bigint } > (
@@ -211,22 +200,6 @@ export class MongoCompactor {
211200 upperBound = batch [ batch . length - 1 ] . _id ;
212201
213202 for ( let doc of batch ) {
214- if ( currentState == null || doc . _id . b != currentState . bucket ) {
215- await doneWithBucket ( ) ;
216-
217- currentState = {
218- bucket : doc . _id . b ,
219- seen : new Map ( ) ,
220- trackingSize : 0 ,
221- lastNotPut : null ,
222- opsSincePut : 0 ,
223-
224- checksum : 0 ,
225- opCount : 0 ,
226- opBytes : 0
227- } ;
228- }
229-
230203 if ( doc . _id . o > this . maxOpId ) {
231204 continue ;
232205 }
@@ -297,12 +270,22 @@ export class MongoCompactor {
297270 }
298271 }
299272
300- if ( currentState != null ) {
301- logger . info ( `Processed batch of length ${ batch . length } current bucket: ${ currentState . bucket } ` ) ;
302- }
273+ logger . info ( `Processed batch of length ${ batch . length } current bucket: ${ bucket } ` ) ;
303274 }
304275
305- await doneWithBucket ( ) ;
276+ // Free memory before clearing bucket
277+ currentState . seen . clear ( ) ;
278+ if ( currentState . lastNotPut != null && currentState . opsSincePut >= 1 ) {
279+ logger . info (
280+ `Inserting CLEAR at ${ this . group_id } :${ bucket } :${ currentState . lastNotPut } to remove ${ currentState . opsSincePut } operations`
281+ ) ;
282+ // Need flush() before clear()
283+ await this . flush ( ) ;
284+ await this . clearBucket ( currentState ) ;
285+ }
286+
287+ // Do this _after_ clearBucket so that we have accurate counts.
288+ this . updateBucketChecksums ( currentState ) ;
306289
307290 // Need another flush after updateBucketChecksums()
308291 await this . flush ( ) ;
@@ -490,61 +473,55 @@ export class MongoCompactor {
490473 /**
491474 * Subset of compact, only populating checksums where relevant.
492475 */
493- async populateChecksums ( ) {
494- for await ( let buckets of this . iterateDirtyBuckets ( ) ) {
476+ async populateChecksums ( options : { minBucketChanges : number } ) : Promise < PopulateChecksumCacheResults > {
477+ let count = 0 ;
478+ while ( ! this . signal ?. aborted ) {
479+ const buckets = await this . dirtyBucketBatch ( options ) ;
480+ if ( buckets . length == 0 ) {
481+ // All done
482+ break ;
483+ }
495484 const start = Date . now ( ) ;
496485 logger . info ( `Calculating checksums for batch of ${ buckets . length } buckets, starting at ${ buckets [ 0 ] } ` ) ;
497486
498487 await this . updateChecksumsBatch ( buckets ) ;
499488 logger . info ( `Updated checksums for batch of ${ buckets . length } buckets in ${ Date . now ( ) - start } ms` ) ;
489+ count += buckets . length ;
500490 }
491+ return { buckets : count } ;
501492 }
502493
503- private async * iterateDirtyBuckets ( ) : AsyncGenerator < string [ ] > {
504- // This is updated after each batch
505- let lowerBound : BucketStateDocument [ '_id' ] = {
506- g : this . group_id ,
507- b : new mongo . MinKey ( ) as any
508- } ;
509- // This is static
510- const upperBound : BucketStateDocument [ '_id' ] = {
511- g : this . group_id ,
512- b : new mongo . MaxKey ( ) as any
513- } ;
514- while ( ! this . signal ?. aborted ) {
515- // By filtering buckets, we effectively make this "resumeable".
516- const filter : mongo . Filter < BucketStateDocument > = {
517- _id : {
518- $gt : lowerBound ,
519- $lt : upperBound
494+ /**
495+ * Returns a batch of dirty buckets - buckets with most changes first.
496+ *
497+ * This cannot be used to iterate on its own - the client is expected to process these buckets and
498+ * set estimate_since_compact.count: 0 when done, before fetching the next batch.
499+ */
500+ private async dirtyBucketBatch ( options : { minBucketChanges : number } ) : Promise < string [ ] > {
501+ if ( options . minBucketChanges <= 0 ) {
502+ throw new ReplicationAssertionError ( 'minBucketChanges must be >= 1' ) ;
503+ }
504+ // We make use of an index on {_id.g: 1, 'estimate_since_compact.count': -1}
505+ const dirtyBuckets = await this . db . bucket_state
506+ . find (
507+ {
508+ '_id.g' : this . group_id ,
509+ 'estimate_since_compact.count' : { $gte : options . minBucketChanges }
520510 } ,
521- // Partial index exists on this
522- 'estimate_since_compact.count' : { $gt : 0 }
523- } ;
524-
525- const dirtyBuckets = await this . db . bucket_state
526- . find ( filter , {
511+ {
527512 projection : {
528513 _id : 1
529514 } ,
530515 sort : {
531- _id : 1
516+ 'estimate_since_compact.count' : - 1
532517 } ,
533518 limit : 5_000 ,
534- maxTimeMS : MONGO_OPERATION_TIMEOUT_MS ,
535- // Make sure we use the partial index
536- hint : 'dirty_buckets'
537- } )
538- . toArray ( ) ;
539-
540- if ( dirtyBuckets . length == 0 ) {
541- break ;
542- }
543-
544- yield dirtyBuckets . map ( ( bucket ) => bucket . _id . b ) ;
519+ maxTimeMS : MONGO_OPERATION_TIMEOUT_MS
520+ }
521+ )
522+ . toArray ( ) ;
545523
546- lowerBound = dirtyBuckets [ dirtyBuckets . length - 1 ] . _id ;
547- }
524+ return dirtyBuckets . map ( ( bucket ) => bucket . _id . b ) ;
548525 }
549526
550527 private async updateChecksumsBatch ( buckets : string [ ] ) {
0 commit comments