Skip to content

Commit

Permalink
Fix bug where the same batches are sent multiple times
Browse files Browse the repository at this point in the history
  • Loading branch information
bidoubiwa committed Aug 16, 2023
1 parent 4c54142 commit 6505d5b
Show file tree
Hide file tree
Showing 5 changed files with 23 additions and 10 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ data:
"meilisearch_index_uid": "google",
"stategy": "default", // docssearch, schema*, custom or default
"headless": true, // Open browser or not
"batch_size": 100, //null will send documents one by one
"batch_size": 1000, //null will send documents one by one
"primary_key": null,
"meilisearch_settings": {
"searchableAttributes": [
Expand Down
2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "@meilisearch/scrapix",
"version": "0.1.1",
"version": "0.1.3",
"description": "Automatic scraper and indexer to Meilisearch of any website.",
"main": "dist/src/index.js",
"dependencies": {
Expand Down
2 changes: 1 addition & 1 deletion src/package_version.ts
Original file line number Diff line number Diff line change
@@ -1 +1 @@
export const PACKAGE_VERSION = '0.1.1'
export const PACKAGE_VERSION = '0.1.3'
4 changes: 3 additions & 1 deletion src/scrapers/docssearch.ts
Original file line number Diff line number Diff line change
Expand Up @@ -218,7 +218,9 @@ export default class DocsearchScaper {
}
}
// Send remaining data
await this._send_data({ ...document })
if (document.content && document.content?.length > 0) {
await this._send_data({ ...document })
}
}

async _send_data(data: DocsSearchDocument) {
Expand Down
23 changes: 17 additions & 6 deletions src/sender.ts
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ export class Sender {
this.config = config
this.initial_index_uid = config.meilisearch_index_uid
this.index_uid = this.initial_index_uid
this.batch_size = config.batch_size || 100
this.batch_size = config.batch_size || 1000

//Create a Meilisearch client
this.client = initMeilisearchClient({
Expand Down Expand Up @@ -54,17 +54,18 @@ export class Sender {

//Add a json object to the queue
async add(data: DocumentType) {
console.log('Sender::add')
this.nb_documents_sent++

console.log('Sender::add')
if (this.config.primary_key && this.config.primary_key !== 'uid') {
delete data['uid']
}

if (this.batch_size) {
this.queue.push(data)
if (this.queue.length >= this.batch_size) {
await this.__batchSend()
this.__batchSend()
this.queue = []
}
} else {
await this.client.index(this.index_uid).addDocuments([data])
Expand All @@ -80,7 +81,7 @@ export class Sender {
}

async finish() {
await this.__batchSend()
await this.__batchSendSync()
const index = await this.client.getIndex(this.index_uid)
const stats = await index.getStats()
if (
Expand All @@ -99,12 +100,22 @@ export class Sender {
)
}

async __batchSend() {
__batchSend() {
console.log(`Sender::__batchSend - size: ${this.queue.length}`)
this.client
.index(this.index_uid)
.addDocuments(this.queue)
.catch((e) => {
console.log(e)
console.log('Error while sending data to MeiliSearch')
})
}

async __batchSendSync() {
console.log(`Sender::__batchSend - size: ${this.queue.length}`)
const task = await this.client
.index(this.index_uid)
.addDocuments(this.queue)
this.queue = []
await this.client.waitForTask(task.taskUid)
}

Expand Down

0 comments on commit 6505d5b

Please sign in to comment.