Skip to content

Commit 93c1269

Browse files
authored
Merge pull request #28 from scanoss/chore/mdaloia/enhance-import-script
chore: enhance import script, add qdrant's docker compose to release artifacts
2 parents f4a6dbb + 3e6d9d0 commit 93c1269

File tree

3 files changed

+76
-4
lines changed

3 files changed

+76
-4
lines changed

.github/workflows/release.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,3 +45,4 @@ jobs:
4545
target/scanoss-folder-hashing-import-linux-arm64
4646
scanoss-folder-hashing-api_linux-amd64_${{ github.ref_name }}-1.tgz
4747
scanoss-folder-hashing-api_linux-arm64_${{ github.ref_name }}-1.tgz
48+
docker-compose.qdrant.yml

cmd/import/main.go

Lines changed: 69 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -238,39 +238,58 @@ func main() {
238238
for _, collectionName := range collections {
239239
showCollectionStats(ctx, client, collectionName)
240240
}
241+
242+
// Re-enable production HNSW indexing for all collections
243+
log.Println("\n=== Enabling Production HNSW Indexing ===")
244+
log.Println("Re-enabling HNSW indexing (M=48) for production queries...")
245+
for _, collectionName := range collections {
246+
if err := enableProductionIndexing(ctx, client, collectionName); err != nil {
247+
log.Printf("WARNING: Failed to enable production indexing for %s: %v", collectionName, err)
248+
}
249+
}
250+
log.Println("\n✓ Production indexing enabled for all collections.")
251+
log.Println("The Qdrant optimizer will build HNSW indexes in the background.")
252+
log.Println("Monitor collection stats to track indexing progress.")
241253
}
242254

243255
// Create a language-based collection with named vectors (dirs, names, contents).
244256
func createCollection(ctx context.Context, client *qdrant.Client, collectionName string) {
245257
log.Printf("Creating language-based collection with named vectors: %s", collectionName)
246258

247259
// Create named vectors configuration for dirs, names, and contents
260+
// Optimized for bulk import: vectors on disk, HNSW disabled (M=0)
248261
namedVectors := map[string]*qdrant.VectorParams{
249262
"dirs": {
250263
Size: VectorDim,
251264
Distance: qdrant.Distance_Manhattan,
265+
OnDisk: qdrant.PtrOf(true), // Store vectors on disk to reduce RAM during import
252266
HnswConfig: &qdrant.HnswConfigDiff{
253-
M: qdrant.PtrOf(uint64(48)),
267+
M: qdrant.PtrOf(uint64(0)), // Disable HNSW during import, re-enabled after
254268
EfConstruct: qdrant.PtrOf(uint64(500)),
255269
FullScanThreshold: qdrant.PtrOf(uint64(100000)),
270+
OnDisk: qdrant.PtrOf(true), // Store HNSW index on disk
256271
},
257272
},
258273
"names": {
259274
Size: VectorDim,
260275
Distance: qdrant.Distance_Manhattan,
276+
OnDisk: qdrant.PtrOf(true), // Store vectors on disk to reduce RAM during import
261277
HnswConfig: &qdrant.HnswConfigDiff{
262-
M: qdrant.PtrOf(uint64(48)),
278+
M: qdrant.PtrOf(uint64(0)), // Disable HNSW during import, re-enabled after
263279
EfConstruct: qdrant.PtrOf(uint64(500)),
264280
FullScanThreshold: qdrant.PtrOf(uint64(100000)),
281+
OnDisk: qdrant.PtrOf(true), // Store HNSW index on disk
265282
},
266283
},
267284
"contents": {
268285
Size: VectorDim,
269286
Distance: qdrant.Distance_Manhattan,
287+
OnDisk: qdrant.PtrOf(true), // Store vectors on disk to reduce RAM during import
270288
HnswConfig: &qdrant.HnswConfigDiff{
271-
M: qdrant.PtrOf(uint64(48)),
289+
M: qdrant.PtrOf(uint64(0)), // Disable HNSW during import, re-enabled after
272290
EfConstruct: qdrant.PtrOf(uint64(500)),
273291
FullScanThreshold: qdrant.PtrOf(uint64(100000)),
292+
OnDisk: qdrant.PtrOf(true), // Store HNSW index on disk
274293
},
275294
},
276295
}
@@ -288,7 +307,7 @@ func createCollection(ctx context.Context, client *qdrant.Client, collectionName
288307
QuantizationConfig: &qdrant.QuantizationConfig{
289308
Quantization: &qdrant.QuantizationConfig_Binary{
290309
Binary: &qdrant.BinaryQuantization{
291-
AlwaysRam: qdrant.PtrOf(true), // Keep quantized vectors in RAM
310+
AlwaysRam: qdrant.PtrOf(false), // Allow quantized vectors on disk to reduce RAM
292311
},
293312
},
294313
},
@@ -329,6 +348,52 @@ func createCollection(ctx context.Context, client *qdrant.Client, collectionName
329348
}
330349
}
331350

351+
// enableProductionIndexing re-enables HNSW indexing after bulk import is complete.
352+
// This should be called after all data has been imported to optimize for production queries.
353+
func enableProductionIndexing(ctx context.Context, client *qdrant.Client, collectionName string) error {
354+
log.Printf("Enabling production HNSW indexing for collection: %s", collectionName)
355+
356+
// Build named vectors config map
357+
namedVectorsConfig := make(map[string]*qdrant.VectorParamsDiff)
358+
for _, vectorName := range []string{"dirs", "names", "contents"} {
359+
namedVectorsConfig[vectorName] = &qdrant.VectorParamsDiff{
360+
HnswConfig: &qdrant.HnswConfigDiff{
361+
M: qdrant.PtrOf(uint64(48)),
362+
OnDisk: qdrant.PtrOf(false),
363+
},
364+
OnDisk: qdrant.PtrOf(true), // Keep vectors on disk, only HNSW in RAM
365+
}
366+
}
367+
368+
// Update all named vectors and collection settings in a single call
369+
err := client.UpdateCollection(ctx, &qdrant.UpdateCollection{
370+
CollectionName: collectionName,
371+
VectorsConfig: &qdrant.VectorsConfigDiff{
372+
Config: &qdrant.VectorsConfigDiff_ParamsMap{
373+
ParamsMap: &qdrant.VectorParamsDiffMap{
374+
Map: namedVectorsConfig,
375+
},
376+
},
377+
},
378+
OptimizersConfig: &qdrant.OptimizersConfigDiff{
379+
IndexingThreshold: qdrant.PtrOf(uint64(0)),
380+
},
381+
QuantizationConfig: &qdrant.QuantizationConfigDiff{
382+
Quantization: &qdrant.QuantizationConfigDiff_Binary{
383+
Binary: &qdrant.BinaryQuantization{
384+
AlwaysRam: qdrant.PtrOf(true),
385+
},
386+
},
387+
},
388+
})
389+
if err != nil {
390+
return fmt.Errorf("failed to update HNSW config: %w", err)
391+
}
392+
393+
log.Printf("✓ HNSW indexing enabled for %s. Optimizer will build indexes in background.", collectionName)
394+
return nil
395+
}
396+
332397
// Import data from a CSV file to separate collections.
333398
func importCSVFileWithProgress(ctx context.Context, client *qdrant.Client, filePath string, batchSize int, progress *progresstracker.ProgressTracker) (int, error) {
334399
file, err := os.Open(filePath)

docker-compose.qdrant.yml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,12 @@ services:
77
- 6334:6334 # gRPC API port (used by our Go client)
88
volumes:
99
- ./qdrant_data:/qdrant/storage
10+
environment:
11+
# Optimize storage performance for large-scale imports
12+
- QDRANT__STORAGE__OPTIMIZERS__OVERWRITE__MAX_SEGMENT_SIZE=500000
13+
- QDRANT__STORAGE__PERFORMANCE__MAX_OPTIMIZATION_THREADS=4
14+
# Set the write-ahead log (WAL) segment capacity, in MB, used during bulk imports
15+
- QDRANT__STORAGE__WAL__WAL_CAPACITY_MB=32
1016
expose:
1117
- 6333
1218
- 6334

0 commit comments

Comments
 (0)