Skip to content

Commit

Permalink
file map stats, roaring bit map
Browse files Browse the repository at this point in the history
  • Loading branch information
kmulvey committed Sep 14, 2022
1 parent 0287ebc commit e520385
Show file tree
Hide file tree
Showing 3 changed files with 43 additions and 21 deletions.
17 changes: 13 additions & 4 deletions internal/app/imagedup/files.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,15 @@ package imagedup

import (
"context"
"math"

"github.com/kmulvey/imagedup/pkg/imagedup/types"
)

func (id *ImageDup) streamFiles(ctx context.Context, files []string) {
var dedup = make(map[string]struct{})
var numImages = float64(len(files))
id.stats.TotalComparisons.Set((math.Pow(numImages, 2) - numImages) / 2)

for i, one := range files {
for j, two := range files {
if i != j {
Expand All @@ -17,10 +20,9 @@ func (id *ImageDup) streamFiles(ctx context.Context, files []string) {
close(id.images)
return
default:
if _, found := dedup[one+two]; !found {
dedup[one+two] = struct{}{}
dedup[two+one] = struct{}{}
if found := id.Bitmap.Contains(compress(i, j)); !found {
id.images <- types.Pair{One: one, Two: two, I: i, J: j}
id.Bitmap.Add(compress(j, i))
id.stats.PairTotal.Inc()
}
}
Expand All @@ -29,3 +31,10 @@ func (id *ImageDup) streamFiles(ctx context.Context, files []string) {
}
close(id.images)
}

// compress packs two non-negative indexes into a single uint64 key:
// a occupies the low 32 bits and b occupies the high 32 bits, so
// compress(a, b) and compress(b, a) are distinct keys whenever a != b.
//
// Each argument is truncated to 32 bits before packing, so the largest
// index that can be represented without loss is 4294967295
// (math.MaxUint32). Truncating a explicitly keeps a negative or over-wide
// value from spilling into the high half and corrupting the bits that
// belong to b.
func compress(a, b int) uint64 {
	var low = uint64(uint32(a))
	var high = uint64(uint32(b)) << 32
	return low | high
}
5 changes: 4 additions & 1 deletion internal/app/imagedup/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ package imagedup
import (
"context"

"github.com/RoaringBitmap/roaring/roaring64"
"github.com/kmulvey/imagedup/internal/app/imagedup/hash"
"github.com/kmulvey/imagedup/pkg/imagedup/types"
)
Expand All @@ -11,6 +12,7 @@ type ImageDup struct {
*stats
*hash.Cache
*hash.Differ
*roaring64.Bitmap
images chan types.Pair
}

Expand All @@ -20,6 +22,7 @@ func NewImageDup(promNamespace, hashCacheFile string, numWorkers, distanceThresh

id.images = make(chan types.Pair)
id.stats = newStats(promNamespace)
id.Bitmap = roaring64.New()

id.Cache, err = hash.NewCache(hashCacheFile, promNamespace)
if err != nil {
Expand All @@ -28,7 +31,7 @@ func NewImageDup(promNamespace, hashCacheFile string, numWorkers, distanceThresh

id.Differ = hash.NewDiffer(numWorkers, distanceThreshold, id.images, id.Cache, promNamespace)

go id.stats.publishStats(id.Cache)
go id.stats.publishStats(id.Cache, id.Bitmap)

return id, nil
}
Expand Down
42 changes: 26 additions & 16 deletions internal/app/imagedup/stats.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import (
"runtime"
"time"

"github.com/RoaringBitmap/roaring/roaring64"
"github.com/kmulvey/imagedup/internal/app/imagedup/hash"
"github.com/prometheus/client_golang/prometheus"
)
Expand All @@ -12,72 +13,81 @@ type stats struct {
PairTotal prometheus.Counter
GCTime prometheus.Gauge
TotalComparisons prometheus.Gauge
ImageCacheSize prometheus.Gauge
ImageCacheBytes prometheus.Gauge
ImageCacheNumImages prometheus.Gauge
PairCacheSize prometheus.Gauge
FileMapBytes prometheus.Gauge
PromNamespace string
}

func newStats(promNamespace string) *stats {
var s = new(stats)
s.PromNamespace = promNamespace

s.PairTotal = prometheus.NewCounter(
prometheus.CounterOpts{
Namespace: promNamespace,
Name: "pair_total",
Help: "How many pairs we read.",
},
)
s.GCTime = prometheus.NewGauge(
prometheus.GaugeOpts{
Namespace: promNamespace,
Name: "gc_time_nano",
Help: "how long a gc sweep took",
},
)
s.TotalComparisons = prometheus.NewGauge(
prometheus.GaugeOpts{
Namespace: promNamespace,
Name: "total_comparisons",
Help: "how many comparisons need to be done",
},
)
s.PairTotal = prometheus.NewCounter(
prometheus.CounterOpts{
Namespace: promNamespace,
Name: "pair_total",
Help: "How many pairs we read.",
},
)
s.ImageCacheSize = prometheus.NewGauge(
s.ImageCacheBytes = prometheus.NewGauge(
prometheus.GaugeOpts{
Namespace: promNamespace,
Name: "image_cache_size_bytes",
Help: "disk size of the cache",
},
)
s.ImageCacheNumImages = prometheus.NewGauge(
prometheus.GaugeOpts{
Namespace: promNamespace,
Name: "image_cache_num_images",
Help: "how many images are in the cache",
},
)
s.PairCacheSize = prometheus.NewGauge(
s.FileMapBytes = prometheus.NewGauge(
prometheus.GaugeOpts{
Namespace: promNamespace,
Name: "pair_cache_size",
Name: "file_map_bytes",
Help: "size of the file dedup map",
},
)
prometheus.MustRegister(s.PairTotal)
prometheus.MustRegister(s.GCTime)
prometheus.MustRegister(s.TotalComparisons)
prometheus.MustRegister(s.ImageCacheSize)
prometheus.MustRegister(s.ImageCacheBytes)
prometheus.MustRegister(s.ImageCacheNumImages)
prometheus.MustRegister(s.PairCacheSize)
prometheus.MustRegister(s.FileMapBytes)

return s
}

// publishStats publishes the total Go GC pause time, the image cache stats, and the serialized size of the dedup bitmap to Prometheus every 10 seconds.
func (s *stats) publishStats(imageCache *hash.Cache) {
func (s *stats) publishStats(imageCache *hash.Cache, fileMap *roaring64.Bitmap) {
for {
var stats runtime.MemStats
runtime.ReadMemStats(&stats)

s.GCTime.Set(float64(stats.PauseTotalNs))

s.ImageCacheNumImages.Set(float64(imageCache.NumImages()))
var numImages, cacheBytes = imageCache.Stats()
s.ImageCacheNumImages.Set(float64(numImages))
s.ImageCacheBytes.Set(float64(cacheBytes))
var b, _ = fileMap.ToBytes()
s.FileMapBytes.Set(float64(len(b)))

time.Sleep(10 * time.Second)
}
Expand Down

0 comments on commit e520385

Please sign in to comment.