diff --git a/core/commands/dag/dag.go b/core/commands/dag/dag.go index 6827e46fab1..d8034d67104 100644 --- a/core/commands/dag/dag.go +++ b/core/commands/dag/dag.go @@ -377,7 +377,10 @@ var DagStatCmd = &cmds.Command{ 'ipfs dag stat' fetches a DAG and returns various statistics about it. Statistics include size and number of blocks. -Note: This command skips duplicate blocks in reporting both size and the number of blocks +Note: Duplicate blocks are identified by content hash (multihash) to reflect +actual disk usage. Identical data referenced via different CIDs is counted +once. 'dag export' uses CID-based keying and may include the same data +multiple times if referenced by different CIDs. `, }, Arguments: []cmds.Argument{ diff --git a/core/commands/dag/stat.go b/core/commands/dag/stat.go index bb9be7e0d90..72641ba07d5 100644 --- a/core/commands/dag/stat.go +++ b/core/commands/dag/stat.go @@ -7,8 +7,9 @@ import ( mdag "github.com/ipfs/boxo/ipld/merkledag" "github.com/ipfs/boxo/ipld/merkledag/traverse" - cid "github.com/ipfs/go-cid" cmds "github.com/ipfs/go-ipfs-cmds" + mh "github.com/multiformats/go-multihash" + "github.com/ipfs/kubo/core/commands/cmdenv" "github.com/ipfs/kubo/core/commands/cmdutils" "github.com/ipfs/kubo/core/commands/e" @@ -26,7 +27,10 @@ func dagStat(req *cmds.Request, res cmds.ResponseEmitter, env cmds.Environment) } nodeGetter := mdag.NewSession(req.Context, api.Dag()) - cidSet := cid.NewSet() + // Use multihash set for deduplication to reflect actual storage. + // Since Kubo v0.12.0, blocks are stored by multihash, so identical + // data with different CIDs (e.g., CIDv0 vs CIDv1) is stored once. 
+ mhSet := mh.NewSet() dagStatSummary := &DagStatSummary{DagStatsArray: []*DagStat{}} for _, a := range req.Arguments { p, err := cmdutils.PathOrCidPath(a) @@ -54,11 +58,11 @@ func dagStat(req *cmds.Request, res cmds.ResponseEmitter, env cmds.Environment) currentNodeSize := uint64(len(current.Node.RawData())) dagstats.Size += currentNodeSize dagstats.NumBlocks++ - if !cidSet.Has(current.Node.Cid()) { + // Visit returns true if this multihash was not seen before + if mhSet.Visit(current.Node.Cid().Hash()) { dagStatSummary.incrementTotalSize(currentNodeSize) } dagStatSummary.incrementRedundantSize(currentNodeSize) - cidSet.Add(current.Node.Cid()) if progressive { if err := res.Emit(dagStatSummary); err != nil { return err @@ -74,7 +78,7 @@ func dagStat(req *cmds.Request, res cmds.ResponseEmitter, env cmds.Environment) } } - dagStatSummary.UniqueBlocks = cidSet.Len() + dagStatSummary.UniqueBlocks = mhSet.Len() dagStatSummary.calculateSummary() if err := res.Emit(dagStatSummary); err != nil { diff --git a/docs/changelogs/v0.40.md b/docs/changelogs/v0.40.md index 29780937f4b..4eb318b9469 100644 --- a/docs/changelogs/v0.40.md +++ b/docs/changelogs/v0.40.md @@ -11,7 +11,8 @@ This release was brought to you by the [Shipyard](https://ipshipyard.com/) team. - [Overview](#overview) - [🔦 Highlights](#-highlights) - [Routing V1 HTTP API now exposed by default](#routing-v1-http-api-now-exposed-by-default) - - [Track total size when adding pins](#track-total-size-when-adding-pins] + - [Track total size when adding pins](#track-total-size-when-adding-pins) + - [Fixed `ipfs dag stat` block counting](#fixed-ipfs-dag-stat-block-counting) - [📝 Changelog](#-changelog) - [👨‍👩‍👧‍👦 Contributors](#-contributors) @@ -32,6 +33,10 @@ Example output: Fetched/Processed 336 nodes (83 MB) ``` +#### Fixed `ipfs dag stat` block counting + +Since Kubo v0.12.0, blocks are stored by multihash, so the same data is stored only once regardless of which CID references it. 
The `dag stat` command now reflects actual storage by deduplicating blocks by content hash (e.g., data referenced via both CIDv0 and CIDv1 is counted once). See `ipfs dag stat --help` for more details. + ### 📝 Changelog ### 👨‍👩‍👧‍👦 Contributors diff --git a/test/cli/dag_test.go b/test/cli/dag_test.go index f6758a71037..ff05a401bbb 100644 --- a/test/cli/dag_test.go +++ b/test/cli/dag_test.go @@ -104,6 +104,27 @@ func TestDag(t *testing.T) { stat := node.RunIPFS("dag", "stat", "--progress=false", node1Cid, node2Cid) assert.Equal(t, content, stat.Stdout.Bytes()) }) + + t.Run("dag stat deduplicates by multihash", func(t *testing.T) { + t.Parallel() + node := harness.NewT(t).NewNode().Init().StartDaemon() + + // Add content and get CIDv0 with dag-pb (not raw leaves) + cidV0 := node.IPFSAddStr("hello world", "--cid-version=0", "--raw-leaves=false") + + // Convert to CIDv1 (same multihash, different CID) + cidV1 := node.IPFS("cid", "format", "-v", "1", "-b", "base32", cidV0).Stdout.Trimmed() + + // Run dag stat with both CIDs - should deduplicate by multihash + stat := node.RunIPFS("dag", "stat", "--progress=false", "--enc=json", cidV0, cidV1) + var data Data + err := json.Unmarshal(stat.Stdout.Bytes(), &data) + require.NoError(t, err) + + // Same block referenced via CIDv0 and CIDv1 should be counted once + assert.Equal(t, 1, data.UniqueBlocks, "same data via different CIDs should be 1 unique block") + assert.Equal(t, 2.0, data.Ratio, "ratio should be 2.0 (2 refs to 1 block)") + }) } func TestDagImportFastProvide(t *testing.T) {