From 9253e50dd4f609ae67e44ae89827542af752e726 Mon Sep 17 00:00:00 2001 From: Brian McGee Date: Tue, 9 Jul 2024 20:08:50 +0100 Subject: [PATCH] feat: only emit changed files with git walker If the modified time has not changed when compared with the git index we do not emit the file for processing. This allows users to introduce treefmt to a repository without suffering an initial large formatting commit. Instead, files can be formatted incrementally as they are changed. Closes #311 Signed-off-by: Brian McGee --- cache/cache.go | 8 ++--- cli/cli.go | 17 ++++----- cli/format.go | 36 +++++++++---------- format/formatter.go | 4 +-- format/task.go | 6 ++-- {walk => walker}/filesystem.go | 2 +- {walk => walker}/filesystem_test.go | 2 +- {walk => walker}/git.go | 54 +++++++++++++++++++++++------ {walk => walker}/walker.go | 12 +++---- 9 files changed, 87 insertions(+), 54 deletions(-) rename {walk => walker}/filesystem.go (98%) rename {walk => walker}/filesystem_test.go (99%) rename {walk => walker}/git.go (69%) rename {walk => walker}/walker.go (83%) diff --git a/cache/cache.go b/cache/cache.go index fe1c1ac2..3d9b3c59 100644 --- a/cache/cache.go +++ b/cache/cache.go @@ -12,7 +12,7 @@ import ( "git.numtide.com/numtide/treefmt/stats" "git.numtide.com/numtide/treefmt/format" - "git.numtide.com/numtide/treefmt/walk" + "git.numtide.com/numtide/treefmt/walker" "github.com/charmbracelet/log" @@ -187,7 +187,7 @@ func putEntry(bucket *bolt.Bucket, path string, entry *Entry) error { // ChangeSet is used to walk a filesystem, starting at root, and outputting any new or changed paths using pathsCh. // It determines if a path is new or has changed by comparing against cache entries. -func ChangeSet(ctx context.Context, walker walk.Walker, filesCh chan<- *walk.File) error { +func ChangeSet(ctx context.Context, wk walker.Walker, filesCh chan<- *walker.File) error { start := time.Now() defer func() { @@ -205,7 +205,7 @@ func ChangeSet(ctx context.Context, walker walk.Walker, filesCh chan<- *walk.Fil } }() - return walker.Walk(ctx, func(file *walk.File, err error) error { + return wk.Walk(ctx, func(file *walker.File, err error) error { select { case <-ctx.Done(): return ctx.Err() @@ -264,7 +264,7 @@ func ChangeSet(ctx context.Context, walker walk.Walker, filesCh chan<- *walk.Fil } // Update is used to record updated cache information for the specified list of paths. -func Update(files []*walk.File) error { +func Update(files []*walker.File) error { start := time.Now() defer func() { logger.Debugf("finished processing %v paths in %v", len(files), time.Since(start)) diff --git a/cli/cli.go b/cli/cli.go index 0d56cafb..655e670a 100644 --- a/cli/cli.go +++ b/cli/cli.go @@ -6,7 +6,7 @@ import ( "github.com/gobwas/glob" "git.numtide.com/numtide/treefmt/format" - "git.numtide.com/numtide/treefmt/walk" + "git.numtide.com/numtide/treefmt/walker" "github.com/alecthomas/kong" "github.com/charmbracelet/log" ) @@ -25,10 +25,11 @@ type Format struct { Formatters []string `short:"f" help:"Specify formatters to apply. Defaults to all formatters."` TreeRoot string `type:"existingdir" xor:"tree-root" help:"The root directory from which treefmt will start walking the filesystem (defaults to the directory containing the config file)."` TreeRootFile string `type:"string" xor:"tree-root" help:"File to search for to find the project root (if --tree-root is not passed)."` - Walk walk.Type `enum:"auto,git,filesystem" default:"auto" help:"The method used to traverse the files within --tree-root. Currently supports 'auto', 'git' or 'filesystem'."` - Verbosity int `name:"verbose" short:"v" type:"counter" default:"0" env:"LOG_LEVEL" help:"Set the verbosity of logs e.g. -vv."` - Version bool `name:"version" short:"V" help:"Print version."` - Init bool `name:"init" short:"i" help:"Create a new treefmt.toml."` + Walk walker.Type `enum:"auto,git,filesystem" default:"auto" help:"The method used to traverse the files within --tree-root. Currently supports 'auto', 'git' or 'filesystem'."` + + Verbosity int `name:"verbose" short:"v" type:"counter" default:"0" env:"LOG_LEVEL" help:"Set the verbosity of logs e.g. -vv."` + Version bool `name:"version" short:"V" help:"Print version."` + Init bool `name:"init" short:"i" help:"Create a new treefmt.toml."` OnUnmatched log.Level `name:"on-unmatched" short:"u" default:"warn" help:"Log paths that did not match any formatters at the specified log level, with fatal exiting the process with an error. Possible values are ."` @@ -40,9 +41,9 @@ type Format struct { formatters map[string]*format.Formatter globalExcludes []glob.Glob - filesCh chan *walk.File - formattedCh chan *walk.File - processedCh chan *walk.File + fileCh chan *walker.File + formattedCh chan *walker.File + processedCh chan *walker.File } func (f *Format) configureLogging() { diff --git a/cli/format.go b/cli/format.go index 95ff37bc..34cbfdb4 100644 --- a/cli/format.go +++ b/cli/format.go @@ -18,7 +18,7 @@ import ( "git.numtide.com/numtide/treefmt/cache" "git.numtide.com/numtide/treefmt/config" - "git.numtide.com/numtide/treefmt/walk" + "git.numtide.com/numtide/treefmt/walker" "github.com/charmbracelet/log" "golang.org/x/sync/errgroup" @@ -147,13 +147,13 @@ func (f *Format) Run() (err error) { // create a channel for files needing to be processed // we use a multiple of batch size here as a rudimentary concurrency optimization based on the host machine - f.filesCh = make(chan *walk.File, BatchSize*runtime.NumCPU()) + f.fileCh = make(chan *walker.File, BatchSize*runtime.NumCPU()) // create a channel for files that have been formatted - f.formattedCh = make(chan *walk.File, cap(f.filesCh)) + f.formattedCh = make(chan *walker.File, cap(f.fileCh)) // create a channel for files that have been processed - f.processedCh = make(chan *walk.File, cap(f.filesCh)) + f.processedCh = make(chan *walker.File, cap(f.fileCh)) // start concurrent processing tasks in reverse order eg.Go(f.updateCache(ctx)) @@ -168,14 +168,14 @@ func (f *Format) Run() (err error) { func (f *Format) walkFilesystem(ctx context.Context) func() error { return func() error { eg, ctx := errgroup.WithContext(ctx) - pathsCh := make(chan string, BatchSize) + pathCh := make(chan string, BatchSize) // By default, we use the cli arg, but if the stdin flag has been set we force a filesystem walk // since we will only be processing one file from a temp directory walkerType := f.Walk if f.Stdin { - walkerType = walk.Filesystem + walkerType = walker.Filesystem // check we have only received one path arg which we use for the file extension / matching to formatters if len(f.Paths) != 1 { @@ -197,7 +197,7 @@ func (f *Format) walkFilesystem(ctx context.Context) func() error { } walkPaths := func() error { - defer close(pathsCh) + defer close(pathCh) var idx int for idx < len(f.Paths) { @@ -205,7 +205,7 @@ func (f *Format) walkFilesystem(ctx context.Context) func() error { case <-ctx.Done(): return ctx.Err() default: - pathsCh <- f.Paths[idx] + pathCh <- f.Paths[idx] idx += 1 } } @@ -217,29 +217,29 @@ func (f *Format) walkFilesystem(ctx context.Context) func() error { eg.Go(walkPaths) } else { // no explicit paths to process, so we only need to process root - pathsCh <- f.TreeRoot - close(pathsCh) + pathCh <- f.TreeRoot + close(pathCh) } // create a filesystem walker - walker, err := walk.New(walkerType, f.TreeRoot, pathsCh) + wk, err := walker.New(walkerType, f.TreeRoot, f.NoCache, pathCh) if err != nil { return fmt.Errorf("failed to create walker: %w", err) } - // close the files channel when we're done walking the file system - defer close(f.filesCh) + // close the file channel when we're done walking the file system + defer close(f.fileCh) // if no cache has been configured, or we are processing from stdin, we invoke the walker directly if f.NoCache || f.Stdin { - return walker.Walk(ctx, func(file *walk.File, err error) error { + return wk.Walk(ctx, func(file *walker.File, err error) error { select { case <-ctx.Done(): return ctx.Err() default: stats.Add(stats.Traversed, 1) stats.Add(stats.Emitted, 1) - f.filesCh <- file + f.fileCh <- file return nil } }) @@ -247,7 +247,7 @@ func (f *Format) walkFilesystem(ctx context.Context) func() error { // otherwise we pass the walker to the cache and have it generate files for processing based on whether or not // they have been added/changed since the last invocation - if err = cache.ChangeSet(ctx, walker, f.filesCh); err != nil { + if err = cache.ChangeSet(ctx, wk, f.fileCh); err != nil { return fmt.Errorf("failed to generate change set: %w", err) } return nil @@ -319,7 +319,7 @@ func (f *Format) applyFormatters(ctx context.Context) func() error { }() // iterate the files channel - for file := range f.filesCh { + for file := range f.fileCh { // first check if this file has been globally excluded if format.PathMatches(file.RelPath, f.globalExcludes) { @@ -419,7 +419,7 @@ func (f *Format) detectFormatted(ctx context.Context) func() error { func (f *Format) updateCache(ctx context.Context) func() error { return func() error { // used to batch updates for more efficient txs - batch := make([]*walk.File, 0, BatchSize) + batch := make([]*walker.File, 0, BatchSize) // apply a batch processBatch := func() error { diff --git a/format/formatter.go b/format/formatter.go index 36687393..4e611f44 100644 --- a/format/formatter.go +++ b/format/formatter.go @@ -8,7 +8,7 @@ import ( "os/exec" "time" - "git.numtide.com/numtide/treefmt/walk" + "git.numtide.com/numtide/treefmt/walker" "git.numtide.com/numtide/treefmt/config" @@ -89,7 +89,7 @@ func (f *Formatter) Apply(ctx context.Context, tasks []*Task) error { // Wants is used to test if a Formatter wants a path based on it's configured Includes and Excludes patterns. // Returns true if the Formatter should be applied to path, false otherwise. -func (f *Formatter) Wants(file *walk.File) bool { +func (f *Formatter) Wants(file *walker.File) bool { match := !PathMatches(file.RelPath, f.excludes) && PathMatches(file.RelPath, f.includes) if match { f.log.Debugf("match: %v", file) diff --git a/format/task.go b/format/task.go index 062b4544..81d16602 100644 --- a/format/task.go +++ b/format/task.go @@ -4,16 +4,16 @@ import ( "cmp" "slices" - "git.numtide.com/numtide/treefmt/walk" + "git.numtide.com/numtide/treefmt/walker" ) type Task struct { - File *walk.File + File *walker.File Formatters []*Formatter BatchKey string } -func NewTask(file *walk.File, formatters []*Formatter) Task { +func NewTask(file *walker.File, formatters []*Formatter) Task { // sort by priority in ascending order slices.SortFunc(formatters, func(a, b *Formatter) int { priorityA := a.Priority() diff --git a/walk/filesystem.go b/walker/filesystem.go similarity index 98% rename from walk/filesystem.go rename to walker/filesystem.go index 729a4972..8a47c7af 100644 --- a/walk/filesystem.go +++ b/walker/filesystem.go @@ -1,4 +1,4 @@ -package walk +package walker import ( "context" diff --git a/walk/filesystem_test.go b/walker/filesystem_test.go similarity index 99% rename from walk/filesystem_test.go rename to walker/filesystem_test.go index 503ec0d3..1edaa170 100644 --- a/walk/filesystem_test.go +++ b/walker/filesystem_test.go @@ -1,4 +1,4 @@ -package walk +package walker import ( "context" diff --git a/walk/git.go b/walker/git.go similarity index 69% rename from walk/git.go rename to walker/git.go index 4dc91967..bbf30652 100644 --- a/walk/git.go +++ b/walker/git.go @@ -1,4 +1,4 @@ -package walk +package walker import ( "context" @@ -6,6 +6,9 @@ import ( "io/fs" "os" "path/filepath" + "time" + + "github.com/go-git/go-git/v5/plumbing/format/index" "github.com/charmbracelet/log" @@ -13,9 +16,11 @@ import ( ) type gitWalker struct { - root string - paths chan string - repo *git.Repository + root string + paths chan string + repo *git.Repository + + noCache bool relPathOffset int } @@ -39,7 +44,20 @@ func (g gitWalker) Walk(ctx context.Context, fn WalkFunc) error { } // cache in-memory whether a path is present in the git index - var cache map[string]bool + var cache map[string]*index.Entry + + // by default, we only emit files if they have changes when compared with the git index + emitFile := func(entry *index.Entry, info os.FileInfo) bool { + // mod time comparison is done with EPOCH (second) precision as per the POSIX spec + return entry.ModifiedAt.Truncate(time.Second) != info.ModTime().Truncate(time.Second) + } + + if g.noCache { + // emit all files in the index + emitFile = func(entry *index.Entry, info os.FileInfo) bool { + return true + } + } for path := range g.paths { @@ -63,6 +81,11 @@ func (g gitWalker) Walk(ctx context.Context, fn WalkFunc) error { return fmt.Errorf("failed to stat %s: %w", path, err) } + // skip processing if the file hasn't changed + if !emitFile(entry, info) { + continue + } + // determine a relative path relPath, err := g.relPath(path) if err != nil { @@ -83,11 +106,11 @@ func (g gitWalker) Walk(ctx context.Context, fn WalkFunc) error { continue } - // otherwise we ensure the git index entries are cached and then check if they are in the git index + // otherwise we ensure the git index entries are cached and then check if the path is in the git index if cache == nil { - cache = make(map[string]bool) + cache = make(map[string]*index.Entry) for _, entry := range idx.Entries { - cache[entry.Name] = true + cache[entry.Name] = entry } } @@ -103,7 +126,8 @@ func (g gitWalker) Walk(ctx context.Context, fn WalkFunc) error { } return filepath.Walk(path, func(path string, info fs.FileInfo, _ error) error { - if info.IsDir() { + // ignore directories and symlinks + if info.IsDir() || info.Mode()&os.ModeSymlink == os.ModeSymlink { return nil } @@ -112,9 +136,12 @@ func (g gitWalker) Walk(ctx context.Context, fn WalkFunc) error { return fmt.Errorf("failed to determine a relative path for %s: %w", path, err) } - if _, ok := cache[relPath]; !ok { + if entry, ok := cache[relPath]; !ok { log.Debugf("path %v not found in git index, skipping", path) return nil + } else if !emitFile(entry, info) { + log.Debugf("path %v has not changed, skipping", path) + return nil } file := File{ @@ -130,7 +157,11 @@ func (g gitWalker) Walk(ctx context.Context, fn WalkFunc) error { return nil } -func NewGit(root string, paths chan string) (Walker, error) { +func NewGit( + root string, + noCache bool, + paths chan string, +) (Walker, error) { repo, err := git.PlainOpen(root) if err != nil { return nil, fmt.Errorf("failed to open git repo: %w", err) @@ -139,6 +170,7 @@ func NewGit(root string, paths chan string) (Walker, error) { root: root, paths: paths, repo: repo, + noCache: noCache, relPathOffset: len(root) + 1, }, nil } diff --git a/walk/walker.go b/walker/walker.go similarity index 83% rename from walk/walker.go rename to walker/walker.go index e3afb77f..270854cf 100644 --- a/walk/walker.go +++ b/walker/walker.go @@ -1,4 +1,4 @@ -package walk +package walker import ( "context" @@ -56,12 +56,12 @@ type Walker interface { Walk(ctx context.Context, fn WalkFunc) error } -func New(walkerType Type, root string, pathsCh chan string) (Walker, error) { +func New(walkerType Type, root string, noCache bool, pathsCh chan string) (Walker, error) { switch walkerType { case Git: - return NewGit(root, pathsCh) + return NewGit(root, noCache, pathsCh) case Auto: - return Detect(root, pathsCh) + return Detect(root, noCache, pathsCh) case Filesystem: return NewFilesystem(root, pathsCh) default: @@ -69,9 +69,9 @@ func New(walkerType Type, root string, pathsCh chan string) (Walker, error) { } } -func Detect(root string, pathsCh chan string) (Walker, error) { +func Detect(root string, noCache bool, pathsCh chan string) (Walker, error) { // for now, we keep it simple and try git first, filesystem second - w, err := NewGit(root, pathsCh) + w, err := NewGit(root, noCache, pathsCh) if err == nil { return w, err }