-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdupmerge.c
860 lines (747 loc) · 29.6 KB
/
dupmerge.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
// $Id: dupmerge.c,v 1.7 2022/12/09 02:38:11 karn Exp $
/* Dupmerge - Reclaim disk space by deleting redundant copies of files and
* creating hardlinks in their place
* To build: gcc -o dupmerge -O2 dupmerge.c -l crypto
* Uses the SHA hash functions in the openssl crypto library
* This program reads from standard input, or generates from command line arguments, a list of files.
* It then discovers which files have identical contents, unlinks all but one and recreates the other
* path names as hard links to the remaining copy. Except when the -f option is used, identical files are discovered
* regardless of directory path or file basename.
* The input file list may be generated with a command like 'find . -print', or if the -0 option to dupmerge is used,
* 'find . -print0'.
* Non-plain files in the input (directories, pipes, devices, etc)
* are ignored. Identical files must be on the same file system to be linked.
* Empty files are ignored by default because they are often used as locks or flags, and no disk space is reclaimed
* by deleting them anyway.
* Dupmerge prefers to keep the older of two identical files, as the older
* timestamp is more likely to be the correct one given that many
* copy utilities (e.g., 'cp') do not by default preserve modification
* times.
* Note: although this program will not lose any file contents or path name information, it will lose the inode information
* in the unlinked copies. This includes file owner/group, modes and modification timestamps. For this reason, dupmerge is
* best used on collections of files owned by a single user.
* Command line arguments:
* -0 Delimit file names with nulls rather than newlines; for use with 'find -print0'
* -q Operate in quiet mode (otherwise, relinks are displayed on stdout)
* -f Fast (very fast) mode that bypasses an exhaustive file comparison in favor of modification timestamps.
* If the two files are the same size, have the same basename and exactly the same timestamp, they're probably the same.
* Rsync uses this method by default and it seems to work well.
* -t threshold_size (default 100,000 bytes); Apply the fast mode feature only to files larger than threshold_size
* -n Dry run: simply list the actions that would be taken without actually unlinking or linking anything. Turns off -q.
* -s By default, files are sorted in decreasing order of size so that the actual unlinking of duplicate files starts with
* the largest files. This recovers disk space as quickly as possible, but if for some reason you want to start with the
* smallest files, use this flag.
* -L locale (default: en_US.UTF8) I added this because I'm going blind mentally inserting commas in big integers.
* 2008: New algorithm. I no longer do the duplicate unlinking and relinking inside the qsort comparison routine.
* It was a cute idea, but probably not as good as just producing a list of files, sorting it by size,
* and running through that.
* March 2009: the sort comparison function looks at the first page of each file when they're the same size.
* This greatly reduces the number of comparisons done after the sort. Files with unique sizes are ignored. Files are
* completely read at most once thanks to caching of their SHA1 hash values.
* May 2010: simplify the algorithm (only one sort is performed) and fix bugs in the handling of more
* than two identical copies of a single file. This version should reliably identify and link all identical copies.
* Fall 2010: one or more directories and file names may be specified on the command line. In that case, dupmerge will explore
* those directories and build the list itself instead of reading it from stdin.
* Fall 2021: Significant rewrite. Simplify structure, use common library functions in library.c, switch to SHA256 hash
Now requires extended file attribute support; always caches hashes in user.sha1/user.sha256 (Linux) or sha1/sha256 (MacOS)
* Catch signals so that summary statistics can be produced when the program is aborted early.
* Extra paranoia checks right before a pair of files is merged: if either file appears to have been modified since the
* list was originally generated, dupmerge exits.
* Trivia: this program was inspired by a cheating scandal in the introductory computer science course CS-100
* at Cornell University the year after I graduated. The TAs simply sorted the projects by object code size and compared
* those that were equal. That effectively found copies where only the variable names and comments had been changed.
*
* Copyright Phil Karn, karn@ka9q.net. http://www.ka9q.net
* May be used under the terms of the GNU General Public License v 2.0
* Many thanks to Simon Baatz for bug reports and fixes
*/
#define _GNU_SOURCE
#define _FILE_OFFSET_BITS 64 // For Linux on 32-platforms
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>
#include <errno.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <libgen.h>
#include <sys/mman.h>
#include <limits.h>
#include <unistd.h>
#include <regex.h>
#include <openssl/sha.h>
#include <locale.h>
#include <dirent.h>
#include <signal.h>
#include <ftw.h>
#include <sys/xattr.h>
#include "filehash.h"
// BSD-based OSX uses an argument to control the following of symbolic links; Linux uses separate system calls
#if __APPLE__
#define GETXATTR(a,b,c,d) getxattr((a),(b),(c),(d),0,XATTR_NOFOLLOW)
#define REMOVEXATTR(a,b) removexattr((a),(b),XATTR_NOFOLLOW)
#define SETXATTR(a,b,c,d,e) setxattr((a),(b),(c),(d),0,(e)|XATTR_NOFOLLOW)
#define FGETXATTR(a,b,c,d) fgetxattr((a),(b),(c),(d),0,0)
#define FSETXATTR(a,b,c,d,e) fsetxattr((a),(b),(c),(d),0,(e))
#else
#define GETXATTR(a,b,c,d) lgetxattr((a),(b),(c),(d))
#define REMOVEXATTR(a,b) lremovexattr((a),(b))
#define SETXATTR(a,b,c,d,e) lsetxattr((a),(b),(c),(d),(e))
#define FGETXATTR(a,b,c,d) fgetxattr((a),(b),(c),(d))
#define FSETXATTR(a,b,c,d,e) fsetxattr((a),(b),(c),(d),(e))
#endif
/* Darwin (OSX) has this, but Linux apparently doesn't */
#ifndef MAP_NOCACHE
#define MAP_NOCACHE (0)
#endif
/* Linux has this and OSX apparently doesn't */
#ifndef MAP_POPULATE
#define MAP_POPULATE (0)
#endif
int Nopenfd = 100;
enum flag { NO=0,YES=1,UNKNOWN=-1 };
enum flag Fast_flag = NO;
enum flag Zero_flag = NO;
enum flag Quiet_flag = NO;
enum flag Fast_threshold = 100000;
enum flag No_do = NO;
enum flag Small_first = NO;
unsigned long long Minimum_size = 1; // 1 byte minimum
int Verbose;
int Progress;
/* Statistics counts */
unsigned long long Regular_file;
unsigned long long FIFO;
unsigned long long Character_special;
unsigned long long Directory;
unsigned long long Block_special;
unsigned long long Symbolic_link;
unsigned long long Socket;
unsigned long long Whiteout;
unsigned long long Unknown;
unsigned long long Null_pathname;
unsigned long long Total_files;
unsigned long long Empty;
unsigned long long Too_small;
unsigned long long Stat_fail;
unsigned long long Not_accessible;
unsigned long long Files_deleted;
unsigned long long Unique_sizes;
unsigned long long Extra_links;
unsigned long long Hard_links;
unsigned long long Blocks_reclaimed;
unsigned long long Hashes_computed;
unsigned long long Hashes_fetched;
unsigned long long Hash_fail;
unsigned long long Unlinks;
unsigned long long Unlink_failures;
size_t pagesize;
size_t mapchunk;
char const *Program_name; // Points to argv[0], for benefit of subroutines generating error messages
#define ENTRYCHUNK 5000 // Allocation unit for file table
#define MAP_PAGES 32768 // Number of pages to mmap on each loop
struct timespec Epoch = {
0,0,
};
// File table entry
struct entry {
char *pathname; // Path name, reserved space for null
struct stat statbuf; // file i-node data
struct attr256 attr256;
int attr_present;
};
void print_stats(void);
void sig_handler(int);
// Comparison functions
int comparison_equal(void const *ap,void const *bp); // Does most of the work
int comparison_sort(void const *ap,void const *bp); // Version called by qsort()
int compare_inodes(void const *ap,void const *bp); // Version called by qsort()
int compare_extents(void const *ap,void const *bp);
struct entry *Entries; // Dynamically allocated file table
unsigned int Entryarraysize; // Start with empty table, allocate on first pass
unsigned int Nfiles; // Actual number of entries in Entries[]
void dump_files(void);
void dump_entry(struct entry const *);
int get_big_hash(struct entry *ep,struct stat const *);
// Our function that will handle each file in the hierarchy
int process(char const *pathname,struct stat const *statbuf,int typeflag,struct FTW *ftwbuf);
int main(int argc,char *argv[]){
char const *locale_string = "en_US.UTF-8";
Program_name = argv[0];
assert(sha256_selftest() == 0);
// Determine system page size and amount to map each time
pagesize = sysconf(_SC_PAGESIZE);
pagesize = pagesize > 0 ? pagesize : 4096; // If it fails, default to 4096 (x86 page size)
mapchunk = MAP_PAGES * pagesize; // Amount to hash at one time in full file hash
// Process command line args
int c;
while((c = getopt(argc,argv,"snqf0t:L:vm:")) != EOF){
switch(c){
case 'm':
Minimum_size = strtoll(optarg,NULL,0);
break;
default:
printf("Usage: %s [-i interval] [-v] [-m minsize] [-s] [-n] [-q] [-f] [-0] [-L locale] [-t threshold_size]\n",Program_name);
break;
case 'L':
locale_string = optarg;
break;
case 's':
Small_first = YES;
break;
case 'n':
No_do = YES;
break;
case 'q':
Quiet_flag = YES;
break;
case 'f':
Fast_flag = YES; /* Just compare modification timestamps, not file contents */
break;
case '0':
Zero_flag = YES; /* Path names are delimited by nulls, e.g., from 'find . -print0' */
break;
case 't':
Fast_threshold = atoi(optarg);
break;
case 'v':
Verbose++;
break;
}
}
if(NULL == setlocale(LC_NUMERIC,locale_string))
printf("setlocale %s failed\n",locale_string);
if(No_do && Quiet_flag){
printf("%s: -q flag forced off with -n set\n",Program_name);
Quiet_flag = NO; /* Force off */
}
// Catch signals so we can show statistics if the user hits ^C
signal(SIGHUP,sig_handler);
signal(SIGINT,sig_handler);
signal(SIGQUIT,sig_handler);
signal(SIGILL,sig_handler);
signal(SIGABRT,sig_handler);
signal(SIGBUS,sig_handler);
signal(SIGSEGV,sig_handler);
signal(SIGSYS,sig_handler);
signal(SIGPIPE,sig_handler);
signal(SIGTERM,sig_handler);
signal(SIGXCPU,sig_handler);
signal(SIGXFSZ,sig_handler);
signal(SIGVTALRM,sig_handler);
signal(SIGPROF,sig_handler);
signal(SIGUSR1,sig_handler);
signal(SIGUSR2,sig_handler);
// Process directories given as command-line arguments
if(optind < argc){
for(int i=optind;i<argc;i++)
nftw(argv[i],process,Nopenfd,FTW_PHYS);
} else {
// If no directories on command line, read list of files from stdin
while(!feof(stdin)){
char pathname[PATH_MAX+1];
int i;
for(i=0;i< PATH_MAX;i++){
char const ch = getc(stdin);
if(EOF == ch || '\0' == ch || (!Zero_flag && '\n' == ch))
break;
pathname[i] = ch;
}
pathname[i] = '\0';
nftw(pathname,process,Nopenfd,FTW_PHYS);
}
}
// Display file statistics
if(!Quiet_flag){
printf("%s: input files: total %'llu; ordinary %'llu",Program_name,Total_files,Regular_file);
if(FIFO)
printf("; FIFO %'llu",FIFO);
if(Character_special)
printf("; char special %'llu",Character_special);
if(Directory)
printf("; directories %'llu",Directory);
if(Block_special)
printf("; block specials %'llu",Block_special);
if(Symbolic_link)
printf("; symbolic links %'llu",Symbolic_link);
if(Socket)
printf("; sockets %'llu",Socket);
if(Whiteout)
printf("; whiteouts %'llu",Whiteout);
if(Empty)
printf("; empties %'llu",Empty);
if(Too_small)
printf("; too small %'llu",Too_small);
if(Unknown)
printf("; unknown %'llu",Unknown);
putc('\n',stdout);
if(Null_pathname)
printf("%s: null pathnames %'llu\n",Program_name,Null_pathname);
if(Stat_fail)
printf("%s: stat failures %'llu\n",Program_name,Stat_fail);
if(Not_accessible)
printf("%s: files not accessible %'llu\n",Program_name,Not_accessible);
if(Nfiles == 0)
printf("%s: no files left to examine\n",Program_name);
}
if(Nfiles < 2)
exit(0); // Nothing to do!
#if DEBUG
dump_files();
#endif
if(No_do)
printf("%s: dry run, no files will actually be unlinked\n",Program_name);
/* Sort by file size/device/mod time/nlinks */
qsort(Entries,Nfiles,sizeof(struct entry),comparison_sort);
if(Verbose)
printf("%s: sort done, %'u entries on list\n",Program_name,Nfiles);
#if DEBUG
dump_files();
#endif
// Walk through file list culling out redundant hard links
// This is only performed for the first (oldest) entry of a given size on a device; if other files of the
// same size follow, we must leave them all because it's possible we'll have to merge all those
// path names to the first one. This still takes care of the common case where all the entries
// of a given size refer to a single file
{
int j = 0;
for(int i=0;i<Nfiles;i = j){
for(j=i+1; j<Nfiles &&
Entries[i].statbuf.st_size == Entries[j].statbuf.st_size &&
Entries[i].statbuf.st_dev == Entries[j].statbuf.st_dev &&
Entries[i].statbuf.st_ino == Entries[j].statbuf.st_ino; j++){
// Already hard linked, remove from list
free(Entries[j].pathname);
Entries[j].pathname = NULL;
Extra_links++;
}
// Skip any more files with the same size on the same device
for(;j<Nfiles && Entries[i].statbuf.st_size == Entries[j].statbuf.st_size &&
Entries[i].statbuf.st_dev == Entries[j].statbuf.st_dev;j++)
; // empty loop body
}
}
if(Extra_links != 0){
if(Verbose)
printf("%s: %'llu redundant hard links removed from list\n",Program_name,Extra_links);
qsort(Entries,Nfiles,sizeof(struct entry),comparison_sort);
Nfiles -= Extra_links;
if(Verbose)
printf("%s: list resorted, %'u entries left\n",Program_name,Nfiles);
if(Nfiles < 2){
free(Entries);
exit(0);
}
}
// Walk through file list culling out unique sizes
{
int j = 0;
for(int i=0;i<Nfiles;i = j){
// Find first file after this one with a different size or device
for(j=i+1; j<Nfiles &&
Entries[i].statbuf.st_size == Entries[j].statbuf.st_size &&
Entries[i].statbuf.st_dev == Entries[j].statbuf.st_dev; j++){
// empty body
}
if(j == i+1){
// Unique size on this device, remove from list
free(Entries[i].pathname);
Entries[i].pathname = NULL;
Unique_sizes++;
}
}
}
if(Unique_sizes != 0){
if(Verbose)
printf("%s: %'llu files with unique sizes removed from list\n",Program_name,Unique_sizes);
qsort(Entries,Nfiles,sizeof(struct entry),comparison_sort);
Nfiles -= Unique_sizes;
if(Verbose)
printf("%s: list resorted, %'u entries left\n",Program_name,Nfiles);
if(Nfiles < 2){
free(Entries);
exit(0);
}
}
// Walk through first of each group of files that are candidates for being the same
// This is the reference file
for(int i=0;i<Nfiles-1;i++){
Progress = i;
// Ignore hard links to earlier reference files
if(Entries[i].pathname == NULL)
continue;
// The qsort grouped together all files with the same size on the same device
// Scan forward for all files with the same size and device as the reference file
for(int j=i+1;
j<Nfiles
&& Entries[i].statbuf.st_size == Entries[j].statbuf.st_size
&& Entries[i].statbuf.st_dev == Entries[j].statbuf.st_dev;
j++){
if(Entries[j].pathname == NULL)
continue;
if(Entries[i].statbuf.st_dev == Entries[j].statbuf.st_dev
&& Entries[i].statbuf.st_ino == Entries[j].statbuf.st_ino){
// Existing hard link to reference file; mark so we'll skip over it later
free(Entries[j].pathname);
Entries[j].pathname = NULL;
continue;
}
if(comparison_equal(&Entries[i],&Entries[j]) == 0){
// Distinct files with identical contents on same file system, can be linked
if(Verbose){
printf("%s: %'llu ln %s -> %s\n",Program_name,(unsigned long long)Entries[j].statbuf.st_size,Entries[j].pathname,Entries[i].pathname);
#if DEBUG
printf("debug: inodes %'llu %'llu\n",(unsigned long long)Entries[j].statbuf.st_ino,(unsigned long long)Entries[i].statbuf.st_ino);
#endif
}
if(Entries[j].statbuf.st_nlink == 1){
// Pathname has single remaining link, so its blocks will be recovered
Blocks_reclaimed += Entries[j].statbuf.st_blocks;
}
{
// Some last minute paranoid checks
struct stat statbuf_a,statbuf_b;
if(lstat(Entries[i].pathname,&statbuf_a)){
printf("%s: can't lstat(%s): %d %s\n",Program_name,Entries[i].pathname,errno,strerror(errno));
abort();
}
if(lstat(Entries[j].pathname,&statbuf_b)){
printf("%s: can't lstat(%s): %d %s\n",Program_name,Entries[j].pathname,errno,strerror(errno));
abort();
}
// ensure file hasn't been modified
if(statbuf_a.st_size != Entries[i].statbuf.st_size || time_cmp(&statbuf_a.st_mtim,&Entries[i].statbuf.st_mtim)){
printf("%s: %s has changed; restart program\n",Program_name,Entries[i].pathname);
goto done;
}
if(statbuf_b.st_size != Entries[j].statbuf.st_size || time_cmp(&statbuf_b.st_mtim,&Entries[j].statbuf.st_mtim)){
printf("%s: %s has changed; restart program\n",Program_name,Entries[j].pathname);
goto done;
}
assert(statbuf_a.st_size == statbuf_b.st_size);
assert(statbuf_a.st_ino != statbuf_b.st_ino);
assert(statbuf_a.st_dev == statbuf_b.st_dev);
// the epoch sorts after all other times because mtimes can sometimes be set to it by error
assert(time_cmp(&Epoch,&statbuf_b.st_mtim) == 0 || time_cmp(&statbuf_b.st_mtim,&statbuf_a.st_mtim) >= 0);
}
if(!No_do){
if(unlink(Entries[j].pathname)) {
Unlink_failures++;
printf("%s: can't unlink(%s): %d %s\n",Program_name,Entries[j].pathname,errno,strerror(errno));
} else if(link(Entries[i].pathname,Entries[j].pathname)){
// Should never fail if unlink succeeded
printf("%s: can't link(%s,%s): %d %s\n",Program_name,Entries[i].pathname,Entries[j].pathname,errno,strerror(errno));
abort();
}
}
// Don't use this entry as a reference file later
free(Entries[j].pathname);
Entries[j].pathname = NULL;
Unlinks++;
} // if same
} // End of inner for() loop
} // end of for loop looking ahead for match with file i
done:;
print_stats();
free(Entries); // Not really necessary since we're exiting
exit(0);
}
// Process a path name
// if it's an ordinary, readable, non-empty file, add it to the list
// if it's a directory, process it recursively
// Otherwise, ignore it
int process(char const *pathname,const struct stat *statbuf,int typeflag,struct FTW *ftwbuf){
struct entry *ep;
assert(statbuf != NULL);
Total_files++;
// Ignore null path names
if(pathname == NULL || strlen(pathname) == 0){
Null_pathname++;
return 0;
}
if(typeflag == FTW_NS){
Stat_fail++;
if(Verbose > 1)
printf("%s: stat failed!\n",pathname);
return 0; // Stat() failed, no analysis possible
}
// Count but otherwise ignore special files: FIFOs, block special, character special, sockets, whitespace
switch(statbuf->st_mode & S_IFMT){
case S_IFIFO:
FIFO++;
if(Verbose > 1)
printf("%s: FIFO\n",pathname);
return 0;
case S_IFBLK:
Block_special++;
if(Verbose > 1)
printf("%s: block special\n",pathname);
return 0;
case S_IFCHR:
Character_special++;
if(Verbose > 1)
printf("%s: char special\n",pathname);
return 0;
case S_IFSOCK:
Socket++;
if(Verbose > 1)
printf("%s: socket\n",pathname);
return 0;
#ifdef S_IFWHT
case S_IFWHT:
Whiteout++;
if(Verbose > 1)
printf("%s: whiteout\n",pathname);
return 0;
#endif
case S_IFREG:
Regular_file++;
break;
case S_IFLNK:
Symbolic_link++;
break;
case S_IFDIR:
Directory++;
if(Verbose > 1)
printf("%s: Directory\n",pathname);
return 0;
default:
Unknown++;
if(Verbose > 1)
printf("%s: unknown type (0%o)\n",pathname,statbuf->st_mode & S_IFMT);
return 0;
}
if(typeflag != FTW_F)
return 0; // Ignore all but ordinary files
if(statbuf->st_blocks == 0){
// Ignore files with no assigned data blocks (any data being stored in the inode).
// Zero size files are often used as flags and locks we don't want to upset. And we won't recover
// any data blocks from a file without any data blocks!
// I should also exclude HFS files on OSX with resource forks
if(Verbose > 1)
printf("%s: empty\n",pathname);
Empty++;
return 0;
}
if(statbuf->st_size < Minimum_size){
if(Verbose > 1)
printf("%s: too small\n",pathname);
Too_small++;
return 0;
}
// Ordinary file; add to file table
// Expand file table if necessary and possible
if(Nfiles >= Entryarraysize){
Entries = (struct entry *)realloc(Entries,(Entryarraysize + ENTRYCHUNK) * sizeof(struct entry));
assert(Entries != NULL);
Entryarraysize += ENTRYCHUNK;
}
ep = &Entries[Nfiles];
memset(ep,0,sizeof(*ep));
ep->statbuf = *statbuf;
ep->pathname = strdup(pathname);
Nfiles++;
#if DEBUG
dump_entry(ep);
#endif
return 0;
}
// Compare files by size (used by second sort)
// Return 0 means same size *and* on same device
// Returning <0 causes the first argument to sort toward the top of the list
// Returning >0 causes the second argument to sort toward the top of the list
// We want the largest files to go to the top of the list, so "smaller is greater".
// We also want empty entries to go to the end of the list, so they are always "greater"
int comparison_sort(void const *ap,void const *bp){
struct entry const *a = (struct entry *)ap;
struct entry const *b = (struct entry *)bp;
// These aren't really illegal, but qsort should never pass null entries, or try to compare an entry with itself
assert(a != NULL);
assert(b != NULL);
assert(a != b);
// Null pathnames indicate cleared entries marked for deletion; push them to the bottom of the sort
if(!b->pathname)
return -1;
else if(!a->pathname)
return 1;
// Each non-cleared entry has a unique path name, and qsort should never compare an entry with itself
assert(b->pathname != a->pathname);
// Distinguish first by size.
// By default, bigger files sort first unless overridden with -s option
if(b->statbuf.st_size != a->statbuf.st_size){
if(Small_first)
return a->statbuf.st_size > b->statbuf.st_size? +1 : -1;
else
return a->statbuf.st_size > b->statbuf.st_size? -1 : +1;
}
// Files are same size; distinguish if on different device; ordering is unimportant
if(b->statbuf.st_dev != a->statbuf.st_dev)
return b->statbuf.st_dev > a->statbuf.st_dev ? +1 : -1;
// Same size, same device; distinguish by modification time, older files first
// Exception: the epoch (0) is considered to come *after* all other times, as mtimes sometimes get zeroed and we don't
// want to lose a non-zero mtime that's more likely correct
if(time_cmp(&a->statbuf.st_mtim,&b->statbuf.st_mtim) != 0){
if(time_cmp(&Epoch,&a->statbuf.st_mtim) == 0) // epoch; comes after everything
return +1;
else if(time_cmp(&Epoch,&b->statbuf.st_mtim) == 0)
return -1;
else return time_cmp(&a->statbuf.st_mtim,&b->statbuf.st_mtim);
}
// Order equal, same time files with fewer links later so they'll be preferentially deleted sooner
if(a->statbuf.st_nlink != b->statbuf.st_nlink)
return b->statbuf.st_nlink > a->statbuf.st_nlink? +1 : -1;
// Same size, same device, same modification time, same number of links; includes case of two links to same inode
return 0;
}
// Note: these *do* modify their arguments by writing hashes if they don't already exist
int comparison_equal(const void *ap,const void *bp){
struct entry *a = (struct entry *)ap;
struct entry *b = (struct entry *)bp;
if(a == b)
return 0; /* Can this happen? */
assert(a != NULL);
assert(b != NULL);
// If the files are hard linked, they're the same. This "can't happen" because we've already
// checked for it, but to be safe...
if(a->statbuf.st_dev == b->statbuf.st_dev && a->statbuf.st_ino == b->statbuf.st_ino)
return 0;
// Make file size the most significant part of the comparison
if(b->statbuf.st_size != a->statbuf.st_size)
return b->statbuf.st_size > a->statbuf.st_size ? +1 : -1;
// Same-size files sort together only when they're on the same device
if(b->statbuf.st_dev != a->statbuf.st_dev)
return b->statbuf.st_dev > a->statbuf.st_dev ? +1 : -1;
// Optionally use the rsync heuristic -- if the files have the same size, mod timestamp and
// the same base name, declare them the same without actually reading the contents
// Do this only on files larger than Fast_threshold to further reduce chances of false equality
if(Fast_flag && a->statbuf.st_size > Fast_threshold){ // Rsync-style fast comparison
char *bn1,*bn2;
// I could use the built-in basename() function, but what a mess it is
if((bn1 = strrchr(a->pathname,'/')) == NULL)
bn1 = a->pathname;
if((bn2 = strrchr(b->pathname,'/')) == NULL)
bn2 = b->pathname;
// Are the basenames and mod times identical?
// Also require that the mod time isn't the epoch (0), which sometimes happens to a lot of files and could cause
// false equality
if(time_cmp(&Epoch,&a->statbuf.st_mtim) != 0 && 0 == strcmp(bn1,bn2) && time_cmp(&a->statbuf.st_mtim,&b->statbuf.st_mtim) == 0)
return 0;
}
// compare full file hashes
get_big_hash(a,&a->statbuf);
get_big_hash(b,&b->statbuf);
int const i = memcmp(a->attr256.hash,b->attr256.hash,sizeof(a->attr256.hash));
if(i != 0)
Hash_fail++;
return i; // full hash comparison is final and authoritative
}
int compare_inodes(const void *ap,const void *bp){
struct entry const *a = (struct entry *)ap;
struct entry const *b = (struct entry *)bp;
if(a == NULL && b == NULL)
return 0;
else if(a == NULL && b != NULL)
return -1;
else if(a != NULL && b == NULL)
return -1;
if(a->statbuf.st_dev > b->statbuf.st_dev)
return +1;
else if(a->statbuf.st_dev < b->statbuf.st_dev)
return -1;
if(a->statbuf.st_ino > b->statbuf.st_ino)
return +1;
else if(a->statbuf.st_ino < b->statbuf.st_ino)
return -1;
return 0; // The files are linked
}
// Fill in the filehash[] field in the file entry structure
// We can get it from three places:
// 1. If the small hash is available and the file is not larger than Small_hash_size (default 4kiB = 1 page on the x86),
// then we can just copy it over
// 2. The hash can be cached from a previous run in the user.sha1 extended attribute on the file. Not all filesytems support
// extended attributes; I developed this specifically for XFS
// 3. Last but not least, we can compute the full file hash ourselves
//
// In cases 1 & 3 we'll store the hash to the user.sha1 for future use
int get_big_hash(struct entry *ep,struct stat const *statbuf){
if(ep->attr_present)
return 0;
int rval = 0;
int const fd = open(ep->pathname,O_RDONLY);
if(fd == -1){
rval = -1;
if(Verbose)
printf("Can't read %s: %s\n",ep->pathname,strerror(errno));
goto done;
}
// Ensure tag is present and up to date
long long const r = update_tag_fd(fd,statbuf);
if(r < 0){
rval = -1;
if(Verbose)
printf("update_tag_fd(%s) failed:%s\n",ep->pathname,strerror(errno));
goto done;
}
if(r > 0)
Hashes_computed++;
if(getattr256(fd,&ep->attr256) == -1){
rval = -1;
if(Verbose)
printf("%s: error getting attribute: %s\n",ep->pathname,
strerror(errno));
goto done;
}
Hashes_fetched++;
ep->attr_present = 1;
done:;
if(fd != -1)
close(fd);
return rval;
}
void dump_entry(struct entry const * const ep){
#if (__darwin__)
printf(" inode %'llu; links %u; uid %d; gid %d; atime %ld.%09ld; mtime %ld.%09ld; ctime %ld.%09ld; size %'llu; gen %d; %s\n",
ep->statbuf.st_ino,
(unsigned int)ep->statbuf.st_nlink,ep->statbuf.st_uid,ep->statbuf.st_gid,
ep->statbuf.st_atimespec.tv_sec,ep->statbuf.st_atimespec.tv_nsec,
ep->statbuf.st_mtimespec.tv_sec,ep->statbuf.st_mtimespec.tv_nsec,
ep->statbuf.st_ctimespec.tv_sec,ep->statbuf.st_ctimespec.tv_nsec,
(unsigned long long)ep->statbuf.st_size,ep->statbuf.st_gen,ep->pathname);
#elif(__linux__)
printf(" inode %'llu; links %u; uid %d; gid %d; atime %ld.%09ld; mtime %ld.%09ld; ctime %ld.%09ld; size %'llu; %s\n",
(long long unsigned)ep->statbuf.st_ino,
(unsigned int)ep->statbuf.st_nlink,ep->statbuf.st_uid,ep->statbuf.st_gid,
ep->statbuf.st_atim.tv_sec,ep->statbuf.st_atim.tv_nsec,
ep->statbuf.st_mtim.tv_sec,ep->statbuf.st_mtim.tv_nsec,
ep->statbuf.st_ctim.tv_sec,ep->statbuf.st_ctim.tv_nsec,
(unsigned long long)ep->statbuf.st_size,ep->pathname);
#else
printf(" inode %'llu; links %u; uid %d; gid %d; atime %ld; mtime %ld; ctime %ld; size %'llu; %s\n",
(unsigned long long) ep->statbuf.st_ino,
(unsigned int)ep->statbuf.st_nlink,ep->statbuf.st_uid,ep->statbuf.st_gid,
ep->statbuf.st_atime,
ep->statbuf.st_mtime,
ep->statbuf.st_ctime,
(unsigned long long)ep->statbuf.st_size,
ep->pathname);
#endif
}
void dump_files(void){
for(int i=0;i<Nfiles;i++)
dump_entry(&Entries[i]);
}
void print_stats(void){
if(!Quiet_flag){
if(No_do)
printf("%s: This is a dry run; no files were actually unlinked.\n",Program_name);
if(Unlinks)
printf("%s: Unlinks: %'llu; Unlink failures: %'llu; disk blocks reclaimed: %'llu\n",Program_name,Unlinks,Unlink_failures,Blocks_reclaimed);
printf("%s: Hashes computed: %'llu; fetched from tags: %'llu; mismatches: %'llu\n",Program_name,Hashes_computed,Hashes_fetched,Hash_fail);
}
}
void sig_handler(int sig){
printf("%s: Signal %d\n",Program_name,sig);
print_stats();
exit(1);
}