-
Notifications
You must be signed in to change notification settings - Fork 8
/
link-by-hash.diff
426 lines (411 loc) · 15.7 KB
/
link-by-hash.diff
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
Jason M. Felice wrote:
This patch adds the --link-by-hash=DIR option, which hard links received files
in a link farm arranged by MD4 or MD5 file hash. The result is that the system
will only store one copy of the unique contents of each file, regardless of the
file's name.
To use this patch, run these commands for a successful build:
patch -p1 <patches/link-by-hash.diff
./prepare-source
./configure
make
based-on: 6c8ca91c731b7bf2b081694bda85b7dadc2b7aff
diff --git a/Makefile.in b/Makefile.in
--- a/Makefile.in
+++ b/Makefile.in
@@ -47,7 +47,7 @@ OBJS1=flist.o rsync.o generator.o receiver.o cleanup.o sender.o exclude.o \
util1.o util2.o main.o checksum.o match.o syscall.o log.o backup.o delete.o
OBJS2=options.o io.o compat.o hlink.o token.o uidlist.o socket.o hashtable.o \
usage.o fileio.o batch.o clientname.o chmod.o acls.o xattrs.o
-OBJS3=progress.o pipe.o @MD5_ASM@ @ROLL_SIMD@ @ROLL_ASM@
+OBJS3=progress.o pipe.o hashlink.o @MD5_ASM@ @ROLL_SIMD@ @ROLL_ASM@
DAEMON_OBJ = params.o loadparm.o clientserver.o access.o connection.o authenticate.o
popt_OBJS=popt/findme.o popt/popt.o popt/poptconfig.o \
popt/popthelp.o popt/poptparse.o
diff --git a/checksum.c b/checksum.c
--- a/checksum.c
+++ b/checksum.c
@@ -40,6 +40,8 @@ extern int whole_file;
extern int checksum_seed;
extern int protocol_version;
extern int proper_seed_order;
+extern char *link_by_hash_dir;
+extern char link_by_hash_extra_sum[MAX_DIGEST_LEN];
extern const char *checksum_choice;
#define NNI_BUILTIN (1<<0)
@@ -539,7 +541,7 @@ void file_checksum(const char *fname, const STRUCT_STAT *st_p, char *sum)
}
static int32 sumresidue;
-static md_context ctx_md;
+static md_context ctx_md, ctx2_md;
#ifdef SUPPORT_XXHASH
static XXH64_state_t* xxh64_state;
#endif
@@ -597,6 +599,8 @@ int sum_init(struct name_num_item *nni, int seed)
#endif
case CSUM_MD5:
md5_begin(&ctx_md);
+ if (link_by_hash_dir)
+ md5_begin(&ctx2_md);
break;
case CSUM_MD4:
mdfour_begin(&ctx_md);
@@ -643,6 +647,8 @@ void sum_update(const char *p, int32 len)
#endif
case CSUM_MD5:
md5_update(&ctx_md, (uchar *)p, len);
+ if (link_by_hash_dir)
+ md5_update(&ctx2_md, (uchar *)p, len);
break;
case CSUM_MD4:
case CSUM_MD4_OLD:
@@ -709,6 +715,8 @@ void sum_end(char *sum)
#endif
case CSUM_MD5:
md5_result(&ctx_md, (uchar *)sum);
+ if (link_by_hash_dir)
+ md5_result(&ctx2_md, (uchar *)link_by_hash_extra_sum);
break;
case CSUM_MD4:
case CSUM_MD4_OLD:
diff --git a/clientserver.c b/clientserver.c
--- a/clientserver.c
+++ b/clientserver.c
@@ -53,6 +53,7 @@ extern int logfile_format_has_i;
extern int logfile_format_has_o_or_i;
extern char *bind_address;
extern char *config_file;
+extern char *link_by_hash_dir;
extern char *logfile_format;
extern char *files_from;
extern char *tmpdir;
@@ -736,6 +737,9 @@ static int rsync_module(int f_in, int f_out, int i, const char *addr, const char
return -1;
}
+ if (*lp_link_by_hash_dir(i))
+ link_by_hash_dir = lp_link_by_hash_dir(i);
+
if (am_daemon > 0) {
rprintf(FLOG, "rsync allowed access on module %s from %s (%s)\n",
name, host, addr);
diff --git a/daemon-parm.txt b/daemon-parm.txt
--- a/daemon-parm.txt
+++ b/daemon-parm.txt
@@ -29,6 +29,7 @@ STRING hosts_deny NULL
STRING include NULL
STRING include_from NULL
STRING incoming_chmod NULL
+STRING link_by_hash_dir NULL
STRING lock_file DEFAULT_LOCK_FILE
STRING log_file NULL
STRING log_format "%o %h [%a] %m (%u) %f %l"
diff --git a/hashlink.c b/hashlink.c
new file mode 100644
--- /dev/null
+++ b/hashlink.c
@@ -0,0 +1,92 @@
+/*
+ Copyright (C) Cronosys, LLC 2004
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+*/
+
+/* This file contains code used by the --link-by-hash option. */
+
+#include "rsync.h"
+#include "inums.h"
+
+extern int protocol_version;
+extern char *link_by_hash_dir;
+extern char sender_file_sum[MAX_DIGEST_LEN];
+
+char link_by_hash_extra_sum[MAX_DIGEST_LEN]; /* Only used when md4 sums are in the transfer */
+
+#ifdef HAVE_LINK
+
+/* Hard-link a just-received file into the link_by_hash_dir farm as DIR/xxx/xxx/xxx/HASHREST.LENGTH.N.
+ * This function is always called after a file is received, so the sender_file_sum buffer has
+ * whatever the last checksum was for the transferred file. */
+void link_by_hash(const char *fname, const char *fnametmp, struct file_struct *file)
+{
+ STRUCT_STAT st;
+ char *hashname, *last_slash, *num_str;
+ const char *hex;
+ int num = 0;
+
+ /* We don't bother to hard-link 0-length files. */
+ if (F_LENGTH(file) == 0)
+ return;
+
+ hex = sum_as_hex(5, protocol_version >= 30 ? sender_file_sum : link_by_hash_extra_sum, 0); /* NOTE(review): 5 presumably selects MD5 -- confirm */
+ if (asprintf(&hashname, "%s/%.3s/%.3s/%.3s/%s.%s.000000", /* three 3-char shard dirs, rest of the hash, file length, counter placeholder */
+ link_by_hash_dir, hex, hex+3, hex+6, hex+9, big_num(F_LENGTH(file))) < 0)
+ {
+ out_of_memory("make_hash_name"); /* NOTE(review): label says "make_hash_name" but this function is link_by_hash */
+ }
+
+ last_slash = strrchr(hashname, '/');
+ num_str = strrchr(last_slash, '.') + 1; /* start of the trailing "000000" counter placeholder */
+
+ while (1) {
+ if (num >= 999999) { /* Surely we'll never reach this... */
+ if (DEBUG_GTE(HASHLINK, 1))
+ rprintf(FINFO, "link-by-hash: giving up after \"%s\".\n", hashname);
+ goto cleanup;
+ }
+ if (num > 0 && DEBUG_GTE(HASHLINK, 1)) /* hashname still holds the previous (full) candidate here */
+ rprintf(FINFO, "link-by-hash: max link count exceeded, starting new file \"%s\".\n", hashname);
+
+ snprintf(num_str, 7, "%d", num++); /* write the counter over the placeholder: at most 6 digits + NUL here */
+ if (do_stat(hashname, &st) < 0)
+ break; /* stat failed: treat this name as unused and create it after the loop */
+
+ if (do_link(hashname, fnametmp) < 0) {
+ if (errno == EMLINK)
+ continue; /* existing entry is at its link limit: try the next counter */
+ rsyserr(FERROR, errno, "link \"%s\" -> \"%s\"", hashname, full_fname(fname));
+ } else {
+ if (DEBUG_GTE(HASHLINK, 2))
+ rprintf(FINFO, "link-by-hash (existing): \"%s\" -> %s\n", hashname, full_fname(fname));
+ robust_rename(fnametmp, fname, NULL, 0644); /* move the new link over fname; NOTE(review): result is not checked */
+ }
+
+ goto cleanup; /* linked to the existing entry, or failed with a non-EMLINK error: done either way */
+ }
+
+ if (DEBUG_GTE(HASHLINK, 2))
+ rprintf(FINFO, "link-by-hash (new): %s -> \"%s\"\n", full_fname(fname), hashname);
+
+ if (do_link(fname, hashname) < 0 /* first entry for this hash+length: link fname into the farm */
+ && (errno != ENOENT || make_path(hashname, MKP_DROP_NAME) < 0 || do_link(fname, hashname) < 0)) /* on ENOENT, try make_path() and retry the link once */
+ rsyserr(FERROR, errno, "link \"%s\" -> \"%s\"", full_fname(fname), hashname);
+
+ cleanup:
+ free(hashname);
+}
+#endif
diff --git a/options.c b/options.c
--- a/options.c
+++ b/options.c
@@ -173,6 +173,7 @@ char *backup_suffix = NULL;
char *tmpdir = NULL;
char *partial_dir = NULL;
char *basis_dir[MAX_BASIS_DIRS+1];
+char *link_by_hash_dir = NULL;
char *config_file = NULL;
char *shell_cmd = NULL;
char *logfile_name = NULL;
@@ -231,7 +232,7 @@ static const char *debug_verbosity[] = {
/*2*/ "BIND,CMD,CONNECT,DEL,DELTASUM,DUP,FILTER,FLIST,ICONV",
/*3*/ "ACL,BACKUP,CONNECT2,DELTASUM2,DEL2,EXIT,FILTER2,FLIST2,FUZZY,GENR,OWN,RECV,SEND,TIME",
/*4*/ "CMD2,DELTASUM3,DEL3,EXIT2,FLIST3,ICONV2,OWN2,PROTO,TIME2",
- /*5*/ "CHDIR,DELTASUM4,FLIST4,FUZZY2,HASH,HLINK",
+ /*5*/ "CHDIR,DELTASUM4,FLIST4,FUZZY2,HASH,HASHLINK,HLINK",
};
#define MAX_VERBOSITY ((int)(sizeof debug_verbosity / sizeof debug_verbosity[0]) - 1)
@@ -302,6 +303,7 @@ static struct output_struct debug_words[COUNT_DEBUG+1] = {
DEBUG_WORD(FUZZY, W_REC, "Debug fuzzy scoring (levels 1-2)"),
DEBUG_WORD(GENR, W_REC, "Debug generator functions"),
DEBUG_WORD(HASH, W_SND|W_REC, "Debug hashtable code"),
+ DEBUG_WORD(HASHLINK, W_REC, "Debug hashlink code (levels 1-2)"),
DEBUG_WORD(HLINK, W_SND|W_REC, "Debug hard-link actions (levels 1-3)"),
DEBUG_WORD(ICONV, W_CLI|W_SRV, "Debug iconv character conversions (levels 1-2)"),
DEBUG_WORD(IO, W_CLI|W_SRV, "Debug I/O routines (levels 1-4)"),
@@ -582,7 +584,7 @@ enum {OPT_SERVER = 1000, OPT_DAEMON, OPT_SENDER, OPT_EXCLUDE, OPT_EXCLUDE_FROM,
OPT_INCLUDE, OPT_INCLUDE_FROM, OPT_MODIFY_WINDOW, OPT_MIN_SIZE, OPT_CHMOD,
OPT_READ_BATCH, OPT_WRITE_BATCH, OPT_ONLY_WRITE_BATCH, OPT_MAX_SIZE,
OPT_NO_D, OPT_APPEND, OPT_NO_ICONV, OPT_INFO, OPT_DEBUG, OPT_BLOCK_SIZE,
- OPT_USERMAP, OPT_GROUPMAP, OPT_CHOWN, OPT_BWLIMIT, OPT_STDERR,
+ OPT_USERMAP, OPT_GROUPMAP, OPT_CHOWN, OPT_BWLIMIT, OPT_STDERR, OPT_LINK_BY_HASH,
OPT_OLD_COMPRESS, OPT_NEW_COMPRESS, OPT_NO_COMPRESS, OPT_OLD_ARGS,
OPT_STOP_AFTER, OPT_STOP_AT,
OPT_REFUSED_BASE = 9000};
@@ -743,6 +745,7 @@ static struct poptOption long_options[] = {
{"compare-dest", 0, POPT_ARG_STRING, 0, OPT_COMPARE_DEST, 0, 0 },
{"copy-dest", 0, POPT_ARG_STRING, 0, OPT_COPY_DEST, 0, 0 },
{"link-dest", 0, POPT_ARG_STRING, 0, OPT_LINK_DEST, 0, 0 },
+ {"link-by-hash", 0, POPT_ARG_STRING, 0, OPT_LINK_BY_HASH, 0, 0},
{"fuzzy", 'y', POPT_ARG_NONE, 0, 'y', 0, 0 },
{"no-fuzzy", 0, POPT_ARG_VAL, &fuzzy_basis, 0, 0, 0 },
{"no-y", 0, POPT_ARG_VAL, &fuzzy_basis, 0, 0, 0 },
@@ -990,6 +993,9 @@ static void set_refuse_options(void)
ref = cp + 1;
}
+ if (*lp_link_by_hash_dir(module_id))
+ parse_one_refuse_match(0, "link-by-hash", list_end);
+
if (am_daemon) {
#ifdef ICONV_OPTION
if (!*lp_charset(module_id))
@@ -1867,6 +1873,20 @@ int parse_arguments(int *argc_p, const char ***argv_p)
goto cleanup;
#endif
+ case OPT_LINK_BY_HASH:
+#ifdef HAVE_LINK
+ arg = poptGetOptArg(pc);
+ if (sanitize_paths)
+ arg = sanitize_path(NULL, arg, NULL, 0, SP_DEFAULT);
+ link_by_hash_dir = (char *)arg;
+ break;
+#else
+ snprintf(err_buf, sizeof err_buf,
+ "hard links are not supported on this %s\n",
+ am_server ? "server" : "client");
+ return 0;
+#endif
+
case OPT_STOP_AFTER: {
long val;
arg = poptGetOptArg(pc);
@@ -2252,6 +2272,8 @@ int parse_arguments(int *argc_p, const char ***argv_p)
tmpdir = sanitize_path(NULL, tmpdir, NULL, 0, SP_DEFAULT);
if (backup_dir)
backup_dir = sanitize_path(NULL, backup_dir, NULL, 0, SP_DEFAULT);
+ if (link_by_hash_dir)
+ link_by_hash_dir = sanitize_path(NULL, link_by_hash_dir, NULL, 0, SP_DEFAULT);
}
if (daemon_filter_list.head && !am_sender) {
filter_rule_list *elp = &daemon_filter_list;
@@ -2941,6 +2963,12 @@ void server_options(char **args, int *argc_p)
args[ac++] = "--no-W";
}
+ if (link_by_hash_dir && am_sender) {
+ args[ac++] = "--link-by-hash";
+ args[ac++] = link_by_hash_dir;
+ link_by_hash_dir = NULL; /* optimize sending-side checksums */
+ }
+
if (files_from && (!am_sender || filesfrom_host)) {
if (filesfrom_host) {
args[ac++] = "--files-from";
diff --git a/rsync.1.md b/rsync.1.md
--- a/rsync.1.md
+++ b/rsync.1.md
@@ -510,6 +510,7 @@ has its own detailed description later in this manpage.
--compare-dest=DIR also compare destination files relative to DIR
--copy-dest=DIR ... and include copies of unchanged files
--link-dest=DIR hardlink to files in DIR when unchanged
+--link-by-hash=DIR create hardlinks by hash into DIR
--compress, -z compress file data during the transfer
--compress-choice=STR choose the compression algorithm (aka --zc)
--compress-level=NUM explicitly set compression level (aka --zl)
@@ -2720,6 +2721,50 @@ expand it.
this bug by avoiding the `-o` option (or using `--no-o`) when sending to an
old rsync.
+0. `--link-by-hash=DIR`
+
+ This option hard links the destination files into _DIR_, a link farm
+ arranged by MD5 file hash. The result is that the system will only store
+ (usually) one copy of the unique contents of each file, regardless of the
+ file's name (it will use extra files if the links overflow the available
+ maximum).
+
+ This patch does not take into account file permissions, extended
+ attributes, or ACLs when linking things together, so you should only use
+ this if you don't care about preserving those extra file attributes (or if
+ they are always the same for identical files).
+
+ The _DIR_ is relative to the destination directory, so either specify a full
+ path to the hash hierarchy, or specify a relative path that puts the links
+ outside the destination (e.g. "../links").
+
+ Keep in mind that the hierarchy is never pruned, so if you need to reclaim
+ space, you should remove any files that have just one link (since they are
+ not linked into any destination dirs anymore):
+
+ > find $DIR -links 1 -delete
+
+ The link farm's directory hierarchy is determined by the file's (32-char)
+ MD5 hash and the file-length. The hash is split up into directory shards.
+ For example, if a file is 54321 bytes long, it could be stored like this:
+
+ > $DIR/123/456/789/01234567890123456789012.54321.0
+
+ Note that the directory layout in this patch was modified for version
+ 3.1.0, so anyone using an older version of this patch should move their
+ existing link hierarchy out of the way and then use the newer rsync to copy
+ the saved hierarchy into its new layout. Assuming that no files have
+ overflowed their link limits, this would work:
+
+ > mv $DIR $DIR.old
+ > rsync -aiv --link-by-hash=$DIR $DIR.old/ $DIR.tmp/
+ > rm -rf $DIR.tmp
+ > rm -rf $DIR.old
+
+ If some of your files are at their link limit, you'd be better off using a
+ script to calculate the md5 sum of each file in the hierarchy and move it
+ to its new location.
+
0. `--compress`, `-z`
With this option, rsync compresses the file data as it is sent to the
diff --git a/rsync.c b/rsync.c
--- a/rsync.c
+++ b/rsync.c
@@ -52,6 +52,7 @@ extern int flist_eof;
extern int file_old_total;
extern int keep_dirlinks;
extern int make_backups;
+extern char *link_by_hash_dir;
extern int sanitize_paths;
extern struct file_list *cur_flist, *first_flist, *dir_flist;
extern struct chmod_mode_struct *daemon_chmod_modes;
@@ -760,6 +761,10 @@ int finish_transfer(const char *fname, const char *fnametmp,
}
if (ret == 0) {
/* The file was moved into place (not copied), so it's done. */
+#ifdef HAVE_LINK
+ if (link_by_hash_dir)
+ link_by_hash(fname, fnametmp, file);
+#endif
return 1;
}
/* The file was copied, so tweak the perms of the copied file. If it
diff --git a/rsync.h b/rsync.h
--- a/rsync.h
+++ b/rsync.h
@@ -1446,7 +1446,8 @@ extern short info_levels[], debug_levels[];
#define DEBUG_FUZZY (DEBUG_FLIST+1)
#define DEBUG_GENR (DEBUG_FUZZY+1)
#define DEBUG_HASH (DEBUG_GENR+1)
-#define DEBUG_HLINK (DEBUG_HASH+1)
+#define DEBUG_HASHLINK (DEBUG_HASH+1)
+#define DEBUG_HLINK (DEBUG_HASHLINK+1)
#define DEBUG_ICONV (DEBUG_HLINK+1)
#define DEBUG_IO (DEBUG_ICONV+1)
#define DEBUG_NSTR (DEBUG_IO+1)
diff --git a/rsyncd.conf.5.md b/rsyncd.conf.5.md
--- a/rsyncd.conf.5.md
+++ b/rsyncd.conf.5.md
@@ -388,6 +388,23 @@ in the values of parameters. See that section for details.
is 0, which means no limit. A negative value disables the module. See
also the "[lock file](#)" parameter.
+0. `link by hash dir`
+
+ When the "link by hash dir" parameter is set to a non-empty string,
+ received files will be hard linked into **DIR**, a link farm arranged by
+ MD5 file hash. See the `--link-by-hash` option for a full explanation.
+
+ The **DIR** must be accessible inside any chroot restrictions for the
+ module, but can exist outside the transfer location if there is an
+ inside-the-chroot path to the module (see "use chroot"). Note that a
+ user-specified option does not allow this outside-the-transfer-area
+ placement.
+
+ If this parameter is set, it will disable the `--link-by-hash` command-line
+ option for copies into the module.
+
+ The default is for this parameter to be unset.
+
0. `log file`
When the "log file" parameter is set to a non-empty string, the rsync