Skip to content

Commit

Permalink
Use shorter (base-36) encoding of outputPath digest
Browse files Browse the repository at this point in the history
This shortens branch names and may make them more readable and easier to
say aloud: "483-s-b-n-t-pil" is easier to pronounce than "82dd7da9dfe" and
contains more data.
  • Loading branch information
arielshaqed committed Oct 31, 2022
1 parent 0ed7ef0 commit 34174b0
Showing 1 changed file with 3 additions and 2 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.math.BigInteger;
import java.nio.charset.Charset;

// TODO(ariels): For Hadoop 3, it is enough (and better!) to extend PathOutputCommitter.
Expand Down Expand Up @@ -94,9 +95,9 @@ public class DummyOutputCommitter extends FileOutputCommitter {
protected String pathToBranch(Path p) {
String path = p.toString();
// TODO(ariels): Use a more compact encoding (base-36?)
String digest = hash.hashString(path, utf8).toString();
String digest = new BigInteger(hash.hashString(path, utf8).asBytes()).toString(36);
String pathPrefix = path.length() > 128 ? path.substring(0, 128) : path;
pathPrefix = pathPrefix.replaceAll("[^-_a-zA-Z0-9]", "-")
pathPrefix = pathPrefix.replaceAll("[^-_a-zA-Z0-9]", "-");
return String.format("%s-%s-%s", branchNamePrefix, digest, pathPrefix);
}

Expand Down

0 comments on commit 34174b0

Please sign in to comment.