refresh-measurements

#!/usr/bin/env python3

# TODO: make sure we measure non-computed metrics that are not part of any
# chart, if those metrics are then used for computed metrics that _are_ part of at least one chart

import os
import sys
import json
import stat
import time
import copy
import shutil
import random
import tempfile
import argparse
import datetime
import subprocess

SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__))
CONFIG_DIR = "~/.git-source-metrics/"
CODE_CACHE_DIR = "~/.git-source-metrics/code/"

def test_human_readable_time_span():
    # normal use cases
    assert human_readable_time_span(1) == "1s"
    assert human_readable_time_span(36) == "36s"
    assert human_readable_time_span(59) == "59s"
    assert human_readable_time_span(60) == "1m"
    assert human_readable_time_span(61) == "1m1s"
    assert human_readable_time_span(119) == "1m59s"
    assert human_readable_time_span(120) == "2m"
    assert human_readable_time_span(121) == "2m1s"
    assert human_readable_time_span(3599) == "59m59s"
    assert human_readable_time_span(3600) == "1h"
    assert human_readable_time_span(3601) == "1h1s"
    assert human_readable_time_span(3660) == "1h1m"
    assert human_readable_time_span(3661) == "1h1m1s"
    assert human_readable_time_span(314159265) == "3636d2h27m45s"
    assert human_readable_time_span(4815162342) == "55731d1h5m42s"

    # edge cases
    assert human_readable_time_span(0) == "0s"
    assert human_readable_time_span(-1) == "-1s"
    assert human_readable_time_span(-3661) == "-1h1m1s"
    assert human_readable_time_span(sys.maxsize-1) == "106751991167300d15h30m6s"
    assert human_readable_time_span(sys.maxsize) == "106751991167300d15h30m7s"
    assert human_readable_time_span(sys.maxsize+1) == "106751991167300d15h30m8s"
    assert human_readable_time_span(-sys.maxsize) == "-106751991167300d15h30m7s"
    assert human_readable_time_span(-sys.maxsize-1) == "-106751991167300d15h30m8s"
    assert human_readable_time_span(-sys.maxsize-2) == "-106751991167300d15h30m9s"

    # invalid input
    # n/a


def human_readable_time_span(seconds):
    if seconds < 0:
        return "-" + human_readable_time_span(-1*seconds)

    output = ""
    if seconds == 0 or seconds % 60 != 0:
        output = str(int(seconds % 60)) + "s"

    minutes = seconds // 60
    if minutes % 60 != 0:
        output = str(int(minutes % 60)) + "m" + output

    hours = minutes // 60
    if hours % 24 != 0:
        output = str(int(hours % 24)) + "h" + output

    days = hours // 24
    if days != 0:
        output = str(int(days)) + "d" + output

    return output


def add_to_path_if_exists(target_env, folder):
    if os.path.exists(folder):
        if args.verbose:
            print("[DEBUG] add '%s' to PATH" % folder)
        target_env["PATH"] = target_env["PATH"] + ":" + folder


def prepend_to_path_if_exists(target_env, folder):
    if os.path.exists(folder):
        if args.verbose:
            print("[DEBUG] preprend '%s' to PATH" % folder)
        target_env["PATH"] = folder + ":" + target_env["PATH"]


def local_workingdir(git_url):
    code_cache_dir = os.path.expanduser(CODE_CACHE_DIR)
    if not os.path.exists(code_cache_dir):
        os.makedirs(code_cache_dir)
    working_dirname = ''.join(list(filter(str.isalnum, git_url)))
    working_dir_path = os.path.join(code_cache_dir, working_dirname)
    return working_dir_path


def spawn_git_fetcher(git_url):
    git_workdir = local_workingdir(git_url)
    if not os.path.exists(git_workdir):
        git_proc = subprocess.Popen("git clone %s %s" % (git_url, git_workdir), stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
    else:
        git_proc = subprocess.Popen("git fetch -q", stdout=subprocess.PIPE, stderr=subprocess.PIPE, cwd=git_workdir, shell=True)
    return (git_url, git_proc)


# Launch one git process per repo (in parallel) to fetch the latest
# code and then wait for all the processes to finish.
def fetch_latest_code(all_git_urls):
    longest_git_url_length = max([len(git_url) for git_url in all_git_urls])
    remaining_procs = [spawn_git_fetcher(git_url) for git_url in all_git_urls]
    if not args.quiet:
        print("Fetching the latest code from all repositories ...", file=sys.stderr)
    while len(remaining_procs) > 0:
        for (git_url, git_proc) in list(remaining_procs):
            git_proc.poll()
            if git_proc.returncode != None:
                remaining_procs.remove((git_url, git_proc))
                if git_proc.returncode == 0:
                    if not args.quiet:
                        print("Finished fetching the latest code from repo: " + git_url.ljust(longest_git_url_length + 1), end=' ', file=sys.stderr)
                else:
                    print(git_proc.stdout.read().strip(), file=sys.stderr)
                    print(git_proc.stderr.read().strip(), file=sys.stderr)
                    print("ERROR: Failed (see error above) to fetch the latest code from repo: " + git_url.ljust(longest_git_url_length + 1), end=' ', file=sys.stderr)
                if not args.quiet and remaining_procs:
                    print("(still waiting for %d more git processes to finish)" % len(remaining_procs), end=' ', file=sys.stderr)
                if not args.quiet:
                    print("", file=sys.stderr)
        time.sleep(1)


def git_reset_to(repo_name, git_workdir, sha1):
    if not args.quiet:
        print("%s: git cleaning" % repo_name, file=sys.stderr)
    subprocess.check_call("git clean -q -dxff", cwd=git_workdir, shell=True)
    if not args.quiet:
        print("%s: git resetting to %s" % (repo_name, sha1), file=sys.stderr)
    subprocess.check_call("git reset -q --hard %s" % sha1, cwd=git_workdir, shell=True)


def get_sparse_commits(repo_name, repo_config):
    branch = repo_config["branch"]
    git_url = repo_config["git_url"]
    git_workdir = local_workingdir(git_url)

    treepath = repo_config.get("treepath", "all")
    if treepath == 'first-parent':
        git_log_cmd = "git log --first-parent --pretty=format:'%H %ct %s' " + branch
    elif treepath == 'topo-order':
        git_log_cmd = "git log --topo-order --pretty=format:'%H %ct %s' " + branch
    elif treepath == 'author-date-order':
        git_log_cmd = "git log --author-date-order --pretty=format:'%H %ct %s' " + branch
    elif treepath == 'date-order':
        git_log_cmd = "git log --date-order --pretty=format:'%H %ct %s' " + branch
    elif treepath == 'commit-date-chronological':
        git_log_cmd = "git log --date-order --pretty=format:'%H %ct %s' " + branch + " | sort -k2,2 -rn"
    elif treepath == 'ancestry-path-to-oldest-root':
        git_log_cmd = "git log --ancestry-path --pretty=format:'%H %ct %s' $(git log --max-parents=0 HEAD --pretty=format:'%H %ct' | sort -k 2,2 -n | head -1 | cut -f 1 -d ' ').." + branch
    else:
        git_log_cmd = "git log --pretty=format:'%H %ct %s' " + branch
    if not args.quiet:
        print("git_log_cmd: " + git_log_cmd)
    try:
        # NOTE: We use .split("\n") rather than .splitlines() here because the latter
        # splits on \n, \r and \r\n and git allows \r in commit subject lines.
        git_log_output = subprocess.check_output(git_log_cmd, cwd=git_workdir, shell=True).decode("utf-8").strip().split("\n")
    except subprocess.CalledProcessError as e:
        print("ERROR: could not list commits on branch %s for git_url %s" % (branch, git_url), file=sys.stderr)
        sys.exit(1)

    all_xticks = []
    last_tick_commit_date = datetime.datetime.utcfromtimestamp(0)
    commit_date_limit = datetime.datetime.min
    if args.ignore_commits_before:
        if len(args.ignore_commits_before) == 4:
            commit_date_limit = datetime.datetime.strptime(args.ignore_commits_before, "%Y")
        else:
            commit_date_limit = datetime.datetime.strptime(args.ignore_commits_before, "%Y-%m-%d")
    else:
        repo_start_date = repo_config.get("start_date", None)
        if repo_start_date:
            commit_date_limit = datetime.datetime.strptime(repo_start_date, "%Y-%m-%d")

    tick_gap_days = repo_config.get("tick_gap_days", None)
    if args.tick_gap_days == 0:
        tick_gap_days = None
    elif args.tick_gap_days > 0:
        tick_gap_days = args.tick_gap_days

    bad_commits = []
    while len(git_log_output) > 0:
        log_line = git_log_output.pop(-1)
        sha1, commit_timestamp, subject = log_line.split(" ", 2)
        if not commit_timestamp:
            if not args.quiet:
                # For example 281a10c and e589091 etc in https://github.com/mozilla/gecko-dev
                # git fails to print --pretty=format:"%ct %H" properly even though
                # the commits have timestamps. Seems like git 1.7.10.4 has a problem
                # with these and git 2.1.0 handles it fine. Relating to empty %cn maybe?
                # I couldn't find a way to create a commit with empty author name (like the one
                # above) so maybe it's a bug in the mercurial to git conversion tool that
                # mozilla uses.
                print("WARNING: failed to parse 'sha1 commit_timestamp subject' parts (empty commit_timestamp) from git log output; sha1=%s repo=%s and line=%s" % (sha1, repo_name, log_line))
            continue
        try:
            commit_date = datetime.datetime.fromtimestamp(int(commit_timestamp))
        except ValueError:
            print("WARNING: failed to parse 'sha1 commit_timestamp subject' parts (non-numerical commit_timestamp) from git log output; sha1=%s repo=%s and line=%s" % (sha1, repo_name, log_line))
            continue
        if commit_date < commit_date_limit:
            if not args.quiet:
                print("INFO: commit with sha1 %s in repo %s is too old, ignoring." % (sha1, repo_name))
            continue
        if commit_date < last_tick_commit_date:
            bad_commits.append(sha1)
            continue
        if not tick_gap_days or args.all_commits or (commit_date - last_tick_commit_date).days >= tick_gap_days:
            all_xticks.append((sha1, commit_timestamp, subject))
            last_tick_commit_date = commit_date

    if len(bad_commits) > 0 and not args.quiet:
        print("WARNING: the following commits in repo %s had a commit date which is earlier than the commit date of one of its first parent commits; ignoring:" % repo_name)
        print("\n".join(bad_commits[:10]) + "\n...\n" + "\n".join(bad_commits[-10:]))
        print()

    return all_xticks


def get_metric_type(config, metric_name):
    return config["metrics"][metric_name]["type"]


def compute_moving_average(metric, repo_name, target_sha1, commit_timestamp, selected_commits, measured_data):
    # "tick_width" == 1 would make the moving average series identical to the
    # underlying metric so it should always be 2 or higher (often higher).
    tick_width = metric["moving_average_tick_width"]
    underlying_metric_name = metric["underlying_metric"]
    selected_sha1s = [sha1 for (sha1, _, _) in selected_commits]
    # "underlying_data" will be a dict that maps sha1 to { "timestamp": X, "value": Y }
    underlying_data = measured_data.get_repo_metric_dict(repo_name, underlying_metric_name)
    # To compute the "moving average" value at target_sha1 we take the value of
    # the underlying metric at target_sha1 as well as the values of the underlying
    # metric at the "tick_width-1" ticks before "target_sha1", and then we calculate
    # the average of these values.
    target_index = selected_sha1s.index(target_sha1)
    first_index = target_index - tick_width + 1
    # It's not possible to calculate a moving average unless we're at least
    # "tick_width" ticks into the series.
    if first_index < 0:
        return None
    summation = sum([underlying_data[sha1]["value"] for sha1 in selected_sha1s[first_index:target_index+1]])
    return summation / tick_width

def compute_metric(metric, repo_name, sha1, commit_timestamp, selected_commits, measured_data):
    metric_type = metric["type"]
    if metric_type == "moving_average":
        return compute_moving_average(metric, repo_name, sha1, commit_timestamp, selected_commits, measured_data)
    else:
        print("ERROR: config.pyon contains invalid metric type '%s'" % metric_type)
        sys.exit(1)


def measure_metric(metric, target, git_workdir, sha1, timestamp, branch):
    cmd_params_dict = {
        "sha1": sha1,
        "timestamp": timestamp,
        "target": target,
        "branch": branch,
    }
    metric_command = metric["cmd"] % cmd_params_dict
    if args.verbose:
        print("DEBUG: metric command is " + repr(metric_command), file=sys.stderr)

    child_env = os.environ.copy()
    # Prepend (rather than append) "builtin-metric-scripts" to $PATH because we want
    # measurements to have as little dependency on the host machine as possible.
    prepend_to_path_if_exists(child_env, os.path.join(SCRIPT_DIR, "builtin-metric-scripts"))
    external_scripts_root = os.path.join(SCRIPT_DIR, "../external-metric-scripts")
    # If your metrics require scripts for different git repositories you can add
    # each such git repo as a submodule under the "external-metric-scripts" dir
    # and every dir in every such repo will be put on the $PATH during
    # measurement. You can also add metric scripts directly in
    # "external-metric-scripts" without using a git submodule if you want.
    if os.path.isdir(external_scripts_root):
        for dirpath, _, _ in os.walk(external_scripts_root, followlinks=True):
            prepend_to_path_if_exists(child_env, dirpath)
    # Don't prepend this because it has host specific dependencies, it's added
    # only as a convenience.
    add_to_path_if_exists(child_env, "/usr/local/bin")

    child_env = os.environ.copy()
    external_metric_scripts_dir = os.path.join(SCRIPT_DIR, "..", "external-metric-scripts")
    if os.path.exists(external_metric_scripts_dir):
        child_env["PATH"] = external_metric_scripts_dir + ":" + child_env["PATH"]
    child_env["PATH"] = os.path.join(SCRIPT_DIR, "builtin-metric-scripts") + ":" + child_env["PATH"]

    if args.verbose:
        print("DEBUG: cwd was " + repr(git_workdir), file=sys.stderr)

    command_time_begin = time.time()
    proc = subprocess.Popen(metric_command, cwd=git_workdir, env=child_env, shell=True, stdin=subprocess.PIPE, stdout=subprocess.PIPE)
    stdout_output, stderr_output = proc.communicate()
    command_secs_taken = int(time.time() - command_time_begin)

    stdout_output = stdout_output.strip()
    stderr_output = stderr_output.strip() if stderr_output else None

    if args.verbose:
        print("DEBUG: command took " + human_readable_time_span(command_secs_taken) + " to run", file=sys.stderr)
        print("DEBUG: stdout was " + repr(stdout_output), file=sys.stderr)
        print("DEBUG: stderr was " + repr(stderr_output), file=sys.stderr)

    try:
        value = int(stdout_output)
    except ValueError:
        value = None
        errmsg = ("ERROR: measurement failed because metric script output could not be converted to an int value\n" + \
                  "  sha1: " + sha1 + "\n" + \
                  "  metric name: " + metric["name"] + "\n" + \
                  "  metric cmd: " + metric_command + "\n" + \
                  "  metric script output: " + repr(stdout_output) + "\n")
        sys.stderr.write(errmsg)
    return value


# While this script is running it saves timestamps at various moments so that
# it can print a summary in the end of how long various stages took etc.
class TimingData(object):
    def __init__(self):
        self.fetch_start_time = None
        self.fetch_end_time = None
        self.refresh_start_time = None
        self.refresh_end_time = None


class MeasuredData(object):
    def __init__(self):
        # datapoints is a dict in which repo_name   maps to a dict,
        #                      in which metric_name maps to a dict,
        #                      in which sha1        maps to a dict,
        #                      that contains "timestamp" and "value".
        self.datapoints = {}

    def load_from_file(self, filename):
        self.datapoints = eval(open(filename).read())

    def save_to_file(self, filename):
        string_data = json.dumps(self.datapoints)
        file_dir = os.path.dirname(filename)
        if not os.path.exists(file_dir):
            os.makedirs(file_dir)
        with open(filename, "w") as fil:
            fil.write(string_data + "\n")

    def clear_data_for_repo(self, repo_name):
        self.datapoints.pop(repo_name, None)

    def clear_data_for_repo_metric(self, repo_name, target_metric_name):
        repo_data = self.datapoints.get(repo_name)
        if repo_data:
            for datapoint_type in list(repo_data.keys()):
                metric_name = datapoint_type.split(":", 1)[0]
                if metric_name == target_metric_name:
                    repo_data.pop(datapoint_type, None)

    def get_repo_metric_dict(self, repo_name, metric_name):
        return self.datapoints.setdefault(repo_name, {}).setdefault(metric_name, {})

    def contains_data_for(self, repo_name, sha1, metric_name):
        return sha1 in self.get_repo_metric_dict(repo_name, metric_name)

    def add_data(self, repo_name, sha1, timestamp, metric_name, value):
        metric_dict = self.get_repo_metric_dict(repo_name, metric_name)
        datapoint = {
            "timestamp": timestamp,
            "value": value,
        }
        metric_dict[sha1] = datapoint


def save_data_jscopy(config, selected_commits, measured_data, jsdata_filename):
    all_charts = config["charts"]
    all_metrics = config["metrics"]
    jsdata_tempfile_oshandle, jsdata_tempfile_name = tempfile.mkstemp()
    with open(jsdata_tempfile_name, "w") as jsdata_temp_file:
        jsdata_temp_file.write("const all_chart_data = {\n")
        for chart_name, chart_config in all_charts.items():
            jsdata_temp_file.write("    '%s': {\n" % chart_name)
            jsdata_temp_file.write("        chart_title: '%s',\n" % chart_config["chart_title"])
            jsdata_temp_file.write("        chart_series: [\n")
            for series in chart_config["chart_series"]:
                jsdata_temp_file.write("        {\n")
                target = series.get("target", None)
                title_format_dict = {
                    "target": target,
                }
                metric_title = all_metrics[series["metric"]]["title"].replace("'", r"\'") % title_format_dict
                series_title = metric_title
                jsdata_temp_file.write("            name: '%s in %s',\n" % (series_title, series["repo"]))
                if "color" in series:
                    jsdata_temp_file.write("            color: '%s',\n" % series["color"])
                jsdata_temp_file.write("            repo: '%s',\n" % series["repo"])
                jsdata_temp_file.write("            data: [")
                datapoint_type = get_datapoint_type(series["metric"], target)
                series_data = measured_data.get_repo_metric_dict(series["repo"], datapoint_type)
                series_data = sorted(list(series_data.items()), key=lambda sha1_datapoint1: int(sha1_datapoint1[1]["timestamp"]))
                if not args.include_all_measured_data:
                    # TODO: try zip(*foo) here
                    repo_selected_sha1s = [sha1 for (sha1, commit_timestamp, msg) in selected_commits[series["repo"]]]
                    series_data = [sha1_datapoint for sha1_datapoint in series_data if sha1_datapoint[0] in repo_selected_sha1s]
                for idx, (sha1, datapoint) in enumerate(series_data):
                    if idx:
                        jsdata_temp_file.write("                     ")
                    jsdata_temp_file.write("{ x:%s, y:%s },\n" % (datapoint["timestamp"], datapoint["value"]))
                jsdata_temp_file.write("            ],\n")
                jsdata_temp_file.write("        },\n")
            jsdata_temp_file.write("        ],\n")
            if "annotations" in chart_config:
                jsdata_temp_file.write("        chart_annotations: {\n")
                for annotation_set_name in chart_config["annotations"]:
                    annotations = config["annotation_sets"][annotation_set_name]
                    for date, description in annotations.items():
                        timestamp = datetime.datetime.strptime(date, "%Y-%m-%d").strftime("%s")
                        jsdata_temp_file.write("            '%s': %s,\n" % (timestamp, json.dumps(description)))
                jsdata_temp_file.write("        },\n")
            jsdata_temp_file.write("    },\n")
        jsdata_temp_file.write("}\n")

        jsdata_temp_file.write("\n")
        jsdata_temp_file.write("const selected_commits = {\n")
        for repo_name, repo_selected_commits in selected_commits.items():
            jsdata_temp_file.write("\t'%s': [\n" % repo_name)
            last_commit_timestamp = None
            for sha1, commit_timestamp, msg in repo_selected_commits:
                if commit_timestamp != last_commit_timestamp:
                    if last_commit_timestamp != None:
                        jsdata_temp_file.write("]],\n")
                    jsdata_temp_file.write("\t\t[%s, [" % commit_timestamp)
                else:
                    jsdata_temp_file.write("\n,\t\t\t")
                jsdata_temp_file.write("{sha1: '%s', msg: %s}" % (sha1, json.dumps(msg)))
                last_commit_timestamp = commit_timestamp
            jsdata_temp_file.write("]],\n")
            jsdata_temp_file.write("\t],\n")
        jsdata_temp_file.write("}\n")

    # when file is written, move it into place
    os.close(jsdata_tempfile_oshandle)
    shutil.move(jsdata_tempfile_name, jsdata_filename)
    os.chmod(jsdata_filename, stat.S_IRUSR | stat.S_IWUSR | stat.S_IROTH)


def select_commit_ticks(config):
    selected_commits = {}
    for repo_name, repo_config in config["git_repos"].items():
        if args.skip_fetch and not os.path.exists(local_workingdir(repo_config["git_url"])):
            print("ERROR: --skip-fetch cannot be used with repositories that hasn't been cloned yet. Please re-run without --skip-fetch.")
            sys.exit(1)
        if not args.quiet:
            print("Making list of SHA1s to measure in repo: " + repo_name, file=sys.stderr)
        selected_commits[repo_name] = get_sparse_commits(repo_name, repo_config)
    return selected_commits


def get_datapoint_type(metric_name, target):
    if not target:
        return metric_name
    else:
        return metric_name + ":" + target


def refresh_measurements(config, selected_commits, data_filename, measured_data, jsdata_filename):
    repo_to_needed_measurements_map = {}
    for chart_name, chart in config["charts"].items():
        for series in chart["chart_series"]:
            if get_metric_type(config, series["metric"]) == "command":
                repo_to_needed_measurements_map.setdefault(series["repo"], set()).add((series["metric"], series.get("target", None)))

    todo_list = []
    for repo_name, repo_needed_measurements in repo_to_needed_measurements_map.items():
        for sha1, commit_timestamp, msg in selected_commits[repo_name]:
            measurements_needed_for_this_sha1 = []
            commit_datetime = datetime.datetime.fromtimestamp(int(commit_timestamp))
            for metric_name, target in repo_needed_measurements:
                if args.refresh_selected_metrics_only and metric_name not in args.refresh_selected_metrics_only:
                    continue
                too_old = False
                metric_start_date = config["metrics"][metric_name].get("start_date", None)
                if metric_start_date:
                    too_old = commit_datetime < datetime.datetime.strptime(metric_start_date, "%Y-%m-%d")

                datapoint_type = get_datapoint_type(metric_name, target)
                if not too_old and not measured_data.contains_data_for(repo_name, sha1, datapoint_type):
                    measurements_needed_for_this_sha1.append((metric_name, target))
            if len(measurements_needed_for_this_sha1) > 0:
                todo_list.append((repo_name, sha1, commit_timestamp, measurements_needed_for_this_sha1))

    if args.measurement_order == "random":
        random.shuffle(todo_list)
    elif args.measurement_order == "linear-newest-first":
        todo_list = sorted(todo_list, key=lambda todo_item: datetime.datetime.fromtimestamp(int(todo_item[2])), reverse=True)
    elif args.measurement_order == "linear-oldest-first":
        todo_list = sorted(todo_list, key=lambda todo_item: datetime.datetime.fromtimestamp(int(todo_item[2])))

    if args.max_measurements != -1:
        todo_list = todo_list[0:args.max_measurements]

    measurements_done = 0
    measurements_total = 0
    for (_, sha1, _, measurements_needed_for_this_sha1) in todo_list:
        measurements_total += len(measurements_needed_for_this_sha1)
    measurement_start_time = datetime.datetime.now()
    time_last_save = time.time()
    for (repo_name, sha1, timestamp, measurements_needed_for_this_sha1) in todo_list:
        git_workdir = local_workingdir(config["git_repos"][repo_name]["git_url"])

        atleast_one_metric_needs_a_reset_workdir = False
        for metric_name, target in measurements_needed_for_this_sha1:
            if config["metrics"][metric_name].get("workdir", True):
                atleast_one_metric_needs_a_reset_workdir = True
                break

        if atleast_one_metric_needs_a_reset_workdir:
            git_reset_to(repo_name, git_workdir, sha1)

        for metric_name, target in measurements_needed_for_this_sha1:
            measurements_done += 1
            if not args.quiet:
                measurement_elapsed_time = datetime.datetime.now() - measurement_start_time
                seconds_per_measurement = measurement_elapsed_time.total_seconds() / measurements_done
                estimated_seconds_left = (measurements_total-measurements_done) * seconds_per_measurement
                if target:
                    target_suffix = " for " + target
                else:
                    target_suffix = ""
                print_dict = {
                    "repo_name": repo_name,
                    "metric_name": metric_name,
                    "target_suffix": target_suffix,
                    "sha1": sha1[0:7],
                    "commit_date": datetime.datetime.fromtimestamp(int(timestamp)).strftime("%Y-%m-%d %H:%M"),
                    "percent_done": 100 * measurements_done / measurements_total,
                    "time_left": human_readable_time_span(estimated_seconds_left)
                }
                print("%(repo_name)s: measuring %(metric_name)s%(target_suffix)s at %(sha1)s from %(commit_date)s ... %(percent_done)d%% done, est. time left: %(time_left)s" % print_dict, file=sys.stderr)
            value = measure_metric(config["metrics"][metric_name], target, git_workdir, sha1, timestamp, config["git_repos"][repo_name]["branch"])
            if value != None:
                datapoint_type = get_datapoint_type(metric_name, target)
                measured_data.add_data(repo_name, sha1, timestamp, datapoint_type, value)


        if int(time.time() - time_last_save) > 5:
            print("Saving data to disk ...")
            measured_data.save_to_file(data_filename)
            save_data_jscopy(config, selected_commits, measured_data, jsdata_filename)
            time_last_save = time.time()

    measured_data.save_to_file(data_filename)
    save_data_jscopy(config, selected_commits, measured_data, jsdata_filename)


def refresh_computed_metrics(config, selected_commits, data_filename, measured_data, jsdata_filename):
    repo_to_computed_metrics_map = {}
    for chart_name, chart in config["charts"].items():
        for series in chart["chart_series"]:
            if get_metric_type(config, series["metric"]) != "command":
                repo_to_computed_metrics_map.setdefault(series["repo"], set()).add(series["metric"])

    todo_list = []
    for repo_name, repo_metrics in repo_to_computed_metrics_map.items():
        for sha1, commit_timestamp, msg in selected_commits[repo_name]:
            metrics_that_lack_data_for_this_sha1 = []
            commit_datetime = datetime.datetime.fromtimestamp(int(commit_timestamp))
            for metric_name in repo_metrics:
                if args.refresh_selected_metrics_only and metric_name not in args.refresh_selected_metrics_only:
                    continue
                too_old = False
                metric_start_date = config["metrics"][metric_name].get("start_date", None)
                if metric_start_date:
                    too_old = commit_datetime < datetime.datetime.strptime(metric_start_date, "%Y-%m-%d")

                if not too_old and not measured_data.contains_data_for(repo_name, sha1, metric_name):
                    metrics_that_lack_data_for_this_sha1.append(metric_name)
            if len(metrics_that_lack_data_for_this_sha1) > 0:
                todo_list.append((repo_name, sha1, commit_timestamp, metrics_that_lack_data_for_this_sha1))

    # TODO: delete or rename to computations
    measurements_done = 0
    measurements_total = 0
    for (_, sha1, _, metrics_that_lack_data_for_this_sha1) in todo_list:
        measurements_total += len(metrics_that_lack_data_for_this_sha1)
    measurement_start_time = datetime.datetime.now()
    for (repo_name, sha1, timestamp, metrics_that_lack_data_for_this_sha1) in todo_list:
        for metric_name in metrics_that_lack_data_for_this_sha1:
            measurements_done += 1
            if not args.quiet:
                measurement_elapsed_time = datetime.datetime.now() - measurement_start_time
                seconds_per_measurement = measurement_elapsed_time.total_seconds() / measurements_done
                estimated_seconds_left = (measurements_total-measurements_done) * seconds_per_measurement
                print("%s: computing %s (%s) ... %d%% done, est. time left: %s" % (repo_name, sha1, metric_name, 100*measurements_done/measurements_total, human_readable_time_span(estimated_seconds_left)), file=sys.stderr)
            value = compute_metric(config["metrics"][metric_name], repo_name, sha1, timestamp, selected_commits[repo_name], measured_data)
            if value != None:
                measured_data.add_data(repo_name, sha1, timestamp, metric_name, value)

        # TODO: probably not save every loop iteration here?
        measured_data.save_to_file(data_filename)
        save_data_jscopy(config, selected_commits, measured_data, jsdata_filename)
    else:
        save_data_jscopy(config, selected_commits, measured_data, jsdata_filename)


def handle_delete_and_list_switches(config, measured_data, data_filename):
    if args.delete_data_for_repo:
        repo_name = args.delete_data_for_repo
        if not repo_name in config["git_repos"]:
            print("ERROR: no such repo name in config")
            sys.exit(1)
        measured_data.clear_data_for_repo(repo_name)
        measured_data.save_to_file(data_filename)
        print("All data for repo '%s' deleted." % repo_name)
        sys.exit(0)

    if args.delete_data_for_metric:
        metric_name = args.delete_data_for_metric
        if not metric_name in config["metrics"]:
            print("ERROR: no such metric name in config")
            sys.exit(1)
        for repo_name, repo_config in config["git_repos"].items():
            measured_data.clear_data_for_repo_metric(repo_name, metric_name)
        measured_data.save_to_file(data_filename)
        print("All data for metric '%s' deleted." % metric_name)
        sys.exit(0)

    if args.list_repos:
        for repo_name in config["git_repos"]:
            print(repo_name)
        sys.exit(0)

    if args.list_metrics:
        for metric_name in config["metrics"]:
            print(metric_name)
        sys.exit(0)


def print_summary(timing_data):
    print()
    print("---[ Finished ]-----------------------------------------------------")
    print()

    if not args.skip_fetch:
        print("Fetch began: " + str(timing_data.fetch_start_time)[:16])
        print("Fetch finished: " + str(timing_data.fetch_end_time)[:16])
        fetch_seconds = (timing_data.fetch_end_time - timing_data.fetch_start_time).total_seconds()
        print("Fetch took: " + human_readable_time_span(fetch_seconds))
        print()

    print("Measurement began: " + str(timing_data.refresh_start_time)[:16])
    print("Measurement finished: " + str(timing_data.refresh_end_time)[:16])
    measurement_seconds = (timing_data.refresh_end_time - timing_data.refresh_start_time).total_seconds()
    print("Measurement took: " + human_readable_time_span(measurement_seconds))
    print()


def validate_config(config):
    for (metric_name, metric_obj) in config["metrics"].items():
        if ":" in metric_name:
            print("ERROR: metric names may not contain the character ':', found invalid metric name '" + metric_name + "'", file=sys.stderr)
            sys.exit(1)
    for chart_name, chart_config in config["charts"].items():
        for series in chart_config["chart_series"]:
            if series["repo"] == "*":
                if series.get("color", None):
                    print("ERROR: series with repo=\"*\" may not have a color attribute set (set color attributes on each repository instead), found invalid series in chart " + chart_name, file=sys.stderr)
                    sys.exit(1)
            if series["metric"] not in config["metrics"]:
                print("ERROR: invalid metric name '%(metric_name)s', found in chart %(chart_name)s " % { "metric_name": series['metric'], "chart_name": chart_name }, file=sys.stderr)
                sys.exit(1)


def expand_wildcards(config):
    for chart_name, chart_config in config["charts"].items():
        series_with_expanded_wildcards = []
        for series in chart_config["chart_series"]:
            if series["repo"] == "*":
                for repo_name, repo_config in sorted(iter(config["git_repos"].items()), reverse=True):
                    repo_series = copy.deepcopy(series)
                    repo_series["repo"] = repo_name
                    if "color" in repo_config:
                        repo_series["color"] = repo_config["color"]
                    series_with_expanded_wildcards.append(repo_series)
            else:
                series_with_expanded_wildcards.append(series)
        chart_config["chart_series"] = series_with_expanded_wildcards

def load_config():
    config = eval(open(os.path.join(SCRIPT_DIR, "..", "config.pyon")).read())
    for (metric_name, metric_obj) in config["metrics"].items():
        metric_obj["name"] = metric_name
    for (_, repo_config) in config["git_repos"].items():
        repo_config["branch"] = "origin/" + repo_config.get("branch", "master")
    validate_config(config)
    expand_wildcards(config)
    return config


def main():
    config = load_config()
    data_filename = os.path.join(SCRIPT_DIR, "..", "data.pyon")
    jsdata_filename = os.path.join(SCRIPT_DIR, "..", "all_chart_data.js")

    timing_data = TimingData()

    if os.path.exists(data_filename):
        measured_data = MeasuredData()
        measured_data.load_from_file(data_filename)
    else:
        measured_data = MeasuredData()

    handle_delete_and_list_switches(config, measured_data, data_filename)

    if not args.skip_fetch and not args.just_regenerate_jsdata:
        timing_data.fetch_start_time = datetime.datetime.now()
        # Use set comprehension to build "git_urls" because if "git_repos" in
        # config might contain the same git url twice but with different
        # branches (mozilla inbound vs aurora etc).
        git_urls = {repo["git_url"] for (_, repo) in config["git_repos"].items()}
        fetch_latest_code(git_urls)
        timing_data.fetch_end_time = datetime.datetime.now()

    # By default we measure for every single commit, but depending on repo size,
    # volatility of the involved metrics, project commit frequency etc we might
    # not need/want to measure that often; in this case a
    #
    #   "tick_gap_days": N,
    #
    # key can be added in the repo config, which will make measurement happen as
    # often as possible while ensuring that at least N days passes between the
    # commit timestamps of two measured commits. The same set of selected commits
    # will then be used for all measured metrics as well as all computed metrics.
    selected_commits = select_commit_ticks(config)

    if args.just_regenerate_jsdata:
        print("Regenerating js data ...")
        save_data_jscopy(config, selected_commits, measured_data, jsdata_filename)
        sys.exit(0)

    # Make sure we have measured everything that can be measured directly
    # (i.e. doesn't depend on other metrics).
    timing_data.refresh_start_time = datetime.datetime.now()
    refresh_measurements(config, selected_commits, data_filename, measured_data, jsdata_filename)
    timing_data.refresh_end_time = datetime.datetime.now()

    # Finally, compute values for all metrics that derive from other
    # directly measured metrics.
    refresh_computed_metrics(config, selected_commits, data_filename, measured_data, jsdata_filename)

    print_summary(timing_data)


def run_tests():
    test_human_readable_time_span()

if __name__ == '__main__':
    try:
        run_tests()

        parser = argparse.ArgumentParser()
        parser.add_argument("--quiet", action="store_true")
        parser.add_argument("--verbose", action="store_true")

        # measurement options
        parser.add_argument("--skip-fetch", action="store_true", help="don't call 'git fetch', just process what is available locally (for debugging only)")
        parser.add_argument("--ignore-commits-before", metavar="YYYY or YYYY-MM-DD")
        parser.add_argument("--include-all-measured-data", action="store_true")
        parser.add_argument("--refresh-selected-metrics-only", nargs="+", metavar="METRIC_NAME")
        parser.add_argument("--measurement-order", default="random", choices=["random", "linear-oldest-first", "linear-newest-first"])
        parser.add_argument("--all-commits", action="store_true", help="ignore per repo tick_gap_days settings and force measurement of all commits")
        parser.add_argument("--tick-gap-days", type=int, help="minimum number of days between commits to measure")
        parser.add_argument("--max-measurements", metavar="N", type=int, default=-1, help="exit after making N measurements")

        # other commands
        parser.add_argument("--just-regenerate-jsdata", action="store_true")
        parser.add_argument("--delete-data-for-metric", metavar="METRIC_NAME", type=str)
        parser.add_argument("--delete-data-for-repo", metavar="REPO_NAME", type=str)
        parser.add_argument("--list-repos", action="store_true")
        parser.add_argument("--list-metrics", action="store_true")

        args = parser.parse_args()
        main()
    except KeyboardInterrupt:
        print()