version 1.9 public release
chengniansun committed Jan 10, 2025
1 parent f406c05 commit fa3f75a
Showing 186 changed files with 101,662 additions and 0 deletions.
Empty file added MODULE.bazel
79 changes: 79 additions & 0 deletions MODULE.bazel.kept_for_future
@@ -0,0 +1,79 @@
module(
    name = "your_project_name",
    version = "1.0.0",
)

# Rules for Maven JVM dependencies
bazel_dep(name = "rules_jvm_external", version = "6.3")

# Load the Maven extension
maven = use_extension("@rules_jvm_external//:extensions.bzl", "maven")

# Declare Maven artifacts.
maven.install(
    artifacts = [
        "com.beust:jcommander:1.82",
        "com.fasterxml.jackson.core:jackson-core:2.18.2",
        "com.fasterxml.jackson.core:jackson-databind:2.18.2",
        "com.fasterxml.jackson.core:jackson-annotations:2.18.2",
        "com.fasterxml.jackson.dataformat:jackson-dataformat-yaml:2.18.2",
        "com.fasterxml.jackson.datatype:jackson-datatype-guava:2.18.2",
        "com.fasterxml.jackson.module:jackson-module-kotlin:2.18.2",
        "com.google.errorprone:error_prone_annotations:2.3.4",
        "com.google.flogger:flogger-system-backend:0.8",
        "com.google.flogger:flogger:0.8",
        "com.google.googlejavaformat:google-java-format:1.15.0",
        "com.google.guava:guava:32.0.0-jre",
        "com.google.truth:truth:1.1.4",
        "com.googlecode.java-diff-utils:diffutils:1.3.0",
        "com.guardsquare:proguard-base:7.2.1",
        "com.github.gumtreediff:core:3.0.0",
        "com.pinterest:ktlint:0.50.0",
        "io.gitlab.arturbosch.detekt:detekt-cli:1.23.1",
        "io.netty:netty-all:4.1.66.Final",
        "it.unimi.dsi:fastutil:8.5.12",
        "me.lemire.integercompression:JavaFastPFOR:0.1.9",
        "org.antlr:antlr4-runtime:4.13.2",
        "org.antlr:antlr4:4.13.2",
        "org.apache.commons:commons-csv:1.10.0",
        "org.apache.commons:commons-exec:1.4.0",
        "org.apache.commons:commons-lang3:3.9",
        "org.apache.commons:commons-text:1.9",
        "org.jetbrains.kotlin:kotlin-stdlib-jdk8:1.9.10",
        "org.jetbrains.kotlin:kotlin-reflect:1.9.10",
        "org.jfree:jfreechart:1.5.0",
        "org.jgrapht:jgrapht-core:1.3.0",
        "org.ow2.asm:asm:9.3",
        "org.ow2.asm:asm-commons:9.3",
        "org.ow2.asm:asm-util:9.3",
        "junit:junit:4.13.2",
        "org.commonmark:commonmark:0.18.1",
    ],
    fetch_sources = True,
    repositories = [
        "https://jcenter.bintray.com",
        "https://maven.google.com",
        "https://repo1.maven.org/maven2",
    ],
)

# Bring the @maven repository generated by the extension into scope.
use_repo(maven, "maven")
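# Targets can then depend on the pinned artifacts via @maven labels,
# e.g. deps = ["@maven//:com_google_guava_guava"] in a java_library rule.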

# Rules for Kotlin
bazel_dep(name = "rules_kotlin", version = "2.0.0")

# Rules for Go
bazel_dep(name = "rules_go", version = "0.45.1")

# Rules for Gazelle (for Go)
bazel_dep(name = "gazelle", version = "0.34.0")

# Rules for Protobuf
bazel_dep(name = "protobuf", version = "29.0")
bazel_dep(name = "rules_proto", version = "7.0.2")

# Rules for Buildtools
bazel_dep(name = "buildifier_prebuilt", version = "7.3.1")

register_toolchains("//:kotlin_toolchain")
110 changes: 110 additions & 0 deletions MODULE.bazel.lock

Some generated files are not rendered by default.

188 changes: 188 additions & 0 deletions benchmark/convert_memory_log_to_csv.py
@@ -0,0 +1,188 @@
#!/usr/bin/env python3
import os
import json
import argparse
from typing import List, Final, Tuple, Dict

__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))


def parse_arguments():
    parser = argparse.ArgumentParser(
        description="Summarize benchmark results (in bytes). "
                    "This extracts the data from the log and JSON files. "
                    "The output file can be further analyzed by analyze_csv_memory.py")
    parser.add_argument("folders", nargs='+', default=[],
                        help="Folders of JSON and log files; "
                             "the JSON and log files should come in pairs")
    parser.add_argument("-o", type=str, dest="output_csv", required=True, help="output CSV file")

    return parser.parse_args()
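
# Example invocation (hypothetical paths):
#   ./convert_memory_log_to_csv.py results/run1 results/run2 -o memory_summary.csv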


def extract_entries(lines: List[str]) -> Tuple[List[int], List[int]]:
    # each cache sample is stored as '<timestamp> <size>'
    timestamp = []
    cache = []
    for line in lines:
        # split() ignores surrounding whitespace, so no explicit strip is needed
        t, c = line.split()
        timestamp.append(int(t))
        cache.append(int(c))
    return timestamp, cache


def average_usage(timestamp: List[int], cache: List[int]) -> int:
    # calculate the average function value:
    # average = area under curve / period
    assert len(timestamp) == len(cache)
    if len(timestamp) == 0:
        return -1
    elif len(timestamp) == 1:
        return cache[0]
    total_usage = 0
    previous_time = timestamp[0]
    previous_usage = cache[0]
    for i in range(1, len(timestamp)):
        delta_time = timestamp[i] - previous_time
        usage_sum = cache[i] + previous_usage
        # trapezoid rule
        total_usage += usage_sum / 2 * delta_time
        previous_time = timestamp[i]
        previous_usage = cache[i]

    return int(total_usage / (timestamp[-1] - timestamp[0]))
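
# Worked example (hypothetical numbers): for timestamps [0, 1, 2] and cache
# sizes [100, 200, 300], the trapezoid rule gives
#   (100 + 200) / 2 * 1 + (200 + 300) / 2 * 1 = 400,
# so average_usage returns int(400 / (2 - 0)) = 200.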


def average_dict_item(orig_dict: dict, update_dict: dict, key: str):
    raise NotImplementedError("This function is not yet implemented.")
    # This function is called elsewhere in this file, but at least in the rcc
    # experiments it was never triggered, so its intent is unknown and no
    # implementation is attempted. The version below is wrong, since
    # (a+b+c)/3 is not equal to ((a+b)/2 + c)/2 = a/4 + b/4 + c/2:
    # orig_dict[key] += update_dict[key]
    # orig_dict[key] //= 2
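
# A plausible fix, if the intent is a running mean across repeated runs
# (an assumption, not the original design): track a per-key sample count n
# and update incrementally, mean += (new_value - mean) / n, which yields the
# true arithmetic mean no matter how many runs are merged.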


def analyze_json_file(filepath: str, statistics_handle: dict):
    # decode the filename, e.g. 'run_gcc_perses_20230101_1.json' (hypothetical)
    base = os.path.basename(filepath)
    base = os.path.splitext(base)[0]
    buf, subject, reducer, timemark, iteration = base.split('_')
    reducer_t = f"{reducer}@{timemark}"

    # read the JSON file into a map
    with open(filepath) as f:
        json_dict = json.load(f)

    # trim redundant information
    if subject != json_dict['Subject']:
        raise Exception(f"Error: File name and content inconsistent: Subject. Please check {filepath}")
    json_dict.pop('Subject')
    if reducer != json_dict['Reducer']:
        raise Exception(f"Error: File name and content inconsistent: Reducer. Please check {filepath}")
    json_dict.pop('Reducer')

    # dictionary keys
    QUERY: Final = "Query"
    TIME: Final = "Time"
    TOKEN_R: Final = "Token_remaining"

    # append any new data to the statistics report
    if subject not in statistics_handle:
        statistics_handle[subject] = dict()
    if reducer_t not in statistics_handle[subject]:
        statistics_handle[subject][reducer_t] = json_dict
    else:
        # average with the existing entry
        sub_dict = statistics_handle[subject][reducer_t]
        average_dict_item(sub_dict, json_dict, QUERY)
        average_dict_item(sub_dict, json_dict, TIME)
        average_dict_item(sub_dict, json_dict, TOKEN_R)


def analyze_log_file(filepath: str, statistics: dict):
    # decode the filename (same scheme as in analyze_json_file)
    base = os.path.basename(filepath)
    base = os.path.splitext(base)[0]
    buf, subject, reducer, timemark, iteration = base.split('_')
    reducer_t = f"{reducer}@{timemark}"

    # read the log file into a list
    with open(filepath) as f:
        lines = f.readlines()

    # extract memory info from the log file
    timestamp, cache = extract_entries(lines)

    # dictionary keys
    PEAK: Final = "peak_cache_size"
    AVG: Final = "average_memory_usage"
    NUM_SAMPLES: Final = "number_of_samples"

    # calculate memory usage and store it in a map
    log_dict = dict()
    log_dict[PEAK] = max(cache)
    log_dict[AVG] = average_usage(timestamp, cache)
    log_dict[NUM_SAMPLES] = len(lines)

    # statistics[subject][reducer_t] exists because analyze_json_file is
    # always called on the paired JSON file first
    if PEAK not in statistics[subject][reducer_t]:
        statistics[subject][reducer_t].update(log_dict)
    else:
        sub_dict = statistics[subject][reducer_t]
        average_dict_item(sub_dict, log_dict, PEAK)
        average_dict_item(sub_dict, log_dict, AVG)
        average_dict_item(sub_dict, log_dict, NUM_SAMPLES)


def validate_existence(filepath: str):
    if not os.path.exists(filepath):
        raise Exception(f"Error: File not found in path: {filepath}")


def pair_files(folder_list: List[str]) -> Dict[str, str]:
    """Each result JSON file and its profiling log file should appear as a pair."""
    folder_list.sort()
    files_paired = dict()
    for folder in folder_list:
        for dirpath, dirnames, filenames in os.walk(folder):
            for filename in filenames:
                if filename.endswith('.json'):
                    json_file = os.path.join(dirpath, filename)
                    validate_existence(json_file)
                    log_file = json_file.replace(".json", ".log")
                    validate_existence(log_file)
                    files_paired[json_file] = log_file
    return files_paired
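
# Example pairing (hypothetical names): results/run_gcc_perses_20230101_1.json
# is matched with results/run_gcc_perses_20230101_1.log.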

def save_to_csv_file(statistics, csv_file):
    with open(csv_file, "w") as output_file:
        output_file.write("Subject,Entry_peak_cache,Average_cache,Entry_environment\n")

        for subject in statistics.keys():
            entries = statistics[subject]
            for key in entries.keys():
                entry_environment = entries[key]["Environment"]
                entry_peak_cache = entries[key]["peak_cache_size"]
                average_cache = entries[key]["average_memory_usage"]
                output_file.write(f"{subject},{entry_peak_cache},{average_cache},{entry_environment}\n")


def main():
    args = parse_arguments()
    statistics = dict()

    files_paired = pair_files(args.folders)

    # process the paired input files
    for json_file, log_file in files_paired.items():
        analyze_json_file(json_file, statistics)
        analyze_log_file(log_file, statistics)

    # kept for debugging: the aggregated statistics as pretty-printed JSON
    json_object = json.dumps(statistics, indent=4)
    # print(json_object)

    save_to_csv_file(statistics, args.output_csv)


if __name__ == "__main__":
    main()