added support for conformal set outputs to prediction container
Cortys committed Mar 14, 2022
1 parent 8a77445 commit 68d58d0
Showing 6 changed files with 82 additions and 33 deletions.
2 changes: 1 addition & 1 deletion Dockerfile.pred
@@ -7,7 +7,7 @@ RUN go get -d -v
RUN go build

# TF model container:
FROM tensorflow/tensorflow:2.5.0
FROM tensorflow/tensorflow:2.8.0

RUN apt-get update &&\
apt-get install -y git golang graphviz graphviz-dev &&\
65 changes: 38 additions & 27 deletions README.md
@@ -76,8 +76,11 @@ It takes the following arguments:
- **Cmds:**
- `show --format [json (default)|dot]`:
Outputs the CFG for the selected usage as JSON or in Graphviz dot format.
- `predict --model [model name] [--limit-id [limit id, default=v127_d127_f127_p127]] [--logits]`:
- `predict --model [model name] [--conformal-alpha [alpha, default=0]] [--limit-id [id, default=v127_d127_f127_p127]] [--logits]`:
Outputs the prediction of the selected model as JSON.
The `conformal-alpha` parameter controls whether conformal prediction sets are included in the output.
By default (`alpha = 0`), no conformal sets are produced.
To obtain conformal sets, an error threshold `0 < alpha < 1` has to be provided; the smaller the alpha value, the larger the prediction sets will be (`0.1` is a good default choice).
The `limit_id` specifies how the data associated with individual CFG nodes should be mapped to binary dimensions.
If the `--logits` flag is set, prediction logits will be returned instead of normalized probabilities.
Note that only combinations of models, limit ids and convert modes that were exported when building the prediction container will work.
@@ -103,35 +106,43 @@ An unsafe usage can be classified as follows:
./predict.sh \
--project elastic/beats --package go.elastic.co/apm --file config.go \
--line 413 --snippet "unsafe.Pointer(oldConfig)," \
predict -m WL2GNN 2>/dev/null \
predict -m WL2GNN -a 0.1 2>/dev/null \
| jq
```
Prediction output for both labels (exact probabilities might vary):
```json
[{
"cast-basic": 3.8024680293347046e-08,
"cast-bytes": 2.10747663764721e-09,
"cast-header": 4.1693176910939655e-08,
"cast-pointer": 2.197234172385265e-09,
"cast-struct": 5.247088097348751e-07,
"definition": 1.1479721706564305e-07,
"delegate": 0.9999991655349731,
"memory-access": 8.367572235101761e-08,
"pointer-arithmetic": 3.887335964236627e-08,
"syscall": 2.412203492507814e-10,
"unused": 1.5881319870292288e-10
}, {
"atomic": 0.9999879598617554,
"efficiency": 1.967955931547749e-08,
"ffi": 5.721917204937199e-06,
"generics": 1.3880583082936937e-06,
"hide-escape": 4.661455932364333e-06,
"layout": 1.2701431728601165e-07,
"no-gc": 1.7310886057941843e-09,
"reflect": 7.049543726544982e-10,
"serialization": 9.552078239494222e-08,
"types": 3.732756326257913e-09,
"unused": 1.0282862339394683e-09
}]
{
"probabilities": [{
"cast-basic": 0.000799796252977103,
"cast-bytes": 0.00023943622363731265,
"cast-header": 0.0008311063284054399,
"cast-pointer": 0.00024363627017010003,
"cast-struct": 0.0023890091106295586,
"definition": 0.0012677970807999372,
"delegate": 0.9921323657035828,
"memory-access": 0.001111199613660574,
"pointer-arithmetic": 0.0008071911288425326,
"syscall": 9.69868924585171e-05,
"unused": 8.147588232532144e-05
}, {
"atomic": 0.9911662936210632,
"efficiency": 0.00020463968394324183,
"ffi": 0.003083886345848441,
"generics": 0.0015664942329749465,
"hide-escape": 0.0027959353756159544,
"layout": 0.0004991954774595797,
"no-gc": 6.399328412953764e-05,
"reflect": 4.1643997974460945e-05,
"serialization": 0.0004356006102170795,
"types": 9.241054794983938e-05,
"unused": 4.988365981262177e-05
}],
"conformal_sets": [
["delegate"],
["atomic"]
]
}
```
[jq](https://stedolan.github.io/jq/) is of course optional here. Also, `2>/dev/null` might not be the right choice for production use 🙂.
Note that the output format differs if `--conformal-alpha 0` (`-a 0`) is used;
in this case, no conformal sets are produced, and the resulting JSON only contains the two probability maps (i.e. the value at `probabilities` in the above example output).
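For reference, here is a minimal sketch (not part of this commit) of how a downstream consumer might handle both output shapes; the helper name `parse_prediction` is made up for illustration:
```python
import json

def parse_prediction(raw_stdout):
    """Parse the JSON printed by the predict command (see the example above)."""
    out = json.loads(raw_stdout)
    if isinstance(out, list):
        # --conformal-alpha 0: just the two probability maps.
        label_probs, group_probs = out
        conformal_sets = None
    else:
        # alpha > 0: probability maps plus one conformal set per label dimension.
        label_probs, group_probs = out["probabilities"]
        conformal_sets = out["conformal_sets"]
    return label_probs, group_probs, conformal_sets
```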
1 change: 1 addition & 0 deletions requirements.pred.txt
@@ -1,5 +1,6 @@
networkx==2.5.1
pydot==1.4.2
pyyaml==6
pygraphviz==1.6
funcy==1.16
scikit-learn==0.24.2
9 changes: 8 additions & 1 deletion src/usgoc/evaluation/evaluate.py
@@ -440,6 +440,7 @@ def export_best(

model_name = hypermodel_builder.name
kwargs["return_model_paths"] = True
kwargs["return_calibration_configs"] = True
kwargs["return_metrics"] = True
kwargs["return_dims"] = True
kwargs["dry"] = True
@@ -450,13 +451,15 @@
target_dir = f"{models_dir}/{convert_mode}_{limit_id}/{model_name}"
best_fold_crit = None
best_fold_path = None
best_fold_calib = None
best_fold_dims = None
for fold in folds:
fold_crits = []
best_repeat_crit = None
best_repeat_path = None
best_repeat_calib = None
best_repeat_dims = None
for model_path, metrics, dims in fold:
for model_path, calib_configs, metrics, dims in fold:
if isinstance(criterion, str):
crit = metrics[criterion]
else:
@@ -465,12 +468,14 @@
if best_repeat_crit is None or crit > best_repeat_crit:
best_repeat_crit = crit
best_repeat_path = model_path
best_repeat_calib = calib_configs
best_repeat_dims = dims
# Use mean - 1std as fold performance criterion:
fold_crit = np.mean(fold_crits) - np.std(fold_crits)
if best_fold_crit is None or fold_crit > best_fold_crit:
best_fold_crit = fold_crit
best_fold_path = best_repeat_path
best_fold_calib = best_repeat_calib
best_fold_dims = best_repeat_dims

model_path = best_fold_path[len("file://"):]
@@ -480,6 +485,8 @@
best_fold_dims["in_enc"] = hypermodel_builder.in_enc
utils.cache_write(
f"{target_dir}/dims.json", best_fold_dims, "json")
utils.cache_write(
f"{target_dir}/conformal_calibration_configs.yml", best_fold_calib, "yaml")

def aggregate_confusion_matrices(cms, normalize=True):
cms = fy.lcat(cms)
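The exact structure of `best_fold_calib` (and thus of the exported `conformal_calibration_configs.yml`) is not visible in this diff; judging from how `run_prediction.py` consumes it below, it is presumably a mapping from alpha values to per-label temperature and quantile parameters. The numbers here are purely illustrative:
```python
# Illustrative shape only: run_prediction.py indexes the loaded YAML by alpha
# and reads the temperatures t1/t2 and the conformal quantiles qhat1/qhat2.
best_fold_calib = {
    0.05: {"t1": 1.2, "t2": 1.1, "qhat1": 0.97, "qhat2": 0.95},
    0.1: {"t1": 1.2, "t2": 1.1, "qhat1": 0.93, "qhat2": 0.90},
}
```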
33 changes: 29 additions & 4 deletions src/usgoc/run_prediction.py
@@ -1,5 +1,6 @@
import os
import json
import yaml
import click
import subprocess
import funcy as fy
@@ -9,6 +10,7 @@
import usgoc.utils as utils
import usgoc.datasets.unsafe_go as dataset
import usgoc.metrics.multi as mm
import usgoc.postprocessing.conformal as conf

PROJECTS_DIR = "/projects"
EXPORT_DIR = f"{utils.PROJECT_ROOT}/exported_models"
@@ -108,10 +110,14 @@ def show(obj, format):
"--limit-id", "-l",
type=click.STRING,
default="v127_d127_f127_p127")
@click.option(
"--conformal-alpha", "-a",
type=click.FloatRange(0.0, 1.0),
default=0)
@click.option("--logits", is_flag=True, default=False)
@click.pass_obj
def predict(
obj, model, limit_id, logits=False):
obj, model, limit_id, conformal_alpha=0.1, logits=False):
with utils.cache_env(use_cache=False):
convert_mode = obj["convert_mode"]
cfg = get_cfg_json(**obj)
@@ -122,16 +128,25 @@
assert os.path.isdir(dir), "Requested model does not exist."
with open(f"{EXPORT_DIR}/target_label_dims.json", "r") as f:
labels1, labels2 = json.load(f)
labels1_keys = labels1.keys()
labels2_keys = labels2.keys()
labels1_keys = list(labels1.keys())
labels2_keys = list(labels2.keys())
with open(f"{dir}/dims.json", "r") as f:
dims = json.load(f)
if conformal_alpha == 0.0:
calib_config = dict(t1=1, t2=1)
else:
with open(f"{dir}/conformal_calibration_configs.yml", "r") as f:
calib_configs = yaml.unsafe_load(f)
assert conformal_alpha in calib_configs, f"Alpha must be from {calib_configs.keys()}."
calib_config = calib_configs[conformal_alpha]
in_enc = dims["in_enc"]
encoder = dataset.dataset_encoders[in_enc]
ds = encoder(graphs, dims)
model = tf.keras.models.load_model(f"{dir}/model", custom_objects=dict(
SparseMultiAccuracy=mm.SparseMultiAccuracy))
l1_pred, l2_pred = model.predict(ds)
l1_pred /= calib_config["t1"]
l2_pred /= calib_config["t2"]
if logits:
prob1 = l1_pred[0]
prob2 = l2_pred[0]
@@ -140,7 +155,17 @@
prob2 = tf.nn.softmax(l2_pred, -1).numpy()[0]
l1_dict = fy.zipdict(labels1_keys, prob1)
l2_dict = fy.zipdict(labels2_keys, prob2)
print(json.dumps([l1_dict, l2_dict], cls=utils.NumpyEncoder))

if conformal_alpha == 0.0:
print(json.dumps([l1_dict, l2_dict], cls=utils.NumpyEncoder))
else:
set1_idx = conf.adaptive_sets(l1_pred, calib_config["qhat1"])[0]
set2_idx = conf.adaptive_sets(l2_pred, calib_config["qhat2"])[0]
set1 = [labels1_keys[i] for i in set1_idx]
set2 = [labels2_keys[i] for i in set2_idx]
print(json.dumps(dict(
probabilities=[l1_dict, l2_dict],
conformal_sets=[set1, set2]), cls=utils.NumpyEncoder))


if __name__ == "__main__":
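Neither `usgoc.postprocessing.conformal.adaptive_sets` nor the meaning of the calibration values is shown in this excerpt. The sketch below illustrates the two standard techniques the prediction code appears to rely on, temperature scaling of logits and adaptive prediction sets; the function bodies are assumptions for illustration, not the repository's implementation:
```python
import numpy as np

def temperature_scale(logits, t):
    # Divide logits by a calibrated temperature t before the softmax,
    # mirroring `l1_pred /= calib_config["t1"]` above; t > 1 softens
    # over-confident probabilities, t = 1 leaves them unchanged.
    z = (logits / t) - np.max(logits / t, axis=-1, keepdims=True)
    e = np.exp(z)
    return e / e.sum(axis=-1, keepdims=True)

def adaptive_set(probs, qhat):
    # Adaptive prediction set: add labels in order of decreasing probability
    # until their cumulative mass reaches the calibrated quantile qhat.
    # Smaller alpha -> larger qhat -> larger (more conservative) sets.
    order = np.argsort(-probs)
    cutoff = int(np.searchsorted(np.cumsum(probs[order]), qhat)) + 1
    return order[:cutoff].tolist()

# Hypothetical three-label example:
probs = temperature_scale(np.array([2.0, 0.5, -1.0]), t=1.3)
print(adaptive_set(probs, qhat=0.9))  # -> [0, 1]
```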
5 changes: 5 additions & 0 deletions src/usgoc/utils.py
@@ -1,5 +1,6 @@
import os
import json
import yaml
import pickle
import inspect
import numbers
@@ -299,6 +300,10 @@ class CacheFormat:
fy.partial(json.load, cls=NumpyDecoder),
fy.partial(json.dump, indent="\t", cls=NumpyEncoder),
type="text"),
yaml=cache_format(
fy.partial(yaml.unsafe_load),
fy.partial(yaml.dump),
type="text"),
plot=cache_format(
lambda _: None,
lambda fig, file: fig.savefig(
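A quick usage sketch of the new `yaml` cache format added above; `export_best` uses the same call to store the calibration configs. The path and dict here are made up:
```python
import usgoc.utils as utils

# Write a dict as YAML through the cache helpers; the "yaml" format key
# selects the loader/dumper pair registered above.
utils.cache_write("/tmp/example_configs.yml", {0.1: {"t1": 1.0, "qhat1": 0.9}}, "yaml")
```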
