# infer_print.py

#!/usr/bin/env python3.10
# -*- coding: utf-8 -*-

from __future__ import annotations

import csv
import pickle
import sys
from dataclasses import dataclass
from typing import Any, Dict, Sequence, Type, TypeVar

import numpy as np

from best_majvote import FloatArray
from util import zip_strict

AnyPreds = TypeVar('AnyPreds', bound='Preds')


def pred_cmp(cpred: float, thresh: float) -> bool:
    """Decide whether a prediction passes a signed threshold.

    The sign of *thresh* selects the direction of the test:

    * ``thresh >= 0``: pass when ``cpred`` is strictly above ``thresh``.
    * ``thresh < 0``: pass when ``cpred`` is at or below ``-thresh``, i.e. a
      negative threshold labels only the negative predictions.
    """
    if thresh < 0:
        # Flipped comparison against the threshold's magnitude
        return cpred <= -thresh
    return cpred > thresh


def _load_predfile(path: str) -> Dict[str, Any]:
    """Read a pickled prediction file and return its contents as a single dict.

    Two on-disk layouts are understood:

    * Old style: the first pickled object already has a ``'preds'`` key and is
      returned unchanged.
    * Incremental style: the first pickled object is a header, followed by any
      number of pickled chunks that each carry ``'preds'`` and
      ``'uncertainties'``. The chunks are joined with ``np.concatenate`` and
      merged into a copy of the header.

    Raises:
        ValueError: If an incremental-style file holds a header but no chunks.

    NOTE(security): ``pickle.load`` can execute arbitrary code while loading;
    only pass files from a trusted source.
    """
    pred_chunks = []
    uncertainty_chunks = []

    with open(path, 'rb') as f:
        header = pickle.load(f)
        if 'preds' in header:
            return header  # Old style: everything lives in the first object

        # Incremental style: keep unpickling chunks until the file is exhausted
        while True:
            try:
                chunk = pickle.load(f)
            except EOFError:
                break
            pred_chunks.append(chunk['preds'])
            uncertainty_chunks.append(chunk['uncertainties'])

    if not pred_chunks:
        # np.concatenate([]) would fail with an error that names neither the
        # file nor the cause, so report the problem explicitly instead.
        raise ValueError('Prediction file {!r} contains a header but no prediction chunks'.format(path))

    return dict(
        header,
        preds=np.concatenate(pred_chunks),
        uncertainties=np.concatenate(uncertainty_chunks),
    )


@dataclass(frozen=True)
class Preds:
    """Per-sample predictions and uncertainties read from a prediction file.

    ``preds`` and ``uncertainties`` are indexed as ``[sample, class]``: rows
    line up with ``sample_paths`` and columns follow the order of
    ``class_names``.
    """

    class_names: Sequence[str]  # Not necessarily all classes
    sample_paths: Sequence[str]
    preds: FloatArray
    uncertainties: FloatArray

    @classmethod
    def load(cls: Type[AnyPreds], path: str) -> AnyPreds:
        """Build an instance from the prediction file at *path*.

        The dict returned by ``_load_predfile`` is passed to the constructor as
        keyword arguments, so its keys must match the dataclass fields.

        Raises:
            ValueError: If ``sample_paths``, ``preds`` and ``uncertainties`` do
                not all have the same length.
        """
        data = _load_predfile(path)
        us = data['uncertainties']
        if us.ndim == 3 and us.shape[-1] == 1:
            # @TEMPORARY Remove extra dimension from broken pred_uncertainty
            data['uncertainties'] = us[..., 0]
        inst = cls(**data)
        n_samples, n_preds, n_uncertainties = map(  # type: ignore[arg-type]
            len, (inst.sample_paths, inst.preds, inst.uncertainties)
        )
        if not n_samples == n_preds == n_uncertainties:
            # Argument order must follow the placeholders: pred, u, sample
            raise ValueError(
                "Prediction file lengths don't match: pred={}, u={}, sample={}".format(
                    n_preds, n_uncertainties, n_samples
                )
            )
        return inst

    def lbl_preds(self, label: str) -> FloatArray:
        """Return the column of ``preds`` belonging to *label*.

        Raises ``ValueError`` (from ``index``) if *label* is not in ``class_names``.
        """
        return self.preds[:, self.class_names.index(label)]

    def lbl_uncertainties(self, label: str) -> FloatArray:
        """Return the column of ``uncertainties`` belonging to *label*.

        Raises ``ValueError`` (from ``index``) if *label* is not in ``class_names``.
        """
        return self.uncertainties[:, self.class_names.index(label)]


if __name__ == '__main__':
    # Usage: infer_print.py CLASS_NAMES LABEL_INDEX THRESHOLD PREDICTIONS_FILE
    #   CLASS_NAMES       comma-separated class names
    #   LABEL_INDEX       integer index into CLASS_NAMES selecting the label to print
    #   THRESHOLD         float handed to pred_cmp (its sign picks the comparison direction)
    #   PREDICTIONS_FILE  predictions file generated by infer.py
    # Writes a CSV with columns sample_path, uncertainty, label to stdout; the
    # label column is left blank for samples that do not pass the threshold.
    n_args = len(sys.argv) - 1
    if n_args != 4:
        raise ValueError('Expected 4 arguments, got {}'.format(n_args))

    _, cnames_arg, idx_arg, thresh_arg, pred_path = sys.argv
    class_names = cnames_arg.split(',')
    label_idx = int(idx_arg)
    threshold = float(thresh_arg)
    label = class_names[label_idx]

    preds = Preds.load(pred_path)

    out = csv.writer(sys.stdout)
    out.writerow(['sample_path', 'uncertainty', 'label'])

    rows = zip_strict(preds.sample_paths, preds.lbl_preds(label), preds.lbl_uncertainties(label))
    for sample_path, score, uncertainty in rows:
        passed = pred_cmp(score, threshold)
        out.writerow([sample_path, uncertainty, label if passed else ''])