Merge pull request #562 from sibyl-dev/release-v0.4.10
Bump version
zyteka authored Aug 5, 2024
2 parents 5f6019c + 2445b11 commit f4b1ca2
Showing 28 changed files with 1,761 additions and 574 deletions.
4 changes: 4 additions & 0 deletions README.md
@@ -37,6 +37,10 @@ An easier approach to understanding your model's predictions.
**Pyreal** gives you easy-to-understand explanations of your machine learning models in a low-code manner.
Pyreal wraps full ML pipelines in a RealApp object that makes it easy to use, understand, and interact with your ML model — regardless of your ML expertise.

See our tutorial series for an example of using Pyreal for house-price prediction:
- [Part 1: Learn about feature engineering and modelling](https://medium.com/mit-data-to-ai-lab/using-and-understanding-machine-learning-ml-models-ada6525cf192)
- [Part 2: Learn how to use Pyreal to use and understand ML models](https://medium.com/mit-data-to-ai-lab/using-and-understanding-machine-learning-ml-models-0ce7c227837e)

# Install

## Requirements
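For readers new to the library, here is a minimal sketch of the RealApp workflow the README describes. The constructor and method names follow Pyreal's documented usage, but treat them as assumptions: they are not part of this diff.

```python
# Hedged sketch of the low-code RealApp workflow described in the README.
# RealApp's signature and produce_feature_contributions are assumptions
# based on Pyreal's docs, not on this diff.
import pandas as pd
from sklearn.linear_model import LinearRegression

from pyreal import RealApp

X = pd.DataFrame({"size_sqft": [1000, 1500, 2000], "age_years": [10, 5, 1]})
y = pd.Series([200_000, 300_000, 400_000])
model = LinearRegression().fit(X, y)

app = RealApp(model, X, y)  # wrap the fitted model with its training data
print(app.produce_feature_contributions(X.iloc[[0]]))  # per-feature contributions
```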
14 changes: 14 additions & 0 deletions docs/api_reference/transformer.rst
@@ -89,3 +89,17 @@ Geo Transformers

LatLongToPlace

Narrative Transformers
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autosummary::
   :toctree: api/

   NarrativeTransformer

Aggregating Transformers
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autosummary::
   :toctree: api/

   Aggregator

2 changes: 1 addition & 1 deletion pyproject.toml
@@ -10,7 +10,7 @@ maintainers = [

description = "Library for evaluating and deploying human readable machine learning explanations."
name = "pyreal"
version = "0.4.9"
version = "0.4.10"

license = ""

2 changes: 1 addition & 1 deletion pyreal/__init__.py
@@ -6,7 +6,7 @@

__author__ = "MIT Data To AI Lab"
__email__ = "dailabmit@gmail.com"
__version__ = "0.4.9"
__version__ = "0.4.10"


__all__ = [
6 changes: 6 additions & 0 deletions pyreal/explainers/base.py
@@ -387,6 +387,10 @@ def transform_explanation(self, explanation, x_orig=None):

# Iterate through algorithm transformers
for i, t in enumerate(a_transformers[::-1]):
if t.require_values:
explanation.update_values( # UNTESTED
run_transformers(a_transformers[0 : len(a_transformers) - i], x), inplace=True
)
try:
explanation = t.inverse_transform_explanation(explanation)
# If this is a breaking transformer, transform x to the current point and return
@@ -404,6 +408,8 @@
return explanation
# Iterate through interpret transformers
for t in i_transformers:
if t.require_values and x is not None:
explanation.update_values(x, inplace=True)
if not t.algorithm:
try:
explanation = t.transform_explanation(explanation)
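For orientation, here is a self-contained sketch of the `require_values` pattern this hunk introduces: while walking the transformers in reverse to undo an explanation, any transformer that needs concrete feature values gets the explanation's values refreshed by running the forward pipeline up through that transformer. All class and method names below are illustrative stand-ins, not Pyreal's actual API.

```python
# Hypothetical sketch of the require_values pattern; names are illustrative.
class Explanation:
    """Toy stand-in for Pyreal's explanation types."""
    def __init__(self, data, values=None):
        self.data = data
        self.values = values

    def update_values(self, values, inplace=False):
        if inplace:
            self.values = values
            return self
        return Explanation(self.data, values)

class AddOne:
    """Toy transformer whose explanation inverse needs concrete values."""
    require_values = True

    def transform(self, x):
        return [v + 1 for v in x]

    def inverse_transform_explanation(self, explanation):
        explanation.data = explanation.data + " <-undo AddOne"  # toy inverse
        return explanation

def run_transformers(transformers, x):
    for t in transformers:
        x = t.transform(x)
    return x

def inverse_transform_explanation(explanation, transformers, x):
    n = len(transformers)
    for i, t in enumerate(transformers[::-1]):
        if getattr(t, "require_values", False):
            # Refresh values to the space the explanation currently occupies:
            # the forward pipeline applied up through this transformer.
            explanation.update_values(run_transformers(transformers[: n - i], x), inplace=True)
        explanation = t.inverse_transform_explanation(explanation)
    return explanation

exp = inverse_transform_explanation(Explanation("contribs"), [AddOne(), AddOne()], [1, 2])
print(exp.data, exp.values)  # values end up in the space of the last step undone
```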
141 changes: 20 additions & 121 deletions pyreal/explainers/lfc/base.py
@@ -3,6 +3,7 @@
import numpy as np

from pyreal.explainers import ExplainerBase
from pyreal.transformers import NarrativeTransformer


class LocalFeatureContributionsBase(ExplainerBase, ABC):
@@ -108,17 +109,20 @@ def produce_narrative_explanation(
raise ValueError("OpenAI API key or client must be provided to produce narrative")
openai_client = self.openai_client

return self.narrify(
openai_client,
self.produce(x_orig),
narrative_transformer = NarrativeTransformer(
openai_client=openai_client,
num_features=num_features,
max_tokens=max_tokens,
temperature=temperature,
llm_model=llm_model,
detail_level=detail_level,
context_description=context_description,
few_shot_training_examples=self.llm_training_data,
max_tokens=max_tokens,
temperature=temperature,
training_examples={"feature_contributions": self.llm_training_data},
)
self.transformers.append(narrative_transformer)
result = self.produce(x_orig)
self.transformers.pop()
return result

def train_llm(
self, x_train=None, live=True, provide_examples=False, num_inputs=5, num_features=3
@@ -163,7 +167,9 @@ def train_llm(
else:
x_train = x_train.sample(num_inputs)
explanation = self.produce(x_train)
parsed_explanations = parse_explanation_for_llm(explanation, num_features=num_features)
parsed_explanations = NarrativeTransformer.parse_feature_contribution_explanation_for_llm(
explanation, num_features=num_features
)
narratives = []
print("For each of the following inputs, please provide an appropriate narrative version.")
for i in range(num_inputs):
@@ -173,9 +179,13 @@
f"Input {i+1} (feature, value, contribution):\n{parsed_explanation_formatted}\n"
)
if provide_examples:
example = LocalFeatureContributionsBase.narrify(
self.openai_client, explanation[i], num_features=num_features
)[0]
example = (
NarrativeTransformer(
openai_client=self.openai_client, num_features=num_features
)
.transform_explanation_feature_contribution(explanation[i])
.get()[0]
)
instruction += f"Example: {example}\n"
instruction += "Narrative explanation ('k' to keep example, 'q' to quit): "
narrative = input(instruction)
@@ -210,114 +220,3 @@ def set_llm_training_data(self, training_data):
trained on these examples before generating the explanation.
"""
self.llm_training_data = training_data

@staticmethod
def narrify(
openai_client,
explanation,
num_features=None,
llm_model="gpt3.5",
detail_level="high",
context_description="",
max_tokens=200,
temperature=0.5,
few_shot_training_examples=None,
):
"""
Generate a narrative explanation from a feature contribution explanation
Args:
openai_client (OpenAI API client):
OpenAI API client, with API key set
explanation (LocalFeatureContributionExplanation):
Feature contribution explanations. Each row represents an instance, and each
column a feature.
num_features (int):
Number of features to include in the explanation. If None, all features will be
included
llm_model (string):
One of ["gpt3.5", "gpt4"]. LLM model to use to generate the explanation.
GPT4 may provide better results, but is more expensive.
detail_level (string):
One of ["high", "low"]. Level of detail to include in the explanation.
High detail should include precise contribution values. Low detail
will include only basic information about features used.
context_description (string):
Description of the model's prediction task, in sentence format. This will be
passed to the LLM and may help produce more accurate explanations.
For example: "The model predicts the price of houses."
max_tokens (int):
Maximum number of tokens to use in the explanation
temperature (float):
LLM Temperature to use. Values closer to 1 will produce more creative values.
Values closer to 0 will produce more consistent or conservative explanations.
few_shot_training_examples (list of (explanation, narrative) pairs):
Training examples to use for few-shot learning. If provided, the LLM will be
trained on these examples before generating the explanation.
Returns:
DataFrame of shape (n_instances, n_features)
Narrative explanation
"""
if llm_model == "gpt3.5":
model = "gpt-3.5-turbo-0125"
elif llm_model == "gpt4":
model = "gpt-4-0125-preview"
else:
raise ValueError(
"Invalid LLM model %s. Expected one of ['gpt3.5', 'gpt4']" % llm_model
)
if context_description is None:
context_description = ""
if context_description:
context_description = context_description.strip()
if not context_description.endswith("."):
context_description += "."
prompt = (
"You are helping users who do not have experience working with ML understand an ML"
f" model's predictions. {context_description} I will give you feature contribution"
" explanations, generated using SHAP, in (feature, feature_value, contribution )"
" format. Convert the explanations into simple narratives. Do not use more tokens"
" than necessary. Make your answers sound very natural, as if said in conversation. "
)
if detail_level == "low":
prompt += (
"Keep the explanations simple and easy to understand. Do not include exact"
" contribution values. "
)
elif detail_level == "high":
prompt += "Include all exact contribution values in your response. "
else:
raise ValueError(
"Invalid detail_level %s. Expected one of ['high', 'low']" % detail_level
)
# explanation = explanation.get_top_features(num_features=num_features)
narrative_explanations = []
base_messages = [{"role": "system", "content": prompt}]
if few_shot_training_examples is not None:
for training_exp, training_narr in few_shot_training_examples:
base_messages.append({"role": "user", "content": training_exp})
base_messages.append({"role": "assistant", "content": training_narr})
parsed_explanations = parse_explanation_for_llm(explanation, num_features=num_features)
for parsed_explanation in parsed_explanations:
messages = base_messages + [{"role": "user", "content": parsed_explanation}]
response = openai_client.chat.completions.create(
model=model,
messages=messages,
max_tokens=max_tokens,
temperature=temperature,
)
narrative_explanations.append(response.choices[0].message.content)
return narrative_explanations


def parse_explanation_for_llm(explanation, num_features=None):
explanations = explanation.get_top_features(num_features=num_features)
parsed_explanations = []
for explanation in explanations:
strings = []
for feature, value, contribution in zip(
explanation[0].index, explanation[1], explanation[0]
):
strings.append(f"({feature}, {value}, {contribution})")
parsed_explanations.append(", ".join(strings))
return parsed_explanations
32 changes: 28 additions & 4 deletions pyreal/explainers/lfc/shap_feature_contribution.py
@@ -7,6 +7,23 @@
from pyreal.explanation_types import AdditiveFeatureContributionExplanation


def _get_average_or_mode(df):
"""
Gets the average of numeric features and the mode of categorical features
Args:
df (DataFrame):
Input
Returns:
Series
Average or mode of every column in df
"""
s = df.select_dtypes(np.number).mean()
if len(s) == df.shape[1]: # all columns are numeric
return s
return pd.concat((df.drop(s.index, axis=1).mode().iloc[0], s))


class ShapFeatureContribution(LocalFeatureContributionsBase):
"""
ShapFeatureContribution object.
@@ -20,13 +37,13 @@ class ShapFeatureContribution(LocalFeatureContributionsBase):
Filepath to the pickled model to explain, or model object with .predict() function
x_train_orig (DataFrame of size (n_instances, n_features)):
Training set in original form.
shap_type (string, one of ["kernel", "linear"]):
shap_type (string, one of ["kernel", "linear", "tree"]):
Type of shap algorithm to use. If None, SHAP will pick one.
**kwargs: see base Explainer args
"""

def __init__(self, model, x_train_orig=None, shap_type=None, **kwargs):
supported_types = ["kernel", "linear"]
supported_types = ["kernel", "linear", "tree"]
if shap_type is not None and shap_type not in supported_types:
raise ValueError(
"Shap type not supported, given %s, expected one of %s or None"
@@ -37,6 +54,7 @@ def __init__(self, model, x_train_orig=None, shap_type=None, **kwargs):

self.explainer = None
self.explainer_input_size = None
self.average_values = None
super(ShapFeatureContribution, self).__init__(model, x_train_orig, **kwargs)

def fit(self, x_train_orig=None, y_train=None):
@@ -50,6 +68,7 @@
Targets of training set, required if not provided on initialization
"""
x_train_orig = self._get_x_train_orig(x_train_orig)
self.average_values = _get_average_or_mode(x_train_orig)

dataset = self.transform_to_x_algorithm(x_train_orig)
self.explainer_input_size = dataset.shape[1]
@@ -58,6 +77,8 @@
# Note: we manually check for linear model here because of SHAP bug
elif self.shap_type == "linear":
self.explainer = LinearExplainer(self.model, dataset)
elif self.shap_type == "tree":
self.explainer = TreeExplainer(self.model, dataset)
else:
self.explainer = ShapExplainer(self.model, dataset) # SHAP will pick an algorithm
return self
@@ -83,6 +104,7 @@ def produce_explanation(self, x_orig, **kwargs):
)
)
columns = x.columns
index = x.index
x = np.asanyarray(x)

if isinstance(self.explainer, TreeExplainer):
@@ -93,13 +115,15 @@
raise RuntimeError("Something went wrong with SHAP - expected at least 2 dimensions")
if shap_values.ndim == 2:
return AdditiveFeatureContributionExplanation(
pd.DataFrame(shap_values, columns=columns)
pd.DataFrame(shap_values, columns=columns, index=index),
other_properties={"average_values": self.average_values},
)
if shap_values.ndim > 2:
predictions = self.model_predict(x_orig)
if self.classes is not None:
predictions = [np.where(self.classes == i)[0][0] for i in predictions]
shap_values = shap_values[predictions, np.arange(shap_values.shape[1]), :]
return AdditiveFeatureContributionExplanation(
pd.DataFrame(shap_values, columns=columns)
pd.DataFrame(shap_values, columns=columns, index=index),
other_properties={"average_values": self.average_values},
)
2 changes: 2 additions & 0 deletions pyreal/explanation_types/__init__.py
@@ -18,6 +18,7 @@
PartialDependenceExplanation,
)
from pyreal.explanation_types.time_series_saliency import TimeSeriesSaliency
from pyreal.explanation_types.narrative import NarrativeExplanation

__all__ = [
"Explanation",
@@ -34,4 +35,5 @@
"FeatureValueBased",
"PartialDependenceExplanation",
"TimeSeriesSaliency",
"NarrativeExplanation",
]
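With the export added above, the new explanation type is importable from the subpackage root; this one-liner assumes pyreal >= 0.4.10 is installed.

```python
# Grounded in the __all__ change above.
from pyreal.explanation_types import NarrativeExplanation
```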