Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feature/econometrics: Added Variance Inflation Factor #5866

Merged
merged 16 commits into from
Nov 25, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -403,3 +403,22 @@ def test_econometrics_panel_fmac(params, obb):
assert result
assert isinstance(result, OBBject)
assert len(result.results) > 0


@pytest.mark.parametrize(
    "params",
    [
        ({"data": "", "columns": ["income", "age"]}),
        ({"data": "", "columns": ["education"]}),
    ],
)
@pytest.mark.integration
def test_econometrics_variance_inflation_factor(params, obb):
    """Check that the VIF command returns a non-empty OBBject for each column selection."""
    # Keep only the parametrized entries that carry a truthy value; the empty
    # "data" placeholder is discarded here and replaced with mock data below.
    call_kwargs = {name: value for name, value in params.items() if value}
    call_kwargs["data"] = mock_multi_index_data()

    response = obb.econometrics.variance_inflation_factor(**call_kwargs)

    assert response
    assert isinstance(response, OBBject)
    assert len(response.results) > 0
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
"""Econometrics Router."""

from itertools import combinations
from typing import Dict, List, Literal
from typing import Dict, List, Literal, Optional

from openbb_core.app.model.example import APIEx, PythonEx
from openbb_core.app.model.obbject import OBBject
Expand Down Expand Up @@ -910,3 +910,76 @@ def panel_fmac(
exogenous = sm.add_constant(X)
results = FamaMacBeth(y, exogenous).fit()
return OBBject(results={"results": results})


@router.command(
    methods=["POST"],
    include_in_schema=False,
    examples=[
        PythonEx(
            description="Calculate the variance inflation factor.",
            code=[
                "stock_data = obb.equity.price.historical(symbol='TSLA', start_date='2023-01-01', provider='yfinance').to_df()",  # noqa: E501 pylint: disable= C0301
                'obb.econometrics.variance_inflation_factor(data=stock_data, columns=["close", "volume"])',  # noqa: E501 pylint: disable= C0301
            ],
        ),
    ],
)
def variance_inflation_factor(
    data: List[Data], columns: Optional[list] = None
) -> OBBject[List[Data]]:
    """Calculate VIF (variance inflation factor), which tests for collinearity.

    It quantifies the severity of multicollinearity in an ordinary least squares regression analysis. The square
    root of the variance inflation factor indicates how much larger the standard error of a coefficient is compared
    to what it would be if that variable had 0 correlation to the other predictor variables in the model.

    It is defined as:

    $ VIF_i = 1 / (1 - R_i^2) $
    where $ R_i^2 $ is the coefficient of determination of the regression in which the i:th column is the dependent
    variable and all the other columns are the explanatory variables.

    A VIF over 5 indicates high collinearity and correlation. Values over 10 indicate serious problems, while a
    value of 1 indicates no correlation. Thus VIF values between 1 and 5 are most commonly considered acceptable.
    In order to improve the results one can often remove a column with a high VIF.

    For further information see: https://en.wikipedia.org/wiki/Variance_inflation_factor

    Parameters
    ----------
    data: List[Data]
        Dataset to calculate VIF on.
    columns: Optional[list]
        The columns to test for collinearity. When None, every column of the dataset is used.
        Columns of object, datetime or timedelta type are ignored.

    Returns
    -------
    OBBject[List[Data]]
        The resulting VIF values for the selected columns, keyed by column name.
    """
    # pylint: disable=import-outside-toplevel
    from openbb_core.app.utils import (
        basemodel_to_df,
        df_to_basemodel,
    )
    from pandas import DataFrame
    from statsmodels.stats.outliers_influence import variance_inflation_factor as vif
    from statsmodels.tools.tools import add_constant

    # Convert to pandas dataframe
    dataset = basemodel_to_df(data)
    selected = dataset if columns is None else dataset[columns]

    # Add a constant so that each auxiliary regression includes an intercept.
    with_const = add_constant(selected)

    # add_constant prepends the constant column by default, but it adds nothing
    # when the data already contains a constant column (has_constant="skip").
    # Only skip the leading column when it really is the one that was added,
    # otherwise the first user column would be silently left out of the result.
    first_data_column = 1 if with_const.shape[1] > selected.shape[1] else 0

    # Remove date and string type because VIF doesn't work for these types
    design = with_const.select_dtypes(exclude=["object", "datetime", "timedelta"])

    # Build the design matrix once instead of on every loop iteration.
    design_matrix = design.values

    # Calculate the VIF values
    vif_values: dict = {
        f"{column_name}": vif(design_matrix, position)
        for position, column_name in enumerate(design.columns)
        if position >= first_data_column
    }

    results = df_to_basemodel(DataFrame(vif_values, index=[0]))
    return OBBject(results=results)