Add Gaussian Noise Option to SyntheticBanditDataset #188

Open · wants to merge 2 commits into base: master
52 changes: 36 additions & 16 deletions obp/dataset/synthetic.py
@@ -77,6 +77,11 @@ class SyntheticBanditDataset(BaseBanditDataset):
         A larger value leads to a noisier reward distribution.
         This argument is valid only when `reward_type="continuous"`.
 
+    reward_noise_distribution: str, default='normal'
+        Distribution from which the reward noise is sampled; must be either 'normal' or 'truncated_normal'.
+        If 'truncated_normal' is given, the logged dataset contains no negative reward realizations.
+        This argument is valid only when `reward_type="continuous"`.
+
     action_context: np.ndarray, default=None
         Vector representation of (discrete) actions.
         If None, one-hot representation will be used.
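
A minimal usage sketch of the argument documented above, assuming this branch is installed; the remaining constructor arguments are existing SyntheticBanditDataset options and the values are illustrative:

    from obp.dataset import SyntheticBanditDataset, linear_reward_function

    dataset = SyntheticBanditDataset(
        n_actions=10,
        dim_context=5,
        reward_type="continuous",
        reward_function=linear_reward_function,
        reward_std=1.0,
        reward_noise_distribution="normal",  # or "truncated_normal" for the previous behavior
        random_state=12345,
    )
    bandit_feedback = dataset.obtain_batch_bandit_feedback(n_rounds=1000)
    # with "normal" noise, realized rewards can fall below zero
    print(bandit_feedback["reward"].min())
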
@@ -177,6 +182,7 @@ class SyntheticBanditDataset(BaseBanditDataset):
     reward_type: str = RewardType.BINARY.value
     reward_function: Optional[Callable[[np.ndarray, np.ndarray], np.ndarray]] = None
     reward_std: float = 1.0
+    reward_noise_distribution: str = "normal"
     action_context: Optional[np.ndarray] = None
     behavior_policy_function: Optional[
         Callable[[np.ndarray, np.ndarray], np.ndarray]
@@ -211,6 +217,12 @@ def __post_init__(self) -> None:
                 f"`reward_type` must be either '{RewardType.BINARY.value}' or '{RewardType.CONTINUOUS.value}',"
                 f"but {self.reward_type} is given.'"
             )
+        if self.reward_noise_distribution not in ["normal", "truncated_normal"]:
+            raise ValueError(
+                f"`reward_noise_distribution` must be either 'normal' or 'truncated_normal',"
+                f"but {self.reward_noise_distribution} is given.'"
+            )
+
         check_scalar(self.reward_std, "reward_std", (int, float), min_val=0)
         if self.reward_function is None:
             self.expected_reward = self.sample_contextfree_expected_reward()
@@ -263,16 +275,23 @@ def sample_reward_given_expected_reward(
         if RewardType(self.reward_type) == RewardType.BINARY:
             reward = self.random_.binomial(n=1, p=expected_reward_factual)
         elif RewardType(self.reward_type) == RewardType.CONTINUOUS:
-            mean = expected_reward_factual
-            a = (self.reward_min - mean) / self.reward_std
-            b = (self.reward_max - mean) / self.reward_std
-            reward = truncnorm.rvs(
-                a=a,
-                b=b,
-                loc=mean,
-                scale=self.reward_std,
-                random_state=self.random_state,
-            )
+            if self.reward_noise_distribution == "normal":
+                reward = self.random_.normal(
+                    loc=expected_reward_factual,
+                    scale=self.reward_std,
+                    size=action.shape,
+                )
+            elif self.reward_noise_distribution == "truncated_normal":
+                mean = expected_reward_factual
+                a = (self.reward_min - mean) / self.reward_std
+                b = (self.reward_max - mean) / self.reward_std
+                reward = truncnorm.rvs(
+                    a=a,
+                    b=b,
+                    loc=mean,
+                    scale=self.reward_std,
+                    random_state=self.random_state,
+                )
+            else:
+                raise NotImplementedError
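
For intuition, a standalone sketch (not part of the diff; the variable values are illustrative stand-ins for the self.* attributes) contrasting the two noise models above: the 'normal' branch can produce rewards outside [reward_min, reward_max], while the 'truncated_normal' branch standardizes the bounds as a = (reward_min - mean) / std and b = (reward_max - mean) / std before calling scipy.stats.truncnorm.

    import numpy as np
    from scipy.stats import truncnorm

    rng = np.random.RandomState(12345)
    expected_reward_factual = np.array([0.2, 0.5, 0.8])  # mean rewards of the chosen actions
    reward_std, reward_min, reward_max = 1.0, 0.0, 5.0   # illustrative values

    # 'normal': plain Gaussian noise around the expected reward (can go negative)
    reward_normal = rng.normal(loc=expected_reward_factual, scale=reward_std)

    # 'truncated_normal': the same noise, truncated to [reward_min, reward_max]
    a = (reward_min - expected_reward_factual) / reward_std
    b = (reward_max - expected_reward_factual) / reward_std
    reward_truncated = truncnorm.rvs(
        a=a, b=b, loc=expected_reward_factual, scale=reward_std, random_state=12345
    )

    print(reward_normal)     # may contain values below reward_min
    print(reward_truncated)  # always lies within [reward_min, reward_max]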

@@ -329,12 +348,13 @@ def obtain_batch_bandit_feedback(self, n_rounds: int) -> BanditFeedback:
         expected_reward_ = self.calc_expected_reward(contexts)
         if RewardType(self.reward_type) == RewardType.CONTINUOUS:
             # correct expected_reward_, as we use truncated normal distribution here
-            mean = expected_reward_
-            a = (self.reward_min - mean) / self.reward_std
-            b = (self.reward_max - mean) / self.reward_std
-            expected_reward_ = truncnorm.stats(
-                a=a, b=b, loc=mean, scale=self.reward_std, moments="m"
-            )
+            if self.reward_noise_distribution == "truncated_normal":
+                mean = expected_reward_
+                a = (self.reward_min - mean) / self.reward_std
+                b = (self.reward_max - mean) / self.reward_std
+                expected_reward_ = truncnorm.stats(
+                    a=a, b=b, loc=mean, scale=self.reward_std, moments="m"
+                )
 
         # calculate the action choice probabilities of the behavior policy
         if self.behavior_policy_function is None:
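
Note on the hunk above: the expected_reward_ correction is now applied only under truncated-normal noise, which matches the sampling logic, since truncation shifts the realized mean away from loc while plain Gaussian noise leaves it at loc. A small standalone check with illustrative numbers (not part of the diff):

    from scipy.stats import truncnorm

    loc, scale = 0.2, 1.0            # expected reward and reward_std
    reward_min, reward_max = 0.0, 5.0
    a = (reward_min - loc) / scale
    b = (reward_max - loc) / scale

    truncated_mean = truncnorm.stats(a=a, b=b, loc=loc, scale=scale, moments="m")
    # truncation at 0 pushes the mean above loc: roughly 0.88 here vs. 0.2
    print(loc, float(truncated_mean))
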
18 changes: 12 additions & 6 deletions obp/dataset/synthetic_multi.py
@@ -74,6 +74,11 @@ class SyntheticMultiLoggersBanditDataset(SyntheticBanditDataset):
         A larger value leads to a noisier reward distribution.
         This argument is valid only when `reward_type="continuous"`.
 
+    reward_noise_distribution: str, default='normal'
+        Distribution from which the reward noise is sampled; must be either 'normal' or 'truncated_normal'.
+        If 'truncated_normal' is given, the logged dataset contains no negative reward realizations.
+        This argument is valid only when `reward_type="continuous"`.
+
     action_context: np.ndarray, default=None
         Vector representation of (discrete) actions.
         If None, one-hot representation will be used.
@@ -272,12 +277,13 @@ def obtain_batch_bandit_feedback(self, n_rounds: int) -> BanditFeedback:
         expected_reward_ = self.calc_expected_reward(contexts)
         if RewardType(self.reward_type) == RewardType.CONTINUOUS:
             # correct expected_reward_, as we use truncated normal distribution here
-            mean = expected_reward_
-            a = (self.reward_min - mean) / self.reward_std
-            b = (self.reward_max - mean) / self.reward_std
-            expected_reward_ = truncnorm.stats(
-                a=a, b=b, loc=mean, scale=self.reward_std, moments="m"
-            )
+            if self.reward_noise_distribution == "truncated_normal":
+                mean = expected_reward_
+                a = (self.reward_min - mean) / self.reward_std
+                b = (self.reward_max - mean) / self.reward_std
+                expected_reward_ = truncnorm.stats(
+                    a=a, b=b, loc=mean, scale=self.reward_std, moments="m"
+                )
 
         # calculate the action choice probabilities of the behavior policy
         pi_b_logits = expected_reward_
1 change: 1 addition & 0 deletions obp/policy/offline.py
@@ -441,6 +441,7 @@ def fit(
             raise ValueError("When `self.len_list > 1`, `position` must be given.")
 
         unif_action_dist = np.ones((context.shape[0], self.n_actions, self.len_list))
+        unif_action_dist /= self.n_actions
         self.q_estimator.fit(
             context=context,
             action=action,
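
The one-line change above turns the all-ones array into a proper uniform distribution over actions, so each (round, position) slice sums to 1 before it is passed to the Q-estimator. A quick illustrative check (shapes and names chosen for the example):

    import numpy as np

    n_rounds, n_actions, len_list = 4, 3, 2
    unif_action_dist = np.ones((n_rounds, n_actions, len_list))
    unif_action_dist /= n_actions

    # every (round, position) slice now sums to 1 over the action axis
    assert np.allclose(unif_action_dist.sum(axis=1), 1.0)
    print(unif_action_dist[0, :, 0])  # [0.33333333 0.33333333 0.33333333]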