diff --git a/obp/dataset/synthetic.py b/obp/dataset/synthetic.py
index f1f0232..2434eb7 100644
--- a/obp/dataset/synthetic.py
+++ b/obp/dataset/synthetic.py
@@ -77,6 +77,11 @@ class SyntheticBanditDataset(BaseBanditDataset):
         A larger value leads to a noisier reward distribution.
         This argument is valid only when `reward_type="continuous"`.
 
+    reward_noise_distribution: str, default='normal'
+        Distribution from which the reward noise is sampled; must be either 'normal' or 'truncated_normal'.
+        If 'truncated_normal' is given, there are no negative reward realizations in the logged dataset.
+        This argument is valid only when `reward_type="continuous"`.
+
     action_context: np.ndarray, default=None
         Vector representation of (discrete) actions.
         If None, one-hot representation will be used.
@@ -177,6 +182,7 @@ class SyntheticBanditDataset(BaseBanditDataset):
     reward_type: str = RewardType.BINARY.value
     reward_function: Optional[Callable[[np.ndarray, np.ndarray], np.ndarray]] = None
     reward_std: float = 1.0
+    reward_noise_distribution: str = "normal"
     action_context: Optional[np.ndarray] = None
     behavior_policy_function: Optional[
         Callable[[np.ndarray, np.ndarray], np.ndarray]
@@ -211,6 +217,12 @@ def __post_init__(self) -> None:
                 f"`reward_type` must be either '{RewardType.BINARY.value}' or '{RewardType.CONTINUOUS.value}',"
                 f"but {self.reward_type} is given.'"
             )
+        if self.reward_noise_distribution not in ["normal", "truncated_normal"]:
+            raise ValueError(
+                f"`reward_noise_distribution` must be either 'normal' or 'truncated_normal',"
+                f"but {self.reward_noise_distribution} is given.'"
+            )
+
         check_scalar(self.reward_std, "reward_std", (int, float), min_val=0)
         if self.reward_function is None:
             self.expected_reward = self.sample_contextfree_expected_reward()
@@ -263,16 +275,23 @@ def sample_reward_given_expected_reward(
         if RewardType(self.reward_type) == RewardType.BINARY:
             reward = self.random_.binomial(n=1, p=expected_reward_factual)
         elif RewardType(self.reward_type) == RewardType.CONTINUOUS:
-            mean = expected_reward_factual
-            a = (self.reward_min - mean) / self.reward_std
-            b = (self.reward_max - mean) / self.reward_std
-            reward = truncnorm.rvs(
-                a=a,
-                b=b,
-                loc=mean,
-                scale=self.reward_std,
-                random_state=self.random_state,
-            )
+            if self.reward_noise_distribution == "normal":
+                reward = self.random_.normal(
+                    loc=expected_reward_factual,
+                    scale=self.reward_std,
+                    size=action.shape,
+                )
+            elif self.reward_noise_distribution == "truncated_normal":
+                mean = expected_reward_factual
+                a = (self.reward_min - mean) / self.reward_std
+                b = (self.reward_max - mean) / self.reward_std
+                reward = truncnorm.rvs(
+                    a=a,
+                    b=b,
+                    loc=mean,
+                    scale=self.reward_std,
+                    random_state=self.random_state,
+                )
         else:
             raise NotImplementedError
@@ -329,12 +348,13 @@ def obtain_batch_bandit_feedback(self, n_rounds: int) -> BanditFeedback:
         expected_reward_ = self.calc_expected_reward(contexts)
         if RewardType(self.reward_type) == RewardType.CONTINUOUS:
             # correct expected_reward_, as we use truncated normal distribution here
-            mean = expected_reward_
-            a = (self.reward_min - mean) / self.reward_std
-            b = (self.reward_max - mean) / self.reward_std
-            expected_reward_ = truncnorm.stats(
-                a=a, b=b, loc=mean, scale=self.reward_std, moments="m"
-            )
+            if self.reward_noise_distribution == "truncated_normal":
+                mean = expected_reward_
+                a = (self.reward_min - mean) / self.reward_std
+                b = (self.reward_max - mean) / self.reward_std
+                expected_reward_ = truncnorm.stats(
+                    a=a, b=b, loc=mean, scale=self.reward_std, moments="m"
+                )
 
         # calculate the action choice probabilities of the behavior policy
         if self.behavior_policy_function is None:
diff --git a/obp/dataset/synthetic_multi.py b/obp/dataset/synthetic_multi.py
index 1bf3f8a..5b3697a 100644
--- a/obp/dataset/synthetic_multi.py
+++ b/obp/dataset/synthetic_multi.py
@@ -74,6 +74,11 @@ class SyntheticMultiLoggersBanditDataset(SyntheticBanditDataset):
         A larger value leads to a noisier reward distribution.
         This argument is valid only when `reward_type="continuous"`.
 
+    reward_noise_distribution: str, default='normal'
+        Distribution from which the reward noise is sampled; must be either 'normal' or 'truncated_normal'.
+        If 'truncated_normal' is given, there are no negative reward realizations in the logged dataset.
+        This argument is valid only when `reward_type="continuous"`.
+
     action_context: np.ndarray, default=None
         Vector representation of (discrete) actions.
         If None, one-hot representation will be used.
@@ -272,12 +277,13 @@ def obtain_batch_bandit_feedback(self, n_rounds: int) -> BanditFeedback:
         expected_reward_ = self.calc_expected_reward(contexts)
         if RewardType(self.reward_type) == RewardType.CONTINUOUS:
             # correct expected_reward_, as we use truncated normal distribution here
-            mean = expected_reward_
-            a = (self.reward_min - mean) / self.reward_std
-            b = (self.reward_max - mean) / self.reward_std
-            expected_reward_ = truncnorm.stats(
-                a=a, b=b, loc=mean, scale=self.reward_std, moments="m"
-            )
+            if self.reward_noise_distribution == "truncated_normal":
+                mean = expected_reward_
+                a = (self.reward_min - mean) / self.reward_std
+                b = (self.reward_max - mean) / self.reward_std
+                expected_reward_ = truncnorm.stats(
+                    a=a, b=b, loc=mean, scale=self.reward_std, moments="m"
+                )
 
         # calculate the action choice probabilities of the behavior policy
         pi_b_logits = expected_reward_
diff --git a/obp/policy/offline.py b/obp/policy/offline.py
index 21041ac..b0d0ed0 100644
--- a/obp/policy/offline.py
+++ b/obp/policy/offline.py
@@ -441,6 +441,7 @@ def fit(
             raise ValueError("When `self.len_list > 1`, `position` must be given.")
 
         unif_action_dist = np.ones((context.shape[0], self.n_actions, self.len_list))
+        unif_action_dist /= self.n_actions
         self.q_estimator.fit(
            context=context,
            action=action,
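
Note for reviewers (not part of the patch): a minimal usage sketch of the new `reward_noise_distribution` argument. It assumes the patch is applied and that `SyntheticBanditDataset` and `linear_reward_function` are imported from `obp.dataset` as in the existing examples; all numeric values are illustrative.

```python
# Minimal sketch: continuous rewards with the new `reward_noise_distribution` argument.
from obp.dataset import SyntheticBanditDataset, linear_reward_function

dataset = SyntheticBanditDataset(
    n_actions=10,
    dim_context=5,
    reward_type="continuous",
    reward_function=linear_reward_function,
    reward_std=1.0,
    reward_noise_distribution="normal",  # "truncated_normal" reproduces the pre-patch behavior
    random_state=12345,
)
bandit_feedback = dataset.obtain_batch_bandit_feedback(n_rounds=10000)
# With "normal" noise, realized rewards can be negative;
# with "truncated_normal" they stay within [reward_min, reward_max].
print(bandit_feedback["reward"].min())
```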
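The `expected_reward_` correction in `obtain_batch_bandit_feedback` is now guarded because only the truncated normal's mean differs from its `loc` parameter; a plain normal needs no correction. A small sketch of that correction, using illustrative values rather than the dataset defaults:

```python
# Why the correction applies only to "truncated_normal": truncation shifts the mean
# away from `loc`, so the expected reward must be recomputed via truncnorm.stats;
# a plain normal's mean already equals `loc`.
import numpy as np
from scipy.stats import truncnorm

reward_min, reward_max, reward_std = 0.0, 5.0, 1.0  # illustrative values
mean = np.array([0.1, 0.5, 0.9])
a = (reward_min - mean) / reward_std
b = (reward_max - mean) / reward_std
corrected = truncnorm.stats(a=a, b=b, loc=mean, scale=reward_std, moments="m")
print(mean)       # [0.1 0.5 0.9]
print(corrected)  # shifted upward, since the left tail below reward_min is cut off
```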
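The one-line change in `obp/policy/offline.py` normalizes the uniform action distribution passed to the q-estimator: `np.ones(...)` alone gives every action probability 1, so dividing by `n_actions` is required for the probabilities to sum to 1. A quick check of that invariant with illustrative shapes:

```python
# Invariant restored by the offline.py fix: a uniform action distribution must
# sum to 1 over the action axis, which np.ones(...) alone does not satisfy.
import numpy as np

n_rounds, n_actions, len_list = 4, 5, 1  # illustrative shapes
unif_action_dist = np.ones((n_rounds, n_actions, len_list))
unif_action_dist /= n_actions
assert np.allclose(unif_action_dist.sum(axis=1), 1.0)
```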