talmolab · sheridana · Jun 30, 2023 · Jun 30, 2023 · Jun 30, 2023 · Jun 30, 2023
diff --git a/biogtr/config.py b/biogtr/config.py
@@ -1,8 +1,9 @@
 # to implement - config class that handles getters/setters
 """Data structures for handling config parsing."""
+from biogtr.datasets.cell_tracking_dataset import CellTrackingDataset
+from biogtr.datasets.data_utils import FixedRandomSampler
 from biogtr.datasets.microscopy_dataset import MicroscopyDataset
 from biogtr.datasets.sleap_dataset import SleapDataset
-from biogtr.datasets.cell_tracking_dataset import CellTrackingDataset
 from biogtr.models.global_tracking_transformer import GlobalTrackingTransformer
 from biogtr.models.gtr_runner import GTRRunner
 from biogtr.models.model_utils import init_optimizer, init_scheduler, init_logger
@@ -180,12 +181,28 @@
  )
  else:
  generator = None
+
+ if "sampler_params" in dataloader_params:
+ sampler_params = dataloader_params.sampler_params
+ sampler = FixedRandomSampler(dataset, **sampler_params)
+
+ # make sure shuffle is set to false if using fixed random sampler
+ if "shuffle" in dataloader_params:
+ dataloader_params["shuffle"] = False
+
+ # now remove from dataloader_params
+ del dataloader_params["sampler_params"]
+
+ else:
+ sampler = None
+
  return torch.utils.data.DataLoader(
  dataset=dataset,
  batch_size=1,
  pin_memory=pin_memory,
  generator=generator,
  collate_fn=dataset.no_batching_fn,
+ sampler=sampler,
  **dataloader_params,
  )
 

diff --git a/biogtr/datasets/cell_tracking_dataset.py b/biogtr/datasets/cell_tracking_dataset.py
@@ -28,7 +28,7 @@ def __init__(
  clip_length: int = 10,
  mode: str = "train",
  augmentations: Optional[dict] = None,
- gt_list: str = None,
+ gt_list: Optional[str] = None,
  ):
  """Initialize CellTrackingDataset.
 

diff --git a/biogtr/datasets/data_utils.py b/biogtr/datasets/data_utils.py
@@ -2,7 +2,7 @@
 from PIL import Image
 from numpy.typing import ArrayLike
 from torchvision.transforms import functional as tvf
-from typing import List, Dict
+from typing import List, Dict, Optional
 from xml.etree import cElementTree as et
 import albumentations as A
 import math
@@ -473,3 +473,55 @@
 
  plt.tight_layout()
  plt.show()
+
+
+class FixedRandomSampler(torch.utils.data.Sampler):
+ """Custom sampler class."""
+
+ def __init__(
+ self,
+ dataset: torch.utils.data.Dataset,
+ seed: int = 1234,
+ num_epochs: Optional[int] = None,
+ ):
+ """Custom sampler that generates indices using a fixed random seed.
+
+ Args:
+ dataset: The dataset to sample from.
+ seed: The seed for the random number generator. Default is 1234.
+ num_epochs: The number of epochs to generate indices
+ for. Allows for random sampling over epochs.
+ """
+ self.num_samples = len(dataset)
+ self.seed = seed
+ if num_epochs is None:
+ self.size = self.num_samples
+ else:
+ self.size = self.num_samples * num_epochs
+ self.indices = self._generate_indices()
+
+ def _generate_indices(self):
+ """Generates indices for sampling.
+
+ Returns:
+ indices (list): The list of generated indices.
+ """
+ rng = np.random.default_rng(seed=self.seed)
+ indices = rng.choice(range(self.num_samples), size=self.size, replace=True)
+ return indices
+
+ def __iter__(self):
+ """Iterator for generating indices.
+
+ Returns:
+ iterator (iter): An iterator of the generated indices.
+ """
+ return iter(self.indices)
+
+ def __len__(self):
+ """Returns the total number of indices.
+
+ Returns:
+ size (int): The total number of indices.
+ """
+ return self.size
diff --git a/biogtr/datasets/microscopy_dataset.py b/biogtr/datasets/microscopy_dataset.py
@@ -4,6 +4,7 @@
 from biogtr.datasets.base_dataset import BaseDataset
 from torch.utils.data import Dataset
 from torchvision.transforms import functional as tvf
+from typing import Optional
 import albumentations as A
 import numpy as np
 import random
@@ -23,7 +24,7 @@ def __init__(
  chunk: bool = False,
  clip_length: int = 10,
  mode: str = "Train",
- augmentations: dict = None,
+ augmentations: Optional[dict] = None,
  ):
  """Initialize MicroscopyDataset.
 

diff --git a/biogtr/datasets/sleap_dataset.py b/biogtr/datasets/sleap_dataset.py
@@ -1,14 +1,14 @@
 """Module containing logic for loading sleap datasets."""
+from biogtr.datasets import data_utils
+from biogtr.datasets.base_dataset import BaseDataset
+from torchvision.transforms import functional as tvf
+from typing import List, Optional
 import albumentations as A
-import torch
 import imageio
 import numpy as np
-import sleap_io as sio
 import random
-from biogtr.datasets import data_utils
-from biogtr.datasets.base_dataset import BaseDataset
-from torchvision.transforms import functional as tvf
-from typing import List
+import sleap_io as sio
+import torch
 
 
 class SleapDataset(BaseDataset):
@@ -23,7 +23,7 @@ def __init__(
  chunk: bool = True,
  clip_length: int = 500,
  mode: str = "train",
- augmentations: dict = None,
+ augmentations: Optional[dict] = None,
  ):
  """Initialize SleapDataset.
 
@@ -137,6 +137,7 @@ def get_instances(self, label_idx: List[int], frame_idx: List[int]) -> list[dict
  gt_track_ids, bboxes, crops, poses, shown_poses = [], [], [], [], []
 
  i = int(i)
+ print(i)
 
  lf = video[i]
  img = vid_reader.get_data(i)

diff --git a/tests/test_datasets.py b/tests/test_datasets.py
@@ -1,11 +1,12 @@
 """Test dataset logic."""
 from biogtr.datasets.base_dataset import BaseDataset
-from biogtr.datasets.data_utils import get_max_padding
+from biogtr.datasets.data_utils import get_max_padding, FixedRandomSampler
 from biogtr.datasets.microscopy_dataset import MicroscopyDataset
 from biogtr.datasets.sleap_dataset import SleapDataset
 from biogtr.datasets.tracking_dataset import TrackingDataset
 from biogtr.datasets.cell_tracking_dataset import CellTrackingDataset
-from torch.utils.data import DataLoader
+from torch.utils.data import DataLoader, TensorDataset
+import numpy as np
 import pytest
 import torch
 
@@ -388,3 +389,38 @@ def test_augmentations(two_flies, ten_icy_particles):
  b = augs_instances[0]["crops"]
 
  assert not torch.all(a.eq(b))
+
+
+def test_fixed_random_sampler():
+ """Test FixedRandomSampler logic."""
+
+ # dummy dataset
+ dataset = TensorDataset(torch.rand(100, 10))
+
+ random_seed = 12345
+
+ # Test only seed
+ sampler = FixedRandomSampler(dataset, seed=random_seed)
+ sample_indices = list(iter(sampler))
+ assert len(sample_indices) == len(dataset)
+ assert len(set(sample_indices)) <= len(dataset)
+
+ # Test both seed and num_epochs
+ sampler = FixedRandomSampler(dataset, seed=random_seed, num_epochs=2)
+ sample_indices = list(iter(sampler))
+ assert len(sample_indices) == 2 * len(dataset)
+
+ # Test fixed seed sampling consistency
+ sampler1 = FixedRandomSampler(dataset, seed=random_seed)
+ sampler2 = FixedRandomSampler(dataset, seed=random_seed)
+ sample_indices1 = list(iter(sampler1))
+ sample_indices2 = list(iter(sampler2))
+ np.testing.assert_array_equal(sample_indices1, sample_indices2)
+
+ # Test different seeds give different results
+ sampler1 = FixedRandomSampler(dataset, seed=random_seed)
+ sampler2 = FixedRandomSampler(dataset, seed=random_seed + 1)
+ sample_indices1 = list(iter(sampler1))
+ sample_indices2 = list(iter(sampler2))
+ with pytest.raises(AssertionError):
+ np.testing.assert_array_equal(sample_indices1, sample_indices2)