From 7ae452c04211236b1bc84d6849848a783a09421d Mon Sep 17 00:00:00 2001
From: aaprasad <aaprasad.ucsd.edu>
Date: Tue, 19 Sep 2023 16:44:06 -0700
Subject: [PATCH 01/14] temp fix to allow for 1 chunk by rounding floats

---
 biogtr/datasets/base_dataset.py          | 9 ++++-----
 biogtr/datasets/cell_tracking_dataset.py | 3 ---
 biogtr/datasets/microscopy_dataset.py    | 3 ---
 3 files changed, 4 insertions(+), 11 deletions(-)

diff --git a/biogtr/datasets/base_dataset.py b/biogtr/datasets/base_dataset.py
index 5bdd20c..b63dab7 100644
--- a/biogtr/datasets/base_dataset.py
+++ b/biogtr/datasets/base_dataset.py
@@ -49,9 +49,6 @@ def __init__(
         self.n_chunks = n_chunks
         self.seed = seed
 
-        if self.n_chunks > 1.0:
-            self.n_chunks = int(self.n_chunks)
-
         # if self.seed is not None:
         #     np.random.seed(self.seed)
 
@@ -80,11 +77,13 @@ def create_chunks(self):
                 frame_idx_split = torch.split(frame_idx, self.clip_length)
                 self.chunked_frame_idx.extend(frame_idx_split)
                 self.label_idx.extend(len(frame_idx_split) * [i])
-            
+                
             if self.n_chunks > 0 and self.n_chunks <= 1.0:
                 n_chunks = int(self.n_chunks * len(self.chunked_frame_idx))
+
             elif self.n_chunks <= len(self.chunked_frame_idx):
-                n_chunks = self.n_chunks
+                n_chunks = int(self.n_chunks)
+                
             else:
                 n_chunks = len(self.chunked_frame_idx)
 
diff --git a/biogtr/datasets/cell_tracking_dataset.py b/biogtr/datasets/cell_tracking_dataset.py
index 3ba2284..00d0db8 100644
--- a/biogtr/datasets/cell_tracking_dataset.py
+++ b/biogtr/datasets/cell_tracking_dataset.py
@@ -80,9 +80,6 @@ def __init__(
         self.n_chunks = n_chunks
         self.seed = seed
 
-        if self.n_chunks > 1.0:
-            self.n_chunks = int(self.n_chunks)
-
         # if self.seed is not None:
         #     np.random.seed(self.seed)
 
diff --git a/biogtr/datasets/microscopy_dataset.py b/biogtr/datasets/microscopy_dataset.py
index 39f3391..0ec3d02 100644
--- a/biogtr/datasets/microscopy_dataset.py
+++ b/biogtr/datasets/microscopy_dataset.py
@@ -73,9 +73,6 @@ def __init__(
         self.n_chunks = n_chunks
         self.seed = seed
 
-        if self.n_chunks > 1.0:
-            self.n_chunks = int(self.n_chunks)
-
         # if self.seed is not None:
         #     np.random.seed(self.seed)
 

From f759ec0f1def003a37a23ae7e90183638eb403c2 Mon Sep 17 00:00:00 2001
From: aaprasad <aaprasad.ucsd.edu>
Date: Tue, 19 Sep 2023 16:44:43 -0700
Subject: [PATCH 02/14] load model from checkpoint directly rather than with
 trainer

---
 biogtr/config.py | 34 +++++++++++++++++++++-------------
 1 file changed, 21 insertions(+), 13 deletions(-)

diff --git a/biogtr/config.py b/biogtr/config.py
index e965f64..8b06cf8 100644
--- a/biogtr/config.py
+++ b/biogtr/config.py
@@ -92,20 +92,32 @@ def get_tracker_cfg(self) -> dict:
 
     def get_gtr_runner(self):
         """Get lightning module for training, validation, and inference."""
-        model_params = self.cfg.model
+        
         tracker_params = self.cfg.tracker
         optimizer_params = self.cfg.optimizer
         scheduler_params = self.cfg.scheduler
         loss_params = self.cfg.loss
         gtr_runner_params = self.cfg.runner
-        return GTRRunner(
-            model_params,
-            tracker_params,
-            loss_params,
-            optimizer_params,
-            scheduler_params,
-            **gtr_runner_params,
-        )
+        
+        if self.cfg.model.ckpt_path is not None and self.cfg.model.ckpt_path != "":
+            model = GTRRunner.load_from_checkpoint(self.cfg.model.ckpt_path,
+                                                   tracker_cfg = tracker_params,
+                                                   train_metrics=self.cfg.runner.train_metrics,
+                                                  val_metrics=self.cfg.runner.val_metrics,
+                                                  test_metrics=self.cfg.runner.test_metrics)
+            
+        else:
+            model_params = self.cfg.model
+            model = GTRRunner(
+                model_params,
+                tracker_params,
+                loss_params,
+                optimizer_params,
+                scheduler_params,
+                **gtr_runner_params,
+            )
+
+        return model
 
     def get_dataset(
         self, mode: str
@@ -296,7 +308,3 @@ def get_trainer(
             logger=logger,
             **trainer_params,
         )
-
-    def get_ckpt_path(self):
-        """Get model ckpt path for loading."""
-        return self.cfg.model.ckpt_path

From fb898077844295f76d14f3d9e2e549b2e898b3d5 Mon Sep 17 00:00:00 2001
From: aaprasad <aaprasad.ucsd.edu>
Date: Tue, 19 Sep 2023 16:45:41 -0700
Subject: [PATCH 03/14] change pose_bbox to calculate bbox around midpoint of
 skeleton

---
 biogtr/datasets/data_utils.py | 42 +++++++++++++++++++++--------------
 1 file changed, 25 insertions(+), 17 deletions(-)

diff --git a/biogtr/datasets/data_utils.py b/biogtr/datasets/data_utils.py
index 81db2a4..8ee2d19 100644
--- a/biogtr/datasets/data_utils.py
+++ b/biogtr/datasets/data_utils.py
@@ -2,7 +2,7 @@
 from PIL import Image
 from numpy.typing import ArrayLike
 from torchvision.transforms import functional as tvf
-from typing import List, Dict
+from typing import List, Dict, Union
 from xml.etree import cElementTree as et
 import albumentations as A
 import math
@@ -34,7 +34,7 @@ def crop_bbox(img: torch.Tensor, bbox: ArrayLike) -> torch.Tensor:
 
     Args:
         img: Image as a tensor of shape (channels, height, width).
-        bbox: Bounding box in [x1, y1, x2, y2] format.
+        bbox: Bounding box in [y1, x1, y2, x2] format.
 
     Returns:
         Cropped pixels as tensor of shape (channels, height, width).
@@ -52,7 +52,7 @@ def crop_bbox(img: torch.Tensor, bbox: ArrayLike) -> torch.Tensor:
     return crop
 
 
-def get_bbox(center: ArrayLike, size: int) -> torch.Tensor:
+def get_bbox(center: ArrayLike, size: Union[int, tuple[int]]) -> torch.Tensor:
     """Get a square bbox around a centroid coordinates.
 
     Args:
@@ -62,10 +62,13 @@ def get_bbox(center: ArrayLike, size: int) -> torch.Tensor:
     Returns:
         A torch tensor in form y1, x1, y2, x2
     """
+    if type(size) == int:
+        size = (size, size)
     cx, cy = center[0], center[1]
 
     bbox = torch.Tensor(
-        [-size // 2 + cy, -size // 2 + cx, size // 2 + cy, size // 2 + cx]
+        [-size[-1] // 2 + cy, -size[0] // 2 + cx,
+         size[-1] // 2 + cy, size[0] // 2 + cx]
     )
 
     return bbox
@@ -86,6 +89,7 @@ def centroid_bbox(points: ArrayLike, anchors: list, crop_size: int) -> torch.Ten
     Returns:
         Bounding box in [y1, x1, y2, x2] format.
     """
+    print(anchors)
     for anchor in anchors:
         cx, cy = points[anchor][0], points[anchor][1]
         if not np.isnan(cx):
@@ -104,28 +108,32 @@ def centroid_bbox(points: ArrayLike, anchors: list, crop_size: int) -> torch.Ten
 
 
 def pose_bbox(
-    instance: sio.Instance, padding: int, im_shape: ArrayLike
+    points: np.ndarray, bbox_size: Union[tuple[int], int]
 ) -> torch.Tensor:
     """Calculate bbox around instance pose.
 
     Args:
         instance: a labeled instance in a frame,
-        padding: the amount to pad around the pose crop
-        im_shape: the size of the original image in (w,h)
+        bbox_size: size of bbox either an int indicating square bbox or in (x,y)
 
     Returns:
         Bounding box in [y1, x1, y2, x2] format.
     """
-    w, h = im_shape
-
-    points = torch.Tensor([[p.x, p.y] for p in instance.points])
-
-    min_x = max(torch.nanmin(points[:, 0]) - padding, 0)
-    min_y = max(torch.nanmin(points[:, 1]) - padding, 0)
-    max_x = min(torch.nanmax(points[:, 0]) + padding, w)
-    max_y = min(torch.nanmax(points[:, 1]) + padding, h)
-
-    bbox = torch.Tensor([min_y, min_x, max_y, max_x])
+    if type(bbox_size) == int:
+        bbox_size = (bbox_size, bbox_size)
+    # print(points)
+    minx = np.nanmin(points[:,0], axis=-1)
+    miny = np.nanmin(points[:,-1], axis=-1)
+    minpoints = np.array([minx, miny]).T
+    
+    maxx = np.nanmax(points[:,0], axis=-1)
+    maxy = np.nanmax(points[:,-1], axis=-1)
+    maxpoints = np.array([maxx, maxy]).T
+    
+    c = ((minpoints + maxpoints)/2)
+    
+    bbox = torch.Tensor([c[-1]-bbox_size[-1]/2, c[0] - bbox_size[0]/2,
+                         c[-1] + bbox_size[-1]/2, c[0] + bbox_size[0]/2])
     return bbox
 
 

From 6cc71e76363f653a4023a8a380260a5f85c24ff1 Mon Sep 17 00:00:00 2001
From: aaprasad <aaprasad.ucsd.edu>
Date: Tue, 19 Sep 2023 16:46:20 -0700
Subject: [PATCH 04/14] use pose anchors directly rather than a sorted list

---
 biogtr/datasets/sleap_dataset.py | 73 ++++++++++++++++++++------------
 1 file changed, 45 insertions(+), 28 deletions(-)

diff --git a/biogtr/datasets/sleap_dataset.py b/biogtr/datasets/sleap_dataset.py
index 6b18e8e..c16c4ad 100644
--- a/biogtr/datasets/sleap_dataset.py
+++ b/biogtr/datasets/sleap_dataset.py
@@ -20,6 +20,7 @@ def __init__(
         video_files: list[str],
         padding: int = 5,
         crop_size: int = 128,
+        anchor: str = "",
         chunk: bool = True,
         clip_length: int = 500,
         mode: str = "train",
@@ -34,6 +35,8 @@ def __init__(
             video_files: a list of paths to video files
             padding: amount of padding around object crops
             crop_size: the size of the object crops
+            anchor: the name of the anchor keypoint to be used as centroid for cropping. 
+            If unavailable then crop around the midpoint between all visible anchors.
             chunk: whether or not to chunk the dataset into batches
             clip_length: the number of frames in each chunk
             mode: `train` or `val`. Determines whether this dataset is used for
@@ -70,9 +73,7 @@ def __init__(
         self.mode = mode
         self.n_chunks = n_chunks
         self.seed = seed
-
-        if self.n_chunks > 1.0:
-            self.n_chunks = int(self.n_chunks)
+        self.anchor = anchor
 
         # if self.seed is not None:
         #     np.random.seed(self.seed)
@@ -88,12 +89,7 @@ def __init__(
         # for label in self.labels:
         # label.remove_empty_instances(keep_empty_frames=False)
 
-        self.anchor_names = [
-            data_utils.sorted_anchors(labels) for labels in self.labels
-        ]
-
-        self.frame_idx = [torch.arange(len(label)) for label in self.labels]
-
+        self.frame_idx = [torch.arange(len(labels)) for labels in self.labels]
         # Method in BaseDataset. Creates label_idx and chunked_frame_idx to be
         # used in call to get_instances()
         self.create_chunks()
@@ -134,11 +130,6 @@ def get_instances(self, label_idx: List[int], frame_idx: List[int]) -> list[dict
         """
         video = self.labels[label_idx]
 
-        anchors = [
-            video.skeletons[0].node_names.index(anchor_name)
-            for anchor_name in self.anchor_names[label_idx]
-        ]
-
         video_name = self.video_files[label_idx]
 
         vid_reader = imageio.get_reader(video_name, "ffmpeg")
@@ -147,14 +138,14 @@ def get_instances(self, label_idx: List[int], frame_idx: List[int]) -> list[dict
         crop_shape = (img.shape[-1], *(self.crop_size + 2 * self.padding,) * 2)
 
         instances = []
-
-        for i in frame_idx:
+        for i, frame in enumerate(frame_idx):
             gt_track_ids, bboxes, crops, poses, shown_poses = [], [], [], [], []
 
-            i = int(i)
+            frame = int(frame)
+            
+            lf = video[frame]
 
-            lf = video[i]
-            img = vid_reader.get_data(i)
+            img = vid_reader.get_data(frame)
 
             for instance in lf:
                 gt_track_ids.append(video.tracks.index(instance.track))
@@ -177,6 +168,9 @@ def get_instances(self, label_idx: List[int], frame_idx: List[int]) -> list[dict
                     )
                 )
 
+                shown_poses = [{key: val for key, val in instance.items()
+                                if not np.isnan(val).any()
+                                } for instance in shown_poses]
             # augmentations
             if self.augmentations is not None:
                 for transform in self.augmentations:
@@ -207,18 +201,41 @@ def get_instances(self, label_idx: List[int], frame_idx: List[int]) -> list[dict
                     for aug_pose_arr, pose_dict in zip(aug_poses, shown_poses)
                 ]
 
-                _ = [pose.update(aug_pose) for pose, aug_pose in zip(poses, aug_poses)]
+                _ = [pose.update(aug_pose) for pose, aug_pose in zip(shown_poses, aug_poses)]
 
             img = tvf.to_tensor(img)
 
-            for pose in poses:
-                bbox = data_utils.pad_bbox(
-                    data_utils.centroid_bbox(
-                        np.array(list(pose.values())), anchors, self.crop_size
-                    ),
-                    padding=self.padding,
-                )
+            for pose in shown_poses:
+
+                if self.anchor in pose: 
+                    centroid = pose[self.anchor]
+
+                    if not np.isnan(centroid).any():
+                        bbox = data_utils.pad_bbox(
+                                data_utils.get_bbox(
+                                    centroid, self.crop_size
+                                ),
+                                padding=self.padding,
+                            )
+                        
+                    else:
+                        #print(f'{self.anchor} contains NaN: {centroid}. Using midpoint')
+                        bbox = data_utils.pad_bbox(
+                            data_utils.pose_bbox(
+                                np.array(list(pose.values())), self.crop_size
+                            ),
+                            padding=self.padding,
+                        )
+                else:
+                    #print(f'{self.anchor} not an available option amongst {pose.keys()}. Using midpoint')
+                    bbox = data_utils.pad_bbox(
+                        data_utils.pose_bbox(
+                            np.array(list(pose.values())), self.crop_size
+                        ),
+                        padding=self.padding,
+                    )
 
+                
                 crop = data_utils.crop_bbox(img, bbox)
 
                 bboxes.append(bbox)
@@ -232,7 +249,7 @@ def get_instances(self, label_idx: List[int], frame_idx: List[int]) -> list[dict
                 {
                     "video_id": torch.tensor([label_idx]),
                     "img_shape": torch.tensor([img.shape]),
-                    "frame_id": torch.tensor([i]),
+                    "frame_id": torch.tensor([frame]),
                     "num_detected": torch.tensor([len(bboxes)]),
                     "gt_track_ids": torch.tensor(gt_track_ids),
                     "bboxes": torch.stack(bboxes) if bboxes else torch.empty((0, 4)),

From 1e4edea50fbe9d74a0c8f5bcb65abe151f4c766b Mon Sep 17 00:00:00 2001
From: aaprasad <aaprasad.ucsd.edu>
Date: Tue, 19 Sep 2023 16:46:52 -0700
Subject: [PATCH 05/14] put model in eval mode before extracting visual
 features

---
 biogtr/inference/tracker.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/biogtr/inference/tracker.py b/biogtr/inference/tracker.py
index 1b00ffa..2f055d9 100644
--- a/biogtr/inference/tracker.py
+++ b/biogtr/inference/tracker.py
@@ -75,6 +75,8 @@ def track(self, model: GlobalTrackingTransformer, instances: list[dict], all_ins
         """
 # Extract feature representations with pre-trained encoder.
 
+        _ = model.eval()
+    
         if not self.persistent_tracking:
             # print(f'Clearing Queue after tracking')
             self.track_queue.clear()
@@ -93,7 +95,7 @@ def track(self, model: GlobalTrackingTransformer, instances: list[dict], all_ins
                 # comment out to turn encoder off
 
                 # Assuming the encoder is already trained or train encoder jointly.
-                elif 'features' not in frame or frame['features'] == None:
+                elif 'features' not in frame or frame['features'] == None or len(frame['features']) == 0:
                     with torch.no_grad():
                         z = model.visual_encoder(frame["crops"])
                         frame["features"] = z
@@ -264,6 +266,7 @@ def _run_global_tracker(self, model: GlobalTrackingTransformer, instances, query
 
         # Number of instances in each frame of the window.
         # E.g.: instances_per_frame: [4, 5, 6, 7]; window of length 4 with 4 detected instances in the first frame of the window.
+        
         _ = model.eval()
         instances_per_frame = [frame["num_detected"] for frame in instances]
 
@@ -279,7 +282,8 @@ def _run_global_tracker(self, model: GlobalTrackingTransformer, instances, query
                 instances[query_frame]["embeddings"] = embed
             else:
                 asso_output = model(instances, query_frame=query_frame)
-        
+        # if query_frame == 1:
+        #     print(asso_output)
         asso_output = asso_output[-1].split(instances_per_frame, dim=1)  # (window_size, n_query, N_i)
         asso_output = model_utils.softmax_asso(asso_output)  # (window_size, n_query, N_i)
         asso_output = torch.cat(asso_output, dim=1).cpu()  # (n_query, total_instances)

From 78293b865dfe4384b35eac693210dc07aac0a087 Mon Sep 17 00:00:00 2001
From: aaprasad <aaprasad.ucsd.edu>
Date: Tue, 19 Sep 2023 16:47:23 -0700
Subject: [PATCH 06/14] fix typo - only extract features once

---
 biogtr/models/global_tracking_transformer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/biogtr/models/global_tracking_transformer.py b/biogtr/models/global_tracking_transformer.py
index 8fa40d9..373d368 100644
--- a/biogtr/models/global_tracking_transformer.py
+++ b/biogtr/models/global_tracking_transformer.py
@@ -116,7 +116,7 @@ def forward(
         # Extract feature representations with pre-trained encoder.
         for frame in instances:
             if (frame["num_detected"] > 0).item(): 
-                if "features" in frame.keys() and len(frame["features"]) == 0:
+                if "features" not in frame.keys() or frame['features'] == None or len(frame["features"]) == 0:
                     z = self.visual_encoder(frame["crops"])
                     frame["features"] = z
 

From 3676ccd851da7ac4e0354e1a2da58632cd6ee371 Mon Sep 17 00:00:00 2001
From: aaprasad <aaprasad.ucsd.edu>
Date: Tue, 19 Sep 2023 16:48:00 -0700
Subject: [PATCH 07/14] backwards compatibility with "sw_cnt"

---
 biogtr/inference/metrics.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/biogtr/inference/metrics.py b/biogtr/inference/metrics.py
index 7357c5b..d8d6386 100644
--- a/biogtr/inference/metrics.py
+++ b/biogtr/inference/metrics.py
@@ -250,7 +250,8 @@ def get_pymotmetrics(data: dict, metrics: Union[str, tuple] = "all", key: str =
         "num_timsteps": L,
     }
     """
-
+    if not isinstance(metrics, str):
+        metrics = ["num_switches" if metric.lower() == "sw_cnt" else metric for metric in metrics] #backward compatibility
     acc = mm.MOTAccumulator(auto_id=True)
 
     for i in range(len(data["gt_ids"])):

From f1031be1d19405d1d33ad3db0461899fc47ed162 Mon Sep 17 00:00:00 2001
From: aaprasad <aaprasad.ucsd.edu>
Date: Tue, 19 Sep 2023 16:48:49 -0700
Subject: [PATCH 08/14] load model from checkpoint directly rather than using
 trainer

---
 biogtr/training/train.py | 3 +--
 tests/test_training.py   | 6 +++---
 2 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/biogtr/training/train.py b/biogtr/training/train.py
index 3f02c92..b2505ef 100644
--- a/biogtr/training/train.py
+++ b/biogtr/training/train.py
@@ -99,8 +99,7 @@ def main(cfg: DictConfig):
         devices=devices,
     )
 
-    ckpt_path = train_cfg.get_ckpt_path()
-    trainer.fit(model, dataset, ckpt_path=ckpt_path)
+    trainer.fit(model, dataset)
 
 
 if __name__ == "__main__":
diff --git a/tests/test_training.py b/tests/test_training.py
index a23f8b0..79a65a7 100644
--- a/tests/test_training.py
+++ b/tests/test_training.py
@@ -93,14 +93,14 @@ def test_basic_gtr_runner():
         for i, batch in enumerate(train_ds):
             assert gtr_runner.model.training
             metrics = gtr_runner.training_step(batch, i)
-            assert "loss" in metrics and "sw_cnt" not in metrics
+            assert "loss" in metrics and "num_switches" not in metrics
             assert metrics["loss"].requires_grad
 
         for j, batch in enumerate(train_ds):
             gtr_runner.eval()
             with torch.no_grad():
                 metrics = gtr_runner.validation_step(batch, j)
-            assert "loss" in metrics and "sw_cnt" in metrics
+            assert "loss" in metrics and "num_switches" in metrics
             assert not metrics["loss"].requires_grad
 
         gtr_runner.train()
@@ -109,7 +109,7 @@ def test_basic_gtr_runner():
         gtr_runner.eval()
         with torch.no_grad():
             metrics = gtr_runner.test_step(batch, k)
-        assert "loss" in metrics and "sw_cnt" in metrics
+        assert "loss" in metrics and "num_switches" in metrics
         assert not metrics["loss"].requires_grad
 
 

From 6d9df3bbcaebb4844cb2a1c1c6dc2e39669ff2d5 Mon Sep 17 00:00:00 2001
From: aaprasad <aaprasad.ucsd.edu>
Date: Wed, 20 Sep 2023 10:48:03 -0700
Subject: [PATCH 09/14] fix bug with missing frames

---
 biogtr/inference/tracker.py | 58 +++++++++++++++++++++++--------------
 1 file changed, 37 insertions(+), 21 deletions(-)

diff --git a/biogtr/inference/tracker.py b/biogtr/inference/tracker.py
index 2f055d9..7a28f2c 100644
--- a/biogtr/inference/tracker.py
+++ b/biogtr/inference/tracker.py
@@ -76,10 +76,6 @@ def track(self, model: GlobalTrackingTransformer, instances: list[dict], all_ins
 # Extract feature representations with pre-trained encoder.
 
         _ = model.eval()
-    
-        if not self.persistent_tracking:
-            # print(f'Clearing Queue after tracking')
-            self.track_queue.clear()
 
         for frame in instances:
             if (frame["num_detected"] > 0).item():
@@ -111,6 +107,11 @@ def track(self, model: GlobalTrackingTransformer, instances: list[dict], all_ins
         instances_pred = self.sliding_inference(
             model, instances, window_size=self.window_size, all_instances=all_instances
         )
+        
+        if not self.persistent_tracking:
+            # print(f'Clearing Queue after tracking')
+            self.track_queue.clear()
+            
         return instances_pred
 
     def sliding_inference(self, model: GlobalTrackingTransformer, instances, window_size, all_instances=None):
@@ -153,24 +154,33 @@ def sliding_inference(self, model: GlobalTrackingTransformer, instances, window_
         id_count = 0
 
         for batch_idx in range(video_len):
+                
             if (self.persistent_tracking and instances[batch_idx]['frame_id'] == 0):
                 self.track_queue.clear()
-            if len(self.track_queue) == 0:
-                # print(f'Initializing tracks...')
+                
+            if len(self.track_queue) == 0 or sum([len(frame["pred_track_ids"]) for frame in self.track_queue]) == 0:
+                print(f'Initializing track on batch {batch_idx} frame {instances[batch_idx]["frame_id"]}')
                 instances[batch_idx]["pred_track_ids"] = torch.arange(
-                    0, len(instances[0]["bboxes"])
+                    0, len(instances[batch_idx]["bboxes"])
                 )
 
-                id_count = len(instances[0]["bboxes"])
-
+                id_count = len(instances[batch_idx]["bboxes"])
+                print(f'Initial tracks are {instances[batch_idx]["pred_track_ids"]}')
                 self.track_queue.append(instances[batch_idx])
 
-            else:
+            else:   
                 instances_to_track = (list(self.track_queue) + [instances[batch_idx]])[-window_size:]
-                # if not self.persistent_tracking:
-                #     query_ind = min(window_size - 1, batch_idx)
-                # else:
-                #     query_ind = min(window_size - 1, instances[batch_idx]['frame_id'])
+                
+                if sum([frame['num_detected'] for frame in instances_to_track]) == 0:
+                    print("No detections to track!")
+                    
+                    instances[batch_idx]["pred_track_ids"] = torch.arange(
+                    0, len(instances[batch_idx]["bboxes"])
+                    )
+                    
+                    self.track_queue.append(instances[batch_idx])
+                    continue
+                
                 query_ind = min(window_size - 1, len(instances_to_track) - 1)
                     
                 instances[batch_idx], id_count = self._run_global_tracker(
@@ -266,7 +276,9 @@ def _run_global_tracker(self, model: GlobalTrackingTransformer, instances, query
 
         # Number of instances in each frame of the window.
         # E.g.: instances_per_frame: [4, 5, 6, 7]; window of length 4 with 4 detected instances in the first frame of the window.
-        
+        # print([frame['frame_id'].item() for frame in instances])
+        # print([frame['frame_id'].item() for frame in instances])
+        # print([frame['pred_track_ids'] for frame in instances])
         _ = model.eval()
         instances_per_frame = [frame["num_detected"] for frame in instances]
 
@@ -299,12 +311,16 @@ def _run_global_tracker(self, model: GlobalTrackingTransformer, instances, query
         n_nonquery = (
             total_instances - n_query
         )  # Number of instances in the window not including the current/query frame.
-
-        instance_ids = torch.cat(
-            [x["pred_track_ids"] for batch_idx, x in enumerate(instances) if batch_idx != query_frame], dim=0
-        ).view(
-            n_nonquery
-        )  # (n_nonquery,)
+        
+        try:
+            instance_ids = torch.cat(
+                [x["pred_track_ids"] for batch_idx, x in enumerate(instances) if batch_idx != query_frame], dim=0
+            ).view(
+                n_nonquery
+            )  # (n_nonquery,)
+        except Exception as e:
+            print(instances)
+            raise(e)
 
         query_inds = [x for x in range(sum(instances_per_frame[:query_frame]), sum(instances_per_frame[: query_frame + 1]))]
         nonquery_inds = [i for i in range(total_instances) if i not in query_inds]

From 85931ab729eff29558d8810d7484195b8f0c4915 Mon Sep 17 00:00:00 2001
From: aaprasad <aaprasad.ucsd.edu>
Date: Wed, 20 Sep 2023 10:49:23 -0700
Subject: [PATCH 10/14] remove print statements

---
 biogtr/inference/tracker.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/biogtr/inference/tracker.py b/biogtr/inference/tracker.py
index 7a28f2c..0162587 100644
--- a/biogtr/inference/tracker.py
+++ b/biogtr/inference/tracker.py
@@ -159,13 +159,13 @@ def sliding_inference(self, model: GlobalTrackingTransformer, instances, window_
                 self.track_queue.clear()
                 
             if len(self.track_queue) == 0 or sum([len(frame["pred_track_ids"]) for frame in self.track_queue]) == 0:
-                print(f'Initializing track on batch {batch_idx} frame {instances[batch_idx]["frame_id"]}')
+                # print(f'Initializing track on batch {batch_idx} frame {instances[batch_idx]["frame_id"]}')
                 instances[batch_idx]["pred_track_ids"] = torch.arange(
                     0, len(instances[batch_idx]["bboxes"])
                 )
 
                 id_count = len(instances[batch_idx]["bboxes"])
-                print(f'Initial tracks are {instances[batch_idx]["pred_track_ids"]}')
+                # print(f'Initial tracks are {instances[batch_idx]["pred_track_ids"]}')
                 self.track_queue.append(instances[batch_idx])
 
             else:   

From 121c36498f526f4eff91b199f5c430ab0a825349 Mon Sep 17 00:00:00 2001
From: aaprasad <aaprasad.ucsd.edu>
Date: Thu, 2 Nov 2023 10:38:47 -0700
Subject: [PATCH 11/14] case match anchors, add eval dataset

---
 biogtr/datasets/eval_dataset.py  | 39 ++++++++++++++++++++++++++++++++
 biogtr/datasets/sleap_dataset.py | 23 +++++++++++++++----
 2 files changed, 57 insertions(+), 5 deletions(-)
 create mode 100644 biogtr/datasets/eval_dataset.py

diff --git a/biogtr/datasets/eval_dataset.py b/biogtr/datasets/eval_dataset.py
new file mode 100644
index 0000000..870ec01
--- /dev/null
+++ b/biogtr/datasets/eval_dataset.py
@@ -0,0 +1,39 @@
+"Module containing wrapper for merging gt and pred datasets for evaluation"
+import torch
+from torch.utils.data import Dataset
+
+class EvalDataset(Dataset):
+    
+    def __init__(self, gt_dataset: Dataset, pred_dataset: Dataset):
+        
+        self.gt_dataset = gt_dataset
+        self.pred_dataset = pred_dataset
+        
+    def __len__(self):
+        """Get the size of the dataset.
+
+        Returns:
+            the size or the number of chunks in the dataset
+        """
+        return len(self.gt_dataset)
+        
+    def __getitem__(self, idx: int):
+        """Get an element of the dataset.
+
+        Args:
+            idx: the index of the batch. Note this is not the index of the video
+            or the frame.
+
+        Returns:
+            A list of dicts where each dict corresponds a frame in the chunk and
+            each value is a `torch.Tensor`. Dict elements are the video id, frame id, and gt/pred track ids
+
+        """
+        labels = [{"video_id": gt_frame['video_id'],
+                  "frame_id": gt_frame['video_id'],
+                  "gt_track_ids": gt_frame['gt_track_ids'],
+                  "pred_track_ids": pred_frame['gt_track_ids'],
+                  "bboxes": pred_frame["bboxes"]
+                 } for gt_frame, pred_frame in zip(self.gt_dataset[idx], self.pred_dataset[idx])]
+        
+        return labels
\ No newline at end of file
diff --git a/biogtr/datasets/sleap_dataset.py b/biogtr/datasets/sleap_dataset.py
index c16c4ad..0a2cb33 100644
--- a/biogtr/datasets/sleap_dataset.py
+++ b/biogtr/datasets/sleap_dataset.py
@@ -144,9 +144,13 @@ def get_instances(self, label_idx: List[int], frame_idx: List[int]) -> list[dict
             frame = int(frame)
             
             lf = video[frame]
-
-            img = vid_reader.get_data(frame)
-
+            
+            try:
+                img = vid_reader.get_data(frame)
+            except IndexError as e:
+                print(f"Could not read frame {frame} from {video_name}")
+                continue
+                
             for instance in lf:
                 gt_track_ids.append(video.tracks.index(instance.track))
 
@@ -207,8 +211,17 @@ def get_instances(self, label_idx: List[int], frame_idx: List[int]) -> list[dict
 
             for pose in shown_poses:
 
-                if self.anchor in pose: 
-                    centroid = pose[self.anchor]
+                if self.anchor in pose:
+                    anchor = self.anchor
+                elif self.anchor.lower() in pose:
+                    anchor = self.anchor.lower()
+                elif self.anchor.upper() in pose:
+                    anchor = self.anchor.upper()
+                else:
+                    anchor = "midpoint"
+                    
+                if anchor != "midpoint":
+                    centroid = pose[anchor]
 
                     if not np.isnan(centroid).any():
                         bbox = data_utils.pad_bbox(

From cd6d61a7c57f8b7a7daaa8b9068402129b2e5082 Mon Sep 17 00:00:00 2001
From: aaprasad <aaprasad.ucsd.edu>
Date: Thu, 2 Nov 2023 10:42:47 -0700
Subject: [PATCH 12/14] misc changes: -gtr_runner.py: add logging function,
 condense metrics,persistent tracking -train.py: allow selection of batch task
 when running local -config.py: minor bug fix

---
 biogtr/config.py            |  2 +-
 biogtr/models/gtr_runner.py | 82 +++++++++++++++++++++----------------
 biogtr/training/train.py    |  8 ++--
 3 files changed, 51 insertions(+), 41 deletions(-)

diff --git a/biogtr/config.py b/biogtr/config.py
index 8b06cf8..df32350 100644
--- a/biogtr/config.py
+++ b/biogtr/config.py
@@ -237,7 +237,7 @@ def get_logger(self):
         Returns:
             A Logger with specified params
         """
-        logger_params = self.cfg.logging
+        logger_params = OmegaConf.to_container(self.cfg.logging, resolve=True)
         return init_logger(logger_params)
 
     def get_early_stopping(self) -> pl.callbacks.EarlyStopping:
diff --git a/biogtr/models/gtr_runner.py b/biogtr/models/gtr_runner.py
index cc675de..7fb32e3 100644
--- a/biogtr/models/gtr_runner.py
+++ b/biogtr/models/gtr_runner.py
@@ -24,9 +24,8 @@ def __init__(
         loss_cfg: dict = {},
         optimizer_cfg: dict = None,
         scheduler_cfg: dict = None,
-        train_metrics: list[str] = (""),
-        val_metrics: list[str] = ("num_switches",),
-        test_metrics: list[str] = ("num_switches",),
+        metrics: dict[str,list[str]] = {"train": ["num_switches"], "val": ["num_switches"], "test": ["num_switches"]},
+        persistent_tracking: dict[str, bool] = {"train": False, "val": True, "test": True}
     ):
         """Initialize a lightning module for GTR.
 
@@ -51,10 +50,8 @@ def __init__(
         self.optimizer_cfg = optimizer_cfg
         self.scheduler_cfg = scheduler_cfg
 
-        self.train_metrics = train_metrics
-        self.val_metrics = val_metrics
-        self.test_metrics = test_metrics
-
+        self.metrics = metrics
+        self.persistent_tracking = persistent_tracking
     def forward(self, instances) -> torch.Tensor:
         """The forward pass of the lightning module.
 
@@ -64,7 +61,9 @@ def forward(self, instances) -> torch.Tensor:
         Returns:
             An association matrix between objects
         """
-        return self.model(instances)
+        if sum([frame['num_detected'] for frame in instances]) > 0:
+            return self.model(instances)
+        return None
 
     def training_step(
         self, train_batch: list[dict], batch_idx: int
@@ -79,9 +78,9 @@ def training_step(
         Returns:
             A dict containing the train loss plus any other metrics specified
         """
-        result = self._shared_eval_step(train_batch[0], persistent_tracking=False, eval_metrics=self.train_metrics)
-        for metric, val in result.items():
-            self.log(f"train_{metric}", val, batch_size=len(train_batch[0]))
+        result = self._shared_eval_step(train_batch[0], mode="train")
+        self.log_metrics(result, "train")
+                         
         return result
 
     def validation_step(
@@ -97,9 +96,9 @@ def validation_step(
         Returns:
             A dict containing the val loss plus any other metrics specified
         """
-        result = self._shared_eval_step(val_batch[0], persistent_tracking=True, eval_metrics=self.val_metrics)
-        for metric, val in result.items():
-            self.log(f"val_{metric}", val, batch_size=len(val_batch[0]))
+        result = self._shared_eval_step(val_batch[0], mode = "val")
+        self.log_metrics(result, "val")
+        
         return result
 
     def test_step(self, test_batch: list[dict], batch_idx: int) -> dict[str, float]:
@@ -113,9 +112,9 @@ def test_step(self, test_batch: list[dict], batch_idx: int) -> dict[str, float]:
         Returns:
             A dict containing the val loss plus any other metrics specified
         """
-        result = self._shared_eval_step(test_batch[0], persistent_tracking=True, eval_metrics=self.test_metrics)
-        for metric, val in result.items():
-            self.log(f"val_{metric}", val, batch_size=len(test_batch[0]))
+        result = self._shared_eval_step(test_batch[0], mode="test")
+        self.log_metrics(result, "test")
+        
         return result
 
     def predict_step(self, batch: list[dict], batch_idx: int) -> dict:
@@ -135,31 +134,39 @@ def predict_step(self, batch: list[dict], batch_idx: int) -> dict:
         instances_pred = self.tracker(self.model, batch[0])
         return instances_pred
 
-    def _shared_eval_step(self, instances, persistent_tracking=False, eval_metrics=("num_switches",)):
+    def _shared_eval_step(self, instances, mode):
         """Helper function for running evaluation used by train, test, and val steps.
 
         Args:
             instances: A list of dicts where each dict is a frame containing gt data
-            persistent_tracking: Whether or not to track across chunks. During training this should be set to false due to shuffling.
-            eval_metrics: A list of metrics calculated and saved
+            mode: which metrics to compute and whether to use persistent tracking or not
 
         Returns:
             a dict containing the loss and any other metrics specified by `eval_metrics`
         """
-        if self.model.transformer.return_embedding:
-            logits, _ = self(instances)
-        else:
-            logits = self(instances)
-        loss = self.loss(logits, instances)
-
-        return_metrics = {"loss": loss}
-        if eval_metrics is not None and len(eval_metrics) > 0:
-            self.tracker.persistent_tracking = persistent_tracking
-            instances_pred = self.tracker(self.model, instances)
-            instances_mm = metrics.to_track_eval(instances_pred)
-            clearmot = metrics.get_pymotmetrics(instances_mm, eval_metrics)
-            return_metrics.update(clearmot.to_dict())
-
+        try:
+            eval_metrics = self.metrics[mode]
+            persistent_tracking = self.persistent_tracking[mode]
+            if self.model.transformer.return_embedding:
+                logits, _ = self(instances)
+            else:
+                logits = self(instances)
+
+            if not logits:
+                return None
+
+            loss = self.loss(logits, instances)
+
+            return_metrics = {"loss": loss}
+            if eval_metrics is not None and len(eval_metrics) > 0:
+                self.tracker.persistent_tracking = persistent_tracking
+                instances_pred = self.tracker(self.model, instances)
+                instances_mm = metrics.to_track_eval(instances_pred)
+                clearmot = metrics.get_pymotmetrics(instances_mm, eval_metrics)
+                return_metrics.update(clearmot.to_dict())
+        except Exception as e:
+            print(f'Failed on frame {instances[0]["frame_id"]} of video {instances[0]["video_id"]}')
+            raise(e)
         return return_metrics
 
     def configure_optimizers(self) -> dict:
@@ -187,8 +194,13 @@ def configure_optimizers(self) -> dict:
             "optimizer": optimizer,
             "lr_scheduler": {
                 "scheduler": scheduler,
-                "monitor": "train_loss",
+                "monitor": "val_loss",
                 "interval": "epoch",
                 "frequency": 10,
             },
         }
+    
+    def log_metrics(self, result, mode):
+        if result:
+            for metric, val in result.items():
+                self.log(f"{mode}_{metric}", val, on_step = True, on_epoch=True)
diff --git a/biogtr/training/train.py b/biogtr/training/train.py
index b2505ef..2537f7c 100644
--- a/biogtr/training/train.py
+++ b/biogtr/training/train.py
@@ -41,13 +41,11 @@ def main(cfg: DictConfig):
 
     # update with parameters for batch train job
     if "batch_config" in cfg.keys():
+        
         try:
             index = int(os.environ["POD_INDEX"])
-        # For testing without deploying a job on runai
-        except KeyError:
-            print("Pod Index Not found! Setting index to 0")
-            index = 0
-        print(f"Pod Index: {index}")
+        except KeyError as e:
+            index = int(input("No pod index found, assuming single run!\nPlease input task index to run:"))
 
         hparams_df = pd.read_csv(cfg.batch_config)
         hparams = hparams_df.iloc[index].to_dict()

From cfc55002e8e460f9e9d6ac75a7569ee3c3041955 Mon Sep 17 00:00:00 2001
From: Aaditya Prasad <78439225+aaprasad@users.noreply.github.com>
Date: Mon, 22 Apr 2024 12:40:53 -0700
Subject: [PATCH 13/14] Aadi/track local queues (#20)

Co-authored-by: aaprasad <aaprasad.ucsd.edu>
---
 .gitignore                                   |   1 +
 biogtr/config.py                             |  57 +-
 biogtr/data_structures.py                    | 958 +++++++++++++++++++
 biogtr/datasets/base_dataset.py              |  36 +-
 biogtr/datasets/cell_tracking_dataset.py     | 108 +--
 biogtr/datasets/data_utils.py                |  50 +-
 biogtr/datasets/eval_dataset.py              |  73 +-
 biogtr/datasets/microscopy_dataset.py        |  96 +-
 biogtr/datasets/sleap_dataset.py             | 157 +--
 biogtr/datasets/tracking_dataset.py          |  16 +-
 biogtr/inference/__init__.py                 |   1 +
 biogtr/inference/boxes.py                    |   5 +-
 biogtr/inference/metrics.py                  | 191 ++--
 biogtr/inference/post_processing.py          |  15 +-
 biogtr/inference/track.py                    |  46 +-
 biogtr/inference/track_queue.py              | 306 ++++++
 biogtr/inference/tracker.py                  | 485 ++++++----
 biogtr/models/attention_head.py              |   4 +-
 biogtr/models/embedding.py                   |  37 +-
 biogtr/models/global_tracking_transformer.py |  35 +-
 biogtr/models/gtr_runner.py                  |  91 +-
 biogtr/models/model_utils.py                 |  39 +-
 biogtr/models/transformer.py                 |  76 +-
 biogtr/training/configs/base.yaml            |  33 +-
 biogtr/training/losses.py                    |  12 +-
 biogtr/training/train.py                     |  15 +-
 biogtr/visualize.py                          | 114 ++-
 environment.yml                              |   4 +-
 environment_cpu.yml                          |   1 +
 tests/configs/base.yaml                      |  10 +-
 tests/conftest.py                            |   1 +
 tests/fixtures/configs.py                    |   6 +-
 tests/fixtures/datasets.py                   |   1 +
 tests/fixtures/torch.py                      |   4 +-
 tests/test_data_structures.py                | 205 ++++
 tests/test_datasets.py                       |  53 +-
 tests/test_inference.py                      | 103 +-
 tests/test_models.py                         |  68 +-
 tests/test_training.py                       |  65 +-
 tests/test_version.py                        |   1 +
 40 files changed, 2666 insertions(+), 913 deletions(-)
 create mode 100644 biogtr/data_structures.py
 create mode 100644 biogtr/inference/__init__.py
 create mode 100644 biogtr/inference/track_queue.py
 create mode 100644 tests/test_data_structures.py

diff --git a/.gitignore b/.gitignore
index b24819f..1e8f6ba 100644
--- a/.gitignore
+++ b/.gitignore
@@ -76,6 +76,7 @@ target/
 
 # Jupyter Notebook
 .ipynb_checkpoints
+notebooks/
 
 # IPython
 profile_default/
diff --git a/biogtr/config.py b/biogtr/config.py
index df32350..00a24a1 100644
--- a/biogtr/config.py
+++ b/biogtr/config.py
@@ -10,6 +10,8 @@
 from omegaconf import DictConfig, OmegaConf
 from pprint import pprint
 from typing import Union, Iterable
+from pathlib import Path
+import os
 import pytorch_lightning as pl
 import torch
 
@@ -43,7 +45,7 @@ def __repr__(self):
         return f"Config({self.cfg})"
 
     def __str__(self):
-        """String representation of config class."""
+        """Return a string representation of config class."""
         return f"Config({self.cfg})"
 
     def set_hparams(self, hparams: dict) -> bool:
@@ -92,20 +94,21 @@ def get_tracker_cfg(self) -> dict:
 
     def get_gtr_runner(self):
         """Get lightning module for training, validation, and inference."""
-        
         tracker_params = self.cfg.tracker
         optimizer_params = self.cfg.optimizer
         scheduler_params = self.cfg.scheduler
         loss_params = self.cfg.loss
         gtr_runner_params = self.cfg.runner
-        
+
         if self.cfg.model.ckpt_path is not None and self.cfg.model.ckpt_path != "":
-            model = GTRRunner.load_from_checkpoint(self.cfg.model.ckpt_path,
-                                                   tracker_cfg = tracker_params,
-                                                   train_metrics=self.cfg.runner.train_metrics,
-                                                  val_metrics=self.cfg.runner.val_metrics,
-                                                  test_metrics=self.cfg.runner.test_metrics)
-            
+            model = GTRRunner.load_from_checkpoint(
+                self.cfg.model.ckpt_path,
+                tracker_cfg=tracker_params,
+                train_metrics=self.cfg.runner.metrics.train,
+                val_metrics=self.cfg.runner.metrics.val,
+                test_metrics=self.cfg.runner.metrics.test,
+            )
+
         else:
             model_params = self.cfg.model
             model = GTRRunner(
@@ -186,13 +189,13 @@ def get_dataloader(
             torch.multiprocessing.set_sharing_strategy("file_system")
         else:
             pin_memory = False
-        
+
         return torch.utils.data.DataLoader(
             dataset=dataset,
             batch_size=1,
             pin_memory=pin_memory,
             collate_fn=dataset.no_batching_fn,
-            **dataloader_params
+            **dataloader_params,
         )
 
     def get_optimizer(self, params: Iterable) -> torch.optim.Optimizer:
@@ -238,7 +241,9 @@ def get_logger(self):
             A Logger with specified params
         """
         logger_params = OmegaConf.to_container(self.cfg.logging, resolve=True)
-        return init_logger(logger_params)
+        return init_logger(
+            logger_params, OmegaConf.to_container(self.cfg, resolve=True)
+        )
 
     def get_early_stopping(self) -> pl.callbacks.EarlyStopping:
         """Getter for lightning early stopping callback.
@@ -266,12 +271,25 @@ def get_checkpointing(self) -> pl.callbacks.ModelCheckpoint:
 
         else:
             dirpath = checkpoint_params["dirpath"]
+
+        dirpath = Path(dirpath).resolve()
+        if not Path(dirpath).exists():
+            try:
+                Path(dirpath).mkdir(parents=True, exist_ok=True)
+            except OSError as e:
+                print(
+                    f"Cannot create a new folder. Check the permissions to the given Checkpoint directory. \n {e}"
+                )
+
         _ = checkpoint_params.pop("dirpath")
         checkpointers = []
         monitor = checkpoint_params.pop("monitor")
         for metric in monitor:
             checkpointer = pl.callbacks.ModelCheckpoint(
-                monitor=metric, dirpath=dirpath, **checkpoint_params
+                monitor=metric,
+                dirpath=dirpath,
+                filename=f"{{epoch}}-{{{metric}}}",
+                **checkpoint_params,
             )
             checkpointer.CHECKPOINT_NAME_LAST = f"{{epoch}}-best-{{{metric}}}"
             checkpointers.append(checkpointer)
@@ -282,7 +300,7 @@ def get_trainer(
         callbacks: list[pl.callbacks.Callback],
         logger: pl.loggers.WandbLogger,
         devices: int = 1,
-        accelerator: str = None
+        accelerator: str = None,
     ) -> pl.Trainer:
         """Getter for the lightning trainer.
 
@@ -297,14 +315,19 @@ def get_trainer(
             A lightning Trainer with specified params
         """
         if "accelerator" not in self.cfg.trainer:
-            self.set_hparams({'trainer.accelerator': accelerator})
+            self.set_hparams({"trainer.accelerator": accelerator})
         if "devices" not in self.cfg.trainer:
-            self.set_hparams({'trainer.devices': devices})
+            self.set_hparams({"trainer.devices": devices})
 
         trainer_params = self.cfg.trainer
-        
+        if "profiler" in trainer_params:
+            profiler = pl.profilers.AdvancedProfiler(filename="profile.txt")
+            trainer_params.pop("profiler")
+        else:
+            profiler = None
         return pl.Trainer(
             callbacks=callbacks,
             logger=logger,
+            profiler=profiler,
             **trainer_params,
         )
diff --git a/biogtr/data_structures.py b/biogtr/data_structures.py
new file mode 100644
index 0000000..a6ce32d
--- /dev/null
+++ b/biogtr/data_structures.py
@@ -0,0 +1,958 @@
+"""Module containing data classes such as Instances and Frames."""
+
+import torch
+import sleap_io as sio
+import numpy as np
+from numpy.typing import ArrayLike
+from typing import Union, List
+
+
+class Instance:
+    """Class representing a single instance to be tracked."""
+
+    def __init__(
+        self,
+        gt_track_id: int = -1,
+        pred_track_id: int = -1,
+        bbox: ArrayLike = torch.empty((0, 4)),
+        crop: ArrayLike = torch.tensor([]),
+        features: ArrayLike = torch.tensor([]),
+        track_score: float = -1.0,
+        point_scores: ArrayLike = None,
+        instance_score: float = -1.0,
+        skeleton: sio.Skeleton = None,
+        pose: dict[str, ArrayLike] = np.array([]),
+        device: str = None,
+    ):
+        """Initialize Instance.
+
+        Args:
+            gt_track_id: Ground truth track id - only used for train/eval.
+            pred_track_id: Predicted track id. Untracked instance is represented by -1.
+            bbox: The bounding box coordinate of the instance. Defaults to an empty tensor.
+            crop: The crop of the instance.
+            features: The reid features extracted from the CNN backbone used in the transformer.
+            track_score: The track score output from the association matrix.
+            point_scores: The point scores from sleap.
+            instance_score: The instance scores from sleap.
+            skeleton: The sleap skeleton used for the instance.
+            pose: A dictionary containing the node name and corresponding point.
+            device: String representation of the device the instance should be on.
+        """
+        if gt_track_id is not None:
+            self._gt_track_id = torch.tensor([gt_track_id])
+        else:
+            self._gt_track_id = torch.tensor([-1])
+
+        if pred_track_id is not None:
+            self._pred_track_id = torch.tensor([pred_track_id])
+        else:
+            self._pred_track_id = torch.tensor([])
+
+        if skeleton is None:
+            self._skeleton = sio.Skeleton(["centroid"])
+        else:
+            self._skeleton = skeleton
+
+        if not isinstance(bbox, torch.Tensor):
+            self._bbox = torch.tensor(bbox)
+        else:
+            self._bbox = bbox
+
+        if self._bbox.shape[0] and len(self._bbox.shape) == 1:
+            self._bbox = self._bbox.unsqueeze(0)
+
+        if not isinstance(crop, torch.Tensor):
+            self._crop = torch.tensor(crop)
+        else:
+            self._crop = crop
+
+        if len(self._crop.shape) == 2:
+            self._crop = self._crop.unsqueeze(0).unsqueeze(0)
+        elif len(self._crop.shape) == 3:
+            self._crop = self._crop.unsqueeze(0)
+
+        if not isinstance(crop, torch.Tensor):
+            self._features = torch.tensor(features)
+        else:
+            self._features = features
+
+        if self._features.shape[0] and len(self._features.shape) == 1:
+            self._features = self._features.unsqueeze(0)
+
+        if pose is not None:
+            self._pose = pose
+
+        elif self.bbox.shape[0]:
+
+            y1, x1, y2, x2 = self.bbox.squeeze()
+            self._pose = {"centroid": np.array([(x1 + x2) / 2, (y1 + y2) / 2])}
+
+        else:
+            self._pose = {}
+
+        self._track_score = track_score
+        self._instance_score = instance_score
+
+        if point_scores is not None:
+            self._point_scores = point_scores
+        else:
+            self._point_scores = np.zeros_like(self.pose)
+
+        self._device = device
+        self.to(self._device)
+
+    def __repr__(self) -> str:
+        """Return string representation of the Instance."""
+        return (
+            "Instance("
+            f"gt_track_id={self._gt_track_id.item()}, "
+            f"pred_track_id={self._pred_track_id.item()}, "
+            f"bbox={self._bbox}, "
+            f"crop={self._crop.shape}, "
+            f"features={self._features.shape}, "
+            f"device={self._device}"
+            ")"
+        )
+
+    def to(self, map_location):
+        """Move instance to different device or change dtype. (See `torch.to` for more info).
+
+        Args:
+            map_location: Either the device or dtype for the instance to be moved.
+
+        Returns:
+            self: reference to the instance moved to correct device/dtype.
+        """
+        self._gt_track_id = self._gt_track_id.to(map_location)
+        self._pred_track_id = self._pred_track_id.to(map_location)
+        self._bbox = self._bbox.to(map_location)
+        self._crop = self._crop.to(map_location)
+        self._features = self._features.to(map_location)
+        self.device = map_location
+        return self
+
+    def to_slp(
+        self, track_lookup: dict[int, sio.Track] = {}
+    ) -> tuple[sio.PredictedInstance, dict[int, sio.Track]]:
+        """Convert instance to sleap_io.PredictedInstance object.
+
+        Args:
+            track_lookup: A track look up dictionary containing track_id:sio.Track.
+        Returns: A sleap_io.PredictedInstance with necessary metadata
+        and a track_lookup dictionary to persist tracks.
+        """
+        try:
+            track_id = self.pred_track_id.item()
+            if track_id not in track_lookup:
+                track_lookup[track_id] = sio.Track(name=self.pred_track_id.item())
+
+            track = track_lookup[track_id]
+
+            return (
+                sio.PredictedInstance.from_numpy(
+                    points=self.pose,
+                    skeleton=self.skeleton,
+                    point_scores=self.point_scores,
+                    instance_score=self.instance_score,
+                    tracking_score=self.track_score,
+                    track=track,
+                ),
+                track_lookup,
+            )
+        except Exception as e:
+            print(
+                f"Pose shape: {self.pose.shape}, Pose score shape {self.point_scores.shape}"
+            )
+            raise RuntimeError(f"Failed to convert to sio.PredictedInstance: {e}")
+
+    @property
+    def device(self) -> str:
+        """The device the instance is on.
+
+        Returns:
+            The str representation of the device the gpu is on.
+        """
+        return self._device
+
+    @device.setter
+    def device(self, device) -> None:
+        """Set for the device property.
+
+        Args:
+            device: The str representation of the device.
+        """
+        self._device = device
+
+    @property
+    def gt_track_id(self) -> torch.Tensor:
+        """The ground truth track id of the instance.
+
+        Returns:
+            A tensor containing the ground truth track id
+        """
+        return self._gt_track_id
+
+    @gt_track_id.setter
+    def gt_track_id(self, track: int):
+        """Set the instance ground-truth track id.
+
+        Args:
+           track: An int representing the ground-truth track id.
+        """
+        if track is not None:
+            self._gt_track_id = torch.tensor([track])
+        else:
+            self._gt_track_id = torch.tensor([])
+
+    def has_gt_track_id(self) -> bool:
+        """Determine if instance has a gt track assignment.
+
+        Returns:
+            True if the gt track id is set, otherwise False.
+        """
+        if self._gt_track_id.shape[0] == 0:
+            return False
+        else:
+            return True
+
+    @property
+    def pred_track_id(self) -> torch.Tensor:
+        """The track id predicted by the tracker using asso_output from model.
+
+        Returns:
+            A tensor containing the predicted track id.
+        """
+        return self._pred_track_id
+
+    @pred_track_id.setter
+    def pred_track_id(self, track: int) -> None:
+        """Set predicted track id.
+
+        Args:
+            track: an int representing the predicted track id.
+        """
+        if track is not None:
+            self._pred_track_id = torch.tensor([track])
+        else:
+            self._pred_track_id = torch.tensor([])
+
+    def has_pred_track_id(self) -> bool:
+        """Determine whether instance has predicted track id.
+
+        Returns:
+            True if instance has a pred track id, False otherwise.
+        """
+        if self._pred_track_id.item() == -1 or self._pred_track_id.shape[0] == 0:
+            return False
+        else:
+            return True
+
+    @property
+    def bbox(self) -> torch.Tensor:
+        """The bounding box coordinates of the instance in the original frame.
+
+        Returns:
+            A (1,4) tensor containing the bounding box coordinates.
+        """
+        return self._bbox
+
+    @bbox.setter
+    def bbox(self, bbox: ArrayLike) -> None:
+        """Set the instance bounding box.
+
+        Args:
+            bbox: an arraylike object containing the bounding box coordinates.
+        """
+        if bbox is None or len(bbox) == 0:
+            self._bbox = torch.empty((0, 4))
+        else:
+            if not isinstance(bbox, torch.Tensor):
+                self._bbox = torch.tensor(bbox)
+            else:
+                self._bbox = bbox
+
+        if self._bbox.shape[0] and len(self._bbox.shape) == 1:
+            self._bbox = self._bbox.unsqueeze(0)
+
+    def has_bbox(self) -> bool:
+        """Determine if the instance has a bbox.
+
+        Returns:
+            True if the instance has a bounding box, false otherwise.
+        """
+        if self._bbox.shape[0] == 0:
+            return False
+        else:
+            return True
+
+    @property
+    def crop(self) -> torch.Tensor:
+        """The crop of the instance.
+
+        Returns:
+            A (1, c, h , w) tensor containing the cropped image centered around the instance.
+        """
+        return self._crop
+
+    @crop.setter
+    def crop(self, crop: ArrayLike) -> None:
+        """Set the crop of the instance.
+
+        Args:
+            crop: an arraylike object containing the cropped image of the centered instance.
+        """
+        if crop is None or len(crop) == 0:
+            self._crop = torch.tensor([])
+        else:
+            if not isinstance(crop, torch.Tensor):
+                self._crop = torch.tensor(crop)
+            else:
+                self._crop = crop
+
+        if len(self._crop.shape) == 2:
+            self._crop = self._crop.unsqueeze(0).unsqueeze(0)
+        elif len(self._crop.shape) == 3:
+            self._crop = self._crop.unsqueeze(0)
+
+    def has_crop(self) -> bool:
+        """Determine if the instance has a crop.
+
+        Returns:
+            True if the instance has an image otherwise False.
+        """
+        if self._crop.shape[0] == 0:
+            return False
+        else:
+            return True
+
+    @property
+    def features(self) -> torch.Tensor:
+        """Re-ID feature vector from backbone model to be used as input to transformer.
+
+        Returns:
+            a (1, d) tensor containing the reid feature vector.
+        """
+        return self._features
+
+    @features.setter
+    def features(self, features: ArrayLike) -> None:
+        """Set the reid feature vector of the instance.
+
+        Args:
+            features: a (1,d) array like object containing the reid features for the instance.
+        """
+        if features is None or len(features) == 0:
+            self._features = torch.tensor([])
+
+        elif not isinstance(features, torch.Tensor):
+            self._features = torch.tensor(features)
+        else:
+            self._features = features
+
+        if self._features.shape[0] and len(self._features.shape) == 1:
+            self._features = self._features.unsqueeze(0)
+
+    def has_features(self) -> bool:
+        """Determine if the instance has computed reid features.
+
+        Returns:
+            True if the instance has reid features, False otherwise.
+        """
+        if self._features.shape[0] == 0:
+            return False
+        else:
+            return True
+
+    @property
+    def pose(self) -> dict[str, ArrayLike]:
+        """Get the pose of the instance.
+
+        Returns:
+            A dictionary containing the node and corresponding x,y points
+        """
+        return self._pose
+
+    @pose.setter
+    def pose(self, pose: dict[str, ArrayLike]) -> None:
+        """Set the pose of the instance.
+
+        Args:
+            pose: A nodes x 2 array containing the pose coordinates.
+        """
+        if pose is not None:
+            self._pose = pose
+
+        elif self.bbox.shape[0]:
+            y1, x1, y2, x2 = self.bbox.squeeze()
+            self._pose = {"centroid": np.array([(x1 + x2) / 2, (y1 + y2) / 2])}
+
+        else:
+            self._pose = {}
+
+    def has_pose(self) -> bool:
+        """Check if the instance has a pose.
+
+        Returns True if the instance has a pose.
+        """
+        if len(self.pose):
+            return True
+        return False
+
+    @property
+    def shown_pose(self) -> dict[str, ArrayLike]:
+        """Get the pose with shown nodes only.
+
+        Returns: A dictionary filtered by nodes that are shown (points are not nan).
+        """
+        pose = self.pose
+        return {node: point for node, point in pose.items() if not np.isna(point).any()}
+
+    @property
+    def skeleton(self) -> sio.Skeleton:
+        """Get the skeleton associated with the instance.
+
+        Returns: The sio.Skeleton associated with the instance.
+        """
+        return self._skeleton
+
+    @skeleton.setter
+    def skeleton(self, skeleton: sio.Skeleton) -> None:
+        """Set the skeleton associated with the instance.
+
+        Args:
+            skeleton: The sio.Skeleton associated with the instance.
+        """
+        self._skeleton = skeleton
+
+    @property
+    def point_scores(self) -> ArrayLike:
+        """Get the point scores associated with the pose prediction.
+
+        Returns: a vector of shape n containing the point scores outputed from sleap associated with pose predictions.
+        """
+        return self._point_scores
+
+    @point_scores.setter
+    def point_scores(self, point_scores: ArrayLike) -> None:
+        """Set the point scores associated with the pose prediction.
+
+        Args:
+            point_scores: a vector of shape n containing the point scores
+            outputted from sleap associated with pose predictions.
+        """
+        self._point_scores = point_scores
+
+    @property
+    def instance_score(self) -> float:
+        """Get the pose prediction score associated with the instance.
+
+        Returns: a float from 0-1 representing an instance_score.
+        """
+        return self._instance_score
+
+    @instance_score.setter
+    def instance_score(self, instance_score: float) -> None:
+        """Set the pose prediction score associated with the instance.
+
+        Args:
+            instance_score: a float from 0-1 representing an instance_score.
+        """
+        self._instance_score = instance_score
+
+    @property
+    def track_score(self) -> float:
+        """Get the track_score of the instance.
+
+        Returns: A float from 0-1 representing the output used in the tracker for assignment.
+        """
+        return self._track_score
+
+    @track_score.setter
+    def track_score(self, track_score: float) -> None:
+        """Set the track_score of the instance.
+
+        Args:
+            track_score: A float from 0-1 representing the output used in the tracker for assignment.
+        """
+        self._track_score = track_score
+
+
+class Frame:
+    """Data structure containing metadata for a single frame of a video."""
+
+    def __init__(
+        self,
+        video_id: int,
+        frame_id: int,
+        vid_file: str = "",
+        img_shape: ArrayLike = [0, 0, 0],
+        instances: List[Instance] = [],
+        asso_output: ArrayLike = None,
+        matches: tuple = None,
+        traj_score: Union[ArrayLike, dict] = None,
+        device=None,
+    ):
+        """Initialize Frame.
+
+        Args:
+            video_id: The video index in the dataset.
+            frame_id: The index of the frame in a video.
+            vid_file: The path to the video the frame is from.
+            img_shape: The shape of the original frame (not the crop).
+            instances: A list of Instance objects that appear in the frame.
+            asso_output: The association matrix between instances
+            output directly from the transformer.
+            matches: matches from LSA algorithm between the instances and
+            available trajectories during tracking.
+            traj_score: Either a dict containing the association matrix
+            between instances and trajectories along postprocessing pipeline
+            or a single association matrix.
+            device: The device the frame should be moved to.
+        """
+        self._video_id = torch.tensor([video_id])
+        self._frame_id = torch.tensor([frame_id])
+
+        try:
+            self._video = sio.Video(vid_file)
+        except ValueError:
+            self._video = vid_file
+
+        if isinstance(img_shape, torch.Tensor):
+            self._img_shape = img_shape
+        else:
+            self._img_shape = torch.tensor([img_shape])
+
+        self._instances = instances
+
+        self._asso_output = asso_output
+        self._matches = matches
+
+        if traj_score is None:
+            self._traj_score = {}
+        elif isinstance(traj_score, dict):
+            self._traj_score = traj_score
+        else:
+            self._traj_score = {"initial": traj_score}
+
+        self._device = device
+        self.to(device)
+
+    def __repr__(self) -> str:
+        """Return String representation of the Frame.
+
+        Returns:
+            The string representation of the frame.
+        """
+        return (
+            "Frame("
+            f"video={self._video.filename if isinstance(self._video, sio.Video) else self._video}, "
+            f"video_id={self._video_id.item()}, "
+            f"frame_id={self._frame_id.item()}, "
+            f"img_shape={self._img_shape}, "
+            f"num_detected={self.num_detected}, "
+            f"asso_output={self._asso_output}, "
+            f"traj_score={self._traj_score}, "
+            f"matches={self._matches}, "
+            f"instances={self._instances}, "
+            f"device={self._device}"
+            ")"
+        )
+
+    def to(self, map_location: str):
+        """Move frame to different device or dtype (See `torch.to` for more info).
+
+        Args:
+            map_location: A string representing the device to move to.
+
+        Returns:
+            The frame moved to a different device/dtype.
+        """
+        self._video_id = self._video_id.to(map_location)
+        self._frame_id = self._frame_id.to(map_location)
+        self._img_shape = self._img_shape.to(map_location)
+
+        if isinstance(self._asso_output, torch.Tensor):
+            self._asso_output = self._asso_output.to(map_location)
+
+        if isinstance(self._matches, torch.Tensor):
+            self._matches = self._matches.to(map_location)
+
+        for key, val in self._traj_score.items():
+            if isinstance(val, torch.Tensor):
+                self._traj_score[key] = val.to(map_location)
+
+        for instance in self._instances:
+            instance = instance.to(map_location)
+
+        self._device = map_location
+        return self
+
+    def to_slp(
+        self, track_lookup: dict[int : sio.Track] = {}
+    ) -> tuple[sio.LabeledFrame, dict[int, sio.Track]]:
+        """Convert Frame to sleap_io.LabeledFrame object.
+
+        Args:
+            track_lookup: A lookup dictionary containing the track_id and sio.Track for persistence
+
+        Returns: A tuple containing a LabeledFrame object with necessary metadata and
+        a lookup dictionary containing the track_id and sio.Track for persistence
+        """
+        slp_instances = []
+        for instance in self.instances:
+            slp_instance, track_lookup = instance.to_slp(track_lookup=track_lookup)
+            slp_instances.append(slp_instance)
+        return (
+            sio.LabeledFrame(
+                video=self.video,
+                frame_idx=self.frame_id.item(),
+                instances=slp_instances,
+            ),
+            track_lookup,
+        )
+
+    @property
+    def device(self) -> str:
+        """The device the frame is on.
+
+        Returns:
+            The string representation of the device the frame is on.
+        """
+        return self._device
+
+    @device.setter
+    def device(self, device: str) -> None:
+        """Set the device.
+
+        Note: Do not set `frame.device = device` normally. Use `frame.to(device)` instead.
+
+        Args:
+            device: the device the function should be on.
+        """
+        self._device = device
+
+    @property
+    def video_id(self) -> torch.Tensor:
+        """The index of the video the frame comes from.
+
+        Returns:
+            A tensor containing the video index.
+        """
+        return self._video_id
+
+    @video_id.setter
+    def video_id(self, video_id: int) -> None:
+        """Set the video index.
+
+        Note: Generally the video_id should be immutable after initialization.
+
+        Args:
+            video_id: an int representing the index of the video that the frame came from.
+        """
+        self._video_id = torch.tensor([video_id])
+
+    @property
+    def frame_id(self) -> torch.Tensor:
+        """The index of the frame in a full video.
+
+        Returns:
+            A torch tensor containing the index of the frame in the video.
+        """
+        return self._frame_id
+
+    @frame_id.setter
+    def frame_id(self, frame_id: int) -> None:
+        """Set the frame index of the frame.
+
+        Note: The frame_id should generally be immutable after initialization.
+
+        Args:
+            frame_id: The int index of the frame in the full video.
+        """
+        self._frame_id = torch.tensor([frame_id])
+
+    @property
+    def video(self) -> Union[sio.Video, str]:
+        """Get the video associated with the frame.
+
+        Returns: An sio.Video object representing the video or a placeholder string
+        if it is not possible to create the sio.Video
+        """
+        return self._video
+
+    @video.setter
+    def video(self, video_filename: str) -> None:
+        """Set the video associated with the frame.
+
+        Note: we try to store the video in an sio.Video object.
+        However, if this is not possible (e.g. incompatible format or missing filepath)
+        then we simply store the string.
+
+        Args:
+            video_filename: string path to video_file
+        """
+        try:
+            self._video = video_filename
+        except ValueError:
+            self._video = video_filename
+
+    @property
+    def img_shape(self) -> torch.Tensor:
+        """The shape of the pre-cropped frame.
+
+        Returns:
+            A torch tensor containing the shape of the frame. Should generally be (c, h, w)
+        """
+        return self._img_shape
+
+    @img_shape.setter
+    def img_shape(self, img_shape: ArrayLike) -> None:
+        """Set the shape of the frame image.
+
+        Note: the img_shape should generally be immutable after initialization.
+
+        Args:
+            img_shape: an ArrayLike object containing the shape of the frame image.
+        """
+        if isinstance(img_shape, torch.Tensor):
+            self._img_shape = img_shape
+        else:
+            self._img_shape = torch.tensor([img_shape])
+
+    @property
+    def instances(self) -> List[Instance]:
+        """A list of instances in the frame.
+
+        Returns:
+            The list of instances that appear in the frame.
+        """
+        return self._instances
+
+    @instances.setter
+    def instances(self, instances: List[Instance]) -> None:
+        """Set the frame's instance.
+
+        Args:
+            instances: A list of Instances that appear in the frame.
+        """
+        self._instances = instances
+
+    def has_instances(self) -> bool:
+        """Determine whether there are instances in the frame.
+
+        Returns:
+            True if there are instances in the frame, otherwise False.
+        """
+        if self.num_detected == 0:
+            return False
+        return True
+
+    @property
+    def num_detected(self) -> int:
+        """The number of instances in the frame.
+
+        Returns:
+            the number of instances in the frame.
+        """
+        return len(self.instances)
+
+    @property
+    def asso_output(self) -> ArrayLike:
+        """The association matrix between instances outputed directly by transformer.
+
+        Returns:
+            An arraylike (n_query, n_nonquery) association matrix between instances.
+        """
+        return self._asso_output
+
+    def has_asso_output(self) -> bool:
+        """Determine whether the frame has an association matrix computed.
+
+        Returns:
+            True if the frame has an association matrix otherwise, False.
+        """
+        if self._asso_output is None or len(self._asso_output) == 0:
+            return False
+        return True
+
+    @asso_output.setter
+    def asso_output(self, asso_output: ArrayLike) -> None:
+        """Set the association matrix of a frame.
+
+        Args:
+            asso_output: An arraylike (n_query, n_nonquery) association matrix between instances.
+        """
+        self._asso_output = asso_output
+
+    @property
+    def matches(self) -> tuple:
+        """Matches between frame instances and availabel trajectories.
+
+        Returns:
+            A tuple containing the instance idx and trajectory idx for the matched instance.
+        """
+        return self._matches
+
+    @matches.setter
+    def matches(self, matches: tuple) -> None:
+        """Set the frame matches.
+
+        Args:
+            matches: A tuple containing the instance idx and trajectory idx for the matched instance.
+        """
+        self._matches = matches
+
+    def has_matches(self) -> bool:
+        """Check whether or not matches have been computed for frame.
+
+        Returns:
+            True if frame contains matches otherwise False.
+        """
+        if self._matches is not None and len(self._matches) > 0:
+            return True
+        return False
+
+    def get_traj_score(self, key=None) -> Union[dict, ArrayLike, None]:
+        """Get dictionary containing association matrix between instances and trajectories along postprocessing pipeline.
+
+        Args:
+            key: The key of the trajectory score to be accessed.
+            Can be one of {None, 'initial', 'decay_time', 'max_center_dist', 'iou', 'final'}
+
+        Returns:
+            - dictionary containing all trajectory scores if key is None
+            - trajectory score associated with key
+            - None if the key is not found
+        """
+        if key is None:
+            return self._traj_score
+        else:
+            try:
+                return self._traj_score[key]
+            except KeyError as e:
+                print(f"Could not access {key} traj_score due to {e}")
+                return None
+
+    def add_traj_score(self, key, traj_score: ArrayLike) -> None:
+        """Add trajectory score to dictionary.
+
+        Args:
+            key: key associated with traj score to be used in dictionary
+            traj_score: association matrix between instances and trajectories
+        """
+        self._traj_score[key] = traj_score
+
+    def has_traj_score(self) -> bool:
+        """Check if any trajectory association matrix has been saved.
+
+        Returns:
+            True there is at least one association matrix otherwise, false.
+        """
+        if len(self._traj_score) == 0:
+            return False
+        return True
+
+    def has_gt_track_ids(self) -> bool:
+        """Check if any of frames instances has a gt track id.
+
+        Returns:
+            True if at least 1 instance has a gt track id otherwise False.
+        """
+        if self.has_instances():
+            return any([instance.has_gt_track_id() for instance in self.instances])
+        return False
+
+    def get_gt_track_ids(self) -> torch.Tensor:
+        """Get the gt track ids of all instances in the frame.
+
+        Returns:
+            an (N,) shaped tensor with the gt track ids of each instance in the frame.
+        """
+        if not self.has_instances():
+            return torch.tensor([])
+        return torch.cat([instance.gt_track_id for instance in self.instances])
+
+    def has_pred_track_ids(self) -> bool:
+        """Check if any of frames instances has a pred track id.
+
+        Returns:
+            True if at least 1 instance has a pred track id otherwise False.
+        """
+        if self.has_instances():
+            return any([instance.has_pred_track_id() for instance in self.instances])
+        return False
+
+    def get_pred_track_ids(self) -> torch.Tensor:
+        """Get the pred track ids of all instances in the frame.
+
+        Returns:
+            an (N,) shaped tensor with the pred track ids of each instance in the frame.
+        """
+        if not self.has_instances():
+            return torch.tensor([])
+        return torch.cat([instance.pred_track_id for instance in self.instances])
+
+    def has_bboxes(self) -> bool:
+        """Check if any of frames instances has a bounding box.
+
+        Returns:
+            True if at least 1 instance has a bounding box otherwise False.
+        """
+        if self.has_instances():
+            return any([instance.has_bboxes() for instance in self.instances])
+        return False
+
+    def get_bboxes(self) -> torch.Tensor:
+        """Get the bounding boxes of all instances in the frame.
+
+        Returns:
+            an (N,4) shaped tensor with bounding boxes of each instance in the frame.
+        """
+        if not self.has_instances():
+            return torch.empty(0, 4)
+        return torch.cat([instance.bbox for instance in self.instances], dim=0)
+
+    def has_crops(self) -> bool:
+        """Check if any of frames instances has a crop.
+
+        Returns:
+            True if at least 1 instance has a crop otherwise False.
+        """
+        if self.has_instances():
+            return any([instance.has_crop() for instance in self.instances])
+        return False
+
+    def get_crops(self) -> torch.Tensor:
+        """Get the crops of all instances in the frame.
+
+        Returns:
+            an (N, C, H, W) shaped tensor with crops of each instance in the frame.
+        """
+        if not self.has_instances():
+            return torch.tensor([])
+        try:
+            return torch.cat([instance.crop for instance in self.instances], dim=0)
+        except Exception as e:
+            print(self)
+            raise (e)
+
+    def has_features(self):
+        """Check if any of frames instances has reid features already computed.
+
+        Returns:
+            True if at least 1 instance have reid features otherwise False.
+        """
+        if self.has_instances():
+            return any([instance.has_features() for instance in self.instances])
+        return False
+
+    def get_features(self):
+        """Get the reid feature vectors of all instances in the frame.
+
+        Returns:
+            an (N, D) shaped tensor with reid feature vectors of each instance in the frame.
+        """
+        if not self.has_instances():
+            return torch.tensor([])
+        return torch.cat([instance.features for instance in self.instances], dim=0)
diff --git a/biogtr/datasets/base_dataset.py b/biogtr/datasets/base_dataset.py
index b63dab7..e7484ef 100644
--- a/biogtr/datasets/base_dataset.py
+++ b/biogtr/datasets/base_dataset.py
@@ -1,5 +1,7 @@
 """Module containing logic for loading datasets."""
+
 from biogtr.datasets import data_utils
+from biogtr.data_structures import Frame
 from torch.utils.data import Dataset
 from typing import List, Union
 import numpy as np
@@ -20,7 +22,7 @@ def __init__(
         augmentations: dict = None,
         n_chunks: Union[int, float] = 1.0,
         seed: int = None,
-        gt_list: str = None
+        gt_list: str = None,
     ):
         """Initialize Dataset.
 
@@ -49,8 +51,8 @@ def __init__(
         self.n_chunks = n_chunks
         self.seed = seed
 
-        # if self.seed is not None:
-        #     np.random.seed(self.seed)
+        if self.seed is not None:
+            np.random.seed(self.seed)
 
         self.augmentations = (
             data_utils.build_augmentations(augmentations) if augmentations else None
@@ -61,7 +63,7 @@ def __init__(
         self.labels = None
         self.gt_list = None
 
-    def create_chunks(self):
+    def create_chunks(self) -> None:
         """Get indexing for data.
 
         Creates both indexes for selecting dataset (label_idx) and frame in
@@ -71,34 +73,35 @@ def create_chunks(self):
         efficiency and data shuffling. To be called by subclass __init__()
         """
         if self.chunk:
-
             self.chunked_frame_idx, self.label_idx = [], []
             for i, frame_idx in enumerate(self.frame_idx):
                 frame_idx_split = torch.split(frame_idx, self.clip_length)
                 self.chunked_frame_idx.extend(frame_idx_split)
                 self.label_idx.extend(len(frame_idx_split) * [i])
-                
+
             if self.n_chunks > 0 and self.n_chunks <= 1.0:
                 n_chunks = int(self.n_chunks * len(self.chunked_frame_idx))
 
             elif self.n_chunks <= len(self.chunked_frame_idx):
                 n_chunks = int(self.n_chunks)
-                
+
             else:
                 n_chunks = len(self.chunked_frame_idx)
 
             if n_chunks > 0 and n_chunks < len(self.chunked_frame_idx):
-                sample_idx = np.random.choice(np.arange(len(self.chunked_frame_idx)), n_chunks)
+                sample_idx = np.random.choice(
+                    np.arange(len(self.chunked_frame_idx)), n_chunks, replace=False
+                )
 
                 self.chunked_frame_idx = [self.chunked_frame_idx[i] for i in sample_idx]
-                
+
                 self.label_idx = [self.label_idx[i] for i in sample_idx]
 
         else:
             self.chunked_frame_idx = self.frame_idx
             self.label_idx = [i for i in range(len(self.labels))]
 
-    def __len__(self):
+    def __len__(self) -> int:
         """Get the size of the dataset.
 
         Returns:
@@ -106,7 +109,7 @@ def __len__(self):
         """
         return len(self.chunked_frame_idx)
 
-    def no_batching_fn(self, batch):
+    def no_batching_fn(self, batch) -> List[Frame]:
         """Collate function used to overwrite dataloader batching function.
 
         Args:
@@ -117,7 +120,7 @@ def no_batching_fn(self, batch):
         """
         return batch
 
-    def __getitem__(self, idx: int) -> List[dict]:
+    def __getitem__(self, idx: int) -> List[Frame]:
         """Get an element of the dataset.
 
         Args:
@@ -125,17 +128,14 @@ def __getitem__(self, idx: int) -> List[dict]:
             or the frame.
 
         Returns:
-            A list of dicts where each dict corresponds a frame in the chunk and
-            each value is a `torch.Tensor`. Dict elements can be seen in
-            subclasses
-
+            A list of `Frame`s in the chunk containing the metadata + instance features.
         """
         label_idx, frame_idx = self.get_indices(idx)
 
         return self.get_instances(label_idx, frame_idx)
 
     def get_indices(self, idx: int):
-        """Retrieves label and frame indices given batch index.
+        """Retrieve label and frame indices given batch index.
 
         This method should be implemented in any subclass of the BaseDataset.
 
@@ -148,7 +148,7 @@ def get_indices(self, idx: int):
         raise NotImplementedError("Must be implemented in subclass")
 
     def get_instances(self, label_idx: List[int], frame_idx: List[int]):
-        """Builds instances dict given label and frame indices.
+        """Build chunk of frames.
 
         This method should be implemented in any subclass of the BaseDataset.
 
diff --git a/biogtr/datasets/cell_tracking_dataset.py b/biogtr/datasets/cell_tracking_dataset.py
index 00d0db8..4b784fd 100644
--- a/biogtr/datasets/cell_tracking_dataset.py
+++ b/biogtr/datasets/cell_tracking_dataset.py
@@ -1,15 +1,13 @@
 """Module containing cell tracking challenge dataset."""
+
 from PIL import Image
 from biogtr.datasets import data_utils
 from biogtr.datasets.base_dataset import BaseDataset
+from biogtr.data_structures import Instance, Frame
 from scipy.ndimage import measurements
-from torch.utils.data import Dataset
-from torchvision.transforms import functional as tvf
 from typing import List, Optional, Union
 import albumentations as A
-import glob
 import numpy as np
-import os
 import pandas as pd
 import random
 import torch
@@ -20,8 +18,8 @@ class CellTrackingDataset(BaseDataset):
 
     def __init__(
         self,
-        raw_images: list[str],
-        gt_images: list[str],
+        raw_images: list[list[str]],
+        gt_images: list[list[str]],
         padding: int = 5,
         crop_size: int = 20,
         chunk: bool = False,
@@ -30,7 +28,7 @@ def __init__(
         augmentations: Optional[dict] = None,
         n_chunks: Union[int, float] = 1.0,
         seed: int = None,
-        gt_list: str = None
+        gt_list: list[str] = None,
     ):
         """Initialize CellTrackingDataset.
 
@@ -67,7 +65,7 @@ def __init__(
             augmentations,
             n_chunks,
             seed,
-            gt_list
+            gt_list,
         )
 
         self.videos = raw_images
@@ -88,12 +86,15 @@ def __init__(
         )
 
         if gt_list is not None:
-            self.gt_list = pd.read_csv(
-                gt_list,
-                delimiter=" ",
-                header=None,
-                names=["track_id", "start_frame", "end_frame", "parent_id"],
-            )
+            self.gt_list = [
+                pd.read_csv(
+                    gtf,
+                    delimiter=" ",
+                    header=None,
+                    names=["track_id", "start_frame", "end_frame", "parent_id"],
+                )
+                for gtf in gt_list
+            ]
         else:
             self.gt_list = None
 
@@ -104,14 +105,14 @@ def __init__(
         self.create_chunks()
 
     def get_indices(self, idx):
-        """Retrieves label and frame indices given batch index.
+        """Retrieve label and frame indices given batch index.
 
         Args:
             idx: the index of the batch.
         """
         return self.label_idx[idx], self.chunked_frame_idx[idx]
 
-    def get_instances(self, label_idx: List[int], frame_idx: List[int]) -> list[dict]:
+    def get_instances(self, label_idx: List[int], frame_idx: List[int]) -> List[Frame]:
         """Get an element of the dataset.
 
         Args:
@@ -119,34 +120,21 @@ def get_instances(self, label_idx: List[int], frame_idx: List[int]) -> list[dict
             frame_idx: index of the frames
 
         Returns:
-            a list of dicts where each dict corresponds a frame in the chunk
-            and each value is a `torch.Tensor`.
-
-            Dict Elements:
-                {
-                    "video_id": The video being passed through the transformer,
-                    "img_shape": the shape of each frame,
-                    "frame_id": the specific frame in the entire video being used,
-                    "num_detected": The number of objects in the frame,
-                    "gt_track_ids": The ground truth labels,
-                    "bboxes": The bounding boxes of each object,
-                    "crops": The raw pixel crops,
-                    "features": The feature vectors for each crop outputed by the
-                        CNN encoder,
-                    "pred_track_ids": The predicted trajectory labels from the
-                        tracker,
-                    "asso_output": the association matrix preprocessing,
-                    "matches": the true positives from the model,
-                    "traj_score": the association matrix post processing,
-                }
+            a list of Frame objects containing frame metadata and Instance Objects.
+            See `biogtr.data_structures` for more info.
         """
         image = self.videos[label_idx]
         gt = self.labels[label_idx]
 
-        instances = []
+        if self.gt_list is not None:
+            gt_list = self.gt_list[label_idx]
+        else:
+            gt_list = None
+
+        frames = []
 
         for i in frame_idx:
-            gt_track_ids, centroids, bboxes, crops = [], [], [], []
+            instances, gt_track_ids, centroids, bboxes = [], [], [], []
 
             i = int(i)
 
@@ -161,10 +149,10 @@ def get_instances(self, label_idx: List[int], frame_idx: List[int]) -> list[dict
                     np.uint8
                 )
 
-            if self.gt_list is None:
+            if gt_list is None:
                 unique_instances = np.unique(gt_sec)
             else:
-                unique_instances = self.gt_list["track_id"].unique()
+                unique_instances = gt_list["track_id"].unique()
 
             for instance in unique_instances:
                 # not all instances are in the frame, and they also label the
@@ -201,25 +189,25 @@ def get_instances(self, label_idx: List[int], frame_idx: List[int]) -> list[dict
 
             img = torch.Tensor(img).unsqueeze(0)
 
-            for bbox in bboxes:
-                crop = data_utils.crop_bbox(img, bbox)
-                crops.append(crop)
-
-            instances.append(
-                {
-                    "video_id": torch.tensor([label_idx]),
-                    "img_shape": torch.tensor([img.shape]),
-                    "frame_id": torch.tensor([i]),
-                    "num_detected": torch.tensor([len(bboxes)]),
-                    "gt_track_ids": torch.tensor(gt_track_ids).type(torch.int64),
-                    "bboxes": torch.stack(bboxes),
-                    "crops": torch.stack(crops),
-                    "features": torch.tensor([]),
-                    "pred_track_ids": torch.tensor([-1 for _ in range(len(bboxes))]),
-                    "asso_output": torch.tensor([]),
-                    "matches": torch.tensor([]),
-                    "traj_score": torch.tensor([]),
-                }
+            for j in range(len(gt_track_ids)):
+                crop = data_utils.crop_bbox(img, bboxes[j])
+
+                instances.append(
+                    Instance(
+                        gt_track_id=gt_track_ids[j],
+                        pred_track_id=-1,
+                        bbox=bboxes[j],
+                        crop=crop,
+                    )
+                )
+
+            frames.append(
+                Frame(
+                    video_id=label_idx,
+                    frame_id=i,
+                    img_shape=img.shape,
+                    instances=instances,
+                )
             )
 
-        return instances
+        return frames
diff --git a/biogtr/datasets/data_utils.py b/biogtr/datasets/data_utils.py
index 8ee2d19..2304a25 100644
--- a/biogtr/datasets/data_utils.py
+++ b/biogtr/datasets/data_utils.py
@@ -1,4 +1,5 @@
 """Module containing helper functions for datasets."""
+
 from PIL import Image
 from numpy.typing import ArrayLike
 from torchvision.transforms import functional as tvf
@@ -62,14 +63,15 @@ def get_bbox(center: ArrayLike, size: Union[int, tuple[int]]) -> torch.Tensor:
     Returns:
         A torch tensor in form y1, x1, y2, x2
     """
-    if type(size) == int:
+    if isinstance(size, int):
         size = (size, size)
     cx, cy = center[0], center[1]
 
-    bbox = torch.Tensor(
-        [-size[-1] // 2 + cy, -size[0] // 2 + cx,
-         size[-1] // 2 + cy, size[0] // 2 + cx]
-    )
+    y1 = max(0, -size[-1] // 2 + cy)
+    x1 = max(0, -size[0] // 2 + cx)
+    y2 = size[-1] // 2 + cy if y1 != 0 else size[1]
+    x2 = size[0] // 2 + cx if x1 != 0 else size[0]
+    bbox = torch.Tensor([y1, x1, y2, x2])
 
     return bbox
 
@@ -107,9 +109,7 @@ def centroid_bbox(points: ArrayLike, anchors: list, crop_size: int) -> torch.Ten
     return bbox
 
 
-def pose_bbox(
-    points: np.ndarray, bbox_size: Union[tuple[int], int]
-) -> torch.Tensor:
+def pose_bbox(points: np.ndarray, bbox_size: Union[tuple[int], int]) -> torch.Tensor:
     """Calculate bbox around instance pose.
 
     Args:
@@ -119,21 +119,27 @@ def pose_bbox(
     Returns:
         Bounding box in [y1, x1, y2, x2] format.
     """
-    if type(bbox_size) == int:
+    if isinstance(bbox_size, int):
         bbox_size = (bbox_size, bbox_size)
     # print(points)
-    minx = np.nanmin(points[:,0], axis=-1)
-    miny = np.nanmin(points[:,-1], axis=-1)
+    minx = np.nanmin(points[:, 0], axis=-1)
+    miny = np.nanmin(points[:, -1], axis=-1)
     minpoints = np.array([minx, miny]).T
-    
-    maxx = np.nanmax(points[:,0], axis=-1)
-    maxy = np.nanmax(points[:,-1], axis=-1)
+
+    maxx = np.nanmax(points[:, 0], axis=-1)
+    maxy = np.nanmax(points[:, -1], axis=-1)
     maxpoints = np.array([maxx, maxy]).T
-    
-    c = ((minpoints + maxpoints)/2)
-    
-    bbox = torch.Tensor([c[-1]-bbox_size[-1]/2, c[0] - bbox_size[0]/2,
-                         c[-1] + bbox_size[-1]/2, c[0] + bbox_size[0]/2])
+
+    c = (minpoints + maxpoints) / 2
+
+    bbox = torch.Tensor(
+        [
+            c[-1] - bbox_size[-1] / 2,
+            c[0] - bbox_size[0] / 2,
+            c[-1] + bbox_size[-1] / 2,
+            c[0] + bbox_size[0] / 2,
+        ]
+    )
     return bbox
 
 
@@ -210,7 +216,7 @@ def parse_trackmate(data_path: str) -> pd.DataFrame:
         and centroid x,y coordinates in pixels
     """
     if data_path.endswith(".xml"):
-        root = et.fromstring(open(xml_path).read())
+        root = et.fromstring(open(data_path).read())
 
         objects = []
         features = root.find("Model").find("FeatureDeclarations").find("SpotFeatures")
@@ -444,7 +450,7 @@ def get_max_padding(height: int, width: int) -> tuple:
 def view_training_batch(
     instances: List[Dict[str, List[np.ndarray]]], num_frames: int = 1, cmap=None
 ) -> None:
-    """Displays a grid of images from a batch of training instances.
+    """Display a grid of images from a batch of training instances.
 
     Args:
         instances: A list of training instances, where each instance is a
@@ -472,7 +478,7 @@ def view_training_batch(
                     else (axes[i] if num_crops == 1 else axes[i, j])
                 )
 
-                ax.imshow(data.T) if cmap is None else ax.imshow(data.T, cmap=cmap)
+                (ax.imshow(data.T) if cmap is None else ax.imshow(data.T, cmap=cmap))
                 ax.axis("off")
 
             except Exception as e:
diff --git a/biogtr/datasets/eval_dataset.py b/biogtr/datasets/eval_dataset.py
index 870ec01..e2cbea2 100644
--- a/biogtr/datasets/eval_dataset.py
+++ b/biogtr/datasets/eval_dataset.py
@@ -1,23 +1,32 @@
-"Module containing wrapper for merging gt and pred datasets for evaluation"
-import torch
+"""Module containing wrapper for merging gt and pred datasets for evaluation."""
+
 from torch.utils.data import Dataset
+from biogtr.data_structures import Frame, Instance
+from typing import List
+
 
 class EvalDataset(Dataset):
-    
-    def __init__(self, gt_dataset: Dataset, pred_dataset: Dataset):
-        
+    """Wrapper around gt and predicted dataset."""
+
+    def __init__(self, gt_dataset: Dataset, pred_dataset: Dataset) -> None:
+        """Initialize EvalDataset.
+
+        Args:
+            gt_dataset: A Dataset object containing ground truth track ids
+            pred_dataset: A dataset object containing predicted track ids
+        """
         self.gt_dataset = gt_dataset
         self.pred_dataset = pred_dataset
-        
-    def __len__(self):
+
+    def __len__(self) -> int:
         """Get the size of the dataset.
 
         Returns:
             the size or the number of chunks in the dataset
         """
         return len(self.gt_dataset)
-        
-    def __getitem__(self, idx: int):
+
+    def __getitem__(self, idx: int) -> List[Frame]:
         """Get an element of the dataset.
 
         Args:
@@ -25,15 +34,39 @@ def __getitem__(self, idx: int):
             or the frame.
 
         Returns:
-            A list of dicts where each dict corresponds a frame in the chunk and
-            each value is a `torch.Tensor`. Dict elements are the video id, frame id, and gt/pred track ids
-
+            A list of Frames where frames contain instances w gt and pred track ids + bboxes.
         """
-        labels = [{"video_id": gt_frame['video_id'],
-                  "frame_id": gt_frame['video_id'],
-                  "gt_track_ids": gt_frame['gt_track_ids'],
-                  "pred_track_ids": pred_frame['gt_track_ids'],
-                  "bboxes": pred_frame["bboxes"]
-                 } for gt_frame, pred_frame in zip(self.gt_dataset[idx], self.pred_dataset[idx])]
-        
-        return labels
\ No newline at end of file
+        gt_batch = self.gt_dataset[idx]
+        pred_batch = self.pred_dataset[idx]
+
+        eval_frames = []
+        for gt_frame, pred_frame in zip(gt_batch, pred_batch):
+            eval_instances = []
+            for i, gt_instance in enumerate(gt_frame.instances):
+
+                gt_track_id = gt_instance.gt_track_id
+
+                try:
+                    pred_track_id = pred_frame.instances[i].gt_track_id
+                    pred_bbox = pred_frame.instances[i].bbox
+                except IndexError:
+                    pred_track_id = -1
+                    pred_bbox = [-1, -1, -1, -1]
+                eval_instances.append(
+                    Instance(
+                        gt_track_id=gt_track_id,
+                        pred_track_id=pred_track_id,
+                        bbox=pred_bbox,
+                    )
+                )
+            eval_frames.append(
+                Frame(
+                    video_id=gt_frame.video_id,
+                    frame_id=gt_frame.frame_id,
+                    vid_file=gt_frame.video.filename,
+                    img_shape=gt_frame.img_shape,
+                    instances=eval_instances,
+                )
+            )
+
+        return eval_frames
diff --git a/biogtr/datasets/microscopy_dataset.py b/biogtr/datasets/microscopy_dataset.py
index 0ec3d02..39a49b1 100644
--- a/biogtr/datasets/microscopy_dataset.py
+++ b/biogtr/datasets/microscopy_dataset.py
@@ -1,9 +1,9 @@
 """Module containing microscopy dataset."""
+
 from PIL import Image
 from biogtr.datasets import data_utils
 from biogtr.datasets.base_dataset import BaseDataset
-from torch.utils.data import Dataset
-from torchvision.transforms import functional as tvf
+from biogtr.data_structures import Frame, Instance
 from typing import Union
 import albumentations as A
 import numpy as np
@@ -26,7 +26,7 @@ def __init__(
         mode: str = "Train",
         augmentations: dict = None,
         n_chunks: Union[int, float] = 1.0,
-        seed: int = None
+        seed: int = None,
     ):
         """Initialize MicroscopyDataset.
 
@@ -94,9 +94,11 @@ def __init__(
         ]
 
         self.frame_idx = [
-            torch.arange(Image.open(video).n_frames)
-            if type(video) == str
-            else torch.arange(len(video))
+            (
+                torch.arange(Image.open(video).n_frames)
+                if isinstance(video, str)
+                else torch.arange(len(video))
+            )
             for video in self.videos
         ]
 
@@ -105,14 +107,14 @@ def __init__(
         self.create_chunks()
 
     def get_indices(self, idx):
-        """Retrieves label and frame indices given batch index.
+        """Retrieve label and frame indices given batch index.
 
         Args:
             idx: the index of the batch.
         """
         return self.label_idx[idx], self.chunked_frame_idx[idx]
 
-    def get_instances(self, label_idx: list[int], frame_idx: list[int]) -> list[dict]:
+    def get_instances(self, label_idx: list[int], frame_idx: list[int]) -> list[Frame]:
         """Get an element of the dataset.
 
         Args:
@@ -120,47 +122,28 @@ def get_instances(self, label_idx: list[int], frame_idx: list[int]) -> list[dict
             frame_idx: index of the frames
 
         Returns:
-            a list of dicts where each dict corresponds a frame in the chunk
-            and each value is a `torch.Tensor`.
-
-            Dict Elements:
-                {
-                    "video_id": The video being passed through the transformer,
-                    "img_shape": the shape of each frame,
-                    "frame_id": the specific frame in the entire video being used,
-                    "num_detected": The number of objects in the frame,
-                    "gt_track_ids": The ground truth labels,
-                    "bboxes": The bounding boxes of each object,
-                    "crops": The raw pixel crops,
-                    "features": The feature vectors for each crop outputed by the
-                        CNN encoder,
-                    "pred_track_ids": The predicted trajectory labels from the
-                        tracker,
-                    "asso_output": the association matrix preprocessing,
-                    "matches": the true positives from the model,
-                    "traj_score": the association matrix post processing,
-                }
+            A list of Frames containing Instances to be tracked (See `biogtr.data_structures for more info`)
         """
         labels = self.labels[label_idx]
         labels = labels.dropna(how="all")
 
         video = self.videos[label_idx]
 
-        if type(video) != list:
+        if not isinstance(video, list):
             video = data_utils.LazyTiffStack(self.videos[label_idx])
 
-        instances = []
-
-        for i in frame_idx:
-            gt_track_ids, centroids, bboxes, crops = [], [], [], []
+        frames = []
+        for frame_id in frame_idx:
+            # print(i)
+            instances, gt_track_ids, centroids = [], [], []
 
             img = (
-                video.get_section(i)
-                if type(video) != list
-                else np.array(Image.open(video[i]))
+                video.get_section(frame_id)
+                if not isinstance(video, list)
+                else np.array(Image.open(video[frame_id]))
             )
 
-            lf = labels[labels["FRAME"].astype(int) == i.item()]
+            lf = labels[labels["FRAME"].astype(int) == frame_id.item()]
 
             for instance in sorted(lf["TRACK_ID"].unique()):
                 gt_track_ids.append(int(instance))
@@ -191,31 +174,30 @@ def get_instances(self, label_idx: list[int], frame_idx: list[int]) -> list[dict
                 if img.shape[2] == 3:
                     img = img.T  # todo: check for edge cases
 
-            for c in centroids:
+            for gt_id in range(len(gt_track_ids)):
+                c = centroids[gt_id]
                 bbox = data_utils.pad_bbox(
                     data_utils.get_bbox([int(c[0]), int(c[1])], self.crop_size),
                     padding=self.padding,
                 )
                 crop = data_utils.crop_bbox(img, bbox)
 
-                bboxes.append(bbox)
-                crops.append(crop)
-
-            instances.append(
-                {
-                    "video_id": torch.tensor([label_idx]),
-                    "img_shape": torch.tensor([img.shape]),
-                    "frame_id": torch.tensor([i]),
-                    "num_detected": torch.tensor([len(bboxes)]),
-                    "gt_track_ids": torch.tensor(gt_track_ids).type(torch.int64),
-                    "bboxes": torch.stack(bboxes),
-                    "crops": torch.stack(crops),
-                    "features": torch.tensor([]),
-                    "pred_track_ids": torch.tensor([-1 for _ in range(len(bboxes))]),
-                    "asso_output": torch.tensor([]),
-                    "matches": torch.tensor([]),
-                    "traj_score": torch.tensor([]),
-                }
+                instances.append(
+                    Instance(
+                        gt_track_id=gt_track_ids[gt_id],
+                        pred_track_id=-1,
+                        bbox=bbox,
+                        crop=crop,
+                    )
+                )
+
+            frames.append(
+                Frame(
+                    video_id=label_idx,
+                    frame_id=frame_id,
+                    img_shape=img.shape,
+                    instances=instances,
+                )
             )
 
-        return instances
+        return frames
diff --git a/biogtr/datasets/sleap_dataset.py b/biogtr/datasets/sleap_dataset.py
index 0a2cb33..73ef5be 100644
--- a/biogtr/datasets/sleap_dataset.py
+++ b/biogtr/datasets/sleap_dataset.py
@@ -1,10 +1,13 @@
 """Module containing logic for loading sleap datasets."""
+
 import albumentations as A
 import torch
 import imageio
 import numpy as np
 import sleap_io as sio
 import random
+import warnings
+from biogtr.data_structures import Frame, Instance
 from biogtr.datasets import data_utils
 from biogtr.datasets.base_dataset import BaseDataset
 from torchvision.transforms import functional as tvf
@@ -27,6 +30,7 @@ def __init__(
         augmentations: dict = None,
         n_chunks: Union[int, float] = 1.0,
         seed: int = None,
+        verbose: bool = False,
     ):
         """Initialize SleapDataset.
 
@@ -35,7 +39,7 @@ def __init__(
             video_files: a list of paths to video files
             padding: amount of padding around object crops
             crop_size: the size of the object crops
-            anchor: the name of the anchor keypoint to be used as centroid for cropping. 
+            anchor: the name of the anchor keypoint to be used as centroid for cropping.
             If unavailable then crop around the midpoint between all visible anchors.
             chunk: whether or not to chunk the dataset into batches
             clip_length: the number of frames in each chunk
@@ -51,6 +55,7 @@ def __init__(
             n_chunks: Number of chunks to subsample from.
                 Can either a fraction of the dataset (ie (0,1.0]) or number of chunks
             seed: set a seed for reproducibility
+            verbose: boolean representing whether to print
         """
         super().__init__(
             slp_files + video_files,
@@ -73,7 +78,8 @@ def __init__(
         self.mode = mode
         self.n_chunks = n_chunks
         self.seed = seed
-        self.anchor = anchor
+        self.anchor = anchor.lower()
+        self.verbose = verbose
 
         # if self.seed is not None:
         #     np.random.seed(self.seed)
@@ -95,7 +101,7 @@ def __init__(
         self.create_chunks()
 
     def get_indices(self, idx):
-        """Retrieves label and frame indices given batch index.
+        """Retrieve label and frame indices given batch index.
 
         Args:
             idx: the index of the batch.
@@ -135,46 +141,71 @@ def get_instances(self, label_idx: List[int], frame_idx: List[int]) -> list[dict
         vid_reader = imageio.get_reader(video_name, "ffmpeg")
 
         img = vid_reader.get_data(0)
-        crop_shape = (img.shape[-1], *(self.crop_size + 2 * self.padding,) * 2)
 
-        instances = []
-        for i, frame in enumerate(frame_idx):
-            gt_track_ids, bboxes, crops, poses, shown_poses = [], [], [], [], []
+        skeleton = video.skeletons[-1]
+
+        frames = []
+        for i, frame_ind in enumerate(frame_idx):
+            (
+                instances,
+                gt_track_ids,
+                poses,
+                shown_poses,
+                point_scores,
+                instance_score,
+            ) = ([], [], [], [], [], [])
+
+            frame_ind = int(frame_ind)
+
+            lf = video[frame_ind]
 
-            frame = int(frame)
-            
-            lf = video[frame]
-            
             try:
-                img = vid_reader.get_data(frame)
+                img = vid_reader.get_data(frame_ind)
             except IndexError as e:
-                print(f"Could not read frame {frame} from {video_name}")
+                print(f"Could not read frame {frame_ind} from {video_name} due to {e}")
                 continue
-                
+
             for instance in lf:
-                gt_track_ids.append(video.tracks.index(instance.track))
+                if instance.track is not None:
+                    gt_track_id = video.tracks.index(instance.track)
+                else:
+                    gt_track_id = -1
+                gt_track_ids.append(gt_track_id)
 
                 poses.append(
                     dict(
                         zip(
                             [n.name for n in instance.skeleton.nodes],
-                            np.array(instance.numpy()).tolist(),
+                            [[p.x, p.y] for p in instance.points.values()],
                         )
                     )
                 )
 
-                shown_poses.append(
-                    dict(
-                        zip(
-                            [n.name for n in instance.skeleton.nodes],
-                            [[p.x, p.y] for p in instance.points.values()],
-                        )
+                shown_poses = [
+                    {
+                        key.lower(): val
+                        for key, val in instance.items()
+                        if not np.isnan(val).any()
+                    }
+                    for instance in poses
+                ]
+
+                point_scores.append(
+                    np.array(
+                        [
+                            (
+                                point.score
+                                if isinstance(point, sio.PredictedPoint)
+                                else 1.0
+                            )
+                            for point in instance.points.values()
+                        ]
                     )
                 )
-
-                shown_poses = [{key: val for key, val in instance.items()
-                                if not np.isnan(val).any()
-                                } for instance in shown_poses]
+                if isinstance(instance, sio.PredictedInstance):
+                    instance_score.append(instance.score)
+                else:
+                    instance_score.append(1.0)
             # augmentations
             if self.augmentations is not None:
                 for transform in self.augmentations:
@@ -205,34 +236,37 @@ def get_instances(self, label_idx: List[int], frame_idx: List[int]) -> list[dict
                     for aug_pose_arr, pose_dict in zip(aug_poses, shown_poses)
                 ]
 
-                _ = [pose.update(aug_pose) for pose, aug_pose in zip(shown_poses, aug_poses)]
+                _ = [
+                    pose.update(aug_pose)
+                    for pose, aug_pose in zip(shown_poses, aug_poses)
+                ]
 
             img = tvf.to_tensor(img)
 
-            for pose in shown_poses:
+            for j in range(len(gt_track_ids)):
+                pose = shown_poses[j]
 
+                """Check for anchor"""
                 if self.anchor in pose:
                     anchor = self.anchor
-                elif self.anchor.lower() in pose:
-                    anchor = self.anchor.lower()
-                elif self.anchor.upper() in pose:
-                    anchor = self.anchor.upper()
                 else:
+                    if self.verbose:
+                        warnings.warn(
+                            f"{self.anchor} not in {[key for key in pose.keys()]}! Defaulting to midpoint"
+                        )
                     anchor = "midpoint"
-                    
+
                 if anchor != "midpoint":
                     centroid = pose[anchor]
 
                     if not np.isnan(centroid).any():
                         bbox = data_utils.pad_bbox(
-                                data_utils.get_bbox(
-                                    centroid, self.crop_size
-                                ),
-                                padding=self.padding,
-                            )
-                        
+                            data_utils.get_bbox(centroid, self.crop_size),
+                            padding=self.padding,
+                        )
+
                     else:
-                        #print(f'{self.anchor} contains NaN: {centroid}. Using midpoint')
+                        # print(f'{self.anchor} contains NaN: {centroid}. Using midpoint')
                         bbox = data_utils.pad_bbox(
                             data_utils.pose_bbox(
                                 np.array(list(pose.values())), self.crop_size
@@ -240,7 +274,7 @@ def get_instances(self, label_idx: List[int], frame_idx: List[int]) -> list[dict
                             padding=self.padding,
                         )
                 else:
-                    #print(f'{self.anchor} not an available option amongst {pose.keys()}. Using midpoint')
+                    # print(f'{self.anchor} not an available option amongst {pose.keys()}. Using midpoint')
                     bbox = data_utils.pad_bbox(
                         data_utils.pose_bbox(
                             np.array(list(pose.values())), self.crop_size
@@ -248,31 +282,28 @@ def get_instances(self, label_idx: List[int], frame_idx: List[int]) -> list[dict
                         padding=self.padding,
                     )
 
-                
                 crop = data_utils.crop_bbox(img, bbox)
 
-                bboxes.append(bbox)
-                crops.append(crop)
+                instance = Instance(
+                    gt_track_id=gt_track_ids[j],
+                    pred_track_id=-1,
+                    crop=crop,
+                    bbox=bbox,
+                    skeleton=skeleton,
+                    pose=np.array(list(poses[j].values())),
+                    point_scores=point_scores[j],
+                    instance_score=instance_score[j],
+                )
 
-            stacked_crops = (
-                torch.stack(crops) if crops else torch.empty((0, *crop_shape))
-            )
+                instances.append(instance)
 
-            instances.append(
-                {
-                    "video_id": torch.tensor([label_idx]),
-                    "img_shape": torch.tensor([img.shape]),
-                    "frame_id": torch.tensor([frame]),
-                    "num_detected": torch.tensor([len(bboxes)]),
-                    "gt_track_ids": torch.tensor(gt_track_ids),
-                    "bboxes": torch.stack(bboxes) if bboxes else torch.empty((0, 4)),
-                    "crops": stacked_crops,
-                    "features": torch.tensor([]),
-                    "pred_track_ids": torch.tensor([-1 for _ in range(len(bboxes))]),
-                    "asso_output": torch.tensor([]),
-                    "matches": torch.tensor([]),
-                    "traj_score": torch.tensor([]),
-                }
+            frame = Frame(
+                video_id=label_idx,
+                frame_id=frame_ind,
+                vid_file=video_name,
+                img_shape=img.shape,
+                instances=instances,
             )
+            frames.append(frame)
 
-        return instances
+        return frames
diff --git a/biogtr/datasets/tracking_dataset.py b/biogtr/datasets/tracking_dataset.py
index f645933..fdc54ca 100644
--- a/biogtr/datasets/tracking_dataset.py
+++ b/biogtr/datasets/tracking_dataset.py
@@ -1,4 +1,5 @@
 """Module containing Lightning module wrapper around all other datasets."""
+
 from biogtr.datasets.cell_tracking_dataset import CellTrackingDataset
 from biogtr.datasets.microscopy_dataset import MicroscopyDataset
 from biogtr.datasets.sleap_dataset import SleapDataset
@@ -53,21 +54,20 @@ def __init__(
         self.test_dl = test_dl
 
     def setup(self, stage=None):
-        """Setup function needed for lightning dataset.
+        """Set up lightning dataset.
 
         UNUSED.
         """
         pass
 
     def train_dataloader(self) -> DataLoader:
-        """Getter for train_dataloader.
+        """Get train_dataloader.
 
         Returns: The Training Dataloader.
         """
         if self.train_dl is None and self.train_ds is None:
             return None
         elif self.train_dl is None:
-                
             return DataLoader(
                 self.train_ds,
                 batch_size=1,
@@ -75,13 +75,17 @@ def train_dataloader(self) -> DataLoader:
                 pin_memory=False,
                 collate_fn=self.train_ds.no_batching_fn,
                 num_workers=0,
-                generator=torch.Generator(device="cuda") if torch.cuda.is_available() else torch.Generator()
+                generator=(
+                    torch.Generator(device="cuda")
+                    if torch.cuda.is_available()
+                    else torch.Generator()
+                ),
             )
         else:
             return self.train_dl
 
     def val_dataloader(self) -> DataLoader:
-        """Getter for val dataloader.
+        """Get val dataloader.
 
         Returns: The validation dataloader.
         """
@@ -101,7 +105,7 @@ def val_dataloader(self) -> DataLoader:
             return self.val_dl
 
     def test_dataloader(self) -> DataLoader:
-        """Getter for test dataloader.
+        """Get.
 
         Returns: The test dataloader
         """
diff --git a/biogtr/inference/__init__.py b/biogtr/inference/__init__.py
new file mode 100644
index 0000000..c1c53dc
--- /dev/null
+++ b/biogtr/inference/__init__.py
@@ -0,0 +1 @@
+"""Tracking Inference using GTR Model."""
diff --git a/biogtr/inference/boxes.py b/biogtr/inference/boxes.py
index 951529b..ec123b1 100644
--- a/biogtr/inference/boxes.py
+++ b/biogtr/inference/boxes.py
@@ -1,5 +1,6 @@
 """Module containing Boxes class."""
-from typing import List, Tuple, Union
+
+from typing import List, Tuple
 import torch
 
 
@@ -56,7 +57,7 @@ def to(self, device: torch.device) -> "Boxes":
         return Boxes(self.tensor.to(device=device))
 
     def area(self) -> torch.Tensor:
-        """Computes the area of all the boxes.
+        """Compute the area of all the boxes.
 
         Returns:
             torch.Tensor: a vector with areas of each box.
diff --git a/biogtr/inference/metrics.py b/biogtr/inference/metrics.py
index d8d6386..8827e7a 100644
--- a/biogtr/inference/metrics.py
+++ b/biogtr/inference/metrics.py
@@ -1,17 +1,21 @@
 """Helper functions for calculating mot metrics."""
+
 import numpy as np
 import motmetrics as mm
-from biogtr.inference.post_processing import _pairwise_iou
-from biogtr.inference.boxes import Boxes
+import torch
+from biogtr.data_structures import Frame
 from typing import Union, Iterable
 
+# from biogtr.inference.post_processing import _pairwise_iou
+# from biogtr.inference.boxes import Boxes
+
 
-def get_matches(instances: list[dict]) -> tuple[dict, list, int]:
+def get_matches(frames: list[Frame]) -> tuple[dict, list, int]:
     """Get comparison between predicted and gt trajectory labels.
 
     Args:
-        instances: a list of dicts where each dict corresponds to a frame and
-        contains the video_id, frame_id, gt labels and predicted labels
+        frames: a list of Frames containing the video_id, frame_id,
+        gt labels and predicted labels
 
     Returns:
         matches: a dict containing predicted and gt trajectory labels
@@ -21,19 +25,22 @@ def get_matches(instances: list[dict]) -> tuple[dict, list, int]:
     matches = {}
     indices = []
 
-    video_id = instances[0]["video_id"].item()
+    video_id = frames[0].video_id.item()
 
-    for idx, instance in enumerate(instances):
-        indices.append(instance["frame_id"].item())
-        for i, gt_track_id in enumerate(instance["gt_track_ids"]):
-            gt_track_id = instance["gt_track_ids"][i]
-            pred_track_id = instance["pred_track_ids"][i]
-            match = f"{gt_track_id} -> {pred_track_id}"
+    if any([frame.has_instances() for frame in frames]):
+        for idx, frame in enumerate(frames):
+            indices.append(frame.frame_id.item())
+            for gt_track_id, pred_track_id in zip(
+                frame.get_gt_track_ids(), frame.get_pred_track_ids()
+            ):
+                match = f"{gt_track_id} -> {pred_track_id}"
 
-            if match not in matches:
-                matches[match] = np.full(len(instances), 0)
+                if match not in matches:
+                    matches[match] = np.full(len(frames), 0)
 
-            matches[match][idx] = 1
+                matches[match][idx] = 1
+    # else:
+    #     warnings.warn("No instances detected!")
     return matches, indices, video_id
 
 
@@ -49,30 +56,32 @@ def get_switches(matches: dict, indices: list) -> dict:
         and the change in labels
     """
     track, switches = {}, {}
-    # unique_gt_ids = np.unique([k.split(" ")[0] for k in list(matches.keys())])
-    matches_key = np.array(list(matches.keys()))
-    matches = np.array(list(matches.values()))
-    num_frames = matches.shape[1]
+    if len(matches) > 0 and len(indices) > 0:
+        matches_key = np.array(list(matches.keys()))
+        matches = np.array(list(matches.values()))
+        num_frames = matches.shape[1]
 
-    assert num_frames == len(indices)
+        assert num_frames == len(indices)
 
-    for i, idx in zip(range(num_frames), indices):
-        switches[idx] = {}
+        for i, idx in zip(range(num_frames), indices):
+            switches[idx] = {}
 
-        col = matches[:, i]
-        indices = np.where(col == 1)[0]
-        match_i = [(m.split(" ")[0], m.split(" ")[-1]) for m in matches_key[indices]]
+            col = matches[:, i]
+            match_indices = np.where(col == 1)[0]
+            match_i = [
+                (m.split(" ")[0], m.split(" ")[-1]) for m in matches_key[match_indices]
+            ]
 
-        for m in match_i:
-            gt, pred = m
+            for m in match_i:
+                gt, pred = m
 
-            if gt in track and track[gt] != pred:
-                switches[idx][gt] = {
-                    "frames": (idx - 1, idx),
-                    "pred tracks (from, to)": (track[gt], pred),
-                }
+                if gt in track and track[gt] != pred:
+                    switches[idx][gt] = {
+                        "frames": (idx - 1, idx),
+                        "pred tracks (from, to)": (track[gt], pred),
+                    }
 
-            track[gt] = pred
+                track[gt] = pred
 
     return switches
 
@@ -92,36 +101,15 @@ def get_switch_count(switches: dict) -> int:
     return sw_cnt
 
 
-def to_track_eval(instances: list[dict]) -> dict:
-    """Reformats instances, the output from `sliding_inference` to be used by `TrackEval.`
+def to_track_eval(frames: list[Frame]) -> dict:
+    """Reformats frames the output from `sliding_inference` to be used by `TrackEval`.
 
     Args:
-        instances: A list of dictionaries. One for each frame. An example is provided below.
+        frames: A list of Frames. `See biogtr.data_structures for more info`.
 
     Returns:
         data: A dictionary. Example provided below.
 
-    # ------------------------- An example of instances ------------------------ #
-
-    D: embedding dimension.
-    N_i: number of detected instances in i-th frame of window.
-
-    instances = [
-        {
-            # Each dictionary is a frame.
-
-            "frame_id": frame index int,
-            "num_detected": N_i,
-            "gt_track_ids": (N_i,),
-            "poses": (N_i, 13, 2),  # 13 keypoints for the pose (x, y) coords.
-            "bboxes": (N_i, 4),  # in pascal_voc unrounded unnormalized
-            "features": (N_i, D),  # Features are deleted but can optionally be kept if need be.
-            "pred_track_ids": (N_i,),
-        },
-        {},  # Frame 2.
-        ...
-    ]
-
     # --------------------------- An example of data --------------------------- #
 
     *: number of ids for gt at every frame of the video
@@ -135,10 +123,9 @@ def to_track_eval(instances: list[dict]) -> dict:
         "gt_ids": (L, *),  # Ragged np.array
         "tracker_ids": (L, ^),  # Ragged np.array
         "similarity_scores": (L, *, ^),  # Ragged np.array
-        "num_timsteps": L,
+        "num_timesteps": L,
     }
     """
-
     unique_gt_ids = []
     num_tracker_dets = 0
     num_gt_dets = 0
@@ -147,30 +134,30 @@ def to_track_eval(instances: list[dict]) -> dict:
     similarity_scores = []
 
     data = {}
-    #cos_sim = torch.nn.CosineSimilarity()
+    cos_sim = torch.nn.CosineSimilarity()
 
-    for fidx, instance in enumerate(instances):
-        gt_track_ids = instance["gt_track_ids"].cpu().numpy().tolist()
-        pred_track_ids = instance["pred_track_ids"].cpu().numpy().tolist()
-        boxes = Boxes(instance['bboxes'].cpu())
+    for fidx, frame in enumerate(frames):
+        gt_track_ids = frame.get_gt_track_ids().cpu().numpy().tolist()
+        pred_track_ids = frame.get_pred_track_ids().cpu().numpy().tolist()
+        # boxes = Boxes(frame.get_bboxes().cpu())
 
         gt_ids.append(np.array(gt_track_ids))
         track_ids.append(np.array(pred_track_ids))
 
-        num_tracker_dets += len(instance["pred_track_ids"])
+        num_tracker_dets += len(pred_track_ids)
         num_gt_dets += len(gt_track_ids)
 
         if not set(gt_track_ids).issubset(set(unique_gt_ids)):
             unique_gt_ids.extend(list(set(gt_track_ids).difference(set(unique_gt_ids))))
-        
-        eval_matrix = _pairwise_iou(boxes, boxes)
-#         eval_matrix = np.full((len(gt_track_ids), len(pred_track_ids)), np.nan)
 
-#         for i, feature_i in enumerate(features):
-#             for j, feature_j in enumerate(features):
-#                 eval_matrix[i][j] = cos_sim(
-#                     feature_i.unsqueeze(0), feature_j.unsqueeze(0)
-#                 )
+        # eval_matrix = _pairwise_iou(boxes, boxes)
+        eval_matrix = np.full((len(gt_track_ids), len(pred_track_ids)), np.nan)
+
+        for i, feature_i in enumerate(frame.get_features()):
+            for j, feature_j in enumerate(frame.get_features()):
+                eval_matrix[i][j] = cos_sim(
+                    feature_i.unsqueeze(0), feature_j.unsqueeze(0)
+                )
 
         # eval_matrix
         #                      pred_track_ids
@@ -206,18 +193,41 @@ def to_track_eval(instances: list[dict]) -> dict:
     data["num_gt_dets"] = num_gt_dets
     try:
         data["gt_ids"] = gt_ids
-        #print(data['gt_ids'])
+        # print(data['gt_ids'])
     except Exception as e:
         print(gt_ids)
-        raise(e)
+        raise (e)
     data["tracker_ids"] = track_ids
     data["similarity_scores"] = similarity_scores
-    data["num_timesteps"] = len(instances)
+    data["num_timesteps"] = len(frames)
 
     return data
 
 
 def get_track_evals(data: dict, metrics: dict) -> dict:
+    """Run track_eval and get mot metrics.
+
+    Args:
+        data: A dictionary. Example provided below.
+        metrics: mot metrics to be computed
+    Returns:
+        A dictionary with key being the metric, and value being the metric value computed.
+    # --------------------------- An example of data --------------------------- #
+
+    *: number of ids for gt at every frame of the video
+    ^: number of ids for tracker at every frame of the video
+    L: length of video
+
+    data = {
+        "num_gt_ids": total number of unique gt ids,
+        "num_tracker_dets": total number of detections by your detection algorithm,
+        "num_gt_dets": total number of gt detections,
+        "gt_ids": (L, *),  # Ragged np.array
+        "tracker_ids": (L, ^),  # Ragged np.array
+        "similarity_scores": (L, *, ^),  # Ragged np.array
+        "num_timsteps": L,
+    }
+    """
     results = {}
     for metric_name, metric in metrics.items():
         result = metric.eval_sequence(data)
@@ -225,12 +235,18 @@ def get_track_evals(data: dict, metrics: dict) -> dict:
     return results
 
 
-def get_pymotmetrics(data: dict, metrics: Union[str, tuple] = "all", key: str = "tracker_ids", save: str = None):
+def get_pymotmetrics(
+    data: dict,
+    metrics: Union[str, tuple] = "all",
+    key: str = "tracker_ids",
+    save: str = None,
+):
     """Given data and a key, evaluate the predictions.
 
     Args:
         data: A dictionary. Example provided below.
         key: The key within instances to look for track_ids (can be "gt_ids" or "tracker_ids").
+
     Returns:
         summary: A pandas DataFrame of all the pymot-metrics.
 
@@ -251,7 +267,10 @@ def get_pymotmetrics(data: dict, metrics: Union[str, tuple] = "all", key: str =
     }
     """
     if not isinstance(metrics, str):
-        metrics = ["num_switches" if metric.lower() == "sw_cnt" else metric for metric in metrics] #backward compatibility
+        metrics = [
+            "num_switches" if metric.lower() == "sw_cnt" else metric
+            for metric in metrics
+        ]  # backward compatibility
     acc = mm.MOTAccumulator(auto_id=True)
 
     for i in range(len(data["gt_ids"])):
@@ -267,22 +286,22 @@ def get_pymotmetrics(data: dict, metrics: Union[str, tuple] = "all", key: str =
         metric.split("|")[0] for metric in mh.list_metrics_markdown().split("\n")[2:-1]
     ]
 
-    if type(metrics) == str:
+    if isinstance(metrics, str):
         metrics_list = all_metrics
-        
+
     elif isinstance(metrics, Iterable):
         metrics = [metric.lower() for metric in metrics]
         metrics_list = [metric for metric in all_metrics if metric.lower() in metrics]
-        
+
     else:
-        raise TypeError(f"Metrics must either be an iterable of strings or `all` not: {type(metrics)}")
-    
+        raise TypeError(
+            f"Metrics must either be an iterable of strings or `all` not: {type(metrics)}"
+        )
+
     summary = mh.compute(acc, metrics=metrics_list, name="acc")
     summary = summary.transpose()
 
     if save is not None and save != "":
         summary.to_csv(save)
 
-    return summary['acc']
-
-
+    return summary["acc"]
diff --git a/biogtr/inference/post_processing.py b/biogtr/inference/post_processing.py
index 1dcb847..2683715 100644
--- a/biogtr/inference/post_processing.py
+++ b/biogtr/inference/post_processing.py
@@ -1,7 +1,7 @@
 """Helper functions for post-processing association matrix pre-tracking."""
+
 import torch
 from biogtr.inference.boxes import Boxes
-from copy import deepcopy
 
 
 def weight_decay_time(
@@ -142,18 +142,21 @@ def filter_max_center_dist(
         ), "Need `k_boxes`, `nonk_boxes`, and `id_ind` to filter by `max_center_dist`"
         k_ct = (k_boxes[:, :2] + k_boxes[:, 2:]) / 2
         k_s = ((k_boxes[:, 2:] - k_boxes[:, :2]) ** 2).sum(dim=1)  # n_k
-        
+
         nonk_ct = (nonk_boxes[:, :2] + nonk_boxes[:, 2:]) / 2
         dist = ((k_ct[:, None] - nonk_ct[None, :]) ** 2).sum(dim=2)  # n_k x Np
-        
+
         norm_dist = dist / (k_s[:, None] + 1e-8)  # n_k x Np
         # id_inds # Np x M
         valid = norm_dist < max_center_dist  # n_k x Np
-        
+
         valid_assn = (
-            torch.mm(valid.float(), id_inds.to(valid.device)).clamp_(max=1.0).long().bool()
+            torch.mm(valid.float(), id_inds.to(valid.device))
+            .clamp_(max=1.0)
+            .long()
+            .bool()
         )  # n_k x M
-        asso_output_filtered = deepcopy(asso_output)
+        asso_output_filtered = asso_output.clone()
         asso_output_filtered[~valid_assn] = 0  # n_k x M
         return asso_output_filtered
     else:
diff --git a/biogtr/inference/track.py b/biogtr/inference/track.py
index 6c051e6..e4d417d 100644
--- a/biogtr/inference/track.py
+++ b/biogtr/inference/track.py
@@ -2,7 +2,7 @@
 
 from biogtr.config import Config
 from biogtr.models.gtr_runner import GTRRunner
-from biogtr.datasets.tracking_dataset import TrackingDataset
+from biogtr.data_structures import Frame
 from omegaconf import DictConfig
 from pprint import pprint
 from pathlib import Path
@@ -17,29 +17,44 @@
 
 torch.set_default_device(device)
 
-def export_trajectories(instances_pred: list[dict], save_path: str = None):
+
+def export_trajectories(frames_pred: list[Frame], save_path: str = None):
+    """Convert trajectories to data frame and save as .csv.
+
+    Args:
+        frames_pred: A list of Frames with predicted track ids.
+        save_path: The path to save the predicted trajectories to.
+
+    Returns:
+        A dictionary containing the predicted track id and centroid coordinates for each instance in the video.
+    """
     save_dict = {}
     frame_ids = []
     X, Y = [], []
     pred_track_ids = []
-    for frame in instances_pred:
-        for i in range(frame["num_detected"]):
-            frame_ids.append(frame["frame_id"].item())
-            bbox = frame["bboxes"][i]
+    track_scores = []
+    for frame in frames_pred:
+        for i, instance in enumerate(frame.instances):
+            frame_ids.append(frame.frame_id.item())
+            bbox = instance.bbox.squeeze()
             y = (bbox[2] + bbox[0]) / 2
             x = (bbox[3] + bbox[1]) / 2
             X.append(x.item())
             Y.append(y.item())
-            pred_track_ids.append(frame["pred_track_ids"][i].item())
+            track_scores.append(instance.track_score)
+            pred_track_ids.append(instance.pred_track_id.item())
+
     save_dict["Frame"] = frame_ids
     save_dict["X"] = X
     save_dict["Y"] = Y
     save_dict["Pred_track_id"] = pred_track_ids
+    save_dict["Track_score"] = track_scores
     save_df = pd.DataFrame(save_dict)
     if save_path:
         save_df.to_csv(save_path, index=False)
     return save_df
 
+
 def inference(
     model: GTRRunner, dataloader: torch.utils.data.DataLoader
 ) -> list[pd.DataFrame]:
@@ -60,7 +75,7 @@ def inference(
 
     for batch in preds:
         for frame in batch:
-            vid_trajectories[frame["video_id"]].append(frame)
+            vid_trajectories[frame.video_id].append(frame)
 
     saved = []
 
@@ -72,16 +87,15 @@ def inference(
             X, Y = [], []
             pred_track_ids = []
             for frame in video:
-                for i in range(frame["num_detected"]):
-                    video_ids.append(frame["video_id"].item())
-                    frame_ids.append(frame["frame_id"].item())
-                    bbox = frame["bboxes"][i]
-
+                for i, instance in frame.instances:
+                    video_ids.append(frame.video_id.item())
+                    frame_ids.append(frame.frame_id.item())
+                    bbox = instance.bbox
                     y = (bbox[2] + bbox[0]) / 2
                     x = (bbox[3] + bbox[1]) / 2
                     X.append(x.item())
                     Y.append(y.item())
-                    pred_track_ids.append(frame["pred_track_ids"][i].item())
+                    pred_track_ids.append(instance.pred_track_id.item())
             save_dict["Video"] = video_ids
             save_dict["Frame"] = frame_ids
             save_dict["X"] = X
@@ -95,9 +109,7 @@ def inference(
 
 @hydra.main(config_path="configs", config_name=None, version_base=None)
 def main(cfg: DictConfig):
-    """Main function for running inference.
-
-    handles config parsing, batch deployment and saving results
+    """Run inference based on config file.
 
     Args:
         cfg: A dictconfig loaded from hydra containing checkpoint path and data
diff --git a/biogtr/inference/track_queue.py b/biogtr/inference/track_queue.py
new file mode 100644
index 0000000..43a4353
--- /dev/null
+++ b/biogtr/inference/track_queue.py
@@ -0,0 +1,306 @@
+"""Module handling sliding window tracking."""
+
+import warnings
+from biogtr.data_structures import Frame
+from collections import deque
+import numpy as np
+
+
+class TrackQueue:
+    """Class handling track local queue system for sliding window.
+
+    Each trajectory has its own deque based queue of size `window_size - 1`.
+    Elements of the queue are Instance objects that have already been tracked
+    and will be compared against later frames for assignment.
+    """
+
+    def __init__(self, window_size: int, max_gap: int = np.inf, verbose: bool = False):
+        """Initialize track queue.
+
+        Args:
+            window_size: The number of instances per trajectory allowed in the
+            queue to be compared against.
+            max_gap: The number of consecutive frames a trajectory can fail to
+            appear in before terminating the track.
+            verbose: Whether to print info during operations.
+        """
+        self._window_size = window_size
+        self._queues = {}
+        self._max_gap = max_gap
+        self._curr_gap = {}
+        if self._max_gap <= self._window_size:
+            self._max_gap = self._window_size
+        self._curr_track = -1
+        self._verbose = verbose
+
+    def __len__(self):
+        """Get length of the queue.
+
+        Returns:
+            The total number of instances in every sub-queue.
+        """
+        return sum([len(queue) for queue in self._queues.values()])
+
+    def __repr__(self):
+        """Return the string representation of the TrackQueue.
+
+        Returns:
+            The string representation of the current state of the queue.
+        """
+        return (
+            "TrackQueue("
+            f"window_size={self.window_size}, "
+            f"max_gap={self.max_gap}, "
+            f"n_tracks={self.n_tracks}, "
+            f"curr_track={self.curr_track}, "
+            f"queues={[(key,len(queue)) for key, queue in self._queues.items()]}, "
+            f"curr_gap:{self._curr_gap}"
+            ")"
+        )
+
+    @property
+    def window_size(self) -> int:
+        """The maximum number of instances allowed in a sub-queue to be compared against.
+
+        Returns:
+            An int representing The maximum number of instances allowed in a
+            sub-queue to be compared against.
+        """
+        return self._window_size
+
+    @window_size.setter
+    def window_size(self, window_size: int) -> None:
+        """Set the window size of the queue.
+
+        Args:
+            window_size: An int representing The maximum number of instances
+            allowed in a sub-queue to be compared against.
+        """
+        self._window_size = window_size
+
+    @property
+    def max_gap(self) -> int:
+        """The maximum number of consecutive frames an trajectory can fail to appear before termination.
+
+        Returns:
+            An int representing the maximum number of consecutive frames an trajectory can fail to
+            appear before termination.
+        """
+        return self._max_gap
+
+    @max_gap.setter
+    def max_gap(self, max_gap: int) -> None:
+        """Set the max consecutive frame gap allowed for a trajectory.
+
+        Args:
+            max_gap: An int representing the maximum number of consecutive frames an trajectory can fail to
+            appear before termination.
+        """
+        self._max_gap = max_gap
+
+    @property
+    def curr_track(self) -> int:
+        """The newest *created* trajectory in the queue.
+
+        Returns:
+            The latest *created* trajectory in the queue.
+        """
+        return self._curr_track
+
+    @curr_track.setter
+    def curr_track(self, curr_track: int) -> None:
+        """Set the newest *created* trajectory in the queue.
+
+        Args:
+            curr_track: The latest *created* trajectory in the queue.
+        """
+        self._curr_track = curr_track
+
+    @property
+    def n_tracks(self) -> int:
+        """The current number of trajectories in the queue.
+
+        Returns:
+            An int representing the current number of trajectories in the queue.
+        """
+        return len(self._queues.keys())
+
+    @property
+    def tracks(self) -> list:
+        """A list of the track ids currently in the queue.
+
+        Returns:
+            A list containing the track ids currently in the queue.
+        """
+        return list(self._queues.keys())
+
+    @property
+    def verbose(self) -> bool:
+        """Indicate whether or not to print outputs along operations. Mostly used for debugging.
+
+        Returns:
+            A boolean representing whether or not printing is turned on.
+        """
+        return self._verbose
+
+    @verbose.setter
+    def verbose(self, verbose: bool) -> None:
+        """Turn on/off printing.
+
+        Args:
+            verbose: A boolean representing whether printing should be on or off.
+        """
+        self._verbose = verbose
+
+    def end_tracks(self, track_id=None):
+        """Terminate tracks and removing them from the queue.
+
+        Args:
+            track_id: The index of the trajectory to be ended and removed.
+            If `None` then then every trajectory is removed and the track queue is reset.
+
+        Returns:
+            True if the track is successively removed, otherwise False.
+            (ie if the track doesn't exist in the queue.)
+        """
+        if track_id is None:
+            self._queues = {}
+            self._curr_gap = {}
+            self.curr_track = -1
+        else:
+            try:
+                self._queues.pop(track_id)
+                self._curr_gap.pop(track_id)
+            except Exception as e:
+                print(f"Unable to end track due to {e}")
+                return False
+        return True
+
+    def add_frame(self, frame: Frame) -> None:
+        """Add frames to the queue.
+
+        Each instance from the frame is added to the queue according to its pred_track_id.
+        If the corresponding trajectory is not already in the queue then create a new queue for the track.
+
+        Args:
+            frame: A Frame object containing instances that have already been tracked.
+        """
+        if frame.num_detected == 0:  # only add frames with instances.
+            return
+        vid_id = frame.video_id.item()
+        frame_id = frame.frame_id.item()
+        img_shape = frame.img_shape
+        if isinstance(frame.video, str):
+            vid_name = frame.video
+        else:
+            vid_name = frame.video.filename
+        # traj_score = frame.get_traj_score()  TODO: figure out better way to save trajectory scores.
+        frame_meta = (vid_id, frame_id, vid_name, img_shape.cpu().tolist())
+
+        pred_tracks = []
+        for instance in frame.instances:
+            pred_track_id = instance.pred_track_id.item()
+            pred_tracks.append(pred_track_id)
+
+            if pred_track_id not in self._queues.keys():
+                self._queues[pred_track_id] = deque(
+                    [(*frame_meta, instance)], maxlen=self.window_size - 1
+                )  # dumb work around to retain `img_shape`
+                self.curr_track = pred_track_id
+
+                if self.verbose:
+                    warnings.warn(
+                        f"New track = {pred_track_id} on frame {frame_id}! Current number of tracks = {self.n_tracks}"
+                    )
+
+            else:
+                self._queues[pred_track_id].append((*frame_meta, instance))
+        self.increment_gaps(
+            pred_tracks
+        )  # should this be done in the tracker or the queue?
+
+    def collate_tracks(
+        self, track_ids: list[int] = None, device: str = None
+    ) -> list[Frame]:
+        """Merge queues into a single list of Frames containing corresponding instances.
+
+        Args:
+            track_ids: A list of trajectorys to merge. If None, then merge all
+            queues, otherwise filter queues by track_ids then merge.
+            device: A str representation of the device the frames should be on after merging
+            since all instances in the queue are kept on the cpu.
+
+        Returns:
+            A sorted list of Frame objects from which each instance came from,
+            containing the corresponding instances.
+        """
+        if len(self._queues) == 0:
+            return []
+
+        frames = {}
+
+        tracks_to_convert = (
+            {track: queue for track, queue in self._queues if track in track_ids}
+            if track_ids is not None
+            else self._queues
+        )
+        for track, instances in tracks_to_convert.items():
+            for video_id, frame_id, vid_name, img_shape, instance in instances:
+                if (video_id, frame_id) not in frames.keys():
+                    frame = Frame(
+                        video_id,
+                        frame_id,
+                        img_shape=img_shape,
+                        instances=[instance],
+                        vid_file=vid_name,
+                    )
+                    frames[(video_id, frame_id)] = frame
+                else:
+                    frames[(video_id, frame_id)].instances.append(instance)
+        return [frames[frame].to(device) for frame in sorted(frames.keys())]
+
+    def increment_gaps(self, pred_track_ids: list[int]) -> dict[int, bool]:
+        """Keep track of number of consecutive frames each trajectory has been missing from the queue.
+
+        If a trajectory has exceeded the `max_gap` then terminate the track and remove it from the queue.
+
+        Args:
+            pred_track_ids: A list of track_ids to be matched against the trajectories in the queue.
+            If a trajectory is in `pred_track_ids` then its gap counter is reset,
+            otherwise its incremented by 1.
+
+        Returns:
+            A dictionary containing the trajectory id and a boolean value representing
+            whether or not it has exceeded the max allowed gap and been
+            terminated.
+        """
+        exceeded_gap = {}
+
+        for track in pred_track_ids:
+            if track not in self._curr_gap:
+                self._curr_gap[track] = 0
+
+        for track in self._curr_gap:
+            if track not in pred_track_ids:
+                self._curr_gap[track] += 1
+                if self.verbose:
+                    warnings.warn(
+                        f"Track {track} has not been seen for {self._curr_gap[track]} frames."
+                    )
+            else:
+                self._curr_gap[track] = 0
+            if self._curr_gap[track] >= self.max_gap:
+                exceeded_gap[track] = True
+            else:
+                exceeded_gap[track] = False
+
+        for track, gap_exceeded in exceeded_gap.items():
+            if gap_exceeded:
+                if self.verbose:
+                    warnings.warn(
+                        f"Track {track} has not been seen for {self._curr_gap[track]} frames! Terminating Track...Current number of tracks = {self.n_tracks}."
+                    )
+                self._queues.pop(track)
+                self._curr_gap.pop(track)
+
+        return exceeded_gap
diff --git a/biogtr/inference/tracker.py b/biogtr/inference/tracker.py
index 0162587..44674aa 100644
--- a/biogtr/inference/tracker.py
+++ b/biogtr/inference/tracker.py
@@ -1,13 +1,16 @@
 """Module containing logic for going from association -> assignment."""
+
 import torch
 import pandas as pd
+import warnings
+from biogtr.data_structures import Frame
+from biogtr.models.global_tracking_transformer import GlobalTrackingTransformer
 from biogtr.models import model_utils
+from biogtr.inference.track_queue import TrackQueue
 from biogtr.inference import post_processing
 from biogtr.inference.boxes import Boxes
-from biogtr.models.global_tracking_transformer import GlobalTrackingTransformer
 from scipy.optimize import linear_sum_assignment
-from copy import deepcopy
-from collections import deque
+from math import inf
 
 
 class Tracker:
@@ -22,24 +25,31 @@ def __init__(
         decay_time: float = None,
         iou: str = None,
         max_center_dist: float = None,
-        persistent_tracking: bool = False
+        persistent_tracking: bool = False,
+        max_gap: int = inf,
+        max_tracks: int = inf,
+        verbose=False,
     ):
         """Initialize a tracker to run inference.
 
         Args:
-            window_size: the size of the window used during sliding inference
-            use_vis_feats: Whether or not to use visual feature extractor
-            overlap_thresh: the trajectory overlap threshold to be used for assignment
-            mult_thresh: Whether or not to use weight threshold
-            decay_time: weight for `decay_time` postprocessing
+            window_size: the size of the window used during sliding inference.
+            use_vis_feats: Whether or not to use visual feature extractor.
+            overlap_thresh: the trajectory overlap threshold to be used for assignment.
+            mult_thresh: Whether or not to use weight threshold.
+            decay_time: weight for `decay_time` postprocessing.
             iou: Either [None, '', "mult" or "max"]
-                 Whether to use multiplicative or max iou reweighting
-            max_center_dist: distance threshold for filtering trajectory score matrix
-            persistent_tracking: whether to keep a buffer across chunks or not
+                 Whether to use multiplicative or max iou reweighting.
+            max_center_dist: distance threshold for filtering trajectory score matrix.
+            persistent_tracking: whether to keep a buffer across chunks or not.
+            max_gap: the max number of frames a trajectory can be missing before termination.
+            max_tracks: the maximum number of tracks that can be created while tracking.
+            We force the tracker to assign instances to a track instead of creating a new track if max_tracks has been reached.
+            verbose: Whether or not to turn on debug printing after each operation.
         """
-        
-        self.window_size = window_size
-        self.track_queue = deque(maxlen=self.window_size)
+        self.track_queue = TrackQueue(
+            window_size=window_size, max_gap=max_gap, verbose=verbose
+        )
         self.use_vis_feats = use_vis_feats
         self.overlap_thresh = overlap_thresh
         self.mult_thresh = mult_thresh
@@ -47,43 +57,40 @@ def __init__(
         self.iou = iou
         self.max_center_dist = max_center_dist
         self.persistent_tracking = persistent_tracking
+        self.verbose = verbose
+        self.max_tracks = max_tracks
 
-    def __call__(self, model: GlobalTrackingTransformer, instances: list[dict], all_instances: list = None):
-        """Wrapper around `track` to enable `tracker()` instead of `tracker.track()`.
+    def __call__(self, model: GlobalTrackingTransformer, frames: list[Frame]):
+        """Wrap around `track` to enable `tracker()` instead of `tracker.track()`.
 
         Args:
             model: the pretrained GlobalTrackingTransformer to be used for inference
-            instances: data dict to run inference on
-            all_instances: list of instances from previous chunks
-                           to stitch together full trajectory
+            frames: list of Frames to run inference on
 
         Returns:
-            instances dict populated with pred track ids and association matrix scores
+            List of frames containing association matrix scores and instances populated with pred track ids.
         """
-        return self.track(model, instances, all_instances)
+        return self.track(model, frames)
 
-    def track(self, model: GlobalTrackingTransformer, instances: list[dict], all_instances: list = None):
+    def track(self, model: GlobalTrackingTransformer, frames: list[dict]):
         """Run tracker and get predicted trajectories.
 
         Args:
             model: the pretrained GlobalTrackingTransformer to be used for inference
-            instances: data dict to run inference on
-            all_instances: list of instances from previous chunks to stitch together full trajectory
+            frames: data dict to run inference on
 
         Returns:
-            instances dict populated with pred track ids and association matrix scores
+            List of Frames populated with pred track ids and association matrix scores
         """
-# Extract feature representations with pre-trained encoder.
+        # Extract feature representations with pre-trained encoder.
 
         _ = model.eval()
 
-        for frame in instances:
-            if (frame["num_detected"] > 0).item():
+        for frame in frames:
+            if frame.has_instances():
                 if not self.use_vis_feats:
-                    num_frame_instances = frame["crops"].shape[0]
-                    frame["features"] = torch.zeros(
-                        num_frame_instances, model.d_model
-                    )
+                    for instance in frame.instances:
+                        instance.features = torch.zeros(1, model.d_model)
                     # frame["features"] = torch.randn(
                     #     num_frame_instances, self.model.d_model
                     # )
@@ -91,10 +98,13 @@ def track(self, model: GlobalTrackingTransformer, instances: list[dict], all_ins
                 # comment out to turn encoder off
 
                 # Assuming the encoder is already trained or train encoder jointly.
-                elif 'features' not in frame or frame['features'] == None or len(frame['features']) == 0:
+                elif not frame.has_features():
                     with torch.no_grad():
-                        z = model.visual_encoder(frame["crops"])
-                        frame["features"] = z
+                        crops = frame.get_crops()
+                        z = model.visual_encoder(crops)
+
+                        for i, z_i in enumerate(z):
+                            frame.instances[i].features = z_i
 
         # I feel like this chunk is unnecessary:
         # reid_features = torch.cat(
@@ -104,45 +114,25 @@ def track(self, model: GlobalTrackingTransformer, instances: list[dict], all_ins
         # asso_preds, pred_boxes, pred_time, embeddings = self.model(
         #     instances, reid_features
         # )
-        instances_pred = self.sliding_inference(
-            model, instances, window_size=self.window_size, all_instances=all_instances
-        )
-        
+        instances_pred = self.sliding_inference(model, frames)
+
         if not self.persistent_tracking:
-            # print(f'Clearing Queue after tracking')
-            self.track_queue.clear()
-            
+            if self.verbose:
+                warnings.warn(f"Clearing Queue after tracking")
+            self.track_queue.end_tracks()
+
         return instances_pred
 
-    def sliding_inference(self, model: GlobalTrackingTransformer, instances, window_size, all_instances=None):
-        """Performs sliding inference on the input video (instances) with a given window size.
+    def sliding_inference(self, model: GlobalTrackingTransformer, frames: list[Frame]):
+        """Perform sliding inference on the input video (instances) with a given window size.
 
         Args:
             model: the pretrained GlobalTrackingTransformer to be used for inference
-            instances: A list of dictionaries, one dictionary for each frame. An example
-            is provided below.
-            window_size: An integer.
+            frame: A list of Frames (See `biogtr.data_structures.Frame` for more info).
+
 
         Returns:
-            instances: A list of dictionaries, one dictionary for each frame. An example
-            is provided below.
-        # ------------------------- An example of instances ------------------------ #
-        D: embedding dimension.
-        N_i: number of detected instances in i-th frame of window.
-        instances = [
-            {
-                # Each dictionary is a frame.
-                "frame_id": frame index int,
-                "num_detected": N_i,
-                "gt_track_ids": (N_i,),
-                "poses": (N_i, 13, 2),  # 13 keypoints for the pose (x, y) coords.
-                "bboxes": (N_i, 4),  # in pascal_voc unrounded unnormalized
-                "features": (N_i, D),  # Features are deleted but can optionally be kept if need be.
-                "pred_track_ids": (N_i,),  # Filled out after sliding_inference.
-            },
-            {},  # Frame 2.
-            ...
-        ]
+            Frames: A list of Frames populated with pred_track_ids and asso_matrices
         """
         # B: batch size.
         # D: embedding dimension.
@@ -150,119 +140,80 @@ def sliding_inference(self, model: GlobalTrackingTransformer, instances, window_
         # H: height.
         # W: width.
 
-        video_len = len(instances)
-        id_count = 0
-
-        for batch_idx in range(video_len):
-                
-            if (self.persistent_tracking and instances[batch_idx]['frame_id'] == 0):
-                self.track_queue.clear()
-                
-            if len(self.track_queue) == 0 or sum([len(frame["pred_track_ids"]) for frame in self.track_queue]) == 0:
-                # print(f'Initializing track on batch {batch_idx} frame {instances[batch_idx]["frame_id"]}')
-                instances[batch_idx]["pred_track_ids"] = torch.arange(
-                    0, len(instances[batch_idx]["bboxes"])
+        for batch_idx, frame_to_track in enumerate(frames):
+            tracked_frames = self.track_queue.collate_tracks()
+            if self.verbose:
+                warnings.warn(
+                    f"Current number of tracks is {self.track_queue.n_tracks}"
                 )
 
-                id_count = len(instances[batch_idx]["bboxes"])
-                # print(f'Initial tracks are {instances[batch_idx]["pred_track_ids"]}')
-                self.track_queue.append(instances[batch_idx])
-
-            else:   
-                instances_to_track = (list(self.track_queue) + [instances[batch_idx]])[-window_size:]
-                
-                if sum([frame['num_detected'] for frame in instances_to_track]) == 0:
-                    print("No detections to track!")
-                    
-                    instances[batch_idx]["pred_track_ids"] = torch.arange(
-                    0, len(instances[batch_idx]["bboxes"])
-                    )
-                    
-                    self.track_queue.append(instances[batch_idx])
-                    continue
-                
-                query_ind = min(window_size - 1, len(instances_to_track) - 1)
-                    
-                instances[batch_idx], id_count = self._run_global_tracker(
-                    model,
-                    instances_to_track,
-                    query_frame=query_ind,
-                    id_count=id_count,
-                    overlap_thresh=self.overlap_thresh,
-                    mult_thresh=self.mult_thresh,
-                )
+            if (
+                self.persistent_tracking and frame_to_track.frame_id == 0
+            ):  # check for new video and clear queue
+                if self.verbose:
+                    warnings.warn("New Video! Resetting Track Queue.")
+                self.track_queue.end_tracks()
 
             """
-            # If first frame.
-            if frame_id == 0:
-                instances[0]["pred_track_ids"] = torch.arange(
-                    0, len(instances[0]["bboxes"]))
-                id_count = len(instances[0]["bboxes"])
-            else:
-                win_st = max(0, frame_id + 1 - window_size)
-                win_ed = frame_id + 1
-                instances[win_st: win_ed], id_count = self._run_global_tracker(
-                    instances[win_st: win_ed],
-                    query_frame=min(window_size - 1, frame_id),
-                    id_count=id_count,
-                    overlap_thresh=self.overlap_thresh,
-                    mult_thresh=self.mult_thresh)
+            Initialize tracks on first frame of video or first instance of detections.
             """
+            if len(self.track_queue) == 0:
+                if frame_to_track.has_instances():
+                    if self.verbose:
+                        warnings.warn(
+                            f"Initializing track on clip ind {batch_idx} frame {frame_to_track.frame_id.item()}"
+                        )
+
+                    curr_track_id = 0
+                    for i, instance in enumerate(frames[batch_idx].instances):
+                        instance.pred_track_id = instance.gt_track_id
+                        curr_track_id = instance.pred_track_id
+
+                    for i, instance in enumerate(frames[batch_idx].instances):
+                        if instance.pred_track_id == -1:
+                            instance.pred_track_id = curr_track_id
+                            curr_track += 1
 
-            # If features are out of window, set to none.
-            # if frame_id - window_size >= 0:
-            #     instances[frame_id - window_size]["features"] = None
-
-        # TODO: Insert postprocessing.
+            else:
+                if (
+                    frame_to_track.has_instances()
+                ):  # Check if there are detections. If there are skip and increment gap count
+                    frames_to_track = tracked_frames + [
+                        frame_to_track
+                    ]  # better var name?
+
+                    query_ind = len(frames_to_track) - 1
+
+                    frame_to_track = self._run_global_tracker(
+                        model,
+                        frames_to_track,
+                        query_ind=query_ind,
+                    )
 
-        for frame in instances[:len(instances)-window_size]:
-            frame["features"] = frame["features"].cpu()
+            if frame_to_track.has_instances():
+                self.track_queue.add_frame(frame_to_track)
+            else:
+                self.track_queue.increment_gaps([])
 
-        return instances
+            frames[batch_idx] = frame_to_track
+        return frames
 
-    def _run_global_tracker(self, model: GlobalTrackingTransformer, instances, query_frame, id_count, overlap_thresh, mult_thresh):
-        """Run_global_tracker performs the actual tracking.
+    def _run_global_tracker(
+        self, model: GlobalTrackingTransformer, frames: list[Frame], query_ind: int
+    ) -> Frame:
+        """Run global tracker performs the actual tracking.
 
         Uses Hungarian algorithm to do track assigning.
 
         Args:
             model: the pretrained GlobalTrackingTransformer to be used for inference
-            instances: A list of dictionaries, one dictionary for each frame. An example
-            is provided below.
-            query_frame: An integer for the query frame within the window of instances.
-            id_count: The count of total identities so far.
-            overlap_thresh: A float number between 0 and 1 specifying how much
-            overlap is necessary for assigning a new instance to an existing identity.
-            mult_thresh: A boolean for whether or not multiple thresholds should be used.
-            This is not functional as of now.
+            frames: A list of Frames containing reid features. See `biogtr.data_structures` for more info.
+            query_ind: An integer for the query frame within the window of instances.
 
         Returns:
-            instances: The exact list of dictionaries as before but with assigned track ids
-            and new track ids for the query frame. Refer to the example for the structure.
-            id_count: An integer for the updated identity count so far.
-            # ------------------------- An example of instances ------------------------ #
-            NOTE: This instances variable is the window subset of the instances variable in sliding_inference.
-            *: each item in instances is a frame in the window. So it follows
-                that each frame in the window has * detected instances.
-            D: embedding dimension.
-            N_i: number of detected instances in i-th frame of window.
-            window_size: length of window.
-            The features in instances can be of shape (2 to window_size, *, D) when stacked together.
-            instances = [
-                {
-                    # Each dictionary is a frame.
-                    "frame_id": frame index int,
-                    "num_detected": N_i,
-                    "gt_track_ids": (N_i,),
-                    "poses": (N_i, 13, 2),  # 13 keypoints for the pose (x, y) coords.
-                    "bboxes": (N_i, 4),  # in pascal_voc unrounded unnormalized
-                    "features": (N_i, D),
-                    "pred_track_ids": (N_i,),  # Before assignnment, these are all -1.
-                },
-                ...
-            ]
+            query_frame: The query frame now populated with the pred_track_ids.
         """
-        # *: each item in instances is a frame in the window. So it follows
+        # *: each item in frames is a frame in the window. So it follows
         #    that each frame in the window has * detected instances.
         # D: embedding dimension.
         # total_instances: number of instances in the window.
@@ -276,81 +227,156 @@ def _run_global_tracker(self, model: GlobalTrackingTransformer, instances, query
 
         # Number of instances in each frame of the window.
         # E.g.: instances_per_frame: [4, 5, 6, 7]; window of length 4 with 4 detected instances in the first frame of the window.
-        # print([frame['frame_id'].item() for frame in instances])
-        # print([frame['frame_id'].item() for frame in instances])
-        # print([frame['pred_track_ids'] for frame in instances])
+
         _ = model.eval()
-        instances_per_frame = [frame["num_detected"] for frame in instances]
+        query_frame = frames[query_ind]
+
+        if self.verbose:
+            print(f"Frame {query_frame.frame_id.item()}")
+
+        instances_per_frame = [frame.num_detected for frame in frames]
+
+        total_instances, window_size = sum(instances_per_frame), len(
+            instances_per_frame
+        )  # Number of instances in window; length of window.
+
+        if self.verbose:
+            print(f"total_instances: {total_instances}")
+
+        overlap_thresh = self.overlap_thresh
+        mult_thresh = self.mult_thresh
+        n_traj = self.track_queue.n_tracks
 
-        total_instances, window_size = sum(instances_per_frame), len(instances_per_frame)  # Number of instances in window; length of window.
-        reid_features = torch.cat([frame["features"] for frame in instances], dim=0)[
+        reid_features = torch.cat([frame.get_features() for frame in frames], dim=0)[
             None
         ]  # (1, total_instances, D=512)
 
         # (L=1, n_query, total_instances)
         with torch.no_grad():
-            if model.transformer.return_embedding:
-                asso_output, embed = model(instances, query_frame=query_frame)
-                instances[query_frame]["embeddings"] = embed
-            else:
-                asso_output = model(instances, query_frame=query_frame)
+            asso_output, embed = model(frames, query_frame=query_ind)
+            # if model.transformer.return_embedding:
+            # query_frame.embeddings = embed TODO add embedding to Instance Object
         # if query_frame == 1:
         #     print(asso_output)
-        asso_output = asso_output[-1].split(instances_per_frame, dim=1)  # (window_size, n_query, N_i)
-        asso_output = model_utils.softmax_asso(asso_output)  # (window_size, n_query, N_i)
+        asso_output = asso_output[-1].split(
+            instances_per_frame, dim=1
+        )  # (window_size, n_query, N_i)
+        asso_output = model_utils.softmax_asso(
+            asso_output
+        )  # (window_size, n_query, N_i)
         asso_output = torch.cat(asso_output, dim=1).cpu()  # (n_query, total_instances)
 
+        asso_output_df = pd.DataFrame(
+            asso_output.clone().numpy(),
+            columns=[f"Instance {i}" for i in range(asso_output.shape[-1])],
+        )
+
+        asso_output_df.index.name = "Instances"
+        asso_output_df.columns.name = "Instances"
+
+        query_frame.add_traj_score("asso_output", asso_output_df)
+        query_frame.asso_output = asso_output
+
         try:
-            n_query = instances[query_frame][
-                "num_detected"
-            ]  # Number of instances in the current/query frame.
+            n_query = (
+                query_frame.num_detected
+            )  # Number of instances in the current/query frame.
         except Exception as e:
-            print(len(instances), query_frame, instances[-1])
-            raise(e)
+            print(len(frames), query_frame, frames[-1])
+            raise (e)
 
         n_nonquery = (
             total_instances - n_query
         )  # Number of instances in the window not including the current/query frame.
-        
+
+        if self.verbose:
+            print(f"n_nonquery: {n_nonquery}")
+            print(f"n_query: {n_query}")
         try:
             instance_ids = torch.cat(
-                [x["pred_track_ids"] for batch_idx, x in enumerate(instances) if batch_idx != query_frame], dim=0
+                [
+                    x.get_pred_track_ids()
+                    for batch_idx, x in enumerate(frames)
+                    if batch_idx != query_ind
+                ],
+                dim=0,
             ).view(
                 n_nonquery
             )  # (n_nonquery,)
         except Exception as e:
-            print(instances)
-            raise(e)
+            print(
+                [
+                    [instance.pred_track_id.device for instance in frame.instances]
+                    for frame in frames
+                ]
+            )
+            raise (e)
 
-        query_inds = [x for x in range(sum(instances_per_frame[:query_frame]), sum(instances_per_frame[: query_frame + 1]))]
+        query_inds = [
+            x
+            for x in range(
+                sum(instances_per_frame[:query_ind]),
+                sum(instances_per_frame[: query_ind + 1]),
+            )
+        ]
         nonquery_inds = [i for i in range(total_instances) if i not in query_inds]
+
         asso_nonquery = asso_output[:, nonquery_inds]  # (n_query, n_nonquery)
 
-        pred_boxes, _ = model_utils.get_boxes_times(instances)
+        asso_nonquery_df = pd.DataFrame(
+            asso_nonquery.clone().numpy(), columns=nonquery_inds
+        )
+
+        asso_nonquery_df.index.name = "Current Frame Instances"
+        asso_nonquery_df.columns.name = "Nonquery Instances"
+
+        query_frame.add_traj_score("asso_nonquery", asso_nonquery_df)
+
+        pred_boxes, _ = model_utils.get_boxes_times(frames)
         query_boxes = pred_boxes[query_inds]  # n_k x 4
-        nonquery_boxes = pred_boxes[nonquery_inds]  #n_nonquery x 4
+        nonquery_boxes = pred_boxes[nonquery_inds]  # n_nonquery x 4
         # TODO: Insert postprocessing.
 
         unique_ids = torch.unique(instance_ids)  # (n_nonquery,)
-        n_traj = len(unique_ids)  # Number of existing tracks.
-        id_inds = (unique_ids[None, :] == instance_ids[:, None]).float()  # (n_nonquery, n_traj)
+
+        if self.verbose:
+            print(f"Instance IDs: {instance_ids}")
+            print(f"unique ids: {unique_ids}")
+
+        id_inds = (
+            unique_ids[None, :] == instance_ids[:, None]
+        ).float()  # (n_nonquery, n_traj)
 
         ################################################################################
 
         # reweighting hyper-parameters for association -> they use 0.9
 
-        # (n_query x n_nonquery) x (n_nonquery x n_traj) --> n_k x n_traj
         traj_score = post_processing.weight_decay_time(
-            asso_nonquery, self.decay_time, reid_features, window_size, query_frame
+            asso_nonquery, self.decay_time, reid_features, window_size, query_ind
         )
 
+        if self.decay_time is not None and self.decay_time > 0:
+            decay_time_traj_score = pd.DataFrame(
+                traj_score.clone().numpy(), columns=nonquery_inds
+            )
+
+            decay_time_traj_score.index.name = "Query Instances"
+            decay_time_traj_score.columns.name = "Nonquery Instances"
+
+            query_frame.add_traj_score("decay_time", decay_time_traj_score)
+        ################################################################################
+
+        # (n_query x n_nonquery) x (n_nonquery x n_traj) --> n_k x n_traj
         traj_score = torch.mm(traj_score, id_inds.cpu())  # (n_query, n_traj)
 
-        instances[query_frame]["decay_time_traj_score"] = pd.DataFrame(
-            deepcopy((traj_score).numpy()), columns=unique_ids.cpu().numpy()
+        traj_score_df = pd.DataFrame(
+            traj_score.clone().numpy(), columns=unique_ids.cpu().numpy()
         )
-        instances[query_frame]["decay_time_traj_score"].index.name = "Current Frame Instances"
-        instances[query_frame]["decay_time_traj_score"].columns.name = "Unique IDs"
+
+        traj_score_df.index.name = "Current Frame Instances"
+        traj_score_df.columns.name = "Unique IDs"
+
+        query_frame.add_traj_score("traj_score", traj_score_df)
         ################################################################################
 
         # with iou -> combining with location in tracker, they set to True
@@ -362,7 +388,7 @@ def _run_global_tracker(self, model: GlobalTrackingTransformer, instances, query
             #    n_nonquery, device=id_inds.device)[:, None]).max(dim=0)[1] # n_traj
 
             last_inds = (
-                id_inds * torch.arange(n_nonquery[0], device=id_inds.device)[:, None]
+                id_inds * torch.arange(n_nonquery, device=id_inds.device)[:, None]
             ).max(dim=0)[
                 1
             ]  # M
@@ -374,6 +400,16 @@ def _run_global_tracker(self, model: GlobalTrackingTransformer, instances, query
         else:
             last_ious = traj_score.new_zeros(traj_score.shape)
         traj_score = post_processing.weight_iou(traj_score, self.iou, last_ious.cpu())
+
+        if self.iou is not None and self.iou != "":
+            iou_traj_score = pd.DataFrame(
+                traj_score.clone().numpy(), columns=unique_ids.cpu().numpy()
+            )
+
+            iou_traj_score.index.name = "Current Frame Instances"
+            iou_traj_score.columns.name = "Unique IDs"
+
+            query_frame.add_traj_score("weight_iou", iou_traj_score)
         ################################################################################
 
         # threshold for continuing a tracking or starting a new track -> they use 1.0
@@ -382,6 +418,25 @@ def _run_global_tracker(self, model: GlobalTrackingTransformer, instances, query
             traj_score, self.max_center_dist, query_boxes, nonquery_boxes, id_inds
         )
 
+        if self.max_center_dist is not None and self.max_center_dist > 0:
+            max_center_dist_traj_score = pd.DataFrame(
+                traj_score.clone().numpy(), columns=unique_ids.cpu().numpy()
+            )
+
+            max_center_dist_traj_score.index.name = "Current Frame Instances"
+            max_center_dist_traj_score.columns.name = "Unique IDs"
+
+            query_frame.add_traj_score("max_center_dist", max_center_dist_traj_score)
+
+        ################################################################################
+        scaled_traj_score = torch.softmax(traj_score, dim=1)
+        scaled_traj_score_df = pd.DataFrame(
+            scaled_traj_score.numpy(), columns=unique_ids.cpu().numpy()
+        )
+        scaled_traj_score_df.index.name = "Current Frame Instances"
+        scaled_traj_score_df.columns.name = "Unique IDs"
+
+        query_frame.add_traj_score("scaled", scaled_traj_score_df)
         ################################################################################
 
         match_i, match_j = linear_sum_assignment((-traj_score))
@@ -397,22 +452,32 @@ def _run_global_tracker(self, model: GlobalTrackingTransformer, instances, query
             thresh = (
                 overlap_thresh * id_inds[:, j].sum() if mult_thresh else overlap_thresh
             )
-            if traj_score[i, j] > thresh:
+            if n_traj >= self.max_tracks or traj_score[i, j] > thresh:
+                if self.verbose:
+                    print(
+                        f"Assigning instance {i} to track {j} with id {unique_ids[j]}"
+                    )
                 track_ids[i] = unique_ids[j]
-
+                query_frame.instances[i].track_score = scaled_traj_score[i, j].item()
+        if self.verbose:
+            print(f"track_ids: {track_ids}")
         for i in range(n_query):
             if track_ids[i] < 0:
-                track_ids[i] = id_count
-                id_count += 1
+                if self.verbose:
+                    print(f"Creating new track {n_traj}")
+                track_ids[i] = n_traj
+                n_traj += 1
 
-        instances[query_frame]["matches"] = (match_i, match_j)
-        instances[query_frame]["pred_track_ids"] = track_ids
-        instances[query_frame]["final_traj_score"] = pd.DataFrame(
-            deepcopy((traj_score).numpy()), columns=unique_ids.cpu().numpy()
-        )
-        instances[query_frame]["final_traj_score"].index.name = "Current Frame Instances"
-        instances[query_frame]["final_traj_score"].columns.name = "Unique IDs"
+        query_frame.matches = (match_i, match_j)
 
-        self.track_queue.append(instances[query_frame])
+        for instance, track_id in zip(query_frame.instances, track_ids):
+            instance.pred_track_id = track_id
+
+        final_traj_score = pd.DataFrame(
+            traj_score.clone().numpy(), columns=unique_ids.cpu().numpy()
+        )
+        final_traj_score.index.name = "Current Frame Instances"
+        final_traj_score.columns.name = "Unique IDs"
 
-        return instances[query_frame], id_count
+        query_frame.add_traj_score("final", final_traj_score)
+        return query_frame
diff --git a/biogtr/models/attention_head.py b/biogtr/models/attention_head.py
index 3e8d6a8..d562b62 100644
--- a/biogtr/models/attention_head.py
+++ b/biogtr/models/attention_head.py
@@ -72,7 +72,7 @@ def __init__(
         num_layers: int,
         dropout: float,
     ):
-        """Initializes an instance of ATTWeightHead.
+        """Initialize an instance of ATTWeightHead.
 
         Args:
             feature_dim: The dimensionality of input features.
@@ -89,7 +89,7 @@ def forward(
         query: torch.Tensor,
         key: torch.Tensor,
     ) -> torch.Tensor:
-        """Computes the attention weights of a query tensor using the key tensor.
+        """Compute the attention weights of a query tensor using the key tensor.
 
         Args:
             query: Input tensor of shape (batch_size, num_frame_instances, feature_dim).
diff --git a/biogtr/models/embedding.py b/biogtr/models/embedding.py
index ac3336c..364d4c8 100644
--- a/biogtr/models/embedding.py
+++ b/biogtr/models/embedding.py
@@ -17,12 +17,13 @@ def __init__(self):
         """Initialize embeddings."""
         super().__init__()
         # empty init for flexibility
-        pass
+        self.pos_lookup = None
+        self.temp_lookup = None
 
     def _torch_int_div(
         self, tensor1: torch.Tensor, tensor2: torch.Tensor
     ) -> torch.Tensor:
-        """Performs integer division of two tensors.
+        """Perform integer division of two tensors.
 
         Args:
             tensor1: dividend tensor.
@@ -42,7 +43,7 @@ def _sine_box_embedding(
         normalize: bool = False,
         **kwargs,
     ) -> torch.Tensor:
-        """Computes sine positional embeddings for boxes using given parameters.
+        """Compute sine positional embeddings for boxes using given parameters.
 
         Args:
             boxes: the input boxes.
@@ -104,7 +105,7 @@ def _learned_pos_embedding(
         over_boxes: bool = True,
         **kwargs,
     ) -> torch.Tensor:
-        """Computes learned positional embeddings for boxes using given parameters.
+        """Compute learned positional embeddings for boxes using given parameters.
 
         Args:
             boxes: the input boxes.
@@ -126,7 +127,11 @@ def _learned_pos_embedding(
         self.learn_pos_emb_num = params["learn_pos_emb_num"]
         self.over_boxes = params["over_boxes"]
 
-        pos_lookup = torch.nn.Embedding(self.learn_pos_emb_num * 4, self.features // 4)
+        if self.pos_lookup is None:
+            self.pos_lookup = torch.nn.Embedding(
+                self.learn_pos_emb_num * 4, self.features // 4
+            )
+        pos_lookup = self.pos_lookup
 
         N = boxes.shape[0]
         boxes = boxes.view(N, 4)
@@ -147,9 +152,15 @@ def _learned_pos_embedding(
             self.learn_pos_emb_num, 4, f
         )  # T x 4 x (D * 4)
 
-        pos_le = pos_emb_table.gather(0, l[:, :, None].to(pos_emb_table.device).expand(N, 4, f))  # N x 4 x d
-        pos_re = pos_emb_table.gather(0, r[:, :, None].to(pos_emb_table.device).expand(N, 4, f))  # N x 4 x d
-        pos_emb = lw[:, :, None] * pos_re.to(lw.device) + rw[:, :, None] * pos_le.to(rw.device)
+        pos_le = pos_emb_table.gather(
+            0, l[:, :, None].to(pos_emb_table.device).expand(N, 4, f)
+        )  # N x 4 x d
+        pos_re = pos_emb_table.gather(
+            0, r[:, :, None].to(pos_emb_table.device).expand(N, 4, f)
+        )  # N x 4 x d
+        pos_emb = lw[:, :, None] * pos_re.to(lw.device) + rw[:, :, None] * pos_le.to(
+            rw.device
+        )
 
         pos_emb = pos_emb.view(N, 4 * f)
 
@@ -162,7 +173,7 @@ def _learned_temp_embedding(
         learn_temp_emb_num: int = 16,
         **kwargs,
     ) -> torch.Tensor:
-        """Computes learned temporal embeddings for times using given parameters.
+        """Compute learned temporal embeddings for times using given parameters.
 
         Args:
             times: the input times.
@@ -181,8 +192,12 @@ def _learned_temp_embedding(
         self.features = params["features"]
         self.learn_temp_emb_num = params["learn_temp_emb_num"]
 
-        temp_lookup = torch.nn.Embedding(self.learn_temp_emb_num, self.features)
+        if self.temp_lookup is None:
+            self.temp_lookup = torch.nn.Embedding(
+                self.learn_temp_emb_num, self.features
+            )
 
+        temp_lookup = self.temp_lookup
         N = times.shape[0]
 
         l, r, lw, rw = self._compute_weights(times, self.learn_temp_emb_num)
@@ -197,7 +212,7 @@ def _learned_temp_embedding(
     def _compute_weights(
         self, data: torch.Tensor, learn_emb_num: int = 16
     ) -> Tuple[torch.Tensor, ...]:
-        """Computes left and right learned embedding weights.
+        """Compute left and right learned embedding weights.
 
         Args:
             data: the input data (e.g boxes or times).
diff --git a/biogtr/models/global_tracking_transformer.py b/biogtr/models/global_tracking_transformer.py
index 373d368..0743ce4 100644
--- a/biogtr/models/global_tracking_transformer.py
+++ b/biogtr/models/global_tracking_transformer.py
@@ -1,6 +1,8 @@
 """Module containing GTR model used for training."""
+
 from biogtr.models.transformer import Transformer
 from biogtr.models.visual_encoder import VisualEncoder
+from biogtr.data_structures import Frame
 from torch import nn
 
 # todo: do we want to handle params with configs already here?
@@ -97,33 +99,26 @@ def __init__(
             decoder_self_attn=decoder_self_attn,
         )
 
-    def forward(
-        self,
-        instances: list[dict],
-        all_instances: list[dict] = None,
-        query_frame: int = None,
-    ):
-        """Forward pass of GTR Model to get asso matrix.
+    def forward(self, frames: list[Frame], query_frame: int = None):
+        """Execute forward pass of GTR Model to get asso matrix.
 
         Args:
-            instances: List of dicts from chunk containing crops of objects + gt label info
-            all_instances: List of dicts containing crops of objects + gt label info. Used for stitching together full trajectory
+            frames: List of Frames from chunk containing crops of objects + gt label info
             query_frame: Frame index used as query for self attention. Only used in sliding inference where query frame is the last frame in the window.
 
         Returns:
             An N_T x N association matrix
         """
         # Extract feature representations with pre-trained encoder.
-        for frame in instances:
-            if (frame["num_detected"] > 0).item(): 
-                if "features" not in frame.keys() or frame['features'] == None or len(frame["features"]) == 0:
-                    z = self.visual_encoder(frame["crops"])
-                    frame["features"] = z
+        for frame in frames:
+            if frame.has_instances():
+                if not frame.has_features():
+                    crops = frame.get_crops()
+                    z = self.visual_encoder(crops)
+
+                    for i, z_i in enumerate(z):
+                        frame.instances[i].features = z_i
 
-        # Extract association matrix with transformer.
-        if self.transformer.return_embedding:
-            asso_preds, emb = self.transformer(instances, query_frame=query_frame)
-        else:
-            asso_preds = self.transformer(instances, query_frame=query_frame)
+        asso_preds, emb = self.transformer(frames, query_frame=query_frame)
 
-        return (asso_preds, emb) if self.transformer.return_embedding else asso_preds
+        return asso_preds, emb
diff --git a/biogtr/models/gtr_runner.py b/biogtr/models/gtr_runner.py
index 7fb32e3..e7e6a57 100644
--- a/biogtr/models/gtr_runner.py
+++ b/biogtr/models/gtr_runner.py
@@ -1,8 +1,7 @@
 """Module containing training, validation and inference logic."""
 
-from typing import Any, Optional
-from pytorch_lightning.utilities.types import STEP_OUTPUT
 import torch
+import gc
 from biogtr.inference.tracker import Tracker
 from biogtr.inference import metrics
 from biogtr.models.global_tracking_transformer import GlobalTrackingTransformer
@@ -24,8 +23,16 @@ def __init__(
         loss_cfg: dict = {},
         optimizer_cfg: dict = None,
         scheduler_cfg: dict = None,
-        metrics: dict[str,list[str]] = {"train": ["num_switches"], "val": ["num_switches"], "test": ["num_switches"]},
-        persistent_tracking: dict[str, bool] = {"train": False, "val": True, "test": True}
+        metrics: dict[str, list[str]] = {
+            "train": [],
+            "val": ["num_switches"],
+            "test": ["num_switches"],
+        },
+        persistent_tracking: dict[str, bool] = {
+            "train": False,
+            "val": True,
+            "test": True,
+        },
     ):
         """Initialize a lightning module for GTR.
 
@@ -36,9 +43,8 @@ def __init__(
             optimizer_cfg: hyper parameters used for optimizer.
                        Only used to overwrite `configure_optimizer`
             scheduler_cfg: hyperparameters for lr_scheduler used to overwrite `configure_optimizer
-            train_metrics: a list of metrics to be calculated during training
-            val_metrics: a list of metrics to be calculated during validation
-            test_metrics: a list of metrics to be calculated at test time
+            metrics: a dict containing the metrics to be computed during train, val, and test.
+            persistent_tracking: a dict containing whether to use persistent tracking during train, val and test inference.
         """
         super().__init__()
         self.save_hyperparameters()
@@ -52,8 +58,9 @@ def __init__(
 
         self.metrics = metrics
         self.persistent_tracking = persistent_tracking
+
     def forward(self, instances) -> torch.Tensor:
-        """The forward pass of the lightning module.
+        """Execute forward pass of the lightning module.
 
         Args:
             instances: a list of dicts where each dict is a frame with gt data
@@ -61,14 +68,15 @@ def forward(self, instances) -> torch.Tensor:
         Returns:
             An association matrix between objects
         """
-        if sum([frame['num_detected'] for frame in instances]) > 0:
-            return self.model(instances)
+        if sum([frame.num_detected for frame in instances]) > 0:
+            asso_preds, _ = self.model(instances)
+            return asso_preds
         return None
 
     def training_step(
         self, train_batch: list[dict], batch_idx: int
     ) -> dict[str, float]:
-        """Method outlining the training procedure for model.
+        """Execute single training step for model.
 
         Args:
             train_batch: A single batch from the dataset which is a list of dicts
@@ -79,14 +87,14 @@ def training_step(
             A dict containing the train loss plus any other metrics specified
         """
         result = self._shared_eval_step(train_batch[0], mode="train")
-        self.log_metrics(result, "train")
-                         
+        self.log_metrics(result, len(train_batch[0]), "train")
+
         return result
 
     def validation_step(
         self, val_batch: list[dict], batch_idx: int
     ) -> dict[str, float]:
-        """Method outlining the val procedure for model.
+        """Execute single val step for model.
 
         Args:
             val_batch: A single batch from the dataset which is a list of dicts
@@ -96,13 +104,13 @@ def validation_step(
         Returns:
             A dict containing the val loss plus any other metrics specified
         """
-        result = self._shared_eval_step(val_batch[0], mode = "val")
-        self.log_metrics(result, "val")
-        
+        result = self._shared_eval_step(val_batch[0], mode="val")
+        self.log_metrics(result, len(val_batch[0]), "val")
+
         return result
 
     def test_step(self, test_batch: list[dict], batch_idx: int) -> dict[str, float]:
-        """Method outlining the test procedure for model.
+        """Execute single test step for model.
 
         Args:
             val_batch: A single batch from the dataset which is a list of dicts
@@ -113,12 +121,12 @@ def test_step(self, test_batch: list[dict], batch_idx: int) -> dict[str, float]:
             A dict containing the val loss plus any other metrics specified
         """
         result = self._shared_eval_step(test_batch[0], mode="test")
-        self.log_metrics(result, "test")
-        
+        self.log_metrics(result, len(test_batch[0]), "test")
+
         return result
 
     def predict_step(self, batch: list[dict], batch_idx: int) -> dict:
-        """Method describing inference for model.
+        """Run inference for model.
 
         Computes association + assignment.
 
@@ -135,7 +143,7 @@ def predict_step(self, batch: list[dict], batch_idx: int) -> dict:
         return instances_pred
 
     def _shared_eval_step(self, instances, mode):
-        """Helper function for running evaluation used by train, test, and val steps.
+        """Run evaluation used by train, test, and val steps.
 
         Args:
             instances: A list of dicts where each dict is a frame containing gt data
@@ -145,12 +153,11 @@ def _shared_eval_step(self, instances, mode):
             a dict containing the loss and any other metrics specified by `eval_metrics`
         """
         try:
+            instances = [frame for frame in instances if frame.has_instances()]
             eval_metrics = self.metrics[mode]
             persistent_tracking = self.persistent_tracking[mode]
-            if self.model.transformer.return_embedding:
-                logits, _ = self(instances)
-            else:
-                logits = self(instances)
+
+            logits = self(instances)
 
             if not logits:
                 return None
@@ -164,9 +171,13 @@ def _shared_eval_step(self, instances, mode):
                 instances_mm = metrics.to_track_eval(instances_pred)
                 clearmot = metrics.get_pymotmetrics(instances_mm, eval_metrics)
                 return_metrics.update(clearmot.to_dict())
+            return_metrics["batch_size"] = len(instances)
         except Exception as e:
-            print(f'Failed on frame {instances[0]["frame_id"]} of video {instances[0]["video_id"]}')
-            raise(e)
+            print(
+                f"Failed on frame {instances[0].frame_id} of video {instances[0].video_id}"
+            )
+            raise (e)
+
         return return_metrics
 
     def configure_optimizers(self) -> dict:
@@ -199,8 +210,26 @@ def configure_optimizers(self) -> dict:
                 "frequency": 10,
             },
         }
-    
-    def log_metrics(self, result, mode):
+
+    def log_metrics(self, result: dict, batch_size: int, mode: str) -> None:
+        """Log metrics computed during evaluation.
+
+        Args:
+            result: A dict containing metrics to be logged.
+            batch_size: the size of the batch used to compute the metrics
+            mode: One of {'train', 'test' or 'val'}. Used as prefix while logging.
+        """
         if result:
+            batch_size = result.pop("batch_size")
             for metric, val in result.items():
-                self.log(f"{mode}_{metric}", val, on_step = True, on_epoch=True)
+                if isinstance(val, torch.Tensor):
+                    val = val.item()
+                self.log(f"{mode}_{metric}", val, batch_size=batch_size)
+
+    def on_validation_epoch_end(self):
+        """Execute hook for validation end.
+
+        Currently, we simply clear the gpu cache and do garbage collection.
+        """
+        gc.collect()
+        torch.cuda.empty_cache()
diff --git a/biogtr/models/model_utils.py b/biogtr/models/model_utils.py
index b4457ac..b68fcad 100644
--- a/biogtr/models/model_utils.py
+++ b/biogtr/models/model_utils.py
@@ -1,15 +1,17 @@
 """Module containing model helper functions."""
+
 from copy import deepcopy
-from typing import Dict, List, Tuple, Iterable
+from typing import List, Tuple, Iterable
 from pytorch_lightning import loggers
+from biogtr.data_structures import Frame
 import torch
 
 
-def get_boxes_times(instances: List[Dict]) -> Tuple[torch.Tensor, torch.Tensor]:
-    """Extracts the bounding boxes and frame indices from the input list of instances.
+def get_boxes_times(frames: List[Frame]) -> Tuple[torch.Tensor, torch.Tensor]:
+    """Extract the bounding boxes and frame indices from the input list of instances.
 
     Args:
-        instances (List[Dict]): List of instance dictionaries
+        frames (List[Frame]): List of frame objects containing metadata and instances.
 
     Returns:
         Tuple[torch.Tensor, torch.Tensor]: A tuple of two tensors containing the
@@ -17,10 +19,10 @@ def get_boxes_times(instances: List[Dict]) -> Tuple[torch.Tensor, torch.Tensor]:
                                             indices, respectively.
     """
     boxes, times = [], []
-    _, h, w = instances[0]["img_shape"].flatten()
+    _, h, w = frames[0].img_shape.flatten()
 
-    for fidx, instance in enumerate(instances):
-        bbox = deepcopy(instance["bboxes"])
+    for fidx, frame in enumerate(frames):
+        bbox = deepcopy(frame.get_bboxes())
         bbox[:, [0, 2]] /= w
         bbox[:, [1, 3]] /= h
 
@@ -33,7 +35,7 @@ def get_boxes_times(instances: List[Dict]) -> Tuple[torch.Tensor, torch.Tensor]:
 
 
 def softmax_asso(asso_output: list[torch.Tensor]) -> list[torch.Tensor]:
-    """Applies the softmax activation function on asso_output.
+    """Apply the softmax activation function on asso_output.
 
     Args:
         asso_output: Raw logits output of the tracking transformer. A list of
@@ -132,18 +134,19 @@ def init_scheduler(optimizer: torch.optim.Optimizer, config: dict):
     return scheduler_class(optimizer, **scheduler_params)
 
 
-def init_logger(config: dict):
+def init_logger(logger_params: dict, config: dict = None):
     """Initialize logger based on config parameters.
 
     Allows more flexibility in choosing which logger to use.
 
     Args:
-        config: logger hyperparameters
+        logger_params: logger hyperparameters
+        config: rest of hyperparameters to log (mostly used for WandB)
 
     Returns:
         logger: A logger with specified params (or None).
     """
-    logger_type = config.pop("logger_type", None)
+    logger_type = logger_params.pop("logger_type", None)
 
     valid_loggers = [
         "CSVLogger",
@@ -153,10 +156,16 @@ def init_logger(config: dict):
 
     if logger_type in valid_loggers:
         logger_class = getattr(loggers, logger_type)
-        try:
-            return logger_class(**config)
-        except Exception as e:
-            print(e, logger_type)
+        if logger_class == loggers.WandbLogger:
+            try:
+                return logger_class(config=config, **logger_params)
+            except Exception as e:
+                print(e, logger_type)
+        else:
+            try:
+                return logger_class(**logger_params)
+            except Exception as e:
+                print(e, logger_type)
     else:
         print(
             f"{logger_type} not one of {valid_loggers} or set to None, skipping logging"
diff --git a/biogtr/models/transformer.py b/biogtr/models/transformer.py
index 91274dd..dec1fc3 100644
--- a/biogtr/models/transformer.py
+++ b/biogtr/models/transformer.py
@@ -11,12 +11,11 @@
     * added fixed embeddings over boxes
 """
 
-
+from biogtr.data_structures import Frame
 from biogtr.models.attention_head import ATTWeightHead
 from biogtr.models.embedding import Embedding
 from biogtr.models.model_utils import get_boxes_times
 from torch import nn
-from typing import Dict, List, Tuple
 import copy
 import torch
 import torch.nn.functional as F
@@ -163,11 +162,11 @@ def _reset_parameters(self):
             if p.dim() > 1:
                 nn.init.xavier_uniform_(p)
 
-    def forward(self, instances, query_frame=None):
-        """A forward pass through the transformer and attention head.
+    def forward(self, frames: list[Frame], query_frame: int = None):
+        """Execute a forward pass through the transformer and attention head.
 
         Args:
-            instances: A list of dictionaries, one dictionary for each frame
+            frames: A list of Frames (See `biogtr.data_structures.Frame for more info.)
             query_frame: An integer (k) specifying the frame within the window to be queried.
 
         Returns:
@@ -175,33 +174,25 @@ def forward(self, instances, query_frame=None):
                 L: number of decoder blocks
                 n_query: number of instances in current query/frame
                 total_instances: number of instances in window
-
-        # ------------------------- An example of instances ------------------------ #
-        instances = [
-            {
-                # Each dictionary is a frame.
-                "frame_id": frame index int,
-                "num_detected": N_i, # num of detected instances in i-th frame
-                "bboxes": (N_i, 4),  # in pascal_voc unrounded unnormalized
-                "features": (N_i, embed_dim), # embed_dim = embedding dimension
-                ...
-            },
-            ...
-        ]
         """
-        reid_features = torch.cat(
-            [frame["features"] for frame in instances], dim=0
-        ).unsqueeze(0)
-
-        window_length = len(instances)
-        instances_per_frame = [frame["num_detected"] for frame in instances]
+        try:
+            reid_features = torch.cat(
+                [frame.get_features() for frame in frames], dim=0
+            ).unsqueeze(0)
+        except Exception as e:
+            print([[f.device for f in frame.get_features()] for frame in frames])
+            raise (e)
+
+        window_length = len(frames)
+        instances_per_frame = [frame.num_detected for frame in frames]
         total_instances = sum(instances_per_frame)
         embed_dim = reid_features.shape[-1]
 
+        # print(f'T: {window_length}; N: {total_instances}; N_t: {instances_per_frame} n_reid: {reid_features.shape}')
         if self.embedding_meta:
             kwargs = self.embedding_meta.get("kwargs", {})
 
-            pred_box, pred_time = get_boxes_times(instances)  # total_instances x 4
+            pred_box, pred_time = get_boxes_times(frames)  # total_instances x 4
 
             embedding_type = self.embedding_meta["embedding_type"]
 
@@ -227,21 +218,32 @@ def forward(self, instances, query_frame=None):
                 pos_emb = (pos_emb + temp_emb) / 2.0
 
             pos_emb = pos_emb.view(1, total_instances, embed_dim)
-            pos_emb = pos_emb.permute(1, 0, 2)  # (total_instances, batch_size, embed_dim)
+            pos_emb = pos_emb.permute(
+                1, 0, 2
+            )  # (total_instances, batch_size, embed_dim)
         else:
             pos_emb = None
 
         query_inds = None
         n_query = total_instances
         if query_frame is not None:
-            
-            query_inds = [x for x in range(sum(instances_per_frame[:query_frame]), sum(instances_per_frame[: query_frame + 1]))]
+            query_inds = [
+                x
+                for x in range(
+                    sum(instances_per_frame[:query_frame]),
+                    sum(instances_per_frame[: query_frame + 1]),
+                )
+            ]
             n_query = len(query_inds)
 
         batch_size, total_instances, embed_dim = reid_features.shape
-        reid_features = reid_features.permute(1, 0, 2)  # (total_instances x batch_size x embed_dim)
+        reid_features = reid_features.permute(
+            1, 0, 2
+        )  # (total_instances x batch_size x embed_dim)
 
-        memory = self.encoder(reid_features, pos_emb=pos_emb)  # (total_instances, batch_size, embed_dim)
+        memory = self.encoder(
+            reid_features, pos_emb=pos_emb
+        )  # (total_instances, batch_size, embed_dim)
 
         if query_inds is not None:
             tgt = reid_features[query_inds]
@@ -260,7 +262,9 @@ def forward(self, instances, query_frame=None):
         )  # (L, n_query, batch_size, embed_dim)
 
         feats = hs.transpose(1, 2)  # # (L, batch_size, n_query, embed_dim)
-        memory = memory.permute(1, 0, 2).view(batch_size, total_instances, embed_dim)  # (batch_size, total_instances, embed_dim)
+        memory = memory.permute(1, 0, 2).view(
+            batch_size, total_instances, embed_dim
+        )  # (batch_size, total_instances, embed_dim)
 
         asso_output = []
         for x in feats:
@@ -269,7 +273,7 @@ def forward(self, instances, query_frame=None):
             asso_output.append(self.attn_head(x, memory).view(n_query, total_instances))
 
         # (L=1, n_query, total_instances)
-        return (asso_output, pos_emb) if self.return_embedding else asso_output
+        return (asso_output, pos_emb) if self.return_embedding else (asso_output, None)
 
 
 class TransformerEncoder(nn.Module):
@@ -292,7 +296,7 @@ def __init__(
         self.norm = norm
 
     def forward(self, src: torch.Tensor, pos_emb: torch.Tensor = None) -> torch.Tensor:
-        """Forward pass of encoder layer.
+        """Execute a forward pass of encoder layer.
 
         Args:
             src: The input tensor of shape (n_query, batch_size, embed_dim).
@@ -339,7 +343,7 @@ def __init__(
     def forward(
         self, tgt: torch.Tensor, memory: torch.Tensor, pos_emb=None, tgt_pos_emb=None
     ):
-        """Forward pass of the decoder block.
+        """Execute a forward pass of the decoder block.
 
         Args:
             tgt: Target sequence for decoder to generate (n_query, batch_size, embed_dim).
@@ -414,7 +418,7 @@ def __init__(
         self.activation = _get_activation_fn(activation)
 
     def forward(self, src: torch.Tensor, pos: torch.Tensor = None):
-        """Forward pass of the encoder layer.
+        """Execute a forward pass of the encoder layer.
 
         Args:
             src: Input sequence for encoder (n_query, batch_size, embed_dim).
@@ -488,7 +492,7 @@ def __init__(
         self.activation = _get_activation_fn(activation)
 
     def forward(self, tgt, memory, pos=None, tgt_pos=None):
-        """Forward pass of decoder layer.
+        """Execute forward pass of decoder layer.
 
         Args:
             tgt: Target sequence for decoder to generate (n_query, batch_size, embed_dim).
diff --git a/biogtr/training/configs/base.yaml b/biogtr/training/configs/base.yaml
index 5088b1c..f7069f4 100644
--- a/biogtr/training/configs/base.yaml
+++ b/biogtr/training/configs/base.yaml
@@ -55,30 +55,35 @@ tracker:
   max_center_dist: null
 
 runner:
-  train_metrics: [""]
-  val_metrics: ["sw_cnt"]
-  test_metrics: ["sw_cnt"]
-
+  metrics:
+      train: ['num_switches']
+      val: ['num_switches']
+      test: ['num_switches']
+  persistent_tracking:
+      train: false
+      val: true
+      test: true
+      
 dataset:
   train_dataset:
-    slp_files: ['190612_110405_wt_18159111_rig2.2@11730.slp']
-    video_files: ['190612_110405_wt_18159111_rig2.2@11730.mp4']
+    slp_files: ["../../tests/data/sleap/two_flies.slp"]
+    video_files: ["../../tests/data/sleap/two_flies.mp4"]
     padding: 5
     crop_size: 128
     chunk: true
     clip_length: 32
 
   val_dataset:
-    slp_files: ['190612_110405_wt_18159111_rig2.2@11730.slp']
-    video_files: ['190612_110405_wt_18159111_rig2.2@11730.mp4']
+    slp_files: ["../../tests/data/sleap/two_flies.slp"]
+    video_files: ["../../tests/data/sleap/two_flies.mp4"]
     padding: 5
     crop_size: 128 
     chunk: True
     clip_length: 32
 
   test_dataset:
-    slp_files: ['190612_110405_wt_18159111_rig2.2@11730.slp']
-    video_files: ['190612_110405_wt_18159111_rig2.2@11730.mp4']
+    slp_files: ["../../tests/data/sleap/two_flies.slp"]
+    video_files: ["../../tests/data/sleap/two_flies.mp4"]
     padding: 5
     crop_size: 128 
     chunk: True
@@ -96,6 +101,7 @@ dataloader:
     num_workers: 0
 
 logging:
+  logger_type: null
   name: "example_train"
   entity: null
   job_type: "train"
@@ -116,7 +122,7 @@ early_stopping:
   divergence_threshold: 30
 
 checkpointing:
-  monitor: ["val_loss","val_sw_cnt"]
+  monitor: ["val_loss","val_num_switches"]
   verbose: true
   save_last: true
   dirpath: null
@@ -133,3 +139,8 @@ trainer:
   log_every_n_steps: 1
   max_epochs: 100
   min_epochs: 10
+
+view_batch:
+  enable: False
+  num_frames: 0
+  no_train: False
diff --git a/biogtr/training/losses.py b/biogtr/training/losses.py
index 5990949..557b78e 100644
--- a/biogtr/training/losses.py
+++ b/biogtr/training/losses.py
@@ -1,4 +1,6 @@
 """Module containing different loss functions to be optimized."""
+
+from biogtr.data_structures import Frame
 from biogtr.models.model_utils import get_boxes_times
 from torch import nn
 from typing import List, Tuple
@@ -33,23 +35,23 @@ def __init__(
         self.asso_weight = asso_weight
 
     def forward(
-        self, asso_preds: List[torch.Tensor], instances: List[dict]
+        self, asso_preds: List[torch.Tensor], frames: List[Frame]
     ) -> torch.Tensor:
         """Calculate association loss.
 
         Args:
             asso_preds: a list containing the association matrix at each frame
-            instances: a list of dictionaries for each frame containing gt labels.
+            frames: a list of Frames containing gt labels.
 
         Returns:
             the association loss between predicted association and actual
         """
         # get number of detected objects and ground truth ids
-        n_t = [frame["num_detected"] for frame in instances]
-        target_inst_id = torch.cat([frame["gt_track_ids"] for frame in instances])
+        n_t = [frame.num_detected for frame in frames]
+        target_inst_id = torch.cat([frame.get_gt_track_ids() for frame in frames])
 
         # for now set equal since detections are fixed
-        pred_box, pred_time = get_boxes_times(instances)
+        pred_box, pred_time = get_boxes_times(frames)
         target_box, target_time = pred_box, pred_time
 
         # todo: we should maybe reconsider how we label gt instances. The second
diff --git a/biogtr/training/train.py b/biogtr/training/train.py
index 2537f7c..56a2d81 100644
--- a/biogtr/training/train.py
+++ b/biogtr/training/train.py
@@ -2,6 +2,7 @@
 
 Used for training a single model or deploying a batch train job on RUNAI CLI
 """
+
 from biogtr.config import Config
 from biogtr.datasets.tracking_dataset import TrackingDataset
 from biogtr.datasets.data_utils import view_training_batch
@@ -15,7 +16,7 @@
 import torch
 import torch.multiprocessing
 
-#device = "cuda" if torch.cuda.is_available() else "cpu"
+# device = "cuda" if torch.cuda.is_available() else "cpu"
 
 # useful for longer training runs, but not for single iteration debugging
 # finds optimal hardware algs which has upfront time increase for first
@@ -24,12 +25,12 @@
 # torch.backends.cudnn.benchmark = True
 
 # pytorch 2 logic - we set our device once here so we don't have to keep setting
-#torch.set_default_device(device)
+# torch.set_default_device(device)
 
 
 @hydra.main(config_path="configs", config_name=None, version_base=None)
 def main(cfg: DictConfig):
-    """Main function for training.
+    """Train model based on config.
 
     Handles all config parsing and initialization then calls `trainer.train()`.
     If `batch_config` is included then run will be assumed to be a batch job.
@@ -37,15 +38,17 @@ def main(cfg: DictConfig):
     Args:
         cfg: The config dict parsed by `hydra`
     """
+    torch.set_float32_matmul_precision("medium")
     train_cfg = Config(cfg)
 
     # update with parameters for batch train job
     if "batch_config" in cfg.keys():
-        
         try:
             index = int(os.environ["POD_INDEX"])
         except KeyError as e:
-            index = int(input("No pod index found, assuming single run!\nPlease input task index to run:"))
+            index = int(
+                input(f"{e}. Assuming single run!\nPlease input task index to run:")
+            )
 
         hparams_df = pd.read_csv(cfg.batch_config)
         hparams = hparams_df.iloc[index].to_dict()
@@ -78,7 +81,7 @@ def main(cfg: DictConfig):
         if cfg.view_batch.no_train:
             return
 
-    model = train_cfg.get_gtr_runner()
+    model = train_cfg.get_gtr_runner()  # TODO see if we can use torch.compile()
 
     logger = train_cfg.get_logger()
 
diff --git a/biogtr/visualize.py b/biogtr/visualize.py
index 1f497bc..bafcf14 100644
--- a/biogtr/visualize.py
+++ b/biogtr/visualize.py
@@ -1,10 +1,9 @@
 """Helper functions for visualizing tracking."""
+
 from scipy.interpolate import interp1d
 from copy import deepcopy
 from tqdm import tqdm
-from matplotlib import pyplot as plt
 from omegaconf import DictConfig
-from tqdm import tqdm
 
 import seaborn as sns
 import imageio
@@ -12,14 +11,14 @@
 import pandas as pd
 import numpy as np
 import cv2
-import imageio
+from matplotlib import pyplot
+import gc
 
-
-palette = sns.color_palette("tab10")
+palette = sns.color_palette("tab20")
 
 
 def fill_missing(data: np.ndarray, kind: str = "linear") -> np.ndarray:
-    """Fills missing values independently along each dimension after the first.
+    """Fill missing values independently along each dimension after the first.
 
     Args:
         data: the array for which to fill missing value
@@ -64,13 +63,15 @@ def annotate_video(
     labels: pd.DataFrame,
     key: str,
     color_palette=palette,
-    trails: bool = True,
-    boxes: int = 64,
+    trails: int = 2,
+    boxes: int = (64, 64),
     names: bool = True,
-    centroids: bool = True,
+    track_scores=0.5,
+    centroids: int = 4,
     poses=False,
-    save_path: str = "debug_animal",
-    fps: int = 30
+    save_path: str = "debug_animal.mp4",
+    fps: int = 30,
+    alpha=0.2,
 ) -> list:
     """Annotate video frames with labels.
 
@@ -90,20 +91,19 @@ def annotate_video(
     Returns:
         A list of annotated video frames
     """
-    
     writer = imageio.get_writer(save_path, fps=fps)
     color_palette = deepcopy(color_palette)
-    annotated_frames = []
 
     if trails:
         track_trails = {}
     try:
-        for i in tqdm(sorted(labels["Frame"].unique()), desc = 'Frame', unit='Frame'):
+        for i in tqdm(sorted(labels["Frame"].unique()), desc="Frame", unit="Frame"):
             frame = video.get_data(i)
             if frame.shape[0] == 1 or frame.shape[-1] == 1:
-                frame = cv2.cvtColor((frame * 255).astype(np.uint8), cv2.COLOR_GRAY2RGB)
-            else:
-                frame = (frame * 255).astype(np.uint8).copy()
+                frame = cv2.cvtColor(frame, cv2.COLOR_GRAY2RGB)
+            # else:
+            #     frame = frame.copy()
+
             lf = labels[labels["Frame"] == i]
             for idx, instance in lf.iterrows():
                 if not trails:
@@ -152,15 +152,19 @@ def annotate_video(
 
                             frame = cv2.line(frame, source, target, track_color, 1)
 
-                if (boxes is not None and boxes > 0) or centroids:
+                if (boxes) or centroids:
                     # Get coordinates for detected objects in the current frame.
+                    if isinstance(boxes, int):
+                        boxes = (boxes, boxes)
+
+                    box_w, box_h = boxes
                     x = instance["X"]
                     y = instance["Y"]
                     min_x, min_y, max_x, max_y = (
-                        int(x - boxes / 2),
-                        int(y - boxes / 2),
-                        int(x + boxes / 2),
-                        int(y + boxes / 2),
+                        int(x - box_w / 2),
+                        int(y - box_h / 2),
+                        int(x + box_w / 2),
+                        int(y + box_h / 2),
                     )
                     midpt = (int(x), int(y))
 
@@ -169,6 +173,11 @@ def annotate_video(
                     # assert idx < len(instance[key])
                     pred_track_id = instance[key]
 
+                    if "Track_score" in instance.index:
+                        track_score = instance["Track_score"]
+                    else:
+                        track_scores = 0
+
                     # Add midpt to track trail.
                     if pred_track_id not in list(track_trails.keys()):
                         track_trails[pred_track_id] = []
@@ -185,7 +194,7 @@ def annotate_video(
                     # print(instance[key])
 
                     # Bbox.
-                    if boxes is not None and boxes > 0:
+                    if boxes is not None:
                         frame = cv2.rectangle(
                             frame,
                             (min_x, min_y),
@@ -197,30 +206,50 @@ def annotate_video(
                     # Track trail.
                     if centroids:
                         frame = cv2.circle(
-                            frame, midpt, radius=4, color=track_color, thickness=-1
+                            frame,
+                            midpt,
+                            radius=centroids,
+                            color=track_color,
+                            thickness=-1,
                         )
                         for i in range(0, len(track_trails[pred_track_id]) - 1):
-                            frame = cv2.circle(
+                            frame = cv2.addWeighted(
+                                cv2.circle(
+                                    frame,  # .copy(),
+                                    track_trails[pred_track_id][i],
+                                    radius=4,
+                                    color=track_color,
+                                    thickness=-1,
+                                ),
+                                alpha,
                                 frame,
-                                track_trails[pred_track_id][i],
-                                radius=4,
-                                color=track_color,
-                                thickness=-1,
-                            )
-                            frame = cv2.line(
-                                frame,
-                                track_trails[pred_track_id][i],
-                                track_trails[pred_track_id][i + 1],
-                                color=track_color,
-                                thickness=2,
+                                1 - alpha,
+                                0,
                             )
+                            if trails:
+                                frame = cv2.line(
+                                    frame,
+                                    track_trails[pred_track_id][i],
+                                    track_trails[pred_track_id][i + 1],
+                                    color=track_color,
+                                    thickness=trails,
+                                )
 
                 # Track name.
+                name_str = ""
+
                 if names:
+                    name_str += f"track_{pred_track_id}"
+                if names and track_scores:
+                    name_str += " | "
+                if track_scores:
+                    name_str += f"score: {track_score:0.3f}"
+
+                if len(name_str) > 0:
                     frame = cv2.putText(
                         frame,
                         # f"idx:{idx} | track_{pred_track_id}",
-                        f"track_{pred_track_id}",
+                        name_str,
                         org=(int(min_x), max(0, int(min_y) - 10)),
                         fontFace=cv2.FONT_HERSHEY_SIMPLEX,
                         fontScale=0.9,
@@ -228,12 +257,14 @@ def annotate_video(
                         thickness=2,
                     )
             writer.append_data(frame)
-            
+            # if i % fps == 0:
+            #     gc.collect()
+
     except Exception as e:
         writer.close()
         print(e)
         return False
-    
+
     writer.close()
     return True
 
@@ -284,10 +315,7 @@ def bold(val: float, thresh: float = 0.01) -> str:
 
 @hydra.main(config_path=None, config_name=None, version_base=None)
 def main(cfg: DictConfig):
-    """Main function for visualizations script.
-
-    Takes in a path to a video + labels file, annotates a video and saves it to the specified path
-    """
+    """Take in a path to a video + labels file, annotates a video and saves it to the specified path."""
     labels = pd.read_csv(cfg.labels_path)
     video = imageio.get_reader(cfg.vid_path, "ffmpeg")
     annotated_frames = annotate_video(video, labels, **cfg.annotate)
diff --git a/environment.yml b/environment.yml
index 85696a7..3637a7e 100644
--- a/environment.yml
+++ b/environment.yml
@@ -8,8 +8,8 @@ channels:
 
 dependencies:
   - python=3.9
-  - pytorch-cuda=11.8
-  - cudatoolkit=11.8
+  - pytorch-cuda=12.1
+  - conda-forge::opencv <4.9.0
   - cudnn
   - pytorch
   - torchvision
diff --git a/environment_cpu.yml b/environment_cpu.yml
index 1b2a684..8e22da3 100644
--- a/environment_cpu.yml
+++ b/environment_cpu.yml
@@ -8,6 +8,7 @@ channels:
 
 dependencies:
   - python=3.9
+  - conda-forge::opencv <4.9.0
   - pytorch
   - cpuonly
   - torchvision
diff --git a/tests/configs/base.yaml b/tests/configs/base.yaml
index ad78b82..f8cc842 100644
--- a/tests/configs/base.yaml
+++ b/tests/configs/base.yaml
@@ -55,14 +55,16 @@ tracker:
   max_center_dist: null
 
 runner:
-  train_metrics: [""]
-  val_metrics: ["sw_cnt"]
-  test_metrics: ["sw_cnt"]
+  metrics: 
+    train: [""]
+    val: ["sw_cnt"]
+    test: ["sw_cnt"]
 
 dataset:
   train_dataset:
     slp_files: ['tests/data/sleap/two_flies.slp']
     video_files: ['tests/data/sleap/two_flies.mp4']
+    anchor: "thorax"
     padding: 5
     crop_size: 128
     chunk: true
@@ -71,6 +73,7 @@ dataset:
   val_dataset:
     slp_files: ['tests/data/sleap/two_flies.slp']
     video_files: ['tests/data/sleap/two_flies.mp4']
+    anchor: "thorax"
     padding: 5
     crop_size: 128
     chunk: True
@@ -79,6 +82,7 @@ dataset:
   test_dataset:
     slp_files: ['tests/data/sleap/two_flies.slp']
     video_files: ['tests/data/sleap/two_flies.mp4']
+    anchor: "thorax"
     padding: 5
     crop_size: 128
     chunk: True
diff --git a/tests/conftest.py b/tests/conftest.py
index 434bb56..bf6e649 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -1,4 +1,5 @@
 """Config for pytests."""
+
 from tests.fixtures.configs import *
 from tests.fixtures.datasets import *
 from tests.fixtures.torch import *
diff --git a/tests/fixtures/configs.py b/tests/fixtures/configs.py
index 0f3a1c4..3cf0684 100644
--- a/tests/fixtures/configs.py
+++ b/tests/fixtures/configs.py
@@ -1,18 +1,22 @@
+"""Test config paths."""
+
 import os
 import pytest
 
 
 @pytest.fixture
 def config_dir(pytestconfig):
-    """Dir path to sleap data."""
+    """Get the dir path to configs."""
     return os.path.join(pytestconfig.rootdir, "tests/configs")
 
 
 @pytest.fixture
 def base_config(config_dir):
+    """Get the full path to base config."""
     return os.path.join(config_dir, "base.yaml")
 
 
 @pytest.fixture
 def params_config(config_dir):
+    """Get the full path to the supplementary params config."""
     return os.path.join(config_dir, "params.yaml")
diff --git a/tests/fixtures/datasets.py b/tests/fixtures/datasets.py
index 572aa09..db57409 100644
--- a/tests/fixtures/datasets.py
+++ b/tests/fixtures/datasets.py
@@ -1,4 +1,5 @@
 """Fixtures for testing biogtr."""
+
 import pytest
 from pathlib import Path
 
diff --git a/tests/fixtures/torch.py b/tests/fixtures/torch.py
index 0ea1444..9bd6d79 100644
--- a/tests/fixtures/torch.py
+++ b/tests/fixtures/torch.py
@@ -1,7 +1,9 @@
 """
-Commenting this file out for now. 
+Commenting this file out for now.
+
 For some reason it screws up `test_training` by causing a device error
 """
+
 # import pytest
 # import torch
 
diff --git a/tests/test_data_structures.py b/tests/test_data_structures.py
new file mode 100644
index 0000000..31f249b
--- /dev/null
+++ b/tests/test_data_structures.py
@@ -0,0 +1,205 @@
+"""Tests for Instance, Frame, and TrackQueue Object"""
+
+from biogtr.data_structures import Instance, Frame
+from biogtr.inference.track_queue import TrackQueue
+import torch
+
+
+def test_instance():
+    """Test Instance object logic."""
+
+    gt_track_id = 0
+    pred_track_id = 0
+    bbox = torch.randn((1, 4))
+    crop = torch.randn((1, 3, 128, 128))
+    features = torch.randn((1, 64))
+
+    instance = Instance(
+        gt_track_id=gt_track_id,
+        pred_track_id=pred_track_id,
+        bbox=bbox,
+        crop=crop,
+        features=features,
+    )
+
+    assert instance.has_gt_track_id()
+    assert instance.gt_track_id.item() == gt_track_id
+    assert instance.has_pred_track_id()
+    assert instance.pred_track_id.item() == pred_track_id
+    assert instance.has_bbox()
+    assert torch.equal(instance.bbox, bbox)
+    assert instance.has_features()
+    assert torch.equal(instance.features, features)
+
+    instance.gt_track_id = 1
+    instance.pred_track_id = 1
+    instance.bbox = torch.randn((1, 4))
+    instance.crop = torch.randn((1, 3, 128, 128))
+    instance.features = torch.randn((1, 64))
+
+    assert instance.has_gt_track_id()
+    assert instance.gt_track_id.item() != gt_track_id
+    assert instance.has_pred_track_id()
+    assert instance.pred_track_id.item() != pred_track_id
+    assert instance.has_bbox()
+    assert not torch.equal(instance.bbox, bbox)
+    assert instance.has_features()
+    assert not torch.equal(instance.features, features)
+
+    instance.gt_track_id = None
+    instance.pred_track_id = -1
+    instance.bbox = None
+    instance.crop = None
+    instance.features = None
+
+    assert not instance.has_gt_track_id()
+    assert instance.gt_track_id.shape[0] == 0
+    assert not instance.has_pred_track_id()
+    assert instance.pred_track_id.item() != pred_track_id
+    assert not instance.has_bbox()
+    assert not torch.equal(instance.bbox, bbox)
+    assert not instance.has_features()
+    assert not torch.equal(instance.features, features)
+
+
+def test_frame():
+    n_detected = 2
+    n_traj = 3
+    video_id = 0
+    frame_id = 0
+    img_shape = torch.tensor([3, 1024, 1024])
+    asso_output = torch.randn(n_detected, 16)
+    traj_score = torch.randn(n_detected, n_traj)
+    matches = ([0, 1], [0, 1])
+
+    instances = []
+    for i in range(n_detected):
+        instances.append(
+            Instance(
+                gt_track_id=i,
+                pred_track_id=i,
+                bbox=torch.randn(1, 4),
+                crop=torch.randn(1, 3, 64, 64),
+                features=torch.randn(1, 64),
+            )
+        )
+    frame = Frame(
+        video_id=video_id, frame_id=frame_id, img_shape=img_shape, instances=instances
+    )
+
+    assert frame.video_id.item() == video_id
+    assert frame.frame_id.item() == frame_id
+    assert torch.equal(frame.img_shape, img_shape)
+    assert frame.num_detected == n_detected
+    assert frame.has_instances()
+    assert len(frame.instances) == n_detected
+    assert frame.has_gt_track_ids()
+    assert len(frame.get_gt_track_ids()) == n_detected
+    assert frame.has_pred_track_ids()
+    assert len(frame.get_pred_track_ids()) == n_detected
+    assert not frame.has_matches()
+    assert not frame.has_asso_output()
+    assert not frame.has_traj_score()
+
+    frame.asso_output = asso_output
+    frame.add_traj_score("initial", traj_score)
+    frame.matches = matches
+
+    assert frame.has_matches()
+    assert frame.matches == matches
+    assert frame.has_asso_output()
+    assert torch.equal(frame.asso_output, asso_output)
+    assert frame.has_traj_score()
+    assert torch.equal(frame.get_traj_score("initial"), traj_score)
+
+    frame.instances = []
+
+    assert frame.video_id.item() == video_id
+    assert frame.num_detected == 0
+    assert not frame.has_instances()
+    assert len(frame.instances) == 0
+    assert not frame.has_gt_track_ids()
+    assert not len(frame.get_gt_track_ids())
+    assert not frame.has_pred_track_ids()
+    assert len(frame.get_pred_track_ids()) == 0
+    assert frame.has_matches()
+    assert frame.has_asso_output()
+    assert frame.has_traj_score()
+
+
+def test_track_queue():
+    window_size = 8
+    max_gap = 10
+    img_shape = (3, 1024, 1024)
+    n_instances_per_frame = [2] * window_size
+
+    frames = []
+    instances_per_frame = []
+
+    tq = TrackQueue(window_size, max_gap)
+    for i in range(window_size):
+        instances = []
+        for j in range(n_instances_per_frame[i]):
+            instances.append(Instance(gt_track_id=j, pred_track_id=j))
+        instances_per_frame.append(instances)
+        frame = Frame(video_id=0, frame_id=i, img_shape=img_shape, instances=instances)
+        frames.append(frame)
+
+        tq.add_frame(frame)
+
+    assert len(tq) == sum(n_instances_per_frame[1:])
+    assert tq.n_tracks == max(n_instances_per_frame)
+    assert tq.tracks == [i for i in range(max(n_instances_per_frame))]
+    assert len(tq.collate_tracks()) == window_size - 1
+    assert all([gap == 0 for gap in tq._curr_gap.values()])
+    assert tq.curr_track == max(n_instances_per_frame) - 1
+
+    tq.add_frame(
+        Frame(
+            video_id=0,
+            frame_id=window_size + 1,
+            img_shape=img_shape,
+            instances=[Instance(gt_track_id=0, pred_track_id=0)],
+        )
+    )
+
+    assert len(tq._queues[0]) == window_size - 1
+    assert tq._curr_gap[0] == 0
+    assert tq._curr_gap[max(n_instances_per_frame) - 1] == 1
+
+    tq.add_frame(
+        Frame(
+            video_id=0,
+            frame_id=window_size + 1,
+            img_shape=img_shape,
+            instances=[
+                Instance(gt_track_id=1, pred_track_id=1),
+                Instance(
+                    gt_track_id=max(n_instances_per_frame),
+                    pred_track_id=max(n_instances_per_frame),
+                ),
+            ],
+        )
+    )
+
+    assert len(tq._queues[max(n_instances_per_frame)]) == 1
+    assert tq._curr_gap[1] == 0
+    assert tq._curr_gap[0] == 1
+
+    for i in range(max_gap):
+        tq.add_frame(
+            Frame(
+                video_id=0,
+                frame_id=window_size + i + 1,
+                img_shape=img_shape,
+                instances=[Instance(gt_track_id=0, pred_track_id=0)],
+            )
+        )
+
+    assert tq.n_tracks == 1
+    assert tq.curr_track == max(n_instances_per_frame)
+    assert 0 in tq._queues.keys()
+
+    tq.end_tracks()
+
+    assert len(tq) == 0
diff --git a/tests/test_datasets.py b/tests/test_datasets.py
index 844efd7..97882a8 100644
--- a/tests/test_datasets.py
+++ b/tests/test_datasets.py
@@ -1,4 +1,5 @@
 """Test dataset logic."""
+
 from biogtr.datasets.base_dataset import BaseDataset
 from biogtr.datasets.data_utils import get_max_padding
 from biogtr.datasets.microscopy_dataset import MicroscopyDataset
@@ -54,8 +55,8 @@ def test_sleap_dataset(two_flies):
     instances = next(iter(train_ds))
 
     assert len(instances) == clip_length
-    assert len(instances[0]["gt_track_ids"]) == 2
-    assert len(instances[0]["gt_track_ids"]) == instances[0]["num_detected"].item()
+    assert len(instances[0].get_gt_track_ids()) == 2
+    assert len(instances[0].get_gt_track_ids()) == instances[0].num_detected
 
     chunk_frac = 0.5
 
@@ -65,10 +66,10 @@ def test_sleap_dataset(two_flies):
         crop_size=128,
         chunk=True,
         clip_length=clip_length,
-        n_chunks = chunk_frac
+        n_chunks=chunk_frac,
     )
 
-    assert len(train_ds) == int(ds_length*chunk_frac)
+    assert len(train_ds) == int(ds_length * chunk_frac)
 
     n_chunks = 2
 
@@ -78,7 +79,7 @@ def test_sleap_dataset(two_flies):
         crop_size=128,
         chunk=True,
         clip_length=clip_length,
-        n_chunks = n_chunks
+        n_chunks=n_chunks,
     )
 
     assert len(train_ds) == n_chunks
@@ -90,7 +91,7 @@ def test_sleap_dataset(two_flies):
         crop_size=128,
         chunk=True,
         clip_length=clip_length,
-        n_chunks = 0
+        n_chunks=0,
     )
 
     assert len(train_ds) == ds_length
@@ -101,14 +102,12 @@ def test_sleap_dataset(two_flies):
         crop_size=128,
         chunk=True,
         clip_length=clip_length,
-        n_chunks = ds_length + 10000
+        n_chunks=ds_length + 10000,
     )
 
     assert len(train_ds) == ds_length
 
 
-
-
 def test_icy_dataset(ten_icy_particles):
     """Test icy dataset logic.
 
@@ -129,8 +128,8 @@ def test_icy_dataset(ten_icy_particles):
     instances = next(iter(train_ds))
 
     assert len(instances) == clip_length
-    assert len(instances[0]["gt_track_ids"]) == 10
-    assert len(instances[0]["gt_track_ids"]) == instances[0]["num_detected"].item()
+    assert len(instances[0].get_gt_track_ids()) == 10
+    assert len(instances[0].get_gt_track_ids()) == instances[0].num_detected
 
 
 def test_trackmate_dataset(trackmate_lysosomes):
@@ -153,8 +152,8 @@ def test_trackmate_dataset(trackmate_lysosomes):
     instances = next(iter(train_ds))
 
     assert len(instances) == clip_length
-    assert len(instances[0]["gt_track_ids"]) == 26
-    assert len(instances[0]["gt_track_ids"]) == instances[0]["num_detected"].item()
+    assert len(instances[0].get_gt_track_ids()) == 26
+    assert len(instances[0].get_gt_track_ids()) == instances[0].num_detected
 
 
 def test_isbi_dataset(isbi_microtubules, isbi_receptors):
@@ -182,8 +181,8 @@ def test_isbi_dataset(isbi_microtubules, isbi_receptors):
         instances = next(iter(train_ds))
 
         assert len(instances) == clip_length
-        assert len(instances[0]["gt_track_ids"]) == num_objects
-        assert len(instances[0]["gt_track_ids"]) == instances[0]["num_detected"].item()
+        assert len(instances[0].get_gt_track_ids()) == num_objects
+        assert len(instances[0].get_gt_track_ids()) == instances[0].num_detected
 
 
 def test_cell_tracking_dataset(cell_tracking):
@@ -195,22 +194,26 @@ def test_cell_tracking_dataset(cell_tracking):
 
     clip_length = 8
 
+    #     print(cell_tracking[0])
+    #     print(cell_tracking[1])
+    #     print(cell_tracking[2])
+
     train_ds = CellTrackingDataset(
         raw_images=[cell_tracking[0]],
         gt_images=[cell_tracking[1]],
         crop_size=128,
         chunk=True,
         clip_length=clip_length,
-        gt_list=cell_tracking[2],
+        gt_list=[cell_tracking[2]],
     )
 
     instances = next(iter(train_ds))
 
-    gt_track_ids_1 = instances[0]["gt_track_ids"]
+    gt_track_ids_1 = instances[0].get_gt_track_ids()
 
     assert len(instances) == clip_length
     assert len(gt_track_ids_1) == 30
-    assert len(gt_track_ids_1) == instances[0]["num_detected"].item()
+    assert len(gt_track_ids_1) == instances[0].num_detected
 
     # fall back to using np.unique when gt_list not available
     train_ds = CellTrackingDataset(
@@ -223,11 +226,11 @@ def test_cell_tracking_dataset(cell_tracking):
 
     instances = next(iter(train_ds))
 
-    gt_track_ids_2 = instances[0]["gt_track_ids"]
+    gt_track_ids_2 = instances[0].get_gt_track_ids()
 
     assert len(instances) == clip_length
     assert len(gt_track_ids_2) == 30
-    assert len(gt_track_ids_2) == instances[0]["num_detected"].item()
+    assert len(gt_track_ids_2) == instances[0].num_detected
     assert gt_track_ids_1.all() == gt_track_ids_2.all()
 
 
@@ -386,8 +389,8 @@ def test_augmentations(two_flies, ten_icy_particles):
 
     augs_instances = next(iter(augs_ds))
 
-    a = no_augs_instances[0]["crops"]
-    b = augs_instances[0]["crops"]
+    a = no_augs_instances[0].get_crops()
+    b = augs_instances[0].get_crops()
 
     assert not torch.all(a.eq(b))
 
@@ -433,7 +436,7 @@ def test_augmentations(two_flies, ten_icy_particles):
 
     augs_instances = next(iter(augs_ds))
 
-    a = no_augs_instances[0]["crops"]
-    b = augs_instances[0]["crops"]
+    a = no_augs_instances[0].get_crops()
+    b = augs_instances[0].get_crops()
 
-    assert not torch.all(a.eq(b))
\ No newline at end of file
+    assert not torch.all(a.eq(b))
diff --git a/tests/test_inference.py b/tests/test_inference.py
index 93f5743..a38a5c9 100644
--- a/tests/test_inference.py
+++ b/tests/test_inference.py
@@ -1,7 +1,9 @@
 """Test inference logic."""
+
 import torch
 import pytest
 import numpy as np
+from biogtr.data_structures import Frame, Instance
 from biogtr.models.global_tracking_transformer import GlobalTrackingTransformer
 from biogtr.inference.tracker import Tracker
 from biogtr.inference import post_processing
@@ -18,19 +20,21 @@ def test_tracker():
     num_detected = 2
     img_shape = (1, 128, 128)
     test_frame = 1
-    instances = []
+    frames = []
 
     for i in range(num_frames):
-        instances.append(
-            {
-                "frame_id": torch.tensor(i),
-                "img_shape": torch.tensor(img_shape),
-                "num_detected": torch.tensor([num_detected]),
-                "crops": torch.rand(size=(num_detected, 1, 64, 64)),
-                "bboxes": torch.rand(size=(num_detected, 4)),
-                "gt_track_ids": torch.arange(num_detected),
-                "pred_track_ids": torch.tensor([-1] * num_detected),
-            }
+        instances = []
+        for j in range(num_detected):
+            instances.append(
+                Instance(
+                    gt_track_id=j,
+                    pred_track_id=-1,
+                    bbox=torch.rand(size=(1, 4)),
+                    crop=torch.rand(size=(1, 1, 64, 64)),
+                )
+            )
+        frames.append(
+            Frame(video_id=0, frame_id=i, img_shape=img_shape, instances=instances)
         )
 
     embedding_meta = {
@@ -59,19 +63,20 @@ def test_tracker():
 
     tracker = Tracker(**tracking_cfg)
 
-    instances_pred = tracker(tracking_transformer, instances)
+    frames_pred = tracker(tracking_transformer, frames)
 
-    asso_equals = (
-        instances_pred[test_frame]["decay_time_traj_score"].to_numpy()
-        == instances_pred[test_frame]["final_traj_score"].to_numpy()
-    ).all()
-    assert asso_equals
+    # TODO: Debug saving asso matrices
+    # asso_equals = (
+    #     frames_pred[test_frame].get_traj_score("decay_time").to_numpy()
+    #     == frames_pred[test_frame].get_traj_score("final").to_numpy()
+    # ).all()
+    # assert asso_equals
 
-    assert len(instances_pred[test_frame]["pred_track_ids"] == num_detected)
+    assert len(frames_pred[test_frame].get_pred_track_ids()) == num_detected
 
 
-#@pytest.mark.parametrize("set_default_device", ["cpu"], indirect=True)
-def test_post_processing(): #set_default_device
+# @pytest.mark.parametrize("set_default_device", ["cpu"], indirect=True)
+def test_post_processing():  # set_default_device
     """Test postprocessing methods.
 
     Tests each postprocessing method to ensure that
@@ -147,50 +152,38 @@ def test_post_processing(): #set_default_device
         )
     ).all()
 
+
 def test_metrics():
     """Test basic GTR Runner."""
     num_frames = 3
     num_detected = 3
     n_batches = 1
-    instances_pred = []
-    
+    batches = []
+
     for i in range(n_batches):
+        frames_pred = []
         for j in range(num_frames):
-            bboxes = torch.tensor(np.random.uniform(size=(num_detected, 4)))
-            bboxes[:, -2:] += 1
-            instances_pred.append(
-                
-                {
-                    "video_id": torch.tensor(0),
-                    "frame_id": torch.tensor(j),
-                    "num_detected": torch.tensor([num_detected]),
-                    "bboxes": bboxes,
-                    "gt_track_ids": torch.arange(num_detected),
-                    "pred_track_ids": torch.arange(num_detected),
-                }
-            )
-    instances_mm = metrics.to_track_eval(instances_pred)
-    clear_mot = metrics.get_pymotmetrics(instances_mm)
+            instances_pred = []
+            for k in range(num_detected):
+                bboxes = torch.tensor(np.random.uniform(size=(num_detected, 4)))
+                bboxes[:, -2:] += 1
+                instances_pred.append(
+                    Instance(gt_track_id=k, pred_track_id=k, bbox=torch.randn((1, 4)))
+                )
+            frames_pred.append(Frame(video_id=0, frame_id=j, instances=instances_pred))
+        batches.append(frames_pred)
 
-    matches, indices, _ = metrics.get_matches(instances_pred)
+    for batch in batches:
+        instances_mm = metrics.to_track_eval(batch)
+        clear_mot = metrics.get_pymotmetrics(instances_mm)
 
-    switches = metrics.get_switches(matches, indices)
+        matches, indices, _ = metrics.get_matches(batch)
 
-    sw_cnt = metrics.get_switch_count(switches)
+        switches = metrics.get_switches(matches, indices)
 
-    assert sw_cnt == clear_mot["num_switches"] == 0, (sw_cnt, clear_mot["num_switches"])
+        sw_cnt = metrics.get_switch_count(switches)
 
-    instances_pred[1]['pred_track_ids'] = torch.tensor([1,2,0])
-    instances_pred[2]['pred_track_ids'] = torch.tensor([2,0,1])
-
-    instances_mm = metrics.to_track_eval(instances_pred)
-    clear_mot = metrics.get_pymotmetrics(instances_mm)
-
-    matches, indices, _ = metrics.get_matches(instances_pred)
-
-    switches = metrics.get_switches(matches, indices)
-
-    sw_cnt = metrics.get_switch_count(switches)
-
-    assert sw_cnt == clear_mot["num_switches"] == 6, (instances_pred[1]['gt_track_ids'],instances_pred[1]['pred_track_ids'], sw_cnt, clear_mot["num_switches"])
-    
+        assert sw_cnt == clear_mot["num_switches"] == 0, (
+            sw_cnt,
+            clear_mot["num_switches"],
+        )
diff --git a/tests/test_models.py b/tests/test_models.py
index f85fdfb..ceae0bc 100644
--- a/tests/test_models.py
+++ b/tests/test_models.py
@@ -1,7 +1,8 @@
 """Test model modules."""
+
 import pytest
 import torch
-import numpy as np
+from biogtr.data_structures import Frame, Instance
 from biogtr.models.attention_head import MLP, ATTWeightHead
 from biogtr.models.embedding import Embedding
 from biogtr.models.global_tracking_transformer import GlobalTrackingTransformer
@@ -122,6 +123,7 @@ def test_embedding_kwargs():
 
     lp_args = {"learn_pos_emb_num": 100, "over_boxes": False}
 
+    emb = Embedding()
     lp_with_args = emb._learned_pos_embedding(boxes, **lp_args)
 
     assert not torch.equal(lp_no_args, lp_with_args)
@@ -132,6 +134,7 @@ def test_embedding_kwargs():
 
     lt_args = {"learn_temp_emb_num": 100}
 
+    emb = Embedding()
     lt_with_args = emb._learned_temp_embedding(times, **lt_args)
 
     assert not torch.equal(lt_no_args, lt_with_args)
@@ -207,20 +210,19 @@ def test_transformer_basic():
         feature_dim_attn_head=feats,
     )
 
-    instances = []
+    frames = []
 
     for i in range(num_frames):
-        instances.append(
-            {
-                "frame_id": torch.tensor(i),
-                "img_shape": torch.tensor(img_shape),
-                "num_detected": torch.tensor([num_detected]),
-                "bboxes": torch.rand(size=(num_detected, 4)),
-                "features": torch.rand(size=(num_detected, feats)),
-            }
-        )
+        instances = []
+        for j in range(num_detected):
+            instances.append(
+                Instance(
+                    bbox=torch.rand(size=(1, 4)), features=torch.rand(size=(1, feats))
+                )
+            )
+        frames.append(Frame(video_id=0, frame_id=i, instances=instances))
 
-    asso_preds = transformer(instances)
+    asso_preds, _ = transformer(frames)
 
     assert asso_preds[0].size() == (num_detected * num_frames,) * 2
 
@@ -270,18 +272,17 @@ def test_transformer_embedding():
     num_detected = 10
     img_shape = (1, 50, 50)
 
-    instances = []
+    frames = []
 
     for i in range(num_frames):
-        instances.append(
-            {
-                "frame_id": torch.tensor(i),
-                "img_shape": torch.tensor(img_shape),
-                "num_detected": torch.tensor([num_detected]),
-                "bboxes": torch.rand(size=(num_detected, 4)),
-                "features": torch.rand(size=(num_detected, feats)),
-            }
-        )
+        instances = []
+        for j in range(num_detected):
+            instances.append(
+                Instance(
+                    bbox=torch.rand(size=(1, 4)), features=torch.rand(size=(1, feats))
+                )
+            )
+        frames.append(Frame(video_id=0, frame_id=i, instances=instances))
 
     embedding_meta = {
         "embedding_type": "learned_pos_temp",
@@ -302,7 +303,7 @@ def test_transformer_embedding():
         return_embedding=True,
     )
 
-    asso_preds, embedding = transformer(instances)
+    asso_preds, embedding = transformer(frames)
 
     assert asso_preds[0].size() == (num_detected * num_frames,) * 2
     assert embedding.size() == (num_detected * num_frames, 1, feats)
@@ -315,17 +316,18 @@ def test_tracking_transformer():
     num_detected = 20
     img_shape = (1, 128, 128)
 
-    instances = []
+    frames = []
 
     for i in range(num_frames):
-        instances.append(
-            {
-                "frame_id": torch.tensor(i),
-                "img_shape": torch.tensor(img_shape),
-                "num_detected": torch.tensor([num_detected]),
-                "crops": torch.rand(size=(num_detected, 1, 64, 64)),
-                "bboxes": torch.rand(size=(num_detected, 4)),
-            }
+        instances = []
+        for j in range(num_detected):
+            instances.append(
+                Instance(
+                    bbox=torch.rand(size=(1, 4)), crop=torch.rand(size=(1, 1, 64, 64))
+                )
+            )
+        frames.append(
+            Frame(video_id=0, frame_id=i, img_shape=img_shape, instances=instances)
         )
 
     embedding_meta = {
@@ -347,7 +349,7 @@ def test_tracking_transformer():
         return_embedding=True,
     )
 
-    asso_preds, embedding = tracking_transformer(instances)
+    asso_preds, embedding = tracking_transformer(frames)
 
     assert asso_preds[0].size() == (num_detected * num_frames,) * 2
     assert embedding.size() == (num_detected * num_frames, 1, feats)
diff --git a/tests/test_training.py b/tests/test_training.py
index 79a65a7..9120af4 100644
--- a/tests/test_training.py
+++ b/tests/test_training.py
@@ -1,7 +1,9 @@
 """Test training logic."""
+
 import os
 import pytest
 import torch
+from biogtr.data_structures import Frame, Instance
 from biogtr.training.losses import AssoLoss
 from biogtr.models.gtr_runner import GTRRunner
 from biogtr.models.global_tracking_transformer import GlobalTrackingTransformer
@@ -18,23 +20,21 @@ def test_asso_loss():
     num_detected = 20
     img_shape = (1, 128, 128)
 
-    instances = []
+    frames = []
 
     for i in range(num_frames):
-        instances.append(
-            {
-                "img_shape": torch.tensor(img_shape),
-                "num_detected": torch.tensor([num_detected]),
-                "gt_track_ids": torch.arange(num_detected),
-                "bboxes": torch.rand(size=(num_detected, 4)),
-            }
+        instances = []
+        for j in range(num_detected):
+            instances.append(Instance(gt_track_id=j, bbox=torch.rand(size=(1, 4))))
+        frames.append(
+            Frame(video_id=0, frame_id=i, instances=instances, img_shape=img_shape)
         )
 
     asso_loss = AssoLoss(neg_unmatched=True, asso_weight=10.0)
 
     asso_preds = torch.rand(size=(1, 100, 100))
 
-    loss = asso_loss(asso_preds, instances)
+    loss = asso_loss(asso_preds, frames)
 
     assert len(loss.size()) == 0
     assert type(loss.item()) == float
@@ -47,25 +47,33 @@ def test_basic_gtr_runner():
     num_detected = 3
     img_shape = (1, 128, 128)
     n_batches = 2
-    instances = []
     train_ds = []
     epochs = 2
-
+    frame_ind = 0
     for i in range(n_batches):
+        frames = []
         for j in range(num_frames):
-            instances.append(
-                {
-                    "video_id": torch.tensor(0),
-                    "frame_id": torch.tensor(j),
-                    "img_shape": torch.tensor(img_shape),
-                    "num_detected": torch.tensor([num_detected]),
-                    "crops": torch.rand(size=(num_detected, 1, 64, 64)),
-                    "bboxes": torch.rand(size=(num_detected, 4)),
-                    "gt_track_ids": torch.arange(num_detected),
-                    "pred_track_ids": torch.tensor([-1] * num_detected),
-                }
+            instances = []
+            for k in range(num_detected):
+                instances.append(
+                    Instance(
+                        gt_track_id=k,
+                        pred_track_id=-1,
+                        bbox=torch.rand(size=(1, 4)),
+                        crop=torch.randn(size=img_shape),
+                    ),
+                )
+
+            frames.append(
+                Frame(
+                    video_id=0,
+                    frame_id=frame_ind,
+                    instances=instances,
+                    img_shape=img_shape,
+                )
             )
-        train_ds.append([instances])
+            frame_ind += 1
+        train_ds.append(frames)
 
     gtr_runner = GTRRunner()
 
@@ -91,24 +99,23 @@ def test_basic_gtr_runner():
 
     for epoch in range(epochs):
         for i, batch in enumerate(train_ds):
+            gtr_runner.train()
             assert gtr_runner.model.training
-            metrics = gtr_runner.training_step(batch, i)
-            assert "loss" in metrics and "num_switches" not in metrics
+            metrics = gtr_runner.training_step([batch], i)
+            assert "loss" in metrics
             assert metrics["loss"].requires_grad
 
         for j, batch in enumerate(train_ds):
             gtr_runner.eval()
             with torch.no_grad():
-                metrics = gtr_runner.validation_step(batch, j)
+                metrics = gtr_runner.validation_step([batch], j)
             assert "loss" in metrics and "num_switches" in metrics
             assert not metrics["loss"].requires_grad
 
-        gtr_runner.train()
-
     for k, batch in enumerate(train_ds):
         gtr_runner.eval()
         with torch.no_grad():
-            metrics = gtr_runner.test_step(batch, k)
+            metrics = gtr_runner.test_step([batch], k)
         assert "loss" in metrics and "num_switches" in metrics
         assert not metrics["loss"].requires_grad
 
diff --git a/tests/test_version.py b/tests/test_version.py
index 3f9e7e0..6bde7e4 100644
--- a/tests/test_version.py
+++ b/tests/test_version.py
@@ -1,4 +1,5 @@
 """Test version."""
+
 import biogtr
 
 

From a1d24a84bbb1782ddc00465104378501ab4a67f7 Mon Sep 17 00:00:00 2001
From: Aaditya Prasad <78439225+aaprasad@users.noreply.github.com>
Date: Mon, 22 Apr 2024 13:11:11 -0700
Subject: [PATCH 14/14] Apply suggestions from @coderabbitai's code review

Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com>
---
 biogtr/config.py                 | 1 -
 biogtr/datasets/sleap_dataset.py | 1 -
 biogtr/visualize.py              | 1 -
 3 files changed, 3 deletions(-)

diff --git a/biogtr/config.py b/biogtr/config.py
index 00a24a1..a1cbf69 100644
--- a/biogtr/config.py
+++ b/biogtr/config.py
@@ -11,7 +11,6 @@
 from pprint import pprint
 from typing import Union, Iterable
 from pathlib import Path
-import os
 import pytorch_lightning as pl
 import torch
 
diff --git a/biogtr/datasets/sleap_dataset.py b/biogtr/datasets/sleap_dataset.py
index 73ef5be..42ded8b 100644
--- a/biogtr/datasets/sleap_dataset.py
+++ b/biogtr/datasets/sleap_dataset.py
@@ -6,7 +6,6 @@
 import numpy as np
 import sleap_io as sio
 import random
-import warnings
 from biogtr.data_structures import Frame, Instance
 from biogtr.datasets import data_utils
 from biogtr.datasets.base_dataset import BaseDataset
diff --git a/biogtr/visualize.py b/biogtr/visualize.py
index bafcf14..f8e710b 100644
--- a/biogtr/visualize.py
+++ b/biogtr/visualize.py
@@ -12,7 +12,6 @@
 import numpy as np
 import cv2
 from matplotlib import pyplot
-import gc
 
 palette = sns.color_palette("tab20")