Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ImbalancedSampler support #4198

Merged
merged 6 commits into from
Mar 5, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 54 additions & 0 deletions test/loader/test_imbalanced_sampler.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
from typing import List

import torch
from torch import Tensor

from torch_geometric.data import Data
from torch_geometric.loader import (DataLoader, ImbalancedSampler,
NeighborLoader)


def test_dataloader_with_imbalanced_sampler():
data_list: List[Data] = []
for _ in range(10):
data_list.append(Data(num_nodes=10, y=0))
for _ in range(90):
data_list.append(Data(num_nodes=10, y=1))

torch.manual_seed(12345)
sampler = ImbalancedSampler(data_list)
loader = DataLoader(data_list, batch_size=10, sampler=sampler)

ys: List[Tensor] = []
for batch in loader:
ys.append(batch.y)

histogram = torch.cat(ys).bincount()
prob = histogram / histogram.sum()

assert histogram.sum() == len(data_list)
assert prob.min() > 0.4 and prob.max() < 0.6


def test_neighbor_loader_with_imbalanced_sampler():
zeros = torch.zeros(10, dtype=torch.long)
ones = torch.ones(90, dtype=torch.long)

y = torch.cat([zeros, ones], dim=0)
edge_index = torch.empty((2, 0), dtype=torch.long)
data = Data(edge_index=edge_index, y=y, num_nodes=y.size(0))

torch.manual_seed(12345)
sampler = ImbalancedSampler(data)
loader = NeighborLoader(data, batch_size=10, sampler=sampler,
num_neighbors=[-1])

ys: List[Tensor] = []
for batch in loader:
ys.append(batch.y)

histogram = torch.cat(ys).bincount()
prob = histogram / histogram.sum()

assert histogram.sum() == data.num_nodes
assert prob.min() > 0.4 and prob.max() < 0.6
10 changes: 5 additions & 5 deletions torch_geometric/loader/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,11 @@
from .random_node_sampler import RandomNodeSampler
from .data_list_loader import DataListLoader
from .dense_data_loader import DenseDataLoader
from .neighbor_sampler import NeighborSampler
from .temporal_dataloader import TemporalDataLoader
from .neighbor_sampler import NeighborSampler
from .imbalanced_sampler import ImbalancedSampler

__all__ = [
__all__ = classes = [
'DataLoader',
'NeighborLoader',
'HGTLoader',
Expand All @@ -25,8 +26,7 @@
'RandomNodeSampler',
'DataListLoader',
'DenseDataLoader',
'NeighborSampler',
'TemporalDataLoader',
'NeighborSampler',
'ImbalancedSampler',
]

classes = __all__
82 changes: 82 additions & 0 deletions torch_geometric/loader/imbalanced_sampler.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
from typing import List, Optional, Union

import torch
from torch import Tensor

from torch_geometric.data import Data, Dataset, InMemoryDataset


class ImbalancedSampler(torch.utils.data.WeightedRandomSampler):
r"""A weighted random sampler that randomly samples elements according to
class distribution.
As such, it will either remove samples from the majority class
(under-sampling) or add more examples from the minority class
(over-sampling).

**Graph-level sampling:**

.. code-block:: python

from torch_geometric.loader import DataLoader, ImbalancedSampler

sampler = ImbalancedSampler(dataset)
loader = DataLoader(dataset, batch_size=64, sampler=sampler, ...)

**Node-level sampling:**

.. code-block:: python

from torch_geometric.loader import NeighborLoader, ImbalancedSampler

sampler = ImbalancedSampler(data, input_nodes=data.train_mask)
loader = NeighborLoader(data, input_nodes=data.train_mask,
batch_size=64, num_neighbors=[-1, -1],
sampler=sampler, ...)

Args:
dataset (Dataset or Data): The dataset from which to sample the data,
either given as a :class:`~torch_geometric.data.Dataset` or
:class:`~torch_geometric.data.Data` object.
input_nodes (Tensor, optional): The indices of nodes that are used by
the corresponding loader, *e.g.*, by
:class:`~torch_geometric.loader.NeighborLoader`.
If set to :obj:`None`, all nodes will be considered.
This argument should only be set for node-level loaders and does
not have any effect when operating on a set of graphs as given by
:class:`~torch_geometric.data.Dataset`. (default: :obj:`None`)
num_samples (int, optional): The number of samples to draw for a single
epoch. If set to :obj:`None`, will sample as much elements as there
exists in the underlying data. (default: :obj:`None`)
"""
def __init__(
self,
dataset: Union[Data, Dataset, List[Data]],
input_nodes: Optional[Tensor] = None,
num_samples: Optional[int] = None,
):

if isinstance(dataset, Data):
y = dataset.y.view(-1)
assert dataset.num_nodes == y.numel()
y = y[input_nodes] if input_nodes is not None else y

elif isinstance(dataset, InMemoryDataset):
y = dataset.data.y.view(-1)
assert len(dataset) == y.numel()

else:
ys = [data.y for data in dataset]
if isinstance(ys[0], Tensor):
y = torch.cat(ys, dim=0).view(-1)
else:
y = torch.tensor(ys).view(-1)
assert len(dataset) == y.numel()

assert y.dtype == torch.long # Require classification.

num_samples = y.numel() if num_samples is None else num_samples

class_weight = 1. / y.bincount()
weight = class_weight[y]

return super().__init__(weight, num_samples, replacement=True)