"""Dataset classes for handling various data modalities in machine learning workflows.
This module provides flexible dataset classes that bridge the gap between common data
formats (numpy, pandas) and PyTorch tensors, with support for both in-memory and
out-of-memory datasets.
Classes:
BaseDataset: Abstract base class defining the dataset interface.
Dataset: In-memory dataset supporting numpy, pandas, and torch tensors.
BatchedDataset: Abstract class for datasets too large to fit in memory.
"""
from abc import ABC, abstractmethod
import numpy as np
import pandas as pd
import torch
from torch import Tensor
from typing_extensions import Self
class BaseDataset(torch.utils.data.Dataset, ABC):
"""Abstract base class for all dataset types.
Defines the common interface that all dataset classes must implement,
including properties for introspecting dataset dimensions.
"""
@property
@abstractmethod
def num_samples(self):
"""int: Total number of samples in the dataset."""
pass
@property
@abstractmethod
def feature_dimension(self):
"""Int or tuple: Dimension(s) of the feature space."""
pass
@property
@abstractmethod
def target_dimension(self):
"""Int or tuple: Dimension(s) of the target space."""
pass
[docs]
class BatchedDataset(BaseDataset, ABC):
    """Abstract base for datasets that cannot be held in memory all at once.

    Intended for data that can only be accessed piecewise. Subclasses supply
    ``__getitem__`` and ``__len__`` (following ``torch.utils.data.Dataset``
    conventions) together with the ``feature_dimension`` and
    ``target_dimension`` properties. ``num_samples`` is provided here and is
    derived from ``__len__``.

    Examples:
        >>> class MyLargeDataset(BatchedDataset):
        ...     def __init__(self, data_path):
        ...         super().__init__()
        ...         self.data_path = data_path
        ...         # Load metadata to determine shapes
        ...
        ...     def __getitem__(self, idx):
        ...         # Load sample(s) from disk
        ...         return X, y, idx
        ...
        ...     def __len__(self):
        ...         return self.total_samples
        ...
        ...     @property
        ...     def feature_dimension(self):
        ...         return self.n_features
        ...
        ...     @property
        ...     def target_dimension(self):
        ...         return self.n_targets
    """

    def __init__(self):
        """Initialize the dataset by delegating to the parent initializer."""
        super().__init__()

    @abstractmethod
    def __getitem__(self, idx):
        """Fetch one sample or a batch of samples.

        Args:
            idx (int or slice): Index or slice of the samples to retrieve.

        Returns:
            tuple: ``(X, y, idx)`` where ``X`` holds the features and ``y``
            the target(s).
        """

    @abstractmethod
    def __len__(self):
        """Report how many samples the dataset contains.

        Returns:
            int: Total number of samples.
        """

    @property
    def num_samples(self):
        """int: Total number of samples, as reported by ``len(self)``."""
        return len(self)

    @property
    @abstractmethod
    def feature_dimension(self):
        """int or tuple: Dimension(s) of the feature space.

        Subclasses should implement this by inspecting metadata or a sample.
        """

    @property
    @abstractmethod
    def target_dimension(self):
        """int or tuple: Dimension(s) of the target space.

        Subclasses should implement this by inspecting metadata or a sample.
        """
[docs]
class Dataset(BaseDataset, torch.utils.data.TensorDataset):
    """In-memory dataset for classical machine learning tasks.

    Handles data matrices with labels/response vectors that fit in memory.
    Automatically converts numpy arrays and pandas DataFrames/Series to
    PyTorch tensors. Suitable for GLMs, classical statistical problems, and
    convex optimization tasks.

    Features are always cast to ``dtype``. Targets with an integer dtype
    (numpy or torch) are stored as ``torch.long`` so they can be used as class
    labels; all other targets are cast to ``dtype``.

    Args:
        X (Tensor, np.ndarray, pd.DataFrame, or pd.Series): Feature matrix of
            shape (n_samples, n_features).
        y (Tensor, np.ndarray, pd.DataFrame, or pd.Series): Target array of
            shape (n_samples, ...). Can be any dimensionality >= 1.
        device (str or torch.device, optional): Device to place tensors on
            (e.g., 'cpu', 'cuda', 'cuda:0'). Defaults to None, which leaves
            the tensors where they are.
        dtype (torch.dtype, optional): Data type for tensors. Defaults to
            torch.float32.

    Raises:
        TypeError: If X or y is not a Tensor, numpy array, or pandas
            DataFrame/Series.
        ValueError: If X is not 2-dimensional, if y has no sample dimension,
            or if X and y have mismatched sample sizes.

    Examples:
        >>> # From numpy
        >>> X = np.random.randn(100, 10)
        >>> y = np.random.randn(100)
        >>> data = Dataset(X, y)
        >>> # From pandas with device specification
        >>> df = pd.DataFrame({'x1': [1, 2], 'x2': [3, 4]})
        >>> y = pd.Series([5, 6])
        >>> data = Dataset(df, y, device='cuda')
        >>> # Multi-target
        >>> y_multi = np.random.randn(100, 3)
        >>> data = Dataset(X, y_multi)
    """

    # Tensor dtypes treated as integer labels and promoted to int64. This
    # mirrors the numpy path, where every integer dtype is promoted to long.
    _INTEGER_DTYPES = (
        torch.uint8,
        torch.int8,
        torch.int16,
        torch.int32,
        torch.int64,
    )

    def __init__(
        self,
        X: Tensor | np.ndarray | pd.DataFrame | pd.Series,
        y: Tensor | np.ndarray | pd.DataFrame | pd.Series,
        device: str | torch.device | None = None,
        dtype: torch.dtype = torch.float32,
    ):
        """Initialize Dataset with feature matrix and target array."""
        X = self._to_feature_tensor(X, dtype)
        y = self._to_target_tensor(y, dtype)

        # Handle device placement
        if device is not None:
            device = torch.device(device)
            X = X.to(device)
            y = y.to(device)

        # Validate X dimensions
        if X.ndim != 2:
            raise ValueError("Input data tensor X must be 2-dimensional.")

        # A 0-d target has no sample axis; fail clearly instead of letting
        # ``y.shape[0]`` raise an IndexError below.
        if y.ndim == 0:
            raise ValueError("Target tensor y must have at least 1 dimension.")

        # Validate matching sample sizes
        if X.shape[0] != y.shape[0]:
            raise ValueError(
                f"X and y must have the same number of samples. "
                f"Got X: {X.shape[0]}, y: {y.shape[0]}"
            )

        super().__init__(X, y)

    @staticmethod
    def _unwrap_pandas(data):
        """Return the numpy array behind a pandas object; pass others through."""
        if isinstance(data, (pd.DataFrame, pd.Series)):
            return data.to_numpy()
        return data

    @classmethod
    def _to_feature_tensor(cls, X, dtype: torch.dtype) -> Tensor:
        """Convert supported feature input to a tensor of the given dtype."""
        X = cls._unwrap_pandas(X)
        if isinstance(X, np.ndarray):
            return torch.from_numpy(X).to(dtype)
        if isinstance(X, Tensor):
            # ``Tensor.to`` returns the same tensor when the dtype already matches.
            return X.to(dtype)
        raise TypeError(
            "X must be a torch.Tensor, np.ndarray, pd.DataFrame, or pd.Series; "
            f"got {type(X).__name__}."
        )

    @classmethod
    def _to_target_tensor(cls, y, dtype: torch.dtype) -> Tensor:
        """Convert supported target input to a tensor.

        Integer targets become ``torch.long`` (classification labels); all
        other targets are cast to ``dtype`` (regression).
        """
        y = cls._unwrap_pandas(y)
        if isinstance(y, np.ndarray):
            if np.issubdtype(y.dtype, np.integer):
                return torch.from_numpy(y).long()
            return torch.from_numpy(y).to(dtype)
        if isinstance(y, Tensor):
            if y.dtype in cls._INTEGER_DTYPES:
                return y.long()
            return y.to(dtype)
        raise TypeError(
            "y must be a torch.Tensor, np.ndarray, pd.DataFrame, or pd.Series; "
            f"got {type(y).__name__}."
        )

    def __getitem__(self, index):
        """Retrieve sample(s) together with the index used to fetch them.

        Args:
            index: Anything accepted by tensor indexing (int, slice, ...).

        Returns:
            tuple: ``(X, y, index)``. A slice is returned unchanged; any other
            index is returned as a ``torch.long`` tensor (0-dimensional for a
            plain int) so that it can be collated.
        """
        # The parent TensorDataset returns one indexed entry per stored tensor.
        X, y = super().__getitem__(index)
        if isinstance(index, slice):
            return X, y, index
        return X, y, torch.as_tensor(index, dtype=torch.long)

    @classmethod
    def from_numpy(
        cls,
        X: np.ndarray,
        y: np.ndarray,
        device: str | torch.device | None = None,
        dtype: torch.dtype = torch.float32,
    ) -> Self:
        """Create a Dataset from numpy arrays.

        Args:
            X (np.ndarray): Feature matrix of shape (n_samples, n_features).
            y (np.ndarray): Target array of shape (n_samples, ...). Can be any
                dimensionality.
            device (str or torch.device, optional): Device to place tensors on
                (e.g., 'cpu', 'cuda', 'cuda:0'). Defaults to None.
            dtype (torch.dtype, optional): Data type for tensors. Defaults to
                torch.float32.

        Returns:
            Dataset: Dataset instance with data on specified device.

        Examples:
            >>> X = np.random.randn(100, 10)
            >>> y = np.random.randn(100)
            >>> data = Dataset.from_numpy(X, y, device='cuda')
        """
        return cls(X, y, device=device, dtype=dtype)

    @classmethod
    def from_pandas(
        cls,
        X: pd.DataFrame | pd.Series,
        y: pd.DataFrame | pd.Series,
        device: str | torch.device | None = None,
        dtype: torch.dtype = torch.float32,
    ) -> Self:
        """Create a Dataset from pandas DataFrames or Series.

        Args:
            X (pd.DataFrame or pd.Series): Feature data of shape
                (n_samples, n_features).
            y (pd.DataFrame or pd.Series): Target data of shape
                (n_samples, ...). Can be any dimensionality.
            device (str or torch.device, optional): Device to place tensors on
                (e.g., 'cpu', 'cuda', 'cuda:0'). Defaults to None.
            dtype (torch.dtype, optional): Data type for tensors. Defaults to
                torch.float32.

        Returns:
            Dataset: Dataset instance with data on specified device.

        Examples:
            >>> # From separate DataFrames
            >>> df_X = pd.DataFrame({'x1': [1, 2, 3], 'x2': [4, 5, 6]})
            >>> df_y = pd.Series([7, 8, 9])
            >>> data = Dataset.from_pandas(df_X, df_y)
            >>> # From a single DataFrame using column selection
            >>> df = pd.DataFrame({'x1': [1, 2, 3], 'x2': [4, 5, 6], 'y': [7, 8, 9]})
            >>> data = Dataset.from_pandas(df[['x1', 'x2']], df['y'])
            >>> # Multi-target
            >>> df_multi = pd.DataFrame({'x1': [1, 2], 'y1': [3, 4], 'y2': [5, 6]})
            >>> data = Dataset.from_pandas(df_multi[['x1']], df_multi[['y1', 'y2']])
        """
        return cls(X, y, device=device, dtype=dtype)

    def to(self, device: str | torch.device) -> Self:
        """Move dataset to specified device.

        The dataset's dtype is preserved: it is passed through explicitly so
        the constructor's float32 default cannot silently downcast the data.

        Args:
            device (str or torch.device): Target device (e.g., 'cpu', 'cuda',
                'cuda:0').

        Returns:
            Dataset: New instance of the same class on the specified device.

        Examples:
            >>> data = Dataset(X, y, device='cpu')
            >>> data_gpu = data.to('cuda')
            >>> print(data_gpu.device)  # cuda:0
        """
        device = torch.device(device)
        # Build through type(self), like the from_* constructors use cls, so
        # subclasses get an instance of their own type back.
        return type(self)(self.X, self.y, device=device, dtype=self.dtype)

    @property
    def device(self) -> torch.device:
        """torch.device: Device where the feature tensor is stored."""
        return self.X.device

    @property
    def dtype(self) -> torch.dtype:
        """torch.dtype: Data type of the feature tensor."""
        return self.X.dtype

    @property
    def num_samples(self):
        """int: Total number of samples in the dataset."""
        return self.tensors[0].shape[0]

    @property
    def feature_dimension(self):
        """int: Number of features in the dataset."""
        return self.tensors[0].shape[1]

    @property
    def target_dimension(self):
        """int or tuple: Dimension(s) of the target.

        Returns 1 for 1D targets (shape (n,)). For multi-dimensional targets,
        returns the target shape with the sample dimension removed (a
        ``torch.Size``, which compares equal to the corresponding tuple).

        Examples:
            >>> # 1D target
            >>> y = torch.randn(100)
            >>> data = Dataset(X, y)
            >>> data.target_dimension  # 1
            >>> # 2D multi-target
            >>> y = torch.randn(100, 5)
            >>> data = Dataset(X, y)
            >>> data.target_dimension  # (5,)
            >>> # 3D target (e.g., images)
            >>> y = torch.randn(100, 3, 28, 28)
            >>> data = Dataset(X, y)
            >>> data.target_dimension  # (3, 28, 28)
        """
        target = self.tensors[1]
        if target.ndim == 1:
            return 1
        return target.shape[1:]  # all dimensions after the sample axis

    @property
    def X(self):
        """Tensor: Feature matrix of shape (n_samples, n_features)."""
        return self.tensors[0]

    @property
    def y(self):
        """Tensor: Target array of shape (n_samples, ...)."""
        return self.tensors[1]

    def __repr__(self):
        """Return string representation of the dataset.

        Returns:
            str: String showing dataset dimensions, dtype, and device.
        """
        return (
            f"{self.__class__.__name__}("
            f"num_samples={self.num_samples}, "
            f"feature_dimension={self.feature_dimension}, "
            f"target_dimension={self.target_dimension}, "
            f"dtype={self.dtype}, "
            f"device={self.device})"
        )