# Source code for rlaopt.data.datasets

"""Dataset classes for handling various data modalities in machine learning workflows.

This module provides flexible dataset classes that bridge the gap between common data
formats (numpy, pandas) and PyTorch tensors, with support for both in-memory and
out-of-memory datasets.

Classes:
    BaseDataset: Base abstract class defining the dataset interface.
    Dataset: In-memory dataset supporting numpy, pandas, and torch tensors.
    BatchedDataset: Abstract class for datasets too large to fit in memory.
"""

from abc import ABC, abstractmethod

import numpy as np
import pandas as pd
import torch
from torch import Tensor
from typing_extensions import Self


class BaseDataset(torch.utils.data.Dataset, ABC):
    """Common abstract interface shared by every dataset class in this module.

    Concrete subclasses must provide three read-only properties describing
    the size of the dataset and the shape of its features and targets. The
    class cannot be instantiated until all three are overridden.
    """

    # Each abstract property below has a docstring-only body: the docstring is
    # the function body, so calling the base getter simply returns None.

    @property
    @abstractmethod
    def num_samples(self):
        """int: Number of samples contained in the dataset."""

    @property
    @abstractmethod
    def feature_dimension(self):
        """int or tuple: Size (or shape) of a single sample's features."""

    @property
    @abstractmethod
    def target_dimension(self):
        """int or tuple: Size (or shape) of a single sample's target."""



class BatchedDataset(BaseDataset, ABC):
    """Abstract base class for datasets that are too large to fit in memory.

    Subclasses must implement __getitem__ and __len__ following torch.utils.data.Dataset
    conventions, as well as properties to introspect feature and target dimensions.

    This class is designed for datasets that can only be accessed in batches,
    where loading the entire dataset into memory is infeasible.

    ``num_samples`` is provided here and simply delegates to ``len(self)``, so
    subclasses only need to implement ``__len__``.

    Examples:
        >>> class MyLargeDataset(BatchedDataset):
        ...     def __init__(self, data_path):
        ...         self.data_path = data_path
        ...         # Load metadata to determine shapes
        ...
        ...     def __getitem__(self, idx):
        ...         # Load sample(s) from disk
        ...         return X, y, idx
        ...
        ...     def __len__(self):
        ...         return self.total_samples
        ...
        ...     @property
        ...     def feature_dimension(self):
        ...         return self.n_features
        ...
        ...     @property
        ...     def target_dimension(self):
        ...         return self.n_targets
    """

    def __init__(self):
        """Initialize BatchedDataset."""
        super().__init__()

    @abstractmethod
    def __getitem__(self, idx):
        """Retrieve a sample or batch of samples.

        Args:
            idx (int or slice): Index or slice of samples to retrieve.

        Returns:
            tuple: (X, y, idx) where X is features, y is target(s), and idx
            is the index of the retrieved sample(s).
        """

    @abstractmethod
    def __len__(self):
        """Return the total number of samples in the dataset.

        Returns:
            int: Total number of samples.
        """

    @property
    def num_samples(self):
        """int: Total number of samples in the dataset (same as ``len(self)``)."""
        return len(self)

    @property
    @abstractmethod
    def feature_dimension(self):
        """Int or tuple: Dimension(s) of the feature space.

        Subclasses should implement this by inspecting metadata or a sample.
        """

    @property
    @abstractmethod
    def target_dimension(self):
        """Int or tuple: Dimension(s) of the target space.

        Subclasses should implement this by inspecting metadata or a sample.
        """




class Dataset(BaseDataset, torch.utils.data.TensorDataset):
    """In-memory dataset for classical machine learning tasks.

    Handles data matrices with labels/response vectors that fit in memory.
    Automatically converts numpy arrays and pandas DataFrames/Series to PyTorch tensors.
    Suitable for GLMs, classical statistical problems, and convex optimization tasks.

    ``X`` is always cast to ``dtype``. ``y`` is cast to ``torch.long`` when it
    holds integers (any numpy integer dtype, or an int32/int64 tensor) so it
    can be used as class labels; otherwise it is cast to ``dtype``.

    Args:
        X (Tensor, np.ndarray, pd.DataFrame, or pd.Series): Feature matrix of shape
            (n_samples, n_features).
        y (Tensor, np.ndarray, pd.DataFrame, or pd.Series): Target array of shape
            (n_samples, ...). Can be any dimensionality.
        device (str or torch.device, optional): Device to place tensors on
            (e.g., 'cpu', 'cuda', 'cuda:0'). Defaults to None.
        dtype (torch.dtype, optional): Data type for tensors. Defaults to torch.float32.

    Raises:
        TypeError: If X or y is not a Tensor, numpy array, or pandas
            DataFrame/Series.
        ValueError: If X is not 2-dimensional, if y is 0-dimensional, or if
            X and y have mismatched sample sizes.

    Examples:
        >>> # From numpy
        >>> X = np.random.randn(100, 10)
        >>> y = np.random.randn(100)
        >>> data = Dataset(X, y)

        >>> # From pandas with device specification
        >>> df = pd.DataFrame({'x1': [1, 2], 'x2': [3, 4]})
        >>> y = pd.Series([5, 6])
        >>> data = Dataset(df, y, device='cuda')

        >>> # Multi-target
        >>> y_multi = np.random.randn(100, 3)
        >>> data = Dataset(X, y_multi)
    """

    def __init__(
        self,
        X: Tensor | np.ndarray | pd.DataFrame | pd.Series,
        y: Tensor | np.ndarray | pd.DataFrame | pd.Series,
        device: str | torch.device | None = None,
        dtype: torch.dtype = torch.float32,
    ):
        """Initialize Dataset with feature matrix and target array."""
        # Convert pandas to numpy first. to_numpy() is the accessor pandas
        # recommends and yields an ndarray, whereas Series.values may return
        # an ndarray-like extension array that the checks below would miss.
        if isinstance(X, (pd.DataFrame, pd.Series)):
            X = X.to_numpy()
        if isinstance(y, (pd.DataFrame, pd.Series)):
            y = y.to_numpy()

        # Convert X to a tensor with the specified dtype.
        if isinstance(X, np.ndarray):
            X = torch.from_numpy(X)
        if not isinstance(X, Tensor):
            raise TypeError(
                "X must be a torch.Tensor, numpy.ndarray, pandas.DataFrame, or "
                f"pandas.Series; got {type(X).__name__}."
            )
        X = X.to(dtype)  # no-op when X already has the requested dtype

        # Carefully handle dtype of y: preserve integer types for
        # classification, use the specified dtype for regression.
        if isinstance(y, np.ndarray):
            if np.issubdtype(y.dtype, np.integer):
                y = torch.from_numpy(y).long()
            else:
                y = torch.from_numpy(y).to(dtype)
        elif isinstance(y, Tensor):
            # torch.long is an alias of torch.int64, so two entries suffice.
            if y.dtype in (torch.int32, torch.int64):
                y = y.long()
            else:
                y = y.to(dtype)
        else:
            raise TypeError(
                "y must be a torch.Tensor, numpy.ndarray, pandas.DataFrame, or "
                f"pandas.Series; got {type(y).__name__}."
            )

        # Validate shapes before any device transfer so that invalid input is
        # rejected without first copying it to the target device.
        if X.ndim != 2:
            raise ValueError("Input data tensor X must be 2-dimensional.")
        if y.ndim == 0:
            # A scalar y has no sample axis; y.shape[0] would raise IndexError.
            raise ValueError(
                "Target y must have at least one dimension, with the samples "
                "along the first axis."
            )
        if X.shape[0] != y.shape[0]:
            raise ValueError(
                f"X and y must have the same number of samples. "
                f"Got X: {X.shape[0]}, y: {y.shape[0]}"
            )

        # Handle device placement
        if device is not None:
            device = torch.device(device)
            X = X.to(device)
            y = y.to(device)

        super().__init__(X, y)

    def __getitem__(self, index):
        """Retrieve sample(s) together with the index used to fetch them.

        Args:
            index: Index passed through to ``TensorDataset.__getitem__``
                (e.g. an int or a slice).

        Returns:
            tuple: ``(X, y, index)``. A slice is returned unchanged; any other
            index is returned as a ``torch.long`` tensor (0-dimensional for a
            plain int).
        """
        # The parent's __getitem__ yields one entry per stored tensor: (X, y).
        X, y = super().__getitem__(index)

        # A slice cannot be represented as a tensor, so hand it back as-is.
        if isinstance(index, slice):
            return X, y, index
        # Returning the index as a tensor lets default collation batch it
        # alongside X and y.
        return X, y, torch.as_tensor(index, dtype=torch.long)

    @classmethod
    def from_numpy(
        cls,
        X: np.ndarray,
        y: np.ndarray,
        device: str | torch.device | None = None,
        dtype: torch.dtype = torch.float32,
    ) -> Self:
        """Create a Dataset from numpy arrays.

        Args:
            X (np.ndarray): Feature matrix of shape (n_samples, n_features).
            y (np.ndarray): Target array of shape (n_samples, ...). Can be any
                dimensionality.
            device (str or torch.device, optional): Device to place tensors on
                (e.g., 'cpu', 'cuda', 'cuda:0'). Defaults to None.
            dtype (torch.dtype, optional): Data type for tensors. Defaults to
                torch.float32.

        Returns:
            Dataset: Dataset instance with data on specified device.

        Examples:
            >>> X = np.random.randn(100, 10)
            >>> y = np.random.randn(100)
            >>> data = Dataset.from_numpy(X, y, device='cuda')
        """
        return cls(X, y, device=device, dtype=dtype)

    @classmethod
    def from_pandas(
        cls,
        X: pd.DataFrame | pd.Series,
        y: pd.DataFrame | pd.Series,
        device: str | torch.device | None = None,
        dtype: torch.dtype = torch.float32,
    ) -> Self:
        """Create a Dataset from pandas DataFrames or Series.

        Args:
            X (pd.DataFrame or pd.Series): Feature data of shape
                (n_samples, n_features).
            y (pd.DataFrame or pd.Series): Target data of shape (n_samples, ...).
                Can be any dimensionality.
            device (str or torch.device, optional): Device to place tensors on
                (e.g., 'cpu', 'cuda', 'cuda:0'). Defaults to None.
            dtype (torch.dtype, optional): Data type for tensors. Defaults to
                torch.float32.

        Returns:
            Dataset: Dataset instance with data on specified device.

        Examples:
            >>> # From separate DataFrames
            >>> df_X = pd.DataFrame({'x1': [1, 2, 3], 'x2': [4, 5, 6]})
            >>> df_y = pd.Series([7, 8, 9])
            >>> data = Dataset.from_pandas(df_X, df_y)

            >>> # From a single DataFrame using column selection
            >>> df = pd.DataFrame({'x1': [1, 2, 3], 'x2': [4, 5, 6], 'y': [7, 8, 9]})
            >>> data = Dataset.from_pandas(df[['x1', 'x2']], df['y'])

            >>> # Multi-target
            >>> df_multi = pd.DataFrame({'x1': [1, 2], 'y1': [3, 4], 'y2': [5, 6]})
            >>> data = Dataset.from_pandas(df_multi[['x1']], df_multi[['y1', 'y2']])
        """
        return cls(X, y, device=device, dtype=dtype)

    def to(self, device: str | torch.device) -> Self:
        """Move dataset to specified device.

        The returned dataset keeps the current feature dtype and the concrete
        class of ``self``.

        Args:
            device (str or torch.device): Target device (e.g., 'cpu', 'cuda',
                'cuda:0').

        Returns:
            Dataset: New Dataset instance on the specified device.

        Examples:
            >>> data = Dataset(X, y, device='cpu')
            >>> data_gpu = data.to('cuda')
            >>> print(data_gpu.device)  # cuda:0
        """
        # Pass dtype explicitly: relying on the constructor default would
        # silently cast e.g. float64 data to float32 during the move.
        return type(self)(self.X, self.y, device=device, dtype=self.dtype)

    @property
    def device(self) -> torch.device:
        """torch.device: Device where the dataset tensors are stored."""
        return self.X.device

    @property
    def dtype(self) -> torch.dtype:
        """torch.dtype: Data type of the feature tensor ``X``."""
        return self.X.dtype

    @property
    def num_samples(self):
        """int: Total number of samples in the dataset."""
        return self.tensors[0].shape[0]

    @property
    def feature_dimension(self):
        """int: Number of features in the dataset."""
        return self.tensors[0].shape[1]

    @property
    def target_dimension(self):
        """Int or tuple: Dimension(s) of the target.

        Returns 1 for 1D targets (shape (n,)). For multi-dimensional targets,
        returns a tuple of dimensions excluding the sample dimension.

        Examples:
            >>> # 1D target
            >>> y = torch.randn(100)
            >>> data = Dataset(X, y)
            >>> data.target_dimension  # 1

            >>> # 2D multi-target
            >>> y = torch.randn(100, 5)
            >>> data = Dataset(X, y)
            >>> data.target_dimension  # (5,)

            >>> # 3D target (e.g., images)
            >>> y = torch.randn(100, 3, 28, 28)
            >>> data = Dataset(X, y)
            >>> data.target_dimension  # (3, 28, 28)
        """
        if self.tensors[1].ndim == 1:
            return 1
        return self.tensors[1].shape[1:]  # Return all dimensions after batch

    @property
    def X(self):
        """Tensor: Feature matrix of shape (n_samples, n_features)."""
        return self.tensors[0]

    @property
    def y(self):
        """Tensor: Target array of shape (n_samples, ...)."""
        return self.tensors[1]

    def __repr__(self):
        """Return string representation of the dataset.

        Returns:
            str: String showing dataset dimensions, dtype, and device.
        """
        return (
            f"{self.__class__.__name__}("
            f"num_samples={self.num_samples}, "
            f"feature_dimension={self.feature_dimension}, "
            f"target_dimension={self.target_dimension}, "
            f"dtype={self.dtype}, "
            f"device={self.device})"
        )