Source code for minerva.data.readers.tabular_reader

import re
from pathlib import Path
from typing import List, Optional, Tuple, Union

import numpy as np
import pandas as pd

from minerva.data.readers.reader import _Reader



[docs]
class TabularReader(_Reader):
    def __init__(
        self,
        df: pd.DataFrame,
        columns_to_select: Union[str, List[str]],
        cast_to: Optional[str] = None,
        data_shape: Optional[Tuple[int, ...]] = None,
    ):
        """Reader to select columns from a DataFrame and return them as a NumPy
        array. The DataFrame is indexed by the row number. Each row of the
        DataFrame is considered as a sample. Thus, the __getitem__ method will
        return the columns of the DataFrame at the specified index as a NumPy
        array.

        Parameters
        ----------
        df : pd.DataFrame
            The DataFrame to select the columns from. The DataFrame should have
            the columns that are specified in the `columns_to_select` parameter.
        columns_to_select : Union[str, list[str]]
            A string or a list of strings used to select the columns from the
            DataFrame. The string can be a regular expression pattern or a
            column name. The columns that match the pattern will be selected.
            Note that if columns_to_select is a list, the result is always a
            numpy array with the columns in the same order as the list.
            If the columns_to_select is a string, the result is a numpy array
            if the selected columns are more than one, otherwise it is a single
            value (which is not a numpy array).
        cast_to : str, optional
            Cast the selected columns to the specified data type. If None, the
            data type of the columns will not be changed. (default is None)
        data_shape : tuple[int, ...], optional
            The shape of the data to be returned. If None, the data will be
            returned as a 1D array. If provided, the data will be reshaped to
            the specified shape. (default is None)
        """
        self.df = df
        self.columns_to_select = columns_to_select
        self.cast_to = cast_to
        self.data_shape = data_shape
        self.return_single = False

        if isinstance(self.columns_to_select, str):
            self.columns_to_select = [self.columns_to_select]
            self.return_single = True


[docs]
    def __getitem__(self, index: int) -> np.ndarray:
        """Return the columns of the DataFrame at the specified row index as a NumPy
        array. The columns are selected based on the `self.columns_to_select`.

        Parameters
        ----------
        index : int
            The row index to select the columns from the DataFrame.

        Returns
        -------
        np.ndarray
            The selected columns from the row as a NumPy array.
        """
        columns = list(self.df.columns)

        # Filter valid columns based on columns_to_select list
        valid_columns = []
        for pattern in self.columns_to_select:
            selected_columns = [col for col in columns if re.match(pattern, col)]
            if len(selected_columns) == 0:
                raise ValueError(f"No columns macth pattern: '{pattern}'")

            valid_columns.extend(selected_columns)

        # Select the elements and return
        row = self.df.iloc[index][valid_columns]
        row = row.to_numpy()

        if self.cast_to is not None:
            row = row.astype(self.cast_to)

        if self.data_shape is not None:
            row = row.reshape(self.data_shape)

        if self.return_single and row.shape[0] == 1:
            return row[0]

        return row



[docs]
    def __len__(self) -> int:
        """Return the number of samples in the DataFrame. The number of samples
        is equal to the number of rows in the DataFrame.

        Returns
        -------
        int
            The number of samples in the DataFrame.
        """
        return len(self.df)



[docs]
    def __str__(self) -> str:
        """Return a string representation of the TabularReader object.

        Returns
        -------
        str
            A string representation of the TabularReader object.
        """
        return f"TabularReader(df={self.df.shape}, columns_to_select={self.columns_to_select}, cast_to={self.cast_to}, data_shape={self.data_shape})"