Source code for canonical_sets.data.base

"""Base class for data sets."""

from typing import Dict, List, Optional, Tuple

import joblib
import pandas as pd
import torch
from sklearn.base import TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from torch.utils.data import Dataset


[docs]class BaseData: """Base class for data sets. This is a base class from which all data sets inherit. Attributes ---------- train_data : pd.DataFrame The training data. test_data : pd.DataFrame The testing data. train_labels : pd.DataFrame The training labels. test_labels : pd.DataFrame The testing labels. val_data : pd.DataFrame The validation data. val_labels : pd.DataFrame The validation labels. numerical_cols : List[str] The numerical columns. categorical_cols : List[str] The categorical columns. """ train_data: pd.DataFrame val_data: Optional[pd.DataFrame] test_data: Optional[pd.DataFrame] train_labels: pd.DataFrame val_labels: Optional[pd.DataFrame] test_labels: Optional[pd.DataFrame] numerical_cols: Optional[List[str]] categorical_cols: Optional[List[str]] def __init__( self, features: Optional[List[str]] = None, groups: Optional[Dict[str, Dict[str, str]]] = None, scaler: TransformerMixin = MinMaxScaler(feature_range=(-1, 1)), prefix_sep: str = "+", val_prop: float = 0.2, test_prop: float = 0.2, preprocess: bool = True, seed: int = 1234, ): """Initialize the data. Parameters ---------- features: List[str], optional The features to use. The default is ``None``. groups: Dict[str, Dict[str, str]], optional The groups to use. The default is ``None``. scaler : sklearn.base.TransformerMixin Any of the ``sklearn`` preprocessing modules for the numerical features. The default is ``sklearn.preprocessing.MinMaxScaler``. prefix_sep : str The prefix separator to split the categorical feature and category when one-hot encoding. For example, Color = [Red, Green] -> Color+Red and Color+Green. The default is ``+``. val_prop: float The proportion of the training data to use for validation. The default is 0.2. test_prop: float The proportion of the training data to use for testing. The default is 0.2. preprocess: bool Whether to preprocess the data. The default is ``True``. seed: int The seed for the random state. The default is 1234. Raises ------ ValueError Proportions must be between [0, 1). """ if val_prop >= 1 or test_prop >= 1 or val_prop < 0 or test_prop < 0: raise ValueError("Proportions must be between [0, 1).") self.features = features self.groups = groups self.scaler = scaler self.prefix_sep = prefix_sep self.val_prop = val_prop self.test_prop = test_prop self.preprocess = preprocess self.seed = seed self.train_data = pd.DataFrame() self.val_data = None self.test_data = None self.train_labels = pd.DataFrame() self.val_labels = None self.test_labels = None self.numerical_cols = None self.categorical_cols = None
[docs] @classmethod def load(cls, path): """Load the data. Parameters ---------- path: str The path to load the data from (needs to end in .pkl). """ return joblib.load(path)
[docs] def save(self, path: str) -> None: """Save the object. Parameters ---------- path: str The path to save the object (needs to end in .pkl). """ joblib.dump(self, path)
[docs] def inverse_preprocess(self, data: pd.DataFrame) -> pd.DataFrame: """Inverse preprocess the data. Parameters ---------- data: pd.DataFrame The data to inverse preprocess. Returns ------- pd.DataFrame The inverse preprocessed data. """ numbers = self._inverse_scale_numeric(data) dummies = self._inverse_ohe(data) return numbers.join(dummies)
def _create_groups( self, data: pd.DataFrame, groups: Dict[str, Dict[str, str]] ) -> pd.DataFrame: """Create groups for the data. Parameters ---------- data: pd.DataFrame The data to group. groups: Dict[str, Dict[str, str]] The groups to use. The default is ``None``. Returns ------- df: pd.DataFrame The grouped data. """ df = data.replace(groups) return df def _split_data( self, data: pd.DataFrame, labels: pd.DataFrame, prop: float = 0.2, ) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]: """Split the data. Parameters ---------- data: pd.DataFrame The data to split. labels: pd.DataFrame The labels to split. prop: int The proportion of the data to use. The default is 0.2. Returns ------- x_data: pd.DataFrame The x data. y_data: pd.DataFrame The y data. x_labels: pd.DataFrame The x labels. y_labels: pd.DataFrame The y labels. """ x_data, y_data, x_labels, y_labels = train_test_split( data, labels, test_size=prop, random_state=self.seed ) x_data.reset_index(drop=True, inplace=True) y_data.reset_index(drop=True, inplace=True) x_labels.reset_index(drop=True, inplace=True) y_labels.reset_index(drop=True, inplace=True) return x_data, y_data, x_labels, y_labels def _one_hot_encode(self, data: pd.DataFrame) -> pd.DataFrame: """One-hot encode the data. Parameters ---------- data: pd.DataFrame The data to one-hot encode. Returns ------- data: pd.DataFrame The one-hot encoded data. """ categories = data.select_dtypes(exclude="number") if categories.empty: return pd.DataFrame(index=data.index) self.categorical_cols = categories.columns.to_list() dummies = pd.get_dummies( categories, prefix=self.categorical_cols, prefix_sep=self.prefix_sep, ) return dummies def _scale_numeric( self, data: pd.DataFrame, fit_scaler: bool = False ) -> pd.DataFrame: """Scale the numeric data. Parameters ---------- data: pd.DataFrame The data to scale. fit_scaler: bool Whether to fit the scaler. The default is ``False``. Returns ------- data: pd.DataFrame The scaled data. """ numerical_data = data.select_dtypes(include="number") if numerical_data.empty: return pd.DataFrame(index=data.index) if fit_scaler: self.scaler.fit(numerical_data) self.numerical_cols = numerical_data.columns.to_list() scaled_data = pd.DataFrame( self.scaler.transform(numerical_data), columns=self.numerical_cols, ) return scaled_data def _inverse_ohe(self, data: pd.DataFrame) -> pd.DataFrame: """Inverse one-hot encode the data. Parameters ---------- data: pd.DataFrame The data to inverse one-hot encode. Returns ------- undummified_df: pd.DataFrame The inverse one-hot encoded data. """ df = data.select_dtypes(include="uint8") if df.empty or self.categorical_cols is None: return pd.DataFrame(index=data.index) cols2collapse = { item.split(self.prefix_sep)[0]: (self.prefix_sep in item) for item in df.columns } series_list = [] for col, needs_to_collapse in cols2collapse.items(): if needs_to_collapse & (col in self.categorical_cols): undummified = ( df.filter(like=col) .idxmax(axis=1) .apply(lambda x: x.split(self.prefix_sep, maxsplit=1)[1]) .rename(col) ) series_list.append(undummified) else: series_list.append(df[col]) undummified_df = pd.concat(series_list, axis=1) return undummified_df def _inverse_scale_numeric(self, data: pd.DataFrame) -> pd.DataFrame: """Inverse scale the numeric data. Parameters ---------- data: pd.DataFrame The data to inverse scale. Returns ------- pd.DataFrame The inverse scaled data. """ df = data.select_dtypes(include="float64") if df.empty: return pd.DataFrame(index=data.index) return pd.DataFrame( self.scaler.inverse_transform(df), columns=df.columns )
[docs]class DataSet(Dataset): """The dataset class for the PyTorch dataloader.""" def __init__(self, data: pd.DataFrame, labels: pd.DataFrame): """Initialize the dataset. Parameters ---------- data: pd.DataFrame The data. labels: pd.DataFrame The labels. """ self.data = data self.labels = labels def __len__(self) -> int: return len(self.labels) def __getitem__(self, idx): x = torch.Tensor(self.data.iloc[idx]) y = torch.Tensor(self.labels.iloc[idx]) return x, y