Source code for canonical_sets.data.adult

"""Adult Data Set - UCI Machine Learning Repository."""

from typing import Dict, List, Optional

import pandas as pd
from sklearn.base import TransformerMixin
from sklearn.preprocessing import MinMaxScaler

from canonical_sets.data.base import BaseData

# Column labels, in file order, assigned to the raw (header-less) Adult CSVs.
# The last entry, "Target", is the label column; the rest are features.
# NOTE: "Martial Status" (sic) is a runtime column key -- keep the spelling
# unchanged so existing feature lists and saved CSVs keep working.
NAMES = [
    # demographic / employment attributes
    "Age", "Workclass", "fnlwgt",
    "Education", "Education-Num",
    "Martial Status", "Occupation", "Relationship",
    "Race", "Sex",
    # financial / work-time attributes
    "Capital Gain", "Capital Loss", "Hours per week",
    "Country",
    # label column
    "Target",
]


class Adult(BaseData):
    """Adult Data Set - UCI Machine Learning Repository.

    This class downloads and preprocesses the Adult dataset as a
    `pd.DataFrame`.

    Attributes
    ----------
    train_data : pd.DataFrame
        The training data.
    test_data : pd.DataFrame
        The testing data.
    train_labels : pd.DataFrame
        The training labels.
    test_labels : pd.DataFrame
        The testing labels.
    val_data : pd.DataFrame
        The validation data (``None`` when no validation split is made).
    val_labels : pd.DataFrame
        The validation labels (``None`` when no validation split is made).
    numerical_cols : List[str]
        The numerical columns.
    categorical_cols : List[str]
        The categorical columns.

    Example
    -------
    >>> adult = Adult()
    """

    train_data: pd.DataFrame
    val_data: Optional[pd.DataFrame]
    test_data: pd.DataFrame
    train_labels: Optional[pd.DataFrame]
    val_labels: Optional[pd.DataFrame]
    test_labels: Optional[pd.DataFrame]
    numerical_cols: Optional[List[str]]
    categorical_cols: Optional[List[str]]

    def __init__(
        self,
        train_path: Optional[str] = None,
        test_path: Optional[str] = None,
        download_train_path: Optional[str] = None,
        download_test_path: Optional[str] = None,
        features: Optional[List[str]] = None,
        groups: Optional[Dict[str, Dict[str, str]]] = None,
        scaler: TransformerMixin = MinMaxScaler(feature_range=(-1, 1)),
        prefix_sep: str = "+",
        val_prop: float = 0.2,
        preprocess: bool = True,
        seed: int = 1234,
    ):
        """Initialize the data.

        Parameters
        ----------
        train_path : str, optional
            The path to the training data if it is already downloaded.
        test_path : str, optional
            The path to the testing data if it is already downloaded.
            Local files are only used when both ``train_path`` and
            ``test_path`` are given; otherwise both sets are downloaded.
        download_train_path : str, optional
            The path to save the training data to (needs to end in .csv).
            The default is ``None``.
        download_test_path : str, optional
            The path to save the testing data to (needs to end in .csv).
            The default is ``None``.
        features: List[str], optional
            The features to use. The default is ``None``, in which case
            every column of ``NAMES`` except ``"Target"`` is used.
        groups: Dict[str, Dict[str, str]], optional
            The groups to use. The default is ``None``.
        scaler : sklearn.base.TransformerMixin
            Any of the ``sklearn`` preprocessing modules.
            The default is ``sklearn.preprocessing.MinMaxScaler``.
        prefix_sep : str
            The prefix separator to split the categorical feature and
            category when one-hot encoding. For example,
            Color = [Red, Green] -> Color+Red and Color+Green.
            The default is ``+``.
        val_prop: float
            The proportion of the training data to use for validation.
            The default is 0.2.
        preprocess: bool
            Whether to preprocess the data. The default is ``True``.
        seed: int
            The seed for the random state. The default is 1234.
        """
        # NOTE(review): the literal 0 is presumably the test proportion
        # (the dataset ships its own test split) -- confirm in BaseData.
        super().__init__(
            features, groups, scaler, prefix_sep, val_prop, 0, preprocess, seed
        )

        if self.features:
            self.features = features
        else:
            # default: every column except the trailing "Target" label
            self.features = list(NAMES[:-1])

        if train_path and test_path:
            train_data = pd.read_csv(train_path)
            test_data = pd.read_csv(test_path)
        else:
            train_url = (
                "https://archive.ics.uci.edu/ml/"
                "machine-learning-databases/adult/adult.data"
            )
            test_url = (
                "https://archive.ics.uci.edu/ml/"
                "machine-learning-databases/adult/adult.test"
            )

            # Parsing options shared by both raw files: no header row,
            # separators padded with whitespace, "?" marks a missing value.
            read_kwargs = dict(
                names=NAMES,
                sep=r"\s*,\s*",
                engine="python",
                na_values="?",
                header=None,
            )
            train_data = pd.read_csv(train_url, **read_kwargs)
            # the first line of the test file is skipped
            # (presumably a non-data line -- confirm against the raw file)
            test_data = pd.read_csv(test_url, skiprows=1, **read_kwargs)

            if download_train_path:
                train_data.to_csv(download_train_path, index=False)

            if download_test_path:
                test_data.to_csv(download_test_path, index=False)

        if preprocess:
            self._preprocess(train_data, test_data)
        else:
            self.train_data = train_data
            self.test_data = test_data
            self.val_data = None
            self.train_labels = None
            self.val_labels = None
            self.test_labels = None
            self.numerical_cols = None
            self.categorical_cols = None

    def _preprocess(
        self, train_data: pd.DataFrame, test_data: pd.DataFrame
    ) -> None:
        """Preprocess the data.

        Drops rows with missing values, one-hot encodes the target,
        restricts the columns to ``self.features``, optionally applies
        ``self.groups``, optionally splits off a validation set, then
        scales the numerical columns and one-hot encodes the categorical
        ones. The results are stored on the instance.

        Parameters
        ----------
        train_data: pd.DataFrame
            The training data.
        test_data: pd.DataFrame
            The testing data.

        Returns
        -------
        None.
        """
        # drop NA (on copies, so the frames passed in are left untouched)
        train_data = train_data.dropna().reset_index(drop=True)
        test_data = test_data.dropna().reset_index(drop=True)
        n_raw_train = len(train_data)

        df = pd.concat([train_data, test_data])

        # get labels, drop target (erase unnecessary "." in test data).
        # The replacement must be literal: as a regex, "." matches every
        # character and would wipe the whole label.
        df["Target"] = df["Target"].str.replace(".", "", regex=False)
        labels = pd.get_dummies(df["Target"])
        df = df.drop("Target", axis=1)

        # drop columns
        df = df[self.features]

        # create groups
        if self.groups is not None:
            df = self._create_groups(df, self.groups)

        # split data back into the original train / test partitions
        x_train = df.iloc[:n_raw_train]
        y_train = labels.iloc[:n_raw_train]
        x_test = df.iloc[n_raw_train:]
        y_test = labels.iloc[n_raw_train:]

        has_val = self.val_prop > 0
        if has_val:
            x_train, x_val, y_train, y_val = self._split_data(
                x_train, y_train, self.val_prop
            )
            data = pd.concat([x_train, x_val, x_test])
        else:
            data = pd.concat([x_train, x_test])

        # one-hot encode categorical columns (on all rows at once, in
        # train / val / test order, so every split gets the same columns)
        dummies = self._one_hot_encode(data)

        # scale numerical columns (scaler is fitted on the training rows)
        scaled_train_data = self._scale_numeric(x_train, fit_scaler=True)
        scaled_test_data = self._scale_numeric(x_test)
        if has_val:
            scaled_val_data = self._scale_numeric(x_val)

        # merge the pre-processed data and attribute to self
        n_train = len(x_train)
        self.train_data = scaled_train_data.join(dummies[:n_train])
        self.train_labels = y_train
        self.test_labels = y_test

        if has_val:
            val_end = n_train + len(x_val)
            self.val_data = scaled_val_data.join(dummies[n_train:val_end])
            self.test_data = scaled_test_data.join(dummies[val_end:])
            self.val_labels = y_val
        else:
            self.test_data = scaled_test_data.join(dummies[n_train:])
            # no validation split: make the Optional attributes explicit
            self.val_data = None
            self.val_labels = None