# Source code for canonical_sets.data.compas

"""Compas Data Set - ProPublica."""

from typing import Dict, List, Optional

import pandas as pd
from sklearn.base import TransformerMixin
from sklearn.preprocessing import MinMaxScaler

from canonical_sets.data.base import BaseData


class Compas(BaseData):
    """Compas Data Set - ProPublica.

    This class downloads and preprocesses the Compas dataset as a
    `pd.DataFrame`.

    Attributes
    ----------
    train_data : pd.DataFrame
        The training data (the raw, unprocessed data when
        ``preprocess=False``).
    test_data : Optional[pd.DataFrame]
        The testing data, or ``None`` when no test split is made.
    train_labels : Optional[pd.DataFrame]
        The training labels, or ``None`` when ``preprocess=False``.
    test_labels : Optional[pd.DataFrame]
        The testing labels, or ``None`` when no test split is made.
    val_data : Optional[pd.DataFrame]
        The validation data, or ``None`` when no validation split is made.
    val_labels : Optional[pd.DataFrame]
        The validation labels, or ``None`` when no validation split is made.
    numerical_cols : Optional[List[str]]
        The numerical columns. Set to ``None`` when ``preprocess=False``;
        otherwise presumably populated by the inherited helpers.
    categorical_cols : Optional[List[str]]
        The categorical columns. Set to ``None`` when ``preprocess=False``;
        otherwise presumably populated by the inherited helpers.

    Example
    -------
    >>> compas = Compas()
    """

    train_data: pd.DataFrame
    val_data: Optional[pd.DataFrame]
    test_data: Optional[pd.DataFrame]
    train_labels: Optional[pd.DataFrame]
    val_labels: Optional[pd.DataFrame]
    test_labels: Optional[pd.DataFrame]
    numerical_cols: Optional[List[str]]
    categorical_cols: Optional[List[str]]

    def __init__(
        self,
        path: Optional[str] = None,
        download_path: Optional[str] = None,
        features: Optional[List[str]] = None,
        groups: Optional[Dict[str, Dict[str, str]]] = None,
        scaler: Optional[TransformerMixin] = None,
        prefix_sep: str = "+",
        val_prop: float = 0.2,
        test_prop: float = 0.2,
        preprocess: bool = True,
        seed: int = 1234,
    ):
        """Initialize the data.

        Parameters
        ----------
        path : Optional[str]
            The path to the data if it is already downloaded. When omitted,
            the data is read from the ProPublica GitHub repository.
        download_path : Optional[str]
            The path to save the data to (needs to end in .csv).
            The default is ``None``.
        features : List[str], optional
            The features to use. The default is ``None``, which selects
            ``c_charge_degree``, ``race``, ``age_cat``, ``sex`` and
            ``priors_count``.
        groups : Dict[str, Dict[str, str]], optional
            The groups to use. The default is ``None``.
        scaler : sklearn.base.TransformerMixin, optional
            Any of the ``sklearn`` preprocessing modules. The default is
            ``None``, which creates a new
            ``sklearn.preprocessing.MinMaxScaler(feature_range=(-1, 1))``
            for this instance.
        prefix_sep : str
            The prefix separator to split the categorical feature and
            category when one-hot encoding. For example,
            Color = [Red, Green] -> Color+Red and Color+Green.
            The default is ``+``.
        val_prop : float
            The proportion of the data remaining after the test split to
            use for validation. The default is 0.2.
        test_prop : float
            The proportion of the cleaned data to use for testing.
            The default is 0.2.
        preprocess : bool
            Whether to preprocess the data. The default is ``True``.
        seed : int
            The seed for the random state. The default is 1234.
        """
        # Build the default scaler per instance: a scaler created in the
        # signature would be one shared object, refit by every instance.
        if scaler is None:
            scaler = MinMaxScaler(feature_range=(-1, 1))

        super().__init__(
            features,
            groups,
            scaler,
            prefix_sep,
            val_prop,
            test_prop,
            preprocess,
            seed,
        )

        # NOTE(review): this reads ``self.features``, so the base
        # initializer presumably stores ``features`` on the instance.
        if self.features:
            self.features = features
        else:
            self.features = [
                "c_charge_degree",
                "race",
                "age_cat",
                "sex",
                "priors_count",
            ]

        if path:
            data = pd.read_csv(path)
        else:
            url = (
                "https://raw.githubusercontent.com/propublica/"
                "compas-analysis/master/compas-scores-two-years.csv"
            )
            data = pd.read_csv(url)

        if download_path:
            data.to_csv(download_path, index=False)

        if preprocess:
            self._preprocess(data)
        else:
            # Expose the raw frame untouched; nothing else is derived.
            self.train_data = data
            self.test_data = None
            self.val_data = None
            self.train_labels = None
            self.val_labels = None
            self.test_labels = None
            self.numerical_cols = None
            self.categorical_cols = None

    def _preprocess(self, data: pd.DataFrame) -> None:
        """Preprocess the data.

        Cleans the raw frame, extracts one-hot labels from
        ``two_year_recid``, keeps only ``self.features``, optionally creates
        groups, splits into train/validation/test, one-hot encodes and
        scales, and stores the results on the instance. Splits that are not
        requested (proportion ``<= 0``) are stored as ``None``.

        Parameters
        ----------
        data : pd.DataFrame
            The data.

        Returns
        -------
        None.
        """
        # cleaning data like in the original ProPublica paper
        screening_days = data["days_b_screening_arrest"]
        score_text = data["score_text"]
        keep = (
            (screening_days <= 30)
            & (screening_days >= -30)
            & (data["is_recid"] != -1)
            & (data["c_charge_degree"] != "O")
            & (score_text != "N/A")
            # ``pd.read_csv`` parses "N/A" as NaN by default, and
            # ``NaN != "N/A"`` is True, so missing scores need their own test.
            & score_text.notna()
        )
        df = data.loc[keep, :].reset_index(drop=True)

        # get labels
        labels = pd.get_dummies(df["two_year_recid"])
        df = df.drop(columns="two_year_recid")

        # drop columns
        df = df[self.features]

        # create groups
        if self.groups is not None:
            df = self._create_groups(df, self.groups)

        # split data: test is carved out first, validation from the rest
        use_val = self.val_prop > 0
        use_test = self.test_prop > 0

        x_train, y_train = df, labels
        x_val = y_val = None
        x_test = y_test = None

        if use_test:
            x_train, x_test, y_train, y_test = self._split_data(
                x_train, y_train, self.test_prop
            )
        if use_val:
            x_train, x_val, y_train, y_val = self._split_data(
                x_train, y_train, self.val_prop
            )

        # Stack train, then validation, then test; the positional slices
        # below rely on this order.
        parts = [x_train]
        if use_val:
            parts.append(x_val)
        if use_test:
            parts.append(x_test)
        data = pd.concat(parts)

        # one-hot encode categorical columns
        dummies = self._one_hot_encode(data)

        # scale numerical columns (the scaler is fit on the training split)
        scaled_train_data = self._scale_numeric(x_train, fit_scaler=True)
        scaled_val_data = self._scale_numeric(x_val) if use_val else None
        scaled_test_data = self._scale_numeric(x_test) if use_test else None

        # merge the pre-processed data and attribute to self
        n_train = len(x_train)
        n_val = len(x_val) if use_val else 0
        val_end = n_train + n_val

        self.train_data = scaled_train_data.join(dummies.iloc[:n_train])
        self.train_labels = y_train

        if use_val:
            self.val_data = scaled_val_data.join(
                dummies.iloc[n_train:val_end]
            )
            self.val_labels = y_val
        else:
            self.val_data = None
            self.val_labels = None

        if use_test:
            self.test_data = scaled_test_data.join(dummies.iloc[val_end:])
            self.test_labels = y_test
        else:
            self.test_data = None
            self.test_labels = None