# Import modules
import pandas as pd
import xgboost as xgb
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import cross_val_score, RandomizedSearchCV
from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Imputer, StandardScaler
from sklearn_pandas import DataFrameMapper, CategoricalImputer
# Create list of column names for kidney data: kidney_cols
# The last entry, 'label', is the target; every other entry is a feature.
kidney_cols = ['age', 'bp', 'sg', 'al', 'su', 'rbc', 'pc', 'pcc', 'ba', 'bgr',
               'bu', 'sc', 'sod', 'pot', 'hemo', 'pcv', 'wc', 'rc', 'htn', 'dm',
               'cad', 'appet', 'pe', 'ane', 'label']
# Load dataset: df_kidney
# Column labels come from kidney_cols rather than from a header row.
# NOTE(review): the original call was cut off after `names=kidney_cols,`.
# na_values='?' is assumed to be the lost argument (the imputers applied later
# imply the file marks missing entries somehow) -- confirm against the CSV.
df_kidney = pd.read_csv('chronic_kidney_disease.csv', names=kidney_cols,
                        na_values='?')
# Replace label values with 0 (ckd) and 1 (notckd)
# The result is assigned back to the column: calling replace(inplace=True) on
# the selection df_kidney['label'] may modify a temporary object and leave
# df_kidney itself untouched.
df_kidney['label'] = df_kidney['label'].replace({'ckd': 0, 'notckd': 1})
# Define X and y: X, y
# X is every column except the last one; y is the label column as an array.
X, y = df_kidney.iloc[:, :-1], df_kidney['label'].values
# Define new column order for X: col_order
# Same feature names as before, with rbc, pc, pcc and ba moved behind rc.
col_order = [
    'age', 'bp', 'sg', 'al', 'su',
    'bgr', 'bu', 'sc', 'sod', 'pot',
    'hemo', 'pcv', 'wc', 'rc',
    'rbc', 'pc', 'pcc', 'ba',
    'htn', 'dm', 'cad', 'appet', 'pe', 'ane',
]
# Rearrange columns of X
X = X[col_order]
# Create a boolean mask for categorical columns
# A column counts as categorical when pandas stores it with dtype object.
categorical_feature_mask = X.dtypes == object
# Get a list of categorical column names
categorical_columns = [
    col_name
    for col_name, is_categorical in zip(X.columns, categorical_feature_mask)
    if is_categorical
]
# Get a list of non-categorical column names
non_categorical_columns = [
    col_name
    for col_name, is_categorical in zip(X.columns, categorical_feature_mask)
    if not is_categorical
]
# Create empty list to hold column imputers: transformers
# Each entry is a (column selector, [transformers applied in order]) pair.
transformers = []
# Create numeric imputers and add to list of transformers
# Every non-categorical column gets median imputation followed by
# standardisation. The selector is the one-element list [num_col]; per the
# sklearn-pandas docs a list selector passes the column as a 2-D array.
transformers.extend(
    [([num_col], [Imputer(strategy='median'), StandardScaler()])
     for num_col in non_categorical_columns]
)
# Create categorical imputers and add to list of transformers
# The original comprehension was cut off after `in`; categorical_columns is
# the list built for exactly this purpose and is restored as the iterable.
transformers.extend(
    [(cat_col, [CategoricalImputer()]) for cat_col in categorical_columns]
)
# Use list of transformers to create a DataFrameMapper object
# NOTE(review): the original call was cut off after `input_df=True,`.
# df_out=True is restored because the following pipeline step (Dictifier)
# calls .to_dict('records') on the mapper output, which needs a DataFrame.
numeric_categorical_union = DataFrameMapper(transformers, input_df=True,
                                            df_out=True)
# Define Dictifier class to turn a DataFrame into a list of row dictionaries as part of the pipeline
class Dictifier(BaseEstimator, TransformerMixin):
    """Pipeline step that converts a DataFrame into a list of row dicts.

    The step has no parameters and learns nothing from the data; it only
    changes the container type of what flows through the pipeline.
    """

    def fit(self, X, y=None):
        """Do nothing and return ``self``.

        ``X`` and ``y`` are accepted only to match the estimator ``fit``
        signature; neither is read.
        """
        return self

    def transform(self, X):
        """Return ``X.to_dict('records')``.

        ``X`` must provide a pandas-style ``to_dict`` method. The result is
        a list with one ``{column: value}`` dict per row of ``X``.
        """
        return X.to_dict('records')
# Create full pipeline
# Steps run in the order listed:
#   featureunion - the per-column mapper held in numeric_categorical_union
#   dictifier    - turns the mapper output into a list of row dicts
#   vectorizer   - turns the row dicts into a feature matrix; sort=False
#                  leaves the feature names unsorted
#   clf          - XGBoost classifier limited to trees of depth 3
# Pipeline and DictVectorizer must be imported at the top of the file
# (sklearn.pipeline and sklearn.feature_extraction respectively).
pipeline = Pipeline([
    ('featureunion', numeric_categorical_union),
    ('dictifier', Dictifier()),
    ('vectorizer', DictVectorizer(sort=False)),
    ('clf', xgb.XGBClassifier(max_depth=3)),
])
# Perform cross-validation
# 3-fold cross-validation scored by ROC AUC; the result holds one score per fold.
cross_val_scores = cross_val_score(pipeline, X, y, scoring='roc_auc', cv=3)