Stroke Prediction Dataset Analysis and Visualization

Data Set Characteristics:

Name:

Stroke Prediction Dataset from Kaggle: https://www.kaggle.com/fedesoriano/stroke-prediction-dataset

Potential Goal:

This dataset is used to predict whether a patient is likely to have a stroke based on input parameters such as gender, age, various diseases, and smoking status. Each row in the data provides relevant information about a patient.

:Features: 12 clinical features

:Number of Instances: 5110

:Number of Attributes: 12 numeric/categorical predictive. 

:Attribute Information (in order):

    1) id: unique identifier
    2) gender: "Male", "Female" or "Other"
    3) age: age of the patient
    4) hypertension: 0 if the patient doesn't have hypertension, 1 if the patient has hypertension
    5) heart_disease: 0 if the patient doesn't have any heart diseases, 1 if the patient has a heart disease
    6) ever_married: "No" or "Yes"
    7) work_type: "children", "Govt_job", "Never_worked", "Private" or "Self-employed"
    8) Residence_type: "Rural" or "Urban"
    9) avg_glucose_level: average glucose level in blood
    10) bmi: body mass index
    11) smoking_status: "formerly smoked", "never smoked", "smokes" or "Unknown"*
    12) stroke: 1 if the patient had a stroke or 0 if not

    *Note: "Unknown" in smoking_status means that the information is unavailable for this patient.


Import Libraries

Constants

Load healthcare stroke dataset
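
A minimal sketch of what these three cells might contain, assuming the CSV from the Kaggle page above has been downloaded locally; the file name and the RANDOM_STATE value are assumptions, not taken from the original notebook:

```python
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# Constants (assumed values; adjust to your local setup)
DATA_PATH = "healthcare-dataset-stroke-data.csv"  # CSV downloaded from Kaggle
RANDOM_STATE = 42                                 # reproducible sampling/splits

# Load the healthcare stroke dataset
df = pd.read_csv(DATA_PATH)
print(df.shape)  # expected: (5110, 12)
```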

Data preprocessing:

Remove rows with missing values, and remove "Other" from gender, since there is only one row with gender = "Other".

Add a label column with the value "yes" if stroke = 1 and "no" if stroke = 0.
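
A sketch of these preprocessing steps, using the column names from the attribute list above:

```python
# Drop rows with missing values (bmi contains "N/A" entries,
# which pandas parses as NaN by default)
df = df.dropna()

# Drop the single observation with gender == "Other"
df = df[df["gender"] != "Other"]

# Add a readable label column derived from the stroke flag
df["label"] = np.where(df["stroke"] == 1, "yes", "no")
```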

Summarize data
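
For example, pandas' built-in summaries cover this step:

```python
print(df.describe())               # statistics for the numeric features
print(df["label"].value_counts())  # class balance: stroke yes vs. no
```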

Correlation matrix:

We will focus on the following subset of three features (a heatmap sketch follows the list):
    1. Age
    2. Average glucose
    3. BMI
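
A minimal sketch of the correlation matrix over this three-feature subset, drawn with seaborn's heatmap:

```python
# Pearson correlations between the three numeric features
corr = df[["age", "avg_glucose_level", "bmi"]].corr()

sns.heatmap(corr, annot=True, cmap="coolwarm", vmin=-1, vmax=1)
plt.title("Correlation matrix: age, avg_glucose_level, bmi")
plt.show()
```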

Inferences

Pairwise plots for stroke = 0 and stroke = 1.
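
A sketch using seaborn's pairplot, with the hue channel separating the two stroke classes:

```python
# Pairwise scatter plots of the three features, colored by stroke outcome
sns.pairplot(
    df,
    vars=["age", "avg_glucose_level", "bmi"],
    hue="stroke",        # stroke = 0 vs. stroke = 1
    diag_kind="kde",     # density curves on the diagonal
)
plt.show()
```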

Analyze features

The following 7 features are taken into account for the analysis and a feature subset is created from them (subset creation is sketched together with the encoding step below):

    1) gender: "Male", "Female" or "Other"
    2) age: age of the patient
    3) hypertension: 0 if the patient doesn't have hypertension, 1 if the patient has hypertension
    4) heart_disease: 0 if the patient doesn't have any heart diseases, 1 if the patient has a heart disease
    5) avg_glucose_level: average glucose level in blood
    6) bmi: body mass index
    7) smoking_status: "formerly smoked", "never smoked", "smokes" or "Unknown"*

Encode categorical data

The stroke dataset has both numeric and categorical features, so the categorical features need to be converted into numerical ones using a method like one-hot encoding.
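
A sketch of the subset creation and encoding; using pandas get_dummies here is an assumed implementation of one-hot encoding, and drop_first is an assumed choice to avoid redundant dummy columns:

```python
FEATURES = ["gender", "age", "hypertension", "heart_disease",
            "avg_glucose_level", "bmi", "smoking_status"]

X = df[FEATURES]
y = df["stroke"]

# One-hot encode the two categorical columns in the subset
X = pd.get_dummies(X, columns=["gender", "smoking_status"], drop_first=True)
```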

Balance dataset

The stroke prediction dataset is highly imbalanced: there are only 209 observations with stroke = 1 and 4700 observations with stroke = 0. A balanced sample dataset is created by combining all 209 observations with stroke = 1 with a 10% random sample of the 4700 observations with stroke = 0. The resulting sample dataset is then split into training and test sets. Different classifiers are trained on the training set, applied to the sample test set, and then to the whole dataset excluding the training set.
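
A sketch of this sampling scheme; the 70/30 split ratio is an assumption:

```python
# All stroke-positive rows plus a 10% random sample of stroke-negative rows
pos = df[df["stroke"] == 1]                                    # 209 rows
neg = df[df["stroke"] == 0].sample(frac=0.10, random_state=RANDOM_STATE)
sample = pd.concat([pos, neg])

X_sample = pd.get_dummies(sample[FEATURES],
                          columns=["gender", "smoking_status"],
                          drop_first=True)
y_sample = sample["stroke"]

X_train, X_test, y_train, y_test = train_test_split(
    X_sample, y_sample, test_size=0.3, random_state=RANDOM_STATE)
```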

Evaluation metrics

Provided by Nabeel Shaikh (github.com/nshaikh99)

Accuracy tells us how often our model issues a correct prediction. This is accomplished by dividing the sum of the number of true positives and the number of true negatives by the number of total predictions. We value maximizing accuracy when we want to maximize the number of true positives and the number of true negatives.

Recall (TPR or sensitivity) is the fraction of positive labels predicted correctly. It tells us how often our model doesn't issue a false negative prediction. This is accomplished by dividing the number of true positives by the sum of the number of true positives and the number of false negatives. We value maximizing recall when we want to minimize the number of false negatives. For example, we would want to maximize recall when predicting if someone has a life-threatening disease.

Precision tells us how often our model doesn't issue a false positive prediction. This is accomplished by dividing the number of true positives by the sum of the number of true positives and the number of false positives. We value maximizing precision when we want to minimize the number of false positives. For example, we would want to maximize precision when predicting if someone is who they say they are when opening a credit card.

F1 score tells us how often our model doesn't issue a false positive or false negative prediction. This is accomplished by dividing two times recall times precision by the sum of recall and precision. We value maximizing F1 score when we want to minimize both the number of false positives and the number of false negatives. For example, we would want to maximize F1 score when predicting if someone committed a crime.
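
All four metrics map directly onto scikit-learn helpers; a small reporting function in that spirit (the function name is our own, used in the sketches below):

```python
from sklearn.metrics import (accuracy_score, recall_score,
                             precision_score, f1_score)

def report(y_true, y_pred):
    """Print the four evaluation metrics described above."""
    print(f"accuracy : {accuracy_score(y_true, y_pred):.3f}")   # (TP+TN)/total
    print(f"recall   : {recall_score(y_true, y_pred):.3f}")     # TP/(TP+FN)
    print(f"precision: {precision_score(y_true, y_pred):.3f}")  # TP/(TP+FP)
    print(f"f1 score : {f1_score(y_true, y_pred):.3f}")         # 2PR/(P+R)
```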

KNN Classifier

Take k = 3, 5, 7, 9, 11. Split the dataset into training and test sets. For each k, train the k-NN classifier on Xtrain and compute its accuracy on Xtest.
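
A sketch of the k sweep described above:

```python
from sklearn.neighbors import KNeighborsClassifier

for k in (3, 5, 7, 9, 11):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    acc = knn.score(X_test, y_test)   # mean accuracy on the test set
    print(f"k = {k:2d}: accuracy = {acc:.3f}")
```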

Logistic Regression Classifier

Decision Tree Classifier

Random Forest Classifier
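
A compact sketch covering these three classifiers; the hyperparameters shown (max_iter, random_state) are assumptions, and everything else is scikit-learn defaults:

```python
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

classifiers = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(random_state=RANDOM_STATE),
    "Random Forest": RandomForestClassifier(random_state=RANDOM_STATE),
}

for name, clf in classifiers.items():
    clf.fit(X_train, y_train)
    print(name)
    report(y_test, clf.predict(X_test))  # metrics helper defined above
```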

Summary of performance measures using the different classifiers on the whole dataset with the selected feature subset:

Although the K-NN, Logistic Regression, and Random Forest classifiers provide the highest overall prediction accuracies (82.5% and 85.7%), Logistic Regression gives the highest recall/TPR = 0.8 along with TNR = 0.8. Here TPR represents the accuracy of predicting patients who had a stroke, and TNR the accuracy of predicting patients who did not. We want to maximize recall when predicting whether someone could have a stroke; recall/TPR is especially important given the highly imbalanced dataset, in which only around 4% of observations are patients who had a stroke. That is why the Logistic Regression classifier performs best among the four classifiers for stroke prediction.

Feature selection: dropping the gender feature

The gender feature is dropped from the dataset to see its impact on prediction accuracy. Classifiers are trained on the "truncated" training set using the remaining features and are used to predict labels on the test set.
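
A sketch of the truncated training set, assuming gender was one-hot encoded into columns prefixed with "gender":

```python
# Drop every one-hot column derived from gender, then retrain
gender_cols = [c for c in X_train.columns if c.startswith("gender")]
X_train_trunc = X_train.drop(columns=gender_cols)
X_test_trunc = X_test.drop(columns=gender_cols)

for name, clf in classifiers.items():
    clf.fit(X_train_trunc, y_train)
    print(name)
    report(y_test, clf.predict(X_test_trunc))
```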

Accuracy of prediction when the age feature is removed from the dataset

Removing the age feature caused the largest loss of TPR (recall). Hence, the age feature plays a significant role in stroke prediction.

Explaining predictions with the lime package

The LIME algorithm is used to explain the predictions of the Logistic Regression classifier. LIME explains a prediction by approximating it locally with an interpretable model.

Analyze features

10 clinical features are used:

1) gender: "Male", "Female" or "Other"
2) age: age of the patient
3) hypertension: 0 if the patient doesn't have hypertension, 1 if the patient has hypertension
4) heart_disease: 0 if the patient doesn't have any heart diseases, 1 if the patient has a heart disease
5) ever_married: "No" or "Yes"
6) work_type: "children", "Govt_job", "Never_worked", "Private" or "Self-employed"
7) Residence_type: "Rural" or "Urban"
8) avg_glucose_level: average glucose level in blood
9) bmi: body mass index
10) smoking_status: "formerly smoked", "never smoked", "smokes" or "Unknown"*

Data preprocessing

Create a copy of the original dataframe and remove the id feature and the stroke label from the dataset.

Encode categorical data

Balance dataset

As in the earlier section, the dataset is balanced by combining all 209 observations with stroke = 1 with a 10% random sample of the 4700 observations with stroke = 0, and the resulting sample is split into training and test sets.

Logistic Regression

LIME explanation of prediction of logistic regression classifier

The LIME explanation provides intuition into the inner workings of machine learning algorithms: which features are being used to arrive at a prediction. It assigns a weight to each feature based on its contribution to the prediction of a label within the local structure of the data.
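
A minimal sketch of this workflow with the lime package, assuming the encoded, balanced train/test split constructed above and a freshly fitted Logistic Regression model:

```python
from lime.lime_tabular import LimeTabularExplainer
from sklearn.linear_model import LogisticRegression

log_reg = LogisticRegression(max_iter=1000).fit(X_train, y_train)

explainer = LimeTabularExplainer(
    X_train.values.astype(float),   # dummies may be bool in newer pandas
    feature_names=list(X_train.columns),
    class_names=["no stroke", "stroke"],
    mode="classification",
)

i = 0  # arbitrary test instance to explain
exp = explainer.explain_instance(
    X_test.values[i].astype(float), log_reg.predict_proba, num_features=10)
print(exp.as_list())       # (feature condition, weight) pairs
# exp.show_in_notebook()   # interactive view inside Jupyter
```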