# -*- coding: utf-8 -*-
"""3_nust_datamining_heartdisease.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1SV5YapbfS2RlqEXYBoCC6rYQYgsEjcbX
"""

#import libraries
import pandas as pd                           #data
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns                         #data visualization
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

#Upload files
#from google.colab import files
#f = files.upload()

# Read the heart-disease CSV into the working dataframe and preview the first rows
dataset_path = '/content/HeartDiseasePrediction.csv'
df = pd.read_csv(dataset_path)
df.head()

# HANDLING MISSING VALUES

# Column dtypes and non-null counts per column
df.info()

# Total number of missing cells across the whole dataframe
print("missing values")
df.isna().sum().sum()

# Summary statistics for every column (numeric and categorical) to look for trends

df.describe(include='all')

#We see that most patients are Male, representing more than 75% of data
#Thus our analysis could be biased towards Male patients but lets dig deeper

#HANDLING OUTLIERS

# The categorical (object) columns are inspected first
# Sex: bar chart of the category counts, shown in the order M, F
sex_counts = df['Sex'].value_counts()
sex_counts.loc[['M', 'F']].plot(kind='bar')

# Female patients are fewer than male patients, but they cannot be termed outliers

# Next categorical column
# ChestPainType: bar chart of the category counts
chest_pain_counts = df['ChestPainType'].value_counts()
chest_pain_counts.loc[['ASY', 'ATA', 'NAP', 'TA']].plot(kind='bar')

# Only a few rows are TA; compute their share of all rows to judge whether they are outliers

df['ChestPainType'].eq("TA").sum() / len(df)

#We can clearly see that TA are around 5%, so they can be termed as outliers
#But let's dig deeper to see if we can find any other insights

# Rows whose ChestPainType is TA, and their summary statistics
df_ChestPainTypeTA = df[df['ChestPainType'] == 'TA']
df_ChestPainTypeTA.describe(include='all')

# Comparing the mean and median above with those of the entire dataset,
# all columns look almost the same except HeartDisease:
# its median is 0 for TA but 1 for the whole dataset.
# So compute the share of 0s and 1s in HeartDisease for every ChestPainType value.

# ChestPainType TA: fraction without / with heart disease
print("TA")
df_ChestPainTypeTA['HeartDisease'].eq(0).sum() / len(df_ChestPainTypeTA)
df_ChestPainTypeTA['HeartDisease'].eq(1).sum() / len(df_ChestPainTypeTA)

# ChestPainType ASY: fraction without / with heart disease
print("\nASY")
df_ChestPainTypeASY = df[df['ChestPainType'] == 'ASY']
df_ChestPainTypeASY['HeartDisease'].eq(0).sum() / len(df_ChestPainTypeASY)
df_ChestPainTypeASY['HeartDisease'].eq(1).sum() / len(df_ChestPainTypeASY)

# ChestPainType ATA: fraction without / with heart disease
print("\nATA")
df_ChestPainTypeATA = df[df['ChestPainType'] == 'ATA']
df_ChestPainTypeATA['HeartDisease'].eq(0).sum() / len(df_ChestPainTypeATA)
df_ChestPainTypeATA['HeartDisease'].eq(1).sum() / len(df_ChestPainTypeATA)

# ChestPainType NAP: fraction without / with heart disease
print("\nNAP")
df_ChestPainTypeNAP = df[df['ChestPainType'] == 'NAP']
df_ChestPainTypeNAP['HeartDisease'].eq(0).sum() / len(df_ChestPainTypeNAP)
df_ChestPainTypeNAP['HeartDisease'].eq(1).sum() / len(df_ChestPainTypeNAP)

# NAP is the only category into which the TA rows can be merged without
# significantly altering the HeartDisease split.
# Relabel TA as NAP and check the percentages again.

ta_rows = df['ChestPainType'] == "TA"
df.loc[ta_rows, "ChestPainType"] = "NAP"

# ChestPainType NAP after the merge: fraction without / with heart disease
print("\nNAP")
df_ChestPainTypeNAP = df[df['ChestPainType'] == 'NAP']
df_ChestPainTypeNAP['HeartDisease'].eq(0).sum() / len(df_ChestPainTypeNAP)
df_ChestPainTypeNAP['HeartDisease'].eq(1).sum() / len(df_ChestPainTypeNAP)

# No significant change in the result, so redraw the count chart with the remaining categories

merged_chest_pain_counts = df['ChestPainType'].value_counts()
merged_chest_pain_counts.loc[['ASY', 'ATA', 'NAP']].plot(kind='bar')

#Since the outliers are removed, let's move on to the next object column

# RestingECG: bar chart of the category counts
resting_ecg_counts = df['RestingECG'].value_counts()
resting_ecg_counts.loc[['LVH', 'Normal', 'ST']].plot(kind='bar')

# Both LVH and ST are significant, so they cannot be termed outliers
# Moving on to the next column

# ExerciseAngina: bar chart of the category counts
exercise_angina_counts = df['ExerciseAngina'].value_counts()
exercise_angina_counts.loc[['N', 'Y']].plot(kind='bar')

# Again, no outliers
# Moving on to the next column

# ST_Slope: bar chart of the category counts
st_slope_counts = df['ST_Slope'].value_counts()
st_slope_counts.loc[['Down', 'Flat', 'Up']].plot(kind='bar')

# Only a few rows are Down; compute their share of all rows to judge whether they are outliers

df['ST_Slope'].eq("Down").sum() / len(df)

#We can clearly see that Down are around 6%, so they can be termed as outliers
#But let's dig deeper to see if we can find any other insights

# Summary of the whole dataset, followed by the summary of the ST_Slope == Down rows
df.describe(include='all')
df_ST_SlopeDown = df[df['ST_Slope'] == 'Down']
df_ST_SlopeDown.describe(include='all')

# Comparing the mean and median above with those of the entire dataset,
# all columns look almost the same except Oldpeak:
# its median is 2 for Down versus 0.6 for the dataset,
# and its mean is 2.15 versus 0.88 for the dataset.
# So compute the mean and median of Oldpeak for every ST_Slope value.
# Oldpeak is a single numeric Series, so the reductions are called without
# numeric_only (that argument only matters when reducing a whole DataFrame).

# Down (df_ST_SlopeDown from above is reused; df has not changed in between)
print("Down")
df_ST_SlopeDown['Oldpeak'].mean()
df_ST_SlopeDown['Oldpeak'].median()

# Flat
print("\nFlat")
df_ST_SlopeFlat = df[df['ST_Slope'] == 'Flat']
df_ST_SlopeFlat['Oldpeak'].mean()
df_ST_SlopeFlat['Oldpeak'].median()

# Up
print("\nUp")
df_ST_SlopeUp = df[df['ST_Slope'] == 'Up']
df_ST_SlopeUp['Oldpeak'].mean()
df_ST_SlopeUp['Oldpeak'].median()

#Here we see that mean and medians are significantly different for other two ST_Slope values
#So although the outliers are insignificant but they carry an important information so we keep them
#Let's move on to the next column
#Let's see if we have any outliers in int64 and float64 data types

# Age: boxplot to look for outliers
age_values = df['Age']
sns.boxplot(age_values)

# Oldpeak: boxplot to look for outliers
oldpeak_values = df['Oldpeak']
sns.boxplot(oldpeak_values)

# From a quick search on Oldpeak:
# A depression in the ST segment can indicate myocardial ischemia, i.e. reduced blood flow to the heart.
# The Oldpeak value is a feature used in machine learning algorithms to predict heart disease.
# The Oldpeak value is positively correlated with heart disease.

# So even though Oldpeak has many outliers, the data itself is important.
# Check how these values are distributed against the HeartDisease label.


oldpeak_values.describe()
oldpeak_by_disease = pd.crosstab(index=df['Oldpeak'], columns=df['HeartDisease'])
oldpeak_by_disease.plot.box()

#We can see that a lot of outliers are predicting the heart disease. Let's look a little deeper at these outliers

# First quartile, third quartile and median of Oldpeak
Oldpeak_Q1 = np.quantile(df['Oldpeak'], 0.25)
Oldpeak_Q3 = np.quantile(df['Oldpeak'], 0.75)
Oldpeak_med = np.median(df['Oldpeak'])

# Inter-quartile range
Oldpeak_IQR = Oldpeak_Q3 - Oldpeak_Q1

# Whisker limits: 1.5 * IQR beyond the quartiles
Oldpeak_upper_bound = Oldpeak_Q3 + (1.5 * Oldpeak_IQR)
Oldpeak_lower_bound = Oldpeak_Q1 - (1.5 * Oldpeak_IQR)
print(Oldpeak_IQR, Oldpeak_upper_bound, Oldpeak_lower_bound)

# A single mask defines the outliers (values strictly outside the whisker limits),
# so the printed values and the described rows below always refer to the same rows.
Oldpeak_outlier_mask = (df['Oldpeak'] < Oldpeak_lower_bound) | (df['Oldpeak'] > Oldpeak_upper_bound)

# The outlying Oldpeak values themselves
Oldpeak_Outliers = df.loc[Oldpeak_outlier_mask, 'Oldpeak']
print('The following are the outliers in the boxplot:{}'.format(Oldpeak_Outliers))

# Whole-dataset summary, then the summary of the outlier rows, to compare the other columns
df.describe(include='all')
Oldpeak_filter = df[Oldpeak_outlier_mask]
Oldpeak_filter.describe(include='all')

#We can see clearly that most of the outliers have a ChestPainType ASY and most of them predict the Heart Disease as Yes.
#Also, most of them have high cholesterol levels which means that they are Obese
#So we keep these outliers because they carry some important information
#Let's move on to other columns

# MaxHR: boxplot to look for outliers
sns.boxplot(df['MaxHR'])

# There are a few outliers.
# From a quick search on MaxHR:
# MaxHR is the upper limit of how fast the heart can beat during physical activity.
# People who take beta blockers have a lower MaxHR.
# A different mechanism is needed to estimate MaxHR for obese people.
# We have no data on beta blockers, so take a closer look at the outliers themselves.

# First quartile, third quartile and median of MaxHR
MaxHR_Q1 = np.quantile(df['MaxHR'], 0.25)
MaxHR_Q3 = np.quantile(df['MaxHR'], 0.75)
MaxHR_med = np.median(df['MaxHR'])

# Inter-quartile range
MaxHR_IQR = MaxHR_Q3 - MaxHR_Q1

# Whisker limits: 1.5 * IQR beyond the quartiles
MaxHR_upper_bound = MaxHR_Q3 + (1.5 * MaxHR_IQR)
MaxHR_lower_bound = MaxHR_Q1 - (1.5 * MaxHR_IQR)
print(MaxHR_IQR, MaxHR_upper_bound, MaxHR_lower_bound)

# A single mask defines the outliers (values strictly outside the whisker limits),
# so the printed values, the described rows and the removed rows are always the same rows.
MaxHR_outlier_mask = (df['MaxHR'] < MaxHR_lower_bound) | (df['MaxHR'] > MaxHR_upper_bound)

# The outlying MaxHR values themselves
MaxHR_Outliers = df.loc[MaxHR_outlier_mask, 'MaxHR']
print('The following are the outliers in the boxplot:{}'.format(MaxHR_Outliers))

# Whole-dataset summary, then the summary of the outlier rows, to compare the other columns
df.describe(include='all')
MaxHR_filter = df[MaxHR_outlier_mask]
MaxHR_filter.describe(include='all')

# Looking at the outliers, there are only two of them.
# Both have heart disease and both are male.
# Both have ChestPainType ASY.
# Since there are only two, they won't significantly alter the result, so they are removed.

# Keep the rows inside the whisker limits (limits included, matching the mask above).
# .copy() makes df an independent frame so later df.loc[...] = ... assignments
# do not operate on a slice of the previous dataframe.
df = df[df['MaxHR'].between(MaxHR_lower_bound, MaxHR_upper_bound)].copy()

#Now let's move on to other columns

# FastingBS: boxplot to look for outliers
fasting_bs = df['FastingBS']
sns.boxplot(fasting_bs)

# Background from a quick search:
# Fasting blood sugar (FBS) is a blood test that measures blood sugar after a person has not eaten.
# NOTE(review): the 5.7–6.4% (prediabetes) and 6.5%+ (diabetes) thresholds originally quoted here
# are HbA1c ranges; fasting blood sugar is reported in mg/dL (roughly 100–125 prediabetes, 126+ diabetes).
# High FBS levels can indicate insulin resistance or diabetes, which are both risk factors for heart disease.
# Both low and high FBS levels can increase the risk of heart disease.
# This column only holds 0 and 1, so it is not a raw FBS measurement
# (presumably a thresholded flag — confirm against the dataset documentation).
# So look at it with respect to heart disease instead.

fasting_by_disease = pd.crosstab(index=df['FastingBS'], columns=df['HeartDisease'])
fasting_by_disease.plot.box()

#Since they are distributed in normal ranges in 0s and 1s, we'll leave this data as it is
#We'll see if we remove it in multicollinearity or in some other way.
#Let's move on to other columns

# Cholesterol: boxplot to look for outliers
sns.boxplot(df['Cholesterol'])

# There are a few outliers.
# From a quick search:
# High levels of cholesterol can increase the risk of heart disease.
# Healthy cholesterol levels vary by age, weight, and sex.
# A total cholesterol level of 200–239 mg/dL is considered borderline high, and 240 mg/dL or higher is considered high.
# So take a closer look at these outliers.

# First quartile, third quartile and median of Cholesterol
Cholesterol_Q1 = np.quantile(df['Cholesterol'], 0.25)
Cholesterol_Q3 = np.quantile(df['Cholesterol'], 0.75)
Cholesterol_med = np.median(df['Cholesterol'])

# Inter-quartile range
Cholesterol_IQR = Cholesterol_Q3 - Cholesterol_Q1

# Whisker limits: 1.5 * IQR beyond the quartiles
Cholesterol_upper_bound = Cholesterol_Q3 + (1.5 * Cholesterol_IQR)
Cholesterol_lower_bound = Cholesterol_Q1 - (1.5 * Cholesterol_IQR)
print(Cholesterol_IQR, Cholesterol_upper_bound, Cholesterol_lower_bound)

# A single mask defines the outliers (values strictly outside the whisker limits),
# so the printed values and the described rows below always refer to the same rows.
Cholesterol_outlier_mask = (df['Cholesterol'] < Cholesterol_lower_bound) | (df['Cholesterol'] > Cholesterol_upper_bound)

# The outlying Cholesterol values themselves
Cholesterol_Outliers = df.loc[Cholesterol_outlier_mask, 'Cholesterol']
print('The following are the outliers in the boxplot:{}'.format(Cholesterol_Outliers))

# Whole-dataset summary, then the summary of the outlier rows, to compare the other columns
df.describe(include='all')
Cholesterol_filter = df[Cholesterol_outlier_mask]
Cholesterol_filter.describe(include='all')

# There is no significant change in the values of the other columns,
# but higher cholesterol levels do relate to a higher risk of heart disease.
# The mean of 31 is too low for supposedly high values, so look only at the rows above the upper bound.

Cholesterol_filter2 = df[(df['Cholesterol'] > Cholesterol_upper_bound)]
Cholesterol_filter2.describe(include='all')

# The count of 12 is a surprise: from the boxplot most outliers seemed to lie above the upper bound.
# These high outliers do not significantly change the outcome,
# nor do they significantly alter the mean and median of the other columns, so they are smoothed.
# NOTE(review): the cut-off 400 and the replacement 240 are hard-coded rather than taken from
# Cholesterol_upper_bound / Cholesterol_med computed above — confirm these are the intended values.

df.loc[df['Cholesterol'] >= 400, "Cholesterol"] = 240
sns.boxplot(df['Cholesterol'])

# Now the outliers below the lower bound

Cholesterol_filter3 = df[(df['Cholesterol'] < Cholesterol_lower_bound)]
Cholesterol_filter3.describe(include='all')

# Most of these values are zero.
# Although total cholesterol levels could in principle be zero, that would be linked with no heart disease risk,
# whereas here the zeros go together with an increased risk of heart disease.
# So either the data is faulty or these are calcium, LDL or HDL levels and not total cholesterol.
# There is no way of knowing the true nature of these values, so they are treated as total cholesterol.
# The lowest value after zero is 85, so every value below 85 is set to 84 to bring it within the whiskers.

df.loc[df['Cholesterol'] < 85, "Cholesterol"] = 84
sns.boxplot(df['Cholesterol'])

#Now let's move on to our last column

# RestingBP: boxplot to look for outliers
sns.boxplot(df['RestingBP'])

# A normal resting blood pressure for most adults is 120/80 millimeters of mercury (mmHg) or lower,
# i.e. a systolic pressure of less than 120 and a diastolic pressure of less than 80.
# High-normal (pre-hypertension): blood pressure between 120/80 and 140/90 mmHg.
# Stage 1 high blood pressure: 130 to 139 / 80 to 89 mmHg.
# Stage 2 high blood pressure: 140/90 or higher.
# Low blood pressure (hypotension): blood pressure much lower than expected.
# A few readings are zero, and a systolic blood pressure of zero is not possible,
# so the original reasoning was that this column is likely the diastolic pressure.
# NOTE(review): a reading of 0 is not plausible for diastolic pressure either, so these zeros
# may be missing-value placeholders — confirm against the dataset documentation.
# So take a closer look at these outliers.

# First quartile, third quartile and median of RestingBP
RestingBP_Q1 = np.quantile(df['RestingBP'], 0.25)
RestingBP_Q3 = np.quantile(df['RestingBP'], 0.75)
RestingBP_med = np.median(df['RestingBP'])

# Inter-quartile range
RestingBP_IQR = RestingBP_Q3 - RestingBP_Q1

# Whisker limits: 1.5 * IQR beyond the quartiles
RestingBP_upper_bound = RestingBP_Q3 + (1.5 * RestingBP_IQR)
RestingBP_lower_bound = RestingBP_Q1 - (1.5 * RestingBP_IQR)
print(RestingBP_IQR, RestingBP_upper_bound, RestingBP_lower_bound)

# A single mask defines the outliers (values strictly outside the whisker limits),
# so the printed values and the described rows below always refer to the same rows.
RestingBP_outlier_mask = (df['RestingBP'] < RestingBP_lower_bound) | (df['RestingBP'] > RestingBP_upper_bound)

# The outlying RestingBP values themselves
RestingBP_Outliers = df.loc[RestingBP_outlier_mask, 'RestingBP']
print('The following are the outliers in the boxplot:{}'.format(RestingBP_Outliers))

# Whole-dataset summary, then the summary of the outlier rows, to compare the other columns
df.describe(include='all')
RestingBP_filter = df[RestingBP_outlier_mask]
RestingBP_filter.describe(include='all')

# These outliers do not significantly change the outcome,
# nor do they significantly alter the mean and median of the other columns, so they are smoothed.
# NOTE(review): 170, 90 and 130 are hard-coded; presumably they are the whisker limits and the
# median printed above — confirm against RestingBP_upper_bound / RestingBP_lower_bound / RestingBP_med.

df.loc[df['RestingBP'] > 170, "RestingBP"] = 130
df.loc[df['RestingBP'] < 90, "RestingBP"] = 130
sns.boxplot(df['RestingBP'])

#Now since all the outliers have been treated, now we're ready to move on to the next phase
#Now we do the encodings
#FEATURE ENCODING
#Label encoding
#We know that our label is HeartDisease column which has two unique entries, 0 and 1, and it is already encoded
#So moving on to one-hot encoding
#One hot encoding

# Build one 0/1 integer indicator column per category for each categorical feature
Sex_ohe = pd.get_dummies(df['Sex'], dtype=int)
ChestPainType_ohe = pd.get_dummies(df['ChestPainType'], dtype=int)
RestingECG_ohe = pd.get_dummies(df['RestingECG'], dtype=int)
ExerciseAngina_ohe = pd.get_dummies(df['ExerciseAngina'], dtype=int)
ST_Slope_ohe = pd.get_dummies(df['ST_Slope'], dtype=int)

# Append the indicator columns to the dataframe, one feature after another
for indicator_frame in (Sex_ohe, ChestPainType_ohe, RestingECG_ohe, ExerciseAngina_ohe, ST_Slope_ohe):
    df = df.join(indicator_frame)
df.head()

# With every feature encoded, drop what is now redundant:
# the original categorical columns, and one indicator per feature
# (e.g. F is implied once M is known).
encoded_source_columns = ['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope']
redundant_indicators = ['F', 'ASY', 'LVH', 'N', 'Down']
df.drop(encoded_source_columns + redundant_indicators, axis=1, inplace=True)
df.head()

# With the encoding done, move on to feature scaling
# FEATURE SCALING
# Only five columns need scaling: Age, RestingBP, Cholesterol, MaxHR and Oldpeak.
# The dataset still contains outliers that we want to keep without letting them
# dominate the model, so RobustScaler is used.
# NOTE(review): the scaler is fitted on all rows, before any train/test split is made.

from sklearn.preprocessing import RobustScaler

columns_to_scale = ['Age', 'RestingBP', 'Cholesterol', 'MaxHR', 'Oldpeak']
df_scaled = RobustScaler(with_centering=False)  # scale only, no centering
df[columns_to_scale] = df_scaled.fit_transform(df[columns_to_scale])
df.head()

#Here we have a good score but we have not checked multicollinearity, so let's check it
#Now we will check for multicollinearity for the independent variables
#MULTICOLLINEARITY
#We will check this through Variance Inflation Factor (VIF)

from statsmodels.stats.outliers_influence import variance_inflation_factor

# Independent variables: every column except the label
x = df.drop(columns=['HeartDisease'])
# Label column
y = df.loc[:, 'HeartDisease']

def calc_vif(x, add_constant=False):
    """Return the Variance Inflation Factor of every column of ``x``.

    Args:
        x: DataFrame whose columns are the explanatory variables.
        add_constant: When True, a column of ones is prepended to the matrix
            handed to statsmodels before the VIFs are computed. statsmodels
            works on exactly the matrix it is given and does not add an
            intercept itself, so without a constant column the VIFs of
            features that are not centred around zero can come out inflated.
            Defaults to False, which reproduces the plain call on ``x.values``.

    Returns:
        DataFrame with one row per column of ``x``: the column name in
        ``variables`` and its VIF in ``VIF``. The constant column, when
        added, is not reported.
    """
    exog = x.values
    first_feature = 0
    if add_constant:
        exog = np.column_stack([np.ones(len(x)), exog])
        first_feature = 1  # skip the prepended constant when indexing features

    vif = pd.DataFrame()
    vif["variables"] = x.columns
    vif["VIF"] = [
        variance_inflation_factor(exog, first_feature + i)
        for i in range(x.shape[1])
    ]
    return vif

# NOTE(review): this positional slice removes whichever column is last in x before the
# first VIF table; the later tables rebuild x from df and therefore include it again.
x = x.iloc[:, :-1]
calc_vif(x)

# Some features above show high multicollinearity.
# Drop them one by one and recompute the VIFs each time.
# A VIF greater than 4 indicates that multicollinearity might exist;
# a VIF greater than 10 indicates significant multicollinearity.
# RestingBP is dropped first, since it has a VIF of 61.

x = df.drop(columns=['HeartDisease', 'RestingBP'])
calc_vif(x)

# Drop MaxHR as well and check again

x = df.drop(columns=['HeartDisease', 'RestingBP', 'MaxHR'])
calc_vif(x)

# Drop Age as well and check again

x = df.drop(columns=['HeartDisease', 'RestingBP', 'MaxHR', 'Age'])
calc_vif(x)

# Drop Cholesterol as well and check again

x = df.drop(columns=['HeartDisease', 'RestingBP', 'Age', 'MaxHR', 'Cholesterol'])
calc_vif(x)

#MODEL-ONE-LOGISTIC REGRESSION
#Now everything is ready for the logistic regression step

from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.model_selection import train_test_split

# 70/30 split with a fixed seed, then fit a logistic regression on the training part
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=0
)
logreg = LogisticRegression(solver='lbfgs', max_iter=500)
logreg.fit(x_train, y_train)

# Mean accuracy on the training data and on the held-out data
logreg.score(x_train, y_train)
logreg.score(x_test, y_test)

# Drop M as well, recompute the VIFs and refit the model

x = df.drop(columns=['HeartDisease', 'RestingBP', 'Age', 'MaxHR', 'Cholesterol', 'M'])
calc_vif(x)

x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=0
)
logreg = LogisticRegression(solver='lbfgs', max_iter=500)
logreg.fit(x_train, y_train)
logreg.score(x_train, y_train)
logreg.score(x_test, y_test)

#These are acceptable results as we are getting 83% accuracy on test data
#Let's first update our data frame try other models
#PRINCIPAL COMPONENT ANALYSIS

# Permanently remove the columns excluded during the multicollinearity check
df = df.drop(columns=['RestingBP', 'Age', 'MaxHR', 'Cholesterol', 'M'])

# Feature columns fed into PCA
features = ['FastingBS', 'Oldpeak', 'ATA', 'NAP', 'Normal', 'ST', 'Y', 'Flat', 'Up']

x = df[features].values

# Target variable
y = df[['HeartDisease']].values

# Project the features onto two principal components
# NOTE(review): PCA is fitted on all rows, before the train/test split below.
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
principalComponents = pca.fit_transform(x)
principalDf = pd.DataFrame(data=principalComponents, columns=['PCA 1', 'PCA 2'])
principalDf

# Logistic regression on the two principal components

x = principalDf
y = df['HeartDisease']

x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=0
)
logreg = LogisticRegression(solver='lbfgs', max_iter=500)
logreg.fit(x_train, y_train)
logreg.score(x_train, y_train)
logreg.score(x_test, y_test)

# 80% is a good accuracy; check whether it increases with more principal components

# Three principal components
x = df[features].values
y = df[['HeartDisease']].values
pca = PCA(n_components=3)
principalComponents = pca.fit_transform(x)
principalDf = pd.DataFrame(data=principalComponents, columns=['PCA 1', 'PCA 2', 'PCA 3'])
x = principalDf
y = df['HeartDisease']
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=0
)
logreg = LogisticRegression(solver='lbfgs', max_iter=500)
logreg.fit(x_train, y_train)
logreg.score(x_train, y_train)
logreg.score(x_test, y_test)

# Accuracy decreased; add another component and look at the results
# Four principal components
x = df[features].values
y = df[['HeartDisease']].values
pca = PCA(n_components=4)
principalComponents = pca.fit_transform(x)
principalDf = pd.DataFrame(data=principalComponents, columns=['PCA 1', 'PCA 2', 'PCA 3', 'PCA 4'])
x = principalDf
y = df['HeartDisease']
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=0
)
logreg = LogisticRegression(solver='lbfgs', max_iter=500)
logreg.fit(x_train, y_train)
logreg.score(x_train, y_train)
logreg.score(x_test, y_test)

# Accuracy has not increased significantly, so settle on two principal components
# and try other models instead

x = df[features].values
y = df[['HeartDisease']].values
pca = PCA(n_components=2)
principalComponents = pca.fit_transform(x)
principalDf = pd.DataFrame(data=principalComponents, columns=['PCA 1', 'PCA 2'])
x = principalDf
y = df['HeartDisease']
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=0
)
logreg = LogisticRegression(solver='lbfgs', max_iter=500)
logreg.fit(x_train, y_train)
logreg.score(x_train, y_train)
logreg.score(x_test, y_test)

#Let's try other models on our dataframe
#APRIORI ALGORITHM

#To use Apriori Algorithm, we have to drop Oldpeak because it cannot be encoded with one-hot encoding
# Binary columns used as "items"; the HeartDisease label is included so rules can involve it
features = ['FastingBS', 'ATA', 'NAP', 'Normal', 'ST', 'Y', 'Flat', 'Up', 'HeartDisease']
x = df.loc[:, features]         # kept as a DataFrame (no .values): apriori takes a pandas DataFrame, not a numpy array

from mlxtend.frequent_patterns import apriori, association_rules
# Frequent itemsets present in at least 5% of the rows
frq_items = apriori(x, min_support=0.05, use_colnames=True)

# Derive the rules with lift >= 1.
# num_itemsets is the number of transactions (rows) in the data the itemsets were mined from,
# not the number of frequent itemsets found, so the row count of the input frame is passed.
rules = association_rules(frq_items, metric="lift", min_threshold=1, num_itemsets=len(x))
rules = rules.sort_values(['confidence', 'lift'], ascending=[False, False])
rules.head()

# Repeat the mining separately on a 70/30 train/test split of the rows
x_train, x_test = train_test_split(x, test_size=0.3, random_state=0)
x_train_frq_items = apriori(x_train, min_support=0.05, use_colnames=True)
train_rules = association_rules(x_train_frq_items, metric="lift", min_threshold=1, num_itemsets=len(x_train))
train_rules = train_rules.sort_values(['confidence', 'lift'], ascending=[False, False])
train_rules.head()

x_test_frq_items = apriori(x_test, min_support=0.05, use_colnames=True)
test_rules = association_rules(x_test_frq_items, metric="lift", min_threshold=1, num_itemsets=len(x_test))
test_rules = test_rules.sort_values(['confidence', 'lift'], ascending=[False, False])
test_rules.head()

#Now let's try another algorithm
#K-Nearest Neighbour

from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics.pairwise import cosine_similarity

# Feature frame and target
features = ['FastingBS', 'ATA', 'NAP', 'Normal', 'ST', 'Y', 'Flat', 'Up']
x = df.loc[:, features]
# The target is taken as a 1-D Series: passing a one-column DataFrame makes
# scikit-learn emit a DataConversionWarning on every fit.
y = df['HeartDisease']

# 70/30 split with a fixed seed
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)

# Fit one classifier per k = 1..9 (cosine distance) and print its accuracy on the test split.
# After the loop, knn holds the k=9 model.
for k in range(1, 10):
    knn = KNeighborsClassifier(n_neighbors=k, metric='cosine')
    knn.fit(x_train, y_train)
    print("k=1" if k == 1 else f"\nk={k}")
    print(knn.score(x_test, y_test))

#So ideal value is k=4
#Let's move on to other models
#Decision Tree

from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.tree import DecisionTreeClassifier

# Feature matrix and target as numpy arrays
features = ['FastingBS', 'ATA', 'NAP', 'Normal', 'ST', 'Y', 'Flat', 'Up']
x = df[features].values
y = df[['HeartDisease']].values
# NOTE(review): this split uses random_state=100, while the other models in this file use 0.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=100
)

# Train the tree with the Gini index and check its accuracy

from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score

# Decision tree split on Gini impurity
Gini = DecisionTreeClassifier(criterion='gini', random_state=0)
Gini.fit(x_train, y_train)

# Predictions for the held-out rows
y_pred = Gini.predict(x_test)

# Accuracy
accuracy_score(y_test, y_pred)

# Classification report
gini_report = classification_report(y_test, y_pred)
print(f'Classification Report: \n{gini_report}')

# F1 score
gini_f1 = f1_score(y_test, y_pred)
print(f"F1 Score : {gini_f1}")

# Confusion matrix drawn as an annotated heatmap
cf_matrix = confusion_matrix(y_test, y_pred)
sns.heatmap(cf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False)

# Train the tree with entropy and check its accuracy

# Decision tree split on entropy
Entropy = DecisionTreeClassifier(criterion='entropy', random_state=0)
Entropy.fit(x_train, y_train)

# Predictions for the held-out rows
y_pred = Entropy.predict(x_test)

# Accuracy
accuracy_score(y_test, y_pred)

# Classification report
entropy_report = classification_report(y_test, y_pred)
print(f'Classification Report: \n{entropy_report}')

# F1 score
entropy_f1 = f1_score(y_test, y_pred)
print(f"F1 Score : {entropy_f1}")

# Confusion matrix drawn as an annotated heatmap
cf_matrix = confusion_matrix(y_test, y_pred)
sns.heatmap(cf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False)

#Entropy gives us a better accuracy, so we use entropy
#Now let's move on to our last model
#Naive Bayesian

from sklearn.naive_bayes import GaussianNB

# Feature matrix and target as numpy arrays
features = ['FastingBS', 'ATA', 'NAP', 'Normal', 'ST', 'Y', 'Flat', 'Up']
x = df.loc[:, features].values
# The target is taken as a 1-D array: GaussianNB.fit emits a DataConversionWarning
# when it is handed an (n, 1) column vector.
y = df['HeartDisease'].values
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)

# Classifier
classifier = GaussianNB()
classifier.fit(x_train, y_train)

# Check the accuracy of the model
# Predictions for the held-out rows
y_pred = classifier.predict(x_test)

# Accuracy
accuracy_score(y_test, y_pred)

# Classification report
print(f'Classification Report: \n{classification_report(y_test, y_pred)}')

# F1 score
print(f"F1 Score : {f1_score(y_test, y_pred)}")

# Confusion matrix drawn as an annotated heatmap
cf_matrix = confusion_matrix(y_test, y_pred)
sns.heatmap(cf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False)

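#So based on the different models, we conclude that the Apriori algorithm produces the best results
#NOTE(review): the Apriori section produces association rules (support, confidence, lift) rather than a test-set accuracy, so this comparison with the classifiers above is qualitative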