# -*- coding: utf-8 -*-
"""2_nust_datamining_airline.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/10AkVokHWYgJfoC_cJrGOolqbyYCbUuIX
"""

#import libraries
import pandas as pd                           #data
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns                         #data visualization
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

#Upload files
#from google.colab import files
#f = files.upload()

#Read the training dataset into the working dataframe and preview the first rows
df = pd.read_csv('/content/2_nust_datamining_airline_train.csv')
df.head()

#HANDLING MISSING VALUES

#Column dtypes and non-null counts per column
df.info()

#Total number of missing cells across the whole dataframe
print("missing values")
df.isna().sum().sum()

#We can see that the missing values are only in the Arrival Delay in Minutes column

#Visualizing missing values using a seaborn heatmap
#The mask is transposed so that every dataframe column becomes one heatmap row
missing_mask = df.isna().T
plt.figure(figsize=(10, 6))
sns.heatmap(missing_mask,
            cmap="YlGnBu",
            cbar_kws={'label': 'Missing Data'})

#We observe above that the missing values do not form any specific pattern
#We now see how the missing values relate to the overall data in that column

df.describe(include='all')

#Here we can observe that a delay of 13 minutes already sits at the 75th percentile,
#so filling the missing values with the mean of 15 would not be wise.
#We could of course drop the rows with missing values, but that would lose data
#and might affect the accuracy of our final model.
#The best strategy is to fill the missing values with the median.

arrival_delay_median = df['Arrival Delay in Minutes'].median()
arrival_delay_median

#Since the median is 0, the missing values end up being filled with 0

df['Arrival Delay in Minutes'] = df['Arrival Delay in Minutes'].fillna(arrival_delay_median)
df.info()

#We've verified the results, there are no more missing values
#Now we move on to handling outliers

#HANDLING OUTLIERS

#Every chart below is drawn on its own Axes inside a subplot grid.
#Series.plot and seaborn draw on the *current* Axes when none is given, so
#plotting without an explicit Axes stacks all charts on top of each other
#whenever this file is run as a script rather than one notebook cell per chart.

#First we handle object type values
#Each entry maps a categorical column to the category order shown on the x-axis
categorical_levels = {
    'Gender': ['Male', 'Female'],
    'Customer Type': ['disloyal Customer', 'Loyal Customer'],
    'Type of Travel': ['Business travel', 'Personal Travel'],
    'Class': ['Business', 'Eco', 'Eco Plus'],
}

fig, axes = plt.subplots(1, len(categorical_levels), figsize=(20, 4))
for ax, (column, levels) in zip(axes.flat, categorical_levels.items()):
    df[column].value_counts().loc[levels].plot.bar(ax=ax, title=column)
fig.tight_layout()

#Observations from the bar charts:
# - Gender: there are no outliers
# - Customer Type: there are few disloyal customers, but not few enough to be outliers
# - Type of Travel: no outliers either
# - Class: there are few Eco Plus customers, but they cannot be termed outliers

#Let's see if we have any outliers in the int64 and float64 columns
boxplot_columns = [
    'Age',
    'Seat comfort',
    'Departure/Arrival time convenient',
    'Food and drink',
    'Gate location',
    'Inflight wifi service',
    'Inflight entertainment',
    'Online support',
    'Ease of Online booking',
    'Leg room service',
    'Baggage handling',
    'Cleanliness',
    'Online boarding',
    'Flight Distance',
]

#Two rows of seven box plots: one Axes per numeric column
fig, axes = plt.subplots(2, 7, figsize=(28, 8))
for ax, column in zip(axes.flat, boxplot_columns):
    sns.boxplot(df[column], ax=ax)
    ax.set_title(column)
fig.tight_layout()

#As we can see we have a lot of outliers in Flight Distance, let's check Flight Distance data

df['Flight Distance'].describe()

pd.crosstab(df['Flight Distance'],df['satisfaction']).plot(kind='box')

#Check whether customers are dissatisfied on longer flights?
#Each histogram gets its own figure; Series.hist would otherwise reuse the current Axes

df_dis = df[df['satisfaction'] == 'dissatisfied']
df_dis['Flight Distance'].describe()

plt.figure()
df_dis['Flight Distance'].hist(bins=10)

#We can clearly see in the above graph that after 4100 miles, there are outliers.

df_sat = df[df['satisfaction'] == 'satisfied']
df_sat['Flight Distance'].describe()

plt.figure()
df_sat['Flight Distance'].hist(bins=10)

#We can clearly see in the above graph that after 4100 miles, there are outliers.
#One way could be to remove them, however, these dissatisfied customers are on the longer flights
#This can be an important insight for the airlines, what if the longer flights have bad customer service
#What if the longer flights arrive late, we need to analyze longer flights w.r.t all the columns

#One mask is used both for inspecting and for replacing the long flights, so the
#rows that are analysed are exactly the rows that are later overwritten
long_flight_cutoff = 4100
is_long_flight = df['Flight Distance'] >= long_flight_cutoff
df_long = df[is_long_flight]
df_long.describe(include='all')
flight_distance_median = df['Flight Distance'].median()
flight_distance_median

#Looking at the above stats, we conclude that there are more satisfied customers on longer flights than dissatisfied ones
#Therefore we can comfortably drop the outliers or change their distance with the median
#The median (1925 on the original training file) is taken from the data instead of
#being typed in by hand, and rounded to an integer before it is written back

df.loc[is_long_flight, "Flight Distance"] = int(round(flight_distance_median))
df.describe(include='all')

#After replacing the outliers in Flight Distance, we check again
#For dissatisfied customers
df_dis = df[df['satisfaction'] == 'dissatisfied']
plt.figure()
df_dis['Flight Distance'].hist(bins=10)

#For satisfied customers
df_sat = df[df['satisfaction'] == 'satisfied']
plt.figure()
df_sat['Flight Distance'].hist(bins=10)

#The graphs above look great and show an almost normal distribution
plt.figure()
sns.boxplot(df['Flight Distance'])

#Now since we have a perfect boxplot for Flight Distance, let's move on to other columns
#On-board service
#Each seaborn / Series plot below opens its own figure so that the charts do not
#overlay each other when the file is run as a script
plt.figure()
sns.boxplot(df['On-board service'])

#Looks like we have a few outliers, let's see through a histogram
plt.figure()
df['On-board service'].hist(bins=10)
df['On-board service'].unique()

#Scores of 1 or lower are the candidate outliers; the same mask is used for
#counting them and for selecting them further down
onboard_low = df['On-board service'] <= 1
onboard_low.sum()

#Let's see the percentage of ones and zeros to see if they are significant
onboard_low.sum() / len(df['On-board service'])

#10% of outliers seems significant, let's see whether these outliers show a specific trend?

pd.crosstab(df['On-board service'],df['satisfaction']).plot(kind='box')

#The outliers seem to be evenly distributed between satisfied and dissatisfied customers
#Let's analyze them w.r.t other features
outliers_onboard = df[onboard_low]
outliers_onboard.describe(include='all')

#Observing the outliers in on-board service we find out that most of these people are loyal customers who are dissatisfied
#While analyzing all other columns we find out that they have only given low scores to on-board service and most of them are traveling for Business purposes
#Therefore, these outliers contain an important insight that loyal customers traveling for business are dissatisfied because of the bad on-board service
#Thus we will keep these outliers

#Let's move on to the next column
#Checkin service
plt.figure()
sns.boxplot(df['Checkin service'])

#Looks like we have a few outliers, let's see through a histogram and check the outlier percentage
plt.figure()
df['Checkin service'].hist(bins=10)
df['Checkin service'].unique()
checkin_low = df['Checkin service'] <= 1
checkin_low.sum()
checkin_low.sum() / len(df['Checkin service'])

#Here we also see that we have almost the same amount of twos as ones, let's look a little deeper into the outliers
pd.crosstab(df['Checkin service'],df['satisfaction']).plot(kind='box')

#The outliers seem to be evenly distributed between satisfied and dissatisfied customers
#Let's analyze them w.r.t other features
outliers_checkin = df[checkin_low]
outliers_checkin.describe(include='all')

#Observing the outliers in Checkin service we find out that most of these people are loyal customers who are dissatisfied
#While analyzing all other columns we find out that they have only given low scores to Checkin service and most of them are traveling for Business purposes
#Therefore, these outliers contain an important insight that loyal customers traveling for business are dissatisfied because of the bad Checkin service
#Thus we will keep these outliers

#Let's move on to the next column
#Departure Delay in Minutes
#Each seaborn / Series plot below opens its own figure so that the charts do not
#overlay each other when the file is run as a script
plt.figure()
sns.boxplot(df['Departure Delay in Minutes'])

#Looks like we have a lot of outliers, let's see through a histogram and check the outlier percentage
plt.figure()
df['Departure Delay in Minutes'].hist(bins=100)

#Looks like we have a few outliers after 170, let's analyze them even further
pd.crosstab(df['Departure Delay in Minutes'],df['satisfaction']).plot(kind='box')

#Looks like the outliers are evenly distributed, let's analyze them with Arrival Delay
pd.crosstab(df['Arrival Delay in Minutes'],df['satisfaction']).plot(kind='box')

#If we account for the missing values in arrival delay that we filled earlier, the outliers in the departure delay and arrival delay look almost the same
#Looks like the outliers are evenly distributed so any changes to Departure delay should also be made to arrival delay
#Most probably the same planes which departed late, arrived late so there was a recorded delay in both columns
#Let's dig a little deeper into these outliers

#Delays of 40 minutes or more are treated as outliers. A single threshold mask is
#used for inspecting, counting and replacing them; selecting with
#isin(range(40, 1000)) would leave delays of 1000 minutes or more out of the
#inspected subset even though they are counted and replaced.
delay_cutoff = 40
departure_outlier = df['Departure Delay in Minutes'] >= delay_cutoff
df_depdelay = df[departure_outlier]
df_depdelay.describe(include='all')
departure_outlier.sum()
departure_outlier.sum() / len(df['Departure Delay in Minutes'])

#Here we see that the outliers do not show a specific trend w.r.t other columns and they are evenly distributed between satisfied and dissatisfied customers
#Therefore, we will replace the outliers with the median value of the dataset (0)
#We will replace only 11% of the data, because anything more than that might change the precision of our model
df.loc[departure_outlier, "Departure Delay in Minutes"] = 0
df.describe(include='all')
plt.figure()
sns.boxplot(df['Departure Delay in Minutes'])

#Now we have an even distribution of outliers and most of the data seems consistent
#Let's move on to the next column
#Arrival Delay in Minutes
plt.figure()
sns.boxplot(df['Arrival Delay in Minutes'])

#We know that these are the same outliers as in Departure delay
#Therefore, we will replace the outliers with the median value of the dataset (0)
#We will replace only 11% of the data, because anything more than that might change the precision of our model
arrival_outlier = df['Arrival Delay in Minutes'] >= delay_cutoff
df.loc[arrival_outlier, "Arrival Delay in Minutes"] = 0
df.describe(include='all')
plt.figure()
sns.boxplot(df['Arrival Delay in Minutes'])

#Now we have an even distribution of outliers and most of the data seems consistent
#Now we do the encodings
#FEATURE ENCODING
#Label encoding
#Our label is the satisfaction column, which has two unique entries (satisfied and dissatisfied), so we encode it as integers

from sklearn.preprocessing import LabelEncoder
lbl_encode = LabelEncoder()
encoded_satisfaction = lbl_encode.fit_transform(df['satisfaction'])
df['satisfaction_label'] = encoded_satisfaction
df.describe(include='all')

#Now moving on to one-hot encoding
#Each categorical column is expanded into 0/1 indicator columns named after its
#categories, and the indicators are appended to the dataframe

#One hot encoding: Gender
gender_ohe = pd.get_dummies(df['Gender'], dtype=int)
df = df.join(gender_ohe)
df.head()

#One hot encoding: Customer Type
customer_ohe = pd.get_dummies(df['Customer Type'], dtype=int)
df = df.join(customer_ohe)
df.head()

#One hot encoding: Type of Travel
travel_ohe = pd.get_dummies(df['Type of Travel'], dtype=int)
df = df.join(travel_ohe)
df.head()

#One hot encoding: Class
class_ohe = pd.get_dummies(df['Class'], dtype=int)
df = df.join(class_ohe)
df.head()

#Now that all the features are encoded we drop the redundant columns in one go
redundant_columns = [
    #original text columns, now replaced by their encoded versions
    'satisfaction',
    'Gender',
    'Customer Type',
    'Type of Travel',
    'Class',
    #one indicator per one-hot group
    'Female',
    'disloyal Customer',
    'Personal Travel',
    'Eco Plus',
    #arrival delay is removed as well
    'Arrival Delay in Minutes',
]
df.drop(columns=redundant_columns, inplace=True)
df.head()

#Now that we have everything ready, let's move on to feature scaling
#FEATURE SCALING
#In the above data set, we only have to scale two columns, namely Age and Flight Distance
#There are outliers in our dataset that we want to keep, but we don't want the model
#to be affected by them too much, so we use the Robust Scaler (without centering)
#NOTE(review): the scaler is fitted on the full dataframe, i.e. before the train/test split below

from sklearn.preprocessing import RobustScaler
scaled_columns = ['Age', 'Flight Distance']
df_scaled = RobustScaler(with_centering=False)
df[scaled_columns] = df_scaled.fit_transform(df[scaled_columns])
df.head()

#Now everything is ready for the logistic regression step
#We will see the accuracy on our training set and based on that we will decide whether to keep negative values in scaled Age and Flight Distance or to keep outliers

from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.model_selection import train_test_split

#Target is the encoded satisfaction label; every other column is a predictor
y = df['satisfaction_label']
x = df.drop(columns='satisfaction_label')

#70/30 split with a fixed seed so the scores are reproducible
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=0
)
logreg = LogisticRegression(solver='lbfgs', max_iter=500)
logreg.fit(x_train, y_train)

#Let's see the predictions and the accuracy on the training and test parts
logreg.predict(x_train)
logreg.score(x_train, y_train)
logreg.score(x_test, y_test)

#Here we have a good score but we have not checked multicollinearity, so let's check it
#Now we will check for multicollinearity for the independent variables
#MULTICOLLINEARITY
#We will check this through Variance Inflation Factor (VIF)

from statsmodels.stats.outliers_influence import variance_inflation_factor

def calc_vif(x):
    """Return the variance inflation factor (VIF) of every column of ``x``.

    Args:
        x: DataFrame whose columns are the numeric predictors to check.

    Returns:
        A DataFrame with one row per column of ``x``: ``"variables"`` holds
        the column name and ``"VIF"`` the value returned by
        ``variance_inflation_factor`` for that column.
    """
    # Build the numeric design matrix once instead of once per column.
    # NOTE(review): no intercept column is added here; the VIF is computed on
    # the matrix exactly as given -- confirm whether a constant is wanted.
    design = x.to_numpy(dtype=float)

    vif = pd.DataFrame()
    vif["variables"] = x.columns
    vif["VIF"] = [
        variance_inflation_factor(design, i) for i in range(design.shape[1])
    ]
    return vif

from IPython.display import display

#VIF of all predictors
#The label is removed by name: 'satisfaction_label' is not the last column (the
#one-hot columns were joined after it), so slicing off the last column would keep
#the label in the VIF table and leave out a real predictor instead.
x = df.drop(columns='satisfaction_label')
display(calc_vif(x))

#Above we can see that some features have high multicollinearity
#Let's start dropping them one by one and calculate multicollinearity again
#VIF greater than 4 indicates that multicollinearity might exist
#VIF greater than 10 indicates that there is significant multicollinearity

#Features in the order they are removed. The number next to each feature is the
#VIF the author observed for it in the table printed just before its removal.
vif_drop_order = [
    'Ease of Online booking',             # VIF 29
    'Cleanliness',                        # VIF 21
    'Online support',                     # VIF 18
    'Baggage handling',                   # VIF 16
    'Online boarding',                    # VIF 14
    'Food and drink',                     # VIF 13
    'Inflight entertainment',             # VIF 11
    'On-board service',                   # VIF 10
    'Gate location',                      # VIF 9
    'Age',                                # VIF 8
    'Leg room service',                   # VIF 8
    'Business',                           # VIF 7
    'Checkin service',                    # VIF 6
    'Seat comfort',                       # VIF 6
    'Inflight wifi service',              # VIF 5
    'Flight Distance',                    # VIF 4
    'Departure/Arrival time convenient',  # VIF 4
]

#Remove one more feature per step and show the VIF table of what is left.
#After the last step x holds only the features that survived the pruning.
for n_dropped in range(1, len(vif_drop_order) + 1):
    x = df.drop(columns=['satisfaction_label'] + vif_drop_order[:n_dropped])
    display(calc_vif(x))

from IPython.display import display

#We now check how removing the high-VIF features affects the logistic regression.
#The first run uses x as left by the VIF pruning (all high-VIF features removed);
#the following runs start again from the full feature set and remove the
#high-VIF features one at a time, refitting and scoring the model every time.
#
#Observations recorded for each run:
# - all high-VIF features removed: multicollinearity is gone, but so is information;
#   the accuracy decreased from 83% to 74%
# - 1 dropped (Ease of Online booking): accuracy jumped back to 83%
# - 2 dropped (+ Cleanliness, VIF 21): no significant change from 83%
# - 3 dropped (+ Online support, VIF 18): no significant change from 83%
# - 4 dropped (+ Baggage handling, VIF 16): no significant change from 83%
# - 5 dropped (+ Online boarding, VIF 14): test accuracy decreased slightly from 83% to 82%
# - 6 dropped (+ Food and drink, VIF 13): accuracy decreased again, so the trend is downwards
# - back to 4 dropped: 'Food and drink' and 'Online boarding' are brought back to
#   keep the accuracy at 83%
pruning_candidates = [
    'Ease of Online booking',
    'Cleanliness',
    'Online support',
    'Baggage handling',
    'Online boarding',
    'Food and drink',
]
n_final_drops = 4

#None means "score x exactly as it currently is"; a number means "rebuild x from
#df without the label and without that many leading pruning candidates"
for n_dropped in [None, 1, 2, 3, 4, 5, 6, n_final_drops]:
    if n_dropped is not None:
        x = df.drop(['satisfaction_label'] + pruning_candidates[:n_dropped], axis=1)
        display(calc_vif(x))

    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)
    logreg = LogisticRegression(solver='lbfgs', max_iter=500)
    logreg.fit(x_train, y_train)

    #Predictions on the training part, then accuracy on the training and test parts
    display(logreg.predict(x_train))
    display(logreg.score(x_train, y_train))
    display(logreg.score(x_test, y_test))

#Make the final choice permanent in the working dataframe
df = df.drop(pruning_candidates[:n_final_drops], axis=1)

from IPython.display import display

#Now we do PCA and see the results of the regression model
#PRINCIPAL COMPONENT ANALYSIS (PCA)

x.head()

#The predictors that go into the PCA
features = ['Age', 'Flight Distance', 'Seat comfort', 'Departure/Arrival time convenient', 'Food and drink'
            , 'Gate location', 'Inflight wifi service', 'Inflight entertainment', 'On-board service', 'Leg room service'
            , 'Checkin service', 'Online boarding', 'Departure Delay in Minutes', 'Male', 'Loyal Customer', 'Business travel'
            , 'Business', 'Eco']

from sklearn.decomposition import PCA

#We project the predictors onto a growing number of principal components and fit
#the logistic regression on the projected data every time.
#
#Observations recorded for each number of components:
# -  2: the model's accuracy decreased significantly (59%)
# -  3: accuracy increased from 59% to 77%
# -  4: test accuracy increased by 1%
# -  5: accuracy increased further
# -  6: accuracy increased slightly
# -  7: accuracy increased slightly
# -  8: accuracy decreased a bit; the hypothesis was that further components only
#       add redundant information and that 7 components is the best choice
# -  9: the hypothesis was proven wrong, the ninth component increased the accuracy
# - 10: only a slight increase, so we stop at 9 components with 80% accuracy,
#       which is a good accuracy for this model
#
#The sweep ends with 9 components again so that pca, principalDf, x, y and logreg
#are left holding the chosen final model.
#
#NOTE(review): the PCA is fitted on all rows before the train/test split, and
#only Age and Flight Distance were rescaled beforehand -- confirm that both are intended.
for n_components in [2, 3, 4, 5, 6, 7, 8, 9, 10, 9]:
    pca = PCA(n_components=n_components)
    principalComponents = pca.fit_transform(df.loc[:, features].values)
    component_names = [f'PCA {i}' for i in range(1, n_components + 1)]
    principalDf = pd.DataFrame(data=principalComponents, columns=component_names)

    #Regression on the principal components instead of the original predictors
    x = principalDf
    y = df['satisfaction_label']

    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)
    logreg = LogisticRegression(solver='lbfgs', max_iter=500)
    logreg.fit(x_train, y_train)

    #Predictions on the training part, then accuracy on the training and test parts
    display(logreg.predict(x_train))
    display(logreg.score(x_train, y_train))
    display(logreg.score(x_test, y_test))

    display(principalDf.head())