Mayouran VARATHALINGAM¶

Prediction of wine quality based on its composition

In [1]:
import pandas as pd
import numpy as np
from matplotlib.colors import LinearSegmentedColormap
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import Markdown as md
from sklearn.preprocessing import StandardScaler
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import *
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import VotingClassifier

1. Analysis of the dataset:

In [2]:
dfWine = pd.read_csv("winequality-white.csv", sep=";")
In [3]:
dfWine.head()
Out[3]:
fixed acidity volatile acidity citric acid residual sugar chlorides free sulfur dioxide total sulfur dioxide density pH sulphates alcohol quality
0 7.0 0.27 0.36 20.7 0.045 45.0 170.0 1.0010 3.00 0.45 8.8 6
1 6.3 0.30 0.34 1.6 0.049 14.0 132.0 0.9940 3.30 0.49 9.5 6
2 8.1 0.28 0.40 6.9 0.050 30.0 97.0 0.9951 3.26 0.44 10.1 6
3 7.2 0.23 0.32 8.5 0.058 47.0 186.0 0.9956 3.19 0.40 9.9 6
4 7.2 0.23 0.32 8.5 0.058 47.0 186.0 0.9956 3.19 0.40 9.9 6
In [4]:
dfWine.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4898 entries, 0 to 4897
Data columns (total 12 columns):
fixed acidity           4898 non-null float64
volatile acidity        4898 non-null float64
citric acid             4898 non-null float64
residual sugar          4898 non-null float64
chlorides               4898 non-null float64
free sulfur dioxide     4898 non-null float64
total sulfur dioxide    4898 non-null float64
density                 4898 non-null float64
pH                      4898 non-null float64
sulphates               4898 non-null float64
alcohol                 4898 non-null float64
quality                 4898 non-null int64
dtypes: float64(11), int64(1)
memory usage: 459.3 KB

We can see that there are not any null values and all the datatypes seem correct.

In [5]:
dfWine.describe()
Out[5]:
fixed acidity volatile acidity citric acid residual sugar chlorides free sulfur dioxide total sulfur dioxide density pH sulphates alcohol quality
count 4898.000000 4898.000000 4898.000000 4898.000000 4898.000000 4898.000000 4898.000000 4898.000000 4898.000000 4898.000000 4898.000000 4898.000000
mean 6.854788 0.278241 0.334192 6.391415 0.045772 35.308085 138.360657 0.994027 3.188267 0.489847 10.514267 5.877909
std 0.843868 0.100795 0.121020 5.072058 0.021848 17.007137 42.498065 0.002991 0.151001 0.114126 1.230621 0.885639
min 3.800000 0.080000 0.000000 0.600000 0.009000 2.000000 9.000000 0.987110 2.720000 0.220000 8.000000 3.000000
25% 6.300000 0.210000 0.270000 1.700000 0.036000 23.000000 108.000000 0.991723 3.090000 0.410000 9.500000 5.000000
50% 6.800000 0.260000 0.320000 5.200000 0.043000 34.000000 134.000000 0.993740 3.180000 0.470000 10.400000 6.000000
75% 7.300000 0.320000 0.390000 9.900000 0.050000 46.000000 167.000000 0.996100 3.280000 0.550000 11.400000 6.000000
max 14.200000 1.100000 1.660000 65.800000 0.346000 289.000000 440.000000 1.038980 3.820000 1.080000 14.200000 9.000000
In [6]:
cmap = LinearSegmentedColormap.from_list('mycmap', ['#4cc9f0','#4361ee','#3a0ca3'])

plt.figure(figsize=(13,8))
ax = sns.heatmap(dfWine.corr().round(decimals=2), cmap=cmap, annot=True)
plt.title("Correlation Matrix", fontsize=20)
plt.show()

The feature alcohol seems to have a higher correlation with the wine quality, with a correlation of 0.44.

Our goal here is to predict the quality of a wine based on its composition. So in our dataset, the target variable will be "quality".

In [7]:
columns = ['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar', 'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol']
features = dfWine[columns]
target = dfWine['quality']
In [8]:
scaler = StandardScaler()
scaler.fit(features)
scaled_features = scaler.transform(features)
In [9]:
colorPalette = ['#4cc9f0','#4361ee','#3a0ca3']
fig = plt.figure(figsize=(30,15))
sns.boxplot(data=scaled_features, palette=colorPalette)
# plt.axes().set_xticklabels(['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar', 'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol'], fontsize=13)
plt.xticks(np.arange(11),['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar', 'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol'], fontsize=15)
plt.yticks(fontsize=15)
plt.title("Outliers by features\n",fontsize=30)
Out[9]:
Text(0.5, 1.0, 'Outliers by features\n')

Features like "chlorides" or "volatile acidity" present many outliers. Let's treat them by replacing the outliers in each features by the median.

In [10]:
df_scaled_features = pd.DataFrame(scaled_features)
df_scaled_features.columns= ['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar', 'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol']
df_scaled_features.head()
Out[10]:
fixed acidity volatile acidity citric acid residual sugar chlorides free sulfur dioxide total sulfur dioxide density pH sulphates alcohol
0 0.172097 -0.081770 0.213280 2.821349 -0.035355 0.569932 0.744565 2.331512 -1.246921 -0.349184 -1.393152
1 -0.657501 0.215896 0.048001 -0.944765 0.147747 -1.253019 -0.149685 -0.009154 0.740029 0.001342 -0.824276
2 1.475751 0.017452 0.543838 0.100282 0.193523 -0.312141 -0.973336 0.358665 0.475102 -0.436816 -0.336667
3 0.409125 -0.478657 -0.117278 0.415768 0.559727 0.687541 1.121091 0.525855 0.011480 -0.787342 -0.499203
4 0.409125 -0.478657 -0.117278 0.415768 0.559727 0.687541 1.121091 0.525855 0.011480 -0.787342 -0.499203
In [11]:
for i in df_scaled_features.columns:
    for j in df_scaled_features[i]:
        q1 = df_scaled_features[i].quantile(0.25)
        q3 = df_scaled_features[i].quantile(0.75)
        iqr = q3-q1
        Lower_tail = q1 - 1.5 * iqr
        Upper_tail = q3 + 1.5 * iqr
        if j > Upper_tail or j < Lower_tail:
            df_scaled_features[i] = df_scaled_features[i].replace(j, np.mean(df_scaled_features[i]))
In [12]:
fig = plt.figure(figsize=(20,10))
sns.boxplot(data=df_scaled_features, palette=colorPalette)
plt.title("Processed outliers by features\n",fontsize=18)
Out[12]:
Text(0.5, 1.0, 'Processed outliers by features\n')

We can see that we don't have outlier anymore.

2. Machine Learning and Prediction:

a. Regression

Linear Regression¶

In [13]:
#We will do the regression only with the feature alcohol
alcohol = df_scaled_features.iloc[:,10].values.reshape((len(target),1))
targetReg = target.values.reshape((len(target),1))
print(alcohol.shape)
print(targetReg.shape)
(4898, 1)
(4898, 1)
In [14]:
X_train,X_test,y_train,y_test = train_test_split(alcohol, targetReg, test_size=0.2, random_state=0)

print(X_train)
[[-0.01159456]
 [ 1.53249956]
 [ 0.80108656]
 ...
 [-1.39315246]
 [ 0.23220977]
 [-0.41793512]]
In [15]:
lin_regr = linear_model.LinearRegression()
lin_regr.fit(X_train, y_train)
Out[15]:
LinearRegression()
In [16]:
predicted_y = lin_regr.predict(X_test)
In [17]:
md(f"Mean squared error: **<font color=#3a0ca3>{mean_squared_error(y_test,predicted_y):.2f}</font>**")
Out[17]:

Mean squared error: 0.73

In [18]:
md(f"Root mean squared error: **<font color=#3a0ca3>{mean_squared_error(y_test,predicted_y, squared=False):.2f}</font>**")
Out[18]:

Root mean squared error: 0.85

In [19]:
fig = plt.figure(figsize=(10,8))
plt.plot(alcohol, targetReg, "b.", color = "#3a0ca3")
plt.xlabel("Alcohol", fontsize=15)
plt.ylabel("Quality", fontsize=15)
plt.tight_layout()
In [20]:
fig = plt.figure(figsize=(10,8))
plt.plot(X_test,predicted_y, "r-", linewidth=2, label="Target Predictions", color="#f72585")
plt.plot( X_train,y_train, "b.", color="#3a0ca3")
plt.xlabel("Alcohol", fontsize=15)
plt.ylabel("Quality", fontsize=15)
plt.legend(loc="upper left", fontsize=13)
plt.show()
In [21]:
df_linear_regr = pd.DataFrame({'Actual': y_test.flatten(), 'Predicted': predicted_y.flatten()})
df_linear_regr.head(5)
Out[21]:
Actual Predicted
0 5 5.948460
1 6 5.668554
2 7 5.979560
3 8 6.632673
4 5 5.761856
In [22]:
df_linear_regr_bar = df_linear_regr.head(25)
df_linear_regr_bar.plot(kind='bar',color = ['#3a0ca3', '#f72585'], figsize=(16,10))
plt.title("Prediction of wine quality based on alcohol\n",fontsize=18)
plt.xlabel("Alcohol", fontsize=15)
plt.ylabel("Quality", fontsize=15)
Out[22]:
Text(0, 0.5, 'Quality')

Logistic regression¶

In [23]:
X_train,X_test,y_train,y_test = train_test_split(scaled_features, targetReg, test_size=0.2, random_state=0)
In [24]:
log_reg = linear_model.LogisticRegression(solver ='newton-cg', multi_class='multinomial',max_iter=1000)
log_reg.fit(X_train,y_train.ravel())
Out[24]:
LogisticRegression(max_iter=1000, multi_class='multinomial', solver='newton-cg')
In [25]:
y_hat = log_reg.predict(X_test)
In [26]:
acc = accuracy_score(y_hat,y_test)
md(f"Accuracy: **<font color=#3a0ca3>{100*acc:.2f}%</font>**")
Out[26]:

Accuracy: 51.22%

In [27]:
md(f"Mean squared error: **<font color=#3a0ca3>{mean_squared_error(y_test,y_hat):.2f}</font>**")
Out[27]:

Mean squared error: 0.73

In [28]:
md(f"Root mean squared error: **<font color=#3a0ca3>{mean_squared_error(y_test,y_hat, squared=False):.2f}</font>**")
Out[28]:

Root mean squared error: 0.85

In [29]:
df_logist_regr = pd.DataFrame({'Actual': y_test.flatten(), 'Predicted': y_hat.flatten()})
df_logist_regr.head(5)
Out[29]:
Actual Predicted
0 5 5
1 6 5
2 7 6
3 8 7
4 5 6
In [30]:
df_logist_regr_bar = df_logist_regr.head(25)
df_logist_regr_bar.plot(kind='bar',color = ['#3a0ca3', '#f72585'], figsize=(16,10))
plt.title("Prediction of wine quality based on alcohol",fontsize=18)
plt.xlabel("Alcohol", fontsize=15)
plt.ylabel("Quality", fontsize=15)
Out[30]:
Text(0, 0.5, 'Quality')

b. Classification

KNN¶

In [70]:
X_trainKnn,X_testKnn,y_trainKnn,y_testKnn = train_test_split(df_scaled_features, targetReg, test_size=0.2, random_state=0)
In [32]:
Knn = KNeighborsClassifier (n_neighbors=10,p=1)
Knn.fit(X_trainKnn,y_trainKnn.ravel())
Out[32]:
KNeighborsClassifier(n_neighbors=10, p=1)
In [33]:
y_pred_knn=Knn.predict(X_testKnn)
In [34]:
print("Wrong values predicted out of total values : ")
print((y_testKnn!=y_pred_knn).sum(),'/',((y_testKnn==y_pred_knn).sum()+(y_testKnn!=y_pred_knn).sum()))
Wrong values predicted out of total values : 
630176 / 960400
In [35]:
md(f"Accuracy using **<font color=#3a0ca3>KNN</font>** is: **<font color=#3a0ca3>{100*accuracy_score(y_testKnn,y_pred_knn):.2f}%</font>**")
Out[35]:

Accuracy using KNN is: 51.02%

In [36]:
adjusted_rand_score(y_testKnn.ravel(), y_pred_knn)
Out[36]:
0.09186932267667855
In [37]:
dfKnn = pd.DataFrame(X_testKnn)
In [38]:
dfKnn.columns = ['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar', 'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol']
In [39]:
dfKnn['True'] = y_testKnn
dfKnn['Predict'] = y_pred_knn
dfKnn.reset_index(inplace = True, drop = True)
In [40]:
dfKnn.head()
Out[40]:
fixed acidity volatile acidity citric acid residual sugar chlorides free sulfur dioxide total sulfur dioxide density pH sulphates alcohol True Predict
0 0.527639 0.414339 0.130641 -0.984201 0.193523 -1.605848 0.579835 -0.530788 0.342639 -0.612079 0.150942 5 5
1 0.172097 0.315117 -0.613115 0.198872 -0.080834 -0.429751 0.509236 0.458979 -0.385910 -0.261553 -0.580471 6 5
2 0.883181 -1.371654 -0.046493 -0.944765 -0.264233 -0.488556 -0.832138 -0.811668 -0.783300 -0.787342 0.232210 7 6
3 -2.198183 0.116674 -0.030487 -0.136332 -0.493110 1.099175 0.391572 -1.426929 0.541334 -1.313131 1.938840 8 6
4 -1.013043 0.017452 -0.943673 1.135471 0.101972 0.393517 0.579835 0.559293 0.077712 -0.261553 -0.336667 5 5
In [41]:
dfKnn['True'] = pd.Categorical(dfKnn['True'])
dfKnn['Predict'] = pd.Categorical(dfKnn['Predict'])
In [42]:
dfKnn.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 980 entries, 0 to 979
Data columns (total 13 columns):
fixed acidity           980 non-null float64
volatile acidity        980 non-null float64
citric acid             980 non-null float64
residual sugar          980 non-null float64
chlorides               980 non-null float64
free sulfur dioxide     980 non-null float64
total sulfur dioxide    980 non-null float64
density                 980 non-null float64
pH                      980 non-null float64
sulphates               980 non-null float64
alcohol                 980 non-null float64
True                    980 non-null category
Predict                 980 non-null category
dtypes: category(2), float64(11)
memory usage: 86.6 KB
In [43]:
paletteScatter = {3:'#717EC3', 4:'#3A86FF', 5:'#8338EC', 6:'#FF006E', 7:'#FB5607', 8:'#FFBE0B'}
fig = plt.figure(figsize=(15,10), dpi=200)
plt.suptitle("KNN prediction result",fontsize=18, y=0.92)
plt.subplot(121)
sns.scatterplot(data=dfKnn, x=dfKnn['alcohol'], y=dfKnn['density'], hue=dfKnn['Predict'], palette=paletteScatter)
plt.ylim(ymax=3.0)
plt.xlabel("Alcohol",fontsize=15)
plt.ylabel("Density",fontsize=15)
plt.subplot(122)
sns.scatterplot(data=dfKnn, x=dfKnn['alcohol'], y=dfKnn['density'], hue=dfKnn['True'], palette=paletteScatter)
plt.ylim(ymax=3.0)
plt.xlabel("Alcohol",fontsize=15)
plt.ylabel("Density",fontsize=15)
C:\Users\mayou\Anaconda3\lib\site-packages\matplotlib\axes\_base.py:3610: MatplotlibDeprecationWarning: 
The `ymax` argument was deprecated in Matplotlib 3.0 and will be removed in 3.2. Use `top` instead.
  alternative='`top`', obj_type='argument')
C:\Users\mayou\Anaconda3\lib\site-packages\matplotlib\axes\_base.py:3610: MatplotlibDeprecationWarning: 
The `ymax` argument was deprecated in Matplotlib 3.0 and will be removed in 3.2. Use `top` instead.
  alternative='`top`', obj_type='argument')
Out[43]:
Text(0, 0.5, 'Density')

Decision Tree¶

In [44]:
X_trainDT,X_testDT,y_trainDT,y_testDT = train_test_split(df_scaled_features, targetReg, test_size=0.2, random_state=0)
In [45]:
tree = DecisionTreeClassifier(max_depth=4,random_state=0)
tree.fit(X_trainDT, y_trainDT.ravel())
Out[45]:
DecisionTreeClassifier(max_depth=4, random_state=0)
In [46]:
y_pred_dt = tree.predict(X_testDT)
In [47]:
print(classification_report(y_testDT, y_pred_dt.round(), digits=3))
              precision    recall  f1-score   support

           3      0.000     0.000     0.000         9
           4      0.000     0.000     0.000        51
           5      0.535     0.522     0.528       295
           6      0.480     0.638     0.548       409
           7      0.392     0.317     0.350       183
           8      0.000     0.000     0.000        33

    accuracy                          0.483       980
   macro avg      0.234     0.246     0.238       980
weighted avg      0.434     0.483     0.453       980

C:\Users\mayou\Anaconda3\lib\site-packages\sklearn\metrics\_classification.py:1318: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
C:\Users\mayou\Anaconda3\lib\site-packages\sklearn\metrics\_classification.py:1318: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
C:\Users\mayou\Anaconda3\lib\site-packages\sklearn\metrics\_classification.py:1318: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
In [48]:
print("Wrong values predicted out of total values : ")
print((y_testDT!=y_pred_dt).sum(),'/',((y_testDT==y_pred_dt).sum()+(y_testDT!=y_pred_dt).sum()))
Wrong values predicted out of total values : 
625860 / 960400
In [49]:
md(f"Accuracy using **<font color=#3a0ca3>Decision Tree</font>** is: **<font color=#3a0ca3>{100*accuracy_score(y_testDT,y_pred_dt):.2f}%</font>**")
Out[49]:

Accuracy using Decision Tree is: 48.27%

In [50]:
adjusted_rand_score(y_testDT.ravel(), y_pred_dt)
Out[50]:
0.07016693981479545
In [51]:
dfTree = pd.DataFrame(X_testDT)
dfTree.columns = ['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar', 'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol']
dfTree['True'] = y_testDT
dfTree['Predict'] = y_pred_dt
dfTree.head()
Out[51]:
fixed acidity volatile acidity citric acid residual sugar chlorides free sulfur dioxide total sulfur dioxide density pH sulphates alcohol True Predict
2762 0.527639 0.414339 0.130641 -0.984201 0.193523 -1.605848 0.579835 -0.530788 0.342639 -0.612079 0.150942 5 5
42 0.172097 0.315117 -0.613115 0.198872 -0.080834 -0.429751 0.509236 0.458979 -0.385910 -0.261553 -0.580471 6 5
1419 0.883181 -1.371654 -0.046493 -0.944765 -0.264233 -0.488556 -0.832138 -0.811668 -0.783300 -0.787342 0.232210 7 6
3664 -2.198183 0.116674 -0.030487 -0.136332 -0.493110 1.099175 0.391572 -1.426929 0.541334 -1.313131 1.938840 8 7
2125 -1.013043 0.017452 -0.943673 1.135471 0.101972 0.393517 0.579835 0.559293 0.077712 -0.261553 -0.336667 5 6
In [52]:
dfTree.reset_index(inplace = True, drop = True)
In [53]:
dfTree.head()
Out[53]:
fixed acidity volatile acidity citric acid residual sugar chlorides free sulfur dioxide total sulfur dioxide density pH sulphates alcohol True Predict
0 0.527639 0.414339 0.130641 -0.984201 0.193523 -1.605848 0.579835 -0.530788 0.342639 -0.612079 0.150942 5 5
1 0.172097 0.315117 -0.613115 0.198872 -0.080834 -0.429751 0.509236 0.458979 -0.385910 -0.261553 -0.580471 6 5
2 0.883181 -1.371654 -0.046493 -0.944765 -0.264233 -0.488556 -0.832138 -0.811668 -0.783300 -0.787342 0.232210 7 6
3 -2.198183 0.116674 -0.030487 -0.136332 -0.493110 1.099175 0.391572 -1.426929 0.541334 -1.313131 1.938840 8 7
4 -1.013043 0.017452 -0.943673 1.135471 0.101972 0.393517 0.579835 0.559293 0.077712 -0.261553 -0.336667 5 6
In [54]:
dfTree['True'] = pd.Categorical(dfTree['True'])
dfTree['Predict'] = pd.Categorical(dfTree['Predict'])
dfTree.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 980 entries, 0 to 979
Data columns (total 13 columns):
fixed acidity           980 non-null float64
volatile acidity        980 non-null float64
citric acid             980 non-null float64
residual sugar          980 non-null float64
chlorides               980 non-null float64
free sulfur dioxide     980 non-null float64
total sulfur dioxide    980 non-null float64
density                 980 non-null float64
pH                      980 non-null float64
sulphates               980 non-null float64
alcohol                 980 non-null float64
True                    980 non-null category
Predict                 980 non-null category
dtypes: category(2), float64(11)
memory usage: 86.5 KB
In [55]:
paletteScatter = {3:'#717EC3', 4:'#3A86FF', 5:'#8338EC', 6:'#FF006E', 7:'#FB5607', 8:'#FFBE0B'}
fig = plt.figure(figsize=(15,10))
plt.suptitle("Decision Tree prediction result",fontsize=18, y=0.92)
plt.subplot(121)
sns.scatterplot(data=dfTree, x=dfTree['alcohol'], y=dfTree['density'], hue=dfTree['Predict'], palette=paletteScatter)
plt.ylim(ymax=3.0)
plt.xlabel("Alcohol",fontsize=15)
plt.ylabel("Density",fontsize=15)
plt.subplot(122)
sns.scatterplot(data=dfTree, x=dfTree['alcohol'], y=dfTree['density'], hue=dfTree['True'], palette=paletteScatter)
plt.ylim(ymax=3.0)
plt.xlabel("Alcohol",fontsize=15)
plt.ylabel("Density",fontsize=15)
C:\Users\mayou\Anaconda3\lib\site-packages\matplotlib\axes\_base.py:3610: MatplotlibDeprecationWarning: 
The `ymax` argument was deprecated in Matplotlib 3.0 and will be removed in 3.2. Use `top` instead.
  alternative='`top`', obj_type='argument')
C:\Users\mayou\Anaconda3\lib\site-packages\matplotlib\axes\_base.py:3610: MatplotlibDeprecationWarning: 
The `ymax` argument was deprecated in Matplotlib 3.0 and will be removed in 3.2. Use `top` instead.
  alternative='`top`', obj_type='argument')
Out[55]:
Text(0, 0.5, 'Density')

SVC¶

In [56]:
X_trainSvc,X_testSvc,y_trainSvc,y_testSvc = train_test_split(df_scaled_features, targetReg, test_size=0.2, random_state=0)
In [57]:
svc_model = SVC()
svc_model.fit(X_train, y_train.ravel())

y_pred_svc = svc_model.predict(X_testSvc)
C:\Users\mayou\Anaconda3\lib\site-packages\sklearn\base.py:444: UserWarning: X has feature names, but SVC was fitted without feature names
  f"X has feature names, but {self.__class__.__name__} was fitted without"
In [58]:
md(f"Accuracy using **<font color=#3a0ca3>SVC</font>** is: **<font color=#3a0ca3>{100*accuracy_score(y_testSvc,y_pred_svc):.2f}%</font>**")
Out[58]:

Accuracy using SVC is: 53.47%

In [59]:
print(adjusted_rand_score(y_testSvc.ravel(), y_pred_svc))
0.1133982159886251
In [60]:
dfSvc = pd.DataFrame(X_testSvc)
dfSvc.columns = ['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar', 'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol']
In [61]:
dfSvc['True'] = y_testSvc
dfSvc['Predict'] = y_pred_svc
dfSvc.reset_index(inplace = True, drop = True)
dfSvc.head()
Out[61]:
fixed acidity volatile acidity citric acid residual sugar chlorides free sulfur dioxide total sulfur dioxide density pH sulphates alcohol True Predict
0 0.527639 0.414339 0.130641 -0.984201 0.193523 -1.605848 0.579835 -0.530788 0.342639 -0.612079 0.150942 5 5
1 0.172097 0.315117 -0.613115 0.198872 -0.080834 -0.429751 0.509236 0.458979 -0.385910 -0.261553 -0.580471 6 5
2 0.883181 -1.371654 -0.046493 -0.944765 -0.264233 -0.488556 -0.832138 -0.811668 -0.783300 -0.787342 0.232210 7 6
3 -2.198183 0.116674 -0.030487 -0.136332 -0.493110 1.099175 0.391572 -1.426929 0.541334 -1.313131 1.938840 8 7
4 -1.013043 0.017452 -0.943673 1.135471 0.101972 0.393517 0.579835 0.559293 0.077712 -0.261553 -0.336667 5 6
In [62]:
fig = plt.figure(figsize=(15,10))
plt.suptitle("SVC prediction result",fontsize=18, y=0.92)
plt.subplot(121)
sns.scatterplot(data=dfSvc, x=dfSvc['alcohol'], y=dfSvc['density'], hue=dfSvc['Predict'], palette=paletteScatter)
plt.ylim(ymax=3.0)
plt.xlabel("Alcohol",fontsize=15)
plt.ylabel("Density",fontsize=15)
plt.subplot(122)
sns.scatterplot(data=dfSvc, x=dfSvc['alcohol'], y=dfSvc['density'], hue=dfSvc['True'], palette=paletteScatter)
plt.ylim(ymax=3.0)
plt.xlabel("Alcohol",fontsize=15)
plt.ylabel("Density",fontsize=15)
C:\Users\mayou\Anaconda3\lib\site-packages\matplotlib\axes\_base.py:3610: MatplotlibDeprecationWarning: 
The `ymax` argument was deprecated in Matplotlib 3.0 and will be removed in 3.2. Use `top` instead.
  alternative='`top`', obj_type='argument')
C:\Users\mayou\Anaconda3\lib\site-packages\matplotlib\axes\_base.py:3610: MatplotlibDeprecationWarning: 
The `ymax` argument was deprecated in Matplotlib 3.0 and will be removed in 3.2. Use `top` instead.
  alternative='`top`', obj_type='argument')
Out[62]:
Text(0, 0.5, 'Density')
In [63]:
knnAccuracy = round(100*accuracy_score(y_testKnn,y_pred_knn),2)
dtAccuracy = round(100*accuracy_score(y_testDT,y_pred_dt),2)
svcAccuracy = round(100*accuracy_score(y_testSvc,y_pred_svc),2)
In [64]:
fig = plt.figure(figsize=(10,8))
plt.bar(x=['KNN','Decision Tree', 'SVC'], height=[knnAccuracy,dtAccuracy,svcAccuracy], 
        width=0.6, 
        color=['#cfd2cd','#e5e6e4','#FF006E'])
plt.text(0,knnAccuracy-5,f'{knnAccuracy}%',ha='center',fontsize=15, fontweight='bold')
plt.text(1,dtAccuracy-5,f'{dtAccuracy}%',ha='center',fontsize=15, fontweight='bold')
plt.text(2,svcAccuracy-5,f'{svcAccuracy}%',ha='center',fontsize=15, fontweight='bold', color='white')
plt.text(0.16,0.89,'SVC',transform=fig.transFigure, fontsize=16, fontweight='bold', color='#FF006E')
plt.text(0.22,0.89,'is the model that has the',transform=fig.transFigure, fontsize=16)
plt.text(0.51,0.89,'highest accuracy.',transform=fig.transFigure, fontsize=16, fontweight='bold', color='#FF006E')
plt.xticks(fontsize=12)
plt.yticks([])
plt.title("Models' accuracy comparison", fontsize=18, y=1.08)
plt.xlabel('Models',fontsize=15)
plt.gca().get_xticklabels()[2].set_color('#FF006E') 
sns.despine(left=True)

c. Optimisation

KNN¶

In [65]:
knn = KNeighborsClassifier()
param_grid = {"n_neighbors": np.arange(1, 32),
              "weights":['uniform','distance'],
              "metric":['minkowski','euclidean','manhattan']
             }
knn_gscv = GridSearchCV(knn, param_grid, cv=5)
knn_gscv.fit(X_trainKnn, y_trainKnn.ravel())
Out[65]:
GridSearchCV(cv=5, estimator=KNeighborsClassifier(),
             param_grid={'metric': ['minkowski', 'euclidean', 'manhattan'],
                         'n_neighbors': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]),
                         'weights': ['uniform', 'distance']})
In [66]:
knn_gscv.best_params_
Out[66]:
{'metric': 'manhattan', 'n_neighbors': 28, 'weights': 'distance'}
In [67]:
knn_gscv.best_score_
Out[67]:
0.6641195558683244
In [68]:
modelKnn = knn_gscv.best_estimator_
In [71]:
knnOpti = modelKnn.score(X_testKnn,y_testKnn)
md(f"**<font color=#3a0ca3>Optimised KNN</font>** accuracy: **<font color=#3a0ca3>{100*knnOpti:.2f}%</font>**")
Out[71]:

Optimised KNN accuracy: 63.27%

We can see that the best hyper-parameters are:

  • metric: Manhattan
  • n_neighbors: 28
  • weights: distance

The optimised model's accuracy is 63.27% while the initial one is 51.02%.

Decision Tree¶

In [72]:
dec_tree = DecisionTreeClassifier()
paramTree_grid = {"criterion":["gini","entropy"],
                  "max_depth":np.arange(1, 32)
                 }
dtree_gscv = GridSearchCV(dec_tree, paramTree_grid, cv=5)
dtree_gscv.fit(X_trainDT, y_trainDT.ravel())
Out[72]:
GridSearchCV(cv=5, estimator=DecisionTreeClassifier(),
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31])})
In [73]:
dtree_gscv.best_params_
Out[73]:
{'criterion': 'entropy', 'max_depth': 24}
In [74]:
dtree_gscv.best_score_
Out[74]:
0.5890755727578388
In [75]:
modelDT = dtree_gscv.best_estimator_
In [76]:
dtOpti = modelDT.score(X_testDT,y_testDT)
md(f"**<font color=#3a0ca3>Optimised Decision Tree</font>** accuracy: **<font color=#3a0ca3>{100*dtOpti:.2f}%</font>**")
C:\Users\mayou\Anaconda3\lib\site-packages\sklearn\base.py:493: FutureWarning: The feature names should match those that were passed during fit. Starting version 1.2, an error will be raised.
Feature names unseen at fit time:
- Predict
- True
Feature names must be in the same order as they were in fit.

  warnings.warn(message, FutureWarning)
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-76-8f73aa2fefba> in <module>
----> 1 dtOpti = modelDT.score(X_testDT,y_testDT)
      2 md(f"**<font color=#3a0ca3>Optimised Decision Tree</font>** accuracy: **<font color=#3a0ca3>{100*dtOpti:.2f}%</font>**")

~\Anaconda3\lib\site-packages\sklearn\base.py in score(self, X, y, sample_weight)
    649         from .metrics import accuracy_score
    650 
--> 651         return accuracy_score(y, self.predict(X), sample_weight=sample_weight)
    652 
    653     def _more_tags(self):

~\Anaconda3\lib\site-packages\sklearn\tree\_classes.py in predict(self, X, check_input)
    465         """
    466         check_is_fitted(self)
--> 467         X = self._validate_X_predict(X, check_input)
    468         proba = self.tree_.predict(X)
    469         n_samples = X.shape[0]

~\Anaconda3\lib\site-packages\sklearn\tree\_classes.py in _validate_X_predict(self, X, check_input)
    431         """Validate the training data on predict (probabilities)."""
    432         if check_input:
--> 433             X = self._validate_data(X, dtype=DTYPE, accept_sparse="csr", reset=False)
    434             if issparse(X) and (
    435                 X.indices.dtype != np.intc or X.indptr.dtype != np.intc

~\Anaconda3\lib\site-packages\sklearn\base.py in _validate_data(self, X, y, reset, validate_separately, **check_params)
    583 
    584         if not no_val_X and check_params.get("ensure_2d", True):
--> 585             self._check_n_features(X, reset=reset)
    586 
    587         return out

~\Anaconda3\lib\site-packages\sklearn\base.py in _check_n_features(self, X, reset)
    399         if n_features != self.n_features_in_:
    400             raise ValueError(
--> 401                 f"X has {n_features} features, but {self.__class__.__name__} "
    402                 f"is expecting {self.n_features_in_} features as input."
    403             )

ValueError: X has 13 features, but DecisionTreeClassifier is expecting 11 features as input.

We can see that the best hyper-parameters are:

  • criterion: gini
  • max_depth: 31

The optimised model's accuracy is 56.73% while the initial one is 48.27%.

SVC¶

In [ ]:
svc = SVC()
paramSvc_grid = {'C': [0.1, 1, 10, 100, 1000],
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['rbf']}

svc_gscv = GridSearchCV(svc, paramSvc_grid, cv=5)
svc_gscv.fit(X_trainSvc, y_trainSvc.ravel())
In [ ]:
svc_gscv.best_params_
In [ ]:
svc_gscv.best_score_
In [ ]:
modelsvc = svc_gscv.best_estimator_
In [ ]:
svcOpti = modelsvc.score(X_testSvc,y_testSvc)
md(f"**<font color=#3a0ca3>Optimised SVC</font>** accuracy: **<font color=#3a0ca3>{100*svcOpti:.2f}%</font>**")

We can see that the best hyper-parameters are:

  • C: 10
  • gamma: 1
  • kernel: rbf

The optimised model's accuracy is 62.45% while the initial one is 53.47%.

In [ ]:
knnOptimal = round(100*knnOpti,2)
dtOptimal = round(100*dtOpti,2)
svcOptimal = round(100*svcOpti,2)
In [ ]:
fig = plt.figure(figsize=(10,8))
plt.bar(x=['KNN','Decision Tree', 'SVC'], height=[knnOptimal,dtOptimal,svcOptimal], 
        width=0.6, 
        color=['#3a0ca3','#e5e6e4','#cfd2cd'])
plt.text(0,knnOptimal-5,f'{knnOptimal}%',ha='center',fontsize=15, fontweight='bold', color='white')
plt.text(1,dtOptimal-5,f'{dtOptimal}%',ha='center',fontsize=15, fontweight='bold')
plt.text(2,svcOptimal-5,f'{svcOptimal}%',ha='center',fontsize=15, fontweight='bold')
plt.text(0.16,0.89,'KNN',transform=fig.transFigure, fontsize=16, fontweight='bold', color='#3a0ca3')
plt.text(0.22,0.89,'is the optimized model that has the',transform=fig.transFigure, fontsize=16)
plt.text(0.626,0.89,'highest accuracy.',transform=fig.transFigure, fontsize=16, fontweight='bold', color='#3a0ca3')
plt.xticks(fontsize=12)
plt.yticks([])
plt.title("Optimized models' accuracy comparison", fontsize=18, y=1.08)
plt.xlabel('Models',fontsize=15)
plt.gca().get_xticklabels()[0].set_color('#3a0ca3') 
sns.despine(left=True)

Finally, our best model seems to be the KNN model.

d. Improvement

So far we have tested different classifier's methods and even found the best hyper-parameters for each of them to increase their accuracy. But the accuracy can still be improved by combining those prediction using ensemble methods. In our case we will use the voting classifier, the bagging classifier with KNN as the base estimator and finally the Random forest which is also an ensemble method.

Voting classifier¶

In [ ]:
X_trainVC,X_testVC,y_trainVC,y_testVC = train_test_split(df_scaled_features, targetReg, test_size=0.2, random_state=0)
In [ ]:
estimator = []
estimator.append(('KNN', KNeighborsClassifier(n_neighbors = 28, weights = 'distance', metric = 'manhattan')))
estimator.append(('DTC', DecisionTreeClassifier(criterion = 'gini', max_depth = 31)))
estimator.append(('SVC', SVC(C=10, gamma=1, kernel='rbf')))
In [ ]:
vot_hard = VotingClassifier(estimators = estimator, voting ='hard')
vot_hard.fit(X_trainVC, y_trainVC.ravel())
y_pred = vot_hard.predict(X_testVC)

md(f"Hard Voting Score: **<font color=#3a0ca3>{100*accuracy_score(y_testVC, y_pred):.2f}%</font>**")



# vot_soft = VotingClassifier(estimators = estimator, voting ='soft')
# vot_soft.fit(X_trainVC, y_trainVC.ravel())
# y_pred = vot_soft.predict(X_testVC)

# print("Soft Voting Score:",accuracy_score(y_testVC, y_pred))

Bagging classifier¶

In [ ]:
from sklearn.ensemble import BaggingClassifier
In [ ]:
model = BaggingClassifier(base_estimator=KNeighborsClassifier(n_neighbors = 28, weights = 'distance', metric = 'manhattan'),n_estimators=100)
In [ ]:
model.fit(X_trainKnn, y_trainKnn.ravel())
baggingScore = model.score(X_testKnn, y_testKnn.ravel())

md(f"Bagging classifier Score: **<font color=#3a0ca3>{100*baggingScore:.2f}%</font>**")

Random Forest¶

In [ ]:
X_trainRF,X_testRF,y_trainRF,y_testRF = train_test_split(df_scaled_features, targetReg, test_size=0.2, random_state=0)
In [ ]:
forest = RandomForestClassifier(n_estimators=100, criterion='gini', max_depth=31)
forest.fit(X_trainRF, y_trainRF.ravel())

y_pred_rf = forest.predict(X_testRF)
In [ ]:
print(classification_report(y_testRF, y_pred_rf.round(), digits=3))
In [ ]:
print("Wrong values predicted out of total values : ")
print((y_testRF!=y_pred_rf).sum(),'/',((y_testRF==y_pred_rf).sum()+(y_testRF!=y_pred_rf).sum()))
In [ ]:
md(f"Random Forest accuracy: **<font color=#3a0ca3>{100*accuracy_score(y_testRF,y_pred_rf):.2f}%</font>**")