import pandas as pd
import numpy as np
from matplotlib.colors import LinearSegmentedColormap
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import Markdown as md
from sklearn.preprocessing import StandardScaler
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import *
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import VotingClassifier
dfWine = pd.read_csv("winequality-white.csv", sep=";")
dfWine.head()
| fixed acidity | volatile acidity | citric acid | residual sugar | chlorides | free sulfur dioxide | total sulfur dioxide | density | pH | sulphates | alcohol | quality | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 7.0 | 0.27 | 0.36 | 20.7 | 0.045 | 45.0 | 170.0 | 1.0010 | 3.00 | 0.45 | 8.8 | 6 |
| 1 | 6.3 | 0.30 | 0.34 | 1.6 | 0.049 | 14.0 | 132.0 | 0.9940 | 3.30 | 0.49 | 9.5 | 6 |
| 2 | 8.1 | 0.28 | 0.40 | 6.9 | 0.050 | 30.0 | 97.0 | 0.9951 | 3.26 | 0.44 | 10.1 | 6 |
| 3 | 7.2 | 0.23 | 0.32 | 8.5 | 0.058 | 47.0 | 186.0 | 0.9956 | 3.19 | 0.40 | 9.9 | 6 |
| 4 | 7.2 | 0.23 | 0.32 | 8.5 | 0.058 | 47.0 | 186.0 | 0.9956 | 3.19 | 0.40 | 9.9 | 6 |
dfWine.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 4898 entries, 0 to 4897 Data columns (total 12 columns): fixed acidity 4898 non-null float64 volatile acidity 4898 non-null float64 citric acid 4898 non-null float64 residual sugar 4898 non-null float64 chlorides 4898 non-null float64 free sulfur dioxide 4898 non-null float64 total sulfur dioxide 4898 non-null float64 density 4898 non-null float64 pH 4898 non-null float64 sulphates 4898 non-null float64 alcohol 4898 non-null float64 quality 4898 non-null int64 dtypes: float64(11), int64(1) memory usage: 459.3 KB
We can see that there are not any null values and all the datatypes seem correct.
dfWine.describe()
| fixed acidity | volatile acidity | citric acid | residual sugar | chlorides | free sulfur dioxide | total sulfur dioxide | density | pH | sulphates | alcohol | quality | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 4898.000000 | 4898.000000 | 4898.000000 | 4898.000000 | 4898.000000 | 4898.000000 | 4898.000000 | 4898.000000 | 4898.000000 | 4898.000000 | 4898.000000 | 4898.000000 |
| mean | 6.854788 | 0.278241 | 0.334192 | 6.391415 | 0.045772 | 35.308085 | 138.360657 | 0.994027 | 3.188267 | 0.489847 | 10.514267 | 5.877909 |
| std | 0.843868 | 0.100795 | 0.121020 | 5.072058 | 0.021848 | 17.007137 | 42.498065 | 0.002991 | 0.151001 | 0.114126 | 1.230621 | 0.885639 |
| min | 3.800000 | 0.080000 | 0.000000 | 0.600000 | 0.009000 | 2.000000 | 9.000000 | 0.987110 | 2.720000 | 0.220000 | 8.000000 | 3.000000 |
| 25% | 6.300000 | 0.210000 | 0.270000 | 1.700000 | 0.036000 | 23.000000 | 108.000000 | 0.991723 | 3.090000 | 0.410000 | 9.500000 | 5.000000 |
| 50% | 6.800000 | 0.260000 | 0.320000 | 5.200000 | 0.043000 | 34.000000 | 134.000000 | 0.993740 | 3.180000 | 0.470000 | 10.400000 | 6.000000 |
| 75% | 7.300000 | 0.320000 | 0.390000 | 9.900000 | 0.050000 | 46.000000 | 167.000000 | 0.996100 | 3.280000 | 0.550000 | 11.400000 | 6.000000 |
| max | 14.200000 | 1.100000 | 1.660000 | 65.800000 | 0.346000 | 289.000000 | 440.000000 | 1.038980 | 3.820000 | 1.080000 | 14.200000 | 9.000000 |
cmap = LinearSegmentedColormap.from_list('mycmap', ['#4cc9f0','#4361ee','#3a0ca3'])
plt.figure(figsize=(13,8))
ax = sns.heatmap(dfWine.corr().round(decimals=2), cmap=cmap, annot=True)
plt.title("Correlation Matrix", fontsize=20)
plt.show()
The feature alcohol seems to have a higher correlation with the wine quality, with a correlation of 0.44.
Our goal here is to predict the quality of a wine based on its composition. So in our dataset, the target variable will be "quality".
columns = ['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar', 'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol']
features = dfWine[columns]
target = dfWine['quality']
scaler = StandardScaler()
scaler.fit(features)
scaled_features = scaler.transform(features)
colorPalette = ['#4cc9f0','#4361ee','#3a0ca3']
fig = plt.figure(figsize=(30,15))
sns.boxplot(data=scaled_features, palette=colorPalette)
# plt.axes().set_xticklabels(['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar', 'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol'], fontsize=13)
plt.xticks(np.arange(11),['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar', 'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol'], fontsize=15)
plt.yticks(fontsize=15)
plt.title("Outliers by features\n",fontsize=30)
Text(0.5, 1.0, 'Outliers by features\n')
Features like "chlorides" or "volatile acidity" present many outliers. Let's treat them by replacing the outliers in each features by the median.
df_scaled_features = pd.DataFrame(scaled_features)
df_scaled_features.columns= ['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar', 'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol']
df_scaled_features.head()
| fixed acidity | volatile acidity | citric acid | residual sugar | chlorides | free sulfur dioxide | total sulfur dioxide | density | pH | sulphates | alcohol | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.172097 | -0.081770 | 0.213280 | 2.821349 | -0.035355 | 0.569932 | 0.744565 | 2.331512 | -1.246921 | -0.349184 | -1.393152 |
| 1 | -0.657501 | 0.215896 | 0.048001 | -0.944765 | 0.147747 | -1.253019 | -0.149685 | -0.009154 | 0.740029 | 0.001342 | -0.824276 |
| 2 | 1.475751 | 0.017452 | 0.543838 | 0.100282 | 0.193523 | -0.312141 | -0.973336 | 0.358665 | 0.475102 | -0.436816 | -0.336667 |
| 3 | 0.409125 | -0.478657 | -0.117278 | 0.415768 | 0.559727 | 0.687541 | 1.121091 | 0.525855 | 0.011480 | -0.787342 | -0.499203 |
| 4 | 0.409125 | -0.478657 | -0.117278 | 0.415768 | 0.559727 | 0.687541 | 1.121091 | 0.525855 | 0.011480 | -0.787342 | -0.499203 |
for i in df_scaled_features.columns:
for j in df_scaled_features[i]:
q1 = df_scaled_features[i].quantile(0.25)
q3 = df_scaled_features[i].quantile(0.75)
iqr = q3-q1
Lower_tail = q1 - 1.5 * iqr
Upper_tail = q3 + 1.5 * iqr
if j > Upper_tail or j < Lower_tail:
df_scaled_features[i] = df_scaled_features[i].replace(j, np.mean(df_scaled_features[i]))
fig = plt.figure(figsize=(20,10))
sns.boxplot(data=df_scaled_features, palette=colorPalette)
plt.title("Processed outliers by features\n",fontsize=18)
Text(0.5, 1.0, 'Processed outliers by features\n')
We can see that we don't have outlier anymore.
#We will do the regression only with the feature alcohol
alcohol = df_scaled_features.iloc[:,10].values.reshape((len(target),1))
targetReg = target.values.reshape((len(target),1))
print(alcohol.shape)
print(targetReg.shape)
(4898, 1) (4898, 1)
X_train,X_test,y_train,y_test = train_test_split(alcohol, targetReg, test_size=0.2, random_state=0)
print(X_train)
[[-0.01159456] [ 1.53249956] [ 0.80108656] ... [-1.39315246] [ 0.23220977] [-0.41793512]]
lin_regr = linear_model.LinearRegression()
lin_regr.fit(X_train, y_train)
LinearRegression()
predicted_y = lin_regr.predict(X_test)
md(f"Mean squared error: **<font color=#3a0ca3>{mean_squared_error(y_test,predicted_y):.2f}</font>**")
Mean squared error: 0.73
md(f"Root mean squared error: **<font color=#3a0ca3>{mean_squared_error(y_test,predicted_y, squared=False):.2f}</font>**")
Root mean squared error: 0.85
fig = plt.figure(figsize=(10,8))
plt.plot(alcohol, targetReg, "b.", color = "#3a0ca3")
plt.xlabel("Alcohol", fontsize=15)
plt.ylabel("Quality", fontsize=15)
plt.tight_layout()
fig = plt.figure(figsize=(10,8))
plt.plot(X_test,predicted_y, "r-", linewidth=2, label="Target Predictions", color="#f72585")
plt.plot( X_train,y_train, "b.", color="#3a0ca3")
plt.xlabel("Alcohol", fontsize=15)
plt.ylabel("Quality", fontsize=15)
plt.legend(loc="upper left", fontsize=13)
plt.show()
df_linear_regr = pd.DataFrame({'Actual': y_test.flatten(), 'Predicted': predicted_y.flatten()})
df_linear_regr.head(5)
| Actual | Predicted | |
|---|---|---|
| 0 | 5 | 5.948460 |
| 1 | 6 | 5.668554 |
| 2 | 7 | 5.979560 |
| 3 | 8 | 6.632673 |
| 4 | 5 | 5.761856 |
df_linear_regr_bar = df_linear_regr.head(25)
df_linear_regr_bar.plot(kind='bar',color = ['#3a0ca3', '#f72585'], figsize=(16,10))
plt.title("Prediction of wine quality based on alcohol\n",fontsize=18)
plt.xlabel("Alcohol", fontsize=15)
plt.ylabel("Quality", fontsize=15)
Text(0, 0.5, 'Quality')
X_train,X_test,y_train,y_test = train_test_split(scaled_features, targetReg, test_size=0.2, random_state=0)
log_reg = linear_model.LogisticRegression(solver ='newton-cg', multi_class='multinomial',max_iter=1000)
log_reg.fit(X_train,y_train.ravel())
LogisticRegression(max_iter=1000, multi_class='multinomial', solver='newton-cg')
y_hat = log_reg.predict(X_test)
acc = accuracy_score(y_hat,y_test)
md(f"Accuracy: **<font color=#3a0ca3>{100*acc:.2f}%</font>**")
Accuracy: 51.22%
md(f"Mean squared error: **<font color=#3a0ca3>{mean_squared_error(y_test,y_hat):.2f}</font>**")
Mean squared error: 0.73
md(f"Root mean squared error: **<font color=#3a0ca3>{mean_squared_error(y_test,y_hat, squared=False):.2f}</font>**")
Root mean squared error: 0.85
df_logist_regr = pd.DataFrame({'Actual': y_test.flatten(), 'Predicted': y_hat.flatten()})
df_logist_regr.head(5)
| Actual | Predicted | |
|---|---|---|
| 0 | 5 | 5 |
| 1 | 6 | 5 |
| 2 | 7 | 6 |
| 3 | 8 | 7 |
| 4 | 5 | 6 |
df_logist_regr_bar = df_logist_regr.head(25)
df_logist_regr_bar.plot(kind='bar',color = ['#3a0ca3', '#f72585'], figsize=(16,10))
plt.title("Prediction of wine quality based on alcohol",fontsize=18)
plt.xlabel("Alcohol", fontsize=15)
plt.ylabel("Quality", fontsize=15)
Text(0, 0.5, 'Quality')
X_trainKnn,X_testKnn,y_trainKnn,y_testKnn = train_test_split(df_scaled_features, targetReg, test_size=0.2, random_state=0)
Knn = KNeighborsClassifier (n_neighbors=10,p=1)
Knn.fit(X_trainKnn,y_trainKnn.ravel())
KNeighborsClassifier(n_neighbors=10, p=1)
y_pred_knn=Knn.predict(X_testKnn)
print("Wrong values predicted out of total values : ")
print((y_testKnn!=y_pred_knn).sum(),'/',((y_testKnn==y_pred_knn).sum()+(y_testKnn!=y_pred_knn).sum()))
Wrong values predicted out of total values : 630176 / 960400
md(f"Accuracy using **<font color=#3a0ca3>KNN</font>** is: **<font color=#3a0ca3>{100*accuracy_score(y_testKnn,y_pred_knn):.2f}%</font>**")
Accuracy using KNN is: 51.02%
adjusted_rand_score(y_testKnn.ravel(), y_pred_knn)
0.09186932267667855
dfKnn = pd.DataFrame(X_testKnn)
dfKnn.columns = ['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar', 'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol']
dfKnn['True'] = y_testKnn
dfKnn['Predict'] = y_pred_knn
dfKnn.reset_index(inplace = True, drop = True)
dfKnn.head()
| fixed acidity | volatile acidity | citric acid | residual sugar | chlorides | free sulfur dioxide | total sulfur dioxide | density | pH | sulphates | alcohol | True | Predict | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.527639 | 0.414339 | 0.130641 | -0.984201 | 0.193523 | -1.605848 | 0.579835 | -0.530788 | 0.342639 | -0.612079 | 0.150942 | 5 | 5 |
| 1 | 0.172097 | 0.315117 | -0.613115 | 0.198872 | -0.080834 | -0.429751 | 0.509236 | 0.458979 | -0.385910 | -0.261553 | -0.580471 | 6 | 5 |
| 2 | 0.883181 | -1.371654 | -0.046493 | -0.944765 | -0.264233 | -0.488556 | -0.832138 | -0.811668 | -0.783300 | -0.787342 | 0.232210 | 7 | 6 |
| 3 | -2.198183 | 0.116674 | -0.030487 | -0.136332 | -0.493110 | 1.099175 | 0.391572 | -1.426929 | 0.541334 | -1.313131 | 1.938840 | 8 | 6 |
| 4 | -1.013043 | 0.017452 | -0.943673 | 1.135471 | 0.101972 | 0.393517 | 0.579835 | 0.559293 | 0.077712 | -0.261553 | -0.336667 | 5 | 5 |
dfKnn['True'] = pd.Categorical(dfKnn['True'])
dfKnn['Predict'] = pd.Categorical(dfKnn['Predict'])
dfKnn.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 980 entries, 0 to 979 Data columns (total 13 columns): fixed acidity 980 non-null float64 volatile acidity 980 non-null float64 citric acid 980 non-null float64 residual sugar 980 non-null float64 chlorides 980 non-null float64 free sulfur dioxide 980 non-null float64 total sulfur dioxide 980 non-null float64 density 980 non-null float64 pH 980 non-null float64 sulphates 980 non-null float64 alcohol 980 non-null float64 True 980 non-null category Predict 980 non-null category dtypes: category(2), float64(11) memory usage: 86.6 KB
paletteScatter = {3:'#717EC3', 4:'#3A86FF', 5:'#8338EC', 6:'#FF006E', 7:'#FB5607', 8:'#FFBE0B'}
fig = plt.figure(figsize=(15,10), dpi=200)
plt.suptitle("KNN prediction result",fontsize=18, y=0.92)
plt.subplot(121)
sns.scatterplot(data=dfKnn, x=dfKnn['alcohol'], y=dfKnn['density'], hue=dfKnn['Predict'], palette=paletteScatter)
plt.ylim(ymax=3.0)
plt.xlabel("Alcohol",fontsize=15)
plt.ylabel("Density",fontsize=15)
plt.subplot(122)
sns.scatterplot(data=dfKnn, x=dfKnn['alcohol'], y=dfKnn['density'], hue=dfKnn['True'], palette=paletteScatter)
plt.ylim(ymax=3.0)
plt.xlabel("Alcohol",fontsize=15)
plt.ylabel("Density",fontsize=15)
C:\Users\mayou\Anaconda3\lib\site-packages\matplotlib\axes\_base.py:3610: MatplotlibDeprecationWarning: The `ymax` argument was deprecated in Matplotlib 3.0 and will be removed in 3.2. Use `top` instead. alternative='`top`', obj_type='argument') C:\Users\mayou\Anaconda3\lib\site-packages\matplotlib\axes\_base.py:3610: MatplotlibDeprecationWarning: The `ymax` argument was deprecated in Matplotlib 3.0 and will be removed in 3.2. Use `top` instead. alternative='`top`', obj_type='argument')
Text(0, 0.5, 'Density')
X_trainDT,X_testDT,y_trainDT,y_testDT = train_test_split(df_scaled_features, targetReg, test_size=0.2, random_state=0)
tree = DecisionTreeClassifier(max_depth=4,random_state=0)
tree.fit(X_trainDT, y_trainDT.ravel())
DecisionTreeClassifier(max_depth=4, random_state=0)
y_pred_dt = tree.predict(X_testDT)
print(classification_report(y_testDT, y_pred_dt.round(), digits=3))
precision recall f1-score support
3 0.000 0.000 0.000 9
4 0.000 0.000 0.000 51
5 0.535 0.522 0.528 295
6 0.480 0.638 0.548 409
7 0.392 0.317 0.350 183
8 0.000 0.000 0.000 33
accuracy 0.483 980
macro avg 0.234 0.246 0.238 980
weighted avg 0.434 0.483 0.453 980
C:\Users\mayou\Anaconda3\lib\site-packages\sklearn\metrics\_classification.py:1318: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) C:\Users\mayou\Anaconda3\lib\site-packages\sklearn\metrics\_classification.py:1318: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) C:\Users\mayou\Anaconda3\lib\site-packages\sklearn\metrics\_classification.py:1318: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result))
print("Wrong values predicted out of total values : ")
print((y_testDT!=y_pred_dt).sum(),'/',((y_testDT==y_pred_dt).sum()+(y_testDT!=y_pred_dt).sum()))
Wrong values predicted out of total values : 625860 / 960400
md(f"Accuracy using **<font color=#3a0ca3>Decision Tree</font>** is: **<font color=#3a0ca3>{100*accuracy_score(y_testDT,y_pred_dt):.2f}%</font>**")
Accuracy using Decision Tree is: 48.27%
adjusted_rand_score(y_testDT.ravel(), y_pred_dt)
0.07016693981479545
dfTree = pd.DataFrame(X_testDT)
dfTree.columns = ['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar', 'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol']
dfTree['True'] = y_testDT
dfTree['Predict'] = y_pred_dt
dfTree.head()
| fixed acidity | volatile acidity | citric acid | residual sugar | chlorides | free sulfur dioxide | total sulfur dioxide | density | pH | sulphates | alcohol | True | Predict | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2762 | 0.527639 | 0.414339 | 0.130641 | -0.984201 | 0.193523 | -1.605848 | 0.579835 | -0.530788 | 0.342639 | -0.612079 | 0.150942 | 5 | 5 |
| 42 | 0.172097 | 0.315117 | -0.613115 | 0.198872 | -0.080834 | -0.429751 | 0.509236 | 0.458979 | -0.385910 | -0.261553 | -0.580471 | 6 | 5 |
| 1419 | 0.883181 | -1.371654 | -0.046493 | -0.944765 | -0.264233 | -0.488556 | -0.832138 | -0.811668 | -0.783300 | -0.787342 | 0.232210 | 7 | 6 |
| 3664 | -2.198183 | 0.116674 | -0.030487 | -0.136332 | -0.493110 | 1.099175 | 0.391572 | -1.426929 | 0.541334 | -1.313131 | 1.938840 | 8 | 7 |
| 2125 | -1.013043 | 0.017452 | -0.943673 | 1.135471 | 0.101972 | 0.393517 | 0.579835 | 0.559293 | 0.077712 | -0.261553 | -0.336667 | 5 | 6 |
dfTree.reset_index(inplace = True, drop = True)
dfTree.head()
| fixed acidity | volatile acidity | citric acid | residual sugar | chlorides | free sulfur dioxide | total sulfur dioxide | density | pH | sulphates | alcohol | True | Predict | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.527639 | 0.414339 | 0.130641 | -0.984201 | 0.193523 | -1.605848 | 0.579835 | -0.530788 | 0.342639 | -0.612079 | 0.150942 | 5 | 5 |
| 1 | 0.172097 | 0.315117 | -0.613115 | 0.198872 | -0.080834 | -0.429751 | 0.509236 | 0.458979 | -0.385910 | -0.261553 | -0.580471 | 6 | 5 |
| 2 | 0.883181 | -1.371654 | -0.046493 | -0.944765 | -0.264233 | -0.488556 | -0.832138 | -0.811668 | -0.783300 | -0.787342 | 0.232210 | 7 | 6 |
| 3 | -2.198183 | 0.116674 | -0.030487 | -0.136332 | -0.493110 | 1.099175 | 0.391572 | -1.426929 | 0.541334 | -1.313131 | 1.938840 | 8 | 7 |
| 4 | -1.013043 | 0.017452 | -0.943673 | 1.135471 | 0.101972 | 0.393517 | 0.579835 | 0.559293 | 0.077712 | -0.261553 | -0.336667 | 5 | 6 |
dfTree['True'] = pd.Categorical(dfTree['True'])
dfTree['Predict'] = pd.Categorical(dfTree['Predict'])
dfTree.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 980 entries, 0 to 979 Data columns (total 13 columns): fixed acidity 980 non-null float64 volatile acidity 980 non-null float64 citric acid 980 non-null float64 residual sugar 980 non-null float64 chlorides 980 non-null float64 free sulfur dioxide 980 non-null float64 total sulfur dioxide 980 non-null float64 density 980 non-null float64 pH 980 non-null float64 sulphates 980 non-null float64 alcohol 980 non-null float64 True 980 non-null category Predict 980 non-null category dtypes: category(2), float64(11) memory usage: 86.5 KB
paletteScatter = {3:'#717EC3', 4:'#3A86FF', 5:'#8338EC', 6:'#FF006E', 7:'#FB5607', 8:'#FFBE0B'}
fig = plt.figure(figsize=(15,10))
plt.suptitle("Decision Tree prediction result",fontsize=18, y=0.92)
plt.subplot(121)
sns.scatterplot(data=dfTree, x=dfTree['alcohol'], y=dfTree['density'], hue=dfTree['Predict'], palette=paletteScatter)
plt.ylim(ymax=3.0)
plt.xlabel("Alcohol",fontsize=15)
plt.ylabel("Density",fontsize=15)
plt.subplot(122)
sns.scatterplot(data=dfTree, x=dfTree['alcohol'], y=dfTree['density'], hue=dfTree['True'], palette=paletteScatter)
plt.ylim(ymax=3.0)
plt.xlabel("Alcohol",fontsize=15)
plt.ylabel("Density",fontsize=15)
C:\Users\mayou\Anaconda3\lib\site-packages\matplotlib\axes\_base.py:3610: MatplotlibDeprecationWarning: The `ymax` argument was deprecated in Matplotlib 3.0 and will be removed in 3.2. Use `top` instead. alternative='`top`', obj_type='argument') C:\Users\mayou\Anaconda3\lib\site-packages\matplotlib\axes\_base.py:3610: MatplotlibDeprecationWarning: The `ymax` argument was deprecated in Matplotlib 3.0 and will be removed in 3.2. Use `top` instead. alternative='`top`', obj_type='argument')
Text(0, 0.5, 'Density')
X_trainSvc,X_testSvc,y_trainSvc,y_testSvc = train_test_split(df_scaled_features, targetReg, test_size=0.2, random_state=0)
svc_model = SVC()
svc_model.fit(X_train, y_train.ravel())
y_pred_svc = svc_model.predict(X_testSvc)
C:\Users\mayou\Anaconda3\lib\site-packages\sklearn\base.py:444: UserWarning: X has feature names, but SVC was fitted without feature names
f"X has feature names, but {self.__class__.__name__} was fitted without"
md(f"Accuracy using **<font color=#3a0ca3>SVC</font>** is: **<font color=#3a0ca3>{100*accuracy_score(y_testSvc,y_pred_svc):.2f}%</font>**")
Accuracy using SVC is: 53.47%
print(adjusted_rand_score(y_testSvc.ravel(), y_pred_svc))
0.1133982159886251
dfSvc = pd.DataFrame(X_testSvc)
dfSvc.columns = ['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar', 'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol']
dfSvc['True'] = y_testSvc
dfSvc['Predict'] = y_pred_svc
dfSvc.reset_index(inplace = True, drop = True)
dfSvc.head()
| fixed acidity | volatile acidity | citric acid | residual sugar | chlorides | free sulfur dioxide | total sulfur dioxide | density | pH | sulphates | alcohol | True | Predict | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.527639 | 0.414339 | 0.130641 | -0.984201 | 0.193523 | -1.605848 | 0.579835 | -0.530788 | 0.342639 | -0.612079 | 0.150942 | 5 | 5 |
| 1 | 0.172097 | 0.315117 | -0.613115 | 0.198872 | -0.080834 | -0.429751 | 0.509236 | 0.458979 | -0.385910 | -0.261553 | -0.580471 | 6 | 5 |
| 2 | 0.883181 | -1.371654 | -0.046493 | -0.944765 | -0.264233 | -0.488556 | -0.832138 | -0.811668 | -0.783300 | -0.787342 | 0.232210 | 7 | 6 |
| 3 | -2.198183 | 0.116674 | -0.030487 | -0.136332 | -0.493110 | 1.099175 | 0.391572 | -1.426929 | 0.541334 | -1.313131 | 1.938840 | 8 | 7 |
| 4 | -1.013043 | 0.017452 | -0.943673 | 1.135471 | 0.101972 | 0.393517 | 0.579835 | 0.559293 | 0.077712 | -0.261553 | -0.336667 | 5 | 6 |
fig = plt.figure(figsize=(15,10))
plt.suptitle("SVC prediction result",fontsize=18, y=0.92)
plt.subplot(121)
sns.scatterplot(data=dfSvc, x=dfSvc['alcohol'], y=dfSvc['density'], hue=dfSvc['Predict'], palette=paletteScatter)
plt.ylim(ymax=3.0)
plt.xlabel("Alcohol",fontsize=15)
plt.ylabel("Density",fontsize=15)
plt.subplot(122)
sns.scatterplot(data=dfSvc, x=dfSvc['alcohol'], y=dfSvc['density'], hue=dfSvc['True'], palette=paletteScatter)
plt.ylim(ymax=3.0)
plt.xlabel("Alcohol",fontsize=15)
plt.ylabel("Density",fontsize=15)
C:\Users\mayou\Anaconda3\lib\site-packages\matplotlib\axes\_base.py:3610: MatplotlibDeprecationWarning: The `ymax` argument was deprecated in Matplotlib 3.0 and will be removed in 3.2. Use `top` instead. alternative='`top`', obj_type='argument') C:\Users\mayou\Anaconda3\lib\site-packages\matplotlib\axes\_base.py:3610: MatplotlibDeprecationWarning: The `ymax` argument was deprecated in Matplotlib 3.0 and will be removed in 3.2. Use `top` instead. alternative='`top`', obj_type='argument')
Text(0, 0.5, 'Density')
knnAccuracy = round(100*accuracy_score(y_testKnn,y_pred_knn),2)
dtAccuracy = round(100*accuracy_score(y_testDT,y_pred_dt),2)
svcAccuracy = round(100*accuracy_score(y_testSvc,y_pred_svc),2)
fig = plt.figure(figsize=(10,8))
plt.bar(x=['KNN','Decision Tree', 'SVC'], height=[knnAccuracy,dtAccuracy,svcAccuracy],
width=0.6,
color=['#cfd2cd','#e5e6e4','#FF006E'])
plt.text(0,knnAccuracy-5,f'{knnAccuracy}%',ha='center',fontsize=15, fontweight='bold')
plt.text(1,dtAccuracy-5,f'{dtAccuracy}%',ha='center',fontsize=15, fontweight='bold')
plt.text(2,svcAccuracy-5,f'{svcAccuracy}%',ha='center',fontsize=15, fontweight='bold', color='white')
plt.text(0.16,0.89,'SVC',transform=fig.transFigure, fontsize=16, fontweight='bold', color='#FF006E')
plt.text(0.22,0.89,'is the model that has the',transform=fig.transFigure, fontsize=16)
plt.text(0.51,0.89,'highest accuracy.',transform=fig.transFigure, fontsize=16, fontweight='bold', color='#FF006E')
plt.xticks(fontsize=12)
plt.yticks([])
plt.title("Models' accuracy comparison", fontsize=18, y=1.08)
plt.xlabel('Models',fontsize=15)
plt.gca().get_xticklabels()[2].set_color('#FF006E')
sns.despine(left=True)
knn = KNeighborsClassifier()
param_grid = {"n_neighbors": np.arange(1, 32),
"weights":['uniform','distance'],
"metric":['minkowski','euclidean','manhattan']
}
knn_gscv = GridSearchCV(knn, param_grid, cv=5)
knn_gscv.fit(X_trainKnn, y_trainKnn.ravel())
GridSearchCV(cv=5, estimator=KNeighborsClassifier(),
param_grid={'metric': ['minkowski', 'euclidean', 'manhattan'],
'n_neighbors': array([ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,
18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]),
'weights': ['uniform', 'distance']})
knn_gscv.best_params_
{'metric': 'manhattan', 'n_neighbors': 28, 'weights': 'distance'}
knn_gscv.best_score_
0.6641195558683244
modelKnn = knn_gscv.best_estimator_
knnOpti = modelKnn.score(X_testKnn,y_testKnn)
md(f"**<font color=#3a0ca3>Optimised KNN</font>** accuracy: **<font color=#3a0ca3>{100*knnOpti:.2f}%</font>**")
Optimised KNN accuracy: 63.27%
We can see that the best hyper-parameters are:
The optimised model's accuracy is 63.27% while the initial one is 51.02%.
dec_tree = DecisionTreeClassifier()
paramTree_grid = {"criterion":["gini","entropy"],
"max_depth":np.arange(1, 32)
}
dtree_gscv = GridSearchCV(dec_tree, paramTree_grid, cv=5)
dtree_gscv.fit(X_trainDT, y_trainDT.ravel())
GridSearchCV(cv=5, estimator=DecisionTreeClassifier(),
param_grid={'criterion': ['gini', 'entropy'],
'max_depth': array([ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,
18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31])})
dtree_gscv.best_params_
{'criterion': 'entropy', 'max_depth': 24}
dtree_gscv.best_score_
0.5890755727578388
modelDT = dtree_gscv.best_estimator_
dtOpti = modelDT.score(X_testDT,y_testDT)
md(f"**<font color=#3a0ca3>Optimised Decision Tree</font>** accuracy: **<font color=#3a0ca3>{100*dtOpti:.2f}%</font>**")
C:\Users\mayou\Anaconda3\lib\site-packages\sklearn\base.py:493: FutureWarning: The feature names should match those that were passed during fit. Starting version 1.2, an error will be raised. Feature names unseen at fit time: - Predict - True Feature names must be in the same order as they were in fit. warnings.warn(message, FutureWarning)
--------------------------------------------------------------------------- ValueError Traceback (most recent call last) <ipython-input-76-8f73aa2fefba> in <module> ----> 1 dtOpti = modelDT.score(X_testDT,y_testDT) 2 md(f"**<font color=#3a0ca3>Optimised Decision Tree</font>** accuracy: **<font color=#3a0ca3>{100*dtOpti:.2f}%</font>**") ~\Anaconda3\lib\site-packages\sklearn\base.py in score(self, X, y, sample_weight) 649 from .metrics import accuracy_score 650 --> 651 return accuracy_score(y, self.predict(X), sample_weight=sample_weight) 652 653 def _more_tags(self): ~\Anaconda3\lib\site-packages\sklearn\tree\_classes.py in predict(self, X, check_input) 465 """ 466 check_is_fitted(self) --> 467 X = self._validate_X_predict(X, check_input) 468 proba = self.tree_.predict(X) 469 n_samples = X.shape[0] ~\Anaconda3\lib\site-packages\sklearn\tree\_classes.py in _validate_X_predict(self, X, check_input) 431 """Validate the training data on predict (probabilities).""" 432 if check_input: --> 433 X = self._validate_data(X, dtype=DTYPE, accept_sparse="csr", reset=False) 434 if issparse(X) and ( 435 X.indices.dtype != np.intc or X.indptr.dtype != np.intc ~\Anaconda3\lib\site-packages\sklearn\base.py in _validate_data(self, X, y, reset, validate_separately, **check_params) 583 584 if not no_val_X and check_params.get("ensure_2d", True): --> 585 self._check_n_features(X, reset=reset) 586 587 return out ~\Anaconda3\lib\site-packages\sklearn\base.py in _check_n_features(self, X, reset) 399 if n_features != self.n_features_in_: 400 raise ValueError( --> 401 f"X has {n_features} features, but {self.__class__.__name__} " 402 f"is expecting {self.n_features_in_} features as input." 403 ) ValueError: X has 13 features, but DecisionTreeClassifier is expecting 11 features as input.
We can see that the best hyper-parameters are:
The optimised model's accuracy is 56.73% while the initial one is 48.27%.
svc = SVC()
paramSvc_grid = {'C': [0.1, 1, 10, 100, 1000],
'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
'kernel': ['rbf']}
svc_gscv = GridSearchCV(svc, paramSvc_grid, cv=5)
svc_gscv.fit(X_trainSvc, y_trainSvc.ravel())
svc_gscv.best_params_
svc_gscv.best_score_
modelsvc = svc_gscv.best_estimator_
svcOpti = modelsvc.score(X_testSvc,y_testSvc)
md(f"**<font color=#3a0ca3>Optimised SVC</font>** accuracy: **<font color=#3a0ca3>{100*svcOpti:.2f}%</font>**")
We can see that the best hyper-parameters are:
The optimised model's accuracy is 62.45% while the initial one is 53.47%.
knnOptimal = round(100*knnOpti,2)
dtOptimal = round(100*dtOpti,2)
svcOptimal = round(100*svcOpti,2)
fig = plt.figure(figsize=(10,8))
plt.bar(x=['KNN','Decision Tree', 'SVC'], height=[knnOptimal,dtOptimal,svcOptimal],
width=0.6,
color=['#3a0ca3','#e5e6e4','#cfd2cd'])
plt.text(0,knnOptimal-5,f'{knnOptimal}%',ha='center',fontsize=15, fontweight='bold', color='white')
plt.text(1,dtOptimal-5,f'{dtOptimal}%',ha='center',fontsize=15, fontweight='bold')
plt.text(2,svcOptimal-5,f'{svcOptimal}%',ha='center',fontsize=15, fontweight='bold')
plt.text(0.16,0.89,'KNN',transform=fig.transFigure, fontsize=16, fontweight='bold', color='#3a0ca3')
plt.text(0.22,0.89,'is the optimized model that has the',transform=fig.transFigure, fontsize=16)
plt.text(0.626,0.89,'highest accuracy.',transform=fig.transFigure, fontsize=16, fontweight='bold', color='#3a0ca3')
plt.xticks(fontsize=12)
plt.yticks([])
plt.title("Optimized models' accuracy comparison", fontsize=18, y=1.08)
plt.xlabel('Models',fontsize=15)
plt.gca().get_xticklabels()[0].set_color('#3a0ca3')
sns.despine(left=True)
Finally, our best model seems to be the KNN model.
So far we have tested different classifier's methods and even found the best hyper-parameters for each of them to increase their accuracy. But the accuracy can still be improved by combining those prediction using ensemble methods. In our case we will use the voting classifier, the bagging classifier with KNN as the base estimator and finally the Random forest which is also an ensemble method.
X_trainVC,X_testVC,y_trainVC,y_testVC = train_test_split(df_scaled_features, targetReg, test_size=0.2, random_state=0)
estimator = []
estimator.append(('KNN', KNeighborsClassifier(n_neighbors = 28, weights = 'distance', metric = 'manhattan')))
estimator.append(('DTC', DecisionTreeClassifier(criterion = 'gini', max_depth = 31)))
estimator.append(('SVC', SVC(C=10, gamma=1, kernel='rbf')))
vot_hard = VotingClassifier(estimators = estimator, voting ='hard')
vot_hard.fit(X_trainVC, y_trainVC.ravel())
y_pred = vot_hard.predict(X_testVC)
md(f"Hard Voting Score: **<font color=#3a0ca3>{100*accuracy_score(y_testVC, y_pred):.2f}%</font>**")
# vot_soft = VotingClassifier(estimators = estimator, voting ='soft')
# vot_soft.fit(X_trainVC, y_trainVC.ravel())
# y_pred = vot_soft.predict(X_testVC)
# print("Soft Voting Score:",accuracy_score(y_testVC, y_pred))
from sklearn.ensemble import BaggingClassifier
model = BaggingClassifier(base_estimator=KNeighborsClassifier(n_neighbors = 28, weights = 'distance', metric = 'manhattan'),n_estimators=100)
model.fit(X_trainKnn, y_trainKnn.ravel())
baggingScore = model.score(X_testKnn, y_testKnn.ravel())
md(f"Bagging classifier Score: **<font color=#3a0ca3>{100*baggingScore:.2f}%</font>**")
X_trainRF,X_testRF,y_trainRF,y_testRF = train_test_split(df_scaled_features, targetReg, test_size=0.2, random_state=0)
forest = RandomForestClassifier(n_estimators=100, criterion='gini', max_depth=31)
forest.fit(X_trainRF, y_trainRF.ravel())
y_pred_rf = forest.predict(X_testRF)
print(classification_report(y_testRF, y_pred_rf.round(), digits=3))
print("Wrong values predicted out of total values : ")
print((y_testRF!=y_pred_rf).sum(),'/',((y_testRF==y_pred_rf).sum()+(y_testRF!=y_pred_rf).sum()))
md(f"Random Forest accuracy: **<font color=#3a0ca3>{100*accuracy_score(y_testRF,y_pred_rf):.2f}%</font>**")