import pandas as pd
import numpy as np
from matplotlib.colors import LinearSegmentedColormap
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import Markdown as md
from sklearn.preprocessing import StandardScaler
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import *
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import VotingClassifier


dfWine = pd.read_csv("winequality-white.csv", sep=";")


dfWine.head()


dfWine.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4898 entries, 0 to 4897
Data columns (total 12 columns):
fixed acidity           4898 non-null float64
volatile acidity        4898 non-null float64
citric acid             4898 non-null float64
residual sugar          4898 non-null float64
chlorides               4898 non-null float64
free sulfur dioxide     4898 non-null float64
total sulfur dioxide    4898 non-null float64
density                 4898 non-null float64
pH                      4898 non-null float64
sulphates               4898 non-null float64
alcohol                 4898 non-null float64
quality                 4898 non-null int64
dtypes: float64(11), int64(1)
memory usage: 459.3 KB


dfWine.describe()


cmap = LinearSegmentedColormap.from_list('mycmap', ['#4cc9f0','#4361ee','#3a0ca3'])

plt.figure(figsize=(13,8))
ax = sns.heatmap(dfWine.corr().round(decimals=2), cmap=cmap, annot=True)
plt.title("Correlation Matrix", fontsize=20)
plt.show()


columns = ['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar', 'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol']
features = dfWine[columns]
target = dfWine['quality']


scaler = StandardScaler()
scaler.fit(features)
scaled_features = scaler.transform(features)


colorPalette = ['#4cc9f0','#4361ee','#3a0ca3']
fig = plt.figure(figsize=(30,15))
sns.boxplot(data=scaled_features, palette=colorPalette)
# plt.axes().set_xticklabels(['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar', 'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol'], fontsize=13)
plt.xticks(np.arange(11),['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar', 'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol'], fontsize=15)
plt.yticks(fontsize=15)
plt.title("Outliers by features\n",fontsize=30)

Text(0.5, 1.0, 'Outliers by features\n')


df_scaled_features = pd.DataFrame(scaled_features)
df_scaled_features.columns= ['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar', 'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol']
df_scaled_features.head()


for i in df_scaled_features.columns:
    for j in df_scaled_features[i]:
        q1 = df_scaled_features[i].quantile(0.25)
        q3 = df_scaled_features[i].quantile(0.75)
        iqr = q3-q1
        Lower_tail = q1 - 1.5 * iqr
        Upper_tail = q3 + 1.5 * iqr
        if j > Upper_tail or j < Lower_tail:
            df_scaled_features[i] = df_scaled_features[i].replace(j, np.mean(df_scaled_features[i]))


fig = plt.figure(figsize=(20,10))
sns.boxplot(data=df_scaled_features, palette=colorPalette)
plt.title("Processed outliers by features\n",fontsize=18)

Text(0.5, 1.0, 'Processed outliers by features\n')


#We will do the regression only with the feature alcohol
alcohol = df_scaled_features.iloc[:,10].values.reshape((len(target),1))
targetReg = target.values.reshape((len(target),1))
print(alcohol.shape)
print(targetReg.shape)

(4898, 1)
(4898, 1)


X_train,X_test,y_train,y_test = train_test_split(alcohol, targetReg, test_size=0.2, random_state=0)

print(X_train)

[[-0.01159456]
 [ 1.53249956]
 [ 0.80108656]
 ...
 [-1.39315246]
 [ 0.23220977]
 [-0.41793512]]


lin_regr = linear_model.LinearRegression()
lin_regr.fit(X_train, y_train)

LinearRegression()


predicted_y = lin_regr.predict(X_test)


md(f"Mean squared error: **<font color=#3a0ca3>{mean_squared_error(y_test,predicted_y):.2f}</font>**")


md(f"Root mean squared error: **<font color=#3a0ca3>{mean_squared_error(y_test,predicted_y, squared=False):.2f}</font>**")


fig = plt.figure(figsize=(10,8))
plt.plot(alcohol, targetReg, "b.", color = "#3a0ca3")
plt.xlabel("Alcohol", fontsize=15)
plt.ylabel("Quality", fontsize=15)
plt.tight_layout()


fig = plt.figure(figsize=(10,8))
plt.plot(X_test,predicted_y, "r-", linewidth=2, label="Target Predictions", color="#f72585")
plt.plot( X_train,y_train, "b.", color="#3a0ca3")
plt.xlabel("Alcohol", fontsize=15)
plt.ylabel("Quality", fontsize=15)
plt.legend(loc="upper left", fontsize=13)
plt.show()


df_linear_regr = pd.DataFrame({'Actual': y_test.flatten(), 'Predicted': predicted_y.flatten()})
df_linear_regr.head(5)


df_linear_regr_bar = df_linear_regr.head(25)
df_linear_regr_bar.plot(kind='bar',color = ['#3a0ca3', '#f72585'], figsize=(16,10))
plt.title("Prediction of wine quality based on alcohol\n",fontsize=18)
plt.xlabel("Alcohol", fontsize=15)
plt.ylabel("Quality", fontsize=15)

Text(0, 0.5, 'Quality')


X_train,X_test,y_train,y_test = train_test_split(scaled_features, targetReg, test_size=0.2, random_state=0)


log_reg = linear_model.LogisticRegression(solver ='newton-cg', multi_class='multinomial',max_iter=1000)
log_reg.fit(X_train,y_train.ravel())

LogisticRegression(max_iter=1000, multi_class='multinomial', solver='newton-cg')


y_hat = log_reg.predict(X_test)


acc = accuracy_score(y_hat,y_test)
md(f"Accuracy: **<font color=#3a0ca3>{100*acc:.2f}%</font>**")


md(f"Mean squared error: **<font color=#3a0ca3>{mean_squared_error(y_test,y_hat):.2f}</font>**")


md(f"Root mean squared error: **<font color=#3a0ca3>{mean_squared_error(y_test,y_hat, squared=False):.2f}</font>**")


df_logist_regr = pd.DataFrame({'Actual': y_test.flatten(), 'Predicted': y_hat.flatten()})
df_logist_regr.head(5)


df_logist_regr_bar = df_logist_regr.head(25)
df_logist_regr_bar.plot(kind='bar',color = ['#3a0ca3', '#f72585'], figsize=(16,10))
plt.title("Prediction of wine quality based on alcohol",fontsize=18)
plt.xlabel("Alcohol", fontsize=15)
plt.ylabel("Quality", fontsize=15)

Text(0, 0.5, 'Quality')


X_trainKnn,X_testKnn,y_trainKnn,y_testKnn = train_test_split(df_scaled_features, targetReg, test_size=0.2, random_state=0)


Knn = KNeighborsClassifier (n_neighbors=10,p=1)
Knn.fit(X_trainKnn,y_trainKnn.ravel())

KNeighborsClassifier(n_neighbors=10, p=1)


y_pred_knn=Knn.predict(X_testKnn)


print("Wrong values predicted out of total values : ")
print((y_testKnn!=y_pred_knn).sum(),'/',((y_testKnn==y_pred_knn).sum()+(y_testKnn!=y_pred_knn).sum()))

Wrong values predicted out of total values : 
630176 / 960400


md(f"Accuracy using **<font color=#3a0ca3>KNN</font>** is: **<font color=#3a0ca3>{100*accuracy_score(y_testKnn,y_pred_knn):.2f}%</font>**")


adjusted_rand_score(y_testKnn.ravel(), y_pred_knn)

0.09186932267667855


dfKnn = pd.DataFrame(X_testKnn)


dfKnn.columns = ['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar', 'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol']


dfKnn['True'] = y_testKnn
dfKnn['Predict'] = y_pred_knn
dfKnn.reset_index(inplace = True, drop = True)


dfKnn.head()


dfKnn['True'] = pd.Categorical(dfKnn['True'])
dfKnn['Predict'] = pd.Categorical(dfKnn['Predict'])


dfKnn.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 980 entries, 0 to 979
Data columns (total 13 columns):
fixed acidity           980 non-null float64
volatile acidity        980 non-null float64
citric acid             980 non-null float64
residual sugar          980 non-null float64
chlorides               980 non-null float64
free sulfur dioxide     980 non-null float64
total sulfur dioxide    980 non-null float64
density                 980 non-null float64
pH                      980 non-null float64
sulphates               980 non-null float64
alcohol                 980 non-null float64
True                    980 non-null category
Predict                 980 non-null category
dtypes: category(2), float64(11)
memory usage: 86.6 KB


paletteScatter = {3:'#717EC3', 4:'#3A86FF', 5:'#8338EC', 6:'#FF006E', 7:'#FB5607', 8:'#FFBE0B'}
fig = plt.figure(figsize=(15,10), dpi=200)
plt.suptitle("KNN prediction result",fontsize=18, y=0.92)
plt.subplot(121)
sns.scatterplot(data=dfKnn, x=dfKnn['alcohol'], y=dfKnn['density'], hue=dfKnn['Predict'], palette=paletteScatter)
plt.ylim(ymax=3.0)
plt.xlabel("Alcohol",fontsize=15)
plt.ylabel("Density",fontsize=15)
plt.subplot(122)
sns.scatterplot(data=dfKnn, x=dfKnn['alcohol'], y=dfKnn['density'], hue=dfKnn['True'], palette=paletteScatter)
plt.ylim(ymax=3.0)
plt.xlabel("Alcohol",fontsize=15)
plt.ylabel("Density",fontsize=15)

C:\Users\mayou\Anaconda3\lib\site-packages\matplotlib\axes\_base.py:3610: MatplotlibDeprecationWarning: 
The `ymax` argument was deprecated in Matplotlib 3.0 and will be removed in 3.2. Use `top` instead.
  alternative='`top`', obj_type='argument')
C:\Users\mayou\Anaconda3\lib\site-packages\matplotlib\axes\_base.py:3610: MatplotlibDeprecationWarning: 
The `ymax` argument was deprecated in Matplotlib 3.0 and will be removed in 3.2. Use `top` instead.
  alternative='`top`', obj_type='argument')

Text(0, 0.5, 'Density')


X_trainDT,X_testDT,y_trainDT,y_testDT = train_test_split(df_scaled_features, targetReg, test_size=0.2, random_state=0)


tree = DecisionTreeClassifier(max_depth=4,random_state=0)
tree.fit(X_trainDT, y_trainDT.ravel())

DecisionTreeClassifier(max_depth=4, random_state=0)


y_pred_dt = tree.predict(X_testDT)


print(classification_report(y_testDT, y_pred_dt.round(), digits=3))

              precision    recall  f1-score   support

           3      0.000     0.000     0.000         9
           4      0.000     0.000     0.000        51
           5      0.535     0.522     0.528       295
           6      0.480     0.638     0.548       409
           7      0.392     0.317     0.350       183
           8      0.000     0.000     0.000        33

    accuracy                          0.483       980
   macro avg      0.234     0.246     0.238       980
weighted avg      0.434     0.483     0.453       980

C:\Users\mayou\Anaconda3\lib\site-packages\sklearn\metrics\_classification.py:1318: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
C:\Users\mayou\Anaconda3\lib\site-packages\sklearn\metrics\_classification.py:1318: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
C:\Users\mayou\Anaconda3\lib\site-packages\sklearn\metrics\_classification.py:1318: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))


print("Wrong values predicted out of total values : ")
print((y_testDT!=y_pred_dt).sum(),'/',((y_testDT==y_pred_dt).sum()+(y_testDT!=y_pred_dt).sum()))

Wrong values predicted out of total values : 
625860 / 960400


md(f"Accuracy using **<font color=#3a0ca3>Decision Tree</font>** is: **<font color=#3a0ca3>{100*accuracy_score(y_testDT,y_pred_dt):.2f}%</font>**")


adjusted_rand_score(y_testDT.ravel(), y_pred_dt)

0.07016693981479545


dfTree = pd.DataFrame(X_testDT)
dfTree.columns = ['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar', 'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol']
dfTree['True'] = y_testDT
dfTree['Predict'] = y_pred_dt
dfTree.head()


dfTree.reset_index(inplace = True, drop = True)


dfTree.head()


dfTree['True'] = pd.Categorical(dfTree['True'])
dfTree['Predict'] = pd.Categorical(dfTree['Predict'])
dfTree.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 980 entries, 0 to 979
Data columns (total 13 columns):
fixed acidity           980 non-null float64
volatile acidity        980 non-null float64
citric acid             980 non-null float64
residual sugar          980 non-null float64
chlorides               980 non-null float64
free sulfur dioxide     980 non-null float64
total sulfur dioxide    980 non-null float64
density                 980 non-null float64
pH                      980 non-null float64
sulphates               980 non-null float64
alcohol                 980 non-null float64
True                    980 non-null category
Predict                 980 non-null category
dtypes: category(2), float64(11)
memory usage: 86.5 KB


paletteScatter = {3:'#717EC3', 4:'#3A86FF', 5:'#8338EC', 6:'#FF006E', 7:'#FB5607', 8:'#FFBE0B'}
fig = plt.figure(figsize=(15,10))
plt.suptitle("Decision Tree prediction result",fontsize=18, y=0.92)
plt.subplot(121)
sns.scatterplot(data=dfTree, x=dfTree['alcohol'], y=dfTree['density'], hue=dfTree['Predict'], palette=paletteScatter)
plt.ylim(ymax=3.0)
plt.xlabel("Alcohol",fontsize=15)
plt.ylabel("Density",fontsize=15)
plt.subplot(122)
sns.scatterplot(data=dfTree, x=dfTree['alcohol'], y=dfTree['density'], hue=dfTree['True'], palette=paletteScatter)
plt.ylim(ymax=3.0)
plt.xlabel("Alcohol",fontsize=15)
plt.ylabel("Density",fontsize=15)

C:\Users\mayou\Anaconda3\lib\site-packages\matplotlib\axes\_base.py:3610: MatplotlibDeprecationWarning: 
The `ymax` argument was deprecated in Matplotlib 3.0 and will be removed in 3.2. Use `top` instead.
  alternative='`top`', obj_type='argument')
C:\Users\mayou\Anaconda3\lib\site-packages\matplotlib\axes\_base.py:3610: MatplotlibDeprecationWarning: 
The `ymax` argument was deprecated in Matplotlib 3.0 and will be removed in 3.2. Use `top` instead.
  alternative='`top`', obj_type='argument')

Text(0, 0.5, 'Density')


X_trainSvc,X_testSvc,y_trainSvc,y_testSvc = train_test_split(df_scaled_features, targetReg, test_size=0.2, random_state=0)


svc_model = SVC()
svc_model.fit(X_train, y_train.ravel())

y_pred_svc = svc_model.predict(X_testSvc)

C:\Users\mayou\Anaconda3\lib\site-packages\sklearn\base.py:444: UserWarning: X has feature names, but SVC was fitted without feature names
  f"X has feature names, but {self.__class__.__name__} was fitted without"


md(f"Accuracy using **<font color=#3a0ca3>SVC</font>** is: **<font color=#3a0ca3>{100*accuracy_score(y_testSvc,y_pred_svc):.2f}%</font>**")


print(adjusted_rand_score(y_testSvc.ravel(), y_pred_svc))

0.1133982159886251


dfSvc = pd.DataFrame(X_testSvc)
dfSvc.columns = ['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar', 'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol']


dfSvc['True'] = y_testSvc
dfSvc['Predict'] = y_pred_svc
dfSvc.reset_index(inplace = True, drop = True)
dfSvc.head()


fig = plt.figure(figsize=(15,10))
plt.suptitle("SVC prediction result",fontsize=18, y=0.92)
plt.subplot(121)
sns.scatterplot(data=dfSvc, x=dfSvc['alcohol'], y=dfSvc['density'], hue=dfSvc['Predict'], palette=paletteScatter)
plt.ylim(ymax=3.0)
plt.xlabel("Alcohol",fontsize=15)
plt.ylabel("Density",fontsize=15)
plt.subplot(122)
sns.scatterplot(data=dfSvc, x=dfSvc['alcohol'], y=dfSvc['density'], hue=dfSvc['True'], palette=paletteScatter)
plt.ylim(ymax=3.0)
plt.xlabel("Alcohol",fontsize=15)
plt.ylabel("Density",fontsize=15)

C:\Users\mayou\Anaconda3\lib\site-packages\matplotlib\axes\_base.py:3610: MatplotlibDeprecationWarning: 
The `ymax` argument was deprecated in Matplotlib 3.0 and will be removed in 3.2. Use `top` instead.
  alternative='`top`', obj_type='argument')
C:\Users\mayou\Anaconda3\lib\site-packages\matplotlib\axes\_base.py:3610: MatplotlibDeprecationWarning: 
The `ymax` argument was deprecated in Matplotlib 3.0 and will be removed in 3.2. Use `top` instead.
  alternative='`top`', obj_type='argument')

Text(0, 0.5, 'Density')


knnAccuracy = round(100*accuracy_score(y_testKnn,y_pred_knn),2)
dtAccuracy = round(100*accuracy_score(y_testDT,y_pred_dt),2)
svcAccuracy = round(100*accuracy_score(y_testSvc,y_pred_svc),2)


fig = plt.figure(figsize=(10,8))
plt.bar(x=['KNN','Decision Tree', 'SVC'], height=[knnAccuracy,dtAccuracy,svcAccuracy], 
        width=0.6, 
        color=['#cfd2cd','#e5e6e4','#FF006E'])
plt.text(0,knnAccuracy-5,f'{knnAccuracy}%',ha='center',fontsize=15, fontweight='bold')
plt.text(1,dtAccuracy-5,f'{dtAccuracy}%',ha='center',fontsize=15, fontweight='bold')
plt.text(2,svcAccuracy-5,f'{svcAccuracy}%',ha='center',fontsize=15, fontweight='bold', color='white')
plt.text(0.16,0.89,'SVC',transform=fig.transFigure, fontsize=16, fontweight='bold', color='#FF006E')
plt.text(0.22,0.89,'is the model that has the',transform=fig.transFigure, fontsize=16)
plt.text(0.51,0.89,'highest accuracy.',transform=fig.transFigure, fontsize=16, fontweight='bold', color='#FF006E')
plt.xticks(fontsize=12)
plt.yticks([])
plt.title("Models' accuracy comparison", fontsize=18, y=1.08)
plt.xlabel('Models',fontsize=15)
plt.gca().get_xticklabels()[2].set_color('#FF006E') 
sns.despine(left=True)


knn = KNeighborsClassifier()
param_grid = {"n_neighbors": np.arange(1, 32),
              "weights":['uniform','distance'],
              "metric":['minkowski','euclidean','manhattan']
             }
knn_gscv = GridSearchCV(knn, param_grid, cv=5)
knn_gscv.fit(X_trainKnn, y_trainKnn.ravel())

GridSearchCV(cv=5, estimator=KNeighborsClassifier(),
             param_grid={'metric': ['minkowski', 'euclidean', 'manhattan'],
                         'n_neighbors': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]),
                         'weights': ['uniform', 'distance']})


knn_gscv.best_params_

{'metric': 'manhattan', 'n_neighbors': 28, 'weights': 'distance'}


knn_gscv.best_score_

0.6641195558683244


modelKnn = knn_gscv.best_estimator_


knnOpti = modelKnn.score(X_testKnn,y_testKnn)
md(f"**<font color=#3a0ca3>Optimised KNN</font>** accuracy: **<font color=#3a0ca3>{100*knnOpti:.2f}%</font>**")


dec_tree = DecisionTreeClassifier()
paramTree_grid = {"criterion":["gini","entropy"],
                  "max_depth":np.arange(1, 32)
                 }
dtree_gscv = GridSearchCV(dec_tree, paramTree_grid, cv=5)
dtree_gscv.fit(X_trainDT, y_trainDT.ravel())

GridSearchCV(cv=5, estimator=DecisionTreeClassifier(),
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31])})


dtree_gscv.best_params_

{'criterion': 'entropy', 'max_depth': 24}


dtree_gscv.best_score_

0.5890755727578388


modelDT = dtree_gscv.best_estimator_


dtOpti = modelDT.score(X_testDT,y_testDT)
md(f"**<font color=#3a0ca3>Optimised Decision Tree</font>** accuracy: **<font color=#3a0ca3>{100*dtOpti:.2f}%</font>**")

C:\Users\mayou\Anaconda3\lib\site-packages\sklearn\base.py:493: FutureWarning: The feature names should match those that were passed during fit. Starting version 1.2, an error will be raised.
Feature names unseen at fit time:
- Predict
- True
Feature names must be in the same order as they were in fit.

  warnings.warn(message, FutureWarning)

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-76-8f73aa2fefba> in <module>
----> 1 dtOpti = modelDT.score(X_testDT,y_testDT)
      2 md(f"**<font color=#3a0ca3>Optimised Decision Tree</font>** accuracy: **<font color=#3a0ca3>{100*dtOpti:.2f}%</font>**")

~\Anaconda3\lib\site-packages\sklearn\base.py in score(self, X, y, sample_weight)
    649         from .metrics import accuracy_score
    650 
--> 651         return accuracy_score(y, self.predict(X), sample_weight=sample_weight)
    652 
    653     def _more_tags(self):

~\Anaconda3\lib\site-packages\sklearn\tree\_classes.py in predict(self, X, check_input)
    465         """
    466         check_is_fitted(self)
--> 467         X = self._validate_X_predict(X, check_input)
    468         proba = self.tree_.predict(X)
    469         n_samples = X.shape[0]

~\Anaconda3\lib\site-packages\sklearn\tree\_classes.py in _validate_X_predict(self, X, check_input)
    431         """Validate the training data on predict (probabilities)."""
    432         if check_input:
--> 433             X = self._validate_data(X, dtype=DTYPE, accept_sparse="csr", reset=False)
    434             if issparse(X) and (
    435                 X.indices.dtype != np.intc or X.indptr.dtype != np.intc

~\Anaconda3\lib\site-packages\sklearn\base.py in _validate_data(self, X, y, reset, validate_separately, **check_params)
    583 
    584         if not no_val_X and check_params.get("ensure_2d", True):
--> 585             self._check_n_features(X, reset=reset)
    586 
    587         return out

~\Anaconda3\lib\site-packages\sklearn\base.py in _check_n_features(self, X, reset)
    399         if n_features != self.n_features_in_:
    400             raise ValueError(
--> 401                 f"X has {n_features} features, but {self.__class__.__name__} "
    402                 f"is expecting {self.n_features_in_} features as input."
    403             )

ValueError: X has 13 features, but DecisionTreeClassifier is expecting 11 features as input.


svc = SVC()
paramSvc_grid = {'C': [0.1, 1, 10, 100, 1000],
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['rbf']}

svc_gscv = GridSearchCV(svc, paramSvc_grid, cv=5)
svc_gscv.fit(X_trainSvc, y_trainSvc.ravel())


svc_gscv.best_params_


svc_gscv.best_score_


modelsvc = svc_gscv.best_estimator_


svcOpti = modelsvc.score(X_testSvc,y_testSvc)
md(f"**<font color=#3a0ca3>Optimised SVC</font>** accuracy: **<font color=#3a0ca3>{100*svcOpti:.2f}%</font>**")


knnOptimal = round(100*knnOpti,2)
dtOptimal = round(100*dtOpti,2)
svcOptimal = round(100*svcOpti,2)


fig = plt.figure(figsize=(10,8))
plt.bar(x=['KNN','Decision Tree', 'SVC'], height=[knnOptimal,dtOptimal,svcOptimal], 
        width=0.6, 
        color=['#3a0ca3','#e5e6e4','#cfd2cd'])
plt.text(0,knnOptimal-5,f'{knnOptimal}%',ha='center',fontsize=15, fontweight='bold', color='white')
plt.text(1,dtOptimal-5,f'{dtOptimal}%',ha='center',fontsize=15, fontweight='bold')
plt.text(2,svcOptimal-5,f'{svcOptimal}%',ha='center',fontsize=15, fontweight='bold')
plt.text(0.16,0.89,'KNN',transform=fig.transFigure, fontsize=16, fontweight='bold', color='#3a0ca3')
plt.text(0.22,0.89,'is the optimized model that has the',transform=fig.transFigure, fontsize=16)
plt.text(0.626,0.89,'highest accuracy.',transform=fig.transFigure, fontsize=16, fontweight='bold', color='#3a0ca3')
plt.xticks(fontsize=12)
plt.yticks([])
plt.title("Optimized models' accuracy comparison", fontsize=18, y=1.08)
plt.xlabel('Models',fontsize=15)
plt.gca().get_xticklabels()[0].set_color('#3a0ca3') 
sns.despine(left=True)


X_trainVC,X_testVC,y_trainVC,y_testVC = train_test_split(df_scaled_features, targetReg, test_size=0.2, random_state=0)


estimator = []
estimator.append(('KNN', KNeighborsClassifier(n_neighbors = 28, weights = 'distance', metric = 'manhattan')))
estimator.append(('DTC', DecisionTreeClassifier(criterion = 'gini', max_depth = 31)))
estimator.append(('SVC', SVC(C=10, gamma=1, kernel='rbf')))


vot_hard = VotingClassifier(estimators = estimator, voting ='hard')
vot_hard.fit(X_trainVC, y_trainVC.ravel())
y_pred = vot_hard.predict(X_testVC)

md(f"Hard Voting Score: **<font color=#3a0ca3>{100*accuracy_score(y_testVC, y_pred):.2f}%</font>**")



# vot_soft = VotingClassifier(estimators = estimator, voting ='soft')
# vot_soft.fit(X_trainVC, y_trainVC.ravel())
# y_pred = vot_soft.predict(X_testVC)

# print("Soft Voting Score:",accuracy_score(y_testVC, y_pred))


from sklearn.ensemble import BaggingClassifier


model = BaggingClassifier(base_estimator=KNeighborsClassifier(n_neighbors = 28, weights = 'distance', metric = 'manhattan'),n_estimators=100)


model.fit(X_trainKnn, y_trainKnn.ravel())
baggingScore = model.score(X_testKnn, y_testKnn.ravel())

md(f"Bagging classifier Score: **<font color=#3a0ca3>{100*baggingScore:.2f}%</font>**")


X_trainRF,X_testRF,y_trainRF,y_testRF = train_test_split(df_scaled_features, targetReg, test_size=0.2, random_state=0)


forest = RandomForestClassifier(n_estimators=100, criterion='gini', max_depth=31)
forest.fit(X_trainRF, y_trainRF.ravel())

y_pred_rf = forest.predict(X_testRF)


print(classification_report(y_testRF, y_pred_rf.round(), digits=3))


print("Wrong values predicted out of total values : ")
print((y_testRF!=y_pred_rf).sum(),'/',((y_testRF==y_pred_rf).sum()+(y_testRF!=y_pred_rf).sum()))


md(f"Random Forest accuracy: **<font color=#3a0ca3>{100*accuracy_score(y_testRF,y_pred_rf):.2f}%</font>**")

	fixed acidity	volatile acidity	citric acid	residual sugar	chlorides	free sulfur dioxide	total sulfur dioxide	density	pH	sulphates	alcohol	quality
0	7.0	0.27	0.36	20.7	0.045	45.0	170.0	1.0010	3.00	0.45	8.8	6
1	6.3	0.30	0.34	1.6	0.049	14.0	132.0	0.9940	3.30	0.49	9.5	6
2	8.1	0.28	0.40	6.9	0.050	30.0	97.0	0.9951	3.26	0.44	10.1	6
3	7.2	0.23	0.32	8.5	0.058	47.0	186.0	0.9956	3.19	0.40	9.9	6
4	7.2	0.23	0.32	8.5	0.058	47.0	186.0	0.9956	3.19	0.40	9.9	6

	fixed acidity	volatile acidity	citric acid	residual sugar	chlorides	free sulfur dioxide	total sulfur dioxide	density	pH	sulphates	alcohol	quality
count	4898.000000	4898.000000	4898.000000	4898.000000	4898.000000	4898.000000	4898.000000	4898.000000	4898.000000	4898.000000	4898.000000	4898.000000
mean	6.854788	0.278241	0.334192	6.391415	0.045772	35.308085	138.360657	0.994027	3.188267	0.489847	10.514267	5.877909
std	0.843868	0.100795	0.121020	5.072058	0.021848	17.007137	42.498065	0.002991	0.151001	0.114126	1.230621	0.885639
min	3.800000	0.080000	0.000000	0.600000	0.009000	2.000000	9.000000	0.987110	2.720000	0.220000	8.000000	3.000000
25%	6.300000	0.210000	0.270000	1.700000	0.036000	23.000000	108.000000	0.991723	3.090000	0.410000	9.500000	5.000000
50%	6.800000	0.260000	0.320000	5.200000	0.043000	34.000000	134.000000	0.993740	3.180000	0.470000	10.400000	6.000000
75%	7.300000	0.320000	0.390000	9.900000	0.050000	46.000000	167.000000	0.996100	3.280000	0.550000	11.400000	6.000000
max	14.200000	1.100000	1.660000	65.800000	0.346000	289.000000	440.000000	1.038980	3.820000	1.080000	14.200000	9.000000

	fixed acidity	volatile acidity	citric acid	residual sugar	chlorides	free sulfur dioxide	total sulfur dioxide	density	pH	sulphates	alcohol
0	0.172097	-0.081770	0.213280	2.821349	-0.035355	0.569932	0.744565	2.331512	-1.246921	-0.349184	-1.393152
1	-0.657501	0.215896	0.048001	-0.944765	0.147747	-1.253019	-0.149685	-0.009154	0.740029	0.001342	-0.824276
2	1.475751	0.017452	0.543838	0.100282	0.193523	-0.312141	-0.973336	0.358665	0.475102	-0.436816	-0.336667
3	0.409125	-0.478657	-0.117278	0.415768	0.559727	0.687541	1.121091	0.525855	0.011480	-0.787342	-0.499203
4	0.409125	-0.478657	-0.117278	0.415768	0.559727	0.687541	1.121091	0.525855	0.011480	-0.787342	-0.499203

	Actual	Predicted
0	5	5.948460
1	6	5.668554
2	7	5.979560
3	8	6.632673
4	5	5.761856

	fixed acidity	volatile acidity	citric acid	residual sugar	chlorides	free sulfur dioxide	total sulfur dioxide	density	pH	sulphates	alcohol	True	Predict
0	0.527639	0.414339	0.130641	-0.984201	0.193523	-1.605848	0.579835	-0.530788	0.342639	-0.612079	0.150942	5	5
1	0.172097	0.315117	-0.613115	0.198872	-0.080834	-0.429751	0.509236	0.458979	-0.385910	-0.261553	-0.580471	6	5
2	0.883181	-1.371654	-0.046493	-0.944765	-0.264233	-0.488556	-0.832138	-0.811668	-0.783300	-0.787342	0.232210	7	6
3	-2.198183	0.116674	-0.030487	-0.136332	-0.493110	1.099175	0.391572	-1.426929	0.541334	-1.313131	1.938840	8	6
4	-1.013043	0.017452	-0.943673	1.135471	0.101972	0.393517	0.579835	0.559293	0.077712	-0.261553	-0.336667	5	5

Mayouran VARATHALINGAM¶

Prediction of wine quality based on its composition

1. Analysis of the dataset:

2. Machine Learning and Prediction:

a. Regression

Linear Regression¶

Logistic regression¶

b. Classification

KNN¶

Decision Tree¶

SVC¶

c. Optimisation

KNN¶

Decision Tree¶

SVC¶

d. Improvement

Voting classifier¶

Bagging classifier¶

Random Forest¶

	fixed acidity	volatile acidity	citric acid	residual sugar	chlorides	free sulfur dioxide	total sulfur dioxide	density	pH	sulphates	alcohol	True	Predict
2762	0.527639	0.414339	0.130641	-0.984201	0.193523	-1.605848	0.579835	-0.530788	0.342639	-0.612079	0.150942	5	5
42	0.172097	0.315117	-0.613115	0.198872	-0.080834	-0.429751	0.509236	0.458979	-0.385910	-0.261553	-0.580471	6	5
1419	0.883181	-1.371654	-0.046493	-0.944765	-0.264233	-0.488556	-0.832138	-0.811668	-0.783300	-0.787342	0.232210	7	6
3664	-2.198183	0.116674	-0.030487	-0.136332	-0.493110	1.099175	0.391572	-1.426929	0.541334	-1.313131	1.938840	8	7
2125	-1.013043	0.017452	-0.943673	1.135471	0.101972	0.393517	0.579835	0.559293	0.077712	-0.261553	-0.336667	5	6