import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
df = pd.read_csv('WineQT.csv')
Basic data understanding and preparation
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1143 entries, 0 to 1142
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype
---  ------                --------------  -----
 0   fixed acidity         1143 non-null   float64
 1   volatile acidity      1143 non-null   float64
 2   citric acid           1143 non-null   float64
 3   residual sugar        1143 non-null   float64
 4   chlorides             1143 non-null   float64
 5   free sulfur dioxide   1143 non-null   float64
 6   total sulfur dioxide  1143 non-null   float64
 7   density               1143 non-null   float64
 8   pH                    1143 non-null   float64
 9   sulphates             1143 non-null   float64
 10  alcohol               1143 non-null   float64
 11  quality               1143 non-null   int64
 12  Id                    1143 non-null   int64
dtypes: float64(11), int64(2)
memory usage: 116.2 KB
df.describe()
  | fixed acidity | volatile acidity | citric acid | residual sugar | chlorides | free sulfur dioxide | total sulfur dioxide | density | pH | sulphates | alcohol | quality | Id
---|---|---|---|---|---|---|---|---|---|---|---|---|---
count | 1143.000000 | 1143.000000 | 1143.000000 | 1143.000000 | 1143.000000 | 1143.000000 | 1143.000000 | 1143.000000 | 1143.000000 | 1143.000000 | 1143.000000 | 1143.000000 | 1143.000000 |
mean | 8.311111 | 0.531339 | 0.268364 | 2.532152 | 0.086933 | 15.615486 | 45.914698 | 0.996730 | 3.311015 | 0.657708 | 10.442111 | 5.657043 | 804.969379 |
std | 1.747595 | 0.179633 | 0.196686 | 1.355917 | 0.047267 | 10.250486 | 32.782130 | 0.001925 | 0.156664 | 0.170399 | 1.082196 | 0.805824 | 463.997116 |
min | 4.600000 | 0.120000 | 0.000000 | 0.900000 | 0.012000 | 1.000000 | 6.000000 | 0.990070 | 2.740000 | 0.330000 | 8.400000 | 3.000000 | 0.000000 |
25% | 7.100000 | 0.392500 | 0.090000 | 1.900000 | 0.070000 | 7.000000 | 21.000000 | 0.995570 | 3.205000 | 0.550000 | 9.500000 | 5.000000 | 411.000000 |
50% | 7.900000 | 0.520000 | 0.250000 | 2.200000 | 0.079000 | 13.000000 | 37.000000 | 0.996680 | 3.310000 | 0.620000 | 10.200000 | 6.000000 | 794.000000 |
75% | 9.100000 | 0.640000 | 0.420000 | 2.600000 | 0.090000 | 21.000000 | 61.000000 | 0.997845 | 3.400000 | 0.730000 | 11.100000 | 6.000000 | 1209.500000 |
max | 15.900000 | 1.580000 | 1.000000 | 15.500000 | 0.611000 | 68.000000 | 289.000000 | 1.003690 | 4.010000 | 2.000000 | 14.900000 | 8.000000 | 1597.000000 |
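The max values for residual sugar, chlorides, and total sulfur dioxide sit far above their 75th percentiles, which hints at outliers. A minimal sketch for counting values outside the usual 1.5×IQR whiskers per column (an extra check, not part of the original notebook):

```python
# Count values outside the 1.5*IQR whiskers for each physicochemical column
features = df.drop(columns=['quality', 'Id'])
q1, q3 = features.quantile(0.25), features.quantile(0.75)
iqr = q3 - q1
outliers = ((features < q1 - 1.5 * iqr) | (features > q3 + 1.5 * iqr)).sum()
print(outliers.sort_values(ascending=False))
```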
df.head(10)
  | fixed acidity | volatile acidity | citric acid | residual sugar | chlorides | free sulfur dioxide | total sulfur dioxide | density | pH | sulphates | alcohol | quality | Id
---|---|---|---|---|---|---|---|---|---|---|---|---|---
0 | 7.4 | 0.70 | 0.00 | 1.9 | 0.076 | 11.0 | 34.0 | 0.9978 | 3.51 | 0.56 | 9.4 | 5 | 0 |
1 | 7.8 | 0.88 | 0.00 | 2.6 | 0.098 | 25.0 | 67.0 | 0.9968 | 3.20 | 0.68 | 9.8 | 5 | 1 |
2 | 7.8 | 0.76 | 0.04 | 2.3 | 0.092 | 15.0 | 54.0 | 0.9970 | 3.26 | 0.65 | 9.8 | 5 | 2 |
3 | 11.2 | 0.28 | 0.56 | 1.9 | 0.075 | 17.0 | 60.0 | 0.9980 | 3.16 | 0.58 | 9.8 | 6 | 3 |
4 | 7.4 | 0.70 | 0.00 | 1.9 | 0.076 | 11.0 | 34.0 | 0.9978 | 3.51 | 0.56 | 9.4 | 5 | 4 |
5 | 7.4 | 0.66 | 0.00 | 1.8 | 0.075 | 13.0 | 40.0 | 0.9978 | 3.51 | 0.56 | 9.4 | 5 | 5 |
6 | 7.9 | 0.60 | 0.06 | 1.6 | 0.069 | 15.0 | 59.0 | 0.9964 | 3.30 | 0.46 | 9.4 | 5 | 6 |
7 | 7.3 | 0.65 | 0.00 | 1.2 | 0.065 | 15.0 | 21.0 | 0.9946 | 3.39 | 0.47 | 10.0 | 7 | 7 |
8 | 7.8 | 0.58 | 0.02 | 2.0 | 0.073 | 9.0 | 18.0 | 0.9968 | 3.36 | 0.57 | 9.5 | 7 | 8 |
9 | 6.7 | 0.58 | 0.08 | 1.8 | 0.097 | 15.0 | 65.0 | 0.9959 | 3.28 | 0.54 | 9.2 | 5 | 10 |
df.isna().sum() # checking for null values
fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
quality                 0
Id                      0
dtype: int64
df.columns
Index(['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar', 'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol', 'quality', 'Id'], dtype='object')
df = df.drop('Id', axis=1)  # Id is just a row identifier, not a feature
df.columns
Index(['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar', 'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol', 'quality'], dtype='object')
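Rows 0 and 4 in df.head(10) above differ only in Id, so dropping Id exposes exact duplicates. A quick check worth running at this point (the original pipeline keeps the duplicates):

```python
# Rows 0 and 4 above were identical apart from Id, so exact duplicates now exist
print(df.duplicated().sum())
# df = df.drop_duplicates().reset_index(drop=True)  # optional; not done in this notebook
```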
EDA of the wine quality data
plt.figure(figsize=(12,12))
sns.heatmap(df.corr(),annot=True,cmap='coolwarm',linecolor='white',linewidths=1) # full correlation matrix of all variables
#sns.heatmap(df.corr()[df.corr()>0.6],annot=True,cmap='coolwarm',linecolor='white',linewidths=1) # correlations with > 0.6
<AxesSubplot: >
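The annotated heatmap is dense; a numeric ranking of each feature's correlation with quality makes the strongest relationships explicit. A small sketch to that end:

```python
# Features ranked by absolute correlation with quality
corr_with_quality = df.corr()['quality'].drop('quality')
print(corr_with_quality.reindex(corr_with_quality.abs().sort_values(ascending=False).index))
```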
sns.countplot(data = df, x = 'quality')
<AxesSubplot: xlabel='quality', ylabel='count'>
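The countplot shows most wines rated 5 or 6. The exact counts and shares behind it can be printed like this; the imbalance matters for the classifier built later:

```python
# Exact class counts behind the countplot; qualities 5 and 6 dominate
print(df['quality'].value_counts().sort_index())
print((df['quality'].value_counts(normalize=True).sort_index() * 100).round(1))
```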
sns.boxplot(data = df, y= 'fixed acidity', x = 'quality')
<AxesSubplot: xlabel='quality', ylabel='fixed acidity'>
sns.pairplot(data=df,corner=True)
<seaborn.axisgrid.PairGrid at 0x296c3112430>
The main takeaway from the pair plot is that quality takes only a handful of discrete values, so the problem is better framed as classification than as regression.
g=sns.FacetGrid(data=df,col='quality')
g.map(plt.hist,'fixed acidity')
<seaborn.axisgrid.FacetGrid at 0x296c677f460>
sns.kdeplot(x='citric acid',y='quality',data=df)
<AxesSubplot: xlabel='citric acid', ylabel='quality'>
**ML building section**
df.columns
Index(['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar', 'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol', 'quality'], dtype='object')
for i in df.columns:
    plt.figure()
    sns.histplot(df[i], kde=True)  # distribution of each variable with a KDE overlay
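The histograms suggest several right-skewed features (e.g. residual sugar and chlorides). A one-liner to quantify that impression:

```python
# Skewness per feature; values well above 0 confirm the right skew seen in the histograms
print(df.drop(columns='quality').skew().sort_values(ascending=False))
```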
Data scaling
from sklearn import preprocessing
X = df.drop('quality',axis=1)
y = df['quality']
scaler = preprocessing.StandardScaler()
xdf = scaler.fit_transform(X)
xdf = pd.DataFrame(xdf,columns=X.columns)
xdf
  | fixed acidity | volatile acidity | citric acid | residual sugar | chlorides | free sulfur dioxide | total sulfur dioxide | density | pH | sulphates | alcohol
---|---|---|---|---|---|---|---|---|---|---|---
0 | -0.521580 | 0.939332 | -1.365027 | -0.466421 | -0.231395 | -0.450467 | -0.363610 | 0.555854 | 1.270695 | -0.573658 | -0.963382 |
1 | -0.292593 | 1.941813 | -1.365027 | 0.050060 | 0.234247 | 0.915920 | 0.643477 | 0.036165 | -0.708928 | 0.130881 | -0.593601 |
2 | -0.292593 | 1.273492 | -1.161568 | -0.171289 | 0.107253 | -0.060071 | 0.246745 | 0.140103 | -0.325775 | -0.045254 | -0.593601 |
3 | 1.653789 | -1.399789 | 1.483400 | -0.466421 | -0.252560 | 0.135127 | 0.429852 | 0.659792 | -0.964363 | -0.456235 | -0.593601 |
4 | -0.521580 | 0.939332 | -1.365027 | -0.466421 | -0.231395 | -0.450467 | -0.363610 | 0.555854 | 1.270695 | -0.573658 | -0.963382 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
1138 | -1.151292 | -0.118842 | -0.703785 | -0.171289 | -0.231395 | 1.306316 | -0.180503 | -0.514707 | 0.695966 | 0.541862 | 0.515741 |
1139 | -0.865059 | 0.493785 | -0.958109 | -0.466421 | -0.400719 | 1.208717 | -0.241539 | -0.114545 | 0.695966 | 0.952843 | -0.870937 |
1140 | -1.208538 | 0.382399 | -0.958109 | -0.392638 | 0.064922 | 1.599113 | -0.058432 | -0.951246 | 0.887542 | -0.456235 | 0.053515 |
1141 | -1.380278 | 0.103932 | -0.856379 | -0.245072 | -0.527712 | 2.282306 | 0.155192 | -0.836914 | 1.334554 | 0.600574 | 0.700632 |
1142 | -1.380278 | 0.633019 | -0.754650 | -0.392638 | -0.252560 | 1.599113 | -0.058432 | -0.655023 | 1.653848 | 0.307016 | -0.223820 |
1143 rows × 11 columns
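A quick sanity check that StandardScaler behaved as expected: each column should now have mean ≈ 0 and std ≈ 1 (pandas reports the sample std while the scaler uses the population std, so the values will be close to but not exactly 1):

```python
# Each standardised column should have mean ~0 and std ~1
print(xdf.mean().round(3))
print(xdf.std().round(3))
```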
X = xdf  # use the scaled features from here on
y = df['quality']
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.metrics import explained_variance_score
from sklearn.metrics import r2_score
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
lr = LinearRegression()
lr.fit(X_train,y_train)
test_predictions = lr.predict(X_test)
train_predictions = lr.predict(X_train)
test_r2 = r2_score(y_test,test_predictions)
train_r2 = r2_score(y_train,train_predictions)
MAE = metrics.mean_absolute_error(y_test, test_predictions)
MSE = metrics.mean_squared_error(y_test, test_predictions)
RMSE = np.sqrt(metrics.mean_squared_error(y_test, test_predictions))
print('Test R2: ',test_r2)
print('Train R2: ',train_r2)
print('MAE: ', MAE)
print('MSE: ', MSE)
print('RMSE: ', RMSE)
Test R2:  0.3368152773954599
Train R2:  0.3827250369452767
MAE:  0.4801747424866615
MSE:  0.3855917837384165
RMSE:  0.6209603721159801
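A test R² of about 0.34 is weak, but it is easier to judge against a trivial baseline that always predicts the training-set mean. A sketch using scikit-learn's DummyRegressor:

```python
# Baseline for context: always predict the training-set mean quality
from sklearn.dummy import DummyRegressor
dummy = DummyRegressor(strategy='mean')
dummy.fit(X_train, y_train)
dummy_pred = dummy.predict(X_test)
print('Baseline RMSE:', np.sqrt(metrics.mean_squared_error(y_test, dummy_pred)))
```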
plt.scatter(y_test,test_predictions)
<matplotlib.collections.PathCollection at 0x2969efcbd30>
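Beyond predicted-vs-actual, a residual plot is a standard diagnostic; any clear pattern in it would flag model misspecification. A minimal sketch:

```python
# Residuals vs. predictions; a visible structure here would suggest the linear model is misspecified
residuals = y_test - test_predictions
plt.scatter(test_predictions, residuals)
plt.axhline(0, color='red', linestyle='--')
plt.xlabel('Predicted quality')
plt.ylabel('Residual')
```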
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, KFold, RepeatedStratifiedKFold
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score, ConfusionMatrixDisplay
from sklearn.model_selection import GridSearchCV
model = RandomForestClassifier(n_estimators=200, max_depth=15)  # these two values are overridden by the grid search below
parameters = {'n_estimators': [100, 150, 200],
              'max_depth': [5, 10, 15]}
grid_RFC = GridSearchCV(estimator = model, param_grid = parameters, cv = 5, n_jobs = -1, verbose=1,return_train_score = True)
grid_RFC.fit(X_train,y_train)
print(grid_RFC.best_estimator_)
print(grid_RFC.best_score_)
grid_RFC_test_predictions = grid_RFC.predict(X_test)
grid_RFC_train_predictions = grid_RFC.predict(X_train)
Fitting 5 folds for each of 9 candidates, totalling 45 fits
RandomForestClassifier(max_depth=15)
0.6525
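best_score_ only reports the winning configuration; the full grid can be inspected through cv_results_ to see how sensitive accuracy is to each hyperparameter. A sketch:

```python
# Mean CV accuracy for every grid point, best first (a look beyond best_score_)
cv_results = pd.DataFrame(grid_RFC.cv_results_)
cols = ['param_n_estimators', 'param_max_depth', 'mean_test_score', 'std_test_score']
print(cv_results[cols].sort_values('mean_test_score', ascending=False).to_string(index=False))
```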
CM=confusion_matrix(y_test,grid_RFC_test_predictions,labels=grid_RFC.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=CM, display_labels=grid_RFC.classes_)
disp.plot(cmap='viridis')
plt.grid(None)
print(classification_report(y_test,grid_RFC_test_predictions))
              precision    recall  f1-score   support

           4       0.00      0.00      0.00         9
           5       0.67      0.74      0.70       143
           6       0.64      0.64      0.64       146
           7       0.65      0.59      0.62        41
           8       1.00      0.25      0.40         4

    accuracy                           0.65       343
   macro avg       0.59      0.44      0.47       343
weighted avg       0.64      0.65      0.64       343
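The report shows the model does well on the majority classes 5 and 6 but struggles on the rare qualities 4 and 8, consistent with the class imbalance seen earlier. The fitted forest also exposes impurity-based feature importances, which give a rough sense of which chemical properties drive the predictions. A sketch using the best estimator from the grid search:

```python
# Impurity-based importances from the best forest found by the grid search
importances = pd.Series(grid_RFC.best_estimator_.feature_importances_, index=X.columns)
importances.sort_values().plot(kind='barh')
plt.xlabel('Importance')
```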