ITCS 3162 Final Project Code
By: Philip Vishnevsky, Nikole Sazoncika, Santhosh Balla, Pranav Rao, and Dylan Laborante
1. Introduction
import pandas as pd
# Load the wine dataset from the UCI Machine Learning Repository
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data"
columns = [
    "Alcohol", "Malic acid", "Ash", "Alcalinity of ash", "Magnesium",
    "Total phenols", "Flavanoids", "Nonflavanoid phenols", "Proanthocyanins",
    "Color intensity", "Hue", "OD280/OD315 of diluted wines", "Proline"
]
# Note: the raw file has 14 columns: the cultivar label followed by 13 features.
# Since we supply only 13 names, pandas treats the first column (the label) as
# the index, leaving only the chemical features as data.
wine_df = pd.read_csv(url, header=None, names=columns)
# Display the first few rows
wine_df.head()
| | Alcohol | Malic acid | Ash | Alcalinity of ash | Magnesium | Total phenols | Flavanoids | Nonflavanoid phenols | Proanthocyanins | Color intensity | Hue | OD280/OD315 of diluted wines | Proline |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 14.23 | 1.71 | 2.43 | 15.6 | 127 | 2.80 | 3.06 | 0.28 | 2.29 | 5.64 | 1.04 | 3.92 | 1065 |
| 1 | 13.20 | 1.78 | 2.14 | 11.2 | 100 | 2.65 | 2.76 | 0.26 | 1.28 | 4.38 | 1.05 | 3.40 | 1050 |
| 1 | 13.16 | 2.36 | 2.67 | 18.6 | 101 | 2.80 | 3.24 | 0.30 | 2.81 | 5.68 | 1.03 | 3.17 | 1185 |
| 1 | 14.37 | 1.95 | 2.50 | 16.8 | 113 | 3.85 | 3.49 | 0.24 | 2.18 | 7.80 | 0.86 | 3.45 | 1480 |
| 1 | 13.24 | 2.59 | 2.87 | 21.0 | 118 | 2.80 | 2.69 | 0.39 | 1.82 | 4.32 | 1.04 | 2.93 | 735 |
In this project, we explore the natural groupings within a dataset of wine samples using unsupervised learning. The dataset contains chemical analysis results for wines grown in the same region of Italy, each associated with one of three cultivars. Our main questions are: which chemical attributes contribute most to distinguishing the wine types, and how accurately can we group the wines without knowing their original labels? By applying KMeans clustering and evaluating the results with metrics like the silhouette score alongside PCA visualizations, we aim to uncover the key features that differentiate the wines and to develop an efficient clustering approach that could assist real-world applications such as faster wine classification and labeling in the wine industry.
2. About the Data
We loaded the dataset and took a quick look at the values. All columns are integers or floats, so next we confirm that no stray character/string values exist that wouldn't make sense for this data, check for nulls, and pull some quick summary statistics.
wine_df.isnull().sum() # Check for null values in the dataset
Alcohol                         0
Malic acid                      0
Ash                             0
Alcalinity of ash               0
Magnesium                       0
Total phenols                   0
Flavanoids                      0
Nonflavanoid phenols            0
Proanthocyanins                 0
Color intensity                 0
Hue                             0
OD280/OD315 of diluted wines    0
Proline                         0
dtype: int64
wine_df.info()
<class 'pandas.core.frame.DataFrame'>
Index: 178 entries, 1 to 3
Data columns (total 13 columns):
 #   Column                        Non-Null Count  Dtype
---  ------                        --------------  -----
 0   Alcohol                       178 non-null    float64
 1   Malic acid                    178 non-null    float64
 2   Ash                           178 non-null    float64
 3   Alcalinity of ash             178 non-null    float64
 4   Magnesium                     178 non-null    int64
 5   Total phenols                 178 non-null    float64
 6   Flavanoids                    178 non-null    float64
 7   Nonflavanoid phenols          178 non-null    float64
 8   Proanthocyanins               178 non-null    float64
 9   Color intensity               178 non-null    float64
 10  Hue                           178 non-null    float64
 11  OD280/OD315 of diluted wines  178 non-null    float64
 12  Proline                       178 non-null    int64
dtypes: float64(11), int64(2)
memory usage: 19.5 KB
wine_df.describe()  # Quick overview of the dataset statistics (all columns are numeric, so include='all' would add nothing here)
| | Alcohol | Malic acid | Ash | Alcalinity of ash | Magnesium | Total phenols | Flavanoids | Nonflavanoid phenols | Proanthocyanins | Color intensity | Hue | OD280/OD315 of diluted wines | Proline |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 178.000000 | 178.000000 | 178.000000 | 178.000000 | 178.000000 | 178.000000 | 178.000000 | 178.000000 | 178.000000 | 178.000000 | 178.000000 | 178.000000 | 178.000000 |
| mean | 13.000618 | 2.336348 | 2.366517 | 19.494944 | 99.741573 | 2.295112 | 2.029270 | 0.361854 | 1.590899 | 5.058090 | 0.957449 | 2.611685 | 746.893258 |
| std | 0.811827 | 1.117146 | 0.274344 | 3.339564 | 14.282484 | 0.625851 | 0.998859 | 0.124453 | 0.572359 | 2.318286 | 0.228572 | 0.709990 | 314.907474 |
| min | 11.030000 | 0.740000 | 1.360000 | 10.600000 | 70.000000 | 0.980000 | 0.340000 | 0.130000 | 0.410000 | 1.280000 | 0.480000 | 1.270000 | 278.000000 |
| 25% | 12.362500 | 1.602500 | 2.210000 | 17.200000 | 88.000000 | 1.742500 | 1.205000 | 0.270000 | 1.250000 | 3.220000 | 0.782500 | 1.937500 | 500.500000 |
| 50% | 13.050000 | 1.865000 | 2.360000 | 19.500000 | 98.000000 | 2.355000 | 2.135000 | 0.340000 | 1.555000 | 4.690000 | 0.965000 | 2.780000 | 673.500000 |
| 75% | 13.677500 | 3.082500 | 2.557500 | 21.500000 | 107.000000 | 2.800000 | 2.875000 | 0.437500 | 1.950000 | 6.200000 | 1.120000 | 3.170000 | 985.000000 |
| max | 14.830000 | 5.800000 | 3.230000 | 30.000000 | 162.000000 | 3.880000 | 5.080000 | 0.660000 | 3.580000 | 13.000000 | 1.710000 | 4.000000 | 1680.000000 |
wine_df.dtypes
Alcohol                         float64
Malic acid                      float64
Ash                             float64
Alcalinity of ash               float64
Magnesium                         int64
Total phenols                   float64
Flavanoids                      float64
Nonflavanoid phenols            float64
Proanthocyanins                 float64
Color intensity                 float64
Hue                             float64
OD280/OD315 of diluted wines    float64
Proline                           int64
dtype: object
# Drop rows with missing values (a no-op here, since the null check above found none; kept as a safeguard)
wine_df.dropna(inplace=True)
from sklearn.preprocessing import StandardScaler
# Standardize all features to zero mean and unit variance; KMeans and PCA are sensitive to feature scale
scaler = StandardScaler()
wine_scaled = scaler.fit_transform(wine_df)
import seaborn as sns
import matplotlib.pyplot as plt
# Quick seaborn pairplot (small subset to avoid overload)
sns.pairplot(wine_df.iloc[:, :5]) # first 5 features for readability
plt.suptitle("Pairplot of First 5 Wine Features", y=1.02)
plt.show()
The pairplot compares the relationships between the first five chemical attributes: Alcohol, Malic acid, Ash, Alcalinity of ash, and Magnesium. Most feature pairs show no strong linear separation, but Alcohol and Magnesium display more visible structure in their scatter, hinting that they may be influential in clustering. This visualization helps us understand the general distribution and relationships in the raw data, supporting our goal of identifying features that naturally differentiate the wines.
plt.figure(figsize=(12, 8))
sns.heatmap(wine_df.corr(), annot=True, cmap="coolwarm")
plt.title("Feature Correlation Heatmap")
plt.show()
The heatmap reveals how strongly the features correlate with each other. A few interesting patterns emerge: for instance, "Total phenols" and "Flavanoids" are highly positively correlated (correlation ~0.86), suggesting they often vary together. "Alcohol" also shows moderate positive correlation with "Proline" (~0.64). On the flip side, "Malic acid" and "Flavanoids" are moderately negatively correlated. Recognizing these correlations helps us spot redundant features or feature groups, guiding better clustering by focusing on distinct chemical properties.
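To read these relationships off programmatically rather than eyeballing the heatmap, a short sketch like the following (our own addition, assuming wine_df from the loading cell above) ranks the strongest absolute pairwise correlations:

import numpy as np
# Rank the strongest absolute pairwise correlations (sketch; assumes wine_df is loaded above)
corr = wine_df.corr().abs()
upper = np.triu(np.ones(corr.shape, dtype=bool), k=1)  # keep each feature pair only once
pairs = corr.where(upper).stack().sort_values(ascending=False)
print(pairs.head(5))  # Total phenols / Flavanoids should appear near the top (~0.86)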
from sklearn.decomposition import PCA
# wine_scaled must already be defined by the StandardScaler cell above before running this cell
# Reduce dimensions
pca = PCA(n_components=2)
wine_pca = pca.fit_transform(wine_scaled)
# Plot
plt.figure(figsize=(8, 6))
plt.scatter(wine_pca[:, 0], wine_pca[:, 1])
plt.xlabel('PCA Component 1')
plt.ylabel('PCA Component 2')
plt.title('2D PCA Projection of Wine Dataset')
plt.show()
In the PCA projection, we see that the wine samples form loose but visible clusters when reduced to just two principal components. This suggests that even though the original data is 13-dimensional, most of the variation between wine samples can be captured effectively in just a few dimensions. This visualization reinforces that unsupervised clustering will likely work well because the wines exhibit inherent separability based on their chemical profiles — aligning with our project's goal to group wines based on chemical differences.
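To put a number on "most of the variation," we can inspect the explained variance of the fitted projection (a quick check using the pca object from the cell above; on this dataset the first two components together capture roughly half of the total variance):

# Share of total variance captured by each principal component, and by the 2D projection overall
print(pca.explained_variance_ratio_)
print(f"Total variance explained by 2 components: {pca.explained_variance_ratio_.sum():.3f}")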
from sklearn.cluster import KMeans
# Find best number of clusters using Elbow Method
inertia = []
k_range = range(1, 11)
for k in k_range:
kmeans = KMeans(n_clusters=k, random_state=42)
kmeans.fit(wine_scaled)
inertia.append(kmeans.inertia_)
plt.figure(figsize=(8, 5))
plt.plot(k_range, inertia, 'bo-')
plt.xlabel('Number of clusters (k)')
plt.ylabel('Inertia')
plt.title('Elbow Method For Optimal k')
plt.show()
The Elbow Method graph shows a clear bend or "elbow" at around k=3, after which the inertia decreases more slowly. This supports the idea that three natural groupings exist in the dataset — which makes sense, given the data comes from three cultivars. Thus, setting k=3 for KMeans clustering is logical and justified. This method allows us to objectively choose the number of clusters rather than guessing, directly improving the quality and credibility of our results.
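As a complementary check on the elbow, one can sweep k and compare silhouette scores directly (a sketch reusing wine_scaled from above; silhouette needs at least two clusters, so the sweep starts at k=2):

from sklearn.metrics import silhouette_score

# Silhouette sweep over candidate cluster counts (complements the elbow plot)
for k in range(2, 7):
    km = KMeans(n_clusters=k, n_init=10, random_state=42).fit(wine_scaled)
    print(f"k={k}: silhouette = {silhouette_score(wine_scaled, km.labels_):.3f}")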
from sklearn.metrics import silhouette_score
# Select features
X = wine_df[['Proline', 'Alcohol']]
# Standardize features
scaler = StandardScaler()
wine_scaled = scaler.fit_transform(X)
# Apply KMeans clustering
kmeans = KMeans(n_clusters=3, random_state=42)
kmeans.fit(wine_scaled)
# Apply PCA for visualization (with only two input features this is just a rotation, but it keeps the plotting code uniform across models)
pca = PCA(n_components=2)
pca_components = pca.fit_transform(wine_scaled)
# Create DataFrame for plotting
pca_df = pd.DataFrame(pca_components, columns=['PCA1', 'PCA2'])
pca_df['Cluster'] = kmeans.labels_
# Plot clusters
plt.figure(figsize=(8, 6))
sns.scatterplot(data=pca_df, x='PCA1', y='PCA2', hue='Cluster', palette='Set2', s=100)
plt.title('KMeans Clustering: Proline vs Alcohol')
plt.show()
# Silhouette Score (measures cluster separation quality)
silh_score = silhouette_score(wine_scaled, kmeans.labels_)
print(f"Silhouette Score (Proline & Alcohol): {silh_score:.3f}")
Silhouette Score (Proline & Alcohol): 0.440
# Select features
X = wine_df[['Total phenols', 'Flavanoids']]
# Standardize
wine_scaled = scaler.fit_transform(X)
# KMeans clustering
kmeans = KMeans(n_clusters=3, random_state=42)
kmeans.fit(wine_scaled)
# PCA for 2D visualization
pca_components = pca.fit_transform(wine_scaled)
pca_df = pd.DataFrame(pca_components, columns=['PCA1', 'PCA2'])
pca_df['Cluster'] = kmeans.labels_
# Plot
plt.figure(figsize=(8, 6))
sns.scatterplot(data=pca_df, x='PCA1', y='PCA2', hue='Cluster', palette='Set1', s=100)
plt.title('KMeans Clustering: Total Phenols and Flavanoids')
plt.show()
# Evaluate with Silhouette Score
silh_score = silhouette_score(wine_scaled, kmeans.labels_)
print(f"Silhouette Score (Total Phenols & Flavanoids): {silh_score:.3f}")
Silhouette Score (Total Phenols & Flavanoids): 0.481
# Select features
X = wine_df[['Malic acid', 'Flavanoids']]
# Standardize
wine_scaled = scaler.fit_transform(X)
# KMeans clustering
kmeans = KMeans(n_clusters=3, random_state=42)
kmeans.fit(wine_scaled)
# PCA transformation for visualization
pca_components = pca.fit_transform(wine_scaled)
pca_df = pd.DataFrame(pca_components, columns=['PCA1', 'PCA2'])
pca_df['Cluster'] = kmeans.labels_
# Plot
plt.figure(figsize=(8, 6))
sns.scatterplot(data=pca_df, x='PCA1', y='PCA2', hue='Cluster', palette='deep', s=100)
plt.title('KMeans Clustering: Malic Acid and Flavanoids')
plt.show()
# Evaluate clustering
silh_score = silhouette_score(wine_scaled, kmeans.labels_)
print(f"Silhouette Score (Malic Acid & Flavanoids): {silh_score:.3f}")
Silhouette Score (Malic Acid & Flavanoids): 0.483
To evaluate how different chemical attributes influence wine classification, we tested three 2D clustering models using KMeans:
Model 1 used Proline and Alcohol, which showed a fairly good separation and produced a Silhouette Score of 0.440. This result aligns with earlier observations from the correlation heatmap, where these features showed a moderate correlation and individually strong relevance to wine chemistry.
Model 2 explored Total Phenols and Flavanoids, two features with a strong correlation (~0.86). The clusters here were tighter and slightly more separable, with a Silhouette Score of 0.481. Even though correlated features carry overlapping information, this pair still partitions the wines reasonably well; visually, the clusters appear more compact than in Model 1, possibly because these features capture distinct flavor and structure dimensions of the wine profiles.
Model 3 tested Malic Acid and Flavanoids and produced a Silhouette Score of 0.483, marginally the best of the three pairs, with slightly less visual overlap between clusters. Still, the narrow spread across all three models shows that no single feature pair is dramatically more informative than the others, which reinforces the need for thoughtful feature selection or dimensionality reduction such as PCA when using unsupervised learning. The helper sketch below condenses these three experiments into one reusable sweep.
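Since the three experiments above share the same template, a small helper makes it easy to score any candidate pair under identical settings (a sketch; cluster_feature_pair is our name, not part of the original notebook):

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler

def cluster_feature_pair(df, features, k=3, seed=42):
    """Standardize the given features, run KMeans, and return the silhouette score."""
    X_pair = StandardScaler().fit_transform(df[features])
    km = KMeans(n_clusters=k, n_init=10, random_state=seed).fit(X_pair)
    return silhouette_score(X_pair, km.labels_)

# Reproduce the three pairwise experiments in one sweep
for pair in [['Proline', 'Alcohol'], ['Total phenols', 'Flavanoids'], ['Malic acid', 'Flavanoids']]:
    print(pair, f"{cluster_feature_pair(wine_df, pair):.3f}")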
# 1. Use all features
X = wine_df.copy()
# 2. Standardize
scaler = StandardScaler()
wine_scaled = scaler.fit_transform(X)
# 3. Final KMeans with optimal k = 3
kmeans = KMeans(n_clusters=3, random_state=42)
labels = kmeans.fit_predict(wine_scaled)
# 4. Silhouette score
silh_score = silhouette_score(wine_scaled, labels)
print(f"Final Model Silhouette Score (k=3, all features): {silh_score:.3f}")
# 5. PCA for visualization
pca = PCA(n_components=2)
pca_components = pca.fit_transform(wine_scaled)
pca_df = pd.DataFrame(pca_components, columns=['PCA1', 'PCA2'])
pca_df['Cluster'] = labels
# 6. Plot clusters
plt.figure(figsize=(8, 6))
sns.scatterplot(data=pca_df, x='PCA1', y='PCA2', hue='Cluster', palette='Set1', s=100)
plt.title('Final KMeans Clustering (All Features, k=3)')
plt.show()
Final Model Silhouette Score (k=3, all features): 0.285
Based on the Elbow Method, we identified 3 as the optimal number of clusters (k=3). Using this value, we trained a final KMeans clustering model on all 13 chemical features of the wine dataset. The Silhouette Score of 0.285 indicates moderate cluster cohesion and separation; silhouette values computed in 13 dimensions tend to run lower than those from the 2D experiments above, since distances spread out in higher-dimensional space. Even so, the PCA plot of the clustered data reveals clear groupings, indicating that the wine samples can be clustered effectively using all available chemical properties. This result supports our project goal of distinguishing wine types based on chemical composition.
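To tie the final clusters back to our first research question, which chemical attributes differentiate the wines, the fitted centroids can be mapped back into original units (a sketch using the scaler and kmeans objects from the final-model cell; features with large between-centroid gaps are the ones doing the separating):

# Express the cluster centroids in the original (unscaled) feature units
centers = pd.DataFrame(scaler.inverse_transform(kmeans.cluster_centers_), columns=wine_df.columns)
print(centers.round(2))  # look for features with large gaps between rows, e.g. Proline or Flavanoids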
# Extra experiment: Malic acid and Proline with k=2.
# Fresh variable names are used so this cell does not overwrite the final model's
# scaler, wine_scaled, or labels, which the Evaluation section below reuses.
X_mp = wine_df[['Malic acid', 'Proline']]
# Standardize
mp_scaled = StandardScaler().fit_transform(X_mp)
# KMeans clustering (k=2 this time, to see how a coarser split behaves)
kmeans_mp = KMeans(n_clusters=2, random_state=42)
kmeans_mp.fit(mp_scaled)
# PCA transformation for visualization (a rotation, since there are only two features)
mp_components = PCA(n_components=2).fit_transform(mp_scaled)
pca_df = pd.DataFrame(mp_components, columns=['PCA1', 'PCA2'])
pca_df['Cluster'] = kmeans_mp.labels_
# Plot
plt.figure(figsize=(8, 6))
sns.scatterplot(data=pca_df, x='PCA1', y='PCA2', hue='Cluster', palette='deep', s=100)
plt.title('KMeans Clustering: Malic Acid and Proline')
plt.show()
# Evaluate clustering
silh_score = silhouette_score(mp_scaled, kmeans_mp.labels_)
print(f"Silhouette Score (Malic Acid & Proline): {silh_score:.3f}")
Silhouette Score (Malic Acid & Proline): 0.415
3. Methods
In our preprocessing steps, we first loaded the wine dataset and verified that there were no missing values, ensuring the dataset was clean and ready for analysis. We then standardized the features using StandardScaler to normalize the range of all variables, a crucial step because clustering algorithms like KMeans are sensitive to the scale of input features. Without normalization, attributes like "Proline" — which has much larger values — could dominate the clustering process, leading to misleading groupings. After normalization, we explored the feature relationships through a pairplot and correlation heatmap, which revealed some redundant features and informed our understanding of which variables might be most influential.
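To make the scaling argument concrete, a minimal sketch (our own addition; it assumes wine_df from above and borrows the ground-truth labels from sklearn.datasets.load_wine, which the Evaluation section verifies to match our data) compares clustering agreement on raw versus standardized features:

from sklearn.cluster import KMeans
from sklearn.datasets import load_wine
from sklearn.metrics import adjusted_rand_score
from sklearn.preprocessing import StandardScaler

y_true = load_wine().target
for name, X_demo in [("raw", wine_df.values), ("standardized", StandardScaler().fit_transform(wine_df))]:
    km = KMeans(n_clusters=3, n_init=10, random_state=42).fit(X_demo)
    # On raw features, Proline's large magnitude tends to dominate the distances
    print(f"{name}: ARI = {adjusted_rand_score(y_true, km.labels_):.3f}")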
For modeling, we applied Principal Component Analysis (PCA) to reduce the dataset's dimensionality and visualize it in two dimensions. PCA helped confirm that the wine samples have separable groupings based on chemical properties, suggesting clustering would be effective. We then used the Elbow Method to determine the optimal number of clusters for KMeans, finding that k=3 made the most sense, consistent with the wines originating from three different cultivars. Finally, we clustered the wines with KMeans and evaluated the clusters using the silhouette score and PCA visualizations.
Throughout this process, the approach we chose worked effectively: normalizing the data made the clustering more meaningful, and PCA gave us clear visual confirmation that the wines could be grouped well. One minor experiment we considered was whether we needed feature selection based on correlation strength, but since PCA already handles correlated features by finding principal directions, explicit feature dropping wasn't necessary at this stage. Overall, the preprocessing and modeling approach built a solid foundation for successful unsupervised learning on the wine dataset.
4. Evaluation
from sklearn.datasets import load_wine
import numpy as np
# Load sklearn version
wine_sklearn = load_wine()
df_sklearn = pd.DataFrame(wine_sklearn.data, columns=wine_sklearn.feature_names)
# Assuming your dataset is in wine_df
# Step 1: Check if shape matches
print("Shape match:", df_sklearn.shape == wine_df.shape)
# Step 2: Check if values match
print("Data match:", np.allclose(df_sklearn.values, wine_df.values, atol=1e-6))
Shape match: True
Data match: True
# Load the original dataset from sklearn
wine_data = load_wine()
true_labels = wine_data.target # 0, 1, 2
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score
# Assume 'labels' is from your KMeans model
print("ARI:", adjusted_rand_score(true_labels, labels))
print("NMI:", normalized_mutual_info_score(true_labels, labels))
ARI: 0.8974949815093207
NMI: 0.8758935341223069
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from scipy.optimize import linear_sum_assignment # Import for Hungarian algorithm
import numpy as np
# Calculate the original confusion matrix
cm = confusion_matrix(true_labels, labels)
# Use the Hungarian algorithm to find the optimal label permutation
# We negate the matrix because linear_sum_assignment finds the minimum cost assignment
row_ind, col_ind = linear_sum_assignment(-cm)
# Create a mapping from old cluster labels to new ones based on the assignment
new_labels = np.zeros_like(labels)
for i, j in zip(row_ind, col_ind):
new_labels[labels == j] = i # Assign the true class index 'i' to samples clustered as 'j'
# Calculate the adjusted confusion matrix
adjusted_cm = confusion_matrix(true_labels, new_labels)
# Plot the adjusted confusion matrix
plt.figure(figsize=(6, 5))
sns.heatmap(adjusted_cm, annot=True, cmap="Blues", fmt='d')
plt.title("Confusion Matrix")
plt.xlabel("Mapped Cluster Label")
plt.ylabel("True Wine Class")
plt.show()
def evaluate_clustering(model_name, labels, features_scaled, true_labels):
"""
Evaluate a clustering model using silhouette score, ARI, NMI, and a confusion matrix.
Parameters:
- model_name: str, name of your model (for labeling plots and printouts)
- labels: array-like, predicted cluster labels from KMeans
- features_scaled: array-like, standardized feature matrix used in clustering
- true_labels: array-like, ground truth class labels (from sklearn or matching dataset)
"""
# Compute metrics
silhouette = silhouette_score(features_scaled, labels)
ari = adjusted_rand_score(true_labels, labels)
nmi = normalized_mutual_info_score(true_labels, labels)
print(f"--- Evaluation: {model_name} ---")
print(f"Silhouette Score: {silhouette:.3f}")
print(f"Adjusted Rand Index (ARI): {ari:.3f}")
print(f"Normalized Mutual Info (NMI): {nmi:.3f}")
# Confusion matrix - Apply Hungarian algorithm for optimal mapping
cm_orig = confusion_matrix(true_labels, labels)
row_ind, col_ind = linear_sum_assignment(-cm_orig) # Find optimal assignment
# Create mapped labels
mapped_labels = np.zeros_like(labels)
for i, j in zip(row_ind, col_ind):
mapped_labels[labels == j] = i
# Calculate adjusted confusion matrix
cm_adjusted = confusion_matrix(true_labels, mapped_labels)
plt.figure(figsize=(6, 5))
sns.heatmap(cm_adjusted, annot=True, fmt='d', cmap='Blues') # Plot adjusted matrix
plt.title(f"Confusion Matrix: {model_name}")
plt.xlabel("Mapped Cluster Label")
plt.ylabel("True Class")
plt.tight_layout()
plt.show()
evaluate_clustering(
model_name="KMeans - All Features",
labels=labels, # from kmeans.fit_predict()
features_scaled=wine_scaled, # the standardized data you clustered
true_labels=true_labels # from sklearn.datasets.load_wine().target
)
--- Evaluation: KMeans - All Features ---
Silhouette Score: 0.285
Adjusted Rand Index (ARI): 0.897
Normalized Mutual Info (NMI): 0.876
Evaluation Results
Our KMeans clustering model with k=3 achieved mixed results when compared to the true wine classes. The silhouette score of approximately 0.3 indicates moderate cluster cohesion and separation, suggesting that our clusters have some structure but also some overlap.
When comparing our clustering results to the true labels, we observed:
Cluster-Class Alignment: The confusion matrix shows that each cluster corresponds predominantly to a single wine class, with only a handful of misassigned samples.
Accuracy Metrics: The Adjusted Rand Index (ARI ~0.90) and Normalized Mutual Information (NMI ~0.88) demonstrate that our unsupervised model captured meaningful patterns that align closely with the true cultivar classifications, despite never having access to those labels during training.
Feature Effectiveness: The successful clustering validates our feature selection approach. The chemical attributes we selected for analysis do indeed contain information that differentiates between wine types, confirming that unsupervised techniques can detect natural groupings in wine chemical compositions.
Limitations: Despite the overall positive results, there are misclassifications visible in the confusion matrix. This indicates some chemical profiles overlap between varieties, making perfect separation challenging without additional features or more sophisticated clustering techniques.
These results suggest that chemical attributes alone can provide a reasonable basis for wine classification, supporting potential applications in quality control or automated wine categorization systems.
5. References
GitHub Repository: https://github.com/zenatron/itcs-3162-final