Problem Description¶
Background¶
The Iris flower dataset is a classic multivariate dataset introduced by Sir Ronald Fisher in 1936. It consists of 150 samples from three species of Iris flowers (Iris setosa, Iris versicolor, and Iris virginica) with four features measured from each sample: sepal length, sepal width, petal length, and petal width.
This is perhaps the best-known dataset in the pattern recognition literature; Fisher's paper is a classic in the field and is referenced frequently to this day. One class is linearly separable from the other two, but the latter two are not linearly separable from each other, which is what makes the problem interesting.
Objectives¶
This project aims to:
- Apply and compare two fundamental clustering algorithms: K-Means and Agglomerative Hierarchical Clustering
- Evaluate clustering performance using both internal (silhouette score) and external (accuracy, ARI) metrics
- Determine optimal number of clusters using multiple validation techniques
- Compare different linkage methods and metric parameters for Agglomerative Clustering
Import Necessary Libraries¶
from itertools import permutations
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_iris
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score, adjusted_rand_score, accuracy_score, confusion_matrix
import scipy.cluster.hierarchy as sch
from scipy import stats
%matplotlib inline
Load and Explore the Dataset¶
# Load the Iris dataset
iris = load_iris()
X = iris.data
target = iris.target
species_names = iris.target_names
# Create a DataFrame for better visualization
df = pd.DataFrame(X, columns=iris.feature_names)
df['species'] = species_names[target]
df['true_labels'] = target
print("Dataset Overview:")
print("Features:", iris.feature_names)
print("Species:", species_names.tolist())
print("\nDataframe structure:")
df.info()
df.species.value_counts().plot.pie(autopct='%.1f%%', ylabel='')
df.head()
Dataset Overview:
Features: ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
Species: ['setosa', 'versicolor', 'virginica']

Dataframe structure:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype
---  ------             --------------  -----
 0   sepal length (cm)  150 non-null    float64
 1   sepal width (cm)   150 non-null    float64
 2   petal length (cm)  150 non-null    float64
 3   petal width (cm)   150 non-null    float64
 4   species            150 non-null    object
 5   true_labels        150 non-null    int64
dtypes: float64(4), int64(1), object(1)
memory usage: 7.2+ KB
|   | sepal length (cm) | sepal width (cm) | petal length (cm) | petal width (cm) | species | true_labels |
|---|---|---|---|---|---|---|
| 0 | 5.1 | 3.5 | 1.4 | 0.2 | setosa | 0 |
| 1 | 4.9 | 3.0 | 1.4 | 0.2 | setosa | 0 |
| 2 | 4.7 | 3.2 | 1.3 | 0.2 | setosa | 0 |
| 3 | 4.6 | 3.1 | 1.5 | 0.2 | setosa | 0 |
| 4 | 5.0 | 3.6 | 1.4 | 0.2 | setosa | 0 |
EDA (Exploratory Data Analysis)¶
# Basic statistics
df[iris.feature_names].describe().round(2)
|   | sepal length (cm) | sepal width (cm) | petal length (cm) | petal width (cm) |
|---|---|---|---|---|
| count | 150.00 | 150.00 | 150.00 | 150.00 |
| mean | 5.84 | 3.06 | 3.76 | 1.20 |
| std | 0.83 | 0.44 | 1.77 | 0.76 |
| min | 4.30 | 2.00 | 1.00 | 0.10 |
| 25% | 5.10 | 2.80 | 1.60 | 0.30 |
| 50% | 5.80 | 3.00 | 4.35 | 1.30 |
| 75% | 6.40 | 3.30 | 5.10 | 1.80 |
| max | 7.90 | 4.40 | 6.90 | 2.50 |
# Check for missing values
print("MISSING VALUES:")
print("=" * 30)
print(df.isnull().sum())
MISSING VALUES:
==============================
sepal length (cm)    0
sepal width (cm)     0
petal length (cm)    0
petal width (cm)     0
species              0
true_labels          0
dtype: int64
Statistical Analysis by Species¶
# Statistics by species
print("STATISTICS BY SPECIES:")
print("=" * 50)
species_stats = df.groupby('species')[iris.feature_names].agg(['mean', 'std', 'min', 'max']).round(2)
print(species_stats)
STATISTICS BY SPECIES:
==================================================
           sepal length (cm)                 sepal width (cm)
                        mean   std  min  max             mean   std  min  max
species
setosa                  5.01  0.35  4.3  5.8             3.43  0.38  2.3  4.4
versicolor              5.94  0.52  4.9  7.0             2.77  0.31  2.0  3.4
virginica               6.59  0.64  4.9  7.9             2.97  0.32  2.2  3.8

           petal length (cm)                 petal width (cm)
                        mean   std  min  max             mean   std  min  max
species
setosa                  1.46  0.17  1.0  1.9             0.25  0.11  0.1  0.6
versicolor              4.26  0.47  3.0  5.1             1.33  0.20  1.0  1.8
virginica               5.55  0.55  4.5  6.9             2.03  0.27  1.4  2.5
Visual EDA¶
# Pairplot to see relationships between features
sns.pairplot(df, hue='species', diag_kind='hist')
plt.suptitle('Pairplot of Iris Features by Species', y=1.02)
plt.show()
# Correlation matrix
correlation_matrix = df[iris.feature_names].corr()
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0, fmt='.2f')
plt.title('Correlation Matrix of Iris Features')
plt.show()
Observation¶
Petal length and petal width are very highly correlated, meaning they could effectively be treated as a single dimension.
Sepal width is much less correlated with the petal measurements; after the petal dimension, it is likely to be the second most decisive factor in separating the species.
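As a quick numeric check of this observation, one could compute the Pearson correlation between the two petal measurements directly; a minimal sketch using the already-imported scipy stats module:
# Pearson correlation between petal length and petal width (should confirm a very strong linear relationship)
r, p_value = stats.pearsonr(df['petal length (cm)'], df['petal width (cm)'])
print("Pearson r = %.3f (p = %.2g)" % (r, p_value))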
# Boxplots for each feature by species
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
axes = axes.ravel()
for ax, feature in zip(axes, iris.feature_names):
sns.boxplot(x='species', y=feature, data=df, ax=ax)
ax.set_title(feature)
ax.tick_params(axis='x', rotation=45)
plt.tight_layout()
plt.show()
# Distribution plots for each feature
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
axes = axes.ravel()
for ax, feature in zip(axes, iris.feature_names):
for species in species_names:
species_data = df[df['species'] == species][feature]
ax.hist(species_data, alpha=0.6, label=species, bins=15, density=True)
ax.set_title("Distribution of %s" % feature)
ax.set_xlabel(feature)
ax.set_ylabel("Density")
ax.legend()
plt.tight_layout()
plt.show()
Data Preprocessing¶
# Scale the numerical features
# The EDA above shows that the features differ noticeably in their means and standard deviations
# Standardizing them prevents features with larger ranges from dominating the distance calculations
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# No further transformations (e.g. a log transform) are needed, as the feature distributions are roughly bell-shaped
# The number of outliers is also negligible
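As a rough sanity check on these two assumptions (approximate normality and few extreme values), one could run the sketch below using the already-imported scipy stats module; note that each feature is a mixture over three species, so the Shapiro-Wilk test is only a coarse guide here:
# Rough checks behind the preprocessing choices:
# Shapiro-Wilk p-value as a crude normality indicator, and |z| > 3 counts as a crude outlier tally
for feature in iris.feature_names:
    _, p_value = stats.shapiro(df[feature])
    n_outliers = int((np.abs(stats.zscore(df[feature])) > 3).sum())
    print("%-20s Shapiro p = %.3f, |z| > 3 count = %d" % (feature, p_value, n_outliers))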
PCA for Dimensionality Reduction and Visualization¶
# Perform PCA for visualization
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)
print("PCA RESULTS:")
print("=" * 50)
print("Explained variance ratio:", pca.explained_variance_ratio_)
print("Total explained variance: %.3f" % sum(pca.explained_variance_ratio_))
PCA RESULTS:
==================================================
Explained variance ratio: [0.72962445 0.22850762]
Total explained variance: 0.958
Observation¶
Note that only two principal components are enough to explain 95.8% of the variance, which supports the earlier observation about the strong correlations between features.
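To see which original features drive the two components, one can inspect the PCA loadings; a minimal sketch, assuming the pca object fitted above:
# PCA loadings: contribution of each original feature to the two principal components
loadings = pd.DataFrame(pca.components_,
                        columns=iris.feature_names,
                        index=['PC1', 'PC2'])
print(loadings.round(3))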
# Visualize PCA results
scatter = plt.scatter(X_pca[:, 0], X_pca[:, 1], c=target, cmap='viridis', alpha=0.7)
plt.xlabel('First Principal Component')
plt.ylabel('Second Principal Component')
plt.title('PCA: Actual Species')
plt.legend(handles=scatter.legend_elements()[0], labels=species_names.tolist())
plt.show()
Observation¶
Based on this visualisation, the clustering algorithms are expected to recover setosa with almost 100% accuracy while struggling with the versicolor/virginica pair.
Determine Optimal Number of Clusters¶
# Function to find best label permutation for accuracy
def label_permute_compare(y_true, y_pred):
"""
y_true: true labels
y_pred: predicted cluster labels
Returns:
- best_perm: best permutation of labels found
- best_acc: best accuracy achieved
- best_y_pred_mapped: predicted labels mapped using best permutation
"""
unique_labels = np.unique(y_pred)
best_acc = 0.
# Generate all possible permutations of the unique labels
for perm in permutations(unique_labels):
# Create mapping from original cluster labels to permuted labels
y_pred_mapped = [perm[label] for label in y_pred]
accuracy = accuracy_score(y_true, y_pred_mapped)
if accuracy > best_acc:
best_acc = accuracy
best_perm = perm
best_y_pred_mapped = y_pred_mapped
return best_perm, best_acc, best_y_pred_mapped
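Note that checking every permutation scales factorially with the number of clusters, which is fine for the k ≤ 7 used below. For larger k, an equivalent mapping can be computed with the Hungarian algorithm; a minimal sketch (not used in this notebook, and assuming the cluster labels are the integers 0..k-1 as returned by scikit-learn):
from scipy.optimize import linear_sum_assignment

def hungarian_label_map(y_true, y_pred):
    """Map cluster labels to true labels by maximizing the confusion-matrix diagonal."""
    cm = confusion_matrix(y_true, y_pred)
    # linear_sum_assignment minimizes total cost, so negate the counts to maximize matched samples
    row_ind, col_ind = linear_sum_assignment(-cm)
    mapping = {pred: true for true, pred in zip(row_ind, col_ind)}
    return np.array([mapping[label] for label in y_pred])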
# Determine optimal number of clusters using multiple metrics
inertia = []
silhouette_scores_kmeans = []
silhouette_scores_agg = []
accuracy_scores_kmeans = []
accuracy_scores_agg = []
k_range = range(2, 8)
for k in k_range:
# K-Means
kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
kmeans_labels = kmeans.fit_predict(X_scaled)
inertia.append(kmeans.inertia_)
silhouette_scores_kmeans.append(silhouette_score(X_scaled, kmeans_labels))
accuracy_scores_kmeans.append(label_permute_compare(target, kmeans_labels)[1])
# Agglomerative Clustering (default parameters)
agg = AgglomerativeClustering(n_clusters=k)
agg_labels = agg.fit_predict(X_scaled)
silhouette_scores_agg.append(silhouette_score(X_scaled, agg_labels))
accuracy_scores_agg.append(label_permute_compare(target, agg_labels)[1])
# Plot comparison of methods
plt.figure(figsize=(18, 5))
# Elbow Method
plt.subplot(1, 4, 1)
plt.plot(k_range, inertia, 'bo-', label='K-Means')
plt.xlabel('Number of clusters')
plt.ylabel('Inertia')
plt.title('Elbow Method (K-Means Only)')
plt.legend()
# Silhouette Scores Comparison
plt.subplot(1, 4, 2)
plt.plot(k_range, silhouette_scores_kmeans, 'ro-', label='K-Means')
plt.plot(k_range, silhouette_scores_agg, 'go-', label='Agglomerative')
plt.xlabel('Number of clusters')
plt.ylabel('Silhouette Score')
plt.title('Silhouette Scores Comparison')
plt.legend()
# Accuracy Scores Comparison
plt.subplot(1, 4, 3)
plt.plot(k_range, accuracy_scores_kmeans, 'ro-', label='K-Means')
plt.plot(k_range, accuracy_scores_agg, 'go-', label='Agglomerative')
plt.xlabel('Number of clusters')
plt.ylabel('Accuracy Score')
plt.title('Accuracy Scores Comparison')
plt.legend()
# Dendrogram for Agglomerative Clustering
plt.subplot(1, 4, 4)
dendrogram = sch.dendrogram(sch.linkage(X_scaled, method='ward'))
plt.title('Dendrogram for Agglomerative Clustering\n(ward linkage)')
plt.xlabel('Samples')
plt.ylabel('Euclidean Distance')
plt.tight_layout()
plt.show()
Compare Different Linkage Methods and Metric Parameters¶
# Define different linkage methods to test
linkage_methods = ('ward', 'complete', 'average', 'single')
# Test different parameter combinations
agg_results = {
'linkage': [],
'metric': [],
'silhouette_score': [],
'accuracy': [],
'ari': [],
'labels': [],
'mapped_labels': []
}
optimal_clusters = 3
for linkage in linkage_methods:
for metric in ('euclidean', 'manhattan', 'cosine'):
# Skip incompatible combinations (ward linkage supports only the Euclidean metric)
if linkage == 'ward' and metric != 'euclidean':
continue
agg = AgglomerativeClustering(n_clusters=optimal_clusters, linkage=linkage, metric=metric)
agg_labels = agg.fit_predict(X_scaled)
# Calculate metrics
silhouette = silhouette_score(X_scaled, agg_labels)
_, accuracy, mapped_labels = label_permute_compare(target, agg_labels)
ari = adjusted_rand_score(target, agg_labels)
agg_results['linkage'].append(linkage)
agg_results['metric'].append(metric)
agg_results['silhouette_score'].append(silhouette)
agg_results['accuracy'].append(accuracy)
agg_results['ari'].append(ari)
agg_results['labels'].append(agg_labels)
agg_results['mapped_labels'].append(mapped_labels)
# Convert results to DataFrame, sorted by accuracy (descending)
agg_comparison_df = pd.DataFrame(agg_results).sort_values('accuracy', ascending=False)
print("AGGLOMERATIVE CLUSTERING PARAMETER COMPARISON:")
print("=" * 60)
print(agg_comparison_df.iloc[:,:5].round(4))
AGGLOMERATIVE CLUSTERING PARAMETER COMPARISON:
============================================================
    linkage     metric  silhouette_score  accuracy     ari
5   average  manhattan            0.4530    0.8867  0.7184
3  complete     cosine            0.4466    0.8400  0.6335
0      ward  euclidean            0.4467    0.8267  0.6153
2  complete  manhattan            0.4350    0.8200  0.6146
6   average     cosine            0.4302    0.8200  0.6097
1  complete  euclidean            0.4496    0.7867  0.5726
4   average  euclidean            0.4803    0.6867  0.5621
8    single  manhattan            0.4949    0.6800  0.5638
9    single     cosine            0.0956    0.6667  0.5414
7    single  euclidean            0.5046    0.6600  0.5584
Visualize Performance of Different Parameter Combinations¶
# Plot comparison of different parameter combinations
plt.figure(figsize=(20, 5))
# Accuracy comparison
plt.subplot(1, 3, 1)
for linkage in linkage_methods:
linkage_data = agg_comparison_df[agg_comparison_df['linkage'] == linkage]
if not linkage_data.empty:
plt.plot(linkage_data['metric'], linkage_data['accuracy'], 'o-', label=linkage)
plt.xlabel('Metric')
plt.ylabel('Accuracy')
plt.title('Accuracy by Linkage Method and Metric')
plt.legend()
plt.xticks(rotation=45)
# Silhouette score comparison
plt.subplot(1, 3, 2)
for linkage in linkage_methods:
linkage_data = agg_comparison_df[agg_comparison_df['linkage'] == linkage]
if not linkage_data.empty:
plt.plot(linkage_data['metric'], linkage_data['silhouette_score'], 'o-', label=linkage)
plt.xlabel('Metric')
plt.ylabel('Silhouette Score')
plt.title('Silhouette Score by Linkage Method and Metric')
plt.legend()
plt.xticks(rotation=45)
# ARI comparison
plt.subplot(1, 3, 3)
for linkage in linkage_methods:
linkage_data = agg_comparison_df[agg_comparison_df['linkage'] == linkage]
if not linkage_data.empty:
plt.plot(linkage_data['metric'], linkage_data['ari'], 'o-', label=linkage)
plt.xlabel('Metric')
plt.ylabel('Adjusted Rand Index')
plt.title('ARI by Linkage Method and Metric')
plt.legend()
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
Dendrograms for Comparison¶
# Create side-by-side dendrograms for comparison
plt.figure(figsize=(20, 6))
# First dendrogram (ward linkage with Euclidean distance)
plt.subplot(1, 2, 1)
dendrogram1 = sch.dendrogram(sch.linkage(X_scaled, method='ward', metric='euclidean'))
plt.title('Dendrogram: Ward Linkage with Euclidean Metric')
plt.xlabel('Samples')
plt.ylabel('Euclidean Distance')
plt.axhline(y=10, color='r', linestyle='--', label='Cut line for 3 clusters')
plt.legend()
# Second dendrogram (best parameters)
plt.subplot(1, 2, 2)
dendrogram2 = sch.dendrogram(sch.linkage(X_scaled, method="average", metric="cityblock"))
plt.title('Dendrogram: Average Linkage with Manhattan Metric')
plt.xlabel('Samples')
plt.ylabel('Manhattan Distance')
plt.axhline(y=3.7, color='r', linestyle='--', label='Cut line for 3 clusters')
plt.legend()
plt.tight_layout()
plt.show()
Apply Both Clustering Algorithms with Best Parameters¶
# Apply both clustering algorithms with optimal clusters (3)
optimal_clusters = 3
# K-Means
kmeans = KMeans(n_clusters=optimal_clusters, random_state=42, n_init=10)
kmeans_labels = kmeans.fit_predict(X_scaled)
# Agglomerative Clustering with best parameters
best_linkage = "average"
best_metric = "manhattan"
agg_best = AgglomerativeClustering(n_clusters=optimal_clusters,
linkage=best_linkage,
metric=best_metric)
agg_labels_best = agg_best.fit_predict(X_scaled)
# Also keep default agglomerative for comparison
agg_default = AgglomerativeClustering(n_clusters=optimal_clusters)
agg_labels_default = agg_default.fit_predict(X_scaled)
print("Cluster distributions:")
print("K-Means:", np.bincount(kmeans_labels))
print("Agglomerative (default):", np.bincount(agg_labels_default))
print("Agglomerative (best):", np.bincount(agg_labels_best))
Cluster distributions:
K-Means: [53 50 47]
Agglomerative (default): [71 49 30]
Agglomerative (best): [50 35 65]
Map Clusters to True Labels for Accuracy Calculation¶
# Find best label mappings using permutation method
_, kmeans_accuracy, kmeans_mapped = label_permute_compare(target, kmeans_labels)
_, agg_accuracy_default, agg_mapped_default = label_permute_compare(target, agg_labels_default)
_, agg_accuracy_best, agg_mapped_best = label_permute_compare(target, agg_labels_best)
# Add mapped labels to dataframe
print("Accuracy scores:")
print("K-Means = %.1f%%" % (100 * kmeans_accuracy))
print("Agglomerative (default) = %.1f%%" % (100 * agg_accuracy_default))
print("Agglomerative (best) = %.1f%%" % (100 * agg_accuracy_best))
Accuracy scores:
K-Means = 83.3%
Agglomerative (default) = 82.7%
Agglomerative (best) = 88.7%
Calculate Performance Metrics¶
# Calculate performance metrics for all methods
kmeans_silhouette = silhouette_score(X_scaled, kmeans_labels)
agg_silhouette_default = silhouette_score(X_scaled, agg_labels_default)
agg_silhouette_best = silhouette_score(X_scaled, agg_labels_best)
kmeans_ari = adjusted_rand_score(target, kmeans_labels)
agg_ari_default = adjusted_rand_score(target, agg_labels_default)
agg_ari_best = adjusted_rand_score(target, agg_labels_best)
# Create confusion matrices
kmeans_cm = confusion_matrix(target, kmeans_mapped)
agg_cm_default = confusion_matrix(target, agg_mapped_default)
agg_cm_best = confusion_matrix(target, agg_mapped_best)
# Create comparison table
comparison_df = pd.DataFrame({
'Algorithm': ['K-Means', 'Agglomerative (default)', 'Agglomerative (best)'],
'Accuracy': [kmeans_accuracy, agg_accuracy_default, agg_accuracy_best],
'Silhouette Score': [kmeans_silhouette, agg_silhouette_default, agg_silhouette_best],
'Adjusted Rand Index': [kmeans_ari, agg_ari_default, agg_ari_best],
})
print("PERFORMANCE COMPARISON:")
print("=" * 60)
print(comparison_df.round(4))
PERFORMANCE COMPARISON:
============================================================
                 Algorithm  Accuracy  Silhouette Score  Adjusted Rand Index
0                  K-Means    0.8333            0.4599               0.6201
1  Agglomerative (default)    0.8267            0.4467               0.6153
2     Agglomerative (best)    0.8867            0.4530               0.7184
Visualize Confusion Matrices¶
# Plot confusion matrices
plt.figure(figsize=(20, 6))
# K-Means Confusion Matrix
plt.subplot(1, 3, 1)
sns.heatmap(kmeans_cm, annot=True, fmt='d', cmap='Blues',
xticklabels=species_names, yticklabels=species_names)
plt.title('K-Means Confusion Matrix\nAccuracy: %.1f%%' % (100 * kmeans_accuracy))
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
# Agglomerative Confusion Matrix (default)
plt.subplot(1, 3, 2)
sns.heatmap(agg_cm_default, annot=True, fmt='d', cmap='Greens',
xticklabels=species_names, yticklabels=species_names)
plt.title('Agglomerative Confusion Matrix (default)\nAccuracy: %.1f%%' % (100 * agg_accuracy_default))
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
# Agglomerative Confusion Matrix (best)
plt.subplot(1, 3, 3)
sns.heatmap(agg_cm_best, annot=True, fmt='d', cmap='Oranges',
xticklabels=species_names, yticklabels=species_names)
plt.title('Agglomerative Confusion Matrix (best)\nAccuracy: %.1f%%' % (100 * agg_accuracy_best))
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.tight_layout()
plt.show()
Visualize the Results¶
# Create comparison visualization
plt.figure(figsize=(25, 5))
# Actual species
plt.subplot(1, 5, 1)
scatter = plt.scatter(X_pca[:, 0], X_pca[:, 1], c=target, cmap='viridis')
plt.xlabel('First Principal Component')
plt.ylabel('Second Principal Component')
plt.title('Actual Species')
plt.legend(handles=scatter.legend_elements()[0], labels=species_names.tolist())
# K-Means clusters (mapped)
plt.subplot(1, 5, 2)
scatter = plt.scatter(X_pca[:, 0], X_pca[:, 1], c=kmeans_mapped, cmap='viridis')
plt.xlabel('First Principal Component')
plt.ylabel('Second Principal Component')
plt.title('K-Means Clusters (Mapped)\nAccuracy: %.1f%%' % (100 * kmeans_accuracy))
plt.legend(handles=scatter.legend_elements()[0], labels=species_names.tolist())
# Agglomerative clusters default (mapped)
plt.subplot(1, 5, 3)
scatter = plt.scatter(X_pca[:, 0], X_pca[:, 1], c=agg_mapped_default, cmap='viridis')
plt.xlabel('First Principal Component')
plt.ylabel('Second Principal Component')
plt.title('Agglomerative Clusters (Default)\nAccuracy: %.1f%%' % (100 * agg_accuracy_default))
plt.legend(handles=scatter.legend_elements()[0], labels=species_names.tolist())
# Agglomerative clusters best (mapped)
plt.subplot(1, 5, 4)
scatter = plt.scatter(X_pca[:, 0], X_pca[:, 1], c=agg_mapped_best, cmap='viridis')
plt.xlabel('First Principal Component')
plt.ylabel('Second Principal Component')
plt.title('Agglomerative Clusters (best)\nAccuracy: %.1f%%' % (100 * agg_accuracy_best))
plt.legend(handles=scatter.legend_elements()[0], labels=species_names.tolist())
# Error comparison
plt.subplot(1, 5, 5)
kmeans_errors = (target != kmeans_mapped).astype(int)
agg_default_errors = (target != agg_mapped_default).astype(int)
agg_best_errors = (target != agg_mapped_best).astype(int)
plt.scatter(X_pca[:, 0], X_pca[:, 1], c=kmeans_errors + agg_default_errors + agg_best_errors, cmap='Reds')
plt.xlabel('First Principal Component')
plt.ylabel('Second Principal Component')
plt.title('Classification Errors\n(Red intensity = more methods failed)')
plt.tight_layout()
plt.show()
Detailed Error Analysis¶
# Detailed error analysis
print("DETAILED ERROR ANALYSIS:")
print("=" * 60)
for method, mapped_labels, accuracy in (
('K-Means', kmeans_mapped, kmeans_accuracy),
('Agglomerative (default)', agg_mapped_default, agg_accuracy_default),
('Agglomerative (best)', agg_mapped_best, agg_accuracy_best),
):
errors = target != mapped_labels
error_indices = np.where(errors)[0]
print("%s:" % method)
print(" Accuracy: %.1f%%" % (accuracy * 100))
print(" Total errors: %2d/%3d errors (%.1f%%)" % (np.sum(errors), len(errors), np.mean(errors) * 100))
if len(error_indices) > 0:
error_samples = df.iloc[error_indices]
print(" Error breakdown by species:")
for species in species_names:
species_errors = error_samples[error_samples['species'] == species]
total_species = np.sum(target == np.where(species_names == species)[0][0])
if len(species_errors) > 0:
print(" %s: %2d/%2d errors (%4.1f%%)" % (species.ljust(10), len(species_errors), total_species, len(species_errors)/total_species*100))
print()
DETAILED ERROR ANALYSIS:
============================================================
K-Means:
Accuracy: 83.3%
Total errors: 25/150 errors (16.7%)
Error breakdown by species:
versicolor: 11/50 errors (22.0%)
virginica : 14/50 errors (28.0%)
Agglomerative (default):
Accuracy: 82.7%
Total errors: 26/150 errors (17.3%)
Error breakdown by species:
setosa : 1/50 errors ( 2.0%)
versicolor: 23/50 errors (46.0%)
virginica : 2/50 errors ( 4.0%)
Agglomerative (best):
Accuracy: 88.7%
Total errors: 17/150 errors (11.3%)
Error breakdown by species:
versicolor: 1/50 errors ( 2.0%)
virginica : 16/50 errors (32.0%)
Discussion and Interpretation¶
Based on the comprehensive analysis with different linkage methods and metric parameters:
Species-Specific Insights:¶
Iris setosa was classified with near-perfect accuracy across all methods due to its distinct morphological features. The main challenge remains distinguishing between versicolor and virginica species, which are not linearly separable.
Key Findings:¶
- Parameter Sensitivity: Agglomerative Clustering performance varies significantly with different parameter combinations
- Best Parameters: The combination of Average linkage with Manhattan metric achieved the highest accuracy of 88.7%
- Performance Improvement: Optimized parameters improved Agglomerative Clustering accuracy from 82.7% to 88.7%
- Confusion Matrix Balance: K-Means produced a much more balanced confusion matrix, with its errors split almost evenly between versicolor and virginica, whereas both Agglomerative Clustering configurations concentrated their errors on one of the two species
Practical Implications:¶
- Parameter tuning is crucial for Agglomerative Clustering performance
- Different linkage methods capture different cluster structures
- The optimal parameter combination may vary across different datasets
This poses a problem for using Agglomerative Clustering on tasks like this one: when true labels are not available, there is no direct way to check which parameter combination performs best.
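In a fully unsupervised setting, one workaround is to rank parameter combinations by an internal metric such as the silhouette score alone; the sketch below (assuming X_scaled and optimal_clusters from the cells above) illustrates the idea, though a high silhouette score is no guarantee of agreement with the true species.
# Unsupervised parameter selection: rank linkage/metric combinations by silhouette score only
candidates = []
for linkage in ('ward', 'complete', 'average', 'single'):
    for metric in ('euclidean', 'manhattan', 'cosine'):
        if linkage == 'ward' and metric != 'euclidean':
            continue  # ward linkage supports only the Euclidean metric
        labels = AgglomerativeClustering(n_clusters=optimal_clusters,
                                         linkage=linkage, metric=metric).fit_predict(X_scaled)
        candidates.append((silhouette_score(X_scaled, labels), linkage, metric))
best_sil, sil_linkage, sil_metric = max(candidates)
print("Selected without labels: %s linkage, %s metric (silhouette = %.3f)" % (sil_linkage, sil_metric, best_sil))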
Furthermore, the fact that even after parameter tuning the Agglomerative Clustering confusion matrix was still biased (though towards the opposite species) suggests an inherent instability in the method. With a few different data points, the balance in the middle group of the hierarchy could tip again, completely changing the composition of the clusters and potentially harming accuracy considerably.
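One way to probe this instability claim would be a simple perturbation check: add a small amount of noise to the scaled features and measure, via ARI, how much each algorithm's clustering changes. A minimal sketch (not run in this notebook; assumes X_scaled, optimal_clusters, best_linkage and best_metric from the cells above):
# Perturbation stability probe: mean ARI between clusterings of the original and slightly noisy data
rng = np.random.default_rng(42)

def stability_ari(make_model, X, n_trials=20, noise_scale=0.05):
    base_labels = make_model().fit_predict(X)
    scores = []
    for _ in range(n_trials):
        X_noisy = X + rng.normal(scale=noise_scale, size=X.shape)
        scores.append(adjusted_rand_score(base_labels, make_model().fit_predict(X_noisy)))
    return np.mean(scores)

print("K-Means stability ARI:       %.3f" % stability_ari(
    lambda: KMeans(n_clusters=optimal_clusters, random_state=42, n_init=10), X_scaled))
print("Agglomerative stability ARI: %.3f" % stability_ari(
    lambda: AgglomerativeClustering(n_clusters=optimal_clusters, linkage=best_linkage, metric=best_metric), X_scaled))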
Given this analysis, I would suggest that practical applications on tasks very similar to this one use K-Means Clustering rather than Agglomerative Clustering, even though the accuracy values above might suggest otherwise.
Limitations and Future Work:¶
Current Limitations:
- Limited to Iris dataset characteristics
- Did not perform deep parameter optimization on K-Means Clustering
Future Directions:
- Extend analysis to other clustering variants
- Incorporate additional validation metrics
- Apply to larger, more complex botanical datasets