import cv2
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, confusion_matrix, classification_report
from sklearn.utils import shuffle
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import load_model, Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout, BatchNormalization
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint, CSVLogger
from tensorflow.keras.regularizers import l1, l2, l1_l2
import os
from PIL import Image
from tqdm import tqdm
# Set random seeds for reproducibility
np.random.seed(42)
tf.random.set_seed(42)
Problem Description: Histopathologic Cancer Detection
This project addresses the binary classification task of detecting metastatic cancer in small image patches taken from larger digital pathology scans. The dataset consists of 96x96 pixel RGB images extracted from histopathologic scans of lymph node sections. The size is convenient: larger than CIFAR-10, smaller than ImageNet, and trainable on a single GPU.
The challenge originates from the PatchCamelyon (PCam) benchmark dataset, which provides a realistic simulation of the task pathologists face when examining lymph node sections for metastatic cancer. Lymph node involvement is clinically crucial for cancer staging and treatment decisions, as the presence of metastasis significantly impacts patient prognosis and therapeutic approaches.
Data source: Kaggle.
# Configuration
DATA_DIR = '/kaggle/input/histopathologic-cancer-detection' if 'KAGGLE_URL_BASE' in os.environ else 'data_3'
TRAIN_DIR = os.path.join(DATA_DIR, 'train')
TEST_DIR = os.path.join(DATA_DIR, 'test')
IMG_SIZE = 96
BATCH_SIZE = 64
EPOCHS = 30
# Load and explore the data
train_labels = pd.read_csv(os.path.join(DATA_DIR, 'train_labels.csv'))
# Inspect the data
display(train_labels.head())
train_labels.info()
# Add full paths to the dataframe
train_labels['path'] = train_labels['id'].apply(lambda img_id: os.path.join(TRAIN_DIR, f"{img_id}.tif"))
| | id | label |
|---|---|---|
| 0 | f38a6374c348f90b587e046aac6079959adf3835 | 0 |
| 1 | c18f2d887b7ae4f6742ee445113fa1aef383ed77 | 1 |
| 2 | 755db6279dae599ebb4d39a9123cce439965282d | 0 |
| 3 | bc3f0c64fb968ff4a8bd33af6971ecae77c75e08 | 0 |
| 4 | 068aba587a4950175d04c680d38943fd488d6a9d | 0 |
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 220025 entries, 0 to 220024
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype
---  ------  --------------   -----
 0   id      220025 non-null  object
 1   label   220025 non-null  int64
dtypes: int64(1), object(1)
memory usage: 3.4+ MB
Comprehensive Exploratory Data Analysis (EDA)
# Check for missing values
print("Missing values in labels:", train_labels.isnull().sum().sum())
print("Duplicate IDs:", train_labels['id'].duplicated().sum())
Missing values in labels: 0
Duplicate IDs: 0
# Visualize class distribution
plt.figure(figsize=(10, 5))
plt.subplot(1, 2, 1)
sns.countplot(x='label', data=train_labels)
plt.title('Class Distribution')
plt.subplot(1, 2, 2)
plt.pie(train_labels['label'].value_counts(),
labels=['0', '1'],
autopct='%1.1f%%', colors=['lightblue', 'lightcoral'])
plt.title('Class Proportions')
plt.tight_layout()
plt.show()
# Image metadata analysis
def analyze_image_metadata(df, sample_size=1000):
"""Analyze image dimensions, formats, and basic statistics"""
print("Analysis done on a sample of %d images" % sample_size)
# Sample images for analysis
sample_df = df.sample(min(sample_size, len(df)), random_state=42)
dimensions = []
formats = []
mean_intensities = []
std_intensities = []
for _, row in tqdm(sample_df.iterrows(), desc="Analyzing image metadata"):
img = Image.open(row['path'])
img_array = np.array(img)
dimensions.append(img_array.shape)
formats.append(img.format if img.format else 'Unknown')
mean_intensities.append(np.mean(img_array))
std_intensities.append(np.std(img_array))
# Analyze dimensions
unique_dims = set(dimensions)
print("Unique image dimensions:", unique_dims)
print("All images same size:", len(unique_dims) == 1)
# Analyze formats
print("All images have a format of %s:" % formats[0], all(map(lambda x: x == formats[0], formats)))
# Analyze intensity statistics
print("\nIntensity statistics:")
print("Mean intensity: %.2f ± %.2f" % (np.mean(mean_intensities), np.std(mean_intensities)))
print("Std intensity: %.2f ± %.2f" % (np.mean(std_intensities), np.std(std_intensities)))
# Run metadata analysis
analyze_image_metadata(train_labels)
Analysis done on a sample of 1000 images
Analyzing image metadata: 1000it [00:00, 1217.26it/s]
Unique image dimensions: {(96, 96, 3)}
All images same size: True
All images have a format of TIFF: True
Intensity statistics:
Mean intensity: 164.00 ± 38.53
Std intensity: 51.66 ± 13.62
# Data leakage check - ensure no duplicate images between train and test
overlap = set(train_labels['id']).intersection((os.path.splitext(f)[0] for f in os.listdir(TEST_DIR)))
print("Number of overlapping files:", len(overlap))
print("No leakage detected:", len(overlap) == 0)
Number of overlapping files: 0
No leakage detected: True
# Sample some images from each class
def visualize_samples(n_samples=5):
fig, axes = plt.subplots(2, n_samples, figsize=(15, 6))
for label in [0, 1]:
sample_df = train_labels[train_labels['label'] == label].sample(n_samples)
for i, (_, row) in enumerate(sample_df.iterrows()):
img = Image.open(row['path'])
axes[label, i].imshow(img)
axes[label, i].set_title("Label: %s" % label)
axes[label, i].axis('off')
plt.tight_layout()
plt.show()
visualize_samples()
def analyze_pixel_intensities(df, sample_size=1000):
"""Analyze pixel intensity distributions for each class"""
np.random.seed(42)
# Sample images from each class
samples_0 = df[df['label'] == 0].sample(sample_size // 2)
samples_1 = df[df['label'] == 1].sample(sample_size // 2)
pixels_0 = []
pixels_1 = []
# Process negative samples
for _, row in tqdm(samples_0.iterrows(), desc="Processing negative samples"):
img = Image.open(row['path'])
img_array = np.array(img)
pixels_0.extend(img_array.flatten())
# Process positive samples
for _, row in tqdm(samples_1.iterrows(), desc="Processing positive samples"):
img = Image.open(row['path'])
img_array = np.array(img)
pixels_1.extend(img_array.flatten())
return pixels_0, pixels_1
# Analyze pixel intensities
pixels_0, pixels_1 = analyze_pixel_intensities(train_labels, sample_size=1000)
Processing negative samples: 500it [00:01, 489.64it/s]
Processing positive samples: 500it [00:01, 453.75it/s]
# Plot pixel intensity histograms
plt.figure(figsize=(15, 10))
plt.subplot(2, 2, 1)
plt.hist(pixels_0, bins=50, alpha=0.7, label='0', color='blue', density=True)
plt.hist(pixels_1, bins=50, alpha=0.7, label='1', color='red', density=True)
plt.title('Pixel Intensity Distribution (All Channels)')
plt.xlabel('Pixel Intensity')
plt.ylabel('Density')
plt.legend()
plt.subplot(2, 2, 2)
# Sample a smaller subset for individual channels
sample_img = Image.open(train_labels.iloc[0]['path'])
sample_array = np.array(sample_img)
for i, color in enumerate(['Red', 'Green', 'Blue']):
plt.hist(sample_array[:, :, i].flatten(), bins=50, alpha=0.7, label=color, density=True)
plt.title('Channel Intensity Distribution (Single Image)')
plt.xlabel('Pixel Intensity')
plt.ylabel('Density')
plt.legend()
plt.subplot(2, 2, 3)
# Sample a smaller subset for individual channels
sample_img = Image.open(train_labels.iloc[1]['path'])
sample_array = np.array(sample_img)
for i, color in enumerate(['Red', 'Green', 'Blue']):
plt.hist(sample_array[:, :, i].flatten(), bins=50, alpha=0.7, label=color, density=True)
plt.title('Channel Intensity Distribution (Another Image)')
plt.xlabel('Pixel Intensity')
plt.ylabel('Density')
plt.legend()
plt.subplot(2, 2, 4)
# Compare mean intensities
means_0 = []
means_1 = []
for i in tqdm(range(200)): # Smaller sample for performance
neg_img = Image.open(train_labels[train_labels['label'] == 0].iloc[i]['path'])
pos_img = Image.open(train_labels[train_labels['label'] == 1].iloc[i]['path'])
means_0.append(np.mean(np.array(neg_img)))
means_1.append(np.mean(np.array(pos_img)))
plt.hist(means_0, bins=30, alpha=0.7, label='0', color='blue', density=True)
plt.hist(means_1, bins=30, alpha=0.7, label='1', color='red', density=True)
plt.title('Mean Image Intensity Distribution')
plt.xlabel('Mean Pixel Intensity')
plt.ylabel('Density')
plt.legend()
plt.tight_layout()
plt.show()
Data cleaning and processing
The data seems clean. The formats and dimensions are consistent across the dataset, there are no missing values or duplicate IDs, and no images overlap between the train and test sets.
But some processing is still required. The most important steps are:
- Notice that the two classes are not equally represented (roughly 60% negative to 40% positive). Wherever possible, take an equal number of samples from each class so that training produces a balanced model.
- Each pixel value ranges from 0 to 255. For faster and more stable training, rescale to the range 0 to 1.
- Augment the training dataset with rotations, shifts, flips, and zooms. This improves the generalization of the trained model.
- Note that an additional label column with string values was created, as the generator's binary class mode expects string labels.
From the inspection of the images, it can be presumed that the colour information will be important in the analysis. The colours were therefore kept as is.
# Convert labels to strings for the generator
train_labels['label_str'] = train_labels['label'].astype(str)
# Prepare a subset for hyperparameter tuning
tuning_subset = pd.concat([
train_labels[train_labels['label'] == 0].sample(5000, random_state=42),
train_labels[train_labels['label'] == 1].sample(5000, random_state=42)
])
# Split into train and validation
train_df, val_df = train_test_split(
tuning_subset,
test_size=0.2,
stratify=tuning_subset['label'],
random_state=42
)
# Prepare data generators - use class_mode='binary' with string labels
datagen = ImageDataGenerator(
rescale=1./255,
rotation_range=20,
width_shift_range=0.1,
height_shift_range=0.1,
horizontal_flip=True,
vertical_flip=True,
zoom_range=0.1
)
# Create data generators using the path column and string labels
train_generator = datagen.flow_from_dataframe(
dataframe=train_df,
x_col='path',
y_col='label_str',
target_size=(IMG_SIZE, IMG_SIZE),
batch_size=BATCH_SIZE,
class_mode='binary'
)
val_generator = datagen.flow_from_dataframe(
dataframe=val_df,
x_col='path',
y_col='label_str',
target_size=(IMG_SIZE, IMG_SIZE),
batch_size=BATCH_SIZE,
class_mode='binary',
shuffle=False
)
Found 8000 validated image filenames belonging to 2 classes.
Found 2000 validated image filenames belonging to 2 classes.
Model architecture
Based on the complexity of the data, I believe a deep or wide architecture will be more effective than a simpler one. To be sure, however, I created a function flexible enough to test different architectures and regularization methods.
Plan of attack
First of all, three architectures will be tested: simple, deeper, and wider.
The best performing of these three will have its regularization tuned and used for submission.
For architecture selection and regularization tuning, a smaller dataset and number of epochs will be used to make the process faster. Once a combination is chosen, it will be trained on the bigger dataset before submission.
Techniques used to improve the training
So as not to retrain the models each time the notebook is re-run, I used the ModelCheckpoint callback to save the best-performing model to disk; on subsequent runs the saved model is simply reloaded.
This approach had another pleasant effect. Previously, the fluctuations in the validation scores made a fair comparison between the different models impractical. By choosing each model's best-performing epoch instead, a much fairer comparison was possible, and it worked splendidly.
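In a minimal sketch (the file names here are illustrative, and `model` stands for any compiled Keras model such as those built by create_cnn_model below), the idiom looks like this:
# Save the epoch with the best validation AUC; reload it instead of retraining.
checkpoint = ModelCheckpoint("checkpoints/example.keras", monitor='val_auc',
                             mode='max', save_best_only=True)
model.fit(train_generator, validation_data=val_generator, epochs=EPOCHS,
          callbacks=[checkpoint, CSVLogger("histories/example.csv")])
best_model = load_model("checkpoints/example.keras")  # best epoch, not the last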
# Single flexible function to create CNN models with different architectures and regularization
def create_cnn_model(architecture='deeper', regularization=None, reg_strength=.001,
dropout_rate=.5, additional_dropout=False, additional_bn=False):
"""
Create a CNN model with specified architecture and regularization.
Parameters:
- architecture: 'simple', 'deeper', or 'wider'
- regularization: None, 'l1', 'l2', 'l1_l2'
- reg_strength: regularization strength
- dropout_rate: dropout rate for dense layers
- additional_dropout: whether to add dropout after convolutional layers
- additional_bn: whether to add additional batch normalization layers
Returns:
- Compiled Keras model
"""
# Define regularization function
if regularization == 'l1':
reg = l1(reg_strength)
elif regularization == 'l2':
reg = l2(reg_strength)
elif regularization == 'l1_l2':
reg = l1_l2(l1=reg_strength/2, l2=reg_strength/2)
else: # 'none'
reg = None
# Define architecture parameters
if architecture == 'simple':
conv_layers = [
(32, 3), # (filters, kernel_size)
(64, 3)
]
dense_units = 64
elif architecture == 'deeper':
conv_layers = [
(32, 3),
(64, 3),
(128, 3)
]
dense_units = 128
elif architecture == 'wider':
conv_layers = [
(64, 3),
(128, 3),
(256, 3)
]
dense_units = 256
else:
raise ValueError("Architecture must be 'simple', 'deeper', or 'wider'")
# Build the model
model = Sequential()
# Input layer
model.add(Conv2D(conv_layers[0][0], (conv_layers[0][1], conv_layers[0][1]),
activation='relu', input_shape=(IMG_SIZE, IMG_SIZE, 3),
kernel_regularizer=reg))
model.add(BatchNormalization())
model.add(MaxPooling2D((2, 2)))
if additional_dropout:
model.add(Dropout(0.2))
if additional_bn:
model.add(BatchNormalization())
# Additional convolutional layers
for filters, kernel_size in conv_layers[1:]:
model.add(Conv2D(filters, (kernel_size, kernel_size),
activation='relu', kernel_regularizer=reg))
model.add(BatchNormalization())
model.add(MaxPooling2D((2, 2)))
if additional_dropout:
model.add(Dropout(0.2))
if additional_bn:
model.add(BatchNormalization())
# Dense layers
model.add(Flatten())
model.add(Dense(dense_units, activation='relu', kernel_regularizer=reg))
if additional_bn:
model.add(BatchNormalization())
model.add(Dropout(dropout_rate))
model.add(Dense(1, activation='sigmoid'))
# Compile the model
model.compile(optimizer='adam',
loss='binary_crossentropy',
metrics=['accuracy', tf.keras.metrics.AUC(name='auc')])
return model
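As a quick sanity check, a single configuration can be instantiated and inspected like this (a hypothetical combination, not necessarily one of the tuned models below):
# Example: instantiate one configuration and inspect its layer stack
example_model = create_cnn_model(architecture='deeper', regularization='l2',
                                 reg_strength=0.001, additional_dropout=True)
example_model.summary()  # prints the layers and parameter counts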
# Test different model architectures
arch_models = [
{
"name": "Simple CNN",
"params": { 'architecture': 'simple', },
},
{
"name": "Deeper CNN",
"params": { 'architecture': 'deeper', },
},
{
"name": "Wider CNN",
"params": { 'architecture': 'wider', },
},
]
# Create directories for checkpoints and histories
os.makedirs("checkpoints", exist_ok=True)
os.makedirs("histories", exist_ok=True)
for model in arch_models:
model["checkpoint"] = os.path.join("checkpoints", "%s.keras" % model["name"])
model["history_file"] = os.path.join("histories", "%s.csv" % model["name"])
# Load and evaluate each model
for model in arch_models:
model["instance"] = load_model(model["checkpoint"])
# Evaluate the model
val_generator.reset()
val_preds = model["instance"].predict(val_generator)
model["auc"] = roc_auc_score(val_generator.classes, val_preds)
model["history"] = pd.read_csv(model["history_file"], sep=',', engine='python')
print("%s Validation AUC: %.4f" % (model["name"], model["auc"]))
arch_models_df = pd.DataFrame(arch_models)
Simple CNN Validation AUC: 0.9180
Deeper CNN Validation AUC: 0.9331
Wider CNN Validation AUC: 0.9239
# Compare model performance
plt.figure(figsize=(10, 6))
bars = plt.bar(arch_models_df["name"], arch_models_df["auc"], color=['skyblue', 'lightgreen', 'lightcoral'])
plt.title('Model Comparison - Validation AUC Scores')
plt.ylabel('AUC Score')
plt.ylim(.7, 1.0)
# Add value labels on bars
for bar, score in zip(bars, arch_models_df["auc"]):
plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + .005,
'%.4f' % score, ha='center', va='bottom')
plt.tight_layout()
plt.show()
# Plot training history for the best model
arch_best_model = arch_models[arch_models_df.auc.idxmax()]
plt.figure(figsize=(15, 5))
plt.subplot(1, 3, 1)
plt.plot(arch_best_model['history']['accuracy'], label='Training Accuracy')
plt.plot(arch_best_model['history']['val_accuracy'], label='Validation Accuracy')
plt.title('%s - Accuracy' % arch_best_model["name"])
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()
plt.subplot(1, 3, 2)
plt.plot(arch_best_model['history']['loss'], label='Training Loss')
plt.plot(arch_best_model['history']['val_loss'], label='Validation Loss')
plt.title('%s - Loss' % arch_best_model["name"])
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.subplot(1, 3, 3)
plt.plot(arch_best_model['history']['auc'], label='Training AUC')
plt.plot(arch_best_model['history']['val_auc'], label='Validation AUC')
plt.title('%s - AUC' % arch_best_model["name"])
plt.xlabel('Epoch')
plt.ylabel('AUC')
plt.legend()
plt.tight_layout()
plt.show()
# Test different regularization methods on the best architecture
best_architecture = arch_best_model["params"]["architecture"]
reg_models = [
{
"name": "Original (No Reg)",
"params": {'architecture': best_architecture, 'regularization': None}
},
{
"name": "L2 Regularization",
"params": {'architecture': best_architecture, 'regularization': 'l2', 'reg_strength': 0.001}
},
{
"name": "L1 Regularization",
"params": {'architecture': best_architecture, 'regularization': 'l1', 'reg_strength': 0.001}
},
{
"name": "Elastic Net (L1+L2)",
"params": {'architecture': best_architecture, 'regularization': 'l1_l2', 'reg_strength': 0.001}
},
{
"name": "Increased Dropout",
"params": {'architecture': best_architecture, 'regularization': None,
'dropout_rate': 0.6, 'additional_dropout': True}
},
{
"name": "Additional BatchNorm",
"params": {'architecture': best_architecture, 'regularization': None,
'additional_bn': True}
}
]
# Create checkpoints and history files for each regularization model
for model in reg_models:
model["checkpoint"] = os.path.join("checkpoints", "%s %s.keras" % (best_architecture, model["name"]))
model["history_file"] = os.path.join("histories", "%s %s.csv" % (best_architecture, model["name"]))
# Load and evaluate each regularization model
for model in reg_models:
model["instance"] = load_model(model["checkpoint"])
# Evaluate the model
val_generator.reset()
val_preds = model["instance"].predict(val_generator)
model["auc"] = roc_auc_score(val_generator.classes, val_preds)
model["history"] = pd.read_csv(model["history_file"], sep=',', engine='python')
print("%s Validation AUC: %.4f" % (model["name"], model["auc"]))
reg_models_df = pd.DataFrame(reg_models)
Original (No Reg) Validation AUC: 0.9258
L2 Regularization Validation AUC: 0.9247
L1 Regularization Validation AUC: 0.9083
Elastic Net (L1+L2) Validation AUC: 0.9103
Increased Dropout Validation AUC: 0.9122
Additional BatchNorm Validation AUC: 0.9327
# Compare regularization methods
plt.figure(figsize=(12, 6))
bars = plt.bar(reg_models_df["name"], reg_models_df["auc"],
color=['skyblue', 'lightgreen', 'lightcoral', 'gold', 'lightpink', 'lightseagreen'])
plt.title('Regularization Methods Comparison - Validation AUC Scores')
plt.ylabel('AUC Score')
plt.xticks(rotation=45, ha='right')
plt.ylim(0.8, 1.0)
# Add value labels on bars
for bar, score in zip(bars, reg_models_df["auc"]):
plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.005,
'%.4f' % score, ha='center', va='bottom')
plt.tight_layout()
plt.show()
# Find the best regularization method
best_reg_model = reg_models[reg_models_df.auc.idxmax()]
best_reg_method = best_reg_model["name"]
print("\nBest regularization method: %s with AUC: %.4f" % (best_reg_method, best_reg_model['auc']))
Best regularization method: Additional BatchNorm with AUC: 0.9327
# Plot training curves for the best regularization method
plt.figure(figsize=(15, 5))
plt.subplot(1, 3, 1)
plt.plot(best_reg_model['history']['accuracy'], label='Training Accuracy')
plt.plot(best_reg_model['history']['val_accuracy'], label='Validation Accuracy')
plt.title('%s - Accuracy' % best_reg_model["name"])
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()
plt.subplot(1, 3, 2)
plt.plot(best_reg_model['history']['loss'], label='Training Loss')
plt.plot(best_reg_model['history']['val_loss'], label='Validation Loss')
plt.title('%s - Loss' % best_reg_model["name"])
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.subplot(1, 3, 3)
plt.plot(best_reg_model['history']['auc'], label='Training AUC')
plt.plot(best_reg_model['history']['val_auc'], label='Validation AUC')
plt.title('%s - AUC' % best_reg_model["name"])
plt.xlabel('Epoch')
plt.ylabel('AUC')
plt.legend()
plt.tight_layout()
plt.show()
# Train the best regularized model on the full dataset
print("\nTraining the best regularized model (%s) on the full dataset..." % best_reg_method)
# Prepare full data generators
full_datagen = ImageDataGenerator(
rescale=1./255,
validation_split=0.2,
rotation_range=20,
width_shift_range=0.1,
height_shift_range=0.1,
horizontal_flip=True,
vertical_flip=True,
zoom_range=0.1
)
full_train_generator = full_datagen.flow_from_dataframe(
dataframe=train_labels,
x_col='path',
y_col='label_str',
target_size=(IMG_SIZE, IMG_SIZE),
batch_size=BATCH_SIZE,
class_mode='binary',
subset='training'
)
full_val_generator = full_datagen.flow_from_dataframe(
dataframe=train_labels,
x_col='path',
y_col='label_str',
target_size=(IMG_SIZE, IMG_SIZE),
batch_size=BATCH_SIZE,
class_mode='binary',
subset='validation',
shuffle=False
)
# Add callbacks for final training
final_checkpoint = ModelCheckpoint("final_model.keras", save_best_only=True, monitor='val_auc', mode='max')
final_csv_logger = CSVLogger("final_training_history.csv")
Training the best regularized model (Additional BatchNorm) on the full dataset...
Found 176020 validated image filenames belonging to 2 classes.
Found 44005 validated image filenames belonging to 2 classes.
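The fit call itself is not shown; a plausible reconstruction, again skipping training when final_model.keras already exists on disk, would be:
# Rebuild the winning configuration and train it on the full-dataset generators.
if not os.path.exists("final_model.keras"):
    final_model = create_cnn_model(**best_reg_model["params"])
    final_model.fit(
        full_train_generator,
        validation_data=full_val_generator,
        epochs=EPOCHS,
        callbacks=[final_checkpoint, final_csv_logger],
    )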
# Load the best model from checkpoint
final_model = load_model("final_model.keras")
final_history = pd.read_csv("final_training_history.csv", sep=',', engine='python')
# Evaluate the final model
final_val_preds = final_model.predict(full_val_generator)
final_val_labels = full_val_generator.classes
final_val_pred_classes = (final_val_preds > 0.5).astype(int).flatten()
final_val_auc = roc_auc_score(final_val_labels, final_val_preds)
print("Final Model Validation AUC: %.4f" % final_val_auc)
print("\nClassification Report:")
print(classification_report(final_val_labels, final_val_pred_classes))
Final Model Validation AUC: 0.9878

Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.97      0.96     26233
           1       0.95      0.93      0.94     17772

    accuracy                           0.95     44005
   macro avg       0.95      0.95      0.95     44005
weighted avg       0.95      0.95      0.95     44005
# Plot training curves for the best regularization method
plt.figure(figsize=(15, 5))
plt.subplot(1, 3, 1)
plt.plot(final_history['accuracy'], label='Training Accuracy')
plt.plot(final_history['val_accuracy'], label='Validation Accuracy')
plt.title('Final model - Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()
plt.subplot(1, 3, 2)
plt.plot(final_history['loss'], label='Training Loss')
plt.plot(final_history['val_loss'], label='Validation Loss')
plt.title('Final model - Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.subplot(1, 3, 3)
plt.plot(final_history['auc'], label='Training AUC')
plt.plot(final_history['val_auc'], label='Validation AUC')
plt.title('Final model - AUC')
plt.xlabel('Epoch')
plt.ylabel('AUC')
plt.legend()
plt.tight_layout()
plt.show()
# Confusion matrix
cm = confusion_matrix(final_val_labels, final_val_pred_classes)
plt.figure(figsize=(6, 5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix - Final Model')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.show()
# Prepare test data for prediction
test_files = os.listdir(TEST_DIR)
test_ids = [os.path.splitext(f)[0] for f in test_files]
test_paths = [os.path.join(TEST_DIR, f) for f in test_files]
test_df = pd.DataFrame({
'id': test_ids,
'path': test_paths
})
test_datagen = ImageDataGenerator(rescale=1./255)
test_generator = test_datagen.flow_from_dataframe(
dataframe=test_df,
x_col='path',
y_col=None,
target_size=(IMG_SIZE, IMG_SIZE),
batch_size=BATCH_SIZE,
class_mode=None,
shuffle=False
)
Found 57458 validated image filenames.
# Make predictions on test set
test_preds = final_model.predict(test_generator)
898/898 ━━━━━━━━━━━━━━━━━━━━ 42s 46ms/step
# Create submission file
submission = pd.DataFrame({
'id': test_ids,
'label': test_preds.flatten()
})
submission.to_csv('submission.csv', index=False)
print("Submission file created successfully!")
Submission file created successfully!
Conclusion
From the hyperparameter tuning figures, additional batch normalization improved the validation performance, but only by about 1%. Overall, almost all of the regularization settings produced very similar results. The same goes for the architecture choice, where the deeper model won only by a tiny margin.
What helped the most in raising the performance was ModelCheckpoint, as it smoothed out the problem of fluctuating validation scores, especially during model selection.
The large fluctuations in the validation scores nevertheless remain a concern. It is true that for our purposes we can pick the epoch that produced the best performance, but there is no guarantee that the epoch which scored the best AUC on our validation set would also score well on Kaggle's test set.
On inspecting the training logs for the larger dataset, the fluctuations are much smaller there, especially as the number of epochs increases. Could the fluctuation have been caused by the limited dataset size and the small number of epochs used during model selection and hyperparameter tuning?
A further improvement would be to investigate that issue. Another avenue of research would be whether more aggressive regularization also helps smooth out the fluctuations.