import cv2
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, confusion_matrix, classification_report
from sklearn.utils import shuffle
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import load_model, Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout, BatchNormalization
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint, CSVLogger
from tensorflow.keras.regularizers import l1, l2, l1_l2
import os
from PIL import Image
from tqdm import tqdm
# Set random seeds for reproducibility
np.random.seed(42)
tf.random.set_seed(42)
Problem Description: Histopathologic Cancer Detection
This project addresses the binary classification task of detecting metastatic cancer in small image patches taken from larger digital pathology scans. The dataset consists of 96x96 pixel RGB images extracted from histopathologic scans of lymph node sections. The size is convenient: larger than CIFAR-10, smaller than ImageNet, and trainable on a single GPU.
The challenge originates from the PatchCamelyon (PCam) benchmark dataset, which provides a realistic simulation of the task pathologists face when examining lymph node sections for metastatic cancer. Lymph node involvement is clinically crucial for cancer staging and treatment decisions, as the presence of metastasis significantly impacts patient prognosis and therapeutic approaches.
Data source: Kaggle.
# Configuration
DATA_DIR = '/kaggle/input/histopathologic-cancer-detection' if 'KAGGLE_URL_BASE' in os.environ else 'data_3'
TRAIN_DIR = os.path.join(DATA_DIR, 'train')
TEST_DIR = os.path.join(DATA_DIR, 'test')
IMG_SIZE = 96
BATCH_SIZE = 64
EPOCHS = 30
# Load and explore the data
train_labels = pd.read_csv(os.path.join(DATA_DIR, 'train_labels.csv'))
# Inspect the data
display(train_labels.head())
train_labels.info()
# Add full paths to the dataframe
train_labels['path'] = train_labels['id'].apply(lambda img_id: os.path.join(TRAIN_DIR, f"{img_id}.tif"))
| | id | label |
|---|---|---|
| 0 | f38a6374c348f90b587e046aac6079959adf3835 | 0 |
| 1 | c18f2d887b7ae4f6742ee445113fa1aef383ed77 | 1 |
| 2 | 755db6279dae599ebb4d39a9123cce439965282d | 0 |
| 3 | bc3f0c64fb968ff4a8bd33af6971ecae77c75e08 | 0 |
| 4 | 068aba587a4950175d04c680d38943fd488d6a9d | 0 |
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 220025 entries, 0 to 220024
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype
---  ------  --------------   -----
 0   id      220025 non-null  object
 1   label   220025 non-null  int64
dtypes: int64(1), object(1)
memory usage: 3.4+ MB
Comprehensive Exploratory Data Analysis (EDA)
# Check for missing values
print("Missing values in labels:", train_labels.isnull().sum().sum())
print("Duplicate IDs:", train_labels['id'].duplicated().sum())
Missing values in labels: 0
Duplicate IDs: 0
# Visualize class distribution
plt.figure(figsize=(10, 5))
plt.subplot(1, 2, 1)
sns.countplot(x='label', data=train_labels)
plt.title('Class Distribution')
plt.subplot(1, 2, 2)
plt.pie(train_labels['label'].value_counts(),
labels=['0', '1'],
autopct='%1.1f%%', colors=['lightblue', 'lightcoral'])
plt.title('Class Proportions')
plt.tight_layout()
plt.show()
# Image metadata analysis
def analyze_image_metadata(df, sample_size=1000):
"""Analyze image dimensions, formats, and basic statistics"""
print("Analysis done on a sample of %d images" % sample_size)
# Sample images for analysis
sample_df = df.sample(min(sample_size, len(df)), random_state=42)
dimensions = []
formats = []
mean_intensities = []
std_intensities = []
for _, row in tqdm(sample_df.iterrows(), desc="Analyzing image metadata"):
img = Image.open(row['path'])
img_array = np.array(img)
dimensions.append(img_array.shape)
formats.append(img.format if img.format else 'Unknown')
mean_intensities.append(np.mean(img_array))
std_intensities.append(np.std(img_array))
# Analyze dimensions
unique_dims = set(dimensions)
print("Unique image dimensions:", unique_dims)
print("All images same size:", len(unique_dims) == 1)
# Analyze formats
print("All images have a format of %s:" % formats[0], all(map(lambda x: x == formats[0], formats)))
# Analyze intensity statistics
print("\nIntensity statistics:")
print("Mean intensity: %.2f ± %.2f" % (np.mean(mean_intensities), np.std(mean_intensities)))
print("Std intensity: %.2f ± %.2f" % (np.mean(std_intensities), np.std(std_intensities)))
# Run metadata analysis
analyze_image_metadata(train_labels)
Analysis done on a sample of 1000 images
Analyzing image metadata: 1000it [00:00, 1217.26it/s]
Unique image dimensions: {(96, 96, 3)}
All images same size: True
All images have a format of TIFF: True
Intensity statistics:
Mean intensity: 164.00 ± 38.53
Std intensity: 51.66 ± 13.62
# Data leakage check - ensure no duplicate images between train and test
overlap = set(train_labels['id']).intersection((os.path.splitext(f)[0] for f in os.listdir(TEST_DIR)))
print("Number of overlapping files:", len(overlap))
print("No leakage detected:", len(overlap) == 0)
Number of overlapping files: 0
No leakage detected: True
# Sample some images from each class
def visualize_samples(n_samples=5):
fig, axes = plt.subplots(2, n_samples, figsize=(15, 6))
for label in [0, 1]:
sample_df = train_labels[train_labels['label'] == label].sample(n_samples)
for i, (_, row) in enumerate(sample_df.iterrows()):
img = Image.open(row['path'])
axes[label, i].imshow(img)
axes[label, i].set_title("Label: %s" % label)
axes[label, i].axis('off')
plt.tight_layout()
plt.show()
visualize_samples()
def analyze_pixel_intensities(df, sample_size=1000):
"""Analyze pixel intensity distributions for each class"""
np.random.seed(42)
# Sample images from each class
samples_0 = df[df['label'] == 0].sample(sample_size // 2)
samples_1 = df[df['label'] == 1].sample(sample_size // 2)
pixels_0 = []
pixels_1 = []
# Process negative samples
for _, row in tqdm(samples_0.iterrows(), desc="Processing negative samples"):
img = Image.open(row['path'])
img_array = np.array(img)
pixels_0.extend(img_array.flatten())
# Process positive samples
for _, row in tqdm(samples_1.iterrows(), desc="Processing positive samples"):
img = Image.open(row['path'])
img_array = np.array(img)
pixels_1.extend(img_array.flatten())
return pixels_0, pixels_1
# Analyze pixel intensities
pixels_0, pixels_1 = analyze_pixel_intensities(train_labels, sample_size=1000)
Processing negative samples: 500it [00:01, 489.64it/s]
Processing positive samples: 500it [00:01, 453.75it/s]
# Plot pixel intensity histograms
plt.figure(figsize=(15, 10))
plt.subplot(2, 2, 1)
plt.hist(pixels_0, bins=50, alpha=0.7, label='0', color='blue', density=True)
plt.hist(pixels_1, bins=50, alpha=0.7, label='1', color='red', density=True)
plt.title('Pixel Intensity Distribution (All Channels)')
plt.xlabel('Pixel Intensity')
plt.ylabel('Density')
plt.legend()
plt.subplot(2, 2, 2)
# Sample a smaller subset for individual channels
sample_img = Image.open(train_labels.iloc[0]['path'])
sample_array = np.array(sample_img)
for i, color in enumerate(['Red', 'Green', 'Blue']):
plt.hist(sample_array[:, :, i].flatten(), bins=50, alpha=0.7, label=color, density=True)
plt.title('Channel Intensity Distribution (Single Image)')
plt.xlabel('Pixel Intensity')
plt.ylabel('Density')
plt.legend()
plt.subplot(2, 2, 3)
# Sample a smaller subset for individual channels
sample_img = Image.open(train_labels.iloc[1]['path'])
sample_array = np.array(sample_img)
for i, color in enumerate(['Red', 'Green', 'Blue']):
plt.hist(sample_array[:, :, i].flatten(), bins=50, alpha=0.7, label=color, density=True)
plt.title('Channel Intensity Distribution (Another Image)')
plt.xlabel('Pixel Intensity')
plt.ylabel('Density')
plt.legend()
plt.subplot(2, 2, 4)
# Compare mean intensities
means_0 = []
means_1 = []
for i in tqdm(range(200)): # Smaller sample for performance
neg_img = Image.open(train_labels[train_labels['label'] == 0].iloc[i]['path'])
pos_img = Image.open(train_labels[train_labels['label'] == 1].iloc[i]['path'])
means_0.append(np.mean(np.array(neg_img)))
means_1.append(np.mean(np.array(pos_img)))
plt.hist(means_0, bins=30, alpha=0.7, label='0', color='blue', density=True)
plt.hist(means_1, bins=30, alpha=0.7, label='1', color='red', density=True)
plt.title('Mean Image Intensity Distribution')
plt.xlabel('Mean Pixel Intensity')
plt.ylabel('Density')
plt.legend()
plt.tight_layout()
plt.show()
Data cleaning and processing
The data seems clean. The formats and dimensions are consistent across the dataset, there are no missing values or duplicate IDs, and no images overlap between the train and test sets.
But some processing is still required. The most important steps are:
- Notice that the two classes are not equally represented (roughly 60% negative to 40% positive). Wherever possible, take an equal number of samples from each class so that training produces a balanced model.
- Each pixel value ranges from 0 to 255. For faster and more stable training, rescale to the range 0 to 1.
- Augment the training dataset with rotations, shifts, flips, and zooms. This improves the generalization of the trained model.
- Note that an additional label column with string values was created, as the generator's binary class mode expects string labels.
From the inspection of the images, it can be presumed that the colour information will be important in the analysis. The colours were therefore kept as is.
# Convert labels to strings for the generator
train_labels['label_str'] = train_labels['label'].astype(str)
# Prepare a subset for hyperparameter tuning
tuning_subset = pd.concat([
train_labels[train_labels['label'] == 0].sample(5000, random_state=42),
train_labels[train_labels['label'] == 1].sample(5000, random_state=42)
])
# Split into train and validation
train_df, val_df = train_test_split(
tuning_subset,
test_size=0.2,
stratify=tuning_subset['label'],
random_state=42
)
# Prepare data generators - use class_mode='binary' with string labels
datagen = ImageDataGenerator(
rescale=1./255,
rotation_range=20,
width_shift_range=0.1,
height_shift_range=0.1,
horizontal_flip=True,
vertical_flip=True,
zoom_range=0.1
)
# Create data generators using the path column and string labels
train_generator = datagen.flow_from_dataframe(
dataframe=train_df,
x_col='path',
y_col='label_str',
target_size=(IMG_SIZE, IMG_SIZE),
batch_size=BATCH_SIZE,
class_mode='binary'
)
val_generator = datagen.flow_from_dataframe(
dataframe=val_df,
x_col='path',
y_col='label_str',
target_size=(IMG_SIZE, IMG_SIZE),
batch_size=BATCH_SIZE,
class_mode='binary',
shuffle=False
)
Found 8000 validated image filenames belonging to 2 classes.
Found 2000 validated image filenames belonging to 2 classes.
Model architecture
Based on the complexity of the data, I believe a deep or wide architecture will be more effective than a simpler one. To be sure, however, I created a function flexible enough to test different architectures and regularization methods.
Plan of attack
First of all, three architectures will be tested: simple, deeper, and wider.
The best performing of these three will have its regularization tuned and used for submission.
For architecture selection and regularization tuning, a smaller dataset and number of epochs will be used to make the process faster. Once a combination is chosen, it will be trained on the bigger dataset before submission.
Techniques used to improve the training
So as not to retrain the models each time the notebook is re-run, I used the ModelCheckpoint callback to save the best-performing model to disk; on subsequent runs the saved model is simply reloaded.
This approach had another pleasant effect. Previously, the fluctuations in the validation scores made a fair comparison between the different models impractical. By choosing each model's best-performing epoch instead, a much fairer comparison was possible, and it worked splendidly.
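In a minimal sketch (the file names here are illustrative, and `model` stands for any compiled Keras model such as those built by create_cnn_model below), the idiom looks like this:
# Save the epoch with the best validation AUC; reload it instead of retraining.
checkpoint = ModelCheckpoint("checkpoints/example.keras", monitor='val_auc',
                             mode='max', save_best_only=True)
model.fit(train_generator, validation_data=val_generator, epochs=EPOCHS,
          callbacks=[checkpoint, CSVLogger("histories/example.csv")])
best_model = load_model("checkpoints/example.keras")  # best epoch, not the last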
# Single flexible function to create CNN models with different architectures and regularization
def create_cnn_model(architecture='deeper', regularization=None, reg_strength=.001,
dropout_rate=.5, additional_dropout=False, additional_bn=False):
"""
Create a CNN model with specified architecture and regularization.
Parameters:
- architecture: 'simple', 'deeper', or 'wider'
- regularization: None, 'l1', 'l2', 'l1_l2'
- reg_strength: regularization strength
- dropout_rate: dropout rate for dense layers
- additional_dropout: whether to add dropout after convolutional layers
- additional_bn: whether to add additional batch normalization layers
Returns:
- Compiled Keras model
"""
# Define regularization function
if regularization == 'l1':
reg = l1(reg_strength)
elif regularization == 'l2':
reg = l2(reg_strength)
elif regularization == 'l1_l2':
reg = l1_l2(l1=reg_strength/2, l2=reg_strength/2)
else: # 'none'
reg = None
# Define architecture parameters
if architecture == 'simple':
conv_layers = [
(32, 3), # (filters, kernel_size)
(64, 3)
]
dense_units = 64
elif architecture == 'deeper':
conv_layers = [
(32, 3),
(64, 3),
(128, 3)
]
dense_units = 128
elif architecture == 'wider':
conv_layers = [
(64, 3),
(128, 3),
(256, 3)
]
dense_units = 256
else:
raise ValueError("Architecture must be 'simple', 'deeper', or 'wider'")
# Build the model
model = Sequential()
# Input layer
model.add(Conv2D(conv_layers[0][0], (conv_layers[0][1], conv_layers[0][1]),
activation='relu', input_shape=(IMG_SIZE, IMG_SIZE, 3),
kernel_regularizer=reg))
model.add(BatchNormalization())
model.add(MaxPooling2D((2, 2)))
if additional_dropout:
model.add(Dropout(0.2))
if additional_bn:
model.add(BatchNormalization())
# Additional convolutional layers
for filters, kernel_size in conv_layers[1:]:
model.add(Conv2D(filters, (kernel_size, kernel_size),
activation='relu', kernel_regularizer=reg))
model.add(BatchNormalization())
model.add(MaxPooling2D((2, 2)))
if additional_dropout:
model.add(Dropout(0.2))
if additional_bn:
model.add(BatchNormalization())
# Dense layers
model.add(Flatten())
model.add(Dense(dense_units, activation='relu', kernel_regularizer=reg))
if additional_bn:
model.add(BatchNormalization())
model.add(Dropout(dropout_rate))
model.add(Dense(1, activation='sigmoid'))
# Compile the model
model.compile(optimizer='adam',
loss='binary_crossentropy',
metrics=['accuracy', tf.keras.metrics.AUC(name='auc')])
return model
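As a quick sanity check, a single configuration can be instantiated and inspected like this (a hypothetical combination, not necessarily one of the tuned models below):
# Example: instantiate one configuration and inspect its layer stack
example_model = create_cnn_model(architecture='deeper', regularization='l2',
                                 reg_strength=0.001, additional_dropout=True)
example_model.summary()  # prints the layers and parameter counts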
# Test different model architectures
arch_models = [
{
"name": "Simple CNN",
"params": { 'architecture': 'simple', },
},
{
"name": "Deeper CNN",
"params": { 'architecture': 'deeper', },
},
{
"name": "Wider CNN",
"params": { 'architecture': 'wider', },
},
]
# Create directories for checkpoints and histories
os.makedirs("checkpoints", exist_ok=True)
os.makedirs("histories", exist_ok=True)
for model in arch_models:
model["checkpoint"] = os.path.join("checkpoints", "%s.keras" % model["name"])
model["history_file"] = os.path.join("histories", "%s.csv" % model["name"])
# Load and evaluate each model
for model in arch_models:
model["instance"] = load_model(model["checkpoint"])
# Evaluate the model
val_generator.reset()
val_preds = model["instance"].predict(val_generator)
model["auc"] = roc_auc_score(val_generator.classes, val_preds)
model["history"] = pd.read_csv(model["history_file"], sep=',', engine='python')
print("%s Validation AUC: %.4f" % (model["name"], model["auc"]))
arch_models_df = pd.DataFrame(arch_models)
Simple CNN Validation AUC: 0.9180
Deeper CNN Validation AUC: 0.9331
Wider CNN Validation AUC: 0.9239
# Compare model performance
plt.figure(figsize=(10, 6))
bars = plt.bar(arch_models_df["name"], arch_models_df["auc"], color=['skyblue', 'lightgreen', 'lightcoral'])
plt.title('Model Comparison - Validation AUC Scores')
plt.ylabel('AUC Score')
plt.ylim(.7, 1.0)
# Add value labels on bars
for bar, score in zip(bars, arch_models_df["auc"]):
plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + .005,
'%.4f' % score, ha='center', va='bottom')
plt.tight_layout()
plt.show()
# Plot training history for the best model
arch_best_model = arch_models[arch_models_df.auc.idxmax()]
plt.figure(figsize=(15, 5))
plt.subplot(1, 3, 1)
plt.plot(arch_best_model['history']['accuracy'], label='Training Accuracy')
plt.plot(arch_best_model['history']['val_accuracy'], label='Validation Accuracy')
plt.title('%s - Accuracy' % arch_best_model["name"])
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()
plt.subplot(1, 3, 2)
plt.plot(arch_best_model['history']['loss'], label='Training Loss')
plt.plot(arch_best_model['history']['val_loss'], label='Validation Loss')
plt.title('%s - Loss' % arch_best_model["name"])
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.subplot(1, 3, 3)
plt.plot(arch_best_model['history']['auc'], label='Training AUC')
plt.plot(arch_best_model['history']['val_auc'], label='Validation AUC')
plt.title('%s - AUC' % arch_best_model["name"])
plt.xlabel('Epoch')
plt.ylabel('AUC')
plt.legend()
plt.tight_layout()
plt.show()
# Test different regularization methods on the best architecture
best_architecture = arch_best_model["params"]["architecture"]
reg_models = [
{
"name": "Original (No Reg)",
"params": {'architecture': best_architecture, 'regularization': None}
},
{
"name": "L2 Regularization",
"params": {'architecture': best_architecture, 'regularization': 'l2', 'reg_strength': 0.001}
},
{
"name": "L1 Regularization",
"params": {'architecture': best_architecture, 'regularization': 'l1', 'reg_strength': 0.001}
},
{
"name": "Elastic Net (L1+L2)",
"params": {'architecture': best_architecture, 'regularization': 'l1_l2', 'reg_strength': 0.001}
},
{
"name": "Increased Dropout",
"params": {'architecture': best_architecture, 'regularization': None,
'dropout_rate': 0.6, 'additional_dropout': True}
},
{
"name": "Additional BatchNorm",
"params": {'architecture': best_architecture, 'regularization': None,
'additional_bn': True}
}
]
# Create checkpoints and history files for each regularization model
for model in reg_models:
model["checkpoint"] = os.path.join("checkpoints", "%s %s.keras" % (best_architecture, model["name"]))
model["history_file"] = os.path.join("histories", "%s %s.csv" % (best_architecture, model["name"]))
# Load and evaluate each regularization model
for model in reg_models:
model["instance"] = load_model(model["checkpoint"])
# Evaluate the model
val_generator.reset()
val_preds = model["instance"].predict(val_generator)
model["auc"] = roc_auc_score(val_generator.classes, val_preds)
model["history"] = pd.read_csv(model["history_file"], sep=',', engine='python')
print("%s Validation AUC: %.4f" % (model["name"], model["auc"]))
reg_models_df = pd.DataFrame(reg_models)
Original (No Reg) Validation AUC: 0.9258
L2 Regularization Validation AUC: 0.9247
L1 Regularization Validation AUC: 0.9083
Elastic Net (L1+L2) Validation AUC: 0.9103
Increased Dropout Validation AUC: 0.9122
Additional BatchNorm Validation AUC: 0.9327
# Compare regularization methods
plt.figure(figsize=(12, 6))
bars = plt.bar(reg_models_df["name"], reg_models_df["auc"],
color=['skyblue', 'lightgreen', 'lightcoral', 'gold', 'lightpink', 'lightseagreen'])
plt.title('Regularization Methods Comparison - Validation AUC Scores')
plt.ylabel('AUC Score')
plt.xticks(rotation=45, ha='right')
plt.ylim(0.8, 1.0)
# Add value labels on bars
for bar, score in zip(bars, reg_models_df["auc"]):
plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.005,
'%.4f' % score, ha='center', va='bottom')
plt.tight_layout()
plt.show()
# Find the best regularization method
best_reg_model = reg_models[reg_models_df.auc.idxmax()]
best_reg_method = best_reg_model["name"]
print("\nBest regularization method: %s with AUC: %.4f" % (best_reg_method, best_reg_model['auc']))
Best regularization method: Additional BatchNorm with AUC: 0.9327
# Plot training curves for the best regularization method
plt.figure(figsize=(15, 5))
plt.subplot(1, 3, 1)
plt.plot(best_reg_model['history']['accuracy'], label='Training Accuracy')
plt.plot(best_reg_model['history']['val_accuracy'], label='Validation Accuracy')
plt.title('%s - Accuracy' % best_reg_model["name"])
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()
plt.subplot(1, 3, 2)
plt.plot(best_reg_model['history']['loss'], label='Training Loss')
plt.plot(best_reg_model['history']['val_loss'], label='Validation Loss')
plt.title('%s - Loss' % best_reg_model["name"])
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.subplot(1, 3, 3)
plt.plot(best_reg_model['history']['auc'], label='Training AUC')
plt.plot(best_reg_model['history']['val_auc'], label='Validation AUC')
plt.title('%s - AUC' % best_reg_model["name"])
plt.xlabel('Epoch')
plt.ylabel('AUC')
plt.legend()
plt.tight_layout()
plt.show()
# Train the best regularized model on the full dataset
print("\nTraining the best regularized model (%s) on the full dataset..." % best_reg_method)
# Prepare full data generators
full_datagen = ImageDataGenerator(
rescale=1./255,
validation_split=0.2,
rotation_range=20,
width_shift_range=0.1,
height_shift_range=0.1,
horizontal_flip=True,
vertical_flip=True,
zoom_range=0.1
)
full_train_generator = full_datagen.flow_from_dataframe(
dataframe=train_labels,
x_col='path',
y_col='label_str',
target_size=(IMG_SIZE, IMG_SIZE),
batch_size=BATCH_SIZE,
class_mode='binary',
subset='training'
)
full_val_generator = full_datagen.flow_from_dataframe(
dataframe=train_labels,
x_col='path',
y_col='label_str',
target_size=(IMG_SIZE, IMG_SIZE),
batch_size=BATCH_SIZE,
class_mode='binary',
subset='validation',
shuffle=False
)
# Add callbacks for final training
final_checkpoint = ModelCheckpoint("final_model.keras", save_best_only=True, monitor='val_auc', mode='max')
final_csv_logger = CSVLogger("final_training_history.csv")
Training the best regularized model (Additional BatchNorm) on the full dataset...
Found 176020 validated image filenames belonging to 2 classes.
Found 44005 validated image filenames belonging to 2 classes.
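The fit call itself is not shown; a plausible reconstruction, again skipping training when final_model.keras already exists on disk, would be:
# Rebuild the winning configuration and train it on the full-dataset generators.
if not os.path.exists("final_model.keras"):
    final_model = create_cnn_model(**best_reg_model["params"])
    final_model.fit(
        full_train_generator,
        validation_data=full_val_generator,
        epochs=EPOCHS,
        callbacks=[final_checkpoint, final_csv_logger],
    )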
# Load the best model from checkpoint
final_model = load_model("final_model.keras")
final_history = pd.read_csv("final_training_history.csv", sep=',', engine='python')
# Evaluate the final model
final_val_preds = final_model.predict(full_val_generator)
final_val_labels = full_val_generator.classes
final_val_pred_classes = (final_val_preds > 0.5).astype(int).flatten()
final_val_auc = roc_auc_score(final_val_labels, final_val_preds)
print("Final Model Validation AUC: %.4f" % final_val_auc)
print("\nClassification Report:")
print(classification_report(final_val_labels, final_val_pred_classes))
Final Model Validation AUC: 0.9878

Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.97      0.96     26233
           1       0.95      0.93      0.94     17772

    accuracy                           0.95     44005
   macro avg       0.95      0.95      0.95     44005
weighted avg       0.95      0.95      0.95     44005
# Plot training curves for the best regularization method
plt.figure(figsize=(15, 5))
plt.subplot(1, 3, 1)
plt.plot(final_history['accuracy'], label='Training Accuracy')
plt.plot(final_history['val_accuracy'], label='Validation Accuracy')
plt.title('Final model - Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()
plt.subplot(1, 3, 2)
plt.plot(final_history['loss'], label='Training Loss')
plt.plot(final_history['val_loss'], label='Validation Loss')
plt.title('Final model - Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.subplot(1, 3, 3)
plt.plot(final_history['auc'], label='Training AUC')
plt.plot(final_history['val_auc'], label='Validation AUC')
plt.title('Final model - AUC')
plt.xlabel('Epoch')
plt.ylabel('AUC')
plt.legend()
plt.tight_layout()
plt.show()
# Confusion matrix
cm = confusion_matrix(final_val_labels, final_val_pred_classes)
plt.figure(figsize=(6, 5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix - Final Model')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.show()
# Prepare test data for prediction
test_files = os.listdir(TEST_DIR)
test_ids = [os.path.splitext(f)[0] for f in test_files]
test_paths = [os.path.join(TEST_DIR, f) for f in test_files]
test_df = pd.DataFrame({
'id': test_ids,
'path': test_paths
})
test_datagen = ImageDataGenerator(rescale=1./255)
test_generator = test_datagen.flow_from_dataframe(
dataframe=test_df,
x_col='path',
y_col=None,
target_size=(IMG_SIZE, IMG_SIZE),
batch_size=BATCH_SIZE,
class_mode=None,
shuffle=False
)
Found 57458 validated image filenames.
# Make predictions on test set
test_preds = final_model.predict(test_generator)
898/898 ━━━━━━━━━━━━━━━━━━━━ 42s 46ms/step
# Create submission file
submission = pd.DataFrame({
'id': test_ids,
'label': test_preds.flatten()
})
submission.to_csv('submission.csv', index=False)
print("Submission file created successfully!")
Submission file created successfully!
Conclusion
From the hyperparameter tuning figures, additional batch normalization improved the validation performance, but only by about 1%. Overall, almost all of the regularization settings produced very similar results. The same goes for the architecture choice, where the deeper model won only by a tiny margin.
What helped the most in raising the performance was ModelCheckpoint, as it smoothed out the problem of fluctuating validation scores, especially during model selection.
The large fluctuations in the validation scores nevertheless remain a concern. It is true that for our purposes we can pick the epoch that produced the best performance, but there is no guarantee that the epoch which scored the best AUC on our validation set would also score well on Kaggle's test set.
On inspecting the training logs for the larger dataset, the fluctuations are much smaller there, especially as the number of epochs increases. Could the fluctuation have been caused by the limited dataset size and the small number of epochs used during model selection and hyperparameter tuning?
A further improvement would be to investigate that issue. Another avenue of research would be whether more aggressive regularization also helps smooth out the fluctuations.