Objective: Develop a machine learning model to classify tweets as either disaster-related or non-disaster content. This can be tricky, as words that normally signal disasters are often used figuratively in otherwise unrelated sentences.
This problem has important applications: disaster-relief organisations can use the classified tweets for early detection of disasters, and news agencies could get an early lead as well as insight into public reactions.
This analysis utilizes data from the NLP with Disaster Tweets competition.
Dataset Characteristics:
- id: Unique tweet identifier
- text: Tweet content
- location: Geographic origin (optional)
- keyword: Significant term from tweet (optional)
- target: Binary classification label (train set only)
Composition:
- Training set contains 7613 samples with roughly balanced classes (4342 negative / 3271 positive)
- Test set includes 3263 samples for prediction
- Combination of text and metadata features available
Import required libraries
import os
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder
from tqdm import tqdm
import re
import tensorflow as tf
from tensorflow.keras import Sequential, layers, Input, Model
from tensorflow.keras.models import load_model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, CSVLogger
from tensorflow.keras.optimizers import Nadam
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import custom_object_scope
2025-09-17 10:47:19.818964: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Attack strategy:
- Perform comprehensive data exploration
- Implement text preprocessing pipeline
- Utilize pretrained word embeddings
- Develop and compare multiple model architectures
- Tune the hyperparameters of both RNN models
- Evaluate performance using F1-score and generate predictions
Modeling Strategy:
- Baseline model using metadata features
- RNN architectures with LSTM/GRU cells
- Model selection based on F1-score optimization
Initial Data Inspection¶
if 'KAGGLE_URL_BASE' in os.environ:
df_train = pd.read_csv("/kaggle/input/nlp-getting-started/train.csv")
df_test = pd.read_csv("/kaggle/input/nlp-getting-started/test.csv")
else:
DATA_DIR = "data"
df_train = pd.read_csv(os.path.join(DATA_DIR, "train.csv"))
df_test = pd.read_csv(os.path.join(DATA_DIR, "test.csv"))
print('Training dataset overview:')
display(df_train.info())
df_train.head()
Training dataset overview:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype
---  ------    --------------  -----
 0   id        7613 non-null   int64
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64
dtypes: int64(2), object(3)
memory usage: 297.5+ KB
None
| | id | keyword | location | text | target |
|---|---|---|---|---|---|
| 0 | 1 | NaN | NaN | Our Deeds are the Reason of this #earthquake M... | 1 |
| 1 | 4 | NaN | NaN | Forest fire near La Ronge Sask. Canada | 1 |
| 2 | 5 | NaN | NaN | All residents asked to 'shelter in place' are ... | 1 |
| 3 | 6 | NaN | NaN | 13,000 people receive #wildfires evacuation or... | 1 |
| 4 | 7 | NaN | NaN | Just got sent this photo from Ruby #Alaska as ... | 1 |
print('Test dataset overview:')
display(df_test.info())
df_test.head()
Test dataset overview:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3263 entries, 0 to 3262
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype
---  ------    --------------  -----
 0   id        3263 non-null   int64
 1   keyword   3237 non-null   object
 2   location  2158 non-null   object
 3   text      3263 non-null   object
dtypes: int64(1), object(3)
memory usage: 102.1+ KB
None
| | id | keyword | location | text |
|---|---|---|---|---|
| 0 | 0 | NaN | NaN | Just happened a terrible car crash |
| 1 | 2 | NaN | NaN | Heard about #earthquake is different cities, s... |
| 2 | 3 | NaN | NaN | there is a forest fire at spot pond, geese are... |
| 3 | 9 | NaN | NaN | Apocalypse lighting. #Spokane #wildfires |
| 4 | 11 | NaN | NaN | Typhoon Soudelor kills 28 in China and Taiwan |
print('Sample tweet instances:\n')
for row in np.random.choice(len(df_train), size=5, replace=False):
print('(id=%s, label=%s), text: %s' % (df_train.loc[row,'id'], df_train.loc[row,'target'], df_train.loc[row,'text']))
Sample tweet instances:

(id=10605, label=0), text: Have you ever seen the President who killed your wounded child? Or the man that crashed your sister's plane claimin' he was sent of God?
(id=7603, label=0), text: @J3Lyon I'm going to put the FFVII ones out at the weekend so I think Pandemonium! (Don't forget the exclamation mark) would be midweek.
(id=6844, label=0), text: matako_milk: Breaking news! Unconfirmed! I just heard a loud bang nearby. in what appears to be a blast of wind from my neighbour's ass.
(id=680, label=1), text: Suspect in latest US theatre attack had psychological issues http://t.co/OnPnBx0ZEx http://t.co/uM5IcN5Et2
(id=7301, label=1), text: Salem 2 nuclear reactor shut down over electrical circuit failure on pump: The Salem 2 nuclear reactor had bee... http://t.co/5hkGXzJLmX
# Class distribution
print('Class distribution:')
df_train.target.value_counts().plot.pie(autopct='%1.1f%%', ylabel='')
print(df_train.target.value_counts().to_string(header=False))
Class distribution:
0    4342
1    3271
# Text length (training)
df_train['text_length'] = df_train['text'].apply(len)
sns.displot(data=df_train, x='text_length', hue='target', kde=True, legend=True).set(title='Text length by label (training dataset)')
print('Average training text length: %d' % df_train['text_length'].mean())
print('\t\twhen label=1: %d' % df_train[df_train['target'] == 1]['text_length'].mean())
print('\t\twhen label=0: %d' % df_train[df_train['target'] == 0]['text_length'].mean())
Average training text length: 101
		when label=1: 108
		when label=0: 95
# Text length (test)
df_test['text_length'] = df_test['text'].apply(len)
sns.displot(data=df_test, x='text_length', kde=True, legend=True).set(title='Text length (test dataset)')
print('Average test text length: %d' % df_test['text_length'].mean())
Average test text length: 102
Observations¶
It seems that negatives slightly outnumber the positives in this dataset, but not by enough to be a source of concern. As for text length, the training and test datasets are close, showing a similar pattern with a strong spike near Twitter's character limit. There is, however, a difference depending on the label: positives are more skewed towards the character limit.
Data cleaning¶
As seen above, the columns location and keyword are sometimes blank. To fix this, the empty cells will be filled with the value unknown. The text also contains many characters/objects which need to be cleaned or generalized, such as URLs, mentions, and emojis. This is required to use GloVe later on.
Later, the text will need to be tokenized for processing by the ML algorithms. The pretrained GloVe embeddings will then be used to vectorize the tweets.
# Fill empty cells
for df in df_train, df_test:
    df['location'] = df['location'].fillna('unknown')
    df['keyword'] = df['keyword'].fillna('unknown')
Text Cleaning Process¶
def clean_text(text):
"""Apply regex transformations to normalize tweet text following GloVe preprocessing"""
def process_hashtag(hashtag):
"""Process hashtag content according to GloVe specifications"""
hashtag_body = hashtag.group(1)
if hashtag_body.upper() == hashtag_body:
return "<HASHTAG> " + hashtag_body + " <ALLCAPS>"
else:
# Split on uppercase letters and numbers followed by uppercase
split_pattern = r'(?=[A-Z])|(?<=[0-9])(?=[A-Z])|(?<=[A-Z])(?=[0-9])'
parts = re.split(split_pattern, hashtag_body)
parts = [p for p in parts if p] # Remove empty strings
return "<HASHTAG> " + ' '.join(parts)
# Different regex parts for smiley faces
eyes = r"[8:=;]"
nose = r"['`\-]?"
transformations = [
# URLs first
(r"https?://\S+\b|www\.(\w+\.)+\S*", "<URL>"),
# Force splitting words appended with slashes
(r"/", " / "),
# User mentions
(r"@\w+", "<USER>"),
# Smileys and emoticons
(rf"{eyes}{nose}[)d]+|[)d]+{nose}{eyes}", "<SMILE>", re.IGNORECASE),
(rf"{eyes}{nose}p+", "<LOLFACE>", re.IGNORECASE),
(rf"{eyes}{nose}\(+|\)+{nose}{eyes}", "<SADFACE>"),
(rf"{eyes}{nose}[\/|l*]", "<NEUTRALFACE>"),
# Hearts
(r"<3", "<HEART>"),
# Numbers
(r"[-+]?[.\d]*[\d]+[:,.\d]*", "<NUMBER>"),
# Hashtags
(r"#(\S+)", process_hashtag),
# Punctuation repetitions
(r"([!?.]){2,}", r"\1 <REPEAT>"),
# Elongated words
(r"\b(\S*?)(.)\2{2,}\b", r"\1\2 <ELONG>"),
# All caps words (but not tags that already start with <)
(r"(?<!\<)\b([A-Z0-9()<>'`\-]{2,})\b", lambda m: m.group(1).lower() + " <ALLCAPS>")
]
for transformation in transformations:
if len(transformation) == 3:
pattern, replacement, flags = transformation
text = re.sub(pattern, replacement, text, flags=flags)
else:
pattern, replacement = transformation
text = re.sub(pattern, replacement, text)
return text
df_train["clean_text"] = df_train["text"].apply(clean_text)
df_test["clean_text"] = df_test["text"].apply(clean_text)
# Show sample of processed text
print("Sample processed tweets:")
for idx in np.random.choice(len(df_train), 3, replace=False):
print("Original:", df_train.loc[idx, 'text'])
print("Cleaned :", df_train.loc[idx, 'clean_text'])
print("-" * 80)
Sample processed tweets:
Original: Coastal German Shepherd Rescue OC shared a link: 'Ecstatic Rescued Racco... http://t.co/t8Q6DzVgwX #animalrescue
Cleaned : Coastal German Shepherd Rescue oc <ALLCAPS> shared a link: 'Ecstatic Rescued Racco. <REPEAT> <URL> <HASHTAG> animalrescue
--------------------------------------------------------------------------------
Original: Long Road To Ruin - Foo Fighters
Cleaned : Long Road To Ruin - Foo Fighters
--------------------------------------------------------------------------------
Original: @NBCNews Yea bombing #pearlharbor not so good of an idea!
Cleaned : <USER> Yea bombing <HASHTAG> pearlharbor not so good of an idea!
--------------------------------------------------------------------------------
Word Tokenization¶
tokenizer = Tokenizer(num_words=None, filters='\t\n', lower=True, split=' ')
training_texts = df_train['clean_text'].to_list()
training_texts.append('<blank>')
tokenizer.fit_on_texts(training_texts)
idx_word = tokenizer.index_word
word_idx = {word:idx for idx, word in idx_word.items()}
num_words = len(idx_word) + 1
for df in df_train, df_test:
df['sequences'] = tokenizer.texts_to_sequences(df['clean_text'])
df['seq_len'] = df['sequences'].apply(len)
blank_id = word_idx['<blank>']
num_words += 1
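A quick illustration, not part of the original notebook, of how a cleaned tweet maps to its index sequence and back (tokenizer.sequences_to_texts is used here purely for inspection):
# Illustration only: round-trip one cleaned tweet through the tokenizer
example = df_train['clean_text'].iloc[0]
example_seq = tokenizer.texts_to_sequences([example])[0]
print("Cleaned text:", example)
print("Sequence    :", example_seq)
print("Round-trip  :", tokenizer.sequences_to_texts([example_seq])[0])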
Load GloVe Embeddings from Text File¶
GloVe (Global Vectors for Word Representation) serves as the semantic foundation for this NLP project.
The use of GloVe is an example of transfer learning, which we learned about in the last module.
GloVe provides 100-dimensional dense vector representations pre-trained on 27 billion Twitter tokens, capturing linguistic patterns and word relationships specific to social media language. This brings external linguistic knowledge into the model, reducing reliance on our limited training data.
Since fine-tuning the embeddings on our limited dataset would add little benefit, the implementation is simplified by using GloVe as a fixed feature extractor (the pretrained weights are loaded and kept frozen during training).
The cleaning pipeline applied above aligns with GloVe's training conventions.
This has the benefit of better performance with limited data and faster convergence of the model.
def load_glove_embeddings(glove_file_path):
"""Load GloVe embeddings from text file format"""
embeddings = {}
print("Loading GloVe embeddings from %s..." % glove_file_path)
with open(glove_file_path, 'r', encoding='utf-8') as f:
for line in tqdm(f, desc="Processing GloVe vectors"):
values = line.split()
if len(values) < 2: # Skip empty lines
continue
word = values[0]
vector = np.asarray(values[1:], dtype='float32')
embeddings[word] = vector
print("Loaded %d word vectors" % len(embeddings))
return embeddings
# Load GloVe embeddings
if 'KAGGLE_URL_BASE' in os.environ:
glove_file_path = "/kaggle/input/glovetwitter27b100dtxt/glove.twitter.27B.100d.txt"
else:
glove_file_path = os.path.join(DATA_DIR, "glove.twitter.27B.100d.txt")
embeddings = load_glove_embeddings(glove_file_path)
embed_dim = len(next(iter(embeddings.values())))
print("Embedding dimension:", embed_dim)
Loading GloVe embeddings from data/glove.twitter.27B.100d.txt...
Processing GloVe vectors: 1193514it [00:22, 53595.23it/s]
Loaded 1193514 word vectors
Embedding dimension: 100
# Create embedding matrix with proper handling of special tokens
embedding_matrix = np.zeros((num_words, embed_dim))
for word, idx in word_idx.items():
# GloVe embeddings use lowercase versions of special tokens
glove_word = word.lower()
if glove_word in embeddings:
embedding_matrix[idx, :] = embeddings[glove_word]
elif word == '<blank>':
# Use a zero vector for padding
embedding_matrix[idx, :] = np.zeros(embed_dim)
else:
# For out-of-vocabulary words, use a random initialization
embedding_matrix[idx, :] = np.random.normal(0, 0.1, embed_dim)
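As a quick sanity check (an addition, not part of the original run), one can estimate how much of the tokenizer vocabulary is actually covered by the pretrained vectors; the remainder falls back to the random initialization above:
# Sanity check (added for illustration): GloVe coverage of the tokenizer vocabulary
covered = sum(1 for word in word_idx if word.lower() in embeddings)
print("GloVe coverage: %d / %d words (%.1f%%)" % (covered, len(word_idx), 100.0 * covered / len(word_idx)))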
Data Partitioning¶
df_train_sub, df_cv_sub = train_test_split(df_train, test_size=.2, stratify=df_train['target'])
y_train = df_train_sub['target']
y_cv = df_cv_sub['target']
Model Development and Optimization¶
# First write some functions to handle metrics
def calc_model_metrics(y_true, y_pred):
return {
'accuracy': accuracy_score(y_true, y_pred),
'f1': f1_score(y_true, y_pred),
'precision': precision_score(y_true, y_pred),
'recall': recall_score(y_true, y_pred),
'cm': confusion_matrix(y_true, y_pred),
}
def display_model_metrics(model_name, accuracy, f1, precision, recall, cm, show_cm=True):
print('Model:', model_name)
print(' F1-score: %.3f Accuracy: %.3f Precision: %.3f Recall: %.3f' % (f1, accuracy, precision, recall))
if show_cm:
ConfusionMatrixDisplay(cm).plot()
plt.show()
Baseline Model with Metadata Features¶
A Random Forest model is used as a baseline against which to compare the RNNs. It was chosen because it is fast to train, works well on the metadata features, and is well suited to classification tasks.
cols_to_encode = ['keyword','location']
ct = ColumnTransformer(
[("ordinal_encoder", OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), cols_to_encode)]
).fit(df_train[cols_to_encode])
X_train_rf = ct.transform(df_train_sub[cols_to_encode])
X_cv_rf = ct.transform(df_cv_sub[cols_to_encode])
X_test_rf = ct.transform(df_test[cols_to_encode])
rf_classifier = RandomForestClassifier().fit(X_train_rf, y_train)
y_pred_cv = rf_classifier.predict(X_cv_rf)
rf_metrics = calc_model_metrics(y_cv, y_pred_cv)
display_model_metrics('Random Forest Classifier', **rf_metrics)
Model: Random Forest Classifier
F1-score: 0.617 Accuracy: 0.678 Precision: 0.630 Recall: 0.606
RNN Models with Text Features¶
# Prepare the inputs for RNN handling by padding
max_words = df_train['seq_len'].max()
X_train_rnn = pad_sequences(
df_train_sub['sequences'].tolist(),
maxlen=max_words,
padding='pre',
value=blank_id,
dtype='int32'
)
X_cv_rnn = pad_sequences(
df_cv_sub['sequences'].tolist(),
maxlen=max_words,
padding='pre',
value=blank_id,
dtype='int32'
)
X_test_rnn = pad_sequences(
df_test['sequences'].tolist(),
maxlen=max_words,
padding='pre',
value=blank_id,
dtype='int32'
)
y_train_rnn = y_train.values.reshape(-1, 1)
y_cv_rnn = y_cv.values.reshape(-1, 1)
# A function to create RNN models
def create_rnn_model(rnn_layer=layers.GRU, extra_layers=0, units=48, dropout=.3):
"""Create enhanced RNN architecture with GloVe embeddings"""
model = Sequential()
model.add(layers.Embedding(
input_dim=num_words,
output_dim=embed_dim,
weights=[embedding_matrix],
trainable=False,
mask_zero=True
))
for i in range(extra_layers):
model.add(rnn_layer(units, return_sequences=True, dropout=dropout, recurrent_dropout=dropout))
model.add(rnn_layer(units, dropout=dropout, recurrent_dropout=dropout))
model.add(layers.Dense(32, activation='relu'))
model.add(layers.Dropout(.4))
model.add(layers.Dense(1, activation='sigmoid'))
model.compile(
optimizer=Nadam(learning_rate=1e-3),
loss='binary_crossentropy',
metrics=['accuracy']
)
return model
# Create the RNN model configurations along with the appropriate metadata
rnn_models = [
    {
        "params": {
            "rnn_layer": layer_class,
            "extra_layers": n_extra,
            "units": units,
            "dropout": dropout,
        }
    }
    for layer_class in (layers.LSTM, layers.GRU)
    for n_extra in range(3)
    for units in (48, 64)
    for dropout in (0, .2, .4)
]
# Create directories for checkpoints and histories
os.makedirs("checkpoints", exist_ok=True)
os.makedirs("histories", exist_ok=True)
for model in rnn_models:
params = model["params"]
model["instance"] = create_rnn_model(**params)
model["name"] = "%s, layers=%d, units=%d, dropout=%.1f" % (params["rnn_layer"].__name__, params["extra_layers"]+1, params["units"], params["dropout"])
model["checkpoint"] = os.path.join("checkpoints", "%s.keras" % model["name"])
model["history_file"] = os.path.join("histories", "%s.csv" % model["name"])
WARNING: All log messages before absl::InitializeLog() is called are written to STDERR I0000 00:00:1758095267.102520 141633 gpu_device.cc:2020] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 4300 MB memory: -> device: 0, name: NVIDIA GeForce GTX 1660 Ti, pci bus id: 0000:01:00.0, compute capability: 7.5
Train the RNN models¶
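The training cell itself is not reproduced here; the models below are loaded from checkpoints produced by an earlier run. What follows is a minimal sketch of a loop that could have produced those checkpoint and history files, using the imported ModelCheckpoint and CSVLogger together with a custom F1-based early-stopping callback of the kind described in the Discussion. The class name F1ScoreCallback, the patience value, the monitored metric, and the batch/epoch settings are assumptions, not the original configuration.
# Sketch only: a training loop of this shape would produce the checkpoint and
# history files loaded in the next cell. Hyperparameters are illustrative.
class F1ScoreCallback(tf.keras.callbacks.Callback):
    """Log the validation F1-score each epoch and stop early when it stops improving."""
    def __init__(self, x_val, y_val, patience=5):
        super().__init__()
        self.x_val, self.y_val = x_val, y_val
        self.patience = patience
        self.best_f1 = -np.inf
        self.wait = 0

    def on_epoch_end(self, epoch, logs=None):
        logs = logs or {}
        y_pred = np.round(self.model.predict(self.x_val, verbose=0)).ravel()
        logs['val_f1'] = f1_score(np.ravel(self.y_val), y_pred)  # picked up by CSVLogger below
        if logs['val_f1'] > self.best_f1:
            self.best_f1, self.wait = logs['val_f1'], 0
        else:
            self.wait += 1
            if self.wait >= self.patience:
                self.model.stop_training = True

for model in rnn_models:
    callbacks = [
        # Order matters: the F1 callback must run before CSVLogger so that
        # val_f1 ends up in the history CSV.
        F1ScoreCallback(X_cv_rnn, y_cv_rnn, patience=5),
        ModelCheckpoint(model["checkpoint"], monitor='val_f1', mode='max', save_best_only=True),
        CSVLogger(model["history_file"]),
    ]
    model["instance"].fit(
        X_train_rnn, y_train_rnn,
        validation_data=(X_cv_rnn, y_cv_rnn),
        epochs=30, batch_size=128,
        callbacks=callbacks, verbose=0
    )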
# Load the models from the checkpoint files
for model in rnn_models:
model["instance"] = load_model(model["checkpoint"])
model["history"] = pd.read_csv(model["history_file"], sep=',', engine='python')
y_pred_cv = np.round(model["instance"].predict(X_cv_rnn))
model["metrics"] = calc_model_metrics(y_cv, y_pred_cv)
2025-09-17 10:47:49.923844: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:473] Loaded cuDNN version 91200
Display the results¶
# Plot the training history of a single model
def plot_training_history(history, model_name):
"""Plot training history with loss, accuracy, and F1-score"""
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
fig.suptitle('Training History: %s' % model_name, fontsize=16)
# Loss
axes[0, 0].plot(history['loss'], label='Training Loss')
axes[0, 0].plot(history['val_loss'], label='Validation Loss')
axes[0, 0].set_title('Loss')
axes[0, 0].set_xlabel('Epoch')
axes[0, 0].set_ylabel('Loss')
axes[0, 0].legend()
axes[0, 0].grid(True, alpha=.3)
# Accuracy
axes[0, 1].plot(history['accuracy'], label='Training Accuracy')
axes[0, 1].plot(history['val_accuracy'], label='Validation Accuracy')
axes[0, 1].set_title('Accuracy')
axes[0, 1].set_xlabel('Epoch')
axes[0, 1].set_ylabel('Accuracy')
axes[0, 1].legend()
axes[0, 1].grid(True, alpha=.3)
# F1-score (validation only)
if 'val_f1' in history.columns:
axes[1, 0].plot(history['val_f1'], label='Validation F1-score', color='green')
axes[1, 0].set_title('Validation F1-score')
axes[1, 0].set_xlabel('Epoch')
axes[1, 0].set_ylabel('F1-score')
axes[1, 0].legend()
axes[1, 0].grid(True, alpha=.3)
# Mark best F1 epoch
best_f1_epoch = history['val_f1'].idxmax()
best_f1_value = history['val_f1'].max()
axes[1, 0].axvline(x=best_f1_epoch, color='red', linestyle='--', alpha=.7)
axes[1, 0].text(best_f1_epoch + 0.5, best_f1_value - 0.05,
f'Best: {best_f1_value:.3f}', color='red')
    # Remove the unused fourth subplot
    axes[1, 1].remove()
plt.tight_layout()
plt.show()
# Compare the training histories of multiple models
def compare_training_histories(models, title=""):
"""Compare training histories of multiple models"""
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
    fig.suptitle('Model Comparison: %s' % title, fontsize=16)
colors = plt.cm.Set3(np.linspace(0, 1, len(models)))
for i, (model_name, history) in enumerate(models):
color = colors[i]
# Loss
axes[0, 0].plot(history['val_loss'], label=model_name, color=color, alpha=.8)
# Accuracy
axes[0, 1].plot(history['val_accuracy'], label=model_name, color=color, alpha=.8)
# F1-score
axes[1, 0].plot(history['val_f1'], label=model_name, color=color, alpha=.8)
# Format loss subplot
axes[0, 0].set_title('Validation Loss')
axes[0, 0].set_xlabel('Epoch')
axes[0, 0].set_ylabel('Loss')
axes[0, 0].legend()
axes[0, 0].grid(True, alpha=.3)
# Format accuracy subplot
axes[0, 1].set_title('Validation Accuracy')
axes[0, 1].set_xlabel('Epoch')
axes[0, 1].set_ylabel('Accuracy')
axes[0, 1].legend()
axes[0, 1].grid(True, alpha=.3)
# Format F1 subplot
axes[1, 0].set_title('Validation F1-score')
axes[1, 0].set_xlabel('Epoch')
axes[1, 0].set_ylabel('F1-score')
axes[1, 0].legend()
axes[1, 0].grid(True, alpha=.3)
# Remove empty subplot
axes[1, 1].remove()
plt.tight_layout()
plt.show()
# Display the LSTM models in descending order based on their F1 scores
# Also show the confusion matrix of the top model
lstm_models = [model for model in rnn_models if model["params"]["rnn_layer"] == layers.LSTM]
lstm_models.sort(key=lambda model: model["metrics"]["f1"], reverse=True)
for i, model in enumerate(lstm_models):
display_model_metrics(model["name"], **model["metrics"], show_cm=i==0)
Model: LSTM, layers=1, units=48, dropout=0.0
F1-score: 0.829 Accuracy: 0.856 Precision: 0.848 Recall: 0.810
Model: LSTM, layers=3, units=48, dropout=0.4
F1-score: 0.815 Accuracy: 0.844 Precision: 0.834 Recall: 0.797
Model: LSTM, layers=1, units=48, dropout=0.2
F1-score: 0.810 Accuracy: 0.834 Precision: 0.797 Recall: 0.823
Model: LSTM, layers=2, units=48, dropout=0.4
F1-score: 0.808 Accuracy: 0.840 Precision: 0.835 Recall: 0.783
Model: LSTM, layers=2, units=48, dropout=0.0
F1-score: 0.808 Accuracy: 0.838 Precision: 0.827 Recall: 0.789
Model: LSTM, layers=1, units=64, dropout=0.4
F1-score: 0.797 Accuracy: 0.831 Precision: 0.824 Recall: 0.772
Model: LSTM, layers=2, units=48, dropout=0.2
F1-score: 0.791 Accuracy: 0.825 Precision: 0.810 Recall: 0.774
Model: LSTM, layers=3, units=48, dropout=0.0
F1-score: 0.791 Accuracy: 0.820 Precision: 0.790 Recall: 0.792
Model: LSTM, layers=3, units=48, dropout=0.2
F1-score: 0.790 Accuracy: 0.829 Precision: 0.836 Recall: 0.749
Model: LSTM, layers=3, units=64, dropout=0.4
F1-score: 0.789 Accuracy: 0.825 Precision: 0.820 Recall: 0.760
Model: LSTM, layers=1, units=64, dropout=0.0
F1-score: 0.783 Accuracy: 0.822 Precision: 0.823 Recall: 0.746
Model: LSTM, layers=1, units=48, dropout=0.4
F1-score: 0.779 Accuracy: 0.814 Precision: 0.797 Recall: 0.761
Model: LSTM, layers=3, units=64, dropout=0.0
F1-score: 0.777 Accuracy: 0.812 Precision: 0.791 Recall: 0.763
Model: LSTM, layers=1, units=64, dropout=0.2
F1-score: 0.776 Accuracy: 0.810 Precision: 0.783 Recall: 0.769
Model: LSTM, layers=2, units=64, dropout=0.0
F1-score: 0.776 Accuracy: 0.820 Precision: 0.835 Recall: 0.725
Model: LSTM, layers=2, units=64, dropout=0.4
F1-score: 0.771 Accuracy: 0.794 Precision: 0.738 Recall: 0.807
Model: LSTM, layers=2, units=64, dropout=0.2
F1-score: 0.767 Accuracy: 0.797 Precision: 0.756 Recall: 0.780
Model: LSTM, layers=3, units=64, dropout=0.2
F1-score: 0.759 Accuracy: 0.798 Precision: 0.777 Recall: 0.742
# Display the GRU models in descending order based on their F1 scores
# Also show the confusion matrix of the top model
gru_models = [model for model in rnn_models if model["params"]["rnn_layer"] == layers.GRU]
gru_models.sort(key=lambda model: model["metrics"]["f1"], reverse=True)
for i, model in enumerate(gru_models):
display_model_metrics(model["name"], **model["metrics"], show_cm=i==0)
Model: GRU, layers=3, units=64, dropout=0.2
F1-score: 0.857 Accuracy: 0.882 Precision: 0.892 Recall: 0.824
Model: GRU, layers=1, units=48, dropout=0.0
F1-score: 0.821 Accuracy: 0.850 Precision: 0.844 Recall: 0.800
Model: GRU, layers=2, units=64, dropout=0.0
F1-score: 0.818 Accuracy: 0.846 Precision: 0.832 Recall: 0.804
Model: GRU, layers=1, units=64, dropout=0.2
F1-score: 0.813 Accuracy: 0.848 Precision: 0.859 Recall: 0.772
Model: GRU, layers=1, units=64, dropout=0.0
F1-score: 0.811 Accuracy: 0.840 Precision: 0.827 Recall: 0.795
Model: GRU, layers=3, units=48, dropout=0.2
F1-score: 0.807 Accuracy: 0.840 Precision: 0.839 Recall: 0.778
Model: GRU, layers=1, units=48, dropout=0.2
F1-score: 0.804 Accuracy: 0.834 Precision: 0.814 Recall: 0.795
Model: GRU, layers=2, units=48, dropout=0.2
F1-score: 0.792 Accuracy: 0.821 Precision: 0.788 Recall: 0.797
Model: GRU, layers=3, units=64, dropout=0.0
F1-score: 0.787 Accuracy: 0.823 Precision: 0.812 Recall: 0.765
Model: GRU, layers=3, units=48, dropout=0.4
F1-score: 0.786 Accuracy: 0.823 Precision: 0.815 Recall: 0.760
Model: GRU, layers=1, units=64, dropout=0.4
F1-score: 0.779 Accuracy: 0.814 Precision: 0.793 Recall: 0.766
Model: GRU, layers=2, units=48, dropout=0.0
F1-score: 0.777 Accuracy: 0.812 Precision: 0.789 Recall: 0.766
Model: GRU, layers=3, units=64, dropout=0.4
F1-score: 0.774 Accuracy: 0.800 Precision: 0.754 Recall: 0.795
Model: GRU, layers=3, units=48, dropout=0.0
F1-score: 0.773 Accuracy: 0.810 Precision: 0.792 Recall: 0.755
Model: GRU, layers=2, units=48, dropout=0.4
F1-score: 0.768 Accuracy: 0.798 Precision: 0.757 Recall: 0.780
Model: GRU, layers=1, units=48, dropout=0.4
F1-score: 0.766 Accuracy: 0.806 Precision: 0.792 Recall: 0.742
Model: GRU, layers=2, units=64, dropout=0.4
F1-score: 0.761 Accuracy: 0.791 Precision: 0.749 Recall: 0.772
Model: GRU, layers=2, units=64, dropout=0.2
F1-score: 0.759 Accuracy: 0.792 Precision: 0.756 Recall: 0.761
# Plot training history for top LSTM model
print("Training History for Top LSTM Model:")
plot_training_history(lstm_models[0]["history"], lstm_models[0]["name"])
Training History for Top LSTM Model:
# Plot training history for top GRU model
print("Training History for Top GRU Model:")
plot_training_history(gru_models[0]["history"], gru_models[0]["name"])
Training History for Top GRU Model:
# Compare top models from each category
top_models = [
('Top LSTM', lstm_models[0]["history"]),
('Top GRU', gru_models[0]["history"])
]
compare_training_histories(top_models, "Best LSTM vs Best GRU")
# Compare different architectures within LSTM
lstm_comparison = []
for i, model in enumerate(lstm_models[:3]): # Top 3 LSTM models
lstm_comparison.append(("#%d %s" % (i+1, model["name"]), model["history"]))
compare_training_histories(lstm_comparison, "Top 3 LSTM Architectures")
# Compare different architectures within GRU
gru_comparison = []
for i, model in enumerate(gru_models[:3]): # Top 3 GRU models
gru_comparison.append(("#%d %s" % (i+1, model["name"]), model["history"]))
compare_training_histories(gru_comparison, "Top 3 GRU Architectures")
Model Comparison and Selection based on F1-score¶
# Store all model results for comparison
model_results = {
'Random Forest': rf_metrics,
'LSTM': lstm_models[0]["metrics"],
'GRU': gru_models[0]["metrics"],
}
# Create comparison DataFrame
comparison_df = pd.DataFrame(model_results).drop(index='cm').T
comparison_df = comparison_df.sort_values('f1', ascending=False)
print("Model Performance Comparison (Sorted by F1-score):")
print(comparison_df)
# Select best model based on F1-score
best_model_name = comparison_df.index[0]
best_model_metrics = comparison_df.iloc[0]
print("\nBest Model:", best_model_name)
print("Best F1-score: %.3f" % best_model_metrics['f1'])
# Visual comparison
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
# F1-score comparison
axes[0, 0].bar(comparison_df.index, comparison_df['f1'])
axes[0, 0].set_title('F1-score Comparison')
axes[0, 0].set_ylabel('F1-score')
axes[0, 0].tick_params(axis='x', rotation=45)
# Accuracy comparison
axes[0, 1].bar(comparison_df.index, comparison_df['accuracy'])
axes[0, 1].set_title('Accuracy Comparison')
axes[0, 1].set_ylabel('Accuracy')
axes[0, 1].tick_params(axis='x', rotation=45)
# Precision comparison
axes[1, 0].bar(comparison_df.index, comparison_df['precision'])
axes[1, 0].set_title('Precision Comparison')
axes[1, 0].set_ylabel('Precision')
axes[1, 0].tick_params(axis='x', rotation=45)
# Recall comparison
axes[1, 1].bar(comparison_df.index, comparison_df['recall'])
axes[1, 1].set_title('Recall Comparison')
axes[1, 1].set_ylabel('Recall')
axes[1, 1].tick_params(axis='x', rotation=45)
plt.tight_layout()
plt.show()
Model Performance Comparison (Sorted by F1-score):
accuracy f1 precision recall
GRU 0.881812 0.856916 0.892384 0.824159
LSTM 0.856205 0.828772 0.848 0.810398
Random Forest 0.67761 0.617303 0.629571 0.605505
Best Model: GRU
Best F1-score: 0.857
Results Submission with Best Model¶
# Generate predictions using the best model based on F1-score
if best_model_name == 'Random Forest':
best_predictions = rf_classifier.predict(X_test_rf)
elif best_model_name == 'LSTM':
best_predictions = np.round(lstm_models[0]["instance"].predict(X_test_rnn)).astype(int).flatten()
elif best_model_name == 'GRU':
best_predictions = np.round(gru_models[0]["instance"].predict(X_test_rnn)).astype(int).flatten()
# Create submission file
submission = pd.DataFrame({
'id': df_test['id'],
'target': best_predictions
})
submission.to_csv('submission.csv', index=False)
print('Submission file created successfully using %s (F1-score: %.3f)' % (best_model_name, best_model_metrics["f1"]))
Submission file created successfully using GRU (F1-score: 0.857)
Detailed Model Analysis¶
print("Detailed Model Performance Analysis:")
for model_name, metrics in model_results.items():
print("\n%s:" % model_name)
print(" F1-score: %.3f" % metrics['f1'])
print(" Accuracy: %.3f" % metrics['accuracy'])
print(" Precision: %.3f" % metrics['precision'])
print(" Recall: %.3f" % metrics['recall'])
Detailed Model Performance Analysis:

Random Forest:
  F1-score: 0.617
  Accuracy: 0.678
  Precision: 0.630
  Recall: 0.606

LSTM:
  F1-score: 0.829
  Accuracy: 0.856
  Precision: 0.848
  Recall: 0.810

GRU:
  F1-score: 0.857
  Accuracy: 0.882
  Precision: 0.892
  Recall: 0.824
Discussion¶
It seems that for this data, LSTM and GRU achieved almost the same scores, with GRU having a slight edge. Inspecting the training histories, the fluctuations are large enough that randomness plays a considerable role. Nevertheless, a clear pattern does emerge: increasing the dropout significantly reduces overfitting. This can be seen across all the metrics, especially in the validation loss and validation accuracy.
To reduce the effect of overfitting, the F1ScoreCallback class stops training early when it detects that the validation F1-score has been falling. This is not ideal, however, as the top F1-score recorded might be due to luck rather than a good fit. Looking at the training histories, models without dropout are stopped rather quickly, and the higher the dropout, the higher the chance that training continues to the end.
When it comes to the number of layers, deeper models also seem to perform better, especially when combined with a high dropout. The differences are not large enough, though, to rule out randomness.
It must be noted that dropout carries a significant cost: training time. When certain conditions are met, TensorFlow can make heavy use of GPU optimizations, and these conditions include having no dropout inside the RNN layer, so using dropout prevents these optimizations. Quoting the TensorFlow documentation:
Based on available runtime hardware and constraints, this layer will choose different implementations (cuDNN-based or backend-native) to maximize the performance. If a GPU is available and all the arguments to the layer meet the requirement of the cuDNN kernel (see below for details), the layer will use a fast cuDNN implementation when using the TensorFlow backend.
The requirements to use the cuDNN implementation are:
- activation == tanh
- recurrent_activation == sigmoid
- dropout == 0 and recurrent_dropout == 0
- unroll is False
- use_bias is True
- reset_after is True
- Inputs, if use masking, are strictly right-padded.
- Eager execution is enabled in the outermost context.
This is clearly visible during training, as models with dropout take more than 10 times as long to train.
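One possible mitigation, not explored in this notebook, is to keep dropout and recurrent_dropout at 0 inside the recurrent layers (so the cuDNN kernel stays eligible) and regularize with separate Dropout layers between them instead. A minimal sketch, reusing the frozen embedding from above; the layer sizes are illustrative:
# Sketch only: cuDNN-friendly variant that moves dropout outside the RNN cells.
# The other cuDNN requirements (tanh activation, sigmoid recurrent activation,
# masking/padding constraints) still apply.
def create_cudnn_friendly_model(rnn_layer=layers.GRU, units=48, dropout=.3):
    model = Sequential()
    model.add(layers.Embedding(
        input_dim=num_words,
        output_dim=embed_dim,
        weights=[embedding_matrix],
        trainable=False,
        mask_zero=True
    ))
    model.add(rnn_layer(units, return_sequences=True))  # no in-cell dropout: cuDNN-eligible
    model.add(layers.Dropout(dropout))                   # regularization between layers instead
    model.add(rnn_layer(units))
    model.add(layers.Dense(32, activation='relu'))
    model.add(layers.Dropout(dropout))
    model.add(layers.Dense(1, activation='sigmoid'))
    model.compile(optimizer=Nadam(learning_rate=1e-3),
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model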
To summarize, overfitting was a big problem when training the RNN models. To address it, two techniques were used: adding dropout to the RNN layers themselves, and early stopping. Both helped remedy the situation, but neither is ideal.
The results did not vary consistently with the choice of parameters, suggesting that randomness plays a larger role than the hyperparameters.
Future steps:
- Try to use other methods to reduce overfitting.
- Check if even larger models can produce better results.
- When building the model, try to use the fact that positives have a higher average text length (see the sketch after this list).
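A hedged sketch of that last idea, using the functional API (Input and Model are already imported) to feed a normalised text length alongside the token sequence; the layer sizes and the scaling are illustrative assumptions:
# Sketch only: two-input model combining the token sequence with text length.
seq_input = Input(shape=(max_words,), dtype='int32', name='tokens')
len_input = Input(shape=(1,), dtype='float32', name='text_length')

x = layers.Embedding(
    input_dim=num_words,
    output_dim=embed_dim,
    weights=[embedding_matrix],
    trainable=False,
    mask_zero=True
)(seq_input)
x = layers.GRU(48)(x)
x = layers.concatenate([x, len_input])
x = layers.Dense(32, activation='relu')(x)
x = layers.Dropout(.4)(x)
output = layers.Dense(1, activation='sigmoid')(x)

hybrid_model = Model(inputs=[seq_input, len_input], outputs=output)
hybrid_model.compile(optimizer=Nadam(learning_rate=1e-3),
                     loss='binary_crossentropy',
                     metrics=['accuracy'])

# Example fit call, scaling text_length by the maximum seen in training:
# scale = df_train['text_length'].max()
# hybrid_model.fit([X_train_rnn, (df_train_sub['text_length'] / scale).values], y_train_rnn,
#                  validation_data=([X_cv_rnn, (df_cv_sub['text_length'] / scale).values], y_cv_rnn))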
References¶
- Disaster tweet classificator based on LSTM or GRU by Miguel D B
- GloVe: Global Vectors for Word Representation
- glove.twitter.27B.100d.txt, a Kaggle dataset copied from the above project.