From b358c276ff49c76ab945ffc51760e51328f574b1 Mon Sep 17 00:00:00 2001
From: A Farzat
Date: Fri, 31 Oct 2025 22:32:50 +0300
Subject: Improve the content of the blog post

---
 content/blog/csca5622-final/index.md | 65 ++++++++++++++++++++++++++++++++----
 1 file changed, 58 insertions(+), 7 deletions(-)

diff --git a/content/blog/csca5622-final/index.md b/content/blog/csca5622-final/index.md
index f3747e6..d4115b7 100644
--- a/content/blog/csca5622-final/index.md
+++ b/content/blog/csca5622-final/index.md
@@ -1,6 +1,6 @@
 +++
-title = "Spam Email Classification (non-DL)"
-description = "Comparing different machine learning algorithms on the Spam Email Classification problem (deep learning not included)."
+title = "📧 Is This Spam? Testing Email Classification Models"
+description = "Exploring which machine learning models best detect spam emails—and why ensemble methods like AdaBoost and Random Forest come out on top."
 date = 2025-10-22
 [taxonomies]
 tags = ["machine_learning"]
@@ -8,13 +8,64 @@ tags = ["machine_learning"]
 styles = ["notebooks.css",
 ]
 +++

-This is a small research I made on the performance of different machine learning
-models when classifying spam email. The focus is on supervised models, but without
-including deep learning models.
+Spam filters are something we rely on every day, often without thinking about
+how they work. In this project, I explored how different machine learning
+models perform when tasked with identifying spam emails using a dataset from
+the UCI Machine Learning Repository.

-You can also view the notebook as [a separate page](notebook.html).
+## About the Dataset
+
+The dataset includes over 4,600 emails, each described by 57 features. These
+features capture things like how often certain words or characters appear
+(e.g., “free”, “$”, “!”), and how long sequences of capital letters are. Each
+email is labeled as either spam or not spam.
+
+Some features are surprisingly specific—like the presence of the word “george”
+or the area code “650”—which turned out to be strong indicators of non-spam.
+These quirks reflect the personal nature of the original email sources: the
+non-spam messages were collected from the dataset donors’ own work and
+personal mail.
+
+## What I Tried
+
+The goal was to test a few different models and see which one did the best job.
+I compared:
+
+* Logistic Regression
+* Random Forest
+* AdaBoost
+* Support Vector Machines (SVMs)
+
+Each model was tuned to find its best settings and then evaluated on accuracy,
+precision, and recall. A simplified sketch of that workflow appears near the
+end of this post, just before the links to the full notebook.
+
+## What Worked Best
+
+The ensemble models—Random Forest and AdaBoost—stood out. They consistently
+delivered high accuracy and precision, outperforming the benchmarks published
+on UCI’s website.
+
+Logistic Regression also did well, especially when regularization was used to
+handle overlapping features. SVMs, on the other hand, didn’t perform as
+strongly. Interestingly, the simpler LinearSVC model did better than the more
+complex RBF kernel version.
+
+## Why Precision Matters
+
+In spam detection, false positives (marking a legitimate email as spam) are
+worse than false negatives, so precision matters more than raw accuracy.
+Fortunately, the best-performing models had strong precision scores, especially
+the ensemble ones.
+
+## Final Thoughts
+
+This project was a great way to see how different models handle a real-world
+classification task. While the results were solid, there’s still room to
+improve—especially when it comes to minimizing false positives. Adjusting
+thresholds or tweaking model weights could help push precision even higher.
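+
+For readers who want a feel for the code before opening the notebook, here is
+a rough sketch of the comparison. It is illustrative only, not the exact code
+from the notebook: it assumes scikit-learn and pandas, pulls the data straight
+from the usual UCI mirror path, and uses much smaller hyper-parameter grids
+than the real search.
+
+```python
+import pandas as pd
+from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
+from sklearn.linear_model import LogisticRegression
+from sklearn.metrics import accuracy_score, precision_score, recall_score
+from sklearn.model_selection import GridSearchCV, train_test_split
+from sklearn.pipeline import make_pipeline
+from sklearn.preprocessing import StandardScaler
+from sklearn.svm import SVC, LinearSVC
+
+# Spambase: 57 numeric features plus a 0/1 spam label, no header row.
+# (The usual UCI mirror path -- adjust it if the repository moves.)
+URL = ("https://archive.ics.uci.edu/ml/machine-learning-databases"
+       "/spambase/spambase.data")
+data = pd.read_csv(URL, header=None)
+X, y = data.iloc[:, :-1], data.iloc[:, -1]
+X_train, X_test, y_train, y_test = train_test_split(
+    X, y, test_size=0.2, stratify=y, random_state=42)
+
+# Candidate models with small, illustrative hyper-parameter grids.
+candidates = {
+    "Logistic Regression": (
+        make_pipeline(StandardScaler(), LogisticRegression(max_iter=5000)),
+        {"logisticregression__C": [0.1, 1, 10]},
+    ),
+    "Random Forest": (
+        RandomForestClassifier(random_state=42),
+        {"n_estimators": [200, 500], "max_depth": [None, 20]},
+    ),
+    "AdaBoost": (
+        AdaBoostClassifier(random_state=42),
+        {"n_estimators": [100, 300], "learning_rate": [0.5, 1.0]},
+    ),
+    "Linear SVM": (
+        make_pipeline(StandardScaler(), LinearSVC(max_iter=10000)),
+        {"linearsvc__C": [0.1, 1, 10]},
+    ),
+    "RBF SVM": (
+        make_pipeline(StandardScaler(), SVC(kernel="rbf")),
+        {"svc__C": [1, 10], "svc__gamma": ["scale", 0.01]},
+    ),
+}
+
+# Tune each model on the training split (optimising for precision, since
+# false positives are the costly mistake), then score the held-out set.
+for name, (model, grid) in candidates.items():
+    search = GridSearchCV(model, grid, cv=5, scoring="precision")
+    search.fit(X_train, y_train)
+    pred = search.predict(X_test)
+    print(f"{name:20s}"
+          f" accuracy={accuracy_score(y_test, pred):.3f}"
+          f" precision={precision_score(y_test, pred):.3f}"
+          f" recall={recall_score(y_test, pred):.3f}")
+```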
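+
+The thresholding idea is also easy to prototype. A classifier like the random
+forest exposes `predict_proba`, so instead of the default 0.5 cut-off you can
+demand more confidence before flagging a message as spam. Again, this is just
+a sketch continuing from the snippet above, with arbitrary example thresholds:
+
+```python
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.metrics import precision_score, recall_score
+
+# Reuses X_train, X_test, y_train and y_test from the previous snippet.
+forest = RandomForestClassifier(n_estimators=500, random_state=42)
+forest.fit(X_train, y_train)
+proba = forest.predict_proba(X_test)[:, 1]  # estimated probability of spam
+
+# Raising the cut-off trades recall for precision: fewer legitimate emails
+# get flagged, at the cost of letting a little more spam through.
+for threshold in (0.5, 0.7, 0.9):
+    pred = (proba >= threshold).astype(int)
+    print(f"threshold={threshold:.1f}"
+          f"  precision={precision_score(y_test, pred):.3f}"
+          f"  recall={recall_score(y_test, pred):.3f}")
+```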
+
+The full notebook with code and visualizations is embedded below 👇

-You can also check it on [GitHub](https://github.com/Farzat07/introduction-to-machine-learning-supervised-learning-final-assignment).
+You can also view the notebook on [a separate page](notebook.html), or check
+it on [GitHub](https://github.com/Farzat07/introduction-to-machine-learning-supervised-learning-final-assignment).
--
cgit v1.2.3-70-g09d2