BBC News Classification Kaggle Mini-Project¶
The objectives of this mini-project are:
- Perform an exploratory data analysis (EDA) procedure.
- Build and train an unsupervised learning model using Non-negative Matrix Factorisation (NMF).
- Build and train a supervised learning model.
- Compare the two models.
Project setup¶
Import the required modules and load the datasets.
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
/kaggle/input/learn-ai-bbc/BBC News Train.csv
/kaggle/input/learn-ai-bbc/BBC News Sample Solution.csv
/kaggle/input/learn-ai-bbc/BBC News Test.csv
from itertools import permutations
import matplotlib.pyplot as plt
from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from wordcloud import WordCloud
data_train = pd.read_csv("/kaggle/input/learn-ai-bbc/BBC News Train.csv")
data_test = pd.read_csv("/kaggle/input/learn-ai-bbc/BBC News Test.csv")
sample_solution = pd.read_csv("/kaggle/input/learn-ai-bbc/BBC News Sample Solution.csv")
Inspection¶
Training data¶
print(data_train.info())
data_train.head()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1490 entries, 0 to 1489
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype
---  ------     --------------  -----
 0   ArticleId  1490 non-null   int64
 1   Text       1490 non-null   object
 2   Category   1490 non-null   object
dtypes: int64(1), object(2)
memory usage: 35.1+ KB
None
| | ArticleId | Text | Category |
|---|---|---|---|
| 0 | 1833 | worldcom ex-boss launches defence lawyers defe... | business |
| 1 | 154 | german business confidence slides german busin... | business |
| 2 | 1101 | bbc poll indicates economic gloom citizens in ... | business |
| 3 | 1976 | lifestyle governs mobile choice faster bett... | tech |
| 4 | 917 | enron bosses in $168m payout eighteen former e... | business |
Using the Data tab on the competition page, we can see that all rows are valid, but only 1440 out of 1490 values in the Text column are unique.
In addition, the Category column has a dtype of object rather than a proper categorical dtype.
The next cell fixes both of these issues.
data_train = data_train.drop_duplicates(subset = ["Text"])
data_train.Category = pd.Categorical(data_train.Category)
Next we check the ratios of the categories using a pie chart. From the chart below, it seems that the categories are well-balanced.
data_train.Category.value_counts().plot.pie(autopct='%1.1f%%', ylabel='')
<Axes: >
Test data and provided sample solution¶
These have the same structure as the training data, except that the test set lacks the Category column, while the sample solution lacks the Text column.
print(data_test.info())
data_test.head()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 735 entries, 0 to 734
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype
---  ------     --------------  -----
 0   ArticleId  735 non-null    int64
 1   Text       735 non-null    object
dtypes: int64(1), object(1)
memory usage: 11.6+ KB
None
| | ArticleId | Text |
|---|---|---|
| 0 | 1018 | qpr keeper day heads for preston queens park r... |
| 1 | 1319 | software watching while you work software that... |
| 2 | 1138 | d arcy injury adds to ireland woe gordon d arc... |
| 3 | 459 | india s reliance family feud heats up the ongo... |
| 4 | 1020 | boro suffer morrison injury blow middlesbrough... |
print(sample_solution.info())
sample_solution.head()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 735 entries, 0 to 734
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype
---  ------     --------------  -----
 0   ArticleId  735 non-null    int64
 1   Category   735 non-null    object
dtypes: int64(1), object(1)
memory usage: 11.6+ KB
None
| | ArticleId | Category |
|---|---|---|
| 0 | 1018 | sport |
| 1 | 1319 | tech |
| 2 | 1138 | business |
| 3 | 459 | entertainment |
| 4 | 1020 | politics |
Preprocessing¶
Tf-idf converts a raw count matrix into a weighting better suited to our use case: it reduces the impact of words that occur frequently across all documents, so that rarer words, which are more likely to define the topic of an article, are given appropriate weight.
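As a toy illustration (invented sentences, not part of the pipeline below), the snippet shows how a word that occurs in every document ("market") receives a lower tf-idf weight than a rarer, topic-specific word ("cricket") within the same document:
# Toy illustration only: "market" occurs in all three documents, "cricket" in just one,
# so tf-idf weights "cricket" more heavily in the document that contains both.
from sklearn.feature_extraction.text import TfidfVectorizer  # same class as used below
toy_corpus = [
    "market shares rise as market confidence grows",
    "cricket team wins as market stays calm",
    "market watchdog warns investors",
]
toy_vectoriser = TfidfVectorizer()
toy_weights = toy_vectoriser.fit_transform(toy_corpus).toarray()
toy_vocab = list(toy_vectoriser.get_feature_names_out())
for word in ("market", "cricket"):
    print(word, round(toy_weights[1, toy_vocab.index(word)], 3))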
Before applying tf-idf, there are preprocessing tasks under consideration:
- Convert to lower case.
- Remove punctuation.
- Remove stopwords.
- Stemming/Lemmatisation.
- Stripping extra white space.
These steps are designed to remove unnecessary features (such as meaningless tokens or words that appear frequently regardless of topic). Words that are essentially the same, such as Business and business, should be treated as one token where possible so that their occurrences are pooled, making the features more informative.
Fortunately, all of these except lemmatisation can be handled easily by TfidfVectorizer, greatly simplifying the implementation. Note that when analyzer == 'word', punctuation and extra white space are removed automatically. Lower-casing and stop-word removal can also be enabled via their respective parameters.
An idea to increase performance¶
As for lemmatisation, it seems potentially useful according to this stackexchange answer. However, this paper suggests that the added complexity is not worth it, so I decided not to implement it after all.
Instead, I will try replacing the numbers in the text with NUM using a regex. This is much easier to implement and should give some insight into whether the kind of normalisation performed by stemming/lemmatisation actually helps.
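As a quick check of what that substitution does (the sentence below is just an invented example), the regex collapses every run of digits into the single token NUM:
import re  # the actual replacement later uses pandas' str.replace with the same pattern
print(re.sub(r"\d+", "NUM", "shares rose 12% to $168m in 2005"))
# shares rose NUM% to $NUMm in NUM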
vectoriser = TfidfVectorizer(stop_words="english") # lowercase is True by default and analyzer==word.
combined_text = pd.concat([data_train.Text, data_test.Text], ignore_index=True)
vectorised = vectoriser.fit_transform(combined_text)
print(vectorised.shape)
(2175, 29126)
Now we will try to visualise the most common words. Notice that because we used the version of the text with the numbers kept, some of them appear here. Among the numbers, 000 and specific years occur most frequently (the token 000 appears because the default tokeniser splits figures such as 25,000 at the comma).
word_frequencies = pd.DataFrame(vectorised.toarray(), columns=vectoriser.get_feature_names_out()).T.sum(axis=1)
wordcloud = WordCloud(background_color='white').generate_from_frequencies(word_frequencies.to_dict())
plt.imshow(wordcloud)
plt.axis('off')
plt.show()
Building and training the NMF model¶
Although NMF does not need labels, the model is fitted on the combined tf-idf matrix of the training and test articles prepared above. This is safe because only the training portion of the resulting topic assignments is used to map topics to categories and to measure accuracy; the test labels are never involved, so the test set still measures performance on articles whose labels the model has not seen.
Accuracy will be used as the performance metric as it is the one used by the competition.
The code below gives an accuracy of 90.8%. Surprisingly, when tested using data_test and submitted, it returned an even higher accuracy of 92.2% (V5).
nmf = NMF(n_components=5)
train_pred = nmf.fit_transform(vectorised).argmax(axis=1)[:1440]
def label_permute_compare(ytdf, yp):
    """
    ytdf: labels column from dataframe object.
    yp: label prediction output.
    Returns permuted label order and accuracy.
    Example output: ('business', 'politics', 'sport', 'entertainment', 'tech'), 0.74 .
    """
    y_true = ytdf
    best_acc = 0.
    # With only five categories there are 5! = 120 possible topic-to-category
    # mappings, so a brute-force search over all permutations is cheap.
    for perm in permutations(ytdf.cat.categories):
        y_pred = [perm[i] for i in yp]
        accuracy = accuracy_score(y_true, y_pred)
        if accuracy > best_acc:
            best_acc = accuracy
            best_perm = perm
    return best_perm, best_acc
labels, acc = label_permute_compare(data_train.Category, train_pred)
print("Accuracy: %.1f%%" % (acc * 100))
confusion_matrix(data_train.Category, [labels[x] for x in train_pred])
Accuracy: 91.7%
array([[302, 1, 17, 1, 14],
[ 5, 211, 7, 3, 37],
[ 13, 0, 242, 2, 9],
[ 1, 2, 0, 339, 0],
[ 0, 3, 0, 4, 227]])
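As an optional sanity check (not needed for the submission), we can list the top-weighted terms of each NMF component to confirm that the discovered topics really correspond to the assigned categories:
# Top terms per NMF component, using the fitted nmf, vectoriser and the
# labels permutation found above.
terms = vectoriser.get_feature_names_out()
for topic_idx, weights in enumerate(nmf.components_):
    top_terms = terms[weights.argsort()[::-1][:8]]
    print("%-13s %s" % (labels[topic_idx], ", ".join(top_terms)))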
Try replacing the numbers with NUM¶
Here we unfortunately see that reducing the complexity by generalising the numbers actually decreases the performance of the model.
text_without_nums = combined_text.str.replace(r'\d+', 'NUM', regex=True)
vectorised_without_nums = TfidfVectorizer(stop_words="english").fit_transform(text_without_nums)
without_nums_pred = NMF(5).fit_transform(vectorised_without_nums).argmax(axis=1)[:1440]
labels_without_num, acc_without_num = label_permute_compare(data_train.Category, without_nums_pred)
print("Accuracy: %.1f%%" % (acc_without_num * 100))
confusion_matrix(data_train.Category, [labels_without_num[x] for x in without_nums_pred])
Accuracy: 88.0%
array([[287, 1, 30, 1, 16],
[ 20, 200, 8, 6, 29],
[ 12, 0, 241, 5, 8],
[ 19, 2, 0, 321, 0],
[ 7, 3, 0, 6, 218]])
Changing the hyperparameters¶
In the previous models, parameters such as the maximum number of features and the normalisation were left at their default values. We will now try out different combinations to see whether they improve the accuracy.
def gen_model(vectoriser):
    model = {"vectoriser": vectoriser}
    vectorised = vectoriser.fit_transform(combined_text)
    nmf = NMF(n_components=5)
    model["nmf"] = nmf
    y_pred = nmf.fit_transform(vectorised).argmax(axis=1)[:1440]
    model["pred"] = y_pred
    model["labels"], model["acc"] = label_permute_compare(data_train.Category, y_pred)
    return model
best_accuracy = 0.
for norm in ("l1", "l2", None):
    for max_features in (2000, 4000, 6000, 8000, None):
        model = gen_model(TfidfVectorizer(stop_words="english", norm=norm, max_features=max_features))
        print("norm=%s, max_features=%s, acc=%.5f%%" % (norm, max_features, model["acc"] * 100))
        if model["acc"] > best_accuracy:
            best_model = model
            best_accuracy = model["acc"]
print("Best model accuracy = %.3f%%" % (best_model["acc"] * 100))
confusion_matrix(data_train.Category, [best_model["labels"][x] for x in best_model["pred"]])
norm=l1, max_features=2000, acc=90.20833%
norm=l1, max_features=4000, acc=90.27778%
norm=l1, max_features=6000, acc=90.97222%
norm=l1, max_features=8000, acc=90.97222%
norm=l1, max_features=None, acc=90.69444%
norm=l2, max_features=2000, acc=91.25000%
norm=l2, max_features=4000, acc=91.73611%
norm=l2, max_features=6000, acc=91.52778%
norm=l2, max_features=8000, acc=91.45833%
norm=l2, max_features=None, acc=91.73611%
norm=None, max_features=2000, acc=47.22222%
norm=None, max_features=4000, acc=42.29167%
norm=None, max_features=6000, acc=42.01389%
norm=None, max_features=8000, acc=40.69444%
norm=None, max_features=None, acc=45.69444%
Best model accuracy = 91.736%
array([[304, 1, 18, 0, 12],
[ 5, 209, 7, 4, 38],
[ 13, 0, 243, 2, 8],
[ 2, 2, 0, 338, 0],
[ 0, 3, 0, 4, 227]])
Test on data_test¶
The model used has been developed on both the train and test datasets together, as NMF does not require labels to train. As the test labels are not provided, there is no risk of "cheating". In a real-life scenario, there is nothing stopping the model from training itself on the test data before providing results, just like we are doing here.
vectorised = best_model["vectoriser"].transform(combined_text)
test_pred = best_model["nmf"].transform(vectorised).argmax(axis=1)[1440:]
sample_solution.Category = [best_model["labels"][x] for x in test_pred]
print("The order of solution df and test df is equivalent:", all(sample_solution.ArticleId == data_test.ArticleId))
print(sample_solution)
sample_solution.to_csv("submission.csv", index=False)
The order of solution df and test df is equivalent: True
ArticleId Category
0 1018 sport
1 1319 tech
2 1138 sport
3 459 business
4 1020 sport
.. ... ...
730 1923 business
731 373 entertainment
732 1704 tech
733 206 business
734 471 politics
[735 rows x 2 columns]
Building and training the Supervised Learning model¶
We will now compare this performance with that of supervised learning. As a demonstration, one of the most basic supervised classifiers will be used: logistic regression. To start with, the model will be trained on 80% of the data.
train_vectorised = best_model["vectoriser"].transform(data_train.Text)
X_train, X_test, y_train, y_test = train_test_split(train_vectorised, data_train.Category, test_size=.2, random_state=4)
clf = LogisticRegression(random_state=4).fit(X_train, y_train)
print("Training score: %.1f%%" % (clf.score(X_train, y_train) * 100))
print("Test score: %.1f%%" % (clf.score(X_test, y_test) * 100))
confusion_matrix(y_test, clf.predict(X_test))
Training score: 99.7%
Test score: 96.9%
array([[69, 0, 0, 0, 1],
[ 0, 46, 0, 0, 0],
[ 4, 0, 54, 0, 0],
[ 0, 0, 0, 65, 0],
[ 0, 3, 0, 1, 45]])
It is evident that supervised learning is superior to unsupervised learning for tasks like this one when labels are readily available. The disparity between the training score and the test score raises concerns, however, as such models have a tendency to overfit. We will try reducing the size of the training data in decrements of 20% to see how that affects performance.
print("Training size: 60%")
X_train, X_test, y_train, y_test = train_test_split(train_vectorised, data_train.Category, test_size=.4, random_state=4)
clf = LogisticRegression(random_state=4).fit(X_train, y_train)
print("Training score: %.1f%%" % (clf.score(X_train, y_train) * 100))
print("Test score: %.1f%%" % (clf.score(X_test, y_test) * 100))
print("\nTraining size: 40%")
X_train, X_test, y_train, y_test = train_test_split(train_vectorised, data_train.Category, test_size=.6, random_state=4)
clf = LogisticRegression(random_state=4).fit(X_train, y_train)
print("Training score: %.1f%%" % (clf.score(X_train, y_train) * 100))
print("Test score: %.1f%%" % (clf.score(X_test, y_test) * 100))
print("\nTraining size: 20%")
X_train, X_test, y_train, y_test = train_test_split(train_vectorised, data_train.Category, test_size=.8, random_state=4)
clf = LogisticRegression(random_state=4).fit(X_train, y_train)
print("Training score: %.1f%%" % (clf.score(X_train, y_train) * 100))
print("Test score: %.1f%%" % (clf.score(X_test, y_test) * 100))
Training size: 60%
Training score: 99.8%
Test score: 97.0%

Training size: 40%
Training score: 99.8%
Test score: 96.5%

Training size: 20%
Training score: 100.0%
Test score: 92.4%
Interestingly, both the training and test scores increase by 0.1 percentage points at first. It is only when the training set shrinks to 20% of the data that the model really suffers from overfitting, with a training score of 100% against a test score of 92.4%. It can therefore be concluded that the model is indeed vulnerable to overfitting but has good data efficiency: it needs only around half of the dataset to achieve high test scores.
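As an aside, the same sweep could be automated with scikit-learn's learning_curve, which averages scores over cross-validation folds at each training size instead of relying on a single split; a sketch (not run here):
from sklearn.model_selection import learning_curve

# Cross-validated learning curve for the logistic regression baseline.
sizes, train_scores, cv_scores = learning_curve(
    LogisticRegression(random_state=4),
    train_vectorised,
    data_train.Category,
    train_sizes=[0.2, 0.4, 0.6, 0.8, 1.0],
    cv=5,
)
for n, tr, te in zip(sizes, train_scores.mean(axis=1), cv_scores.mean(axis=1)):
    print("train size=%4d  train acc=%.3f  cv acc=%.3f" % (n, tr, te))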
Trying the same with LinearSVC¶
print("Training size: 80%")
X_train, X_test, y_train, y_test = train_test_split(train_vectorised, data_train.Category, test_size=.2, random_state=4)
clf = LinearSVC(random_state=4).fit(X_train, y_train)
print("Training score: %.1f%%" % (clf.score(X_train, y_train) * 100))
print("Test score: %.1f%%" % (clf.score(X_test, y_test) * 100))
Training size: 80%
Training score: 100.0%
Test score: 96.9%
print("Training size: 60%")
X_train, X_test, y_train, y_test = train_test_split(train_vectorised, data_train.Category, test_size=.4, random_state=4)
clf = LinearSVC(random_state=4).fit(X_train, y_train)
print("Training score: %.1f%%" % (clf.score(X_train, y_train) * 100))
print("Test score: %.1f%%" % (clf.score(X_test, y_test) * 100))
print("\nTraining size: 40%")
X_train, X_test, y_train, y_test = train_test_split(train_vectorised, data_train.Category, test_size=.6, random_state=4)
clf = LinearSVC(random_state=4).fit(X_train, y_train)
print("Training score: %.1f%%" % (clf.score(X_train, y_train) * 100))
print("Test score: %.1f%%" % (clf.score(X_test, y_test) * 100))
print("\nTraining size: 20%")
X_train, X_test, y_train, y_test = train_test_split(train_vectorised, data_train.Category, test_size=.8, random_state=4)
clf = LinearSVC(random_state=4).fit(X_train, y_train)
print("Training score: %.1f%%" % (clf.score(X_train, y_train) * 100))
print("Test score: %.1f%%" % (clf.score(X_test, y_test) * 100))
Training size: 60%
Training score: 100.0%
Test score: 97.2%

Training size: 40%
Training score: 100.0%
Test score: 96.9%

Training size: 20%
Training score: 100.0%
Test score: 95.7%
It seems from these tests that LinearSVC does not improve much on LogisticRegression in terms of top performance, but it wins on data efficiency, achieving 95.7% accuracy with just 20% of the training data.