# This Python 3 environment comes with many helpful analytics libraries installed
+# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
+# For example, here's several helpful packages to load
+
+import numpy as np # linear algebra
+import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
+
+# Input data files are available in the read-only "../input/" directory
+# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
+
+import os
+for dirname, _, filenames in os.walk('/kaggle/input'):
+    for filename in filenames:
+        print(os.path.join(dirname, filename))
+
+# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
+# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
+

/kaggle/input/learn-ai-bbc/BBC News Train.csv
+/kaggle/input/learn-ai-bbc/BBC News Sample Solution.csv
+/kaggle/input/learn-ai-bbc/BBC News Test.csv
+

from itertools import permutations
+import matplotlib.pyplot as plt
+from sklearn.decomposition import NMF
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.linear_model import LogisticRegression
+from sklearn.metrics import accuracy_score, confusion_matrix
+from sklearn.model_selection import train_test_split
+from sklearn.svm import LinearSVC
+from wordcloud import WordCloud
+

data_train = pd.read_csv("/kaggle/input/learn-ai-bbc/BBC News Train.csv")
+data_test = pd.read_csv("/kaggle/input/learn-ai-bbc/BBC News Test.csv")
+sample_solution = pd.read_csv("/kaggle/input/learn-ai-bbc/BBC News Sample Solution.csv")
+

print(data_train.info())
+data_train.head()
+

<class 'pandas.core.frame.DataFrame'>
+RangeIndex: 1490 entries, 0 to 1489
+Data columns (total 3 columns):
+ #   Column     Non-Null Count  Dtype 
+---  ------     --------------  ----- 
+ 0   ArticleId  1490 non-null   int64 
+ 1   Text       1490 non-null   object
+ 2   Category   1490 non-null   object
+dtypes: int64(1), object(2)
+memory usage: 35.1+ KB
+None
+

data_train = data_train.drop_duplicates(subset = ["Text"])
+data_train.Category = pd.Categorical(data_train.Category)
+

data_train.Category.value_counts().plot.pie(autopct='%1.1f%%', ylabel='')
+

<Axes: >

print(data_test.info())
+data_test.head()
+

<class 'pandas.core.frame.DataFrame'>
+RangeIndex: 735 entries, 0 to 734
+Data columns (total 2 columns):
+ #   Column     Non-Null Count  Dtype 
+---  ------     --------------  ----- 
+ 0   ArticleId  735 non-null    int64 
+ 1   Text       735 non-null    object
+dtypes: int64(1), object(1)
+memory usage: 11.6+ KB
+None
+

print(sample_solution.info())
+sample_solution.head()
+

<class 'pandas.core.frame.DataFrame'>
+RangeIndex: 735 entries, 0 to 734
+Data columns (total 2 columns):
+ #   Column     Non-Null Count  Dtype 
+---  ------     --------------  ----- 
+ 0   ArticleId  735 non-null    int64 
+ 1   Category   735 non-null    object
+dtypes: int64(1), object(1)
+memory usage: 11.6+ KB
+None
+

vectoriser = TfidfVectorizer(stop_words="english") # lowercase is True by default and analyzer==word.
+combined_text = pd.concat([data_train.Text, data_test.Text], ignore_index=True)
+vectorised = vectoriser.fit_transform(combined_text)
+print(vectorised.shape)
+

(2175, 29126)
+

word_frequencies = pd.DataFrame(vectorised.toarray(), columns=vectoriser.get_feature_names_out()).T.sum(axis=1)
+wordcloud = WordCloud(background_color='white').generate_from_frequencies(word_frequencies.to_dict())
+plt.imshow(wordcloud)
+plt.axis('off')
+plt.show()
+

nmf = NMF(n_components=5)
+train_pred = nmf.fit_transform(vectorised).argmax(axis=1)[:1440]
+

def label_permute_compare(ytdf,yp):
+    """
+    ytdf: labels column from dataframe object.
+    yp: label prediction output.
+    Returns permuted label order and accuracy.
+    Example output: ('business', 'politics', 'sport', 'entertainment', 'tech'), 0.74 .
+    """
+    y_true = ytdf
+    best_acc = 0.
+    for perm in permutations(ytdf.cat.categories):
+        y_pred = [perm[i] for i in yp]
+        accuracy = accuracy_score(y_true, y_pred)
+        if accuracy > best_acc:
+            best_acc = accuracy
+            best_perm = perm
+    return best_perm, best_acc
+
+labels, acc = label_permute_compare(data_train.Category, train_pred)
+print("Accuracy: %.1f%%" % (acc * 100))
+confusion_matrix(data_train.Category, [labels[x] for x in train_pred])
+

Accuracy: 91.7%
+

array([[302,   1,  17,   1,  14],
+       [  5, 211,   7,   3,  37],
+       [ 13,   0, 242,   2,   9],
+       [  1,   2,   0, 339,   0],
+       [  0,   3,   0,   4, 227]])

text_without_nums = combined_text.str.replace(r'\d+', 'NUM', regex=True)
+vectorised_without_nums = TfidfVectorizer(stop_words="english").fit_transform(text_without_nums)
+without_nums_pred = NMF(5).fit_transform(vectorised_without_nums).argmax(axis=1)[:1440]
+labels_without_num, acc_without_num = label_permute_compare(data_train.Category, without_nums_pred)
+print("Accuracy: %.1f%%" % (acc_without_num * 100))
+confusion_matrix(data_train.Category, [labels_without_num[x] for x in without_nums_pred])
+

Accuracy: 88.0%
+

array([[287,   1,  30,   1,  16],
+       [ 20, 200,   8,   6,  29],
+       [ 12,   0, 241,   5,   8],
+       [ 19,   2,   0, 321,   0],
+       [  7,   3,   0,   6, 218]])

def gen_model(vectoriser):
+    model = {"vectoriser": vectoriser}
+    vectorised = vectoriser.fit_transform(combined_text)
+    nmf = NMF(n_components=5)
+    model["nmf"] = nmf
+    y_pred = nmf.fit_transform(vectorised).argmax(axis=1)[:1440]
+    model["pred"] = y_pred
+    model["labels"], model["acc"] = label_permute_compare(data_train.Category, y_pred)
+    return model
+
+best_accuracy = 0.
+for norm in ("l1", "l2", None):
+    for max_features in (2000, 4000, 6000, 8000, None):
+        model = gen_model(TfidfVectorizer(stop_words="english", norm=norm, max_features=max_features))
+        print("norm=%s, max_features=%s, acc=%.5f%%" % (norm, max_features, model["acc"] * 100))
+        if model["acc"] > best_accuracy:
+            best_model = model
+            best_accuracy = model["acc"]
+print("Best model accuracy = %.3f%%" % (best_model["acc"] * 100))
+confusion_matrix(data_train.Category, [best_model["labels"][x] for x in best_model["pred"]])
+

norm=l1, max_features=2000, acc=90.20833%
+norm=l1, max_features=4000, acc=90.27778%
+norm=l1, max_features=6000, acc=90.97222%
+norm=l1, max_features=8000, acc=90.97222%
+norm=l1, max_features=None, acc=90.69444%
+norm=l2, max_features=2000, acc=91.25000%
+norm=l2, max_features=4000, acc=91.73611%
+norm=l2, max_features=6000, acc=91.52778%
+norm=l2, max_features=8000, acc=91.45833%
+norm=l2, max_features=None, acc=91.73611%
+norm=None, max_features=2000, acc=47.22222%
+norm=None, max_features=4000, acc=42.29167%
+norm=None, max_features=6000, acc=42.01389%
+norm=None, max_features=8000, acc=40.69444%
+norm=None, max_features=None, acc=45.69444%
+Best model accuracy = 91.736%
+

array([[304,   1,  18,   0,  12],
+       [  5, 209,   7,   4,  38],
+       [ 13,   0, 243,   2,   8],
+       [  2,   2,   0, 338,   0],
+       [  0,   3,   0,   4, 227]])

vectorised = best_model["vectoriser"].transform(combined_text)
+test_pred = best_model["nmf"].transform(vectorised).argmax(axis=1)[1440:]
+

sample_solution.Category = [best_model["labels"][x] for x in test_pred]
+print("The order of solution df and test df is equivalent:", all(sample_solution.ArticleId == data_test.ArticleId))
+print(sample_solution)
+sample_solution.to_csv("submission.csv", index=False)
+

The order of solution df and test df is equivalent: True
+     ArticleId       Category
+0         1018          sport
+1         1319           tech
+2         1138          sport
+3          459       business
+4         1020          sport
+..         ...            ...
+730       1923       business
+731        373  entertainment
+732       1704           tech
+733        206       business
+734        471       politics
+
+[735 rows x 2 columns]
+

train_vectorised = best_model["vectoriser"].transform(data_train.Text)
+X_train, X_test, y_train, y_test = train_test_split(train_vectorised, data_train.Category, test_size=.2, random_state=4)
+clf = LogisticRegression(random_state=4).fit(X_train, y_train)
+print("Training score: %.1f%%" % (clf.score(X_train, y_train) * 100))
+print("Test score: %.1f%%" % (clf.score(X_test, y_test) * 100))
+confusion_matrix(y_test, clf.predict(X_test))
+

Training score: 99.7%
+Test score: 96.9%
+

array([[69,  0,  0,  0,  1],
+       [ 0, 46,  0,  0,  0],
+       [ 4,  0, 54,  0,  0],
+       [ 0,  0,  0, 65,  0],
+       [ 0,  3,  0,  1, 45]])

print("Training size: 60%")
+X_train, X_test, y_train, y_test = train_test_split(train_vectorised, data_train.Category, test_size=.4, random_state=4)
+clf = LogisticRegression(random_state=4).fit(X_train, y_train)
+print("Training score: %.1f%%" % (clf.score(X_train, y_train) * 100))
+print("Test score: %.1f%%" % (clf.score(X_test, y_test) * 100))
+print("\nTraining size: 40%")
+X_train, X_test, y_train, y_test = train_test_split(train_vectorised, data_train.Category, test_size=.6, random_state=4)
+clf = LogisticRegression(random_state=4).fit(X_train, y_train)
+print("Training score: %.1f%%" % (clf.score(X_train, y_train) * 100))
+print("Test score: %.1f%%" % (clf.score(X_test, y_test) * 100))
+print("\nTraining size: 20%")
+X_train, X_test, y_train, y_test = train_test_split(train_vectorised, data_train.Category, test_size=.8, random_state=4)
+clf = LogisticRegression(random_state=4).fit(X_train, y_train)
+print("Training score: %.1f%%" % (clf.score(X_train, y_train) * 100))
+print("Test score: %.1f%%" % (clf.score(X_test, y_test) * 100))
+

Training size: 60%
+Training score: 99.8%
+Test score: 97.0%
+
+Training size: 40%
+Training score: 99.8%
+Test score: 96.5%
+
+Training size: 20%
+Training score: 100.0%
+Test score: 92.4%
+

print("Training size: 80%")
+X_train, X_test, y_train, y_test = train_test_split(train_vectorised, data_train.Category, test_size=.2, random_state=4)
+clf = LinearSVC(random_state=4).fit(X_train, y_train)
+print("Training score: %.1f%%" % (clf.score(X_train, y_train) * 100))
+print("Test score: %.1f%%" % (clf.score(X_test, y_test) * 100))
+

Training size: 80%
+Training score: 100.0%
+Test score: 96.9%
+

print("Training size: 60%")
+X_train, X_test, y_train, y_test = train_test_split(train_vectorised, data_train.Category, test_size=.4, random_state=4)
+clf = LinearSVC(random_state=4).fit(X_train, y_train)
+print("Training score: %.1f%%" % (clf.score(X_train, y_train) * 100))
+print("Test score: %.1f%%" % (clf.score(X_test, y_test) * 100))
+print("\nTraining size: 40%")
+X_train, X_test, y_train, y_test = train_test_split(train_vectorised, data_train.Category, test_size=.6, random_state=4)
+clf = LinearSVC(random_state=4).fit(X_train, y_train)
+print("Training score: %.1f%%" % (clf.score(X_train, y_train) * 100))
+print("Test score: %.1f%%" % (clf.score(X_test, y_test) * 100))
+print("\nTraining size: 20%")
+X_train, X_test, y_train, y_test = train_test_split(train_vectorised, data_train.Category, test_size=.8, random_state=4)
+clf = LinearSVC(random_state=4).fit(X_train, y_train)
+print("Training score: %.1f%%" % (clf.score(X_train, y_train) * 100))
+print("Test score: %.1f%%" % (clf.score(X_test, y_test) * 100))
+

Training size: 60%
+Training score: 100.0%
+Test score: 97.2%
+
+Training size: 40%
+Training score: 100.0%
+Test score: 96.9%
+
+Training size: 20%
+Training score: 100.0%
+Test score: 95.7%
+

+

	ArticleId	Text	Category
0	1833	worldcom ex-boss launches defence lawyers defe...	business
1	154	german business confidence slides german busin...	business
2	1101	bbc poll indicates economic gloom citizens in ...	business
3	1976	lifestyle governs mobile choice faster bett...	tech
4	917	enron bosses in $168m payout eighteen former e...	business

BBC News Classification Kaggle Mini-Project¶

Project setup¶

Inspection¶

Training data¶

Test data and provided sample solution¶

Preprocessing¶

An idea to increase performance¶

Building and training the NMF model¶

Try replacing the numbers with NUM¶

Changing the hyperparameters¶

Test on data_test¶

Building and training the Supervised Learning model¶

Trying the same with LinearSVC¶

References¶

	ArticleId	Text
0	1018	qpr keeper day heads for preston queens park r...
1	1319	software watching while you work software that...
2	1138	d arcy injury adds to ireland woe gordon d arc...
3	459	india s reliance family feud heats up the ongo...
4	1020	boro suffer morrison injury blow middlesbrough...

	ArticleId	Category
0	1018	sport
1	1319	tech
2	1138	business
3	459	entertainment
4	1020	politics