For coursework 2 you will be asked to train and evaluate three different classifiers on the iris dataset: a Naïve Bayes classifier, a Random Forest classifier, and a kNN classifier. You will be asked to answer a series of questions relating to each individual model, as well as questions comparing the models.
You are free to use the sklearn library.
Notes:
- Remember to comment all of your code (see here for tips: https://stackabuse.com/commenting-python-code/). You can also make use of Jupyter Markdown, where appropriate, to improve the layout of your code and documentation.
- Please add docstrings to all of your functions, so that users can get information on inputs/outputs and what each function does by typing SHIFT+TAB over the function name (see the example sketch after these notes). For more detail on Python docstrings, see here: https://numpydoc.readthedocs.io/en/latest/format.html
- When a question allows a free-form answer (e.g. what do you observe?), create a new markdown cell below and answer the question in the notebook.
- Always save your notebook when you are done (this is not automatic)!
- Upload your completed notebook using the VLE.
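For illustration, a minimal numpydoc-style docstring might look like the following (petal_ratio is a made-up example function, not part of the coursework):

def petal_ratio(length, width):
    """Return the ratio of petal length to petal width.

    Parameters
    ----------
    length : float
        Petal length in cm.
    width : float
        Petal width in cm.

    Returns
    -------
    float
        The length divided by the width.
    """
    return length / width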
Plagiarism: please make sure that the material you submit has been created by you. Any sources you use for code should be properly referenced. Your code will be checked for plagiarism using appropriate software.
Marking
The grades in this coursework are allocated approximately as follows:
| | mark |
|---|---|
| Code | 7 |
| Code report/comments | 6 |
| Model questions | 14 |
| Model comparison questions | 18 |
| Total available | 45 |
Remember to save your notebook as “CW2.ipynb”. It is a good idea to re-run the whole thing before saving and submitting.
1 Classifiers [7 marks total]
Code and train your three classifiers in the cells below the corresponding headers. You do not need to implement cross-validation in this coursework; simply fit the data. You are free to use sklearn and other packages where necessary.
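As a reminder, the basic pattern used throughout this section is to fit on a training split and then score on a held-out split. A minimal sketch (GaussianNB is used purely as an example; any sklearn classifier follows the same fit/predict interface):

from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

# Load the data and make a single train/test split
iris_demo = datasets.load_iris()
X_train, X_test, y_train, y_test = train_test_split(
    iris_demo.data, iris_demo.target, test_size=0.3, random_state=0)

# Fit on the training split, then report mean accuracy on the test split
demo_clf = GaussianNB().fit(X_train, y_train)
print(demo_clf.score(X_test, y_test))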
# import datasets
from sklearn import datasets
# load the iris dataset
iris = datasets.load_iris()
#print(iris.DESCR) # print dataset description
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, MultinomialNB, CategoricalNB, BernoulliNB, ComplementNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt
import numpy as np
import sklearn.naive_bayes
import seaborn as sns
import pandas as pd
random_state = 4003
split_ratios = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
1.1 Naïve Bayes Classifier [2]
Train a Naïve Bayes classifier in Python.
Use your code to fit the data given above.
def runNaiveBayes(X, y, split_ratios):
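    """Train several Naïve Bayes variants over a range of train/test splits.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Feature matrix.
    y : array-like of shape (n_samples,)
        Target labels.
    split_ratios : float or list of float
        Test-set fraction(s) passed to train_test_split.

    Returns
    -------
    list of dict
        One dict per (split_ratio, algorithm) pair with keys
        'split_ratio', 'algorithm' and 'accuracy'.
    """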
# Initialize a list to store results
results = []
# List of models to run
model_sets = [
{"model": GaussianNB(), 'algorithm': "Gaussian"},
{"model": MultinomialNB(), 'algorithm': "Multinomial"},
{"model": MultinomialNB(alpha=0, force_alpha=True), 'algorithm': "Multinomial (No smoothing)"},
{"model": ComplementNB(), 'algorithm': "Complement"},
{"model": ComplementNB(alpha=0, force_alpha=True), 'algorithm': "Complement (No smoothing)"},
{"model": BernoulliNB(), 'algorithm': "Bernoulli"},
{"model": BernoulliNB(alpha=0, force_alpha=True), 'algorithm': "Bernoulli (No smoothing)"},
#{"model": CategoricalNB(), 'algorithm': "Categorical"},
#{"model": CategoricalNB(alpha=0, force_alpha=False), 'algorithm': "Categorical (No smoothing)"},
]
# Check if split_ratios is a single decimal or a list
if not isinstance(split_ratios, list):
split_ratios = [split_ratios]
# Loop through each split ratio
for split_ratio in split_ratios:
# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=split_ratio, random_state=random_state)
# Loop through each model set
for model_set in model_sets:
# Train and predict
y_pred = model_set['model'].fit(X_train, y_train).predict(X_test)
# Calculate accuracy
accuracy = round((X_test.shape[0] - (y_test != y_pred).sum()) / X_test.shape[0], 4)
# Add result to the results list
results.append({
'split_ratio': split_ratio,
'algorithm': model_set['algorithm'],
'accuracy': accuracy
})
return results
def printReport(results):
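    """Print a plain-text accuracy table, grouped by algorithm.

    Parameters
    ----------
    results : list of dict
        Output of runNaiveBayes (keys 'algorithm', 'split_ratio', 'accuracy').
    """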
# Sort results by 'description' and then 'split_ratio'
sorted_results = sorted(results, key=lambda x: (x['algorithm'], x['split_ratio']))
# Generate report
print("Performance Report:")
print("{:<20} {:<15} {:<15}".format('Model', 'Split Ratio', 'Accuracy'))
print('-' * 50) # To print a separator line
previous_description = None # Variable to keep track of the last printed description
for result in sorted_results:
current_description = result['algorithm']
# Print the description only if it's different from the last one
if current_description != previous_description:
print("{:<20} {:<15} {:<15}".format(current_description, result['split_ratio'], result['accuracy']))
previous_description = current_description
else:
print("{:<20} {:<15} {:<15}".format('', result['split_ratio'], result['accuracy']))
def summarizeNaiveBayesAccuracy(results):
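    """Plot a heatmap of accuracy by split ratio (rows) and algorithm (columns).

    Parameters
    ----------
    results : list of dict
        Output of runNaiveBayes.
    """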
# Extract unique split_ratios and algorithms from results
split_ratios = sorted(set([res['split_ratio'] for res in results]))
algorithms = sorted(set([res['algorithm'] for res in results]))
# Create an empty numpy array to store accuracy scores
accuracy_data = np.zeros((len(split_ratios), len(algorithms)))
# Populate accuracy_data with accuracy scores
for res in results:
row_idx = split_ratios.index(res['split_ratio'])
col_idx = algorithms.index(res['algorithm'])
accuracy_data[row_idx, col_idx] = res['accuracy']
# Create the heatmap
plt.figure(figsize=(10, 6))
im = plt.imshow(accuracy_data, cmap="viridis", aspect='auto')
# Add color bar
plt.colorbar(label="Accuracy")
# Annotate each cell with the numeric value
for i in range(len(split_ratios)):
for j in range(len(algorithms)):
plt.text(j, i, f"{accuracy_data[i, j]:.4f}", ha='center', va='center', color='w')
# Add labels and title
plt.title('Accuracy Across Configurations')
plt.xticks(np.arange(len(algorithms)), algorithms, rotation=90)
plt.yticks(np.arange(len(split_ratios)), split_ratios)
plt.xlabel('Algorithms')
plt.ylabel('split_ratio')
plt.show()
naive_bayes_results = runNaiveBayes(iris['data'], iris['target'], split_ratios)
#printReport(naive_bayes_results)
(Cell output: repeated RuntimeWarning messages from sklearn/naive_bayes.py, "divide by zero encountered in log" and "invalid value encountered in add", raised by the no-smoothing (alpha=0) model variants.)
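These warnings are expected for the alpha=0 ("no smoothing") variants: without smoothing, some estimated feature probabilities are exactly 0 or 1, so np.log(0) is evaluated when computing the joint log-likelihood. If you want a clean notebook output, one option is to silence them explicitly; a sketch using the standard-library warnings module (assuming you have first confirmed the warnings are benign for your purposes):

import warnings

with warnings.catch_warnings():
    # Suppress the expected divide-by-zero / invalid-value RuntimeWarnings
    # raised by the no-smoothing Naïve Bayes variants.
    warnings.simplefilter("ignore", category=RuntimeWarning)
    naive_bayes_results = runNaiveBayes(iris['data'], iris['target'], split_ratios)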
1.2 Random Forest Classifier [3]
Train a Random Forest classifier in Python. Use your code to fit the data given above.
Evaluate the feature importances of the model.
Visualise the feature importance.
def runRandomForest(X, y, split_ratios, n_estimators=[10, 50, 100]):
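    """Train Random Forest classifiers over split ratios and forest sizes.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Feature matrix.
    y : array-like of shape (n_samples,)
        Target labels.
    split_ratios : float or list of float
        Test-set fraction(s) passed to train_test_split.
    n_estimators : list of int, default [10, 50, 100]
        Forest sizes to evaluate.

    Returns
    -------
    list of dict
        One dict per configuration with keys 'split_ratio', 'n_estimators',
        'accuracy' and 'feature_importances'.
    """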
# Initialize a list to store results
results = []
# Check if split_ratios is a single decimal or a list
if not isinstance(split_ratios, list):
split_ratios = [split_ratios]
# Loop through each split ratio
for split_ratio in split_ratios:
# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=split_ratio, random_state=random_state)
# Loop through each number of estimators
for n_estimator in n_estimators:
# Initialize and train the Random Forest classifier
model = RandomForestClassifier(n_estimators=n_estimator, random_state=random_state)
model.fit(X_train, y_train)
# Make predictions
y_pred = model.predict(X_test)
# Calculate accuracy
accuracy = round((X_test.shape[0] - (y_test != y_pred).sum()) / X_test.shape[0], 4)
# Add result to the results list
results.append({
'split_ratio': split_ratio,
'n_estimators': n_estimator,
'accuracy': accuracy,
'feature_importances': model.feature_importances_
})
return results
def plotFeatureImportances(results, feature_names=None):
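    """Plot a grid of horizontal bar charts of feature importances.

    One subplot per (split_ratio, n_estimators) configuration, with a
    shared x-axis range so the panels are directly comparable.

    Parameters
    ----------
    results : list of dict
        Output of runRandomForest.
    feature_names : list of str, optional
        Labels for the features; generic names are generated if omitted.
    """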
# Initialize the subplot grid
split_ratios = sorted(set([res['split_ratio'] for res in results]))
n_estimators = sorted(set([res['n_estimators'] for res in results]))
num_rows = len(split_ratios)
num_cols = len(n_estimators)
fig, axes = plt.subplots(num_rows, num_cols, figsize=(15, 5 * num_rows), squeeze=False)  # squeeze=False keeps axes 2-D even when there is a single row or column
# Determine the maximum feature importance value across all results
max_importance = max([max(res['feature_importances']) for res in results])
# Loop through each split ratio and estimator pair in results
for res in results:
row_idx = split_ratios.index(res['split_ratio'])
col_idx = n_estimators.index(res['n_estimators'])
# Plotting on the appropriate subplot
ax = axes[row_idx, col_idx]
# Visualize feature importance
if feature_names is None:
feature_names = [f'feature_{i}' for i in range(len(res['feature_importances']))]
sorted_idx = np.argsort(res['feature_importances'])
ax.barh(range(len(res['feature_importances'])), res['feature_importances'][sorted_idx], align='center')
ax.set_yticks(range(len(res['feature_importances'])))
ax.set_yticklabels(np.array(feature_names)[sorted_idx])
ax.set_xlabel('Feature Importance')
ax.set_title(f'n_estimators={res["n_estimators"]}, split_ratio={res["split_ratio"]}')
# Set the same x-axis range for all subplots
ax.set_xlim([0, max_importance])
# Show the full grid of plots
plt.tight_layout()
plt.show()
def summarizeFeatureImportance(results, feature_names=None):
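    """Plot one heatmap per feature showing its importance across configurations.

    Parameters
    ----------
    results : list of dict
        Output of runRandomForest.
    feature_names : list of str, optional
        Labels for the features; generic names are generated if omitted.
    """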
# Extract unique split_ratios and n_estimators from results
split_ratios = sorted(set([res['split_ratio'] for res in results]))
n_estimators = sorted(set([res['n_estimators'] for res in results]))
# If feature_names is not provided, generate default feature names
if feature_names is None:
feature_names = [f'Feature_{i}' for i in range(len(results[0]['feature_importances']))]
# Create an empty dictionary to store feature importances
feature_data = {name: np.zeros((len(split_ratios), len(n_estimators))) for name in feature_names}
# Populate feature_data with feature importances
for res in results:
row_idx = split_ratios.index(res['split_ratio'])
col_idx = n_estimators.index(res['n_estimators'])
for i, importance in enumerate(res['feature_importances']):
feature_data[feature_names[i]][row_idx, col_idx] = importance
# Create the heatmaps
for feature, data in feature_data.items():
plt.figure(figsize=(10, 6))
im = plt.imshow(data, cmap="viridis", aspect='auto')
# Add color bar
plt.colorbar(label="Feature Importance")
# Annotate each cell with the numeric value
for i in range(len(split_ratios)):
for j in range(len(n_estimators)):
plt.text(j, i, f"{data[i, j]:.2f}", ha='center', va='center', color='w')
# Add labels and title
plt.title(f'Feature Importance of {feature} Across Configurations')
plt.xticks(np.arange(len(n_estimators)), n_estimators, rotation=45)
plt.yticks(np.arange(len(split_ratios)), split_ratios)
plt.xlabel('n_estimators')
plt.ylabel('split_ratio')
plt.show()
def summarizeRandomForrestAccuracy(results):
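    """Plot a heatmap of accuracy by split ratio (rows) and n_estimators (columns).

    Parameters
    ----------
    results : list of dict
        Output of runRandomForest.
    """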
# Extract unique split_ratios and n_estimators from results
split_ratios = sorted(set([res['split_ratio'] for res in results]))
n_estimators = sorted(set([res['n_estimators'] for res in results]))
# Create an empty numpy array to store accuracy scores
accuracy_data = np.zeros((len(split_ratios), len(n_estimators)))
# Populate accuracy_data with accuracy scores
for res in results:
row_idx = split_ratios.index(res['split_ratio'])
col_idx = n_estimators.index(res['n_estimators'])
accuracy_data[row_idx, col_idx] = res['accuracy']
# Create the heatmap
plt.figure(figsize=(10, 6))
im = plt.imshow(accuracy_data, cmap="viridis", aspect='auto')
# Add color bar
plt.colorbar(label="Accuracy")
# Annotate each cell with the numeric value
for i in range(len(split_ratios)):
for j in range(len(n_estimators)):
plt.text(j, i, f"{accuracy_data[i, j]:.4f}", ha='center', va='center', color='w')
# Add labels and title
plt.title('Accuracy Across Configurations')
plt.xticks(np.arange(len(n_estimators)), n_estimators, rotation=45)
plt.yticks(np.arange(len(split_ratios)), split_ratios)
plt.xlabel('n_estimators')
plt.ylabel('split_ratio')
plt.show()
def remove_nth_element(arr, n):
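    """Return a copy of a 2-D array with the n-th column removed.

    Parameters
    ----------
    arr : array-like of shape (n_samples, n_features)
        Input array.
    n : int
        Index of the column to drop.

    Returns
    -------
    numpy.ndarray
        Array with the selected column removed.
    """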
return np.array([[item for index, item in enumerate(inner_list) if index != n] for inner_list in arr])
random_forest_results = runRandomForest(iris['data'], iris['target'], split_ratios, [5,10,25,50,75,100])
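The helper functions defined above can then be used to inspect these results, for example (iris['feature_names'] comes with the sklearn dataset bunch):

# Heatmap of accuracy across split ratios and forest sizes
summarizeRandomForrestAccuracy(random_forest_results)

# Per-configuration bar charts of feature importances
plotFeatureImportances(random_forest_results, feature_names=iris['feature_names'])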