
ARM Data Quality Office ML Spike Detection

# Data Manipulation
import numpy as np

# Data Preprocessing
from sklearn.preprocessing import StandardScaler

# Model Training
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
# Performance Evaluation
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve, auc, precision_recall_curve

# Plotting
import matplotlib.pyplot as plt
import plotly.graph_objs as go

# Hyperparameter Tuning
from sklearn.model_selection import GridSearchCV

# Dimensionality Reduction
from sklearn.decomposition import PCA

Load Data

import act
# Set your username and token here!
# Read more on how to retrieve your token:
# https://arm-doe.github.io/ACT/API/generated/act.discovery.download_arm_data.html#act.discovery.download_arm_data
username = ''  # <-- your ARM username
token = ''     # <-- your ARM access token

datastream = "nsametC1.b1"
target_variable='rh_mean'
# Read NetCDF files
from datetime import datetime
startdate = '20200930'
enddate = '20201210'
files = act.discovery.download_arm_data(username, token, datastream, startdate, enddate)
ds = act.io.read_arm_netcdf(files)

Data Cleaning - Missingness Imputation

In this project, we aim to identify data quality (DQ) spikes in the time series of the variable 'rh_mean' (mean relative humidity).

# Check the target variable for missing values.
if ds[target_variable].isnull().any():
    print(f'{target_variable} contains NaN values.')

# Rechunk so the time dimension is contiguous, then fill gaps by
# linear interpolation (extrapolating at the edges).
ds_single_chunk = ds.chunk(dict(time=-1))
ds = ds_single_chunk.map(lambda x: x.interpolate_na(dim='time',
                                                    fill_value='extrapolate'))

# Verify the gaps are gone.
if ds[target_variable].isnull().any():
    print(f'{target_variable} still contains NaN values.')
else:
    print(f'{target_variable} no longer contains NaN values.')

Load Labels

label_filepath = 'labels_spike_detection_nsametC1.b1_rh_mean.txt'
with open(label_filepath, 'r') as f:
    lines = f.read().splitlines()
# Each line holds one spike interval: '<start> - <end>'
spike_intervals = np.array([line.split(' - ') for line in lines])

# Convert the timestamp strings to np.datetime64
vectorize_func = np.vectorize(np.datetime64)
spike_intervals = vectorize_func(spike_intervals)
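For reference, the parser above expects one interval per line in the form '<start> - <end>', where both timestamps are parseable by np.datetime64. A hypothetical example of the file contents (these values are made up, not actual labels):

2020-10-04T10:20 - 2020-10-04T10:21
2020-11-17T03:05 - 2020-11-17T03:07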

Time intervals of DQ spikes were retrieved from the DQ-Zoom Plotter.

Link: https://dq.arm.gov/dq-zoom/

spike_intervals # Time intervals retrieved from DQ-Zoom Plotter

Generate Sliding Windows

Windows without spikes

normal_mask = np.ones_like(ds.time.values, dtype=bool)
# Exclude data that is marked as spikes.
spike_time_ranges = [(np.datetime64(starttime, 'm'), np.datetime64(endtime, 'm'))
                     for starttime, endtime in spike_intervals]

for start, end in spike_time_ranges:
    normal_mask &= (ds.time.values < start) | (ds.time.values > end)
ds_normal = ds.isel(time=normal_mask)
ds_normal.time.shape, ds.time.shape
da_normal = ds_normal[target_variable].values
time_steps = 1
diff_normal = da_normal - np.roll(da_normal, shift=time_steps)  # See the Feature Engineering section below
diff_normal[:time_steps] = [0] * time_steps
# Generate sliding windows for data without any spikes.
windows_diff_normal = []
window_size = 20
stride = window_size  # non-overlapping windows
for i in range(0, len(diff_normal) - window_size + 1, stride):
    window = diff_normal[i: i + window_size]
    windows_diff_normal.append(window)

windows_diff_normal = np.array(windows_diff_normal)
windows_diff_normal.shape
windows_diff_normal[0]

Windows with spikes

da_time = ds.time.values
windows_time = []
window_size = 20
stride = window_size // 10
for i in range(0, len(da_time) - window_size + 1, stride):
    window = da_time[i: i + window_size]
    windows_time.append(window)

windows_time = np.array(windows_time)
windows_time.shape
da = ds[target_variable].values
# time_steps = 1
diff = da - np.roll(da, shift=time_steps)
diff[:time_steps] = [0] * time_steps
ds[f'diff_{target_variable}'] = ('time', diff)

# Generate sliding windows for data with spikes. Unlike the normal windows above,
# each window is anchored on a labeled spike interval, placed at a random offset.
window_size = 20
windows_diff_spike = []
for idx, (start, end) in enumerate(spike_intervals):
    start = np.datetime64(start, 'm')
    end = np.datetime64(end, 'm')
    spike_time_steps = (end - start).astype(int)
    # Random offset; assumes each labeled spike is shorter than the window.
    heading_time_steps = np.random.randint(0, window_size - spike_time_steps)
    start_slicing = start - heading_time_steps
    end_slicing = start_slicing + window_size - 1
    window = ds.sel(time=slice(start_slicing, end_slicing))[f'diff_{target_variable}'].values
    windows_diff_spike.append(window)

windows_diff_spike = np.array(windows_diff_spike)
windows_diff_spike.shape
windows_diff_spike[0]  # Can you spot a possible spike?

Feature Engineering

(Repeat) First Order Differencing

“The first difference of a time series is the series of changes from one period to the next.”

Reference

https://people.duke.edu/~rnau/411diff.htm
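For a concrete picture, the np.roll construction used in this notebook is equivalent to np.diff with the series' first value prepended, so the first entry of the differenced series is zero. A small sanity check on made-up numbers:

import numpy as np

x = np.array([1.0, 2.0, 4.0, 7.0, 11.0])   # hypothetical series

# First difference via np.roll (as used above), zeroing the wrapped-around entry
diff_roll = x - np.roll(x, shift=1)
diff_roll[:1] = 0

# Equivalent construction with np.diff, prepending the first value
assert np.allclose(diff_roll, np.diff(x, prepend=x[0]))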

# time_steps = 1
# diff =  da - np.roll(da, shift = time_steps)
# diff[:time_steps] = [0]*time_steps
# diff_normal =  da_normal - np.roll(da_normal, shift = time_steps)
# diff_spike =  da_spike - np.roll(da_spike, shift = time_steps)
# diff_normal[:time_steps] = [0]*time_steps
# diff_spike[:time_steps] = [0]*time_steps

Optional: Visualization of the differences between consecutive time steps

# plt.hist(diff, bins=500, edgecolor='green')
# plt.title('Distribution of 1st Order Differencing')
# plt.xlabel('Value')
# plt.ylabel('Frequency')
# plt.grid(True)
# plt.show()
# plt.hist(da, bins=500, edgecolor='blue')
# plt.title('Distribution of Original RH data')
# plt.xlabel('Value')
# plt.ylabel('Frequency')
# plt.grid(True)
# plt.show()
# # Box plot with outliers flagged by a mean +/- threshold*std rule
# def make_box_plot(data, markOutliers=True, threshold=3):

#     mu = np.mean(data)
#     std = np.std(data)
#     # Define outlier thresholds
#     # threshold = 3.0 means three standard deviations from the distribution mean
#     # Three standard deviations from the mean is a statistical rule that states that
#     # 99.7% of observed data falls within three standard deviations of the mean.
#     upper_threshold = mu + threshold * std
#     lower_threshold = mu - threshold * std
#     outliers = data[(data < lower_threshold) | (data > upper_threshold)]
    
#     plt.boxplot(data)

#     # Highlight outliers
#     plt.plot(np.ones_like(outliers), outliers, 'ro', label='Outliers')
    
#     # Add legend and labels
#     plt.title('Boxplot with Outliers Highlighted')
#     plt.xlabel('Data')
#     plt.ylabel('Values')
#     plt.legend()
#     plt.grid(True)
#     plt.show()
# make_box_plot(diff, threshold = 3)
# make_box_plot(da, threshold = 3)

Variance Inflation Factor

The Variance Inflation Factor (VIF) was originally designed as a measure of multicollinearity within a set of multiple regression variables. I adopted this concept to develop features for our spike detection task. For further details on using VIF to alleviate the effects of multicollinearity, please refer to this link.

https://www.investopedia.com/terms/v/variance-inflation-factor.asp
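To make the idea concrete before the full loop below, here is a toy illustration (the numbers are made up): removing an obvious spike from a window collapses the variance, so the ratio of the full-window variance to the variance with the spike removed is far above 1, while for a spike-free window the same ratio stays near 1.

import numpy as np

window = np.array([0.1, -0.2, 0.0, 0.15, 12.0, -0.1])   # one obvious spike
full_var = np.var(window)
spike_idx = np.abs(window - window.mean()).argmax()      # most deviated point
var_without = np.var(np.delete(window, spike_idx))
print(full_var / max(var_without, 0.01))                 # >> 1 flags a spike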

window_variance_normal = np.var(windows_diff_normal)

window_vif_normal = []
num_vif = 5
min_var = 0.01
for w in windows_diff_normal:
    abs_deviation = np.abs(w - np.mean(w))
    # Find the most deviated data points within the time window.
    max_deviation_indices = abs_deviation.argsort()[-num_vif:][::-1]
    window_var = np.var(w)
    idx_container = []
    vif = []
    # Estimate how much the overall window variance is inflated by the most
    # deviated data points, excluding them from the window one at a time.
    for idx in max_deviation_indices:
        exclude_max_deviation_indices = [i not in idx_container for i in np.arange(len(w))]
        exclude_max_deviation_var = np.var(w[exclude_max_deviation_indices])
        idx_container.append(idx)
        vif.append(window_var / max(exclude_max_deviation_var, min_var))
    window_vif_normal.append(vif)
window_vif_normal = np.array(window_vif_normal)
window_vif_normal.shape
# window_vif_normal
# Repeat the same process for windows with spikes. 
window_variance_spike = np.var(windows_diff_spike)

window_vif_spike = []
num_vif = 5
min_var = 0.01
for w in windows_diff_spike:
    abs_deviation = np.abs(w - np.mean(w))
    max_deviation_indices = abs_deviation.argsort()[-num_vif:][::-1]
    window_var = np.var(w)
    idx_container = []
    vif = []
    for idx in max_deviation_indices:
        exclude_max_deviation_indices = [i not in idx_container for i in np.arange(len(w))]
        exclude_max_deviation_var = np.var(w[exclude_max_deviation_indices])
        idx_container.append(idx)
        vif.append(window_var / max(exclude_max_deviation_var, min_var))
    window_vif_spike.append(vif)
window_vif_spike = np.array(window_vif_spike)
window_vif_spike.shape
# window_vif_spike

Maximum Deviation

max_deviation_normal = np.max(windows_diff_normal, axis=1).reshape(-1,1)
max_deviation_normal.shape
max_deviation_spike = np.max(windows_diff_spike, axis=1).reshape(-1,1)
max_deviation_spike.shape
n_normal = max_deviation_normal.shape[0]
n_spike = max_deviation_spike.shape[0]
print(f'The class ratio is {n_normal}:{n_spike}. Highly imbalanced!')
print(f'If we predict all samples to be without DQ spikes, we can easily achieve '
      f'an impressive classification accuracy of {np.round(n_normal / (n_normal + n_spike) * 100, 1)}%.')

Concatenate labels and features

normal_label = [0] * windows_diff_normal.shape[0]
spike_label = [1] * windows_diff_spike.shape[0]
label_combined = np.concatenate((normal_label, spike_label), axis=0)

X_normal = np.hstack((window_vif_normal, max_deviation_normal))
X_spike = np.hstack((window_vif_spike, max_deviation_spike))
X = np.concatenate((X_normal, X_spike), axis=0)

label_combined.shape, X.shape

Shuffle

random_seed = 1568
np.random.seed(random_seed)
perm = np.random.permutation(X.shape[0])
X_shuffled = X[perm]
label_shuffled = label_combined[perm]

Data Splitting

Reference for train_test_split
https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html#sklearn.model_selection.train_test_split

from sklearn.model_selection import train_test_split
random_state = 1265
X_train, X_test, y_train, y_test = train_test_split(X_shuffled, 
                                                    label_shuffled,
                                                    test_size=0.2, 
                                                    random_state=random_state,
                                                    stratify=label_shuffled
                                                   )

Model Training

Logistic Regression

Logistic Regression
Logistic regression, despite its name, is a linear model for classification rather than regression. It is also known in the literature as logit regression, maximum-entropy classification (MaxEnt), or the log-linear classifier. In this model, the probabilities describing the possible outcomes of a single trial are modeled using a logistic function. The scikit-learn implementation is available in the LogisticRegression class and can fit binary, One-vs-Rest, or multinomial logistic regression with optional L2 or L1 regularization.

Reading Materials
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
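Concretely, for a binary problem the predicted probability is just the logistic (sigmoid) function applied to a linear score. A minimal sketch on hypothetical toy data (X_toy and y_toy are made up here, not part of this notebook's pipeline):

import numpy as np
from sklearn.linear_model import LogisticRegression

# Hypothetical 1-D toy data, just to show the mechanics
X_toy = np.array([[0.0], [1.0], [2.0], [3.0]])
y_toy = np.array([0, 0, 1, 1])
clf = LogisticRegression().fit(X_toy, y_toy)

# predict_proba applies the logistic function to the linear score w.x + b
scores = (X_toy @ clf.coef_.T + clf.intercept_).ravel()
probs_manual = 1.0 / (1.0 + np.exp(-scores))
assert np.allclose(probs_manual, clf.predict_proba(X_toy)[:, 1])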

Baseline Logistic Regression

# Train a Logistic Regression model using training data
lr = LogisticRegression(random_state=5645, max_iter=200).fit(X_train, y_train)
# Make predictions using the trained LR model
y_train_pred_lr = lr.predict(X_train)
# Calculate confusion matrix with training data
conf_mx_lr = confusion_matrix(y_train, y_train_pred_lr, labels=[1,0])
conf_mx_lr
# Evaluate model performance on testing data
y_test_pred_lr = lr.predict(X_test)
# Calculate confusion matrix with testing data
conf_mx_lr_test = confusion_matrix(y_test, y_test_pred_lr, labels=[1,0])
conf_mx_lr_test
y_train_pred_prob_lr = lr.predict_proba(X_train)[:,1]
y_test_pred_prob_lr = lr.predict_proba(X_test)[:,1]
def plot_performance_curves(y_train, y_train_pred_prob, y_test, y_test_pred_prob, postfix='NA'):
    
    fpr_train, tpr_train, _ = roc_curve(y_train, y_train_pred_prob, pos_label=1)
    precision_train, recall_train, _ = precision_recall_curve(y_train, y_train_pred_prob, pos_label=1)

    fpr_test, tpr_test, _ = roc_curve(y_test, y_test_pred_prob, pos_label=1)
    precision_test, recall_test, _ = precision_recall_curve(y_test, y_test_pred_prob, pos_label=1)
    
    roc_auc_train = auc(fpr_train, tpr_train)
    roc_auc_test  = auc(fpr_test, tpr_test)

    pr_auc_train  = auc(recall_train, precision_train)
    pr_auc_test   = auc(recall_test, precision_test)
    
    plt.figure(figsize=(12,5))
    plt.subplot(1, 2, 1)
    plt.plot(fpr_train, tpr_train, color='darkorange', lw=2, label='ROC curve (area = %0.4f) - Train' % roc_auc_train)
    plt.plot(fpr_test, tpr_test, color='orangered', lw=2, label='ROC curve (area = %0.4f) - Test' % roc_auc_test)
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'Receiver Operating Characteristic (ROC) Curve - {postfix}')
    plt.legend(loc="lower right")
                   
    plt.subplot(1, 2, 2)
    plt.plot(recall_train, precision_train, color='blue', lw=2, label='Precision-Recall curve (AUC = %0.2f) - Train' % pr_auc_train)
    plt.plot(recall_test, precision_test, color='aqua', lw=2, label='Precision-Recall curve (AUC = %0.2f) - Test' % pr_auc_test)
    plt.ylim([0.0, 1.05])
    plt.xlim([0.0, 1.0])
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title(f'Precision-Recall Curve - {postfix}')
    plt.legend(loc="lower left")

    plt.tight_layout()
    plt.show()
plot_performance_curves(y_train, y_train_pred_prob_lr, y_test, y_test_pred_prob_lr, 'Logistic Regression')

“In situations where the dataset is highly imbalanced, the ROC curve can give an overly optimistic assessment of the model’s performance. This optimism bias arises because the ROC curve’s false positive rate (FPR) can become very small when the number of actual negatives is large.”

Reference

https://juandelacalle.medium.com/how-and-why-i-switched-from-the-roc-curve-to-the-precision-recall-curve-to-analyze-my-imbalanced-6171da91c6b8#:~:text=In%20situations%20where%20the%20dataset,of%20actual%20negatives%20is%20large.
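A quick back-of-the-envelope example with hypothetical counts makes the point:

# Hypothetical confusion-matrix counts under heavy class imbalance
tn, fp, fn, tp = 9900, 100, 20, 80   # 10,000 negatives, 100 positives
fpr = fp / (fp + tn)                 # 0.01 -- looks excellent on a ROC curve
precision = tp / (tp + fp)           # ~0.44 -- the PR curve tells the real story
print(fpr, precision)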

Penalized Logistic Regression

Ridge
Ridge regression addresses some of the problems of Ordinary Least Squares by imposing a penalty on the size of coefficients. The ridge coefficients minimize a penalized residual sum of squares.

Lasso
The Lasso is a linear model that estimates sparse coefficients. It is useful in some contexts due to its tendency to prefer solutions with fewer non-zero coefficients, effectively reducing the number of features upon which the given solution is dependent. For this reason, Lasso and its variants are fundamental to the field of compressed sensing. Under certain conditions, it can recover the exact set of non-zero coefficients.

Elastic Net
ElasticNet is a linear regression model trained with L1 and L2 prior as regularizer. This combination allows for learning a sparse model where few of the weights are non-zero like Lasso, while still maintaining the regularization properties of Ridge. We control the convex combination of L1 and L2 using the l1_ratio parameter. Elastic-net is useful when there are multiple features which are correlated with one another. Lasso is likely to pick one of these at random, while elastic-net is likely to pick both.

References
[1] Ridge: https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Ridge.html#sklearn.linear_model.Ridge

[2] Lasso: https://scikit-learn.org/stable/modules/linear_model.html#lasso

[3] Elastic Net: https://scikit-learn.org/stable/auto_examples/linear_model/plot_lasso_coordinate_descent_path.html#lasso-and-elastic-net
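The penalized model below uses the L2 (ridge-style) penalty. For reference, scikit-learn's LogisticRegression also supports L1 and elastic-net penalties, but only with compatible solvers; a sketch, not part of this notebook's tuned pipeline:

from sklearn.linear_model import LogisticRegression

# L1 (lasso-style) penalty: supported by the 'liblinear' and 'saga' solvers
lr_l1 = LogisticRegression(penalty='l1', solver='liblinear', C=0.1)

# Elastic-net mixes L1 and L2; only 'saga' supports it, and l1_ratio sets the mix
lr_en = LogisticRegression(penalty='elasticnet', solver='saga',
                           l1_ratio=0.5, C=0.1, max_iter=1000)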

lr_penalized = LogisticRegression(random_state=5645,
                                  max_iter=200,
                                  penalty='l2',
                                  C=0.1).fit(X_train, y_train)
y_train_pred_lr_penalized = lr_penalized.predict(X_train)
np.unique(y_train_pred_lr_penalized, return_counts=True)
conf_mx_lr_penalized = confusion_matrix(y_train, y_train_pred_lr_penalized, labels=[1,0])
conf_mx_lr_penalized
y_test_pred_lr_penalized = lr_penalized.predict(X_test)
conf_mx_lr_penalized_test = confusion_matrix(y_test, y_test_pred_lr_penalized, labels=[1,0])
conf_mx_lr_penalized_test
y_train_pred_prob_lr_penalized = lr_penalized.predict_proba(X_train)[:,1]
y_test_pred_prob_lr_penalized = lr_penalized.predict_proba(X_test)[:,1]
plot_performance_curves(y_train, y_train_pred_prob_lr_penalized, y_test, y_test_pred_prob_lr_penalized, 'Penalized Logistic Regression')

Decision Tree

Decision Trees (DTs) are a non-parametric supervised learning method used for classification and regression. The goal is to create a model that predicts the value of a target variable by learning simple decision rules inferred from the data features. A tree can be seen as a piecewise constant approximation.

References

[1] https://scikit-learn.org/stable/modules/tree.html

[2] https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html#sklearn.tree.DecisionTreeClassifier

dtree = DecisionTreeClassifier(random_state=5645, criterion='gini')
dtree.fit(X_train, y_train)
y_train_pred_dtree = dtree.predict(X_train)
np.unique(y_train_pred_dtree, return_counts=True)
conf_mx_dtree = confusion_matrix(y_train, y_train_pred_dtree, labels=[1,0])
conf_mx_dtree
# Evaluate model performance on testing data
y_test_pred_dtree = dtree.predict(X_test)
# Calculate confusion matrix with testing data
conf_mx_dtree_test = confusion_matrix(y_test, y_test_pred_dtree, labels=[1,0])
conf_mx_dtree_test
y_train_pred_prob_dtree = dtree.predict_proba(X_train)[:,1]
y_test_pred_prob_dtree = dtree.predict_proba(X_test)[:,1]
plot_performance_curves(y_train, y_train_pred_prob_dtree, y_test, y_test_pred_prob_dtree, 'Decision Tree')

Regularized Decision Tree

How to restrict the growth of a Decision Tree?

Reading Materials
https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html#sklearn.tree.DecisionTreeClassifier
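In scikit-learn, the usual levers are max_depth, min_samples_split, min_samples_leaf, max_leaf_nodes, and cost-complexity pruning via ccp_alpha. The cell below applies two of them (max_depth and min_samples_split), with two more left as commented-out suggestions.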

dtree_regularized = DecisionTreeClassifier(random_state=5645, 
                                           criterion='gini', 
                                           max_depth=30, 
                                           min_samples_split=10,
                                           # max_features='sqrt',
                                           # min_samples_leaf = 10
                                          )
dtree_regularized.fit(X_train, y_train)
y_train_pred_dtree_regularized = dtree_regularized.predict(X_train)
np.unique(y_train_pred_dtree_regularized, return_counts=True)
conf_mx_dtree_regularized = confusion_matrix(y_train, y_train_pred_dtree_regularized, labels=[1,0])
conf_mx_dtree_regularized
# Evaluate model performance on testing data
y_test_pred_dtree_regularized = dtree_regularized.predict(X_test)
# Calculate confusion matrix with testing data
conf_mx_dtree_regularized_test = confusion_matrix(y_test, y_test_pred_dtree_regularized, labels=[1,0])
conf_mx_dtree_regularized_test
y_train_pred_prob_dtree_regularized = dtree_regularized.predict_proba(X_train)[:,1]
y_test_pred_prob_dtree_regularized = dtree_regularized.predict_proba(X_test)[:,1]
plot_performance_curves(y_train, y_train_pred_prob_dtree_regularized, y_test, y_test_pred_prob_dtree_regularized, 'Regularized Decision Tree')

Random Forest

In random forests, each tree in the ensemble is built from a sample drawn with replacement (i.e., a bootstrap sample) from the training set.

Furthermore, when splitting each node during the construction of a tree, the best split is found through an exhaustive search of the feature values of either all input features or a random subset of size max_features.

The purpose of these two sources of randomness is to decrease the variance of the forest estimator. Indeed, individual decision trees typically exhibit high variance and tend to overfit. The injected randomness in forests yields decision trees with somewhat decoupled prediction errors. By taking an average of those predictions, some errors can cancel out. Random forests achieve a reduced variance by combining diverse trees, sometimes at the cost of a slight increase in bias. In practice, the variance reduction is often significant, hence yielding an overall better model.

References
[1] https://scikit-learn.org/stable/modules/ensemble.html#random-forests

[2] https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html#sklearn.ensemble.RandomForestClassifier
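One way to see the bootstrap at work is the out-of-bag (OOB) estimate, which scores each tree on the training samples left out of its bootstrap draw. A side sketch, not used elsewhere in this notebook (it assumes the X_train / y_train split created above):

from sklearn.ensemble import RandomForestClassifier

rf_oob = RandomForestClassifier(n_estimators=200, oob_score=True,
                                n_jobs=-1, random_state=1110)
rf_oob.fit(X_train, y_train)
print(rf_oob.oob_score_)   # accuracy estimated from out-of-bag samples only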

rnd = RandomForestClassifier(n_jobs=-1, random_state=1110)
rnd.fit(X_train, y_train)
y_train_pred_rnd = rnd.predict(X_train)
conf_mx_rnd = confusion_matrix(y_train, y_train_pred_rnd, labels=[1,0])
conf_mx_rnd
y_test_pred_rnd = rnd.predict(X_test)
conf_mx_rnd_test = confusion_matrix(y_test, y_test_pred_rnd, labels=[1,0])
conf_mx_rnd_test
y_train_pred_prob_rnd = rnd.predict_proba(X_train)[:,1]
y_test_pred_prob_rnd = rnd.predict_proba(X_test)[:,1]
plot_performance_curves(y_train, y_train_pred_prob_rnd, y_test, y_test_pred_prob_rnd, 'Random Forest')

Random Forest - Hyperparameter Tuning

Reading Materials
[1] https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html#sklearn.model_selection.GridSearchCV

[2] https://scikit-learn.org/stable/modules/classes.html#hyper-parameter-optimizers

param_grid_rnd = [{
    'max_leaf_nodes':[50, 100],
    'min_samples_leaf':[1, 20],
    'n_estimators':[100, 500],
    'max_features':["sqrt", None],
    'criterion': ["entropy"],
    'random_state':[1110]
}]

rnd_grid = GridSearchCV(rnd, param_grid_rnd, cv=3)
rnd_grid.fit(X_train, y_train)
rnd_grid.best_estimator_, rnd_grid.best_params_
y_train_pred_rnd_grid = rnd_grid.predict(X_train)
conf_mx_rnd_grid = confusion_matrix(y_train, y_train_pred_rnd_grid, labels=[1,0])
conf_mx_rnd_grid
y_test_pred_rnd_grid = rnd_grid.predict(X_test)
conf_mx_rnd_grid_test = confusion_matrix(y_test, y_test_pred_rnd_grid, labels=[1,0])
conf_mx_rnd_grid_test
y_train_pred_prob_rnd_grid = rnd_grid.predict_proba(X_train)[:,1]
y_test_pred_prob_rnd_grid = rnd_grid.predict_proba(X_test)[:,1]
plot_performance_curves(y_train, y_train_pred_prob_rnd_grid, y_test, y_test_pred_prob_rnd_grid, 'Random Forest - 3 Fold CV')

Use the best parameter values to build a Random Forest

rnd_select = RandomForestClassifier(n_jobs=-1,
                                    random_state=1110,
                                    # Update the following parameters with the best values you found with GridSearchCV.
                                    # max_leaf_nodes=50,
                                    # min_samples_leaf=1,
                                    # max_features='sqrt',
                                    # n_estimators=100,
                                    # criterion='entropy',
                                    )
rnd_select.fit(X_train, y_train)
y_train_pred_rnd_select = rnd_select.predict(X_train)
np.unique(y_train_pred_rnd_select, return_counts=True)
y_test_pred_rnd_select = rnd_select.predict(X_test)
np.unique(y_test_pred_rnd_select, return_counts=True)
conf_mx_rnd_select = confusion_matrix(y_train, y_train_pred_rnd_select, labels=[1,0])
conf_mx_rnd_select
conf_mx_rnd_select_test = confusion_matrix(y_test, y_test_pred_rnd_select, labels=[1,0])
conf_mx_rnd_select_test
y_train_pred_prob_rnd_select = rnd_select.predict_proba(X_train)[:,1]
y_test_pred_prob_rnd_select = rnd_select.predict_proba(X_test)[:,1]
plot_performance_curves(y_train, y_train_pred_prob_rnd_select, y_test, y_test_pred_prob_rnd_select, 'Random Forest - Best Parameters')

Dimensionality Reduction - PCA

Principal component analysis (PCA)

Linear dimensionality reduction using Singular Value Decomposition of the data to project it to a lower dimensional space.

References
[1] https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html#sklearn.decomposition.PCA

[2] https://scikit-learn.org/stable/auto_examples/decomposition/plot_pca_iris.html#sphx-glr-auto-examples-decomposition-plot-pca-iris-py
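As a side note, n_components can also be given as a fraction, in which case scikit-learn keeps the smallest number of components whose cumulative explained variance reaches that fraction. A sketch on made-up data (X_demo is hypothetical, not this notebook's feature matrix):

import numpy as np
from sklearn.decomposition import PCA

rng = np.random.default_rng(0)
X_demo = rng.normal(size=(200, 6))        # hypothetical 6-feature matrix

# Keep enough components to explain 95% of the variance
pca_95 = PCA(n_components=0.95).fit(X_demo)
print(pca_95.n_components_, pca_95.explained_variance_ratio_.sum())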

PCA for Visualization

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
pca = PCA()
transformed_X = pca.fit_transform(X_scaled)
pca.explained_variance_ratio_, sum(pca.explained_variance_ratio_[:3])
fig = go.Figure(data=[go.Scatter3d(
    x=transformed_X[:, 0],
    y=transformed_X[:, 1],
    z=transformed_X[:, 2],
    mode='markers',
    marker=dict(
        size=2,
        color=label_combined,  # Color by class label
        colorscale='Viridis',
        opacity=0.5
    ),
    text=['Class: {}'.format(label) for label in label_combined],
)])

# Set layout
fig.update_layout(scene=dict(
    xaxis_title='Principal Component 1',
    yaxis_title='Principal Component 2',
    zaxis_title='Principal Component 3',
), title='3D Scatter Plot of First 3 Principal Components')

fig.update_layout(width=800, height=600)
fig.show()

PCA Model Training - Random Forest

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
pca_train = PCA(n_components=3)
transformed_X_Train = pca_train.fit_transform(X_train_scaled)
transformed_X_Test = pca_train.transform(X_test_scaled)
rnd_pca = RandomForestClassifier(n_jobs=-1,
                                 random_state=1110,
                                 max_leaf_nodes=100,
                                 min_samples_leaf=1,
                                 max_features='sqrt',
                                 n_estimators=500,
                                 criterion='entropy')
rnd_pca.fit(transformed_X_Train, y_train)
y_train_pred_rnd_pca = rnd_pca.predict(transformed_X_Train)
np.unique(y_train_pred_rnd_pca, return_counts=True)
y_test_pred_rnd_pca = rnd_pca.predict(transformed_X_Test)
np.unique(y_test_pred_rnd_pca, return_counts=True)
conf_mx_rnd_pca = confusion_matrix(y_train, y_train_pred_rnd_pca, labels=[1,0])
conf_mx_rnd_pca
conf_mx_rnd_pca_test = confusion_matrix(y_test, y_test_pred_rnd_pca, labels=[1,0])
conf_mx_rnd_pca_test
y_train_pred_prob_rnd_pca = rnd_pca.predict_proba(transformed_X_Train)[:,1]
y_test_pred_prob_rnd_pca = rnd_pca.predict_proba(transformed_X_Test)[:,1]
plot_performance_curves(y_train, y_train_pred_prob_rnd_pca, y_test, y_test_pred_prob_rnd_pca, 'Random Forest - PCA')