
CSE572 Project 2 - Model Training

Mark Khusid

import pandas as pd
import numpy as np

from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
import seaborn as sns

import pickle

from scipy.integrate import simpson
from scipy.ndimage import uniform_filter1d
from scipy.signal import find_peaks

from sklearn.impute import KNNImputer

from sklearn.svm import SVC

from sklearn.decomposition import PCA

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

from sklearn.preprocessing import StandardScaler

from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

Load the datasets

insulin_data_1 = pd.read_csv('InsulinData.csv', low_memory=False)
insulin_data_2 = pd.read_csv('Insulin_patient2.csv', low_memory=False)
cgm_data_1 = pd.read_csv('CGMData.csv', low_memory=False)
cgm_data_2 = pd.read_csv('CGM_patient2.csv', low_memory=False)

View Dataframes

#insulin_data_1
#cgm_data_1
#insulin_data_2
#cgm_data_2

EDA on dataframes

#insulin_data_1.info()
#cgm_data_1.info()
#insulin_data_2.info()
#cgm_data_2.info()

Convert the ‘Date’ and ‘Time’ columns to datetime format in both datasets for easier manipulation

def preprocess_date(date_string):
    # Keep only the date portion; some entries include a trailing time component
    date_string = date_string.strip()
    parts = date_string.split()
    return parts[0] if parts else date_string
insulin_data_1['Date'] = insulin_data_1['Date'].apply(preprocess_date)
insulin_data_2['Date'] = insulin_data_2['Date'].apply(preprocess_date)
insulin_data_1['datetime'] = pd.to_datetime(insulin_data_1['Date'] + ' ' + insulin_data_1['Time'])
insulin_data_2['datetime'] = pd.to_datetime(insulin_data_2['Date'] + ' ' + insulin_data_2['Time'])

cgm_data_1['datetime'] = pd.to_datetime(cgm_data_1['Date'] + ' ' + cgm_data_1['Time'])
cgm_data_2['datetime'] = pd.to_datetime(cgm_data_2['Date'] + ' ' + cgm_data_2['Time'])
#insulin_data_1[['Date', 'Time', 'datetime']]
#insulin_data_2[['Date', 'Time', 'datetime']]
#cgm_data_1[['Date', 'Time', 'datetime']]
#cgm_data_2[['Date', 'Time', 'datetime']]

Search for non-zero BWZ Carb Input Data

# This datetime series contains the start times of all non-NaN, positive carb input
# entries from the column BWZ Carb Input (grams)
meal_times_1 = \
    insulin_data_1[~insulin_data_1['BWZ Carb Input (grams)'].isna() & \
    (insulin_data_1['BWZ Carb Input (grams)'] > 0)]['datetime']
meal_times_1
48 2018-02-12 09:15:45 129 2018-02-12 02:30:55 188 2018-02-11 20:33:18 207 2018-02-11 18:14:37 222 2018-02-11 16:27:04 ... 41265 2017-07-26 11:24:52 41274 2017-07-26 09:27:16 41347 2017-07-25 18:31:40 41393 2017-07-25 10:39:46 41401 2017-07-25 10:21:19 Name: datetime, Length: 747, dtype: datetime64[ns]
# This datetime series contains the start times of all non-NaN, positive carb input
# entries from the column BWZ Carb Input (grams)
meal_times_2 = \
    insulin_data_2[~insulin_data_2['BWZ Carb Input (grams)'].isna() & \
    (insulin_data_2['BWZ Carb Input (grams)'] > 0)]['datetime']
#meal_times_2

Extract Meal Data

Obtain CGM Data based on Length of After-Meal Window

# Initialize meal data list
meal_data_1 = []

# Loop through each meal start time
for meal_time in meal_times_1:
    # A meal window is 2 hours long
    meal_window = insulin_data_1[(insulin_data_1['datetime'] > meal_time) & 
                               (insulin_data_1['datetime'] <= (meal_time + pd.Timedelta(hours=2)))]
    #print(meal_window)

    # If that meal window is empty, get the corresponding CGM
    # data that starts at tm-30min and ends at tm+2hrs
    if meal_window.empty:
        # Step 4 condition (a): No meal between tm and tm+2hrs, use this stretch as meal data
        meal_cgm_data_1 = cgm_data_1[(cgm_data_1['datetime'] >= (meal_time - pd.Timedelta(minutes=30))) & 
                                 (cgm_data_1['datetime'] <= (meal_time + pd.Timedelta(hours=2)))]
    else:
        # Step 4 check for condition (b) or (c)
        next_meal_time = meal_window.iloc[0]['datetime']
        if next_meal_time < (meal_time + pd.Timedelta(hours=2)):
            # condition (b): There is a meal at tp between tm and tm+2hrs, use tp data
            meal_time = next_meal_time
            meal_cgm_data_1 = cgm_data_1[(cgm_data_1['datetime'] >= (meal_time - pd.Timedelta(minutes=30))) & 
                                     (cgm_data_1['datetime'] <= (meal_time + pd.Timedelta(hours=2)))]
        # Condition (c): There is a meal exactly at tm+2hrs, use tm+1hr 30min to tm+4hrs
        elif next_meal_time == (meal_time + pd.Timedelta(hours=2)):
            meal_cgm_data_1 = cgm_data_1[(cgm_data_1['datetime'] >= (meal_time + pd.Timedelta(hours=1, minutes=30))) & 
                                     (cgm_data_1['datetime'] <= (meal_time + pd.Timedelta(hours=4)))]

    # Check if the extracted data has the correct number of points (30 for meal data)
    if len(meal_cgm_data_1) == 30:
        #print(f"Meal Time Data {meal_time} has {len(meal_cgm_data)} number of points.")
        meal_data_1.append(meal_cgm_data_1['Sensor Glucose (mg/dL)'].values)
# Initialize meal data list
meal_data_2 = []

# Loop through each meal start time
for meal_time in meal_times_2:
    # A meal window is 2 hours long
    meal_window = insulin_data_2[(insulin_data_2['datetime'] > meal_time) & 
                               (insulin_data_2['datetime'] <= (meal_time + pd.Timedelta(hours=2)))]
    #print(meal_window)

    # If that meal window is empty, get the corresponding CGM
    # data that starts at tm-30min and ends at tm+2hrs
    if meal_window.empty:
        # Step 4 condition (a): No meal between tm and tm+2hrs, use this stretch as meal data
        meal_cgm_data_2 = cgm_data_2[(cgm_data_2['datetime'] >= (meal_time - pd.Timedelta(minutes=30))) & 
                                 (cgm_data_2['datetime'] <= (meal_time + pd.Timedelta(hours=2)))]
    else:
        # Step 4 check for condition (b) or (c)
        next_meal_time = meal_window.iloc[0]['datetime']
        if next_meal_time < (meal_time + pd.Timedelta(hours=2)):
            # condition (b): There is a meal at tp between tm and tm+2hrs, use tp data
            meal_time = next_meal_time
            meal_cgm_data_2 = cgm_data_2[(cgm_data_2['datetime'] >= (meal_time - pd.Timedelta(minutes=30))) & 
                                     (cgm_data_2['datetime'] <= (meal_time + pd.Timedelta(hours=2)))]
        # Condition (c): There is a meal exactly at tm+2hrs, use tm+1hr 30min to tm+4hrs
        elif next_meal_time == (meal_time + pd.Timedelta(hours=2)):
            meal_cgm_data_2 = cgm_data_2[(cgm_data_2['datetime'] >= (meal_time + pd.Timedelta(hours=1, minutes=30))) & 
                                     (cgm_data_2['datetime'] <= (meal_time + pd.Timedelta(hours=4)))]

    # Check if the extracted data has the correct number of points (30 for meal data)
    if len(meal_cgm_data_2) == 30:
        #print(f"Meal Time Data {meal_time} has {len(meal_cgm_data)} number of points.")
        meal_data_2.append(meal_cgm_data_2['Sensor Glucose (mg/dL)'].values)

Get Length of Meal Data CGM Values

len(meal_data_1)
715
len(meal_data_2)
372
meal_data_1[0:5]
[array([ nan, nan, 124., 126., 127., 128., 130., 137., 140., nan, 150., 140., 131., 130., 134., 148., nan, 159., 157., 140., 146., 148., 151., 150., 162., 174., 180., 178., 184., 185.]), array([121., 120., 115., 110., 106., 103., 103., 101., nan, 131., 109., 82., 74., 66., 73., 82., 87., 90., 83., 73., 49., 58., 69., 79., nan, 93., 92., 86., 84., 80.]), array([287., 288., 289., 279., 260., 262., 259., 250., 245., 250., 247., 243., 227., 213., 205., 182., 175., 170., 166., 164., 165., 158., 153., 161., 177., 194., 200., 201., 189., 162.]), array([189., 162., 166., 173., 176., 169., 167., 166., 168., 180., 182., 178., 171., 160., 145., 138., 132., 120., 127., 129., 132., 123., 104., 100., 107., 106., 123., 128., 137., 147.]), array([107., 106., 123., 128., 137., 147., 145., 142., nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan])]
#meal_data_2[0:5]

Create Meal Data Matrix

meal_data_matrix_1 = np.array(meal_data_1)
meal_data_matrix_1
array([[ nan, nan, 124., ..., 178., 184., 185.], [121., 120., 115., ..., 86., 84., 80.], [287., 288., 289., ..., 201., 189., 162.], ..., [274., 278., 283., ..., 268., 281., 292.], [223., 222., 222., ..., 324., 326., 328.], [ 82., 85., 92., ..., 164., 163., 155.]])
meal_data_matrix_1.shape
(715, 30)
meal_data_matrix_2 = np.array(meal_data_2)
meal_data_matrix_2
array([[390., 391., 389., ..., 165., 139., 133.], [116., 113., 110., ..., 248., 233., 222.], [130., 129., 126., ..., 153., 147., 147.], ..., [174., 181., 191., ..., 126., 129., 128.], [174., 181., 191., ..., 126., 129., 128.], [102., 94., 92., ..., 169., 180., 186.]])
meal_data_matrix_2.shape
(372, 30)

Visualize Meal Data Matrix 1

Heatmap of Meal Data Matrix 1

# Create the heatmap
sampled_meal_data_matrix_1 = meal_data_matrix_1[np.random.choice(meal_data_matrix_1.shape[0], 200, replace=False)]
plt.figure(figsize=(30, 30))
sns.heatmap(sampled_meal_data_matrix_1, cmap='YlGnBu', linewidths=0.5, annot=False, vmin=None, vmax=None)
plt.title('Heatmap of Meal Data Matrix 1')
plt.xlabel('Time Points')
plt.ylabel('Meals (Samples)')
plt.show()
<Figure size 3000x3000 with 2 Axes>

Surface Plot of Meal Data Matrix 1

# Create X, Y meshgrid
X = np.arange(meal_data_matrix_1.shape[1])
Y = np.arange(meal_data_matrix_1.shape[0])
time_points = np.linspace(-30, 120, 30)  # X-axis: from -30 minutes to +120 minutes (30 time points)
meal_numbers = np.arange(1, len(meal_data_1) + 1)
X, Y = np.meshgrid(time_points, Y)
#X, Y = np.meshgrid(time_points, meal_numbers)
X
array([[-30. , -24.82758621, -19.65517241, ..., 109.65517241, 114.82758621, 120. ], [-30. , -24.82758621, -19.65517241, ..., 109.65517241, 114.82758621, 120. ], [-30. , -24.82758621, -19.65517241, ..., 109.65517241, 114.82758621, 120. ], ..., [-30. , -24.82758621, -19.65517241, ..., 109.65517241, 114.82758621, 120. ], [-30. , -24.82758621, -19.65517241, ..., 109.65517241, 114.82758621, 120. ], [-30. , -24.82758621, -19.65517241, ..., 109.65517241, 114.82758621, 120. ]])
X.shape
(715, 30)
Y
array([[ 0, 0, 0, ..., 0, 0, 0], [ 1, 1, 1, ..., 1, 1, 1], [ 2, 2, 2, ..., 2, 2, 2], ..., [712, 712, 712, ..., 712, 712, 712], [713, 713, 713, ..., 713, 713, 713], [714, 714, 714, ..., 714, 714, 714]])
# Fill NaNs using forward fill, then backward fill
meal_data_matrix_1_df = pd.DataFrame(meal_data_matrix_1)
meal_data_matrix_1_filled = meal_data_matrix_1_df.ffill().bfill().values
# Apply a moving average smoothing to each row
window_size = 5  # Choose a window size for smoothing
smoothed_meal_data_matrix_1 = np.array([uniform_filter1d(row, size=window_size) for row in meal_data_matrix_1_filled])
# Create the plot
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, projection='3d')
stride = 100

# Plot the surface
surface_plot = ax.plot_surface(X[::stride], Y[::stride], smoothed_meal_data_matrix_1[::stride], cmap='viridis', edgecolor='none')
ax.set_title('3D Surface Plot of Smoothed Averaged Meal Glucose Levels', y=0.985)
ax.set_xlabel('Time Relative to Start of Meal [minutes]', labelpad=10)
ax.set_ylabel('Meal Samples', labelpad=10)
#ax.set_zlabel('Glucose Level (mg/dL)', labelpad=30)

#ax.set_xticks(np.linspace(-30, 120, 7))

# Adjust axis limits to create a zoom-out effect
#ax.set_xlim(-30, 120)
ax.set_ylim(0, 800)
ax.set_zlim(0, 300)

fig.colorbar(surface_plot, ax=ax, shrink=0.5, aspect=10, label='Glucose Level (mg/dL)')

#plt.tight_layout()
fig.subplots_adjust(left=0.01, right=1, top=0.9, bottom=0.1)

ax.view_init(elev=25, azim=-55)

plt.show()
<Figure size 1000x800 with 2 Axes>

Contour Plot of Meal Data Matrix 1

# Create the contour plot
plt.figure(figsize=(15, 10))
plt.contourf(X, Y, meal_data_matrix_1_df, cmap='viridis', levels=30)
plt.colorbar(label='Glucose Level (mg/dL)')
plt.title('Contour Plot of Meal Glucose Levels')
plt.xlabel('Time Points')
plt.ylabel('Meal Samples')
plt.show()
<Figure size 1500x1000 with 2 Axes>

Visualize Meal Data Matrix 2

Heatmap of Meal Data Matrix 2

# Create the heatmap
sampled_meal_data_matrix_2 = meal_data_matrix_2[np.random.choice(meal_data_matrix_2.shape[0], 200, replace=False)]
plt.figure(figsize=(30, 30))
sns.heatmap(sampled_meal_data_matrix_2, cmap='YlGnBu', linewidths=0.5, annot=False, vmin=None, vmax=None)
plt.title('Heatmap of Meal Data Matrix 2')
plt.xlabel('Time Points')
plt.ylabel('Meals (Samples)')
plt.show()
<Figure size 3000x3000 with 2 Axes>

Surface Plot of Meal Data Matrix 2

# Create X, Y meshgrid
X = np.arange(meal_data_matrix_2.shape[1])
Y = np.arange(meal_data_matrix_2.shape[0])
time_points = np.linspace(-30, 120, 30)  # X-axis: from -30 minutes to +120 minutes (30 time points)
meal_numbers = np.arange(1, len(meal_data_2) + 1)
X, Y = np.meshgrid(time_points, Y)
#X, Y = np.meshgrid(time_points, meal_numbers)
X
array([[-30. , -24.82758621, -19.65517241, ..., 109.65517241, 114.82758621, 120. ], [-30. , -24.82758621, -19.65517241, ..., 109.65517241, 114.82758621, 120. ], [-30. , -24.82758621, -19.65517241, ..., 109.65517241, 114.82758621, 120. ], ..., [-30. , -24.82758621, -19.65517241, ..., 109.65517241, 114.82758621, 120. ], [-30. , -24.82758621, -19.65517241, ..., 109.65517241, 114.82758621, 120. ], [-30. , -24.82758621, -19.65517241, ..., 109.65517241, 114.82758621, 120. ]])
X.shape
(372, 30)
Y
array([[ 0, 0, 0, ..., 0, 0, 0], [ 1, 1, 1, ..., 1, 1, 1], [ 2, 2, 2, ..., 2, 2, 2], ..., [369, 369, 369, ..., 369, 369, 369], [370, 370, 370, ..., 370, 370, 370], [371, 371, 371, ..., 371, 371, 371]])
# Fill NaNs using forward fill, then backward fill
meal_data_matrix_2_df = pd.DataFrame(meal_data_matrix_2)
meal_data_matrix_2_filled = meal_data_matrix_2_df.ffill().bfill().values
# Apply a moving average smoothing to each row
window_size = 5  # Choose a window size for smoothing
smoothed_meal_data_matrix_2 = np.array([uniform_filter1d(row, size=window_size) for row in meal_data_matrix_2_filled])
# Create the plot
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, projection='3d')
stride = 100

# Plot the surface
surface_plot = ax.plot_surface(X[::stride], Y[::stride], smoothed_meal_data_matrix_2[::stride], cmap='viridis', edgecolor='none')
ax.set_title('3D Surface Plot of Smoothed Averaged Meal Glucose Levels', y=0.985)
ax.set_xlabel('Time Relative to Start of Meal [minutes]', labelpad=10)
ax.set_ylabel('Meal Samples', labelpad=10)
#ax.set_zlabel('Glucose Level (mg/dL)', labelpad=30)

#ax.set_xticks(np.linspace(-30, 120, 7))

# Adjust axis limits to create a zoom-out effect
#ax.set_xlim(-30, 120)
#ax.set_ylim(0, 800)
ax.set_zlim(0, 400)

fig.colorbar(surface_plot, ax=ax, shrink=0.5, aspect=10, label='Glucose Level (mg/dL)')

#plt.tight_layout()
fig.subplots_adjust(left=0.01, right=1, top=0.9, bottom=0.1)

ax.view_init(elev=25, azim=-55)

plt.show()
<Figure size 1000x800 with 2 Axes>

Contour Plot of Meal Data Matrix 2

# Create the contour plot
plt.figure(figsize=(15, 10))
plt.contourf(X, Y, meal_data_matrix_2_df, cmap='viridis', levels=30)
plt.colorbar(label='Glucose Level (mg/dL)')
plt.title('Contour Plot of Meal Glucose Levels')
plt.xlabel('Time Points')
plt.ylabel('Meal Samples')
plt.show()
<Figure size 1500x1000 with 2 Axes>

Extract No Meal Data

# Initialize no meal data list
no_meal_data_1 = []

# Loop through each meal start time to find valid post-absorptive windows
for meal_time in meal_times_1:
    # Define the post-absorptive period: starts at tm+2 hours
    # and lasts 2 hours (i.e., ends at tm+4 hours)
    post_absorptive_start = meal_time + pd.Timedelta(hours=2)
    post_absorptive_end = meal_time + pd.Timedelta(hours=4)
    
    # Check for meals in the post-absorptive period
    post_absorptive_meals = insulin_data_1[(insulin_data_1['datetime'] >= post_absorptive_start) &
                                         (insulin_data_1['datetime'] <= post_absorptive_end)]
    
    # Step 2 Condition (a): If any non-zero, non-NaN meal is found in the
    # post-absorptive period, skip this stretch
    if not post_absorptive_meals.empty:
        # Check if any of the meals have a non-zero carb input
        if (post_absorptive_meals['BWZ Carb Input (grams)'] > 0).any():
            # Skip this stretch, as there is a meal found in the post-absorptive period
            continue
    
    # Step 2 condition (b): If meal values are 0 or NaN, ignore and consider the stretch
    # Extract the corresponding 2-hour CGM data
    no_meal_cgm_data_1 = cgm_data_1[(cgm_data_1['datetime'] >= post_absorptive_start) & 
                                (cgm_data_1['datetime'] <= post_absorptive_end)]
    
    # Check if the extracted data has the correct number of points (24 for no meal data)
    if len(no_meal_cgm_data_1) == 24:
        no_meal_data_1.append(no_meal_cgm_data_1['Sensor Glucose (mg/dL)'].values)
# Initialize no meal data list
no_meal_data_2 = []

# Loop through each meal start time to find valid post-absorptive windows
for meal_time in meal_times_2:
    # Define the post-absorptive period: starts at tm+2 hours
    # and lasts 2 hours (i.e., ends at tm+4 hours)
    post_absorptive_start = meal_time + pd.Timedelta(hours=2)
    post_absorptive_end = meal_time + pd.Timedelta(hours=4)
    
    # Check for meals in the post-absorptive period
    post_absorptive_meals = insulin_data_2[(insulin_data_2['datetime'] >= post_absorptive_start) &
                                         (insulin_data_2['datetime'] <= post_absorptive_end)]
    
    # Step 2 Condition (a): If any non-zero, non-NaN meal is found in the
    # post-absorptive period, skip this stretch
    if not post_absorptive_meals.empty:
        # Check if any of the meals have a non-zero carb input
        if (post_absorptive_meals['BWZ Carb Input (grams)'] > 0).any():
            # Skip this stretch, as there is a meal found in the post-absorptive period
            continue
    
    # Step 2 condition (b): If meal values are 0 or NaN, ignore and consider the stretch
    # Extract the corresponding 2-hour CGM data
    no_meal_cgm_data_2 = cgm_data_2[(cgm_data_2['datetime'] >= post_absorptive_start) & 
                                (cgm_data_2['datetime'] <= post_absorptive_end)]
    
    # Check if the extracted data has the correct number of points (24 for no meal data)
    if len(no_meal_cgm_data_2) == 24:
        no_meal_data_2.append(no_meal_cgm_data_2['Sensor Glucose (mg/dL)'].values)
len(no_meal_data_1)
510
len(no_meal_data_2)
325
#no_meal_data_1[0:5]
#no_meal_data_2[0:5]

Create No Meal Data Matrix

no_meal_data_matrix_1 = np.array(no_meal_data_1)
#no_meal_data_matrix_1
no_meal_data_matrix_2 = np.array(no_meal_data_2)
#no_meal_data_matrix_2
no_meal_data_matrix_1.shape
(510, 24)
no_meal_data_matrix_2.shape
(325, 24)

Visualize No Meal Data Matrix 1

Heatmap of No Meal Data Matrix 1

# Create the heatmap
sampled_no_meal_data_matrix_1 = no_meal_data_matrix_1[np.random.choice(no_meal_data_matrix_1.shape[0], 200, replace=False)]
plt.figure(figsize=(30, 30))
sns.heatmap(sampled_no_meal_data_matrix_1, cmap='YlGnBu', linewidths=0.5, annot=False, vmin=None, vmax=None)
plt.title('Heatmap of No Meal Data Matrix 1')
plt.xlabel('Time Points')
plt.ylabel('Meals (Samples)')
plt.show()
<Figure size 3000x3000 with 2 Axes>

Surface Plot of No Meal Data Matrix 1

time_points = np.linspace(-30, 120, 24)
print(time_points)
print(time_points.shape)
[-30.         -23.47826087 -16.95652174 -10.43478261  -3.91304348
   2.60869565   9.13043478  15.65217391  22.17391304  28.69565217
  35.2173913   41.73913043  48.26086957  54.7826087   61.30434783
  67.82608696  74.34782609  80.86956522  87.39130435  93.91304348
 100.43478261 106.95652174 113.47826087 120.        ]
(24,)
# Create X, Y meshgrid
X = np.arange(no_meal_data_matrix_1.shape[1])
Y = np.arange(no_meal_data_matrix_1.shape[0])
time_points = np.linspace(-30, 120, 24)  # X-axis: from -30 minutes to +120 minutes (24 time points)
meal_numbers = np.arange(1, len(no_meal_data_1) + 1)
X, Y = np.meshgrid(time_points, Y)
#X, Y = np.meshgrid(time_points, meal_numbers)
X
array([[-30. , -23.47826087, -16.95652174, ..., 106.95652174, 113.47826087, 120. ], [-30. , -23.47826087, -16.95652174, ..., 106.95652174, 113.47826087, 120. ], [-30. , -23.47826087, -16.95652174, ..., 106.95652174, 113.47826087, 120. ], ..., [-30. , -23.47826087, -16.95652174, ..., 106.95652174, 113.47826087, 120. ], [-30. , -23.47826087, -16.95652174, ..., 106.95652174, 113.47826087, 120. ], [-30. , -23.47826087, -16.95652174, ..., 106.95652174, 113.47826087, 120. ]])
X.shape
(510, 24)
Y
array([[ 0, 0, 0, ..., 0, 0, 0], [ 1, 1, 1, ..., 1, 1, 1], [ 2, 2, 2, ..., 2, 2, 2], ..., [507, 507, 507, ..., 507, 507, 507], [508, 508, 508, ..., 508, 508, 508], [509, 509, 509, ..., 509, 509, 509]])
Y.shape
(510, 24)
# Fill NaNs using forward fill, then backward fill
no_meal_data_matrix_1_df = pd.DataFrame(no_meal_data_matrix_1)
no_meal_data_matrix_1_filled = no_meal_data_matrix_1_df.ffill().bfill().values
# Apply a moving average smoothing to each row
window_size = 5  # Choose a window size for smoothing
smoothed_no_meal_data_matrix_1 = np.array([uniform_filter1d(row, size=window_size) for row in no_meal_data_matrix_1_filled])
# Create the plot
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, projection='3d')
stride = 100

# Plot the surface
surface_plot = ax.plot_surface(X[::stride], Y[::stride], smoothed_no_meal_data_matrix_1[::stride], cmap='viridis', edgecolor='none')
ax.set_title('3D Surface Plot of Smoothed Averaged No Meal Glucose Levels', y=0.985)
ax.set_xlabel('Time Relative to End of Meal [minutes]', labelpad=10)
ax.set_ylabel('Meal Samples', labelpad=10)
#ax.set_zlabel('Glucose Level (mg/dL)', labelpad=30)

#ax.set_xticks(np.linspace(-30, 120, 7))

# Adjust axis limits to create a zoom-out effect
#ax.set_xlim(-30, 120)
#ax.set_ylim(0, 800)
#ax.set_zlim(0, 300)

fig.colorbar(surface_plot, ax=ax, shrink=0.5, aspect=10, label='Glucose Level (mg/dL)')

#plt.tight_layout()
fig.subplots_adjust(left=0.01, right=1, top=0.9, bottom=0.1)

ax.view_init(elev=25, azim=-55)

plt.show()
<Figure size 1000x800 with 2 Axes>

Contour Plot of No Meal Data Matrix 1

# Create the contour plot
plt.figure(figsize=(15, 10))
plt.contourf(X, Y, no_meal_data_matrix_1_df, cmap='viridis', levels=30)
plt.colorbar(label='Glucose Level (mg/dL)')
plt.title('Contour Plot of No Meal Glucose Levels')
plt.xlabel('Time Points')
plt.ylabel('Meal Samples')
plt.show()
<Figure size 1500x1000 with 2 Axes>

Visualize No Meal Data Matrix 2

Heatmap of No Meal Data Matrix 2

# Create the heatmap
sampled_no_meal_data_matrix_2 = no_meal_data_matrix_2[np.random.choice(no_meal_data_matrix_2.shape[0], 200, replace=False)]
plt.figure(figsize=(30, 30))
sns.heatmap(sampled_no_meal_data_matrix_2, cmap='YlGnBu', linewidths=0.5, annot=False, vmin=None, vmax=None)
plt.title('Heatmap of No Meal Data Matrix 2')
plt.xlabel('Time Points')
plt.ylabel('Meals (Samples)')
plt.show()
<Figure size 3000x3000 with 2 Axes>

Surface Plot of No Meal Data Matrix 2

time_points = np.linspace(-30, 120, 24)
print(time_points)
print(time_points.shape)
[-30.         -23.47826087 -16.95652174 -10.43478261  -3.91304348
   2.60869565   9.13043478  15.65217391  22.17391304  28.69565217
  35.2173913   41.73913043  48.26086957  54.7826087   61.30434783
  67.82608696  74.34782609  80.86956522  87.39130435  93.91304348
 100.43478261 106.95652174 113.47826087 120.        ]
(24,)
# Create X, Y meshgrid
X = np.arange(no_meal_data_matrix_2.shape[1])
Y = np.arange(no_meal_data_matrix_2.shape[0])
time_points = np.linspace(-30, 120, 24)  # X-axis: from -30 minutes to +120 minutes (24 time points)
meal_numbers = np.arange(1, len(no_meal_data_2) + 1)
X, Y = np.meshgrid(time_points, Y)
#X, Y = np.meshgrid(time_points, meal_numbers)
X
array([[-30. , -23.47826087, -16.95652174, ..., 106.95652174, 113.47826087, 120. ], [-30. , -23.47826087, -16.95652174, ..., 106.95652174, 113.47826087, 120. ], [-30. , -23.47826087, -16.95652174, ..., 106.95652174, 113.47826087, 120. ], ..., [-30. , -23.47826087, -16.95652174, ..., 106.95652174, 113.47826087, 120. ], [-30. , -23.47826087, -16.95652174, ..., 106.95652174, 113.47826087, 120. ], [-30. , -23.47826087, -16.95652174, ..., 106.95652174, 113.47826087, 120. ]])
X.shape
(325, 24)
Y
array([[ 0, 0, 0, ..., 0, 0, 0], [ 1, 1, 1, ..., 1, 1, 1], [ 2, 2, 2, ..., 2, 2, 2], ..., [322, 322, 322, ..., 322, 322, 322], [323, 323, 323, ..., 323, 323, 323], [324, 324, 324, ..., 324, 324, 324]])
Y.shape
(325, 24)
# Fill NaNs using forward fill, then backward fill
no_meal_data_matrix_2_df = pd.DataFrame(no_meal_data_matrix_2)
no_meal_data_matrix_2_filled = no_meal_data_matrix_2_df.ffill().bfill().values
# Apply a moving average smoothing to each row
window_size = 5  # Choose a window size for smoothing
smoothed_no_meal_data_matrix_2 = np.array([uniform_filter1d(row, size=window_size) for row in no_meal_data_matrix_2_filled])
# Create the plot
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, projection='3d')
stride = 100

# Plot the surface
surface_plot = ax.plot_surface(X[::stride], Y[::stride], smoothed_no_meal_data_matrix_2[::stride], cmap='viridis', edgecolor='none')
ax.set_title('3D Surface Plot of Smoothed Averaged No Meal Glucose Levels', y=0.985)
ax.set_xlabel('Time Relative to End of Meal [minutes]', labelpad=10)
ax.set_ylabel('Meal Samples', labelpad=10)
#ax.set_zlabel('Glucose Level (mg/dL)', labelpad=30)

#ax.set_xticks(np.linspace(-30, 120, 7))

# Adjust axis limits to create a zoom-out effect
#ax.set_xlim(-30, 120)
#ax.set_ylim(0, 800)
#ax.set_zlim(0, 300)

fig.colorbar(surface_plot, ax=ax, shrink=0.5, aspect=10, label='Glucose Level (mg/dL)')

#plt.tight_layout()
fig.subplots_adjust(left=0.01, right=1, top=0.9, bottom=0.1)

ax.view_init(elev=25, azim=-55)

plt.show()
<Figure size 1000x800 with 2 Axes>

Contour Plot of No Meal Data Matrix 2

# Create the contour plot
plt.figure(figsize=(15, 10))
plt.contourf(X, Y, no_meal_data_matrix_2_df, cmap='viridis', levels=30)
plt.colorbar(label='Glucose Level (mg/dL)')
plt.title('Contour Plot of No Meal Glucose Levels')
plt.xlabel('Time Points')
plt.ylabel('Meal Samples')
plt.show()
<Figure size 1500x1000 with 2 Axes>

Handle Missing Data

We use the following strategy: delete any row with more than 10% missing (NaN) values; for rows at or below that threshold, impute the remaining NaNs with a K-nearest-neighbors (KNN) imputer using K = 7.
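As a compact illustration of this two-step strategy, here is a hypothetical helper that condenses both steps into one function; the cells below apply the same logic to each matrix separately.

import numpy as np
from sklearn.impute import KNNImputer

def clean_and_impute(matrix, threshold=0.1, n_neighbors=7):
    # Illustrative sketch only: drop rows whose fraction of NaNs exceeds the
    # threshold, then KNN-impute whatever NaNs remain in the surviving rows
    missing_fraction = np.isnan(matrix).mean(axis=1)
    kept = matrix[missing_fraction <= threshold]
    return KNNImputer(n_neighbors=n_neighbors).fit_transform(kept)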

Define Threshold for Dumping Data

# 10%
dump_threshold = 0.1

Create KNN Imputer

knn_imputer = KNNImputer(n_neighbors=7)
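For intuition, the imputer replaces each NaN with the mean of that column taken over the K most similar rows, where similarity is measured with a NaN-aware Euclidean distance. A minimal toy example with hypothetical values (and n_neighbors=2 just for the illustration):

import numpy as np
from sklearn.impute import KNNImputer

toy = np.array([[100., 110., np.nan],
                [102., 112., 118.],
                [150., 160., 170.]])
# The NaN in the first row is filled with the mean of the third column
# across the nearest rows that have that value present
KNNImputer(n_neighbors=2).fit_transform(toy)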

Parse Meal Data and Handle Missing Values

cleaned_meal_data_1 = []
for row in meal_data_matrix_1:

    proportion_missing_data = np.isnan(row).mean()

    if proportion_missing_data > dump_threshold:

        # Ignore this row
        print(f"Ignoring row with {proportion_missing_data*100:.2f}% missing data")
        continue

    else:
        # The data is good, use it
        cleaned_meal_data_1.append(row)
Ignoring row with 13.33% missing data
Ignoring row with 73.33% missing data
Ignoring row with 66.67% missing data
Ignoring row with 13.33% missing data
Ignoring row with 30.00% missing data
Ignoring row with 16.67% missing data
Ignoring row with 23.33% missing data
Ignoring row with 16.67% missing data
Ignoring row with 26.67% missing data
Ignoring row with 30.00% missing data
Ignoring row with 20.00% missing data
Ignoring row with 73.33% missing data
Ignoring row with 63.33% missing data
Ignoring row with 100.00% missing data
Ignoring row with 100.00% missing data
Ignoring row with 33.33% missing data
Ignoring row with 16.67% missing data
Ignoring row with 16.67% missing data
Ignoring row with 100.00% missing data
Ignoring row with 60.00% missing data
Ignoring row with 16.67% missing data
Ignoring row with 36.67% missing data
Ignoring row with 43.33% missing data
Ignoring row with 13.33% missing data
Ignoring row with 13.33% missing data
Ignoring row with 60.00% missing data
Ignoring row with 16.67% missing data
Ignoring row with 53.33% missing data
Ignoring row with 33.33% missing data
Ignoring row with 13.33% missing data
Ignoring row with 20.00% missing data
Ignoring row with 83.33% missing data
Ignoring row with 86.67% missing data
Ignoring row with 70.00% missing data
Ignoring row with 16.67% missing data
Ignoring row with 100.00% missing data
Ignoring row with 100.00% missing data
Ignoring row with 16.67% missing data
Ignoring row with 43.33% missing data
Ignoring row with 100.00% missing data
Ignoring row with 100.00% missing data
Ignoring row with 63.33% missing data
Ignoring row with 63.33% missing data
Ignoring row with 26.67% missing data
Ignoring row with 60.00% missing data
Ignoring row with 100.00% missing data
Ignoring row with 100.00% missing data
Ignoring row with 33.33% missing data
Ignoring row with 96.67% missing data
Ignoring row with 100.00% missing data
Ignoring row with 100.00% missing data
Ignoring row with 100.00% missing data
Ignoring row with 20.00% missing data
Ignoring row with 80.00% missing data
Ignoring row with 43.33% missing data
Ignoring row with 100.00% missing data
Ignoring row with 16.67% missing data
Ignoring row with 100.00% missing data
Ignoring row with 70.00% missing data
Ignoring row with 16.67% missing data
Ignoring row with 83.33% missing data
Ignoring row with 83.33% missing data
Ignoring row with 30.00% missing data
Ignoring row with 16.67% missing data
Ignoring row with 100.00% missing data
Ignoring row with 20.00% missing data
Ignoring row with 30.00% missing data
Ignoring row with 20.00% missing data
Ignoring row with 76.67% missing data
Ignoring row with 73.33% missing data
Ignoring row with 70.00% missing data
Ignoring row with 36.67% missing data
Ignoring row with 20.00% missing data
Ignoring row with 16.67% missing data
Ignoring row with 36.67% missing data
Ignoring row with 16.67% missing data
Ignoring row with 63.33% missing data
Ignoring row with 13.33% missing data
Ignoring row with 20.00% missing data
Ignoring row with 26.67% missing data
Ignoring row with 90.00% missing data
Ignoring row with 23.33% missing data
Ignoring row with 33.33% missing data
Ignoring row with 100.00% missing data
Ignoring row with 80.00% missing data
Ignoring row with 33.33% missing data
Ignoring row with 30.00% missing data
Ignoring row with 20.00% missing data
Ignoring row with 33.33% missing data
Ignoring row with 100.00% missing data
Ignoring row with 100.00% missing data
Ignoring row with 13.33% missing data
Ignoring row with 20.00% missing data
Ignoring row with 100.00% missing data
Ignoring row with 100.00% missing data
Ignoring row with 16.67% missing data
cleaned_meal_data_2 = []
for row in meal_data_matrix_2:

    proportion_missing_data = np.isnan(row).mean()

    if proportion_missing_data > dump_threshold:

        # Ignore this row
        print(f"Ignoring row with {proportion_missing_data*100:.2f}% missing data")
        continue

    else:
        # The data is good, use it
        cleaned_meal_data_2.append(row)
Ignoring row with 43.33% missing data
Ignoring row with 43.33% missing data
Ignoring row with 30.00% missing data
Ignoring row with 63.33% missing data
Ignoring row with 36.67% missing data
Ignoring row with 100.00% missing data
Ignoring row with 66.67% missing data
Ignoring row with 26.67% missing data
Ignoring row with 100.00% missing data
Ignoring row with 56.67% missing data
Ignoring row with 53.33% missing data
Ignoring row with 70.00% missing data
Ignoring row with 20.00% missing data
Ignoring row with 16.67% missing data
Ignoring row with 20.00% missing data
Ignoring row with 100.00% missing data
Ignoring row with 100.00% missing data
Ignoring row with 16.67% missing data
Ignoring row with 13.33% missing data
Ignoring row with 100.00% missing data
Ignoring row with 60.00% missing data
Ignoring row with 80.00% missing data
Ignoring row with 36.67% missing data
Ignoring row with 100.00% missing data
Ignoring row with 56.67% missing data
Ignoring row with 63.33% missing data
Ignoring row with 20.00% missing data
Ignoring row with 20.00% missing data
Ignoring row with 26.67% missing data
Ignoring row with 13.33% missing data
Ignoring row with 73.33% missing data
Ignoring row with 36.67% missing data
Ignoring row with 23.33% missing data
Ignoring row with 13.33% missing data
Ignoring row with 13.33% missing data
Ignoring row with 13.33% missing data
Ignoring row with 36.67% missing data
Ignoring row with 100.00% missing data
Ignoring row with 26.67% missing data
Ignoring row with 26.67% missing data
Ignoring row with 100.00% missing data
Ignoring row with 20.00% missing data
Ignoring row with 20.00% missing data
Ignoring row with 16.67% missing data
Ignoring row with 16.67% missing data
Ignoring row with 20.00% missing data
Ignoring row with 20.00% missing data
cleaned_meal_data_1[0]
array([121., 120., 115., 110., 106., 103., 103., 101., nan, 131., 109., 82., 74., 66., 73., 82., 87., 90., 83., 73., 49., 58., 69., 79., nan, 93., 92., 86., 84., 80.])
cleaned_meal_data_2[0]
array([390., 391., 389., 383., 375., 368., 357., 349., 336., 326., 315., 305., 295., 286., 279., 273., 268., 259., 252., 246., 238., 229., 218., 206., 195., 192., 181., 165., 139., 133.])

Convert Cleaned Meal Data List into a Matrix

cleaned_meal_data_matrix_1 = \
    np.array(cleaned_meal_data_1)
cleaned_meal_data_matrix_2 = \
    np.array(cleaned_meal_data_2)

Use KNN to Impute NaNs in Cleaned Meal Data

cleaned_imputed_meal_matrix_1 = knn_imputer.fit_transform(cleaned_meal_data_matrix_1)
cleaned_imputed_meal_matrix_2 = knn_imputer.fit_transform(cleaned_meal_data_matrix_2)
cleaned_imputed_meal_matrix_1[0]
array([121. , 120. , 115. , 110. , 106. , 103. , 103. , 101. , 99.71428571, 131. , 109. , 82. , 74. , 66. , 73. , 82. , 87. , 90. , 83. , 73. , 49. , 58. , 69. , 79. , 75.85714286, 93. , 92. , 86. , 84. , 80. ])
cleaned_imputed_meal_matrix_1.shape
(619, 30)
cleaned_imputed_meal_matrix_2[0]
array([390., 391., 389., 383., 375., 368., 357., 349., 336., 326., 315., 305., 295., 286., 279., 273., 268., 259., 252., 246., 238., 229., 218., 206., 195., 192., 181., 165., 139., 133.])
cleaned_imputed_meal_matrix_2.shape
(325, 30)

Parse No Meal Data and Handle Missing Values

cleaned_no_meal_data_1 = []
for row in no_meal_data_matrix_1:

    proportion_missing_data = np.isnan(row).mean()

    if proportion_missing_data > dump_threshold:

        # Ignore this row
        print(f"Ignoring row with {proportion_missing_data*100:.2f}% missing data")
        continue

    else:
        # The data is good, use it
        cleaned_no_meal_data_1.append(row)
Ignoring row with 20.83% missing data
Ignoring row with 58.33% missing data
Ignoring row with 12.50% missing data
Ignoring row with 33.33% missing data
Ignoring row with 12.50% missing data
Ignoring row with 41.67% missing data
Ignoring row with 79.17% missing data
Ignoring row with 16.67% missing data
Ignoring row with 100.00% missing data
Ignoring row with 100.00% missing data
Ignoring row with 100.00% missing data
Ignoring row with 66.67% missing data
Ignoring row with 50.00% missing data
Ignoring row with 16.67% missing data
Ignoring row with 29.17% missing data
Ignoring row with 12.50% missing data
Ignoring row with 66.67% missing data
Ignoring row with 83.33% missing data
Ignoring row with 12.50% missing data
Ignoring row with 16.67% missing data
Ignoring row with 20.83% missing data
Ignoring row with 54.17% missing data
Ignoring row with 100.00% missing data
Ignoring row with 91.67% missing data
Ignoring row with 100.00% missing data
Ignoring row with 12.50% missing data
Ignoring row with 12.50% missing data
Ignoring row with 100.00% missing data
Ignoring row with 12.50% missing data
Ignoring row with 12.50% missing data
Ignoring row with 79.17% missing data
Ignoring row with 62.50% missing data
Ignoring row with 54.17% missing data
Ignoring row with 100.00% missing data
Ignoring row with 100.00% missing data
Ignoring row with 100.00% missing data
Ignoring row with 100.00% missing data
Ignoring row with 12.50% missing data
Ignoring row with 100.00% missing data
Ignoring row with 50.00% missing data
Ignoring row with 12.50% missing data
Ignoring row with 37.50% missing data
Ignoring row with 91.67% missing data
Ignoring row with 87.50% missing data
Ignoring row with 66.67% missing data
Ignoring row with 41.67% missing data
Ignoring row with 87.50% missing data
Ignoring row with 41.67% missing data
Ignoring row with 87.50% missing data
Ignoring row with 100.00% missing data
Ignoring row with 12.50% missing data
Ignoring row with 41.67% missing data
Ignoring row with 29.17% missing data
Ignoring row with 100.00% missing data
cleaned_no_meal_data_2 = []
for row in no_meal_data_matrix_2:

    proportion_missing_data = np.isnan(row).mean()

    if proportion_missing_data > dump_threshold:

        # Ignore this row
        print(f"Ignoring row with {proportion_missing_data*100:.2f}% missing data")
        continue

    else:
        # The data is good, use it
        cleaned_no_meal_data_2.append(row)
Ignoring row with 54.17% missing data
Ignoring row with 54.17% missing data
Ignoring row with 100.00% missing data
Ignoring row with 83.33% missing data
Ignoring row with 16.67% missing data
Ignoring row with 100.00% missing data
Ignoring row with 58.33% missing data
Ignoring row with 37.50% missing data
Ignoring row with 50.00% missing data
Ignoring row with 37.50% missing data
Ignoring row with 100.00% missing data
Ignoring row with 70.83% missing data
Ignoring row with 29.17% missing data
Ignoring row with 45.83% missing data
Ignoring row with 100.00% missing data
Ignoring row with 79.17% missing data
Ignoring row with 75.00% missing data
Ignoring row with 62.50% missing data
Ignoring row with 16.67% missing data
Ignoring row with 91.67% missing data
Ignoring row with 20.83% missing data
Ignoring row with 29.17% missing data
Ignoring row with 20.83% missing data
Ignoring row with 100.00% missing data
Ignoring row with 33.33% missing data
Ignoring row with 37.50% missing data
Ignoring row with 100.00% missing data
Ignoring row with 45.83% missing data
Ignoring row with 16.67% missing data
cleaned_no_meal_data_1[0]
array([121., 120., 115., 110., 106., 103., 103., 101., nan, 131., 109., 82., 74., 66., 73., 82., 87., 90., 83., 73., 49., 58., 69., 79.])
cleaned_no_meal_data_2[0]
array([390., 391., 389., 383., 375., 368., 357., 349., 336., 326., 315., 305., 295., 286., 279., 273., 268., 259., 252., 246., 238., 229., 218., 206.])
cleaned_no_meal_data_1[0].shape
(24,)
cleaned_no_meal_data_2[0].shape
(24,)

Convert Cleaned No Meal Data List into a Matrix

cleaned_no_meal_data_matrix_1 = \
    np.array(cleaned_no_meal_data_1)
cleaned_no_meal_data_matrix_2 = \
    np.array(cleaned_no_meal_data_2)

Use KNN to Impute NaNs in Cleaned No Meal Data

cleaned_imputed_no_meal_matrix_1 = knn_imputer.fit_transform(cleaned_no_meal_data_matrix_1)
cleaned_imputed_no_meal_matrix_2 = knn_imputer.fit_transform(cleaned_no_meal_data_matrix_2)
cleaned_imputed_no_meal_matrix_1[0]
array([121. , 120. , 115. , 110. , 106. , 103. , 103. , 101. , 103.85714286, 131. , 109. , 82. , 74. , 66. , 73. , 82. , 87. , 90. , 83. , 73. , 49. , 58. , 69. , 79. ])
cleaned_imputed_no_meal_matrix_2[0]
array([390., 391., 389., 383., 375., 368., 357., 349., 336., 326., 315., 305., 295., 286., 279., 273., 268., 259., 252., 246., 238., 229., 218., 206.])

Visualize Cleaned Imputed Data

Visualize Cleaned Imputed Meal Data Matrix 1

Heatmap of Cleaned Imputed Meal Data Matrix 1

# Create the heatmap
sampled_meal_data_matrix_1 = cleaned_imputed_meal_matrix_1[np.random.choice(cleaned_imputed_meal_matrix_1.shape[0], 200, replace=False)]
plt.figure(figsize=(30, 30))
sns.heatmap(sampled_meal_data_matrix_1, cmap='YlGnBu', linewidths=0.5, annot=False, vmin=None, vmax=None)
plt.title('Heatmap of Cleaned Imputed Meal Data Matrix 1', fontsize=30)
plt.xlabel('Time Points', fontsize=26)
plt.ylabel('Meals (Samples)', fontsize=26)

# Change font properties for the x and y tick labels
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)

plt.show()
<Figure size 3000x3000 with 2 Axes>

Surface Plot of Cleaned Imputed Meal Data Matrix 1

# Create X, Y meshgrid
X = np.arange(cleaned_imputed_meal_matrix_1.shape[1])
Y = np.arange(cleaned_imputed_meal_matrix_1.shape[0])
time_points = np.linspace(-30, 120, 30)  # X-axis: from -30 minutes to +120 minutes (30 time points)
meal_numbers = np.arange(1, len(meal_data_1) + 1)
X, Y = np.meshgrid(time_points, Y)
#X, Y = np.meshgrid(time_points, meal_numbers)
X
array([[-30. , -24.82758621, -19.65517241, ..., 109.65517241, 114.82758621, 120. ], [-30. , -24.82758621, -19.65517241, ..., 109.65517241, 114.82758621, 120. ], [-30. , -24.82758621, -19.65517241, ..., 109.65517241, 114.82758621, 120. ], ..., [-30. , -24.82758621, -19.65517241, ..., 109.65517241, 114.82758621, 120. ], [-30. , -24.82758621, -19.65517241, ..., 109.65517241, 114.82758621, 120. ], [-30. , -24.82758621, -19.65517241, ..., 109.65517241, 114.82758621, 120. ]])
X.shape
(619, 30)
Y
array([[ 0, 0, 0, ..., 0, 0, 0], [ 1, 1, 1, ..., 1, 1, 1], [ 2, 2, 2, ..., 2, 2, 2], ..., [616, 616, 616, ..., 616, 616, 616], [617, 617, 617, ..., 617, 617, 617], [618, 618, 618, ..., 618, 618, 618]])
# Fill NaNs using forward fill, then backward fill
meal_data_matrix_1_df = pd.DataFrame(cleaned_imputed_meal_matrix_1)
meal_data_matrix_1_filled = meal_data_matrix_1_df.ffill().bfill().values
# Apply a moving average smoothing to each row
window_size = 3  # Choose a window size for smoothing
smoothed_meal_data_matrix_1 = np.array([uniform_filter1d(row, size=window_size) for row in meal_data_matrix_1_filled])
# Create the plot
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, projection='3d')
stride = 100

# Plot the surface
surface_plot = ax.plot_surface(X[::stride], Y[::stride], smoothed_meal_data_matrix_1[::stride], cmap='viridis', edgecolor='none')
ax.set_title('3D Surface Plot of Smoothed Averaged Meal Glucose Levels', y=0.985)
ax.set_xlabel('Time Relative to Start of Meal [minutes]', labelpad=10)
ax.set_ylabel('Meal Samples', labelpad=10)
#ax.set_zlabel('Glucose Level (mg/dL)', labelpad=30)

#ax.set_xticks(np.linspace(-30, 120, 7))

# Adjust axis limits to create a zoom-out effect
#ax.set_xlim(-30, 120)
ax.set_ylim(0, 800)
ax.set_zlim(0, 300)

fig.colorbar(surface_plot, ax=ax, shrink=0.5, aspect=10, label='Glucose Level (mg/dL)')

#plt.tight_layout()
fig.subplots_adjust(left=0.01, right=1, top=0.9, bottom=0.1)

ax.view_init(elev=25, azim=-55)

plt.show()
<Figure size 1000x800 with 2 Axes>

Contour Plot of Cleaned Imputed Meal Data Matrix 1

# Create the contour plot
plt.figure(figsize=(15, 10))
plt.contourf(X, Y, meal_data_matrix_1_df, cmap='viridis', levels=30)
plt.colorbar(label='Glucose Level (mg/dL)')
plt.title('Contour Plot of Meal Glucose Levels')
plt.xlabel('Time Points')
plt.ylabel('Meal Samples')
plt.show()
<Figure size 1500x1000 with 2 Axes>

Visualize Cleaned Imputed Meal Data Matrix 2

Heatmap of Cleaned Imputed Meal Data Matrix 2

# Create the heatmap
sampled_meal_data_matrix_2 = cleaned_imputed_meal_matrix_2[np.random.choice(cleaned_imputed_meal_matrix_2.shape[0], 200, replace=False)]
plt.figure(figsize=(30, 30))
sns.heatmap(sampled_meal_data_matrix_2, cmap='YlGnBu', linewidths=0.5, annot=False, vmin=None, vmax=None)
plt.title('Heatmap of Cleaned Imputed Meal Data Matrix 2')
plt.xlabel('Time Points')
plt.ylabel('Meals (Samples)')
plt.show()
<Figure size 3000x3000 with 2 Axes>

Surface Plot of Cleaned Imputed Meal Data Matrix 2

# Create X, Y meshgrid
X = np.arange(cleaned_imputed_meal_matrix_2.shape[1])
Y = np.arange(cleaned_imputed_meal_matrix_2.shape[0])
time_points = np.linspace(-30, 120, 30)  # X-axis: from -30 minutes to +120 minutes (30 time points)
meal_numbers = np.arange(1, len(meal_data_2) + 1)
X, Y = np.meshgrid(time_points, Y)
#X, Y = np.meshgrid(time_points, meal_numbers)
X
array([[-30. , -24.82758621, -19.65517241, ..., 109.65517241, 114.82758621, 120. ], [-30. , -24.82758621, -19.65517241, ..., 109.65517241, 114.82758621, 120. ], [-30. , -24.82758621, -19.65517241, ..., 109.65517241, 114.82758621, 120. ], ..., [-30. , -24.82758621, -19.65517241, ..., 109.65517241, 114.82758621, 120. ], [-30. , -24.82758621, -19.65517241, ..., 109.65517241, 114.82758621, 120. ], [-30. , -24.82758621, -19.65517241, ..., 109.65517241, 114.82758621, 120. ]])
X.shape
(325, 30)
Y
array([[ 0, 0, 0, ..., 0, 0, 0], [ 1, 1, 1, ..., 1, 1, 1], [ 2, 2, 2, ..., 2, 2, 2], ..., [322, 322, 322, ..., 322, 322, 322], [323, 323, 323, ..., 323, 323, 323], [324, 324, 324, ..., 324, 324, 324]])
# Fill NaNs using forward fill, then backward fill
meal_data_matrix_2_df = pd.DataFrame(cleaned_imputed_meal_matrix_2)
meal_data_matrix_2_filled = meal_data_matrix_2_df.ffill().bfill().values
# Apply a moving average smoothing to each row
window_size = 3  # Choose a window size for smoothing
smoothed_meal_data_matrix_2 = np.array([uniform_filter1d(row, size=window_size) for row in meal_data_matrix_2_filled])
# Create the plot
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, projection='3d')
stride = 100

# Plot the surface
surface_plot = ax.plot_surface(X[::stride], Y[::stride], smoothed_meal_data_matrix_2[::stride], cmap='viridis', edgecolor='none')
ax.set_title('3D Surface Plot of Smoothed Averaged Meal Glucose Levels', y=0.985)
ax.set_xlabel('Time Relative to Start of Meal [minutes]', labelpad=10)
ax.set_ylabel('Meal Samples', labelpad=10)
#ax.set_zlabel('Glucose Level (mg/dL)', labelpad=30)

#ax.set_xticks(np.linspace(-30, 120, 7))

# Adjust axis limits to create a zoom-out effect
#ax.set_xlim(-30, 120)
#ax.set_ylim(0, 800)
#ax.set_zlim(0, 300)

fig.colorbar(surface_plot, ax=ax, shrink=0.5, aspect=10, label='Glucose Level (mg/dL)')

#plt.tight_layout()
fig.subplots_adjust(left=0.01, right=1, top=0.9, bottom=0.1)

ax.view_init(elev=25, azim=-55)

plt.show()
<Figure size 1000x800 with 2 Axes>

Contour Plot of Cleaned Imputed Meal Data Matrix 2

# Create the contour plot
plt.figure(figsize=(15, 10))
plt.contourf(X, Y, meal_data_matrix_2_df, cmap='viridis', levels=30)
plt.colorbar(label='Glucose Level (mg/dL)')
plt.title('Contour Plot of Meal Glucose Levels')
plt.xlabel('Time Points')
plt.ylabel('Meal Samples')
plt.show()
<Figure size 1500x1000 with 2 Axes>

Combine Patient 1 and Patient 2 Data

cleaned_imputed_meal_matrix = \
    np.vstack((cleaned_imputed_meal_matrix_1, cleaned_imputed_meal_matrix_2))
cleaned_imputed_no_meal_matrix = \
    np.vstack((cleaned_imputed_no_meal_matrix_1, cleaned_imputed_no_meal_matrix_2))

Feature Extraction

Define Feature Extractor Helper Functions

def compute_first_derivative(glucose_data, delta_t=5):
    return np.diff(glucose_data) / delta_t
def compute_second_derivative(glucose_values, delta_t=5):
    second_derivative = np.zeros(len(glucose_values) - 2)
    for i in range(1, len(glucose_values) - 1):
        second_derivative[i - 1] = \
            (glucose_values[i + 1] - 2 * glucose_values[i] + glucose_values[i - 1]) / (delta_t ** 2)
    return second_derivative
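compute_second_derivative applies the standard central-difference approximation at each interior sample, with Δt = 5 minutes between consecutive CGM readings:

$$g''(t_i) \approx \frac{g_{i+1} - 2\,g_i + g_{i-1}}{\Delta t^{2}}$$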
compute_first_derivative(cleaned_imputed_meal_matrix[0])
array([-0.2 , -1. , -1. , -0.8 , -0.6 , 0. , -0.4 , -0.25714286, 6.25714286, -4.4 , -5.4 , -1.6 , -1.6 , 1.4 , 1.8 , 1. , 0.6 , -1.4 , -2. , -4.8 , 1.8 , 2.2 , 2. , -0.62857143, 3.42857143, -0.2 , -1.2 , -0.4 , -0.8 ])
compute_second_derivative(cleaned_imputed_meal_matrix[0])
array([-0.16 , 0. , 0.04 , 0.04 , 0.12 , -0.08 , 0.02857143, 1.30285714, -2.13142857, -0.2 , 0.76 , 0. , 0.6 , 0.08 , -0.16 , -0.08 , -0.4 , -0.12 , -0.56 , 1.32 , 0.08 , -0.04 , -0.52571429, 0.81142857, -0.72571429, -0.2 , 0.16 , -0.08 ])
def compute_AUC(glucose_values, delta_t=5):
    return simpson(glucose_values, dx=delta_t)
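compute_AUC integrates each glucose curve with SciPy's simpson, i.e. composite Simpson's rule applied to the evenly spaced 5-minute samples (SciPy handles the leftover interval internally when the number of intervals is odd). For an even number of intervals the rule is

$$\mathrm{AUC} \approx \int g(t)\,dt \approx \frac{\Delta t}{3}\bigl(g_0 + 4g_1 + 2g_2 + \cdots + 4g_{n-1} + g_n\bigr)$$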

Define Feature Extractor Function

def feature_extractor(data_matrix):

    features = []

    for row in data_matrix:
        mean_glucose = np.mean(row)
        std_glucose  = np.std(row)
        max_glucose  = np.max(row)
        min_glucose  = np.min(row)
        
        first_derivative = compute_first_derivative(row)
        mean_first_derivative = np.mean(first_derivative)

        if len(row) > 2:
            second_derivative = compute_second_derivative(row)
            mean_second_derivative = np.mean(second_derivative)
        else:
            mean_second_derivative = 0

        AUC = compute_AUC(row)

        time_to_peak = np.argmax(row)

        features_vector = [
            mean_glucose,
            std_glucose,
            max_glucose,
            min_glucose,
            mean_first_derivative,
            mean_second_derivative,
            AUC,
            time_to_peak
        ]

        features.append(features_vector)
    
    return np.array(features)

Get Feature Matrix for Cleaned Imputed Meal Data

meal_features_matrix = feature_extractor(cleaned_imputed_meal_matrix)
meal_features_matrix[0]
array([ 8.98190476e+01, 1.91175360e+01, 1.31000000e+02, 4.90000000e+01, -2.82758621e-01, -4.28571429e-03, 1.30310714e+04, 9.00000000e+00])

Get Feature Matrix for Cleaned Imputed No Meal Data

no_meal_features_matrix = feature_extractor(cleaned_imputed_no_meal_matrix)
no_meal_features_matrix[0]
array([ 9.11607143e+01, 2.10808822e+01, 1.31000000e+02, 4.90000000e+01, -3.65217391e-01, 2.00000000e-02, 1.04699405e+04, 9.00000000e+00])

Visualize The Meal Features Matrix

Normalize The Data

scaler = StandardScaler()
scaled_meal_features_matrix = scaler.fit_transform(meal_features_matrix)
scaled_no_meal_features_matrix = scaler.fit_transform(no_meal_features_matrix)
scaled_meal_features_matrix[0]
array([-1.43931653, -0.6890434 , -1.43186426, -1.35409331, -0.73789252, -0.2568468 , -1.42698567, -0.72547238])
scaled_no_meal_features_matrix[0]
array([-1.24861858, -0.24758492, -1.16121642, -1.39218291, -0.66468096, 1.06439767, -1.24077771, -0.41467362])

Plot Features for all Rows

meal_features_matrix.shape[0]
944
X = np.arange(
        start=0,
        stop=meal_features_matrix.shape[0]
    )

Plot Max, Mean, Min, and Std Dev for All Rows

Y_mean = meal_features_matrix[:, 0] # First column
Y_mean_peaks, _ = find_peaks(Y_mean, height=20)

Y_mean_peaks_times = X[Y_mean_peaks]
Y_mean_peaks_values = Y_mean[Y_mean_peaks]

window_size = 3
Y_mean_peaks_moving_avg = np.convolve(Y_mean_peaks_values, np.ones(window_size)/window_size, mode='valid')
Y_std = meal_features_matrix[:, 1] # Second column
Y_max = meal_features_matrix[:, 2] # Third column
Y_max_peaks, _ = find_peaks(Y_max, height=20)

Y_max_peaks_times = X[Y_max_peaks]
Y_max_peaks_values = Y_max[Y_max_peaks]

window_size = 3
Y_max_peaks_moving_avg = np.convolve(Y_max_peaks_values, np.ones(window_size)/window_size, mode='valid')
Y_max_peaks_times.shape
(289,)
Y_max_peaks_moving_avg.shape
(287,)
Y_min = meal_features_matrix[:, 3] # Fourth column
Y_min_peaks, _ = find_peaks(Y_min, height=20)

Y_min_peaks_times = X[Y_min_peaks]
Y_min_peaks_values = Y_min[Y_min_peaks]

window_size = 3
Y_min_peaks_moving_avg = np.convolve(Y_min_peaks_values, np.ones(window_size)/window_size, mode='valid')
Y_min_peaks_moving_avg.shape
(281,)
# Create the plot
fig = plt.figure(figsize=(18, 6))
ax = fig.add_subplot(111)
stride = 5

# Plot the features
#ax.plot(X[::stride], Y_max[::stride], 'x', label='Max', color='red')
ax.plot(Y_max_peaks_times[:len(Y_max_peaks_moving_avg)], Y_max_peaks_moving_avg, color='red', label='Moving Average of Max', linewidth=2)

#ax.plot(X[::stride], Y_mean[::stride], label='Mean', color='blue')
ax.plot(Y_mean_peaks_times[:len(Y_mean_peaks_moving_avg)], Y_mean_peaks_moving_avg, color='blue', label='Moving Average of Mean', linewidth=2)

#ax.plot(X[::stride], Y_min[::stride], 'x', label='Min')
ax.plot(Y_min_peaks_times[:len(Y_min_peaks_moving_avg)], Y_min_peaks_moving_avg, color='green', label='Moving Average of Min', linewidth=2)

ax.plot(X[::stride], Y_std[::stride], label='Std Dev')

ax.set_title('Meal Glucose Level Features vs Data Row Number', fontsize=20)
ax.set_xlabel('Data Row Number [arbitrary] ', fontsize=16)
ax.set_ylabel('Glucose Level [mg/dL]', fontsize=16)

ax.set_xticks(np.linspace(0, 1000, 11))
ax.set_yticks(np.linspace(0, 400, 7))
ax.set_xlim(-50, 1000)
ax.set_ylim(0, 420)

#ax.text(300, 380, f'Viewing every {stride} rows')

# Adjust axis limits to create a zoom-out effect
#ax.set_xlim(-30, 120)
#ax.set_ylim(0, 800)

#plt.tight_layout()

plt.legend()

plt.show()
<Figure size 1800x600 with 1 Axes>

Plot First and Second Derivatives

Y_mean_first_derivative = meal_features_matrix[:, 4] # Fifth column (mean first derivative)
Y_mean_first_derivative_peaks, _ = find_peaks(Y_mean_first_derivative, height=None)

Y_mean_first_derivative_times = X[Y_mean_first_derivative_peaks]
Y_mean_first_derivative_values = Y_mean_first_derivative[Y_mean_first_derivative_peaks]

window_size = 3
Y_mean_first_derivative_peaks_moving_avg = \
    np.convolve(
        Y_mean_first_derivative_values,
        np.ones(window_size)/window_size, mode='valid'
    )
Y_mean_2nd_derivative = meal_features_matrix[:, 5] # Sixth column (mean second derivative)
Y_mean_2nd_derivative_peaks, _ = find_peaks(Y_mean_2nd_derivative, height=None)

Y_mean_2nd_derivative_times = X[Y_mean_2nd_derivative_peaks]
Y_mean_2nd_derivative_values = Y_mean_2nd_derivative[Y_mean_2nd_derivative_peaks]

window_size = 3
Y_mean_2nd_derivative_peaks_moving_avg = \
    np.convolve(
        Y_mean_2nd_derivative_values,
        np.ones(window_size)/window_size, mode='valid'
    )
# Create the plot
fig = plt.figure(figsize=(18, 6))
ax = fig.add_subplot(111)
stride = 5

# Plot the features
#ax.plot(X[::stride], Y_max[::stride], 'x', label='Max', color='red')
#ax.plot(
#    Y_max_peaks_times[:len(Y_max_peaks_moving_avg)], 
#    Y_max_peaks_moving_avg, 
#    color='red', 
#    label='Moving Average of Max', 
#    linewidth=2)

#ax.plot(X[::stride], Y_mean[::stride], label='Mean', color='blue')
ax.plot(
    Y_mean_peaks_times[:len(Y_mean_peaks_moving_avg)], 
    Y_mean_peaks_moving_avg, 
    color='blue', 
    label='Moving Average of Mean', 
    linewidth=2
)

#ax.plot(X[::stride], Y_min[::stride], 'x', label='Min')
#ax.plot(Y_min_peaks_times[:len(Y_min_peaks_moving_avg)], Y_min_peaks_moving_avg, color='green', label='Moving Average of Min', linewidth=2)

#ax.plot(X[::stride], Y_std[::stride], label='Std Dev')

ax.plot(
    Y_mean_first_derivative_times[:len(Y_mean_first_derivative_peaks_moving_avg)],
    Y_mean_first_derivative_peaks_moving_avg*100,
    color='orange',
    label=r'First Derivative $\times 100$',
    linewidth=2
)

ax.plot(
    Y_mean_2nd_derivative_times[:len(Y_mean_2nd_derivative_peaks_moving_avg)],
    Y_mean_2nd_derivative_peaks_moving_avg*2500,
    color='green',
    label=r'Second Derivative $\times 2500$',
    linewidth=2
)

ax.set_title('Glucose Level Features vs Data Row Number')
ax.set_xlabel('Data Row Number [arbitrary] ')
ax.set_ylabel('Glucose Level [mg/dL]')

#ax.set_xticks(np.linspace(0, 1000, 11))
#ax.set_yticks(np.linspace(0, 400, 7))
#ax.set_xlim(-50, 1000)
#ax.set_ylim(0, 420)

#ax.text(300, 380, f'Viewing every {stride} rows')

# Adjust axis limits to create a zoom-out effect
#ax.set_xlim(-30, 120)
#ax.set_ylim(0, 800)

#plt.tight_layout()

plt.legend()

plt.show()
<Figure size 1800x600 with 1 Axes>

Pairwise Plot of Meal Features Matrix

# Generate column labels for the features
feature_columns = [f'Feature_{i+1}' for i in range(meal_features_matrix.shape[1])]

# Convert the feature matrix to a Pandas DataFrame
df = pd.DataFrame(meal_features_matrix, columns=feature_columns)

# Create a pairwise plot using Seaborn
plt.figure(figsize=(15, 15))  # Adjust figure size as needed
sns.pairplot(df, diag_kind='hist', plot_kws={'alpha': 0.5, 's': 40})

plt.suptitle('Pairwise Plot of Meal Features Matrix', y=1.02, fontsize=30)

# Show the plot
plt.show()
<Figure size 1500x1500 with 0 Axes>
<Figure size 2000x2000 with 72 Axes>

Visualize The No Meal Features Matrix

Normalize The Data

scaler = StandardScaler()
scaled_meal_features_matrix = scaler.fit_transform(meal_features_matrix)
scaled_no_meal_features_matrix = scaler.fit_transform(no_meal_features_matrix)
scaled_meal_features_matrix[0]
array([-1.43931653, -0.6890434 , -1.43186426, -1.35409331, -0.73789252, -0.2568468 , -1.42698567, -0.72547238])
scaled_no_meal_features_matrix[0]
array([-1.24861858, -0.24758492, -1.16121642, -1.39218291, -0.66468096, 1.06439767, -1.24077771, -0.41467362])

Plot Features for all Rows

no_meal_features_matrix.shape[0]
752
X = np.arange(
        start=0,
        stop=no_meal_features_matrix.shape[0]
    )

Plot Max, Mean, Min, and Std Dev for All Rows

Y_mean = no_meal_features_matrix[:, 0] # First column
Y_mean_peaks, _ = find_peaks(Y_mean, height=20)

Y_mean_peaks_times = X[Y_mean_peaks]
Y_mean_peaks_values = Y_mean[Y_mean_peaks]

window_size = 3
Y_mean_peaks_moving_avg = np.convolve(Y_mean_peaks_values, np.ones(window_size)/window_size, mode='valid')
Y_std = no_meal_features_matrix[:, 1] # Second column
Y_max = no_meal_features_matrix[:, 2] # Third column
Y_max_peaks, _ = find_peaks(Y_max, height=20)

Y_max_peaks_times = X[Y_max_peaks]
Y_max_peaks_values = Y_max[Y_max_peaks]

window_size = 3
Y_max_peaks_moving_avg = np.convolve(Y_max_peaks_values, np.ones(window_size)/window_size, mode='valid')
Y_max_peaks_times.shape
(238,)
Y_max_peaks_moving_avg.shape
(236,)
Y_min = no_meal_features_matrix[:, 3] # Fourth column
Y_min_peaks, _ = find_peaks(Y_min, height=20)

Y_min_peaks_times = X[Y_min_peaks]
Y_min_peaks_values = Y_min[Y_min_peaks]

window_size = 3
Y_min_peaks_moving_avg = np.convolve(Y_min_peaks_values, np.ones(window_size)/window_size, mode='valid')
Y_min_peaks_moving_avg.shape
(238,)
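Since uniform_filter1d was imported earlier but not used, the same smoothing could also be done with it; this is an alternative sketch rather than the approach used in the plots below. Unlike np.convolve with mode='valid', it keeps the output the same length as the input by extending the edges, so no trimming against the peak times is needed.

# Alternative smoothing sketch using scipy.ndimage.uniform_filter1d (same window size)
Y_min_peaks_smoothed = uniform_filter1d(Y_min_peaks_values, size=window_size)
Y_min_peaks_smoothed.shape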
# Create the plot
fig = plt.figure(figsize=(18, 6))
ax = fig.add_subplot(111)
stride = 5

# Plot the features
#ax.plot(X[::stride], Y_max[::stride], 'x', label='Max', color='red')
ax.plot(Y_max_peaks_times[:len(Y_max_peaks_moving_avg)], Y_max_peaks_moving_avg, color='red', label='Moving Average of Max', linewidth=2)

#ax.plot(X[::stride], Y_mean[::stride], label='Mean', color='blue')
ax.plot(Y_mean_peaks_times[:len(Y_mean_peaks_moving_avg)], Y_mean_peaks_moving_avg, color='blue', label='Moving Average of Mean', linewidth=2)

#ax.plot(X[::stride], Y_min[::stride], 'x', label='Min')
ax.plot(Y_min_peaks_times[:len(Y_min_peaks_moving_avg)], Y_min_peaks_moving_avg, color='green', label='Moving Average of Min', linewidth=2)

ax.plot(X[::stride], Y_std[::stride], label='Std Dev')

ax.set_title('No Meal Glucose Level Features vs Data Row Number', fontsize=20)
ax.set_xlabel('Data Row Number [arbitrary] ', fontsize=16)
ax.set_ylabel('Glucose Level [mg/dL]', fontsize=16)

ax.set_xticks(np.linspace(0, 800, 11))
ax.set_yticks(np.linspace(0, 400, 7))
ax.set_xlim(-50, 800)
ax.set_ylim(0, 420)

#ax.text(300, 380, f'Viewing every {stride} rows')

# Adjust axis limits to create a zoom-out effect
#ax.set_xlim(-30, 120)
#ax.set_ylim(0, 800)

#plt.tight_layout()

plt.legend()

plt.show()
<Figure size 1800x600 with 1 Axes>

Pairwise Plot of No Meal Features Matrix

# Generate column labels for the features
feature_columns = [f'Feature_{i+1}' for i in range(no_meal_features_matrix.shape[1])]

# Convert the feature matrix to a Pandas DataFrame
df = pd.DataFrame(no_meal_features_matrix, columns=feature_columns)

# Create a pairwise plot using Seaborn (pairplot creates and sizes its own figure,
# so a separate plt.figure call is not needed)
sns.pairplot(df, diag_kind='hist', plot_kws={'alpha': 0.5, 's': 40})

plt.suptitle('Pairwise Plot of No Meal Features Matrix', y=1.02, fontsize=30)

# Show the plot
plt.show()
<Figure size 2000x2000 with 72 Axes>

Concatenate Feature Matrices and Add Class Labels

meal_labels = np.ones((meal_features_matrix.shape[0], 1))
len(meal_labels)
944
meal_features_matrix_with_labels = \
    np.hstack((meal_features_matrix, meal_labels))
#meal_features_matrix_with_labels[1]
no_meal_labels = np.zeros((no_meal_features_matrix.shape[0], 1))
len(no_meal_labels)
752
no_meal_features_matrix_with_labels = \
    np.hstack((no_meal_features_matrix, no_meal_labels))
#no_meal_features_matrix_with_labels[1]
concat_features = \
    np.vstack((meal_features_matrix_with_labels, no_meal_features_matrix_with_labels))
len(concat_features)
1696
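As a quick check on class balance (an illustrative addition), the label counts in the last column should match the 944 meal and 752 no-meal rows above:

# Count samples per class (1.0 = meal, 0.0 = no meal)
labels, counts = np.unique(concat_features[:, -1], return_counts=True)
print(dict(zip(labels, counts)))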

Train Support Vector Machine

Use Standard Variable Names for Features and Class Label

X = concat_features[:, :-1]
y = concat_features[:, -1]
X[0]
array([ 8.98190476e+01, 1.91175360e+01, 1.31000000e+02, 4.90000000e+01, -2.82758621e-01, -4.28571429e-03, 1.30310714e+04, 9.00000000e+00])
y[0]
np.float64(1.0)

Train / Test Split

X_train, X_test, y_train, y_test = \
    train_test_split(X, y, test_size = 0.2, random_state=40)
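Because the classes are mildly imbalanced (944 meal vs. 752 no-meal rows), a stratified split is an option worth noting; the sketch below shows that variant, though it is not what was used for the results that follow.

# Optional variant: stratify on y so train and test keep the same meal / no-meal ratio
X_train_s, X_test_s, y_train_s, y_test_s = \
    train_test_split(X, y, test_size=0.2, random_state=40, stratify=y)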

Normalize The Data

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
# Scale the test set with the statistics learned from the training set
# (transform, not fit_transform, so the scaler is not refit on test data)
X_test_scaled = scaler.transform(X_test)

Train SVM Model

svm_model = SVC(kernel='linear', C=0.1)
svm_model.fit(X_train_scaled, y_train)
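With a linear kernel the fitted SVC exposes one weight per (standardized) feature, so the coefficients give a rough sense of which features drive the decision boundary; this inspection is added for illustration.

# One weight per standardized feature (linear kernel only), plus the bias term
print(svm_model.coef_)
print(svm_model.intercept_)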

Test SVM Model

y_pred = svm_model.predict(X_test_scaled)

Obtain Accuracy of SVM Model

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")
Accuracy: 99.71%

Obtain Training and Test Accuracy of SVM Model

train_accuracy = svm_model.score(X_train_scaled, y_train)
test_accuracy = svm_model.score(X_test_scaled, y_test)
print(f"Training Accuracy: {train_accuracy * 100:.2f}%")
print(f"Test Accuracy: {test_accuracy * 100:.2f}%")
Training Accuracy: 100.00%
Test Accuracy: 99.71%

Obtain F1 Score of SVM Model

f1 = f1_score(y_test, y_pred)
print(f"F1 Score: {f1:.2f}")
F1 Score: 1.00

Cross Validate SVM Model

scores = cross_val_score(svm_model, X_train_scaled, y_train, cv=5, scoring='accuracy')
print(f"Cross-Validation Accuracy: {scores.mean():.2f} (+/- {scores.std():.2f})")
Cross-Validation Accuracy: 1.00 (+/- 0.00)
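One caveat: X_train_scaled was standardized on the full training set before the folds were formed, so every validation fold has already influenced the scaling statistics. A sketch of a leakage-free alternative is to wrap the scaler and the SVM in a Pipeline so the scaler is refit inside each fold:

from sklearn.pipeline import Pipeline

# Refit the scaler inside each fold so validation data never influences the scaling
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('svm', SVC(kernel='linear', C=0.1))
])
pipe_scores = cross_val_score(pipe, X_train, y_train, cv=5, scoring='accuracy')
print(f"Pipeline CV Accuracy: {pipe_scores.mean():.2f} (+/- {pipe_scores.std():.2f})")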

Visualize Results of SVM Model

Scatter Plot with PCA

# Reduce dimensions using PCA
pca = PCA(n_components=2)

X_reduced = pca.fit_transform(X_test_scaled)

components = pca.components_
print(components)

# Get the predicted labels
predicted_labels = svm_model.predict(X_test_scaled)

# Create boolean masks for correct and incorrect predictions
correct_mask = (predicted_labels == y_test)
incorrect_mask = ~correct_mask

# Create a figure
plt.figure(figsize=(10, 6))

# Plot correct predictions in green
plt.scatter(X_reduced[correct_mask, 0], X_reduced[correct_mask, 1], color='green',
            marker='o', alpha=0.7, s=60, label='Correct Prediction')

# Plot incorrect predictions in red
plt.scatter(X_reduced[incorrect_mask, 0], X_reduced[incorrect_mask, 1], color='red',
            marker='x', alpha=0.7, s=60, label='Incorrect Prediction')

# Set labels and title
plt.xlabel('PCA Component 1', fontsize=20)
plt.ylabel('PCA Component 2', fontsize=20)
plt.title('SVM Classification Results: Correct vs Incorrect Predictions on Test Data', fontsize=18)

# Add a legend
plt.legend()

# Display the plot
plt.tight_layout()
plt.show()
[[ 0.51111341  0.13118439  0.47327076  0.4251986   0.00643744 -0.257121
   0.50031459  0.01753109]
 [-0.03692307  0.31993471  0.1095569  -0.19047139  0.66528982  0.06390289
   0.01416038  0.63332595]]
<Figure size 1000x600 with 1 Axes>
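To judge how much of the structure the 2-D projection preserves, the explained variance of the two components can be checked (an illustrative addition):

# Fraction of total variance captured by each PCA component and in total
print(pca.explained_variance_ratio_)
print(f"Total variance explained: {pca.explained_variance_ratio_.sum():.2%}")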

Pairwise Plotting of Feature vs Feature

df = pd.DataFrame(X_test_scaled, columns=[f'Feature {i+1}' for i in range(X_test_scaled.shape[1])])
df['Predicted Class'] = svm_model.predict(X_test_scaled)

# Plot pairwise relationships
sns.pairplot(df, hue='Predicted Class', palette='viridis')
plt.suptitle('Pair Plot of SVM Classification Results', y=1.02, fontsize=30)
plt.show()
<Figure size 2117.49x2000 with 72 Axes>

Heatmap of Confusion Matrix

# Get the predicted classes
y_pred = svm_model.predict(X_test_scaled)

# Create the confusion matrix
cm = confusion_matrix(y_test, y_pred)

# Plot the heatmap of the confusion matrix
sns.heatmap(cm, annot=True, cmap='Blues', fmt='d')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix Heatmap of Test Data')
plt.show()


# Print accuracy, recall, precision, and F1 score
accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f'Accuracy: {accuracy:.4f}')
print(f'Recall: {recall:.4f}')
print(f'Precision: {precision:.4f}')
print(f'F1 Score: {f1:.4f}')
<Figure size 640x480 with 2 Axes>
Accuracy: 0.9971
Recall: 0.9950
Precision: 1.0000
F1 Score: 0.9975

Dump Model to Pickle File

with open('svm_model.pkl', 'wb') as f:
    pickle.dump(svm_model, f)
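Because the SVM was trained on standardized features, any later prediction script needs the fitted StandardScaler as well. One option, sketched below, is to pickle the scaler next to the model and apply it before predicting; new_feature_matrix is a hypothetical placeholder for whatever feature matrix the test-phase code builds.

# Sketch: persist the fitted scaler, since the model expects standardized inputs
with open('scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)

# Later, at prediction time (illustrative; new_feature_matrix is hypothetical):
with open('svm_model.pkl', 'rb') as f:
    loaded_model = pickle.load(f)
with open('scaler.pkl', 'rb') as f:
    loaded_scaler = pickle.load(f)
# predictions = loaded_model.predict(loaded_scaler.transform(new_feature_matrix))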