"""
Created on Mon Apr 15 19:43:04 2019
Updated on Wed Jan 29 10:18:09 2020
@author: created by Sowmya Myneni and updated by Dijiang Huang
"""'\nCreated on Mon Apr 15 19:43:04 2019\n\nUpdated on Wed Jan 29 10:18:09 2020\n\n@author: created by Sowmya Myneni and updated by Dijiang Huang\n'import matplotlib.pyplot as plt
import tensorflow as tf
print("Num GPUs Available:", len(tf.config.list_physical_devices('GPU')))Num GPUs Available: 0
import os
import multiprocessing
# Number of logical CPUs
num_cores = multiprocessing.cpu_count()
print(f"Number of CPU cores available: {num_cores}")Number of CPU cores available: 12
# Configure TensorFlow's thread pools (0 lets TensorFlow pick a value based on the available cores)
tf.config.threading.set_intra_op_parallelism_threads(0)  # threads used within individual operations
tf.config.threading.set_inter_op_parallelism_threads(0)  # threads used to run independent operations in parallel

FNN Sample SC
########################################
# Part 1 - Data Pre-Processing
#######################################
# To load a dataset file in Python, you can use Pandas. Import pandas using the line below
import pandas as pd
# Import numpy to perform operations on the dataset
import numpy as np

Variable Setup
# Variable Setup
# Available datasets: KDDTrain+.txt, KDDTest+.txt, etc. For details, read Data Set Introduction.html in the NSL-KDD dataset folder.
# Type the training dataset file name between the quotes:
#TrainingDataPath='NSL-KDD/'
#TrainingData='KDDTrain+_20Percent.txt'
# Batch Size
BatchSize=10
# Epoch Size
NumEpoch=10
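With these settings, each epoch sweeps the full training set in mini-batches of 10. A quick sanity check (a sketch; the 124,926-row count comes from the training file loaded below):
# Hypothetical check: number of gradient updates performed per epoch at this batch size
import math
steps_per_epoch = math.ceil(124926 / BatchSize)
print(f"Gradient updates per epoch: {steps_per_epoch}")  # ceil(124926 / 10) = 12493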
Import Training Dataset
# Import the dataset.
# The dataset file name is given in the TrainingData variable. You can replace it with a full
# file path such as 'C:\Users\...\dataset.csv'. The file can be a .txt as well.
# If the dataset file has a header row, keep header=0; otherwise use header=None.
# reference: https://www.shanelynn.ie/select-pandas-dataframe-rows-and-columns-using-iloc-loc-and-ix/
dataset_train = pd.read_csv('Training-a1-a2.csv', header=None)
dataset_train.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 124926 entries, 0 to 124925
Data columns (total 43 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 0 124926 non-null int64
1 1 124926 non-null object
2 2 124926 non-null object
3 3 124926 non-null object
4 4 124926 non-null int64
5 5 124926 non-null int64
6 6 124926 non-null int64
7 7 124926 non-null int64
8 8 124926 non-null int64
9 9 124926 non-null int64
10 10 124926 non-null int64
11 11 124926 non-null int64
12 12 124926 non-null int64
13 13 124926 non-null int64
14 14 124926 non-null int64
15 15 124926 non-null int64
16 16 124926 non-null int64
17 17 124926 non-null int64
18 18 124926 non-null int64
19 19 124926 non-null int64
20 20 124926 non-null int64
21 21 124926 non-null int64
22 22 124926 non-null int64
23 23 124926 non-null int64
24 24 124926 non-null float64
25 25 124926 non-null float64
26 26 124926 non-null float64
27 27 124926 non-null float64
28 28 124926 non-null float64
29 29 124926 non-null float64
30 30 124926 non-null float64
31 31 124926 non-null int64
32 32 124926 non-null int64
33 33 124926 non-null float64
34 34 124926 non-null float64
35 35 124926 non-null float64
36 36 124926 non-null float64
37 37 124926 non-null float64
38 38 124926 non-null float64
39 39 124926 non-null float64
40 40 124926 non-null float64
41 41 124926 non-null object
42 42 124926 non-null int64
dtypes: float64(15), int64(24), object(4)
memory usage: 41.0+ MB
dataset_train.describe()
# Identify the column that contains the attack types.
# Based on the sample data ('neptune', 'normal', etc.), the second-to-last column holds the attack label.
attack_column = dataset_train.columns[-2]
# Calculate the frequency of each attack type
attack_distribution = dataset_train[attack_column].value_counts()
plt.figure(figsize=(10, 6))
attack_distribution.plot(kind='bar', logy=True)
plt.title('A1 and A2 Training Dataset Distribution of Attack Types')
plt.xlabel('Attack Type')
plt.ylabel('Frequency')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
X_train = dataset_train.iloc[:, 0:-2].values
X_train
array([[0, 'tcp', 'ftp_data', ..., 0.0, 0.05, 0.0],
       [0, 'udp', 'other', ..., 0.0, 0.0, 0.0],
       [0, 'tcp', 'private', ..., 1.0, 0.0, 0.0],
       ...,
       [0, 'tcp', 'smtp', ..., 0.0, 0.01, 0.0],
       [0, 'tcp', 'klogin', ..., 1.0, 0.0, 0.0],
       [0, 'tcp', 'ftp_data', ..., 0.0, 0.0, 0.0]], dtype=object)
X_train.shape
(124926, 41)
label_column_train = dataset_train.iloc[:, -2].values
label_column_train
array(['normal', 'normal', 'neptune', ..., 'normal', 'neptune', 'normal'],
      dtype=object)
y_train = []
for i in range(len(label_column_train)):
    if label_column_train[i] == 'normal':
        y_train.append(0)
    else:
        y_train.append(1)
# Convert list to array
y_train = np.array(y_train)
y_train
array([0, 0, 1, ..., 0, 1, 0])
y_train.shape
(124926,)
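The loop above is easy to follow; an equivalent vectorized version (a sketch using NumPy, not part of the original flow) avoids the explicit Python loop:
# Vectorized alternative: 0 for 'normal' traffic, 1 for any attack label
y_train_vec = np.where(label_column_train == 'normal', 0, 1)
assert (y_train_vec == y_train).all()  # agrees with the loop-built array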
Import Testing Dataset
# Import the dataset.
# The dataset file name is given in the TrainingData variable. You can replace it with a full
# file path such as 'C:\Users\...\dataset.csv'. The file can be a .txt as well.
# If the dataset file has a header row, keep header=0; otherwise use header=None.
# reference: https://www.shanelynn.ie/select-pandas-dataframe-rows-and-columns-using-iloc-loc-and-ix/
dataset_test = pd.read_csv('Testing-a1-a2-a3.csv', header=None)
dataset_test.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19659 entries, 0 to 19658
Data columns (total 43 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 0 19659 non-null int64
1 1 19659 non-null object
2 2 19659 non-null object
3 3 19659 non-null object
4 4 19659 non-null int64
5 5 19659 non-null int64
6 6 19659 non-null int64
7 7 19659 non-null int64
8 8 19659 non-null int64
9 9 19659 non-null int64
10 10 19659 non-null int64
11 11 19659 non-null int64
12 12 19659 non-null int64
13 13 19659 non-null int64
14 14 19659 non-null int64
15 15 19659 non-null int64
16 16 19659 non-null int64
17 17 19659 non-null int64
18 18 19659 non-null int64
19 19 19659 non-null int64
20 20 19659 non-null int64
21 21 19659 non-null int64
22 22 19659 non-null int64
23 23 19659 non-null int64
24 24 19659 non-null float64
25 25 19659 non-null float64
26 26 19659 non-null float64
27 27 19659 non-null float64
28 28 19659 non-null float64
29 29 19659 non-null float64
30 30 19659 non-null float64
31 31 19659 non-null int64
32 32 19659 non-null int64
33 33 19659 non-null float64
34 34 19659 non-null float64
35 35 19659 non-null float64
36 36 19659 non-null float64
37 37 19659 non-null float64
38 38 19659 non-null float64
39 39 19659 non-null float64
40 40 19659 non-null float64
41 41 19659 non-null object
42 42 19659 non-null int64
dtypes: float64(15), int64(24), object(4)
memory usage: 6.4+ MB
dataset_test.describe()
# Identify the column that contains the attack types.
# Based on the sample data ('neptune', 'normal', etc.), the second-to-last column holds the attack label.
attack_column = dataset_test.columns[-2]
# Calculate the frequency of each attack type
attack_distribution = dataset_test[attack_column].value_counts()
plt.figure(figsize=(10, 6))
attack_distribution.plot(kind='bar', logy=True)
plt.title('A1, A2 and A3 Testing Dataset Distribution of Attack Types')
plt.xlabel('Attack Type')
plt.ylabel('Frequency')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
X_test = dataset_test.iloc[:, 0:-2].values
X_test
array([[0, 'tcp', 'private', ..., 0.0, 1.0, 1.0],
       [0, 'tcp', 'private', ..., 0.0, 1.0, 1.0],
       [2, 'tcp', 'ftp_data', ..., 0.0, 0.0, 0.0],
       ...,
       [0, 'tcp', 'http', ..., 0.0, 0.07, 0.07],
       [0, 'udp', 'domain_u', ..., 0.0, 0.0, 0.0],
       [0, 'tcp', 'sunrpc', ..., 0.0, 0.44, 1.0]], dtype=object)
X_test.shape
(19659, 41)
label_column_test = dataset_test.iloc[:, -2].values
label_column_test
array(['neptune', 'neptune', 'normal', ..., 'back', 'normal', 'mscan'],
      dtype=object)
y_test = []
for i in range(len(label_column_test)):
    if label_column_test[i] == 'normal':
        y_test.append(0)
    else:
        y_test.append(1)
# Convert list to array
y_test = np.array(y_test)
y_test
array([1, 1, 0, ..., 1, 0, 1])

Encoding categorical data (convert letters/words into numbers)
# The following code works with Python 3.7 or newer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
ct = ColumnTransformer(
    # The column numbers to be transformed ([1, 2, 3] selects the three categorical columns)
    [('one_hot_encoder', OneHotEncoder(handle_unknown='ignore'), [1, 2, 3])],
    # Leave the rest of the columns untouched
    remainder='passthrough'
)
X_train = np.array(ct.fit_transform(X_train), dtype=np.float64)
X_test = np.array(ct.transform(X_test), dtype=np.float64)
X_train
array([[0.  , 1.  , 0.  , ..., 0.  , 0.05, 0.  ],
       [0.  , 0.  , 1.  , ..., 0.  , 0.  , 0.  ],
       [0.  , 1.  , 0.  , ..., 1.  , 0.  , 0.  ],
       ...,
       [0.  , 1.  , 0.  , ..., 0.  , 0.01, 0.  ],
       [0.  , 1.  , 0.  , ..., 1.  , 0.  , 0.  ],
       [0.  , 1.  , 0.  , ..., 0.  , 0.  , 0.  ]])
X_train[0]
array([0.00e+00, 1.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00,
       0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00,
       0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00,
       0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 1.00e+00,
       0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00,
       0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00,
       0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00,
       0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00,
       0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00,
       0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00,
       0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00,
       0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00,
       0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00,
       0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 1.00e+00, 0.00e+00,
       0.00e+00, 4.91e+02, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00,
       0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00,
       0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00,
       0.00e+00, 2.00e+00, 2.00e+00, 0.00e+00, 0.00e+00, 0.00e+00,
       0.00e+00, 1.00e+00, 0.00e+00, 0.00e+00, 1.50e+02, 2.50e+01,
       1.70e-01, 3.00e-02, 1.70e-01, 0.00e+00, 0.00e+00, 0.00e+00,
       5.00e-02, 0.00e+00])
X_train.shape
(124926, 122)
X_test
array([[0.  , 1.  , 0.  , ..., 0.  , 1.  , 1.  ],
       [0.  , 1.  , 0.  , ..., 0.  , 1.  , 1.  ],
       [0.  , 1.  , 0.  , ..., 0.  , 0.  , 0.  ],
       ...,
       [0.  , 1.  , 0.  , ..., 0.  , 0.07, 0.07],
       [0.  , 0.  , 1.  , ..., 0.  , 0.  , 0.  ],
       [0.  , 1.  , 0.  , ..., 0.  , 0.44, 1.  ]])
X_test.shape
(19659, 122)
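One-hot encoding expanded the 3 categorical columns into 122 - 38 = 84 indicator columns. To see what the encoder learned, you can query the fitted ColumnTransformer (a sketch; 'one_hot_encoder' is the transformer name assigned above):
# Inspect the categories discovered for each encoded column
encoder = ct.named_transformers_['one_hot_encoder']
for col, cats in zip([1, 2, 3], encoder.categories_):
    print(f"Column {col}: {len(cats)} categories, e.g. {list(cats[:3])}")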
Perform feature scaling
# Perform feature scaling. For an ANN you can use StandardScaler; for RNNs MinMaxScaler is recommended.
# reference: https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html
#            https://scikit-learn.org/stable/modules/preprocessing.html
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)  # standardize each feature to zero mean and unit variance
X_test = sc.transform(X_test)  # reuse the training-set statistics rather than refitting on test data
X_train
array([[-0.26661772,  0.47858359, -0.3692588 , ..., -0.6282041 ,
        -0.225902  , -0.3774467 ],
       [-0.26661772, -2.08949913,  2.70812776, ..., -0.6282041 ,
        -0.38864593, -0.3774467 ],
       [-0.26661772,  0.47858359, -0.3692588 , ...,  1.60987176,
        -0.38864593, -0.3774467 ],
       ...,
       [-0.26661772,  0.47858359, -0.3692588 , ..., -0.6282041 ,
        -0.35609714, -0.3774467 ],
       [-0.26661772,  0.47858359, -0.3692588 , ...,  1.60987176,
        -0.38864593, -0.3774467 ],
       [-0.26661772,  0.47858359, -0.3692588 , ..., -0.6282041 ,
        -0.38864593, -0.3774467 ]])
y_train
array([0, 0, 1, ..., 0, 1, 0])
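The comments above note that MinMaxScaler is the recommendation for RNNs; a minimal sketch of that variant, shown here only to illustrate the API (this notebook keeps StandardScaler):
from sklearn.preprocessing import MinMaxScaler
mms = MinMaxScaler()                     # rescales each feature to the range [0, 1]
X_train_mm = mms.fit_transform(X_train)  # fit the per-feature min/max on training data only
X_test_mm = mms.transform(X_test)        # reuse the training min/max on the test set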
Part 2: Building FNN
########################################
# Part 2: Building FNN
#######################################
# Importing the Keras libraries and packages
import keras
from keras.models import Sequential
from keras.layers import Dense, Input

Initialising the ANN
# Initialising the ANN
# Reference: https://machinelearningmastery.com/tutorial-first-neural-network-python-keras/
classifier = Sequential()
classifier
<Sequential name=sequential, built=False>

Adding the input layer and the first hidden layer (6 nodes; the Input shape specifies the number of variables)
# Adding the input layer with Input() and the first hidden layer: 6 nodes; the Input shape specifies the number of input variables
# rectified linear unit (relu) activation, reference: https://machinelearningmastery.com/rectified-linear-activation-function-for-deep-learning-neural-networks/
classifier.add(Input(shape=(len(X_train[0]),)))  # Input layer (122 features after encoding)
classifier.add(Dense(units=6, kernel_initializer='uniform', activation='relu'))
# Adding the second hidden layer
classifier.add(Dense(units=6, kernel_initializer='uniform', activation='relu'))
# Adding the output layer
classifier.add(Dense(units=1, kernel_initializer='uniform', activation='sigmoid'))
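The resulting network is 122 inputs -> 6 -> 6 -> 1, which is small: layer 1 has 122*6 + 6 = 738 parameters, layer 2 has 6*6 + 6 = 42, and the output layer has 6 + 1 = 7, for 787 trainable parameters in total. A quick check:
print(classifier.count_params())  # expected: 738 + 42 + 7 = 787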
Compiling the ANN
# Compiling the ANN.
# Gradient descent algorithm 'adam', reference: https://machinelearningmastery.com/adam-optimization-algorithm-for-deep-learning/
# 'binary_crossentropy' is the Keras loss for binary classification problems, reference: https://machinelearningmastery.com/how-to-choose-loss-functions-when-training-deep-learning-neural-networks/
classifier.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
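If you also want precision and recall tracked during training, Keras accepts metric objects alongside 'accuracy'. A sketch of an alternative compile call (not used in the rest of this notebook):
classifier.compile(optimizer='adam', loss='binary_crossentropy',
                   metrics=['accuracy',
                            keras.metrics.Precision(name='precision'),
                            keras.metrics.Recall(name='recall')])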
Fitting the ANN to the Training set
# Fitting the ANN to the Training set.
# Train the model so that it learns a good (or good enough) mapping from input rows to output classifications.
# Add verbose=0 to turn off the progress report during training.
# To run the whole training dataset as one batch, set the batch size to BatchSize=X_train.shape[0]
#classifierHistory = classifier.fit(X_train, y_train, batch_size = BatchSize, epochs = NumEpoch)
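A sketch of a training call guarded by a flag, so the saved model below can still be loaded instead; validation_split is a standard fit argument, and the 10% split is an arbitrary choice:
RETRAIN = False  # set to True to train from scratch instead of loading the saved model
if RETRAIN:
    classifierHistory = classifier.fit(X_train, y_train,
                                       batch_size=BatchSize, epochs=NumEpoch,
                                       validation_split=0.1)  # hold out 10% to watch for overfitting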
Save the Fitted Model
#classifier.save("fitted_FNN_model_SC.keras")  # Save the model in the native Keras format
# Save the training history
#import json
#with open("fitted_FNN_model_history_SC.json", "w") as f:
#    json.dump(classifierHistory.history, f)
Load the Model if Necessary
# The previously trained SC model is loaded here instead of retraining.
from keras.models import load_model
classifier = load_model("fitted_FNN_model_SC.keras")  # Load the saved model
# Load the history
import json
with open("fitted_FNN_model_history_SC.json", "r") as f:
    classifierHistory = json.load(f)
classifier.summary()
Evaluate the Keras model on the training and testing datasets
from sklearn.metrics import classification_report

Training Loss and Accuracy
loss_train, accuracy_train = classifier.evaluate(X_train, y_train)
3904/3904 ━━━━━━━━━━━━━━━━━━━━ 176s 45ms/step - accuracy: 0.9899 - loss: 0.0241
print('Print the training loss and the accuracy of the model on the dataset')
print('Loss [0,1]: {0:0.4f} Accuracy [0,1]: {1:0.4f}'.format(loss_train, accuracy_train))
Print the training loss and the accuracy of the model on the dataset
Loss [0,1]: 0.0244 Accuracy [0,1]: 0.9898
Testing Loss and Accuracy
# Note: the testing set (A1, A2, A3) contains the A3 attack type that is absent from the
# training set (A1, A2), so test accuracy is expected to drop relative to training accuracy.
loss_test, accuracy_test = classifier.evaluate(X_test, y_test)
615/615 ━━━━━━━━━━━━━━━━━━━━ 32s 52ms/step - accuracy: 0.8785 - loss: 1.0911
print('Print the testing loss and the accuracy of the model on the dataset')
print('Loss [0,1]: {0:0.4f} Accuracy [0,1]: {1:0.4f}'.format(loss_test, accuracy_test))
Print the testing loss and the accuracy of the model on the dataset
Loss [0,1]: 1.1006 Accuracy [0,1]: 0.8807
Part 3 - Making predictions and evaluating the model
########################################
# Part 3 - Making predictions and evaluating the model
#######################################
# Predicting the Test set results
y_pred = classifier.predict(X_test)
615/615 ━━━━━━━━━━━━━━━━━━━━ 20s 32ms/step
y_pred = (y_pred > 0.9)  # classify as attack (True) only when the predicted probability exceeds 0.9
y_pred.shape
(19659, 1)
y_pred[:5]
array([[ True],
       [ True],
       [False],
       [ True],
       [False]])
y_test[:5]
array([1, 1, 0, 1, 1])
# summarize the first 10 cases
for i in range(10):
    print('{} (expected {})'.format(y_pred[i], y_test[i]))
[ True] (expected 1)
[ True] (expected 1)
[False] (expected 0)
[ True] (expected 1)
[False] (expected 1)
[False] (expected 0)
[False] (expected 0)
[False] (expected 0)
[False] (expected 1)
[False] (expected 0)
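The 0.9 cutoff is stricter than the conventional 0.5, trading recall for precision: any attack scoring between 0.5 and 0.9 is released as normal, which may account for some of the misses above. A sketch comparing the two cutoffs:
# Count predicted attacks at the default and the stricter threshold
y_prob = classifier.predict(X_test, verbose=0).ravel()
for t in (0.5, 0.9):
    print(f"threshold={t}: predicted attacks = {(y_prob > t).sum()}")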
Making the Confusion Matrix
# Making the Confusion Matrix
# [TN, FP ]
# [FN, TP ]
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
cm = confusion_matrix(y_test, y_pred)
print('Print the Confusion Matrix:')
print('[ TN, FP ]')
print('[ FN, TP ]=')
print(cm)
print(classification_report(y_test, y_pred))
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['Normal', 'Attack'])
disp.plot()
plt.savefig('confusion_matrix_SC')
plt.show()
Print the Confusion Matrix:
[ TN, FP ]
[ FN, TP ]=
[[8939 772]
[2178 7770]]
              precision    recall  f1-score   support

           0       0.80      0.92      0.86      9711
           1       0.91      0.78      0.84      9948

    accuracy                           0.85     19659
   macro avg       0.86      0.85      0.85     19659
weighted avg       0.86      0.85      0.85     19659
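Intrusion-detection work often reports a detection rate and false-alarm rate derived from these counts; a sketch computing them from cm (the variable names here are conventional, not from the original notebook):
TN, FP, FN, TP = cm.ravel()
detection_rate = TP / (TP + FN)    # recall on the attack class
false_alarm_rate = FP / (FP + TN)  # fraction of normal traffic flagged as attack
print(f"Detection rate: {detection_rate:.4f}, False alarm rate: {false_alarm_rate:.4f}")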

Part 4 - Visualizing
Receiver Operating Characteristic (ROC) Curve and Area Under Curve (AUC)
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt
# Generate probabilities
y_pred_prob = classifier.predict(X_test).ravel()
# Calculate ROC curve
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)
roc_auc = auc(fpr, tpr)
# Plot ROC curve
plt.figure()
plt.plot(fpr, tpr, label=f'ROC Curve (AUC = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], 'k--') # Random guess line
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')
plt.legend(loc='lower right')
plt.savefig('ROC_SC')
plt.show()
615/615 ━━━━━━━━━━━━━━━━━━━━ 19s 31ms/step
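The ROC arrays also make it easy to pick an operating threshold; a common sketch is to maximize Youden's J statistic (TPR - FPR):
best = np.argmax(tpr - fpr)  # index of the ROC point furthest above the diagonal
print(f"Threshold maximizing TPR - FPR: {thresholds[best]:.4f} "
      f"(TPR={tpr[best]:.3f}, FPR={fpr[best]:.3f})")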

Plot the accuracy
########################################
# Part 4 - Visualizing
#######################################
# Import the matplotlib library for plotting figures.
import matplotlib.pyplot as plt
# You can plot the accuracy
print('Plot the accuracy')
# Keras 2.2.4 logs this metric as 'acc' while Keras 2.3.1+ logs it as 'accuracy'
# use python -c 'import keras; print(keras.__version__)' on Mac or Linux to check the Keras version
#plt.plot(classifierHistory.history['accuracy'], label='Training Accuracy')
plt.plot(classifierHistory['accuracy'], label='Training Accuracy')
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(loc='upper left')
plt.savefig('accuracy_sample_SC.png')
plt.show()
Plot the accuracy
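To stay compatible with both metric names mentioned above, you can resolve the key first (a small defensive sketch):
# Pick whichever accuracy key this Keras version recorded in the history dict
acc_key = 'accuracy' if 'accuracy' in classifierHistory else 'acc'
plt.plot(classifierHistory[acc_key], label='Training Accuracy')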

Plot the Loss
# You can plot history for loss
print('Plot the loss')
#plt.plot(classifierHistory.history['loss'], label='Training Loss')
plt.plot(classifierHistory['loss'], label='Training Loss')
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(loc='upper left')
plt.savefig('loss_sample_SC.png')
plt.show()
Plot the loss
