
FNN Sample SA

"""
Created on Mon Apr 15 19:43:04 2019

Updated on Wed Jan 29 10:18:09 2020

@author: created by Sowmya Myneni and updated by Dijiang Huang
"""
import matplotlib.pyplot as plt
import tensorflow as tf
print("Num GPUs Available:", len(tf.config.list_physical_devices('GPU')))
Num GPUs Available: 1
import os
import multiprocessing

# Number of logical CPUs
num_cores = multiprocessing.cpu_count()
print(f"Number of CPU cores available: {num_cores}")
Number of CPU cores available: 16
# Configure TensorFlow to use all available CPU cores
#tf.config.threading.set_intra_op_parallelism_threads(0)  # Use all intra-operation threads
#tf.config.threading.set_inter_op_parallelism_threads(0)  # Use all inter-operation threads
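
If you want to pin TensorFlow to explicit thread counts rather than letting it decide, a minimal sketch (the inter-op value is illustrative, not tuned; threading must be configured before any TensorFlow op runs):

# Illustrative sketch: pin TensorFlow's thread pools explicitly.
# Call these before any TensorFlow operation executes.
#tf.config.threading.set_intra_op_parallelism_threads(num_cores)  # threads within a single op
#tf.config.threading.set_inter_op_parallelism_threads(2)          # concurrent independent ops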


########################################
# Part 1 - Data Pre-Processing
#######################################

# To load a dataset file in Python, you can use Pandas. Import pandas using the line below
import pandas as pd
# Import numpy to perform operations on the dataset
import numpy as np

Variable Setup

# Variable Setup
# Available datasets: KDDTrain+.txt, KDDTest+.txt, etc. For details, read Data Set Introduction.html within the NSL-KDD dataset folder
# Type the training dataset file name in ''
#TrainingDataPath='NSL-KDD/'
#TrainingData='KDDTrain+_20Percent.txt'
# Batch Size
BatchSize=10
# Epoch Size
NumEpoch=10
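
If you switch to the raw NSL-KDD files named above, a minimal sketch of building the full path (TrainingDataPath and TrainingData are the commented-out defaults from this notebook):

# Sketch: load a raw NSL-KDD file instead of the CSV used below.
#import os
#TrainingDataPath = 'NSL-KDD/'
#TrainingData = 'KDDTrain+_20Percent.txt'
#dataset_train = pd.read_csv(os.path.join(TrainingDataPath, TrainingData), header=None)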

Import Training Dataset

# Import dataset.
# The dataset file name is given in the TrainingData variable. You can replace
# it with a full file path such as 'C:\Users\...\dataset.csv'.
# The file can be a .txt as well.
# If the dataset file has a header, keep header=0; otherwise use header=None.
# reference: https://www.shanelynn.ie/select-pandas-dataframe-rows-and-columns-using-iloc-loc-and-ix/
dataset_train = pd.read_csv('Training-a1-a3.csv', header=None)
dataset_train.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 113322 entries, 0 to 113321
Data columns (total 43 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   0       113322 non-null  int64  
 1   1       113322 non-null  object 
 2   2       113322 non-null  object 
 3   3       113322 non-null  object 
 4   4       113322 non-null  int64  
 5   5       113322 non-null  int64  
 6   6       113322 non-null  int64  
 7   7       113322 non-null  int64  
 8   8       113322 non-null  int64  
 9   9       113322 non-null  int64  
 10  10      113322 non-null  int64  
 11  11      113322 non-null  int64  
 12  12      113322 non-null  int64  
 13  13      113322 non-null  int64  
 14  14      113322 non-null  int64  
 15  15      113322 non-null  int64  
 16  16      113322 non-null  int64  
 17  17      113322 non-null  int64  
 18  18      113322 non-null  int64  
 19  19      113322 non-null  int64  
 20  20      113322 non-null  int64  
 21  21      113322 non-null  int64  
 22  22      113322 non-null  int64  
 23  23      113322 non-null  int64  
 24  24      113322 non-null  float64
 25  25      113322 non-null  float64
 26  26      113322 non-null  float64
 27  27      113322 non-null  float64
 28  28      113322 non-null  float64
 29  29      113322 non-null  float64
 30  30      113322 non-null  float64
 31  31      113322 non-null  int64  
 32  32      113322 non-null  int64  
 33  33      113322 non-null  float64
 34  34      113322 non-null  float64
 35  35      113322 non-null  float64
 36  36      113322 non-null  float64
 37  37      113322 non-null  float64
 38  38      113322 non-null  float64
 39  39      113322 non-null  float64
 40  40      113322 non-null  float64
 41  41      113322 non-null  object 
 42  42      113322 non-null  int64  
dtypes: float64(15), int64(24), object(4)
memory usage: 37.2+ MB
dataset_train.describe()
# Identify the column that contains the attack types
# Based on the sample data ('neptune', 'normal', etc.), the second-to-last
# column holds the attack labels
attack_column = dataset_train.columns[-2]

# Calculate the frequency of each attack type
attack_distribution = dataset_train[attack_column].value_counts()

plt.figure(figsize=(10, 6))
attack_distribution.plot(kind='bar', logy=True)
plt.title('A1 and A3 Training Dataset Distribution of Attack Types')
plt.xlabel('Attack Type')
plt.ylabel('Frequency')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
[Figure: bar chart — A1 and A3 training dataset distribution of attack types, log-scale frequency]
X_train = dataset_train.iloc[:, 0:-2].values
X_train
array([[0, 'tcp', 'ftp_data', ..., 0.0, 0.05, 0.0], [0, 'udp', 'other', ..., 0.0, 0.0, 0.0], [0, 'tcp', 'private', ..., 1.0, 0.0, 0.0], ..., [0, 'tcp', 'smtp', ..., 0.0, 0.01, 0.0], [0, 'tcp', 'klogin', ..., 1.0, 0.0, 0.0], [0, 'tcp', 'ftp_data', ..., 0.0, 0.0, 0.0]], dtype=object)
X_train.shape
(113322, 41)
label_column_train = dataset_train.iloc[:, -2].values
label_column_train
array(['normal', 'normal', 'neptune', ..., 'normal', 'neptune', 'normal'], dtype=object)
y_train = []
for i in range(len(label_column_train)):
    if label_column_train[i] == 'normal':
        y_train.append(0)
    else:
        y_train.append(1)

# Convert list to array
y_train = np.array(y_train)
y_train
array([0, 0, 1, ..., 0, 1, 0])
y_train.shape
(113322,)
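
The labeling loop above works, but NumPy can express the same binary encoding in one vectorized line; a quick equivalent sketch:

# Vectorized equivalent of the loop above: 0 for 'normal', 1 for any attack
y_train_alt = (label_column_train != 'normal').astype(int)
assert (y_train_alt == y_train).all()  # identical to the loop-built labels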

Import Testing Dataset

# Import dataset.
# The dataset file name is given in the TrainingData variable. You can replace
# it with a full file path such as 'C:\Users\...\dataset.csv'.
# The file can be a .txt as well.
# If the dataset file has a header, keep header=0; otherwise use header=None.
# reference: https://www.shanelynn.ie/select-pandas-dataframe-rows-and-columns-using-iloc-loc-and-ix/
dataset_test = pd.read_csv('Testing-a2-a4.csv', header=None)
dataset_test.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15017 entries, 0 to 15016
Data columns (total 43 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       15017 non-null  int64  
 1   1       15017 non-null  object 
 2   2       15017 non-null  object 
 3   3       15017 non-null  object 
 4   4       15017 non-null  int64  
 5   5       15017 non-null  int64  
 6   6       15017 non-null  int64  
 7   7       15017 non-null  int64  
 8   8       15017 non-null  int64  
 9   9       15017 non-null  int64  
 10  10      15017 non-null  int64  
 11  11      15017 non-null  int64  
 12  12      15017 non-null  int64  
 13  13      15017 non-null  int64  
 14  14      15017 non-null  int64  
 15  15      15017 non-null  int64  
 16  16      15017 non-null  int64  
 17  17      15017 non-null  int64  
 18  18      15017 non-null  int64  
 19  19      15017 non-null  int64  
 20  20      15017 non-null  int64  
 21  21      15017 non-null  int64  
 22  22      15017 non-null  int64  
 23  23      15017 non-null  int64  
 24  24      15017 non-null  float64
 25  25      15017 non-null  float64
 26  26      15017 non-null  float64
 27  27      15017 non-null  float64
 28  28      15017 non-null  float64
 29  29      15017 non-null  float64
 30  30      15017 non-null  float64
 31  31      15017 non-null  int64  
 32  32      15017 non-null  int64  
 33  33      15017 non-null  float64
 34  34      15017 non-null  float64
 35  35      15017 non-null  float64
 36  36      15017 non-null  float64
 37  37      15017 non-null  float64
 38  38      15017 non-null  float64
 39  39      15017 non-null  float64
 40  40      15017 non-null  float64
 41  41      15017 non-null  object 
 42  42      15017 non-null  int64  
dtypes: float64(15), int64(24), object(4)
memory usage: 4.9+ MB
dataset_test.describe()
# Identify the column that contains the attack types
# Based on the sample data ('neptune', 'normal', etc.), the second-to-last
# column holds the attack labels
attack_column = dataset_test.columns[-2]

# Calculate the frequency of each attack type
attack_distribution = dataset_test[attack_column].value_counts()

plt.figure(figsize=(10, 6))
attack_distribution.plot(kind='bar', logy=True)
plt.title('A2 and A4 Testing Dataset Distribution of Attack Types')
plt.xlabel('Attack Type')
plt.ylabel('Frequency')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
[Figure: bar chart — A2 and A4 testing dataset distribution of attack types, log-scale frequency]
X_test = dataset_test.iloc[:, 0:-2].values
X_test
array([[2, 'tcp', 'ftp_data', ..., 0.0, 0.0, 0.0], [0, 'icmp', 'eco_i', ..., 0.0, 0.0, 0.0], [1, 'tcp', 'telnet', ..., 0.0, 0.83, 0.71], ..., [0, 'tcp', 'http', ..., 0.0, 0.0, 0.0], [0, 'udp', 'domain_u', ..., 0.0, 0.0, 0.0], [0, 'tcp', 'sunrpc', ..., 0.0, 0.44, 1.0]], dtype=object)
X_test.shape
(15017, 41)
label_column_test = dataset_test.iloc[:, -2].values
label_column_test
array(['normal', 'saint', 'mscan', ..., 'normal', 'normal', 'mscan'], dtype=object)
y_test = []
for i in range(len(label_column_test)):
    if label_column_test[i] == 'normal':
        y_test.append(0)
    else:
        y_test.append(1)

# Convert list to array
y_test = np.array(y_test)
y_test
array([0, 1, 1, ..., 0, 0, 1])

Encoding categorical data (converting letters/words into numbers)

# The following code works with Python 3.7 or newer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
ct = ColumnTransformer(
     # The column numbers to be transformed ([1, 2, 3] represents three columns to be transformed)
    [('one_hot_encoder', OneHotEncoder(handle_unknown='ignore'), [1,2,3])],
    # Leave the rest of the columns untouched
    remainder='passthrough'
)
X_train = np.array(ct.fit_transform(X_train), dtype=np.float64)
X_test  = np.array(ct.transform(X_test) , dtype=np.float64)
X_train
array([[0. , 1. , 0. , ..., 0. , 0.05, 0. ], [0. , 0. , 1. , ..., 0. , 0. , 0. ], [0. , 1. , 0. , ..., 1. , 0. , 0. ], ..., [0. , 1. , 0. , ..., 0. , 0.01, 0. ], [0. , 1. , 0. , ..., 1. , 0. , 0. ], [0. , 1. , 0. , ..., 0. , 0. , 0. ]])
X_train[0]
array([0.00e+00, 1.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 1.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 1.00e+00, 0.00e+00, 0.00e+00, 4.91e+02, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 2.00e+00, 2.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 1.00e+00, 0.00e+00, 0.00e+00, 1.50e+02, 2.50e+01, 1.70e-01, 3.00e-02, 1.70e-01, 0.00e+00, 0.00e+00, 0.00e+00, 5.00e-02, 0.00e+00])
X_train.shape
(113322, 116)
X_test
array([[0. , 1. , 0. , ..., 0. , 0. , 0. ], [1. , 0. , 0. , ..., 0. , 0. , 0. ], [0. , 1. , 0. , ..., 0. , 0.83, 0.71], ..., [0. , 1. , 0. , ..., 0. , 0. , 0. ], [0. , 0. , 1. , ..., 0. , 0. , 0. ], [0. , 1. , 0. , ..., 0. , 0.44, 1. ]])
X_test.shape
(15017, 116)
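
To see where the 116 features come from, you can inspect the fitted OneHotEncoder inside the ColumnTransformer; a minimal sketch (the display names in the loop are just labels for printing):

# Inspect the fitted encoder: categories_ holds one array per encoded column
encoder = ct.named_transformers_['one_hot_encoder']
for name, cats in zip(['protocol_type', 'service', 'flag'], encoder.categories_):
    print(f'{name}: {len(cats)} categories')
# The one-hot columns for these three features plus the 38 passthrough
# numeric columns add up to the 116 features seen above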

Perform feature scaling

# Perform feature scaling. For ANNs you can use StandardScaler; for RNNs,
# MinMaxScaler is recommended.
# reference: https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html
# https://scikit-learn.org/stable/modules/preprocessing.html
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)  # Standardize each feature to zero mean and unit variance
X_test  = sc.transform(X_test)       # Reuse the training-set statistics; do not refit on the test set
X_train
array([[-0.19511653, 0.42713604, -0.36510181, ..., -0.66659951, -0.14895839, -0.31810766], [-0.19511653, -2.34117451, 2.73896205, ..., -0.66659951, -0.32815058, -0.31810766], [-0.19511653, 0.42713604, -0.36510181, ..., 1.51653833, -0.32815058, -0.31810766], ..., [-0.19511653, 0.42713604, -0.36510181, ..., -0.66659951, -0.29231215, -0.31810766], [-0.19511653, 0.42713604, -0.36510181, ..., 1.51653833, -0.32815058, -0.31810766], [-0.19511653, 0.42713604, -0.36510181, ..., -0.66659951, -0.32815058, -0.31810766]])
y_train
array([0, 0, 1, ..., 0, 1, 0])
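
If the model will be reused outside this notebook, the fitted encoder and scaler must be saved with it, since new data needs the identical transform; a sketch using joblib (file names are illustrative):

# Sketch: persist the fitted preprocessing objects for later inference
import joblib
joblib.dump(ct, 'column_transformer_SA.joblib')  # one-hot encoder
joblib.dump(sc, 'standard_scaler_SA.joblib')     # feature scaler
# Later: ct = joblib.load('column_transformer_SA.joblib'), etc.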

Part 2: Building FNN

########################################
# Part 2: Building FNN
#######################################

# Importing the Keras libraries and packages
import keras
from keras.models import Sequential
from keras.layers import Dense, Input

Initializing the ANN

# Initialising the ANN
# Reference: https://machinelearningmastery.com/tutorial-first-neural-network-python-keras/
classifier = Sequential()
classifier
<keras.engine.sequential.Sequential at 0x19cb8de02e0>

Adding the input layer and the first hidden layer; 6 nodes, with the input shape set to the number of feature variables

# Adding the input layer with Input() and the first hidden layer (6 nodes);
# the input shape is the number of feature variables
# rectified linear unit (ReLU) activation, reference: https://machinelearningmastery.com/rectified-linear-activation-function-for-deep-learning-neural-networks/
classifier.add(Input(shape=(len(X_train[0]),)))  # Input layer
classifier.add(Dense(units=6, kernel_initializer='uniform', activation='relu'))

# Adding the second hidden layer
classifier.add(Dense(units=6, kernel_initializer='uniform', activation='relu'))

# Adding the output layer
classifier.add(Dense(units=1, kernel_initializer='uniform', activation='sigmoid'))

Compiling the ANN

# Compiling the ANN, 

# Gradient descent algorithm 'adam', Reference: https://machinelearningmastery.com/adam-optimization-algorithm-for-deep-learning/

# This loss is for binary classification problems and is defined in Keras as 'binary_crossentropy', Reference: https://machinelearningmastery.com/how-to-choose-loss-functions-when-training-deep-learning-neural-networks/

classifier.compile(
    optimizer = 'adam', 
    loss = 'binary_crossentropy', 
    metrics = ['accuracy']
    )
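
Accuracy alone can be misleading on imbalanced traffic. If you also want precision and recall tracked per epoch, Keras ships both as built-in metrics; a sketch of an alternative compile call:

# Alternative compile sketch: also track precision and recall during training
#classifier.compile(
#    optimizer = 'adam',
#    loss = 'binary_crossentropy',
#    metrics = ['accuracy', keras.metrics.Precision(), keras.metrics.Recall()]
#    )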

Fitting the ANN to the Training set

# Fitting the ANN to the Training set
# Train the model so that it learns a good (or good enough) mapping of rows of input data to the output classification.
# Add verbose=0 to turn off the progress report during the training.
# To run the whole training dataset as one batch, set BatchSize=X_train.shape[0]
classifierHistory = classifier.fit(X_train, y_train, batch_size = BatchSize, epochs = NumEpoch)
Epoch 1/10
11333/11333 [==============================] - 26s 2ms/step - loss: 0.0347 - accuracy: 0.9885
Epoch 2/10
11333/11333 [==============================] - 26s 2ms/step - loss: 0.0137 - accuracy: 0.9960
Epoch 3/10
11333/11333 [==============================] - 25s 2ms/step - loss: 0.0106 - accuracy: 0.9969
Epoch 4/10
11333/11333 [==============================] - 25s 2ms/step - loss: 0.0095 - accuracy: 0.9974
Epoch 5/10
11333/11333 [==============================] - 26s 2ms/step - loss: 0.0084 - accuracy: 0.9977
Epoch 6/10
11333/11333 [==============================] - 26s 2ms/step - loss: 0.0076 - accuracy: 0.9979
Epoch 7/10
11333/11333 [==============================] - 26s 2ms/step - loss: 0.0071 - accuracy: 0.9981
Epoch 8/10
11333/11333 [==============================] - 25s 2ms/step - loss: 0.0068 - accuracy: 0.9981
Epoch 9/10
11333/11333 [==============================] - 25s 2ms/step - loss: 0.0067 - accuracy: 0.9981
Epoch 10/10
11333/11333 [==============================] - 26s 2ms/step - loss: 0.0067 - accuracy: 0.9982
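
The run above trains for a fixed 10 epochs with no held-out data. A sketch of adding a validation split and early stopping (the split and patience values are illustrative):

# Sketch: hold out 10% of the training data and stop once validation loss
# stops improving; restore_best_weights keeps the best epoch's weights
#from keras.callbacks import EarlyStopping
#early_stop = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)
#classifierHistory = classifier.fit(X_train, y_train, batch_size = BatchSize,
#                                   epochs = NumEpoch, validation_split = 0.1,
#                                   callbacks = [early_stop])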

Save the Fitted Model

classifier.save("fitted_FNN_model_SA.keras")  # Save the model in the native Keras format

# Save the history
#import json
#with open("fitted_FNN_model_history_SA.json", "w") as f:
#    json.dump(classifierHistory.history, f)

Load the Model if Necessary

#from keras.models import load_model

#classifier = load_model("fitted_FNN_model_SA.keras")  # Load the saved model

# Load the history
#import json
#with open("fitted_FNN_model_history_SA.json", "r") as f:
#    classifierHistory = json.load(f)
classifier.summary()
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
dense (Dense)                (None, 6)                 702       
_________________________________________________________________
dense_1 (Dense)              (None, 6)                 42        
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 7         
=================================================================
Total params: 751
Trainable params: 751
Non-trainable params: 0
_________________________________________________________________
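
The parameter counts follow directly from the layer sizes: a Dense layer has (inputs × units) weights plus one bias per unit. A quick check against the summary:

# Verify the summary's parameter counts by hand
n_features = X_train.shape[1]                     # 116 input features
print(n_features * 6 + 6, 6 * 6 + 6, 6 * 1 + 1)  # 702, 42, 7 -> total 751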

Evaluate the Keras model on the training and testing datasets

from sklearn.metrics import classification_report

Training Loss and Accuracy

loss_train, accuracy_train = classifier.evaluate(X_train, y_train)
3542/3542 [==============================] - 4s 1ms/step - loss: 0.0061 - accuracy: 0.9984
print('Print the training loss and the accuracy of the model on the dataset')
print('Loss [0,1]: {0:0.4f} Accuracy [0,1]: {1:0.4f}'.format(loss_train, accuracy_train))
Print the training loss and the accuracy of the model on the dataset
Loss [0,1]: 0.0061 Accuracy [0,1]: 0.9984

Testing Loss and Accuracy

loss_test, accuracy_test = classifier.evaluate(X_test, y_test)
470/470 [==============================] - 1s 1ms/step - loss: 2.0659 - accuracy: 0.7774
print('Print the testing loss and the accuracy of the model on the dataset')
print('Loss [0,1]: {0:0.4f} Accuracy [0,1]: {1:0.4f}'.format(loss_test, accuracy_test))
Print the testing loss and the accuracy of the model on the dataset
Loss [0,1]: 2.0659 Accuracy [0,1]: 0.7774

Part 3 - Making predictions and evaluating the model

########################################
# Part 3 - Making predictions and evaluating the model
#######################################

# Predicting the Test set results
y_pred = classifier.predict(X_test)
y_pred = (y_pred > 0.9)   # y_pred is True (attack) only when the predicted probability exceeds 0.9; otherwise False (normal)
y_pred.shape
(15017, 1)
y_pred[:5]
array([[False], [False], [ True], [False], [False]])
y_test[:5]
array([0, 1, 1, 0, 0])
# summarize the first 10 cases
for i in range(10):
    print('{} (expected {})'.format(y_pred[i], y_test[i]))
[False] (expected 0)
[False] (expected 1)
[ True] (expected 1)
[False] (expected 0)
[False] (expected 0)
[False] (expected 1)
[False] (expected 0)
[False] (expected 1)
[ True] (expected 1)
[False] (expected 0)
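
The 0.9 cutoff trades recall for precision: fewer false alarms, but more missed attacks. A sketch comparing cutoffs on the same raw sigmoid outputs:

# Compare how many flows are flagged at a conventional 0.5 cutoff vs 0.9
y_prob = classifier.predict(X_test)
for t in (0.5, 0.9):
    flagged = int((y_prob > t).sum())
    print(f'threshold {t}: {flagged} of {len(y_prob)} flows flagged as attack')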

Making the Confusion Matrix

# Making the Confusion Matrix
# [TN, FP ]
# [FN, TP ]
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
cm = confusion_matrix(y_test, y_pred)
print('Print the Confusion Matrix:')
print('[ TN, FP ]')
print('[ FN, TP ]=')
print(cm)
print(classification_report(y_test, y_pred))
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['Normal', 'Attack'])
disp.plot()
plt.savefig('confusion_matrix_SA.png')
plt.show()
Print the Confusion Matrix:
[ TN, FP ]
[ FN, TP ]=
[[8666 1045]
 [3007 2299]]
              precision    recall  f1-score   support

           0       0.74      0.89      0.81      9711
           1       0.69      0.43      0.53      5306

    accuracy                           0.73     15017
   macro avg       0.71      0.66      0.67     15017
weighted avg       0.72      0.73      0.71     15017

[Figure: confusion matrix display with Normal/Attack labels]
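
The report's attack-class numbers can also be recovered from the confusion matrix by hand, which makes the precision/recall trade-off concrete:

# Derive the attack-class metrics from the confusion matrix above
tn, fp, fn, tp = cm.ravel()
precision = tp / (tp + fp)        # 2299 / (2299 + 1045) ~ 0.69
recall    = tp / (tp + fn)        # 2299 / (2299 + 3007) ~ 0.43
accuracy  = (tp + tn) / cm.sum()  # (2299 + 8666) / 15017 ~ 0.73
print(f'precision={precision:.2f} recall={recall:.2f} accuracy={accuracy:.2f}')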

Part 4 - Visualizing

Receiver Operating Characteristic (ROC) Curve and Area Under Curve (AUC)

from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt

# Generate probabilities
y_pred_prob = classifier.predict(X_test).ravel()

# Calculate ROC curve
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)
roc_auc = auc(fpr, tpr)

# Plot ROC curve
plt.figure()
plt.plot(fpr, tpr, label=f'ROC Curve (AUC = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], 'k--')  # Random guess line
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')
plt.legend(loc='lower right')
plt.savefig('ROC_SA.png')
plt.show()
[Figure: ROC curve with AUC and the random-guess diagonal]
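
With roughly twice as many normal flows as attacks in the test set, a precision-recall curve is a useful companion to the ROC plot; a sketch using the same predicted probabilities:

# Sketch: precision-recall curve on the same predicted probabilities
from sklearn.metrics import precision_recall_curve, average_precision_score
prec, rec, _ = precision_recall_curve(y_test, y_pred_prob)
ap = average_precision_score(y_test, y_pred_prob)
plt.figure()
plt.plot(rec, prec, label=f'PR Curve (AP = {ap:.2f})')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')
plt.legend(loc='lower left')
plt.show()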

Plot the accuracy

########################################
# Part 4 - Visualizing
#######################################

# Import matplot lib libraries for plotting the figures. 
import matplotlib.pyplot as plt

# You can plot the accuracy
print('Plot the accuracy')
# Keras 2.2.4 records this metric as 'acc'; Keras 2.3.1 and later record it as 'accuracy'
# Use the command python -c 'import keras; print(keras.__version__)' on macOS or Linux to check the Keras version
plt.plot(classifierHistory.history['accuracy'], label='Training Accuracy')
#plt.plot(classifierHistory['accuracy'], label='Training Accuracy')
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(loc='upper left')
plt.savefig('accuracy_sample_SA.png')
plt.show()
Plot the accuracy
[Figure: training accuracy per epoch]

Plot the Loss

# You can plot history for loss
print('Plot the loss')
plt.plot(classifierHistory.history['loss'], label='Training Loss')
#plt.plot(classifierHistory['loss'], label='Training Loss')
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(loc='upper left')
plt.savefig('loss_sample_SA.png')
plt.show()
Plot the loss
[Figure: training loss per epoch]