"""
Created on Mon Apr 15 19:43:04 2019
Updated on Wed Jan 29 10:18:09 2020
@author: created by Sowmya Myneni and updated by Dijiang Huang
"""'\nCreated on Mon Apr 15 19:43:04 2019\n\nUpdated on Wed Jan 29 10:18:09 2020\n\n@author: created by Sowmya Myneni and updated by Dijiang Huang\n'import matplotlib.pyplot as plt
import tensorflow as tf
print("Num GPUs Available:", len(tf.config.list_physical_devices('GPU')))Num GPUs Available: 0
import os
import multiprocessing
# Number of logical CPUs
num_cores = multiprocessing.cpu_count()
print(f"Number of CPU cores available: {num_cores}")Number of CPU cores available: 12
# Configure TensorFlow's thread pools (0 lets TensorFlow pick a value based on the available cores)
tf.config.threading.set_intra_op_parallelism_threads(0)  # threads used within individual operations
tf.config.threading.set_inter_op_parallelism_threads(0)  # threads used to run independent operations in parallel

FNN Sample SC
########################################
# Part 1 - Data Pre-Processing
#######################################
# To load a dataset file in Python, you can use Pandas. Import pandas using the line below
import pandas as pd
# Import numpy to perform operations on the dataset
import numpy as np

Variable Setup
# Variable Setup
# Available datasets: KDDTrain+.txt, KDDTest+.txt, etc. For details, read Data Set Introduction.html in the NSL-KDD dataset folder.
# Type the training dataset file name between the quotes:
#TrainingDataPath='NSL-KDD/'
#TrainingData='KDDTrain+_20Percent.txt'
# Batch Size
BatchSize=10
# Epoch Size
NumEpoch=10
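With these settings, each epoch sweeps the full training set in mini-batches of 10. A quick sanity check (a sketch; the 124,926-row count comes from the training file loaded below):
# Hypothetical check: number of gradient updates performed per epoch at this batch size
import math
steps_per_epoch = math.ceil(124926 / BatchSize)
print(f"Gradient updates per epoch: {steps_per_epoch}")  # ceil(124926 / 10) = 12493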
Import Training Dataset
# Import the dataset.
# The dataset file name is given in the TrainingData variable. You can replace it with a full
# file path such as 'C:\Users\...\dataset.csv'. The file can be a .txt as well.
# If the dataset file has a header row, keep header=0; otherwise use header=None.
# reference: https://www.shanelynn.ie/select-pandas-dataframe-rows-and-columns-using-iloc-loc-and-ix/
dataset_train = pd.read_csv('Training-a1-a2.csv', header=None)
dataset_train.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 124926 entries, 0 to 124925
Data columns (total 43 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 0 124926 non-null int64
1 1 124926 non-null object
2 2 124926 non-null object
3 3 124926 non-null object
4 4 124926 non-null int64
5 5 124926 non-null int64
6 6 124926 non-null int64
7 7 124926 non-null int64
8 8 124926 non-null int64
9 9 124926 non-null int64
10 10 124926 non-null int64
11 11 124926 non-null int64
12 12 124926 non-null int64
13 13 124926 non-null int64
14 14 124926 non-null int64
15 15 124926 non-null int64
16 16 124926 non-null int64
17 17 124926 non-null int64
18 18 124926 non-null int64
19 19 124926 non-null int64
20 20 124926 non-null int64
21 21 124926 non-null int64
22 22 124926 non-null int64
23 23 124926 non-null int64
24 24 124926 non-null float64
25 25 124926 non-null float64
26 26 124926 non-null float64
27 27 124926 non-null float64
28 28 124926 non-null float64
29 29 124926 non-null float64
30 30 124926 non-null float64
31 31 124926 non-null int64
32 32 124926 non-null int64
33 33 124926 non-null float64
34 34 124926 non-null float64
35 35 124926 non-null float64
36 36 124926 non-null float64
37 37 124926 non-null float64
38 38 124926 non-null float64
39 39 124926 non-null float64
40 40 124926 non-null float64
41 41 124926 non-null object
42 42 124926 non-null int64
dtypes: float64(15), int64(24), object(4)
memory usage: 41.0+ MB
dataset_train.describe()
# Identify the column that contains the attack types.
# Based on the sample data ('neptune', 'normal', etc.), the second-to-last column holds the attack label.
attack_column = dataset_train.columns[-2]
# Calculate the frequency of each attack type
attack_distribution = dataset_train[attack_column].value_counts()
plt.figure(figsize=(10, 6))
attack_distribution.plot(kind='bar', logy=True)
plt.title('A1 and A2 Training Dataset Distribution of Attack Types')
plt.xlabel('Attack Type')
plt.ylabel('Frequency')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
X_train = dataset_train.iloc[:, 0:-2].values
X_train
array([[0, 'tcp', 'ftp_data', ..., 0.0, 0.05, 0.0],
       [0, 'udp', 'other', ..., 0.0, 0.0, 0.0],
       [0, 'tcp', 'private', ..., 1.0, 0.0, 0.0],
       ...,
       [0, 'tcp', 'smtp', ..., 0.0, 0.01, 0.0],
       [0, 'tcp', 'klogin', ..., 1.0, 0.0, 0.0],
       [0, 'tcp', 'ftp_data', ..., 0.0, 0.0, 0.0]], dtype=object)
X_train.shape
(124926, 41)
label_column_train = dataset_train.iloc[:, -2].values
label_column_train
array(['normal', 'normal', 'neptune', ..., 'normal', 'neptune', 'normal'],
      dtype=object)
y_train = []
for i in range(len(label_column_train)):
    if label_column_train[i] == 'normal':
        y_train.append(0)
    else:
        y_train.append(1)
# Convert list to array
y_train = np.array(y_train)
y_train
array([0, 0, 1, ..., 0, 1, 0])
y_train.shape
(124926,)
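The loop above is easy to follow; an equivalent vectorized version (a sketch using NumPy, not part of the original flow) avoids the explicit Python loop:
# Vectorized alternative: 0 for 'normal' traffic, 1 for any attack label
y_train_vec = np.where(label_column_train == 'normal', 0, 1)
assert (y_train_vec == y_train).all()  # agrees with the loop-built array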
Import Testing Dataset
# Import the dataset.
# The dataset file name is given in the TrainingData variable. You can replace it with a full
# file path such as 'C:\Users\...\dataset.csv'. The file can be a .txt as well.
# If the dataset file has a header row, keep header=0; otherwise use header=None.
# reference: https://www.shanelynn.ie/select-pandas-dataframe-rows-and-columns-using-iloc-loc-and-ix/
dataset_test = pd.read_csv('Testing-a1-a2-a3.csv', header=None)
dataset_test.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19659 entries, 0 to 19658
Data columns (total 43 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 0 19659 non-null int64
1 1 19659 non-null object
2 2 19659 non-null object
3 3 19659 non-null object
4 4 19659 non-null int64
5 5 19659 non-null int64
6 6 19659 non-null int64
7 7 19659 non-null int64
8 8 19659 non-null int64
9 9 19659 non-null int64
10 10 19659 non-null int64
11 11 19659 non-null int64
12 12 19659 non-null int64
13 13 19659 non-null int64
14 14 19659 non-null int64
15 15 19659 non-null int64
16 16 19659 non-null int64
17 17 19659 non-null int64
18 18 19659 non-null int64
19 19 19659 non-null int64
20 20 19659 non-null int64
21 21 19659 non-null int64
22 22 19659 non-null int64
23 23 19659 non-null int64
24 24 19659 non-null float64
25 25 19659 non-null float64
26 26 19659 non-null float64
27 27 19659 non-null float64
28 28 19659 non-null float64
29 29 19659 non-null float64
30 30 19659 non-null float64
31 31 19659 non-null int64
32 32 19659 non-null int64
33 33 19659 non-null float64
34 34 19659 non-null float64
35 35 19659 non-null float64
36 36 19659 non-null float64
37 37 19659 non-null float64
38 38 19659 non-null float64
39 39 19659 non-null float64
40 40 19659 non-null float64
41 41 19659 non-null object
42 42 19659 non-null int64
dtypes: float64(15), int64(24), object(4)
memory usage: 6.4+ MB
dataset_test.describe()
# Identify the column that contains the attack types.
# Based on the sample data ('neptune', 'normal', etc.), the second-to-last column holds the attack label.
attack_column = dataset_test.columns[-2]
# Calculate the frequency of each attack type
attack_distribution = dataset_test[attack_column].value_counts()
plt.figure(figsize=(10, 6))
attack_distribution.plot(kind='bar', logy=True)
plt.title('A1, A2 and A3 Testing Dataset Distribution of Attack Types')
plt.xlabel('Attack Type')
plt.ylabel('Frequency')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
X_test = dataset_test.iloc[:, 0:-2].values
X_test
array([[0, 'tcp', 'private', ..., 0.0, 1.0, 1.0],
       [0, 'tcp', 'private', ..., 0.0, 1.0, 1.0],
       [2, 'tcp', 'ftp_data', ..., 0.0, 0.0, 0.0],
       ...,
       [0, 'tcp', 'http', ..., 0.0, 0.07, 0.07],
       [0, 'udp', 'domain_u', ..., 0.0, 0.0, 0.0],
       [0, 'tcp', 'sunrpc', ..., 0.0, 0.44, 1.0]], dtype=object)
X_test.shape
(19659, 41)
label_column_test = dataset_test.iloc[:, -2].values
label_column_test
array(['neptune', 'neptune', 'normal', ..., 'back', 'normal', 'mscan'],
      dtype=object)
y_test = []
for i in range(len(label_column_test)):
    if label_column_test[i] == 'normal':
        y_test.append(0)
    else:
        y_test.append(1)
# Convert list to array
y_test = np.array(y_test)
y_test
array([1, 1, 0, ..., 1, 0, 1])

Encoding categorical data (convert letters/words into numbers)
# The following code works with Python 3.7 or newer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
ct = ColumnTransformer(
    # The column numbers to be transformed ([1, 2, 3] selects the three categorical columns)
    [('one_hot_encoder', OneHotEncoder(handle_unknown='ignore'), [1, 2, 3])],
    # Leave the rest of the columns untouched
    remainder='passthrough'
)
X_train = np.array(ct.fit_transform(X_train), dtype=np.float64)
X_test = np.array(ct.transform(X_test), dtype=np.float64)
X_train
array([[0.  , 1.  , 0.  , ..., 0.  , 0.05, 0.  ],
       [0.  , 0.  , 1.  , ..., 0.  , 0.  , 0.  ],
       [0.  , 1.  , 0.  , ..., 1.  , 0.  , 0.  ],
       ...,
       [0.  , 1.  , 0.  , ..., 0.  , 0.01, 0.  ],
       [0.  , 1.  , 0.  , ..., 1.  , 0.  , 0.  ],
       [0.  , 1.  , 0.  , ..., 0.  , 0.  , 0.  ]])
X_train[0]
array([0.00e+00, 1.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00,
       0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00,
       0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00,
       0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 1.00e+00,
       0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00,
       0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00,
       0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00,
       0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00,
       0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00,
       0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00,
       0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00,
       0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00,
       0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00,
       0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 1.00e+00, 0.00e+00,
       0.00e+00, 4.91e+02, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00,
       0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00,
       0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00,
       0.00e+00, 2.00e+00, 2.00e+00, 0.00e+00, 0.00e+00, 0.00e+00,
       0.00e+00, 1.00e+00, 0.00e+00, 0.00e+00, 1.50e+02, 2.50e+01,
       1.70e-01, 3.00e-02, 1.70e-01, 0.00e+00, 0.00e+00, 0.00e+00,
       5.00e-02, 0.00e+00])
X_train.shape
(124926, 122)
X_test
array([[0.  , 1.  , 0.  , ..., 0.  , 1.  , 1.  ],
       [0.  , 1.  , 0.  , ..., 0.  , 1.  , 1.  ],
       [0.  , 1.  , 0.  , ..., 0.  , 0.  , 0.  ],
       ...,
       [0.  , 1.  , 0.  , ..., 0.  , 0.07, 0.07],
       [0.  , 0.  , 1.  , ..., 0.  , 0.  , 0.  ],
       [0.  , 1.  , 0.  , ..., 0.  , 0.44, 1.  ]])
X_test.shape
(19659, 122)
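One-hot encoding expanded the 3 categorical columns into 122 - 38 = 84 indicator columns. To see what the encoder learned, you can query the fitted ColumnTransformer (a sketch; 'one_hot_encoder' is the transformer name assigned above):
# Inspect the categories discovered for each encoded column
encoder = ct.named_transformers_['one_hot_encoder']
for col, cats in zip([1, 2, 3], encoder.categories_):
    print(f"Column {col}: {len(cats)} categories, e.g. {list(cats[:3])}")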
Perform feature scaling
# Perform feature scaling. For an ANN you can use StandardScaler; for RNNs MinMaxScaler is recommended.
# reference: https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html
#            https://scikit-learn.org/stable/modules/preprocessing.html
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)  # standardize each feature to zero mean and unit variance
X_test = sc.transform(X_test)  # reuse the training-set statistics rather than refitting on test data
X_train
array([[-0.26661772,  0.47858359, -0.3692588 , ..., -0.6282041 ,
        -0.225902  , -0.3774467 ],
       [-0.26661772, -2.08949913,  2.70812776, ..., -0.6282041 ,
        -0.38864593, -0.3774467 ],
       [-0.26661772,  0.47858359, -0.3692588 , ...,  1.60987176,
        -0.38864593, -0.3774467 ],
       ...,
       [-0.26661772,  0.47858359, -0.3692588 , ..., -0.6282041 ,
        -0.35609714, -0.3774467 ],
       [-0.26661772,  0.47858359, -0.3692588 , ...,  1.60987176,
        -0.38864593, -0.3774467 ],
       [-0.26661772,  0.47858359, -0.3692588 , ..., -0.6282041 ,
        -0.38864593, -0.3774467 ]])
y_train
array([0, 0, 1, ..., 0, 1, 0])
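The comments above note that MinMaxScaler is the recommendation for RNNs; a minimal sketch of that variant, shown here only to illustrate the API (this notebook keeps StandardScaler):
from sklearn.preprocessing import MinMaxScaler
mms = MinMaxScaler()                     # rescales each feature to the range [0, 1]
X_train_mm = mms.fit_transform(X_train)  # fit the per-feature min/max on training data only
X_test_mm = mms.transform(X_test)        # reuse the training min/max on the test set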
Part 2: Building FNN
########################################
# Part 2: Building FNN
#######################################
# Importing the Keras libraries and packages
import keras
from keras.models import Sequential
from keras.layers import Dense, Input

Initialising the ANN
# Initialising the ANN
# Reference: https://machinelearningmastery.com/tutorial-first-neural-network-python-keras/
classifier = Sequential()
classifier
<Sequential name=sequential, built=False>

Adding the input layer and the first hidden layer (6 nodes; the Input shape specifies the number of variables)
# Adding the input layer with Input() and the first hidden layer: 6 nodes; the Input shape specifies the number of input variables
# rectified linear unit (relu) activation, reference: https://machinelearningmastery.com/rectified-linear-activation-function-for-deep-learning-neural-networks/
classifier.add(Input(shape=(len(X_train[0]),)))  # Input layer (122 features after encoding)
classifier.add(Dense(units=6, kernel_initializer='uniform', activation='relu'))
# Adding the second hidden layer
classifier.add(Dense(units=6, kernel_initializer='uniform', activation='relu'))
# Adding the output layer
classifier.add(Dense(units=1, kernel_initializer='uniform', activation='sigmoid'))
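The resulting network is 122 inputs -> 6 -> 6 -> 1, which is small: layer 1 has 122*6 + 6 = 738 parameters, layer 2 has 6*6 + 6 = 42, and the output layer has 6 + 1 = 7, for 787 trainable parameters in total. A quick check:
print(classifier.count_params())  # expected: 738 + 42 + 7 = 787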
Compiling the ANN
# Compiling the ANN.
# Gradient descent algorithm 'adam', reference: https://machinelearningmastery.com/adam-optimization-algorithm-for-deep-learning/
# 'binary_crossentropy' is the Keras loss for binary classification problems, reference: https://machinelearningmastery.com/how-to-choose-loss-functions-when-training-deep-learning-neural-networks/
classifier.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
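If you also want precision and recall tracked during training, Keras accepts metric objects alongside 'accuracy'. A sketch of an alternative compile call (not used in the rest of this notebook):
classifier.compile(optimizer='adam', loss='binary_crossentropy',
                   metrics=['accuracy',
                            keras.metrics.Precision(name='precision'),
                            keras.metrics.Recall(name='recall')])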
Fitting the ANN to the Training set
# Fitting the ANN to the Training set.
# Train the model so that it learns a good (or good enough) mapping from input rows to output classifications.
# Add verbose=0 to turn off the progress report during training.
# To run the whole training dataset as one batch, set the batch size to BatchSize=X_train.shape[0]
#classifierHistory = classifier.fit(X_train, y_train, batch_size = BatchSize, epochs = NumEpoch)
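A sketch of a training call guarded by a flag, so the saved model below can still be loaded instead; validation_split is a standard fit argument, and the 10% split is an arbitrary choice:
RETRAIN = False  # set to True to train from scratch instead of loading the saved model
if RETRAIN:
    classifierHistory = classifier.fit(X_train, y_train,
                                       batch_size=BatchSize, epochs=NumEpoch,
                                       validation_split=0.1)  # hold out 10% to watch for overfitting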
Save the Fitted Model
#classifier.save("fitted_FNN_model_SC.keras")  # Save the model in the native Keras format
# Save the training history
#import json
#with open("fitted_FNN_model_history_SC.json", "w") as f:
#    json.dump(classifierHistory.history, f)
Load the Model if Necessary
# The previously trained SC model is loaded here instead of retraining.
from keras.models import load_model
classifier = load_model("fitted_FNN_model_SC.keras")  # Load the saved model
# Load the history
import json
with open("fitted_FNN_model_history_SC.json", "r") as f:
    classifierHistory = json.load(f)
classifier.summary()
Evaluate the Keras model on the training and testing datasets
from sklearn.metrics import classification_report

Training Loss and Accuracy
loss_train, accuracy_train = classifier.evaluate(X_train, y_train)
3904/3904 ━━━━━━━━━━━━━━━━━━━━ 176s 45ms/step - accuracy: 0.9899 - loss: 0.0241
print('Print the training loss and the accuracy of the model on the dataset')
print('Loss [0,1]: {0:0.4f} Accuracy [0,1]: {1:0.4f}'.format(loss_train, accuracy_train))
Print the training loss and the accuracy of the model on the dataset
Loss [0,1]: 0.0244 Accuracy [0,1]: 0.9898
Testing Loss and Accuracy
# Note: the testing set (A1, A2, A3) contains the A3 attack type that is absent from the
# training set (A1, A2), so test accuracy is expected to drop relative to training accuracy.
loss_test, accuracy_test = classifier.evaluate(X_test, y_test)
615/615 ━━━━━━━━━━━━━━━━━━━━ 32s 52ms/step - accuracy: 0.8785 - loss: 1.0911
print('Print the testing loss and the accuracy of the model on the dataset')
print('Loss [0,1]: {0:0.4f} Accuracy [0,1]: {1:0.4f}'.format(loss_test, accuracy_test))
Print the testing loss and the accuracy of the model on the dataset
Loss [0,1]: 1.1006 Accuracy [0,1]: 0.8807
Part 3 - Making predictions and evaluating the model
########################################
# Part 3 - Making predictions and evaluating the model
#######################################
# Predicting the Test set results
y_pred = classifier.predict(X_test)
615/615 ━━━━━━━━━━━━━━━━━━━━ 20s 32ms/step
y_pred = (y_pred > 0.9)  # classify as attack (True) only when the predicted probability exceeds 0.9
y_pred.shape
(19659, 1)
y_pred[:5]
array([[ True],
       [ True],
       [False],
       [ True],
       [False]])
y_test[:5]
array([1, 1, 0, 1, 1])
# summarize the first 10 cases
for i in range(10):
    print('{} (expected {})'.format(y_pred[i], y_test[i]))
[ True] (expected 1)
[ True] (expected 1)
[False] (expected 0)
[ True] (expected 1)
[False] (expected 1)
[False] (expected 0)
[False] (expected 0)
[False] (expected 0)
[False] (expected 1)
[False] (expected 0)
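The 0.9 cutoff is stricter than the conventional 0.5, trading recall for precision: any attack scoring between 0.5 and 0.9 is released as normal, which may account for some of the misses above. A sketch comparing the two cutoffs:
# Count predicted attacks at the default and the stricter threshold
y_prob = classifier.predict(X_test, verbose=0).ravel()
for t in (0.5, 0.9):
    print(f"threshold={t}: predicted attacks = {(y_prob > t).sum()}")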
Making the Confusion Matrix
# Making the Confusion Matrix
# [TN, FP ]
# [FN, TP ]
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
cm = confusion_matrix(y_test, y_pred)
print('Print the Confusion Matrix:')
print('[ TN, FP ]')
print('[ FN, TP ]=')
print(cm)
print(classification_report(y_test, y_pred))
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['Normal', 'Attack'])
disp.plot()
plt.savefig('confusion_matrix_SC')
plt.show()
Print the Confusion Matrix:
[ TN, FP ]
[ FN, TP ]=
[[8939 772]
[2178 7770]]
              precision    recall  f1-score   support

           0       0.80      0.92      0.86      9711
           1       0.91      0.78      0.84      9948

    accuracy                           0.85     19659
   macro avg       0.86      0.85      0.85     19659
weighted avg       0.86      0.85      0.85     19659
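Intrusion-detection work often reports a detection rate and false-alarm rate derived from these counts; a sketch computing them from cm (the variable names here are conventional, not from the original notebook):
TN, FP, FN, TP = cm.ravel()
detection_rate = TP / (TP + FN)    # recall on the attack class
false_alarm_rate = FP / (FP + TN)  # fraction of normal traffic flagged as attack
print(f"Detection rate: {detection_rate:.4f}, False alarm rate: {false_alarm_rate:.4f}")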

Part 4 - Visualizing
Receiver Operating Characteristic (ROC) Curve and Area Under Curve (AUC)
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt
# Generate probabilities
y_pred_prob = classifier.predict(X_test).ravel()
# Calculate ROC curve
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)
roc_auc = auc(fpr, tpr)
# Plot ROC curve
plt.figure()
plt.plot(fpr, tpr, label=f'ROC Curve (AUC = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], 'k--') # Random guess line
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')
plt.legend(loc='lower right')
plt.savefig('ROC_SC')
plt.show()
615/615 ━━━━━━━━━━━━━━━━━━━━ 19s 31ms/step
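The ROC arrays also make it easy to pick an operating threshold; a common sketch is to maximize Youden's J statistic (TPR - FPR):
best = np.argmax(tpr - fpr)  # index of the ROC point furthest above the diagonal
print(f"Threshold maximizing TPR - FPR: {thresholds[best]:.4f} "
      f"(TPR={tpr[best]:.3f}, FPR={fpr[best]:.3f})")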

Plot the accuracy
########################################
# Part 4 - Visualizing
#######################################
# Import the matplotlib library for plotting figures.
import matplotlib.pyplot as plt
# You can plot the accuracy
print('Plot the accuracy')
# Keras 2.2.4 logs this metric as 'acc' while Keras 2.3.1+ logs it as 'accuracy'
# use python -c 'import keras; print(keras.__version__)' on Mac or Linux to check the Keras version
#plt.plot(classifierHistory.history['accuracy'], label='Training Accuracy')
plt.plot(classifierHistory['accuracy'], label='Training Accuracy')
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(loc='upper left')
plt.savefig('accuracy_sample_SC.png')
plt.show()
Plot the accuracy
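To stay compatible with both metric names mentioned above, you can resolve the key first (a small defensive sketch):
# Pick whichever accuracy key this Keras version recorded in the history dict
acc_key = 'accuracy' if 'accuracy' in classifierHistory else 'acc'
plt.plot(classifierHistory[acc_key], label='Training Accuracy')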

Plot the Loss
# You can plot history for loss
print('Plot the loss')
#plt.plot(classifierHistory.history['loss'], label='Training Loss')
plt.plot(classifierHistory['loss'], label='Training Loss')
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(loc='upper left')
plt.savefig('loss_sample_SC.png')
plt.show()
Plot the loss
