Skip to article frontmatter | Skip to article content
Site not loading correctly?

This may be due to an incorrect BASE_URL configuration. See the MyST Documentation for reference.

Data Extractor

"""
Created on Mon Sep 23 23:02:38 2019

@author: Sowmya
Modified by Dijiang Huang 4/19/2020

Modified by Mark Khusid
December 6, 2024
"""
'\nCreated on Mon Sep 23 23:02:38 2019\n\n@author: Sowmya\nModified by Dijiang Huang 4/19/2020\n'

Import Libraries

import numpy as np
import pandas as pd

Define Variables

Data File Path

# Data file Path
DatasetPath='NSL-KDD/'

Data File Name

# Names of the NSL-KDD training and testing files; they are joined onto
# DatasetPath when the datasets are loaded.
input_train = "KDDTrain+.txt"
input_test = "KDDTest+.txt"
# Extension appended to the names of the generated training/testing files.
file_extension = '.csv'  # .csv or .txt

Define Number of Attack Classes

num_attack_class = 4 # total number of attack classes

Define Attack Subclasses

# NSL-KDD attack sub-class labels, grouped by attack class in the order
# DoS, Prob, U2R, R2L. Class id k (1-based) lives at index k - 1.
attacks_subClass = [
    # Class 1: DoS
    [
        'apache2', 'back', 'land', 'neptune', 'mailbomb', 'pod',
        'processtable', 'smurf', 'teardrop', 'udpstorm', 'worm',
    ],
    # Class 2: Prob
    [
        'ipsweep', 'mscan', 'portsweep', 'saint', 'satan', 'nmap',
    ],
    # Class 3: U2R
    [
        'buffer_overflow', 'loadmodule', 'perl', 'ps', 'rootkit',
        'sqlattack', 'xterm',
    ],
    # Class 4: R2L
    [
        'ftp_write', 'guess_passwd', 'httptunnel', 'imap', 'multihop',
        'named', 'phf', 'sendmail', 'snmpgetattack', 'spy', 'snmpguess',
        'warezclient', 'warezmaster', 'xlock', 'xsnoop',
    ],
]

Load the Datasets

# Announce which files are about to be read.
# NOTE(review): the message says "current folder", but the files are actually
# read from DatasetPath (see the read_csv calls below) - confirm the wording.
print("Loading", input_train, "and", input_test, "files from the current folder where this script resides.....\n")
# header=None: no row of the file is used as column names, so the columns are
# labelled with integers and every row is kept as data.
dataset_train = pd.read_csv(DatasetPath + input_train, header=None, encoding="ISO-8859-1")
dataset_test = pd.read_csv(DatasetPath + input_test, header=None, encoding="ISO-8859-1")
Loading KDDTrain+.txt and KDDTest+.txt files from the current folder where this script resides.....

dataset_test.describe()
Loading...
dataset_test.head()
Loading...
dataset_train.describe()
Loading...
dataset_train.head()
Loading...

Create Train and Test Datasets

# Convert both data frames (all rows, all columns) into NumPy arrays so the
# rows can be filtered by position further down.
X_train = dataset_train.to_numpy()
X_test = dataset_test.to_numpy()
X_train.shape
(125973, 43)
X_test.shape
(22544, 43)

Define Selected Attack Classes List

# Set1 is selected attack classes
training_attack_class_list = []
training_attack_class_list
[]
# Set2 is removed attack classes
testing_attack_class_list = []
testing_attack_class_list
[]

Scenario A (SA)

Enter the Desired Attack Classes

attack_class_1 = [1, 3]
attack_class_1
[1, 3]
attack_class_2 = [2, 4]
attack_class_2
[2, 4]

Append Desired Attack Classes to Training and Testing Attack Class Lists

training_attack_class_list = []
training_attack_class_list.append(attack_class_1)
training_attack_class_list
[[1, 3]]
testing_attack_class_list = []
testing_attack_class_list.append(attack_class_2)
testing_attack_class_list
[[2, 4]]

Select Subclasses and Save to File

Create Training Data Set

print("Creating training set.....\n")
setA_train = []

# Class ids requested for the training file. Each id is a 1-based index into
# attacks_subClass; a leading 0 is treated as "no attack class chosen".
train_classes = training_attack_class_list[0]

# A reduced file is only written when a proper subset of the classes is
# requested. The emptiness check routes an empty selection to the
# "no attack classes" branch instead of raising IndexError on train_classes[0].
if train_classes and train_classes[0] != 0 and \
        len(train_classes) != num_attack_class:

    # Sub-class label lists of the selected classes, looked up once up front.
    train_label_groups = [attacks_subClass[class_id - 1] for class_id in train_classes]

    for record in X_train:
        # Column -2 of a row holds its label: 'normal' or an attack sub-class.
        label = str(record[-2]).lower()

        # Normal traffic is always kept.
        if label == 'normal':
            setA_train.append(record)

        # Keep the row once for every selected class that lists its label.
        for label_group in train_label_groups:
            if label in label_group:
                setA_train.append(record)

    # The file name encodes the selection, e.g. "Training-a1-a3" + extension.
    trainingFileName = "Training" \
        + "".join("-a" + str(class_id) for class_id in train_classes) \
        + file_extension

    np.savetxt(trainingFileName, setA_train, delimiter=',', fmt="%s")

    print("Files " + trainingFileName + " have been created in the same folder this script resides\n")

elif len(train_classes) == num_attack_class:
    # Every class was requested, so the original training file can be used as is.
    print("No changes is needed for training dataset!\n")
else:
    print("No attack classes are chosen, thus no new training file is created!\n")
Creating training set.....

Files Training-a1-a3.csv have been created in the same folder this script resides

Create Testing Data Set

print("Creating testing set.....\n")
setA_test = []

# Class ids requested for the testing file. Each id is a 1-based index into
# attacks_subClass; a leading 0 is treated as "no attack class chosen".
test_classes = testing_attack_class_list[0]

# A reduced file is only written when a proper subset of the classes is
# requested. The emptiness check routes an empty selection to the
# "no attack classes" branch instead of raising IndexError on test_classes[0].
if test_classes and test_classes[0] != 0 and \
        len(test_classes) != num_attack_class:

    # Sub-class label lists of the selected classes, looked up once up front.
    test_label_groups = [attacks_subClass[class_id - 1] for class_id in test_classes]

    # Collect the normal rows and the rows of the selected attack classes
    # into setA_test.
    for record in X_test:
        # Column -2 of a row holds its label: 'normal' or an attack sub-class.
        label = str(record[-2]).lower()

        # Normal traffic is always kept.
        if label == 'normal':
            setA_test.append(record)

        # Keep the row once for every selected class that lists its label.
        for label_group in test_label_groups:
            if label in label_group:
                setA_test.append(record)

    # The file name encodes the selection, e.g. "Testing-a2-a4" + extension.
    testingFileName = "Testing" \
        + "".join("-a" + str(class_id) for class_id in test_classes) \
        + file_extension

    np.savetxt(testingFileName, setA_test, delimiter=',', fmt="%s")

    print("Files " + testingFileName + " have been created in the same folder this script resides\n")

elif len(test_classes) == num_attack_class:
    # Every class was requested, so the original testing file can be used as is.
    print("No changes is needed for testing dataset!\n")

else:
    # Message corrected: this branch concerns the testing file, not the training file.
    print("No attack classes are chosen, thus no new testing file is created!\n")
Creating testing set.....

Files Testing-a2-a4.csv have been created in the same folder this script resides

Scenario B (SB)

Enter the Desired Attack Classes

attack_class_1 = [1, 2]
attack_class_1
[1, 2]
attack_class_2 = [1]
attack_class_2
[1]

Append Desired Attack Classes to Training and Testing Attack Class Lists

training_attack_class_list = []
training_attack_class_list.append(attack_class_1)
training_attack_class_list
[[1, 2]]
testing_attack_class_list = []
testing_attack_class_list.append(attack_class_2)
testing_attack_class_list
[[1]]

Select Subclasses and Save to File

Create Training Data Set

print("Creating training set.....\n")
setA_train = []

# Class ids requested for the training file. Each id is a 1-based index into
# attacks_subClass; a leading 0 is treated as "no attack class chosen".
train_classes = training_attack_class_list[0]

# A reduced file is only written when a proper subset of the classes is
# requested. The emptiness check routes an empty selection to the
# "no attack classes" branch instead of raising IndexError on train_classes[0].
if train_classes and train_classes[0] != 0 and \
        len(train_classes) != num_attack_class:

    # Sub-class label lists of the selected classes, looked up once up front.
    train_label_groups = [attacks_subClass[class_id - 1] for class_id in train_classes]

    for record in X_train:
        # Column -2 of a row holds its label: 'normal' or an attack sub-class.
        label = str(record[-2]).lower()

        # Normal traffic is always kept.
        if label == 'normal':
            setA_train.append(record)

        # Keep the row once for every selected class that lists its label.
        for label_group in train_label_groups:
            if label in label_group:
                setA_train.append(record)

    # The file name encodes the selection, e.g. "Training-a1-a2" + extension.
    trainingFileName = "Training" \
        + "".join("-a" + str(class_id) for class_id in train_classes) \
        + file_extension

    np.savetxt(trainingFileName, setA_train, delimiter=',', fmt="%s")

    print("Files " + trainingFileName + " have been created in the same folder this script resides\n")

elif len(train_classes) == num_attack_class:
    # Every class was requested, so the original training file can be used as is.
    print("No changes is needed for training dataset!\n")
else:
    print("No attack classes are chosen, thus no new training file is created!\n")
Creating training set.....

Files Training-a1-a2.csv have been created in the same folder this script resides

Create Testing Data Set

print("Creating testing set.....\n")
setA_test = []

# Class ids requested for the testing file. Each id is a 1-based index into
# attacks_subClass; a leading 0 is treated as "no attack class chosen".
test_classes = testing_attack_class_list[0]

# A reduced file is only written when a proper subset of the classes is
# requested. The emptiness check routes an empty selection to the
# "no attack classes" branch instead of raising IndexError on test_classes[0].
if test_classes and test_classes[0] != 0 and \
        len(test_classes) != num_attack_class:

    # Sub-class label lists of the selected classes, looked up once up front.
    test_label_groups = [attacks_subClass[class_id - 1] for class_id in test_classes]

    # Collect the normal rows and the rows of the selected attack classes
    # into setA_test.
    for record in X_test:
        # Column -2 of a row holds its label: 'normal' or an attack sub-class.
        label = str(record[-2]).lower()

        # Normal traffic is always kept.
        if label == 'normal':
            setA_test.append(record)

        # Keep the row once for every selected class that lists its label.
        for label_group in test_label_groups:
            if label in label_group:
                setA_test.append(record)

    # The file name encodes the selection, e.g. "Testing-a1" + extension.
    testingFileName = "Testing" \
        + "".join("-a" + str(class_id) for class_id in test_classes) \
        + file_extension

    np.savetxt(testingFileName, setA_test, delimiter=',', fmt="%s")

    print("Files " + testingFileName + " have been created in the same folder this script resides\n")

elif len(test_classes) == num_attack_class:
    # Every class was requested, so the original testing file can be used as is.
    print("No changes is needed for testing dataset!\n")

else:
    # Message corrected: this branch concerns the testing file, not the training file.
    print("No attack classes are chosen, thus no new testing file is created!\n")
Creating testing set.....

Files Testing-a1.csv have been created in the same folder this script resides

Scenario C (SC)

Enter the Desired Attack Classes

attack_class_1 = [1, 2]
attack_class_1
[1, 2]
attack_class_2 = [1, 2, 3]
attack_class_2
[1, 2, 3]

Append Desired Attack Classes to Training and Testing Attack Class Lists

training_attack_class_list = []
training_attack_class_list.append(attack_class_1)
training_attack_class_list
[[1, 2]]
testing_attack_class_list = []
testing_attack_class_list.append(attack_class_2)
testing_attack_class_list
[[1, 2, 3]]

Select Subclasses and Save to File

Create Training Data Set

print("Creating training set.....\n")
setA_train = []

# Class ids requested for the training file. Each id is a 1-based index into
# attacks_subClass; a leading 0 is treated as "no attack class chosen".
train_classes = training_attack_class_list[0]

# A reduced file is only written when a proper subset of the classes is
# requested. The emptiness check routes an empty selection to the
# "no attack classes" branch instead of raising IndexError on train_classes[0].
if train_classes and train_classes[0] != 0 and \
        len(train_classes) != num_attack_class:

    # Sub-class label lists of the selected classes, looked up once up front.
    train_label_groups = [attacks_subClass[class_id - 1] for class_id in train_classes]

    for record in X_train:
        # Column -2 of a row holds its label: 'normal' or an attack sub-class.
        label = str(record[-2]).lower()

        # Normal traffic is always kept.
        if label == 'normal':
            setA_train.append(record)

        # Keep the row once for every selected class that lists its label.
        for label_group in train_label_groups:
            if label in label_group:
                setA_train.append(record)

    # The file name encodes the selection, e.g. "Training-a1-a2" + extension.
    trainingFileName = "Training" \
        + "".join("-a" + str(class_id) for class_id in train_classes) \
        + file_extension

    np.savetxt(trainingFileName, setA_train, delimiter=',', fmt="%s")

    print("Files " + trainingFileName + " have been created in the same folder this script resides\n")

elif len(train_classes) == num_attack_class:
    # Every class was requested, so the original training file can be used as is.
    print("No changes is needed for training dataset!\n")
else:
    print("No attack classes are chosen, thus no new training file is created!\n")
Creating training set.....

Files Training-a1-a2.csv have been created in the same folder this script resides

Create Testing Data Set

print("Creating testing set.....\n")
setA_test = []

# Class ids requested for the testing file. Each id is a 1-based index into
# attacks_subClass; a leading 0 is treated as "no attack class chosen".
test_classes = testing_attack_class_list[0]

# A reduced file is only written when a proper subset of the classes is
# requested. The emptiness check routes an empty selection to the
# "no attack classes" branch instead of raising IndexError on test_classes[0].
if test_classes and test_classes[0] != 0 and \
        len(test_classes) != num_attack_class:

    # Sub-class label lists of the selected classes, looked up once up front.
    test_label_groups = [attacks_subClass[class_id - 1] for class_id in test_classes]

    # Collect the normal rows and the rows of the selected attack classes
    # into setA_test.
    for record in X_test:
        # Column -2 of a row holds its label: 'normal' or an attack sub-class.
        label = str(record[-2]).lower()

        # Normal traffic is always kept.
        if label == 'normal':
            setA_test.append(record)

        # Keep the row once for every selected class that lists its label.
        for label_group in test_label_groups:
            if label in label_group:
                setA_test.append(record)

    # The file name encodes the selection, e.g. "Testing-a1-a2-a3" + extension.
    testingFileName = "Testing" \
        + "".join("-a" + str(class_id) for class_id in test_classes) \
        + file_extension

    np.savetxt(testingFileName, setA_test, delimiter=',', fmt="%s")

    print("Files " + testingFileName + " have been created in the same folder this script resides\n")

elif len(test_classes) == num_attack_class:
    # Every class was requested, so the original testing file can be used as is.
    print("No changes is needed for testing dataset!\n")

else:
    # Message corrected: this branch concerns the testing file, not the training file.
    print("No attack classes are chosen, thus no new testing file is created!\n")
Creating testing set.....

Files Testing-a1-a2-a3.csv have been created in the same folder this script resides