## =========================================================================== ##
#| Written by Cooper Coldwell, July 12, 2022                                  |#
#| If you need a better-documented/commented version of this code, check the  |#
#| notebook titled "Data_prep.ipynb". Everything is explained in detail       |#
#| there, though it may take longer to run or require more memory. YMMV.      |#
## =========================================================================== ##
| 9 | + | from __future__ import absolute_import, division, print_function, unicode_literals |
| 10 | + | # import cupy as cp |
| 11 | + | import numpy as np |
| 12 | + | import pandas as pd |
| 13 | + | # import cudf as cd |
| 14 | + | import os, sys |
| 15 | + | import glob as glob |
| 16 | + | import binascii |
| 17 | + | import csv |
| 18 | + | import pickle |
| 19 | + | # import PIL.Image as Image |
| 20 | + | from scapy.all import * |
| 21 | + | from pathlib import Path |
| 22 | + | from tqdm.auto import tqdm |
| 23 | + | |
# Locations of the raw capture data and the output directory.
pathToNormal = 'Normal-1UE/'
pathToNormal2UE = 'Normal-2UE/'
pathToAttack = 'Attacks/'

# Create the output directory if it doesn't already exist (replaces the
# original bare try/except around os.mkdir, which also swallowed real errors
# such as permission failures).
os.makedirs('NEW-PREPPED-DATA', exist_ok=True)
processedPath = 'NEW-PREPPED-DATA/'

# BUG FIX: `import glob as glob` binds the *module*, which is not callable,
# so the original `glob(...)` call raised TypeError. Use glob.glob(...).
datasets = glob.glob(pathToNormal + 'allcap*.pcapng')
print('\nDatasets: \n', datasets, '\n')
| 35 | + | |
print('Start processing normal-1ue data')
print('This could take up to an hour or more')

# Hex-encode the Raw-layer bytes of every packet in the 1-UE normal captures.
# NOTE: the loop variable is deliberately named `file`; later sections of the
# original script reference it, so the name is kept for compatibility.
payloads = []
for file in tqdm(datasets):
    for packet in sniff(offline=str(file)):
        if Raw in packet:
            payloads.append(binascii.hexlify(packet[Raw].original))
print('\tConverted', len(payloads), 'packets to strings.')

# Shuffle the rows, attach the class label, and persist as CSV.
print('\tShuffling and saving to csv')
df = pd.DataFrame(data={'raw': payloads}).sample(frac=1).reset_index(drop=True)
df.loc[:, 'label'] = 'normal'
df.to_csv(f"{processedPath}normal_data.csv", index=False)
print('The first 3 processed packets look like: \n', df.head(3))
| 53 | + | |
print('\n\nStart processing normal-2ue data')
print('This may take a while')

# BUG FIX (two defects in the original):
#   1. `glob(...)` called the glob *module* -> TypeError; use glob.glob(...).
#   2. The loop read `for pcap in datasets:` but then sniffed the stale
#      `file` variable left over from the 1-UE section, so the same single
#      capture was re-read once per dataset and the 2-UE files were never
#      actually processed.
datasets = glob.glob(pathToNormal2UE + 'allcap*.pcapng')
payloads = []
for file in tqdm(datasets):
    pcap = sniff(offline=str(file))
    for packet in pcap:
        if Raw not in packet:
            continue
        payloads.append(binascii.hexlify(packet[Raw].original))
print('\tConverted', len(payloads), 'packets to strings.')

# Checkpoint the payload list to disk so a later OOM doesn't lose the work.
print('\tPickling to avoid data loss in the event memory runs out')
with open('2ue.p', 'wb') as file:
    pickle.dump(payloads, file)

with open('2ue.p', 'rb') as file:
    payloads = pickle.load(file)

print('\tShuffling and saving to csv')
data = {'raw': payloads, 'label': ['normal'] * len(payloads)}
df = pd.DataFrame(data=data).sample(frac=1).reset_index(drop=True)
df.to_csv(f"{processedPath}normal_data_2ue.csv", index=False)
| 77 | + | |
print('\n\nStart processing attack data')
print('This should be quicker')

# Free memory from the previous sections before loading the attack captures.
# BUG FIX: the original `del dataset, payload, payloads, data, df` named
# `dataset`, which was never defined at this point, so the statement raised
# NameError on the first name and the except swallowed it -- nothing was ever
# freed. Delete each name individually so the ones that do exist are dropped.
for _name in ('dataset', 'payload', 'payloads', 'data', 'df'):
    try:
        del globals()[_name]
    except KeyError:
        pass

# Each attack scenario lives in its own sub-folder holding one
# 'Attacks*.pcapng' capture; collect the first match from each folder.
sets = []
for i in os.listdir(pathToAttack):
    # BUG FIX: glob module is not callable -> glob.glob(...).
    dataset = glob.glob(pathToAttack + i + '/Attacks*.pcapng')
    if dataset:
        sets.append(str(dataset[0]))
    else:
        print("Failed to find 'Attacks*.pcapng' file in folder: ", str(pathToAttack + i))
| 95 | + | |
# Hex-encode the Raw-layer bytes of every packet in each attack capture.
payloads = []
for file in sets:
    pcap = sniff(offline=str(file))
    # Keep only packets that carry a Raw payload (the original filtered with
    # pcap[Raw] and then re-checked `Raw in packet`; one check suffices).
    for packet in pcap:
        if Raw in packet:
            payloads.append(binascii.hexlify(packet[Raw].original))
print('\tConverted', len(payloads), 'packets to strings.')

print('\tShuffling and saving to csv')
data = {'raw': payloads}
df = pd.DataFrame(data=data)
df.loc[:, 'label'] = 'attack'
df.to_csv(f"{processedPath}malicious_data.csv", index=False)
| 112 | + | |
| 113 | + | |
# Drop the last attack DataFrame (if it exists) before reloading fresh copies.
try:
    del df
except:
    pass

print('\n\nReading the data back in from the CSVs')
normal = pd.read_csv(f"{processedPath}normal_data.csv")
normal2UE = pd.read_csv(f"{processedPath}normal_data_2ue.csv")
malicious = pd.read_csv(f"{processedPath}malicious_data.csv")

print('\nCreating a data set with equal parts attack and normal')
# Take every malicious packet plus half as many packets from each of the two
# normal captures, then shuffle the combined frame.
half = len(malicious) // 2
mixed = pd.concat([
    malicious.sample(frac=1, random_state=100),
    normal.sample(frac=1, random_state=100)[0:half],
    normal2UE.sample(frac=1, random_state=100)[0:half],
])
mixed = mixed.sample(frac=1, random_state=1)

## Separate the labels (important for using the mixed data to evaluate an autoencoder)
mixed_labels = mixed.pop('label')
np.save(f'{processedPath}mixed_labels.npy', mixed_labels)
del mixed_labels

print('Packets in malicious: ', len(malicious))
print('Packets in mixed: ', len(mixed))
print('Mixed set is of the expected size: ', len(malicious) * 2 == len(mixed))

print('\nPad the payloads to the same length, then convert to an array of bytes.')
print("The output is saved as:")
print('\t- mixed.npy')
max_packet_length = 1024
def ReshapePackets(dataFrame, saveToFilename, max_packet_length):
    '''Convert hex-payload strings in ``dataFrame['raw']`` to a uint8 array and save it.

    The CSV round-trip stores each payload as the repr of a bytes object
    (e.g. "b'48656c6c6f'"), so the hex text is recovered by splitting on the
    single quotes. Each string is right-padded with '0' and truncated to
    ``max_packet_length`` characters, then UTF-8 encoded so every character
    becomes one uint8. The stacked result has shape
    (num_packets, max_packet_length, 1, 1) and is written to
    ``saveToFilename`` with ``np.save``.

    (Cleanup vs. the original: removed the no-op ``array.shape`` statement
    and the needless in-place mutation of the object array.)
    '''
    raw = np.ascontiguousarray(np.array(dataFrame['raw']))
    payloads = []
    for i in range(raw.shape[0]):
        # Strip the b'...' wrapper to get the bare hex text.
        payloadStr = raw[i].split('\'')[1]
        # Standardize the length: pad with '0', then cut to max_packet_length.
        payloadStr = payloadStr.ljust(max_packet_length + 2, u'0')[0:max_packet_length]
        # One byte per character; reshape to the (length, 1, 1) layout.
        as_bytes = np.frombuffer(payloadStr.encode('utf8'),
                                 dtype=np.uint8, count=max_packet_length)
        payloads.append(as_bytes.reshape(max_packet_length, 1, 1))
    payloads = np.array(payloads)
    print('New data shape: ', payloads.shape)
    np.save(saveToFilename, payloads)
| 161 | + | |
# Convert the mixed set first, then release it to keep peak memory down.
ReshapePackets(mixed, f'{processedPath}mixed.npy', max_packet_length)
del mixed

print('\nCreating a data set with equal parts normal-1ue and normal-2ue')
# All of normal-1ue plus an equally sized slice of normal-2ue, reshuffled.
totalNormal = pd.concat([
    normal.sample(frac=1, random_state=2022),
    normal2UE.sample(frac=1, random_state=100)[0:len(normal)],
])
totalNormal = totalNormal.sample(frac=1, random_state=2022)

print("\nPad the sets' payloads, then convert them to arrays of bytes.\nThe outputs are saved as:")
print("\t- normal.npy")
print("\t- normal2UE.npy")
print("\t- total_normal.npy")
# Convert and free each frame in turn.
ReshapePackets(normal, f'{processedPath}normal.npy', max_packet_length)
del normal
ReshapePackets(normal2UE, f'{processedPath}normal2UE.npy', max_packet_length)
del normal2UE
ReshapePackets(totalNormal, f'{processedPath}total_normal.npy', max_packet_length)
del totalNormal
| 181 | + | |