5GAD, commit 2ad9afb8: Data_prep.py

## =========================================================================== ##
#| Written by Cooper Coldwell, July 12, 2022                                   |#
#| If you need a better documented/commented version of this code, check the  |#
#| notebook titled "Data_prep.ipynb". Everything is explained in detail       |#
#| there, though it may take longer to run or require more memory. YMMV.      |#
## =========================================================================== ##

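## Expected directory layout, inferred from the paths used below:
##   Normal-1UE/allcap*.pcapng              benign traffic, one UE
##   Normal-2UE/allcap*.pcapng              benign traffic, two UEs
##   Attacks/<attack-name>/Attacks*.pcapng  one folder per attack capture
## Processed outputs are written to NEW-PREPPED-DATA/.
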
from __future__ import absolute_import, division, print_function, unicode_literals
# import cupy as cp
import numpy as np
import pandas as pd
# import cudf as cd
import os, sys
from glob import glob
import binascii
import csv
import pickle
# import PIL.Image as Image
from scapy.all import *  # provides sniff() and the Raw payload layer
from pathlib import Path
from tqdm.auto import tqdm

pathToNormal = 'Normal-1UE/'
pathToNormal2UE = 'Normal-2UE/'
pathToAttack = 'Attacks/'
try:
    os.mkdir('NEW-PREPPED-DATA')
except FileExistsError:
    pass
processedPath = 'NEW-PREPPED-DATA/'

datasets = glob(pathToNormal + 'allcap*.pcapng')
print('\nDatasets: \n', datasets, '\n')

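## Each capture is parsed with scapy's sniff(offline=...), which reads packets
## from a file instead of a live interface. Packets without a Raw layer carry
## no payload and are skipped; the rest are hex-encoded with binascii.hexlify,
## e.g. binascii.hexlify(b'\x16\x03\x01') == b'160301'.
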
print('Start processing normal-1ue data')
print('This could take up to an hour or more')
payloads = []
for file in tqdm(datasets):
    pcap = sniff(offline=str(file))
    for packet in pcap:
        if Raw not in packet:
            continue
        payload = binascii.hexlify(packet[Raw].original)
        payloads.append(payload)
print('\tConverted', len(payloads), 'packets to strings.')
print('\tShuffling and saving to csv')
data = {'raw': payloads}
df = pd.DataFrame(data=data).sample(frac=1).reset_index(drop=True)
df.loc[:, 'label'] = 'normal'
df.to_csv(f"{processedPath}normal_data.csv", index=False)
print('The first 3 processed packets look like: \n', df.head(3))

print('\n\nStart processing normal-2ue data')
print('This may take a while')
datasets = glob(pathToNormal2UE + 'allcap*.pcapng')
payloads = []
for file in datasets:
    pcap = sniff(offline=str(file))
    for packet in pcap:
        if Raw not in packet:
            continue
        payload = binascii.hexlify(packet[Raw].original)
        payloads.append(payload)
print('\tConverted', len(payloads), 'packets to strings.')
print('\tPickling to avoid data loss in the event memory runs out')
with open('2ue.p', 'wb') as file:
    pickle.dump(payloads, file)
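## Checkpoint: if a later step runs out of memory, the hex payloads can be
## reloaded from 2ue.p on a rerun instead of re-parsing the captures.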

with open('2ue.p', 'rb') as file:
    payloads = pickle.load(file)
print('\tShuffling and saving to csv')
data = {'raw': payloads, 'label': ['normal'] * len(payloads)}
df = pd.DataFrame(data=data).sample(frac=1).reset_index(drop=True)
df.to_csv(f"{processedPath}normal_data_2ue.csv", index=False)

print('\n\nStart processing attack data')
print('This should be quicker')

try:
    del datasets, payload, payloads, data, df  # release memory from the previous stage
except NameError:
    pass

sets = []
for i in os.listdir(pathToAttack):
    dataset = glob(pathToAttack + i + '/Attacks*.pcapng')
    try:
        sets.append(str(dataset[0]))
    except IndexError:
        print("Failed to find 'Attacks*.pcapng' file in folder: ", str(pathToAttack + i))

payloads = []
for file in sets:
    pcap = sniff(offline=str(file))
    for packet in pcap:
        if Raw not in packet:
            continue
        payload = binascii.hexlify(packet[Raw].original)
        payloads.append(payload)
print('\tConverted', len(payloads), 'packets to strings.')

print('\tShuffling and saving to csv')
data = {'raw': payloads}
df = pd.DataFrame(data=data)
df.loc[:, 'label'] = 'attack'
df.to_csv(f"{processedPath}malicious_data.csv", index=False)

try:
    del df
except NameError:
    pass

print('\n\nReading the data back in from the CSVs')
normal = pd.read_csv(f"{processedPath}normal_data.csv")
normal2UE = pd.read_csv(f"{processedPath}normal_data_2ue.csv")
malicious = pd.read_csv(f"{processedPath}malicious_data.csv")

print('\nCreating a data set with equal parts attack and normal')
mixed = malicious.sample(frac=1, random_state=100)  # take all the malicious packets
mixed = pd.concat([mixed, normal.sample(frac=1, random_state=100)[0:len(malicious)//2]])     # append half as many packets from normal-1ue
mixed = pd.concat([mixed, normal2UE.sample(frac=1, random_state=100)[0:len(malicious)//2]])  # append half as many packets from normal-2ue
mixed = mixed.sample(frac=1, random_state=1)  # shuffle the data before processing
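## Balance check: len(malicious) attack rows plus 2 * (len(malicious)//2) normal
## rows. When len(malicious) is even this gives an exact 50/50 split of size
## 2*len(malicious); an odd count would make the size check below fail by one.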

## Separate the labels (important for using the mixed data to evaluate an autoencoder)
mixed_labels = mixed.pop('label')
np.save(f'{processedPath}mixed_labels.npy', mixed_labels)
del mixed_labels
print('Packets in malicious: ', len(malicious))
print('Packets in mixed: ', len(mixed))
print('Mixed set is of the expected size: ', len(malicious)*2 == len(mixed))

print('\nPad the payloads to the same length, then convert to an array of bytes.')
print("The output is saved as:")
print('\t- mixed.npy')
max_packet_length = 1024
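## Note on the conversion below: pandas wrote the hexlified payloads to CSV as
## their Python bytes repr, e.g. "b'68656c6c6f'", so split("'")[1] recovers the
## hex text. Each string is then zero-padded (or truncated) to 1024 characters,
## and np.frombuffer yields the uint8 ASCII codes of those hex characters
## (48-57 and 97-102), one byte per hex digit, rather than the raw packet bytes.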
def ReshapePackets(dataFrame, saveToFilename, max_packet_length):
    '''Convert byte strings in a DataFrame to a fixed-length numpy array of bytes.'''
    array = np.array(dataFrame['raw'])
    array = np.ascontiguousarray(array)
    payloads = []
    for i in range(array.shape[0]):
        # Standardize the length of the strings:
        payloadStr = array[i].split('\'')[1]
        payloadStr = payloadStr.ljust(max_packet_length + 2, '0')
        payloadStr = payloadStr[0:max_packet_length]
        array[i] = payloadStr.encode('utf8')
        # Convert to an array of bytes, reshaped to (length, 1, 1):
        array[i] = np.frombuffer(array[i], dtype=np.uint8, count=max_packet_length)
        payloads.append(np.reshape(array[i], (array[i].shape[0], 1, 1)))
    payloads = np.array(payloads)
    print('New data shape: ', payloads.shape)
    np.save(saveToFilename, payloads)

ReshapePackets(mixed, f'{processedPath}mixed.npy', max_packet_length)
del mixed

print('\nCreating a data set with equal parts normal-1ue and normal-2ue')
totalNormal = pd.concat([normal.sample(frac=1, random_state=2022),
                         normal2UE.sample(frac=1, random_state=100)[0:len(normal)]
                         ])
totalNormal = totalNormal.sample(frac=1, random_state=2022)

print("\nPad the sets' payloads, then convert them to arrays of bytes.\nThe outputs are saved as:")
print("\t- normal.npy")
print("\t- normal2UE.npy")
print("\t- total_normal.npy")
ReshapePackets(normal, f'{processedPath}normal.npy', max_packet_length)
del normal
ReshapePackets(normal2UE, f'{processedPath}normal2UE.npy', max_packet_length)
del normal2UE
ReshapePackets(totalNormal, f'{processedPath}total_normal.npy', max_packet_length)
del totalNormal

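## Artifacts produced in NEW-PREPPED-DATA/: normal_data.csv, normal_data_2ue.csv,
## malicious_data.csv, mixed_labels.npy, mixed.npy, normal.npy, normal2UE.npy,
## and total_normal.npy; each .npy array is shaped (num_packets, 1024, 1, 1).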