## =========================================================================== ##
#| Written by Cooper Coldwell, July 12, 2022                                  |#
#| If you need a better-documented/commented version of this code, check the  |#
#| notebook titled "Data_prep.ipynb". Everything is explained in detail       |#
#| there, though it may take longer to run or require more memory. YMMV.      |#
## =========================================================================== ##
| 9 | + | from __future__ import absolute_import, division, print_function, unicode_literals |
| 10 | + | # import cupy as cp |
| 11 | + | import numpy as np |
| 12 | + | import pandas as pd |
| 13 | + | # import cudf as cd |
| 14 | + | import os, sys |
| 15 | + | import glob as glob |
| 16 | + | import binascii |
| 17 | + | import csv |
| 18 | + | import pickle |
| 19 | + | # import PIL.Image as Image |
| 20 | + | from scapy.all import * |
| 21 | + | from pathlib import Path |
| 22 | + | from tqdm.auto import tqdm |
| 23 | + | |
# Locations of the raw capture data and the output directory.
pathToNormal = 'Normal-1UE/'
pathToNormal2UE = 'Normal-2UE/'
pathToAttack = 'Attacks/'

# Create the output directory if it doesn't already exist (replaces the
# original bare try/except around os.mkdir, which also swallowed real errors
# such as permission failures).
os.makedirs('NEW-PREPPED-DATA', exist_ok=True)
processedPath = 'NEW-PREPPED-DATA/'

# BUG FIX: `import glob as glob` binds the *module*, which is not callable,
# so the original `glob(...)` call raised TypeError. Use glob.glob(...).
datasets = glob.glob(pathToNormal + 'allcap*.pcapng')
print('\nDatasets: \n', datasets, '\n')
| 35 | + | |
print('Start processing normal-1ue data')
print('This could take up to an hour or more')

# Hex-encode the Raw-layer bytes of every packet in the 1-UE normal captures.
# NOTE: the loop variable is deliberately named `file`; later sections of the
# original script reference it, so the name is kept for compatibility.
payloads = []
for file in tqdm(datasets):
    for packet in sniff(offline=str(file)):
        if Raw in packet:
            payloads.append(binascii.hexlify(packet[Raw].original))
print('\tConverted', len(payloads), 'packets to strings.')

# Shuffle the rows, attach the class label, and persist as CSV.
print('\tShuffling and saving to csv')
df = pd.DataFrame(data={'raw': payloads}).sample(frac=1).reset_index(drop=True)
df.loc[:, 'label'] = 'normal'
df.to_csv(f"{processedPath}normal_data.csv", index=False)
print('The first 3 processed packets look like: \n', df.head(3))
| 53 | + | |
print('\n\nStart processing normal-2ue data')
print('This may take a while')

# BUG FIX (two defects in the original):
#   1. `glob(...)` called the glob *module* -> TypeError; use glob.glob(...).
#   2. The loop read `for pcap in datasets:` but then sniffed the stale
#      `file` variable left over from the 1-UE section, so the same single
#      capture was re-read once per dataset and the 2-UE files were never
#      actually processed.
datasets = glob.glob(pathToNormal2UE + 'allcap*.pcapng')
payloads = []
for file in tqdm(datasets):
    pcap = sniff(offline=str(file))
    for packet in pcap:
        if Raw not in packet:
            continue
        payloads.append(binascii.hexlify(packet[Raw].original))
print('\tConverted', len(payloads), 'packets to strings.')

# Checkpoint the payload list to disk so a later OOM doesn't lose the work.
print('\tPickling to avoid data loss in the event memory runs out')
with open('2ue.p', 'wb') as file:
    pickle.dump(payloads, file)

with open('2ue.p', 'rb') as file:
    payloads = pickle.load(file)

print('\tShuffling and saving to csv')
data = {'raw': payloads, 'label': ['normal'] * len(payloads)}
df = pd.DataFrame(data=data).sample(frac=1).reset_index(drop=True)
df.to_csv(f"{processedPath}normal_data_2ue.csv", index=False)
| 77 | + | |
print('\n\nStart processing attack data')
print('This should be quicker')

# Free memory from the previous sections before loading the attack captures.
# BUG FIX: the original `del dataset, payload, payloads, data, df` named
# `dataset`, which was never defined at this point, so the statement raised
# NameError on the first name and the except swallowed it -- nothing was ever
# freed. Delete each name individually so the ones that do exist are dropped.
for _name in ('dataset', 'payload', 'payloads', 'data', 'df'):
    try:
        del globals()[_name]
    except KeyError:
        pass

# Each attack scenario lives in its own sub-folder holding one
# 'Attacks*.pcapng' capture; collect the first match from each folder.
sets = []
for i in os.listdir(pathToAttack):
    # BUG FIX: glob module is not callable -> glob.glob(...).
    dataset = glob.glob(pathToAttack + i + '/Attacks*.pcapng')
    if dataset:
        sets.append(str(dataset[0]))
    else:
        print("Failed to find 'Attacks*.pcapng' file in folder: ", str(pathToAttack + i))
| 95 | + | |
# Hex-encode the Raw-layer bytes of every packet in each attack capture.
payloads = []
for file in sets:
    pcap = sniff(offline=str(file))
    # Keep only packets that carry a Raw payload (the original filtered with
    # pcap[Raw] and then re-checked `Raw in packet`; one check suffices).
    for packet in pcap:
        if Raw in packet:
            payloads.append(binascii.hexlify(packet[Raw].original))
print('\tConverted', len(payloads), 'packets to strings.')

print('\tShuffling and saving to csv')
data = {'raw': payloads}
df = pd.DataFrame(data=data)
df.loc[:, 'label'] = 'attack'
df.to_csv(f"{processedPath}malicious_data.csv", index=False)
| 112 | + | |
| 113 | + | |
# Drop the last attack DataFrame (if it exists) before reloading fresh copies.
try:
    del df
except:
    pass

print('\n\nReading the data back in from the CSVs')
normal = pd.read_csv(f"{processedPath}normal_data.csv")
normal2UE = pd.read_csv(f"{processedPath}normal_data_2ue.csv")
malicious = pd.read_csv(f"{processedPath}malicious_data.csv")

print('\nCreating a data set with equal parts attack and normal')
# Take every malicious packet plus half as many packets from each of the two
# normal captures, then shuffle the combined frame.
half = len(malicious) // 2
mixed = pd.concat([
    malicious.sample(frac=1, random_state=100),
    normal.sample(frac=1, random_state=100)[0:half],
    normal2UE.sample(frac=1, random_state=100)[0:half],
])
mixed = mixed.sample(frac=1, random_state=1)

## Separate the labels (important for using the mixed data to evaluate an autoencoder)
mixed_labels = mixed.pop('label')
np.save(f'{processedPath}mixed_labels.npy', mixed_labels)
del mixed_labels

print('Packets in malicious: ', len(malicious))
print('Packets in mixed: ', len(mixed))
print('Mixed set is of the expected size: ', len(malicious) * 2 == len(mixed))

print('\nPad the payloads to the same length, then convert to an array of bytes.')
print("The output is saved as:")
print('\t- mixed.npy')
max_packet_length = 1024
def ReshapePackets(dataFrame, saveToFilename, max_packet_length):
    '''Convert hex-payload strings in ``dataFrame['raw']`` to a uint8 array and save it.

    The CSV round-trip stores each payload as the repr of a bytes object
    (e.g. "b'48656c6c6f'"), so the hex text is recovered by splitting on the
    single quotes. Each string is right-padded with '0' and truncated to
    ``max_packet_length`` characters, then UTF-8 encoded so every character
    becomes one uint8. The stacked result has shape
    (num_packets, max_packet_length, 1, 1) and is written to
    ``saveToFilename`` with ``np.save``.

    (Cleanup vs. the original: removed the no-op ``array.shape`` statement
    and the needless in-place mutation of the object array.)
    '''
    raw = np.ascontiguousarray(np.array(dataFrame['raw']))
    payloads = []
    for i in range(raw.shape[0]):
        # Strip the b'...' wrapper to get the bare hex text.
        payloadStr = raw[i].split('\'')[1]
        # Standardize the length: pad with '0', then cut to max_packet_length.
        payloadStr = payloadStr.ljust(max_packet_length + 2, u'0')[0:max_packet_length]
        # One byte per character; reshape to the (length, 1, 1) layout.
        as_bytes = np.frombuffer(payloadStr.encode('utf8'),
                                 dtype=np.uint8, count=max_packet_length)
        payloads.append(as_bytes.reshape(max_packet_length, 1, 1))
    payloads = np.array(payloads)
    print('New data shape: ', payloads.shape)
    np.save(saveToFilename, payloads)
| 161 | + | |
# Convert the mixed set first, then release it to keep peak memory down.
ReshapePackets(mixed, f'{processedPath}mixed.npy', max_packet_length)
del mixed

print('\nCreating a data set with equal parts normal-1ue and normal-2ue')
# All of normal-1ue plus an equally sized slice of normal-2ue, reshuffled.
totalNormal = pd.concat([
    normal.sample(frac=1, random_state=2022),
    normal2UE.sample(frac=1, random_state=100)[0:len(normal)],
])
totalNormal = totalNormal.sample(frac=1, random_state=2022)

print("\nPad the sets' payloads, then convert them to arrays of bytes.\nThe outputs are saved as:")
print("\t- normal.npy")
print("\t- normal2UE.npy")
print("\t- total_normal.npy")
# Convert and free each frame in turn.
ReshapePackets(normal, f'{processedPath}normal.npy', max_packet_length)
del normal
ReshapePackets(normal2UE, f'{processedPath}normal2UE.npy', max_packet_length)
del normal2UE
ReshapePackets(totalNormal, f'{processedPath}total_normal.npy', max_packet_length)
del totalNormal
| 181 | + | |