224 lines
6.8 KiB
Python
224 lines
6.8 KiB
Python
import warnings
|
|
warnings.filterwarnings("ignore")
|
|
import psutil
|
|
import pandas as pd
|
|
import numpy as np
|
|
import json
|
|
import gc
|
|
from sklearn.preprocessing import StandardScaler
|
|
from sklearn.metrics import (
|
|
confusion_matrix,
|
|
accuracy_score,
|
|
precision_recall_fscore_support,
|
|
)
|
|
|
|
from tabulate import tabulate
|
|
|
|
|
|
import os
|
|
|
|
def _force_memory_cleanup():
    """Run a garbage-collection pass, then ask libc to trim its heap.

    The trim step is best-effort: if ``libc.so.6`` cannot be loaded or
    ``malloc_trim`` cannot be called, the failure is ignored.
    """
    gc.collect()
    try:
        import ctypes

        libc = ctypes.CDLL("libc.so.6")
        libc.malloc_trim(0)
    except Exception:
        # Deliberately swallowed: trimming is an optimisation only.
        return
|
|
|
|
# Absolute directory containing this module; the paths below are built
# from it rather than from the current working directory.
BASE_DIR = os.path.abspath(os.path.dirname(__file__))
# JSON file that update_progress() rewrites on every progress report.
PROGRESS_PATH = os.path.join(BASE_DIR, "progress.json")
# Default CSV consumed by run_autoencoder().
DATASET_PATH = os.path.join(BASE_DIR, "dataset", "cicids2017_cleaned.csv")
|
|
|
|
|
|
def update_progress(value):
    """Overwrite the progress file with ``{"progress": value}`` as JSON.

    Args:
        value: JSON-serialisable progress indicator stored under the
            ``"progress"`` key.
    """
    snapshot = {"progress": value}
    with open(PROGRESS_PATH, mode="w") as progress_file:
        json.dump(snapshot, progress_file)
|
|
|
|
|
|
def run_autoencoder(csv_path=DATASET_PATH, plot=False, table=False):
    """Train a dense autoencoder on normal traffic and score every row.

    Pipeline:
      1. Load the CSV and label each row 0 when its ``"Attack Type"``
         contains "normal" (case-insensitive), otherwise 1.
      2. Standardise all remaining columns and train the autoencoder on
         the label-0 rows only (up to 20 epochs, batch size 64, 10 %
         validation split, early stopping with patience 5).
      3. Score every row by its mean absolute reconstruction error and
         flag rows above the 60th percentile of that error as attacks.
      4. Compute accuracy and per-class precision/recall/F1, and pick the
         five highest-error rows as "top anomalies".

    Progress (1, 10, 40, 60, 80, 90, 100) is published through
    ``update_progress``; the process RSS is sampled at each step before
    100 to track the peak.

    Args:
        csv_path: Path of the CSV to load. It must contain an
            ``"Attack Type"`` column; every other column is converted to
            float32 and used as a feature.
        plot: Accepted for interface compatibility; currently unused.
        table: When true, print a precision/recall/F1 table to stdout.

    Returns:
        dict with ``normal_count``, ``attack_count``, ``accuracy``,
        ``precision_*``/``recall_*``/``f1_*`` for ``normal`` and
        ``attack`` (all floats), ``top_anomalies`` (list of row dicts
        including ``reconstruction_error``), and ``ram_before``,
        ``ram_peak``, ``ram_after``, ``ram_increase`` in MiB rounded to
        two decimals.
    """
    # Imported inside the function, so TensorFlow is only loaded when a
    # run is actually requested.
    from tensorflow.keras import Model
    from tensorflow.keras import backend as K
    from tensorflow.keras.callbacks import EarlyStopping
    from tensorflow.keras.layers import Dense, Dropout, Input
    from tensorflow.keras.optimizers import Adam

    process = psutil.Process()
    ram_before = process.memory_info().rss
    ram_peak = ram_before

    def checkpoint(percent):
        """Publish progress, then fold the current RSS into the peak."""
        nonlocal ram_peak
        update_progress(percent)
        ram_peak = max(ram_peak, process.memory_info().rss)

    try:
        checkpoint(1)

        df = pd.read_csv(csv_path)
        checkpoint(10)

        df["fraud"] = df["Attack Type"].apply(
            lambda label: 0 if "normal" in str(label).lower() else 1
        )
        features = [c for c in df.columns if c not in ["Attack Type", "fraud"]]

        X_raw = df[features].to_numpy(dtype=np.float32, copy=True)
        y = df["fraud"].to_numpy(dtype=np.int8, copy=True)

        # Counted on the NumPy copy so no Series view of ``df`` outlives it.
        normal_count = int((y == 0).sum())
        attack_count = int((y == 1).sum())

        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(X_raw).astype(np.float32, copy=False)

        # The model only ever sees normal traffic during training.
        X_train_normal = X_scaled[y == 0]

        CODE_DIM = 16
        INPUT_SHAPE = X_scaled.shape[1]

        # Encoder: 128 -> 64 -> 16 -> code; decoder mirrors it back out.
        inp = Input(shape=(INPUT_SHAPE,))
        x = Dense(128, activation="relu")(inp)
        x = Dropout(0.1)(x)
        x = Dense(64, activation="relu")(x)
        x = Dense(16, activation="relu")(x)
        code = Dense(CODE_DIM, activation="relu")(x)
        x = Dense(16, activation="relu")(code)
        x = Dense(64, activation="relu")(x)
        x = Dense(128, activation="relu")(x)
        out = Dense(INPUT_SHAPE, activation="linear")(x)

        autoencoder = Model(inp, out)
        autoencoder.compile(loss="mae", optimizer=Adam(learning_rate=0.001))
        checkpoint(40)

        earlystopping = EarlyStopping(
            monitor="val_loss", patience=5, restore_best_weights=True
        )
        autoencoder.fit(
            X_train_normal,
            X_train_normal,
            epochs=20,
            batch_size=64,
            validation_split=0.1,
            callbacks=[earlystopping],
            shuffle=True,
            verbose=1,
        )
        checkpoint(60)

        # Score the full dataset (normal and attack rows alike).
        reconstructions = autoencoder.predict(X_scaled, verbose=0)
        reconstruction_error = np.mean(
            np.abs(reconstructions - X_scaled), axis=1
        )

        recons_df = pd.DataFrame(
            {"error": reconstruction_error, "y_true": y}
        ).reset_index(drop=True)

        # Rows whose error exceeds the 60th percentile are predicted attacks.
        threshold = np.percentile(recons_df["error"], 60)
        recons_df["y_pred"] = (recons_df["error"] > threshold).astype(int)
        checkpoint(80)

        accuracy = accuracy_score(recons_df["y_true"], recons_df["y_pred"])
        precision, recall, f1, _ = precision_recall_fscore_support(
            recons_df["y_true"],
            recons_df["y_pred"],
            average=None,
            labels=[0, 1],
        )
        checkpoint(90)

        if table:
            table_data = [
                ["Normal (0)", f"{precision[0]:.4f}", f"{recall[0]:.4f}", f"{f1[0]:.4f}"],
                ["Attack (1)", f"{precision[1]:.4f}", f"{recall[1]:.4f}", f"{f1[1]:.4f}"],
                ["Overall Accuracy", "-", "-", f"{accuracy:.4f}"],
            ]
            print(
                tabulate(
                    table_data,
                    headers=["Class", "Precision", "Recall", "F1-Score"],
                    tablefmt="fancy_grid",
                )
            )

        results = {
            "normal_count": float(normal_count),
            "attack_count": float(attack_count),
            "accuracy": float(accuracy),
            "precision_normal": float(precision[0]),
            "recall_normal": float(recall[0]),
            "f1_normal": float(f1[0]),
            "precision_attack": float(precision[1]),
            "recall_attack": float(recall[1]),
            "f1_attack": float(f1[1]),
        }

        # Prefer true positives; widen to every flagged row, then to all
        # rows, whenever fewer than five candidates are available.
        candidates = recons_df[
            (recons_df["y_pred"] == 1) & (recons_df["y_true"] == 1)
        ].copy()
        if len(candidates) < 5:
            candidates = pd.concat(
                [candidates, recons_df[recons_df["y_pred"] == 1].copy()]
            ).drop_duplicates()
        if len(candidates) < 5:
            candidates = recons_df.copy()

        candidates = candidates.sort_values("error", ascending=False).head(5)

        # recons_df was built with a fresh 0..n-1 index, so its labels are
        # used directly as row positions in df.
        df_top = df.iloc[candidates.index].copy()
        df_top["reconstruction_error"] = candidates["error"].values

        important_cols = [
            "Attack Type",
            "Destination Port",
            "Flow Duration",
            "Total Fwd Packets",
            "Flow Packets/s",
            "Packet Length Mean",
        ]
        cols_exist = [c for c in important_cols if c in df_top.columns]

        results["top_anomalies"] = df_top[
            cols_exist + ["reconstruction_error"]
        ].to_dict(orient="records")

        # Drop every large object before measuring the post-run RSS. The
        # callback and the layer tensors are included because they keep
        # the model referenced.
        del X_raw, X_scaled, X_train_normal, y
        del reconstructions, reconstruction_error, recons_df, candidates, df_top
        del autoencoder, earlystopping, inp, x, code, out, scaler, df
    finally:
        # Runs on failure too, so an aborted run does not leave Keras
        # state behind in a long-lived process. A single call without
        # keyword arguments is used so it does not depend on the
        # ``free_memory`` keyword being supported by the installed Keras.
        K.clear_session()
        _force_memory_cleanup()

    ram_after = process.memory_info().rss
    mib = 1024 ** 2
    results["ram_before"] = round(ram_before / mib, 2)
    results["ram_peak"] = round(ram_peak / mib, 2)
    results["ram_after"] = round(ram_after / mib, 2)
    results["ram_increase"] = round((ram_peak - ram_before) / mib, 2)

    update_progress(100)

    return results
|
|
|
|
# Script entry point: run the full pipeline with the metrics table
# enabled and print the resulting dictionary.
if __name__ == "__main__":
    print(run_autoencoder(plot=True, table=True))
|