"""Autoencoder-based anomaly detection over a labelled network-flow CSV.

The module trains a dense autoencoder on the rows labelled as normal,
scores every row by its mean absolute reconstruction error, flags the rows
whose error exceeds a percentile threshold, and reports classification
metrics, the highest-error rows and sampled process memory usage.
Progress is published to ``progress.json`` next to this file.
"""
import warnings

# Installed before the third-party imports below so that warnings emitted
# while those packages load are filtered as well.
warnings.filterwarnings("ignore")

import psutil
import pandas as pd
import numpy as np
import json
import gc
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    confusion_matrix,
    accuracy_score,
    precision_recall_fscore_support,
)
from tabulate import tabulate
import os


def _force_memory_cleanup():
    """Run the garbage collector and, where available, trim the C heap.

    ``malloc_trim`` is a glibc call, so it is attempted on a best-effort
    basis and skipped when ``libc.so.6`` or the symbol cannot be loaded.
    """
    gc.collect()
    try:
        import ctypes

        ctypes.CDLL("libc.so.6").malloc_trim(0)
    except (ImportError, OSError, AttributeError):
        # No ctypes / no glibc / no malloc_trim: nothing more to release.
        pass


BASE_DIR = os.path.dirname(os.path.abspath(__file__))
PROGRESS_PATH = os.path.join(BASE_DIR, "progress.json")
DATASET_PATH = os.path.join(BASE_DIR, "dataset", "cicids2017_cleaned.csv")

# Width of the bottleneck ("code") layer of the autoencoder.
_CODE_DIM = 16
# Rows whose reconstruction error lies above this percentile are flagged.
_THRESHOLD_PERCENTILE = 60
# Number of highest-error rows reported under ``top_anomalies``.
_TOP_K = 5
# Columns copied into each ``top_anomalies`` record when the CSV has them.
_IMPORTANT_COLS = (
    "Attack Type",
    "Destination Port",
    "Flow Duration",
    "Total Fwd Packets",
    "Flow Packets/s",
    "Packet Length Mean",
)


def update_progress(value):
    """Overwrite ``progress.json`` with ``{"progress": value}``."""
    with open(PROGRESS_PATH, "w") as f:
        json.dump({"progress": value}, f)


class _RamTracker:
    """Track the highest resident set size seen at progress checkpoints.

    The "peak" is the maximum over the sampled checkpoints only; usage
    between two checkpoints is not observed.
    """

    def __init__(self, process):
        self._process = process
        self.baseline = process.memory_info().rss
        self.peak = self.baseline

    def checkpoint(self, progress):
        """Publish *progress*, then sample RSS and update the peak."""
        update_progress(progress)
        self.peak = max(self.peak, self._process.memory_info().rss)


def _to_mib(num_bytes):
    """Convert a byte count to mebibytes rounded to two decimals."""
    return round(num_bytes / (1024 ** 2), 2)


def _build_autoencoder(input_dim, code_dim=_CODE_DIM, learning_rate=0.001):
    """Build and compile the dense autoencoder for *input_dim* features."""
    from tensorflow.keras import Model
    from tensorflow.keras.layers import Dense, Dropout, Input
    from tensorflow.keras.optimizers import Adam

    inp = Input(shape=(input_dim,))
    x = Dense(128, activation="relu")(inp)
    x = Dropout(0.1)(x)
    x = Dense(64, activation="relu")(x)
    x = Dense(16, activation="relu")(x)
    code = Dense(code_dim, activation="relu")(x)
    x = Dense(16, activation="relu")(code)
    x = Dense(64, activation="relu")(x)
    x = Dense(128, activation="relu")(x)
    out = Dense(input_dim, activation="linear")(x)

    autoencoder = Model(inp, out)
    autoencoder.compile(loss="mae", optimizer=Adam(learning_rate=learning_rate))
    return autoencoder


def _print_metrics_table(precision, recall, f1, accuracy):
    """Print per-class precision/recall/F1 and overall accuracy as a grid."""
    table_data = [
        ["Normal (0)", f"{precision[0]:.4f}", f"{recall[0]:.4f}", f"{f1[0]:.4f}"],
        ["Attack (1)", f"{precision[1]:.4f}", f"{recall[1]:.4f}", f"{f1[1]:.4f}"],
        ["Overall Accuracy", "-", "-", f"{accuracy:.4f}"],
    ]
    print(
        tabulate(
            table_data,
            headers=["Class", "Precision", "Recall", "F1-Score"],
            tablefmt="fancy_grid",
        )
    )


def _select_top_anomalies(df, recons_df, top_k=_TOP_K):
    """Return the *top_k* highest-error rows as a list of record dicts.

    Rows are taken from the correctly flagged attacks when there are at
    least *top_k* of them, otherwise from all flagged rows, otherwise from
    every row. ``recons_df`` must have one row per row of ``df``, in the
    same order and with a default integer index, so that its index labels
    are row positions in ``df``.
    """
    flagged = recons_df[recons_df["y_pred"] == 1]
    true_positives = flagged[flagged["y_true"] == 1]

    # True positives are a subset of the flagged rows, so widening the pool
    # never needs a merge. Selecting by row (rather than concatenating and
    # calling drop_duplicates) keeps distinct rows that share the same
    # error and labels.
    if len(true_positives) >= top_k:
        pool = true_positives
    elif len(flagged) >= top_k:
        pool = flagged
    else:
        pool = recons_df

    top = pool.sort_values("error", ascending=False).head(top_k)
    rows = df.iloc[top.index.to_numpy()].copy()
    rows["reconstruction_error"] = top["error"].to_numpy()

    cols_exist = [c for c in _IMPORTANT_COLS if c in rows.columns]
    return rows[cols_exist + ["reconstruction_error"]].to_dict(orient="records")


def _run_pipeline(csv_path, table, checkpoint):
    """Load the data, train, score and return the metric results dict.

    All large arrays and the model are locals of this function, so they
    become unreachable as soon as it returns.

    Args:
        csv_path: CSV file with an ``Attack Type`` column; every other
            column is used as a numeric feature.
        table: When true, print the metrics table.
        checkpoint: Callable taking a progress value; invoked at the fixed
            stages 1, 10, 40, 60, 80 and 90.

    Raises:
        ValueError: If no row is labelled as normal, since the autoencoder
            is trained on normal rows only.
    """
    from tensorflow.keras.callbacks import EarlyStopping

    checkpoint(1)
    df = pd.read_csv(csv_path)
    checkpoint(10)

    # A row is normal (0) when its label contains "normal", else attack (1).
    df["fraud"] = df["Attack Type"].apply(
        lambda x: 0 if "normal" in str(x).lower() else 1
    )
    y = df["fraud"].to_numpy(dtype=np.int8, copy=True)
    normal_count = int((y == 0).sum())
    attack_count = int((y == 1).sum())
    if normal_count == 0:
        raise ValueError(
            "No rows labelled as normal were found; "
            "the autoencoder needs normal rows to train on."
        )

    features = [c for c in df.columns if c not in ("Attack Type", "fraud")]
    X_raw = df[features].to_numpy(dtype=np.float32, copy=True)
    X_scaled = StandardScaler().fit_transform(X_raw).astype(np.float32, copy=False)
    del X_raw  # the unscaled copy is not needed again

    X_train_normal = X_scaled[y == 0]

    autoencoder = _build_autoencoder(X_scaled.shape[1])
    checkpoint(40)

    earlystopping = EarlyStopping(
        monitor="val_loss", patience=5, restore_best_weights=True
    )
    autoencoder.fit(
        X_train_normal,
        X_train_normal,
        epochs=20,
        batch_size=64,
        validation_split=0.1,
        callbacks=[earlystopping],
        shuffle=True,
        verbose=1,
    )
    checkpoint(60)

    # Score every row (normal and attack) by mean absolute reconstruction
    # error and flag those above the percentile threshold.
    reconstructions = autoencoder.predict(X_scaled, verbose=0)
    reconstruction_error = np.mean(np.abs(reconstructions - X_scaled), axis=1)
    recons_df = pd.DataFrame(
        {"error": reconstruction_error, "y_true": y}
    ).reset_index(drop=True)
    threshold = np.percentile(recons_df["error"], _THRESHOLD_PERCENTILE)
    recons_df["y_pred"] = (recons_df["error"] > threshold).astype(int)
    checkpoint(80)

    accuracy = accuracy_score(recons_df["y_true"], recons_df["y_pred"])
    precision, recall, f1, _ = precision_recall_fscore_support(
        recons_df["y_true"],
        recons_df["y_pred"],
        average=None,
        labels=[0, 1],
    )
    checkpoint(90)

    if table:
        _print_metrics_table(precision, recall, f1, accuracy)

    results = {
        "normal_count": float(normal_count),
        "attack_count": float(attack_count),
        "accuracy": float(accuracy),
        "precision_normal": float(precision[0]),
        "recall_normal": float(recall[0]),
        "f1_normal": float(f1[0]),
        "precision_attack": float(precision[1]),
        "recall_attack": float(recall[1]),
        "f1_attack": float(f1[1]),
    }
    results["top_anomalies"] = _select_top_anomalies(df, recons_df)
    return results


def run_autoencoder(csv_path=DATASET_PATH, plot=False, table=False):
    """Train the autoencoder on *csv_path* and return evaluation results.

    Args:
        csv_path: Path of the CSV dataset to load.
        plot: Accepted for interface compatibility; not used by this
            function.
        table: When true, print a precision/recall/F1 table to stdout.

    Returns:
        A dict with the class counts, accuracy, per-class precision, recall
        and F1, ``top_anomalies`` (a list of record dicts for the
        highest-error rows) and ``ram_before`` / ``ram_peak`` /
        ``ram_after`` / ``ram_increase`` in MiB.

    Raises:
        ValueError: If the dataset contains no rows labelled as normal.
    """
    # TensorFlow is imported here rather than at module import time, and
    # before the baseline RSS sample is taken.
    from tensorflow.keras import backend as K

    tracker = _RamTracker(psutil.Process())
    try:
        results = _run_pipeline(csv_path, table, tracker.checkpoint)
    finally:
        # Always reset Keras state, including when training fails. The call
        # takes no arguments because the ``free_memory`` keyword is not
        # accepted by every Keras version.
        K.clear_session()
        _force_memory_cleanup()

    ram_after = psutil.Process().memory_info().rss
    results["ram_before"] = _to_mib(tracker.baseline)
    results["ram_peak"] = _to_mib(tracker.peak)
    results["ram_after"] = _to_mib(ram_after)
    results["ram_increase"] = _to_mib(tracker.peak - tracker.baseline)

    update_progress(100)
    return results


if __name__ == "__main__":
    res = run_autoencoder(plot=True, table=True)
    print(res)