"""Isolation Forest anomaly detection over a labelled network-flow CSV.

Running the module as a script trains the model on ``DATASET_PATH``, prints
a metrics table, shows the plots and prints the result dictionary.
"""

import gc
import json
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import psutil
import seaborn as sns
from sklearn.ensemble import IsolationForest
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    precision_recall_fscore_support,
)
from sklearn.preprocessing import RobustScaler
from tabulate import tabulate

BASE_DIR = os.path.dirname(os.path.abspath(__file__))
PROGRESS_PATH = os.path.join(BASE_DIR, "progress.json")
DATASET_PATH = os.path.join(BASE_DIR, "dataset", "cicids2017_cleaned.csv")


def _force_memory_cleanup() -> None:
    """Run the garbage collector and, if possible, call glibc ``malloc_trim(0)``.

    The ``malloc_trim`` step is best effort: it is skipped silently when
    ``libc.so.6`` cannot be loaded or does not expose the symbol.
    """
    gc.collect()
    try:
        import ctypes

        ctypes.CDLL("libc.so.6").malloc_trim(0)
    except (ImportError, OSError, AttributeError):
        # No ctypes, no glibc (e.g. Windows/macOS) or no malloc_trim symbol:
        # nothing more to release, and this must never break the caller.
        pass


def update_progress(value) -> None:
    """Overwrite ``PROGRESS_PATH`` with the JSON object ``{"progress": value}``."""
    with open(PROGRESS_PATH, "w", encoding="utf-8") as f:
        json.dump({"progress": value}, f)


def _print_metrics_table(precision, recall, f1, accuracy) -> None:
    """Print per-class precision/recall/F1 and the overall accuracy."""
    rows = [
        ["Normal (0)", f"{precision[0]:.4f}", f"{recall[0]:.4f}", f"{f1[0]:.4f}"],
        ["Attack (1)", f"{precision[1]:.4f}", f"{recall[1]:.4f}", f"{f1[1]:.4f}"],
        ["Overall Accuracy", "-", "-", f"{accuracy:.4f}"],
    ]
    print(
        tabulate(
            rows,
            headers=["Class", "Precision", "Recall", "F1-Score"],
            tablefmt="fancy_grid",
        )
    )


def _plot_results(df, cm) -> None:
    """Show the anomaly-score scatter plot and the confusion-matrix heatmap.

    Reads the ``anomaly_score`` and ``pred_label`` columns of *df*.
    """
    plt.figure(figsize=(10, 5))
    scatter = plt.scatter(
        range(len(df)),
        df["anomaly_score"],
        c=df["pred_label"],
        cmap="coolwarm",
        s=10,
    )
    plt.xlabel("Instance")
    plt.ylabel("Anomaly Score")
    plt.title("Anomaly Score Distribution (Isolation Forest)")
    handles, _ = scatter.legend_elements()
    plt.legend(handles, ["Normal", "Anomaly"], title="Predicted")
    plt.show()

    plt.figure(figsize=(5, 4))
    sns.heatmap(
        cm,
        annot=True,
        fmt="d",
        cmap="Blues",
        xticklabels=["Normal", "Attack"],
        yticklabels=["Normal", "Attack"],
    )
    plt.xlabel("Predicted Label")
    plt.ylabel("True Label")
    plt.title("Confusion Matrix (Isolation Forest)")
    plt.show()


def _select_top_anomalies(df, limit: int = 5) -> list:
    """Return up to *limit* rows with the lowest ``anomaly_score`` as records.

    Candidates are the rows that are both predicted and labelled as attacks.
    If fewer than *limit* exist, every predicted anomaly is used instead, and
    if that is still too few, the whole frame is used.
    """
    flagged = df["pred_label"] == 1
    # Boolean indexing already yields a new frame, so no explicit copy is needed.
    candidates = df[flagged & (df["fraud"] == 1)]
    if len(candidates) < limit:
        candidates = pd.concat([candidates, df[flagged]]).drop_duplicates()
    if len(candidates) < limit:
        candidates = df
    # sort_values returns a new frame, so ``df`` itself is never reordered.
    candidates = candidates.sort_values("anomaly_score").head(limit)

    important_cols = [
        "Attack Type",
        "Destination Port",
        "Flow Duration",
        "Total Fwd Packets",
        "Flow Packets/s",
        "Packet Length Mean",
    ]
    # Only report the descriptive columns that this dataset actually has.
    cols_exist = [c for c in important_cols if c in candidates.columns]
    return (
        candidates[cols_exist + ["anomaly_score"]]
        .rename(columns={"anomaly_score": "score"})
        .to_dict(orient="records")
    )


def run_isolation_forest(
    csv_path: str = DATASET_PATH, plot: bool = False, table: bool = False
) -> dict:
    """Train and evaluate an Isolation Forest on the CSV at *csv_path*.

    A row counts as normal (label 0) when its ``Attack Type`` value contains
    "normal" (case-insensitive) and as an attack (label 1) otherwise. The
    model is fitted on the robust-scaled numeric columns whose standard
    deviation exceeds 0.01, and then scores the same rows.

    Progress values 1, 10, 20, 30, 40, 70, 85 and 95 are written through
    ``update_progress`` as the stages complete, and the process RSS is sampled
    at each stage to track the peak.

    Args:
        csv_path: CSV file to load; it must contain an ``Attack Type`` column.
        plot: When true, show the score scatter plot and the confusion matrix.
        table: When true, print the metrics table.

    Returns:
        A dict with the class counts, the contamination used, accuracy,
        per-class precision/recall/F1, ``top_anomalies`` (a list of row
        records) and ``ram_before``/``ram_peak``/``ram_after``/``ram_increase``
        in MiB.

    Raises:
        ValueError: If the dataset has no rows, or has no attack rows so that
            no valid contamination can be derived from it.
    """
    process = psutil.Process()
    ram_before = process.memory_info().rss
    ram_peak = ram_before

    def checkpoint(progress=None):
        """Publish *progress* (if given), then fold the current RSS into the peak."""
        nonlocal ram_peak
        if progress is not None:
            update_progress(progress)
        ram_peak = max(ram_peak, process.memory_info().rss)

    checkpoint(1)

    df = pd.read_csv(csv_path)
    checkpoint(10)

    if df.empty:
        raise ValueError(f"Dataset {csv_path!r} contains no rows")

    # Vectorised form of: 0 if "normal" in str(x).lower() else 1
    is_normal = (
        df["Attack Type"]
        .astype(str)
        .str.lower()
        .str.contains("normal", regex=False, na=False)
    )
    df["fraud"] = (~is_normal).astype(np.int64)
    del is_normal
    true_labels = df["fraud"]
    checkpoint(20)

    normal_count = int((true_labels == 0).sum())
    attack_count = int((true_labels == 1).sum())
    if attack_count == 0:
        # A contamination of 0 is outside the range IsolationForest accepts;
        # fail here with an actionable message instead.
        raise ValueError(
            "Dataset contains no attack rows; cannot derive a contamination value"
        )

    # Over-estimate the attack share by 2.5x, capped just under the 0.5 limit.
    attack_fraction = true_labels.mean()
    contamination = float(min(attack_fraction * 2.5, 0.49))

    # The label column is numeric, so it must be excluded explicitly;
    # otherwise the ground truth leaks into the feature matrix.
    X = df.select_dtypes(include=[np.number]).drop(columns=["fraud"])
    X = X.loc[:, X.std() > 0.01]
    X = X.to_numpy(dtype=np.float32, copy=True)
    checkpoint(30)

    scaler = RobustScaler()
    X_scaled = scaler.fit_transform(X).astype(np.float32, copy=False)
    checkpoint(40)

    model = IsolationForest(
        n_estimators=600,
        max_samples=0.3,
        contamination=contamination,
        max_features=0.7,
        bootstrap=False,
        random_state=42,
        n_jobs=1,
    )
    model.fit(X_scaled)
    checkpoint(70)

    preds = model.predict(X_scaled)
    # predict() yields 1 for inliers; map that to label 0 and the rest to 1.
    df["pred_label"] = np.where(preds == 1, 0, 1)
    df["anomaly_score"] = model.decision_function(X_scaled)
    checkpoint(85)

    # Fixing the label set keeps the matrix 2x2 even if a class never occurs.
    cm = confusion_matrix(true_labels, df["pred_label"], labels=[0, 1])
    accuracy = accuracy_score(true_labels, df["pred_label"])
    precision, recall, f1, _ = precision_recall_fscore_support(
        true_labels, df["pred_label"], average=None, labels=[0, 1]
    )
    checkpoint(95)

    if table:
        _print_metrics_table(precision, recall, f1, accuracy)

    if plot:
        _plot_results(df, cm)

    results = {
        "normal_count": float(normal_count),
        "attack_count": float(attack_count),
        "contamination": float(contamination),
        "accuracy": float(accuracy),
        "precision_normal": float(precision[0]),
        "recall_normal": float(recall[0]),
        "f1_normal": float(f1[0]),
        "precision_attack": float(precision[1]),
        "recall_attack": float(recall[1]),
        "f1_attack": float(f1[1]),
    }
    results["top_anomalies"] = _select_top_anomalies(df)

    checkpoint()

    # Drop the large objects before sampling the "after" figure.
    del X, X_scaled, preds, model, scaler, df, cm, true_labels
    _force_memory_cleanup()

    ram_after = process.memory_info().rss

    mib = 1024 ** 2
    results["ram_before"] = round(ram_before / mib, 2)
    results["ram_peak"] = round(ram_peak / mib, 2)
    results["ram_after"] = round(ram_after / mib, 2)
    results["ram_increase"] = round((ram_peak - ram_before) / mib, 2)

    return results


if __name__ == "__main__":
    res = run_isolation_forest(plot=True, table=True)
    print(res)