# isolation_forest_custom.py
"""Train and evaluate an Isolation Forest on a labelled CSV dataset.

The entry point is :func:`run_isolation_forest_custom`. It reads a JSON
configuration and a CSV file, fits an ``IsolationForest`` on the selected
feature columns, scores the same rows, and returns classification metrics,
the highest-scoring true attack rows and process RSS figures.
"""
import gc
import json
import os

import numpy as np
import pandas as pd
import psutil
from sklearn.ensemble import IsolationForest
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.preprocessing import RobustScaler

BASE_DIR = os.path.dirname(os.path.abspath(__file__))
TEMP_DIR = os.path.join(BASE_DIR, "temp")

# IsolationForest only accepts a contamination in (0, 0.5]. This value is used
# solely when the data-derived estimate is not positive (no attack rows).
_MIN_CONTAMINATION = 1e-4


def _force_memory_cleanup():
    """Run a garbage collection and ask glibc to release freed heap memory.

    The ``malloc_trim`` call is best-effort: when ``libc.so.6`` cannot be
    loaded or does not expose ``malloc_trim``, the step is skipped silently.
    """
    gc.collect()
    try:
        import ctypes

        ctypes.CDLL("libc.so.6").malloc_trim(0)
    except (ImportError, OSError, AttributeError):
        # ImportError: ctypes unavailable; OSError: library not loadable;
        # AttributeError: symbol missing. Trimming is optional, so skip it.
        pass


def update_progress(value, progress_path="progress.json"):
    """Overwrite *progress_path* with ``{"progress": value}`` as JSON.

    Args:
        value: Progress value to store (must be JSON-serialisable).
        progress_path: Target file. The default is a relative path, so it is
            resolved against the current working directory.
    """
    with open(progress_path, "w", encoding="utf-8") as f:
        json.dump({"progress": value}, f)


def _bytes_to_mib(n_bytes):
    """Convert a byte count to mebibytes, rounded to two decimals."""
    return round(n_bytes / (1024 ** 2), 2)


def _estimate_contamination(y):
    """Derive the IsolationForest contamination from the binary labels *y*.

    The estimate is 2.5 times the attack ratio, capped at 0.49. When that is
    not positive (no attack rows), ``_MIN_CONTAMINATION`` is returned instead
    because IsolationForest rejects a contamination of 0.
    """
    estimate = min(float(y.mean()) * 2.5, 0.49)
    if not estimate > 0:  # also catches NaN
        return _MIN_CONTAMINATION
    return estimate


def _top_attack_records(df, display_cols, y, anomaly_scores, limit=5):
    """Return the *limit* true-attack rows with the highest anomaly score.

    Each record holds the *display_cols* values plus a ``"score"`` entry;
    numeric values are rounded to four decimals. Only the selected rows are
    copied, never the whole frame.
    """
    attack_idx = np.flatnonzero(y == 1)
    if attack_idx.size == 0:
        return []

    # Rank the attack rows by descending score and keep the first `limit`.
    by_score = np.argsort(-anomaly_scores[attack_idx], kind="stable")
    ranked = attack_idx[by_score][:limit]

    top = df.iloc[ranked][display_cols].copy()
    top["score"] = anomaly_scores[ranked]
    return top.round(4).to_dict(orient="records")


def run_isolation_forest_custom(
    csv_path=None,
    config_path=None,
    progress_path=None
):
    """Fit an Isolation Forest on a CSV file and evaluate it against its labels.

    The configuration JSON must provide ``labeling.label_column``,
    ``labeling.normal_value`` and ``features.selected_columns``; optional
    hyper-parameters are read from ``algorithm.parameters``
    (``n_estimators``, ``max_samples``, ``max_features``, ``random_state``).
    Rows whose label equals ``normal_value`` are class 0 (normal), all other
    rows are class 1 (attack). The model is fitted and scored on the same
    rows. Progress values from 1 up to 85 are written to *progress_path*
    along the way; this function does not write a final 100.

    Args:
        csv_path: Input CSV. Defaults to ``<TEMP_DIR>/upload.csv``.
        config_path: Configuration JSON. Defaults to ``<TEMP_DIR>/config.json``.
        progress_path: Progress file. Defaults to ``<BASE_DIR>/progress.json``.

    Returns:
        dict with ``normal_count``, ``attack_count``, ``contamination``,
        ``accuracy``, per-class ``precision_*``/``recall_*``/``f1_*`` for
        ``normal`` and ``attack`` (all floats), ``top_anomalies`` (a list of
        up to five record dicts) and ``ram_before``, ``ram_peak``,
        ``ram_after``, ``ram_increase`` in MiB. ``ram_peak`` is the maximum
        RSS seen at the sampled checkpoints, not a continuous measurement.

    Raises:
        ValueError: If no feature columns are configured, the CSV contains
            no rows, or a feature column cannot be converted to float32.
        KeyError: If a required configuration key is missing.
    """
    if csv_path is None:
        csv_path = os.path.join(TEMP_DIR, "upload.csv")
    if config_path is None:
        config_path = os.path.join(TEMP_DIR, "config.json")
    if progress_path is None:
        progress_path = os.path.join(BASE_DIR, "progress.json")

    update_progress(1, progress_path)

    process = psutil.Process()
    ram_before = process.memory_info().rss
    ram_peak = ram_before
    update_progress(5, progress_path)

    with open(config_path, "r", encoding="utf-8") as f:
        config = json.load(f)

    label_col = config["labeling"]["label_column"]
    normal_value = config["labeling"]["normal_value"]
    features = config["features"]["selected_columns"]
    # Tolerate explicit nulls for the optional sections.
    params = (config.get("algorithm") or {}).get("parameters") or {}

    if not features:
        raise ValueError("No feature columns selected in the configuration.")

    # Order-preserving de-duplication (the label may also be a feature).
    display_cols = list(dict.fromkeys(features + [label_col]))
    df = pd.read_csv(csv_path, usecols=display_cols)
    if df.empty:
        raise ValueError(f"No data rows found in {csv_path!r}.")
    update_progress(15, progress_path)
    ram_peak = max(ram_peak, process.memory_info().rss)

    # Element-wise comparison; the labels are kept out of `df` so that no
    # helper column can collide with a user-selected column name.
    labels = df[label_col].apply(
        lambda value: 0 if value == normal_value else 1
    )
    y = labels.to_numpy(dtype=np.int8, copy=True)
    update_progress(25, progress_path)
    ram_peak = max(ram_peak, process.memory_info().rss)

    normal_count = int((y == 0).sum())
    attack_count = int((y == 1).sum())

    X = df[features].to_numpy(dtype=np.float32, copy=True)

    scaler = RobustScaler()
    X_scaled = scaler.fit_transform(X).astype(np.float32, copy=False)

    contamination = _estimate_contamination(y)
    update_progress(45, progress_path)
    ram_peak = max(ram_peak, process.memory_info().rss)

    model = IsolationForest(
        n_estimators=params.get("n_estimators", 600),
        max_samples=params.get("max_samples", 0.3),
        max_features=params.get("max_features", 0.7),
        contamination=contamination,
        random_state=params.get("random_state", 42),
        n_jobs=1
    )
    model.fit(X_scaled)
    update_progress(75, progress_path)
    ram_peak = max(ram_peak, process.memory_info().rss)

    # Score once: predict() flags exactly the rows whose decision_function
    # is negative, so both the predictions and the scores come from one pass.
    decision = model.decision_function(X_scaled)
    y_pred = np.where(decision < 0, 1, 0)
    anomaly_scores = -decision

    accuracy = accuracy_score(y, y_pred)
    # zero_division=0 yields the same values as the default but without
    # warnings when a class is absent from the labels or the predictions.
    precision, recall, f1, _ = precision_recall_fscore_support(
        y, y_pred, average=None, labels=[0, 1], zero_division=0
    )
    update_progress(85, progress_path)

    results = {
        "normal_count": float(normal_count),
        "attack_count": float(attack_count),
        "contamination": float(contamination),
        "accuracy": float(accuracy),
        "precision_normal": float(precision[0]),
        "recall_normal": float(recall[0]),
        "f1_normal": float(f1[0]),
        "precision_attack": float(precision[1]),
        "recall_attack": float(recall[1]),
        "f1_attack": float(f1[1]),
    }
    results["top_anomalies"] = _top_attack_records(
        df, display_cols, y, anomaly_scores
    )

    # Drop every large object before sampling the final RSS.
    del X, X_scaled, decision, y_pred, anomaly_scores
    del model, scaler, df, y, labels
    _force_memory_cleanup()

    ram_after = process.memory_info().rss
    results["ram_before"] = _bytes_to_mib(ram_before)
    results["ram_peak"] = _bytes_to_mib(ram_peak)
    results["ram_after"] = _bytes_to_mib(ram_after)
    results["ram_increase"] = _bytes_to_mib(ram_peak - ram_before)

    return results


if __name__ == "__main__":
    res = run_isolation_forest_custom()
    print(json.dumps(res, indent=2))