# BC_praca/Backend/isolation_forest_cicids.py
#
# 212 lines
# 6.4 KiB
# Python

import pandas as pd
import numpy as np
import json
import psutil
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import confusion_matrix, accuracy_score, precision_recall_fscore_support
from tabulate import tabulate
import matplotlib.pyplot as plt
import seaborn as sns
import gc
import os
# Absolute path of the directory that contains this module; the paths below
# are resolved against it rather than against the current working directory.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))

# Default CSV input, located in the "dataset" folder next to this module.
DATASET_PATH = os.path.join(BASE_DIR, "dataset", "cicids2017_cleaned.csv")

# JSON file, next to this module, that receives the progress value.
PROGRESS_PATH = os.path.join(BASE_DIR, "progress.json")
def _force_memory_cleanup():
    """Run a garbage-collection pass, then try to trim the C heap.

    After ``gc.collect()`` the function loads ``libc.so.6`` through
    ``ctypes`` and calls ``malloc_trim(0)``.  That second step is
    best-effort: when the library or the symbol is not available the
    failure is ignored and only the garbage collection has taken place.
    """
    gc.collect()
    try:
        import ctypes

        ctypes.CDLL("libc.so.6").malloc_trim(0)
    except (ImportError, OSError, AttributeError):
        # ImportError: ctypes is not available in this interpreter.
        # OSError: "libc.so.6" could not be loaded on this platform.
        # AttributeError: the loaded library exposes no malloc_trim symbol.
        # Trimming is optional, so none of these should abort the caller.
        pass
def update_progress(value):
    """Overwrite the progress file with ``{"progress": value}`` as JSON.

    Args:
        value: JSON-serialisable progress value to store.
    """
    payload = {"progress": value}
    with open(PROGRESS_PATH, "w") as progress_file:
        json.dump(payload, progress_file)
def _label_attacks(attack_types):
    """Return an int64 Series: 0 where the label text contains "normal", else 1."""
    is_normal = (
        attack_types.astype(str).str.lower().str.contains("normal", regex=False)
    )
    return (~is_normal).astype("int64")


def _numeric_features(df, exclude=()):
    """Return the non-constant numeric columns of *df* as a float32 matrix.

    Columns named in *exclude* are dropped first so that derived label
    columns never reach the model as features.
    """
    numeric = df.select_dtypes(include=[np.number]).drop(
        columns=list(exclude), errors="ignore"
    )
    # Near-constant columns carry no signal for the isolation trees.
    numeric = numeric.loc[:, numeric.std() > 0.01]
    return numeric.to_numpy(dtype=np.float32, copy=True)


def _plot_results(df, cm):
    """Show the anomaly-score scatter plot and the confusion-matrix heatmap."""
    class_names = ("Normal", "Anomaly")

    score_fig = plt.figure(figsize=(10, 5))
    scatter = plt.scatter(
        range(len(df)),
        df["anomaly_score"],
        c=df["pred_label"],
        cmap="coolwarm",
        s=10,
    )
    plt.xlabel("Instance")
    plt.ylabel("Anomaly Score")
    plt.title("Anomaly Score Distribution (Isolation Forest)")
    handles, _ = scatter.legend_elements()
    # One legend handle is produced per distinct predicted value, so name
    # only the classes that are actually present instead of assuming both.
    present = [class_names[int(v)] for v in np.unique(df["pred_label"])]
    plt.legend(handles, present, title="Predicted")
    plt.show()
    plt.close(score_fig)  # release the figure once it has been shown

    cm_fig = plt.figure(figsize=(5, 4))
    sns.heatmap(
        cm,
        annot=True,
        fmt="d",
        cmap="Blues",
        xticklabels=["Normal", "Attack"],
        yticklabels=["Normal", "Attack"],
    )
    plt.xlabel("Predicted Label")
    plt.ylabel("True Label")
    plt.title("Confusion Matrix (Isolation Forest)")
    plt.show()
    plt.close(cm_fig)


def _top_anomalies(df, limit=5):
    """Return up to *limit* lowest-scoring rows as a list of record dicts.

    Rows that are both predicted and labelled as attacks are preferred; if
    there are fewer than *limit*, all predicted anomalies are considered,
    and as a last resort every row.
    """
    report_columns = (
        "Attack Type",
        "Destination Port",
        "Flow Duration",
        "Total Fwd Packets",
        "Flow Packets/s",
        "Packet Length Mean",
    )
    # Boolean-mask selection already yields new frames and sort_values()
    # returns a new frame too, so no explicit copies are required.
    candidates = df[(df["pred_label"] == 1) & (df["fraud"] == 1)]
    if len(candidates) < limit:
        flagged = df[df["pred_label"] == 1]
        candidates = pd.concat([candidates, flagged]).drop_duplicates()
    if len(candidates) < limit:
        candidates = df
    candidates = candidates.sort_values("anomaly_score").head(limit)

    present = [c for c in report_columns if c in candidates.columns]
    return (
        candidates[present + ["anomaly_score"]]
        .rename(columns={"anomaly_score": "score"})
        .to_dict(orient="records")
    )


def run_isolation_forest(csv_path=DATASET_PATH, plot=False, table=False):
    """Fit an Isolation Forest on a labelled CSV and report detection metrics.

    The CSV must contain an ``"Attack Type"`` column; rows whose label text
    contains "normal" (case-insensitive) are class 0, all others class 1.
    The numeric, non-constant columns are robust-scaled and used as
    features.  The contamination parameter is derived from the labels as
    2.5 times the attack fraction, capped at 0.49.  Progress values
    (1, 10, 20, 30, 40, 70, 85, 95) are reported through
    ``update_progress`` and the process RSS is sampled along the way.

    Args:
        csv_path: Path of the CSV file to load.
        plot: When true, show the score scatter plot and the confusion
            matrix heatmap.
        table: When true, print a per-class precision/recall/F1 table.

    Returns:
        dict with ``normal_count``, ``attack_count``, ``contamination``,
        ``accuracy``, per-class ``precision_*`` / ``recall_*`` / ``f1_*``
        (all floats), ``top_anomalies`` (list of up to five record dicts)
        and ``ram_before`` / ``ram_peak`` / ``ram_after`` / ``ram_increase``
        in MiB.

    Raises:
        ValueError: If the data contains no attack rows (or no rows at
            all), because no valid contamination can be derived.
        KeyError: If the ``"Attack Type"`` column is missing.
    """
    process = psutil.Process()
    ram_before = process.memory_info().rss
    ram_peak = ram_before

    def checkpoint(progress=None):
        """Optionally publish *progress*, then fold the current RSS into the peak."""
        nonlocal ram_peak
        if progress is not None:
            update_progress(progress)
        ram_peak = max(ram_peak, process.memory_info().rss)

    checkpoint(1)
    df = pd.read_csv(csv_path)
    checkpoint(10)

    df["fraud"] = _label_attacks(df["Attack Type"])
    true_labels = df["fraud"]
    checkpoint(20)

    normal_count = int((true_labels == 0).sum())
    attack_count = int((true_labels == 1).sum())
    attack_fraction = true_labels.mean()
    # `not x > 0` also catches NaN, which is what mean() yields for no rows.
    if not attack_fraction > 0:
        raise ValueError(
            "Dataset has no rows labelled as attacks, so no valid "
            "contamination value can be derived"
        )
    contamination = min(attack_fraction * 2.5, 0.49)

    # "fraud" is the ground-truth label: it must not be used as a feature,
    # otherwise the reported metrics are inflated by label leakage.
    X = _numeric_features(df, exclude=("fraud",))
    checkpoint(30)

    scaler = RobustScaler()
    X_scaled = scaler.fit_transform(X).astype(np.float32, copy=False)
    checkpoint(40)

    model = IsolationForest(
        n_estimators=600,
        max_samples=0.3,
        contamination=contamination,
        max_features=0.7,
        bootstrap=False,
        random_state=42,
        n_jobs=1,
    )
    model.fit(X_scaled)
    checkpoint(70)

    # Score the data once.  predict() labels a sample as an outlier exactly
    # when its decision_function value is negative, so the labels can be
    # derived from the scores without a second pass over all trees.
    scores = model.decision_function(X_scaled)
    df["pred_label"] = np.where(scores < 0, 1, 0)
    df["anomaly_score"] = scores
    checkpoint(85)

    cm = confusion_matrix(true_labels, df["pred_label"])
    accuracy = accuracy_score(true_labels, df["pred_label"])
    precision, recall, f1, _ = precision_recall_fscore_support(
        true_labels, df["pred_label"], average=None, labels=[0, 1]
    )
    checkpoint(95)

    if table:
        table_data = [
            ["Normal (0)", f"{precision[0]:.4f}", f"{recall[0]:.4f}", f"{f1[0]:.4f}"],
            ["Attack (1)", f"{precision[1]:.4f}", f"{recall[1]:.4f}", f"{f1[1]:.4f}"],
            ["Overall Accuracy", "-", "-", f"{accuracy:.4f}"],
        ]
        print(
            tabulate(
                table_data,
                headers=["Class", "Precision", "Recall", "F1-Score"],
                tablefmt="fancy_grid",
            )
        )

    if plot:
        _plot_results(df, cm)

    results = {
        "normal_count": float(normal_count),
        "attack_count": float(attack_count),
        "contamination": float(contamination),
        "accuracy": float(accuracy),
        "precision_normal": float(precision[0]),
        "recall_normal": float(recall[0]),
        "f1_normal": float(f1[0]),
        "precision_attack": float(precision[1]),
        "recall_attack": float(recall[1]),
        "f1_attack": float(f1[1]),
    }
    results["top_anomalies"] = _top_anomalies(df)
    checkpoint()

    # Drop the large objects before the final RSS sample so that
    # "ram_after" reflects the state once the run's data has been released.
    del X, X_scaled, scores, model, scaler, df, cm, true_labels
    _force_memory_cleanup()
    ram_after = process.memory_info().rss

    mib = 1024 ** 2
    results["ram_before"] = round(ram_before / mib, 2)
    results["ram_peak"] = round(ram_peak / mib, 2)
    results["ram_after"] = round(ram_after / mib, 2)
    results["ram_increase"] = round((ram_peak - ram_before) / mib, 2)
    return results
if __name__ == "__main__":
    # Script entry point: run the full pipeline with the metrics table and
    # the plots enabled, then print the returned results dictionary.
    print(run_isolation_forest(plot=True, table=True))