212 lines
6.4 KiB
Python
212 lines
6.4 KiB
Python
import pandas as pd
|
|
import numpy as np
|
|
import json
|
|
import psutil
|
|
from sklearn.ensemble import IsolationForest
|
|
from sklearn.preprocessing import RobustScaler
|
|
from sklearn.metrics import confusion_matrix, accuracy_score, precision_recall_fscore_support
|
|
from tabulate import tabulate
|
|
import matplotlib.pyplot as plt
|
|
import seaborn as sns
|
|
|
|
import gc
|
|
|
|
import os
|
|
# Absolute location of this script; every path below is anchored to its
# directory so the module behaves the same from any working directory.
_THIS_FILE = os.path.abspath(__file__)
BASE_DIR = os.path.dirname(_THIS_FILE)

# JSON file that update_progress() overwrites with the latest progress value.
PROGRESS_PATH = os.path.join(BASE_DIR, "progress.json")

# Default CSV loaded by run_isolation_forest().
DATASET_PATH = os.path.join(
    BASE_DIR,
    "dataset",
    "cicids2017_cleaned.csv",
)
|
|
|
|
def _force_memory_cleanup():
    """Run a full garbage collection, then try to trim the C heap.

    After ``gc.collect()`` the function makes a best-effort call to
    ``malloc_trim(0)`` in ``libc.so.6`` so that memory freed by Python can be
    handed back to the operating system. Where that library or symbol is not
    available the trim step is skipped silently; the function never raises
    because of it.
    """
    gc.collect()
    try:
        import ctypes

        ctypes.CDLL("libc.so.6").malloc_trim(0)
    except (ImportError, OSError, AttributeError):
        # Best effort only: ctypes missing (ImportError), libc.so.6 not
        # loadable on this platform (OSError), or the loaded libc exposes no
        # malloc_trim symbol (AttributeError). Anything else is a real bug
        # and is allowed to propagate.
        pass
|
|
|
|
|
|
def update_progress(value):
    """Persist the current progress to ``PROGRESS_PATH`` as ``{"progress": value}``.

    The JSON document is first written to a sibling ``.tmp`` file and then
    moved over ``PROGRESS_PATH`` with ``os.replace``. Opening the target
    directly in ``"w"`` mode truncates it before the new content is written,
    so anything reading the file at that moment could see an empty or partial
    document; the replace step avoids that window.

    Args:
        value: JSON-serializable progress indicator (the callers in this file
            pass integer percentages).

    Raises:
        TypeError: If ``value`` is not JSON-serializable. The previous
            progress file is left untouched in that case.
        OSError: If the temporary file cannot be written or moved into place.
    """
    tmp_path = f"{PROGRESS_PATH}.tmp"
    with open(tmp_path, "w", encoding="utf-8") as f:
        json.dump({"progress": value}, f)
    os.replace(tmp_path, PROGRESS_PATH)
|
|
|
|
|
|
# IsolationForest requires a float contamination in (0, 0.5]; this floor keeps
# the derived value valid when the dataset contains no attack rows at all.
_MIN_CONTAMINATION = 1e-4


def _sample_peak_ram(process, peak):
    """Return the larger of *peak* and the process's current RSS (bytes)."""
    return max(peak, process.memory_info().rss)


def run_isolation_forest(csv_path=DATASET_PATH, plot=False, table=False):
    """Train and evaluate an Isolation Forest on a labelled traffic CSV.

    The CSV must contain an ``"Attack Type"`` column. Rows whose attack type
    contains ``"normal"`` (case-insensitive) are labelled 0, everything else
    1. All numeric columns with a standard deviation above 0.01 are
    robust-scaled and fed to the model; the derived label column is excluded
    from the features so the ground truth cannot leak into the model.

    Progress checkpoints (1, 10, 20, 30, 40, 70, 85, 95) are written through
    ``update_progress``; this function itself never writes 100.

    Args:
        csv_path: Path of the CSV file to load.
        plot: If true, show the anomaly-score scatter plot and the
            confusion-matrix heatmap (``plt.show()`` is called for each).
        table: If true, print a per-class precision/recall/F1 table.

    Returns:
        dict with the keys ``normal_count``, ``attack_count``,
        ``contamination``, ``accuracy``, ``precision_normal``,
        ``recall_normal``, ``f1_normal``, ``precision_attack``,
        ``recall_attack``, ``f1_attack`` (all floats), ``top_anomalies``
        (up to five record dicts with the lowest anomaly scores) and
        ``ram_before``, ``ram_peak``, ``ram_after``, ``ram_increase``
        (process RSS in MiB, rounded to two decimals).
    """
    process = psutil.Process()
    ram_before = process.memory_info().rss
    ram_peak = ram_before

    update_progress(1)
    ram_peak = _sample_peak_ram(process, ram_peak)

    df = pd.read_csv(csv_path)
    update_progress(10)
    ram_peak = _sample_peak_ram(process, ram_peak)

    # Binary ground truth: 0 = normal traffic, 1 = any kind of attack.
    df["fraud"] = df["Attack Type"].apply(
        lambda x: 0 if "normal" in str(x).lower() else 1
    )
    true_labels = df["fraud"]
    update_progress(20)
    ram_peak = _sample_peak_ram(process, ram_peak)

    normal_count = int((true_labels == 0).sum())
    attack_count = int((true_labels == 1).sum())

    # Contamination is 2.5x the observed attack share, capped at 0.49 and
    # floored so that an attack-free dataset does not yield the invalid 0.0.
    attack_fraction = true_labels.mean()
    contamination = min(max(attack_fraction * 2.5, _MIN_CONTAMINATION), 0.49)

    # Feature matrix: numeric, non-constant columns only. "fraud" is numeric
    # too, so it must be masked out explicitly or the label becomes a feature.
    numeric = df.select_dtypes(include=[np.number])
    keep = (numeric.std() > 0.01) & (numeric.columns != "fraud")
    X = numeric.loc[:, keep].to_numpy(dtype=np.float32, copy=True)
    del numeric, keep
    update_progress(30)
    ram_peak = _sample_peak_ram(process, ram_peak)

    scaler = RobustScaler()
    X_scaled = scaler.fit_transform(X).astype(np.float32, copy=False)
    update_progress(40)
    ram_peak = _sample_peak_ram(process, ram_peak)

    model = IsolationForest(
        n_estimators=600,
        max_samples=0.3,
        contamination=contamination,
        max_features=0.7,
        bootstrap=False,
        random_state=42,
        n_jobs=1,
    )
    model.fit(X_scaled)
    update_progress(70)
    ram_peak = _sample_peak_ram(process, ram_peak)

    # Score once: predict() is defined as "decision_function < 0 -> outlier",
    # so deriving the labels from the scores avoids a second pass over all
    # trees while producing the same labels.
    scores = model.decision_function(X_scaled)
    df["pred_label"] = np.where(scores < 0, 1, 0)
    df["anomaly_score"] = scores
    update_progress(85)
    ram_peak = _sample_peak_ram(process, ram_peak)

    # labels=[0, 1] keeps the matrix 2x2 even if only one class occurs, which
    # matches the per-class metrics below and the heatmap tick labels.
    cm = confusion_matrix(true_labels, df["pred_label"], labels=[0, 1])
    accuracy = accuracy_score(true_labels, df["pred_label"])
    precision, recall, f1, _ = precision_recall_fscore_support(
        true_labels, df["pred_label"], average=None, labels=[0, 1]
    )
    update_progress(95)
    ram_peak = _sample_peak_ram(process, ram_peak)

    if table:
        table_data = [
            ["Normal (0)", f"{precision[0]:.4f}", f"{recall[0]:.4f}", f"{f1[0]:.4f}"],
            ["Attack (1)", f"{precision[1]:.4f}", f"{recall[1]:.4f}", f"{f1[1]:.4f}"],
            ["Overall Accuracy", "-", "-", f"{accuracy:.4f}"],
        ]
        print(
            tabulate(
                table_data,
                headers=["Class", "Precision", "Recall", "F1-Score"],
                tablefmt="fancy_grid",
            )
        )

    if plot:
        plt.figure(figsize=(10, 5))
        scatter = plt.scatter(
            range(len(df)),
            df["anomaly_score"],
            c=df["pred_label"],
            cmap="coolwarm",
            s=10,
        )
        plt.xlabel("Instance")
        plt.ylabel("Anomaly Score")
        plt.title("Anomaly Score Distribution (Isolation Forest)")
        handles, _ = scatter.legend_elements()
        plt.legend(handles, ["Normal", "Anomaly"], title="Predicted")
        plt.show()

        plt.figure(figsize=(5, 4))
        sns.heatmap(
            cm,
            annot=True,
            fmt="d",
            cmap="Blues",
            xticklabels=["Normal", "Attack"],
            yticklabels=["Normal", "Attack"],
        )
        plt.xlabel("Predicted Label")
        plt.ylabel("True Label")
        plt.title("Confusion Matrix (Isolation Forest)")
        plt.show()

    results = {
        "normal_count": float(normal_count),
        "attack_count": float(attack_count),
        "contamination": float(contamination),
        "accuracy": float(accuracy),
        "precision_normal": float(precision[0]),
        "recall_normal": float(recall[0]),
        "f1_normal": float(f1[0]),
        "precision_attack": float(precision[1]),
        "recall_attack": float(recall[1]),
        "f1_attack": float(f1[1]),
    }

    # Pick the rows to showcase: prefer correctly flagged attacks, then any
    # flagged row, then (as a last resort) the whole dataset.
    candidates = df[(df["pred_label"] == 1) & (df["fraud"] == 1)].copy()

    if len(candidates) < 5:
        extra = df[df["pred_label"] == 1].copy()
        candidates = pd.concat([candidates, extra]).drop_duplicates()

    if len(candidates) < 5:
        candidates = df.copy()

    # Ascending sort: the lowest decision-function scores come first.
    candidates = candidates.sort_values("anomaly_score").head(5)

    important_cols = [
        "Attack Type",
        "Destination Port",
        "Flow Duration",
        "Total Fwd Packets",
        "Flow Packets/s",
        "Packet Length Mean",
    ]

    # Only report the descriptive columns that this particular CSV provides.
    cols_exist = [c for c in important_cols if c in candidates.columns]

    top_anomalies = candidates[cols_exist + ["anomaly_score"]].rename(
        columns={"anomaly_score": "score"}
    ).to_dict(orient="records")

    results["top_anomalies"] = top_anomalies

    ram_peak = _sample_peak_ram(process, ram_peak)

    # Drop every large object before measuring the post-run footprint.
    del X, X_scaled, scores, model, scaler, candidates, df, cm, true_labels
    _force_memory_cleanup()

    ram_after = process.memory_info().rss

    # Report RSS figures in MiB.
    results["ram_before"] = round(ram_before / (1024 ** 2), 2)
    results["ram_peak"] = round(ram_peak / (1024 ** 2), 2)
    results["ram_after"] = round(ram_after / (1024 ** 2), 2)
    results["ram_increase"] = round((ram_peak - ram_before) / (1024 ** 2), 2)

    return results
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: run the full pipeline with the metrics table and
    # both plots enabled, then dump the raw results dictionary.
    res = run_isolation_forest(table=True, plot=True)
    print(res)
|