# KNN 1
#uvoz biblioteka
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb
from sklearn.datasets import load_breast_cancer #dataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import (
accuracy_score, precision_score, recall_score, f1_score,
confusion_matrix, ConfusionMatrixDisplay, classification_report,
roc_auc_score, RocCurveDisplay
)
from sklearn.pipeline import Pipeline
## DATA PREPARATION + basic overview
## ----------------
# Load the breast-cancer dataset as pandas objects (X: DataFrame, y: Series).
data = load_breast_cancer(as_frame=True)
X, y = data.data, data.target

# Quick sanity check: dimensions, class balance, and a peek at the features.
print("X shape:", X.shape)
print("y shape:", y.shape)
print("number of samples:", y.value_counts())
print(X.head())
## TRAIN-TEST-VAL SPLIT
## ----------------
# Step 1: carve the held-out test set (10%) off the full data.
X_trainval, X_test, y_trainval, y_test = train_test_split(
    X, y, test_size=0.10, random_state=42, stratify=y
)
# Step 2: carve the validation set (10% of the remainder) off train+val.
# Stratifying both splits keeps the class ratio in every subset.
X_train, X_val, y_train, y_val = train_test_split(
    X_trainval, y_trainval, test_size=0.10, random_state=42, stratify=y_trainval
)

print("Train:", X_train.shape, "Val:", X_val.shape, "Test:", X_test.shape)
print("Train classes:", y_train.value_counts())
print("Val classes:", y_val.value_counts())
print("Test classes:", y_test.value_counts())
## KNN
## ----------------
# Feature scaling IS essential for KNN: it is a distance-based model, so
# unscaled features with large ranges would dominate the metric.
#
# BUG FIX: the original called scaler.transform(X_train) on an UNFITTED
# StandardScaler, which raises NotFittedError. The scaler must first learn
# the mean/std from the training data (and only the training data, to avoid
# leaking validation/test statistics).
scaler = StandardScaler()
X_train_s = scaler.fit_transform(X_train)  # fit on train, then transform it
X_val_s = scaler.transform(X_val)          # reuse the train statistics
X_test_s = scaler.transform(X_test)

# Baseline model: k=5 neighbors, Euclidean distance.
knn = KNeighborsClassifier(n_neighbors=5, metric="euclidean")
knn.fit(X_train_s, y_train)
y_val_pred = knn.predict(X_val_s)

# Spot-check the first 10 validation predictions against the true labels.
for i in range(10):
    print(f"{i:02d} true={y_val.iloc[i]} pred={y_val_pred[i]}")
## EVALUATION METRICS (confusion matrix)
## ----------------
# Confusion matrix on the validation set for the k=5 baseline.
cm = confusion_matrix(y_val, y_val_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=data.target_names)
disp.plot(values_format="d")
plt.title("Confusion matrix (VAL) | k=5")
plt.show()

# Standard binary-classification scores on the validation set.
acc = accuracy_score(y_val, y_val_pred)
prec = precision_score(y_val, y_val_pred)
rec = recall_score(y_val, y_val_pred)
f1 = f1_score(y_val, y_val_pred)
print(f"\nVAL | accuracy={acc:.2f} precision={prec:.2f} recall={rec:.2f} f1={f1:.2f}")
## HYPERPARAMETER IMPACT ON THE VALIDATION SET
## ----------------
# Euclidean distance: sweep k = 1..30, score each model by validation F1.
k_list = list(range(1, 31))
val_f1 = []
for k in k_list:
    candidate = KNeighborsClassifier(n_neighbors=k, metric="euclidean")
    candidate.fit(X_train_s, y_train)
    val_f1.append(f1_score(y_val, candidate.predict(X_val_s)))

# Pick the k with the highest validation F1.
best_idx = int(np.argmax(val_f1))
best_k = k_list[best_idx]
best_f1 = float(np.max(val_f1))
print("best k per F1 on VAL for euclidean distance:", best_k, "F1:", round(best_f1, 2))

# Visualize F1 as a function of k.
plt.figure(figsize=(9, 4))
plt.plot(k_list, val_f1, marker="o")
plt.xlabel("k (n_neighbors)")
plt.ylabel("F1 on validation subset")
plt.title("k choice on validation subset")
plt.grid(True, alpha=0.3)
plt.show()
# Manhattan (taxicab) distance: same k = 1..30 sweep as above.
k_list = list(range(1, 31))
val_f1 = []
for k in k_list:
    knn_k = KNeighborsClassifier(n_neighbors=k, metric="manhattan")
    knn_k.fit(X_train_s, y_train)
    pred = knn_k.predict(X_val_s)
    val_f1.append(f1_score(y_val, pred))

best_k = k_list[int(np.argmax(val_f1))]
best_f1 = float(np.max(val_f1))
# BUG FIX: the original message claimed "euclidean distance" even though
# this sweep uses the manhattan metric.
print("best k per F1 on VAL for manhattan distance:", best_k, "F1:", round(best_f1, 2))

plt.figure(figsize=(9, 4))
plt.plot(k_list, val_f1, marker="o")
plt.xlabel("k (n_neighbors)")
plt.ylabel("F1 on validation subset")
# Distinguish this plot from the euclidean one (the original titles were identical).
plt.title("k choice on validation subset (manhattan)")
plt.grid(True, alpha=0.3)
plt.show()
## TRAINING THE FINAL MODEL ON (train + val)
## ----------------
# Refit on train+val (both already scaled with training statistics) so the
# final model sees as much data as possible before the test evaluation.
X_train_final = pd.concat([pd.DataFrame(X_train_s), pd.DataFrame(X_val_s)], axis=0)
y_train_final = pd.concat([y_train, y_val], axis=0)

# BUG FIX: the original hard-coded n_neighbors=1 while the plot title below
# reports k={best_k}; use the k actually selected on the validation set.
# NOTE(review): best_k at this point comes from the manhattan sweep while the
# metric here is euclidean — confirm which combination was intended.
knn_final = KNeighborsClassifier(n_neighbors=best_k, metric="euclidean", weights="distance")
knn_final.fit(X_train_final, y_train_final)
y_test_pred = knn_final.predict(X_test_s)

cm = confusion_matrix(y_test, y_test_pred)
ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=data.target_names).plot(values_format="d")
plt.title(f"Confusion matrix (TEST) | k={best_k}")
plt.show()

acc = accuracy_score(y_test, y_test_pred)
prec = precision_score(y_test, y_test_pred)
rec = recall_score(y_test, y_test_pred)
f1 = f1_score(y_test, y_test_pred)
print(f"\nTEST | accuracy={acc:.2f} precision={prec:.2f} recall={rec:.2f} f1={f1:.2f}")
## PIPELINE
## ----------------
from sklearn.pipeline import Pipeline  # redundant (already imported at the top); kept so this section stands alone

# Fresh train/val/test split (same ratios and seed as before).
X_trainval, X_test, y_trainval, y_test = train_test_split(
    X, y, test_size=0.10, random_state=42, stratify=y)
X_train, X_val, y_train, y_val = train_test_split(
    X_trainval, y_trainval, test_size=0.10, random_state=42, stratify=y_trainval)

# Define the pipeline: bundling scaler + KNN means fit() learns the scaling
# statistics only from the data it is fitted on — no leakage by construction.
pipe = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("knn", KNeighborsClassifier()),
])

# Try hyperparameters on the validation set.
pipe.set_params(knn__n_neighbors=3, knn__metric='manhattan', knn__weights='distance').fit(X_train, y_train)
y_val_pred = pipe.predict(X_val)
print("F1 (VAL):", f1_score(y_val, y_val_pred))

# Once the hyperparameters are settled, the same pipeline is refitted on
# train+val and evaluated on the held-out test set.
pipe.set_params(knn__n_neighbors=3, knn__metric='manhattan', knn__weights='distance').fit(X_trainval, y_trainval)
y_test_pred = pipe.predict(X_test)
print("F1 (TEST):", f1_score(y_test, y_test_pred))

cm2 = confusion_matrix(y_test, y_test_pred)
ConfusionMatrixDisplay(confusion_matrix=cm2, display_labels=data.target_names).plot(values_format="d")
# BUG FIX: the original title hard-coded k=1, but the pipeline above is
# fitted with n_neighbors=3.
plt.title("Confusion matrix (TEST) | k=3")
plt.show()
## ROC CURVE
## ----------------
# Guard on predict_proba before computing ROC/AUC (KNeighborsClassifier
# provides it, so this branch is normally taken).
final_estimator = pipe.named_steps["knn"]
if hasattr(final_estimator, "predict_proba"):
    # Probability of the positive class (column 1) for every test sample.
    y_score = pipe.predict_proba(X_test)[:, 1]
    auc = roc_auc_score(y_test, y_score)
    print("ROC-AUC (TEST):", round(auc, 2))
    RocCurveDisplay.from_predictions(y_test, y_score)
    plt.title("ROC kriva (TEST)")
    plt.show()
##X
##----------------