#uvoz biblioteka
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import (
confusion_matrix, ConfusionMatrixDisplay,
classification_report, accuracy_score
)
from sklearn.model_selection import StratifiedKFold, GridSearchCV
## LOADING AND FIRST LOOK
## ----------------
df = sb.load_dataset("penguins")
print(df.shape)
print(df.isna().sum())
print(df.head())  # debug peek
print(df.describe())  # debug peek
# Clean the data: drop every row containing a NaN, then rebuild the index.
df = df.dropna(axis=0).reset_index(drop=True)
print(df.shape)
for column in ("island", "sex"):
    print(df[column].unique())
print(df["species"].value_counts())
print(df.head())  # debug peek
## TARGET AND FEATURES
## ----------------
# The target variable is "species".
# k-NN cannot work with categorical text directly — those columns are
# one-hot encoded later in the preprocessing step.
target = "species"
num_cols = ["bill_length_mm", "bill_depth_mm", "flipper_length_mm", "body_mass_g"]
cat_cols = ["island", "sex"]
feature_cols = num_cols + cat_cols
X = df[feature_cols].copy()
y = df[target].copy()
print("X shape:", X.shape)
print("y shape:", y.shape)
print("classes:", y.unique())
## TRAIN/VAL/TEST SPLIT
## ----------------
# First carve off the test set, then split the remainder into train and
# validation. stratify=y keeps the class proportions similar in all subsets.
X_trainval, X_test, y_trainval, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
# Bug fix: the second split used test_size=0.2 (a 64/16/20 overall split),
# contradicting the documented design. 0.25 * 0.80 = 0.20 of the full data,
# which yields the intended 60/20/20 split.
X_train, X_val, y_train, y_val = train_test_split(
    X_trainval, y_trainval, test_size=0.25, random_state=42, stratify=y_trainval
)
print("Train:", X_train.shape, y_train.shape)
print("Val: ", X_val.shape, y_val.shape)
print("Test: ", X_test.shape, y_test.shape)
# Bug fix: "\C" was an invalid escape sequence; "\n" was intended.
print("\nClass distributions (train/val/test):")
print("Train:\n", y_train.value_counts(normalize=True).round(1))
print("Val:\n", y_val.value_counts(normalize=True).round(1))
print("Test:\n", y_test.value_counts(normalize=True).round(1))
# Numeric features: Euclidean/Manhattan distances are meaningful after scaling.
# Categorical features: one-hot encoding (Dice/Jaccard-style overlap).
# StandardScaler is applied only to the numeric columns and OneHotEncoder
# only to the categorical ones; any other column is dropped.
preprocess = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), num_cols),
        # Robustness fix: handle_unknown="ignore" prevents transform() from
        # raising when a validation/CV fold contains a category that was
        # absent from the fitting fold (encoded as all zeros instead).
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
    ],
    remainder="drop"
)
## PIPELINE
## ----------------
pipe = Pipeline([
    ("prep", preprocess),
    ("knn", KNeighborsClassifier(metric='euclidean'))
])
# Try several values of k, scoring each one on the validation set.
k_values = [1, 2, 3, 4, 5, 10, 20, 50]
val_acc = []
for k in k_values:
    pipe.set_params(knn__n_neighbors=k)
    pipe.fit(X_train, y_train)
    y_val_pred = pipe.predict(X_val)
    acc = accuracy_score(y_val, y_val_pred)
    val_acc.append(acc)
# Bug fix: this summary table was previously built and silently discarded
# (the expression's value was never printed or assigned).
print(
    pd.DataFrame({"k": k_values, "val_accuracy": val_acc})
    .sort_values("val_accuracy", ascending=False)
)
# Visualise the effect of k on validation accuracy.
plt.figure(figsize=(8, 4))
plt.plot(k_values, val_acc, marker="o")
plt.xlabel("k (n_neighbors)")
plt.ylabel("Tačnost na validacionom skupu")
plt.title("Uticaj k na tačnost")
plt.grid(True)
plt.show()
best_k = k_values[int(np.argmax(val_acc))]
print("best k on validation set:", best_k)
## FINAL MODEL
## ----------------
# Refit with the chosen k on train+validation, then evaluate once on test.
pipe.set_params(knn__n_neighbors = best_k)
pipe.fit(X_trainval, y_trainval)
y_test_pred = pipe.predict(X_test)
test_acc = accuracy_score(y_test, y_test_pred)
print("Accuracy (test):", round(test_acc, 3))
## CONFUSION MATRIX (3x3)
## ----------------
labels = sorted(y.unique())
cm = confusion_matrix(y_test, y_test_pred, labels=labels)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)
# Bug fix: calling plt.figure() before disp.plot() left an extra empty
# figure open, because ConfusionMatrixDisplay.plot() creates its own figure
# unless an Axes is supplied explicitly.
fig, ax = plt.subplots(figsize=(6, 6))
disp.plot(ax=ax, cmap=None, values_format="d")
plt.title("Confusion matrix (TEST)")
plt.show()
print(classification_report(y_test, y_test_pred, digits=3))
## CROSS-VALIDATION (manual)
## ----------------
# The train+val data is cut into ten stratified folds: nine folds fit the
# model, the tenth scores it, and each fold serves as the validation part
# exactly once.
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
k_for_cv = best_k
pipe_cv = Pipeline([
    ("prep", preprocess),
    ("knn", KNeighborsClassifier(n_neighbors=k_for_cv, metric='euclidean'))
])
cv_scores = []
fold_iter = enumerate(skf.split(X_trainval, y_trainval), start=1)
for fold_idx, (train_idx, val_idx) in fold_iter:
    fold_fit_X = X_trainval.iloc[train_idx]
    fold_fit_y = y_trainval.iloc[train_idx]
    fold_eval_X = X_trainval.iloc[val_idx]
    fold_eval_y = y_trainval.iloc[val_idx]
    pipe_cv.fit(fold_fit_X, fold_fit_y)
    acc = accuracy_score(fold_eval_y, pipe_cv.predict(fold_eval_X))
    cv_scores.append(acc)
    print(f"Fold {fold_idx:02d} accuracy: {acc:.3f}")
print("\nCV mean accuracy:", round(np.mean(cv_scores), 3))
print("CV std accuracy:", round(np.std(cv_scores), 3))
## GRIDSEARCHCV + PIPELINE
## ----------------
# Exhaustive search over k, the vote weighting, and the Minkowski power.
param_grid = {
    "knn__n_neighbors": [1, 2, 3, 4, 5, 10, 20, 50],
    "knn__weights": ["uniform", "distance"],
    "knn__p": [1, 2],  # p=1 -> Manhattan, p=2 -> Euclidean (Minkowski)
}
grid = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring="accuracy",
    cv=skf,
    n_jobs=-1,
    verbose=0
)
grid.fit(X_trainval, y_trainval)
print("Best parameters:", grid.best_params_)
print("Best CV score:", round(grid.best_score_, 3))
# Final evaluation of the tuned model on the held-out test set.
best_model = grid.best_estimator_
y_test_pred_gs = best_model.predict(X_test)
print("Accuracy (TEST, GridSearch):", round(accuracy_score(y_test, y_test_pred_gs), 3))
print("\nClassification report (TEST, GridSearch):")
print(classification_report(y_test, y_test_pred_gs, digits=3))
cm2 = confusion_matrix(y_test, y_test_pred_gs, labels=labels)
disp2 = ConfusionMatrixDisplay(confusion_matrix=cm2, display_labels=labels)
# Bug fix: draw on an explicit Axes so disp2.plot() does not spawn a second,
# empty figure window (it creates its own figure when ax is not given).
fig2, ax2 = plt.subplots(figsize=(6, 6))
disp2.plot(ax=ax2, cmap=None, values_format="d")
plt.title("Matrica konfuzije (TEST) — GridSearch model")
plt.show()
##X
##----------------