#uvoz biblioteka
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import seaborn as sb

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import (
    confusion_matrix, ConfusionMatrixDisplay,
    classification_report, accuracy_score
)

from sklearn.model_selection import StratifiedKFold, GridSearchCV

##LOADING AND FIRST LOOK
##----------------
df = sb.load_dataset("penguins")

# quick sanity checks: size, missing values, a few rows, numeric summary
print(df.shape)
print(df.isna().sum())
print(df.head())
print(df.describe())

# clean-up: remove every row containing a NaN, then renumber the index
df = df.dropna().reset_index(drop=True)

# verify the result and look at the categorical columns and class balance
print(df.shape)
print(df["island"].unique())
print(df["sex"].unique())
print(df["species"].value_counts())
print(df.head())

##TARGET AND FEATURES
##----------------
# the target variable is species; KNN cannot consume categorical text
# directly, so the categorical columns will go through one-hot encoding
target = "species"

num_cols = ["bill_length_mm", "bill_depth_mm", "flipper_length_mm", "body_mass_g"]
cat_cols = ["island", "sex"]

# keep independent copies so later transforms never touch the original df
X = df.loc[:, num_cols + cat_cols].copy()
y = df[target].copy()

print("X shape:", X.shape)
print("y shape:", y.shape)
print("classes:", y.unique())

##TRAIN/VAL/TEST SPLIT
##----------------
# Hold out the test set first, then split the remainder into train and
# validation. stratify=y keeps class proportions similar across subsets.
X_trainval, X_test, y_trainval, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# BUG FIX: the comment promised a 60/20/20 split (0.25 * 0.80 = 0.20), but
# the code used test_size=0.2 here, which actually yields 64/16/20.
# Corrected to 0.25 so the split matches the stated intent.
X_train, X_val, y_train, y_val = train_test_split(
    X_trainval, y_trainval, test_size=0.25, random_state=42, stratify=y_trainval
)

print("Train:", X_train.shape, y_train.shape)
print("Val:  ", X_val.shape, y_val.shape)
print("Test: ", X_test.shape, y_test.shape)

# BUG FIX: the header string began with "\C" — an invalid escape sequence
# that printed a literal backslash; replaced with the intended "\n".
print("\nClass distributions (train/val/test):")
print("Train:\n", y_train.value_counts(normalize=True).round(1))
print("Val:\n", y_val.value_counts(normalize=True).round(1))
print("Test:\n", y_test.value_counts(normalize=True).round(1))

# numeric features: euclidean / manhattan distances apply directly
# categorical features: one-hot first (dice-/jaccard-like overlap in 0/1 space)

# StandardScaler only on the numeric columns, OneHotEncoder only on the
# categorical ones; any other column is dropped.
preprocess = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), num_cols),
        # ROBUSTNESS FIX: without handle_unknown="ignore", transform() raises
        # if a category absent from the training fold appears in val/test/CV
        # data; with it, the unseen category simply encodes as all zeros.
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
    ],
    remainder="drop",
)

##PIPELINE
##----------------
# chain preprocessing and KNN so scaling/encoding is refit on every .fit()
pipe = Pipeline(steps=[
    ("prep", preprocess),
    ("knn", KNeighborsClassifier(metric="euclidean")),
])

# try several k values and score each on the validation set
k_values = [1, 2, 3, 4, 5, 10, 20, 50]
val_acc = []

for k in k_values:
    pipe.set_params(knn__n_neighbors=k)
    pipe.fit(X_train, y_train)
    y_val_pred = pipe.predict(X_val)
    acc = accuracy_score(y_val, y_val_pred)
    val_acc.append(acc)

# BUG FIX: the summary table was built and sorted but never assigned or
# printed — a bare expression is a no-op in a plain script (it only renders
# in a notebook). Print it explicitly so the ranking is visible.
results = pd.DataFrame({"k": k_values, "val_accuracy": val_acc})
print(results.sort_values("val_accuracy", ascending=False))

# visualise how k influences validation accuracy
plt.figure(figsize=(8, 4))
plt.plot(k_values, val_acc, marker="o")
plt.xlabel("k (n_neighbors)")
plt.ylabel("Tačnost na validacionom skupu")
plt.title("Uticaj k na tačnost")
plt.grid(True)
plt.show()

# pick the first k reaching the maximum accuracy
# (list.index(max(...)) uses the same tie-break as np.argmax: first winner)
best_k = k_values[val_acc.index(max(val_acc))]
print("best k on validation set:", best_k)

##FINAL MODEL
##----------------
# refit with the chosen k on train+val combined, then score once on the
# untouched test set
pipe.set_params(knn__n_neighbors=best_k)
pipe.fit(X_trainval, y_trainval)

y_test_pred = pipe.predict(X_test)
test_acc = accuracy_score(y_test, y_test_pred)
print("Accuracy (test):", round(test_acc, 3))

##CONFUSION MATRIX (3x3)
##----------------
# fixed, sorted label order so both matrices in this script line up
labels = sorted(y.unique())

cm = confusion_matrix(y_test, y_test_pred, labels=labels)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)

# BUG FIX: disp.plot() creates its own figure when no Axes is supplied, so
# the previous plt.figure(figsize=(6, 6)) produced an extra, empty window.
# Create the Axes explicitly and hand it to disp.plot() instead.
fig, ax = plt.subplots(figsize=(6, 6))
disp.plot(ax=ax, cmap=None, values_format="d")
ax.set_title("Confusion matrix (TEST)")
plt.show()

print(classification_report(y_test, y_test_pred, digits=3))

##CROSS-VALIDATION (manual)
##----------------
# split the train+val data into ten stratified folds: nine folds fit the
# model, the tenth validates it; repeat so each fold validates exactly once

skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# fresh pipeline with the chosen k, so the tuned `pipe` stays untouched
pipe_cv = Pipeline([
    ("prep", preprocess),
    ("knn", KNeighborsClassifier(n_neighbors=best_k, metric="euclidean")),
])

cv_scores = []
fold_no = 0
for idx_tr, idx_va in skf.split(X_trainval, y_trainval):
    fold_no += 1
    # positional indexing — StratifiedKFold yields positions, not labels
    fold_model = pipe_cv.fit(X_trainval.iloc[idx_tr], y_trainval.iloc[idx_tr])
    pred_va = fold_model.predict(X_trainval.iloc[idx_va])
    fold_acc = accuracy_score(y_trainval.iloc[idx_va], pred_va)
    cv_scores.append(fold_acc)

    print(f"Fold {fold_no:02d} accuracy: {fold_acc:.3f}")

print("\nCV mean accuracy:", round(np.mean(cv_scores), 3))
print("CV std  accuracy:", round(np.std(cv_scores), 3))

##GRIDSEARCHCV + PIPELINE
##----------------
param_grid = {
    "knn__n_neighbors": [1, 2, 3, 4, 5, 10, 20, 50],
    "knn__weights": ["uniform", "distance"],
    # BUG FIX: `pipe` was built with metric="euclidean", and scikit-learn
    # ignores the `p` parameter unless metric="minkowski" — so the p=1
    # (Manhattan) candidates were silently identical to p=2 and Manhattan
    # was never actually evaluated. Forcing metric back to "minkowski" in
    # the grid makes p=1 → Manhattan, p=2 → Euclidean work as intended.
    "knn__metric": ["minkowski"],
    "knn__p": [1, 2],
}

grid = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring="accuracy",
    cv=skf,            # reuse the stratified 10-fold splitter from above
    n_jobs=-1,
    verbose=0,
)

grid.fit(X_trainval, y_trainval)

print("Best parameters:", grid.best_params_)
print("Best CV score:", round(grid.best_score_, 3))

# final evaluation: GridSearchCV refits the best estimator on all of
# trainval, so it can be scored directly on the held-out test set
best_model = grid.best_estimator_
y_test_pred_gs = best_model.predict(X_test)

gs_test_acc = accuracy_score(y_test, y_test_pred_gs)
print("Accuracy (TEST, GridSearch):", round(gs_test_acc, 3))
print("\nClassification report (TEST, GridSearch):")
print(classification_report(y_test, y_test_pred_gs, digits=3))

cm2 = confusion_matrix(y_test, y_test_pred_gs, labels=labels)
disp2 = ConfusionMatrixDisplay(confusion_matrix=cm2, display_labels=labels)

# BUG FIX: disp2.plot() opens its own figure when no Axes is passed, leaving
# the preceding plt.figure(figsize=(6, 6)) as an extra empty window. Pass an
# explicitly created Axes instead.
fig2, ax2 = plt.subplots(figsize=(6, 6))
disp2.plot(ax=ax2, cmap=None, values_format="d")
ax2.set_title("Matrica konfuzije (TEST) — GridSearch model")
plt.show()

##END OF SCRIPT
##----------------