# ===== Exploratory Data Analysis 1 =====

#uvoz biblioteka

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb
from pathlib import Path

#podesavanje ispisa (na dve decimale)
# Display floats with two decimal places everywhere pandas prints.
pd.set_option('display.float_format', lambda x: '%.2f' % x)
DATA_PATH = Path("city_temperature.csv")

## Load the data and print a basic overview.
df = pd.read_csv(DATA_PATH)
# Dimensions (rows, columns).
print("Dimenzije:", df.shape)
print(df.head()) # use display() on Jupyter
print("\nInfo:")
print(df.info())
print("\nBroj jedinstvenih po koloni:")
print(df.nunique()) # use display() on Jupyter

# Memory footprint; 1e6 = 1,000,000 (bytes -> MB).
print("\n Memorija u MB:", df.memory_usage(deep=True).sum()/1e6)

## Check and treat missing values.
# Count of NaNs per column.
NANs = df.isnull().sum()
print(NANs)

# Share of missing values as a percentage.
# (Reuse the NANs Series computed above instead of calling
# df.isnull().sum() a second time over the whole frame.)
udeo = NANs/len(df) * 100
print("Nan: ",udeo,"%", sep="")

# Descriptive statistics.
print(df.describe()) # use display() on Jupyter

# Treatment of sentinel values.
# AvgTemperature == -99 is a "no measurement" placeholder -> NaN.
n_before = (df["AvgTemperature"] == -99).sum()
df.loc[df["AvgTemperature"] == -99, "AvgTemperature"] = np.nan
# (the column will have -99 replaced in {n_before} samples)

# Day == 0 is not a valid calendar day -> NaN.
n_before = (df["Day"] == 0).sum()
df.loc[df["Day"] == 0, "Day"] = np.nan
# (the column will have day 0 replaced in {n_before} samples)

print("Duplikati potpuno iste vrste:", df.duplicated().sum())

# Every distinct year, each listed once.
print(df['Year'].unique())

# "State" is dropped from further analysis; axis=1 = column.
# Guarded with a membership test, consistent with the later sections
# of this file, so the cell can be re-run safely.
if "State" in df.columns:
    df = df.drop("State", axis=1)

# Fill AvgTemperature: since the table is sorted, forward-filling with
# the last valid observation is a reasonable imputation here.
df["AvgTemperature"] = df["AvgTemperature"].ffill()

# Drop the remaining rows with missing values and confirm nothing is left.
df = df.dropna(axis=0)
print(df.isnull().sum())

## Univariate analysis of AvgTemperature and the categorical columns.
# Prepare the figure.
plt.figure(figsize=(6,4))
# Histogram of the target, in 50 bins.
plt.hist(df["AvgTemperature"], bins=50)
plt.title("Histogram")
plt.xlabel("AvgTemperature")
plt.ylabel("Count")
plt.show()

# Boxplot of the same variable.
plt.figure(figsize=(4,5))
plt.boxplot(df["AvgTemperature"], vert=True)
plt.title("Boxplot")
plt.ylabel("AvgTemperature")
plt.show()

# Categorical overview: the top_k most frequent countries.
top_k = 15

vc_country = df["Country"].value_counts().head(top_k)
plt.figure(figsize=(8,4))
plt.bar(vc_country.index.astype(str), vc_country.values)
# BUG FIX: plt.title is not print() -- it takes a single label string.
# The original plt.title("Top", top_k, "zemalja...") passed top_k as the
# fontdict and the third string as loc, which raises at runtime.
plt.title(f"Top {top_k} zemalja po broju zapisa")
plt.xticks(rotation=45, ha="right")
plt.tight_layout(); plt.show()

## Correlations and scatter plot.

# Select the numeric columns from df.
num_df = df[["AvgTemperature", "Month", "Year"]]
# Build the Pearson correlation matrix.
corr = num_df.corr(method="pearson")

# Visualise the correlation matrix as a heatmap.
plt.figure(figsize=(6,4))
sb.heatmap(corr, annot=True, fmt=".3f", cmap="vlag", square=True)
plt.title("Corr")
plt.tight_layout()
plt.show()

# Scatter plot.
# Group by region (continent) and month.
g_rm = df.groupby(["Region","Month"], as_index=False)["AvgTemperature"].mean()

# Pivot that grouping: rows = months, columns = regions.
pivot = g_rm.pivot(index="Month", columns="Region", values="AvgTemperature")
print(pivot) # use display() on Jupyter

# Extract Europe and Australia.
g_eu = pivot["Europe"]
g_au = pivot["Australia/South Pacific"]

corr_value = g_eu.corr(g_au)
print("Korelacija (Evropa i Australija):", corr_value)

plt.figure(figsize=(10,5))
plt.scatter(g_eu, g_au, marker="o")
plt.title("AvgTemperature za Evropu i Australiju po mesecima")
plt.xlabel("AvgTemp Evropa"); plt.ylabel("AvgTemp Australija")
# NOTE(review): both axes share the same lower bound but each uses its own
# series maximum for the upper bound -- presumably intentional; confirm.
plt.xlim([min(min(g_eu),min(g_au))-10,max(g_eu)+10]), plt.ylim([min(min(g_eu),min(g_au))-10,max(g_au)+10])
plt.tight_layout()
plt.show()


# ===== Exploratory Data Analysis 2 =====

#zadatak
"""
    * Kreirajte AvgTemperature_C = (AvgTemperature - 32) * 5/9 pa ga nadalje koristite za analize...
    * Kako se odnose raspodela temperatura 1995. i 2005. godine? Prikazati pomoću histograma. (Iskoristiti 
      opciju za transparentnost histograma alpha kako bi se oba histograma mogla prikazati na istom grafiku.)
    * Prikažite Top-5 zemalja sa najvišim prosečnim termperaturama.
    * Napravite boxplot po kontinentu (izaberite 2 kontinenta) — uporedite raspodele prosečnih temperatura.
"""

# Imports.
import pandas as pd
import numpy as np
# BUG FIX: the original imported `matplotlib as plt`, so every later
# `plt.hist`/`plt.figure`/`plt.legend` call in this section would fail --
# those functions live in the pyplot submodule.
import matplotlib.pyplot as plt
import seaborn as sb
from pathlib import Path

# Global default figure size for the plots below.
plt.rcParams['figure.figsize'] = (10,5)

#ucitavanje fajla
DATA_PATH = Path("city_temperature.csv")
df = pd.read_csv(DATA_PATH)

#ciscenje i dopuna podataka
#menjanje prosecnih temperatura sa -99, nultih dana... na nan
df.loc[df["AvgTemperature"] == -99, "AvgTemperature"] = np.nan
df.loc[df["Day"] == 0, "Day"] = np.nan

#menjanje 200. i 201. godine isto u nan
df["Year"] = df["Year"].replace({200: np.nan, 201: np.nan})

#izbacivanje nepotrebne kolone
if "State" in df.columns:
    df = df.drop("State", axis=1)

#forward fill za avgtemp
df["AvgTemperature"] = df["AvgTemperature"].ffill()

#izbacuju se SVI nanovi!!!!!!!
df = df.dropna(axis=0)
print("Oblik po ciscenju:", df.shape)
print(df.head()) #display na jupyteru

#1. postavljanje kolone na celzijus
df["AvgTemperature_C"] = (df["AvgTemperature"] - 32) * (5/9)
print(df.head())
print(df["AvgTemperature_C"].describe())

# 2. Histograms: 1995 vs 2005.
sub_95 = df.loc[df["Year"]==1995, "AvgTemperature_C"]
sub_05 = df.loc[df["Year"]==2005, "AvgTemperature_C"]

# Overlay both years on one plot; alpha makes the bars translucent.
plt.hist(sub_95, density=False, bins=30, alpha=0.5, label="1995")
plt.hist(sub_05, density=False, bins=30, alpha=0.5, label="2005")

# Labels and legend.
plt.xlabel("AvgTemperature (°C)")
plt.ylabel("Broj pojavljivanja")
plt.legend(loc="best")
plt.tight_layout()
plt.show()

# 3. Top five countries by mean temperature in °C.
country_mean = (df.groupby("Country", as_index=True)["AvgTemperature_C"]\
                .mean()\
                .sort_values(ascending=False)\
                .head(5))

# Print, rounded to two decimals.
print(country_mean.to_frame("AvgTemperature_C_mean").round(2)) # use display() on Jupyter

# Visualisation.
plt.figure(figsize=(10,5))
sb.barplot(x=country_mean.index.astype(str), y=country_mean.values)
plt.title("Top 5 zemalja po prosecnoj temperaturi")
plt.xlabel("Country")
plt.ylabel("Prosecna temperatura u C")
plt.xticks(rotation=30, ha="right")
plt.tight_layout()
plt.show()

# 4. Boxplot per continent.
# Extract the two regions to compare.
sub_eu = df.loc[df["Region"] == "Europe", "AvgTemperature_C"]
sub_au = df.loc[df["Region"] == "Australia/South Pacific", "AvgTemperature_C"]

# Prepare the plot.
plt.figure(figsize=(10,5))
plt.boxplot([sub_eu, sub_au])
plt.title("Boxplot Avg °C")
plt.xlabel("Region")
plt.ylabel("AvgTemprature")
plt.tight_layout()
plt.show()


# ===== Exploratory Data Analysis 3 =====

#uvoz biblioteka

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb
from pathlib import Path
from scipy.stats import chi2_contingency, f_oneway
from statsmodels.stats.multicomp import pairwise_tukeyhsd

# Display floats with two decimal places.
pd.set_option('display.float_format', lambda x: '%.2f' % x)
DATA_PATH = Path("city_temperature.csv")

## Load the data.
df = pd.read_csv(DATA_PATH)

### Cleaning / imputation.
# Replace sentinels: -99 average temperatures and day 0 -> NaN.
df.loc[df["AvgTemperature"] == -99, "AvgTemperature"] = np.nan
df.loc[df["Day"] == 0, "Day"] = np.nan
# ...and the mistyped years as well.
df["Year"] = df["Year"].replace({200: np.nan, 201: np.nan})

# Drop the redundant COLUMN + forward fill the temperature.
df = df.drop("State", axis=1)
df["AvgTemperature"] = df["AvgTemperature"].ffill()

# Drop the NaN ROWS.
df = df.dropna(axis=0)

print(df.shape)
print(df.head()) # use display() on Jupyter

### Grouping and pivot tables.
grp_city_month = (df.groupby(["Region","Country","City","Month"], as_index = False)\
                 ["AvgTemperature"].mean())
print(grp_city_month.head(10)) # use display() on Jupyter

# Pivot: rows = cities, columns = months.
pivot_city_month = grp_city_month.pivot_table(index="City", columns="Month", \
                                              values = "AvgTemperature")
print(pivot_city_month.head()) # use display() on Jupyter

# Heatmap visualisation (first 10 cities only).
plt.figure(figsize=(10,6))
sb.heatmap(pivot_city_month.iloc[:10], annot=False, cmap="coolwarm", linewidths=.5)
plt.title("Heatmap: AvgTemperatre City by Month (mean)")
plt.tight_layout()
plt.show()

### LINE CHARTS.
# Example 1: temperature over the months for a single city.

top_city = df["City"].value_counts().index[0]
# Extract one city.
sub = df[df["City"] == top_city] 

monthly_city = (sub.groupby("Month", as_index=False)["AvgTemperature"].mean()\
                .sort_values("Month"))

plt.figure(figsize=(6,4))
# Plot X and Y respectively.
plt.plot(monthly_city["Month"], monthly_city["AvgTemperature"], marker="o")
plt.title(f"Temp mesec {top_city}")
plt.xlabel("Month")
plt.ylabel("AvgTemperature")
# Ticks 1..12 (range(1,13) excludes 13, i.e. Jan-Dec).
plt.xticks(range(1,13))

# Enable the grid.
plt.grid(True)
plt.show()

# Example 2: temperature trend across the years.
city_year = (df.groupby(["City","Year"],as_index=False)\
             ["AvgTemperature"].mean())

# Explicitly extract city_year for the city chosen above.
cy = city_year[city_year["City"] == "Portland"]

plt.figure(figsize=(6,4))
plt.plot(cy["Year"], cy["AvgTemperature"],marker="o")
plt.title("Annual trend, Portland")
plt.xlabel("Year")
plt.ylabel("AvgTemperature")
plt.grid(True)
plt.show()

# Hmm... an extreme value appeared at the end of the series.
# Investigate it:
df_city = df.loc[df["City"] == 'Portland']
df_city_2020 = df_city.loc[df_city["Year"] == 2020]
print(df_city_2020["Month"].unique())
# Prints [1, 2, 3, 4, 5] -- 2020 is incomplete, hence the apparent drop.

# Example 3: mean temperature per Region x Month.
g_rm = df.groupby(["Region","Month"],as_index=False)["AvgTemperature"].mean()

plt.figure(figsize=(10,5))
sb.lineplot(data=g_rm, x="Month", y="AvgTemperature", hue="Region", marker="o")
plt.title("AvgTemperature per month per region")
plt.xlabel("Month")
plt.ylabel("AvgTemperature (°C)")

# Move the legend outside the axes with bbox_to_anchor.
plt.legend(title="Region", bbox_to_anchor=(1.02, 1), loc="upper left")
plt.tight_layout()
plt.show()

### BAR CHARTS.
# Example 1: top 10 cities by mean temperature.

# First extract the top 10.
city_mean = df.groupby("City", as_index = True)["AvgTemperature"].mean().\
            sort_values(ascending=False).head(10)

plt.figure(figsize=(9,4))
sb.barplot(x=city_mean.index.astype(str), y=city_mean.values)
plt.title("Top 10 cities")
plt.xlabel("City")
plt.ylabel("AvgTemperature (°F)")
plt.xticks(rotation=45, ha="right")
plt.tight_layout()
plt.show()

# Example 2: mean temperature per region per month.
regions = ["Europe", "Australia/South Pacific"]

# Keep only the rows whose Region is in the regions list.
g2 = df[df["Region"].isin(regions)]

sb.barplot(data=g2, x="Month", y="AvgTemperature", hue="Region")
plt.legend(title="Region", bbox_to_anchor=(1.02, 1), loc="upper left")
plt.show()

### ANALYSIS OF CATEGORICAL VARIABLES - VALUE_COUNTS.
# The 10 most frequent cities and countries.

# Extract and print as percentages (normalize=True gives fractions).
vc_city = df["City"].value_counts(normalize=True).head(10) * 100
print(vc_city.round(2).astype(str) + "%") # use display() on Jupyter


plt.figure(figsize=(9,4))
sb.barplot(x=vc_city.index.astype(str), y=vc_city.values)
plt.title("Top 10 Cities")
plt.xlabel("City")
plt.ylabel("%")
plt.xticks(rotation=45, ha="right")
plt.tight_layout()
plt.show()

# ...and the top 10 countries.
vc_country = df["Country"].value_counts(normalize=True).head(10) * 100
print(vc_country.round(2).astype(str) + "%") # use display() on Jupyter

plt.figure(figsize=(9,4))
sb.barplot(x=vc_country.index.astype(str), y=vc_country.values)
plt.title("Top-10 Country (procenat učešća)")
plt.ylabel("Učešće (%)"); plt.xlabel("Country")
plt.xticks(rotation=45, ha="right")
plt.tight_layout()
plt.show()

### CONTINGENCY TABLE.
# Requires: "from scipy.stats import chi2_contingency".

# The 10 countries with the most samples.
# BUG FIX: value_counts is a method -- the original was missing the call
# parentheses (`df["Country"].value_counts.head(10)` raises AttributeError).
ten_countries = df["Country"].value_counts().head(10).index
sub = df[df["Country"].isin(ten_countries)]
ct = pd.crosstab(sub["Region"], sub["Country"])
print(ct) # use display() on Jupyter

# Chi-squared test of independence.
chi2, p, dof, expected = chi2_contingency(ct)
print(f"chi2={chi2:.2f}, dof={dof}, p-value={p:.3e}")

# The test is trivially significant: chi2 is large and p is near 0; the
# Region and Country variables are dependent (a country maps to exactly
# one region).

### ONE-WAY ANOVA.
# Is there a statistically significant difference between the mean
# AvgTemperature of the groups (one group per continent)?
# Requires:
# from scipy.stats import f_oneway
# from statsmodels.stats.multicomp import pairwise_tukeyhsd

groups = [g["AvgTemperature"].values for _, g in df.groupby("Region")]

F, p = f_oneway(*groups)
print(f"ANOVA F={F:.2f}, p-value={p:.3e}")
print(pairwise_tukeyhsd(df["AvgTemperature"], df["Region"]))


# ===== Exploratory Data Analysis 4 =====

#zadatak
"""
    * Pivot + heatmap: Napravi pivot tabelu Region × Month (mean AvgTemperature) i 
      vizuelizuj seaborn heatmap. Napiši 2 zapažanja.
    * Barplot: Nacrtaj grafikon koji prikazuje prosečne temperature po regionima po godinama.
    * ANOVA: Proveri da li za Evropu postoji statistički značajna 
      razlika prosečne temperature u različitim godinama.
"""

#importi
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb
from pathlib import Path

from scipy.stats import f_oneway #3. zad

# Load the data.
plt.rcParams["figure.figsize"] = (10,5)
DATA_PATH = Path("city_temperature.csv")
df = pd.read_csv(DATA_PATH)

# Cleaning and imputation (same recipe as the earlier sections).
df.loc[df["AvgTemperature"] == -99, "AvgTemperature"] = np.nan
df.loc[df["Day"] == 0, "Day"] = np.nan
df["Year"] = df["Year"].replace({200: np.nan, 201: np.nan})

if "State" in df.columns:
    df = df.drop("State", axis=1)

df['AvgTemperature'] = df['AvgTemperature'].ffill()
df = df.dropna(axis=0)

# Celsius conversion.
df["AvgTemperature_C"] = (df["AvgTemperature"] - 32) * (5/9)

print("Shape:", df.shape)
print(df.head()) # use display() on Jupyter

# 1. Build the pivot table: rows = Region, columns = Month, mean value.
# BUG FIX: the original called df.groupy(...) -- a typo for groupby that
# raises AttributeError before anything is plotted.
pivot_rm = (df.groupby(["Region", "Month"], as_index=False)\
            ["AvgTemperature_C"]\
                .mean()\
                    .pivot(index="Region", columns="Month", values="AvgTemperature_C")\
                        .sort_index()
           )

# Visualise as a heatmap.
print(pivot_rm.round(2)) # use display() on Jupyter
plt.figure(figsize=(10,6))
sb.heatmap(pivot_rm, annot=False, cmap="coolwarm", linewidths=.5)

plt.title("Heatmap")
plt.tight_layout()
plt.show()

# Observation 1: warmest month per region.
max_month_per_region = pivot_rm.idxmax(axis=1)
# Globally warmest region, by mean of its monthly means.
mean_per_region = pivot_rm.mean(axis=1)
glob_warmest_region = mean_per_region.idxmax()

print("Najtopliji mesec po regionu:")
print(max_month_per_region.to_frame("Najtopliji mesec")) # use display() on Jupyter
# FIX: glob_warmest_region was computed but never reported.
print("Globalno najtopliji region:", glob_warmest_region)

# Observation 2:
# - Europe and North America show pronounced temperature swings over the
#   year, while Africa and Central America show almost none.

# 2. Barplot: Region x Year (last three available years only).

years_sorted = sorted(df["Year"].unique())[-3:]
g2 = df[df["Year"].isin(years_sorted)]
sb.barplot(data=g2, x="Year", y="AvgTemperature", hue="Region")
plt.legend(title="Region", bbox_to_anchor=(1.02, 1), loc="upper left")
plt.show()

# 3. One-way ANOVA: does Europe's mean temperature differ across years?
eu = df[df["Region"]=="Europe"].copy()

# One group of temperatures per year.
groups = []
for _, g in eu.groupby("Year"):
    groups.append(g["AvgTemperature"].values)
F, p = f_oneway(*groups)
print(f"ANOVA F={F:.2f}, p-value={p:.3e}")

# p < 0.05: there IS a statistically significant difference between the
#           yearly mean temperatures in Europe.
# p >= 0.05: there is NO statistically significant difference between the
#           yearly mean temperatures in Europe.

# Yearly means for inspection.
means_by_year = eu.groupby("Year")["AvgTemperature_C"].mean()
print(means_by_year.to_frame("Mean_Europe_AvgTemp_C").round(2)) # use display() on Jupyter


# ===== Linear Regression 1 =====

#uvoz biblioteka
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler #zbog normalizacije

## DATA PREPARATION
##----------------
# Goal: predict visibility in kilometres.
df_raw = pd.read_csv("weatherHistory.csv")

# Dimensions.
print(df_raw.shape)
df_raw.head()

# Analysis of the target variable.
# Are there any NaN values?
y = df_raw["Visibility (km)"]
print("number of missing values: ",y.isna().sum())

# Basic descriptive statistics.
print(y.describe()) # use display() on Jupyter

# Histogram of the target distribution.
plt.hist(y, bins=40)
plt.xlabel("Visibility")
plt.ylabel("Frequency")
plt.title("Visibility histogram")
plt.tight_layout()
plt.show()

# Choosing a feature subset for the regressor.
print("Summary: ", df_raw["Summary"].nunique())
print("Precip Type: ", df_raw["Precip Type"].nunique())
print("Daily Summary: ", df_raw["Daily Summary"].nunique())

# Summary and Daily Summary have too many categories to be usable in this
# form, and are awkward for linear regression -- they are dropped.

# Formatted Date cannot be used directly; it could be split into
# day-month-year, but it can simply be dropped as well.

# The remaining features can be listed like this:
print(df_raw.columns)

# They are now used to predict visibility. The predicted variable itself
# must NOT appear among the predictors.

# So we keep everything except the dropped columns and Visibility:
features = ["Precip Type", "Temperature (C)", "Apparent Temperature (C)",
            "Humidity", "Wind Speed (km/h)", "Wind Bearing (degrees)", "Loud Cover",
            "Pressure (millibars)"]

X = df_raw[features].copy()
print("Dimenzije izabranog podskupa:", X.shape)
X.head()

# Check for missing values.
print(X.isna().sum())

X["Precip Type"].unique()
# Hmm... a NaN value shows up here.

print(X.describe()) # use display() on Jupyter

# One histogram per numeric column (2x4 grid of subplots).
plt.figure(figsize=(10,8))
numeric_cols = X.columns[1:]
i=1

for col in numeric_cols:
    plt.subplot(2,4,i)
    plt.hist(X[col], bins=30)
    plt.title(col)
    i=i+1

plt.tight_layout()
plt.show()

# "Loud Cover" (sic, cloud cover) contains only zeros, carries no
# information, and is dropped.
# Pressure values equal to 0 are really missing readings, so they are
# converted to NaN and then imputed with the median.

X = X.drop("Loud Cover", axis=1)
# FIX: the original used chained `.replace(0, np.nan, inplace=True)` on a
# column selection, which pandas deprecates (FutureWarning) and which does
# not reliably write back under copy-on-write. Assign the result instead.
X["Pressure (millibars)"] = X["Pressure (millibars)"].replace(0, np.nan)
print(X.isnull().sum()/len(X)*100)

# Impute pressure with the median.
X['Pressure (millibars)'] = X['Pressure (millibars)'].fillna(X['Pressure (millibars)'].median())
print(X.isnull().sum()/len(X)*100)

# The model needs numeric inputs, so the categorical column must be
# converted; first make the missing category explicit.
X["Precip Type"] = X["Precip Type"].fillna("None")
print("Number of categories for Precip Type:", X["Precip Type"].nunique())
X["Precip Type"].value_counts()

# Convert to dummy features with drop_first=True (avoids collinearity):
print("Columns before dummies:", X.shape[1])
X_dummies = pd.get_dummies(X, drop_first=True)
print("Columns after dummies:", X_dummies.shape[1])
X_dummies.head()

## TRAIN / TEST SPLIT
##----------------

X_train, X_test, y_train, y_test = \
    train_test_split(X_dummies, y, test_size=0.1, random_state=42)

print("Train shape:", X_train.shape)
print(" Test shape:", X_test.shape)

## SIMPLE LINEAR REGRESSION
##----------------

# Only Humidity is used, to get an intuitive picture of the regression
# line, with hypothesis y = θ_1 * x + θ_0.

# The goal of the regression is to find the optimal thetas, i.e. to train
# the model's internal parameters.

X1 = X[["Humidity"]]
X1_train, X1_test, y1_train, y1_test = \
    train_test_split(X1, y, test_size=0.1, random_state=42)

lin_reg_1 = LinearRegression(fit_intercept=True)
lin_reg_1.fit(X1_train, y1_train)

# Slope coefficient and intercept.
print(lin_reg_1.coef_[0])
print(lin_reg_1.intercept_)

# Prediction step.
y1_pred_train = lin_reg_1.predict(X1_train)
y1_pred_test = lin_reg_1.predict(X1_test)

print("MAE train:",np.mean(np.abs(y1_train - y1_pred_train)))
print("MAE test :",np.mean(np.abs(y1_test - y1_pred_test)))

tmp = pd.DataFrame({"y_test": y1_test, "y_pred": y1_pred_test})
print(tmp.head(10)) # use display() on Jupyter

# Prepare the visualisation.
plt.figure(figsize=(7,5))

# First 20 samples from the test set.
plt.scatter(X1_test["Humidity"][:20], y1_test[:20], alpha=0.3, label="Test")
# FIX: the model was fitted on a DataFrame with a named column; predicting
# on a bare ndarray triggers sklearn's "X does not have valid feature
# names" warning. Wrap the grid in a DataFrame with the same column name.
x_line = pd.DataFrame(
    np.linspace(X1_train["Humidity"].min(), X1_train["Humidity"].max(), 200),
    columns=["Humidity"])
y_line = lin_reg_1.predict(x_line)

# Draw the fitted REGRESSION line.
plt.plot(x_line, y_line, color="red")
plt.xlabel("Humidity")
plt.ylabel("Visibility (km)")
plt.title("Simple linear regression")
plt.tight_layout()
plt.show()

## LINEAR REGRESSION WITH THE BASIC HYPOTHESIS AND MORE PREDICTORS
##----------------
# All numeric and dummy features are used, with hypothesis
# y = θ_n * x_n + ... θ_1 * x_1 + θ_0

lin_reg_2 = LinearRegression(fit_intercept=True)
lin_reg_2.fit(X_train, y_train)
y_pred_2 = lin_reg_2.predict(X_test)

mae = mean_absolute_error(y_test, y_pred_2)
print("MAE:",mae)

# If the linear-model assumptions hold, the residuals should be
# distributed symmetrically around zero with no visible pattern.

tmp = pd.DataFrame({"y_test": y_test, "y_pred": y_pred_2})
print(tmp.tail(10)) # use display() on Jupyter

residuals = y_test - y_pred_2
plt.hist(residuals, bins=40)
plt.title("Histogram of residuals")
plt.tight_layout()
plt.show()

## NORMALISATION
##----------------
# It speeds up training, makes the coefficients easier to interpret, and
# is required for regularisation and hypotheses with interaction terms.

# Requires: "from sklearn.preprocessing import StandardScaler"

# Fit on train only, then transform both splits with the same statistics.
s = StandardScaler()
s.fit(X_train)
X_train_std = s.transform(X_train)
X_test_std = s.transform(X_test)

# One more linear regression, now on standardized features...
lin_reg_3 = LinearRegression(fit_intercept=True)
lin_reg_3.fit(X_train_std, y_train)

y_pred_3 = lin_reg_3.predict(X_test_std)

mae = mean_absolute_error(y_test, y_pred_3)
print("MAE:",mae)

## COEFFICIENT VISUALISATION
##----------------
plt.figure(figsize=(10,8))
plt.subplot(311)
plt.bar(lin_reg_2.feature_names_in_,lin_reg_2.coef_)
plt.xticks(rotation=45, ha="right")
plt.subplot(313)
plt.bar(lin_reg_2.feature_names_in_,lin_reg_3.coef_) # names taken from model 2 because model 3 was fitted on a plain ndarray
plt.xticks(rotation=45, ha="right")
plt.show()


# ===== Linear Regression 2 =====

# Imports.
# FIX: the original repeated the pandas/numpy/matplotlib/seaborn and
# sklearn import block twice in a row; deduplicated into one block that
# keeps the union of all originally imported names.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

## DATA PREPARATION
##----------------
# Goal: predict visibility in kilometres.
df_raw = pd.read_csv("weatherHistory.csv")
print("Dimenzije originalne baze:", df_raw.shape)

# Target variable.
y = df_raw["Visibility (km)"]

# Features we use
# (i.e. without Summary, Daily Summary, Formatted Date and Loud Cover,
# and of course without Visibility itself).
features = [
    'Precip Type',
    'Temperature (C)',
    'Apparent Temperature (C)',
    'Humidity',
    'Wind Speed (km/h)',
    'Wind Bearing (degrees)',
    'Pressure (millibars)'
]

X = df_raw[features].copy()

# Imputation of missing values.
# NOTE(review): unlike Linear Regression 1, 0-pressure readings are not
# converted to NaN first here, so the median fill only covers true NaNs.
X['Pressure (millibars)'] = X['Pressure (millibars)'].fillna\
    (X['Pressure (millibars)'].median())
X['Precip Type'] = X['Precip Type'].fillna("None")

# get_dummies for Precip Type.
X_dum = pd.get_dummies(X, drop_first=True)

print("Dimenzije X_dum:", X_dum.shape)
X_dum.head()

## SPLITS
##----------------

# Split into train, validation and test sets.
# First the usual train/test split...
X_trainval, X_test, y_trainval, y_test = train_test_split(
    X_dum, y, test_size=0.1, random_state=42)

# ...then carve the validation set out of the trainval part.
X_train, X_val, y_train, y_val = train_test_split(
    X_trainval, y_trainval, test_size = 0.1, random_state = 42
)

print("Train shape:", X_train.shape)
print("  Val shape:", X_val.shape)
print(" Test shape:", X_test.shape)

# Metrics helper:
# - n is the number of samples, p the number of predictors.
def print_regression_metrics(y_true, y_pred, n, p, label=""):
    """Print MAE, MSE, RMSE, R2 and adjusted R2 for a prediction.

    Computes the metrics directly with numpy (identical values to the
    sklearn functions used elsewhere in this file) and, as an
    improvement over the original print-only version, also RETURNS them
    as a dict so callers can assert on or collect the numbers.

    Parameters: y_true/y_pred array-likes of equal length, n = sample
    count, p = predictor count used for the adjusted-R2 correction,
    label = prefix for every printed line. Returns the metrics dict.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    err = y_true - y_pred
    mae = np.mean(np.abs(err))
    mse = np.mean(err ** 2)
    rmse = np.sqrt(mse)
    # R2 = 1 - SS_res / SS_tot (same definition sklearn's r2_score uses).
    ss_res = np.sum(err ** 2)
    ss_tot = np.sum((y_true - y_true.mean()) ** 2)
    r2 = 1 - ss_res / ss_tot
    r2_adj = 1-(1-r2)*(n-1)/(n-p-1)
    print(f"{label}MAE     : {mae:.3f}")
    print(f"{label}MSE     : {mse:.3f}")
    print(f"{label}RMSE    : {rmse:.3f}")
    print(f"{label}R2      : {r2:.3f}")
    print(f"{label}R2_adj  : {r2_adj:.3f}\n")
    return {"mae": mae, "mse": mse, "rmse": rmse, "r2": r2, "r2_adj": r2_adj}

# Baseline model: no standardisation, basic hypothesis.
lin_reg = LinearRegression(fit_intercept=True)
lin_reg.fit(X_train, y_train)

# Evaluate the hypothesis on the validation set.
y_val_pred = lin_reg.predict(X_val)

print_regression_metrics(y_val, y_val_pred, X_train.shape[0], 
                         X_train.shape[1], label="Val ")

lin_reg = LinearRegression(fit_intercept=True)
lin_reg.fit(X_trainval, y_trainval)

# Final training is done on train+val samples, testing on the test set.
y_test_pred = lin_reg.predict(X_test)
print_regression_metrics(y_test, y_test_pred, X_trainval.shape[0], 
                         X_trainval.shape[1], label="Test ")

## POLYNOMIAL FEATURES
##----------------

# Build quadratic / higher-degree features plus their interactions.
poly = PolynomialFeatures(degree=2, include_bias=False, interaction_only=False)

# Apply to ALL three splits; fit only on train.
X_train_poly = poly.fit_transform(X_train)
X_val_poly = poly.transform(X_val)
X_test_poly = poly.transform(X_test)

# Convert to DataFrames.
X_train_poly = pd.DataFrame(X_train_poly)
X_val_poly = pd.DataFrame(X_val_poly)
X_test_poly = pd.DataFrame(X_test_poly)

print("Dimensions after polynomial transform:")
print(X_train_poly.shape)
print("New feature names: ", poly.get_feature_names_out())
print(X_train_poly.head()) # use display() on Jupyter

# Standardise as the first step, because of the interaction terms:
# fit on the train data, then transform train/val/test.

scaler_poly = StandardScaler()

X_train_poly_std = scaler_poly.fit_transform(X_train_poly)
X_train_poly_std = pd.DataFrame(X_train_poly_std, columns=poly.get_feature_names_out(),
                                index=X_train.index)

# Analogously for the other two splits...
X_val_poly_std = scaler_poly.transform(X_val_poly)
# BUG FIX: the original passed `columns=poly.get_feature_names_out`
# (the bound method object itself, missing the call parentheses), which
# does not produce the feature-name columns.
X_val_poly_std = pd.DataFrame(X_val_poly_std, columns=poly.get_feature_names_out(),
                              index=X_val.index)

X_test_poly_std = scaler_poly.transform(X_test_poly)
X_test_poly_std = pd.DataFrame(X_test_poly_std, columns = poly.get_feature_names_out(),
                               index=X_test.index)

print(X_train_poly_std.head()) # use display() on Jupyter
print(X_train_poly.shape)
print(X_train_poly_std.shape)

# Linear regression with the higher-degree hypothesis.
lin_reg_poly = LinearRegression(fit_intercept=True)
lin_reg_poly.fit(X_train_poly_std, y_train)

y_val_poly = lin_reg_poly.predict(X_val_poly_std)

print("Linear model with PF on val subset:")
print_regression_metrics(y_val, y_val_poly, 
                         X_train_poly.shape[0], X_train_poly.shape[1], 
                         label="Val  ")

# Final training of the model with the chosen hyperparameters
# (train + val concatenated).
lin_reg_poly = LinearRegression(fit_intercept=True)
lin_reg_poly.fit(pd.concat([X_train_poly_std, X_val_poly_std], axis=0),
                 pd.concat([y_train, y_val], axis=0))

print("Linear model with PF on test subset:")
y_test_poly = lin_reg_poly.predict(X_test_poly_std)
print_regression_metrics(y_test, y_test_poly, 
                         X_train_poly.shape[0]+X_val_poly.shape[0], 
                         X_train_poly.shape[1], label="Test ")

# Coefficient visualisation.
plt.figure(figsize=(16,5))
plt.bar(poly.get_feature_names_out(), lin_reg_poly.coef_)
plt.xticks(rotation=45, ha="right")
plt.show()

# Underfitting: the model is too simple; poor on train and val/test.
# Overfitting: the model is too complex; good on train but poor on val/test.

# Standardisation does not change LR accuracy BUT it speeds up convergence
# and makes coefficient interpretation easier. With interaction terms --
# because the features live on different scales -- standardisation before
# training is necessary for the coefficient magnitudes to be meaningful.

## RIDGE AND LASSO
##----------------

# Standardisation is mandatory for regularised models.
# NOTE(review): this rebinds the name `scaler_poly` used above for the
# polynomial features -- confirm the earlier scaler is no longer needed.
scaler_poly = StandardScaler()

# Scale the numeric part of each of the three splits; the last two
# columns are assumed to be the Precip Type dummies and are left as-is.
X_train_std_part = scaler_poly.fit_transform(X_train.iloc[:, :-2])
X_train_std_part = pd.DataFrame(X_train_std_part, columns=X_train.columns[:-2],
                                index = X_train.index)
X_train_std = pd.concat([X_train_std_part, X_train.iloc[:, -2:]], axis=1)

X_val_std_part = scaler_poly.transform(X_val.iloc[:, :-2])
X_val_std_part = pd.DataFrame(X_val_std_part, columns = X_val.columns[:-2],
                              index = X_val.index)
X_val_std = pd.concat([X_val_std_part, X_val.iloc[:,-2:]], axis=1)

X_test_std_part = scaler_poly.transform(X_test.iloc[:, :-2])
X_test_std_part = pd.DataFrame(X_test_std_part, columns=X_test.columns[:-2], index=X_test.index)
X_test_std = pd.concat([X_test_std_part, X_test.iloc[:, -2:]], axis=1)

print(X_train_std.head()) # use display() on Jupyter
print(X_train.shape)
print(X_train_std.shape)

# Ridge with the basic hypothesis.
ridge = Ridge(alpha=3.0)
ridge.fit(X_train_std, y_train)
y_val_ridge = ridge.predict(X_val_std)

print("Ridge on basic hypothesis on val:")
print_regression_metrics(y_val, y_val_ridge, X_train.shape[0], X_train.shape[1],
                         label="Val  ")

# Lasso with the basic hypothesis.
lasso = Lasso(alpha=0.001)
lasso.fit(X_train_std, y_train)
y_val_lasso = lasso.predict(X_val_std)

print("Lasso on basic hypothesis on val:")
print_regression_metrics(y_val, y_val_lasso, X_train.shape[0], X_train.shape[1], 
                         label="Val  ")

# Now again, with the polynomial features.
ridge = Ridge(alpha=1.0)
ridge.fit(X_train_poly_std, y_train)

y_val_ridge = ridge.predict(X_val_poly_std)
print("Ridge on PF on val:")
print_regression_metrics(y_val, y_val_ridge, X_train_poly.shape[0], X_train_poly.shape[1], label="Val  ")

lasso = Lasso(alpha=0.001, max_iter=10000)
lasso.fit(X_train_poly_std, y_train)

y_val_lasso = lasso.predict(X_val_poly_std)
print("Lasso on PF on val:")
print_regression_metrics(y_val, y_val_lasso, X_train_poly.shape[0], X_train_poly.shape[1], label="Val  ")

# Final model training after selecting the best hyperparameters.
ridge_fin = Ridge(alpha=1.0)
ridge_fin.fit(pd.concat([X_train_poly_std,X_val_poly_std], axis=0), 
              pd.concat([y_train,y_val], axis=0))
y_test_ridge = ridge_fin.predict(X_test_poly_std)
print_regression_metrics(y_test, y_test_ridge, 
                         X_train_poly.shape[0], X_train_poly.shape[1], 
                         label="Test ")

## VISUALISATION AND COMPARISON OF RIDGE AND LASSO
##----------------

# Number of leading coefficients to compare:
n = 20
coef_lin = lin_reg_poly.coef_[:n]
coef_ridge = ridge.coef_[:n]
coef_lasso = lasso.coef_[:n]

indices = np.arange(n)

plt.figure(figsize=(12,6))
plt.plot(indices, coef_lin, marker='o', linestyle='', label='Bez regularizacije')
plt.plot(indices, coef_ridge, marker='x', linestyle='', label='Ridge')
plt.plot(indices, coef_lasso, marker='s', linestyle='', label='Lasso')
plt.xlabel("Indeks koeficijenta (prvih 20)")
plt.ylabel("Vrednost koeficijenta")
plt.title("Poređenje koeficijenata – Linear vs Ridge vs Lasso (PolynomialFeatures)")
plt.legend()
plt.tight_layout()
plt.show()


# ===== KNN 1 =====

#uvoz biblioteka
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb

from sklearn.datasets import load_breast_cancer #dataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    confusion_matrix, ConfusionMatrixDisplay, classification_report,
    roc_auc_score, RocCurveDisplay
)
from sklearn.pipeline import Pipeline

##TABLE PREPARATION + basic overview
##----------------
data = load_breast_cancer(as_frame=True)
X = data.data    # feature matrix (DataFrame, numeric columns)
y = data.target  # class labels (Series)

print("X shape:", X.shape)
print("y shape:", y.shape)
print("number of samples:", y.value_counts())
print(X.head()) #displayed in jupyter

##TRAIN-TEST-VALIDATION SPLIT
##----------------
#first the test set is carved out, leaving trainval
X_trainval, X_test, y_trainval, y_test= train_test_split(X, y,
    test_size=0.10, random_state=42, stratify=y)

#then the validation set is carved out of the remainder
#stratify keeps the class proportions similar in every subset
X_train, X_val, y_train, y_val = train_test_split(
    X_trainval, y_trainval, test_size = 0.10, random_state=42, stratify=y_trainval)

print("Train:", X_train.shape, "Val:", X_val.shape, "Test:", X_test.shape)
print("Train classes:", y_train.value_counts())
print("Val classes:", y_val.value_counts())
print("Test classes:", y_test.value_counts())

##KNN
##----------------

#scaling first — normalization IS required for a distance-based model.
# BUG FIX: the scaler was never fitted before transform(), which raises
# NotFittedError. Fit on the training data only, so validation/test
# statistics do not leak into the scaling.
scaler = StandardScaler()
X_train_s = scaler.fit_transform(X_train)
X_val_s = scaler.transform(X_val)
X_test_s = scaler.transform(X_test)

knn = KNeighborsClassifier(n_neighbors=5, metric="euclidean")
knn.fit(X_train_s, y_train)

#sanity check: first 10 validation predictions vs ground truth
y_val_pred = knn.predict(X_val_s)
for i in range(10):
    print(f"{i:02d}  true={y_val.iloc[i]}  pred={y_val_pred[i]}")

##PERFORMANCE MEASURES (confusion matrix)
##----------------
cm = confusion_matrix(y_val, y_val_pred)
ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=data.target_names).plot(values_format="d")

plt.title("Confusion matrix (VAL) | k=5")
plt.show()

#scalar metrics on the validation set
acc = accuracy_score(y_val, y_val_pred)
prec = precision_score(y_val, y_val_pred)
rec = recall_score(y_val, y_val_pred)
f1 = f1_score(y_val, y_val_pred)

print(f"\nVAL | accuracy={acc:.2f} precision={prec:.2f} recall={rec:.2f} f1={f1:.2f}")

##HYPERPARAMETER INFLUENCE ON THE VALIDATION SET
##----------------

#euclidean distance: sweep k = 1..30 and score each model by F1 on VAL
k_list = list(range(1,31))
val_f1 = []

for k in k_list:
    candidate = KNeighborsClassifier(n_neighbors=k, metric="euclidean")
    candidate.fit(X_train_s, y_train)
    val_f1.append(f1_score(y_val, candidate.predict(X_val_s)))

#pick the k with the highest validation F1
best_idx = int(np.argmax(val_f1))
best_k = k_list[best_idx]
best_f1 = float(val_f1[best_idx])

print("best k per F1 on VAL for euclidean distance:", best_k, "F1:", round(best_f1, 2))

plt.figure(figsize=(9,4))
plt.plot(k_list, val_f1, marker="o")
plt.xlabel("k (n_neighbors)")
plt.ylabel("F1 on validation subset")
plt.title("k choice on validation subset")
plt.grid(True, alpha=0.3)
plt.show()

#manhattan (taxicab) distance: same sweep as above
k_list = list(range(1,31))
val_f1 = []

for k in k_list:
    knn_k = KNeighborsClassifier(n_neighbors=k, metric="manhattan")
    knn_k.fit(X_train_s, y_train)
    pred = knn_k.predict(X_val_s)
    val_f1.append(f1_score(y_val, pred))

best_k = k_list[int(np.argmax(val_f1))]
best_f1 = float(np.max(val_f1))

# BUG FIX: this sweep uses the manhattan metric, but the message claimed
# "euclidean distance" (copy-paste from the previous cell).
print("best k per F1 on VAL for manhattan distance:", best_k, "F1:", round(best_f1, 2))

plt.figure(figsize=(9,4))
plt.plot(k_list, val_f1, marker="o")
plt.xlabel("k (n_neighbors)")
plt.ylabel("F1 on validation subset")
plt.title("k choice on validation subset")
plt.grid(True, alpha=0.3)
plt.show()

##TRAINING THE FINAL MODEL ON (train+val)
##----------------
X_train_final = pd.concat([pd.DataFrame(X_train_s), pd.DataFrame(X_val_s)], axis=0)
y_train_final = pd.concat([y_train, y_val], axis=0)

# BUG FIX: n_neighbors was hard-coded to 1 while the plot title below
# reported k={best_k}; use the selected best_k so the model and the report
# agree. NOTE(review): best_k at this point comes from the *manhattan*
# sweep above, yet the final metric is euclidean — confirm which sweep
# was intended to drive the final model.
knn_final = KNeighborsClassifier(n_neighbors=best_k, metric="euclidean", weights="distance")
knn_final.fit(X_train_final, y_train_final)

y_test_pred = knn_final.predict(X_test_s)

cm = confusion_matrix(y_test, y_test_pred)
ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=data.target_names).plot(values_format="d")
plt.title(f"Confusion matrix (TEST) | k={best_k}")
plt.show()

#final scalar metrics on the untouched test set
acc = accuracy_score(y_test, y_test_pred)
prec = precision_score(y_test, y_test_pred)
rec = recall_score(y_test, y_test_pred)
f1 = f1_score(y_test, y_test_pred)

print(f"\nTEST | accuracy={acc:.2f} precision={prec:.2f} recall={rec:.2f} f1={f1:.2f}")

##PIPELINE
##----------------
from sklearn.pipeline import Pipeline

#train/val/test split (same proportions as before)
X_trainval, X_test, y_trainval, y_test = train_test_split(
    X, y, test_size=0.10, random_state=42, stratify=y)

X_train, X_val, y_train, y_val = train_test_split(
    X_trainval, y_trainval, test_size=0.10, random_state=42, stratify=y_trainval)

#pipeline definition: scaler + kNN in one estimator, so scaling is always
#fitted on exactly the data the model is trained on
pipe = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("knn", KNeighborsClassifier())
])

#try a hyperparameter combination on the validation set
chosen_k = 3
pipe.set_params(knn__n_neighbors=chosen_k, knn__metric='manhattan', knn__weights='distance').fit(X_train, y_train)
y_val_pred = pipe.predict(X_val)

print("F1 (VAL):", f1_score(y_val, y_val_pred))

#once the optimal parameters are fixed, the final model is trained on
#train+val by reusing the already-built pipeline

pipe.set_params(knn__n_neighbors=chosen_k, knn__metric='manhattan', knn__weights='distance').fit(X_trainval, y_trainval)
y_test_pred = pipe.predict(X_test)

print("F1 (TEST):", f1_score(y_test, y_test_pred))

cm2 = confusion_matrix(y_test, y_test_pred)
ConfusionMatrixDisplay(confusion_matrix=cm2, display_labels=data.target_names).plot(values_format="d")
# BUG FIX: the title previously hard-coded k=1 while the model uses k=3;
# report the actual value.
plt.title(f"Confusion matrix (TEST) | k={chosen_k}")
plt.show()

##ROC CURVE
##----------------
#guarded in case the final estimator has no probability output; when it
#does, use the positive-class score for ROC/AUC on the test set
if hasattr(pipe.named_steps["knn"], "predict_proba"):
    y_score = pipe.predict_proba(X_test)[:, 1]  # probability of class 1
    auc = roc_auc_score(y_test, y_score)
    print("ROC-AUC (TEST):", round(auc, 2))
    RocCurveDisplay.from_predictions(y_test, y_score)
    plt.title("ROC kriva (TEST)")
    plt.show()

##X
##----------------


#uvoz biblioteka
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import seaborn as sb

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import (
    confusion_matrix, ConfusionMatrixDisplay,
    classification_report, accuracy_score
)

from sklearn.model_selection import StratifiedKFold, GridSearchCV

##LOADING AND FIRST LOOK
##----------------
df = sb.load_dataset("penguins")

print(df.shape)
print(df.isna().sum())
print(df.head()) #displayed in jupyter
print(df.describe()) #displayed in jupyter

#clean-up: first drop the rows containing missing values, then reindex
df = df.dropna(axis=0).reset_index(drop=True)

print(df.shape)
print(df['island'].unique())
print(df['sex'].unique())
print(df['species'].value_counts())
print(df.head()) #displayed in jupyter

##DEFINING THE TARGET AND THE FEATURES
##----------------
#the target variable is species
#kNN cannot consume categorical text directly — it must be converted to
#numeric form via one-hot encoding

target = "species"

num_cols = ["bill_length_mm", "bill_depth_mm", "flipper_length_mm", "body_mass_g"]
cat_cols = ["island", "sex"]

X = df[num_cols + cat_cols].copy()
y = df[target].copy()

print("X shape:", X.shape)
print("y shape:", y.shape)
print("classes:", y.unique())

##TRAIN-TEST-VALIDATION SPLIT
##----------------
#the test set is separated first, then the remainder is split into train
#and validation, with stratify=y so class proportions stay similar everywhere
X_trainval, X_test, y_trainval, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

X_train, X_val, y_train, y_val = train_test_split(
    X_trainval, y_trainval, test_size=0.2, random_state=42, stratify=y_trainval
)

#0.2 * 0.8 = 0.16, so the actual split is 64/16/20 (train/val/test).
#(The original comment claimed 60/20/20 — that would require test_size=0.25
#in the second split.)
print("Train:", X_train.shape, y_train.shape)
print("Val:  ", X_val.shape, y_val.shape)
print("Test: ", X_test.shape, y_test.shape)

# BUG FIX: "\C" is not a valid escape, so the old string printed a literal
# backslash; a newline was intended.
print("\nClass distributions (train/val/test):")
print("Train:\n", y_train.value_counts(normalize=True).round(1))
print("Val:\n", y_val.value_counts(normalize=True).round(1))
print("Test:\n", y_test.value_counts(normalize=True).round(1))

# numeric features: euclidean / manhattan distance apply directly
# categorical features: need encoding (dice / jaccard style similarity)

# StandardScaler only on the numeric columns, OneHotEncoder only on the
# categorical ones

preprocess = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), num_cols),
        # handle_unknown="ignore": a category that never appeared in the
        # training fold would otherwise make transform() raise at predict
        # time (relevant for the CV folds below)
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
    ],
    remainder="drop"
)

##PIPELINE
##----------------
pipe = Pipeline([
    ("prep", preprocess),
    ("knn", KNeighborsClassifier(metric='euclidean'))
])

#try several k values and record validation accuracy for each
k_values = [1, 2, 3, 4, 5, 10, 20, 50]
val_acc = []

for k in k_values:
    pipe.set_params(knn__n_neighbors=k)
    pipe.fit(X_train, y_train)
    y_val_pred = pipe.predict(X_val)
    acc = accuracy_score(y_val, y_val_pred)
    val_acc.append(acc)

# BUG FIX: the summary table was a bare expression — it displays in a
# Jupyter cell but is silently discarded when run as a script; print it.
print(pd.DataFrame({"k": k_values, "val_accuracy": val_acc})
        .sort_values("val_accuracy", ascending=False))

#visualize the influence of k on the validation set
plt.figure(figsize=(8, 4))
plt.plot(k_values, val_acc, marker="o")
plt.xlabel("k (n_neighbors)")
plt.ylabel("Tačnost na validacionom skupu")
plt.title("Uticaj k na tačnost")
plt.grid(True)
plt.show()

best_k = k_values[int(np.argmax(val_acc))]
print("best k on validation set:", best_k)

##FINAL MODEL 
##----------------
#train with the selected k on train+val, then evaluate once on the test set
pipe.set_params(knn__n_neighbors = best_k)
pipe.fit(X_trainval, y_trainval)

y_test_pred = pipe.predict(X_test)
test_acc = accuracy_score(y_test, y_test_pred)
print("Accuracy (test):", round(test_acc, 3))

##CONFUSION MATRIX (3x3)
##----------------
#sorted class labels so rows/columns have a deterministic order
labels = sorted(y.unique())

cm = confusion_matrix(y_test, y_test_pred, labels=labels)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)

# BUG FIX: ConfusionMatrixDisplay.plot() opens its own figure when no axes
# is supplied, so the earlier bare plt.figure(figsize=(6, 6)) left an empty
# stray window; pass an explicit axes instead.
fig, ax = plt.subplots(figsize=(6, 6))
disp.plot(ax=ax, cmap=None, values_format="d")
plt.title("Confusion matrix (TEST)")
plt.show()

print(classification_report(y_test, y_test_pred, digits=3))

##CROSS-VALIDATION (manual)
##----------------
#the train+val data is split into ten folds: nine train, one validates;
#repeated ten times so every fold serves as the validation fold exactly once

skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

k_for_cv = best_k
pipe_cv = Pipeline([
    ("prep", preprocess),
    ("knn", KNeighborsClassifier(n_neighbors=k_for_cv, metric='euclidean'))
])

cv_scores = []
for fold_idx, (tr_rows, va_rows) in enumerate(skf.split(X_trainval, y_trainval), start=1):
    fold_X_tr = X_trainval.iloc[tr_rows]
    fold_X_va = X_trainval.iloc[va_rows]
    fold_y_tr = y_trainval.iloc[tr_rows]
    fold_y_va = y_trainval.iloc[va_rows]

    pipe_cv.fit(fold_X_tr, fold_y_tr)
    acc = accuracy_score(fold_y_va, pipe_cv.predict(fold_X_va))
    cv_scores.append(acc)

    print(f"Fold {fold_idx:02d} accuracy: {acc:.3f}")

#aggregate fold scores into a mean +/- spread summary
print("\nCV mean accuracy:", round(np.mean(cv_scores), 3))
print("CV std  accuracy:", round(np.std(cv_scores), 3))

##GRIDSEARCHCV + PIPELINE
##----------------
#exhaustive search over kNN hyperparameters, scored with the same
#stratified 10-fold CV object as the manual loop
param_grid = {
    "knn__n_neighbors": [1, 2, 3, 4, 5, 10, 20, 50],
    "knn__weights": ["uniform", "distance"],
    # BUG FIX: the pipe's KNN was created with metric='euclidean', under
    # which the p parameter has no effect; force metric='minkowski' here so
    # p=1/p=2 actually selects Manhattan/Euclidean as intended.
    "knn__metric": ["minkowski"],
    "knn__p": [1, 2],  # p=1 → Manhattan, p=2 → Euclidean (Minkowski)
}

grid = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring="accuracy",
    cv=skf,
    n_jobs=-1,
    verbose=0
)

grid.fit(X_trainval, y_trainval)

print("Best parameters:", grid.best_params_)
print("Best CV score:", round(grid.best_score_, 3))

#final evaluation on the untouched test set
best_model = grid.best_estimator_

y_test_pred_gs = best_model.predict(X_test)

print("Accuracy (TEST, GridSearch):", round(accuracy_score(y_test, y_test_pred_gs), 3))
print("\nClassification report (TEST, GridSearch):")
print(classification_report(y_test, y_test_pred_gs, digits=3))

cm2 = confusion_matrix(y_test, y_test_pred_gs, labels=labels)
disp2 = ConfusionMatrixDisplay(confusion_matrix=cm2, display_labels=labels)

# BUG FIX: plot() opens its own figure when no axes is supplied, so the
# bare plt.figure(figsize=(6, 6)) left an empty stray window behind.
fig2, ax2 = plt.subplots(figsize=(6, 6))
disp2.plot(ax=ax2, cmap=None, values_format="d")
plt.title("Matrica konfuzije (TEST) — GridSearch model")
plt.show()

##X
##----------------


























hmm

* izbacivanje X i Y. uzorka i baze:
df.drop([X], inplace=True, axis=0)
df.drop([Y], inplace=True, axis=0)

* LR sa trening i test, 10% u test skup a random_state = 10:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.1, random_state = 10)

* model LR sa osnovnom hipotezom i to bez regularizacije:
model = LinearRegression(fit_intercept=True)
model.fit(X_train,y_train)

* kNN klasifikator, euklid i 1 sused:
knn = KNeighborsClassifier(n_neighbors=1, metric="euclidean")
knn.fit(X_train_s, y_train)

* predikcija na test skupu i odgovoriti koliko ima pravih pozitiva (gornji levi ugao)
y_pred = knn.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
true_positives=cm[1,1]

acc = accuracy_score(y_val, y_val_pred)
prec = precision_score(y_val, y_val_pred)
rec = recall_score(y_val, y_val_pred)
f1 = f1_score(y_val, y_val_pred)

print(f"\nVAL | accuracy={acc:.2f} precision={prec:.2f} recall={rec:.2f} f1={f1:.2f}")...
(izmeniti...)

*izbacivanje obelezja A iz baze:
df.drop(["A"], inplace=True,axis=1)

*predikcija izlaza i mse.
y_predicted = model.predict(X_test)
mse = mean_squared_error(y_test, y_predicted)

* LR sa trening i test, 10% u test skup a random_state = 10, uz originalnu zastupljenost:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.1, random_state = 10, stratify=Y)

*model lin regresije sa osnovnom hipotezom gde je ridge reguular sa parametrom alpha=5.

model_ridge = Ridge(alpha=5)
model_ridge.fit(X_train, y_train)

*(u nekoj bazi) uzorak predstavlja skup obelezja koja ga opisuju i na kojem obucavamo model

*popuniti nedostajuce vrednosti obelezja hum medijanom:
df["hum"] = df["hum"].fillna(df["hum"].median())
Q1(25%)=0.48

*predikcija na test skupu:
y_pred=classifier.predict(X_test)

end