# Linear Regression 2

#uvoz biblioteka
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler #zbog normalizacije

##PRIPREMA TABELE
##----------------
#predvidjanje vidljivosti u kilometrima
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Load the raw dataset.
df_raw = pd.read_csv("weatherHistory.csv")
print("Dimenzije originalne baze:", df_raw.shape)

# Target variable: visibility in kilometres.
y = df_raw["Visibility (km)"]

# Predictor columns. Summary, Daily Summary, Formatted Date, Loud Cover
# and Visibility itself are deliberately excluded.
features = [
    'Precip Type',
    'Temperature (C)',
    'Apparent Temperature (C)',
    'Humidity',
    'Wind Speed (km/h)',
    'Wind Bearing (degrees)',
    'Pressure (millibars)'
]

X = df_raw[features].copy()

# Impute missing values: pressure -> column median, precip type -> "None"
# (string sentinel, becomes its own category below).
pressure_median = X['Pressure (millibars)'].median()
X['Pressure (millibars)'] = X['Pressure (millibars)'].fillna(pressure_median)
X['Precip Type'] = X['Precip Type'].fillna("None")

# One-hot encode the categorical precip type, dropping the first level.
X_dum = pd.get_dummies(X, drop_first=True)

print("Dimenzije X_dum:", X_dum.shape)
X_dum.head()

##X
##----------------

# Train / validation / test split.
# Step 1: hold out the final test set from the full data.
X_trainval, X_test, y_trainval, y_test = train_test_split(
    X_dum, y, test_size=0.1, random_state=42)

# Step 2: carve a validation set out of the remaining train+val part.
X_train, X_val, y_train, y_val = train_test_split(
    X_trainval, y_trainval, test_size=0.1, random_state=42)

print("Train shape:", X_train.shape)
print("  Val shape:", X_val.shape)
print(" Test shape:", X_test.shape)

# Metric reporting helper:
# - n is the number of rows, p the number of columns (features)
def print_regression_metrics(y_true, y_pred, n, p, label=""):
    """Print MAE, MSE, RMSE, R2 and adjusted R2 for one prediction set.

    `n` (sample count) and `p` (feature count) feed the adjusted-R2
    correction; `label` prefixes every printed line.
    """
    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_true, y_pred)
    r2_adj = 1 - (1 - r2) * (n - 1) / (n - p - 1)
    # ":<8" reproduces the fixed-width column layout of each row.
    for name, value in (("MAE", mae), ("MSE", mse), ("RMSE", rmse), ("R2", r2)):
        print(f"{label}{name:<8}: {value:.3f}")
    print(f"{label}{'R2_adj':<8}: {r2_adj:.3f}\n")

# Baseline: plain linear regression on the unscaled features,
# evaluated on the validation set.
lin_reg = LinearRegression(fit_intercept=True)
lin_reg.fit(X_train, y_train)

y_val_pred = lin_reg.predict(X_val)

print_regression_metrics(y_val, y_val_pred, X_train.shape[0],
                         X_train.shape[1], label="Val ")

# Final fit uses train+val together; the held-out test set gives the
# unbiased performance estimate.
lin_reg = LinearRegression(fit_intercept=True)
lin_reg.fit(X_trainval, y_trainval)

y_test_pred = lin_reg.predict(X_test)
print_regression_metrics(y_test, y_test_pred, X_trainval.shape[0],
                         X_trainval.shape[1], label="Test ")

##POLYNOMIAL FEATURES
##----------------

# Degree-2 expansion: squares of each feature plus pairwise interactions.
poly = PolynomialFeatures(degree=2, include_bias=False, interaction_only=False)

# Fit the expansion on train only, then apply the same mapping to every
# split; wrap each result back into a DataFrame.
X_train_poly = pd.DataFrame(poly.fit_transform(X_train))
X_val_poly = pd.DataFrame(poly.transform(X_val))
X_test_poly = pd.DataFrame(poly.transform(X_test))

print("Dimensions after polynomial transform:")
print(X_train_poly.shape)
print("New feature names: ", poly.get_feature_names_out())
print(X_train_poly.head())  # debug peek

# Standardise the polynomial features (the interaction terms live on very
# different scales). The scaler is fitted on train only and then applied
# to train/val/test.

scaler_poly = StandardScaler()

# BUG FIX: the original passed `poly.get_feature_names_out` (the bound
# method itself, not its return value) as `columns` for the validation
# frame, which is not a valid column collection. Call it once and reuse.
poly_feature_names = poly.get_feature_names_out()

X_train_poly_std = scaler_poly.fit_transform(X_train_poly)
X_train_poly_std = pd.DataFrame(X_train_poly_std, columns=poly_feature_names,
                                index=X_train.index)

# Same treatment for validation and test (transform only, no refit).
X_val_poly_std = scaler_poly.transform(X_val_poly)
X_val_poly_std = pd.DataFrame(X_val_poly_std, columns=poly_feature_names,
                              index=X_val.index)

X_test_poly_std = scaler_poly.transform(X_test_poly)
X_test_poly_std = pd.DataFrame(X_test_poly_std, columns=poly_feature_names,
                               index=X_test.index)

print(X_train_poly_std.head())  # debug peek
print(X_train_poly.shape)
print(X_train_poly_std.shape)

# Linear regression with the higher-degree hypothesis.
lin_reg_poly = LinearRegression(fit_intercept=True)
lin_reg_poly.fit(X_train_poly_std, y_train)

y_val_poly = lin_reg_poly.predict(X_val_poly_std)

print("Linear model with PF on val subset:")
print_regression_metrics(y_val, y_val_poly, X_train_poly.shape[0],
                         X_train_poly.shape[1], label="Val  ")

# Final fit with the chosen hyper-parameters: train on train+val,
# report on the untouched test set.
lin_reg_poly = LinearRegression(fit_intercept=True)
X_fit = pd.concat([X_train_poly_std, X_val_poly_std], axis=0)
y_fit = pd.concat([y_train, y_val], axis=0)
lin_reg_poly.fit(X_fit, y_fit)

print("Linear model with PF on test subset:")
y_test_poly = lin_reg_poly.predict(X_test_poly_std)
print_regression_metrics(y_test, y_test_poly,
                         X_train_poly.shape[0] + X_val_poly.shape[0],
                         X_train_poly.shape[1], label="Test ")

# Visualise the learned coefficient per polynomial feature.
plt.figure(figsize=(16, 5))
plt.bar(poly.get_feature_names_out(), lin_reg_poly.coef_)
plt.xticks(rotation=45, ha="right")
plt.show()

#underfitting se desava kada je model suvise jednostavan; los na train i val/test
#overfitting se desava kada je model suvise slozen; dobar na trainu ali los na val/test

#standardizacija ne utice na uspesnost LR ALI doprinosi brzini konvergencije i da je
#interpretacija koeficijenata zavisnosti laksa. kad postoje interakcije - zbog razlicitih
#skala obelezja - pre obuke modela neophodna je standardizacija zbog znacaja koeficijenata

##RIDGE I LASSO
##----------------

# Regularised models require standardised inputs. Only the numeric part
# is scaled; the last two columns are the one-hot precip-type dummies
# and are passed through untouched.
scaler_basic = StandardScaler()

numeric_cols = X_train.columns[:-2]

train_scaled = scaler_basic.fit_transform(X_train.iloc[:, :-2])
X_train_std = pd.concat(
    [pd.DataFrame(train_scaled, columns=numeric_cols, index=X_train.index),
     X_train.iloc[:, -2:]],
    axis=1)

val_scaled = scaler_basic.transform(X_val.iloc[:, :-2])
X_val_std = pd.concat(
    [pd.DataFrame(val_scaled, columns=X_val.columns[:-2], index=X_val.index),
     X_val.iloc[:, -2:]],
    axis=1)

test_scaled = scaler_basic.transform(X_test.iloc[:, :-2])
X_test_std = pd.concat(
    [pd.DataFrame(test_scaled, columns=X_test.columns[:-2], index=X_test.index),
     X_test.iloc[:, -2:]],
    axis=1)

print(X_train_std.head())  # debug peek
print(X_train.shape)
print(X_train_std.shape)

# Ridge on the basic (non-polynomial) hypothesis, scored on validation.
ridge = Ridge(alpha=3.0)
ridge.fit(X_train_std, y_train)
y_val_ridge = ridge.predict(X_val_std)

print("Ridge on basic hypothesis on val:")
print_regression_metrics(y_val, y_val_ridge, X_train.shape[0],
                         X_train.shape[1], label="Val  ")

# Lasso on the same basic hypothesis.
lasso = Lasso(alpha=0.001)
lasso.fit(X_train_std, y_train)
y_val_lasso = lasso.predict(X_val_std)

print("Lasso on basic hypothesis on val:")
print_regression_metrics(y_val, y_val_lasso, X_train.shape[0],
                         X_train.shape[1], label="Val  ")

# The same regularisers, now on the polynomial feature set.
ridge = Ridge(alpha=1.0)
ridge.fit(X_train_poly_std, y_train)
y_val_ridge = ridge.predict(X_val_poly_std)

print("Ridge on PF on val:")
print_regression_metrics(y_val, y_val_ridge, X_train_poly.shape[0],
                         X_train_poly.shape[1], label="Val  ")

# Lasso needs extra iterations to converge on the expanded feature set.
lasso = Lasso(alpha=0.001, max_iter=10000)
lasso.fit(X_train_poly_std, y_train)
y_val_lasso = lasso.predict(X_val_poly_std)

print("Lasso on PF on val:")
print_regression_metrics(y_val, y_val_lasso, X_train_poly.shape[0],
                         X_train_poly.shape[1], label="Val  ")

# Final Ridge model: retrain on train+val with the selected alpha and
# evaluate once on the held-out test set.
ridge_fin = Ridge(alpha=1.0)
ridge_fin.fit(pd.concat([X_train_poly_std, X_val_poly_std], axis=0),
              pd.concat([y_train, y_val], axis=0))
y_test_ridge = ridge_fin.predict(X_test_poly_std)
# FIX: `n` must be the row count of the data the model was fitted on
# (train + val), matching the final linear model's report above; the
# original passed only the train count, skewing adjusted R2.
print_regression_metrics(y_test, y_test_ridge,
                         X_train_poly.shape[0] + X_val_poly.shape[0],
                         X_train_poly.shape[1],
                         label="Test ")

##VIZUELIZACIJA I POREDJENJE RIDGE I LASSO
##----------------

# Compare the leading coefficients of the three models fitted on the
# polynomial features; n is how many coefficients are shown.
n = 20
coef_lin = lin_reg_poly.coef_[:n]
coef_ridge = ridge.coef_[:n]
coef_lasso = lasso.coef_[:n]

indices = np.arange(n)

plt.figure(figsize=(12, 6))
for coefs, marker, label in (
        (coef_lin, 'o', 'Bez regularizacije'),
        (coef_ridge, 'x', 'Ridge'),
        (coef_lasso, 's', 'Lasso')):
    plt.plot(indices, coefs, marker=marker, linestyle='', label=label)
plt.xlabel("Indeks koeficijenta (prvih 20)")
plt.ylabel("Vrednost koeficijenta")
plt.title("Poređenje koeficijenata – Linear vs Ridge vs Lasso (PolynomialFeatures)")
plt.legend()
plt.tight_layout()
plt.show()