# Linear Regression 1
#uvoz biblioteka
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler #zbog normalizacije
## TABLE PREPARATION
##----------------
# Goal: predict visibility in kilometres.
df_raw = pd.read_csv("weatherHistory.csv")
# dataset dimensions
print(df_raw.shape)
df_raw.head()
# --- analysis of the target variable ---
# are there any NaN values?
y = df_raw["Visibility (km)"]
print("number of missing values: ",y.isna().sum())
# basic descriptive statistics
print(y.describe()) # debug/inspection print
# histogram of the target distribution
plt.hist(y, bins=40)
plt.xlabel("Visibility")
plt.ylabel("Frequency")
plt.title("Visibility histogram")
plt.tight_layout()
plt.show()
# --- choosing a feature subset for the regressor ---
print("Summary: ", df_raw["Summary"].nunique())
print("Precip Type: ", df_raw["Precip Type"].nunique())
print("Daily Summary: ", df_raw["Daily Summary"].nunique())
# "Summary" and "Daily Summary" have too many categories, which makes them
# awkward to encode in this form — and awkward for linear regression — so
# they are dropped.
# "Formatted Date" cannot be used as-is; it could be split into
# day/month/year, but here it is simply dropped as well.
# The remaining columns can be listed like this:
print(df_raw.columns)
# These are used to predict visibility. The target column itself MUST NOT
# appear among the predictors, so we keep everything except the dropped
# columns and "Visibility (km)".
features = ["Precip Type", "Temperature (C)", "Apparent Temperature (C)",
"Humidity", "Wind Speed (km/h)", "Wind Bearing (degrees)", "Loud Cover",
"Pressure (millibars)"]
X = df_raw[features].copy()
print("Dimenzije izabranog podskupa:", X.shape)
X.head()
# Check for missing values among the selected features.
print(X.isna().sum())
X["Precip Type"].unique()
# hmm... a NaN value shows up in "Precip Type"...
print(X.describe())  # descriptive look at the numeric columns
plt.figure(figsize=(10, 8))
# Histogram every numeric column (everything after "Precip Type").
for plot_idx, column in enumerate(X.columns[1:], start=1):
    plt.subplot(2, 4, plot_idx)
    plt.hist(X[column], bins=30)
    plt.title(column)
plt.tight_layout()
plt.show()
# "Loud Cover" (sic — cloud cover) contains only zeros, which carries no
# information, so it is dropped.
# "Pressure (millibars)" has values equal to 0 — those are really missing
# readings, so they are converted to NaN and imputed with the median.
X = X.drop("Loud Cover", axis=1)
# FIX: the original used chained `Series.replace(0, np.nan, inplace=True)`,
# which operates on an intermediate object — pandas emits a FutureWarning and
# under copy-on-write (pandas 3.x) the mutation is silently lost. Assign the
# result back to the column instead.
X["Pressure (millibars)"] = X["Pressure (millibars)"].replace(0, np.nan)
# percentage of missing values per column
print(X.isnull().sum()/len(X)*100)
# impute pressure with the column median
X['Pressure (millibars)'] = X['Pressure (millibars)'].fillna(X['Pressure (millibars)'].median())
print(X.isnull().sum()/len(X)*100)
# The regressor needs numeric inputs, so the categorical column must be
# converted into numeric variables.
# NaN precipitation is treated as its own category, "None".
X["Precip Type"] = X["Precip Type"].fillna("None")
print("Number of categories for Precip Type:", X["Precip Type"].nunique())
X["Precip Type"].value_counts()
# Convert the categories into dummy (one-hot) features; drop_first=True
# drops one level to avoid perfect collinearity:
print("Columns before dummies:", X.shape[1])
X_dummies = pd.get_dummies(X, drop_first=True)
print("Columns after dummies:", X_dummies.shape[1])
X_dummies.head()
## TRAIN / TEST SPLIT
##----------------
# Hold out 10% of the samples for testing; fix the seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X_dummies, y, test_size=0.1, random_state=42
)
print("Train shape:", X_train.shape)
print(" Test shape:", X_test.shape)
## SIMPLE LINEAR REGRESSION
##----------------
# Only "Humidity" is used, to get an intuitive picture of the regression
# line. The hypothesis is y = θ_1 * x + θ_0, and the goal of the regression
# is to find the optimal thetas, i.e. to train the model's internal
# parameters.
X1 = X[["Humidity"]]
X1_train, X1_test, y1_train, y1_test = \
    train_test_split(X1, y, test_size=0.1, random_state=42)
lin_reg_1 = LinearRegression(fit_intercept=True)
lin_reg_1.fit(X1_train, y1_train)
# learned slope and intercept
print(lin_reg_1.coef_[0])
print(lin_reg_1.intercept_)
# predictions on both splits
y1_pred_train = lin_reg_1.predict(X1_train)
y1_pred_test = lin_reg_1.predict(X1_test)
# FIX: use the already-imported mean_absolute_error for consistency with the
# rest of the script (numerically identical to np.mean(np.abs(...))).
print("MAE train:",mean_absolute_error(y1_train, y1_pred_train))
print("MAE test :",mean_absolute_error(y1_test, y1_pred_test))
tmp = pd.DataFrame({"y_test": y1_test, "y_pred": y1_pred_test})
print(tmp.head(10))  # spot-check predictions against ground truth
# visualisation
plt.figure(figsize=(7,5))
# first 20 samples from the test set
plt.scatter(X1_test["Humidity"][:20], y1_test[:20], alpha=0.3, label="Test")
# FIX: predict on a DataFrame carrying the column name the model was fitted
# with; passing a bare ndarray triggers sklearn's "X does not have valid
# feature names" warning.
x_line = pd.DataFrame(
    {"Humidity": np.linspace(X1_train["Humidity"].min(),
                             X1_train["Humidity"].max(), 200)})
y_line = lin_reg_1.predict(x_line)
# illustration of the fitted REGRESSOR line
plt.plot(x_line, y_line, color="red")
plt.xlabel("Humidity")
plt.ylabel("Visibility (km)")
plt.title("Simple linear regression")
plt.tight_layout()
plt.show()
## LINEAR REGRESSION WITH THE BASIC HYPOTHESIS AND MULTIPLE PREDICTORS
##----------------
# i.e. using all numeric and dummy features, with hypothesis
# y = θ_n * x_n + ... θ_1 * x_1 + θ_0
lin_reg_2 = LinearRegression(fit_intercept=True)
lin_reg_2.fit(X_train, y_train)
y_pred_2 = lin_reg_2.predict(X_test)
mae = mean_absolute_error(y_test, y_pred_2)
print("MAE:",mae)
# If the assumptions of the linear model hold, the residuals should be
# distributed symmetrically around zero, with no visible pattern.
tmp = pd.DataFrame({"y_test": y_test, "y_pred": y_pred_2})
print(tmp.tail(10)) # spot-check predictions against ground truth
residuals = y_test - y_pred_2
plt.hist(residuals, bins=40)
plt.title("Histogram of residuals")
plt.tight_layout()
plt.show()
## NORMALISATION
##----------------
# Standardisation speeds up training, makes the coefficients easier to
# interpret, and is necessary for regularisation and hypotheses with
# interaction terms.
# (requires: "from sklearn.preprocessing import StandardScaler")
scaler = StandardScaler()
# Fit the scaler on the training data only, then apply the same transform
# to both splits.
X_train_std = scaler.fit_transform(X_train)
X_test_std = scaler.transform(X_test)
# ...and one more linear regression, now on standardised features....
lin_reg_3 = LinearRegression(fit_intercept=True)
lin_reg_3.fit(X_train_std, y_train)
y_pred_3 = lin_reg_3.predict(X_test_std)
mae = mean_absolute_error(y_test, y_pred_3)
print("MAE:",mae)
## COEFFICIENT VISUALISATION
##----------------
plt.figure(figsize=(10,8))
plt.subplot(311)
# raw-feature coefficients (model 2)
plt.bar(lin_reg_2.feature_names_in_,lin_reg_2.coef_)
plt.xticks(rotation=45, ha="right")
plt.subplot(313)
# standardised-feature coefficients (model 3); names are taken from model 2
# because model 3 was fitted on a plain ndarray, which carries no column names
plt.bar(lin_reg_2.feature_names_in_,lin_reg_3.coef_)
plt.xticks(rotation=45, ha="right")
plt.show()