# Importing Libraries:
import pandas as pd
import numpy as np
from scipy.stats import shapiro
from statsmodels.api import qqplot
from scipy.stats import ranksums, pointbiserialr
import matplotlib.pyplot as plt
import plotly.express as px
from plotly.subplots import make_subplots
import seaborn as sns
import plotly.graph_objects as go
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso, Ridge
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import KFold, cross_validate
from sklearn.base import clone
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectKBest, RFECV
# Importing data:
df_raw = pd.read_csv("KAG_energydata_complete.csv")
# Creating a copy of the dataset:
df = df_raw.copy()
# Shape of the dataset:
print(f"Shape: {df.shape}")
Shape: (19735, 29)
# Looking at the first five lines:
df.head()
date | Appliances | lights | T1 | RH_1 | T2 | RH_2 | T3 | RH_3 | T4 | RH_4 | T5 | RH_5 | T6 | RH_6 | T7 | RH_7 | T8 | RH_8 | T9 | RH_9 | T_out | Press_mm_hg | RH_out | Windspeed | Visibility | Tdewpoint | rv1 | rv2 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2016-01-11 17:00:00 | 60 | 30 | 19.89 | 47.596667 | 19.2 | 44.790000 | 19.79 | 44.730000 | 19.000000 | 45.566667 | 17.166667 | 55.20 | 7.026667 | 84.256667 | 17.200000 | 41.626667 | 18.2 | 48.900000 | 17.033333 | 45.53 | 6.600000 | 733.5 | 92.0 | 7.000000 | 63.000000 | 5.3 | 13.275433 | 13.275433 |
1 | 2016-01-11 17:10:00 | 60 | 30 | 19.89 | 46.693333 | 19.2 | 44.722500 | 19.79 | 44.790000 | 19.000000 | 45.992500 | 17.166667 | 55.20 | 6.833333 | 84.063333 | 17.200000 | 41.560000 | 18.2 | 48.863333 | 17.066667 | 45.56 | 6.483333 | 733.6 | 92.0 | 6.666667 | 59.166667 | 5.2 | 18.606195 | 18.606195 |
2 | 2016-01-11 17:20:00 | 50 | 30 | 19.89 | 46.300000 | 19.2 | 44.626667 | 19.79 | 44.933333 | 18.926667 | 45.890000 | 17.166667 | 55.09 | 6.560000 | 83.156667 | 17.200000 | 41.433333 | 18.2 | 48.730000 | 17.000000 | 45.50 | 6.366667 | 733.7 | 92.0 | 6.333333 | 55.333333 | 5.1 | 28.642668 | 28.642668 |
3 | 2016-01-11 17:30:00 | 50 | 40 | 19.89 | 46.066667 | 19.2 | 44.590000 | 19.79 | 45.000000 | 18.890000 | 45.723333 | 17.166667 | 55.09 | 6.433333 | 83.423333 | 17.133333 | 41.290000 | 18.1 | 48.590000 | 17.000000 | 45.40 | 6.250000 | 733.8 | 92.0 | 6.000000 | 51.500000 | 5.0 | 45.410389 | 45.410389 |
4 | 2016-01-11 17:40:00 | 60 | 40 | 19.89 | 46.333333 | 19.2 | 44.530000 | 19.79 | 45.000000 | 18.890000 | 45.530000 | 17.200000 | 55.09 | 6.366667 | 84.893333 | 17.200000 | 41.230000 | 18.1 | 48.590000 | 17.000000 | 45.40 | 6.133333 | 733.9 | 92.0 | 5.666667 | 47.666667 | 4.9 | 10.084097 | 10.084097 |
First, let's check whether there is any missing data in the dataset.
# Looking for missing data:
df.isna().sum()
date           0
Appliances     0
lights         0
T1             0
RH_1           0
T2             0
RH_2           0
T3             0
RH_3           0
T4             0
RH_4           0
T5             0
RH_5           0
T6             0
RH_6           0
T7             0
RH_7           0
T8             0
RH_8           0
T9             0
RH_9           0
T_out          0
Press_mm_hg    0
RH_out         0
Windspeed      0
Visibility     0
Tdewpoint      0
rv1            0
rv2            0
dtype: int64
Next, let's look for columns with the wrong variable type.
# Information about data types:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19735 entries, 0 to 19734
Data columns (total 29 columns):
 #   Column       Non-Null Count  Dtype
---  ------       --------------  -----
 0   date         19735 non-null  object
 1   Appliances   19735 non-null  int64
 2   lights       19735 non-null  int64
 3   T1           19735 non-null  float64
 4   RH_1         19735 non-null  float64
 5   T2           19735 non-null  float64
 6   RH_2         19735 non-null  float64
 7   T3           19735 non-null  float64
 8   RH_3         19735 non-null  float64
 9   T4           19735 non-null  float64
 10  RH_4         19735 non-null  float64
 11  T5           19735 non-null  float64
 12  RH_5         19735 non-null  float64
 13  T6           19735 non-null  float64
 14  RH_6         19735 non-null  float64
 15  T7           19735 non-null  float64
 16  RH_7         19735 non-null  float64
 17  T8           19735 non-null  float64
 18  RH_8         19735 non-null  float64
 19  T9           19735 non-null  float64
 20  RH_9         19735 non-null  float64
 21  T_out        19735 non-null  float64
 22  Press_mm_hg  19735 non-null  float64
 23  RH_out       19735 non-null  float64
 24  Windspeed    19735 non-null  float64
 25  Visibility   19735 non-null  float64
 26  Tdewpoint    19735 non-null  float64
 27  rv1          19735 non-null  float64
 28  rv2          19735 non-null  float64
dtypes: float64(26), int64(2), object(1)
memory usage: 4.4+ MB
# Changing date's type to datetime:
df['date'] = pd.to_datetime(df['date'])
df.dtypes
date           datetime64[ns]
Appliances              int64
lights                  int64
T1                    float64
RH_1                  float64
T2                    float64
RH_2                  float64
T3                    float64
RH_3                  float64
T4                    float64
RH_4                  float64
T5                    float64
RH_5                  float64
T6                    float64
RH_6                  float64
T7                    float64
RH_7                  float64
T8                    float64
RH_8                  float64
T9                    float64
RH_9                  float64
T_out                 float64
Press_mm_hg           float64
RH_out                float64
Windspeed             float64
Visibility            float64
Tdewpoint             float64
rv1                   float64
rv2                   float64
dtype: object
# Setting date as the index:
df.set_index('date', inplace=True)
# Separating columns (note: "T" also matches T_out and Tdewpoint; "RH" also matches RH_out):
temperature_column = [i for i in df.columns if "T" in i]
humidity_column = [i for i in df.columns if "RH" in i]
other = [i for i in df.columns if ("T" not in i) and ("RH" not in i)]
# Humidity statistics:
df[humidity_column].describe()
RH_1 | RH_2 | RH_3 | RH_4 | RH_5 | RH_6 | RH_7 | RH_8 | RH_9 | RH_out | |
---|---|---|---|---|---|---|---|---|---|---|
count | 19735.000000 | 19735.000000 | 19735.000000 | 19735.000000 | 19735.000000 | 19735.000000 | 19735.000000 | 19735.000000 | 19735.000000 | 19735.000000 |
mean | 40.259739 | 40.420420 | 39.242500 | 39.026904 | 50.949283 | 54.609083 | 35.388200 | 42.936165 | 41.552401 | 79.750418 |
std | 3.979299 | 4.069813 | 3.254576 | 4.341321 | 9.022034 | 31.149806 | 5.114208 | 5.224361 | 4.151497 | 14.901088 |
min | 27.023333 | 20.463333 | 28.766667 | 27.660000 | 29.815000 | 1.000000 | 23.200000 | 29.600000 | 29.166667 | 24.000000 |
25% | 37.333333 | 37.900000 | 36.900000 | 35.530000 | 45.400000 | 30.025000 | 31.500000 | 39.066667 | 38.500000 | 70.333333 |
50% | 39.656667 | 40.500000 | 38.530000 | 38.400000 | 49.090000 | 55.290000 | 34.863333 | 42.375000 | 40.900000 | 83.666667 |
75% | 43.066667 | 43.260000 | 41.760000 | 42.156667 | 53.663333 | 83.226667 | 39.000000 | 46.536000 | 44.338095 | 91.666667 |
max | 63.360000 | 56.026667 | 50.163333 | 51.090000 | 96.321667 | 99.900000 | 51.400000 | 58.780000 | 53.326667 | 100.000000 |
# Temperature statistics:
df[temperature_column].describe()
T1 | T2 | T3 | T4 | T5 | T6 | T7 | T8 | T9 | T_out | Tdewpoint | |
---|---|---|---|---|---|---|---|---|---|---|---|
count | 19735.000000 | 19735.000000 | 19735.000000 | 19735.000000 | 19735.000000 | 19735.000000 | 19735.000000 | 19735.000000 | 19735.000000 | 19735.000000 | 19735.000000 |
mean | 21.686571 | 20.341219 | 22.267611 | 20.855335 | 19.592106 | 7.910939 | 20.267106 | 22.029107 | 19.485828 | 7.411665 | 3.760707 |
std | 1.606066 | 2.192974 | 2.006111 | 2.042884 | 1.844623 | 6.090347 | 2.109993 | 1.956162 | 2.014712 | 5.317409 | 4.194648 |
min | 16.790000 | 16.100000 | 17.200000 | 15.100000 | 15.330000 | -6.065000 | 15.390000 | 16.306667 | 14.890000 | -5.000000 | -6.600000 |
25% | 20.760000 | 18.790000 | 20.790000 | 19.530000 | 18.277500 | 3.626667 | 18.700000 | 20.790000 | 18.000000 | 3.666667 | 0.900000 |
50% | 21.600000 | 20.000000 | 22.100000 | 20.666667 | 19.390000 | 7.300000 | 20.033333 | 22.100000 | 19.390000 | 6.916667 | 3.433333 |
75% | 22.600000 | 21.500000 | 23.290000 | 22.100000 | 20.619643 | 11.256000 | 21.600000 | 23.390000 | 20.600000 | 10.408333 | 6.566667 |
max | 26.260000 | 29.856667 | 29.236000 | 26.200000 | 25.795000 | 28.290000 | 26.000000 | 27.230000 | 24.500000 | 26.100000 | 15.500000 |
# Other column statistics:
df[other].describe()
Appliances | lights | Press_mm_hg | Windspeed | Visibility | rv1 | rv2 | |
---|---|---|---|---|---|---|---|
count | 19735.000000 | 19735.000000 | 19735.000000 | 19735.000000 | 19735.000000 | 19735.000000 | 19735.000000 |
mean | 97.694958 | 3.801875 | 755.522602 | 4.039752 | 38.330834 | 24.988033 | 24.988033 |
std | 102.524891 | 7.935988 | 7.399441 | 2.451221 | 11.794719 | 14.496634 | 14.496634 |
min | 10.000000 | 0.000000 | 729.300000 | 0.000000 | 1.000000 | 0.005322 | 0.005322 |
25% | 50.000000 | 0.000000 | 750.933333 | 2.000000 | 29.000000 | 12.497889 | 12.497889 |
50% | 60.000000 | 0.000000 | 756.100000 | 3.666667 | 40.000000 | 24.897653 | 24.897653 |
75% | 100.000000 | 0.000000 | 760.933333 | 5.500000 | 40.000000 | 37.583769 | 37.583769 |
max | 1080.000000 | 70.000000 | 772.300000 | 14.000000 | 66.000000 | 49.996530 | 49.996530 |
The first, second, and third quartiles of the lights column are all 0, which means most of its values are 0. As shown below, more than 77% of the records in this column are 0, so the column carries little information for the prediction.
# Counting values of the "lights" column:
df['lights'].value_counts(normalize=True)
lights
0     0.772840
10    0.112085
20    0.082290
30    0.028325
40    0.003902
50    0.000456
70    0.000051
60    0.000051
Name: proportion, dtype: float64
# Dropping the lights column:
df.drop(columns='lights', inplace=True)
Outliers
Looking at the statistics of the "other" columns, we can see that there are some outliers in Visibility, Windspeed, and Appliances. Let's draw a box plot for each of them to take a closer look at the outliers.
# Using boxplot on Visibility, Appliances and Windspeed to visualize the outliers:
fig_sub = make_subplots(rows=1, cols=3, shared_yaxes=False)
fig_sub.add_trace(
go.Box(y=df['Appliances'].values,
name='Appliances'
),
row=1, col=1
)
fig_sub.add_trace(
go.Box(y=df['Windspeed'].values,
name='Windspeed'
),
row=1, col=2
)
fig_sub.add_trace(
go.Box(y=df['Visibility'].values,
name='Visibility'
),
row=1, col=3
)
fig_sub.show("png",width=1500, height=500)
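Box-plot whiskers conventionally extend 1.5×IQR beyond the quartiles, so we can also count the flagged points directly. A quick sketch of that rule (an illustration, not part of the original pipeline):
# Count points beyond the 1.5*IQR fences for each plotted column:
for col in ["Appliances", "Windspeed", "Visibility"]:
    q1, q3 = df[col].quantile([0.25, 0.75])
    iqr = q3 - q1
    lower, upper = q1 - 1.5 * iqr, q3 + 1.5 * iqr
    n_out = ((df[col] < lower) | (df[col] > upper)).sum()
    print(f"{col}: {n_out} points outside [{lower:.2f}, {upper:.2f}]")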
Distributions
A quick way to get an idea of a variable's distribution is to plot a histogram.
# Visualizing distributions using Histograms:
df.hist(figsize=(17, 20), grid=False);
QQ-Plots
# Temperatures:
fig, ax = plt.subplots(nrows=3, ncols=4, sharex=False, sharey=False, figsize=(20, 17))
fig.suptitle("QQ-Plot for Temperatures")
for axis, col in zip(ax.flat, temperature_column):
    qqplot(df[col], ax=axis, line="s")
    axis.set_title(col)
# Hide the unused 12th panel:
ax[2, 3].set_visible(False);
# Humidity:
fig, ax = plt.subplots(nrows=2, ncols=5, sharex=False, sharey=False, figsize=(20, 15))
fig.suptitle("QQ-Plot for Humidity")
for axis, col in zip(ax.flat, humidity_column):
    qqplot(df[col], ax=axis, line="s")
    axis.set_title(col)
# Other columns:
other_qq = ["Appliances", "Windspeed", "Visibility", "Press_mm_hg"]
fig, ax = plt.subplots(nrows=1, ncols=4, sharex=False, sharey=False, figsize=(15, 5))
fig.suptitle("QQ-Plot for Other Columns")
for axis, col in zip(ax, other_qq):
    qqplot(df[col], ax=axis, line="s")
    axis.set_title(col)
Normality test
Even though normality tests are very sensitive when the sample size is large, we will still apply the Shapiro-Wilk test to see whether it detects normality.
Obs:
1 - If the p-value is smaller than 0.05, we cannot assume a normal distribution.
2 - If the p-value is larger than 0.05, we cannot reject the assumption of a normal distribution.
3 - For samples larger than 5000 observations, the reported p-value is only an approximation, which is why we test on a 5,000-row sample (see the quick illustration below).
# Extracting a random sample of 5000 records from our dataset:
df_sampled = df.sample(n=5000, random_state=42)
# Using the Shapiro-Wilk test for normality evaluation:
for c in df.columns:
    w_statistic, p_value = shapiro(df_sampled[c])
    print(f'Test for {c}: (statistic={w_statistic}, p-value={p_value})')
Test for Appliances: (statistic=0.5849369764328003, p-value=0.0)
Test for T1: (statistic=0.9911403059959412, p-value=3.733412196820106e-17)
Test for RH_1: (statistic=0.9821624159812927, p-value=1.2745344613958403e-24)
Test for T2: (statistic=0.9549137353897095, p-value=9.466583207000185e-37)
Test for RH_2: (statistic=0.9911195039749146, p-value=3.5440305190650705e-17)
Test for T3: (statistic=0.980573832988739, p-value=1.2472995316811506e-25)
Test for RH_3: (statistic=0.9624248743057251, p-value=3.6429414818547343e-34)
Test for T4: (statistic=0.9901070594787598, p-value=3.099034646471756e-18)
Test for RH_4: (statistic=0.9676409363746643, p-value=4.054767846141661e-32)
Test for T5: (statistic=0.9707837700843811, p-value=9.308487738172878e-31)
Test for RH_5: (statistic=0.8472362160682678, p-value=0.0)
Test for T6: (statistic=0.9771420359611511, p-value=1.2924734819367967e-27)
Test for RH_6: (statistic=0.9391845464706421, p-value=3.284923860070236e-41)
Test for T7: (statistic=0.9850450754165649, p-value=1.326130338831014e-22)
Test for RH_7: (statistic=0.9879186153411865, p-value=2.763621412532176e-20)
Test for T8: (statistic=0.992936909198761, p-value=4.788611405326252e-15)
Test for RH_8: (statistic=0.986628532409668, p-value=2.26813368248502e-21)
Test for T9: (statistic=0.9759417772293091, p-value=2.9571197811545564e-28)
Test for RH_9: (statistic=0.9775705933570862, p-value=2.2199535800910687e-27)
Test for T_out: (statistic=0.981996476650238, p-value=9.927184389558388e-25)
Test for Press_mm_hg: (statistic=0.9866811633110046, p-value=2.502911208679081e-21)
Test for RH_out: (statistic=0.9219866394996643, p-value=4.203895392974451e-45)
Test for Windspeed: (statistic=0.9285374879837036, p-value=1.0089348943138683e-43)
Test for Visibility: (statistic=0.9228914380073547, p-value=5.605193857299268e-45)
Test for Tdewpoint: (statistic=0.9919402599334717, p-value=2.9564435650314153e-16)
Test for rv1: (statistic=0.9540770053863525, p-value=5.1268318158055655e-37)
Test for rv2: (statistic=0.9540770053863525, p-value=5.1268318158055655e-37)
Correlations
We will use Pearson's correlation, given the type and nature of the variables (numeric target and numeric features).
# Function to plot a heatmap of correlations:
def plot_correlation(dataframe):
    correlations = dataframe.corr()
    # Mask the upper triangle so each pair appears only once:
    mask = np.zeros_like(correlations)
    mask[np.triu_indices_from(mask)] = True
    plt.figure(figsize=(27, 10))
    sns.heatmap(correlations, annot=True, mask=mask, cmap="crest");
# Plotting a Heatmap with all the correlations:
plot_correlation(df)
Conclusions
1 - As a result of the Shapiro-Wilk test, we cannot assume that any feature follows a normal distribution (all p-values are below 0.05).
2 - T6 and T_out are highly correlated (0.97), probably because both were measured outside the building.
3 - All temperatures are significantly correlated with each other, and T9 in particular has several correlations above 0.9 (the pairs above 0.9 are listed programmatically below).
4 - The two random variables (rv1 and rv2) are essentially identical, and their correlation with the target is close to 0, so we can exclude both from the dataset.
5 - The QQ-plots and histograms confirm that no variable follows a normal distribution.
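Conclusions 2-4 can be checked programmatically by listing the feature pairs whose absolute Pearson correlation exceeds 0.9 (a small sketch over the same correlation matrix):
# Extract feature pairs with |Pearson correlation| above 0.9:
corr = df.corr().abs()
upper_mask = np.triu(np.ones(corr.shape, dtype=bool), k=1)  # keep each pair once
pairs = corr.where(upper_mask).stack()
print(pairs[pairs > 0.9].sort_values(ascending=False))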
# We are going to drop some columns because of their correlation analysis:
features = df.drop(columns=['rv1', 'rv2', 'T_out', 'T9'])
We will transform the date column into a binary feature representing weekends (1) and weekdays (0).
# Reset the index:
features_eng = features.reset_index()
# Extracting the day of the week (0 = Monday, ..., 6 = Sunday); we will map it to a binary feature next:
features_eng['Weekend'] = features_eng['date'].dt.dayofweek
# Printing the dataset
features_eng.head()
date | Appliances | T1 | RH_1 | T2 | RH_2 | T3 | RH_3 | T4 | RH_4 | T5 | RH_5 | T6 | RH_6 | T7 | RH_7 | T8 | RH_8 | RH_9 | Press_mm_hg | RH_out | Windspeed | Visibility | Tdewpoint | Weekend | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2016-01-11 17:00:00 | 60 | 19.89 | 47.596667 | 19.2 | 44.790000 | 19.79 | 44.730000 | 19.000000 | 45.566667 | 17.166667 | 55.20 | 7.026667 | 84.256667 | 17.200000 | 41.626667 | 18.2 | 48.900000 | 45.53 | 733.5 | 92.0 | 7.000000 | 63.000000 | 5.3 | 0 |
1 | 2016-01-11 17:10:00 | 60 | 19.89 | 46.693333 | 19.2 | 44.722500 | 19.79 | 44.790000 | 19.000000 | 45.992500 | 17.166667 | 55.20 | 6.833333 | 84.063333 | 17.200000 | 41.560000 | 18.2 | 48.863333 | 45.56 | 733.6 | 92.0 | 6.666667 | 59.166667 | 5.2 | 0 |
2 | 2016-01-11 17:20:00 | 50 | 19.89 | 46.300000 | 19.2 | 44.626667 | 19.79 | 44.933333 | 18.926667 | 45.890000 | 17.166667 | 55.09 | 6.560000 | 83.156667 | 17.200000 | 41.433333 | 18.2 | 48.730000 | 45.50 | 733.7 | 92.0 | 6.333333 | 55.333333 | 5.1 | 0 |
3 | 2016-01-11 17:30:00 | 50 | 19.89 | 46.066667 | 19.2 | 44.590000 | 19.79 | 45.000000 | 18.890000 | 45.723333 | 17.166667 | 55.09 | 6.433333 | 83.423333 | 17.133333 | 41.290000 | 18.1 | 48.590000 | 45.40 | 733.8 | 92.0 | 6.000000 | 51.500000 | 5.0 | 0 |
4 | 2016-01-11 17:40:00 | 60 | 19.89 | 46.333333 | 19.2 | 44.530000 | 19.79 | 45.000000 | 18.890000 | 45.530000 | 17.200000 | 55.09 | 6.366667 | 84.893333 | 17.200000 | 41.230000 | 18.1 | 48.590000 | 45.40 | 733.9 | 92.0 | 5.666667 | 47.666667 | 4.9 | 0 |
# Before the mapping:
features_eng['Weekend'].value_counts()
Weekend
1    2880
2    2880
3    2880
4    2845
0    2778
5    2736
6    2736
Name: count, dtype: int64
# Mapping the day of the week to 1 for weekends (Saturday=5, Sunday=6) and 0 otherwise:
features_eng['Weekend'] = features_eng['Weekend'].apply(lambda x: 1 if x in (5, 6) else 0)
features_eng['Weekend'].value_counts()
Weekend
0    14263
1     5472
Name: count, dtype: int64
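As a side note, the same mapping can be done in one vectorized expression (an equivalent idiom, shown here only as a check):
# Equivalent vectorized mapping: dayofweek >= 5 selects Saturday (5) and Sunday (6):
weekend_alt = (features_eng['date'].dt.dayofweek >= 5).astype(int)
print((weekend_alt == features_eng['Weekend']).all())  # True: same result as apply()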
# Setting date as index again:
features_eng.set_index('date', inplace=True)
# Printing the dataset to see the changed column
features_eng.head()
Appliances | T1 | RH_1 | T2 | RH_2 | T3 | RH_3 | T4 | RH_4 | T5 | RH_5 | T6 | RH_6 | T7 | RH_7 | T8 | RH_8 | RH_9 | Press_mm_hg | RH_out | Windspeed | Visibility | Tdewpoint | Weekend | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
date | ||||||||||||||||||||||||
2016-01-11 17:00:00 | 60 | 19.89 | 47.596667 | 19.2 | 44.790000 | 19.79 | 44.730000 | 19.000000 | 45.566667 | 17.166667 | 55.20 | 7.026667 | 84.256667 | 17.200000 | 41.626667 | 18.2 | 48.900000 | 45.53 | 733.5 | 92.0 | 7.000000 | 63.000000 | 5.3 | 0 |
2016-01-11 17:10:00 | 60 | 19.89 | 46.693333 | 19.2 | 44.722500 | 19.79 | 44.790000 | 19.000000 | 45.992500 | 17.166667 | 55.20 | 6.833333 | 84.063333 | 17.200000 | 41.560000 | 18.2 | 48.863333 | 45.56 | 733.6 | 92.0 | 6.666667 | 59.166667 | 5.2 | 0 |
2016-01-11 17:20:00 | 50 | 19.89 | 46.300000 | 19.2 | 44.626667 | 19.79 | 44.933333 | 18.926667 | 45.890000 | 17.166667 | 55.09 | 6.560000 | 83.156667 | 17.200000 | 41.433333 | 18.2 | 48.730000 | 45.50 | 733.7 | 92.0 | 6.333333 | 55.333333 | 5.1 | 0 |
2016-01-11 17:30:00 | 50 | 19.89 | 46.066667 | 19.2 | 44.590000 | 19.79 | 45.000000 | 18.890000 | 45.723333 | 17.166667 | 55.09 | 6.433333 | 83.423333 | 17.133333 | 41.290000 | 18.1 | 48.590000 | 45.40 | 733.8 | 92.0 | 6.000000 | 51.500000 | 5.0 | 0 |
2016-01-11 17:40:00 | 60 | 19.89 | 46.333333 | 19.2 | 44.530000 | 19.79 | 45.000000 | 18.890000 | 45.530000 | 17.200000 | 55.09 | 6.366667 | 84.893333 | 17.200000 | 41.230000 | 18.1 | 48.590000 | 45.40 | 733.9 | 92.0 | 5.666667 | 47.666667 | 4.9 | 0 |
We need to verify, using a hypothesis test, whether the new categorical feature (Weekend) has any relation with the numeric target (Appliances). For that, we will use the two-sample t-test, which tests the difference between the means of the two groups defined by our categorical feature.
Null hypothesis: there is no difference between the energy consumed on weekends and on weekdays.
Alternative hypothesis: energy consumption on weekends is different from that on weekdays.
OBS: the two-sample t-test assumes that our samples come from a normal distribution, so we need to check that assumption before applying it to our data; a sketch of the test itself follows.
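For reference, here is what the test itself would look like, sketched on synthetic normal data (ttest_ind is SciPy's two-sample t-test; the arrays below are illustrative, not our samples):
from scipy.stats import ttest_ind
# Illustrative only: two synthetic samples that do satisfy the normality assumption.
rng = np.random.default_rng(42)
group_a = rng.normal(loc=100, scale=15, size=300)
group_b = rng.normal(loc=95, scale=15, size=300)
# Welch's variant (equal_var=False) also drops the equal-variance assumption:
t_stat, p_value = ttest_ind(group_a, group_b, equal_var=False)
print(f"t = {t_stat:.3f}, p = {p_value:.4f}")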
# Sampling the groups:
weekdays = features_eng[features_eng['Weekend'] == 0].sample(n=300, random_state=42)['Appliances']
weekend = features_eng[features_eng['Weekend'] == 1].sample(n=300, random_state=42)['Appliances']
# Normality test:
print(f"Weekdays: {shapiro(weekdays)}")
print(f"Weekends: {shapiro(weekend)}")
Weekdays: ShapiroResult(statistic=0.5949463844299316, pvalue=6.323979251306852e-26)
Weekends: ShapiroResult(statistic=0.681671142578125, pvalue=2.1151544435960978e-23)
Since the p-values of the Shapiro-Wilk test were below 0.05, we cannot assume the samples come from a normal distribution. Therefore, we won't use the t-test to evaluate their relation.
As an alternative, we will use a non-parametric test, the Wilcoxon rank-sum test, which does not assume normality.
Null hypothesis: the two sets of measurements come from the same distribution.
Alternative hypothesis: measurements in one sample tend to be larger than those in the other sample.
# Wilcoxon rank-sum statistic for two samples:
ranksums(weekend, weekdays, alternative="greater")
RanksumsResult(statistic=2.3616550407154433, pvalue=0.00909678130539973)
Since the p-value is less than 0.05, we can reject the null hypothesis and conclude that the energy consumed on weekends tends to be larger than the energy consumed on weekdays.
Now we can quantify this relation with the point-biserial correlation, which returns a statistic (R value) between -1 and 1 measuring the strength of the relationship between the variables, along with a p-value.
# We will use the point-biserial correlation to assess whether there is a correlation between the categorical and numeric variables:
point_bisserial_cor = pointbiserialr(features_eng['Weekend'], features_eng['Appliances'])
point_bisserial_cor
SignificanceResult(statistic=0.017437050668854755, pvalue=0.014301067218672008)
The statistic (R value) is only 0.0174: the correlation between these variables is positive but very weak, even though the p-value (< 0.05) indicates it is statistically significant.
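As a sanity check, the point-biserial correlation with a 0/1 variable is mathematically identical to Pearson's r, so NumPy should reproduce the statistic:
# Point-biserial r with a binary variable equals Pearson's r:
pearson_r = np.corrcoef(features_eng['Weekend'], features_eng['Appliances'])[0, 1]
print(pearson_r)  # ~0.0174, matching the statistic above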
# Now, we need to separate the target from the whole dataset:
target = df["Appliances"]
features_eng.drop(columns='Appliances', inplace=True)
# Class that puts together several feature selection techniques:
class feature_selector:
    seed = 42

    def __init__(self, X, y) -> None:
        self.X_train = X
        self.y_train = y

    def randomforestR_imp(self) -> None:
        model = RandomForestRegressor(random_state=feature_selector.seed)
        model.fit(self.X_train, self.y_train)
        series = pd.Series(index=model.feature_names_in_, data=model.feature_importances_).sort_values(ascending=False)
        # Plotting Random Forest feature importance:
        plt.title("RandomForest Importance")
        plt.xlabel("Importance")
        sns.barplot(x=series.values, y=series.index)

    def xgbR_imp(self) -> None:
        model = XGBRegressor(random_state=feature_selector.seed)
        model.fit(self.X_train, self.y_train)
        series = pd.Series(index=model.feature_names_in_, data=model.feature_importances_).sort_values(ascending=False)
        # Plotting XGBoost feature importance:
        plt.title("XGBoost Feature Importance")
        plt.xlabel("Importance")
        sns.barplot(y=series.index, x=series.values)

    # Univariate feature selection:
    def univariate(self, statistic, n="all"):
        selector = SelectKBest(score_func=statistic, k=n)
        selector.fit(self.X_train, self.y_train)
        series = pd.Series(index=selector.feature_names_in_, data=selector.scores_).sort_values(ascending=False)
        plt.title("F-Regression Filtering")
        plt.xlabel("F-score")
        sns.barplot(y=series.index, x=series.values)
        return selector

    # Wrapper method (RFECV) for feature selection:
    def refcv(self):
        models = {
            "Lasso": Lasso(random_state=feature_selector.seed),
            "Ridge": Ridge(random_state=feature_selector.seed),
            "RandomForestR": RandomForestRegressor(random_state=feature_selector.seed),
            "XGB": XGBRegressor(random_state=feature_selector.seed)
        }
        splits = 10
        cross = KFold(n_splits=splits, random_state=feature_selector.seed, shuffle=True)
        ind = [f"Columns {i}" for i in range(1, len(self.X_train.columns) + 1)]
        df = pd.DataFrame(index=ind)
        minimo = np.inf
        name = ""
        for key, model in models.items():
            rfecv = RFECV(estimator=model, step=1, cv=cross, min_features_to_select=1, scoring="neg_mean_squared_error")
            rfecv.fit(self.X_train, self.y_train)
            # Convert negative MSE back to RMSE for each number of selected features:
            root_mean = np.sqrt(-rfecv.cv_results_["mean_test_score"])
            df[key] = root_mean
            best_value = root_mean[np.argmin(root_mean)]
            # Keep the feature mask of the best-scoring estimator so far:
            if minimo > best_value:
                minimo = best_value
                best_features = rfecv.support_
                name = key
        df_features = pd.DataFrame(columns=self.X_train.columns, data=best_features.reshape(1, -1), index=[name])
        return df, df_features
# Splitting into train and test sets:
X_train, X_test, y_train, y_test = train_test_split(features_eng, target, test_size=0.25, random_state=42)
# Printing shapes:
print("Shapes:")
print(f"X_Train set: {X_train.shape}")
print(f"y_Train set: {y_train.shape}\n")
print(f"X_Test set: {X_test.shape}")
print(f"y_Test set: {y_test.shape}")
Shapes:
X_Train set: (14801, 23)
y_Train set: (14801,)

X_Test set: (4934, 23)
y_Test set: (4934,)
# Plotting the most important features according to the Random Forest algorithm:
feature = feature_selector(X_train, y_train)
feature.randomforestR_imp()
# Plotting the most important features according to the XGBoost algorithm:
feature.xgbR_imp()
# Mean test RMSE for each algorithm and each number of selected columns:
df_results, df_features = feature.refcv()
df_results
Lasso | Ridge | RandomForestR | XGB | |
---|---|---|---|---|
Columns 1 | 103.056148 | 103.055816 | 113.304770 | 105.212470 |
Columns 2 | 100.590495 | 100.587456 | 95.788990 | 100.936336 |
Columns 3 | 100.435432 | 100.422634 | 79.751147 | 93.266782 |
Columns 4 | 98.950192 | 98.890011 | 77.808713 | 87.317947 |
Columns 5 | 98.850838 | 98.801902 | 76.062723 | 83.934313 |
Columns 6 | 98.366952 | 98.609354 | 74.597922 | 81.859767 |
Columns 7 | 98.262228 | 98.447257 | 74.254578 | 79.656813 |
Columns 8 | 96.880590 | 98.119044 | 73.701069 | 79.038286 |
Columns 9 | 96.657324 | 96.992252 | 72.525388 | 78.179304 |
Columns 10 | 96.586050 | 96.554785 | 72.050913 | 76.868434 |
Columns 11 | 96.508091 | 96.474571 | 71.308990 | 76.758451 |
Columns 12 | 96.443505 | 96.458674 | 71.200546 | 76.060537 |
Columns 13 | 96.456390 | 96.404750 | 71.278944 | 75.356031 |
Columns 14 | 96.427961 | 96.318593 | 71.050404 | 75.967296 |
Columns 15 | 96.439207 | 96.328227 | 70.571489 | 75.322817 |
Columns 16 | 96.440432 | 96.332470 | 70.415103 | 75.650811 |
Columns 17 | 96.323125 | 96.303376 | 70.315029 | 77.008334 |
Columns 18 | 96.288128 | 96.228437 | 70.110516 | 76.239957 |
Columns 19 | 96.286269 | 96.213253 | 69.846299 | 75.334601 |
Columns 20 | 96.264342 | 96.200491 | 69.792471 | 75.395966 |
Columns 21 | 96.230583 | 96.144727 | 69.709099 | 75.173989 |
Columns 22 | 96.224150 | 96.079493 | 69.575768 | 75.670323 |
Columns 23 | 96.224143 | 96.047953 | 69.628186 | 75.824829 |
# Best features selected by RFECV:
df_features
T1 | RH_1 | T2 | RH_2 | T3 | RH_3 | T4 | RH_4 | T5 | RH_5 | T6 | RH_6 | T7 | RH_7 | T8 | RH_8 | RH_9 | Press_mm_hg | RH_out | Windspeed | Visibility | Tdewpoint | Weekend | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
RandomForestR | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | False |
Conclusions
The lowest cross-validated RMSE came from Random Forest using every feature except Weekend (see the table above), so we will drop the Weekend column.
# Removing Weekend column:
X_train = X_train.drop(columns="Weekend")
X_test = X_test.drop(columns="Weekend")
# Function used to evaluate and compare the algorithms with cross-validation:
def melhor_modelo(X_train, y_train, dicionario):
    seed = 42
    cv = 5
    score = ['neg_root_mean_squared_error', 'r2']
    result_rmse = {}
    result_r2 = {}
    for name, model in dicionario.items():
        k_fold = KFold(n_splits=cv, random_state=seed, shuffle=True)
        result = cross_validate(model, X_train, y_train, cv=k_fold, scoring=score)
        # Flip the sign so the RMSE is reported as a positive number:
        result_rmse[name] = -result['test_neg_root_mean_squared_error']
        result_r2[name] = result['test_r2']
    result_pd_rmse = pd.DataFrame(data=result_rmse)
    result_pd_r2 = pd.DataFrame(data=result_r2)
    return result_pd_rmse, result_pd_r2
# Dictionary of algorithms:
modelos_dicionario = {
"Lasso":Lasso(random_state=42),
"Ridge":Ridge(random_state=42),
"SVR":SVR(),
"RandomForest":RandomForestRegressor(random_state=42),
"XGB":XGBRegressor(random_state=42),
"MLP":MLPRegressor(random_state=42, max_iter=2500)
}
# MinMax Scaler:
min_max_scaler = MinMaxScaler()
X_train_min_max = min_max_scaler.fit_transform(X_train)
X_test_min_max = min_max_scaler.transform(X_test)
# Result of the function melhor_modelo:
resultados_min_max_rmse, resultados_min_max_r2 = melhor_modelo(X_train_min_max, y_train, modelos_dicionario)
# Boxplot of the results for analysis of the algorithms, using MinMax Scaler:
fig = px.box(resultados_min_max_rmse, title= "Cross Validation result for MinMax Scaler(RMSE)")
fig.show("png",width=1500, height=500)
# Models statistics for RMSE metric, using MinMax Scaler:
resultados_min_max_rmse.describe()
Lasso | Ridge | SVR | RandomForest | XGB | MLP | |
---|---|---|---|---|---|---|
count | 5.000000 | 5.000000 | 5.000000 | 5.000000 | 5.000000 | 5.000000 |
mean | 101.339171 | 96.103956 | 104.131297 | 71.032976 | 76.014748 | 92.804121 |
std | 3.224101 | 3.444759 | 3.413926 | 4.005136 | 4.020095 | 3.754998 |
min | 98.302072 | 93.081523 | 101.107995 | 65.284468 | 70.757023 | 89.253211 |
25% | 98.716873 | 93.238649 | 101.306166 | 68.550961 | 72.959072 | 89.673112 |
50% | 100.713914 | 95.121696 | 103.623066 | 72.564129 | 77.390410 | 92.182401 |
75% | 102.847561 | 97.872060 | 105.184074 | 74.088113 | 78.560237 | 94.594520 |
max | 106.115436 | 101.205853 | 109.435184 | 74.677209 | 80.406997 | 98.317360 |
# Models statistics for R2 metric, using MinMax Scaler:
resultados_min_max_r2.describe()
Lasso | Ridge | SVR | RandomForest | XGB | MLP | |
---|---|---|---|---|---|---|
count | 5.000000 | 5.000000 | 5.000000 | 5.000000 | 5.000000 | 5.000000 |
mean | 0.039671 | 0.136466 | -0.013942 | 0.528097 | 0.459751 | 0.194885 |
std | 0.003774 | 0.010826 | 0.006957 | 0.032454 | 0.030597 | 0.016729 |
min | 0.035541 | 0.123872 | -0.024401 | 0.499513 | 0.432473 | 0.173169 |
25% | 0.036807 | 0.126600 | -0.017478 | 0.501051 | 0.437268 | 0.184117 |
50% | 0.038851 | 0.140026 | -0.011263 | 0.522985 | 0.446976 | 0.194792 |
75% | 0.043069 | 0.142624 | -0.008779 | 0.538550 | 0.477295 | 0.210372 |
max | 0.044087 | 0.149205 | -0.007789 | 0.578388 | 0.504741 | 0.211973 |
Random Forest was the best algorithm when using the MinMax Scaler.
# Standard Scaler:
std_scaler = StandardScaler()
X_train_std = std_scaler.fit_transform(X_train)
X_test_std = std_scaler.transform(X_test)
# Result of the function melhor_modelo:
result_std_rmse, result_std_r2 = melhor_modelo(X_train_std, y_train, modelos_dicionario)
# Boxplot of the results for analysis of the algorithms, using Standard Scaler:
fig = px.box(result_std_rmse, title="Cross validation result for Standard Scaler(RMSE)")
fig.show('png', width=1500, height=500)
# Models statistics for RMSE metric, using Standard Scaler:
result_std_rmse.describe()
Lasso | Ridge | SVR | RandomForest | XGB | MLP | |
---|---|---|---|---|---|---|
count | 5.000000 | 5.000000 | 5.000000 | 5.000000 | 5.000000 | 5.000000 |
mean | 97.027618 | 96.061407 | 103.592166 | 71.036226 | 76.007873 | 84.539414 |
std | 3.356449 | 3.525190 | 3.418535 | 4.103174 | 3.934746 | 3.118004 |
min | 93.827089 | 93.035937 | 100.470340 | 65.167256 | 70.906749 | 80.339979 |
25% | 94.368913 | 93.086069 | 100.854924 | 68.452633 | 72.942371 | 82.763725 |
50% | 96.310955 | 95.063921 | 103.064621 | 72.642128 | 77.308188 | 84.498308 |
75% | 98.639844 | 97.797212 | 104.683744 | 74.137774 | 78.651667 | 87.536390 |
max | 101.991288 | 101.323894 | 108.887199 | 74.781337 | 80.230393 | 87.558671 |
# Models statistics for R2 metric, using Standard Scaler:
result_std_r2.describe()
Lasso | Ridge | SVR | RandomForest | XGB | MLP | |
---|---|---|---|---|---|---|
count | 5.000000 | 5.000000 | 5.000000 | 5.000000 | 5.000000 | 5.000000 |
mean | 0.119752 | 0.137261 | -0.003458 | 0.528049 | 0.459843 | 0.331569 |
std | 0.008085 | 0.011838 | 0.006852 | 0.033580 | 0.029618 | 0.022830 |
min | 0.110221 | 0.121827 | -0.014168 | 0.498842 | 0.433679 | 0.300973 |
25% | 0.112843 | 0.127935 | -0.006540 | 0.499978 | 0.435958 | 0.323437 |
50% | 0.121052 | 0.142839 | 0.000796 | 0.521654 | 0.449403 | 0.327367 |
75% | 0.125508 | 0.143666 | 0.001169 | 0.539872 | 0.477534 | 0.344559 |
max | 0.129137 | 0.150038 | 0.001452 | 0.579901 | 0.502643 | 0.361506 |
Random Forest was the best algorithm when using the Standard Scaler.
# Robust Scaler:
rb_scaler = RobustScaler()
X_train_rb = rb_scaler.fit_transform(X_train)
X_test_rb = rb_scaler.transform(X_test)
# Result of the function melhor_modelo:
result_rb_rmse, result_rb_r2= melhor_modelo(X_train_rb, y_train, modelos_dicionario)
# Boxplot of the results for analysis of the algorithms, using Robust Scaler:
fig = px.box(result_rb_rmse, title="Cross validation results for Robust Scaler(RMSE)")
fig.show('png', width=1500, height=500)
# Models statistics for RMSE metric, using Robust Scaler:
result_rb_rmse.describe()
Lasso | Ridge | SVR | RandomForest | XGB | MLP | |
---|---|---|---|---|---|---|
count | 5.000000 | 5.000000 | 5.000000 | 5.000000 | 5.000000 | 5.000000 |
mean | 97.291404 | 96.061271 | 103.746001 | 71.066650 | 76.120590 | 84.622211 |
std | 3.351912 | 3.523836 | 3.443017 | 4.029722 | 4.007752 | 3.226213 |
min | 94.010451 | 93.036021 | 100.599875 | 65.296841 | 70.926901 | 80.310477 |
25% | 94.724338 | 93.087422 | 100.977465 | 68.562893 | 72.939596 | 83.203670 |
50% | 96.582075 | 95.064305 | 103.235908 | 72.564676 | 77.667119 | 83.948702 |
75% | 98.899118 | 97.797581 | 104.842294 | 74.140519 | 78.682726 | 87.633504 |
max | 102.241039 | 101.321028 | 109.074463 | 74.768323 | 80.386606 | 88.014702 |
# Models statistics for R2 metric, using Robust Scaler:
result_rb_r2.describe()
Lasso | Ridge | SVR | RandomForest | XGB | MLP | |
---|---|---|---|---|---|---|
count | 5.000000 | 5.000000 | 5.000000 | 5.000000 | 5.000000 | 5.000000 |
mean | 0.114953 | 0.137263 | -0.006433 | 0.527657 | 0.458222 | 0.330232 |
std | 0.008090 | 0.011819 | 0.007255 | 0.032586 | 0.031014 | 0.025566 |
min | 0.105858 | 0.121877 | -0.017659 | 0.498805 | 0.428408 | 0.293672 |
25% | 0.108173 | 0.127929 | -0.009889 | 0.501044 | 0.435512 | 0.320197 |
50% | 0.116096 | 0.142814 | -0.002233 | 0.521820 | 0.447257 | 0.332210 |
75% | 0.118908 | 0.143659 | -0.001260 | 0.538389 | 0.477574 | 0.343104 |
max | 0.125730 | 0.150037 | -0.001124 | 0.578228 | 0.502360 | 0.361975 |
Random Forest was the best algorithm when using the Robust Scaler.
As the boxplots above show, Random Forest was the best algorithm for all of the transformations. Now we will tune Random Forest models for each transformation using GridSearchCV.
# Function for tuning an arbitrary model with GridSearchCV:
def tuning(X_train, y_train, modelo, params):
    cv = 5
    score = "neg_root_mean_squared_error"
    grid = GridSearchCV(modelo, cv=cv, param_grid=params,
                        scoring=score,
                        n_jobs=-1,
                        return_train_score=True,
                        )
    grid.fit(X_train, y_train)
    best_index = grid.best_index_
    result = grid.cv_results_
    # Flip the sign so the RMSE is reported as a positive number:
    train_score = -result['mean_train_score'][best_index]
    left_out = -result['mean_test_score'][best_index]
    print(f"Train score: {train_score}")
    print(f"Left out data score: {left_out}")
    return grid.best_estimator_
# Parameter grid:
params_grid = {
"n_estimators":[100, 120, 130],
"max_depth":[11, 12],
"min_samples_split":[2, 5, 10],
"max_features":[0.5, 0.6, 0.7]
}
# Tuning a Random Forest using MinMax Scaler:
model_rnd = RandomForestRegressor(random_state=42)
print(f"Random Forest - MinMax Scaler:")
best_estimator_rnd_min_max = tuning(X_train_min_max, y_train, model_rnd, params_grid)
Random Forest - MinMax Scaler:
Train score: 55.20977987629416
Left out data score: 78.2570080306544
# Best Random Forest model:
best_estimator_rnd_min_max
RandomForestRegressor(max_depth=12, max_features=0.5, random_state=42)
# Tuning a Random Forest using Standard Scaler:
model_rnd = RandomForestRegressor(random_state=42)
print(f"Random Forest - Standard Scaler:")
best_estimator_rnd_std = tuning(X_train_std, y_train, model_rnd, params_grid)
Random Forest - Standard Scaler:
Train score: 55.20971673209787
Left out data score: 78.2527129431916
# Tuning a Random Forest using Robust Scaler:
model_rnd = RandomForestRegressor(random_state=42)
print(f"Random Forest - Robust Scaler:")
best_estimator_rnd_rb = tuning(X_train_rb, y_train, model_rnd, params_grid)
Random Forest - Robust Scaler:
Train score: 55.21031330462006
Left out data score: 78.25951265014629
Conclusions
The best model used the Standard Scaler with Random Forest.
Random Forest seems to suffer from overfitting: its train RMSE is well below its left-out RMSE.
Train score: 55.20971673209787
Left out data score: 78.2527129431916
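One way to see this gap directly is a validation curve over max_depth (a sketch; validation_curve is scikit-learn's helper for comparing train and cross-validation scores across one hyperparameter):
# Sketch: train vs. cross-validation RMSE as tree depth grows:
from sklearn.model_selection import validation_curve
depths = [4, 8, 12, 16]
train_scores, val_scores = validation_curve(
    RandomForestRegressor(random_state=42), X_train_std, y_train,
    param_name="max_depth", param_range=depths,
    scoring="neg_root_mean_squared_error", cv=5, n_jobs=-1,
)
for d, tr, va in zip(depths, -train_scores.mean(axis=1), -val_scores.mean(axis=1)):
    print(f"max_depth={d}: train RMSE={tr:.1f}, CV RMSE={va:.1f}")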
# Clone the parameters of the best Random Forest model:
rd_best_model = clone(best_estimator_rnd_std)
# Training a Random Forest model:
rd_best_model.fit(X_train_std, y_train)
RandomForestRegressor(max_depth=12, max_features=0.5, random_state=42)
# Predicting train set and test set:
y_pred_train = rd_best_model.predict(X_train_std)
y_pred_test = rd_best_model.predict(X_test_std)
# Calculating metrics for train set:
rmse_train = np.sqrt(mean_squared_error(y_train, y_pred_train))
r2_train = r2_score(y_train, y_pred_train)
# Calculating metrics for test set:
rmse_test = np.sqrt(mean_squared_error(y_test, y_pred_test))
r2_test = r2_score(y_test, y_pred_test)
# Train set results:
print("Train results:")
print(f"RMSE: {rmse_train}")
print(f"R2 score: {r2_train}")
Train results:
RMSE: 55.42828784895388
R2 score: 0.7129440480283424
# Test set results:
print("Test results:")
print(f"RMSE: {rmse_test}")
print(f"R2 score: {r2_test}")
Test results:
RMSE: 73.7462104247005
R2 score: 0.45252870157881486
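To see where the remaining error concentrates, a quick predicted-versus-actual scatter on the test set can help (a sketch using the matplotlib import from the top of the notebook):
# Scatter of predicted vs. actual appliance energy use on the test set:
plt.figure(figsize=(7, 7))
plt.scatter(y_test, y_pred_test, alpha=0.2)
# Red identity line marks perfect predictions:
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], color="red")
plt.xlabel("Actual Appliances")
plt.ylabel("Predicted Appliances")
plt.title("Test set: predicted vs. actual");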