# Importing Libraries:
import pandas as pd
import numpy as np
from scipy.stats import shapiro
from statsmodels.api import qqplot
from scipy.stats import ranksums, pointbiserialr
import matplotlib.pyplot as plt
import plotly.express as px
from plotly.subplots import make_subplots
import seaborn as sns
import plotly.graph_objects as go
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso, Ridge
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import KFold, cross_validate
from sklearn.base import clone
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectKBest, RFECV
# Importing data:
df_raw = pd.read_csv("KAG_energydata_complete.csv")
# Creating a copy of the dataset:
df = df_raw.copy()
# Shape of the dataset:
print(f"Shape: {df.shape}")
Shape: (19735, 29)
# Looking at the first five lines:
df.head()
date | Appliances | lights | T1 | RH_1 | T2 | RH_2 | T3 | RH_3 | T4 | RH_4 | T5 | RH_5 | T6 | RH_6 | T7 | RH_7 | T8 | RH_8 | T9 | RH_9 | T_out | Press_mm_hg | RH_out | Windspeed | Visibility | Tdewpoint | rv1 | rv2 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2016-01-11 17:00:00 | 60 | 30 | 19.89 | 47.596667 | 19.2 | 44.790000 | 19.79 | 44.730000 | 19.000000 | 45.566667 | 17.166667 | 55.20 | 7.026667 | 84.256667 | 17.200000 | 41.626667 | 18.2 | 48.900000 | 17.033333 | 45.53 | 6.600000 | 733.5 | 92.0 | 7.000000 | 63.000000 | 5.3 | 13.275433 | 13.275433 |
1 | 2016-01-11 17:10:00 | 60 | 30 | 19.89 | 46.693333 | 19.2 | 44.722500 | 19.79 | 44.790000 | 19.000000 | 45.992500 | 17.166667 | 55.20 | 6.833333 | 84.063333 | 17.200000 | 41.560000 | 18.2 | 48.863333 | 17.066667 | 45.56 | 6.483333 | 733.6 | 92.0 | 6.666667 | 59.166667 | 5.2 | 18.606195 | 18.606195 |
2 | 2016-01-11 17:20:00 | 50 | 30 | 19.89 | 46.300000 | 19.2 | 44.626667 | 19.79 | 44.933333 | 18.926667 | 45.890000 | 17.166667 | 55.09 | 6.560000 | 83.156667 | 17.200000 | 41.433333 | 18.2 | 48.730000 | 17.000000 | 45.50 | 6.366667 | 733.7 | 92.0 | 6.333333 | 55.333333 | 5.1 | 28.642668 | 28.642668 |
3 | 2016-01-11 17:30:00 | 50 | 40 | 19.89 | 46.066667 | 19.2 | 44.590000 | 19.79 | 45.000000 | 18.890000 | 45.723333 | 17.166667 | 55.09 | 6.433333 | 83.423333 | 17.133333 | 41.290000 | 18.1 | 48.590000 | 17.000000 | 45.40 | 6.250000 | 733.8 | 92.0 | 6.000000 | 51.500000 | 5.0 | 45.410389 | 45.410389 |
4 | 2016-01-11 17:40:00 | 60 | 40 | 19.89 | 46.333333 | 19.2 | 44.530000 | 19.79 | 45.000000 | 18.890000 | 45.530000 | 17.200000 | 55.09 | 6.366667 | 84.893333 | 17.200000 | 41.230000 | 18.1 | 48.590000 | 17.000000 | 45.40 | 6.133333 | 733.9 | 92.0 | 5.666667 | 47.666667 | 4.9 | 10.084097 | 10.084097 |
First, let's check whether there is any missing data in the dataset.
# Looking for missing data:
df.isna().sum()
date           0
Appliances     0
lights         0
T1             0
RH_1           0
T2             0
RH_2           0
T3             0
RH_3           0
T4             0
RH_4           0
T5             0
RH_5           0
T6             0
RH_6           0
T7             0
RH_7           0
T8             0
RH_8           0
T9             0
RH_9           0
T_out          0
Press_mm_hg    0
RH_out         0
Windspeed      0
Visibility     0
Tdewpoint      0
rv1            0
rv2            0
dtype: int64
Next, let's look for columns with the wrong variable type.
# Information about data types:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19735 entries, 0 to 19734
Data columns (total 29 columns):
 #   Column       Non-Null Count  Dtype
---  ------       --------------  -----
 0   date         19735 non-null  object
 1   Appliances   19735 non-null  int64
 2   lights       19735 non-null  int64
 3   T1           19735 non-null  float64
 4   RH_1         19735 non-null  float64
 5   T2           19735 non-null  float64
 6   RH_2         19735 non-null  float64
 7   T3           19735 non-null  float64
 8   RH_3         19735 non-null  float64
 9   T4           19735 non-null  float64
 10  RH_4         19735 non-null  float64
 11  T5           19735 non-null  float64
 12  RH_5         19735 non-null  float64
 13  T6           19735 non-null  float64
 14  RH_6         19735 non-null  float64
 15  T7           19735 non-null  float64
 16  RH_7         19735 non-null  float64
 17  T8           19735 non-null  float64
 18  RH_8         19735 non-null  float64
 19  T9           19735 non-null  float64
 20  RH_9         19735 non-null  float64
 21  T_out        19735 non-null  float64
 22  Press_mm_hg  19735 non-null  float64
 23  RH_out       19735 non-null  float64
 24  Windspeed    19735 non-null  float64
 25  Visibility   19735 non-null  float64
 26  Tdewpoint    19735 non-null  float64
 27  rv1          19735 non-null  float64
 28  rv2          19735 non-null  float64
dtypes: float64(26), int64(2), object(1)
memory usage: 4.4+ MB
# Changing date's type to datetime:
df['date'] = pd.to_datetime(df['date'])
df.dtypes
date           datetime64[ns]
Appliances              int64
lights                  int64
T1                    float64
RH_1                  float64
T2                    float64
RH_2                  float64
T3                    float64
RH_3                  float64
T4                    float64
RH_4                  float64
T5                    float64
RH_5                  float64
T6                    float64
RH_6                  float64
T7                    float64
RH_7                  float64
T8                    float64
RH_8                  float64
T9                    float64
RH_9                  float64
T_out                 float64
Press_mm_hg           float64
RH_out                float64
Windspeed             float64
Visibility            float64
Tdewpoint             float64
rv1                   float64
rv2                   float64
dtype: object
# Setting date as the index:
df.set_index('date', inplace=True)
# Separating columns (note: "T" also matches T_out and Tdewpoint; "RH" also matches RH_out):
temperature_column = [i for i in df.columns if "T" in i]
humidity_column = [i for i in df.columns if "RH" in i]
other = [i for i in df.columns if ("T" not in i) and ("RH" not in i)]
# Humidity statistics:
df[humidity_column].describe()
RH_1 | RH_2 | RH_3 | RH_4 | RH_5 | RH_6 | RH_7 | RH_8 | RH_9 | RH_out | |
---|---|---|---|---|---|---|---|---|---|---|
count | 19735.000000 | 19735.000000 | 19735.000000 | 19735.000000 | 19735.000000 | 19735.000000 | 19735.000000 | 19735.000000 | 19735.000000 | 19735.000000 |
mean | 40.259739 | 40.420420 | 39.242500 | 39.026904 | 50.949283 | 54.609083 | 35.388200 | 42.936165 | 41.552401 | 79.750418 |
std | 3.979299 | 4.069813 | 3.254576 | 4.341321 | 9.022034 | 31.149806 | 5.114208 | 5.224361 | 4.151497 | 14.901088 |
min | 27.023333 | 20.463333 | 28.766667 | 27.660000 | 29.815000 | 1.000000 | 23.200000 | 29.600000 | 29.166667 | 24.000000 |
25% | 37.333333 | 37.900000 | 36.900000 | 35.530000 | 45.400000 | 30.025000 | 31.500000 | 39.066667 | 38.500000 | 70.333333 |
50% | 39.656667 | 40.500000 | 38.530000 | 38.400000 | 49.090000 | 55.290000 | 34.863333 | 42.375000 | 40.900000 | 83.666667 |
75% | 43.066667 | 43.260000 | 41.760000 | 42.156667 | 53.663333 | 83.226667 | 39.000000 | 46.536000 | 44.338095 | 91.666667 |
max | 63.360000 | 56.026667 | 50.163333 | 51.090000 | 96.321667 | 99.900000 | 51.400000 | 58.780000 | 53.326667 | 100.000000 |
# Temperature statistics:
df[temperature_column].describe()
T1 | T2 | T3 | T4 | T5 | T6 | T7 | T8 | T9 | T_out | Tdewpoint | |
---|---|---|---|---|---|---|---|---|---|---|---|
count | 19735.000000 | 19735.000000 | 19735.000000 | 19735.000000 | 19735.000000 | 19735.000000 | 19735.000000 | 19735.000000 | 19735.000000 | 19735.000000 | 19735.000000 |
mean | 21.686571 | 20.341219 | 22.267611 | 20.855335 | 19.592106 | 7.910939 | 20.267106 | 22.029107 | 19.485828 | 7.411665 | 3.760707 |
std | 1.606066 | 2.192974 | 2.006111 | 2.042884 | 1.844623 | 6.090347 | 2.109993 | 1.956162 | 2.014712 | 5.317409 | 4.194648 |
min | 16.790000 | 16.100000 | 17.200000 | 15.100000 | 15.330000 | -6.065000 | 15.390000 | 16.306667 | 14.890000 | -5.000000 | -6.600000 |
25% | 20.760000 | 18.790000 | 20.790000 | 19.530000 | 18.277500 | 3.626667 | 18.700000 | 20.790000 | 18.000000 | 3.666667 | 0.900000 |
50% | 21.600000 | 20.000000 | 22.100000 | 20.666667 | 19.390000 | 7.300000 | 20.033333 | 22.100000 | 19.390000 | 6.916667 | 3.433333 |
75% | 22.600000 | 21.500000 | 23.290000 | 22.100000 | 20.619643 | 11.256000 | 21.600000 | 23.390000 | 20.600000 | 10.408333 | 6.566667 |
max | 26.260000 | 29.856667 | 29.236000 | 26.200000 | 25.795000 | 28.290000 | 26.000000 | 27.230000 | 24.500000 | 26.100000 | 15.500000 |
# Other column statistics:
df[other].describe()
Appliances | lights | Press_mm_hg | Windspeed | Visibility | rv1 | rv2 | |
---|---|---|---|---|---|---|---|
count | 19735.000000 | 19735.000000 | 19735.000000 | 19735.000000 | 19735.000000 | 19735.000000 | 19735.000000 |
mean | 97.694958 | 3.801875 | 755.522602 | 4.039752 | 38.330834 | 24.988033 | 24.988033 |
std | 102.524891 | 7.935988 | 7.399441 | 2.451221 | 11.794719 | 14.496634 | 14.496634 |
min | 10.000000 | 0.000000 | 729.300000 | 0.000000 | 1.000000 | 0.005322 | 0.005322 |
25% | 50.000000 | 0.000000 | 750.933333 | 2.000000 | 29.000000 | 12.497889 | 12.497889 |
50% | 60.000000 | 0.000000 | 756.100000 | 3.666667 | 40.000000 | 24.897653 | 24.897653 |
75% | 100.000000 | 0.000000 | 760.933333 | 5.500000 | 40.000000 | 37.583769 | 37.583769 |
max | 1080.000000 | 70.000000 | 772.300000 | 14.000000 | 66.000000 | 49.996530 | 49.996530 |
The first, second, and third quartiles of the lights column are all 0, which means most of its values are 0. As shown below, more than 77% of the records in this column are 0, so the column carries little information for the prediction.
# Counting values of the "lights" column:
df['lights'].value_counts(normalize=True)
lights
0     0.772840
10    0.112085
20    0.082290
30    0.028325
40    0.003902
50    0.000456
70    0.000051
60    0.000051
Name: proportion, dtype: float64
# Dropping the lights column:
df.drop(columns='lights', inplace=True)
Outliers
Looking at the statistics of the "other" columns, we can see that there are some outliers in Visibility, Windspeed, and Appliances. Let's draw a box plot for each of them to take a closer look at the outliers.
# Using boxplot on Visibility, Appliances and Windspeed to visualize the outliers:
fig_sub = make_subplots(rows=1, cols=3, shared_yaxes=False)
fig_sub.add_trace(
go.Box(y=df['Appliances'].values,
name='Appliances'
),
row=1, col=1
)
fig_sub.add_trace(
go.Box(y=df['Windspeed'].values,
name='Windspeed'
),
row=1, col=2
)
fig_sub.add_trace(
go.Box(y=df['Visibility'].values,
name='Visibility'
),
row=1, col=3
)
fig_sub.show("png",width=1500, height=500)
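Box-plot whiskers conventionally extend 1.5×IQR beyond the quartiles, so we can also count the flagged points directly. A quick sketch of that rule (an illustration, not part of the original pipeline):
# Count points beyond the 1.5*IQR fences for each plotted column:
for col in ["Appliances", "Windspeed", "Visibility"]:
    q1, q3 = df[col].quantile([0.25, 0.75])
    iqr = q3 - q1
    lower, upper = q1 - 1.5 * iqr, q3 + 1.5 * iqr
    n_out = ((df[col] < lower) | (df[col] > upper)).sum()
    print(f"{col}: {n_out} points outside [{lower:.2f}, {upper:.2f}]")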
Distributions
A quick way to get an idea of a variable's distribution is to plot a histogram.
# Visualizing distributions using Histograms:
df.hist(figsize=(17, 20), grid=False);
QQ-Plots
# Temperatures:
fig, ax = plt.subplots(nrows=3, ncols=4, sharex=False, sharey=False, figsize=(20, 17))
fig.suptitle("QQ-Plot for Temperatures")
for axis, col in zip(ax.flat, temperature_column):
    qqplot(df[col], ax=axis, line="s")
    axis.set_title(col)
# Hide the unused 12th panel:
ax[2, 3].set_visible(False);
# Humidity:
fig, ax = plt.subplots(nrows=2, ncols=5, sharex=False, sharey=False, figsize=(20, 15))
fig.suptitle("QQ-Plot for Humidity")
for axis, col in zip(ax.flat, humidity_column):
    qqplot(df[col], ax=axis, line="s")
    axis.set_title(col)
# Other columns:
other_qq = ["Appliances", "Windspeed", "Visibility", "Press_mm_hg"]
fig, ax = plt.subplots(nrows=1, ncols=4, sharex=False, sharey=False, figsize=(15, 5))
fig.suptitle("QQ-Plot for Other Columns")
for axis, col in zip(ax, other_qq):
    qqplot(df[col], ax=axis, line="s")
    axis.set_title(col)
Normality test
Even though normality tests are very sensitive when the sample size is large, we will still apply the Shapiro-Wilk test to see whether it detects normality.
Obs:
1 - If the p-value is smaller than 0.05, we cannot assume a normal distribution.
2 - If the p-value is larger than 0.05, we cannot reject the assumption of a normal distribution.
3 - For samples larger than 5000 observations, the reported p-value is only an approximation, which is why we test on a 5,000-row sample (see the quick illustration below).
# Extracting a random sample of 5000 records from our dataset:
df_sampled = df.sample(n=5000, random_state=42)
# Using the Shapiro-Wilk test for normality evaluation:
for c in df.columns:
    w_statistic, p_value = shapiro(df_sampled[c])
    print(f'Test for {c}: (statistic={w_statistic}, p-value={p_value})')
Test for Appliances: (statistic=0.5849369764328003, p-value=0.0)
Test for T1: (statistic=0.9911403059959412, p-value=3.733412196820106e-17)
Test for RH_1: (statistic=0.9821624159812927, p-value=1.2745344613958403e-24)
Test for T2: (statistic=0.9549137353897095, p-value=9.466583207000185e-37)
Test for RH_2: (statistic=0.9911195039749146, p-value=3.5440305190650705e-17)
Test for T3: (statistic=0.980573832988739, p-value=1.2472995316811506e-25)
Test for RH_3: (statistic=0.9624248743057251, p-value=3.6429414818547343e-34)
Test for T4: (statistic=0.9901070594787598, p-value=3.099034646471756e-18)
Test for RH_4: (statistic=0.9676409363746643, p-value=4.054767846141661e-32)
Test for T5: (statistic=0.9707837700843811, p-value=9.308487738172878e-31)
Test for RH_5: (statistic=0.8472362160682678, p-value=0.0)
Test for T6: (statistic=0.9771420359611511, p-value=1.2924734819367967e-27)
Test for RH_6: (statistic=0.9391845464706421, p-value=3.284923860070236e-41)
Test for T7: (statistic=0.9850450754165649, p-value=1.326130338831014e-22)
Test for RH_7: (statistic=0.9879186153411865, p-value=2.763621412532176e-20)
Test for T8: (statistic=0.992936909198761, p-value=4.788611405326252e-15)
Test for RH_8: (statistic=0.986628532409668, p-value=2.26813368248502e-21)
Test for T9: (statistic=0.9759417772293091, p-value=2.9571197811545564e-28)
Test for RH_9: (statistic=0.9775705933570862, p-value=2.2199535800910687e-27)
Test for T_out: (statistic=0.981996476650238, p-value=9.927184389558388e-25)
Test for Press_mm_hg: (statistic=0.9866811633110046, p-value=2.502911208679081e-21)
Test for RH_out: (statistic=0.9219866394996643, p-value=4.203895392974451e-45)
Test for Windspeed: (statistic=0.9285374879837036, p-value=1.0089348943138683e-43)
Test for Visibility: (statistic=0.9228914380073547, p-value=5.605193857299268e-45)
Test for Tdewpoint: (statistic=0.9919402599334717, p-value=2.9564435650314153e-16)
Test for rv1: (statistic=0.9540770053863525, p-value=5.1268318158055655e-37)
Test for rv2: (statistic=0.9540770053863525, p-value=5.1268318158055655e-37)
Correlations
We will use Pearson's correlation, given the type and nature of the variables (numeric target and numeric features).
# Function to plot a heatmap of correlations:
def plot_correlation(dataframe):
    correlations = dataframe.corr()
    # Mask the upper triangle so each pair appears only once:
    mask = np.zeros_like(correlations)
    mask[np.triu_indices_from(mask)] = True
    plt.figure(figsize=(27, 10))
    sns.heatmap(correlations, annot=True, mask=mask, cmap="crest");
# Plotting a Heatmap with all the correlations:
plot_correlation(df)
Conclusions
1 - As a result of the Shapiro-Wilk test, we cannot assume that any feature follows a normal distribution (all p-values are below 0.05).
2 - T6 and T_out are highly correlated (0.97), probably because both were measured outside the building.
3 - All temperatures are significantly correlated with each other, and T9 in particular has several correlations above 0.9 (the pairs above 0.9 are listed programmatically below).
4 - The two random variables (rv1 and rv2) are essentially identical, and their correlation with the target is close to 0, so we can exclude both from the dataset.
5 - The QQ-plots and histograms confirm that no variable follows a normal distribution.
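Conclusions 2-4 can be checked programmatically by listing the feature pairs whose absolute Pearson correlation exceeds 0.9 (a small sketch over the same correlation matrix):
# Extract feature pairs with |Pearson correlation| above 0.9:
corr = df.corr().abs()
upper_mask = np.triu(np.ones(corr.shape, dtype=bool), k=1)  # keep each pair once
pairs = corr.where(upper_mask).stack()
print(pairs[pairs > 0.9].sort_values(ascending=False))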
# We are going to drop some columns because of their correlation analysis:
features = df.drop(columns=['rv1', 'rv2', 'T_out', 'T9'])
We will transform the date column into a binary feature representing weekends (1) and weekdays (0).
# Reset the index:
features_eng = features.reset_index()
# Extracting the day of the week (0 = Monday, ..., 6 = Sunday); we will map it to a binary feature next:
features_eng['Weekend'] = features_eng['date'].dt.dayofweek
# Printing the dataset
features_eng.head()
date | Appliances | T1 | RH_1 | T2 | RH_2 | T3 | RH_3 | T4 | RH_4 | T5 | RH_5 | T6 | RH_6 | T7 | RH_7 | T8 | RH_8 | RH_9 | Press_mm_hg | RH_out | Windspeed | Visibility | Tdewpoint | Weekend | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2016-01-11 17:00:00 | 60 | 19.89 | 47.596667 | 19.2 | 44.790000 | 19.79 | 44.730000 | 19.000000 | 45.566667 | 17.166667 | 55.20 | 7.026667 | 84.256667 | 17.200000 | 41.626667 | 18.2 | 48.900000 | 45.53 | 733.5 | 92.0 | 7.000000 | 63.000000 | 5.3 | 0 |
1 | 2016-01-11 17:10:00 | 60 | 19.89 | 46.693333 | 19.2 | 44.722500 | 19.79 | 44.790000 | 19.000000 | 45.992500 | 17.166667 | 55.20 | 6.833333 | 84.063333 | 17.200000 | 41.560000 | 18.2 | 48.863333 | 45.56 | 733.6 | 92.0 | 6.666667 | 59.166667 | 5.2 | 0 |
2 | 2016-01-11 17:20:00 | 50 | 19.89 | 46.300000 | 19.2 | 44.626667 | 19.79 | 44.933333 | 18.926667 | 45.890000 | 17.166667 | 55.09 | 6.560000 | 83.156667 | 17.200000 | 41.433333 | 18.2 | 48.730000 | 45.50 | 733.7 | 92.0 | 6.333333 | 55.333333 | 5.1 | 0 |
3 | 2016-01-11 17:30:00 | 50 | 19.89 | 46.066667 | 19.2 | 44.590000 | 19.79 | 45.000000 | 18.890000 | 45.723333 | 17.166667 | 55.09 | 6.433333 | 83.423333 | 17.133333 | 41.290000 | 18.1 | 48.590000 | 45.40 | 733.8 | 92.0 | 6.000000 | 51.500000 | 5.0 | 0 |
4 | 2016-01-11 17:40:00 | 60 | 19.89 | 46.333333 | 19.2 | 44.530000 | 19.79 | 45.000000 | 18.890000 | 45.530000 | 17.200000 | 55.09 | 6.366667 | 84.893333 | 17.200000 | 41.230000 | 18.1 | 48.590000 | 45.40 | 733.9 | 92.0 | 5.666667 | 47.666667 | 4.9 | 0 |
# Before the mapping:
features_eng['Weekend'].value_counts()
Weekend
1    2880
2    2880
3    2880
4    2845
0    2778
5    2736
6    2736
Name: count, dtype: int64
# Mapping the day of the week to 1 for weekends (Saturday=5, Sunday=6) and 0 otherwise:
features_eng['Weekend'] = features_eng['Weekend'].apply(lambda x: 1 if x in (5, 6) else 0)
features_eng['Weekend'].value_counts()
Weekend
0    14263
1     5472
Name: count, dtype: int64
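As a side note, the same mapping can be done in one vectorized expression (an equivalent idiom, shown here only as a check):
# Equivalent vectorized mapping: dayofweek >= 5 selects Saturday (5) and Sunday (6):
weekend_alt = (features_eng['date'].dt.dayofweek >= 5).astype(int)
print((weekend_alt == features_eng['Weekend']).all())  # True: same result as apply()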
# Setting date as index again:
features_eng.set_index('date', inplace=True)
# Printing the dataset to see the changed column
features_eng.head()
Appliances | T1 | RH_1 | T2 | RH_2 | T3 | RH_3 | T4 | RH_4 | T5 | RH_5 | T6 | RH_6 | T7 | RH_7 | T8 | RH_8 | RH_9 | Press_mm_hg | RH_out | Windspeed | Visibility | Tdewpoint | Weekend | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
date | ||||||||||||||||||||||||
2016-01-11 17:00:00 | 60 | 19.89 | 47.596667 | 19.2 | 44.790000 | 19.79 | 44.730000 | 19.000000 | 45.566667 | 17.166667 | 55.20 | 7.026667 | 84.256667 | 17.200000 | 41.626667 | 18.2 | 48.900000 | 45.53 | 733.5 | 92.0 | 7.000000 | 63.000000 | 5.3 | 0 |
2016-01-11 17:10:00 | 60 | 19.89 | 46.693333 | 19.2 | 44.722500 | 19.79 | 44.790000 | 19.000000 | 45.992500 | 17.166667 | 55.20 | 6.833333 | 84.063333 | 17.200000 | 41.560000 | 18.2 | 48.863333 | 45.56 | 733.6 | 92.0 | 6.666667 | 59.166667 | 5.2 | 0 |
2016-01-11 17:20:00 | 50 | 19.89 | 46.300000 | 19.2 | 44.626667 | 19.79 | 44.933333 | 18.926667 | 45.890000 | 17.166667 | 55.09 | 6.560000 | 83.156667 | 17.200000 | 41.433333 | 18.2 | 48.730000 | 45.50 | 733.7 | 92.0 | 6.333333 | 55.333333 | 5.1 | 0 |
2016-01-11 17:30:00 | 50 | 19.89 | 46.066667 | 19.2 | 44.590000 | 19.79 | 45.000000 | 18.890000 | 45.723333 | 17.166667 | 55.09 | 6.433333 | 83.423333 | 17.133333 | 41.290000 | 18.1 | 48.590000 | 45.40 | 733.8 | 92.0 | 6.000000 | 51.500000 | 5.0 | 0 |
2016-01-11 17:40:00 | 60 | 19.89 | 46.333333 | 19.2 | 44.530000 | 19.79 | 45.000000 | 18.890000 | 45.530000 | 17.200000 | 55.09 | 6.366667 | 84.893333 | 17.200000 | 41.230000 | 18.1 | 48.590000 | 45.40 | 733.9 | 92.0 | 5.666667 | 47.666667 | 4.9 | 0 |
We need to verify, using a hypothesis test, whether the new categorical feature (Weekend) has any relation with the numeric target (Appliances). For that, we will use the two-sample t-test, which tests the difference between the means of the two groups defined by our categorical feature.
Null hypothesis: there is no difference between the energy consumed on weekends and on weekdays.
Alternative hypothesis: energy consumption on weekends is different from that on weekdays.
OBS: the two-sample t-test assumes that our samples come from a normal distribution, so we need to check that assumption before applying it to our data; a sketch of the test itself follows.
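For reference, here is what the test itself would look like, sketched on synthetic normal data (ttest_ind is SciPy's two-sample t-test; the arrays below are illustrative, not our samples):
from scipy.stats import ttest_ind
# Illustrative only: two synthetic samples that do satisfy the normality assumption.
rng = np.random.default_rng(42)
group_a = rng.normal(loc=100, scale=15, size=300)
group_b = rng.normal(loc=95, scale=15, size=300)
# Welch's variant (equal_var=False) also drops the equal-variance assumption:
t_stat, p_value = ttest_ind(group_a, group_b, equal_var=False)
print(f"t = {t_stat:.3f}, p = {p_value:.4f}")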
# Sampling the groups:
weekdays = features_eng[features_eng['Weekend'] == 0].sample(n=300, random_state=42)['Appliances']
weekend = features_eng[features_eng['Weekend'] == 1].sample(n=300, random_state=42)['Appliances']
# Normality test:
print(f"Weekdays: {shapiro(weekdays)}")
print(f"Weekends: {shapiro(weekend)}")
Weekdays: ShapiroResult(statistic=0.5949463844299316, pvalue=6.323979251306852e-26)
Weekends: ShapiroResult(statistic=0.681671142578125, pvalue=2.1151544435960978e-23)
Since the p-values of the Shapiro-Wilk test were below 0.05, we cannot assume the samples come from a normal distribution. Therefore, we won't use the t-test to evaluate their relation.
As an alternative, we will use a non-parametric test, the Wilcoxon rank-sum test, which does not assume normality.
Null hypothesis: the two sets of measurements come from the same distribution.
Alternative hypothesis: measurements in one sample tend to be larger than those in the other sample.
# Wilcoxon rank-sum statistic for two samples:
ranksums(weekend, weekdays, alternative="greater")
RanksumsResult(statistic=2.3616550407154433, pvalue=0.00909678130539973)
Since the p-value is less than 0.05, we can reject the null hypothesis and conclude that the energy consumed on weekends tends to be larger than the energy consumed on weekdays.
Now we can quantify this relation with the point-biserial correlation, which returns a statistic (R value) between -1 and 1 measuring the strength of the relationship between the variables, along with a p-value.
# We will use the point-biserial correlation to assess whether there is a correlation between the categorical and numeric variables:
point_bisserial_cor = pointbiserialr(features_eng['Weekend'], features_eng['Appliances'])
point_bisserial_cor
SignificanceResult(statistic=0.017437050668854755, pvalue=0.014301067218672008)
The statistic (R value) is only 0.0174: the correlation between these variables is positive but very weak, even though the p-value (< 0.05) indicates it is statistically significant.
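As a sanity check, the point-biserial correlation with a 0/1 variable is mathematically identical to Pearson's r, so NumPy should reproduce the statistic:
# Point-biserial r with a binary variable equals Pearson's r:
pearson_r = np.corrcoef(features_eng['Weekend'], features_eng['Appliances'])[0, 1]
print(pearson_r)  # ~0.0174, matching the statistic above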
# Now, we need to separate the target from the whole dataset:
target = df["Appliances"]
features_eng.drop(columns='Appliances', inplace=True)
# Class that puts together several feature selection techniques:
class feature_selector:
    seed = 42

    def __init__(self, X, y) -> None:
        self.X_train = X
        self.y_train = y

    def randomforestR_imp(self) -> None:
        model = RandomForestRegressor(random_state=feature_selector.seed)
        model.fit(self.X_train, self.y_train)
        series = pd.Series(index=model.feature_names_in_, data=model.feature_importances_).sort_values(ascending=False)
        # Plotting Random Forest feature importance:
        plt.title("RandomForest Importance")
        plt.xlabel("Importance")
        sns.barplot(x=series.values, y=series.index)

    def xgbR_imp(self) -> None:
        model = XGBRegressor(random_state=feature_selector.seed)
        model.fit(self.X_train, self.y_train)
        series = pd.Series(index=model.feature_names_in_, data=model.feature_importances_).sort_values(ascending=False)
        # Plotting XGBoost feature importance:
        plt.title("XGBoost Feature Importance")
        plt.xlabel("Importance")
        sns.barplot(y=series.index, x=series.values)

    # Univariate feature selection:
    def univariate(self, statistic, n="all"):
        selector = SelectKBest(score_func=statistic, k=n)
        selector.fit(self.X_train, self.y_train)
        series = pd.Series(index=selector.feature_names_in_, data=selector.scores_).sort_values(ascending=False)
        plt.title("F-Regression Filtering")
        plt.xlabel("F-score")
        sns.barplot(y=series.index, x=series.values)
        return selector

    # Wrapper method (RFECV) for feature selection:
    def refcv(self):
        models = {
            "Lasso": Lasso(random_state=feature_selector.seed),
            "Ridge": Ridge(random_state=feature_selector.seed),
            "RandomForestR": RandomForestRegressor(random_state=feature_selector.seed),
            "XGB": XGBRegressor(random_state=feature_selector.seed)
        }
        splits = 10
        cross = KFold(n_splits=splits, random_state=feature_selector.seed, shuffle=True)
        ind = [f"Columns {i}" for i in range(1, len(self.X_train.columns) + 1)]
        df = pd.DataFrame(index=ind)
        minimo = np.inf
        name = ""
        for key, model in models.items():
            rfecv = RFECV(estimator=model, step=1, cv=cross, min_features_to_select=1, scoring="neg_mean_squared_error")
            rfecv.fit(self.X_train, self.y_train)
            # Convert negative MSE back to RMSE for each number of selected features:
            root_mean = np.sqrt(-rfecv.cv_results_["mean_test_score"])
            df[key] = root_mean
            best_value = root_mean[np.argmin(root_mean)]
            # Keep the feature mask of the best-scoring estimator so far:
            if minimo > best_value:
                minimo = best_value
                best_features = rfecv.support_
                name = key
        df_features = pd.DataFrame(columns=self.X_train.columns, data=best_features.reshape(1, -1), index=[name])
        return df, df_features
# Splitting into train and test sets:
X_train, X_test, y_train, y_test = train_test_split(features_eng, target, test_size=0.25, random_state=42)
# Printing shapes:
print("Shapes:")
print(f"X_Train set: {X_train.shape}")
print(f"y_Train set: {y_train.shape}\n")
print(f"X_Test set: {X_test.shape}")
print(f"y_Test set: {y_test.shape}")
Shapes:
X_Train set: (14801, 23)
y_Train set: (14801,)

X_Test set: (4934, 23)
y_Test set: (4934,)
# Plotting the most important features according to the Random Forest algorithm:
feature = feature_selector(X_train, y_train)
feature.randomforestR_imp()
# Plotting the most important features according to the XGBoost algorithm:
feature.xgbR_imp()
# Mean test RMSE for each algorithm and each number of selected columns:
df_results, df_features = feature.refcv()
df_results
Lasso | Ridge | RandomForestR | XGB | |
---|---|---|---|---|
Columns 1 | 103.056148 | 103.055816 | 113.304770 | 105.212470 |
Columns 2 | 100.590495 | 100.587456 | 95.788990 | 100.936336 |
Columns 3 | 100.435432 | 100.422634 | 79.751147 | 93.266782 |
Columns 4 | 98.950192 | 98.890011 | 77.808713 | 87.317947 |
Columns 5 | 98.850838 | 98.801902 | 76.062723 | 83.934313 |
Columns 6 | 98.366952 | 98.609354 | 74.597922 | 81.859767 |
Columns 7 | 98.262228 | 98.447257 | 74.254578 | 79.656813 |
Columns 8 | 96.880590 | 98.119044 | 73.701069 | 79.038286 |
Columns 9 | 96.657324 | 96.992252 | 72.525388 | 78.179304 |
Columns 10 | 96.586050 | 96.554785 | 72.050913 | 76.868434 |
Columns 11 | 96.508091 | 96.474571 | 71.308990 | 76.758451 |
Columns 12 | 96.443505 | 96.458674 | 71.200546 | 76.060537 |
Columns 13 | 96.456390 | 96.404750 | 71.278944 | 75.356031 |
Columns 14 | 96.427961 | 96.318593 | 71.050404 | 75.967296 |
Columns 15 | 96.439207 | 96.328227 | 70.571489 | 75.322817 |
Columns 16 | 96.440432 | 96.332470 | 70.415103 | 75.650811 |
Columns 17 | 96.323125 | 96.303376 | 70.315029 | 77.008334 |
Columns 18 | 96.288128 | 96.228437 | 70.110516 | 76.239957 |
Columns 19 | 96.286269 | 96.213253 | 69.846299 | 75.334601 |
Columns 20 | 96.264342 | 96.200491 | 69.792471 | 75.395966 |
Columns 21 | 96.230583 | 96.144727 | 69.709099 | 75.173989 |
Columns 22 | 96.224150 | 96.079493 | 69.575768 | 75.670323 |
Columns 23 | 96.224143 | 96.047953 | 69.628186 | 75.824829 |
# Best features selected by RFECV:
df_features
T1 | RH_1 | T2 | RH_2 | T3 | RH_3 | T4 | RH_4 | T5 | RH_5 | T6 | RH_6 | T7 | RH_7 | T8 | RH_8 | RH_9 | Press_mm_hg | RH_out | Windspeed | Visibility | Tdewpoint | Weekend | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
RandomForestR | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | True | False |
Conclusions
The lowest cross-validated RMSE came from Random Forest using every feature except Weekend (see the table above), so we will drop the Weekend column.
# Removing Weekend column:
X_train = X_train.drop(columns="Weekend")
X_test = X_test.drop(columns="Weekend")
# Function used to evaluate and compare the algorithms with cross-validation:
def melhor_modelo(X_train, y_train, dicionario):
    seed = 42
    cv = 5
    score = ['neg_root_mean_squared_error', 'r2']
    result_rmse = {}
    result_r2 = {}
    for name, model in dicionario.items():
        k_fold = KFold(n_splits=cv, random_state=seed, shuffle=True)
        result = cross_validate(model, X_train, y_train, cv=k_fold, scoring=score)
        # Flip the sign so the RMSE is reported as a positive number:
        result_rmse[name] = -result['test_neg_root_mean_squared_error']
        result_r2[name] = result['test_r2']
    result_pd_rmse = pd.DataFrame(data=result_rmse)
    result_pd_r2 = pd.DataFrame(data=result_r2)
    return result_pd_rmse, result_pd_r2
# Dictionary of algorithms:
modelos_dicionario = {
"Lasso":Lasso(random_state=42),
"Ridge":Ridge(random_state=42),
"SVR":SVR(),
"RandomForest":RandomForestRegressor(random_state=42),
"XGB":XGBRegressor(random_state=42),
"MLP":MLPRegressor(random_state=42, max_iter=2500)
}
# MinMax Scaler:
min_max_scaler = MinMaxScaler()
X_train_min_max = min_max_scaler.fit_transform(X_train)
X_test_min_max = min_max_scaler.transform(X_test)
# Result of the function melhor_modelo:
resultados_min_max_rmse, resultados_min_max_r2 = melhor_modelo(X_train_min_max, y_train, modelos_dicionario)
# Boxplot of the results for analysis of the algorithms, using MinMax Scaler:
fig = px.box(resultados_min_max_rmse, title= "Cross Validation result for MinMax Scaler(RMSE)")
fig.show("png",width=1500, height=500)
# Models statistics for RMSE metric, using MinMax Scaler:
resultados_min_max_rmse.describe()
Lasso | Ridge | SVR | RandomForest | XGB | MLP | |
---|---|---|---|---|---|---|
count | 5.000000 | 5.000000 | 5.000000 | 5.000000 | 5.000000 | 5.000000 |
mean | 101.339171 | 96.103956 | 104.131297 | 71.032976 | 76.014748 | 92.804121 |
std | 3.224101 | 3.444759 | 3.413926 | 4.005136 | 4.020095 | 3.754998 |
min | 98.302072 | 93.081523 | 101.107995 | 65.284468 | 70.757023 | 89.253211 |
25% | 98.716873 | 93.238649 | 101.306166 | 68.550961 | 72.959072 | 89.673112 |
50% | 100.713914 | 95.121696 | 103.623066 | 72.564129 | 77.390410 | 92.182401 |
75% | 102.847561 | 97.872060 | 105.184074 | 74.088113 | 78.560237 | 94.594520 |
max | 106.115436 | 101.205853 | 109.435184 | 74.677209 | 80.406997 | 98.317360 |
# Models statistics for R2 metric, using MinMax Scaler:
resultados_min_max_r2.describe()
Lasso | Ridge | SVR | RandomForest | XGB | MLP | |
---|---|---|---|---|---|---|
count | 5.000000 | 5.000000 | 5.000000 | 5.000000 | 5.000000 | 5.000000 |
mean | 0.039671 | 0.136466 | -0.013942 | 0.528097 | 0.459751 | 0.194885 |
std | 0.003774 | 0.010826 | 0.006957 | 0.032454 | 0.030597 | 0.016729 |
min | 0.035541 | 0.123872 | -0.024401 | 0.499513 | 0.432473 | 0.173169 |
25% | 0.036807 | 0.126600 | -0.017478 | 0.501051 | 0.437268 | 0.184117 |
50% | 0.038851 | 0.140026 | -0.011263 | 0.522985 | 0.446976 | 0.194792 |
75% | 0.043069 | 0.142624 | -0.008779 | 0.538550 | 0.477295 | 0.210372 |
max | 0.044087 | 0.149205 | -0.007789 | 0.578388 | 0.504741 | 0.211973 |
Random Forest was the best algorithm when using the MinMax Scaler.
# Standard Scaler:
std_scaler = StandardScaler()
X_train_std = std_scaler.fit_transform(X_train)
X_test_std = std_scaler.transform(X_test)
# Result of the function melhor_modelo:
result_std_rmse, result_std_r2 = melhor_modelo(X_train_std, y_train, modelos_dicionario)
# Boxplot of the results for analysis of the algorithms, using Standard Scaler:
fig = px.box(result_std_rmse, title="Cross validation result for Standard Scaler(RMSE)")
fig.show('png', width=1500, height=500)
# Models statistics for RMSE metric, using Standard Scaler:
result_std_rmse.describe()
Lasso | Ridge | SVR | RandomForest | XGB | MLP | |
---|---|---|---|---|---|---|
count | 5.000000 | 5.000000 | 5.000000 | 5.000000 | 5.000000 | 5.000000 |
mean | 97.027618 | 96.061407 | 103.592166 | 71.036226 | 76.007873 | 84.539414 |
std | 3.356449 | 3.525190 | 3.418535 | 4.103174 | 3.934746 | 3.118004 |
min | 93.827089 | 93.035937 | 100.470340 | 65.167256 | 70.906749 | 80.339979 |
25% | 94.368913 | 93.086069 | 100.854924 | 68.452633 | 72.942371 | 82.763725 |
50% | 96.310955 | 95.063921 | 103.064621 | 72.642128 | 77.308188 | 84.498308 |
75% | 98.639844 | 97.797212 | 104.683744 | 74.137774 | 78.651667 | 87.536390 |
max | 101.991288 | 101.323894 | 108.887199 | 74.781337 | 80.230393 | 87.558671 |
# Models statistics for R2 metric, using Standard Scaler:
result_std_r2.describe()
Lasso | Ridge | SVR | RandomForest | XGB | MLP | |
---|---|---|---|---|---|---|
count | 5.000000 | 5.000000 | 5.000000 | 5.000000 | 5.000000 | 5.000000 |
mean | 0.119752 | 0.137261 | -0.003458 | 0.528049 | 0.459843 | 0.331569 |
std | 0.008085 | 0.011838 | 0.006852 | 0.033580 | 0.029618 | 0.022830 |
min | 0.110221 | 0.121827 | -0.014168 | 0.498842 | 0.433679 | 0.300973 |
25% | 0.112843 | 0.127935 | -0.006540 | 0.499978 | 0.435958 | 0.323437 |
50% | 0.121052 | 0.142839 | 0.000796 | 0.521654 | 0.449403 | 0.327367 |
75% | 0.125508 | 0.143666 | 0.001169 | 0.539872 | 0.477534 | 0.344559 |
max | 0.129137 | 0.150038 | 0.001452 | 0.579901 | 0.502643 | 0.361506 |
Random Forest was the best algorithm when using the Standard Scaler.
# Robust Scaler:
rb_scaler = RobustScaler()
X_train_rb = rb_scaler.fit_transform(X_train)
X_test_rb = rb_scaler.transform(X_test)
# Result of the function melhor_modelo:
result_rb_rmse, result_rb_r2= melhor_modelo(X_train_rb, y_train, modelos_dicionario)
# Boxplot of the results for analysis of the algorithms, using Robust Scaler:
fig = px.box(result_rb_rmse, title="Cross validation results for Robust Scaler(RMSE)")
fig.show('png', width=1500, height=500)
# Models statistics for RMSE metric, using Robust Scaler:
result_rb_rmse.describe()
Lasso | Ridge | SVR | RandomForest | XGB | MLP | |
---|---|---|---|---|---|---|
count | 5.000000 | 5.000000 | 5.000000 | 5.000000 | 5.000000 | 5.000000 |
mean | 97.291404 | 96.061271 | 103.746001 | 71.066650 | 76.120590 | 84.622211 |
std | 3.351912 | 3.523836 | 3.443017 | 4.029722 | 4.007752 | 3.226213 |
min | 94.010451 | 93.036021 | 100.599875 | 65.296841 | 70.926901 | 80.310477 |
25% | 94.724338 | 93.087422 | 100.977465 | 68.562893 | 72.939596 | 83.203670 |
50% | 96.582075 | 95.064305 | 103.235908 | 72.564676 | 77.667119 | 83.948702 |
75% | 98.899118 | 97.797581 | 104.842294 | 74.140519 | 78.682726 | 87.633504 |
max | 102.241039 | 101.321028 | 109.074463 | 74.768323 | 80.386606 | 88.014702 |
# Models statistics for R2 metric, using Robust Scaler:
result_rb_r2.describe()
Lasso | Ridge | SVR | RandomForest | XGB | MLP | |
---|---|---|---|---|---|---|
count | 5.000000 | 5.000000 | 5.000000 | 5.000000 | 5.000000 | 5.000000 |
mean | 0.114953 | 0.137263 | -0.006433 | 0.527657 | 0.458222 | 0.330232 |
std | 0.008090 | 0.011819 | 0.007255 | 0.032586 | 0.031014 | 0.025566 |
min | 0.105858 | 0.121877 | -0.017659 | 0.498805 | 0.428408 | 0.293672 |
25% | 0.108173 | 0.127929 | -0.009889 | 0.501044 | 0.435512 | 0.320197 |
50% | 0.116096 | 0.142814 | -0.002233 | 0.521820 | 0.447257 | 0.332210 |
75% | 0.118908 | 0.143659 | -0.001260 | 0.538389 | 0.477574 | 0.343104 |
max | 0.125730 | 0.150037 | -0.001124 | 0.578228 | 0.502360 | 0.361975 |
Random Forest was the best algorithm when using the Robust Scaler.
As the boxplots above show, Random Forest was the best algorithm for all of the transformations. Now we will tune Random Forest models for each transformation using GridSearchCV.
# Function for tuning an arbitrary model with GridSearchCV:
def tuning(X_train, y_train, modelo, params):
    cv = 5
    score = "neg_root_mean_squared_error"
    grid = GridSearchCV(modelo, cv=cv, param_grid=params,
                        scoring=score,
                        n_jobs=-1,
                        return_train_score=True,
                        )
    grid.fit(X_train, y_train)
    best_index = grid.best_index_
    result = grid.cv_results_
    # Flip the sign so the RMSE is reported as a positive number:
    train_score = -result['mean_train_score'][best_index]
    left_out = -result['mean_test_score'][best_index]
    print(f"Train score: {train_score}")
    print(f"Left out data score: {left_out}")
    return grid.best_estimator_
# Parameter grid:
params_grid = {
"n_estimators":[100, 120, 130],
"max_depth":[11, 12],
"min_samples_split":[2, 5, 10],
"max_features":[0.5, 0.6, 0.7]
}
# Tuning a Random Forest using MinMax Scaler:
model_rnd = RandomForestRegressor(random_state=42)
print(f"Random Forest - MinMax Scaler:")
best_estimator_rnd_min_max = tuning(X_train_min_max, y_train, model_rnd, params_grid)
Random Forest - MinMax Scaler:
Train score: 55.20977987629416
Left out data score: 78.2570080306544
# Best Random Forest model:
best_estimator_rnd_min_max
RandomForestRegressor(max_depth=12, max_features=0.5, random_state=42)
# Tuning a Random Forest using Standard Scaler:
model_rnd = RandomForestRegressor(random_state=42)
print(f"Random Forest - Standard Scaler:")
best_estimator_rnd_std = tuning(X_train_std, y_train, model_rnd, params_grid)
Random Forest - Standard Scaler:
Train score: 55.20971673209787
Left out data score: 78.2527129431916
# Tuning a Random Forest using Robust Scaler:
model_rnd = RandomForestRegressor(random_state=42)
print(f"Random Forest - Robust Scaler:")
best_estimator_rnd_rb = tuning(X_train_rb, y_train, model_rnd, params_grid)
Random Forest - Robust Scaler:
Train score: 55.21031330462006
Left out data score: 78.25951265014629
Conclusions
The best model used the Standard Scaler with Random Forest.
Random Forest seems to suffer from overfitting: its train RMSE is well below its left-out RMSE.
Train score: 55.20971673209787
Left out data score: 78.2527129431916
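One way to see this gap directly is a validation curve over max_depth (a sketch; validation_curve is scikit-learn's helper for comparing train and cross-validation scores across one hyperparameter):
# Sketch: train vs. cross-validation RMSE as tree depth grows:
from sklearn.model_selection import validation_curve
depths = [4, 8, 12, 16]
train_scores, val_scores = validation_curve(
    RandomForestRegressor(random_state=42), X_train_std, y_train,
    param_name="max_depth", param_range=depths,
    scoring="neg_root_mean_squared_error", cv=5, n_jobs=-1,
)
for d, tr, va in zip(depths, -train_scores.mean(axis=1), -val_scores.mean(axis=1)):
    print(f"max_depth={d}: train RMSE={tr:.1f}, CV RMSE={va:.1f}")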
# Clone the parameters of the best Random Forest model:
rd_best_model = clone(best_estimator_rnd_std)
# Training a Random Forest model:
rd_best_model.fit(X_train_std, y_train)
RandomForestRegressor(max_depth=12, max_features=0.5, random_state=42)
# Predicting train set and test set:
y_pred_train = rd_best_model.predict(X_train_std)
y_pred_test = rd_best_model.predict(X_test_std)
# Calculating metrics for train set:
rmse_train = np.sqrt(mean_squared_error(y_train, y_pred_train))
r2_train = r2_score(y_train, y_pred_train)
# Calculating metrics for test set:
rmse_test = np.sqrt(mean_squared_error(y_test, y_pred_test))
r2_test = r2_score(y_test, y_pred_test)
# Train set results:
print("Train results:")
print(f"RMSE: {rmse_train}")
print(f"R2 score: {r2_train}")
Train results:
RMSE: 55.42828784895388
R2 score: 0.7129440480283424
# Test set results:
print("Test results:")
print(f"RMSE: {rmse_test}")
print(f"R2 score: {r2_test}")
Test results:
RMSE: 73.7462104247005
R2 score: 0.45252870157881486
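To see where the remaining error concentrates, a quick predicted-versus-actual scatter on the test set can help (a sketch using the matplotlib import from the top of the notebook):
# Scatter of predicted vs. actual appliance energy use on the test set:
plt.figure(figsize=(7, 7))
plt.scatter(y_test, y_pred_test, alpha=0.2)
# Red identity line marks perfect predictions:
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], color="red")
plt.xlabel("Actual Appliances")
plt.ylabel("Predicted Appliances")
plt.title("Test set: predicted vs. actual");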