The objective of this project is to employ various types of asteroid information to classify them as either hazardous or non-hazardous. The data, obtained from Kaggle, is meticulously curated by the Jet Propulsion Laboratory (JPL) of the California Institute of Technology, an esteemed institution operating under the auspices of NASA. The collected data will be utilized to train and apply machine learning algorithms specifically designed for classification purposes.
# Importing Libraries:
import pandas as pd
import numpy as np
from scipy.stats import kurtosis, skew, mannwhitneyu, chi2_contingency
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.base import clone
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, confusion_matrix, roc_curve, roc_auc_score
from sklearn.model_selection import StratifiedKFold, cross_validate, GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler, LabelEncoder
from sklearn.feature_selection import SelectKBest, RFECV
from imblearn.over_sampling import SMOTENC
from imblearn.pipeline import make_pipeline
import missingno as ms
import joblib
import os
# Loading Data:
df_raw = pd.read_csv("dataset.csv", dtype={"pdes":"str",
"name":"str",
"prefix":"str"})
# Making a copy of the dataset:
df = df_raw.copy()
# Information about the dataset:
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 958524 entries, 0 to 958523 Data columns (total 45 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 id 958524 non-null object 1 spkid 958524 non-null int64 2 full_name 958524 non-null object 3 pdes 958524 non-null object 4 name 22064 non-null object 5 prefix 18 non-null object 6 neo 958520 non-null object 7 pha 938603 non-null object 8 H 952261 non-null float64 9 diameter 136209 non-null float64 10 albedo 135103 non-null float64 11 diameter_sigma 136081 non-null float64 12 orbit_id 958524 non-null object 13 epoch 958524 non-null float64 14 epoch_mjd 958524 non-null int64 15 epoch_cal 958524 non-null float64 16 equinox 958524 non-null object 17 e 958524 non-null float64 18 a 958524 non-null float64 19 q 958524 non-null float64 20 i 958524 non-null float64 21 om 958524 non-null float64 22 w 958524 non-null float64 23 ma 958523 non-null float64 24 ad 958520 non-null float64 25 n 958524 non-null float64 26 tp 958524 non-null float64 27 tp_cal 958524 non-null float64 28 per 958520 non-null float64 29 per_y 958523 non-null float64 30 moid 938603 non-null float64 31 moid_ld 958397 non-null float64 32 sigma_e 938602 non-null float64 33 sigma_a 938602 non-null float64 34 sigma_q 938602 non-null float64 35 sigma_i 938602 non-null float64 36 sigma_om 938602 non-null float64 37 sigma_w 938602 non-null float64 38 sigma_ma 938602 non-null float64 39 sigma_ad 938598 non-null float64 40 sigma_n 938602 non-null float64 41 sigma_tp 938602 non-null float64 42 sigma_per 938598 non-null float64 43 class 958524 non-null object 44 rms 958522 non-null float64 dtypes: float64(33), int64(2), object(10) memory usage: 329.1+ MB
# Deleting some useless columns:
df.drop(columns=["prefix", "name", "spkid", "orbit_id", 'pdes'], inplace=True)
print(f"Shape: {df.shape}")
Shape: (958524, 40)
RMS column
After searching a lot about this columns, I could not find anthing about it, so as I am no keen on it, I will drop it as well.
# Deleting rms column:
df.drop(columns=['rms'], inplace=True)
Let's see the columns that I want to set as index.
# Unique values of the full_name column:
df['full_name'].unique()
array([' 1 Ceres', ' 2 Pallas', ' 3 Juno', ..., ' (6344 P-L)', ' (2060 T-2)', ' (2678 T-3)'], dtype=object)
As we can see that at the beginning of the names, there are some blank spaces, so we need to fix it before set them as index.
# Removing the blank space in front of the names:
df['full_name'] = df['full_name'].str.strip(" ")
df['full_name'].unique()
array(['1 Ceres', '2 Pallas', '3 Juno', ..., '(6344 P-L)', '(2060 T-2)', '(2678 T-3)'], dtype=object)
# Setting ID and Full_name as a multindex:
df.set_index(["id", "full_name"], inplace=True)
Both of them measure distances, and the difference between them is that the first one measures distance in astronomical units, which means it measures distance based on the distance between the Earth and the Sun. On the other hand, the second attribute measures the distance using a unit called lunar distance, which has a very objective description and is easy to interpret.
# Correlation between moid and moid_ld
df[['moid', 'moid_ld']].corr()
moid | moid_ld | |
---|---|---|
moid | 1.0 | 1.0 |
moid_ld | 1.0 | 1.0 |
As we can expect, they are highly correlated, so we can choose one of them to represent the information about distance. With this in mind, we will drop the moid_ld column because its range is much larger than that of the moid column.
# Dropping the moid_ld column:
df.drop(columns=['moid_ld'], inplace=True)
Accondingly to the Nasa website https://ssd.jpl.nasa.gov/tools/sbdb_query.html#!#results, both of these columns have the same name "Time of the Perihelion Passage (TDB)". Furthermore, the first attribute represents the perihelion value itself and the second attribute can be a calibrated version of the tp variable. These attributes carry information about the time, which an asteroid, orbiting the Sun, is closest to it. These columns are presented in Unix Timestamp.
# Displaying the corelation of tp and tp_cal:
display(df[['tp', 'tp_cal']].corr())
tp | tp_cal | |
---|---|---|
tp | 1.000000 | 0.998251 |
tp_cal | 0.998251 | 1.000000 |
As we can see above, they are hightly correlated but I will not drop any of them for now, because I want to explore them more on the next topic of the project.
# Looking at the first five rows:
df.head()
neo | pha | H | diameter | albedo | diameter_sigma | epoch | epoch_mjd | epoch_cal | equinox | ... | sigma_q | sigma_i | sigma_om | sigma_w | sigma_ma | sigma_ad | sigma_n | sigma_tp | sigma_per | class | ||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
id | full_name | |||||||||||||||||||||
a0000001 | 1 Ceres | N | N | 3.40 | 939.400 | 0.0900 | 0.200 | 2458600.5 | 58600 | 20190427.0 | J2000 | ... | 1.956900e-11 | 4.608900e-09 | 6.168800e-08 | 6.624800e-08 | 7.820700e-09 | 1.111300e-11 | 1.196500e-12 | 3.782900e-08 | 9.415900e-09 | MBA |
a0000002 | 2 Pallas | N | N | 4.20 | 545.000 | 0.1010 | 18.000 | 2459000.5 | 59000 | 20200531.0 | J2000 | ... | 8.832200e-08 | 3.469400e-06 | 6.272400e-06 | 9.128200e-06 | 8.859100e-06 | 4.961300e-09 | 4.653600e-10 | 4.078700e-05 | 3.680700e-06 | MBA |
a0000003 | 3 Juno | N | N | 5.33 | 246.596 | 0.2140 | 10.594 | 2459000.5 | 59000 | 20200531.0 | J2000 | ... | 8.139200e-08 | 3.223100e-06 | 1.664600e-05 | 1.772100e-05 | 8.110400e-06 | 4.363900e-09 | 4.413400e-10 | 3.528800e-05 | 3.107200e-06 | MBA |
a0000004 | 4 Vesta | N | N | 3.00 | 525.400 | 0.4228 | 0.200 | 2458600.5 | 58600 | 20190427.0 | J2000 | ... | 1.928600e-09 | 2.170600e-07 | 3.880800e-07 | 1.789300e-07 | 1.206800e-06 | 1.648600e-09 | 2.612500e-10 | 4.103700e-06 | 1.274900e-06 | MBA |
a0000005 | 5 Astraea | N | N | 6.90 | 106.699 | 0.2740 | 3.140 | 2459000.5 | 59000 | 20200531.0 | J2000 | ... | 6.092400e-08 | 2.740800e-06 | 2.894900e-05 | 2.984200e-05 | 8.303800e-06 | 4.729000e-09 | 5.522700e-10 | 3.474300e-05 | 3.490500e-06 | MBA |
5 rows × 36 columns
Missing Data
# Looking for missing data:
df.isna().sum().sort_values(ascending=False).to_frame()
0 | |
---|---|
albedo | 823421 |
diameter_sigma | 822443 |
diameter | 822315 |
sigma_per | 19926 |
sigma_ad | 19926 |
sigma_w | 19922 |
sigma_om | 19922 |
sigma_a | 19922 |
sigma_q | 19922 |
sigma_i | 19922 |
sigma_n | 19922 |
sigma_ma | 19922 |
sigma_tp | 19922 |
sigma_e | 19922 |
moid | 19921 |
pha | 19921 |
H | 6263 |
neo | 4 |
ad | 4 |
per | 4 |
ma | 1 |
per_y | 1 |
n | 0 |
tp_cal | 0 |
tp | 0 |
w | 0 |
om | 0 |
i | 0 |
q | 0 |
a | 0 |
e | 0 |
equinox | 0 |
epoch_cal | 0 |
epoch_mjd | 0 |
epoch | 0 |
class | 0 |
# Deleting columns that hava more than 50% of missing data:
df.dropna(thresh=0.5*len(df), axis=1, inplace=True)
print(f"Shape: {df.shape}")
Shape: (958524, 33)
# Looking for missing data:
df.isna().sum().sort_values(ascending=False)
sigma_per 19926 sigma_ad 19926 sigma_om 19922 sigma_e 19922 sigma_tp 19922 sigma_n 19922 sigma_ma 19922 sigma_w 19922 sigma_i 19922 sigma_q 19922 sigma_a 19922 moid 19921 pha 19921 H 6263 neo 4 ad 4 per 4 ma 1 per_y 1 tp 0 tp_cal 0 n 0 w 0 om 0 i 0 q 0 a 0 e 0 equinox 0 epoch_cal 0 epoch_mjd 0 epoch 0 class 0 dtype: int64
# Description of the dataset:
df.info()
<class 'pandas.core.frame.DataFrame'> MultiIndex: 958524 entries, ('a0000001', '1 Ceres') to ('bT3S2678', '(2678 T-3)') Data columns (total 33 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 neo 958520 non-null object 1 pha 938603 non-null object 2 H 952261 non-null float64 3 epoch 958524 non-null float64 4 epoch_mjd 958524 non-null int64 5 epoch_cal 958524 non-null float64 6 equinox 958524 non-null object 7 e 958524 non-null float64 8 a 958524 non-null float64 9 q 958524 non-null float64 10 i 958524 non-null float64 11 om 958524 non-null float64 12 w 958524 non-null float64 13 ma 958523 non-null float64 14 ad 958520 non-null float64 15 n 958524 non-null float64 16 tp 958524 non-null float64 17 tp_cal 958524 non-null float64 18 per 958520 non-null float64 19 per_y 958523 non-null float64 20 moid 938603 non-null float64 21 sigma_e 938602 non-null float64 22 sigma_a 938602 non-null float64 23 sigma_q 938602 non-null float64 24 sigma_i 938602 non-null float64 25 sigma_om 938602 non-null float64 26 sigma_w 938602 non-null float64 27 sigma_ma 938602 non-null float64 28 sigma_ad 938598 non-null float64 29 sigma_n 938602 non-null float64 30 sigma_tp 938602 non-null float64 31 sigma_per 938598 non-null float64 32 class 958524 non-null object dtypes: float64(28), int64(1), object(4) memory usage: 327.8+ MB
# Plottinhg the missing data matrix:
ms.matrix(df, figsize=(30, 10));
Apparently all the missing data in the sigma columns and in the moid column are related to the missing data contained in the Target column pha.
# Looking at the missing data of the sigma columns:
df_2 = df[["pha", "moid", "sigma_e", "sigma_a",
"sigma_q", "sigma_i", "sigma_om",
"sigma_w", "sigma_ma", "sigma_ad",
"sigma_n", "sigma_tp", "sigma_per"]]
df_2[df["pha"].isnull()]
pha | moid | sigma_e | sigma_a | sigma_q | sigma_i | sigma_om | sigma_w | sigma_ma | sigma_ad | sigma_n | sigma_tp | sigma_per | ||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
id | full_name | |||||||||||||
bJ39R00R | (1939 RR) | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
bJ90O05K | (1990 OK5) | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
bJ91R28N | (1991 RN28) | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
bJ93T11C | (1993 TC11) | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
bJ94A09F | (1994 AF9) | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
bK20K03Q | (2020 KQ3) | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
bK20K03R | (2020 KR3) | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
bK20K03V | (2020 KV3) | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
bK20K04A | (2020 KA4) | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
bK20K04C | (2020 KC4) | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
19921 rows × 13 columns
As we can see above, 19921 from 19922 forom the missing data is related to the target column. Since we do not have data of the dependent variable for those obsrvations, we can drop all of them basend on the target.
# Dropping observations with missing data in the dependent variable:
df.dropna(subset="pha", inplace=True)
df.isna().sum().sort_values(ascending=False).to_frame()
0 | |
---|---|
H | 6262 |
sigma_per | 5 |
sigma_ad | 5 |
neo | 4 |
ad | 4 |
per | 4 |
sigma_q | 1 |
sigma_e | 1 |
ma | 1 |
sigma_a | 1 |
sigma_om | 1 |
sigma_i | 1 |
sigma_w | 1 |
sigma_ma | 1 |
sigma_n | 1 |
sigma_tp | 1 |
per_y | 1 |
moid | 0 |
tp | 0 |
tp_cal | 0 |
pha | 0 |
n | 0 |
w | 0 |
om | 0 |
i | 0 |
q | 0 |
a | 0 |
e | 0 |
equinox | 0 |
epoch_cal | 0 |
epoch_mjd | 0 |
epoch | 0 |
class | 0 |
Let's look at the missing data of the Absolute magnitude parameter.
# Plotting the missing data matrix:
ms.matrix(df[['H']]);
Apparently, the missing data of the H columns is not completely at random. So, we are going to imput them with the mean.
# Separete into two sets of data:
features = df.drop(columns='pha')
target = df['pha']
# Splitting the data:
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)
# Imputation of the missing data with the mean:
simpler_imputer = SimpleImputer(strategy='mean')
X_train['H'] = simpler_imputer.fit_transform(X_train[['H']])
X_test['H'] = simpler_imputer.transform(X_test[['H']])
For the rest of the missing data, we will drop them, because they were not systematically generated and because the number of missing data is not relevant compared with the amount of not missing data that we have.
# Dropping observations with missing data:
X_train['target'] = y_train
X_test['target'] = y_test
X_train.dropna(inplace=True)
X_test.dropna(inplace=True)
# Training set with the same number of obsertions:
# Test set with the same number of observations:
y_train = X_train['target']
y_test = X_test['target']
X_train.drop(columns=['target'], inplace=True)
X_test.drop(columns=['target'], inplace=True)
# Looking at the unique data of equinox column:
df['equinox'].unique()
array(['J2000'], dtype=object)
Equinox variable has just 1 type of data, so it is not important for the analysis.
# Dropping equinox column:
X_train.drop(columns=['equinox'], inplace=True)
X_test.drop(columns=['equinox'], inplace=True)
# Shape of the sets:
print(f"Traning Shape: {X_train.shape}")
print(f"Test Shape: {X_test.shape}")
Traning Shape: (750877, 31) Test Shape: (187720, 31)
# Encoding the target variable:
encoder = LabelEncoder()
y_train = encoder.fit_transform(y_train)
y_test = encoder.transform(y_test)
# Informations after Data Cleaning:
X_train.info()
<class 'pandas.core.frame.DataFrame'> MultiIndex: 750877 entries, ('bK14EF8X', '(2014 EX158)') to ('a0121959', '121959 (2000 EL67)') Data columns (total 31 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 neo 750877 non-null object 1 H 750877 non-null float64 2 epoch 750877 non-null float64 3 epoch_mjd 750877 non-null int64 4 epoch_cal 750877 non-null float64 5 e 750877 non-null float64 6 a 750877 non-null float64 7 q 750877 non-null float64 8 i 750877 non-null float64 9 om 750877 non-null float64 10 w 750877 non-null float64 11 ma 750877 non-null float64 12 ad 750877 non-null float64 13 n 750877 non-null float64 14 tp 750877 non-null float64 15 tp_cal 750877 non-null float64 16 per 750877 non-null float64 17 per_y 750877 non-null float64 18 moid 750877 non-null float64 19 sigma_e 750877 non-null float64 20 sigma_a 750877 non-null float64 21 sigma_q 750877 non-null float64 22 sigma_i 750877 non-null float64 23 sigma_om 750877 non-null float64 24 sigma_w 750877 non-null float64 25 sigma_ma 750877 non-null float64 26 sigma_ad 750877 non-null float64 27 sigma_n 750877 non-null float64 28 sigma_tp 750877 non-null float64 29 sigma_per 750877 non-null float64 30 class 750877 non-null object dtypes: float64(28), int64(1), object(2) memory usage: 262.4+ MB
Some usefull functions.
# Function that calculates kurtosis and skewness of a dataset:
def kurtosis_skewness(dataset: any):
col = ["Kurtosis", "Skewness"]
results = pd.DataFrame(index=dataset.describe().columns, columns=col)
for c in dataset.describe().columns:
kurt = kurtosis(dataset[c])
skewness = skew(dataset[c])
results.loc[c] = [kurt, skewness]
return results
# Heatmap of correlations:
def heatmap_cor(df: any):
correlation = df.coRr()
mask = np.zeros_like(correlation)
mask[np.triu_indices_from(correlation)] = True
sns.heatmap(correlation, mask=mask, annot=True, c="crest", cbar=True)
Let's see the descriptive statistics of the sigma columns.
# Statistical Summary:
sigma_list = [i for i in X_train.columns if 'sigma' in i]
X_train[sigma_list].describe()
sigma_e | sigma_a | sigma_q | sigma_i | sigma_om | sigma_w | sigma_ma | sigma_ad | sigma_n | sigma_tp | sigma_per | |
---|---|---|---|---|---|---|---|---|---|---|---|
count | 7.508770e+05 | 7.508770e+05 | 7.508770e+05 | 7.508770e+05 | 7.508770e+05 | 7.508770e+05 | 7.508770e+05 | 7.508770e+05 | 7.508770e+05 | 7.508770e+05 | 7.508770e+05 |
mean | 7.758346e-01 | 1.726028e+01 | 2.054308e+01 | 1.214982e+00 | 6.000012e+00 | 4.549929e+05 | 4.549157e+05 | 2.418465e+01 | 5.603977e-02 | 1.357960e+08 | 8.905039e+04 |
std | 1.002947e+02 | 4.892992e+03 | 3.059394e+03 | 1.376154e+02 | 1.489532e+03 | 7.682747e+07 | 7.682117e+07 | 8.028341e+03 | 1.096641e+01 | 2.309041e+10 | 2.947131e+07 |
min | 1.285600e-10 | 4.145500e-11 | 2.616800e-10 | 4.087800e-08 | 2.163900e-07 | 1.789300e-07 | 2.103800e-07 | 5.572900e-11 | 2.860900e-11 | 3.088400e-07 | 2.335200e-08 |
25% | 5.477000e-08 | 2.046300e-08 | 1.462100e-07 | 6.094500e-06 | 3.622600e-05 | 5.759500e-05 | 2.573700e-05 | 2.340400e-08 | 2.768800e-09 | 1.110900e-04 | 1.793700e-05 |
50% | 8.167200e-08 | 3.850300e-08 | 2.271700e-07 | 8.687700e-06 | 6.650600e-05 | 1.047800e-04 | 4.900800e-05 | 4.361800e-08 | 4.637100e-09 | 2.231400e-04 | 3.500700e-05 |
75% | 2.338800e-07 | 1.044500e-07 | 6.595600e-07 | 1.591900e-05 | 1.613200e-04 | 3.118800e-04 | 1.719500e-04 | 1.196900e-07 | 1.125300e-08 | 8.145700e-04 | 9.779100e-05 |
max | 3.942500e+04 | 3.241200e+06 | 1.015000e+06 | 5.533000e+04 | 1.199100e+06 | 3.340000e+10 | 3.339300e+10 | 5.509700e+06 | 7.698800e+03 | 1.041500e+13 | 1.910700e+10 |
Observations:
Now let's dive a little bit into the Absolute magnitude parameter (H), epoch, epoch_mjd, epoch_cal, Eccentricity (e), Semi-major axis (a), Perihelion distance (q) and Inclination (i) columns.
# Selecting columns:
initial_columns = ['H', 'epoch', 'epoch_mjd', 'epoch_cal', 'e', 'a', 'q', 'i']
# Descriptive Statistic of the initial numeric columns:
X_train[initial_columns].describe()
H | epoch | epoch_mjd | epoch_cal | e | a | q | i | |
---|---|---|---|---|---|---|---|---|
count | 750877.000000 | 7.508770e+05 | 750877.000000 | 7.508770e+05 | 750877.000000 | 750877.000000 | 750877.000000 | 750877.000000 |
mean | 16.890408 | 2.458871e+06 | 58870.829587 | 2.019698e+07 | 0.156175 | 2.900448 | 2.397450 | 9.044674 |
std | 1.795913 | 7.055785e+02 | 705.578515 | 1.940802e+04 | 0.092974 | 12.327240 | 2.157830 | 6.658707 |
min | -1.100000 | 2.425052e+06 | 25051.000000 | 1.927062e+07 | 0.000003 | 0.555418 | 0.075872 | 0.007744 |
25% | 16.000000 | 2.459000e+06 | 59000.000000 | 2.020053e+07 | 0.092078 | 2.388185 | 1.971846 | 4.152227 |
50% | 16.900000 | 2.459000e+06 | 59000.000000 | 2.020053e+07 | 0.144916 | 2.647204 | 2.226284 | 7.398350 |
75% | 17.700000 | 2.459000e+06 | 59000.000000 | 2.020053e+07 | 0.200694 | 3.002401 | 2.579520 | 12.390983 |
max | 33.200000 | 2.459000e+06 | 59000.000000 | 2.020053e+07 | 0.999591 | 8850.823836 | 80.398819 | 175.082901 |
Observations:
Finally, let's see the rest of the numeric columns.
# Selecting columns:
other_columns = ['om', 'w', 'ma', 'ad', 'n', 'tp', 'tp_cal', 'per', 'per_y', 'moid']
# Descriptive Statistic of the other numeric columns:
X_train[other_columns].describe()
om | w | ma | ad | n | tp | tp_cal | per | per_y | moid | |
---|---|---|---|---|---|---|---|---|---|---|
count | 750877.000000 | 750877.000000 | 750877.000000 | 750877.000000 | 750877.000000 | 7.508770e+05 | 7.508770e+05 | 7.508770e+05 | 750877.000000 | 7.508770e+05 |
mean | 168.500688 | 181.376168 | 177.210988 | 3.403445 | 0.236686 | 2.458858e+06 | 2.019585e+07 | 2.900275e+03 | 7.940519 | 1.414834e+00 |
std | 102.886239 | 103.885218 | 105.869845 | 24.216581 | 0.079762 | 1.600272e+03 | 4.391532e+04 | 3.635303e+05 | 995.291634 | 2.155184e+00 |
min | 0.000025 | 0.000130 | -67.136826 | 0.653773 | 0.000001 | 2.283183e+06 | 1.539011e+07 | 1.511918e+02 | 0.413941 | 4.544120e-07 |
25% | 80.610134 | 91.541555 | 83.600824 | 2.781923 | 0.189453 | 2.458564e+06 | 2.019032e+07 | 1.348032e+03 | 3.690711 | 9.797690e-01 |
50% | 159.947331 | 182.275912 | 175.284210 | 3.047334 | 0.228835 | 2.458946e+06 | 2.020041e+07 | 1.573184e+03 | 4.307143 | 1.240520e+00 |
75% | 252.248318 | 271.571677 | 269.886957 | 3.365602 | 0.267056 | 2.459362e+06 | 2.021053e+07 | 1.900209e+03 | 5.202490 | 1.593080e+00 |
max | 359.999793 | 359.998075 | 491.618014 | 17698.024949 | 2.381082 | 2.546362e+06 | 2.259081e+07 | 3.041403e+08 | 832690.763699 | 7.947660e+01 |
Observations:
Now, let's look at the distribution and compute the Kurtosis and Skewness of the sigma columns. We will change the scale of the variables to a logarithmic scale due to their range, otherwise the plots won't look good and, we won't be able to interpret the results.
# Changing the scale of the data:
new_name = ['log_' + i for i in sigma_list]
dictionary = dict(zip(sigma_list, new_name))
log_data = np.log(X_train[sigma_list])
log_data.rename(columns=dictionary, inplace=True)
# Creating a subplot:
fig, ax = plt.subplots(nrows= 3, ncols=4, sharex=False, sharey=False, figsize=(25, 15))
fig.suptitle("Distributions of the Log-Sigma data")
# Histogram plots:
sns.histplot(log_data['log_sigma_e'], ax=ax[0, 0], kde=True)
sns.histplot(log_data['log_sigma_a'], ax=ax[0, 1], kde=True)
sns.histplot(log_data['log_sigma_q'], ax=ax[0, 2], kde=True)
sns.histplot(log_data['log_sigma_i'], ax=ax[0, 3], kde=True)
sns.histplot(log_data['log_sigma_om'], ax=ax[1, 0], kde=True)
sns.histplot(log_data['log_sigma_w'], ax=ax[1, 1], kde=True)
sns.histplot(log_data['log_sigma_ma'], ax=ax[1, 2], kde=True)
sns.histplot(log_data['log_sigma_ad'], ax=ax[1, 3], kde=True)
sns.histplot(log_data['log_sigma_n'], ax=ax[2, 0], kde=True)
sns.histplot(log_data['log_sigma_tp'], ax=ax[2, 1], kde=True)
sns.histplot(log_data['log_sigma_per'], ax=ax[2, 2], kde=True)
ax[2, 3].set_visible(False)
# Printing Kurtosis and Skewness for the log(sigma) columns:
kurtosis_skewness(log_data)
Kurtosis | Skewness | |
---|---|---|
log_sigma_e | 6.204082 | 2.475314 |
log_sigma_a | 5.646046 | 2.400785 |
log_sigma_q | 6.867976 | 2.552479 |
log_sigma_i | 8.351038 | 2.880583 |
log_sigma_om | 10.566405 | 2.981978 |
log_sigma_w | 8.715659 | 2.724005 |
log_sigma_ma | 6.282922 | 2.400544 |
log_sigma_ad | 5.587056 | 2.39486 |
log_sigma_n | 5.49624 | 2.421342 |
log_sigma_tp | 7.331019 | 2.48295 |
log_sigma_per | 5.787754 | 2.406471 |
Observations:
Let's see the distributions of Absolute magnitude parameter (H), epoch, epoch_mjd, epoch_cal, Eccentricity (e), Semi-major axis (a), Perihelion distance (q) and Inclination (i) columns.
# Creating a subplot:
fig, ax = plt.subplots(nrows=2, ncols=4, sharex=False, sharey=False, figsize=(25, 15))
fig.suptitle("Histograms and Kernel Density Function")
# Histogram plots:
sns.histplot(X_train['H'], ax=ax[0, 0], kde=True)
sns.histplot(X_train['epoch'], ax=ax[0, 1], kde=True)
sns.histplot(X_train['epoch_mjd'], ax=ax[0, 2], kde=True)
sns.histplot(X_train['epoch_cal'], ax=ax[0, 3], kde=True)
sns.histplot(X_train['e'], ax=ax[1, 0], kde=True)
sns.histplot(X_train['a'], ax=ax[1, 1], kde=True)
sns.histplot(X_train['q'], ax=ax[1, 2], kde=True)
sns.histplot(X_train['i'], ax=ax[1, 3], kde=True);
# Printing Kurtosis and Skewness for the initial columns:
kurtosis_skewness(X_train[initial_columns])
Kurtosis | Skewness | |
---|---|---|
H | 9.574231 | 0.632873 |
epoch | 63.509785 | -6.773483 |
epoch_mjd | 63.509785 | -6.773483 |
epoch_cal | 63.507964 | -6.778353 |
e | 7.807368 | 1.900893 |
a | 359883.099936 | 528.109344 |
q | 264.636904 | 15.725018 |
i | 22.503049 | 2.266021 |
Observations:
# Creating a subplot:
fig, ax = plt.subplots(nrows=3, ncols=4, sharex=False, sharey=False, figsize=(25, 15))
fig.suptitle("Histograms and Kernel Density Function")
# Histogram plots:
sns.histplot(X_train['om'], ax=ax[0, 0], kde=True)
sns.histplot(X_train['w'], ax=ax[0, 1], kde=True)
sns.histplot(X_train['ma'], ax=ax[0, 2], kde=True)
sns.histplot(np.log(X_train['ad']), ax=ax[0, 3], kde=True)
ax[0, 3].set_xlabel('log-ad')
sns.histplot(X_train['n'], ax=ax[1, 0], kde=True)
sns.histplot(X_train['tp'], ax=ax[1, 1], kde=True)
sns.histplot(X_train['tp_cal'], ax=ax[1, 2], kde=True)
sns.histplot(np.log(X_train['per']), ax=ax[1, 3], kde=True)
ax[1, 3].set_xlabel('log-per')
sns.histplot(np.log(X_train['per_y']), ax=ax[2, 0], kde=True)
ax[2, 0].set_xlabel('log-per_y')
sns.histplot(X_train['moid'], ax=ax[2, 1], kde=True)
ax[2, 2].set_visible(False)
ax[2, 3].set_visible(False);
# Printing Kurtosis and Skewness for the other columns:
kurtosis_skewness(X_train[other_columns])
Kurtosis | Skewness | |
---|---|---|
om | -1.108926 | 0.196737 |
w | -1.20069 | -0.019961 |
ma | -1.233766 | 0.043606 |
ad | 386475.66126 | 556.60062 |
n | 63.992398 | 5.635289 |
tp | 739.249481 | -4.714999 |
tp_cal | 731.590956 | -4.676611 |
per | 653535.966944 | 787.456975 |
per_y | 653535.966944 | 787.456975 |
moid | 265.866793 | 15.782915 |
Observations:
Now, let's see the distribution and the outliers of the variables using boxplot.
df_train = X_train.copy()
df_train['Target'] = y_train
# Boxplots of the sigma variables:
fig, ax = plt.subplots(nrows=3, ncols=4, sharex=False, sharey=False, figsize=(20, 15))
fig.suptitle("Boxplots")
sns.boxplot(X_train[['sigma_e']], ax=ax[0, 0])
sns.boxplot(X_train[['sigma_a']], ax=ax[0, 1])
sns.boxplot(X_train[['sigma_q']], ax=ax[0, 2])
sns.boxplot(X_train[['sigma_i']], ax=ax[0, 3])
sns.boxplot(X_train[['sigma_om']], ax=ax[1, 0])
sns.boxplot(X_train[['sigma_w']], ax=ax[1, 1])
sns.boxplot(X_train[['sigma_ma']], ax=ax[1, 2])
sns.boxplot(X_train[['sigma_ad']], ax=ax[1, 3])
sns.boxplot(X_train[['sigma_n']], ax=ax[2, 0])
sns.boxplot(X_train[['sigma_tp']], ax=ax[2, 1])
sns.boxplot(X_train[['sigma_per']], ax=ax[2, 2])
ax[2, 3].set_visible(False)
Observations:
# Boxplot of the first set of columns:
fig, ax = plt.subplots(nrows=2, ncols=4, sharex=False, sharey=False, figsize=(25, 15))
fig.suptitle("Boxplots")
sns.boxplot(X_train[['H']], ax=ax[0, 0])
sns.boxplot(X_train[['epoch']], ax=ax[0, 1])
sns.boxplot(X_train[['epoch_mjd']], ax=ax[0, 2])
sns.boxplot(X_train[['epoch_cal']], ax=ax[0, 3])
sns.boxplot(X_train[['e']], ax=ax[1, 0])
sns.boxplot(X_train[['a']], ax=ax[1, 1])
sns.boxplot(X_train[['q']], ax=ax[1, 2])
sns.boxplot(X_train[['i']], ax=ax[1, 3]);
Observations:
# Boxplot of the rest of the columns:
fig, ax = plt.subplots(nrows=3, ncols=4, sharex=False, sharey=False, figsize=(25, 15))
fig.suptitle("Boxplots")
sns.boxplot(X_train[['om']], ax=ax[0, 0])
sns.boxplot(X_train[['w']], ax=ax[0, 1])
sns.boxplot(X_train[['ma']], ax=ax[0, 2])
sns.boxplot(X_train[['ad']], ax=ax[0, 3])
sns.boxplot(X_train[['n']], ax=ax[1, 0])
sns.boxplot(X_train[['tp']], ax=ax[1, 1])
sns.boxplot(X_train[['tp_cal']], ax=ax[1, 2])
sns.boxplot(X_train[['per']], ax=ax[1, 3])
sns.boxplot(X_train[['per_y']], ax=ax[2, 0])
sns.boxplot(X_train[['moid']], ax=ax[2, 1])
ax[2, 2].set_visible(False)
ax[2, 3].set_visible(False);
Observations:
def mann_whitney_func(features, target, numeric_columns):
statistics = []
p_values = []
group1 = features[target==1]
group2 = features[target==0]
for feature in numeric_columns:
statistic, p_value = mannwhitneyu(group1[feature].values, group2[feature].values)
statistics.append(statistic)
p_values.append(p_value)
dicionario = dict(Mann_Whitney_U=statistics, P_value=p_values)
df_pt = pd.DataFrame(index=numeric_columns, data=dicionario)
return df_pt
def ch2_func(features, target):
cat_columns = features.columns
statistics = []
p_values = []
for feature in cat_columns:
# Creating a contingency table:
df_contingency = pd.crosstab(features[feature], target)
# Chi squared:
statistic, p_value, dof, expected = chi2_contingency(df_contingency)
statistics.append(statistic)
p_values.append(p_value)
dicionario = dict(Chi_Squared=statistics, P_value=p_values)
df_ch = pd.DataFrame(index=cat_columns, data=dicionario)
return df_ch
As our target variable is categorical, we can use some methods to check if there is relation between numerical features and the dependent variables. However, all our numerical features do not follow a normal distriution, so we have to choose a Non-Parametric test to evaluate if are difference between groups. One approach is to use the Mann Whitney U test, which is statitical test that evaluates if there is a satistically diference between two independent groups.
H0: Groups came from the same Distribution.
H1: Groups came from different Distributions.
significance level = 0.05
# Selecting the numerical columns:
numeric_columns = sigma_list + initial_columns + other_columns
# Computing the Mann Whitney U test:
df_mw = mann_whitney_func(X_train, y_train, numeric_columns)
df_mw
Mann_Whitney_U | P_value | |
---|---|---|
sigma_e | 6.639186e+08 | 1.602131e-08 |
sigma_a | 5.426100e+08 | 2.738368e-16 |
sigma_q | 5.122688e+08 | 2.445461e-31 |
sigma_i | 7.887824e+08 | 4.939480e-88 |
sigma_om | 5.296364e+08 | 4.292388e-22 |
sigma_w | 4.691271e+08 | 1.254976e-61 |
sigma_ma | 5.757771e+08 | 1.073567e-05 |
sigma_ad | 5.842754e+08 | 5.977712e-04 |
sigma_n | 7.636538e+08 | 5.404597e-65 |
sigma_tp | 4.356460e+08 | 2.367745e-92 |
sigma_per | 5.157174e+08 | 2.282571e-29 |
H | 1.139616e+09 | 0.000000e+00 |
epoch | 5.921343e+08 | 7.201099e-13 |
epoch_mjd | 5.921343e+08 | 7.201099e-13 |
epoch_cal | 5.921343e+08 | 7.201099e-13 |
e | 1.181987e+09 | 0.000000e+00 |
a | 1.279411e+08 | 0.000000e+00 |
q | 7.679729e+06 | 0.000000e+00 |
i | 7.396650e+08 | 2.573163e-46 |
om | 6.273777e+08 | 1.380838e-01 |
w | 6.154047e+08 | 9.064559e-01 |
ma | 6.253275e+08 | 2.116118e-01 |
ad | 4.880512e+08 | 4.712485e-47 |
n | 1.100808e+09 | 0.000000e+00 |
tp | 6.093095e+08 | 5.635194e-01 |
tp_cal | 6.093095e+08 | 5.635194e-01 |
per | 1.279411e+08 | 0.000000e+00 |
per_y | 1.279411e+08 | 0.000000e+00 |
moid | 7.639718e+06 | 0.000000e+00 |
Observations:
Longitude of the ascending node (om) has a p-value higher than the significance level, so there is no evidence to reject the Null Hyothesis that there is no difference between groups.
Argument of perihelium (w) has a p-value higher than the significance level, so there is no evidence to reject the Null Hyothesis that there is no difference between groups.
Time of perihelion passage (tp) has a p-value higher than the significance level, so there is no evidence to reject the Null Hyothesis that there is no difference between groups.
Time of perihelion passage (Apparently, calibrated) (tp_cap) has a p-value higher than the significance level, so there is no evidence to reject the Null Hyothesis that there is no difference between groups.
For the categorical independent features, we can apply the Chi Squared test to verify if two categorical variables have any relationship. This test is a non-parametric test and so is less likely to reject the null hypothesis, epecially when it is false.
H0: The proportion of each group is not independent. So it was not drew from the same distribution.
H1: The proportion of each groups is independent and were drew from diferent distributions.
significance level: 0.05
# Selecting the categorical columns:
categorical_columns = ['class', 'neo']
# Applying ch2 function:
df_ch = ch2_func(X_train[categorical_columns], y_train)
df_ch
Chi_Squared | P_value | |
---|---|---|
class | 95044.341663 | 0.0 |
neo | 65713.810916 | 0.0 |
Observations:
# Dataframe for analysis:
df_train = X_train.copy()
df_train['Target'] = y_train
The eccentricity of a Orbit is a metric that quantify the shape of the Orbit, and can be a real number greater or equal to zero.
# Absolute Magnitude vs Eccentricity:
sns.scatterplot(x=df_train['H'], y=df_train['e'], hue=df_train['Target'], alpha=0.6)
plt.ylabel("Eccentricity")
plt.xlabel("Absolute Magnitude")
plt.show()
Observations:
# Plotting the difference bewteen the average eccentricity by target class:
group = df_train.groupby('Target').agg({'e':'mean'}).reset_index()
sns.barplot(y=group['e'], x=group['Target'])
plt.title("Average eccentricity by Target")
plt.ylabel("Eccentricity")
plt.show()
Observations:
# Plotting Near-Earth Object vs Class as a contigency table:
cross = pd.crosstab(df_train['neo'], df_train['Target'])
sns.heatmap(cross, annot=True, cmap=None, cbar=True)
plt.show()
Observations:
# Contingency table of class and the target:
cross = pd.crosstab(df_train['class'], df_train['Target'])
plt.figure(figsize=(15, 10))
sns.heatmap(cross, cbar=True, annot=True, cmap='crest')
plt.show()
Observations:
Aphelium distance: The Aphelium distance is the farthest point of a object's orbit around a celestial body.
Perihelium distance: The Perihelium distance is the closest point of a object's orbit around a celestial body.
# Looking into the Aphelium distance (ad) and the Perielium distance (q):
sns.scatterplot(y=df_train['ad'], x=df_train['q'], hue=df_train['Target'])
plt.title("Aphelium distance (ad) versus Perielium distance (q)")
plt.yscale("log") # Setting the y axis as a log scale axis
plt.xlabel("Perielium distance (au)")
plt.ylabel("log-Aphelium distance")
plt.show()
Observations:
# Aphelium and Perihelium distance by target class:
group = df_train.groupby('Target').agg({'ad':'mean', 'q':'mean'}).reset_index()
fig, ax = plt.subplots(ncols=2, nrows=1, figsize=(15, 5), sharex=False, sharey=False)
sns.barplot(x=group['Target'], y=group['ad'], ax=ax[0])
ax[0].set_ylabel("Average Aphelium distance (au)")
sns.barplot(x=group['Target'], y=group['q'], ax=ax[1])
ax[1].set_ylabel("Average Perielium distance (au)")
plt.show()
Observations:
Argument of Perihelium: It is the angular distance between the ascending node (the location where the object's Orbit crosses the plane of the reference) and the perihelium point.
# Argument of Perihelium versus Perihelium distance:
sns.scatterplot(y=df_train['w'], x=df_train['q'], hue=df_train['Target'], alpha=0.5)
plt.xlabel("Perihelium distance (au)")
plt.ylabel("Argument of Perihelium (deg)")
plt.show()
Observations:
Absolute Magnitude: It is defined as the apparent magnitude that an object would appear to have if it were situated at a distance of 10 parsec.
group = df_train.groupby('Target').agg({'H':'mean'}).reset_index()
sns.barplot(x=group['Target'], y=group['H'])
plt.ylabel("Absolute Magnitude")
plt.show()
Observations:
The time of perihelium passage: It is the time that a celestial body in an elliptical orbit, reaches its closest point to the sun.
# Boxplots of Argument of Perihelium (tp) and Argument of Perihelium calibrated (tp_cal) per class:
fig, ax = plt.subplots(ncols=2, nrows=1, figsize=(15, 5), sharex=False, sharey=False)
sns.boxplot(y=df_train['tp'], x=df_train['Target'], ax=ax[0])
ax[0].set_ylabel("Argument of Perihelium (tp)")
sns.boxplot(y=df_train['tp_cal'], x=df_train['Target'], ax=ax[1])
ax[1].set_ylabel("Argument of Perihelium cal (tp_cal)")
plt.show()
Observations
Looking at these plots, we can see that the Argument of Perihelium (tp) has lower magnitude than the Argument of Perihelium cal. Furthermore, they are extremly correlated. So, we are going to keep Argument of Perihelium, due to its magnitude of values.
There distribution of each group of Argument of Perihelium appears not to be very different because the boxplots are overlapping, as the Mann Whitney test worked.
# Dropping the Argument of Perihelium cal:
df_train.drop(columns=['tp_cal'], inplace=True)
X_test.drop(columns=['tp_cal'], inplace=True)
# Splitting the training dataframe:
X_train = df_train.drop(columns='Target')
# Numeric columns without the tp_cal column:
numeric_columns = [i for i in numeric_columns if i != 'tp_cal']
# Transforming the categorical attributes into dummy variables:
X_train_dummy = pd.get_dummies(X_train, drop_first=True, dtype=bool)
X_test_dummy = pd.get_dummies(X_test, drop_first=True, dtype=bool)
# Class that put together many feature selection techniques:
class feature_selector:
seed = 42
def __init__(self, X, y) -> None:
self.X_train = X
self.y_train = y
def randomforestC_imp(self) -> None:
model = RandomForestClassifier(random_state=feature_selector.seed)
model.fit(self.X_train, self.y_train)
series = pd.Series(index=model.feature_names_in_, data=model.feature_importances_).sort_values(ascending=False)
# Plotting RandomForest Regression Importance:
plt.figure(figsize=(15, 15))
plt.title("RandomForest Importance")
plt.xlabel("Importance")
sns.barplot(x=series.values, y=series.index)
def xgbC_imp(self) -> None:
model = XGBClassifier(random_state=feature_selector.seed)
model.fit(self.X_train, self.y_train)
series = pd.Series(index=model.feature_names_in_, data=model.feature_importances_).sort_values(ascending=False)
# Plotting XGboost Regression Feature Importance:
plt.figure(figsize=(15,15))
plt.title("XGBoost Feature Importance")
plt.xlabel("Importance")
sns.barplot(y=series.index, x=series.values)
# Univariate feature selection:
def univariate(self, statistic, n="all") -> None:
selector = SelectKBest(score_func=statistic, k=n)
selector.fit(self.X_train, self.y_train)
series = pd.Series(index=selector.feature_names_in_, data=selector.scores_).\
sort_values(ascending=False)
plt.title("Univariate feature selection - Filtering")
sns.barplot(y=series.index, x=series.values)
return selector
# Wrapper method for feature selection:
def refcv(self):
models = {
"RandomForestR":RandomForestClassifier(random_state=feature_selector.seed),
"ExtraTreeR":ExtraTreesClassifier(random_state=feature_selector.seed),
"XGB":XGBClassifier(random_state=feature_selector.seed)
}
splits=10
step = 1
min_features = 30
cross = StratifiedKFold(n_splits=splits, random_state=feature_selector.seed, shuffle=True)
ind = [f"Columns {i}" for i in range(min_features, len(self.X_train.columns) + 1)]
df = pd.DataFrame(index=ind)
minimo = -np.inf
name = ""
for key, model in models.items():
rfecv = RFECV(estimator=model, step=step, cv=cross, min_features_to_select=min_features, scoring="f1")
rfecv.fit(self.X_train, self.y_train)
root_mean = rfecv.cv_results_["mean_test_score"]
df[key] = root_mean
best_value = root_mean[np.argmax(root_mean)]
if best_value > minimo:
minimo = best_value
best_features = rfecv.support_
name = key
df_features = pd.DataFrame(columns=self.X_train.columns, data=best_features.reshape(1, -1), index=[name])
return df, df_features.transpose()
# Feature Selector object:
selector = feature_selector(X_train_dummy, y_train)
# Random Forest Importance:
selector.randomforestC_imp()
Observations:
Earth Minimum Orbit Intersection Distance (moid) is the most important feature according to the Random Forest algorithm.
Epoch, Epoch_cal, Epoch_mjd and the Orbit classes are the least important features to the model.
# XGBoost Classifier Importance:
selector.xgbC_imp()
Observations:
According to the XGBoost Classifier the Absolute Magnitude (H) is the most important feature followed by Earth Minimum Orbit Intersection Distance (moid).
It is interesting that all of the other features are practically irrelevants to the model.
class_TNO is the most irrelevant feature according to the XGBoost.
# Recusive feature elimination method:
df_results, df_best_features = selector.refcv()
# Results for each Algorithm:
df_results
RandomForestR | ExtraTreeR | XGB | |
---|---|---|---|
Columns 30 | 0.985079 | 0.817495 | 0.985622 |
Columns 31 | 0.984133 | 0.827134 | 0.985622 |
Columns 32 | 0.985054 | 0.815004 | 0.985622 |
Columns 33 | 0.983802 | 0.813049 | 0.985622 |
Columns 34 | 0.982249 | 0.812564 | 0.985622 |
Columns 35 | 0.983195 | 0.805361 | 0.985622 |
Columns 36 | 0.984121 | 0.826559 | 0.985622 |
Columns 37 | 0.984744 | 0.822762 | 0.985622 |
Columns 38 | 0.984436 | 0.822374 | 0.985622 |
Columns 39 | 0.982570 | 0.818971 | 0.985622 |
Columns 40 | 0.982558 | 0.819126 | 0.985622 |
# Best number set of features and the respective algorithm:
df_best_features
XGB | |
---|---|
H | True |
epoch | True |
epoch_mjd | True |
epoch_cal | True |
e | True |
a | True |
q | True |
i | True |
om | True |
w | True |
ma | True |
ad | True |
n | True |
tp | True |
per | True |
per_y | False |
moid | True |
sigma_e | True |
sigma_a | True |
sigma_q | True |
sigma_i | True |
sigma_om | True |
sigma_w | True |
sigma_ma | True |
sigma_ad | True |
sigma_n | True |
sigma_tp | True |
sigma_per | True |
neo_Y | True |
class_APO | True |
class_AST | True |
class_ATE | False |
class_CEN | False |
class_IEO | False |
class_IMB | False |
class_MBA | False |
class_MCA | False |
class_OMB | False |
class_TJN | False |
class_TNO | False |
Observations:
# Selected columns:
columns_selected = ['H', 'epoch', 'epoch_cal', 'epoch_mjd', 'e', 'a', 'q', 'i', 'om', 'w',
'ma', 'ad', 'n', 'tp', 'per', 'moid', 'sigma_e', 'sigma_a', 'sigma_q',
'sigma_i', 'sigma_om', 'sigma_w', 'sigma_ma', 'sigma_ad', 'sigma_n',
'sigma_tp', 'sigma_per', 'neo_Y', 'class_APO', 'class_AST']
# Dropping unnecessary features:
X_train = X_train_dummy[columns_selected]
X_test = X_test_dummy[columns_selected]
# Exctracting new numeric columns:
numeric_columns = [c for c in X_train.columns if not pd.api.types.is_bool_dtype(X_train[c])]
Definition:
OBS: Standard Scaler can perform slightly worst than the other transformations because it assumes that the data is normally distributed. However you can still standardize your data.
Matematical Definition:
$X_{new_{i}} = \frac{X_{i} - \hat{\mu}_{i}}{\sigma_{i}}$
# Copy of the train and test set:
X_train_std = X_train.copy()
X_test_std = X_test.copy()
# Fitting a Standard Scaler object:
std_scaler = StandardScaler()
X_train_std_numeric = std_scaler.fit_transform(X_train[numeric_columns])
X_test_std_numeric = std_scaler.transform(X_test[numeric_columns])
# Switching the old numeric columns by the normalized set of columns:
X_train_std[numeric_columns] = X_train_std_numeric
X_test_std[numeric_columns] = X_test_std_numeric
Definition:
Mathematical Definition:
$X_{new_{i}} = \frac{X_{i} - X_{min_{i}}}{X_{max_{i}} - X_{min_{i}}}$
# Copy of the train and test set:
X_train_min_max = X_train.copy()
X_test_min_max = X_test.copy()
# Fitting a Standard Scaler object:
min_max_scaler = RobustScaler()
X_train_min_max_numeric = min_max_scaler.fit_transform(X_train[numeric_columns])
X_test_min_max_numeric = min_max_scaler.transform(X_test[numeric_columns])
# Switching the old numeric columns by the normalized set of columns:
X_train_min_max[numeric_columns] = X_train_min_max_numeric
X_test_min_max[numeric_columns] = X_test_min_max_numeric
Definition:
Mathematical Definition:
$X_{new_{i}} = \frac{X_{i} - median_{i}}{IQR_{i}}$
$IQR_{i} = P_{75_{i}} - P_{25_{i}}$
# Copy of the train and test set:
X_train_rb = X_train.copy()
X_test_rb = X_test.copy()
# Fitting a Standard Scaler object:
rb_scaler = RobustScaler()
X_train_rb_numeric = rb_scaler.fit_transform(X_train[numeric_columns])
X_test_rb_numeric = rb_scaler.transform(X_test[numeric_columns])
# Switching the old numeric columns by the normalized set of columns:
X_train_rb[numeric_columns] = X_train_rb_numeric
X_test_rb[numeric_columns] = X_test_rb_numeric
# Function used to evaluate the best algorithms:
def melhor_modelo(X_train, y_train):
seed = 42
cv = 10
score = ['f1']
result_f1 = {}
dicionario = {
"RandomForestR":RandomForestClassifier(random_state=seed),
"ExtraTreeR":ExtraTreesClassifier(random_state=seed),
"XGB":XGBClassifier(random_state=seed),
"MLP":MLPClassifier(random_state=seed, max_iter=2000)
}
for name, model in dicionario.items():
k_fold = StratifiedKFold(n_splits=cv, random_state=seed, shuffle=True)
result = cross_validate(model, X_train, y_train, cv=k_fold, scoring=score)
result_f1[name] = result['test_f1']
result_pd_f1 = pd.DataFrame(data=result_f1)
return result_pd_f1
class Best_model_selector:
seed = 42
cv = 10
def __init__(self, X_train, y_train) -> None:
self.X_train = X_train
self.y_train = y_train
# Function used to evaluate the best algorithms:
def best_model(self, scoring='f1', imbalance=False):
result_f1 = {}
dicionario = {
"RandomForestR":RandomForestClassifier(random_state=Best_model_selector.seed),
"ExtraTreeR":ExtraTreesClassifier(random_state=Best_model_selector.seed),
"XGB":XGBClassifier(random_state=Best_model_selector.seed),
"MLP":MLPClassifier(random_state=Best_model_selector.seed, max_iter=2000)
}
for name, model in dicionario.items():
if imbalance:
categorical_index = [X_train.columns.get_loc(c) for c in X_train.columns if pd.api.types.is_bool_dtype(X_train[c])]
pipeline = make_pipeline(SMOTENC(categorical_features=categorical_index , random_state=Best_model_selector.seed),
model)
k_fold = StratifiedKFold(n_splits=Best_model_selector.cv, random_state=Best_model_selector.seed, shuffle=True)
result = cross_validate(pipeline, self.X_train, self.y_train, cv=k_fold, scoring=[scoring])
else:
k_fold = StratifiedKFold(n_splits=Best_model_selector.cv, random_state=Best_model_selector.seed, shuffle=True)
result = cross_validate(model, self.X_train, self.y_train, cv=k_fold, scoring=[scoring])
result_f1[name] = result['test_f1']
result_pd_f1 = pd.DataFrame(data=result_f1)
return result_pd_f1
# Instace of the best model selector:
model_selector = Best_model_selector(X_train_std, y_train)
# Best algorithm results using Standard Scaler:
result_std = model_selector.best_model()
result_std.describe()
RandomForestR | ExtraTreeR | XGB | MLP | |
---|---|---|---|---|
count | 10.000000 | 10.000000 | 10.000000 | 10.000000 |
mean | 0.984440 | 0.827741 | 0.984397 | 0.640767 |
std | 0.005672 | 0.020355 | 0.006104 | 0.097131 |
min | 0.975460 | 0.797153 | 0.975155 | 0.485356 |
25% | 0.981481 | 0.813441 | 0.981566 | 0.575673 |
50% | 0.983255 | 0.823737 | 0.984894 | 0.647670 |
75% | 0.987879 | 0.841924 | 0.987786 | 0.661327 |
max | 0.993902 | 0.863014 | 0.993902 | 0.803797 |
# Plotting a boxplot for model's evaluation:
plt.figure(figsize=(15, 5))
plt.title("Perfomance of different algorithms")
sns.boxplot(result_std)
plt.show()
# Instace of the best model selector:
model_selector = Best_model_selector(X_train_min_max, y_train)
# Best algorithm results using Min max Scaler:
result_min_max = model_selector.best_model()
result_min_max.describe()
RandomForestR | ExtraTreeR | XGB | MLP | |
---|---|---|---|---|
count | 10.000000 | 10.000000 | 10.000000 | 10.000000 |
mean | 0.983518 | 0.799723 | 0.985622 | 0.347974 |
std | 0.006330 | 0.024197 | 0.006369 | 0.094171 |
min | 0.975155 | 0.766423 | 0.975155 | 0.143646 |
25% | 0.979344 | 0.789855 | 0.984567 | 0.310472 |
50% | 0.981763 | 0.794224 | 0.984802 | 0.351030 |
75% | 0.987860 | 0.813223 | 0.987133 | 0.398293 |
max | 0.993902 | 0.836879 | 0.996960 | 0.478964 |
# Plotting a boxplot for model's evaluation:
plt.figure(figsize=(15, 5))
plt.title("Perfomance of different algorithms")
sns.boxplot(result_min_max)
plt.show()
# Instace of the best model selector:
model_selector = Best_model_selector(X_train_rb, y_train)
# Best algorithm:
result_rb = model_selector.best_model()
result_rb.describe()
RandomForestR | ExtraTreeR | XGB | MLP | |
---|---|---|---|---|
count | 10.000000 | 10.000000 | 10.000000 | 10.000000 |
mean | 0.984751 | 0.823194 | 0.985622 | 0.383080 |
std | 0.005404 | 0.018505 | 0.006369 | 0.172224 |
min | 0.978328 | 0.795699 | 0.975155 | 0.046243 |
25% | 0.979524 | 0.808805 | 0.984567 | 0.331400 |
50% | 0.984802 | 0.821652 | 0.984802 | 0.461108 |
75% | 0.987842 | 0.833042 | 0.987133 | 0.504792 |
max | 0.993902 | 0.851211 | 0.996960 | 0.526627 |
# Plotting a boxplot for model's evaluation:
plt.figure(figsize=(15, 5))
plt.title("Perfomance of different algorithms")
sns.boxplot(result_rb)
plt.show()
Observations:
SMOTENC is a technique for orver sampling new records based on the records of the minority class. This approach is used when a imbalanced dataset and its algorithm can be seen below:
Find all the observations that compose the minority class.
Distinguishe between the categorical and continous columns.
Find the k nearest neighbors, this parameter can be settled by the user, for each observation.
Afterward, it is chosen randomly one of the k nearest observations and it is computed de difference between their continuous features, multiplied by a number between 0 and 1, and then added to the original continuous feature values. For categorical features, a random categoy is chosen from the k nearest neighbors.
Repeat steps 3 and 4 until reaches the number of synthetic samples.
# Instace of the best model selector:
model_selector = Best_model_selector(X_train_std, y_train)
# Best algorithm:
result_sm_std = model_selector.best_model(imbalance=True)
result_sm_std.describe()
RandomForestR | ExtraTreeR | XGB | MLP | |
---|---|---|---|---|
count | 10.000000 | 10.000000 | 10.000000 | 10.000000 |
mean | 0.980065 | 0.866062 | 0.981475 | 0.773130 |
std | 0.006181 | 0.020269 | 0.007902 | 0.060722 |
min | 0.969880 | 0.828571 | 0.969325 | 0.646943 |
25% | 0.976420 | 0.855841 | 0.978721 | 0.744346 |
50% | 0.980335 | 0.870453 | 0.981594 | 0.775953 |
75% | 0.984084 | 0.877249 | 0.981900 | 0.821450 |
max | 0.990881 | 0.891496 | 0.996942 | 0.848000 |
# Plotting a boxplot for model's evaluation:
plt.figure(figsize=(15, 5))
plt.title("Perfomance of different algorithms using SMOTENC")
sns.boxplot(result_sm_std)
plt.show()
# Instace of the best model selector:
model_selector = Best_model_selector(X_train_min_max, y_train)
# Best algorithm:
result_sm_min_max = model_selector.best_model(imbalance=True)
result_sm_min_max.describe()
RandomForestR | ExtraTreeR | XGB | MLP | |
---|---|---|---|---|
count | 10.000000 | 10.000000 | 10.000000 | 10.000000 |
mean | 0.979806 | 0.888866 | 0.980849 | 0.236312 |
std | 0.006251 | 0.019633 | 0.007713 | 0.055620 |
min | 0.969880 | 0.856305 | 0.969880 | 0.169671 |
25% | 0.976717 | 0.877393 | 0.976400 | 0.199354 |
50% | 0.981818 | 0.889871 | 0.978915 | 0.232590 |
75% | 0.981928 | 0.895479 | 0.984802 | 0.255465 |
max | 0.987879 | 0.930233 | 0.993902 | 0.345616 |
# Plotting a boxplot for model's evaluation:
plt.figure(figsize=(15, 5))
plt.title("Perfomance of different algorithms using SMOTENC")
sns.boxplot(result_sm_min_max)
plt.show()
# Instace of the best model selector:
model_selector = Best_model_selector(X_train_rb, y_train)
# Best algorithm:
result_sm_rb = model_selector.best_model(imbalance=True)
result_sm_rb.describe()
# Plotting a boxplot for model's evaluation:
plt.figure(figsize=(15, 5))
plt.title("Perfomance of different algorithms using SMOTENC")
sns.boxplot(result_sm_rb)
plt.show()
# Function for fine tuning an arbitrary model:
def tuning(X_train, y_train, modelo, params):
cv = 5
score = "f1"
seed = 42
categorical_index = []
# Index of categorical features:
for column in X_train.columns:
if pd.api.types.is_bool_dtype(X_train[column]):
categorical_index.append(X_train.columns.get_loc(column))
pipeline = make_pipeline(
SMOTENC(categorical_features=categorical_index , random_state=seed),
modelo
)
grid = GridSearchCV(pipeline, cv=cv, param_grid=params,
scoring=score,
n_jobs=-1,
return_train_score=True,
)
grid.fit(X_train, y_train)
best_index = grid.best_index_
result = grid.cv_results_
train_score = result['mean_train_score'][best_index]
left_out = result['mean_test_score'][best_index]
print(f"Train score: {train_score}")
print(f"Left out data score: {left_out}")
return grid.best_params_
# Function that creates a directory named model and save a model in joblib format:
def save_model(model, name):
try:
joblib.dump(model, "model/{}".format(name))
except:
os.makedirs("model")
path = "model/{}".format(name)
joblib.dump(model, path)
# Hyperparameter grid for XGBoost:
params = {
'n_estimators':[100, 120, 150],
'max_depth':[5, 6, 7, 8],
'eta':[0.3, 0.01, 0.001],
'max_delta_step':[0, 0.5, 1],
'tree_method':['gpu_hist']
}
# Fine Tuning the mode:
best_algorithm = XGBClassifier(random_state=42)
best_params = tuning(best_algorithm)
# Best params:
best_params
# Training the model with the whole training set:
best_estimator = XGBClassifier(**best_params)
best_estimator.fit(X_train, y_train)
# Saving the model:
save_model(best_estimator, "xgb.joblib")
# Function to compute and plot the confusion matrix as a heatmap:
def confusion_map(y_true, y_pred):
matrix = confusion_matrix(y_true, y_pred)
plt.title("Confusion Matrix Heatmap")
sns.heatmap(matrix, cmap=True, annot=True, xticklabels='Predicted', yticklabels='Truth')
plt.show()
# Function to calculate classification metrics and create a dataframe:
def metrics(y_true, y_pred):
index_metrics = ['Accuracy', 'F1-score', 'Recall', 'Precision']
accuracy = accuracy_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
precision = precision_score(y_true, y_pred)
metrics_df = pd.DataFrame(dict(Metrics=[accuracy, f1, recall, precision]), index=index_metrics)
return metrics_df
# Function to calculate the Area under the curve and plot the ROC-Curve:
def roc_curve(y_true, y_prob):
tpr, fpr, thresholds = roc_curve(y_true, y_prob)
auc = roc_auc_score(y_true, y_prob)
plt.title("ROC Curve")
sns.lineplot(x=tpr, y=fpr)
plt.xlabel("False Postive Rate (1 - Specificity)")
plt.ylabel("True Positive Rate (Sensitivity)")
return auc
# loading the model:
path = os.path.join('model', 'xgb.joblib')
model = joblib.load(path)
# Making predictions on the test set:
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)
# Making predictions on the trainin set:
y_pred_train = model.predict(X_train)
# Confusion Matrix:
confusion_map(y_test, y_pred)
# Metrics for test set and training set:
metrics_test = metrics(y_test, y_pred)
metrics_train = metrics(y_train, y_pred_train)
# Dataframes display:
display(metrics_test)
display(metrics_train)
# Calculating ROC-Curve and Area under the curve:
auc = roc_curve(y_test, y_prob)
# Area under the curve metric on the test set:
print('Test set:')
print(f"Area under the roc curve: {auc}")