# Importing Libraries:
import pandas as pd
import numpy as np
from scipy.stats import kurtosis, skew, mannwhitneyu, chi2_contingency

import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.base import clone
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, confusion_matrix, roc_curve, roc_auc_score
from sklearn.model_selection import StratifiedKFold, cross_validate, GridSearchCV
from sklearn.impute import SimpleImputer

from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier


from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler, LabelEncoder
from sklearn.feature_selection import SelectKBest, RFECV
from imblearn.over_sampling import SMOTENC
from imblearn.pipeline import make_pipeline
import missingno as ms

import joblib
import os


# Loading Data:
df_raw = pd.read_csv("dataset.csv", dtype={"pdes":"str", 
                                           "name":"str",
                                           "prefix":"str"})


# Making a copy of the dataset:
df = df_raw.copy()


# Information about the dataset:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 958524 entries, 0 to 958523
Data columns (total 45 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   id              958524 non-null  object 
 1   spkid           958524 non-null  int64  
 2   full_name       958524 non-null  object 
 3   pdes            958524 non-null  object 
 4   name            22064 non-null   object 
 5   prefix          18 non-null      object 
 6   neo             958520 non-null  object 
 7   pha             938603 non-null  object 
 8   H               952261 non-null  float64
 9   diameter        136209 non-null  float64
 10  albedo          135103 non-null  float64
 11  diameter_sigma  136081 non-null  float64
 12  orbit_id        958524 non-null  object 
 13  epoch           958524 non-null  float64
 14  epoch_mjd       958524 non-null  int64  
 15  epoch_cal       958524 non-null  float64
 16  equinox         958524 non-null  object 
 17  e               958524 non-null  float64
 18  a               958524 non-null  float64
 19  q               958524 non-null  float64
 20  i               958524 non-null  float64
 21  om              958524 non-null  float64
 22  w               958524 non-null  float64
 23  ma              958523 non-null  float64
 24  ad              958520 non-null  float64
 25  n               958524 non-null  float64
 26  tp              958524 non-null  float64
 27  tp_cal          958524 non-null  float64
 28  per             958520 non-null  float64
 29  per_y           958523 non-null  float64
 30  moid            938603 non-null  float64
 31  moid_ld         958397 non-null  float64
 32  sigma_e         938602 non-null  float64
 33  sigma_a         938602 non-null  float64
 34  sigma_q         938602 non-null  float64
 35  sigma_i         938602 non-null  float64
 36  sigma_om        938602 non-null  float64
 37  sigma_w         938602 non-null  float64
 38  sigma_ma        938602 non-null  float64
 39  sigma_ad        938598 non-null  float64
 40  sigma_n         938602 non-null  float64
 41  sigma_tp        938602 non-null  float64
 42  sigma_per       938598 non-null  float64
 43  class           958524 non-null  object 
 44  rms             958522 non-null  float64
dtypes: float64(33), int64(2), object(10)
memory usage: 329.1+ MB


# Deleting some useless columns:
df.drop(columns=["prefix", "name", "spkid", "orbit_id", 'pdes'], inplace=True)
print(f"Shape: {df.shape}")

Shape: (958524, 40)


# Deleting rms column:
df.drop(columns=['rms'], inplace=True)


# Unique values of the full_name column: 
df['full_name'].unique()

array(['     1 Ceres', '     2 Pallas', '     3 Juno', ...,
       '       (6344 P-L)', '       (2060 T-2)', '       (2678 T-3)'],
      dtype=object)


# Removing the blank space in front of the names: 
df['full_name'] = df['full_name'].str.strip(" ")
df['full_name'].unique()

array(['1 Ceres', '2 Pallas', '3 Juno', ..., '(6344 P-L)', '(2060 T-2)',
       '(2678 T-3)'], dtype=object)


# Setting ID  and Full_name as a multindex:
df.set_index(["id", "full_name"], inplace=True)


# Correlation between moid and moid_ld
df[['moid', 'moid_ld']].corr()


# Dropping the moid_ld column:
df.drop(columns=['moid_ld'], inplace=True)


# Displaying the corelation of tp and tp_cal:
display(df[['tp', 'tp_cal']].corr())


# Looking at the first five rows:
df.head()


# Looking for missing data:
df.isna().sum().sort_values(ascending=False).to_frame()


# Deleting columns that hava more than 50% of missing data:
df.dropna(thresh=0.5*len(df), axis=1, inplace=True)
print(f"Shape: {df.shape}")

Shape: (958524, 33)


# Looking for missing data:
df.isna().sum().sort_values(ascending=False)

sigma_per    19926
sigma_ad     19926
sigma_om     19922
sigma_e      19922
sigma_tp     19922
sigma_n      19922
sigma_ma     19922
sigma_w      19922
sigma_i      19922
sigma_q      19922
sigma_a      19922
moid         19921
pha          19921
H             6263
neo              4
ad               4
per              4
ma               1
per_y            1
tp               0
tp_cal           0
n                0
w                0
om               0
i                0
q                0
a                0
e                0
equinox          0
epoch_cal        0
epoch_mjd        0
epoch            0
class            0
dtype: int64


# Description of the dataset:
df.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 958524 entries, ('a0000001', '1 Ceres') to ('bT3S2678', '(2678 T-3)')
Data columns (total 33 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   neo        958520 non-null  object 
 1   pha        938603 non-null  object 
 2   H          952261 non-null  float64
 3   epoch      958524 non-null  float64
 4   epoch_mjd  958524 non-null  int64  
 5   epoch_cal  958524 non-null  float64
 6   equinox    958524 non-null  object 
 7   e          958524 non-null  float64
 8   a          958524 non-null  float64
 9   q          958524 non-null  float64
 10  i          958524 non-null  float64
 11  om         958524 non-null  float64
 12  w          958524 non-null  float64
 13  ma         958523 non-null  float64
 14  ad         958520 non-null  float64
 15  n          958524 non-null  float64
 16  tp         958524 non-null  float64
 17  tp_cal     958524 non-null  float64
 18  per        958520 non-null  float64
 19  per_y      958523 non-null  float64
 20  moid       938603 non-null  float64
 21  sigma_e    938602 non-null  float64
 22  sigma_a    938602 non-null  float64
 23  sigma_q    938602 non-null  float64
 24  sigma_i    938602 non-null  float64
 25  sigma_om   938602 non-null  float64
 26  sigma_w    938602 non-null  float64
 27  sigma_ma   938602 non-null  float64
 28  sigma_ad   938598 non-null  float64
 29  sigma_n    938602 non-null  float64
 30  sigma_tp   938602 non-null  float64
 31  sigma_per  938598 non-null  float64
 32  class      958524 non-null  object 
dtypes: float64(28), int64(1), object(4)
memory usage: 327.8+ MB


# Plottinhg the missing data matrix:
ms.matrix(df, figsize=(30, 10));


# Looking at the missing data of the sigma columns:
df_2 = df[["pha", "moid", "sigma_e", "sigma_a", 
    "sigma_q", "sigma_i", "sigma_om", 
    "sigma_w", "sigma_ma", "sigma_ad",
    "sigma_n", "sigma_tp", "sigma_per"]]

df_2[df["pha"].isnull()]


# Dropping observations with missing data in the dependent variable:
df.dropna(subset="pha", inplace=True)
df.isna().sum().sort_values(ascending=False).to_frame()


# Plotting the missing data matrix:
ms.matrix(df[['H']]);


# Separete into two sets of data:
features = df.drop(columns='pha')
target = df['pha']

# Splitting the data:
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Imputation of the missing data with the mean:
simpler_imputer = SimpleImputer(strategy='mean')
X_train['H'] = simpler_imputer.fit_transform(X_train[['H']])
X_test['H'] = simpler_imputer.transform(X_test[['H']])


# Dropping observations with missing data:
X_train['target'] = y_train
X_test['target'] = y_test

X_train.dropna(inplace=True)
X_test.dropna(inplace=True)


# Training set with the same number of obsertions:
# Test set with the same number of observations:
y_train = X_train['target']
y_test = X_test['target']

X_train.drop(columns=['target'], inplace=True)
X_test.drop(columns=['target'], inplace=True)


# Looking at the unique data of equinox column:
df['equinox'].unique()

array(['J2000'], dtype=object)


# Dropping equinox column:
X_train.drop(columns=['equinox'], inplace=True)
X_test.drop(columns=['equinox'], inplace=True)


# Shape of the sets:
print(f"Traning Shape: {X_train.shape}")
print(f"Test Shape: {X_test.shape}")

Traning Shape: (750877, 31)
Test Shape: (187720, 31)


# Encoding the target variable:
encoder = LabelEncoder()
y_train = encoder.fit_transform(y_train)
y_test = encoder.transform(y_test)


# Informations after Data Cleaning:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 750877 entries, ('bK14EF8X', '(2014 EX158)') to ('a0121959', '121959 (2000 EL67)')
Data columns (total 31 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   neo        750877 non-null  object 
 1   H          750877 non-null  float64
 2   epoch      750877 non-null  float64
 3   epoch_mjd  750877 non-null  int64  
 4   epoch_cal  750877 non-null  float64
 5   e          750877 non-null  float64
 6   a          750877 non-null  float64
 7   q          750877 non-null  float64
 8   i          750877 non-null  float64
 9   om         750877 non-null  float64
 10  w          750877 non-null  float64
 11  ma         750877 non-null  float64
 12  ad         750877 non-null  float64
 13  n          750877 non-null  float64
 14  tp         750877 non-null  float64
 15  tp_cal     750877 non-null  float64
 16  per        750877 non-null  float64
 17  per_y      750877 non-null  float64
 18  moid       750877 non-null  float64
 19  sigma_e    750877 non-null  float64
 20  sigma_a    750877 non-null  float64
 21  sigma_q    750877 non-null  float64
 22  sigma_i    750877 non-null  float64
 23  sigma_om   750877 non-null  float64
 24  sigma_w    750877 non-null  float64
 25  sigma_ma   750877 non-null  float64
 26  sigma_ad   750877 non-null  float64
 27  sigma_n    750877 non-null  float64
 28  sigma_tp   750877 non-null  float64
 29  sigma_per  750877 non-null  float64
 30  class      750877 non-null  object 
dtypes: float64(28), int64(1), object(2)
memory usage: 262.4+ MB


# Function that calculates kurtosis and skewness of a dataset:
def kurtosis_skewness(dataset: any):
    col = ["Kurtosis", "Skewness"]
    results = pd.DataFrame(index=dataset.describe().columns, columns=col)
    for c in dataset.describe().columns:
        kurt = kurtosis(dataset[c])
        skewness = skew(dataset[c])
        results.loc[c] = [kurt, skewness]

    return results


# Heatmap of correlations:
def heatmap_cor(df: any):
    correlation = df.coRr()
    mask = np.zeros_like(correlation)
    mask[np.triu_indices_from(correlation)] = True
    sns.heatmap(correlation, mask=mask, annot=True, c="crest", cbar=True)


# Statistical Summary:
sigma_list = [i for i in  X_train.columns if 'sigma' in i]
X_train[sigma_list].describe()


# Selecting columns:
initial_columns = ['H', 'epoch', 'epoch_mjd', 'epoch_cal', 'e', 'a', 'q', 'i']

# Descriptive Statistic of the initial numeric columns:
X_train[initial_columns].describe()


# Selecting columns:
other_columns = ['om', 'w', 'ma', 'ad', 'n', 'tp', 'tp_cal', 'per', 'per_y', 'moid']

# Descriptive Statistic of the other numeric columns: 
X_train[other_columns].describe()


# Changing the scale of the data:
new_name = ['log_' + i for i in sigma_list]
dictionary = dict(zip(sigma_list, new_name))
log_data = np.log(X_train[sigma_list])
log_data.rename(columns=dictionary, inplace=True)


# Creating a subplot:
fig, ax = plt.subplots(nrows= 3, ncols=4, sharex=False, sharey=False, figsize=(25, 15))
fig.suptitle("Distributions of the Log-Sigma data")

# Histogram plots:
sns.histplot(log_data['log_sigma_e'], ax=ax[0, 0], kde=True)
sns.histplot(log_data['log_sigma_a'], ax=ax[0, 1], kde=True)
sns.histplot(log_data['log_sigma_q'], ax=ax[0, 2], kde=True)
sns.histplot(log_data['log_sigma_i'], ax=ax[0, 3], kde=True)
sns.histplot(log_data['log_sigma_om'], ax=ax[1, 0], kde=True)
sns.histplot(log_data['log_sigma_w'], ax=ax[1, 1], kde=True)
sns.histplot(log_data['log_sigma_ma'], ax=ax[1, 2], kde=True)
sns.histplot(log_data['log_sigma_ad'], ax=ax[1, 3], kde=True)
sns.histplot(log_data['log_sigma_n'], ax=ax[2, 0], kde=True)
sns.histplot(log_data['log_sigma_tp'], ax=ax[2, 1], kde=True)
sns.histplot(log_data['log_sigma_per'], ax=ax[2, 2], kde=True)

ax[2, 3].set_visible(False)


# Printing Kurtosis and Skewness for the log(sigma) columns:
kurtosis_skewness(log_data)


# Creating a subplot:
fig, ax = plt.subplots(nrows=2, ncols=4, sharex=False, sharey=False, figsize=(25, 15))
fig.suptitle("Histograms and Kernel Density Function")

# Histogram plots:
sns.histplot(X_train['H'], ax=ax[0, 0], kde=True)
sns.histplot(X_train['epoch'], ax=ax[0, 1], kde=True)
sns.histplot(X_train['epoch_mjd'], ax=ax[0, 2], kde=True)
sns.histplot(X_train['epoch_cal'], ax=ax[0, 3], kde=True)
sns.histplot(X_train['e'], ax=ax[1, 0], kde=True)
sns.histplot(X_train['a'], ax=ax[1, 1], kde=True)
sns.histplot(X_train['q'], ax=ax[1, 2], kde=True)
sns.histplot(X_train['i'], ax=ax[1, 3], kde=True);


# Printing Kurtosis and Skewness for the initial columns:
kurtosis_skewness(X_train[initial_columns])


# Creating a subplot:
fig, ax = plt.subplots(nrows=3, ncols=4, sharex=False, sharey=False, figsize=(25, 15))
fig.suptitle("Histograms and Kernel Density Function")

# Histogram plots:
sns.histplot(X_train['om'], ax=ax[0, 0], kde=True)
sns.histplot(X_train['w'], ax=ax[0, 1], kde=True)
sns.histplot(X_train['ma'], ax=ax[0, 2], kde=True)

sns.histplot(np.log(X_train['ad']), ax=ax[0, 3], kde=True)
ax[0, 3].set_xlabel('log-ad')

sns.histplot(X_train['n'], ax=ax[1, 0], kde=True)
sns.histplot(X_train['tp'], ax=ax[1, 1], kde=True)
sns.histplot(X_train['tp_cal'], ax=ax[1, 2], kde=True)

sns.histplot(np.log(X_train['per']), ax=ax[1, 3], kde=True)
ax[1, 3].set_xlabel('log-per')

sns.histplot(np.log(X_train['per_y']), ax=ax[2, 0], kde=True)
ax[2, 0].set_xlabel('log-per_y')

sns.histplot(X_train['moid'], ax=ax[2, 1], kde=True)

ax[2, 2].set_visible(False)
ax[2, 3].set_visible(False);


# Printing Kurtosis and Skewness for the other columns:
kurtosis_skewness(X_train[other_columns])


df_train = X_train.copy()
df_train['Target'] = y_train


# Boxplots of the sigma variables:
fig, ax = plt.subplots(nrows=3, ncols=4, sharex=False, sharey=False, figsize=(20, 15))
fig.suptitle("Boxplots")

sns.boxplot(X_train[['sigma_e']], ax=ax[0, 0])
sns.boxplot(X_train[['sigma_a']], ax=ax[0, 1])
sns.boxplot(X_train[['sigma_q']], ax=ax[0, 2])
sns.boxplot(X_train[['sigma_i']], ax=ax[0, 3])
sns.boxplot(X_train[['sigma_om']], ax=ax[1, 0])
sns.boxplot(X_train[['sigma_w']], ax=ax[1, 1])
sns.boxplot(X_train[['sigma_ma']], ax=ax[1, 2])
sns.boxplot(X_train[['sigma_ad']], ax=ax[1, 3])
sns.boxplot(X_train[['sigma_n']], ax=ax[2, 0])
sns.boxplot(X_train[['sigma_tp']], ax=ax[2, 1])
sns.boxplot(X_train[['sigma_per']], ax=ax[2, 2])

ax[2, 3].set_visible(False)


# Boxplot of the first set of columns:
fig, ax = plt.subplots(nrows=2, ncols=4, sharex=False, sharey=False, figsize=(25, 15))
fig.suptitle("Boxplots")

sns.boxplot(X_train[['H']], ax=ax[0, 0])
sns.boxplot(X_train[['epoch']], ax=ax[0, 1])
sns.boxplot(X_train[['epoch_mjd']], ax=ax[0, 2])
sns.boxplot(X_train[['epoch_cal']], ax=ax[0, 3])
sns.boxplot(X_train[['e']], ax=ax[1, 0])
sns.boxplot(X_train[['a']], ax=ax[1, 1])
sns.boxplot(X_train[['q']], ax=ax[1, 2])
sns.boxplot(X_train[['i']], ax=ax[1, 3]);


# Boxplot of the rest of the columns:
fig, ax = plt.subplots(nrows=3, ncols=4, sharex=False, sharey=False, figsize=(25, 15))
fig.suptitle("Boxplots")

sns.boxplot(X_train[['om']], ax=ax[0, 0])
sns.boxplot(X_train[['w']], ax=ax[0, 1])
sns.boxplot(X_train[['ma']], ax=ax[0, 2])
sns.boxplot(X_train[['ad']], ax=ax[0, 3])
sns.boxplot(X_train[['n']], ax=ax[1, 0])
sns.boxplot(X_train[['tp']], ax=ax[1, 1])
sns.boxplot(X_train[['tp_cal']], ax=ax[1, 2])
sns.boxplot(X_train[['per']], ax=ax[1, 3])
sns.boxplot(X_train[['per_y']], ax=ax[2, 0])
sns.boxplot(X_train[['moid']], ax=ax[2, 1])

ax[2, 2].set_visible(False)
ax[2, 3].set_visible(False);


def mann_whitney_func(features, target, numeric_columns):
    statistics = []
    p_values = []

    group1 = features[target==1]
    group2 = features[target==0]

    for feature in numeric_columns:
        statistic, p_value = mannwhitneyu(group1[feature].values, group2[feature].values)
        statistics.append(statistic)
        p_values.append(p_value)
        
    dicionario = dict(Mann_Whitney_U=statistics, P_value=p_values)
    df_pt = pd.DataFrame(index=numeric_columns, data=dicionario)
    return df_pt



def ch2_func(features, target):
    cat_columns = features.columns
    statistics = []
    p_values = []
    
    for feature in cat_columns:
        # Creating a contingency table:
        df_contingency = pd.crosstab(features[feature], target)

        # Chi squared:
        statistic, p_value, dof, expected = chi2_contingency(df_contingency)
    
        statistics.append(statistic)
        p_values.append(p_value)
        
    dicionario = dict(Chi_Squared=statistics, P_value=p_values)
    df_ch = pd.DataFrame(index=cat_columns, data=dicionario)
    return df_ch


# Selecting the numerical columns:
numeric_columns = sigma_list + initial_columns + other_columns

# Computing the Mann Whitney U test:
df_mw = mann_whitney_func(X_train, y_train, numeric_columns)
df_mw


# Selecting the categorical columns:
categorical_columns = ['class', 'neo']

# Applying ch2 function:
df_ch = ch2_func(X_train[categorical_columns], y_train)
df_ch


# Dataframe for analysis:
df_train = X_train.copy()
df_train['Target'] = y_train


# Absolute Magnitude vs Eccentricity:
sns.scatterplot(x=df_train['H'], y=df_train['e'], hue=df_train['Target'], alpha=0.6)
plt.ylabel("Eccentricity")
plt.xlabel("Absolute Magnitude")
plt.show()


# Plotting the difference bewteen the average eccentricity by target class:
group = df_train.groupby('Target').agg({'e':'mean'}).reset_index()
sns.barplot(y=group['e'], x=group['Target'])
plt.title("Average eccentricity by Target")
plt.ylabel("Eccentricity")
plt.show()


# Plotting Near-Earth Object vs Class as a contigency table:
cross = pd.crosstab(df_train['neo'], df_train['Target'])
sns.heatmap(cross, annot=True, cmap=None, cbar=True)
plt.show()


# Contingency table of class and the target:
cross = pd.crosstab(df_train['class'], df_train['Target'])
plt.figure(figsize=(15, 10))
sns.heatmap(cross, cbar=True, annot=True, cmap='crest')
plt.show()


# Looking into the Aphelium distance (ad) and the Perielium distance (q):
sns.scatterplot(y=df_train['ad'], x=df_train['q'], hue=df_train['Target'])
plt.title("Aphelium distance (ad) versus Perielium distance (q)")
plt.yscale("log") # Setting the y axis as a log scale axis
plt.xlabel("Perielium distance (au)")
plt.ylabel("log-Aphelium distance")
plt.show()


# Aphelium and Perihelium distance by target class:
group = df_train.groupby('Target').agg({'ad':'mean', 'q':'mean'}).reset_index()

fig, ax = plt.subplots(ncols=2, nrows=1, figsize=(15, 5), sharex=False, sharey=False)
sns.barplot(x=group['Target'], y=group['ad'], ax=ax[0])
ax[0].set_ylabel("Average Aphelium distance (au)")

sns.barplot(x=group['Target'], y=group['q'], ax=ax[1])
ax[1].set_ylabel("Average Perielium distance (au)")

plt.show()


# Argument of Perihelium versus Perihelium distance:
sns.scatterplot(y=df_train['w'], x=df_train['q'], hue=df_train['Target'], alpha=0.5)
plt.xlabel("Perihelium distance (au)")
plt.ylabel("Argument of Perihelium (deg)")
plt.show()


group = df_train.groupby('Target').agg({'H':'mean'}).reset_index()
sns.barplot(x=group['Target'], y=group['H'])
plt.ylabel("Absolute Magnitude")
plt.show()


# Boxplots of Argument of Perihelium (tp) and Argument of Perihelium calibrated (tp_cal) per class:
fig, ax = plt.subplots(ncols=2, nrows=1, figsize=(15, 5), sharex=False, sharey=False)

sns.boxplot(y=df_train['tp'], x=df_train['Target'], ax=ax[0])
ax[0].set_ylabel("Argument of Perihelium (tp)")

sns.boxplot(y=df_train['tp_cal'], x=df_train['Target'], ax=ax[1])
ax[1].set_ylabel("Argument of Perihelium cal (tp_cal)")

plt.show()


# Dropping the Argument of Perihelium cal:
df_train.drop(columns=['tp_cal'], inplace=True)
X_test.drop(columns=['tp_cal'], inplace=True)


# Splitting the training dataframe:
X_train = df_train.drop(columns='Target')


# Numeric columns without the tp_cal column:
numeric_columns = [i for i in numeric_columns if i != 'tp_cal']


# Transforming the categorical attributes into dummy variables:
X_train_dummy = pd.get_dummies(X_train, drop_first=True, dtype=bool)
X_test_dummy = pd.get_dummies(X_test, drop_first=True, dtype=bool)


# Class that put together many feature selection techniques:

class feature_selector:
    seed = 42
    def __init__(self, X, y) -> None:
        self.X_train = X
        self.y_train = y
    
    def  randomforestC_imp(self) -> None:
        model = RandomForestClassifier(random_state=feature_selector.seed)
        model.fit(self.X_train, self.y_train)
        series = pd.Series(index=model.feature_names_in_, data=model.feature_importances_).sort_values(ascending=False)

        # Plotting RandomForest Regression Importance:
        plt.figure(figsize=(15, 15))
        plt.title("RandomForest Importance")
        plt.xlabel("Importance")
        sns.barplot(x=series.values, y=series.index)
        

    def xgbC_imp(self) -> None:
        model = XGBClassifier(random_state=feature_selector.seed)
        model.fit(self.X_train, self.y_train)
        series = pd.Series(index=model.feature_names_in_, data=model.feature_importances_).sort_values(ascending=False)

        # Plotting XGboost Regression Feature Importance:
        plt.figure(figsize=(15,15))
        plt.title("XGBoost Feature Importance")
        plt.xlabel("Importance")
        sns.barplot(y=series.index, x=series.values)

    # Univariate feature selection:
    def univariate(self, statistic, n="all") -> None:
        selector = SelectKBest(score_func=statistic, k=n)
        selector.fit(self.X_train, self.y_train)
        
        series = pd.Series(index=selector.feature_names_in_, data=selector.scores_).\
            sort_values(ascending=False)
        plt.title("Univariate feature selection - Filtering")
        sns.barplot(y=series.index, x=series.values)

        return selector
    
    # Wrapper method for feature selection:
    def refcv(self):
        
        models = {
            "RandomForestR":RandomForestClassifier(random_state=feature_selector.seed),
            "ExtraTreeR":ExtraTreesClassifier(random_state=feature_selector.seed),
            "XGB":XGBClassifier(random_state=feature_selector.seed)
        }

        splits=10
        step = 1
        min_features = 30
        cross = StratifiedKFold(n_splits=splits, random_state=feature_selector.seed, shuffle=True)
        ind = [f"Columns {i}" for i in range(min_features, len(self.X_train.columns) + 1)]
        df = pd.DataFrame(index=ind)
        
        minimo = -np.inf
        name = ""
        for key, model in models.items():
            rfecv = RFECV(estimator=model, step=step, cv=cross, min_features_to_select=min_features, scoring="f1")
            rfecv.fit(self.X_train, self.y_train)
            root_mean = rfecv.cv_results_["mean_test_score"]
            df[key] = root_mean
            best_value = root_mean[np.argmax(root_mean)]

            if best_value > minimo:
                minimo = best_value
                best_features = rfecv.support_
                name = key

        df_features = pd.DataFrame(columns=self.X_train.columns, data=best_features.reshape(1, -1), index=[name])
    
        return df,  df_features.transpose()


# Feature Selector object:
selector = feature_selector(X_train_dummy, y_train)


# Random Forest Importance:
selector.randomforestC_imp()


# XGBoost Classifier Importance:
selector.xgbC_imp()


# Recusive feature elimination method:
df_results, df_best_features = selector.refcv()


# Results for each Algorithm:
df_results


# Best number set of features and the respective algorithm:
df_best_features


# Selected columns:
columns_selected = ['H', 'epoch', 'epoch_cal', 'epoch_mjd', 'e', 'a', 'q', 'i', 'om', 'w',
                    'ma', 'ad', 'n', 'tp', 'per', 'moid', 'sigma_e', 'sigma_a', 'sigma_q',
                    'sigma_i', 'sigma_om', 'sigma_w', 'sigma_ma', 'sigma_ad', 'sigma_n',
                    'sigma_tp', 'sigma_per', 'neo_Y', 'class_APO', 'class_AST']


# Dropping unnecessary features:
X_train = X_train_dummy[columns_selected]
X_test = X_test_dummy[columns_selected]


# Exctracting new numeric columns:
numeric_columns = [c for c in X_train.columns if not pd.api.types.is_bool_dtype(X_train[c])]


# Copy of the train and test set:
X_train_std = X_train.copy()
X_test_std = X_test.copy()


# Fitting a Standard Scaler object:
std_scaler = StandardScaler()
X_train_std_numeric = std_scaler.fit_transform(X_train[numeric_columns])
X_test_std_numeric = std_scaler.transform(X_test[numeric_columns])

# Switching the old numeric columns by the normalized set of columns:
X_train_std[numeric_columns] = X_train_std_numeric
X_test_std[numeric_columns] = X_test_std_numeric


# Copy of the train and test set:
X_train_min_max = X_train.copy()
X_test_min_max = X_test.copy()

# Fitting a Standard Scaler object:
min_max_scaler = RobustScaler()
X_train_min_max_numeric = min_max_scaler.fit_transform(X_train[numeric_columns])
X_test_min_max_numeric = min_max_scaler.transform(X_test[numeric_columns])

# Switching the old numeric columns by the normalized set of columns:
X_train_min_max[numeric_columns] = X_train_min_max_numeric
X_test_min_max[numeric_columns] = X_test_min_max_numeric


# Copy of the train and test set:
X_train_rb = X_train.copy()
X_test_rb = X_test.copy()

# Fitting a Standard Scaler object:
rb_scaler = RobustScaler()
X_train_rb_numeric = rb_scaler.fit_transform(X_train[numeric_columns])
X_test_rb_numeric = rb_scaler.transform(X_test[numeric_columns])

# Switching the old numeric columns by the normalized set of columns:
X_train_rb[numeric_columns] = X_train_rb_numeric
X_test_rb[numeric_columns] = X_test_rb_numeric


# Function used to evaluate the best algorithms:

def melhor_modelo(X_train, y_train):

    seed = 42
    cv = 10
    score = ['f1']
    result_f1 = {}

    dicionario = { 
                "RandomForestR":RandomForestClassifier(random_state=seed),
                "ExtraTreeR":ExtraTreesClassifier(random_state=seed),
                "XGB":XGBClassifier(random_state=seed),
                "MLP":MLPClassifier(random_state=seed, max_iter=2000)
                 }


    for name, model in dicionario.items():
        k_fold = StratifiedKFold(n_splits=cv, random_state=seed, shuffle=True)
        result = cross_validate(model, X_train, y_train, cv=k_fold, scoring=score)

        result_f1[name] = result['test_f1']
        
        
    result_pd_f1 = pd.DataFrame(data=result_f1)

    return result_pd_f1


class Best_model_selector:
    seed = 42
    cv = 10

    def __init__(self, X_train, y_train) -> None:
        self.X_train = X_train
        self.y_train = y_train
    
    # Function used to evaluate the best algorithms:

    def best_model(self, scoring='f1', imbalance=False):
        result_f1 = {}
        dicionario = { 
                    "RandomForestR":RandomForestClassifier(random_state=Best_model_selector.seed),
                    "ExtraTreeR":ExtraTreesClassifier(random_state=Best_model_selector.seed),
                    "XGB":XGBClassifier(random_state=Best_model_selector.seed),
                    "MLP":MLPClassifier(random_state=Best_model_selector.seed, max_iter=2000)
                    }

        for name, model in dicionario.items():
            if imbalance:
                categorical_index = [X_train.columns.get_loc(c) for c in X_train.columns if pd.api.types.is_bool_dtype(X_train[c])]
                pipeline = make_pipeline(SMOTENC(categorical_features=categorical_index , random_state=Best_model_selector.seed), 
                                               model)
                
                k_fold = StratifiedKFold(n_splits=Best_model_selector.cv, random_state=Best_model_selector.seed, shuffle=True)
                result = cross_validate(pipeline, self.X_train, self.y_train, cv=k_fold, scoring=[scoring])
            else:
                k_fold = StratifiedKFold(n_splits=Best_model_selector.cv, random_state=Best_model_selector.seed, shuffle=True)
                result = cross_validate(model, self.X_train, self.y_train, cv=k_fold, scoring=[scoring])

            result_f1[name] = result['test_f1']
            
        result_pd_f1 = pd.DataFrame(data=result_f1)
        return result_pd_f1


# Instace of the best model selector:
model_selector = Best_model_selector(X_train_std, y_train)


# Best algorithm results using Standard Scaler:
result_std = model_selector.best_model()
result_std.describe()


# Plotting a boxplot for model's evaluation:
plt.figure(figsize=(15, 5))
plt.title("Perfomance of different algorithms")
sns.boxplot(result_std)
plt.show()


# Instace of the best model selector:
model_selector = Best_model_selector(X_train_min_max, y_train)


# Best algorithm results using Min max Scaler:
result_min_max = model_selector.best_model()
result_min_max.describe()


# Plotting a boxplot for model's evaluation:
plt.figure(figsize=(15, 5))
plt.title("Perfomance of different algorithms")
sns.boxplot(result_min_max)
plt.show()


# Instace of the best model selector:
model_selector = Best_model_selector(X_train_rb, y_train)


# Best algorithm:
result_rb = model_selector.best_model()
result_rb.describe()


# Plotting a boxplot for model's evaluation:
plt.figure(figsize=(15, 5))
plt.title("Perfomance of different algorithms")
sns.boxplot(result_rb)
plt.show()


# Instace of the best model selector:
model_selector = Best_model_selector(X_train_std, y_train)


# Best algorithm:
result_sm_std = model_selector.best_model(imbalance=True)
result_sm_std.describe()


# Plotting a boxplot for model's evaluation:
plt.figure(figsize=(15, 5))
plt.title("Perfomance of different algorithms using SMOTENC")
sns.boxplot(result_sm_std)
plt.show()


# Instace of the best model selector:
model_selector = Best_model_selector(X_train_min_max, y_train)


# Best algorithm:
result_sm_min_max = model_selector.best_model(imbalance=True)
result_sm_min_max.describe()


# Plotting a boxplot for model's evaluation:
plt.figure(figsize=(15, 5))
plt.title("Perfomance of different algorithms using SMOTENC")
sns.boxplot(result_sm_min_max)
plt.show()


# Instace of the best model selector:
model_selector = Best_model_selector(X_train_rb, y_train)


# Best algorithm:
result_sm_rb = model_selector.best_model(imbalance=True)
result_sm_rb.describe()


# Plotting a boxplot for model's evaluation:
plt.figure(figsize=(15, 5))
plt.title("Perfomance of different algorithms using SMOTENC")
sns.boxplot(result_sm_rb)
plt.show()


# Function for fine tuning an arbitrary model:
def tuning(X_train, y_train, modelo, params):
    
    cv = 5
    score = "f1"
    seed = 42
    categorical_index = []

    # Index of categorical features:
    for column in X_train.columns:
        if pd.api.types.is_bool_dtype(X_train[column]):
            categorical_index.append(X_train.columns.get_loc(column))



    pipeline = make_pipeline(
                        SMOTENC(categorical_features=categorical_index , random_state=seed),
                        modelo
                        )
    
    grid  = GridSearchCV(pipeline, cv=cv, param_grid=params, 
                         scoring=score, 
                         n_jobs=-1,
                         return_train_score=True,
                         )

    grid.fit(X_train, y_train)

    best_index = grid.best_index_
    result = grid.cv_results_

    train_score = result['mean_train_score'][best_index]
    left_out = result['mean_test_score'][best_index]


    print(f"Train score: {train_score}")
    print(f"Left out data score: {left_out}")

    return grid.best_params_


# Function that creates a directory named model and save a model in joblib format:
def save_model(model, name):
    try:
        joblib.dump(model, "model/{}".format(name))
    except:
        os.makedirs("model")
        path = "model/{}".format(name)
        joblib.dump(model, path)


# Hyperparameter grid for XGBoost:
params = {
    'n_estimators':[100, 120, 150],
    'max_depth':[5, 6, 7, 8],
    'eta':[0.3, 0.01, 0.001],
    'max_delta_step':[0, 0.5, 1],
    'tree_method':['gpu_hist']
}


# Fine Tuning the mode:
best_algorithm = XGBClassifier(random_state=42)
best_params = tuning(best_algorithm)


# Best params:
best_params


# Training the model with the whole training set:
best_estimator = XGBClassifier(**best_params)
best_estimator.fit(X_train, y_train)


# Saving the model:
save_model(best_estimator, "xgb.joblib")


# Function to compute and plot the confusion matrix as a heatmap:
def confusion_map(y_true, y_pred):
    matrix = confusion_matrix(y_true, y_pred)
    plt.title("Confusion Matrix Heatmap")
    sns.heatmap(matrix, cmap=True, annot=True, xticklabels='Predicted', yticklabels='Truth')
    plt.show()

# Function to calculate classification metrics and create a dataframe:
def metrics(y_true, y_pred): 
    index_metrics = ['Accuracy', 'F1-score', 'Recall', 'Precision']
    accuracy = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    metrics_df = pd.DataFrame(dict(Metrics=[accuracy, f1, recall, precision]), index=index_metrics)
    return metrics_df

# Function to calculate the Area under the curve and plot the ROC-Curve:
def roc_curve(y_true, y_prob):
    tpr, fpr, thresholds = roc_curve(y_true, y_prob)
    auc = roc_auc_score(y_true, y_prob)
    plt.title("ROC Curve")
    sns.lineplot(x=tpr, y=fpr)
    plt.xlabel("False Postive Rate (1 - Specificity)")
    plt.ylabel("True Positive Rate (Sensitivity)")
    return auc


# loading the model:
path = os.path.join('model', 'xgb.joblib')
model = joblib.load(path)


# Making predictions on the test set:
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)

# Making predictions on the trainin set:
y_pred_train = model.predict(X_train)


# Confusion Matrix:
confusion_map(y_test, y_pred)


# Metrics for test set and training set:
metrics_test = metrics(y_test, y_pred)
metrics_train = metrics(y_train, y_pred_train)

# Dataframes display:
display(metrics_test)
display(metrics_train)


# Calculating ROC-Curve and Area under the curve:
auc = roc_curve(y_test, y_prob)


# Area under the curve metric on the test set:
print('Test set:')
print(f"Area under the roc curve: {auc}")

		neo	pha	H	diameter	albedo	diameter_sigma	epoch	epoch_mjd	epoch_cal	equinox	...	sigma_q	sigma_i	sigma_om	sigma_w	sigma_ma	sigma_ad	sigma_n	sigma_tp	sigma_per	class
id	full_name
a0000001	1 Ceres	N	N	3.40	939.400	0.0900	0.200	2458600.5	58600	20190427.0	J2000	...	1.956900e-11	4.608900e-09	6.168800e-08	6.624800e-08	7.820700e-09	1.111300e-11	1.196500e-12	3.782900e-08	9.415900e-09	MBA
a0000002	2 Pallas	N	N	4.20	545.000	0.1010	18.000	2459000.5	59000	20200531.0	J2000	...	8.832200e-08	3.469400e-06	6.272400e-06	9.128200e-06	8.859100e-06	4.961300e-09	4.653600e-10	4.078700e-05	3.680700e-06	MBA
a0000003	3 Juno	N	N	5.33	246.596	0.2140	10.594	2459000.5	59000	20200531.0	J2000	...	8.139200e-08	3.223100e-06	1.664600e-05	1.772100e-05	8.110400e-06	4.363900e-09	4.413400e-10	3.528800e-05	3.107200e-06	MBA
a0000004	4 Vesta	N	N	3.00	525.400	0.4228	0.200	2458600.5	58600	20190427.0	J2000	...	1.928600e-09	2.170600e-07	3.880800e-07	1.789300e-07	1.206800e-06	1.648600e-09	2.612500e-10	4.103700e-06	1.274900e-06	MBA
a0000005	5 Astraea	N	N	6.90	106.699	0.2740	3.140	2459000.5	59000	20200531.0	J2000	...	6.092400e-08	2.740800e-06	2.894900e-05	2.984200e-05	8.303800e-06	4.729000e-09	5.522700e-10	3.474300e-05	3.490500e-06	MBA

		pha	moid	sigma_e	sigma_a	sigma_q	sigma_i	sigma_om	sigma_w	sigma_ma	sigma_ad	sigma_n	sigma_tp	sigma_per
id	full_name
bJ39R00R	(1939 RR)	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
bJ90O05K	(1990 OK5)	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
bJ91R28N	(1991 RN28)	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
bJ93T11C	(1993 TC11)	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
bJ94A09F	(1994 AF9)	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
bK20K03Q	(2020 KQ3)	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
bK20K03R	(2020 KR3)	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
bK20K03V	(2020 KV3)	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
bK20K04A	(2020 KA4)	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
bK20K04C	(2020 KC4)	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN

	sigma_e	sigma_a	sigma_q	sigma_i	sigma_om	sigma_w	sigma_ma	sigma_ad	sigma_n	sigma_tp	sigma_per
count	7.508770e+05	7.508770e+05	7.508770e+05	7.508770e+05	7.508770e+05	7.508770e+05	7.508770e+05	7.508770e+05	7.508770e+05	7.508770e+05	7.508770e+05
mean	7.758346e-01	1.726028e+01	2.054308e+01	1.214982e+00	6.000012e+00	4.549929e+05	4.549157e+05	2.418465e+01	5.603977e-02	1.357960e+08	8.905039e+04
std	1.002947e+02	4.892992e+03	3.059394e+03	1.376154e+02	1.489532e+03	7.682747e+07	7.682117e+07	8.028341e+03	1.096641e+01	2.309041e+10	2.947131e+07
min	1.285600e-10	4.145500e-11	2.616800e-10	4.087800e-08	2.163900e-07	1.789300e-07	2.103800e-07	5.572900e-11	2.860900e-11	3.088400e-07	2.335200e-08
25%	5.477000e-08	2.046300e-08	1.462100e-07	6.094500e-06	3.622600e-05	5.759500e-05	2.573700e-05	2.340400e-08	2.768800e-09	1.110900e-04	1.793700e-05
50%	8.167200e-08	3.850300e-08	2.271700e-07	8.687700e-06	6.650600e-05	1.047800e-04	4.900800e-05	4.361800e-08	4.637100e-09	2.231400e-04	3.500700e-05
75%	2.338800e-07	1.044500e-07	6.595600e-07	1.591900e-05	1.613200e-04	3.118800e-04	1.719500e-04	1.196900e-07	1.125300e-08	8.145700e-04	9.779100e-05
max	3.942500e+04	3.241200e+06	1.015000e+06	5.533000e+04	1.199100e+06	3.340000e+10	3.339300e+10	5.509700e+06	7.698800e+03	1.041500e+13	1.910700e+10

	H	epoch	epoch_mjd	epoch_cal	e	a	q	i
count	750877.000000	7.508770e+05	750877.000000	7.508770e+05	750877.000000	750877.000000	750877.000000	750877.000000
mean	16.890408	2.458871e+06	58870.829587	2.019698e+07	0.156175	2.900448	2.397450	9.044674
std	1.795913	7.055785e+02	705.578515	1.940802e+04	0.092974	12.327240	2.157830	6.658707
min	-1.100000	2.425052e+06	25051.000000	1.927062e+07	0.000003	0.555418	0.075872	0.007744
25%	16.000000	2.459000e+06	59000.000000	2.020053e+07	0.092078	2.388185	1.971846	4.152227
50%	16.900000	2.459000e+06	59000.000000	2.020053e+07	0.144916	2.647204	2.226284	7.398350
75%	17.700000	2.459000e+06	59000.000000	2.020053e+07	0.200694	3.002401	2.579520	12.390983
max	33.200000	2.459000e+06	59000.000000	2.020053e+07	0.999591	8850.823836	80.398819	175.082901

	om	w	ma	ad	n	tp	tp_cal	per	per_y	moid
count	750877.000000	750877.000000	750877.000000	750877.000000	750877.000000	7.508770e+05	7.508770e+05	7.508770e+05	750877.000000	7.508770e+05
mean	168.500688	181.376168	177.210988	3.403445	0.236686	2.458858e+06	2.019585e+07	2.900275e+03	7.940519	1.414834e+00
std	102.886239	103.885218	105.869845	24.216581	0.079762	1.600272e+03	4.391532e+04	3.635303e+05	995.291634	2.155184e+00
min	0.000025	0.000130	-67.136826	0.653773	0.000001	2.283183e+06	1.539011e+07	1.511918e+02	0.413941	4.544120e-07
25%	80.610134	91.541555	83.600824	2.781923	0.189453	2.458564e+06	2.019032e+07	1.348032e+03	3.690711	9.797690e-01
50%	159.947331	182.275912	175.284210	3.047334	0.228835	2.458946e+06	2.020041e+07	1.573184e+03	4.307143	1.240520e+00
75%	252.248318	271.571677	269.886957	3.365602	0.267056	2.459362e+06	2.021053e+07	1.900209e+03	5.202490	1.593080e+00
max	359.999793	359.998075	491.618014	17698.024949	2.381082	2.546362e+06	2.259081e+07	3.041403e+08	832690.763699	7.947660e+01

	0
albedo	823421
diameter_sigma	822443
diameter	822315
sigma_per	19926
sigma_ad	19926
sigma_w	19922
sigma_om	19922
sigma_a	19922
sigma_q	19922
sigma_i	19922
sigma_n	19922
sigma_ma	19922
sigma_tp	19922
sigma_e	19922
moid	19921
pha	19921
H	6263
neo	4
ad	4
per	4
ma	1
per_y	1
n	0
tp_cal	0
tp	0
w	0
om	0
i	0
q	0
a	0
e	0
equinox	0
epoch_cal	0
epoch_mjd	0
epoch	0
class	0

	0
H	6262
sigma_per	5
sigma_ad	5
neo	4
ad	4
per	4
sigma_q	1
sigma_e	1
ma	1
sigma_a	1
sigma_om	1
sigma_i	1
sigma_w	1
sigma_ma	1
sigma_n	1
sigma_tp	1
per_y	1
moid	0
tp	0
tp_cal	0
pha	0
n	0
w	0
om	0
i	0
q	0
a	0
e	0
equinox	0
epoch_cal	0
epoch_mjd	0
epoch	0
class	0

	Kurtosis	Skewness
log_sigma_e	6.204082	2.475314
log_sigma_a	5.646046	2.400785
log_sigma_q	6.867976	2.552479
log_sigma_i	8.351038	2.880583
log_sigma_om	10.566405	2.981978
log_sigma_w	8.715659	2.724005
log_sigma_ma	6.282922	2.400544
log_sigma_ad	5.587056	2.39486
log_sigma_n	5.49624	2.421342
log_sigma_tp	7.331019	2.48295
log_sigma_per	5.787754	2.406471

	Kurtosis	Skewness
H	9.574231	0.632873
epoch	63.509785	-6.773483
epoch_mjd	63.509785	-6.773483
epoch_cal	63.507964	-6.778353
e	7.807368	1.900893
a	359883.099936	528.109344
q	264.636904	15.725018
i	22.503049	2.266021

	Kurtosis	Skewness
om	-1.108926	0.196737
w	-1.20069	-0.019961
ma	-1.233766	0.043606
ad	386475.66126	556.60062
n	63.992398	5.635289
tp	739.249481	-4.714999
tp_cal	731.590956	-4.676611
per	653535.966944	787.456975
per_y	653535.966944	787.456975
moid	265.866793	15.782915

	Mann_Whitney_U	P_value
sigma_e	6.639186e+08	1.602131e-08
sigma_a	5.426100e+08	2.738368e-16
sigma_q	5.122688e+08	2.445461e-31
sigma_i	7.887824e+08	4.939480e-88
sigma_om	5.296364e+08	4.292388e-22
sigma_w	4.691271e+08	1.254976e-61
sigma_ma	5.757771e+08	1.073567e-05
sigma_ad	5.842754e+08	5.977712e-04
sigma_n	7.636538e+08	5.404597e-65
sigma_tp	4.356460e+08	2.367745e-92
sigma_per	5.157174e+08	2.282571e-29
H	1.139616e+09	0.000000e+00
epoch	5.921343e+08	7.201099e-13
epoch_mjd	5.921343e+08	7.201099e-13
epoch_cal	5.921343e+08	7.201099e-13
e	1.181987e+09	0.000000e+00
a	1.279411e+08	0.000000e+00
q	7.679729e+06	0.000000e+00
i	7.396650e+08	2.573163e-46
om	6.273777e+08	1.380838e-01
w	6.154047e+08	9.064559e-01
ma	6.253275e+08	2.116118e-01
ad	4.880512e+08	4.712485e-47
n	1.100808e+09	0.000000e+00
tp	6.093095e+08	5.635194e-01
tp_cal	6.093095e+08	5.635194e-01
per	1.279411e+08	0.000000e+00
per_y	1.279411e+08	0.000000e+00
moid	7.639718e+06	0.000000e+00

	RandomForestR	ExtraTreeR	XGB
Columns 30	0.985079	0.817495	0.985622
Columns 31	0.984133	0.827134	0.985622
Columns 32	0.985054	0.815004	0.985622
Columns 33	0.983802	0.813049	0.985622
Columns 34	0.982249	0.812564	0.985622
Columns 35	0.983195	0.805361	0.985622
Columns 36	0.984121	0.826559	0.985622
Columns 37	0.984744	0.822762	0.985622
Columns 38	0.984436	0.822374	0.985622
Columns 39	0.982570	0.818971	0.985622
Columns 40	0.982558	0.819126	0.985622

	XGB
H	True
epoch	True
epoch_mjd	True
epoch_cal	True
e	True
a	True
q	True
i	True
om	True
w	True
ma	True
ad	True
n	True
tp	True
per	True
per_y	False
moid	True
sigma_e	True
sigma_a	True
sigma_q	True
sigma_i	True
sigma_om	True
sigma_w	True
sigma_ma	True
sigma_ad	True
sigma_n	True
sigma_tp	True
sigma_per	True
neo_Y	True
class_APO	True
class_AST	True
class_ATE	False
class_CEN	False
class_IEO	False
class_IMB	False
class_MBA	False
class_MCA	False
class_OMB	False
class_TJN	False
class_TNO	False

	RandomForestR	ExtraTreeR	XGB	MLP
count	10.000000	10.000000	10.000000	10.000000
mean	0.984440	0.827741	0.984397	0.640767
std	0.005672	0.020355	0.006104	0.097131
min	0.975460	0.797153	0.975155	0.485356
25%	0.981481	0.813441	0.981566	0.575673
50%	0.983255	0.823737	0.984894	0.647670
75%	0.987879	0.841924	0.987786	0.661327
max	0.993902	0.863014	0.993902	0.803797

	moid	moid_ld
moid	1.0	1.0
moid_ld	1.0	1.0

	tp	tp_cal
tp	1.000000	0.998251
tp_cal	0.998251	1.000000

	Chi_Squared	P_value
class	95044.341663	0.0
neo	65713.810916	0.0

	RandomForestR	ExtraTreeR	XGB	MLP
count	10.000000	10.000000	10.000000	10.000000
mean	0.983518	0.799723	0.985622	0.347974
std	0.006330	0.024197	0.006369	0.094171
min	0.975155	0.766423	0.975155	0.143646
25%	0.979344	0.789855	0.984567	0.310472
50%	0.981763	0.794224	0.984802	0.351030
75%	0.987860	0.813223	0.987133	0.398293
max	0.993902	0.836879	0.996960	0.478964

Asteroid Hazard Classification¶

Dictionary (Columns)¶

1) Importing Libraries and Loading the Dataset¶

2) Data Cleaning¶

3) Exploratory Data Analysis¶

3.1) Descriptive Statistics¶

3.2) Distributions and Correlations¶

3.2.1) Histograms, Kurtosis and Skewness¶

3.2.2) Boxplots¶

3.2.3) Correlations¶

3.2.3.1) Numerical Independent Variables¶

3.2.3.1) Categorical Independent Variables¶

3.3) Exploring the Variables¶

Eccentricity¶

NEO: Near-Earth Object (NEO) flag¶

Orbit class¶

Aphelium distance and Perihelium distance¶

Argument of Perihelium¶

Absolute Magnitude¶

Time of perihelium passage calibrated (tp_cal) and Time of perihelium passage (tp)¶

4) Preprocessing¶

4.1) Categorical Transformation¶

4.1.1) One hot Encoder¶

4.2) Feature Selection¶

4.1.1) Random Forest Importance¶

4.1.2) XBGoost Importance¶

4.1.3) Recursive Feature elimination¶

4.3) Numerical Transformation¶

4.3.1) Standard Scaler¶

4.3.2) Min Max Scaler¶

4.3.3) Robust Scaler¶

4.4) Selecting the best algorithm and Transformation¶

4.4.1) Using Standard Scaler¶

4.4.2) Using Min Max Scaler¶

4.4.3) Using Robust Scaler¶

4.5) SMOTENC - Synthetic Minority Over Sampling Technique for Nominal and Continous.¶

4.5.1) SMOTENC with Standard Scaler¶

4.5.2) SMOTENC with Min Max Scaler¶

4.5.3) SMOTENC with Robust Scaler¶

5) Fine Tuning¶

6) Results¶

6.1) Usefull functions¶

6.2) Predictions¶

6.2.1) Confusion Matrix¶

6.2.2) Classification metrics¶

6.2.3) ROC Curve and Area Under the Curve¶