normalisation

# Bibliothèques ```python import numpy as np import matplotlib.pyplot as plt import pandas as pd import seaborn as sns from sklearn.preprocessing import MinMaxScaler, StandardScaler ``` # Load Diamonds ```python df = sns.load_dataset('diamonds') df.head() ``` <div> <style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </style> <table border="1" class="dataframe"> <thead> <tr style="text-align: right;"> <th></th> <th>carat</th> <th>cut</th> <th>color</th> <th>clarity</th> <th>depth</th> <th>table</th> <th>price</th> <th>x</th> <th>y</th> <th>z</th> </tr> </thead> <tbody> <tr> <th>0</th> <td>0.23</td> <td>Ideal</td> <td>E</td> <td>SI2</td> <td>61.5</td> <td>55.0</td> <td>326</td> <td>3.95</td> <td>3.98</td> <td>2.43</td> </tr> <tr> <th>1</th> <td>0.21</td> <td>Premium</td> <td>E</td> <td>SI1</td> <td>59.8</td> <td>61.0</td> <td>326</td> <td>3.89</td> <td>3.84</td> <td>2.31</td> </tr> <tr> <th>2</th> <td>0.23</td> <td>Good</td> <td>E</td> <td>VS1</td> <td>56.9</td> <td>65.0</td> <td>327</td> <td>4.05</td> <td>4.07</td> <td>2.31</td> </tr> <tr> <th>3</th> <td>0.29</td> <td>Premium</td> <td>I</td> <td>VS2</td> <td>62.4</td> <td>58.0</td> <td>334</td> <td>4.20</td> <td>4.23</td> <td>2.63</td> </tr> <tr> <th>4</th> <td>0.31</td> <td>Good</td> <td>J</td> <td>SI2</td> <td>63.3</td> <td>58.0</td> <td>335</td> <td>4.34</td> <td>4.35</td> <td>2.75</td> </tr> </tbody> </table> </div> ```python # Filtre le jeu de données pour exclure les catégories df_numeric = df.select_dtypes(exclude="category") df_numeric.describe() ``` <div> <style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </style> <table border="1" class="dataframe"> <thead> <tr style="text-align: right;"> <th></th> <th>carat</th> <th>depth</th> <th>table</th> <th>price</th> <th>x</th> 
<th>y</th> <th>z</th> </tr> </thead> <tbody> <tr> <th>count</th> <td>53940.000000</td> <td>53940.000000</td> <td>53940.000000</td> <td>53940.000000</td> <td>53940.000000</td> <td>53940.000000</td> <td>53940.000000</td> </tr> <tr> <th>mean</th> <td>0.797940</td> <td>61.749405</td> <td>57.457184</td> <td>3932.799722</td> <td>5.731157</td> <td>5.734526</td> <td>3.538734</td> </tr> <tr> <th>std</th> <td>0.474011</td> <td>1.432621</td> <td>2.234491</td> <td>3989.439738</td> <td>1.121761</td> <td>1.142135</td> <td>0.705699</td> </tr> <tr> <th>min</th> <td>0.200000</td> <td>43.000000</td> <td>43.000000</td> <td>326.000000</td> <td>0.000000</td> <td>0.000000</td> <td>0.000000</td> </tr> <tr> <th>25%</th> <td>0.400000</td> <td>61.000000</td> <td>56.000000</td> <td>950.000000</td> <td>4.710000</td> <td>4.720000</td> <td>2.910000</td> </tr> <tr> <th>50%</th> <td>0.700000</td> <td>61.800000</td> <td>57.000000</td> <td>2401.000000</td> <td>5.700000</td> <td>5.710000</td> <td>3.530000</td> </tr> <tr> <th>75%</th> <td>1.040000</td> <td>62.500000</td> <td>59.000000</td> <td>5324.250000</td> <td>6.540000</td> <td>6.540000</td> <td>4.040000</td> </tr> <tr> <th>max</th> <td>5.010000</td> <td>79.000000</td> <td>95.000000</td> <td>18823.000000</td> <td>10.740000</td> <td>58.900000</td> <td>31.800000</td> </tr> </tbody> </table> </div> # Normalisation MinMax - convient pour la plupart des distributions - à éviter en présence de valeurs aberrantes # Standardisation - donne de bons résultats pour les variables suivant une allure gaussienne ```python fig, ax = plt.subplots(2, 1) sns.histplot(data=df_numeric, x="carat", ax=ax[0]) sns.boxplot(data=df_numeric, x="carat", ax=ax[1]) plt.tight_layout() plt.show() ``` ![png](normalisation_7_0.png) présence d'outliers mais la distribution ne suit pas une gaussienne -> on va tester les deux options ```python fig, ax = plt.subplots(2, 1) sns.histplot(data=df_numeric, x="depth", ax=ax[0]) sns.boxplot(data=df_numeric, x="depth", ax=ax[1]) plt.tight_layout() 
plt.show() ``` ![png](normalisation_9_0.png) valeurs aberrantes et distribution gaussienne --> standardisation ```python fig, ax = plt.subplots(2, 1) sns.histplot(data=df_numeric, x="table", ax=ax[0]) sns.boxplot(data=df_numeric, x="table", ax=ax[1]) plt.tight_layout() plt.show() ``` ![png](normalisation_11_0.png) valeurs aberrantes + distribution gaussienne --> standardisation ```python fig, ax = plt.subplots(2, 1) sns.histplot(data=df_numeric, x="price", ax=ax[0]) sns.boxplot(data=df_numeric, x="price", ax=ax[1]) plt.tight_layout() plt.show() ``` ![png](normalisation_13_0.png) valeurs aberrantes mais distribution non-gaussienne (pas de grand vide dans la distribution --> plutôt MinMax mais ici il s'agit de la Target --> le besoin de normalisation dépend du modèle de ML) ```python fig, ax = plt.subplots(2, 1) sns.histplot(data=df_numeric, x="x", ax=ax[0]) sns.boxplot(data=df_numeric, x="x", ax=ax[1]) plt.tight_layout() plt.show() ``` ![png](normalisation_15_0.png) ```python fig, ax = plt.subplots(2, 1) sns.histplot(data=df_numeric, x="y", ax=ax[0]) sns.boxplot(data=df_numeric, x="y", ax=ax[1]) plt.tight_layout() plt.show() ``` ![png](normalisation_16_0.png) ```python fig, ax = plt.subplots(2, 1) sns.histplot(data=df_numeric, x="z", ax=ax[0]) sns.boxplot(data=df_numeric, x="z", ax=ax[1]) plt.tight_layout() plt.show() ``` ![png](normalisation_17_0.png) valeurs aberrantes mais distribution gaussienne pour les coordonnées x, y et z --> plutôt une standardisation. Dans la pratique on enlèvera les valeurs aberrantes les plus éloignées. 
# MinMaxScaler ```python scaler = MinMaxScaler() scaler.fit(df_numeric) df_minmax = pd.DataFrame(scaler.transform(df_numeric), columns=df_numeric.columns) ``` ```python df_minmax.describe() ``` <div> <style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </style> <table border="1" class="dataframe"> <thead> <tr style="text-align: right;"> <th></th> <th>carat</th> <th>depth</th> <th>table</th> <th>price</th> <th>x</th> <th>y</th> <th>z</th> </tr> </thead> <tbody> <tr> <th>count</th> <td>53940.000000</td> <td>53940.000000</td> <td>53940.000000</td> <td>53940.000000</td> <td>53940.000000</td> <td>53940.000000</td> <td>53940.000000</td> </tr> <tr> <th>mean</th> <td>0.124312</td> <td>0.520817</td> <td>0.278023</td> <td>0.194994</td> <td>0.533627</td> <td>0.097360</td> <td>0.111281</td> </tr> <tr> <th>std</th> <td>0.098547</td> <td>0.039795</td> <td>0.042971</td> <td>0.215680</td> <td>0.104447</td> <td>0.019391</td> <td>0.022192</td> </tr> <tr> <th>min</th> <td>0.000000</td> <td>0.000000</td> <td>0.000000</td> <td>0.000000</td> <td>0.000000</td> <td>0.000000</td> <td>0.000000</td> </tr> <tr> <th>25%</th> <td>0.041580</td> <td>0.500000</td> <td>0.250000</td> <td>0.033735</td> <td>0.438547</td> <td>0.080136</td> <td>0.091509</td> </tr> <tr> <th>50%</th> <td>0.103950</td> <td>0.522222</td> <td>0.269231</td> <td>0.112180</td> <td>0.530726</td> <td>0.096944</td> <td>0.111006</td> </tr> <tr> <th>75%</th> <td>0.174636</td> <td>0.541667</td> <td>0.307692</td> <td>0.270219</td> <td>0.608939</td> <td>0.111036</td> <td>0.127044</td> </tr> <tr> <th>max</th> <td>1.000000</td> <td>1.000000</td> <td>1.000000</td> <td>1.000000</td> <td>1.000000</td> <td>1.000000</td> <td>1.000000</td> </tr> </tbody> </table> </div> # Standardisation ```python scaler = StandardScaler() scaler.fit(df_numeric) df_standard = pd.DataFrame(scaler.transform(df_numeric), columns=df_numeric.columns) 
``` ```python df_standard.describe() ``` <div> <style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </style> <table border="1" class="dataframe"> <thead> <tr style="text-align: right;"> <th></th> <th>carat</th> <th>depth</th> <th>table</th> <th>price</th> <th>x</th> <th>y</th> <th>z</th> </tr> </thead> <tbody> <tr> <th>count</th> <td>5.394000e+04</td> <td>5.394000e+04</td> <td>5.394000e+04</td> <td>5.394000e+04</td> <td>5.394000e+04</td> <td>5.394000e+04</td> <td>5.394000e+04</td> </tr> <tr> <th>mean</th> <td>2.444878e-16</td> <td>-3.996902e-15</td> <td>9.695207e-17</td> <td>-9.273676e-17</td> <td>2.782103e-16</td> <td>-8.430615e-17</td> <td>-2.002271e-16</td> </tr> <tr> <th>std</th> <td>1.000009e+00</td> <td>1.000009e+00</td> <td>1.000009e+00</td> <td>1.000009e+00</td> <td>1.000009e+00</td> <td>1.000009e+00</td> <td>1.000009e+00</td> </tr> <tr> <th>min</th> <td>-1.261458e+00</td> <td>-1.308760e+01</td> <td>-6.470073e+00</td> <td>-9.040952e-01</td> <td>-5.109120e+00</td> <td>-5.020931e+00</td> <td>-5.014556e+00</td> </tr> <tr> <th>25%</th> <td>-8.395232e-01</td> <td>-5.231053e-01</td> <td>-6.521385e-01</td> <td>-7.476808e-01</td> <td>-9.103248e-01</td> <td>-8.882800e-01</td> <td>-8.909461e-01</td> </tr> <tr> <th>50%</th> <td>-2.066210e-01</td> <td>3.531678e-02</td> <td>-2.046051e-01</td> <td>-3.839672e-01</td> <td>-2.777553e-02</td> <td>-2.147398e-02</td> <td>-1.237618e-02</td> </tr> <tr> <th>75%</th> <td>5.106683e-01</td> <td>5.239361e-01</td> <td>6.904618e-01</td> <td>3.487866e-01</td> <td>7.210542e-01</td> <td>7.052421e-01</td> <td>7.103184e-01</td> </tr> <tr> <th>max</th> <td>8.886075e+00</td> <td>1.204139e+01</td> <td>1.680167e+01</td> <td>3.732438e+00</td> <td>4.465203e+00</td> <td>4.654965e+01</td> <td>4.004758e+01</td> </tr> </tbody> </table> </div> ## Variable "carat" ```python # comparaison des distributions avant et après la normalisation 
plt.figure(figsize=(15, 5)) sns.histplot(data=df_numeric, x="carat", color="tab:blue", alpha=0.5) sns.histplot(data=df_minmax, x="carat", color="tab:orange", alpha=0.5) sns.histplot(data=df_standard, x="carat", color="tab:green", alpha=0.5) plt.show() ``` ![png](normalisation_26_0.png) ```python # Avec des boxplots fig, ax = plt.subplots(3, 1, sharex=True) sns.boxplot(data=df_numeric, x="carat", color="tab:blue", ax=ax[0]) sns.boxplot(data=df_minmax, x="carat", color="tab:orange", ax=ax[1]) sns.boxplot(data=df_standard, x="carat", color="tab:green", ax=ax[2]) plt.show() ``` ![png](normalisation_27_0.png) ## Variable "depth" ```python variable = "depth" plt.figure(figsize=(15, 5)) sns.histplot(data=df_numeric, x=variable, color="tab:blue", alpha=0.5, label="originel") sns.histplot(data=df_minmax, x=variable, color="tab:orange", alpha=0.5, label="minmax") sns.histplot(data=df_standard, x=variable, color="tab:green", alpha=0.5, label="standard") plt.legend() plt.show() ``` ![png](normalisation_29_0.png) ```python fig, ax = plt.subplots(3, 1, sharex=True) sns.boxplot(data=df_numeric, x=variable, color="tab:blue", ax=ax[0]) sns.boxplot(data=df_minmax, x=variable, color="tab:orange", ax=ax[1]) sns.boxplot(data=df_standard, x=variable, color="tab:green", ax=ax[2]) plt.show() ``` ![png](normalisation_30_0.png) La standardisation semble ici le choix le plus judicieux ## Variable "table" ```python variable = "table" plt.figure(figsize=(15, 5)) sns.histplot(data=df_numeric, x=variable, color="tab:blue", alpha=0.5, label="originel") sns.histplot(data=df_minmax, x=variable, color="tab:orange", alpha=0.5, label="minmax") sns.histplot(data=df_standard, x=variable, color="tab:green", alpha=0.5, label="standard") plt.legend() plt.show() ``` ![png](normalisation_33_0.png) ```python fig, ax = plt.subplots(3, 1, sharex=True) sns.boxplot(data=df_numeric, x=variable, color="tab:blue", ax=ax[0]) sns.boxplot(data=df_minmax, x=variable, color="tab:orange", ax=ax[1]) 
sns.boxplot(data=df_standard, x=variable, color="tab:green", ax=ax[2]) plt.show() ``` ![png](normalisation_34_0.png) --> standardisation ## Variable "price" ```python variable = "price" plt.figure(figsize=(15, 5)) #sns.histplot(data=df_numeric, x=variable, color="tab:blue", alpha=0.5, label="originel") sns.histplot(data=df_minmax, x=variable, color="tab:orange", alpha=0.5, label="minmax") sns.histplot(data=df_standard, x=variable, color="tab:green", alpha=0.5, label="standard") plt.legend() plt.show() ``` ![png](normalisation_37_0.png) ```python fig, ax = plt.subplots(3, 1, sharex=True) sns.boxplot(data=df_numeric, x=variable, color="tab:blue", ax=ax[0]) sns.boxplot(data=df_minmax, x=variable, color="tab:orange", ax=ax[1]) sns.boxplot(data=df_standard, x=variable, color="tab:green", ax=ax[2]) plt.show() ``` ![png](normalisation_38_0.png) -> plutôt MinMax car la distribution standardisée est très étendue --> peut induire des problèmes d'échelle avec d'autres variables normalisées de 0 à 1. Si le prix est la Target, alors elle ne se normalise pas comme les Features.