# Librairies

```python
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import scipy
from scipy.stats import chi2_contingency

# Record the library versions this notebook was executed with.
print("Pandas version: {}".format(pd.__version__))
print("Numpy version: {}".format(np.__version__))
print("Seaborn version: {}".format(sns.__version__))
print("Scipy version: {}".format(scipy.__version__))
print("Matplotlib version: {}".format(matplotlib.__version__))
```

    Pandas version: 2.2.2
    Numpy version: 1.26.4
    Seaborn version: 0.13.2
    Scipy version: 1.13.1
    Matplotlib version: 3.9.2

# Data

```python
# Load seaborn's "tips" example dataset and preview its first rows.
df = sns.load_dataset("tips")
df.head()
```

<div>
<style scoped>
.dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th { vertical-align: top; }
.dataframe thead th { text-align: right; }
</style>
<table border="1" class="dataframe">
<thead>
<tr style="text-align: right;"> <th></th> <th>total_bill</th> <th>tip</th> <th>sex</th> <th>smoker</th> <th>day</th> <th>time</th> <th>size</th> </tr>
</thead>
<tbody>
<tr> <th>0</th> <td>16.99</td> <td>1.01</td> <td>Female</td> <td>No</td> <td>Sun</td> <td>Dinner</td> <td>2</td> </tr>
<tr> <th>1</th> <td>10.34</td> <td>1.66</td> <td>Male</td> <td>No</td> <td>Sun</td> <td>Dinner</td> <td>3</td> </tr>
<tr> <th>2</th> <td>21.01</td> <td>3.50</td> <td>Male</td> <td>No</td> <td>Sun</td> <td>Dinner</td> <td>3</td> </tr>
<tr> <th>3</th> <td>23.68</td> <td>3.31</td> <td>Male</td> <td>No</td> <td>Sun</td> <td>Dinner</td> <td>2</td> </tr>
<tr> <th>4</th> <td>24.59</td> <td>3.61</td> <td>Female</td> <td>No</td> <td>Sun</td> <td>Dinner</td> <td>4</td> </tr>
</tbody>
</table>
</div>

# Chi2 Contingency

H0 : "Il n'y a pas de relation entre le nombre de personnes (size) et le jour de la semaine (day)"

```python
# Observed counts for every (size, day) pair: one row per party size,
# one column per day. This table is the input of the chi-square test.
contigency_table = pd.crosstab(df["size"], df["day"])
contigency_table
```

<div>
<style scoped>
.dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th { vertical-align: top; }
.dataframe
thead th { text-align: right; }
</style>
<table border="1" class="dataframe">
<thead>
<tr style="text-align: right;"> <th>day</th> <th>Thur</th> <th>Fri</th> <th>Sat</th> <th>Sun</th> </tr>
<tr> <th>size</th> <th></th> <th></th> <th></th> <th></th> </tr>
</thead>
<tbody>
<tr> <th>1</th> <td>1</td> <td>1</td> <td>2</td> <td>0</td> </tr>
<tr> <th>2</th> <td>48</td> <td>16</td> <td>53</td> <td>39</td> </tr>
<tr> <th>3</th> <td>4</td> <td>1</td> <td>18</td> <td>15</td> </tr>
<tr> <th>4</th> <td>5</td> <td>1</td> <td>13</td> <td>18</td> </tr>
<tr> <th>5</th> <td>1</td> <td>0</td> <td>1</td> <td>3</td> </tr>
<tr> <th>6</th> <td>3</td> <td>0</td> <td>0</td> <td>1</td> </tr>
</tbody>
</table>
</div>

```python
# Run the chi-square test of independence once and keep the result so the
# next cell can reuse it instead of recomputing the test.
chi2_result = chi2_contingency(contigency_table)
chi2_result
```

    Chi2ContingencyResult(statistic=29.632849936919712, pvalue=0.013316478351860587, dof=15, expected_freq=array([[ 1.01639344,  0.31147541,  1.42622951,  1.24590164],
           [39.63934426, 12.14754098, 55.62295082, 48.59016393],
           [ 9.6557377 ,  2.95901639, 13.54918033, 11.83606557],
           [ 9.40163934,  2.88114754, 13.19262295, 11.52459016],
           [ 1.2704918 ,  0.38934426,  1.78278689,  1.55737705],
           [ 1.01639344,  0.31147541,  1.42622951,  1.24590164]]))

```python
# expected_freq has the same shape as the observed table, so label it with
# the table's own rows/columns rather than re-deriving them from df (the two
# can disagree, e.g. when a key has missing values that crosstab drops).
# Plain lists are used so the displayed table carries no axis names.
pd.DataFrame(
    chi2_result.expected_freq,
    index=contigency_table.index.tolist(),
    columns=contigency_table.columns.tolist(),
).round()
```

<div>
<style scoped>
.dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th { vertical-align: top; }
.dataframe thead th { text-align: right; }
</style>
<table border="1" class="dataframe">
<thead>
<tr style="text-align: right;"> <th></th> <th>Thur</th> <th>Fri</th> <th>Sat</th> <th>Sun</th> </tr>
</thead>
<tbody>
<tr> <th>1</th> <td>1.0</td> <td>0.0</td> <td>1.0</td> <td>1.0</td> </tr>
<tr> <th>2</th> <td>40.0</td> <td>12.0</td> <td>56.0</td> <td>49.0</td> </tr>
<tr> <th>3</th> <td>10.0</td> <td>3.0</td> <td>14.0</td> <td>12.0</td> </tr>
<tr> <th>4</th> <td>9.0</td> <td>3.0</td> <td>13.0</td> <td>12.0</td> </tr>
<tr> <th>5</th> <td>1.0</td> <td>0.0</td> <td>2.0</td> <td>2.0</td> </tr>
<tr> <th>6</th> <td>1.0</td> <td>0.0</td> <td>1.0</td> <td>1.0</td> </tr>
</tbody>
</table>
</div>

```python
# expected_freq has the same shape as the observed table, so label it with
# the table's own index/columns; the subtraction below then lines up
# cell by cell without any re-alignment.
expected = pd.DataFrame(
    chi2_contingency(contigency_table).expected_freq,
    index=contigency_table.index,
    columns=contigency_table.columns,
)

# Orientation is expected - observed: a negative cell means more
# observations than independence predicts. The expected counts are NOT
# rounded first, since rounding (e.g. 0.31 -> 0) distorts the small cells.
diff = expected - contigency_table

# center=0 pins the neutral colour of the diverging palette on "no
# difference"; fmt keeps the annotations readable now that they are floats.
sns.heatmap(diff, annot=True, fmt=".1f", cmap="coolwarm", center=0)
plt.show()
```

![png](Chi2%20Contigency_9_0.png)

# En résumé

```python
print("H0 :\"Il n'y a pas de relation entre le nombre de personnes (size) et le jour de la semaine (day)\"")
print()

# Significance level used for the decision.
alpha = 0.02
p_value = chi2_contingency(contigency_table).pvalue

# NOTE(review): more than half of the expected frequencies shown earlier are
# below 5 (the whole Fri column except size 2, and sizes 1, 5 and 6), so the
# chi-square approximation behind this p-value may be unreliable; consider
# merging sparse categories before drawing a firm conclusion.
if p_value < alpha:
    print("Nous avons suffisamment d'éléments pour rejeter H0")
else:
    print("Nous n'avons pas suffisamment d'éléments pour rejeter H0")
```

    H0 :"Il n'y a pas de relation entre le nombre de personnes (size) et le jour de la semaine (day)"

    Nous avons suffisamment d'éléments pour rejeter H0