chisquare

# Bibliothèques

```python
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import scipy
from scipy.stats import chisquare
```

# Data

```python
df = sns.load_dataset("tips")
df.head()
```

<div>
<style scoped>
    .dataframe tbody tr th:only-of-type {
        vertical-align: middle;
    }
    .dataframe tbody tr th {
        vertical-align: top;
    }
    .dataframe thead th {
        text-align: right;
    }
</style>
<table border="1" class="dataframe">
  <thead>
    <tr style="text-align: right;">
      <th></th>
      <th>total_bill</th>
      <th>tip</th>
      <th>sex</th>
      <th>smoker</th>
      <th>day</th>
      <th>time</th>
      <th>size</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <th>0</th>
      <td>16.99</td>
      <td>1.01</td>
      <td>Female</td>
      <td>No</td>
      <td>Sun</td>
      <td>Dinner</td>
      <td>2</td>
    </tr>
    <tr>
      <th>1</th>
      <td>10.34</td>
      <td>1.66</td>
      <td>Male</td>
      <td>No</td>
      <td>Sun</td>
      <td>Dinner</td>
      <td>3</td>
    </tr>
    <tr>
      <th>2</th>
      <td>21.01</td>
      <td>3.50</td>
      <td>Male</td>
      <td>No</td>
      <td>Sun</td>
      <td>Dinner</td>
      <td>3</td>
    </tr>
    <tr>
      <th>3</th>
      <td>23.68</td>
      <td>3.31</td>
      <td>Male</td>
      <td>No</td>
      <td>Sun</td>
      <td>Dinner</td>
      <td>2</td>
    </tr>
    <tr>
      <th>4</th>
      <td>24.59</td>
      <td>3.61</td>
      <td>Female</td>
      <td>No</td>
      <td>Sun</td>
      <td>Dinner</td>
      <td>4</td>
    </tr>
  </tbody>
</table>
</div>

# Test Chisquare

H0 : "les clients donnent les pourboires: 20% le jeudi, 10% le vendredi, 40% le samedi, et 30% le dimanche"

```python
# Theoretical proportions under H0, listed in the same day order as the
# observed counts below (Thur, Fri, Sat, Sun).
expected_proportions = np.array([0.2, 0.1, 0.4, 0.3])
```

```python
# Observed proportions per day (rounded for display only).
df["day"].value_counts(normalize=True, sort=False).round(2)
```

    day
    Thur    0.25
    Fri     0.08
    Sat     0.36
    Sun     0.31
    Name: proportion, dtype: float64

```python
# Observed counts per day. sort=False keeps the order shown above instead of
# sorting by frequency, so positions line up with the expected values.
observed_frequencies = df["day"].value_counts(normalize=False, sort=False).values
observed_frequencies
```

    array([62, 19, 87, 76], dtype=int64)

```python
# Expected counts per day: proportions scaled by the sample size.
# The proportions live in their own variable so that re-running this cell
# does not multiply the values by len(df) a second time.
expected_frequencies = expected_proportions * len(df)
expected_frequencies
```

    array([48.8,
24.4, 97.6, 73.2])

```python
chisquare(f_obs=observed_frequencies, f_exp=expected_frequencies)
```

    Power_divergenceResult(statistic=6.023907103825136, pvalue=0.11045286802428235)

# En résumé

```python
# State the null hypothesis (plain string: no placeholders, quotes balanced).
print('H0 :"les clients donnent les pourboires: 20% le jeudi, 10% le vendredi, 40% le samedi, et 30% le dimanche"')
print()

# Expected counts = proportions under H0 scaled by the sample size; the
# proportions are kept separately so the scaling is never applied twice.
expected_proportions = np.array([0.2, 0.1, 0.4, 0.3])
expected_frequencies = expected_proportions * len(df)

# Observed counts per day, unsorted so they stay aligned with the expected values.
observed_frequencies = df["day"].value_counts(normalize=False, sort=False).values

# Significance level chosen for the test.
alpha = 0.02
p_value = chisquare(f_obs=observed_frequencies, f_exp=expected_frequencies).pvalue

# Reject H0 only when the p-value falls below the significance level.
if p_value < alpha:
    print("Nous avons suffisamment d'éléments pour rejeter H0")
else:
    print("Nous n'avons pas suffisamment d'éléments pour rejeter H0")
```

    H0 :"les clients donnent les pourboires: 20% le jeudi, 10% le vendredi, 40% le samedi, et 30% le dimanche"

    Nous n'avons pas suffisamment d'éléments pour rejeter H0