Hypothesis testing is a data analysis method used to test one hypothesis (called the null hypothesis, $H_0$) against another (the alternative hypothesis, $H_1$).
We will use the random sample $X_1,\ldots, X_n$ (the data) to decide between the hypotheses $H_0$ and $H_1$ at a fixed level of significance $\alpha$, which is the probability of rejecting $H_0$ when $H_0$ is in fact true:
$$\alpha = \mathbb{P}\left(H_0\mbox{ rejected } \mid H_0 \mbox{ true}\right)$$
Any hypothesis testing procedure should provide the following:
The test statistic $T$: a random variable computed from the random sample, whose probability distribution is known when $H_0$ is true.
The observed statistic $t_\mbox{obs}$ of $T$, computed from the observed sample $x_1,\ldots,x_n$.
There are two types of Hypothesis testing procedures: parametric and non-parametric testing.
Parametric hypothesis testing is a testing procedure used when the hypothesis is based on comparing a population parameter to given values.
Non-parametric hypothesis testing is used when the hypothesis is not based on a population parameter. It's about testing one assumption against its opposite.
Parametric tests are generally more powerful and reliable than non-parametric tests when their distributional assumptions hold.
The hypothesis is stated in terms of the parameters of the population distribution.
We will see in this chapter how to perform Hypothesis testing in the following cases:
We would like to test the null hypothesis $$ H_0: \; \mu=\mu_0$$ versus $$ H_1: \; \mu\not=\mu_0$$ where $\mu_0$ is a given value, using a random sample $X_1,\ldots,X_n$ assumed to be generated from a Normal distribution with mean $\mu$ and unknown variance $\sigma^2$.
The test statistic of the test is $$T=\sqrt{n}\,\displaystyle\frac{\overline{X}-\mu}{S}$$ where $\overline{X}$ is the sample mean and $S$ is the sample standard deviation.
Under $H_0$, $T$ is equal to $$T=\sqrt{n}\,\displaystyle\frac{\overline{X}-\mu_0}{S}$$ and follows a $t$-distribution with $n-1$ degrees of freedom.
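As a minimal sketch of the rejection rule (the values $n=15$ and $\alpha=0.05$ are illustrative assumptions), the two-sided test rejects $H_0$ when $|t_\mbox{obs}|$ exceeds the $(1-\alpha/2)$ quantile of the $t$-distribution with $n-1$ degrees of freedom:

```python
from scipy.stats import t

# Two-sided rejection rule: reject H0 when |t_obs| > t_{1-alpha/2, n-1}.
# Assumed values for illustration: n = 15 observations, alpha = 0.05.
n, alpha = 15, 0.05
t_crit = t.ppf(1 - alpha / 2, df=n - 1)  # critical value of the t-distribution
print(round(t_crit, 3))  # 2.145
```

Equivalently, $H_0$ is rejected whenever the p-value of the test is below $\alpha$.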
Let's generate a random sample of size 15 from a Normal distribution with mean $\mu=-2$ and standard deviation $\sigma=2$ (hence variance $\sigma^2=4$).
import numpy as np
np.random.seed(7654567)
x=np.random.normal(size=15,loc=-2,scale=2)
x
array([-3.89395351, -2.55250403, -2.39746865, 0.98703995, -2.8607237 , -1.39481847, -0.74237156, -4.63886095, -3.81258843, -2.89188048, -1.50048335, -3.31429764, -3.28882943, -0.96256977, -2.0684448 ])
We assume that the random sample x
is generated from a Normal probability distribution with unknown mean and variance. We now test the following hypothesis
$$ H_0: \; \mu=-2$$versus $$ H_1: \; \mu\not=-2$$
from scipy.stats import ttest_1samp
test_mean=ttest_1samp(x, -2, alternative='two-sided')
The output above shows that $t_\mbox{obs}$ is
test_mean.statistic
-0.948081719359044
and the pvalue is
test_mean.pvalue
0.3591669622802579
Since this p-value is greater than 0.05 (5%), we fail to reject the null hypothesis $H_0$.
How are the test statistic and the p-value computed?
m=np.mean(x)
m
-2.3555169880044056
std_error=np.std(x)/np.sqrt(len(x)-1)  # equivalent to np.std(x, ddof=1)/np.sqrt(len(x))
std_error
0.3749855953817513
Under $ H_0: \; \mu=-2$, the test statistics is then
(m+2)/std_error
-0.948081719359044
test_mean.statistic
-0.948081719359044
The p-value is then computed as follows (since the observed statistic is negative, the two-sided p-value $2\,\mathbb{P}(|T|\geq |t_\mbox{obs}|)$ reduces to twice the CDF evaluated at $t_\mbox{obs}$):
from scipy import stats
X = stats.t(len(x)-1)
2*X.cdf(test_mean.statistic)
0.3591669622802579
And the p-value is
test_mean.pvalue
0.3591669622802579
We can also perform the following hypothesis testing. It's called the lower-tail alternative test:
$$ H_0: \; \mu=-2$$versus $$ H_1: \; \mu<-2$$
test_mean1=ttest_1samp(x, -2, alternative='less')
test_mean1.statistic
-0.948081719359044
test_mean1.pvalue
0.17958348114012895
It's computed as follows
X.cdf(test_mean.statistic)
0.17958348114012895
We can also perform the following hypothesis testing. It's called the upper-tail alternative test:
$$ H_0: \; \mu=-2$$versus $$ H_1: \; \mu>-2$$
test_mean2=ttest_1samp(x, -2, alternative='greater')
test_mean2.statistic
-0.948081719359044
test_mean2.pvalue
0.8204165188598711
The p-value is computed as follows
1-X.cdf(test_mean.statistic)
0.8204165188598711
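The three alternatives are related: for the same data, the 'less' and 'greater' p-values sum to one, and the two-sided p-value is twice the smaller of the two. A quick check on a simulated sample (the seed and sample size are illustrative assumptions):

```python
import numpy as np
from scipy.stats import ttest_1samp

# Relations between the three alternatives of the one-sample t-test:
# p_less + p_greater = 1 and p_two_sided = 2 * min(p_less, p_greater).
rng = np.random.default_rng(3)           # assumed seed, for illustration
x = rng.normal(loc=-2, scale=2, size=15)
p_two = ttest_1samp(x, -2, alternative='two-sided').pvalue
p_less = ttest_1samp(x, -2, alternative='less').pvalue
p_greater = ttest_1samp(x, -2, alternative='greater').pvalue
print(np.isclose(p_less + p_greater, 1.0))            # True
print(np.isclose(p_two, 2 * min(p_less, p_greater)))  # True
```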
We have two types of hypothesis testing comparing two means:
Paired (dependent) samples, where the two measurements are taken on the same subjects. Example: compare the blood pressure of a group of patients before and after a drug treatment.
Independent samples, where the two measurements come from unrelated groups. Example: compare the salaries of a sample of men and a sample of women employees.
We are going to test the hypothesis that the two means are equal. Example: we compare the grades between Quiz 1 and Quiz 2.
import pandas as pd
df=pd.read_csv('student_grades.csv')
df
Student | Section | Quiz 1 | Quiz 2 | Midterm 1 | |
---|---|---|---|---|---|
0 | 1 | Section 1 | 17.5 | 13.4575 | 22.5 |
1 | 2 | Section 1 | 16.5 | 14.2125 | 25.0 |
2 | 3 | Section 2 | 12.5 | 14.6150 | 23.5 |
3 | 4 | Section 2 | 10.5 | 15.2100 | 25.0 |
4 | 5 | Section 2 | 5.5 | 12.2225 | 24.5 |
5 | 6 | Section 1 | 14.0 | 14.5875 | 23.5 |
6 | 7 | Section 2 | 11.0 | NaN | 20.5 |
7 | 8 | Section 2 | 15.0 | 18.7500 | 25.0 |
8 | 9 | Section 2 | 17.0 | 16.5000 | 25.0 |
9 | 10 | Section 1 | 14.0 | 19.2500 | 19.5 |
10 | 11 | Section 2 | 11.0 | 10.9625 | 24.5 |
11 | 12 | Section 1 | 10.5 | 7.3650 | 19.5 |
12 | 13 | Section 1 | 16.5 | 18.2450 | 25.0 |
13 | 14 | Section 1 | 14.5 | 18.8375 | 21.5 |
14 | 15 | Section 1 | 14.5 | 11.1100 | 25.0 |
15 | 16 | Section 1 | 8.0 | 8.7475 | 24.0 |
16 | 17 | Section 1 | 14.5 | 15.9125 | 21.0 |
17 | 18 | Section 2 | 14.0 | 8.8975 | 16.5 |
18 | 19 | Section 1 | 11.0 | 4.1425 | 21.0 |
19 | 20 | Section 2 | 17.0 | 14.9500 | 25.0 |
20 | 21 | Section 1 | 18.5 | 13.6250 | 24.0 |
21 | 22 | Section 2 | 14.0 | 13.5025 | 23.0 |
22 | 23 | Section 1 | 10.5 | 17.7025 | 21.5 |
23 | 24 | Section 1 | 10.0 | 15.5025 | 24.0 |
24 | 25 | Section 2 | 11.0 | 12.8750 | 20.0 |
25 | 26 | Section 2 | 15.0 | 12.2075 | 25.0 |
26 | 27 | Section 2 | 16.0 | 19.0650 | 24.5 |
27 | 28 | Section 2 | 19.5 | 20.0000 | 24.5 |
28 | 29 | Section 1 | 15.0 | 11.9200 | 24.0 |
29 | 30 | Section 1 | 18.0 | 14.4950 | 25.0 |
30 | 31 | Section 2 | 14.5 | 16.5650 | 25.0 |
31 | 32 | Section 2 | 18.5 | 18.8250 | 24.0 |
32 | 33 | Section 1 | 14.0 | 12.1900 | 25.0 |
33 | 34 | Section 2 | 11.5 | 10.4950 | 23.5 |
34 | 35 | Section 2 | 10.0 | 10.5725 | 24.0 |
35 | 36 | Section 2 | 20.0 | 18.5000 | 25.0 |
36 | 37 | Section 1 | 11.0 | 14.9400 | 25.0 |
37 | 38 | Section 1 | 15.0 | 14.7175 | 23.5 |
38 | 39 | Section 2 | 18.5 | 19.5050 | 23.5 |
39 | 40 | Section 1 | NaN | 20.0000 | 25.0 |
40 | 41 | Section 2 | 9.5 | 13.4625 | 20.0 |
41 | 42 | Section 1 | 14.0 | 11.8775 | 20.5 |
42 | 43 | Section 2 | 3.0 | NaN | 13.0 |
43 | 44 | Section 2 | 13.0 | 20.0000 | 25.0 |
44 | 45 | Section 1 | 16.0 | 18.5700 | 25.0 |
45 | 46 | Section 1 | 10.5 | 12.8325 | 23.0 |
46 | 47 | Section 1 | 16.5 | 15.4725 | 21.0 |
47 | 48 | Section 2 | 10.5 | 18.2550 | 23.5 |
48 | 49 | Section 1 | 17.5 | 17.8750 | 25.0 |
49 | 50 | Section 2 | 13.5 | 9.8100 | 23.5 |
50 | 51 | Section 1 | 14.5 | 19.5050 | 23.5 |
51 | 52 | Section 1 | 16.5 | 11.7875 | 21.5 |
We remove the missing values from the data
df=df.dropna()
df
Student | Section | Quiz 1 | Quiz 2 | Midterm 1 | |
---|---|---|---|---|---|
0 | 1 | Section 1 | 17.5 | 13.4575 | 22.5 |
1 | 2 | Section 1 | 16.5 | 14.2125 | 25.0 |
2 | 3 | Section 2 | 12.5 | 14.6150 | 23.5 |
3 | 4 | Section 2 | 10.5 | 15.2100 | 25.0 |
4 | 5 | Section 2 | 5.5 | 12.2225 | 24.5 |
5 | 6 | Section 1 | 14.0 | 14.5875 | 23.5 |
7 | 8 | Section 2 | 15.0 | 18.7500 | 25.0 |
8 | 9 | Section 2 | 17.0 | 16.5000 | 25.0 |
9 | 10 | Section 1 | 14.0 | 19.2500 | 19.5 |
10 | 11 | Section 2 | 11.0 | 10.9625 | 24.5 |
11 | 12 | Section 1 | 10.5 | 7.3650 | 19.5 |
12 | 13 | Section 1 | 16.5 | 18.2450 | 25.0 |
13 | 14 | Section 1 | 14.5 | 18.8375 | 21.5 |
14 | 15 | Section 1 | 14.5 | 11.1100 | 25.0 |
15 | 16 | Section 1 | 8.0 | 8.7475 | 24.0 |
16 | 17 | Section 1 | 14.5 | 15.9125 | 21.0 |
17 | 18 | Section 2 | 14.0 | 8.8975 | 16.5 |
18 | 19 | Section 1 | 11.0 | 4.1425 | 21.0 |
19 | 20 | Section 2 | 17.0 | 14.9500 | 25.0 |
20 | 21 | Section 1 | 18.5 | 13.6250 | 24.0 |
21 | 22 | Section 2 | 14.0 | 13.5025 | 23.0 |
22 | 23 | Section 1 | 10.5 | 17.7025 | 21.5 |
23 | 24 | Section 1 | 10.0 | 15.5025 | 24.0 |
24 | 25 | Section 2 | 11.0 | 12.8750 | 20.0 |
25 | 26 | Section 2 | 15.0 | 12.2075 | 25.0 |
26 | 27 | Section 2 | 16.0 | 19.0650 | 24.5 |
27 | 28 | Section 2 | 19.5 | 20.0000 | 24.5 |
28 | 29 | Section 1 | 15.0 | 11.9200 | 24.0 |
29 | 30 | Section 1 | 18.0 | 14.4950 | 25.0 |
30 | 31 | Section 2 | 14.5 | 16.5650 | 25.0 |
31 | 32 | Section 2 | 18.5 | 18.8250 | 24.0 |
32 | 33 | Section 1 | 14.0 | 12.1900 | 25.0 |
33 | 34 | Section 2 | 11.5 | 10.4950 | 23.5 |
34 | 35 | Section 2 | 10.0 | 10.5725 | 24.0 |
35 | 36 | Section 2 | 20.0 | 18.5000 | 25.0 |
36 | 37 | Section 1 | 11.0 | 14.9400 | 25.0 |
37 | 38 | Section 1 | 15.0 | 14.7175 | 23.5 |
38 | 39 | Section 2 | 18.5 | 19.5050 | 23.5 |
40 | 41 | Section 2 | 9.5 | 13.4625 | 20.0 |
41 | 42 | Section 1 | 14.0 | 11.8775 | 20.5 |
43 | 44 | Section 2 | 13.0 | 20.0000 | 25.0 |
44 | 45 | Section 1 | 16.0 | 18.5700 | 25.0 |
45 | 46 | Section 1 | 10.5 | 12.8325 | 23.0 |
46 | 47 | Section 1 | 16.5 | 15.4725 | 21.0 |
47 | 48 | Section 2 | 10.5 | 18.2550 | 23.5 |
48 | 49 | Section 1 | 17.5 | 17.8750 | 25.0 |
49 | 50 | Section 2 | 13.5 | 9.8100 | 23.5 |
50 | 51 | Section 1 | 14.5 | 19.5050 | 23.5 |
51 | 52 | Section 1 | 16.5 | 11.7875 | 21.5 |
from scipy.stats import ttest_rel
test_diffmean1=ttest_rel(df['Quiz 1'],df['Quiz 2'])
test_diffmean1.statistic
-1.1156139485204906
test_diffmean1.pvalue
0.2701415971643917
We have tested here the following hypothesis:
$$ H_0:\,\mbox{ the averages of the grades in Q1 and Q2 are equal}$$versus $$ H_1:\,\mbox{ the averages of the grades in Q1 and Q2 are different}$$
We used a paired t-test, and since the p-value is greater than 0.05 (5%, the given level of significance), we conclude that $H_0$ can't be rejected.
We can also test whether the mean $\mu_1$ of the Quiz 1 grades is lower than the mean $\mu_2$ of the Quiz 2 grades:
$$ H_0: \, \mu_1\geq \mu_2$$versus $$ H_1:\, \mu_1<\mu_2 $$
test_diffmean1a=ttest_rel(df['Quiz 1'],df['Quiz 2'],alternative='less')
test_diffmean1a.statistic
-1.1156139485204906
test_diffmean1a.pvalue
0.13507079858219584
Conclusion: $H_0$ can't be rejected, since the p-value is greater than 0.05.
test_diffmean1b=ttest_rel(df['Quiz 1'],df['Quiz 2'],alternative='greater')
test_diffmean1b.pvalue
0.8649292014178042
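A paired t-test is equivalent to a one-sample t-test applied to the differences; a small sketch on simulated data (not the grades file) illustrates this:

```python
import numpy as np
from scipy.stats import ttest_rel, ttest_1samp

# The paired t-test on (before, after) equals the one-sample t-test
# of H0: mean difference = 0. Data below are simulated for illustration.
rng = np.random.default_rng(1)
before = rng.normal(12, 3, size=20)
after = before + rng.normal(0.5, 1, size=20)
paired = ttest_rel(before, after)
one_sample = ttest_1samp(before - after, 0)
print(np.isclose(paired.statistic, one_sample.statistic))  # True
print(np.isclose(paired.pvalue, one_sample.pvalue))        # True
```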
We will now compare the mean of Quiz 1 between Section 1 and Section 2 using an independent two-sample t-test.
x1=df['Quiz 1'].loc[df['Section ']=='Section 1']
x1
0 17.5 1 16.5 5 14.0 9 14.0 11 10.5 12 16.5 13 14.5 14 14.5 15 8.0 16 14.5 18 11.0 20 18.5 22 10.5 23 10.0 28 15.0 29 18.0 32 14.0 36 11.0 37 15.0 41 14.0 44 16.0 45 10.5 46 16.5 48 17.5 50 14.5 51 16.5 Name: Quiz 1, dtype: float64
x2=df['Quiz 1'].loc[df['Section ']=='Section 2']
x2
2 12.5 3 10.5 4 5.5 7 15.0 8 17.0 10 11.0 17 14.0 19 17.0 21 14.0 24 11.0 25 15.0 26 16.0 27 19.5 30 14.5 31 18.5 33 11.5 34 10.0 35 20.0 38 18.5 40 9.5 43 13.0 47 10.5 49 13.5 Name: Quiz 1, dtype: float64
from scipy.stats import ttest_ind
test_diffmean2=ttest_ind(x1,x2)
test_diffmean2.statistic
0.4200033340360798
test_diffmean2.pvalue
0.6763969243532744
Since the p-value is greater than 0.05, we can't reject the hypothesis that both sections have the same mean grade in Quiz 1.
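One caveat: by default ttest_ind assumes that both groups have equal variances; when that assumption is doubtful, Welch's t-test (equal_var=False) is the common alternative. A sketch on simulated groups (the seed and group sizes are illustrative assumptions):

```python
import numpy as np
from scipy.stats import ttest_ind

# ttest_ind pools the variances by default; equal_var=False gives
# Welch's t-test, which does not assume equal variances.
rng = np.random.default_rng(2)               # assumed seed, for illustration
g1 = rng.normal(14, 3, size=26)
g2 = rng.normal(14, 3, size=23)
pooled = ttest_ind(g1, g2)                   # pooled-variance t-test
welch = ttest_ind(g1, g2, equal_var=False)   # Welch's t-test
print(round(pooled.pvalue, 3), round(welch.pvalue, 3))
```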
Assume that we would like to test the following Hypothesis: $$H_0\,: \,\,p=p_0,\;\; \mbox{ versus }H_1\,:\,\, p\not=p_0$$
where $p$ is a population proportion and $p_0$ is a given value of $p$. The null hypothesis $H_0$ and the alternative $H_1$ will be tested from given data on a random sample $X_1,\ldots, X_n$ with a Bernoulli distribution. The random variables $X_1,\ldots, X_n$ are binary variables with values $1$ and $0$, where each random variable $X_i$ takes the value 1 with probability $p$.
In most cases the data reported in this type of test is the number $X$ of successes among the $n$ trials. This latter random variable $X$ is defined as follows: $$ X=X_1+\ldots+X_n=\sum_{k=1}^n X_k$$ and has a Binomial distribution with size $n$ and probability $p$.
The probability $p$ is often estimated using the random variable $\widehat{p}$:
$$\widehat{p}=\displaystyle\frac{X}{n}.$$ When $n$ is large ($n\geq 30$), the random variable $\widehat{p}$ approximately follows a Normal distribution:
$$\widehat{p}\sim \mathcal{N}\left(p, \displaystyle\frac{p(1-p)}{n}\right).$$Hence the random variable $Z=\sqrt{n}\displaystyle\frac{\widehat{p}-p}{\sqrt{\widehat{p}(1-\widehat{p})}}$ follows approximately a standard normal distribution.
The random variable $Z$ will be the test statistic of the proportion hypothesis test.
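The normal approximation can be checked by simulation; a sketch with assumed values $n=200$ and $p=0.45$:

```python
import numpy as np

# Simulate the Z statistic under H0 many times and check that its
# mean and standard deviation are close to 0 and 1 (standard normal).
rng = np.random.default_rng(0)   # assumed seed, for illustration
n, p = 200, 0.45                 # assumed values, for illustration
phat = rng.binomial(n, p, size=100_000) / n
z = np.sqrt(n) * (phat - p) / np.sqrt(phat * (1 - phat))
print(round(z.mean(), 2), round(z.std(), 2))
```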
Example: According to the Washington Post, nearly 45% of all Americans are born with brown eyes, although their eyes don't necessarily stay brown. A random sample of 80 adults found 32 with brown eyes. Is there sufficient evidence at the .01 level to indicate that the proportion of brown-eyed adults differs from the proportion of Americans who are born with brown eyes?
In this example the sample size is $n=80$; the random sample is $X_1,\ldots,X_n$, where each $X_i$ represents an American adult: $X_i$ is $1$ if this adult has brown eyes and $0$ otherwise. The random variable $X$ is the number of Americans with brown eyes among the 80 surveyed adults. The observed value of $X$ is 32.
We are testing here the following two hypotheses: $$H_0\,:\,\, p=.45\quad \mbox{ versus }\quad H_1\,:\,\, p\not=.45$$
Under the hypothesis $H_0$ ($p=.45$), the random variable $Z$ is equal to: $$Z=\sqrt{80}\displaystyle\frac{\widehat{p}-.45}{\sqrt{.4\times .6}}$$
The observed value of $\widehat{p}$ from the data is: $$\widehat{p}_{\mbox{obs}}=\displaystyle\frac{32}{80}=.4$$
Then the observed value of $Z$ is $$Z_{\mbox{obs}}=\sqrt{80}\displaystyle\frac{.4-.45}{\sqrt{.4\times .6}}=-0.913$$
Since the alternative hypothesis is $H_1\,:\, p\not=.45$, the test is called a two-sided test and its p-value is computed as follows:
$$\begin{array}{rcl} \mbox{p-value}& = & \mathbb{P}\left(|Z|\geq |z_\mbox{obs}| \mid H_0 \mbox{ true }\right)\\ & = & \mathbb{P}\left(|Z|\geq 0.913 \mid p=.45\right) \\ & = & 2\times (1-F_Z(0.913)) \\ & = & 0.361 \end{array}$$where $F_Z$ is the cumulative distribution function (CDF) of the standard normal distribution.
Since the level of significance of the test is $1\%=0.01$ and the p-value is greater than $1\%$, we do not reject the null hypothesis $H_0$.
import numpy as np
from statsmodels.stats.proportion import proportions_ztest
stat, pvalue=proportions_ztest(count=32, nobs=80, value=.45, alternative='two-sided')
stat
-0.9128709291752767
pvalue
0.3613104285261789
phat=32/80
phat
0.4
Zscore=np.sqrt(80)*(phat-.45)/np.sqrt(.4*.6)
Zscore
-0.9128709291752768
from scipy.stats import norm,t
2*(1-norm.cdf(np.abs(Zscore),loc=0,scale=1))
0.361310428526179
The alternative hypothesis $H_1$ can also be one of the following:
alternative='smaller'
alternative='larger'
In the case of alternative='smaller'
we proceed as follows:
stat, pvalue=proportions_ztest(count=32, nobs=80, value=.45, alternative='smaller')
stat
-0.9128709291752767
pvalue
0.18065521426308945
The pvalue is computed as follows
norm.cdf(stat,0,1)
0.18065521426308945
In the case of alternative='larger'
we proceed as follows:
stat, pvalue=proportions_ztest(count=32, nobs=80, value=.45, alternative='larger')
stat
-0.9128709291752767
pvalue
0.8193447857369105
The pvalue is computed as follows
1-norm.cdf(stat,0,1)
0.8193447857369105
Example: Pizza-Hut claims that 90% of its orders are delivered within 10 minutes of the time the order is placed. A sample of 100 orders revealed that 82 were delivered within the promised time. At the 10% significance level, can we conclude that less than 90% of the orders are delivered within the promised time?
We are testing in this example the following hypotheses: $$H_0\,:\,\, p=.9\quad \mbox{ versus }\quad H_1\,:\,\, p<.9$$
Here the sample size is $n=100$ and the observed value of $X$ is 82.
stat, pvalue=proportions_ztest(count=82,nobs=100,value=.9,alternative='smaller')
stat
-2.0823168251814157
pvalue
0.018656770036782476
The pvalue is computed as follows:
norm.cdf(stat,0,1)
0.018656770036782476
Since the p-value is smaller than the 10% significance level, we reject $H_0$ and conclude that less than 90% of the orders are delivered within the promised time.
Example: Of a sample of 361 owners of retail service and business firms that had gone into bankruptcy, 105 reported having no professional assistance prior to opening the business. It is claimed that at most 25% of all members of this population had no professional assistance before opening the business. Test the aforementioned claim at $\alpha=0.01$.
We're testing in this example the following hypotheses: $$H_0\,:\,\, p=.25\quad \mbox{ versus }\quad H_1\,:\,\, p<.25$$
Here the sample size is $n=361$ and the observed value of $X$ is 105.
stat, pvalue=proportions_ztest(count=105,nobs=361,value=.25,alternative='smaller')
stat
1.709349971523915
pvalue
0.9563069289956099
Since the p-value is much greater than $\alpha=0.01$, we can't reject $H_0$.
Here we have two samples, each summarized by a proportion, and we want to test whether the proportion in one of the underlying populations is greater than, less than, or different from the proportion in the other.
Example: we want to compare two different populations to see whether their proportions of success differ.
We use a two-sample z-test to check whether the sample allows us to reject the null hypothesis $H_0\,:\,\,p_A=p_B$.
We will use the following test statistic:
$$Z=\displaystyle\frac{\widehat{p}_A-\widehat{p}_B}{S_p}$$
Where $S_p$, called the pooled standard error, is computed as follows: $$ S_p=\sqrt{\widehat{p}(1-\widehat{p})\times \left(\displaystyle\frac{1}{n_A}+\displaystyle\frac{1}{n_B}\right)}$$ and $$\widehat{p}=\displaystyle\frac{n_A\widehat{p}_A+n_B\widehat{p}_B}{n_A+n_B}$$ is the pooled proportion.
Let $z_\mbox{obs}$ be the observed value of $Z$ under $H_0$. The test statistic $Z$ approximately follows the standard normal distribution, and the p-value associated with the two-sided alternative is computed as follows: $$\mbox{p-value}=\mathbb{P}\left(|Z|\geq |z_\mbox{obs}|\mid H_0\mbox{ true }\right)$$
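The formulas above can be collected into a small helper function (a sketch only; the statsmodels proportions_ztest call used in this section performs the same computation). Here it is applied to the sample counts of the example that follows:

```python
import numpy as np
from scipy.stats import norm

def two_proportion_ztest(x_a, n_a, x_b, n_b):
    """Pooled two-sided z-test for H0: p_A = p_B (sketch of the formulas above)."""
    p_a, p_b = x_a / n_a, x_b / n_b
    p_pool = (x_a + x_b) / (n_a + n_b)                      # pooled proportion
    sp = np.sqrt(p_pool * (1 - p_pool) * (1/n_a + 1/n_b))   # pooled standard error
    z = (p_a - p_b) / sp
    pvalue = 2 * (1 - norm.cdf(abs(z)))                     # two-sided p-value
    return z, pvalue

z, pv = two_proportion_ztest(410, 500, 379, 400)
print(round(z, 4))  # -5.7802
```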
In Python we proceed as follows:
from statsmodels.stats.proportion import proportions_ztest
import numpy as np
from scipy.stats import norm,t
sample_success_a, sample_size_a = (410, 500)
sample_success_b, sample_size_b = (379, 400)
We create Python arrays for the number of successes and for the sample sizes.
successes = np.array([sample_success_a, sample_success_b])
samples = np.array([sample_size_a, sample_size_b])
successes
array([410, 379])
samples
array([500, 400])
stat, pvalue = proportions_ztest(count=successes, nobs=samples, alternative='two-sided')
stat
-5.7802476568050825
pvalue
7.459074060078635e-09
Let's now see how the test statistic and the p-value are computed:
p_pooled=successes.sum()/samples.sum()
p_pooled
0.8766666666666667
Sp=np.sqrt(p_pooled*(1-p_pooled)*(np.sum(1/samples)))
Sp
0.02205787841112558
zscore = (successes[0]/samples[0]-successes[1]/samples[1])/Sp
zscore
-5.7802476568050825
stat
-5.7802476568050825
pv=2*(1-norm.cdf(np.abs(zscore),0,1))
pv
7.459074025106815e-09
pvalue
7.459074060078635e-09
Example: Let's consider the Titanic data.
import numpy as np
import pandas as pd
import scipy.stats.distributions as dist
Importing the data
df= pd.read_csv('titanic.csv')
df.head()
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
df.shape
(1309, 12)
We then encode the variable Survived:
df['Survived'].value_counts()
0 815 1 494 Name: Survived, dtype: int64
df['Survived'] = df['Survived'].map({1:'Survived',0:'Not Survived'})
df['Survived'].value_counts()
Not Survived 815 Survived 494 Name: Survived, dtype: int64
We select the Sex and Survived variables.
df1=df[['Sex','Survived']]
df1.shape
(1309, 2)
df1=df1.dropna()
df1.shape
(1309, 2)
tab=pd.crosstab(df1.Survived,df1.Sex)
tab
Sex | female | male |
---|---|---|
Survived | ||
Not Survived | 81 | 734 |
Survived | 385 | 109 |
tab=np.array(tab)
tab
array([[ 81, 734], [385, 109]], dtype=int64)
tab[1]
array([385, 109], dtype=int64)
counts=tab[1]
counts
array([385, 109], dtype=int64)
nobs=tab.sum(axis=0)
nobs
array([466, 843], dtype=int64)
The probability of surviving by gender:
counts/nobs
array([0.82618026, 0.12930012])
Comparing both probabilities
stat, pvalue = proportions_ztest(count=counts, nobs=nobs, alternative='two-sided')
stat
24.905334404307723
pvalue
6.513244629920008e-137
The p-value is practically zero, so we reject $H_0$: the survival probability differs significantly between female and male passengers.
Exercise: Compare the survival probability between other groups of passengers.
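As a starting point, the same pattern applies to other groupings such as passenger class. The counts below are hypothetical placeholders; on the real data they would come from a crosstab such as pd.crosstab(df['Survived'], df['Pclass']):

```python
import numpy as np
from statsmodels.stats.proportion import proportions_ztest

# Hypothetical counts for illustration only: survivors and totals for
# two passenger classes (replace with counts from the Titanic crosstab).
survived = np.array([200, 180])   # hypothetical survivors in class 1 and 3
totals = np.array([323, 709])     # hypothetical passengers in class 1 and 3
stat, pvalue = proportions_ztest(count=survived, nobs=totals, alternative='two-sided')
print(pvalue < 0.05)  # True: these two proportions differ significantly
```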