import pandas as pd
import numpy as np
df = pd.read_csv('data1.csv')
df.head(5)


import numpy as np
from scipy.stats import norm,t


def get_ci_known_variance(sigma, sample_mean, sample_size, alpha):
    margin_of_error = norm.ppf(1 - alpha/2)*sigma/np.sqrt(sample_size)
    return sample_mean - margin_of_error, sample_mean + margin_of_error


def get_ci_unknown_variance(sample_s, sample_mean, sample_size, alpha):
    margin_of_error = t.ppf(1 - alpha/2,sample_size-1)*sample_s/np.sqrt(sample_size)
    return sample_mean - margin_of_error, sample_mean + margin_of_error


mu, sigma = 40, 2.5
random_sample = np.random.normal(mu, sigma, 100)


random_sample[0:5]

array([38.03299039, 41.66455159, 41.73117252, 37.66321625, 37.82044562])


sample_mean=np.average(random_sample)


sample_mean

40.090871779180944


sample_size=len(random_sample)
sample_size

100


get_ci_known_variance(sigma,sample_mean,sample_size,.05)

(39.60088078304593, 40.58086277531596)


sample_s=random_sample.std()
sample_s

2.7015723352282577


get_ci_unknown_variance(sample_s,sample_mean,sample_size,.05)

(39.55482121685226, 40.626922341509626)


confidence_level = 0.95
degrees_freedom = sample_size - 1


import scipy


sample_standard_error = scipy.stats.sem(random_sample)


sample_standard_error

0.2715182357562536


sample_s/np.sqrt(sample_size-1)

0.27151823575625356


scipy.stats.t.interval(confidence_level, degrees_freedom, sample_mean, sample_standard_error)

(39.552120693149654, 40.629622865212234)


scipy.stats.norm.interval(.95,loc=sample_mean, scale=sigma/np.sqrt(sample_size))

(39.60088078304593, 40.58086277531596)


df['Gender'].value_counts()

Male      882
Female    588
Name: Gender, dtype: int64


df.groupby("Gender").agg({"DailyRate": [np.mean, np.std, np.size]})


x=df.groupby("Gender").agg({"DailyRate": [np.mean, np.std, np.size]})


x=np.array(x)

x

array([[808.27380952, 408.24167967, 588.        ],
       [798.62698413, 400.50902101, 882.        ]])


x[0,2]

588.0


x[0,0]

808.2738095238095


x[0,1]/np.sqrt(x[0,2]-1)

16.84993739332325


CI_F=scipy.stats.t.interval(.95, x[0,2] -1, x[0,0], x[0,1]/np.sqrt(x[0,2]-1))


CI_F

(775.1803043941346, 841.3673146534844)


CI_M=scipy.stats.t.interval(.95, x[1,2] -1, x[1,0], x[1,1]/np.sqrt(x[1,2]-1))


CI_M

(772.1438431601089, 825.1101250938593)


CI_M[0]

772.1438431601089


ci_dailrate = {}
ci_dailrate['Gender'] = ['Male','Female']
ci_dailrate['lb'] = [CI_M[0],CI_F[0]]
ci_dailrate['ub'] = [CI_M[1],CI_F[1]]
df_ci= pd.DataFrame(ci_dailrate)


df_ci


import matplotlib.pyplot as plt


for lb,ub,y in zip(df_ci['lb'],df_ci['ub'],range(len(df_ci))):
    plt.plot((lb,ub),(y,y),'ro-')
plt.yticks(range(len(df_ci)),list(df_ci['Gender']))

([<matplotlib.axis.YTick at 0x1523a227130>,
  <matplotlib.axis.YTick at 0x1523a216970>],
 [Text(0, 0, 'Male'), Text(0, 1, 'Female')])


import pandas as pd
import numpy as np
import scipy.stats as st

def plot_diff_in_means(data: pd.DataFrame,alpha, col1: str, col2: str):
    """
    given data, plots difference in means with confidence intervals across groups
    col1: categorical data with groups
    col2: continuous data for the means
    alpha: is the level of significance, it's usualy equal to .95
    """
    n = data.groupby(col1)[col2].count()
    # n contains a pd.Series with sample size for each category

    cat = list(data.groupby(col1, as_index=False)[col2].count()[col1])
    # cat has names of the categories, like 'category 1', 'category 2'

    mean = data.groupby(col1)[col2].agg('mean')
    # the average value of col2 across the categories

    std = data.groupby(col1)[col2].agg(np.std)
    se = std / np.sqrt(n)
    # standard deviation and standard error

    lower = st.t.interval(alpha = alpha, df=n-1, loc = mean, scale = se)[0]
    upper = st.t.interval(alpha = alpha, df =n-1, loc = mean, scale = se)[1]
    # calculates the upper and lower bounds using scipy

    for upper, mean, lower, y in zip(upper, mean, lower, cat):
        plt.plot((lower, mean, upper), (y, y, y), 'b.-')
        # for 'b.-': 'b' means 'blue', '.' means dot, '-' means solid line
    plt.yticks(
        range(len(n)), 
        list(data.groupby(col1, as_index = False)[col2].count()[col1])
        )


plot_diff_in_means(data = df,alpha=.95, col1 = 'Gender', col2 = 'DailyRate')


plot_diff_in_means(data = df,alpha=.95, col1 = 'EducationField', col2 = 'DailyRate')


plot_diff_in_means(data = df,alpha=.95, col1 = 'Department', col2 = 'DailyRate')


import statsmodels.api as sm


from statsmodels.stats.proportion import proportion_confint


proportion_confint(count=560,    # Number of "successes"
                   nobs=1200,    # Number of trials
                   alpha=(1 - 0.95),
                   method='normal')  # when we use asymptotic normal approximation
# Alpha, which is 1 minus the confidence level

(0.4384399591955059, 0.49489337413782747)


import pandas as pd


tab = pd.crosstab(df['BusinessTravel'], df['Department'],margins=True)
tab


table = sm.stats.Table(tab)
table.table

array([[   6.,   97.,   47.,  150.],
       [  11.,  182.,   84.,  277.],
       [  46.,  682.,  315., 1043.],
       [  63.,  961.,  446., 1470.]])


CI_HR=proportion_confint(count=table.table[1,0],nobs=table.table[3,0],alpha=(1-.95))
CI_HR

(0.08086094446182195, 0.2683454047445272)


CI_RD=proportion_confint(count=table.table[1,1],nobs=table.table[3,1],alpha=(1-.95))
CI_RD

(0.16461369335918247, 0.214158419023752)


CI_SL=proportion_confint(count=table.table[1,2],nobs=table.table[3,2],alpha=(1-.95))
CI_SL

(0.1520547541182747, 0.22462686023150105)


ci_travel = {}
ci_travel['Department'] = ['HR','RD','Sales']
ci_travel['lb'] = [CI_HR[0],CI_RD[0],CI_SL[0]]
ci_travel['ub'] = [CI_HR[1],CI_RD[1],CI_SL[1]]
df_ci= pd.DataFrame(ci_travel)
df_ci


import matplotlib.pyplot as plt


for lb,ub,y in zip(df_ci['lb'],df_ci['ub'],range(len(df_ci))):
    plt.plot((lb,ub),(y,y),'ro-')
plt.yticks(range(len(df_ci)),list(df_ci['Department']))

([<matplotlib.axis.YTick at 0x1523b764e80>,
  <matplotlib.axis.YTick at 0x1523b764820>,
  <matplotlib.axis.YTick at 0x1523b75f670>],
 [Text(0, 0, 'HR'), Text(0, 1, 'RD'), Text(0, 2, 'Sales')])


import pandas as pd
import numpy as np
import scipy.stats as stats

def two_proprotions_confint(success_a, size_a, success_b, size_b, significance = 0.05):
    """
    A/B test for two proportions;
    given a success a trial size of group A and B compute
    its confidence interval;
    resulting confidence interval matches R's prop.test function

    Parameters
    ----------
    success_a, success_b : int
        Number of successes in each group

    size_a, size_b : int
        Size, or number of observations in each group

    significance : float, default 0.05
        Often denoted as alpha. Governs the chance of a false positive.
        A significance level of 0.05 means that there is a 5% chance of
        a false positive. In other words, our confidence level is
        1 - 0.05 = 0.95

    Returns
    -------
    prop_diff : float
        Difference between the two proportion

    confint : 1d ndarray
        Confidence interval of the two proportion test
    """
    prop_a = success_a / size_a
    prop_b = success_b / size_b
    var = prop_a * (1 - prop_a) / size_a + prop_b * (1 - prop_b) / size_b
    se = np.sqrt(var)

    # z critical value
    confidence = 1 - significance
    z = stats.norm(loc = 0, scale = 1).ppf(confidence + significance / 2)

    # standard formula for the confidence interval
    # point-estimtate +- z * standard-error
    prop_diff = prop_b - prop_a
    confint = prop_diff + np.array([-1, 1]) * z * se
    return prop_diff, confint


two_proprotions_confint(success_a=table.table[1,0],size_a=table.table[3,0],success_b=table.table[1,1],size_b=table.table[3,1])

(0.014782881588292635, array([-0.08217729,  0.11174306]))


two_proprotions_confint(success_a=table.table[1,2],size_a=table.table[3,2],success_b=table.table[1,1],size_b=table.table[3,1])

(0.001045249016579347, array([-0.04289047,  0.04498097]))

	Age	Attrition	BusinessTravel	DailyRate	Department	DistanceFromHome	Education	EducationField	EmployeeCount	EmployeeNumber	...	RelationshipSatisfaction	StandardHours	StockOptionLevel	TotalWorkingYears	TrainingTimesLastYear	WorkLifeBalance	YearsAtCompany	YearsInCurrentRole	YearsSinceLastPromotion	YearsWithCurrManager
0	41	Yes	Travel_Rarely	1102	Sales	1	2	Life Sciences	1	1	...	1	80	0	8	0	1	6	4	0	5
1	49	No	Travel_Frequently	279	Research & Development	8	1	Life Sciences	1	2	...	4	80	1	10	3	3	10	7	1	7
2	37	Yes	Travel_Rarely	1373	Research & Development	2	2	Other	1	4	...	2	80	0	7	3	3	0	0	0	0
3	33	No	Travel_Frequently	1392	Research & Development	3	4	Life Sciences	1	5	...	3	80	0	8	3	3	8	7	3	0
4	27	No	Travel_Rarely	591	Research & Development	2	1	Medical	1	7	...	4	80	1	6	3	3	2	2	2	2

Estimation with Python¶

Table of Contents

The mean¶

The theory¶

Practice with Python¶

Practice with data¶

The proportion¶

The difference of two proportions¶

	DailyRate
	mean	std	size
Gender
Female	808.273810	408.241680	588
Male	798.626984	400.509021	882

Department	Human Resources	Research & Development	Sales	All
BusinessTravel
Non-Travel	6	97	47	150
Travel_Frequently	11	182	84	277
Travel_Rarely	46	682	315	1043
All	63	961	446	1470

	Department	lb	ub
0	HR	0.080861	0.268345
1	RD	0.164614	0.214158
2	Sales	0.152055	0.224627

	Gender	lb	ub
0	Male	772.143843	825.110125
1	Female	775.180304	841.367315