import pandas as pd


temperature = [34, 56, 15, -9, -121, -5, 39]
days = ['Mon','Tue','Wed','Thu','Fri','Sat','Sun']

# create series 
series_from_list = pd.Series(temperature, index=days)
series_from_list

Mon     34
Tue     56
Wed     15
Thu     -9
Fri   -121
Sat     -5
Sun     39
dtype: int64


temperature = [34, 56, 'a', -9, -121, -5, 39]
days = ['Mon','Tue','Wed','Thu','Fri','Sat','Sun']


series_from_list = pd.Series(temperature, index=days)
series_from_list

Mon      34
Tue      56
Wed       a
Thu      -9
Fri    -121
Sat      -5
Sun      39
dtype: object


my_dict = {'Mon': 33, 'Tue': 19, 'Wed': 15, 'Thu': 89, 'Fri': 11, 'Sat': -5, 'Sun': 9}
my_dict

{'Mon': 33, 'Tue': 19, 'Wed': 15, 'Thu': 89, 'Fri': 11, 'Sat': -5, 'Sun': 9}


series_from_dict = pd.Series(my_dict)
series_from_dict

Mon    33
Tue    19
Wed    15
Thu    89
Fri    11
Sat    -5
Sun     9
dtype: int64


import numpy as np


my_array = np.linspace(0,10,15)
my_array

array([ 0.        ,  0.71428571,  1.42857143,  2.14285714,  2.85714286,
        3.57142857,  4.28571429,  5.        ,  5.71428571,  6.42857143,
        7.14285714,  7.85714286,  8.57142857,  9.28571429, 10.        ])


len(my_array)

15


series_from_ndarray = pd.Series(my_array)
series_from_ndarray

0      0.000000
1      0.714286
2      1.428571
3      2.142857
4      2.857143
5      3.571429
6      4.285714
7      5.000000
8      5.714286
9      6.428571
10     7.142857
11     7.857143
12     8.571429
13     9.285714
14    10.000000
dtype: float64


# If Kaggle use this after uploading the xlsx into Kaggle
## data = pd.read_excel(io="../input/WA_Fn-UseC_-HR-Employee-Attrition.xlsx", sheetname=0, index_col='EmployeeNumber')


data = pd.read_excel(io="data1.xlsx",  index_col='EmployeeNumber')


data.dtypes

Age                          int64
Attrition                   object
BusinessTravel              object
DailyRate                    int64
Department                  object
DistanceFromHome             int64
Education                    int64
EducationField              object
EmployeeCount                int64
EnvironmentSatisfaction      int64
Gender                      object
HourlyRate                   int64
JobInvolvement               int64
JobLevel                     int64
JobRole                     object
JobSatisfaction              int64
MaritalStatus               object
MonthlyIncome                int64
MonthlyRate                  int64
NumCompaniesWorked           int64
Over18                      object
OverTime                    object
PercentSalaryHike            int64
PerformanceRating            int64
RelationshipSatisfaction     int64
StandardHours                int64
StockOptionLevel             int64
TotalWorkingYears            int64
TrainingTimesLastYear        int64
WorkLifeBalance              int64
YearsAtCompany               int64
YearsInCurrentRole           int64
YearsSinceLastPromotion      int64
YearsWithCurrManager         int64
dtype: object


data.head(3)


data.columns

Index(['Age', 'Attrition', 'BusinessTravel', 'DailyRate', 'Department',
       'DistanceFromHome', 'Education', 'EducationField', 'EmployeeCount',
       'EnvironmentSatisfaction', 'Gender', 'HourlyRate', 'JobInvolvement',
       'JobLevel', 'JobRole', 'JobSatisfaction', 'MaritalStatus',
       'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked', 'Over18',
       'OverTime', 'PercentSalaryHike', 'PerformanceRating',
       'RelationshipSatisfaction', 'StandardHours', 'StockOptionLevel',
       'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance',
       'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion',
       'YearsWithCurrManager'],
      dtype='object')


data['Attrition'].head()

EmployeeNumber
1    Yes
2     No
4    Yes
5     No
7     No
Name: Attrition, dtype: object


data[['Age', 'Gender','YearsAtCompany']].head()


data['AgeInMonths'] = 12*data['Age']
data['AgeInMonths'].head()

EmployeeNumber
1    492
2    588
4    444
5    396
7    324
Name: AgeInMonths, dtype: int64


del data['AgeInMonths']


data.columns

Index(['Age', 'Attrition', 'BusinessTravel', 'DailyRate', 'Department',
       'DistanceFromHome', 'Education', 'EducationField', 'EmployeeCount',
       'EnvironmentSatisfaction', 'Gender', 'HourlyRate', 'JobInvolvement',
       'JobLevel', 'JobRole', 'JobSatisfaction', 'MaritalStatus',
       'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked', 'Over18',
       'OverTime', 'PercentSalaryHike', 'PerformanceRating',
       'RelationshipSatisfaction', 'StandardHours', 'StockOptionLevel',
       'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance',
       'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion',
       'YearsWithCurrManager'],
      dtype='object')


data['BusinessTravel'][10:15]

EmployeeNumber
14    Travel_Rarely
15    Travel_Rarely
16    Travel_Rarely
18    Travel_Rarely
19    Travel_Rarely
Name: BusinessTravel, dtype: object


data[10:15]


selected_EmployeeNumbers = [15, 94, 337, 1120]


data['YearsAtCompany']

EmployeeNumber
1        6
2       10
4        0
5        8
7        2
        ..
2061     5
2062     7
2064     6
2065     9
2068     4
Name: YearsAtCompany, Length: 1470, dtype: int64


data['YearsAtCompany'].loc[selected_EmployeeNumbers]

EmployeeNumber
15      9
94      5
337     2
1120    7
Name: YearsAtCompany, dtype: int64


data.loc[selected_EmployeeNumbers]


data.loc[94,'YearsAtCompany']

5


data['Department'].value_counts()

Research & Development    961
Sales                     446
Human Resources            63
Name: Department, dtype: int64


data['Department'].value_counts().plot(kind='barh', title='Department')

<AxesSubplot:title={'center':'Department'}>


data['Department'].value_counts().plot(kind='pie', title='Department')

<AxesSubplot:title={'center':'Department'}, ylabel='Department'>


data['Attrition'].value_counts()

No     1233
Yes     237
Name: Attrition, dtype: int64


data['Attrition'].value_counts(normalize=True)

No     0.838776
Yes    0.161224
Name: Attrition, dtype: float64


data['HourlyRate'].mean()

65.89115646258503


data['JobSatisfaction'].head()

EmployeeNumber
1    4
2    2
4    3
5    3
7    2
Name: JobSatisfaction, dtype: int64


JobSatisfaction_cat = {
    1: 'Low',
    2: 'Medium',
    3: 'High',
    4: 'Very High'
}


data['JobSatisfaction'] = data['JobSatisfaction'].map(JobSatisfaction_cat)
data['JobSatisfaction'].head()

EmployeeNumber
1    Very High
2       Medium
4         High
5         High
7       Medium
Name: JobSatisfaction, dtype: object


data['JobSatisfaction'].value_counts()

Very High    459
High         442
Low          289
Medium       280
Name: JobSatisfaction, dtype: int64


100*data['JobSatisfaction'].value_counts(normalize=True)

Very High    31.224490
High         30.068027
Low          19.659864
Medium       19.047619
Name: JobSatisfaction, dtype: float64


data['JobSatisfaction'].value_counts(normalize=True).plot(kind='pie', title='Department')

<AxesSubplot:title={'center':'Department'}, ylabel='JobSatisfaction'>


from pandas.api.types import CategoricalDtype
cats=['Low', 'Medium', 'High', 'Very High']
cat_type = CategoricalDtype(categories=cats, ordered=True)
data['JobSatisfaction'] = data['JobSatisfaction'].astype(cat_type)


data['JobSatisfaction'].head()

EmployeeNumber
1    Very High
2       Medium
4         High
5         High
7       Medium
Name: JobSatisfaction, dtype: category
Categories (4, object): ['Low' < 'Medium' < 'High' < 'Very High']


data['JobSatisfaction'].value_counts().plot(kind='barh', title='Department')

<AxesSubplot:title={'center':'Department'}>


data['JobSatisfaction'].value_counts(sort=False).plot(kind='barh', title='Department')

<AxesSubplot:title={'center':'Department'}>


data['JobSatisfaction'] == 'Low'

EmployeeNumber
1       False
2       False
4       False
5       False
7       False
        ...  
2061    False
2062     True
2064    False
2065    False
2068    False
Name: JobSatisfaction, Length: 1470, dtype: bool


data.loc[data['JobSatisfaction'] == 'Low'].index

Int64Index([  10,   20,   27,   31,   33,   38,   51,   52,   54,   68,
            ...
            1975, 1980, 1998, 2021, 2023, 2038, 2054, 2055, 2057, 2062],
           dtype='int64', name='EmployeeNumber', length=289)


data['JobInvolvement'].head()

EmployeeNumber
1    3
2    2
4    2
5    3
7    3
Name: JobInvolvement, dtype: int64


subset_of_interest = data.loc[(data['JobSatisfaction'] == "Low") | (data['JobSatisfaction'] == "Very High")]
subset_of_interest.shape

(748, 34)


subset_of_interest.head()


subset_of_interest['JobSatisfaction'].value_counts()

Very High    459
Low          289
Medium         0
High           0
Name: JobSatisfaction, dtype: int64


subset_of_interest['JobSatisfaction'].cat.remove_unused_categories(inplace=True)

C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\arrays\categorical.py:2631: FutureWarning: The `inplace` parameter in pandas.Categorical.remove_unused_categories is deprecated and will be removed in a future version.
  res = method(*args, **kwargs)


subset_of_interest['JobSatisfaction'].value_counts()

Very High    459
Low          289
Name: JobSatisfaction, dtype: int64


grouped = subset_of_interest.groupby('JobSatisfaction')


grouped.head()


grouped.groups

{'Low': [10, 20, 27, 31, 33, 38, 51, 52, 54, 68, 70, 74, 75, 81, 86, 88, 100, 101, 113, 124, 133, 134, 145, 153, 170, 190, 197, 199, 200, 235, 239, 240, 241, 244, 250, 267, 274, 282, 288, 297, 299, 303, 328, 334, 339, 340, 347, 351, 362, 369, 374, 382, 390, 396, 412, 424, 425, 429, 451, 454, 474, 486, 510, 515, 517, 522, 524, 530, 532, 534, 536, 538, 549, 567, 573, 590, 605, 615, 625, 630, 648, 650, 662, 664, 667, 682, 684, 702, 705, 725, 728, 729, 732, 733, 742, 758, 764, 771, 775, 776, ...], 'Very High': [1, 8, 18, 22, 23, 24, 30, 36, 39, 40, 42, 45, 49, 53, 57, 62, 63, 72, 73, 76, 78, 79, 97, 98, 104, 106, 107, 112, 116, 117, 118, 120, 137, 139, 140, 143, 144, 148, 152, 154, 155, 158, 165, 169, 174, 179, 184, 192, 195, 198, 207, 215, 217, 221, 223, 228, 230, 242, 243, 245, 246, 262, 264, 273, 275, 281, 283, 286, 287, 291, 298, 302, 306, 309, 311, 312, 315, 316, 319, 323, 325, 327, 333, 335, 336, 338, 346, 349, 353, 361, 367, 372, 373, 377, 378, 380, 388, 389, 391, 393, ...]}


grouped.get_group('Low').head()


grouped.get_group('Very High').head()


grouped[['Age','JobSatisfaction']].head()


grouped['Age'].mean()

JobSatisfaction
Low          36.916955
Very High    36.795207
Name: Age, dtype: float64


grouped['Age'].describe()


grouped['Age'].describe().unstack()


grouped['Age'].plot(kind='density', title='Age')

JobSatisfaction
Low          AxesSubplot(0.125,0.125;0.775x0.755)
Very High    AxesSubplot(0.125,0.125;0.775x0.755)
Name: Age, dtype: object


grouped['Department'].value_counts().unstack()


grouped['Department'].value_counts(normalize=True).unstack()


grouped['Department'].value_counts().unstack().plot(kind="barh")

<AxesSubplot:ylabel='JobSatisfaction'>


grouped['Department'].value_counts(normalize=True).unstack().plot(kind="barh")

<AxesSubplot:ylabel='JobSatisfaction'>


data['Department'].value_counts(normalize=True,sort=False).plot(kind="barh")

<AxesSubplot:>


data['Department'].value_counts(normalize=True,sort=False).plot(kind="barh")

<AxesSubplot:>


grouped['DistanceFromHome'].describe().unstack()

       JobSatisfaction
count  Low                289.000000
       Very High          459.000000
mean   Low                  9.190311
       Very High            9.030501
std    Low                  8.045127
       Very High            8.257004
min    Low                  1.000000
       Very High            1.000000
25%    Low                  2.000000
       Very High            2.000000
50%    Low                  7.000000
       Very High            7.000000
75%    Low                 14.000000
       Very High           14.000000
max    Low                 29.000000
       Very High           29.000000
dtype: float64


grouped['DistanceFromHome'].plot(kind='density', title='Distance From Home',legend=True)

JobSatisfaction
Low          AxesSubplot(0.125,0.125;0.775x0.755)
Very High    AxesSubplot(0.125,0.125;0.775x0.755)
Name: DistanceFromHome, dtype: object


grouped['HourlyRate'].describe()


grouped['HourlyRate'].plot(kind='density', title='Hourly Rate',legend=True)

JobSatisfaction
Low          AxesSubplot(0.125,0.125;0.775x0.755)
Very High    AxesSubplot(0.125,0.125;0.775x0.755)
Name: HourlyRate, dtype: object


grouped['MonthlyIncome'].describe()


grouped['HourlyRate'].plot(kind='density', title='Hourly Rate',legend=True)

JobSatisfaction
Low          AxesSubplot(0.125,0.125;0.775x0.755)
Very High    AxesSubplot(0.125,0.125;0.775x0.755)
Name: HourlyRate, dtype: object


!pip install numpy

Defaulting to user installation because normal site-packages is not writeable
Requirement already satisfied: numpy in c:\users\dhafe\appdata\roaming\python\python310\site-packages (1.22.1)

	count	mean	std	min	25%	50%	75%	max
JobSatisfaction
Low	289.0	6561.570934	4645.170134	1091.0	3072.0	4968.0	8564.0	19943.0
Very High	459.0	6472.732026	4573.906428	1051.0	2927.5	5126.0	7908.0	19845.0

The Pandas Library¶

Contents

Description from the Pandas documentation:¶

Series and DataFrames¶

The Panda Series¶

Pandas DataFrames¶

Data Manipulation¶

	Age	Attrition	BusinessTravel	DailyRate	Department	DistanceFromHome	Education	EducationField	EmployeeCount	EnvironmentSatisfaction	...	RelationshipSatisfaction	StandardHours	StockOptionLevel	TotalWorkingYears	TrainingTimesLastYear	WorkLifeBalance	YearsAtCompany	YearsInCurrentRole	YearsSinceLastPromotion	YearsWithCurrManager
EmployeeNumber
1	41	Yes	Travel_Rarely	1102	Sales	1	2	Life Sciences	1	2	...	1	80	0	8	0	1	6	4	0	5
2	49	No	Travel_Frequently	279	Research & Development	8	1	Life Sciences	1	3	...	4	80	1	10	3	3	10	7	1	7
4	37	Yes	Travel_Rarely	1373	Research & Development	2	2	Other	1	4	...	2	80	0	7	3	3	0	0	0	0

	Age	Attrition	BusinessTravel	DailyRate	Department	DistanceFromHome	Education	EducationField	EmployeeCount	EnvironmentSatisfaction	...	RelationshipSatisfaction	StandardHours	StockOptionLevel	TotalWorkingYears	TrainingTimesLastYear	WorkLifeBalance	YearsAtCompany	YearsInCurrentRole	YearsSinceLastPromotion	YearsWithCurrManager
EmployeeNumber
10	59	No	Travel_Rarely	1324	Research & Development	3	3	Medical	1	3	...	1	80	3	12	3	2	1	0	0	0
20	29	No	Travel_Rarely	1389	Research & Development	21	4	Life Sciences	1	2	...	3	80	1	10	1	3	10	9	8	8
27	36	Yes	Travel_Rarely	1218	Sales	9	4	Life Sciences	1	3	...	2	80	0	10	4	3	5	3	0	3
31	34	Yes	Travel_Rarely	699	Research & Development	6	1	Medical	1	2	...	3	80	0	8	2	3	4	2	1	3
33	32	Yes	Travel_Frequently	1125	Research & Development	16	1	Life Sciences	1	2	...	2	80	0	10	5	3	10	2	6	7

	count	mean	std	min	25%	50%	75%	max
JobSatisfaction
Low	289.0	36.916955	9.245496	19.0	30.0	36.0	42.0	60.0
Very High	459.0	36.795207	9.125609	18.0	30.0	35.0	43.0	60.0

Department	Human Resources	Research & Development	Sales
JobSatisfaction
Low	0.038062	0.664360	0.297578
Very High	0.037037	0.642702	0.320261

	count	mean	std	min	25%	50%	75%	max
JobSatisfaction
Low	289.0	68.636678	20.439515	30.0	52.0	72.0	86.0	100.0
Very High	459.0	64.681917	20.647571	30.0	47.0	64.0	82.5	100.0

	Age	Gender	YearsAtCompany
EmployeeNumber
1	41	Female	6
2	49	Male	10
4	37	Male	0
5	33	Female	8
7	27	Male	2