4/2/24, 2:43 PM PLFS_MVPA
In [1]: # Step 1
# Load the PLFS 2022-23 workbook into a DataFrame.
# The location is held in DATA_PATH rather than being buried in the read call:
# it defaults to the original Desktop path, and can be redirected on another
# machine by setting the PLFS_XLSX_PATH environment variable before launching
# the kernel.
import os
import pandas as pd
DATA_PATH = os.environ.get('PLFS_XLSX_PATH', 'C:/Users/user/Desktop/PLFS_2022_23.xlsx')
df = pd.read_excel(DATA_PATH)
In [2]: #List the columns in the dataset
# Show every column label as a plain Python list
list(df.columns)
Out[2]:
['Sector',
'State',
'Religion',
'Social Group',
'Sex',
'Age',
'Marital Status',
'General Education',
'Technical Education',
'No of years in formal education',
'Status of Current Attendance in Educational Institution',
'Whether received any Vocational/ Technical Training',
'Duration of Training',
'Status Code',
'Industry Code',
'Whether Engaged in any work in Subsidiary Capacity',
'No of Workers in the Enterprise',
'Type of Job Contract',
'Eligible of Paid Leave',
'Social Security Benefits',
'Earning for Regular Salaried/ Wage Workers',
'Earnings for Self Employed']
In [3]: # Data cleaning step - 2
# Sector dummy: code 2 (Urban) becomes 1; every other value (Rural) becomes 0.
# Vectorised comparison instead of a per-row lambda; result stays int64.
df['Sector'] = df['Sector'].eq(2).astype('int64')
In [4]: df['Sector'].value_counts()
file:///C:/Users/user/Downloads/PLFS_MVPA (2).html 1/12
4/2/24, 2:43 PM PLFS_MVPA
Out[4]:
0 56713
1 31542
Name: Sector, dtype: int64
In [5]: # Data Cleaning - Step 3
# Religion dummy: code 1 (Hinduism, the majority) becomes 1; all other
# religions (minority) become 0.
df['Religion'] = df['Religion'].eq(1).astype('int64')
In [6]: df['Religion'].value_counts()
Out[6]:
1 72706
0 15549
Name: Religion, dtype: int64
In [7]: # Data Cleaning - Step 4
# Social Group dummy: code 9 ("others") becomes 1; SC/ST/OBC (every other
# code) becomes 0. The trailing expression displays the resulting counts.
df['Social Group'] = df['Social Group'].eq(9).astype('int64')
df['Social Group'].value_counts()
Out[7]:
0 71226
1 17029
Name: Social Group, dtype: int64
In [8]: # Data Cleaning - Step 5
# Sex dummy: code 1 (Male) becomes 1; every other code becomes 0.
# The trailing expression displays the resulting counts.
df['Sex'] = df['Sex'].eq(1).astype('int64')
df['Sex'].value_counts()
Out[8]:
1 45439
0 42816
Name: Sex, dtype: int64
In [9]: # Data Cleaning - Step 6
# General Education dummy: codes 1-8 and 10 (up to higher secondary) become 0;
# anything outside that set (above higher secondary) becomes 1.
# NOTE(review): a missing value is not in the set either, so it also lands
# in the 1 group - confirm the column has no blanks.
df['General Education'] = (
    ~df['General Education'].isin([1, 2, 3, 4, 5, 6, 7, 8, 10])
).astype('int64')
In [10]: # Data Cleaning - Step 7
# Technical Education dummy: code 1 (no technical education) becomes 0;
# every other value becomes 1.
df['Technical Education'] = df['Technical Education'].ne(1).astype('int64')
file:///C:/Users/user/Downloads/PLFS_MVPA (2).html 2/12
4/2/24, 2:43 PM PLFS_MVPA
In [11]: # Data Cleaning - Step 8
# Assign 0 to received no vocational/technical training and 1 for others
df['Whether received any Vocational/ Technical Training'] = df['Whether received any Vocational/ Technical Training'].apply(lambd
In [12]: # Data Cleaning - Step 9
# Assign 0 to being Engaged in any work in Subsidiary Capacity and 1 for No
df['Whether Engaged in any work in Subsidiary Capacity'] = df['Whether Engaged in any work in Subsidiary Capacity'].apply(lambda
In [13]: # Data Cleaning - Step 10
# Job-contract dummy: code 1 (no written contract) becomes 0; every other
# value becomes 1.
# NOTE(review): rows where the field is blank also become 1 - confirm that
# is intended for respondents the question does not apply to.
df['Type of Job Contract'] = df['Type of Job Contract'].ne(1).astype('int64')
In [14]: # Data Cleaning - Step 11
# Marital Status dummy: code 2 (currently married) becomes 1; every other
# status becomes 0.
df['Marital Status'] = df['Marital Status'].eq(2).astype('int64')
In [15]: # Data Cleaning - Step 12
# Both earnings columns vary widely, so natural-log versions are added as
# new columns (the raw columns are left untouched).
import numpy as np
# Tiny offset so log() stays finite when earnings are exactly 0; such rows
# come out at roughly log(1e-7) = -16.12.
epsilon = 1e-7
df['log_sal'] = np.log(df['Earning for Regular Salaried/ Wage Workers'].add(epsilon))
df['log_self'] = np.log(df['Earnings for Self Employed'].add(epsilon))
In [16]: # Data Cleaning - Step 13
# Quadratic terms so the regressions can capture non-linear effects of age
# and years of formal education.
df['Age_sq'] = df['Age'].pow(2)
df['Formal_Edu_sq'] = df['No of years in formal education'].pow(2)
In [17]: #List all the final columns in the dataset
# Bare last expression so the notebook displays the column Index, which now
# includes the derived log_sal, log_self, Age_sq and Formal_Edu_sq columns.
df.columns
file:///C:/Users/user/Downloads/PLFS_MVPA (2).html 3/12
4/2/24, 2:43 PM PLFS_MVPA
Out[17]:
Index(['Sector', 'State', 'Religion', 'Social Group', 'Sex', 'Age',
'Marital Status', 'General Education', 'Technical Education',
'No of years in formal education',
'Status of Current Attendance in Educational Institution',
'Whether received any Vocational/ Technical Training',
'Duration of Training', 'Status Code', 'Industry Code',
'Whether Engaged in any work in Subsidiary Capacity',
'No of Workers in the Enterprise', 'Type of Job Contract',
'Eligible of Paid Leave', 'Social Security Benefits',
'Earning for Regular Salaried/ Wage Workers',
'Earnings for Self Employed', 'log_sal', 'log_self', 'Age_sq',
'Formal_Edu_sq'],
dtype='object')
In [18]: # Data Cleaning - Step 14
# Split the sample by earnings source: rows with strictly positive salaried
# earnings go to df_sal, rows with strictly positive self-employment earnings
# go to df_self. Rows with zero or missing earnings in a column are excluded
# from that column's frame.
df_sal = df.loc[df['Earning for Regular Salaried/ Wage Workers'].gt(0)]
df_self = df.loc[df['Earnings for Self Employed'].gt(0)]
In [19]: #Step - 15 - Regression Model
# MODEL No. 1 - Estimating Earnings for Regular Salaried/ Wage Workers
import statsmodels.api as sm
# Regressors for Model 1 (log earnings of regular salaried / wage workers).
# Order matters: it fixes the row order of the coefficient table.
# The closing bracket was missing, which left the list literal unterminated.
independent_vars = [
    'Sector',
    'Religion',
    'Sex',
    'Age',
    'Social Group',
    'General Education',
    'Marital Status',
    'Technical Education',
    'No of years in formal education',
    'Whether received any Vocational/ Technical Training',
    'Whether Engaged in any work in Subsidiary Capacity',
    'Type of Job Contract',
    'Age_sq',
    'Formal_Edu_sq',
]
file:///C:/Users/user/Downloads/PLFS_MVPA (2).html 4/12
4/2/24, 2:43 PM PLFS_MVPA
# Response: log of regular salaried / wage earnings (salaried sub-sample only)
y = df_sal['log_sal']
# Design matrix: the chosen regressors plus an intercept column
X = sm.add_constant(df_sal.loc[:, independent_vars])
# Ordinary least squares fit of y on X
model = sm.OLS(endog=y, exog=X).fit()
# Text summary: coefficients, standard errors and fit diagnostics
print(model.summary())
file:///C:/Users/user/Downloads/PLFS_MVPA (2).html 5/12
4/2/24, 2:43 PM PLFS_MVPA
OLS Regression Results
==============================================================================
Dep. Variable: log_sal R-squared: 0.502
Model: OLS Adj. R-squared: 0.501
Method: Least Squares F-statistic: 403.0
Date: Tue, 02 Apr 2024 Prob (F-statistic): 0.00
Time: 14:36:52 Log-Likelihood: -4655.5
No. Observations: 5610 AIC: 9341.
Df Residuals: 5595 BIC: 9441.
Df Model: 14
Covariance Type: nonrobust
=======================================================================================================================
coef std err t P>|t| [0.025 0.975]
-----------------------------------------------------------------------------------------------------------------------
const 6.7024 0.090 74.836 0.000 6.527 6.878
Sector 0.1722 0.017 9.907 0.000 0.138 0.206
Religion 0.0626 0.021 3.018 0.003 0.022 0.103
Sex 0.5594 0.022 25.993 0.000 0.517 0.602
Age 0.0596 0.005 13.043 0.000 0.051 0.069
Social Group 0.1134 0.017 6.624 0.000 0.080 0.147
General Education 0.0320 0.036 0.898 0.369 -0.038 0.102
Marital Status 0.1157 0.022 5.297 0.000 0.073 0.159
Technical Education 0.2505 0.029 8.581 0.000 0.193 0.308
No of years in formal education 0.0017 0.006 0.283 0.777 -0.010 0.013
Whether received any Vocational/ Technical Training 0.0275 0.015 1.815 0.070 -0.002 0.057
Whether Engaged in any work in Subsidiary Capacity 0.2856 0.028 10.222 0.000 0.231 0.340
Type of Job Contract 0.4822 0.017 27.850 0.000 0.448 0.516
Age_sq -0.0006 5.59e-05 -11.248 0.000 -0.001 -0.001
Formal_Edu_sq 0.0022 0.000 5.798 0.000 0.001 0.003
==============================================================================
Omnibus: 173.792 Durbin-Watson: 1.969
Prob(Omnibus): 0.000 Jarque-Bera (JB): 355.426
Skew: -0.205 Prob(JB): 6.61e-78
Kurtosis: 4.163 Cond. No. 2.05e+04
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 2.05e+04. This might indicate that there are
strong multicollinearity or other numerical problems.
In [20]: # Step - 16
# Calculate VIF values for the Model - 1
file:///C:/Users/user/Downloads/PLFS_MVPA (2).html 6/12
4/2/24, 2:43 PM PLFS_MVPA
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant
# Regressors whose variance inflation factors are computed below; same names
# and order as the Model 1 regression.
independent_vars = [
    # location and demographics
    'Sector', 'Religion', 'Sex', 'Age', 'Social Group',
    # education and household status
    'General Education', 'Marital Status', 'Technical Education',
    'No of years in formal education',
    # training and employment characteristics
    'Whether received any Vocational/ Technical Training',
    'Whether Engaged in any work in Subsidiary Capacity',
    'Type of Job Contract',
    # quadratic terms
    'Age_sq', 'Formal_Edu_sq',
]
# Salaried-sample design matrix with an intercept column prepended, so the
# constant takes part in every VIF calculation.
df_with_const = add_constant(df_sal.loc[:, independent_vars])
# One VIF per design-matrix column (the constant included), assembled
# directly into a two-column table.
vif_data = pd.DataFrame({
    "Variable": df_with_const.columns,
    "VIF": [
        variance_inflation_factor(df_with_const.values, col_pos)
        for col_pos in range(len(df_with_const.columns))
    ],
})
print(vif_data)
file:///C:/Users/user/Downloads/PLFS_MVPA (2).html 7/12
4/2/24, 2:43 PM PLFS_MVPA
Variable VIF
0 const 145.780813
1 Sector 1.190473
2 Religion 1.066603
3 Sex 1.091667
4 Age 48.178772
5 Social Group 1.086503
6 General Education 5.226346
7 Marital Status 1.666061
8 Technical Education 1.353767
9 No of years in formal education 15.317316
10 Whether received any Vocational/ Technical Tra... 1.045240
11 Whether Engaged in any work in Subsidiary Capa... 1.182732
12 Type of Job Contract 1.218561
13 Age_sq 44.150284
14 Formal_Edu_sq 25.843036
In [21]: # Step - 17
# MODEL No. 2 - Estimating Earnings for Self Employed
import statsmodels.api as sm
# Regressors for Model 2 (log earnings of the self-employed); same names and
# order as Model 1 so the two coefficient tables line up row by row.
independent_vars = [
    # location and demographics
    'Sector', 'Religion', 'Sex', 'Age', 'Social Group',
    # education and household status
    'General Education', 'Marital Status', 'Technical Education',
    'No of years in formal education',
    # training and employment characteristics
    'Whether received any Vocational/ Technical Training',
    'Whether Engaged in any work in Subsidiary Capacity',
    'Type of Job Contract',
    # quadratic terms
    'Age_sq', 'Formal_Edu_sq',
]
# Design matrix for the self-employed sub-sample: chosen regressors plus an
# intercept column.
X = sm.add_constant(df_self.loc[:, independent_vars])
file:///C:/Users/user/Downloads/PLFS_MVPA (2).html 8/12
4/2/24, 2:43 PM PLFS_MVPA
# Response: log of self-employment earnings (self-employed sub-sample only)
y = df_self['log_self']
# Ordinary least squares fit of y on the design matrix X built above
model = sm.OLS(endog=y, exog=X).fit()
# Text summary: coefficients, standard errors and fit diagnostics
print(model.summary())
file:///C:/Users/user/Downloads/PLFS_MVPA (2).html 9/12
4/2/24, 2:43 PM PLFS_MVPA
OLS Regression Results
==============================================================================
Dep. Variable: log_self R-squared: 0.418
Model: OLS Adj. R-squared: 0.418
Method: Least Squares F-statistic: 715.8
Date: Tue, 02 Apr 2024 Prob (F-statistic): 0.00
Time: 14:37:51 Log-Likelihood: -12329.
No. Observations: 13960 AIC: 2.469e+04
Df Residuals: 13945 BIC: 2.480e+04
Df Model: 14
Covariance Type: nonrobust
=======================================================================================================================
coef std err t P>|t| [0.025 0.975]
-----------------------------------------------------------------------------------------------------------------------
const 6.4892 0.080 80.708 0.000 6.332 6.647
Sector 0.3093 0.012 26.292 0.000 0.286 0.332
Religion -0.0239 0.014 -1.753 0.080 -0.051 0.003
Sex 0.8892 0.014 64.611 0.000 0.862 0.916
Age 0.0461 0.003 17.629 0.000 0.041 0.051
Social Group 0.1482 0.013 11.301 0.000 0.122 0.174
General Education 0.0799 0.033 2.435 0.015 0.016 0.144
Marital Status 0.0814 0.016 5.022 0.000 0.050 0.113
Technical Education 0.0932 0.043 2.156 0.031 0.008 0.178
No of years in formal education 0.0147 0.004 3.808 0.000 0.007 0.022
Whether received any Vocational/ Technical Training 0.0205 0.011 1.946 0.052 -0.000 0.041
Whether Engaged in any work in Subsidiary Capacity 0.1606 0.013 11.935 0.000 0.134 0.187
Type of Job Contract 0.5474 0.058 9.418 0.000 0.433 0.661
Age_sq -0.0005 2.87e-05 -18.217 0.000 -0.001 -0.000
Formal_Edu_sq 0.0004 0.000 1.271 0.204 -0.000 0.001
==============================================================================
Omnibus: 736.413 Durbin-Watson: 1.970
Prob(Omnibus): 0.000 Jarque-Bera (JB): 1526.202
Skew: -0.368 Prob(JB): 0.00
Kurtosis: 4.443 Cond. No. 4.34e+04
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 4.34e+04. This might indicate that there are
strong multicollinearity or other numerical problems.
In [22]: # Step - 18
# Calculate VIF Values for Model - 2
file:///C:/Users/user/Downloads/PLFS_MVPA (2).html 10/12
4/2/24, 2:43 PM PLFS_MVPA
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant
# Regressors whose variance inflation factors are computed below; same names
# and order as the Model 2 regression.
independent_vars = [
    # location and demographics
    'Sector', 'Religion', 'Sex', 'Age', 'Social Group',
    # education and household status
    'General Education', 'Marital Status', 'Technical Education',
    'No of years in formal education',
    # training and employment characteristics
    'Whether received any Vocational/ Technical Training',
    'Whether Engaged in any work in Subsidiary Capacity',
    'Type of Job Contract',
    # quadratic terms
    'Age_sq', 'Formal_Edu_sq',
]
# Self-employed-sample design matrix with an intercept column prepended, so
# the constant takes part in every VIF calculation.
df_with_const = add_constant(df_self.loc[:, independent_vars])
# One VIF per design-matrix column (the constant included), assembled
# directly into a two-column table.
vif_data = pd.DataFrame({
    "Variable": df_with_const.columns,
    "VIF": [
        variance_inflation_factor(df_with_const.values, col_pos)
        for col_pos in range(len(df_with_const.columns))
    ],
})
print(vif_data)
file:///C:/Users/user/Downloads/PLFS_MVPA (2).html 11/12
4/2/24, 2:43 PM PLFS_MVPA
Variable VIF
0 const 263.213006
1 Sector 1.163624
2 Religion 1.074025
3 Sex 1.220613
4 Age 44.292376
5 Social Group 1.096183
6 General Education 4.291929
7 Marital Status 1.206137
8 Technical Education 1.179082
9 No of years in formal education 16.354550
10 Whether received any Vocational/ Technical Tra... 1.109433
11 Whether Engaged in any work in Subsidiary Capa... 1.116591
12 Type of Job Contract 1.026795
13 Age_sq 43.738403
14 Formal_Edu_sq 25.047101
file:///C:/Users/user/Downloads/PLFS_MVPA (2).html 12/12