Cleaning Data in Python
Common data types and their pandas/Python representations:
- integers: int
- decimals: float
- binary (True/False): bool
- dates: datetime
- categories: category
# Step 1: strip the currency symbol so the strings can be converted to numbers.
df["Revenue"] = df["Revenue"].str.strip('$')
# Step 2: convert the cleaned strings to a numeric dtype.
df["Revenue"] = df["Revenue"].astype('int')
# ...or, if the values contain decimals, convert to float instead:
df["Revenue"] = df["Revenue"].astype('float')
# Categorical variables: store as 'category' dtype (saves memory and enables
# category-aware operations).
df["status"] = df["status"].astype('category')
import datetime as dt
# NOTE(review): 'duplicates' is assumed to be a boolean mask built earlier,
# e.g. duplicates = df.duplicated(keep=False) -- TODO confirm against the
# rest of the notes.
df[duplicates]
# Sort the duplicated rows so matching records appear next to each other.
df[duplicates].sort_values(by = 'first_name')
# You can have a complete row of duplicates, or only some columns duplicated
# across otherwise-different rows.
# If ALL columns in the row are duplicated, you can simply:
df.drop_duplicates(inplace = True)  # inplace=True drops duplicated rows directly
# inside the DataFrame without creating a new object.
# For other kinds of duplicates (e.g. first name and last name match but
# height and weight differ between the duplicate rows), one way is to group
# by the identifying columns and aggregate the conflicting ones, e.g.
# df.groupby(['first_name', 'last_name']).agg({'height': 'max', 'weight': 'mean'})
# Practice: compute the length of each string in the survey_response column.
resp_length = airlines.survey_response.str.len()
# If you know the date format for sure, you can reformat it as in the example
# below. NOTE(review): .dt only works on a datetime64 column — convert first
# with pd.to_datetime if Birthdate is still a string. Also note strftime
# turns the column back into strings.
df["Birthdate"] = df["Birthdate"].dt.strftime("%d-%m-%Y")
# Any ambiguity (e.g. is 2019-03-08 March 8th or August 3rd?) requires some
# homework on the data's origin before deciding how to parse it.
# Print acct_year
print(banking['acct_year'])
# Cross-field validation: check data integrity when merging data from
# different sources by verifying that related columns are logically
# consistent, e.g. e_class + b_class + p_class should match the
# total_class column.
sum_classes = df[['e_class','b_class','p_class']].sum(axis = 1)
# Boolean mask: True where the row-wise sum agrees with the recorded total.
consistent_sum = sum_classes == df['total_class']
inconsistent_df = df[~consistent_sum]
consistent_df = df[consistent_sum]
# Just like the example above: check consistency between age and birthdate.
import datetime as dt
df['birthday'] = pd.to_datetime(df['birthday'])
today = dt.date.today()
# Year subtraction alone overstates the age for people whose birthday hasn't
# happened yet this year, so subtract 1 for those rows.
birthday_pending = (
    (df['birthday'].dt.month > today.month)
    | ((df['birthday'].dt.month == today.month) & (df['birthday'].dt.day > today.day))
)
age_cal = today.year - df['birthday'].dt.year - birthday_pending
# Boolean mask: True where the computed age matches the recorded age.
age_equ = age_cal == df['age']
inconsistent_dfage = df[~age_equ]
consistent_dfage = df[age_equ]
# Exercise example.
# Store fund columns to sum against.
fund_columns = ['fund_A', 'fund_B', 'fund_C', 'fund_D']
# Split rows by whether CO2 is missing.
missing = df[df['CO2'].isna()]
notmissing = df[df['CO2'].notna()]  # notna() is the idiomatic inverse of isna()
# Compare summary statistics of the two groups to investigate whether the
# missingness is related to other variables.
missing.describe()
notmissing.describe()
#Record Linkage
# Minimum edit distance: the minimum number of single-character edits needed
# to turn one string into another (e.g. 'intention' vs 'execution').
# From the above we know, for every record of cuisine_type, the nearest match
# in the unique_type cuisine list.
#Generating Pairs
# Two columns may not match exactly because the same entity is named
# differently; hence you need record linkage.
# The recordlinkage package implements this.