Cleaning Data in Python

Text data - str

integers - int
decimals - float
binary - bool
dates - datetime
categories - category
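
# Before cleaning, it helps to check what type pandas has assigned to each column.
# A minimal sketch, assuming a pandas DataFrame named df is already loaded:
import pandas as pd

print(df.dtypes)   # dtype of every column, eg object, int64, float64
df.info()          # dtypes plus non-null counts in one view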

# removing $ from revenue column and converting column to int

#step 1
df["Revenue"] = df["Revenue"].str.strip('$')

#step 2
df["Revenue"] = df["Revenue"].astype('int')

# or, if the values have decimals, convert to float instead
df["Revenue"] = df["Revenue"].astype('float')

# after this you can sum the column


assert df["Revenue"].dtype == 'int' # This will tell if the column is rightly
converted or not, if it fine then you can sum the column
Total_Revenue = df["Revenue"].sum()

#assert function helps to check if the conversion happened correctly


assert df["revenue"].dtype = 'int'
#if no error then we are good

# categorical variables

df["status"] = df["status"].astype('category')

# outlier treatment example

df.loc[df['avg_rating'] > 5, 'avg_rating'] = 5 #convert avg_rating > 5 to 5


#asserting the above effort is correct or not
assert df['avg_rating'].max() <= 5

import datetime as dt

# converting the subscription_date column to a date type
df["subscription_date"] = pd.to_datetime(df["subscription_date"]).dt.date

# after .dt.date the column holds Python date objects, so its dtype is 'object'
assert df["subscription_date"].dtype == 'object'

# check that the cleaned dates are not in the future
today_date = dt.date.today()
assert df["subscription_date"].max() <= today_date

# if the assert fails, force-clean the data by capping future dates at today
df.loc[df["subscription_date"] > today_date, "subscription_date"] = today_date
assert df["subscription_date"].max() <= today_date
# duplicate values
column_names = ['first_name', 'last_name', 'address']
duplicates = df.duplicated(subset = column_names, keep = False)
# keep = False marks every copy of a duplicated row as True, so all of them are shown;
# the other options for keep are 'first' and 'last'

df[duplicates]
df[duplicates].sort_values(by = 'first_name')

# A row can be a complete duplicate, or only some of its columns may be duplicated
# If all columns in the row are duplicates, you can drop those rows directly
df.drop_duplicates(inplace = True)
# inplace = True drops the duplicated rows inside the DataFrame without creating a new object

# For other kinds of duplicates - eg first name, last name and address are duplicated
# but height and weight differ between the duplicate rows - one way is to group the
# duplicates and aggregate the differing columns

column_names = ['first_name', 'last_name', 'address']


summaries = {'height' : 'max', 'weight' : 'mean'} # max and mean are only examples; change them as needed
df = df.groupby(by=column_names).agg(summaries).reset_index()

#Validate if the above process is done correctly


duplicates = df.duplicated(subset = column_names,keep = False)
df[duplicates].sort_values(by = 'first_name')

# Chapter - Text and Categorical Variables

# Membership constraints - categorical variables should only take values from a
# predefined set of categories; remove inconsistent categorical values.
# Eg: a blood group column cannot contain Z+, so such rows should be removed from the analysis

# Find the cleanliness category in airlines not in categories


cat_notclean = set(airlines['cleanliness']).difference(categories['cleanliness'])

# Find rows with that category


cat_notclean_rows = airlines['cleanliness'].isin(cat_notclean)

# Print rows with inconsistent category


print(airlines[cat_notclean_rows]) # these rows are inconsistent and are removed in the next step

# Print rows with consistent categories only


print(airlines[~cat_notclean_rows])

# Next we are working on value inconsistency, eg: Married, MARRIED, UNMARRIED, Unmarried
df['marriage_status'] = df['marriage_status'].str.upper() # or .lower
df['marriage_status'].value_counts()

# at times some values can have trailing spaces


df['marriage_status'] = df['marriage_status'].str.strip()

# Creating Categorical data from the values in the data


#eg : income
# income_group column from income column

# Using cut() - create category ranges and names


import numpy as np  # np.inf is used as the upper bound

ranges = [0,200000,500000,np.inf]
group_names = ['0-200K','200K-500K','500K+']
#Create income group column
demographics['income_group'] = pd.cut(demographics['household_income'],
bins=ranges, labels=group_names)
demographics[['income_group','household_income']]

# Create mapping dictionary and replace


mapping = {'Microsoft': 'DesktopOS', 'MacOS': 'DesktopOS', 'Linux': 'DesktopOS',
           'IOS': 'MobileOS', 'Android': 'MobileOS'}
devices['operating_system'] = devices['operating_system'].replace(mapping)
devices['operating_system'].unique()

#Cleaning Text Data


# Regular expressions are useful for text fields such as names, phone numbers and emails
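
# A minimal sketch of regex-based text cleaning, assuming a hypothetical
# phone_number column; the pattern and the 10-digit rule are only examples:
df['phone_number'] = df['phone_number'].str.replace(r'\D+', '', regex=True) # keep digits only
digit_count = df['phone_number'].str.len()
df.loc[digit_count != 10, 'phone_number'] = np.nan # flag numbers without exactly 10 digits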

#practice
# Store length of each row in survey_response column
resp_length = airlines['survey_response'].str.len()

# Find rows in airlines where resp_length > 40


airlines_survey = airlines[resp_length > 40]

# Assert minimum survey_response length is > 40


assert airlines_survey['survey_response'].str.len().min() > 40

# Print new survey_response column


print(airlines_survey['survey_response'])
# Uniform data
# temperatures recorded in both Celsius and Fahrenheit need to be standardized to one unit, for example:
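
# A minimal sketch of unit standardization, assuming a hypothetical temperature
# column in which readings above 40 are really Fahrenheit values:
fahrenheit = df['temperature'] > 40
df.loc[fahrenheit, 'temperature'] = (df.loc[fahrenheit, 'temperature'] - 32) * 5 / 9
assert df['temperature'].max() <= 40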
# in the case below the date format comes in many different combinations

df["Birthdate"] =pd.to_datetime(df["Birthdate"],infer_datetime_format = True,


errors = "coerce")
# The coerce will give nan values where date format is difficult to be checked
eg: 27/27/2002

# in case you know the required date format for sure, you can write it out explicitly, for example:
df["Birthdate"] = df["Birthdate"].dt.strftime("%d-%m-%Y")

# ambiguous dates like 2019-03-08 (March 8 or August 3?) need some homework before deciding how to parse them

# standardize amount in dollars


# Find values of acct_cur that are equal to 'euro'
acct_eu = banking['acct_cur'] == 'euro'

# Convert acct_amount where it is in euro to dollars


banking.loc[acct_eu, 'acct_amount'] = banking.loc[acct_eu, 'acct_amount'] * 1.1

# Unify acct_cur column by changing 'euro' values to 'dollar'


banking.loc[acct_eu, 'acct_cur'] = 'dollar'

# Assert that only dollar currency remains


assert banking['acct_cur'].unique() == 'dollar'

# Convert account_opened to datetime


banking['account_opened'] = pd.to_datetime(banking['account_opened'],
# Infer datetime format
infer_datetime_format = True,
# Return missing value for error
errors = 'coerce')

# Get year of account opened


banking['acct_year'] = banking['account_opened'].dt.strftime('%Y')

# Print acct_year
print(banking['acct_year'])
#Cross Field Validation
#check data integrity when merging data from different sources
# check that values across columns are logically consistent, eg: e_class + b_class + p_class should match the total_class column

sum_classes = df[['e_class','b_class','p_class']].sum(axis = 1)
consistent_sum = sum_classes == df['total_class']

inconsistent_df = df[~consistent_sum]
consistent_df = df[consistent_sum]

#just like above example checking for consistency between age and birthdate

import datetime as dt

df['birthday'] = pd.to_datetime(df['birthday'])
today = dt.date.today()
age_cal = today.year - df['birthday'].dt.year
age_equ = age_cal == df['age']

inconsistent_dfage = df[~age_equ]
consistent_dfage = df[age_equ]

# exercise example
# Store fund columns to sum against
fund_columns = ['fund_A', 'fund_B', 'fund_C', 'fund_D']

# Find rows where fund_columns row sum == inv_amount


inv_equ = banking[fund_columns].sum(axis = 1) == banking['inv_amount']

# Store consistent and inconsistent data


consistent_inv = banking[inv_equ]
inconsistent_inv = banking[~inv_equ]

# Print number of inconsistent investments


print("Number of inconsistent investments: ", inconsistent_inv.shape[0])

#Completeness and Missing Data

# missing data can be represented by NA, NaN, 0, '.', etc.

# checking for NAs in dataframe


df.isna() # here you get only True or False for columns in df
df.isna().sum() # here you get the count of NAs for the columns in the df

#Missing data visualization


import missingno as msno
import matplotlib.pyplot as plt

#plotting missing data


msno.matrix(df)
plt.show()

# from the above we observe that CO2 data is missing in many records

missing = df[df['CO2'].isna()]
notmissing = df[~df['CO2'].isna()] # ~ negates the mask, selecting rows where CO2 is not missing

missing.describe()
notmissing.describe()

sorted_df = df.sort_values(by = 'Temperature')


msno.matrix(sorted_df)
plt.show()
# from the above output it is evident that CO2 values are missing for cold
# temperature records - possibly a sensor failure at low temperatures

# missing completely at random: missingness has no relationship with any values, observed or missing
# missing at random (eg data missing only for certain pockets of the cohorts due to some issue,
# i.e. the missingness depends on other observed values, so we know why it is missing)
# missing not at random (missingness depends on the unobserved values themselves, eg temperature
# not recorded above 50 degrees because it is too hot for the thermometer, but this is difficult
# to establish from the data alone)

#drop missing values


df_droppedmissingval = df.dropna(subset = ['CO2'])
df_droppedmissingval.describe()

#replacing with statistical measures


co2_mean = df['CO2'].mean()
df_mean_imputed = df.fillna({'CO2' : co2_mean})
df_mean_imputed.describe()

#you can also use domain knowledge


#Compare Strings

#Record Linkage
# minimum edit distance: the fewest edits needed to turn one string into another (eg 'intention' vs 'execution')
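
# A minimal sketch of string similarity scoring with the thefuzz package (assuming
# it is installed); fuzz scores a single pair, process.extract (used below) ranks many candidates:
from thefuzz import fuzz, process

print(fuzz.WRatio('Reeding', 'Reading')) # similarity score between 0 and 100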

# Store the unique values of cuisine_type in unique_types


unique_types = restaurants['cuisine_type'].unique()

# Calculate similarity of 'asian' to all values of unique_types


print(process.extract('asian', unique_types, limit = len(unique_types)))

# Calculate similarity of 'american' to all values of unique_types


print(process.extract('american', unique_types, limit = len(unique_types)))

# Calculate similarity of 'italian' to all values of unique_types


print(process.extract('italian', unique_types, limit = len(unique_types)))

# From the above we know, for each unique cuisine_type value, how close it is to each of the standard cuisine names.

# we now remap entries that are close to 'italian'

# Create a list of matches, comparing 'italian' with the cuisine_type column


matches = process.extract('italian', restaurants['cuisine_type'],
limit=len(restaurants.cuisine_type))

# Iterate through the list of matches to italian


for match in matches:
    # Check whether the similarity score is greater than or equal to 80
    if match[1] >= 80:
        # Select all rows where cuisine_type is spelled this way and set them to the correct cuisine
        restaurants.loc[restaurants['cuisine_type'] == match[0], 'cuisine_type'] = 'italian'

# but we need to do this for all the cuisines

categories = ['italian', 'asian', 'american']

# Iterate through categories


for cuisine in categories:
    # Create a list of matches, comparing cuisine with the cuisine_type column
    matches = process.extract(cuisine, restaurants['cuisine_type'],
                              limit=len(restaurants.cuisine_type))

    # Iterate through the list of matches
    for match in matches:
        # Check whether the similarity score is greater than or equal to 80
        if match[1] >= 80:
            # If it is, select all rows where cuisine_type is spelled this way and set them to the correct cuisine
            restaurants.loc[restaurants['cuisine_type'] == match[0], 'cuisine_type'] = cuisine

# Inspect the final result


print(restaurants['cuisine_type'].unique())

#Generating Pairs

# values in two columns don't match exactly because they are named or spelled differently, hence you need record linkage

#recordlinkage package
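
# A minimal sketch of the recordlinkage workflow, assuming two hypothetical
# DataFrames census_A and census_B that share state, date_of_birth and surname columns:
import recordlinkage

# generate candidate pairs, blocking on state so only rows with the same state are compared
indexer = recordlinkage.Index()
indexer.block('state')
pairs = indexer.index(census_A, census_B)

# compare the pairs: exact match on date_of_birth, fuzzy match on surname
compare_cl = recordlinkage.Compare()
compare_cl.exact('date_of_birth', 'date_of_birth', label='date_of_birth')
compare_cl.string('surname', 'surname', threshold=0.85, label='surname')
potential_matches = compare_cl.compute(pairs, census_A, census_B)

# keep candidate pairs that agree on at least 2 of the compared fields
matches = potential_matches[potential_matches.sum(axis=1) >= 2]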
