02 Data Cleaning
02 Data Cleaning
In [197… df = pd.read_csv(r"C:\Users\LENOVO\Downloads\mywork\Ipl_Analysis\data\[Link]")  # NOTE(review): absolute, machine-specific path; the file name appears only as the placeholder "[Link]" in this export
In [198… [Link]()
Out[198… Unnamed:
match_id date match_type event_name innings batting_team bowling_team over ball ball_no batter bat_pos r
0
Indian Royal
2008- Kolkata SC
0 131970 335982 T20 Premier 1 Challengers 0 1 0.1 1
04-18 Knight Riders Ganguly
League Bangalore
Indian Royal
2008- Kolkata BB
1 131971 335982 T20 Premier 1 Challengers 0 2 0.2 2
04-18 Knight Riders McCullum
League Bangalore
Indian Royal
2008- Kolkata BB
2 131972 335982 T20 Premier 1 Challengers 0 3 0.3 2
04-18 Knight Riders McCullum
League Bangalore
Indian Royal
2008- Kolkata BB
3 131973 335982 T20 Premier 1 Challengers 0 3 0.3 2
04-18 Knight Riders McCullum
League Bangalore
Indian Royal
2008- Kolkata BB
4 131974 335982 T20 Premier 1 Challengers 0 4 0.4 2
04-18 Knight Riders McCullum
League Bangalore
In [199… [Link]
[Link] 1/28
3/4/26, 9:29 PM 02_data_cleaning
In [201… [Link]
In [203… df["date"].dtype  # check the dtype of the date column; the recorded output is datetime64[ns] ('<M8[ns]')
Out[203… dtype('<M8[ns]')
In [204… [Link]("season")["year"].nunique()
[Link] 2/28
3/4/26, 9:29 PM 02_data_cleaning
Out[204… season
2009 1
2011 1
2012 1
2013 1
2014 1
2015 1
2016 1
2017 1
2018 1
2019 1
2021 1
2022 1
2023 1
2024 1
2025 1
2007/08 1
2009 1
2009/10 1
2011 1
2019 1
2020/21 1
2021 1
Name: year, dtype: int64
In [205… df = [Link](columns=["season"])
In [207… [Link]()
[Link] 3/28
3/4/26, 9:29 PM 02_data_cleaning
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 278205 entries, 0 to 278204
Data columns (total 63 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 match_id 278205 non-null int64
1 date 278205 non-null datetime64[ns]
2 match_type 278205 non-null object
3 event_name 278205 non-null object
4 innings 278205 non-null int64
5 batting_team 278205 non-null object
6 bowling_team 278205 non-null object
7 over 278205 non-null int64
8 ball 278205 non-null int64
9 ball_no 278205 non-null float64
10 batter 278205 non-null object
11 bat_pos 278205 non-null int64
12 runs_batter 278205 non-null int64
13 balls_faced 278205 non-null int64
14 bowler 278205 non-null object
15 valid_ball 278205 non-null int64
16 runs_extras 278205 non-null int64
17 runs_total 278205 non-null int64
18 runs_bowler 278205 non-null int64
19 runs_not_boundary 278205 non-null bool
20 extra_type 15133 non-null object
21 non_striker 278205 non-null object
22 non_striker_pos 278205 non-null int64
23 wicket_kind 13823 non-null object
24 player_out 13823 non-null object
25 fielders 10013 non-null object
26 runs_target 133903 non-null float64
27 review_batter 872 non-null object
28 team_reviewed 872 non-null object
29 review_decision 872 non-null object
30 umpire 872 non-null object
31 umpires_call 278205 non-null bool
32 player_of_match 278205 non-null object
33 match_won_by 278205 non-null object
34 win_outcome 273503 non-null object
35 toss_winner 278205 non-null object
[Link] 4/28
3/4/26, 9:29 PM 02_data_cleaning
In [208… df["match_number"].unique()[:20]  # first 20 distinct match_number values
In [209… df = [Link](columns=["match_number"])
In [210… df["event_match_no"].unique()  # distinct values; the recorded output mixes ints, numeric strings and 'Unknown'
[Link] 5/28
3/4/26, 9:29 PM 02_data_cleaning
Out[210… array([1, 2, 3, 5, 4, 6, 7, 8, 9, 10, 12, 11, 14, 13, 15, 16, 17, 19, 18,
20, 22, 21, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
'35', '36', '37', '38', '40', '39', '42', '41', '43', '44', '45',
'46', '48', '50', '49', '51', '52', '53', '54', '55', '56',
'Unknown', '1', '2', '3', '4', '5', '6', '8', '9', '10', '11',
'12', '14', '15', 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
48, 49, '7', '13', '16', '17', '18', '19', '21', '20', '22', '23',
'24', '25', '26', '27', '28', '29', '30', '31', '32', '33', '34',
'47', 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64,
65, 66, 67, 68, 69, '69', '70', '66', '67', '68', '71', '72', '58',
'59', '60', '61', '62', '63', '64', '65', '57'], dtype=object)
In [214… df["event_match_no"].dtype
# NOTE: the dtype expression above is not echoed; the recorded output shows
# only the value of the cell's last expression (the array carries the dtype).
df["event_match_no"].unique()[:20]
Out[214… <IntegerArray>
[1, 2, 3, 5, 4, 6, 7, 8, 9, 10, 12, 11, 14, 13, 15, 16, 17, 19, 18, 20]
Length: 20, dtype: Int64
In [215… df["runs_target"].dropna().unique()[:10]  # first 10 distinct non-null targets; all shown are whole-number floats
Out[215… array([223., 241., 130., 166., 111., 167., 143., 209., 215., 183.])
In [217… [Link]().sum()
Out[217… 0
Out[218… 10218
[Link] 6/28
3/4/26, 9:29 PM 02_data_cleaning
In [219… critical_cols = [
# Columns checked below for missing values (treated as critical, per the list's name).
"match_id", "date", "batting_team",
"bowling_team", "batter", "bowler",
"runs_total", "team_runs"
]
# Null count per listed column; the recorded output is 0 for all of them.
df[critical_cols].isnull().sum()
Out[219… match_id 0
date 0
batting_team 0
bowling_team 0
batter 0
bowler 0
runs_total 0
team_runs 0
dtype: int64
In [220… df["win_outcome"].isnull().sum()  # number of rows with no win_outcome value
Out[220… 4702
In [221… [Link]().sum().sort_values(ascending=False).head(15)
[Link] 7/28
3/4/26, 9:29 PM 02_data_cleaning
In [222… print(df["gender"].unique(),
# Prints the distinct values of four descriptor columns on one line.
# These are the same four columns listed in cols_to_drop in the next cell.
df["team_type"].unique(),
df["event_name"].unique(),
df["match_type"].unique())
In [223… cols_to_drop = [
"gender",
"team_type",
"event_name",
"match_type"
]
df = [Link](columns=cols_to_drop)
In [224… [Link]
In [225… sorted(df["batting_team"].unique())  # alphabetical list of distinct batting_team names
[Link] 8/28
3/4/26, 9:29 PM 02_data_cleaning
In [227… sorted(df["bowling_team"].unique())  # alphabetical list of distinct bowling_team names
[Link] 9/28
3/4/26, 9:29 PM 02_data_cleaning
In [228… team_mapping = {
# Team name as it appears in the data -> the name it is standardised to.
# NOTE(review): the cell that applies this mapping is not visible in this
# export; later previews show "Royal Challengers Bengaluru" in place of
# "Royal Challengers Bangalore".
"Delhi Daredevils": "Delhi Capitals",
"Kings XI Punjab": "Punjab Kings",
"Royal Challengers Bangalore": "Royal Challengers Bengaluru",
"Rising Pune Supergiant": "Rising Pune Supergiants"
}
In [229… team_columns = [
# Columns that hold a team name. Presumably the columns team_mapping is
# applied to - the applying cell is not visible in this export.
"batting_team",
"bowling_team",
"toss_winner",
"match_won_by",
"superover_winner"
]
# Echo the list as the cell output.
team_columns
[Link] 10/28
3/4/26, 9:29 PM 02_data_cleaning
Out[229… ['batting_team',
'bowling_team',
'toss_winner',
'match_won_by',
'superover_winner']
In [231… sorted(df["batting_team"].unique())  # re-list distinct batting_team names
In [232… sorted(df["match_won_by"].dropna().unique())  # distinct non-null winners, sorted alphabetically
[Link] 11/28
3/4/26, 9:29 PM 02_data_cleaning
In [233… df["batting_team"].nunique()  # number of distinct batting_team names (recorded output: 15)
Out[233… 15
In [234… [Link]()
[Link] 12/28
3/4/26, 9:29 PM 02_data_cleaning
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 278205 entries, 0 to 278204
Data columns (total 58 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 match_id 278205 non-null int64
1 date 278205 non-null datetime64[ns]
2 innings 278205 non-null int64
3 batting_team 278205 non-null object
4 bowling_team 278205 non-null object
5 over 278205 non-null int64
6 ball 278205 non-null int64
7 ball_no 278205 non-null float64
8 batter 278205 non-null object
9 bat_pos 278205 non-null int64
10 runs_batter 278205 non-null int64
11 balls_faced 278205 non-null int64
12 bowler 278205 non-null object
13 valid_ball 278205 non-null int64
14 runs_extras 278205 non-null int64
15 runs_total 278205 non-null int64
16 runs_bowler 278205 non-null int64
17 runs_not_boundary 278205 non-null bool
18 extra_type 15133 non-null object
19 non_striker 278205 non-null object
20 non_striker_pos 278205 non-null int64
21 wicket_kind 13823 non-null object
22 player_out 13823 non-null object
23 fielders 10013 non-null object
24 runs_target 133903 non-null Int64
25 review_batter 872 non-null object
26 team_reviewed 872 non-null object
27 review_decision 872 non-null object
28 umpire 872 non-null object
29 umpires_call 278205 non-null bool
30 player_of_match 278205 non-null object
31 match_won_by 278205 non-null object
32 win_outcome 273503 non-null object
33 toss_winner 278205 non-null object
34 toss_decision 278205 non-null object
35 venue 278205 non-null object
[Link] 13/28
3/4/26, 9:29 PM 02_data_cleaning
In [235… df["stage"].value_counts()  # row count per stage label; "Unknown" dominates in the recorded output
Out[235… stage
Unknown 261492
Final 4338
Qualifier 2 3636
Qualifier 1 3553
Eliminator 2845
Semi Final 1409
Elimination Final 734
3rd Place Play-Off 198
Name: count, dtype: int64
[Link] 14/28
3/4/26, 9:29 PM 02_data_cleaning
In [237… df["stage"].value_counts()  # same counts again; the former "Unknown" rows now appear as "League"
Out[237… stage
League 261492
Final 4338
Qualifier 2 3636
Qualifier 1 3553
Eliminator 2845
Semi Final 1409
Elimination Final 734
3rd Place Play-Off 198
Name: count, dtype: int64
In [238… stage_mapping = {
# Folds "Elimination Final" into the existing "Eliminator" label and renames
# "3rd Place Play-Off" to "Playoff".
"Elimination Final": "Eliminator",
"3rd Place Play-Off": "Playoff"
}
# Labels not listed in the mapping are left unchanged by replace().
df["stage"] = df["stage"].replace(stage_mapping)
In [239… df["stage"].value_counts()  # verify the merge: Eliminator is now 2845 + 734 = 3579 rows
Out[239… stage
League 261492
Final 4338
Qualifier 2 3636
Eliminator 3579
Qualifier 1 3553
Semi Final 1409
Playoff 198
Name: count, dtype: int64
In [240… [Link]()
[Link] 15/28
3/4/26, 9:29 PM 02_data_cleaning
Out[240… match_id date innings batting_team bowling_team over ball ball_no batter bat_pos runs_batter balls_faced bowler valid_
Royal
2008- Kolkata SC P
0 335982 1 Challengers 0 1 0.1 1 0 1
04-18 Knight Riders Ganguly Kumar
Bengaluru
Royal
2008- Kolkata BB P
1 335982 1 Challengers 0 2 0.2 2 0 1
04-18 Knight Riders McCullum Kumar
Bengaluru
Royal
2008- Kolkata BB P
2 335982 1 Challengers 0 3 0.3 2 0 0
04-18 Knight Riders McCullum Kumar
Bengaluru
Royal
2008- Kolkata BB P
3 335982 1 Challengers 0 3 0.3 2 0 1
04-18 Knight Riders McCullum Kumar
Bengaluru
Royal
2008- Kolkata BB P
4 335982 1 Challengers 0 4 0.4 2 0 1
04-18 Knight Riders McCullum Kumar
Bengaluru
In [242… [Link]
In [243… print(df["over"].max(),
# Prints, in order: the largest over index, the distinct values of overs,
# of balls_per_over and of innings.
df["overs"].unique(),
df["balls_per_over"].unique(),df["innings"].unique())
19 [20] [6] [1 2 3 4 5 6]
In [244… df["innings"].value_counts()  # rows per innings number; values 3-6 are rare in the recorded output
[Link] 16/28
3/4/26, 9:29 PM 02_data_cleaning
Out[244… innings
1 144131
2 133903
3 83
4 76
5 8
6 4
Name: count, dtype: int64
In [247… df["is_super_over"].value_counts()  # rows flagged as super-over vs. not
[Link] 17/28
3/4/26, 9:29 PM 02_data_cleaning
Out[247… is_super_over
False 278034
True 171
Name: count, dtype: int64
In [248… df['win_outcome'].unique()  # distinct outcome strings; the output shows "<number> runs" / "<number> wickets" and nan
Out[248… array(['140 runs', '33 runs', '9 wickets', '5 wickets', '6 wickets',
'6 runs', '3 wickets', '66 runs', '7 wickets', '10 wickets',
'4 wickets', '13 runs', '10 runs', '45 runs', '8 wickets',
'9 runs', '3 runs', '29 runs', '5 runs', '18 runs', '23 runs',
'12 runs', '65 runs', '25 runs', '1 runs', '14 runs', '41 runs',
'105 runs', '19 runs', '75 runs', '92 runs', '11 runs', '24 runs',
nan, '27 runs', '38 runs', '8 runs', '78 runs', '16 runs',
'53 runs', '2 wickets', '2 runs', '4 runs', '31 runs', '55 runs',
'98 runs', '34 runs', '36 runs', '17 runs', '39 runs', '40 runs',
'67 runs', '63 runs', '37 runs', '57 runs', '35 runs', '22 runs',
'21 runs', '48 runs', '26 runs', '20 runs', '85 runs', '32 runs',
'76 runs', '111 runs', '82 runs', '43 runs', '58 runs', '28 runs',
'74 runs', '42 runs', '59 runs', '46 runs', '7 runs', '47 runs',
'86 runs', '44 runs', '87 runs', '130 runs', '15 runs', '60 runs',
'77 runs', '30 runs', '50 runs', '93 runs', '72 runs', '62 runs',
'97 runs', '138 runs', '1 wickets', '71 runs', '144 runs',
'80 runs', '51 runs', '61 runs', '146 runs', '64 runs', '102 runs',
'118 runs', '49 runs', '69 runs', '88 runs', '54 runs', '91 runs',
'52 runs', '81 runs', '56 runs', '112 runs', '106 runs',
'100 runs', '83 runs', '110 runs'], dtype=object)
[Link] 18/28
3/4/26, 9:29 PM 02_data_cleaning
In [252… df = [Link](columns=["win_outcome"])
In [253… df["win_margin"].dtype  # recorded output: nullable Int64
Out[253… Int64Dtype()
In [254… df["win_type"].unique()  # distinct win_type values (output not captured in this export)
In [255… df['match_won_by'].unique()  # distinct match_won_by values (output not captured in this export)
[Link] 19/28
3/4/26, 9:29 PM 02_data_cleaning
In [257… df['match_won_by'].unique()  # distinct match_won_by values, listed again (output not captured in this export)
In [258… df['result_type'].unique()  # distinct result_type values (output not captured in this export)
In [260… df["is_dl_method"].value_counts()  # rows per is_dl_method flag value
Out[260… is_dl_method
False 274315
True 3890
Name: count, dtype: int64
In [261… df = [Link](columns=["method"])
In [263… df["result_type"].value_counts()  # rows per result_type: normal / tie / no result in the recorded output
Out[263… result_type
normal 273503
tie 3896
no result 806
Name: count, dtype: int64
In [264… [Link](5)
[Link] 20/28
3/4/26, 9:29 PM 02_data_cleaning
Out[264… match_id date innings batting_team bowling_team over ball batter bat_pos runs_batter balls_faced bowler valid_ball run
Royal
2008- Kolkata SC P
0 335982 1 Challengers 0 1 1 0 1 1
04-18 Knight Riders Ganguly Kumar
Bengaluru
Royal
2008- Kolkata BB P
1 335982 1 Challengers 0 2 2 0 1 1
04-18 Knight Riders McCullum Kumar
Bengaluru
Royal
2008- Kolkata BB P
2 335982 1 Challengers 0 3 2 0 0 0
04-18 Knight Riders McCullum Kumar
Bengaluru
Royal
2008- Kolkata BB P
3 335982 1 Challengers 0 3 2 0 1 1
04-18 Knight Riders McCullum Kumar
Bengaluru
Royal
2008- Kolkata BB P
4 335982 1 Challengers 0 4 2 0 1 1
04-18 Knight Riders McCullum Kumar
Bengaluru
In [265… df['umpire'].unique()  # distinct umpire names, including nan
Out[265… array([nan, 'CB Gaffaney', 'RJ Tucker', 'C Shamshuddin', 'A Deshmukh',
'VA Kulkarni', 'AK Chaudhary', 'CK Nandan', 'NJ Llong', 'S Ravi',
'Nitin Menon', 'KN Ananthapadmanabhan', 'A Nand Kishore',
'VK Sharma', 'YC Barde', 'M Erasmus', 'HDPK Dharmasena',
'BNJ Oxenford', 'AY Dandekar', 'UV Gandhe', 'IJ Gould',
'PR Reiffel', 'RK Illingworth', 'K Srinivasan', 'PG Pathak',
'J Madanagopal', 'Navdeep Singh', 'Tapan Sharma', 'HAS Khalid',
'MA Gough', 'N Pandit', 'R Pandit', 'Chirra Ravikanthreddy',
'NA Patwardhan', 'GR Sadashiv Iyer', 'Vinod Seshan', 'A Totre',
'MV Saidharshan Kumar', 'AG Wharf', 'Abhijit Bhattacharya',
'A Bengeri', 'AT Holdstock', 'K Swaroopanand', 'P Joshi',
'M Krishnadas', 'K Kelkar', 'KM Gandhi', 'Anish Sahasrabudhe'],
dtype=object)
[Link] 21/28
3/4/26, 9:29 PM 02_data_cleaning
In [266… venue_mapping = {
# Relabels the "Mullanpur" spelling of this stadium as "New Chandigarh".
# NOTE(review): the cell that applies this mapping is not visible in this
# export; the output below shows 2561 rows under the "New Chandigarh" label.
"Maharaja Yadavindra Singh International Cricket Stadium, Mullanpur":
"Maharaja Yadavindra Singh International Cricket Stadium, New Chandigarh"
}
Out[268… venue
Maharaja Yadavindra Singh International Cricket Stadium, New Chandigarh 2561
Name: count, dtype: int64
In [269… df['venue'].unique()  # distinct venue labels (output not captured in this export)
[Link] 22/28
3/4/26, 9:29 PM 02_data_cleaning
[Link] 23/28
3/4/26, 9:29 PM 02_data_cleaning
In [270… venue_mapping = {
# Chinnaswamy
"[Link] Stadium": "M Chinnaswamy Stadium, Bengaluru",
"M Chinnaswamy Stadium": "M Chinnaswamy Stadium, Bengaluru",
# Arun Jaitley
"Feroz Shah Kotla": "Arun Jaitley Stadium, Delhi",
"Arun Jaitley Stadium": "Arun Jaitley Stadium, Delhi",
# Chepauk
"MA Chidambaram Stadium": "MA Chidambaram Stadium, Chepauk, Chennai",
"MA Chidambaram Stadium, Chepauk": "MA Chidambaram Stadium, Chepauk, Chennai",
# Punjab
"Punjab Cricket Association Stadium, Mohali":
"Punjab Cricket Association IS Bindra Stadium, Mohali, Chandigarh",
"Punjab Cricket Association IS Bindra Stadium":
"Punjab Cricket Association IS Bindra Stadium, Mohali, Chandigarh",
"Punjab Cricket Association IS Bindra Stadium, Mohali":
"Punjab Cricket Association IS Bindra Stadium, Mohali, Chandigarh",
# Rajiv Gandhi
"Rajiv Gandhi International Stadium":
"Rajiv Gandhi International Stadium, Uppal, Hyderabad",
"Rajiv Gandhi International Stadium, Uppal":
"Rajiv Gandhi International Stadium, Uppal, Hyderabad",
# Wankhede
"Wankhede Stadium":
"Wankhede Stadium, Mumbai",
# Eden
"Eden Gardens":
"Eden Gardens, Kolkata",
# DY Patil
"Dr DY Patil Sports Academy":
"Dr DY Patil Sports Academy, Mumbai",
[Link] 24/28
3/4/26, 9:29 PM 02_data_cleaning
# MCA Pune
"Maharashtra Cricket Association Stadium":
"Maharashtra Cricket Association Stadium, Pune",
# HPCA
"Himachal Pradesh Cricket Association Stadium":
"Himachal Pradesh Cricket Association Stadium, Dharamsala"
}
In [272… df['venue'].unique()  # distinct venue labels, listed again (output not captured in this export)
[Link] 25/28
3/4/26, 9:29 PM 02_data_cleaning
In [273… df['city'].unique()  # distinct city values (output not captured in this export)
[Link] 26/28
3/4/26, 9:29 PM 02_data_cleaning
In [274… city_mapping = {
# City label in the data -> the label it is replaced with.
"Bangalore": "Bengaluru",
"Chandigarh": "Mohali"
}
# Cities not listed in the mapping are left unchanged by replace().
df["city"] = df["city"].replace(city_mapping)
In [275… df["city"].unique()  # distinct city values after the replacement (output not captured in this export)
Out[362… venue
Dubai International Cricket Stadium 8080
Sharjah Cricket Stadium 4317
Name: count, dtype: int64
In [364… city_fill_mapping = {
# Venue label -> city name. Presumably used to fill missing `city` values
# from `venue`; the cell that applies it is not visible in this export.
# NOTE(review): execution counts jump from 275 to 364 here, so this cell was
# run much later than the ones above - confirm it works on a fresh kernel.
"Dubai International Cricket Stadium": "Dubai",
"Sharjah Cricket Stadium": "Sharjah",
"Zayed Cricket Stadium, Abu Dhabi": "Abu Dhabi",
"Sheikh Zayed Stadium": "Abu Dhabi"
}
[Link] 27/28
3/4/26, 9:29 PM 02_data_cleaning
In [366… df[df["city"].isna()]["venue"].value_counts()
In [ ]:
[Link] 28/28