In [1]:
# prompt: write code which has "team1" = "RCB", "team2" = "PBKS", "Season" = "IPL-2025" and "date" = "6/3/2025" using the deliveries_2008_2023 and matches_2008_2023 dataset which has the similar code structure as "IPL_rcb_vs_dc_analysis_2024.ipynb"
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Load the datasets
deliveries_df = pd.read_csv('deliveries_2008_2025.csv')
matches_df = pd.read_csv('matches_2008_2025.csv')
In [2]:
# Define the specific match details
team1 = "RCB"
team2 = "PBKS"
season = "IPL-2025"
date = "6/3/2025" # Ensure date format matches the 'date' column in matches_df
# Find the specific match in the matches dataset
match_id = matches_df[(matches_df['team1'] == team1) &
(matches_df['team2'] == team2) &
(matches_df['Season'] == season) &
(matches_df['date'] == date)]['match_id'].iloc[0]
print("Unique dates in matches_df:", matches_df['date'].unique())
Unique dates in matches_df: ['22/03/2025' '23/03/2025' '24/03/2025' '25/03/2025' '26/03/2025' '27/03/2025' '28/03/2025' '29/03/2025' '30/03/2025' '31/03/2025' '4/1/2025' '4/2/2025' '4/3/2025' '4/4/2025' '4/5/2025' '4/6/2025' '4/7/2025' '4/8/2025' '4/9/2025' '4/10/2025' '4/11/2025' '4/12/2025' '13-04-2025' '14-04-2025' '15-04-2025' '16-04-2025' '17-04-2025' '18-04-2025' '19-04-2025' '20-04-2025' '21-04-2025' '22-04-2025' '23-04-2025' '24-04-2025' '25-04-2025' '26-04-2025' '27-04-2025' '28-04-2025' '29-04-2025' '30-04-2025' '5/1/2025' '5/2/2025' .................]
In [3]:
# Filter the deliveries dataset for the specific match
match_deliveries_df = deliveries_df[deliveries_df['match_id'] == match_id]
In [4]:
print(match_deliveries_df.head())
match_id Season innings batting_team bowling_team over ball \ 16994 202574 2025 1 RCB PBKS 1 1 16995 202574 2025 1 RCB PBKS 1 1 16996 202574 2025 1 RCB PBKS 1 2 16997 202574 2025 1 RCB PBKS 1 3 16998 202574 2025 1 RCB PBKS 1 4 batsman bowler wide_runs bye_runs noball_runs \ 16994 Phil Salt Arshdeep Singh 1 0 0 16995 Phil Salt Arshdeep Singh 0 0 0 16996 Phil Salt Arshdeep Singh 0 0 0 16997 Phil Salt Arshdeep Singh 0 0 0 16998 Phil Salt Arshdeep Singh 0 0 0 batsman_runs extras_runs total_runs player_dismissed dismissed_by 16994 0 1 1 NaN NaN 16995 0 0 0 NaN NaN 16996 0 0 0 NaN NaN 16997 6 0 6 NaN NaN 16998 2 0 2 NaN NaN
In [5]:
# Find the match_id for RCB vs PBKS on 31-03-2023 in IPL-2023
match_row = matches_df[
(matches_df['team1'] == team1) &
(matches_df['team2'] == team2) &
(matches_df['Season'] == season) &
((matches_df['date'] == date) | (matches_df['date'] == '31-03-2023'))
]
if not match_row.empty:
match_id = match_row.iloc[0]['match_id']
else:
raise ValueError("Match not found for RCB vs PBKS on 31-03-2023 in IPL-2023")
sns.set_style("whitegrid")
# data preparation for run distribution per over
run_distribution = deliveries_df.groupby(['match_id', 'innings', 'batting_team', 'over']).agg({'total_runs': 'sum'}).reset_index()
# Filter for the specific match
match_run_distribution = run_distribution[run_distribution['match_id'] == match_id]
# Rename 'batting_team' to 'team' for plotting
match_run_distribution = match_run_distribution.rename(columns={'batting_team': 'team'})
# plotting run distribution per over for both teams
plt.figure(figsize=(14, 6))
sns.lineplot(data=match_run_distribution, x='over', y='total_runs', hue='team', marker='o')
plt.title('Run Distribution Per Over for Match ID: {}'.format(match_id))
plt.xlabel('Over Number')
plt.ylabel('Runs Scored')
plt.xticks(range(0, 21)) # over numbers from 0 to 20
plt.legend(title='Team')
plt.show()
In [6]:
# calculating top scorers for each team
top_scorers = (
match_deliveries_df.groupby(['batting_team', 'batsman'])
.agg({'batsman_runs': 'sum'})
.reset_index()
.sort_values(by='batsman_runs', ascending=False)
)
plt.figure(figsize=(14, 8))
sns.barplot(data=top_scorers, x='batsman_runs', y='batsman', hue='batting_team', dodge=False)
plt.title('Top Scorers from Each Team')
plt.xlabel('Total Runs')
plt.ylabel('Batter')
plt.legend(title='Team', loc='center right')
plt.show()
top_scorers
Out[6]:
batting_team | batsman | batsman_runs | |
---|---|---|---|
6 | PBKS | Shashank Singh | 61 |
11 | RCB | Kohli | 43 |
2 | PBKS | Josh Inglis | 39 |
4 | PBKS | Prabhsimran | 26 |
16 | RCB | Rajat Patidar | 26 |
13 | RCB | Livingstone | 25 |
5 | PBKS | Priyansh Arya | 24 |
10 | RCB | Jitesh Sharma | 24 |
14 | RCB | Mayank Agarawal | 24 |
17 | RCB | Shepherd | 17 |
15 | RCB | Phil Salt | 16 |
3 | PBKS | Nehal Wadhera | 15 |
8 | PBKS | Stoinis | 6 |
12 | RCB | Krunal Pandya | 4 |
0 | PBKS | Azmatullah | 1 |
9 | RCB | Bhuvneshwar | 1 |
7 | PBKS | Shreyas Iyer | 1 |
18 | RCB | Yash Dayal | 1 |
1 | PBKS | Jamieson | 0 |
In [7]:
# preparing data for bowling analysis
# (already done in previous cells, so this is just for reference)
# Ensure 'wickets_taken' exists in match_deliveries_df
if 'wickets_taken' not in match_deliveries_df.columns:
match_deliveries_df['wickets_taken'] = match_deliveries_df['player_dismissed'].notna().astype(int)
# Use the correct columns for bowling analysis
bowling_stats = match_deliveries_df.groupby(['bowling_team', 'bowler']).agg({
'total_runs': 'sum',
'wickets_taken': 'sum',
'over': 'nunique'
}).reset_index()
bowling_stats
C:\Users\DELL\AppData\Local\Temp\ipykernel_25608\3737641703.py:6: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy match_deliveries_df['wickets_taken'] = match_deliveries_df['player_dismissed'].notna().astype(int)
Out[7]:
bowling_team | bowler | total_runs | wickets_taken | over | |
---|---|---|---|---|---|
0 | PBKS | Arshdeep Singh | 40 | 3 | 4 |
1 | PBKS | Azmatullah | 35 | 1 | 4 |
2 | PBKS | Chahal | 37 | 1 | 4 |
3 | PBKS | Jamieson | 48 | 3 | 4 |
4 | PBKS | Vijaykumar Vyshak | 30 | 1 | 4 |
5 | RCB | Bhuvneshwar | 38 | 2 | 4 |
6 | RCB | Hazlewood | 55 | 1 | 4 |
7 | RCB | Krunal Pandya | 17 | 2 | 4 |
8 | RCB | Shepherd | 30 | 1 | 3 |
9 | RCB | Suyash Sharma | 20 | 0 | 2 |
10 | RCB | Yash Dayal | 24 | 1 | 3 |
In [8]:
# Add economy rate to bowling_stats if not already present
if 'economy_rate' not in bowling_stats.columns:
bowling_stats['economy_rate'] = bowling_stats['total_runs'] / bowling_stats['over']
# Sort for better visualization
bowling_stats_sorted = bowling_stats.sort_values(by='wickets_taken', ascending=False)
# Create the plot
fig, ax1 = plt.subplots(figsize=(14, 8))
# Bar plot for wickets
sns.barplot(data=bowling_stats_sorted, x='bowler', y='wickets_taken', hue='bowling_team', ax=ax1, alpha=0.6)
ax1.set_ylabel('Wickets Taken')
ax1.set_xlabel('Bowler')
ax1.set_title('Bowling Analysis: Wickets and Economy Rate')
ax1.legend(title='Team', bbox_to_anchor=(1.05, 1), loc='upper left')
for item in ax1.get_xticklabels():
item.set_rotation(45)
ax2 = ax1.twinx()
sns.lineplot(data=bowling_stats_sorted, x='bowler', y='economy_rate', marker='o', sort=False, ax=ax2, color='black')
ax2.set_ylabel('Economy Rate')
plt.tight_layout()
plt.show()
In [9]:
# counting dismissal types using the correct column
dismissal_types = match_deliveries_df['dismissed_by'].dropna().value_counts()
plt.figure(figsize=(8, 8))
plt.pie(dismissal_types, labels=dismissal_types.index, autopct='%1.1f%%', startangle=140, colors=sns.color_palette("Set2"))
plt.title('Wickets Percentage By Each Bowler')
plt.show()
In [10]:
# function to calculate partnerships for a match
def calculate_partnerships(df):
partnerships = []
current_partnership = None
prev_batsmen = set()
for i, row in df.iterrows():
striker = row['batsman']
non_striker = None # Not available in this dataset
team = row['batting_team']
runs = row['total_runs']
wicket = row['player_dismissed']
# Start new partnership if needed
if current_partnership is None:
current_partnership = {
'team': team,
'batter1': striker,
'batter2': non_striker,
'runs': 0,
'balls': 0
}
prev_batsmen = {striker, non_striker}
# Add runs and balls
current_partnership['runs'] += runs
current_partnership['balls'] += 1
# If wicket falls, close partnership
if pd.notna(wicket):
partnerships.append(current_partnership)
current_partnership = None
prev_batsmen = set()
# Add last partnership if not ended by wicket
if current_partnership is not None:
partnerships.append(current_partnership)
return partnerships
# Calculate partnerships for the selected match
partnerships_data = calculate_partnerships(match_deliveries_df)
partnerships_df = pd.DataFrame(partnerships_data)
# Filter out significant partnerships (e.g., partnerships with more than 20 runs)
significant_partnerships = partnerships_df[partnerships_df['runs'] > 20]
# Sort by highest runs
significant_partnerships = significant_partnerships.sort_values(by='runs', ascending=False)
plt.figure(figsize=(12, 8))
sns.barplot(data=significant_partnerships, x='runs', y='batter1', hue='team', dodge=False)
plt.title('Significant Batting Partnerships')
plt.xlabel('Runs Scored')
plt.ylabel('Batter 1 (Partnership Initiated)')
plt.legend(title='Team')
plt.show()
In [11]:
# function to classify the phase of the game based on the over number
def classify_phase(over):
if over < 6:
return 'Powerplay'
elif over < 16:
return 'Middle'
else:
return 'Death'
# adding phase information to the dataframe
match_deliveries_df['phase'] = match_deliveries_df['over'].apply(classify_phase)
# grouping data by phase and team to calculate runs and wickets
phase_analysis = match_deliveries_df.groupby(['batting_team', 'phase']).agg({'total_runs': 'sum', 'wickets_taken': 'sum', 'over': 'count'}).rename(columns={'over': 'balls'}).reset_index()
# calculating the run rate
phase_analysis['run_rate'] = (phase_analysis['total_runs'] / phase_analysis['balls']) * 6
# plotting the phase analysis
fig, ax1 = plt.subplots(figsize=(12, 8))
# bar plot for runs scored in each phase
sns.barplot(data=phase_analysis, x='phase', y='total_runs', hue='batting_team', ax=ax1)
ax1.set_title('Phase Analysis: Runs and Wickets')
ax1.set_ylabel('Total Runs')
ax1.set_xlabel('Match Phase')
# line plot for wickets lost
ax2 = ax1.twinx()
sns.lineplot(data=phase_analysis, x='phase', y='wickets_taken', hue='batting_team', marker='o', ax=ax2, legend=False)
ax2.set_ylabel('Wickets Lost')
plt.show()
C:\Users\DELL\AppData\Local\Temp\ipykernel_25608\1756120926.py:11: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy match_deliveries_df['phase'] = match_deliveries_df['over'].apply(classify_phase)
In [12]:
# calculate runs and balls faced for each batter
batter_stats = match_deliveries_df.groupby('batsman').agg({'batsman_runs': 'sum', 'ball': 'count'}).rename(columns={'ball': 'balls_faced'}).reset_index()
# calculate strike rate for each batter (runs per 100 balls)
batter_stats['strike_rate'] = (batter_stats['batsman_runs'] / batter_stats['balls_faced']) * 100
# sorting batters by their strike rate
batter_stats_sorted = batter_stats.sort_values(by='strike_rate', ascending=False)
# displaying calculated strike rates along with runs scored and balls faced
batter_stats_sorted.head(10)
Out[12]:
batsman | batsman_runs | balls_faced | strike_rate | |
---|---|---|---|---|
17 | Stoinis | 6 | 2 | 300.000000 |
3 | Jitesh Sharma | 24 | 11 | 218.181818 |
14 | Shashank Singh | 61 | 30 | 203.333333 |
15 | Shepherd | 17 | 9 | 188.888889 |
4 | Josh Inglis | 39 | 23 | 169.565217 |
13 | Rajat Patidar | 26 | 16 | 162.500000 |
10 | Phil Salt | 16 | 10 | 160.000000 |
8 | Mayank Agarawal | 24 | 19 | 126.315789 |
7 | Livingstone | 25 | 20 | 125.000000 |
12 | Priyansh Arya | 24 | 20 | 120.000000 |
In [13]:
# merging phase information with batter stats
batter_phase_stats = match_deliveries_df.groupby(['batsman', 'phase']).agg({'batsman_runs': 'sum', 'over': 'count'}).rename(columns={'over': 'balls_faced'}).reset_index()
# calculate strike rate for each batsman-phase combination
batter_phase_stats['strike_rate'] = (batter_phase_stats['batsman_runs'] / batter_phase_stats['balls_faced']) * 100
# filtering for top performers based on overall strike rate
top_performers = batter_stats_sorted.head(5)['batsman']
batter_phase_stats_top = batter_phase_stats[batter_phase_stats['batsman'].isin(top_performers)]
# plotting strike rate across different phases for top performers
plt.figure(figsize=(10, 6))
sns.barplot(data=batter_phase_stats_top, x='batsman', y='strike_rate', hue='phase')
plt.title('Strike Rate Across Different Phases for Top Performers')
plt.xlabel('Batsman')
plt.ylabel('Strike Rate')
plt.legend(title='Match Phase')
plt.show()
In [14]:
# calculate cumulative runs and wickets for each ball for both teams
match_deliveries_df['cumulative_runs'] = match_deliveries_df.groupby('batting_team')['total_runs'].cumsum()
match_deliveries_df['cumulative_wickets'] = match_deliveries_df.groupby('batting_team')['wickets_taken'].cumsum()
# separate data for both teams
team1_deliveries = match_deliveries_df[match_deliveries_df['batting_team'] == 'RCB'].copy()
team2_deliveries = match_deliveries_df[match_deliveries_df['batting_team'] == 'PBKS'].copy()
# calculating overs for cumulative analysis
team1_deliveries['over_ball'] = team1_deliveries['over'] + (team1_deliveries['ball'] - 1) / 6
team2_deliveries['over_ball'] = team2_deliveries['over'] + (team2_deliveries['ball'] - 1) / 6
# plotting cumulative run rates and wickets
fig, ax = plt.subplots(figsize=(14, 8))
# plot for RCB
ax.plot(team1_deliveries['over_ball'], team1_deliveries['cumulative_runs'], color='red', label='RCB Runs')
ax.scatter(team1_deliveries[team1_deliveries['wickets_taken'] == 1]['over_ball'], team1_deliveries[team1_deliveries['wickets_taken'] == 1]['cumulative_runs'], color='red', marker='X', s=100, label='RCB Wickets')
# plot for PBKS
ax.plot(team2_deliveries['over_ball'], team2_deliveries['cumulative_runs'], color='darkred', label='PBKS Runs')
ax.scatter(team2_deliveries[team2_deliveries['wickets_taken'] == 1]['over_ball'], team2_deliveries[team2_deliveries['wickets_taken'] == 1]['cumulative_runs'], color='darkred', marker='X', s=100, label='PBKS Wickets')
ax.set_title('Cumulative Runs with Wickets for RCB and PBKS')
ax.set_xlabel('Over')
ax.set_ylabel('Cumulative Runs')
ax.legend()
plt.show()
C:\Users\DELL\AppData\Local\Temp\ipykernel_25608\310014327.py:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy match_deliveries_df['cumulative_runs'] = match_deliveries_df.groupby('batting_team')['total_runs'].cumsum() C:\Users\DELL\AppData\Local\Temp\ipykernel_25608\310014327.py:3: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy match_deliveries_df['cumulative_wickets'] = match_deliveries_df.groupby('batting_team')['wickets_taken'].cumsum()
In [15]:
# calculate runs and wickets per over for both teams
per_over_stats = match_deliveries_df.groupby(['batting_team', 'over']).agg({'total_runs': 'sum', 'wickets_taken': 'sum'}).reset_index()
# calculate run rate for each over (runs per over)
per_over_stats['run_rate'] = per_over_stats['total_runs'] # runs per over
# separate data for RCB and PBKS for plotting
team1_per_over_stats = per_over_stats[per_over_stats['batting_team'] == 'RCB']
team2_per_over_stats = per_over_stats[per_over_stats['batting_team'] == 'PBKS']
# plotting run rates and marking wickets for each team
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(14, 8), sharex=True)
ax1.plot(team1_per_over_stats['over'], team1_per_over_stats['run_rate'], marker='o', color='red', label='RCB Run Rate')
ax1.scatter(team1_per_over_stats[team1_per_over_stats['wickets_taken'] > 0]['over'], team1_per_over_stats[team1_per_over_stats['wickets_taken'] > 0]['run_rate'], color='gold', s=100, label='Wickets')
ax1.set_title('RCB Run Rate Per Over')
ax1.set_ylabel('Runs per Over')
ax1.legend()
ax2.plot(team2_per_over_stats['over'], team2_per_over_stats['run_rate'], marker='o', color='darkred', label='PBKS Run Rate')
ax2.scatter(team2_per_over_stats[team2_per_over_stats['wickets_taken'] > 0]['over'], team2_per_over_stats[team2_per_over_stats['wickets_taken'] > 0]['run_rate'], color='blue', s=100, label='Wickets')
ax2.set_title('PBKS Run Rate Per Over')
ax2.set_xlabel('Over')
ax2.set_ylabel('Runs per Over')
ax2.legend()
plt.show()