ML Program 1
1. Develop a program to create histograms for all numerical features and analyse the distribution
of each feature. Generate box plots for all numerical features and identify any outliers. Use
the California Housing dataset.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.datasets import fetch_california_housing
# Step 1: Load the California Housing dataset
# The loader is asked for pandas output (as_frame=True); the `frame`
# attribute of the returned object is the DataFrame used by every later step.
data = fetch_california_housing(as_frame=True)
housing_df = data.frame
# Step 2: Create histograms for numerical features
# Keep only the numeric columns; each one gets its own subplot.
numerical_features = housing_df.select_dtypes(include=[np.number]).columns

# The grid is 3 columns wide and the row count is derived from the number of
# features (ceiling division), so the grid never runs out of cells.
# For 7-9 numeric columns this is exactly the original 3x3 layout.
n_cols = 3
n_rows = max(1, -(-len(numerical_features) // n_cols))

# Plot histograms
plt.figure(figsize=(15, 10))
for i, feature in enumerate(numerical_features):
    plt.subplot(n_rows, n_cols, i + 1)  # subplot positions are 1-based
    sns.histplot(housing_df[feature], kde=True, bins=30, color='blue')
    plt.title(f'Distribution of {feature}')
plt.tight_layout()  # run once, after every subplot has been drawn
plt.show()
# Step 3: Generate box plots for numerical features
# One horizontal box plot per numeric column, drawn in a single figure.
# The grid is 3 columns wide; rows are derived from the feature count
# (ceiling division) so there is always a cell for every feature.
# For 7-9 numeric columns this is exactly the original 3x3 layout.
n_cols = 3
n_rows = max(1, -(-len(numerical_features) // n_cols))

# Plot box plots
plt.figure(figsize=(15, 10))
for i, feature in enumerate(numerical_features):
    plt.subplot(n_rows, n_cols, i + 1)  # subplot positions are 1-based
    sns.boxplot(x=housing_df[feature], color='orange')
    plt.title(f'Box Plot of {feature}')
plt.tight_layout()  # run once, after every subplot has been drawn
plt.show()
# Step 4: Identify outliers using the IQR method
# A row is counted as an outlier for a feature when its value lies more than
# 1.5 * IQR below the first quartile or above the third quartile.
print("Outliers Detection:")
outliers_summary = {}  # feature name -> number of outlier rows
for feature in numerical_features:
    Q1 = housing_df[feature].quantile(0.25)
    Q3 = housing_df[feature].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    # Boolean masks combined with `|`; each comparison needs its own
    # parentheses because `|` binds tighter than `<` and `>`.
    outliers = housing_df[
        (housing_df[feature] < lower_bound) | (housing_df[feature] > upper_bound)
    ]
    outliers_summary[feature] = len(outliers)
    print(f"{feature}: {len(outliers)} outliers")
# Optional: Print a summary of the dataset
# Emit the heading first, then the table returned by DataFrame.describe().
print("\nDataset Summary:")
summary_table = housing_df.describe()
print(summary_table)
output: