Program 4
Write a program to load and explore datasets from CSV and Excel files using pandas.
Step 1: Creating CSV and Excel Files with Dummy Data
Create the CSV file: Open a text editor such as Notepad or any other code editor and enter the following data (keep the values comma-separated, with no spaces after the commas, so that pandas reads the column names cleanly):
Name,Age,Score
Srikanth,28,85
Snigdha,22,78
Mary,31,92
Save this file as sample_data.csv in the C:\ML_Projects directory.
Create the Excel file: Use Microsoft Excel or Google Sheets to create this file and enter the data below:
Name Course Sem
Rajesh BCA 1
Ramesh BCA 2
Swati BCOM 1
Florina BCOM 3
Pooja BBA 2
Raghu BBA 4
Save this file as sample_data.xlsx in the C:\ML_Projects directory.
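Alternatively (an optional sketch, not required by the program statement), both files can be generated programmatically with pandas; writing the .xlsx file assumes the openpyxl package is installed:
import pandas as pd
# Recreate the CSV data from Step 1 and write it to disk
pd.DataFrame({'Name': ['Srikanth', 'Snigdha', 'Mary'],
              'Age': [28, 22, 31],
              'Score': [85, 78, 92]}).to_csv('C:\\ML_Projects\\sample_data.csv', index=False)
# Recreate the Excel data from Step 1 (to_excel needs an engine such as openpyxl)
pd.DataFrame({'Name': ['Rajesh', 'Ramesh', 'Swati', 'Florina', 'Pooja', 'Raghu'],
              'Course': ['BCA', 'BCA', 'BCOM', 'BCOM', 'BBA', 'BBA'],
              'Sem': [1, 2, 1, 3, 2, 4]}).to_excel('C:\\ML_Projects\\sample_data.xlsx', index=False)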
Step 2: Python Code to Load and Explore the Data
import pandas as pd
# Define the file paths (the files created in Step 1)
csv_file_path = 'C:\\ML_Projects\\sample_data.csv'
excel_file_path = 'C:\\ML_Projects\\sample_data.xlsx'
# Load the CSV file
data_csv = pd.read_csv(csv_file_path)
print("CSV File Data:")
print(data_csv)
# Load the Excel file
data_excel = pd.read_excel(excel_file_path)
print("\nExcel File Data:")
print(data_excel)
# Basic Data Exploration
print("\nData Descriptions: ")
print("CSV Data Description:")
print(data_csv.describe())
print("\nExcel Data Description:")
print(data_excel.describe())
# Displaying data types
print("\nData Types in CSV File:")
print(data_csv.dtypes)
print("\nData Types in Excel File:")
print(data_excel.dtypes)
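A few more pandas calls are often useful for a first look at the data; this is an optional extension of the exploration step, not part of the original listing:
print("\nFirst rows of the CSV data:")
print(data_csv.head())
print("\nShape of the Excel data (rows, columns):", data_excel.shape)
print("\nConcise summary of the Excel data:")
data_excel.info()  # info() prints its summary directly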
Program 5
Write a program to visualize a dataset and gain insights using Matplotlib by plotting scatter plots and bar charts.
Step 1: Create the CSV File:
Create a CSV file with the student study-hours and exam-score data below, and save it as study_data.csv in the C:\ML_Projects directory.
Student ID,Study Hours,Exam Score
1,5,82
2,2,48
3,8,90
4,1,35
5,3,50
6,4,66
7,9,95
8,6,75
9,7,88
10,0.5,30
11,10,96
12,0,20
13,12,98
Step 2: Python Code:
import pandas as pd
import matplotlib.pyplot as plt
# Load the data
data = pd.read_csv('C:\\ML_Projects\\study_data.csv')
# Scatter plot of Study Hours vs Exam Scores
plt.figure(figsize=(14, 7))
plt.subplot(1, 2, 1) # 1 row, 2 columns, 1st subplot
plt.scatter(data['Study Hours'], data['Exam Score'], color='dodgerblue', edgecolor='k', alpha=0.7)
plt.title('Study Hours vs. Exam Scores')
plt.xlabel('Study Hours')
plt.ylabel('Exam Scores')
plt.grid(True)
# Bar chart of Average Exam Score by Study Hour Range
# Creating bins for study hour ranges
bins=[0, 2, 4, 6, 8, 10, 12]
labels =['0-2', '2-4', '4-6', '6-8', '8-10', '10-12']
data['Study Hour Range'] = pd.cut(data['Study Hours'], bins=bins, labels=labels, include_lowest=True)  # right edges inclusive, so a 12-hour value falls in the '10-12' bin
grouped_data = data.groupby('Study Hour Range')['Exam Score'].mean()
plt.subplot(1, 2, 2) # 1 row, 2 columns, 2nd subplot
grouped_data.plot(kind='bar', color='salmon')
plt.title('Average Exam Score by Study Hour Range')
plt.xlabel('Study Hour Range')
plt.ylabel('Average Exam Score')
plt.xticks(rotation=0) # Keep the category labels horizontal
plt.tight_layout() # Adjust subplots to fit into figure area.
plt.show()
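As an optional numeric complement to the plots (an extra step, not part of the original listing), the Pearson correlation between the two columns quantifies the trend the scatter plot suggests:
print("Correlation between study hours and exam scores:",
      data['Study Hours'].corr(data['Exam Score']))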
Program 6
Write a program to handle missing data, encode categorical variables, and perform feature scaling.
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
# Create dummy data
data = {
    'Age': [25, 30, None, 28, 35],
    'Gender': ['Female', 'Male', 'Male', 'Female', 'Male'],
    'Income': [50000, 60000, 45000, None, 70000]
}
df = pd.DataFrame(data)
# Handling missing data: fill the missing numeric values with the column mean
imputer = SimpleImputer(strategy='mean')
df[['Age', 'Income']] = imputer.fit_transform(df[['Age', 'Income']])
# Print data after handling missing values
print("Data after handling missing values:")
print(df)
# Encoding categorical variables
encoder = OneHotEncoder()
encoded_data = encoder.fit_transform(df[['Gender']]).toarray()
# Print data after categorical encoding
encoded_df = pd.DataFrame(encoded_data, columns=encoder.get_feature_names_out(['Gender']))
print("\nData after categorical encoding:")
print(encoded_df)
# Feature scaling
scaler = StandardScaler()
scaled_data = scaler.fit_transform(df[['Age', 'Income']])
# Print data after feature scaling
scaled_df = pd.DataFrame(scaled_data, columns=['Scaled Age', 'Scaled Income'])
print("\nData after feature scaling:")
print(scaled_df)
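The same three preprocessing steps can also be chained into a single object with scikit-learn's ColumnTransformer and Pipeline; the sketch below is an optional alternative that reuses the df created above:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
numeric = Pipeline([('impute', SimpleImputer(strategy='mean')), ('scale', StandardScaler())])
preprocess = ColumnTransformer([('num', numeric, ['Age', 'Income']),
                                ('cat', OneHotEncoder(), ['Gender'])])
print("\nImputed, scaled and encoded features in one step:")
print(preprocess.fit_transform(df))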
Program 7
Write a program to implement a k-Nearest Neighbours (k-NN) classifier using scikit-learn, train the classifier on a dataset, and evaluate its performance.
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
# Dummy student data: features are [exam score 1, exam score 2]; the target is pass/fail
X = np.array([[80, 75], [95, 90], [60, 50], [45, 30], [30, 40], [85, 95], [70, 60], [50, 55],
[40, 45], [60, 70]])
y = np.array([1, 1, 0, 0, 0, 1, 1, 0, 0, 1]) # Binary classes for demonstration
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Initialize the k-NN classifier with k=3
knn = KNeighborsClassifier(n_neighbors=3)
# Train the classifier on the training data
knn.fit(X_train, y_train)
# Evaluate the classifier's performance
y_pred = knn.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy on the test set: {:.2f}".format(accuracy))
# Take user input for exam scores
exam_score1 = float(input("Enter Exam Score 1: "))
exam_score2 = float(input("Enter Exam Score 2: "))
# Prepare the user input for prediction
user_input = np.array([[exam_score1, exam_score2]])
# Use the trained k-NN classifier to predict the outcome
predicted_outcome = knn.predict(user_input)
if predicted_outcome[0] == 1:
    print("Based on the exam scores provided, the student is predicted to pass.")
else:
    print("Based on the exam scores provided, the student is predicted to fail.")
Program 8
Write a program to implement a linear regression model for regression tasks and train the model on a dataset with continuous target variables.
import numpy as np
from sklearn.linear_model import LinearRegression
# Dummy house price prediction data: features are (house size, number of bedrooms), target is house price
X = np.array([[1000, 2], [1500, 3], [1200, 2], [1800, 4], [900, 2], [2000, 3]])
y = np.array([300000, 400000, 350000, 500000, 280000, 450000])
# Initialize the Linear Regression model
model = LinearRegression()
# Train the model on the dataset
model.fit(X, y)
# Take input from the user for new house data
size = float(input("Enter the size of the house in sqft: "))
bedrooms = int(input("Enter the number of bedrooms: "))
new_data = np.array([[size, bedrooms]])
# Predict the price for the new house data
predicted_price = model.predict(new_data)
# Print the predicted price for the new house data
print("Predicted price for a house with size {} sqft and {} bedrooms: Rs.{:.2f}". format (size,
bedrooms, predicted_price[0]))
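To see what the model learned, its fitted parameters can optionally be printed after training (an illustrative extra, not required by the program statement):
print("Learned coefficients [price per sqft, price per bedroom]:", model.coef_)
print("Learned intercept:", model.intercept_)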
Program 9
Write a program to implement a decision tree classifier using scikit-learn, visualize the decision tree, and understand its splits.
import numpy as np
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.tree import export_text
import matplotlib.pyplot as plt
# Custom dummy data for fruit classification
#Features: [Weight, Texture] -> Target: [Fruit Type]
x = np.array([[150, 0], [170, 1], [120, 0], [140, 1], [200, 1], [130, 0]])
y = np.array(['Apple', 'Orange', 'Apple', 'Orange', 'Melon', 'Apple'])
# Initialize the Decision Tree Classifier
clf = DecisionTreeClassifier(random_state=42)
clf.fit(x, y)
# Visualize the Decision Tree splits
tree_rules = export_text(clf, feature_names=['Weight', 'Texture'])
print("Decision Tree Classifier Rules: \n", tree_rules)
# Plot the Decision Tree
plt.figure(figsize=(10, 6))
plot_tree(clf, filled=True, feature_names=['Weight', 'Texture'], class_names=np.unique(y))
plt.show()
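As an optional follow-up (not part of the original listing), the trained tree can classify a new fruit from its weight and texture, using the same 0/1 texture encoding as the training data:
new_fruit = np.array([[160, 1]])  # hypothetical example: weight 160 with texture code 1
print("Predicted fruit type:", clf.predict(new_fruit)[0])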
Program 10
Write a program to implement K-Means clustering and visualize the clusters.
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
# Generate dummy customer data (Age, Income)
X = np.array([[30, 50000], [35, 60000], [40, 80000], [25, 30000], [45, 100000], [20, 20000], [50,
120000], [55, 150000], [60, 140000], [28, 40000]])
# Initialize K-Means with 3 clusters
kmeans = KMeans(n_clusters=3, random_state=0)
kmeans.fit(X)
# Get cluster labels and cluster centers
labels = kmeans.labels_
centers = kmeans.cluster_centers_
# Visualize the clusters
plt.figure(figsize=(8, 6))
plt.scatter(X[:, 0], X[:, 1], c=labels, cmap='viridis', s=50, alpha=0.8)
plt.scatter(centers[:, 0], centers[:, 1], c='red', s=200, marker='X', label='Centroids')
plt.xlabel('Age')
plt.ylabel('Income')
plt.title('K-Means Clustering of Customers')
plt.legend()
plt.show()
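To justify the choice of three clusters, an optional elbow-method sketch (not part of the original listing) prints the K-Means inertia for several values of k on the same data; lower inertia means tighter clusters, and the "elbow" where improvement levels off suggests a good k:
for k in range(1, 6):
    inertia = KMeans(n_clusters=k, random_state=0, n_init=10).fit(X).inertia_
    print("k = {}: inertia = {:.2f}".format(k, inertia))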