Tushar ML
PRACTICAL-1
Aim:- Study descriptive statistics and data preprocessing concepts using well-known datasets like
the Titanic or Boston house price dataset.
Solution:-
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# Load the Titanic dataset from seaborn
titanic = sns.load_dataset('titanic')
# Display the first few rows of the dataset
print("Dataset Preview:")
print(titanic.head())
# Compute descriptive statistics
print("\nDescriptive Statistics:")
print(titanic.describe())
# Handling missing data
print("\nHandling Missing Data:")
print("Number of missing values in each column:")
print(titanic.isnull().sum())
# Drop columns with too many missing values (e.g., 'deck' in this case)
titanic = titanic.drop(columns=['deck'])
# Fill missing values in the 'age' column with the median
titanic['age'].fillna(titanic['age'].median(), inplace=True)
# Fill missing values in the 'embarked' column with the most common value
titanic['embarked'].fillna(titanic['embarked'].mode()[0], inplace=True)
# Handling categorical data
print("\nHandling Categorical Data:")
# Convert 'sex' and 'embarked' to numerical values using one-hot encoding
titanic = pd.get_dummies(titanic, columns=['sex', 'embarked'], drop_first=True)
# Display the modified dataset
print(titanic.head())
# Partitioning dataset into training and test datasets
print("\nPartitioning into Training and Test Datasets:")
X = titanic.drop('survived', axis=1)
y = titanic['survived']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Bringing features onto the same scale (Standardization)
print("\nBringing Features onto the Same Scale:")# Select only numeric columns for
scaling
numeric_columns = X_train.select_dtypes(include=['float64', 'int64']).columnsscaler =
StandardScaler()
X_train_scaled = X_train.copy()X_test_scaled = X_test.copy()
X_train_scaled[numeric_columns] = scaler.fit_transform(X_train[numeric_columns])
X_test_scaled[numeric_columns] = scaler.transform(X_test[numeric_columns])
# Display the first few rows of the scaled training set print("Scaled Training Set
Preview:") print(X_train_scaled.head())
OUTPUT:-
PRACTICAL-2
Aim:- Implement a linear regression with one variable algorithm from scratch using Python. Given a dataset of X and Y values, create a linear regression model that predicts Y based on X without using any machine learning libraries like sklearn.
Solution:-
import numpy as np
import matplotlib.pyplot as plt
def linear_regression(X, Y, learning_rate, epochs):
    # Initialize parameters
    m = 0
    b = 0
    n = len(X)
    # Gradient descent
    for _ in range(epochs):
        Y_pred = m * X + b
        dm = (-2 / n) * np.sum(X * (Y - Y_pred))
        db = (-2 / n) * np.sum(Y - Y_pred)
        m -= learning_rate * dm
        b -= learning_rate * db
    return m, b
def plot_regression_line(X, Y, m, b):
    plt.scatter(X, Y, color='darkorange', label='Actual data')
    plt.plot(X, m * X + b, color='royalblue', label='Regression line')
    plt.xlabel('X')
    plt.ylabel('Y')
    plt.title('Linear Regression with One Variable')
    plt.legend()
    plt.show()

def main():
    # Generate synthetic data
    np.random.seed(42)
    X = 2 * np.random.rand(100, 1)
    Y = 4 + 3 * X + np.random.randn(100, 1)
    # Flatten X and Y to ensure they are 1-D arrays for ease of calculations
    X = X.flatten()
    Y = Y.flatten()
    # Train model
    learning_rate = 0.01
    epochs = 1000
    m, b = linear_regression(X, Y, learning_rate, epochs)
    # Output results
    print(f"Slope (m): {m:.4f}, Intercept (b): {b:.4f}")
    # Plot
    plot_regression_line(X, Y, m, b)

if __name__ == "__main__":
    main()
OUTPUT:-
PRACTICAL-3
Aim:- Implement the gradient descent algorithm for linear regression with one variable from scratch in vectorized form. Train a linear regression model using gradient descent to find the optimal coefficients (slope and intercept) for a given dataset.
Solution:-
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

def normalize_feature(X):
    mean = np.mean(X, axis=0)
    std = np.std(X, axis=0)
    normalized_X = (X - mean) / std
    return normalized_X, mean, std

def add_intercept(X):
    return np.c_[np.ones(X.shape[0]), X]

def compute_cost(X, y, theta):
    m = len(y)
    error = X @ theta - y
    cost = (1 / (2 * m)) * np.sum(error**2)
    return cost

def gradient_descent(X, y, theta, learning_rate, iterations):
    m = len(y)
    cost_history = np.zeros(iterations)
    for i in range(iterations):
        error = X @ theta - y
        gradient = (1 / m) * (X.T @ error)
        theta = theta - learning_rate * gradient
        cost_history[i] = compute_cost(X, y, theta)
    return theta, cost_history
# Load the diabetes dataset
diabetes = datasets.load_diabetes()
X = diabetes.data[:, np.newaxis, 2]  # Use only one feature for simplicity
y = diabetes.target
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Normalize features and add intercept term
X_train_normalized, mean, std = normalize_feature(X_train)
X_train_normalized = add_intercept(X_train_normalized)
# Initialize parameters
theta_initial = np.zeros(X_train_normalized.shape[1])
# Set hyperparameters
learning_rate = 0.01
iterations = 1500
# Run gradient descent
theta_optimal, cost_history = gradient_descent(X_train_normalized, y_train, theta_initial, learning_rate, iterations)
# Display the optimized coefficients
print("Optimal Coefficients (Intercept, Slope):", theta_optimal)# Plot the cost history to
visualize convergence
plt.plot(np.arange(1, iterations + 1), cost_history, label='Cost History')
plt.xlabel('Iterations')
plt.ylabel('Cost')
plt.title('Gradient Descent Convergence')
plt.legend()
plt.show()
OUTPUT:-
PRACTICAL-4
Aim:- Use the sklearn library to create a linear regression with multiple variables. Load a well-known dataset, split it into training and testing sets, and then train the model to predict a target variable based on one or more features. You can use the following data set:
https://raw.githubusercontent.com/sachinmotwani20/NPTELML_Datasets/main/ScoresPrediction.csv
Solution:-
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
# Load the dataset from the provided URL
url = "https://raw.githubusercontent.com/sachinmotwani20/NPTELML_Datasets/main/ScoresPrediction.csv"
df = pd.read_csv(url)
# Display the column names in the dataset
print("\nColumn Names:", df.columns)
# Extract features and target variable
X = df.drop("FinalYrScore", axis=1)
# Modify "FinalYrScore" based on the actual column name
y = df["FinalYrScore"]
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Create a linear regression model
model = LinearRegression()
# Train the model on the training set
model.fit(X_train, y_train)
# Make predictions on the testing set
y_pred = model.predict(X_test)
# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
print("\nMean Squared Error on Test Set:", mse)# Visualize predictions vs. actual values
plt.scatter(y_test, y_pred)
plt.xlabel("Actual Scores") plt.ylabel("Predicted Scores") plt.title("Actual Scores vs.
Predicted Score
OUTPUT:-
PRACTICAL-5
Aim:- Use the sklearn library to create a logistic regression model for binary
classification. Load a dataset with two classes, pre-process the data, split it into training
and testing sets, and then train the model to predict the class labels. You can use any data
set.
Solution:-
# Import necessary libraries
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
# Step 1: Load the wine dataset
wine = load_wine()
X = wine.data
y = wine.target
binary_filter = (y == 0) | (y == 1)
X_binary = X[binary_filter]
y_binary = y[binary_filter]
# Step 2: Pre-process the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_binary)
# Step 3: Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_binary, test_size=0.3,
random_state=42)
# Step 4: Train the logistic regression model
model = LogisticRegression(max_iter=10000)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred)}") print("Classification Report:")
print(classification_report(y_test, y_pred))
OUTPUT:-
PRACTICAL-6
Aim:- Implement logistic regression from scratch for binary classification. Given a
dataset with two classes, create a logistic regression model that predicts the probability
of an example belonging to one class. Implement the logistic function and gradient
descent for optimization.
Solution:-
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
cancer = load_breast_cancer()
X = cancer.data
y = cancer.target
X_bias = np.c_[np.ones(X.shape[0]), X]
X_train, X_test, y_train, y_test = train_test_split(X_bias, y, test_size=0.2, random_state=42)
def sigmoid(z):
    return 1 / (1 + np.exp(-z))

def gradient_descent(X, y, theta, learning_rate, iterations):
    m = len(y)
    cost_history = np.zeros(iterations)
    for i in range(iterations):
        z = np.dot(X, theta)
        h = sigmoid(z)
        gradient = np.dot(X.T, (h - y)) / m
        theta -= learning_rate * gradient
        cost_history[i] = -1 / m * np.sum(y * np.log(h) + (1 - y) * np.log(1 - h))
    return theta, cost_history

theta_initial = np.zeros(X_train.shape[1])
learning_rate = 0.01
iterations = 500
theta_optimal, cost_history = gradient_descent(X_train, y_train, theta_initial, learning_rate, iterations)
y_pred_prob = sigmoid(np.dot(X_test, theta_optimal))
y_pred = (y_pred_prob >= 0.5).astype(int)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
OUTPUT:-
PRACTICAL-7
Aim:- Write a program to implement the naïve Bayesian classifier for a sample training data set stored as a .CSV file. Compute the accuracy of the classifier, considering a few test data sets.
Solution:-
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
iris = load_iris()
X = pd.DataFrame(iris.data, columns=iris.feature_names)
y = pd.Series(iris.target, name='target')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
model = GaussianNB()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)
class_report = classification_report(y_test, y_pred, target_names=iris.target_names)
print("Classification Report:")
OUTPUT:-
PRACTICAL-8
Aim:- Implement k-fold cross-validation (e.g., 5-fold) for the Naïve Bayesian classifier on a given dataset. Calculate the average accuracy of the classifier over the k folds and report the results.
Solution:-
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
iris = load_iris()
X = pd.DataFrame(iris.data, columns=iris.feature_names)
y = pd.Series(iris.target, name='target')
nb_classifier = GaussianNB()
num_folds = 5
stratified_kfold = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=42)
accuracy_scores = []
for train_index, test_index in stratified_kfold.split(X, y):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    nb_classifier.fit(X_train, y_train)
    y_pred = nb_classifier.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    accuracy_scores.append(accuracy)
average_accuracy = sum(accuracy_scores) / num_folds
print(f"Average Accuracy over {num_folds}-fold Cross-Validation: {average_accuracy:.4f}")
OUTPUT:-
PRACTICAL-9
Solution:-
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

def euclidean_distance(x1, x2):
    return np.sqrt(np.sum((x1 - x2)**2))

class KNNClassifier:
    def __init__(self, k=3):
        self.k = k
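    # NOTE: the original listing breaks off after the constructor; the fit/predict methods and
    # the driver below are a minimal sketch (assumed), not part of the original listing.
    def fit(self, X, y):
        # Lazy learner: simply store the training data
        self.X_train = np.asarray(X)
        self.y_train = np.asarray(y)

    def predict(self, X):
        return np.array([self._predict_one(x) for x in np.asarray(X)])

    def _predict_one(self, x):
        # Distances from x to every stored training sample
        distances = [euclidean_distance(x, x_train) for x_train in self.X_train]
        # Majority vote over the labels of the k nearest neighbours
        k_indices = np.argsort(distances)[:self.k]
        k_labels = self.y_train[k_indices]
        values, counts = np.unique(k_labels, return_counts=True)
        return values[np.argmax(counts)]

# Illustrative driver on the Iris dataset (assumed)
iris = load_iris()
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.3, random_state=42)
knn = KNNClassifier(k=3)
knn.fit(X_train, y_train)
print("Accuracy:", accuracy_score(y_test, knn.predict(X_test)))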
OUTPUT:-
PRACTICAL-10
Solution:-
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
# Load the Breast Cancer dataset
cancer = datasets.load_breast_cancer()
X = cancer.data
y = cancer.target
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Function to train and evaluate SVM classifier
def train_and_evaluate_svm(kernel):
    svm_classifier = SVC(kernel=kernel, random_state=42)
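    # NOTE: the original listing breaks off here; the rest of this function and the kernel
    # loop below are a minimal sketch (assumed), not the original author's code.
    svm_classifier.fit(X_train, y_train)
    y_pred = svm_classifier.predict(X_test)
    print(f"\nKernel: {kernel}")
    print("Accuracy :", accuracy_score(y_test, y_pred))
    print("Precision:", precision_score(y_test, y_pred))
    print("Recall   :", recall_score(y_test, y_pred))
    print("F1-score :", f1_score(y_test, y_pred))
    print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))

# Evaluate a few common kernels (assumed choice of kernels)
for kernel in ['linear', 'rbf', 'poly']:
    train_and_evaluate_svm(kernel)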
OUTPUT:-
PRACTICAL-11
Aim:-Take a binary classification dataset and implement both the K-Nearest Neighbors
(KNN) and Support Vector Machine (SVM) classifiers using Python. Compare the
performance of these two algorithms on metrics such as accuracy, precision, recall, and
F1-score. Visualize the decision boundaries for both algorithms and discuss the strengths
and weaknesses of each approach.
2. No Training Period
KNN is a lazy learner, meaning it doesn't require a training phase. The model directly
learns from the training data during prediction.
Weaknesses
1. Computationally Intensive
As the size of the dataset grows, the computational cost of KNN increases since it needs
to calculate distances for each prediction.
2. Sensitive to Outliers
KNN is sensitive to outliers and noisy data, as they can significantly impact the distance
calculations.
3. Versatility
SVM can be applied to various types of data, including linear and non-linear classification and regression tasks.
Weaknesses
1. Computational Complexity
Training an SVM can be computationally intensive, especially for large datasets.
2. Choice of Kernel and Parameters
The performance of SVM is sensitive to the choice of the kernel and its parameters.
Selecting the right combination requires experimentation.
3. Limited Interpretability
SVM models may be less interpretable compared to simpler models like linear
regression.
1. Performance
In terms of performance, the choice between KNN and SVM depends on the specific characteristics of the dataset. KNN may work well for simpler datasets with local patterns, while SVM may excel in tasks with more complex decision boundaries.
2. Computational Cost:
KNN is computationally expensive, especially for large datasets, as it requires calculating
distances for each prediction. SVM, while also computationally intensive, can be more
efficient for large datasets.
3. Robustness
KNN is sensitive to outliers and noisy data, while SVM is more robust due to its emphasis
on the margin.
4. Interpretability
KNN is more interpretable, as its decision is based on the majority class among the k-nearest neighbors. SVM, being a more complex model, may be less interpretable.
5. Dataset Characteristics
Consider the nature of the dataset. If the decision boundary is expected to be locally smooth, KNN might be more appropriate. If the dataset has high dimensionality or requires a non-linear decision boundary, SVM might be a better choice.
In summary, the choice between KNN and SVM depends on factors such as the dataset
size, complexity, and characteristics. Both algorithms have their strengths and
weaknesses, and the selection should be made based on the specific requirements of the
task at hand.
Solution:-
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
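Only the imports survive in the listing above. A minimal sketch of the comparison itself is given below; the 80/20 split, k=5 for KNN, the RBF kernel for SVM, and the grouped bar chart are assumptions rather than the original code.
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Load the data and split it (assumed 80/20 split)
X, y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

models = {"KNN": KNeighborsClassifier(n_neighbors=5), "SVM": SVC(kernel='rbf', random_state=42)}
results = {}
for name, model in models.items():
    # Train each classifier and collect the four comparison metrics
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    results[name] = [accuracy_score(y_test, y_pred), precision_score(y_test, y_pred),
                     recall_score(y_test, y_pred), f1_score(y_test, y_pred)]
    print(name, results[name])

# Simple grouped bar chart comparing the two classifiers on the four metrics
metrics = ["Accuracy", "Precision", "Recall", "F1-score"]
x = np.arange(len(metrics))
plt.bar(x - 0.2, results["KNN"], width=0.4, label="KNN")
plt.bar(x + 0.2, results["SVM"], width=0.4, label="SVM")
plt.xticks(x, metrics)
plt.legend()
plt.title("KNN vs SVM on the breast-cancer test set")
plt.show()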
OUTPUT:-
PRACTICAL-12
Aim:- Given a dataset containing features and labels, implement a Random Forest
classification model using Python and a library like scikit-learn. Split the dataset into
training and testing sets, train the model, and evaluate its performance using metrics like
accuracy, precision, and recall.
Solution:-
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
# Load the Iris dataset
iris = load_iris()
X = iris.data  # Features
y = iris.target  # Labels
# Split the dataset into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Initialize the Random Forest classifier
random_forest_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
# Train the model
random_forest_classifier.fit(X_train, y_train)
# Make predictions on the testing set
y_pred = random_forest_classifier.predict(X_test)
# Evaluate and print performance metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix:")
print(conf_matrix)
OUTPUT:-
PRACTICAL-13
Aim: Extend the previous exercise by performing hyperparameter tuning for the Random Forest
classifier. Experiment with different values for hyperparameters like the number of trees, maximum
depth of trees, and minimum samples per leaf. Determine the best combination of hyperparameters
that maximizes the classification accuracy on the test dataset.
Solution
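No listing is included for this practical. The sketch below is a minimal illustration using GridSearchCV on the Iris Random Forest from the previous practical; the parameter grid values and the use of 5-fold cross-validation are assumptions.
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Load and split the data as in the previous practical
X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Candidate hyperparameter values (assumed ranges)
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 3, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

# 5-fold cross-validated grid search over all combinations
grid_search = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

print("Best hyperparameters:", grid_search.best_params_)
print("Best cross-validated accuracy:", grid_search.best_score_)

# Evaluate the best combination on the held-out test set
best_model = grid_search.best_estimator_
print("Test accuracy:", accuracy_score(y_test, best_model.predict(X_test)))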
PRACTICAL-14
Aim: Using a dataset with a large number of features, apply a Random Forest classifier to perform
feature importance analysis. Identify and rank the most important features in the dataset. Visualize
the feature importances to gain insights into which features are most influential for classification.
Solution
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import SelectFromModel
# Load the Breast Cancer dataset
data = load_breast_cancer()
X, y = data.data, data.target
# Split the dataset into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Initialize the Random Forest classifier
random_forest_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
# Train the model
random_forest_classifier.fit(X_train, y_train)
# Make predictions on the testing set
y_pred = random_forest_classifier.predict(X_test)
# Evaluate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")
# Feature Importance Analysis
feature_importances = random_forest_classifier.feature_importances_
feature_names = data.feature_names
# Create a DataFrame for better visualization
feature_importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importances})
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)
# Print and visualize the top features
print("\nTop 10 Features:")
print(feature_importance_df.head(10))
# Visualize Feature Importances
plt.figure(figsize=(10, 6))
plt.barh(feature_importance_df['Feature'], feature_importance_df['Importance'], color='skyblue')
plt.xlabel('Importance')
plt.title('Feature Importances')
plt.show()
OUTPUT
PRACTICAL-15
Aim: Build a feedforward artificial neural network (ANN) from scratch using Python. Implement the
Backpropagation algorithm to train the network. Use a simple binary classification dataset (e.g., the
XOR problem) to test and validate the network's training process and accuracy.
Solution
import numpy as np

class NeuralNetwork:
    def __init__(self, input_size, hidden_size, output_size, learning_rate):
        # Initialize weights and biases
        self.weights_input_hidden = np.random.rand(input_size, hidden_size)
        self.bias_hidden = np.zeros((1, hidden_size))
        self.weights_hidden_output = np.random.rand(hidden_size, output_size)
        self.bias_output = np.zeros((1, output_size))
        # Learning rate
        self.learning_rate = learning_rate

    def sigmoid(self, x):
        return 1 / (1 + np.exp(-x))

    def sigmoid_derivative(self, x):
        return x * (1 - x)

    def forward(self, X):
        # Forward pass
        self.hidden_layer_input = np.dot(X, self.weights_input_hidden) + self.bias_hidden
        self.hidden_layer_output = self.sigmoid(self.hidden_layer_input)
        self.output_layer_input = np.dot(self.hidden_layer_output, self.weights_hidden_output) + self.bias_output
        self.predicted_output = self.sigmoid(self.output_layer_input)
        return self.predicted_output
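    # NOTE: the original listing omits the backpropagation step and the XOR data; the methods
    # and lines below are a minimal sketch (assumed), matching the forward pass and
    # sigmoid_derivative defined above.
    def backward(self, X, y):
        # Output-layer error and delta
        output_error = y - self.predicted_output
        output_delta = output_error * self.sigmoid_derivative(self.predicted_output)
        # Hidden-layer error and delta
        hidden_error = np.dot(output_delta, self.weights_hidden_output.T)
        hidden_delta = hidden_error * self.sigmoid_derivative(self.hidden_layer_output)
        # Weight and bias updates (gradient descent on the squared error)
        self.weights_hidden_output += self.learning_rate * np.dot(self.hidden_layer_output.T, output_delta)
        self.bias_output += self.learning_rate * np.sum(output_delta, axis=0, keepdims=True)
        self.weights_input_hidden += self.learning_rate * np.dot(X.T, hidden_delta)
        self.bias_hidden += self.learning_rate * np.sum(hidden_delta, axis=0, keepdims=True)

    def train(self, X, y, epochs):
        for _ in range(epochs):
            self.forward(X)
            self.backward(X, y)

# XOR dataset and network sizes (assumed; the original lines are missing)
X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
y = np.array([[0], [1], [1], [0]])
input_size = 2
hidden_size = 4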
output_size = 1
learning_rate = 0.1
epochs = 10000
neural_network = NeuralNetwork(input_size, hidden_size, output_size, learning_rate)  # construction line missing from the original listing
neural_network.train(X, y, epochs)
# Test the trained network
predictions = neural_network.forward(X)
print("\nPredictions:")
print(predictions)
PRACTICAL -16
Aim: Create a multilayer perceptron (MLP) neural network using a deep learning framework like
TensorFlow or PyTorch. Train the MLP on a dataset of handwritten digit images (e.g., MNIST or
Fashion MNIST) using the Backpropagation algorithm. Evaluate the model's accuracy on a separate
test dataset and visualize the results.
Solution
import tensorflow as tf
from tensorflow.keras import layers, models
x_train, x_test = x_train / 255.0, x_test / 255.0 # Normalize pixel values to between 0 and 1
# Build the MLP model
model = models.Sequential([
loss='sparse_categorical_crossentropy',
metrics=['accuracy'])
model.fit(x_train, y_train, epochs=5)
# Evaluate the model on the test set
cmap='gray')
plt.title(f"Predicted:
{np.argmax(predictions[i])}\nActual: {y_test[i]}")
plt.axis('off')
plt.show()
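The listing above survives only in fragments (the Sequential layers, the compile call, the dataset loading, and the evaluation are partly missing). A complete minimal sketch of the same exercise is given below; the network architecture (one hidden Dense layer of 128 units), the Adam optimizer, and the use of the MNIST dataset are assumptions.
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras import layers, models

# Load MNIST and normalize pixel values to [0, 1]
(x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()
x_train, x_test = x_train / 255.0, x_test / 255.0

# A simple MLP: flatten the 28x28 images, one hidden layer, softmax output
model = models.Sequential([
    layers.Flatten(input_shape=(28, 28)),
    layers.Dense(128, activation='relu'),
    layers.Dense(10, activation='softmax'),
])
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

model.fit(x_train, y_train, epochs=5)

# Evaluate on the held-out test set
test_loss, test_acc = model.evaluate(x_test, y_test)
print("Test accuracy:", test_acc)

# Visualize a few predictions
predictions = model.predict(x_test)
for i in range(5):
    plt.subplot(1, 5, i + 1)
    plt.imshow(x_test[i], cmap='gray')
    plt.title(f"Predicted: {np.argmax(predictions[i])}\nActual: {y_test[i]}")
    plt.axis('off')
plt.show()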
Output
PRACTICAL-17
Aim: Apply k-Means algorithm and Hierarchical Clustering algorithm to cluster a set of data stored
in a .CSV file. Use the same data set for clustering. Compare the results of these two algorithms and
comment on the quality of clustering.
Solution
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from scipy.cluster.hierarchy import dendrogram, linkage
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
file_path = 'Iris.csv'  # Replace with the actual path to your CSV file
data = pd.read_csv(file_path)
# Assuming the last column contains non-numeric labels
label_column = data.columns[-1]
# Encode non-numeric labels to numeric values
label_encoder = LabelEncoder()
data[label_column] = label_encoder.fit_transform(data[label_column])
# Exclude non-numeric columns
X = data.iloc[:, :-1].values
# Number of clusters for k-Means
k_clusters = 3
# Apply k-Means algorithm
kmeans = KMeans(n_clusters=k_clusters)
kmeans_labels = kmeans.fit_predict(X)
kmeans_centroids = kmeans.cluster_centers_
# Apply Hierarchical Clustering
linkage_matrix = linkage(X, method='ward')
dendrogram(linkage_matrix)
plt.title('Hierarchical Clustering Dendrogram')
plt.show()
# Plot k-Means results
plt.scatter(X[:, 0], X[:, 1], c=kmeans_labels, cmap='viridis', edgecolors='k')
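# NOTE: the original listing ends at the scatter plot above; the lines below are a minimal
# sketch (assumed) that finishes the figure and cuts the dendrogram into flat clusters so the
# two algorithms can be compared.
plt.scatter(kmeans_centroids[:, 0], kmeans_centroids[:, 1], c='red', marker='x', s=100, label='Centroids')
plt.title('k-Means Clustering (first two features)')
plt.legend()
plt.show()

# Derive the same number of clusters from the hierarchy and plot them for comparison
from scipy.cluster.hierarchy import fcluster
hierarchical_labels = fcluster(linkage_matrix, t=k_clusters, criterion='maxclust')
plt.scatter(X[:, 0], X[:, 1], c=hierarchical_labels, cmap='viridis', edgecolors='k')
plt.title('Hierarchical Clustering (first two features)')
plt.show()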
PRACTICAL-18
Aim Load a dataset from a "data.csv" file and apply the k-Means clustering algorithm to cluster the data into
'k' clusters. Experiment with different values of 'k' and visualize the results. Discuss the optimal number of
clusters based on the clustering quality metrics using PCA.
Solution
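Only the tail of the listing survives for this practical. The preamble below is a minimal sketch (assumed): it loads the CSV, standardizes the features, and loops over candidate values of k, computing the silhouette score that the surviving lines append and plot. The file name "data.csv" and the range of k values are assumptions.
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score

# Load the dataset and keep only the numeric feature columns
data = pd.read_csv("data.csv")
X = data.select_dtypes(include='number').values

# Standardize the features before clustering
X_scaled = StandardScaler().fit_transform(X)

k_values = range(2, 11)
silhouette_scores = []
for k in k_values:
    # Fit k-Means and score the clustering for this value of k
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans_labels = kmeans.fit_predict(X_scaled)
    silhouette_avg = silhouette_score(X_scaled, kmeans_labels)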
    silhouette_scores.append(silhouette_avg)
    # Visualize the results using PCA for dimensionality reduction
    pca = PCA(n_components=2)
    X_pca = pca.fit_transform(X_scaled)
    plt.scatter(X_pca[:, 0], X_pca[:, 1], c=kmeans_labels, cmap='viridis', edgecolors='k')
    plt.title(f'k-Means Clustering (k={k}), Silhouette Score: {silhouette_avg:.2f}')
    plt.show()
# Plot silhouette scores for different values of 'k'
plt.plot(k_values, silhouette_scores, marker='o')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('Silhouette Score')
plt.title('Silhouette Score vs Number of Clusters')
plt.show()
PRACTICAL -19
Aim: Load a dataset from "data.csv" and apply the k-Means algorithm with a chosen 'k'
value. Use appropriate clustering quality metrics (e.g., silhouette score, inertia, Davies-
Bouldin index) to evaluate the quality of clustering. Analyze how the choice of 'k' affects
the clustering quality and discuss your findings.
Solution :
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, davies_bouldin_score
import matplotlib.pyplot as plt
# Load the dataset
dataset_path = "Iris.csv"
data = pd.read_csv(dataset_path)
# Exclude the last column (assuming it contains labels)
X = data.iloc[:, :-1].values
# Choose a range of 'k' values to experiment with
k_values = range(2, 11)
# Lists to store clustering quality metrics
silhouette_scores = []
inertia_values = []
davies_bouldin_scores = []
# Apply k-Means for different 'k' values
for k in k_values:
    # Fit the k-Means model
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(X)
    # Predict clusters
    labels = kmeans.labels_
    # Calculate clustering quality metrics
    silhouette_scores.append(silhouette_score(X, labels))
    inertia_values.append(kmeans.inertia_)
    davies_bouldin_scores.append(davies_bouldin_score(X, labels))
# Plotting the results
plt.figure(figsize=(12, 4))
# Plot Silhouette Score
plt.subplot(1, 3, 1)
plt.plot(k_values, silhouette_scores, marker='o')
plt.title('Silhouette Score')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('Score')
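# NOTE: the original listing stops after the first subplot; the panels below are a minimal
# sketch (assumed) completing the figure with the inertia and Davies-Bouldin curves that the
# loop above already collects.
# Plot Inertia (elbow curve)
plt.subplot(1, 3, 2)
plt.plot(k_values, inertia_values, marker='o')
plt.title('Inertia')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('Inertia')
# Plot Davies-Bouldin Index (lower is better)
plt.subplot(1, 3, 3)
plt.plot(k_values, davies_bouldin_scores, marker='o')
plt.title('Davies-Bouldin Index')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('Index')
plt.tight_layout()
plt.show()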
Output
PRACTICAL -20
Aim: Write a Python program to implement a Self-Organizing Map (SOM) and train it on a given dataset, such as a collection of 2D points. Allow the user to specify parameters like the map size, learning rate, and number of training iterations. Visualize the map before and after training to observe how it adapts to the data.
Solution
import numpy as np
from minisom import MiniSom
import matplotlib.pyplot as plt

def visualize_som(som, data, title):
    plt.figure(figsize=(8, 8))
    plt.pcolor(som.distance_map().T, cmap='bone_r')  # plot the distance map as background
    plt.colorbar()
    # plot data points on the map
    for i, (x, _) in enumerate(data):
        w = som.winner(x)
        plt.plot(w[0] + 0.5, w[1] + 0.5, 'o', markerfacecolor='None', markersize=10,
                 markeredgecolor='r', markeredgewidth=2)
        plt.text(w[0] + 0.5, w[1] + 0.5, str(i + 1), color='k', fontweight='bold', ha='center',
                 va='center')
    plt.title(title)
    plt.show()
# Generate synthetic 2D data
np.random.seed(42)
data = np.random.rand(100, 2)  # replace this with your own dataset
# User-defined parameters
map_size = (10, 10)      # SOM map size
learning_rate = 0.5      # initial learning rate
num_iterations = 1000    # number of training iterations
# Create and train the SOM
som = MiniSom(*map_size, 2, sigma=1.0, learning_rate=learning_rate)
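# NOTE: the original listing ends at the constructor above; the training and visualization
# calls below are a minimal sketch (assumed), using the same MiniSom calls as the next practical.
som.random_weights_init(data)
visualize_som(som, data, title="SOM before training")
print("Training SOM...")
som.train_random(data, num_iterations)
print("Training complete.")
visualize_som(som, data, title="SOM after training")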
PRACTICAL -21
Aim: Implement a Self-Organizing Map (SOM) algorithm and use it to cluster a set of
images based on their visual content. You can represent each image as a feature
vector (e.g., using color histograms or deep features) and train the SOM to group similar
images together. Visualize the resulting clusters and evaluate the quality of the clustering.
Solution
import numpy as np
def extract_features(images):
    features = []
    for img in images:
        # Use color histograms as feature vectors
        hist_r = np.histogram(img[:, :, 0], bins=256, range=(0, 256))[0]
        hist_g = np.histogram(img[:, :, 1], bins=256, range=(0, 256))[0]
        hist_b = np.histogram(img[:, :, 2], bins=256, range=(0, 256))[0]
        feature_vector = np.concatenate((hist_r, hist_g, hist_b))
        features.append(feature_vector)
    return np.array(features)
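# NOTE: the original listing omits the imports, the image data, and the cluster-visualization
# helper; the block below is a minimal sketch (assumed). Synthetic random images stand in for a
# real image collection, and visualize_som_with_labels is a simple plot of winning nodes.
from minisom import MiniSom
import matplotlib.pyplot as plt

def visualize_som_with_labels(som, data, labels, title):
    plt.figure(figsize=(8, 8))
    plt.pcolor(som.distance_map().T, cmap='bone_r')  # distance map as background
    plt.colorbar()
    for w in labels:
        # mark the winning node of each image on the map
        plt.plot(w[0] + 0.5, w[1] + 0.5, 'o', markerfacecolor='None',
                 markeredgecolor='r', markersize=8)
    plt.title(title)
    plt.show()

# Synthetic stand-in for an image collection: 100 random 32x32 RGB images
images = (np.random.rand(100, 32, 32, 3) * 255).astype(np.uint8)
feature_vectors = extract_features(images).astype(float)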
# User-defined parameters
map_size = (10, 10)      # SOM map size
learning_rate = 0.5      # initial learning rate
num_iterations = 1000    # number of training iterations
# Create and train the SOM
som = MiniSom(*map_size, feature_vectors.shape[1], sigma=1.0, learning_rate=learning_rate)
som.random_weights_init(feature_vectors)
print("Training SOM...")
som.train_random(feature_vectors, num_iterations)
print("Training complete.")
# Get cluster assignments for each image
cluster_labels = [som.winner(x) for x in feature_vectors]
visualize_som_with_labels(som, feature_vectors, cluster_labels, title="SOM Clustering")
OUTPUT:
PRACTICAL -22
Aim: Given a dataset of customer churn, implement a program that compares the
performance of three different supervised learning algorithms (e.g., Logistic Regression,
Random Forest, and Support Vector Machine) for binary classification. Split the dataset
into training and testing sets, train each algorithm on the training set, and evaluate their
performance using metrics like accuracy, precision, recall, and F1-score. Present the results
in a clear and informative way, such as through a bar chart or a table.
Solution
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt
# Load your customer churn dataset (replace 'your_dataset.csv' with your actual dataset)
dataset_path = 'telecom_churn.csv'
data = pd.read_csv(dataset_path)
# Assuming your dataset has a 'Churn' column indicating binary labels (1 for churn, 0 for non-churn)
X = data.drop('Churn', axis=1)
y = data['Churn']
# Split the dataset into training and testing sets
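# NOTE: the original listing ends at the comment above; the remainder below is a minimal
# sketch (assumed): it splits the data, trains the three classifiers, and compares their
# scores in a grouped bar chart. It assumes the feature columns are already numeric.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'SVM': SVC(random_state=42),
}

results = {}
for name, model in models.items():
    # Train each classifier and collect accuracy, precision, recall, and F1-score
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    results[name] = [accuracy_score(y_test, y_pred), precision_score(y_test, y_pred),
                     recall_score(y_test, y_pred), f1_score(y_test, y_pred)]
    print(name, results[name])

# Grouped bar chart of the four metrics for the three models
import numpy as np
metrics = ['Accuracy', 'Precision', 'Recall', 'F1-score']
x = np.arange(len(metrics))
width = 0.25
for i, (name, scores) in enumerate(results.items()):
    plt.bar(x + (i - 1) * width, scores, width, label=name)
plt.xticks(x, metrics)
plt.ylabel('Score')
plt.title('Model comparison on the churn test set')
plt.legend()
plt.show()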