Data Mining Practicals

Data mining practicals :-
Underfitting
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
# Generate synthetic data: a noisy straight line, y = 4 + 3x + Gaussian noise.
np.random.seed(42)  # fixed seed so the figure and the MSE values are repeatable
X = 2 * np.random.rand(100, 1)
y = 4 + 3 * X + np.random.randn(100, 1)

# Split the data into training and testing sets (20% held out for testing)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit a simple linear regression model
linear_reg = LinearRegression()
linear_reg.fit(X_train, y_train)

# Evaluate the model on the training data
y_train_pred = linear_reg.predict(X_train)
mse_train = mean_squared_error(y_train, y_train_pred)

# Evaluate the model on the testing data
y_test_pred = linear_reg.predict(X_test)
mse_test = mean_squared_error(y_test, y_test_pred)

# Plot the data and the model
# NOTE(review): the data above is generated from a linear function, so a linear
# model is well specified here; a non-linear target would show underfitting
# more clearly. Left unchanged - confirm the intent of the exercise.
plt.scatter(X, y, label='Data points')
plt.plot(X, linear_reg.predict(X), color='red', label='Linear Regression Model')
# Both string literals below were split across two lines in the listing, which
# is a syntax error for single-quoted strings; they are rejoined here.
plt.title(
    f'Underfitting Example\nTrain MSE: {mse_train:.2f}, Test MSE: {mse_test:.2f}'
)
plt.xlabel('X')
plt.ylabel('y')
plt.legend()
plt.show()
Overfitting :-
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error
# plt and train_test_split are used below but are not imported in this
# section; import them here so the practical runs on its own.
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# Generate synthetic data
# NOTE(review): the data-generation and split statements were lost from this
# listing (only a dangling "random_state=42)" remained). They are reconstructed
# on the assumption that they mirror the underfitting practical; X in [0, 2)
# agrees with the np.linspace(0, 2, ...) plotting range used further down.
# TODO confirm against the original worksheet.
np.random.seed(42)
X = 2 * np.random.rand(100, 1)
y = 4 + 3 * X + np.random.randn(100, 1)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Create a polynomial features transformer.
# Degree 15 is deliberately far more flexible than the data requires.
poly_features = PolynomialFeatures(degree=15, include_bias=False)
X_poly = poly_features.fit_transform(X_train)

# Fit a linear regression model on the polynomial features
lin_reg = LinearRegression()
lin_reg.fit(X_poly, y_train)

# Visualize the overfitting: evaluate the fitted curve on a dense grid.
X_range = np.linspace(0, 2, 100).reshape(-1, 1)
X_range_poly = poly_features.transform(X_range)  # reuse the fitted transformer
y_pred = lin_reg.predict(X_range_poly)
plt.scatter(X_train, y_train, label='Training Data')
plt.scatter(X_test, y_test, label='Testing Data', color='r')
plt.plot(X_range, y_pred, label='Polynomial Regression', color='g')
plt.title('Overfitting Example')
plt.xlabel('X')
plt.ylabel('y')
plt.legend()
plt.show()

# Evaluate the model on training and testing data
y_train_pred = lin_reg.predict(X_poly)
mse_train = mean_squared_error(y_train, y_train_pred)
print(f'Mean Squared Error on Training Data: {mse_train}')
X_test_poly = poly_features.transform(X_test)
y_test_pred = lin_reg.predict(X_test_poly)
mse_test = mean_squared_error(y_test, y_test_pred)
print(f'Mean Squared Error on Testing Data: {mse_test}')
Cross validation:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_iris
# Load dataset (replace this with your own dataset)
data = load_iris()
X, y = data.data, data.target

# Initialize a classifier (replace this with your own classifier)
classifier = RandomForestClassifier()

# Set up KFold cross-validation with 5 folds; shuffling with a fixed seed
# keeps the fold assignment repeatable between runs.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Perform cross-validation: one score per fold
cv_scores = cross_val_score(classifier, X, y, cv=kf)

# Print the cross-validation scores (folds are numbered from 1 for display).
# The loop body had lost its indentation in the listing; it is restored here.
for fold_number, fold_score in enumerate(cv_scores, start=1):
    print(f'Fold {fold_number}: {fold_score}')

# Print the mean and standard deviation of the cross-validation scores
print(f'Mean CV Score: {cv_scores.mean()}')
print(f'Standard Deviation of CV Scores: {cv_scores.std()}')
Confusion matrix
from sklearn.metrics import confusion_matrix
# These names are used below but are not imported in this section; import them
# here so the practical runs on its own.
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Load dataset (replace this with your own dataset)
data = load_iris()
X, y = data.data, data.target

# Split the dataset into training and testing sets
# NOTE(review): only the trailing "random_state=42)" of this call survived in
# the listing; test_size=0.2 is assumed because the other practicals in this
# worksheet use it. TODO confirm.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Initialize a classifier (replace this with your own classifier)
classifier = RandomForestClassifier()

# Train the classifier on the training data
classifier.fit(X_train, y_train)

# Make predictions on the test data
y_pred = classifier.predict(X_test)

# Create a confusion matrix from the true and predicted test labels
cm = confusion_matrix(y_test, y_pred)

# Print the confusion matrix
print("Confusion Matrix:")
print(cm)
Gradient descent:
import numpy as np
# plt is used for the final plot but is not imported in this section.
import matplotlib.pyplot as plt

# Generate some random data for a linear relationship
# NOTE(review): the statements that create X and y were lost from this listing.
# They are reconstructed as 100 single-feature samples with a column-vector
# target, which is what np.ones((100, 1)), the 2/100 factor and the (2, 1)
# theta in the original code require. The exact generating formula is an
# assumption (same as the underfitting practical) - TODO confirm.
np.random.seed(42)
X = 2 * np.random.rand(100, 1)
y = 4 + 3 * X + np.random.randn(100, 1)
n_samples = len(X)

# Add a bias term to the feature matrix (a leading column of ones)
X_b = np.c_[np.ones((n_samples, 1)), X]

# Set hyperparameters
learning_rate = 0.01
n_iterations = 1000

# Initialize random weights: one for the bias, one for the feature
theta = np.random.randn(2, 1)

# Gradient Descent algorithm: full-batch update on the mean squared error.
# The sample count comes from the data instead of being hard-coded as 100.
for iteration in range(n_iterations):
    gradients = 2 / n_samples * X_b.T.dot(X_b.dot(theta) - y)
    theta = theta - learning_rate * gradients

# Print the final learned parameters (theta)
print("Final Parameters (theta):", theta)

# Plot the original data and the linear regression line
plt.scatter(X, y)
plt.plot(X, X_b.dot(theta), color='red', label='Linear Regression')
plt.xlabel('X')
plt.ylabel('y')
plt.legend()
plt.show()
Grid search:-
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
# load_iris is used below but is not imported in this section.
from sklearn.datasets import load_iris

# Load dataset (you can replace this with your own dataset)
iris = load_iris()
X_train, X_test, y_train, y_test = train_test_split(
    iris.data, iris.target, test_size=0.2, random_state=42
)

# Define the model
svm_model = SVC()

# Define the hyperparameter grid.
# A list of per-kernel grids is used so that only parameters relevant to each
# kernel are swept: 'gamma' and 'degree' are not tuned for the linear kernel
# and 'degree' is only tuned for the polynomial kernel. The values searched are
# the same as before; the flat grid fitted 180 candidates, this one fits 84.
C_values = [0.1, 1, 10, 100]
gamma_values = ['scale', 'auto', 0.1, 0.01, 0.001]
param_grid = [
    {'kernel': ['linear'], 'C': C_values},
    {'kernel': ['rbf'], 'C': C_values, 'gamma': gamma_values},
    {
        'kernel': ['poly'],
        'C': C_values,
        'gamma': gamma_values,
        'degree': [2, 3, 4],
    },
]

# Initialize GridSearchCV (3-fold cross-validation, scored by accuracy)
grid_search = GridSearchCV(
    estimator=svm_model, param_grid=param_grid, cv=3, scoring='accuracy'
)

# Fit the model with the data
grid_search.fit(X_train, y_train)

# Print the best hyperparameters
print("Best Hyperparameters: ", grid_search.best_params_)

# Get the best model
best_model = grid_search.best_estimator_

# Evaluate the best model on the held-out test set
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Test Accuracy: {:.2f}%".format(accuracy * 100))
Randomized search CV:-

# Import necessary libraries
from sklearn.model_selection import RandomizedSearchCV
# These names are used below but are not imported in this section; import them
# here so the practical runs on its own.
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Load dataset (you can replace this with your dataset)
iris = load_iris()
X_train, X_test, y_train, y_test = train_test_split(
    iris.data, iris.target, test_size=0.2, random_state=42
)

# Define the model
rf = RandomForestClassifier()

# Define the hyperparameter distributions (discrete lists to sample from)
param_dist = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

# Initialize RandomizedSearchCV: 10 sampled candidates, 3-fold CV, accuracy
random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist,
    n_iter=10,
    cv=3,
    scoring='accuracy',
    random_state=42,
)

# Fit the model with the data
random_search.fit(X_train, y_train)

# Print the best hyperparameters
print("Best Hyperparameters: ", random_search.best_params_)

# Get the best model
best_model = random_search.best_estimator_

# Evaluate the model on the test set
accuracy = best_model.score(X_test, y_test)
print("Test Accuracy: {:.2f}%".format(accuracy * 100))
Loss function:-
import numpy as np
def mean_squared_error(y_true, y_pred):
    """
    Calculate the mean squared error between the true and predicted values.

    NOTE: this name is also imported from sklearn.metrics elsewhere in this
    worksheet; defining it here replaces that import from this point on.

    Parameters:
    - y_true: array-like, true values
    - y_pred: array-like, predicted values (must have the same shape as y_true)

    Returns:
    - mse: float, mean squared error

    Raises:
    - AssertionError: if the two inputs do not have the same shape
    """
    # Convert up front so plain lists/tuples work as well as numpy arrays.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    # Ensure the input arrays have the same shape; without this check numpy
    # broadcasting could combine mismatched inputs instead of failing.
    assert y_true.shape == y_pred.shape, "Input arrays must have the same shape"

    # Calculate the squared differences
    squared_diff = (y_true - y_pred) ** 2

    # Calculate the mean squared error
    mse = np.mean(squared_diff)
    return mse
# Example usage:
# Swap in your own ground-truth and predicted arrays here.
true_values, predicted_values = (
    np.array([1.0, 2.0, 3.0, 4.0, 5.0]),
    np.array([1.5, 2.5, 2.8, 3.7, 4.2]),
)

# Compute the loss for the two arrays and report it.
mse_result = mean_squared_error(y_true=true_values, y_pred=predicted_values)
print("Mean Squared Error:", mse_result)
Stochastic gradient descent (sgd): -
import numpy as np
def stochastic_gradient_descent(X, y, learning_rate=0.01, epochs=100,
                                batch_size=1):
    """
    Perform Stochastic Gradient Descent for linear regression.

    Samples are visited in their stored order (no shuffling), in consecutive
    slices of batch_size rows. Progress is printed every 10th epoch.

    Parameters:
    - X: numpy array, feature matrix
    - y: numpy array, target values
    - learning_rate: float, step size for updating parameters
    - epochs: int, number of passes through the entire dataset
    - batch_size: int, number of samples in each mini-batch

    Returns:
    - theta: numpy array, learned parameters
    - cost_history: list, history of cost during optimization
    """
    # Initialize parameters
    num_samples, num_features = X.shape
    theta = np.zeros(num_features)
    cost_history = []

    # Stochastic Gradient Descent
    for epoch in range(epochs):
        for i in range(0, num_samples, batch_size):
            X_batch = X[i:i + batch_size]
            y_batch = y[i:i + batch_size]

            # Compute predictions
            predictions = np.dot(X_batch, theta)

            # Compute errors
            errors = predictions - y_batch

            # Update parameters. Average over the rows actually in this batch:
            # the last slice is shorter than batch_size whenever batch_size
            # does not divide num_samples, and dividing by the nominal size
            # would shrink that step.
            gradient = np.dot(X_batch.T, errors) / len(X_batch)
            theta -= learning_rate * gradient

        # Compute and record the cost (half the mean squared error over the
        # full dataset).
        # NOTE(review): indentation was lost in the listing; recording once per
        # epoch is assumed because the progress print is keyed on `epoch`.
        cost = np.mean((np.dot(X, theta) - y) ** 2) / 2.0
        cost_history.append(cost)
        if epoch % 10 == 0:
            print(f"Epoch {epoch}/{epochs}, Cost: {cost}")

    return theta, cost_history
# Example usage:
# Replace these arrays with your actual feature matrix (X) and target values (y)
# (this comment had wrapped onto a second, uncommented line in the listing).
# The first column of X is all ones, so theta[0] acts as the intercept.
X = np.array([[1, 2], [1, 3], [1, 4]])
y = np.array([5, 6, 7])

# Set hyperparameters
learning_rate = 0.01
epochs = 100
batch_size = 1

# Run stochastic gradient descent
theta, cost_history = stochastic_gradient_descent(
    X, y, learning_rate, epochs, batch_size
)

# Print the learned parameters and the last recorded cost
print("Learned Parameters (Theta):", theta)
print("Final Cost:", cost_history[-1])
How to Save & Load Machine Learning Model
"""### Import Libraries"""
# The heading above is now a balanced string. In the listing a lone closing
# triple quote swallowed the two imports below into a string literal, so numpy
# and pandas were never imported.
# import libraries
import numpy as np
import pandas as pd
"""### Load Dataset"""
#load dataset
from sklearn.datasets import load_breast_cancer
data = load_breast_cancer()
# The bare expressions below are notebook cell outputs; they have no effect
# when the file is run as a script.
data.data
data.feature_names
data.target
data.target_names
# create dataframe: feature columns followed by a 'target' column.
# The column labels are passed as one flat list; wrapping that list in another
# list makes pandas build MultiIndex columns instead of plain names.
df = pd.DataFrame(np.c_[data.data, data.target],
                  columns=list(data.feature_names) + ['target'])
df.head()
df.tail()
df.shape
"""### Split Data"""
# Features are every column except the last; the last column is the target.
X = df.iloc[:, 0:-1]
y = df.iloc[:, -1]

from sklearn.model_selection import train_test_split

# NOTE(review): only the trailing "random_state=2020)" of this call survived in
# the listing; test_size=0.2 is an assumption - TODO confirm.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=2020
)
print ('Shape of X_train = ', X_train.shape)
print('Shape of y_train = ', y_train.shape)
print('Shape of X_test = ', X_test.shape)
print('Shape of y_test = ', y_test.shape)
"""## Train Random Forest Classification Model"""
from sklearn.ensemble import RandomForestClassifier

classifier = RandomForestClassifier(n_estimators=100, criterion='gini')
classifier.fit(X_train, y_train)
# Accuracy on the held-out split (a notebook cell output; discarded in a script)
classifier.score(X_test, y_test)
"""## Predict Cancer"""
# One patient's 30 feature values, in the same column order as the training
# features.
patient1 = [
    17.99, 10.38, 122.8, 1001.0, 0.1184, 0.2776, 0.3001, 0.1471, 0.2419, 0.07871,
    1.095, 0.9053, 8.589, 153.4, 0.006399, 0.04904, 0.05373, 0.01587, 0.03003, 0.006193,
    25.38, 17.33, 184.6, 2019.0, 0.1622, 0.6656, 0.7119, 0.2654, 0.4601, 0.1189,
]
# predict() expects a 2-D array: one row per sample.
patient1 = np.array([patient1])
patient1
classifier.predict(patient1)
data.target_names
pred = classifier.predict(patient1)
# Class 0 is treated as malignant and anything else as benign.
# NOTE(review): verify the label order against data.target_names.
if pred[0] == 0:
    print ('Patient has Cancer (malignant tumor)')
else:
    # The original message read "(malignant benign)", which is contradictory.
    print ('Patient has no Cancer (benign tumor)')
"""# Save Model
## Save Model using Pickle

"""
import pickle

# Use context managers so the file handles are closed (and the dump is flushed
# to disk) before the file is read back.
with open('model_save', 'wb') as model_file:
    pickle.dump(classifier, model_file)

# NOTE: unpickling can execute arbitrary code; only load files you created or
# otherwise trust.
with open('model_save', 'rb') as model_file:
    model = pickle.load(model_file)
model.predict(patient1)[0]
"""## Save Model using Joblib"""
import joblib
joblib.dump(classifier, 'model_save2')
model2 = joblib.load('model_save2')
model2.predict(patient1)
Recommendation system:
1.Collaborative filtering:-
# The list numbering that prefixed every line of this listing has been removed
# and literals that were wrapped across lines have been rejoined.
import pandas as pd
import numpy as np

#Load the u.user file into a dataframe
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']

users = pd.read_csv('/content/u.user', sep='|', names=u_cols,
                    encoding='latin-1')

users.head(3)

#Load the u.item file into a dataframe
i_cols = ['movie_id', 'title', 'release date', 'video release date', 'IMDb URL',
          'unknown', 'Action', 'Adventure', 'Animation', 'Children\'s',
          'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir',
          'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller',
          'War', 'Western']

movies = pd.read_csv('/content/u.item', sep='|', names=i_cols,
                     encoding='latin-1')

movies.head(2)

#Remove all information except Movie ID and title
movies = movies[['movie_id', 'title']]

#Load the u.data file into a dataframe
r_cols = ['user_id', 'movie_id', 'rating', 'timestamp']

ratings = pd.read_csv('/content/u.data', sep='\t', names=r_cols,
                      encoding='latin-1')

ratings.head(2)

#Drop the timestamp column
ratings = ratings.drop('timestamp', axis=1)

#Import the train_test_split function
from sklearn.model_selection import train_test_split

#Assign X as the original ratings dataframe and y as the user_id column of ratings.
X = ratings.copy()
y = ratings['user_id']

#Split into training and test datasets, stratified along user_id
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, stratify=y, random_state=42
)
#Import the mean_squared_error function
from sklearn.metrics import mean_squared_error


#Function that computes the root mean squared error (or RMSE)
def rmse(y_true, y_pred):
    """Return the square root of the mean squared error of y_pred against y_true."""
    return np.sqrt(mean_squared_error(y_true, y_pred))
#Define the baseline model to always return 3.
def baseline(user_id, movie_id):
    """Predict a constant rating of 3.0, ignoring both arguments."""
    return 3.0
#Function to compute the RMSE score obtained on the testing set by a model
def score(cf_model):
    """Return the RMSE of cf_model's predictions over the X_test ratings.

    Parameters:
    - cf_model: callable taking (user_id, movie_id) and returning a rating

    Returns:
    - the value of rmse() between the actual and the predicted test ratings
    """
    #Construct a list of user-movie tuples from the testing dataset
    id_pairs = zip(X_test['user_id'], X_test['movie_id'])

    #Predict the rating for every user-movie tuple
    y_pred = np.array([cf_model(user, movie) for (user, movie) in id_pairs])

    #Extract the actual ratings given by the users in the test data
    y_true = np.array(X_test['rating'])

    #Return the final RMSE score
    return rmse(y_true, y_pred)
# RMSE of the constant baseline model
score(baseline)

#Ratings Matrix
#Build the ratings matrix using pivot_table function:
#one row per user_id, one column per movie_id, rating as the cell value.
r_matrix = X_train.pivot_table(values='rating', index='user_id',
                               columns='movie_id')

r_matrix.head()
#User Based Collaborative Filter using Mean Ratings
def cf_user_mean(user_id, movie_id, ratings=None):
    """Predict a rating as the mean of all known ratings for the movie.

    Parameters:
    - user_id: accepted so the function fits score(); it is not used
    - movie_id: column label of the movie in the ratings matrix
    - ratings: optional user x movie DataFrame; defaults to the module-level
      r_matrix, so existing two-argument calls behave as before

    Returns:
    - the mean of the movie's column, or 3.0 when the movie is not a column
    """
    if ratings is None:
        ratings = r_matrix

    #Check if movie_id exists in the ratings matrix
    if movie_id in ratings:
        #Compute the mean of all the ratings given to the movie
        mean_rating = ratings[movie_id].mean()
    else:
        #Default to a rating of 3.0 in the absence of any information
        mean_rating = 3.0

    return mean_rating
#Compute RMSE for the Mean model
score(cf_user_mean)

# Weighted Mean
#Create a dummy ratings matrix with all null values imputed to 0
r_matrix_dummy = r_matrix.copy().fillna(0)

# Import cosine_score
from sklearn.metrics.pairwise import cosine_similarity

#Compute the cosine similarity matrix using the dummy ratings matrix
cosine_sim = cosine_similarity(r_matrix_dummy, r_matrix_dummy)

#Convert into pandas dataframe, labelled by user id on both axes
cosine_sim = pd.DataFrame(cosine_sim, index=r_matrix.index,
                          columns=r_matrix.index)

cosine_sim.head(10)
#User Based Collaborative Filter using Weighted Mean Ratings
def cf_user_wmean(user_id, movie_id, ratings=None, similarities=None):
    """Predict a rating as the similarity-weighted mean of other users' ratings.

    Parameters:
    - user_id: label of the user in the similarity matrix
    - movie_id: column label of the movie in the ratings matrix
    - ratings: optional user x movie DataFrame; defaults to the module-level
      r_matrix
    - similarities: optional user x user DataFrame; defaults to the
      module-level cosine_sim

    Returns:
    - sum(similarity * rating) / sum(similarity) over the users who rated the
      movie, or 3.0 when the movie or user is unknown or the similarities
      cannot be used as weights (NaN present or zero total)
    """
    if ratings is None:
        ratings = r_matrix
    if similarities is None:
        similarities = cosine_sim

    # Fallback used whenever there is no usable information
    default_rating = 3.0

    # Check that both the movie and the user are known
    if movie_id not in ratings or user_id not in similarities:
        return default_rating

    # Get the similarity scores for the user in question with every other user
    sim_scores = similarities[user_id]

    # Get the user ratings for the movie in question
    m_ratings = ratings[movie_id]

    # Extract the indices containing NaN in the m_ratings series
    idx = m_ratings[m_ratings.isnull()].index

    # Drop the users who have not rated the movie from both series so they stay
    # aligned. The previous version fell back to 3.0 whenever any rating was
    # NaN, before these drops could run, so the weighted mean was skipped for
    # every movie that at least one user had not rated.
    m_ratings = m_ratings.dropna()
    sim_scores = sim_scores.drop(idx)

    # Guard the division: undefined or zero total similarity gives no weights.
    total_similarity = sim_scores.sum()
    if sim_scores.isnull().any() or total_similarity == 0:
        return default_rating

    # Compute the final weighted mean
    wmean_rating = np.dot(sim_scores, m_ratings) / total_similarity
    return wmean_rating

# Now, re-run the score function
score(cf_user_wmean)
Content based recommendation system: -
Output
Hybrid recommendation system:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
# Sample user-item interaction data (one row per observed interaction)
user_item_data = {
    'user_id': [1, 1, 2, 2, 3, 3, 4, 4],
    'item_id': ['A', 'B', 'A', 'C', 'B', 'C', 'D', 'E']
}
df_user_item = pd.DataFrame(user_item_data)

# Sample item content data
item_content_data = {
    'item_id': ['A', 'B', 'C', 'D', 'E'],
    'description': ['Action movie', 'Drama movie', 'Comedy movie',
                    'Science fiction book', 'Mystery book']
}
df_item_content = pd.DataFrame(item_content_data)

# Collaborative Filtering
# Count interactions per (user, item) pair. crosstab is used because the frame
# has no separate value column to aggregate; absent pairs are counted as 0.
user_item_matrix = pd.crosstab(df_user_item['user_id'],
                               df_user_item['item_id'])

# Content-Based Filtering: bag-of-words vectors of the item descriptions and
# their pairwise cosine similarity (rows/columns follow df_item_content order).
vectorizer = CountVectorizer()
item_description_matrix = vectorizer.fit_transform(
    df_item_content['description'])
cosine_similarities = cosine_similarity(item_description_matrix,
                                        item_description_matrix)


# Hybrid Recommendation
def hybrid_recommendation(user_id, item_id):
    """Rank every item for a user by blending collaborative and content scores.

    Parameters:
    - user_id: row label of the user in user_item_matrix
    - item_id: id of the reference item whose description drives the
      content-based part

    Returns:
    - pandas Index of all item ids in user_item_matrix, best match first

    The score of each item is 0.7 * collaborative + 0.3 * content. Both parts
    are per-item vectors in user_item_matrix column order; the previous version
    added a per-user similarity column to a per-item vector, which mixed the
    two axes.
    """
    items = user_item_matrix.columns
    interactions = user_item_matrix.values

    # Collaborative Filtering: similarity of this user to every user, then the
    # similarity-weighted interaction count of each item.
    user_ratings = user_item_matrix.loc[[user_id]].values
    user_similarity = cosine_similarity(user_ratings, interactions)[0]
    collaborative_score = user_similarity @ interactions
    # Scale to [0, 1] so it is comparable with the cosine content score.
    peak = collaborative_score.max()
    if peak > 0:
        collaborative_score = collaborative_score / peak

    # Content-Based Filtering: similarity of every item to the reference item,
    # re-ordered to match the matrix columns (items missing there are dropped,
    # matrix items without a description score 0).
    item_position = np.flatnonzero(
        (df_item_content['item_id'] == item_id).to_numpy())[0]
    content_similarity = pd.Series(
        cosine_similarities[item_position],
        index=df_item_content['item_id'].to_numpy(),
    ).reindex(items, fill_value=0.0).to_numpy()

    # Hybrid Score
    hybrid_score = 0.7 * collaborative_score + 0.3 * content_similarity

    # Get recommended items, highest score first
    recommended_items = items[np.argsort(hybrid_score)[::-1]]
    return recommended_items
# Example usage
user_id = 1
item_id = 'B'
recommendations = hybrid_recommendation(user_id, item_id)
# The f-string was split across lines in the listing (a syntax error for a
# double-quoted string); it is rejoined here on one line.
print(f"Recommendations for user {user_id} based on item {item_id}: {recommendations}")
Hyperparameter tuning: -
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score
# These names are used below but are not imported in this section; import them
# here so the practical runs on its own.
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier

# Load a sample dataset (Iris dataset in this case)
iris = load_iris()
X, y = iris.data, iris.target

# Split the data into training and testing sets
# NOTE(review): only the trailing "random_state=42)" of this call survived in
# the listing; test_size=0.2 is assumed because the other practicals in this
# worksheet use it. TODO confirm.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Define the model and the hyperparameter grid
model = RandomForestClassifier()
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

# Initialize GridSearchCV (3-fold cross-validation, scored by accuracy)
grid_search = GridSearchCV(
    estimator=model, param_grid=param_grid, cv=3, scoring='accuracy'
)

# Fit the model with hyperparameter tuning
grid_search.fit(X_train, y_train)

# Get the best hyperparameters
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

# Predict using the best model
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# Evaluate the model on the held-out test set
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Data Mining Practicals

Uploaded by

Data Mining Practicals

Uploaded by

Data mining practicals :-

# Generate synthetic data

# Split the data into training and testing sets

# Fit a simple linear regression model

# Evaluate the model on the training data

# Evaluate the model on the testing data

# Plot the data and the model

# Generate synthetic data

# Split the data into training and testing sets

# Create a polynomial features transformer

# Fit a linear regression model on the polynomial features

# Visualize the overfitting

plt.scatter(X_train, y_train, label='Training Data')

# Evaluate the model on training and testing data

# Load dataset (replace this with your own dataset)

# Initialize a classifier (replace this with your own classifier)

# Set up KFold cross-validation with 5 folds

# Print the cross-validation scores

# Print the mean and standard deviation of the cross-validation scores

# Load dataset (replace this with your own dataset)

# Split the dataset into training and testing sets

# Initialize a classifier (replace this with your own classifier)

# Train the classifier on the training data

# Make predictions on the test data

# Create a confusion matrix

# Print the confusion matrix

# Generate some random data for a linear relationship

# Add a bias term to the feature matrix

# Initialize random weights

# Gradient Descent algorithm

# Print the final learned parameters (theta)

# Plot the original data and the linear regression line

# Define the model

# Define the hyperparameter grid

# Fit the model with the data

# Print the best hyperparameters

# Get the best model

# Evaluate the best model on the test set

Randomized search CV:-

# Load dataset (you can replace this with your dataset)

# Define the model

# Define the hyperparameter distributions

# Fit the model with the data

# Print the best hyperparameters

# Get the best model

# Evaluate the model on the test set

def mean_squared_error(y_true, y_pred):

# Calculate the squared differences

# Calculate the mean squared error

# Print the result

Stochastic gradient descent (sgd): -

def stochastic_gradient_descent(X, y, learning_rate=0.01, epochs=100,

# Compute and record the cost

return theta, cost_history

# Run stochastic gradient descent

# Print the learned parameters and cost history

"""### Load Dataset"""

"""### Split Data"""

from sklearn.model_selection import train_test_split

print ('Shape of X_train = ', X_train.shape)

"""## Train Random Forest Classification Model"""

from sklearn.ensemble import RandomForestClassifier

classifier = RandomForestClassifier(n_estimators=100, criterion='gini')

"""## Predict Cancer"""

"""# Save Model

## Save Model using Pickle

pickle.dump(classifier, open('model_save', 'wb'))

model = pickle.load(open('model_save', 'rb'))

"""## Save Model using Joblib"""

# Sample user-item interaction data

# Sample item content data

# Get recommended items

print(f"Recommendations for user {user_id} based on item {item_id}:

# Split the data into training and testing sets

# Define the model and the hyperparameter grid

# Fit the model with hyperparameter tuning