0% found this document useful (0 votes)
24 views22 pages

Data Mining Practicals

a

Uploaded by

Vanshika Gupta
Copyright
© © All Rights Reserved
Available Formats
Download as DOCX, PDF, TXT or read online on Scribd
Download as docx, pdf, or txt
0% found this document useful (0 votes)
24 views22 pages

Data Mining Practicals

a

Uploaded by

Vanshika Gupta
Copyright
© © All Rights Reserved
Available Formats
Download as DOCX, PDF, TXT or read online on Scribd
Download as docx, pdf, or txt
Download as docx, pdf, or txt
You are on page 1/ 22

Data Mining Practicals

Underfitting
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Underfitting demo: fit a plain linear regression and compare the mean
# squared error on the training split with the error on the held-out split.

# Generate synthetic data: y = 4 + 3x plus noise drawn with np.random.randn.
np.random.seed(42)
X = 2 * np.random.rand(100, 1)
y = 4 + 3 * X + np.random.randn(100, 1)

# Split the data into training and testing sets (20% held out).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit a simple linear regression model
linear_reg = LinearRegression()
linear_reg.fit(X_train, y_train)

# Evaluate the model on the training data
y_train_pred = linear_reg.predict(X_train)
mse_train = mean_squared_error(y_train, y_train_pred)

# Evaluate the model on the testing data
y_test_pred = linear_reg.predict(X_test)
mse_test = mean_squared_error(y_test, y_test_pred)

# Plot the data and the model.
# The label and title strings are kept on one logical line each: a quoted
# literal cannot continue across a physical line break.
plt.scatter(X, y, label='Data points')
plt.plot(X, linear_reg.predict(X), color='red', label='Linear Regression Model')
plt.title(
    f'Underfitting Example\nTrain MSE: {mse_train:.2f}, Test MSE: {mse_test:.2f}'
)
plt.xlabel('X')
plt.ylabel('y')
plt.legend()
plt.show()
Overfitting:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error

# Overfitting demo: a degree-15 polynomial regression fitted to noisy data
# that was generated from a straight line.

# Synthetic data set; the seed makes the draw repeatable.
np.random.seed(42)
X = 2 * np.random.rand(100, 1)
y = 4 + 3 * X + np.random.randn(100, 1)

# Hold out 20% of the samples for testing.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Expand the training inputs into polynomial terms up to degree 15.
poly_features = PolynomialFeatures(degree=15, include_bias=False)
poly_features.fit(X_train)
X_poly = poly_features.transform(X_train)

# Linear regression on the expanded features.
lin_reg = LinearRegression()
lin_reg.fit(X_poly, y_train)

# Predict on an evenly spaced grid over [0, 2] so the fitted curve can be drawn.
X_range = np.linspace(0, 2, 100)[:, np.newaxis]
X_range_poly = poly_features.transform(X_range)
y_pred = lin_reg.predict(X_range_poly)

# Training points, test points and the fitted curve on one figure.
plt.scatter(X_train, y_train, label='Training Data')
plt.scatter(X_test, y_test, color='r', label='Testing Data')
plt.plot(X_range, y_pred, color='g', label='Polynomial Regression')
plt.title('Overfitting Example')
plt.xlabel('X')
plt.ylabel('y')
plt.legend()
plt.show()

# Error on the data the model was fitted to ...
y_train_pred = lin_reg.predict(X_poly)
mse_train = mean_squared_error(y_train, y_train_pred)
print(f'Mean Squared Error on Training Data: {mse_train}')

# ... versus error on the held-out data.
X_test_poly = poly_features.transform(X_test)
y_test_pred = lin_reg.predict(X_test_poly)
mse_test = mean_squared_error(y_test, y_test_pred)
print(f'Mean Squared Error on Testing Data: {mse_test}')
Cross validation:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_iris

# K-fold cross-validation demo: score a random forest on the iris data set
# with five shuffled folds and report per-fold and aggregate scores.

# Load dataset (replace this with your own dataset)
data = load_iris()
X, y = data.data, data.target

# Initialize a classifier (replace this with your own classifier)
classifier = RandomForestClassifier()

# Set up KFold cross-validation with 5 folds
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Perform cross-validation
cv_scores = cross_val_score(classifier, X, y, cv=kf)

# Print the cross-validation scores, numbering folds from 1.
for fold, score in enumerate(cv_scores, start=1):
    print(f'Fold {fold}: {score}')

# Print the mean and standard deviation of the cross-validation scores
print(f'Mean CV Score: {cv_scores.mean()}')
print(f'Standard Deviation of CV Scores: {cv_scores.std()}')

Confusion matrix
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_iris

# Confusion-matrix demo: train a random forest on part of the iris data and
# tabulate its predictions on the held-out part against the true labels.

# Data set (swap in your own here).
data = load_iris()
X = data.data
y = data.target

# Hold out 20% of the samples for testing.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Classifier (swap in your own here), fitted on the training split.
classifier = RandomForestClassifier()
classifier.fit(X_train, y_train)

# Predicted labels for the held-out samples.
y_pred = classifier.predict(X_test)

# Confusion matrix of true labels against predicted labels.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)

print("Confusion Matrix:")
print(cm)

Gradient descent:
import numpy as np
import matplotlib.pyplot as plt

# Batch gradient descent demo: fit y = theta0 + theta1 * x by repeatedly
# stepping against the gradient of the mean squared error.

# Generate some random data for a linear relationship
np.random.seed(42)
X = 2 * np.random.rand(100, 1)
y = 4 + 3 * X + np.random.randn(100, 1)

# Take the sample count from the data so the bias column and the gradient
# scaling stay correct if the data set size above is changed.
n_samples = X.shape[0]

# Add a bias term (a column of ones) to the feature matrix
X_b = np.c_[np.ones((n_samples, 1)), X]

# Set hyperparameters
learning_rate = 0.01
n_iterations = 1000

# Initialize random weights, one per column of X_b
theta = np.random.randn(X_b.shape[1], 1)

# Gradient Descent algorithm
for _ in range(n_iterations):
    # Gradient of the mean squared error with respect to theta.
    gradients = 2 / n_samples * X_b.T.dot(X_b.dot(theta) - y)
    theta = theta - learning_rate * gradients

# Print the final learned parameters (theta)
print("Final Parameters (theta):", theta)

# Plot the original data and the linear regression line
plt.scatter(X, y)
plt.plot(X, X_b.dot(theta), color='red', label='Linear Regression')
plt.xlabel('X')
plt.ylabel('y')
plt.legend()
plt.show()
Grid search:-
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.svm import SVC
from sklearn.datasets import load_iris
from sklearn.metrics import accuracy_score

# Grid-search demo: exhaustively try SVC hyperparameter combinations with
# 3-fold cross-validation, then score the best model on a held-out split.

# Data set (swap in your own here); 20% is held out for the final check.
iris = load_iris()
X_train, X_test, y_train, y_test = train_test_split(
    iris.data, iris.target, random_state=42, test_size=0.2
)

# Model whose hyperparameters are searched.
svm_model = SVC()

# Every combination of these values is evaluated.
param_grid = dict(
    C=[0.1, 1, 10, 100],
    kernel=['linear', 'rbf', 'poly'],
    gamma=['scale', 'auto', 0.1, 0.01, 0.001],
    degree=[2, 3, 4],
)

# Search object: accuracy as the selection metric, 3 folds per candidate.
grid_search = GridSearchCV(
    estimator=svm_model,
    param_grid=param_grid,
    scoring='accuracy',
    cv=3,
)

# Run the search on the training split.
grid_search.fit(X_train, y_train)

print("Best Hyperparameters: ", grid_search.best_params_)

# Best estimator found by the search.
best_model = grid_search.best_estimator_

# Accuracy of the best model on the held-out split.
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Test Accuracy: {:.2f}%".format(accuracy * 100))

Randomized search CV:-


# Import necessary libraries
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

# Randomized-search demo: sample 10 random-forest hyperparameter settings,
# score each with 3-fold cross-validation, then test the best one.

# Data set (swap in your own here); 20% is held out for the final check.
iris = load_iris()
X_train, X_test, y_train, y_test = train_test_split(
    iris.data, iris.target, random_state=42, test_size=0.2
)

# Model whose hyperparameters are searched.
rf = RandomForestClassifier()

# Candidate values; the search draws combinations from these lists.
param_dist = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Search object: 10 sampled settings, accuracy metric, 3 folds, fixed seed.
random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist,
    n_iter=10,
    scoring='accuracy',
    cv=3,
    random_state=42,
)

# Run the search on the training split.
random_search.fit(X_train, y_train)

print("Best Hyperparameters: ", random_search.best_params_)

# Best estimator found by the search.
best_model = random_search.best_estimator_

# Accuracy of the best model on the held-out split.
accuracy = best_model.score(X_test, y_test)
print("Test Accuracy: {:.2f}%".format(accuracy * 100))

Loss function:-
import numpy as np

def mean_squared_error(y_true, y_pred):
    """Calculate the mean squared error between true and predicted values.

    Parameters:
    - y_true: array-like, true values
    - y_pred: array-like, predicted values (same shape as ``y_true``)

    Returns:
    - mse: float, mean of the element-wise squared differences

    Raises:
    - AssertionError: if the two inputs do not have the same shape
    """
    # Accept plain lists/tuples as well as numpy arrays.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    # Ensure the input arrays have the same shape; without this check numpy
    # broadcasting could silently pair up the wrong elements.
    assert y_true.shape == y_pred.shape, "Input arrays must have the same shape"

    # Mean of the squared differences
    squared_diff = (y_true - y_pred) ** 2
    mse = np.mean(squared_diff)

    return mse

# Example usage with small hand-written arrays; substitute your own true and
# predicted values here.
predicted_values = np.array([1.5, 2.5, 2.8, 3.7, 4.2])
true_values = np.array([1.0, 2.0, 3.0, 4.0, 5.0])

# Compute the loss and report it.
mse_result = mean_squared_error(y_true=true_values, y_pred=predicted_values)
print("Mean Squared Error:", mse_result)

Stochastic gradient descent (SGD):

import numpy as np

def stochastic_gradient_descent(X, y, learning_rate=0.01, epochs=100,
                                batch_size=1):
    """Perform (mini-batch) stochastic gradient descent for linear regression.

    Samples are visited in their given order; no shuffling is performed.

    Parameters:
    - X: numpy array of shape (num_samples, num_features), feature matrix
    - y: numpy array, target values (one per row of X)
    - learning_rate: float, step size for updating parameters
    - epochs: int, number of passes through the entire dataset
    - batch_size: int, number of samples in each mini-batch (must be >= 1)

    Returns:
    - theta: numpy array of length num_features, learned parameters
    - cost_history: list, half mean squared error over the full data set,
      recorded once at the end of every epoch

    Raises:
    - ValueError: if batch_size is smaller than 1
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # Initialize parameters
    num_samples, num_features = X.shape
    theta = np.zeros(num_features)
    cost_history = []

    for epoch in range(epochs):
        for start in range(0, num_samples, batch_size):
            X_batch = X[start:start + batch_size]
            y_batch = y[start:start + batch_size]

            # Prediction errors for this mini-batch
            errors = np.dot(X_batch, theta) - y_batch

            # Average over the samples actually present: the last batch is
            # shorter than batch_size when it does not divide num_samples.
            gradient = np.dot(X_batch.T, errors) / len(X_batch)
            theta -= learning_rate * gradient

        # Compute and record the cost on the whole data set
        cost = np.mean((np.dot(X, theta) - y) ** 2) / 2.0
        cost_history.append(cost)

        if epoch % 10 == 0:
            print(f"Epoch {epoch}/{epochs}, Cost: {cost}")

    return theta, cost_history

# Example usage:
# Replace these arrays with your actual feature matrix (X) and target
# values (y). The first column of X is all ones, so it acts as the bias term.
X = np.array([[1, 2], [1, 3], [1, 4]])
y = np.array([5, 6, 7])

# Set hyperparameters
learning_rate = 0.01
epochs = 100
batch_size = 1

# Run stochastic gradient descent
theta, cost_history = stochastic_gradient_descent(
    X, y, learning_rate, epochs, batch_size
)

# Print the learned parameters and the cost recorded after the last epoch
print("Learned Parameters (Theta):", theta)
print("Final Cost:", cost_history[-1])
How to Save & Load Machine Learning Model
"""### Import Libraries"""

# import libraries
import numpy as np
import pandas as pd

"""### Load Dataset"""

# load dataset
from sklearn.datasets import load_breast_cancer
data = load_breast_cancer()

# The bare expressions below are notebook-style inspections; they display a
# value in a notebook cell and have no effect when run as a script.
data.data

data.feature_names

data.target

data.target_names

# create dataframe: one column per feature plus a final 'target' column.
# The column labels are passed as one flat list; wrapping that list in
# another list would make pandas build a MultiIndex for the columns.
df = pd.DataFrame(
    np.c_[data.data, data.target],
    columns=list(data.feature_names) + ['target'],
)
df.head()

df.tail()

df.shape

"""### Split Data"""

# Features are every column except the last; the last column is the label.
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing (fixed seed for a repeatable split).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=2020, test_size=0.2
)

# Report the size of each part of the split.
print('Shape of X_train = ', X_train.shape)
print('Shape of y_train = ', y_train.shape)
print('Shape of X_test = ', X_test.shape)
print('Shape of y_test = ', y_test.shape)

"""## Train Random Forest Classification Model"""

from sklearn.ensemble import RandomForestClassifier

# 100 trees, Gini impurity as the split criterion.
classifier = RandomForestClassifier(criterion='gini', n_estimators=100)
classifier.fit(X_train, y_train)

# Accuracy on the held-out rows (displayed only in a notebook cell).
classifier.score(X_test, y_test)

"""## Predict Cancer"""

# Feature values for a single patient, in the same order as the feature
# columns used for training (30 values).
patient1 = [
    17.99, 10.38, 122.8, 1001.0, 0.1184,
    0.2776, 0.3001, 0.1471, 0.2419, 0.07871,
    1.095, 0.9053, 8.589, 153.4, 0.006399,
    0.04904, 0.05373, 0.01587, 0.03003, 0.006193,
    25.38, 17.33, 184.6, 2019.0, 0.1622,
    0.6656, 0.7119, 0.2654, 0.4601, 0.1189,
]

# predict() expects a 2-D input: one row per sample.
patient1 = np.array([patient1])
patient1

classifier.predict(patient1)

# Class names in label order; per the branch below, label 0 is the
# malignant class.
data.target_names

pred = classifier.predict(patient1)

if pred[0] == 0:
    print('Patient has Cancer (malignant tumor)')
else:
    # The original message read "(malignant benign)", which contradicts
    # itself; this branch is the benign outcome.
    print('Patient has no Cancer (benign tumor)')

"""# Save Model

## Save Model using Pickle


"""
import pickle

# Write the trained classifier to disk. The context manager closes the file
# handle (and flushes the data) even if pickling fails.
with open('model_save', 'wb') as model_file:
    pickle.dump(classifier, model_file)

# NOTE: unpickling can execute arbitrary code; only load files you created
# yourself or otherwise trust.
with open('model_save', 'rb') as model_file:
    model = pickle.load(model_file)

# The restored model should give the same prediction as the original.
model.predict(patient1)[0]

"""## Save Model using Joblib"""

import joblib

# joblib takes a file name directly and manages the file handle itself.
joblib.dump(classifier, 'model_save2')

model2 = joblib.load('model_save2')

model2.predict(patient1)

Recommendation system:
1.Collaborative filtering:-
import pandas as pd
import numpy as np

# Load the u.user file into a dataframe
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']

users = pd.read_csv('/content/u.user', sep='|', names=u_cols,
                    encoding='latin-1')

users.head(3)

# Load the u.item file into a dataframe
i_cols = [
    'movie_id', 'title', 'release date', 'video release date', 'IMDb URL',
    'unknown', 'Action', 'Adventure', 'Animation', 'Children\'s', 'Comedy',
    'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
    'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
]

movies = pd.read_csv('/content/u.item', sep='|', names=i_cols,
                     encoding='latin-1')

movies.head(2)

# Remove all information except Movie ID and title
movies = movies[['movie_id', 'title']]

# Load the u.data file into a dataframe (tab-separated)
r_cols = ['user_id', 'movie_id', 'rating', 'timestamp']

ratings = pd.read_csv('/content/u.data', sep='\t', names=r_cols,
                      encoding='latin-1')

ratings.head(2)

# Drop the timestamp column
ratings = ratings.drop('timestamp', axis=1)
# Import the train_test_split function
from sklearn.model_selection import train_test_split

# Assign X as the original ratings dataframe and y as the user_id column of
# ratings.
X = ratings.copy()
y = ratings['user_id']

# Split into training and test datasets, stratified along user_id (25% of
# the ratings are held out for testing).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, stratify=y, random_state=42
)
# Import the mean_squared_error function
from sklearn.metrics import mean_squared_error


def rmse(y_true, y_pred):
    """Return the root mean squared error between y_true and y_pred."""
    return np.sqrt(mean_squared_error(y_true, y_pred))
def baseline(user_id, movie_id):
    """Baseline model: always predict a rating of 3.0.

    Both arguments are ignored; they are accepted so the function has the
    same (user_id, movie_id) signature as the other rating models.
    """
    return 3.0
def score(cf_model):
    """Compute the RMSE obtained on the testing set by a model.

    ``cf_model`` is called as ``cf_model(user_id, movie_id)`` for every row
    of ``X_test`` and must return the predicted rating for that pair.
    """
    # Construct an iterable of user-movie tuples from the testing dataset
    id_pairs = zip(X_test['user_id'], X_test['movie_id'])

    # Predict the rating for every user-movie tuple
    y_pred = np.array([cf_model(user, movie) for (user, movie) in id_pairs])

    # Extract the actual ratings given by the users in the test data
    y_true = np.array(X_test['rating'])

    # Return the final RMSE score
    return rmse(y_true, y_pred)


score(baseline)
# Ratings Matrix
# Build the ratings matrix using pivot_table function: one row per user, one
# column per movie, cell = rating (NaN where a user has not rated a movie).
r_matrix = X_train.pivot_table(values='rating', index='user_id',
                               columns='movie_id')

r_matrix.head()
def cf_user_mean(user_id, movie_id):
    """User-based collaborative filter using mean ratings.

    Predicts the mean of all training ratings given to ``movie_id``;
    ``user_id`` is not used. Falls back to 3.0 when the movie does not
    appear in ``r_matrix``.
    """
    # Check if movie_id exists in r_matrix
    if movie_id in r_matrix:
        # Compute the mean of all the ratings given to the movie
        mean_rating = r_matrix[movie_id].mean()
    else:
        # Default to a rating of 3.0 in the absence of any information
        mean_rating = 3.0

    return mean_rating


# Compute RMSE for the Mean model
score(cf_user_mean)
# Weighted Mean
# Create a dummy ratings matrix with all null values imputed to 0
r_matrix_dummy = r_matrix.copy().fillna(0)

# Import cosine_similarity
from sklearn.metrics.pairwise import cosine_similarity

# Compute the cosine similarity matrix using the dummy ratings matrix
cosine_sim = cosine_similarity(r_matrix_dummy, r_matrix_dummy)

# Convert into pandas dataframe, labelled by user on both axes
cosine_sim = pd.DataFrame(cosine_sim, index=r_matrix.index,
                          columns=r_matrix.index)

cosine_sim.head(10)
def cf_user_wmean(user_id, movie_id):
    """User-based collaborative filter using similarity-weighted mean ratings.

    The prediction is the average of the training ratings given to
    ``movie_id``, each weighted by the rater's cosine similarity to
    ``user_id``. Falls back to 3.0 when the movie or the user is missing from
    the training matrices, when nobody rated the movie, or when the
    similarity weights do not sum to a positive number.
    """
    # Default to a rating of 3.0 in the absence of any information
    if movie_id not in r_matrix or user_id not in cosine_sim:
        return 3.0

    # Keep only the users who actually rated the movie. Bailing out whenever
    # the column contains any NaN would discard the prediction as soon as a
    # single user has not rated the movie.
    m_ratings = r_matrix[movie_id].dropna()

    # Similarity of the user in question to exactly those raters
    sim_scores = cosine_sim[user_id].loc[m_ratings.index]

    # Guard the division below: an empty, zero or NaN weight sum gives no
    # usable weighted mean.
    total_similarity = sim_scores.sum()
    if m_ratings.empty or not total_similarity > 0:
        return 3.0

    # Compute the final weighted mean
    return np.dot(sim_scores, m_ratings) / total_similarity


# Now, re-run the score function
score(cf_user_wmean)
Content based recommendation system: -
Output
Hybrid recommendation system:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

# Sample user-item interaction data
user_item_data = {
    'user_id': [1, 1, 2, 2, 3, 3, 4, 4],
    'item_id': ['A', 'B', 'A', 'C', 'B', 'C', 'D', 'E'],
}

df_user_item = pd.DataFrame(user_item_data)

# Sample item content data
item_content_data = {
    'item_id': ['A', 'B', 'C', 'D', 'E'],
    'description': ['Action movie', 'Drama movie', 'Comedy movie',
                    'Science fiction book', 'Mystery book'],
}

df_item_content = pd.DataFrame(item_content_data)

# Collaborative Filtering: user x item table of interaction counts.
# crosstab counts (user, item) pairs directly; pivot_table is not suitable
# here because the frame has no third column to aggregate.
user_item_matrix = pd.crosstab(df_user_item['user_id'],
                               df_user_item['item_id'])

# Content-Based Filtering: item x item similarity of the bag-of-words
# descriptions.
vectorizer = CountVectorizer()
item_description_matrix = vectorizer.fit_transform(
    df_item_content['description']
)
cosine_similarities = cosine_similarity(item_description_matrix,
                                        item_description_matrix)


# Hybrid Recommendation
def hybrid_recommendation(user_id, item_id):
    """Rank all items for a user by a blend of two signals.

    - Collaborative score: interactions of every user, weighted by that
      user's cosine similarity to ``user_id`` and scaled so the best item
      scores 1.
    - Content score: description similarity of every item to ``item_id``.

    The two are mixed 70% / 30%. Returns the item labels of
    ``user_item_matrix`` ordered from highest to lowest hybrid score.
    """
    interactions = user_item_matrix.values

    # Collaborative Filtering: similarity of this user to every user, shape
    # (n_users,), then one score per item, shape (n_items,).
    user_ratings = user_item_matrix.loc[user_id].values.reshape(1, -1)
    user_similarity = cosine_similarity(user_ratings, interactions)[0]
    collaborative_score = user_similarity.dot(interactions)

    # Scale to at most 1 so it is comparable with the content similarity
    # before the weights are applied.
    peak = collaborative_score.max()
    if peak > 0:
        collaborative_score = collaborative_score / peak

    # Content-Based Filtering: similarity of every item to the given item,
    # re-ordered to follow the columns of user_item_matrix.
    item_index = df_item_content[df_item_content['item_id'] == item_id].index[0]
    content_by_item = pd.Series(cosine_similarities[item_index],
                                index=df_item_content['item_id'])
    content_score = content_by_item.reindex(user_item_matrix.columns,
                                            fill_value=0.0).values

    # Hybrid Score: both inputs now have one entry per item.
    hybrid_score = 0.7 * collaborative_score + 0.3 * content_score

    # Get recommended items, best first
    recommended_items = user_item_matrix.columns[np.argsort(hybrid_score)[::-1]]

    return recommended_items


# Example usage
user_id = 1
item_id = 'B'
recommendations = hybrid_recommendation(user_id, item_id)

print(f"Recommendations for user {user_id} based on item {item_id}: {recommendations}")

Hyperparameter tuning: -
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_iris
from sklearn.metrics import accuracy_score
# Hyperparameter-tuning demo: grid-search a random forest on the iris data
# with 3-fold cross-validation, then score the best model on held-out data.

# Sample data set (iris).
iris = load_iris()
X = iris.data
y = iris.target

# Hold out 20% of the samples for testing.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Model and the grid of hyperparameter values to try.
model = RandomForestClassifier()
param_grid = dict(
    n_estimators=[50, 100, 150],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Search object: accuracy as the selection metric, 3 folds per candidate.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='accuracy',
    cv=3,
)

# Run the search on the training split.
grid_search.fit(X_train, y_train)

# Best combination found.
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

# Predict the held-out samples with the best estimator.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# Accuracy on the held-out samples.
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

You might also like