Data Mining Practicals
Underfitting
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error
# Example data: a noisy quadratic that a low-degree polynomial will underfit
X = np.linspace(-3, 3, 100).reshape(-1, 1)
y = 0.5 * X.ravel() ** 2 + X.ravel() + np.random.randn(100)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Fit a degree-1 (linear) model on the training data
poly_features = PolynomialFeatures(degree=1)
X_train_poly = poly_features.fit_transform(X_train)
lin_reg = LinearRegression().fit(X_train_poly, y_train)
X_test_poly = poly_features.transform(X_test)
y_test_pred = lin_reg.predict(X_test_poly)
mse_test = mean_squared_error(y_test, y_test_pred)
print(f'Mean Squared Error on Testing Data: {mse_test}')
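matplotlib is imported above but not used; a short plotting step (a sketch reusing X_test, y_test, and y_test_pred from the listing above) makes the underfit visible:

# Visualize the underfit: the straight-line predictions miss the curvature of the data
plt.scatter(X_test, y_test, label='Actual')
plt.scatter(X_test, y_test_pred, color='red', label='Predicted (underfit model)')
plt.xlabel('X')
plt.ylabel('y')
plt.legend()
plt.show()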
Cross validation:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_iris
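Continuing from the imports above, a minimal sketch that evaluates a random forest on the Iris data with 5-fold cross-validation (the fold count and random_state values are illustrative choices, not fixed by the practical):

# Load the data and evaluate a random forest with 5-fold cross-validation
iris = load_iris()
kf = KFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(RandomForestClassifier(random_state=42), iris.data, iris.target, cv=kf, scoring='accuracy')
print('Accuracy per fold:', scores)
print('Mean accuracy:', scores.mean())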
Confusion matrix
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_iris
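Continuing from the imports above, a minimal sketch that trains a random forest on an Iris train/test split and prints its confusion matrix (the split size and random_state are illustrative):

# Train a classifier and compute the confusion matrix on the held-out test set
iris = load_iris()
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.2, random_state=42)
clf = RandomForestClassifier(random_state=42).fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(confusion_matrix(y_test, y_pred))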
Gradient descent:
import numpy as np
import matplotlib.pyplot as plt
# Set hyperparameters
learning_rate = 0.01
n_iterations = 1000
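A minimal batch gradient descent sketch for simple linear regression, reusing the learning_rate and n_iterations set above; the synthetic data and the X_b/theta names are illustrative assumptions:

# Synthetic linear data (replace with your own dataset)
X = 2 * np.random.rand(100, 1)
y = 4 + 3 * X[:, 0] + np.random.randn(100)

# Add a bias column and start from zero parameters
X_b = np.c_[np.ones((100, 1)), X]
theta = np.zeros(2)

# Repeatedly step against the gradient of the mean squared error cost
for iteration in range(n_iterations):
    gradients = (2 / len(X_b)) * X_b.T.dot(X_b.dot(theta) - y)
    theta -= learning_rate * gradients

print('Learned parameters (intercept, slope):', theta)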
RandomizedSearchCV:
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_iris

# Load dataset (you can replace this with your own dataset)
iris = load_iris()
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.2, random_state=42)

# Define the estimator and an example search space (adjust to your needs)
rf = RandomForestClassifier(random_state=42)
param_dist = {'n_estimators': [50, 100, 200, 400], 'max_depth': [None, 5, 10, 20]}

# Initialize RandomizedSearchCV and fit it on the training data
random_search = RandomizedSearchCV(estimator=rf, param_distributions=param_dist, n_iter=10, cv=3, scoring='accuracy', random_state=42)
random_search.fit(X_train, y_train)
Loss function:-
import numpy as np

def mean_squared_error(y_true, y_pred):
    """
    Compute the mean squared error (MSE) loss.

    Parameters:
    - y_true: numpy array, true values
    - y_pred: numpy array, predicted values

    Returns:
    - mse: float, mean squared error
    """
    # Ensure the input arrays have the same shape
    assert y_true.shape == y_pred.shape, "Input arrays must have the same shape"

    # Average of the squared differences
    mse = np.mean((y_true - y_pred) ** 2)
    return mse

# Example usage:
# Replace these arrays with your actual true and predicted values
true_values = np.array([1.0, 2.0, 3.0, 4.0, 5.0])
predicted_values = np.array([1.5, 2.5, 2.8, 3.7, 4.2])
print(f"Mean Squared Error: {mean_squared_error(true_values, predicted_values)}")
import numpy as np

def stochastic_gradient_descent(X, y, learning_rate=0.01, epochs=100, batch_size=1):
    """
    Fit linear-regression parameters with mini-batch stochastic gradient descent.

    Parameters:
    - X: numpy array, feature matrix
    - y: numpy array, target values
    - learning_rate: float, step size for updating parameters
    - epochs: int, number of passes through the entire dataset
    - batch_size: int, number of samples in each mini-batch

    Returns:
    - theta: numpy array, learned parameters
    - cost_history: list, history of cost during optimization
    """
    # Initialize parameters
    num_samples, num_features = X.shape
    theta = np.zeros(num_features)
    cost_history = []

    # Stochastic Gradient Descent
    for epoch in range(epochs):
        for i in range(0, num_samples, batch_size):
            X_batch = X[i:i + batch_size]
            y_batch = y[i:i + batch_size]

            # Compute predictions
            predictions = np.dot(X_batch, theta)

            # Compute errors
            errors = predictions - y_batch

            # Update parameters
            gradient = np.dot(X_batch.T, errors) / batch_size
            theta -= learning_rate * gradient

        # Track the mean squared error cost over the full dataset
        cost = np.mean((np.dot(X, theta) - y) ** 2)
        cost_history.append(cost)

        if epoch % 10 == 0:
            print(f"Epoch {epoch}/{epochs}, Cost: {cost}")

    return theta, cost_history

# Example usage:
# Replace these arrays with your actual feature matrix (X) and target values (y)
X = np.array([[1, 2], [1, 3], [1, 4]])
y = np.array([5, 6, 7])

# Set hyperparameters
learning_rate = 0.01
epochs = 100
batch_size = 1

theta, cost_history = stochastic_gradient_descent(X, y, learning_rate, epochs, batch_size)
print("Learned parameters:", theta)
# import libraries
import numpy as np
import pandas as pd
#load dataset
from sklearn.datasets import load_breast_cancer
data = load_breast_cancer()
data.data
data.feature_names
data.target
data.target_names
# create dataframe
df = pd.DataFrame(np.c_[data.data, data.target],
                  columns=list(data.feature_names) + ['target'])
df.head()
df.tail()
df.shape
X = df.iloc[:, 0:-1]
y = df.iloc[:, -1]
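The prediction cells further down call classifier.predict, so a model has to be trained on X and y first; a minimal sketch, assuming a logistic regression classifier (any sklearn classifier with fit/predict would work here):

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Split the data and train the classifier used by the prediction cells below
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
classifier = LogisticRegression(max_iter=5000)
classifier.fit(X_train, y_train)
print('Test accuracy:', classifier.score(X_test, y_test))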
# Feature vector for one patient (30 values, in the same order as data.feature_names)
patient1 = np.array([[17.99, 10.38, 122.8, 1001.0, 0.1184, 0.2776, 0.3001, 0.1471,
                      0.2419, 0.07871, 1.095, 0.9053, 8.589, 153.4, 0.006399, 0.04904,
                      0.05373, 0.01587, 0.03003, 0.006193, 25.38, 17.33, 184.6, 2019.0,
                      0.1622, 0.6656, 0.7119, 0.2654, 0.4601, 0.1189]])
patient1
classifier.predict(patient1)
data.target_names
pred = classifier.predict(patient1)
if pred[0] == 0:
    print('Patient has Cancer (malignant tumor)')
else:
    print('Patient has no Cancer (benign tumor)')
classifier.predict(patient1)[0]
import joblib
joblib.dump(classifier, 'model_save2')
model2 = joblib.load('model_save2')
model2.predict(patient1)
Recommendation system:
Collaborative filtering:-

import pandas as pd
import numpy as np

# Load the u.user file into a dataframe
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv('/content/u.user', sep='|', names=u_cols, encoding='latin-1')
users.head(3)

# Load the u.item file into a dataframe
i_cols = ['movie_id', 'title', 'release date', 'video release date', 'IMDb URL', 'unknown',
          'Action', 'Adventure', 'Animation', 'Children\'s', 'Comedy', 'Crime', 'Documentary',
          'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance',
          'Sci-Fi', 'Thriller', 'War', 'Western']
movies = pd.read_csv('/content/u.item', sep='|', names=i_cols, encoding='latin-1')
movies.head(2)

# Remove all information except Movie ID and title
movies = movies[['movie_id', 'title']]

# Load the u.data file into a dataframe
r_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv('/content/u.data', sep='\t', names=r_cols, encoding='latin-1')
ratings.head(2)

# Drop the timestamp column
ratings = ratings.drop('timestamp', axis=1)

# Import the train_test_split function
from sklearn.model_selection import train_test_split

# Assign X as the original ratings dataframe and y as the user_id column of ratings
X = ratings.copy()
y = ratings['user_id']

# Split into training and test datasets, stratified along user_id
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, stratify=y, random_state=42)

# Import the mean_squared_error function
from sklearn.metrics import mean_squared_error

# Function that computes the root mean squared error (or RMSE)
def rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

# Define the baseline model to always return 3
def baseline(user_id, movie_id):
    return 3.0

# Function to compute the RMSE score obtained on the testing set by a model
def score(cf_model):
    # Construct a list of user-movie tuples from the testing dataset
    id_pairs = zip(X_test['user_id'], X_test['movie_id'])

    # Predict the rating for every user-movie tuple
    y_pred = np.array([cf_model(user, movie) for (user, movie) in id_pairs])

    # Extract the actual ratings given by the users in the test data
    y_true = np.array(X_test['rating'])

    # Return the final RMSE score
    return rmse(y_true, y_pred)

score(baseline)

# Ratings Matrix
# Build the ratings matrix using the pivot_table function
r_matrix = X_train.pivot_table(values='rating', index='user_id', columns='movie_id')
r_matrix.head()

# User Based Collaborative Filter using Mean Ratings
def cf_user_mean(user_id, movie_id):
    # Check if movie_id exists in r_matrix
    if movie_id in r_matrix:
        # Compute the mean of all the ratings given to the movie
        mean_rating = r_matrix[movie_id].mean()
    else:
        # Default to a rating of 3.0 in the absence of any information
        mean_rating = 3.0
    return mean_rating

# Compute RMSE for the Mean model
score(cf_user_mean)

# Weighted Mean
# Create a dummy ratings matrix with all null values imputed to 0
r_matrix_dummy = r_matrix.copy().fillna(0)

# Import cosine_similarity
from sklearn.metrics.pairwise import cosine_similarity

# Compute the cosine similarity matrix using the dummy ratings matrix
cosine_sim = cosine_similarity(r_matrix_dummy, r_matrix_dummy)

# Convert into a pandas dataframe
cosine_sim = pd.DataFrame(cosine_sim, index=r_matrix.index, columns=r_matrix.index)
cosine_sim.head(10)

# User Based Collaborative Filter using Weighted Mean Ratings
def cf_user_wmean(user_id, movie_id):
    # Check if movie_id exists in r_matrix
    if movie_id in r_matrix:
        # Get the similarity scores for the user in question with every other user
        sim_scores = cosine_sim[user_id]

        # Get the user ratings for the movie in question
        m_ratings = r_matrix[movie_id]

        # Extract the indices containing NaN in the m_ratings series
        idx = m_ratings[m_ratings.isnull()].index

        # Drop the NaN values from the m_ratings series
        m_ratings = m_ratings.dropna()

        # Drop the corresponding cosine scores from the sim_scores series
        sim_scores = sim_scores.drop(idx)

        # Compute the final weighted mean
        # (default to 3.0 if no similar user has rated this movie)
        if sim_scores.sum() > 0:
            wmean_rating = np.dot(sim_scores, m_ratings) / sim_scores.sum()
        else:
            wmean_rating = 3.0
    else:
        # Default to a rating of 3.0 in the absence of any information
        wmean_rating = 3.0
    return wmean_rating

# Now, re-run the score function
score(cf_user_wmean)
Content based recommendation system: -
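A minimal content-based sketch, assuming a small hypothetical item catalogue and TF-IDF vectors over the item descriptions; the items dataframe, the recommend helper, and its top_n parameter are illustrative assumptions, not the practical's actual code:

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Hypothetical item catalogue with text descriptions (replace with real data)
items = pd.DataFrame({'title': ['Movie A', 'Movie B', 'Movie C', 'Movie D'],
                      'description': ['space adventure with aliens',
                                      'romantic comedy set in Paris',
                                      'space opera and adventure',
                                      'crime thriller in the city']})

# Represent each description as a TF-IDF vector and compare items pairwise
tfidf_matrix = TfidfVectorizer(stop_words='english').fit_transform(items['description'])
similarity = cosine_similarity(tfidf_matrix)

def recommend(title, top_n=2):
    # Return the titles whose descriptions are most similar to the given title
    idx = items.index[items['title'] == title][0]
    scores = sorted(enumerate(similarity[idx]), key=lambda s: s[1], reverse=True)
    return [items['title'][i] for i, _ in scores[1:top_n + 1]]

print(recommend('Movie A'))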
Hybrid recommendation system:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

# Example interaction and item-content data (replace with your own)
user_item_data = {'user_id': [1, 1, 2, 2, 3, 3],
                  'item_id': ['A', 'B', 'B', 'C', 'A', 'C'],
                  'rating':  [5, 3, 4, 2, 4, 5]}
item_content_data = {'item_id': ['A', 'B', 'C'],
                     'description': ['action adventure space',
                                     'romantic comedy drama',
                                     'action thriller crime']}

df_user_item = pd.DataFrame(user_item_data)
df_item_content = pd.DataFrame(item_content_data)

# Collaborative Filtering: user-item interaction matrix (counts of interactions)
user_item_matrix = df_user_item.pivot_table(index='user_id', columns='item_id',
                                            values='rating', aggfunc=len, fill_value=0)

# Content-Based Filtering: vectorize item descriptions and compare them pairwise
vectorizer = CountVectorizer()
item_description_matrix = vectorizer.fit_transform(df_item_content['description'])
cosine_similarities = cosine_similarity(item_description_matrix, item_description_matrix)

# Hybrid Recommendation
def hybrid_recommendation(user_id, item_id, top_n=2):
    # Collaborative Filtering: weight each item by the similarity of the users who chose it
    user_ratings = user_item_matrix.loc[user_id].values.reshape(1, -1)
    user_similarity = cosine_similarity(user_item_matrix.values, user_ratings).ravel()
    collaborative_score = user_item_matrix.T.values.dot(user_similarity)

    # Content-Based Filtering: similarity of every item to the given item
    # (assumes df_item_content rows are in the same item order as user_item_matrix columns)
    item_index = df_item_content[df_item_content['item_id'] == item_id].index[0]
    content_score = cosine_similarities[item_index]

    # Hybrid Score: weighted combination of both signals
    hybrid_score = 0.7 * collaborative_score + 0.3 * content_score

    # Rank the items by hybrid score and return the top-n item ids
    ranked = sorted(zip(hybrid_score, user_item_matrix.columns), reverse=True)
    recommended_items = [item for _, item in ranked][:top_n]
    return recommended_items

# Example usage
user_id = 1
item_id = 'B'
recommendations = hybrid_recommendation(user_id, item_id)
print(recommendations)
Hyperparameter tuning: -
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_iris
from sklearn.metrics import accuracy_score

# Load a sample dataset (Iris dataset in this case)
iris = load_iris()
X, y = iris.data, iris.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the model and an example parameter grid (adjust to your needs)
model = RandomForestClassifier(random_state=42)
param_grid = {'n_estimators': [50, 100, 200], 'max_depth': [None, 5, 10]}

# Initialize GridSearchCV and fit it on the training data
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Evaluate the best estimator on the held-out test set
y_pred = grid_search.best_estimator_.predict(X_test)
print('Best parameters:', grid_search.best_params_)
print('Test accuracy:', accuracy_score(y_test, y_pred))