import pandas as pd # To play with data tables
import matplotlib.pyplot as plt # To visualize data
import numpy as np
import copy
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import Ridge
from sklearn.datasets import make_regression
from sklearn.model_selection import KFold
!gdown 1i5c01hhj04J816siI-6u3ea32vI-HToy
Downloading...
From: [Link]
To: /content/alloy-confp-train-data.csv
100% 7.22k/7.22k [00:00<00:00, 27.3MB/s]
data = pd.read_csv('/content/alloy-confp-train-data.csv')
data = data.sample(frac=1)  # shuffle the rows
data.to_csv('/content/alloy-confp-train-data.csv')
data.shape
(120, 8)
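# (not in the original notebook) The six 'C.'-prefixed composition columns and
# the 'HV' target are among the 8 columns; listing them shows what is there:
print(data.columns.tolist())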
Xcols = data.columns[data.columns.str.startswith('C.')]  # composition-fraction columns
X = data[Xcols]
X
(output: preview of X — 120 rows × 6 columns of 'C.'-prefixed composition fractions)
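# Quick sanity check (not in the original notebook): each row is one alloy's
# atomic fractions, so the six composition columns should sum to ~1.
row_sums = X.sum(axis=1)
print(row_sums.min(), row_sums.max())  # expect both ~1.0 up to rounding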
from matplotlib import pyplot as plt
X[Xcols[0]].plot(kind='hist', bins=20, title=Xcols[0])  # first composition column
plt.gca().spines[['top', 'right']].set_visible(False)
y = data['HV']
fig, (ax1, ax2, ax3, ax4) = plt.subplots(nrows=1, ncols=4, figsize=(12, 3.5))
ax1.hist(y.values, bins=20)
ax2.hist(X.values[:, 0], bins=20, label=Xcols[0])
ax3.hist(X.values[:, 1], bins=20, label=Xcols[1])
ax4.hist(X.values[:, 2], bins=20, label=Xcols[2])
ax1.set_xlabel('HV', fontsize=14)
ax2.set_xlabel(Xcols[0], fontsize=14)
ax3.set_xlabel(Xcols[1], fontsize=14)
ax4.set_xlabel(Xcols[2], fontsize=14)
ax1.set_ylabel('Frequency', fontsize=14)
ax2.set_ylabel('Frequency', fontsize=14)
ax3.set_ylabel('Frequency', fontsize=14)
ax4.set_ylabel('Frequency', fontsize=14)
Text(0, 0.5, 'Frequency')
# First we will define a function to make parity plots. This will keep the code simpler.
def plot_parity(y_cv_test, y_pred_test, y_cv_train=None, y_pred_train=None, label=None, ylim=[50, 900]):
    """
    Function to make parity plots.
    """
    # Test-set metrics
    rmse_test = np.sqrt(mean_squared_error(y_cv_test, y_pred_test))
    r2_test = r2_score(y_cv_test, y_pred_test)
    if y_cv_train is None:
        fig, ax1 = plt.subplots(nrows=1, ncols=1, figsize=(5, 4), sharey=True, sharex=True)
    else:
        fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(9, 4), sharey=True, sharex=True)
    ax1.scatter(y_cv_test, y_pred_test)
    ax1.text(0.95, 0.26, label, transform=ax1.transAxes, ha='right', fontsize=14)
    ax1.text(0.95, 0.18, "RMSE: %.2f" % rmse_test, transform=ax1.transAxes, ha='right', fontsize=14)
    ax1.text(0.95, 0.1, "R$^2$: %.2f" % r2_test, transform=ax1.transAxes, ha='right', fontsize=14)
    ax1.plot(ylim, ylim, '--k')  # y = x reference line
    ax1.set_xlabel('True y', fontsize=14)
    ax1.set_ylabel('Pred y', fontsize=14)
    ax1.set_xlim(ylim[0], ylim[1])
    ax1.set_ylim(ylim[0], ylim[1])
    if y_cv_train is not None:
        # Train-set metrics on the second panel
        rmse_train = np.sqrt(mean_squared_error(y_cv_train, y_pred_train))
        r2_train = r2_score(y_cv_train, y_pred_train)
        ax2.scatter(y_cv_train, y_pred_train, c='m')
        ax2.text(0.95, 0.26, "Train", transform=ax2.transAxes, ha='right', fontsize=14)
        ax2.text(0.95, 0.18, "RMSE: %.2f" % rmse_train, transform=ax2.transAxes, ha='right', fontsize=14)
        ax2.text(0.95, 0.1, "R$^2$: %.2f" % r2_train, transform=ax2.transAxes, ha='right', fontsize=14)
        ax2.plot(ylim, ylim, '--k')
        ax2.set_xlabel('True y', fontsize=14)
        ax2.set_xlim(ylim[0], ylim[1])
        ax2.set_ylim(ylim[0], ylim[1])
    plt.tight_layout()
    plt.show()
    return None
# Closed-form least-squares solution via the normal equation
X_trans_X_inv = np.linalg.inv(np.dot(X.T, X))
X_trans_y = np.dot(X.T, y)
w_cap_vec = np.dot(X_trans_X_inv, X_trans_y)
y_pred_manual = np.dot(X, w_cap_vec)
plot_parity(y, y_pred_manual, label="Train")
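# The cell above implements the normal equation w = (X^T X)^{-1} X^T y.
# A sketch (not from the notebook) of more numerically stable equivalents
# that avoid forming an explicit inverse:
w_solve = np.linalg.solve(np.dot(X.T, X), np.dot(X.T, y))  # solve the normal equations
w_lstsq, *_ = np.linalg.lstsq(X, y, rcond=None)            # SVD-based least squares
print(np.allclose(w_solve, w_cap_vec), np.allclose(w_lstsq, w_cap_vec))  # expect True True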
lr = LinearRegression(fit_intercept=False)
model = lr.fit(X, y)
lr_model = copy.deepcopy(model)
y_pred = lr_model.predict(X)
plot_parity(y, y_pred, label="Train")
print("Sklearn model: ", lr_model.coef_)
print("Eq. based model: ", w_cap_vec)
Sklearn model:  [1589.03703891  154.02145017  647.00169133  279.68594241  204.32826373
 -241.42532589]
Eq. based model:  [1589.03703891  154.02145017  647.00169133  279.68594241  204.32826373
 -241.42532589]
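# The two coefficient vectors agree because LinearRegression(fit_intercept=False)
# solves exactly the same no-intercept least-squares problem as the normal
# equation above. A one-line check (sketch):
print(np.allclose(lr_model.coef_, w_cap_vec))  # expect True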
y_avg_pred = [y.mean()] * len(y)  # baseline: always predict the mean of y
print("Root mean squared error: %.2f" % np.sqrt(mean_squared_error(y, y_avg_pred)))
# The coefficient of determination: 1 is perfect prediction
print("Coefficient of determination: %.2f" % r2_score(y, y_avg_pred))
Root mean squared error: 186.35
Coefficient of determination: 0.00
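# R^2 = 0 here by construction: R^2 = 1 - SS_res/SS_tot, and for the
# mean-only predictor SS_res equals SS_tot. A sketch of the same numbers
# computed by hand:
ss_res = np.sum((y - y_avg_pred) ** 2)
ss_tot = np.sum((y - y.mean()) ** 2)
print(np.sqrt(ss_res / len(y)), 1 - ss_res / ss_tot)  # ~186.35, 0.0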
rmse_avg = []
for i in range(2, 12):
    kf = KFold(i)
    rmses = []
    for idx, (train, test) in enumerate(kf.split(X)):
        X_cv_train = X.iloc[train]
        X_cv_test = X.iloc[test]
        y_cv_train = y.iloc[train]
        y_cv_test = y.iloc[test]
        # Model fit and prediction
        model = lr.fit(X_cv_train, y_cv_train)
        y_pred_test = model.predict(X_cv_test)
        y_pred_train = model.predict(X_cv_train)
        # Computing errors
        rmse_test = np.sqrt(mean_squared_error(y_cv_test, y_pred_test))
        rmse_train = np.sqrt(mean_squared_error(y_cv_train, y_pred_train))
        r2_test = r2_score(y_cv_test, y_pred_test)
        r2_train = r2_score(y_cv_train, y_pred_train)
        # Plot Parity plot
        # plot_parity(y_cv_test, y_pred_test, y_cv_train, y_pred_train)
        # print("Root mean squared error: %.2f" % rmse_test)
        # print("Coefficient of determination: %.2f" % r2_test)
        rmses.append(rmse_test)
    rmse_avg.append(sum(rmses) / len(rmses))
rmse_avg
[90.90410023685317,
86.94150026120126,
86.89238900991143,
86.44250692124533,
87.09357006766437,
87.02679087151705,
85.82328460961962,
85.80805780460595,
86.6502684030654,
87.13891194799572]
x = range(2, 12)
plt.plot(x, rmse_avg)
plt.title('Average Validation RMSE vs. Number of Folds (k)')
plt.xlabel('Number of Folds (k)')
plt.ylabel('Average Validation RMSE')
plt.grid(True)
plt.show()
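# The averages above sit in a narrow band (~86-91), so the estimate is not
# very sensitive to k here. For reference, a sketch (not from the notebook)
# of the same k=5 estimate using sklearn's cross_val_score:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(LinearRegression(fit_intercept=False), X, y,
                         cv=KFold(5), scoring='neg_root_mean_squared_error')
print(-scores.mean())  # should match rmse_avg[3] (the k=5 entry)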
X.shape[1]
6
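# X.shape[1] = 6 features; used below to size the gradient-descent weight vector.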
num_iterations = 1000
learning_rate = 0.01
weights = np.zeros(X.shape[1])
bias = 0
def cost_function(weights, bias, X, y):
    # RMSE of the current linear model
    predictions = np.dot(X, weights) + bias
    cost = np.sqrt(np.mean((predictions - y) ** 2))
    return cost
def gradient_descent(weights, bias, X, y, learning_rate, num_iterations):
    for i in range(num_iterations):
        predictions = np.dot(X, weights) + bias
        # Gradients of the (unnormalized) squared-error loss
        gradient_weights = np.dot(X.T, (predictions - y))
        gradient_bias = np.sum(predictions - y)
        weights -= learning_rate * gradient_weights
        bias -= learning_rate * gradient_bias
        if i % 100 == 0:
            print(f"Iteration {i}: Cost = {cost_function(weights, bias, X, y)}")
    return weights, bias
weights, bias = gradient_descent(weights, bias, X, y, learning_rate, num_iterations)
print(f"Optimal weights: {weights}")
print(f"Optimal bias: {bias}")
Iteration 0: Cost = 255.67893485794872
Iteration 100: Cost = 87.65877256028017
Iteration 200: Cost = 82.03876320532915
Iteration 300: Cost = 81.60425558281177
Iteration 400: Cost = 81.55097669962204
Iteration 500: Cost = 81.54245710489937
Iteration 600: Cost = 81.54084599370569
Iteration 700: Cost = 81.5405029810298
Iteration 800: Cost = 81.54042383588254
Iteration 900: Cost = 81.54040461622849
Optimal weights: [1212.94760702 -221.99136401  270.9281074   -96.50510823 -171.89813133
 -617.39672497]
Optimal bias: 376.08439070091987
weights
array([1212.94760702, -221.99136401,  270.9281074 ,  -96.50510823,
       -171.89813133, -617.39672497])
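# The gradient-descent weights differ from the closed-form ones because this
# model also has a bias term, and the composition fractions sum to 1, so the
# bias is collinear with the features: many (weights, bias) pairs give the
# same predictions. A sketch comparing fits rather than coefficients:
y_pred_gd = np.dot(X, weights) + bias
print(np.sqrt(mean_squared_error(y, y_pred_gd)))  # ~81.5, matching the final cost above
lr_b = LinearRegression(fit_intercept=True).fit(X, y)
print(np.sqrt(mean_squared_error(y, lr_b.predict(X))))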