Laboratory 2
Title of the Laboratory Exercise: Introduction to Python library packages that support numerical
computations
1. Create an ndarray and perform element-wise operations, indexing and slicing,
NumPy arithmetic and statistical functions, and NumPy linear algebra.
import numpy as np
def main():
    """Demonstrate basic NumPy usage on small hard-coded arrays.

    Prints, in order: two 2x3 arrays, their element-wise sum, difference,
    product and quotient, a few indexing/slicing examples, some aggregate
    statistics, and finally the product, determinant and inverse of 2x2
    matrices. Takes no arguments and returns None.
    """
    # Creating NumPy arrays
    array1 = np.array([[1, 2, 3], [4, 5, 6]])
    array2 = np.array([[7, 8, 9], [10, 11, 12]])
    print("Array 1:")
    print(array1)
    print("\nArray 2:")
    print(array2)

    # Element-wise operations (both arrays have the same 2x3 shape)
    print("\nElement-wise operations:")
    print("Addition: ")
    print(array1 + array2)
    print("Subtraction: ")
    print(array1 - array2)
    print("Multiplication: ")
    print(array1 * array2)
    print("Division: ")
    print(array1 / array2)

    # Indexing and slicing
    print("\nIndexing and slicing: ")
    print("Element at (0, 1):", array1[0, 1])
    print("First row: ", array1[0, :])
    print("Second column: ", array1[:, 1])
    # Every row, columns from index 1 onwards
    print("slicing subarray: \n", array1[:, 1:])

    # Numpy arithmetic and statistical functions (computed over all elements)
    print("\nNumpy arithmetic and statistical functions: ")
    print("Sum of array1:", np.sum(array1))
    print("Mean of array2:", np.mean(array2))
    print("standard deviation of array1:", np.std(array1))
    print("Max value in array2:", np.max(array2))

    # Numpy linear algebra on two square matrices
    print("\nNumpy linear algebra: ")
    A = np.array([[1, 2], [3, 4]])
    B = np.array([[5, 6], [7, 8]])
    print("Matrix A:")
    print(A)
    print("\nMatrix B:")
    print(B)
    print("\nMatrix multiplication AB: ")
    print(np.dot(A, B))
    print("\nMatrix determinant of A:")
    print(np.linalg.det(A))
    print("\nMatrix inverse of A:")
    print(np.linalg.inv(A))


# Run the demonstration only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
2. Create a DataFrame object (e.g., from a dictionary, a list of tuples, or even NumPy's
ndarrays) and plot a Series and a DataFrame.
import pandas as pd
import matplotlib.pyplot as plt
# Build the same four-row table (Name, Age, Score) three different ways.
# Each construction rebinds `df`, so the frame built last is the one that
# remains available to later code.

# Create a DataFrame from a dictionary: keys become column names.
# The key is spelled 'Score' so that all three constructions produce
# identical column names.
data = {
    'Name': ['John', 'Anna', 'Peter', 'Linda'],
    'Age': [28, 24, 35, 32],
    'Score': [85, 90, 78, 92],
}
df = pd.DataFrame(data)
print("DataFrame:")
print(df)

# Create a DataFrame from a list of tuples: one tuple per row, with the
# column names supplied separately.
data = [('John', 28, 85), ('Anna', 24, 90), ('Peter', 35, 78), ('Linda', 32, 92)]
df = pd.DataFrame(data, columns=['Name', 'Age', 'Score'])
print("\nDataFrame from list of tuples: ")
print(df)

# Create a DataFrame from numpy's ndarrays: one array per column.
names = np.array(['John', 'Anna', 'Peter', 'Linda'])
ages = np.array([28, 24, 35, 32])
scores = np.array([85, 90, 78, 92])
df = pd.DataFrame({'Name': names, 'Age': ages, 'Score': scores})
print("\nDataFrame from numpy's ndarrays: ")
print(df)
# Plotting Series: line plot of the 'Age' column against the row index.
plt.plot(df['Age'])
plt.xlabel('Index')
plt.ylabel('Age')
plt.title('Age Distribution')
plt.show()

# Plotting DataFrame: one bar per person. The frame built last in this
# script names its columns 'Name', 'Age' and 'Score' (capital S), so the
# y column must be spelled 'Score'; the lowercase form raises KeyError.
df.plot(kind='bar', x='Name', y='Score')
plt.xlabel('Name')
plt.ylabel('Score')
plt.title('Score Distribution')
plt.show()
Laboratory 3
Title of the Laboratory Exercise: Data Exploration
a. Write a Python program to compute various summary statistics from the DataFrame.
Use the Iris sample data, which contains information on 150 Iris flowers, 50 each from one
of three Iris species: Setosa, Versicolour, and Virginica. Each flower is characterized
by five attributes:
• sepal length in centimeters
• sepal width in centimeters
• petal length in centimeters
• petal width in centimeters
• class (Setosa, Versicolour, Virginica)
import pandas as pd
from sklearn.datasets import load_iris
# Load the Iris dataset that ships with scikit-learn.
iris = load_iris()
# Alternative source, if a local CSV copy is preferred:
# iris = pd.read_csv("iris.csv")

# One row per flower: the four measurements plus the class label stored
# in iris.target.
df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
df['class'] = iris.target

print("Data overview")
print(df.head())
print("Summary stats")
print(df.describe())
# groupby('class').size() counts rows per class, not per column.
print("counts of each class")
print(df.groupby('class').size())
print("correlation matrix")
print(df.corr())
print(df.tail())
# info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() would emit a stray "None" line.
df.info()
print(df.dtypes)
print("\n Shape:", df.shape)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report
# Fetch the Iris measurements (X) and their class labels (y).
iris = load_iris()
X, y = iris.data, iris.target

# Hold out 20% of the samples for testing; the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a 3-nearest-neighbours classifier on the training portion.
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)

# Score the classifier on the held-out samples.
y_pred = knn.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')
print(
    classification_report(
        y_test,
        y_pred,
        target_names=iris.target_names,
    )
)

# Scatter plot of the first two feature columns, coloured by class label.
plt.figure(figsize=(8, 6))
plt.scatter(
    X[:, 0],
    X[:, 1],
    c=y,
    cmap='viridis',
    edgecolor='k',
)
plt.xlabel('Sepal Length (cm)')
plt.ylabel('Sepal Width (cm)')
plt.title('Iris Dataset - Sepal Length vs. Sepal Width')
plt.colorbar(label='Class')  # label is passed straight to the colorbar
plt.show()
Laboratory 4
Title of the Laboratory Exercise: Data Preprocessing
a. Write a Python program to implement different approaches for handling missing
values and other preprocessing tasks (such as outliers, duplicate data, aggregation,
sampling, discretization, etc.).
import pandas as pd
import numpy as np
# Create a sample DataFrame with missing values, outliers, and duplicate data
data = {
    'A': [1, 2, np.nan, 4, 5, 100],  # Missing value and outlier (100)
    'B': [10, np.nan, 30, 40, 50, 10],  # Missing value and duplicate (10)
    'C': ['x', 'Y', 'Z', np.nan, 'w', 'x'],  # Text column with a missing value
    'D': [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan],  # Entirely missing
}
df = pd.DataFrame(data)

# Display the original DataFrame
print("Original DataFrame:")
print(df)
print()

# Handling Missing Values
print("Handling Missing Values:")

# Drop rows with any missing value. Column 'D' is missing in every row,
# so this leaves an empty frame for the sample data.
df_dropped = df.dropna()
print("1. Dropping rows with any missing values:")
print(df_dropped)
print()

# Fill missing values: mean for numeric columns, mode for all others.
df_filled = df.copy()
for col in df_filled.columns:
    column = df_filled[col]
    if pd.api.types.is_numeric_dtype(column):
        fill_value = column.mean()
    else:
        modes = column.mode()
        if modes.empty:
            # No non-missing value to take a mode from.
            continue
        fill_value = modes.iloc[0]
    if pd.isna(fill_value):
        # Column has no usable values at all (e.g. 'D'); leave it as is.
        continue
    # Assign the result back instead of calling fillna(inplace=True) on the
    # column selection: that chained form is deprecated and does not update
    # the frame when pandas Copy-on-Write is active.
    df_filled[col] = column.fillna(fill_value)
print("2. Filling missing values:")
print(df_filled)
print()

# Handling Outliers
print("Handling Outliers:")
# Example of handling outliers by winsorization (capping)
def winsorize_series(s):
    """Cap the values of a numeric pandas Series using the 1.5 * IQR rule.

    Values below ``Q1 - 1.5 * IQR`` are replaced by that lower bound and
    values above ``Q3 + 1.5 * IQR`` by that upper bound, where Q1 and Q3 are
    the 25th and 75th percentiles of the non-missing values.

    Args:
        s: Numeric pandas Series. It is not modified.

    Returns:
        A new Series of the same length with out-of-range values capped.
        Missing values stay missing. If the quartiles cannot be computed
        (empty or all-missing input) an unchanged copy is returned.
    """
    # Series.quantile skips missing values, so one NaN in the input does
    # not turn both bounds into NaN and silently disable the capping.
    q1, q3 = s.quantile([0.25, 0.75])
    iqr = q3 - q1
    if np.isnan(iqr):
        return s.copy()
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr
    # clip() replaces values below lower_bound with lower_bound and values
    # above upper_bound with upper_bound.
    return s.clip(lower=lower_bound, upper=upper_bound)
df_outliers_handled = df_filled.copy()
# Apply winsorize_series to every float64 column; columns of any other
# dtype are left untouched.
for col in df_outliers_handled.select_dtypes(include=['float64']).columns:
    df_outliers_handled[col] = winsorize_series(df_outliers_handled[col])
print("3. Handling outliers by winsorization:")
print(df_outliers_handled)
print()

# Handling Duplicate Data
print("Handling Duplicate Data:")
# Drop rows that are exact duplicates of an earlier row
df_no_duplicates = df_outliers_handled.drop_duplicates()
print("4. Dropping duplicates:")
print(df_no_duplicates)
print()

# Aggregation (example of aggregating numeric columns)
print("Aggregation:")
# Sum 'A' and 'B' within each distinct value of 'C'; reset_index() turns
# the group key back into an ordinary column.
df_aggregated = (
    df_no_duplicates.groupby('C')
    .agg({'A': 'sum', 'B': 'sum'})
    .reset_index()
)
print("5. Aggregated DataFrame:")
print(df_aggregated)
print()

# Sampling (example of random sampling)
print("Sampling:")
# Randomly sample 3 rows; the fixed seed makes the draw repeatable
df_sampled = df_no_duplicates.sample(n=3, random_state=42)
print("6. Randomly sampled DataFrame:")
print(df_sampled)
print()

# Discretization (example of binning numeric data)
print("Discretization:")
# Discretize column 'A' of the aggregated frame into 3 labelled bins
bins = pd.cut(df_aggregated['A'], bins=3, labels=['Low', 'Medium', 'High'])
df_discretized = pd.concat([df_aggregated, bins.rename('A_bin')], axis=1)
print("7. Discretized DataFrame:")
print(df_discretized)
print()
b. Write a Python program to implement Principal Component Analysis (PCA) for image
data
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.datasets import fetch_lfw_people
# Load a sample dataset (Labeled Faces in the Wild), keeping only people
# with at least 70 images and requesting images resized by a factor of 0.4.
lfw_people = fetch_lfw_people(min_faces_per_person=70, resize=0.4)

# Extract the image data
faces = lfw_people.images
# The image array is unpacked as (number of images, height, width).
n_samples, h, w = faces.shape
# Flatten every image into one row of h * w pixel values.
X = faces.reshape((n_samples, h * w))

# Compute PCA
n_components = 150  # Number of principal components to keep
pca = PCA(n_components=n_components, svd_solver='randomized', whiten=True).fit(X)

# Project the data into the reduced-dimensional PCA space
X_pca = pca.transform(X)

# Map the projected data back to pixel space; the result approximates X
# using only the retained components.
X_inverse = pca.inverse_transform(X_pca)
# Reshape the reconstructed rows back into images.
faces_inverse = X_inverse.reshape((n_samples, h, w))

# Plot original images (top row) and reconstructed images (bottom row)
plt.figure(figsize=(12, 6))
n_images = 10  # Number of images to display
for i in range(n_images):
    # Original image: subplot positions 1..n_images
    plt.subplot(2, n_images, i + 1)
    plt.imshow(faces[i], cmap=plt.cm.gray)
    plt.title('Original')
    plt.axis('off')
    # Reconstructed image: subplot positions n_images+1..2*n_images
    plt.subplot(2, n_images, i + 1 + n_images)
    plt.imshow(faces_inverse[i], cmap=plt.cm.gray)
    plt.title('\nPCA Reconstructed')
    plt.axis('off')
# Add a single title above the whole grid of plots.
plt.suptitle(f"\nPCA Reconstruction of {n_images} Images")
plt.show()
Laboratory 5
Title of the Laboratory Exercise: Regression methods to solve problems
a. Write a Python program to implement fitting linear regression models to a
dataset.
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
# Seed NumPy's global generator so the synthetic data is repeatable.
np.random.seed(0)
# 2D array of shape (100, 1). rand() draws from [0, 1); multiplying by 2
# scales the values to the range [0, 2).
X = 2 * np.random.rand(100, 1)
# Linear relationship y = 4 + 3x plus standard-normal noise.
y = 4 + 3 * X + np.random.randn(100, 1)

# Hold out 20% of the samples for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print(f"Coefficients: {model.coef_}")
print(f"Intercept: {model.intercept_}")
# Mean squared error: average squared difference between the predicted
# values and the actual values.
print(f"Mean squared error: {mean_squared_error(y_test, y_pred)}")
# R-squared indicates how well the regression model fits the observed data.
print(f"Coefficient of determination (R^2): {r2_score(y_test, y_pred)}")

# Held-out points together with the model's predictions for them.
plt.scatter(X_test, y_test, color='black', label='Actual data')
plt.plot(X_test, y_pred, color='blue', linewidth=3, label='Fitted line')
plt.xlabel("X")
plt.ylabel('y')
plt.title('Linear Regression Fit')
plt.legend()
plt.show()
Laboratory 6
Title of the Laboratory Exercise: Classifications
a. Write a Python program to apply a decision tree classifier to the vertebrate
dataset.
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd
# Read the vertebral data set from its CSV file.
vertebral_data = pd.read_csv("/content/drive/MyDrive/column_3C_weka.csv")

# The last column is taken to be the target; every other column is a feature.
X = vertebral_data.iloc[:, :-1]
y = vertebral_data.iloc[:, -1]

# Turn the categorical target into integer codes.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    random_state=42,
)

# Build the decision tree, fit it on the training part, predict the rest.
dt_classifier = DecisionTreeClassifier(random_state=42)
dt_classifier.fit(X_train, y_train)
y_pred = dt_classifier.predict(X_test)

# Decode predictions and true labels back to the original category names
# so the report is expressed in terms of those names.
y_pred_original = label_encoder.inverse_transform(y_pred)
y_test_original = label_encoder.inverse_transform(y_test)

# Report the classifier's performance on the held-out samples.
accuracy = accuracy_score(y_test_original, y_pred_original)
print("Accuracy:", accuracy)
print("\nClassification Report:")
print(
    classification_report(
        y_test_original,
        y_pred_original,
        target_names=label_encoder.classes_,
    )
)
Laboratory 7
Title of the Laboratory Exercise: Cluster Analysis
a. Write a Python program to implement any two clustering algorithms using Python.
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
from scipy.cluster.hierarchy import dendrogram, linkage
# Synthetic 2-D data: 300 points generated around 4 centres, seeded so the
# same points are produced on every run.
X, y = make_blobs(n_samples=300, centers=4, cluster_std=0.60, random_state=0)

# K-Means with 4 clusters. random_state makes the centroid initialisation
# (and therefore the labels and plot) repeatable, matching the seeded data;
# n_init is set explicitly so the number of restarts does not depend on the
# installed scikit-learn version's default.
kmeans = KMeans(n_clusters=4, n_init=10, random_state=0)
kmeans.fit(X)  # assigns every point in X to one of the clusters
kmeans_labels = kmeans.labels_
kmeans_centers = kmeans.cluster_centers_

# Hierarchical (agglomerative) clustering using the "ward" linkage method.
Z = linkage(X, method="ward")

plt.figure(figsize=(12, 6))

# Left panel: points coloured by K-Means label, cluster centres marked in red.
plt.subplot(121)
plt.scatter(X[:, 0], X[:, 1], c=kmeans_labels, cmap='viridis')
plt.scatter(
    kmeans_centers[:, 0],
    kmeans_centers[:, 1],
    marker='x',
    color='red',
    s=200,
    label='Centers',
)
plt.title('K-Means Clustering')
plt.legend()

# Right panel: dendrogram of the hierarchical clustering.
plt.subplot(122)
plt.title('Hierarchical Clustering Dendrogram')
dendrogram(Z)
plt.xlabel("Sample Index")

plt.tight_layout()
plt.show()
Laboratory 8
Title of the Laboratory Exercise: Anomaly Detection
a. Write a Python program to apply an anomaly detection approach to multivariate
time series data.
import numpy as np
import pandas as pd
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler
# Sample multivariate time series data (replace with your data): 100 daily
# timestamps with two standard-normal value columns. A seeded generator
# keeps the sample, and so the detected anomalies, the same on every run.
rng = np.random.default_rng(0)
data = pd.DataFrame({
    "timestamp": pd.date_range(start="2023-01-01", periods=100, freq="D"),
    "value1": rng.standard_normal(100),
    "value2": rng.standard_normal(100),
})
feature_cols = ["value1", "value2"]

# Normalize the data
scaler = StandardScaler()
data[feature_cols] = scaler.fit_transform(data[feature_cols])

# Train the isolation forest model. Adjust contamination based on the
# expected anomaly rate; random_state makes the fitted model repeatable.
model = IsolationForest(contamination=0.05, random_state=42)
model.fit(data[feature_cols])

# Predict anomalies. The -1 predictions are recoded to 1 and everything
# else to 0. np.where assigns by position, so the result does not depend
# on `data` having a default 0..n-1 index.
data['anomaly'] = np.where(model.predict(data[feature_cols]) == -1, 1, 0)

# Print anomalies
anomalies = data[data['anomaly'] == 1]
print("Detected anomalies:")
print(anomalies)
import matplotlib.pyplot as plt
# Relies on `data` and `anomalies` having been built earlier in the script.
print(anomalies[['timestamp', 'value1', 'value2']])

# Plotting (optional): both series as lines, flagged rows as red markers.
plt.figure(figsize=(12, 6))
for column, series_label in (('value1', 'Value 1'), ('value2', 'Value 2')):
    plt.plot(data['timestamp'], data[column], label=series_label)

# Only the first scatter call carries a label so that the legend shows a
# single 'Anomaly' entry.
anomaly_times = anomalies['timestamp']
plt.scatter(anomaly_times, anomalies['value1'], color='red', label='Anomaly')
plt.scatter(anomaly_times, anomalies['value2'], color='red')

plt.title('Anomaly Detection in Multivariate Time Series')
plt.xlabel('Time')
plt.ylabel('Value')
plt.legend()
plt.show()