0% found this document useful (0 votes)
5 views

Data Science Programs

Copyright
© All Rights Reserved
Available Formats
Download as PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
5 views

Data Science Programs

Copyright
© All Rights Reserved
Available Formats
Download as PDF, TXT or read online on Scribd
You are on page 1/ 6

Question: Create a Pandas program to read a CSV file, fill missing values with the column mean, and group the data by a specified category to calculate the average of a numerical column.

Answer:

import pandas as pd

# Input file and the two columns that drive the aggregation
file_path = 'data.csv'  # Replace with your CSV file path
category_column = 'Category'  # Replace with the name of your category column
numerical_column = 'Value'  # Replace with the name of your numerical column

# Load the CSV into a DataFrame
data = pd.read_csv(file_path)

# Replace missing entries in each numeric column with that column's mean
column_means = data.mean(numeric_only=True)
data = data.fillna(column_means)

# Average the numerical column within each category
per_category = data.groupby(category_column)
grouped_data = per_category[numerical_column].mean()

# Show the per-category averages
print("Average of numerical column grouped by category:")
print(grouped_data)

Question: Implement a k-nearest neighbors (KNN) classifier using scikit-learn to predict labels from the Iris dataset, and evaluate the model's accuracy.
Answer:

from sklearn.datasets import load_iris

from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler

from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import accuracy_score

# Load the Iris dataset and pull out the feature matrix and the label vector
iris = load_iris()
X = iris.data
y = iris.target

# Hold out 20% of the samples for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit the scaler on the training split only, then apply it to both splits
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Build the KNN classifier with k=3 and train it on the scaled training data
knn = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)

# Predict labels for the held-out samples and score them against the true labels
y_pred = knn.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

# Report the accuracy
print("Accuracy of the KNN classifier:", accuracy)

Question: Write a Python program to load a CSV file into a Pandas DataFrame and display summary statistics (mean, median, and mode) for numerical columns.

Answer:

import pandas as pd

# Load the CSV file into a DataFrame
file_path = 'data.csv'  # Replace with the path to your CSV file
data = pd.read_csv(file_path)

# Display the DataFrame
print("DataFrame:")
print(data)

# Restrict the summary statistics to the numerical columns
numerical_data = data.select_dtypes(include=['number'])

# Mean
mean_values = numerical_data.mean()
print("\nMean of numerical columns:")
print(mean_values)

# Median
median_values = numerical_data.median()
print("\nMedian of numerical columns:")
print(median_values)

# Mode
# DataFrame.mode() can return several rows (one per tied mode) and returns
# no rows at all when there is nothing to count (no numerical columns, no
# rows, or only missing values). Guard the positional lookup so that case
# prints a message instead of raising IndexError from iloc[0].
mode_values = numerical_data.mode()
print("\nMode of numerical columns:")
if mode_values.empty:
    print("No mode available (no numerical data)")
else:
    print(mode_values.iloc[0])  # Display the first mode for simplicity

Question: Write a Dask program to load a large CSV file, filter the data based on specific criteria, and save the results to a new CSV file.

Answer:

import dask.dataframe as dd

# Input and output locations
file_path = 'large_data.csv'  # Replace with the path to your large CSV file
output_file_path = 'filtered_data.csv'

# Open the large CSV file as a Dask DataFrame
data = dd.read_csv(file_path)

# Build the row filter (e.g., keep rows where 'column_name' > 50)
row_mask = data['column_name'] > 50  # Replace 'column_name' and condition as needed
filtered_data = data[row_mask]

# Write the surviving rows to one CSV file, without the index column
filtered_data.to_csv(
    output_file_path,
    single_file=True,
    index=False,
)

print(f"Filtered data has been saved to {output_file_path}")

Question: Write a Python function to calculate the mean, median, and mode of a given list of numerical values.

Answer:

from statistics import mean, median, mode, StatisticsError

def calculate_statistics(numbers):
    """
    Calculate the mean, median, and mode of a list of numerical values.

    Args:
        numbers (list): A list of numerical values.

    Returns:
        dict: A dictionary with the keys "mean", "median", and "mode".
            All three values are None when ``numbers`` is empty. "mode" is
            the string "No unique mode" when two or more values tie for
            the highest frequency.
    """
    from collections import Counter

    if not numbers:
        return {"mean": None, "median": None, "mode": None}

    # Detect ties explicitly: since Python 3.8 statistics.mode() returns the
    # first mode instead of raising StatisticsError on multimodal data, so
    # relying on that exception would never report "No unique mode".
    frequencies = Counter(numbers)
    highest = max(frequencies.values())
    most_common = [value for value, count in frequencies.items() if count == highest]

    return {
        "mean": mean(numbers),
        "median": median(numbers),
        "mode": most_common[0] if len(most_common) == 1 else "No unique mode",
    }

# Example usage
numbers = [10, 20, 20, 30, 40]
result = calculate_statistics(numbers)

# Print each statistic next to its label
for label, key in (("Mean:", "mean"), ("Median:", "median"), ("Mode:", "mode")):
    print(label, result[key])

You might also like