0% found this document useful (0 votes)
5 views

# Use this cell to write your code

Uploaded by

Houssam Alrifaii
Copyright
© All Rights Reserved
Available Formats
Download as TXT or PDF, or read online on Scribd
0% found this document useful (0 votes)
5 views

# Use this cell to write your code

Uploaded by

Houssam Alrifaii
Copyright
© All Rights Reserved
Available Formats
Download as TXT or PDF, or read online on Scribd
You are on page 1/2

# Use this cell to write your code for Task 2

# Imported here so the cell runs on its own: the only other pandas import
# visible in this file sits further down, in the Task 3 cell.
import pandas as pd

# Load the loyalty data set
clean_data = pd.read_csv('loyalty.csv')

# Calculate average spend and variance by loyalty years.
# reset_index() turns the group key back into an ordinary column.
spend_by_years = (
    clean_data.groupby('loyalty_years')
    .agg(
        avg_spend=('spend', 'mean'),
        var_spend=('spend', 'var'),
    )
    .reset_index()
)

# Round the results to two decimal places
spend_by_years['avg_spend'] = spend_by_years['avg_spend'].round(2)
spend_by_years['var_spend'] = spend_by_years['var_spend'].round(2)

# Output the resulting DataFrame
print(spend_by_years)

....

# Use this cell to write your code for Task 3


import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
# Load the training data
train_data = pd.read_csv('train.csv')

# Preprocess the training data: the features are every column except the
# customer identifier and the target ('spend').
X_train = train_data.drop(columns=['customer_id', 'spend'])
y_train = train_data['spend']

# Define categorical and numerical features
categorical_features = ['region', 'loyalty_years', 'joining_month', 'promotion']
numerical_features = ['first_month', 'items_in_first_month']

# Create a Column Transformer to preprocess the data.
# Numerical columns are passed through untouched; categorical columns are
# one-hot encoded. handle_unknown='ignore' makes a category that appears only
# in the test data encode as all zeros instead of raising at predict time.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ]
)

# Define the model pipeline: preprocessing followed by linear regression
baseline_model = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('regressor', LinearRegression()),
    ]
)

# Fit the baseline model
baseline_model.fit(X_train, y_train)

# Load test data; the identifier is not a feature, so it is dropped here and
# re-attached to the predictions below.
test_data = pd.read_csv('test.csv')
X_test = test_data.drop(columns=['customer_id'])

# Make predictions
predicted_spend = baseline_model.predict(X_test)

# Create the result DataFrame: one predicted spend per test customer
base_result = pd.DataFrame(
    {
        'customer_id': test_data['customer_id'],
        'spend': predicted_spend,
    }
)

# Output the resulting DataFrame
print(base_result)

....

# Use this cell to write your code for Task 4

from sklearn.base import clone

# Define the comparison model using Random Forest.
# The preprocessor is cloned (an unfitted copy with the same parameters) so
# that fitting this pipeline does not refit the transformer object that the
# baseline pipeline also holds.
# random_state is fixed so repeated runs produce the same forest.
comparison_model = Pipeline(
    steps=[
        ('preprocessor', clone(preprocessor)),
        ('regressor', RandomForestRegressor(random_state=42)),
    ]
)

# Fit the comparison model on the same training data as the baseline
comparison_model.fit(X_train, y_train)

# Make predictions using the comparison model
predicted_spend_compare = comparison_model.predict(X_test)

# Create the result DataFrame for the comparison model
compare_result = pd.DataFrame(
    {
        'customer_id': test_data['customer_id'],
        'spend': predicted_spend_compare,
    }
)

# Output the resulting DataFrame
print(compare_result)

You might also like