9/18/23, 7:29 PM Data Wrangling (Data Preprocessing)
Data Wrangling (Data Preprocessing) Code
Mid-term assessment
Siddharth Dinkar Raul (s4015125)
18-09-2023
Setup
Hide
# Load the necessary packages required to reproduce the report.
library(tibble)
library(dplyr)
library(lubridate)
Data generation
Hide
[Link] Wrangling 2/[Link] 1/4
9/18/23, 7:29 PM Data Wrangling (Data Preprocessing)
# Data generation, provide your R codes
#
# Builds a synthetic sales data set (150 rows), injects missing values and
# outliers, and writes it to "sales_data.csv" in the working directory.
#
# NOTE(review): this listing came from a print/PDF export in which several
# dotted names were replaced by the placeholder "[Link]". They are restored
# below on a best-effort basis. as.Date(), set.seed(), write.csv() and
# row.names follow from their arguments; the as.integer() conversions on the
# id/count columns are an assumption (they may have been as.factor() or
# as.numeric()) -- confirm against the original .Rmd.

# Generating date range (one element per calendar day of 2023)
start_date <- as.Date("2023-01-01")
end_date <- as.Date("2023-12-31")
date_range <- seq(start_date, end_date, by = "days")

# Setting the seed so the random draws below are reproducible
set.seed(285)

# Creating the first dataset ( Sales dataset)
# NOTE(review): `words` is not defined anywhere in this file and none of the
# packages loaded in Setup is known to provide it; presumably it is
# stringr::words -- confirm and load that package, otherwise this call fails.
n_sales <- 150
sales_data <- tibble(
  date = sample(date_range, n_sales, replace = TRUE),
  product_id = sample(1:200, n_sales, replace = TRUE),
  product_name = as.character(
    replicate(n_sales, paste(sample(words, 2), collapse = " "))
  ),
  quantity_sold = as.integer(sample(1:20, n_sales, replace = TRUE)),
  price = as.numeric(runif(n_sales, min = 50, max = 500)),
  customer_id = as.integer(sample(1:500, n_sales, replace = TRUE)),
  store_id = as.integer(sample(1:5, n_sales, replace = TRUE)) # Common variable "store_id"
)

# Introducing the missing values in the "price" column
# (5 of 150 rows, i.e. about 3%)
sales_data[sample(seq_len(n_sales), 5), "price"] <- NA

# Introducing outliers
# The row indices are drawn ONCE per column and reused on both sides of the
# assignment. Previously sample() was called separately on each side, so the
# overwritten rows received scaled values taken from *different* rows (and,
# for "price", could copy or erase NAs). This changes the random stream, so
# the generated values differ from files produced by the earlier version.
qty_outlier_rows <- sample(seq_len(n_sales), 5)
sales_data$quantity_sold[qty_outlier_rows] <-
  sales_data$quantity_sold[qty_outlier_rows] * 10L

# Only rows with an observed price are eligible, so exactly 5 outliers result
# and the injected missing values stay missing.
price_outlier_rows <- sample(which(!is.na(sales_data$price)), 5)
sales_data$price[price_outlier_rows] <- sales_data$price[price_outlier_rows] * 2

# Exporting to CSV (row.names = FALSE: no row-number column in the file)
write.csv(sales_data, "sales_data.csv", row.names = FALSE)
# Creating second dataset ( Customer Dataset)
#
# Builds a synthetic customer data set (200 rows, one per customer_id),
# injects missing e-mail addresses, and writes it to "customer_data.csv" in
# the working directory.
#
# NOTE(review): this listing came from a print/PDF export in which several
# dotted names were replaced by the placeholder "[Link]". They are restored
# below on a best-effort basis; the as.integer() conversions and the
# "@example.com" domain (exported as "@exa" + "[Link]") are assumptions --
# confirm against the original .Rmd.
# NOTE(review): the sales data set in this file draws customer_id from 1:500,
# while only ids 1:200 exist here, so a join on customer_id will leave many
# sales rows unmatched -- confirm this is intended.
set.seed(286)
n_customers <- 200
customer_data <- tibble(
  customer_id = as.integer(1:n_customers),
  # Five distinct upper-case letters per name (sample() without replacement)
  customer_name = as.character(
    replicate(n_customers, paste(sample(LETTERS, 5), collapse = ""))
  ),
  # Five distinct lower-case letters as the local part of the address
  email = as.character(
    paste0(
      replicate(n_customers, paste(sample(letters, 5), collapse = "")),
      "@example.com"
    )
  ),
  total_purchases = as.integer(sample(100:1000, n_customers, replace = TRUE)),
  is_member = as.logical(
    sample(c(TRUE, FALSE), n_customers, replace = TRUE, prob = c(0.6, 0.4))
  ),
  store_id = as.integer(sample(1:5, n_customers, replace = TRUE)) # Common variable "store_id"
)

# Introduce missing values in the "email" column (10 of 200 rows, i.e. 5%)
customer_data[sample(seq_len(n_customers), 10), "email"] <- NA

# Export to CSV (row.names = FALSE: no row-number column in the file)
write.csv(customer_data, "customer_data.csv", row.names = FALSE)
[Link] Wrangling 2/[Link] 2/4
9/18/23, 7:29 PM Data Wrangling (Data Preprocessing)
# Creating third dataset ( Inventory Dataset)
#
# Builds a synthetic inventory data set (200 rows, one per product_id),
# injects missing values and outliers, and writes it to "inventory_data.csv"
# in the working directory.
#
# NOTE(review): this listing came from a print/PDF export in which several
# dotted names were replaced by the placeholder "[Link]". They are restored
# below on a best-effort basis; the as.integer() conversions on the id/count
# columns are an assumption (they may have been as.factor() or as.numeric())
# -- confirm against the original .Rmd.
# NOTE(review): `words` is not defined anywhere in this file and none of the
# packages loaded in Setup is known to provide it; presumably it is
# stringr::words -- confirm and load that package, otherwise this call fails.
set.seed(789)
n_products <- 200
inventory_data <- tibble(
  product_id = as.integer(1:n_products),
  product_name = as.character(
    replicate(n_products, paste(sample(words, 2), collapse = " "))
  ),
  stock_level = as.integer(sample(1:100, n_products, replace = TRUE)),
  # Three distinct upper-case letters per supplier code
  supplier = as.character(
    replicate(n_products, paste(sample(LETTERS, 3), collapse = ""))
  ),
  cost_price = as.numeric(runif(n_products, min = 50, max = 200)),
  selling_price = as.numeric(runif(n_products, min = 100, max = 500)),
  store_id = as.integer(sample(1:5, n_products, replace = TRUE)) # Common variable "store_id"
)

# Introduce missing values in the "stock_level" column (10 of 200 rows, i.e. 5%)
inventory_data[sample(seq_len(n_products), 10), "stock_level"] <- NA

# Introduce outliers
# The row indices are drawn ONCE per column and reused on both sides of the
# assignment. Previously sample() was called separately on each side, so the
# overwritten rows received scaled values taken from *different* rows. This
# changes the random stream, so the generated values differ from files
# produced by the earlier version.
cost_outlier_rows <- sample(seq_len(n_products), 5)
inventory_data$cost_price[cost_outlier_rows] <-
  inventory_data$cost_price[cost_outlier_rows] * 0.5

selling_outlier_rows <- sample(seq_len(n_products), 5)
inventory_data$selling_price[selling_outlier_rows] <-
  inventory_data$selling_price[selling_outlier_rows] * 2

# Export to CSV (row.names = FALSE: no row-number column in the file)
write.csv(inventory_data, "inventory_data.csv", row.names = FALSE)
Provide explanations here.
Merging data sets
Hide
# Merge your synthetic data sets, provide R codes here.
Provide explanations here.
Checking structure of combined data
Hide
# Check structure of combined data and perform all necessary data type
# conversions, provide R codes here.
Provide explanations here.
Generate summary statistics
Hide
# Generate summary statistics, provide R codes here.
[Link] Wrangling 2/[Link] 3/4
9/18/23, 7:29 PM Data Wrangling (Data Preprocessing)
Provide explanations here.
Scanning data
Hide
# Scan variables for missing values, provide R codes here.
Provide explanations here.
[Link] Wrangling 2/[Link] 4/4