-
Notifications
You must be signed in to change notification settings - Fork 44
/
data_loader.R
203 lines (159 loc) · 7.41 KB
/
data_loader.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
options(pillar.sigfig = 7)
# Extend these frequency lists as required
LOW_FREQUENCIES <- c("4_seconds", "minutely", "10_minutes", "15_minutes", "half_hourly", "hourly")
LOW_FREQ_VALS <- c("4 sec", "1 min", "10 min", "15 min", "30 min", "1 hour")
HIGH_FREQUENCIES <- c("daily", "weekly", "monthly", "quarterly", "yearly")
HIGH_FREQ_VALS <- c("1 day", "1 week", "1 month", "3 months", "1 year")
FREQUENCIES <- c(LOW_FREQUENCIES, HIGH_FREQUENCIES)
FREQ_VALS <- c(LOW_FREQ_VALS, HIGH_FREQ_VALS)
# Create a hashmap containing possible frequency key-value pairs
FREQ_MAP <- list()
for(f in seq_along(FREQUENCIES))
FREQ_MAP[[FREQUENCIES[f]]] <- FREQ_VALS[f]
# This function converts the contents in a .tsf file into a tsibble or a dataframe and returns it along with other meta-data of the dataset: frequency, horizon, whether the dataset contains missing values and whether the series have equal lengths
#
# Parameters
# file - .tsf file path
# value_column_name - Any name that is preferred to have as the name of the column containing series values in the returning tsibble
# key - The name of the attribute that should be used as the key when creating the tsibble. If doesn't provide, a data frame will be returned instead of a tsibble
# index - The name of the time attribute that should be used as the index when creating the tsibble. If doesn't provide, it will search for a valid index. When no valid index found, a data frame will be returned instead of a tsibble
convert_tsf_to_tsibble <- function(file, value_column_name = "series_value", key = NULL, index = NULL){
if(is.character(file)) {
file <- file(file, "r")
on.exit(close(file))
}
if(!inherits(file, "connection"))
stop("Argument 'file' must be a character string or connection.")
if(!isOpen(file)) {
open(file, "r")
on.exit(close(file))
}
# Read meta-data
col_names <- NULL
col_types <- NULL
frequency <- NULL
forecast_horizon <- NULL
contain_missing_values <- NULL
contain_equal_length <- NULL
index_var <- NULL
line <- readLines(file, n = 1) #n is no: of lines to read
while(length(line) && regexpr('^[[:space:]]*@data', line, perl = TRUE) == -1) { #Until read @data, run this loop (-1 indicate no match with the regular expression yet)
if(regexpr('^[[:space:]]*@', line, perl = TRUE) > 0) { #This condition will be true for lines starting with @
con <- textConnection(line)
line <- scan(con, character(), quiet = TRUE) #Creating a vector containing the words in a line (ex: "@attribute" "series_name" "string")
close(con)
if(line[1] == "@attribute"){
if(length(line) != 3) #Attributes have both name and type
stop("Invalid meta-data specification.")
if(is.null(index) & line[3] == "date")
index_var <- line[2]
col_names <- c(col_names, line[2])
col_types <- c(col_types, line[3])
}else{
if(length(line) != 2) #Other meta-data have only values
stop("Invalid meta-data specification.")
if(line[1] == "@frequency")
frequency <- line[2]
else if(line[1] == "@horizon")
forecast_horizon <- as.numeric(line[2])
else if(line[1] == "@missing")
contain_missing_values <- as.logical(line[2])
else if(line[1] == "@equallength")
contain_equal_length <- as.logical(line[2])
}
}
line <- readLines(file, n = 1)
}
if(length(line) == 0)
stop("Missing data section.")
if(is.null(col_names))
stop("Missing attribute section.")
line <- readLines(file, n = 1)
if(length(line) == 0)
stop("Missing series information under data section.")
for(col in col_names)
assign(col, NULL)
values <- NULL
row_count <- 0
# Get data
while(length(line) != 0){
full_info <- strsplit(line, ":")[[1]]
if(length(full_info) != length(col_names)+1)
stop("Missing attributes/values in series.")
series <- strsplit(tail(full_info, 1), ",")[[1]]
series[which(series == "?")] <- NA
series <- as.numeric(series)
if(all(is.na(series)))
stop("All series values are missing. A given series should contains a set of comma separated numeric values. At least one numeric value should be there in a series.")
values <- c(values, series)
row_count <- row_count + length(series)
attributes <- head(full_info, length(full_info)-1)
for(col in seq_along(col_names)){
att <- eval(parse(text=col_names[col]))
#This format supports 3 attribute types: string, numeric and date
if(col_types[col] == "date"){
if(is.null(frequency))
stop("Frequency is missing.")
else{
if(frequency %in% LOW_FREQUENCIES)
start_time <- as.POSIXct(attributes[col], format = "%Y-%m-%d %H-%M-%S", tz = "UTC")
else if(frequency %in% HIGH_FREQUENCIES)
start_time <- as.Date(attributes[col], format = "%Y-%m-%d %H-%M-%S")
else
stop("Invalid frequency.")
if(is.na(start_time))
stop("Incorrect timestamp format. Specify your timestamps as YYYY-mm-dd HH-MM-SS")
}
timestamps <- seq(start_time, length.out = length(series), by = FREQ_MAP[[frequency]])
if(is.null(att))
att <- timestamps
else
att[(length(att) + 1) : ((length(att) + length(timestamps)))] <- timestamps
}else{
if(col_types[col] == "numeric")
attributes[col] <- as.numeric(attributes[col])
else if(col_types[col] == "string")
attributes[col] <- as.character(attributes[col])
else
stop("Invalid attribute type.")
if(is.na(attributes[col]))
stop("Invalid attribute values.")
att <- append(att, rep(attributes[col], length(series)))
}
assign(col_names[col], att)
}
line <- readLines(file, n = 1)
}
data <- as.data.frame(matrix(nrow = row_count, ncol = length(col_names) + 1))
colnames(data) <- c(col_names, value_column_name)
for(col in col_names)
data[[col]] <- eval(parse(text = col))
data[[value_column_name]] <- values
if(!(is.null(key))){
if(!(key %in% col_names))
stop("Invalid key. Cannot convert data into tsibble format.")
else{
if(is.null(index)){
if(is.null(index_var))
cat("Index is not provided. No valid index found in data. Returning a dataframe.")
else
data <- tsibble:::build_tsibble(x = data, key = key, index = index_var, ordered = F)
}else{
if(!(index %in% col_names))
stop("Invalid index Cannot convert data into tsibble format.")
else
data <- tsibble:::build_tsibble(x = data, key = key, index = index, ordered = F)
}
}
}else{
cat("Key is not provided. Returning a dataframe.")
}
list(data, frequency, forecast_horizon, contain_missing_values, contain_equal_length)
}
# Example of usage
# loaded_data <- convert_tsf_to_tsibble(file.path("TSForecasting", "tsf_data", "sample.tsf", fsep = "/"), "series_value", "series_name", "start_timestamp")
# tsibble_data <- loaded_data[[1]]
# frequency <- loaded_data[[2]]
# forecast_horizon <- loaded_data[[3]]
# contain_missing_values <- loaded_data[[4]]
# contain_equal_length <- loaded_data[[5]]