-
Notifications
You must be signed in to change notification settings - Fork 44
/
data_loader.py
165 lines (138 loc) · 7.01 KB
/
data_loader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
from datetime import datetime
from distutils.util import strtobool
import pandas as pd
# Converts the contents in a .tsf file into a dataframe and returns it along with other meta-data of the dataset: frequency, horizon, whether the dataset contains missing values and whether the series have equal lengths
#
# Parameters
# full_file_path_and_name - complete .tsf file path
# replace_missing_vals_with - a term to indicate the missing values in series in the returning dataframe
# value_column_name - Any name that is preferred to have as the name of the column containing series values in the returning dataframe
def convert_tsf_to_dataframe(
full_file_path_and_name,
replace_missing_vals_with="NaN",
value_column_name="series_value",
):
col_names = []
col_types = []
all_data = {}
line_count = 0
frequency = None
forecast_horizon = None
contain_missing_values = None
contain_equal_length = None
found_data_tag = False
found_data_section = False
started_reading_data_section = False
with open(full_file_path_and_name, "r", encoding="cp1252") as file:
for line in file:
# Strip white space from start/end of line
line = line.strip()
if line:
if line.startswith("@"): # Read meta-data
if not line.startswith("@data"):
line_content = line.split(" ")
if line.startswith("@attribute"):
if (
len(line_content) != 3
): # Attributes have both name and type
raise Exception("Invalid meta-data specification.")
col_names.append(line_content[1])
col_types.append(line_content[2])
else:
if (
len(line_content) != 2
): # Other meta-data have only values
raise Exception("Invalid meta-data specification.")
if line.startswith("@frequency"):
frequency = line_content[1]
elif line.startswith("@horizon"):
forecast_horizon = int(line_content[1])
elif line.startswith("@missing"):
contain_missing_values = bool(
strtobool(line_content[1])
)
elif line.startswith("@equallength"):
contain_equal_length = bool(strtobool(line_content[1]))
else:
if len(col_names) == 0:
raise Exception(
"Missing attribute section. Attribute section must come before data."
)
found_data_tag = True
elif not line.startswith("#"):
if len(col_names) == 0:
raise Exception(
"Missing attribute section. Attribute section must come before data."
)
elif not found_data_tag:
raise Exception("Missing @data tag.")
else:
if not started_reading_data_section:
started_reading_data_section = True
found_data_section = True
all_series = []
for col in col_names:
all_data[col] = []
full_info = line.split(":")
if len(full_info) != (len(col_names) + 1):
raise Exception("Missing attributes/values in series.")
series = full_info[len(full_info) - 1]
series = series.split(",")
if len(series) == 0:
raise Exception(
"A given series should contains a set of comma separated numeric values. At least one numeric value should be there in a series. Missing values should be indicated with ? symbol"
)
numeric_series = []
for val in series:
if val == "?":
numeric_series.append(replace_missing_vals_with)
else:
numeric_series.append(float(val))
if numeric_series.count(replace_missing_vals_with) == len(
numeric_series
):
raise Exception(
"All series values are missing. A given series should contains a set of comma separated numeric values. At least one numeric value should be there in a series."
)
all_series.append(pd.Series(numeric_series).array)
for i in range(len(col_names)):
att_val = None
if col_types[i] == "numeric":
att_val = int(full_info[i])
elif col_types[i] == "string":
att_val = str(full_info[i])
elif col_types[i] == "date":
att_val = datetime.strptime(
full_info[i], "%Y-%m-%d %H-%M-%S"
)
else:
raise Exception(
"Invalid attribute type."
) # Currently, the code supports only numeric, string and date types. Extend this as required.
if att_val is None:
raise Exception("Invalid attribute value.")
else:
all_data[col_names[i]].append(att_val)
line_count = line_count + 1
if line_count == 0:
raise Exception("Empty file.")
if len(col_names) == 0:
raise Exception("Missing attribute section.")
if not found_data_section:
raise Exception("Missing series information under data section.")
all_data[value_column_name] = all_series
loaded_data = pd.DataFrame(all_data)
return (
loaded_data,
frequency,
forecast_horizon,
contain_missing_values,
contain_equal_length,
)
# Example of usage
# loaded_data, frequency, forecast_horizon, contain_missing_values, contain_equal_length = convert_tsf_to_dataframe("TSForecasting/tsf_data/sample.tsf")
# print(loaded_data)
# print(frequency)
# print(forecast_horizon)
# print(contain_missing_values)
# print(contain_equal_length)