-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathproject.py
173 lines (128 loc) · 5.27 KB
/
project.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
import pandas as pd
# Input tables, in the order interest rate, vacancy rate, CPI (the order the
# columns are named in once the tables are merged with the price data).
fed_files = ["MORTGAGE30US.csv", "RRVRUSQ156N.csv", "CPIAUCSL.csv"]

# The first column of every CSV is used as the index and parsed as dates.
dfs = [
    pd.read_csv("Tables/" + name, parse_dates=True, index_col=0)
    for name in fed_files
]

# Put the individual tables side by side, aligned on their date index.
fed_data = pd.concat(dfs, axis=1)

# Forward fill holds each known value until another known value arrives;
# rows that still contain a gap afterwards are dropped.
fed_data = fed_data.ffill().dropna()
# dfs[0] will hold the median sale price table, dfs[1] the home value index.
zillow_files = [
    "Metro_median_sale_price_uc_sfrcondo_week.csv",
    "Metro_zhvi_uc_sfrcondo_tier_0.33_0.67_month.csv",
]
dfs = [pd.read_csv("Tables/" + name) for name in zillow_files]

# Keep only the first row of each table and discard its first five columns,
# leaving a single-column frame indexed by the remaining column labels.
# NOTE(review): presumably the first row is the region of interest and the
# remaining labels are dates -- confirm against the CSV layout.
dfs = [pd.DataFrame(table.iloc[0, 5:]) for table in dfs]

for df in dfs:
    df.index = pd.to_datetime(df.index)
    # Month period shared by both tables; used as the join key below.
    df["month"] = df.index.to_period("M")

price_data = dfs[0].merge(dfs[1], on="month")
# The merge discards the date index, so restore the dates of the first table.
# NOTE(review): this requires the merged frame to have exactly as many rows
# as dfs[0]; pandas raises on a length mismatch.
price_data.index = dfs[0].index
price_data = price_data.drop(columns="month")
price_data.columns = ["price", "value"]
from datetime import timedelta
# Move every fed_data date forward by two days before joining on the index.
# NOTE(review): presumably this lines the dates up with the price table's
# dates so the index merge below finds matches -- confirm.
fed_data.index = fed_data.index + timedelta(days=2)
fed_data.to_csv("Tables/fed_data.csv")

# Inner join on the date index; the columns are then renamed positionally.
price_data = fed_data.merge(price_data, left_index=True, right_index=True)
price_data.columns = ["interest", "vacancy", "cpi", "price", "value"]
price_data.plot.line(y="price", use_index=True)

# Express price and value relative to the CPI column.
price_data["adj_price"] = price_data["price"] / price_data["cpi"] * 100
price_data["adj_value"] = price_data["value"] / price_data["cpi"] * 100
price_data.plot.line(y="adj_price", use_index=True)

# adj_price 13 rows later; the last 13 rows have no such value and are
# removed by dropna. NOTE(review): 13 rows is presumably one quarter of
# weekly data -- confirm the row frequency.
price_data["next_quarter"] = price_data["adj_price"].shift(-13)
price_data = price_data.dropna()
# Written before "change" is added, so the CSV does not contain that column.
price_data.to_csv("Tables/price_data.csv")

# 1 when the adjusted price 13 rows later is higher than the current one.
price_data["change"] = (
    price_data["next_quarter"] > price_data["adj_price"]
).astype(int)

predictors = ["interest", "vacancy", "adj_value"]
target = "adj_price"
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import accuracy_score, mean_squared_error, mean_absolute_error, r2_score
from sklearn.inspection import permutation_importance
from xgboost import XGBRegressor
import numpy as np
from sklearn.model_selection import GridSearchCV
# Number of leading rows that form the first training window in backtest();
# evaluation covers every row from this position onwards.
# NOTE(review): presumably 5 years of weekly rows -- confirm.
START = 260
# Number of rows in each test fold; the training window grows by this many
# rows per backtest() iteration. NOTE(review): presumably one year of weeks.
STEP = 52
def predict(train, test, predictors, target):
    """Fit a linear regression on ``train`` and predict ``target`` for ``test``.

    As a diagnostic, the mean permutation importance of each predictor is
    printed, measured on the ``test`` fold.

    Args:
        train: DataFrame of rows to fit on; must contain the ``predictors``
            columns and the ``target`` column.
        test: DataFrame of rows to predict; same columns as ``train``.
        predictors: List of feature column names.
        target: Name of the column to predict.

    Returns:
        Array of predictions, one per row of ``test``.
    """
    # Other estimators tried here: RandomForestRegressor(min_samples_split=10,
    # random_state=1), Ridge(alpha=0.1), XGBRegressor().
    model = LinearRegression()
    model.fit(train[predictors], train[target])
    preds = model.predict(test[predictors])

    # Score importances on the fold that was passed in rather than on the
    # module-level price_data, so the function depends only on its arguments.
    # The default scorer for a regressor (R^2) is undefined for fewer than
    # two samples, so very short final folds are skipped.
    if len(test) >= 2:
        result = permutation_importance(
            model, test[predictors], test[target], n_repeats=10, random_state=1
        )
        print("Importances Mean", result["importances_mean"])
    return preds
def backtest(data, predictors, target):
    """Evaluate predict() with an expanding training window over ``data``.

    Starting at row ``START``, each iteration trains on all rows before
    position ``i`` and predicts the next ``STEP`` rows, then advances by
    ``STEP``. The fold predictions are concatenated and scored against the
    actual ``target`` values from row ``START`` onwards; the three metrics
    are also printed.

    Args:
        data: DataFrame containing the ``predictors`` and ``target`` columns,
            ordered so that earlier rows come first.
        predictors: List of feature column names.
        target: Name of the column to predict.

    Returns:
        Tuple ``(preds, mse, mae, r2)``: the concatenated predictions, mean
        squared error, mean absolute error and R-squared.

    Raises:
        ValueError: If ``data`` has ``START`` rows or fewer, so there is
            nothing left to evaluate.
    """
    n_rows = data.shape[0]
    if n_rows <= START:
        raise ValueError(
            f"backtest needs more than {START} rows, got {n_rows}"
        )

    all_preds = []
    for i in range(START, n_rows, STEP):
        # Slice the frame that was passed in, not the module-level table.
        train = data.iloc[:i]
        test = data.iloc[i:(i + STEP)]
        all_preds.append(predict(train, test, predictors, target))
    preds = np.concatenate(all_preds)

    actual = data.iloc[START:][target]
    mse = mean_squared_error(actual, preds)
    mae = mean_absolute_error(actual, preds)
    r2 = r2_score(actual, preds)
    print("Mean Squared Error:", mse)
    print("Mean Absolute Error:", mae)
    print("R-Squared:", r2)
    return preds, mse, mae, r2
import matplotlib.pyplot as plt

# Predictions and error metrics for every row from START onwards.
preds, mse, mae, r2 = backtest(price_data, predictors, target)
true_values = price_data.iloc[START:][target].values

# Series drawn in both figures below, paired with their legend labels.
compared = ((preds, 'Predicted Values'), (true_values, 'True Values'))

# First figure: predicted and true values as scatter points by position.
for series, legend_label in compared:
    plt.scatter(range(len(series)), series, label=legend_label)
plt.xlabel('Time')
plt.ylabel('Price')
plt.legend()
plt.show()

# Second figure: the same two series drawn as lines.
for series, legend_label in compared:
    plt.plot(series, label=legend_label)
plt.xlabel('Time')
plt.ylabel('Price')
plt.legend()
plt.show()
# Re-run the backtest and keep only the predictions (first tuple element).
preds = backtest(price_data, predictors, target)[0]

# Columns of the evaluated rows (position START onwards) as plain arrays.
evaluated = price_data.iloc[START:]
true_values = evaluated['adj_price'].values
interest_rate = evaluated['interest'].values
vacancy_rate = evaluated['vacancy'].values
adj_value = evaluated['adj_value'].values

# One stacked panel per series, all sharing the x axis.
fig, axes = plt.subplots(5, 1, figsize=(8, 8), sharex=True)

# (series, legend label, y-axis label) for each panel, top to bottom.
panels = (
    (preds, 'Predicted Values', 'Predicted Price'),
    (true_values, 'True Values', 'True Price'),
    (interest_rate, 'Interest Rate', 'Interest Rate'),
    (vacancy_rate, 'Vacancy Rate', 'Vacancy Rate'),
    (adj_value, 'adj_value', 'adj_value'),
)
for ax, (series, legend_label, y_label) in zip(axes, panels):
    ax.plot(series, label=legend_label)
    ax.set_ylabel(y_label)
    ax.legend()

plt.xlabel('Time')
plt.show()
# Draw all five series in a single figure, told apart by line style.
# Each entry is (series, legend label, line style).
overlay = (
    (preds, 'Predicted Values', '-'),
    (true_values, 'True Values', '-.'),
    (interest_rate, 'Interest Rate', ':'),
    (vacancy_rate, 'Vacancy Rate', '-.'),
    (adj_value, 'adj_value', ':'),
)
for series, legend_label, line_style in overlay:
    plt.plot(series, label=legend_label, linestyle=line_style)

plt.xlabel('Time')
plt.ylabel('Values')
plt.legend()
plt.show()