Skip to content

Commit

Permalink
Fix xfail mark for flaky test
Browse files Browse the repository at this point in the history
  • Loading branch information
kbattocchi committed Mar 7, 2019
1 parent 49f4b15 commit ba499fa
Showing 1 changed file with 113 additions and 117 deletions.
230 changes: 113 additions & 117 deletions econml/tests/test_dominicks.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,97 +12,123 @@
from sklearn.linear_model import RidgeCV, LinearRegression
import pytest

file_name = "oj_large.csv"

pytestmark = pytest.mark.xfail(
@pytest.mark.xfail(
reason="This test used to work, but fully downloading the blob has become flaky. Needs investigation.")
def test_dominicks():
file_name = "oj_large.csv"
if not os.path.isfile(file_name):
print("Downloading file (this might take a few seconds)...")
urllib.request.urlretrieve(
"https://msalicedatapublic.blob.core.windows.net/datasets/OrangeJuice/oj_large.csv", file_name)
oj_data = pd.read_csv(file_name)

brands = sorted(set(oj_data["brand"]))
stores = sorted(set(oj_data["store"]))

featnames = ["week", "feat"] + list(oj_data.columns[6:])

# Preprocess data
import datetime
import numpy as np

# Convert 'week' to a date
# week_zero = datetime.datetime.strptime("09/07/89", "%m/%d/%y")
# oj_data["week"] = pd.to_timedelta(oj_data["week"], unit='w') + week_zero

# Take log of price
oj_data["logprice"] = np.log(oj_data["price"])
oj_data.drop("price", axis=1, inplace=True)

# Make brand numeric
oj_data["brand"] = [brands.index(b) for b in oj_data["brand"]]

class PriceFeaturizer(TransformerMixin):
def __init__(self, n_prods, own_price=True,
cross_price_groups=False, cross_price_indiv=True, per_product_effects=True):
base_arrays = []
effect_names = []
one_hots = [(0,) * p + (1,) + (0,) * (n_prods - p - 1) for p in range(n_prods)]
if own_price:
base_arrays.append(np.eye(n_prods))
effect_names.append("own price")
if cross_price_groups:
base_arrays.append((np.ones((n_prods, n_prods)) - np.eye(n_prods)) / (n_prods - 1))
effect_names.append("group cross price")
if cross_price_indiv:
for p in range(n_prods):
base_arrays.append(one_hots[p] * np.ones((n_prods, 1)) - np.diag(one_hots[p]))
effect_names.append("cross price effect {} ->".format(p))
if per_product_effects:
all = [(np.diag(one_hots[p]) @ arr, nm + " {}".format(p))
for arr, nm in zip(base_arrays, effect_names) for p in range(n_prods)]
# remove meaningless features (e.g. cross-price effects of products on themselves),
# which have all zero coeffs
nonempty = [(arr, nm) for arr, nm in all if np.count_nonzero(arr) > 0]
self._features = [arr for arr, _ in nonempty]
self._names = [nm for _, nm in nonempty]
else:
self._features = base_arrays
self._names = effect_names

def fit(self, X):
self._is_fitted = True
assert shape(X)[1] == 0
return self

def transform(self, X):
assert self._is_fitted
assert shape(X)[1] == 0
return np.tile(self._features, (shape(X)[0], 1, 1, 1))

@property
def names(self):
return self._names

for name, op, xp_g, xp_i, pp in [("Homogeneous treatment effect", True, False, False, False),
("Heterogeneous treatment effects", True, False, False, True),
(("Heterogeneous treatment effects"
" with group effects"), True, True, False, True),
(("Heterogeneous treatment effects"
" with cross price effects"), True, False, True, True)]:

print(name)
np.random.seed(42)

ft = PriceFeaturizer(n_prods=3, own_price=op, cross_price_groups=xp_g,
cross_price_indiv=xp_i, per_product_effects=pp)
names = ft.names
dml = DMLCateEstimator(model_y=RandomForestRegressor(),
model_t=RandomForestRegressor(),
featurizer=ft,
n_splits=2)

effects = []
for store in stores:
data = oj_data[oj_data['store'] == store].sort_values(by=['week', 'brand'])
dml.fit(T=reshape(data.as_matrix(["logprice"]), (-1, 3)),
Y=reshape(data.as_matrix(["logmove"]), (-1, 3)),
W=reshape(data.as_matrix(featnames), (-1, 3 * len(featnames))))
effects.append(dml.coef_)
effects = np.array(effects)
for nm, eff in zip(names, effects.T):
print(" Effect: {}".format(nm))
print(" Mean: {}".format(np.mean(eff)))
print(" Std.: {}".format(np.std(eff)))

class ConstFt(TransformerMixin):
def fit(self, X):
return self

def transform(self, X):
return np.ones((shape(X)[0], 1))

print("Vanilla HTE+XP")

if not os.path.isfile(file_name):
print("Downloading file (this might take a few seconds)...")
urllib.request.urlretrieve(
"https://msalicedatapublic.blob.core.windows.net/datasets/OrangeJuice/oj_large.csv", file_name)
oj_data = pd.read_csv(file_name)

brands = sorted(set(oj_data["brand"]))
stores = sorted(set(oj_data["store"]))

featnames = ["week", "feat"] + list(oj_data.columns[6:])

# Preprocess data
import datetime
import numpy as np

# Convert 'week' to a date
# week_zero = datetime.datetime.strptime("09/07/89", "%m/%d/%y")
# oj_data["week"] = pd.to_timedelta(oj_data["week"], unit='w') + week_zero

# Take log of price
oj_data["logprice"] = np.log(oj_data["price"])
oj_data.drop("price", axis=1, inplace=True)

# Make brand numeric
oj_data["brand"] = [brands.index(b) for b in oj_data["brand"]]


class PriceFeaturizer(TransformerMixin):
def __init__(self, n_prods, own_price=True,
cross_price_groups=False, cross_price_indiv=True, per_product_effects=True):
base_arrays = []
effect_names = []
one_hots = [(0,) * p + (1,) + (0,) * (n_prods - p - 1) for p in range(n_prods)]
if own_price:
base_arrays.append(np.eye(n_prods))
effect_names.append("own price")
if cross_price_groups:
base_arrays.append((np.ones((n_prods, n_prods)) - np.eye(n_prods)) / (n_prods - 1))
effect_names.append("group cross price")
if cross_price_indiv:
for p in range(n_prods):
base_arrays.append(one_hots[p] * np.ones((n_prods, 1)) - np.diag(one_hots[p]))
effect_names.append("cross price effect {} ->".format(p))
if per_product_effects:
all = [(np.diag(one_hots[p]) @ arr, nm + " {}".format(p))
for arr, nm in zip(base_arrays, effect_names) for p in range(n_prods)]
# remove meaningless features (e.g. cross-price effects of products on themselves),
# which have all zero coeffs
nonempty = [(arr, nm) for arr, nm in all if np.count_nonzero(arr) > 0]
self._features = [arr for arr, _ in nonempty]
self._names = [nm for _, nm in nonempty]
else:
self._features = base_arrays
self._names = effect_names

def fit(self, X):
self._is_fitted = True
assert shape(X)[1] == 0
return self

def transform(self, X):
assert self._is_fitted
assert shape(X)[1] == 0
return np.tile(self._features, (shape(X)[0], 1, 1, 1))

@property
def names(self):
return self._names


for name, op, xp_g, xp_i, pp in [("Homogeneous treatment effect", True, False, False, False),
("Heterogeneous treatment effects", True, False, False, True),
(("Heterogeneous treatment effects"
" with group effects"), True, True, False, True),
(("Heterogeneous treatment effects"
" with cross price effects"), True, False, True, True)]:

print(name)
np.random.seed(42)

ft = PriceFeaturizer(n_prods=3, own_price=op, cross_price_groups=xp_g,
cross_price_indiv=xp_i, per_product_effects=pp)
names = ft.names
dml = DMLCateEstimator(model_y=RandomForestRegressor(),
model_t=RandomForestRegressor(),
featurizer=ft,
featurizer=ConstFt(),
n_splits=2)

effects = []
Expand All @@ -113,38 +139,8 @@ def names(self):
W=reshape(data.as_matrix(featnames), (-1, 3 * len(featnames))))
effects.append(dml.coef_)
effects = np.array(effects)
for nm, eff in zip(names, effects.T):
names = ["{} on {}".format(i, j) for j in range(3) for i in range(3)]
for nm, eff in zip(names, reshape(effects, (-1, 9)).T):
print(" Effect: {}".format(nm))
print(" Mean: {}".format(np.mean(eff)))
print(" Std.: {}".format(np.std(eff)))


class ConstFt(TransformerMixin):
def fit(self, X):
return self

def transform(self, X):
return np.ones((shape(X)[0], 1))


print("Vanilla HTE+XP")

np.random.seed(42)
dml = DMLCateEstimator(model_y=RandomForestRegressor(),
model_t=RandomForestRegressor(),
featurizer=ConstFt(),
n_splits=2)

effects = []
for store in stores:
data = oj_data[oj_data['store'] == store].sort_values(by=['week', 'brand'])
dml.fit(T=reshape(data.as_matrix(["logprice"]), (-1, 3)),
Y=reshape(data.as_matrix(["logmove"]), (-1, 3)),
W=reshape(data.as_matrix(featnames), (-1, 3 * len(featnames))))
effects.append(dml.coef_)
effects = np.array(effects)
names = ["{} on {}".format(i, j) for j in range(3) for i in range(3)]
for nm, eff in zip(names, reshape(effects, (-1, 9)).T):
print(" Effect: {}".format(nm))
print(" Mean: {}".format(np.mean(eff)))
print(" Std.: {}".format(np.std(eff)))

0 comments on commit ba499fa

Please sign in to comment.