-
Notifications
You must be signed in to change notification settings - Fork 1
/
lsq.py
194 lines (146 loc) · 7.23 KB
/
lsq.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
# Code for the paper "Fast Private Kernel Density Estimation via Locality Sensitive Quantization"
# By T. Wagner, Y. Naamad, N. Mishra
# Published in ICML 2023
# The code below is for the Gaussian kernel with fixed bandwidth, k(x,y) := exp(-||x-y||_2^2).
# The bandwidth can be changed by scaling the input point coordinates.
import numpy as np
import scipy as sp
### Auxiliary : computing squared-distance matrix ###
def get_sqdistance_matrix(M1, M2):
allsqnorms = np.linalg.norm(np.vstack([M1,M2]), axis=1).reshape(-1, 1)**2
M1sqnorms = allsqnorms[:M1.shape[0],:]
M2sqnorms = allsqnorms[M1.shape[0]:,:].reshape(1, -1)
dm = M1sqnorms + M2sqnorms - 2.0 * np.dot(M1, M2.T)
dm[dm < 0.0] = 0.0
return dm
### Exact KDE ###
def GaussianKDE(dataset, queries):
exp_sq_dist_matrix = np.exp(-1 * get_sqdistance_matrix(dataset, queries))
return np.mean(exp_sq_dist_matrix, axis=0).T
### LSQ with Random Fourier Features ###
class LSQ_RFF:
def __init__(self, dataset, dimension, reps):
self.n = None
self.d = dimension
self.reps = reps
# Sample random fourier features
self.rff = np.sqrt(2) * np.random.normal(0, 1, (self.d, self.reps))
self.rff_shift = np.random.uniform(0, 2*np.pi, self.reps).reshape(1, -1)
self.rff_kde = None
self.sanitized_rff_kde = None
self.sketch_dataset(dataset)
def sketch_dataset(self, dataset):
self.n = dataset.shape[0]
self.rff_kde = np.mean(self.apply_rff(dataset), axis=0)
def apply_rff(self, m):
return np.sqrt(2) * np.cos(np.dot(m, self.rff) + self.rff_shift)
def sanitize(self, epsilon):
self.sanitized_rff_kde = self.rff_kde + \
np.random.laplace(0, np.sqrt(2) * self.reps * 1. / (epsilon * self.n), self.reps)
def non_private_kde(self, queries):
return (1./self.reps) * np.dot(self.rff_kde, self.apply_rff(queries).T)
def private_kde(self, queries):
return (1./self.reps) * np.dot(self.sanitized_rff_kde, self.apply_rff(queries).T)
### LSQ with Fast Gauss Transform ###
class LSQ_FGT:
def __init__(self, dataset, dimension, coordinate_range, rho):
self.n = None
self.d = dimension
self.coordinate_range = coordinate_range
self.small_radius_squared = rho
self.rho = rho
# Sketch
self.sketch = np.zeros((self.coordinate_range ** self.d, self.rho ** self.d))
self.sanitized_sketch = None
# Sketch indexing auxiliaries
self.aux_dim0_powers = np.flip(np.array([self.coordinate_range ** i for i in range(self.d)]))
self.aux_dim0_tuples = np.indices(tuple([self.coordinate_range] * self.d)).reshape(self.d, -1).T
self.aux_dim1_tuples = np.indices(tuple([self.rho] * self.d)).reshape(self.d, -1).T
self.noise_scale = (2 * (1 - 0.5 ** self.rho)) ** self.d
# Hermite polynomials
self.hermite_polynomials = [sp.special.hermite(j) for j in np.arange(self.rho)]
self.sketch_dataset(dataset)
def sketch_dataset(self, dataset):
self.n = dataset.shape[0]
if self.rho == 0:
return
# Partition dataset into hypercubes
rounded_dataset = np.rint(dataset)
# Compute the index of the cell containing each data point
cell_indices = rounded_dataset.dot(self.aux_dim0_powers).astype(np.dtype(int))
if self.rho == 1:
np.add.at(self.sketch[:, 0], cell_indices, np.ones(self.n))
elif self.d == 2:
residual_dataset = dataset - rounded_dataset
all_powers = np.einsum('nk,nl->nkl',
np.vstack([residual_dataset[:, 0]**j for j in range(self.rho)]).T,
np.vstack([residual_dataset[:, 1]**j for j in range(self.rho)]).T
).reshape(self.n, -1)
np.add.at(self.sketch, cell_indices, all_powers)
else:
residual_dataset = dataset - rounded_dataset
for idx, idx_tuple in enumerate(self.aux_dim1_tuples):
np.add.at(self.sketch[:, idx], cell_indices, np.prod(np.power(residual_dataset, idx_tuple), axis=1))
self.sketch *= 1. / self.n
def sanitize(self, epsilon):
self.sanitized_sketch = self.sketch + \
np.random.laplace(0, self.noise_scale * 1. / (epsilon * self.n), self.sketch.shape)
def g(self, query):
# Compute the cells which are close enough to matter
cell_distances = get_sqdistance_matrix(query.reshape(1, -1), self.aux_dim0_tuples)
relevant_cells = np.where(cell_distances.ravel() <= self.small_radius_squared)[0]
# Compute normalized hermite functions of all query residual coordinates in relevant cells
# (Denominator turns hermite polynomial to hermite function)
q_residuals = query - self.aux_dim0_tuples[relevant_cells, :]
denominator = np.exp(q_residuals ** 2)
hermite_evaluations = [(1. / np.math.factorial(j)) *
self.hermite_polynomials[j](q_residuals) / denominator for j in np.arange(self.rho)]
# Compute g-coordinates of query
q_sketch = np.zeros((len(relevant_cells), self.rho ** self.d))
for cell_id in range(len(relevant_cells)):
for idx, idx_tuple in enumerate(self.aux_dim1_tuples):
q_sketch[cell_id, idx] = np.prod([hermite_evaluations[idx_tuple[i]][cell_id, i] for i in range(self.d)])
return q_sketch, relevant_cells
def one_query_kde(self, query, sanitized):
if sanitized:
dataset_sketch = self.sanitized_sketch
else:
dataset_sketch = self.sketch
q_sketch, relevant_cells = self.g(query)
return dataset_sketch[relevant_cells, :].ravel().dot(q_sketch.ravel())
def non_private_kde(self, queries):
return np.array([self.one_query_kde(query, False) for query in queries])
def private_kde(self, queries):
return np.array([self.one_query_kde(query, True) for query in queries])
if __name__ == '__main__':
### Usage example ###
# Generate random dataset and queries:
dimension = 2
coordinate_range = 10
n_data = 100000
n_queries = 20
dataset = np.random.uniform(0, coordinate_range-1, (n_data, dimension))
queries = np.random.uniform(0, coordinate_range-1, (n_queries, dimension))
method = "lsq-rff" # or "lsq-fgt"
print("DP-KDE method:", method)
if method == "lsq-rff":
# Init LSQ-RFF:
num_features = 200
mechanism = LSQ_RFF(dataset, dimension, num_features)
elif method == "lsq-fgt":
# init LSQ-FGT:
rho = 4
mechanism = LSQ_FGT(dataset, dimension, coordinate_range, rho)
# Exact Gaussian KDE:
exact_kde = GaussianKDE(dataset, queries)
print("Exact:", exact_kde)
# Non-DP KDE estimates:
non_dp_kde_estimate = mechanism.non_private_kde(queries)
print("Non-DP estimate:", non_dp_kde_estimate)
print("Mean error:", np.mean(np.abs(exact_kde - non_dp_kde_estimate)))
# DP KDE estimates:
epsilon = 0.5
mechanism.sanitize(epsilon)
dp_kde_estimate = mechanism.private_kde(queries)
print("DP estimate:", dp_kde_estimate)
print("Mean error:", np.mean(np.abs(exact_kde - dp_kde_estimate)))