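# NOTE: the module header preceding this class was not preserved in this excerpt.
# The imports below are a minimal, assumed set reconstructed from the code that
# follows; they are a sketch, not the module's original header. Archive, Instance
# and RNG come from local modules whose import paths are not shown here.
import json
from typing import Dict, Iterable, Optional, Sequence, Tuple

import numpy as np
import numpy.typing as npt
from sklearn.cluster import KMeans
from sklearn.neighbors import KDTree

# from .archives import Archive, Instance, RNG  # hypothetical local import path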
class CVTArchive(Archive, RNG):
"""An Archive that divides a high-dimensional measure space into k homogeneous geometric regions.
Based on the paper from Vassiliades et al (2018) <https://ieeexplore.ieee.org/document/8000667>
> The computational complexity of the method we provide for constructing the CVT (in Algorithm 1) is O(ndki),
> where n is the number of d-dimensional samples to be clustered, k is the number of clusters,
> and i is the number of iterations needed until convergence
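
    Example:
        A minimal usage sketch (assuming ``Instance`` objects whose ``descriptor``
        lies inside the given ranges and whose ``fitness`` values are comparable)::

            archive = CVTArchive(k=100, ranges=[(-1, 1), (-2, 2)], n_samples=10_000)
            archive.extend(instances)      # keeps at most one (the best) instance per region
            print(archive.regions)         # 100
            print(len(archive.instances))  # number of occupied regions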
"""
def __init__(
self,
k: int,
ranges: Sequence[Tuple[float, float]],
n_samples: int,
centroids: Optional[npt.NDArray | str] = None,
samples: Optional[npt.NDArray | str] = None,
dtype=np.float64,
seed: int = 42,
):
"""Creates a CVTArchive object
Args:
k (int): Number of centroids (regions) to create
ranges (Sequence[Tuple[float, float]]): Ranges of the measure space. Upper and lower bound of each
dimension of the measure space, e.g. ``[(-1, 1), (-2, 2)]``
indicates the first dimension should have bounds :math:`[-1,1]`
(inclusive), and the second dimension should have bounds
:math:`[-2,2]` (inclusive). The legnth of ``ranges`` indicates the number of dimensions of the measure space.
n_samples (int): Number of samples to generate before calculating the centroids.
centroids (Optional[npt.NDArray | str], optional): Precalculated centroids for the archive.
The options are a np.ndarray with the values of ``k`` centroids or a .txt with the centroids to be loaded by Numpy. Defaults to None.
samples (Optional[npt.NDArray | str], optional): Precalculated samples for the archive.
The options are a np.ndarray with the values of ``n_samples`` samples or a .txt with the samples to be loaded by Numpy. Defaults to None.
Raises:
ValueError: If len(ranges) <= 0.
ValueError: If the number of samples is less than zero or less than the number of regions (k).
ValueError: If the number of regions is less than zero.
ValueError: If the samples file cannot be loaded.
ValueError: If given a samples np.ndarray the number of samples in the file is different from the number of expected samples (n_samples).
ValueError: If the centroids file cannot be loaded.
ValueError: If given a centroids np.ndarray the number of centroids in the file is different from the number of regions (k).
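
        Example:
            A sketch of reusing precalculated centroids; the ``.npy`` path below is
            hypothetical and must contain ``k`` d-dimensional centroids::

                archive = CVTArchive(
                    k=100,
                    ranges=[(-1, 1), (-2, 2)],
                    n_samples=10_000,
                    centroids="CVTArchive_centroids.npy",
                )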
"""
Archive.__init__(self, threshold=np.finfo(np.float32).max, dtype=dtype)
if k <= 0:
raise ValueError(f"The number of regions (k = {k}) must be >= 1")
if len(ranges) <= 0:
raise ValueError(
f"ranges must have length >= 1 and it has length {len(ranges)}"
)
if n_samples <= 0 or n_samples < k:
raise ValueError(
f"The number of samples (n_samples = {n_samples}) must be >= 1 and >= regions (k = {k})"
)
self._dimensions = len(ranges)
ranges = list(zip(*ranges))
self._lower_bounds = np.array(ranges[0], dtype=self._dtype)
self._upper_bounds = np.array(ranges[1], dtype=self._dtype)
self._interval = self._upper_bounds - self._lower_bounds
self._k = k
self._n_samples = n_samples
self._samples = None
self._centroids = None
self.initialize_rng(seed=seed)
self._kmeans = KMeans(n_clusters=self._k, n_init=1, random_state=self._seed)
# Data Structure to store the instances in the CVT
self._grid: Dict[int, Instance] = {}
# Loading samples if given
if samples is not None:
if isinstance(samples, str):
try:
self._samples = np.load(samples)
self._n_samples = len(self._samples)
                except Exception as exc:
                    raise ValueError(
                        f"Error in CVTArchive.__init__() loading the samples file {samples}."
                    ) from exc
elif isinstance(samples, np.ndarray) and len(samples) != n_samples:
raise ValueError(
f"The number of samples {len(samples)} must be equal to the number of expected samples (n_samples = {n_samples})"
)
else:
self._samples = np.asarray(samples)
if centroids is not None:
if isinstance(centroids, str):
try:
self._centroids = np.load(centroids)
self._k = len(self._centroids)
                except Exception as exc:
                    raise ValueError(
                        f"Error in CVTArchive.__init__() loading the centroids file {centroids}."
                    ) from exc
elif isinstance(centroids, np.ndarray) and len(centroids) != k:
raise ValueError(
f"The number of centroids {len(centroids)} must be equal to the number of regions (k = {self._k})"
)
else:
self._centroids = np.asarray(centroids)
else:
# Generate centroids
if self._samples is None:
# Generate uniform samples if not given
rng = np.random.default_rng(seed=self._seed)
self._samples = rng.uniform(
low=self._lower_bounds,
high=self._upper_bounds,
size=(self._n_samples, self._dimensions),
)
self._kmeans.fit(self._samples)
self._centroids = self._kmeans.cluster_centers_
self._kdtree = KDTree(self._centroids, metric="euclidean")
@property
def dimensions(self) -> int:
"""Dimensions of the measure space used
Returns:
int: Dimensions of the measure space used
"""
return self._dimensions
@property
def samples(self) -> np.ndarray:
"""Returns the samples used to generate the centroids
Returns:
np.ndarray: Samples
"""
return self._samples
@property
def centroids(self) -> np.ndarray:
"""Returns k centroids calculated from the samples
Returns:
np.ndarray: K d-dimensional centroids
"""
return self._centroids
@property
def regions(self) -> int:
"""Number of regions (k) of centroids in the CVTArchive
Returns:
int: k
"""
return self._k
@property
def bounds(self) -> Tuple[np.ndarray, np.ndarray]:
"""Tuple with the lower and upper bounds of the measure space
The first value is the lower bounds and the second value is the upper bounds.
Each value is a list with the corresponding lower/upper bound of the ith dimension
in the measure space
"""
return (self._lower_bounds, self._upper_bounds)
@property
def instances(self) -> list[Instance]:
return list(self._grid.values())
def __str__(self):
return f"CVArchive(dim={self._dimensions},regions={self._k},centroids={self._centroids})"
def __repr__(self):
return f"CVArchive(dim={self._dimensions},regions={self._k},centroids={self._centroids})"
def __iter__(self):
"""Iterates over the dictionary of instances
Returns:
Iterator: Yields position in the hypercube and instance located in such position
"""
return iter(self._grid.values())
def lower_i(self, i) -> np.float64:
        if i < 0 or i >= len(self._lower_bounds):
            msg = f"index {i} is out of bounds. Valid values are [0-{len(self._lower_bounds) - 1}]"
raise ValueError(msg)
return self._lower_bounds[i]
def upper_i(self, i) -> np.float64:
        if i < 0 or i >= len(self._upper_bounds):
            msg = f"index {i} is out of bounds. Valid values are [0-{len(self._upper_bounds) - 1}]"
raise ValueError(msg)
return self._upper_bounds[i]
def append(self, instance: Instance):
"""Inserts an Instance into the Grid
Args:
            instance (Instance): Instance to be inserted
        Raises:
            TypeError: ``instance`` is not an instance of the class Instance.
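
        Example:
            A minimal sketch, assuming ``instance`` is an ``Instance`` whose descriptor
            lies inside the archive bounds::

                archive.append(instance)
                # kept only if its region is empty or it compares greater than the incumbent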
"""
if isinstance(instance, Instance):
index = self.index_of(np.asarray(instance.descriptor).reshape(1, -1))[0]
if index not in self._grid or instance > self._grid[index]:
self._grid[index] = instance.clone()
else:
msg = "Only objects of type Instance can be inserted into a CVTArchive"
raise TypeError(msg)
def extend(self, iterable: Iterable[Instance]):
"""Includes all the instances in iterable into the Grid
Args:
iterable (Iterable[Instance]): Iterable of instances
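
        Example:
            A sketch of batch insertion, assuming ``new_instances`` is a list of
            ``Instance`` objects::

                archive.extend(new_instances)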
"""
        # Materialise the iterable first so a generator is not exhausted by the type check
        instances = list(iterable)
        if not all(isinstance(i, Instance) for i in instances):
            msg = "Only objects of type Instance can be inserted into a CVTArchive"
            raise TypeError(msg)
        indices = self.index_of([i.descriptor for i in instances])
        for idx, instance in zip(indices, instances, strict=True):
            if idx not in self._grid or instance.fitness > self._grid[idx].fitness:
                self._grid[idx] = instance.clone()
def remove(self, iterable: Iterable[Instance]):
"""Removes all the instances in iterable from the grid"""
        # Materialise the iterable first so a generator is not exhausted by the type check
        instances = list(iterable)
        if not all(isinstance(i, Instance) for i in instances):
            msg = "Only objects of type Instance can be removed from a CVTArchive"
            raise TypeError(msg)
        indices_to_remove = self.index_of([i.descriptor for i in instances])
        for index in indices_to_remove:
            if index in self._grid:
                del self._grid[index]
def remove_unfeasible(self, attr: str = "p"):
"""Removes all the unfeasible instances from the grid"""
keys_to_remove = [
i for i in self._grid.keys() if getattr(self._grid[i], attr) < 0
]
for i in keys_to_remove:
del self._grid[i]
def index_of(self, descriptors) -> np.ndarray:
"""Computes the indeces of a batch of descriptors.
Args:
descriptors (array-like): (batch_size, dimensions) array of descriptors for each instance
Raises:
ValueError: ``descriptors`` is not shape (batch_size, dimensions)
Returns:
np.ndarray: (batch_size, ) array of integer indices representing the flattened grid coordinates.
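
        Example:
            A minimal sketch, assuming a 2-dimensional measure space::

                indices = archive.index_of([[0.5, -1.0], [0.0, 0.0]])
                # indices.shape == (2,); each entry is the index of the closest centroid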
"""
        descriptors = np.asarray(descriptors)
        if len(descriptors) == 0:
            return np.empty(0, dtype=np.int32)
        if (descriptors.ndim == 1 and descriptors.shape[0] != self._dimensions) or (
            descriptors.ndim == 2 and descriptors.shape[1] != self._dimensions
        ):
            raise ValueError(
                f"Expected descriptors to be an array with shape "
                f"(batch_size, dimensions) (i.e. shape "
                f"(batch_size, {self._dimensions})) but it had shape "
                f"{descriptors.shape}"
            )
        if descriptors.ndim == 1:
            # KDTree.query expects a 2D array, so promote a single descriptor to a batch of one
            descriptors = descriptors.reshape(1, -1)
        indices = self._kdtree.query(descriptors, return_distance=False)
        indices = indices[:, 0]
        return indices.astype(np.int32)
def to_file(self, file_pattern: str = "CVTArchive"):
"""Saves the centroids and the samples of the CVTArchive to .npy files
        Each attribute is saved to its own file.
        Therefore, ``file_pattern`` is expected not to contain any extension.
Args:
file_pattern (str, optional): Pattern of the expected filenames. Defaults to "CVTArchive".
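
        Example:
            A minimal sketch; the pattern is hypothetical, and the call writes
            ``my_archive_centroids.npy`` and ``my_archive_samples.npy`` to the working directory::

                archive.to_file("my_archive")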
"""
np.save(f"{file_pattern}_centroids.npy", self._centroids)
np.save(f"{file_pattern}_samples.npy", self._samples)
@classmethod
def load_from_json(cls, filename: str):
"""Creates a CVTArchive object from the content of a previously created JSON file
Args:
filename (str): Filename of the JSON file with the CVTArchive information
Raises:
ValueError: If there's any error while loading the file. (IOError)
ValueError: If the JSON file does not contain all the expected keys
Returns:
Self: Returns a CVTArchive object
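
        Example:
            A sketch of rebuilding an archive from a file previously written with
            ``to_json`` (the filename is hypothetical)::

                archive = CVTArchive.load_from_json("my_archive.json")
                # only the configuration (bounds, centroids, samples) is rebuilt,
                # not the instances stored in the original archive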
"""
expected_keys = {
"dimensions",
"n_samples",
"regions",
"lbs",
"ubs",
"centroids",
"samples",
}
try:
with open(filename, "r") as file:
json_data = json.load(file)
                if not expected_keys.issubset(json_data.keys()):
raise ValueError(
f"The JSON file does not contain all the minimum expected keys. Expected keys are {expected_keys} and got {json_data.keys()}"
)
_ranges = [
(l_i, u_i) for l_i, u_i in zip(json_data["lbs"], json_data["ubs"])
]
new_archive = cls(
k=json_data["regions"],
ranges=_ranges,
n_samples=json_data["n_samples"],
centroids=json_data["centroids"],
samples=json_data["samples"],
)
return new_archive
        except IOError as io:
            raise ValueError(f"Error opening file {filename}. Reason -> {io.strerror}") from io
    def asdict(self) -> dict:
        """Returns a dict with the configuration and contents of the CVTArchive"""
return {
"dimensions": self._dimensions,
"n_samples": self._n_samples,
"regions": self._k,
"lbs": self._lower_bounds.tolist(),
"ubs": self._upper_bounds.tolist(),
"centroids": self._centroids.tolist(),
"samples": self._samples.tolist(),
"instances": {
i: instance.asdict() for i, instance in enumerate(self._grid.values())
},
}
def to_json(self, filename: Optional[str] = None) -> str:
"""Returns the content of the CVTArchive in JSON format.
Returns:
str: String in JSON format with the content of the CVTArchive
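
        Example:
            A minimal sketch; the filename is hypothetical and the ``.json`` extension
            is appended when missing::

                json_str = archive.to_json("my_archive")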
"""
json_data = json.dumps(self.asdict(), indent=4)
if filename is not None:
filename = (
f"{filename}.json" if not filename.endswith(".json") else filename
)
with open(filename, "w") as f:
f.write(json_data)
return json_data