Skip to content

Commit

Permalink
add some tests
Browse files Browse the repository at this point in the history
  • Loading branch information
joaopmatias committed Sep 29, 2024
1 parent 7813b90 commit 757d4d5
Show file tree
Hide file tree
Showing 3 changed files with 36 additions and 7 deletions.
9 changes: 4 additions & 5 deletions hdbscan/_hdbscan_tree.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -515,12 +515,11 @@ cdef np.ndarray[np.intp_t, ndim=1] do_labelling(
if cluster < root_cluster:
result[n] = -1
elif cluster == root_cluster:
if len(clusters) == 1 and allow_single_cluster:
if len(clusters) == 1 and allow_single_cluster and cluster in cluster_label_map:
# check if cluster was not pruned by max_cluster_size or
# cluster_selection_epsilon_max before executing this
if cluster_selection_epsilon != 0.0:
if (tree['lambda_val'][tree['child'] == n] >= 1 / cluster_selection_epsilon) \
and cluster in cluster_label_map:
# check if cluster was not pruned by max_cluster_size or
# cluster_selection_epsilon_max before executing this
if tree['lambda_val'][tree['child'] == n] >= 1 / cluster_selection_epsilon:
result[n] = cluster_label_map[cluster]
else:
result[n] = -1
Expand Down
4 changes: 2 additions & 2 deletions hdbscan/hdbscan_.py
Original file line number Diff line number Diff line change
Expand Up @@ -735,8 +735,8 @@ def hdbscan(
"Minkowski metric with negative p value is not" " defined!"
)

if cluster_selection_epsilon_max <= 0:
raise ValueError("Cluster selection epsilon max must be a positive value!")
if cluster_selection_epsilon_max < cluster_selection_epsilon:
raise ValueError("Cluster selection epsilon max must be greater than epsilon!")

if match_reference_implementation:
min_samples = min_samples - 1
Expand Down
30 changes: 30 additions & 0 deletions hdbscan/tests/test_hdbscan.py
Original file line number Diff line number Diff line change
Expand Up @@ -649,6 +649,36 @@ def test_hdbscan_allow_single_cluster_with_epsilon():
assert counts[unique_labels == -1] == 2


def test_hdbscan_cluster_selection_epsilon_max():
    """Check how ``cluster_selection_epsilon_max`` changes the labels found.

    Fifty points are drawn around four blob centers and clustered twice
    with ``allow_single_cluster=True``:

    * ``cluster_selection_epsilon_max=2.0`` must yield exactly the labels
      ``{0, 1}``;
    * ``cluster_selection_epsilon_max=1.0`` must yield the labels
      ``{-1, 0, 1, 2, 3}``, i.e. more clusters plus noise.
    """
    centers = [(1, 0), (-1, 0), (-1, 1), (1, 1)]
    points, _ = make_blobs(
        n_samples=50,
        centers=centers,
        cluster_std=0.2,
        random_state=42,
    )

    # (epsilon_max, expected sorted unique labels), larger threshold first,
    # in the same order the two fits were originally performed.
    cases = (
        (2.0, [0, 1]),
        (1.0, [-1, 0, 1, 2, 3]),
    )
    for epsilon_max, expected_labels in cases:
        model = HDBSCAN(
            cluster_selection_epsilon_max=epsilon_max,
            allow_single_cluster=True,
        )
        model.fit(points)

        found_labels = np.unique(model.labels_)
        assert_array_equal(found_labels, np.array(expected_labels))


def test_hdbscan_parameters_do_not_trigger_errors():
    """Smoke test: fitting with ``max_cluster_size=1`` must not raise.

    Uses ``allow_single_cluster=True`` on fifty points drawn around four
    blob centers. Nothing is asserted about the resulting labels; the
    test passes as long as ``fit`` completes.
    """
    centers = [(1, 0), (-1, 0), (-1, 1), (1, 1)]
    points, _ = make_blobs(
        n_samples=50,
        centers=centers,
        cluster_std=0.2,
        random_state=42,
    )

    model = HDBSCAN(max_cluster_size=1, allow_single_cluster=True)
    model.fit(points)

# Disable for now -- need to refactor to meet newer standards
@pytest.mark.skip(reason="need to refactor to meet newer standards")
def test_hdbscan_is_sklearn_estimator():
Expand Down

0 comments on commit 757d4d5

Please sign in to comment.