add some tests

scikit-learn-contrib · Sep 29, 2024 · 757d4d5 · 757d4d5
1 parent 7813b90
commit 757d4d5
Show file tree

Hide file tree

Showing 3 changed files with 36 additions and 7 deletions.
diff --git a/hdbscan/_hdbscan_tree.pyx b/hdbscan/_hdbscan_tree.pyx
@@ -515,12 +515,11 @@ cdef np.ndarray[np.intp_t, ndim=1] do_labelling(
         if cluster < root_cluster:
             result[n] = -1
         elif cluster == root_cluster:
-            if len(clusters) == 1 and allow_single_cluster:
+            if len(clusters) == 1 and allow_single_cluster and cluster in cluster_label_map:
+                # check if cluster was not pruned by max_cluster_size or
+                # cluster_selection_epsilon_max before executing this
                 if cluster_selection_epsilon != 0.0:
-                    if (tree['lambda_val'][tree['child'] == n] >= 1 / cluster_selection_epsilon) \
-                        and cluster in cluster_label_map:
-                        # check if cluster was not pruned by max_cluster_size or
-                        # cluster_selection_epsilon_max before executing this
+                    if tree['lambda_val'][tree['child'] == n] >= 1 / cluster_selection_epsilon:
                         result[n] = cluster_label_map[cluster]
                     else:
                         result[n] = -1

diff --git a/hdbscan/hdbscan_.py b/hdbscan/hdbscan_.py
@@ -735,8 +735,8 @@ def hdbscan(
                 "Minkowski metric with negative p value is not" " defined!"
             )
 
-    if cluster_selection_epsilon_max <= 0:
-        raise ValueError("Cluster selection epsilon max must be a positive value!")
+    if cluster_selection_epsilon_max < cluster_selection_epsilon:
+        raise ValueError("Cluster selection epsilon max must be greater than epsilon!")
 
     if match_reference_implementation:
         min_samples = min_samples - 1

diff --git a/hdbscan/tests/test_hdbscan.py b/hdbscan/tests/test_hdbscan.py
@@ -649,6 +649,36 @@ def test_hdbscan_allow_single_cluster_with_epsilon():
     assert counts[unique_labels == -1] == 2
 
 
+def test_hdbscan_cluster_selection_epsilon_max():
+    """Test that reducing the cluster_selection_epsilon_max parameter
+    results in more clusters with smaller sizes being found."""
+    blobs, _ = make_blobs(n_samples=50,
+                          centers=[(1, 0), (-1, 0), (-1, 1), (1, 1)],  # four blob centres
+                          cluster_std=0.2,
+                          random_state=42)  # fixed seed so the asserted label sets are reproducible
+
+    clusterer = HDBSCAN(cluster_selection_epsilon_max=2.0,  # looser cap, fitted first
+                        allow_single_cluster=True)
+    clusterer.fit(blobs)
+
+    assert_array_equal(np.unique(clusterer.labels_), np.array([0, 1]))  # exactly two labels, no -1
+
+    clusterer = HDBSCAN(cluster_selection_epsilon_max=1.0,  # tighter cap than the 2.0 used above
+                        allow_single_cluster=True)
+    clusterer.fit(blobs)
+
+    assert_array_equal(np.unique(clusterer.labels_), np.array([-1, 0, 1, 2, 3]))  # four labels plus -1
+
+
+def test_hdbscan_parameters_do_not_trigger_errors():  # smoke test: passes when fit() does not raise
+    blobs, _ = make_blobs(n_samples=50,
+                          centers=[(1, 0), (-1, 0), (-1, 1), (1, 1)],  # four blob centres
+                          cluster_std=0.2,
+                          random_state=42)  # fixed seed for a reproducible input
+    clusterer = HDBSCAN(max_cluster_size=1,  # NOTE(review): presumably small enough to prune every cluster - confirm
+                        allow_single_cluster=True)
+    clusterer.fit(blobs)  # no assertion follows; only the absence of an exception is checked
+
 # Disable for now -- need to refactor to meet newer standards
 @pytest.mark.skip(reason="need to refactor to meet newer standards")
 def test_hdbscan_is_sklearn_estimator():