Merge branch 'master' into fix/remove-cython-from-install-requires

scikit-learn-contrib · Jul 22, 2024 · 8e76624 · 8e76624
2 parents bd486d0 + c201b2e
commit 8e76624
Show file tree

Hide file tree

Showing 13 changed files with 29 additions and 36 deletions.
diff --git a/.github/workflows/pythonpublish_wheel.yml b/.github/workflows/pythonpublish_wheel.yml
@@ -37,7 +37,7 @@ jobs:
   other-deploy:
     strategy:
       matrix:
-        python: ["3.8", "3.9", "3.10", "3.11"]
+        python: ["3.9", "3.10", "3.11", "3.12"]
         os: [windows-2019, macos-11]
     runs-on: ${{ matrix.os }}
     steps:
@@ -52,7 +52,7 @@ jobs:
         run: |
           python -m pip install --upgrade pip
           pip install setuptools build wheel twine
-          pip install "cython<3" oldest-supported-numpy
+          pip install cython "numpy>=2"
       - name: Build wheel
         run: |
           python -m build --no-isolation

diff --git a/azure-pipelines.yml b/azure-pipelines.yml
@@ -27,24 +27,6 @@ stages:
       - job: run_platform_tests
         strategy:
           matrix:
-            mac_py37:
-              imageName: 'macOS-latest'
-              python.version: '3.7'
-            linux_py37:
-              imageName: 'ubuntu-latest'
-              python.version: '3.7'
-            windows_py37:
-              imageName: 'windows-latest'
-              python.version: '3.7'            
-            mac_py38:
-              imageName: 'macOS-latest'
-              python.version: '3.8'
-            linux_py38:
-              imageName: 'ubuntu-latest'
-              python.version: '3.8'
-            windows_py38:
-              imageName: 'windows-latest'
-              python.version: '3.8'
             mac_py39:
               imageName: 'macOS-latest'
               python.version: '3.9'
@@ -72,6 +54,15 @@ stages:
             windows_py311:
               imageName: 'windows-latest'
               python.version: '3.11'
+            mac_py312:
+              imageName: 'macOS-latest'
+              python.version: '3.12'
+            linux_py312:
+              imageName: 'ubuntu-latest'
+              python.version: '3.12'
+            windows_py312:
+              imageName: 'windows-latest'
+              python.version: '3.12'
         pool:
           vmImage: $(imageName)
 

diff --git a/docs/how_to_use_epsilon.rst b/docs/how_to_use_epsilon.rst
@@ -43,7 +43,7 @@ In our example, we choose to merge nested clusters below 5 meters (0.005 kilomet
 
 	X = np.radians(coordinates) #convert the list of lat/lon coordinates to radians
 	earth_radius_km = 6371
-	epsilon = 0.005 / earth_radius #calculate 5 meter epsilon threshold
+	epsilon = 0.005 / earth_radius_km #calculate 5 meter epsilon threshold
 	
 	clusterer = hdbscan.HDBSCAN(min_cluster_size=4, metric='haversine', 
 	cluster_selection_epsilon=epsilon, cluster_selection_method = 'eom')

diff --git a/docs/parameter_selection.rst b/docs/parameter_selection.rst
@@ -128,7 +128,7 @@ Selecting ``min_samples``
 Since we have seen that ``min_samples`` clearly has a dramatic effect on
 clustering, the question becomes: how do we select this parameter? The
 simplest intuition for what ``min_samples`` does is provide a measure of
-how conservative you want you clustering to be. The larger the value of
+how conservative you want your clustering to be. The larger the value of
 ``min_samples`` you provide, the more conservative the clustering --
 more points will be declared as noise, and clusters will be restricted
 to progressively more dense areas. We can see this in practice by

diff --git a/hdbscan/_hdbscan_linkage.pyx b/hdbscan/_hdbscan_linkage.pyx
@@ -33,7 +33,7 @@ cpdef np.ndarray[np.double_t, ndim=2] mst_linkage_core(
     result = np.zeros((distance_matrix.shape[0] - 1, 3))
     node_labels = np.arange(distance_matrix.shape[0], dtype=np.intp)
     current_node = 0
-    current_distances = np.infty * np.ones(distance_matrix.shape[0])
+    current_distances = np.inf * np.ones(distance_matrix.shape[0])
     current_labels = node_labels
     for i in range(1, node_labels.shape[0]):
         label_filter = current_labels != current_node
@@ -100,7 +100,7 @@ cpdef np.ndarray[np.double_t, ndim=2] mst_linkage_core_vector(
     result_arr = np.zeros((dim - 1, 3))
     in_tree_arr = np.zeros(dim, dtype=np.int8)
     current_node = 0
-    current_distances_arr = np.infty * np.ones(dim)
+    current_distances_arr = np.inf * np.ones(dim)
     current_sources_arr = np.ones(dim)
 
     result = (<np.double_t[:dim - 1, :3:1]> (<np.double_t *> result_arr.data))

diff --git a/hdbscan/_hdbscan_reachability.pyx b/hdbscan/_hdbscan_reachability.pyx
@@ -79,7 +79,7 @@ cpdef sparse_mutual_reachability(object lil_matrix, np.intp_t min_points=5,
         if min_points - 1 < len(sorted_row_data):
             core_distance[i] = sorted_row_data[min_points - 1]
         else:
-            core_distance[i] = np.infty
+            core_distance[i] = np.inf
 
     if alpha != 1.0:
         lil_matrix = lil_matrix / alpha

diff --git a/hdbscan/_hdbscan_tree.pyx b/hdbscan/_hdbscan_tree.pyx
@@ -71,7 +71,7 @@ cpdef np.ndarray condense_tree(np.ndarray[np.double_t, ndim=2] hierarchy,
     cdef list result_list
 
     cdef np.ndarray[np.intp_t, ndim=1] relabel
-    cdef np.ndarray[np.int_t, ndim=1] ignore
+    cdef np.ndarray[np.int8_t, ndim=1] ignore
     cdef np.ndarray[np.double_t, ndim=1] children
 
     cdef np.intp_t node
@@ -91,7 +91,7 @@ cpdef np.ndarray condense_tree(np.ndarray[np.double_t, ndim=2] hierarchy,
     relabel = np.empty(root + 1, dtype=np.intp)
     relabel[root] = num_points
     result_list = []
-    ignore = np.zeros(len(node_list), dtype=int)
+    ignore = np.zeros(len(node_list), dtype=np.int8)
 
     for node in node_list:
         if ignore[node] or node < num_points:
@@ -251,7 +251,7 @@ cdef list bfs_from_cluster_tree(np.ndarray tree, np.intp_t bfs_root):
 
     while to_process.shape[0] > 0:
         result.extend(to_process.tolist())
-        to_process = tree['child'][np.in1d(tree['parent'], to_process)]
+        to_process = tree['child'][np.isin(tree['parent'], to_process)]
 
     return result
 
@@ -725,8 +725,10 @@ cpdef tuple get_clusters(np.ndarray tree, dict stability,
     # if you do, change this accordingly!
     if allow_single_cluster:
         node_list = sorted(stability.keys(), reverse=True)
+        node_list = [int(n) for n in node_list]
     else:
         node_list = sorted(stability.keys(), reverse=True)[:-1]
+        node_list = [int(n) for n in node_list]
         # (exclude root)
 
     cluster_tree = tree[tree['child_size'] > 1]

diff --git a/hdbscan/plots.py b/hdbscan/plots.py
@@ -28,7 +28,7 @@ def _bfs_from_cluster_tree(tree, bfs_root):
 
     while to_process:
         result.extend(to_process)
-        to_process = tree['child'][np.in1d(tree['parent'], to_process)].tolist()
+        to_process = tree['child'][np.isin(tree['parent'], to_process)].tolist()
 
     return result
 

diff --git a/hdbscan/prediction.py b/hdbscan/prediction.py
@@ -81,7 +81,7 @@ def _clusters_below(self, cluster):
         while to_process:
             result.extend(to_process)
             to_process = \
-                self.cluster_tree['child'][np.in1d(self.cluster_tree['parent'],
+                self.cluster_tree['child'][np.isin(self.cluster_tree['parent'],
                                                    to_process)]
             to_process = to_process.tolist()
 

diff --git a/hdbscan/validity.py b/hdbscan/validity.py
@@ -180,8 +180,8 @@ def internal_minimum_spanning_tree(mr_distances):
     # np.isin keeps the (2, n_edges) shape of the edge endpoints, so the
     # product over axis 0 acts as a logical "and" across both endpoints of
     # each edge; we then convert back to boolean type.
-    edge_selection = np.prod(np.in1d(min_span_tree.T[:2], vertices).reshape(
-        (min_span_tree.shape[0], 2), order='F'), axis=1).astype(bool)
+    edge_selection = np.prod(
+        np.isin(min_span_tree.T[:2], vertices), axis=0).astype(bool)
 
     # Density sparseness is not well defined if there are no
     # internal edges (as per the referenced paper). However

diff --git a/pyproject.toml b/pyproject.toml
@@ -2,6 +2,6 @@
 requires = [
   "setuptools",
   "wheel",
-  "cython<3",
-  "oldest-supported-numpy"
+  "cython<4",
+  "numpy<3"
 ]
diff --git a/requirements.txt b/requirements.txt
@@ -1,4 +1,4 @@
-numpy>=1.20
+numpy>=1.20,<2
 scipy>= 1.0
 scikit-learn>=0.20
 joblib>=1.0
diff --git a/setup.py b/setup.py
@@ -51,7 +51,7 @@ def requirements():
 
 configuration = {
     'name': 'hdbscan',
-    'version': '0.8.34-rc1',
+    'version': '0.8.37',
     'description': 'Clustering based on density with variable density clusters',
     'long_description': readme(),
     'classifiers': [