From 47ae0e2cc20f161e6ac5b5f1a87885cd7b945444 Mon Sep 17 00:00:00 2001
From: Daniel Mietchen
Date: Tue, 30 Apr 2024 16:47:57 +0200
Subject: [PATCH 01/13] Typo fix in parameter_selection.rst

---
 docs/parameter_selection.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/parameter_selection.rst b/docs/parameter_selection.rst
index 1952cd7f..0f459df3 100644
--- a/docs/parameter_selection.rst
+++ b/docs/parameter_selection.rst
@@ -128,7 +128,7 @@ Selecting ``min_samples``
 Since we have seen that ``min_samples`` clearly has a dramatic effect on
 clustering, the question becomes: how do we select this parameter? The
 simplest intuition for what ``min_samples`` does is provide a measure of
-how conservative you want you clustering to be. The larger the value of
+how conservative you want your clustering to be. The larger the value of
 ``min_samples`` you provide, the more conservative the clustering --
 more points will be declared as noise, and clusters will be restricted
 to progressively more dense areas. We can see this in practice by

From c9933c8cd8693bbd23618c22eadec03c2bd0d420 Mon Sep 17 00:00:00 2001
From: cearlefraym <97044512+cearlefraym@users.noreply.github.com>
Date: Wed, 22 May 2024 14:31:32 -0400
Subject: [PATCH 02/13] correct typo in variable declaration

---
 docs/how_to_use_epsilon.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/how_to_use_epsilon.rst b/docs/how_to_use_epsilon.rst
index 970b9b97..0f7e2aec 100644
--- a/docs/how_to_use_epsilon.rst
+++ b/docs/how_to_use_epsilon.rst
@@ -43,7 +43,7 @@ In our example, we choose to merge nested clusters below 5 meters (0.005 kilomet
 
     X = np.radians(coordinates) #convert the list of lat/lon coordinates to radians
     earth_radius_km = 6371
-    epsilon = 0.005 / earth_radius #calculate 5 meter epsilon threshold
+    epsilon = 0.005 / earth_radius_km #calculate 5 meter epsilon threshold
 
     clusterer = hdbscan.HDBSCAN(min_cluster_size=4, metric='haversine', cluster_selection_epsilon=epsilon, cluster_selection_method = 'eom')

From a6f1c3aa01de54ad156b7ba9787e34b5803f4504 Mon Sep 17 00:00:00 2001
From: Leland McInnes
Date: Thu, 23 May 2024 19:52:15 -0400
Subject: [PATCH 03/13] Update _hdbscan_tree.pyx for newer numpy

---
 hdbscan/_hdbscan_tree.pyx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hdbscan/_hdbscan_tree.pyx b/hdbscan/_hdbscan_tree.pyx
index 31ae7dfd..842bfa2f 100644
--- a/hdbscan/_hdbscan_tree.pyx
+++ b/hdbscan/_hdbscan_tree.pyx
@@ -71,7 +71,7 @@ cpdef np.ndarray condense_tree(np.ndarray[np.double_t, ndim=2] hierarchy,
     cdef list result_list
 
     cdef np.ndarray[np.intp_t, ndim=1] relabel
-    cdef np.ndarray[np.int_t, ndim=1] ignore
+    cdef np.ndarray[np.int8_t, ndim=1] ignore
     cdef np.ndarray[np.double_t, ndim=1] children
 
     cdef np.intp_t node

From 34be7d839e9c47a333a5accd5c56ed4e4d29e72b Mon Sep 17 00:00:00 2001
From: Leland McInnes
Date: Thu, 23 May 2024 19:53:54 -0400
Subject: [PATCH 04/13] Update setup.py

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 87f15a8d..641a2ab7 100644
--- a/setup.py
+++ b/setup.py
@@ -51,7 +51,7 @@ def requirements():
 
 configuration = {
     'name': 'hdbscan',
-    'version': '0.8.34-rc1',
+    'version': '0.8.35',
     'description': 'Clustering based on density with variable density clusters',
     'long_description': readme(),
     'classifiers': [
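The documentation snippet fixed by patch 02 is easy to sanity-check in isolation. The sketch below exercises the corrected line end to end; the coordinates are made-up points in two tight groups, while the parameters mirror the snippet itself:

    import numpy as np
    import hdbscan

    # Two made-up groups of nearby points (lat/lon in degrees), roughly 1 km apart.
    coordinates = np.array([
        [45.42150, -75.69720], [45.42151, -75.69721], [45.42152, -75.69722],
        [45.42153, -75.69723], [45.42154, -75.69724],
        [45.43000, -75.70000], [45.43001, -75.70001], [45.43002, -75.70002],
        [45.43003, -75.70003], [45.43004, -75.70004],
    ])

    X = np.radians(coordinates)  # the haversine metric expects lat/lon in radians
    earth_radius_km = 6371
    epsilon = 0.005 / earth_radius_km  # 5 meters as an angle on the unit sphere

    clusterer = hdbscan.HDBSCAN(min_cluster_size=4, metric='haversine',
                                cluster_selection_epsilon=epsilon,
                                cluster_selection_method='eom')
    clusterer.fit(X)
    print(clusterer.labels_)  # one label per point; -1 marks noise

Dividing by the Earth's radius is what converts the 5-meter threshold into the radian units the haversine metric works in, which is why the corrected variable name matters.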
From aa99a71daa070b64b45018378c9d4bdd9c28f686 Mon Sep 17 00:00:00 2001
From: Leland McInnes
Date: Fri, 24 May 2024 08:12:50 -0400
Subject: [PATCH 05/13] Update _hdbscan_tree.pyx

---
 hdbscan/_hdbscan_tree.pyx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hdbscan/_hdbscan_tree.pyx b/hdbscan/_hdbscan_tree.pyx
index 842bfa2f..d3c25191 100644
--- a/hdbscan/_hdbscan_tree.pyx
+++ b/hdbscan/_hdbscan_tree.pyx
@@ -91,7 +91,7 @@ cpdef np.ndarray condense_tree(np.ndarray[np.double_t, ndim=2] hierarchy,
     relabel = np.empty(root + 1, dtype=np.intp)
     relabel[root] = num_points
     result_list = []
-    ignore = np.zeros(len(node_list), dtype=int)
+    ignore = np.zeros(len(node_list), dtype=np.int8)
 
     for node in node_list:
         if ignore[node] or node < num_points:

From 5d75efb376326aeeb46f1d40f982569be4ebb907 Mon Sep 17 00:00:00 2001
From: Leland McInnes
Date: Fri, 24 May 2024 12:20:11 -0400
Subject: [PATCH 06/13] Update setup.py

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 641a2ab7..3da85105 100644
--- a/setup.py
+++ b/setup.py
@@ -51,7 +51,7 @@ def requirements():
 
 configuration = {
     'name': 'hdbscan',
-    'version': '0.8.35',
+    'version': '0.8.36',
     'description': 'Clustering based on density with variable density clusters',
     'long_description': readme(),
     'classifiers': [

From 142c6ac125b72c8433b7915c1816b5fa0e189122 Mon Sep 17 00:00:00 2001
From: Leland McInnes
Date: Mon, 17 Jun 2024 15:29:41 -0400
Subject: [PATCH 07/13] HDBSCAN doesn't work with numpy 2 yet.

---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index 3532921c..a69e93d3 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,5 @@
 cython>=0.27,<3
-numpy>=1.20
+numpy>=1.20,<2
 scipy>= 1.0
 scikit-learn>=0.20
 joblib>=1.0

From c5fcf4b3829d391eadd14598736a763952790a82 Mon Sep 17 00:00:00 2001
From: Leland McInnes
Date: Mon, 17 Jun 2024 15:31:07 -0400
Subject: [PATCH 08/13] Update setup.py with version bump

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 3da85105..729c9eee 100644
--- a/setup.py
+++ b/setup.py
@@ -51,7 +51,7 @@ def requirements():
 
 configuration = {
     'name': 'hdbscan',
-    'version': '0.8.36',
+    'version': '0.8.37',
     'description': 'Clustering based on density with variable density clusters',
     'long_description': readme(),
     'classifiers': [

From 29d8286c0cf6fe2def219c6170950f818931c5fc Mon Sep 17 00:00:00 2001
From: Sebastian Berg
Date: Wed, 3 Jul 2024 14:15:26 +0200
Subject: [PATCH 09/13] MAINT: Support NumPy 2 and build Python 3.12 wheels

---
 .github/workflows/pythonpublish_wheel.yml | 4 ++--
 pyproject.toml                            | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/pythonpublish_wheel.yml b/.github/workflows/pythonpublish_wheel.yml
index 20a68931..a1e1f52d 100644
--- a/.github/workflows/pythonpublish_wheel.yml
+++ b/.github/workflows/pythonpublish_wheel.yml
@@ -37,7 +37,7 @@ jobs:
   other-deploy:
     strategy:
       matrix:
-        python: ["3.8", "3.9", "3.10", "3.11"]
+        python: ["3.9", "3.10", "3.11", "3.12"]
         os: [windows-2019, macos-11]
     runs-on: ${{ matrix.os }}
     steps:
@@ -52,7 +52,7 @@ jobs:
        run: |
          python -m pip install --upgrade pip
          pip install setuptools build wheel twine
-         pip install "cython<3" oldest-supported-numpy
+         pip install cython "numpy>=2"
      - name: Build wheel
        run: |
          python -m build --no-isolation
diff --git a/pyproject.toml b/pyproject.toml
index d65564a7..9ea88166 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -2,6 +2,6 @@
 requires = [
     "setuptools",
     "wheel",
-    "cython<3",
-    "oldest-supported-numpy"
+    "cython<4",
+    "numpy<3"
 ]
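Patches 03 and 05 pair up: the Cython buffer declaration for ``ignore`` becomes ``np.int8_t``, so the array construction must use an explicit ``np.int8`` rather than the platform-dependent ``dtype=int``. The difference is visible from plain Python; a small illustrative sketch (variable names borrowed from the patch, values made up):

    import numpy as np

    node_list = list(range(12))

    # Old construction: the default integer width varies by platform and
    # NumPy version, so it cannot be relied on to match a fixed Cython
    # buffer type such as np.int8_t.
    ignore_old = np.zeros(len(node_list), dtype=int)

    # New construction: an explicit one-byte flag array, matching the
    # np.int8_t declaration introduced in patch 03.
    ignore_new = np.zeros(len(node_list), dtype=np.int8)

    print(ignore_old.dtype, ignore_old.itemsize)  # e.g. int64 8 (platform-dependent)
    print(ignore_new.dtype, ignore_new.itemsize)  # int8 1 everywhere

Since ``ignore`` only holds 0/1 flags, the one-byte type is also the cheaper choice.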
From 2e10d38b45100e11f9577c61a4f2d4d93973af96 Mon Sep 17 00:00:00 2001
From: Sebastian Berg
Date: Wed, 3 Jul 2024 14:39:28 +0200
Subject: [PATCH 10/13] Apply simple NumPy 2 fixes and silence most warnings

---
 hdbscan/_hdbscan_linkage.pyx      | 4 ++--
 hdbscan/_hdbscan_reachability.pyx | 2 +-
 hdbscan/_hdbscan_tree.pyx         | 2 +-
 hdbscan/plots.py                  | 2 +-
 hdbscan/prediction.py             | 2 +-
 hdbscan/validity.py               | 2 +-
 6 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/hdbscan/_hdbscan_linkage.pyx b/hdbscan/_hdbscan_linkage.pyx
index a35a958c..738ed6a2 100644
--- a/hdbscan/_hdbscan_linkage.pyx
+++ b/hdbscan/_hdbscan_linkage.pyx
@@ -33,7 +33,7 @@ cpdef np.ndarray[np.double_t, ndim=2] mst_linkage_core(
     result = np.zeros((distance_matrix.shape[0] - 1, 3))
     node_labels = np.arange(distance_matrix.shape[0], dtype=np.intp)
     current_node = 0
-    current_distances = np.infty * np.ones(distance_matrix.shape[0])
+    current_distances = np.inf * np.ones(distance_matrix.shape[0])
     current_labels = node_labels
     for i in range(1, node_labels.shape[0]):
         label_filter = current_labels != current_node
@@ -100,7 +100,7 @@ cpdef np.ndarray[np.double_t, ndim=2] mst_linkage_core_vector(
     result_arr = np.zeros((dim - 1, 3))
     in_tree_arr = np.zeros(dim, dtype=np.int8)
     current_node = 0
-    current_distances_arr = np.infty * np.ones(dim)
+    current_distances_arr = np.inf * np.ones(dim)
     current_sources_arr = np.ones(dim)
 
     result = (<np.double_t[:dim - 1, :3:1]> (<np.double_t *> result_arr.data))
diff --git a/hdbscan/_hdbscan_reachability.pyx b/hdbscan/_hdbscan_reachability.pyx
index 2863dc8a..3f4e3141 100644
--- a/hdbscan/_hdbscan_reachability.pyx
+++ b/hdbscan/_hdbscan_reachability.pyx
@@ -79,7 +79,7 @@ cpdef sparse_mutual_reachability(object lil_matrix, np.intp_t min_points=5,
         if min_points - 1 < len(sorted_row_data):
             core_distance[i] = sorted_row_data[min_points - 1]
         else:
-            core_distance[i] = np.infty
+            core_distance[i] = np.inf
 
     if alpha != 1.0:
         lil_matrix = lil_matrix / alpha
diff --git a/hdbscan/_hdbscan_tree.pyx b/hdbscan/_hdbscan_tree.pyx
index d3c25191..89c76299 100644
--- a/hdbscan/_hdbscan_tree.pyx
+++ b/hdbscan/_hdbscan_tree.pyx
@@ -251,7 +251,7 @@ cdef list bfs_from_cluster_tree(np.ndarray tree, np.intp_t bfs_root):
 
     while to_process.shape[0] > 0:
         result.extend(to_process.tolist())
-        to_process = tree['child'][np.in1d(tree['parent'], to_process)]
+        to_process = tree['child'][np.isin(tree['parent'], to_process)]
 
     return result
 
diff --git a/hdbscan/plots.py b/hdbscan/plots.py
index e82655b3..617721e5 100644
--- a/hdbscan/plots.py
+++ b/hdbscan/plots.py
@@ -28,7 +28,7 @@ def _bfs_from_cluster_tree(tree, bfs_root):
 
     while to_process:
         result.extend(to_process)
-        to_process = tree['child'][np.in1d(tree['parent'], to_process)].tolist()
+        to_process = tree['child'][np.isin(tree['parent'], to_process)].tolist()
 
     return result
 
diff --git a/hdbscan/prediction.py b/hdbscan/prediction.py
index 166975f9..10cd6c60 100644
--- a/hdbscan/prediction.py
+++ b/hdbscan/prediction.py
@@ -81,7 +81,7 @@ def _clusters_below(self, cluster):
         while to_process:
             result.extend(to_process)
             to_process = \
-                self.cluster_tree['child'][np.in1d(self.cluster_tree['parent'],
+                self.cluster_tree['child'][np.isin(self.cluster_tree['parent'],
                                            to_process)]
             to_process = to_process.tolist()
diff --git a/hdbscan/validity.py b/hdbscan/validity.py
index 8ddeb319..36c2edf7 100644
--- a/hdbscan/validity.py
+++ b/hdbscan/validity.py
@@ -180,7 +180,7 @@ def internal_minimum_spanning_tree(mr_distances):
     # A little "fancy" we select from the flattened array reshape back
     # (Fortran format to get indexing right) and take the product to do an and
     # then convert back to boolean type.
-    edge_selection = np.prod(np.in1d(min_span_tree.T[:2], vertices).reshape(
+    edge_selection = np.prod(np.isin(min_span_tree.T[:2], vertices).reshape(
         (min_span_tree.shape[0], 2), order='F'), axis=1).astype(bool)
 
     # Density sparseness is not well defined if there are no
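Patch 10 tracks two NumPy 2 API changes: ``np.infty`` is removed in favor of ``np.inf``, and ``np.in1d`` is deprecated in favor of ``np.isin``, which preserves the shape of its first argument. For the 1-D arrays used in these breadth-first traversals the two are interchangeable, as a toy version of the ``bfs_from_cluster_tree`` loop shows (the tree arrays below are invented for illustration):

    import numpy as np

    # Invented condensed-tree fragments: node 5 is the root, 6 an internal node.
    parent = np.array([5, 5, 6, 6])
    child = np.array([6, 2, 0, 1])

    to_process = [5]
    result = []
    while to_process:
        result.extend(to_process)
        # On 1-D input, np.isin is a drop-in replacement for np.in1d.
        to_process = child[np.isin(parent, to_process)].tolist()

    print(result)  # breadth-first order from the root: [5, 6, 2, 0, 1]

    # The infinity spelling that survives in NumPy 2:
    current_distances = np.inf * np.ones(4)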
From c3f3d85c4ae948596c1cdab78ba612ecaf87d5b7 Mon Sep 17 00:00:00 2001
From: Sebastian Berg
Date: Thu, 4 Jul 2024 22:59:07 +0200
Subject: [PATCH 11/13] Avoid cython error (but there should be a better way)

---
 hdbscan/_hdbscan_tree.pyx | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/hdbscan/_hdbscan_tree.pyx b/hdbscan/_hdbscan_tree.pyx
index 89c76299..aeb40518 100644
--- a/hdbscan/_hdbscan_tree.pyx
+++ b/hdbscan/_hdbscan_tree.pyx
@@ -725,8 +725,10 @@ cpdef tuple get_clusters(np.ndarray tree, dict stability,
     # if you do, change this accordingly!
     if allow_single_cluster:
         node_list = sorted(stability.keys(), reverse=True)
+        node_list = [int(n) for n in node_list]
     else:
         node_list = sorted(stability.keys(), reverse=True)[:-1]
+        node_list = [int(n) for n in node_list]
         # (exclude root)
 
     cluster_tree = tree[tree['child_size'] > 1]

From 9acb4955e9539a0ba6720522182e1d70807fdec9 Mon Sep 17 00:00:00 2001
From: Sebastian Berg
Date: Thu, 4 Jul 2024 23:00:54 +0200
Subject: [PATCH 12/13] `isin` returns the vertices shape, so the reshape is incorrect now

---
 hdbscan/validity.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/hdbscan/validity.py b/hdbscan/validity.py
index 36c2edf7..7b09c277 100644
--- a/hdbscan/validity.py
+++ b/hdbscan/validity.py
@@ -180,8 +180,8 @@ def internal_minimum_spanning_tree(mr_distances):
     # A little "fancy" we select from the flattened array reshape back
     # (Fortran format to get indexing right) and take the product to do an and
     # then convert back to boolean type.
-    edge_selection = np.prod(np.isin(min_span_tree.T[:2], vertices).reshape(
-        (min_span_tree.shape[0], 2), order='F'), axis=1).astype(bool)
+    edge_selection = np.prod(
+        np.isin(min_span_tree.T[:2], vertices), axis=0).astype(bool)
 
     # Density sparseness is not well defined if there are no
     # internal edges (as per the referenced paper). However

From ad235f01495a62c13e37cb77f0ec5453107a1e2c Mon Sep 17 00:00:00 2001
From: Sebastian Berg
Date: Fri, 5 Jul 2024 07:59:49 +0200
Subject: [PATCH 13/13] CI: Test with 3.12 and stop testing 3.7 and 3.8

---
 azure-pipelines.yml | 27 +++++++++------------------
 1 file changed, 9 insertions(+), 18 deletions(-)

diff --git a/azure-pipelines.yml b/azure-pipelines.yml
index 2d7b2551..6ef9b53d 100644
--- a/azure-pipelines.yml
+++ b/azure-pipelines.yml
@@ -27,50 +27,41 @@ stages:
   - job: run_platform_tests
     strategy:
       matrix:
-        mac_py37:
-          imageName: 'macOS-latest'
-          python.version: '3.7'
-        linux_py37:
-          imageName: 'ubuntu-latest'
-          python.version: '3.7'
-        windows_py37:
-          imageName: 'windows-latest'
-          python.version: '3.7'
-        mac_py38:
-          imageName: 'macOS-latest'
-          python.version: '3.8'
-        linux_py38:
-          imageName: 'ubuntu-latest'
-          python.version: '3.8'
-        windows_py38:
-          imageName: 'windows-latest'
-          python.version: '3.8'
         mac_py39:
           imageName: 'macOS-latest'
           python.version: '3.9'
         linux_py39:
           imageName: 'ubuntu-latest'
           python.version: '3.9'
         windows_py39:
           imageName: 'windows-latest'
           python.version: '3.9'
         mac_py310:
           imageName: 'macOS-latest'
           python.version: '3.10'
         linux_py310:
           imageName: 'ubuntu-latest'
           python.version: '3.10'
         windows_py310:
           imageName: 'windows-latest'
           python.version: '3.10'
         mac_py311:
           imageName: 'macOS-latest'
           python.version: '3.11'
         linux_py311:
           imageName: 'ubuntu-latest'
           python.version: '3.11'
         windows_py311:
           imageName: 'windows-latest'
           python.version: '3.11'
+        mac_py312:
+          imageName: 'macOS-latest'
+          python.version: '3.12'
+        linux_py312:
+          imageName: 'ubuntu-latest'
+          python.version: '3.12'
+        windows_py312:
+          imageName: 'windows-latest'
+          python.version: '3.12'
     pool:
       vmImage: $(imageName)
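The interplay of patches 10 and 12 is worth spelling out: ``np.in1d`` always returned a flattened result, so the original ``validity.py`` code reshaped it back into two columns, but ``np.isin`` keeps the ``(2, n_edges)`` shape of ``min_span_tree.T[:2]``, which made the old reshape wrong rather than merely redundant. A standalone check with invented edge data:

    import numpy as np

    # Invented minimum spanning tree: one edge per row as (source, target, weight).
    min_span_tree = np.array([
        [0.0, 1.0, 0.5],
        [1.0, 2.0, 0.7],
        [2.0, 3.0, 0.9],
    ])
    vertices = np.array([0, 1, 2])  # the "internal" vertices for this example

    # np.isin keeps the (2, n_edges) shape, so taking the product along
    # axis 0 ANDs the two endpoint-membership tests for each edge.
    edge_selection = np.prod(
        np.isin(min_span_tree.T[:2], vertices), axis=0).astype(bool)

    print(edge_selection)  # [ True  True False]: edge (2, 3) has an endpoint outside

This matches the final form of ``edge_selection`` in patch 12 and selects exactly the edges whose endpoints both lie in ``vertices``.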