diff --git a/CHANGELOG.md b/CHANGELOG.md
index 6bd1402a..340b4ade 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,6 +5,15 @@ All notable changes to the [Nucleus Python Client](https://github.com/scaleapi/n
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [0.17.0](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.17.0) - 2024-02-06
+
+### Added
+- Added `dataset.add_items_from_dir`
+- Added pytest-xdist for test parallelization
+
+### Fixes
+- Fix test `test_models.test_remove_invalid_tag_from_model`
+
 ## [0.16.18](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.16.18) - 2024-02-06
diff --git a/nucleus/__init__.py b/nucleus/__init__.py
index 5b4beba4..033ee58e 100644
--- a/nucleus/__init__.py
+++ b/nucleus/__init__.py
@@ -1252,6 +1252,25 @@ def _set_api_key(self, api_key):
 
         return api_key
 
+    @staticmethod
+    def valid_dirname(dirname) -> str:
+        """
+        Validate that a directory exists.
+
+        Args:
+            dirname: Path of the directory
+
+        Returns:
+            Existing directory path
+        """
+        # ensures path ends with a slash
+        _dirname = os.path.join(os.path.expanduser(dirname), "")
+        if not os.path.exists(_dirname):
+            raise ValueError(
+                f"Given directory name: {dirname} does not exist. Searched in {_dirname}"
+            )
+        return _dirname
+
     def create_dataset_from_dir(
         self,
         dirname: str,
@@ -1260,7 +1279,7 @@
         privacy_mode_proxy: str = "",
         allowed_file_types: Tuple[str, ...] = ("png", "jpg", "jpeg"),
         skip_size_warning: bool = False,
-    ) -> Union[Dataset, None]:
+    ) -> Dataset:
         """
         Create a dataset by recursively crawling through a directory.
         A DatasetItem will be created for each unique image found.
@@ -1274,39 +1293,16 @@
             allowed_file_types: Which file type extensions to search for, ie: ('jpg', 'png')
             skip_size_warning: If False, it will throw an error if the script globs more than 500 images. This is a safety check in case the dirname has a typo, and grabs too much data.
         """
-
-        if use_privacy_mode:
-            assert (
-                privacy_mode_proxy
-            ), "When using privacy mode, must specify a proxy to serve the files"
-
-        # ensures path ends with a slash
-        _dirname = os.path.join(os.path.expanduser(dirname), "")
-        if not os.path.exists(_dirname):
-            raise ValueError(
-                f"Given directory name: {dirname} does not exists. Searched in {_dirname}"
-            )
-
-        folder_name = os.path.basename(_dirname.rstrip("/"))
+        existing_dirname = self.valid_dirname(dirname)
+        folder_name = os.path.basename(existing_dirname.rstrip("/"))
         dataset_name = dataset_name or folder_name
-        items = create_items_from_folder_crawl(
-            _dirname,
-            allowed_file_types,
-            use_privacy_mode,
-            privacy_mode_proxy,
-        )
-
-        if len(items) == 0:
-            print(f"Did not find any items in {dirname}")
-            return None
-
-        if len(items) > GLOB_SIZE_THRESHOLD_CHECK and not skip_size_warning:
-            raise Exception(
-                f"Found over {GLOB_SIZE_THRESHOLD_CHECK} items in {dirname}. If this is intended, set skip_size_warning=True when calling this function."
-            )
-
         dataset = self.create_dataset(
             name=dataset_name, use_privacy_mode=use_privacy_mode
         )
-        dataset.append(items, asynchronous=False)
+        dataset.add_items_from_dir(
+            existing_dirname=existing_dirname,
+            privacy_mode_proxy=privacy_mode_proxy,
+            allowed_file_types=allowed_file_types,
+            skip_size_warning=skip_size_warning,
+        )
         return dataset
diff --git a/nucleus/dataset.py b/nucleus/dataset.py
index 9ed025ca..78f3061d 100644
--- a/nucleus/dataset.py
+++ b/nucleus/dataset.py
@@ -26,6 +26,7 @@
 from nucleus.url_utils import sanitize_string_args
 from nucleus.utils import (
     convert_export_payload,
+    create_items_from_folder_crawl,
     format_dataset_item_response,
     format_prediction_response,
     format_scale_task_info_response,
@@ -50,6 +51,7 @@
     EXPORT_FOR_TRAINING_KEY,
     EXPORTED_ROWS,
     FRAME_RATE_KEY,
+    GLOB_SIZE_THRESHOLD_CHECK,
     ITEM_KEY,
     ITEMS_KEY,
     JOB_REQ_LIMIT,
@@ -2241,3 +2243,55 @@ def jobs(
         if stats_only:
             return jobs_status_overview(job_objects)
         return job_objects
+
+    def add_items_from_dir(
+        self,
+        dirname: Optional[str] = None,
+        existing_dirname: Optional[str] = None,
+        privacy_mode_proxy: str = "",
+        allowed_file_types: Tuple[str, ...] = ("png", "jpg", "jpeg"),
+        skip_size_warning: bool = False,
+        update_items: bool = False,
+    ):
+        """
+        Update the dataset by recursively crawling through a directory.
+        A DatasetItem will be created for each unique image found.
+        Existing items are skipped or updated depending on the update_items param.
+
+        Args:
+            dirname: Where to look for image files, recursively
+            existing_dirname: Already validated dirname
+            privacy_mode_proxy: Endpoint that serves image files for privacy mode; ignore if not using privacy mode.
+                The proxy should work based on the relative path of the images in the directory.
+            allowed_file_types: Which file type extensions to search for, ie: ('jpg', 'png')
+            skip_size_warning: If False, it will throw an error if the script globs more than 500 images. This is a safety check in case the dirname has a typo, and grabs too much data.
+            update_items: Whether to update items in the existing dataset
+        """
+        # fetch dataset use_privacy_mode for existence check
+        if self.use_privacy_mode:
+            assert (
+                privacy_mode_proxy
+            ), "When using privacy mode, must specify a proxy to serve the files"
+        if not existing_dirname:
+            # ensures path ends with a slash
+            existing_dirname = self._client.valid_dirname(dirname)
+        items = create_items_from_folder_crawl(
+            existing_dirname,
+            allowed_file_types,
+            self.use_privacy_mode,
+            privacy_mode_proxy,
+        )
+
+        if len(items) > 0:
+            if (
+                len(items) > GLOB_SIZE_THRESHOLD_CHECK
+                and not skip_size_warning
+            ):
+                raise Exception(
+                    f"Found over {GLOB_SIZE_THRESHOLD_CHECK} items in {dirname}. If this is intended,"
+                    f" set skip_size_warning=True when calling this function."
+                )
+            self.append(items, asynchronous=False, update=update_items)
+
+        else:
+            print(f"Did not find any items in {dirname}.")
diff --git a/pyproject.toml b/pyproject.toml
index 34fbf5e1..d8f566d5 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -25,7 +25,7 @@ ignore = ["E501", "E741", "E731", "F401"]  # Easy ignore for getting it running
 
 [tool.poetry]
 name = "scale-nucleus"
-version = "0.16.18"
+version = "0.17.0"
 description = "The official Python client library for Nucleus, the Data Platform for AI"
 license = "MIT"
 authors = ["Scale AI Nucleus Team "]
diff --git a/tests/helpers.py b/tests/helpers.py
index b9bbda1a..81c4874a 100644
--- a/tests/helpers.py
+++ b/tests/helpers.py
@@ -459,6 +459,7 @@ def reference_id_from_url(url):
 
 this_dir = os.path.dirname(os.path.realpath(__file__))
 TEST_LOCAL_MASK_URL = os.path.join(this_dir, "testdata/000000000285.png")
+TEST_LOCAL_TESTDIR = os.path.join(this_dir, "testdata/testdir")
 
 NUM_VALID_SEGMENTATIONS_IN_MAIN_DATASET = len(TEST_DATASET_ITEMS)
diff --git a/tests/test_dataset.py b/tests/test_dataset.py
index d925b929..0172a136 100644
--- a/tests/test_dataset.py
+++ b/tests/test_dataset.py
@@ -1,5 +1,7 @@
 import copy
+import glob
 import math
+import os
 
 import pytest
 
@@ -38,6 +40,7 @@
     TEST_DATASET_NAME,
     TEST_IMG_URLS,
     TEST_LIDAR_SCENES,
+    TEST_LOCAL_TESTDIR,
     TEST_MULTICATEGORY_ANNOTATIONS,
     TEST_POLYGON_ANNOTATIONS,
     TEST_SEGMENTATION_ANNOTATIONS,
@@ -611,3 +614,35 @@ def test_query(CLIENT):
     with pytest.raises(NucleusAPIError):
         for qi in dataset.query_items("annotations.count bad syntax"):
             print(qi)  # unreachable, just need to yield an item from generator
+
+
+@pytest.mark.integration
+def test_create_update_dataset_from_dir(CLIENT):
+    reference_ids = set()
+    for file_type in ["png", "jpeg"]:
+        pathname = os.path.join(TEST_LOCAL_TESTDIR, f"**/*.{file_type}")
+        reference_ids.update(
+            path.replace(TEST_LOCAL_TESTDIR + "/", "")
+            for path in glob.glob(pathname=pathname, recursive=True)
+        )
+    dataset = CLIENT.create_dataset_from_dir(
+        TEST_LOCAL_TESTDIR, allowed_file_types=tuple(["exe"])
+    )
+    assert dataset is not None
+    CLIENT.delete_dataset(dataset.id)
+    dataset = CLIENT.create_dataset_from_dir(
+        TEST_LOCAL_TESTDIR, allowed_file_types=tuple(["png"])
+    )
+    dataset_items = dataset.items
+    assert len(dataset_items) == 1
+    assert dataset_items[0].reference_id in reference_ids
+    dataset.add_items_from_dir(
+        dirname=TEST_LOCAL_TESTDIR,
+        allowed_file_types=tuple(["png", "jpeg"]),
+    )
+    dataset_items = dataset.items
+    assert len(dataset_items) == 2
+    for dataset_item in dataset_items:
+        assert dataset_item.reference_id in reference_ids
+        reference_ids.remove(dataset_item.reference_id)
+    CLIENT.delete_dataset(dataset.id)
diff --git a/tests/testdata/testdir/000000000285.png b/tests/testdata/testdir/000000000285.png
new file mode 100644
index 00000000..09ab6953
Binary files /dev/null and b/tests/testdata/testdir/000000000285.png differ
diff --git a/tests/testdata/testdir/airplane.jpeg b/tests/testdata/testdir/airplane.jpeg
new file mode 100644
index 00000000..9a7606e5
Binary files /dev/null and b/tests/testdata/testdir/airplane.jpeg differ
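
A minimal usage sketch of the workflow this change enables. The API key, directory path, and dataset name below are hypothetical placeholders; the method names and keyword arguments come from the diff above, and the `nucleus.NucleusClient(...)` entry point is the library's usual one.

```python
import nucleus

# Placeholder credentials and paths; substitute your own.
client = nucleus.NucleusClient("YOUR_SCALE_API_KEY")

# Recursively crawl a directory and create a DatasetItem per unique image.
# With this change the call always returns a Dataset (possibly empty)
# instead of Union[Dataset, None].
dataset = client.create_dataset_from_dir(
    "~/images",
    dataset_name="my-images",  # defaults to the folder name when omitted
)

# Later, pick up files added to the same directory. update_items=True
# updates items that already exist instead of skipping them; globbing
# more than 500 files raises unless skip_size_warning=True.
dataset.add_items_from_dir(
    dirname="~/images",
    allowed_file_types=("png", "jpg", "jpeg"),
    update_items=True,
)
```

`add_items_from_dir` also accepts `existing_dirname` for a path that has already been validated, which is how `create_dataset_from_dir` delegates to it internally after calling `valid_dirname`.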