From 3838811a896f529249051f86a4c87940cb6aa4d7 Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Wed, 17 Apr 2024 20:20:53 +0000 Subject: [PATCH 1/3] fix: Use exact median implementation by default --- bigframes/core/block_transforms.py | 3 ++- bigframes/core/groupby/__init__.py | 8 +++---- bigframes/dataframe.py | 8 ++----- bigframes/series.py | 2 +- tests/system/small/test_series.py | 24 +++++++++++++++++-- .../bigframes_vendored/pandas/core/frame.py | 6 ++--- .../pandas/core/groupby/__init__.py | 7 +++--- .../bigframes_vendored/pandas/core/series.py | 8 +++---- 8 files changed, 41 insertions(+), 25 deletions(-) diff --git a/bigframes/core/block_transforms.py b/bigframes/core/block_transforms.py index 1eae73014c..6cc41f0930 100644 --- a/bigframes/core/block_transforms.py +++ b/bigframes/core/block_transforms.py @@ -111,6 +111,7 @@ def quantile( columns: Sequence[str], qs: Sequence[float], grouping_column_ids: Sequence[str] = (), + dropna: bool = False, ) -> blocks.Block: # TODO: handle windowing and more interpolation methods window = core.WindowSpec( @@ -134,7 +135,7 @@ def quantile( block, results = block.aggregate( grouping_column_ids, tuple((col, agg_ops.AnyValueOp()) for col in quantile_cols), - dropna=True, + dropna=dropna, ) return block.select_columns(results).with_column_labels(labels) diff --git a/bigframes/core/groupby/__init__.py b/bigframes/core/groupby/__init__.py index 0f53342352..05b1cc7f41 100644 --- a/bigframes/core/groupby/__init__.py +++ b/bigframes/core/groupby/__init__.py @@ -113,9 +113,7 @@ def mean(self, numeric_only: bool = False, *args) -> df.DataFrame: self._raise_on_non_numeric("mean") return self._aggregate_all(agg_ops.mean_op, numeric_only=True) - def median( - self, numeric_only: bool = False, *, exact: bool = False - ) -> df.DataFrame: + def median(self, numeric_only: bool = False, *, exact: bool = True) -> df.DataFrame: if not numeric_only: self._raise_on_non_numeric("median") if exact: @@ -138,6 +136,7 @@ def quantile( q_cols, qs=tuple(q) if multi_q else (q,), # type: ignore grouping_column_ids=self._by_col_ids, + dropna=self._dropna, ) result_df = df.DataFrame(result) if multi_q: @@ -491,7 +490,7 @@ def mean(self, *args) -> series.Series: def median( self, *args, - exact: bool = False, + exact: bool = True, **kwargs, ) -> series.Series: if exact: @@ -508,6 +507,7 @@ def quantile( (self._value_column,), qs=tuple(q) if multi_q else (q,), # type: ignore grouping_column_ids=self._by_col_ids, + dropna=self._dropna, ) if multi_q: return series.Series(result.stack()) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 953a89c34f..46573f6286 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -1999,18 +1999,14 @@ def mean( return bigframes.series.Series(block.select_column("values")) def median( - self, *, numeric_only: bool = False, exact: bool = False + self, *, numeric_only: bool = False, exact: bool = True ) -> bigframes.series.Series: - if exact: - raise NotImplementedError( - f"Only approximate median is supported. {constants.FEEDBACK_LINK}" - ) if not numeric_only: frame = self._raise_on_non_numeric("median") else: frame = self._drop_non_numeric() if exact: - return self.quantile() + return frame.quantile() else: block = frame._block.aggregate_all_and_stack(agg_ops.median_op) return bigframes.series.Series(block.select_column("values")) diff --git a/bigframes/series.py b/bigframes/series.py index b834411bce..47acfd0afb 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -966,7 +966,7 @@ def mode(self) -> Series: def mean(self) -> float: return typing.cast(float, self._apply_aggregation(agg_ops.mean_op)) - def median(self, *, exact: bool = False) -> float: + def median(self, *, exact: bool = True) -> float: if exact: return typing.cast(float, self.quantile(0.5)) else: diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index 87267696ba..5b2fa9f47c 100644 --- a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -1523,12 +1523,32 @@ def test_groupby_mean(scalars_dfs): ) -def test_groupby_median(scalars_dfs): +def test_groupby_median_exact(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs col_name = "int64_too" - bf_series = ( + bf_result = ( scalars_df[col_name].groupby(scalars_df["string_col"], dropna=False).median() ) + pd_result = ( + scalars_pandas_df[col_name] + .groupby(scalars_pandas_df["string_col"], dropna=False) + .median() + ) + + assert_series_equal( + pd_result, + bf_result.to_pandas(), + ) + + +def test_groupby_median_inexact(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + col_name = "int64_too" + bf_series = ( + scalars_df[col_name] + .groupby(scalars_df["string_col"], dropna=False) + .median(exact=False) + ) pd_max = ( scalars_pandas_df[col_name] .groupby(scalars_pandas_df["string_col"], dropna=False) diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index e894900646..b9eadef2e9 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -4481,7 +4481,7 @@ def mean(self, axis=0, *, numeric_only: bool = False): """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) - def median(self, *, numeric_only: bool = False, exact: bool = False): + def median(self, *, numeric_only: bool = False, exact: bool = True): """Return the median of the values over colunms. **Examples:** @@ -4507,8 +4507,8 @@ def median(self, *, numeric_only: bool = False, exact: bool = False): Args: numeric_only (bool. default False): Default False. Include only float, int, boolean columns. - exact (bool. default False): - Default False. Get the exact median instead of an approximate + exact (bool. default True): + Default True. Get the exact median instead of an approximate one. Returns: diff --git a/third_party/bigframes_vendored/pandas/core/groupby/__init__.py b/third_party/bigframes_vendored/pandas/core/groupby/__init__.py index 6310d7e271..f704df57fa 100644 --- a/third_party/bigframes_vendored/pandas/core/groupby/__init__.py +++ b/third_party/bigframes_vendored/pandas/core/groupby/__init__.py @@ -68,7 +68,7 @@ def median( self, numeric_only: bool = False, *, - exact: bool = False, + exact: bool = True, ): """ Compute median of groups, excluding missing values. @@ -76,9 +76,8 @@ def median( Args: numeric_only (bool, default False): Include only float, int, boolean columns. - exact (bool, default False): - Calculate the exact median instead of an approximation. Note: - ``exact=True`` is not supported. + exact (bool, default True): + Calculate the exact median instead of an approximation. Returns: pandas.Series or pandas.DataFrame: Median of groups. diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py index 5e3b4c46ef..64f76115f3 100644 --- a/third_party/bigframes_vendored/pandas/core/series.py +++ b/third_party/bigframes_vendored/pandas/core/series.py @@ -3147,13 +3147,13 @@ def mean(self): """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) - def median(self, *, exact: bool = False): + def median(self, *, exact: bool = True): """Return the median of the values over the requested axis. Args: - exact (bool. default False): - Default False. Get the exact median instead of an approximate - one. Note: ``exact=True`` not yet supported. + exact (bool. default True): + Default True. Get the exact median instead of an approximate + one. Returns: scalar: Scalar. From 79ce279df242440fa6b06f928299d329f7f813e1 Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Wed, 17 Apr 2024 22:25:41 +0000 Subject: [PATCH 2/3] fix test_numeric_literal test --- tests/system/small/test_series.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index 5b2fa9f47c..9cb615fdcb 100644 --- a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -1345,10 +1345,9 @@ def test_numeric_literal(scalars_dfs): scalars_df, _ = scalars_dfs col_name = "numeric_col" assert scalars_df[col_name].dtype == pd.ArrowDtype(pa.decimal128(38, 9)) - bf_result = scalars_df[col_name] - scalars_df[col_name].median() + bf_result = scalars_df[col_name] + 42 assert bf_result.size == scalars_df[col_name].size - # TODO(b/323387826): The precision increased by 1 unexpectedly. - # assert bf_result.dtype == pd.ArrowDtype(pa.decimal128(38, 9)) + assert bf_result.dtype == pd.ArrowDtype(pa.decimal128(38, 9)) def test_repr(scalars_dfs): From dc85f8fd31db4ce962970f2ab44ac11bab6b5cbc Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Thu, 18 Apr 2024 18:07:51 +0000 Subject: [PATCH 3/3] amend doc example --- bigframes/dataframe.py | 4 +++- third_party/bigframes_vendored/pandas/core/frame.py | 6 +++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index f7421fd5cb..ff8404761c 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -2002,7 +2002,9 @@ def median( else: frame = self._drop_non_numeric() if exact: - return frame.quantile() + result = frame.quantile() + result.name = None + return result else: block = frame._block.aggregate_all_and_stack(agg_ops.median_op) return bigframes.series.Series(block.select_column("values")) diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index 86b431dbe3..0515f690e3 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -4500,9 +4500,9 @@ def median(self, *, numeric_only: bool = False, exact: bool = True): Finding the median value of each column. >>> df.median() - A 1 - B 2 - dtype: Int64 + A 2.0 + B 3.0 + dtype: Float64 Args: numeric_only (bool. default False):