From 3838811a896f529249051f86a4c87940cb6aa4d7 Mon Sep 17 00:00:00 2001
From: Trevor Bergeron <tbergeron@google.com>
Date: Wed, 17 Apr 2024 20:20:53 +0000
Subject: [PATCH 1/3] fix: Use exact median implementation by default

---
 bigframes/core/block_transforms.py            |  3 ++-
 bigframes/core/groupby/__init__.py            |  8 +++----
 bigframes/dataframe.py                        |  8 ++-----
 bigframes/series.py                           |  2 +-
 tests/system/small/test_series.py             | 24 +++++++++++++++++--
 .../bigframes_vendored/pandas/core/frame.py   |  6 ++---
 .../pandas/core/groupby/__init__.py           |  7 +++---
 .../bigframes_vendored/pandas/core/series.py  |  8 +++----
 8 files changed, 41 insertions(+), 25 deletions(-)

diff --git a/bigframes/core/block_transforms.py b/bigframes/core/block_transforms.py
index 1eae73014c..6cc41f0930 100644
--- a/bigframes/core/block_transforms.py
+++ b/bigframes/core/block_transforms.py
@@ -111,6 +111,7 @@ def quantile(
     columns: Sequence[str],
     qs: Sequence[float],
     grouping_column_ids: Sequence[str] = (),
+    dropna: bool = False,
 ) -> blocks.Block:
     # TODO: handle windowing and more interpolation methods
     window = core.WindowSpec(
@@ -134,7 +135,7 @@ def quantile(
     block, results = block.aggregate(
         grouping_column_ids,
         tuple((col, agg_ops.AnyValueOp()) for col in quantile_cols),
-        dropna=True,
+        dropna=dropna,
     )
     return block.select_columns(results).with_column_labels(labels)
 
diff --git a/bigframes/core/groupby/__init__.py b/bigframes/core/groupby/__init__.py
index 0f53342352..05b1cc7f41 100644
--- a/bigframes/core/groupby/__init__.py
+++ b/bigframes/core/groupby/__init__.py
@@ -113,9 +113,7 @@ def mean(self, numeric_only: bool = False, *args) -> df.DataFrame:
             self._raise_on_non_numeric("mean")
         return self._aggregate_all(agg_ops.mean_op, numeric_only=True)
 
-    def median(
-        self, numeric_only: bool = False, *, exact: bool = False
-    ) -> df.DataFrame:
+    def median(self, numeric_only: bool = False, *, exact: bool = True) -> df.DataFrame:
         if not numeric_only:
             self._raise_on_non_numeric("median")
         if exact:
@@ -138,6 +136,7 @@ def quantile(
             q_cols,
             qs=tuple(q) if multi_q else (q,),  # type: ignore
             grouping_column_ids=self._by_col_ids,
+            dropna=self._dropna,
         )
         result_df = df.DataFrame(result)
         if multi_q:
@@ -491,7 +490,7 @@ def mean(self, *args) -> series.Series:
     def median(
         self,
         *args,
-        exact: bool = False,
+        exact: bool = True,
         **kwargs,
     ) -> series.Series:
         if exact:
@@ -508,6 +507,7 @@ def quantile(
             (self._value_column,),
             qs=tuple(q) if multi_q else (q,),  # type: ignore
             grouping_column_ids=self._by_col_ids,
+            dropna=self._dropna,
         )
         if multi_q:
             return series.Series(result.stack())
diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py
index 953a89c34f..46573f6286 100644
--- a/bigframes/dataframe.py
+++ b/bigframes/dataframe.py
@@ -1999,18 +1999,14 @@ def mean(
         return bigframes.series.Series(block.select_column("values"))
 
     def median(
-        self, *, numeric_only: bool = False, exact: bool = False
+        self, *, numeric_only: bool = False, exact: bool = True
     ) -> bigframes.series.Series:
-        if exact:
-            raise NotImplementedError(
-                f"Only approximate median is supported. {constants.FEEDBACK_LINK}"
-            )
         if not numeric_only:
             frame = self._raise_on_non_numeric("median")
         else:
             frame = self._drop_non_numeric()
         if exact:
-            return self.quantile()
+            return frame.quantile()
         else:
             block = frame._block.aggregate_all_and_stack(agg_ops.median_op)
             return bigframes.series.Series(block.select_column("values"))
diff --git a/bigframes/series.py b/bigframes/series.py
index b834411bce..47acfd0afb 100644
--- a/bigframes/series.py
+++ b/bigframes/series.py
@@ -966,7 +966,7 @@ def mode(self) -> Series:
     def mean(self) -> float:
         return typing.cast(float, self._apply_aggregation(agg_ops.mean_op))
 
-    def median(self, *, exact: bool = False) -> float:
+    def median(self, *, exact: bool = True) -> float:
         if exact:
             return typing.cast(float, self.quantile(0.5))
         else:
diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py
index 87267696ba..5b2fa9f47c 100644
--- a/tests/system/small/test_series.py
+++ b/tests/system/small/test_series.py
@@ -1523,12 +1523,32 @@ def test_groupby_mean(scalars_dfs):
     )
 
 
-def test_groupby_median(scalars_dfs):
+def test_groupby_median_exact(scalars_dfs):
     scalars_df, scalars_pandas_df = scalars_dfs
     col_name = "int64_too"
-    bf_series = (
+    bf_result = (
         scalars_df[col_name].groupby(scalars_df["string_col"], dropna=False).median()
     )
+    pd_result = (
+        scalars_pandas_df[col_name]
+        .groupby(scalars_pandas_df["string_col"], dropna=False)
+        .median()
+    )
+
+    assert_series_equal(
+        pd_result,
+        bf_result.to_pandas(),
+    )
+
+
+def test_groupby_median_inexact(scalars_dfs):
+    scalars_df, scalars_pandas_df = scalars_dfs
+    col_name = "int64_too"
+    bf_series = (
+        scalars_df[col_name]
+        .groupby(scalars_df["string_col"], dropna=False)
+        .median(exact=False)
+    )
     pd_max = (
         scalars_pandas_df[col_name]
         .groupby(scalars_pandas_df["string_col"], dropna=False)
diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py
index e894900646..b9eadef2e9 100644
--- a/third_party/bigframes_vendored/pandas/core/frame.py
+++ b/third_party/bigframes_vendored/pandas/core/frame.py
@@ -4481,7 +4481,7 @@ def mean(self, axis=0, *, numeric_only: bool = False):
         """
         raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
 
-    def median(self, *, numeric_only: bool = False, exact: bool = False):
+    def median(self, *, numeric_only: bool = False, exact: bool = True):
         """Return the median of the values over colunms.
 
         **Examples:**
@@ -4507,8 +4507,8 @@ def median(self, *, numeric_only: bool = False, exact: bool = False):
         Args:
             numeric_only (bool. default False):
                 Default False. Include only float, int, boolean columns.
-            exact (bool. default False):
-                Default False. Get the exact median instead of an approximate
+            exact (bool. default True):
+                Default True. Get the exact median instead of an approximate
                 one.
 
         Returns:
diff --git a/third_party/bigframes_vendored/pandas/core/groupby/__init__.py b/third_party/bigframes_vendored/pandas/core/groupby/__init__.py
index 6310d7e271..f704df57fa 100644
--- a/third_party/bigframes_vendored/pandas/core/groupby/__init__.py
+++ b/third_party/bigframes_vendored/pandas/core/groupby/__init__.py
@@ -68,7 +68,7 @@ def median(
         self,
         numeric_only: bool = False,
         *,
-        exact: bool = False,
+        exact: bool = True,
     ):
         """
         Compute median of groups, excluding missing values.
@@ -76,9 +76,8 @@ def median(
         Args:
             numeric_only (bool, default False):
                 Include only float, int, boolean columns.
-            exact (bool, default False):
-                Calculate the exact median instead of an approximation. Note:
-                    ``exact=True`` is not supported.
+            exact (bool, default True):
+                Calculate the exact median instead of an approximation.
 
         Returns:
             pandas.Series or pandas.DataFrame: Median of groups.
diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py
index 5e3b4c46ef..64f76115f3 100644
--- a/third_party/bigframes_vendored/pandas/core/series.py
+++ b/third_party/bigframes_vendored/pandas/core/series.py
@@ -3147,13 +3147,13 @@ def mean(self):
         """
         raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
 
-    def median(self, *, exact: bool = False):
+    def median(self, *, exact: bool = True):
         """Return the median of the values over the requested axis.
 
         Args:
-            exact (bool. default False):
-                Default False. Get the exact median instead of an approximate
-                one. Note: ``exact=True`` not yet supported.
+            exact (bool. default True):
+                Default True. Get the exact median instead of an approximate
+                one.
 
         Returns:
             scalar: Scalar.

From 79ce279df242440fa6b06f928299d329f7f813e1 Mon Sep 17 00:00:00 2001
From: Trevor Bergeron <tbergeron@google.com>
Date: Wed, 17 Apr 2024 22:25:41 +0000
Subject: [PATCH 2/3] fix test_numeric_literal test

---
 tests/system/small/test_series.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py
index 5b2fa9f47c..9cb615fdcb 100644
--- a/tests/system/small/test_series.py
+++ b/tests/system/small/test_series.py
@@ -1345,10 +1345,9 @@ def test_numeric_literal(scalars_dfs):
     scalars_df, _ = scalars_dfs
     col_name = "numeric_col"
     assert scalars_df[col_name].dtype == pd.ArrowDtype(pa.decimal128(38, 9))
-    bf_result = scalars_df[col_name] - scalars_df[col_name].median()
+    bf_result = scalars_df[col_name] + 42
     assert bf_result.size == scalars_df[col_name].size
-    # TODO(b/323387826): The precision increased by 1 unexpectedly.
-    # assert bf_result.dtype == pd.ArrowDtype(pa.decimal128(38, 9))
+    assert bf_result.dtype == pd.ArrowDtype(pa.decimal128(38, 9))
 
 
 def test_repr(scalars_dfs):

From dc85f8fd31db4ce962970f2ab44ac11bab6b5cbc Mon Sep 17 00:00:00 2001
From: Trevor Bergeron <tbergeron@google.com>
Date: Thu, 18 Apr 2024 18:07:51 +0000
Subject: [PATCH 3/3] amend doc example

---
 bigframes/dataframe.py                              | 4 +++-
 third_party/bigframes_vendored/pandas/core/frame.py | 6 +++---
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py
index f7421fd5cb..ff8404761c 100644
--- a/bigframes/dataframe.py
+++ b/bigframes/dataframe.py
@@ -2002,7 +2002,9 @@ def median(
         else:
             frame = self._drop_non_numeric()
         if exact:
-            return frame.quantile()
+            result = frame.quantile()
+            result.name = None
+            return result
         else:
             block = frame._block.aggregate_all_and_stack(agg_ops.median_op)
             return bigframes.series.Series(block.select_column("values"))
diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py
index 86b431dbe3..0515f690e3 100644
--- a/third_party/bigframes_vendored/pandas/core/frame.py
+++ b/third_party/bigframes_vendored/pandas/core/frame.py
@@ -4500,9 +4500,9 @@ def median(self, *, numeric_only: bool = False, exact: bool = True):
         Finding the median value of each column.
 
             >>> df.median()
-            A    1
-            B    2
-            dtype: Int64
+            A    2.0
+            B    3.0
+            dtype: Float64
 
         Args:
             numeric_only (bool. default False):