Skip to content

Commit

Permalink
Support pandas 2.2.0 (#2657)
Browse files Browse the repository at this point in the history
* remove upper pandas limit

* Add workaround for nunique bug

* Add release note

* update test with woodwork main yml

* update python version in ww main test

* fix accidental yaml change

* Just use nunique string in primitive as fix

* fix docstring failure by allowing function to be used via param to primitive

* Add release note for ww main tests

* move testing release note
  • Loading branch information
tamargrey authored Feb 12, 2024
1 parent 51cc990 commit f8d5df7
Show file tree
Hide file tree
Showing 6 changed files with 40 additions and 7 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/tests_with_woodwork_main_branch.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ jobs:
strategy:
fail-fast: true
matrix:
python_version: ["3.8", "3.9", "3.10"]
python_version: ["3.9", "3.10", "3.11"]
libraries: ["core", "spark - misc", "spark - computational", "spark - entityset_1", "spark - entityset_2", "spark - primitives"]

steps:
Expand Down Expand Up @@ -62,7 +62,7 @@ jobs:

slack_alert_failure:
name: Send Slack alert if failure
needs: unit_tests_woodwork_main
needs: tests_woodwork_main
runs-on: ubuntu-latest
if: ${{ always() }}
steps:
Expand Down
2 changes: 2 additions & 0 deletions docs/source/release_notes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,15 @@ Future Release
* Enhancements
* Fixes
* Fix dependency issues (:pr:`2644`, :pr:`2656`)
* Add workaround for pandas 2.2.0 bug with nunique and unpin pandas (:pr:`2657`)
* Changes
* Documentation Changes
* Testing Changes
* Update tests for compatibility with new versions of ``holidays`` (:pr:`2636`)
* Update ruff to 0.1.6 and use ruff linter/formatter (:pr:`2639`)
* Update ``release.yaml`` to use trusted publisher for PyPI releases (:pr:`2646`, :pr:`2653`, :pr:`2654`)
* Update dependency checkers and tests to include Dask (:pr:`2658`)
* Fix the tests that run with Woodwork main so they can be triggered (:pr:`2657`)


Thanks to the following people for contributing to this release:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -820,7 +820,6 @@ def last_n(df):
# work)
if is_instance(base_frame, (dd, ps), "DataFrame"):
to_merge = base_frame.groupby(groupby_col).agg(to_agg)

else:
to_merge = base_frame.groupby(
base_frame[groupby_col],
Expand Down
14 changes: 11 additions & 3 deletions featuretools/primitives/standard/aggregation/num_unique.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,14 @@
class NumUnique(AggregationPrimitive):
"""Determines the number of distinct values, ignoring `NaN` values.
Args:
use_string_for_pd_calc (bool): Determines if the string 'nunique' or the function
pd.Series.nunique is used for making the primitive calculation. Put in place to
account for the bug https://github.com/pandas-dev/pandas/issues/57317.
Defaults to using the string.
Examples:
>>> num_unique = NumUnique()
>>> num_unique = NumUnique(use_string_for_pd_calc=False)
>>> num_unique(['red', 'blue', 'green', 'yellow'])
4
Expand All @@ -29,6 +35,9 @@ class NumUnique(AggregationPrimitive):
compatibility = [Library.PANDAS, Library.DASK, Library.SPARK]
description_template = "the number of unique elements in {}"

def __init__(self, use_string_for_pd_calc=True):
    """Initialize the primitive.

    Args:
        use_string_for_pd_calc (bool): When True (the default), the primitive
            aggregates via the string ``"nunique"``; when False, it uses the
            ``pd.Series.nunique`` function. The string form is the default to
            work around pandas bug
            https://github.com/pandas-dev/pandas/issues/57317 (see class
            docstring).
    """
    # Only stored here; get_function reads this flag to pick the agg spec.
    self.use_string_for_pd_calc = use_string_for_pd_calc

def get_function(self, agg_type=Library.PANDAS):
if agg_type == Library.DASK:

Expand All @@ -51,7 +60,6 @@ def finalize(s):

return dd.Aggregation(self.name, chunk=chunk, agg=agg, finalize=finalize)

elif agg_type == Library.SPARK:
if self.use_string_for_pd_calc:
return "nunique"

return pd.Series.nunique
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@
Trend,
)
from featuretools.primitives.base import AggregationPrimitive
from featuretools.primitives.standard.aggregation.num_unique import NumUnique
from featuretools.tests.testing_utils import backward_path, to_pandas
from featuretools.utils import Trie
from featuretools.utils.gen_utils import Library, import_or_none, is_instance
Expand Down Expand Up @@ -1293,3 +1294,26 @@ def error(s):
# Calculating without precalculated features should error.
with pytest.raises(RuntimeError, match=error_msg):
FeatureSetCalculator(pd_es, feature_set=FeatureSet([direct])).run(instance_ids)


def test_nunique_nested_with_agg_bug(pd_es):
    """Regression test for a pandas 2.2.0 bug where pd.Series.nunique produced
    columns with the category dtype instead of int64 dtype, causing an error
    when another aggregation was applied on top of the NumUnique result."""
    # NumUnique of log priority levels, rolled up to the session level.
    unique_priorities = AggregationFeature(
        Feature(pd_es["log"].ww["priority_level"]),
        "sessions",
        primitive=NumUnique,
    )
    # Stack a second aggregation (Mean) on top, rolled up to customers —
    # this is the step that errored under the pandas bug.
    avg_unique_priorities = AggregationFeature(
        unique_priorities,
        "customers",
        primitive=Mean,
    )

    calc = FeatureSetCalculator(
        pd_es,
        time_last=None,
        feature_set=FeatureSet([avg_unique_priorities]),
    )
    result = to_pandas(calc.run(np.array([0])), index="id")

    assert result.iloc[0, 0].round(4) == 1.6667
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ dependencies = [
"holidays >= 0.17",
"numpy >= 1.21.0",
"packaging >= 20.0",
"pandas >= 1.5.0,<2.2.0",
"pandas >= 1.5.0",
"psutil >= 5.6.6",
"scipy >= 1.10.0",
"tqdm >= 4.32.0",
Expand Down

0 comments on commit f8d5df7

Please sign in to comment.