Skip to content

Commit

Permalink
Support pandas 2.2.0 (#2657)
Browse files Browse the repository at this point in the history
* remove upper pandas limit

* Add workaround for nunique bug

* Add release note

* update test with woodwork main yml

* update python version in ww main test

* fix accidental yaml change

* Just use nunique string in primitive as fix

* fix docstring failure by allowing function to be used via param to primitive

* Add release note for ww main tests

* move testing release note
  • Loading branch information
tamargrey authored Feb 12, 2024
1 parent 51cc990 commit f8d5df7
Show file tree
Hide file tree
Showing 6 changed files with 40 additions and 7 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/tests_with_woodwork_main_branch.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ jobs:
strategy:
fail-fast: true
matrix:
python_version: ["3.8", "3.9", "3.10"]
python_version: ["3.9", "3.10", "3.11"]
libraries: ["core", "spark - misc", "spark - computational", "spark - entityset_1", "spark - entityset_2", "spark - primitives"]

steps:
Expand Down Expand Up @@ -62,7 +62,7 @@ jobs:

slack_alert_failure:
name: Send Slack alert if failure
needs: unit_tests_woodwork_main
needs: tests_woodwork_main
runs-on: ubuntu-latest
if: ${{ always() }}
steps:
Expand Down
2 changes: 2 additions & 0 deletions docs/source/release_notes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,15 @@ Future Release
* Enhancements
* Fixes
* Fix dependency issues (:pr:`2644`, :pr:`2656`)
* Add workaround for pandas 2.2.0 bug with nunique and unpin pandas (:pr:`2657`)
* Changes
* Documentation Changes
* Testing Changes
* Update tests for compatibility with new versions of ``holidays`` (:pr:`2636`)
* Update ruff to 0.1.6 and use ruff linter/formatter (:pr:`2639`)
* Update ``release.yaml`` to use trusted publisher for PyPI releases (:pr:`2646`, :pr:`2653`, :pr:`2654`)
* Update dependency checkers and tests to include Dask (:pr:`2658`)
* Fix the tests that run with Woodwork main so they can be triggered (:pr:`2657`)


Thanks to the following people for contributing to this release:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -820,7 +820,6 @@ def last_n(df):
# work)
if is_instance(base_frame, (dd, ps), "DataFrame"):
to_merge = base_frame.groupby(groupby_col).agg(to_agg)

else:
to_merge = base_frame.groupby(
base_frame[groupby_col],
Expand Down
14 changes: 11 additions & 3 deletions featuretools/primitives/standard/aggregation/num_unique.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,14 @@
class NumUnique(AggregationPrimitive):
"""Determines the number of distinct values, ignoring `NaN` values.
Args:
use_string_for_pd_calc (bool): Determines if the string 'nunique' or the function
pd.Series.nunique is used for making the primitive calculation. Put in place to
account for the bug https://github.com/pandas-dev/pandas/issues/57317.
Defaults to using the string.
Examples:
>>> num_unique = NumUnique()
>>> num_unique = NumUnique(use_string_for_pd_calc=False)
>>> num_unique(['red', 'blue', 'green', 'yellow'])
4
Expand All @@ -29,6 +35,9 @@ class NumUnique(AggregationPrimitive):
compatibility = [Library.PANDAS, Library.DASK, Library.SPARK]
description_template = "the number of unique elements in {}"

def __init__(self, use_string_for_pd_calc=True):
    """Initialize the primitive.

    Args:
        use_string_for_pd_calc (bool): When True (the default), the primitive
            aggregates via the string ``"nunique"``; when False, it uses the
            ``pd.Series.nunique`` function. The string form is the default to
            work around pandas bug
            https://github.com/pandas-dev/pandas/issues/57317 (see class
            docstring).
    """
    # Only stored here; get_function reads this flag to pick the agg spec.
    self.use_string_for_pd_calc = use_string_for_pd_calc

def get_function(self, agg_type=Library.PANDAS):
if agg_type == Library.DASK:

Expand All @@ -51,7 +60,6 @@ def finalize(s):

return dd.Aggregation(self.name, chunk=chunk, agg=agg, finalize=finalize)

elif agg_type == Library.SPARK:
if self.use_string_for_pd_calc:
return "nunique"

return pd.Series.nunique
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@
Trend,
)
from featuretools.primitives.base import AggregationPrimitive
from featuretools.primitives.standard.aggregation.num_unique import NumUnique
from featuretools.tests.testing_utils import backward_path, to_pandas
from featuretools.utils import Trie
from featuretools.utils.gen_utils import Library, import_or_none, is_instance
Expand Down Expand Up @@ -1293,3 +1294,26 @@ def error(s):
# Calculating without precalculated features should error.
with pytest.raises(RuntimeError, match=error_msg):
FeatureSetCalculator(pd_es, feature_set=FeatureSet([direct])).run(instance_ids)


def test_nunique_nested_with_agg_bug(pd_es):
    """Regression test for a pandas 2.2.0 bug where pd.Series.nunique produced
    columns with the category dtype instead of int64 dtype, causing an error
    when another aggregation was applied on top of the NumUnique result."""
    # NumUnique of log priority levels, rolled up to the session level.
    unique_priorities = AggregationFeature(
        Feature(pd_es["log"].ww["priority_level"]),
        "sessions",
        primitive=NumUnique,
    )
    # Stack a second aggregation (Mean) on top, rolled up to customers —
    # this is the step that errored under the pandas bug.
    avg_unique_priorities = AggregationFeature(
        unique_priorities,
        "customers",
        primitive=Mean,
    )

    calc = FeatureSetCalculator(
        pd_es,
        time_last=None,
        feature_set=FeatureSet([avg_unique_priorities]),
    )
    result = to_pandas(calc.run(np.array([0])), index="id")

    assert result.iloc[0, 0].round(4) == 1.6667
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ dependencies = [
"holidays >= 0.17",
"numpy >= 1.21.0",
"packaging >= 20.0",
"pandas >= 1.5.0,<2.2.0",
"pandas >= 1.5.0",
"psutil >= 5.6.6",
"scipy >= 1.10.0",
"tqdm >= 4.32.0",
Expand Down

0 comments on commit f8d5df7

Please sign in to comment.