-
- Notifications
You must be signed in to change notification settings - Fork 19.4k
ENH: Implement DataFrame.value_counts #31247
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 16 commits
0830e36 d946e93 7d9306d 2e58db4 25d7f2f aa96c98 aef75ae acb81cc 786de34 7eba59a 60554e9 4c4e858 07f0e76 d055b5c 957a8ec 4fee5e0 b8f4126 310c688 a266021 d738bf7 2618220 98e7e5b 1ab2aeb a97347f e12117e 9e75083 0d46697 81991a1 d618677 85bc213 425ef73 b03978c de043d9 2f0f46d 5544716 12898ad d743ac2 f7c3abe c297143 47683ad e60de83 3903a4d 9ee6e0e de40484 File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| | @@ -39,7 +39,7 @@ | |
| from pandas._config import get_option | ||
| | ||
| from pandas._libs import algos as libalgos, lib, properties | ||
| from pandas._typing import Axes, Axis, Dtype, FilePathOrBuffer, Level, Renamer | ||
| from pandas._typing import Axes, Axis, Dtype, FilePathOrBuffer, Label, Level, Renamer | ||
| from pandas.compat import PY37 | ||
| from pandas.compat._optional import import_optional_dependency | ||
| from pandas.compat.numpy import function as nv | ||
| | @@ -108,7 +108,7 @@ | |
| from pandas.core.indexes import base as ibase | ||
| from pandas.core.indexes.api import Index, ensure_index, ensure_index_from_sequences | ||
| from pandas.core.indexes.datetimes import DatetimeIndex | ||
| from pandas.core.indexes.multi import maybe_droplevels | ||
| from pandas.core.indexes.multi import MultiIndex, maybe_droplevels | ||
| from pandas.core.indexes.period import PeriodIndex | ||
| from pandas.core.indexing import check_bool_indexer, convert_to_index_sliceable | ||
| from pandas.core.internals import BlockManager | ||
| | @@ -5070,6 +5070,120 @@ def sort_index( | |
| else: | ||
| return self._constructor(new_data).__finalize__(self) | ||
| | ||
| def value_counts( | ||
| self, | ||
| subset: Optional[Sequence[Label]] = None, | ||
| normalize: bool = False, | ||
| sort: bool = True, | ||
| ascending: bool = False, | ||
| bins: Optional[int] = None, | ||
| dropna: bool = True, | ||
| ): | ||
| """ | ||
| Return a Series containing counts of unique rows in the DataFrame. | ||
| .. versionadded:: 1.1.0 | ||
dsaxton marked this conversation as resolved. Show resolved Hide resolved | ||
| The returned Series will have a MultiIndex with one level per input | ||
| column. | ||
| By default, rows that contain any NA values are omitted from the | ||
| result. | ||
| By default, the resulting Series will be in descending order so that the | ||
| first element is the most frequently-occurring row. | ||
| | ||
| Parameters | ||
| ---------- | ||
| subset : list-like, optional | ||
| Columns to use when counting unique combinations. | ||
| normalize : bool, default False | ||
| Return proportions rather than frequencies. | ||
| sort : bool, default True | ||
| Sort by frequencies. | ||
jreback marked this conversation as resolved. Show resolved Hide resolved | ||
| ascending : bool, default False | ||
| Sort in ascending order. | ||
| bins : int, optional | ||
| This parameter is not yet supported and must be set to None (the | ||
| default value). It exists to ensure compatibility with | ||
| `Series.value_counts`. | ||
| Rather than count values, group them into half-open bins, | ||
| a convenience for ``pd.cut``, only works with single-column numeric | ||
| data. | ||
| dropna : bool, default True | ||
| This parameter is not yet supported and must be set to True (the | ||
| default value). It exists to ensure compatibility with | ||
| `Series.value_counts`. | ||
| Don't include counts of rows containing NA values. | ||
| | ||
| Returns | ||
| ------- | ||
| Series | ||
| | ||
| See Also | ||
| -------- | ||
| Series.value_counts: Equivalent method on Series. | ||
jreback marked this conversation as resolved. Show resolved Hide resolved | ||
| | ||
| Examples | ||
| -------- | ||
| >>> df = pd.DataFrame({'num_legs': [2, 4, 4, 6], | ||
| ... 'num_wings': [2, 0, 0, 0]}, | ||
| ... index=['falcon', 'dog', 'cat', 'ant']) | ||
| >>> df | ||
| num_legs num_wings | ||
| falcon 2 2 | ||
| dog 4 0 | ||
| cat 4 0 | ||
| ant 6 0 | ||
| >>> df.value_counts() | ||
dsaxton marked this conversation as resolved. Show resolved Hide resolved | ||
| num_legs num_wings | ||
| 4 0 2 | ||
| 6 0 1 | ||
| 2 2 1 | ||
| dtype: int64 | ||
| >>> df.value_counts(sort=False) | ||
| num_legs num_wings | ||
| 2 2 1 | ||
| 4 0 2 | ||
| 6 0 1 | ||
| dtype: int64 | ||
| >>> df.value_counts(ascending=True) | ||
| num_legs num_wings | ||
| 2 2 1 | ||
| 6 0 1 | ||
| 4 0 2 | ||
| dtype: int64 | ||
| >>> df.value_counts(normalize=True) | ||
| num_legs num_wings | ||
| 4 0 0.50 | ||
| 6 0 0.25 | ||
| 2 2 0.25 | ||
| dtype: float64 | ||
| """ | ||
| if subset is None: | ||
| subset = self.columns.tolist() | ||
jreback marked this conversation as resolved. Show resolved Hide resolved | ||
| | ||
| # Some features not supported yet | ||
| ||
| if not dropna: | ||
| raise NotImplementedError( | ||
| "`dropna=False` not yet supported for DataFrames." | ||
| ) | ||
| | ||
| if bins is not None: | ||
| raise NotImplementedError( | ||
| "`bins` parameter not yet supported for DataFrames." | ||
| ) | ||
| | ||
| counts = self.groupby(subset).size() | ||
| | ||
| if sort: | ||
| counts = counts.sort_values(ascending=ascending) | ||
| if normalize: | ||
| counts /= counts.sum() | ||
| # Force MultiIndex for single column | ||
dsaxton marked this conversation as resolved. Show resolved Hide resolved | ||
| if len(subset) == 1: | ||
| counts.index = MultiIndex.from_arrays( | ||
| [counts.index], names=[counts.index.name] | ||
| ) | ||
| | ||
| return counts | ||
| | ||
| def nlargest(self, n, columns, keep="first") -> "DataFrame": | ||
| """ | ||
| Return the first `n` rows ordered by `columns` in descending order. | ||
| | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,123 @@ | ||
| import numpy as np | ||
| import pytest | ||
| | ||
| import pandas as pd | ||
| Contributor There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. move to the methods/ subdir | ||
| import pandas._testing as tm | ||
| | ||
| | ||
| def test_data_frame_value_counts_unsorted(): | ||
| df = pd.DataFrame( | ||
| {"num_legs": [2, 4, 4, 6], "num_wings": [2, 0, 0, 0]}, | ||
| index=["falcon", "dog", "cat", "ant"], | ||
| ) | ||
| | ||
| result = df.value_counts(sort=False) | ||
| expected = pd.Series( | ||
| data=[1, 2, 1], | ||
| index=pd.MultiIndex.from_arrays( | ||
| [(2, 4, 6), (2, 0, 0)], names=["num_legs", "num_wings"] | ||
| ), | ||
| ) | ||
| | ||
| tm.assert_series_equal(result, expected) | ||
| | ||
| | ||
| def test_data_frame_value_counts_ascending(): | ||
| df = pd.DataFrame( | ||
| {"num_legs": [2, 4, 4, 6], "num_wings": [2, 0, 0, 0]}, | ||
| index=["falcon", "dog", "cat", "ant"], | ||
| ) | ||
| | ||
| result = df.value_counts(ascending=True) | ||
| expected = pd.Series( | ||
| data=[1, 1, 2], | ||
| index=pd.MultiIndex.from_arrays( | ||
| [(2, 6, 4), (2, 0, 0)], names=["num_legs", "num_wings"] | ||
| ), | ||
| ) | ||
| | ||
| tm.assert_series_equal(result, expected) | ||
| | ||
| | ||
| def test_data_frame_value_counts_default(): | ||
| df = pd.DataFrame( | ||
| {"num_legs": [2, 4, 4, 6], "num_wings": [2, 0, 0, 0]}, | ||
| index=["falcon", "dog", "cat", "ant"], | ||
| ) | ||
| | ||
| result = df.value_counts() | ||
| expected = pd.Series( | ||
| data=[2, 1, 1], | ||
| index=pd.MultiIndex.from_arrays( | ||
| [(4, 6, 2), (0, 0, 2)], names=["num_legs", "num_wings"] | ||
| ), | ||
| ) | ||
| | ||
| tm.assert_series_equal(result, expected) | ||
| | ||
| | ||
| def test_data_frame_value_counts_normalize(): | ||
| df = pd.DataFrame( | ||
| {"num_legs": [2, 4, 4, 6], "num_wings": [2, 0, 0, 0]}, | ||
| index=["falcon", "dog", "cat", "ant"], | ||
| ) | ||
| | ||
| result = df.value_counts(normalize=True) | ||
| expected = pd.Series( | ||
| data=[0.5, 0.25, 0.25], | ||
| index=pd.MultiIndex.from_arrays( | ||
| [(4, 6, 2), (0, 0, 2)], names=["num_legs", "num_wings"] | ||
| ), | ||
| ) | ||
| | ||
| tm.assert_series_equal(result, expected) | ||
| | ||
| | ||
| def test_data_frame_value_counts_dropna_not_supported_yet(): | ||
| df = pd.DataFrame( | ||
| {"num_legs": [2, 4, 4, 6], "num_wings": [2, 0, 0, 0]}, | ||
| index=["falcon", "dog", "cat", "ant"], | ||
| ) | ||
| | ||
| with pytest.raises(NotImplementedError, match="not yet supported"): | ||
| df.value_counts(dropna=False) | ||
| | ||
| | ||
| def test_data_frame_value_counts_bins_not_supported(): | ||
| df = pd.DataFrame( | ||
| {"num_legs": [2, 4, 4, 6], "num_wings": [2, 0, 0, 0]}, | ||
| index=["falcon", "dog", "cat", "ant"], | ||
| ) | ||
| | ||
| with pytest.raises(NotImplementedError, match="not yet supported"): | ||
| df.value_counts(bins=2) | ||
| | ||
| | ||
| def test_data_frame_value_counts_single_col_default(): | ||
| df = pd.DataFrame({"num_legs": [2, 4, 4, 6]}) | ||
| | ||
| result = df.value_counts() | ||
| expected = pd.Series( | ||
| data=[2, 1, 1], | ||
| index=pd.MultiIndex.from_arrays([[4, 6, 2]], names=["num_legs"]), | ||
| ) | ||
| | ||
| tm.assert_series_equal(result, expected) | ||
| | ||
| | ||
| def test_data_frame_value_counts_empty(): | ||
| df_no_cols = pd.DataFrame() | ||
| | ||
| result = df_no_cols.value_counts() | ||
| expected = pd.Series([], dtype=np.int64) | ||
| | ||
| tm.assert_series_equal(result, expected) | ||
| | ||
| | ||
| def test_data_frame_value_counts_empty_normalize(): | ||
| df_no_cols = pd.DataFrame() | ||
| | ||
| result = df_no_cols.value_counts(normalize=True) | ||
| expected = pd.Series([], dtype=np.float64) | ||
| | ||
| tm.assert_series_equal(result, expected) | ||
Uh oh!
There was an error while loading. Please reload this page.