pandas-dev · jreback · Feb 26, 2020 · Jan 23, 2020 · Jan 23, 2020 · Jan 23, 2020
diff --git a/doc/source/reference/frame.rst b/doc/source/reference/frame.rst
@@ -169,6 +169,7 @@ Computations / descriptive stats
  DataFrame.std
  DataFrame.var
  DataFrame.nunique
+ DataFrame.value_counts
 
 Reindexing / selection / label manipulation
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst
@@ -54,6 +54,7 @@ Other API changes
 
 - :meth:`Series.describe` will now show distribution percentiles for ``datetime`` dtypes, statistics ``first`` and ``last``
  will now be ``min`` and ``max`` to match with numeric dtypes in :meth:`DataFrame.describe` (:issue:`30164`)
+- Added :meth:`DataFrame.value_counts` (:issue:`5377`)
 -
 
 Backwards incompatible API changes

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -39,7 +39,7 @@
 from pandas._config import get_option
 
 from pandas._libs import algos as libalgos, lib, properties
-from pandas._typing import Axes, Axis, Dtype, FilePathOrBuffer, Level, Renamer
+from pandas._typing import Axes, Axis, Dtype, FilePathOrBuffer, Label, Level, Renamer
 from pandas.compat import PY37
 from pandas.compat._optional import import_optional_dependency
 from pandas.compat.numpy import function as nv
@@ -108,7 +108,7 @@
 from pandas.core.indexes import base as ibase
 from pandas.core.indexes.api import Index, ensure_index, ensure_index_from_sequences
 from pandas.core.indexes.datetimes import DatetimeIndex
-from pandas.core.indexes.multi import maybe_droplevels
+from pandas.core.indexes.multi import MultiIndex, maybe_droplevels
 from pandas.core.indexes.period import PeriodIndex
 from pandas.core.indexing import check_bool_indexer, convert_to_index_sliceable
 from pandas.core.internals import BlockManager
@@ -5070,6 +5070,120 @@ def sort_index(
  else:
  return self._constructor(new_data).__finalize__(self)
 
+ def value_counts(
+ self,
+ subset: Optional[Sequence[Label]] = None,
+ normalize: bool = False,
+ sort: bool = True,
+ ascending: bool = False,
+ bins: Optional[int] = None,
+ dropna: bool = True,
+ ):
+ """
+ Return a Series containing counts of unique rows in the DataFrame.
+ .. versionadded:: 1.1.0
+ The returned Series will have a MultiIndex with one level per input
+ column.
+ By default, rows that contain any NA values are omitted from the
+ result.
+ By default, the resulting Series will be in descending order so that the
+ first element is the most frequently-occurring row.
+
+ Parameters
+ ----------
+ subset : list-like, optional
+ Columns to use when counting unique combinations.
+ normalize : bool, default False
+ Return proportions rather than frequencies.
+ sort : bool, default True
+ Sort by frequencies.
+ ascending : bool, default False
+ Sort in ascending order.
+ bins : int, optional
+ This parameter is not yet supported and must be set to None (the
+ default value). It exists to ensure compatibility with
+ `Series.value_counts`.
+ Rather than counting values, group them into half-open bins
+ (a convenience for ``pd.cut``). This only works with single-column
+ numeric data.
+ dropna : bool, default True
+ This parameter is not yet supported and must be set to True (the
+ default value). It exists to ensure compatibility with
+ `Series.value_counts`.
+ Don't include counts of rows containing NA values.
+
+ Returns
+ -------
+ Series
+
+ See Also
+ --------
+ Series.value_counts: Equivalent method on Series.
+
+ Examples
+ --------
+ >>> df = pd.DataFrame({'num_legs': [2, 4, 4, 6],
+ ... 'num_wings': [2, 0, 0, 0]},
+ ... index=['falcon', 'dog', 'cat', 'ant'])
+ >>> df
+ num_legs num_wings
+ falcon 2 2
+ dog 4 0
+ cat 4 0
+ ant 6 0
+ >>> df.value_counts()
+ num_legs num_wings
+ 4 0 2
+ 6 0 1
+ 2 2 1
+ dtype: int64
+ >>> df.value_counts(sort=False)
+ num_legs num_wings
+ 2 2 1
+ 4 0 2
+ 6 0 1
+ dtype: int64
+ >>> df.value_counts(ascending=True)
+ num_legs num_wings
+ 2 2 1
+ 6 0 1
+ 4 0 2
+ dtype: int64
+ >>> df.value_counts(normalize=True)
+ num_legs num_wings
+ 4 0 0.50
+ 6 0 0.25
+ 2 2 0.25
+ dtype: float64
+ """
+ if subset is None:
+ subset = self.columns.tolist()
+
+ # Some features not supported yet
+ if not dropna:
+ raise NotImplementedError(
+ "`dropna=False` not yet supported for DataFrames."
+ )
+
+ if bins is not None:
+ raise NotImplementedError(
+ "`bins` parameter not yet supported for DataFrames."
+ )
+
+ counts = self.groupby(subset).size()
+
+ if sort:
+ counts = counts.sort_values(ascending=ascending)
+ if normalize:
+ counts /= counts.sum()
+ # Force MultiIndex for single column
+ if len(subset) == 1:
+ counts.index = MultiIndex.from_arrays(
+ [counts.index], names=[counts.index.name]
+ )
+
+ return counts
+
  def nlargest(self, n, columns, keep="first") -> "DataFrame":
  """
  Return the first `n` rows ordered by `columns` in descending order.

diff --git a/pandas/tests/frame/test_value_counts.py b/pandas/tests/frame/test_value_counts.py
@@ -0,0 +1,123 @@
+import numpy as np
+import pytest
+
+import pandas as pd
+import pandas._testing as tm
+
+
+def test_data_frame_value_counts_unsorted():
+ df = pd.DataFrame(
+ {"num_legs": [2, 4, 4, 6], "num_wings": [2, 0, 0, 0]},
+ index=["falcon", "dog", "cat", "ant"],
+ )
+
+ result = df.value_counts(sort=False)
+ expected = pd.Series(
+ data=[1, 2, 1],
+ index=pd.MultiIndex.from_arrays(
+ [(2, 4, 6), (2, 0, 0)], names=["num_legs", "num_wings"]
+ ),
+ )
+
+ tm.assert_series_equal(result, expected)
+
+
+def test_data_frame_value_counts_ascending():
+ df = pd.DataFrame(
+ {"num_legs": [2, 4, 4, 6], "num_wings": [2, 0, 0, 0]},
+ index=["falcon", "dog", "cat", "ant"],
+ )
+
+ result = df.value_counts(ascending=True)
+ expected = pd.Series(
+ data=[1, 1, 2],
+ index=pd.MultiIndex.from_arrays(
+ [(2, 6, 4), (2, 0, 0)], names=["num_legs", "num_wings"]
+ ),
+ )
+
+ tm.assert_series_equal(result, expected)
+
+
+def test_data_frame_value_counts_default():
+ df = pd.DataFrame(
+ {"num_legs": [2, 4, 4, 6], "num_wings": [2, 0, 0, 0]},
+ index=["falcon", "dog", "cat", "ant"],
+ )
+
+ result = df.value_counts()
+ expected = pd.Series(
+ data=[2, 1, 1],
+ index=pd.MultiIndex.from_arrays(
+ [(4, 6, 2), (0, 0, 2)], names=["num_legs", "num_wings"]
+ ),
+ )
+
+ tm.assert_series_equal(result, expected)
+
+
+def test_data_frame_value_counts_normalize():
+ df = pd.DataFrame(
+ {"num_legs": [2, 4, 4, 6], "num_wings": [2, 0, 0, 0]},
+ index=["falcon", "dog", "cat", "ant"],
+ )
+
+ result = df.value_counts(normalize=True)
+ expected = pd.Series(
+ data=[0.5, 0.25, 0.25],
+ index=pd.MultiIndex.from_arrays(
+ [(4, 6, 2), (0, 0, 2)], names=["num_legs", "num_wings"]
+ ),
+ )
+
+ tm.assert_series_equal(result, expected)
+
+
+def test_data_frame_value_counts_dropna_not_supported_yet():
+ df = pd.DataFrame(
+ {"num_legs": [2, 4, 4, 6], "num_wings": [2, 0, 0, 0]},
+ index=["falcon", "dog", "cat", "ant"],
+ )
+
+ with pytest.raises(NotImplementedError, match="not yet supported"):
+ df.value_counts(dropna=False)
+
+
+def test_data_frame_value_counts_bins_not_supported():
+ df = pd.DataFrame(
+ {"num_legs": [2, 4, 4, 6], "num_wings": [2, 0, 0, 0]},
+ index=["falcon", "dog", "cat", "ant"],
+ )
+
+ with pytest.raises(NotImplementedError, match="not yet supported"):
+ df.value_counts(bins=2)
+
+
+def test_data_frame_value_counts_single_col_default():
+ df = pd.DataFrame({"num_legs": [2, 4, 4, 6]})
+
+ result = df.value_counts()
+ expected = pd.Series(
+ data=[2, 1, 1],
+ index=pd.MultiIndex.from_arrays([[4, 6, 2]], names=["num_legs"]),
+ )
+
+ tm.assert_series_equal(result, expected)
+
+
+def test_data_frame_value_counts_empty():
+ df_no_cols = pd.DataFrame()
+
+ result = df_no_cols.value_counts()
+ expected = pd.Series([], dtype=np.int64)
+
+ tm.assert_series_equal(result, expected)
+
+
+def test_data_frame_value_counts_empty_normalize():
+ df_no_cols = pd.DataFrame()
+
+ result = df_no_cols.value_counts(normalize=True)
+ expected = pd.Series([], dtype=np.float64)
+
+ tm.assert_series_equal(result, expected)