Skip to content
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions doc/source/whatsnew/v0.25.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -412,8 +412,7 @@ Other

- Improved :class:`Timestamp` type checking in various datetime functions to prevent exceptions when using a subclassed `datetime` (:issue:`25851`)
- Bug in :class:`Series` and :class:`DataFrame` repr where ``np.datetime64('NaT')`` and ``np.timedelta64('NaT')`` with ``dtype=object`` would be represented as ``NaN`` (:issue:`25445`)
-
-
- :meth:`DataFrame.describe` now includes the count of missing values as a ``missing`` row in the summary statistics of numeric data (:issue:`21689`)


.. _whatsnew_0.250.contributors:
Expand Down
5 changes: 3 additions & 2 deletions pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -9802,9 +9802,10 @@ def describe(self, percentiles=None, include=None, exclude=None):

def describe_numeric_1d(series):
stat_index = (['count', 'mean', 'std', 'min'] +
formatted_percentiles + ['max'])
formatted_percentiles + ['max', 'missing'])
d = ([series.count(), series.mean(), series.std(), series.min()] +
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

should be the first arg

series.quantile(percentiles).tolist() + [series.max()])
series.quantile(percentiles).tolist() + [series.max(),
series.isna().sum()])
return pd.Series(d, index=stat_index, name=series.name)

def describe_categorical_1d(data):
Expand Down
65 changes: 40 additions & 25 deletions pandas/tests/frame/test_analytics.py
Original file line number Diff line number Diff line change
Expand Up @@ -505,6 +505,19 @@ def test_corrwith_kendall(self):
# ---------------------------------------------------------------------
# Describe

def test_missing_describe(self):
    # GH 21689: describe() should append a 'missing' row holding the
    # number of NaN entries in each numeric column.
    summary_rows = ['count', 'mean', 'std', 'min', '25%',
                    '50%', '75%', 'max', 'missing']
    # col1 has a single valid value (so its std is NaN) and one NaN;
    # col2 has no missing values.
    col1_stats = [1, 1, np.nan, 1, 1, 1, 1, 1, 1]
    col2_stats = [2, 3.5, 0.707107, 3, 3.25, 3.5, 3.75, 4, 0]
    expected = pd.DataFrame({'col1': col1_stats, 'col2': col2_stats},
                            index=summary_rows)

    frame = pd.DataFrame(data={'col1': [1, np.nan], 'col2': [3, 4]})
    result = frame.describe()

    tm.assert_frame_equal(result, expected)

def test_bool_describe_in_mixed_frame(self):
df = DataFrame({
'string_data': ['a', 'b', 'c', 'd', 'e'],
Expand All @@ -516,9 +529,9 @@ def test_bool_describe_in_mixed_frame(self):
# Boolean and string data are not.
result = df.describe()
expected = DataFrame({'int_data': [5, 30, df.int_data.std(),
10, 20, 30, 40, 50]},
10, 20, 30, 40, 50, 0]},
index=['count', 'mean', 'std', 'min', '25%',
'50%', '75%', 'max'])
'50%', '75%', 'max', 'missing'])
tm.assert_frame_equal(result, expected)

# Top value is a boolean value that is False
Expand Down Expand Up @@ -546,9 +559,9 @@ def test_describe_bool_frame(self):
})
result = df.describe()
expected = DataFrame({'int_data': [5, 2, df.int_data.std(), 0, 1,
2, 3, 4]},
2, 3, 4, 0]},
index=['count', 'mean', 'std', 'min', '25%',
'50%', '75%', 'max'])
'50%', '75%', 'max', 'missing'])
tm.assert_frame_equal(result, expected)

df = pd.DataFrame({
Expand Down Expand Up @@ -605,11 +618,11 @@ def test_describe_categorical_columns(self):
categories=['int1', 'int2', 'obj'],
ordered=True, name='XXX')
expected = DataFrame({'int1': [5, 30, df.int1.std(),
10, 20, 30, 40, 50],
10, 20, 30, 40, 50, 0],
'int2': [5, 30, df.int2.std(),
10, 20, 30, 40, 50]},
10, 20, 30, 40, 50, 0]},
index=['count', 'mean', 'std', 'min', '25%',
'50%', '75%', 'max'],
'50%', '75%', 'max', 'missing'],
columns=exp_columns)
tm.assert_frame_equal(result, expected)
tm.assert_categorical_equal(result.columns.values,
Expand All @@ -627,11 +640,11 @@ def test_describe_datetime_columns(self):
exp_columns = pd.DatetimeIndex(['2011-01-01', '2011-02-01'],
freq='MS', tz='US/Eastern', name='XXX')
expected = DataFrame({0: [5, 30, df.iloc[:, 0].std(),
10, 20, 30, 40, 50],
10, 20, 30, 40, 50, 0],
1: [5, 30, df.iloc[:, 1].std(),
10, 20, 30, 40, 50]},
10, 20, 30, 40, 50, 0]},
index=['count', 'mean', 'std', 'min', '25%',
'50%', '75%', 'max'])
'50%', '75%', 'max', 'missing'])
expected.columns = exp_columns
tm.assert_frame_equal(result, expected)
assert result.columns.freq == 'MS'
Expand All @@ -649,29 +662,30 @@ def test_describe_timedelta_values(self):
pd.Timedelta('2 days'),
pd.Timedelta('3 days'),
pd.Timedelta('4 days'),
pd.Timedelta('5 days')],
pd.Timedelta('5 days'), 0],
't2': [5, pd.Timedelta('3 hours'),
df.iloc[:, 1].std(),
pd.Timedelta('1 hours'),
pd.Timedelta('2 hours'),
pd.Timedelta('3 hours'),
pd.Timedelta('4 hours'),
pd.Timedelta('5 hours')]},
pd.Timedelta('5 hours'), 0]},
index=['count', 'mean', 'std', 'min', '25%',
'50%', '75%', 'max'])
'50%', '75%', 'max', 'missing'])

result = df.describe()
tm.assert_frame_equal(result, expected)

exp_repr = (" t1 t2\n"
"count 5 5\n"
"mean 3 days 00:00:00 0 days 03:00:00\n"
"std 1 days 13:56:50.394919 0 days 01:34:52.099788\n"
"min 1 days 00:00:00 0 days 01:00:00\n"
"25% 2 days 00:00:00 0 days 02:00:00\n"
"50% 3 days 00:00:00 0 days 03:00:00\n"
"75% 4 days 00:00:00 0 days 04:00:00\n"
"max 5 days 00:00:00 0 days 05:00:00")
exp_repr = (" t1 t2\n"
"count 5 5\n"
"mean 3 days 00:00:00 0 days 03:00:00\n"
"std 1 days 13:56:50.394919 0 days 01:34:52.099788\n"
"min 1 days 00:00:00 0 days 01:00:00\n"
"25% 2 days 00:00:00 0 days 02:00:00\n"
"50% 3 days 00:00:00 0 days 03:00:00\n"
"75% 4 days 00:00:00 0 days 04:00:00\n"
"max 5 days 00:00:00 0 days 05:00:00\n"
"missing 0 0")
assert repr(result) == exp_repr

def test_describe_tz_values(self, tz_naive_fixture):
Expand All @@ -684,14 +698,15 @@ def test_describe_tz_values(self, tz_naive_fixture):
df = pd.DataFrame({'s1': s1, 's2': s2})

expected = DataFrame({'s1': [5, np.nan, np.nan, np.nan, np.nan, np.nan,
2, 1.581139, 0, 1, 2, 3, 4],
2, 1.581139, 0, 1, 2, 3, 4, 0],
's2': [5, 5, s2.value_counts().index[0], 1,
start.tz_localize(tz),
end.tz_localize(tz), np.nan, np.nan,
np.nan, np.nan, np.nan, np.nan, np.nan]},
np.nan, np.nan, np.nan, np.nan, np.nan,
np.nan]},
index=['count', 'unique', 'top', 'freq', 'first',
'last', 'mean', 'std', 'min', '25%', '50%',
'75%', 'max']
'75%', 'max', 'missing']
)
result = df.describe(include='all')
tm.assert_frame_equal(result, expected)
Expand Down