Skip to content

Commit 8f293f5

Browse files
authored
Merge pull request #101 from CBravoR/master
Fix bug and warnings
2 parents 2f28b47 + 0d783ff commit 8f293f5

File tree

5 files changed

+37
-28
lines changed

5 files changed

+37
-28
lines changed

.vscode/settings.json

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
11
{
2-
"python.pythonPath": "/Users/shichenxie/anaconda3/bin/python"
2+
"python.pythonPath": "/Users/shichenxie/anaconda3/bin/python",
3+
"python.analysis.typeCheckingMode": "off"
34
}

NEWS.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
1+
# scorecardpy 0.1.9.4
2+
* fixed a bug in the woebin function caused by a pandas update (by @CBravoR)
3+
* suppressed warnings in woebin function caused by groupby operations (by @CBravoR)
4+
15
# scorecardpy 0.1.9.2
26
* fixed a bug in woebin function caused by the new function explode in pandas >= 0.25
37
* fixed a bug when initializing binning

scorecardpy/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
from scorecardpy.vif import vif
1313

1414

15-
__version__ = '0.1.9.3'
15+
__version__ = '0.1.9.4'
1616

1717
__all__ = (
1818
germancredit,

scorecardpy/woebin.py

Lines changed: 26 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -134,11 +134,11 @@ def dtm_binning_sv(dtm, breaks, spl_val):
134134
return {'binning_sv':None, 'dtm':dtm}
135135
# binning_sv
136136
binning_sv = pd.merge(
137-
dtm_sv.fillna('missing').groupby(['variable','value'])['y'].agg([n0, n1])\
137+
dtm_sv.fillna('missing').groupby(['variable','value'], group_keys=False)['y'].agg([n0, n1])\
138138
.reset_index().rename(columns={'n0':'good','n1':'bad'}),
139139
sv_df.fillna('missing'),
140140
on='value'
141-
).groupby(['variable', 'rowid', 'bin_chr']).agg({'bad':sum,'good':sum})\
141+
).groupby(['variable', 'rowid', 'bin_chr'], group_keys=False).agg({'bad':sum,'good':sum})\
142142
.reset_index().rename(columns={'bin_chr':'bin'})\
143143
.drop('rowid', axis=1)
144144
else:
@@ -158,7 +158,7 @@ def check_empty_bins(dtm, binning):
158158
bstbrks = sorted(list(map(float, ['-inf'] + list(binright) + ['inf'])))
159159
labels = ['[{},{})'.format(bstbrks[i], bstbrks[i+1]) for i in range(len(bstbrks)-1)]
160160
dtm.loc[:,'bin'] = pd.cut(dtm['value'], bstbrks, right=False, labels=labels)
161-
binning = dtm.groupby(['variable','bin'])['y'].agg([n0, n1])\
161+
binning = dtm.groupby(['variable','bin'], group_keys=False)['y'].agg([n0, n1])\
162162
.reset_index().rename(columns={'n0':'good','n1':'bad'})
163163
# warnings.warn("The break points are modified into '[{}]'. There are empty bins based on the provided break points.".format(','.join(binright)))
164164
# binning
@@ -202,7 +202,7 @@ def woebin2_breaks(dtm, breaks, spl_val):
202202
dtm.loc[:,'bin'] = pd.cut(dtm['value'], bstbrks, right=False, labels=labels)
203203
dtm['bin'] = dtm['bin'].astype(str)
204204

205-
binning = dtm.groupby(['variable','bin'])['y'].agg([n0, n1])\
205+
binning = dtm.groupby(['variable','bin'], group_keys=False)['y'].agg([n0, n1])\
206206
.reset_index().rename(columns={'n0':'good','n1':'bad'})
207207
# check empty bins for numeric variable
208208
binning = check_empty_bins(dtm, binning)
@@ -225,7 +225,7 @@ def woebin2_breaks(dtm, breaks, spl_val):
225225
dtm,
226226
bk_df.assign(bin=lambda x: x.bin_chr),
227227
how='left', on='value'
228-
).fillna('missing').groupby(['variable', 'rowid', 'bin'])['y'].agg([n0,n1])\
228+
).fillna('missing').groupby(['variable', 'rowid', 'bin'], group_keys=False)['y'].agg([n0,n1])\
229229
.rename(columns={'n0':'good','n1':'bad'})\
230230
.reset_index().drop('rowid', axis=1)
231231
# return
@@ -317,7 +317,7 @@ def woebin2_init_bin(dtm, init_count_distr, breaks, spl_val):
317317
labels = ['[{},{})'.format(brk[i], brk[i+1]) for i in range(len(brk)-1)]
318318
dtm.loc[:,'bin'] = pd.cut(dtm['value'], brk, right=False, labels=labels)#.astype(str)
319319
# init_bin
320-
init_bin = dtm.groupby('bin')['y'].agg([n0, n1])\
320+
init_bin = dtm.groupby('bin', group_keys=False)['y'].agg([n0, n1])\
321321
.reset_index().rename(columns={'n0':'good','n1':'bad'})
322322
# check empty bins for numeric variable
323323
init_bin = check_empty_bins(dtm, init_bin)
@@ -329,7 +329,7 @@ def woebin2_init_bin(dtm, init_count_distr, breaks, spl_val):
329329
)[['variable', 'bin', 'brkp', 'good', 'bad', 'badprob']]
330330
else: # other type variable
331331
# initial binning datatable
332-
init_bin = dtm.groupby('value')['y'].agg([n0,n1])\
332+
init_bin = dtm.groupby('value', group_keys=False)['y'].agg([n0,n1])\
333333
.rename(columns={'n0':'good','n1':'bad'})\
334334
.assign(
335335
variable = dtm['variable'].values[0],
@@ -358,8 +358,8 @@ def woebin2_init_bin(dtm, init_count_distr, breaks, spl_val):
358358
init_bin = init_bin.assign(brkp2 = lambda x: x['brkp'].shift(shift_period))\
359359
.assign(brkp = lambda x:np.where(x['brkp'] == rm_brkp['brkp'], x['brkp2'], x['brkp']))
360360
# groupby brkp
361-
init_bin = init_bin.groupby('brkp').agg({
362-
'variable':lambda x: np.unique(x),
361+
init_bin = init_bin.groupby('brkp', group_keys=False).agg({
362+
'variable':lambda x: np.unique(x)[0],
363363
'bin': lambda x: '%,%'.join(x),
364364
'good': sum,
365365
'bad': sum
@@ -410,18 +410,18 @@ def total_iv_all_breaks(initial_binning, bestbreaks, dtm_rows):
410410
total_iv_all_brks = pd.melt(
411411
init_bin_all_breaks, id_vars=["variable", "good", "bad"], var_name='bstbin',
412412
value_vars=['bstbin'+str(i) for i in breaks_set])\
413-
.groupby(['variable', 'bstbin', 'value'])\
413+
.groupby(['variable', 'bstbin', 'value'], group_keys=False)\
414414
.agg({'good':sum, 'bad':sum}).reset_index()\
415415
.assign(count=lambda x: x['good']+x['bad'])
416416

417-
total_iv_all_brks['count_distr'] = total_iv_all_brks.groupby(['variable', 'bstbin'])\
417+
total_iv_all_brks['count_distr'] = total_iv_all_brks.groupby(['variable', 'bstbin'], group_keys=False)\
418418
['count'].apply(lambda x: x/dtm_rows).reset_index(drop=True)
419-
total_iv_all_brks['min_count_distr'] = total_iv_all_brks.groupby(['variable', 'bstbin'])\
419+
total_iv_all_brks['min_count_distr'] = total_iv_all_brks.groupby(['variable', 'bstbin'], group_keys=False)\
420420
['count_distr'].transform(lambda x: min(x))
421421

422422
total_iv_all_brks = total_iv_all_brks\
423423
.assign(bstbin = lambda x: [float(re.sub('^bstbin', '', i)) for i in x['bstbin']] )\
424-
.groupby(['variable','bstbin','min_count_distr'])\
424+
.groupby(['variable','bstbin','min_count_distr'], group_keys=False)\
425425
.apply(lambda x: iv_01(x['good'], x['bad'])).reset_index(name='total_iv')
426426
# return
427427
return total_iv_all_brks
@@ -439,11 +439,11 @@ def binning_add_1bst(initial_binning, bestbreaks):
439439
bstbin = lambda x: pd.cut(x['brkp'], bestbreaks_inf, right=False, labels=labels)
440440
)
441441
if is_numeric_dtype(dtm['value']):
442-
binning_1bst_brk = binning_1bst_brk.groupby(['variable', 'bstbin'])\
442+
binning_1bst_brk = binning_1bst_brk.groupby(['variable', 'bstbin'], group_keys=False)\
443443
.agg({'good':sum, 'bad':sum}).reset_index().assign(bin=lambda x: x['bstbin'])\
444444
[['bstbin', 'variable', 'bin', 'good', 'bad']]
445445
else:
446-
binning_1bst_brk = binning_1bst_brk.groupby(['variable', 'bstbin'])\
446+
binning_1bst_brk = binning_1bst_brk.groupby(['variable', 'bstbin'], group_keys=False)\
447447
.agg({'good':sum, 'bad':sum, 'bin':lambda x:'%,%'.join(x)}).reset_index()\
448448
[['bstbin', 'variable', 'bin', 'good', 'bad']]
449449
# format
@@ -579,13 +579,13 @@ def add_chisq(initial_binning):
579579
var_name='goodbad', value_name='a')\
580580
.sort_values(by=['goodbad', 'brkp']).reset_index(drop=True)
581581
###
582-
chisq_df['a_lag'] = chisq_df.groupby('goodbad')['a'].apply(lambda x: x.shift(1))#.reset_index(drop=True)
583-
chisq_df['a_rowsum'] = chisq_df.groupby('brkp')['a'].transform(lambda x: sum(x))#.reset_index(drop=True)
584-
chisq_df['a_lag_rowsum'] = chisq_df.groupby('brkp')['a_lag'].transform(lambda x: sum(x))#.reset_index(drop=True)
582+
chisq_df['a_lag'] = chisq_df.groupby('goodbad', group_keys=False)['a'].apply(lambda x: x.shift(1))#.reset_index(drop=True)
583+
chisq_df['a_rowsum'] = chisq_df.groupby('brkp', group_keys=False)['a'].transform(lambda x: sum(x))#.reset_index(drop=True)
584+
chisq_df['a_lag_rowsum'] = chisq_df.groupby('brkp', group_keys=False)['a_lag'].transform(lambda x: sum(x))#.reset_index(drop=True)
585585
###
586586
chisq_df = pd.merge(
587587
chisq_df.assign(a_colsum = lambda df: df.a+df.a_lag),
588-
chisq_df.groupby('brkp').apply(lambda df: sum(df.a+df.a_lag)).reset_index(name='a_sum'))\
588+
chisq_df.groupby('brkp', group_keys=False).apply(lambda df: sum(df.a+df.a_lag)).reset_index(name='a_sum'))\
589589
.assign(
590590
e = lambda df: df.a_rowsum*df.a_colsum/df.a_sum,
591591
e_lag = lambda df: df.a_lag_rowsum*df.a_colsum/df.a_sum
@@ -637,7 +637,7 @@ def add_chisq(initial_binning):
637637
binning_chisq = binning_chisq.assign(brkp2 = lambda x: x['brkp'].shift(shift_period))\
638638
.assign(brkp = lambda x:np.where(x['brkp'] == rm_brkp['brkp'], x['brkp2'], x['brkp']))
639639
# groupby brkp
640-
binning_chisq = binning_chisq.groupby('brkp').agg({
640+
binning_chisq = binning_chisq.groupby('brkp', group_keys=False).agg({
641641
'variable':lambda x:np.unique(x),
642642
'bin': lambda x: '%,%'.join(x),
643643
'good': sum,
@@ -765,7 +765,7 @@ def bins_to_breaks(bins, dt, to_string=False, save_string=None):
765765
bins_breakslist = bins[~bins['breaks'].isin(["-inf","inf","missing"]) & ~bins['is_special_values']]
766766
bins_breakslist = pd.merge(bins_breakslist[['variable', 'breaks']], vars_class, how='left', on='variable')
767767
bins_breakslist.loc[bins_breakslist['not_numeric'], 'breaks'] = '\''+bins_breakslist.loc[bins_breakslist['not_numeric'], 'breaks']+'\''
768-
bins_breakslist = bins_breakslist.groupby('variable')['breaks'].agg(lambda x: ','.join(x))
768+
bins_breakslist = bins_breakslist.groupby('variable', group_keys=False)['breaks'].agg(lambda x: ','.join(x))
769769

770770
if to_string:
771771
bins_breakslist = "breaks_list={\n"+', \n'.join('\''+bins_breakslist.index[i]+'\': ['+bins_breakslist[i]+']' for i in np.arange(len(bins_breakslist)))+"}"
@@ -1301,15 +1301,15 @@ def gb_distr(binx):
13011301
binx['good_distr'] = binx['good']/sum(binx['count'])
13021302
binx['bad_distr'] = binx['bad']/sum(binx['count'])
13031303
return binx
1304-
bins = bins.groupby('variable').apply(gb_distr)
1304+
bins = bins.groupby('variable', group_keys=False).apply(gb_distr)
13051305
# x variable names
13061306
if xs is None: xs = bins['variable'].unique()
13071307
# plot export
13081308
plotlist = {}
13091309
for i in xs:
1310-
binx = bins[bins['variable'] == i].reset_index()
1310+
binx = bins[bins['variable'] == i].reset_index(drop=True)
13111311
plotlist[i] = plot_bin(binx, title, show_iv)
1312-
return plotlist
1312+
return plotlist
13131313

13141314

13151315

@@ -1436,10 +1436,10 @@ class number over total. Accepted range: 0.01-0.2; default
14361436
# adjust all variables
14371437
if not adj_all_var:
14381438
bins2 = bins.loc[~((bins['bin'] == 'missing') & (bins['count_distr'] >= count_distr_limit))].reset_index(drop=True)
1439-
bins2['badprob2'] = bins2.groupby('variable').apply(lambda x: x['badprob'].shift(1)).reset_index(drop=True)
1439+
bins2['badprob2'] = bins2.groupby('variable', group_keys=False).apply(lambda x: x['badprob'].shift(1)).reset_index(drop=True)
14401440
bins2 = bins2.dropna(subset=['badprob2']).reset_index(drop=True)
14411441
bins2 = bins2.assign(badprob_trend = lambda x: x.badprob >= x.badprob2)
1442-
xs_adj = bins2.groupby('variable')['badprob_trend'].nunique()
1442+
xs_adj = bins2.groupby('variable', group_keys=False)['badprob_trend'].nunique()
14431443
xs_adj = xs_adj[xs_adj>1].index
14441444
else:
14451445
xs_adj = xs_all

setup.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
setup(
2929
name='scorecardpy', # Required
3030
version=__version__, # Required
31+
package_dir={'scorecardpy':'scorecardpy'},
3132
description='Credit Risk Scorecard', # Required
3233
long_description=long_description, # Optional
3334
long_description_content_type='text/markdown', # Optional (see note above)
@@ -55,6 +56,9 @@
5556
'Programming Language :: Python :: 3.5',
5657
'Programming Language :: Python :: 3.6',
5758
'Programming Language :: Python :: 3.7',
59+
'Programming Language :: Python :: 3.8',
60+
'Programming Language :: Python :: 3.9',
61+
'Programming Language :: Python :: 3.10'
5862
],
5963
keywords='credit scorecard, woe binning, performace evaluation', # Optional
6064
packages=['scorecardpy'], # Required

0 commit comments

Comments
 (0)