Git Product home page Git Product logo

rbin's Introduction

rbin

Tools for binning data

CRAN_Status_Badge cran checks R-CMD-check Coverage status status Lifecycle: stable

Installation

# Install rbin from CRAN
install.packages("rbin")

# Or the development version from GitHub
# install.packages("devtools")
devtools::install_github("rsquaredacademy/rbin")

Addins

rbin includes two addins for manually binning data:

  • rbinAddin()
  • rbinFactorAddin()

Usage

Manual Binning

bins <- rbin_manual(mbank, y, age, c(29, 31, 34, 36, 39, 42, 46, 51, 56))
bins 
#> Binning Summary
#> ---------------------------
#> Method               Manual 
#> Response             y 
#> Predictor            age 
#> Bins                 10 
#> Count                4521 
#> Goods                517 
#> Bads                 4004 
#> Entropy              0.5 
#> Information Value    0.12 
#> 
#> 
#>    cut_point bin_count good bad          woe           iv   entropy
#> 1       < 29       410   71 339 -0.483686036 2.547353e-02 0.6649069
#> 2       < 31       313   41 272 -0.154776266 1.760055e-03 0.5601482
#> 3       < 34       567   55 512  0.183985174 3.953685e-03 0.4594187
#> 4       < 36       396   45 351  0.007117468 4.425063e-06 0.5107878
#> 5       < 39       519   47 472  0.259825118 7.008270e-03 0.4383322
#> 6       < 42       431   33 398  0.442938178 1.575567e-02 0.3899626
#> 7       < 46       449   47 402  0.099298221 9.423907e-04 0.4836486
#> 8       < 51       521   40 481  0.439981550 1.881380e-02 0.3907140
#> 9       < 56       445   49 396  0.042587647 1.756117e-04 0.5002548
#> 10     >= 56       470   89 381 -0.592843261 4.564428e-02 0.7001343

# plot
plot(bins)

Combine Factor Levels

# combine levels
upper <- c("secondary", "tertiary")
out <- rbin_factor_combine(mbank, education, upper, "upper")
table(out$education)
#> 
#>   upper unknown primary 
#>    3651     179     691

# bins
bins <- rbin_factor(out, y, education)
bins 
#> Binning Summary
#> ---------------------------
#> Method               Custom 
#> Response             y 
#> Predictor            education 
#> Levels               3 
#> Count                4521 
#> Goods                517 
#> Bads                 4004 
#> Entropy              0.51 
#> Information Value    0.01 
#> 
#> 
#>     level bin_count good  bad         woe           iv   entropy
#> 1   upper      3651  426 3225 -0.02275738 0.0004219212 0.5197428
#> 2 primary       691   66  625  0.20109064 0.0057178780 0.4546110
#> 3 unknown       179   25  154 -0.22892949 0.0022651110 0.5833603

# plot
plot(bins)

Quantile Binning

bins <- rbin_quantiles(mbank, y, age, 10)
bins 
#> Binning Summary
#> -----------------------------
#> Method               Quantile 
#> Response             y 
#> Predictor            age 
#> Bins                 10 
#> Count                4521 
#> Goods                517 
#> Bads                 4004 
#> Entropy              0.5 
#> Information Value    0.12 
#> 
#> 
#>    cut_point bin_count good bad          woe           iv   entropy
#> 1       < 29       410   71 339 -0.483686036 2.547353e-02 0.6649069
#> 2       < 31       313   41 272 -0.154776266 1.760055e-03 0.5601482
#> 3       < 34       567   55 512  0.183985174 3.953685e-03 0.4594187
#> 4       < 36       396   45 351  0.007117468 4.425063e-06 0.5107878
#> 5       < 39       519   47 472  0.259825118 7.008270e-03 0.4383322
#> 6       < 42       431   33 398  0.442938178 1.575567e-02 0.3899626
#> 7       < 46       449   47 402  0.099298221 9.423907e-04 0.4836486
#> 8       < 51       521   40 481  0.439981550 1.881380e-02 0.3907140
#> 9       < 56       445   49 396  0.042587647 1.756117e-04 0.5002548
#> 10     >= 56       470   89 381 -0.592843261 4.564428e-02 0.7001343

# plot
plot(bins)

Winsorized Binning

bins <- rbin_winsorize(mbank, y, age, 10, winsor_rate = 0.05)
bins 
#> Binning Summary
#> ------------------------------
#> Method               Winsorize 
#> Response             y 
#> Predictor            age 
#> Bins                 10 
#> Count                4521 
#> Goods                517 
#> Bads                 4004 
#> Entropy              0.51 
#> Information Value    0.1 
#> 
#> 
#>    cut_point bin_count good bad        woe           iv   entropy
#> 1     < 30.2       723  112 611 -0.3504082 0.0224390979 0.6219926
#> 2     < 33.4       567   55 512  0.1839852 0.0039536848 0.4594187
#> 3     < 36.6       573   58 515  0.1367176 0.0022470488 0.4728562
#> 4     < 39.8       497   44 453  0.2846962 0.0079801719 0.4315480
#> 5       < 43       396   37 359  0.2253982 0.0040782670 0.4478305
#> 6     < 46.2       461   43 418  0.2272751 0.0048235624 0.4473095
#> 7     < 49.4       281   22 259  0.4187793 0.0092684760 0.3961315
#> 8     < 52.6       309   32 277  0.1112753 0.0008106706 0.4801796
#> 9     < 55.8       244   25 219  0.1231896 0.0007809490 0.4767424
#> 10   >= 55.8       470   89 381 -0.5928433 0.0456442813 0.7001343

# plot
plot(bins)

Equal Length Binning

bins <- rbin_equal_length(mbank, y, age, 10)
bins 
#> Binning Summary
#> ---------------------------------
#> Method               Equal Length 
#> Response             y 
#> Predictor            age 
#> Bins                 10 
#> Count                4521 
#> Goods                517 
#> Bads                 4004 
#> Entropy              0.5 
#> Information Value    0.17 
#> 
#> 
#>    cut_point bin_count good  bad         woe           iv   entropy
#> 1     < 24.6        85   24   61 -1.11418623 0.0347480126 0.8586371
#> 2     < 31.2       822  106  716 -0.13676519 0.0035843196 0.5545619
#> 3     < 37.8      1133  115 1018  0.13365680 0.0042514380 0.4737339
#> 4     < 44.4       943   82  861  0.30436899 0.0171748162 0.4262287
#> 5       < 51       623   52  571  0.34913923 0.0146733167 0.4142794
#> 6     < 57.6       612   66  546  0.06595797 0.0005741022 0.4933757
#> 7     < 64.2       229   43  186 -0.58245971 0.0213871054 0.6967893
#> 8     < 70.8        34   12   22 -1.44087046 0.0255269312 0.9366674
#> 9     < 77.4        25   13   12 -2.12704897 0.0471100183 0.9988455
#> 10   >= 77.4        15    4   11 -1.03540535 0.0051663529 0.8366407

# plot
plot(bins)

Alternatives

Getting Help

If you encounter a bug, please file a minimal reproducible example using reprex on GitHub. For questions and clarifications, use Stack Overflow.

Code of Conduct

Please note that this project is released with a Contributor Code of Conduct. By participating in this project you agree to abide by its terms.

rbin's People

Contributors

aravindhebbali avatar

Stargazers

 avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar

Watchers

 avatar  avatar  avatar

rbin's Issues

Visualization

rb_bin_visualize() should create visualization for binning.

0.1.0 Checklist

Prepare for release:

  • Check that description is informative
  • Check licensing of included files
  • devtools::check_win_devel()
  • rhub::check_for_cran()

Perform release:

  • Bump version (in DESCRIPTION and NEWS)
  • devtools::check_win_devel() (again!)
  • devtools::submit_cran()
  • pkgdown::build_site()
  • Approve email

Wait for CRAN...

  • Tag release
  • Bump dev version
  • Write blog post
  • Tweet
  • Add link to blog post in pkgdown news menu

Template from r-lib/usethis#338

Manual split

rb_bin_manual() should allow users to specify manual binning.

Flexible intervals

Users should be able to choose between the following intervals:

  • left closed and right open
  • left open and right closed

Force increasing trend

rbin_trend_increasing() will force the variable to follow a monotonically increasing trend.

Variable binning

Bin continuous variables based on weight of evidence and information value. Users should be able to bin
the variables in the following ways:

  • API
  • RStudio Addin

Force decreasing trend

rbin_trend_decreasing() will force the variable to follow a monotonically decreasing trend.

Shiny App

The shiny app for rbin should do the following:

  • read data
  • allow the user to transform data
    • rename columns
    • modify data type
    • select columns
    • filter rows
  • bin multiple variables
  • create dummy variables
  • download data
  • apply binning criteria to test/validation data

Visual binning

Explore the features of SPSS visual binning and incorporate them in the RStudio Addin or shiny app.

Select data from RStudio

Users should be able to select data from RStudio instead of uploading:

  • rbinAddin()
  • rbinFactorAddin()

Incorrect package alias

From CRAN:

Dear maintainer,

You have file 'rbin/man/rbin.Rd' with \docType{package}, likely
intended as a package overview help file, but without the appropriate
PKGNAME-package \alias as per "Documenting packages" in R-exts.

This seems to be the consequence of the breaking change

  Using @docType package no longer automatically adds a -package alias.
  Instead document _PACKAGE to get all the defaults for package
  documentation.

in roxygen2 7.0.0 (2019-11-12) having gone unnoticed, see
<https://github.com/r-lib/roxygen2/issues/1491>.

As explained in the issue, to get the desired PKGNAME-package \alias
back, you should either change to the new approach and document the new
special sentinel

  "_PACKAGE"

or manually add

  @aliases rbin-package

if remaining with the old approach.

Please fix in your master sources as appropriate, and submit a fixed
version of your package within the next few months.

Best,
-k

Error: Argument 2 must be length 3, not 2

When I use rbin_quantiles(data, response, predictors[i], bins=3) on my dataset, there is an error: "Error: Argument 2 must be length 3, not 2".
rbin_quantiles(data, response, predictors[i], bins=2) works just fine

summary(Data$CONVERSION) #response
0 1
248996 24912

summary(Data$COUNT_VISITS_6M) #predictors[i]
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.00 0.00 2.00 10.64 11.00 707.00

I hope this helps!

Forthcoming release of ggplot2 and rbin

We are contacting you because you are the maintainer of rbin, which imports ggplot2 and uses vdiffr to manage visual test cases. The upcoming release of ggplot2 includes several improvements to plot rendering, including the ability to specify lineend and linejoin in geom_rect() and geom_tile(), and improved rendering of text. These improvements will result in subtle changes to your vdiffr doppelgangers when the new version is released.

Because vdiffr test cases do not run on CRAN by default, your CRAN checks will still pass. However, we suggest updating your visual test cases with the new version of ggplot2 as soon as possible to avoid confusion. You can install the development version of ggplot2 using remotes::install_github("tidyverse/ggplot2").

If you have any questions, let me know!

Error in equal frequency binning

There are two errors in the way equal frequency binning is computed:

  • bin count is used in place of bin number (453 instead of 10 in the example below)
  • mismatch between the bin number and the metrics (see the bin column in the last table)
library(rbin)

# equal frequency binning
bins <- rbin_equal_freq(mbank, y, age, 10)
bins
#> Binning Summary
#> ------------------------------------
#> Method               Equal Frequency 
#> Response             y 
#> Predictor            age 
#> Bins                 10 
#> Count                4521 
#> Goods                517 
#> Bads                 4004 
#> Entropy              0.51 
#> Information Value    0.01 
#> 
#> 
#>    lower_cut upper_cut bin_count good bad  good_rate         woe           iv
#> 1         18        29       452   55 397 0.12168142 -0.07040317 5.091649e-04
#> 2         29        31       452   57 395 0.12610619 -0.11117177 1.289604e-03
#> 3         31        34       452   46 406 0.10176991  0.13070550 1.623852e-03
#> 4         34        36       452   44 408 0.09734513  0.18007127 3.023706e-03
#> 5         36        39       452   58 394 0.12831858 -0.13109837 1.807071e-03
#> 6         39        42       452   51 401 0.11283186  0.01512953 2.275202e-05
#> 7         42        46       452   45 407 0.09955752  0.15514443 2.266308e-03
#> 8         46        51       452   60 392 0.13274336 -0.17008899 3.087466e-03
#> 9         51        56       452   53 399 0.11725664 -0.02833676 8.116094e-05
#> 10        56        84       453   48 405 0.10596026  0.08567979 7.116156e-04
#>      entropy
#> 1  0.5341748
#> 2  0.5466619
#> 3  0.4745811
#> 4  0.4605229
#> 5  0.5528088
#> 6  0.5083990
#> 7  0.4675914
#> 8  0.5649142
#> 9  0.5214234
#> 10 0.4876093

# plot
plot(bins)

# bins 
bins$bins
#>    lower_cut upper_cut bin bin_count good bad bin_cum_count good_cum_count
#> 1         18        29   2       452   55 397           452             55
#> 2         29        31   7       452   57 395           904            112
#> 3         31        34   4       452   46 406          1356            158
#> 4         34        36   6       452   44 408          1808            202
#> 5         36        39   9       452   58 394          2260            260
#> 6         39        42   1       452   51 401          2712            311
#> 7         42        46   5       452   45 407          3164            356
#> 8         46        51   3       452   60 392          3616            416
#> 9         51        56   8       452   53 399          4068            469
#> 10        56        84 453       453   48 405          4521            517
#>    bad_cum_count   bin_prop  good_rate  bad_rate  good_dist   bad_dist
#> 1            397 0.09997788 0.12168142 0.8783186 0.10638298 0.09915085
#> 2            792 0.09997788 0.12610619 0.8738938 0.11025145 0.09865135
#> 3           1198 0.09997788 0.10176991 0.8982301 0.08897485 0.10139860
#> 4           1606 0.09997788 0.09734513 0.9026549 0.08510638 0.10189810
#> 5           2000 0.09997788 0.12831858 0.8716814 0.11218569 0.09840160
#> 6           2401 0.09997788 0.11283186 0.8871681 0.09864603 0.10014985
#> 7           2808 0.09997788 0.09955752 0.9004425 0.08704062 0.10164835
#> 8           3200 0.09997788 0.13274336 0.8672566 0.11605416 0.09790210
#> 9           3599 0.09997788 0.11725664 0.8827434 0.10251451 0.09965035
#> 10          4004 0.10019907 0.10596026 0.8940397 0.09284333 0.10114885
#>            woe    dist_diff           iv   entropy prop_entropy
#> 1  -0.07040317 -0.007232130 5.091649e-04 0.5341748   0.05340567
#> 2  -0.11117177 -0.011600102 1.289604e-03 0.5466619   0.05465410
#> 3   0.13070550  0.012423746 1.623852e-03 0.4745811   0.04744761
#> 4   0.18007127  0.016791719 3.023706e-03 0.4605229   0.04604211
#> 5  -0.13109837 -0.013784088 1.807071e-03 0.5528088   0.05526865
#> 6   0.01512953  0.001503815 2.275202e-05 0.5083990   0.05082866
#> 7   0.15514443  0.014607733 2.266308e-03 0.4675914   0.04674880
#> 8  -0.17008899 -0.018152061 3.087466e-03 0.5649142   0.05647892
#> 9  -0.02833676 -0.002864157 8.116094e-05 0.5214234   0.05213081
#> 10  0.08567979  0.008305524 7.116156e-04 0.4876093   0.04885800

Created on 2023-06-02 by the reprex package (v0.3.0)

Equal length

rbin_equal_length should create bins of equal length.

Create bins

rb_create_bins() should create binned variables in a data set.

Winsorized

rbin_winsorized() should create bins using winsorized binning.

0.1.2 Release Checklist

Prepare for release:

  • Check that description is informative
  • Check licensing of included files
  • devtools::check_win_devel()
  • rhub::check_for_cran()

Perform release:

  • Bump version (in DESCRIPTION and NEWS)
  • devtools::check_win_devel() (again!)
  • devtools::submit_cran()
  • pkgdown::build_site()
  • Approve email

Wait for CRAN...

  • Tag release
  • Bump dev version
  • Write blog post
  • Tweet
  • Add link to blog post in pkgdown news menu

Template from r-lib/usethis#338

Do not write files

CRAN feedback: In rbinAddin() and rbinFactorAddin(), remove options to download data and plots.

Use \donttest

CRAN feedback: Use \donttest instead of \dontrun in case of examples with run time > 5s.

Bin multiple variables

Users should be able to bin multiple variables without having to launch rbinAddin() or rbinFactorAddin() multiple times.

Return entropy

All binning functions should return the bin-wise and total entropy.

0.2.0 Checklist

Prepare for release:

  • devtools::check_win_devel()
  • rhub::check_for_cran()
  • Polish NEWS

Perform release:

  • Bump version (in DESCRIPTION and NEWS)
  • devtools::check_win_devel() (again!)
  • devtools::submit_cran()
  • pkgdown::build_site()
  • Approve email

Wait for CRAN...

  • Tag release
  • Bump dev version

Template from r-lib/usethis#338

Build

  • Travis
  • Appveyor
  • Code coverage

Display IV

All print methods should display the information value below the table.

Quantiles

rbin_quantiles() should create bins using quantiles.

Recommend Projects

  • React photo React

    A declarative, efficient, and flexible JavaScript library for building user interfaces.

  • Vue.js photo Vue.js

    🖖 Vue.js is a progressive, incrementally-adoptable JavaScript framework for building UI on the web.

  • Typescript photo Typescript

    TypeScript is a superset of JavaScript that compiles to clean JavaScript output.

  • TensorFlow photo TensorFlow

    An Open Source Machine Learning Framework for Everyone

  • Django photo Django

    The Web framework for perfectionists with deadlines.

  • D3 photo D3

    Bring data to life with SVG, Canvas and HTML. 📊📈🎉

Recommend Topics

  • javascript

    JavaScript (JS) is a lightweight interpreted programming language with first-class functions.

  • web

    Something interesting about the web. A new door to the world.

  • server

    A server is a program made to process requests and deliver data to clients.

  • Machine learning

    Machine learning is a way of modeling and interpreting data that allows a piece of software to respond intelligently.

  • Game

    Something interesting about games, making everyone happy.

Recommend Org

  • Facebook photo Facebook

    We are working to build community through open source technology. NB: members must have two-factor auth.

  • Microsoft photo Microsoft

    Open source projects and samples from Microsoft.

  • Google photo Google

    Google โค๏ธ Open Source for everyone.

  • D3 photo D3

    Data-Driven Documents codes.