BazerData.jl

Data manipulation utilities for Julia
Log | Files | Refs | README | LICENSE

winsorize_guide.md (5488B)


      1 # Winsorizing
      2 
      3 The function `winsorize` tries to emulate Stata's `winsor` function.
      4 
      5 There is a [`winsor`](https://juliastats.org/StatsBase.jl/stable/robust/#StatsBase.winsor) function in StatsBase.jl but I think it's a little less full-featured.
      6 
      7 
      8 ```@setup hist
      9 import Pkg; Pkg.add("Plots");
     10 using Plots, Random, BazerData
     11 gr(); theme(:wong2); Plots.default(display_type=:inline, size=(1250,750), thickness_scaling=1)
     12 ```
     13 
     14 
     15 ## Basic usage
     16 
     17 Start with a simple distribution to visualize the effect of *winsorizing*
     18 ```@example hist
     19 Random.seed!(3); x = randn(10_000);
     20 p1 = histogram(x, bins=-4:0.1:4, color="blue", label="distribution", 
     21     framestyle=:box, size=(1250,750))
     22 savefig(p1, "p1.svg"); nothing # hide
     23 ```
     24 ![](p1.svg)
     25 
     26 
     27 ### Replace the outliers based on quantile
     28 ```@example hist
     29 x_win = winsorize(x, probs=(0.05, 0.95));
     30 p2 = histogram(x, bins=-4:0.1:4, color="blue", label="distribution", framestyle=:box); 
     31 histogram!(x_win, bins=-4:0.1:4, color="red", opacity=0.5, label="winsorized")
     32 savefig(p2, "p2.svg"); nothing # hide
     33 ```
     34 ![](p2.svg)
     35 
     36 
     37 ### One side trim
     38 ```@example hist
     39 x_win = winsorize(x, probs=(0, 0.8));
     40 p3 = histogram(x, bins=-4:0.1:4, color="blue", label="distribution", framestyle=:box);
     41 histogram!(x_win, bins=-4:0.1:4, color="red", opacity=0.5, label="winsorized");
     42 savefig(p3, "p3.svg"); nothing # hide
     43 ```
     44 ![](p3.svg)
     45 
     46 
     47 ### Bring your own cutpoints
     48 Another type of winsorizing is to specify your own cutpoints (they do not have to be symmetric):
     49 ```@example hist
     50 x_win = winsorize(x, cutpoints=(-1.96, 2.575));
     51 p4 = histogram(x, bins=-4:0.1:4, color="blue", label="distribution", framestyle=:box); 
     52 histogram!(x_win, bins=-4:0.1:4, color="red", opacity=0.5, label="winsorized");
     53 savefig(p4, "p4.svg"); nothing # hide
     54 ```
     55 ![](p4.svg)
     56 
     57 
     58 ### Rely on the computer to select the right cutpoints
     59 If you specify neither `probs` nor `cutpoints`, the cutpoints will be chosen automatically:
     60 ```@example hist
     61 x_win = winsorize(x; verbose=true);
     62 p5 = histogram(x, bins=-4:0.1:4, color="blue", label="distribution", framestyle=:box); 
     63 histogram!(x_win, bins=-4:0.1:4, color="red", opacity=0.5, label="winsorized");
     64 savefig(p5, "p5.svg"); nothing # hide
     65 ```
     66 ![](p5.svg)
     67 
     68 
     69 ### How not to replace outliers
     70 If you do not want to replace the outliers with the cutoff values, specify `replace_value=missing`:
     71 ```@example hist
     72 x_win = winsorize(x, cutpoints=(-2.575, 1.96), replace_value=missing);
     73 p6 = histogram(x, bins=-4:0.1:4, color="blue", label="distribution", framestyle=:box); 
     74 histogram!(x_win, bins=-4:0.1:4, color="red", opacity=0.5, label="winsorized");
     75 savefig(p6, "p6.svg"); nothing # hide
     76 ```
     77 ![](p6.svg)
     78 
     79 
     80 ### How to choose your replacement
     81 The `replace_value` keyword argument gives you the flexibility to replace outliers with whatever values you choose:
     82 ```@example hist
     83 x_win = winsorize(x, cutpoints=(-2.575, 1.96), replace_value=(-1.96, 1.28));
     84 p7 = histogram(x, bins=-4:0.1:4, color="blue", label="distribution", framestyle=:box); 
     85 histogram!(x_win, bins=-4:0.1:4, color="red", opacity=0.5, label="winsorized");
     86 savefig(p7, "p7.svg"); nothing # hide
     87 ```
     88 ![](p7.svg)
     89 
     90 
     91 
     92 ## Within a DataFrame
     93 
     94 I try to mimic the `gtools winsor` [example](https://raw.githubusercontent.com/mcaceresb/stata-gtools/master/docs/examples/gstats_winsor.do).
     95 
     96 ```@setup dataframe
     97 import Pkg; 
     98 Pkg.add("DataFrames"); Pkg.add("Plots");
     99 Pkg.add("PalmerPenguins"); ENV["DATADEPS_ALWAYS_ACCEPT"] = true
    100 using DataFrames, PalmerPenguins, Plots, BazerData
    101 gr(); theme(:wong2); Plots.default(display_type=:inline, size=(1250,750), thickness_scaling=1)
    102 ```
    103 
    104 
    105 Winsorize one variable
    106 ```@example dataframe
    107 df = DataFrame(PalmerPenguins.load())
    108 
    109 # gstats winsor wage
    110 transform!(df, :body_mass_g => (x -> winsorize(x, probs=(0.1, 0.9)) ) => :body_mass_g_w) 
    111 
    112 p8 = histogram(df.body_mass_g, bins=2700:100:6300, color="blue", label="distribution", framestyle=:box); 
    113 histogram!(df.body_mass_g_w, bins=2700:100:6300, color="red", opacity=0.5, label="winsorized");
    114 savefig(p8, "p8.svg"); nothing # hide
    115 ```
    116 ![](p8.svg)
    117 
    118 
    119 Winsorize multiple variables
    120 ```@example dataframe
    121 # gstats winsor wage age hours, cuts(0.5 99.5) replace
    122 var_to_winsorize = ["bill_length_mm", "bill_depth_mm", "flipper_length_mm"]
    123 transform!(df, 
    124     var_to_winsorize .=> (x -> winsorize(x, probs=(0.1, 0.9)) ) .=> var_to_winsorize .* "_w")
    125 show(IOContext(stdout, :limit => true, :displaysize => (20, 100)), 
    126     select(df, :species, :island, :bill_length_mm, :bill_length_mm_w, 
    127                :bill_depth_mm, :bill_depth_mm_w, :flipper_length_mm, :flipper_length_mm_w),
    128     allcols=true, allrows=false)
    129 nothing; # hide
    130 ```
    131 
    132 Winsorize on one side only
    133 ```@example dataframe
    134 # left-winsorizing only, at the 10th percentile (the Stata example below uses the 1st);
    135 # cap noi gstats winsor wage, cuts(1 100); gstats winsor wage, cuts(1 100) s(_w2)
    136 transform!(df, :body_mass_g => (x -> winsorize(x, probs=(0.1, 1)) ) => :body_mass_g_w )
    137 show(IOContext(stdout, :limit => true, :displaysize => (20, 100)), 
    138     select(df, :species, :island, :body_mass_g, :body_mass_g_w), 
    139     allcols=true, allrows=false)
    140 nothing; # hide
    141 ```
    142 
    143 Winsorize by groups
    144 ```@example dataframe
    145 transform!(
    146     groupby(df, :sex),
    147     :body_mass_g => (x -> winsorize(x, probs=(0.2, 0.8)) ) => :body_mass_g_w)
    148 p9 = histogram(df[ isequal.(df.sex, "male"), :body_mass_g], bins=3000:100:6300, 
    149     color="blue", label="distribution", framestyle=:box);
    150 histogram!(df[ isequal.(df.sex, "male"), :body_mass_g_w], bins=3000:100:6300, 
    151     color="red", opacity=0.5, label="winsorized");
    152 savefig(p9, "p9.svg"); nothing # hide
    153 ```
    154 ![](p9.svg)
    155 
    156 
    157 
    158 
    159