BazerData.jl

Data manipulation utilities for Julia
Log | Files | Refs | README | LICENSE

Winsorize.jl (3168B)


      1 # ------------------------------------------------------------------------------------------
      2 """
      3     winsorize(x::AbstractVector;
      4         probs=nothing, cutpoints=nothing, replace_value=nothing,
      5         IQR=3, verbose=false)
      6 
      7 Winsorize (clip extreme values) in a vector.
      8 Based on Matthieu Gomez's winsorize function in the `statar` R package.
      9 
     10 # Arguments
     11 - `x::AbstractVector`: a vector of values
     12 
     13 # Keywords
     14 - `probs::Union{Tuple{Real, Real}, Nothing}`: Probability bounds for cutpoints (e.g., `(0.05, 0.95)`)
     15 - `cutpoints::Union{Tuple{Real, Real}, Nothing}`: Explicit cutpoints for outlier thresholds.
     16     Default is `(median - IQR*(q75-q25), median + IQR*(q75-q25))`
     17 - `replace_value`: Values to replace outliers with. Default: cutpoint values.
     18     Can be a tuple `(lo, hi)`, `missing`, or `(missing, missing)`
     19 - `IQR::Real=3`: Multiplier from the median for the interquartile range when inferring cutpoints
     20 - `verbose::Bool=false`: Print informational messages
     21 
     22 # Returns
     23 - `AbstractVector`: A vector the size of x with substituted values
     24 
     25 # Examples
     26 - See tests
     27 """
     28 function winsorize(x::AbstractVector{T}; 
     29     probs::Union{Tuple{Real, Real}, Nothing} = nothing,
     30     cutpoints::Union{Tuple{Union{T, Real}, Union{T, Real}}, Nothing} = nothing,
     31     replace_value::Union{Tuple{Union{T, Real}, Union{T, Real}}, Tuple{Missing, Missing}, Nothing, Missing} = nothing,
     32     IQR::Real=3,
     33     verbose::Bool=false
     34     ) where T
     35 
     36     isempty(x) && error("input vector is empty")
     37 
     38     if !isnothing(probs)
     39         (minimum(probs) < 0 || maximum(probs) > 1) && error("probabilities must be in [0, 1]")
     40         lower_percentile = minimum(probs)
     41         upper_percentile = maximum(probs)
     42         verbose && any(ismissing, x) && (@info "Some missing data skipped in winsorizing")
     43         verbose && !isnothing(cutpoints) && (@info "input cutpoints ignored ... using probabilities")
     44 
     45         cut_lo = (lower_percentile==0) ? minimum(skipmissing(x)) : quantile(skipmissing(x), lower_percentile)
     46         cut_hi = (upper_percentile==1) ? maximum(skipmissing(x)) : quantile(skipmissing(x), upper_percentile)
     47         cutpoints = (cut_lo, cut_hi)
     48         
     49     elseif isnothing(cutpoints)
     50         verbose && any(ismissing, x) && (@info "Some missing data skipped in winsorizing")
     51         l = quantile(skipmissing(x), [0.25, 0.50, 0.75])
     52         cutpoints = (l[2] - IQR * (l[3]-l[1]), l[2] + IQR * (l[3]-l[1]) )
     53         verbose && @info "Inferred cutpoints are ... $cutpoints (using interquartile range x $IQR from median)"
     54     end
     55 
     56     if isnothing(replace_value) # default to  cutpoints
     57         replace_value = (minimum(cutpoints), maximum(cutpoints))
     58         replace_value = convert.(Union{T, eltype(replace_value)}, replace_value)
     59     elseif ismissing(replace_value)
     60         replace_value = (missing, missing)
     61     end
     62 
     63     if any(ismissing.(replace_value))
     64         y = Vector{Union{T, Missing}}(x)  # Make a copy of x that can also store missing values
     65     else
     66         y = Vector{Union{T, eltype(replace_value)}}(x)
     67     end
     68     
     69     y[findall(skipmissing(x .< cutpoints[1]))] .= replace_value[1];
     70     y[findall(skipmissing(x .> cutpoints[2]))] .= replace_value[2];
     71 
     72     return y
     73 end