Winsorize.jl (3168B)
# ------------------------------------------------------------------------------------------
"""
    winsorize(x::AbstractVector;
        probs=nothing, cutpoints=nothing, replace_value=nothing,
        IQR=3, verbose=false)

Winsorize (clip extreme values) in a vector.
Based on Matthieu Gomez's winsorize function in the `statar` R package.

Values strictly below the lower cutpoint or strictly above the upper cutpoint are
replaced; `missing` entries of `x` are never replaced and are ignored when cutpoints
are estimated from the data.

# Arguments
- `x::AbstractVector`: a vector of values

# Keywords
- `probs::Union{Tuple{Real, Real}, Nothing}`: Probability bounds for cutpoints
  (e.g., `(0.05, 0.95)`). Takes precedence over `cutpoints` when both are given.
  The order of the two probabilities does not matter.
- `cutpoints::Union{Tuple{Real, Real}, Nothing}`: Explicit cutpoints for outlier thresholds.
  Default is `(median - IQR*(q75-q25), median + IQR*(q75-q25))`.
  The smaller value is always used as the lower threshold, whatever the order given.
- `replace_value`: Values to replace outliers with. Default: cutpoint values.
  Can be a tuple `(lo, hi)`, `missing`, or `(missing, missing)`
- `IQR::Real=3`: Multiplier from the median for the interquartile range when inferring cutpoints
- `verbose::Bool=false`: Print informational messages

# Returns
- `Vector`: A new vector the size of `x` with substituted values. Its element type is
  widened to hold the replacement values (or `missing`) in addition to those of `x`.

# Throws
- `ErrorException`: if `x` is empty or `probs` lies outside `[0, 1]`.
- `ArgumentError`: if cutpoints must be estimated from `x` but `x` holds only `missing`.

# Examples
```julia
winsorize([1, 2, 3, 4, 100]; cutpoints=(2, 4))                         # [2, 2, 3, 4, 4]
winsorize([1, 2, 3, 4, 100]; cutpoints=(2, 4), replace_value=missing)  # [missing, 2, 3, 4, missing]
```
See the tests for more.
"""
function winsorize(x::AbstractVector{T};
    probs::Union{Tuple{Real, Real}, Nothing} = nothing,
    cutpoints::Union{Tuple{Union{T, Real}, Union{T, Real}}, Nothing} = nothing,
    replace_value::Union{Tuple{Union{T, Real}, Union{T, Real}}, Tuple{Missing, Missing}, Nothing, Missing} = nothing,
    IQR::Real=3,
    verbose::Bool=false
    ) where T

    isempty(x) && error("input vector is empty")

    # Work out the raw (possibly unordered) pair of thresholds.
    bounds = if !isnothing(probs)
        p_lo, p_hi = minimum(probs), maximum(probs)
        (p_lo < 0 || p_hi > 1) && error("probabilities must be in [0, 1]")
        all(ismissing, x) &&
            throw(ArgumentError("cannot infer cutpoints: input vector contains only missing values"))
        verbose && any(ismissing, x) && (@info "Some missing data skipped in winsorizing")
        verbose && !isnothing(cutpoints) && (@info "input cutpoints ignored ... using probabilities")

        observed = skipmissing(x)
        # Use the exact extremes at the boundary probabilities rather than interpolating.
        q_lo = p_lo == 0 ? minimum(observed) : quantile(observed, p_lo)
        q_hi = p_hi == 1 ? maximum(observed) : quantile(observed, p_hi)
        (q_lo, q_hi)
    elseif isnothing(cutpoints)
        all(ismissing, x) &&
            throw(ArgumentError("cannot infer cutpoints: input vector contains only missing values"))
        verbose && any(ismissing, x) && (@info "Some missing data skipped in winsorizing")

        q = quantile(skipmissing(x), [0.25, 0.50, 0.75])
        inferred = (q[2] - IQR * (q[3] - q[1]), q[2] + IQR * (q[3] - q[1]))
        verbose && @info "Inferred cutpoints are ... $inferred (using interquartile range x $IQR from median)"
        inferred
    else
        cutpoints
    end

    # Order the thresholds once so that the comparisons and the default replacement values
    # always agree, even for reversed `cutpoints` or a negative `IQR`.
    lo, hi = minimum(bounds), maximum(bounds)

    fill_values = if isnothing(replace_value)  # default to cutpoints
        (lo, hi)
    elseif ismissing(replace_value)
        (missing, missing)
    else
        replace_value
    end

    # Copy `x` into a vector whose element type can also hold the replacement values.
    S = any(ismissing, fill_values) ? Union{T, Missing} : Union{T, eltype(fill_values)}
    y = Vector{S}(x)

    # Comparisons involving `missing` yield `missing`; treat those as "not an outlier".
    # Both masks are computed from the untouched input `x`, not from `y`.
    y[coalesce.(x .< lo, false)] .= fill_values[1]
    y[coalesce.(x .> hi, false)] .= fill_values[2]

    return y
end