winsorize.jl (4231B)
@testset "winsorize" begin
    # Local helpers shared by the assertions below.
    # Indices at which the two vectors differ.
    altered(a, b) = findall(a .!= b)
    # Same as `altered`, skipping positions where the comparison is `missing`.
    altered_nonmissing(a, b) = findall(skipmissing(a .!= b))
    # Indices where one vector holds `missing` and the other does not.
    flipped_missing(a, b) = findall(ismissing.(a) .!= ismissing.(b))
    # True for the explicit replacement values used in the last tests.
    is_replacement(v) = v ∈ (-1.0, 1.0)

    # Fixed seed: every expected index list below is tied to this exact
    # sequence of random draws, so the order of the `rand` calls matters.
    Random.seed!(3)
    x1 = rand(100)
    x2 = Vector{Union{Float64, Missing}}(rand(Float64, 100))
    x2[rand(collect(1:100), 5)] .= missing

    # --- vectors without missing values
    x1_probs = winsorize(x1; probs=(0.05, 0.95), verbose=true)
    @test altered(x1, x1_probs) == [4, 15, 26, 32, 40, 44, 52, 59, 64, 97]

    x1_default = winsorize(x1; verbose=true)
    @test isempty(altered(x1, x1_default))

    x1_cut = winsorize(x1; cutpoints=(0.01, 0.99), verbose=true)
    @test altered(x1, x1_cut) == [4, 26, 52]

    x1_top = winsorize(x1; cutpoints=(0, 0.9), verbose=true)
    @test isequal(minimum(x1), minimum(x1_top))

    # --- vectors with some missing values
    x2_probs = winsorize(x2; probs=(0.02, 0.98), verbose=true)
    @test size(x2) == size(x2_probs)
    @test altered_nonmissing(x2, x2_probs) == [5, 41, 83, 91]

    x2_default = winsorize(x2; verbose=true)
    @test size(x2) == size(x2_default)
    @test isempty(altered_nonmissing(x2, x2_default))

    # Indices expected to be touched by cutpoints=(0.05, 0.95) on x2;
    # reused by every remaining assertion in this testset.
    touched = [5, 17, 41, 42, 65, 83, 91]

    x2_cut = winsorize(x2; cutpoints=(0.05, 0.95), verbose=true)
    @test size(x2) == size(x2_cut)
    @test altered_nonmissing(x2, x2_cut) == touched

    # --- replace_value given as `missing`: as a (low, high) pair, then as a scalar
    for repl in ((missing, missing), missing)
        x2_repl = winsorize(x2; cutpoints=(0.05, 0.95), replace_value=repl, verbose=true)
        @test size(x2) == size(x2_repl)
        @test flipped_missing(x2, x2_repl) == touched
    end

    # --- replace_value given as explicit numbers.
    # The second pair is Int rather than Float64: we check that this works when
    # the type of the replacement is slightly different from the data
    # (maybe we want to change this ...).
    for repl in ((-1.0, 1.0), (-1, 1))
        x2_repl = winsorize(x2; cutpoints=(0.05, 0.95), replace_value=repl, verbose=true)
        @test size(x2) == size(x2_repl)
        @test findall(is_replacement, skipmissing(x2_repl)) == touched
    end
end


@testset "winsorize - custom IQR" begin
    Random.seed!(42)
    x = randn(1000)  # standard normal: outliers likely beyond ~3σ

    # Number of entries of `x` changed by winsorizing with the given keywords.
    n_clipped(; kwargs...) = count(x .!= winsorize(x; kwargs...))

    # Baseline with the default settings (the default is IQR=3, which should
    # keep most of the data).
    baseline = n_clipped()

    # IQR=1 should clip more aggressively than the default.
    @test n_clipped(IQR=1) > baseline

    # IQR=100 should clip almost nothing; here we require exactly nothing.
    @test n_clipped(IQR=100) == 0
end


@testset "winsorize - edge cases" begin
    # All values identical: nothing to winsorize.
    constant = fill(5.0, 50)
    @test winsorize(constant; probs=(0.05, 0.95)) == constant

    # Single-element vector.
    singleton = [3.14]
    @test winsorize(singleton; probs=(0.1, 0.9)) == singleton

    # Integer vector.
    ints = collect(1:100)
    w_ints = winsorize(ints; probs=(0.05, 0.95))
    @test length(w_ints) == 100
    @test minimum(w_ints) >= minimum(ints)
    @test maximum(w_ints) <= maximum(ints)
    @test count(w_ints .!= ints) > 0  # some values should be clipped

    Random.seed!(1)
    x = rand(100)
    lo, hi = extrema(x)

    # One-sided: only clip the top.
    w_top = winsorize(x; cutpoints=(lo, 0.5))
    @test minimum(w_top) == lo  # bottom unchanged
    @test maximum(w_top) <= 0.5

    # One-sided: only clip the bottom.
    w_bottom = winsorize(x; cutpoints=(0.5, hi))
    @test minimum(w_bottom) >= 0.5
    @test maximum(w_bottom) == hi  # top unchanged
end


@testset "winsorize - all missing" begin
    all_missing = Union{Float64, Missing}[missing for _ in 1:10]
    # The probs path uses skipmissing, which will be empty here; taking a
    # quantile of an empty collection should error.
    @test_throws Exception winsorize(all_missing; probs=(0.05, 0.95))
end

@testset "winsorize - error paths" begin
    # Calling winsorize on an empty vector is expected to throw.
    @test_throws Exception winsorize(Float64[])

    # Invalid probability bounds are expected to throw: first a lower bound
    # below 0, then an upper bound above 1. A fresh input vector is built for
    # each call.
    for bad_probs in ((-0.1, 0.9), (0.1, 1.1))
        @test_throws Exception winsorize([1.0, 2.0, 3.0]; probs=bad_probs)
    end
end