timeshift.jl (10365B)
@testset "Time Shift" begin

    # ------------------------------------------------------------------------------------
    # Shared fixtures: small panels keyed by `id` with gaps in the time variable `t`.
    # They are sorted and extended in place by the "DF1" / "DF2" testsets below.

    # Integer time index. Id "a" is observed at t = 1 and t = 3 but not at t = 2.
    df1 = DataFrame(;
        id=["a", "a", "b", "b", "c", "c", "c"],
        t=[1, 3, 8, 9, 1, 2, 4],
        v1=[1, 1, 1, 6, 6, 0, 0],
        v2=[1, 2, 3, 6, 6, 4, 5],
        v3=[1, 5, 4, 6, 6, 15, 12.25],
    )

    # Date time index with irregular spacing inside every id.
    df2 = DataFrame(;
        id=["a", "a", "b", "b", "c", "c", "c", "d", "d", "d", "d"],
        t=[
            Date(1990, 1, 1), Date(1990, 4, 1),
            Date(1990, 8, 1), Date(1990, 9, 1),
            Date(1990, 1, 1), Date(1990, 2, 1), Date(1990, 4, 1),
            Date(1999, 11, 10), Date(1999, 12, 21), Date(2000, 2, 5), Date(2000, 4, 1),
        ],
        v1=[1, 1, 1, 6, 6, 0, 0, 1, 4, 11, 13],
        v2=[1, 2, 3, 6, 6, 4, 5, 1, 2, 3, 4],
        v3=[1, 5, 4, 6, 6, 15, 12.25, 21, 22.5, 17.2, 1],
    )

    # --- grouped lag on the integer-indexed panel
    @testset "DF1" begin
        sort!(df1, [:id, :t])
        lag_in_group = (time, value) -> tlag(value, time)
        transform!(groupby(df1, :id), [:t, :v2] => lag_in_group => :v2_lag)

        # Only rows whose id has an observation at t - 1 receive a value:
        # ("b", t = 9) picks up 3 and ("c", t = 2) picks up 6.
        expected_lag = [missing, missing, missing, 3, missing, 6, missing]
        @test isequal(df1.v2_lag, expected_lag)
    end

    # --- grouped lags on the date-indexed panel, two output columns from one call
    @testset "DF2" begin
        sort!(df2, [:id, :t])
        day_and_month_lags = (time, value) -> (;
            v1_lag_day=tlag(value, time; verbose=true),
            v1_lag_mth=tlag(value, time; n=Month(1), verbose=true)
        )
        transform!(
            groupby(df2, :id),
            [:t, :v1] => day_and_month_lags => [:v1_lag_day, :v1_lag_mth],
        )

        # The default-shift lag is expected to find no match anywhere in this data.
        @test all(ismissing, df2.v1_lag_day)

        # One-month lag: ("b", 1990-09-01) picks up 1 and ("c", 1990-02-01) picks up 6.
        expected_mth = [
            missing, missing, missing, 1, missing, 6,
            missing, missing, missing, missing, missing,
        ]
        @test isequal(df2.v1_lag_mth, expected_mth)
    end
    # ------------------------------------------------------------------------------------


    # ------------------------------------------------------------------------------------
    @testset "General tests" begin

        # Build a (date, x) DataFrame of `size` rows starting on 2020-01-01. After each
        # row the date advances by one day, or - with probability `gap_probability` -
        # by 2 to 6 days. `x` is a noisy sine wave. The global RNG is seeded with `seed`.
        # The hard-coded counts asserted below depend on the exact sequence of random
        # draws, so the order randn() -> rand() -> rand(1:5) must not be changed.
        function generate_test_data(; size=50_000, gap_probability=0.1, seed=123)
            Random.seed!(seed)

            dates = Vector{Date}(undef, size)
            x_values = Vector{Float64}(undef, size)

            day = Date(2020, 1, 1)
            for i in 1:size
                dates[i] = day
                x_values[i] = sin(i / 100) + 0.1 * randn()

                # Number of calendar days skipped before the next observation.
                skipped = rand() < gap_probability ? rand(1:5) : 0
                day += Day(skipped + 1)
            end

            return DataFrame(; date=dates, x=x_values)
        end

        tiny_df = generate_test_data(; size=50, gap_probability=0.05)
        small_df = generate_test_data(; size=5_000, gap_probability=0.1)
        large_df = generate_test_data(; size=1_000_000, gap_probability=0.1)

        default_lag = (v, t) -> tlag(v, t)

        @time transform!(small_df, [:x, :date] => default_lag => :x_lag)
        @test count(!ismissing, small_df.x_lag) == 4525

        # Timings on the large dataset are printed for information only.
        @time transform!(large_df, [:x, :date] => default_lag => :x_lag_day)
        @time transform!(
            large_df, [:x, :date] => ((v, t) -> tlag(v, t; n=Month(1))) => :x_lag_mth
        )
        @time transform!(
            large_df, [:x, :date] => ((v, t) -> tlag(v, t; n=Year(1))) => :x_lag_yr
        )

        # A year column repeats the same value on many rows; using it as the time
        # vector must be rejected.
        large_df.datey = year.(large_df.date)
        @test_throws r"time vector not sorted"i transform!(
            large_df, [:x, :datey] => ((v, t) -> tlag(v, t; n=1)) => :x_lag_datey
        )

        @test count(!ismissing, large_df.x_lag_day) == 900_182
        @test count(!ismissing, large_df.x_lag_mth) == 770_178
        @test count(!ismissing, large_df.x_lag_yr) == 769_502

        # Lead followed by lag gives back the original values on every row that
        # survives both shifts (rows with a missing in any column are dropped).
        @time transform!(tiny_df, [:x, :date] => ((v, t) -> tlead(v, t)) => :x_lead)
        @time transform!(tiny_df, [:x_lead, :date] => default_lag => :x_lead_lag)
        complete_rows = dropmissing(tiny_df)
        @test complete_rows.x == complete_rows.x_lead_lag

        # Same round trip with a two-day lead undone by two successive default lags.
        @time transform!(
            tiny_df, [:x, :date] => ((v, t) -> tlead(v, t; n=Day(2))) => :x_lead2
        )
        @time transform!(
            tiny_df, [:x_lead2, :date] => ((v, t) -> tlag(tlag(v, t), t)) => :x_lead2_lag2
        )
        complete_rows = dropmissing(tiny_df)
        @test complete_rows.x == complete_rows.x_lead2_lag2

    end # of "General tests"
    # ------------------------------------------------------------------------------------


    # ------------------------------------------------------------------------------------
    @testset "From Panelshift.jl" begin

        import PanelShift

        # The two APIs take their arguments in a different order:
        #   PanelShift.tlag(time_variable, x, n)
        #   tlag(x, time_variable; n)            <- function under test
        # Every case checks agreement with PanelShift and then the literal result.

        shifted = tlag([4, 5, 6], [1, 2, 3])
        @test isequal(PanelShift.tlag([1, 2, 3], [4, 5, 6], 1), shifted)
        @test isequal(shifted, [missing, 4, 5])

        shifted = tlead([4, 5, 6], [1, 2, 3])
        @test isequal(PanelShift.tlead([1, 2, 3], [4, 5, 6], 1), shifted)
        @test isequal(shifted, [5, 6, missing])

        shifted = tlag([4, 5, 6], [1, 2, 3]; n=2)
        @test isequal(PanelShift.tlag([1, 2, 3], [4, 5, 6], 2), shifted)
        @test isequal(shifted, [missing, missing, 4])

        shifted = tlead([4, 5, 6], [1, 2, 3]; n=2)
        @test isequal(PanelShift.tlead([1, 2, 3], [4, 5, 6], 2), shifted)
        @test isequal(shifted, [6, missing, missing])

        # unit-length vector
        shifted = tlag([1], [1])
        @test isequal(PanelShift.tlag([1], [1]), shifted)
        @test isequal(shifted, [missing])

        shifted = tlead([1], [1])
        @test isequal(PanelShift.tlead([1], [1]), shifted)
        @test isequal(shifted, [missing])

        # gaps in an integer time index
        shifted = tlag([1, 2, 3, 4, 5], [1, 3, 5, 6, 7]; n=2)
        @test isequal(PanelShift.tlag([1, 3, 5, 6, 7], [1, 2, 3, 4, 5], 2), shifted)
        @test isequal(shifted, [missing, 1, 2, missing, 3])

        # NOTE(review): the reference call below applies `float.` to the time vector,
        # whereas the call under test applies it to x. The comparison still holds
        # because isequal(1, 1.0) is true - confirm this asymmetry is intended.
        shifted = tlag(float.([1, 2, 3, 4, 5]), [1, 3, 5, 6, 7]; n=2)
        @test isequal(PanelShift.tlag(float.([1, 3, 5, 6, 7]), [1, 2, 3, 4, 5], 2), shifted)
        @test isequal(shifted, [missing, 1, 2, missing, 3])

        # non-numeric x and unequal gaps
        fruits = [:apple, :orange, :banana, :pineapple, :strawberry]
        uneven_times = [1, 2, 4, 7, 11]
        lag_cases = [
            1 => [missing, :apple, missing, missing, missing],
            2 => [missing, missing, :orange, missing, missing],
            3 => [missing, missing, :apple, :banana, missing],
            4 => [missing, missing, missing, missing, :pineapple],
        ]
        for (n, expected) in lag_cases
            shifted = tlag(fruits, uneven_times; n=n)
            @test isequal(PanelShift.tlag(uneven_times, fruits, n), shifted)
            @test isequal(shifted, expected)
        end

        shifted = tlead(fruits, uneven_times; n=4)
        @test isequal(PanelShift.tlead(uneven_times, fruits, 4), shifted)
        @test isequal(shifted, [missing, missing, missing, :strawberry, missing])

        # indexed by dates
        gapped_dates = [Date(2000, 1, 1), Date(2000, 1, 2), Date(2000, 1, 4)]

        shifted = tlag([1, 2, 3], gapped_dates; n=Day(1))
        @test isequal(PanelShift.tlag(gapped_dates, [1, 2, 3], Day(1)), shifted)
        @test isequal(shifted, [missing, 1, missing])

        shifted = tlag([1, 2, 3], gapped_dates; n=Day(2))
        @test isequal(PanelShift.tlag(gapped_dates, [1, 2, 3], Day(2)), shifted)
        @test isequal(shifted, [missing, missing, 2])

        # tshift: a negative n behaves as a lead, a positive n as a lag
        shifted = tshift([1, 2, 3], [1, 2, 3]; n=-1)
        @test isequal(PanelShift.tshift([1, 2, 3], [1, 2, 3], -1), shifted)
        @test isequal(shifted, tlead([1, 2, 3], [1, 2, 3]; n=1))

        shifted = tshift([1, 2, 3], [1, 2, 3]; n=1)
        @test isequal(PanelShift.tshift([1, 2, 3], [1, 2, 3], 1), shifted)
        @test isequal(shifted, tlag([1, 2, 3], [1, 2, 3]; n=1))

        # safeguards for tlag
        @test_throws r"time vector not sorted"i tlag([1, 2, 3], [1, 2, 2])
        @test_throws r"value and time vector"i tlag([1, 2], [1, 2, 3])
        @test_throws r"shift value"i tlag([1, 2, 3], [1, 2, 3]; n=0)

    end
    # ------------------------------------------------------------------------------------


    # ------------------------------------------------------------------------------------
    @testset "tlead error paths" begin
        # unsorted time vector
        @test_throws r"time vector not sorted"i tlead([1, 2, 3], [3, 1, 2])

        # mismatched lengths
        @test_throws r"value and time vector"i tlead([1, 2], [1, 2, 3])

        # zero shift
        @test_throws r"shift value"i tlead([1, 2, 3], [1, 2, 3]; n=0)
    end
    # ------------------------------------------------------------------------------------


    # ------------------------------------------------------------------------------------
    @testset "tshift edge cases" begin
        # Without `n`, tshift must emit a warning and return the same result as tlag.
        result = @test_logs (:warn, r"shift not specified"i) tshift([1, 2, 3], [1, 2, 3])
        @test isequal(result, tlag([1, 2, 3], [1, 2, 3]))

        # tshift with Date vectors: the sign of the period selects lag or lead
        dates = [Date(2020, 1, 1), Date(2020, 1, 2), Date(2020, 1, 3)]

        result = tshift([10, 20, 30], dates; n=Day(1))
        @test isequal(result, tlag([10, 20, 30], dates; n=Day(1)))

        result = tshift([10, 20, 30], dates; n=Day(-1))
        @test isequal(result, tlead([10, 20, 30], dates; n=Day(1)))
    end
    # ------------------------------------------------------------------------------------

end