BazerData.jl

Data manipulation utilities for Julia
Log | Files | Refs | README | LICENSE

timeshift.jl (10365B)


      1 @testset "Time Shift" begin
      2 
      3 
      4 # --------------------------------------------------------------------------------------------------
      5     df1 = DataFrame(        # missing t=2 for id=1
      6         id = ["a","a","b","b","c","c","c"],
      7         t  = [1,3,8,9,1,2,4],
      8         v1  = [1,1,1,6,6,0,0],
      9         v2  = [1,2,3,6,6,4,5],
     10         v3  = [1,5,4,6,6,15,12.25])
     11 
     12     df2 = DataFrame(        # missing t=2 for id=1
     13         id = ["a","a", "b","b", "c","c","c", "d","d","d","d"],
     14         t  = [Date(1990, 1, 1), Date(1990, 4, 1), Date(1990, 8, 1), Date(1990, 9, 1),
     15               Date(1990, 1, 1), Date(1990, 2, 1), Date(1990, 4, 1),
     16               Date(1999, 11, 10), Date(1999, 12, 21), Date(2000, 2, 5), Date(2000, 4, 1)],
     17         v1 = [1,1, 1,6, 6,0,0, 1,4,11,13],
     18         v2 = [1,2,3,6,6,4,5, 1,2,3,4],
     19         v3 = [1,5,4,6,6,15,12.25, 21,22.5,17.2,1])
     20 
     21     # --- test for df1
     22     @testset "DF1" begin
     23         sort!(df1, [:id, :t])
     24         transform!(groupby(df1, :id), [:t, :v2] => ( (d, x) -> tlag(x, d)) => :v2_lag)
     25         @test isequal(df1.v2_lag, [missing, missing, missing, 3, missing, 6, missing])
     26     end
     27 
     28     # --- test  for df2 multiple variables
     29     @testset "DF2" begin
     30         sort!(df2, [:id, :t])
     31         transform!(
     32             groupby(df2, :id),
     33             [:t, :v1] =>
     34                 ((t, v1) -> (; v1_lag_day = tlag(v1, t; verbose=true),
     35                                v1_lag_mth = tlag(v1, t; n=Month(1), verbose=true) ) ) =>
     36                 [:v1_lag_day, :v1_lag_mth])
     37 
     38         @test all(ismissing.(df2.v1_lag_day))
     39         @test isequal(df2.v1_lag_mth,
     40             [missing, missing, missing, 1, missing, 6, missing, missing, missing, missing, missing ])
     41 
     42     end
     43 # --------------------------------------------------------------------------------------------------
     44 
     45 
     46 # --------------------------------------------------------------------------------------------------
     47     @testset "General tests" begin
     48 
     49     # --- test large datasets
     50     function generate_test_data(;size=50_000, gap_probability=0.1, seed=123)
     51         Random.seed!(seed)
     52 
     53         # Start date and initialize arrays
     54         start_date = Date(2020, 1, 1)
     55         dates = Vector{Date}()
     56         x_values = Vector{Float64}()
     57 
     58         # Generate dates with some gaps and corresponding x values
     59         current_date = start_date
     60         for i in 1:size
     61             # Add current date and value
     62             push!(dates, current_date)
     63             push!(x_values, sin(i/100) + 0.1*randn()) # Some noisy sine wave pattern
     64 
     65             # Decide whether to introduce a gap (skip 1-5 days)
     66             if rand() < gap_probability
     67                 gap_size = rand(1:5)
     68                 current_date += Day(gap_size + 1)
     69             else
     70                 # Normal increment
     71                 current_date += Day(1)
     72             end
     73         end
     74 
     75         # Create DataFrame
     76         df = DataFrame(date=dates, x=x_values)
     77         return df
     78     end
     79 
     80     tiny_df  = generate_test_data(size=50, gap_probability=0.05);
     81     small_df = generate_test_data(size=5_000, gap_probability=0.1);
     82     large_df = generate_test_data(size=1_000_000, gap_probability=0.1);
     83 
     84     @time transform!(small_df, [:x, :date] => ( (x, d) -> tlag(x, d)) => :x_lag)
     85     @test nrow(subset(small_df, :x_lag => ByRow(!ismissing))) == 4525
     86 
     87     @time transform!(large_df, [:x, :date] => ( (x, d) -> tlag(x, d)) => :x_lag_day);
     88     @time transform!(large_df, [:x, :date] => ( (x, d) -> tlag(x, d, n=Month(1))) => :x_lag_mth);
     89     @time transform!(large_df, [:x, :date] => ( (x, d) -> tlag(x, d, n=Year(1))) => :x_lag_yr);
     90 
     91     transform!(large_df, :date => ByRow(year) => :datey)
     92     @test_throws r"time vector not sorted"i transform!(large_df,
     93         [:x, :datey] => ( (x, d) -> tlag(x, d, n=1)) => :x_lag_datey);
     94 
     95     @test nrow(subset(large_df, :x_lag_day => ByRow(!ismissing)))    == 900_182
     96     @test nrow(subset(large_df, :x_lag_mth => ByRow(!ismissing)))    == 770_178
     97     @test nrow(subset(large_df, :x_lag_yr => ByRow(!ismissing)))     == 769_502
     98 
     99     @time transform!(tiny_df, [:x, :date] => ( (x, d) -> tlead(x, d)) => :x_lead)
    100     @time transform!(tiny_df, [:x_lead, :date] => ( (x, d) -> tlag(x, d)) => :x_lead_lag)
    101     @test dropmissing(tiny_df) |> (df -> df.x == df.x_lead_lag)  # lead lag reverts back up to destroyed information
    102 
    103     @time transform!(tiny_df, [:x, :date] => ( (x, d) -> tlead(x, d, n=Day(2)) ) => :x_lead2)
    104     @time transform!(tiny_df, [:x_lead2, :date] => ( (x, d) -> tlag(tlag(x, d), d) ) => :x_lead2_lag2)
    105     @test dropmissing(tiny_df) |> (df -> df.x == df.x_lead2_lag2)  # lead lag reverts back up to destroyed information
    106 
    107 
    108     end # of "General tests"
    109 # --------------------------------------------------------------------------------------------------
    110 
    111 
    112 # --------------------------------------------------------------------------------------------------
    @testset "From Panelshift.jl" begin

    import PanelShift

    # Every case is checked twice: against PanelShift.jl and against a literal result.
    # Note the argument order differs between the two packages:
    #   PanelShift.tlag(time_variable, x, n)
    #   BazerData.tlag(x, time_variable; n = ...)

    # --- contiguous integer time index
    t123 = [1, 2, 3]
    x456 = [4, 5, 6]

    lagged = tlag(x456, t123)
    @test isequal(PanelShift.tlag(t123, x456, 1), lagged)
    @test isequal(lagged, [missing, 4, 5])

    led = tlead(x456, t123)
    @test isequal(PanelShift.tlead(t123, x456, 1), led)
    @test isequal(led, [5, 6, missing])

    lagged = tlag(x456, t123; n=2)
    @test isequal(PanelShift.tlag(t123, x456, 2), lagged)
    @test isequal(lagged, [missing, missing, 4])

    led = tlead(x456, t123; n=2)
    @test isequal(PanelShift.tlead(t123, x456, 2), led)
    @test isequal(led, [6, missing, missing])

    # --- unit-length vector
    lagged = tlag([1], [1])
    @test isequal(PanelShift.tlag([1], [1]), lagged)
    @test isequal(lagged, [missing])

    led = tlead([1], [1])
    @test isequal(PanelShift.tlead([1], [1]), led)
    @test isequal(led, [missing])

    # --- integer time index with gaps
    t_gaps = [1, 3, 5, 6, 7]
    x_seq = [1, 2, 3, 4, 5]

    lagged = tlag(x_seq, t_gaps; n=2)
    @test isequal(PanelShift.tlag(t_gaps, x_seq, 2), lagged)
    @test isequal(lagged, [missing, 1, 2, missing, 3])

    # NOTE(review): the BazerData call converts the values to float while the PanelShift
    # call converts the time vector - confirm this asymmetry is intended.
    lagged = tlag(float.(x_seq), t_gaps; n=2)
    @test isequal(PanelShift.tlag(float.(t_gaps), x_seq, 2), lagged)
    @test isequal(lagged, [missing, 1, 2, missing, 3])

    # --- non-numeric x and unequal gaps
    fruits = [:apple, :orange, :banana, :pineapple, :strawberry]
    t_uneven = [1, 2, 4, 7, 11]
    lag_cases = (
        1 => [missing, :apple, missing, missing, missing],
        2 => [missing, missing, :orange, missing, missing],
        3 => [missing, missing, :apple, :banana, missing],
        4 => [missing, missing, missing, missing, :pineapple],
    )
    for (k, expected) in lag_cases
        lagged = tlag(fruits, t_uneven; n=k)
        @test isequal(PanelShift.tlag(t_uneven, fruits, k), lagged)
        @test isequal(lagged, expected)
    end

    led = tlead(fruits, t_uneven; n=4)
    @test isequal(PanelShift.tlead(t_uneven, fruits, 4), led)
    @test isequal(led, [missing, missing, missing, :strawberry, missing])

    # --- indexed by dates
    x123 = [1, 2, 3]
    t_dates = [Date(2000, 1, 1), Date(2000, 1, 2), Date(2000, 1, 4)]
    date_cases = (
        Day(1) => [missing, 1, missing],
        Day(2) => [missing, missing, 2],
    )
    for (step, expected) in date_cases
        lagged = tlag(x123, t_dates; n=step)
        @test isequal(PanelShift.tlag(t_dates, x123, step), lagged)
        @test isequal(lagged, expected)
    end

    # --- tshift: a negative n matches tlead, a positive n matches tlag
    shifted = tshift(x123, t123; n=-1)
    @test isequal(PanelShift.tshift(t123, x123, -1), shifted)
    @test isequal(shifted, tlead(x123, t123; n=1))

    shifted = tshift(x123, t123; n=1)
    @test isequal(PanelShift.tshift(t123, x123, 1), shifted)
    @test isequal(shifted, tlag(x123, t123; n=1))

    # --- safeguards for tlag
    # repeated time value
    @test_throws r"time vector not sorted"i tlag([1, 2, 3], [1, 2, 2])
    # values and time vector of different lengths
    @test_throws r"value and time vector"i tlag([1, 2], [1, 2, 3])
    # zero shift
    @test_throws r"shift value"i tlag([1, 2, 3], [1, 2, 3], n=0)

    end
    201 # --------------------------------------------------------------------------------------------------
    202 
    203 
    204 # --------------------------------------------------------------------------------------------------
    @testset "tlead error paths" begin
        vals = [1, 2, 3]

        # time vector out of order
        unsorted_times = [3, 1, 2]
        @test_throws r"time vector not sorted"i tlead(vals, unsorted_times)

        # two values against three time points
        @test_throws r"value and time vector"i tlead([1, 2], [1, 2, 3])

        # a shift of zero is rejected
        sorted_times = [1, 2, 3]
        @test_throws r"shift value"i tlead(vals, sorted_times, n=0)
    end
    215 # --------------------------------------------------------------------------------------------------
    216 
    217 
    218 # --------------------------------------------------------------------------------------------------
    @testset "tshift edge cases" begin
        # Called without `n`, tshift must log a "shift not specified" warning and return
        # the same result as tlag called with its defaults.
        vals = [1, 2, 3]
        times = [1, 2, 3]
        result = @test_logs (:warn, r"shift not specified"i) tshift(vals, times)
        @test isequal(result, tlag(vals, times))

        # Date-indexed input: a positive Day shift matches tlag, a negative one tlead.
        dates = Date.(2020, 1, 1:3)
        daily = [10, 20, 30]

        result = tshift(daily, dates; n=Day(1))
        @test isequal(result, tlag(daily, dates; n=Day(1)))

        result = tshift(daily, dates; n=Day(-1))
        @test isequal(result, tlead(daily, dates; n=Day(1)))
    end
    232 # --------------------------------------------------------------------------------------------------
    233 
    234 
    235 end