commit 1203c402a3c50602f5ea707c84134bcf6f58bfc0
parent 6109d8abafa19af3cc4bce88c555bec2d7c2bb5f
Author: Erik Loualiche <eloualic@umn.edu>
Date: Tue, 20 May 2025 17:46:01 -0500
loosen project.toml restrictions
Diffstat:
6 files changed, 456 insertions(+), 3 deletions(-)
diff --git a/Project.toml b/Project.toml
@@ -19,17 +19,18 @@ ColorSchemes = "3.29.0"
Crayons = "4.1.1"
DataFrames = "1.7.0"
Dates = "1.11.0"
-Interpolations = "0.16.1"
+Interpolations = ">= 0.15"
Missings = "1.2.0"
PrettyTables = "2.4.0"
Random = "1.11.0"
-StatsBase = "0.34.5"
+StatsBase = " >= 0.30"
julia = ">= 1.10.9"
[extras]
PalmerPenguins = "8b842266-38fa-440a-9b57-31493939ab85"
+PanelShift = "d68e4d5e-4a60-4df1-b225-9a1636c75ae0"
StreamToString = "dc918f9c-79cc-42e6-85f1-d8b9b09632f4"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
[targets]
-test = ["Test", "PalmerPenguins", "StreamToString"]
+test = ["Test", "PalmerPenguins", "StreamToString", "PanelShift"]
diff --git a/README.md b/README.md
@@ -13,6 +13,7 @@ So far the package provides a four functions
2. create category based on quantile ([`xtile`](#xtile))
3. winsorize some data ([`winsorize`](#winsorize-data))
4. fill unbalanced panel data ([`panel_fill`](#filling-an-unbalanced-panel))
+ 5. lead and lag functions ([`tlead|tlag`](#leads-and-lags))
Note that as the package grow in different directions, dependencies might become overwhelming.
The readme serves as documentation; there might be more examples inside of the test folder.
@@ -104,7 +105,41 @@ panel_fill(df_panel, :id, :t, [:v1, :v2, :v3],
gap=Month(1), method=:linear, uniquecheck=true, flag=true, merge=true)
```
+### Leads and lags
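+This is largely "borrowed" (copied) from @FuZhiyu's [`PanelShift.jl`](https://github.com/FuZhiyu/PanelShift.jl) package. Note that the argument order differs from `PanelShift.jl`: the values come first, the time vector second, and the shift is passed through the keyword `n`.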
+```julia
+t, v = [1;2;4], [1;2;3];
+julia> tlag(v, t) # values first, time second; the default lag is one unit of the time variable, here 1
+3-element Vector{Union{Missing, Int64}}:
+ missing
+ 1
+ missing
+
+
+julia> tlag(v, t, n=2) # we can also specify the lag using the keyword argument `n`
+3-element Vector{Union{Missing, Int64}}:
+ missing
+ missing
+ 2
+
+
+julia> using Dates;
+julia> t = [Date(2020,1,1); Date(2020,1,2); Date(2020,1,4)];
+julia> tlag([1, 2, 3], t) # other types of time vector (here dates) are also supported
+3-element Vector{Union{Missing, Int64}}:
+ missing
+ 1
+ missing
+
+
+julia> tlag([1, 2, 3], t, n=Day(2)) # specify two-day lags
+3-element Vector{Union{Missing, Int64}}:
+ missing
+ missing
+ 2
+
+```
## Other stuff
diff --git a/src/BazerData.jl b/src/BazerData.jl
@@ -3,6 +3,7 @@ module BazerData
# --------------------------------------------------------------------------------------------------
import ColorSchemes: get, colorschemes
import Crayons: @crayon_str
+import Dates: Date
import DataFrames: AbstractDataFrame, ByRow, DataFrame, groupby, combine, nrow, Not, nonunique, proprow,
rename, rename!, select, select!, transform, transform!, unstack
import Dates: format, now, DatePeriod, Dates, Dates.AbstractTime, ISODateTimeFormat
@@ -17,6 +18,7 @@ import StatsBase: quantile, UnitWeights, Weights
# --------------------------------------------------------------------------------------------------
# Import functions
include("PanelData.jl")
+include("TimeShift.jl")
include("StataUtils.jl")
include("Winsorize.jl")
# --------------------------------------------------------------------------------------------------
@@ -25,6 +27,7 @@ include("Winsorize.jl")
# --------------------------------------------------------------------------------------------------
# List of exported functions
export panel_fill, panel_fill!
+export tlead, tlag, tshift
export tabulate
export xtile
export winsorize
diff --git a/src/TimeShift.jl b/src/TimeShift.jl
@@ -0,0 +1,147 @@
+# --------------------------------------------------------------------------------------------------
+# most of this code was copied from @FuZhiyu PanelShift.jl package
+# --------------------------------------------------------------------------------------------------
+
+
+# --------------------------------------------------------------------------------------------------
+"""
+    tlag(x, t_vec; n=nothing, checksorted=true, verbose=false)
+
+Lag the values `x` by `n` along the time index `t_vec`.
+
+Entry `i` of the result holds the element of `x` observed at time `t_vec[i] - n`, or
+`missing` when no observation carries exactly that time stamp.
+
+# Arguments
+- `x`: values to shift; must be as long as `t_vec`.
+- `t_vec`: time index of the observations.
+
+# Keywords
+- `n`: shift, strictly positive. Defaults to `oneunit(t_vec[1] - t_vec[1])`. When the
+  time index holds `Date`s the type of `n` is not checked (so `Month(1)` is accepted);
+  otherwise `n` must have the type of the difference of two time stamps.
+- `checksorted`: check that `t_vec` is strictly increasing.
+- `verbose`: log which default or check was applied.
+
+# Returns
+A new `Vector{Union{Missing, eltype(x)}}` with one entry per element of `t_vec`.
+
+# Throws
+- `ErrorException` if `n` has the wrong type or is not positive, if `t_vec` is not
+  strictly increasing, or if `x` and `t_vec` differ in length.
+- `ArgumentError` if `x` or `t_vec` is not indexed from 1.
+
+# Examples
+```julia
+tlag([4, 5, 6], [1, 2, 4])        # -> [missing, 4, missing]
+tlag([4, 5, 6], [1, 2, 4]; n=2)   # -> [missing, missing, 5]
+```
+"""
+function tlag(x, t_vec;
+    n = nothing,
+    checksorted = true,
+    verbose = false,
+    )
+
+    # the scanning kernel walks 1:N under @inbounds, so offset axes would be unsafe
+    Base.require_one_based_indexing(x, t_vec)
+
+    N = length(t_vec)
+    if N == 0
+        # nothing to shift: validate what can be validated and return before
+        # `t_vec[1]` is read below
+        !isnothing(n) && !(n > zero(n)) && error("shift value has to be positive!")
+        (length(x) != N) && error("value and time vector have different lengths!")
+        return Array{Union{Missing, eltype(x)}}(missing, N)
+    end
+
+    # decide on the first element rather than on eltype so that containers with a
+    # wider element type (e.g. Union{Missing, Date}) take the same branch
+    isdate = t_vec[1] isa Date
+
+    if isnothing(n)   # this is the default
+        n = oneunit(t_vec[1] - t_vec[1])
+        verbose && ( isdate ? (@info "Default date gap inferred ... $n") :
+                              (@info "Default gap inferred ... $n") )
+    elseif isdate
+        verbose && @info "No checks on increment argument n for type Date ... "
+    else
+        !(n isa typeof(t_vec[1]-t_vec[1])) &&
+            error("Time gap type does not match time variable: typeof(n)=$(typeof(n)) != eltype(vec)=$(eltype(t_vec))")
+    end
+
+    # `lt = (<=)` makes the check strict: repeated time stamps count as unsorted
+    checksorted && !issorted(t_vec; lt = (<=) ) && error("time vector not sorted (order is strict)!")
+    !(n > zero(n)) && error("shift value has to be positive!")
+
+    (length(x) != N) && error("value and time vector have different lengths!")
+
+    x_shift = Array{Union{Missing, eltype(x)}}(missing, N);
+    _linear_scan!(x_shift, x, t_vec, n, N)
+
+    return x_shift
+
+end
+
+# Fill `x_shift[i]` with the element of `x` whose time stamp equals `t_vec[i] - n`;
+# entries without an exact match are left as they were. `t_vec` is expected to be
+# sorted so that a single forward-moving cursor serves all rows. Returns `x_shift`.
+function _linear_scan!(x_shift, x, t_vec, n, N)
+    cursor = 1   # first position not yet known to satisfy t_vec[cursor] <= target
+    @inbounds for row in 1:N
+        target = t_vec[row] - n
+
+        # advance past every time stamp that is not after the target
+        while cursor <= N && t_vec[cursor] <= target
+            cursor += 1
+        end
+
+        # the candidate is the last position that was stepped over (if any)
+        hit = cursor - 1
+        (hit >= 1 && t_vec[hit] == target) || continue
+        x_shift[row] = x[hit]
+    end
+    return x_shift
+end
+# --------------------------------------------------------------------------------------------------
+
+
+
+# --------------------------------------------------------------------------------------------------
+# most of this code was inspired by @FuZhiyu PanelShift.jl package
+"""
+    tlead(x, t_vec; n=nothing, checksorted=true, verbose=false)
+
+Lead the values `x` by `n` along the time index `t_vec`.
+
+Entry `i` of the result holds the element of `x` observed at time `t_vec[i] + n`, or
+`missing` when no observation carries exactly that time stamp.
+
+# Arguments
+- `x`: values to shift; must be as long as `t_vec`.
+- `t_vec`: time index of the observations.
+
+# Keywords
+- `n`: shift, strictly positive. Defaults to `oneunit(t_vec[1] - t_vec[1])`. When the
+  time index holds `Date`s the type of `n` is not checked (so `Month(1)` is accepted);
+  otherwise `n` must have the type of the difference of two time stamps.
+- `checksorted`: check that `t_vec` is strictly increasing.
+- `verbose`: log which default or check was applied.
+
+# Returns
+A new `Vector{Union{Missing, eltype(x)}}` with one entry per element of `t_vec`.
+
+# Throws
+- `ErrorException` if `n` has the wrong type or is not positive, if `t_vec` is not
+  strictly increasing, or if `x` and `t_vec` differ in length.
+- `ArgumentError` if `x` or `t_vec` is not indexed from 1.
+
+# Examples
+```julia
+tlead([4, 5, 6], [1, 2, 4])        # -> [5, missing, missing]
+tlead([4, 5, 6], [1, 2, 4]; n=2)   # -> [missing, 6, missing]
+```
+"""
+function tlead(x, t_vec;
+    n = nothing,
+    checksorted = true,
+    verbose = false,
+    )
+
+    # the scanning kernel walks 1:N under @inbounds, so offset axes would be unsafe
+    Base.require_one_based_indexing(x, t_vec)
+
+    N = length(t_vec)
+    if N == 0
+        # nothing to shift: validate what can be validated and return before
+        # `t_vec[1]` is read below
+        !isnothing(n) && !(n > zero(n)) && error("shift value has to be positive!")
+        (length(x) != N) && error("value and time vector have different lengths!")
+        return Array{Union{Missing, eltype(x)}}(missing, N)
+    end
+
+    # decide on the first element rather than on eltype so that containers with a
+    # wider element type (e.g. Union{Missing, Date}) take the same branch
+    isdate = t_vec[1] isa Date
+
+    if isnothing(n)   # this is the default
+        n = oneunit(t_vec[1] - t_vec[1])
+        verbose && ( isdate ? (@info "Default date gap inferred ... $n") :
+                              (@info "Default gap inferred ... $n") )
+    elseif isdate
+        verbose && @info "No checks on increment argument n for date type ... "
+    else
+        !(n isa typeof(t_vec[1]-t_vec[1])) &&
+            error("Time gap type does not match time variable: typeof(n)=$(typeof(n)) != eltype(vec)=$(eltype(t_vec))")
+    end
+
+    # `lt = (<=)` makes the check strict: repeated time stamps count as unsorted
+    checksorted && !issorted(t_vec; lt = (<=) ) && error("time vector not sorted (order is strict)!")
+    !(n > zero(n)) && error("shift value has to be positive!")
+
+    (length(x) != N) && error("value and time vector have different lengths!")
+
+    x_shift = Array{Union{Missing, eltype(x)}}(missing, N);
+    _linear_scan_lead!(x_shift, x, t_vec, n, N)
+    return x_shift
+
+end
+
+# Fill `x_shift[i]` with the element of `x` whose time stamp equals `t_vec[i] + n`;
+# entries without an exact match are left as they were. `t_vec` is expected to be
+# sorted so that a single forward-moving cursor serves all rows. Returns `x_shift`.
+function _linear_scan_lead!(x_shift, x, t_vec, n, N)
+    cursor = 1   # candidate position for the current target
+
+    @inbounds for row in 1:N
+        target = t_vec[row] + n
+
+        # once a target lies beyond the last time stamp, so do all later ones
+        target > t_vec[N] && break
+
+        # skip every time stamp strictly before the target
+        while cursor <= N && t_vec[cursor] < target
+            cursor += 1
+        end
+
+        # keep the value only on an exact match
+        if cursor <= N && t_vec[cursor] == target
+            x_shift[row] = x[cursor]
+        end
+    end
+    return x_shift
+
+end
+# --------------------------------------------------------------------------------------------------
+
+
+# --------------------------------------------------------------------------------------------------
+"""
+    tshift(x, t_vec; n=nothing, kwargs...)
+
+Shift `x` along the time index `t_vec`, dispatching on the sign of `n`.
+
+A positive `n` calls `tlag(x, t_vec; n=n, kwargs...)`; any other value calls
+`tlead(x, t_vec; n=-n, kwargs...)`. When `n` is not given a warning is emitted and
+the call falls back to `tlag` with its default shift.
+
+# Keywords
+- `n`: signed shift (positive = lag, negative = lead).
+- `kwargs...`: forwarded unchanged to `tlag` / `tlead`.
+
+# Returns
+Whatever `tlag` / `tlead` returns for the forwarded arguments.
+"""
+function tshift(x, t_vec; n=nothing, kwargs...)
+
+    if isnothing(n)
+        @warn "shift not specified ... defaulting to lag"
+        # let tlag infer its own default instead of reading t_vec[1] here
+        return tlag(x, t_vec; kwargs...)
+    end
+
+    if n > zero(n)
+        return tlag(x, t_vec; n=n, kwargs...)
+    else
+        # a zero shift also lands here and is rejected by tlead's positivity check
+        return tlead(x, t_vec; n=-n, kwargs...)
+    end
+end
+# --------------------------------------------------------------------------------------------------
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/test/UnitTests/timeshift.jl b/test/UnitTests/timeshift.jl
@@ -0,0 +1,266 @@
+@testset "Time Shift" begin
+
+
+# --------------------------------------------------------------------------------------------------
+ # Fixture with an integer time index and gaps inside groups.
+ df1 = DataFrame( # e.g. id "a" is observed at t=1 and t=3 but not at t=2
+ id = ["a","a","b","b","c","c","c"],
+ t = [1,3,8,9,1,2,4],
+ v1 = [1,1,1,6,6,0,0],
+ v2 = [1,2,3,6,6,4,5],
+ v3 = [1,5,4,6,6,15,12.25])
+
+ # Fixture with a Date time index; consecutive dates within a group are never one day apart.
+ df2 = DataFrame( # ids "a"-"c" sit on month starts, id "d" on irregular dates
+ id = ["a","a", "b","b", "c","c","c", "d","d","d","d"],
+ t = [Date(1990, 1, 1), Date(1990, 4, 1), Date(1990, 8, 1), Date(1990, 9, 1),
+ Date(1990, 1, 1), Date(1990, 2, 1), Date(1990, 4, 1),
+ Date(1999, 11, 10), Date(1999, 12, 21), Date(2000, 2, 5), Date(2000, 4, 1)],
+ v1 = [1,1, 1,6, 6,0,0, 1,4,11,13],
+ v2 = [1,2,3,6,6,4,5, 1,2,3,4],
+ v3 = [1,5,4,6,6,15,12.25, 21,22.5,17.2,1])
+
+ # --- test for df1
+    @testset "DF1" begin
+        # one-period lag of v2 within each id; only t-1 matches produce a value
+        sort!(df1, [:id, :t])
+        by_id = groupby(df1, :id)
+        transform!(by_id, [:v2, :t] => tlag => :v2_lag)
+
+        expected = [missing, missing, missing, 3, missing, 6, missing]
+        @test isequal(df1.v2_lag, expected)
+    end
+
+ # --- test for df2 multiple variables
+    @testset "DF2" begin
+        sort!(df2, [:id, :t])
+
+        # default (one-day) lag and one-month lag of v1, computed together per group
+        function day_and_month_lags(t, v1)
+            by_day = tlag(v1, t; verbose=true)
+            by_month = tlag(v1, t; n=Month(1), verbose=true)
+            return (; v1_lag_day = by_day, v1_lag_mth = by_month)
+        end
+
+        transform!(groupby(df2, :id),
+            [:t, :v1] => day_and_month_lags => [:v1_lag_day, :v1_lag_mth])
+
+        # no two dates of a group are one day apart, so the daily lag is all missing
+        @test all(ismissing, df2.v1_lag_day)
+
+        expected_mth = [missing, missing, missing, 1, missing, 6,
+                        missing, missing, missing, missing, missing]
+        @test isequal(df2.v1_lag_mth, expected_mth)
+
+    end
+# --------------------------------------------------------------------------------------------------
+
+
+# --------------------------------------------------------------------------------------------------
+ @testset "General tests" begin
+
+ # --- test large datasets
+ # Build a DataFrame with columns `date` and `x`: `size` dates starting on 2020-01-01,
+ # advancing by one day or (with probability `gap_probability`) by 2 to 6 days,
+ # and a noisy sine wave as values. The global RNG is seeded with `seed` first.
+ function generate_test_data(;size=50_000, gap_probability=0.1, seed=123)
+ Random.seed!(seed)
+
+ # Start date and initialize arrays
+ start_date = Date(2020, 1, 1)
+ dates = Vector{Date}()
+ x_values = Vector{Float64}()
+
+ # Generate dates with some gaps and corresponding x values
+ current_date = start_date
+ for i in 1:size
+ # Add current date and value
+ push!(dates, current_date)
+ push!(x_values, sin(i/100) + 0.1*randn()) # Some noisy sine wave pattern
+
+ # Decide whether to introduce a gap (skip 1-5 days)
+ if rand() < gap_probability
+ gap_size = rand(1:5)
+ current_date += Day(gap_size + 1)
+ else
+ # Normal increment
+ current_date += Day(1)
+ end
+ end
+
+ # Create DataFrame
+ df = DataFrame(date=dates, x=x_values)
+ return df
+ end
+
+ # three sizes of the same generator; each call reseeds the RNG with the default seed
+ tiny_df = generate_test_data(size=50, gap_probability=0.05);
+ small_df = generate_test_data(size=5_000, gap_probability=0.1);
+ large_df = generate_test_data(size=1_000_000, gap_probability=0.1);
+
+ # NOTE(review): the expected counts in this testset look tied to the random stream
+ # produced by seed=123 -- confirm they stay valid across Julia versions.
+ @time transform!(small_df, [:x, :date] => ( (x, d) -> tlag(x, d)) => :x_lag)
+ @test nrow(subset(small_df, :x_lag => ByRow(!ismissing))) == 4525
+
+ # daily (default), monthly and yearly lags on the large series
+ @time transform!(large_df, [:x, :date] => ( (x, d) -> tlag(x, d)) => :x_lag_day);
+ @time transform!(large_df, [:x, :date] => ( (x, d) -> tlag(x, d, n=Month(1))) => :x_lag_mth);
+ @time transform!(large_df, [:x, :date] => ( (x, d) -> tlag(x, d, n=Year(1))) => :x_lag_yr);
+
+ # `datey` holds the year of each date, hence repeated values: the strict
+ # sortedness check of tlag has to reject it
+ transform!(large_df, :date => ByRow(year) => :datey)
+ @test_throws r"time vector not sorted"i transform!(large_df,
+ [:x, :datey] => ( (x, d) -> tlag(x, d, n=1)) => :x_lag_datey);
+
+ @test nrow(subset(large_df, :x_lag_day => ByRow(!ismissing))) == 900_182
+ @test nrow(subset(large_df, :x_lag_mth => ByRow(!ismissing))) == 770_178
+ @test nrow(subset(large_df, :x_lag_yr => ByRow(!ismissing))) == 769_502
+
+ # round trip: lagging the one-day lead gives back x on rows where nothing is missing
+ @time transform!(tiny_df, [:x, :date] => ( (x, d) -> tlead(x, d)) => :x_lead)
+ @time transform!(tiny_df, [:x_lead, :date] => ( (x, d) -> tlag(x, d)) => :x_lead_lag)
+ @test dropmissing(tiny_df) |> (df -> df.x == df.x_lead_lag) # compared only on rows without any missing value
+
+ # round trip: a two-day lead undone by two successive one-day lags
+ @time transform!(tiny_df, [:x, :date] => ( (x, d) -> tlead(x, d, n=Day(2)) ) => :x_lead2)
+ @time transform!(tiny_df, [:x_lead2, :date] => ( (x, d) -> tlag(tlag(x, d), d) ) => :x_lead2_lag2)
+ @test dropmissing(tiny_df) |> (df -> df.x == df.x_lead2_lag2) # compared only on rows without any missing value
+
+
+ end # of "General tests"
+# --------------------------------------------------------------------------------------------------
+
+
+# --------------------------------------------------------------------------------------------------
+    @testset "From Panelshift.jl" begin
+
+        import PanelShift
+
+        # The two packages order their arguments differently:
+        #   PanelShift.tlag(time_variable, x, n)
+        #   BazerData.tlag(x, time_variable; n = n)
+        # Every case is checked against PanelShift and against a literal expectation.
+
+        # --- regular integer index, default shift
+        shifted = tlag([4, 5, 6], [1, 2, 3])
+        @test isequal(shifted, PanelShift.tlag([1, 2, 3], [4, 5, 6], 1))
+        @test isequal(shifted, [missing, 4, 5])
+
+        shifted = tlead([4, 5, 6], [1, 2, 3])
+        @test isequal(shifted, PanelShift.tlead([1, 2, 3], [4, 5, 6], 1))
+        @test isequal(shifted, [5, 6, missing])
+
+        # --- regular integer index, shift of two
+        shifted = tlag([4, 5, 6], [1, 2, 3], n=2)
+        @test isequal(shifted, PanelShift.tlag([1, 2, 3], [4, 5, 6], 2))
+        @test isequal(shifted, [missing, missing, 4])
+
+        shifted = tlead([4, 5, 6], [1, 2, 3], n=2)
+        @test isequal(shifted, PanelShift.tlead([1, 2, 3], [4, 5, 6], 2))
+        @test isequal(shifted, [6, missing, missing])
+
+        # --- unit-length vector
+        shifted = tlag([1], [1])
+        @test isequal(shifted, PanelShift.tlag([1], [1]))
+        @test isequal(shifted, [missing])
+
+        shifted = tlead([1], [1])
+        @test isequal(shifted, PanelShift.tlead([1], [1]))
+        @test isequal(shifted, [missing])
+
+        # --- irregular integer index
+        shifted = tlag([1, 2, 3, 4, 5], [1, 3, 5, 6, 7], n=2)
+        @test isequal(shifted, PanelShift.tlag([1, 3, 5, 6, 7], [1, 2, 3, 4, 5], 2))
+        @test isequal(shifted, [missing, 1, 2, missing, 3])
+
+        # NOTE(review): the PanelShift reference converts the time vector to float
+        # while our call converts the values -- confirm this asymmetry is intended
+        shifted = tlag(float.([1, 2, 3, 4, 5]), [1, 3, 5, 6, 7], n=2)
+        @test isequal(shifted, PanelShift.tlag(float.([1, 3, 5, 6, 7]), [1, 2, 3, 4, 5], 2))
+        @test isequal(shifted, [missing, 1, 2, missing, 3])
+
+        # --- non-numeric x and unequal gaps
+        fruits = [:apple, :orange, :banana, :pineapple, :strawberry]
+        gappy_t = [1, 2, 4, 7, 11]
+        lag_expectations = (
+            1 => [missing, :apple, missing, missing, missing],
+            2 => [missing, missing, :orange, missing, missing],
+            3 => [missing, missing, :apple, :banana, missing],
+            4 => [missing, missing, missing, missing, :pineapple],
+        )
+        for (k, expected) in lag_expectations
+            fruit_lag = tlag(fruits, gappy_t, n=k)
+            @test isequal(fruit_lag, PanelShift.tlag(gappy_t, fruits, k))
+            @test isequal(fruit_lag, expected)
+        end
+
+        shifted = tlead(fruits, gappy_t, n=4)
+        @test isequal(shifted, PanelShift.tlead(gappy_t, fruits, 4))
+        @test isequal(shifted, [missing, missing, missing, :strawberry, missing])
+
+        # --- indexed by dates
+        days = [Date(2000, 1, 1), Date(2000, 1, 2), Date(2000, 1, 4)]
+
+        shifted = tlag([1, 2, 3], days, n=Day(1))
+        @test isequal(shifted, PanelShift.tlag(days, [1, 2, 3], Day(1)))
+        @test isequal(shifted, [missing, 1, missing])
+
+        shifted = tlag([1, 2, 3], days, n=Day(2))
+        @test isequal(shifted, PanelShift.tlag(days, [1, 2, 3], Day(2)))
+        @test isequal(shifted, [missing, missing, 2])
+
+        # --- tshift: a negative shift is a lead, a positive shift is a lag
+        shifted = tshift([1, 2, 3], [1, 2, 3], n=-1)
+        @test isequal(shifted, PanelShift.tshift([1, 2, 3], [1, 2, 3], -1))
+        @test isequal(shifted, tlead([1, 2, 3], [1, 2, 3], n=1))
+
+        shifted = tshift([1, 2, 3], [1, 2, 3], n=1)
+        @test isequal(shifted, PanelShift.tshift([1, 2, 3], [1, 2, 3], 1))
+        @test isequal(shifted, tlag([1, 2, 3], [1, 2, 3], n=1))
+
+        # --- safeguards: only the error message is checked here (the PanelShift
+        # reference tests expected an ArgumentError for the same three situations)
+        @test_throws r"time vector not sorted"i tlag([1, 2, 3], [1, 2, 2])   # unsorted t
+        @test_throws r"value and time vector"i tlag([1, 2], [1, 2, 3])       # length mismatch
+        @test_throws r"shift value"i tlag([1, 2, 3], [1, 2, 3], n=0)         # zero shift
+
+    end
+# --------------------------------------------------------------------------------------------------
+
+
+
+# --------------------------------------------------------------------------------------------------
+# benchmarking
+
+# using Chairmarks
+# large_df = generate_test_data(size=50_000_000, gap_probability=0.1);
+
+# @b transform!(large_df, [:x, :date] => ( (x, d) -> tlag(x, d)) => :x_lag_day)
+# @b transform!(large_df, [:x, :date] => ( (x, d) -> tlag(x, d, n=Month(1))) => :x_lag_mth)
+# @b transform!(large_df, [:x, :date] => ( (x, d) -> tlag(x, d, n=Year(1))) => :x_lag_yr)
+
+# @b transform!(large_df, [:x, :date] => ( (x, d) -> PanelShift.tlag(d, x)) => :x_lag_day)
+# @b transform!(large_df, [:x, :date] => ( (x, d) -> PanelShift.tlag(d, x, Month(1))) => :x_lag_mth)
+# @b transform!(large_df, [:x, :date] => ( (x, d) -> PanelShift.tlag(d, x, Year(1))) => :x_lag_yr)
+
+
+
+# @b transform!(large_df, [:x, :date] => ( (x, d) -> tlead(x, d)) => :x_lag_day)
+# @b transform!(large_df, [:x, :date] => ( (x, d) -> tlead(x, d, n=Month(1))) => :x_lag_mth)
+# @b transform!(large_df, [:x, :date] => ( (x, d) -> tlead(x, d, n=Year(1))) => :x_lag_yr)
+
+# @b transform!(large_df, [:x, :date] => ( (x, d) -> PanelShift.tlead(d, x)) => :x_lag_day)
+# @b transform!(large_df, [:x, :date] => ( (x, d) -> PanelShift.tlead(d, x, Month(1))) => :x_lag_mth)
+# @b transform!(large_df, [:x, :date] => ( (x, d) -> PanelShift.tlead(d, x, Year(1))) => :x_lag_yr)
+
+# --------------------------------------------------------------------------------------------------
+
+
+
+
+
+end
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/test/runtests.jl b/test/runtests.jl
@@ -11,6 +11,7 @@ using StreamToString
const testsuite = [
"tabulate", "xtile", "winsorize", "panel_fill",
+ "timeshift"
]
ENV["DATADEPS_ALWAYS_ACCEPT"] = true # for data loading of PalmerPenguins