BazerData.jl

Data manipulation utilities for Julia
Log | Files | Refs | README | LICENSE

commit b3b6222eb5461b6e738b6e3c7bcd2a0348ce496f
parent 79506f32a90c016825a9915e8c02a41560df73a9
Author: Erik Loualiche <eloualic@umn.edu>
Date:   Wed, 25 Feb 2026 08:40:45 -0600

fix bugs and harden validation across all functions

- TimeShift: extract shared validation into _validate_tshift_args,
  add empty array handling, remove dead commented-out code
- Winsorize: fix unreachable prob validation (check before clamping),
  replace @error with error(), add empty vector guard
- StataUtils: replace @error with error() so invalid input halts
  execution, add n_quantiles >= 1 validation to xtile
- PanelData: fix == to isequal for missing-safe column comparison,
  fix incorrect default in error message, use deepcopy in panel_fill

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

Diffstat:
Msrc/PanelData.jl | 8++++----
Msrc/StataUtils.jl | 9+++++----
Msrc/TimeShift.jl | 111++++++++++++++++++++++++++++---------------------------------------------------
Msrc/Winsorize.jl | 8+++++---
4 files changed, 53 insertions(+), 83 deletions(-)

diff --git a/src/PanelData.jl b/src/PanelData.jl @@ -59,7 +59,7 @@ function panel_fill!( ) else df[!, time_var_r] .= floor.(df[!, time_var], gap) - if !(df[!, time_var_r] == df[!, time_var]) + if !isequal(df[!, time_var_r], df[!, time_var]) @warn "Using rounded time variables for consistency with gap: $gap" end end @@ -86,7 +86,7 @@ function panel_fill!( error( """ Method $method not available. - Please choose from :backwards, :forwards, :nearest, :linear (default) + Please choose from :backwards (default), :forwards, :nearest, :linear """ ) end @@ -124,7 +124,7 @@ function panel_fill!( if flag df_fill[!, :flag] .= method end - if df[!, time_var_r] == df[!, time_var] + if isequal(df[!, time_var_r], df[!, time_var]) rename!(df_fill, time_var_r => time_var) select!(df, Not(time_var_r)) else # if they are not all the same we are going to fill @@ -157,7 +157,7 @@ function panel_fill( flag::Bool = false ) - df_res = copy(df) + df_res = deepcopy(df) panel_fill!(df_res, id_var, time_var, value_var, gap = gap, method = method, uniquecheck = uniquecheck, flag = flag) diff --git a/src/StataUtils.jl b/src/StataUtils.jl @@ -83,7 +83,7 @@ function tabulate( @warn "Converting format_tbl to :long" format_tbl = :long else - @error "Table format_tbl must be :long or :wide" + error("Table format_tbl must be :long or :wide") end end @@ -118,9 +118,9 @@ function _tabulate_compute(df, cols, group_type, reorder_cols) (d -> combine(groupby(d, name_type_cols), nrow => :freq, proprow =>:pct)) new_cols = name_type_cols elseif group_type isa Vector{Symbol} - !all(s -> s in [:value, :type], group_type) && (@error group_type_error_msg) + !all(s -> s in [:value, :type], group_type) && error(group_type_error_msg) (size(group_type, 1) != size(cols, 1)) && - (@error "\ngroup_type and cols must be the same size; \nsee help for more information") + error("group_type and cols must be the same size; see help for more information") type_cols = cols[group_type .== :type] name_type_cols = Symbol.(type_cols, "_typeof") group_cols = [cols[group_type .== :value]; name_type_cols] @@ -128,7 +128,7 @@ function _tabulate_compute(df, cols, group_type, reorder_cols) (d -> combine(groupby(d, group_cols), nrow => :freq, proprow =>:pct)) new_cols = group_cols else - @error group_type_error_msg + error(group_type_error_msg) end # resort columns based on the original order new_cols = sort(new_cols isa Symbol ? [new_cols] : new_cols, @@ -397,6 +397,7 @@ function xtile( )::Vector{Int} where T <: Real N = length(data) + n_quantiles < 1 && error("n_quantiles must be >= 1") n_quantiles > N && (@warn "More quantiles than data") probs = range(0, 1, length=n_quantiles + 1)[2:end] diff --git a/src/TimeShift.jl b/src/TimeShift.jl @@ -3,14 +3,29 @@ # -------------------------------------------------------------------------------------------------- -# ```@example -# julia> x = [1, 2, 3] -# julia> t = [1, 2, 3] -# julia> tlag(x, t, n = 1) -# 3-element Vector{Union{Missing, Int64}}: -# missing -# 1 -# 2 +# -------------------------------------------------------------------------------------------------- +# Shared validation for tlag/tlead +function _validate_tshift_args(x, t_vec; n=nothing, checksorted=true, verbose=false) + if isnothing(n) + n = oneunit(t_vec[1] - t_vec[1]) + verbose && ((t_vec[1] isa Date) ? (@info "Default date gap inferred ... $n") : + (@info "Default gap inferred ... $n")) + elseif eltype(t_vec) == Date + verbose && @info "No checks on increment argument n for type Date ... " + else + !(n isa typeof(t_vec[1]-t_vec[1])) && + error("Time gap type does not match time variable: typeof(n)=$(typeof(n)) != eltype(vec)=$(eltype(t_vec))") + end + + checksorted && !issorted(t_vec; lt = (<=)) && error("time vector not sorted (order is strict)!") + !(n > zero(n)) && error("shift value has to be positive!") + + N = length(t_vec) + (length(x) != N) && error("value and time vector have different lengths!") + + return n, N +end +# -------------------------------------------------------------------------------------------------- # -------------------------------------------------------------------------------------------------- @@ -22,7 +37,7 @@ backward in time by a specified amount `n`. # Arguments - `x`: Array of values to be lagged -- `t_vec`: Vector of time points corresponding to each element in `x` +- `t_vec`: Vector of time points corresponding to each element in `x` # Keyword Arguments - `n`: Time gap for lagging. If `nothing` (default), uses the minimal unit difference between time points. @@ -57,55 +72,31 @@ julia> tlag([1, 2, 3], [1, 2, 3], n = 1) ``` """ -function tlag(x, t_vec; - n = nothing, +function tlag(x, t_vec; + n = nothing, checksorted = true, verbose = false, ) - if isnothing(n) # this is the default - n = oneunit(t_vec[1] - t_vec[1]) - verbose && ( (t_vec[1] isa Date) ? (@info "Default date gap inferred ... $n") : - (@info "Default gap inferred ... $n") ) - elseif eltype(t_vec) == Date - verbose && @info "No checks on increment argument n for type Date ... " - else - !(n isa typeof(t_vec[1]-t_vec[1])) && - error("Time gap type does not match time variable: typeof(n)=$(typeof(n)) != eltype(vec)=$(eltype(t_vec))") + isempty(t_vec) && return Array{Union{Missing, eltype(x)}}(missing, 0) - end + n, N = _validate_tshift_args(x, t_vec; n=n, checksorted=checksorted, verbose=verbose) - checksorted && !issorted(t_vec; lt = (<=) ) && error("time vector not sorted (order is strict)!") - !(n > zero(n)) && error("shift value has to be positive!") - - N = length(t_vec) - (length(x) != N) && error("value and time vector have different lengths!") - - x_shift = Array{Union{Missing, eltype(x)}}(missing, N); - - # _binary_search_lag!(x_shift, x, t_vec, n, N) + x_shift = Array{Union{Missing, eltype(x)}}(missing, N) _linear_scan!(x_shift, x, t_vec, n, N) return x_shift - end function _linear_scan!(x_shift, x, t_vec, n, N) j = 0 @inbounds for i in 1:N - # Calculate the target time we're looking for lagt = t_vec[i] - n - # Scan forward from where we left off to find the largest index - # where t_vec[j] <= lagt (since t_vec is sorted) while j < N && t_vec[j + 1] <= lagt j += 1 end - - # If we found a valid index and it's an exact match if j > 0 && t_vec[j] == lagt x_shift[i] = x[j] - # else - # x_shift[i] = missing end end return x_shift @@ -113,7 +104,6 @@ end # -------------------------------------------------------------------------------------------------- - # -------------------------------------------------------------------------------------------------- """ tlead(x, t_vec; n = nothing, checksorted = true, verbose = false) @@ -158,57 +148,38 @@ julia> tlead([1, 2, 3], [8, 9, 10], n = 1) ``` """ -function tlead(x, t_vec; - n = nothing, +function tlead(x, t_vec; + n = nothing, checksorted = true, verbose = false, ) - if isnothing(n) # this is the default - n = oneunit(t_vec[1] - t_vec[1]) - verbose && ( (t_vec[1] isa Date) ? (@info "Default date gap inferred ... $n") : - (@info "Default gap inferred ... $n") ) - elseif eltype(t_vec) == Date - verbose && @info "No checks on increment argument n for date type ... " - else - !(n isa typeof(t_vec[1]-t_vec[1])) && - error("Time gap type does not match time variable: typeof(n)=$(typeof(n)) != eltype(vec)=$(eltype(t_vec))") - end + isempty(t_vec) && return Array{Union{Missing, eltype(x)}}(missing, 0) - checksorted && !issorted(t_vec; lt = (<=) ) && error("time vector not sorted (order is strict)!") - !(n > zero(n)) && error("shift value has to be positive!") - - N = length(t_vec) - (length(x) != N) && error("value and time vector have different lengths!") + n, N = _validate_tshift_args(x, t_vec; n=n, checksorted=checksorted, verbose=verbose) - x_shift = Array{Union{Missing, eltype(x)}}(missing, N); + x_shift = Array{Union{Missing, eltype(x)}}(missing, N) _linear_scan_lead!(x_shift, x, t_vec, n, N) return x_shift - end function _linear_scan_lead!(x_shift, x, t_vec, n, N) j = 0 - + @inbounds for i in 1:N - leadt = t_vec[i] + n - # Early termination if already past the end of the array + leadt = t_vec[i] + n if leadt > t_vec[N] - # All remaining targets will be beyond the array bounds break end - - # Fast forward scan (can add loop unrolling here if needed) + while j < N && t_vec[j + 1] < leadt j += 1 end - # Check for exact match at the next position if j + 1 <= N && t_vec[j + 1] == leadt x_shift[i] = x[j + 1] end end return x_shift - end # -------------------------------------------------------------------------------------------------- @@ -225,7 +196,7 @@ by a specified amount `n`. Acts as a unified interface to `tlag` and `tlead`. - `t_vec`: Vector of time points corresponding to each element in `x` # Keyword Arguments -- `n`: Time gap for shifting. If positive, performs a lag operation (backward in time); +- `n`: Time gap for shifting. If positive, performs a lag operation (backward in time); if negative, performs a lead operation (forward in time). If `nothing` (default), defaults to a lag operation with minimal unit difference. - `kwargs...`: Additional keyword arguments passed to either `tlag` or `tlead` @@ -258,7 +229,7 @@ julia> tshift([1, 2, 3], [-3, -2, -1], n = -1) See also: [`tlag`](@ref), [`tlead`](@ref) """ function tshift(x, t_vec; n=nothing, kwargs...) - + if isnothing(n) @warn "shift not specified ... defaulting to lag" n = oneunit(t_vec[1] - t_vec[1]) @@ -271,7 +242,3 @@ function tshift(x, t_vec; n=nothing, kwargs...) end end # -------------------------------------------------------------------------------------------------- - - - - diff --git a/src/Winsorize.jl b/src/Winsorize.jl @@ -33,10 +33,12 @@ function winsorize(x::AbstractVector{T}; verbose::Bool=false ) where T + isempty(x) && error("input vector is empty") + if !isnothing(probs) - lower_percentile = max(minimum(probs), 0) - upper_percentile = min(maximum(probs), 1) - (lower_percentile<0 || upper_percentile>1) && @error "bad probability input" + (minimum(probs) < 0 || maximum(probs) > 1) && error("probabilities must be in [0, 1]") + lower_percentile = minimum(probs) + upper_percentile = maximum(probs) verbose && any(ismissing, x) && (@info "Some missing data skipped in winsorizing") verbose && !isnothing(cutpoints) && (@info "input cutpoints ignored ... using probabilities")