fix bugs and harden validation across all functions - BazerData.jl

commit b3b6222eb5461b6e738b6e3c7bcd2a0348ce496f
parent 79506f32a90c016825a9915e8c02a41560df73a9
Author: Erik Loualiche <eloualic@umn.edu>
Date:   Wed, 25 Feb 2026 08:40:45 -0600

fix bugs and harden validation across all functions

- TimeShift: extract shared validation into _validate_tshift_args,
  add empty array handling, remove dead commented-out code
- Winsorize: fix unreachable prob validation (check before clamping),
  replace @error with error(), add empty vector guard
- StataUtils: replace @error with error() so invalid input halts
  execution, add n_quantiles >= 1 validation to xtile
- PanelData: fix == to isequal for missing-safe column comparison,
  fix incorrect default in error message, use deepcopy in panel_fill

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

Diffstat:
M src/PanelData.jl  | 8 ++++----
M src/StataUtils.jl  | 9 +++++----
M src/TimeShift.jl  | 111 ++++++++++++++++++++++++++++---------------------------------------------------
M src/Winsorize.jl  | 8 +++++---

4 files changed, 53 insertions(+), 83 deletions(-)
diff --git a/src/PanelData.jl b/src/PanelData.jl
@@ -59,7 +59,7 @@ function panel_fill!(
             )
         else
             df[!, time_var_r] .= floor.(df[!, time_var], gap)
-            if !(df[!, time_var_r] == df[!, time_var])
+            if !isequal(df[!, time_var_r], df[!, time_var])
                 @warn "Using rounded time variables for consistency with gap: $gap"
             end
         end
@@ -86,7 +86,7 @@ function panel_fill!(
             error(
                 """
                 Method $method not available.
-                Please choose from :backwards, :forwards, :nearest, :linear (default)
+                Please choose from :backwards (default), :forwards, :nearest, :linear
                 """
                 )
         end
@@ -124,7 +124,7 @@ function panel_fill!(
     if flag 
         df_fill[!, :flag] .= method
     end
-    if df[!, time_var_r] == df[!, time_var]
+    if isequal(df[!, time_var_r], df[!, time_var])
         rename!(df_fill, time_var_r => time_var)
         select!(df, Not(time_var_r))
     else # if they are not all the same we are going to fill
@@ -157,7 +157,7 @@ function panel_fill(
     flag::Bool = false
     )
 
-    df_res = copy(df)
+    df_res = deepcopy(df)
 
     panel_fill!(df_res, id_var, time_var, value_var,
         gap = gap, method = method, uniquecheck = uniquecheck, flag = flag)
diff --git a/src/StataUtils.jl b/src/StataUtils.jl
@@ -83,7 +83,7 @@ function tabulate(
             @warn "Converting format_tbl to :long"
             format_tbl = :long
         else
-            @error "Table format_tbl must be :long or :wide"
+            error("Table format_tbl must be :long or :wide")
         end
     end
 
@@ -118,9 +118,9 @@ function _tabulate_compute(df, cols, group_type, reorder_cols)
             (d -> combine(groupby(d, name_type_cols), nrow => :freq, proprow =>:pct))
         new_cols = name_type_cols
     elseif group_type isa Vector{Symbol}
-        !all(s -> s in [:value, :type], group_type) && (@error group_type_error_msg)
+        !all(s -> s in [:value, :type], group_type) && error(group_type_error_msg)
         (size(group_type, 1) != size(cols, 1)) &&
-            (@error "\ngroup_type and cols must be the same size; \nsee help for more information")
+            error("group_type and cols must be the same size; see help for more information")
         type_cols = cols[group_type .== :type]
         name_type_cols = Symbol.(type_cols, "_typeof")
         group_cols = [cols[group_type .== :value]; name_type_cols]
@@ -128,7 +128,7 @@ function _tabulate_compute(df, cols, group_type, reorder_cols)
             (d -> combine(groupby(d, group_cols), nrow => :freq, proprow =>:pct))
         new_cols = group_cols
     else
-        @error group_type_error_msg
+        error(group_type_error_msg)
     end
     # resort columns based on the original order
     new_cols = sort(new_cols isa Symbol ? [new_cols] : new_cols,
@@ -397,6 +397,7 @@ function xtile(
 )::Vector{Int} where T <: Real
 
         N = length(data)
+        n_quantiles < 1 && error("n_quantiles must be >= 1")
         n_quantiles > N && (@warn "More quantiles than data")
 
         probs = range(0, 1, length=n_quantiles + 1)[2:end]
diff --git a/src/TimeShift.jl b/src/TimeShift.jl
@@ -3,14 +3,29 @@
 # --------------------------------------------------------------------------------------------------
 
 
-# ```@example
-# julia> x = [1, 2, 3]
-# julia> t = [1, 2, 3]
-# julia> tlag(x, t, n = 1)
-# 3-element Vector{Union{Missing, Int64}}:
-#   missing
-#  1
-#  2
+# --------------------------------------------------------------------------------------------------
+# Shared validation for tlag/tlead
+function _validate_tshift_args(x, t_vec; n=nothing, checksorted=true, verbose=false)
+    if isnothing(n)
+        n = oneunit(t_vec[1] - t_vec[1])
+        verbose && ((t_vec[1] isa Date) ? (@info "Default date gap inferred ... $n") :
+            (@info "Default gap inferred ... $n"))
+    elseif eltype(t_vec) == Date
+        verbose && @info "No checks on increment argument n for type Date ... "
+    else
+        !(n isa typeof(t_vec[1]-t_vec[1])) &&
+            error("Time gap type does not match time variable: typeof(n)=$(typeof(n)) != eltype(vec)=$(eltype(t_vec))")
+    end
+
+    checksorted && !issorted(t_vec; lt = (<=)) && error("time vector not sorted (order is strict)!")
+    !(n > zero(n)) && error("shift value has to be positive!")
+
+    N = length(t_vec)
+    (length(x) != N) && error("value and time vector have different lengths!")
+
+    return n, N
+end
+# --------------------------------------------------------------------------------------------------
 
 
 # --------------------------------------------------------------------------------------------------
@@ -22,7 +37,7 @@ backward in time by a specified amount `n`.
 
 # Arguments
 - `x`: Array of values to be lagged
-- `t_vec`: Vector of time points corresponding to each element in `x` 
+- `t_vec`: Vector of time points corresponding to each element in `x`
 
 # Keyword Arguments
 - `n`: Time gap for lagging. If `nothing` (default), uses the minimal unit difference between time points.
@@ -57,55 +72,31 @@ julia> tlag([1, 2, 3], [1, 2, 3], n = 1)
 ```
 
 """
-function tlag(x, t_vec; 
-    n = nothing, 
+function tlag(x, t_vec;
+    n = nothing,
     checksorted = true,
     verbose = false,
     )
 
-    if isnothing(n) # this is the default
-        n = oneunit(t_vec[1] - t_vec[1])
-        verbose && ( (t_vec[1] isa Date) ? (@info "Default date gap inferred ... $n") : 
-            (@info "Default gap inferred ... $n") )
-    elseif eltype(t_vec) == Date 
-        verbose && @info "No checks on increment argument n for type Date ... "
-    else
-        !(n isa typeof(t_vec[1]-t_vec[1])) && 
-            error("Time gap type does not match time variable: typeof(n)=$(typeof(n)) != eltype(vec)=$(eltype(t_vec))")
+    isempty(t_vec) && return Array{Union{Missing, eltype(x)}}(missing, 0)
 
-    end
+    n, N = _validate_tshift_args(x, t_vec; n=n, checksorted=checksorted, verbose=verbose)
 
-    checksorted && !issorted(t_vec; lt = (<=) ) && error("time vector not sorted (order is strict)!")
-    !(n > zero(n)) && error("shift value has to be positive!")
-    
-    N = length(t_vec)
-    (length(x) != N) && error("value and time vector have different lengths!")
-
-    x_shift = Array{Union{Missing, eltype(x)}}(missing, N);
-
-    # _binary_search_lag!(x_shift, x, t_vec, n, N)
+    x_shift = Array{Union{Missing, eltype(x)}}(missing, N)
     _linear_scan!(x_shift, x, t_vec, n, N)
 
     return x_shift
-
 end
 
 function _linear_scan!(x_shift, x, t_vec, n, N)
     j = 0
     @inbounds for i in 1:N
-        # Calculate the target time we're looking for
         lagt = t_vec[i] - n
-        # Scan forward from where we left off to find the largest index
-        # where t_vec[j] <= lagt (since t_vec is sorted)
         while j < N && t_vec[j + 1] <= lagt
             j += 1
         end
-
-        # If we found a valid index and it's an exact match
         if j > 0 && t_vec[j] == lagt
             x_shift[i] = x[j]
-        # else
-        #     x_shift[i] = missing
         end
     end
     return x_shift
@@ -113,7 +104,6 @@ end
 # --------------------------------------------------------------------------------------------------
 
 
-
 # --------------------------------------------------------------------------------------------------
 """
     tlead(x, t_vec; n = nothing, checksorted = true, verbose = false)
@@ -158,57 +148,38 @@ julia> tlead([1, 2, 3], [8, 9, 10], n = 1)
 ```
 
 """
-function tlead(x, t_vec; 
-    n = nothing, 
+function tlead(x, t_vec;
+    n = nothing,
     checksorted = true,
     verbose = false,
     )
 
-    if isnothing(n) # this is the default
-        n = oneunit(t_vec[1] - t_vec[1])
-        verbose && ( (t_vec[1] isa Date) ? (@info "Default date gap inferred ... $n") : 
-            (@info "Default gap inferred ... $n") )
-    elseif eltype(t_vec) == Date 
-        verbose && @info "No checks on increment argument n for date type ... "
-    else
-        !(n isa typeof(t_vec[1]-t_vec[1])) && 
-            error("Time gap type does not match time variable: typeof(n)=$(typeof(n)) != eltype(vec)=$(eltype(t_vec))")
-    end
+    isempty(t_vec) && return Array{Union{Missing, eltype(x)}}(missing, 0)
 
-    checksorted && !issorted(t_vec; lt = (<=) ) && error("time vector not sorted (order is strict)!")
-    !(n > zero(n)) && error("shift value has to be positive!")
-    
-    N = length(t_vec)
-    (length(x) != N) && error("value and time vector have different lengths!")
+    n, N = _validate_tshift_args(x, t_vec; n=n, checksorted=checksorted, verbose=verbose)
 
-    x_shift = Array{Union{Missing, eltype(x)}}(missing, N);
+    x_shift = Array{Union{Missing, eltype(x)}}(missing, N)
     _linear_scan_lead!(x_shift, x, t_vec, n, N)
     return x_shift
-
 end
 
 function _linear_scan_lead!(x_shift, x, t_vec, n, N)
     j = 0
-    
+
     @inbounds for i in 1:N
-        leadt = t_vec[i] + n    
-        # Early termination if already past the end of the array
+        leadt = t_vec[i] + n
         if leadt > t_vec[N]
-            # All remaining targets will be beyond the array bounds
             break
         end
-     
-        # Fast forward scan (can add loop unrolling here if needed)
+
         while j < N && t_vec[j + 1] < leadt
             j += 1
         end
-        # Check for exact match at the next position
         if j + 1 <= N && t_vec[j + 1] == leadt
             x_shift[i] = x[j + 1]
         end
     end
     return x_shift
-
 end
 # --------------------------------------------------------------------------------------------------
 
@@ -225,7 +196,7 @@ by a specified amount `n`. Acts as a unified interface to `tlag` and `tlead`.
 - `t_vec`: Vector of time points corresponding to each element in `x`
 
 # Keyword Arguments
-- `n`: Time gap for shifting. If positive, performs a lag operation (backward in time); 
+- `n`: Time gap for shifting. If positive, performs a lag operation (backward in time);
        if negative, performs a lead operation (forward in time).
        If `nothing` (default), defaults to a lag operation with minimal unit difference.
 - `kwargs...`: Additional keyword arguments passed to either `tlag` or `tlead`
@@ -258,7 +229,7 @@ julia> tshift([1, 2, 3], [-3, -2, -1], n = -1)
 See also: [`tlag`](@ref), [`tlead`](@ref)
 """
 function tshift(x, t_vec; n=nothing, kwargs...)
-    
+
     if isnothing(n)
         @warn "shift not specified ... defaulting to lag"
         n = oneunit(t_vec[1] - t_vec[1])
@@ -271,7 +242,3 @@ function tshift(x, t_vec; n=nothing, kwargs...)
     end
 end
 # --------------------------------------------------------------------------------------------------
-
-
-
-
diff --git a/src/Winsorize.jl b/src/Winsorize.jl
@@ -33,10 +33,12 @@ function winsorize(x::AbstractVector{T};
     verbose::Bool=false
     ) where T
 
+    isempty(x) && error("input vector is empty")
+
     if !isnothing(probs)
-        lower_percentile = max(minimum(probs), 0)
-        upper_percentile = min(maximum(probs), 1)
-        (lower_percentile<0 || upper_percentile>1) && @error "bad probability input"
+        (minimum(probs) < 0 || maximum(probs) > 1) && error("probabilities must be in [0, 1]")
+        lower_percentile = minimum(probs)
+        upper_percentile = maximum(probs)
         verbose && any(ismissing, x) && (@info "Some missing data skipped in winsorizing")
         verbose && !isnothing(cutpoints) && (@info "input cutpoints ignored ... using probabilities")

	BazerData.jl Data manipulation utilities for Julia
	Log \| Files \| Refs \| README \| LICENSE

M	src/PanelData.jl	\|	8	++++----
M	src/StataUtils.jl	\|	9	+++++----
M	src/TimeShift.jl	\|	111	++++++++++++++++++++++++++++---------------------------------------------------
M	src/Winsorize.jl	\|	8	+++++---