commit b3b6222eb5461b6e738b6e3c7bcd2a0348ce496f
parent 79506f32a90c016825a9915e8c02a41560df73a9
Author: Erik Loualiche <eloualic@umn.edu>
Date: Wed, 25 Feb 2026 08:40:45 -0600
fix bugs and harden validation across all functions
- TimeShift: extract shared validation into _validate_tshift_args,
add empty array handling, remove dead commented-out code
- Winsorize: fix unreachable prob validation (check before clamping),
replace @error with error(), add empty vector guard
- StataUtils: replace @error with error() so invalid input halts
execution, add n_quantiles >= 1 validation to xtile
- PanelData: fix == to isequal for missing-safe column comparison,
fix incorrect default in error message, use deepcopy in panel_fill
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
Diffstat:
4 files changed, 53 insertions(+), 83 deletions(-)
diff --git a/src/PanelData.jl b/src/PanelData.jl
@@ -59,7 +59,7 @@ function panel_fill!(
)
else
df[!, time_var_r] .= floor.(df[!, time_var], gap)
- if !(df[!, time_var_r] == df[!, time_var])
+ if !isequal(df[!, time_var_r], df[!, time_var])
@warn "Using rounded time variables for consistency with gap: $gap"
end
end
@@ -86,7 +86,7 @@ function panel_fill!(
error(
"""
Method $method not available.
- Please choose from :backwards, :forwards, :nearest, :linear (default)
+ Please choose from :backwards (default), :forwards, :nearest, :linear
"""
)
end
@@ -124,7 +124,7 @@ function panel_fill!(
if flag
df_fill[!, :flag] .= method
end
- if df[!, time_var_r] == df[!, time_var]
+ if isequal(df[!, time_var_r], df[!, time_var])
rename!(df_fill, time_var_r => time_var)
select!(df, Not(time_var_r))
else # if they are not all the same we are going to fill
@@ -157,7 +157,7 @@ function panel_fill(
flag::Bool = false
)
- df_res = copy(df)
+ df_res = deepcopy(df)
panel_fill!(df_res, id_var, time_var, value_var,
gap = gap, method = method, uniquecheck = uniquecheck, flag = flag)
diff --git a/src/StataUtils.jl b/src/StataUtils.jl
@@ -83,7 +83,7 @@ function tabulate(
@warn "Converting format_tbl to :long"
format_tbl = :long
else
- @error "Table format_tbl must be :long or :wide"
+ error("Table format_tbl must be :long or :wide")
end
end
@@ -118,9 +118,9 @@ function _tabulate_compute(df, cols, group_type, reorder_cols)
(d -> combine(groupby(d, name_type_cols), nrow => :freq, proprow =>:pct))
new_cols = name_type_cols
elseif group_type isa Vector{Symbol}
- !all(s -> s in [:value, :type], group_type) && (@error group_type_error_msg)
+ !all(s -> s in [:value, :type], group_type) && error(group_type_error_msg)
(size(group_type, 1) != size(cols, 1)) &&
- (@error "\ngroup_type and cols must be the same size; \nsee help for more information")
+ error("group_type and cols must be the same size; see help for more information")
type_cols = cols[group_type .== :type]
name_type_cols = Symbol.(type_cols, "_typeof")
group_cols = [cols[group_type .== :value]; name_type_cols]
@@ -128,7 +128,7 @@ function _tabulate_compute(df, cols, group_type, reorder_cols)
(d -> combine(groupby(d, group_cols), nrow => :freq, proprow =>:pct))
new_cols = group_cols
else
- @error group_type_error_msg
+ error(group_type_error_msg)
end
# resort columns based on the original order
new_cols = sort(new_cols isa Symbol ? [new_cols] : new_cols,
@@ -397,6 +397,7 @@ function xtile(
)::Vector{Int} where T <: Real
N = length(data)
+ n_quantiles < 1 && error("n_quantiles must be >= 1")
n_quantiles > N && (@warn "More quantiles than data")
probs = range(0, 1, length=n_quantiles + 1)[2:end]
diff --git a/src/TimeShift.jl b/src/TimeShift.jl
@@ -3,14 +3,29 @@
# --------------------------------------------------------------------------------------------------
-# ```@example
-# julia> x = [1, 2, 3]
-# julia> t = [1, 2, 3]
-# julia> tlag(x, t, n = 1)
-# 3-element Vector{Union{Missing, Int64}}:
-# missing
-# 1
-# 2
+# --------------------------------------------------------------------------------------------------
+# Shared validation for tlag/tlead
+function _validate_tshift_args(x, t_vec; n=nothing, checksorted=true, verbose=false)
+ if isnothing(n)
+ n = oneunit(t_vec[1] - t_vec[1])
+ verbose && ((t_vec[1] isa Date) ? (@info "Default date gap inferred ... $n") :
+ (@info "Default gap inferred ... $n"))
+ elseif eltype(t_vec) == Date
+ verbose && @info "No checks on increment argument n for type Date ... "
+ else
+ !(n isa typeof(t_vec[1]-t_vec[1])) &&
+ error("Time gap type does not match time variable: typeof(n)=$(typeof(n)) != eltype(vec)=$(eltype(t_vec))")
+ end
+
+ checksorted && !issorted(t_vec; lt = (<=)) && error("time vector not sorted (order is strict)!")
+ !(n > zero(n)) && error("shift value has to be positive!")
+
+ N = length(t_vec)
+ (length(x) != N) && error("value and time vector have different lengths!")
+
+ return n, N
+end
+# --------------------------------------------------------------------------------------------------
# --------------------------------------------------------------------------------------------------
@@ -22,7 +37,7 @@ backward in time by a specified amount `n`.
# Arguments
- `x`: Array of values to be lagged
-- `t_vec`: Vector of time points corresponding to each element in `x`
+- `t_vec`: Vector of time points corresponding to each element in `x`
# Keyword Arguments
- `n`: Time gap for lagging. If `nothing` (default), uses the minimal unit difference between time points.
@@ -57,55 +72,31 @@ julia> tlag([1, 2, 3], [1, 2, 3], n = 1)
```
"""
-function tlag(x, t_vec;
- n = nothing,
+function tlag(x, t_vec;
+ n = nothing,
checksorted = true,
verbose = false,
)
- if isnothing(n) # this is the default
- n = oneunit(t_vec[1] - t_vec[1])
- verbose && ( (t_vec[1] isa Date) ? (@info "Default date gap inferred ... $n") :
- (@info "Default gap inferred ... $n") )
- elseif eltype(t_vec) == Date
- verbose && @info "No checks on increment argument n for type Date ... "
- else
- !(n isa typeof(t_vec[1]-t_vec[1])) &&
- error("Time gap type does not match time variable: typeof(n)=$(typeof(n)) != eltype(vec)=$(eltype(t_vec))")
+ isempty(t_vec) && return Array{Union{Missing, eltype(x)}}(missing, 0)
- end
+ n, N = _validate_tshift_args(x, t_vec; n=n, checksorted=checksorted, verbose=verbose)
- checksorted && !issorted(t_vec; lt = (<=) ) && error("time vector not sorted (order is strict)!")
- !(n > zero(n)) && error("shift value has to be positive!")
-
- N = length(t_vec)
- (length(x) != N) && error("value and time vector have different lengths!")
-
- x_shift = Array{Union{Missing, eltype(x)}}(missing, N);
-
- # _binary_search_lag!(x_shift, x, t_vec, n, N)
+ x_shift = Array{Union{Missing, eltype(x)}}(missing, N)
_linear_scan!(x_shift, x, t_vec, n, N)
return x_shift
-
end
function _linear_scan!(x_shift, x, t_vec, n, N)
j = 0
@inbounds for i in 1:N
- # Calculate the target time we're looking for
lagt = t_vec[i] - n
- # Scan forward from where we left off to find the largest index
- # where t_vec[j] <= lagt (since t_vec is sorted)
while j < N && t_vec[j + 1] <= lagt
j += 1
end
-
- # If we found a valid index and it's an exact match
if j > 0 && t_vec[j] == lagt
x_shift[i] = x[j]
- # else
- # x_shift[i] = missing
end
end
return x_shift
@@ -113,7 +104,6 @@ end
# --------------------------------------------------------------------------------------------------
-
# --------------------------------------------------------------------------------------------------
"""
tlead(x, t_vec; n = nothing, checksorted = true, verbose = false)
@@ -158,57 +148,38 @@ julia> tlead([1, 2, 3], [8, 9, 10], n = 1)
```
"""
-function tlead(x, t_vec;
- n = nothing,
+function tlead(x, t_vec;
+ n = nothing,
checksorted = true,
verbose = false,
)
- if isnothing(n) # this is the default
- n = oneunit(t_vec[1] - t_vec[1])
- verbose && ( (t_vec[1] isa Date) ? (@info "Default date gap inferred ... $n") :
- (@info "Default gap inferred ... $n") )
- elseif eltype(t_vec) == Date
- verbose && @info "No checks on increment argument n for date type ... "
- else
- !(n isa typeof(t_vec[1]-t_vec[1])) &&
- error("Time gap type does not match time variable: typeof(n)=$(typeof(n)) != eltype(vec)=$(eltype(t_vec))")
- end
+ isempty(t_vec) && return Array{Union{Missing, eltype(x)}}(missing, 0)
- checksorted && !issorted(t_vec; lt = (<=) ) && error("time vector not sorted (order is strict)!")
- !(n > zero(n)) && error("shift value has to be positive!")
-
- N = length(t_vec)
- (length(x) != N) && error("value and time vector have different lengths!")
+ n, N = _validate_tshift_args(x, t_vec; n=n, checksorted=checksorted, verbose=verbose)
- x_shift = Array{Union{Missing, eltype(x)}}(missing, N);
+ x_shift = Array{Union{Missing, eltype(x)}}(missing, N)
_linear_scan_lead!(x_shift, x, t_vec, n, N)
return x_shift
-
end
function _linear_scan_lead!(x_shift, x, t_vec, n, N)
j = 0
-
+
@inbounds for i in 1:N
- leadt = t_vec[i] + n
- # Early termination if already past the end of the array
+ leadt = t_vec[i] + n
if leadt > t_vec[N]
- # All remaining targets will be beyond the array bounds
break
end
-
- # Fast forward scan (can add loop unrolling here if needed)
+
while j < N && t_vec[j + 1] < leadt
j += 1
end
- # Check for exact match at the next position
if j + 1 <= N && t_vec[j + 1] == leadt
x_shift[i] = x[j + 1]
end
end
return x_shift
-
end
# --------------------------------------------------------------------------------------------------
@@ -225,7 +196,7 @@ by a specified amount `n`. Acts as a unified interface to `tlag` and `tlead`.
- `t_vec`: Vector of time points corresponding to each element in `x`
# Keyword Arguments
-- `n`: Time gap for shifting. If positive, performs a lag operation (backward in time);
+- `n`: Time gap for shifting. If positive, performs a lag operation (backward in time);
if negative, performs a lead operation (forward in time).
If `nothing` (default), defaults to a lag operation with minimal unit difference.
- `kwargs...`: Additional keyword arguments passed to either `tlag` or `tlead`
@@ -258,7 +229,7 @@ julia> tshift([1, 2, 3], [-3, -2, -1], n = -1)
See also: [`tlag`](@ref), [`tlead`](@ref)
"""
function tshift(x, t_vec; n=nothing, kwargs...)
-
+
if isnothing(n)
@warn "shift not specified ... defaulting to lag"
n = oneunit(t_vec[1] - t_vec[1])
@@ -271,7 +242,3 @@ function tshift(x, t_vec; n=nothing, kwargs...)
end
end
# --------------------------------------------------------------------------------------------------
-
-
-
-
diff --git a/src/Winsorize.jl b/src/Winsorize.jl
@@ -33,10 +33,12 @@ function winsorize(x::AbstractVector{T};
verbose::Bool=false
) where T
+ isempty(x) && error("input vector is empty")
+
if !isnothing(probs)
- lower_percentile = max(minimum(probs), 0)
- upper_percentile = min(maximum(probs), 1)
- (lower_percentile<0 || upper_percentile>1) && @error "bad probability input"
+ (minimum(probs) < 0 || maximum(probs) > 1) && error("probabilities must be in [0, 1]")
+ lower_percentile = minimum(probs)
+ upper_percentile = maximum(probs)
verbose && any(ismissing, x) && (@info "Some missing data skipped in winsorizing")
verbose && !isnothing(cutpoints) && (@info "input cutpoints ignored ... using probabilities")