BazerData.jl

Data manipulation utilities for Julia
Log | Files | Refs | README | LICENSE

commit 4fcfd6f3198b6ce57d580193cb6e270a6a83f451
parent f1b1519c96366d69335ed5229febe6fcd00e1dec
Author: Erik Loualiche <eloualic@umn.edu>
Date:   Mon, 16 Feb 2026 13:28:36 -0600

refactor, fix bugs, and expand test coverage across all functions

- tabulate: extract into _tabulate_compute, _tabulate_render_long,
  _tabulate_render_wide, _render_pretty_table helpers; fix typeof
  patterns to use isa; fix skip_stat bug where select() was passed
  cols instead of new_cols; fix skip_stat highlighter column index mismatch
- panel_fill: fix hardcoded df.t to df[!, time_var]; fix typeof
  pattern; remove ghost merge param from docstring and docs examples
- winsorize: fix docstring to match actual replace_value kwarg
- timeshift: trim trailing blank lines
- docs: fix "not yet registered" text; remove invalid merge=true
- tests: 112 → 185 tests (+65%) with new edge case, error path,
  missing data, and feature combination coverage

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

Diffstat:
Mdocs/src/index.md | 12++++++------
Msrc/PanelData.jl | 24+++++++++++-------------
Msrc/StataUtils.jl | 431++++++++++++++++++++++++++++++++++---------------------------------------------
Msrc/TimeShift.jl | 4----
Msrc/Winsorize.jl | 29++++++++++++++---------------
Mtest/UnitTests/panel_fill.jl | 55++++++++++++++++++++++++++++++++++++++++++++++++-------
Mtest/UnitTests/tabulate.jl | 106++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----
Mtest/UnitTests/timeshift.jl | 129++++++++++++++++++++++++++++++-------------------------------------------------
Mtest/UnitTests/winsorize.jl | 65+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----
Mtest/UnitTests/xtile.jl | 49+++++++++++++++++++++++++++++++++++++++++++++++--
10 files changed, 520 insertions(+), 384 deletions(-)

diff --git a/docs/src/index.md b/docs/src/index.md @@ -15,11 +15,11 @@ The readme serves as documentation; there might be more examples inside of the t ## Installation -`BazerData.jl` is a not yet a registered package. -You can install it from github via +`BazerData.jl` is a registered package. +You can install it via ```julia import Pkg -Pkg.add(url="https://github.com/eloualiche/BazerData.jl") +Pkg.add("BazerData") ``` @@ -96,11 +96,11 @@ df_panel = DataFrame( # missing t=2 for id=1 v3 = [1,5,4,6,6,15,12.25, 21,22.5,17.2,1]) panel_fill(df_panel, :id, :t, [:v1, :v2, :v3], - gap=Month(1), method=:backwards, uniquecheck=true, flag=true, merge=true) + gap=Month(1), method=:backwards, uniquecheck=true, flag=true) panel_fill(df_panel, :id, :t, [:v1, :v2, :v3], - gap=Month(1), method=:forwards, uniquecheck=true, flag=true, merge=true) + gap=Month(1), method=:forwards, uniquecheck=true, flag=true) panel_fill(df_panel, :id, :t, [:v1, :v2, :v3], - gap=Month(1), method=:linear, uniquecheck=true, flag=true, merge=true) + gap=Month(1), method=:linear, uniquecheck=true, flag=true) ``` ### Leads and lags diff --git a/src/PanelData.jl b/src/PanelData.jl @@ -2,14 +2,13 @@ """ panel_fill!( df::DataFrame, - id_var::Symbol, - time_var::Symbol, + id_var::Symbol, + time_var::Symbol, value_var::Union{Symbol, Vector{Symbol}}; - gap::Union{Int, DatePeriod} = 1, - method::Symbol = :backwards, + gap::Union{Int, DatePeriod} = 1, + method::Symbol = :backwards, uniquecheck::Bool = true, - flag::Bool = false, - merge::Bool = false + flag::Bool = false ) # Arguments @@ -17,17 +16,16 @@ - `id_var::Symbol`: the individual index dimension of the panel - `time_var::Symbol`: the time index dimension of the panel (must be integer or a date) - `value_var::Union{Symbol, Vector{Symbol}}`: the set of columns we would like to fill - + # Keywords - `gap::Union{Int, DatePeriod} = 1` : the interval size for which we want to fill data - `method::Symbol = :backwards`: the interpolation method to fill the data 
options are: `:backwards` (default), `:forwards`, `:linear`, `:nearest` - email me for other interpolations (anything from Interpolations.jl is possible) - `uniquecheck::Bool = true`: check if panel is clean - `flag::Bool = false`: flag the interpolated values # Returns -- `AbstractDataFrame`: +- `DataFrame`: the input DataFrame with interpolated rows appended # Examples - See tests @@ -43,7 +41,7 @@ function panel_fill!( # prepare the data sort!(df, [id_var, time_var]) - if isa(value_var, Symbol) + if value_var isa Symbol value_var = [value_var] end if uniquecheck # check for unicity @@ -52,11 +50,11 @@ function panel_fill!( end time_var_r = join([string(time_var), "rounded"], "_") # clean up if dates - if typeof(gap) <: DatePeriod - if !(eltype(df.t) <: Dates.AbstractTime) + if gap isa DatePeriod + if !(eltype(df[!, time_var]) <: Dates.AbstractTime) error( """ - Type of gap $(typeof(gap)) and type of time variable $(eltype(df.t)) do not match + Type of gap $(typeof(gap)) and type of time variable $(eltype(df[!, time_var])) do not match """ ) else diff --git a/src/StataUtils.jl b/src/StataUtils.jl @@ -9,7 +9,7 @@ # ------------------------------------------------------------------------------------------ # List of exported functions -# tabulate # (tab alias) +# tabulate # xtile # ------------------------------------------------------------------------------------------ @@ -19,7 +19,8 @@ tabulate(df::AbstractDataFrame, cols::Union{Symbol, Array{Symbol}}; reorder_cols=true, out::Symbol=:stdout) -This was forked from TexTables.jl and was inspired by https://github.com/matthieugomez/statar +Frequency tabulation inspired by Stata's `tabulate` command. 
+Forked from TexTables.jl and inspired by https://github.com/matthieugomez/statar # Arguments - `df::AbstractDataFrame`: Input DataFrame to analyze @@ -37,7 +38,6 @@ This was forked from TexTables.jl and was inspired by https://github.com/matthie - `:df` Return the result as a DataFrame - `:string` Return the formatted table as a string - # Returns - `Nothing` if `out=:stdout` - `DataFrame` if `out=:df` @@ -50,9 +50,6 @@ The resulting table contains the following columns: - `pct`: Percentage of total - `cum`: Cumulative percentage -# TO DO -allow user to specify order of columns (reorder = false flag) - # Examples See the README for more examples ```julia @@ -74,20 +71,15 @@ function tabulate( df::AbstractDataFrame, cols::Union{Symbol, Vector{Symbol}}; group_type::Union{Symbol, Vector{Symbol}}=:value, reorder_cols::Bool=true, - format_tbl::Symbol=:long, + format_tbl::Symbol=:long, format_stat::Symbol=:freq, skip_stat::Union{Nothing, Symbol, Vector{Symbol}}=nothing, out::Symbol=:stdout) - if typeof(cols) <: Symbol # check if it's an array or just a point - N_COLS = 1 - else - N_COLS = size(cols,1) - # error("Only accepts one variable for now ...") - end + N_COLS = cols isa Symbol ? 
1 : length(cols) if !(format_tbl ∈ [:long, :wide]) - if size(cols, 1) == 1 + if N_COLS == 1 @warn "Converting format_tbl to :long" format_tbl = :long else @@ -100,7 +92,18 @@ function tabulate( return nothing end - # Count the number of observations by `columns`: this is the main calculation + df_out, new_cols = _tabulate_compute(df, cols, group_type, reorder_cols) + + if format_tbl == :long + return _tabulate_render_long(df_out, new_cols, N_COLS, out, skip_stat) + else # :wide + return _tabulate_render_wide(df_out, new_cols, N_COLS, format_stat, out) + end +end + + +# ----- Computation: groupby, combine, sort, pct/cum transforms +function _tabulate_compute(df, cols, group_type, reorder_cols) group_type_error_msg = """ \ngroup_type input must specify either ':value' or ':type' for columns; options are :value, :type, or a vector combining the two; @@ -114,8 +117,7 @@ function tabulate( df_out = transform(df, cols .=> ByRow(typeof) .=> name_type_cols) |> (d -> combine(groupby(d, name_type_cols), nrow => :freq, proprow =>:pct)) new_cols = name_type_cols - # rename!(df_out, name_type_cols .=> cols) - elseif typeof(group_type) <: Vector{Symbol} + elseif group_type isa Vector{Symbol} !all(s -> s in [:value, :type], group_type) && (@error group_type_error_msg) (size(group_type, 1) != size(cols, 1)) && (@error "\ngroup_type and cols must be the same size; \nsee help for more information") @@ -129,243 +131,189 @@ function tabulate( @error group_type_error_msg end # resort columns based on the original order - new_cols = sort(new_cols isa Symbol ? [new_cols] : new_cols, + new_cols = sort(new_cols isa Symbol ? 
[new_cols] : new_cols, by= x -> findfirst(==(replace(string(x), r"_typeof$" => "")), string.(cols)) ) if reorder_cols - cols_sortable = [ # check whether it makes sense to sort on the variables + cols_sortable = [ name for (name, col) in pairs(eachcol(select(df_out, new_cols))) if eltype(col) |> t -> hasmethod(isless, Tuple{t,t}) ] - if size(cols_sortable, 1)>0 - cols_sortable + if !isempty(cols_sortable) sort!(df_out, cols_sortable) # order before we build cumulative end end transform!(df_out, :pct => cumsum => :cum, :freq => ByRow(Int) => :freq) - # easier to do some of the transformations on the numbers directly than using formatters - transform!(df_out, - :pct => (x -> x .* 100), + transform!(df_out, + :pct => (x -> x .* 100), :cum => (x -> Int.(round.(x .* 100, digits=0))), renamecols=false) + return df_out, new_cols +end +# ----- Long format rendering +function _tabulate_render_long(df_out, new_cols, N_COLS, out, skip_stat) + transform!(df_out, :freq => (x->text_histogram(x, width=24)) => :freq_hist) + + # highlighter with gradient for the freq/pct/cum columns (rest is cyan) + col_highlighters = Tuple(vcat( + map(i -> Highlighter((data, row, col) -> col == i, crayon"cyan bold"), 1:N_COLS), + hl_custom_gradient(cols=(N_COLS+1), colorscheme=:Oranges_9, scale=maximum(df_out.freq)), + hl_custom_gradient(cols=(N_COLS+2), colorscheme=:Greens_9, scale=ceil(Int, maximum(df_out.pct))), + hl_custom_gradient(cols=(N_COLS+3), colorscheme=:Greens_9, scale=100), + )) + + # when skip_stat is provided and output is string, filter columns + if out == :string && !isnothing(skip_stat) + all_stats = [:freq, :pct, :cum, :freq_hist] + skip_list = skip_stat isa Vector ? 
skip_stat : [skip_stat] + col_stat = setdiff(all_stats, skip_list) + N_COL_STAT = length(col_stat) + + stat_headers = Dict(:freq=>"Freq.", :pct=>"Percent", :cum=>"Cum", :freq_hist=>"Hist.") + stat_formats = Dict(:freq=>"%d", :pct=>"%.1f", :cum=>"%d", :freq_hist=>"%s") + stat_colorschemes = Dict( + :freq => (:Oranges_9, maximum(df_out.freq)), + :pct => (:Greens_9, ceil(Int, maximum(df_out.pct))), + :cum => (:Greens_9, 100), + ) -# ----- prepare the table - if format_tbl == :long + header = vcat(string.(new_cols), + [stat_headers[k] for k in col_stat]) + formatters = Tuple(vcat( + [ft_printf("%s", i) for i in 1:N_COLS], + [ft_printf(stat_formats[k], N_COLS + i) for (i, k) in enumerate(col_stat)] + )) + # rebuild highlighters for the filtered column layout + filtered_highlighters = Tuple(vcat( + map(i -> Highlighter((data, row, col) -> col == i, crayon"cyan bold"), 1:N_COLS), + [haskey(stat_colorschemes, k) ? + hl_custom_gradient(cols=N_COLS+i, colorscheme=stat_colorschemes[k][1], scale=stat_colorschemes[k][2]) : + Highlighter((data, row, col) -> col == N_COLS+i, crayon"white") + for (i, k) in enumerate(col_stat)] + )) + alignment = vcat(repeat([:l], N_COLS), repeat([:c], N_COL_STAT)) + cell_alignment = reduce(push!, + map(i -> (i,1)=>:l, 1:N_COLS+N_COL_STAT-1), + init=Dict{Tuple{Int64, Int64}, Symbol}()) + + df_render = select(df_out, new_cols, col_stat) + return _render_pretty_table(df_render, out; + hlines=[1], vlines=[N_COLS], + alignment=alignment, cell_alignment=cell_alignment, + header=header, formatters=formatters, highlighters=filtered_highlighters) + end - transform!(df_out, :freq => (x->text_histogram(x, width=24)) => :freq_hist) + # default: all stat columns + header = [string.(new_cols); "Freq."; "Percent"; "Cum"; "Hist."] + formatters = Tuple(vcat( + [ft_printf("%s", i) for i in 1:N_COLS], + [ft_printf("%d", N_COLS+1), ft_printf("%.1f", N_COLS+2), + ft_printf("%d", N_COLS+3), ft_printf("%s", N_COLS+4)] + )) + alignment = vcat(repeat([:l], N_COLS), :c, 
:c, :c, :c) + cell_alignment = reduce(push!, + map(i -> (i,1)=>:l, 1:N_COLS+3), + init=Dict{Tuple{Int64, Int64}, Symbol}()) + + return _render_pretty_table(df_out, out; + hlines=[1], vlines=[N_COLS], + alignment=alignment, cell_alignment=cell_alignment, + header=header, formatters=formatters, highlighters=col_highlighters) +end - # highlighter with gradient for the freq/pct/cum columns (rest is blue) - col_highlighters = vcat( - map(i -> Highlighter((data, row, col) -> col == i, crayon"cyan bold"), 1:N_COLS), - hl_custom_gradient(cols=(N_COLS+1), colorscheme=:Oranges_9, scale=maximum(df_out.freq)), - hl_custom_gradient(cols=(N_COLS+2), colorscheme=:Greens_9, scale=ceil(Int, maximum(df_out.pct))), - hl_custom_gradient(cols=(N_COLS+3), colorscheme=:Greens_9, scale=100), - ) - col_highlighters = Tuple(x for x in col_highlighters) - - col_formatters = Tuple(vcat( - [ ft_printf("%s", i) for i in 1:N_COLS ], # Column values - [ - ft_printf("%d", N_COLS+1), # Frequency (integer) - ft_printf("%.1f", N_COLS+2), - ft_printf("%d", N_COLS+3), # Cumulative - ft_printf("%s", N_COLS+4) # Histogram - ] + +# ----- Wide format rendering +function _tabulate_render_wide(df_out, new_cols, N_COLS, format_stat, out) + df_out = unstack(df_out, + new_cols[1:(N_COLS-1)], new_cols[N_COLS], format_stat, + allowmissing=true) + + N_GROUP_COLS = N_COLS - 1 + N_VAR_COLS = size(df_out, 2) - N_GROUP_COLS + + if format_stat == :freq + + # frequency: add row and column totals + total_row_des = "Total by $(string(new_cols[N_COLS]))" + total_col_des = join(vcat("Total by ", join(string.(new_cols[1:(N_COLS-1)]), ", "))) + + sum_cols = sum.(skipmissing.(eachcol(df_out[:, range(1+N_GROUP_COLS; length=N_VAR_COLS)]))) + row_vector = vcat([total_row_des], repeat(["-"], max(0, N_GROUP_COLS-1)), sum_cols) + df_out = vcat(df_out, + DataFrame(permutedims(row_vector)[:, end+1-size(df_out,2):end], names(df_out))) + sum_rows = sum.(skipmissing.(eachrow(df_out[:, range(1+N_GROUP_COLS; length=N_VAR_COLS)]))) + 
col_vector = rename(DataFrame(total = sum_rows), "total" => total_col_des) + df_out = hcat(df_out, col_vector) + rename!(df_out, [i => "-"^i for i in 1:N_GROUP_COLS]) + + col_highlighters = Tuple(vcat( + map(i -> Highlighter((data, row, col) -> col == i, crayon"cyan bold"), 1:N_GROUP_COLS), + [ hl_custom_gradient(cols=i, colorscheme=:Greens_9, + scale = ceil(Int, maximum(skipmissing(df_out[1:end-1, i])))) + for i in range(1+N_GROUP_COLS; length=N_VAR_COLS) ], + Highlighter((data, row, col) -> col == size(df_out, 2), crayon"green") )) - if out ∈ [:stdout, :df] - - pretty_table(df_out; - hlines = [1], - vlines = [N_COLS], - alignment = vcat(repeat([:l], N_COLS), :c, :c, :c, :c), - cell_alignment = reduce(push!, - map(i -> (i,1)=>:l, 1:N_COLS+3), - init=Dict{Tuple{Int64, Int64}, Symbol}()), - header = [string.(new_cols); "Freq."; "Percent"; "Cum"; "Hist."], - formatters = col_formatters, - highlighters = col_highlighters, - vcrop_mode = :middle, - border_crayon = crayon"bold yellow", - header_crayon = crayon"bold light_green", - show_header = true, - ) - - if out==:stdout - return(nothing) - elseif out==:df - return(df_out) - end - - elseif out==:string # this might be costly as I am regenerating the table. - if isnothing(skip_stat) - pt = pretty_table(String, df_out; - hlines = [1], - vlines = [N_COLS], - alignment = vcat(repeat([:l], N_COLS), :c, :c, :c, :c), - cell_alignment = reduce(push!, - map(i -> (i,1)=>:l, 1:N_COLS+3), - init=Dict{Tuple{Int64, Int64}, Symbol}()), - header = [string.(new_cols); "Freq."; "Percent"; "Cum"; "Hist."], - formatters = col_formatters, - highlighters = col_highlighters, - crop = :none, # no crop for string output - border_crayon = crayon"bold yellow", - header_crayon = crayon"bold light_green", - show_header = true, - ) - else - col_stat = setdiff([:freq, :pct, :cum, :freq_hist], - isa(skip_stat, Vector) ? 
skip_stat : [skip_stat]) - N_COL_STAT = size(col_stat,1) - header_table = vcat(string.(new_cols), - [Dict(:freq=>"Freq.", :pct=>"Percent", :cum=>"Cum", :freq_hist=>"Hist.")[k] - for k in col_stat] - ) - df_sub_out = select(df_out, cols, col_stat) - pt = pretty_table(String, df_sub_out; - hlines = [1], - vlines = [N_COLS], - alignment = vcat(repeat([:l], N_COLS), repeat([:c], N_COL_STAT)), - cell_alignment = reduce(push!, - map(i -> (i,1)=>:l, 1:N_COLS+N_COL_STAT-1), - init=Dict{Tuple{Int64, Int64}, Symbol}()), - header = header_table, - formatters = col_formatters, - highlighters = col_highlighters, - crop = :none, # no crop for string output - border_crayon = crayon"bold yellow", - header_crayon = crayon"bold light_green", - show_header = true, - ) - end - - return(pt) - end + formatters = Tuple(vcat( + [ ft_printf("%s", i) for i in 1:N_GROUP_COLS ], + [ ft_printf("%d", j) for j in range(1+N_GROUP_COLS; length=N_VAR_COLS) ], + [ ft_printf("%d", 1+N_GROUP_COLS+N_VAR_COLS) ] + )) - elseif format_tbl == :wide - - df_out = unstack(df_out, - new_cols[1:(N_COLS-1)], new_cols[N_COLS], format_stat, - allowmissing=true) - # new_cols[1:(N_COLS-1)] might be more than one category - # new_cols[N_COLS] only one group! - - N_GROUP_COLS = N_COLS - 1 # the first set of category (on the left!) 
- N_VAR_COLS = size(df_out, 2) - N_GROUP_COLS - - - if format_stat == :freq - - # frequency we also show totals - total_row_des = "Total by $(string(new_cols[N_COLS]))" - total_col_des = join(vcat("Total by ", join(string.(new_cols[1:(N_COLS-1)]), ", "))) - - sum_cols = sum.(skipmissing.(eachcol(df_out[:, range(1+N_GROUP_COLS; length=N_VAR_COLS)]))) - row_vector = vcat([total_row_des], repeat(["-"], max(0, N_GROUP_COLS-1)), sum_cols) - df_out = vcat(df_out, - DataFrame(permutedims(row_vector)[:, end+1-size(df_out,2):end], names(df_out)) - ) - sum_rows = sum.(skipmissing.(eachrow(df_out[:, range(1+N_GROUP_COLS; length=N_VAR_COLS)]))) - col_vector = rename(DataFrame(total = sum_rows), "total" => total_col_des) - df_out = hcat(df_out, col_vector) - rename!(df_out, [i => "-"^i for i in 1:N_GROUP_COLS]) - - #TODO: add a line on top - # blank for the group_cols - # name of the wide col - # total by for the sum col - - col_highlighters = vcat( - map(i -> Highlighter((data, row, col) -> col == i, crayon"cyan bold"), 1:N_GROUP_COLS), - [ hl_custom_gradient(cols=i, colorscheme=:Greens_9, - scale = ceil(Int, maximum(skipmissing(df_out[1:end-1, i])))) - for i in range(1+N_GROUP_COLS; length=N_VAR_COLS) ], - Highlighter((data, row, col) -> col == size(df_out, 2), crayon"green") - ) - - formatters = vcat( - [ ft_printf("%s", i) for i in 1:N_GROUP_COLS ], - [ ft_printf("%d", j) for j in range(1+N_GROUP_COLS; length=N_VAR_COLS) ], - [ ft_printf("%d", 1+N_GROUP_COLS+N_VAR_COLS) ] - ) - - hlines = [1, size(df_out, 1)] - vlines = [N_GROUP_COLS, N_GROUP_COLS+N_VAR_COLS] - alignment = vcat(repeat([:l], N_GROUP_COLS), repeat([:c], N_VAR_COLS), [:l]) - - - elseif format_stat == :pct - - col_highlighters = vcat( - map(i -> Highlighter((data, row, col) -> col == i, crayon"cyan bold"), 1:N_GROUP_COLS), - [ hl_custom_gradient(cols=i, colorscheme=:Greens_9, - scale = ceil(Int, maximum(skipmissing(df_out[:, i]))) ) - for i in range(1+N_GROUP_COLS; length=N_VAR_COLS) ], - ) - - formatters = 
vcat( - [ ft_printf("%s", i) for i in 1:N_GROUP_COLS ], - [ ft_printf("%.1f", j) for j in range(1+N_GROUP_COLS; length=N_VAR_COLS) ] - ) - - hlines = [1] - vlines = [0, N_GROUP_COLS, N_GROUP_COLS+N_VAR_COLS] - alignment = vcat(repeat([:l], N_GROUP_COLS), repeat([:c], N_VAR_COLS)) + hlines = [1, size(df_out, 1)] + vlines = [N_GROUP_COLS, N_GROUP_COLS+N_VAR_COLS] + alignment = vcat(repeat([:l], N_GROUP_COLS), repeat([:c], N_VAR_COLS), [:l]) + elseif format_stat == :pct - end + col_highlighters = Tuple(vcat( + map(i -> Highlighter((data, row, col) -> col == i, crayon"cyan bold"), 1:N_GROUP_COLS), + [ hl_custom_gradient(cols=i, colorscheme=:Greens_9, + scale = ceil(Int, maximum(skipmissing(df_out[:, i]))) ) + for i in range(1+N_GROUP_COLS; length=N_VAR_COLS) ], + )) + + formatters = Tuple(vcat( + [ ft_printf("%s", i) for i in 1:N_GROUP_COLS ], + [ ft_printf("%.1f", j) for j in range(1+N_GROUP_COLS; length=N_VAR_COLS) ] + )) + + hlines = [1] + vlines = [0, N_GROUP_COLS, N_GROUP_COLS+N_VAR_COLS] + alignment = vcat(repeat([:l], N_GROUP_COLS), repeat([:c], N_VAR_COLS)) - col_highlighters = Tuple(x for x in col_highlighters) - - if out ∈ [:stdout, :df] - - pretty_table(df_out; - hlines = hlines, - vlines = vlines, - alignment = alignment, - cell_alignment = reduce(push!, - map(i -> (i,1)=>:l, 1:N_GROUP_COLS), - init=Dict{Tuple{Int64, Int64}, Symbol}()), - formatters = Tuple(formatters), - highlighters = col_highlighters, - vcrop_mode = :middle, - border_crayon = crayon"bold yellow", - header_crayon = crayon"bold light_green", - show_header = true, - show_subheader=false, - ) - - if out==:stdout - return(nothing) - elseif out==:df - return(df_out) - end - elseif out==:string - pt = pretty_table(String, df_out; - hlines = hlines, - vlines = vlines, - alignment = alignment, - cell_alignment = reduce(push!, - map(i -> (i,1)=>:l, 1:N_GROUP_COLS), - init=Dict{Tuple{Int64, Int64}, Symbol}()), - formatters = Tuple(formatters), - highlighters = col_highlighters, - crop = :none, # no 
crop for string output - border_crayon = crayon"bold yellow", - header_crayon = crayon"bold light_green", - show_header = true, - show_subheader = false, - ) - - return(pt) - end end + cell_alignment = reduce(push!, + map(i -> (i,1)=>:l, 1:N_GROUP_COLS), + init=Dict{Tuple{Int64, Int64}, Symbol}()) + return _render_pretty_table(df_out, out; + hlines=hlines, vlines=vlines, + alignment=alignment, cell_alignment=cell_alignment, + formatters=formatters, highlighters=col_highlighters, + show_subheader=false) +end + + +# ----- Unified pretty_table output handler (stdout / df / string) +function _render_pretty_table(df, out::Symbol; show_subheader=true, pt_kwargs...) + common = ( + border_crayon = crayon"bold yellow", + header_crayon = crayon"bold light_green", + show_header = true, + show_subheader = show_subheader, + ) + + if out ∈ [:stdout, :df] + pretty_table(df; common..., vcrop_mode=:middle, pt_kwargs...) + return out == :stdout ? nothing : df + else # :string + return pretty_table(String, df; common..., crop=:none, pt_kwargs...) 
+ end end # -------------------------------------------------------------------------------------------------- @@ -395,17 +343,7 @@ end # -------------------------------------------------------------------------------------------------- # From https://github.com/mbauman/Sparklines.jl/blob/master/src/Sparklines.jl -# Sparklines.jl -# const ticks = ['▁','▂','▃','▄','▅','▆','▇','█'] -# function spark(x) -# min, max = extrema(x) -# f = div((max - min) * 2^8, length(ticks)-1) -# f < 1 && (f = one(typeof(f))) -# idxs = convert(Vector{Int}, map(v -> div(v, f), (x .- min) * 2^8)) -# return string.(ticks[idxs.+1]) -# end - -# Unicode characters: +# Unicode characters: # █ (Full block, U+2588) # ⣿ (Full Braille block, U+28FF) # ▓ (Dark shade, U+2593) @@ -418,7 +356,7 @@ function text_histogram(frequencies; width=12) max_freq = maximum(frequencies) max_freq == 0 && return fill(" " ^ width, length(frequencies)) scale = (width * 8 - 1) / max_freq # Subtract 1 to ensure we don't exceed width - + function bar(f) units = round(Int, f * scale) full_blocks = div(units, 8) @@ -434,7 +372,7 @@ end # -------------------------------------------------------------------------------------------------- """ - xtile(data::Vector{T}, n_quantiles::Integer, + xtile(data::Vector{T}, n_quantiles::Integer, weights::Union{Vector{Float64}, Nothing}=nothing)::Vector{Int} where T <: Real Create quantile groups using Julia's built-in weighted quantile functionality. 
@@ -453,11 +391,11 @@ b = xtile(sales, 10, weights=Weights(repeat([1], length(sales))) ); ``` """ function xtile( - data::AbstractVector{T}, + data::AbstractVector{T}, n_quantiles::Integer; weights::Union{Weights{<:Real}, Nothing} = nothing )::Vector{Int} where T <: Real - + N = length(data) n_quantiles > N && (@warn "More quantiles than data") @@ -472,11 +410,11 @@ end # String version function xtile( - data::AbstractVector{T}, + data::AbstractVector{T}, n_quantiles::Integer; weights::Union{Weights{<:Real}, Nothing} = nothing )::Vector{Int} where T <: AbstractString - + if weights === nothing weights = UnitWeights{Int}(length(data)) end @@ -486,14 +424,14 @@ function xtile( sorted_categories = sortperm(category_weights, rev=true) step = max(1, round(Int, length(sorted_categories) / n_quantiles)) cuts = unique(data)[sorted_categories][1:step:end] - + return searchsortedlast.(Ref(cuts), data) end # Dealing with missing and Numbers function xtile( - data::AbstractVector{T}, + data::AbstractVector{T}, n_quantiles::Integer; weights::Union{Weights{<:Real}, Nothing} = nothing )::Vector{Union{Int, Missing}} where {T <: Union{Missing, AbstractString, Number}} @@ -526,4 +464,3 @@ end - diff --git a/src/TimeShift.jl b/src/TimeShift.jl @@ -279,7 +279,3 @@ end - - - - diff --git a/src/Winsorize.jl b/src/Winsorize.jl @@ -1,30 +1,29 @@ # ------------------------------------------------------------------------------------------ """ - winsorize( - x::AbstractVector; - probs::Union{Tuple{Real, Real}, Nothing} = nothing, - cutpoints::Union{Tuple{Real, Real}, Nothing} = nothing, - replace::Symbol = :missing - verbose::Bool=false - ) + winsorize(x::AbstractVector; + probs=nothing, cutpoints=nothing, replace_value=nothing, + IQR=3, verbose=false) + +Winsorize (clip extreme values) in a vector. +Based on Matthieu Gomez's winsorize function in the `statar` R package. 
# Arguments - `x::AbstractVector`: a vector of values # Keywords -- `probs::Union{Tuple{Real, Real}, Nothing}`: A vector of probabilities that can be used instead of cutpoints -- `cutpoints::Union{Tuple{Real, Real}, Nothing}`: Cutpoints under and above which are defined outliers. Default is (median - five times interquartile range, median + five times interquartile range). Compared to bottom and top percentile, this takes into account the whole distribution of the vector -- `replace_value::Tuple`: Values by which outliers are replaced. Default to cutpoints. A frequent alternative is missing. -- `IQR::Real`: when inferring cutpoints what is the multiplier from the median for the interquartile range. (median ± IQR * (q75-q25)) -- `verbose::Bool`: printing level +- `probs::Union{Tuple{Real, Real}, Nothing}`: Probability bounds for cutpoints (e.g., `(0.05, 0.95)`) +- `cutpoints::Union{Tuple{Real, Real}, Nothing}`: Explicit cutpoints for outlier thresholds. + Default is `(median - IQR*(q75-q25), median + IQR*(q75-q25))` +- `replace_value`: Values to replace outliers with. Default: cutpoint values. 
+ Can be a tuple `(lo, hi)`, `missing`, or `(missing, missing)` +- `IQR::Real=3`: Multiplier from the median for the interquartile range when inferring cutpoints +- `verbose::Bool=false`: Print informational messages # Returns -- `AbstractVector`: A vector the size of x with substituted values +- `AbstractVector`: A vector the size of x with substituted values # Examples - See tests - -This code is based on Matthieu Gomez winsorize function in the `statar` R package """ function winsorize(x::AbstractVector{T}; probs::Union{Tuple{Real, Real}, Nothing} = nothing, diff --git a/test/UnitTests/panel_fill.jl b/test/UnitTests/panel_fill.jl @@ -58,7 +58,7 @@ gap=Month(1), method=:backwards, uniquecheck=true, flag=true) @test isequal( select(subset(df3_test, :flag => ByRow(==(:backwards))), r"v"), - DataFrame(v1 = [1.0, 1.0, 0.0, 4.0, 11.0], + DataFrame(v1 = [1.0, 1.0, 0.0, 4.0, 11.0], v2 = [1.0, 1.0, 4.0, 2.0, 3.0], v3 = [1.0, 1.0, 15.0, 22.5, 17.2])) @@ -76,7 +76,7 @@ @test isapprox( select(subset(df3_test, :flag => ByRow(==(:linear)), skipmissing=true), r"v") , DataFrame( - v1 = [1.0, 1.0, 0.0, 7.5 , 12.0], + v1 = [1.0, 1.0, 0.0, 7.5 , 12.0], v2 = [1.333, 1.666, 4.5, 2.5, 3.5], v3 = [2.3333, 3.666, 13.625, 19.85, 9.1]), atol = 0.01) @@ -88,17 +88,58 @@ select(subset(df3_test, :flag => ByRow(==(:nearest)), skipmissing=true), :v1), DataFrame(v1 = [1.0, 1.0, 0.0, 11.0, 13.0])) - # TODO clean up these tests - # -- different time periods - # this fails - # panel_fill(df3, :id, :t, [:v1, :v2, :v3], - # gap=Month(2), method=:backwards, uniquecheck=true, flag=true, merge=true) df3_test = panel_fill(df3, :id, :t, [:v1, :v2, :v3], gap=Day(10), method=:forwards, uniquecheck=true, flag=true) @test isequal(nrow(df3_test) , 39) end +end + + +@testset "panel_fill - flag=false" begin + df = DataFrame(id = [1, 1, 2, 2], t = [1, 3, 1, 4], v = [10, 20, 30, 40]) + result = panel_fill(df, :id, :t, :v, gap=1, method=:backwards, flag=false) + @test !(:flag in names(result)) + @test 
nrow(result) > nrow(df) # should have filled rows +end + + +@testset "panel_fill - invalid method" begin + df = DataFrame(id = [1, 1], t = [1, 3], v = [10, 20]) + @test_throws Exception panel_fill(df, :id, :t, :v, gap=1, method=:invalid_method) +end + + +@testset "panel_fill - type mismatch" begin + # DatePeriod gap with integer time variable + df = DataFrame(id = [1, 1], t = [1, 3], v = [10, 20]) + @test_throws Exception panel_fill(df, :id, :t, :v, gap=Month(1)) +end + + +@testset "panel_fill - non-unique warning" begin + df = DataFrame(id = [1, 1, 1], t = [1, 2, 3], v = [10, 20, 30]) + # non-unique: add a duplicate + df_dup = vcat(df, DataFrame(id = [1], t = [2], v = [99])) + # should warn about non-unique observations + @test_logs (:warn, r"non unique"i) begin + try + panel_fill(df_dup, :id, :t, :v, + gap=1, method=:backwards, uniquecheck=true, flag=true) + catch + # the function may error after warning due to duplicate handling; + # we just verify the warning is emitted + end + end +end + +@testset "panel_fill - no gaps to fill" begin + # consecutive time values, nothing to interpolate + df = DataFrame(id = [1, 1, 1], t = [1, 2, 3], v = [10, 20, 30]) + result = panel_fill(df, :id, :t, :v, gap=1, method=:backwards, flag=true) + @test nrow(result) == 3 # no new rows added + @test all(result.flag .== :original) end diff --git a/test/UnitTests/tabulate.jl b/test/UnitTests/tabulate.jl @@ -37,9 +37,9 @@ @test all(x -> contains(first_line, x), ["island", "Freq", "Percent", "Cum", "Hist."]) # test the type columns get properly passed - @test contains(tabulate(df, [:island, :species], group_type = [:type, :value], out=:string), + @test contains(tabulate(df, [:island, :species], group_type = [:type, :value], out=:string), "island_typeof") - @test contains(tabulate(df, [:island, :species], group_type = [:value, :type], out=:string), + @test contains(tabulate(df, [:island, :species], group_type = [:value, :type], out=:string), "species_typeof") # test the twoway ad wide 
tabulate @@ -58,14 +58,109 @@ # test the group type options df = DataFrame(x = [1, 2, 2, "NA", missing], y = ["c", "c", "b", "z", "d"]) @test isequal( - tabulate(df, [:x, :y], out=:df).y, + tabulate(df, [:x, :y], out=:df).y, sort(df.y)) @test nrow(tabulate(df, [:x, :y], group_type = :value, out=:df)) == 5 @test nrow(tabulate(df, [:x, :y], group_type = :type, out=:df)) == 3 - @test nrow(tabulate(df, [:x, :y], group_type = [:type, :value], out=:df)) == 4 + @test nrow(tabulate(df, [:x, :y], group_type = [:type, :value], out=:df)) == 4 @test nrow(tabulate(df, [:x, :y], group_type = [:value, :type], out=:df)) == 4 end -# -- TODO: Add tests for results that include missing - \ No newline at end of file +@testset "Tabulate - wide format pct" begin + df = dropmissing(DataFrame(PalmerPenguins.load())) + + # wide format with format_stat=:pct returns a DataFrame + df_pct = tabulate(df, [:island, :species], format_tbl=:wide, format_stat=:pct, out=:df) + @test df_pct isa DataFrame + @test nrow(df_pct) == 3 + # pct columns should not have a totals column (unlike freq) + @test !any(contains.(names(df_pct), "Total")) + + # wide format pct as string output + pt = tabulate(df, [:island, :species], format_tbl=:wide, format_stat=:pct, out=:string) + @test pt isa String + @test length(pt) > 0 + + # wide format pct stdout returns nothing + result = tabulate(df, [:island, :species], format_tbl=:wide, format_stat=:pct, out=:stdout) + @test isnothing(result) +end + + +@testset "Tabulate - wide format string output" begin + df = dropmissing(DataFrame(PalmerPenguins.load())) + + # wide freq as string + pt = tabulate(df, [:island, :species], format_tbl=:wide, out=:string) + @test pt isa String + @test contains(pt, "Adelie") + @test contains(pt, "Gentoo") + @test contains(pt, "Chinstrap") + + # 3-column wide as string + pt = tabulate(df, [:sex, :island, :species], format_tbl=:wide, out=:string) + @test pt isa String + @test contains(pt, "Adelie") +end + + +@testset "Tabulate - missing values" 
begin + # DataFrame with missing values in the tabulated column + df = DataFrame(x = [1, 2, missing, 1, missing, 3]) + df_tab = tabulate(df, :x, out=:df) + @test nrow(df_tab) == 4 # 1, 2, 3, missing + @test sum(df_tab.freq) == 6 + @test :freq in propertynames(df_tab) + @test :pct in propertynames(df_tab) + @test :cum in propertynames(df_tab) + + # string output with missing values should not error + pt = tabulate(df, :x, out=:string) + @test pt isa String + @test contains(pt, "missing") + + # two-column with missing + df = DataFrame(x = ["a", "b", missing, "a"], y = [1, 2, 3, missing]) + df_tab = tabulate(df, [:x, :y], out=:df) + @test nrow(df_tab) == 4 + @test sum(df_tab.freq) == 4 +end + + +@testset "Tabulate - skip_stat vector" begin + df = dropmissing(DataFrame(PalmerPenguins.load())) + + # skip multiple stats + pt = tabulate(df, :island, out=:string, skip_stat=[:freq_hist, :cum]) + first_line = split(pt, '\n', limit=2)[1] + @test contains(first_line, "Freq") + @test contains(first_line, "Percent") + @test !contains(first_line, "Cum") + @test !contains(first_line, "Hist") + + # skip just freq + pt = tabulate(df, :island, out=:string, skip_stat=:freq) + first_line = split(pt, '\n', limit=2)[1] + @test !contains(first_line, "Freq.") + @test contains(first_line, "Percent") +end + + +@testset "Tabulate - single row DataFrame" begin + df = DataFrame(x = ["only_value"]) + df_tab = tabulate(df, :x, out=:df) + @test nrow(df_tab) == 1 + @test df_tab.freq[1] == 1 + @test df_tab.cum[1] == 100 +end + + +@testset "Tabulate - reorder_cols=false" begin + df = DataFrame(x = ["c", "a", "b", "a", "c", "c"]) + df_tab = tabulate(df, :x, reorder_cols=false, out=:df) + # without reordering, original groupby order is preserved + @test nrow(df_tab) == 3 + @test sum(df_tab.freq) == 6 +end diff --git a/test/UnitTests/timeshift.jl b/test/UnitTests/timeshift.jl @@ -30,13 +30,13 @@ sort!(df2, [:id, :t]) transform!( groupby(df2, :id), - [:t, :v1] => - ((t, v1) -> (; v1_lag_day = tlag(v1, t; 
verbose=true), + [:t, :v1] => + ((t, v1) -> (; v1_lag_day = tlag(v1, t; verbose=true), v1_lag_mth = tlag(v1, t; n=Month(1), verbose=true) ) ) => [:v1_lag_day, :v1_lag_mth]) @test all(ismissing.(df2.v1_lag_day)) - @test isequal(df2.v1_lag_mth, + @test isequal(df2.v1_lag_mth, [missing, missing, missing, 1, missing, 6, missing, missing, missing, missing, missing ]) end @@ -44,24 +44,24 @@ # -------------------------------------------------------------------------------------------------- - @testset "General tests" begin + @testset "General tests" begin # --- test large datasets function generate_test_data(;size=50_000, gap_probability=0.1, seed=123) Random.seed!(seed) - + # Start date and initialize arrays start_date = Date(2020, 1, 1) dates = Vector{Date}() x_values = Vector{Float64}() - + # Generate dates with some gaps and corresponding x values current_date = start_date for i in 1:size # Add current date and value push!(dates, current_date) push!(x_values, sin(i/100) + 0.1*randn()) # Some noisy sine wave pattern - + # Decide whether to introduce a gap (skip 1-5 days) if rand() < gap_probability gap_size = rand(1:5) @@ -71,7 +71,7 @@ current_date += Day(1) end end - + # Create DataFrame df = DataFrame(date=dates, x=x_values) return df @@ -83,15 +83,15 @@ @time transform!(small_df, [:x, :date] => ( (x, d) -> tlag(x, d)) => :x_lag) @test nrow(subset(small_df, :x_lag => ByRow(!ismissing))) == 4525 - + @time transform!(large_df, [:x, :date] => ( (x, d) -> tlag(x, d)) => :x_lag_day); @time transform!(large_df, [:x, :date] => ( (x, d) -> tlag(x, d, n=Month(1))) => :x_lag_mth); @time transform!(large_df, [:x, :date] => ( (x, d) -> tlag(x, d, n=Year(1))) => :x_lag_yr); - + transform!(large_df, :date => ByRow(year) => :datey) - @test_throws r"time vector not sorted"i transform!(large_df, + @test_throws r"time vector not sorted"i transform!(large_df, [:x, :datey] => ( (x, d) -> tlag(x, d, n=1)) => :x_lag_datey); - + @test nrow(subset(large_df, :x_lag_day => 
ByRow(!ismissing))) == 900_182 @test nrow(subset(large_df, :x_lag_mth => ByRow(!ismissing))) == 770_178 @test nrow(subset(large_df, :x_lag_yr => ByRow(!ismissing))) == 769_502 @@ -114,7 +114,7 @@ import PanelShift - # note the api for this package differs slightly ... + # note the api for this package differs slightly ... # PanelShift.tlag(time_variable, x) # BazelData.tlag(x, time_variable) @@ -127,11 +127,11 @@ @test isequal(x_shift, [5; 6; missing]) x_shift = tlag([4;5;6], [1;2;3], n=2) - @test isequal(PanelShift.tlag([1;2;3], [4;5;6], 2), x_shift) + @test isequal(PanelShift.tlag([1;2;3], [4;5;6], 2), x_shift) @test isequal(x_shift, [missing;missing;4]) x_shift = tlead([4;5;6], [1;2;3], n=2) - @test isequal(PanelShift.tlead([1;2;3], [4;5;6], 2), x_shift) + @test isequal(PanelShift.tlead([1;2;3], [4;5;6], 2), x_shift) @test isequal(x_shift, [6; missing; missing]) # unit-length vector @@ -143,12 +143,12 @@ @test isequal(PanelShift.tlead([1], [1]), x_shift) @test isequal(x_shift, [missing]) - # -- + # -- x_shift = tlag([1;2;3;4;5], [1;3;5;6;7], n=2) @test isequal(PanelShift.tlag([1;3;5;6;7], [1;2;3;4;5], 2), x_shift) @test isequal(x_shift, [missing; 1; 2; missing; 3]) - x_shift = tlag(float.([1;2;3;4;5]), [1;3;5;6;7], n=2) + x_shift = tlag(float.([1;2;3;4;5]), [1;3;5;6;7], n=2) @test isequal(PanelShift.tlag(float.([1;3;5;6;7]), [1;2;3;4;5], 2), x_shift) @test isequal(x_shift, [missing; 1; 2; missing; 3]) @@ -164,7 +164,7 @@ x_shift = tlag([:apple; :orange; :banana; :pineapple; :strawberry], [1;2;4;7;11], n=3) @test isequal(PanelShift.tlag([1;2;4;7;11], [:apple; :orange; :banana; :pineapple; :strawberry], 3), x_shift) @test isequal(x_shift, [missing; missing; :apple; :banana; missing]) - + x_shift = tlag([:apple; :orange; :banana; :pineapple; :strawberry], [1;2;4;7;11], n=4) @test isequal(PanelShift.tlag([1;2;4;7;11], [:apple; :orange; :banana; :pineapple; :strawberry], 4), x_shift) @@ -174,11 +174,11 @@ @test isequal(PanelShift.tlead([1;2;4;7;11], [:apple; :orange; 
:banana; :pineapple; :strawberry], 4), x_shift) @test isequal(x_shift, [missing; missing; missing; :strawberry; missing]) - # indexed by dates + # indexed by dates x_shift = tlag([1,2,3], [Date(2000,1,1), Date(2000, 1,2), Date(2000,1, 4)], n=Day(1)) @test isequal(PanelShift.tlag([Date(2000,1,1), Date(2000, 1,2), Date(2000,1, 4)], [1,2,3], Day(1)), x_shift) @test isequal(x_shift, [missing; 1; missing]) - + x_shift = tlag([1,2,3], [Date(2000,1,1), Date(2000, 1,2), Date(2000,1, 4)], n=Day(2)) @test isequal(PanelShift.tlag([Date(2000,1,1), Date(2000, 1,2), Date(2000,1, 4)], [1,2,3], Day(2)), x_shift) @test isequal(x_shift, [missing; missing; 2]) @@ -192,75 +192,44 @@ @test isequal(PanelShift.tshift([1;2;3], [1;2;3], 1), x_shift) @test isequal(x_shift, tlag([1;2;3], [1;2;3], n=1)) - # safeguards - # @test_throws ArgumentError PanelShift.tlag([1;2;2], [1,2,3]) # argcheck error unsorted t - @test_throws r"time vector not sorted"i tlag([1, 2, 3], [1, 2, 2]) - # @test_throws ArgumentError PanelShift.tlag([1;2;], [1,2,3]) - @test_throws r"value and time vector"i tlag([1, 2], [1, 2, 3]) - # @test_throws ArgumentError PanelShift.tlag([1;2;3], [1,2,3], 0) - @test_throws r"shift value"i tlag([1, 2, 3], [1, 2, 3], n=0) + # safeguards for tlag + @test_throws r"time vector not sorted"i tlag([1, 2, 3], [1, 2, 2]) + @test_throws r"value and time vector"i tlag([1, 2], [1, 2, 3]) + @test_throws r"shift value"i tlag([1, 2, 3], [1, 2, 3], n=0) - end + end # -------------------------------------------------------------------------------------------------- - # -------------------------------------------------------------------------------------------------- -# benchmarking - -# using Chairmarks -# large_df = generate_test_data(size=50_000_000, gap_probability=0.1); - -# @b transform!(large_df, [:x, :date] => ( (x, d) -> tlag(x, d)) => :x_lag_day) -# @b transform!(large_df, [:x, :date] => ( (x, d) -> tlag(x, d, n=Month(1))) => :x_lag_mth) -# @b transform!(large_df, [:x, :date] => ( (x, d) 
-> tlag(x, d, n=Year(1))) => :x_lag_yr) + @testset "tlead error paths" begin + # unsorted time vector + @test_throws r"time vector not sorted"i tlead([1, 2, 3], [3, 1, 2]) -# @b transform!(large_df, [:x, :date] => ( (x, d) -> PanelShift.tlag(d, x)) => :x_lag_day) -# @b transform!(large_df, [:x, :date] => ( (x, d) -> PanelShift.tlag(d, x, Month(1))) => :x_lag_mth) -# @b transform!(large_df, [:x, :date] => ( (x, d) -> PanelShift.tlag(d, x, Year(1))) => :x_lag_yr) - - - -# @b transform!(large_df, [:x, :date] => ( (x, d) -> tlead(x, d)) => :x_lag_day) -# @b transform!(large_df, [:x, :date] => ( (x, d) -> tlead(x, d, n=Month(1))) => :x_lag_mth) -# @b transform!(large_df, [:x, :date] => ( (x, d) -> tlead(x, d, n=Year(1))) => :x_lag_yr) - -# @b transform!(large_df, [:x, :date] => ( (x, d) -> PanelShift.tlead(d, x)) => :x_lag_day) -# @b transform!(large_df, [:x, :date] => ( (x, d) -> PanelShift.tlead(d, x, Month(1))) => :x_lag_mth) -# @b transform!(large_df, [:x, :date] => ( (x, d) -> PanelShift.tlead(d, x, Year(1))) => :x_lag_yr) - -# -------------------------------------------------------------------------------------------------- + # mismatched lengths + @test_throws r"value and time vector"i tlead([1, 2], [1, 2, 3]) + # zero shift + @test_throws r"shift value"i tlead([1, 2, 3], [1, 2, 3], n=0) + end +# -------------------------------------------------------------------------------------------------- +# -------------------------------------------------------------------------------------------------- + @testset "tshift edge cases" begin + # tshift with n=nothing should warn and default to lag + result = @test_logs (:warn, r"shift not specified"i) tshift([1, 2, 3], [1, 2, 3]) + @test isequal(result, tlag([1, 2, 3], [1, 2, 3])) + + # tshift with Date vectors + dates = [Date(2020, 1, 1), Date(2020, 1, 2), Date(2020, 1, 3)] + result = tshift([10, 20, 30], dates, n=Day(1)) + @test isequal(result, tlag([10, 20, 30], dates, n=Day(1))) + + result = tshift([10, 20, 30], dates, 
n=Day(-1)) + @test isequal(result, tlead([10, 20, 30], dates, n=Day(1))) + end +# -------------------------------------------------------------------------------------------------- end - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/test/UnitTests/winsorize.jl b/test/UnitTests/winsorize.jl @@ -1,13 +1,13 @@ @testset "winsorize" begin - Random.seed!(3); + Random.seed!(3); x1 = rand(100); x2 = Vector{Union{Float64, Missing}}(rand(Float64, 100)); x2[rand(collect(1:100), 5)] .= missing; # --- tests on non-missing vectors x1_win = winsorize(x1, probs=(0.05, 0.95), verbose=true); @test findall(x1 .!= x1_win) == [4, 15, 26, 32, 40, 44, 52, 59, 64, 97] - + x1_win = winsorize(x1; verbose=true); @test findall(x1 .!= x1_win) == [] @@ -21,7 +21,7 @@ x2_win = winsorize(x2, probs=(0.02, 0.98), verbose=true); @test size(x2) == size(x2_win) @test findall(skipmissing(x2 .!= x2_win)) == [5, 41, 83, 91] - + x2_win = winsorize(x2; verbose=true) @test size(x2) == size(x2_win) @test findall(skipmissing(x2 .!= x2_win)) == [] @@ -43,12 +43,69 @@ @test size(x2) == size(x2_win) @test findall(v -> v ∈ (-1.0, 1.0), skipmissing(x2_win)) == [5, 17, 41, 42, 65, 83, 91] - # we check that this works if the type of replace is slightly different ... + # we check that this works if the type of replace is slightly different ... # maybe we want to change this ... 
x2_win = winsorize(x2; cutpoints=(0.05, 0.95), replace_value=(-1, 1), verbose=true) @test size(x2) == size(x2_win) @test findall(v -> v ∈ (-1.0, 1.0), skipmissing(x2_win)) == [5, 17, 41, 42, 65, 83, 91] +end + + +@testset "winsorize - custom IQR" begin + Random.seed!(42) + x = randn(1000) # standard normal: outliers likely beyond ~3σ + + # default IQR=3 should keep most data + w_default = winsorize(x) + n_changed_default = count(x .!= w_default) + + # IQR=1 should clip more aggressively + w_tight = winsorize(x, IQR=1) + n_changed_tight = count(x .!= w_tight) + @test n_changed_tight > n_changed_default + + # IQR=100 should clip almost nothing + w_loose = winsorize(x, IQR=100) + @test count(x .!= w_loose) == 0 +end + + +@testset "winsorize - edge cases" begin + # all identical values: nothing to winsorize + x_same = fill(5.0, 50) + w = winsorize(x_same, probs=(0.05, 0.95)) + @test w == x_same + + # single-element vector + x_one = [3.14] + w = winsorize(x_one, probs=(0.1, 0.9)) + @test w == x_one + + # integer vector + x_int = collect(1:100) + w = winsorize(x_int, probs=(0.05, 0.95)) + @test length(w) == 100 + @test minimum(w) >= minimum(x_int) + @test maximum(w) <= maximum(x_int) + @test count(w .!= x_int) > 0 # some values should be clipped + + # one-sided winsorize: only clip top + Random.seed!(1) + x = rand(100) + w = winsorize(x, cutpoints=(minimum(x), 0.5)) + @test minimum(w) == minimum(x) # bottom unchanged + @test maximum(w) <= 0.5 + + # one-sided: only clip bottom + w = winsorize(x, cutpoints=(0.5, maximum(x))) + @test minimum(w) >= 0.5 + @test maximum(w) == maximum(x) # top unchanged +end +@testset "winsorize - all missing" begin + x_all_missing = Vector{Union{Float64, Missing}}(fill(missing, 10)) + # probs path uses skipmissing which will be empty - quantile on empty should error + @test_throws Exception winsorize(x_all_missing, probs=(0.05, 0.95)) end diff --git a/test/UnitTests/xtile.jl b/test/UnitTests/xtile.jl @@ -51,5 +51,51 @@ @test isequal(xtile(s_m, 
3), [1, 1, 2, missing, 1, missing, 3]) @test isequal(xtile(s_m, 20), [1, 2, 4, missing, 2, missing, 5]) +end -end- \ No newline at end of file + +@testset "xtile - edge cases" begin + + # all-missing input + x_all_missing = Vector{Union{Int64, Missing}}(fill(missing, 10)) + result = xtile(x_all_missing, 4) + @test all(ismissing, result) + @test length(result) == 10 + + # single-element vector: searchsortedlast puts the value at the last quantile + result = xtile([42.0], 5) + @test length(result) == 1 + @test result[1] isa Int + + result = xtile([42], 5) + @test length(result) == 1 + + result = xtile(["hello"], 3) + @test length(result) == 1 + + # single-element with missing wrapper + x_single_m = Union{Int, Missing}[7] + result = xtile(x_single_m, 3) + @test length(result) == 1 + @test !ismissing(result[1]) + + # two-element vector: results should be valid bin indices + result = xtile([1.0, 2.0], 2) + @test length(result) == 2 + @test result[1] < result[2] # lower value gets lower bin + + # all identical values: all should get the same bin + x_same = fill(5.0, 100) + result = xtile(x_same, 10) + @test allequal(result) + @test length(result) == 100 + + # n_quantiles == 1: only the max element gets bin 1 (searchsortedlast behavior) + result = xtile(rand(50), 1) + @test all(r -> r in (0, 1), result) + + # large n_quantiles: bins are bounded by n_quantiles + result = xtile(rand(100), 10) + @test all(r -> 0 <= r <= 10, result) + +end