BazerData.jl

Data manipulation utilities for Julia
Log | Files | Refs | README | LICENSE

PanelData.jl (5059B)


      1 # ------------------------------------------------------------------------------------------
      2 """
      3     panel_fill!(
      4         df::DataFrame,
      5         id_var::Symbol,
      6         time_var::Symbol,
      7         value_var::Union{Symbol, Vector{Symbol}};
      8         gap::Union{Int, DatePeriod} = 1,
      9         method::Symbol = :backwards,
     10         uniquecheck::Bool = true,
     11         flag::Bool = false
     12     )
     13 
     14 # Arguments
     15 - `df::AbstractDataFrame`: a panel dataset
     16 - `id_var::Symbol`: the individual index dimension of the panel
     17 - `time_var::Symbol`: the time index dimension of the panel (must be integer or a date)
     18 - `value_var::Union{Symbol, Vector{Symbol}}`: the set of columns we would like to fill
     19 
     20 # Keywords
     21 - `gap::Union{Int, DatePeriod} = 1` : the interval size for which we want to fill data
     22 - `method::Symbol = :backwards`: the interpolation method to fill the data
     23     options are: `:backwards` (default), `:forwards`, `:linear`, `:nearest`
     24 - `uniquecheck::Bool = true`: check if panel is clean
     25 - `flag::Bool = false`: flag the interpolated values
     26 
     27 # Returns
     28 - `DataFrame`: the input DataFrame with interpolated rows appended
     29 
     30 # Examples
     31 - See tests
     32 """
     33 function panel_fill!(
     34     df::DataFrame,
     35     id_var::Symbol, time_var::Symbol, value_var::Union{Symbol, Vector{Symbol}};
     36     gap::Union{Int, DatePeriod} = 1, 
     37     method::Symbol = :backwards, 
     38     uniquecheck::Bool = true,
     39     flag::Bool = false,
     40     )
     41  
     42     # prepare the data
     43     sort!(df, [id_var, time_var])
     44     if value_var isa Symbol
     45         value_var = [value_var]
     46     end
     47     if uniquecheck # check for unicity 
     48         any(nonunique(df, [id_var, time_var])) && 
     49             (@warn "Some non unique observations in dataset")
     50     end
     51 
     52     time_var_r = join([string(time_var), "rounded"], "_") # clean up if dates
     53     if gap isa DatePeriod
     54         if !(eltype(df[!, time_var]) <: Dates.AbstractTime)
     55             error(
     56                 """
     57                 Type of gap $(typeof(gap)) and type of time variable $(eltype(df[!, time_var])) do not match
     58                 """
     59             )
     60         else
     61             df[!, time_var_r] .= floor.(df[!, time_var], gap)
     62             if !isequal(df[!, time_var_r], df[!, time_var])
     63                 @warn "Using rounded time variables for consistency with gap: $gap"
     64             end
     65         end
     66     else
     67         df[!, time_var_r] .= df[!, time_var]
     68     end
     69 
     70     if method == :backwards
     71         interpolate_method = BSpline(Constant(Previous))
     72     elseif method == :forwards
     73         interpolate_method = BSpline(Constant(Next))
     74     elseif method == :nearest
     75         interpolate_method = BSpline(Constant())
     76     elseif method == :linear
     77         interpolate_method = BSpline(Linear())
     78     else
     79         error(
     80             """
     81             Method $method not available.
     82             Please choose from :backwards (default), :forwards, :nearest, :linear
     83             """
     84             )
     85     end
     86 
     87     gdf = groupby(df, [id_var])
     88     fill_chunks = DataFrame[]
     89 
     90     for id_gdf in eachindex(gdf)
     91         subdf = gdf[id_gdf]
     92 
     93         if nrow(subdf) > 1
     94             sort!(subdf, time_var_r)
     95             rowdf_init = subdf[1, :]
     96             for rowdf in eachrow(subdf)[2:end]
     97 
     98                 old_t = rowdf_init[time_var_r]
     99                 enum_t = rowdf[time_var_r]
    100 
    101                 t_fill = collect(range(old_t, enum_t, step=sign(enum_t-old_t) * gap))[2:end-1]
    102                 group_fill = DataFrame(
    103                     Dict(Symbol(time_var_r) => t_fill, id_var => id_gdf[1]))
    104                 N_fill = nrow(group_fill)
    105                 scale_xs = range(1, 2, N_fill+2)[2:end-1]
    106 
    107                 interp_dict = Dict(
    108                     v => interpolate([rowdf_init[v], rowdf[v]], interpolate_method)
    109                     for v in value_var)
    110                 var_fill = DataFrame(
    111                     Dict(v => interp_dict[v].(scale_xs) for v in value_var))
    112 
    113                 push!(fill_chunks, hcat(group_fill, var_fill))
    114                 rowdf_init = rowdf
    115             end
    116         end
    117     end
    118 
    119     df_fill = isempty(fill_chunks) ? DataFrame() : vcat(fill_chunks...)
    120     
    121     # clean up the output
    122     if flag 
    123         df_fill[!, :flag] .= method
    124     end
    125     if isequal(df[!, time_var_r], df[!, time_var])
    126         rename!(df_fill, time_var_r => time_var)
    127         select!(df, Not(time_var_r))
    128     else # if they are not all the same we are going to fill
    129         transform!(df_fill, time_var_r => time_var)
    130     end
    131 
    132     if flag
    133         df[!, :flag] .= :original
    134     end
    135 
    136     append!(df, df_fill, cols=:union)
    137     sort!(df, [id_var, time_var])
    138 
    139     return df
    140 
    141 end
    142 
    143 
    144 """ 
    145     panel_fill(...)
    146 
    147     Same as panel_fill but without modification in place in place
    148 """    
    149 function panel_fill(
    150     df::DataFrame,
    151     id_var::Symbol, time_var::Symbol, value_var::Union{Symbol, Vector{Symbol}};
    152     gap::Union{Int, DatePeriod} = 1, 
    153     method::Symbol = :backwards, 
    154     uniquecheck::Bool = true,
    155     flag::Bool = false
    156     )
    157 
    158     df_res = deepcopy(df)
    159 
    160     panel_fill!(df_res, id_var, time_var, value_var,
    161         gap = gap, method = method, uniquecheck = uniquecheck, flag = flag)
    162     
    163     return df_res
    164 
    165 end
    166