FinanceRoutines.jl

Financial data routines for Julia
Log | Files | Refs | README | LICENSE

Diagnostics.jl (2881B)


      1 # --------------------------------------------------------------------------------------------------
      2 # Diagnostics.jl
      3 
      4 # Data quality diagnostics for financial DataFrames
      5 # --------------------------------------------------------------------------------------------------
      6 
      7 
      # --------------------------------------------------------------------------------------------------
      """
          diagnose(df; id_col=:permno, date_col=:date, ret_col=:ret, price_col=:prc)

      Run data quality diagnostics on a financial DataFrame.

      The input is only read; it is never modified.

      # Arguments
      - `df::AbstractDataFrame`: The data to diagnose

      # Keywords
      - `id_col::Symbol=:permno`: Entity identifier column
      - `date_col::Symbol=:date`: Date column
      - `ret_col::Union{Nothing,Symbol}=:ret`: Return column (set to `nothing` to skip)
      - `price_col::Union{Nothing,Symbol}=:prc`: Price column (set to `nothing` to skip)

      Checks that refer to a column absent from `df` are silently skipped.

      # Returns
      - `Dict{Symbol, Any}` with keys:
        - `:nrow`, `:ncol` — dimensions
        - `:missing_rates` — `Dict{Symbol, Float64}` fraction missing per column
          (`0.0` for every column when `df` has no rows)
        - `:duplicate_keys` — count of rows whose (id, date) pair repeats an earlier row;
          the key is only present when both `id_col` and `date_col` exist in `df`
        - `:suspicious_values` — `Vector{String}` descriptions of anomalies found:
          returns outside [-1, 1], NaN returns, negative prices, NaN prices

      `NaN` is not `missing`, so it is not part of `:missing_rates`; it is reported
      through `:suspicious_values` instead.

      # Examples
      ```julia
      df = import_MSF(conn; date_range=(Date("2020-01-01"), Date("2022-12-31")))
      report = diagnose(df)
      report[:missing_rates]      # Dict(:permno => 0.0, :ret => 0.02, ...)
      report[:duplicate_keys]     # 0
      report[:suspicious_values]  # ["15 returns outside [-100%, +100%]"]
      ```
      """
      function diagnose(df::AbstractDataFrame;
          id_col::Symbol=:permno, date_col::Symbol=:date,
          ret_col::Union{Nothing,Symbol}=:ret,
          price_col::Union{Nothing,Symbol}=:prc)

          # Row count and column names are reused by every check below.
          n_rows = nrow(df)
          cols = propertynames(df)

          report = Dict{Symbol, Any}()
          report[:nrow] = n_rows
          report[:ncol] = ncol(df)

          # Missing rates (guard against 0/0 on an empty frame)
          missing_rates = Dict{Symbol, Float64}()
          for col in names(df)
              missing_rates[Symbol(col)] =
                  n_rows > 0 ? count(ismissing, df[!, col]) / n_rows : 0.0
          end
          report[:missing_rates] = missing_rates

          # Duplicate keys: de-duplicate only the two key columns so the remaining
          # (possibly many) columns of `df` are not copied just to count rows.
          if id_col in cols && date_col in cols
              key_cols = df[!, [id_col, date_col]]
              report[:duplicate_keys] = n_rows - nrow(unique(key_cols))
          end

          # NaN compares false against every bound, so the range checks below cannot
          # see it; count it explicitly. The type test keeps this safe for `missing`
          # and for non-float element types.
          is_nan = v -> v isa AbstractFloat && isnan(v)

          # Suspicious values
          suspicious = String[]
          if !isnothing(ret_col) && ret_col in cols
              rets = df[!, ret_col]
              n_extreme = count(r -> !ismissing(r) && (r > 1.0 || r < -1.0), rets)
              n_extreme > 0 && push!(suspicious, "$n_extreme returns outside [-100%, +100%]")
              n_nan_ret = count(is_nan, rets)
              n_nan_ret > 0 && push!(suspicious, "$n_nan_ret NaN returns")
          end
          if !isnothing(price_col) && price_col in cols
              prices = df[!, price_col]
              n_neg = count(p -> !ismissing(p) && p < 0, prices)
              n_neg > 0 && push!(suspicious, "$n_neg negative prices (CRSP convention for bid/ask midpoint)")
              n_nan_prc = count(is_nan, prices)
              n_nan_prc > 0 && push!(suspicious, "$n_nan_prc NaN prices")
          end
          report[:suspicious_values] = suspicious

          return report
      end
      # --------------------------------------------------------------------------------------------------