FinanceRoutines.jl

Financial data routines for Julia
Log | Files | Refs | README | LICENSE

commit ed42f3556beff04ce8df2766e5860e17f3db065a
parent a6eb318183709289a50af1b9d90a7bed53d9048d
Author: Erik Loualiche <eloualic@umn.edu>
Date:   Sun, 22 Mar 2026 12:15:05 -0500

Add diagnose() for data quality diagnostics

Reports missing rates, duplicate keys, suspicious returns
(outside [-100%, +100%]), and negative prices. Configurable
column names with sensible CRSP defaults. 12 test assertions.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

Diffstat:
Asrc/Diagnostics.jl | 75+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Msrc/FinanceRoutines.jl | 2++
Atest/UnitTests/Diagnostics.jl | 40++++++++++++++++++++++++++++++++++++++++
Mtest/runtests.jl | 1+
4 files changed, 118 insertions(+), 0 deletions(-)

diff --git a/src/Diagnostics.jl b/src/Diagnostics.jl @@ -0,0 +1,75 @@ +# -------------------------------------------------------------------------------------------------- +# Diagnostics.jl + +# Data quality diagnostics for financial DataFrames +# -------------------------------------------------------------------------------------------------- + + +# -------------------------------------------------------------------------------------------------- +""" + diagnose(df; id_col=:permno, date_col=:date, ret_col=:ret, price_col=:prc) + +Run data quality diagnostics on a financial DataFrame. + +# Arguments +- `df::AbstractDataFrame`: The data to diagnose + +# Keywords +- `id_col::Symbol=:permno`: Entity identifier column +- `date_col::Symbol=:date`: Date column +- `ret_col::Union{Nothing,Symbol}=:ret`: Return column (set to `nothing` to skip) +- `price_col::Union{Nothing,Symbol}=:prc`: Price column (set to `nothing` to skip) + +# Returns +- `Dict{Symbol, Any}` with keys: + - `:nrow`, `:ncol` — dimensions + - `:missing_rates` — `Dict{Symbol, Float64}` fraction missing per column + - `:duplicate_keys` — count of duplicate (id, date) pairs (if both columns exist) + - `:suspicious_values` — `Vector{String}` descriptions of anomalies found + +# Examples +```julia +df = import_MSF(conn; date_range=(Date("2020-01-01"), Date("2022-12-31"))) +report = diagnose(df) +report[:missing_rates] # Dict(:permno => 0.0, :ret => 0.02, ...) +report[:duplicate_keys] # 0 +report[:suspicious_values] # ["15 returns outside [-100%, +100%]"] +``` +""" +function diagnose(df::AbstractDataFrame; + id_col::Symbol=:permno, date_col::Symbol=:date, + ret_col::Union{Nothing,Symbol}=:ret, + price_col::Union{Nothing,Symbol}=:prc) + + report = Dict{Symbol, Any}() + report[:nrow] = nrow(df) + report[:ncol] = ncol(df) + + # Missing rates + missing_rates = Dict{Symbol, Float64}() + for col in names(df) + col_sym = Symbol(col) + missing_rates[col_sym] = nrow(df) > 0 ? count(ismissing, df[!, col]) / nrow(df) : 0.0 + end + report[:missing_rates] = missing_rates + + # Duplicate keys + if id_col in propertynames(df) && date_col in propertynames(df) + report[:duplicate_keys] = nrow(df) - nrow(unique(df, [id_col, date_col])) + end + + # Suspicious values + suspicious = String[] + if !isnothing(ret_col) && ret_col in propertynames(df) + n_extreme = count(r -> !ismissing(r) && (r > 1.0 || r < -1.0), df[!, ret_col]) + n_extreme > 0 && push!(suspicious, "$n_extreme returns outside [-100%, +100%]") + end + if !isnothing(price_col) && price_col in propertynames(df) + n_neg = count(r -> !ismissing(r) && r < 0, df[!, price_col]) + n_neg > 0 && push!(suspicious, "$n_neg negative prices (CRSP convention for bid/ask midpoint)") + end + report[:suspicious_values] = suspicious + + return report +end +# -------------------------------------------------------------------------------------------------- diff --git a/src/FinanceRoutines.jl b/src/FinanceRoutines.jl @@ -39,6 +39,7 @@ include("ImportCRSP.jl") include("ImportComp.jl") include("Merge_CRSP_Comp.jl") include("PortfolioUtils.jl") +include("Diagnostics.jl") # -------------------------------------------------------------------------------------------------- @@ -70,6 +71,7 @@ export link_MSF # More practical functions export calculate_rolling_betas export calculate_portfolio_returns +export diagnose # -------------------------------------------------------------------------------------------------- diff --git a/test/UnitTests/Diagnostics.jl b/test/UnitTests/Diagnostics.jl @@ -0,0 +1,40 @@ +@testset "Data Quality Diagnostics" begin + + import Dates: Date + + # Create test data with known issues + df = DataFrame( + permno = [1, 1, 1, 2, 2, 2], + date = [Date(2020,1,1), Date(2020,2,1), Date(2020,2,1), # duplicate for permno 1 + Date(2020,1,1), Date(2020,3,1), Date(2020,4,1)], # gap for permno 2 + ret = [0.05, missing, 0.03, -1.5, 0.02, 150.0], # suspicious: -1.5, 150.0 + prc = [10.0, 20.0, 20.0, -5.0, 30.0, 40.0] # negative price + ) + allowmissing!(df, :ret) + + report = diagnose(df) + + # Basic structure + @test report[:nrow] == 6 + @test report[:ncol] == 4 + + # Missing rates + @test haskey(report, :missing_rates) + @test report[:missing_rates][:ret] ≈ 1/6 + @test report[:missing_rates][:permno] == 0.0 + + # Duplicates + @test haskey(report, :duplicate_keys) + @test report[:duplicate_keys] == 1 # one duplicate (permno=1, date=2020-02-01) + + # Suspicious values + @test haskey(report, :suspicious_values) + @test length(report[:suspicious_values]) == 2 # extreme returns + negative prices + @test any(s -> occursin("returns outside", s), report[:suspicious_values]) + @test any(s -> occursin("negative prices", s), report[:suspicious_values]) + + # Test with custom columns / no ret/prc + report2 = diagnose(df; ret_col=nothing, price_col=nothing) + @test isempty(report2[:suspicious_values]) + +end diff --git a/test/runtests.jl b/test/runtests.jl @@ -18,6 +18,7 @@ const testsuite = [ "betas", "Yields", "PortfolioUtils", + "Diagnostics", ] # --------------------------------------------------------------------------------------------------