commit ed42f3556beff04ce8df2766e5860e17f3db065a
parent a6eb318183709289a50af1b9d90a7bed53d9048d
Author: Erik Loualiche <eloualic@umn.edu>
Date: Sun, 22 Mar 2026 12:15:05 -0500
Add diagnose() for data quality diagnostics
Reports missing rates, duplicate keys, suspicious returns
(outside [-100%, +100%]), and negative prices. Configurable
column names with sensible CRSP defaults. 12 test assertions.
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Diffstat:
4 files changed, 118 insertions(+), 0 deletions(-)
diff --git a/src/Diagnostics.jl b/src/Diagnostics.jl
@@ -0,0 +1,75 @@
+# --------------------------------------------------------------------------------------------------
+# Diagnostics.jl
+
+# Data quality diagnostics for financial DataFrames
+# --------------------------------------------------------------------------------------------------
+
+
+# --------------------------------------------------------------------------------------------------
+"""
+ diagnose(df; id_col=:permno, date_col=:date, ret_col=:ret, price_col=:prc)
+
+Run data quality diagnostics on a financial DataFrame.
+
+# Arguments
+- `df::AbstractDataFrame`: The data to diagnose
+
+# Keywords
+- `id_col::Symbol=:permno`: Entity identifier column
+- `date_col::Symbol=:date`: Date column
+- `ret_col::Union{Nothing,Symbol}=:ret`: Return column (set to `nothing` to skip)
+- `price_col::Union{Nothing,Symbol}=:prc`: Price column (set to `nothing` to skip)
+
+# Returns
+- `Dict{Symbol, Any}` with keys:
+ - `:nrow`, `:ncol` — dimensions
+ - `:missing_rates` — `Dict{Symbol, Float64}` fraction missing per column
+ - `:duplicate_keys` — count of duplicate (id, date) pairs (if both columns exist)
+ - `:suspicious_values` — `Vector{String}` descriptions of anomalies found
+
+# Examples
+```julia
+df = import_MSF(conn; date_range=(Date("2020-01-01"), Date("2022-12-31")))
+report = diagnose(df)
+report[:missing_rates] # Dict(:permno => 0.0, :ret => 0.02, ...)
+report[:duplicate_keys] # 0
+report[:suspicious_values] # ["15 returns outside [-100%, +100%]"]
+```
+"""
+function diagnose(df::AbstractDataFrame;
+ id_col::Symbol=:permno, date_col::Symbol=:date,
+ ret_col::Union{Nothing,Symbol}=:ret,
+ price_col::Union{Nothing,Symbol}=:prc)
+
+ report = Dict{Symbol, Any}()
+ report[:nrow] = nrow(df)
+ report[:ncol] = ncol(df)
+
+ # Missing rates
+ missing_rates = Dict{Symbol, Float64}()
+ for col in names(df)
+ col_sym = Symbol(col)
+ missing_rates[col_sym] = nrow(df) > 0 ? count(ismissing, df[!, col]) / nrow(df) : 0.0
+ end
+ report[:missing_rates] = missing_rates
+
+ # Duplicate keys
+ if id_col in propertynames(df) && date_col in propertynames(df)
+ report[:duplicate_keys] = nrow(df) - nrow(unique(df, [id_col, date_col]))
+ end
+
+ # Suspicious values
+ suspicious = String[]
+ if !isnothing(ret_col) && ret_col in propertynames(df)
+ n_extreme = count(r -> !ismissing(r) && (r > 1.0 || r < -1.0), df[!, ret_col])
+ n_extreme > 0 && push!(suspicious, "$n_extreme returns outside [-100%, +100%]")
+ end
+ if !isnothing(price_col) && price_col in propertynames(df)
+ n_neg = count(r -> !ismissing(r) && r < 0, df[!, price_col])
+ n_neg > 0 && push!(suspicious, "$n_neg negative prices (CRSP convention for bid/ask midpoint)")
+ end
+ report[:suspicious_values] = suspicious
+
+ return report
+end
+# --------------------------------------------------------------------------------------------------
diff --git a/src/FinanceRoutines.jl b/src/FinanceRoutines.jl
@@ -39,6 +39,7 @@ include("ImportCRSP.jl")
include("ImportComp.jl")
include("Merge_CRSP_Comp.jl")
include("PortfolioUtils.jl")
+include("Diagnostics.jl")
# --------------------------------------------------------------------------------------------------
@@ -70,6 +71,7 @@ export link_MSF
# More practical functions
export calculate_rolling_betas
export calculate_portfolio_returns
+export diagnose
# --------------------------------------------------------------------------------------------------
diff --git a/test/UnitTests/Diagnostics.jl b/test/UnitTests/Diagnostics.jl
@@ -0,0 +1,40 @@
+@testset "Data Quality Diagnostics" begin
+
+ import Dates: Date
+
+ # Create test data with known issues
+ df = DataFrame(
+ permno = [1, 1, 1, 2, 2, 2],
+ date = [Date(2020,1,1), Date(2020,2,1), Date(2020,2,1), # duplicate for permno 1
+ Date(2020,1,1), Date(2020,3,1), Date(2020,4,1)], # gap for permno 2
+ ret = [0.05, missing, 0.03, -1.5, 0.02, 150.0], # suspicious: -1.5, 150.0
+ prc = [10.0, 20.0, 20.0, -5.0, 30.0, 40.0] # negative price
+ )
+ allowmissing!(df, :ret)
+
+ report = diagnose(df)
+
+ # Basic structure
+ @test report[:nrow] == 6
+ @test report[:ncol] == 4
+
+ # Missing rates
+ @test haskey(report, :missing_rates)
+ @test report[:missing_rates][:ret] ≈ 1/6
+ @test report[:missing_rates][:permno] == 0.0
+
+ # Duplicates
+ @test haskey(report, :duplicate_keys)
+ @test report[:duplicate_keys] == 1 # one duplicate (permno=1, date=2020-02-01)
+
+ # Suspicious values
+ @test haskey(report, :suspicious_values)
+ @test length(report[:suspicious_values]) == 2 # extreme returns + negative prices
+ @test any(s -> occursin("returns outside", s), report[:suspicious_values])
+ @test any(s -> occursin("negative prices", s), report[:suspicious_values])
+
+ # Test with custom columns / no ret/prc
+ report2 = diagnose(df; ret_col=nothing, price_col=nothing)
+ @test isempty(report2[:suspicious_values])
+
+end
diff --git a/test/runtests.jl b/test/runtests.jl
@@ -18,6 +18,7 @@ const testsuite = [
"betas",
"Yields",
"PortfolioUtils",
+ "Diagnostics",
]
# --------------------------------------------------------------------------------------------------