DataImport.jl (4602B)
1 # src/DataImport.jl 2 module DataImport 3 4 using CSV, DataFrames, Dates, Statistics 5 include("FredUtils.jl") 6 using .FredUtils 7 8 export compute_dp, compute_tbill, compute_excess_returns, build_predictors 9 10 """ 11 compute_dp(msi_path) -> DataFrame 12 13 Compute 12-month rolling dividend-price ratio from CRSP MSI data. 14 Replicates R logic: accumulate monthly dividends with reinvestment over 12 months. 15 16 Returns DataFrame with columns: dateym, dp. 17 """ 18 function compute_dp(msi_path::String) 19 msi = CSV.read(msi_path, DataFrame) 20 dropmissing!(msi, [:vwretd, :vwretx]) 21 22 n = nrow(msi) 23 dpvw = 100.0 .* (msi.vwretd .- msi.vwretx) ./ (1.0 .+ msi.vwretx) 24 retd_retx = (1.0 .+ msi.vwretd) ./ (1.0 .+ msi.vwretx) 25 26 dp = zeros(n) 27 for i in 11:-1:0 28 for t in (i+1):n 29 dp[t] = dp[t] * retd_retx[t-i] + dpvw[t-i] 30 end 31 end 32 dp ./= 100.0 33 34 dates = Date.(msi.date) 35 dateym = year.(dates) .* 100 .+ month.(dates) 36 37 return DataFrame(dateym = dateym, dp = dp) 38 end 39 40 """ 41 compute_tbill(; observation_end) -> DataFrame 42 43 Download 3-month T-bill from FRED (TB3MS). Returns DataFrame with dateym, rf. 44 """ 45 function compute_tbill(; observation_end::Date = Date(2026,12,31)) 46 df = fred_observations("TB3MS"; end_date=observation_end) 47 dropmissing!(df) 48 dateym = year.(df.date) .* 100 .+ month.(df.date) 49 rf = df.value ./ 100.0 50 return DataFrame(dateym = dateym, rf = rf) 51 end 52 53 """ 54 compute_excess_returns(msi_path, tbill_df) -> DataFrame 55 56 Compute 3-year geometric average future excess returns. 57 Replicates R logic from import_predictors.R lines 100-114. 58 59 The R code does: 60 1. lead1_retm = shift(retm, 1, type="lead") -- next month's return 61 2. retm_y = exp(roll_sum(log(1+lead1_retm), n=12, align="left")) - 1 -- 12-month forward return 62 3. rf_y = geometric average of quarterly T-bill rates (current + 3,6,9 month leads) 63 4. rmrf_y3 = 3-year geometric average excess return 64 """ 65 function compute_excess_returns(msi_path::String, tbill_df::DataFrame) 66 msi = CSV.read(msi_path, DataFrame) 67 dropmissing!(msi, [:vwretd]) 68 dates = Date.(msi.date) 69 dateym = year.(dates) .* 100 .+ month.(dates) 70 71 df = DataFrame(dateym = dateym, retm = msi.vwretd) 72 df = innerjoin(df, tbill_df, on=:dateym) 73 74 n = nrow(df) 75 retm = df.retm 76 rf = df.rf 77 78 # 12-month forward market return: lead by 1, then rolling 12-month sum of log returns 79 retm_y = Vector{Union{Float64,Missing}}(missing, n) 80 for t in 1:(n-12) 81 retm_y[t] = exp(sum(log.(1.0 .+ retm[(t+1):(t+12)]))) - 1.0 82 end 83 84 # Annual risk-free: geometric average of quarterly T-bill 85 rf_y = Vector{Union{Float64,Missing}}(missing, n) 86 for t in 1:(n-9) 87 rf_y[t] = (1+rf[t])^0.25 * (1+rf[t+3])^0.25 * (1+rf[t+6])^0.25 * (1+rf[t+9])^0.25 - 1.0 88 end 89 90 # 3-year geometric average excess return 91 rmrf_y3 = Vector{Union{Float64,Missing}}(missing, n) 92 for t in 1:(n-24) 93 if !ismissing(retm_y[t]) && !ismissing(retm_y[t+12]) && !ismissing(retm_y[t+24]) && 94 !ismissing(rf_y[t]) && !ismissing(rf_y[t+12]) && !ismissing(rf_y[t+24]) 95 rmrf_y3[t] = ( 96 ((1+retm_y[t]) * (1+retm_y[t+12]) * (1+retm_y[t+24]))^(1/3) - 97 (((1+rf_y[t]) * (1+rf_y[t+12]) * (1+rf_y[t+24]))^(1/3) - 1) - 1 98 ) 99 end 100 end 101 102 return DataFrame(dateym = df.dateym, rmrf_y3 = rmrf_y3) 103 end 104 105 """ 106 build_predictors(msi_path, cay_path) -> DataFrame 107 108 Full pipeline: compute D/P, download T-bill, compute excess returns, 109 load CAY, merge all. Returns DataFrame with: dateym, dp, rf, rmrf_y3, cay. 110 """ 111 function build_predictors(msi_path::String, cay_path::String) 112 dp_df = compute_dp(msi_path) 113 tbill_df = compute_tbill() 114 rmrf_df = compute_excess_returns(msi_path, tbill_df) 115 116 # Load CAY (either Lettau's or our computed version) 117 cay_raw = CSV.read(cay_path, DataFrame; header=1) 118 rename!(cay_raw, Symbol.(["date", "c", "w", "y", "cay"])) 119 cay_raw.date = Date.(cay_raw.date) 120 cay_df = DataFrame( 121 dateym = year.(cay_raw.date) .* 100 .+ month.(cay_raw.date), 122 cay = cay_raw.cay, 123 ) 124 125 # Merge 126 predict = innerjoin(dp_df, tbill_df, on=:dateym) 127 predict = innerjoin(predict, rmrf_df, on=:dateym) 128 predict = innerjoin(predict, cay_df, on=:dateym) 129 dropmissing!(predict) 130 131 return predict 132 end 133 134 # When run as script 135 if abspath(PROGRAM_FILE) == @__FILE__ 136 mkpath("tmp") 137 predict = build_predictors("output/msi.csv", "input/cay_computed.csv") 138 CSV.write("tmp/predict.csv", predict) 139 println("Wrote tmp/predict.csv: $(nrow(predict)) rows") 140 end 141 142 end # module