FinanceRoutines.jl

Financial data routines for Julia
Log | Files | Refs | README | LICENSE

commit b787aa1236e4cc994c3fcfd077e900d3ca16a101
parent ecdfad908a2ffdc692076849123da118749e1eba
Author: Erik Loualiche <eloualic@umn.edu>
Date:   Tue, 20 Jan 2026 21:03:56 -0600

Fix WRDS password handling crash and missing siccd column

- Fix SecretBuffer crash in open_wrds_pg() by properly extracting
  password bytes before shredding
- Fix import_MSF_v2 to validate columns exist before selecting siccd/naics
- Update Project.toml: bump to v0.4.4, Julia 1.10 LTS, add missing compat bounds

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

Diffstat:
M .gitignore | 1 +
M Project.toml | 21 ++++++++++++++++-----
M src/FinanceRoutines.jl | 3 ++-
M src/ImportCRSP.jl | 28 +++++++++++++++-------------
M src/ImportFamaFrench.jl | 20 ++++++++++----------
M src/Utilities.jl | 9 ++++++---
M test/UnitTests/KenFrench.jl | 6 ++----
M test/UnitTests/WRDS.jl | 38 +++++++++++++++++++++++---------------
M test/runtests.jl | 6 +++---
9 files changed, 78 insertions(+), 54 deletions(-)

diff --git a/.gitignore b/.gitignore @@ -9,6 +9,7 @@ docs/.DS_Store .env .env.gpg documentation +.claude # --------------------------------------------------------- diff --git a/Project.toml b/Project.toml @@ -1,7 +1,7 @@ name = "FinanceRoutines" uuid = "2e4c0fa2-b49b-4c8f-9592-485f04b9fc03" authors = ["Erik Loualiche <eloualic@umn.edu>"] -version = "0.4.3" +version = "0.4.4" [deps] BazerData = "d6d9bf1d-14ee-42c9-93f7-cccc2a9ff2c2" @@ -26,11 +26,22 @@ WeakRefStrings = "ea10d353-3f73-51f8-a26c-33c1cb351aa5" ZipFile = "a5390f91-8eb1-5f08-bee0-b1d1ffed6cea" [compat] -BazerData = "0.7.4" +BazerData = "0.7" CSV = "0.10" -DataFrames = "1" -Roots = "2.2.7" -julia = "1" +DataFrames = "1.6" +DataPipes = "0.3" +Decimals = "0.4" +FlexiJoins = "0.1" +GLM = "1.9" +IntervalSets = "0.7" +LibPQ = "1.17" +Missings = "1" +PeriodicalDates = "2" +Roots = "2.2" +Tables = "1.10" +WeakRefStrings = "1.4" +ZipFile = "0.10" +julia = "1.10" [extras] Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" diff --git a/src/FinanceRoutines.jl b/src/FinanceRoutines.jl @@ -5,7 +5,7 @@ module FinanceRoutines import BazerData: tlag import CSV import DataFrames: AbstractDataFrame, AsTable, DataFrame, DataFrameRow, ByRow, groupby, nrow, passmissing, Not, - rename!, select, select!, subset!, transform!, leftjoin, disallowmissing! + rename!, select, select!, subset, subset!, transform!, leftjoin, disallowmissing! 
import DataPipes: @p import Dates: Dates, Date, Day, Month, year import Decimals: Decimal @@ -24,6 +24,7 @@ import Tables: columntable import WeakRefStrings: String3, String7, String15 import ZipFile # import ZipFile: ZipFile.Reader + # -------------------------------------------------------------------------------------------------- diff --git a/src/ImportCRSP.jl b/src/ImportCRSP.jl @@ -386,14 +386,14 @@ function import_MSF_v2(wrds_conn::Connection; @log_msg "# -- GETTING StkSecurityInfoHist (CIZ)" # stksecurityinfo = _get_postgres_columns("crsp", "stksecurityinfohist"; wrds_conn=wrds_conn) stksecurityinfo_cols = vcat( - ["PERMNO", "SecInfoStartDt", "SecInfoEndDt", "IssuerNm", "ShareClass", + ["PERMNO", "SecInfoStartDt", "SecInfoEndDt", "IssuerNm", "ShareClass", "PrimaryExch", "TradingStatusFlg", "NAICS", "SICCD", "HDRCUSIP"], - uppercase.(variables)) |> filter(!isempty) |> unique - stksecurityinfo = _get_postgres_columns("crsp", "stksecurityinfohist"; wrds_conn=wrds_conn, + uppercase.(variables)) |> filter(!isempty) |> unique + stksecurityinfo_cols = _get_postgres_columns("crsp", "stksecurityinfohist"; wrds_conn=wrds_conn, prior_columns = stksecurityinfo_cols) |> sort - stksecurityinfo_cols = join(uppercase.(stksecurityinfo_cols), ", ") + stksecurityinfo_query = join(stksecurityinfo_cols, ", ") - postgre_query_stksecurityinfo = "SELECT $stksecurityinfo_cols FROM crsp.stksecurityinfohist" + postgre_query_stksecurityinfo = "SELECT $stksecurityinfo_query FROM crsp.stksecurityinfohist" df_stksecurityinfo = execute(wrds_conn, postgre_query_stksecurityinfo) |> DataFrame; transform!(df_stksecurityinfo, names(df_stksecurityinfo, check_integer.(eachcol(df_stksecurityinfo))) .=> @@ -411,6 +411,8 @@ function import_MSF_v2(wrds_conn::Connection; # ---------------------------------------------------------------------------------------------- + # only include siccd/naics if they exist in the DataFrame + optional_cols = intersect([:siccd, :naics], Symbol.(names(df_msf_v2))) 
var_select = vcat( :permno, # Security identifier :mthcaldt, # Date of the observation @@ -420,17 +422,17 @@ function import_MSF_v2(wrds_conn::Connection; :mthprc, :mthcap, :mthprevcap, - # :mthvol, :mthprcvol # volume and price volume - :siccd, # Industry code - :naics, # Industry code + # :mthvol, :mthprcvol # volume and price volume + optional_cols, Symbol.(intersect(variables, names(df_msf_v2))) ) - @p df_msf_v2 |> select!(__, var_select) |> sort!(__, [:permno, :mthcaldt]) |> - disallowmissing!(__, [:mthcaldt]) - transform!(df_msf_v2, - :naics => (x -> replace(x, "0" => missing)) => :naics, - :mthcaldt => ByRow(MonthlyDate) => :datem) + @p df_msf_v2 |> select!(__, var_select) |> sort!(__, [:permno, :mthcaldt]) |> + disallowmissing!(__, [:mthcaldt]) + if "naics" in names(df_msf_v2) + transform!(df_msf_v2, :naics => (x -> replace(x, "0" => missing)) => :naics) + end + transform!(df_msf_v2, :mthcaldt => ByRow(MonthlyDate) => :datem) # ---------------------------------------------------------------------------------------------- diff --git a/src/ImportFamaFrench.jl b/src/ImportFamaFrench.jl @@ -74,7 +74,7 @@ function import_FF3(;frequency::Symbol=:monthly) http_response = Downloads.download(url_FF_mth_yr); z = ZipFile.Reader(http_response) ; - a_file_in_zip = filter(x -> match(r".*csv", lowercase(x.name)) != nothing, z.files)[1] + a_file_in_zip = filter(x -> match(r".*csv", lowercase(x.name)) != nothing, z.files)[1] df_FF3 = copy(_parse_ff_annual(a_file_in_zip, types=ff_col_classes)) close(z) return df_FF3 @@ -87,7 +87,7 @@ function import_FF3(;frequency::Symbol=:monthly) a_file_in_zip = filter(x -> match(r".*csv", lowercase(x.name)) != nothing, z.files)[1] df_FF3 = copy(_parse_ff_monthly(a_file_in_zip, types=ff_col_classes)) close(z) - + transform!(df_FF3, :datem => ByRow(x -> MonthlyDate(x, "yyyymm")) => :datem) return df_FF3 @@ -122,7 +122,7 @@ function _parse_ff_annual(zip_file; types=nothing) # Read all lines from the zip file entry file_lines = 
split(String(read(zip_file)), '\n') - + for line in file_lines if occursin(r"Annual Factors", line) found_annual = true @@ -136,14 +136,15 @@ function _parse_ff_annual(zip_file; types=nothing) end if occursin(r"^\s*$", line) || occursin(r"[A-Za-z]{3,}", line[1:min(10, length(line))]) - if !occursin(r"^\s*$", line) && !occursin(r"^\d{4}", line) + if !occursin(r"^\s*$", line) && !occursin(r"^\s*\d{4}", line) # Added \s* break end continue end - if occursin(r"^\d{4}", line) - push!(lines, line) + if occursin(r"^\s*\d{4}", line) + clean_line = replace(line, r"[\r]" => "") + push!(lines, clean_line) end end end @@ -152,8 +153,8 @@ function _parse_ff_annual(zip_file; types=nothing) error("Annual Factors section not found in file") end - buffer = IOBuffer(join(lines, "\n")) - return CSV.File(buffer, header=false, delim=",", ntasks=1, types=types) |> DataFrame |> + lines_buffer = IOBuffer(join(lines, "\n")) + return CSV.File(lines_buffer, header=false, delim=",", ntasks=1, types=types) |> DataFrame |> df -> rename!(df, [:datey, :mktrf, :smb, :hml, :rf]) end # -------------------------------------------------------------------------------------------------- @@ -162,10 +163,9 @@ end # -------------------------------------------------------------------------------------------------- function _parse_ff_monthly(zip_file; types=nothing) - # Read all lines from the zip file entry file_lines = split(String(read(zip_file)), '\n') - skipto = 5 + skipto = 6 # Collect data lines until we hit "Annual Factors" data_lines = String[] diff --git a/src/Utilities.jl b/src/Utilities.jl @@ -34,10 +34,13 @@ function open_wrds_pg() print("Enter WRDS username: ") user = readline() password_buffer = Base.getpass("Enter WRDS password") - # con = open_wrds_pg(user, String(password_buffer.data[1:password_buffer.size])); - con = open_wrds_pg(user, read(password_buffer, String)) + password_bytes = copy(password_buffer.data[1:password_buffer.size]) Base.shred!(password_buffer) - return con + try + 
return open_wrds_pg(user, String(password_bytes)) + finally + fill!(password_bytes, 0x00) # zero out the password bytes + end end # -------------------------------------------------------------------------------------------------- diff --git a/test/UnitTests/KenFrench.jl b/test/UnitTests/KenFrench.jl @@ -2,11 +2,9 @@ import Dates - - df_FF3_annual = FinanceRoutines.import_FF3(frequency=:annual); + df_FF3_annual = FinanceRoutines.import_FF3(frequency=:annual) @test names(df_FF3_annual) == ["datey", "mktrf", "smb", "hml", "rf"] - @test nrow(df_FF3_annual) >= Dates.year(Dates.today()) - 1926 - 1 - + @test nrow(df_FF3_annual) >= Dates.year(Dates.today() - Dates.Month(1)) - 1926 - 1 df_FF3_monthly = FinanceRoutines.import_FF3(frequency=:monthly); @test names(df_FF3_monthly) == ["datem", "mktrf", "smb", "hml", "rf"] diff --git a/test/UnitTests/WRDS.jl b/test/UnitTests/WRDS.jl @@ -5,18 +5,19 @@ import Tables: columntable wrds_conn = FinanceRoutines.open_wrds_pg( - get(ENV, "WRDS_USERNAME", ""), + get(ENV, "WRDS_USERNAME", ""), get(ENV, "WRDS_PWD", "")) @test typeof(wrds_conn) == Connection - date_range_test = (Date("2000-01-01"), Date("2002-01-01")) - date_range_test_daily = (Date("2002-02-01"), Date("2002-02-05")) + date_range_test = (Date("2000-01-01"), Date("2002-01-01")) + date_range_test_recent = (Date("2025-01-01"), Date("2025-03-01")) + date_range_test_daily = (Date("2002-02-01"), Date("2002-02-05")) # ----------------------------------------------------------------------------------------- # @testset "CRSP MSF" begin println("\033[1m\033[32m → running\033[0m: CRSP MSF") df_msf = import_MSF(wrds_conn; date_range = date_range_test); - build_MSF!(df_msf; clean_cols=true); + build_MSF!(df_msf; clean_cols=true); @test minimum(skipmissing(df_msf.date)) >= Date("2000-01-01") @test maximum(skipmissing(df_msf.date)) <= Date("2002-01-01") @@ -29,8 +30,17 @@ println("\033[1m\033[32m → running\033[0m: CRSP MSF V2") # new version CIZ of crsp msf + @test try + 
import_MSF_v2(wrds_conn; date_range = date_range_test_recent, logging_level=:info) + true + catch e + @error e + false + end + df_msf_v2 = import_MSF_v2(wrds_conn; date_range = date_range_test, logging_level=:info) - + + # @test subset(df_msf_v2, [:mthcaldt, :mthprcdt] => (x,y) -> isequal.(x, y) ) |> nrow > 0 @test subset(df_msf_v2, :mthprc => ByRow(x -> !isequal(x, abs(x))) ) |> nrow == 0 @test subset(df_msf_v2, :mthcap => (x -> isequal.(x, 0) ) ) |> nrow == 0 @@ -39,9 +49,9 @@ @test maximum(skipmissing(df_msf_v2.mthcaldt)) <= Date("2002-01-01") @test nrow(df_msf_v2) > 100_000 - # discrepancy in nrow with df_msf_v2 ... + # discrepancy in nrow with df_msf_v2 ... - end + end # ----------------------------------------------------------------------------------------- # @@ -91,11 +101,11 @@ # ----------------------------------------------------------------------------------------- # @testset "CRSP-Compustat LINK" begin println("\033[1m\033[32m → running\033[0m: CRSP-Compustat LINK") - + df_linktable = FinanceRoutines.import_ccm_link(wrds_conn) # test on table itself @test all(map(s -> s in names(df_linktable), - lowercase.(["GVKEY", "LINKPRIM", "LIID", "LINKTYPE", "PERMNO", "LPERMCO", + lowercase.(["GVKEY", "LINKPRIM", "LIID", "LINKTYPE", "PERMNO", "LPERMCO", "LINKDT", "LINKENDDT"]))) @test isempty(setdiff(unique(df_linktable.linktype), ["LU", "LC", "LS"])) @test isempty(setdiff(unique(df_linktable.linkprim), ["P", "C"])) @@ -104,17 +114,17 @@ df_msf_v2 = import_MSF_v2(wrds_conn; date_range = date_range_test, logging_level=:info) df_msf_v2 = select(df_msf_v2, :permno, :mthcaldt=>:date, :datem, :mthret=>:ret, :mthcap) - df_funda = @p import_Funda(wrds_conn; date_range = date_range_test, + df_funda = @p import_Funda(wrds_conn; date_range = date_range_test, variables=["PPENT", "NAICSH"]) |> build_Funda(__; clean_cols=true) df_msf_v2 = link_MSF(df_linktable, df_msf_v2) # merge gvkey on monthly stock file - @test @p df_msf_v2 |> unique(__, [:permno, :gvkey]) |> + @test @p 
df_msf_v2 |> unique(__, [:permno, :gvkey]) |> groupby(__, :permno) |> combine(__, nrow) |> __.nrow |> unique |> all( .<=(2) ) - + df_ccm = innerjoin(df_msf_v2, df_funda, on = [:gvkey, :datey], matchmissing=:notequal) - @test @p df_msf_v2 |> unique(__, [:permno, :gvkey, :date, :datey]) |> + @test @p df_msf_v2 |> unique(__, [:permno, :gvkey, :date, :datey]) |> groupby(__, [:permno, :datey]) |> combine(__, nrow) |> __.nrow |> unique |> all( .<=(12) ) @@ -126,5 +136,3 @@ end - - diff --git a/test/runtests.jl b/test/runtests.jl @@ -12,10 +12,10 @@ import DataPipes: @p # -------------------------------------------------------------------------------------------------- const testsuite = [ - "KenFrench", - "WRDS", + "KenFrench", + "WRDS", "betas", - "Yields", + "Yields", ] # --------------------------------------------------------------------------------------------------