commit b787aa1236e4cc994c3fcfd077e900d3ca16a101
parent ecdfad908a2ffdc692076849123da118749e1eba
Author: Erik Loualiche <eloualic@umn.edu>
Date: Tue, 20 Jan 2026 21:03:56 -0600
Fix WRDS password handling crash and missing siccd column
- Fix SecretBuffer crash in open_wrds_pg() by properly extracting
password bytes before shredding
- Fix import_MSF_v2 to validate columns exist before selecting siccd/naics
- Update Project.toml: bump to v0.4.4, Julia 1.10 LTS, add missing compat bounds
Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
Diffstat:
9 files changed, 78 insertions(+), 54 deletions(-)
diff --git a/.gitignore b/.gitignore
@@ -9,6 +9,7 @@ docs/.DS_Store
.env
.env.gpg
documentation
+.claude
# ---------------------------------------------------------
diff --git a/Project.toml b/Project.toml
@@ -1,7 +1,7 @@
name = "FinanceRoutines"
uuid = "2e4c0fa2-b49b-4c8f-9592-485f04b9fc03"
authors = ["Erik Loualiche <eloualic@umn.edu>"]
-version = "0.4.3"
+version = "0.4.4"
[deps]
BazerData = "d6d9bf1d-14ee-42c9-93f7-cccc2a9ff2c2"
@@ -26,11 +26,22 @@ WeakRefStrings = "ea10d353-3f73-51f8-a26c-33c1cb351aa5"
ZipFile = "a5390f91-8eb1-5f08-bee0-b1d1ffed6cea"
[compat]
-BazerData = "0.7.4"
+BazerData = "0.7"
CSV = "0.10"
-DataFrames = "1"
-Roots = "2.2.7"
-julia = "1"
+DataFrames = "1.6"
+DataPipes = "0.3"
+Decimals = "0.4"
+FlexiJoins = "0.1"
+GLM = "1.9"
+IntervalSets = "0.7"
+LibPQ = "1.17"
+Missings = "1"
+PeriodicalDates = "2"
+Roots = "2.2"
+Tables = "1.10"
+WeakRefStrings = "1.4"
+ZipFile = "0.10"
+julia = "1.10"
[extras]
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
diff --git a/src/FinanceRoutines.jl b/src/FinanceRoutines.jl
@@ -5,7 +5,7 @@ module FinanceRoutines
import BazerData: tlag
import CSV
import DataFrames: AbstractDataFrame, AsTable, DataFrame, DataFrameRow, ByRow, groupby, nrow, passmissing, Not,
- rename!, select, select!, subset!, transform!, leftjoin, disallowmissing!
+ rename!, select, select!, subset, subset!, transform!, leftjoin, disallowmissing!
import DataPipes: @p
import Dates: Dates, Date, Day, Month, year
import Decimals: Decimal
@@ -24,6 +24,7 @@ import Tables: columntable
import WeakRefStrings: String3, String7, String15
import ZipFile
# import ZipFile: ZipFile.Reader
+
# --------------------------------------------------------------------------------------------------
diff --git a/src/ImportCRSP.jl b/src/ImportCRSP.jl
@@ -386,14 +386,14 @@ function import_MSF_v2(wrds_conn::Connection;
@log_msg "# -- GETTING StkSecurityInfoHist (CIZ)"
# stksecurityinfo = _get_postgres_columns("crsp", "stksecurityinfohist"; wrds_conn=wrds_conn)
stksecurityinfo_cols = vcat(
- ["PERMNO", "SecInfoStartDt", "SecInfoEndDt", "IssuerNm", "ShareClass",
+ ["PERMNO", "SecInfoStartDt", "SecInfoEndDt", "IssuerNm", "ShareClass",
"PrimaryExch", "TradingStatusFlg", "NAICS", "SICCD", "HDRCUSIP"],
- uppercase.(variables)) |> filter(!isempty) |> unique
- stksecurityinfo = _get_postgres_columns("crsp", "stksecurityinfohist"; wrds_conn=wrds_conn,
+ uppercase.(variables)) |> filter(!isempty) |> unique
+ stksecurityinfo_cols = _get_postgres_columns("crsp", "stksecurityinfohist"; wrds_conn=wrds_conn,
prior_columns = stksecurityinfo_cols) |> sort
- stksecurityinfo_cols = join(uppercase.(stksecurityinfo_cols), ", ")
+ stksecurityinfo_query = join(stksecurityinfo_cols, ", ")
- postgre_query_stksecurityinfo = "SELECT $stksecurityinfo_cols FROM crsp.stksecurityinfohist"
+ postgre_query_stksecurityinfo = "SELECT $stksecurityinfo_query FROM crsp.stksecurityinfohist"
df_stksecurityinfo = execute(wrds_conn, postgre_query_stksecurityinfo) |> DataFrame;
transform!(df_stksecurityinfo,
names(df_stksecurityinfo, check_integer.(eachcol(df_stksecurityinfo))) .=>
@@ -411,6 +411,8 @@ function import_MSF_v2(wrds_conn::Connection;
# ----------------------------------------------------------------------------------------------
+ # only include siccd/naics if they exist in the DataFrame
+ optional_cols = intersect([:siccd, :naics], Symbol.(names(df_msf_v2)))
var_select = vcat(
:permno, # Security identifier
:mthcaldt, # Date of the observation
@@ -420,17 +422,17 @@ function import_MSF_v2(wrds_conn::Connection;
:mthprc,
:mthcap,
:mthprevcap,
- # :mthvol, :mthprcvol # volume and price volume
- :siccd, # Industry code
- :naics, # Industry code
+ # :mthvol, :mthprcvol # volume and price volume
+ optional_cols,
Symbol.(intersect(variables, names(df_msf_v2)))
)
- @p df_msf_v2 |> select!(__, var_select) |> sort!(__, [:permno, :mthcaldt]) |>
- disallowmissing!(__, [:mthcaldt])
- transform!(df_msf_v2,
- :naics => (x -> replace(x, "0" => missing)) => :naics,
- :mthcaldt => ByRow(MonthlyDate) => :datem)
+ @p df_msf_v2 |> select!(__, var_select) |> sort!(__, [:permno, :mthcaldt]) |>
+ disallowmissing!(__, [:mthcaldt])
+ if "naics" in names(df_msf_v2)
+ transform!(df_msf_v2, :naics => (x -> replace(x, "0" => missing)) => :naics)
+ end
+ transform!(df_msf_v2, :mthcaldt => ByRow(MonthlyDate) => :datem)
# ----------------------------------------------------------------------------------------------
diff --git a/src/ImportFamaFrench.jl b/src/ImportFamaFrench.jl
@@ -74,7 +74,7 @@ function import_FF3(;frequency::Symbol=:monthly)
http_response = Downloads.download(url_FF_mth_yr);
z = ZipFile.Reader(http_response) ;
- a_file_in_zip = filter(x -> match(r".*csv", lowercase(x.name)) != nothing, z.files)[1]
+ a_file_in_zip = filter(x -> match(r".*csv", lowercase(x.name)) != nothing, z.files)[1]
df_FF3 = copy(_parse_ff_annual(a_file_in_zip, types=ff_col_classes))
close(z)
return df_FF3
@@ -87,7 +87,7 @@ function import_FF3(;frequency::Symbol=:monthly)
a_file_in_zip = filter(x -> match(r".*csv", lowercase(x.name)) != nothing, z.files)[1]
df_FF3 = copy(_parse_ff_monthly(a_file_in_zip, types=ff_col_classes))
close(z)
-
+
transform!(df_FF3, :datem => ByRow(x -> MonthlyDate(x, "yyyymm")) => :datem)
return df_FF3
@@ -122,7 +122,7 @@ function _parse_ff_annual(zip_file; types=nothing)
# Read all lines from the zip file entry
file_lines = split(String(read(zip_file)), '\n')
-
+
for line in file_lines
if occursin(r"Annual Factors", line)
found_annual = true
@@ -136,14 +136,15 @@ function _parse_ff_annual(zip_file; types=nothing)
end
if occursin(r"^\s*$", line) || occursin(r"[A-Za-z]{3,}", line[1:min(10, length(line))])
- if !occursin(r"^\s*$", line) && !occursin(r"^\d{4}", line)
+ if !occursin(r"^\s*$", line) && !occursin(r"^\s*\d{4}", line) # tolerate leading whitespace before the 4-digit year
break
end
continue
end
- if occursin(r"^\d{4}", line)
- push!(lines, line)
+ if occursin(r"^\s*\d{4}", line)
+ clean_line = replace(line, r"[\r]" => "")
+ push!(lines, clean_line)
end
end
end
@@ -152,8 +153,8 @@ function _parse_ff_annual(zip_file; types=nothing)
error("Annual Factors section not found in file")
end
- buffer = IOBuffer(join(lines, "\n"))
- return CSV.File(buffer, header=false, delim=",", ntasks=1, types=types) |> DataFrame |>
+ lines_buffer = IOBuffer(join(lines, "\n"))
+ return CSV.File(lines_buffer, header=false, delim=",", ntasks=1, types=types) |> DataFrame |>
df -> rename!(df, [:datey, :mktrf, :smb, :hml, :rf])
end
# --------------------------------------------------------------------------------------------------
@@ -162,10 +163,9 @@ end
# --------------------------------------------------------------------------------------------------
function _parse_ff_monthly(zip_file; types=nothing)
-
# Read all lines from the zip file entry
file_lines = split(String(read(zip_file)), '\n')
- skipto = 5
+ skipto = 6
# Collect data lines until we hit "Annual Factors"
data_lines = String[]
diff --git a/src/Utilities.jl b/src/Utilities.jl
@@ -34,10 +34,13 @@ function open_wrds_pg()
print("Enter WRDS username: ")
user = readline()
password_buffer = Base.getpass("Enter WRDS password")
- # con = open_wrds_pg(user, String(password_buffer.data[1:password_buffer.size]));
- con = open_wrds_pg(user, read(password_buffer, String))
+ password_bytes = copy(password_buffer.data[1:password_buffer.size])
Base.shred!(password_buffer)
- return con
+ try
+ return open_wrds_pg(user, String(password_bytes))
+ finally
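+ fill!(password_bytes, 0x00) # NOTE(review): String(password_bytes) takes ownership and truncates the vector to length 0, so this fill! is likely a no-op and the password is not scrubbed here — confirm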
+ end
end
# --------------------------------------------------------------------------------------------------
diff --git a/test/UnitTests/KenFrench.jl b/test/UnitTests/KenFrench.jl
@@ -2,11 +2,9 @@
import Dates
-
- df_FF3_annual = FinanceRoutines.import_FF3(frequency=:annual);
+ df_FF3_annual = FinanceRoutines.import_FF3(frequency=:annual)
@test names(df_FF3_annual) == ["datey", "mktrf", "smb", "hml", "rf"]
- @test nrow(df_FF3_annual) >= Dates.year(Dates.today()) - 1926 - 1
-
+ @test nrow(df_FF3_annual) >= Dates.year(Dates.today() - Dates.Month(1)) - 1926 - 1
df_FF3_monthly = FinanceRoutines.import_FF3(frequency=:monthly);
@test names(df_FF3_monthly) == ["datem", "mktrf", "smb", "hml", "rf"]
diff --git a/test/UnitTests/WRDS.jl b/test/UnitTests/WRDS.jl
@@ -5,18 +5,19 @@
import Tables: columntable
wrds_conn = FinanceRoutines.open_wrds_pg(
- get(ENV, "WRDS_USERNAME", ""),
+ get(ENV, "WRDS_USERNAME", ""),
get(ENV, "WRDS_PWD", ""))
@test typeof(wrds_conn) == Connection
- date_range_test = (Date("2000-01-01"), Date("2002-01-01"))
- date_range_test_daily = (Date("2002-02-01"), Date("2002-02-05"))
+ date_range_test = (Date("2000-01-01"), Date("2002-01-01"))
+ date_range_test_recent = (Date("2025-01-01"), Date("2025-03-01"))
+ date_range_test_daily = (Date("2002-02-01"), Date("2002-02-05"))
# ----------------------------------------------------------------------------------------- #
@testset "CRSP MSF" begin
println("\033[1m\033[32m → running\033[0m: CRSP MSF")
df_msf = import_MSF(wrds_conn; date_range = date_range_test);
- build_MSF!(df_msf; clean_cols=true);
+ build_MSF!(df_msf; clean_cols=true);
@test minimum(skipmissing(df_msf.date)) >= Date("2000-01-01")
@test maximum(skipmissing(df_msf.date)) <= Date("2002-01-01")
@@ -29,8 +30,17 @@
println("\033[1m\033[32m → running\033[0m: CRSP MSF V2")
# new version CIZ of crsp msf
+ @test try
+ import_MSF_v2(wrds_conn; date_range = date_range_test_recent, logging_level=:info)
+ true
+ catch e
+ @error e
+ false
+ end
+
df_msf_v2 = import_MSF_v2(wrds_conn; date_range = date_range_test, logging_level=:info)
-
+
+
# @test subset(df_msf_v2, [:mthcaldt, :mthprcdt] => (x,y) -> isequal.(x, y) ) |> nrow > 0
@test subset(df_msf_v2, :mthprc => ByRow(x -> !isequal(x, abs(x))) ) |> nrow == 0
@test subset(df_msf_v2, :mthcap => (x -> isequal.(x, 0) ) ) |> nrow == 0
@@ -39,9 +49,9 @@
@test maximum(skipmissing(df_msf_v2.mthcaldt)) <= Date("2002-01-01")
@test nrow(df_msf_v2) > 100_000
- # discrepancy in nrow with df_msf_v2 ...
+ # discrepancy in nrow with df_msf_v2 ...
- end
+ end
# ----------------------------------------------------------------------------------------- #
@@ -91,11 +101,11 @@
# ----------------------------------------------------------------------------------------- #
@testset "CRSP-Compustat LINK" begin
println("\033[1m\033[32m → running\033[0m: CRSP-Compustat LINK")
-
+
df_linktable = FinanceRoutines.import_ccm_link(wrds_conn)
# test on table itself
@test all(map(s -> s in names(df_linktable),
- lowercase.(["GVKEY", "LINKPRIM", "LIID", "LINKTYPE", "PERMNO", "LPERMCO",
+ lowercase.(["GVKEY", "LINKPRIM", "LIID", "LINKTYPE", "PERMNO", "LPERMCO",
"LINKDT", "LINKENDDT"])))
@test isempty(setdiff(unique(df_linktable.linktype), ["LU", "LC", "LS"]))
@test isempty(setdiff(unique(df_linktable.linkprim), ["P", "C"]))
@@ -104,17 +114,17 @@
df_msf_v2 = import_MSF_v2(wrds_conn; date_range = date_range_test, logging_level=:info)
df_msf_v2 = select(df_msf_v2, :permno, :mthcaldt=>:date, :datem, :mthret=>:ret, :mthcap)
- df_funda = @p import_Funda(wrds_conn; date_range = date_range_test,
+ df_funda = @p import_Funda(wrds_conn; date_range = date_range_test,
variables=["PPENT", "NAICSH"]) |>
build_Funda(__; clean_cols=true)
df_msf_v2 = link_MSF(df_linktable, df_msf_v2) # merge gvkey on monthly stock file
- @test @p df_msf_v2 |> unique(__, [:permno, :gvkey]) |>
+ @test @p df_msf_v2 |> unique(__, [:permno, :gvkey]) |>
groupby(__, :permno) |> combine(__, nrow) |> __.nrow |> unique |>
all( .<=(2) )
-
+
df_ccm = innerjoin(df_msf_v2, df_funda, on = [:gvkey, :datey], matchmissing=:notequal)
- @test @p df_msf_v2 |> unique(__, [:permno, :gvkey, :date, :datey]) |>
+ @test @p df_msf_v2 |> unique(__, [:permno, :gvkey, :date, :datey]) |>
groupby(__, [:permno, :datey]) |> combine(__, nrow) |> __.nrow |> unique |>
all( .<=(12) )
@@ -126,5 +136,3 @@
end
-
-
diff --git a/test/runtests.jl b/test/runtests.jl
@@ -12,10 +12,10 @@ import DataPipes: @p
# --------------------------------------------------------------------------------------------------
const testsuite = [
- "KenFrench",
- "WRDS",
+ "KenFrench",
+ "WRDS",
"betas",
- "Yields",
+ "Yields",
]
# --------------------------------------------------------------------------------------------------