Fix WRDS password handling crash and missing siccd column - FinanceRoutines.jl

commit b787aa1236e4cc994c3fcfd077e900d3ca16a101
parent ecdfad908a2ffdc692076849123da118749e1eba
Author: Erik Loualiche <eloualic@umn.edu>
Date:   Tue, 20 Jan 2026 21:03:56 -0600

Fix WRDS password handling crash and missing siccd column

- Fix SecretBuffer crash in open_wrds_pg() by properly extracting
  password bytes before shredding
- Fix import_MSF_v2 to validate that columns exist before selecting siccd/naics
- Update Project.toml: bump version to v0.4.4, require Julia 1.10 (LTS), and add missing compat bounds

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

Diffstat:
M .gitignore  | 1 +
M Project.toml  | 21 ++++++++++++++++-----
M src/FinanceRoutines.jl  | 3 ++-
M src/ImportCRSP.jl  | 28 +++++++++++++++-------------
M src/ImportFamaFrench.jl  | 20 ++++++++++----------
M src/Utilities.jl  | 9 ++++++---
M test/UnitTests/KenFrench.jl  | 6 ++----
M test/UnitTests/WRDS.jl  | 38 +++++++++++++++++++++++---------------
M test/runtests.jl  | 6 +++---

9 files changed, 78 insertions(+), 54 deletions(-)
diff --git a/.gitignore b/.gitignore
@@ -9,6 +9,7 @@ docs/.DS_Store
 .env
 .env.gpg
 documentation
+.claude
 # ---------------------------------------------------------
 
 
diff --git a/Project.toml b/Project.toml
@@ -1,7 +1,7 @@
 name = "FinanceRoutines"
 uuid = "2e4c0fa2-b49b-4c8f-9592-485f04b9fc03"
 authors = ["Erik Loualiche <eloualic@umn.edu>"]
-version = "0.4.3"
+version = "0.4.4"
 
 [deps]
 BazerData = "d6d9bf1d-14ee-42c9-93f7-cccc2a9ff2c2"
@@ -26,11 +26,22 @@ WeakRefStrings = "ea10d353-3f73-51f8-a26c-33c1cb351aa5"
 ZipFile = "a5390f91-8eb1-5f08-bee0-b1d1ffed6cea"
 
 [compat]
-BazerData = "0.7.4"
+BazerData = "0.7"
 CSV = "0.10"
-DataFrames = "1"
-Roots = "2.2.7"
-julia = "1"
+DataFrames = "1.6"
+DataPipes = "0.3"
+Decimals = "0.4"
+FlexiJoins = "0.1"
+GLM = "1.9"
+IntervalSets = "0.7"
+LibPQ = "1.17"
+Missings = "1"
+PeriodicalDates = "2"
+Roots = "2.2"
+Tables = "1.10"
+WeakRefStrings = "1.4"
+ZipFile = "0.10"
+julia = "1.10"
 
 [extras]
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
diff --git a/src/FinanceRoutines.jl b/src/FinanceRoutines.jl
@@ -5,7 +5,7 @@ module FinanceRoutines
 import BazerData: tlag
 import CSV
 import DataFrames: AbstractDataFrame, AsTable, DataFrame, DataFrameRow, ByRow, groupby, nrow, passmissing, Not,
-  rename!, select, select!, subset!, transform!, leftjoin, disallowmissing!
+  rename!, select, select!, subset, subset!, transform!, leftjoin, disallowmissing!
 import DataPipes: @p
 import Dates: Dates, Date, Day, Month, year
 import Decimals: Decimal
@@ -24,6 +24,7 @@ import Tables: columntable
 import WeakRefStrings: String3, String7, String15
 import ZipFile
 # import ZipFile: ZipFile.Reader
+
 # --------------------------------------------------------------------------------------------------
 
 
diff --git a/src/ImportCRSP.jl b/src/ImportCRSP.jl
@@ -386,14 +386,14 @@ function import_MSF_v2(wrds_conn::Connection;
     @log_msg "# -- GETTING StkSecurityInfoHist (CIZ)"
     # stksecurityinfo = _get_postgres_columns("crsp", "stksecurityinfohist"; wrds_conn=wrds_conn)
     stksecurityinfo_cols = vcat(
-        ["PERMNO", "SecInfoStartDt", "SecInfoEndDt", "IssuerNm", "ShareClass", 
+        ["PERMNO", "SecInfoStartDt", "SecInfoEndDt", "IssuerNm", "ShareClass",
          "PrimaryExch", "TradingStatusFlg", "NAICS", "SICCD", "HDRCUSIP"],
-        uppercase.(variables)) |> filter(!isempty) |> unique 
-    stksecurityinfo = _get_postgres_columns("crsp", "stksecurityinfohist"; wrds_conn=wrds_conn,
+        uppercase.(variables)) |> filter(!isempty) |> unique
+    stksecurityinfo_cols = _get_postgres_columns("crsp", "stksecurityinfohist"; wrds_conn=wrds_conn,
         prior_columns = stksecurityinfo_cols) |> sort
-    stksecurityinfo_cols = join(uppercase.(stksecurityinfo_cols), ", ")
+    stksecurityinfo_query = join(stksecurityinfo_cols, ", ")
 
-    postgre_query_stksecurityinfo = "SELECT $stksecurityinfo_cols FROM crsp.stksecurityinfohist"
+    postgre_query_stksecurityinfo = "SELECT $stksecurityinfo_query FROM crsp.stksecurityinfohist"
     df_stksecurityinfo = execute(wrds_conn, postgre_query_stksecurityinfo) |> DataFrame;
     transform!(df_stksecurityinfo,
         names(df_stksecurityinfo, check_integer.(eachcol(df_stksecurityinfo))) .=> 
@@ -411,6 +411,8 @@ function import_MSF_v2(wrds_conn::Connection;
 
 
     # ----------------------------------------------------------------------------------------------
+    # only include siccd/naics if they exist in the DataFrame
+    optional_cols = intersect([:siccd, :naics], Symbol.(names(df_msf_v2)))
     var_select = vcat(
         :permno,   # Security identifier
         :mthcaldt, # Date of the observation
@@ -420,17 +422,17 @@ function import_MSF_v2(wrds_conn::Connection;
         :mthprc,
         :mthcap,
         :mthprevcap,
-        # :mthvol, :mthprcvol # volume and price volume        
-        :siccd, # Industry code
-        :naics, # Industry code
+        # :mthvol, :mthprcvol # volume and price volume
+        optional_cols,
         Symbol.(intersect(variables, names(df_msf_v2)))
     )
 
-    @p df_msf_v2 |> select!(__, var_select) |> sort!(__, [:permno, :mthcaldt]) |> 
-        disallowmissing!(__, [:mthcaldt]) 
-    transform!(df_msf_v2, 
-        :naics => (x -> replace(x, "0" => missing)) => :naics,
-        :mthcaldt => ByRow(MonthlyDate) => :datem)
+    @p df_msf_v2 |> select!(__, var_select) |> sort!(__, [:permno, :mthcaldt]) |>
+        disallowmissing!(__, [:mthcaldt])
+    if "naics" in names(df_msf_v2)
+        transform!(df_msf_v2, :naics => (x -> replace(x, "0" => missing)) => :naics)
+    end
+    transform!(df_msf_v2, :mthcaldt => ByRow(MonthlyDate) => :datem)
     # ----------------------------------------------------------------------------------------------
 
 
diff --git a/src/ImportFamaFrench.jl b/src/ImportFamaFrench.jl
@@ -74,7 +74,7 @@ function import_FF3(;frequency::Symbol=:monthly)
 
         http_response = Downloads.download(url_FF_mth_yr);
         z = ZipFile.Reader(http_response) ;
-        a_file_in_zip = filter(x -> match(r".*csv", lowercase(x.name)) != nothing, z.files)[1]
+        a_file_in_zip = filter(x -> match(r".*csv", lowercase(x.name)) != nothing, z.files)[1]       
         df_FF3 = copy(_parse_ff_annual(a_file_in_zip, types=ff_col_classes))
         close(z)
         return df_FF3
@@ -87,7 +87,7 @@ function import_FF3(;frequency::Symbol=:monthly)
         a_file_in_zip = filter(x -> match(r".*csv", lowercase(x.name)) != nothing, z.files)[1]
         df_FF3 = copy(_parse_ff_monthly(a_file_in_zip, types=ff_col_classes))
         close(z)
-
+        
         transform!(df_FF3, :datem => ByRow(x -> MonthlyDate(x, "yyyymm")) => :datem)
         return df_FF3
 
@@ -122,7 +122,7 @@ function _parse_ff_annual(zip_file; types=nothing)
     
     # Read all lines from the zip file entry
     file_lines = split(String(read(zip_file)), '\n')
-    
+   
     for line in file_lines
         if occursin(r"Annual Factors", line)
             found_annual = true
@@ -136,14 +136,15 @@ function _parse_ff_annual(zip_file; types=nothing)
             end
             
             if occursin(r"^\s*$", line) || occursin(r"[A-Za-z]{3,}", line[1:min(10, length(line))])
-                if !occursin(r"^\s*$", line) && !occursin(r"^\d{4}", line)
+                if !occursin(r"^\s*$", line) && !occursin(r"^\s*\d{4}", line) # Added \s*
                     break
                 end
                 continue
             end
             
-            if occursin(r"^\d{4}", line)
-                push!(lines, line)
+            if occursin(r"^\s*\d{4}", line) 
+                clean_line = replace(line, r"[\r]" => "") 
+                push!(lines, clean_line)
             end
         end
     end
@@ -152,8 +153,8 @@ function _parse_ff_annual(zip_file; types=nothing)
         error("Annual Factors section not found in file")
     end
     
-    buffer = IOBuffer(join(lines, "\n"))
-    return CSV.File(buffer, header=false, delim=",", ntasks=1, types=types) |> DataFrame |>
+    lines_buffer = IOBuffer(join(lines, "\n"))
+    return CSV.File(lines_buffer, header=false, delim=",", ntasks=1, types=types) |> DataFrame |>
            df -> rename!(df, [:datey, :mktrf, :smb, :hml, :rf])
 end
 # --------------------------------------------------------------------------------------------------
@@ -162,10 +163,9 @@ end
 # --------------------------------------------------------------------------------------------------
 function _parse_ff_monthly(zip_file; types=nothing)
     
-
     # Read all lines from the zip file entry
     file_lines = split(String(read(zip_file)), '\n')
-    skipto = 5
+    skipto = 6
 
     # Collect data lines until we hit "Annual Factors"
     data_lines = String[]
diff --git a/src/Utilities.jl b/src/Utilities.jl
@@ -34,10 +34,13 @@ function open_wrds_pg()
     print("Enter WRDS username: ")
     user = readline()
     password_buffer = Base.getpass("Enter WRDS password")
-    # con = open_wrds_pg(user, String(password_buffer.data[1:password_buffer.size]));
-    con = open_wrds_pg(user, read(password_buffer, String))
+    password_bytes = copy(password_buffer.data[1:password_buffer.size])
     Base.shred!(password_buffer)
-    return con
+    try
+        return open_wrds_pg(user, String(password_bytes))
+    finally
+        fill!(password_bytes, 0x00)  # zero out the password bytes
+    end
 end
 # --------------------------------------------------------------------------------------------------
 
diff --git a/test/UnitTests/KenFrench.jl b/test/UnitTests/KenFrench.jl
@@ -2,11 +2,9 @@
 
         import Dates
 
-
-        df_FF3_annual = FinanceRoutines.import_FF3(frequency=:annual);
+        df_FF3_annual = FinanceRoutines.import_FF3(frequency=:annual)
         @test names(df_FF3_annual) == ["datey", "mktrf", "smb", "hml",  "rf"]
-        @test nrow(df_FF3_annual) >= Dates.year(Dates.today()) - 1926 - 1
-
+        @test nrow(df_FF3_annual) >= Dates.year(Dates.today() - Dates.Month(1)) - 1926 - 1
 
         df_FF3_monthly = FinanceRoutines.import_FF3(frequency=:monthly);
         @test names(df_FF3_monthly) == ["datem", "mktrf", "smb", "hml",  "rf"]
diff --git a/test/UnitTests/WRDS.jl b/test/UnitTests/WRDS.jl
@@ -5,18 +5,19 @@
     import Tables: columntable
 
     wrds_conn = FinanceRoutines.open_wrds_pg(
-        get(ENV, "WRDS_USERNAME", ""), 
+        get(ENV, "WRDS_USERNAME", ""),
         get(ENV, "WRDS_PWD", ""))
     @test typeof(wrds_conn) == Connection
 
-    date_range_test       = (Date("2000-01-01"), Date("2002-01-01"))
-    date_range_test_daily = (Date("2002-02-01"), Date("2002-02-05"))
+    date_range_test        = (Date("2000-01-01"), Date("2002-01-01"))
+    date_range_test_recent = (Date("2025-01-01"), Date("2025-03-01"))
+    date_range_test_daily  = (Date("2002-02-01"), Date("2002-02-05"))
 
     # ----------------------------------------------------------------------------------------- #
     @testset "CRSP MSF" begin
         println("\033[1m\033[32m    → running\033[0m: CRSP MSF")
         df_msf = import_MSF(wrds_conn; date_range = date_range_test);
-        build_MSF!(df_msf; clean_cols=true);    
+        build_MSF!(df_msf; clean_cols=true);
 
         @test minimum(skipmissing(df_msf.date)) >= Date("2000-01-01")
         @test maximum(skipmissing(df_msf.date)) <= Date("2002-01-01")
@@ -29,8 +30,17 @@
         println("\033[1m\033[32m    → running\033[0m: CRSP MSF V2")
 
         # new version CIZ of crsp msf
+        @test try
+            import_MSF_v2(wrds_conn; date_range = date_range_test_recent, logging_level=:info)
+            true
+        catch e
+            @error e
+            false
+        end
+
         df_msf_v2 = import_MSF_v2(wrds_conn; date_range = date_range_test, logging_level=:info)
-      
+
+
         # @test subset(df_msf_v2, [:mthcaldt, :mthprcdt] => (x,y) -> isequal.(x, y) ) |> nrow > 0
         @test subset(df_msf_v2, :mthprc => ByRow(x -> !isequal(x, abs(x))) ) |> nrow == 0
         @test subset(df_msf_v2, :mthcap => (x -> isequal.(x, 0) ) ) |> nrow == 0
@@ -39,9 +49,9 @@
         @test maximum(skipmissing(df_msf_v2.mthcaldt)) <= Date("2002-01-01")
         @test nrow(df_msf_v2) > 100_000
 
-        # discrepancy in nrow with df_msf_v2 ... 
+        # discrepancy in nrow with df_msf_v2 ...
 
-    end 
+    end
 
 
     # ----------------------------------------------------------------------------------------- #
@@ -91,11 +101,11 @@
     # ----------------------------------------------------------------------------------------- #
     @testset "CRSP-Compustat LINK" begin
         println("\033[1m\033[32m    → running\033[0m: CRSP-Compustat LINK")
-    
+
         df_linktable = FinanceRoutines.import_ccm_link(wrds_conn)
         # test on table itself
         @test all(map(s -> s in names(df_linktable),
-                  lowercase.(["GVKEY", "LINKPRIM", "LIID", "LINKTYPE", "PERMNO", "LPERMCO", 
+                  lowercase.(["GVKEY", "LINKPRIM", "LIID", "LINKTYPE", "PERMNO", "LPERMCO",
                               "LINKDT", "LINKENDDT"])))
         @test isempty(setdiff(unique(df_linktable.linktype), ["LU", "LC", "LS"]))
         @test isempty(setdiff(unique(df_linktable.linkprim), ["P", "C"]))
@@ -104,17 +114,17 @@
         df_msf_v2 = import_MSF_v2(wrds_conn; date_range = date_range_test, logging_level=:info)
         df_msf_v2 = select(df_msf_v2, :permno, :mthcaldt=>:date, :datem, :mthret=>:ret, :mthcap)
 
-        df_funda  =  @p import_Funda(wrds_conn; date_range = date_range_test, 
+        df_funda  =  @p import_Funda(wrds_conn; date_range = date_range_test,
             variables=["PPENT", "NAICSH"]) |>
             build_Funda(__; clean_cols=true)
 
         df_msf_v2 = link_MSF(df_linktable, df_msf_v2) # merge gvkey on monthly stock file
-        @test @p df_msf_v2 |> unique(__, [:permno, :gvkey]) |> 
+        @test @p df_msf_v2 |> unique(__, [:permno, :gvkey]) |>
             groupby(__, :permno) |> combine(__, nrow) |> __.nrow |> unique |>
             all( .<=(2) )
-        
+
         df_ccm = innerjoin(df_msf_v2, df_funda, on = [:gvkey, :datey], matchmissing=:notequal)
-        @test @p df_msf_v2 |> unique(__, [:permno, :gvkey, :date, :datey]) |> 
+        @test @p df_msf_v2 |> unique(__, [:permno, :gvkey, :date, :datey]) |>
             groupby(__, [:permno, :datey]) |> combine(__, nrow) |> __.nrow |> unique |>
             all( .<=(12) )
 
@@ -126,5 +136,3 @@
 
 
 end
-
-
diff --git a/test/runtests.jl b/test/runtests.jl
@@ -12,10 +12,10 @@ import DataPipes: @p
 
 # --------------------------------------------------------------------------------------------------
 const testsuite = [
-    "KenFrench", 
-    "WRDS", 
+    "KenFrench",
+    "WRDS",
     "betas",
-    "Yields", 
+    "Yields",
 ]
 # --------------------------------------------------------------------------------------------------

	FinanceRoutines.jl Financial data routines for Julia
	Log \| Files \| Refs \| README \| LICENSE

M	.gitignore	\|	1	+
M	Project.toml	\|	21	++++++++++++++++-----
M	src/FinanceRoutines.jl	\|	3	++-
M	src/ImportCRSP.jl	\|	28	+++++++++++++++-------------
M	src/ImportFamaFrench.jl	\|	20	++++++++++----------
M	src/Utilities.jl	\|	9	++++++---
M	test/UnitTests/KenFrench.jl	\|	6	++----
M	test/UnitTests/WRDS.jl	\|	38	+++++++++++++++++++++++---------------
M	test/runtests.jl	\|	6	+++---