FinanceRoutines.jl

Financial data routines for Julia
Log | Files | Refs | README | LICENSE

ImportComp.jl (15703B)


      1 # ------------------------------------------------------------------------------------------
      2 # ImportComp.jl
      3 
      4 # Collection of functions that import
      5 #  compustat data into julia
      6 
      7 # List of exported functions
      8 # export import_Funda
      9 # export build_Funda
     10 # ------------------------------------------------------------------------------------------
     11 
     12 
     13 
     14 # ------------------------------------------------------------------------------------------
     15 """
     16     import_Funda(wrds_conn; date_range, variables)
     17     import_Funda(;
     18         date_range::Tuple{Date, Date} = (Date("1900-01-01"), Dates.today()),
     19         variables::String = "", user="", password="")
     20 
     21 Import the funda file from CapitalIQ Compustat on WRDS Postgres server
     22 
     23 # Arguments
     24 - `wrds_conn::Connection`: An existing Postgres connection to WRDS; creates one if empty
     25 
     26 # Keywords
     27 - `date_range::Tuple{Date, Date}`: A tuple of dates to select data (limits the download size)
     28 - `variables::Vector{String}`: A vector of String of additional variable to include in the download
     29 - `user::String`: username to log into the WRDS cli; default to ask user for authentication
     30 - `password::String`: password to log into the WRDS cli
     31 
     32 # Returns
     33 - `df_funda::DataFrame`: DataFrame with compustat funda file
     34 """
     35 function import_Funda(wrds_conn::Connection;
     36     date_range::Tuple{Date,Date}=(Date("1900-01-01"), Dates.today()),
     37     variables::Union{Nothing,Vector{String}}=nothing,
     38     filter_variables=Dict(:CURCD => "USD")  # if you want something fanciers ... export variable and do it later
     39 )
     40 
     41     date_range = _validate_date_range(date_range; earliest=Date("1950-01-01"))
     42 
     43     var_funda = ["GVKEY", "DATADATE", "SICH", "FYR", "FYEAR",
     44         "AT", "LT", "SALE", "EBITDA", "CAPX", "NI", "DV", "CEQ", "CEQL", "SEQ",
     45         "TXDITC", "TXP", "TXDB", "ITCB", "DVT", "PSTK", "PSTKL", "PSTKRV"]
     46     !isnothing(variables) && append!(var_funda, uppercase.(variables))
     47     !isnothing(filter_variables) && append!(var_funda, uppercase.(string.(keys(filter_variables))))
     48 
     49     # Validate variables against actual schema (falls back to hardcoded list)
     50     valid_cols = _get_funda_columns(wrds_conn)
     51     var_check = setdiff(var_funda, valid_cols)
     52     !isempty(var_check) && (@warn "Queried variables not in comp.funda: $(join(var_check, ", "))")
     53     filter!(in(valid_cols), var_funda)
     54 
     55     # set up the query for funda (dates parameterized to prevent SQL injection)
     56     postgre_query_funda = """
     57         SELECT $(join(unique(var_funda), ","))
     58             FROM comp.funda
     59             WHERE INDFMT = 'INDL' AND DATAFMT = 'STD' AND CONSOL = 'C' AND POPSRC = 'D'
     60                 AND DATADATE >= \$1
     61                 AND DATADATE <= \$2
     62     """
     63     res_q_funda = execute(wrds_conn, postgre_query_funda, (date_range[1], date_range[2]))
     64     df_funda = DataFrame(columntable(res_q_funda))
     65 
     66     # run the filter
     67     !isnothing(filter_variables) && for (key, value) in Dict(lowercase(string(k)) => v for (k, v) in filter_variables)
     68         filter!(row -> (ismissing(row[key])) | (row[key] == value), df_funda) # we keep missing ...
     69     end
     70 
     71     # clean up the dataframe
     72     transform!(df_funda,
     73         names(df_funda, check_integer.(eachcol(df_funda))) .=> (x -> convert.(Union{Missing,Int}, x));
     74         renamecols=false)
     75     df_funda[!, :gvkey] .= parse.(Int, df_funda[!, :gvkey])
     76 
     77     return df_funda
     78 
     79 end
     80 
     81 function import_Funda(;
     82     date_range::Tuple{Date,Date}=(Date("1900-01-01"), Dates.today()),
     83     variables::Union{Nothing,Vector{String}}=nothing,
     84     filter_variables::Dict{Symbol,Any}=Dict(:CURCD => "USD"),
     85     user::String="", password::String="")
     86 
     87     with_wrds_connection(user=user, password=password) do conn
     88         import_Funda(conn; date_range=date_range, variables=variables)
     89     end
     90 end
     91 # ------------------------------------------------------------------------------------------
     92 
     93 
     94 
     95 
     96 # ------------------------------------------------------------------------------------------
     97 """
     98     build_Funda!(df_funda::DataFrame; save)
     99 
    100 Clean up the compustat funda file download from crsp (see `import_Funda`)
    101 
    102 # Arguments
    103 - `df_funda::DataFrame`: A standard dataframe with compustat data (minimum variables are in `import_Funda`)
    104 
    105 # Keywords
    106 - `save::String`: Save a gzip version of the data on path `\$save/funda.csv.gz`; Default does not save the data.
    107 - `clean_cols::Bool`: Clean up the columns of the dataframe to be of type Float64; Default is `false` and leaves the Decimal type intact
    108 
    109 # Returns
    110 - `df_funda::DataFrame`: DataFrame with compustat funda file "cleaned"
    111 """
    112 function build_Funda!(df::DataFrame;
    113     clean_cols::Bool=false,
    114     verbose::Bool=false
    115 )
    116 
    117     verbose && (@info "--- Creating clean funda panel")
    118 
    119     # define book equity value
    120     verbose && (@info ". Creating book equity")
    121     transform!(df, 
    122         AsTable([:seq, :ceq, :pstk, :at, :lt, :txditc, :txdb, :itcb, :pstkrv, :pstkl]) =>
    123         ByRow(r -> coalesce(r.seq, r.ceq + r.pstk, r.at - r.lt) +
    124                    coalesce(r.txditc, r.txdb + r.itcb, 0) -
    125                    coalesce(r.pstkrv, r.pstkl, r.pstk, 0)) => 
    126         :be)
    127 
    128     df[isless.(df.be, 0), :be] .= missing
    129     transform!(df, :datadate => ByRow(year) => :datey)
    130     sort!(df, [:gvkey, :datey, :datadate])
    131     unique!(df, [:gvkey, :datey], keep=:last) # last obs
    132 
    133     verbose && (@info ". Cleaning superfluous columns INDFMT, etc.")
    134     select!(df, Not(intersect(names(df), ["indfmt", "datafmt", "consol", "popsrc", "curcd"])))
    135 
    136     if clean_cols
    137         verbose && (@info ". Converting decimal type columns to Float64.")
    138         for col in names(df)
    139             if eltype(df[!, col]) == Union{Missing,Decimal}
    140                 df[!, col] = convert.(Union{Missing,Float64}, df[!, col])
    141             end
    142         end
    143     end
    144 
    145     return df
    146 end
    147 
    148 function build_Funda(df::DataFrame;
    149     clean_cols::Bool=false,
    150     verbose::Bool=false
    151 )
    152 
    153     df_ret = copy(df)
    154     build_Funda!(df_ret, clean_cols=clean_cols, verbose=verbose)
    155 
    156     return df_ret
    157     
    158 end
    159 
    160 # ------------------------------------------------------------------------------------------
    161 
    162 
    163 # ------------------------------------------------------------------------------------------
    164 # Session cache for funda columns — queried once per session, falls back to hardcoded list
    165 const _funda_columns_cache = Ref{Vector{String}}()
    166 
    167 """
    168     _get_funda_columns(wrds_conn) -> Vector{String}
    169 
    170 Get valid column names for comp.funda. Queries the schema at runtime and caches
    171 for the session. Falls back to the hardcoded `compd_funda` list on error.
    172 """
    173 function _get_funda_columns(wrds_conn)
    174     if isassigned(_funda_columns_cache)
    175         return _funda_columns_cache[]
    176     end
    177     try
    178         cols = _get_postgres_columns("comp", "funda"; wrds_conn=wrds_conn)
    179         _funda_columns_cache[] = cols
    180         return cols
    181     catch e
    182         @warn "Could not query comp.funda schema, using hardcoded variable list" exception=(e, catch_backtrace())
    183         return compd_funda
    184     end
    185 end
    186 # ------------------------------------------------------------------------------------------
    187 
    188 
    189 # ------------------------------------------------------------------------------------------
    190 # Hardcoded fallback list of comp.funda columns (last verified: 2025-02)
    191 const compd_funda = [
    192     "GVKEY", "DATADATE", "FYEAR", "INDFMT", "CONSOL", "POPSRC", "DATAFMT", "TIC", "CUSIP", "CONM", 
    193     "ACCTCHG", "ACCTSTD", "ACQMETH", "ADRR", "AJEX", "AJP", "BSPR", "COMPST", "CURCD", "CURNCD", 
    194     "CURRTR", "CURUSCN", "FINAL", "FYR", "ISMOD", "LTCM", "OGM", "PDDUR", "SCF", "SRC", "STALT", 
    195     "UDPL", "UPD", "APDEDATE", "FDATE", "PDATE", "ACCHG", "ACCO", "ACCRT", "ACDO", "ACO", "ACODO", 
    196     "ACOMINC", "ACOX", "ACOXAR", "ACQAO", "ACQCSHI", "ACQGDWL", "ACQIC",
    197     "ACQINTAN", "ACQINVT", "ACQLNTAL", "ACQNIINTC", "ACQPPE", "ACQSC", "ACT", "ADPAC", "AEDI", "AFUDCC", "AFUDCI", "ALDO", "AM", "AMC", "AMDC", "AMGW", "ANO",
    198     "AO", "AOCIDERGL", "AOCIOTHER", "AOCIPEN", "AOCISECGL", "AODO", "AOL2", "AOLOCH", "AOX", "AP", "APALCH", "APB", "APC", "APOFS", "AQA", "AQC", "AQD", "AQEPS",
    199     "AQI", "AQP", "AQPL1", "AQS", "ARB", "ARC", "ARCE", "ARCED", "ARCEEPS", "ARTFS", "AT", "AUL3", "AUTXR", "BALR", "BANLR", "BAST", "BASTR", "BATR", "BCEF", "BCLR",
    200     "BCLTBL", "BCNLR", "BCRBL", "BCT", "BCTBL", "BCTR", "BILLEXCE", "BKVLPS", "BLTBL", "CA", "CAPR1", "CAPR2", "CAPR3", "CAPS", "CAPSFT", "CAPX", "CAPXV", "CB", "CBI",
    201     "CDPAC", "CDVC", "CEIEXBILL", "CEQ", "CEQL", "CEQT", "CFBD", "CFERE", "CFO", "CFPDO", "CGA", "CGRI", "CGTI", "CGUI", "CH", "CHE", "CHECH", "CHS", "CI", "CIBEGNI",
    202     "CICURR", "CIDERGL", "CIMII", "CIOTHER", "CIPEN", "CISECGL", "CITOTAL", "CLD2", "CLD3", "CLD4", "CLD5", "CLFC", "CLFX", "CLG", "CLIS", "CLL", "CLLC", "CLO", "CLRLL",
    203     "CLT", "CMP", "CNLTBL", "COGS", "CPCBL", "CPDOI", "CPNLI", "CPPBL", "CPREI", "CRV", "CRVNLI", "CSHFD", "CSHI", "CSHO", "CSHPRI", "CSHR", "CSHRC", "CSHRP", "CSHRSO",
    204     "CSHRT", "CSHRW", "CSTK", "CSTKCV", "CSTKE", "DBI", "DC", "DCLO", "DCOM", "DCPSTK", "DCS", "DCVSR", "DCVSUB", "DCVT", "DD", "DD1", "DD2", "DD3", "DD4", "DD5", "DEPC",
    205     "DERAC", "DERALT", "DERHEDGL", "DERLC", "DERLLT", "DFPAC", "DFS", "DFXA", "DILADJ", "DILAVX", "DLC", "DLCCH", "DLTIS", "DLTO", "DLTP", "DLTR", "DLTSUB", "DLTT", "DM",
    206     "DN", "DO", "DONR", "DP", "DPACB", "DPACC", "DPACLI", "DPACLS", "DPACME", "DPACNR", "DPACO", "DPACRE", "DPACT", "DPC", "DPDC", "DPLTB", "DPRET", "DPSC", "DPSTB", "DPTB",
    207     "DPTC", "DPTIC", "DPVIEB", "DPVIO", "DPVIR", "DRC", "DRCI", "DRLT", "DS", "DT", "DTEA", "DTED", "DTEEPS", "DTEP", "DUDD", "DV", "DVC", "DVDNP", "DVINTF", "DVP", "DVPA",
    208     "DVPD", "DVPDP", "DVPIBB", "DVRPIV", "DVRRE", "DVSCO", "DVT", "DXD2", "DXD3", "DXD4", "DXD5", "EA", "EBIT", "EBITDA", "EIEA", "EMOL", "EMP", "EPSFI", "EPSFX", "EPSPI",
    209     "EPSPX", "ESOPCT", "ESOPDLT", "ESOPNR", "ESOPR", "ESOPT", "ESUB", "ESUBC", "EXCADJ", "EXRE", "FATB", "FATC", "FATD", "FATE", "FATL", "FATN", "FATO", "FATP", "FCA", "FDFR",
    210     "FEA", "FEL", "FFO", "FFS", "FIAO", "FINACO", "FINAO", "FINCF", "FINCH", "FINDLC", "FINDLT", "FINIVST", "FINLCO", "FINLTO", "FINNP", "FINRECC", "FINRECLT", "FINREV", "FINXINT",
    211     "FINXOPR", "FOPO", "FOPOX", "FOPT", "FSRCO", "FSRCT", "FUSEO", "FUSET", "GBBL", "GDWL", "GDWLAM", "GDWLIA", "GDWLID", "GDWLIEPS", "GDWLIP", "GEQRV", "GLA", "GLCEA", "GLCED",
    212     "GLCEEPS", "GLCEP", "GLD", "GLEPS", "GLIV", "GLP", "GOVGR", "GOVTOWN", "GP", "GPHBL", "GPLBL", "GPOBL", "GPRBL", "GPTBL", "GWO", "HEDGEGL", "IAEQ", "IAEQCI", "IAEQMI", "IAFICI",
    213     "IAFXI", "IAFXMI", "IALI", "IALOI", "IALTI", "IAMLI", "IAOI", "IAPLI", "IAREI", "IASCI", "IASMI", "IASSI", "IASTI", "IATCI", "IATI", "IATMI", "IAUI", "IB", "IBADJ", "IBBL",
    214     "IBC", "IBCOM", "IBKI", "IBMII", "ICAPT", "IDIIS", "IDILB", "IDILC", "IDIS", "IDIST", "IDIT", "IDITS", "IIRE", "INITB", "INTAN", "INTANO", "INTC", "INTPN", "INVCH", "INVFG",
    215     "INVO", "INVOFS", "INVREH", "INVREI", "INVRES", "INVRM", "INVT", "INVWIP", "IOBD", "IOI", "IORE", "IP", "IPABL", "IPC", "IPHBL", "IPLBL", "IPOBL", "IPTBL", "IPTI", "IPV",
    216     "IREI", "IRENT", "IRII", "IRLI", "IRNLI", "IRSI", "ISEQ", "ISEQC", "ISEQM", "ISFI", "ISFXC", "ISFXM", "ISGR", "ISGT", "ISGU", "ISLG", "ISLGC", "ISLGM", "ISLT", "ISNG",
    217     "ISNGC", "ISNGM", "ISOTC", "ISOTH", "ISOTM", "ISSC", "ISSM", "ISSU", "IST", "ISTC", "ISTM", "ISUT", "ITCB", "ITCC", "ITCI", "IVACO", "IVAEQ", "IVAO", "IVCH", "IVGOD",
    218     "IVI", "IVNCF", "IVPT", "IVST", "IVSTCH", "LCABG", "LCACL", "LCACR", "LCAG", "LCAL", "LCALT", "LCAM", "LCAO", "LCAST", "LCAT", "LCO", "LCOX", "LCOXAR", "LCOXDR", "LCT",
    219     "LCUACU", "LI", "LIF", "LIFR", "LIFRP", "LLOML", "LLOO", "LLOT", "LLRCI", "LLRCR", "LLWOCI", "LLWOCR", "LNO", "LO", "LOL2", "LOXDR", "LQPL1", "LRV", "LS", "LSE", "LST",
    220     "LT", "LUL3", "MIB", "MIBN", "MIBT", "MII", "MRC1", "MRC2", "MRC3", "MRC4", "MRC5", "MRCT", "MRCTA", "MSA", "MSVRV", "MTL", "NAT", "NCO", "NFSR", "NI", "NIADJ", "NIECI",
    221     "NIINT", "NIINTPFC", "NIINTPFP", "NIIT", "NIM", "NIO", "NIPFC", "NIPFP", "NIT", "NITS", "NOPI", "NOPIO", "NP", "NPANL", "NPAORE", "NPARL", "NPAT", "NRTXT", "NRTXTD",
    222     "NRTXTEPS", "OANCF", "OB", "OIADP", "OIBDP", "OPEPS", "OPILI", "OPINCAR", "OPINI", "OPIOI", "OPIRI", "OPITI", "OPREPSX", "OPTCA", "OPTDR", "OPTEX", "OPTEXD", "OPTFVGR",
    223     "OPTGR", "OPTLIFE", "OPTOSBY", "OPTOSEY", "OPTPRCBY", "OPTPRCCA", "OPTPRCEX", "OPTPRCEY", "OPTPRCGR", "OPTPRCWA", "OPTRFR", "OPTVOL", "PALR", "PANLR", "PATR", "PCL",
    224     "PCLR", "PCNLR", "PCTR", "PDVC", "PI", "PIDOM", "PIFO", "PLL", "PLTBL", "PNCA", "PNCAD", "PNCAEPS", "PNCIA", "PNCID", "PNCIEPS", "PNCIP", "PNCWIA", "PNCWID", "PNCWIEPS",
    225     "PNCWIP", "PNLBL", "PNLI", "PNRSHO", "POBL", "PPCBL", "PPEGT", "PPENB", "PPENC", "PPENLI", "PPENLS", "PPENME", "PPENNR", "PPENO", "PPENT", "PPEVBB", "PPEVEB", "PPEVO",
    226     "PPEVR", "PPPABL", "PPPHBL", "PPPOBL", "PPPTBL", "PRC", "PRCA", "PRCAD", "PRCAEPS", "PREBL", "PRI", "PRODV", "PRSHO", "PRSTKC", "PRSTKCC", "PRSTKPC", "PRVT", "PSTK",
    227     "PSTKC", "PSTKL", "PSTKN", "PSTKR", "PSTKRV", "PTBL", "PTRAN", "PVCL", "PVO", "PVON", "PVPL", "PVT", "PWOI", "RADP", "RAGR", "RARI", "RATI", "RCA", "RCD", "RCEPS",
    228     "RCL", "RCP", "RDIP", "RDIPA", "RDIPD", "RDIPEPS", "RDP", "RE", "REA", "REAJO", "RECCH", "RECCO", "RECD", "RECT", "RECTA", "RECTR", "RECUB", "RET", "REUNA", "REUNR",
    229     "REVT", "RIS", "RLL", "RLO", "RLP", "RLRI", "RLT", "RMUM", "RPAG", "RRA", "RRD", "RREPS", "RRP", "RSTCHE", "RSTCHELT", "RVBCI", "RVBPI", "RVBTI", "RVDO", "RVDT",
    230     "RVEQT", "RVLRV", "RVNO", "RVNT", "RVRI", "RVSI", "RVTI", "RVTXR", "RVUPI", "RVUTX", "SAA", "SAL", "SALE", "SALEPFC", "SALEPFP", "SBDC", "SC", "SCO", "SCSTKC",
    231     "SECU", "SEQ", "SEQO", "SETA", "SETD", "SETEPS", "SETP", "SIV", "SPCE", "SPCED", "SPCEEPS", "SPI", "SPID", "SPIEPS", "SPIOA", "SPIOP", "SPPE", "SPPIV", "SPSTKC",
    232     "SRET", "SRT", "SSNP", "SSTK", "STBO", "STIO", "STKCO", "STKCPA", "TDC", "TDSCD", "TDSCE", "TDSG", "TDSLG", "TDSMM", "TDSNG", "TDSO", "TDSS", "TDST", "TEQ", "TF",
    233     "TFVA", "TFVCE", "TFVL", "TIE", "TII", "TLCF", "TRANSA", "TSA", "TSAFC", "TSO", "TSTK", "TSTKC", "TSTKME", "TSTKN", "TSTKP", "TXACH", "TXBCO", "TXBCOF", "TXC",
    234     "TXDB", "TXDBA", "TXDBCA", "TXDBCL", "TXDC", "TXDFED", "TXDFO", "TXDI", "TXDITC", "TXDS", "TXEQA", "TXEQII", "TXFED", "TXFO", "TXNDB", "TXNDBA", "TXNDBL", "TXNDBR",
    235     "TXO", "TXP", "TXPD", "TXR", "TXS", "TXT", "TXTUBADJUST", "TXTUBBEGIN", "TXTUBEND", "TXTUBMAX", "TXTUBMIN", "TXTUBPOSDEC", "TXTUBPOSINC", "TXTUBPOSPDEC", "TXTUBPOSPINC",
    236     "TXTUBSETTLE", "TXTUBSOFLIMIT", "TXTUBTXTR", "TXTUBXINTBS", "TXTUBXINTIS", "TXVA", "TXW", "UAOLOCH", "UAOX", "UAPT", "UCAPS", "UCCONS", "UCEQ", "UCUSTAD", "UDCOPRES",
    237     "UDD", "UDFCC", "UDMB", "UDOLT", "UDPCO", "UDPFA", "UDVP", "UFRETSD", "UGI", "UI", "UINVT", "ULCM", "ULCO", "UNIAMI", "UNL", "UNNP", "UNNPL", "UNOPINC", "UNWCC",
    238     "UOIS", "UOPI", "UOPRES", "UPDVP", "UPMCSTK", "UPMPF", "UPMPFS", "UPMSUBP", "UPSTK", "UPSTKC", "UPSTKSF", "URECT", "URECTR", "UREVUB", "USPI", "USTDNC", "USUBDVP",
    239     "USUBPSTK", "UTFDOC", "UTFOSC", "UTME", "UTXFED", "UWKCAPC", "UXINST", "UXINTD", "VPAC", "VPO", "WCAP", "WCAPC", "WCAPCH", "WDA", "WDD", "WDEPS", "WDP", "XACC",
    240     "XAD", "XAGO", "XAGT", "XCOM", "XCOMI", "XDEPL", "XDP", "XDVRE", "XEQO", "XI", "XIDO", "XIDOC", "XINDB", "XINDC", "XINS", "XINST", "XINT", "XINTD", "XINTOPT",
    241     "XIVI", "XIVRE", "XLR", "XNBI", "XNF", "XNINS", "XNITB", "XOBD", "XOI", "XOPR", "XOPRAR", "XOPTD", "XOPTEPS", "XORE", "XPP", "XPR", "XRD", "XRDP", "XRENT", "XS",
    242     "XSGA", "XSTF", "XSTFO", "XSTFWS", "XT", "XUW", "XUWLI", "XUWNLI", "XUWOI", "XUWREI", "XUWTI", "IID", "EXCHG", "CIK", "COSTAT", "FIC", "NAICSH", "SICH", "CSHTR_C",
    243     "DVPSP_C", "DVPSX_C", "PRCC_C", "PRCH_C", "PRCL_C", "ADJEX_C", "CSHTR_F", "DVPSP_F", "DVPSX_F", "MKVALT", "PRCC_F", "PRCH_F", "PRCL_F", "ADJEX_F", "RANK", "AU",
    244     "AUOP", "AUOPIC", "CEOSO", "CFOSO"]