ImportComp.jl (15703B)
1 # ------------------------------------------------------------------------------------------ 2 # ImportComp.jl 3 4 # Collection of functions that import 5 # compustat data into julia 6 7 # List of exported functions 8 # export import_Funda 9 # export build_Funda 10 # ------------------------------------------------------------------------------------------ 11 12 13 14 # ------------------------------------------------------------------------------------------ 15 """ 16 import_Funda(wrds_conn; date_range, variables) 17 import_Funda(; 18 date_range::Tuple{Date, Date} = (Date("1900-01-01"), Dates.today()), 19 variables::String = "", user="", password="") 20 21 Import the funda file from CapitalIQ Compustat on WRDS Postgres server 22 23 # Arguments 24 - `wrds_conn::Connection`: An existing Postgres connection to WRDS; creates one if empty 25 26 # Keywords 27 - `date_range::Tuple{Date, Date}`: A tuple of dates to select data (limits the download size) 28 - `variables::Vector{String}`: A vector of String of additional variable to include in the download 29 - `user::String`: username to log into the WRDS cli; default to ask user for authentication 30 - `password::String`: password to log into the WRDS cli 31 32 # Returns 33 - `df_funda::DataFrame`: DataFrame with compustat funda file 34 """ 35 function import_Funda(wrds_conn::Connection; 36 date_range::Tuple{Date,Date}=(Date("1900-01-01"), Dates.today()), 37 variables::Union{Nothing,Vector{String}}=nothing, 38 filter_variables=Dict(:CURCD => "USD") # if you want something fanciers ... export variable and do it later 39 ) 40 41 date_range = _validate_date_range(date_range; earliest=Date("1950-01-01")) 42 43 var_funda = ["GVKEY", "DATADATE", "SICH", "FYR", "FYEAR", 44 "AT", "LT", "SALE", "EBITDA", "CAPX", "NI", "DV", "CEQ", "CEQL", "SEQ", 45 "TXDITC", "TXP", "TXDB", "ITCB", "DVT", "PSTK", "PSTKL", "PSTKRV"] 46 !isnothing(variables) && append!(var_funda, uppercase.(variables)) 47 !isnothing(filter_variables) && append!(var_funda, uppercase.(string.(keys(filter_variables)))) 48 49 # Validate variables against actual schema (falls back to hardcoded list) 50 valid_cols = _get_funda_columns(wrds_conn) 51 var_check = setdiff(var_funda, valid_cols) 52 !isempty(var_check) && (@warn "Queried variables not in comp.funda: $(join(var_check, ", "))") 53 filter!(in(valid_cols), var_funda) 54 55 # set up the query for funda (dates parameterized to prevent SQL injection) 56 postgre_query_funda = """ 57 SELECT $(join(unique(var_funda), ",")) 58 FROM comp.funda 59 WHERE INDFMT = 'INDL' AND DATAFMT = 'STD' AND CONSOL = 'C' AND POPSRC = 'D' 60 AND DATADATE >= \$1 61 AND DATADATE <= \$2 62 """ 63 res_q_funda = execute(wrds_conn, postgre_query_funda, (date_range[1], date_range[2])) 64 df_funda = DataFrame(columntable(res_q_funda)) 65 66 # run the filter 67 !isnothing(filter_variables) && for (key, value) in Dict(lowercase(string(k)) => v for (k, v) in filter_variables) 68 filter!(row -> (ismissing(row[key])) | (row[key] == value), df_funda) # we keep missing ... 69 end 70 71 # clean up the dataframe 72 transform!(df_funda, 73 names(df_funda, check_integer.(eachcol(df_funda))) .=> (x -> convert.(Union{Missing,Int}, x)); 74 renamecols=false) 75 df_funda[!, :gvkey] .= parse.(Int, df_funda[!, :gvkey]) 76 77 return df_funda 78 79 end 80 81 function import_Funda(; 82 date_range::Tuple{Date,Date}=(Date("1900-01-01"), Dates.today()), 83 variables::Union{Nothing,Vector{String}}=nothing, 84 filter_variables::Dict{Symbol,Any}=Dict(:CURCD => "USD"), 85 user::String="", password::String="") 86 87 with_wrds_connection(user=user, password=password) do conn 88 import_Funda(conn; date_range=date_range, variables=variables) 89 end 90 end 91 # ------------------------------------------------------------------------------------------ 92 93 94 95 96 # ------------------------------------------------------------------------------------------ 97 """ 98 build_Funda!(df_funda::DataFrame; save) 99 100 Clean up the compustat funda file download from crsp (see `import_Funda`) 101 102 # Arguments 103 - `df_funda::DataFrame`: A standard dataframe with compustat data (minimum variables are in `import_Funda`) 104 105 # Keywords 106 - `save::String`: Save a gzip version of the data on path `\$save/funda.csv.gz`; Default does not save the data. 107 - `clean_cols::Bool`: Clean up the columns of the dataframe to be of type Float64; Default is `false` and leaves the Decimal type intact 108 109 # Returns 110 - `df_funda::DataFrame`: DataFrame with compustat funda file "cleaned" 111 """ 112 function build_Funda!(df::DataFrame; 113 clean_cols::Bool=false, 114 verbose::Bool=false 115 ) 116 117 verbose && (@info "--- Creating clean funda panel") 118 119 # define book equity value 120 verbose && (@info ". Creating book equity") 121 transform!(df, 122 AsTable([:seq, :ceq, :pstk, :at, :lt, :txditc, :txdb, :itcb, :pstkrv, :pstkl]) => 123 ByRow(r -> coalesce(r.seq, r.ceq + r.pstk, r.at - r.lt) + 124 coalesce(r.txditc, r.txdb + r.itcb, 0) - 125 coalesce(r.pstkrv, r.pstkl, r.pstk, 0)) => 126 :be) 127 128 df[isless.(df.be, 0), :be] .= missing 129 transform!(df, :datadate => ByRow(year) => :datey) 130 sort!(df, [:gvkey, :datey, :datadate]) 131 unique!(df, [:gvkey, :datey], keep=:last) # last obs 132 133 verbose && (@info ". Cleaning superfluous columns INDFMT, etc.") 134 select!(df, Not(intersect(names(df), ["indfmt", "datafmt", "consol", "popsrc", "curcd"]))) 135 136 if clean_cols 137 verbose && (@info ". Converting decimal type columns to Float64.") 138 for col in names(df) 139 if eltype(df[!, col]) == Union{Missing,Decimal} 140 df[!, col] = convert.(Union{Missing,Float64}, df[!, col]) 141 end 142 end 143 end 144 145 return df 146 end 147 148 function build_Funda(df::DataFrame; 149 clean_cols::Bool=false, 150 verbose::Bool=false 151 ) 152 153 df_ret = copy(df) 154 build_Funda!(df_ret, clean_cols=clean_cols, verbose=verbose) 155 156 return df_ret 157 158 end 159 160 # ------------------------------------------------------------------------------------------ 161 162 163 # ------------------------------------------------------------------------------------------ 164 # Session cache for funda columns — queried once per session, falls back to hardcoded list 165 const _funda_columns_cache = Ref{Vector{String}}() 166 167 """ 168 _get_funda_columns(wrds_conn) -> Vector{String} 169 170 Get valid column names for comp.funda. Queries the schema at runtime and caches 171 for the session. Falls back to the hardcoded `compd_funda` list on error. 172 """ 173 function _get_funda_columns(wrds_conn) 174 if isassigned(_funda_columns_cache) 175 return _funda_columns_cache[] 176 end 177 try 178 cols = _get_postgres_columns("comp", "funda"; wrds_conn=wrds_conn) 179 _funda_columns_cache[] = cols 180 return cols 181 catch e 182 @warn "Could not query comp.funda schema, using hardcoded variable list" exception=(e, catch_backtrace()) 183 return compd_funda 184 end 185 end 186 # ------------------------------------------------------------------------------------------ 187 188 189 # ------------------------------------------------------------------------------------------ 190 # Hardcoded fallback list of comp.funda columns (last verified: 2025-02) 191 const compd_funda = [ 192 "GVKEY", "DATADATE", "FYEAR", "INDFMT", "CONSOL", "POPSRC", "DATAFMT", "TIC", "CUSIP", "CONM", 193 "ACCTCHG", "ACCTSTD", "ACQMETH", "ADRR", "AJEX", "AJP", "BSPR", "COMPST", "CURCD", "CURNCD", 194 "CURRTR", "CURUSCN", "FINAL", "FYR", "ISMOD", "LTCM", "OGM", "PDDUR", "SCF", "SRC", "STALT", 195 "UDPL", "UPD", "APDEDATE", "FDATE", "PDATE", "ACCHG", "ACCO", "ACCRT", "ACDO", "ACO", "ACODO", 196 "ACOMINC", "ACOX", "ACOXAR", "ACQAO", "ACQCSHI", "ACQGDWL", "ACQIC", 197 "ACQINTAN", "ACQINVT", "ACQLNTAL", "ACQNIINTC", "ACQPPE", "ACQSC", "ACT", "ADPAC", "AEDI", "AFUDCC", "AFUDCI", "ALDO", "AM", "AMC", "AMDC", "AMGW", "ANO", 198 "AO", "AOCIDERGL", "AOCIOTHER", "AOCIPEN", "AOCISECGL", "AODO", "AOL2", "AOLOCH", "AOX", "AP", "APALCH", "APB", "APC", "APOFS", "AQA", "AQC", "AQD", "AQEPS", 199 "AQI", "AQP", "AQPL1", "AQS", "ARB", "ARC", "ARCE", "ARCED", "ARCEEPS", "ARTFS", "AT", "AUL3", "AUTXR", "BALR", "BANLR", "BAST", "BASTR", "BATR", "BCEF", "BCLR", 200 "BCLTBL", "BCNLR", "BCRBL", "BCT", "BCTBL", "BCTR", "BILLEXCE", "BKVLPS", "BLTBL", "CA", "CAPR1", "CAPR2", "CAPR3", "CAPS", "CAPSFT", "CAPX", "CAPXV", "CB", "CBI", 201 "CDPAC", "CDVC", "CEIEXBILL", "CEQ", "CEQL", "CEQT", "CFBD", "CFERE", "CFO", "CFPDO", "CGA", "CGRI", "CGTI", "CGUI", "CH", "CHE", "CHECH", "CHS", "CI", "CIBEGNI", 202 "CICURR", "CIDERGL", "CIMII", "CIOTHER", "CIPEN", "CISECGL", "CITOTAL", "CLD2", "CLD3", "CLD4", "CLD5", "CLFC", "CLFX", "CLG", "CLIS", "CLL", "CLLC", "CLO", "CLRLL", 203 "CLT", "CMP", "CNLTBL", "COGS", "CPCBL", "CPDOI", "CPNLI", "CPPBL", "CPREI", "CRV", "CRVNLI", "CSHFD", "CSHI", "CSHO", "CSHPRI", "CSHR", "CSHRC", "CSHRP", "CSHRSO", 204 "CSHRT", "CSHRW", "CSTK", "CSTKCV", "CSTKE", "DBI", "DC", "DCLO", "DCOM", "DCPSTK", "DCS", "DCVSR", "DCVSUB", "DCVT", "DD", "DD1", "DD2", "DD3", "DD4", "DD5", "DEPC", 205 "DERAC", "DERALT", "DERHEDGL", "DERLC", "DERLLT", "DFPAC", "DFS", "DFXA", "DILADJ", "DILAVX", "DLC", "DLCCH", "DLTIS", "DLTO", "DLTP", "DLTR", "DLTSUB", "DLTT", "DM", 206 "DN", "DO", "DONR", "DP", "DPACB", "DPACC", "DPACLI", "DPACLS", "DPACME", "DPACNR", "DPACO", "DPACRE", "DPACT", "DPC", "DPDC", "DPLTB", "DPRET", "DPSC", "DPSTB", "DPTB", 207 "DPTC", "DPTIC", "DPVIEB", "DPVIO", "DPVIR", "DRC", "DRCI", "DRLT", "DS", "DT", "DTEA", "DTED", "DTEEPS", "DTEP", "DUDD", "DV", "DVC", "DVDNP", "DVINTF", "DVP", "DVPA", 208 "DVPD", "DVPDP", "DVPIBB", "DVRPIV", "DVRRE", "DVSCO", "DVT", "DXD2", "DXD3", "DXD4", "DXD5", "EA", "EBIT", "EBITDA", "EIEA", "EMOL", "EMP", "EPSFI", "EPSFX", "EPSPI", 209 "EPSPX", "ESOPCT", "ESOPDLT", "ESOPNR", "ESOPR", "ESOPT", "ESUB", "ESUBC", "EXCADJ", "EXRE", "FATB", "FATC", "FATD", "FATE", "FATL", "FATN", "FATO", "FATP", "FCA", "FDFR", 210 "FEA", "FEL", "FFO", "FFS", "FIAO", "FINACO", "FINAO", "FINCF", "FINCH", "FINDLC", "FINDLT", "FINIVST", "FINLCO", "FINLTO", "FINNP", "FINRECC", "FINRECLT", "FINREV", "FINXINT", 211 "FINXOPR", "FOPO", "FOPOX", "FOPT", "FSRCO", "FSRCT", "FUSEO", "FUSET", "GBBL", "GDWL", "GDWLAM", "GDWLIA", "GDWLID", "GDWLIEPS", "GDWLIP", "GEQRV", "GLA", "GLCEA", "GLCED", 212 "GLCEEPS", "GLCEP", "GLD", "GLEPS", "GLIV", "GLP", "GOVGR", "GOVTOWN", "GP", "GPHBL", "GPLBL", "GPOBL", "GPRBL", "GPTBL", "GWO", "HEDGEGL", "IAEQ", "IAEQCI", "IAEQMI", "IAFICI", 213 "IAFXI", "IAFXMI", "IALI", "IALOI", "IALTI", "IAMLI", "IAOI", "IAPLI", "IAREI", "IASCI", "IASMI", "IASSI", "IASTI", "IATCI", "IATI", "IATMI", "IAUI", "IB", "IBADJ", "IBBL", 214 "IBC", "IBCOM", "IBKI", "IBMII", "ICAPT", "IDIIS", "IDILB", "IDILC", "IDIS", "IDIST", "IDIT", "IDITS", "IIRE", "INITB", "INTAN", "INTANO", "INTC", "INTPN", "INVCH", "INVFG", 215 "INVO", "INVOFS", "INVREH", "INVREI", "INVRES", "INVRM", "INVT", "INVWIP", "IOBD", "IOI", "IORE", "IP", "IPABL", "IPC", "IPHBL", "IPLBL", "IPOBL", "IPTBL", "IPTI", "IPV", 216 "IREI", "IRENT", "IRII", "IRLI", "IRNLI", "IRSI", "ISEQ", "ISEQC", "ISEQM", "ISFI", "ISFXC", "ISFXM", "ISGR", "ISGT", "ISGU", "ISLG", "ISLGC", "ISLGM", "ISLT", "ISNG", 217 "ISNGC", "ISNGM", "ISOTC", "ISOTH", "ISOTM", "ISSC", "ISSM", "ISSU", "IST", "ISTC", "ISTM", "ISUT", "ITCB", "ITCC", "ITCI", "IVACO", "IVAEQ", "IVAO", "IVCH", "IVGOD", 218 "IVI", "IVNCF", "IVPT", "IVST", "IVSTCH", "LCABG", "LCACL", "LCACR", "LCAG", "LCAL", "LCALT", "LCAM", "LCAO", "LCAST", "LCAT", "LCO", "LCOX", "LCOXAR", "LCOXDR", "LCT", 219 "LCUACU", "LI", "LIF", "LIFR", "LIFRP", "LLOML", "LLOO", "LLOT", "LLRCI", "LLRCR", "LLWOCI", "LLWOCR", "LNO", "LO", "LOL2", "LOXDR", "LQPL1", "LRV", "LS", "LSE", "LST", 220 "LT", "LUL3", "MIB", "MIBN", "MIBT", "MII", "MRC1", "MRC2", "MRC3", "MRC4", "MRC5", "MRCT", "MRCTA", "MSA", "MSVRV", "MTL", "NAT", "NCO", "NFSR", "NI", "NIADJ", "NIECI", 221 "NIINT", "NIINTPFC", "NIINTPFP", "NIIT", "NIM", "NIO", "NIPFC", "NIPFP", "NIT", "NITS", "NOPI", "NOPIO", "NP", "NPANL", "NPAORE", "NPARL", "NPAT", "NRTXT", "NRTXTD", 222 "NRTXTEPS", "OANCF", "OB", "OIADP", "OIBDP", "OPEPS", "OPILI", "OPINCAR", "OPINI", "OPIOI", "OPIRI", "OPITI", "OPREPSX", "OPTCA", "OPTDR", "OPTEX", "OPTEXD", "OPTFVGR", 223 "OPTGR", "OPTLIFE", "OPTOSBY", "OPTOSEY", "OPTPRCBY", "OPTPRCCA", "OPTPRCEX", "OPTPRCEY", "OPTPRCGR", "OPTPRCWA", "OPTRFR", "OPTVOL", "PALR", "PANLR", "PATR", "PCL", 224 "PCLR", "PCNLR", "PCTR", "PDVC", "PI", "PIDOM", "PIFO", "PLL", "PLTBL", "PNCA", "PNCAD", "PNCAEPS", "PNCIA", "PNCID", "PNCIEPS", "PNCIP", "PNCWIA", "PNCWID", "PNCWIEPS", 225 "PNCWIP", "PNLBL", "PNLI", "PNRSHO", "POBL", "PPCBL", "PPEGT", "PPENB", "PPENC", "PPENLI", "PPENLS", "PPENME", "PPENNR", "PPENO", "PPENT", "PPEVBB", "PPEVEB", "PPEVO", 226 "PPEVR", "PPPABL", "PPPHBL", "PPPOBL", "PPPTBL", "PRC", "PRCA", "PRCAD", "PRCAEPS", "PREBL", "PRI", "PRODV", "PRSHO", "PRSTKC", "PRSTKCC", "PRSTKPC", "PRVT", "PSTK", 227 "PSTKC", "PSTKL", "PSTKN", "PSTKR", "PSTKRV", "PTBL", "PTRAN", "PVCL", "PVO", "PVON", "PVPL", "PVT", "PWOI", "RADP", "RAGR", "RARI", "RATI", "RCA", "RCD", "RCEPS", 228 "RCL", "RCP", "RDIP", "RDIPA", "RDIPD", "RDIPEPS", "RDP", "RE", "REA", "REAJO", "RECCH", "RECCO", "RECD", "RECT", "RECTA", "RECTR", "RECUB", "RET", "REUNA", "REUNR", 229 "REVT", "RIS", "RLL", "RLO", "RLP", "RLRI", "RLT", "RMUM", "RPAG", "RRA", "RRD", "RREPS", "RRP", "RSTCHE", "RSTCHELT", "RVBCI", "RVBPI", "RVBTI", "RVDO", "RVDT", 230 "RVEQT", "RVLRV", "RVNO", "RVNT", "RVRI", "RVSI", "RVTI", "RVTXR", "RVUPI", "RVUTX", "SAA", "SAL", "SALE", "SALEPFC", "SALEPFP", "SBDC", "SC", "SCO", "SCSTKC", 231 "SECU", "SEQ", "SEQO", "SETA", "SETD", "SETEPS", "SETP", "SIV", "SPCE", "SPCED", "SPCEEPS", "SPI", "SPID", "SPIEPS", "SPIOA", "SPIOP", "SPPE", "SPPIV", "SPSTKC", 232 "SRET", "SRT", "SSNP", "SSTK", "STBO", "STIO", "STKCO", "STKCPA", "TDC", "TDSCD", "TDSCE", "TDSG", "TDSLG", "TDSMM", "TDSNG", "TDSO", "TDSS", "TDST", "TEQ", "TF", 233 "TFVA", "TFVCE", "TFVL", "TIE", "TII", "TLCF", "TRANSA", "TSA", "TSAFC", "TSO", "TSTK", "TSTKC", "TSTKME", "TSTKN", "TSTKP", "TXACH", "TXBCO", "TXBCOF", "TXC", 234 "TXDB", "TXDBA", "TXDBCA", "TXDBCL", "TXDC", "TXDFED", "TXDFO", "TXDI", "TXDITC", "TXDS", "TXEQA", "TXEQII", "TXFED", "TXFO", "TXNDB", "TXNDBA", "TXNDBL", "TXNDBR", 235 "TXO", "TXP", "TXPD", "TXR", "TXS", "TXT", "TXTUBADJUST", "TXTUBBEGIN", "TXTUBEND", "TXTUBMAX", "TXTUBMIN", "TXTUBPOSDEC", "TXTUBPOSINC", "TXTUBPOSPDEC", "TXTUBPOSPINC", 236 "TXTUBSETTLE", "TXTUBSOFLIMIT", "TXTUBTXTR", "TXTUBXINTBS", "TXTUBXINTIS", "TXVA", "TXW", "UAOLOCH", "UAOX", "UAPT", "UCAPS", "UCCONS", "UCEQ", "UCUSTAD", "UDCOPRES", 237 "UDD", "UDFCC", "UDMB", "UDOLT", "UDPCO", "UDPFA", "UDVP", "UFRETSD", "UGI", "UI", "UINVT", "ULCM", "ULCO", "UNIAMI", "UNL", "UNNP", "UNNPL", "UNOPINC", "UNWCC", 238 "UOIS", "UOPI", "UOPRES", "UPDVP", "UPMCSTK", "UPMPF", "UPMPFS", "UPMSUBP", "UPSTK", "UPSTKC", "UPSTKSF", "URECT", "URECTR", "UREVUB", "USPI", "USTDNC", "USUBDVP", 239 "USUBPSTK", "UTFDOC", "UTFOSC", "UTME", "UTXFED", "UWKCAPC", "UXINST", "UXINTD", "VPAC", "VPO", "WCAP", "WCAPC", "WCAPCH", "WDA", "WDD", "WDEPS", "WDP", "XACC", 240 "XAD", "XAGO", "XAGT", "XCOM", "XCOMI", "XDEPL", "XDP", "XDVRE", "XEQO", "XI", "XIDO", "XIDOC", "XINDB", "XINDC", "XINS", "XINST", "XINT", "XINTD", "XINTOPT", 241 "XIVI", "XIVRE", "XLR", "XNBI", "XNF", "XNINS", "XNITB", "XOBD", "XOI", "XOPR", "XOPRAR", "XOPTD", "XOPTEPS", "XORE", "XPP", "XPR", "XRD", "XRDP", "XRENT", "XS", 242 "XSGA", "XSTF", "XSTFO", "XSTFWS", "XT", "XUW", "XUWLI", "XUWNLI", "XUWOI", "XUWREI", "XUWTI", "IID", "EXCHG", "CIK", "COSTAT", "FIC", "NAICSH", "SICH", "CSHTR_C", 243 "DVPSP_C", "DVPSX_C", "PRCC_C", "PRCH_C", "PRCL_C", "ADJEX_C", "CSHTR_F", "DVPSP_F", "DVPSX_F", "MKVALT", "PRCC_F", "PRCH_F", "PRCL_F", "ADJEX_F", "RANK", "AU", 244 "AUOP", "AUOPIC", "CEOSO", "CFOSO"]