HTMLTables.jl (10056B)
1 # -------------------------------------------------------------------------------------------------- 2 # HTML Table Parsing 3 # 4 # Parse HTML tables into DataFrames, handling colspan/rowspan and multi-level headers. 5 # Replaces PyCall/pandas read_html for Julia-native HTML scraping. 6 # 7 # Public API: 8 # read_html_tables(source; match=nothing, flatten=nothing) -> Vector{DataFrame} 9 # 10 # Future extension points (not implemented): 11 # - attrs kwarg: filter tables by HTML attributes (id, class) 12 # - header kwarg: explicit row indices for headers (override auto-detection) 13 # - skiprows kwarg: skip specific rows 14 # - displayed_only kwarg: filter out display:none elements 15 # - Type inference: auto-detect numeric columns 16 # -------------------------------------------------------------------------------------------------- 17 18 using Gumbo 19 using Cascadia 20 using HTTP 21 using DataFrames 22 23 24 # -------------------------------------------------------------------------------------------------- 25 # Text extraction 26 # -------------------------------------------------------------------------------------------------- 27 28 """Extract text from an HTML node, converting <br> to spaces and stripping <style> content.""" 29 function _cell_text(node)::String 30 if node isa HTMLText 31 return node.text 32 elseif node isa HTMLElement 33 tag = Gumbo.tag(node) 34 tag == :br && return " " 35 tag == :style && return "" 36 return join((_cell_text(c) for c in Gumbo.children(node)), "") 37 end 38 return "" 39 end 40 41 42 # -------------------------------------------------------------------------------------------------- 43 # Row classification 44 # -------------------------------------------------------------------------------------------------- 45 46 """ 47 A parsed cell: text content + HTML attributes needed for span expansion. 48 """ 49 struct ParsedCell 50 text::String 51 is_header::Bool 52 colspan::Int 53 rowspan::Int 54 end 55 56 """Extract ParsedCells from a <tr> element.""" 57 function _parse_row(tr)::Vector{ParsedCell} 58 cells = ParsedCell[] 59 for child in Gumbo.children(tr) 60 child isa HTMLElement || continue 61 t = Gumbo.tag(child) 62 (t == :th || t == :td) || continue 63 text = strip(_cell_text(child)) 64 cs = parse(Int, get(child.attributes, "colspan", "1")) 65 rs = parse(Int, get(child.attributes, "rowspan", "1")) 66 push!(cells, ParsedCell(text, t == :th, cs, rs)) 67 end 68 return cells 69 end 70 71 """ 72 Classify table rows into header rows and body rows. 73 74 Rules: 75 - <thead> rows -> header 76 - <tbody> rows -> body (multiple <tbody> concatenated) 77 - <tfoot> rows -> appended to body 78 - No <thead>: consecutive all-<th> rows from top of body -> moved to header 79 """ 80 function _classify_rows(table_elem) 81 header_rows = Vector{Vector{ParsedCell}}() 82 body_rows = Vector{Vector{ParsedCell}}() 83 has_thead = false 84 85 for child in Gumbo.children(table_elem) 86 child isa HTMLElement || continue 87 t = Gumbo.tag(child) 88 if t == :thead 89 has_thead = true 90 for tr in Gumbo.children(child) 91 tr isa HTMLElement && Gumbo.tag(tr) == :tr && push!(header_rows, _parse_row(tr)) 92 end 93 elseif t == :tbody 94 for tr in Gumbo.children(child) 95 tr isa HTMLElement && Gumbo.tag(tr) == :tr && push!(body_rows, _parse_row(tr)) 96 end 97 elseif t == :tfoot 98 for tr in Gumbo.children(child) 99 tr isa HTMLElement && Gumbo.tag(tr) == :tr && push!(body_rows, _parse_row(tr)) 100 end 101 elseif t == :tr 102 # bare <tr> not inside thead/tbody/tfoot 103 push!(body_rows, _parse_row(child)) 104 end 105 end 106 107 # If no <thead>, scan top of body for consecutive all-<th> rows 108 if !has_thead 109 while !isempty(body_rows) && all(c -> c.is_header, body_rows[1]) 110 push!(header_rows, popfirst!(body_rows)) 111 end 112 end 113 114 return header_rows, body_rows 115 end 116 117 118 # -------------------------------------------------------------------------------------------------- 119 # Span expansion 120 # -------------------------------------------------------------------------------------------------- 121 122 """ 123 Expand colspan/rowspan into a filled text grid. 124 125 Takes a flat vector of ParsedCell rows, returns a Matrix{Union{String,Nothing}} 126 where spanned cells are duplicated into all positions they cover. 127 """ 128 function _expand_spans(rows::Vector{Vector{ParsedCell}}) 129 isempty(rows) && return Matrix{Union{String,Nothing}}(nothing, 0, 0) 130 131 # Use a Dict-based sparse grid that grows as needed 132 grid = Dict{Tuple{Int,Int}, String}() 133 max_row = 0 134 max_col = 0 135 136 for (ri, row) in enumerate(rows) 137 col = 1 138 for cell in row 139 # Find next empty slot in this row 140 while haskey(grid, (ri, col)) 141 col += 1 142 end 143 # Fill the rowspan x colspan rectangle 144 for dr in 0:(cell.rowspan - 1) 145 for dc in 0:(cell.colspan - 1) 146 r, c = ri + dr, col + dc 147 grid[(r, c)] = cell.text 148 max_row = max(max_row, r) 149 max_col = max(max_col, c) 150 end 151 end 152 col += cell.colspan 153 end 154 end 155 156 # Convert to dense matrix 157 result = Matrix{Union{String,Nothing}}(nothing, max_row, max_col) 158 for ((r, c), text) in grid 159 result[r, c] = text 160 end 161 162 return result 163 end 164 165 166 # -------------------------------------------------------------------------------------------------- 167 # Table parsing 168 # -------------------------------------------------------------------------------------------------- 169 170 """Deduplicate column names by appending .1, .2, etc.""" 171 function _dedup_names(names_vec) 172 seen = Dict{String,Int}() 173 result = Vector{String}(undef, length(names_vec)) 174 for (i, name) in enumerate(names_vec) 175 if haskey(seen, name) 176 seen[name] += 1 177 result[i] = "$(name).$(seen[name])" 178 else 179 seen[name] = 0 180 result[i] = name 181 end 182 end 183 return result 184 end 185 186 """ 187 Parse a single <table> element into a DataFrame. 188 189 Returns nothing if the table has no data rows. 190 """ 191 function _parse_table(table_elem; flatten::Union{Nothing,Symbol}=nothing) 192 header_rows, body_rows = _classify_rows(table_elem) 193 194 # Combine all rows for span expansion, then split back 195 all_rows = vcat(header_rows, body_rows) 196 isempty(all_rows) && return nothing 197 198 grid = _expand_spans(all_rows) 199 nrows_total, ncols = size(grid) 200 ncols == 0 && return nothing 201 202 n_header = length(header_rows) 203 n_body = nrows_total - n_header 204 205 n_body <= 0 && return nothing 206 207 # Build column names 208 if n_header == 0 209 col_names = ["Column$i" for i in 1:ncols] 210 elseif n_header == 1 211 col_names = [something(grid[1, c], "Column$c") for c in 1:ncols] 212 else 213 # Multi-level headers: build tuple representation then convert to strings 214 raw_tuples = [Tuple(something(grid[r, c], "") for r in 1:n_header) for c in 1:ncols] 215 216 if flatten == :join 217 col_names = [join(filter(!isempty, t), "_") for t in raw_tuples] 218 elseif flatten == :last 219 col_names = [String(t[end]) for t in raw_tuples] 220 else 221 # Default: string representation of tuple, e.g. "(A, a)" 222 col_names = ["(" * join(t, ", ") * ")" for t in raw_tuples] 223 end 224 end 225 226 # Apply flatten for single-level headers (no-op, already strings) 227 228 # Deduplicate 229 col_names = _dedup_names(col_names) 230 231 # Build DataFrame from body rows 232 cols = Vector{Vector{Union{String,Missing}}}(undef, ncols) 233 for c in 1:ncols 234 vals = Vector{Union{String,Missing}}(undef, n_body) 235 for (idx, r) in enumerate((n_header + 1):nrows_total) 236 val = grid[r, c] 237 vals[idx] = (val === nothing || val == "") ? missing : val 238 end 239 cols[c] = vals 240 end 241 242 # Construct DataFrame preserving column order 243 df = DataFrame() 244 for (c, name) in enumerate(col_names) 245 df[!, name] = cols[c] 246 end 247 248 return df 249 end 250 251 252 # -------------------------------------------------------------------------------------------------- 253 # Public API 254 # -------------------------------------------------------------------------------------------------- 255 256 """ 257 read_html_tables(source::String; match=nothing, flatten=nothing) -> Vector{DataFrame} 258 259 Parse all HTML tables from a URL or raw HTML string into DataFrames. 260 261 # Arguments 262 - `source`: URL (starting with "http") or raw HTML string 263 - `match`: optional `Regex` -- only return tables whose text content matches 264 - `flatten`: controls multi-level header column names (DataFrames requires String column names) 265 - `nothing` (default): string representation of tuples, e.g. `"(A, a)"` 266 - `:join`: join levels with `"_"`, e.g. `"A_a"` 267 - `:last`: last header level only, e.g. `"a"` 268 269 # Returns 270 Vector of DataFrames with String/Missing columns. Empty tables are skipped. 271 272 # Examples 273 ```julia 274 dfs = read_html_tables("https://en.wikipedia.org/wiki/List_of_Alabama_state_parks") 275 dfs = read_html_tables(html_string; match=r"Name"i, flatten=:last) 276 ``` 277 """ 278 function read_html_tables(source::String; match::Union{Nothing,Regex}=nothing, 279 flatten::Union{Nothing,Symbol}=nothing) 280 # Fetch HTML 281 html = if startswith(source, "http://") || startswith(source, "https://") 282 String(HTTP.get(source).body) 283 else 284 source 285 end 286 287 doc = parsehtml(html) 288 tables = eachmatch(Selector("table"), doc.root) 289 290 dfs = DataFrame[] 291 for table_elem in tables 292 df = _parse_table(table_elem; flatten=flatten) 293 df === nothing && continue 294 295 # Filter by match regex if provided 296 if match !== nothing 297 table_text = _cell_text(table_elem) 298 occursin(match, table_text) || continue 299 end 300 301 push!(dfs, df) 302 end 303 304 return dfs 305 end 306 # --------------------------------------------------------------------------------------------------