BazerUtils.jl

Assorted Julia utilities including custom logging
Log | Files | Refs | README | LICENSE

HTMLTables.jl (10056B)


      1 # --------------------------------------------------------------------------------------------------
      2 # HTML Table Parsing
      3 #
      4 # Parse HTML tables into DataFrames, handling colspan/rowspan and multi-level headers.
      5 # Replaces PyCall/pandas read_html for Julia-native HTML scraping.
      6 #
      7 # Public API:
      8 #   read_html_tables(source; match=nothing, flatten=nothing) -> Vector{DataFrame}
      9 #
     10 # Future extension points (not implemented):
     11 #   - attrs kwarg: filter tables by HTML attributes (id, class)
     12 #   - header kwarg: explicit row indices for headers (override auto-detection)
     13 #   - skiprows kwarg: skip specific rows
     14 #   - displayed_only kwarg: filter out display:none elements
     15 #   - Type inference: auto-detect numeric columns
     16 # --------------------------------------------------------------------------------------------------
     17 
     18 using Gumbo
     19 using Cascadia
     20 using HTTP
     21 using DataFrames
     22 
     23 
     24 # --------------------------------------------------------------------------------------------------
     25 # Text extraction
     26 # --------------------------------------------------------------------------------------------------
     27 
     28 """Extract text from an HTML node, converting <br> to spaces and stripping <style> content."""
     29 function _cell_text(node)::String
     30     if node isa HTMLText
     31         return node.text
     32     elseif node isa HTMLElement
     33         tag = Gumbo.tag(node)
     34         tag == :br && return " "
     35         tag == :style && return ""
     36         return join((_cell_text(c) for c in Gumbo.children(node)), "")
     37     end
     38     return ""
     39 end
     40 
     41 
     42 # --------------------------------------------------------------------------------------------------
     43 # Row classification
     44 # --------------------------------------------------------------------------------------------------
     45 
     46 """
     47 A parsed cell: text content + HTML attributes needed for span expansion.
     48 """
     49 struct ParsedCell
     50     text::String
     51     is_header::Bool
     52     colspan::Int
     53     rowspan::Int
     54 end
     55 
     56 """Extract ParsedCells from a <tr> element."""
     57 function _parse_row(tr)::Vector{ParsedCell}
     58     cells = ParsedCell[]
     59     for child in Gumbo.children(tr)
     60         child isa HTMLElement || continue
     61         t = Gumbo.tag(child)
     62         (t == :th || t == :td) || continue
     63         text = strip(_cell_text(child))
     64         cs = parse(Int, get(child.attributes, "colspan", "1"))
     65         rs = parse(Int, get(child.attributes, "rowspan", "1"))
     66         push!(cells, ParsedCell(text, t == :th, cs, rs))
     67     end
     68     return cells
     69 end
     70 
     71 """
     72 Classify table rows into header rows and body rows.
     73 
     74 Rules:
     75 - <thead> rows -> header
     76 - <tbody> rows -> body (multiple <tbody> concatenated)
     77 - <tfoot> rows -> appended to body
     78 - No <thead>: consecutive all-<th> rows from top of body -> moved to header
     79 """
     80 function _classify_rows(table_elem)
     81     header_rows = Vector{Vector{ParsedCell}}()
     82     body_rows = Vector{Vector{ParsedCell}}()
     83     has_thead = false
     84 
     85     for child in Gumbo.children(table_elem)
     86         child isa HTMLElement || continue
     87         t = Gumbo.tag(child)
     88         if t == :thead
     89             has_thead = true
     90             for tr in Gumbo.children(child)
     91                 tr isa HTMLElement && Gumbo.tag(tr) == :tr && push!(header_rows, _parse_row(tr))
     92             end
     93         elseif t == :tbody
     94             for tr in Gumbo.children(child)
     95                 tr isa HTMLElement && Gumbo.tag(tr) == :tr && push!(body_rows, _parse_row(tr))
     96             end
     97         elseif t == :tfoot
     98             for tr in Gumbo.children(child)
     99                 tr isa HTMLElement && Gumbo.tag(tr) == :tr && push!(body_rows, _parse_row(tr))
    100             end
    101         elseif t == :tr
    102             # bare <tr> not inside thead/tbody/tfoot
    103             push!(body_rows, _parse_row(child))
    104         end
    105     end
    106 
    107     # If no <thead>, scan top of body for consecutive all-<th> rows
    108     if !has_thead
    109         while !isempty(body_rows) && all(c -> c.is_header, body_rows[1])
    110             push!(header_rows, popfirst!(body_rows))
    111         end
    112     end
    113 
    114     return header_rows, body_rows
    115 end
    116 
    117 
    118 # --------------------------------------------------------------------------------------------------
    119 # Span expansion
    120 # --------------------------------------------------------------------------------------------------
    121 
    122 """
    123 Expand colspan/rowspan into a filled text grid.
    124 
    125 Takes a flat vector of ParsedCell rows, returns a Matrix{Union{String,Nothing}}
    126 where spanned cells are duplicated into all positions they cover.
    127 """
    128 function _expand_spans(rows::Vector{Vector{ParsedCell}})
    129     isempty(rows) && return Matrix{Union{String,Nothing}}(nothing, 0, 0)
    130 
    131     # Use a Dict-based sparse grid that grows as needed
    132     grid = Dict{Tuple{Int,Int}, String}()
    133     max_row = 0
    134     max_col = 0
    135 
    136     for (ri, row) in enumerate(rows)
    137         col = 1
    138         for cell in row
    139             # Find next empty slot in this row
    140             while haskey(grid, (ri, col))
    141                 col += 1
    142             end
    143             # Fill the rowspan x colspan rectangle
    144             for dr in 0:(cell.rowspan - 1)
    145                 for dc in 0:(cell.colspan - 1)
    146                     r, c = ri + dr, col + dc
    147                     grid[(r, c)] = cell.text
    148                     max_row = max(max_row, r)
    149                     max_col = max(max_col, c)
    150                 end
    151             end
    152             col += cell.colspan
    153         end
    154     end
    155 
    156     # Convert to dense matrix
    157     result = Matrix{Union{String,Nothing}}(nothing, max_row, max_col)
    158     for ((r, c), text) in grid
    159         result[r, c] = text
    160     end
    161 
    162     return result
    163 end
    164 
    165 
    166 # --------------------------------------------------------------------------------------------------
    167 # Table parsing
    168 # --------------------------------------------------------------------------------------------------
    169 
    170 """Deduplicate column names by appending .1, .2, etc."""
    171 function _dedup_names(names_vec)
    172     seen = Dict{String,Int}()
    173     result = Vector{String}(undef, length(names_vec))
    174     for (i, name) in enumerate(names_vec)
    175         if haskey(seen, name)
    176             seen[name] += 1
    177             result[i] = "$(name).$(seen[name])"
    178         else
    179             seen[name] = 0
    180             result[i] = name
    181         end
    182     end
    183     return result
    184 end
    185 
    186 """
    187 Parse a single <table> element into a DataFrame.
    188 
    189 Returns nothing if the table has no data rows.
    190 """
    191 function _parse_table(table_elem; flatten::Union{Nothing,Symbol}=nothing)
    192     header_rows, body_rows = _classify_rows(table_elem)
    193 
    194     # Combine all rows for span expansion, then split back
    195     all_rows = vcat(header_rows, body_rows)
    196     isempty(all_rows) && return nothing
    197 
    198     grid = _expand_spans(all_rows)
    199     nrows_total, ncols = size(grid)
    200     ncols == 0 && return nothing
    201 
    202     n_header = length(header_rows)
    203     n_body = nrows_total - n_header
    204 
    205     n_body <= 0 && return nothing
    206 
    207     # Build column names
    208     if n_header == 0
    209         col_names = ["Column$i" for i in 1:ncols]
    210     elseif n_header == 1
    211         col_names = [something(grid[1, c], "Column$c") for c in 1:ncols]
    212     else
    213         # Multi-level headers: build tuple representation then convert to strings
    214         raw_tuples = [Tuple(something(grid[r, c], "") for r in 1:n_header) for c in 1:ncols]
    215 
    216         if flatten == :join
    217             col_names = [join(filter(!isempty, t), "_") for t in raw_tuples]
    218         elseif flatten == :last
    219             col_names = [String(t[end]) for t in raw_tuples]
    220         else
    221             # Default: string representation of tuple, e.g. "(A, a)"
    222             col_names = ["(" * join(t, ", ") * ")" for t in raw_tuples]
    223         end
    224     end
    225 
    226     # Apply flatten for single-level headers (no-op, already strings)
    227 
    228     # Deduplicate
    229     col_names = _dedup_names(col_names)
    230 
    231     # Build DataFrame from body rows
    232     cols = Vector{Vector{Union{String,Missing}}}(undef, ncols)
    233     for c in 1:ncols
    234         vals = Vector{Union{String,Missing}}(undef, n_body)
    235         for (idx, r) in enumerate((n_header + 1):nrows_total)
    236             val = grid[r, c]
    237             vals[idx] = (val === nothing || val == "") ? missing : val
    238         end
    239         cols[c] = vals
    240     end
    241 
    242     # Construct DataFrame preserving column order
    243     df = DataFrame()
    244     for (c, name) in enumerate(col_names)
    245         df[!, name] = cols[c]
    246     end
    247 
    248     return df
    249 end
    250 
    251 
    252 # --------------------------------------------------------------------------------------------------
    253 # Public API
    254 # --------------------------------------------------------------------------------------------------
    255 
    256 """
    257     read_html_tables(source::String; match=nothing, flatten=nothing) -> Vector{DataFrame}
    258 
    259 Parse all HTML tables from a URL or raw HTML string into DataFrames.
    260 
    261 # Arguments
    262 - `source`: URL (starting with "http") or raw HTML string
    263 - `match`: optional `Regex` -- only return tables whose text content matches
    264 - `flatten`: controls multi-level header column names (DataFrames requires String column names)
    265   - `nothing` (default): string representation of tuples, e.g. `"(A, a)"`
    266   - `:join`: join levels with `"_"`, e.g. `"A_a"`
    267   - `:last`: last header level only, e.g. `"a"`
    268 
    269 # Returns
    270 Vector of DataFrames with String/Missing columns. Empty tables are skipped.
    271 
    272 # Examples
    273 ```julia
    274 dfs = read_html_tables("https://en.wikipedia.org/wiki/List_of_Alabama_state_parks")
    275 dfs = read_html_tables(html_string; match=r"Name"i, flatten=:last)
    276 ```
    277 """
    278 function read_html_tables(source::String; match::Union{Nothing,Regex}=nothing,
    279                           flatten::Union{Nothing,Symbol}=nothing)
    280     # Fetch HTML
    281     html = if startswith(source, "http://") || startswith(source, "https://")
    282         String(HTTP.get(source).body)
    283     else
    284         source
    285     end
    286 
    287     doc = parsehtml(html)
    288     tables = eachmatch(Selector("table"), doc.root)
    289 
    290     dfs = DataFrame[]
    291     for table_elem in tables
    292         df = _parse_table(table_elem; flatten=flatten)
    293         df === nothing && continue
    294 
    295         # Filter by match regex if provided
    296         if match !== nothing
    297             table_text = _cell_text(table_elem)
    298             occursin(match, table_text) || continue
    299         end
    300 
    301         push!(dfs, df)
    302     end
    303 
    304     return dfs
    305 end
    306 # --------------------------------------------------------------------------------------------------