commit a312dbea5a51984d6660816e6a89ea15418df345
parent a4ce90a0b8acae89ccdbd551f29007a5ec260142
Author: Erik Loualiche <eloualiche@users.noreply.github.com>
Date: Sat, 28 Mar 2026 22:18:51 -0500
Add read_html_tables: native HTML table parsing (v0.12.0)
* Add Gumbo, Cascadia, HTTP, DataFrames deps for HTML table parsing
* Add HTMLTables scaffold and 26 test cases
Test fixtures adapted from pandas read_html test suite.
Covers: core parsing, colspan/rowspan, multi-level headers, data quality.
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
* Implement read_html_tables with colspan/rowspan and multi-level headers
Core algorithm: sparse grid expansion (same approach as pandas _expand_colspan_rowspan).
Handles: thead/tbody/tfoot, auto-detect headers from th rows, multi-level headers,
flatten kwarg (:join, :last), match regex filtering, br->space, style stripping.
All 26 tests passing (67 assertions).
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
* Add Wikipedia integration test, bump to v0.12.0
- Live test against Alabama state parks Wikipedia page
- Version bump: 0.11.0 -> 0.12.0 (new feature: read_html_tables)
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---------
Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Diffstat:
5 files changed, 698 insertions(+), 2 deletions(-)
diff --git a/Project.toml b/Project.toml
@@ -1,18 +1,26 @@
name = "BazerUtils"
uuid = "36dcebb2-80bb-4116-91f4-ed9f396c4a1c"
+version = "0.12.0"
authors = ["Erik Loualiche"]
-version = "0.11.0"
[deps]
+Cascadia = "54eefc05-d75b-58de-a785-1a3403f0919f"
CodecZlib = "944b1d66-785c-5afd-91f1-9de20f533193"
+DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
+Gumbo = "708ec375-b3d6-5a57-a7ce-8257bf98657a"
+HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3"
JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6"
Logging = "56ddb016-857b-54e1-b83d-db4d58db5568"
LoggingExtras = "e6f89c97-d47a-5376-807f-9c37f3926c36"
Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"
[compat]
+Cascadia = "1.0.2"
CodecZlib = "0.7"
+DataFrames = "1.8.1"
+Gumbo = "0.8.3"
+HTTP = "1.11.0"
JSON = "1"
LoggingExtras = "1"
Tables = "1.12"
diff --git a/src/BazerUtils.jl b/src/BazerUtils.jl
@@ -15,6 +15,7 @@ import CodecZlib: CodecZlib
# Import functions
include("CustomLogger.jl")
include("JSONLines.jl")
+include("HTMLTables.jl")
# --------------------------------------------------------------------------------------------------
@@ -22,6 +23,7 @@ include("JSONLines.jl")
# List of exported functions
export custom_logger
export read_jsonl, stream_jsonl, write_jsonl
+export read_html_tables
# --------------------------------------------------------------------------------------------------
diff --git a/src/HTMLTables.jl b/src/HTMLTables.jl
@@ -0,0 +1,306 @@
+# --------------------------------------------------------------------------------------------------
+# HTML Table Parsing
+#
+# Parse HTML tables into DataFrames, handling colspan/rowspan and multi-level headers.
+# Replaces PyCall/pandas read_html for Julia-native HTML scraping.
+#
+# Public API:
+# read_html_tables(source; match=nothing, flatten=nothing) -> Vector{DataFrame}
+#
+# Future extension points (not implemented):
+# - attrs kwarg: filter tables by HTML attributes (id, class)
+# - header kwarg: explicit row indices for headers (override auto-detection)
+# - skiprows kwarg: skip specific rows
+# - displayed_only kwarg: filter out display:none elements
+# - Type inference: auto-detect numeric columns
+# --------------------------------------------------------------------------------------------------
+
+using Gumbo
+using Cascadia
+using HTTP
+using DataFrames
+
+
+# --------------------------------------------------------------------------------------------------
+# Text extraction
+# --------------------------------------------------------------------------------------------------
+
"""
Extract the visible text of an HTML node.

- `<br>` elements become a single space (so `a<br>b` reads "a b").
- `<style>` and `<script>` contents are dropped: their text is CSS/JS
  payload, not table data. (The original stripped only `<style>`; inline
  `<script>` tags would have leaked code into cell text.)
- Other elements contribute the concatenation of their children's text.
"""
function _cell_text(node)::String
    if node isa HTMLText
        return node.text
    elseif node isa HTMLElement
        tag = Gumbo.tag(node)
        tag == :br && return " "
        # CSS/JS payloads are markup plumbing, not cell content.
        (tag == :style || tag == :script) && return ""
        return join((_cell_text(c) for c in Gumbo.children(node)), "")
    end
    return ""
end
+
+
+# --------------------------------------------------------------------------------------------------
+# Row classification
+# --------------------------------------------------------------------------------------------------
+
"""
A parsed table cell: text content plus the HTML attributes needed for
colspan/rowspan grid expansion.

Fields:
- `text`: whitespace-stripped visible text of the cell
- `is_header`: `true` for `<th>` cells, `false` for `<td>`
- `colspan` / `rowspan`: span attributes (HTML default is 1)
"""
struct ParsedCell
    text::String
    is_header::Bool
    colspan::Int
    rowspan::Int
end
+
"""
Extract `ParsedCell`s from a `<tr>` element.

Only `<th>`/`<td>` element children are considered; text nodes and other
elements between cells are skipped. Malformed or missing `colspan`/`rowspan`
attributes (non-numeric, empty, zero, or negative) are normalized to 1
rather than throwing — real-world HTML contains values like `colspan=""`
or `colspan="100%"`, and browsers fall back to 1 in those cases.
"""
function _parse_row(tr)::Vector{ParsedCell}
    cells = ParsedCell[]
    for child in Gumbo.children(tr)
        child isa HTMLElement || continue
        t = Gumbo.tag(child)
        (t == :th || t == :td) || continue
        text = strip(_cell_text(child))
        # tryparse instead of parse: junk attribute values must not abort
        # the whole document parse. Clamp to >= 1 so a colspan/rowspan of
        # 0 or a negative value cannot corrupt span expansion.
        cs = something(tryparse(Int, get(child.attributes, "colspan", "1")), 1)
        rs = something(tryparse(Int, get(child.attributes, "rowspan", "1")), 1)
        push!(cells, ParsedCell(text, t == :th, max(cs, 1), max(rs, 1)))
    end
    return cells
end
+
"""
Classify table rows into header rows and body rows.

Rules:
- `<thead>` rows -> header
- `<tbody>` rows -> body (multiple `<tbody>` concatenated in document order)
- `<tfoot>` rows -> appended to body
- bare `<tr>` (no section element) -> body
- No `<thead>`: consecutive non-empty all-`<th>` rows at the top of the
  body are promoted to header (auto-detection).

Returns `(header_rows, body_rows)`, each a `Vector{Vector{ParsedCell}}`.
"""
function _classify_rows(table_elem)
    header_rows = Vector{Vector{ParsedCell}}()
    body_rows = Vector{Vector{ParsedCell}}()
    has_thead = false

    # Parse every <tr> child of `section` onto `dest` (shared by
    # thead/tbody/tfoot handling below).
    function collect_trs!(dest, section)
        for tr in Gumbo.children(section)
            tr isa HTMLElement && Gumbo.tag(tr) == :tr && push!(dest, _parse_row(tr))
        end
    end

    for child in Gumbo.children(table_elem)
        child isa HTMLElement || continue
        t = Gumbo.tag(child)
        if t == :thead
            has_thead = true
            collect_trs!(header_rows, child)
        elseif t == :tbody || t == :tfoot
            # tfoot rows are deliberately appended to the body.
            collect_trs!(body_rows, child)
        elseif t == :tr
            # bare <tr> not inside thead/tbody/tfoot
            push!(body_rows, _parse_row(child))
        end
    end

    # If no <thead>, scan top of body for consecutive all-<th> rows.
    # The !isempty guard matters: `all` over an empty row is vacuously
    # true, which would otherwise promote cell-less <tr></tr> rows.
    if !has_thead
        while !isempty(body_rows) && !isempty(body_rows[1]) &&
              all(c -> c.is_header, body_rows[1])
            push!(header_rows, popfirst!(body_rows))
        end
    end

    return header_rows, body_rows
end
+
+
+# --------------------------------------------------------------------------------------------------
+# Span expansion
+# --------------------------------------------------------------------------------------------------
+
"""
Expand colspan/rowspan into a filled text grid.

Takes a vector of `ParsedCell` rows; returns a `Matrix{Union{String,Nothing}}`
where each spanned cell's text is duplicated into every position it covers.
Positions never covered by any cell stay `nothing` (ragged rows).

Same sparse-grid approach as pandas' `_expand_colspan_rowspan`: each cell
claims the first free column in its row, then paints its rowspan x colspan
rectangle. A cell never overwrites a position already claimed by an earlier
cell (first writer wins), so malformed HTML with overlapping spans cannot
clobber content placed by a preceding rowspan.
"""
function _expand_spans(rows::Vector{Vector{ParsedCell}})
    isempty(rows) && return Matrix{Union{String,Nothing}}(nothing, 0, 0)

    # Dict-based sparse grid that grows as needed: (row, col) -> text.
    grid = Dict{Tuple{Int,Int}, String}()
    max_row = 0
    max_col = 0

    for (ri, row) in enumerate(rows)
        col = 1
        for cell in row
            # Find next empty slot in this row (skips positions claimed
            # by rowspans from earlier rows).
            while haskey(grid, (ri, col))
                col += 1
            end
            # Fill the rowspan x colspan rectangle.
            for dr in 0:(cell.rowspan - 1)
                for dc in 0:(cell.colspan - 1)
                    r, c = ri + dr, col + dc
                    # First writer wins: do not overwrite an occupied slot.
                    haskey(grid, (r, c)) || (grid[(r, c)] = cell.text)
                    max_row = max(max_row, r)
                    max_col = max(max_col, c)
                end
            end
            col += cell.colspan
        end
    end

    # Densify; uncovered positions remain `nothing`.
    result = Matrix{Union{String,Nothing}}(nothing, max_row, max_col)
    for ((r, c), text) in grid
        result[r, c] = text
    end

    return result
end
+
+
+# --------------------------------------------------------------------------------------------------
+# Table parsing
+# --------------------------------------------------------------------------------------------------
+
"""
Make column names unique by suffixing repeats with `.1`, `.2`, etc.

The first occurrence of a name is kept as-is; the k-th duplicate becomes
`"name.k"`. Order of the input is preserved.
"""
function _dedup_names(names_vec)
    counts = Dict{String,Int}()
    out = String[]
    sizehint!(out, length(names_vec))
    for name in names_vec
        occurrence = get(counts, name, -1) + 1
        counts[name] = occurrence
        push!(out, occurrence == 0 ? name : string(name, '.', occurrence))
    end
    return out
end
+
"""
Parse a single `<table>` element into a DataFrame.

Row classification and span expansion are delegated to `_classify_rows` /
`_expand_spans`; this function turns the resulting text grid into named
columns. `flatten` controls multi-level header naming (`nothing`, `:join`,
`:last`; see `read_html_tables`) — any other value throws `ArgumentError`.

Returns `nothing` if the table has no rows, no columns, or no data rows.
"""
function _parse_table(table_elem; flatten::Union{Nothing,Symbol}=nothing)
    # Fail fast on typos like flatten=:joined instead of silently falling
    # back to the default tuple-style names.
    flatten in (nothing, :join, :last) ||
        throw(ArgumentError("flatten must be nothing, :join, or :last; got $(repr(flatten))"))

    header_rows, body_rows = _classify_rows(table_elem)

    # Expand spans over header+body together so a rowspan can cross the
    # header/body boundary; split back by row count afterwards.
    all_rows = vcat(header_rows, body_rows)
    isempty(all_rows) && return nothing

    grid = _expand_spans(all_rows)
    nrows_total, ncols = size(grid)
    ncols == 0 && return nothing

    n_header = length(header_rows)
    n_body = nrows_total - n_header

    n_body <= 0 && return nothing

    # Build column names
    if n_header == 0
        col_names = ["Column$i" for i in 1:ncols]
    elseif n_header == 1
        col_names = [something(grid[1, c], "Column$c") for c in 1:ncols]
    else
        # Multi-level headers: one tuple of level texts per column.
        raw_tuples = [Tuple(something(grid[r, c], "") for r in 1:n_header) for c in 1:ncols]

        if flatten == :join
            col_names = [join(filter(!isempty, t), "_") for t in raw_tuples]
        elseif flatten == :last
            col_names = [String(t[end]) for t in raw_tuples]
        else
            # Default: string representation of tuple, e.g. "(A, a)"
            col_names = ["(" * join(t, ", ") * ")" for t in raw_tuples]
        end
    end

    # Deduplicate repeated names (.1, .2, ...)
    col_names = _dedup_names(col_names)

    # Build body columns; empty and never-covered cells become `missing`.
    cols = Vector{Vector{Union{String,Missing}}}(undef, ncols)
    for c in 1:ncols
        vals = Vector{Union{String,Missing}}(undef, n_body)
        for (idx, r) in enumerate((n_header + 1):nrows_total)
            val = grid[r, c]
            vals[idx] = (val === nothing || val == "") ? missing : val
        end
        cols[c] = vals
    end

    # Construct DataFrame column-by-column to preserve order.
    df = DataFrame()
    for (c, name) in enumerate(col_names)
        df[!, name] = cols[c]
    end

    return df
end
+
+
+# --------------------------------------------------------------------------------------------------
+# Public API
+# --------------------------------------------------------------------------------------------------
+
"""
    read_html_tables(source::String; match=nothing, flatten=nothing) -> Vector{DataFrame}

Parse all HTML tables from a URL or raw HTML string into DataFrames.

# Arguments
- `source`: URL (starting with "http://" or "https://") or raw HTML string
- `match`: optional `Regex` -- only return tables whose text content matches
- `flatten`: controls multi-level header column names (DataFrames requires String column names)
  - `nothing` (default): string representation of tuples, e.g. `"(A, a)"`
  - `:join`: join levels with `"_"`, e.g. `"A_a"`
  - `:last`: last header level only, e.g. `"a"`

# Returns
Vector of DataFrames with String/Missing columns. Empty tables are skipped.

# Examples
```julia
dfs = read_html_tables("https://en.wikipedia.org/wiki/List_of_Alabama_state_parks")
dfs = read_html_tables(html_string; match=r"Name"i, flatten=:last)
```
"""
function read_html_tables(source::String; match::Union{Nothing,Regex}=nothing,
                          flatten::Union{Nothing,Symbol}=nothing)
    # Fetch when `source` looks like a URL; otherwise treat it as raw HTML.
    html = if startswith(source, "http://") || startswith(source, "https://")
        String(HTTP.get(source).body)
    else
        source
    end

    doc = parsehtml(html)
    tables = eachmatch(Selector("table"), doc.root)

    dfs = DataFrame[]
    for table_elem in tables
        # Apply the regex filter BEFORE parsing: no point expanding
        # spans and building a DataFrame for tables the caller discards.
        if match !== nothing
            occursin(match, _cell_text(table_elem)) || continue
        end

        df = _parse_table(table_elem; flatten=flatten)
        df === nothing && continue

        push!(dfs, df)
    end

    return dfs
end
+# --------------------------------------------------------------------------------------------------
diff --git a/test/UnitTests/html_tables.jl b/test/UnitTests/html_tables.jl
@@ -0,0 +1,378 @@
+using Test
+using BazerUtils
+using DataFrames
+
+@testset "HTMLTables" begin
+
+# ==================================================================================
+# Tier 1: Core table parsing
+# ==================================================================================
+
# Tier 1 exercises the pipeline end-to-end on well-formed tables: section
# elements (thead/tbody/tfoot), header auto-detection from <th> rows,
# multi-table documents, and the `match` keyword filter.
@testset "Tier 1: Core parsing" begin

@testset "basic table with thead/tbody" begin
    html = """
    <table>
    <thead><tr><th>A</th><th>B</th></tr></thead>
    <tbody><tr><td>1</td><td>2</td></tr>
    <tr><td>3</td><td>4</td></tr></tbody>
    </table>"""
    dfs = read_html_tables(html)
    @test length(dfs) == 1
    df = dfs[1]
    @test names(df) == ["A", "B"]
    @test size(df) == (2, 2)
    @test df[1, "A"] == "1"
    @test df[2, "B"] == "4"
end

@testset "table without thead (auto-detect from th rows)" begin
    html = """
    <table>
    <tr><th>X</th><th>Y</th></tr>
    <tr><td>a</td><td>b</td></tr>
    </table>"""
    dfs = read_html_tables(html)
    @test length(dfs) == 1
    @test names(dfs[1]) == ["X", "Y"]
    @test dfs[1][1, "X"] == "a"
end

@testset "multiple tbody elements concatenated" begin
    html = """
    <table>
    <thead><tr><th>A</th><th>B</th></tr></thead>
    <tbody><tr><td>1</td><td>2</td></tr></tbody>
    <tbody><tr><td>3</td><td>4</td></tr></tbody>
    </table>"""
    dfs = read_html_tables(html)
    @test size(dfs[1]) == (2, 2)
    @test dfs[1][2, "A"] == "3"
end

@testset "tfoot with data appended to body" begin
    html = """
    <table>
    <thead><tr><th>A</th><th>B</th></tr></thead>
    <tbody><tr><td>1</td><td>2</td></tr></tbody>
    <tfoot><tr><td>foot1</td><td>foot2</td></tr></tfoot>
    </table>"""
    dfs = read_html_tables(html)
    @test size(dfs[1]) == (2, 2)
    @test dfs[1][2, "A"] == "foot1"
end

@testset "mixed th/td in body row" begin
    # A <th> inside a body row (pandas fixture) must stay in the body.
    html = """
    <table>
    <thead><tr><th>Country</th><th>City</th><th>Year</th></tr></thead>
    <tbody><tr><td>Ukraine</td><th>Odessa</th><td>1944</td></tr></tbody>
    </table>"""
    dfs = read_html_tables(html)
    @test dfs[1][1, "City"] == "Odessa"
end

@testset "single column table" begin
    html = """
    <table>
    <tr><th>Only</th></tr>
    <tr><td>val</td></tr>
    </table>"""
    dfs = read_html_tables(html)
    @test size(dfs[1]) == (1, 1)
    @test names(dfs[1]) == ["Only"]
end

@testset "empty table skipped" begin
    # A table with no rows must not appear in the result vector.
    html = """
    <table><tbody></tbody></table>
    <table>
    <tr><th>A</th></tr>
    <tr><td>1</td></tr>
    </table>"""
    dfs = read_html_tables(html)
    @test length(dfs) == 1
    @test names(dfs[1]) == ["A"]
end

@testset "multiple tables in document" begin
    html = """
    <table><tr><th>T1</th></tr><tr><td>a</td></tr></table>
    <table><tr><th>T2</th></tr><tr><td>b</td></tr></table>
    <table><tr><th>T3</th></tr><tr><td>c</td></tr></table>"""
    dfs = read_html_tables(html)
    @test length(dfs) == 3
    @test names(dfs[2]) == ["T2"]
end

@testset "match kwarg filters tables" begin
    html = """
    <table><tr><th>Name</th></tr><tr><td>park</td></tr></table>
    <table><tr><th>Other</th></tr><tr><td>data</td></tr></table>"""
    dfs = read_html_tables(html; match=r"park"i)
    @test length(dfs) == 1
    @test names(dfs[1]) == ["Name"]
end

end # Tier 1
+
+
+# ==================================================================================
+# Tier 2: Colspan/rowspan
+# ==================================================================================
+
# Tier 2 exercises the sparse-grid span expansion: colspan/rowspan in
# header and body, combined spans, spans crossing the header/body
# boundary, and spans that extend the grid past the last <tr>.
@testset "Tier 2: Colspan/rowspan" begin

@testset "colspan=1 and rowspan=1 are no-ops" begin
    html = """
    <table>
    <tr><th>A</th><th colspan="1">B</th><th rowspan="1">C</th></tr>
    <tr><td>a</td><td>b</td><td>c</td></tr>
    </table>"""
    dfs = read_html_tables(html)
    @test names(dfs[1]) == ["A", "B", "C"]
    @test dfs[1][1, "B"] == "b"
end

@testset "colspan=2 in header" begin
    html = """
    <table>
    <tr><th colspan="2">Wide</th><th>Narrow</th></tr>
    <tr><td>a</td><td>b</td><td>c</td></tr>
    </table>"""
    dfs = read_html_tables(html)
    @test size(dfs[1], 2) == 3
    @test dfs[1][1, 1] == "a"
    @test dfs[1][1, 3] == "c"
end

@testset "colspan=2 in body" begin
    # Spanned body cell is duplicated into every column it covers.
    html = """
    <table>
    <tr><th>A</th><th>B</th><th>C</th></tr>
    <tr><td colspan="2">wide</td><td>c</td></tr>
    </table>"""
    dfs = read_html_tables(html)
    @test dfs[1][1, "A"] == "wide"
    @test dfs[1][1, "B"] == "wide"
    @test dfs[1][1, "C"] == "c"
end

@testset "rowspan=2 in body" begin
    html = """
    <table>
    <tr><th>A</th><th>B</th></tr>
    <tr><td rowspan="2">tall</td><td>1</td></tr>
    <tr><td>2</td></tr>
    </table>"""
    dfs = read_html_tables(html)
    @test size(dfs[1]) == (2, 2)
    @test dfs[1][1, "A"] == "tall"
    @test dfs[1][2, "A"] == "tall"
    @test dfs[1][2, "B"] == "2"
end

@testset "rowspan at end of row" begin
    html = """
    <table>
    <tr><th>A</th><th>B</th></tr>
    <tr><td>x</td><td rowspan="2">y</td></tr>
    <tr><td>z</td></tr>
    </table>"""
    dfs = read_html_tables(html)
    @test dfs[1][2, "B"] == "y"
    @test dfs[1][2, "A"] == "z"
end

@testset "both rowspan and colspan on same cell" begin
    html = """
    <table>
    <tr><th>A</th><th>B</th><th>C</th><th>D</th><th>E</th></tr>
    <tr><td rowspan="2">a</td><td rowspan="2" colspan="3">block</td><td>e1</td></tr>
    <tr><td>e2</td></tr>
    </table>"""
    dfs = read_html_tables(html)
    @test size(dfs[1]) == (2, 5)
    @test dfs[1][1, "B"] == "block"
    @test dfs[1][1, "C"] == "block"
    @test dfs[1][1, "D"] == "block"
    @test dfs[1][2, "B"] == "block"
    @test dfs[1][2, "D"] == "block"
    @test dfs[1][2, "A"] == "a"
    @test dfs[1][1, "E"] == "e1"
    @test dfs[1][2, "E"] == "e2"
end

@testset "rowspan spanning header into body" begin
    # Header rowspan reaches into the body because expansion runs over
    # header + body rows together before the split.
    html = """
    <table>
    <tr><th rowspan="2">A</th><th>B</th></tr>
    <tr><td>1</td></tr>
    <tr><td>C</td><td>2</td></tr>
    </table>"""
    dfs = read_html_tables(html)
    @test names(dfs[1]) == ["A", "B"]
    @test dfs[1][1, "A"] == "A"
    @test dfs[1][1, "B"] == "1"
    @test dfs[1][2, "A"] == "C"
    @test dfs[1][2, "B"] == "2"
end

@testset "rowspan-only rows" begin
    # rowspan=3 extends the grid beyond the last physical <tr>.
    html = """
    <table>
    <tr><th>A</th><th>B</th></tr>
    <tr><td rowspan="3">x</td><td rowspan="3">y</td></tr>
    </table>"""
    dfs = read_html_tables(html)
    @test size(dfs[1]) == (3, 2)
    @test dfs[1][3, "A"] == "x"
    @test dfs[1][3, "B"] == "y"
end

end # Tier 2
+
+
+# ==================================================================================
+# Tier 3: Multi-level headers + flatten
+# ==================================================================================
+
# Tier 3 exercises multi-level header naming: the default string-tuple
# form and the :join / :last flatten modes.
@testset "Tier 3: Multi-level headers" begin

@testset "two th rows give string-tuple column names" begin
    html = """
    <table>
    <tr><th>A</th><th>B</th></tr>
    <tr><th>a</th><th>b</th></tr>
    <tr><td>1</td><td>2</td></tr>
    </table>"""
    dfs = read_html_tables(html)
    @test names(dfs[1]) == ["(A, a)", "(B, b)"]
end

@testset "flatten=:join joins with underscore" begin
    html = """
    <table>
    <tr><th>A</th><th>B</th></tr>
    <tr><th>a</th><th>b</th></tr>
    <tr><td>1</td><td>2</td></tr>
    </table>"""
    dfs = read_html_tables(html; flatten=:join)
    @test names(dfs[1]) == ["A_a", "B_b"]
end

@testset "flatten=:last takes last level" begin
    html = """
    <table>
    <tr><th>A</th><th>B</th></tr>
    <tr><th>a</th><th>b</th></tr>
    <tr><td>1</td><td>2</td></tr>
    </table>"""
    dfs = read_html_tables(html; flatten=:last)
    @test names(dfs[1]) == ["a", "b"]
end

@testset "Wikipedia-style colspan grouping with sub-headers" begin
    # Rowspanned top-level headers duplicate into both levels, so the
    # default tuple form reads "(Name, Name)".
    html = """
    <table>
    <tr><th rowspan="2">Name</th><th colspan="2">Size</th><th rowspan="2">Year</th></tr>
    <tr><th>acres</th><th>ha</th></tr>
    <tr><td>Park A</td><td>100</td><td>40</td><td>1920</td></tr>
    </table>"""
    dfs = read_html_tables(html)
    @test names(dfs[1]) == ["(Name, Name)", "(Size, acres)", "(Size, ha)", "(Year, Year)"]
    @test dfs[1][1, "(Size, acres)"] == "100"

    dfs2 = read_html_tables(html; flatten=:last)
    @test names(dfs2[1]) == ["Name", "acres", "ha", "Year"]
end

end # Tier 3
+
+
+# ==================================================================================
+# Tier 4: Data quality
+# ==================================================================================
+
# Tier 4 exercises data-quality handling: missing values, ragged rows,
# <br>/<style> treatment, and whitespace stripping.
@testset "Tier 4: Data quality" begin

@testset "empty cells become missing" begin
    html = """
    <table>
    <tr><th>A</th><th>B</th></tr>
    <tr><td></td><td>val</td></tr>
    </table>"""
    dfs = read_html_tables(html)
    @test ismissing(dfs[1][1, "A"])
    @test dfs[1][1, "B"] == "val"
end

@testset "ragged rows padded with missing" begin
    # Grid positions never covered by any cell come back as `nothing`
    # and surface as `missing` in the DataFrame.
    html = """
    <table>
    <tr><th>A</th><th>B</th><th>C</th></tr>
    <tr><td>1</td></tr>
    </table>"""
    dfs = read_html_tables(html)
    @test dfs[1][1, "A"] == "1"
    @test ismissing(dfs[1][1, "B"])
    @test ismissing(dfs[1][1, "C"])
end

@testset "br inside cell becomes space" begin
    html = """
    <table>
    <tr><th>A</th></tr>
    <tr><td>word1<br>word2</td></tr>
    </table>"""
    dfs = read_html_tables(html)
    @test dfs[1][1, "A"] == "word1 word2"
end

@testset "style tag stripped from header" begin
    html = """
    <table>
    <tr><th><style>.x{color:red}</style>Name</th><th>B</th></tr>
    <tr><td>a</td><td>b</td></tr>
    </table>"""
    dfs = read_html_tables(html)
    @test strip(names(dfs[1])[1]) == "Name" || names(dfs[1])[1] == "Name"
end

@testset "whitespace normalization" begin
    html = """
    <table>
    <tr><th> A </th></tr>
    <tr><td> val </td></tr>
    </table>"""
    dfs = read_html_tables(html)
    @test names(dfs[1]) == ["A"]
    @test dfs[1][1, "A"] == "val"
end

end # Tier 4
+
+
+# ==================================================================================
+# Integration: real Wikipedia page
+# ==================================================================================
+
# Live network test: hits Wikipedia, so network failures are downgraded
# to a warning instead of failing CI offline.
@testset "Integration: Wikipedia state parks" begin
    try
        dfs = read_html_tables(
            "https://en.wikipedia.org/wiki/List_of_Alabama_state_parks";
            match=r"[Nn]ame", flatten=:last)
        @test length(dfs) >= 1
        df = dfs[1]
        @test any(contains.(lowercase.(names(df)), "name"))
        @test nrow(df) > 10
    catch e
        # HTTP.jl surfaces network failures as StatusError / ConnectError /
        # RequestError. (The previous check referenced Downloads.RequestError,
        # but `Downloads` is never imported by the test harness, so reaching
        # that branch would itself throw an UndefVarError.)
        if e isa HTTP.Exceptions.StatusError ||
           e isa HTTP.Exceptions.ConnectError ||
           e isa HTTP.Exceptions.RequestError
            @warn "Skipping Wikipedia test (network error)"
        else
            rethrow(e)
        end
    end
end
+
+end # HTMLTables
diff --git a/test/runtests.jl b/test/runtests.jl
@@ -8,10 +8,12 @@ import JSON
import CodecZlib
import HTTP
import Dates
+import DataFrames
# Test suites to run; each entry maps to test/UnitTests/<name>.jl,
# included by the loop below.
const testsuite = [
    "customlogger",
    "jsonlines",
    "html_tables"
]
# --------------------------------------------------------------------------------------------------