commit a312dbea5a51984d6660816e6a89ea15418df345
parent a4ce90a0b8acae89ccdbd551f29007a5ec260142
Author: Erik Loualiche <eloualiche@users.noreply.github.com>
Date: Sat, 28 Mar 2026 22:18:51 -0500
Add read_html_tables: native HTML table parsing (v0.12.0)
* Add Gumbo, Cascadia, HTTP, DataFrames deps for HTML table parsing
* Add HTMLTables scaffold and 26 test cases
Test fixtures adapted from pandas read_html test suite.
Covers: core parsing, colspan/rowspan, multi-level headers, data quality.
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
* Implement read_html_tables with colspan/rowspan and multi-level headers
Core algorithm: sparse grid expansion (same approach as pandas _expand_colspan_rowspan).
Handles: thead/tbody/tfoot, auto-detect headers from th rows, multi-level headers,
flatten kwarg (:join, :last), match regex filtering, br->space, style stripping.
All 26 tests passing (67 assertions).
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
* Add Wikipedia integration test, bump to v0.12.0
- Live test against Alabama state parks Wikipedia page
- Version bump: 0.11.0 -> 0.12.0 (new feature: read_html_tables)
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---------
Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Diffstat:
5 files changed, 698 insertions(+), 2 deletions(-)
diff --git a/Project.toml b/Project.toml
@@ -1,18 +1,26 @@
name = "BazerUtils"
uuid = "36dcebb2-80bb-4116-91f4-ed9f396c4a1c"
+version = "0.12.0"
authors = ["Erik Loualiche"]
-version = "0.11.0"
[deps]
+Cascadia = "54eefc05-d75b-58de-a785-1a3403f0919f"
CodecZlib = "944b1d66-785c-5afd-91f1-9de20f533193"
+DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
+Gumbo = "708ec375-b3d6-5a57-a7ce-8257bf98657a"
+HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3"
JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6"
Logging = "56ddb016-857b-54e1-b83d-db4d58db5568"
LoggingExtras = "e6f89c97-d47a-5376-807f-9c37f3926c36"
Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"
[compat]
+Cascadia = "1.0.2"
CodecZlib = "0.7"
+DataFrames = "1.8.1"
+Gumbo = "0.8.3"
+HTTP = "1.11.0"
JSON = "1"
LoggingExtras = "1"
Tables = "1.12"
diff --git a/src/BazerUtils.jl b/src/BazerUtils.jl
@@ -15,6 +15,7 @@ import CodecZlib: CodecZlib
# Import functions
include("CustomLogger.jl")
include("JSONLines.jl")
+include("HTMLTables.jl")
# --------------------------------------------------------------------------------------------------
@@ -22,6 +23,7 @@ include("JSONLines.jl")
# List of exported functions
export custom_logger
export read_jsonl, stream_jsonl, write_jsonl
+export read_html_tables
# --------------------------------------------------------------------------------------------------
diff --git a/src/HTMLTables.jl b/src/HTMLTables.jl
@@ -0,0 +1,306 @@
+# --------------------------------------------------------------------------------------------------
+# HTML Table Parsing
+#
+# Parse HTML tables into DataFrames, handling colspan/rowspan and multi-level headers.
+# Replaces PyCall/pandas read_html for Julia-native HTML scraping.
+#
+# Public API:
+# read_html_tables(source; match=nothing, flatten=nothing) -> Vector{DataFrame}
+#
+# Future extension points (not implemented):
+# - attrs kwarg: filter tables by HTML attributes (id, class)
+# - header kwarg: explicit row indices for headers (override auto-detection)
+# - skiprows kwarg: skip specific rows
+# - displayed_only kwarg: filter out display:none elements
+# - Type inference: auto-detect numeric columns
+# --------------------------------------------------------------------------------------------------
+
+using Gumbo
+using Cascadia
+using HTTP
+using DataFrames
+
+
+# --------------------------------------------------------------------------------------------------
+# Text extraction
+# --------------------------------------------------------------------------------------------------
+
"""
Extract the visible text of an HTML node.

- `<br>` elements become a single space (so `a<br>b` reads "a b").
- `<style>` and `<script>` contents are dropped: their text is CSS/JS
  payload, not table data. (The original stripped only `<style>`; inline
  `<script>` tags would have leaked code into cell text.)
- Other elements contribute the concatenation of their children's text.
"""
function _cell_text(node)::String
    if node isa HTMLText
        return node.text
    elseif node isa HTMLElement
        tag = Gumbo.tag(node)
        tag == :br && return " "
        # CSS/JS payloads are markup plumbing, not cell content.
        (tag == :style || tag == :script) && return ""
        return join((_cell_text(c) for c in Gumbo.children(node)), "")
    end
    return ""
end
+
+
+# --------------------------------------------------------------------------------------------------
+# Row classification
+# --------------------------------------------------------------------------------------------------
+
"""
A parsed table cell: text content plus the HTML attributes needed for
colspan/rowspan grid expansion.

Fields:
- `text`: whitespace-stripped visible text of the cell
- `is_header`: `true` for `<th>` cells, `false` for `<td>`
- `colspan` / `rowspan`: span attributes (HTML default is 1)
"""
struct ParsedCell
    text::String
    is_header::Bool
    colspan::Int
    rowspan::Int
end
+
"""
Extract `ParsedCell`s from a `<tr>` element.

Only `<th>`/`<td>` element children are considered; text nodes and other
elements between cells are skipped. Malformed or missing `colspan`/`rowspan`
attributes (non-numeric, empty, zero, or negative) are normalized to 1
rather than throwing — real-world HTML contains values like `colspan=""`
or `colspan="100%"`, and browsers fall back to 1 in those cases.
"""
function _parse_row(tr)::Vector{ParsedCell}
    cells = ParsedCell[]
    for child in Gumbo.children(tr)
        child isa HTMLElement || continue
        t = Gumbo.tag(child)
        (t == :th || t == :td) || continue
        text = strip(_cell_text(child))
        # tryparse instead of parse: junk attribute values must not abort
        # the whole document parse. Clamp to >= 1 so a colspan/rowspan of
        # 0 or a negative value cannot corrupt span expansion.
        cs = something(tryparse(Int, get(child.attributes, "colspan", "1")), 1)
        rs = something(tryparse(Int, get(child.attributes, "rowspan", "1")), 1)
        push!(cells, ParsedCell(text, t == :th, max(cs, 1), max(rs, 1)))
    end
    return cells
end
+
"""
Classify table rows into header rows and body rows.

Rules:
- `<thead>` rows -> header
- `<tbody>` rows -> body (multiple `<tbody>` concatenated in document order)
- `<tfoot>` rows -> appended to body
- bare `<tr>` (no section element) -> body
- No `<thead>`: consecutive non-empty all-`<th>` rows at the top of the
  body are promoted to header (auto-detection).

Returns `(header_rows, body_rows)`, each a `Vector{Vector{ParsedCell}}`.
"""
function _classify_rows(table_elem)
    header_rows = Vector{Vector{ParsedCell}}()
    body_rows = Vector{Vector{ParsedCell}}()
    has_thead = false

    # Parse every <tr> child of `section` onto `dest` (shared by
    # thead/tbody/tfoot handling below).
    function collect_trs!(dest, section)
        for tr in Gumbo.children(section)
            tr isa HTMLElement && Gumbo.tag(tr) == :tr && push!(dest, _parse_row(tr))
        end
    end

    for child in Gumbo.children(table_elem)
        child isa HTMLElement || continue
        t = Gumbo.tag(child)
        if t == :thead
            has_thead = true
            collect_trs!(header_rows, child)
        elseif t == :tbody || t == :tfoot
            # tfoot rows are deliberately appended to the body.
            collect_trs!(body_rows, child)
        elseif t == :tr
            # bare <tr> not inside thead/tbody/tfoot
            push!(body_rows, _parse_row(child))
        end
    end

    # If no <thead>, scan top of body for consecutive all-<th> rows.
    # The !isempty guard matters: `all` over an empty row is vacuously
    # true, which would otherwise promote cell-less <tr></tr> rows.
    if !has_thead
        while !isempty(body_rows) && !isempty(body_rows[1]) &&
              all(c -> c.is_header, body_rows[1])
            push!(header_rows, popfirst!(body_rows))
        end
    end

    return header_rows, body_rows
end
+
+
+# --------------------------------------------------------------------------------------------------
+# Span expansion
+# --------------------------------------------------------------------------------------------------
+
"""
Expand colspan/rowspan into a filled text grid.

Takes a vector of `ParsedCell` rows; returns a `Matrix{Union{String,Nothing}}`
where each spanned cell's text is duplicated into every position it covers.
Positions never covered by any cell stay `nothing` (ragged rows).

Same sparse-grid approach as pandas' `_expand_colspan_rowspan`: each cell
claims the first free column in its row, then paints its rowspan x colspan
rectangle. A cell never overwrites a position already claimed by an earlier
cell (first writer wins), so malformed HTML with overlapping spans cannot
clobber content placed by a preceding rowspan.
"""
function _expand_spans(rows::Vector{Vector{ParsedCell}})
    isempty(rows) && return Matrix{Union{String,Nothing}}(nothing, 0, 0)

    # Dict-based sparse grid that grows as needed: (row, col) -> text.
    grid = Dict{Tuple{Int,Int}, String}()
    max_row = 0
    max_col = 0

    for (ri, row) in enumerate(rows)
        col = 1
        for cell in row
            # Find next empty slot in this row (skips positions claimed
            # by rowspans from earlier rows).
            while haskey(grid, (ri, col))
                col += 1
            end
            # Fill the rowspan x colspan rectangle.
            for dr in 0:(cell.rowspan - 1)
                for dc in 0:(cell.colspan - 1)
                    r, c = ri + dr, col + dc
                    # First writer wins: do not overwrite an occupied slot.
                    haskey(grid, (r, c)) || (grid[(r, c)] = cell.text)
                    max_row = max(max_row, r)
                    max_col = max(max_col, c)
                end
            end
            col += cell.colspan
        end
    end

    # Densify; uncovered positions remain `nothing`.
    result = Matrix{Union{String,Nothing}}(nothing, max_row, max_col)
    for ((r, c), text) in grid
        result[r, c] = text
    end

    return result
end
+
+
+# --------------------------------------------------------------------------------------------------
+# Table parsing
+# --------------------------------------------------------------------------------------------------
+
"""
Make column names unique by suffixing repeats with `.1`, `.2`, etc.

The first occurrence of a name is kept as-is; the k-th duplicate becomes
`"name.k"`. Order of the input is preserved.
"""
function _dedup_names(names_vec)
    counts = Dict{String,Int}()
    out = String[]
    sizehint!(out, length(names_vec))
    for name in names_vec
        occurrence = get(counts, name, -1) + 1
        counts[name] = occurrence
        push!(out, occurrence == 0 ? name : string(name, '.', occurrence))
    end
    return out
end
+
"""
Parse a single `<table>` element into a DataFrame.

Row classification and span expansion are delegated to `_classify_rows` /
`_expand_spans`; this function turns the resulting text grid into named
columns. `flatten` controls multi-level header naming (`nothing`, `:join`,
`:last`; see `read_html_tables`) — any other value throws `ArgumentError`.

Returns `nothing` if the table has no rows, no columns, or no data rows.
"""
function _parse_table(table_elem; flatten::Union{Nothing,Symbol}=nothing)
    # Fail fast on typos like flatten=:joined instead of silently falling
    # back to the default tuple-style names.
    flatten in (nothing, :join, :last) ||
        throw(ArgumentError("flatten must be nothing, :join, or :last; got $(repr(flatten))"))

    header_rows, body_rows = _classify_rows(table_elem)

    # Expand spans over header+body together so a rowspan can cross the
    # header/body boundary; split back by row count afterwards.
    all_rows = vcat(header_rows, body_rows)
    isempty(all_rows) && return nothing

    grid = _expand_spans(all_rows)
    nrows_total, ncols = size(grid)
    ncols == 0 && return nothing

    n_header = length(header_rows)
    n_body = nrows_total - n_header

    n_body <= 0 && return nothing

    # Build column names
    if n_header == 0
        col_names = ["Column$i" for i in 1:ncols]
    elseif n_header == 1
        col_names = [something(grid[1, c], "Column$c") for c in 1:ncols]
    else
        # Multi-level headers: one tuple of level texts per column.
        raw_tuples = [Tuple(something(grid[r, c], "") for r in 1:n_header) for c in 1:ncols]

        if flatten == :join
            col_names = [join(filter(!isempty, t), "_") for t in raw_tuples]
        elseif flatten == :last
            col_names = [String(t[end]) for t in raw_tuples]
        else
            # Default: string representation of tuple, e.g. "(A, a)"
            col_names = ["(" * join(t, ", ") * ")" for t in raw_tuples]
        end
    end

    # Deduplicate repeated names (.1, .2, ...)
    col_names = _dedup_names(col_names)

    # Build body columns; empty and never-covered cells become `missing`.
    cols = Vector{Vector{Union{String,Missing}}}(undef, ncols)
    for c in 1:ncols
        vals = Vector{Union{String,Missing}}(undef, n_body)
        for (idx, r) in enumerate((n_header + 1):nrows_total)
            val = grid[r, c]
            vals[idx] = (val === nothing || val == "") ? missing : val
        end
        cols[c] = vals
    end

    # Construct DataFrame column-by-column to preserve order.
    df = DataFrame()
    for (c, name) in enumerate(col_names)
        df[!, name] = cols[c]
    end

    return df
end
+
+
+# --------------------------------------------------------------------------------------------------
+# Public API
+# --------------------------------------------------------------------------------------------------
+
"""
    read_html_tables(source::String; match=nothing, flatten=nothing) -> Vector{DataFrame}

Parse all HTML tables from a URL or raw HTML string into DataFrames.

# Arguments
- `source`: URL (starting with "http://" or "https://") or raw HTML string
- `match`: optional `Regex` -- only return tables whose text content matches
- `flatten`: controls multi-level header column names (DataFrames requires String column names)
  - `nothing` (default): string representation of tuples, e.g. `"(A, a)"`
  - `:join`: join levels with `"_"`, e.g. `"A_a"`
  - `:last`: last header level only, e.g. `"a"`

# Returns
Vector of DataFrames with String/Missing columns. Empty tables are skipped.

# Examples
```julia
dfs = read_html_tables("https://en.wikipedia.org/wiki/List_of_Alabama_state_parks")
dfs = read_html_tables(html_string; match=r"Name"i, flatten=:last)
```
"""
function read_html_tables(source::String; match::Union{Nothing,Regex}=nothing,
                          flatten::Union{Nothing,Symbol}=nothing)
    # Fetch when `source` looks like a URL; otherwise treat it as raw HTML.
    html = if startswith(source, "http://") || startswith(source, "https://")
        String(HTTP.get(source).body)
    else
        source
    end

    doc = parsehtml(html)
    tables = eachmatch(Selector("table"), doc.root)

    dfs = DataFrame[]
    for table_elem in tables
        # Apply the regex filter BEFORE parsing: no point expanding
        # spans and building a DataFrame for tables the caller discards.
        if match !== nothing
            occursin(match, _cell_text(table_elem)) || continue
        end

        df = _parse_table(table_elem; flatten=flatten)
        df === nothing && continue

        push!(dfs, df)
    end

    return dfs
end
+# --------------------------------------------------------------------------------------------------
diff --git a/test/UnitTests/html_tables.jl b/test/UnitTests/html_tables.jl
@@ -0,0 +1,378 @@
+using Test
+using BazerUtils
+using DataFrames
+
+@testset "HTMLTables" begin
+
+# ==================================================================================
+# Tier 1: Core table parsing
+# ==================================================================================
+
# Tier 1 exercises the pipeline end-to-end on well-formed tables: section
# elements (thead/tbody/tfoot), header auto-detection from <th> rows,
# multi-table documents, and the `match` keyword filter.
@testset "Tier 1: Core parsing" begin

@testset "basic table with thead/tbody" begin
    html = """
    <table>
    <thead><tr><th>A</th><th>B</th></tr></thead>
    <tbody><tr><td>1</td><td>2</td></tr>
    <tr><td>3</td><td>4</td></tr></tbody>
    </table>"""
    dfs = read_html_tables(html)
    @test length(dfs) == 1
    df = dfs[1]
    @test names(df) == ["A", "B"]
    @test size(df) == (2, 2)
    @test df[1, "A"] == "1"
    @test df[2, "B"] == "4"
end

@testset "table without thead (auto-detect from th rows)" begin
    html = """
    <table>
    <tr><th>X</th><th>Y</th></tr>
    <tr><td>a</td><td>b</td></tr>
    </table>"""
    dfs = read_html_tables(html)
    @test length(dfs) == 1
    @test names(dfs[1]) == ["X", "Y"]
    @test dfs[1][1, "X"] == "a"
end

@testset "multiple tbody elements concatenated" begin
    html = """
    <table>
    <thead><tr><th>A</th><th>B</th></tr></thead>
    <tbody><tr><td>1</td><td>2</td></tr></tbody>
    <tbody><tr><td>3</td><td>4</td></tr></tbody>
    </table>"""
    dfs = read_html_tables(html)
    @test size(dfs[1]) == (2, 2)
    @test dfs[1][2, "A"] == "3"
end

@testset "tfoot with data appended to body" begin
    html = """
    <table>
    <thead><tr><th>A</th><th>B</th></tr></thead>
    <tbody><tr><td>1</td><td>2</td></tr></tbody>
    <tfoot><tr><td>foot1</td><td>foot2</td></tr></tfoot>
    </table>"""
    dfs = read_html_tables(html)
    @test size(dfs[1]) == (2, 2)
    @test dfs[1][2, "A"] == "foot1"
end

@testset "mixed th/td in body row" begin
    # A <th> inside a body row (pandas fixture) must stay in the body.
    html = """
    <table>
    <thead><tr><th>Country</th><th>City</th><th>Year</th></tr></thead>
    <tbody><tr><td>Ukraine</td><th>Odessa</th><td>1944</td></tr></tbody>
    </table>"""
    dfs = read_html_tables(html)
    @test dfs[1][1, "City"] == "Odessa"
end

@testset "single column table" begin
    html = """
    <table>
    <tr><th>Only</th></tr>
    <tr><td>val</td></tr>
    </table>"""
    dfs = read_html_tables(html)
    @test size(dfs[1]) == (1, 1)
    @test names(dfs[1]) == ["Only"]
end

@testset "empty table skipped" begin
    # A table with no rows must not appear in the result vector.
    html = """
    <table><tbody></tbody></table>
    <table>
    <tr><th>A</th></tr>
    <tr><td>1</td></tr>
    </table>"""
    dfs = read_html_tables(html)
    @test length(dfs) == 1
    @test names(dfs[1]) == ["A"]
end

@testset "multiple tables in document" begin
    html = """
    <table><tr><th>T1</th></tr><tr><td>a</td></tr></table>
    <table><tr><th>T2</th></tr><tr><td>b</td></tr></table>
    <table><tr><th>T3</th></tr><tr><td>c</td></tr></table>"""
    dfs = read_html_tables(html)
    @test length(dfs) == 3
    @test names(dfs[2]) == ["T2"]
end

@testset "match kwarg filters tables" begin
    html = """
    <table><tr><th>Name</th></tr><tr><td>park</td></tr></table>
    <table><tr><th>Other</th></tr><tr><td>data</td></tr></table>"""
    dfs = read_html_tables(html; match=r"park"i)
    @test length(dfs) == 1
    @test names(dfs[1]) == ["Name"]
end

end # Tier 1
+
+
+# ==================================================================================
+# Tier 2: Colspan/rowspan
+# ==================================================================================
+
# Tier 2 exercises the sparse-grid span expansion: colspan/rowspan in
# header and body, combined spans, spans crossing the header/body
# boundary, and spans that extend the grid past the last <tr>.
@testset "Tier 2: Colspan/rowspan" begin

@testset "colspan=1 and rowspan=1 are no-ops" begin
    html = """
    <table>
    <tr><th>A</th><th colspan="1">B</th><th rowspan="1">C</th></tr>
    <tr><td>a</td><td>b</td><td>c</td></tr>
    </table>"""
    dfs = read_html_tables(html)
    @test names(dfs[1]) == ["A", "B", "C"]
    @test dfs[1][1, "B"] == "b"
end

@testset "colspan=2 in header" begin
    html = """
    <table>
    <tr><th colspan="2">Wide</th><th>Narrow</th></tr>
    <tr><td>a</td><td>b</td><td>c</td></tr>
    </table>"""
    dfs = read_html_tables(html)
    @test size(dfs[1], 2) == 3
    @test dfs[1][1, 1] == "a"
    @test dfs[1][1, 3] == "c"
end

@testset "colspan=2 in body" begin
    # Spanned body cell is duplicated into every column it covers.
    html = """
    <table>
    <tr><th>A</th><th>B</th><th>C</th></tr>
    <tr><td colspan="2">wide</td><td>c</td></tr>
    </table>"""
    dfs = read_html_tables(html)
    @test dfs[1][1, "A"] == "wide"
    @test dfs[1][1, "B"] == "wide"
    @test dfs[1][1, "C"] == "c"
end

@testset "rowspan=2 in body" begin
    html = """
    <table>
    <tr><th>A</th><th>B</th></tr>
    <tr><td rowspan="2">tall</td><td>1</td></tr>
    <tr><td>2</td></tr>
    </table>"""
    dfs = read_html_tables(html)
    @test size(dfs[1]) == (2, 2)
    @test dfs[1][1, "A"] == "tall"
    @test dfs[1][2, "A"] == "tall"
    @test dfs[1][2, "B"] == "2"
end

@testset "rowspan at end of row" begin
    html = """
    <table>
    <tr><th>A</th><th>B</th></tr>
    <tr><td>x</td><td rowspan="2">y</td></tr>
    <tr><td>z</td></tr>
    </table>"""
    dfs = read_html_tables(html)
    @test dfs[1][2, "B"] == "y"
    @test dfs[1][2, "A"] == "z"
end

@testset "both rowspan and colspan on same cell" begin
    html = """
    <table>
    <tr><th>A</th><th>B</th><th>C</th><th>D</th><th>E</th></tr>
    <tr><td rowspan="2">a</td><td rowspan="2" colspan="3">block</td><td>e1</td></tr>
    <tr><td>e2</td></tr>
    </table>"""
    dfs = read_html_tables(html)
    @test size(dfs[1]) == (2, 5)
    @test dfs[1][1, "B"] == "block"
    @test dfs[1][1, "C"] == "block"
    @test dfs[1][1, "D"] == "block"
    @test dfs[1][2, "B"] == "block"
    @test dfs[1][2, "D"] == "block"
    @test dfs[1][2, "A"] == "a"
    @test dfs[1][1, "E"] == "e1"
    @test dfs[1][2, "E"] == "e2"
end

@testset "rowspan spanning header into body" begin
    # Header rowspan reaches into the body because expansion runs over
    # header + body rows together before the split.
    html = """
    <table>
    <tr><th rowspan="2">A</th><th>B</th></tr>
    <tr><td>1</td></tr>
    <tr><td>C</td><td>2</td></tr>
    </table>"""
    dfs = read_html_tables(html)
    @test names(dfs[1]) == ["A", "B"]
    @test dfs[1][1, "A"] == "A"
    @test dfs[1][1, "B"] == "1"
    @test dfs[1][2, "A"] == "C"
    @test dfs[1][2, "B"] == "2"
end

@testset "rowspan-only rows" begin
    # rowspan=3 extends the grid beyond the last physical <tr>.
    html = """
    <table>
    <tr><th>A</th><th>B</th></tr>
    <tr><td rowspan="3">x</td><td rowspan="3">y</td></tr>
    </table>"""
    dfs = read_html_tables(html)
    @test size(dfs[1]) == (3, 2)
    @test dfs[1][3, "A"] == "x"
    @test dfs[1][3, "B"] == "y"
end

end # Tier 2
+
+
+# ==================================================================================
+# Tier 3: Multi-level headers + flatten
+# ==================================================================================
+
# Tier 3 exercises multi-level header naming: the default string-tuple
# form and the :join / :last flatten modes.
@testset "Tier 3: Multi-level headers" begin

@testset "two th rows give string-tuple column names" begin
    html = """
    <table>
    <tr><th>A</th><th>B</th></tr>
    <tr><th>a</th><th>b</th></tr>
    <tr><td>1</td><td>2</td></tr>
    </table>"""
    dfs = read_html_tables(html)
    @test names(dfs[1]) == ["(A, a)", "(B, b)"]
end

@testset "flatten=:join joins with underscore" begin
    html = """
    <table>
    <tr><th>A</th><th>B</th></tr>
    <tr><th>a</th><th>b</th></tr>
    <tr><td>1</td><td>2</td></tr>
    </table>"""
    dfs = read_html_tables(html; flatten=:join)
    @test names(dfs[1]) == ["A_a", "B_b"]
end

@testset "flatten=:last takes last level" begin
    html = """
    <table>
    <tr><th>A</th><th>B</th></tr>
    <tr><th>a</th><th>b</th></tr>
    <tr><td>1</td><td>2</td></tr>
    </table>"""
    dfs = read_html_tables(html; flatten=:last)
    @test names(dfs[1]) == ["a", "b"]
end

@testset "Wikipedia-style colspan grouping with sub-headers" begin
    # Rowspanned top-level headers duplicate into both levels, so the
    # default tuple form reads "(Name, Name)".
    html = """
    <table>
    <tr><th rowspan="2">Name</th><th colspan="2">Size</th><th rowspan="2">Year</th></tr>
    <tr><th>acres</th><th>ha</th></tr>
    <tr><td>Park A</td><td>100</td><td>40</td><td>1920</td></tr>
    </table>"""
    dfs = read_html_tables(html)
    @test names(dfs[1]) == ["(Name, Name)", "(Size, acres)", "(Size, ha)", "(Year, Year)"]
    @test dfs[1][1, "(Size, acres)"] == "100"

    dfs2 = read_html_tables(html; flatten=:last)
    @test names(dfs2[1]) == ["Name", "acres", "ha", "Year"]
end

end # Tier 3
+
+
+# ==================================================================================
+# Tier 4: Data quality
+# ==================================================================================
+
# Tier 4 exercises data-quality handling: missing values, ragged rows,
# <br>/<style> treatment, and whitespace stripping.
@testset "Tier 4: Data quality" begin

@testset "empty cells become missing" begin
    html = """
    <table>
    <tr><th>A</th><th>B</th></tr>
    <tr><td></td><td>val</td></tr>
    </table>"""
    dfs = read_html_tables(html)
    @test ismissing(dfs[1][1, "A"])
    @test dfs[1][1, "B"] == "val"
end

@testset "ragged rows padded with missing" begin
    # Grid positions never covered by any cell come back as `nothing`
    # and surface as `missing` in the DataFrame.
    html = """
    <table>
    <tr><th>A</th><th>B</th><th>C</th></tr>
    <tr><td>1</td></tr>
    </table>"""
    dfs = read_html_tables(html)
    @test dfs[1][1, "A"] == "1"
    @test ismissing(dfs[1][1, "B"])
    @test ismissing(dfs[1][1, "C"])
end

@testset "br inside cell becomes space" begin
    html = """
    <table>
    <tr><th>A</th></tr>
    <tr><td>word1<br>word2</td></tr>
    </table>"""
    dfs = read_html_tables(html)
    @test dfs[1][1, "A"] == "word1 word2"
end

@testset "style tag stripped from header" begin
    html = """
    <table>
    <tr><th><style>.x{color:red}</style>Name</th><th>B</th></tr>
    <tr><td>a</td><td>b</td></tr>
    </table>"""
    dfs = read_html_tables(html)
    @test strip(names(dfs[1])[1]) == "Name" || names(dfs[1])[1] == "Name"
end

@testset "whitespace normalization" begin
    html = """
    <table>
    <tr><th> A </th></tr>
    <tr><td> val </td></tr>
    </table>"""
    dfs = read_html_tables(html)
    @test names(dfs[1]) == ["A"]
    @test dfs[1][1, "A"] == "val"
end

end # Tier 4
+
+
+# ==================================================================================
+# Integration: real Wikipedia page
+# ==================================================================================
+
# Live network test: hits Wikipedia, so network failures are downgraded
# to a warning instead of failing CI offline.
@testset "Integration: Wikipedia state parks" begin
    try
        dfs = read_html_tables(
            "https://en.wikipedia.org/wiki/List_of_Alabama_state_parks";
            match=r"[Nn]ame", flatten=:last)
        @test length(dfs) >= 1
        df = dfs[1]
        @test any(contains.(lowercase.(names(df)), "name"))
        @test nrow(df) > 10
    catch e
        # HTTP.jl surfaces network failures as StatusError / ConnectError /
        # RequestError. (The previous check referenced Downloads.RequestError,
        # but `Downloads` is never imported by the test harness, so reaching
        # that branch would itself throw an UndefVarError.)
        if e isa HTTP.Exceptions.StatusError ||
           e isa HTTP.Exceptions.ConnectError ||
           e isa HTTP.Exceptions.RequestError
            @warn "Skipping Wikipedia test (network error)"
        else
            rethrow(e)
        end
    end
end
+
+end # HTMLTables
diff --git a/test/runtests.jl b/test/runtests.jl
@@ -8,10 +8,12 @@ import JSON
import CodecZlib
import HTTP
import Dates
+import DataFrames
# Test suites to run; each entry maps to test/UnitTests/<name>.jl,
# included by the loop below.
const testsuite = [
    "customlogger",
    "jsonlines",
    "html_tables"
]
# --------------------------------------------------------------------------------------------------