html_tables.jl (10611B)
using Test
using BazerUtils
using DataFrames

# Test suite for `read_html_tables`. Each case feeds a small HTML document (or, for
# the integration test, a URL) to the parser and checks the resulting DataFrames.
# Cell values are compared as strings throughout.
@testset "HTMLTables" begin

    # ==============================================================================
    # Tier 1: Core table parsing
    # ==============================================================================

    @testset "Tier 1: Core parsing" begin

        @testset "basic table with thead/tbody" begin
            html = """
            <table>
            <thead><tr><th>A</th><th>B</th></tr></thead>
            <tbody><tr><td>1</td><td>2</td></tr>
            <tr><td>3</td><td>4</td></tr></tbody>
            </table>"""
            dfs = read_html_tables(html)
            @test length(dfs) == 1
            df = dfs[1]
            @test names(df) == ["A", "B"]
            @test size(df) == (2, 2)
            @test df[1, "A"] == "1"
            @test df[2, "B"] == "4"
        end

        @testset "table without thead (auto-detect from th rows)" begin
            html = """
            <table>
            <tr><th>X</th><th>Y</th></tr>
            <tr><td>a</td><td>b</td></tr>
            </table>"""
            dfs = read_html_tables(html)
            @test length(dfs) == 1
            @test names(dfs[1]) == ["X", "Y"]
            @test dfs[1][1, "X"] == "a"
        end

        @testset "multiple tbody elements concatenated" begin
            html = """
            <table>
            <thead><tr><th>A</th><th>B</th></tr></thead>
            <tbody><tr><td>1</td><td>2</td></tr></tbody>
            <tbody><tr><td>3</td><td>4</td></tr></tbody>
            </table>"""
            dfs = read_html_tables(html)
            @test size(dfs[1]) == (2, 2)
            @test dfs[1][2, "A"] == "3"
        end

        @testset "tfoot with data appended to body" begin
            html = """
            <table>
            <thead><tr><th>A</th><th>B</th></tr></thead>
            <tbody><tr><td>1</td><td>2</td></tr></tbody>
            <tfoot><tr><td>foot1</td><td>foot2</td></tr></tfoot>
            </table>"""
            dfs = read_html_tables(html)
            @test size(dfs[1]) == (2, 2)
            @test dfs[1][2, "A"] == "foot1"
        end

        @testset "mixed th/td in body row" begin
            # A <th> inside a tbody row is expected to be read as an ordinary cell.
            html = """
            <table>
            <thead><tr><th>Country</th><th>City</th><th>Year</th></tr></thead>
            <tbody><tr><td>Ukraine</td><th>Odessa</th><td>1944</td></tr></tbody>
            </table>"""
            dfs = read_html_tables(html)
            @test dfs[1][1, "City"] == "Odessa"
        end

        @testset "single column table" begin
            html = """
            <table>
            <tr><th>Only</th></tr>
            <tr><td>val</td></tr>
            </table>"""
            dfs = read_html_tables(html)
            @test size(dfs[1]) == (1, 1)
            @test names(dfs[1]) == ["Only"]
        end

        @testset "empty table skipped" begin
            # The first table has no rows, so only the second should be returned.
            html = """
            <table><tbody></tbody></table>
            <table>
            <tr><th>A</th></tr>
            <tr><td>1</td></tr>
            </table>"""
            dfs = read_html_tables(html)
            @test length(dfs) == 1
            @test names(dfs[1]) == ["A"]
        end

        @testset "multiple tables in document" begin
            html = """
            <table><tr><th>T1</th></tr><tr><td>a</td></tr></table>
            <table><tr><th>T2</th></tr><tr><td>b</td></tr></table>
            <table><tr><th>T3</th></tr><tr><td>c</td></tr></table>"""
            dfs = read_html_tables(html)
            @test length(dfs) == 3
            @test names(dfs[2]) == ["T2"]
        end

        @testset "match kwarg filters tables" begin
            # Only the first table contains text matching the regex.
            html = """
            <table><tr><th>Name</th></tr><tr><td>park</td></tr></table>
            <table><tr><th>Other</th></tr><tr><td>data</td></tr></table>"""
            dfs = read_html_tables(html; match=r"park"i)
            @test length(dfs) == 1
            @test names(dfs[1]) == ["Name"]
        end

    end # Tier 1


    # ==============================================================================
    # Tier 2: Colspan/rowspan
    # ==============================================================================

    @testset "Tier 2: Colspan/rowspan" begin

        @testset "colspan=1 and rowspan=1 are no-ops" begin
            html = """
            <table>
            <tr><th>A</th><th colspan="1">B</th><th rowspan="1">C</th></tr>
            <tr><td>a</td><td>b</td><td>c</td></tr>
            </table>"""
            dfs = read_html_tables(html)
            @test names(dfs[1]) == ["A", "B", "C"]
            @test dfs[1][1, "B"] == "b"
        end

        @testset "colspan=2 in header" begin
            # Columns are addressed by position here; the names produced for the
            # spanned header are not asserted by this test.
            html = """
            <table>
            <tr><th colspan="2">Wide</th><th>Narrow</th></tr>
            <tr><td>a</td><td>b</td><td>c</td></tr>
            </table>"""
            dfs = read_html_tables(html)
            @test size(dfs[1], 2) == 3
            @test dfs[1][1, 1] == "a"
            @test dfs[1][1, 3] == "c"
        end

        @testset "colspan=2 in body" begin
            # The spanning cell's text is expected in every column it covers.
            html = """
            <table>
            <tr><th>A</th><th>B</th><th>C</th></tr>
            <tr><td colspan="2">wide</td><td>c</td></tr>
            </table>"""
            dfs = read_html_tables(html)
            @test dfs[1][1, "A"] == "wide"
            @test dfs[1][1, "B"] == "wide"
            @test dfs[1][1, "C"] == "c"
        end

        @testset "rowspan=2 in body" begin
            # The spanning cell's text is expected in every row it covers.
            html = """
            <table>
            <tr><th>A</th><th>B</th></tr>
            <tr><td rowspan="2">tall</td><td>1</td></tr>
            <tr><td>2</td></tr>
            </table>"""
            dfs = read_html_tables(html)
            @test size(dfs[1]) == (2, 2)
            @test dfs[1][1, "A"] == "tall"
            @test dfs[1][2, "A"] == "tall"
            @test dfs[1][2, "B"] == "2"
        end

        @testset "rowspan at end of row" begin
            html = """
            <table>
            <tr><th>A</th><th>B</th></tr>
            <tr><td>x</td><td rowspan="2">y</td></tr>
            <tr><td>z</td></tr>
            </table>"""
            dfs = read_html_tables(html)
            @test dfs[1][2, "B"] == "y"
            @test dfs[1][2, "A"] == "z"
        end

        @testset "both rowspan and colspan on same cell" begin
            # "block" covers a 2x3 rectangle (rows 1-2, columns B-D).
            html = """
            <table>
            <tr><th>A</th><th>B</th><th>C</th><th>D</th><th>E</th></tr>
            <tr><td rowspan="2">a</td><td rowspan="2" colspan="3">block</td><td>e1</td></tr>
            <tr><td>e2</td></tr>
            </table>"""
            dfs = read_html_tables(html)
            @test size(dfs[1]) == (2, 5)
            @test dfs[1][1, "B"] == "block"
            @test dfs[1][1, "C"] == "block"
            @test dfs[1][1, "D"] == "block"
            @test dfs[1][2, "B"] == "block"
            @test dfs[1][2, "D"] == "block"
            @test dfs[1][2, "A"] == "a"
            @test dfs[1][1, "E"] == "e1"
            @test dfs[1][2, "E"] == "e2"
        end

        @testset "rowspan spanning header into body" begin
            # The header cell "A" reaches into the first data row, so that row's
            # column A is expected to hold the text "A".
            html = """
            <table>
            <tr><th rowspan="2">A</th><th>B</th></tr>
            <tr><td>1</td></tr>
            <tr><td>C</td><td>2</td></tr>
            </table>"""
            dfs = read_html_tables(html)
            @test names(dfs[1]) == ["A", "B"]
            @test dfs[1][1, "A"] == "A"
            @test dfs[1][1, "B"] == "1"
            @test dfs[1][2, "A"] == "C"
            @test dfs[1][2, "B"] == "2"
        end

        @testset "rowspan-only rows" begin
            # No further <tr> elements exist; three rows are expected from the
            # rowspan="3" cells alone.
            html = """
            <table>
            <tr><th>A</th><th>B</th></tr>
            <tr><td rowspan="3">x</td><td rowspan="3">y</td></tr>
            </table>"""
            dfs = read_html_tables(html)
            @test size(dfs[1]) == (3, 2)
            @test dfs[1][3, "A"] == "x"
            @test dfs[1][3, "B"] == "y"
        end

    end # Tier 2


    # ==============================================================================
    # Tier 3: Multi-level headers + flatten
    # ==============================================================================

    @testset "Tier 3: Multi-level headers" begin

        @testset "two th rows give string-tuple column names" begin
            html = """
            <table>
            <tr><th>A</th><th>B</th></tr>
            <tr><th>a</th><th>b</th></tr>
            <tr><td>1</td><td>2</td></tr>
            </table>"""
            dfs = read_html_tables(html)
            @test names(dfs[1]) == ["(A, a)", "(B, b)"]
        end

        @testset "flatten=:join joins with underscore" begin
            html = """
            <table>
            <tr><th>A</th><th>B</th></tr>
            <tr><th>a</th><th>b</th></tr>
            <tr><td>1</td><td>2</td></tr>
            </table>"""
            dfs = read_html_tables(html; flatten=:join)
            @test names(dfs[1]) == ["A_a", "B_b"]
        end

        @testset "flatten=:last takes last level" begin
            html = """
            <table>
            <tr><th>A</th><th>B</th></tr>
            <tr><th>a</th><th>b</th></tr>
            <tr><td>1</td><td>2</td></tr>
            </table>"""
            dfs = read_html_tables(html; flatten=:last)
            @test names(dfs[1]) == ["a", "b"]
        end

        @testset "Wikipedia-style colspan grouping with sub-headers" begin
            # "Name" and "Year" span both header rows; "Size" groups two sub-columns.
            html = """
            <table>
            <tr><th rowspan="2">Name</th><th colspan="2">Size</th><th rowspan="2">Year</th></tr>
            <tr><th>acres</th><th>ha</th></tr>
            <tr><td>Park A</td><td>100</td><td>40</td><td>1920</td></tr>
            </table>"""
            dfs = read_html_tables(html)
            @test names(dfs[1]) ==
                ["(Name, Name)", "(Size, acres)", "(Size, ha)", "(Year, Year)"]
            @test dfs[1][1, "(Size, acres)"] == "100"

            dfs2 = read_html_tables(html; flatten=:last)
            @test names(dfs2[1]) == ["Name", "acres", "ha", "Year"]
        end

    end # Tier 3


    # ==============================================================================
    # Tier 4: Data quality
    # ==============================================================================

    @testset "Tier 4: Data quality" begin

        @testset "empty cells become missing" begin
            html = """
            <table>
            <tr><th>A</th><th>B</th></tr>
            <tr><td></td><td>val</td></tr>
            </table>"""
            dfs = read_html_tables(html)
            @test ismissing(dfs[1][1, "A"])
            @test dfs[1][1, "B"] == "val"
        end

        @testset "ragged rows padded with missing" begin
            html = """
            <table>
            <tr><th>A</th><th>B</th><th>C</th></tr>
            <tr><td>1</td></tr>
            </table>"""
            dfs = read_html_tables(html)
            @test dfs[1][1, "A"] == "1"
            @test ismissing(dfs[1][1, "B"])
            @test ismissing(dfs[1][1, "C"])
        end

        @testset "br inside cell becomes space" begin
            html = """
            <table>
            <tr><th>A</th></tr>
            <tr><td>word1<br>word2</td></tr>
            </table>"""
            dfs = read_html_tables(html)
            @test dfs[1][1, "A"] == "word1 word2"
        end

        @testset "style tag stripped from header" begin
            html = """
            <table>
            <tr><th><style>.x{color:red}</style>Name</th><th>B</th></tr>
            <tr><td>a</td><td>b</td></tr>
            </table>"""
            dfs = read_html_tables(html)
            # Surrounding whitespace is tolerated; only the CSS text must be gone.
            @test strip(names(dfs[1])[1]) == "Name"
        end

        @testset "whitespace normalization" begin
            html = """
            <table>
            <tr><th> A </th></tr>
            <tr><td> val </td></tr>
            </table>"""
            dfs = read_html_tables(html)
            @test names(dfs[1]) == ["A"]
            @test dfs[1][1, "A"] == "val"
        end

    end # Tier 4


    # ==============================================================================
    # Integration: real Wikipedia page
    # ==============================================================================

    @testset "Integration: Wikipedia state parks" begin
        # Needs network access. A download failure is logged and the assertions are
        # skipped; any other exception is propagated so real bugs still surface.
        try
            dfs = read_html_tables(
                "https://en.wikipedia.org/wiki/List_of_Alabama_state_parks";
                match=r"[Nn]ame", flatten=:last)
            @test length(dfs) >= 1
            df = dfs[1]
            @test any(contains.(lowercase.(names(df)), "name"))
            @test nrow(df) > 10
        catch e
            # Classify by type name rather than `e isa HTTP.Exceptions.StatusError` /
            # `e isa Downloads.RequestError`: this file loads neither HTTP nor
            # Downloads, so naming those modules here would raise UndefVarError from
            # inside the handler and hide the original failure.
            if nameof(typeof(e)) in (:StatusError, :RequestError)
                @warn "Skipping Wikipedia test (network error)"
            else
                rethrow()
            end
        end
    end

end # HTMLTables