jsonlines.jl (7184B)
# Test suite for the JSON Lines helpers `stream_jsonl`, `read_jsonl` and `write_jsonl`.
@testset "JSONLines" begin

    # Write every element of `objs` to `io` as one JSON document per line.
    # Fixtures are produced with `JSON.json` directly (not `write_jsonl`) so that the
    # reader tests do not depend on the writer under test.
    function write_json_lines(io::IO, objs)
        for obj in objs
            JSON.json(io, obj)
            write(io, '\n')
        end
        return nothing
    end

    @testset "stream_jsonl" begin
        data = [
            Dict("a" => 1, "b" => "foo"),
            Dict("a" => 2, "b" => "bar"),
            Dict("a" => 3, "b" => "baz"),
        ]
        jsonl_file = tempname()
        open(jsonl_file, "w") do io
            write_json_lines(io, data)
        end

        # --- iterate
        # Each stateless `iterate(stream)` call is expected to yield the *next* record,
        # i.e. the stream itself carries the position.
        stream = stream_jsonl(jsonl_file)
        @test !(stream isa AbstractArray)

        first_obj = iterate(stream)[1]
        @test first_obj["a"] == 1
        @test first_obj["b"] == "foo"

        second_obj = iterate(stream)[1]
        @test second_obj["a"] == 2
        @test second_obj["b"] == "bar"

        third_obj = iterate(stream)[1]
        @test third_obj["a"] == 3
        @test third_obj["b"] == "baz"

        # Once exhausted, iteration ends and the stream reports itself closed.
        @test isnothing(iterate(stream))
        @test !isopen(stream)

        # --- iterators
        stream = stream_jsonl(jsonl_file)
        stateful_stream = Iterators.Stateful(stream)
        first_obj = popfirst!(stateful_stream)
        @test first_obj["a"] == 1
        @test first_obj["b"] == "foo"
        second_obj = popfirst!(stateful_stream)
        @test second_obj["a"] == 2
        @test second_obj["b"] == "bar"
        third_obj = popfirst!(stateful_stream)
        @test third_obj["a"] == 3
        @test third_obj["b"] == "baz"
        @test_throws EOFError popfirst!(stateful_stream)

        # --- collect
        # The iterator can be collected fully.
        results = collect(stream_jsonl(jsonl_file))
        @test length(results) == 3
        @test results[3]["b"] == "baz"

        # Empty file: nothing is yielded and the stream ends up closed.
        # The stream is bound to its own name so the `isopen` check inspects the
        # empty-file stream rather than a stream left over from an earlier section.
        empty_file = tempname()
        touch(empty_file)
        empty_stream = stream_jsonl(empty_file)
        @test isempty(collect(empty_stream))
        @test !isopen(empty_stream)

        # Wrong types: a JSON array on the third line fails with the default element
        # type but is accepted when `T=Any`.
        stream = stream_jsonl(IOBuffer("{\"a\": 1}\n{\"a\": 2}\n[1,2,3]"))
        @test_throws TaskFailedException collect(stream)
        stream = stream_jsonl(IOBuffer("{\"a\": 1}\n{\"a\": 2}\n[1,2,3]"), T=Any)
        @test collect(stream)[3] == [1, 2, 3]

        rm(jsonl_file)
        rm(empty_file)
    end

    @testset "read_jsonl" begin
        data = [
            Dict("x" => 10, "y" => "baz"),
            Dict("x" => 20, "y" => "qux"),
            Dict("x" => 30, "y" => "zap"),
        ]
        jsonl_file = tempname()
        open(jsonl_file, "w") do io
            write_json_lines(io, data)
        end

        results = read_jsonl(jsonl_file)
        @test length(results) == 3
        @test results[1]["x"] == 10
        @test results[2]["y"] == "qux"
        @test results[3]["x"] == 30
        @test results[3]["y"] == "zap"

        results = read_jsonl(jsonl_file; dict_of_json=true)
        @test results isa Vector{Dict{Symbol, Any}}

        # Empty file
        empty_file = tempname()
        touch(empty_file)
        @test isempty(read_jsonl(empty_file))

        # Malformed JSON on the second line must raise.
        bad_file = tempname()
        open(bad_file, "w") do io
            JSON.json(io, Dict("a" => 1))
            write(io, '\n')
            write(io, "{bad json}\n")
        end
        @test_throws Exception read_jsonl(bad_file)

        rm(jsonl_file)
        rm(empty_file)
        rm(bad_file)
    end

    # ----------------------------------------------------------------------------------------
    @testset "Writing" begin

        # Serialize `data` into an in-memory buffer, then check that reading it all at
        # once and streaming it yield the same records, one per input element.
        function test_jsonlines_roundtrip(data)
            buf = IOBuffer()
            write_json_lines(buf, data)
            seekstart(buf)

            # Read all at once
            read_data = read_jsonl(buf)
            @test length(read_data) == length(data)

            # Stream and collect
            seekstart(buf)
            streamed = collect(stream_jsonl(buf, T=Any))
            @test streamed == read_data
        end

        data_dict = [Dict(:a => 1, :b => Dict(:c => "bar")), Dict(:c => 2)]
        test_jsonlines_roundtrip(data_dict)

        data_array = [[1, 2, 3], [4, 5, 6]]
        test_jsonlines_roundtrip(data_array)

        # Gzip: written through `write_jsonl`, read back through an explicit decompressor.
        # The `do` block guarantees the underlying file handle is closed.
        gz_file = tempname() * ".jsonl.gz"
        write_jsonl(gz_file, data_dict)
        gz_data = open(gz_file) do io
            read_jsonl(CodecZlib.GzipDecompressorStream(io))
        end
        @test BazerUtils._dict_of_json.(gz_data) == data_dict

        # Table-like input: a vector of NamedTuples.
        table_file = tempname() * ".jsonl"
        simple_table = [
            (id=1, name="Alice", age=30),
            (id=2, name="Bob", age=25),
            (id=3, name="Charlie", age=35),
        ]
        write_jsonl(table_file, simple_table)
        simple_dict = read_jsonl(table_file)
        @test BazerUtils._dict_of_json.(simple_dict) ==
            map(row -> Dict(pairs(row)), simple_table)

        # These paths carry a suffix appended to `tempname()`, so remove them explicitly.
        rm(gz_file; force=true)
        rm(table_file; force=true)
    end
    # ----------------------------------------------------------------------------------------

    # ----------------------------------------------------------------------------------------
    @testset "compare speed: stream_jsonl vs read_jsonl for first 10 elements" begin
        large_file = tempname()
        open(large_file, "w") do io
            write_json_lines(io, (Dict("i" => i) for i in 1:10^6))
        end

        # Time to get first 10 elements with stream_jsonl
        t_stream = @elapsed begin
            stream = stream_jsonl(large_file)
            first10 = collect(Iterators.take(stream, 10))
        end

        # Time to get first 10 elements with read_jsonl (loads all)
        t_read = @elapsed begin
            all_rows = read_jsonl(large_file)  # not named `all`: that would shadow Base.all
            first10_read = all_rows[1:10]
        end

        # NOTE(review): wall-clock comparison; may be sensitive to machine load.
        @test t_stream < t_read / 10  # streaming should be much faster for first 10
        @test first10 == first10_read

        # NOTE(review): `stream` was only partially consumed above; confirm the file can
        # be removed while it is still pending on every supported platform.
        rm(large_file)
    end
    # ----------------------------------------------------------------------------------------

    # ----------------------------------------------------------------------------------------
    @testset "Robustness" begin

        @testset "File not found" begin
            # Both functions must throw when the file does not exist.
            @test_throws Exception stream_jsonl("does_not_exist.jsonl")
            @test_throws Exception read_jsonl("does_not_exist.jsonl")
        end

        @testset "trailing newlines and empty lines" begin
            file = tempname()
            open(file, "w") do io
                JSON.json(io, Dict("a" => 1))
                write(io, "\n\n")    # two trailing newlines (one empty line)
                JSON.json(io, Dict("a" => 2))
                write(io, "\n\n\n")  # three trailing newlines (two empty lines)
            end
            result_stream = collect(stream_jsonl(file))
            result_read = read_jsonl(file)
            # Blank lines must be skipped by both readers.
            @test length(result_stream) == 2
            @test length(result_read) == 2
            @test result_stream[1]["a"] == 1
            @test result_stream[2]["a"] == 2
            @test result_read[1]["a"] == 1
            @test result_read[2]["a"] == 2
            rm(file)
        end

        @testset "comments or non-JSON lines" begin
            file = tempname()
            open(file, "w") do io
                write(io, "# this is a comment\n")
                JSON.json(io, Dict("a" => 1))
                write(io, "\n")
                write(io, "// another comment\n")
                JSON.json(io, Dict("a" => 2))
                write(io, "\n")
            end
            # Should throw, since comments are not valid JSON
            @test_throws Exception collect(stream_jsonl(file))
            @test_throws Exception read_jsonl(file)
            rm(file)
        end

    end

end