BazerUtils.jl

Assorted Julia utilities including custom logging
Log | Files | Refs | README | LICENSE

jsonlines.jl (7184B)


      1 @testset "JSONLines" begin
      2 
      3 
      4 
      5     @testset "stream_jsonl" begin
      6 
      7     data = [
      8         Dict("a" => 1, "b" => "foo"),
      9         Dict("a" => 2, "b" => "bar"),
     10         Dict("a" => 3, "b" => "baz")
     11     ]
     12     jsonl_file = tempname()
     13     open(jsonl_file, "w") do io
     14         for obj in data
     15             JSON.json(io, obj)
     16             write(io, '\n')
     17         end
     18     end
     19 
     20 
     21     # --- iterate
     22     stream = stream_jsonl(jsonl_file)
     23     @test !(stream isa AbstractArray)
     24 
     25     first_obj = iterate(stream)[1]
     26     @test first_obj["a"] == 1
     27     @test first_obj["b"] == "foo"
     28 
     29     # Test that the iterator yields the next element correctly
     30     second_obj = iterate(stream)[1]
     31     @test second_obj["a"] == 2
     32     @test second_obj["b"] == "bar"
     33 
     34     third_obj = iterate(stream)[1]
     35     @test third_obj["a"] == 3
     36     @test third_obj["b"] == "baz"
     37 
     38     @test isnothing(iterate(stream))
     39     @test !isopen(stream)
     40 
     41     # --- iterators
     42     stream = stream_jsonl(jsonl_file)
     43     stateful_stream = Iterators.Stateful(stream)
     44     first_obj = popfirst!(stateful_stream)
     45     @test first_obj["a"] == 1
     46     @test first_obj["b"] == "foo"
     47     second_obj = popfirst!(stateful_stream)
     48     @test second_obj["a"] == 2
     49     @test second_obj["b"] == "bar"
     50     third_obj = popfirst!(stateful_stream)
     51     @test third_obj["a"] == 3
     52     @test third_obj["b"] == "baz"
     53     @test_throws EOFError popfirst!(stateful_stream)
     54 
     55     # --- collect
     56     # Test that the iterator can be collected fully
     57     results = collect(stream_jsonl(jsonl_file))
     58     @test length(results) == 3
     59     @test results[3]["b"] == "baz"
     60 
     61     # Test with empty file
     62     empty_file = tempname()
     63     open(empty_file, "w") do io end
     64     @test collect(stream_jsonl(empty_file)) == []
     65     @test !isopen(stream)
     66 
     67     # Test wrong types
     68     stream = stream_jsonl(IOBuffer("{\"a\": 1}\n{\"a\": 2}\n[1,2,3]"))
     69     @test_throws TaskFailedException collect(stream)
     70     stream = stream_jsonl(IOBuffer("{\"a\": 1}\n{\"a\": 2}\n[1,2,3]"), T=Any)
     71     @test collect(stream)[3] == [1,2,3]
     72 
     73     rm(jsonl_file)
     74     rm(empty_file)
     75 end
     76 
     77 
     78 
     79 @testset "read_jsonl" begin
     80     data = [
     81         Dict("x" => 10, "y" => "baz"),
     82         Dict("x" => 20, "y" => "qux"),
     83         Dict("x" => 30, "y" => "zap")
     84     ]
     85     jsonl_file = tempname()
     86     open(jsonl_file, "w") do io
     87         for obj in data
     88             JSON.json(io, obj)
     89             write(io, '\n')
     90         end
     91     end
     92 
     93     results = read_jsonl(jsonl_file)
     94     @test length(results) == 3
     95     @test results[1]["x"] == 10
     96     @test results[2]["y"] == "qux"
     97     @test results[3]["x"] == 30
     98     @test results[3]["y"] == "zap"
     99 
    100     results = read_jsonl(jsonl_file; dict_of_json=true)
    101     @test results isa Vector{Dict{Symbol, Any}}
    102 
    103     # Test with empty file
    104     empty_file = tempname()
    105     open(empty_file, "w") do io end
    106     @test read_jsonl(empty_file) == []
    107 
    108     # Test with malformed JSON line
    109     bad_file = tempname()
    110     open(bad_file, "w") do io
    111         JSON.json(io, Dict("a" => 1))
    112         write(io, '\n')
    113         write(io, "{bad json}\n")
    114     end
    115     @test_throws Exception read_jsonl(bad_file)
    116 
    117     rm(jsonl_file)
    118     rm(empty_file)
    119     rm(bad_file)
    120 end
    121 # --------------------------------------------------------------------------------------------------
    122 
    123 
    124 # --------------------------------------------------------------------------------------------------
    125 @testset "Writing" begin
    126 
    127 
    128     function test_jsonlines_roundtrip(data)
    129 
    130         buf = IOBuffer()
    131         # Write each value as a JSON line
    132         for obj in data
    133             JSON.json(buf, obj)
    134             write(buf, '\n')
    135         end
    136         seekstart(buf)
    137 
    138         # Read all at once
    139         read_data = read_jsonl(buf)
    140 
    141         # Stream and collect
    142         seekstart(buf)
    143         streamed = collect(stream_jsonl(buf, T=Any))
    144         @test streamed == read_data
    145     end
    146 
    147     data_dict = [Dict(:a=>1, :b => Dict(:c => "bar")), Dict(:c=>2)]
    148     test_jsonlines_roundtrip(data_dict)
    149 
    150     data_array = [[1,2,3], [4,5,6]]
    151     test_jsonlines_roundtrip(data_array)
    152 
    153     # Test gzip
    154     jsonl_file = tempname() * ".jsonl.gz"
    155     write_jsonl(jsonl_file, data_dict)
    156 
    157     gz_data = read_jsonl(CodecZlib.GzipDecompressorStream(open(jsonl_file)))
    158     @test BazerUtils._dict_of_json.(gz_data) == data_dict
    159     # @assert gz_data == data
    160 
    161     jsonl_file = tempname() * ".jsonl"
    162     simple_table = [
    163         (id=1, name="Alice", age=30),
    164         (id=2, name="Bob", age=25),
    165         (id=3, name="Charlie", age=35)
    166     ]
    167     write_jsonl(jsonl_file, simple_table)
    168     simple_dict = read_jsonl(jsonl_file)
    169     @test BazerUtils._dict_of_json.(simple_dict) == map(row -> Dict(pairs(row)), simple_table)
    170 
    171 end
    172 # --------------------------------------------------------------------------------------------------
    173 
    174 
    175 
    176 # --------------------------------------------------------------------------------------------------
    177 @testset "compare speed: stream_jsonl vs read_jsonl for first 10 elements" begin
    178     large_file = tempname()
    179     open(large_file, "w") do io
    180         for i in 1:10^6
    181             JSON.json(io, Dict("i" => i))
    182             write(io, '\n')
    183         end
    184     end
    185 
    186     # Time to get first 10 elements with stream_jsonl
    187     t_stream = @elapsed begin
    188         stream = stream_jsonl(large_file)
    189         first10 = collect(Iterators.take(stream, 10))
    190     end
    191 
    192     # Time to get first 10 elements with read_jsonl (loads all)
    193     t_read = @elapsed begin
    194         all = read_jsonl(large_file)
    195         first10_read = all[1:10]
    196     end
    197 
    198     @test t_stream < t_read / 10  # streaming should be much faster for first 10
    199     @test first10 == first10_read
    200 
    201     rm(large_file)
    202 end
    203 # --------------------------------------------------------------------------------------------------
    204 
    205 
    206 # --------------------------------------------------------------------------------------------------
    207 @testset "Robustness" begin
    208 
    209     @testset "File not found" begin
    210     # Test that both functions throw an error when the file does not exist
    211     @test_throws Exception stream_jsonl("does_not_exist.jsonl")
    212     @test_throws Exception read_jsonl("does_not_exist.jsonl")
    213     end
    214 
    215     @testset "trailing newlines and empty lines" begin
    216     file = tempname()
    217     open(file, "w") do io
    218         JSON.json(io, Dict("a" => 1))
    219         write(io, "\n\n")  # two trailing newlines (one empty line)
    220         JSON.json(io, Dict("a" => 2))
    221         write(io, "\n\n\n")  # three trailing newlines (two empty lines)
    222     end
    223     result_stream = collect(stream_jsonl(file))
    224     result_read = read_jsonl(file)
    225     @test length(result_stream) == 2
    226     @test length(result_read) == 2
    227     @test result_stream[1]["a"] == 1
    228     @test result_stream[2]["a"] == 2
    229     @test result_read[1]["a"] == 1
    230     @test result_read[2]["a"] == 2
    231     rm(file)
    232     end
    233 
    234     @testset "comments or non-JSON lines" begin
    235     file = tempname()
    236     open(file, "w") do io
    237         write(io, "# this is a comment\n")
    238         JSON.json(io, Dict("a" => 1))
    239         write(io, "\n")
    240         write(io, "// another comment\n")
    241         JSON.json(io, Dict("a" => 2))
    242         write(io, "\n")
    243     end
    244     # Should throw, since comments are not valid JSON
    245     @test_throws Exception collect(stream_jsonl(file))
    246     @test_throws Exception read_jsonl(file)
    247     rm(file)
    248     end
    249 
    250 end
    251 
    252 
    253 
    254 end