BazerUtils.jl

Assorted Julia utilities including custom logging
Log | Files | Refs | README | LICENSE

commit 76ebffe7fb9a3acbb5c8c6a0063848ee74c74fa6
parent d706ce71dd256756537cfec82027fdbea5cab8cc
Author: Erik Loualiche <eloualic@umn.edu>
Date:   Thu,  5 Jun 2025 21:48:38 -0500

some adjustment on typing

Diffstat:
MProject.toml | 2+-
Mdocs/make.jl | 2+-
Mdocs/src/man/read_jsonl.md | 57+++++++++++++++++++++++++++++++++++++++++----------------
Msrc/JSONLines.jl | 145+++++++++++++++++++++++++++++++++++++++++++------------------------------------
Mtest/UnitTests/jsonlines.jl | 11++++++++++-
5 files changed, 132 insertions(+), 85 deletions(-)

diff --git a/Project.toml b/Project.toml @@ -1,7 +1,7 @@ name = "BazerUtils" uuid = "36dcebb2-80bb-4116-91f4-ed9f396c4a1c" authors = ["Erik Loualiche"] -version = "0.8.0" +version = "0.8.2" [deps] CodecZlib = "944b1d66-785c-5afd-91f1-9de20f533193" diff --git a/docs/make.jl b/docs/make.jl @@ -54,7 +54,7 @@ deploydocs(; branch = "gh-pages", devbranch = "main", # or "master" versions = [ - "stable" => "0.8.1", + "stable" => "0.8.2", "dev" => "dev", ], ) diff --git a/docs/src/man/read_jsonl.md b/docs/src/man/read_jsonl.md @@ -32,31 +32,50 @@ You can use the `read_jsonl` and `stream_jsonl` functions to read JSONL files or Reads the entire file or stream into memory and returns a vector of parsed JSON values. ```julia +using BazerUtils +import JSON3 data = read_jsonl("data.jsonl") # or from an IOBuffer -buf = IOBuffer("{\"a\": 1}\n{\"a\": 2}\n") -data = read_jsonl(buf) +buf = +data = read_jsonl(IOBuffer("{\"a\": 1}\n{\"a\": 2}\n")) +data = read_jsonl(IOBuffer("{\"a\": 1}\n{\"a\": 2}\n"); dict_of_json=true) ``` + - **Arguments:** `source::Union{AbstractString, IO}` - **Returns:** `Vector` of parsed JSON values - **Note:** Loads all data into memory. For large files, use `stream_jsonl`. --- + ### `stream_jsonl` Creates a lazy iterator (Channel) that yields one parsed JSON value at a time, without loading the entire file into memory. ```julia -for record in stream_jsonl("data.jsonl") - println(record) -end +stream = stream_jsonl(IOBuffer("{\"a\": 1}\n{\"a\": 2}\n")) +data = collect(stream) +BazerUtils._dict_of_json3.(data) + +stream = stream_jsonl(IOBuffer("{\"a\": 1}\n{\"a\": 2}\n[1,2,3]")) +collect(stream) # error because types of vector elements are not all JSON3.Object{} +stream = stream_jsonl(IOBuffer("{\"a\": 1}\n{\"a\": 2}\n[1,2,3]"), T=Any) +collect(stream) # default to Vector{Any} + +stream = stream_jsonl(IOBuffer("[4,5,6]\n[1,2,3]"), T= JSON3.Array{}) +collect(stream) +stream = stream_jsonl(IOBuffer("4\n1"), T=Int) +collect(stream) +``` -# Collect the first 10 records -first10 = collect(Iterators.take(stream_jsonl("data.jsonl"), 10)) +Allows iterators +```julia +first10 = collect(Iterators.take(stream_jsonl("data.jsonl"), 10)) # Collect the first 10 records +# see tests for other iterators ... ``` + - **Arguments:** `source::Union{AbstractString, IO}` - **Returns:** `Channel` (iterator) of parsed JSON values - **Note:** Ideal for large files and streaming workflows. @@ -83,18 +102,24 @@ write_jsonl("out.jsonl.gz", (Dict("i"=>i) for i in 1:100); compress=true) ## Example: Roundtrip with IOBuffer -Note that there is no stable roundtrip between read and write, because of the way `JSON3` processes record into dictionaries. +Note that there is no stable roundtrip between read and write, because of the way `JSON3` processes record into dictionaries and even when we add the dict flag it is `Symbol => Any` ```julia -data = [Dict("a"=>1), Dict("b"=>2)] -buf = IOBuffer() -for obj in data - JSON3.write(buf, obj) - write(buf, '\n') +data_string = [Dict("a"=>1), Dict("b"=>2)] +data_symbol = [Dict(:a=>1), Dict(:b=>2)] + +function roundtrip(data) + buf = IOBuffer() + for obj in data + JSON3.write(buf, obj) + write(buf, '\n') + end + seekstart(buf) + return read_jsonl(buf; dict_of_json=true) end -seekstart(buf) -read_data = read_jsonl(buf) -@assert read_data == data + +roundtrip(data_string) == data_string +roundtrip(data_symbol) == data_symbol ``` --- diff --git a/src/JSONLines.jl b/src/JSONLines.jl @@ -14,7 +14,7 @@ # -------------------------------------------------------------------------------------------------- """ - read_jsonl(source::Union{AbstractString, IO}) -> Vector + read_jsonl(source::Union{AbstractString, IO}; dict_of_json::Bool=false) -> Vector Read a JSON Lines (.jsonl) file or stream and return all records as a vector. @@ -23,6 +23,7 @@ JSON value. Empty lines are automatically skipped. # Arguments - `source::Union{AbstractString, IO}`: Path to the JSON Lines file to read, or an IO stream (e.g., IOBuffer, file handle). +- `dict_of_json::Bool=false`: If `true` and the parsed type is `JSON3.Object`, convert each record to a `Dict{Symbol,Any}`. # Returns - `Vector`: A vector containing all parsed JSON values from the file or stream. @@ -36,6 +37,9 @@ data = read_jsonl("data.jsonl") buf = IOBuffer("$(JSON3.write(Dict(:a=>1)))\n$(JSON3.write(Dict(:a=>2)))\n") data = read_jsonl(buf) +# Convert JSON3.Object records to Dict +data = read_jsonl("data.jsonl"; dict_of_json=true) + # Access individual records first_record = data[1] println("First record ID: ", first_record.id) @@ -46,26 +50,38 @@ println("First record ID: ", first_record.id) - For large files, consider using `stream_jsonl()` for streaming processing. - The function will throw an error if the JSON on any line is malformed. - The path must refer to an existing regular file. +- If `dict_of_json=true`, all records must be of type `JSON3.Object`. # See Also - [`stream_jsonl`](@ref): For memory-efficient streaming of large JSONL files. """ -function read_jsonl(io::IO) - results = [] - for line in eachline(io) - if !isempty(strip(line)) - push!(results, JSON3.read(line)) - end +function read_jsonl(io::IO; dict_of_json::Bool=false) + lines = collect(eachline(io)) + nonempty_lines = filter(l -> !isempty(strip(l)), lines) + isempty(nonempty_lines) && return [] + + first_val = JSON3.read(nonempty_lines[1]) + T = typeof(first_val) + results = Vector{T}(undef, length(nonempty_lines)) + results[1] = first_val + + for (i, line) in enumerate(nonempty_lines[2:end]) + results[i+1] = JSON3.read(line) + end + @show T + if dict_of_json && T <: JSON3.Object{} + results = [_dict_of_json3(r) for r in results] end + return results end -function read_jsonl(filename::AbstractString) +function read_jsonl(filename::AbstractString; kwargs...) if !isfile(filename) throw(ArgumentError("File does not exist or is not a regular file: $filename")) end open(filename, "r") do io - return read_jsonl(io) + return read_jsonl(io; kwargs...) end end # -------------------------------------------------------------------------------------------------- @@ -75,25 +91,27 @@ end # Using lazy evaluation with generators # For very large files, you can create a generator that yields records on demand: """ - stream_jsonl(source::Union{AbstractString, IO}) -> Channel + stream_jsonl(source::Union{AbstractString, IO}; T::Type=JSON3.Object{}) -> Channel -Create a lazy iterator for reading JSON Lines files record by record. +Create a lazy iterator (Channel) for reading JSON Lines files record by record. This function returns a Channel that yields JSON objects one at a time without loading the entire file into memory. This is memory-efficient for processing large JSONL files. +Each parsed record is checked to match the specified type `T` (default: `JSON3.Object{}`). +If a record does not match `T`, an error is thrown. # Arguments - `source::Union{AbstractString, IO}`: Path to the JSON Lines file to read, or an IO stream (e.g., IOBuffer, file handle). +- `T::Type=JSON3.Object{}`: The expected type for each parsed record. Use `T=Any` to allow mixed types. # Returns -- `Channel`: A channel that yields parsed JSON objects one at a time +- `Channel{T}`: A channel that yields parsed JSON objects one at a time. # Examples ```julia # Process records one at a time (memory efficient) for record in stream_jsonl("large_file.jsonl") println("Processing record: ", record.id) - # Process each record without loading all into memory end # Collect first N records @@ -107,60 +125,55 @@ buf = IOBuffer("$(JSON3.write(Dict(:a=>1)))\n$(JSON3.write(Dict(:a=>2)))\n") for record in stream_jsonl(buf) @show record end + +# Allow mixed types +for record in stream_jsonl("data.jsonl"; T=Any) + @show record +end ``` # Notes -- This is a lazy iterator - records are only read and parsed when requested -- Memory usage remains constant regardless of file size -- Empty lines are automatically skipped -- The Channel is automatically closed when the file or stream is fully read or an error occurs -- If JSON parsing fails on any line, the Channel will close and propagate the error - -# Performance -- More memory efficient than `read_jsonl()` for large files -- Slightly slower per-record access due to Channel overhead -- Ideal for streaming processing workflows +- This is a lazy iterator: records are only read and parsed when requested. +- Memory usage remains constant regardless of file size. +- Empty lines are automatically skipped. +- The Channel is automatically closed when the file or stream is fully read or an error occurs. +- If JSON parsing fails on any line, the Channel will close and propagate the error. +- For file paths, the file remains open for the lifetime of the channel. +- For IO streams, the user is responsible for keeping the IO open while consuming the channel. +- If a parsed record does not match `T`, an error is thrown. Use `T=Any` to allow mixed types. # See Also -- [`read_jsonl`](@ref): For loading entire JSONL files into memory at once +- [`read_jsonl`](@ref): For loading entire JSONL files into memory at once. """ - -# function stream_jsonl(io::IO) -# Channel() do ch -# for line in eachline(io) -# println("LINE: ", line) -# if !isempty(strip(line)) -# try -# put!(ch, JSON3.read(line)) -# catch e -# @warn "Failed to parse JSON line: $line" exception=e -# end -# end -# end -# end -# end - -function stream_jsonl(io::IO) - Channel() do ch - for line in eachline(io) - if !isempty(strip(line)) - put!(ch, JSON3.read(line)) +function stream_jsonl(io::IO; T::Type=JSON3.Object{}) + lines = Iterators.filter(l -> !isempty(strip(l)), eachline(io)) + return Channel{T}() do ch + for line in lines + val = JSON3.read(line) + if !isa(val, T) + throw(ArgumentError("Parsed value of type $(typeof(val)) does not match expected type $T;\nTry specifying T::Any")) end + put!(ch, val) end end end -function stream_jsonl(filename::AbstractString) +function stream_jsonl(filename::AbstractString; T::Type=JSON3.Object{}) if !isfile(filename) throw(ArgumentError("File does not exist or is not a regular file: $filename")) end - Channel() do ch + return Channel{T}() do ch open(filename, "r") do io for line in eachline(io) - if !isempty(strip(line)) - put!(ch, JSON3.read(line)) + if isempty(strip(line)) + continue end + val = JSON3.read(line) + if !isa(val, T) + throw(ArgumentError("Parsed value of type $(typeof(val)) does not match expected type $T")) + end + put!(ch, val) end end end @@ -207,10 +220,23 @@ end # -------------------------------------------------------------------------------------------------- -# not exported -# d = read_data[1] -# d isa JSON3.Object{} +""" + _dict_of_json3(obj::JSON3.Object) -> Dict{Symbol, Any} + +Recursively convert a `JSON3.Object` (from JSON3.jl) into a standard Julia `Dict` with `Symbol` keys. + +This function traverses the input `JSON3.Object`, converting all keys to `Symbol` and recursively converting any nested `JSON3.Object` values. Non-object values are left unchanged. + +# Arguments +- `obj::JSON3.Object`: The JSON3 object to convert. + +# Returns +- `Dict{Symbol, Any}`: A Julia dictionary with symbol keys and values converted recursively. +# Notes +- This function is intended for internal use and is not exported. +- Useful for converting parsed JSON3 objects into standard Julia dictionaries for easier manipulation. +""" function _dict_of_json3(d::JSON3.Object{}) result = Dict{Symbol, Any}() for (k, v) in d @@ -218,17 +244,4 @@ function _dict_of_json3(d::JSON3.Object{}) end return result end - - - - - - - - - - - - - - +# -------------------------------------------------------------------------------------------------- diff --git a/test/UnitTests/jsonlines.jl b/test/UnitTests/jsonlines.jl @@ -67,6 +67,12 @@ @test collect(stream_jsonl(empty_file)) == [] @test !isopen(stream) + # Test wrong types + stream = stream_jsonl(IOBuffer("{\"a\": 1}\n{\"a\": 2}\n[1,2,3]")) + @test_throws TaskFailedException collect(stream) + stream = stream_jsonl(IOBuffer("{\"a\": 1}\n{\"a\": 2}\n[1,2,3]"), T=Any) + @test collect(stream)[3] == [1,2,3] + rm(jsonl_file) rm(empty_file) end @@ -94,6 +100,9 @@ end @test results[3]["x"] == 30 @test results[3]["y"] == "zap" + results = read_jsonl(jsonl_file; dict_of_json=true) + @test results isa Vector{Dict{Symbol, Any}} + # Test with empty file empty_file = tempname() open(empty_file, "w") do io end @@ -136,7 +145,7 @@ end # Stream and collect seekstart(buf) - streamed = collect(stream_jsonl(buf)) + streamed = collect(stream_jsonl(buf, T=Any)) @test streamed == data end