BazerUtils.jl

Assorted Julia utilities including custom logging
Log | Files | Refs | README | LICENSE

commit 91a9d2e16e071bfebe3b6c0b181fc3cd333ae370
parent dac0c742bf528d0e7520be6515a015c588008384
Author: Erik Loualiche <eloualiche@users.noreply.github.com>
Date:   Thu,  5 Jun 2025 11:24:59 -0500

Merge pull request #5 from LouLouLibs/feature/jsonlines

json lines
Diffstat:
M.gitignore | 1+
MProject.toml | 10++++++----
MREADME.md | 5+++++
Msrc/BazerUtils.jl | 9++++++---
Asrc/JSONLines.jl | 234+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Atest/UnitTests/jsonlines.jl | 240+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Mtest/runtests.jl | 6++++--
7 files changed, 496 insertions(+), 9 deletions(-)

diff --git a/.gitignore b/.gitignore @@ -9,5 +9,6 @@ docs/node_modules docs/.DS_Store .env .env.gpg +.vscode # --------------------------------------------------------- diff --git a/Project.toml b/Project.toml @@ -1,24 +1,26 @@ name = "BazerUtils" uuid = "36dcebb2-80bb-4116-91f4-ed9f396c4a1c" authors = ["Erik Loualiche"] -version = "0.7.1" +version = "0.8.0" [deps] +CodecZlib = "944b1d66-785c-5afd-91f1-9de20f533193" Dates = "ade2ca70-3891-5945-98fb-dc099432e06a" +JSON3 = "0f8b85d8-7281-11e9-16c2-39a750bddbf1" Logging = "56ddb016-857b-54e1-b83d-db4d58db5568" LoggingExtras = "e6f89c97-d47a-5376-807f-9c37f3926c36" [compat] +CodecZlib = "0.7.8" Dates = "1.11.0" +JSON3 = "1.14.3" Logging = "1.11.0" LoggingExtras = "1.1.0" julia = "1.6.7" [extras] -Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3" - +Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" [targets] test = ["Test", "HTTP"] - diff --git a/README.md b/README.md @@ -60,6 +60,11 @@ custom_logger( ``` +### JSON Lines + +An easy way to read JSON Lines files into Julia, leaning on the `JSON3` reader. 
+ + ## Other stuff diff --git a/src/BazerUtils.jl b/src/BazerUtils.jl @@ -2,23 +2,26 @@ module BazerUtils # -------------------------------------------------------------------------------------------------- -import Dates: format, now, Dates, ISODateTimeFormat +import Dates: format, now, Dates, ISODateTimeFormat import Logging: global_logger, Logging, Logging.Debug, Logging.Info, Logging.Warn, AbstractLogger -import LoggingExtras: ConsoleLogger, EarlyFilteredLogger, FileLogger, FormatLogger, +import LoggingExtras: ConsoleLogger, EarlyFilteredLogger, FileLogger, FormatLogger, MinLevelLogger, TeeLogger, TransformerLogger - +import JSON3: JSON3 +import CodecZlib: CodecZlib # -------------------------------------------------------------------------------------------------- # -------------------------------------------------------------------------------------------------- # Import functions include("CustomLogger.jl") +include("JSONLines.jl") # -------------------------------------------------------------------------------------------------- # -------------------------------------------------------------------------------------------------- # List of exported functions export custom_logger +export read_jsonl, stream_jsonl, write_jsonl # -------------------------------------------------------------------------------------------------- diff --git a/src/JSONLines.jl b/src/JSONLines.jl @@ -0,0 +1,234 @@ +# -------------------------------------------------------------------------------------------------- + +# JSONLines.jl + +# Function to naturally parse json lines files +# -------------------------------------------------------------------------------------------------- + + +# -------------------------------------------------------------------------------------------------- +# Exported function +# JSONLines +# -------------------------------------------------------------------------------------------------- + + +# 
-------------------------------------------------------------------------------------------------- +""" + read_jsonl(source::Union{AbstractString, IO}) -> Vector + +Read a JSON Lines (.jsonl) file or stream and return all records as a vector. + +This function reads the entire file or IO stream into memory at once, parsing each line as a separate +JSON value. Empty lines are automatically skipped. + +# Arguments +- `source::Union{AbstractString, IO}`: Path to the JSON Lines file to read, or an IO stream (e.g., IOBuffer, file handle). + +# Returns +- `Vector`: A vector containing all parsed JSON values from the file or stream. + +# Examples +```julia +# Read all records from a JSONL file +data = read_jsonl("data.jsonl") + +# Read from an IOBuffer +buf = IOBuffer("$(JSON3.write(Dict(:a=>1)))\n$(JSON3.write(Dict(:a=>2)))\n") +data = read_jsonl(buf) + +# Access individual records +first_record = data[1] +println("First record ID: ", first_record.id) +``` + +# Notes +- This function loads all data into memory, so it may not be suitable for very large files. +- For large files, consider using `stream_jsonl()` for streaming processing. +- The function will throw an error if the JSON on any line is malformed. +- The path must refer to an existing regular file. + +# See Also +- [`stream_jsonl`](@ref): For memory-efficient streaming of large JSONL files. 
+""" +function read_jsonl(io::IO) + results = [] + for line in eachline(io) + if !isempty(strip(line)) + push!(results, JSON3.read(line)) + end + end + return results +end + +function read_jsonl(filename::AbstractString) + if !isfile(filename) + throw(ArgumentError("File does not exist or is not a regular file: $filename")) + end + open(filename, "r") do io + return read_jsonl(io) + end +end +# -------------------------------------------------------------------------------------------------- + + +# -------------------------------------------------------------------------------------------------- +# Using lazy evaluation with generators +# For very large files, you can create a generator that yields records on demand: +""" + stream_jsonl(source::Union{AbstractString, IO}) -> Channel + +Create a lazy iterator for reading JSON Lines files record by record. + +This function returns a Channel that yields JSON objects one at a time without loading +the entire file into memory. This is memory-efficient for processing large JSONL files. + +# Arguments +- `source::Union{AbstractString, IO}`: Path to the JSON Lines file to read, or an IO stream (e.g., IOBuffer, file handle). 
+ +# Returns +- `Channel`: A channel that yields parsed JSON objects one at a time + +# Examples +```julia +# Process records one at a time (memory efficient) +for record in stream_jsonl("large_file.jsonl") + println("Processing record: ", record.id) + # Process each record without loading all into memory +end + +# Collect first N records +first_10 = collect(Iterators.take(stream_jsonl("data.jsonl"), 10)) + +# Filter and process +filtered_records = [r for r in stream_jsonl("data.jsonl") if r.score > 0.5] + +# Stream from an IOBuffer +buf = IOBuffer("$(JSON3.write(Dict(:a=>1)))\n$(JSON3.write(Dict(:a=>2)))\n") +for record in stream_jsonl(buf) + @show record +end +``` + +# Notes +- This is a lazy iterator - records are only read and parsed when requested +- Memory usage remains constant regardless of file size +- Empty lines are automatically skipped +- The Channel is automatically closed when the file or stream is fully read or an error occurs +- If JSON parsing fails on any line, the Channel will close and propagate the error + +# Performance +- More memory efficient than `read_jsonl()` for large files +- Slightly slower per-record access due to Channel overhead +- Ideal for streaming processing workflows + +# See Also +- [`read_jsonl`](@ref): For loading entire JSONL files into memory at once +""" + +# function stream_jsonl(io::IO) +# Channel() do ch +# for line in eachline(io) +# println("LINE: ", line) +# if !isempty(strip(line)) +# try +# put!(ch, JSON3.read(line)) +# catch e +# @warn "Failed to parse JSON line: $line" exception=e +# end +# end +# end +# end +# end + +function stream_jsonl(io::IO) + Channel() do ch + for line in eachline(io) + if !isempty(strip(line)) + put!(ch, JSON3.read(line)) + end + end + end +end + + +function stream_jsonl(filename::AbstractString) + if !isfile(filename) + throw(ArgumentError("File does not exist or is not a regular file: $filename")) + end + Channel() do ch + open(filename, "r") do io + for line in eachline(io) + if 
!isempty(strip(line)) + put!(ch, JSON3.read(line)) + end + end + end + end +end +# -------------------------------------------------------------------------------------------------- + + +# -------------------------------------------------------------------------------------------------- +""" + write_jsonl(filename, data; compress=false) + +Write an iterable of JSON-serializable values to a JSON Lines file. + +- `filename`: Output file path (if ends with `.gz` or `compress=true`, writes gzip-compressed) +- `data`: An iterable (e.g., Vector, generator) of values (Dict, Array, String, Number, Bool, nothing, etc.) + +Returns the filename. + +# Example +```julia +write_jsonl("out.jsonl", [Dict("a"=>1), Dict("b"=>2)]) +write_jsonl("out.jsonl.gz", (Dict("i"=>i) for i in 1:10^6)) +``` +""" +function write_jsonl(filename::AbstractString, data; compress::Bool=false) + dir = dirname(filename) + if !isdir(dir) + throw(ArgumentError("Directory does not exist: $dir")) + end + isgz = compress || endswith(filename, ".gz") + openf = isgz ? x->CodecZlib.GzipCompressorStream(open(x, "w")) : x->open(x, "w") + io = openf(filename) + try + for value in data + JSON3.write(io, value) + write(io, '\n') + end + finally + close(io) + end + return filename +end +# -------------------------------------------------------------------------------------------------- + + +# -------------------------------------------------------------------------------------------------- +# not exported +# d = read_data[1] +# d isa JSON3.Object{} + +function _dict_of_json3(d::JSON3.Object{}) + result = Dict{Symbol, Any}() + for (k, v) in d + result[Symbol(k)] = v isa JSON3.Object{} ? 
_dict_of_json3(v) : v + end + return result +end + + + + + + + + + + + + + + diff --git a/test/UnitTests/jsonlines.jl b/test/UnitTests/jsonlines.jl @@ -0,0 +1,240 @@ +@testset "JSONLines" begin + + + + @testset "stream_jsonl" begin + + data = [ + Dict("a" => 1, "b" => "foo"), + Dict("a" => 2, "b" => "bar"), + Dict("a" => 3, "b" => "baz") + ] + jsonl_file = tempname() + open(jsonl_file, "w") do io + for obj in data + JSON3.write(io, obj) + write(io, '\n') + end + end + + + # --- iterate + stream = stream_jsonl(jsonl_file) + @test !(stream isa AbstractArray) + + first_obj = iterate(stream)[1] + @test first_obj["a"] == 1 + @test first_obj["b"] == "foo" + + # Test that the iterator yields the next element correctly + second_obj = iterate(stream)[1] + @test second_obj["a"] == 2 + @test second_obj["b"] == "bar" + + third_obj = iterate(stream)[1] + @test third_obj["a"] == 3 + @test third_obj["b"] == "baz" + + @test isnothing(iterate(stream)) + @test !isopen(stream) + + # --- iterators + stream = stream_jsonl(jsonl_file) + stateful_stream = Iterators.Stateful(stream) + first_obj = popfirst!(stateful_stream) + @test first_obj["a"] == 1 + @test first_obj["b"] == "foo" + second_obj = popfirst!(stateful_stream) + @test second_obj["a"] == 2 + @test second_obj["b"] == "bar" + third_obj = popfirst!(stateful_stream) + @test third_obj["a"] == 3 + @test third_obj["b"] == "baz" + try popfirst!(stateful_stream) + catch e + @test isa(e, EOFError) + end + + # --- collect + # Test that the iterator can be collected fully + results = collect(stream_jsonl(jsonl_file)) + @test length(results) == 3 + @test results[3]["b"] == "baz" + + # Test with empty file + empty_file = tempname() + open(empty_file, "w") do io end + @test collect(stream_jsonl(empty_file)) == [] + @test !isopen(stream) + + rm(jsonl_file) + rm(empty_file) +end + + + +@testset "read_jsonl" begin + data = [ + Dict("x" => 10, "y" => "baz"), + Dict("x" => 20, "y" => "qux"), + Dict("x" => 30, "y" => "zap") + ] + jsonl_file = 
tempname() + open(jsonl_file, "w") do io + for obj in data + JSON3.write(io, obj) + write(io, '\n') + end + end + + results = read_jsonl(jsonl_file) + @test length(results) == 3 + @test results[1]["x"] == 10 + @test results[2]["y"] == "qux" + @test results[3]["x"] == 30 + @test results[3]["y"] == "zap" + + # Test with empty file + empty_file = tempname() + open(empty_file, "w") do io end + @test read_jsonl(empty_file) == [] + + # Test with malformed JSON line + bad_file = tempname() + open(bad_file, "w") do io + JSON3.write(io, Dict("a" => 1)) + write(io, '\n') + write(io, "{bad json}\n") + end + @test_throws Exception read_jsonl(bad_file) + + rm(jsonl_file) + rm(empty_file) + rm(bad_file) +end +# -------------------------------------------------------------------------------------------------- + + +# -------------------------------------------------------------------------------------------------- +@testset "Writing" begin + + + function test_jsonlines_roundtrip(data) + + buf = IOBuffer() + # Write each value as a JSON line + for obj in data + JSON3.write(buf, obj) + write(buf, '\n') + end + seekstart(buf) + # String(read(buf)) + + # Read all at once + read_data = read_jsonl(buf) + read_data = read_data isa JSON3.Object ? 
BazerUtils._dict_of_json3.(read_data) : read_data + + # Stream and collect + seekstart(buf) + streamed = collect(stream_jsonl(buf)) + @test streamed == data + end + + data_dict = [Dict(:a=>1, :b => Dict(:c => "bar")), Dict(:c=>2)] + test_jsonlines_roundtrip(data_dict) + + data_array = [[1,2,3], [4,5,6]] + test_jsonlines_roundtrip(data_array) + + # Test gzip + jsonl_file = tempname() * ".jsonl.gz" + write_jsonl(jsonl_file, data_dict) + + gz_data = read_jsonl(CodecZlib.GzipDecompressorStream(open(jsonl_file))) + @test BazerUtils._dict_of_json3.(gz_data) == data_dict + # @assert gz_data == data + +end +# -------------------------------------------------------------------------------------------------- + + + +# -------------------------------------------------------------------------------------------------- +@testset "compare speed: stream_jsonl vs read_jsonl for first 10 elements" begin + large_file = tempname() + open(large_file, "w") do io + for i in 1:10^6 + JSON3.write(io, Dict("i" => i)) + write(io, '\n') + end + end + + # Time to get first 10 elements with stream_jsonl + t_stream = @elapsed begin + stream = stream_jsonl(large_file) + first10 = collect(Iterators.take(stream, 10)) + end + + # Time to get first 10 elements with read_jsonl (loads all) + t_read = @elapsed begin + all = read_jsonl(large_file) + first10_read = all[1:10] + end + + @test t_stream < t_read / 10 # streaming should be much faster for first 10 + @test first10 == first10_read + + rm(large_file) +end +# -------------------------------------------------------------------------------------------------- + + +# -------------------------------------------------------------------------------------------------- +@testset "Robustness" begin + + @testset "File not found" begin + # Test that both functions throw an error when the file does not exist + @test_throws Exception stream_jsonl("does_not_exist.jsonl") + @test_throws Exception read_jsonl("does_not_exist.jsonl") + end + + @testset "trailing 
newlines and empty lines" begin + file = tempname() + open(file, "w") do io + JSON3.write(io, Dict("a" => 1)) + write(io, "\n\n") # two trailing newlines (one empty line) + JSON3.write(io, Dict("a" => 2)) + write(io, "\n\n\n") # three trailing newlines (two empty lines) + end + result_stream = collect(stream_jsonl(file)) + result_read = read_jsonl(file) + @test length(result_stream) == 2 + @test length(result_read) == 2 + @test result_stream[1]["a"] == 1 + @test result_stream[2]["a"] == 2 + @test result_read[1]["a"] == 1 + @test result_read[2]["a"] == 2 + rm(file) + end + + @testset "comments or non-JSON lines" begin + file = tempname() + open(file, "w") do io + write(io, "# this is a comment\n") + JSON3.write(io, Dict("a" => 1)) + write(io, "\n") + write(io, "// another comment\n") + JSON3.write(io, Dict("a" => 2)) + write(io, "\n") + end + # Should throw, since comments are not valid JSON + @test_throws Exception collect(stream_jsonl(file)) + @test_throws Exception read_jsonl(file) + rm(file) + end + +end + + + +end diff --git a/test/runtests.jl b/test/runtests.jl @@ -4,10 +4,13 @@ using Test import Logging: global_logger import LoggingExtras: ConsoleLogger, TeeLogger +import JSON3 +import CodecZlib import HTTP const testsuite = [ - "customlogger" + "customlogger", + "jsonlines" ] # -------------------------------------------------------------------------------------------------- @@ -24,4 +27,3 @@ printstyled("Running tests:\n", color=:blue, bold=true) end end # -------------------------------------------------------------------------------------------------- -