commit 91a9d2e16e071bfebe3b6c0b181fc3cd333ae370
parent dac0c742bf528d0e7520be6515a015c588008384
Author: Erik Loualiche <eloualiche@users.noreply.github.com>
Date: Thu, 5 Jun 2025 11:24:59 -0500
Merge pull request #5 from LouLouLibs/feature/jsonlines
json lines
Diffstat:
7 files changed, 496 insertions(+), 9 deletions(-)
diff --git a/.gitignore b/.gitignore
@@ -9,5 +9,6 @@ docs/node_modules
docs/.DS_Store
.env
.env.gpg
+.vscode
# ---------------------------------------------------------
diff --git a/Project.toml b/Project.toml
@@ -1,24 +1,26 @@
name = "BazerUtils"
uuid = "36dcebb2-80bb-4116-91f4-ed9f396c4a1c"
authors = ["Erik Loualiche"]
-version = "0.7.1"
+version = "0.8.0"
[deps]
+CodecZlib = "944b1d66-785c-5afd-91f1-9de20f533193"
Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
+JSON3 = "0f8b85d8-7281-11e9-16c2-39a750bddbf1"
Logging = "56ddb016-857b-54e1-b83d-db4d58db5568"
LoggingExtras = "e6f89c97-d47a-5376-807f-9c37f3926c36"
[compat]
+CodecZlib = "0.7.8"
Dates = "1.11.0"
+JSON3 = "1.14.3"
Logging = "1.11.0"
LoggingExtras = "1.1.0"
julia = "1.6.7"
[extras]
-Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3"
-
+Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
[targets]
test = ["Test", "HTTP"]
-
diff --git a/README.md b/README.md
@@ -60,6 +60,11 @@ custom_logger(
```
+### JSON Lines
+
+An easy way to read JSON Lines files into Julia, leaning on the `JSON3` reader.
+
+
## Other stuff
diff --git a/src/BazerUtils.jl b/src/BazerUtils.jl
@@ -2,23 +2,26 @@ module BazerUtils
# --------------------------------------------------------------------------------------------------
-import Dates: format, now, Dates, ISODateTimeFormat
+import Dates: format, now, Dates, ISODateTimeFormat
import Logging: global_logger, Logging, Logging.Debug, Logging.Info, Logging.Warn, AbstractLogger
-import LoggingExtras: ConsoleLogger, EarlyFilteredLogger, FileLogger, FormatLogger,
+import LoggingExtras: ConsoleLogger, EarlyFilteredLogger, FileLogger, FormatLogger,
MinLevelLogger, TeeLogger, TransformerLogger
-
+import JSON3: JSON3
+import CodecZlib: CodecZlib
# --------------------------------------------------------------------------------------------------
# --------------------------------------------------------------------------------------------------
# Import functions
include("CustomLogger.jl")
+include("JSONLines.jl")
# --------------------------------------------------------------------------------------------------
# --------------------------------------------------------------------------------------------------
# List of exported functions
export custom_logger
+export read_jsonl, stream_jsonl, write_jsonl
# --------------------------------------------------------------------------------------------------
diff --git a/src/JSONLines.jl b/src/JSONLines.jl
@@ -0,0 +1,234 @@
+# --------------------------------------------------------------------------------------------------
+
+# JSONLines.jl
+
+# Function to naturally parse json lines files
+# --------------------------------------------------------------------------------------------------
+
+
+# --------------------------------------------------------------------------------------------------
+# Exported function
+# JSONLines
+# --------------------------------------------------------------------------------------------------
+
+
+# --------------------------------------------------------------------------------------------------
+"""
+ read_jsonl(source::Union{AbstractString, IO}) -> Vector
+
+Read a JSON Lines (.jsonl) file or stream and return all records as a vector.
+
+This function reads the entire file or IO stream into memory at once, parsing each line as a separate
+JSON value. Empty lines are automatically skipped.
+
+# Arguments
+- `source::Union{AbstractString, IO}`: Path to the JSON Lines file to read, or an IO stream (e.g., IOBuffer, file handle).
+
+# Returns
+- `Vector`: A vector containing all parsed JSON values from the file or stream.
+
+# Examples
+```julia
+# Read all records from a JSONL file
+data = read_jsonl("data.jsonl")
+
+# Read from an IOBuffer
+buf = IOBuffer("$(JSON3.write(Dict(:a=>1)))\n$(JSON3.write(Dict(:a=>2)))\n")
+data = read_jsonl(buf)
+
+# Access individual records
+first_record = data[1]
+println("First record ID: ", first_record.id)
+```
+
+# Notes
+- This function loads all data into memory, so it may not be suitable for very large files.
+- For large files, consider using `stream_jsonl()` for streaming processing.
+- The function will throw an error if the JSON on any line is malformed.
+- The path must refer to an existing regular file.
+
+# See Also
+- [`stream_jsonl`](@ref): For memory-efficient streaming of large JSONL files.
+"""
+function read_jsonl(io::IO)
+ results = []
+ for line in eachline(io)
+ if !isempty(strip(line))
+ push!(results, JSON3.read(line))
+ end
+ end
+ return results
+end
+
+function read_jsonl(filename::AbstractString)
+ if !isfile(filename)
+ throw(ArgumentError("File does not exist or is not a regular file: $filename"))
+ end
+ open(filename, "r") do io
+ return read_jsonl(io)
+ end
+end
+# --------------------------------------------------------------------------------------------------
+
+
+# --------------------------------------------------------------------------------------------------
+# Using lazy evaluation with generators
+# For very large files, you can create a generator that yields records on demand:
+"""
+ stream_jsonl(source::Union{AbstractString, IO}) -> Channel
+
+Create a lazy iterator for reading JSON Lines files record by record.
+
+This function returns a Channel that yields JSON objects one at a time without loading
+the entire file into memory. This is memory-efficient for processing large JSONL files.
+
+# Arguments
+- `source::Union{AbstractString, IO}`: Path to the JSON Lines file to read, or an IO stream (e.g., IOBuffer, file handle).
+
+# Returns
+- `Channel`: A channel that yields parsed JSON objects one at a time
+
+# Examples
+```julia
+# Process records one at a time (memory efficient)
+for record in stream_jsonl("large_file.jsonl")
+ println("Processing record: ", record.id)
+ # Process each record without loading all into memory
+end
+
+# Collect first N records
+first_10 = collect(Iterators.take(stream_jsonl("data.jsonl"), 10))
+
+# Filter and process
+filtered_records = [r for r in stream_jsonl("data.jsonl") if r.score > 0.5]
+
+# Stream from an IOBuffer
+buf = IOBuffer("$(JSON3.write(Dict(:a=>1)))\n$(JSON3.write(Dict(:a=>2)))\n")
+for record in stream_jsonl(buf)
+ @show record
+end
+```
+
+# Notes
+- This is a lazy iterator - records are only read and parsed when requested
+- Memory usage remains constant regardless of file size
+- Empty lines are automatically skipped
+- The Channel is automatically closed when the file or stream is fully read or an error occurs
+- If JSON parsing fails on any line, the Channel will close and propagate the error
+
+# Performance
+- More memory efficient than `read_jsonl()` for large files
+- Slightly slower per-record access due to Channel overhead
+- Ideal for streaming processing workflows
+
+# See Also
+- [`read_jsonl`](@ref): For loading entire JSONL files into memory at once
+"""
+
+# function stream_jsonl(io::IO)
+# Channel() do ch
+# for line in eachline(io)
+# println("LINE: ", line)
+# if !isempty(strip(line))
+# try
+# put!(ch, JSON3.read(line))
+# catch e
+# @warn "Failed to parse JSON line: $line" exception=e
+# end
+# end
+# end
+# end
+# end
+
+function stream_jsonl(io::IO)
+ Channel() do ch
+ for line in eachline(io)
+ if !isempty(strip(line))
+ put!(ch, JSON3.read(line))
+ end
+ end
+ end
+end
+
+
+function stream_jsonl(filename::AbstractString)
+ if !isfile(filename)
+ throw(ArgumentError("File does not exist or is not a regular file: $filename"))
+ end
+ Channel() do ch
+ open(filename, "r") do io
+ for line in eachline(io)
+ if !isempty(strip(line))
+ put!(ch, JSON3.read(line))
+ end
+ end
+ end
+ end
+end
+# --------------------------------------------------------------------------------------------------
+
+
+# --------------------------------------------------------------------------------------------------
+"""
+ write_jsonl(filename, data; compress=false)
+
+Write an iterable of JSON-serializable values to a JSON Lines file.
+
+- `filename`: Output file path (if ends with `.gz` or `compress=true`, writes gzip-compressed)
+- `data`: An iterable (e.g., Vector, generator) of values (Dict, Array, String, Number, Bool, nothing, etc.)
+
+Returns the filename.
+
+# Example
+```julia
+write_jsonl("out.jsonl", [Dict("a"=>1), Dict("b"=>2)])
+write_jsonl("out.jsonl.gz", (Dict("i"=>i) for i in 1:10^6))
+```
+"""
+function write_jsonl(filename::AbstractString, data; compress::Bool=false)
+ dir = dirname(filename)
+ if !isdir(dir)
+ throw(ArgumentError("Directory does not exist: $dir"))
+ end
+ isgz = compress || endswith(filename, ".gz")
+ openf = isgz ? x->CodecZlib.GzipCompressorStream(open(x, "w")) : x->open(x, "w")
+ io = openf(filename)
+ try
+ for value in data
+ JSON3.write(io, value)
+ write(io, '\n')
+ end
+ finally
+ close(io)
+ end
+ return filename
+end
+# --------------------------------------------------------------------------------------------------
+
+
+# --------------------------------------------------------------------------------------------------
+# not exported
+# d = read_data[1]
+# d isa JSON3.Object{}
+
+function _dict_of_json3(d::JSON3.Object{})
+ result = Dict{Symbol, Any}()
+ for (k, v) in d
+ result[Symbol(k)] = v isa JSON3.Object{} ? _dict_of_json3(v) : v
+ end
+ return result
+end
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/test/UnitTests/jsonlines.jl b/test/UnitTests/jsonlines.jl
@@ -0,0 +1,240 @@
+@testset "JSONLines" begin
+
+
+
+ @testset "stream_jsonl" begin
+
+ data = [
+ Dict("a" => 1, "b" => "foo"),
+ Dict("a" => 2, "b" => "bar"),
+ Dict("a" => 3, "b" => "baz")
+ ]
+ jsonl_file = tempname()
+ open(jsonl_file, "w") do io
+ for obj in data
+ JSON3.write(io, obj)
+ write(io, '\n')
+ end
+ end
+
+
+ # --- iterate
+ stream = stream_jsonl(jsonl_file)
+ @test !(stream isa AbstractArray)
+
+ first_obj = iterate(stream)[1]
+ @test first_obj["a"] == 1
+ @test first_obj["b"] == "foo"
+
+ # Test that the iterator yields the next element correctly
+ second_obj = iterate(stream)[1]
+ @test second_obj["a"] == 2
+ @test second_obj["b"] == "bar"
+
+ third_obj = iterate(stream)[1]
+ @test third_obj["a"] == 3
+ @test third_obj["b"] == "baz"
+
+ @test isnothing(iterate(stream))
+ @test !isopen(stream)
+
+ # --- iterators
+ stream = stream_jsonl(jsonl_file)
+ stateful_stream = Iterators.Stateful(stream)
+ first_obj = popfirst!(stateful_stream)
+ @test first_obj["a"] == 1
+ @test first_obj["b"] == "foo"
+ second_obj = popfirst!(stateful_stream)
+ @test second_obj["a"] == 2
+ @test second_obj["b"] == "bar"
+ third_obj = popfirst!(stateful_stream)
+ @test third_obj["a"] == 3
+ @test third_obj["b"] == "baz"
+ try popfirst!(stateful_stream)
+ catch e
+ @test isa(e, EOFError)
+ end
+
+ # --- collect
+ # Test that the iterator can be collected fully
+ results = collect(stream_jsonl(jsonl_file))
+ @test length(results) == 3
+ @test results[3]["b"] == "baz"
+
+ # Test with empty file
+ empty_file = tempname()
+ open(empty_file, "w") do io end
+ @test collect(stream_jsonl(empty_file)) == []
+ @test !isopen(stream)
+
+ rm(jsonl_file)
+ rm(empty_file)
+end
+
+
+
+@testset "read_jsonl" begin
+ data = [
+ Dict("x" => 10, "y" => "baz"),
+ Dict("x" => 20, "y" => "qux"),
+ Dict("x" => 30, "y" => "zap")
+ ]
+ jsonl_file = tempname()
+ open(jsonl_file, "w") do io
+ for obj in data
+ JSON3.write(io, obj)
+ write(io, '\n')
+ end
+ end
+
+ results = read_jsonl(jsonl_file)
+ @test length(results) == 3
+ @test results[1]["x"] == 10
+ @test results[2]["y"] == "qux"
+ @test results[3]["x"] == 30
+ @test results[3]["y"] == "zap"
+
+ # Test with empty file
+ empty_file = tempname()
+ open(empty_file, "w") do io end
+ @test read_jsonl(empty_file) == []
+
+ # Test with malformed JSON line
+ bad_file = tempname()
+ open(bad_file, "w") do io
+ JSON3.write(io, Dict("a" => 1))
+ write(io, '\n')
+ write(io, "{bad json}\n")
+ end
+ @test_throws Exception read_jsonl(bad_file)
+
+ rm(jsonl_file)
+ rm(empty_file)
+ rm(bad_file)
+end
+# --------------------------------------------------------------------------------------------------
+
+
+# --------------------------------------------------------------------------------------------------
+@testset "Writing" begin
+
+
+ function test_jsonlines_roundtrip(data)
+
+ buf = IOBuffer()
+ # Write each value as a JSON line
+ for obj in data
+ JSON3.write(buf, obj)
+ write(buf, '\n')
+ end
+ seekstart(buf)
+ # String(read(buf))
+
+ # Read all at once
+ read_data = read_jsonl(buf)
+ read_data = read_data isa JSON3.Object ? BazerUtils._dict_of_json3.(read_data) : read_data
+
+ # Stream and collect
+ seekstart(buf)
+ streamed = collect(stream_jsonl(buf))
+ @test streamed == data
+ end
+
+ data_dict = [Dict(:a=>1, :b => Dict(:c => "bar")), Dict(:c=>2)]
+ test_jsonlines_roundtrip(data_dict)
+
+ data_array = [[1,2,3], [4,5,6]]
+ test_jsonlines_roundtrip(data_array)
+
+ # Test gzip
+ jsonl_file = tempname() * ".jsonl.gz"
+ write_jsonl(jsonl_file, data_dict)
+
+ gz_data = read_jsonl(CodecZlib.GzipDecompressorStream(open(jsonl_file)))
+ @test BazerUtils._dict_of_json3.(gz_data) == data_dict
+ # @assert gz_data == data
+
+end
+# --------------------------------------------------------------------------------------------------
+
+
+
+# --------------------------------------------------------------------------------------------------
+@testset "compare speed: stream_jsonl vs read_jsonl for first 10 elements" begin
+ large_file = tempname()
+ open(large_file, "w") do io
+ for i in 1:10^6
+ JSON3.write(io, Dict("i" => i))
+ write(io, '\n')
+ end
+ end
+
+ # Time to get first 10 elements with stream_jsonl
+ t_stream = @elapsed begin
+ stream = stream_jsonl(large_file)
+ first10 = collect(Iterators.take(stream, 10))
+ end
+
+ # Time to get first 10 elements with read_jsonl (loads all)
+ t_read = @elapsed begin
+ all = read_jsonl(large_file)
+ first10_read = all[1:10]
+ end
+
+ @test t_stream < t_read / 10 # streaming should be much faster for first 10
+ @test first10 == first10_read
+
+ rm(large_file)
+end
+# --------------------------------------------------------------------------------------------------
+
+
+# --------------------------------------------------------------------------------------------------
+@testset "Robustness" begin
+
+ @testset "File not found" begin
+ # Test that both functions throw an error when the file does not exist
+ @test_throws Exception stream_jsonl("does_not_exist.jsonl")
+ @test_throws Exception read_jsonl("does_not_exist.jsonl")
+ end
+
+ @testset "trailing newlines and empty lines" begin
+ file = tempname()
+ open(file, "w") do io
+ JSON3.write(io, Dict("a" => 1))
+ write(io, "\n\n") # two trailing newlines (one empty line)
+ JSON3.write(io, Dict("a" => 2))
+ write(io, "\n\n\n") # three trailing newlines (two empty lines)
+ end
+ result_stream = collect(stream_jsonl(file))
+ result_read = read_jsonl(file)
+ @test length(result_stream) == 2
+ @test length(result_read) == 2
+ @test result_stream[1]["a"] == 1
+ @test result_stream[2]["a"] == 2
+ @test result_read[1]["a"] == 1
+ @test result_read[2]["a"] == 2
+ rm(file)
+ end
+
+ @testset "comments or non-JSON lines" begin
+ file = tempname()
+ open(file, "w") do io
+ write(io, "# this is a comment\n")
+ JSON3.write(io, Dict("a" => 1))
+ write(io, "\n")
+ write(io, "// another comment\n")
+ JSON3.write(io, Dict("a" => 2))
+ write(io, "\n")
+ end
+ # Should throw, since comments are not valid JSON
+ @test_throws Exception collect(stream_jsonl(file))
+ @test_throws Exception read_jsonl(file)
+ rm(file)
+ end
+
+end
+
+
+
+end
diff --git a/test/runtests.jl b/test/runtests.jl
@@ -4,10 +4,13 @@ using Test
import Logging: global_logger
import LoggingExtras: ConsoleLogger, TeeLogger
+import JSON3
+import CodecZlib
import HTTP
const testsuite = [
- "customlogger"
+ "customlogger",
+ "jsonlines"
]
# --------------------------------------------------------------------------------------------------
@@ -24,4 +27,3 @@ printstyled("Running tests:\n", color=:blue, bold=true)
end
end
# --------------------------------------------------------------------------------------------------
-