BazerUtils.jl

Assorted Julia utilities including custom logging
Log | Files | Refs | README | LICENSE

commit 4adb77fa5e631a146a12a939aca03b010737507a
parent 250d7d372328d10b7f0fe5de9c1a047e88c10d91
Author: Erik Loualiche <eloualic@umn.edu>
Date:   Thu, 19 Jun 2025 19:36:52 -0500

Fix bugs in the DataFrame interface to JSON Lines; generalize write_jsonl to any Tables.jl-compatible table

Diffstat:
M Project.toml | 2 ++
M src/BazerUtils.jl | 1 +
M src/JSONLines.jl | 44 +++++++++++++++++++++++++++++++++++++++-----
M test/UnitTests/jsonlines.jl | 10 ++++++++++
4 files changed, 52 insertions(+), 5 deletions(-)

diff --git a/Project.toml b/Project.toml @@ -9,6 +9,7 @@ Dates = "ade2ca70-3891-5945-98fb-dc099432e06a" JSON3 = "0f8b85d8-7281-11e9-16c2-39a750bddbf1" Logging = "56ddb016-857b-54e1-b83d-db4d58db5568" LoggingExtras = "e6f89c97-d47a-5376-807f-9c37f3926c36" +Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c" [compat] CodecZlib = "0.7.8" @@ -16,6 +17,7 @@ Dates = "1.11.0" JSON3 = "1.14.3" Logging = "1.11.0" LoggingExtras = "1.1.0" +Tables = "1.12.1" julia = "1.6.7" [extras] diff --git a/src/BazerUtils.jl b/src/BazerUtils.jl @@ -7,6 +7,7 @@ import Logging: global_logger, Logging, Logging.Debug, Logging.Info, Logging.War import LoggingExtras: ConsoleLogger, EarlyFilteredLogger, FileLogger, FormatLogger, MinLevelLogger, TeeLogger, TransformerLogger import JSON3: JSON3 +import Tables: Tables import CodecZlib: CodecZlib # -------------------------------------------------------------------------------------------------- diff --git a/src/JSONLines.jl b/src/JSONLines.jl @@ -182,6 +182,24 @@ end # -------------------------------------------------------------------------------------------------- +abstract type IterationStyle end +struct TableIteration <: IterationStyle end +struct DirectIteration <: IterationStyle end + +function iteration_style(x) + # Only use table iteration for proper table types + if (Tables.istable(x) && !isa(x, AbstractVector) && !isa(x, AbstractDict)) + TableIteration() + else + DirectIteration() + end +end + + +function write_jsonl(filename::AbstractString, data; kwargs...) + write_jsonl(filename, data, iteration_style(data); kwargs...) 
+end + """ write_jsonl(filename, data; compress=false) @@ -198,7 +216,8 @@ write_jsonl("out.jsonl", [Dict("a"=>1), Dict("b"=>2)]) write_jsonl("out.jsonl.gz", (Dict("i"=>i) for i in 1:10^6)) ``` """ -function write_jsonl(filename::AbstractString, data; compress::Bool=false) +function write_jsonl(filename::AbstractString, data, ::TableIteration; compress::Bool=false) + # @warn "Implementation for tables" dir = dirname(filename) if !isdir(dir) throw(ArgumentError("Directory does not exist: $dir")) @@ -207,7 +226,7 @@ function write_jsonl(filename::AbstractString, data; compress::Bool=false) openf = isgz ? x->CodecZlib.GzipCompressorStream(open(x, "w")) : x->open(x, "w") io = openf(filename) try - for value in data + for value in Tables.namedtupleiterator(data) JSON3.write(io, value) write(io, '\n') end @@ -217,9 +236,24 @@ function write_jsonl(filename::AbstractString, data; compress::Bool=false) return filename end -function write_jsonl(filename::AbstractString, data::AbstractDataFrame; kwargs...) - row_tuples = (NamedTuple(row) for row in eachrow(data)) - write_jsonl(filename, row_tuples; kwargs...) +function write_jsonl(filename::AbstractString, data, ::DirectIteration; compress::Bool=false) + # @warn "Implementation for direct iteration" + dir = dirname(filename) + if !isdir(dir) + throw(ArgumentError("Directory does not exist: $dir")) + end + isgz = compress || endswith(filename, ".gz") + openf = isgz ? 
x->CodecZlib.GzipCompressorStream(open(x, "w")) : x->open(x, "w") + io = openf(filename) + try + for value in data + JSON3.write(io, value) + write(io, '\n') + end + finally + close(io) + end + return filename end # -------------------------------------------------------------------------------------------------- diff --git a/test/UnitTests/jsonlines.jl b/test/UnitTests/jsonlines.jl @@ -163,6 +163,16 @@ end @test BazerUtils._dict_of_json3.(gz_data) == data_dict # @assert gz_data == data + jsonl_file = tempname() * ".jsonl" + simple_table = [ + (id=1, name="Alice", age=30), + (id=2, name="Bob", age=25), + (id=3, name="Charlie", age=35) + ] + write_jsonl(jsonl_file, simple_table) + simple_dict = read_jsonl(jsonl_file) + @test BazerUtils._dict_of_json3.(simple_dict) == map(row -> Dict(pairs(row)), simple_table) + end # --------------------------------------------------------------------------------------------------