BazerUtils.jl

Assorted Julia utilities including custom logging
Log | Files | Refs | README | LICENSE

JSONLines.jl (8033B)


      1 # --------------------------------------------------------------------------------------------------
      2 
      3 # JSONLines.jl
      4 
# Functions to read and write JSON Lines (.jsonl) files
      6 # --------------------------------------------------------------------------------------------------
      7 
      8 
      9 # --------------------------------------------------------------------------------------------------
     10 # Exported function
     11 # JSONLines
     12 # --------------------------------------------------------------------------------------------------
     13 
     14 
     15 # --------------------------------------------------------------------------------------------------
     16 """
     17     read_jsonl(source::Union{AbstractString, IO}; dict_of_json::Bool=false) -> Vector
     18 
     19 !!! warning "Deprecated"
     20     `read_jsonl` is deprecated. Use `JSON.parse(source; jsonlines=true)` from
     21     [JSON.jl](https://github.com/JuliaIO/JSON.jl) v1 instead.
     22 
     23 Read a JSON Lines (.jsonl) file or stream and return all records as a vector.
     24 
     25 Each line is parsed as a separate JSON value. Empty lines are skipped.
     26 
     27 # Arguments
     28 - `source::Union{AbstractString, IO}`: Path to a JSONL file, or an IO stream.
     29 - `dict_of_json::Bool=false`: If `true` and the parsed type is `JSON.Object`, convert each record to a `Dict{Symbol,Any}`.
     30 
     31 # Returns
     32 - `Vector`: A vector of parsed JSON values.
     33 """
     34 function read_jsonl(io::IO; dict_of_json::Bool=false)
     35     Base.depwarn("`read_jsonl` is deprecated. Use `JSON.parse(io; jsonlines=true)` from JSON.jl v1 instead.", :read_jsonl)
     36     lines = collect(eachline(io))
     37     nonempty_lines = filter(l -> !isempty(strip(l)), lines)
     38     isempty(nonempty_lines) && return []
     39 
     40     first_val = JSON.parse(nonempty_lines[1])
     41     T = typeof(first_val)
     42     results = Vector{T}(undef, length(nonempty_lines))
     43     results[1] = first_val
     44 
     45     for (i, line) in enumerate(nonempty_lines[2:end])
     46         results[i+1] = JSON.parse(line)
     47     end
     48     if dict_of_json && T <: JSON.Object
     49         results = [_dict_of_json(r) for r in results]
     50     end
     51 
     52     return results
     53 end
     54 
     55 function read_jsonl(filename::AbstractString; kwargs...)
     56     if !isfile(filename)
     57         throw(ArgumentError("File does not exist or is not a regular file: $filename"))
     58     end
     59     open(filename, "r") do io
     60         return read_jsonl(io; kwargs...)
     61     end
     62 end
     63 # --------------------------------------------------------------------------------------------------
     64 
     65 
     66 # --------------------------------------------------------------------------------------------------
# Lazy reading through a Channel
# For very large files, records can be parsed and yielded on demand instead of loading them all at once:
     69 """
     70     stream_jsonl(source::Union{AbstractString, IO}; T::Type=JSON.Object{String, Any}) -> Channel
     71 
     72 !!! warning "Deprecated"
     73     `stream_jsonl` is deprecated. Use `JSON.parse(source; jsonlines=true)` from
     74     [JSON.jl](https://github.com/JuliaIO/JSON.jl) v1 instead.
     75 
     76 Create a lazy Channel iterator for reading JSON Lines files record by record.
     77 
     78 # Arguments
     79 - `source::Union{AbstractString, IO}`: Path to a JSONL file, or an IO stream.
     80 - `T::Type=JSON.Object{String, Any}`: Expected type for each record. Use `T=Any` for mixed types.
     81 
     82 # Returns
     83 - `Channel{T}`: A channel yielding parsed JSON objects one at a time.
     84 """
     85 function stream_jsonl(io::IO; T::Type=JSON.Object{String, Any})
     86     Base.depwarn("`stream_jsonl` is deprecated. Use `JSON.parse(io; jsonlines=true)` from JSON.jl v1 instead.", :stream_jsonl)
     87     lines = Iterators.filter(l -> !isempty(strip(l)), eachline(io))
     88     return Channel{T}() do ch
     89         for line in lines
     90             val = JSON.parse(line)
     91             if !isa(val, T)
     92                 throw(ArgumentError("Parsed value of type $(typeof(val)) does not match expected type $T;\nTry specifying T::Any"))
     93             end
     94             put!(ch, val)
     95         end
     96     end
     97 end
     98 
     99 
    100 function stream_jsonl(filename::AbstractString; T::Type=JSON.Object{String, Any})
    101     Base.depwarn("`stream_jsonl` is deprecated. Use `JSON.parse(filename; jsonlines=true)` from JSON.jl v1 instead.", :stream_jsonl)
    102     if !isfile(filename)
    103         throw(ArgumentError("File does not exist or is not a regular file: $filename"))
    104     end
    105     return Channel{T}() do ch
    106         open(filename, "r") do io
    107             for line in eachline(io)
    108                 if isempty(strip(line))
    109                     continue
    110                 end
    111                 val = JSON.parse(line)
    112                 if !isa(val, T)
    113                     throw(ArgumentError("Parsed value of type $(typeof(val)) does not match expected type $T"))
    114                 end
    115                 put!(ch, val)
    116             end
    117         end
    118     end
    119 end
    120 # --------------------------------------------------------------------------------------------------
    121 
    122 
    123 # --------------------------------------------------------------------------------------------------
    124 abstract type IterationStyle end
    125 struct TableIteration <: IterationStyle end
    126 struct DirectIteration <: IterationStyle end
    127 
    128 function iteration_style(x)
    129     # Only use table iteration for proper table types
    130     if (Tables.istable(x) && !isa(x, AbstractVector) && !isa(x, AbstractDict))
    131         TableIteration()
    132     else
    133         DirectIteration()
    134     end
    135 end
    136 
    137 
    138 function write_jsonl(filename::AbstractString, data; kwargs...)
    139     Base.depwarn("`write_jsonl` is deprecated. Use `JSON.json(filename, data; jsonlines=true)` from JSON.jl v1 instead.", :write_jsonl)
    140     write_jsonl(filename, data, iteration_style(data); kwargs...)
    141 end
    142 
    143 """
    144     write_jsonl(filename, data; compress=false)
    145 
    146 !!! warning "Deprecated"
    147     `write_jsonl` is deprecated. Use `JSON.json(filename, data; jsonlines=true)` from
    148     [JSON.jl](https://github.com/JuliaIO/JSON.jl) v1 instead.
    149 
    150 Write an iterable of JSON-serializable values to a JSON Lines file.
    151 
    152 # Arguments
    153 - `filename`: Output file path (writes gzip-compressed if ends with `.gz` or `compress=true`)
    154 - `data`: An iterable of JSON-serializable values
    155 - `compress::Bool=false`: Force gzip compression
    156 
    157 # Returns
    158 The filename.
    159 """
    160 function write_jsonl(filename::AbstractString, data, ::TableIteration; compress::Bool=false)
    161     dir = dirname(filename)
    162     if !isempty(dir) && !isdir(dir)
    163         throw(ArgumentError("Directory does not exist: $dir"))
    164     end
    165     isgz = compress || endswith(filename, ".gz")
    166     openf = isgz ? x->CodecZlib.GzipCompressorStream(open(x, "w")) : x->open(x, "w")
    167     io = openf(filename)
    168     try
    169         for value in Tables.namedtupleiterator(data)
    170             JSON.json(io, value)
    171             write(io, '\n')
    172         end
    173     finally
    174         close(io)
    175     end
    176     return filename
    177 end
    178 
    179 function write_jsonl(filename::AbstractString, data, ::DirectIteration; compress::Bool=false)
    180     dir = dirname(filename)
    181     if !isempty(dir) && !isdir(dir)
    182         throw(ArgumentError("Directory does not exist: $dir"))
    183     end
    184     isgz = compress || endswith(filename, ".gz")
    185     openf = isgz ? x->CodecZlib.GzipCompressorStream(open(x, "w")) : x->open(x, "w")
    186     io = openf(filename)
    187     try
    188         for value in data
    189             JSON.json(io, value)
    190             write(io, '\n')
    191         end
    192     finally
    193         close(io)
    194     end
    195     return filename
    196 end
    197 # --------------------------------------------------------------------------------------------------
    198 
    199 
    200 # --------------------------------------------------------------------------------------------------
    201 """
    202     _dict_of_json(obj::AbstractDict) -> Dict{Symbol, Any}
    203 
    204 Recursively convert a parsed JSON dictionary into a `Dict` with `Symbol` keys.
    205 
    206 All string keys are converted to `Symbol` and nested dictionaries are converted recursively.
    207 Non-dict values are left unchanged.
    208 """
    209 function _dict_of_json(d::AbstractDict)
    210     result = Dict{Symbol, Any}()
    211     for (k, v) in d
    212         result[Symbol(k)] = v isa AbstractDict ? _dict_of_json(v) : v
    213     end
    214     return result
    215 end
    216 
    217 # Keep old name as deprecated alias
    218 function _dict_of_json3(d)
    219     Base.depwarn("`_dict_of_json3` is deprecated. Use `_dict_of_json` instead.", :_dict_of_json3)
    220     _dict_of_json(d)
    221 end
    222 # --------------------------------------------------------------------------------------------------