dt-cli-tools

CLI tools for viewing, filtering, and comparing tabular data files
Log | Files | Refs | README | LICENSE

read.rs (3283B)


      1 use criterion::{criterion_group, criterion_main, Criterion, BenchmarkId};
      2 use polars::prelude::*;
      3 use std::path::PathBuf;
      4 use tempfile::NamedTempFile;
      5 use std::io::Write;
      6 
      7 use dtcore::format::Format;
      8 use dtcore::reader::{ReadOptions, read_file};
      9 
     10 fn generate_csv(n: usize) -> NamedTempFile {
     11     let mut f = NamedTempFile::with_suffix(".csv").unwrap();
     12     writeln!(f, "id,name,region,value").unwrap();
     13     let regions = ["East", "West", "North", "South"];
     14     for i in 0..n {
     15         writeln!(f, "{},name_{},{},{}", i, i, regions[i % 4], i * 100).unwrap();
     16     }
     17     f.flush().unwrap();
     18     f
     19 }
     20 
     21 fn generate_parquet(n: usize) -> NamedTempFile {
     22     let ids: Vec<i64> = (0..n as i64).collect();
     23     let names: Vec<String> = (0..n).map(|i| format!("name_{}", i)).collect();
     24     let regions: Vec<&str> = (0..n).map(|i| ["East", "West", "North", "South"][i % 4]).collect();
     25     let values: Vec<i64> = (0..n).map(|i| i as i64 * 100).collect();
     26 
     27     let mut df = DataFrame::new(vec![
     28         Series::new("id".into(), &ids).into_column(),
     29         Series::new("name".into(), &names).into_column(),
     30         Series::new("region".into(), &regions).into_column(),
     31         Series::new("value".into(), &values).into_column(),
     32     ]).unwrap();
     33 
     34     let f = NamedTempFile::with_suffix(".parquet").unwrap();
     35     let file = std::fs::File::create(f.path()).unwrap();
     36     ParquetWriter::new(file).finish(&mut df).unwrap();
     37     f
     38 }
     39 
     40 fn generate_arrow(n: usize) -> NamedTempFile {
     41     let ids: Vec<i64> = (0..n as i64).collect();
     42     let values: Vec<i64> = (0..n).map(|i| i as i64 * 100).collect();
     43 
     44     let mut df = DataFrame::new(vec![
     45         Series::new("id".into(), &ids).into_column(),
     46         Series::new("value".into(), &values).into_column(),
     47     ]).unwrap();
     48 
     49     let f = NamedTempFile::with_suffix(".arrow").unwrap();
     50     let file = std::fs::File::create(f.path()).unwrap();
     51     IpcWriter::new(file).finish(&mut df).unwrap();
     52     f
     53 }
     54 
     55 fn generate_ndjson(n: usize) -> NamedTempFile {
     56     let mut f = NamedTempFile::with_suffix(".ndjson").unwrap();
     57     let regions = ["East", "West", "North", "South"];
     58     for i in 0..n {
     59         writeln!(f, r#"{{"id":{},"name":"name_{}","region":"{}","value":{}}}"#,
     60             i, i, regions[i % 4], i * 100).unwrap();
     61     }
     62     f.flush().unwrap();
     63     f
     64 }
     65 
     66 fn bench_read(c: &mut Criterion) {
     67     let opts = ReadOptions::default();
     68 
     69     for &size in &[1_000, 10_000, 100_000] {
     70         let csv = generate_csv(size);
     71         let parquet = generate_parquet(size);
     72         let arrow = generate_arrow(size);
     73         let ndjson = generate_ndjson(size);
     74 
     75         let mut group = c.benchmark_group(format!("read_{size}"));
     76 
     77         group.bench_function("csv", |b| {
     78             b.iter(|| read_file(csv.path(), Format::Csv, &opts).unwrap())
     79         });
     80 
     81         group.bench_function("parquet", |b| {
     82             b.iter(|| read_file(parquet.path(), Format::Parquet, &opts).unwrap())
     83         });
     84 
     85         group.bench_function("arrow", |b| {
     86             b.iter(|| read_file(arrow.path(), Format::Arrow, &opts).unwrap())
     87         });
     88 
     89         group.bench_function("ndjson", |b| {
     90             b.iter(|| read_file(ndjson.path(), Format::Ndjson, &opts).unwrap())
     91         });
     92 
     93         group.finish();
     94     }
     95 }
     96 
     97 criterion_group!(benches, bench_read);
     98 criterion_main!(benches);