read.rs (3283B)
1 use criterion::{criterion_group, criterion_main, Criterion, BenchmarkId}; 2 use polars::prelude::*; 3 use std::path::PathBuf; 4 use tempfile::NamedTempFile; 5 use std::io::Write; 6 7 use dtcore::format::Format; 8 use dtcore::reader::{ReadOptions, read_file}; 9 10 fn generate_csv(n: usize) -> NamedTempFile { 11 let mut f = NamedTempFile::with_suffix(".csv").unwrap(); 12 writeln!(f, "id,name,region,value").unwrap(); 13 let regions = ["East", "West", "North", "South"]; 14 for i in 0..n { 15 writeln!(f, "{},name_{},{},{}", i, i, regions[i % 4], i * 100).unwrap(); 16 } 17 f.flush().unwrap(); 18 f 19 } 20 21 fn generate_parquet(n: usize) -> NamedTempFile { 22 let ids: Vec<i64> = (0..n as i64).collect(); 23 let names: Vec<String> = (0..n).map(|i| format!("name_{}", i)).collect(); 24 let regions: Vec<&str> = (0..n).map(|i| ["East", "West", "North", "South"][i % 4]).collect(); 25 let values: Vec<i64> = (0..n).map(|i| i as i64 * 100).collect(); 26 27 let mut df = DataFrame::new(vec![ 28 Series::new("id".into(), &ids).into_column(), 29 Series::new("name".into(), &names).into_column(), 30 Series::new("region".into(), ®ions).into_column(), 31 Series::new("value".into(), &values).into_column(), 32 ]).unwrap(); 33 34 let f = NamedTempFile::with_suffix(".parquet").unwrap(); 35 let file = std::fs::File::create(f.path()).unwrap(); 36 ParquetWriter::new(file).finish(&mut df).unwrap(); 37 f 38 } 39 40 fn generate_arrow(n: usize) -> NamedTempFile { 41 let ids: Vec<i64> = (0..n as i64).collect(); 42 let values: Vec<i64> = (0..n).map(|i| i as i64 * 100).collect(); 43 44 let mut df = DataFrame::new(vec![ 45 Series::new("id".into(), &ids).into_column(), 46 Series::new("value".into(), &values).into_column(), 47 ]).unwrap(); 48 49 let f = NamedTempFile::with_suffix(".arrow").unwrap(); 50 let file = std::fs::File::create(f.path()).unwrap(); 51 IpcWriter::new(file).finish(&mut df).unwrap(); 52 f 53 } 54 55 fn generate_ndjson(n: usize) -> NamedTempFile { 56 let mut f = NamedTempFile::with_suffix(".ndjson").unwrap(); 57 let regions = ["East", "West", "North", "South"]; 58 for i in 0..n { 59 writeln!(f, r#"{{"id":{},"name":"name_{}","region":"{}","value":{}}}"#, 60 i, i, regions[i % 4], i * 100).unwrap(); 61 } 62 f.flush().unwrap(); 63 f 64 } 65 66 fn bench_read(c: &mut Criterion) { 67 let opts = ReadOptions::default(); 68 69 for &size in &[1_000, 10_000, 100_000] { 70 let csv = generate_csv(size); 71 let parquet = generate_parquet(size); 72 let arrow = generate_arrow(size); 73 let ndjson = generate_ndjson(size); 74 75 let mut group = c.benchmark_group(format!("read_{size}")); 76 77 group.bench_function("csv", |b| { 78 b.iter(|| read_file(csv.path(), Format::Csv, &opts).unwrap()) 79 }); 80 81 group.bench_function("parquet", |b| { 82 b.iter(|| read_file(parquet.path(), Format::Parquet, &opts).unwrap()) 83 }); 84 85 group.bench_function("arrow", |b| { 86 b.iter(|| read_file(arrow.path(), Format::Arrow, &opts).unwrap()) 87 }); 88 89 group.bench_function("ndjson", |b| { 90 b.iter(|| read_file(ndjson.path(), Format::Ndjson, &opts).unwrap()) 91 }); 92 93 group.finish(); 94 } 95 } 96 97 criterion_group!(benches, bench_read); 98 criterion_main!(benches);