dt-cli-tools

CLI tools for viewing, filtering, and comparing tabular data files
Log | Files | Refs | README | LICENSE

commit b2789b7aff3b7b6cb1c326fac1049494051df3ce
parent 0287d300c262d5078fdb930576a23e680c1c07dd
Author: Erik Loualiche <eloualic@umn.edu>
Date:   Sat,  4 Apr 2026 10:56:45 -0500

feat: add LICENSE, --all flag, tests, benchmarks, and v0.2.0 spec

- Add MIT LICENSE file
- Add --all flag to dtcat (override adaptive row limit)
- Expand integration tests: 18 → 65 (all formats covered)
- Add Criterion benchmarks for read, filter, and diff
- Add Cargo.toml metadata (homepage, repository, keywords)
- Add x86_64-apple-darwin install instructions to README
- Add v0.2.0 design spec (--sample, --convert)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

Diffstat:
M .gitignore | 1 +
M Cargo.toml | 17 +++++++++++++++++
A LICENSE | 21 +++++++++++++++++++++
M README.md | 7 +++++++
A benches/diff.rs | 47 +++++++++++++++++++++++++++++++++++++++++++++++
A benches/filter.rs | 62 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A benches/read.rs | 98 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A docs/superpowers/specs/2026-04-04-v0.2.0-sample-convert-design.md | 84 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M src/bin/dtcat.rs | 8 ++++++--
M tests/dtcat.rs | 174 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----
M tests/dtdiff.rs | 156 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--
M tests/dtfilter.rs | 191 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---
A tests/fixtures/data.arrow | 0
A tests/fixtures/data.csv | 4 ++++
A tests/fixtures/data.json | 2 ++
A tests/fixtures/data.ndjson | 3 +++
A tests/fixtures/data.parquet | 0
A tests/fixtures/new.arrow | 0
A tests/fixtures/new.csv | 4 ++++
A tests/fixtures/new.json | 2 ++
A tests/fixtures/new.ndjson | 3 +++
A tests/fixtures/new.parquet | 0
A tests/fixtures/old.arrow | 0
A tests/fixtures/old.csv | 4 ++++
A tests/fixtures/old.json | 2 ++
A tests/fixtures/old.ndjson | 3 +++
A tests/fixtures/old.parquet | 0
27 files changed, 872 insertions(+), 21 deletions(-)

diff --git a/.gitignore b/.gitignore @@ -1,2 +1,3 @@ /target Cargo.lock +.claude/ diff --git a/Cargo.toml b/Cargo.toml @@ -4,6 +4,10 @@ version = "0.1.0" edition = "2024" description = "CLI tools for viewing, filtering, and comparing tabular data files" license = "MIT" +homepage = "https://github.com/LouLouLibs/dt-cli-tools" +repository = "https://github.com/LouLouLibs/dt-cli-tools" +keywords = ["cli", "csv", "parquet", "data", "diff"] +categories = ["command-line-utilities"] [lib] name = "dtcore" @@ -45,3 +49,16 @@ opt-level = "z" assert_cmd = "2" predicates = "3" tempfile = "3" +criterion = { version = "0.5", features = ["html_reports"] } + +[[bench]] +name = "read" +harness = false + +[[bench]] +name = "filter" +harness = false + +[[bench]] +name = "diff" +harness = false diff --git a/LICENSE b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2026 Erik Loualiche + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
diff --git a/README.md b/README.md @@ -58,6 +58,13 @@ for tool in dtcat dtfilter dtdiff; do -o ~/.local/bin/$tool done chmod +x ~/.local/bin/dt{cat,filter,diff} + +# Intel Mac (macOS) +for tool in dtcat dtfilter dtdiff; do + curl -L "https://github.com/LouLouLibs/dt-cli-tools/releases/latest/download/${tool}-x86_64-apple-darwin" \ + -o ~/.local/bin/$tool +done +chmod +x ~/.local/bin/dt{cat,filter,diff} ``` ### From source diff --git a/benches/diff.rs b/benches/diff.rs @@ -0,0 +1,47 @@ +use criterion::{criterion_group, criterion_main, Criterion}; +use polars::prelude::*; + +use dtcore::diff::{DiffOptions, SheetSource, diff_positional, diff_keyed}; + +fn source(name: &str) -> SheetSource { + SheetSource { file_name: name.into(), sheet_name: "Sheet1".into() } +} + +fn make_df(n: usize, offset: i64) -> DataFrame { + let ids: Vec<i64> = (offset..offset + n as i64).collect(); + let names: Vec<String> = ids.iter().map(|i| format!("name_{}", i)).collect(); + let values: Vec<i64> = ids.iter().map(|i| i * 100).collect(); + + DataFrame::new(vec![ + Series::new("id".into(), &ids).into_column(), + Series::new("name".into(), &names).into_column(), + Series::new("value".into(), &values).into_column(), + ]).unwrap() +} + +fn bench_diff(c: &mut Criterion) { + let opts_positional = DiffOptions::default(); + let opts_keyed = DiffOptions { key_columns: vec!["id".into()], tolerance: None }; + + for &size in &[1_000, 10_000, 100_000] { + let df_a = make_df(size, 0); + // 10% of rows differ (shifted by 10% of size) + let shift = (size / 10) as i64; + let df_b = make_df(size, shift); + + let mut group = c.benchmark_group(format!("diff_{size}")); + + group.bench_function("positional", |b| { + b.iter(|| diff_positional(&df_a, &df_b, &opts_positional, source("a"), source("b")).unwrap()) + }); + + group.bench_function("keyed", |b| { + b.iter(|| diff_keyed(&df_a, &df_b, &opts_keyed, source("a"), source("b")).unwrap()) + }); + + group.finish(); + } +} + +criterion_group!(benches, bench_diff); 
+criterion_main!(benches); diff --git a/benches/filter.rs b/benches/filter.rs @@ -0,0 +1,62 @@ +use criterion::{criterion_group, criterion_main, Criterion}; +use polars::prelude::*; + +use dtcore::filter::{FilterExpr, FilterOp, FilterOptions, SortSpec, apply_filters, filter_pipeline}; + +fn make_df(n: usize) -> DataFrame { + let ids: Vec<i64> = (0..n as i64).collect(); + let regions: Vec<&str> = (0..n).map(|i| ["East", "West", "North", "South"][i % 4]).collect(); + let values: Vec<i64> = (0..n).map(|i| i as i64 * 100).collect(); + let names: Vec<String> = (0..n).map(|i| format!("name_{}", i)).collect(); + + DataFrame::new(vec![ + Series::new("id".into(), &ids).into_column(), + Series::new("region".into(), &regions).into_column(), + Series::new("value".into(), &values).into_column(), + Series::new("name".into(), &names).into_column(), + ]).unwrap() +} + +fn bench_filter(c: &mut Criterion) { + for &size in &[1_000, 10_000, 100_000] { + let df = make_df(size); + + let mut group = c.benchmark_group(format!("filter_{size}")); + + // Equality filter + let eq_expr = vec![FilterExpr { column: "region".into(), op: FilterOp::Eq, value: "East".into() }]; + group.bench_function("eq", |b| { + b.iter(|| apply_filters(&df, &eq_expr).unwrap()) + }); + + // Numeric comparison + let gt_expr = vec![FilterExpr { column: "value".into(), op: FilterOp::Gt, value: (size as i64 * 50).to_string() }]; + group.bench_function("gt", |b| { + b.iter(|| apply_filters(&df, &gt_expr).unwrap()) + }); + + // Contains (string scan) + let contains_expr = vec![FilterExpr { column: "name".into(), op: FilterOp::Contains, value: "42".into() }]; + group.bench_function("contains", |b| { + b.iter(|| apply_filters(&df, &contains_expr).unwrap()) + }); + + // Full pipeline: filter + sort + limit + let pipeline_opts = FilterOptions { + filters: vec![FilterExpr { column: "region".into(), op: FilterOp::Eq, value: "East".into() }], + sort: Some(SortSpec { column: "value".into(), descending: true }), + limit: 
Some(10), + cols: None, + head: None, + tail: None, + }; + group.bench_function("pipeline", |b| { + b.iter(|| filter_pipeline(df.clone(), &pipeline_opts).unwrap()) + }); + + group.finish(); + } +} + +criterion_group!(benches, bench_filter); +criterion_main!(benches); diff --git a/benches/read.rs b/benches/read.rs @@ -0,0 +1,98 @@ +use criterion::{criterion_group, criterion_main, Criterion, BenchmarkId}; +use polars::prelude::*; +use std::path::PathBuf; +use tempfile::NamedTempFile; +use std::io::Write; + +use dtcore::format::Format; +use dtcore::reader::{ReadOptions, read_file}; + +fn generate_csv(n: usize) -> NamedTempFile { + let mut f = NamedTempFile::with_suffix(".csv").unwrap(); + writeln!(f, "id,name,region,value").unwrap(); + let regions = ["East", "West", "North", "South"]; + for i in 0..n { + writeln!(f, "{},name_{},{},{}", i, i, regions[i % 4], i * 100).unwrap(); + } + f.flush().unwrap(); + f +} + +fn generate_parquet(n: usize) -> NamedTempFile { + let ids: Vec<i64> = (0..n as i64).collect(); + let names: Vec<String> = (0..n).map(|i| format!("name_{}", i)).collect(); + let regions: Vec<&str> = (0..n).map(|i| ["East", "West", "North", "South"][i % 4]).collect(); + let values: Vec<i64> = (0..n).map(|i| i as i64 * 100).collect(); + + let mut df = DataFrame::new(vec![ + Series::new("id".into(), &ids).into_column(), + Series::new("name".into(), &names).into_column(), + Series::new("region".into(), &regions).into_column(), + Series::new("value".into(), &values).into_column(), + ]).unwrap(); + + let f = NamedTempFile::with_suffix(".parquet").unwrap(); + let file = std::fs::File::create(f.path()).unwrap(); + ParquetWriter::new(file).finish(&mut df).unwrap(); + f +} + +fn generate_arrow(n: usize) -> NamedTempFile { + let ids: Vec<i64> = (0..n as i64).collect(); + let values: Vec<i64> = (0..n).map(|i| i as i64 * 100).collect(); + + let mut df = DataFrame::new(vec![ + Series::new("id".into(), &ids).into_column(), + Series::new("value".into(), &values).into_column(), 
+ ]).unwrap(); + + let f = NamedTempFile::with_suffix(".arrow").unwrap(); + let file = std::fs::File::create(f.path()).unwrap(); + IpcWriter::new(file).finish(&mut df).unwrap(); + f +} + +fn generate_ndjson(n: usize) -> NamedTempFile { + let mut f = NamedTempFile::with_suffix(".ndjson").unwrap(); + let regions = ["East", "West", "North", "South"]; + for i in 0..n { + writeln!(f, r#"{{"id":{},"name":"name_{}","region":"{}","value":{}}}"#, + i, i, regions[i % 4], i * 100).unwrap(); + } + f.flush().unwrap(); + f +} + +fn bench_read(c: &mut Criterion) { + let opts = ReadOptions::default(); + + for &size in &[1_000, 10_000, 100_000] { + let csv = generate_csv(size); + let parquet = generate_parquet(size); + let arrow = generate_arrow(size); + let ndjson = generate_ndjson(size); + + let mut group = c.benchmark_group(format!("read_{size}")); + + group.bench_function("csv", |b| { + b.iter(|| read_file(csv.path(), Format::Csv, &opts).unwrap()) + }); + + group.bench_function("parquet", |b| { + b.iter(|| read_file(parquet.path(), Format::Parquet, &opts).unwrap()) + }); + + group.bench_function("arrow", |b| { + b.iter(|| read_file(arrow.path(), Format::Arrow, &opts).unwrap()) + }); + + group.bench_function("ndjson", |b| { + b.iter(|| read_file(ndjson.path(), Format::Ndjson, &opts).unwrap()) + }); + + group.finish(); + } +} + +criterion_group!(benches, bench_read); +criterion_main!(benches); diff --git a/docs/superpowers/specs/2026-04-04-v0.2.0-sample-convert-design.md b/docs/superpowers/specs/2026-04-04-v0.2.0-sample-convert-design.md @@ -0,0 +1,84 @@ +# dt-cli-tools v0.2.0 — sample and convert + +## Summary + +Add two flags to dtcat: `--sample N` for random row sampling and `--convert FORMAT` with `-o PATH` for format conversion. No changes to dtfilter or dtdiff. + +## `--sample N` + +Randomly select N rows from the DataFrame after reading. 
+ +- Mutually exclusive with `--head`, `--tail`, `--all` +- Works with `--csv` output and default markdown +- Works with `--skip` and `--sheet` (applied before sampling) +- Mutually exclusive with `--schema`, `--describe`, `--info` +- Non-deterministic (no seed flag) +- If N >= row count, return all rows (no error) + +### Examples + +```bash +dtcat huge.parquet --sample 20 +dtcat huge.parquet --sample 50 --csv +dtcat report.xlsx --sheet Data --sample 10 +``` + +## `--convert FORMAT` with `-o PATH` + +Read any supported format, write to a different format. + +- `--convert FORMAT` — target format: csv, tsv, parquet, arrow, json, ndjson +- `-o PATH` — output file path +- For text formats (csv, tsv, json, ndjson): if `-o` omitted, write to stdout +- For binary formats (parquet, arrow): `-o` is required; error if missing +- Mutually exclusive with all display flags (`--schema`, `--describe`, `--info`, `--csv`, `--head`, `--tail`, `--all`, `--sample`) +- Works with `--skip` and `--sheet` (select data before converting) + +### Examples + +```bash +dtcat data.csv --convert parquet -o data.parquet +dtcat report.xlsx --sheet Revenue --convert csv -o revenue.csv +dtcat data.json --convert arrow -o data.arrow +dtcat data.parquet --convert ndjson # stdout +``` + +## Architecture + +Both features are additions to `src/bin/dtcat.rs` (new CLI args) and `dtcore` (writers). 
+ +### New library code + +- `src/writers/` module with: `csv.rs`, `parquet.rs`, `arrow.rs`, `json.rs` +- Each writer takes a `&mut DataFrame` and a `Write` or file path, returns `Result<()>` +- `src/lib.rs` exports new `writers` module + +### Sampling + +- Use Polars `DataFrame::sample_n` or equivalent +- Implemented in dtcat binary after read, before display/convert + +## Testing + +### `--sample N` +- Verify output has exactly N rows (when N < total) +- Verify N >= total returns all rows +- Verify mutual exclusivity with `--head`/`--tail`/`--all` +- Works on CSV, Parquet, Excel + +### `--convert FORMAT` +- Roundtrip: CSV → Parquet → CSV, verify data matches +- All 6 target formats produce valid output +- Text formats work without `-o` (stdout) +- Binary formats error without `-o` +- `--skip` and `--sheet` apply before conversion +- Mutual exclusivity with display flags + +### Existing tests +- All 187+ existing tests still pass + +## Not in scope + +- No changes to dtfilter or dtdiff +- No new binaries +- No dtset, dtvalidate, or dtjoin diff --git a/src/bin/dtcat.rs b/src/bin/dtcat.rs @@ -59,6 +59,10 @@ struct Args { #[arg(long)] csv: bool, + /// Show all rows (override adaptive row limit) + #[arg(long)] + all: bool, + /// Show file metadata only #[arg(long)] info: bool, @@ -239,8 +243,8 @@ fn run(args: Args) -> Result<()> { format_data_table(&sliced) } (None, None) => { - // Default: show all if <= threshold, otherwise head+tail - if df.height() <= DEFAULT_THRESHOLD { + // Default: show all if <= threshold or --all, otherwise head+tail + if args.all || df.height() <= DEFAULT_THRESHOLD { format_data_table(&df) } else { format_head_tail(&df, DEFAULT_HEAD_TAIL, DEFAULT_HEAD_TAIL) diff --git a/tests/dtcat.rs b/tests/dtcat.rs @@ -14,6 +14,8 @@ fn csv_file(content: &str) -> NamedTempFile { f } +// ─── Basic viewing ─── + #[test] fn shows_csv_data() { let f = csv_file("name,value\nAlice,100\nBob,200\n"); @@ -23,6 +25,20 @@ fn shows_csv_data() { } #[test] +fn 
header_only_csv() { + let f = csv_file("name,value\n"); + dtcat().arg(f.path()).assert().success() + .stdout(predicate::str::contains("no data rows")); +} + +#[test] +fn nonexistent_file_exits_1() { + dtcat().arg("/tmp/does_not_exist_12345.csv").assert().failure(); +} + +// ─── Modes ─── + +#[test] fn schema_flag() { let f = csv_file("name,value\nAlice,100\n"); dtcat().arg(f.path()).arg("--schema").assert().success() @@ -31,23 +47,61 @@ fn schema_flag() { } #[test] -fn csv_output_flag() { +fn describe_flag() { + let f = csv_file("name,value\nAlice,100\nBob,200\n"); + dtcat().arg(f.path()).arg("--describe").assert().success() + .stdout(predicate::str::contains("count")) + .stdout(predicate::str::contains("mean")); +} + +#[test] +fn info_flag() { let f = csv_file("name,value\nAlice,100\n"); - dtcat().arg(f.path()).arg("--csv").assert().success() - .stdout(predicate::str::contains("name,value")); + dtcat().arg(f.path()).arg("--info").assert().success() + .stdout(predicate::str::contains("File:")); } +// ─── Row windowing ─── + #[test] fn head_flag() { let f = csv_file("x\n1\n2\n3\n4\n5\n"); - dtcat().arg(f.path()).arg("--head").arg("2").assert().success(); + dtcat().arg(f.path()).arg("--head").arg("2").assert().success() + .stdout(predicate::str::contains("1")) + .stdout(predicate::str::contains("2")) + .stdout(predicate::str::contains("3").not()); } #[test] -fn nonexistent_file_exits_1() { - dtcat().arg("/tmp/does_not_exist_12345.csv").assert().failure(); +fn tail_flag() { + let f = csv_file("x\n1\n2\n3\n4\n5\n"); + dtcat().arg(f.path()).arg("--tail").arg("2").assert().success() + .stdout(predicate::str::contains("4")) + .stdout(predicate::str::contains("5")); +} + +#[test] +fn head_and_tail_combined() { + let f = csv_file("x\n1\n2\n3\n4\n5\n6\n7\n8\n9\n10\n"); + dtcat().arg(f.path()).arg("--head").arg("2").arg("--tail").arg("2") + .assert().success() + .stdout(predicate::str::contains("1")) + .stdout(predicate::str::contains("2")) + 
.stdout(predicate::str::contains("9")) + .stdout(predicate::str::contains("10")); } +// ─── Output format ─── + +#[test] +fn csv_output_flag() { + let f = csv_file("name,value\nAlice,100\n"); + dtcat().arg(f.path()).arg("--csv").assert().success() + .stdout(predicate::str::contains("name,value")); +} + +// ─── Format detection ─── + #[test] fn format_override() { let mut f = NamedTempFile::with_suffix(".txt").unwrap(); @@ -58,9 +112,107 @@ fn format_override() { } #[test] -fn describe_flag() { - let f = csv_file("name,value\nAlice,100\nBob,200\n"); - dtcat().arg(f.path()).arg("--describe").assert().success() - .stdout(predicate::str::contains("count")) - .stdout(predicate::str::contains("mean")); +fn tsv_detection() { + let mut f = NamedTempFile::with_suffix(".tsv").unwrap(); + write!(f, "name\tvalue\nAlice\t100\n").unwrap(); + f.flush().unwrap(); + dtcat().arg(f.path()).assert().success() + .stdout(predicate::str::contains("Alice")) + .stdout(predicate::str::contains("100")); +} + +// ─── Skip rows ─── + +#[test] +fn skip_metadata_rows() { + let f = csv_file("meta1\nmeta2\nname,value\nAlice,100\n"); + dtcat().arg(f.path()).arg("--skip").arg("2").assert().success() + .stdout(predicate::str::contains("Alice")); +} + +// ─── All flag ─── + +#[test] +fn all_flag_shows_every_row() { + // 60 rows > threshold of 50, so without --all we'd get head+tail + let mut content = String::from("x\n"); + for i in 1..=60 { + content.push_str(&format!("{}\n", i)); + } + let f = csv_file(&content); + // With --all, row 30 should appear (it would be omitted in head25+tail25) + dtcat().arg(f.path()).arg("--all").assert().success() + .stdout(predicate::str::contains("| 30 ")); +} + +// ─── Parquet ─── + +#[test] +fn parquet_view() { + dtcat().arg("tests/fixtures/data.parquet").assert().success() + .stdout(predicate::str::contains("Alice")) + .stdout(predicate::str::contains("Charlie")); +} + +#[test] +fn parquet_schema() { + 
dtcat().arg("tests/fixtures/data.parquet").arg("--schema").assert().success() + .stdout(predicate::str::contains("name")) + .stdout(predicate::str::contains("value")); +} + +// ─── Arrow/IPC ─── + +#[test] +fn arrow_view() { + dtcat().arg("tests/fixtures/data.arrow").assert().success() + .stdout(predicate::str::contains("Alice")) + .stdout(predicate::str::contains("Charlie")); +} + +#[test] +fn arrow_schema() { + dtcat().arg("tests/fixtures/data.arrow").arg("--schema").assert().success() + .stdout(predicate::str::contains("name")) + .stdout(predicate::str::contains("value")); +} + +// ─── JSON ─── + +#[test] +fn json_view() { + dtcat().arg("tests/fixtures/data.json").assert().success() + .stdout(predicate::str::contains("Alice")) + .stdout(predicate::str::contains("Charlie")); +} + +// ─── NDJSON ─── + +#[test] +fn ndjson_view() { + dtcat().arg("tests/fixtures/data.ndjson").assert().success() + .stdout(predicate::str::contains("Alice")) + .stdout(predicate::str::contains("Charlie")); +} + +// ─── Excel ─── + +#[test] +fn excel_view() { + dtcat().arg("demo/sales.xlsx").assert().success() + .stdout(predicate::str::contains("Revenue")); +} + +#[test] +fn excel_schema() { + dtcat().arg("demo/sales.xlsx").arg("--schema").assert().success() + .stdout(predicate::str::contains("Column")) + .stdout(predicate::str::contains("Revenue")); +} + +#[test] +fn excel_info() { + dtcat().arg("demo/sales.xlsx").arg("--info").assert().success() + .stdout(predicate::str::contains("Excel")) + .stdout(predicate::str::contains("Sheet1")); } diff --git a/tests/dtdiff.rs b/tests/dtdiff.rs @@ -14,6 +14,8 @@ fn csv_file(content: &str) -> NamedTempFile { f } +// ─── Positional mode ─── + #[test] fn no_diff_exits_0() { let a = csv_file("name,value\nAlice,100\n"); @@ -23,20 +25,142 @@ fn no_diff_exits_0() { } #[test] -fn diff_exits_1() { +fn positional_diff_exits_1() { let a = csv_file("name,value\nAlice,100\n"); let b = csv_file("name,value\nBob,200\n"); 
dtdiff().arg(a.path()).arg(b.path()).assert().code(1); } #[test] -fn keyed_diff() { +fn positional_added_row() { + let a = csv_file("name,value\nAlice,100\n"); + let b = csv_file("name,value\nAlice,100\nBob,200\n"); + dtdiff().arg(a.path()).arg(b.path()).assert().code(1) + .stdout(predicate::str::contains("Added: 1")); +} + +#[test] +fn positional_removed_row() { + let a = csv_file("name,value\nAlice,100\nBob,200\n"); + let b = csv_file("name,value\nAlice,100\n"); + dtdiff().arg(a.path()).arg(b.path()).assert().code(1) + .stdout(predicate::str::contains("Removed: 1")); +} + +// ─── Key-based mode ─── + +#[test] +fn keyed_diff_modified() { let a = csv_file("id,name\n1,Alice\n2,Bob\n"); let b = csv_file("id,name\n1,Alice\n2,Robert\n"); dtdiff().arg(a.path()).arg(b.path()).arg("--key").arg("id") - .assert().code(1); + .assert().code(1) + .stdout(predicate::str::contains("Modified: 1")) + .stdout(predicate::str::contains("Bob")); +} + +#[test] +fn keyed_diff_added_and_removed() { + let a = csv_file("id,name\n1,Alice\n2,Bob\n"); + let b = csv_file("id,name\n1,Alice\n3,Charlie\n"); + dtdiff().arg(a.path()).arg(b.path()).arg("--key").arg("id") + .assert().code(1) + .stdout(predicate::str::contains("Added: 1")) + .stdout(predicate::str::contains("Removed: 1")); +} + +#[test] +fn keyed_no_diff() { + let a = csv_file("id,name\n1,Alice\n2,Bob\n"); + let b = csv_file("id,name\n2,Bob\n1,Alice\n"); + dtdiff().arg(a.path()).arg(b.path()).arg("--key").arg("id") + .assert().success() + .stdout(predicate::str::contains("No differences")); +} + +// ─── Composite keys ─── + +#[test] +fn composite_key() { + let a = csv_file("date,ticker,price\n2024-01-01,AAPL,150\n2024-01-01,GOOG,140\n"); + let b = csv_file("date,ticker,price\n2024-01-01,AAPL,150\n2024-01-01,GOOG,145\n"); + dtdiff().arg(a.path()).arg(b.path()).arg("--key").arg("date,ticker") + .assert().code(1) + .stdout(predicate::str::contains("Modified: 1")) + .stdout(predicate::str::contains("GOOG")); +} + +// ─── Float tolerance 
─── + +#[test] +fn tolerance_suppresses_small_diff() { + let a = csv_file("id,price\n1,150.000\n"); + let b = csv_file("id,price\n1,150.005\n"); + dtdiff().arg(a.path()).arg(b.path()).arg("--key").arg("id").arg("--tolerance").arg("0.01") + .assert().success() + .stdout(predicate::str::contains("No differences")); +} + +#[test] +fn tolerance_reports_large_diff() { + let a = csv_file("id,price\n1,150.0\n"); + let b = csv_file("id,price\n1,155.0\n"); + dtdiff().arg(a.path()).arg(b.path()).arg("--key").arg("id").arg("--tolerance").arg("0.01") + .assert().code(1) + .stdout(predicate::str::contains("Modified: 1")); +} + +// ─── Parquet ─── + +#[test] +fn parquet_keyed_diff() { + dtdiff().arg("tests/fixtures/old.parquet").arg("tests/fixtures/new.parquet") + .arg("--key").arg("id") + .assert().code(1) + .stdout(predicate::str::contains("Added: 1")) + .stdout(predicate::str::contains("Removed: 1")); +} + +#[test] +fn parquet_no_diff() { + dtdiff().arg("tests/fixtures/data.parquet").arg("tests/fixtures/data.parquet") + .assert().success() + .stdout(predicate::str::contains("No differences")); +} + +// ─── Arrow/IPC ─── + +#[test] +fn arrow_keyed_diff() { + dtdiff().arg("tests/fixtures/old.arrow").arg("tests/fixtures/new.arrow") + .arg("--key").arg("id") + .assert().code(1) + .stdout(predicate::str::contains("Added: 1")) + .stdout(predicate::str::contains("Removed: 1")); +} + +// ─── JSON ─── + +#[test] +fn json_keyed_diff() { + dtdiff().arg("tests/fixtures/old.json").arg("tests/fixtures/new.json") + .arg("--key").arg("id") + .assert().code(1) + .stdout(predicate::str::contains("Modified: 1")); } +// ─── NDJSON ─── + +#[test] +fn ndjson_keyed_diff() { + dtdiff().arg("tests/fixtures/old.ndjson").arg("tests/fixtures/new.ndjson") + .arg("--key").arg("id") + .assert().code(1) + .stdout(predicate::str::contains("Modified: 1")); +} + +// ─── Output formats ─── + #[test] fn json_output() { let a = csv_file("id,val\n1,a\n"); @@ -54,3 +178,29 @@ fn csv_output() { .assert().code(1) 
.stdout(predicate::str::contains("_status")); } + +#[test] +fn no_color_flag() { + let a = csv_file("name,value\nAlice,100\n"); + let b = csv_file("name,value\nBob,200\n"); + dtdiff().arg(a.path()).arg(b.path()).arg("--no-color") + .assert().code(1); +} + +// ─── Excel ─── + +#[test] +fn excel_keyed_diff() { + dtdiff().arg("demo/old.xlsx").arg("demo/new.xlsx").arg("--key").arg("ID") + .assert().code(1) + .stdout(predicate::str::contains("Added: 1")) + .stdout(predicate::str::contains("Removed: 1")) + .stdout(predicate::str::contains("Modified: 3")); +} + +#[test] +fn excel_no_diff() { + dtdiff().arg("demo/old.xlsx").arg("demo/old.xlsx") + .assert().success() + .stdout(predicate::str::contains("No differences")); +} diff --git a/tests/dtfilter.rs b/tests/dtfilter.rs @@ -14,29 +14,124 @@ fn csv_file(content: &str) -> NamedTempFile { f } +const DATA: &str = "name,value\nAlice,100\nBob,200\nCharlie,300\n"; + +// ─── Equality ─── + #[test] fn filter_eq() { - let f = csv_file("name,value\nAlice,100\nBob,200\n"); + let f = csv_file(DATA); dtfilter().arg(f.path()).arg("--filter").arg("name=Alice").assert().success() .stdout(predicate::str::contains("Alice")) .stdout(predicate::str::contains("Bob").not()); } #[test] +fn filter_neq() { + let f = csv_file(DATA); + dtfilter().arg(f.path()).arg("--filter").arg("name!=Alice").assert().success() + .stdout(predicate::str::contains("Bob")) + .stdout(predicate::str::contains("Charlie")) + .stdout(predicate::str::contains("Alice").not()); +} + +// ─── Numeric comparisons ─── + +#[test] fn filter_gt() { - let f = csv_file("name,value\nAlice,100\nBob,200\nCharlie,300\n"); + let f = csv_file(DATA); dtfilter().arg(f.path()).arg("--filter").arg("value>150").assert().success() .stdout(predicate::str::contains("Bob")) - .stdout(predicate::str::contains("Charlie")); + .stdout(predicate::str::contains("Charlie")) + .stdout(predicate::str::contains("Alice").not()); +} + +#[test] +fn filter_lt() { + let f = csv_file(DATA); + 
dtfilter().arg(f.path()).arg("--filter").arg("value<200").assert().success() + .stdout(predicate::str::contains("Alice")) + .stdout(predicate::str::contains("Bob").not()); +} + +#[test] +fn filter_gte() { + let f = csv_file(DATA); + dtfilter().arg(f.path()).arg("--filter").arg("value>=200").assert().success() + .stdout(predicate::str::contains("Bob")) + .stdout(predicate::str::contains("Charlie")) + .stdout(predicate::str::contains("Alice").not()); +} + +#[test] +fn filter_lte() { + let f = csv_file(DATA); + dtfilter().arg(f.path()).arg("--filter").arg("value<=200").assert().success() + .stdout(predicate::str::contains("Alice")) + .stdout(predicate::str::contains("Bob")) + .stdout(predicate::str::contains("Charlie").not()); +} + +// ─── String matching ─── + +#[test] +fn filter_contains() { + let f = csv_file(DATA); + dtfilter().arg(f.path()).arg("--filter").arg("name~ob").assert().success() + .stdout(predicate::str::contains("Bob")) + .stdout(predicate::str::contains("Alice").not()); } #[test] +fn filter_not_contains() { + let f = csv_file(DATA); + dtfilter().arg(f.path()).arg("--filter").arg("name!~ob").assert().success() + .stdout(predicate::str::contains("Alice")) + .stdout(predicate::str::contains("Charlie")) + .stdout(predicate::str::contains("Bob").not()); +} + +// ─── Multiple filters (AND) ─── + +#[test] +fn multiple_filters_and() { + let f = csv_file(DATA); + dtfilter().arg(f.path()) + .arg("--filter").arg("value>=200") + .arg("--filter").arg("value<=300") + .assert().success() + .stdout(predicate::str::contains("Bob")) + .stdout(predicate::str::contains("Charlie")) + .stdout(predicate::str::contains("Alice").not()); +} + +// ─── Sort ─── + +#[test] fn sort_desc() { - let f = csv_file("name,value\nAlice,100\nBob,200\n"); - dtfilter().arg(f.path()).arg("--sort").arg("value:desc").assert().success(); + let f = csv_file(DATA); + let out = dtfilter().arg(f.path()).arg("--sort").arg("value:desc") + .assert().success(); + let stdout = 
String::from_utf8(out.get_output().stdout.clone()).unwrap(); + let charlie_pos = stdout.find("Charlie").unwrap(); + let alice_pos = stdout.find("Alice").unwrap(); + assert!(charlie_pos < alice_pos, "Charlie (300) should appear before Alice (100) in desc sort"); } #[test] +fn sort_asc() { + let f = csv_file(DATA); + let out = dtfilter().arg(f.path()).arg("--sort").arg("value") + .assert().success(); + let stdout = String::from_utf8(out.get_output().stdout.clone()).unwrap(); + let alice_pos = stdout.find("Alice").unwrap(); + let charlie_pos = stdout.find("Charlie").unwrap(); + assert!(alice_pos < charlie_pos, "Alice (100) should appear before Charlie (300) in asc sort"); +} + +// ─── Column selection ─── + +#[test] fn columns_select() { let f = csv_file("name,value,extra\nAlice,100,x\n"); dtfilter().arg(f.path()).arg("--columns").arg("name,value").assert().success() @@ -44,6 +139,19 @@ fn columns_select() { .stdout(predicate::str::contains("extra").not()); } +// ─── Limit ─── + +#[test] +fn limit_output() { + let f = csv_file(DATA); + dtfilter().arg(f.path()).arg("--sort").arg("value:desc").arg("--limit").arg("1") + .assert().success() + .stdout(predicate::str::contains("Charlie")) + .stdout(predicate::str::contains("Alice").not()); +} + +// ─── Output format ─── + #[test] fn csv_output() { let f = csv_file("name,value\nAlice,100\n"); @@ -51,9 +159,82 @@ fn csv_output() { .stdout(predicate::str::contains("name,value")); } +// ─── Windowing ─── + +#[test] +fn head_before_filter() { + let f = csv_file("name,value\nAlice,100\nBob,200\nCharlie,300\n"); + dtfilter().arg(f.path()).arg("--head").arg("2").arg("--filter").arg("value>150") + .assert().success() + .stdout(predicate::str::contains("Bob")) + .stdout(predicate::str::contains("Charlie").not()); +} + #[test] fn head_tail_exclusive() { let f = csv_file("x\n1\n2\n"); dtfilter().arg(f.path()).arg("--head").arg("1").arg("--tail").arg("1") .assert().code(2); } + +// ─── Excel ─── + +#[test] +fn filter_excel() { + 
dtfilter().arg("demo/sales.xlsx").arg("--filter").arg("Region=East") + .assert().success() + .stdout(predicate::str::contains("East")) + .stdout(predicate::str::contains("West").not()); +} + +// ─── Parquet ─── + +#[test] +fn filter_parquet() { + dtfilter().arg("tests/fixtures/data.parquet").arg("--filter").arg("value>150") + .assert().success() + .stdout(predicate::str::contains("Bob")) + .stdout(predicate::str::contains("Charlie")) + .stdout(predicate::str::contains("Alice").not()); +} + +// ─── Arrow/IPC ─── + +#[test] +fn filter_arrow() { + dtfilter().arg("tests/fixtures/data.arrow").arg("--filter").arg("value>150") + .assert().success() + .stdout(predicate::str::contains("Bob")) + .stdout(predicate::str::contains("Charlie")) + .stdout(predicate::str::contains("Alice").not()); +} + +// ─── JSON ─── + +#[test] +fn filter_json() { + dtfilter().arg("tests/fixtures/data.json").arg("--filter").arg("name=Alice") + .assert().success() + .stdout(predicate::str::contains("Alice")) + .stdout(predicate::str::contains("Bob").not()); +} + +// ─── NDJSON ─── + +#[test] +fn filter_ndjson() { + dtfilter().arg("tests/fixtures/data.ndjson").arg("--filter").arg("name=Alice") + .assert().success() + .stdout(predicate::str::contains("Alice")) + .stdout(predicate::str::contains("Bob").not()); +} + +// ─── Edge cases ─── + +#[test] +fn filter_no_matches() { + let f = csv_file(DATA); + dtfilter().arg(f.path()).arg("--filter").arg("name=Nobody") + .assert().success() + .stderr(predicate::str::contains("0 rows")); +} diff --git a/tests/fixtures/data.arrow b/tests/fixtures/data.arrow Binary files differ. 
diff --git a/tests/fixtures/data.csv b/tests/fixtures/data.csv @@ -0,0 +1,4 @@ +name,value +Alice,100 +Bob,200 +Charlie,300 diff --git a/tests/fixtures/data.json b/tests/fixtures/data.json @@ -0,0 +1 @@ +[{"name":"Alice","value":100},{"name":"Bob","value":200},{"name":"Charlie","value":300}]+ \ No newline at end of file diff --git a/tests/fixtures/data.ndjson b/tests/fixtures/data.ndjson @@ -0,0 +1,3 @@ +{"name":"Alice","value":100} +{"name":"Bob","value":200} +{"name":"Charlie","value":300} diff --git a/tests/fixtures/data.parquet b/tests/fixtures/data.parquet Binary files differ. diff --git a/tests/fixtures/new.arrow b/tests/fixtures/new.arrow Binary files differ. diff --git a/tests/fixtures/new.csv b/tests/fixtures/new.csv @@ -0,0 +1,4 @@ +id,name +1,Alice +2,Robert +4,Diana diff --git a/tests/fixtures/new.json b/tests/fixtures/new.json @@ -0,0 +1 @@ +[{"id": 1, "name": "Alice"}, {"id": 2, "name": "Robert"}, {"id": 4, "name": "Diana"}]+ \ No newline at end of file diff --git a/tests/fixtures/new.ndjson b/tests/fixtures/new.ndjson @@ -0,0 +1,3 @@ +{"id":1,"name":"Alice"} +{"id":2,"name":"Robert"} +{"id":4,"name":"Diana"} diff --git a/tests/fixtures/new.parquet b/tests/fixtures/new.parquet Binary files differ. diff --git a/tests/fixtures/old.arrow b/tests/fixtures/old.arrow Binary files differ. diff --git a/tests/fixtures/old.csv b/tests/fixtures/old.csv @@ -0,0 +1,4 @@ +id,name +1,Alice +2,Bob +3,Charlie diff --git a/tests/fixtures/old.json b/tests/fixtures/old.json @@ -0,0 +1 @@ +[{"id": 1, "name": "Alice"}, {"id": 2, "name": "Bob"}, {"id": 3, "name": "Charlie"}]+ \ No newline at end of file diff --git a/tests/fixtures/old.ndjson b/tests/fixtures/old.ndjson @@ -0,0 +1,3 @@ +{"id":1,"name":"Alice"} +{"id":2,"name":"Bob"} +{"id":3,"name":"Charlie"} diff --git a/tests/fixtures/old.parquet b/tests/fixtures/old.parquet Binary files differ.