commit b2789b7aff3b7b6cb1c326fac1049494051df3ce
parent 0287d300c262d5078fdb930576a23e680c1c07dd
Author: Erik Loualiche <eloualic@umn.edu>
Date: Sat, 4 Apr 2026 10:56:45 -0500
feat: add LICENSE, --all flag, tests, benchmarks, and v0.2.0 spec
- Add MIT LICENSE file
- Add --all flag to dtcat (override adaptive row limit)
- Expand integration tests: 18 → 65 (all formats covered)
- Add Criterion benchmarks for read, filter, and diff
- Add Cargo.toml metadata (homepage, repository, keywords)
- Add x86_64-apple-darwin install instructions to README
- Add v0.2.0 design spec (--sample, --convert)
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Diffstat:
27 files changed, 872 insertions(+), 21 deletions(-)
diff --git a/.gitignore b/.gitignore
@@ -1,2 +1,3 @@
/target
Cargo.lock
+.claude/
diff --git a/Cargo.toml b/Cargo.toml
@@ -4,6 +4,10 @@ version = "0.1.0"
edition = "2024"
description = "CLI tools for viewing, filtering, and comparing tabular data files"
license = "MIT"
+homepage = "https://github.com/LouLouLibs/dt-cli-tools"
+repository = "https://github.com/LouLouLibs/dt-cli-tools"
+keywords = ["cli", "csv", "parquet", "data", "diff"]
+categories = ["command-line-utilities"]
[lib]
name = "dtcore"
@@ -45,3 +49,16 @@ opt-level = "z"
assert_cmd = "2"
predicates = "3"
tempfile = "3"
+criterion = { version = "0.5", features = ["html_reports"] }
+
+[[bench]]
+name = "read"
+harness = false
+
+[[bench]]
+name = "filter"
+harness = false
+
+[[bench]]
+name = "diff"
+harness = false
diff --git a/LICENSE b/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2026 Erik Loualiche
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/README.md b/README.md
@@ -58,6 +58,13 @@ for tool in dtcat dtfilter dtdiff; do
-o ~/.local/bin/$tool
done
chmod +x ~/.local/bin/dt{cat,filter,diff}
+
+# Intel Mac (macOS)
+for tool in dtcat dtfilter dtdiff; do
+ curl -L "https://github.com/LouLouLibs/dt-cli-tools/releases/latest/download/${tool}-x86_64-apple-darwin" \
+ -o ~/.local/bin/$tool
+done
+chmod +x ~/.local/bin/dt{cat,filter,diff}
```
### From source
diff --git a/benches/diff.rs b/benches/diff.rs
@@ -0,0 +1,47 @@
+use criterion::{criterion_group, criterion_main, Criterion};
+use polars::prelude::*;
+
+use dtcore::diff::{DiffOptions, SheetSource, diff_positional, diff_keyed};
+
+fn source(name: &str) -> SheetSource {
+ SheetSource { file_name: name.into(), sheet_name: "Sheet1".into() }
+}
+
+fn make_df(n: usize, offset: i64) -> DataFrame {
+ let ids: Vec<i64> = (offset..offset + n as i64).collect();
+ let names: Vec<String> = ids.iter().map(|i| format!("name_{}", i)).collect();
+ let values: Vec<i64> = ids.iter().map(|i| i * 100).collect();
+
+ DataFrame::new(vec![
+ Series::new("id".into(), &ids).into_column(),
+ Series::new("name".into(), &names).into_column(),
+ Series::new("value".into(), &values).into_column(),
+ ]).unwrap()
+}
+
+fn bench_diff(c: &mut Criterion) {
+ let opts_positional = DiffOptions::default();
+ let opts_keyed = DiffOptions { key_columns: vec!["id".into()], tolerance: None };
+
+ for &size in &[1_000, 10_000, 100_000] {
+ let df_a = make_df(size, 0);
+ // 10% of rows differ (shifted by 10% of size)
+ let shift = (size / 10) as i64;
+ let df_b = make_df(size, shift);
+
+ let mut group = c.benchmark_group(format!("diff_{size}"));
+
+ group.bench_function("positional", |b| {
+ b.iter(|| diff_positional(&df_a, &df_b, &opts_positional, source("a"), source("b")).unwrap())
+ });
+
+ group.bench_function("keyed", |b| {
+ b.iter(|| diff_keyed(&df_a, &df_b, &opts_keyed, source("a"), source("b")).unwrap())
+ });
+
+ group.finish();
+ }
+}
+
+criterion_group!(benches, bench_diff);
+criterion_main!(benches);
diff --git a/benches/filter.rs b/benches/filter.rs
@@ -0,0 +1,62 @@
+use criterion::{criterion_group, criterion_main, Criterion};
+use polars::prelude::*;
+
+use dtcore::filter::{FilterExpr, FilterOp, FilterOptions, SortSpec, apply_filters, filter_pipeline};
+
+fn make_df(n: usize) -> DataFrame {
+ let ids: Vec<i64> = (0..n as i64).collect();
+ let regions: Vec<&str> = (0..n).map(|i| ["East", "West", "North", "South"][i % 4]).collect();
+ let values: Vec<i64> = (0..n).map(|i| i as i64 * 100).collect();
+ let names: Vec<String> = (0..n).map(|i| format!("name_{}", i)).collect();
+
+ DataFrame::new(vec![
+ Series::new("id".into(), &ids).into_column(),
+ Series::new("region".into(), &regions).into_column(),
+ Series::new("value".into(), &values).into_column(),
+ Series::new("name".into(), &names).into_column(),
+ ]).unwrap()
+}
+
+fn bench_filter(c: &mut Criterion) {
+ for &size in &[1_000, 10_000, 100_000] {
+ let df = make_df(size);
+
+ let mut group = c.benchmark_group(format!("filter_{size}"));
+
+ // Equality filter
+ let eq_expr = vec![FilterExpr { column: "region".into(), op: FilterOp::Eq, value: "East".into() }];
+ group.bench_function("eq", |b| {
+ b.iter(|| apply_filters(&df, &eq_expr).unwrap())
+ });
+
+ // Numeric comparison
+ let gt_expr = vec![FilterExpr { column: "value".into(), op: FilterOp::Gt, value: (size as i64 * 50).to_string() }];
+ group.bench_function("gt", |b| {
+ b.iter(|| apply_filters(&df, &gt_expr).unwrap())
+ });
+
+ // Contains (string scan)
+ let contains_expr = vec![FilterExpr { column: "name".into(), op: FilterOp::Contains, value: "42".into() }];
+ group.bench_function("contains", |b| {
+ b.iter(|| apply_filters(&df, &contains_expr).unwrap())
+ });
+
+ // Full pipeline: filter + sort + limit
+ let pipeline_opts = FilterOptions {
+ filters: vec![FilterExpr { column: "region".into(), op: FilterOp::Eq, value: "East".into() }],
+ sort: Some(SortSpec { column: "value".into(), descending: true }),
+ limit: Some(10),
+ cols: None,
+ head: None,
+ tail: None,
+ };
+ group.bench_function("pipeline", |b| {
+ b.iter(|| filter_pipeline(df.clone(), &pipeline_opts).unwrap())
+ });
+
+ group.finish();
+ }
+}
+
+criterion_group!(benches, bench_filter);
+criterion_main!(benches);
diff --git a/benches/read.rs b/benches/read.rs
@@ -0,0 +1,98 @@
+use criterion::{criterion_group, criterion_main, Criterion, BenchmarkId};
+use polars::prelude::*;
+use std::path::PathBuf;
+use tempfile::NamedTempFile;
+use std::io::Write;
+
+use dtcore::format::Format;
+use dtcore::reader::{ReadOptions, read_file};
+
+fn generate_csv(n: usize) -> NamedTempFile {
+ let mut f = NamedTempFile::with_suffix(".csv").unwrap();
+ writeln!(f, "id,name,region,value").unwrap();
+ let regions = ["East", "West", "North", "South"];
+ for i in 0..n {
+ writeln!(f, "{},name_{},{},{}", i, i, regions[i % 4], i * 100).unwrap();
+ }
+ f.flush().unwrap();
+ f
+}
+
+fn generate_parquet(n: usize) -> NamedTempFile {
+ let ids: Vec<i64> = (0..n as i64).collect();
+ let names: Vec<String> = (0..n).map(|i| format!("name_{}", i)).collect();
+ let regions: Vec<&str> = (0..n).map(|i| ["East", "West", "North", "South"][i % 4]).collect();
+ let values: Vec<i64> = (0..n).map(|i| i as i64 * 100).collect();
+
+ let mut df = DataFrame::new(vec![
+ Series::new("id".into(), &ids).into_column(),
+ Series::new("name".into(), &names).into_column(),
+ Series::new("region".into(), &regions).into_column(),
+ Series::new("value".into(), &values).into_column(),
+ ]).unwrap();
+
+ let f = NamedTempFile::with_suffix(".parquet").unwrap();
+ let file = std::fs::File::create(f.path()).unwrap();
+ ParquetWriter::new(file).finish(&mut df).unwrap();
+ f
+}
+
+fn generate_arrow(n: usize) -> NamedTempFile {
+ let ids: Vec<i64> = (0..n as i64).collect();
+ let values: Vec<i64> = (0..n).map(|i| i as i64 * 100).collect();
+
+ let mut df = DataFrame::new(vec![
+ Series::new("id".into(), &ids).into_column(),
+ Series::new("value".into(), &values).into_column(),
+ ]).unwrap();
+
+ let f = NamedTempFile::with_suffix(".arrow").unwrap();
+ let file = std::fs::File::create(f.path()).unwrap();
+ IpcWriter::new(file).finish(&mut df).unwrap();
+ f
+}
+
+fn generate_ndjson(n: usize) -> NamedTempFile {
+ let mut f = NamedTempFile::with_suffix(".ndjson").unwrap();
+ let regions = ["East", "West", "North", "South"];
+ for i in 0..n {
+ writeln!(f, r#"{{"id":{},"name":"name_{}","region":"{}","value":{}}}"#,
+ i, i, regions[i % 4], i * 100).unwrap();
+ }
+ f.flush().unwrap();
+ f
+}
+
+fn bench_read(c: &mut Criterion) {
+ let opts = ReadOptions::default();
+
+ for &size in &[1_000, 10_000, 100_000] {
+ let csv = generate_csv(size);
+ let parquet = generate_parquet(size);
+ let arrow = generate_arrow(size);
+ let ndjson = generate_ndjson(size);
+
+ let mut group = c.benchmark_group(format!("read_{size}"));
+
+ group.bench_function("csv", |b| {
+ b.iter(|| read_file(csv.path(), Format::Csv, &opts).unwrap())
+ });
+
+ group.bench_function("parquet", |b| {
+ b.iter(|| read_file(parquet.path(), Format::Parquet, &opts).unwrap())
+ });
+
+ group.bench_function("arrow", |b| {
+ b.iter(|| read_file(arrow.path(), Format::Arrow, &opts).unwrap())
+ });
+
+ group.bench_function("ndjson", |b| {
+ b.iter(|| read_file(ndjson.path(), Format::Ndjson, &opts).unwrap())
+ });
+
+ group.finish();
+ }
+}
+
+criterion_group!(benches, bench_read);
+criterion_main!(benches);
diff --git a/docs/superpowers/specs/2026-04-04-v0.2.0-sample-convert-design.md b/docs/superpowers/specs/2026-04-04-v0.2.0-sample-convert-design.md
@@ -0,0 +1,84 @@
+# dt-cli-tools v0.2.0 — sample and convert
+
+## Summary
+
+Add two flags to dtcat: `--sample N` for random row sampling and `--convert FORMAT` with `-o PATH` for format conversion. No changes to dtfilter or dtdiff.
+
+## `--sample N`
+
+Randomly select N rows from the DataFrame after reading.
+
+- Mutually exclusive with `--head`, `--tail`, `--all`
+- Works with `--csv` output and default markdown
+- Works with `--skip` and `--sheet` (applied before sampling)
+- Mutually exclusive with `--schema`, `--describe`, `--info`
+- Non-deterministic (no seed flag)
+- If N >= row count, return all rows (no error)
+
+### Examples
+
+```bash
+dtcat huge.parquet --sample 20
+dtcat huge.parquet --sample 50 --csv
+dtcat report.xlsx --sheet Data --sample 10
+```
+
+## `--convert FORMAT` with `-o PATH`
+
+Read any supported format, write to a different format.
+
+- `--convert FORMAT` — target format: csv, tsv, parquet, arrow, json, ndjson
+- `-o PATH` — output file path
+- For text formats (csv, tsv, json, ndjson): if `-o` omitted, write to stdout
+- For binary formats (parquet, arrow): `-o` is required; error if missing
+- Mutually exclusive with all display flags (`--schema`, `--describe`, `--info`, `--csv`, `--head`, `--tail`, `--all`, `--sample`)
+- Works with `--skip` and `--sheet` (select data before converting)
+
+### Examples
+
+```bash
+dtcat data.csv --convert parquet -o data.parquet
+dtcat report.xlsx --sheet Revenue --convert csv -o revenue.csv
+dtcat data.json --convert arrow -o data.arrow
+dtcat data.parquet --convert ndjson # stdout
+```
+
+## Architecture
+
+Both features are additions to `src/bin/dtcat.rs` (new CLI args) and `dtcore` (writers).
+
+### New library code
+
+- `src/writers/` module with: `csv.rs`, `parquet.rs`, `arrow.rs`, `json.rs`
+- Each writer takes a `&mut DataFrame` and a `Write` or file path, returns `Result<()>`
+- `src/lib.rs` exports new `writers` module
+
+### Sampling
+
+- Use Polars `DataFrame::sample_n` or equivalent
+- Implemented in dtcat binary after read, before display/convert
+
+## Testing
+
+### `--sample N`
+- Verify output has exactly N rows (when N < total)
+- Verify N >= total returns all rows
+- Verify mutual exclusivity with `--head`/`--tail`/`--all`
+- Works on CSV, Parquet, Excel
+
+### `--convert FORMAT`
+- Roundtrip: CSV → Parquet → CSV, verify data matches
+- All 6 target formats produce valid output
+- Text formats work without `-o` (stdout)
+- Binary formats error without `-o`
+- `--skip` and `--sheet` apply before conversion
+- Mutual exclusivity with display flags
+
+### Existing tests
+- All 187+ existing tests still pass
+
+## Not in scope
+
+- No changes to dtfilter or dtdiff
+- No new binaries
+- No dtset, dtvalidate, or dtjoin
diff --git a/src/bin/dtcat.rs b/src/bin/dtcat.rs
@@ -59,6 +59,10 @@ struct Args {
#[arg(long)]
csv: bool,
+ /// Show all rows (override adaptive row limit)
+ #[arg(long)]
+ all: bool,
+
/// Show file metadata only
#[arg(long)]
info: bool,
@@ -239,8 +243,8 @@ fn run(args: Args) -> Result<()> {
format_data_table(&sliced)
}
(None, None) => {
- // Default: show all if <= threshold, otherwise head+tail
- if df.height() <= DEFAULT_THRESHOLD {
+ // Default: show all if <= threshold or --all, otherwise head+tail
+ if args.all || df.height() <= DEFAULT_THRESHOLD {
format_data_table(&df)
} else {
format_head_tail(&df, DEFAULT_HEAD_TAIL, DEFAULT_HEAD_TAIL)
diff --git a/tests/dtcat.rs b/tests/dtcat.rs
@@ -14,6 +14,8 @@ fn csv_file(content: &str) -> NamedTempFile {
f
}
+// ─── Basic viewing ───
+
#[test]
fn shows_csv_data() {
let f = csv_file("name,value\nAlice,100\nBob,200\n");
@@ -23,6 +25,20 @@ fn shows_csv_data() {
}
#[test]
+fn header_only_csv() {
+ let f = csv_file("name,value\n");
+ dtcat().arg(f.path()).assert().success()
+ .stdout(predicate::str::contains("no data rows"));
+}
+
+#[test]
+fn nonexistent_file_exits_1() {
+ dtcat().arg("/tmp/does_not_exist_12345.csv").assert().failure();
+}
+
+// ─── Modes ───
+
+#[test]
fn schema_flag() {
let f = csv_file("name,value\nAlice,100\n");
dtcat().arg(f.path()).arg("--schema").assert().success()
@@ -31,23 +47,61 @@ fn schema_flag() {
}
#[test]
-fn csv_output_flag() {
+fn describe_flag() {
+ let f = csv_file("name,value\nAlice,100\nBob,200\n");
+ dtcat().arg(f.path()).arg("--describe").assert().success()
+ .stdout(predicate::str::contains("count"))
+ .stdout(predicate::str::contains("mean"));
+}
+
+#[test]
+fn info_flag() {
let f = csv_file("name,value\nAlice,100\n");
- dtcat().arg(f.path()).arg("--csv").assert().success()
- .stdout(predicate::str::contains("name,value"));
+ dtcat().arg(f.path()).arg("--info").assert().success()
+ .stdout(predicate::str::contains("File:"));
}
+// ─── Row windowing ───
+
#[test]
fn head_flag() {
let f = csv_file("x\n1\n2\n3\n4\n5\n");
- dtcat().arg(f.path()).arg("--head").arg("2").assert().success();
+ dtcat().arg(f.path()).arg("--head").arg("2").assert().success()
+ .stdout(predicate::str::contains("1"))
+ .stdout(predicate::str::contains("2"))
+ .stdout(predicate::str::contains("3").not());
}
#[test]
-fn nonexistent_file_exits_1() {
- dtcat().arg("/tmp/does_not_exist_12345.csv").assert().failure();
+fn tail_flag() {
+ let f = csv_file("x\n1\n2\n3\n4\n5\n");
+ dtcat().arg(f.path()).arg("--tail").arg("2").assert().success()
+ .stdout(predicate::str::contains("4"))
+ .stdout(predicate::str::contains("5"));
+}
+
+#[test]
+fn head_and_tail_combined() {
+ let f = csv_file("x\n1\n2\n3\n4\n5\n6\n7\n8\n9\n10\n");
+ dtcat().arg(f.path()).arg("--head").arg("2").arg("--tail").arg("2")
+ .assert().success()
+ .stdout(predicate::str::contains("1"))
+ .stdout(predicate::str::contains("2"))
+ .stdout(predicate::str::contains("9"))
+ .stdout(predicate::str::contains("10"));
}
+// ─── Output format ───
+
+#[test]
+fn csv_output_flag() {
+ let f = csv_file("name,value\nAlice,100\n");
+ dtcat().arg(f.path()).arg("--csv").assert().success()
+ .stdout(predicate::str::contains("name,value"));
+}
+
+// ─── Format detection ───
+
#[test]
fn format_override() {
let mut f = NamedTempFile::with_suffix(".txt").unwrap();
@@ -58,9 +112,107 @@ fn format_override() {
}
#[test]
-fn describe_flag() {
- let f = csv_file("name,value\nAlice,100\nBob,200\n");
- dtcat().arg(f.path()).arg("--describe").assert().success()
- .stdout(predicate::str::contains("count"))
- .stdout(predicate::str::contains("mean"));
+fn tsv_detection() {
+ let mut f = NamedTempFile::with_suffix(".tsv").unwrap();
+ write!(f, "name\tvalue\nAlice\t100\n").unwrap();
+ f.flush().unwrap();
+ dtcat().arg(f.path()).assert().success()
+ .stdout(predicate::str::contains("Alice"))
+ .stdout(predicate::str::contains("100"));
+}
+
+// ─── Skip rows ───
+
+#[test]
+fn skip_metadata_rows() {
+ let f = csv_file("meta1\nmeta2\nname,value\nAlice,100\n");
+ dtcat().arg(f.path()).arg("--skip").arg("2").assert().success()
+ .stdout(predicate::str::contains("Alice"));
+}
+
+// ─── All flag ───
+
+#[test]
+fn all_flag_shows_every_row() {
+ // 60 rows > threshold of 50, so without --all we'd get head+tail
+ let mut content = String::from("x\n");
+ for i in 1..=60 {
+ content.push_str(&format!("{}\n", i));
+ }
+ let f = csv_file(&content);
+ // With --all, row 30 should appear (it would be omitted in head25+tail25)
+ dtcat().arg(f.path()).arg("--all").assert().success()
+ .stdout(predicate::str::contains("| 30 "));
+}
+
+// ─── Parquet ───
+
+#[test]
+fn parquet_view() {
+ dtcat().arg("tests/fixtures/data.parquet").assert().success()
+ .stdout(predicate::str::contains("Alice"))
+ .stdout(predicate::str::contains("Charlie"));
+}
+
+#[test]
+fn parquet_schema() {
+ dtcat().arg("tests/fixtures/data.parquet").arg("--schema").assert().success()
+ .stdout(predicate::str::contains("name"))
+ .stdout(predicate::str::contains("value"));
+}
+
+// ─── Arrow/IPC ───
+
+#[test]
+fn arrow_view() {
+ dtcat().arg("tests/fixtures/data.arrow").assert().success()
+ .stdout(predicate::str::contains("Alice"))
+ .stdout(predicate::str::contains("Charlie"));
+}
+
+#[test]
+fn arrow_schema() {
+ dtcat().arg("tests/fixtures/data.arrow").arg("--schema").assert().success()
+ .stdout(predicate::str::contains("name"))
+ .stdout(predicate::str::contains("value"));
+}
+
+// ─── JSON ───
+
+#[test]
+fn json_view() {
+ dtcat().arg("tests/fixtures/data.json").assert().success()
+ .stdout(predicate::str::contains("Alice"))
+ .stdout(predicate::str::contains("Charlie"));
+}
+
+// ─── NDJSON ───
+
+#[test]
+fn ndjson_view() {
+ dtcat().arg("tests/fixtures/data.ndjson").assert().success()
+ .stdout(predicate::str::contains("Alice"))
+ .stdout(predicate::str::contains("Charlie"));
+}
+
+// ─── Excel ───
+
+#[test]
+fn excel_view() {
+ dtcat().arg("demo/sales.xlsx").assert().success()
+ .stdout(predicate::str::contains("Revenue"));
+}
+
+#[test]
+fn excel_schema() {
+ dtcat().arg("demo/sales.xlsx").arg("--schema").assert().success()
+ .stdout(predicate::str::contains("Column"))
+ .stdout(predicate::str::contains("Revenue"));
+}
+
+#[test]
+fn excel_info() {
+ dtcat().arg("demo/sales.xlsx").arg("--info").assert().success()
+ .stdout(predicate::str::contains("Excel"))
+ .stdout(predicate::str::contains("Sheet1"));
}
diff --git a/tests/dtdiff.rs b/tests/dtdiff.rs
@@ -14,6 +14,8 @@ fn csv_file(content: &str) -> NamedTempFile {
f
}
+// ─── Positional mode ───
+
#[test]
fn no_diff_exits_0() {
let a = csv_file("name,value\nAlice,100\n");
@@ -23,20 +25,142 @@ fn no_diff_exits_0() {
}
#[test]
-fn diff_exits_1() {
+fn positional_diff_exits_1() {
let a = csv_file("name,value\nAlice,100\n");
let b = csv_file("name,value\nBob,200\n");
dtdiff().arg(a.path()).arg(b.path()).assert().code(1);
}
#[test]
-fn keyed_diff() {
+fn positional_added_row() {
+ let a = csv_file("name,value\nAlice,100\n");
+ let b = csv_file("name,value\nAlice,100\nBob,200\n");
+ dtdiff().arg(a.path()).arg(b.path()).assert().code(1)
+ .stdout(predicate::str::contains("Added: 1"));
+}
+
+#[test]
+fn positional_removed_row() {
+ let a = csv_file("name,value\nAlice,100\nBob,200\n");
+ let b = csv_file("name,value\nAlice,100\n");
+ dtdiff().arg(a.path()).arg(b.path()).assert().code(1)
+ .stdout(predicate::str::contains("Removed: 1"));
+}
+
+// ─── Key-based mode ───
+
+#[test]
+fn keyed_diff_modified() {
let a = csv_file("id,name\n1,Alice\n2,Bob\n");
let b = csv_file("id,name\n1,Alice\n2,Robert\n");
dtdiff().arg(a.path()).arg(b.path()).arg("--key").arg("id")
- .assert().code(1);
+ .assert().code(1)
+ .stdout(predicate::str::contains("Modified: 1"))
+ .stdout(predicate::str::contains("Bob"));
+}
+
+#[test]
+fn keyed_diff_added_and_removed() {
+ let a = csv_file("id,name\n1,Alice\n2,Bob\n");
+ let b = csv_file("id,name\n1,Alice\n3,Charlie\n");
+ dtdiff().arg(a.path()).arg(b.path()).arg("--key").arg("id")
+ .assert().code(1)
+ .stdout(predicate::str::contains("Added: 1"))
+ .stdout(predicate::str::contains("Removed: 1"));
+}
+
+#[test]
+fn keyed_no_diff() {
+ let a = csv_file("id,name\n1,Alice\n2,Bob\n");
+ let b = csv_file("id,name\n2,Bob\n1,Alice\n");
+ dtdiff().arg(a.path()).arg(b.path()).arg("--key").arg("id")
+ .assert().success()
+ .stdout(predicate::str::contains("No differences"));
+}
+
+// ─── Composite keys ───
+
+#[test]
+fn composite_key() {
+ let a = csv_file("date,ticker,price\n2024-01-01,AAPL,150\n2024-01-01,GOOG,140\n");
+ let b = csv_file("date,ticker,price\n2024-01-01,AAPL,150\n2024-01-01,GOOG,145\n");
+ dtdiff().arg(a.path()).arg(b.path()).arg("--key").arg("date,ticker")
+ .assert().code(1)
+ .stdout(predicate::str::contains("Modified: 1"))
+ .stdout(predicate::str::contains("GOOG"));
+}
+
+// ─── Float tolerance ───
+
+#[test]
+fn tolerance_suppresses_small_diff() {
+ let a = csv_file("id,price\n1,150.000\n");
+ let b = csv_file("id,price\n1,150.005\n");
+ dtdiff().arg(a.path()).arg(b.path()).arg("--key").arg("id").arg("--tolerance").arg("0.01")
+ .assert().success()
+ .stdout(predicate::str::contains("No differences"));
+}
+
+#[test]
+fn tolerance_reports_large_diff() {
+ let a = csv_file("id,price\n1,150.0\n");
+ let b = csv_file("id,price\n1,155.0\n");
+ dtdiff().arg(a.path()).arg(b.path()).arg("--key").arg("id").arg("--tolerance").arg("0.01")
+ .assert().code(1)
+ .stdout(predicate::str::contains("Modified: 1"));
+}
+
+// ─── Parquet ───
+
+#[test]
+fn parquet_keyed_diff() {
+ dtdiff().arg("tests/fixtures/old.parquet").arg("tests/fixtures/new.parquet")
+ .arg("--key").arg("id")
+ .assert().code(1)
+ .stdout(predicate::str::contains("Added: 1"))
+ .stdout(predicate::str::contains("Removed: 1"));
+}
+
+#[test]
+fn parquet_no_diff() {
+ dtdiff().arg("tests/fixtures/data.parquet").arg("tests/fixtures/data.parquet")
+ .assert().success()
+ .stdout(predicate::str::contains("No differences"));
+}
+
+// ─── Arrow/IPC ───
+
+#[test]
+fn arrow_keyed_diff() {
+ dtdiff().arg("tests/fixtures/old.arrow").arg("tests/fixtures/new.arrow")
+ .arg("--key").arg("id")
+ .assert().code(1)
+ .stdout(predicate::str::contains("Added: 1"))
+ .stdout(predicate::str::contains("Removed: 1"));
+}
+
+// ─── JSON ───
+
+#[test]
+fn json_keyed_diff() {
+ dtdiff().arg("tests/fixtures/old.json").arg("tests/fixtures/new.json")
+ .arg("--key").arg("id")
+ .assert().code(1)
+ .stdout(predicate::str::contains("Modified: 1"));
}
+// ─── NDJSON ───
+
+#[test]
+fn ndjson_keyed_diff() {
+ dtdiff().arg("tests/fixtures/old.ndjson").arg("tests/fixtures/new.ndjson")
+ .arg("--key").arg("id")
+ .assert().code(1)
+ .stdout(predicate::str::contains("Modified: 1"));
+}
+
+// ─── Output formats ───
+
#[test]
fn json_output() {
let a = csv_file("id,val\n1,a\n");
@@ -54,3 +178,29 @@ fn csv_output() {
.assert().code(1)
.stdout(predicate::str::contains("_status"));
}
+
+#[test]
+fn no_color_flag() {
+ let a = csv_file("name,value\nAlice,100\n");
+ let b = csv_file("name,value\nBob,200\n");
+ dtdiff().arg(a.path()).arg(b.path()).arg("--no-color")
+ .assert().code(1);
+}
+
+// ─── Excel ───
+
+#[test]
+fn excel_keyed_diff() {
+ dtdiff().arg("demo/old.xlsx").arg("demo/new.xlsx").arg("--key").arg("ID")
+ .assert().code(1)
+ .stdout(predicate::str::contains("Added: 1"))
+ .stdout(predicate::str::contains("Removed: 1"))
+ .stdout(predicate::str::contains("Modified: 3"));
+}
+
+#[test]
+fn excel_no_diff() {
+ dtdiff().arg("demo/old.xlsx").arg("demo/old.xlsx")
+ .assert().success()
+ .stdout(predicate::str::contains("No differences"));
+}
diff --git a/tests/dtfilter.rs b/tests/dtfilter.rs
@@ -14,29 +14,124 @@ fn csv_file(content: &str) -> NamedTempFile {
f
}
+const DATA: &str = "name,value\nAlice,100\nBob,200\nCharlie,300\n";
+
+// ─── Equality ───
+
#[test]
fn filter_eq() {
- let f = csv_file("name,value\nAlice,100\nBob,200\n");
+ let f = csv_file(DATA);
dtfilter().arg(f.path()).arg("--filter").arg("name=Alice").assert().success()
.stdout(predicate::str::contains("Alice"))
.stdout(predicate::str::contains("Bob").not());
}
#[test]
+fn filter_neq() {
+ let f = csv_file(DATA);
+ dtfilter().arg(f.path()).arg("--filter").arg("name!=Alice").assert().success()
+ .stdout(predicate::str::contains("Bob"))
+ .stdout(predicate::str::contains("Charlie"))
+ .stdout(predicate::str::contains("Alice").not());
+}
+
+// ─── Numeric comparisons ───
+
+#[test]
fn filter_gt() {
- let f = csv_file("name,value\nAlice,100\nBob,200\nCharlie,300\n");
+ let f = csv_file(DATA);
dtfilter().arg(f.path()).arg("--filter").arg("value>150").assert().success()
.stdout(predicate::str::contains("Bob"))
- .stdout(predicate::str::contains("Charlie"));
+ .stdout(predicate::str::contains("Charlie"))
+ .stdout(predicate::str::contains("Alice").not());
+}
+
+#[test]
+fn filter_lt() {
+ let f = csv_file(DATA);
+ dtfilter().arg(f.path()).arg("--filter").arg("value<200").assert().success()
+ .stdout(predicate::str::contains("Alice"))
+ .stdout(predicate::str::contains("Bob").not());
+}
+
+#[test]
+fn filter_gte() {
+ let f = csv_file(DATA);
+ dtfilter().arg(f.path()).arg("--filter").arg("value>=200").assert().success()
+ .stdout(predicate::str::contains("Bob"))
+ .stdout(predicate::str::contains("Charlie"))
+ .stdout(predicate::str::contains("Alice").not());
+}
+
+#[test]
+fn filter_lte() {
+ let f = csv_file(DATA);
+ dtfilter().arg(f.path()).arg("--filter").arg("value<=200").assert().success()
+ .stdout(predicate::str::contains("Alice"))
+ .stdout(predicate::str::contains("Bob"))
+ .stdout(predicate::str::contains("Charlie").not());
+}
+
+// ─── String matching ───
+
+#[test]
+fn filter_contains() {
+ let f = csv_file(DATA);
+ dtfilter().arg(f.path()).arg("--filter").arg("name~ob").assert().success()
+ .stdout(predicate::str::contains("Bob"))
+ .stdout(predicate::str::contains("Alice").not());
}
#[test]
+fn filter_not_contains() {
+ let f = csv_file(DATA);
+ dtfilter().arg(f.path()).arg("--filter").arg("name!~ob").assert().success()
+ .stdout(predicate::str::contains("Alice"))
+ .stdout(predicate::str::contains("Charlie"))
+ .stdout(predicate::str::contains("Bob").not());
+}
+
+// ─── Multiple filters (AND) ───
+
+#[test]
+fn multiple_filters_and() {
+ let f = csv_file(DATA);
+ dtfilter().arg(f.path())
+ .arg("--filter").arg("value>=200")
+ .arg("--filter").arg("value<=300")
+ .assert().success()
+ .stdout(predicate::str::contains("Bob"))
+ .stdout(predicate::str::contains("Charlie"))
+ .stdout(predicate::str::contains("Alice").not());
+}
+
+// ─── Sort ───
+
+#[test]
fn sort_desc() {
- let f = csv_file("name,value\nAlice,100\nBob,200\n");
- dtfilter().arg(f.path()).arg("--sort").arg("value:desc").assert().success();
+ let f = csv_file(DATA);
+ let out = dtfilter().arg(f.path()).arg("--sort").arg("value:desc")
+ .assert().success();
+ let stdout = String::from_utf8(out.get_output().stdout.clone()).unwrap();
+ let charlie_pos = stdout.find("Charlie").unwrap();
+ let alice_pos = stdout.find("Alice").unwrap();
+ assert!(charlie_pos < alice_pos, "Charlie (300) should appear before Alice (100) in desc sort");
}
#[test]
+fn sort_asc() {
+ let f = csv_file(DATA);
+ let out = dtfilter().arg(f.path()).arg("--sort").arg("value")
+ .assert().success();
+ let stdout = String::from_utf8(out.get_output().stdout.clone()).unwrap();
+ let alice_pos = stdout.find("Alice").unwrap();
+ let charlie_pos = stdout.find("Charlie").unwrap();
+ assert!(alice_pos < charlie_pos, "Alice (100) should appear before Charlie (300) in asc sort");
+}
+
+// ─── Column selection ───
+
+#[test]
fn columns_select() {
let f = csv_file("name,value,extra\nAlice,100,x\n");
dtfilter().arg(f.path()).arg("--columns").arg("name,value").assert().success()
@@ -44,6 +139,19 @@ fn columns_select() {
.stdout(predicate::str::contains("extra").not());
}
+// ─── Limit ───
+
+#[test]
+fn limit_output() {
+ let f = csv_file(DATA);
+ dtfilter().arg(f.path()).arg("--sort").arg("value:desc").arg("--limit").arg("1")
+ .assert().success()
+ .stdout(predicate::str::contains("Charlie"))
+ .stdout(predicate::str::contains("Alice").not());
+}
+
+// ─── Output format ───
+
#[test]
fn csv_output() {
let f = csv_file("name,value\nAlice,100\n");
@@ -51,9 +159,82 @@ fn csv_output() {
.stdout(predicate::str::contains("name,value"));
}
+// ─── Windowing ───
+
+#[test]
+fn head_before_filter() {
+ let f = csv_file("name,value\nAlice,100\nBob,200\nCharlie,300\n");
+ dtfilter().arg(f.path()).arg("--head").arg("2").arg("--filter").arg("value>150")
+ .assert().success()
+ .stdout(predicate::str::contains("Bob"))
+ .stdout(predicate::str::contains("Charlie").not());
+}
+
#[test]
fn head_tail_exclusive() {
let f = csv_file("x\n1\n2\n");
dtfilter().arg(f.path()).arg("--head").arg("1").arg("--tail").arg("1")
.assert().code(2);
}
+
+// ─── Excel ───
+
+#[test]
+fn filter_excel() {
+ dtfilter().arg("demo/sales.xlsx").arg("--filter").arg("Region=East")
+ .assert().success()
+ .stdout(predicate::str::contains("East"))
+ .stdout(predicate::str::contains("West").not());
+}
+
+// ─── Parquet ───
+
+#[test]
+fn filter_parquet() {
+ dtfilter().arg("tests/fixtures/data.parquet").arg("--filter").arg("value>150")
+ .assert().success()
+ .stdout(predicate::str::contains("Bob"))
+ .stdout(predicate::str::contains("Charlie"))
+ .stdout(predicate::str::contains("Alice").not());
+}
+
+// ─── Arrow/IPC ───
+
+#[test]
+fn filter_arrow() {
+ dtfilter().arg("tests/fixtures/data.arrow").arg("--filter").arg("value>150")
+ .assert().success()
+ .stdout(predicate::str::contains("Bob"))
+ .stdout(predicate::str::contains("Charlie"))
+ .stdout(predicate::str::contains("Alice").not());
+}
+
+// ─── JSON ───
+
+#[test]
+fn filter_json() {
+ dtfilter().arg("tests/fixtures/data.json").arg("--filter").arg("name=Alice")
+ .assert().success()
+ .stdout(predicate::str::contains("Alice"))
+ .stdout(predicate::str::contains("Bob").not());
+}
+
+// ─── NDJSON ───
+
+#[test]
+fn filter_ndjson() {
+ dtfilter().arg("tests/fixtures/data.ndjson").arg("--filter").arg("name=Alice")
+ .assert().success()
+ .stdout(predicate::str::contains("Alice"))
+ .stdout(predicate::str::contains("Bob").not());
+}
+
+// ─── Edge cases ───
+
+#[test]
+fn filter_no_matches() {
+ let f = csv_file(DATA);
+ dtfilter().arg(f.path()).arg("--filter").arg("name=Nobody")
+ .assert().success()
+ .stderr(predicate::str::contains("0 rows"));
+}
diff --git a/tests/fixtures/data.arrow b/tests/fixtures/data.arrow
Binary files differ.
diff --git a/tests/fixtures/data.csv b/tests/fixtures/data.csv
@@ -0,0 +1,4 @@
+name,value
+Alice,100
+Bob,200
+Charlie,300
diff --git a/tests/fixtures/data.json b/tests/fixtures/data.json
@@ -0,0 +1 @@
+[{"name":"Alice","value":100},{"name":"Bob","value":200},{"name":"Charlie","value":300}]+
\ No newline at end of file
diff --git a/tests/fixtures/data.ndjson b/tests/fixtures/data.ndjson
@@ -0,0 +1,3 @@
+{"name":"Alice","value":100}
+{"name":"Bob","value":200}
+{"name":"Charlie","value":300}
diff --git a/tests/fixtures/data.parquet b/tests/fixtures/data.parquet
Binary files differ.
diff --git a/tests/fixtures/new.arrow b/tests/fixtures/new.arrow
Binary files differ.
diff --git a/tests/fixtures/new.csv b/tests/fixtures/new.csv
@@ -0,0 +1,4 @@
+id,name
+1,Alice
+2,Robert
+4,Diana
diff --git a/tests/fixtures/new.json b/tests/fixtures/new.json
@@ -0,0 +1 @@
+[{"id": 1, "name": "Alice"}, {"id": 2, "name": "Robert"}, {"id": 4, "name": "Diana"}]+
\ No newline at end of file
diff --git a/tests/fixtures/new.ndjson b/tests/fixtures/new.ndjson
@@ -0,0 +1,3 @@
+{"id":1,"name":"Alice"}
+{"id":2,"name":"Robert"}
+{"id":4,"name":"Diana"}
diff --git a/tests/fixtures/new.parquet b/tests/fixtures/new.parquet
Binary files differ.
diff --git a/tests/fixtures/old.arrow b/tests/fixtures/old.arrow
Binary files differ.
diff --git a/tests/fixtures/old.csv b/tests/fixtures/old.csv
@@ -0,0 +1,4 @@
+id,name
+1,Alice
+2,Bob
+3,Charlie
diff --git a/tests/fixtures/old.json b/tests/fixtures/old.json
@@ -0,0 +1 @@
+[{"id": 1, "name": "Alice"}, {"id": 2, "name": "Bob"}, {"id": 3, "name": "Charlie"}]+
\ No newline at end of file
diff --git a/tests/fixtures/old.ndjson b/tests/fixtures/old.ndjson
@@ -0,0 +1,3 @@
+{"id":1,"name":"Alice"}
+{"id":2,"name":"Bob"}
+{"id":3,"name":"Charlie"}
diff --git a/tests/fixtures/old.parquet b/tests/fixtures/old.parquet
Binary files differ.