dt-cli-tools

CLI tools for viewing, filtering, and comparing tabular data files
Log | Files | Refs | README | LICENSE

commit 601fbfe88df565049c4894f1024cb016beab9d67
parent b2789b7aff3b7b6cb1c326fac1049494051df3ce
Author: Erik Loualiche <eloualiche@users.noreply.github.com>
Date:   Sat,  4 Apr 2026 13:55:34 -0500

Merge pull request #1 from LouLouLibs/feat/v0.2.0-sample-convert

feat: v0.2.0 — add --sample and --convert to dtcat
Diffstat:
M Cargo.toml | 3 ++-
M README.md | 13 +++++++++++++
A docs/superpowers/plans/2026-04-04-v0.2.0-sample-convert.md | 692 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M src/bin/dtcat.rs | 62 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
M src/lib.rs | 2 ++
A src/writer.rs | 17 +++++++++++++++++
A src/writers/arrow.rs | 35 +++++++++++++++++++++++++++++++++++
A src/writers/csv.rs | 62 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A src/writers/json.rs | 60 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A src/writers/mod.rs | 4 ++++
A src/writers/parquet.rs | 36 ++++++++++++++++++++++++++++++++++++
M tests/dtcat.rs | 132 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
12 files changed, 1116 insertions(+), 2 deletions(-)

diff --git a/Cargo.toml b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "dt-cli-tools" -version = "0.1.0" +version = "0.2.0" edition = "2024" description = "CLI tools for viewing, filtering, and comparing tabular data files" license = "MIT" @@ -32,6 +32,7 @@ polars = { version = "0.46", default-features = false, features = [ "parquet", "ipc", "json", + "random", ] } calamine = "0.26" clap = { version = "4", features = ["derive"] } diff --git a/README.md b/README.md @@ -102,6 +102,15 @@ dtcat data.csv --tail 5 # CSV output for piping dtcat data.parquet --csv +# Random sample of rows +dtcat huge.parquet --sample 20 +dtcat huge.parquet --sample 50 --csv + +# Convert between formats +dtcat data.csv --convert parquet -o data.parquet +dtcat report.xlsx --sheet Revenue --convert csv -o revenue.csv +dtcat data.parquet --convert ndjson # text formats go to stdout + # Override format detection dtcat data.txt --format csv @@ -141,6 +150,10 @@ dtcat data.csv --skip 2 Modes `--schema`, `--describe`, `--info`, and data (default) are mutually exclusive. +`--sample N` randomly selects N rows; mutually exclusive with `--head`/`--tail`/`--all`. + +`--convert FORMAT` writes to a different format. Use `-o PATH` for output file (required for binary formats Parquet/Arrow; optional for text formats which default to stdout). Supported targets: csv, tsv, parquet, arrow, json, ndjson. + ## dtfilter — Query and Filter <img src="demo/dtfilter.gif" alt="dtfilter demo" width="80%" /> diff --git a/docs/superpowers/plans/2026-04-04-v0.2.0-sample-convert.md b/docs/superpowers/plans/2026-04-04-v0.2.0-sample-convert.md @@ -0,0 +1,692 @@ +# v0.2.0: --sample and --convert Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. 
+ +**Goal:** Add random row sampling (`--sample N`) and format conversion (`--convert FORMAT -o PATH`) to dtcat. + +**Architecture:** Both features extend the existing dtcat binary. `--sample` uses Polars `DataFrame::sample_n_literal` after reading, before display. `--convert` requires new writer functions in a `src/writers/` module mirroring `src/readers/`, then a write path in dtcat that short-circuits display. + +**Tech Stack:** Polars (ParquetWriter, IpcWriter, JsonWriter, CsvWriter), clap, anyhow. + +--- + +### Task 1: Add `--sample N` flag and validation + +**Files:** +- Modify: `src/bin/dtcat.rs` +- Test: `tests/dtcat.rs` + +- [ ] **Step 1: Write the failing tests** + +Add to `tests/dtcat.rs`: + +```rust +#[test] +fn sample_returns_n_rows() { + // 18-row fixture, sample 5 + let out = dtcat().arg("demo/sales.csv").arg("--sample").arg("5").arg("--csv") + .assert().success(); + let stdout = String::from_utf8(out.get_output().stdout.clone()).unwrap(); + // CSV header + 5 data rows = 6 lines (last line may be empty) + let lines: Vec<&str> = stdout.trim().lines().collect(); + assert_eq!(lines.len(), 6, "expected header + 5 rows, got {}", lines.len()); +} + +#[test] +fn sample_ge_total_returns_all() { + let f = csv_file("x\n1\n2\n3\n"); + dtcat().arg(f.path()).arg("--sample").arg("100").arg("--csv") + .assert().success(); +} + +#[test] +fn sample_conflicts_with_head() { + let f = csv_file("x\n1\n"); + dtcat().arg(f.path()).arg("--sample").arg("1").arg("--head").arg("1") + .assert().code(2); +} + +#[test] +fn sample_conflicts_with_tail() { + let f = csv_file("x\n1\n"); + dtcat().arg(f.path()).arg("--sample").arg("1").arg("--tail").arg("1") + .assert().code(2); +} + +#[test] +fn sample_conflicts_with_all() { + let f = csv_file("x\n1\n"); + dtcat().arg(f.path()).arg("--sample").arg("1").arg("--all") + .assert().code(2); +} +``` + +- [ ] **Step 2: Run tests to verify they fail** + +Run: `cargo test --test dtcat sample` +Expected: FAIL — unknown arg `--sample` + +- [ ] 
**Step 3: Add `--sample` arg and validation to dtcat** + +In `src/bin/dtcat.rs`, add to the `Args` struct after the `all` field: + +```rust + /// Randomly sample N rows + #[arg(long, value_name = "N")] + sample: Option<usize>, +``` + +Update `validate_args`: + +```rust +fn validate_args(args: &Args) -> Result<()> { + if args.schema && args.describe { + bail!("--schema and --describe are mutually exclusive"); + } + if args.sample.is_some() { + if args.head.is_some() { + bail!("--sample and --head are mutually exclusive"); + } + if args.tail.is_some() { + bail!("--sample and --tail are mutually exclusive"); + } + if args.all { + bail!("--sample and --all are mutually exclusive"); + } + } + Ok(()) +} +``` + +- [ ] **Step 4: Implement sampling logic in the display section** + +In `src/bin/dtcat.rs`, replace the display match block (the `let output = match ...` section) with: + +```rust + // Determine what to display + let output = if let Some(n) = args.sample { + let sampled = if n >= df.height() { + df + } else { + df.sample_n_literal(n, false, false, None)? + }; + format_data_table(&sampled) + } else { + match (args.head, args.tail) { + (Some(h), Some(t)) => { + format_head_tail(&df, h, t) + } + (Some(h), None) => { + let sliced = df.head(Some(h)); + format_data_table(&sliced) + } + (None, Some(t)) => { + let sliced = df.tail(Some(t)); + format_data_table(&sliced) + } + (None, None) => { + if args.all || df.height() <= DEFAULT_THRESHOLD { + format_data_table(&df) + } else { + format_head_tail(&df, DEFAULT_HEAD_TAIL, DEFAULT_HEAD_TAIL) + } + } + } + }; +``` + +Also handle `--sample` with `--csv` output. The current `--csv` branch exits early before the display match. Move sampling before the csv check, or handle it inline. The simplest approach: apply sampling before the `--csv` check. 
After the line `let df = read_file(&path, fmt, &opts)?;`, add: + +```rust + // Apply sampling if requested (before any display mode) + let df = if let Some(n) = args.sample { + if n >= df.height() { + df + } else { + df.sample_n_literal(n, false, false, None)? + } + } else { + df + }; +``` + +Then remove the sample handling from the display match block (revert it to the original match block). This way `--sample` + `--csv` works naturally. + +- [ ] **Step 5: Run tests to verify they pass** + +Run: `cargo test --test dtcat sample` +Expected: all 5 sample tests PASS + +- [ ] **Step 6: Commit** + +```bash +git add src/bin/dtcat.rs tests/dtcat.rs +git commit -m "feat: add --sample N flag to dtcat" +``` + +--- + +### Task 2: Create writers module + +**Files:** +- Create: `src/writers/mod.rs` +- Create: `src/writers/csv.rs` +- Create: `src/writers/parquet.rs` +- Create: `src/writers/arrow.rs` +- Create: `src/writers/json.rs` +- Modify: `src/lib.rs` + +- [ ] **Step 1: Create `src/writers/mod.rs`** + +```rust +pub mod arrow; +pub mod csv; +pub mod json; +pub mod parquet; +``` + +- [ ] **Step 2: Create `src/writers/csv.rs`** + +```rust +use anyhow::Result; +use polars::prelude::*; +use std::io::Write; +use std::path::Path; + +use crate::format::Format; + +pub fn write(df: &mut DataFrame, path: Option<&Path>, format: Format) -> Result<()> { + let separator = match format { + Format::Tsv => b'\t', + _ => b',', + }; + + match path { + Some(p) => { + let file = std::fs::File::create(p)?; + CsvWriter::new(file) + .with_separator(separator) + .finish(df)?; + } + None => { + let mut buf = Vec::new(); + CsvWriter::new(&mut buf) + .with_separator(separator) + .finish(df)?; + std::io::stdout().write_all(&buf)?; + } + } + Ok(()) +} + +#[cfg(test)] +mod tests { + use super::*; + use tempfile::NamedTempFile; + + #[test] + fn write_csv_roundtrip() { + let s1 = Series::new("name".into(), &["Alice", "Bob"]); + let s2 = Series::new("value".into(), &[100i64, 200]); + let mut df = 
DataFrame::new(vec![s1.into_column(), s2.into_column()]).unwrap(); + + let f = NamedTempFile::with_suffix(".csv").unwrap(); + write(&mut df, Some(f.path()), Format::Csv).unwrap(); + + let result = crate::readers::csv::read(f.path(), &crate::reader::ReadOptions::default()).unwrap(); + assert_eq!(result.height(), 2); + assert_eq!(result.get_column_names(), df.get_column_names()); + } + + #[test] + fn write_tsv_uses_tab() { + let s = Series::new("x".into(), &[1i64]); + let mut df = DataFrame::new(vec![s.into_column()]).unwrap(); + + let f = NamedTempFile::with_suffix(".tsv").unwrap(); + write(&mut df, Some(f.path()), Format::Tsv).unwrap(); + + let content = std::fs::read_to_string(f.path()).unwrap(); + assert!(!content.contains(',')); + } +} +``` + +- [ ] **Step 3: Create `src/writers/parquet.rs`** + +```rust +use anyhow::{bail, Result}; +use polars::prelude::*; +use std::path::Path; + +pub fn write(df: &mut DataFrame, path: Option<&Path>) -> Result<()> { + let path = path.ok_or_else(|| anyhow::anyhow!("--convert parquet requires -o PATH"))?; + let file = std::fs::File::create(path)?; + ParquetWriter::new(file).finish(df)?; + Ok(()) +} + +#[cfg(test)] +mod tests { + use super::*; + use tempfile::NamedTempFile; + + #[test] + fn write_parquet_roundtrip() { + let s1 = Series::new("name".into(), &["Alice", "Bob"]); + let s2 = Series::new("value".into(), &[100i64, 200]); + let mut df = DataFrame::new(vec![s1.into_column(), s2.into_column()]).unwrap(); + + let f = NamedTempFile::with_suffix(".parquet").unwrap(); + write(&mut df, Some(f.path())).unwrap(); + + let result = crate::readers::parquet::read(f.path(), &crate::reader::ReadOptions::default()).unwrap(); + assert_eq!(result.height(), 2); + } + + #[test] + fn write_parquet_no_path_errors() { + let s = Series::new("x".into(), &[1i64]); + let mut df = DataFrame::new(vec![s.into_column()]).unwrap(); + assert!(write(&mut df, None).is_err()); + } +} +``` + +- [ ] **Step 4: Create `src/writers/arrow.rs`** + +```rust +use 
anyhow::Result; +use polars::prelude::*; +use std::path::Path; + +pub fn write(df: &mut DataFrame, path: Option<&Path>) -> Result<()> { + let path = path.ok_or_else(|| anyhow::anyhow!("--convert arrow requires -o PATH"))?; + let file = std::fs::File::create(path)?; + IpcWriter::new(file).finish(df)?; + Ok(()) +} + +#[cfg(test)] +mod tests { + use super::*; + use tempfile::NamedTempFile; + + #[test] + fn write_arrow_roundtrip() { + let s = Series::new("x".into(), &[1i64, 2, 3]); + let mut df = DataFrame::new(vec![s.into_column()]).unwrap(); + + let f = NamedTempFile::with_suffix(".arrow").unwrap(); + write(&mut df, Some(f.path())).unwrap(); + + let result = crate::readers::arrow::read(f.path(), &crate::reader::ReadOptions::default()).unwrap(); + assert_eq!(result.height(), 3); + } + + #[test] + fn write_arrow_no_path_errors() { + let s = Series::new("x".into(), &[1i64]); + let mut df = DataFrame::new(vec![s.into_column()]).unwrap(); + assert!(write(&mut df, None).is_err()); + } +} +``` + +- [ ] **Step 5: Create `src/writers/json.rs`** + +```rust +use anyhow::Result; +use polars::prelude::*; +use std::io::Write as IoWrite; +use std::path::Path; + +use crate::format::Format; + +pub fn write(df: &mut DataFrame, path: Option<&Path>, format: Format) -> Result<()> { + match format { + Format::Ndjson => write_ndjson(df, path), + _ => write_json(df, path), + } +} + +fn write_json(df: &mut DataFrame, path: Option<&Path>) -> Result<()> { + match path { + Some(p) => { + let file = std::fs::File::create(p)?; + JsonWriter::new(file).finish(df)?; + } + None => { + let mut buf = Vec::new(); + JsonWriter::new(&mut buf).finish(df)?; + std::io::stdout().write_all(&buf)?; + } + } + Ok(()) +} + +fn write_ndjson(df: &mut DataFrame, path: Option<&Path>) -> Result<()> { + match path { + Some(p) => { + let file = std::fs::File::create(p)?; + JsonLineWriter::new(file).finish(df)?; + } + None => { + let mut buf = Vec::new(); + JsonLineWriter::new(&mut buf).finish(df)?; + 
std::io::stdout().write_all(&buf)?; + } + } + Ok(()) +} + +#[cfg(test)] +mod tests { + use super::*; + use tempfile::NamedTempFile; + + #[test] + fn write_json_roundtrip() { + let s = Series::new("x".into(), &[1i64, 2]); + let mut df = DataFrame::new(vec![s.into_column()]).unwrap(); + + let f = NamedTempFile::with_suffix(".json").unwrap(); + write(&mut df, Some(f.path()), Format::Json).unwrap(); + + let result = crate::readers::json::read(f.path(), Format::Json, &crate::reader::ReadOptions::default()).unwrap(); + assert_eq!(result.height(), 2); + } + + #[test] + fn write_ndjson_roundtrip() { + let s = Series::new("x".into(), &[1i64, 2]); + let mut df = DataFrame::new(vec![s.into_column()]).unwrap(); + + let f = NamedTempFile::with_suffix(".ndjson").unwrap(); + write(&mut df, Some(f.path()), Format::Ndjson).unwrap(); + + let result = crate::readers::json::read(f.path(), Format::Ndjson, &crate::reader::ReadOptions::default()).unwrap(); + assert_eq!(result.height(), 2); + } +} +``` + +- [ ] **Step 6: Add `writers` to `src/lib.rs`** + +Replace the contents of `src/lib.rs` with: + +```rust +pub mod diff; +pub mod filter; +pub mod format; +pub mod formatter; +pub mod metadata; +pub mod reader; +pub mod readers; +pub mod writers; +``` + +- [ ] **Step 7: Run unit tests** + +Run: `cargo test --lib` +Expected: all unit tests pass including new writer tests + +- [ ] **Step 8: Commit** + +```bash +git add src/writers/ src/lib.rs +git commit -m "feat: add writers module (csv, tsv, parquet, arrow, json, ndjson)" +``` + +--- + +### Task 3: Add write_file dispatch function + +**Files:** +- Create: `src/writer.rs` +- Modify: `src/lib.rs` + +- [ ] **Step 1: Create `src/writer.rs`** + +```rust +use anyhow::{bail, Result}; +use polars::prelude::*; +use std::path::Path; + +use crate::format::Format; +use crate::writers; + +/// Write a DataFrame to a file or stdout, dispatching to the appropriate writer. +/// +/// For binary formats (Parquet, Arrow), `path` is required. 
+/// For text formats (CSV, TSV, JSON, NDJSON), `path` is optional (None = stdout). +/// Excel writing is not supported. +pub fn write_file(df: &mut DataFrame, path: Option<&Path>, format: Format) -> Result<()> { + match format { + Format::Csv | Format::Tsv => writers::csv::write(df, path, format), + Format::Parquet => writers::parquet::write(df, path), + Format::Arrow => writers::arrow::write(df, path), + Format::Json | Format::Ndjson => writers::json::write(df, path, format), + Format::Excel => bail!("writing Excel format is not supported; use csv or parquet"), + } +} +``` + +- [ ] **Step 2: Add `writer` to `src/lib.rs`** + +```rust +pub mod diff; +pub mod filter; +pub mod format; +pub mod formatter; +pub mod metadata; +pub mod reader; +pub mod readers; +pub mod writer; +pub mod writers; +``` + +- [ ] **Step 3: Run tests** + +Run: `cargo test --lib` +Expected: PASS + +- [ ] **Step 4: Commit** + +```bash +git add src/writer.rs src/lib.rs +git commit -m "feat: add write_file dispatch function" +``` + +--- + +### Task 4: Add `--convert` and `-o` flags to dtcat + +**Files:** +- Modify: `src/bin/dtcat.rs` +- Test: `tests/dtcat.rs` + +- [ ] **Step 1: Write the failing tests** + +Add to `tests/dtcat.rs`: + +```rust +#[test] +fn convert_csv_to_parquet() { + let out = NamedTempFile::with_suffix(".parquet").unwrap(); + dtcat().arg("tests/fixtures/data.csv") + .arg("--convert").arg("parquet") + .arg("-o").arg(out.path()) + .assert().success(); + // Read back and verify + dtcat().arg(out.path()).arg("--csv") + .assert().success() + .stdout(predicate::str::contains("Alice")) + .stdout(predicate::str::contains("Charlie")); +} + +#[test] +fn convert_parquet_to_csv_file() { + let out = NamedTempFile::with_suffix(".csv").unwrap(); + dtcat().arg("tests/fixtures/data.parquet") + .arg("--convert").arg("csv") + .arg("-o").arg(out.path()) + .assert().success(); + dtcat().arg(out.path()) + .assert().success() + .stdout(predicate::str::contains("Alice")); +} + +#[test] +fn 
convert_csv_to_json_stdout() { + dtcat().arg("tests/fixtures/data.csv") + .arg("--convert").arg("json") + .assert().success() + .stdout(predicate::str::contains("Alice")); +} + +#[test] +fn convert_csv_to_ndjson_stdout() { + dtcat().arg("tests/fixtures/data.csv") + .arg("--convert").arg("ndjson") + .assert().success() + .stdout(predicate::str::contains("Alice")); +} + +#[test] +fn convert_parquet_no_output_errors() { + dtcat().arg("tests/fixtures/data.csv") + .arg("--convert").arg("parquet") + .assert().failure(); +} + +#[test] +fn convert_arrow_no_output_errors() { + dtcat().arg("tests/fixtures/data.csv") + .arg("--convert").arg("arrow") + .assert().failure(); +} + +#[test] +fn convert_conflicts_with_schema() { + let f = csv_file("x\n1\n"); + dtcat().arg(f.path()).arg("--convert").arg("csv").arg("--schema") + .assert().code(2); +} + +#[test] +fn convert_with_skip() { + let f = csv_file("meta\nname,value\nAlice,100\n"); + dtcat().arg(f.path()).arg("--skip").arg("1").arg("--convert").arg("csv") + .assert().success() + .stdout(predicate::str::contains("Alice")); +} +``` + +- [ ] **Step 2: Run tests to verify they fail** + +Run: `cargo test --test dtcat convert` +Expected: FAIL — unknown arg `--convert` + +- [ ] **Step 3: Add `--convert` and `-o` args and validation** + +In `src/bin/dtcat.rs`, add to the `Args` struct: + +```rust + /// Convert to format (csv, tsv, parquet, arrow, json, ndjson) + #[arg(long, value_name = "FORMAT")] + convert: Option<String>, + + /// Output file path (required for binary formats with --convert) + #[arg(short = 'o', value_name = "PATH")] + output: Option<String>, +``` + +Add to imports at the top of the file: + +```rust +use dtcore::format::parse_format_str; +use dtcore::writer::write_file; +``` + +Update `validate_args` to add after the sample checks: + +```rust + if args.convert.is_some() { + if args.schema || args.describe || args.info || args.csv + || args.head.is_some() || args.tail.is_some() + || args.all || args.sample.is_some() + 
{ + bail!("--convert is mutually exclusive with display flags"); + } + } +``` + +- [ ] **Step 4: Add convert logic to the run function** + +In `src/bin/dtcat.rs`, insert after the sampling block and before the empty DataFrame check: + +```rust + // --convert: write to a different format and exit + if let Some(ref convert_str) = args.convert { + let target_fmt = parse_format_str(convert_str)?; + let out_path = args.output.as_deref().map(std::path::Path::new); + let mut df = df; + write_file(&mut df, out_path, target_fmt)?; + return Ok(()); + } +``` + +- [ ] **Step 5: Run tests to verify they pass** + +Run: `cargo test --test dtcat convert` +Expected: all 8 convert tests PASS + +- [ ] **Step 6: Run all tests** + +Run: `cargo test` +Expected: all tests PASS + +- [ ] **Step 7: Commit** + +```bash +git add src/bin/dtcat.rs tests/dtcat.rs +git commit -m "feat: add --convert FORMAT and -o PATH to dtcat" +``` + +--- + +### Task 5: Bump version and final verification + +**Files:** +- Modify: `Cargo.toml` + +- [ ] **Step 1: Bump version** + +In `Cargo.toml`, change: + +```toml +version = "0.2.0" +``` + +- [ ] **Step 2: Run full test suite** + +Run: `cargo test` +Expected: all tests PASS + +- [ ] **Step 3: Run clippy** + +Run: `cargo clippy --release` +Expected: no warnings + +- [ ] **Step 4: Verify CLI help** + +Run: `cargo run --release --bin dtcat -- --help` +Expected: output includes `--sample`, `--convert`, `-o` + +- [ ] **Step 5: Commit** + +```bash +git add Cargo.toml +git commit -m "chore: bump version to 0.2.0" +``` diff --git a/src/bin/dtcat.rs b/src/bin/dtcat.rs @@ -4,7 +4,8 @@ use std::process; use anyhow::{bail, Result}; use clap::Parser; -use dtcore::format::{detect_format, Format}; +use dtcore::format::{detect_format, parse_format_str, Format}; +use dtcore::writer::write_file; use dtcore::formatter::{ format_csv, format_data_table, format_describe, format_empty_sheet, format_head_tail, format_header, format_schema, format_sheet_listing, @@ -63,15 +64,54 @@ 
struct Args { #[arg(long)] all: bool, + /// Randomly sample N rows + #[arg(long, value_name = "N")] + sample: Option<usize>, + /// Show file metadata only #[arg(long)] info: bool, + + /// Convert to format (csv, tsv, parquet, arrow, json, ndjson) + #[arg(long, value_name = "FORMAT")] + convert: Option<String>, + + /// Output file path (required for binary formats with --convert) + #[arg(short = 'o', value_name = "PATH")] + output: Option<String>, } fn validate_args(args: &Args) -> Result<()> { if args.schema && args.describe { bail!("--schema and --describe are mutually exclusive"); } + if args.sample.is_some() { + if args.schema { + bail!("--sample and --schema are mutually exclusive"); + } + if args.describe { + bail!("--sample and --describe are mutually exclusive"); + } + if args.info { + bail!("--sample and --info are mutually exclusive"); + } + if args.head.is_some() { + bail!("--sample and --head are mutually exclusive"); + } + if args.tail.is_some() { + bail!("--sample and --tail are mutually exclusive"); + } + if args.all { + bail!("--sample and --all are mutually exclusive"); + } + } + if args.convert.is_some() + && (args.schema || args.describe || args.info || args.csv + || args.head.is_some() || args.tail.is_some() + || args.all || args.sample.is_some()) + { + bail!("--convert is mutually exclusive with display flags"); + } Ok(()) } @@ -202,6 +242,26 @@ fn run(args: Args) -> Result<()> { sheet_info_from_df(&file_name, &df) }; + // Apply sampling if requested (before any display mode) + let df = if let Some(n) = args.sample { + if n >= df.height() { + df + } else { + df.sample_n_literal(n, false, false, None)? 
+ } + } else { + df + }; + + // --convert: write to a different format and exit + if let Some(ref convert_str) = args.convert { + let target_fmt = parse_format_str(convert_str)?; + let out_path = args.output.as_deref().map(std::path::Path::new); + let mut df = df; + write_file(&mut df, out_path, target_fmt)?; + return Ok(()); + } + // Handle empty DataFrame if df.is_empty() { print!("{}", format_empty_sheet(&sheet)); diff --git a/src/lib.rs b/src/lib.rs @@ -5,3 +5,5 @@ pub mod formatter; pub mod metadata; pub mod reader; pub mod readers; +pub mod writer; +pub mod writers; diff --git a/src/writer.rs b/src/writer.rs @@ -0,0 +1,17 @@ +use anyhow::{bail, Result}; +use polars::prelude::*; +use std::path::Path; + +use crate::format::Format; +use crate::writers; + +/// Write a DataFrame to a file or stdout, dispatching to the appropriate writer. +pub fn write_file(df: &mut DataFrame, path: Option<&Path>, format: Format) -> Result<()> { + match format { + Format::Csv | Format::Tsv => writers::csv::write(df, path, format), + Format::Parquet => writers::parquet::write(df, path), + Format::Arrow => writers::arrow::write(df, path), + Format::Json | Format::Ndjson => writers::json::write(df, path, format), + Format::Excel => bail!("writing Excel format is not supported; use csv or parquet"), + } +} diff --git a/src/writers/arrow.rs b/src/writers/arrow.rs @@ -0,0 +1,35 @@ +use anyhow::Result; +use polars::prelude::*; +use std::path::Path; + +pub fn write(df: &mut DataFrame, path: Option<&Path>) -> Result<()> { + let path = path.ok_or_else(|| anyhow::anyhow!("--convert arrow requires -o PATH"))?; + let file = std::fs::File::create(path)?; + IpcWriter::new(file).finish(df)?; + Ok(()) +} + +#[cfg(test)] +mod tests { + use super::*; + use tempfile::NamedTempFile; + + #[test] + fn write_arrow_roundtrip() { + let s = Series::new("x".into(), &[1i64, 2, 3]); + let mut df = DataFrame::new(vec![s.into_column()]).unwrap(); + + let f = NamedTempFile::with_suffix(".arrow").unwrap(); + 
write(&mut df, Some(f.path())).unwrap(); + + let result = crate::readers::arrow::read(f.path(), &crate::reader::ReadOptions::default()).unwrap(); + assert_eq!(result.height(), 3); + } + + #[test] + fn write_arrow_no_path_errors() { + let s = Series::new("x".into(), &[1i64]); + let mut df = DataFrame::new(vec![s.into_column()]).unwrap(); + assert!(write(&mut df, None).is_err()); + } +} diff --git a/src/writers/csv.rs b/src/writers/csv.rs @@ -0,0 +1,62 @@ +use anyhow::Result; +use polars::prelude::*; +use std::io::Write; +use std::path::Path; + +use crate::format::Format; + +pub fn write(df: &mut DataFrame, path: Option<&Path>, format: Format) -> Result<()> { + let separator = match format { + Format::Tsv => b'\t', + _ => b',', + }; + + match path { + Some(p) => { + let file = std::fs::File::create(p)?; + CsvWriter::new(file) + .with_separator(separator) + .finish(df)?; + } + None => { + let mut buf = Vec::new(); + CsvWriter::new(&mut buf) + .with_separator(separator) + .finish(df)?; + std::io::stdout().write_all(&buf)?; + } + } + Ok(()) +} + +#[cfg(test)] +mod tests { + use super::*; + use tempfile::NamedTempFile; + + #[test] + fn write_csv_roundtrip() { + let s1 = Series::new("name".into(), &["Alice", "Bob"]); + let s2 = Series::new("value".into(), &[100i64, 200]); + let mut df = DataFrame::new(vec![s1.into_column(), s2.into_column()]).unwrap(); + + let f = NamedTempFile::with_suffix(".csv").unwrap(); + write(&mut df, Some(f.path()), Format::Csv).unwrap(); + + let result = crate::readers::csv::read(f.path(), &crate::reader::ReadOptions::default()).unwrap(); + assert_eq!(result.height(), 2); + assert_eq!(result.get_column_names(), df.get_column_names()); + } + + #[test] + fn write_tsv_uses_tab() { + let s = Series::new("x".into(), &[1i64]); + let mut df = DataFrame::new(vec![s.into_column()]).unwrap(); + + let f = NamedTempFile::with_suffix(".tsv").unwrap(); + write(&mut df, Some(f.path()), Format::Tsv).unwrap(); + + let content = 
std::fs::read_to_string(f.path()).unwrap(); + assert!(!content.contains(',')); + } +} diff --git a/src/writers/json.rs b/src/writers/json.rs @@ -0,0 +1,60 @@ +use anyhow::Result; +use polars::prelude::*; +use std::io::Write as IoWrite; +use std::path::Path; + +use crate::format::Format; + +pub fn write(df: &mut DataFrame, path: Option<&Path>, format: Format) -> Result<()> { + let json_format = match format { + Format::Ndjson => JsonFormat::JsonLines, + _ => JsonFormat::Json, + }; + + match path { + Some(p) => { + let file = std::fs::File::create(p)?; + JsonWriter::new(file) + .with_json_format(json_format) + .finish(df)?; + } + None => { + let mut buf = Vec::new(); + JsonWriter::new(&mut buf) + .with_json_format(json_format) + .finish(df)?; + std::io::stdout().write_all(&buf)?; + } + } + Ok(()) +} + +#[cfg(test)] +mod tests { + use super::*; + use tempfile::NamedTempFile; + + #[test] + fn write_json_roundtrip() { + let s = Series::new("x".into(), &[1i64, 2]); + let mut df = DataFrame::new(vec![s.into_column()]).unwrap(); + + let f = NamedTempFile::with_suffix(".json").unwrap(); + write(&mut df, Some(f.path()), Format::Json).unwrap(); + + let result = crate::readers::json::read(f.path(), Format::Json, &crate::reader::ReadOptions::default()).unwrap(); + assert_eq!(result.height(), 2); + } + + #[test] + fn write_ndjson_roundtrip() { + let s = Series::new("x".into(), &[1i64, 2]); + let mut df = DataFrame::new(vec![s.into_column()]).unwrap(); + + let f = NamedTempFile::with_suffix(".ndjson").unwrap(); + write(&mut df, Some(f.path()), Format::Ndjson).unwrap(); + + let result = crate::readers::json::read(f.path(), Format::Ndjson, &crate::reader::ReadOptions::default()).unwrap(); + assert_eq!(result.height(), 2); + } +} diff --git a/src/writers/mod.rs b/src/writers/mod.rs @@ -0,0 +1,4 @@ +pub mod arrow; +pub mod csv; +pub mod json; +pub mod parquet; diff --git a/src/writers/parquet.rs b/src/writers/parquet.rs @@ -0,0 +1,36 @@ +use anyhow::Result; +use polars::prelude::*; 
+use std::path::Path; + +pub fn write(df: &mut DataFrame, path: Option<&Path>) -> Result<()> { + let path = path.ok_or_else(|| anyhow::anyhow!("--convert parquet requires -o PATH"))?; + let file = std::fs::File::create(path)?; + ParquetWriter::new(file).finish(df)?; + Ok(()) +} + +#[cfg(test)] +mod tests { + use super::*; + use tempfile::NamedTempFile; + + #[test] + fn write_parquet_roundtrip() { + let s1 = Series::new("name".into(), &["Alice", "Bob"]); + let s2 = Series::new("value".into(), &[100i64, 200]); + let mut df = DataFrame::new(vec![s1.into_column(), s2.into_column()]).unwrap(); + + let f = NamedTempFile::with_suffix(".parquet").unwrap(); + write(&mut df, Some(f.path())).unwrap(); + + let result = crate::readers::parquet::read(f.path(), &crate::reader::ReadOptions::default()).unwrap(); + assert_eq!(result.height(), 2); + } + + #[test] + fn write_parquet_no_path_errors() { + let s = Series::new("x".into(), &[1i64]); + let mut df = DataFrame::new(vec![s.into_column()]).unwrap(); + assert!(write(&mut df, None).is_err()); + } +} diff --git a/tests/dtcat.rs b/tests/dtcat.rs @@ -145,6 +145,66 @@ fn all_flag_shows_every_row() { .stdout(predicate::str::contains("| 30 ")); } +// ─── Sample ─── + +#[test] +fn sample_returns_n_rows() { + let out = dtcat().arg("demo/sales.csv").arg("--sample").arg("5").arg("--csv") + .assert().success(); + let stdout = String::from_utf8(out.get_output().stdout.clone()).unwrap(); + let lines: Vec<&str> = stdout.trim().lines().collect(); + assert_eq!(lines.len(), 6, "expected header + 5 rows, got {}", lines.len()); +} + +#[test] +fn sample_ge_total_returns_all() { + let f = csv_file("x\n1\n2\n3\n"); + dtcat().arg(f.path()).arg("--sample").arg("100").arg("--csv") + .assert().success(); +} + +#[test] +fn sample_conflicts_with_head() { + let f = csv_file("x\n1\n"); + dtcat().arg(f.path()).arg("--sample").arg("1").arg("--head").arg("1") + .assert().code(2); +} + +#[test] +fn sample_conflicts_with_tail() { + let f = csv_file("x\n1\n"); + 
dtcat().arg(f.path()).arg("--sample").arg("1").arg("--tail").arg("1") + .assert().code(2); +} + +#[test] +fn sample_conflicts_with_all() { + let f = csv_file("x\n1\n"); + dtcat().arg(f.path()).arg("--sample").arg("1").arg("--all") + .assert().code(2); +} + +#[test] +fn sample_conflicts_with_schema() { + let f = csv_file("x\n1\n"); + dtcat().arg(f.path()).arg("--sample").arg("1").arg("--schema") + .assert().code(2); +} + +#[test] +fn sample_conflicts_with_describe() { + let f = csv_file("x\n1\n"); + dtcat().arg(f.path()).arg("--sample").arg("1").arg("--describe") + .assert().code(2); +} + +#[test] +fn sample_conflicts_with_info() { + let f = csv_file("x\n1\n"); + dtcat().arg(f.path()).arg("--sample").arg("1").arg("--info") + .assert().code(2); +} + // ─── Parquet ─── #[test] @@ -216,3 +276,75 @@ fn excel_info() { .stdout(predicate::str::contains("Excel")) .stdout(predicate::str::contains("Sheet1")); } + +// ─── Convert ─── + +#[test] +fn convert_csv_to_parquet() { + let out = NamedTempFile::with_suffix(".parquet").unwrap(); + dtcat().arg("tests/fixtures/data.csv") + .arg("--convert").arg("parquet") + .arg("-o").arg(out.path()) + .assert().success(); + dtcat().arg(out.path()).arg("--csv") + .assert().success() + .stdout(predicate::str::contains("Alice")) + .stdout(predicate::str::contains("Charlie")); +} + +#[test] +fn convert_parquet_to_csv_file() { + let out = NamedTempFile::with_suffix(".csv").unwrap(); + dtcat().arg("tests/fixtures/data.parquet") + .arg("--convert").arg("csv") + .arg("-o").arg(out.path()) + .assert().success(); + dtcat().arg(out.path()) + .assert().success() + .stdout(predicate::str::contains("Alice")); +} + +#[test] +fn convert_csv_to_json_stdout() { + dtcat().arg("tests/fixtures/data.csv") + .arg("--convert").arg("json") + .assert().success() + .stdout(predicate::str::contains("Alice")); +} + +#[test] +fn convert_csv_to_ndjson_stdout() { + dtcat().arg("tests/fixtures/data.csv") + .arg("--convert").arg("ndjson") + .assert().success() + 
.stdout(predicate::str::contains("Alice")); +} + +#[test] +fn convert_parquet_no_output_errors() { + dtcat().arg("tests/fixtures/data.csv") + .arg("--convert").arg("parquet") + .assert().failure(); +} + +#[test] +fn convert_arrow_no_output_errors() { + dtcat().arg("tests/fixtures/data.csv") + .arg("--convert").arg("arrow") + .assert().failure(); +} + +#[test] +fn convert_conflicts_with_schema() { + let f = csv_file("x\n1\n"); + dtcat().arg(f.path()).arg("--convert").arg("csv").arg("--schema") + .assert().code(2); +} + +#[test] +fn convert_with_skip() { + let f = csv_file("meta\nname,value\nAlice,100\n"); + dtcat().arg(f.path()).arg("--skip").arg("1").arg("--convert").arg("csv") + .assert().success() + .stdout(predicate::str::contains("Alice")); +}