commit 601fbfe88df565049c4894f1024cb016beab9d67
parent b2789b7aff3b7b6cb1c326fac1049494051df3ce
Author: Erik Loualiche <eloualiche@users.noreply.github.com>
Date: Sat, 4 Apr 2026 13:55:34 -0500
Merge pull request #1 from LouLouLibs/feat/v0.2.0-sample-convert
feat: v0.2.0 — add --sample and --convert to dtcat
Diffstat:
12 files changed, 1116 insertions(+), 2 deletions(-)
diff --git a/Cargo.toml b/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "dt-cli-tools"
-version = "0.1.0"
+version = "0.2.0"
edition = "2024"
description = "CLI tools for viewing, filtering, and comparing tabular data files"
license = "MIT"
@@ -32,6 +32,7 @@ polars = { version = "0.46", default-features = false, features = [
"parquet",
"ipc",
"json",
+ "random",
] }
calamine = "0.26"
clap = { version = "4", features = ["derive"] }
diff --git a/README.md b/README.md
@@ -102,6 +102,15 @@ dtcat data.csv --tail 5
# CSV output for piping
dtcat data.parquet --csv
+# Random sample of rows
+dtcat huge.parquet --sample 20
+dtcat huge.parquet --sample 50 --csv
+
+# Convert between formats
+dtcat data.csv --convert parquet -o data.parquet
+dtcat report.xlsx --sheet Revenue --convert csv -o revenue.csv
+dtcat data.parquet --convert ndjson # text formats go to stdout
+
# Override format detection
dtcat data.txt --format csv
@@ -141,6 +150,10 @@ dtcat data.csv --skip 2
Modes `--schema`, `--describe`, `--info`, and data (default) are mutually exclusive.
+`--sample N` randomly selects N rows; mutually exclusive with `--head`/`--tail`/`--all` and with `--schema`/`--describe`/`--info`.
+
+`--convert FORMAT` writes to a different format. Use `-o PATH` for output file (required for binary formats Parquet/Arrow; optional for text formats which default to stdout). Supported targets: csv, tsv, parquet, arrow, json, ndjson.
+
## dtfilter — Query and Filter
<img src="demo/dtfilter.gif" alt="dtfilter demo" width="80%" />
diff --git a/docs/superpowers/plans/2026-04-04-v0.2.0-sample-convert.md b/docs/superpowers/plans/2026-04-04-v0.2.0-sample-convert.md
@@ -0,0 +1,692 @@
+# v0.2.0: --sample and --convert Implementation Plan
+
+> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking.
+
+**Goal:** Add random row sampling (`--sample N`) and format conversion (`--convert FORMAT -o PATH`) to dtcat.
+
+**Architecture:** Both features extend the existing dtcat binary. `--sample` uses Polars `DataFrame::sample_n_literal` after reading, before display. `--convert` requires new writer functions in a `src/writers/` module mirroring `src/readers/`, then a write path in dtcat that short-circuits display.
+
+**Tech Stack:** Polars (ParquetWriter, IpcWriter, JsonWriter, CsvWriter), clap, anyhow.
+
+---
+
+### Task 1: Add `--sample N` flag and validation
+
+**Files:**
+- Modify: `src/bin/dtcat.rs`
+- Test: `tests/dtcat.rs`
+
+- [ ] **Step 1: Write the failing tests**
+
+Add to `tests/dtcat.rs`:
+
+```rust
+#[test]
+fn sample_returns_n_rows() {
+ // 18-row fixture, sample 5
+ let out = dtcat().arg("demo/sales.csv").arg("--sample").arg("5").arg("--csv")
+ .assert().success();
+ let stdout = String::from_utf8(out.get_output().stdout.clone()).unwrap();
+ // CSV header + 5 data rows = 6 lines (last line may be empty)
+ let lines: Vec<&str> = stdout.trim().lines().collect();
+ assert_eq!(lines.len(), 6, "expected header + 5 rows, got {}", lines.len());
+}
+
+#[test]
+fn sample_ge_total_returns_all() {
+ let f = csv_file("x\n1\n2\n3\n");
+ dtcat().arg(f.path()).arg("--sample").arg("100").arg("--csv")
+ .assert().success();
+}
+
+#[test]
+fn sample_conflicts_with_head() {
+ let f = csv_file("x\n1\n");
+ dtcat().arg(f.path()).arg("--sample").arg("1").arg("--head").arg("1")
+ .assert().code(2);
+}
+
+#[test]
+fn sample_conflicts_with_tail() {
+ let f = csv_file("x\n1\n");
+ dtcat().arg(f.path()).arg("--sample").arg("1").arg("--tail").arg("1")
+ .assert().code(2);
+}
+
+#[test]
+fn sample_conflicts_with_all() {
+ let f = csv_file("x\n1\n");
+ dtcat().arg(f.path()).arg("--sample").arg("1").arg("--all")
+ .assert().code(2);
+}
+```
+
+- [ ] **Step 2: Run tests to verify they fail**
+
+Run: `cargo test --test dtcat sample`
+Expected: FAIL — unknown arg `--sample`
+
+- [ ] **Step 3: Add `--sample` arg and validation to dtcat**
+
+In `src/bin/dtcat.rs`, add to the `Args` struct after the `all` field:
+
+```rust
+ /// Randomly sample N rows
+ #[arg(long, value_name = "N")]
+ sample: Option<usize>,
+```
+
+Update `validate_args`:
+
+```rust
+fn validate_args(args: &Args) -> Result<()> {
+ if args.schema && args.describe {
+ bail!("--schema and --describe are mutually exclusive");
+ }
+ if args.sample.is_some() {
+ if args.head.is_some() {
+ bail!("--sample and --head are mutually exclusive");
+ }
+ if args.tail.is_some() {
+ bail!("--sample and --tail are mutually exclusive");
+ }
+ if args.all {
+ bail!("--sample and --all are mutually exclusive");
+ }
+ }
+ Ok(())
+}
+```
+
+- [ ] **Step 4: Implement sampling logic in the display section**
+
+In `src/bin/dtcat.rs`, replace the display match block (the `let output = match ...` section) with:
+
+```rust
+ // Determine what to display
+ let output = if let Some(n) = args.sample {
+ let sampled = if n >= df.height() {
+ df
+ } else {
+ df.sample_n_literal(n, false, false, None)?
+ };
+ format_data_table(&sampled)
+ } else {
+ match (args.head, args.tail) {
+ (Some(h), Some(t)) => {
+ format_head_tail(&df, h, t)
+ }
+ (Some(h), None) => {
+ let sliced = df.head(Some(h));
+ format_data_table(&sliced)
+ }
+ (None, Some(t)) => {
+ let sliced = df.tail(Some(t));
+ format_data_table(&sliced)
+ }
+ (None, None) => {
+ if args.all || df.height() <= DEFAULT_THRESHOLD {
+ format_data_table(&df)
+ } else {
+ format_head_tail(&df, DEFAULT_HEAD_TAIL, DEFAULT_HEAD_TAIL)
+ }
+ }
+ }
+ };
+```
+
+Also handle `--sample` with `--csv` output. The current `--csv` branch exits early before the display match. Move sampling before the csv check, or handle it inline. The simplest approach: apply sampling before the `--csv` check. After the line `let df = read_file(&path, fmt, &opts)?;`, add:
+
+```rust
+ // Apply sampling if requested (before any display mode)
+ let df = if let Some(n) = args.sample {
+ if n >= df.height() {
+ df
+ } else {
+ df.sample_n_literal(n, false, false, None)?
+ }
+ } else {
+ df
+ };
+```
+
+Then remove the sample handling from the display match block (revert it to the original match block). This way `--sample` + `--csv` works naturally.
+
+- [ ] **Step 5: Run tests to verify they pass**
+
+Run: `cargo test --test dtcat sample`
+Expected: all 5 sample tests PASS
+
+- [ ] **Step 6: Commit**
+
+```bash
+git add src/bin/dtcat.rs tests/dtcat.rs
+git commit -m "feat: add --sample N flag to dtcat"
+```
+
+---
+
+### Task 2: Create writers module
+
+**Files:**
+- Create: `src/writers/mod.rs`
+- Create: `src/writers/csv.rs`
+- Create: `src/writers/parquet.rs`
+- Create: `src/writers/arrow.rs`
+- Create: `src/writers/json.rs`
+- Modify: `src/lib.rs`
+
+- [ ] **Step 1: Create `src/writers/mod.rs`**
+
+```rust
+pub mod arrow;
+pub mod csv;
+pub mod json;
+pub mod parquet;
+```
+
+- [ ] **Step 2: Create `src/writers/csv.rs`**
+
+```rust
+use anyhow::Result;
+use polars::prelude::*;
+use std::io::Write;
+use std::path::Path;
+
+use crate::format::Format;
+
+pub fn write(df: &mut DataFrame, path: Option<&Path>, format: Format) -> Result<()> {
+ let separator = match format {
+ Format::Tsv => b'\t',
+ _ => b',',
+ };
+
+ match path {
+ Some(p) => {
+ let file = std::fs::File::create(p)?;
+ CsvWriter::new(file)
+ .with_separator(separator)
+ .finish(df)?;
+ }
+ None => {
+ let mut buf = Vec::new();
+ CsvWriter::new(&mut buf)
+ .with_separator(separator)
+ .finish(df)?;
+ std::io::stdout().write_all(&buf)?;
+ }
+ }
+ Ok(())
+}
+
+#[cfg(test)]
+mod tests {
+ use super::*;
+ use tempfile::NamedTempFile;
+
+ #[test]
+ fn write_csv_roundtrip() {
+ let s1 = Series::new("name".into(), &["Alice", "Bob"]);
+ let s2 = Series::new("value".into(), &[100i64, 200]);
+ let mut df = DataFrame::new(vec![s1.into_column(), s2.into_column()]).unwrap();
+
+ let f = NamedTempFile::with_suffix(".csv").unwrap();
+ write(&mut df, Some(f.path()), Format::Csv).unwrap();
+
+ let result = crate::readers::csv::read(f.path(), &crate::reader::ReadOptions::default()).unwrap();
+ assert_eq!(result.height(), 2);
+ assert_eq!(result.get_column_names(), df.get_column_names());
+ }
+
+ #[test]
+ fn write_tsv_uses_tab() {
+ let s = Series::new("x".into(), &[1i64]);
+ let mut df = DataFrame::new(vec![s.into_column()]).unwrap();
+
+ let f = NamedTempFile::with_suffix(".tsv").unwrap();
+ write(&mut df, Some(f.path()), Format::Tsv).unwrap();
+
+ let content = std::fs::read_to_string(f.path()).unwrap();
+ assert!(!content.contains(','));
+ }
+}
+```
+
+- [ ] **Step 3: Create `src/writers/parquet.rs`**
+
+```rust
+use anyhow::Result;
+use polars::prelude::*;
+use std::path::Path;
+
+pub fn write(df: &mut DataFrame, path: Option<&Path>) -> Result<()> {
+ let path = path.ok_or_else(|| anyhow::anyhow!("--convert parquet requires -o PATH"))?;
+ let file = std::fs::File::create(path)?;
+ ParquetWriter::new(file).finish(df)?;
+ Ok(())
+}
+
+#[cfg(test)]
+mod tests {
+ use super::*;
+ use tempfile::NamedTempFile;
+
+ #[test]
+ fn write_parquet_roundtrip() {
+ let s1 = Series::new("name".into(), &["Alice", "Bob"]);
+ let s2 = Series::new("value".into(), &[100i64, 200]);
+ let mut df = DataFrame::new(vec![s1.into_column(), s2.into_column()]).unwrap();
+
+ let f = NamedTempFile::with_suffix(".parquet").unwrap();
+ write(&mut df, Some(f.path())).unwrap();
+
+ let result = crate::readers::parquet::read(f.path(), &crate::reader::ReadOptions::default()).unwrap();
+ assert_eq!(result.height(), 2);
+ }
+
+ #[test]
+ fn write_parquet_no_path_errors() {
+ let s = Series::new("x".into(), &[1i64]);
+ let mut df = DataFrame::new(vec![s.into_column()]).unwrap();
+ assert!(write(&mut df, None).is_err());
+ }
+}
+```
+
+- [ ] **Step 4: Create `src/writers/arrow.rs`**
+
+```rust
+use anyhow::Result;
+use polars::prelude::*;
+use std::path::Path;
+
+pub fn write(df: &mut DataFrame, path: Option<&Path>) -> Result<()> {
+ let path = path.ok_or_else(|| anyhow::anyhow!("--convert arrow requires -o PATH"))?;
+ let file = std::fs::File::create(path)?;
+ IpcWriter::new(file).finish(df)?;
+ Ok(())
+}
+
+#[cfg(test)]
+mod tests {
+ use super::*;
+ use tempfile::NamedTempFile;
+
+ #[test]
+ fn write_arrow_roundtrip() {
+ let s = Series::new("x".into(), &[1i64, 2, 3]);
+ let mut df = DataFrame::new(vec![s.into_column()]).unwrap();
+
+ let f = NamedTempFile::with_suffix(".arrow").unwrap();
+ write(&mut df, Some(f.path())).unwrap();
+
+ let result = crate::readers::arrow::read(f.path(), &crate::reader::ReadOptions::default()).unwrap();
+ assert_eq!(result.height(), 3);
+ }
+
+ #[test]
+ fn write_arrow_no_path_errors() {
+ let s = Series::new("x".into(), &[1i64]);
+ let mut df = DataFrame::new(vec![s.into_column()]).unwrap();
+ assert!(write(&mut df, None).is_err());
+ }
+}
+```
+
+- [ ] **Step 5: Create `src/writers/json.rs`**
+
+```rust
+use anyhow::Result;
+use polars::prelude::*;
+use std::io::Write as IoWrite;
+use std::path::Path;
+
+use crate::format::Format;
+
+pub fn write(df: &mut DataFrame, path: Option<&Path>, format: Format) -> Result<()> {
+ match format {
+ Format::Ndjson => write_ndjson(df, path),
+ _ => write_json(df, path),
+ }
+}
+
+fn write_json(df: &mut DataFrame, path: Option<&Path>) -> Result<()> {
+ match path {
+ Some(p) => {
+ let file = std::fs::File::create(p)?;
+ JsonWriter::new(file).finish(df)?;
+ }
+ None => {
+ let mut buf = Vec::new();
+ JsonWriter::new(&mut buf).finish(df)?;
+ std::io::stdout().write_all(&buf)?;
+ }
+ }
+ Ok(())
+}
+
+fn write_ndjson(df: &mut DataFrame, path: Option<&Path>) -> Result<()> {
+    match path {
+        Some(p) => {
+            let file = std::fs::File::create(p)?;
+            JsonWriter::new(file)
+                .with_json_format(JsonFormat::JsonLines)
+                .finish(df)?;
+        }
+        None => {
+            let mut buf = Vec::new();
+            JsonWriter::new(&mut buf)
+                .with_json_format(JsonFormat::JsonLines)
+                .finish(df)?;
+            std::io::stdout().write_all(&buf)?;
+        }
+    }
+    Ok(())
+}
+
+#[cfg(test)]
+mod tests {
+ use super::*;
+ use tempfile::NamedTempFile;
+
+ #[test]
+ fn write_json_roundtrip() {
+ let s = Series::new("x".into(), &[1i64, 2]);
+ let mut df = DataFrame::new(vec![s.into_column()]).unwrap();
+
+ let f = NamedTempFile::with_suffix(".json").unwrap();
+ write(&mut df, Some(f.path()), Format::Json).unwrap();
+
+ let result = crate::readers::json::read(f.path(), Format::Json, &crate::reader::ReadOptions::default()).unwrap();
+ assert_eq!(result.height(), 2);
+ }
+
+ #[test]
+ fn write_ndjson_roundtrip() {
+ let s = Series::new("x".into(), &[1i64, 2]);
+ let mut df = DataFrame::new(vec![s.into_column()]).unwrap();
+
+ let f = NamedTempFile::with_suffix(".ndjson").unwrap();
+ write(&mut df, Some(f.path()), Format::Ndjson).unwrap();
+
+ let result = crate::readers::json::read(f.path(), Format::Ndjson, &crate::reader::ReadOptions::default()).unwrap();
+ assert_eq!(result.height(), 2);
+ }
+}
+```
+
+- [ ] **Step 6: Add `writers` to `src/lib.rs`**
+
+Replace the contents of `src/lib.rs` with:
+
+```rust
+pub mod diff;
+pub mod filter;
+pub mod format;
+pub mod formatter;
+pub mod metadata;
+pub mod reader;
+pub mod readers;
+pub mod writers;
+```
+
+- [ ] **Step 7: Run unit tests**
+
+Run: `cargo test --lib`
+Expected: all unit tests pass including new writer tests
+
+- [ ] **Step 8: Commit**
+
+```bash
+git add src/writers/ src/lib.rs
+git commit -m "feat: add writers module (csv, tsv, parquet, arrow, json, ndjson)"
+```
+
+---
+
+### Task 3: Add write_file dispatch function
+
+**Files:**
+- Create: `src/writer.rs`
+- Modify: `src/lib.rs`
+
+- [ ] **Step 1: Create `src/writer.rs`**
+
+```rust
+use anyhow::{bail, Result};
+use polars::prelude::*;
+use std::path::Path;
+
+use crate::format::Format;
+use crate::writers;
+
+/// Write a DataFrame to a file or stdout, dispatching to the appropriate writer.
+///
+/// For binary formats (Parquet, Arrow), `path` is required.
+/// For text formats (CSV, TSV, JSON, NDJSON), `path` is optional (None = stdout).
+/// Excel writing is not supported.
+pub fn write_file(df: &mut DataFrame, path: Option<&Path>, format: Format) -> Result<()> {
+ match format {
+ Format::Csv | Format::Tsv => writers::csv::write(df, path, format),
+ Format::Parquet => writers::parquet::write(df, path),
+ Format::Arrow => writers::arrow::write(df, path),
+ Format::Json | Format::Ndjson => writers::json::write(df, path, format),
+ Format::Excel => bail!("writing Excel format is not supported; use csv or parquet"),
+ }
+}
+```
+
+- [ ] **Step 2: Add `writer` to `src/lib.rs`**
+
+```rust
+pub mod diff;
+pub mod filter;
+pub mod format;
+pub mod formatter;
+pub mod metadata;
+pub mod reader;
+pub mod readers;
+pub mod writer;
+pub mod writers;
+```
+
+- [ ] **Step 3: Run tests**
+
+Run: `cargo test --lib`
+Expected: PASS
+
+- [ ] **Step 4: Commit**
+
+```bash
+git add src/writer.rs src/lib.rs
+git commit -m "feat: add write_file dispatch function"
+```
+
+---
+
+### Task 4: Add `--convert` and `-o` flags to dtcat
+
+**Files:**
+- Modify: `src/bin/dtcat.rs`
+- Test: `tests/dtcat.rs`
+
+- [ ] **Step 1: Write the failing tests**
+
+Add to `tests/dtcat.rs`:
+
+```rust
+#[test]
+fn convert_csv_to_parquet() {
+ let out = NamedTempFile::with_suffix(".parquet").unwrap();
+ dtcat().arg("tests/fixtures/data.csv")
+ .arg("--convert").arg("parquet")
+ .arg("-o").arg(out.path())
+ .assert().success();
+ // Read back and verify
+ dtcat().arg(out.path()).arg("--csv")
+ .assert().success()
+ .stdout(predicate::str::contains("Alice"))
+ .stdout(predicate::str::contains("Charlie"));
+}
+
+#[test]
+fn convert_parquet_to_csv_file() {
+ let out = NamedTempFile::with_suffix(".csv").unwrap();
+ dtcat().arg("tests/fixtures/data.parquet")
+ .arg("--convert").arg("csv")
+ .arg("-o").arg(out.path())
+ .assert().success();
+ dtcat().arg(out.path())
+ .assert().success()
+ .stdout(predicate::str::contains("Alice"));
+}
+
+#[test]
+fn convert_csv_to_json_stdout() {
+ dtcat().arg("tests/fixtures/data.csv")
+ .arg("--convert").arg("json")
+ .assert().success()
+ .stdout(predicate::str::contains("Alice"));
+}
+
+#[test]
+fn convert_csv_to_ndjson_stdout() {
+ dtcat().arg("tests/fixtures/data.csv")
+ .arg("--convert").arg("ndjson")
+ .assert().success()
+ .stdout(predicate::str::contains("Alice"));
+}
+
+#[test]
+fn convert_parquet_no_output_errors() {
+ dtcat().arg("tests/fixtures/data.csv")
+ .arg("--convert").arg("parquet")
+ .assert().failure();
+}
+
+#[test]
+fn convert_arrow_no_output_errors() {
+ dtcat().arg("tests/fixtures/data.csv")
+ .arg("--convert").arg("arrow")
+ .assert().failure();
+}
+
+#[test]
+fn convert_conflicts_with_schema() {
+ let f = csv_file("x\n1\n");
+ dtcat().arg(f.path()).arg("--convert").arg("csv").arg("--schema")
+ .assert().code(2);
+}
+
+#[test]
+fn convert_with_skip() {
+ let f = csv_file("meta\nname,value\nAlice,100\n");
+ dtcat().arg(f.path()).arg("--skip").arg("1").arg("--convert").arg("csv")
+ .assert().success()
+ .stdout(predicate::str::contains("Alice"));
+}
+```
+
+- [ ] **Step 2: Run tests to verify they fail**
+
+Run: `cargo test --test dtcat convert`
+Expected: FAIL — unknown arg `--convert`
+
+- [ ] **Step 3: Add `--convert` and `-o` args and validation**
+
+In `src/bin/dtcat.rs`, add to the `Args` struct:
+
+```rust
+ /// Convert to format (csv, tsv, parquet, arrow, json, ndjson)
+ #[arg(long, value_name = "FORMAT")]
+ convert: Option<String>,
+
+ /// Output file path (required for binary formats with --convert)
+ #[arg(short = 'o', value_name = "PATH")]
+ output: Option<String>,
+```
+
+Add to imports at the top of the file:
+
+```rust
+use dtcore::format::parse_format_str;
+use dtcore::writer::write_file;
+```
+
+Update `validate_args` to add after the sample checks:
+
+```rust
+ if args.convert.is_some() {
+ if args.schema || args.describe || args.info || args.csv
+ || args.head.is_some() || args.tail.is_some()
+ || args.all || args.sample.is_some()
+ {
+ bail!("--convert is mutually exclusive with display flags");
+ }
+ }
+```
+
+- [ ] **Step 4: Add convert logic to the run function**
+
+In `src/bin/dtcat.rs`, insert after the sampling block and before the empty DataFrame check:
+
+```rust
+ // --convert: write to a different format and exit
+ if let Some(ref convert_str) = args.convert {
+ let target_fmt = parse_format_str(convert_str)?;
+ let out_path = args.output.as_deref().map(std::path::Path::new);
+ let mut df = df;
+ write_file(&mut df, out_path, target_fmt)?;
+ return Ok(());
+ }
+```
+
+- [ ] **Step 5: Run tests to verify they pass**
+
+Run: `cargo test --test dtcat convert`
+Expected: all 8 convert tests PASS
+
+- [ ] **Step 6: Run all tests**
+
+Run: `cargo test`
+Expected: all tests PASS
+
+- [ ] **Step 7: Commit**
+
+```bash
+git add src/bin/dtcat.rs tests/dtcat.rs
+git commit -m "feat: add --convert FORMAT and -o PATH to dtcat"
+```
+
+---
+
+### Task 5: Bump version and final verification
+
+**Files:**
+- Modify: `Cargo.toml`
+
+- [ ] **Step 1: Bump version**
+
+In `Cargo.toml`, change:
+
+```toml
+version = "0.2.0"
+```
+
+- [ ] **Step 2: Run full test suite**
+
+Run: `cargo test`
+Expected: all tests PASS
+
+- [ ] **Step 3: Run clippy**
+
+Run: `cargo clippy --release`
+Expected: no warnings
+
+- [ ] **Step 4: Verify CLI help**
+
+Run: `cargo run --release --bin dtcat -- --help`
+Expected: output includes `--sample`, `--convert`, `-o`
+
+- [ ] **Step 5: Commit**
+
+```bash
+git add Cargo.toml
+git commit -m "chore: bump version to 0.2.0"
+```
diff --git a/src/bin/dtcat.rs b/src/bin/dtcat.rs
@@ -4,7 +4,8 @@ use std::process;
use anyhow::{bail, Result};
use clap::Parser;
-use dtcore::format::{detect_format, Format};
+use dtcore::format::{detect_format, parse_format_str, Format};
+use dtcore::writer::write_file;
use dtcore::formatter::{
format_csv, format_data_table, format_describe, format_empty_sheet, format_head_tail,
format_header, format_schema, format_sheet_listing,
@@ -63,15 +64,54 @@ struct Args {
#[arg(long)]
all: bool,
+ /// Randomly sample N rows
+ #[arg(long, value_name = "N")]
+ sample: Option<usize>,
+
/// Show file metadata only
#[arg(long)]
info: bool,
+
+ /// Convert to format (csv, tsv, parquet, arrow, json, ndjson)
+ #[arg(long, value_name = "FORMAT")]
+ convert: Option<String>,
+
+ /// Output file path (required for binary formats with --convert)
+ #[arg(short = 'o', value_name = "PATH")]
+ output: Option<String>,
}
fn validate_args(args: &Args) -> Result<()> {
if args.schema && args.describe {
bail!("--schema and --describe are mutually exclusive");
}
+ if args.sample.is_some() {
+ if args.schema {
+ bail!("--sample and --schema are mutually exclusive");
+ }
+ if args.describe {
+ bail!("--sample and --describe are mutually exclusive");
+ }
+ if args.info {
+ bail!("--sample and --info are mutually exclusive");
+ }
+ if args.head.is_some() {
+ bail!("--sample and --head are mutually exclusive");
+ }
+ if args.tail.is_some() {
+ bail!("--sample and --tail are mutually exclusive");
+ }
+ if args.all {
+ bail!("--sample and --all are mutually exclusive");
+ }
+ }
+ if args.convert.is_some()
+ && (args.schema || args.describe || args.info || args.csv
+ || args.head.is_some() || args.tail.is_some()
+ || args.all || args.sample.is_some())
+ {
+ bail!("--convert is mutually exclusive with display flags");
+ }
Ok(())
}
@@ -202,6 +242,26 @@ fn run(args: Args) -> Result<()> {
sheet_info_from_df(&file_name, &df)
};
+ // Apply sampling if requested (before any display mode)
+ let df = if let Some(n) = args.sample {
+ if n >= df.height() {
+ df
+ } else {
+ df.sample_n_literal(n, false, false, None)?
+ }
+ } else {
+ df
+ };
+
+ // --convert: write to a different format and exit
+ if let Some(ref convert_str) = args.convert {
+ let target_fmt = parse_format_str(convert_str)?;
+ let out_path = args.output.as_deref().map(std::path::Path::new);
+ let mut df = df;
+ write_file(&mut df, out_path, target_fmt)?;
+ return Ok(());
+ }
+
// Handle empty DataFrame
if df.is_empty() {
print!("{}", format_empty_sheet(&sheet));
diff --git a/src/lib.rs b/src/lib.rs
@@ -5,3 +5,5 @@ pub mod formatter;
pub mod metadata;
pub mod reader;
pub mod readers;
+pub mod writer;
+pub mod writers;
diff --git a/src/writer.rs b/src/writer.rs
@@ -0,0 +1,17 @@
+use anyhow::{bail, Result};
+use polars::prelude::*;
+use std::path::Path;
+
+use crate::format::Format;
+use crate::writers;
+
+/// Write a DataFrame to a file or stdout, dispatching to the appropriate writer.
+pub fn write_file(df: &mut DataFrame, path: Option<&Path>, format: Format) -> Result<()> {
+ match format {
+ Format::Csv | Format::Tsv => writers::csv::write(df, path, format),
+ Format::Parquet => writers::parquet::write(df, path),
+ Format::Arrow => writers::arrow::write(df, path),
+ Format::Json | Format::Ndjson => writers::json::write(df, path, format),
+ Format::Excel => bail!("writing Excel format is not supported; use csv or parquet"),
+ }
+}
diff --git a/src/writers/arrow.rs b/src/writers/arrow.rs
@@ -0,0 +1,35 @@
+use anyhow::Result;
+use polars::prelude::*;
+use std::path::Path;
+
+pub fn write(df: &mut DataFrame, path: Option<&Path>) -> Result<()> {
+ let path = path.ok_or_else(|| anyhow::anyhow!("--convert arrow requires -o PATH"))?;
+ let file = std::fs::File::create(path)?;
+ IpcWriter::new(file).finish(df)?;
+ Ok(())
+}
+
+#[cfg(test)]
+mod tests {
+ use super::*;
+ use tempfile::NamedTempFile;
+
+ #[test]
+ fn write_arrow_roundtrip() {
+ let s = Series::new("x".into(), &[1i64, 2, 3]);
+ let mut df = DataFrame::new(vec![s.into_column()]).unwrap();
+
+ let f = NamedTempFile::with_suffix(".arrow").unwrap();
+ write(&mut df, Some(f.path())).unwrap();
+
+ let result = crate::readers::arrow::read(f.path(), &crate::reader::ReadOptions::default()).unwrap();
+ assert_eq!(result.height(), 3);
+ }
+
+ #[test]
+ fn write_arrow_no_path_errors() {
+ let s = Series::new("x".into(), &[1i64]);
+ let mut df = DataFrame::new(vec![s.into_column()]).unwrap();
+ assert!(write(&mut df, None).is_err());
+ }
+}
diff --git a/src/writers/csv.rs b/src/writers/csv.rs
@@ -0,0 +1,62 @@
+use anyhow::Result;
+use polars::prelude::*;
+use std::io::Write;
+use std::path::Path;
+
+use crate::format::Format;
+
+pub fn write(df: &mut DataFrame, path: Option<&Path>, format: Format) -> Result<()> {
+ let separator = match format {
+ Format::Tsv => b'\t',
+ _ => b',',
+ };
+
+ match path {
+ Some(p) => {
+ let file = std::fs::File::create(p)?;
+ CsvWriter::new(file)
+ .with_separator(separator)
+ .finish(df)?;
+ }
+ None => {
+ let mut buf = Vec::new();
+ CsvWriter::new(&mut buf)
+ .with_separator(separator)
+ .finish(df)?;
+ std::io::stdout().write_all(&buf)?;
+ }
+ }
+ Ok(())
+}
+
+#[cfg(test)]
+mod tests {
+ use super::*;
+ use tempfile::NamedTempFile;
+
+ #[test]
+ fn write_csv_roundtrip() {
+ let s1 = Series::new("name".into(), &["Alice", "Bob"]);
+ let s2 = Series::new("value".into(), &[100i64, 200]);
+ let mut df = DataFrame::new(vec![s1.into_column(), s2.into_column()]).unwrap();
+
+ let f = NamedTempFile::with_suffix(".csv").unwrap();
+ write(&mut df, Some(f.path()), Format::Csv).unwrap();
+
+ let result = crate::readers::csv::read(f.path(), &crate::reader::ReadOptions::default()).unwrap();
+ assert_eq!(result.height(), 2);
+ assert_eq!(result.get_column_names(), df.get_column_names());
+ }
+
+ #[test]
+ fn write_tsv_uses_tab() {
+ let s = Series::new("x".into(), &[1i64]);
+ let mut df = DataFrame::new(vec![s.into_column()]).unwrap();
+
+ let f = NamedTempFile::with_suffix(".tsv").unwrap();
+ write(&mut df, Some(f.path()), Format::Tsv).unwrap();
+
+ let content = std::fs::read_to_string(f.path()).unwrap();
+ assert!(!content.contains(','));
+ }
+}
diff --git a/src/writers/json.rs b/src/writers/json.rs
@@ -0,0 +1,60 @@
+use anyhow::Result;
+use polars::prelude::*;
+use std::io::Write as IoWrite;
+use std::path::Path;
+
+use crate::format::Format;
+
+pub fn write(df: &mut DataFrame, path: Option<&Path>, format: Format) -> Result<()> {
+ let json_format = match format {
+ Format::Ndjson => JsonFormat::JsonLines,
+ _ => JsonFormat::Json,
+ };
+
+ match path {
+ Some(p) => {
+ let file = std::fs::File::create(p)?;
+ JsonWriter::new(file)
+ .with_json_format(json_format)
+ .finish(df)?;
+ }
+ None => {
+ let mut buf = Vec::new();
+ JsonWriter::new(&mut buf)
+ .with_json_format(json_format)
+ .finish(df)?;
+ std::io::stdout().write_all(&buf)?;
+ }
+ }
+ Ok(())
+}
+
+#[cfg(test)]
+mod tests {
+ use super::*;
+ use tempfile::NamedTempFile;
+
+ #[test]
+ fn write_json_roundtrip() {
+ let s = Series::new("x".into(), &[1i64, 2]);
+ let mut df = DataFrame::new(vec![s.into_column()]).unwrap();
+
+ let f = NamedTempFile::with_suffix(".json").unwrap();
+ write(&mut df, Some(f.path()), Format::Json).unwrap();
+
+ let result = crate::readers::json::read(f.path(), Format::Json, &crate::reader::ReadOptions::default()).unwrap();
+ assert_eq!(result.height(), 2);
+ }
+
+ #[test]
+ fn write_ndjson_roundtrip() {
+ let s = Series::new("x".into(), &[1i64, 2]);
+ let mut df = DataFrame::new(vec![s.into_column()]).unwrap();
+
+ let f = NamedTempFile::with_suffix(".ndjson").unwrap();
+ write(&mut df, Some(f.path()), Format::Ndjson).unwrap();
+
+ let result = crate::readers::json::read(f.path(), Format::Ndjson, &crate::reader::ReadOptions::default()).unwrap();
+ assert_eq!(result.height(), 2);
+ }
+}
diff --git a/src/writers/mod.rs b/src/writers/mod.rs
@@ -0,0 +1,4 @@
+pub mod arrow;
+pub mod csv;
+pub mod json;
+pub mod parquet;
diff --git a/src/writers/parquet.rs b/src/writers/parquet.rs
@@ -0,0 +1,36 @@
+use anyhow::Result;
+use polars::prelude::*;
+use std::path::Path;
+
+pub fn write(df: &mut DataFrame, path: Option<&Path>) -> Result<()> {
+ let path = path.ok_or_else(|| anyhow::anyhow!("--convert parquet requires -o PATH"))?;
+ let file = std::fs::File::create(path)?;
+ ParquetWriter::new(file).finish(df)?;
+ Ok(())
+}
+
+#[cfg(test)]
+mod tests {
+ use super::*;
+ use tempfile::NamedTempFile;
+
+ #[test]
+ fn write_parquet_roundtrip() {
+ let s1 = Series::new("name".into(), &["Alice", "Bob"]);
+ let s2 = Series::new("value".into(), &[100i64, 200]);
+ let mut df = DataFrame::new(vec![s1.into_column(), s2.into_column()]).unwrap();
+
+ let f = NamedTempFile::with_suffix(".parquet").unwrap();
+ write(&mut df, Some(f.path())).unwrap();
+
+ let result = crate::readers::parquet::read(f.path(), &crate::reader::ReadOptions::default()).unwrap();
+ assert_eq!(result.height(), 2);
+ }
+
+ #[test]
+ fn write_parquet_no_path_errors() {
+ let s = Series::new("x".into(), &[1i64]);
+ let mut df = DataFrame::new(vec![s.into_column()]).unwrap();
+ assert!(write(&mut df, None).is_err());
+ }
+}
diff --git a/tests/dtcat.rs b/tests/dtcat.rs
@@ -145,6 +145,66 @@ fn all_flag_shows_every_row() {
.stdout(predicate::str::contains("| 30 "));
}
+// ─── Sample ───
+
+#[test]
+fn sample_returns_n_rows() {
+ let out = dtcat().arg("demo/sales.csv").arg("--sample").arg("5").arg("--csv")
+ .assert().success();
+ let stdout = String::from_utf8(out.get_output().stdout.clone()).unwrap();
+ let lines: Vec<&str> = stdout.trim().lines().collect();
+ assert_eq!(lines.len(), 6, "expected header + 5 rows, got {}", lines.len());
+}
+
+#[test]
+fn sample_ge_total_returns_all() {
+ let f = csv_file("x\n1\n2\n3\n");
+ dtcat().arg(f.path()).arg("--sample").arg("100").arg("--csv")
+ .assert().success();
+}
+
+#[test]
+fn sample_conflicts_with_head() {
+ let f = csv_file("x\n1\n");
+ dtcat().arg(f.path()).arg("--sample").arg("1").arg("--head").arg("1")
+ .assert().code(2);
+}
+
+#[test]
+fn sample_conflicts_with_tail() {
+ let f = csv_file("x\n1\n");
+ dtcat().arg(f.path()).arg("--sample").arg("1").arg("--tail").arg("1")
+ .assert().code(2);
+}
+
+#[test]
+fn sample_conflicts_with_all() {
+ let f = csv_file("x\n1\n");
+ dtcat().arg(f.path()).arg("--sample").arg("1").arg("--all")
+ .assert().code(2);
+}
+
+#[test]
+fn sample_conflicts_with_schema() {
+ let f = csv_file("x\n1\n");
+ dtcat().arg(f.path()).arg("--sample").arg("1").arg("--schema")
+ .assert().code(2);
+}
+
+#[test]
+fn sample_conflicts_with_describe() {
+ let f = csv_file("x\n1\n");
+ dtcat().arg(f.path()).arg("--sample").arg("1").arg("--describe")
+ .assert().code(2);
+}
+
+#[test]
+fn sample_conflicts_with_info() {
+ let f = csv_file("x\n1\n");
+ dtcat().arg(f.path()).arg("--sample").arg("1").arg("--info")
+ .assert().code(2);
+}
+
// ─── Parquet ───
#[test]
@@ -216,3 +276,75 @@ fn excel_info() {
.stdout(predicate::str::contains("Excel"))
.stdout(predicate::str::contains("Sheet1"));
}
+
+// ─── Convert ───
+
+#[test]
+fn convert_csv_to_parquet() {
+ let out = NamedTempFile::with_suffix(".parquet").unwrap();
+ dtcat().arg("tests/fixtures/data.csv")
+ .arg("--convert").arg("parquet")
+ .arg("-o").arg(out.path())
+ .assert().success();
+ dtcat().arg(out.path()).arg("--csv")
+ .assert().success()
+ .stdout(predicate::str::contains("Alice"))
+ .stdout(predicate::str::contains("Charlie"));
+}
+
+#[test]
+fn convert_parquet_to_csv_file() {
+ let out = NamedTempFile::with_suffix(".csv").unwrap();
+ dtcat().arg("tests/fixtures/data.parquet")
+ .arg("--convert").arg("csv")
+ .arg("-o").arg(out.path())
+ .assert().success();
+ dtcat().arg(out.path())
+ .assert().success()
+ .stdout(predicate::str::contains("Alice"));
+}
+
+#[test]
+fn convert_csv_to_json_stdout() {
+ dtcat().arg("tests/fixtures/data.csv")
+ .arg("--convert").arg("json")
+ .assert().success()
+ .stdout(predicate::str::contains("Alice"));
+}
+
+#[test]
+fn convert_csv_to_ndjson_stdout() {
+ dtcat().arg("tests/fixtures/data.csv")
+ .arg("--convert").arg("ndjson")
+ .assert().success()
+ .stdout(predicate::str::contains("Alice"));
+}
+
+#[test]
+fn convert_parquet_no_output_errors() {
+ dtcat().arg("tests/fixtures/data.csv")
+ .arg("--convert").arg("parquet")
+ .assert().failure();
+}
+
+#[test]
+fn convert_arrow_no_output_errors() {
+ dtcat().arg("tests/fixtures/data.csv")
+ .arg("--convert").arg("arrow")
+ .assert().failure();
+}
+
+#[test]
+fn convert_conflicts_with_schema() {
+ let f = csv_file("x\n1\n");
+ dtcat().arg(f.path()).arg("--convert").arg("csv").arg("--schema")
+ .assert().code(2);
+}
+
+#[test]
+fn convert_with_skip() {
+ let f = csv_file("meta\nname,value\nAlice,100\n");
+ dtcat().arg(f.path()).arg("--skip").arg("1").arg("--convert").arg("csv")
+ .assert().success()
+ .stdout(predicate::str::contains("Alice"));
+}