dt-cli-tools

CLI tools for viewing, filtering, and comparing tabular data files
Log | Files | Refs | README | LICENSE

commit 8f3a14e14fd5eb2b61c31c855004cf94e55425f8
parent 279df986b6657b229bdcb695e99294081a7a3fe4
Author: Erik Loualiche <eloualic@umn.edu>
Date:   Tue, 31 Mar 2026 10:51:15 -0500

feat: add dtcat, dtfilter, and dtdiff binaries

- dtcat: view/inspect tabular data files with schema, describe, head/tail
- dtfilter: filter/query with expressions, sort, column selection
- dtdiff: compare two files with positional or key-based diff, multiple output formats

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

Diffstat:
M src/bin/dtcat.rs    | 275 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
M src/bin/dtdiff.rs   | 420 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
M src/bin/dtfilter.rs | 173 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
M src/reader.rs       |   2 +-
4 files changed, 866 insertions(+), 4 deletions(-)

diff --git a/src/bin/dtcat.rs b/src/bin/dtcat.rs @@ -1 +1,274 @@ -fn main() {} +use std::path::PathBuf; +use std::process; + +use anyhow::{bail, Result}; +use clap::Parser; + +use dtcore::format::{detect_format, Format}; +use dtcore::formatter::{ + format_csv, format_data_table, format_describe, format_empty_sheet, format_head_tail, + format_header, format_schema, format_sheet_listing, +}; +use dtcore::metadata::SheetInfo; +use dtcore::reader::{read_file, read_file_info, ReadOptions}; + +/// Default row threshold: show all rows if <= this many, otherwise head+tail +const DEFAULT_THRESHOLD: usize = 50; +/// Default head/tail row count when splitting +const DEFAULT_HEAD_TAIL: usize = 25; + +#[derive(Parser)] +#[command( + name = "dtcat", + about = "View tabular data files in the terminal", + version +)] +struct Args { + /// File to view + file: String, + + /// Override format detection (csv, tsv, parquet, arrow, json, ndjson, excel) + #[arg(long, value_name = "FMT")] + format: Option<String>, + + /// Select sheet by name or 0-based index (Excel only) + #[arg(long, value_name = "NAME|INDEX")] + sheet: Option<String>, + + /// Skip first N rows + #[arg(long, value_name = "N")] + skip: Option<usize>, + + /// Show column names and types only + #[arg(long)] + schema: bool, + + /// Show summary statistics + #[arg(long)] + describe: bool, + + /// Show first N rows + #[arg(long, value_name = "N")] + head: Option<usize>, + + /// Show last N rows + #[arg(long, value_name = "N")] + tail: Option<usize>, + + /// Output as CSV instead of markdown table + #[arg(long)] + csv: bool, + + /// Show file metadata only + #[arg(long)] + info: bool, +} + +fn validate_args(args: &Args) -> Result<()> { + if args.schema && args.describe { + bail!("--schema and --describe are mutually exclusive"); + } + Ok(()) +} + +/// Build a synthetic SheetInfo for non-Excel formats from a loaded DataFrame. 
+fn sheet_info_from_df(file_name: &str, df: &polars::prelude::DataFrame) -> SheetInfo { + SheetInfo { + name: file_name.to_string(), + // rows includes the header row conceptually; formatter subtracts 1 + rows: df.height() + 1, + cols: df.width(), + } +} + +fn run(args: Args) -> Result<()> { + validate_args(&args)?; + + let path = PathBuf::from(&args.file); + if !path.exists() { + bail!("file not found: {}", path.display()); + } + + let fmt = detect_format(&path, args.format.as_deref())?; + + let file_name = path + .file_name() + .map(|s| s.to_string_lossy().to_string()) + .unwrap_or_else(|| args.file.clone()); + + // --info: show metadata and exit + if args.info { + let info = read_file_info(&path, fmt)?; + print!("{}", format_header(&file_name, &info)); + + // For Excel, also list sheet names and dimensions + if fmt == Format::Excel && !info.sheets.is_empty() { + println!(); + for sheet in &info.sheets { + let data_rows = if sheet.rows == 0 { 0 } else { sheet.rows - 1 }; + println!(" {} ({} rows x {} cols)", sheet.name, data_rows, sheet.cols); + } + } + return Ok(()); + } + + // Excel with multiple sheets and no --sheet: show sheet listing + if fmt == Format::Excel && args.sheet.is_none() { + let info = read_file_info(&path, fmt)?; + if info.sheets.len() > 1 { + // Load a small sample of each sheet to display schemas + let mut schemas: Vec<(SheetInfo, polars::prelude::DataFrame)> = Vec::new(); + for sheet in &info.sheets { + let opts = ReadOptions { + sheet: Some(sheet.name.clone()), + skip_rows: args.skip, + separator: None, + }; + match read_file(&path, fmt, &opts) { + Ok(df) => schemas.push((sheet.clone(), df)), + Err(_) => { + // Empty or unreadable sheet + schemas.push(( + SheetInfo { + name: sheet.name.clone(), + rows: 0, + cols: 0, + }, + polars::prelude::DataFrame::default(), + )); + } + } + } + let schema_refs: Vec<(&SheetInfo, polars::prelude::DataFrame)> = schemas + .iter() + .map(|(s, df)| (s, df.clone())) + .collect(); + print!( + "{}", + 
format_sheet_listing(&file_name, &info, &schema_refs) + ); + return Ok(()); + } + } + + // Build read options + let opts = ReadOptions { + sheet: args.sheet.clone(), + skip_rows: args.skip, + separator: None, + }; + + let df = read_file(&path, fmt, &opts)?; + + // Determine sheet info for display + let sheet = if fmt == Format::Excel { + // Try to get the sheet name we actually read + let info = read_file_info(&path, fmt)?; + if let Some(sheet_arg) = &args.sheet { + // Find the matching sheet in info + let matched = info.sheets.iter().find(|s| { + &s.name == sheet_arg + || sheet_arg + .parse::<usize>() + .map(|idx| { + info.sheets + .iter() + .position(|x| x.name == s.name) + .map(|i| i == idx) + .unwrap_or(false) + }) + .unwrap_or(false) + }); + if let Some(s) = matched { + s.clone() + } else { + // Fallback: build from df + SheetInfo { + name: sheet_arg.clone(), + rows: df.height() + 1, + cols: df.width(), + } + } + } else if let Some(first) = info.sheets.first() { + first.clone() + } else { + sheet_info_from_df(&file_name, &df) + } + } else { + sheet_info_from_df(&file_name, &df) + }; + + // Handle empty DataFrame + if df.is_empty() { + print!("{}", format_empty_sheet(&sheet)); + return Ok(()); + } + + // --schema + if args.schema { + print!("{}", format_schema(&sheet, &df)); + return Ok(()); + } + + // --describe + if args.describe { + print!("{}", format_describe(&df)); + return Ok(()); + } + + // --csv output mode + if args.csv { + print!("{}", format_csv(&df)); + return Ok(()); + } + + // Determine what to display + let output = match (args.head, args.tail) { + (Some(h), Some(t)) => { + // Both specified: show head + tail with omission line + format_head_tail(&df, h, t) + } + (Some(h), None) => { + // Only --head: slice the DataFrame and show all + let sliced = df.head(Some(h)); + format_data_table(&sliced) + } + (None, Some(t)) => { + // Only --tail: slice and show all + let sliced = df.tail(Some(t)); + format_data_table(&sliced) + } + (None, None) => { + 
// Default: show all if <= threshold, otherwise head+tail + if df.height() <= DEFAULT_THRESHOLD { + format_data_table(&df) + } else { + format_head_tail(&df, DEFAULT_HEAD_TAIL, DEFAULT_HEAD_TAIL) + } + } + }; + + print!("{}", output); + Ok(()) +} + +fn main() { + let args = Args::parse(); + match run(args) { + Ok(()) => {} + Err(err) => { + // Check if this is an arg validation error (exit 2) vs runtime error (exit 1) + let msg = err.to_string(); + if msg.contains("mutually exclusive") + || msg.contains("invalid") + || msg.contains("unknown format") + { + eprintln!("dtcat: {err}"); + process::exit(2); + } else { + eprintln!("dtcat: {err}"); + process::exit(1); + } + } + } +} diff --git a/src/bin/dtdiff.rs b/src/bin/dtdiff.rs @@ -1 +1,419 @@ -fn main() {} +use std::io::IsTerminal; +use std::path::PathBuf; +use std::process; + +use anyhow::{Result, bail}; +use clap::Parser; +use serde_json::{Map, Value, json}; + +use dtcore::diff::{DiffOptions, DiffResult, SheetSource}; +use dtcore::format::{detect_format, Format}; +use dtcore::reader::{ReadOptions, read_file}; + +#[derive(Parser)] +#[command( + name = "dtdiff", + about = "Compare two tabular data files and show differences", + version +)] +struct Args { + /// First file to compare + file_a: String, + + /// Second file to compare + file_b: String, + + /// Override format detection (applies to both files) + #[arg(long, value_name = "FMT")] + format: Option<String>, + + /// Select sheet by name or index (Excel only) + #[arg(long, value_name = "NAME|INDEX")] + sheet: Option<String>, + + /// Key column(s) for matched comparison (comma-separated) + #[arg(long, value_name = "COL")] + key: Option<String>, + + /// Float comparison tolerance (default: 1e-10) + #[arg(long)] + tolerance: Option<f64>, + + /// Output as JSON + #[arg(long)] + json: bool, + + /// Output as CSV + #[arg(long)] + csv: bool, + + /// Disable colored output + #[arg(long)] + no_color: bool, +} + +// 
--------------------------------------------------------------------------- +// Output formatters (ported from xldiff.rs) +// --------------------------------------------------------------------------- + +/// Format a row's values inline: `Name: "Alice" Score: "90"` +fn format_row_inline(headers: &[String], values: &[String]) -> String { + headers + .iter() + .zip(values.iter()) + .map(|(h, v)| format!("{}: \"{}\"", h, v)) + .collect::<Vec<_>>() + .join(" ") +} + +/// Format diff result as colored (or plain) text output. +fn format_text(result: &DiffResult, color: bool) -> String { + if !result.has_differences() { + return "No differences found.\n".to_string(); + } + + let (red, green, yellow, reset) = if color { + ("\x1b[31m", "\x1b[32m", "\x1b[33m", "\x1b[0m") + } else { + ("", "", "", "") + }; + + let mut out = String::new(); + + // Header + out.push_str(&format!( + "--- {} ({})\n+++ {} ({})\n\n", + result.source_a.sheet_name, + result.source_a.file_name, + result.source_b.sheet_name, + result.source_b.file_name, + )); + + // Summary + out.push_str(&format!( + "Added: {} | Removed: {} | Modified: {}\n\n", + result.added.len(), + result.removed.len(), + result.modified.len(), + )); + + // Removed rows + for row in &result.removed { + out.push_str(&format!( + "{}- {}{}", + red, + format_row_inline(&result.headers, &row.values), + reset, + )); + out.push('\n'); + } + + // Added rows + for row in &result.added { + out.push_str(&format!( + "{}+ {}{}", + green, + format_row_inline(&result.headers, &row.values), + reset, + )); + out.push('\n'); + } + + // Modified rows + for m in &result.modified { + let key_display: Vec<String> = result + .key_columns + .iter() + .zip(m.key.iter()) + .map(|(col, val)| format!("{}: \"{}\"", col, val)) + .collect(); + out.push_str(&format!( + "{}~ {}{}", + yellow, + key_display.join(" "), + reset, + )); + out.push('\n'); + for change in &m.changes { + out.push_str(&format!( + " {}: \"{}\" \u{2192} \"{}\"\n", + change.column, 
change.old_value, change.new_value, + )); + } + } + + out +} + +/// Format diff result as JSON. +fn format_json(result: &DiffResult) -> String { + let added: Vec<Value> = result + .added + .iter() + .map(|row| { + let mut map = Map::new(); + for (h, v) in result.headers.iter().zip(row.values.iter()) { + map.insert(h.clone(), Value::String(v.clone())); + } + Value::Object(map) + }) + .collect(); + + let removed: Vec<Value> = result + .removed + .iter() + .map(|row| { + let mut map = Map::new(); + for (h, v) in result.headers.iter().zip(row.values.iter()) { + map.insert(h.clone(), Value::String(v.clone())); + } + Value::Object(map) + }) + .collect(); + + let modified: Vec<Value> = result + .modified + .iter() + .map(|m| { + let mut key_map = Map::new(); + for (col, val) in result.key_columns.iter().zip(m.key.iter()) { + key_map.insert(col.clone(), Value::String(val.clone())); + } + let changes: Vec<Value> = m + .changes + .iter() + .map(|c| { + json!({ + "column": c.column, + "old": c.old_value, + "new": c.new_value, + }) + }) + .collect(); + json!({ + "key": Value::Object(key_map), + "changes": changes, + }) + }) + .collect(); + + let output = json!({ + "added": added, + "removed": removed, + "modified": modified, + }); + + serde_json::to_string_pretty(&output).unwrap() + "\n" +} + +/// Quote a value per RFC 4180: if it contains comma, quote, or newline, wrap +/// in double quotes and escape any internal quotes by doubling them. +fn csv_quote(value: &str) -> String { + if value.contains(',') || value.contains('"') || value.contains('\n') { + format!("\"{}\"", value.replace('"', "\"\"")) + } else { + value.to_string() + } +} + +/// Build a CSV row from a slice of values. +fn csv_row(values: &[String]) -> String { + values.iter().map(|v| csv_quote(v)).collect::<Vec<_>>().join(",") +} + +/// Format diff result as CSV. +/// +/// Header: _status, col1, col2, ..., _old_col1, _old_col2, ... 
+/// Added rows: "added" + values + empty _old_ columns +/// Removed rows: "removed" + values + empty _old_ columns +/// Modified rows: "modified" + new values + old values in _old_ columns +fn format_csv_output(result: &DiffResult) -> String { + let mut out = String::new(); + + // Build header + let mut header_parts: Vec<String> = vec!["_status".to_string()]; + for h in &result.headers { + header_parts.push(h.clone()); + } + for h in &result.headers { + header_parts.push(format!("_old_{}", h)); + } + out.push_str(&csv_row(&header_parts)); + out.push('\n'); + + let empty_cols: Vec<String> = result.headers.iter().map(|_| String::new()).collect(); + + // Removed rows + for row in &result.removed { + let mut parts: Vec<String> = vec!["removed".to_string()]; + parts.extend(row.values.iter().cloned()); + while parts.len() < 1 + result.headers.len() { + parts.push(String::new()); + } + parts.extend(empty_cols.iter().cloned()); + out.push_str(&csv_row(&parts)); + out.push('\n'); + } + + // Added rows + for row in &result.added { + let mut parts: Vec<String> = vec!["added".to_string()]; + parts.extend(row.values.iter().cloned()); + while parts.len() < 1 + result.headers.len() { + parts.push(String::new()); + } + parts.extend(empty_cols.iter().cloned()); + out.push_str(&csv_row(&parts)); + out.push('\n'); + } + + // Modified rows + for m in &result.modified { + let mut main_cols: Vec<String> = Vec::new(); + let mut old_cols: Vec<String> = Vec::new(); + + for h in &result.headers { + if let Some(key_idx) = result.key_columns.iter().position(|k| k == h) { + main_cols.push(m.key.get(key_idx).cloned().unwrap_or_default()); + old_cols.push(String::new()); + } else if let Some(change) = m.changes.iter().find(|c| c.column == *h) { + main_cols.push(change.new_value.clone()); + old_cols.push(change.old_value.clone()); + } else { + // Unchanged non-key column — leave empty in both + main_cols.push(String::new()); + old_cols.push(String::new()); + } + } + + let mut parts: Vec<String> 
= vec!["modified".to_string()]; + parts.extend(main_cols); + parts.extend(old_cols); + out.push_str(&csv_row(&parts)); + out.push('\n'); + } + + out +} + +// --------------------------------------------------------------------------- +// run / main +// --------------------------------------------------------------------------- + +fn run(args: Args) -> Result<()> { + let path_a = PathBuf::from(&args.file_a); + let path_b = PathBuf::from(&args.file_b); + + // Validate files exist + if !path_a.exists() { + bail!("file not found: {}", path_a.display()); + } + if !path_b.exists() { + bail!("file not found: {}", path_b.display()); + } + + // Detect formats + let fmt_a = detect_format(&path_a, args.format.as_deref())?; + let fmt_b = detect_format(&path_b, args.format.as_deref())?; + + // Enforce same-format constraint + if !fmt_a.same_family(fmt_b) { + bail!( + "files have incompatible formats: {:?} vs {:?}. Both files must use the same format family.", + fmt_a, + fmt_b + ); + } + + // Build read options + let opts_a = ReadOptions { + sheet: args.sheet.clone(), + skip_rows: None, + separator: None, + }; + let opts_b = ReadOptions { + sheet: args.sheet.clone(), + skip_rows: None, + separator: None, + }; + + // Read DataFrames + let df_a = read_file(&path_a, fmt_a, &opts_a)?; + let df_b = read_file(&path_b, fmt_b, &opts_b)?; + + // Resolve key columns + let key_columns: Vec<String> = if let Some(ref key_str) = args.key { + key_str.split(',').map(|s| s.trim().to_string()).collect() + } else { + vec![] + }; + + // Build source labels + let file_name_a = path_a + .file_name() + .map(|s| s.to_string_lossy().to_string()) + .unwrap_or_else(|| args.file_a.clone()); + let file_name_b = path_b + .file_name() + .map(|s| s.to_string_lossy().to_string()) + .unwrap_or_else(|| args.file_b.clone()); + + // Use file name as "sheet name" for non-Excel formats; for Excel use the + // sheet name from opts (or a placeholder if none was specified). 
+ let sheet_name_a = if fmt_a == Format::Excel { + args.sheet.clone().unwrap_or_else(|| file_name_a.clone()) + } else { + file_name_a.clone() + }; + let sheet_name_b = if fmt_b == Format::Excel { + args.sheet.clone().unwrap_or_else(|| file_name_b.clone()) + } else { + file_name_b.clone() + }; + + let source_a = SheetSource { + file_name: file_name_a, + sheet_name: sheet_name_a, + }; + let source_b = SheetSource { + file_name: file_name_b, + sheet_name: sheet_name_b, + }; + + let diff_opts = DiffOptions { + key_columns, + tolerance: args.tolerance, + }; + + // Run diff + let result = dtcore::diff::diff_sheets(&df_a, &df_b, &diff_opts, source_a, source_b)?; + + // TTY detection for color + let use_color = !args.no_color && std::io::stdout().is_terminal(); + + // Format output: --json and --csv are mutually exclusive flags; default is text + let output = if args.json { + format_json(&result) + } else if args.csv { + format_csv_output(&result) + } else { + format_text(&result, use_color) + }; + + print!("{}", output); + + // Exit 1 if differences found (diff convention), 0 if identical + if result.has_differences() { + process::exit(1); + } + + Ok(()) +} + +fn main() { + let args = Args::parse(); + if let Err(err) = run(args) { + eprintln!("dtdiff: {err}"); + process::exit(2); + } +} diff --git a/src/bin/dtfilter.rs b/src/bin/dtfilter.rs @@ -1 +1,172 @@ -fn main() {} +use std::io::Write; +use std::path::PathBuf; +use std::process; + +use anyhow::{Result, bail}; +use clap::Parser; + +use dtcore::filter::{FilterOptions, parse_filter_expr, parse_sort_spec, filter_pipeline}; +use dtcore::format::detect_format; +use dtcore::formatter::{format_data_table, format_csv}; +use dtcore::reader::{ReadOptions, read_file}; + +// --------------------------------------------------------------------------- +// Argument parsing +// --------------------------------------------------------------------------- + +#[derive(Parser)] +#[command( + name = "dtfilter", + about = "Filter, sort, and 
select columns from tabular data files", + version +)] +struct Args { + /// Input file + file: String, + + /// Override format detection + #[arg(long, value_name = "FMT")] + format: Option<String>, + + /// Select sheet by name or index (Excel only) + #[arg(long, value_name = "NAME|INDEX")] + sheet: Option<String>, + + /// Skip first N rows after the header + #[arg(long, value_name = "N")] + skip: Option<usize>, + + /// Filter expression(s), e.g. "State=CA", "Amount>1000" (repeatable, ANDed) + #[arg(long = "filter", value_name = "EXPR", action = clap::ArgAction::Append)] + filters: Vec<String>, + + /// Sort spec, e.g. "Amount:desc" or "Name" + #[arg(long, value_name = "SPEC")] + sort: Option<String>, + + /// Select columns by name (comma-separated) + #[arg(long, value_name = "COLS")] + columns: Option<String>, + + /// First N rows (before filter) + #[arg(long, value_name = "N")] + head: Option<usize>, + + /// Last N rows (before filter) + #[arg(long, value_name = "N")] + tail: Option<usize>, + + /// Max output rows (after filter) + #[arg(long, value_name = "N")] + limit: Option<usize>, + + /// Output as CSV + #[arg(long)] + csv: bool, +} + +// --------------------------------------------------------------------------- +// Validation helpers +// --------------------------------------------------------------------------- + +/// Validate args and return an error message for invalid combinations. +/// Returns exit-code 2 on any argument error. 
+fn validate_args(args: &Args) -> Result<(), ArgError> { + if args.head.is_some() && args.tail.is_some() { + return Err(ArgError("--head and --tail are mutually exclusive".to_string())); + } + Ok(()) +} + +struct ArgError(String); + +// --------------------------------------------------------------------------- +// Core logic +// --------------------------------------------------------------------------- + +fn run(args: Args) -> Result<()> { + let path = PathBuf::from(&args.file); + + if !path.exists() { + bail!("file not found: {}", path.display()); + } + + // Detect format + let fmt = detect_format(&path, args.format.as_deref())?; + + // Build read options + let read_opts = ReadOptions { + sheet: args.sheet.clone(), + skip_rows: args.skip, + separator: None, + }; + + // Read the DataFrame + let df = read_file(&path, fmt, &read_opts)?; + + // Parse filter expressions + let filters = args + .filters + .iter() + .map(|s| parse_filter_expr(s).map_err(|e| anyhow::anyhow!("{}", e))) + .collect::<Result<Vec<_>>>()?; + + // Parse sort spec + let sort = args + .sort + .as_deref() + .map(|s| parse_sort_spec(s).map_err(|e| anyhow::anyhow!("{}", e))) + .transpose()?; + + // Parse column selection + let cols: Option<Vec<String>> = args.columns.as_deref().map(|s| { + s.split(',') + .map(|c| c.trim().to_string()) + .filter(|c| !c.is_empty()) + .collect() + }); + + // Build filter options + let filter_opts = FilterOptions { + filters, + cols, + sort, + limit: args.limit, + head: args.head, + tail: args.tail, + }; + + // Run the pipeline + let result = filter_pipeline(df, &filter_opts)?; + + // Report row count to stderr + let row_count = result.height(); + eprintln!("{} row{}", row_count, if row_count == 1 { "" } else { "s" }); + + // Output + let output = if args.csv { + format_csv(&result) + } else { + format_data_table(&result) + }; + + let stdout = std::io::stdout(); + let mut out = stdout.lock(); + out.write_all(output.as_bytes())?; + + Ok(()) +} + +fn main() { + let args = 
Args::parse(); + + if let Err(e) = validate_args(&args) { + eprintln!("dtfilter: {}", e.0); + process::exit(2); + } + + if let Err(err) = run(args) { + eprintln!("dtfilter: {err}"); + process::exit(1); + } +} diff --git a/src/reader.rs b/src/reader.rs @@ -3,7 +3,7 @@ use polars::prelude::*; use std::path::Path; use crate::format::Format; -use crate::metadata::{FileInfo, SheetInfo}; +use crate::metadata::FileInfo; use crate::readers; /// Options that control how a file is read.