dt-cli-tools

CLI tools for viewing, filtering, and comparing tabular data files
Log | Files | Refs | README | LICENSE

commit 37a609215ba119cf6ecc566db0fca113512cfc95
parent f8c2df851876904698f838083527a6bb301c078c
Author: Erik Loualiche <eloualic@umn.edu>
Date:   Mon, 30 Mar 2026 23:23:53 -0500

feat: port formatter module from xl-cli-tools with format-name support

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

Diffstat:
M src/formatter.rs | 528 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 528 insertions(+), 0 deletions(-)

diff --git a/src/formatter.rs b/src/formatter.rs @@ -0,0 +1,528 @@ +use crate::metadata::{format_file_size, format_name, FileInfo, SheetInfo}; +use polars::prelude::*; +use std::fmt::Write as FmtWrite; + +// --------------------------------------------------------------------------- +// Public API +// --------------------------------------------------------------------------- + +/// Render the top-level file header. +/// +/// ```text +/// # File: report.xlsx (245 KB) [Excel] +/// # Sheets: 3 +/// ``` +pub fn format_header(file_name: &str, info: &FileInfo) -> String { + let size_str = format_file_size(info.file_size); + let fmt_name = format_name(info.format); + let sheet_count = info.sheets.len(); + if sheet_count > 1 { + format!("# File: {file_name} ({size_str}) [{fmt_name}]\n# Sheets: {sheet_count}\n") + } else { + format!("# File: {file_name} ({size_str}) [{fmt_name}]\n") + } +} + +/// Render the schema block for a single sheet. +/// +/// ```text +/// ## Sheet: Revenue (1240 rows x 8 cols) +/// +/// | Column | Type | +/// |--------|------| +/// | date | Date | +/// ... +/// ``` +pub fn format_schema(sheet: &SheetInfo, df: &DataFrame) -> String { + let data_rows = if sheet.rows == 0 { 0 } else { sheet.rows - 1 }; + let cols = sheet.cols; + + let mut out = format!( + "## Sheet: {} ({} rows x {} cols)\n\n", + sheet.name, data_rows, cols + ); + let headers = vec!["Column".to_string(), "Type".to_string()]; + let rows: Vec<Vec<String>> = df + .get_columns() + .iter() + .map(|col| vec![col.name().to_string(), format_dtype(col.dtype()).to_string()]) + .collect(); + out.push_str(&render_table(&headers, &rows)); + out +} + +/// Render the multi-sheet listing (header + schema for each + prompt). 
+pub fn format_sheet_listing( + file_name: &str, + info: &FileInfo, + schemas: &[(&SheetInfo, DataFrame)], +) -> String { + let mut out = format_header(file_name, info); + out.push('\n'); + + for (sheet, df) in schemas { + if sheet.rows == 0 && sheet.cols == 0 { + out.push_str(&format_empty_sheet(sheet)); + } else { + out.push_str(&format_schema(sheet, df)); + } + out.push('\n'); + } + + out.push_str("Use --sheet <name> to view a specific sheet.\n"); + out +} + +/// Render the full DataFrame as a markdown table with aligned columns. +pub fn format_data_table(df: &DataFrame) -> String { + let (headers, rows) = df_to_strings(df); + render_table(&headers, &rows) +} + +/// Render head / tail view of a DataFrame with aligned columns. +/// +/// If total rows <= head_n + tail_n, shows all rows. +/// Otherwise shows first head_n rows, an omission line, then last tail_n rows. +/// Column widths are computed from both head and tail so pipes stay aligned. +pub fn format_head_tail(df: &DataFrame, head_n: usize, tail_n: usize) -> String { + let total = df.height(); + if total <= head_n + tail_n { + return format_data_table(df); + } + + let head_df = df.head(Some(head_n)); + let tail_df = df.tail(Some(tail_n)); + let omitted = total - head_n - tail_n; + + let (headers, head_rows) = df_to_strings(&head_df); + let (_, tail_rows) = df_to_strings(&tail_df); + + // Compute widths from both head and tail rows + let mut all_rows = head_rows.clone(); + all_rows.extend(tail_rows.clone()); + let widths = compute_col_widths(&headers, &all_rows); + + let mut out = render_table_header(&headers, &widths); + out.push_str(&render_table_rows(&head_rows, &widths)); + out.push_str(&format!("... ({omitted} rows omitted) ...\n")); + out.push_str(&render_table_rows(&tail_rows, &widths)); + out +} + +/// Render DataFrame as CSV. 
+pub fn format_csv(df: &DataFrame) -> String { + let mut buf: Vec<u8> = Vec::new(); + // CsvWriter is available via the "csv" feature (polars 0.46) + if CsvWriter::new(&mut buf) + .finish(&mut df.clone()) + .is_ok() + { + return String::from_utf8(buf).unwrap_or_else(|_| csv_fallback(df)); + } + csv_fallback(df) +} + +/// Render a message for an empty or header-only sheet. +pub fn format_empty_sheet(sheet: &SheetInfo) -> String { + if sheet.rows == 0 && sheet.cols == 0 { + format!("## Sheet: {} (empty)\n", sheet.name) + } else { + format!("## Sheet: {} (no data rows)\n", sheet.name) + } +} + +/// Render summary statistics for each column as a markdown table. +/// +/// Stats are rows, columns are DataFrame columns: +/// | stat | col1 | col2 | ... | +/// |------|------|------|-----| +/// | count | ... | ... | ... | +/// ... +pub fn format_describe(df: &DataFrame) -> String { + let columns = df.get_columns(); + let stats = ["count", "null_count", "mean", "std", "min", "max", "median", "unique"]; + + let mut headers = vec!["stat".to_string()]; + headers.extend(columns.iter().map(|c| c.name().to_string())); + + let rows: Vec<Vec<String>> = stats + .iter() + .map(|stat| { + let mut row = vec![stat.to_string()]; + row.extend(columns.iter().map(|col| compute_stat(col, stat))); + row + }) + .collect(); + + render_table(&headers, &rows) +} + +fn compute_stat(col: &Column, stat: &str) -> String { + let series = col.as_materialized_series(); + match stat { + "count" => series.len().to_string(), + "null_count" => series.null_count().to_string(), + "mean" => { + if is_numeric(series.dtype()) { + series.mean().map(|v| format!("{v:.4}")).unwrap_or_else(|| "-".into()) + } else { + "-".into() + } + } + "std" => { + if is_numeric(series.dtype()) { + series.std(1).map(|v| format!("{v:.4}")).unwrap_or_else(|| "-".into()) + } else { + "-".into() + } + } + "min" => { + if is_numeric(series.dtype()) { + match series.min_reduce() { + Ok(v) => v.value().to_string(), + Err(_) => "-".into(), + 
} + } else { + "-".into() + } + } + "max" => { + if is_numeric(series.dtype()) { + match series.max_reduce() { + Ok(v) => v.value().to_string(), + Err(_) => "-".into(), + } + } else { + "-".into() + } + } + "median" => { + if is_numeric(series.dtype()) { + series.median().map(|v| format!("{v:.4}")).unwrap_or_else(|| "-".into()) + } else { + "-".into() + } + } + "unique" => match series.n_unique() { + Ok(n) => n.to_string(), + Err(_) => "-".into(), + }, + _ => "-".into(), + } +} + +fn is_numeric(dtype: &DataType) -> bool { + matches!( + dtype, + DataType::Int8 + | DataType::Int16 + | DataType::Int32 + | DataType::Int64 + | DataType::UInt8 + | DataType::UInt16 + | DataType::UInt32 + | DataType::UInt64 + | DataType::Float32 + | DataType::Float64 + ) +} + +// --------------------------------------------------------------------------- +// Private helpers +// --------------------------------------------------------------------------- + +/// Extract headers and row data as strings from a DataFrame. +fn df_to_strings(df: &DataFrame) -> (Vec<String>, Vec<Vec<String>>) { + let columns = df.get_columns(); + let headers: Vec<String> = columns.iter().map(|c| c.name().to_string()).collect(); + let rows: Vec<Vec<String>> = (0..df.height()) + .map(|i| columns.iter().map(|c| format_cell(c, i)).collect()) + .collect(); + (headers, rows) +} + +/// Compute the display width for each column. +pub fn compute_col_widths(headers: &[String], rows: &[Vec<String>]) -> Vec<usize> { + let mut widths: Vec<usize> = headers.iter().map(|h| h.len().max(3)).collect(); + for row in rows { + for (i, cell) in row.iter().enumerate() { + if i < widths.len() { + widths[i] = widths[i].max(cell.len()); + } + } + } + widths +} + +/// Render a markdown table header + separator line. 
+pub fn render_table_header(headers: &[String], widths: &[usize]) -> String { + let mut out = String::new(); + out.push('|'); + for (i, h) in headers.iter().enumerate() { + let _ = write!(out, " {:<w$} |", h, w = widths[i]); + } + out.push('\n'); + out.push('|'); + for w in widths { + out.push('-'); + for _ in 0..*w { + out.push('-'); + } + out.push_str("-|"); + } + out.push('\n'); + out +} + +/// Render markdown table data rows (no header). +pub fn render_table_rows(rows: &[Vec<String>], widths: &[usize]) -> String { + let mut out = String::new(); + for row in rows { + out.push('|'); + for (i, cell) in row.iter().enumerate() { + let w = if i < widths.len() { widths[i] } else { cell.len() }; + let _ = write!(out, " {:<w$} |", cell, w = w); + } + out.push('\n'); + } + out +} + +/// Render a complete aligned markdown table. +pub fn render_table(headers: &[String], rows: &[Vec<String>]) -> String { + let widths = compute_col_widths(headers, rows); + let mut out = render_table_header(headers, &widths); + out.push_str(&render_table_rows(rows, &widths)); + out +} + +/// Format a single cell value for markdown display. +fn format_cell(col: &Column, idx: usize) -> String { + match col.get(idx) { + Ok(AnyValue::Null) | Err(_) => String::new(), + Ok(v) => format_any_value(&v), + } +} + +/// Convert an AnyValue to its display string. 
+pub fn format_any_value(v: &AnyValue) -> String { + match v { + AnyValue::Null => String::new(), + AnyValue::Boolean(b) => b.to_string(), + AnyValue::Int8(n) => n.to_string(), + AnyValue::Int16(n) => n.to_string(), + AnyValue::Int32(n) => n.to_string(), + AnyValue::Int64(n) => n.to_string(), + AnyValue::UInt8(n) => n.to_string(), + AnyValue::UInt16(n) => n.to_string(), + AnyValue::UInt32(n) => n.to_string(), + AnyValue::UInt64(n) => n.to_string(), + AnyValue::Float32(f) => f.to_string(), + AnyValue::Float64(f) => f.to_string(), + AnyValue::String(s) => s.to_string(), + AnyValue::StringOwned(s) => s.to_string(), + other => format!("{other}"), + } +} + +/// Map a polars DataType to a human-readable label. +fn format_dtype(dtype: &DataType) -> &'static str { + match dtype { + DataType::Boolean => "Boolean", + DataType::Int8 | DataType::Int16 | DataType::Int32 | DataType::Int64 => "Int", + DataType::UInt8 | DataType::UInt16 | DataType::UInt32 | DataType::UInt64 => "UInt", + DataType::Float32 | DataType::Float64 => "Float", + DataType::String => "String", + DataType::Date => "Date", + DataType::Datetime(_, _) => "Datetime", + DataType::Duration(_) => "Duration", + DataType::Time => "Time", + DataType::Null => "Null", + _ => "Other", + } +} + +/// Manual CSV fallback if CsvWriter is unavailable. 
+fn csv_fallback(df: &DataFrame) -> String { + let columns = df.get_columns(); + let n_rows = df.height(); + + let mut out = String::new(); + + // Header + let header: Vec<String> = columns.iter().map(|c| c.name().to_string()).collect(); + out.push_str(&header.join(",")); + out.push('\n'); + + // Rows + for row_idx in 0..n_rows { + let row: Vec<String> = columns + .iter() + .map(|col| { + let cell = format_cell(col, row_idx); + // Quote cells containing commas or quotes + if cell.contains(',') || cell.contains('"') || cell.contains('\n') { + format!("\"{}\"", cell.replace('"', "\"\"")) + } else { + cell + } + }) + .collect(); + out.push_str(&row.join(",")); + out.push('\n'); + } + + out +} + +// --------------------------------------------------------------------------- +// Tests +// --------------------------------------------------------------------------- + +#[cfg(test)] +mod tests { + use super::*; + use crate::metadata::{FileInfo, SheetInfo}; + + #[test] + fn test_format_header() { + use crate::format::Format; + let info = FileInfo { + file_size: 250_000, + format: Format::Excel, + sheets: vec![ + SheetInfo { name: "Sheet1".into(), rows: 100, cols: 5 }, + SheetInfo { name: "Sheet2".into(), rows: 50, cols: 3 }, + ], + }; + let out = format_header("test.xlsx", &info); + assert!(out.contains("# File: test.xlsx (244 KB) [Excel]")); + assert!(out.contains("# Sheets: 2")); + } + + #[test] + fn test_format_header_single_sheet() { + use crate::format::Format; + let info = FileInfo { + file_size: 1_000, + format: Format::Csv, + sheets: vec![SheetInfo { name: "data".into(), rows: 10, cols: 3 }], + }; + let out = format_header("data.csv", &info); + assert!(out.contains("[CSV]")); + assert!(!out.contains("Sheets")); + } + + #[test] + fn test_format_data_table() { + let s1 = Series::new("name".into(), &["Alice", "Bob"]); + let s2 = Series::new("value".into(), &[100i64, 200]); + let df = DataFrame::new(vec![s1.into_column(), s2.into_column()]).unwrap(); + let out = 
format_data_table(&df); + assert!(out.contains("| name | value |")); + assert!(out.contains("| Alice | 100 |")); + // Verify pipes are aligned: all lines have same length + let lines: Vec<&str> = out.trim().lines().collect(); + assert!(lines.len() >= 3); + let expected_len = lines[0].len(); + for line in &lines { + assert_eq!(line.len(), expected_len, "Misaligned: {line}"); + } + } + + #[test] + fn test_format_head_tail_small() { + let s = Series::new("x".into(), &[1i64, 2, 3]); + let df = DataFrame::new(vec![s.into_column()]).unwrap(); + let out = format_head_tail(&df, 25, 25); + assert!(!out.contains("omitted")); + assert!(out.contains("| 1 ")); + assert!(out.contains("| 3 ")); + } + + #[test] + fn test_format_head_tail_large() { + // 60 rows, head=25 tail=25 → 10 omitted + let values: Vec<i64> = (1..=60).collect(); + let s = Series::new("n".into(), values.as_slice()); + let df = DataFrame::new(vec![s.into_column()]).unwrap(); + let out = format_head_tail(&df, 25, 25); + assert!(out.contains("(10 rows omitted)")); + assert!(out.contains("| 1 ")); + assert!(out.contains("| 25 ")); + assert!(out.contains("| 36 ")); + assert!(out.contains("| 60 ")); + } + + #[test] + fn test_format_schema() { + let sheet = SheetInfo { + name: "Revenue".into(), + rows: 11, // 1 header + 10 data + cols: 2, + }; + let s1 = Series::new("date".into(), &["2024-01-01", "2024-01-02"]); + let s2 = Series::new("amount".into(), &[1.0f64, 2.0]); + let df = DataFrame::new(vec![s1.into_column(), s2.into_column()]).unwrap(); + let out = format_schema(&sheet, &df); + assert!(out.contains("## Sheet: Revenue (10 rows x 2 cols)")); + assert!(out.contains("| date")); + assert!(out.contains("| amount")); + assert!(out.contains("String")); + assert!(out.contains("Float")); + } + + #[test] + fn test_format_empty_sheet_completely_empty() { + let sheet = SheetInfo { name: "Blank".into(), rows: 0, cols: 0 }; + let out = format_empty_sheet(&sheet); + assert!(out.contains("(empty)")); + } + + #[test] + fn 
test_format_empty_sheet_header_only() { + let sheet = SheetInfo { name: "Headers".into(), rows: 1, cols: 3 }; + let out = format_empty_sheet(&sheet); + assert!(out.contains("(no data rows)")); + } + + #[test] + fn test_format_csv() { + let s1 = Series::new("a".into(), &["hello", "world"]); + let s2 = Series::new("b".into(), &[1i64, 2]); + let df = DataFrame::new(vec![s1.into_column(), s2.into_column()]).unwrap(); + let out = format_csv(&df); + assert!(out.contains("a,b")); + assert!(out.contains("hello")); + assert!(out.contains("world")); + } + + #[test] + fn test_format_describe() { + let s_name = Series::new("name".into(), &["Alice", "Bob", "Carol"]); + let s_val = Series::new("value".into(), &[10i64, 20, 30]); + let df = DataFrame::new(vec![s_name.into_column(), s_val.into_column()]).unwrap(); + let out = format_describe(&df); + // Header row contains stat and column names + assert!(out.contains("| stat")); + assert!(out.contains("name")); + assert!(out.contains("value")); + // All stat rows are present + assert!(out.contains("| count")); + assert!(out.contains("| null_count")); + assert!(out.contains("| mean")); + assert!(out.contains("| std")); + assert!(out.contains("| min")); + assert!(out.contains("| max")); + assert!(out.contains("| median")); + assert!(out.contains("| unique")); + // Non-numeric column shows "-" for mean + assert!(out.contains("| -")); + // Verify alignment: all table lines should have same length + let table_lines: Vec<&str> = out.trim().lines().filter(|l| l.starts_with('|')).collect(); + let expected_len = table_lines[0].len(); + for line in &table_lines { + assert_eq!(line.len(), expected_len, "Misaligned: {line}"); + } + } +}