feat: port formatter module from xl-cli-tools with format-name support - dt-cli-tools - CLI tools for viewing, filtering, and comparing tabular data files

commit 37a609215ba119cf6ecc566db0fca113512cfc95
parent f8c2df851876904698f838083527a6bb301c078c
Author: Erik Loualiche <eloualic@umn.edu>
Date:   Mon, 30 Mar 2026 23:23:53 -0500

feat: port formatter module from xl-cli-tools with format-name support

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

Diffstat:
M src/formatter.rs  | 528 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

1 file changed, 528 insertions(+), 0 deletions(-)
diff --git a/src/formatter.rs b/src/formatter.rs
@@ -0,0 +1,528 @@
+use crate::metadata::{format_file_size, format_name, FileInfo, SheetInfo};
+use polars::prelude::*;
+use std::fmt::Write as FmtWrite;
+
+// ---------------------------------------------------------------------------
+// Public API
+// ---------------------------------------------------------------------------
+
+/// Build the banner printed at the top of a file report.
+///
+/// The first line always carries the file name, the formatted file size and
+/// the format label. A `# Sheets: N` line follows only when the file holds
+/// more than one sheet.
+///
+/// ```text
+/// # File: report.xlsx (245 KB) [Excel]
+/// # Sheets: 3
+/// ```
+pub fn format_header(file_name: &str, info: &FileInfo) -> String {
+    let mut banner = format!(
+        "# File: {file_name} ({}) [{}]\n",
+        format_file_size(info.file_size),
+        format_name(info.format),
+    );
+
+    // Single-sheet files (e.g. CSV) get no sheet count line.
+    let n_sheets = info.sheets.len();
+    if n_sheets > 1 {
+        banner.push_str(&format!("# Sheets: {n_sheets}\n"));
+    }
+    banner
+}
+
+/// Describe one sheet: a title line with its dimensions, followed by a
+/// two-column markdown table listing each column's name and type label.
+///
+/// The row count in the title excludes the header row, so a sheet whose
+/// `rows` is 11 is reported as 10 rows.
+///
+/// ```text
+/// ## Sheet: Revenue (1240 rows x 8 cols)
+///
+/// | Column | Type |
+/// |--------|------|
+/// | date   | Date |
+/// ...
+/// ```
+pub fn format_schema(sheet: &SheetInfo, df: &DataFrame) -> String {
+    // Drop the header row from the count without underflowing on an empty sheet.
+    let data_rows = match sheet.rows {
+        0 => 0,
+        n => n - 1,
+    };
+
+    // One table row per DataFrame column: [name, type label].
+    let columns = df.get_columns();
+    let mut schema_rows: Vec<Vec<String>> = Vec::with_capacity(columns.len());
+    for column in columns {
+        schema_rows.push(vec![
+            column.name().to_string(),
+            format_dtype(column.dtype()).to_string(),
+        ]);
+    }
+
+    let table = render_table(&["Column".to_string(), "Type".to_string()], &schema_rows);
+    format!(
+        "## Sheet: {} ({} rows x {} cols)\n\n{}",
+        sheet.name, data_rows, sheet.cols, table
+    )
+}
+
+/// Produce the overview shown for a multi-sheet file.
+///
+/// Output order: the file header, a blank line, one block per entry in
+/// `schemas` (each followed by a blank line), and finally a hint telling the
+/// user how to select a single sheet. Sheets with zero rows and zero columns
+/// get the short "empty" block instead of a schema table.
+pub fn format_sheet_listing(
+    file_name: &str,
+    info: &FileInfo,
+    schemas: &[(&SheetInfo, DataFrame)],
+) -> String {
+    let mut listing = format_header(file_name, info);
+    listing.push('\n');
+
+    for (sheet, df) in schemas {
+        let has_content = sheet.rows != 0 || sheet.cols != 0;
+        let block = if has_content {
+            format_schema(sheet, df)
+        } else {
+            format_empty_sheet(sheet)
+        };
+        listing.push_str(&block);
+        listing.push('\n');
+    }
+
+    listing.push_str("Use --sheet <name> to view a specific sheet.\n");
+    listing
+}
+
+/// Turn every row of `df` into an aligned markdown table.
+///
+/// Column names become the table header; each cell is stringified first so
+/// that the column widths can be computed before anything is rendered.
+pub fn format_data_table(df: &DataFrame) -> String {
+    let (column_names, cells) = df_to_strings(df);
+    render_table(&column_names, &cells)
+}
+
+/// Render head / tail view of a DataFrame with aligned columns.
+///
+/// If total rows <= head_n + tail_n, shows all rows.
+/// Otherwise shows first head_n rows, an omission line, then last tail_n rows.
+/// Column widths are computed from both head and tail so pipes stay aligned.
+///
+/// `head_n + tail_n` is computed with saturation, so very large limits
+/// (for example `usize::MAX` meaning "no limit") show the whole frame instead
+/// of overflowing.
+pub fn format_head_tail(df: &DataFrame, head_n: usize, tail_n: usize) -> String {
+    let total = df.height();
+    let shown = head_n.saturating_add(tail_n);
+    if total <= shown {
+        return format_data_table(df);
+    }
+    // `shown < total` here, so the sum did not saturate and this cannot underflow.
+    let omitted = total - shown;
+
+    let (headers, head_rows) = df_to_strings(&df.head(Some(head_n)));
+    let (_, tail_rows) = df_to_strings(&df.tail(Some(tail_n)));
+
+    // Width of each column is the max over head and tail; merging the two
+    // width vectors avoids cloning every row into a combined buffer.
+    let mut widths = compute_col_widths(&headers, &head_rows);
+    for (width, tail_width) in widths
+        .iter_mut()
+        .zip(compute_col_widths(&headers, &tail_rows))
+    {
+        *width = (*width).max(tail_width);
+    }
+
+    let mut out = render_table_header(&headers, &widths);
+    out.push_str(&render_table_rows(&head_rows, &widths));
+    out.push_str(&format!("... ({omitted} rows omitted) ...\n"));
+    out.push_str(&render_table_rows(&tail_rows, &widths));
+    out
+}
+
+/// Serialize `df` to a CSV string.
+///
+/// The polars `CsvWriter` is tried first, writing into an in-memory buffer.
+/// If the writer reports an error, or the bytes it produced are not valid
+/// UTF-8, the hand-written `csv_fallback` is used instead.
+pub fn format_csv(df: &DataFrame) -> String {
+    let mut bytes = Vec::<u8>::new();
+    // `finish` needs a mutable frame, so it works on a clone of the input.
+    let written = CsvWriter::new(&mut bytes).finish(&mut df.clone());
+    match written {
+        Ok(_) => String::from_utf8(bytes).unwrap_or_else(|_| csv_fallback(df)),
+        Err(_) => csv_fallback(df),
+    }
+}
+
+/// One-line placeholder for a sheet that has nothing to tabulate.
+///
+/// A sheet with zero rows and zero columns is labelled `(empty)`; any other
+/// sheet passed here is labelled `(no data rows)`.
+pub fn format_empty_sheet(sheet: &SheetInfo) -> String {
+    let is_blank = sheet.rows == 0 && sheet.cols == 0;
+    let note = if is_blank { "empty" } else { "no data rows" };
+    format!("## Sheet: {} ({note})\n", sheet.name)
+}
+
+/// Summarize every column of `df` in a markdown table.
+///
+/// The table is transposed relative to the data: the first column names the
+/// statistic and each remaining column corresponds to one DataFrame column.
+///
+/// | stat | col1 | col2 | ... |
+/// |------|------|------|-----|
+/// | count | ... | ... | ... |
+/// ...
+pub fn format_describe(df: &DataFrame) -> String {
+    // Row order of the resulting table.
+    const STATS: [&str; 8] = [
+        "count",
+        "null_count",
+        "mean",
+        "std",
+        "min",
+        "max",
+        "median",
+        "unique",
+    ];
+
+    let columns = df.get_columns();
+
+    let mut headers = Vec::with_capacity(columns.len() + 1);
+    headers.push("stat".to_string());
+    for column in columns {
+        headers.push(column.name().to_string());
+    }
+
+    let mut rows: Vec<Vec<String>> = Vec::with_capacity(STATS.len());
+    for stat in STATS {
+        let mut row = Vec::with_capacity(columns.len() + 1);
+        row.push(stat.to_string());
+        for column in columns {
+            row.push(compute_stat(column, stat));
+        }
+        rows.push(row);
+    }
+
+    render_table(&headers, &rows)
+}
+
+/// Compute one named statistic for a column and return it as display text.
+///
+/// `count`, `null_count` and `unique` are reported for every column.
+/// `mean`, `std` (with ddof = 1), `min`, `max` and `median` are reported only
+/// for columns that `is_numeric` accepts; `mean`, `std` and `median` are
+/// printed with four decimals. Anything unavailable, including an unknown
+/// `stat` name, is rendered as `-`.
+fn compute_stat(col: &Column, stat: &str) -> String {
+    const NA: &str = "-";
+
+    let series = col.as_materialized_series();
+    let numeric = is_numeric(series.dtype());
+    // Shared formatting for the float-valued aggregates.
+    let fmt_float = |value: Option<f64>| match value {
+        Some(v) => format!("{v:.4}"),
+        None => NA.to_string(),
+    };
+
+    match stat {
+        "count" => series.len().to_string(),
+        "null_count" => series.null_count().to_string(),
+        "mean" if numeric => fmt_float(series.mean()),
+        "std" if numeric => fmt_float(series.std(1)),
+        "median" if numeric => fmt_float(series.median()),
+        "min" if numeric => series
+            .min_reduce()
+            .map_or_else(|_| NA.to_string(), |scalar| scalar.value().to_string()),
+        "max" if numeric => series
+            .max_reduce()
+            .map_or_else(|_| NA.to_string(), |scalar| scalar.value().to_string()),
+        "unique" => series
+            .n_unique()
+            .map_or_else(|_| NA.to_string(), |n| n.to_string()),
+        // Non-numeric columns for the numeric stats, and unknown stat names.
+        _ => NA.to_string(),
+    }
+}
+
+/// Whether `dtype` is one of the fixed-width integer or float types
+/// (8- to 64-bit signed/unsigned integers, `Float32`, `Float64`).
+fn is_numeric(dtype: &DataType) -> bool {
+    match dtype {
+        // Signed integers
+        DataType::Int8 | DataType::Int16 | DataType::Int32 | DataType::Int64 => true,
+        // Unsigned integers
+        DataType::UInt8 | DataType::UInt16 | DataType::UInt32 | DataType::UInt64 => true,
+        // Floating point
+        DataType::Float32 | DataType::Float64 => true,
+        _ => false,
+    }
+}
+
+// ---------------------------------------------------------------------------
+// Helpers (string conversion, table layout, CSV fallback)
+// ---------------------------------------------------------------------------
+
+/// Stringify a DataFrame: returns the column names and, row by row, the
+/// display text of every cell (as produced by `format_cell`).
+fn df_to_strings(df: &DataFrame) -> (Vec<String>, Vec<Vec<String>>) {
+    let columns = df.get_columns();
+
+    let mut headers = Vec::with_capacity(columns.len());
+    for column in columns {
+        headers.push(column.name().to_string());
+    }
+
+    let n_rows = df.height();
+    let mut rows: Vec<Vec<String>> = Vec::with_capacity(n_rows);
+    for row_idx in 0..n_rows {
+        let mut cells = Vec::with_capacity(columns.len());
+        for column in columns {
+            cells.push(format_cell(column, row_idx));
+        }
+        rows.push(cells);
+    }
+
+    (headers, rows)
+}
+
+/// Compute the display width for each column.
+///
+/// Returns one width per entry in `headers`: the longest text found in that
+/// column (header included), never less than 3 so the markdown separator row
+/// keeps at least three dashes. Cells beyond `headers.len()` in a row are
+/// ignored.
+///
+/// Widths are measured in characters, not bytes, because the `{:<w$}`
+/// padding used by the renderers counts characters. Measuring bytes would
+/// make columns containing non-ASCII text wider than necessary. Characters
+/// that occupy two terminal cells (e.g. CJK) are still counted as one.
+pub fn compute_col_widths(headers: &[String], rows: &[Vec<String>]) -> Vec<usize> {
+    let mut widths: Vec<usize> = headers
+        .iter()
+        .map(|h| h.chars().count().max(3))
+        .collect();
+    for row in rows {
+        // `zip` stops at the shorter side, so extra cells are skipped.
+        for (width, cell) in widths.iter_mut().zip(row) {
+            *width = (*width).max(cell.chars().count());
+        }
+    }
+    widths
+}
+
+/// Render a markdown table header + separator line.
+///
+/// Each header is left-aligned and padded to the matching entry of `widths`;
+/// the separator row uses the same width plus two dashes for the padding
+/// spaces. Both lines end with a newline.
+///
+/// If `widths` has fewer entries than `headers`, the missing widths fall back
+/// to the header's own length (the same fallback `render_table_rows` applies
+/// to cells) instead of panicking. The separator is generated per header
+/// column, so the two lines always have the same number of columns.
+pub fn render_table_header(headers: &[String], widths: &[usize]) -> String {
+    let mut title_line = String::from("|");
+    let mut rule_line = String::from("|");
+
+    for (i, header) in headers.iter().enumerate() {
+        let w = widths.get(i).copied().unwrap_or(header.len());
+        let _ = write!(title_line, " {header:<w$} |");
+        rule_line.push_str(&"-".repeat(w + 2));
+        rule_line.push('|');
+    }
+
+    title_line.push('\n');
+    title_line.push_str(&rule_line);
+    title_line.push('\n');
+    title_line
+}
+
+/// Render the body of a markdown table: one `| a | b |` line per row, with
+/// no header or separator.
+///
+/// Cells are left-aligned and padded to the width at the same index in
+/// `widths`; a cell with no matching width is emitted at its own length.
+pub fn render_table_rows(rows: &[Vec<String>], widths: &[usize]) -> String {
+    let mut body = String::new();
+    for cells in rows {
+        let mut line = String::from("|");
+        for (col, cell) in cells.iter().enumerate() {
+            let width = widths.get(col).copied().unwrap_or_else(|| cell.len());
+            let _ = write!(line, " {cell:<width$} |");
+        }
+        line.push('\n');
+        body.push_str(&line);
+    }
+    body
+}
+
+/// Assemble a full markdown table (header, separator, data rows) whose
+/// column widths are derived from `headers` and `rows` together.
+pub fn render_table(headers: &[String], rows: &[Vec<String>]) -> String {
+    let col_widths = compute_col_widths(headers, rows);
+    [
+        render_table_header(headers, &col_widths),
+        render_table_rows(rows, &col_widths),
+    ]
+    .concat()
+}
+
+/// Display text for the value at row `idx` of `col`.
+///
+/// Nulls and failed lookups (e.g. an out-of-range index) both become the
+/// empty string.
+fn format_cell(col: &Column, idx: usize) -> String {
+    match col.get(idx) {
+        Ok(value) if !matches!(value, AnyValue::Null) => format_any_value(&value),
+        _ => String::new(),
+    }
+}
+
+/// Stringify a single polars value for display.
+///
+/// Null becomes the empty string. Booleans, integers, floats and strings are
+/// handled explicitly and printed via their own `to_string`. Every other
+/// variant falls back to the value's `Display` output.
+pub fn format_any_value(v: &AnyValue) -> String {
+    match v {
+        AnyValue::Null => String::new(),
+        AnyValue::Boolean(flag) => flag.to_string(),
+        // Signed integers
+        AnyValue::Int8(x) => x.to_string(),
+        AnyValue::Int16(x) => x.to_string(),
+        AnyValue::Int32(x) => x.to_string(),
+        AnyValue::Int64(x) => x.to_string(),
+        // Unsigned integers
+        AnyValue::UInt8(x) => x.to_string(),
+        AnyValue::UInt16(x) => x.to_string(),
+        AnyValue::UInt32(x) => x.to_string(),
+        AnyValue::UInt64(x) => x.to_string(),
+        // Floats
+        AnyValue::Float32(x) => x.to_string(),
+        AnyValue::Float64(x) => x.to_string(),
+        // Text, borrowed or owned
+        AnyValue::String(text) => (*text).to_owned(),
+        AnyValue::StringOwned(text) => text.to_string(),
+        // Dates, times and anything else: defer to polars' own formatting.
+        rest => rest.to_string(),
+    }
+}
+
+/// Short label for a column type, as shown in the schema table.
+///
+/// Bit widths are collapsed (`Int`, `UInt`, `Float`), and any type without a
+/// dedicated label is reported as `Other`.
+fn format_dtype(dtype: &DataType) -> &'static str {
+    match dtype {
+        DataType::Null => "Null",
+        DataType::Boolean => "Boolean",
+        DataType::String => "String",
+        // Numeric families
+        DataType::Int8 | DataType::Int16 | DataType::Int32 | DataType::Int64 => "Int",
+        DataType::UInt8 | DataType::UInt16 | DataType::UInt32 | DataType::UInt64 => "UInt",
+        DataType::Float32 | DataType::Float64 => "Float",
+        // Temporal types
+        DataType::Date => "Date",
+        DataType::Time => "Time",
+        DataType::Datetime(_, _) => "Datetime",
+        DataType::Duration(_) => "Duration",
+        _ => "Other",
+    }
+}
+
+/// Hand-rolled CSV serializer, used by `format_csv` when `CsvWriter` returns
+/// an error or produces bytes that are not valid UTF-8.
+///
+/// Emits a header line with the column names followed by one line per row,
+/// each terminated by `\n`. Nulls are written as empty fields. Header names
+/// and cells are both quoted when they contain a comma, a double quote, or a
+/// line break (`\n` or `\r`); embedded double quotes are doubled.
+fn csv_fallback(df: &DataFrame) -> String {
+    /// Quote a field when it contains a delimiter, a quote, or a line break.
+    fn quote_field(field: String) -> String {
+        let needs_quoting = field
+            .chars()
+            .any(|c| matches!(c, ',' | '"' | '\n' | '\r'));
+        if needs_quoting {
+            format!("\"{}\"", field.replace('"', "\"\""))
+        } else {
+            field
+        }
+    }
+
+    let columns = df.get_columns();
+    let n_rows = df.height();
+
+    let mut out = String::new();
+
+    // Header: column names need the same escaping as data cells.
+    let header: Vec<String> = columns
+        .iter()
+        .map(|c| quote_field(c.name().to_string()))
+        .collect();
+    out.push_str(&header.join(","));
+    out.push('\n');
+
+    // Rows
+    for row_idx in 0..n_rows {
+        let row: Vec<String> = columns
+            .iter()
+            .map(|col| quote_field(format_cell(col, row_idx)))
+            .collect();
+        out.push_str(&row.join(","));
+        out.push('\n');
+    }
+
+    out
+}
+
+// ---------------------------------------------------------------------------
+// Tests
+// ---------------------------------------------------------------------------
+
+// Unit tests for the formatter. Each test builds a small in-memory
+// `DataFrame` or `SheetInfo`/`FileInfo` and checks the rendered text with
+// substring assertions.
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::metadata::{FileInfo, SheetInfo};
+
+    // Multi-sheet file: header shows size, format label and the sheet count.
+    #[test]
+    fn test_format_header() {
+        use crate::format::Format;
+        let info = FileInfo {
+            file_size: 250_000,
+            format: Format::Excel,
+            sheets: vec![
+                SheetInfo { name: "Sheet1".into(), rows: 100, cols: 5 },
+                SheetInfo { name: "Sheet2".into(), rows: 50, cols: 3 },
+            ],
+        };
+        let out = format_header("test.xlsx", &info);
+        assert!(out.contains("# File: test.xlsx (244 KB) [Excel]"));
+        assert!(out.contains("# Sheets: 2"));
+    }
+
+    // Single-sheet file: the "# Sheets" line must be omitted.
+    #[test]
+    fn test_format_header_single_sheet() {
+        use crate::format::Format;
+        let info = FileInfo {
+            file_size: 1_000,
+            format: Format::Csv,
+            sheets: vec![SheetInfo { name: "data".into(), rows: 10, cols: 3 }],
+        };
+        let out = format_header("data.csv", &info);
+        assert!(out.contains("[CSV]"));
+        assert!(!out.contains("Sheets"));
+    }
+
+    // Full table rendering: cells are padded and every line has equal length.
+    #[test]
+    fn test_format_data_table() {
+        let s1 = Series::new("name".into(), &["Alice", "Bob"]);
+        let s2 = Series::new("value".into(), &[100i64, 200]);
+        let df = DataFrame::new(vec![s1.into_column(), s2.into_column()]).unwrap();
+        let out = format_data_table(&df);
+        assert!(out.contains("| name  | value |"));
+        assert!(out.contains("| Alice | 100   |"));
+        // Verify pipes are aligned: all lines have same length
+        let lines: Vec<&str> = out.trim().lines().collect();
+        assert!(lines.len() >= 3);
+        let expected_len = lines[0].len();
+        for line in &lines {
+            assert_eq!(line.len(), expected_len, "Misaligned: {line}");
+        }
+    }
+
+    // Fewer rows than head + tail: everything is shown, no omission line.
+    #[test]
+    fn test_format_head_tail_small() {
+        let s = Series::new("x".into(), &[1i64, 2, 3]);
+        let df = DataFrame::new(vec![s.into_column()]).unwrap();
+        let out = format_head_tail(&df, 25, 25);
+        assert!(!out.contains("omitted"));
+        assert!(out.contains("| 1 "));
+        assert!(out.contains("| 3 "));
+    }
+
+    // More rows than head + tail: the middle is replaced by an omission line
+    // and the boundary rows (25 and 36) are both present.
+    #[test]
+    fn test_format_head_tail_large() {
+        // 60 rows, head=25 tail=25 → 10 omitted
+        let values: Vec<i64> = (1..=60).collect();
+        let s = Series::new("n".into(), values.as_slice());
+        let df = DataFrame::new(vec![s.into_column()]).unwrap();
+        let out = format_head_tail(&df, 25, 25);
+        assert!(out.contains("(10 rows omitted)"));
+        assert!(out.contains("| 1 "));
+        assert!(out.contains("| 25 "));
+        assert!(out.contains("| 36 "));
+        assert!(out.contains("| 60 "));
+    }
+
+    // Schema block: title excludes the header row from the row count and the
+    // table lists each column with its type label.
+    #[test]
+    fn test_format_schema() {
+        let sheet = SheetInfo {
+            name: "Revenue".into(),
+            rows: 11, // 1 header + 10 data
+            cols: 2,
+        };
+        let s1 = Series::new("date".into(), &["2024-01-01", "2024-01-02"]);
+        let s2 = Series::new("amount".into(), &[1.0f64, 2.0]);
+        let df = DataFrame::new(vec![s1.into_column(), s2.into_column()]).unwrap();
+        let out = format_schema(&sheet, &df);
+        assert!(out.contains("## Sheet: Revenue (10 rows x 2 cols)"));
+        assert!(out.contains("| date"));
+        assert!(out.contains("| amount"));
+        assert!(out.contains("String"));
+        assert!(out.contains("Float"));
+    }
+
+    // Zero rows and zero columns → "(empty)".
+    #[test]
+    fn test_format_empty_sheet_completely_empty() {
+        let sheet = SheetInfo { name: "Blank".into(), rows: 0, cols: 0 };
+        let out = format_empty_sheet(&sheet);
+        assert!(out.contains("(empty)"));
+    }
+
+    // Header row only → "(no data rows)".
+    #[test]
+    fn test_format_empty_sheet_header_only() {
+        let sheet = SheetInfo { name: "Headers".into(), rows: 1, cols: 3 };
+        let out = format_empty_sheet(&sheet);
+        assert!(out.contains("(no data rows)"));
+    }
+
+    // CSV output contains the header line and the cell values.
+    #[test]
+    fn test_format_csv() {
+        let s1 = Series::new("a".into(), &["hello", "world"]);
+        let s2 = Series::new("b".into(), &[1i64, 2]);
+        let df = DataFrame::new(vec![s1.into_column(), s2.into_column()]).unwrap();
+        let out = format_csv(&df);
+        assert!(out.contains("a,b"));
+        assert!(out.contains("hello"));
+        assert!(out.contains("world"));
+    }
+
+    // Describe table: one row per statistic, "-" for stats that do not apply
+    // to a string column, and all table lines aligned.
+    #[test]
+    fn test_format_describe() {
+        let s_name = Series::new("name".into(), &["Alice", "Bob", "Carol"]);
+        let s_val = Series::new("value".into(), &[10i64, 20, 30]);
+        let df = DataFrame::new(vec![s_name.into_column(), s_val.into_column()]).unwrap();
+        let out = format_describe(&df);
+        // Header row contains stat and column names
+        assert!(out.contains("| stat"));
+        assert!(out.contains("name"));
+        assert!(out.contains("value"));
+        // All stat rows are present
+        assert!(out.contains("| count"));
+        assert!(out.contains("| null_count"));
+        assert!(out.contains("| mean"));
+        assert!(out.contains("| std"));
+        assert!(out.contains("| min"));
+        assert!(out.contains("| max"));
+        assert!(out.contains("| median"));
+        assert!(out.contains("| unique"));
+        // Non-numeric column shows "-" for mean
+        assert!(out.contains("| -"));
+        // Verify alignment: all table lines should have same length
+        let table_lines: Vec<&str> = out.trim().lines().filter(|l| l.starts_with('|')).collect();
+        let expected_len = table_lines[0].len();
+        for line in &table_lines {
+            assert_eq!(line.len(), expected_len, "Misaligned: {line}");
+        }
+    }
+}

	dt-cli-tools CLI tools for viewing, filtering, and comparing tabular data files
	Log \| Files \| Refs \| README \| LICENSE