dt-cli-tools

CLI tools for viewing, filtering, and comparing tabular data files
Log | Files | Refs | README | LICENSE

formatter.rs (17645B)


      1 use crate::metadata::{format_file_size, format_name, FileInfo, SheetInfo};
      2 use polars::prelude::*;
      3 use std::fmt::Write as FmtWrite;
      4 
      5 // ---------------------------------------------------------------------------
      6 // Public API
      7 // ---------------------------------------------------------------------------
      8 
      9 /// Render the top-level file header.
     10 ///
     11 /// ```text
     12 /// # File: report.xlsx (245 KB) [Excel]
     13 /// # Sheets: 3
     14 /// ```
     15 pub fn format_header(file_name: &str, info: &FileInfo) -> String {
     16     let size_str = format_file_size(info.file_size);
     17     let fmt_name = format_name(info.format);
     18     let sheet_count = info.sheets.len();
     19     if sheet_count > 1 {
     20         format!("# File: {file_name} ({size_str}) [{fmt_name}]\n# Sheets: {sheet_count}\n")
     21     } else {
     22         format!("# File: {file_name} ({size_str}) [{fmt_name}]\n")
     23     }
     24 }
     25 
     26 /// Render the schema block for a single sheet.
     27 ///
     28 /// ```text
     29 /// ## Sheet: Revenue (1240 rows x 8 cols)
     30 ///
     31 /// | Column | Type |
     32 /// |--------|------|
     33 /// | date   | Date |
     34 /// ...
     35 /// ```
     36 pub fn format_schema(sheet: &SheetInfo, df: &DataFrame) -> String {
     37     let data_rows = if sheet.rows == 0 { 0 } else { sheet.rows - 1 };
     38     let cols = sheet.cols;
     39 
     40     let mut out = format!(
     41         "## Sheet: {} ({} rows x {} cols)\n\n",
     42         sheet.name, data_rows, cols
     43     );
     44     let headers = vec!["Column".to_string(), "Type".to_string()];
     45     let rows: Vec<Vec<String>> = df
     46         .get_columns()
     47         .iter()
     48         .map(|col| vec![col.name().to_string(), format_dtype(col.dtype()).to_string()])
     49         .collect();
     50     out.push_str(&render_table(&headers, &rows));
     51     out
     52 }
     53 
     54 /// Render the multi-sheet listing (header + schema for each + prompt).
     55 pub fn format_sheet_listing(
     56     file_name: &str,
     57     info: &FileInfo,
     58     schemas: &[(&SheetInfo, DataFrame)],
     59 ) -> String {
     60     let mut out = format_header(file_name, info);
     61     out.push('\n');
     62 
     63     for (sheet, df) in schemas {
     64         if sheet.rows == 0 && sheet.cols == 0 {
     65             out.push_str(&format_empty_sheet(sheet));
     66         } else {
     67             out.push_str(&format_schema(sheet, df));
     68         }
     69         out.push('\n');
     70     }
     71 
     72     out.push_str("Use --sheet <name> to view a specific sheet.\n");
     73     out
     74 }
     75 
     76 /// Render the full DataFrame as a markdown table with aligned columns.
     77 pub fn format_data_table(df: &DataFrame) -> String {
     78     let (headers, rows) = df_to_strings(df);
     79     render_table(&headers, &rows)
     80 }
     81 
     82 /// Render head / tail view of a DataFrame with aligned columns.
     83 ///
     84 /// If total rows <= head_n + tail_n, shows all rows.
     85 /// Otherwise shows first head_n rows, an omission line, then last tail_n rows.
     86 /// Column widths are computed from both head and tail so pipes stay aligned.
     87 pub fn format_head_tail(df: &DataFrame, head_n: usize, tail_n: usize) -> String {
     88     let total = df.height();
     89     if total <= head_n + tail_n {
     90         return format_data_table(df);
     91     }
     92 
     93     let head_df = df.head(Some(head_n));
     94     let tail_df = df.tail(Some(tail_n));
     95     let omitted = total - head_n - tail_n;
     96 
     97     let (headers, head_rows) = df_to_strings(&head_df);
     98     let (_, tail_rows) = df_to_strings(&tail_df);
     99 
    100     // Compute widths from both head and tail rows
    101     let mut all_rows = head_rows.clone();
    102     all_rows.extend(tail_rows.clone());
    103     let widths = compute_col_widths(&headers, &all_rows);
    104 
    105     let mut out = render_table_header(&headers, &widths);
    106     out.push_str(&render_table_rows(&head_rows, &widths));
    107     out.push_str(&format!("... ({omitted} rows omitted) ...\n"));
    108     out.push_str(&render_table_rows(&tail_rows, &widths));
    109     out
    110 }
    111 
    112 /// Render DataFrame as CSV.
    113 pub fn format_csv(df: &DataFrame) -> String {
    114     let mut buf: Vec<u8> = Vec::new();
    115     // CsvWriter is available via the "csv" feature (polars 0.46)
    116     if CsvWriter::new(&mut buf)
    117         .finish(&mut df.clone())
    118         .is_ok()
    119     {
    120         return String::from_utf8(buf).unwrap_or_else(|_| csv_fallback(df));
    121     }
    122     csv_fallback(df)
    123 }
    124 
    125 /// Render a message for an empty or header-only sheet.
    126 pub fn format_empty_sheet(sheet: &SheetInfo) -> String {
    127     if sheet.rows == 0 && sheet.cols == 0 {
    128         format!("## Sheet: {} (empty)\n", sheet.name)
    129     } else {
    130         format!("## Sheet: {} (no data rows)\n", sheet.name)
    131     }
    132 }
    133 
    134 /// Render summary statistics for each column as a markdown table.
    135 ///
    136 /// Stats are rows, columns are DataFrame columns:
    137 /// | stat | col1 | col2 | ... |
    138 /// |------|------|------|-----|
    139 /// | count | ... | ... | ... |
    140 /// ...
    141 pub fn format_describe(df: &DataFrame) -> String {
    142     let columns = df.get_columns();
    143     let stats = ["count", "null_count", "mean", "std", "min", "max", "median", "unique"];
    144 
    145     let mut headers = vec!["stat".to_string()];
    146     headers.extend(columns.iter().map(|c| c.name().to_string()));
    147 
    148     let rows: Vec<Vec<String>> = stats
    149         .iter()
    150         .map(|stat| {
    151             let mut row = vec![stat.to_string()];
    152             row.extend(columns.iter().map(|col| compute_stat(col, stat)));
    153             row
    154         })
    155         .collect();
    156 
    157     render_table(&headers, &rows)
    158 }
    159 
    160 fn compute_stat(col: &Column, stat: &str) -> String {
    161     let series = col.as_materialized_series();
    162     match stat {
    163         "count" => series.len().to_string(),
    164         "null_count" => series.null_count().to_string(),
    165         "mean" => {
    166             if is_numeric(series.dtype()) {
    167                 series.mean().map(|v| format!("{v:.4}")).unwrap_or_else(|| "-".into())
    168             } else {
    169                 "-".into()
    170             }
    171         }
    172         "std" => {
    173             if is_numeric(series.dtype()) {
    174                 series.std(1).map(|v| format!("{v:.4}")).unwrap_or_else(|| "-".into())
    175             } else {
    176                 "-".into()
    177             }
    178         }
    179         "min" => {
    180             if is_numeric(series.dtype()) {
    181                 match series.min_reduce() {
    182                     Ok(v) => v.value().to_string(),
    183                     Err(_) => "-".into(),
    184                 }
    185             } else {
    186                 "-".into()
    187             }
    188         }
    189         "max" => {
    190             if is_numeric(series.dtype()) {
    191                 match series.max_reduce() {
    192                     Ok(v) => v.value().to_string(),
    193                     Err(_) => "-".into(),
    194                 }
    195             } else {
    196                 "-".into()
    197             }
    198         }
    199         "median" => {
    200             if is_numeric(series.dtype()) {
    201                 series.median().map(|v| format!("{v:.4}")).unwrap_or_else(|| "-".into())
    202             } else {
    203                 "-".into()
    204             }
    205         }
    206         "unique" => match series.n_unique() {
    207             Ok(n) => n.to_string(),
    208             Err(_) => "-".into(),
    209         },
    210         _ => "-".into(),
    211     }
    212 }
    213 
    214 fn is_numeric(dtype: &DataType) -> bool {
    215     matches!(
    216         dtype,
    217         DataType::Int8
    218             | DataType::Int16
    219             | DataType::Int32
    220             | DataType::Int64
    221             | DataType::UInt8
    222             | DataType::UInt16
    223             | DataType::UInt32
    224             | DataType::UInt64
    225             | DataType::Float32
    226             | DataType::Float64
    227     )
    228 }
    229 
    230 // ---------------------------------------------------------------------------
    231 // Private helpers
    232 // ---------------------------------------------------------------------------
    233 
    234 /// Extract headers and row data as strings from a DataFrame.
    235 fn df_to_strings(df: &DataFrame) -> (Vec<String>, Vec<Vec<String>>) {
    236     let columns = df.get_columns();
    237     let headers: Vec<String> = columns.iter().map(|c| c.name().to_string()).collect();
    238     let rows: Vec<Vec<String>> = (0..df.height())
    239         .map(|i| columns.iter().map(|c| format_cell(c, i)).collect())
    240         .collect();
    241     (headers, rows)
    242 }
    243 
    244 /// Compute the display width for each column.
    245 pub fn compute_col_widths(headers: &[String], rows: &[Vec<String>]) -> Vec<usize> {
    246     let mut widths: Vec<usize> = headers.iter().map(|h| h.len().max(3)).collect();
    247     for row in rows {
    248         for (i, cell) in row.iter().enumerate() {
    249             if i < widths.len() {
    250                 widths[i] = widths[i].max(cell.len());
    251             }
    252         }
    253     }
    254     widths
    255 }
    256 
    257 /// Render a markdown table header + separator line.
    258 pub fn render_table_header(headers: &[String], widths: &[usize]) -> String {
    259     let mut out = String::new();
    260     out.push('|');
    261     for (i, h) in headers.iter().enumerate() {
    262         let _ = write!(out, " {:<w$} |", h, w = widths[i]);
    263     }
    264     out.push('\n');
    265     out.push('|');
    266     for w in widths {
    267         out.push('-');
    268         for _ in 0..*w {
    269             out.push('-');
    270         }
    271         out.push_str("-|");
    272     }
    273     out.push('\n');
    274     out
    275 }
    276 
    277 /// Render markdown table data rows (no header).
    278 pub fn render_table_rows(rows: &[Vec<String>], widths: &[usize]) -> String {
    279     let mut out = String::new();
    280     for row in rows {
    281         out.push('|');
    282         for (i, cell) in row.iter().enumerate() {
    283             let w = if i < widths.len() { widths[i] } else { cell.len() };
    284             let _ = write!(out, " {:<w$} |", cell, w = w);
    285         }
    286         out.push('\n');
    287     }
    288     out
    289 }
    290 
    291 /// Render a complete aligned markdown table.
    292 pub fn render_table(headers: &[String], rows: &[Vec<String>]) -> String {
    293     let widths = compute_col_widths(headers, rows);
    294     let mut out = render_table_header(headers, &widths);
    295     out.push_str(&render_table_rows(rows, &widths));
    296     out
    297 }
    298 
    299 /// Format a single cell value for markdown display.
    300 fn format_cell(col: &Column, idx: usize) -> String {
    301     match col.get(idx) {
    302         Ok(AnyValue::Null) | Err(_) => String::new(),
    303         Ok(v) => format_any_value(&v),
    304     }
    305 }
    306 
    307 /// Convert an AnyValue to its display string.
    308 pub fn format_any_value(v: &AnyValue) -> String {
    309     match v {
    310         AnyValue::Null => String::new(),
    311         AnyValue::Boolean(b) => b.to_string(),
    312         AnyValue::Int8(n) => n.to_string(),
    313         AnyValue::Int16(n) => n.to_string(),
    314         AnyValue::Int32(n) => n.to_string(),
    315         AnyValue::Int64(n) => n.to_string(),
    316         AnyValue::UInt8(n) => n.to_string(),
    317         AnyValue::UInt16(n) => n.to_string(),
    318         AnyValue::UInt32(n) => n.to_string(),
    319         AnyValue::UInt64(n) => n.to_string(),
    320         AnyValue::Float32(f) => f.to_string(),
    321         AnyValue::Float64(f) => f.to_string(),
    322         AnyValue::String(s) => s.to_string(),
    323         AnyValue::StringOwned(s) => s.to_string(),
    324         other => format!("{other}"),
    325     }
    326 }
    327 
    328 /// Map a polars DataType to a human-readable label.
    329 fn format_dtype(dtype: &DataType) -> &'static str {
    330     match dtype {
    331         DataType::Boolean => "Boolean",
    332         DataType::Int8 | DataType::Int16 | DataType::Int32 | DataType::Int64 => "Int",
    333         DataType::UInt8 | DataType::UInt16 | DataType::UInt32 | DataType::UInt64 => "UInt",
    334         DataType::Float32 | DataType::Float64 => "Float",
    335         DataType::String => "String",
    336         DataType::Date => "Date",
    337         DataType::Datetime(_, _) => "Datetime",
    338         DataType::Duration(_) => "Duration",
    339         DataType::Time => "Time",
    340         DataType::Null => "Null",
    341         _ => "Other",
    342     }
    343 }
    344 
    345 /// Manual CSV fallback if CsvWriter is unavailable.
    346 fn csv_fallback(df: &DataFrame) -> String {
    347     let columns = df.get_columns();
    348     let n_rows = df.height();
    349 
    350     let mut out = String::new();
    351 
    352     // Header
    353     let header: Vec<String> = columns.iter().map(|c| c.name().to_string()).collect();
    354     out.push_str(&header.join(","));
    355     out.push('\n');
    356 
    357     // Rows
    358     for row_idx in 0..n_rows {
    359         let row: Vec<String> = columns
    360             .iter()
    361             .map(|col| {
    362                 let cell = format_cell(col, row_idx);
    363                 // Quote cells containing commas or quotes
    364                 if cell.contains(',') || cell.contains('"') || cell.contains('\n') {
    365                     format!("\"{}\"", cell.replace('"', "\"\""))
    366                 } else {
    367                     cell
    368                 }
    369             })
    370             .collect();
    371         out.push_str(&row.join(","));
    372         out.push('\n');
    373     }
    374 
    375     out
    376 }
    377 
    378 // ---------------------------------------------------------------------------
    379 // Tests
    380 // ---------------------------------------------------------------------------
    381 
#[cfg(test)]
mod tests {
    //! Unit tests for the formatter functions. Each test builds a small
    //! in-memory fixture and checks substrings of the rendered output.

    use super::*;
    use crate::metadata::{FileInfo, SheetInfo};

    /// A multi-sheet file prints both the file line and the sheet count line.
    #[test]
    fn test_format_header() {
        use crate::format::Format;
        let info = FileInfo {
            file_size: 250_000,
            format: Format::Excel,
            sheets: vec![
                SheetInfo { name: "Sheet1".into(), rows: 100, cols: 5 },
                SheetInfo { name: "Sheet2".into(), rows: 50, cols: 3 },
            ],
        };
        let out = format_header("test.xlsx", &info);
        assert!(out.contains("# File: test.xlsx (244 KB) [Excel]"));
        assert!(out.contains("# Sheets: 2"));
    }

    /// A single-sheet file omits the `# Sheets:` line.
    #[test]
    fn test_format_header_single_sheet() {
        use crate::format::Format;
        let info = FileInfo {
            file_size: 1_000,
            format: Format::Csv,
            sheets: vec![SheetInfo { name: "data".into(), rows: 10, cols: 3 }],
        };
        let out = format_header("data.csv", &info);
        assert!(out.contains("[CSV]"));
        assert!(!out.contains("Sheets"));
    }

    /// The full table pads cells to a common width and keeps every line the
    /// same length.
    #[test]
    fn test_format_data_table() {
        let s1 = Series::new("name".into(), &["Alice", "Bob"]);
        let s2 = Series::new("value".into(), &[100i64, 200]);
        let df = DataFrame::new(vec![s1.into_column(), s2.into_column()]).unwrap();
        let out = format_data_table(&df);
        assert!(out.contains("| name  | value |"));
        assert!(out.contains("| Alice | 100   |"));
        // Verify pipes are aligned: all lines have same length
        let lines: Vec<&str> = out.trim().lines().collect();
        assert!(lines.len() >= 3);
        let expected_len = lines[0].len();
        for line in &lines {
            assert_eq!(line.len(), expected_len, "Misaligned: {line}");
        }
    }

    /// With fewer rows than head + tail, everything is shown and nothing is
    /// reported as omitted.
    #[test]
    fn test_format_head_tail_small() {
        let s = Series::new("x".into(), &[1i64, 2, 3]);
        let df = DataFrame::new(vec![s.into_column()]).unwrap();
        let out = format_head_tail(&df, 25, 25);
        assert!(!out.contains("omitted"));
        assert!(out.contains("| 1 "));
        assert!(out.contains("| 3 "));
    }

    /// With more rows than head + tail, the middle is replaced by an omission
    /// line and the boundary rows of both slices are present.
    #[test]
    fn test_format_head_tail_large() {
        // 60 rows, head=25 tail=25 → 10 omitted
        let values: Vec<i64> = (1..=60).collect();
        let s = Series::new("n".into(), values.as_slice());
        let df = DataFrame::new(vec![s.into_column()]).unwrap();
        let out = format_head_tail(&df, 25, 25);
        assert!(out.contains("(10 rows omitted)"));
        assert!(out.contains("| 1 "));
        assert!(out.contains("| 25 "));
        assert!(out.contains("| 36 "));
        assert!(out.contains("| 60 "));
    }

    /// The schema title subtracts the header row from the sheet's row count
    /// and the table lists each column with its dtype label.
    #[test]
    fn test_format_schema() {
        let sheet = SheetInfo {
            name: "Revenue".into(),
            rows: 11, // 1 header + 10 data
            cols: 2,
        };
        let s1 = Series::new("date".into(), &["2024-01-01", "2024-01-02"]);
        let s2 = Series::new("amount".into(), &[1.0f64, 2.0]);
        let df = DataFrame::new(vec![s1.into_column(), s2.into_column()]).unwrap();
        let out = format_schema(&sheet, &df);
        assert!(out.contains("## Sheet: Revenue (10 rows x 2 cols)"));
        assert!(out.contains("| date"));
        assert!(out.contains("| amount"));
        assert!(out.contains("String"));
        assert!(out.contains("Float"));
    }

    /// Zero rows and zero columns is reported as `(empty)`.
    #[test]
    fn test_format_empty_sheet_completely_empty() {
        let sheet = SheetInfo { name: "Blank".into(), rows: 0, cols: 0 };
        let out = format_empty_sheet(&sheet);
        assert!(out.contains("(empty)"));
    }

    /// A sheet that has columns but only one row is reported as
    /// `(no data rows)`.
    #[test]
    fn test_format_empty_sheet_header_only() {
        let sheet = SheetInfo { name: "Headers".into(), rows: 1, cols: 3 };
        let out = format_empty_sheet(&sheet);
        assert!(out.contains("(no data rows)"));
    }

    /// CSV output contains the header line and the cell values.
    #[test]
    fn test_format_csv() {
        let s1 = Series::new("a".into(), &["hello", "world"]);
        let s2 = Series::new("b".into(), &[1i64, 2]);
        let df = DataFrame::new(vec![s1.into_column(), s2.into_column()]).unwrap();
        let out = format_csv(&df);
        assert!(out.contains("a,b"));
        assert!(out.contains("hello"));
        assert!(out.contains("world"));
    }

    /// The describe table has one row per statistic, shows `-` where a
    /// statistic does not apply, and keeps all table lines the same length.
    #[test]
    fn test_format_describe() {
        let s_name = Series::new("name".into(), &["Alice", "Bob", "Carol"]);
        let s_val = Series::new("value".into(), &[10i64, 20, 30]);
        let df = DataFrame::new(vec![s_name.into_column(), s_val.into_column()]).unwrap();
        let out = format_describe(&df);
        // Header row contains stat and column names
        assert!(out.contains("| stat"));
        assert!(out.contains("name"));
        assert!(out.contains("value"));
        // All stat rows are present
        assert!(out.contains("| count"));
        assert!(out.contains("| null_count"));
        assert!(out.contains("| mean"));
        assert!(out.contains("| std"));
        assert!(out.contains("| min"));
        assert!(out.contains("| max"));
        assert!(out.contains("| median"));
        assert!(out.contains("| unique"));
        // Non-numeric column shows "-" for mean
        assert!(out.contains("| -"));
        // Verify alignment: all table lines should have same length
        let table_lines: Vec<&str> = out.trim().lines().filter(|l| l.starts_with('|')).collect();
        let expected_len = table_lines[0].len();
        for line in &table_lines {
            assert_eq!(line.len(), expected_len, "Misaligned: {line}");
        }
    }
}