formatter.rs (16916B)
1 use crate::metadata::{format_file_size, FileInfo, SheetInfo}; 2 use polars::prelude::*; 3 use std::fmt::Write as FmtWrite; 4 5 // --------------------------------------------------------------------------- 6 // Public API 7 // --------------------------------------------------------------------------- 8 9 /// Render the top-level file header. 10 /// 11 /// ```text 12 /// # File: report.xlsx (245 KB) 13 /// # Sheets: 3 14 /// ``` 15 pub fn format_header(file_name: &str, info: &FileInfo) -> String { 16 let size_str = format_file_size(info.file_size); 17 let sheet_count = info.sheets.len(); 18 format!("# File: {file_name} ({size_str})\n# Sheets: {sheet_count}\n") 19 } 20 21 /// Render the schema block for a single sheet. 22 /// 23 /// ```text 24 /// ## Sheet: Revenue (1240 rows x 8 cols) 25 /// 26 /// | Column | Type | 27 /// |--------|------| 28 /// | date | Date | 29 /// ... 30 /// ``` 31 pub fn format_schema(sheet: &SheetInfo, df: &DataFrame) -> String { 32 let data_rows = if sheet.rows == 0 { 0 } else { sheet.rows - 1 }; 33 let cols = sheet.cols; 34 35 let mut out = format!( 36 "## Sheet: {} ({} rows x {} cols)\n\n", 37 sheet.name, data_rows, cols 38 ); 39 let headers = vec!["Column".to_string(), "Type".to_string()]; 40 let rows: Vec<Vec<String>> = df 41 .get_columns() 42 .iter() 43 .map(|col| vec![col.name().to_string(), format_dtype(col.dtype()).to_string()]) 44 .collect(); 45 out.push_str(&render_table(&headers, &rows)); 46 out 47 } 48 49 /// Render the multi-sheet listing (header + schema for each + prompt). 50 pub fn format_sheet_listing( 51 file_name: &str, 52 info: &FileInfo, 53 schemas: &[(&SheetInfo, DataFrame)], 54 ) -> String { 55 let mut out = format_header(file_name, info); 56 out.push('\n'); 57 58 for (sheet, df) in schemas { 59 if sheet.rows == 0 && sheet.cols == 0 { 60 out.push_str(&format_empty_sheet(sheet)); 61 } else { 62 out.push_str(&format_schema(sheet, df)); 63 } 64 out.push('\n'); 65 } 66 67 out.push_str("Use --sheet <name> to view a specific sheet.\n"); 68 out 69 } 70 71 /// Render the full DataFrame as a markdown table with aligned columns. 72 pub fn format_data_table(df: &DataFrame) -> String { 73 let (headers, rows) = df_to_strings(df); 74 render_table(&headers, &rows) 75 } 76 77 /// Render head / tail view of a DataFrame with aligned columns. 78 /// 79 /// If total rows <= head_n + tail_n, shows all rows. 80 /// Otherwise shows first head_n rows, an omission line, then last tail_n rows. 81 /// Column widths are computed from both head and tail so pipes stay aligned. 82 pub fn format_head_tail(df: &DataFrame, head_n: usize, tail_n: usize) -> String { 83 let total = df.height(); 84 if total <= head_n + tail_n { 85 return format_data_table(df); 86 } 87 88 let head_df = df.head(Some(head_n)); 89 let tail_df = df.tail(Some(tail_n)); 90 let omitted = total - head_n - tail_n; 91 92 let (headers, head_rows) = df_to_strings(&head_df); 93 let (_, tail_rows) = df_to_strings(&tail_df); 94 95 // Compute widths from both head and tail rows 96 let mut all_rows = head_rows.clone(); 97 all_rows.extend(tail_rows.clone()); 98 let widths = compute_col_widths(&headers, &all_rows); 99 100 let mut out = render_table_header(&headers, &widths); 101 out.push_str(&render_table_rows(&head_rows, &widths)); 102 out.push_str(&format!("... ({omitted} rows omitted) ...\n")); 103 out.push_str(&render_table_rows(&tail_rows, &widths)); 104 out 105 } 106 107 /// Render DataFrame as CSV. 108 pub fn format_csv(df: &DataFrame) -> String { 109 let mut buf: Vec<u8> = Vec::new(); 110 // CsvWriter is available via the "csv" feature (polars 0.46) 111 if CsvWriter::new(&mut buf) 112 .finish(&mut df.clone()) 113 .is_ok() 114 { 115 return String::from_utf8(buf).unwrap_or_else(|_| csv_fallback(df)); 116 } 117 csv_fallback(df) 118 } 119 120 /// Render a message for an empty or header-only sheet. 121 pub fn format_empty_sheet(sheet: &SheetInfo) -> String { 122 if sheet.rows == 0 && sheet.cols == 0 { 123 format!("## Sheet: {} (empty)\n", sheet.name) 124 } else { 125 format!("## Sheet: {} (no data rows)\n", sheet.name) 126 } 127 } 128 129 /// Render summary statistics for each column as a markdown table. 130 /// 131 /// Stats are rows, columns are DataFrame columns: 132 /// | stat | col1 | col2 | ... | 133 /// |------|------|------|-----| 134 /// | count | ... | ... | ... | 135 /// ... 136 pub fn format_describe(df: &DataFrame) -> String { 137 let columns = df.get_columns(); 138 let stats = ["count", "null_count", "mean", "std", "min", "max", "median", "unique"]; 139 140 let mut headers = vec!["stat".to_string()]; 141 headers.extend(columns.iter().map(|c| c.name().to_string())); 142 143 let rows: Vec<Vec<String>> = stats 144 .iter() 145 .map(|stat| { 146 let mut row = vec![stat.to_string()]; 147 row.extend(columns.iter().map(|col| compute_stat(col, stat))); 148 row 149 }) 150 .collect(); 151 152 render_table(&headers, &rows) 153 } 154 155 fn compute_stat(col: &Column, stat: &str) -> String { 156 let series = col.as_materialized_series(); 157 match stat { 158 "count" => series.len().to_string(), 159 "null_count" => series.null_count().to_string(), 160 "mean" => { 161 if is_numeric(series.dtype()) { 162 series.mean().map(|v| format!("{v:.4}")).unwrap_or_else(|| "-".into()) 163 } else { 164 "-".into() 165 } 166 } 167 "std" => { 168 if is_numeric(series.dtype()) { 169 series.std(1).map(|v| format!("{v:.4}")).unwrap_or_else(|| "-".into()) 170 } else { 171 "-".into() 172 } 173 } 174 "min" => { 175 if is_numeric(series.dtype()) { 176 match series.min_reduce() { 177 Ok(v) => v.value().to_string(), 178 Err(_) => "-".into(), 179 } 180 } else { 181 "-".into() 182 } 183 } 184 "max" => { 185 if is_numeric(series.dtype()) { 186 match series.max_reduce() { 187 Ok(v) => v.value().to_string(), 188 Err(_) => "-".into(), 189 } 190 } else { 191 "-".into() 192 } 193 } 194 "median" => { 195 if is_numeric(series.dtype()) { 196 series.median().map(|v| format!("{v:.4}")).unwrap_or_else(|| "-".into()) 197 } else { 198 "-".into() 199 } 200 } 201 "unique" => match series.n_unique() { 202 Ok(n) => n.to_string(), 203 Err(_) => "-".into(), 204 }, 205 _ => "-".into(), 206 } 207 } 208 209 fn is_numeric(dtype: &DataType) -> bool { 210 matches!( 211 dtype, 212 DataType::Int8 213 | DataType::Int16 214 | DataType::Int32 215 | DataType::Int64 216 | DataType::UInt8 217 | DataType::UInt16 218 | DataType::UInt32 219 | DataType::UInt64 220 | DataType::Float32 221 | DataType::Float64 222 ) 223 } 224 225 // --------------------------------------------------------------------------- 226 // Private helpers 227 // --------------------------------------------------------------------------- 228 229 /// Extract headers and row data as strings from a DataFrame. 230 fn df_to_strings(df: &DataFrame) -> (Vec<String>, Vec<Vec<String>>) { 231 let columns = df.get_columns(); 232 let headers: Vec<String> = columns.iter().map(|c| c.name().to_string()).collect(); 233 let rows: Vec<Vec<String>> = (0..df.height()) 234 .map(|i| columns.iter().map(|c| format_cell(c, i)).collect()) 235 .collect(); 236 (headers, rows) 237 } 238 239 /// Compute the display width for each column. 240 pub fn compute_col_widths(headers: &[String], rows: &[Vec<String>]) -> Vec<usize> { 241 let mut widths: Vec<usize> = headers.iter().map(|h| h.len().max(3)).collect(); 242 for row in rows { 243 for (i, cell) in row.iter().enumerate() { 244 if i < widths.len() { 245 widths[i] = widths[i].max(cell.len()); 246 } 247 } 248 } 249 widths 250 } 251 252 /// Render a markdown table header + separator line. 253 pub fn render_table_header(headers: &[String], widths: &[usize]) -> String { 254 let mut out = String::new(); 255 out.push('|'); 256 for (i, h) in headers.iter().enumerate() { 257 let _ = write!(out, " {:<w$} |", h, w = widths[i]); 258 } 259 out.push('\n'); 260 out.push('|'); 261 for w in widths { 262 out.push('-'); 263 for _ in 0..*w { 264 out.push('-'); 265 } 266 out.push_str("-|"); 267 } 268 out.push('\n'); 269 out 270 } 271 272 /// Render markdown table data rows (no header). 273 pub fn render_table_rows(rows: &[Vec<String>], widths: &[usize]) -> String { 274 let mut out = String::new(); 275 for row in rows { 276 out.push('|'); 277 for (i, cell) in row.iter().enumerate() { 278 let w = if i < widths.len() { widths[i] } else { cell.len() }; 279 let _ = write!(out, " {:<w$} |", cell, w = w); 280 } 281 out.push('\n'); 282 } 283 out 284 } 285 286 /// Render a complete aligned markdown table. 287 pub fn render_table(headers: &[String], rows: &[Vec<String>]) -> String { 288 let widths = compute_col_widths(headers, rows); 289 let mut out = render_table_header(headers, &widths); 290 out.push_str(&render_table_rows(rows, &widths)); 291 out 292 } 293 294 /// Format a single cell value for markdown display. 295 fn format_cell(col: &Column, idx: usize) -> String { 296 match col.get(idx) { 297 Ok(AnyValue::Null) | Err(_) => String::new(), 298 Ok(v) => format_any_value(&v), 299 } 300 } 301 302 /// Convert an AnyValue to its display string. 303 pub fn format_any_value(v: &AnyValue) -> String { 304 match v { 305 AnyValue::Null => String::new(), 306 AnyValue::Boolean(b) => b.to_string(), 307 AnyValue::Int8(n) => n.to_string(), 308 AnyValue::Int16(n) => n.to_string(), 309 AnyValue::Int32(n) => n.to_string(), 310 AnyValue::Int64(n) => n.to_string(), 311 AnyValue::UInt8(n) => n.to_string(), 312 AnyValue::UInt16(n) => n.to_string(), 313 AnyValue::UInt32(n) => n.to_string(), 314 AnyValue::UInt64(n) => n.to_string(), 315 AnyValue::Float32(f) => f.to_string(), 316 AnyValue::Float64(f) => f.to_string(), 317 AnyValue::String(s) => s.to_string(), 318 AnyValue::StringOwned(s) => s.to_string(), 319 other => format!("{other}"), 320 } 321 } 322 323 /// Map a polars DataType to a human-readable label. 324 fn format_dtype(dtype: &DataType) -> &'static str { 325 match dtype { 326 DataType::Boolean => "Boolean", 327 DataType::Int8 | DataType::Int16 | DataType::Int32 | DataType::Int64 => "Int", 328 DataType::UInt8 | DataType::UInt16 | DataType::UInt32 | DataType::UInt64 => "UInt", 329 DataType::Float32 | DataType::Float64 => "Float", 330 DataType::String => "String", 331 DataType::Date => "Date", 332 DataType::Datetime(_, _) => "Datetime", 333 DataType::Duration(_) => "Duration", 334 DataType::Time => "Time", 335 DataType::Null => "Null", 336 _ => "Other", 337 } 338 } 339 340 /// Manual CSV fallback if CsvWriter is unavailable. 341 fn csv_fallback(df: &DataFrame) -> String { 342 let columns = df.get_columns(); 343 let n_rows = df.height(); 344 345 let mut out = String::new(); 346 347 // Header 348 let header: Vec<String> = columns.iter().map(|c| c.name().to_string()).collect(); 349 out.push_str(&header.join(",")); 350 out.push('\n'); 351 352 // Rows 353 for row_idx in 0..n_rows { 354 let row: Vec<String> = columns 355 .iter() 356 .map(|col| { 357 let cell = format_cell(col, row_idx); 358 // Quote cells containing commas or quotes 359 if cell.contains(',') || cell.contains('"') || cell.contains('\n') { 360 format!("\"{}\"", cell.replace('"', "\"\"")) 361 } else { 362 cell 363 } 364 }) 365 .collect(); 366 out.push_str(&row.join(",")); 367 out.push('\n'); 368 } 369 370 out 371 } 372 373 // --------------------------------------------------------------------------- 374 // Tests 375 // --------------------------------------------------------------------------- 376 377 #[cfg(test)] 378 mod tests { 379 use super::*; 380 use crate::metadata::{FileInfo, SheetInfo}; 381 382 #[test] 383 fn test_format_header() { 384 let info = FileInfo { 385 file_size: 250_000, 386 sheets: vec![SheetInfo { 387 name: "Sheet1".into(), 388 rows: 100, 389 cols: 5, 390 }], 391 }; 392 let out = format_header("test.xlsx", &info); 393 assert!(out.contains("# File: test.xlsx (244 KB)")); 394 assert!(out.contains("# Sheets: 1")); 395 } 396 397 #[test] 398 fn test_format_data_table() { 399 let s1 = Series::new("name".into(), &["Alice", "Bob"]); 400 let s2 = Series::new("value".into(), &[100i64, 200]); 401 let df = DataFrame::new(vec![s1.into_column(), s2.into_column()]).unwrap(); 402 let out = format_data_table(&df); 403 assert!(out.contains("| name | value |")); 404 assert!(out.contains("| Alice | 100 |")); 405 // Verify pipes are aligned: all lines have same length 406 let lines: Vec<&str> = out.trim().lines().collect(); 407 assert!(lines.len() >= 3); 408 let expected_len = lines[0].len(); 409 for line in &lines { 410 assert_eq!(line.len(), expected_len, "Misaligned: {line}"); 411 } 412 } 413 414 #[test] 415 fn test_format_head_tail_small() { 416 let s = Series::new("x".into(), &[1i64, 2, 3]); 417 let df = DataFrame::new(vec![s.into_column()]).unwrap(); 418 let out = format_head_tail(&df, 25, 25); 419 assert!(!out.contains("omitted")); 420 assert!(out.contains("| 1 ")); 421 assert!(out.contains("| 3 ")); 422 } 423 424 #[test] 425 fn test_format_head_tail_large() { 426 // 60 rows, head=25 tail=25 → 10 omitted 427 let values: Vec<i64> = (1..=60).collect(); 428 let s = Series::new("n".into(), values.as_slice()); 429 let df = DataFrame::new(vec![s.into_column()]).unwrap(); 430 let out = format_head_tail(&df, 25, 25); 431 assert!(out.contains("(10 rows omitted)")); 432 assert!(out.contains("| 1 ")); 433 assert!(out.contains("| 25 ")); 434 assert!(out.contains("| 36 ")); 435 assert!(out.contains("| 60 ")); 436 } 437 438 #[test] 439 fn test_format_schema() { 440 let sheet = SheetInfo { 441 name: "Revenue".into(), 442 rows: 11, // 1 header + 10 data 443 cols: 2, 444 }; 445 let s1 = Series::new("date".into(), &["2024-01-01", "2024-01-02"]); 446 let s2 = Series::new("amount".into(), &[1.0f64, 2.0]); 447 let df = DataFrame::new(vec![s1.into_column(), s2.into_column()]).unwrap(); 448 let out = format_schema(&sheet, &df); 449 assert!(out.contains("## Sheet: Revenue (10 rows x 2 cols)")); 450 assert!(out.contains("| date")); 451 assert!(out.contains("| amount")); 452 assert!(out.contains("String")); 453 assert!(out.contains("Float")); 454 } 455 456 #[test] 457 fn test_format_empty_sheet_completely_empty() { 458 let sheet = SheetInfo { name: "Blank".into(), rows: 0, cols: 0 }; 459 let out = format_empty_sheet(&sheet); 460 assert!(out.contains("(empty)")); 461 } 462 463 #[test] 464 fn test_format_empty_sheet_header_only() { 465 let sheet = SheetInfo { name: "Headers".into(), rows: 1, cols: 3 }; 466 let out = format_empty_sheet(&sheet); 467 assert!(out.contains("(no data rows)")); 468 } 469 470 #[test] 471 fn test_format_csv() { 472 let s1 = Series::new("a".into(), &["hello", "world"]); 473 let s2 = Series::new("b".into(), &[1i64, 2]); 474 let df = DataFrame::new(vec![s1.into_column(), s2.into_column()]).unwrap(); 475 let out = format_csv(&df); 476 assert!(out.contains("a,b")); 477 assert!(out.contains("hello")); 478 assert!(out.contains("world")); 479 } 480 481 #[test] 482 fn test_format_describe() { 483 let s_name = Series::new("name".into(), &["Alice", "Bob", "Carol"]); 484 let s_val = Series::new("value".into(), &[10i64, 20, 30]); 485 let df = DataFrame::new(vec![s_name.into_column(), s_val.into_column()]).unwrap(); 486 let out = format_describe(&df); 487 // Header row contains stat and column names 488 assert!(out.contains("| stat")); 489 assert!(out.contains("name")); 490 assert!(out.contains("value")); 491 // All stat rows are present 492 assert!(out.contains("| count")); 493 assert!(out.contains("| null_count")); 494 assert!(out.contains("| mean")); 495 assert!(out.contains("| std")); 496 assert!(out.contains("| min")); 497 assert!(out.contains("| max")); 498 assert!(out.contains("| median")); 499 assert!(out.contains("| unique")); 500 // Non-numeric column shows "-" for mean 501 assert!(out.contains("| -")); 502 // Verify alignment: all table lines should have same length 503 let table_lines: Vec<&str> = out.trim().lines().filter(|l| l.starts_with('|')).collect(); 504 let expected_len = table_lines[0].len(); 505 for line in &table_lines { 506 assert_eq!(line.len(), expected_len, "Misaligned: {line}"); 507 } 508 } 509 }