formatter.rs (17645B)
1 use crate::metadata::{format_file_size, format_name, FileInfo, SheetInfo}; 2 use polars::prelude::*; 3 use std::fmt::Write as FmtWrite; 4 5 // --------------------------------------------------------------------------- 6 // Public API 7 // --------------------------------------------------------------------------- 8 9 /// Render the top-level file header. 10 /// 11 /// ```text 12 /// # File: report.xlsx (245 KB) [Excel] 13 /// # Sheets: 3 14 /// ``` 15 pub fn format_header(file_name: &str, info: &FileInfo) -> String { 16 let size_str = format_file_size(info.file_size); 17 let fmt_name = format_name(info.format); 18 let sheet_count = info.sheets.len(); 19 if sheet_count > 1 { 20 format!("# File: {file_name} ({size_str}) [{fmt_name}]\n# Sheets: {sheet_count}\n") 21 } else { 22 format!("# File: {file_name} ({size_str}) [{fmt_name}]\n") 23 } 24 } 25 26 /// Render the schema block for a single sheet. 27 /// 28 /// ```text 29 /// ## Sheet: Revenue (1240 rows x 8 cols) 30 /// 31 /// | Column | Type | 32 /// |--------|------| 33 /// | date | Date | 34 /// ... 35 /// ``` 36 pub fn format_schema(sheet: &SheetInfo, df: &DataFrame) -> String { 37 let data_rows = if sheet.rows == 0 { 0 } else { sheet.rows - 1 }; 38 let cols = sheet.cols; 39 40 let mut out = format!( 41 "## Sheet: {} ({} rows x {} cols)\n\n", 42 sheet.name, data_rows, cols 43 ); 44 let headers = vec!["Column".to_string(), "Type".to_string()]; 45 let rows: Vec<Vec<String>> = df 46 .get_columns() 47 .iter() 48 .map(|col| vec![col.name().to_string(), format_dtype(col.dtype()).to_string()]) 49 .collect(); 50 out.push_str(&render_table(&headers, &rows)); 51 out 52 } 53 54 /// Render the multi-sheet listing (header + schema for each + prompt). 55 pub fn format_sheet_listing( 56 file_name: &str, 57 info: &FileInfo, 58 schemas: &[(&SheetInfo, DataFrame)], 59 ) -> String { 60 let mut out = format_header(file_name, info); 61 out.push('\n'); 62 63 for (sheet, df) in schemas { 64 if sheet.rows == 0 && sheet.cols == 0 { 65 out.push_str(&format_empty_sheet(sheet)); 66 } else { 67 out.push_str(&format_schema(sheet, df)); 68 } 69 out.push('\n'); 70 } 71 72 out.push_str("Use --sheet <name> to view a specific sheet.\n"); 73 out 74 } 75 76 /// Render the full DataFrame as a markdown table with aligned columns. 77 pub fn format_data_table(df: &DataFrame) -> String { 78 let (headers, rows) = df_to_strings(df); 79 render_table(&headers, &rows) 80 } 81 82 /// Render head / tail view of a DataFrame with aligned columns. 83 /// 84 /// If total rows <= head_n + tail_n, shows all rows. 85 /// Otherwise shows first head_n rows, an omission line, then last tail_n rows. 86 /// Column widths are computed from both head and tail so pipes stay aligned. 87 pub fn format_head_tail(df: &DataFrame, head_n: usize, tail_n: usize) -> String { 88 let total = df.height(); 89 if total <= head_n + tail_n { 90 return format_data_table(df); 91 } 92 93 let head_df = df.head(Some(head_n)); 94 let tail_df = df.tail(Some(tail_n)); 95 let omitted = total - head_n - tail_n; 96 97 let (headers, head_rows) = df_to_strings(&head_df); 98 let (_, tail_rows) = df_to_strings(&tail_df); 99 100 // Compute widths from both head and tail rows 101 let mut all_rows = head_rows.clone(); 102 all_rows.extend(tail_rows.clone()); 103 let widths = compute_col_widths(&headers, &all_rows); 104 105 let mut out = render_table_header(&headers, &widths); 106 out.push_str(&render_table_rows(&head_rows, &widths)); 107 out.push_str(&format!("... ({omitted} rows omitted) ...\n")); 108 out.push_str(&render_table_rows(&tail_rows, &widths)); 109 out 110 } 111 112 /// Render DataFrame as CSV. 113 pub fn format_csv(df: &DataFrame) -> String { 114 let mut buf: Vec<u8> = Vec::new(); 115 // CsvWriter is available via the "csv" feature (polars 0.46) 116 if CsvWriter::new(&mut buf) 117 .finish(&mut df.clone()) 118 .is_ok() 119 { 120 return String::from_utf8(buf).unwrap_or_else(|_| csv_fallback(df)); 121 } 122 csv_fallback(df) 123 } 124 125 /// Render a message for an empty or header-only sheet. 126 pub fn format_empty_sheet(sheet: &SheetInfo) -> String { 127 if sheet.rows == 0 && sheet.cols == 0 { 128 format!("## Sheet: {} (empty)\n", sheet.name) 129 } else { 130 format!("## Sheet: {} (no data rows)\n", sheet.name) 131 } 132 } 133 134 /// Render summary statistics for each column as a markdown table. 135 /// 136 /// Stats are rows, columns are DataFrame columns: 137 /// | stat | col1 | col2 | ... | 138 /// |------|------|------|-----| 139 /// | count | ... | ... | ... | 140 /// ... 141 pub fn format_describe(df: &DataFrame) -> String { 142 let columns = df.get_columns(); 143 let stats = ["count", "null_count", "mean", "std", "min", "max", "median", "unique"]; 144 145 let mut headers = vec!["stat".to_string()]; 146 headers.extend(columns.iter().map(|c| c.name().to_string())); 147 148 let rows: Vec<Vec<String>> = stats 149 .iter() 150 .map(|stat| { 151 let mut row = vec![stat.to_string()]; 152 row.extend(columns.iter().map(|col| compute_stat(col, stat))); 153 row 154 }) 155 .collect(); 156 157 render_table(&headers, &rows) 158 } 159 160 fn compute_stat(col: &Column, stat: &str) -> String { 161 let series = col.as_materialized_series(); 162 match stat { 163 "count" => series.len().to_string(), 164 "null_count" => series.null_count().to_string(), 165 "mean" => { 166 if is_numeric(series.dtype()) { 167 series.mean().map(|v| format!("{v:.4}")).unwrap_or_else(|| "-".into()) 168 } else { 169 "-".into() 170 } 171 } 172 "std" => { 173 if is_numeric(series.dtype()) { 174 series.std(1).map(|v| format!("{v:.4}")).unwrap_or_else(|| "-".into()) 175 } else { 176 "-".into() 177 } 178 } 179 "min" => { 180 if is_numeric(series.dtype()) { 181 match series.min_reduce() { 182 Ok(v) => v.value().to_string(), 183 Err(_) => "-".into(), 184 } 185 } else { 186 "-".into() 187 } 188 } 189 "max" => { 190 if is_numeric(series.dtype()) { 191 match series.max_reduce() { 192 Ok(v) => v.value().to_string(), 193 Err(_) => "-".into(), 194 } 195 } else { 196 "-".into() 197 } 198 } 199 "median" => { 200 if is_numeric(series.dtype()) { 201 series.median().map(|v| format!("{v:.4}")).unwrap_or_else(|| "-".into()) 202 } else { 203 "-".into() 204 } 205 } 206 "unique" => match series.n_unique() { 207 Ok(n) => n.to_string(), 208 Err(_) => "-".into(), 209 }, 210 _ => "-".into(), 211 } 212 } 213 214 fn is_numeric(dtype: &DataType) -> bool { 215 matches!( 216 dtype, 217 DataType::Int8 218 | DataType::Int16 219 | DataType::Int32 220 | DataType::Int64 221 | DataType::UInt8 222 | DataType::UInt16 223 | DataType::UInt32 224 | DataType::UInt64 225 | DataType::Float32 226 | DataType::Float64 227 ) 228 } 229 230 // --------------------------------------------------------------------------- 231 // Private helpers 232 // --------------------------------------------------------------------------- 233 234 /// Extract headers and row data as strings from a DataFrame. 235 fn df_to_strings(df: &DataFrame) -> (Vec<String>, Vec<Vec<String>>) { 236 let columns = df.get_columns(); 237 let headers: Vec<String> = columns.iter().map(|c| c.name().to_string()).collect(); 238 let rows: Vec<Vec<String>> = (0..df.height()) 239 .map(|i| columns.iter().map(|c| format_cell(c, i)).collect()) 240 .collect(); 241 (headers, rows) 242 } 243 244 /// Compute the display width for each column. 245 pub fn compute_col_widths(headers: &[String], rows: &[Vec<String>]) -> Vec<usize> { 246 let mut widths: Vec<usize> = headers.iter().map(|h| h.len().max(3)).collect(); 247 for row in rows { 248 for (i, cell) in row.iter().enumerate() { 249 if i < widths.len() { 250 widths[i] = widths[i].max(cell.len()); 251 } 252 } 253 } 254 widths 255 } 256 257 /// Render a markdown table header + separator line. 258 pub fn render_table_header(headers: &[String], widths: &[usize]) -> String { 259 let mut out = String::new(); 260 out.push('|'); 261 for (i, h) in headers.iter().enumerate() { 262 let _ = write!(out, " {:<w$} |", h, w = widths[i]); 263 } 264 out.push('\n'); 265 out.push('|'); 266 for w in widths { 267 out.push('-'); 268 for _ in 0..*w { 269 out.push('-'); 270 } 271 out.push_str("-|"); 272 } 273 out.push('\n'); 274 out 275 } 276 277 /// Render markdown table data rows (no header). 278 pub fn render_table_rows(rows: &[Vec<String>], widths: &[usize]) -> String { 279 let mut out = String::new(); 280 for row in rows { 281 out.push('|'); 282 for (i, cell) in row.iter().enumerate() { 283 let w = if i < widths.len() { widths[i] } else { cell.len() }; 284 let _ = write!(out, " {:<w$} |", cell, w = w); 285 } 286 out.push('\n'); 287 } 288 out 289 } 290 291 /// Render a complete aligned markdown table. 292 pub fn render_table(headers: &[String], rows: &[Vec<String>]) -> String { 293 let widths = compute_col_widths(headers, rows); 294 let mut out = render_table_header(headers, &widths); 295 out.push_str(&render_table_rows(rows, &widths)); 296 out 297 } 298 299 /// Format a single cell value for markdown display. 300 fn format_cell(col: &Column, idx: usize) -> String { 301 match col.get(idx) { 302 Ok(AnyValue::Null) | Err(_) => String::new(), 303 Ok(v) => format_any_value(&v), 304 } 305 } 306 307 /// Convert an AnyValue to its display string. 308 pub fn format_any_value(v: &AnyValue) -> String { 309 match v { 310 AnyValue::Null => String::new(), 311 AnyValue::Boolean(b) => b.to_string(), 312 AnyValue::Int8(n) => n.to_string(), 313 AnyValue::Int16(n) => n.to_string(), 314 AnyValue::Int32(n) => n.to_string(), 315 AnyValue::Int64(n) => n.to_string(), 316 AnyValue::UInt8(n) => n.to_string(), 317 AnyValue::UInt16(n) => n.to_string(), 318 AnyValue::UInt32(n) => n.to_string(), 319 AnyValue::UInt64(n) => n.to_string(), 320 AnyValue::Float32(f) => f.to_string(), 321 AnyValue::Float64(f) => f.to_string(), 322 AnyValue::String(s) => s.to_string(), 323 AnyValue::StringOwned(s) => s.to_string(), 324 other => format!("{other}"), 325 } 326 } 327 328 /// Map a polars DataType to a human-readable label. 329 fn format_dtype(dtype: &DataType) -> &'static str { 330 match dtype { 331 DataType::Boolean => "Boolean", 332 DataType::Int8 | DataType::Int16 | DataType::Int32 | DataType::Int64 => "Int", 333 DataType::UInt8 | DataType::UInt16 | DataType::UInt32 | DataType::UInt64 => "UInt", 334 DataType::Float32 | DataType::Float64 => "Float", 335 DataType::String => "String", 336 DataType::Date => "Date", 337 DataType::Datetime(_, _) => "Datetime", 338 DataType::Duration(_) => "Duration", 339 DataType::Time => "Time", 340 DataType::Null => "Null", 341 _ => "Other", 342 } 343 } 344 345 /// Manual CSV fallback if CsvWriter is unavailable. 346 fn csv_fallback(df: &DataFrame) -> String { 347 let columns = df.get_columns(); 348 let n_rows = df.height(); 349 350 let mut out = String::new(); 351 352 // Header 353 let header: Vec<String> = columns.iter().map(|c| c.name().to_string()).collect(); 354 out.push_str(&header.join(",")); 355 out.push('\n'); 356 357 // Rows 358 for row_idx in 0..n_rows { 359 let row: Vec<String> = columns 360 .iter() 361 .map(|col| { 362 let cell = format_cell(col, row_idx); 363 // Quote cells containing commas or quotes 364 if cell.contains(',') || cell.contains('"') || cell.contains('\n') { 365 format!("\"{}\"", cell.replace('"', "\"\"")) 366 } else { 367 cell 368 } 369 }) 370 .collect(); 371 out.push_str(&row.join(",")); 372 out.push('\n'); 373 } 374 375 out 376 } 377 378 // --------------------------------------------------------------------------- 379 // Tests 380 // --------------------------------------------------------------------------- 381 382 #[cfg(test)] 383 mod tests { 384 use super::*; 385 use crate::metadata::{FileInfo, SheetInfo}; 386 387 #[test] 388 fn test_format_header() { 389 use crate::format::Format; 390 let info = FileInfo { 391 file_size: 250_000, 392 format: Format::Excel, 393 sheets: vec![ 394 SheetInfo { name: "Sheet1".into(), rows: 100, cols: 5 }, 395 SheetInfo { name: "Sheet2".into(), rows: 50, cols: 3 }, 396 ], 397 }; 398 let out = format_header("test.xlsx", &info); 399 assert!(out.contains("# File: test.xlsx (244 KB) [Excel]")); 400 assert!(out.contains("# Sheets: 2")); 401 } 402 403 #[test] 404 fn test_format_header_single_sheet() { 405 use crate::format::Format; 406 let info = FileInfo { 407 file_size: 1_000, 408 format: Format::Csv, 409 sheets: vec![SheetInfo { name: "data".into(), rows: 10, cols: 3 }], 410 }; 411 let out = format_header("data.csv", &info); 412 assert!(out.contains("[CSV]")); 413 assert!(!out.contains("Sheets")); 414 } 415 416 #[test] 417 fn test_format_data_table() { 418 let s1 = Series::new("name".into(), &["Alice", "Bob"]); 419 let s2 = Series::new("value".into(), &[100i64, 200]); 420 let df = DataFrame::new(vec![s1.into_column(), s2.into_column()]).unwrap(); 421 let out = format_data_table(&df); 422 assert!(out.contains("| name | value |")); 423 assert!(out.contains("| Alice | 100 |")); 424 // Verify pipes are aligned: all lines have same length 425 let lines: Vec<&str> = out.trim().lines().collect(); 426 assert!(lines.len() >= 3); 427 let expected_len = lines[0].len(); 428 for line in &lines { 429 assert_eq!(line.len(), expected_len, "Misaligned: {line}"); 430 } 431 } 432 433 #[test] 434 fn test_format_head_tail_small() { 435 let s = Series::new("x".into(), &[1i64, 2, 3]); 436 let df = DataFrame::new(vec![s.into_column()]).unwrap(); 437 let out = format_head_tail(&df, 25, 25); 438 assert!(!out.contains("omitted")); 439 assert!(out.contains("| 1 ")); 440 assert!(out.contains("| 3 ")); 441 } 442 443 #[test] 444 fn test_format_head_tail_large() { 445 // 60 rows, head=25 tail=25 → 10 omitted 446 let values: Vec<i64> = (1..=60).collect(); 447 let s = Series::new("n".into(), values.as_slice()); 448 let df = DataFrame::new(vec![s.into_column()]).unwrap(); 449 let out = format_head_tail(&df, 25, 25); 450 assert!(out.contains("(10 rows omitted)")); 451 assert!(out.contains("| 1 ")); 452 assert!(out.contains("| 25 ")); 453 assert!(out.contains("| 36 ")); 454 assert!(out.contains("| 60 ")); 455 } 456 457 #[test] 458 fn test_format_schema() { 459 let sheet = SheetInfo { 460 name: "Revenue".into(), 461 rows: 11, // 1 header + 10 data 462 cols: 2, 463 }; 464 let s1 = Series::new("date".into(), &["2024-01-01", "2024-01-02"]); 465 let s2 = Series::new("amount".into(), &[1.0f64, 2.0]); 466 let df = DataFrame::new(vec![s1.into_column(), s2.into_column()]).unwrap(); 467 let out = format_schema(&sheet, &df); 468 assert!(out.contains("## Sheet: Revenue (10 rows x 2 cols)")); 469 assert!(out.contains("| date")); 470 assert!(out.contains("| amount")); 471 assert!(out.contains("String")); 472 assert!(out.contains("Float")); 473 } 474 475 #[test] 476 fn test_format_empty_sheet_completely_empty() { 477 let sheet = SheetInfo { name: "Blank".into(), rows: 0, cols: 0 }; 478 let out = format_empty_sheet(&sheet); 479 assert!(out.contains("(empty)")); 480 } 481 482 #[test] 483 fn test_format_empty_sheet_header_only() { 484 let sheet = SheetInfo { name: "Headers".into(), rows: 1, cols: 3 }; 485 let out = format_empty_sheet(&sheet); 486 assert!(out.contains("(no data rows)")); 487 } 488 489 #[test] 490 fn test_format_csv() { 491 let s1 = Series::new("a".into(), &["hello", "world"]); 492 let s2 = Series::new("b".into(), &[1i64, 2]); 493 let df = DataFrame::new(vec![s1.into_column(), s2.into_column()]).unwrap(); 494 let out = format_csv(&df); 495 assert!(out.contains("a,b")); 496 assert!(out.contains("hello")); 497 assert!(out.contains("world")); 498 } 499 500 #[test] 501 fn test_format_describe() { 502 let s_name = Series::new("name".into(), &["Alice", "Bob", "Carol"]); 503 let s_val = Series::new("value".into(), &[10i64, 20, 30]); 504 let df = DataFrame::new(vec![s_name.into_column(), s_val.into_column()]).unwrap(); 505 let out = format_describe(&df); 506 // Header row contains stat and column names 507 assert!(out.contains("| stat")); 508 assert!(out.contains("name")); 509 assert!(out.contains("value")); 510 // All stat rows are present 511 assert!(out.contains("| count")); 512 assert!(out.contains("| null_count")); 513 assert!(out.contains("| mean")); 514 assert!(out.contains("| std")); 515 assert!(out.contains("| min")); 516 assert!(out.contains("| max")); 517 assert!(out.contains("| median")); 518 assert!(out.contains("| unique")); 519 // Non-numeric column shows "-" for mean 520 assert!(out.contains("| -")); 521 // Verify alignment: all table lines should have same length 522 let table_lines: Vec<&str> = out.trim().lines().filter(|l| l.starts_with('|')).collect(); 523 let expected_len = table_lines[0].len(); 524 for line in &table_lines { 525 assert_eq!(line.len(), expected_len, "Misaligned: {line}"); 526 } 527 } 528 }