diff.rs (24600B)
1 // Diff engine for comparing two Excel sheets. 2 3 use anyhow::{Result, bail}; 4 use polars::prelude::*; 5 use std::collections::HashMap; 6 7 use crate::formatter; 8 9 /// Source file and sheet metadata for display. 10 #[derive(Debug, Clone)] 11 pub struct SheetSource { 12 pub file_name: String, 13 pub sheet_name: String, 14 } 15 16 /// A single row from an added or removed set. 17 #[derive(Debug, Clone)] 18 pub struct DiffRow { 19 pub values: Vec<String>, 20 } 21 22 /// A change in a single cell. 23 #[derive(Debug, Clone)] 24 pub struct CellChange { 25 pub column: String, 26 pub old_value: String, 27 pub new_value: String, 28 } 29 30 /// A row present in both files with cell-level differences. 31 #[derive(Debug, Clone)] 32 pub struct ModifiedRow { 33 pub key: Vec<String>, 34 pub changes: Vec<CellChange>, 35 } 36 37 /// Result of comparing two sheets. 38 #[derive(Debug, Clone)] 39 pub struct DiffResult { 40 pub headers: Vec<String>, 41 pub key_columns: Vec<String>, 42 pub added: Vec<DiffRow>, 43 pub removed: Vec<DiffRow>, 44 pub modified: Vec<ModifiedRow>, 45 pub source_a: SheetSource, 46 pub source_b: SheetSource, 47 } 48 49 impl DiffResult { 50 pub fn has_differences(&self) -> bool { 51 !self.added.is_empty() || !self.removed.is_empty() || !self.modified.is_empty() 52 } 53 } 54 55 /// Options controlling how the diff is performed. 56 #[derive(Debug, Clone, Default)] 57 pub struct DiffOptions { 58 pub key_columns: Vec<String>, 59 pub tolerance: Option<f64>, 60 } 61 62 // --------------------------------------------------------------------------- 63 // Helper functions 64 // --------------------------------------------------------------------------- 65 66 /// Format a cell value for display. Returns empty string for null. 67 fn cell_to_string(col: &Column, idx: usize) -> String { 68 match col.get(idx) { 69 Ok(AnyValue::Null) | Err(_) => String::new(), 70 Ok(v) => formatter::format_any_value(&v), 71 } 72 } 73 74 /// Format a cell value for hashing. Uses a sentinel for null so that null 75 /// and empty string produce different keys. 76 fn cell_to_key_part(col: &Column, idx: usize) -> String { 77 match col.get(idx) { 78 Ok(AnyValue::Null) | Err(_) => "\x01NULL\x01".to_string(), 79 Ok(v) => formatter::format_any_value(&v), 80 } 81 } 82 83 /// Build a string key for an entire row by joining all column values. 84 fn row_to_key(df: &DataFrame, row_idx: usize) -> String { 85 df.get_columns() 86 .iter() 87 .map(|col| cell_to_key_part(col, row_idx)) 88 .collect::<Vec<_>>() 89 .join("\0") 90 } 91 92 /// Collect display values for every column in a row. 93 fn row_to_strings(df: &DataFrame, row_idx: usize) -> Vec<String> { 94 df.get_columns() 95 .iter() 96 .map(|col| cell_to_string(col, row_idx)) 97 .collect() 98 } 99 100 // --------------------------------------------------------------------------- 101 // Positional diff 102 // --------------------------------------------------------------------------- 103 104 /// Compare two DataFrames positionally (no key columns). 105 /// 106 /// Uses multiset comparison: each unique row is tracked by frequency. 107 /// Rows present in A but not (or fewer times) in B are "removed"; 108 /// rows present in B but not (or fewer times) in A are "added". 109 pub fn diff_positional( 110 df_a: &DataFrame, 111 df_b: &DataFrame, 112 _opts: &DiffOptions, 113 source_a: SheetSource, 114 source_b: SheetSource, 115 ) -> Result<DiffResult> { 116 // Determine headers — use the longer header set. 117 let headers_a: Vec<String> = df_a.get_column_names().iter().map(|s| s.to_string()).collect(); 118 let headers_b: Vec<String> = df_b.get_column_names().iter().map(|s| s.to_string()).collect(); 119 120 let headers = if headers_b.len() > headers_a.len() { 121 if headers_a.len() != headers_b.len() { 122 eprintln!( 123 "Warning: column count differs ({} vs {}), using wider header set", 124 headers_a.len(), 125 headers_b.len() 126 ); 127 } 128 headers_b.clone() 129 } else { 130 if headers_a.len() != headers_b.len() { 131 eprintln!( 132 "Warning: column count differs ({} vs {}), using wider header set", 133 headers_a.len(), 134 headers_b.len() 135 ); 136 } 137 headers_a.clone() 138 }; 139 140 let num_headers = headers.len(); 141 142 // Build frequency maps: key → list of row indices (so we can consume them). 143 let mut freq_a: HashMap<String, Vec<usize>> = HashMap::new(); 144 for i in 0..df_a.height() { 145 let key = row_to_key(df_a, i); 146 freq_a.entry(key).or_default().push(i); 147 } 148 149 let mut freq_b: HashMap<String, Vec<usize>> = HashMap::new(); 150 for i in 0..df_b.height() { 151 let key = row_to_key(df_b, i); 152 freq_b.entry(key).or_default().push(i); 153 } 154 155 let mut removed = Vec::new(); 156 let mut added = Vec::new(); 157 158 // Walk A: for each row, try to consume a matching row from B. 159 for i in 0..df_a.height() { 160 let key = row_to_key(df_a, i); 161 let consumed = freq_b 162 .get_mut(&key) 163 .and_then(|indices| indices.pop()) 164 .is_some(); 165 if !consumed { 166 let mut vals = row_to_strings(df_a, i); 167 vals.resize(num_headers, String::new()); 168 removed.push(DiffRow { values: vals }); 169 } 170 } 171 172 // Walk B: for each row, try to consume a matching row from A. 173 for i in 0..df_b.height() { 174 let key = row_to_key(df_b, i); 175 let consumed = freq_a 176 .get_mut(&key) 177 .and_then(|indices| indices.pop()) 178 .is_some(); 179 if !consumed { 180 let mut vals = row_to_strings(df_b, i); 181 vals.resize(num_headers, String::new()); 182 added.push(DiffRow { values: vals }); 183 } 184 } 185 186 Ok(DiffResult { 187 headers, 188 key_columns: vec![], 189 added, 190 removed, 191 modified: vec![], 192 source_a, 193 source_b, 194 }) 195 } 196 197 // --------------------------------------------------------------------------- 198 // Key-based diff 199 // --------------------------------------------------------------------------- 200 201 /// A row indexed by its key columns. 202 struct KeyedRow { 203 values: Vec<String>, 204 key_values: Vec<String>, 205 } 206 207 /// Build a map from composite key string to KeyedRow for every row in the DataFrame. 208 fn build_key_map( 209 df: &DataFrame, 210 key_indices: &[usize], 211 columns: &[Column], 212 ) -> HashMap<String, KeyedRow> { 213 let mut map = HashMap::new(); 214 for i in 0..df.height() { 215 let key_values: Vec<String> = key_indices 216 .iter() 217 .map(|&ki| cell_to_string(&columns[ki], i)) 218 .collect(); 219 let composite_key = key_values.join("\0"); 220 let values: Vec<String> = columns.iter().map(|col| cell_to_string(col, i)).collect(); 221 map.insert( 222 composite_key, 223 KeyedRow { 224 values, 225 key_values, 226 }, 227 ); 228 } 229 map 230 } 231 232 /// Warn on stderr when duplicate keys are found. 233 fn check_duplicate_keys( 234 df: &DataFrame, 235 key_indices: &[usize], 236 columns: &[Column], 237 source: &SheetSource, 238 ) { 239 let mut seen: HashMap<String, usize> = HashMap::new(); 240 for i in 0..df.height() { 241 let key: String = key_indices 242 .iter() 243 .map(|&ki| cell_to_string(&columns[ki], i)) 244 .collect::<Vec<_>>() 245 .join("\0"); 246 let count = seen.entry(key.clone()).or_insert(0); 247 *count += 1; 248 if *count == 2 { 249 let display_key = key.replace('\0', ", "); 250 eprintln!( 251 "Warning: duplicate key [{}] in {}:{}", 252 display_key, source.file_name, source.sheet_name 253 ); 254 } 255 } 256 } 257 258 /// Check whether a polars DataType is a float type. 259 fn is_float_dtype(dt: &DataType) -> bool { 260 matches!(dt, DataType::Float32 | DataType::Float64) 261 } 262 263 /// Check whether a polars DataType is an integer type. 264 fn is_int_dtype(dt: &DataType) -> bool { 265 matches!( 266 dt, 267 DataType::Int8 268 | DataType::Int16 269 | DataType::Int32 270 | DataType::Int64 271 | DataType::UInt8 272 | DataType::UInt16 273 | DataType::UInt32 274 | DataType::UInt64 275 ) 276 } 277 278 /// Compare two string-rendered values with optional numeric tolerance. 279 /// 280 /// Rules: 281 /// - NaN == NaN is true. 282 /// - NaN vs non-NaN is false. 283 /// - Pure int+int columns use exact comparison (no tolerance applied). 284 /// - At least one float column applies tolerance. 285 /// - Otherwise exact string comparison. 286 fn values_equal_with_tolerance( 287 val_a: &str, 288 val_b: &str, 289 tolerance: f64, 290 df_a: &DataFrame, 291 df_b: &DataFrame, 292 col_name: &str, 293 ) -> bool { 294 let parsed_a = val_a.parse::<f64>(); 295 let parsed_b = val_b.parse::<f64>(); 296 297 match (parsed_a, parsed_b) { 298 (Ok(a), Ok(b)) => { 299 if a.is_nan() && b.is_nan() { 300 return true; 301 } 302 if a.is_nan() || b.is_nan() { 303 return false; 304 } 305 306 let dt_a = df_a 307 .column(col_name) 308 .map(|c| c.dtype().clone()) 309 .unwrap_or(DataType::String); 310 let dt_b = df_b 311 .column(col_name) 312 .map(|c| c.dtype().clone()) 313 .unwrap_or(DataType::String); 314 315 if is_int_dtype(&dt_a) && is_int_dtype(&dt_b) { 316 val_a == val_b 317 } else if is_float_dtype(&dt_a) || is_float_dtype(&dt_b) { 318 (a - b).abs() <= tolerance 319 } else { 320 val_a == val_b 321 } 322 } 323 _ => val_a == val_b, 324 } 325 } 326 327 /// Compare non-key columns of two keyed rows and return cell-level changes. 328 #[allow(clippy::too_many_arguments)] 329 fn compare_rows( 330 df_a: &DataFrame, 331 df_b: &DataFrame, 332 headers_a: &[String], 333 headers_b: &[String], 334 row_a: &KeyedRow, 335 row_b: &KeyedRow, 336 common_columns: &[String], 337 opts: &DiffOptions, 338 ) -> Vec<CellChange> { 339 let mut changes = Vec::new(); 340 for col_name in common_columns { 341 let idx_a = headers_a.iter().position(|h| h == col_name); 342 let idx_b = headers_b.iter().position(|h| h == col_name); 343 let val_a = idx_a 344 .map(|i| row_a.values.get(i).cloned().unwrap_or_default()) 345 .unwrap_or_default(); 346 let val_b = idx_b 347 .map(|i| row_b.values.get(i).cloned().unwrap_or_default()) 348 .unwrap_or_default(); 349 350 let equal = if let Some(tol) = opts.tolerance { 351 values_equal_with_tolerance(&val_a, &val_b, tol, df_a, df_b, col_name) 352 } else { 353 val_a == val_b 354 }; 355 356 if !equal { 357 changes.push(CellChange { 358 column: col_name.clone(), 359 old_value: val_a, 360 new_value: val_b, 361 }); 362 } 363 } 364 changes 365 } 366 367 /// Compare two DataFrames using key columns. 368 pub fn diff_keyed( 369 df_a: &DataFrame, 370 df_b: &DataFrame, 371 opts: &DiffOptions, 372 source_a: SheetSource, 373 source_b: SheetSource, 374 ) -> Result<DiffResult> { 375 let columns_a = df_a.get_columns(); 376 let columns_b = df_b.get_columns(); 377 let headers_a: Vec<String> = df_a.get_column_names().iter().map(|s| s.to_string()).collect(); 378 let headers_b: Vec<String> = df_b.get_column_names().iter().map(|s| s.to_string()).collect(); 379 380 // Resolve key column indices in both frames. 381 let mut key_indices_a = Vec::new(); 382 let mut key_indices_b = Vec::new(); 383 for key_col in &opts.key_columns { 384 match headers_a.iter().position(|h| h == key_col) { 385 Some(idx) => key_indices_a.push(idx), 386 None => bail!("Key column '{}' not found in {}", key_col, source_a.file_name), 387 } 388 match headers_b.iter().position(|h| h == key_col) { 389 Some(idx) => key_indices_b.push(idx), 390 None => bail!("Key column '{}' not found in {}", key_col, source_b.file_name), 391 } 392 } 393 394 // Find non-key columns. 395 let non_key_a: Vec<String> = headers_a 396 .iter() 397 .filter(|h| !opts.key_columns.contains(h)) 398 .cloned() 399 .collect(); 400 let non_key_b: Vec<String> = headers_b 401 .iter() 402 .filter(|h| !opts.key_columns.contains(h)) 403 .cloned() 404 .collect(); 405 406 // Common non-key columns (for modification detection). 407 let common_columns: Vec<String> = non_key_a 408 .iter() 409 .filter(|h| non_key_b.contains(h)) 410 .cloned() 411 .collect(); 412 413 // Warn about columns only in one file. 414 for col in &non_key_a { 415 if !non_key_b.contains(col) { 416 eprintln!("Warning: column '{}' only in {}", col, source_a.file_name); 417 } 418 } 419 for col in &non_key_b { 420 if !non_key_a.contains(col) { 421 eprintln!("Warning: column '{}' only in {}", col, source_b.file_name); 422 } 423 } 424 425 // Build output headers: key columns + all from A non-key + B-only non-key. 426 let mut headers = opts.key_columns.clone(); 427 headers.extend(non_key_a.iter().cloned()); 428 for col in &non_key_b { 429 if !non_key_a.contains(col) { 430 headers.push(col.clone()); 431 } 432 } 433 434 // Check for duplicate keys. 435 check_duplicate_keys(df_a, &key_indices_a, columns_a, &source_a); 436 check_duplicate_keys(df_b, &key_indices_b, columns_b, &source_b); 437 438 // Build key maps. 439 let map_a = build_key_map(df_a, &key_indices_a, columns_a); 440 let map_b = build_key_map(df_b, &key_indices_b, columns_b); 441 442 let mut removed = Vec::new(); 443 let mut added = Vec::new(); 444 let mut modified = Vec::new(); 445 446 // Keys in A but not in B → removed. 447 for (composite_key, row_a) in &map_a { 448 if !map_b.contains_key(composite_key) { 449 let mut vals = Vec::new(); 450 for h in &headers { 451 if let Some(idx) = headers_a.iter().position(|ha| ha == h) { 452 vals.push(row_a.values.get(idx).cloned().unwrap_or_default()); 453 } else { 454 vals.push(String::new()); 455 } 456 } 457 removed.push(DiffRow { values: vals }); 458 } 459 } 460 461 // Keys in B but not in A → added. 462 for (composite_key, row_b) in &map_b { 463 if !map_a.contains_key(composite_key) { 464 let mut vals = Vec::new(); 465 for h in &headers { 466 if let Some(idx) = headers_b.iter().position(|hb| hb == h) { 467 vals.push(row_b.values.get(idx).cloned().unwrap_or_default()); 468 } else { 469 vals.push(String::new()); 470 } 471 } 472 added.push(DiffRow { values: vals }); 473 } 474 } 475 476 // Keys in both → compare for modifications. 477 for (composite_key, row_a) in &map_a { 478 if let Some(row_b) = map_b.get(composite_key) { 479 let changes = compare_rows( 480 df_a, 481 df_b, 482 &headers_a, 483 &headers_b, 484 row_a, 485 row_b, 486 &common_columns, 487 opts, 488 ); 489 if !changes.is_empty() { 490 modified.push(ModifiedRow { 491 key: row_a.key_values.clone(), 492 changes, 493 }); 494 } 495 } 496 } 497 498 Ok(DiffResult { 499 headers, 500 key_columns: opts.key_columns.clone(), 501 added, 502 removed, 503 modified, 504 source_a, 505 source_b, 506 }) 507 } 508 509 // --------------------------------------------------------------------------- 510 // Entry point 511 // --------------------------------------------------------------------------- 512 513 /// Compare two DataFrames, dispatching to positional or key-based diff 514 /// depending on whether key columns are specified. 515 pub fn diff_sheets( 516 df_a: &DataFrame, 517 df_b: &DataFrame, 518 opts: &DiffOptions, 519 source_a: SheetSource, 520 source_b: SheetSource, 521 ) -> Result<DiffResult> { 522 if opts.key_columns.is_empty() { 523 diff_positional(df_a, df_b, opts, source_a, source_b) 524 } else { 525 diff_keyed(df_a, df_b, opts, source_a, source_b) 526 } 527 } 528 529 #[cfg(test)] 530 mod tests { 531 use super::*; 532 533 fn test_source_a() -> SheetSource { 534 SheetSource { 535 file_name: "a.xlsx".into(), 536 sheet_name: "Sheet1".into(), 537 } 538 } 539 540 fn test_source_b() -> SheetSource { 541 SheetSource { 542 file_name: "b.xlsx".into(), 543 sheet_name: "Sheet1".into(), 544 } 545 } 546 547 // ---- Positional diff tests ---- 548 549 #[test] 550 fn test_positional_no_diff() { 551 let df_a = df! { 552 "name" => &["Alice", "Bob"], 553 "score" => &[100, 200], 554 } 555 .unwrap(); 556 let df_b = df_a.clone(); 557 let opts = DiffOptions::default(); 558 559 let result = diff_positional(&df_a, &df_b, &opts, test_source_a(), test_source_b()).unwrap(); 560 561 assert!(!result.has_differences()); 562 assert!(result.added.is_empty()); 563 assert!(result.removed.is_empty()); 564 assert!(result.modified.is_empty()); 565 } 566 567 #[test] 568 fn test_positional_added_removed() { 569 let df_a = df! { 570 "name" => &["Alice", "Bob"], 571 } 572 .unwrap(); 573 let df_b = df! { 574 "name" => &["Alice", "Charlie"], 575 } 576 .unwrap(); 577 let opts = DiffOptions::default(); 578 579 let result = diff_positional(&df_a, &df_b, &opts, test_source_a(), test_source_b()).unwrap(); 580 581 assert!(result.has_differences()); 582 assert_eq!(result.removed.len(), 1); 583 assert_eq!(result.removed[0].values, vec!["Bob"]); 584 assert_eq!(result.added.len(), 1); 585 assert_eq!(result.added[0].values, vec!["Charlie"]); 586 } 587 588 #[test] 589 fn test_positional_duplicate_rows() { 590 let df_a = df! { 591 "val" => &["A", "A", "A"], 592 } 593 .unwrap(); 594 let df_b = df! { 595 "val" => &["A", "A"], 596 } 597 .unwrap(); 598 let opts = DiffOptions::default(); 599 600 let result = diff_positional(&df_a, &df_b, &opts, test_source_a(), test_source_b()).unwrap(); 601 602 assert_eq!(result.removed.len(), 1); 603 assert_eq!(result.removed[0].values, vec!["A"]); 604 assert!(result.added.is_empty()); 605 } 606 607 // ---- Key-based diff tests ---- 608 609 #[test] 610 fn test_keyed_no_diff() { 611 let df_a = df! { 612 "id" => &[1, 2], 613 "name" => &["Alice", "Bob"], 614 } 615 .unwrap(); 616 let df_b = df_a.clone(); 617 let opts = DiffOptions { 618 key_columns: vec!["id".into()], 619 tolerance: None, 620 }; 621 622 let result = diff_keyed(&df_a, &df_b, &opts, test_source_a(), test_source_b()).unwrap(); 623 624 assert!(!result.has_differences()); 625 } 626 627 #[test] 628 fn test_keyed_added_removed() { 629 let df_a = df! { 630 "id" => &[1, 2], 631 "name" => &["Alice", "Bob"], 632 } 633 .unwrap(); 634 let df_b = df! { 635 "id" => &[2, 3], 636 "name" => &["Bob", "Charlie"], 637 } 638 .unwrap(); 639 let opts = DiffOptions { 640 key_columns: vec!["id".into()], 641 tolerance: None, 642 }; 643 644 let result = diff_keyed(&df_a, &df_b, &opts, test_source_a(), test_source_b()).unwrap(); 645 646 assert_eq!(result.removed.len(), 1); 647 assert!(result.removed[0].values.contains(&"1".to_string())); 648 assert!(result.removed[0].values.contains(&"Alice".to_string())); 649 650 assert_eq!(result.added.len(), 1); 651 assert!(result.added[0].values.contains(&"3".to_string())); 652 assert!(result.added[0].values.contains(&"Charlie".to_string())); 653 } 654 655 #[test] 656 fn test_keyed_modified() { 657 let df_a = df! { 658 "id" => &[1, 2], 659 "score" => &[100, 200], 660 } 661 .unwrap(); 662 let df_b = df! { 663 "id" => &[1, 2], 664 "score" => &[100, 250], 665 } 666 .unwrap(); 667 let opts = DiffOptions { 668 key_columns: vec!["id".into()], 669 tolerance: None, 670 }; 671 672 let result = diff_keyed(&df_a, &df_b, &opts, test_source_a(), test_source_b()).unwrap(); 673 674 assert!(result.added.is_empty()); 675 assert!(result.removed.is_empty()); 676 assert_eq!(result.modified.len(), 1); 677 678 let m = &result.modified[0]; 679 assert_eq!(m.key, vec!["2"]); 680 assert_eq!(m.changes.len(), 1); 681 assert_eq!(m.changes[0].column, "score"); 682 assert_eq!(m.changes[0].old_value, "200"); 683 assert_eq!(m.changes[0].new_value, "250"); 684 } 685 686 #[test] 687 fn test_keyed_composite_key() { 688 let df_a = df! { 689 "date" => &["2024-01-01", "2024-01-01", "2024-01-02"], 690 "ticker" => &["AAPL", "GOOG", "AAPL"], 691 "price" => &[150.0, 140.0, 151.0], 692 } 693 .unwrap(); 694 let df_b = df! { 695 "date" => &["2024-01-01", "2024-01-01", "2024-01-02"], 696 "ticker" => &["AAPL", "GOOG", "AAPL"], 697 "price" => &[150.0, 142.0, 151.0], 698 } 699 .unwrap(); 700 let opts = DiffOptions { 701 key_columns: vec!["date".into(), "ticker".into()], 702 tolerance: None, 703 }; 704 705 let result = diff_keyed(&df_a, &df_b, &opts, test_source_a(), test_source_b()).unwrap(); 706 707 assert!(result.added.is_empty()); 708 assert!(result.removed.is_empty()); 709 assert_eq!(result.modified.len(), 1); 710 711 let m = &result.modified[0]; 712 assert_eq!(m.key, vec!["2024-01-01", "GOOG"]); 713 assert_eq!(m.changes[0].column, "price"); 714 assert_eq!(m.changes[0].old_value, "140"); 715 assert_eq!(m.changes[0].new_value, "142"); 716 } 717 718 // ---- Tolerance tests ---- 719 720 #[test] 721 fn test_keyed_tolerance_within() { 722 let df_a = df! { 723 "id" => &[1], 724 "price" => &[100.001_f64], 725 } 726 .unwrap(); 727 let df_b = df! { 728 "id" => &[1], 729 "price" => &[100.002_f64], 730 } 731 .unwrap(); 732 let opts = DiffOptions { 733 key_columns: vec!["id".into()], 734 tolerance: Some(0.01), 735 }; 736 737 let result = diff_keyed(&df_a, &df_b, &opts, test_source_a(), test_source_b()).unwrap(); 738 739 assert!(!result.has_differences()); 740 } 741 742 #[test] 743 fn test_keyed_tolerance_exceeded() { 744 let df_a = df! { 745 "id" => &[1], 746 "price" => &[100.0_f64], 747 } 748 .unwrap(); 749 let df_b = df! { 750 "id" => &[1], 751 "price" => &[100.05_f64], 752 } 753 .unwrap(); 754 let opts = DiffOptions { 755 key_columns: vec!["id".into()], 756 tolerance: Some(0.01), 757 }; 758 759 let result = diff_keyed(&df_a, &df_b, &opts, test_source_a(), test_source_b()).unwrap(); 760 761 assert_eq!(result.modified.len(), 1); 762 assert_eq!(result.modified[0].changes[0].column, "price"); 763 } 764 765 #[test] 766 fn test_keyed_nan_handling() { 767 let df_a = df! { 768 "id" => &[1], 769 "value" => &[f64::NAN], 770 } 771 .unwrap(); 772 let df_b = df! { 773 "id" => &[1], 774 "value" => &[f64::NAN], 775 } 776 .unwrap(); 777 let opts = DiffOptions { 778 key_columns: vec!["id".into()], 779 tolerance: Some(0.01), 780 }; 781 782 let result = diff_keyed(&df_a, &df_b, &opts, test_source_a(), test_source_b()).unwrap(); 783 784 assert!(!result.has_differences(), "NaN vs NaN should be treated as equal"); 785 } 786 787 // ---- diff_sheets entry point tests ---- 788 789 #[test] 790 fn test_diff_sheets_positional() { 791 let df_a = df! { 792 "name" => &["Alice", "Bob"], 793 } 794 .unwrap(); 795 let df_b = df! { 796 "name" => &["Alice", "Charlie"], 797 } 798 .unwrap(); 799 let opts = DiffOptions::default(); // No key columns → positional. 800 801 let result = diff_sheets(&df_a, &df_b, &opts, test_source_a(), test_source_b()).unwrap(); 802 803 assert!(result.key_columns.is_empty()); 804 assert_eq!(result.removed.len(), 1); 805 assert_eq!(result.added.len(), 1); 806 } 807 808 #[test] 809 fn test_diff_sheets_keyed() { 810 let df_a = df! { 811 "id" => &[1, 2], 812 "score" => &[100, 200], 813 } 814 .unwrap(); 815 let df_b = df! { 816 "id" => &[1, 2], 817 "score" => &[100, 250], 818 } 819 .unwrap(); 820 let opts = DiffOptions { 821 key_columns: vec!["id".into()], 822 tolerance: None, 823 }; 824 825 let result = diff_sheets(&df_a, &df_b, &opts, test_source_a(), test_source_b()).unwrap(); 826 827 assert_eq!(result.key_columns, vec!["id"]); 828 assert_eq!(result.modified.len(), 1); 829 } 830 }