dt-cli-tools

CLI tools for viewing, filtering, and comparing tabular data files
Log | Files | Refs | README | LICENSE

dtdiff.rs (11954B)


      1 use std::io::IsTerminal;
      2 use std::path::PathBuf;
      3 use std::process;
      4 
      5 use anyhow::{Result, bail};
      6 use clap::Parser;
      7 use serde_json::{Map, Value, json};
      8 
      9 use dtcore::diff::{DiffOptions, DiffResult, SheetSource};
     10 use dtcore::format::{detect_format, Format};
     11 use dtcore::reader::{ReadOptions, read_file};
     12 
     13 #[derive(Parser)]
     14 #[command(
     15     name = "dtdiff",
     16     about = "Compare two tabular data files and show differences",
     17     version
     18 )]
     19 struct Args {
     20     /// First file to compare
     21     file_a: String,
     22 
     23     /// Second file to compare
     24     file_b: String,
     25 
     26     /// Override format detection (applies to both files)
     27     #[arg(long, value_name = "FMT")]
     28     format: Option<String>,
     29 
     30     /// Select sheet by name or index (Excel only)
     31     #[arg(long, value_name = "NAME|INDEX")]
     32     sheet: Option<String>,
     33 
     34     /// Key column(s) for matched comparison (comma-separated)
     35     #[arg(long, value_name = "COL")]
     36     key: Option<String>,
     37 
     38     /// Float comparison tolerance (default: 1e-10)
     39     #[arg(long)]
     40     tolerance: Option<f64>,
     41 
     42     /// Output as JSON
     43     #[arg(long)]
     44     json: bool,
     45 
     46     /// Output as CSV
     47     #[arg(long)]
     48     csv: bool,
     49 
     50     /// Disable colored output
     51     #[arg(long)]
     52     no_color: bool,
     53 }
     54 
     55 // ---------------------------------------------------------------------------
     56 // Output formatters (ported from xldiff.rs)
     57 // ---------------------------------------------------------------------------
     58 
     59 /// Format a row's values inline: `Name: "Alice"  Score: "90"`
     60 fn format_row_inline(headers: &[String], values: &[String]) -> String {
     61     headers
     62         .iter()
     63         .zip(values.iter())
     64         .map(|(h, v)| format!("{}: \"{}\"", h, v))
     65         .collect::<Vec<_>>()
     66         .join("  ")
     67 }
     68 
     69 /// Format diff result as colored (or plain) text output.
     70 fn format_text(result: &DiffResult, color: bool) -> String {
     71     if !result.has_differences() {
     72         return "No differences found.\n".to_string();
     73     }
     74 
     75     let (red, green, yellow, reset) = if color {
     76         ("\x1b[31m", "\x1b[32m", "\x1b[33m", "\x1b[0m")
     77     } else {
     78         ("", "", "", "")
     79     };
     80 
     81     let mut out = String::new();
     82 
     83     // Header
     84     out.push_str(&format!(
     85         "--- {} ({})\n+++ {} ({})\n\n",
     86         result.source_a.sheet_name,
     87         result.source_a.file_name,
     88         result.source_b.sheet_name,
     89         result.source_b.file_name,
     90     ));
     91 
     92     // Summary
     93     out.push_str(&format!(
     94         "Added: {} | Removed: {} | Modified: {}\n\n",
     95         result.added.len(),
     96         result.removed.len(),
     97         result.modified.len(),
     98     ));
     99 
    100     // Removed rows
    101     for row in &result.removed {
    102         out.push_str(&format!(
    103             "{}- {}{}",
    104             red,
    105             format_row_inline(&result.headers, &row.values),
    106             reset,
    107         ));
    108         out.push('\n');
    109     }
    110 
    111     // Added rows
    112     for row in &result.added {
    113         out.push_str(&format!(
    114             "{}+ {}{}",
    115             green,
    116             format_row_inline(&result.headers, &row.values),
    117             reset,
    118         ));
    119         out.push('\n');
    120     }
    121 
    122     // Modified rows
    123     for m in &result.modified {
    124         let key_display: Vec<String> = result
    125             .key_columns
    126             .iter()
    127             .zip(m.key.iter())
    128             .map(|(col, val)| format!("{}: \"{}\"", col, val))
    129             .collect();
    130         out.push_str(&format!(
    131             "{}~ {}{}",
    132             yellow,
    133             key_display.join("  "),
    134             reset,
    135         ));
    136         out.push('\n');
    137         for change in &m.changes {
    138             out.push_str(&format!(
    139                 "    {}: \"{}\" \u{2192} \"{}\"\n",
    140                 change.column, change.old_value, change.new_value,
    141             ));
    142         }
    143     }
    144 
    145     out
    146 }
    147 
    148 /// Format diff result as JSON.
    149 fn format_json(result: &DiffResult) -> String {
    150     let added: Vec<Value> = result
    151         .added
    152         .iter()
    153         .map(|row| {
    154             let mut map = Map::new();
    155             for (h, v) in result.headers.iter().zip(row.values.iter()) {
    156                 map.insert(h.clone(), Value::String(v.clone()));
    157             }
    158             Value::Object(map)
    159         })
    160         .collect();
    161 
    162     let removed: Vec<Value> = result
    163         .removed
    164         .iter()
    165         .map(|row| {
    166             let mut map = Map::new();
    167             for (h, v) in result.headers.iter().zip(row.values.iter()) {
    168                 map.insert(h.clone(), Value::String(v.clone()));
    169             }
    170             Value::Object(map)
    171         })
    172         .collect();
    173 
    174     let modified: Vec<Value> = result
    175         .modified
    176         .iter()
    177         .map(|m| {
    178             let mut key_map = Map::new();
    179             for (col, val) in result.key_columns.iter().zip(m.key.iter()) {
    180                 key_map.insert(col.clone(), Value::String(val.clone()));
    181             }
    182             let changes: Vec<Value> = m
    183                 .changes
    184                 .iter()
    185                 .map(|c| {
    186                     json!({
    187                         "column": c.column,
    188                         "old": c.old_value,
    189                         "new": c.new_value,
    190                     })
    191                 })
    192                 .collect();
    193             json!({
    194                 "key": Value::Object(key_map),
    195                 "changes": changes,
    196             })
    197         })
    198         .collect();
    199 
    200     let output = json!({
    201         "added": added,
    202         "removed": removed,
    203         "modified": modified,
    204     });
    205 
    206     serde_json::to_string_pretty(&output).unwrap() + "\n"
    207 }
    208 
    209 /// Quote a value per RFC 4180: if it contains comma, quote, or newline, wrap
    210 /// in double quotes and escape any internal quotes by doubling them.
    211 fn csv_quote(value: &str) -> String {
    212     if value.contains(',') || value.contains('"') || value.contains('\n') {
    213         format!("\"{}\"", value.replace('"', "\"\""))
    214     } else {
    215         value.to_string()
    216     }
    217 }
    218 
    219 /// Build a CSV row from a slice of values.
    220 fn csv_row(values: &[String]) -> String {
    221     values.iter().map(|v| csv_quote(v)).collect::<Vec<_>>().join(",")
    222 }
    223 
    224 /// Format diff result as CSV.
    225 ///
    226 /// Header: _status, col1, col2, ..., _old_col1, _old_col2, ...
    227 /// Added rows: "added" + values + empty _old_ columns
    228 /// Removed rows: "removed" + values + empty _old_ columns
    229 /// Modified rows: "modified" + new values + old values in _old_ columns
    230 fn format_csv_output(result: &DiffResult) -> String {
    231     let mut out = String::new();
    232 
    233     // Build header
    234     let mut header_parts: Vec<String> = vec!["_status".to_string()];
    235     for h in &result.headers {
    236         header_parts.push(h.clone());
    237     }
    238     for h in &result.headers {
    239         header_parts.push(format!("_old_{}", h));
    240     }
    241     out.push_str(&csv_row(&header_parts));
    242     out.push('\n');
    243 
    244     let empty_cols: Vec<String> = result.headers.iter().map(|_| String::new()).collect();
    245 
    246     // Removed rows
    247     for row in &result.removed {
    248         let mut parts: Vec<String> = vec!["removed".to_string()];
    249         parts.extend(row.values.iter().cloned());
    250         while parts.len() < 1 + result.headers.len() {
    251             parts.push(String::new());
    252         }
    253         parts.extend(empty_cols.iter().cloned());
    254         out.push_str(&csv_row(&parts));
    255         out.push('\n');
    256     }
    257 
    258     // Added rows
    259     for row in &result.added {
    260         let mut parts: Vec<String> = vec!["added".to_string()];
    261         parts.extend(row.values.iter().cloned());
    262         while parts.len() < 1 + result.headers.len() {
    263             parts.push(String::new());
    264         }
    265         parts.extend(empty_cols.iter().cloned());
    266         out.push_str(&csv_row(&parts));
    267         out.push('\n');
    268     }
    269 
    270     // Modified rows
    271     for m in &result.modified {
    272         let mut main_cols: Vec<String> = Vec::new();
    273         let mut old_cols: Vec<String> = Vec::new();
    274 
    275         for h in &result.headers {
    276             if let Some(key_idx) = result.key_columns.iter().position(|k| k == h) {
    277                 main_cols.push(m.key.get(key_idx).cloned().unwrap_or_default());
    278                 old_cols.push(String::new());
    279             } else if let Some(change) = m.changes.iter().find(|c| c.column == *h) {
    280                 main_cols.push(change.new_value.clone());
    281                 old_cols.push(change.old_value.clone());
    282             } else {
    283                 // Unchanged non-key column — leave empty in both
    284                 main_cols.push(String::new());
    285                 old_cols.push(String::new());
    286             }
    287         }
    288 
    289         let mut parts: Vec<String> = vec!["modified".to_string()];
    290         parts.extend(main_cols);
    291         parts.extend(old_cols);
    292         out.push_str(&csv_row(&parts));
    293         out.push('\n');
    294     }
    295 
    296     out
    297 }
    298 
    299 // ---------------------------------------------------------------------------
    300 // run / main
    301 // ---------------------------------------------------------------------------
    302 
    303 fn run(args: Args) -> Result<()> {
    304     let path_a = PathBuf::from(&args.file_a);
    305     let path_b = PathBuf::from(&args.file_b);
    306 
    307     // Validate files exist
    308     if !path_a.exists() {
    309         bail!("file not found: {}", path_a.display());
    310     }
    311     if !path_b.exists() {
    312         bail!("file not found: {}", path_b.display());
    313     }
    314 
    315     // Detect formats
    316     let fmt_a = detect_format(&path_a, args.format.as_deref())?;
    317     let fmt_b = detect_format(&path_b, args.format.as_deref())?;
    318 
    319     // Enforce same-format constraint
    320     if !fmt_a.same_family(fmt_b) {
    321         bail!(
    322             "files have incompatible formats: {:?} vs {:?}. Both files must use the same format family.",
    323             fmt_a,
    324             fmt_b
    325         );
    326     }
    327 
    328     // Build read options
    329     let opts_a = ReadOptions {
    330         sheet: args.sheet.clone(),
    331         skip_rows: None,
    332         separator: None,
    333     };
    334     let opts_b = ReadOptions {
    335         sheet: args.sheet.clone(),
    336         skip_rows: None,
    337         separator: None,
    338     };
    339 
    340     // Read DataFrames
    341     let df_a = read_file(&path_a, fmt_a, &opts_a)?;
    342     let df_b = read_file(&path_b, fmt_b, &opts_b)?;
    343 
    344     // Resolve key columns
    345     let key_columns: Vec<String> = if let Some(ref key_str) = args.key {
    346         key_str.split(',').map(|s| s.trim().to_string()).collect()
    347     } else {
    348         vec![]
    349     };
    350 
    351     // Build source labels
    352     let file_name_a = path_a
    353         .file_name()
    354         .map(|s| s.to_string_lossy().to_string())
    355         .unwrap_or_else(|| args.file_a.clone());
    356     let file_name_b = path_b
    357         .file_name()
    358         .map(|s| s.to_string_lossy().to_string())
    359         .unwrap_or_else(|| args.file_b.clone());
    360 
    361     // Use file name as "sheet name" for non-Excel formats; for Excel use the
    362     // sheet name from opts (or a placeholder if none was specified).
    363     let sheet_name_a = if fmt_a == Format::Excel {
    364         args.sheet.clone().unwrap_or_else(|| file_name_a.clone())
    365     } else {
    366         file_name_a.clone()
    367     };
    368     let sheet_name_b = if fmt_b == Format::Excel {
    369         args.sheet.clone().unwrap_or_else(|| file_name_b.clone())
    370     } else {
    371         file_name_b.clone()
    372     };
    373 
    374     let source_a = SheetSource {
    375         file_name: file_name_a,
    376         sheet_name: sheet_name_a,
    377     };
    378     let source_b = SheetSource {
    379         file_name: file_name_b,
    380         sheet_name: sheet_name_b,
    381     };
    382 
    383     let diff_opts = DiffOptions {
    384         key_columns,
    385         tolerance: args.tolerance,
    386     };
    387 
    388     // Run diff
    389     let result = dtcore::diff::diff_sheets(&df_a, &df_b, &diff_opts, source_a, source_b)?;
    390 
    391     // TTY detection for color
    392     let use_color = !args.no_color && std::io::stdout().is_terminal();
    393 
    394     // Format output: --json and --csv are mutually exclusive flags; default is text
    395     let output = if args.json {
    396         format_json(&result)
    397     } else if args.csv {
    398         format_csv_output(&result)
    399     } else {
    400         format_text(&result, use_color)
    401     };
    402 
    403     print!("{}", output);
    404 
    405     // Exit 1 if differences found (diff convention), 0 if identical
    406     if result.has_differences() {
    407         process::exit(1);
    408     }
    409 
    410     Ok(())
    411 }
    412 
    413 fn main() {
    414     let args = Args::parse();
    415     if let Err(err) = run(args) {
    416         eprintln!("dtdiff: {err}");
    417         process::exit(2);
    418     }
    419 }