dt-cli-tools

CLI tools for viewing, filtering, and comparing tabular data files
Log | Files | Refs | README | LICENSE

format.rs (14563B)


      1 use anyhow::{anyhow, Result};
      2 use std::io::Read;
      3 use std::path::Path;
      4 
      5 #[derive(Debug, Clone, Copy, PartialEq)]
      6 pub enum Format {
      7     Csv,
      8     Tsv,
      9     Parquet,
     10     Arrow,
     11     Json,
     12     Ndjson,
     13     Excel,
     14 }
     15 
     16 impl Format {
     17     /// Returns true if two formats belong to the same "family":
     18     /// - Csv and Tsv are the same family
     19     /// - Json and Ndjson are the same family
     20     /// - Everything else is only the same family as itself
     21     pub fn same_family(self, other: Format) -> bool {
     22         use Format::*;
     23         matches!(
     24             (self, other),
     25             (Csv, Tsv) | (Tsv, Csv) | (Json, Ndjson) | (Ndjson, Json)
     26         ) || self == other
     27     }
     28 }
     29 
     30 /// Parse a format from a string name. Case-insensitive.
     31 pub fn parse_format_str(s: &str) -> Result<Format> {
     32     match s.to_ascii_lowercase().as_str() {
     33         "csv" => Ok(Format::Csv),
     34         "tsv" | "tab" => Ok(Format::Tsv),
     35         "parquet" | "pq" => Ok(Format::Parquet),
     36         "arrow" | "feather" | "ipc" => Ok(Format::Arrow),
     37         "json" => Ok(Format::Json),
     38         "ndjson" | "jsonl" => Ok(Format::Ndjson),
     39         "excel" | "xlsx" | "xls" | "xlsb" | "ods" => Ok(Format::Excel),
     40         other => Err(anyhow!("unknown format: {:?}", other)),
     41     }
     42 }
     43 
     44 /// Detect format from a file's extension.
     45 pub fn detect_by_extension(path: &Path) -> Result<Format> {
     46     let ext = path
     47         .extension()
     48         .and_then(|e| e.to_str())
     49         .ok_or_else(|| anyhow!("file has no extension: {}", path.display()))?;
     50     parse_format_str(ext).map_err(|_| anyhow!("unknown file extension: {:?}", ext))
     51 }
     52 
     53 /// Read the first 8 bytes of a file and attempt to identify its format from
     54 /// magic bytes. Returns `None` for formats (CSV/TSV) that have no distinctive
     55 /// magic sequence.
     56 pub fn detect_by_magic(path: &Path) -> Result<Option<Format>> {
     57     let mut file = std::fs::File::open(path)
     58         .map_err(|e| anyhow!("cannot open {:?}: {}", path, e))?;
     59     let mut buf = [0u8; 8];
     60     let n = file.read(&mut buf)?;
     61     let bytes = &buf[..n];
     62 
     63     // Parquet: magic "PAR1" at start (4 bytes)
     64     if bytes.len() >= 4 && &bytes[..4] == b"PAR1" {
     65         return Ok(Some(Format::Parquet));
     66     }
     67 
     68     // Arrow IPC: magic "ARROW1" at start (6 bytes)
     69     if bytes.len() >= 6 && &bytes[..6] == b"ARROW1" {
     70         return Ok(Some(Format::Arrow));
     71     }
     72 
     73     // Excel ZIP-based (xlsx, xlsb): PK signature
     74     if bytes.len() >= 2 && bytes[0] == 0x50 && bytes[1] == 0x4B {
     75         return Ok(Some(Format::Excel));
     76     }
     77 
     78     // Excel OLE2 (xls): D0 CF 11 E0
     79     if bytes.len() >= 4
     80         && bytes[0] == 0xD0
     81         && bytes[1] == 0xCF
     82         && bytes[2] == 0x11
     83         && bytes[3] == 0xE0
     84     {
     85         return Ok(Some(Format::Excel));
     86     }
     87 
     88     // JSON / NDJSON: check first non-whitespace byte
     89     for &b in bytes {
     90         if b.is_ascii_whitespace() {
     91             continue;
     92         }
     93         if b == b'[' {
     94             return Ok(Some(Format::Json));
     95         }
     96         if b == b'{' {
     97             return Ok(Some(Format::Ndjson));
     98         }
     99         break;
    100     }
    101 
    102     // CSV/TSV and anything else has no distinctive magic
    103     Ok(None)
    104 }
    105 
    106 /// Detect format with priority: explicit override > magic bytes > extension.
    107 pub fn detect_format(path: &Path, override_fmt: Option<&str>) -> Result<Format> {
    108     if let Some(s) = override_fmt {
    109         return parse_format_str(s);
    110     }
    111 
    112     if let Some(fmt) = detect_by_magic(path)? {
    113         return Ok(fmt);
    114     }
    115 
    116     detect_by_extension(path)
    117 }
    118 
    119 /// Detect the delimiter used in a CSV-like file by sampling up to 8 KB / 10
    120 /// lines and counting occurrences of `,`, `\t`, and `;`. Returns the delimiter
    121 /// with the highest minimum count across lines. Defaults to `,`.
    122 pub fn detect_csv_delimiter(path: &Path) -> Result<u8> {
    123     const MAX_BYTES: usize = 8 * 1024;
    124     const MAX_LINES: usize = 10;
    125 
    126     let mut file = std::fs::File::open(path)
    127         .map_err(|e| anyhow!("cannot open {:?}: {}", path, e))?;
    128 
    129     let mut buf = vec![0u8; MAX_BYTES];
    130     let n = file.read(&mut buf)?;
    131     buf.truncate(n);
    132 
    133     let candidates: &[u8] = b",\t;";
    134     // min count per delimiter across lines; start at usize::MAX so we can take min
    135     let mut min_counts = [usize::MAX; 3];
    136     let mut line_count = 0usize;
    137 
    138     for line in buf.split(|&b| b == b'\n').take(MAX_LINES) {
    139         if line.is_empty() {
    140             continue;
    141         }
    142         line_count += 1;
    143         for (i, &delim) in candidates.iter().enumerate() {
    144             let count = line.iter().filter(|&&b| b == delim).count();
    145             if count < min_counts[i] {
    146                 min_counts[i] = count;
    147             }
    148         }
    149     }
    150 
    151     if line_count == 0 {
    152         return Ok(b',');
    153     }
    154 
    155     // Replace any usize::MAX (delimiter never appeared) with 0
    156     for m in min_counts.iter_mut() {
    157         if *m == usize::MAX {
    158             *m = 0;
    159         }
    160     }
    161 
    162     let best = min_counts
    163         .iter()
    164         .enumerate()
    165         .max_by_key(|&(_, &c)| c)
    166         .map(|(i, _)| candidates[i])
    167         .unwrap_or(b',');
    168 
    169     // If no delimiter had any occurrences, fall back to comma
    170     if min_counts.iter().all(|&c| c == 0) {
    171         Ok(b',')
    172     } else {
    173         Ok(best)
    174     }
    175 }
    176 
    177 #[cfg(test)]
    178 mod tests {
    179     use super::*;
    180     use std::io::Write;
    181     use tempfile::NamedTempFile;
    182 
    183     // ── parse_format_str ──────────────────────────────────────────────────────
    184 
    185     #[test]
    186     fn parse_csv() {
    187         assert_eq!(parse_format_str("csv").unwrap(), Format::Csv);
    188     }
    189 
    190     #[test]
    191     fn parse_tsv() {
    192         assert_eq!(parse_format_str("tsv").unwrap(), Format::Tsv);
    193         assert_eq!(parse_format_str("tab").unwrap(), Format::Tsv);
    194     }
    195 
    196     #[test]
    197     fn parse_parquet() {
    198         assert_eq!(parse_format_str("parquet").unwrap(), Format::Parquet);
    199         assert_eq!(parse_format_str("pq").unwrap(), Format::Parquet);
    200     }
    201 
    202     #[test]
    203     fn parse_arrow() {
    204         assert_eq!(parse_format_str("arrow").unwrap(), Format::Arrow);
    205         assert_eq!(parse_format_str("feather").unwrap(), Format::Arrow);
    206         assert_eq!(parse_format_str("ipc").unwrap(), Format::Arrow);
    207     }
    208 
    209     #[test]
    210     fn parse_json() {
    211         assert_eq!(parse_format_str("json").unwrap(), Format::Json);
    212     }
    213 
    214     #[test]
    215     fn parse_ndjson() {
    216         assert_eq!(parse_format_str("ndjson").unwrap(), Format::Ndjson);
    217         assert_eq!(parse_format_str("jsonl").unwrap(), Format::Ndjson);
    218     }
    219 
    220     #[test]
    221     fn parse_excel() {
    222         assert_eq!(parse_format_str("excel").unwrap(), Format::Excel);
    223         assert_eq!(parse_format_str("xlsx").unwrap(), Format::Excel);
    224         assert_eq!(parse_format_str("xls").unwrap(), Format::Excel);
    225         assert_eq!(parse_format_str("xlsb").unwrap(), Format::Excel);
    226         assert_eq!(parse_format_str("ods").unwrap(), Format::Excel);
    227     }
    228 
    229     #[test]
    230     fn parse_unknown_errors() {
    231         assert!(parse_format_str("unknown").is_err());
    232         assert!(parse_format_str("").is_err());
    233     }
    234 
    235     #[test]
    236     fn parse_case_insensitive() {
    237         assert_eq!(parse_format_str("CSV").unwrap(), Format::Csv);
    238         assert_eq!(parse_format_str("Parquet").unwrap(), Format::Parquet);
    239         assert_eq!(parse_format_str("NDJSON").unwrap(), Format::Ndjson);
    240     }
    241 
    242     // ── detect_by_extension ───────────────────────────────────────────────────
    243 
    244     fn ext_path(ext: &str) -> std::path::PathBuf {
    245         std::path::PathBuf::from(format!("file.{ext}"))
    246     }
    247 
    248     #[test]
    249     fn ext_csv() {
    250         assert_eq!(detect_by_extension(&ext_path("csv")).unwrap(), Format::Csv);
    251     }
    252 
    253     #[test]
    254     fn ext_tsv() {
    255         assert_eq!(detect_by_extension(&ext_path("tsv")).unwrap(), Format::Tsv);
    256         assert_eq!(detect_by_extension(&ext_path("tab")).unwrap(), Format::Tsv);
    257     }
    258 
    259     #[test]
    260     fn ext_parquet() {
    261         assert_eq!(
    262             detect_by_extension(&ext_path("parquet")).unwrap(),
    263             Format::Parquet
    264         );
    265         assert_eq!(
    266             detect_by_extension(&ext_path("pq")).unwrap(),
    267             Format::Parquet
    268         );
    269     }
    270 
    271     #[test]
    272     fn ext_arrow() {
    273         assert_eq!(
    274             detect_by_extension(&ext_path("arrow")).unwrap(),
    275             Format::Arrow
    276         );
    277         assert_eq!(
    278             detect_by_extension(&ext_path("feather")).unwrap(),
    279             Format::Arrow
    280         );
    281         assert_eq!(
    282             detect_by_extension(&ext_path("ipc")).unwrap(),
    283             Format::Arrow
    284         );
    285     }
    286 
    287     #[test]
    288     fn ext_json() {
    289         assert_eq!(
    290             detect_by_extension(&ext_path("json")).unwrap(),
    291             Format::Json
    292         );
    293     }
    294 
    295     #[test]
    296     fn ext_ndjson() {
    297         assert_eq!(
    298             detect_by_extension(&ext_path("ndjson")).unwrap(),
    299             Format::Ndjson
    300         );
    301         assert_eq!(
    302             detect_by_extension(&ext_path("jsonl")).unwrap(),
    303             Format::Ndjson
    304         );
    305     }
    306 
    307     #[test]
    308     fn ext_excel() {
    309         assert_eq!(
    310             detect_by_extension(&ext_path("xlsx")).unwrap(),
    311             Format::Excel
    312         );
    313         assert_eq!(
    314             detect_by_extension(&ext_path("xls")).unwrap(),
    315             Format::Excel
    316         );
    317         assert_eq!(
    318             detect_by_extension(&ext_path("xlsb")).unwrap(),
    319             Format::Excel
    320         );
    321         assert_eq!(
    322             detect_by_extension(&ext_path("ods")).unwrap(),
    323             Format::Excel
    324         );
    325     }
    326 
    327     #[test]
    328     fn ext_unknown_errors() {
    329         assert!(detect_by_extension(&ext_path("txt")).is_err());
    330         assert!(detect_by_extension(&ext_path("bin")).is_err());
    331     }
    332 
    333     #[test]
    334     fn ext_no_extension_errors() {
    335         assert!(detect_by_extension(Path::new("myfile")).is_err());
    336     }
    337 
    338     // ── detect_by_magic ───────────────────────────────────────────────────────
    339 
    340     fn temp_with(bytes: &[u8]) -> NamedTempFile {
    341         let mut f = NamedTempFile::new().unwrap();
    342         f.write_all(bytes).unwrap();
    343         f.flush().unwrap();
    344         f
    345     }
    346 
    347     #[test]
    348     fn magic_parquet() {
    349         let f = temp_with(b"PAR1\x00\x01\x02\x03");
    350         assert_eq!(
    351             detect_by_magic(f.path()).unwrap(),
    352             Some(Format::Parquet)
    353         );
    354     }
    355 
    356     #[test]
    357     fn magic_arrow() {
    358         let f = temp_with(b"ARROW1\x00\x00");
    359         assert_eq!(
    360             detect_by_magic(f.path()).unwrap(),
    361             Some(Format::Arrow)
    362         );
    363     }
    364 
    365     #[test]
    366     fn magic_xlsx() {
    367         // ZIP magic: PK (0x50 0x4B)
    368         let f = temp_with(&[0x50, 0x4B, 0x03, 0x04, 0x00, 0x00, 0x00, 0x00]);
    369         assert_eq!(
    370             detect_by_magic(f.path()).unwrap(),
    371             Some(Format::Excel)
    372         );
    373     }
    374 
    375     #[test]
    376     fn magic_xls_ole() {
    377         // OLE2: D0 CF 11 E0
    378         let f = temp_with(&[0xD0, 0xCF, 0x11, 0xE0, 0x00, 0x00, 0x00, 0x00]);
    379         assert_eq!(
    380             detect_by_magic(f.path()).unwrap(),
    381             Some(Format::Excel)
    382         );
    383     }
    384 
    385     #[test]
    386     fn magic_json_array() {
    387         let f = temp_with(b"[{\"a\":1}]");
    388         assert_eq!(
    389             detect_by_magic(f.path()).unwrap(),
    390             Some(Format::Json)
    391         );
    392     }
    393 
    394     #[test]
    395     fn magic_ndjson() {
    396         let f = temp_with(b"{\"a\":1}\n{\"b\":2}\n");
    397         assert_eq!(
    398             detect_by_magic(f.path()).unwrap(),
    399             Some(Format::Ndjson)
    400         );
    401     }
    402 
    403     #[test]
    404     fn magic_csv_returns_none() {
    405         let f = temp_with(b"a,b,c\n1,2,3\n");
    406         assert_eq!(detect_by_magic(f.path()).unwrap(), None);
    407     }
    408 
    409     // ── detect_format ─────────────────────────────────────────────────────────
    410 
    411     #[test]
    412     fn detect_override_wins_over_extension() {
    413         // File content looks like CSV, but we override to parquet
    414         let mut f = NamedTempFile::with_suffix(".csv").unwrap();
    415         write!(f, "a,b\n1,2\n").unwrap();
    416         let result = detect_format(f.path(), Some("parquet")).unwrap();
    417         assert_eq!(result, Format::Parquet);
    418     }
    419 
    420     #[test]
    421     fn detect_magic_beats_extension() {
    422         // Write Parquet magic bytes but name the file .csv so extension says Csv
    423         let mut f = NamedTempFile::with_suffix(".csv").unwrap();
    424         f.write_all(b"PAR1\x00\x01\x02\x03").unwrap();
    425         let result = detect_format(f.path(), None).unwrap();
    426         assert_eq!(result, Format::Parquet);
    427     }
    428 
    429     #[test]
    430     fn detect_falls_back_to_extension() {
    431         // Plain CSV content → magic returns None → extension used
    432         let mut f = NamedTempFile::with_suffix(".tsv").unwrap();
    433         write!(f, "a\tb\n1\t2\n").unwrap();
    434         let result = detect_format(f.path(), None).unwrap();
    435         assert_eq!(result, Format::Tsv);
    436     }
    437 
    438     // ── same_family ───────────────────────────────────────────────────────────
    439 
    440     #[test]
    441     fn same_family_csv_tsv() {
    442         assert!(Format::Csv.same_family(Format::Tsv));
    443         assert!(Format::Tsv.same_family(Format::Csv));
    444     }
    445 
    446     #[test]
    447     fn same_family_json_ndjson() {
    448         assert!(Format::Json.same_family(Format::Ndjson));
    449         assert!(Format::Ndjson.same_family(Format::Json));
    450     }
    451 
    452     #[test]
    453     fn same_family_csv_parquet_different() {
    454         assert!(!Format::Csv.same_family(Format::Parquet));
    455         assert!(!Format::Parquet.same_family(Format::Csv));
    456     }
    457 
    458     #[test]
    459     fn same_family_same_format() {
    460         assert!(Format::Csv.same_family(Format::Csv));
    461         assert!(Format::Parquet.same_family(Format::Parquet));
    462     }
    463 
    464     // ── detect_csv_delimiter ─────────────────────────────────────────────────
    465 
    466     #[test]
    467     fn delimiter_comma() {
    468         let f = temp_with(b"a,b,c\n1,2,3\n4,5,6\n");
    469         assert_eq!(detect_csv_delimiter(f.path()).unwrap(), b',');
    470     }
    471 
    472     #[test]
    473     fn delimiter_tab() {
    474         let f = temp_with(b"a\tb\tc\n1\t2\t3\n4\t5\t6\n");
    475         assert_eq!(detect_csv_delimiter(f.path()).unwrap(), b'\t');
    476     }
    477 
    478     #[test]
    479     fn delimiter_semicolon() {
    480         let f = temp_with(b"a;b;c\n1;2;3\n4;5;6\n");
    481         assert_eq!(detect_csv_delimiter(f.path()).unwrap(), b';');
    482     }
    483 }