format.rs (14563B)
1 use anyhow::{anyhow, Result}; 2 use std::io::Read; 3 use std::path::Path; 4 5 #[derive(Debug, Clone, Copy, PartialEq)] 6 pub enum Format { 7 Csv, 8 Tsv, 9 Parquet, 10 Arrow, 11 Json, 12 Ndjson, 13 Excel, 14 } 15 16 impl Format { 17 /// Returns true if two formats belong to the same "family": 18 /// - Csv and Tsv are the same family 19 /// - Json and Ndjson are the same family 20 /// - Everything else is only the same family as itself 21 pub fn same_family(self, other: Format) -> bool { 22 use Format::*; 23 matches!( 24 (self, other), 25 (Csv, Tsv) | (Tsv, Csv) | (Json, Ndjson) | (Ndjson, Json) 26 ) || self == other 27 } 28 } 29 30 /// Parse a format from a string name. Case-insensitive. 31 pub fn parse_format_str(s: &str) -> Result<Format> { 32 match s.to_ascii_lowercase().as_str() { 33 "csv" => Ok(Format::Csv), 34 "tsv" | "tab" => Ok(Format::Tsv), 35 "parquet" | "pq" => Ok(Format::Parquet), 36 "arrow" | "feather" | "ipc" => Ok(Format::Arrow), 37 "json" => Ok(Format::Json), 38 "ndjson" | "jsonl" => Ok(Format::Ndjson), 39 "excel" | "xlsx" | "xls" | "xlsb" | "ods" => Ok(Format::Excel), 40 other => Err(anyhow!("unknown format: {:?}", other)), 41 } 42 } 43 44 /// Detect format from a file's extension. 45 pub fn detect_by_extension(path: &Path) -> Result<Format> { 46 let ext = path 47 .extension() 48 .and_then(|e| e.to_str()) 49 .ok_or_else(|| anyhow!("file has no extension: {}", path.display()))?; 50 parse_format_str(ext).map_err(|_| anyhow!("unknown file extension: {:?}", ext)) 51 } 52 53 /// Read the first 8 bytes of a file and attempt to identify its format from 54 /// magic bytes. Returns `None` for formats (CSV/TSV) that have no distinctive 55 /// magic sequence. 56 pub fn detect_by_magic(path: &Path) -> Result<Option<Format>> { 57 let mut file = std::fs::File::open(path) 58 .map_err(|e| anyhow!("cannot open {:?}: {}", path, e))?; 59 let mut buf = [0u8; 8]; 60 let n = file.read(&mut buf)?; 61 let bytes = &buf[..n]; 62 63 // Parquet: magic "PAR1" at start (4 bytes) 64 if bytes.len() >= 4 && &bytes[..4] == b"PAR1" { 65 return Ok(Some(Format::Parquet)); 66 } 67 68 // Arrow IPC: magic "ARROW1" at start (6 bytes) 69 if bytes.len() >= 6 && &bytes[..6] == b"ARROW1" { 70 return Ok(Some(Format::Arrow)); 71 } 72 73 // Excel ZIP-based (xlsx, xlsb): PK signature 74 if bytes.len() >= 2 && bytes[0] == 0x50 && bytes[1] == 0x4B { 75 return Ok(Some(Format::Excel)); 76 } 77 78 // Excel OLE2 (xls): D0 CF 11 E0 79 if bytes.len() >= 4 80 && bytes[0] == 0xD0 81 && bytes[1] == 0xCF 82 && bytes[2] == 0x11 83 && bytes[3] == 0xE0 84 { 85 return Ok(Some(Format::Excel)); 86 } 87 88 // JSON / NDJSON: check first non-whitespace byte 89 for &b in bytes { 90 if b.is_ascii_whitespace() { 91 continue; 92 } 93 if b == b'[' { 94 return Ok(Some(Format::Json)); 95 } 96 if b == b'{' { 97 return Ok(Some(Format::Ndjson)); 98 } 99 break; 100 } 101 102 // CSV/TSV and anything else has no distinctive magic 103 Ok(None) 104 } 105 106 /// Detect format with priority: explicit override > magic bytes > extension. 107 pub fn detect_format(path: &Path, override_fmt: Option<&str>) -> Result<Format> { 108 if let Some(s) = override_fmt { 109 return parse_format_str(s); 110 } 111 112 if let Some(fmt) = detect_by_magic(path)? { 113 return Ok(fmt); 114 } 115 116 detect_by_extension(path) 117 } 118 119 /// Detect the delimiter used in a CSV-like file by sampling up to 8 KB / 10 120 /// lines and counting occurrences of `,`, `\t`, and `;`. Returns the delimiter 121 /// with the highest minimum count across lines. Defaults to `,`. 122 pub fn detect_csv_delimiter(path: &Path) -> Result<u8> { 123 const MAX_BYTES: usize = 8 * 1024; 124 const MAX_LINES: usize = 10; 125 126 let mut file = std::fs::File::open(path) 127 .map_err(|e| anyhow!("cannot open {:?}: {}", path, e))?; 128 129 let mut buf = vec![0u8; MAX_BYTES]; 130 let n = file.read(&mut buf)?; 131 buf.truncate(n); 132 133 let candidates: &[u8] = b",\t;"; 134 // min count per delimiter across lines; start at usize::MAX so we can take min 135 let mut min_counts = [usize::MAX; 3]; 136 let mut line_count = 0usize; 137 138 for line in buf.split(|&b| b == b'\n').take(MAX_LINES) { 139 if line.is_empty() { 140 continue; 141 } 142 line_count += 1; 143 for (i, &delim) in candidates.iter().enumerate() { 144 let count = line.iter().filter(|&&b| b == delim).count(); 145 if count < min_counts[i] { 146 min_counts[i] = count; 147 } 148 } 149 } 150 151 if line_count == 0 { 152 return Ok(b','); 153 } 154 155 // Replace any usize::MAX (delimiter never appeared) with 0 156 for m in min_counts.iter_mut() { 157 if *m == usize::MAX { 158 *m = 0; 159 } 160 } 161 162 let best = min_counts 163 .iter() 164 .enumerate() 165 .max_by_key(|&(_, &c)| c) 166 .map(|(i, _)| candidates[i]) 167 .unwrap_or(b','); 168 169 // If no delimiter had any occurrences, fall back to comma 170 if min_counts.iter().all(|&c| c == 0) { 171 Ok(b',') 172 } else { 173 Ok(best) 174 } 175 } 176 177 #[cfg(test)] 178 mod tests { 179 use super::*; 180 use std::io::Write; 181 use tempfile::NamedTempFile; 182 183 // ── parse_format_str ────────────────────────────────────────────────────── 184 185 #[test] 186 fn parse_csv() { 187 assert_eq!(parse_format_str("csv").unwrap(), Format::Csv); 188 } 189 190 #[test] 191 fn parse_tsv() { 192 assert_eq!(parse_format_str("tsv").unwrap(), Format::Tsv); 193 assert_eq!(parse_format_str("tab").unwrap(), Format::Tsv); 194 } 195 196 #[test] 197 fn parse_parquet() { 198 assert_eq!(parse_format_str("parquet").unwrap(), Format::Parquet); 199 assert_eq!(parse_format_str("pq").unwrap(), Format::Parquet); 200 } 201 202 #[test] 203 fn parse_arrow() { 204 assert_eq!(parse_format_str("arrow").unwrap(), Format::Arrow); 205 assert_eq!(parse_format_str("feather").unwrap(), Format::Arrow); 206 assert_eq!(parse_format_str("ipc").unwrap(), Format::Arrow); 207 } 208 209 #[test] 210 fn parse_json() { 211 assert_eq!(parse_format_str("json").unwrap(), Format::Json); 212 } 213 214 #[test] 215 fn parse_ndjson() { 216 assert_eq!(parse_format_str("ndjson").unwrap(), Format::Ndjson); 217 assert_eq!(parse_format_str("jsonl").unwrap(), Format::Ndjson); 218 } 219 220 #[test] 221 fn parse_excel() { 222 assert_eq!(parse_format_str("excel").unwrap(), Format::Excel); 223 assert_eq!(parse_format_str("xlsx").unwrap(), Format::Excel); 224 assert_eq!(parse_format_str("xls").unwrap(), Format::Excel); 225 assert_eq!(parse_format_str("xlsb").unwrap(), Format::Excel); 226 assert_eq!(parse_format_str("ods").unwrap(), Format::Excel); 227 } 228 229 #[test] 230 fn parse_unknown_errors() { 231 assert!(parse_format_str("unknown").is_err()); 232 assert!(parse_format_str("").is_err()); 233 } 234 235 #[test] 236 fn parse_case_insensitive() { 237 assert_eq!(parse_format_str("CSV").unwrap(), Format::Csv); 238 assert_eq!(parse_format_str("Parquet").unwrap(), Format::Parquet); 239 assert_eq!(parse_format_str("NDJSON").unwrap(), Format::Ndjson); 240 } 241 242 // ── detect_by_extension ─────────────────────────────────────────────────── 243 244 fn ext_path(ext: &str) -> std::path::PathBuf { 245 std::path::PathBuf::from(format!("file.{ext}")) 246 } 247 248 #[test] 249 fn ext_csv() { 250 assert_eq!(detect_by_extension(&ext_path("csv")).unwrap(), Format::Csv); 251 } 252 253 #[test] 254 fn ext_tsv() { 255 assert_eq!(detect_by_extension(&ext_path("tsv")).unwrap(), Format::Tsv); 256 assert_eq!(detect_by_extension(&ext_path("tab")).unwrap(), Format::Tsv); 257 } 258 259 #[test] 260 fn ext_parquet() { 261 assert_eq!( 262 detect_by_extension(&ext_path("parquet")).unwrap(), 263 Format::Parquet 264 ); 265 assert_eq!( 266 detect_by_extension(&ext_path("pq")).unwrap(), 267 Format::Parquet 268 ); 269 } 270 271 #[test] 272 fn ext_arrow() { 273 assert_eq!( 274 detect_by_extension(&ext_path("arrow")).unwrap(), 275 Format::Arrow 276 ); 277 assert_eq!( 278 detect_by_extension(&ext_path("feather")).unwrap(), 279 Format::Arrow 280 ); 281 assert_eq!( 282 detect_by_extension(&ext_path("ipc")).unwrap(), 283 Format::Arrow 284 ); 285 } 286 287 #[test] 288 fn ext_json() { 289 assert_eq!( 290 detect_by_extension(&ext_path("json")).unwrap(), 291 Format::Json 292 ); 293 } 294 295 #[test] 296 fn ext_ndjson() { 297 assert_eq!( 298 detect_by_extension(&ext_path("ndjson")).unwrap(), 299 Format::Ndjson 300 ); 301 assert_eq!( 302 detect_by_extension(&ext_path("jsonl")).unwrap(), 303 Format::Ndjson 304 ); 305 } 306 307 #[test] 308 fn ext_excel() { 309 assert_eq!( 310 detect_by_extension(&ext_path("xlsx")).unwrap(), 311 Format::Excel 312 ); 313 assert_eq!( 314 detect_by_extension(&ext_path("xls")).unwrap(), 315 Format::Excel 316 ); 317 assert_eq!( 318 detect_by_extension(&ext_path("xlsb")).unwrap(), 319 Format::Excel 320 ); 321 assert_eq!( 322 detect_by_extension(&ext_path("ods")).unwrap(), 323 Format::Excel 324 ); 325 } 326 327 #[test] 328 fn ext_unknown_errors() { 329 assert!(detect_by_extension(&ext_path("txt")).is_err()); 330 assert!(detect_by_extension(&ext_path("bin")).is_err()); 331 } 332 333 #[test] 334 fn ext_no_extension_errors() { 335 assert!(detect_by_extension(Path::new("myfile")).is_err()); 336 } 337 338 // ── detect_by_magic ─────────────────────────────────────────────────────── 339 340 fn temp_with(bytes: &[u8]) -> NamedTempFile { 341 let mut f = NamedTempFile::new().unwrap(); 342 f.write_all(bytes).unwrap(); 343 f.flush().unwrap(); 344 f 345 } 346 347 #[test] 348 fn magic_parquet() { 349 let f = temp_with(b"PAR1\x00\x01\x02\x03"); 350 assert_eq!( 351 detect_by_magic(f.path()).unwrap(), 352 Some(Format::Parquet) 353 ); 354 } 355 356 #[test] 357 fn magic_arrow() { 358 let f = temp_with(b"ARROW1\x00\x00"); 359 assert_eq!( 360 detect_by_magic(f.path()).unwrap(), 361 Some(Format::Arrow) 362 ); 363 } 364 365 #[test] 366 fn magic_xlsx() { 367 // ZIP magic: PK (0x50 0x4B) 368 let f = temp_with(&[0x50, 0x4B, 0x03, 0x04, 0x00, 0x00, 0x00, 0x00]); 369 assert_eq!( 370 detect_by_magic(f.path()).unwrap(), 371 Some(Format::Excel) 372 ); 373 } 374 375 #[test] 376 fn magic_xls_ole() { 377 // OLE2: D0 CF 11 E0 378 let f = temp_with(&[0xD0, 0xCF, 0x11, 0xE0, 0x00, 0x00, 0x00, 0x00]); 379 assert_eq!( 380 detect_by_magic(f.path()).unwrap(), 381 Some(Format::Excel) 382 ); 383 } 384 385 #[test] 386 fn magic_json_array() { 387 let f = temp_with(b"[{\"a\":1}]"); 388 assert_eq!( 389 detect_by_magic(f.path()).unwrap(), 390 Some(Format::Json) 391 ); 392 } 393 394 #[test] 395 fn magic_ndjson() { 396 let f = temp_with(b"{\"a\":1}\n{\"b\":2}\n"); 397 assert_eq!( 398 detect_by_magic(f.path()).unwrap(), 399 Some(Format::Ndjson) 400 ); 401 } 402 403 #[test] 404 fn magic_csv_returns_none() { 405 let f = temp_with(b"a,b,c\n1,2,3\n"); 406 assert_eq!(detect_by_magic(f.path()).unwrap(), None); 407 } 408 409 // ── detect_format ───────────────────────────────────────────────────────── 410 411 #[test] 412 fn detect_override_wins_over_extension() { 413 // File content looks like CSV, but we override to parquet 414 let mut f = NamedTempFile::with_suffix(".csv").unwrap(); 415 write!(f, "a,b\n1,2\n").unwrap(); 416 let result = detect_format(f.path(), Some("parquet")).unwrap(); 417 assert_eq!(result, Format::Parquet); 418 } 419 420 #[test] 421 fn detect_magic_beats_extension() { 422 // Write Parquet magic bytes but name the file .csv so extension says Csv 423 let mut f = NamedTempFile::with_suffix(".csv").unwrap(); 424 f.write_all(b"PAR1\x00\x01\x02\x03").unwrap(); 425 let result = detect_format(f.path(), None).unwrap(); 426 assert_eq!(result, Format::Parquet); 427 } 428 429 #[test] 430 fn detect_falls_back_to_extension() { 431 // Plain CSV content → magic returns None → extension used 432 let mut f = NamedTempFile::with_suffix(".tsv").unwrap(); 433 write!(f, "a\tb\n1\t2\n").unwrap(); 434 let result = detect_format(f.path(), None).unwrap(); 435 assert_eq!(result, Format::Tsv); 436 } 437 438 // ── same_family ─────────────────────────────────────────────────────────── 439 440 #[test] 441 fn same_family_csv_tsv() { 442 assert!(Format::Csv.same_family(Format::Tsv)); 443 assert!(Format::Tsv.same_family(Format::Csv)); 444 } 445 446 #[test] 447 fn same_family_json_ndjson() { 448 assert!(Format::Json.same_family(Format::Ndjson)); 449 assert!(Format::Ndjson.same_family(Format::Json)); 450 } 451 452 #[test] 453 fn same_family_csv_parquet_different() { 454 assert!(!Format::Csv.same_family(Format::Parquet)); 455 assert!(!Format::Parquet.same_family(Format::Csv)); 456 } 457 458 #[test] 459 fn same_family_same_format() { 460 assert!(Format::Csv.same_family(Format::Csv)); 461 assert!(Format::Parquet.same_family(Format::Parquet)); 462 } 463 464 // ── detect_csv_delimiter ───────────────────────────────────────────────── 465 466 #[test] 467 fn delimiter_comma() { 468 let f = temp_with(b"a,b,c\n1,2,3\n4,5,6\n"); 469 assert_eq!(detect_csv_delimiter(f.path()).unwrap(), b','); 470 } 471 472 #[test] 473 fn delimiter_tab() { 474 let f = temp_with(b"a\tb\tc\n1\t2\t3\n4\t5\t6\n"); 475 assert_eq!(detect_csv_delimiter(f.path()).unwrap(), b'\t'); 476 } 477 478 #[test] 479 fn delimiter_semicolon() { 480 let f = temp_with(b"a;b;c\n1;2;3\n4;5;6\n"); 481 assert_eq!(detect_csv_delimiter(f.path()).unwrap(), b';'); 482 } 483 }