excel.rs (12991B)
1 use anyhow::{Context, Result}; 2 use calamine::{open_workbook_auto, Data, Reader}; 3 use polars::prelude::*; 4 use std::path::Path; 5 6 use crate::metadata::SheetInfo; 7 use crate::reader::ReadOptions; 8 9 #[derive(Debug, Clone, Copy, PartialEq)] 10 enum InferredType { 11 Int, 12 Float, 13 String, 14 Bool, 15 DateTime, 16 Empty, 17 } 18 19 /// Read an Excel file into a DataFrame using the provided options. 20 /// 21 /// Sheet resolution order: 22 /// 1. `opts.sheet` treated as a sheet name (exact match) 23 /// 2. `opts.sheet` parsed as a 0-based index 24 /// 3. First sheet (default) 25 pub fn read(path: &Path, opts: &ReadOptions) -> Result<DataFrame> { 26 let mut workbook = open_workbook_auto(path) 27 .with_context(|| format!("Cannot open workbook: {}", path.display()))?; 28 29 let sheet_names = workbook.sheet_names().to_vec(); 30 if sheet_names.is_empty() { 31 return Ok(DataFrame::default()); 32 } 33 34 let sheet_name: String = match &opts.sheet { 35 Some(s) => { 36 // Try exact name match first 37 if sheet_names.contains(s) { 38 s.clone() 39 } else { 40 // Try to parse as 0-based index 41 match s.parse::<usize>() { 42 Ok(idx) if idx < sheet_names.len() => sheet_names[idx].clone(), 43 _ => { 44 return Err(anyhow::anyhow!( 45 "Sheet '{}' not found in workbook (available: {})", 46 s, 47 sheet_names.join(", ") 48 )); 49 } 50 } 51 } 52 } 53 None => sheet_names[0].clone(), 54 }; 55 56 let range = workbook 57 .worksheet_range(&sheet_name) 58 .with_context(|| format!("Cannot read sheet: {sheet_name}"))?; 59 60 let skip = opts.skip_rows.unwrap_or(0); 61 range_to_dataframe_skip(&range, skip) 62 } 63 64 /// Return sheet names and dimensions for an Excel file. 65 pub fn read_excel_info(path: &Path) -> Result<Vec<SheetInfo>> { 66 let mut workbook = open_workbook_auto(path) 67 .with_context(|| format!("Cannot open workbook: {}", path.display()))?; 68 69 let sheet_names = workbook.sheet_names().to_vec(); 70 let mut infos = Vec::with_capacity(sheet_names.len()); 71 72 for name in sheet_names { 73 let range = workbook 74 .worksheet_range(&name) 75 .with_context(|| format!("Cannot read sheet: {name}"))?; 76 let (rows, cols) = range.get_size(); 77 infos.push(SheetInfo { name, rows, cols }); 78 } 79 80 Ok(infos) 81 } 82 83 pub fn range_to_dataframe(range: &calamine::Range<Data>) -> Result<DataFrame> { 84 range_to_dataframe_skip(range, 0) 85 } 86 87 /// Convert a calamine Range to a DataFrame, skipping `skip` rows before the header. 88 pub fn range_to_dataframe_skip(range: &calamine::Range<Data>, skip: usize) -> Result<DataFrame> { 89 let rows: Vec<&[Data]> = range.rows().skip(skip).collect(); 90 let cols = if rows.is_empty() { 91 0 92 } else { 93 rows.iter().map(|r| r.len()).max().unwrap_or(0) 94 }; 95 96 if rows.is_empty() || cols == 0 { 97 return Ok(DataFrame::default()); 98 } 99 100 // First row (after skip) = headers 101 let headers: Vec<String> = rows[0] 102 .iter() 103 .enumerate() 104 .map(|(i, cell)| match cell { 105 Data::String(s) => s.clone(), 106 _ => format!("column_{i}"), 107 }) 108 .collect(); 109 110 if rows.len() == 1 { 111 // Header only, no data 112 let series: Vec<Column> = headers 113 .iter() 114 .map(|name| { 115 Series::new_empty(PlSmallStr::from(name.as_str()), &DataType::Null).into_column() 116 }) 117 .collect(); 118 return DataFrame::new(series).map_err(Into::into); 119 } 120 121 let data_rows = &rows[1..]; 122 let mut columns: Vec<Column> = Vec::with_capacity(cols); 123 124 for col_idx in 0..cols { 125 let cells: Vec<&Data> = data_rows 126 .iter() 127 .map(|row| { 128 if col_idx < row.len() { 129 &row[col_idx] 130 } else { 131 &Data::Empty 132 } 133 }) 134 .collect(); 135 136 let col_type = infer_column_type(&cells); 137 let series = build_series(&headers[col_idx], &cells, col_type)?; 138 columns.push(series.into_column()); 139 } 140 141 DataFrame::new(columns).map_err(Into::into) 142 } 143 144 fn infer_column_type(cells: &[&Data]) -> InferredType { 145 let mut has_int = false; 146 let mut has_float = false; 147 let mut has_string = false; 148 let mut has_bool = false; 149 let mut has_datetime = false; 150 let mut all_empty = true; 151 152 for cell in cells { 153 match cell { 154 Data::Empty => {} 155 Data::String(_) | Data::DateTimeIso(_) | Data::DurationIso(_) => { 156 has_string = true; 157 all_empty = false; 158 } 159 Data::Float(_) => { 160 has_float = true; 161 all_empty = false; 162 } 163 Data::Int(_) => { 164 has_int = true; 165 all_empty = false; 166 } 167 Data::Bool(_) => { 168 has_bool = true; 169 all_empty = false; 170 } 171 Data::DateTime(_) => { 172 has_datetime = true; 173 all_empty = false; 174 } 175 Data::Error(_) => { 176 has_string = true; 177 all_empty = false; 178 } 179 } 180 } 181 182 if all_empty { 183 return InferredType::Empty; 184 } 185 // String trumps everything 186 if has_string { 187 return InferredType::String; 188 } 189 // DateTime only if all non-empty cells are datetime 190 if has_datetime && !has_int && !has_float && !has_bool { 191 return InferredType::DateTime; 192 } 193 // Bool only if all non-empty cells are bool 194 if has_bool && !has_int && !has_float && !has_datetime { 195 return InferredType::Bool; 196 } 197 // Float if any float or mix of int/float 198 if has_float { 199 return InferredType::Float; 200 } 201 if has_int { 202 return InferredType::Int; 203 } 204 // Fallback: mixed datetime/bool/etc -> string 205 InferredType::String 206 } 207 208 fn build_series(name: &str, cells: &[&Data], col_type: InferredType) -> Result<Series> { 209 let plname = PlSmallStr::from(name); 210 match col_type { 211 InferredType::Int => { 212 let values: Vec<Option<i64>> = cells 213 .iter() 214 .map(|cell| match cell { 215 Data::Int(v) => Some(*v), 216 Data::Empty => None, 217 _ => None, 218 }) 219 .collect(); 220 Ok(Series::new(plname, &values)) 221 } 222 InferredType::Float => { 223 let values: Vec<Option<f64>> = cells 224 .iter() 225 .map(|cell| match cell { 226 Data::Float(v) => Some(*v), 227 Data::Int(v) => Some(*v as f64), 228 Data::Empty => None, 229 _ => None, 230 }) 231 .collect(); 232 Ok(Series::new(plname, &values)) 233 } 234 InferredType::Bool => { 235 let values: Vec<Option<bool>> = cells 236 .iter() 237 .map(|cell| match cell { 238 Data::Bool(v) => Some(*v), 239 Data::Empty => None, 240 _ => None, 241 }) 242 .collect(); 243 Ok(Series::new(plname, &values)) 244 } 245 InferredType::DateTime => { 246 // calamine ExcelDateTime wraps a serial date float (days since 1899-12-30) 247 // Convert to milliseconds since Unix epoch for polars 248 let values: Vec<Option<i64>> = cells 249 .iter() 250 .map(|cell| match cell { 251 Data::DateTime(v) => { 252 let serial = v.as_f64(); 253 // Excel epoch: 1899-12-30 = -25569 days from Unix epoch 254 let days_from_unix = serial - 25569.0; 255 let ms = (days_from_unix * 86_400_000.0) as i64; 256 Some(ms) 257 } 258 Data::Empty => None, 259 _ => None, 260 }) 261 .collect(); 262 let series = Series::new(plname, &values); 263 Ok(series.cast(&DataType::Datetime(TimeUnit::Milliseconds, None))?) 264 } 265 InferredType::String | InferredType::Empty => { 266 let values: Vec<Option<String>> = cells 267 .iter() 268 .map(|cell| match cell { 269 Data::String(s) => Some(s.clone()), 270 Data::Float(v) => Some(v.to_string()), 271 Data::Int(v) => Some(v.to_string()), 272 Data::Bool(v) => Some(v.to_string()), 273 Data::DateTime(v) => Some(v.as_f64().to_string()), 274 Data::Error(e) => Some(format!("{e:?}")), 275 Data::DateTimeIso(s) | Data::DurationIso(s) => Some(s.clone()), 276 Data::Empty => None, 277 }) 278 .collect(); 279 Ok(Series::new(plname, &values)) 280 } 281 } 282 } 283 284 #[cfg(test)] 285 mod tests { 286 use super::*; 287 288 // Tests that use rust_xlsxwriter to create fixtures are skipped here because 289 // rust_xlsxwriter is not a dev-dependency in dt-cli-tools. The type-inference 290 // and range-conversion logic is tested below using calamine types directly. 291 // To enable the full xlsx integration tests, add `rust_xlsxwriter` to 292 // [dev-dependencies] in Cargo.toml and port the create_simple / create_empty_data 293 // / create_with_metadata_rows helpers from xl-cli-tool/src/reader.rs. 294 295 #[test] 296 fn test_infer_int_column() { 297 let cells = vec![&Data::Int(1), &Data::Int(2), &Data::Empty, &Data::Int(4)]; 298 assert_eq!(infer_column_type(&cells), InferredType::Int); 299 } 300 301 #[test] 302 fn test_infer_float_when_mixed_int_float() { 303 let cells = vec![&Data::Int(1), &Data::Float(2.5), &Data::Int(3)]; 304 assert_eq!(infer_column_type(&cells), InferredType::Float); 305 } 306 307 #[test] 308 fn test_infer_string_trumps_all() { 309 let s = Data::String("hello".to_string()); 310 let cells: Vec<&Data> = vec![&Data::Int(1), &s, &Data::Float(3.0)]; 311 assert_eq!(infer_column_type(&cells), InferredType::String); 312 } 313 314 #[test] 315 fn test_infer_empty_column() { 316 let cells: Vec<&Data> = vec![&Data::Empty, &Data::Empty]; 317 assert_eq!(infer_column_type(&cells), InferredType::Empty); 318 } 319 320 #[test] 321 fn test_infer_bool_column() { 322 let cells = vec![&Data::Bool(true), &Data::Bool(false), &Data::Empty]; 323 assert_eq!(infer_column_type(&cells), InferredType::Bool); 324 } 325 326 #[test] 327 fn test_empty_range() { 328 let range: calamine::Range<Data> = Default::default(); 329 let df = range_to_dataframe(&range).unwrap(); 330 assert_eq!(df.height(), 0); 331 assert_eq!(df.width(), 0); 332 } 333 334 #[test] 335 fn test_build_series_int() { 336 let cells = vec![&Data::Int(10), &Data::Int(20), &Data::Empty, &Data::Int(40)]; 337 let series = build_series("nums", &cells, InferredType::Int).unwrap(); 338 assert_eq!(series.dtype(), &DataType::Int64); 339 assert_eq!(series.len(), 4); 340 assert_eq!(series.null_count(), 1); 341 } 342 343 #[test] 344 fn test_build_series_float() { 345 let cells = vec![&Data::Float(1.5), &Data::Int(2), &Data::Empty]; 346 let series = build_series("vals", &cells, InferredType::Float).unwrap(); 347 assert_eq!(series.dtype(), &DataType::Float64); 348 assert_eq!(series.len(), 3); 349 assert_eq!(series.null_count(), 1); 350 } 351 352 #[test] 353 fn test_build_series_bool() { 354 let cells = vec![&Data::Bool(true), &Data::Bool(false), &Data::Empty]; 355 let series = build_series("flags", &cells, InferredType::Bool).unwrap(); 356 assert_eq!(series.dtype(), &DataType::Boolean); 357 assert_eq!(series.len(), 3); 358 assert_eq!(series.null_count(), 1); 359 } 360 361 #[test] 362 fn test_build_series_string() { 363 let s1 = Data::String("foo".to_string()); 364 let s2 = Data::String("bar".to_string()); 365 let cells: Vec<&Data> = vec![&s1, &s2, &Data::Empty]; 366 let series = build_series("words", &cells, InferredType::String).unwrap(); 367 assert_eq!(series.dtype(), &DataType::String); 368 assert_eq!(series.len(), 3); 369 assert_eq!(series.null_count(), 1); 370 } 371 372 #[test] 373 fn test_range_to_dataframe_skip_empty_range() { 374 use calamine::Range; 375 let range: Range<Data> = Default::default(); 376 let df = range_to_dataframe_skip(&range, 0).unwrap(); 377 assert_eq!(df.height(), 0); 378 assert_eq!(df.width(), 0); 379 } 380 381 #[test] 382 fn test_sheet_resolution_default_opts() { 383 // Confirm ReadOptions default has sheet=None and skip_rows=None 384 let opts = ReadOptions::default(); 385 assert!(opts.sheet.is_none()); 386 assert!(opts.skip_rows.is_none()); 387 } 388 }