dt-cli-tools

CLI tools for viewing, filtering, and comparing tabular data files
Log | Files | Refs | README | LICENSE

excel.rs (12991B)


      1 use anyhow::{Context, Result};
      2 use calamine::{open_workbook_auto, Data, Reader};
      3 use polars::prelude::*;
      4 use std::path::Path;
      5 
      6 use crate::metadata::SheetInfo;
      7 use crate::reader::ReadOptions;
      8 
      9 #[derive(Debug, Clone, Copy, PartialEq)]
     10 enum InferredType {
     11     Int,
     12     Float,
     13     String,
     14     Bool,
     15     DateTime,
     16     Empty,
     17 }
     18 
     19 /// Read an Excel file into a DataFrame using the provided options.
     20 ///
     21 /// Sheet resolution order:
     22 /// 1. `opts.sheet` treated as a sheet name (exact match)
     23 /// 2. `opts.sheet` parsed as a 0-based index
     24 /// 3. First sheet (default)
     25 pub fn read(path: &Path, opts: &ReadOptions) -> Result<DataFrame> {
     26     let mut workbook = open_workbook_auto(path)
     27         .with_context(|| format!("Cannot open workbook: {}", path.display()))?;
     28 
     29     let sheet_names = workbook.sheet_names().to_vec();
     30     if sheet_names.is_empty() {
     31         return Ok(DataFrame::default());
     32     }
     33 
     34     let sheet_name: String = match &opts.sheet {
     35         Some(s) => {
     36             // Try exact name match first
     37             if sheet_names.contains(s) {
     38                 s.clone()
     39             } else {
     40                 // Try to parse as 0-based index
     41                 match s.parse::<usize>() {
     42                     Ok(idx) if idx < sheet_names.len() => sheet_names[idx].clone(),
     43                     _ => {
     44                         return Err(anyhow::anyhow!(
     45                             "Sheet '{}' not found in workbook (available: {})",
     46                             s,
     47                             sheet_names.join(", ")
     48                         ));
     49                     }
     50                 }
     51             }
     52         }
     53         None => sheet_names[0].clone(),
     54     };
     55 
     56     let range = workbook
     57         .worksheet_range(&sheet_name)
     58         .with_context(|| format!("Cannot read sheet: {sheet_name}"))?;
     59 
     60     let skip = opts.skip_rows.unwrap_or(0);
     61     range_to_dataframe_skip(&range, skip)
     62 }
     63 
     64 /// Return sheet names and dimensions for an Excel file.
     65 pub fn read_excel_info(path: &Path) -> Result<Vec<SheetInfo>> {
     66     let mut workbook = open_workbook_auto(path)
     67         .with_context(|| format!("Cannot open workbook: {}", path.display()))?;
     68 
     69     let sheet_names = workbook.sheet_names().to_vec();
     70     let mut infos = Vec::with_capacity(sheet_names.len());
     71 
     72     for name in sheet_names {
     73         let range = workbook
     74             .worksheet_range(&name)
     75             .with_context(|| format!("Cannot read sheet: {name}"))?;
     76         let (rows, cols) = range.get_size();
     77         infos.push(SheetInfo { name, rows, cols });
     78     }
     79 
     80     Ok(infos)
     81 }
     82 
     83 pub fn range_to_dataframe(range: &calamine::Range<Data>) -> Result<DataFrame> {
     84     range_to_dataframe_skip(range, 0)
     85 }
     86 
     87 /// Convert a calamine Range to a DataFrame, skipping `skip` rows before the header.
     88 pub fn range_to_dataframe_skip(range: &calamine::Range<Data>, skip: usize) -> Result<DataFrame> {
     89     let rows: Vec<&[Data]> = range.rows().skip(skip).collect();
     90     let cols = if rows.is_empty() {
     91         0
     92     } else {
     93         rows.iter().map(|r| r.len()).max().unwrap_or(0)
     94     };
     95 
     96     if rows.is_empty() || cols == 0 {
     97         return Ok(DataFrame::default());
     98     }
     99 
    100     // First row (after skip) = headers
    101     let headers: Vec<String> = rows[0]
    102         .iter()
    103         .enumerate()
    104         .map(|(i, cell)| match cell {
    105             Data::String(s) => s.clone(),
    106             _ => format!("column_{i}"),
    107         })
    108         .collect();
    109 
    110     if rows.len() == 1 {
    111         // Header only, no data
    112         let series: Vec<Column> = headers
    113             .iter()
    114             .map(|name| {
    115                 Series::new_empty(PlSmallStr::from(name.as_str()), &DataType::Null).into_column()
    116             })
    117             .collect();
    118         return DataFrame::new(series).map_err(Into::into);
    119     }
    120 
    121     let data_rows = &rows[1..];
    122     let mut columns: Vec<Column> = Vec::with_capacity(cols);
    123 
    124     for col_idx in 0..cols {
    125         let cells: Vec<&Data> = data_rows
    126             .iter()
    127             .map(|row| {
    128                 if col_idx < row.len() {
    129                     &row[col_idx]
    130                 } else {
    131                     &Data::Empty
    132                 }
    133             })
    134             .collect();
    135 
    136         let col_type = infer_column_type(&cells);
    137         let series = build_series(&headers[col_idx], &cells, col_type)?;
    138         columns.push(series.into_column());
    139     }
    140 
    141     DataFrame::new(columns).map_err(Into::into)
    142 }
    143 
    144 fn infer_column_type(cells: &[&Data]) -> InferredType {
    145     let mut has_int = false;
    146     let mut has_float = false;
    147     let mut has_string = false;
    148     let mut has_bool = false;
    149     let mut has_datetime = false;
    150     let mut all_empty = true;
    151 
    152     for cell in cells {
    153         match cell {
    154             Data::Empty => {}
    155             Data::String(_) | Data::DateTimeIso(_) | Data::DurationIso(_) => {
    156                 has_string = true;
    157                 all_empty = false;
    158             }
    159             Data::Float(_) => {
    160                 has_float = true;
    161                 all_empty = false;
    162             }
    163             Data::Int(_) => {
    164                 has_int = true;
    165                 all_empty = false;
    166             }
    167             Data::Bool(_) => {
    168                 has_bool = true;
    169                 all_empty = false;
    170             }
    171             Data::DateTime(_) => {
    172                 has_datetime = true;
    173                 all_empty = false;
    174             }
    175             Data::Error(_) => {
    176                 has_string = true;
    177                 all_empty = false;
    178             }
    179         }
    180     }
    181 
    182     if all_empty {
    183         return InferredType::Empty;
    184     }
    185     // String trumps everything
    186     if has_string {
    187         return InferredType::String;
    188     }
    189     // DateTime only if all non-empty cells are datetime
    190     if has_datetime && !has_int && !has_float && !has_bool {
    191         return InferredType::DateTime;
    192     }
    193     // Bool only if all non-empty cells are bool
    194     if has_bool && !has_int && !has_float && !has_datetime {
    195         return InferredType::Bool;
    196     }
    197     // Float if any float or mix of int/float
    198     if has_float {
    199         return InferredType::Float;
    200     }
    201     if has_int {
    202         return InferredType::Int;
    203     }
    204     // Fallback: mixed datetime/bool/etc -> string
    205     InferredType::String
    206 }
    207 
    208 fn build_series(name: &str, cells: &[&Data], col_type: InferredType) -> Result<Series> {
    209     let plname = PlSmallStr::from(name);
    210     match col_type {
    211         InferredType::Int => {
    212             let values: Vec<Option<i64>> = cells
    213                 .iter()
    214                 .map(|cell| match cell {
    215                     Data::Int(v) => Some(*v),
    216                     Data::Empty => None,
    217                     _ => None,
    218                 })
    219                 .collect();
    220             Ok(Series::new(plname, &values))
    221         }
    222         InferredType::Float => {
    223             let values: Vec<Option<f64>> = cells
    224                 .iter()
    225                 .map(|cell| match cell {
    226                     Data::Float(v) => Some(*v),
    227                     Data::Int(v) => Some(*v as f64),
    228                     Data::Empty => None,
    229                     _ => None,
    230                 })
    231                 .collect();
    232             Ok(Series::new(plname, &values))
    233         }
    234         InferredType::Bool => {
    235             let values: Vec<Option<bool>> = cells
    236                 .iter()
    237                 .map(|cell| match cell {
    238                     Data::Bool(v) => Some(*v),
    239                     Data::Empty => None,
    240                     _ => None,
    241                 })
    242                 .collect();
    243             Ok(Series::new(plname, &values))
    244         }
    245         InferredType::DateTime => {
    246             // calamine ExcelDateTime wraps a serial date float (days since 1899-12-30)
    247             // Convert to milliseconds since Unix epoch for polars
    248             let values: Vec<Option<i64>> = cells
    249                 .iter()
    250                 .map(|cell| match cell {
    251                     Data::DateTime(v) => {
    252                         let serial = v.as_f64();
    253                         // Excel epoch: 1899-12-30 = -25569 days from Unix epoch
    254                         let days_from_unix = serial - 25569.0;
    255                         let ms = (days_from_unix * 86_400_000.0) as i64;
    256                         Some(ms)
    257                     }
    258                     Data::Empty => None,
    259                     _ => None,
    260                 })
    261                 .collect();
    262             let series = Series::new(plname, &values);
    263             Ok(series.cast(&DataType::Datetime(TimeUnit::Milliseconds, None))?)
    264         }
    265         InferredType::String | InferredType::Empty => {
    266             let values: Vec<Option<String>> = cells
    267                 .iter()
    268                 .map(|cell| match cell {
    269                     Data::String(s) => Some(s.clone()),
    270                     Data::Float(v) => Some(v.to_string()),
    271                     Data::Int(v) => Some(v.to_string()),
    272                     Data::Bool(v) => Some(v.to_string()),
    273                     Data::DateTime(v) => Some(v.as_f64().to_string()),
    274                     Data::Error(e) => Some(format!("{e:?}")),
    275                     Data::DateTimeIso(s) | Data::DurationIso(s) => Some(s.clone()),
    276                     Data::Empty => None,
    277                 })
    278                 .collect();
    279             Ok(Series::new(plname, &values))
    280         }
    281     }
    282 }
    283 
    284 #[cfg(test)]
    285 mod tests {
    286     use super::*;
    287 
    288     // Tests that use rust_xlsxwriter to create fixtures are skipped here because
    289     // rust_xlsxwriter is not a dev-dependency in dt-cli-tools. The type-inference
    290     // and range-conversion logic is tested below using calamine types directly.
    291     // To enable the full xlsx integration tests, add `rust_xlsxwriter` to
    292     // [dev-dependencies] in Cargo.toml and port the create_simple / create_empty_data
    293     // / create_with_metadata_rows helpers from xl-cli-tool/src/reader.rs.
    294 
    295     #[test]
    296     fn test_infer_int_column() {
    297         let cells = vec![&Data::Int(1), &Data::Int(2), &Data::Empty, &Data::Int(4)];
    298         assert_eq!(infer_column_type(&cells), InferredType::Int);
    299     }
    300 
    301     #[test]
    302     fn test_infer_float_when_mixed_int_float() {
    303         let cells = vec![&Data::Int(1), &Data::Float(2.5), &Data::Int(3)];
    304         assert_eq!(infer_column_type(&cells), InferredType::Float);
    305     }
    306 
    307     #[test]
    308     fn test_infer_string_trumps_all() {
    309         let s = Data::String("hello".to_string());
    310         let cells: Vec<&Data> = vec![&Data::Int(1), &s, &Data::Float(3.0)];
    311         assert_eq!(infer_column_type(&cells), InferredType::String);
    312     }
    313 
    314     #[test]
    315     fn test_infer_empty_column() {
    316         let cells: Vec<&Data> = vec![&Data::Empty, &Data::Empty];
    317         assert_eq!(infer_column_type(&cells), InferredType::Empty);
    318     }
    319 
    320     #[test]
    321     fn test_infer_bool_column() {
    322         let cells = vec![&Data::Bool(true), &Data::Bool(false), &Data::Empty];
    323         assert_eq!(infer_column_type(&cells), InferredType::Bool);
    324     }
    325 
    326     #[test]
    327     fn test_empty_range() {
    328         let range: calamine::Range<Data> = Default::default();
    329         let df = range_to_dataframe(&range).unwrap();
    330         assert_eq!(df.height(), 0);
    331         assert_eq!(df.width(), 0);
    332     }
    333 
    334     #[test]
    335     fn test_build_series_int() {
    336         let cells = vec![&Data::Int(10), &Data::Int(20), &Data::Empty, &Data::Int(40)];
    337         let series = build_series("nums", &cells, InferredType::Int).unwrap();
    338         assert_eq!(series.dtype(), &DataType::Int64);
    339         assert_eq!(series.len(), 4);
    340         assert_eq!(series.null_count(), 1);
    341     }
    342 
    343     #[test]
    344     fn test_build_series_float() {
    345         let cells = vec![&Data::Float(1.5), &Data::Int(2), &Data::Empty];
    346         let series = build_series("vals", &cells, InferredType::Float).unwrap();
    347         assert_eq!(series.dtype(), &DataType::Float64);
    348         assert_eq!(series.len(), 3);
    349         assert_eq!(series.null_count(), 1);
    350     }
    351 
    352     #[test]
    353     fn test_build_series_bool() {
    354         let cells = vec![&Data::Bool(true), &Data::Bool(false), &Data::Empty];
    355         let series = build_series("flags", &cells, InferredType::Bool).unwrap();
    356         assert_eq!(series.dtype(), &DataType::Boolean);
    357         assert_eq!(series.len(), 3);
    358         assert_eq!(series.null_count(), 1);
    359     }
    360 
    361     #[test]
    362     fn test_build_series_string() {
    363         let s1 = Data::String("foo".to_string());
    364         let s2 = Data::String("bar".to_string());
    365         let cells: Vec<&Data> = vec![&s1, &s2, &Data::Empty];
    366         let series = build_series("words", &cells, InferredType::String).unwrap();
    367         assert_eq!(series.dtype(), &DataType::String);
    368         assert_eq!(series.len(), 3);
    369         assert_eq!(series.null_count(), 1);
    370     }
    371 
    372     #[test]
    373     fn test_range_to_dataframe_skip_empty_range() {
    374         use calamine::Range;
    375         let range: Range<Data> = Default::default();
    376         let df = range_to_dataframe_skip(&range, 0).unwrap();
    377         assert_eq!(df.height(), 0);
    378         assert_eq!(df.width(), 0);
    379     }
    380 
    381     #[test]
    382     fn test_sheet_resolution_default_opts() {
    383         // Confirm ReadOptions default has sheet=None and skip_rows=None
    384         let opts = ReadOptions::default();
    385         assert!(opts.sheet.is_none());
    386         assert!(opts.skip_rows.is_none());
    387     }
    388 }