xl-cli-tools

CLI tools for viewing and editing Excel files
Log | Files | Refs | README | LICENSE

commit d933fc931fe8da435a304fbc35d82cf65673b30f
parent b659a9d560ec18fb49c70d3921ad0635ad87e46a
Author: Erik Loualiche <eloualic@umn.edu>
Date:   Fri, 13 Mar 2026 15:43:40 -0500

feat: add reader module — calamine to polars DataFrame bridge

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

Diffstat:
Mxlcat/src/main.rs | 1+
Axlcat/src/reader.rs | 377+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 378 insertions(+), 0 deletions(-)

diff --git a/xlcat/src/main.rs b/xlcat/src/main.rs @@ -1,4 +1,5 @@ mod metadata; +mod reader; use anyhow::Result; use clap::Parser; diff --git a/xlcat/src/reader.rs b/xlcat/src/reader.rs @@ -0,0 +1,377 @@ +use anyhow::{Context, Result}; +use calamine::{open_workbook_auto, Data, Reader}; +use polars::prelude::*; +use std::path::Path; + +#[derive(Debug, Clone, Copy, PartialEq)] +enum InferredType { + Int, + Float, + String, + Bool, + DateTime, + Empty, +} + +pub fn read_sheet(path: &Path, sheet_name: &str) -> Result<DataFrame> { + let mut workbook = open_workbook_auto(path) + .with_context(|| format!("Cannot open workbook: {}", path.display()))?; + let range = workbook + .worksheet_range(sheet_name) + .with_context(|| format!("Cannot read sheet: {sheet_name}"))?; + range_to_dataframe(&range) +} + +pub fn range_to_dataframe(range: &calamine::Range<Data>) -> Result<DataFrame> { + let (total_rows, cols) = range.get_size(); + if total_rows == 0 || cols == 0 { + return Ok(DataFrame::default()); + } + + let rows: Vec<&[Data]> = range.rows().collect(); + + // First row = headers + let headers: Vec<String> = rows[0] + .iter() + .enumerate() + .map(|(i, cell)| match cell { + Data::String(s) => s.clone(), + _ => format!("column_{i}"), + }) + .collect(); + + if total_rows == 1 { + // Header only, no data + let series: Vec<Column> = headers + .iter() + .map(|name| { + Series::new_empty(PlSmallStr::from(name.as_str()), &DataType::Null).into_column() + }) + .collect(); + return DataFrame::new(series).map_err(Into::into); + } + + let data_rows = &rows[1..]; + let mut columns: Vec<Column> = Vec::with_capacity(cols); + + for col_idx in 0..cols { + let cells: Vec<&Data> = data_rows + .iter() + .map(|row| { + if col_idx < row.len() { + &row[col_idx] + } else { + &Data::Empty + } + }) + .collect(); + + let col_type = infer_column_type(&cells); + let series = build_series(&headers[col_idx], &cells, col_type)?; + columns.push(series.into_column()); + } + + DataFrame::new(columns).map_err(Into::into) +} + +fn infer_column_type(cells: &[&Data]) -> InferredType { + let mut has_int = false; + let mut has_float = false; + let mut has_string = false; + let mut has_bool = false; + let mut has_datetime = false; + let mut all_empty = true; + + for cell in cells { + match cell { + Data::Empty => {} + Data::String(_) | Data::DateTimeIso(_) | Data::DurationIso(_) => { + has_string = true; + all_empty = false; + } + Data::Float(_) => { + has_float = true; + all_empty = false; + } + Data::Int(_) => { + has_int = true; + all_empty = false; + } + Data::Bool(_) => { + has_bool = true; + all_empty = false; + } + Data::DateTime(_) => { + has_datetime = true; + all_empty = false; + } + Data::Error(_) => { + has_string = true; + all_empty = false; + } + } + } + + if all_empty { + return InferredType::Empty; + } + // String trumps everything + if has_string { + return InferredType::String; + } + // DateTime only if all non-empty cells are datetime + if has_datetime && !has_int && !has_float && !has_bool { + return InferredType::DateTime; + } + // Bool only if all non-empty cells are bool + if has_bool && !has_int && !has_float && !has_datetime { + return InferredType::Bool; + } + // Float if any float or mix of int/float + if has_float { + return InferredType::Float; + } + if has_int { + return InferredType::Int; + } + // Fallback: mixed datetime/bool/etc → string + InferredType::String +} + +fn build_series(name: &str, cells: &[&Data], col_type: InferredType) -> Result<Series> { + let plname = PlSmallStr::from(name); + match col_type { + InferredType::Int => { + let values: Vec<Option<i64>> = cells + .iter() + .map(|cell| match cell { + Data::Int(v) => Some(*v), + Data::Empty => None, + _ => None, + }) + .collect(); + Ok(Series::new(plname, &values)) + } + InferredType::Float => { + let values: Vec<Option<f64>> = cells + .iter() + .map(|cell| match cell { + Data::Float(v) => Some(*v), + Data::Int(v) => Some(*v as f64), + Data::Empty => None, + _ => None, + }) + .collect(); + Ok(Series::new(plname, &values)) + } + InferredType::Bool => { + let values: Vec<Option<bool>> = cells + .iter() + .map(|cell| match cell { + Data::Bool(v) => Some(*v), + Data::Empty => None, + _ => None, + }) + .collect(); + Ok(Series::new(plname, &values)) + } + InferredType::DateTime => { + // calamine ExcelDateTime wraps a serial date float (days since 1899-12-30) + // Convert to milliseconds since Unix epoch for polars + let values: Vec<Option<i64>> = cells + .iter() + .map(|cell| match cell { + Data::DateTime(v) => { + let serial = v.as_f64(); + // Excel epoch: 1899-12-30 = -25569 days from Unix epoch + let days_from_unix = serial - 25569.0; + let ms = (days_from_unix * 86_400_000.0) as i64; + Some(ms) + } + Data::Empty => None, + _ => None, + }) + .collect(); + let series = Series::new(plname, &values); + Ok(series.cast(&DataType::Datetime(TimeUnit::Milliseconds, None))?) + } + InferredType::String | InferredType::Empty => { + let values: Vec<Option<String>> = cells + .iter() + .map(|cell| match cell { + Data::String(s) => Some(s.clone()), + Data::Float(v) => Some(v.to_string()), + Data::Int(v) => Some(v.to_string()), + Data::Bool(v) => Some(v.to_string()), + Data::DateTime(v) => Some(v.as_f64().to_string()), + Data::Error(e) => Some(format!("{e:?}")), + Data::DateTimeIso(s) | Data::DurationIso(s) => Some(s.clone()), + Data::Empty => None, + }) + .collect(); + Ok(Series::new(plname, &values)) + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use rust_xlsxwriter::Workbook; + use tempfile::NamedTempFile; + + /// Create a simple xlsx with mixed types: string, float, float (int-like), bool + fn create_simple(path: &std::path::Path) { + let mut wb = Workbook::new(); + let ws = wb.add_worksheet().set_name("Data").unwrap(); + ws.write_string(0, 0, "name").unwrap(); + ws.write_string(0, 1, "amount").unwrap(); + ws.write_string(0, 2, "count").unwrap(); + ws.write_string(0, 3, "active").unwrap(); + + let names = ["Alice", "Bob", "Charlie", "Diana", "Eve"]; + let amounts = [100.50, 200.75, 300.00, 400.25, 500.00]; + let counts = [10.0, 20.0, 30.0, 40.0, 50.0]; + let active = [true, false, true, false, true]; + + for i in 0..5u32 { + ws.write_string(i + 1, 0, names[i as usize]).unwrap(); + ws.write_number(i + 1, 1, amounts[i as usize]).unwrap(); + ws.write_number(i + 1, 2, counts[i as usize]).unwrap(); + ws.write_boolean(i + 1, 3, active[i as usize]).unwrap(); + } + wb.save(path).unwrap(); + } + + /// Create xlsx with header row only, no data + fn create_empty_data(path: &std::path::Path) { + let mut wb = Workbook::new(); + let ws = wb.add_worksheet().set_name("Empty").unwrap(); + ws.write_string(0, 0, "col_a").unwrap(); + ws.write_string(0, 1, "col_b").unwrap(); + wb.save(path).unwrap(); + } + + /// Create xlsx with a completely empty sheet + fn create_empty_sheet(path: &std::path::Path) { + let mut wb = Workbook::new(); + wb.add_worksheet().set_name("Blank").unwrap(); + wb.save(path).unwrap(); + } + + #[test] + fn test_infer_int_column() { + let cells = vec![ + &Data::Int(1), + &Data::Int(2), + &Data::Empty, + &Data::Int(4), + ]; + assert_eq!(infer_column_type(&cells), InferredType::Int); + } + + #[test] + fn test_infer_float_when_mixed_int_float() { + let cells = vec![ + &Data::Int(1), + &Data::Float(2.5), + &Data::Int(3), + ]; + assert_eq!(infer_column_type(&cells), InferredType::Float); + } + + #[test] + fn test_infer_string_trumps_all() { + let s = Data::String("hello".to_string()); + let cells: Vec<&Data> = vec![ + &Data::Int(1), + &s, + &Data::Float(3.0), + ]; + assert_eq!(infer_column_type(&cells), InferredType::String); + } + + #[test] + fn test_infer_empty_column() { + let cells: Vec<&Data> = vec![&Data::Empty, &Data::Empty]; + assert_eq!(infer_column_type(&cells), InferredType::Empty); + } + + #[test] + fn test_infer_bool_column() { + let cells = vec![ + &Data::Bool(true), + &Data::Bool(false), + &Data::Empty, + ]; + assert_eq!(infer_column_type(&cells), InferredType::Bool); + } + + #[test] + fn test_empty_range() { + let range: calamine::Range<Data> = Default::default(); + let df = range_to_dataframe(&range).unwrap(); + assert_eq!(df.height(), 0); + assert_eq!(df.width(), 0); + } + + #[test] + fn test_range_to_dataframe_basic() { + let tmp = NamedTempFile::with_suffix(".xlsx").unwrap(); + create_simple(tmp.path()); + + let df = read_sheet(tmp.path(), "Data").unwrap(); + assert_eq!(df.height(), 5); + assert_eq!(df.width(), 4); + + let names: Vec<String> = df + .get_column_names() + .iter() + .map(|s| s.to_string()) + .collect(); + assert_eq!(names, vec!["name", "amount", "count", "active"]); + } + + #[test] + fn test_read_sheet_types() { + let tmp = NamedTempFile::with_suffix(".xlsx").unwrap(); + create_simple(tmp.path()); + + let df = read_sheet(tmp.path(), "Data").unwrap(); + + // name column should be String + assert_eq!(df.column("name").unwrap().dtype(), &DataType::String); + + // amount column should be Float64 + assert_eq!(df.column("amount").unwrap().dtype(), &DataType::Float64); + + // active column should be Boolean + assert_eq!(df.column("active").unwrap().dtype(), &DataType::Boolean); + } + + #[test] + fn test_read_sheet_header_only() { + let tmp = NamedTempFile::with_suffix(".xlsx").unwrap(); + create_empty_data(tmp.path()); + + let df = read_sheet(tmp.path(), "Empty").unwrap(); + assert_eq!(df.height(), 0); + assert_eq!(df.width(), 2); + let names: Vec<String> = df + .get_column_names() + .iter() + .map(|s| s.to_string()) + .collect(); + assert_eq!(names, vec!["col_a", "col_b"]); + } + + #[test] + fn test_read_sheet_empty_sheet() { + let tmp = NamedTempFile::with_suffix(".xlsx").unwrap(); + create_empty_sheet(tmp.path()); + + let df = read_sheet(tmp.path(), "Blank").unwrap(); + assert_eq!(df.height(), 0); + assert_eq!(df.width(), 0); + } +}