feat: add reader module — calamine to polars DataFrame bridge - xl-cli-tools - CLI tools for viewing and editing Excel files

commit d933fc931fe8da435a304fbc35d82cf65673b30f
parent b659a9d560ec18fb49c70d3921ad0635ad87e46a
Author: Erik Loualiche <eloualic@umn.edu>
Date:   Fri, 13 Mar 2026 15:43:40 -0500

feat: add reader module — calamine to polars DataFrame bridge

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

Diffstat:
M xlcat/src/main.rs  | 1 +
A xlcat/src/reader.rs  | 377 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

2 files changed, 378 insertions(+), 0 deletions(-)
diff --git a/xlcat/src/main.rs b/xlcat/src/main.rs
@@ -1,4 +1,5 @@
 mod metadata;
+mod reader;
 
 use anyhow::Result;
 use clap::Parser;
diff --git a/xlcat/src/reader.rs b/xlcat/src/reader.rs
@@ -0,0 +1,377 @@
+use anyhow::{Context, Result};
+use calamine::{open_workbook_auto, Data, Reader};
+use polars::prelude::*;
+use std::path::Path;
+
+#[derive(Debug, Clone, Copy, PartialEq)]
+enum InferredType {
+    Int,
+    Float,
+    String,
+    Bool,
+    DateTime,
+    Empty,
+}
+
+pub fn read_sheet(path: &Path, sheet_name: &str) -> Result<DataFrame> {
+    let mut workbook = open_workbook_auto(path)
+        .with_context(|| format!("Cannot open workbook: {}", path.display()))?;
+    let range = workbook
+        .worksheet_range(sheet_name)
+        .with_context(|| format!("Cannot read sheet: {sheet_name}"))?;
+    range_to_dataframe(&range)
+}
+
+pub fn range_to_dataframe(range: &calamine::Range<Data>) -> Result<DataFrame> {
+    let (total_rows, cols) = range.get_size();
+    if total_rows == 0 || cols == 0 {
+        return Ok(DataFrame::default());
+    }
+
+    let rows: Vec<&[Data]> = range.rows().collect();
+
+    // First row = headers
+    let headers: Vec<String> = rows[0]
+        .iter()
+        .enumerate()
+        .map(|(i, cell)| match cell {
+            Data::String(s) => s.clone(),
+            _ => format!("column_{i}"),
+        })
+        .collect();
+
+    if total_rows == 1 {
+        // Header only, no data
+        let series: Vec<Column> = headers
+            .iter()
+            .map(|name| {
+                Series::new_empty(PlSmallStr::from(name.as_str()), &DataType::Null).into_column()
+            })
+            .collect();
+        return DataFrame::new(series).map_err(Into::into);
+    }
+
+    let data_rows = &rows[1..];
+    let mut columns: Vec<Column> = Vec::with_capacity(cols);
+
+    for col_idx in 0..cols {
+        let cells: Vec<&Data> = data_rows
+            .iter()
+            .map(|row| {
+                if col_idx < row.len() {
+                    &row[col_idx]
+                } else {
+                    &Data::Empty
+                }
+            })
+            .collect();
+
+        let col_type = infer_column_type(&cells);
+        let series = build_series(&headers[col_idx], &cells, col_type)?;
+        columns.push(series.into_column());
+    }
+
+    DataFrame::new(columns).map_err(Into::into)
+}
+
+fn infer_column_type(cells: &[&Data]) -> InferredType {
+    let mut has_int = false;
+    let mut has_float = false;
+    let mut has_string = false;
+    let mut has_bool = false;
+    let mut has_datetime = false;
+    let mut all_empty = true;
+
+    for cell in cells {
+        match cell {
+            Data::Empty => {}
+            Data::String(_) | Data::DateTimeIso(_) | Data::DurationIso(_) => {
+                has_string = true;
+                all_empty = false;
+            }
+            Data::Float(_) => {
+                has_float = true;
+                all_empty = false;
+            }
+            Data::Int(_) => {
+                has_int = true;
+                all_empty = false;
+            }
+            Data::Bool(_) => {
+                has_bool = true;
+                all_empty = false;
+            }
+            Data::DateTime(_) => {
+                has_datetime = true;
+                all_empty = false;
+            }
+            Data::Error(_) => {
+                has_string = true;
+                all_empty = false;
+            }
+        }
+    }
+
+    if all_empty {
+        return InferredType::Empty;
+    }
+    // String trumps everything
+    if has_string {
+        return InferredType::String;
+    }
+    // DateTime only if all non-empty cells are datetime
+    if has_datetime && !has_int && !has_float && !has_bool {
+        return InferredType::DateTime;
+    }
+    // Bool only if all non-empty cells are bool
+    if has_bool && !has_int && !has_float && !has_datetime {
+        return InferredType::Bool;
+    }
+    // Float if any float or mix of int/float
+    if has_float {
+        return InferredType::Float;
+    }
+    if has_int {
+        return InferredType::Int;
+    }
+    // Fallback: mixed datetime/bool/etc → string
+    InferredType::String
+}
+
+fn build_series(name: &str, cells: &[&Data], col_type: InferredType) -> Result<Series> {
+    let plname = PlSmallStr::from(name);
+    match col_type {
+        InferredType::Int => {
+            let values: Vec<Option<i64>> = cells
+                .iter()
+                .map(|cell| match cell {
+                    Data::Int(v) => Some(*v),
+                    Data::Empty => None,
+                    _ => None,
+                })
+                .collect();
+            Ok(Series::new(plname, &values))
+        }
+        InferredType::Float => {
+            let values: Vec<Option<f64>> = cells
+                .iter()
+                .map(|cell| match cell {
+                    Data::Float(v) => Some(*v),
+                    Data::Int(v) => Some(*v as f64),
+                    Data::Empty => None,
+                    _ => None,
+                })
+                .collect();
+            Ok(Series::new(plname, &values))
+        }
+        InferredType::Bool => {
+            let values: Vec<Option<bool>> = cells
+                .iter()
+                .map(|cell| match cell {
+                    Data::Bool(v) => Some(*v),
+                    Data::Empty => None,
+                    _ => None,
+                })
+                .collect();
+            Ok(Series::new(plname, &values))
+        }
+        InferredType::DateTime => {
+            // calamine ExcelDateTime wraps a serial date float (days since 1899-12-30)
+            // Convert to milliseconds since Unix epoch for polars
+            let values: Vec<Option<i64>> = cells
+                .iter()
+                .map(|cell| match cell {
+                    Data::DateTime(v) => {
+                        let serial = v.as_f64();
+                        // Excel epoch: 1899-12-30 = -25569 days from Unix epoch
+                        let days_from_unix = serial - 25569.0;
+                        let ms = (days_from_unix * 86_400_000.0) as i64;
+                        Some(ms)
+                    }
+                    Data::Empty => None,
+                    _ => None,
+                })
+                .collect();
+            let series = Series::new(plname, &values);
+            Ok(series.cast(&DataType::Datetime(TimeUnit::Milliseconds, None))?)
+        }
+        InferredType::String | InferredType::Empty => {
+            let values: Vec<Option<String>> = cells
+                .iter()
+                .map(|cell| match cell {
+                    Data::String(s) => Some(s.clone()),
+                    Data::Float(v) => Some(v.to_string()),
+                    Data::Int(v) => Some(v.to_string()),
+                    Data::Bool(v) => Some(v.to_string()),
+                    Data::DateTime(v) => Some(v.as_f64().to_string()),
+                    Data::Error(e) => Some(format!("{e:?}")),
+                    Data::DateTimeIso(s) | Data::DurationIso(s) => Some(s.clone()),
+                    Data::Empty => None,
+                })
+                .collect();
+            Ok(Series::new(plname, &values))
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use rust_xlsxwriter::Workbook;
+    use tempfile::NamedTempFile;
+
+    /// Create a simple xlsx with mixed types: string, float, float (int-like), bool
+    fn create_simple(path: &std::path::Path) {
+        let mut wb = Workbook::new();
+        let ws = wb.add_worksheet().set_name("Data").unwrap();
+        ws.write_string(0, 0, "name").unwrap();
+        ws.write_string(0, 1, "amount").unwrap();
+        ws.write_string(0, 2, "count").unwrap();
+        ws.write_string(0, 3, "active").unwrap();
+
+        let names = ["Alice", "Bob", "Charlie", "Diana", "Eve"];
+        let amounts = [100.50, 200.75, 300.00, 400.25, 500.00];
+        let counts = [10.0, 20.0, 30.0, 40.0, 50.0];
+        let active = [true, false, true, false, true];
+
+        for i in 0..5u32 {
+            ws.write_string(i + 1, 0, names[i as usize]).unwrap();
+            ws.write_number(i + 1, 1, amounts[i as usize]).unwrap();
+            ws.write_number(i + 1, 2, counts[i as usize]).unwrap();
+            ws.write_boolean(i + 1, 3, active[i as usize]).unwrap();
+        }
+        wb.save(path).unwrap();
+    }
+
+    /// Create xlsx with header row only, no data
+    fn create_empty_data(path: &std::path::Path) {
+        let mut wb = Workbook::new();
+        let ws = wb.add_worksheet().set_name("Empty").unwrap();
+        ws.write_string(0, 0, "col_a").unwrap();
+        ws.write_string(0, 1, "col_b").unwrap();
+        wb.save(path).unwrap();
+    }
+
+    /// Create xlsx with a completely empty sheet
+    fn create_empty_sheet(path: &std::path::Path) {
+        let mut wb = Workbook::new();
+        wb.add_worksheet().set_name("Blank").unwrap();
+        wb.save(path).unwrap();
+    }
+
+    #[test]
+    fn test_infer_int_column() {
+        let cells = vec![
+            &Data::Int(1),
+            &Data::Int(2),
+            &Data::Empty,
+            &Data::Int(4),
+        ];
+        assert_eq!(infer_column_type(&cells), InferredType::Int);
+    }
+
+    #[test]
+    fn test_infer_float_when_mixed_int_float() {
+        let cells = vec![
+            &Data::Int(1),
+            &Data::Float(2.5),
+            &Data::Int(3),
+        ];
+        assert_eq!(infer_column_type(&cells), InferredType::Float);
+    }
+
+    #[test]
+    fn test_infer_string_trumps_all() {
+        let s = Data::String("hello".to_string());
+        let cells: Vec<&Data> = vec![
+            &Data::Int(1),
+            &s,
+            &Data::Float(3.0),
+        ];
+        assert_eq!(infer_column_type(&cells), InferredType::String);
+    }
+
+    #[test]
+    fn test_infer_empty_column() {
+        let cells: Vec<&Data> = vec![&Data::Empty, &Data::Empty];
+        assert_eq!(infer_column_type(&cells), InferredType::Empty);
+    }
+
+    #[test]
+    fn test_infer_bool_column() {
+        let cells = vec![
+            &Data::Bool(true),
+            &Data::Bool(false),
+            &Data::Empty,
+        ];
+        assert_eq!(infer_column_type(&cells), InferredType::Bool);
+    }
+
+    #[test]
+    fn test_empty_range() {
+        let range: calamine::Range<Data> = Default::default();
+        let df = range_to_dataframe(&range).unwrap();
+        assert_eq!(df.height(), 0);
+        assert_eq!(df.width(), 0);
+    }
+
+    #[test]
+    fn test_range_to_dataframe_basic() {
+        let tmp = NamedTempFile::with_suffix(".xlsx").unwrap();
+        create_simple(tmp.path());
+
+        let df = read_sheet(tmp.path(), "Data").unwrap();
+        assert_eq!(df.height(), 5);
+        assert_eq!(df.width(), 4);
+
+        let names: Vec<String> = df
+            .get_column_names()
+            .iter()
+            .map(|s| s.to_string())
+            .collect();
+        assert_eq!(names, vec!["name", "amount", "count", "active"]);
+    }
+
+    #[test]
+    fn test_read_sheet_types() {
+        let tmp = NamedTempFile::with_suffix(".xlsx").unwrap();
+        create_simple(tmp.path());
+
+        let df = read_sheet(tmp.path(), "Data").unwrap();
+
+        // name column should be String
+        assert_eq!(df.column("name").unwrap().dtype(), &DataType::String);
+
+        // amount column should be Float64
+        assert_eq!(df.column("amount").unwrap().dtype(), &DataType::Float64);
+
+        // active column should be Boolean
+        assert_eq!(df.column("active").unwrap().dtype(), &DataType::Boolean);
+    }
+
+    #[test]
+    fn test_read_sheet_header_only() {
+        let tmp = NamedTempFile::with_suffix(".xlsx").unwrap();
+        create_empty_data(tmp.path());
+
+        let df = read_sheet(tmp.path(), "Empty").unwrap();
+        assert_eq!(df.height(), 0);
+        assert_eq!(df.width(), 2);
+        let names: Vec<String> = df
+            .get_column_names()
+            .iter()
+            .map(|s| s.to_string())
+            .collect();
+        assert_eq!(names, vec!["col_a", "col_b"]);
+    }
+
+    #[test]
+    fn test_read_sheet_empty_sheet() {
+        let tmp = NamedTempFile::with_suffix(".xlsx").unwrap();
+        create_empty_sheet(tmp.path());
+
+        let df = read_sheet(tmp.path(), "Blank").unwrap();
+        assert_eq!(df.height(), 0);
+        assert_eq!(df.width(), 0);
+    }
+}

	xl-cli-tools CLI tools for viewing and editing Excel files
	Log \| Files \| Refs \| README \| LICENSE

M	xlcat/src/main.rs	\|	1	+
A	xlcat/src/reader.rs	\|	377	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++