commit 279df986b6657b229bdcb695e99294081a7a3fe4
parent ffcccf83cde6fa0e636583d99574c03047392193
Author: Erik Loualiche <eloualic@umn.edu>
Date: Tue, 31 Mar 2026 07:49:33 -0500
feat: add all format readers and reader dispatch
- CSV/TSV reader with delimiter auto-detection
- Parquet reader via ParquetReader
- Arrow IPC reader via IpcReader
- JSON/NDJSON reader via JsonReader/JsonLineReader
- Excel reader ported from xl-cli-tools with calamine
- Reader dispatch in reader.rs with read_file and read_file_info
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Diffstat:
6 files changed, 752 insertions(+), 0 deletions(-)
diff --git a/src/reader.rs b/src/reader.rs
@@ -0,0 +1,42 @@
+use anyhow::Result;
+use polars::prelude::*;
+use std::path::Path;
+
+use crate::format::Format;
+use crate::metadata::{FileInfo, SheetInfo};
+use crate::readers;
+
/// Options that control how a file is read.
///
/// All fields are optional; `ReadOptions::default()` reads the first sheet,
/// skips no rows, and auto-detects the CSV separator.
#[derive(Debug, Clone, Default)]
pub struct ReadOptions {
    /// Excel only: sheet name, or a 0-based sheet index given as a string.
    pub sheet: Option<String>, // Excel only
    /// Rows to skip. For CSV and Excel the skip happens before the header
    /// row; for Arrow/Parquet/JSON leading data rows are dropped after read.
    pub skip_rows: Option<usize>,
    /// CSV only: separator byte override; auto-detected when `None`.
    pub separator: Option<u8>, // CSV override
}
+
+/// Read a file into a DataFrame, dispatching to the appropriate reader.
+pub fn read_file(path: &Path, format: Format, opts: &ReadOptions) -> Result<DataFrame> {
+ match format {
+ Format::Csv | Format::Tsv => readers::csv::read(path, opts),
+ Format::Parquet => readers::parquet::read(path, opts),
+ Format::Arrow => readers::arrow::read(path, opts),
+ Format::Json | Format::Ndjson => readers::json::read(path, format, opts),
+ Format::Excel => readers::excel::read(path, opts),
+ }
+}
+
+/// Read file metadata: size, format, and sheet info (for Excel).
+pub fn read_file_info(path: &Path, format: Format) -> Result<FileInfo> {
+ let file_size = std::fs::metadata(path)?.len();
+
+ let sheets = match format {
+ Format::Excel => readers::excel::read_excel_info(path)?,
+ _ => vec![],
+ };
+
+ Ok(FileInfo {
+ file_size,
+ format,
+ sheets,
+ })
+}
diff --git a/src/readers/arrow.rs b/src/readers/arrow.rs
@@ -0,0 +1,38 @@
+use anyhow::Result;
+use polars::prelude::*;
+use std::path::Path;
+
+use crate::reader::ReadOptions;
+
+pub fn read(path: &Path, opts: &ReadOptions) -> Result<DataFrame> {
+ let file = std::fs::File::open(path)?;
+ let mut df = IpcReader::new(file).finish()?;
+
+ if let Some(skip) = opts.skip_rows {
+ if skip > 0 && skip < df.height() {
+ df = df.slice(skip as i64, df.height() - skip);
+ }
+ }
+
+ Ok(df)
+}
+
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::NamedTempFile;

    /// Round-trip: write a one-column frame with IpcWriter, read it back.
    #[test]
    fn read_arrow_roundtrip() {
        let col = Series::new("x".into(), &[1i64, 2, 3]);
        let mut frame = DataFrame::new(vec![col.into_column()]).unwrap();

        let tmp = NamedTempFile::with_suffix(".arrow").unwrap();
        let out = std::fs::File::create(tmp.path()).unwrap();
        IpcWriter::new(out).finish(&mut frame).unwrap();

        let got = read(tmp.path(), &ReadOptions::default()).unwrap();
        assert_eq!(got.height(), 3);
        assert_eq!(got.width(), 1);
    }
}
diff --git a/src/readers/csv.rs b/src/readers/csv.rs
@@ -0,0 +1,69 @@
+use anyhow::Result;
+use polars::prelude::*;
+use std::path::Path;
+
+use crate::reader::ReadOptions;
+
+pub fn read(path: &Path, opts: &ReadOptions) -> Result<DataFrame> {
+ let separator = opts.separator.unwrap_or_else(|| {
+ crate::format::detect_csv_delimiter(path).unwrap_or(b',')
+ });
+
+ let reader = CsvReadOptions::default()
+ .with_has_header(true)
+ .with_skip_rows(opts.skip_rows.unwrap_or(0))
+ .with_parse_options(
+ CsvParseOptions::default().with_separator(separator),
+ )
+ .try_into_reader_with_file_path(Some(path.into()))?;
+
+ let df = reader.finish()?;
+ Ok(df)
+}
+
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Write;
    use tempfile::NamedTempFile;

    fn default_opts() -> ReadOptions {
        ReadOptions::default()
    }

    #[test]
    fn read_basic_csv() {
        let mut tmp = NamedTempFile::with_suffix(".csv").unwrap();
        write!(tmp, "name,value\nAlice,100\nBob,200\n").unwrap();
        tmp.flush().unwrap();

        let df = read(tmp.path(), &default_opts()).unwrap();
        assert_eq!((df.height(), df.width()), (2, 2));
    }

    #[test]
    fn read_tsv() {
        let mut tmp = NamedTempFile::with_suffix(".tsv").unwrap();
        write!(tmp, "a\tb\n1\t2\n3\t4\n").unwrap();
        tmp.flush().unwrap();

        let opts = ReadOptions {
            separator: Some(b'\t'),
            ..Default::default()
        };
        let df = read(tmp.path(), &opts).unwrap();
        assert_eq!((df.height(), df.width()), (2, 2));
    }

    /// skip_rows applies before the header, so the metadata line is dropped
    /// and "name,value" becomes the header.
    #[test]
    fn read_with_skip() {
        let mut tmp = NamedTempFile::with_suffix(".csv").unwrap();
        write!(tmp, "metadata line\nname,value\nAlice,100\n").unwrap();
        tmp.flush().unwrap();

        let opts = ReadOptions {
            skip_rows: Some(1),
            ..Default::default()
        };
        let df = read(tmp.path(), &opts).unwrap();
        assert_eq!(df.height(), 1);
        let headers: Vec<String> = df
            .get_column_names()
            .iter()
            .map(|name| name.to_string())
            .collect();
        assert_eq!(headers, vec!["name", "value"]);
    }
}
diff --git a/src/readers/excel.rs b/src/readers/excel.rs
@@ -0,0 +1,388 @@
+use anyhow::{Context, Result};
+use calamine::{open_workbook_auto, Data, Reader};
+use polars::prelude::*;
+use std::path::Path;
+
+use crate::metadata::SheetInfo;
+use crate::reader::ReadOptions;
+
/// Column type inferred from the non-empty cells of one Excel column.
/// Drives which `Series` constructor `build_series` uses.
#[derive(Debug, Clone, Copy, PartialEq)]
enum InferredType {
    /// All non-empty cells are integers.
    Int,
    /// At least one float; ints may be mixed in (widened to f64).
    Float,
    /// Any textual cell present, or a mix with no cleaner common type.
    String,
    /// All non-empty cells are booleans.
    Bool,
    /// All non-empty cells are Excel datetimes.
    DateTime,
    /// Every cell in the column is empty.
    Empty,
}
+
+/// Read an Excel file into a DataFrame using the provided options.
+///
+/// Sheet resolution order:
+/// 1. `opts.sheet` treated as a sheet name (exact match)
+/// 2. `opts.sheet` parsed as a 0-based index
+/// 3. First sheet (default)
+pub fn read(path: &Path, opts: &ReadOptions) -> Result<DataFrame> {
+ let mut workbook = open_workbook_auto(path)
+ .with_context(|| format!("Cannot open workbook: {}", path.display()))?;
+
+ let sheet_names = workbook.sheet_names().to_vec();
+ if sheet_names.is_empty() {
+ return Ok(DataFrame::default());
+ }
+
+ let sheet_name: String = match &opts.sheet {
+ Some(s) => {
+ // Try exact name match first
+ if sheet_names.contains(s) {
+ s.clone()
+ } else {
+ // Try to parse as 0-based index
+ match s.parse::<usize>() {
+ Ok(idx) if idx < sheet_names.len() => sheet_names[idx].clone(),
+ _ => {
+ return Err(anyhow::anyhow!(
+ "Sheet '{}' not found in workbook (available: {})",
+ s,
+ sheet_names.join(", ")
+ ));
+ }
+ }
+ }
+ }
+ None => sheet_names[0].clone(),
+ };
+
+ let range = workbook
+ .worksheet_range(&sheet_name)
+ .with_context(|| format!("Cannot read sheet: {sheet_name}"))?;
+
+ let skip = opts.skip_rows.unwrap_or(0);
+ range_to_dataframe_skip(&range, skip)
+}
+
+/// Return sheet names and dimensions for an Excel file.
+pub fn read_excel_info(path: &Path) -> Result<Vec<SheetInfo>> {
+ let mut workbook = open_workbook_auto(path)
+ .with_context(|| format!("Cannot open workbook: {}", path.display()))?;
+
+ let sheet_names = workbook.sheet_names().to_vec();
+ let mut infos = Vec::with_capacity(sheet_names.len());
+
+ for name in sheet_names {
+ let range = workbook
+ .worksheet_range(&name)
+ .with_context(|| format!("Cannot read sheet: {name}"))?;
+ let (rows, cols) = range.get_size();
+ infos.push(SheetInfo { name, rows, cols });
+ }
+
+ Ok(infos)
+}
+
/// Convert a calamine Range to a DataFrame with no rows skipped.
/// Thin wrapper over `range_to_dataframe_skip` with `skip = 0`.
pub fn range_to_dataframe(range: &calamine::Range<Data>) -> Result<DataFrame> {
    range_to_dataframe_skip(range, 0)
}
+
+/// Convert a calamine Range to a DataFrame, skipping `skip` rows before the header.
+pub fn range_to_dataframe_skip(range: &calamine::Range<Data>, skip: usize) -> Result<DataFrame> {
+ let rows: Vec<&[Data]> = range.rows().skip(skip).collect();
+ let cols = if rows.is_empty() {
+ 0
+ } else {
+ rows.iter().map(|r| r.len()).max().unwrap_or(0)
+ };
+
+ if rows.is_empty() || cols == 0 {
+ return Ok(DataFrame::default());
+ }
+
+ // First row (after skip) = headers
+ let headers: Vec<String> = rows[0]
+ .iter()
+ .enumerate()
+ .map(|(i, cell)| match cell {
+ Data::String(s) => s.clone(),
+ _ => format!("column_{i}"),
+ })
+ .collect();
+
+ if rows.len() == 1 {
+ // Header only, no data
+ let series: Vec<Column> = headers
+ .iter()
+ .map(|name| {
+ Series::new_empty(PlSmallStr::from(name.as_str()), &DataType::Null).into_column()
+ })
+ .collect();
+ return DataFrame::new(series).map_err(Into::into);
+ }
+
+ let data_rows = &rows[1..];
+ let mut columns: Vec<Column> = Vec::with_capacity(cols);
+
+ for col_idx in 0..cols {
+ let cells: Vec<&Data> = data_rows
+ .iter()
+ .map(|row| {
+ if col_idx < row.len() {
+ &row[col_idx]
+ } else {
+ &Data::Empty
+ }
+ })
+ .collect();
+
+ let col_type = infer_column_type(&cells);
+ let series = build_series(&headers[col_idx], &cells, col_type)?;
+ columns.push(series.into_column());
+ }
+
+ DataFrame::new(columns).map_err(Into::into)
+}
+
+fn infer_column_type(cells: &[&Data]) -> InferredType {
+ let mut has_int = false;
+ let mut has_float = false;
+ let mut has_string = false;
+ let mut has_bool = false;
+ let mut has_datetime = false;
+ let mut all_empty = true;
+
+ for cell in cells {
+ match cell {
+ Data::Empty => {}
+ Data::String(_) | Data::DateTimeIso(_) | Data::DurationIso(_) => {
+ has_string = true;
+ all_empty = false;
+ }
+ Data::Float(_) => {
+ has_float = true;
+ all_empty = false;
+ }
+ Data::Int(_) => {
+ has_int = true;
+ all_empty = false;
+ }
+ Data::Bool(_) => {
+ has_bool = true;
+ all_empty = false;
+ }
+ Data::DateTime(_) => {
+ has_datetime = true;
+ all_empty = false;
+ }
+ Data::Error(_) => {
+ has_string = true;
+ all_empty = false;
+ }
+ }
+ }
+
+ if all_empty {
+ return InferredType::Empty;
+ }
+ // String trumps everything
+ if has_string {
+ return InferredType::String;
+ }
+ // DateTime only if all non-empty cells are datetime
+ if has_datetime && !has_int && !has_float && !has_bool {
+ return InferredType::DateTime;
+ }
+ // Bool only if all non-empty cells are bool
+ if has_bool && !has_int && !has_float && !has_datetime {
+ return InferredType::Bool;
+ }
+ // Float if any float or mix of int/float
+ if has_float {
+ return InferredType::Float;
+ }
+ if has_int {
+ return InferredType::Int;
+ }
+ // Fallback: mixed datetime/bool/etc -> string
+ InferredType::String
+}
+
/// Build a polars Series named `name` from raw cells, coercing each cell to
/// the inferred `col_type`. Cells that do not fit the target type — and all
/// empty cells — become nulls; the String case instead stringifies every
/// non-empty variant so no value is lost.
fn build_series(name: &str, cells: &[&Data], col_type: InferredType) -> Result<Series> {
    let plname = PlSmallStr::from(name);
    match col_type {
        InferredType::Int => {
            // Only exact ints survive; any other non-empty variant nulls out.
            let values: Vec<Option<i64>> = cells
                .iter()
                .map(|cell| match cell {
                    Data::Int(v) => Some(*v),
                    Data::Empty => None,
                    _ => None,
                })
                .collect();
            Ok(Series::new(plname, &values))
        }
        InferredType::Float => {
            // Ints are widened to f64 so int/float mixes keep their values.
            let values: Vec<Option<f64>> = cells
                .iter()
                .map(|cell| match cell {
                    Data::Float(v) => Some(*v),
                    Data::Int(v) => Some(*v as f64),
                    Data::Empty => None,
                    _ => None,
                })
                .collect();
            Ok(Series::new(plname, &values))
        }
        InferredType::Bool => {
            let values: Vec<Option<bool>> = cells
                .iter()
                .map(|cell| match cell {
                    Data::Bool(v) => Some(*v),
                    Data::Empty => None,
                    _ => None,
                })
                .collect();
            Ok(Series::new(plname, &values))
        }
        InferredType::DateTime => {
            // calamine ExcelDateTime wraps a serial date float (days since 1899-12-30)
            // Convert to milliseconds since Unix epoch for polars
            let values: Vec<Option<i64>> = cells
                .iter()
                .map(|cell| match cell {
                    Data::DateTime(v) => {
                        let serial = v.as_f64();
                        // Excel epoch: 1899-12-30 = -25569 days from Unix epoch
                        let days_from_unix = serial - 25569.0;
                        // Truncating cast: sub-millisecond fractions are dropped.
                        let ms = (days_from_unix * 86_400_000.0) as i64;
                        Some(ms)
                    }
                    Data::Empty => None,
                    _ => None,
                })
                .collect();
            let series = Series::new(plname, &values);
            Ok(series.cast(&DataType::Datetime(TimeUnit::Milliseconds, None))?)
        }
        InferredType::String | InferredType::Empty => {
            // Stringify every non-empty variant; DateTime renders as its raw
            // serial number, errors as their Debug form.
            let values: Vec<Option<String>> = cells
                .iter()
                .map(|cell| match cell {
                    Data::String(s) => Some(s.clone()),
                    Data::Float(v) => Some(v.to_string()),
                    Data::Int(v) => Some(v.to_string()),
                    Data::Bool(v) => Some(v.to_string()),
                    Data::DateTime(v) => Some(v.as_f64().to_string()),
                    Data::Error(e) => Some(format!("{e:?}")),
                    Data::DateTimeIso(s) | Data::DurationIso(s) => Some(s.clone()),
                    Data::Empty => None,
                })
                .collect();
            Ok(Series::new(plname, &values))
        }
    }
}
+
#[cfg(test)]
mod tests {
    use super::*;

    // Tests that use rust_xlsxwriter to create fixtures are skipped here because
    // rust_xlsxwriter is not a dev-dependency in dt-cli-tools. The type-inference
    // and range-conversion logic is tested below using calamine types directly.
    // To enable the full xlsx integration tests, add `rust_xlsxwriter` to
    // [dev-dependencies] in Cargo.toml and port the create_simple / create_empty_data
    // / create_with_metadata_rows helpers from xl-cli-tool/src/reader.rs.

    #[test]
    fn test_infer_int_column() {
        // Empty cells must not disturb inference of the non-empty kind.
        let cells = vec![&Data::Int(1), &Data::Int(2), &Data::Empty, &Data::Int(4)];
        assert_eq!(infer_column_type(&cells), InferredType::Int);
    }

    #[test]
    fn test_infer_float_when_mixed_int_float() {
        let cells = vec![&Data::Int(1), &Data::Float(2.5), &Data::Int(3)];
        assert_eq!(infer_column_type(&cells), InferredType::Float);
    }

    #[test]
    fn test_infer_string_trumps_all() {
        // A single string cell forces the whole column to String.
        let s = Data::String("hello".to_string());
        let cells: Vec<&Data> = vec![&Data::Int(1), &s, &Data::Float(3.0)];
        assert_eq!(infer_column_type(&cells), InferredType::String);
    }

    #[test]
    fn test_infer_empty_column() {
        let cells: Vec<&Data> = vec![&Data::Empty, &Data::Empty];
        assert_eq!(infer_column_type(&cells), InferredType::Empty);
    }

    #[test]
    fn test_infer_bool_column() {
        let cells = vec![&Data::Bool(true), &Data::Bool(false), &Data::Empty];
        assert_eq!(infer_column_type(&cells), InferredType::Bool);
    }

    #[test]
    fn test_empty_range() {
        let range: calamine::Range<Data> = Default::default();
        let df = range_to_dataframe(&range).unwrap();
        assert_eq!(df.height(), 0);
        assert_eq!(df.width(), 0);
    }

    #[test]
    fn test_build_series_int() {
        // Empty cell becomes a null, preserving row count.
        let cells = vec![&Data::Int(10), &Data::Int(20), &Data::Empty, &Data::Int(40)];
        let series = build_series("nums", &cells, InferredType::Int).unwrap();
        assert_eq!(series.dtype(), &DataType::Int64);
        assert_eq!(series.len(), 4);
        assert_eq!(series.null_count(), 1);
    }

    #[test]
    fn test_build_series_float() {
        // Ints are widened to f64 in a Float column.
        let cells = vec![&Data::Float(1.5), &Data::Int(2), &Data::Empty];
        let series = build_series("vals", &cells, InferredType::Float).unwrap();
        assert_eq!(series.dtype(), &DataType::Float64);
        assert_eq!(series.len(), 3);
        assert_eq!(series.null_count(), 1);
    }

    #[test]
    fn test_build_series_bool() {
        let cells = vec![&Data::Bool(true), &Data::Bool(false), &Data::Empty];
        let series = build_series("flags", &cells, InferredType::Bool).unwrap();
        assert_eq!(series.dtype(), &DataType::Boolean);
        assert_eq!(series.len(), 3);
        assert_eq!(series.null_count(), 1);
    }

    #[test]
    fn test_build_series_string() {
        let s1 = Data::String("foo".to_string());
        let s2 = Data::String("bar".to_string());
        let cells: Vec<&Data> = vec![&s1, &s2, &Data::Empty];
        let series = build_series("words", &cells, InferredType::String).unwrap();
        assert_eq!(series.dtype(), &DataType::String);
        assert_eq!(series.len(), 3);
        assert_eq!(series.null_count(), 1);
    }

    #[test]
    fn test_range_to_dataframe_skip_empty_range() {
        use calamine::Range;
        let range: Range<Data> = Default::default();
        let df = range_to_dataframe_skip(&range, 0).unwrap();
        assert_eq!(df.height(), 0);
        assert_eq!(df.width(), 0);
    }

    #[test]
    fn test_sheet_resolution_default_opts() {
        // Confirm ReadOptions default has sheet=None and skip_rows=None
        let opts = ReadOptions::default();
        assert!(opts.sheet.is_none());
        assert!(opts.skip_rows.is_none());
    }
}
diff --git a/src/readers/json.rs b/src/readers/json.rs
@@ -0,0 +1,176 @@
+use anyhow::Result;
+use polars::prelude::*;
+use std::path::Path;
+
+use crate::format::Format;
+use crate::reader::ReadOptions;
+
+pub fn read(path: &Path, format: Format, opts: &ReadOptions) -> Result<DataFrame> {
+ let file = std::fs::File::open(path)?;
+
+ let mut df = match format {
+ Format::Ndjson => {
+ // NDJSON: one JSON object per line — use JsonLineReader
+ JsonLineReader::new(file).finish()?
+ }
+ _ => {
+ // JSON array format — JsonReader defaults to JsonFormat::Json
+ JsonReader::new(file).finish()?
+ }
+ };
+
+ if let Some(skip) = opts.skip_rows {
+ if skip > 0 && skip < df.height() {
+ df = df.slice(skip as i64, df.height() - skip);
+ }
+ }
+
+ Ok(df)
+}
+
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Write;
    use tempfile::NamedTempFile;

    fn default_opts() -> ReadOptions {
        ReadOptions::default()
    }

    // ── JSON array ────────────────────────────────────────────────────────────

    #[test]
    fn read_json_array_basic() {
        // Note: `{{`/`}}` are write! escapes for literal braces.
        let mut f = NamedTempFile::with_suffix(".json").unwrap();
        write!(
            f,
            r#"[{{"name":"Alice","value":100}},{{"name":"Bob","value":200}},{{"name":"Carol","value":300}}]"#
        )
        .unwrap();
        f.flush().unwrap();

        let df = read(f.path(), Format::Json, &default_opts()).unwrap();
        assert_eq!(df.height(), 3);
        assert_eq!(df.width(), 2);
    }

    #[test]
    fn read_json_array_with_skip() {
        let mut f = NamedTempFile::with_suffix(".json").unwrap();
        write!(
            f,
            r#"[{{"id":1}},{{"id":2}},{{"id":3}},{{"id":4}},{{"id":5}}]"#
        )
        .unwrap();
        f.flush().unwrap();

        let opts = ReadOptions {
            skip_rows: Some(2),
            ..Default::default()
        };
        let df = read(f.path(), Format::Json, &opts).unwrap();
        // 5 rows total, skip 2 → 3 rows remain
        assert_eq!(df.height(), 3);
    }

    #[test]
    fn read_json_array_skip_zero_noop() {
        let mut f = NamedTempFile::with_suffix(".json").unwrap();
        write!(f, r#"[{{"x":1}},{{"x":2}}]"#).unwrap();
        f.flush().unwrap();

        let opts = ReadOptions {
            skip_rows: Some(0),
            ..Default::default()
        };
        let df = read(f.path(), Format::Json, &opts).unwrap();
        assert_eq!(df.height(), 2);
    }

    #[test]
    fn read_json_array_single_row() {
        let mut f = NamedTempFile::with_suffix(".json").unwrap();
        write!(f, r#"[{{"a":42,"b":"hello"}}]"#).unwrap();
        f.flush().unwrap();

        let df = read(f.path(), Format::Json, &default_opts()).unwrap();
        assert_eq!(df.height(), 1);
        assert_eq!(df.width(), 2);
    }

    // ── NDJSON ────────────────────────────────────────────────────────────────

    #[test]
    fn read_ndjson_basic() {
        let mut f = NamedTempFile::with_suffix(".ndjson").unwrap();
        write!(
            f,
            "{}\n{}\n{}\n",
            r#"{"name":"Alice","value":100}"#,
            r#"{"name":"Bob","value":200}"#,
            r#"{"name":"Carol","value":300}"#
        )
        .unwrap();
        f.flush().unwrap();

        let df = read(f.path(), Format::Ndjson, &default_opts()).unwrap();
        assert_eq!(df.height(), 3);
        assert_eq!(df.width(), 2);
    }

    #[test]
    fn read_ndjson_with_skip() {
        let mut f = NamedTempFile::with_suffix(".ndjson").unwrap();
        for i in 1..=5 {
            writeln!(f, r#"{{"id":{}}}"#, i).unwrap();
        }
        f.flush().unwrap();

        let opts = ReadOptions {
            skip_rows: Some(2),
            ..Default::default()
        };
        let df = read(f.path(), Format::Ndjson, &opts).unwrap();
        // 5 rows total, skip 2 → 3 rows remain
        assert_eq!(df.height(), 3);
    }

    // The last line without a trailing newline must still be parsed as a row.
    #[test]
    fn read_ndjson_no_trailing_newline() {
        let mut f = NamedTempFile::with_suffix(".jsonl").unwrap();
        write!(f, "{}\n{}", r#"{"x":1}"#, r#"{"x":2}"#).unwrap();
        f.flush().unwrap();

        let df = read(f.path(), Format::Ndjson, &default_opts()).unwrap();
        assert_eq!(df.height(), 2);
    }

    #[test]
    fn read_ndjson_single_row() {
        let mut f = NamedTempFile::with_suffix(".ndjson").unwrap();
        writeln!(f, r#"{{"a":1,"b":"z"}}"#).unwrap();
        f.flush().unwrap();

        let df = read(f.path(), Format::Ndjson, &default_opts()).unwrap();
        assert_eq!(df.height(), 1);
        assert_eq!(df.width(), 2);
    }

    // ── skip_rows boundary ────────────────────────────────────────────────────

    #[test]
    fn skip_rows_ge_height_noop() {
        let mut f = NamedTempFile::with_suffix(".json").unwrap();
        write!(f, r#"[{{"v":1}},{{"v":2}}]"#).unwrap();
        f.flush().unwrap();

        let opts = ReadOptions {
            skip_rows: Some(10),
            ..Default::default()
        };
        let df = read(f.path(), Format::Json, &opts).unwrap();
        // skip >= height: condition `skip < df.height()` is false → no-op
        assert_eq!(df.height(), 2);
    }
}
diff --git a/src/readers/parquet.rs b/src/readers/parquet.rs
@@ -0,0 +1,39 @@
+use anyhow::Result;
+use polars::prelude::*;
+use std::path::Path;
+
+use crate::reader::ReadOptions;
+
+pub fn read(path: &Path, opts: &ReadOptions) -> Result<DataFrame> {
+ let file = std::fs::File::open(path)?;
+ let mut df = ParquetReader::new(file).finish()?;
+
+ if let Some(skip) = opts.skip_rows {
+ if skip > 0 && skip < df.height() {
+ df = df.slice(skip as i64, df.height() - skip);
+ }
+ }
+
+ Ok(df)
+}
+
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::NamedTempFile;

    /// Round-trip: write a two-column frame with ParquetWriter, read it back.
    #[test]
    fn read_parquet_roundtrip() {
        let names = Series::new("name".into(), &["Alice", "Bob"]);
        let values = Series::new("value".into(), &[100i64, 200]);
        let mut frame = DataFrame::new(vec![names.into_column(), values.into_column()]).unwrap();

        let tmp = NamedTempFile::with_suffix(".parquet").unwrap();
        let out = std::fs::File::create(tmp.path()).unwrap();
        ParquetWriter::new(out).finish(&mut frame).unwrap();

        let got = read(tmp.path(), &ReadOptions::default()).unwrap();
        assert_eq!(got.height(), 2);
        assert_eq!(got.width(), 2);
    }
}