wrds-download

TUI/CLI tool for browsing and downloading WRDS data
Log | Files | Refs | README

commit 285361cd91279250da280a71ef863788351867e9
parent 91876f0030b40e7c739f88a9b073dfcbe08cd6d8
Author: Erik Loualiche <eloualiche@users.noreply.github.com>
Date:   Mon, 23 Feb 2026 16:38:31 -0600

Merge pull request #8 from LouLouLibs/feat/python-cli

Add Python CLI alternative via uv, restructure repo
Diffstat:
A.github/workflows/ci-python.yml | 35+++++++++++++++++++++++++++++++++++
M.github/workflows/ci.yml | 3+++
M.github/workflows/release.yml | 5++++-
M.gitignore | 8++++++++
MREADME.md | 308+++++++++++++++++++++----------------------------------------------------------
Aclaude-skill-wrds-download-py/README.md | 36++++++++++++++++++++++++++++++++++++
Aclaude-skill-wrds-download-py/SKILL.md | 134+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Mclaude-skill-wrds-download/README.md | 2+-
Mclaude-skill-wrds-download/SKILL.md | 6++++--
Ago/README.md | 189+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Rcmd/download.go -> go/cmd/download.go | 0
Rcmd/download_test.go -> go/cmd/download_test.go | 0
Rcmd/info.go -> go/cmd/info.go | 0
Rcmd/root.go -> go/cmd/root.go | 0
Rcmd/tui.go -> go/cmd/tui.go | 0
Rgo.mod -> go/go.mod | 0
Rgo.sum -> go/go.sum | 0
Rinternal/config/config.go -> go/internal/config/config.go | 0
Rinternal/db/client.go -> go/internal/db/client.go | 0
Rinternal/db/meta.go -> go/internal/db/meta.go | 0
Rinternal/db/meta_test.go -> go/internal/db/meta_test.go | 0
Rinternal/export/export.go -> go/internal/export/export.go | 0
Rinternal/export/export_test.go -> go/internal/export/export_test.go | 0
Rinternal/tui/app.go -> go/internal/tui/app.go | 0
Rinternal/tui/dlform.go -> go/internal/tui/dlform.go | 0
Rinternal/tui/loginform.go -> go/internal/tui/loginform.go | 0
Rinternal/tui/styles.go -> go/internal/tui/styles.go | 0
Rmain.go -> go/main.go | 0
Apython/README.md | 157+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Apython/pyproject.toml | 32++++++++++++++++++++++++++++++++
Apython/src/wrds_dl/__init__.py | 3+++
Apython/src/wrds_dl/__main__.py | 5+++++
Apython/src/wrds_dl/cli.py | 188+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Apython/src/wrds_dl/config.py | 48++++++++++++++++++++++++++++++++++++++++++++++++
Apython/src/wrds_dl/db.py | 126+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Apython/src/wrds_dl/export.py | 150+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Apython/tests/__init__.py | 0
Apython/tests/test_cli.py | 37+++++++++++++++++++++++++++++++++++++
Apython/tests/test_config.py | 65+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Apython/tests/test_db.py | 43+++++++++++++++++++++++++++++++++++++++++++
Apython/tests/test_export.py | 48++++++++++++++++++++++++++++++++++++++++++++++++
Apython/tests/test_integration.py | 145+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
42 files changed, 1540 insertions(+), 233 deletions(-)

diff --git a/.github/workflows/ci-python.yml b/.github/workflows/ci-python.yml @@ -0,0 +1,35 @@ +name: CI (Python) + +on: + push: + branches: [main] + paths: ["python/**"] + pull_request: + branches: [main] + paths: ["python/**"] + +jobs: + test: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ["3.10", "3.11", "3.12"] + defaults: + run: + working-directory: python + steps: + - uses: actions/checkout@v4 + + - uses: astral-sh/setup-uv@v5 + + - name: Set up Python ${{ matrix.python-version }} + run: uv python install ${{ matrix.python-version }} + + - name: Install dependencies + run: uv sync --python ${{ matrix.python-version }} + + - name: Lint + run: uv run ruff check src tests + + - name: Test + run: uv run pytest diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml @@ -10,6 +10,9 @@ on: jobs: test: runs-on: ubuntu-latest + defaults: + run: + working-directory: go steps: - uses: actions/checkout@v4 diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml @@ -13,6 +13,9 @@ jobs: build: needs: [ci] runs-on: ubuntu-latest + defaults: + run: + working-directory: go strategy: matrix: include: @@ -42,7 +45,7 @@ jobs: - uses: actions/upload-artifact@v4 with: name: wrds-dl-${{ matrix.goos }}-${{ matrix.goarch }} - path: wrds-dl-${{ matrix.goos }}-${{ matrix.goarch }}${{ matrix.ext }} + path: go/wrds-dl-${{ matrix.goos }}-${{ matrix.goarch }}${{ matrix.ext }} release: needs: [build] diff --git a/.gitignore b/.gitignore @@ -3,3 +3,11 @@ wrds-dl-* *.parquet *.csv .claude/ + +# Python +python/.venv/ +python/uv.lock +__pycache__/ +*.pyc +*.egg-info/ +dist/ diff --git a/README.md b/README.md @@ -1,75 +1,99 @@ # wrds-dl -A terminal tool for browsing and downloading data from the [WRDS](https://wrds-www.wharton.upenn.edu/) PostgreSQL database. Comes with an interactive TUI for exploration and a CLI for scripted downloads. Output is Parquet or CSV — pure Go, no CGo, cross-platform. 
+A terminal tool for browsing and downloading data from the [WRDS](https://wrds-www.wharton.upenn.edu/) PostgreSQL database. Output is Parquet or CSV. -## Claude Code skill +Two implementations with the same CLI interface — pick whichever fits your environment: -A bundled [Claude Code](https://claude.com/claude-code) skill lets you download WRDS data using natural language: +| | [Go](go/) | [Python](python/) | +|---|---|---| +| **Install** | Pre-built binary (~19 MB) | `uv tool install` / `uv run` | +| **TUI browser** | Yes | No | +| **CLI commands** | `download`, `info` | `download`, `info` | +| **Dependencies** | None (static binary) | Python 3.10+, `uv` | +| **Best for** | Interactive exploration, offline use | HPC clusters, CI, quick installs | -``` -/wrds-download CRSP daily stock data for 2020 +## Quick start + +### Go + +```sh +# Download binary (macOS Apple Silicon example) +curl -L https://github.com/louloulibs/wrds-download/releases/latest/download/wrds-dl-darwin-arm64 \ + -o /usr/local/bin/wrds-dl +chmod +x /usr/local/bin/wrds-dl ``` -Claude will inspect the table, show you the structure, do a dry run for large tables, and download to Parquet. Install by copying the skill into your project or personal skills directory: +### Python ```sh -cp -r claude-skill-wrds-download ~/.claude/skills/wrds-download +# Install as a uv tool +uv tool install wrds-dl --from ./python + +# Or run directly +cd python && uv run wrds-dl --help ``` -See [`claude-skill-wrds-download/`](claude-skill-wrds-download/) for details. 
+## CLI (both implementations) -## Features +```sh +# CRSP monthly stock file — prices and returns for 2020 +wrds-dl download --schema crsp --table msf \ + --columns "permno,date,prc,ret,shrout" \ + --where "date >= '2020-01-01' AND date < '2021-01-01'" \ + --out crsp_msf_2020.parquet -- **TUI** — browse schemas and tables, inspect column metadata, trigger downloads without leaving the terminal -- **CLI** — scriptable `download` command with structured flags or raw SQL -- **`info` command** — inspect table metadata (columns, types, row count) from the command line or scripts -- **Parquet output** — streams rows via pgx and writes Parquet with ZSTD compression using parquet-go (pure Go) -- **CSV output** — streams rows to CSV via encoding/csv -- **Progress feedback** — live row count during large exports (CLI and TUI) -- **Dry-run mode** — preview the query, row count, and first 5 rows before committing to a download -- **Login flow** — interactive login screen with Duo 2FA support; saved credentials for one-press reconnect -- **Database switching** — browse and switch between WRDS databases from within the TUI -- **Standard auth** — reads from `PG*` environment variables, `~/.config/wrds-dl/credentials`, or `~/.pgpass` +# Inspect table metadata before downloading +wrds-dl info --schema crsp --table msf -## Installation +# Dry run — preview query, row count, and sample rows +wrds-dl download --schema crsp --table msf \ + --where "date = '2020-01-31'" --dry-run -### Pre-built binaries (recommended) +# CRSP daily stock file +wrds-dl download --schema crsp --table dsf \ + --where "date >= '2020-01-01' AND date < '2021-01-01'" \ + --out crsp_dsf_2020.parquet -Download the latest release from the [Releases page](https://github.com/louloulibs/wrds-download/releases): +# Select specific columns from Compustat +wrds-dl download --schema comp --table funda \ + --columns "gvkey,datadate,sale,at" \ + --out funda_subset.parquet -| Platform | Binary | -|---|---| -| macOS (Apple 
Silicon) | `wrds-dl-darwin-arm64` | -| macOS (Intel) | `wrds-dl-darwin-amd64` | -| Linux x86-64 | `wrds-dl-linux-amd64` | -| Windows x86-64 | `wrds-dl-windows-amd64.exe` | +# Raw SQL +wrds-dl download \ + --query "SELECT permno, date, prc FROM crsp.msf WHERE date >= '2015-01-01'" \ + --out crsp_msf_2015_onwards.parquet -```sh -# macOS example -curl -L https://github.com/louloulibs/wrds-download/releases/latest/download/wrds-dl-darwin-arm64 \ - -o /usr/local/bin/wrds-dl -chmod +x /usr/local/bin/wrds-dl +# CSV output +wrds-dl download --schema crsp --table msf \ + --columns "permno,date,ret" --limit 1000 \ + --out crsp_msf_sample.csv +``` + +## Claude Code skill + +Bundled [Claude Code](https://claude.com/claude-code) skills let you download WRDS data using natural language: + +``` +/wrds-download CRSP daily stock data for 2020 ``` -### Build from source +Two variants available: +- [`claude-skill-wrds-download/`](claude-skill-wrds-download/) — uses the Go binary +- [`claude-skill-wrds-download-py/`](claude-skill-wrds-download-py/) — uses the Python CLI (no binary needed) -Requires Go 1.25+. No CGo or C compiler needed. +Install by copying the skill into your skills directory: ```sh -git clone https://github.com/louloulibs/wrds-download -cd wrds-download -CGO_ENABLED=0 go build -ldflags="-s -w" -o wrds-dl . -mv wrds-dl /usr/local/bin/ +cp -r claude-skill-wrds-download-py ~/.claude/skills/wrds-download ``` ## Authentication -WRDS uses Duo two-factor authentication. The TUI always starts on a login screen so you control when the connection (and Duo push) fires. +WRDS uses Duo two-factor authentication. Configure credentials before using the CLI. ### Option 1: Environment variables -Set the standard PostgreSQL environment variables before running: - ```sh export PGUSER=your_username export PGPASSWORD=your_password @@ -85,7 +109,7 @@ export PGDATABASE=wrds ### Option 2: Saved credentials -On first login via the TUI, check "Save to ~/.config/wrds-dl/credentials". 
On subsequent launches, press `enter` on the "Login as ..." button. The credentials file is stored at `~/.config/wrds-dl/credentials` (or `$XDG_CONFIG_HOME/wrds-dl/credentials`) with `0600` permissions: +Store credentials at `~/.config/wrds-dl/credentials` (or `$XDG_CONFIG_HOME/wrds-dl/credentials`) with `0600` permissions: ``` PGUSER=your_username @@ -93,6 +117,8 @@ PGPASSWORD=your_password PGDATABASE=wrds ``` +The Go TUI can save these automatically on first login. + ### Option 3: ~/.pgpass Standard PostgreSQL password file: @@ -101,194 +127,18 @@ Standard PostgreSQL password file: wrds-pgdata.wharton.upenn.edu:9737:*:your_username:your_password ``` -## TUI - -Launch the interactive browser: - -```sh -wrds-dl tui -``` - -The TUI has three panes: **Schemas**, **Tables**, and **Preview** (column catalog with types, descriptions, and table stats). - -### Keybindings - -| Key | Action | -|---|---| -| `tab` / `shift+tab` | Cycle focus between panes | -| `right` / `l` | Drill into selected schema or table | -| `left` / `h` | Go back one pane | -| `d` | Open download dialog for the selected table | -| `b` | Switch database | -| `/` | Filter current list (schemas, tables, or columns) | -| `esc` | Cancel / dismiss overlay | -| `q` / `ctrl+c` | Quit | - -### Download dialog - -Press `d` on a selected table to open the download form: - -| Field | Description | -|---|---| -| SELECT columns | Comma-separated column names, or `*` for all | -| WHERE clause | SQL filter without the `WHERE` keyword | -| LIMIT rows | Maximum number of rows to download (leave empty for no limit) | -| Output path | File path; defaults to `./schema_table.parquet` | -| Format | `parquet` or `csv` | - -Navigate with `tab`/`shift+tab`, confirm with `enter` on the last field. - -During download, the spinner shows a live row count updated every 10,000 rows. 
- -## CLI - -### Structured download - -```sh -wrds-dl download \ - --schema crsp \ - --table dsf \ - --where "date >= '2020-01-01' AND date < '2021-01-01'" \ - --out crsp_dsf_2020.parquet -``` - -### Select specific columns - -```sh -wrds-dl download \ - --schema comp \ - --table funda \ - --columns "gvkey,datadate,sale,at" \ - --out funda_subset.parquet -``` - -### Raw SQL - -```sh -wrds-dl download \ - --query "SELECT permno, date, prc FROM crsp.dsf WHERE date > '2020-01-01'" \ - --out crsp_dsf.parquet -``` - -### CSV output - -```sh -wrds-dl download \ - --schema comp \ - --table funda \ - --out funda.csv -``` - -Format is inferred from the output file extension (`.parquet` or `.csv`). Override with `--format`. - -### Dry run - -Preview what a download will do before committing: - -```sh -wrds-dl download \ - --schema crsp \ - --table dsf \ - --where "date = '2020-01-02'" \ - --dry-run -``` - -This prints the SQL query, the row count, and the first 5 rows as a table. No `--out` flag is required for dry runs. - -### All download flags - -| Flag | Description | -|---|---| -| `--schema` | Schema name (e.g. `crsp`, `comp`) | -| `--table` | Table name (e.g. 
`dsf`, `funda`) | -| `-c`, `--columns` | Columns to select (comma-separated, default `*`) | -| `--where` | SQL `WHERE` clause, without the keyword | -| `--query` | Full SQL query — overrides `--schema`, `--table`, `--where`, `--columns` | -| `--out` | Output file path (required unless `--dry-run`) | -| `--format` | `parquet` or `csv` (inferred from extension if omitted) | -| `--limit` | Row limit, useful for testing (default: no limit) | -| `--dry-run` | Preview query, row count, and first 5 rows without downloading | - -### Table info - -Inspect table metadata without downloading data: - -```sh -wrds-dl info --schema crsp --table dsf -``` - -Output: - -``` -crsp.dsf - Daily Stock File - ~245302893 rows, 47 GB - -NAME TYPE NULLABLE DESCRIPTION -cusip character varying(8) YES CUSIP - HISTORICAL -permno double precision YES PERMNO -permco double precision YES PERMCO -... -``` - -For machine-readable output (useful in scripts and coding assistants): - -```sh -wrds-dl info --schema crsp --table dsf --json -``` - -## How it works - -`wrds-dl` connects directly to the WRDS PostgreSQL server using [pgx](https://github.com/jackc/pgx). All operations — metadata browsing, column inspection, and data download — go through a single pooled connection (limited to 1 to avoid triggering multiple Duo 2FA prompts). - -**Downloads** stream rows from Postgres and write them incrementally: -- **Parquet**: rows are batched (10,000 per row group) and written with ZSTD compression via [parquet-go](https://github.com/parquet-go/parquet-go). String columns use PLAIN encoding for broad compatibility (R, Python, Julia). -- **CSV**: rows are streamed directly to disk via Go's `encoding/csv`. - -Progress is reported every 10,000 rows — printed to stderr on the CLI and shown in the TUI spinner overlay. 
- -PostgreSQL types are mapped to Parquet types: `bool` → BOOLEAN, `int2/int4` → INT32, `int8` → INT64, `float4` → FLOAT, `float8` → DOUBLE, `date` → DATE, `timestamp/timestamptz` → TIMESTAMP (microseconds), `numeric` → STRING, `text/varchar/char` → STRING. - -Schema and table names are quoted as PostgreSQL identifiers to prevent SQL injection. Column names from `--columns` are individually quoted. - ## Project structure ``` wrds-download/ -├── main.go # entrypoint -├── claude-skill-wrds-download/ -│ ├── SKILL.md # Claude Code skill for natural-language downloads -│ └── README.md # skill installation instructions -├── cmd/ -│ ├── root.go # cobra root command -│ ├── tui.go # `wrds-dl tui` — launches interactive browser -│ ├── download.go # `wrds-dl download` — CLI download with --dry-run -│ └── info.go # `wrds-dl info` — table metadata inspection -├── internal/ -│ ├── db/ -│ │ ├── client.go # pgx pool, DSN construction, connection management -│ │ └── meta.go # schema/table/column queries against pg_catalog -│ ├── export/ -│ │ └── export.go # Export() — pgx streaming → parquet-go / csv writer -│ ├── tui/ -│ │ ├── app.go # root Bubble Tea model, Update/View, pane navigation -│ │ ├── loginform.go # login dialog with saved-credentials support -│ │ ├── dlform.go # download dialog (columns, where, limit, output, format) -│ │ └── styles.go # lipgloss styles and colors -│ └── config/ -│ └── config.go # credentials file read/write (~/.config/wrds-dl/) -└── .github/workflows/ - ├── ci.yml # CI: go vet, build, and test on push/PR - └── release.yml # Release: cross-compile 4 targets, attach to GitHub Release +├── go/ # Go implementation (TUI + CLI) +│ ├── main.go +│ ├── cmd/ # download, info, tui commands +│ └── internal/ # db, export, tui, config modules +├── python/ # Python implementation (CLI only) +│ ├── pyproject.toml +│ └── src/wrds_dl/ # cli, db, export, config modules +├── claude-skill-wrds-download/ # Claude skill (Go binary) +├── claude-skill-wrds-download-py/ # 
Claude skill (Python/uv) +└── .github/workflows/ # CI for both implementations ``` - -## Dependencies - -| Package | Purpose | -|---|---| -| [`jackc/pgx/v5`](https://github.com/jackc/pgx) | PostgreSQL driver — all queries and data streaming | -| [`parquet-go/parquet-go`](https://github.com/parquet-go/parquet-go) | Parquet file writing with ZSTD compression (pure Go) | -| [`charmbracelet/bubbletea`](https://github.com/charmbracelet/bubbletea) | TUI framework | -| [`charmbracelet/bubbles`](https://github.com/charmbracelet/bubbles) | List, text-input, spinner components | -| [`charmbracelet/lipgloss`](https://github.com/charmbracelet/lipgloss) | Terminal layout and styling | -| [`spf13/cobra`](https://github.com/spf13/cobra) | CLI commands and flags | diff --git a/claude-skill-wrds-download-py/README.md b/claude-skill-wrds-download-py/README.md @@ -0,0 +1,36 @@ +# Claude Code Skill: WRDS Download (Python) + +A [Claude Code](https://claude.com/claude-code) skill for downloading WRDS data using natural language. This variant uses the **Python** `wrds-dl` CLI (via `uv`) — no pre-built binary required. + +## Installation + +Copy the skill folder into your Claude Code skills directory: + +```sh +# Personal (all projects) +cp -r claude-skill-wrds-download-py ~/.claude/skills/wrds-download + +# Or project-local +cp -r claude-skill-wrds-download-py .claude/skills/wrds-download +``` + +## Prerequisites + +1. **`uv`** — install from https://docs.astral.sh/uv/ +2. **`wrds-dl`** — install the Python CLI: + ```sh + uv tool install wrds-dl --from /path/to/wrds-download/python + ``` +3. **WRDS credentials** — set environment variables or save credentials: + ```sh + export PGUSER=your_username + export PGPASSWORD=your_password + ``` + +## Usage + +``` +/wrds-download CRSP daily stock data for 2020 +``` + +Claude will inspect the table, show you the structure, do a dry run for large tables, and download to Parquet. 
diff --git a/claude-skill-wrds-download-py/SKILL.md b/claude-skill-wrds-download-py/SKILL.md @@ -0,0 +1,134 @@ +--- +name: wrds-download +description: Download data from the WRDS (Wharton Research Data Services) PostgreSQL database to local Parquet or CSV files. Use when the user asks to get data from WRDS, download financial data, or mentions WRDS schemas like crsp, comp, optionm, ibes, etc. +allowed-tools: Bash(wrds-dl *), Bash(uv run *wrds*), Read, Grep +argument-hint: [description of data needed] +--- + +# WRDS Data Download (Python) + +You help users download data from the Wharton Research Data Services (WRDS) PostgreSQL database using the `wrds-dl` Python CLI tool (via `uv`). + +## Prerequisites + +The user must have: +- **`uv`** installed (https://docs.astral.sh/uv/) +- **WRDS credentials** configured via one of: + - Environment variables: `PGUSER` and `PGPASSWORD` + - Saved credentials at `~/.config/wrds-dl/credentials` + - Standard `~/.pgpass` file + +Run the CLI with `uv run wrds-dl` from the `python/` directory of the wrds-download repo, or `wrds-dl` if installed via `uv tool install`. + +If `wrds-dl` is not found and `uv` is available, install it: +```bash +uv tool install wrds-dl --from /path/to/wrds-download/python +``` + +## Workflow + +Follow these steps for every download request: + +### Step 1: Identify the table + +Parse the user's request to determine the WRDS schema and table. 
Common mappings: + +| Dataset | Schema | Key Tables | +|---------|--------|------------| +| CRSP daily stock | `crsp` | `dsf` (daily), `msf` (monthly), `dsi` (index) | +| CRSP events | `crsp` | `dsedelist`, `stocknames` | +| Compustat annual | `comp` | `funda` | +| Compustat quarterly | `comp` | `fundq` | +| Compustat global | `comp_global_daily` | `g_funda`, `g_fundq` | +| IBES | `ibes` | `statsum_epsus`, `actu_epsus` | +| OptionMetrics | `optionm` | `opprcd` (prices), `secprd` (security) | +| TAQ | `taqmsec` | `ctm_YYYYMMDD` | +| CRSP/Compustat merged | `crsp` | `ccmxpf_linktable` | +| BoardEx | `boardex` | `na_wrds_company_profile` | +| Institutional (13F) | `tfn` | `s34` | +| Audit Analytics | `audit` | `auditnonreli` | +| Ravenpack | `ravenpack` | `rpa_djnw` | +| Bank Regulatory | `bank` | `call_schedule_rc`, `bhck` | + +If unsure which table, ask the user or use `wrds-dl info` to explore. + +### Step 2: Inspect the table + +Always run `wrds-dl info` first to understand the table structure: + +```bash +wrds-dl info --schema <schema> --table <table> +``` + +Use the output to: +- Confirm the table exists and has the expected columns +- Note column names for the user's requested variables +- Check the estimated row count to warn about large downloads + +For JSON output (useful for parsing): `wrds-dl info --schema <schema> --table <table> --json` + +### Step 3: Dry run + +For tables with more than 1 million estimated rows, or when a WHERE clause is involved, always do a dry run first: + +```bash +wrds-dl download --schema <schema> --table <table> \ + --columns "<cols>" --where "<filter>" --dry-run +``` + +Show the user the row count and sample rows. Ask for confirmation before proceeding if the row count is very large (>10M rows). 
+ +### Step 4: Download + +Build and run the download command: + +```bash +wrds-dl download \ + --schema <schema> \ + --table <table> \ + --columns "<comma-separated columns>" \ + --where "<SQL filter>" \ + --out <output_file> \ + --format <parquet|csv> +``` + +#### Defaults and conventions +- **Format**: Use Parquet unless the user asks for CSV. Parquet is smaller and faster. +- **Output path**: Name the file descriptively, e.g., `crsp_dsf_2020.parquet` or `comp_funda_2010_2023.parquet`. +- **Columns**: Select only the columns the user needs. Don't use `*` on wide tables — ask what variables they need. +- **Limit**: Use `--limit` for testing. Suggest `--limit 1000` if the user is exploring. + +#### Common filters +- Date ranges: `--where "date >= '2020-01-01' AND date < '2021-01-01'"` +- Specific firms by permno: `--where "permno IN (10107, 93436)"` +- Specific firms by gvkey: `--where "gvkey IN ('001690', '012141')"` +- Fiscal year: `--where "fyear >= 2010 AND fyear <= 2023"` + +### Step 5: Verify + +After download completes, confirm the file was created and report its size: + +```bash +ls -lh <output_file> +``` + +## Error handling + +- **Authentication errors**: Remind the user to set `PGUSER`/`PGPASSWORD` or configure `~/.config/wrds-dl/credentials`. +- **Table not found**: Use `wrds-dl info` to check schema/table names. WRDS schemas and table names are lowercase. +- **Timeout on large tables**: Suggest adding a `--where` filter or `--limit` to reduce the result set. +- **Duo 2FA prompt**: The connection triggers a Duo push. Tell the user to approve it on their phone. 
+ +## Example interactions + +**User**: "Download CRSP daily stock data for 2020" +-> `wrds-dl info --schema crsp --table dsf` +-> `wrds-dl download --schema crsp --table dsf --where "date >= '2020-01-01' AND date < '2021-01-01'" --out crsp_dsf_2020.parquet` + +**User**: "Get Compustat annual fundamentals, just gvkey, datadate, and sales" +-> `wrds-dl info --schema comp --table funda` +-> `wrds-dl download --schema comp --table funda --columns "gvkey,datadate,sale" --out comp_funda.parquet` + +**User**: "I need IBES analyst estimates" +-> `wrds-dl info --schema ibes --table statsum_epsus` +-> Ask what date range and variables they need, then download. diff --git a/claude-skill-wrds-download/README.md b/claude-skill-wrds-download/README.md @@ -18,7 +18,7 @@ cp -r claude-skill-wrds-download ~/.claude/skills/wrds-download ## Prerequisites -1. **`wrds-dl` binary** on your PATH — see [Installation](../README.md#installation) +1. **`wrds-dl`** on your PATH — either the [Go binary](../go/) or the [Python CLI](../python/) (`uv tool install wrds-dl --from ./python`) 2. **WRDS credentials** configured via environment variables, saved credentials, or `~/.pgpass` ## Usage diff --git a/claude-skill-wrds-download/SKILL.md b/claude-skill-wrds-download/SKILL.md @@ -11,12 +11,14 @@ You help users download data from the Wharton Research Data Services (WRDS) Post ## Prerequisites -The `wrds-dl` binary must be installed and on the PATH. The user must have WRDS credentials configured via one of: +The `wrds-dl` CLI must be installed and on the PATH. Either the Go binary or the Python version works — they have the same commands and flags. The user must have WRDS credentials configured via one of: - Environment variables: `PGUSER` and `PGPASSWORD` - Saved credentials at `~/.config/wrds-dl/credentials` - Standard `~/.pgpass` file -If `wrds-dl` is not found, tell the user to install it from https://github.com/LouLouLibs/wrds-download/releases or build from source with `go build`. 
+If `wrds-dl` is not found, tell the user to install it: +- **Go binary**: download from https://github.com/LouLouLibs/wrds-download/releases +- **Python (via uv)**: `uv tool install wrds-dl --from /path/to/wrds-download/python` ## Workflow diff --git a/go/README.md b/go/README.md @@ -0,0 +1,189 @@ +# wrds-dl (Go) + +Full-featured terminal tool for browsing and downloading data from the [WRDS](https://wrds-www.wharton.upenn.edu/) PostgreSQL database. Includes an interactive TUI for exploration and a CLI for scripted downloads. Output is Parquet or CSV — pure Go, no CGo, cross-platform. + +## Features + +- **TUI** — browse schemas and tables, inspect column metadata, trigger downloads without leaving the terminal +- **CLI** — scriptable `download` command with structured flags or raw SQL +- **`info` command** — inspect table metadata (columns, types, row count) from the command line or scripts +- **Parquet output** — streams rows via pgx and writes Parquet with ZSTD compression using parquet-go (pure Go) +- **CSV output** — streams rows to CSV via encoding/csv +- **Progress feedback** — live row count during large exports (CLI and TUI) +- **Dry-run mode** — preview the query, row count, and first 5 rows before committing to a download +- **Login flow** — interactive login screen with Duo 2FA support; saved credentials for one-press reconnect +- **Database switching** — browse and switch between WRDS databases from within the TUI + +## Installation + +### Pre-built binaries (recommended) + +Download the latest release from the [Releases page](https://github.com/louloulibs/wrds-download/releases): + +| Platform | Binary | +|---|---| +| macOS (Apple Silicon) | `wrds-dl-darwin-arm64` | +| macOS (Intel) | `wrds-dl-darwin-amd64` | +| Linux x86-64 | `wrds-dl-linux-amd64` | +| Windows x86-64 | `wrds-dl-windows-amd64.exe` | + +```sh +# macOS example +curl -L https://github.com/louloulibs/wrds-download/releases/latest/download/wrds-dl-darwin-arm64 \ + -o 
/usr/local/bin/wrds-dl +chmod +x /usr/local/bin/wrds-dl +``` + +### Build from source + +Requires Go 1.25+. No CGo or C compiler needed. + +```sh +cd go +CGO_ENABLED=0 go build -ldflags="-s -w" -o wrds-dl . +mv wrds-dl /usr/local/bin/ +``` + +## TUI + +Launch the interactive browser: + +```sh +wrds-dl tui +``` + +The TUI has three panes: **Schemas**, **Tables**, and **Preview** (column catalog with types, descriptions, and table stats). + +### Keybindings + +| Key | Action | +|---|---| +| `tab` / `shift+tab` | Cycle focus between panes | +| `right` / `l` | Drill into selected schema or table | +| `left` / `h` | Go back one pane | +| `d` | Open download dialog for the selected table | +| `b` | Switch database | +| `/` | Filter current list (schemas, tables, or columns) | +| `esc` | Cancel / dismiss overlay | +| `q` / `ctrl+c` | Quit | + +### Download dialog + +Press `d` on a selected table to open the download form: + +| Field | Description | +|---|---| +| SELECT columns | Comma-separated column names, or `*` for all | +| WHERE clause | SQL filter without the `WHERE` keyword | +| LIMIT rows | Maximum number of rows to download (leave empty for no limit) | +| Output path | File path; defaults to `./schema_table.parquet` | +| Format | `parquet` or `csv` | + +## CLI + +### CRSP monthly stock file + +```sh +# Inspect the table first +wrds-dl info --schema crsp --table msf + +# Download prices and returns for 2020 +wrds-dl download \ + --schema crsp \ + --table msf \ + --columns "permno,date,prc,ret,shrout" \ + --where "date >= '2020-01-01' AND date < '2021-01-01'" \ + --out crsp_msf_2020.parquet + +# Dry run — check row count before committing +wrds-dl download \ + --schema crsp \ + --table msf \ + --where "date = '2020-01-31'" \ + --dry-run +``` + +### CRSP daily stock file + +```sh +wrds-dl download \ + --schema crsp \ + --table dsf \ + --where "date >= '2020-01-01' AND date < '2021-01-01'" \ + --out crsp_dsf_2020.parquet +``` + +### Select specific columns + +```sh 
+wrds-dl download \ + --schema comp \ + --table funda \ + --columns "gvkey,datadate,sale,at" \ + --out funda_subset.parquet +``` + +### Raw SQL + +```sh +wrds-dl download \ + --query "SELECT permno, date, prc FROM crsp.msf WHERE date >= '2015-01-01'" \ + --out crsp_msf_2015_onwards.parquet +``` + +### CSV output + +```sh +wrds-dl download \ + --schema crsp \ + --table msf \ + --columns "permno,date,ret" --limit 1000 \ + --out crsp_msf_sample.csv +``` + +Format is inferred from the output file extension (`.parquet` or `.csv`). Override with `--format`. + +### All download flags + +| Flag | Description | +|---|---| +| `--schema` | Schema name (e.g. `crsp`, `comp`) | +| `--table` | Table name (e.g. `dsf`, `funda`) | +| `-c`, `--columns` | Columns to select (comma-separated, default `*`) | +| `--where` | SQL `WHERE` clause, without the keyword | +| `--query` | Full SQL query — overrides `--schema`, `--table`, `--where`, `--columns` | +| `--out` | Output file path (required unless `--dry-run`) | +| `--format` | `parquet` or `csv` (inferred from extension if omitted) | +| `--limit` | Row limit, useful for testing (default: no limit) | +| `--dry-run` | Preview query, row count, and first 5 rows without downloading | + +### Table info + +```sh +wrds-dl info --schema crsp --table dsf +``` + +For machine-readable output: + +```sh +wrds-dl info --schema crsp --table dsf --json +``` + +## How it works + +`wrds-dl` connects directly to the WRDS PostgreSQL server using [pgx](https://github.com/jackc/pgx). All operations go through a single pooled connection (limited to 1 to avoid triggering multiple Duo 2FA prompts). + +**Downloads** stream rows from Postgres and write them incrementally: +- **Parquet**: rows are batched (10,000 per row group) and written with ZSTD compression via [parquet-go](https://github.com/parquet-go/parquet-go). String columns use PLAIN encoding for broad compatibility (R, Python, Julia). 
+- **CSV**: rows are streamed directly to disk via Go's `encoding/csv`. + +## Dependencies + +| Package | Purpose | +|---|---| +| [`jackc/pgx/v5`](https://github.com/jackc/pgx) | PostgreSQL driver — all queries and data streaming | +| [`parquet-go/parquet-go`](https://github.com/parquet-go/parquet-go) | Parquet file writing with ZSTD compression (pure Go) | +| [`charmbracelet/bubbletea`](https://github.com/charmbracelet/bubbletea) | TUI framework | +| [`charmbracelet/bubbles`](https://github.com/charmbracelet/bubbles) | List, text-input, spinner components | +| [`charmbracelet/lipgloss`](https://github.com/charmbracelet/lipgloss) | Terminal layout and styling | +| [`spf13/cobra`](https://github.com/spf13/cobra) | CLI commands and flags | diff --git a/cmd/download.go b/go/cmd/download.go diff --git a/cmd/download_test.go b/go/cmd/download_test.go diff --git a/cmd/info.go b/go/cmd/info.go diff --git a/cmd/root.go b/go/cmd/root.go diff --git a/cmd/tui.go b/go/cmd/tui.go diff --git a/go.mod b/go/go.mod diff --git a/go.sum b/go/go.sum diff --git a/internal/config/config.go b/go/internal/config/config.go diff --git a/internal/db/client.go b/go/internal/db/client.go diff --git a/internal/db/meta.go b/go/internal/db/meta.go diff --git a/internal/db/meta_test.go b/go/internal/db/meta_test.go diff --git a/internal/export/export.go b/go/internal/export/export.go diff --git a/internal/export/export_test.go b/go/internal/export/export_test.go diff --git a/internal/tui/app.go b/go/internal/tui/app.go diff --git a/internal/tui/dlform.go b/go/internal/tui/dlform.go diff --git a/internal/tui/loginform.go b/go/internal/tui/loginform.go diff --git a/internal/tui/styles.go b/go/internal/tui/styles.go diff --git a/main.go b/go/main.go diff --git a/python/README.md b/python/README.md @@ -0,0 +1,157 @@ +# wrds-dl (Python) + +Lightweight CLI for downloading data from the [WRDS](https://wrds-www.wharton.upenn.edu/) PostgreSQL database. 
No binary to install — runs anywhere Python and [`uv`](https://docs.astral.sh/uv/) are available. + +Same CLI interface as the [Go version](../go/) (`download` and `info` commands with identical flags), but without the TUI. + +## Installation + +### With uv (recommended) + +```sh +# Install as a tool (available system-wide) +uv tool install wrds-dl --from ./python + +# Or run directly without installing +cd python +uv run wrds-dl --help +``` + +### With pip + +```sh +pip install ./python +wrds-dl --help +``` + +## CLI + +### CRSP monthly stock file + +```sh +# Inspect the table first +wrds-dl info --schema crsp --table msf + +# Download prices and returns for 2020 +wrds-dl download \ + --schema crsp \ + --table msf \ + --columns "permno,date,prc,ret,shrout" \ + --where "date >= '2020-01-01' AND date < '2021-01-01'" \ + --out crsp_msf_2020.parquet + +# Dry run — check row count before committing +wrds-dl download \ + --schema crsp \ + --table msf \ + --where "date = '2020-01-31'" \ + --dry-run +``` + +### CRSP daily stock file + +```sh +wrds-dl download \ + --schema crsp \ + --table dsf \ + --where "date >= '2020-01-01' AND date < '2021-01-01'" \ + --out crsp_dsf_2020.parquet +``` + +### Select specific columns + +```sh +wrds-dl download \ + --schema comp \ + --table funda \ + --columns "gvkey,datadate,sale,at" \ + --out funda_subset.parquet +``` + +### Raw SQL + +```sh +wrds-dl download \ + --query "SELECT permno, date, prc FROM crsp.msf WHERE date >= '2015-01-01'" \ + --out crsp_msf_2015_onwards.parquet +``` + +### CSV output + +```sh +wrds-dl download \ + --schema crsp \ + --table msf \ + --columns "permno,date,ret" --limit 1000 \ + --out crsp_msf_sample.csv +``` + +Format is inferred from the output file extension (`.parquet` or `.csv`). Override with `--format`. + +### All download flags + +| Flag | Description | +|---|---| +| `--schema` | Schema name (e.g. `crsp`, `comp`) | +| `--table` | Table name (e.g. 
`dsf`, `funda`) | +| `-c`, `--columns` | Columns to select (comma-separated, default `*`) | +| `--where` | SQL `WHERE` clause, without the keyword | +| `--query` | Full SQL query — overrides `--schema`, `--table`, `--where`, `--columns` | +| `--out` | Output file path (required unless `--dry-run`) | +| `--format` | `parquet` or `csv` (inferred from extension if omitted) | +| `--limit` | Row limit, useful for testing (default: no limit) | +| `--dry-run` | Preview query, row count, and first 5 rows without downloading | + +### Table info + +Inspect table metadata without downloading data: + +```sh +wrds-dl info --schema crsp --table dsf +``` + +Output: + +``` +crsp.dsf + Daily Stock File + ~245302893 rows, 47 GB + +NAME TYPE NULLABLE DESCRIPTION +cusip character varying(8) YES CUSIP - HISTORICAL +permno double precision YES PERMNO +... +``` + +For machine-readable output: + +```sh +wrds-dl info --schema crsp --table dsf --json +``` + +## How it works + +Connects directly to the WRDS PostgreSQL server using [psycopg](https://www.psycopg.org/) (v3) with bundled `libpq` for cross-platform portability (ARM macOS, x86 Linux, etc.). + +**Downloads** use server-side cursors to stream rows without loading the entire result into memory: +- **Parquet**: rows are batched (10,000 per row group) and written with ZSTD compression via [PyArrow](https://arrow.apache.org/docs/python/). +- **CSV**: rows are streamed to disk via Python's `csv` module. + +Progress is reported to stderr every 10,000 rows. 
"""Allow running as ``python -m wrds_dl``."""

from wrds_dl.cli import cli

# Guard the invocation so that merely importing ``wrds_dl.__main__``
# (e.g. from tooling, coverage, or tests) does not execute the CLI.
# ``python -m wrds_dl`` still runs it, since the interpreter executes
# this module with __name__ == "__main__".
if __name__ == "__main__":
    cli()
@cli.command()
@click.option("--schema", default="", help="Schema name (e.g. crsp)")
@click.option("--table", default="", help="Table name (e.g. dsf)")
@click.option("-c", "--columns", default="*", help="Columns to select (comma-separated, default *)")
@click.option("--where", "where_clause", default="", help="SQL WHERE clause (without the WHERE keyword)")
@click.option("--query", default="", help="Full SQL query (overrides --schema/--table/--where)")
@click.option("--out", default="", help="Output file path (required unless --dry-run)")
@click.option("--format", "fmt", default="", help="Output format: parquet or csv (inferred from extension)")
@click.option("--limit", default=0, type=int, help="Limit number of rows (0 = no limit)")
@click.option("--dry-run", is_flag=True, help="Preview query, row count, and first 5 rows")
def download(
    schema: str,
    table: str,
    columns: str,
    where_clause: str,
    query: str,
    out: str,
    fmt: str,
    limit: int,
    dry_run: bool,
) -> None:
    """Download WRDS data to Parquet or CSV.

    Either --query alone, or --schema plus --table (optionally with
    --columns/--where/--limit), must be supplied. Progress and status
    messages go to stderr so stdout stays clean.
    """
    # Pull PGUSER/PGPASSWORD/PGDATABASE from ~/.config/wrds-dl/credentials
    # into the environment for any values not already set there.
    apply_credentials()

    # Build query.
    if query:
        sql = query
    elif schema and table:
        sql = build_query(schema, table, columns, where_clause, limit)
    else:
        raise click.UsageError("Either --query or both --schema and --table must be specified")

    if dry_run:
        _run_dry_run(sql)
        return

    if not out:
        raise click.UsageError('Required option "--out" not provided')

    # Resolve format.
    # Explicit --format wins; otherwise a .csv extension means CSV and
    # everything else defaults to Parquet.
    resolved_fmt = fmt.lower() if fmt else ("csv" if out.lower().endswith(".csv") else "parquet")

    click.echo(f"Exporting to {out} ({resolved_fmt})...", err=True)

    # Deferred import — presumably so that --help, usage errors, and dry
    # runs never pay the pyarrow import cost (export.py imports pyarrow).
    from wrds_dl.export import export_data

    def progress(rows: int) -> None:
        # Periodic progress callback; export_data decides the cadence.
        click.echo(f"Exported {rows} rows...", err=True)

    export_data(sql, out, resolved_fmt, progress)
    click.echo(f"Done: {out}", err=True)


def _run_dry_run(sql: str) -> None:
    """Print query, row count, and first 5 rows."""
    conn = psycopg.connect(dsn_from_env())
    try:
        with conn.cursor() as cur:
            click.echo("Query:")
            click.echo(f" {sql}")
            click.echo()

            # Row count.
            # Wrapping the user's SQL in a subquery keeps any ORDER BY /
            # LIMIT inside it from interfering with count(*).
            cur.execute(f"SELECT count(*) FROM ({sql}) sub")
            row = cur.fetchone()
            count = row[0] if row else 0
            click.echo(f"Row count: {count}")
            click.echo()

            # Preview first 5 rows.
            cur.execute(f"SELECT * FROM ({sql}) sub LIMIT 5")
            if cur.description is None:
                return

            col_names = [desc.name for desc in cur.description]
            rows = cur.fetchall()

            # Calculate column widths.
            # Each column is at least as wide as its header; NULLs render
            # as the literal string "NULL".
            widths = [len(name) for name in col_names]
            str_rows = []
            for row in rows:
                cells = [str(v) if v is not None else "NULL" for v in row]
                str_rows.append(cells)
                for i, cell in enumerate(cells):
                    widths[i] = max(widths[i], len(cell))

            # Print header and rows.
            header = " ".join(name.ljust(widths[i]) for i, name in enumerate(col_names))
            click.echo(header)
            for cells in str_rows:
                click.echo(" ".join(cell.ljust(widths[i]) for i, cell in enumerate(cells)))
    finally:
        conn.close()


@cli.command()
@click.option("--schema", required=True, help="Schema name (required)")
@click.option("--table", required=True, help="Table name (required)")
@click.option("--json", "as_json", is_flag=True, help="Output as JSON")
def info(schema: str, table: str, as_json: bool) -> None:
    """Show table metadata (columns, types, row count)."""
    apply_credentials()

    # Catalog-only queries: no table data is scanned (see db.table_meta).
    conn = connect()
    try:
        meta = table_meta(conn, schema, table)
    finally:
        conn.close()

    if as_json:
        _print_info_json(meta)
    else:
        _print_info_table(meta)


def _print_info_json(meta) -> None:
    """Emit table metadata as indented JSON; null-valued keys are dropped.

    ``meta`` is a TableMeta from wrds_dl.db (schema/table/comment/row_count/
    size plus a list of column entries).
    """
    data = {
        "schema": meta.schema,
        "table": meta.table,
        "comment": meta.comment or None,
        "row_count": meta.row_count,
        "size": meta.size or None,
        "columns": [
            {
                "name": c.name,
                "type": c.data_type,
                "nullable": c.nullable,
                # Only include "description" when non-empty.
                **({"description": c.description} if c.description else {}),
            }
            for c in meta.columns
        ],
    }
    # Match Go: omit null keys
    data = {k: v for k, v in data.items() if v is not None}
    click.echo(json.dumps(data, indent=2))
def credentials_path() -> Path:
    """Return the credentials file location, honouring $XDG_CONFIG_HOME.

    Resolves to ``$XDG_CONFIG_HOME/wrds-dl/credentials`` when the variable
    is set and non-empty, otherwise ``~/.config/wrds-dl/credentials``.
    """
    xdg_root = os.environ.get("XDG_CONFIG_HOME")
    config_root = Path(xdg_root) if xdg_root else Path.home() / ".config"
    return config_root / "wrds-dl" / "credentials"
+ """ + user = password = database = "" + path = credentials_path() + if not path.is_file(): + return user, password, database + + for line in path.read_text().splitlines(): + line = line.strip() + if not line or line.startswith("#"): + continue + key, _, val = line.partition("=") + key, val = key.strip(), val.strip() + if key == "PGUSER": + user = val + elif key == "PGPASSWORD": + password = val + elif key == "PGDATABASE": + database = val + return user, password, database + + +def apply_credentials() -> None: + """Load credentials from config and set env vars for any values not already set.""" + user, password, database = load_credentials() + if not os.environ.get("PGUSER") and user: + os.environ["PGUSER"] = user + if not os.environ.get("PGPASSWORD") and password: + os.environ["PGPASSWORD"] = password + if not os.environ.get("PGDATABASE") and database: + os.environ["PGDATABASE"] = database diff --git a/python/src/wrds_dl/db.py b/python/src/wrds_dl/db.py @@ -0,0 +1,126 @@ +"""PostgreSQL connection, DSN construction, and metadata queries.""" + +from __future__ import annotations + +import os +from dataclasses import dataclass, field + +import psycopg + + +def _getenv(key: str, fallback: str = "") -> str: + return os.environ.get(key) or fallback + + +def dsn_from_env() -> str: + """Build a PostgreSQL connection string from standard PG* environment variables.""" + host = _getenv("PGHOST", "wrds-pgdata.wharton.upenn.edu") + port = _getenv("PGPORT", "9737") + user = _getenv("PGUSER") + password = _getenv("PGPASSWORD") + database = _getenv("PGDATABASE", "wrds") + + if not user: + raise RuntimeError("PGUSER not set") + + dsn = f"host={host} port={port} user={user} sslmode=require" + if password: + dsn += f" password={password}" + if database: + dsn += f" dbname={database}" + return dsn + + +def connect() -> psycopg.Connection: + """Return a psycopg connection using DSN from environment variables.""" + return psycopg.connect(dsn_from_env()) + + +def quote_ident(s: str) -> 
str: + """Quote a PostgreSQL identifier to prevent SQL injection.""" + return '"' + s.replace('"', '""') + '"' + + +def build_query( + schema: str, + table: str, + columns: str = "*", + where: str = "", + limit: int = 0, +) -> str: + """Build a SELECT query with quoted identifiers.""" + if columns and columns != "*": + parts = [quote_ident(c.strip()) for c in columns.split(",")] + sel = ", ".join(parts) + else: + sel = "*" + + q = f"SELECT {sel} FROM {quote_ident(schema)}.{quote_ident(table)}" + if where: + q += f" WHERE {where}" + if limit > 0: + q += f" LIMIT {limit}" + return q + + +@dataclass +class ColumnMeta: + name: str + data_type: str + nullable: bool + description: str + + +@dataclass +class TableMeta: + schema: str + table: str + comment: str = "" + row_count: int = 0 + size: str = "" + columns: list[ColumnMeta] = field(default_factory=list) + + +def table_meta(conn: psycopg.Connection, schema: str, table: str) -> TableMeta: + """Fetch catalog metadata for a table (no data scan).""" + meta = TableMeta(schema=schema, table=table) + + # Table-level stats (best effort). + with conn.cursor() as cur: + cur.execute( + """ + SELECT c.reltuples::bigint, + COALESCE(pg_size_pretty(pg_total_relation_size(c.oid)), ''), + COALESCE(obj_description(c.oid), '') + FROM pg_class c + JOIN pg_namespace n ON n.oid = c.relnamespace + WHERE n.nspname = %s AND c.relname = %s + """, + (schema, table), + ) + row = cur.fetchone() + if row: + meta.row_count, meta.size, meta.comment = row + + # Column metadata with descriptions from pg_description. 
+ with conn.cursor() as cur: + cur.execute( + """ + SELECT a.attname, + pg_catalog.format_type(a.atttypid, a.atttypmod), + NOT a.attnotnull, + COALESCE(d.description, '') + FROM pg_attribute a + JOIN pg_class c ON a.attrelid = c.oid + JOIN pg_namespace n ON c.relnamespace = n.oid + LEFT JOIN pg_description d ON d.objoid = c.oid AND d.objsubid = a.attnum + WHERE n.nspname = %s AND c.relname = %s + AND a.attnum > 0 AND NOT a.attisdropped + ORDER BY a.attnum + """, + (schema, table), + ) + for name, dtype, nullable, desc in cur: + meta.columns.append(ColumnMeta(name, dtype, nullable, desc)) + + return meta diff --git a/python/src/wrds_dl/export.py b/python/src/wrds_dl/export.py @@ -0,0 +1,150 @@ +"""Export query results to Parquet or CSV with streaming and progress.""" + +from __future__ import annotations + +import csv +import sys +from decimal import Decimal +from typing import Callable + +import psycopg +import pyarrow as pa +import pyarrow.parquet as pq + +from wrds_dl.db import dsn_from_env + +ROW_GROUP_SIZE = 10_000 + +# Map PostgreSQL type OIDs to PyArrow types. 
def export_data(
    query: str,
    out_path: str,
    fmt: str = "parquet",
    progress_fn: Callable[[int], None] | None = None,
) -> None:
    """Run *query* against WRDS and write results to *out_path*.

    Args:
        query: Full SQL text to execute.
        out_path: Destination file path.
        fmt: "csv" writes CSV; any other value falls through to Parquet.
        progress_fn: Optional callback invoked with the running row count
            (every ROW_GROUP_SIZE rows).

    Raises:
        RuntimeError: If the query yields no result columns.
    """
    conn = psycopg.connect(dsn_from_env())
    try:
        # A named cursor makes psycopg use a server-side cursor, so rows
        # stream from the server in itersize-sized chunks instead of the
        # whole result set being loaded into memory.
        with conn.cursor(name="wrds_export") as cur:
            cur.itersize = ROW_GROUP_SIZE
            cur.execute(query)

            if cur.description is None:
                raise RuntimeError("Query returned no columns")

            col_names = [desc.name for desc in cur.description]
            # Postgres type OIDs drive the Parquet schema mapping
            # (see _PG_OID_TO_ARROW); unknown OIDs fall back to string.
            col_oids = [desc.type_code for desc in cur.description]

            if fmt == "csv":
                _write_csv(cur, col_names, out_path, progress_fn)
            else:
                _write_parquet(cur, col_names, col_oids, out_path, progress_fn)
    finally:
        conn.close()
batch_rows.append(row) + if len(batch_rows) >= ROW_GROUP_SIZE: + _flush_batch(writer, schema, batch_rows, col_names) + total += len(batch_rows) + batch_rows = [] + if progress_fn: + progress_fn(total) + + if batch_rows: + _flush_batch(writer, schema, batch_rows, col_names) + total += len(batch_rows) + finally: + writer.close() + + +def _flush_batch( + writer: pq.ParquetWriter, + schema: pa.Schema, + rows: list[tuple], + col_names: list[str], +) -> None: + """Convert a batch of rows into a PyArrow table and write it.""" + columns: dict[str, list] = {name: [] for name in col_names} + for row in rows: + for i, val in enumerate(row): + # Strip trailing zeros from Decimal values (numeric columns) + # so output matches Go's pgx behaviour. + if isinstance(val, Decimal): + val = str(val.normalize()) + columns[col_names[i]].append(val) + + arrays = [] + for i, name in enumerate(col_names): + try: + arrays.append(pa.array(columns[name], type=schema.field(name).type)) + except (pa.ArrowInvalid, pa.ArrowTypeError): + # Fallback: convert to strings + arrays.append(pa.array([str(v) if v is not None else None for v in columns[name]], + type=pa.string())) + + table = pa.table(dict(zip(col_names, arrays))) + writer.write_table(table) + + +def _format_row(row: tuple) -> list[str]: + """Format a row for CSV output.""" + out = [] + for v in row: + if v is None: + out.append("") + elif isinstance(v, Decimal): + out.append(str(v.normalize())) + else: + out.append(str(v)) + return out diff --git a/python/tests/__init__.py b/python/tests/__init__.py diff --git a/python/tests/test_cli.py b/python/tests/test_cli.py @@ -0,0 +1,37 @@ +"""Tests for CLI commands — help output and flag parsing.""" + +from click.testing import CliRunner + +from wrds_dl.cli import cli + + +def test_cli_help(): + runner = CliRunner() + result = runner.invoke(cli, ["--help"]) + assert result.exit_code == 0 + assert "download" in result.output + assert "info" in result.output + + +def test_download_help(): + runner = 
class TestCredentialsPath:
    """credentials_path() honours $XDG_CONFIG_HOME, defaulting to ~/.config."""

    def test_default(self):
        # With a scrubbed environment the path lands under the home directory.
        with mock.patch.dict(os.environ, {}, clear=True):
            expected = Path.home() / ".config" / "wrds-dl" / "credentials"
            assert credentials_path() == expected

    def test_xdg(self):
        # XDG_CONFIG_HOME redirects the whole config tree.
        with mock.patch.dict(os.environ, {"XDG_CONFIG_HOME": "/tmp/xdg"}):
            assert credentials_path() == Path("/tmp/xdg/wrds-dl/credentials")
"credentials" + creds.write_text("# comment\n\nPGUSER=bob\n") + with mock.patch("wrds_dl.config.credentials_path", return_value=creds): + user, pw, db = load_credentials() + assert user == "bob" + assert pw == "" + + +class TestApplyCredentials: + def test_sets_missing_env(self, tmp_path): + creds = tmp_path / "credentials" + creds.write_text("PGUSER=alice\nPGPASSWORD=secret\n") + with mock.patch("wrds_dl.config.credentials_path", return_value=creds): + env = {"PATH": os.environ.get("PATH", "")} + with mock.patch.dict(os.environ, env, clear=True): + apply_credentials() + assert os.environ["PGUSER"] == "alice" + assert os.environ["PGPASSWORD"] == "secret" + + def test_does_not_override(self, tmp_path): + creds = tmp_path / "credentials" + creds.write_text("PGUSER=alice\n") + with mock.patch("wrds_dl.config.credentials_path", return_value=creds): + with mock.patch.dict(os.environ, {"PGUSER": "existing"}): + apply_credentials() + assert os.environ["PGUSER"] == "existing" diff --git a/python/tests/test_db.py b/python/tests/test_db.py @@ -0,0 +1,43 @@ +"""Tests for db module — query building and identifier quoting.""" + +from wrds_dl.db import build_query, quote_ident + + +class TestQuoteIdent: + def test_simple(self): + assert quote_ident("foo") == '"foo"' + + def test_double_quotes(self): + assert quote_ident('foo"bar') == '"foo""bar"' + + def test_empty(self): + assert quote_ident("") == '""' + + def test_spaces(self): + assert quote_ident("my table") == '"my table"' + + +class TestBuildQuery: + def test_basic(self): + q = build_query("crsp", "dsf") + assert q == 'SELECT * FROM "crsp"."dsf"' + + def test_columns(self): + q = build_query("crsp", "dsf", columns="permno,date,prc") + assert q == 'SELECT "permno", "date", "prc" FROM "crsp"."dsf"' + + def test_where(self): + q = build_query("crsp", "dsf", where="date = '2020-01-02'") + assert q == """SELECT * FROM "crsp"."dsf" WHERE date = '2020-01-02'""" + + def test_limit(self): + q = build_query("crsp", "dsf", 
class TestFormatRow:
    """_format_row stringifies cells for CSV; None becomes the empty string."""

    def test_basic(self):
        # Mixed types round-trip through str(); None maps to "".
        row = (1, "hello", None, 3.14)
        assert _format_row(row) == ["1", "hello", "", "3.14"]

    def test_empty(self):
        # An empty row yields an empty list.
        assert _format_row(()) == []

    def test_all_none(self):
        # Every None cell becomes an empty string.
        assert _format_row((None, None)) == ["", ""]
def _has_credentials() -> bool:
    """Return True when WRDS credentials are available.

    Checks the PGUSER environment variable first, then falls back to the
    on-disk credentials file (which must supply both user and password).
    """
    if os.environ.get("PGUSER"):
        return True
    file_user, file_password, _database = load_credentials()
    return bool(file_user and file_password)
@pytest.mark.skipif(
    not GO_BINARY.is_file(),
    reason=f"Go binary not found at {GO_BINARY}",
)
def test_go_python_parity():
    """Download the same data with Go and Python, assert content hashes match."""
    with tempfile.TemporaryDirectory() as tmpdir:
        py_out = os.path.join(tmpdir, "py.parquet")
        go_out = os.path.join(tmpdir, "go.parquet")

        # Python download.
        # Invoked in-process through click's test runner — no subprocess needed.
        from click.testing import CliRunner
        from wrds_dl.cli import cli

        runner = CliRunner()
        result = runner.invoke(cli, ["download", "--query", QUERY, "--out", py_out])
        assert result.exit_code == 0, f"Python download failed: {result.output}"

        # Go download.
        # The Go binary reads the same PG* environment variables, so pass
        # the current environment through unchanged.
        env = os.environ.copy()
        proc = subprocess.run(
            [str(GO_BINARY), "download", "--query", QUERY, "--out", go_out],
            capture_output=True,
            text=True,
            env=env,
            timeout=60,
        )
        assert proc.returncode == 0, f"Go download failed: {proc.stderr}"

        # Compare content hashes.
        # _content_hash canonicalizes (sorted columns/rows, repr'd cells) so
        # the comparison is independent of the parquet writer implementation.
        py_hash = _content_hash(py_out)
        go_hash = _content_hash(go_out)

        # Read both tables for diagnostics on failure.
        py_table = pq.read_table(py_out)
        go_table = pq.read_table(go_out)

        assert py_table.num_rows == go_table.num_rows, (
            f"Row count mismatch: Python={py_table.num_rows}, Go={go_table.num_rows}"
        )
        assert set(py_table.column_names) == set(go_table.column_names), (
            f"Column mismatch: Python={py_table.column_names}, Go={go_table.column_names}"
        )
        assert py_hash == go_hash, (
            f"Content hash mismatch:\n"
            f" Python: {py_hash}\n"
            f" Go: {go_hash}\n"
            f" Python schema:\n{py_table.schema}\n"
            f" Go schema:\n{go_table.schema}\n"
        )