TigerFetch.jl

Download TIGER/Line shapefiles from the US Census Bureau
Log | Files | Refs | README | LICENSE

validation.jl (8663B)


      1 # ABOUTME: Census shapefile validation utilities for robust testing
      2 # ABOUTME: Provides functions to validate downloaded Census files without hardcoded hashes
      3 
      4 """
      5     ToleranceParams
      6 
      7 Parameters defining acceptable ranges for Census file validation.
      8 """
      9 struct ToleranceParams
     10     min_size_mb::Float64
     11     max_size_mb::Float64
     12     min_features::Union{Int, Nothing}
     13     max_features::Union{Int, Nothing}
     14     geographic_bounds::Union{NamedTuple, Nothing}
     15 end
     16 
     17 """
     18     validate_file_basics(filepath::String, tolerance::ToleranceParams) -> Bool
     19 
     20 Basic file validation: existence, ZIP format, and reasonable size.
     21 """
     22 function validate_file_basics(filepath::String, tolerance::ToleranceParams)
     23     # File must exist
     24     !isfile(filepath) && return false
     25     
     26     # Must be a ZIP file
     27     !endswith(filepath, ".zip") && return false
     28     
     29     # Size check
     30     size_mb = stat(filepath).size / (1024^2)
     31     if size_mb < tolerance.min_size_mb || size_mb > tolerance.max_size_mb
     32         @warn "File size $(round(size_mb, digits=2))MB outside expected range [$(tolerance.min_size_mb), $(tolerance.max_size_mb)]MB"
     33         return false
     34     end
     35     
     36     return true
     37 end
     38 
     39 """
     40     validate_zip_structure(filepath::String) -> Bool
     41 
     42 Validate ZIP file can be opened and contains expected shapefile components.
     43 """
     44 function validate_zip_structure(filepath::String)
     45     try
     46         # Try to read ZIP file structure using unzip -t (test archive)
     47         result = run(pipeline(`unzip -t $filepath`, stdout=devnull, stderr=devnull))
     48         
     49         # Get file listing
     50         zip_contents = readchomp(`unzip -l $filepath`)
     51         
     52         # Check for essential shapefile components
     53         has_shp = occursin(r"\.shp$"m, zip_contents)
     54         has_dbf = occursin(r"\.dbf$"m, zip_contents)  
     55         has_shx = occursin(r"\.shx$"m, zip_contents)
     56         
     57         if !has_shp || !has_dbf || !has_shx
     58             @warn "Missing essential shapefile components: .shp=$has_shp, .dbf=$has_dbf, .shx=$has_shx"
     59             return false
     60         end
     61         
     62         return true
     63     catch e
     64         @warn "Failed to validate ZIP structure: $e"
     65         return false
     66     end
     67 end
     68 
     69 """
     70     validate_census_filename(filepath::String, expected_pattern::Regex) -> Bool
     71 
     72 Validate Census filename follows expected TIGER/Line naming convention.
     73 """
     74 function validate_census_filename(filepath::String, expected_pattern::Regex)
     75     filename = basename(filepath)
     76     if !occursin(expected_pattern, filename)
     77         @warn "Filename '$filename' doesn't match expected pattern $expected_pattern"
     78         return false
     79     end
     80     return true
     81 end
     82 
     83 """
     84     count_dbf_records(filepath::String) -> Union{Int, Nothing}
     85 
     86 Extract the record count from the .dbf header inside a shapefile ZIP.
     87 The DBF header stores the record count as a UInt32 at bytes 4-7 (little-endian).
     88 """
     89 function count_dbf_records(filepath::String)
     90     try
     91         # Find the .dbf filename inside the ZIP
     92         zip_listing = readchomp(`unzip -l $filepath`)
     93         dbf_match = match(r"(\S+\.dbf)$"m, zip_listing)
     94         isnothing(dbf_match) && return nothing
     95 
     96         dbf_name = dbf_match.captures[1]
     97 
     98         # Extract just the .dbf to a temp dir and read its header
     99         tmp = mktempdir()
    100         run(pipeline(`unzip -j -o $filepath $dbf_name -d $tmp`, stdout=devnull, stderr=devnull))
    101         dbf_path = joinpath(tmp, basename(dbf_name))
    102 
    103         header = open(dbf_path) do io
    104             read(io, 8)
    105         end
    106         rm(tmp; recursive=true, force=true)
    107 
    108         length(header) < 8 && return nothing
    109 
    110         # Record count is at bytes 4-7 (0-indexed), little-endian UInt32
    111         n_records = reinterpret(UInt32, header[5:8])[1]
    112         return Int(n_records)
    113     catch e
    114         @warn "Could not read DBF record count: $e"
    115         return nothing
    116     end
    117 end
    118 
    119 """
    120     validate_feature_count(filepath::String, tolerance::ToleranceParams) -> Bool
    121 
    122 Validate that the number of features (DBF records) falls within the expected range.
    123 Skips validation if tolerance bounds are nothing.
    124 """
    125 function validate_feature_count(filepath::String, tolerance::ToleranceParams)
    126     isnothing(tolerance.min_features) && isnothing(tolerance.max_features) && return true
    127 
    128     n_features = count_dbf_records(filepath)
    129     if isnothing(n_features)
    130         @warn "Could not determine feature count, skipping validation"
    131         return true
    132     end
    133 
    134     if !isnothing(tolerance.min_features) && n_features < tolerance.min_features
    135         @warn "Feature count $n_features below minimum $(tolerance.min_features)"
    136         return false
    137     end
    138     if !isnothing(tolerance.max_features) && n_features > tolerance.max_features
    139         @warn "Feature count $n_features above maximum $(tolerance.max_features)"
    140         return false
    141     end
    142 
    143     @info "Feature count: $n_features (expected [$(tolerance.min_features), $(tolerance.max_features)])"
    144     return true
    145 end
    146 
    147 """
    148     validate_census_file_integrity(filepath::String, file_type::String, tolerance::ToleranceParams) -> Bool
    149 
    150 Comprehensive validation of a Census shapefile using the hybrid approach.
    151 """
    152 function validate_census_file_integrity(filepath::String, file_type::String, tolerance::ToleranceParams)
    153     @info "Validating $file_type file: $(basename(filepath))"
    154     
    155     # 1. Basic file validation
    156     if !validate_file_basics(filepath, tolerance)
    157         @error "Basic file validation failed"
    158         return false
    159     end
    160     
    161     # 2. ZIP structure validation
    162     if !validate_zip_structure(filepath)
    163         @error "ZIP structure validation failed"
    164         return false
    165     end
    166     
    167     # 3. Feature count validation
    168     if !validate_feature_count(filepath, tolerance)
    169         @error "Feature count validation failed"
    170         return false
    171     end
    172 
    173     # 4. Filename pattern validation
    174     expected_patterns = Dict(
    175         "state" => r"tl_\d{4}_us_state\.zip$",
    176         "county" => r"tl_\d{4}_us_county\.zip$",
    177         "cbsa" => r"tl_\d{4}_us_cbsa\.zip$",
    178         "urbanarea" => r"tl_\d{4}_us_uac20\.zip$",
    179         "zipcode" => r"tl_\d{4}_us_zcta520\.zip$",
    180         "metrodivision" => r"tl_\d{4}_us_metdiv\.zip$",
    181         "rails" => r"tl_\d{4}_us_rails\.zip$",
    182         "primaryroads" => r"tl_\d{4}_us_primaryroads\.zip$",
    183         "cousub" => r"tl_\d{4}_\d{2}_cousub\.zip$",
    184         "tract" => r"tl_\d{4}_\d{2}_tract\.zip$",
    185         "place" => r"tl_\d{4}_\d{2}_place\.zip$",
    186         "consolidatedcity" => r"tl_\d{4}_\d{2}_concity\.zip$",
    187         "primarysecondaryroads" => r"tl_\d{4}_\d{2}_prisecroads\.zip$",
    188         "areawater" => r"tl_\d{4}_\d{5}_areawater\.zip$",
    189         "linearwater" => r"tl_\d{4}_\d{5}_linearwater\.zip$",
    190         "road" => r"tl_\d{4}_\d{5}_roads\.zip$"
    191     )
    192     
    193     if haskey(expected_patterns, file_type)
    194         if !validate_census_filename(filepath, expected_patterns[file_type])
    195             @error "Filename validation failed"
    196             return false
    197         end
    198     end
    199     
    200     @info "✓ File validation passed for $file_type"
    201     return true
    202 end
    203 
    204 # Define tolerance parameters for different geography types
    205 # Updated based on actual 2024 Census data observations
    206 const TOLERANCE_PARAMS = Dict(
    207     # National geographies
    208     "state" => ToleranceParams(5.0, 50.0, 45, 65, nothing),  # 56 in 2024
    209     "county" => ToleranceParams(75.0, 150.0, 2900, 3500, nothing),  # 3235 in 2024
    210     "cbsa" => ToleranceParams(25.0, 60.0, 800, 1100, nothing),  # 935 in 2024
    211     "urbanarea" => ToleranceParams(60.0, 100.0, 2200, 3200, nothing),  # 2644 in 2024
    212     "zipcode" => ToleranceParams(400.0, 700.0, 28000, 38000, nothing),  # ~33k ZIP codes
    213     "metrodivision" => ToleranceParams(0.5, 5.0, 25, 50, nothing),  # 37 in 2024
    214     "rails" => ToleranceParams(25.0, 150.0, nothing, nothing, nothing),  # Rails was ~32MB
    215     "primaryroads" => ToleranceParams(25.0, 150.0, nothing, nothing, nothing),  # Primary roads was ~43MB
    216     
    217     # State geographies (examples for typical states)
    218     "cousub" => ToleranceParams(5.0, 50.0, 50, 2000, nothing),  # Varies widely by state
    219     "tract" => ToleranceParams(5.0, 100.0, 200, 5000, nothing),  # MN tract was ~7.15MB
    220     "place" => ToleranceParams(2.0, 50.0, 100, 3000, nothing),  # MN place was ~3.96MB
    221     "consolidatedcity" => ToleranceParams(0.01, 10.0, 0, 50, nothing),  # KS was ~0.02MB
    222     "primarysecondaryroads" => ToleranceParams(2.0, 200.0, nothing, nothing, nothing),  # MN was ~3.9MB
    223     
    224     # County geographies (examples for typical counties)
    225     "areawater" => ToleranceParams(0.1, 50.0, 0, 5000, nothing),  # Varies by geography
    226     "linearwater" => ToleranceParams(0.01, 100.0, 0, 10000, nothing),  # Varies widely by county
    227     "road" => ToleranceParams(0.1, 200.0, 100, 50000, nothing)  # Varies widely by county
    228 )