validation.jl (8663B)
# ABOUTME: Census shapefile validation utilities for robust testing
# ABOUTME: Provides functions to validate downloaded Census files without hardcoded hashes

"""
    ToleranceParams

Parameters defining acceptable ranges for Census file validation.

# Fields
- `min_size_mb`, `max_size_mb`: inclusive bounds on the archive size, in units of
  `1024^2` bytes.
- `min_features`, `max_features`: inclusive bounds on the DBF record count; `nothing`
  disables the corresponding bound.
- `geographic_bounds`: optional named tuple; not read by the validation functions
  shown in this file.
"""
struct ToleranceParams
    min_size_mb::Float64
    max_size_mb::Float64
    min_features::Union{Int, Nothing}
    max_features::Union{Int, Nothing}
    geographic_bounds::Union{NamedTuple, Nothing}
end

"""
    validate_file_basics(filepath::AbstractString, tolerance::ToleranceParams) -> Bool

Basic file validation: existence, ZIP format, and reasonable size.

Returns `false` when `filepath` is not an existing file, does not end in `.zip`, or has a
size outside `[tolerance.min_size_mb, tolerance.max_size_mb]` (a warning is logged in the
size case). Returns `true` otherwise.
"""
function validate_file_basics(filepath::AbstractString, tolerance::ToleranceParams)
    # File must exist
    isfile(filepath) || return false

    # Must be a ZIP file (checked by extension only)
    endswith(filepath, ".zip") || return false

    # Size check
    size_mb = filesize(filepath) / (1024^2)
    if size_mb < tolerance.min_size_mb || size_mb > tolerance.max_size_mb
        @warn "File size $(round(size_mb, digits=2))MB outside expected range [$(tolerance.min_size_mb), $(tolerance.max_size_mb)]MB"
        return false
    end

    return true
end

"""
    validate_zip_structure(filepath::AbstractString) -> Bool

Validate ZIP file can be opened and contains expected shapefile components.

Runs the external `unzip` command twice: `unzip -t` to test the archive and `unzip -l` to
list it. Returns `false` (with a warning) when either command fails or when the listing
has no entry ending in `.shp`, `.dbf` or `.shx`.
"""
function validate_zip_structure(filepath::AbstractString)
    try
        # Integrity test; `run` throws on a non-zero exit status, which lands in `catch`.
        run(pipeline(`unzip -t $filepath`; stdout=devnull, stderr=devnull))

        # Get file listing
        zip_contents = readchomp(`unzip -l $filepath`)

        # Check for essential shapefile components (entry names end each listing line)
        has_shp = occursin(r"\.shp$"m, zip_contents)
        has_dbf = occursin(r"\.dbf$"m, zip_contents)
        has_shx = occursin(r"\.shx$"m, zip_contents)

        if !(has_shp && has_dbf && has_shx)
            @warn "Missing essential shapefile components: .shp=$has_shp, .dbf=$has_dbf, .shx=$has_shx"
            return false
        end

        return true
    catch e
        @warn "Failed to validate ZIP structure: $e"
        return false
    end
end

"""
    validate_census_filename(filepath::AbstractString, expected_pattern::Regex) -> Bool

Validate Census filename follows expected TIGER/Line naming convention.

Only the base name of `filepath` is matched against `expected_pattern`; a warning is
logged when it does not match.
"""
function validate_census_filename(filepath::AbstractString, expected_pattern::Regex)
    filename = basename(filepath)
    occursin(expected_pattern, filename) && return true

    @warn "Filename '$filename' doesn't match expected pattern $expected_pattern"
    return false
end

"""
    count_dbf_records(filepath::AbstractString) -> Union{Int, Nothing}

Extract the record count from the .dbf header inside a shapefile ZIP.
The DBF header stores the record count as a UInt32 at bytes 4-7 (little-endian).

Returns `nothing` when the archive lists no `.dbf` entry, when the extracted file is
shorter than 8 bytes, or (after logging a warning) when any step throws. The first `.dbf`
entry found in the `unzip -l` listing is used.
"""
function count_dbf_records(filepath::AbstractString)
    try
        # Find the .dbf filename inside the ZIP
        zip_listing = readchomp(`unzip -l $filepath`)
        dbf_match = match(r"(\S+\.dbf)$"m, zip_listing)
        isnothing(dbf_match) && return nothing

        dbf_name = dbf_match.captures[1]

        # Extract just the .dbf into a scratch directory and read its header. The
        # do-block form removes the directory even when `unzip` or `open` throws.
        header = mktempdir() do tmp
            run(
                pipeline(
                    `unzip -j -o $filepath $dbf_name -d $tmp`;
                    stdout=devnull,
                    stderr=devnull,
                ),
            )
            # `-j` junks paths, so the file lands directly in `tmp`.
            open(io -> read(io, 8), joinpath(tmp, basename(dbf_name)))
        end

        length(header) < 8 && return nothing

        # Record count is at bytes 4-7 (0-indexed), little-endian UInt32; `ltoh` makes
        # the decode independent of host byte order.
        n_records = ltoh(reinterpret(UInt32, header[5:8])[1])
        return Int(n_records)
    catch e
        @warn "Could not read DBF record count: $e"
        return nothing
    end
end

"""
    validate_feature_count(filepath::AbstractString, tolerance::ToleranceParams) -> Bool

Validate that the number of features (DBF records) falls within the expected range.
Skips validation if tolerance bounds are nothing.

If the record count cannot be determined the check is also skipped (a warning is logged
and `true` is returned), so an unreadable DBF does not fail validation by itself.
"""
function validate_feature_count(filepath::AbstractString, tolerance::ToleranceParams)
    lower = tolerance.min_features
    upper = tolerance.max_features
    isnothing(lower) && isnothing(upper) && return true

    n_features = count_dbf_records(filepath)
    if isnothing(n_features)
        @warn "Could not determine feature count, skipping validation"
        return true
    end

    if !isnothing(lower) && n_features < lower
        @warn "Feature count $n_features below minimum $(tolerance.min_features)"
        return false
    end
    if !isnothing(upper) && n_features > upper
        @warn "Feature count $n_features above maximum $(tolerance.max_features)"
        return false
    end

    @info "Feature count: $n_features (expected [$(tolerance.min_features), $(tolerance.max_features)])"
    return true
end

# Expected TIGER/Line archive names per geography type. Built once at load time instead
# of on every validation call.
const CENSUS_FILENAME_PATTERNS = Dict{String, Regex}(
    # National files: tl_<year>_us_<layer>.zip
    "state" => r"tl_\d{4}_us_state\.zip$",
    "county" => r"tl_\d{4}_us_county\.zip$",
    "cbsa" => r"tl_\d{4}_us_cbsa\.zip$",
    "urbanarea" => r"tl_\d{4}_us_uac20\.zip$",
    "zipcode" => r"tl_\d{4}_us_zcta520\.zip$",
    "metrodivision" => r"tl_\d{4}_us_metdiv\.zip$",
    "rails" => r"tl_\d{4}_us_rails\.zip$",
    "primaryroads" => r"tl_\d{4}_us_primaryroads\.zip$",
    # Files keyed by a 2-digit code: tl_<year>_<NN>_<layer>.zip
    "cousub" => r"tl_\d{4}_\d{2}_cousub\.zip$",
    "tract" => r"tl_\d{4}_\d{2}_tract\.zip$",
    "place" => r"tl_\d{4}_\d{2}_place\.zip$",
    "consolidatedcity" => r"tl_\d{4}_\d{2}_concity\.zip$",
    "primarysecondaryroads" => r"tl_\d{4}_\d{2}_prisecroads\.zip$",
    # Files keyed by a 5-digit code: tl_<year>_<NNNNN>_<layer>.zip
    "areawater" => r"tl_\d{4}_\d{5}_areawater\.zip$",
    "linearwater" => r"tl_\d{4}_\d{5}_linearwater\.zip$",
    "road" => r"tl_\d{4}_\d{5}_roads\.zip$",
)

"""
    validate_census_file_integrity(filepath::AbstractString, file_type::AbstractString,
                                   tolerance::ToleranceParams) -> Bool

Comprehensive validation of a Census shapefile using the hybrid approach.

Runs, in order and stopping at the first failure: basic file checks, ZIP structure,
feature count, and filename pattern. The filename check is applied only when `file_type`
is a key of `CENSUS_FILENAME_PATTERNS`; unknown types skip it. Each failure is reported
with `@error` and yields `false`.
"""
function validate_census_file_integrity(
    filepath::AbstractString, file_type::AbstractString, tolerance::ToleranceParams
)
    @info "Validating $file_type file: $(basename(filepath))"

    # 1. Basic file validation
    if !validate_file_basics(filepath, tolerance)
        @error "Basic file validation failed"
        return false
    end

    # 2. ZIP structure validation
    if !validate_zip_structure(filepath)
        @error "ZIP structure validation failed"
        return false
    end

    # 3. Feature count validation
    if !validate_feature_count(filepath, tolerance)
        @error "Feature count validation failed"
        return false
    end

    # 4. Filename pattern validation (skipped for file types without a known pattern)
    pattern = get(CENSUS_FILENAME_PATTERNS, file_type, nothing)
    if !isnothing(pattern) && !validate_census_filename(filepath, pattern)
        @error "Filename validation failed"
        return false
    end

    @info "✓ File validation passed for $file_type"
    return true
end

# Define tolerance parameters for different geography types
# Updated based on actual 2024 Census data observations
const TOLERANCE_PARAMS = Dict{String, ToleranceParams}(
    # National geographies
    "state" => ToleranceParams(5.0, 50.0, 45, 65, nothing),  # 56 in 2024
    "county" => ToleranceParams(75.0, 150.0, 2900, 3500, nothing),  # 3235 in 2024
    "cbsa" => ToleranceParams(25.0, 60.0, 800, 1100, nothing),  # 935 in 2024
    "urbanarea" => ToleranceParams(60.0, 100.0, 2200, 3200, nothing),  # 2644 in 2024
    "zipcode" => ToleranceParams(400.0, 700.0, 28000, 38000, nothing),  # ~33k ZIP codes
    "metrodivision" => ToleranceParams(0.5, 5.0, 25, 50, nothing),  # 37 in 2024
    "rails" => ToleranceParams(25.0, 150.0, nothing, nothing, nothing),  # Rails was ~32MB
    "primaryroads" => ToleranceParams(25.0, 150.0, nothing, nothing, nothing),  # Primary roads was ~43MB

    # State geographies (examples for typical states)
    "cousub" => ToleranceParams(5.0, 50.0, 50, 2000, nothing),  # Varies widely by state
    "tract" => ToleranceParams(5.0, 100.0, 200, 5000, nothing),  # MN tract was ~7.15MB
    "place" => ToleranceParams(2.0, 50.0, 100, 3000, nothing),  # MN place was ~3.96MB
    "consolidatedcity" => ToleranceParams(0.01, 10.0, 0, 50, nothing),  # KS was ~0.02MB
    "primarysecondaryroads" => ToleranceParams(2.0, 200.0, nothing, nothing, nothing),  # MN was ~3.9MB

    # County geographies (examples for typical counties)
    "areawater" => ToleranceParams(0.1, 50.0, 0, 5000, nothing),  # Varies by geography
    "linearwater" => ToleranceParams(0.01, 100.0, 0, 10000, nothing),  # Varies widely by county
    "road" => ToleranceParams(0.1, 200.0, 100, 50000, nothing),  # Varies widely by county
)