import os import glob import itertools import time import numpy as np import xarray as xr import pandas as pd import geopandas as gpd from shapely.geometry import Point from matplotlib.path import Path def _get_first_file_lat_lon(base_dir, sectors, years, months, days_of_week): """ Finds the first valid NetCDF file in the combinations list and extracts the 2D latitude and longitude arrays. This is used to build the spatial masks once. """ combinations = list(itertools.product(sectors, years, months, days_of_week)) for sector, year, month, day in combinations: folder_path = os.path.join(base_dir, f"GRA2PESv2.0beta_{sector}", f"{year}{month}", day) file_pattern = os.path.join(folder_path, f"GRA2PESv2.0beta_{sector}_{year}{month}_{day}_*.nc") files = glob.glob(file_pattern) if files: # Found our first file, extract lat/lon with xr.open_dataset(files[0]) as ds: lat = ds['lat'].values lon = ds['lon'].values return lat, lon raise FileNotFoundError("Could not find any NetCDF files matching the provided combinations.") def _summarize_with_masks( base_dir: str, sectors: list, years: list, months: list, days_of_week: list, variables: list, output_csv: str, masks_dict: dict, mask_col_name: str, grid_area_km2: float ): """ Core processing loop. Iterates through the files, sums time/level dimensions into a 2D array, and applies precalculated boolean numpy masks. Args: masks_dict: A dictionary mapping a region name to a 2D boolean numpy array mask. e.g., {'Colorado': np.array([[True, ...], ...]), 'Utah': ...} mask_col_name: Name of the column to use in the output CSV for the regions. """ results = [] combinations = list(itertools.product(sectors, years, months, days_of_week)) total_iters = len(combinations) print(f"Starting processing for {total_iters} combinations over {len(masks_dict)} masked areas...") start_time = time.time() for idx, (sector, year, month, day) in enumerate(combinations, 1): folder_path = os.path.join(base_dir, f"GRA2PESv2.0beta_{sector}", f"{year}{month}", day) file_pattern = os.path.join(folder_path, f"GRA2PESv2.0beta_{sector}_{year}{month}_{day}_*.nc") files = glob.glob(file_pattern) if not files: print(f"[{idx}/{total_iters}] Missing files for: {sector} | {year}-{month} | {day} - Skipping.") continue print(f"[{idx}/{total_iters}] Processing {sector} | {year}-{month} | {day} ({len(files)} files found)") try: with xr.open_mfdataset( files, combine='by_coords', cache=False, parallel=True, engine='netcdf4', chunks={'time': 12} ) as ds: ds_sub = ds[variables] # Determine which dimensions exist to sum over dims_to_sum = [d for d in ['time', 'level'] if d in ds_sub.dims] # Compute the sum over time and level first. # This brings the heavily reduced 2D spatial grid into memory. ds_reduced = ds_sub.sum(dim=dims_to_sum).compute() # Multiply by area ds_reduced = ds_reduced * grid_area_km2 # For each masked region, apply the 2D boolean mask and sum for region_name, mask_2d in masks_dict.items(): row = { 'Sector': sector, 'Year': year, 'Month': month, 'DayOfWeek': day, mask_col_name: region_name } for var in variables: # Extract the 2D values as a numpy array, subset by the boolean mask, and sum var_values_2d = ds_reduced[var].values row[var] = float(np.nansum(var_values_2d[mask_2d])) results.append(row) except Exception as e: print(f"Error processing {sector}_{year}{month}_{day}: {e}") df = pd.DataFrame(results) df.to_csv(output_csv, index=False) elapsed = time.time() - start_time print(f"\nProcessing complete in {elapsed:.2f} seconds! Results saved to {output_csv}") # ===================================================================== # METHOD 1: SUMMARIZE BY SHAPEFILE # ===================================================================== def summarize_by_shapefile( base_dir: str, sectors: list, years: list, months: list, days_of_week: list, variables: list, output_csv: str, shapefile_path: str, shapefile_column: str = None, grid_area_km2: float = 16.0 ): """Summarizes netcdf emissions by geometries defined in a Shapefile.""" print("Reading first file to build Shapefile spatial mask...") lat, lon = _get_first_file_lat_lon(base_dir, sectors, years, months, days_of_week) # Load shapefile and convert to standard lat/lon gdf = gpd.read_file(shapefile_path).to_crs(epsg=4326) # Flatten grid coordinates into a GeoDataFrame of Points lon_flat = lon.flatten() lat_flat = lat.flatten() df_pts = pd.DataFrame({'lon': lon_flat, 'lat': lat_flat, 'idx': np.arange(len(lon_flat))}) gdf_pts = gpd.GeoDataFrame(df_pts, geometry=gpd.points_from_xy(df_pts.lon, df_pts.lat), crs="EPSG:4326") masks_dict = {} if shapefile_column and shapefile_column in gdf.columns: # Spatial join points to polygons to categorize by column value print(f"Mapping grid points to Shapefile geometries by column '{shapefile_column}'...") joined = gpd.sjoin(gdf_pts, gdf, how="inner", predicate="within") for val, group in joined.groupby(shapefile_column): mask_flat = np.zeros(len(lon_flat), dtype=bool) mask_flat[group['idx'].values] = True masks_dict[val] = mask_flat.reshape(lat.shape) else: # Merge all polygons into one giant boundary print("Mapping grid points to merged Shapefile boundary...") merged_geom = gdf.geometry.unary_union gdf_single = gpd.GeoDataFrame(geometry=[merged_geom], crs="EPSG:4326") joined = gpd.sjoin(gdf_pts, gdf_single, how="inner", predicate="within") mask_flat = np.zeros(len(lon_flat), dtype=bool) mask_flat[joined['idx'].values] = True masks_dict['Merged_Shapefile_Area'] = mask_flat.reshape(lat.shape) shapefile_column = "Area" _summarize_with_masks( base_dir, sectors, years, months, days_of_week, variables, output_csv, masks_dict, shapefile_column, grid_area_km2 ) # ===================================================================== # METHOD 2: SUMMARIZE BY POLYGON # ===================================================================== def summarize_by_polygon( base_dir: str, sectors: list, years: list, months: list, days_of_week: list, variables: list, output_csv: str, polygons_dict: dict, grid_area_km2: float = 16.0 ): """ Summarizes netcdf emissions by multiple polygons. polygons_dict: Dictionary mapping polygon names to lists of (lat, lon) tuples. e.g., {'Region1': [(35, -100), (35, -90), (45, -90), (45, -100)]} """ print(f"Reading first file to build spatial masks for {len(polygons_dict)} polygons...") lat, lon = _get_first_file_lat_lon(base_dir, sectors, years, months, days_of_week) points = np.column_stack((lon.flatten(), lat.flatten())) masks_dict = {} for poly_name, polygon_coords in polygons_dict.items(): # Path uses (x, y) which maps to (lon, lat) path_coords = [(pt_lon, pt_lat) for pt_lat, pt_lon in polygon_coords] poly_path = Path(path_coords) # Fast evaluation of flattened coordinates mask_flat = poly_path.contains_points(points) masks_dict[poly_name] = mask_flat.reshape(lat.shape) _summarize_with_masks( base_dir, sectors, years, months, days_of_week, variables, output_csv, masks_dict, "Area", grid_area_km2 ) # ===================================================================== # METHOD 3: SUMMARIZE BY BOUNDING BOX # ===================================================================== def summarize_by_bounding_box( base_dir: str, sectors: list, years: list, months: list, days_of_week: list, variables: list, output_csv: str, bboxes_dict: dict, grid_area_km2: float = 16.0 ): """ Summarizes netcdf emissions within multiple bounding boxes. bboxes_dict: Dictionary mapping bbox names to dictionaries containing lat/lon bounds. e.g., {'ZoneA': {'lat_bounds': [35, 45], 'lon_bounds': [-115, -95]}} """ print(f"Reading first file to build spatial masks for {len(bboxes_dict)} bounding boxes...") lat, lon = _get_first_file_lat_lon(base_dir, sectors, years, months, days_of_week) masks_dict = {} for bbox_name, bounds in bboxes_dict.items(): lat_bounds = bounds['lat_bounds'] lon_bounds = bounds['lon_bounds'] lat_min, lat_max = sorted(lat_bounds) lon_start, lon_end = lon_bounds[0], lon_bounds[1] lat_mask = (lat >= lat_min) & (lat <= lat_max) # Check for longitude wraparound across the antimeridian if lon_start > lon_end: lon_mask = (lon >= lon_start) | (lon <= lon_end) else: lon_mask = (lon >= lon_start) & (lon <= lon_end) mask_2d = lat_mask & lon_mask masks_dict[bbox_name] = mask_2d _summarize_with_masks( base_dir, sectors, years, months, days_of_week, variables, output_csv, masks_dict, "Area", grid_area_km2 ) # ===================================================================== # EXAMPLES # ===================================================================== if __name__ == "__main__": # ========================================== # USER CONFIGURATION # ========================================== # Base directory path BASE_DIR = "/wrk/users/charkins/emissions/GRA2PESv2.0_data/GRA2PESv2.0beta" # Iteration arrays SECTORS = ["total", "WASTE"] YEARS = ["2021"] MONTHS = ["01", "02"] DAYS_OF_WEEK = ["weekdy", "satdy","sundy"] # List of variables you want to sum VARIABLES_TO_SUM = ["CO2", "ffCO2", "NOX", "VOC"] GRID_AREA_KM2 = 16.0 # ========================================== # EXECUTION EXAMPLES # ========================================== print("Example 1: Summarize by Bounding Boxes") my_bboxes = { "WesternUS": {"lat_bounds": [30.0, 49.0], "lon_bounds": [-125.0, -100.0]}, "EasternUS": {"lat_bounds": [25.0, 48.0], "lon_bounds": [-90.0, -65.0]} } summarize_by_bounding_box( base_dir=BASE_DIR, sectors=SECTORS, years=YEARS, months=MONTHS, days_of_week=DAYS_OF_WEEK, variables=VARIABLES_TO_SUM, output_csv="GRA2PES_bbox_summary.csv", bboxes_dict=my_bboxes, grid_area_km2=GRID_AREA_KM2 ) print("\nExample 2: Summarize by Polygons") my_polygons = { "ColoradoBox": [ (37.0, -109.0), (41.0, -109.0), (41.0, -102.0), (37.0, -102.0) ], "WyomingBox": [ (41.0, -111.0), (45.0, -111.0), (45.0, -104.0), (41.0, -104.0) ] } summarize_by_polygon( base_dir=BASE_DIR, sectors=SECTORS, years=YEARS, months=MONTHS, days_of_week=DAYS_OF_WEEK, variables=VARIABLES_TO_SUM, output_csv="GRA2PES_polygon_summary.csv", polygons_dict=my_polygons, grid_area_km2=GRID_AREA_KM2 ) print("\nExample 3: Summarize by Shapefile") summarize_by_shapefile( base_dir=BASE_DIR, sectors=SECTORS, years=YEARS, months=MONTHS, days_of_week=DAYS_OF_WEEK, variables=VARIABLES_TO_SUM, output_csv="GRA2PES_shapefile_summary.csv", shapefile_path="/wrk/charkins/Domains/US_states_shp/tl_2025_us_state.shp", shapefile_column="NAME", # Providing this summarizes by each state individually grid_area_km2=GRID_AREA_KM2 )