import os
import glob
import itertools
import time
import numpy as np
import xarray as xr
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point
from matplotlib.path import Path


def _get_first_file_lat_lon(base_dir, sectors, years, months, days_of_week):
    """
    Finds the first valid NetCDF file in the combinations list and extracts the 
    2D latitude and longitude arrays. This is used to build the spatial masks once.
    """
    combinations = list(itertools.product(sectors, years, months, days_of_week))
    for sector, year, month, day in combinations:
        folder_path = os.path.join(base_dir, f"GRA2PESv2.0beta_{sector}", f"{year}{month}", day)
        file_pattern = os.path.join(folder_path, f"GRA2PESv2.0beta_{sector}_{year}{month}_{day}_*.nc")
        files = glob.glob(file_pattern)
        if files:
            # Found our first file, extract lat/lon
            with xr.open_dataset(files[0]) as ds:
                lat = ds['lat'].values
                lon = ds['lon'].values
            return lat, lon
            
    raise FileNotFoundError("Could not find any NetCDF files matching the provided combinations.")


def _summarize_with_masks(
    base_dir: str, 
    sectors: list, 
    years: list, 
    months: list, 
    days_of_week: list, 
    variables: list, 
    output_csv: str,
    masks_dict: dict,
    mask_col_name: str,
    grid_area_km2: float
):
    """
    Core processing loop. Iterates through the files, sums time/level dimensions into 
    a 2D array, and applies precalculated boolean numpy masks.
    
    Args:
        masks_dict: A dictionary mapping a region name to a 2D boolean numpy array mask.
                    e.g., {'Colorado': np.array([[True, ...], ...]), 'Utah': ...}
        mask_col_name: Name of the column to use in the output CSV for the regions.
    """
    results = []
    combinations = list(itertools.product(sectors, years, months, days_of_week))
    total_iters = len(combinations)
    
    print(f"Starting processing for {total_iters} combinations over {len(masks_dict)} masked areas...")
    start_time = time.time()
    
    for idx, (sector, year, month, day) in enumerate(combinations, 1):
        folder_path = os.path.join(base_dir, f"GRA2PESv2.0beta_{sector}", f"{year}{month}", day)
        file_pattern = os.path.join(folder_path, f"GRA2PESv2.0beta_{sector}_{year}{month}_{day}_*.nc")
        files = glob.glob(file_pattern)
        
        if not files:
            print(f"[{idx}/{total_iters}] Missing files for: {sector} | {year}-{month} | {day} - Skipping.")
            continue
            
        print(f"[{idx}/{total_iters}] Processing {sector} | {year}-{month} | {day} ({len(files)} files found)")
        
        try:
            with xr.open_mfdataset(
                files, 
                combine='by_coords',
                cache=False, 
                parallel=True,
                engine='netcdf4',
                chunks={'time': 12} 
            ) as ds:
                ds_sub = ds[variables]
                
                # Determine which dimensions exist to sum over
                dims_to_sum = [d for d in ['time', 'level'] if d in ds_sub.dims]
                
                # Compute the sum over time and level first. 
                # This brings the heavily reduced 2D spatial grid into memory.
                ds_reduced = ds_sub.sum(dim=dims_to_sum).compute()
                
                # Multiply by area
                ds_reduced = ds_reduced * grid_area_km2
                
                # For each masked region, apply the 2D boolean mask and sum
                for region_name, mask_2d in masks_dict.items():
                    row = {
                        'Sector': sector,
                        'Year': year,
                        'Month': month,
                        'DayOfWeek': day,
                        mask_col_name: region_name
                    }
                    
                    for var in variables:
                        # Extract the 2D values as a numpy array, subset by the boolean mask, and sum
                        var_values_2d = ds_reduced[var].values
                        row[var] = float(np.nansum(var_values_2d[mask_2d]))
                        
                    results.append(row)
                
        except Exception as e:
            print(f"Error processing {sector}_{year}{month}_{day}: {e}")

    df = pd.DataFrame(results)
    df.to_csv(output_csv, index=False)
    
    elapsed = time.time() - start_time
    print(f"\nProcessing complete in {elapsed:.2f} seconds! Results saved to {output_csv}")


# =====================================================================
# METHOD 1: SUMMARIZE BY SHAPEFILE
# =====================================================================
def summarize_by_shapefile(
    base_dir: str, sectors: list, years: list, months: list, days_of_week: list, 
    variables: list, output_csv: str, shapefile_path: str, shapefile_column: str = None, 
    grid_area_km2: float = 16.0
):
    """Summarizes netcdf emissions by geometries defined in a Shapefile."""
    print("Reading first file to build Shapefile spatial mask...")
    lat, lon = _get_first_file_lat_lon(base_dir, sectors, years, months, days_of_week)
    
    # Load shapefile and convert to standard lat/lon
    gdf = gpd.read_file(shapefile_path).to_crs(epsg=4326)
    
    # Flatten grid coordinates into a GeoDataFrame of Points
    lon_flat = lon.flatten()
    lat_flat = lat.flatten()
    df_pts = pd.DataFrame({'lon': lon_flat, 'lat': lat_flat, 'idx': np.arange(len(lon_flat))})
    gdf_pts = gpd.GeoDataFrame(df_pts, geometry=gpd.points_from_xy(df_pts.lon, df_pts.lat), crs="EPSG:4326")
    
    masks_dict = {}
    
    if shapefile_column and shapefile_column in gdf.columns:
        # Spatial join points to polygons to categorize by column value
        print(f"Mapping grid points to Shapefile geometries by column '{shapefile_column}'...")
        joined = gpd.sjoin(gdf_pts, gdf, how="inner", predicate="within")
        
        for val, group in joined.groupby(shapefile_column):
            mask_flat = np.zeros(len(lon_flat), dtype=bool)
            mask_flat[group['idx'].values] = True
            masks_dict[val] = mask_flat.reshape(lat.shape)
    else:
        # Merge all polygons into one giant boundary
        print("Mapping grid points to merged Shapefile boundary...")
        merged_geom = gdf.geometry.unary_union
        gdf_single = gpd.GeoDataFrame(geometry=[merged_geom], crs="EPSG:4326")
        
        joined = gpd.sjoin(gdf_pts, gdf_single, how="inner", predicate="within")
        mask_flat = np.zeros(len(lon_flat), dtype=bool)
        mask_flat[joined['idx'].values] = True
        masks_dict['Merged_Shapefile_Area'] = mask_flat.reshape(lat.shape)
        shapefile_column = "Area"
        
    _summarize_with_masks(
        base_dir, sectors, years, months, days_of_week, variables, output_csv, 
        masks_dict, shapefile_column, grid_area_km2
    )


# =====================================================================
# METHOD 2: SUMMARIZE BY POLYGON
# =====================================================================
def summarize_by_polygon(
    base_dir: str, sectors: list, years: list, months: list, days_of_week: list, 
    variables: list, output_csv: str, polygons_dict: dict, grid_area_km2: float = 16.0
):
    """
    Summarizes netcdf emissions by multiple polygons.
    polygons_dict: Dictionary mapping polygon names to lists of (lat, lon) tuples.
                   e.g., {'Region1': [(35, -100), (35, -90), (45, -90), (45, -100)]}
    """
    print(f"Reading first file to build spatial masks for {len(polygons_dict)} polygons...")
    lat, lon = _get_first_file_lat_lon(base_dir, sectors, years, months, days_of_week)
    
    points = np.column_stack((lon.flatten(), lat.flatten()))
    masks_dict = {}

    for poly_name, polygon_coords in polygons_dict.items():
        # Path uses (x, y) which maps to (lon, lat)
        path_coords = [(pt_lon, pt_lat) for pt_lat, pt_lon in polygon_coords]
        poly_path = Path(path_coords)
        
        # Fast evaluation of flattened coordinates
        mask_flat = poly_path.contains_points(points)
        masks_dict[poly_name] = mask_flat.reshape(lat.shape)
    
    _summarize_with_masks(
        base_dir, sectors, years, months, days_of_week, variables, output_csv, 
        masks_dict, "Area", grid_area_km2
    )


# =====================================================================
# METHOD 3: SUMMARIZE BY BOUNDING BOX
# =====================================================================
def summarize_by_bounding_box(
    base_dir: str, sectors: list, years: list, months: list, days_of_week: list, 
    variables: list, output_csv: str, bboxes_dict: dict, grid_area_km2: float = 16.0
):
    """
    Summarizes netcdf emissions within multiple bounding boxes. 
    bboxes_dict: Dictionary mapping bbox names to dictionaries containing lat/lon bounds.
                 e.g., {'ZoneA': {'lat_bounds': [35, 45], 'lon_bounds': [-115, -95]}}
    """
    print(f"Reading first file to build spatial masks for {len(bboxes_dict)} bounding boxes...")
    lat, lon = _get_first_file_lat_lon(base_dir, sectors, years, months, days_of_week)
    
    masks_dict = {}

    for bbox_name, bounds in bboxes_dict.items():
        lat_bounds = bounds['lat_bounds']
        lon_bounds = bounds['lon_bounds']

        lat_min, lat_max = sorted(lat_bounds)
        lon_start, lon_end = lon_bounds[0], lon_bounds[1]
        
        lat_mask = (lat >= lat_min) & (lat <= lat_max)
        
        # Check for longitude wraparound across the antimeridian
        if lon_start > lon_end:
            lon_mask = (lon >= lon_start) | (lon <= lon_end)
        else:
            lon_mask = (lon >= lon_start) & (lon <= lon_end)
            
        mask_2d = lat_mask & lon_mask
        masks_dict[bbox_name] = mask_2d
    
    _summarize_with_masks(
        base_dir, sectors, years, months, days_of_week, variables, output_csv, 
        masks_dict, "Area", grid_area_km2
    )


# =====================================================================
# EXAMPLES
# =====================================================================
if __name__ == "__main__":
    # ==========================================
    # USER CONFIGURATION
    # ==========================================
    
    # Base directory path
    BASE_DIR = "/wrk/users/charkins/emissions/GRA2PESv2.0_data/GRA2PESv2.0beta"
    
    # Iteration arrays
    SECTORS = ["total", "WASTE"] 
    YEARS = ["2021"]
    MONTHS = ["01", "02"]
    DAYS_OF_WEEK = ["weekdy", "satdy","sundy"]
    
    # List of variables you want to sum
    VARIABLES_TO_SUM = ["CO2", "ffCO2", "NOX", "VOC"]
    GRID_AREA_KM2 = 16.0
    
    # ==========================================
    # EXECUTION EXAMPLES
    # ==========================================
    
    print("Example 1: Summarize by Bounding Boxes")
    my_bboxes = {
        "WesternUS": {"lat_bounds": [30.0, 49.0], "lon_bounds": [-125.0, -100.0]},
        "EasternUS": {"lat_bounds": [25.0, 48.0], "lon_bounds": [-90.0, -65.0]}
    }
    summarize_by_bounding_box(
        base_dir=BASE_DIR, sectors=SECTORS, years=YEARS, months=MONTHS, 
        days_of_week=DAYS_OF_WEEK, variables=VARIABLES_TO_SUM, 
        output_csv="GRA2PES_bbox_summary.csv",
        bboxes_dict=my_bboxes,
        grid_area_km2=GRID_AREA_KM2
    )

    print("\nExample 2: Summarize by Polygons")
    my_polygons = {
        "ColoradoBox": [
            (37.0, -109.0), 
            (41.0, -109.0), 
            (41.0, -102.0), 
            (37.0, -102.0)
        ],
        "WyomingBox": [
            (41.0, -111.0),
            (45.0, -111.0),
            (45.0, -104.0),
            (41.0, -104.0)
        ]
    }
    summarize_by_polygon(
        base_dir=BASE_DIR, sectors=SECTORS, years=YEARS, months=MONTHS, 
        days_of_week=DAYS_OF_WEEK, variables=VARIABLES_TO_SUM, 
        output_csv="GRA2PES_polygon_summary.csv",
        polygons_dict=my_polygons,
        grid_area_km2=GRID_AREA_KM2
    )

    print("\nExample 3: Summarize by Shapefile")
    summarize_by_shapefile(
        base_dir=BASE_DIR, sectors=SECTORS, years=YEARS, months=MONTHS, 
        days_of_week=DAYS_OF_WEEK, variables=VARIABLES_TO_SUM, 
        output_csv="GRA2PES_shapefile_summary.csv",
        shapefile_path="/wrk/charkins/Domains/US_states_shp/tl_2025_us_state.shp", 
        shapefile_column="NAME", # Providing this summarizes by each state individually
        grid_area_km2=GRID_AREA_KM2
    )