fusedio · fused-io · Jan 7, 2025 · Jan 7, 2025
diff --git a/...NUCOPIA-Preparing_Abundant_Model_Datasets/CORNUCOPIA-Preparing_Abundant_Model_Datasets.py b/...NUCOPIA-Preparing_Abundant_Model_Datasets/CORNUCOPIA-Preparing_Abundant_Model_Datasets.py
@@ -0,0 +1,153 @@
+@fused.udf
+def udf():
+    import geopandas as gpd
+    from shapely import wkt
+    from shapely.wkt import loads
+    import pandas as pd
+    import duckdb
+    from shapely.wkt import loads
+    conn = duckdb.connect()
+    import numpy as np
+    import matplotlib.pyplot as plt
+    from sklearn.linear_model import LinearRegression
+    from sklearn.metrics import mean_squared_error
+
+    @fused.cache
+    def load_sif_data():
+        conn.sql("INSTALL spatial; LOAD spatial; INSTALL httpfs; LOAD httpfs;")
+        table = 's3://fused-asset/misc/plinio/sif_output/county=*/year=*/month=*/*.parquet'
+        df = conn.sql(f"""SELECT * FROM read_parquet('{table}',hive_partitioning = true)""").df()
+        df['geometry'] = df['geometry'].apply(loads)
+        gdf_sif = gpd.GeoDataFrame(df)
+        return gdf_sif
+
+    gdf_sif = load_sif_data()
+
+    # df_metric = conn.sql(f"""SELECT year, count(*) FROM read_parquet('{table}',hive_partitioning = true) GROUP BY year ORDER BY year LIMIT 10""").df()
+    # print(df_metric)
+
+
+    @fused.cache
+    def load_target_data():
+        # Bring in target data
+        years = ["2015", "2016", "2017", "2018", "2019", "2020"]
+        dfs = []
+        # path = 's3://soldatanasasifglobalifoco2modis1863/USDA/data/Actual_yields/USDA_crop_yields_2018.csv'
+        # Fetch for each year
+        for year in years:
+            path = f's3://soldatanasasifglobalifoco2modis1863/USDA/data/Actual_yields/USDA_crop_yields_{year}.csv'
+            df = pd.read_csv(path)
+
+            # Select only the relevant columns
+            cols = [
+                'Value',  # Bushels / acre
+                'Year',
+                'County ANSI',
+            ]
+            # df = df[cols]
+
+            dfs.append(df)
+
+        # Concatenate all DataFrames into a single DataFrame
+        df_actuals = pd.concat(dfs, ignore_index=True)
+        print(df_actuals)
+        return df_actuals
+
+    df_actuals = load_target_data()
+    df_actuals['GEOID'] = df_actuals['State ANSI'].astype(str).str.zfill(2) + df_actuals['County ANSI'].apply(lambda x: str(int(x)).zfill(3) if not pd.isna(x) else "")
+    df_actuals['year'] = df_actuals['Year'] 
+
+
+    # gdf_counties = gpd.read_parquet('s3://fused-asset/data/tiger/county/tl_rd22_us_county small.parquet')
+    result = gdf_sif.merge(df_actuals, on=['year', 'GEOID'], how='left')
+    result = gpd.GeoDataFrame(result)
+    result.crs = "EPSG:4326"
+
+    # TODO: there's some NaN rows, why?
+    result['county_area_m2']=result.to_crs(result.estimate_utm_crs()).area
+    # _result = result.drop(columns='geometry')
+
+    # Convert geometry so it plays well with DuckDB
+    result['geometry'] = result['geometry'].apply(lambda geom: geom.wkt)
+
+
+    out = conn.sql("""
+        SELECT 
+            ROUND(corn_sif_sum/county_sif_sum, 2) AS m_pct, 
+            corn_sif_mean,
+            corn_sif_sum,
+            county_area_m2,
+            (county_area_m2*m_pct) as area_corn_m2,
+            (area_corn_m2*.00024711) as area_corn_acres,
+            (county_area_m2*00024711) as area_county_acres,
+            (area_corn_acres*Value) as bushels_sum_actual,
+            year,
+            month,
+            county,
+            GEOID,
+            Value as bushels_per_acre_actual,
+            period,
+            STATE,
+            geometry
+
+        FROM result 
+        WHERE GEOID = '19119'
+        LIMIT 1000
+
+    """).df()
+
+    # Convert geometry back to Python format
+    out['geometry'] = out['geometry'].apply(loads)
+    # print(out)
+
+
+    #out table
+    out = gpd.GeoDataFrame(out)
+    out['date'] = out['month'].astype(str) + out['period'].astype(str)
+    print(out.drop(columns='geometry').T)   
+    # print(out)
+    year_totals = out.groupby(['GEOID', 'year']).agg({
+        'bushels_per_acre_actual': 'mean',  # Adjust aggregation logic as needed
+        'm_pct': 'mean',
+        'bushels_sum_actual': 'mean',       # Example aggregation
+        'geometry': 'first',              # Choose how to handle geometry
+        'area_county_acres':'mean',
+        'area_corn_acres':'mean'
+    }).reset_index()
+    year_totals = year_totals.drop_duplicates(subset=['GEOID', 'year'])
+    print(year_totals.drop(columns='geometry').T)
+
+    #Pivot
+    pivot = out.pivot_table(
+        index=['GEOID','year'], 
+        columns=['date'], 
+        values=['corn_sif_mean'], 
+        aggfunc='first')
+    pivot.columns = ['_'.join(filter(None, col)) if isinstance(col, tuple) else col for col in pivot.columns]
+    pivot = pivot.rename(columns={'GEOID_': 'GEOID', 'year_': 'year', 'bushels_per_acre_': 'bushels_per_acre'})
+    # print(pivot.columns)
+    # print(out.columns)
+    print(pivot)
+
+    #Make a table that is only year data
+    out_reduced = out[['GEOID', 'year', 'bushels_per_acre_actual','area_corn_acres', 'area_county_acres','geometry','m_pct','bushels_sum_actual']]
+    # print(out_reduced)
+    # print(out_reduced.drop(columns='geometry').T)
+    # print(pivot.dtypes)
+    # print(year_totals.dtypes)
+
+
+    # Merge with all the corn sif mean totals
+    result = pivot.merge(year_totals, on=['GEOID', 'year'], how='left')
+
+    # Ensure the result is a GeoDataFrame
+    result = gpd.GeoDataFrame(result, geometry='geometry')
+    # print(result.info())
+    # result.rename(columns={'Value': 'bushelsperacre'}, inplace=True)
+    print(result)
+    print(result.head().drop(columns='geometry').T)
+    # print(result.head().drop(columns='geometry'))
+    # print(result)
+
+    return result
+
diff --git a/community/kristinscholten/CORNUCOPIA-Preparing_Abundant_Model_Datasets/README.MD b/community/kristinscholten/CORNUCOPIA-Preparing_Abundant_Model_Datasets/README.MD
@@ -0,0 +1,10 @@
+<!--fused:readme-->
+## Overview
+
+Integrates Solar-Induced Fluorescence (SIF) data (2015–2020) with USDA crop yields to create machine learning training datasets. Calculates crop-specific SIF ratios, areas (measured in km and acres), and bushels using satellite and yield data processed with DuckDB. Outputs a GeoDataFrame for spatial and temporal trend analysis, supporting agricultural planning and yield prediction.
+
+## External links
+
+- [Solar Induced Fluorescence UDF](https://example.com)
+- [Crop Mask Zonal Statistics UDF](https://example.com)
+- [Data Dictionary for SIF](https://daac.ornl.gov/SIF-ESDR/guides/Global_SIF_OCO2_MODIS.html)
diff --git a/community/kristinscholten/CORNUCOPIA-Preparing_Abundant_Model_Datasets/meta.json b/community/kristinscholten/CORNUCOPIA-Preparing_Abundant_Model_Datasets/meta.json
@@ -0,0 +1,117 @@
+{
+  "version": "0.0.3",
+  "job_config": {
+    "version": "0.0.3",
+    "name": null,
+    "steps": [
+      {
+        "type": "udf",
+        "udf": {
+          "type": "geopandas_v2",
+          "name": "CORNUCOPIA-Preparing_Abundant_Model_Datasets",
+          "entrypoint": "udf",
+          "parameters": {},
+          "metadata": {
+            "fused:vizConfig": {
+              "tileLayer": {
+                "@@type": "TileLayer",
+                "minZoom": 0,
+                "maxZoom": 19,
+                "tileSize": 256,
+                "pickable": true
+              },
+              "rasterLayer": {
+                "@@type": "BitmapLayer",
+                "pickable": true
+              },
+              "vectorLayer": {
+                "@@type": "GeoJsonLayer",
+                "stroked": true,
+                "filled": true,
+                "pickable": true,
+                "opacity": 0.01,
+                "lineWidthMinPixels": 1,
+                "pointRadiusMinPixels": 1,
+                "getFillColor": {
+                  "@@function": "colorContinuous",
+                  "attr": "corn_sif_mean",
+                  "domain": [
+                    0,
+                    1
+                  ],
+                  "colors": "OrYel",
+                  "nullColor": [
+                    184,
+                    184,
+                    184
+                  ]
+                }
+              }
+            },
+            "fused:udfType": "auto",
+            "fused:slug": "CORNUCOPIA-Preparing_Abundant_Model_Datasets",
+            "fused:name": "CORNUCOPIA-Preparing_Abundant_Model_Datasets",
+            "fused:id": null,
+            "fused:description": "## Overview\n\nIntegrates Solar-Induced Fluorescence (SIF) data (2015–2020) with USDA crop yields to create machine learning training datasets. Calculates crop-specific SIF ratios, areas (measured in km and acres), and bushels using satellite and yield data processed with DuckDB. Outputs a GeoDataFrame for spatial and temporal trend analysis, supporting agricultural planning and yield prediction.\n\n## External links\n\n- [Solar Induced Fluorescence UDF](https://example.com)\n- [Crop Mask Zonal Statistics UDF](https://example.com)\n- [Data Dictionary for SIF](https://daac.ornl.gov/SIF-ESDR/guides/Global_SIF_OCO2_MODIS.html)",
+            "fused:gitRepo": "fusedio/udfs",
+            "fused:gitRef": "1aa6c936b8239f7980c342e1d67b4a7c1ed02bdb",
+            "fused:gitUrl": "https://github.com/fusedio/udfs/tree/1aa6c936b8239f7980c342e1d67b4a7c1ed02bdb/community/kristinscholten/CORNUCOPIA-Preparing_Abundant_Model_Datasets/",
+            "fused:gitShortUrl": "https://github.com/fusedio/udfs/tree/1aa6c93/community/kristinscholten/CORNUCOPIA-Preparing_Abundant_Model_Datasets/",
+            "fused:gitPath": "community/kristinscholten/CORNUCOPIA-Preparing_Abundant_Model_Datasets",
+            "fused:gitLastModified": "2025-01-07T16:47:12.157Z",
+            "fused:gitHistory": [
+              {
+                "fused:vizConfig": {
+                  "tileLayer": {
+                    "@@type": "TileLayer",
+                    "minZoom": 0,
+                    "maxZoom": 19,
+                    "tileSize": 256,
+                    "pickable": true
+                  },
+                  "rasterLayer": {
+                    "@@type": "BitmapLayer",
+                    "pickable": true
+                  },
+                  "vectorLayer": {
+                    "@@type": "GeoJsonLayer",
+                    "stroked": true,
+                    "filled": true,
+                    "pickable": true,
+                    "opacity": 0.01,
+                    "lineWidthMinPixels": 1,
+                    "pointRadiusMinPixels": 1,
+                    "getFillColor": {
+                      "@@function": "colorContinuous",
+                      "attr": "corn_sif_mean",
+                      "domain": [
+                        0,
+                        1
+                      ],
+                      "colors": "OrYel",
+                      "nullColor": [
+                        184,
+                        184,
+                        184
+                      ]
+                    }
+                  }
+                },
+                "fused:udfType": "auto",
+                "fused:slug": "Kristin_Query_Sif_Tables",
+                "fused:name": "Kristin_Query_Sif_Tables",
+                "fused:id": null,
+                "fused:description": "## Overview\n\nIntegrates Solar-Induced Fluorescence (SIF) data (2015–2020) with USDA crop yields to create machine learning training datasets. Calculates crop-specific SIF ratios, areas (measured in km and acres), and bushels using satellite and yield data processed with DuckDB. Outputs a GeoDataFrame for spatial and temporal trend analysis, supporting agricultural planning and yield prediction.\n\n## External links\n\n- [Solar Induced Fluorescence UDF](https://example.com)\n- [Crop Mask Zonal Statistics UDF](https://example.com)\n- [Data Dictionary for SIF](https://daac.ornl.gov/SIF-ESDR/guides/Global_SIF_OCO2_MODIS.html)"
+              }
+            ],
+            "fused:gitPullRequestBranch": "[email protected]",
+            "fused:gitPullRequestLink": "https://github.com/fusedio/udfs/pull/560"
+          },
+          "source": "CORNUCOPIA-Preparing_Abundant_Model_Datasets.py",
+          "headers": []
+        }
+      }
+    ],
+    "metadata": null
+  }
+}