iiasa · gidden · Oct 5, 2023 · Oct 5, 2023
diff --git a/tests/test_data/all_touched.nc b/tests/test_data/all_touched.nc
diff --git a/tests/test_data/generate_v2_exp_values.ipynb b/tests/test_data/generate_v2_exp_values.ipynb
@@ -0,0 +1,313 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "id": "f87510f9-cfd8-4c66-b36f-8c5785efe746",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from aneris.harmonize import Harmonizer\n",
+    "import pandas as pd\n",
+    "from pandas_indexing import concat, isin, ismatch, semijoin\n",
+    "import pandas_indexing.accessors\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 33,
+   "id": "a0906247-5d3a-4667-80ba-abd8c30a0cfb",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "scen = pd.read_excel('./test_v2_data.xlsx', sheet_name='unharmonized', index_col=list(range(5)))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 34,
+   "id": "6e2a30f8-174a-4b04-8abc-57390e366d66",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "hist = pd.read_excel('./test_v2_data.xlsx', sheet_name='historical', index_col=list(range(5))).rename_axis(index={\"region\": \"country\"})"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 35,
+   "id": "3bab5282-a2a9-4101-986e-865f99148ba4",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th>2015</th>\n",
+       "      <th>2020</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>model</th>\n",
+       "      <th>scenario</th>\n",
+       "      <th>variable</th>\n",
+       "      <th>unit</th>\n",
+       "      <th>country</th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th rowspan=\"6\" valign=\"top\">hm</th>\n",
+       "      <th rowspan=\"6\" valign=\"top\">hs</th>\n",
+       "      <th rowspan=\"3\" valign=\"top\">v1</th>\n",
+       "      <th rowspan=\"3\" valign=\"top\">u</th>\n",
+       "      <th>CAN</th>\n",
+       "      <td>1</td>\n",
+       "      <td>2</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>USA</th>\n",
+       "      <td>2</td>\n",
+       "      <td>3</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>MEX</th>\n",
+       "      <td>3</td>\n",
+       "      <td>3</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th rowspan=\"3\" valign=\"top\">v2</th>\n",
+       "      <th rowspan=\"3\" valign=\"top\">u</th>\n",
+       "      <th>CAN</th>\n",
+       "      <td>1</td>\n",
+       "      <td>2</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>USA</th>\n",
+       "      <td>2</td>\n",
+       "      <td>3</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>MEX</th>\n",
+       "      <td>3</td>\n",
+       "      <td>3</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                      2015  2020\n",
+       "model scenario variable unit country            \n",
+       "hm    hs       v1       u    CAN         1     2\n",
+       "                             USA         2     3\n",
+       "                             MEX         3     3\n",
+       "               v2       u    CAN         1     2\n",
+       "                             USA         2     3\n",
+       "                             MEX         3     3"
+      ]
+     },
+     "execution_count": 35,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "hist"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 36,
+   "id": "effcef9f-f71e-4be1-92d0-d710634b846c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "regionmapping = pd.read_excel('./test_v2_data.xlsx', sheet_name='regiondef')\n",
+    "regionmap = pd.MultiIndex.from_frame(regionmapping)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 41,
+   "id": "16cb3c12-128a-4e22-a9ab-cf4763594921",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "hist_agg = hist.pix.semijoin(regionmap)\n",
+    "hist_agg = hist_agg.groupby(level=hist_agg.index.names.difference([\"country\"])).sum()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 42,
+   "id": "bb1bf7a8-add1-413f-a61d-3f7183ea84c8",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th>2015</th>\n",
+       "      <th>2020</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>model</th>\n",
+       "      <th>scenario</th>\n",
+       "      <th>variable</th>\n",
+       "      <th>unit</th>\n",
+       "      <th>region</th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th rowspan=\"4\" valign=\"top\">hm</th>\n",
+       "      <th rowspan=\"4\" valign=\"top\">hs</th>\n",
+       "      <th rowspan=\"2\" valign=\"top\">v1</th>\n",
+       "      <th rowspan=\"2\" valign=\"top\">u</th>\n",
+       "      <th>MEX</th>\n",
+       "      <td>3</td>\n",
+       "      <td>3</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>NAM</th>\n",
+       "      <td>3</td>\n",
+       "      <td>5</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th rowspan=\"2\" valign=\"top\">v2</th>\n",
+       "      <th rowspan=\"2\" valign=\"top\">u</th>\n",
+       "      <th>MEX</th>\n",
+       "      <td>3</td>\n",
+       "      <td>3</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>NAM</th>\n",
+       "      <td>3</td>\n",
+       "      <td>5</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                     2015  2020\n",
+       "model scenario variable unit region            \n",
+       "hm    hs       v1       u    MEX        3     3\n",
+       "                             NAM        3     5\n",
+       "               v2       u    MEX        3     3\n",
+       "                             NAM        3     5"
+      ]
+     },
+     "execution_count": 42,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "hist_agg"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 43,
+   "id": "ea55eee6-828d-453d-b142-a9e2f975e88c",
+   "metadata": {},
+   "outputs": [
+    {
+     "ename": "ValueError",
+     "evalue": "Data to harmonize exceeds historical data avaiablility:\nMultiIndex([('m', 's', 'v1', 'u', 'MEX'),\n            ('m', 's', 'v1', 'u', 'NAM'),\n            ('m', 's', 'v2', 'u', 'MEX'),\n            ('m', 's', 'v2', 'u', 'NAM')],\n           names=['model', 'scenario', 'variable', 'unit', 'region'])",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[1;31mValueError\u001b[0m                                Traceback (most recent call last)",
+      "Cell \u001b[1;32mIn[43], line 1\u001b[0m\n\u001b[1;32m----> 1\u001b[0m h \u001b[38;5;241m=\u001b[39m \u001b[43mHarmonizer\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m      2\u001b[0m \u001b[43m    \u001b[49m\u001b[43mscen\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m      3\u001b[0m \u001b[43m    \u001b[49m\u001b[43mhist_agg\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m      4\u001b[0m \u001b[43m    \u001b[49m\u001b[43mharm_idx\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mscen\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mindex\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mnames\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m      5\u001b[0m \u001b[43m)\u001b[49m\n",
+      "File \u001b[1;32m~\\OneDrive - IIASA\\work\\iiasa\\aneris_new\\src\\aneris\\harmonize.py:134\u001b[0m, in \u001b[0;36mHarmonizer.__init__\u001b[1;34m(self, data, history, config, harm_idx, method_choice)\u001b[0m\n\u001b[0;32m    132\u001b[0m hist_check \u001b[38;5;241m=\u001b[39m projectlevel(history\u001b[38;5;241m.\u001b[39mindex, harm_idx)\n\u001b[0;32m    133\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m data_check\u001b[38;5;241m.\u001b[39mdifference(hist_check)\u001b[38;5;241m.\u001b[39mempty:\n\u001b[1;32m--> 134\u001b[0m     \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[0;32m    135\u001b[0m         \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mData to harmonize exceeds historical data avaiablility:\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m    136\u001b[0m         \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mdata_check\u001b[38;5;241m.\u001b[39mdifference(hist_check)\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m    137\u001b[0m     )\n\u001b[0;32m    139\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mcheck_idx\u001b[39m(df, label):\n\u001b[0;32m    140\u001b[0m     final_idx \u001b[38;5;241m=\u001b[39m harm_idx \u001b[38;5;241m+\u001b[39m [\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124munit\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n",
+      "\u001b[1;31mValueError\u001b[0m: Data to harmonize exceeds historical data avaiablility:\nMultiIndex([('m', 's', 'v1', 'u', 'MEX'),\n            ('m', 's', 'v1', 'u', 'NAM'),\n            ('m', 's', 'v2', 'u', 'MEX'),\n            ('m', 's', 'v2', 'u', 'NAM')],\n           names=['model', 'scenario', 'variable', 'unit', 'region'])"
+     ]
+    }
+   ],
+   "source": [
+    "h = Harmonizer(\n",
+    "    scen,\n",
+    "    hist_agg,\n",
+    "    harm_idx=scen.index.names,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9b9be3df-8e0a-489c-a67a-d65728045358",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.4"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/tests/test_data/test_v2_data.xlsx b/tests/test_data/test_v2_data.xlsx