Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
18 commits
Select commit Hold shift + click to select a range
65586a3
feat: add article data loader with validation and image handling
atharva-2001 Nov 25, 2025
ab2ee11
feat: add member data loader with comprehensive validation
atharva-2001 Nov 25, 2025
b737f2b
feat: add website data loader for config and gallery
atharva-2001 Nov 25, 2025
0361a15
feat: add article processor for categorization and link handling
atharva-2001 Nov 25, 2025
a2ef4a7
feat: add member processor for status determination and role hierarchy
atharva-2001 Nov 25, 2025
ca98e94
feat: add base page renderer with Jinja2 setup
atharva-2001 Nov 25, 2025
7548e69
feat: add simple page renderers for homepage, contact, support, join us
atharva-2001 Nov 25, 2025
181545e
feat: add member page renderers for current, alumni, and individual p…
atharva-2001 Nov 25, 2025
a8e27c8
feat: add article page renderers for research and news pages
atharva-2001 Nov 25, 2025
de26e4a
feat: add gallery page renderer with image processing
atharva-2001 Nov 25, 2025
dc0d19e
feat: add asset copier utility
atharva-2001 Nov 26, 2025
c01a14b
feat: create main orchestrator structure with stage pipeline
atharva-2001 Nov 26, 2025
ac5ee69
feat: add data loading methods to main orchestrator
atharva-2001 Nov 26, 2025
2277694
feat: add data processing methods to main orchestrator
atharva-2001 Nov 26, 2025
e828404
feat: add rendering methods and main entry point
atharva-2001 Nov 26, 2025
44de78e
feat: add simplified notebook interface for site generation
atharva-2001 Nov 26, 2025
e9d7dc5
feat: logging and base files
atharva-2001 Nov 28, 2025
3fbe67e
get rid of bare exceptions
atharva-2001 Dec 2, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
116 changes: 116 additions & 0 deletions notebooks/generate_site_simple.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Website Generation\n",
"\n",
"This notebook provides a simple interface to generate the website."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import sys\n",
"from pathlib import Path\n",
"\n",
"# Add src to path\n",
"sys.path.insert(0, str(Path.cwd().parent))\n",
"\n",
"from src.main import SiteGenerator"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Generate Full Site"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"generator = SiteGenerator(log_level=\"INFO\")\n",
"generator.run()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Debug Individual Stages\n",
"\n",
"If something fails, you can run stages individually:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Create generator\n",
"generator = SiteGenerator(log_level=\"DEBUG\")\n",
"\n",
"# Load data\n",
"generator.load_articles()\n",
"generator.load_members()\n",
"generator.load_website_data()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Process data\n",
"generator.process_article_categories()\n",
"generator.process_member_roles()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Inspect loaded data\n",
"print(f\"Articles: {len(generator.data['articles_df'])}\")\n",
"print(f\"Members: {len(generator.data['member_info_df'])}\")\n",
"print(f\"Current: {len(generator.data['current_members_df'])}\")\n",
"print(f\"Alumni: {len(generator.data['alumni_members_df'])}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# View specific data\n",
"generator.data['current_members_df'][['current_role', 'full_name']]"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"name": "python",
"version": "3.10.0"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
Empty file added src/__init__.py
Empty file.
66 changes: 66 additions & 0 deletions src/config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
from pathlib import Path
from typing import List, Dict

# --- Directory layout -------------------------------------------------------
# BASE_DIR is the root of this repository; the sibling repositories
# (group-data, research_news, kerzendorf-lab.github.io) are assumed to be
# checked out next to it -- TODO confirm this layout in the project README.
BASE_DIR = Path(__file__).parent.parent
GROUP_DATA_DIR = BASE_DIR.parent / "group-data"
TEMPLATE_DIR_PATH = BASE_DIR / "templates"
WEBSITE_DATA_PATH = GROUP_DATA_DIR / "website_data"
# Output repository the generated site is written into.
HOSTING_PATH = BASE_DIR.parent / "kerzendorf-lab.github.io"
ARTICLE_DIR_PATH = BASE_DIR.parent / "research_news" / "articles"
ARTICLE_IMAGE_DESTINATION_DIR = HOSTING_PATH / "website_files" / "images" / "article_content"
MEMBERS_DIR_PATH = GROUP_DATA_DIR / "members"
SUB_RESEARCH_PATH = HOSTING_PATH / "sub_research"
OPPORTUNITIES_PATH = WEBSITE_DATA_PATH / "content" / "opportunities.json"
ROLE_HIERARCHY_PATH = WEBSITE_DATA_PATH / "role_hierarchy.json"
GALLERY_CONTENT_SOURCE = WEBSITE_DATA_PATH / "content" / "gallery"

# --- Article tagging --------------------------------------------------------
# Tags recognized on articles (display casing).
GENERAL_TAGS: List[str] = [
    "Paper", "Poster", "Talk", "Award", "New Team Member",
    "PhD", "Conference", "Undergraduate", "Event", "Achievement"
]

# Hex color per tag, keyed by lowercase tag name. NOTE(review): keys here are
# lowercase while GENERAL_TAGS uses display casing -- presumably lookups
# lowercase the tag first; verify in the article processor.
TAG_COLORS: Dict[str, str] = {
    'paper': '#FF6B6B',
    'poster': '#4ECDC4',
    'talk': '#45B7D1',
    'award': '#96CEB4',
    'new team member': '#FFBE0B',
    'phd': '#9B5DE5',
    'conference': '#FF006E',
    'undergraduate': '#8338EC',
    'event': '#3A86FF',
    'achievement': '#FB5607',
    'astrophysics': '#2EC4B6',
    'machine learning': '#FF9F1C',
    'software': '#E71D36',
    'research': '#011627',
    'news': '#41EAD4'
}

# Metadata fields extracted from each article's info.json.
ARTICLE_METADATA_FIELDS: List[str] = [
    "article_id", "category", "date", "tags",
    "title", "cover_image", "short_description"
]

# --- Member filtering and normalization -------------------------------------
# Groups / institution used to select which members appear on the site --
# assumed to be matched against member records; confirm in the member loader.
GROUP_FILTER: List[str] = ["DTI", "TARDIS", "ICER", "kerzendorf"]
INSTITUTION_FILTER: str = "Michigan State University"

# Maps raw role titles found in member data onto the canonical roles used
# for display/hierarchy.
ROLE_MAP: Dict[str, str] = {
    "Assistant Professor": "Professor",
    "Professorial Assistant": "Undergraduate Student",
    "Visiting Researcher": "Postdoctoral Researcher"
}

# Maps a member's pursued degree onto their displayed role category.
DEGREE_MAP: Dict[str, str] = {
    "Masters": "Graduate Student",
    "PhD": "Postdoctorate",
    "Bachelors": "Undergraduate Student",
}

# JSON section key -> human-readable heading on individual member pages.
INDIVIDUAL_MEMBER_SECTION_MAP: Dict[str, str] = {
    "education": "Education",
    "experiences": "Experience",
    "projects": "Projects",
    "awards": "Awards & Recognition",
    "outreach": "Outreach Programs",
}
6 changes: 6 additions & 0 deletions src/data_loaders/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
from .base import BaseDataLoader
from .articles import ArticleLoader
from .members import MemberLoader
from .website_data import WebsiteDataLoader

__all__ = ['BaseDataLoader', 'ArticleLoader', 'MemberLoader', 'WebsiteDataLoader']
119 changes: 119 additions & 0 deletions src/data_loaders/articles.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
import pandas as pd
from pathlib import Path
from datetime import datetime, date
from typing import Dict, Any

from .base import BaseDataLoader
from src.config import ARTICLE_DIR_PATH, ARTICLE_IMAGE_DESTINATION_DIR
from src.utils.path_helpers import set_new_image_path

class ArticleLoader(BaseDataLoader):
    """Load article ``info.json`` files into a processed pandas DataFrame.

    Articles are discovered recursively under ``ARTICLE_DIR_PATH``.
    Articles not published to the "kg" platform, or dated in the future,
    are skipped. Cover/content image paths are rewritten to point into the
    hosting repository via ``set_new_image_path``.
    """

    def __init__(self, logger=None):
        super().__init__(logger)
        # Source directory of info.json files (research_news repo).
        self.article_dir = ARTICLE_DIR_PATH
        # Directory the rewritten image paths point into (hosting repo).
        self.image_dest_dir = ARTICLE_IMAGE_DESTINATION_DIR

    def load(self) -> pd.DataFrame:
        """Load all publishable articles.

        Returns:
            DataFrame with one row per loaded article; empty DataFrame if
            no article passed the platform/date filters.

        Raises:
            FileNotFoundError: if the article directory does not exist.
            ValueError: if an article is missing required fields or has an
                unparseable date (fail fast so bad data is fixed at source).
        """
        self.logger.info(f"Loading articles from {self.article_dir}")

        if not self.article_dir.exists():
            raise FileNotFoundError(
                f"Article directory not found: {self.article_dir}. "
                f"Expected at: {self.article_dir.absolute()}. "
                f"Check that research_news repo is in correct location."
            )

        article_content_list = []
        today = date.today()
        # Article dates parse to midnight datetimes, so compare against
        # midnight of today: articles dated today are included.
        today_datetime = datetime.combine(today, datetime.min.time())

        info_files = list(self.article_dir.rglob('info.json'))
        self.logger.info(f"Found {len(info_files)} article info.json files")

        for content_file_path in info_files:
            article_content = self._load_single_article(
                content_file_path,
                today_datetime
            )
            if article_content:
                article_content_list.append(article_content)

        if not article_content_list:
            self.logger.warning("No articles loaded successfully")
            return pd.DataFrame()

        df = pd.DataFrame(article_content_list)
        df = self._process_article_dataframe(df)

        self.logger.info(
            f"Successfully loaded {len(df)} articles "
            f"({len(df[df['category'] == 'News'])} news, "
            f"{len(df[df['category'] != 'News'])} research)"
        )

        return df

    def _load_single_article(
        self,
        content_file_path: Path,
        today_datetime: datetime
    ) -> Dict[str, Any] | None:
        """Load one info.json.

        Returns the article dict with image paths rewritten, or ``None``
        when the article is filtered out (not on the "kg" platform, or
        dated in the future).

        Raises:
            ValueError: on missing required fields or a malformed date.
        """
        article_content = self.load_json_file(content_file_path)

        required_fields = ['date', 'platforms', 'cover_image', 'content', 'category']
        missing_fields = [f for f in required_fields if f not in article_content]
        if missing_fields:
            raise ValueError(
                f"Article {content_file_path.parent.name} missing required fields: "
                f"{', '.join(missing_fields)}. "
                f"File: {content_file_path}"
            )

        # Parse eagerly so a malformed date is reported with file context
        # instead of a bare strptime ValueError.
        try:
            article_date = datetime.strptime(article_content["date"], "%m-%d-%Y")
        except ValueError as e:
            raise ValueError(
                f"Article {content_file_path.parent.name} has invalid date "
                f"'{article_content['date']}' (expected MM-DD-YYYY). "
                f"File: {content_file_path}"
            ) from e

        if "kg" not in article_content["platforms"]:
            self.logger.debug(
                f"Skipping {content_file_path.parent.name}: 'kg' not in platforms"
            )
            return None

        if article_date > today_datetime:
            self.logger.debug(
                f"Skipping {content_file_path.parent.name}: "
                f"future date {article_content['date']}"
            )
            return None

        image_path = Path(article_content["cover_image"])
        article_content["cover_image"] = set_new_image_path(
            content_file_path,
            image_path,
            self.image_dest_dir
        )

        # Content keys containing "img" hold image paths; rewrite them too.
        # (Assigning to existing keys while iterating items() is safe.)
        for content_key, content_value in article_content["content"].items():
            if "img" in content_key:
                new_content_value = set_new_image_path(
                    content_file_path,
                    Path(content_value),
                    self.image_dest_dir
                )
                article_content["content"][content_key] = new_content_value

        return article_content

    def _process_article_dataframe(self, df: pd.DataFrame) -> pd.DataFrame:
        """Normalize dtypes and fill defaults on the combined article frame."""
        df["date"] = pd.to_datetime(df["date"], format="%m-%d-%Y")

        # Cover-image dimensions are optional per article: create the column
        # with its default when no article supplied it (previously this
        # raised KeyError), otherwise fill missing/empty entries.
        for column, default in (
            ("cover_image_height", "330px"),
            ("cover_image_width", "520px"),
        ):
            if column not in df.columns:
                df[column] = default
            else:
                df[column] = df[column].fillna(default).replace("", default)

        # Legacy category name still present in older data files.
        df["category"] = df["category"].replace("Overview", "Computational Metascience")

        df['image_name'] = df['cover_image'].apply(lambda x: Path(x).name)

        return df
65 changes: 65 additions & 0 deletions src/data_loaders/base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
import json
import logging
from pathlib import Path
from typing import Any, Dict, List
from abc import ABC, abstractmethod

class BaseDataLoader(ABC):
def __init__(self, logger: logging.Logger = None):
self.logger = logger or logging.getLogger(__name__)

def load_json_file(self, file_path: Path) -> Dict[str, Any]:
if not file_path.exists():
raise FileNotFoundError(
f"JSON file not found: {file_path}. "
f"Expected at: {file_path.absolute()}"
)

try:
with open(file_path, 'r', encoding='utf-8') as f:
data = json.load(f)
self.logger.debug(f"Successfully loaded JSON: {file_path.name}")
return data
except json.JSONDecodeError as e:
raise ValueError(
f"Invalid JSON in file: {file_path}. "
f"Error at line {e.lineno}, column {e.colno}: {e.msg}. "
f"Check file syntax at {file_path.absolute()}"
) from e
except UnicodeDecodeError as e:
raise ValueError(
f"Encoding error in file: {file_path}. "
f"File must be UTF-8 encoded. Error: {e}"
) from e

def load_json_files(self, pattern: str, base_path: Path) -> List[Dict[str, Any]]:
files = list(base_path.rglob(pattern))

if not files:
self.logger.warning(
f"No files matching pattern '{pattern}' found in {base_path}"
)
return []

data_list = []
failed_files = []

for file_path in files:
try:
data = self.load_json_file(file_path)
data_list.append(data)
except (FileNotFoundError, ValueError) as e:
failed_files.append((file_path, str(e)))
self.logger.error(f"Failed to load {file_path}: {e}")

if failed_files:
self.logger.warning(
f"Failed to load {len(failed_files)} files. "
f"Loaded {len(data_list)} successfully."
)

return data_list

@abstractmethod
def load(self) -> Any:
pass
Loading