Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
18 commits
Select commit Hold shift + click to select a range
65586a3
feat: add article data loader with validation and image handling
atharva-2001 Nov 25, 2025
ab2ee11
feat: add member data loader with comprehensive validation
atharva-2001 Nov 25, 2025
b737f2b
feat: add website data loader for config and gallery
atharva-2001 Nov 25, 2025
0361a15
feat: add article processor for categorization and link handling
atharva-2001 Nov 25, 2025
a2ef4a7
feat: add member processor for status determination and role hierarchy
atharva-2001 Nov 25, 2025
ca98e94
feat: add base page renderer with Jinja2 setup
atharva-2001 Nov 25, 2025
7548e69
feat: add simple page renderers for homepage, contact, support, join us
atharva-2001 Nov 25, 2025
181545e
feat: add member page renderers for current, alumni, and individual p…
atharva-2001 Nov 25, 2025
a8e27c8
feat: add article page renderers for research and news pages
atharva-2001 Nov 25, 2025
de26e4a
feat: add gallery page renderer with image processing
atharva-2001 Nov 25, 2025
dc0d19e
feat: add asset copier utility
atharva-2001 Nov 26, 2025
c01a14b
feat: create main orchestrator structure with stage pipeline
atharva-2001 Nov 26, 2025
ac5ee69
feat: add data loading methods to main orchestrator
atharva-2001 Nov 26, 2025
2277694
feat: add data processing methods to main orchestrator
atharva-2001 Nov 26, 2025
e828404
feat: add rendering methods and main entry point
atharva-2001 Nov 26, 2025
44de78e
feat: add simplified notebook interface for site generation
atharva-2001 Nov 26, 2025
e9d7dc5
feat: logging and base files
atharva-2001 Nov 28, 2025
3fbe67e
get rid of bare exceptions
atharva-2001 Dec 2, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
116 changes: 116 additions & 0 deletions notebooks/generate_site_simple.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Website Generation\n",
"\n",
"This notebook provides a simple interface to generate the website."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import sys\n",
"from pathlib import Path\n",
"\n",
"# Add src to path\n",
"sys.path.insert(0, str(Path.cwd().parent))\n",
"\n",
"from src.main import SiteGenerator"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Generate Full Site"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"generator = SiteGenerator(log_level=\"INFO\")\n",
"generator.run()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Debug Individual Stages\n",
"\n",
"If something fails, you can run stages individually:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Create generator\n",
"generator = SiteGenerator(log_level=\"DEBUG\")\n",
"\n",
"# Load data\n",
"generator.load_articles()\n",
"generator.load_members()\n",
"generator.load_website_data()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Process data\n",
"generator.process_article_categories()\n",
"generator.process_member_roles()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Inspect loaded data\n",
"print(f\"Articles: {len(generator.data['articles_df'])}\")\n",
"print(f\"Members: {len(generator.data['member_info_df'])}\")\n",
"print(f\"Current: {len(generator.data['current_members_df'])}\")\n",
"print(f\"Alumni: {len(generator.data['alumni_members_df'])}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# View specific data\n",
"generator.data['current_members_df'][['current_role', 'full_name']]"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"name": "python",
"version": "3.10.0"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
Empty file added src/__init__.py
Empty file.
66 changes: 66 additions & 0 deletions src/config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
from pathlib import Path
from typing import List, Dict

# --- Directory layout -------------------------------------------------------
# BASE_DIR is the root of this repository; the sibling repositories
# (group-data, research_news, kerzendorf-lab.github.io) are assumed to be
# checked out next to it -- TODO confirm this layout in the project README.
BASE_DIR = Path(__file__).parent.parent
GROUP_DATA_DIR = BASE_DIR.parent / "group-data"
TEMPLATE_DIR_PATH = BASE_DIR / "templates"
WEBSITE_DATA_PATH = GROUP_DATA_DIR / "website_data"
# Output repository the generated site is written into.
HOSTING_PATH = BASE_DIR.parent / "kerzendorf-lab.github.io"
ARTICLE_DIR_PATH = BASE_DIR.parent / "research_news" / "articles"
ARTICLE_IMAGE_DESTINATION_DIR = HOSTING_PATH / "website_files" / "images" / "article_content"
MEMBERS_DIR_PATH = GROUP_DATA_DIR / "members"
SUB_RESEARCH_PATH = HOSTING_PATH / "sub_research"
OPPORTUNITIES_PATH = WEBSITE_DATA_PATH / "content" / "opportunities.json"
ROLE_HIERARCHY_PATH = WEBSITE_DATA_PATH / "role_hierarchy.json"
GALLERY_CONTENT_SOURCE = WEBSITE_DATA_PATH / "content" / "gallery"

# --- Article tagging --------------------------------------------------------
# Tags recognized on articles (display casing).
GENERAL_TAGS: List[str] = [
    "Paper", "Poster", "Talk", "Award", "New Team Member",
    "PhD", "Conference", "Undergraduate", "Event", "Achievement"
]

# Hex color per tag, keyed by lowercase tag name. NOTE(review): keys here are
# lowercase while GENERAL_TAGS uses display casing -- presumably lookups
# lowercase the tag first; verify in the article processor.
TAG_COLORS: Dict[str, str] = {
    'paper': '#FF6B6B',
    'poster': '#4ECDC4',
    'talk': '#45B7D1',
    'award': '#96CEB4',
    'new team member': '#FFBE0B',
    'phd': '#9B5DE5',
    'conference': '#FF006E',
    'undergraduate': '#8338EC',
    'event': '#3A86FF',
    'achievement': '#FB5607',
    'astrophysics': '#2EC4B6',
    'machine learning': '#FF9F1C',
    'software': '#E71D36',
    'research': '#011627',
    'news': '#41EAD4'
}

# Metadata fields extracted from each article's info.json.
ARTICLE_METADATA_FIELDS: List[str] = [
    "article_id", "category", "date", "tags",
    "title", "cover_image", "short_description"
]

# --- Member filtering and normalization -------------------------------------
# Groups / institution used to select which members appear on the site --
# assumed to be matched against member records; confirm in the member loader.
GROUP_FILTER: List[str] = ["DTI", "TARDIS", "ICER", "kerzendorf"]
INSTITUTION_FILTER: str = "Michigan State University"

# Maps raw role titles found in member data onto the canonical roles used
# for display/hierarchy.
ROLE_MAP: Dict[str, str] = {
    "Assistant Professor": "Professor",
    "Professorial Assistant": "Undergraduate Student",
    "Visiting Researcher": "Postdoctoral Researcher"
}

# Maps a member's pursued degree onto their displayed role category.
DEGREE_MAP: Dict[str, str] = {
    "Masters": "Graduate Student",
    "PhD": "Postdoctorate",
    "Bachelors": "Undergraduate Student",
}

# JSON section key -> human-readable heading on individual member pages.
INDIVIDUAL_MEMBER_SECTION_MAP: Dict[str, str] = {
    "education": "Education",
    "experiences": "Experience",
    "projects": "Projects",
    "awards": "Awards & Recognition",
    "outreach": "Outreach Programs",
}
6 changes: 6 additions & 0 deletions src/data_loaders/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
from .base import BaseDataLoader
from .articles import ArticleLoader
from .members import MemberLoader
from .website_data import WebsiteDataLoader

__all__ = ['BaseDataLoader', 'ArticleLoader', 'MemberLoader', 'WebsiteDataLoader']
119 changes: 119 additions & 0 deletions src/data_loaders/articles.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
import pandas as pd
from pathlib import Path
from datetime import datetime, date
from typing import Dict, Any

from .base import BaseDataLoader
from src.config import ARTICLE_DIR_PATH, ARTICLE_IMAGE_DESTINATION_DIR
from src.utils.path_helpers import set_new_image_path

class ArticleLoader(BaseDataLoader):
    """Load article ``info.json`` files into a processed pandas DataFrame.

    Articles are discovered recursively under ``ARTICLE_DIR_PATH``.
    Articles not published to the "kg" platform, or dated in the future,
    are skipped. Cover/content image paths are rewritten to point into the
    hosting repository via ``set_new_image_path``.
    """

    def __init__(self, logger=None):
        super().__init__(logger)
        # Source directory of info.json files (research_news repo).
        self.article_dir = ARTICLE_DIR_PATH
        # Directory the rewritten image paths point into (hosting repo).
        self.image_dest_dir = ARTICLE_IMAGE_DESTINATION_DIR

    def load(self) -> pd.DataFrame:
        """Load all publishable articles.

        Returns:
            DataFrame with one row per loaded article; empty DataFrame if
            no article passed the platform/date filters.

        Raises:
            FileNotFoundError: if the article directory does not exist.
            ValueError: if an article is missing required fields or has an
                unparseable date (fail fast so bad data is fixed at source).
        """
        self.logger.info(f"Loading articles from {self.article_dir}")

        if not self.article_dir.exists():
            raise FileNotFoundError(
                f"Article directory not found: {self.article_dir}. "
                f"Expected at: {self.article_dir.absolute()}. "
                f"Check that research_news repo is in correct location."
            )

        article_content_list = []
        today = date.today()
        # Article dates parse to midnight datetimes, so compare against
        # midnight of today: articles dated today are included.
        today_datetime = datetime.combine(today, datetime.min.time())

        info_files = list(self.article_dir.rglob('info.json'))
        self.logger.info(f"Found {len(info_files)} article info.json files")

        for content_file_path in info_files:
            article_content = self._load_single_article(
                content_file_path,
                today_datetime
            )
            if article_content:
                article_content_list.append(article_content)

        if not article_content_list:
            self.logger.warning("No articles loaded successfully")
            return pd.DataFrame()

        df = pd.DataFrame(article_content_list)
        df = self._process_article_dataframe(df)

        self.logger.info(
            f"Successfully loaded {len(df)} articles "
            f"({len(df[df['category'] == 'News'])} news, "
            f"{len(df[df['category'] != 'News'])} research)"
        )

        return df

    def _load_single_article(
        self,
        content_file_path: Path,
        today_datetime: datetime
    ) -> Dict[str, Any] | None:
        """Load one info.json.

        Returns the article dict with image paths rewritten, or ``None``
        when the article is filtered out (not on the "kg" platform, or
        dated in the future).

        Raises:
            ValueError: on missing required fields or a malformed date.
        """
        article_content = self.load_json_file(content_file_path)

        required_fields = ['date', 'platforms', 'cover_image', 'content', 'category']
        missing_fields = [f for f in required_fields if f not in article_content]
        if missing_fields:
            raise ValueError(
                f"Article {content_file_path.parent.name} missing required fields: "
                f"{', '.join(missing_fields)}. "
                f"File: {content_file_path}"
            )

        # Parse eagerly so a malformed date is reported with file context
        # instead of a bare strptime ValueError.
        try:
            article_date = datetime.strptime(article_content["date"], "%m-%d-%Y")
        except ValueError as e:
            raise ValueError(
                f"Article {content_file_path.parent.name} has invalid date "
                f"'{article_content['date']}' (expected MM-DD-YYYY). "
                f"File: {content_file_path}"
            ) from e

        if "kg" not in article_content["platforms"]:
            self.logger.debug(
                f"Skipping {content_file_path.parent.name}: 'kg' not in platforms"
            )
            return None

        if article_date > today_datetime:
            self.logger.debug(
                f"Skipping {content_file_path.parent.name}: "
                f"future date {article_content['date']}"
            )
            return None

        image_path = Path(article_content["cover_image"])
        article_content["cover_image"] = set_new_image_path(
            content_file_path,
            image_path,
            self.image_dest_dir
        )

        # Content keys containing "img" hold image paths; rewrite them too.
        # (Assigning to existing keys while iterating items() is safe.)
        for content_key, content_value in article_content["content"].items():
            if "img" in content_key:
                new_content_value = set_new_image_path(
                    content_file_path,
                    Path(content_value),
                    self.image_dest_dir
                )
                article_content["content"][content_key] = new_content_value

        return article_content

    def _process_article_dataframe(self, df: pd.DataFrame) -> pd.DataFrame:
        """Normalize dtypes and fill defaults on the combined article frame."""
        df["date"] = pd.to_datetime(df["date"], format="%m-%d-%Y")

        # Cover-image dimensions are optional per article: create the column
        # with its default when no article supplied it (previously this
        # raised KeyError), otherwise fill missing/empty entries.
        for column, default in (
            ("cover_image_height", "330px"),
            ("cover_image_width", "520px"),
        ):
            if column not in df.columns:
                df[column] = default
            else:
                df[column] = df[column].fillna(default).replace("", default)

        # Legacy category name still present in older data files.
        df["category"] = df["category"].replace("Overview", "Computational Metascience")

        df['image_name'] = df['cover_image'].apply(lambda x: Path(x).name)

        return df
65 changes: 65 additions & 0 deletions src/data_loaders/base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
import json
import logging
from pathlib import Path
from typing import Any, Dict, List
from abc import ABC, abstractmethod

class BaseDataLoader(ABC):
def __init__(self, logger: logging.Logger = None):
self.logger = logger or logging.getLogger(__name__)

def load_json_file(self, file_path: Path) -> Dict[str, Any]:
if not file_path.exists():
raise FileNotFoundError(
f"JSON file not found: {file_path}. "
f"Expected at: {file_path.absolute()}"
)

try:
with open(file_path, 'r', encoding='utf-8') as f:
data = json.load(f)
self.logger.debug(f"Successfully loaded JSON: {file_path.name}")
return data
except json.JSONDecodeError as e:
raise ValueError(
f"Invalid JSON in file: {file_path}. "
f"Error at line {e.lineno}, column {e.colno}: {e.msg}. "
f"Check file syntax at {file_path.absolute()}"
) from e
except UnicodeDecodeError as e:
raise ValueError(
f"Encoding error in file: {file_path}. "
f"File must be UTF-8 encoded. Error: {e}"
) from e

def load_json_files(self, pattern: str, base_path: Path) -> List[Dict[str, Any]]:
files = list(base_path.rglob(pattern))

if not files:
self.logger.warning(
f"No files matching pattern '{pattern}' found in {base_path}"
)
return []

data_list = []
failed_files = []

for file_path in files:
try:
data = self.load_json_file(file_path)
data_list.append(data)
except (FileNotFoundError, ValueError) as e:
failed_files.append((file_path, str(e)))
self.logger.error(f"Failed to load {file_path}: {e}")

if failed_files:
self.logger.warning(
f"Failed to load {len(failed_files)} files. "
f"Loaded {len(data_list)} successfully."
)

return data_list

@abstractmethod
def load(self) -> Any:
pass
Loading