diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 6195cd9..4c821a8 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -479,25 +479,221 @@ Report unacceptable behavior to [contact@meq-bench.org]. All reports will be rev ## Getting Help -### Resources +We provide multiple channels for getting help and support while working with MEQ-Bench. + +### šŸ“š Documentation and Resources + +#### Primary Documentation +- **[Project Documentation](docs/)**: Comprehensive guides and API reference +- **[README.md](README.md)**: Quick start guide and project overview +- **[Installation Guide](docs/installation.rst)**: Detailed installation instructions +- **[Quickstart Tutorial](docs/quickstart.rst)**: Step-by-step tutorial for new users +- **[API Reference](docs/api/)**: Complete API documentation with examples + +#### Code Examples +- **[Basic Usage](examples/basic_usage.py)**: Simple example showing core functionality +- **[Model Integration Examples](examples/)**: How to integrate different LLM backends +- **[Evaluation Examples](examples/)**: Custom evaluation scenarios and metrics +- **[Data Loading Examples](examples/)**: Loading and processing custom datasets + +#### Technical References +- **[CHANGELOG.md](CHANGELOG.md)**: Version history and feature updates +- **[RELEASE_PROCESS.md](RELEASE_PROCESS.md)**: Release notes and versioning information +- **[Scripts Documentation](scripts/README.md)**: Development and maintenance scripts + +### šŸ†˜ Support Channels + +#### For Different Types of Issues + +**šŸ› Bug Reports and Technical Issues** +- **Where**: [GitHub Issues](https://github.com/heilcheng/MEQ-Bench/issues) +- **When**: When you encounter errors, unexpected behavior, or performance issues +- **What to include**: + - Clear description of the problem + - Steps to reproduce + - Environment details (OS, Python version, package versions) + - Error messages and stack traces + - Minimal code example demonstrating the issue + +**šŸ’” Feature Requests and 
Enhancement Ideas** +- **Where**: [GitHub Issues](https://github.com/heilcheng/MEQ-Bench/issues) (use "enhancement" label) +- **When**: When you have ideas for new features or improvements +- **What to include**: + - Use case description + - Proposed solution or approach + - Benefits to the community + - Examples of similar features in other tools + +**ā“ General Questions and Usage Help** +- **Where**: [GitHub Discussions](https://github.com/heilcheng/MEQ-Bench/discussions) +- **When**: For questions about usage, best practices, or concepts +- **Categories**: + - **Q&A**: General usage questions + - **Ideas**: Discussion of potential features + - **Show and Tell**: Share your work with MEQ-Bench + - **General**: Other discussions + +**šŸ”¬ Research and Scientific Questions** +- **Where**: [GitHub Discussions](https://github.com/heilcheng/MEQ-Bench/discussions) (Research category) +- **When**: Questions about evaluation methodologies, metrics, or scientific validity +- **Topics**: Validation studies, metric interpretation, benchmark design + +**🚨 Security Issues** +- **Where**: Email to [security@meq-bench.org](mailto:security@meq-bench.org) +- **When**: For security vulnerabilities or sensitive issues +- **Note**: Please do not report security issues in public GitHub issues + +### šŸ¤ Community and Collaboration + +#### Communication Channels +- **GitHub Discussions**: Primary forum for community interaction +- **Email**: [contact@meq-bench.org](mailto:contact@meq-bench.org) for project inquiries +- **Research Collaboration**: [research@meq-bench.org](mailto:research@meq-bench.org) for academic partnerships + +#### Community Guidelines +- **Be respectful**: Maintain professional and courteous communication +- **Be specific**: Provide detailed information to help others understand your question +- **Search first**: Check existing issues and discussions before posting +- **Share knowledge**: Help others when you can answer their questions + +### šŸ”§ Self-Help 
Resources + +#### Before Asking for Help + +1. **Check the Documentation** + ```bash + # View documentation locally + cd docs && make html && open _build/html/index.html + ``` + +2. **Search Existing Issues** + - Use GitHub's search: `is:issue label:bug your-search-terms` + - Check both open and closed issues + +3. **Try the Examples** + ```bash + # Run basic example + python examples/basic_usage.py + + # Check if your environment is working + python -c "import src; print('MEQ-Bench imported successfully')" + ``` + +4. **Validate Your Setup** + ```bash + # Run validation script + python scripts/validate_release.py + + # Run tests to check installation + pytest tests/ -v + ``` + +#### Common Issues and Solutions + +**Installation Problems** +```bash +# Clean installation +pip uninstall meq-bench +pip install --no-cache-dir -e . + +# Check Python version +python --version # Should be 3.8+ + +# Install with specific dependency groups +pip install -e .[dev] # Development dependencies +pip install -e .[ml] # Machine learning dependencies +``` + +**Import Errors** +```bash +# Ensure you're in the project directory +cd /path/to/MEQ-Bench +python -c "import src" + +# Check PYTHONPATH +export PYTHONPATH="${PYTHONPATH}:$(pwd)" +``` + +**Model Integration Issues** +```bash +# Test with dummy model first +python run_benchmark.py --model_name dummy --max_items 5 + +# Check API credentials +echo $OPENAI_API_KEY +echo $ANTHROPIC_API_KEY +echo $GOOGLE_API_KEY +``` + +### šŸ“– Learning Resources + +#### For New Users +1. **Start with**: [README.md](README.md) → [Installation](docs/installation.rst) → [Quickstart](docs/quickstart.rst) +2. **Try**: [Basic Usage Example](examples/basic_usage.py) +3. **Learn**: [Evaluation Metrics Guide](docs/evaluation_metrics.rst) +4. **Explore**: [Data Loading Guide](docs/data_loading.rst) + +#### For Researchers +1. **Read**: Methodology papers and citations in documentation +2. 
**Understand**: [Evaluation Framework](docs/evaluation_metrics.rst) +3. **Validate**: Use the [LLM-as-a-Judge validation framework](evaluation/validate_judge.py) +4. **Contribute**: Share validation studies and results + +#### For Developers +1. **Setup**: Development environment with `pip install -e .[dev]` +2. **Read**: [Coding Conventions](#coding-conventions) in this document +3. **Follow**: [Testing guidelines](#testing) and run tests +4. **Use**: Pre-commit hooks for code quality + +### šŸš€ Quick Help Commands + +```bash +# Get general help +python run_benchmark.py --help + +# Check installation +python -c "import src; print('āœ… MEQ-Bench is working')" + +# Run a quick test +python run_benchmark.py --model_name dummy --max_items 2 + +# Validate your environment +python scripts/validate_release.py + +# Get version information +python -c "import src; print(f'MEQ-Bench version: {getattr(src, \"__version__\", \"unknown\")}')" + +# Run basic tests +pytest tests/test_benchmark.py -v +``` + +### šŸ“ž Response Times and Expectations + +- **GitHub Issues**: We aim to respond within 48 hours +- **GitHub Discussions**: Community-driven, responses vary +- **Email**: 3-5 business days for general inquiries +- **Security Issues**: 24 hours acknowledgment, 1 week for assessment + +### šŸŽÆ Getting the Best Help -- **Documentation**: Check the [docs](docs/) directory -- **Examples**: See [examples](examples/) directory -- **Issues**: Search [existing issues](https://github.com/heilcheng/MEQ-Bench/issues) -- **Discussions**: Use [GitHub Discussions](https://github.com/heilcheng/MEQ-Bench/discussions) +To get the most effective help: -### Contact +1. **Be Specific**: Include exact error messages, environment details, and steps to reproduce +2. **Provide Context**: Explain what you're trying to achieve and what you've already tried +3. **Use Templates**: Follow issue templates when available +4. **Share Code**: Provide minimal, reproducible examples +5. 
**Follow Up**: Update issues with additional information or solutions found -- **General Questions**: Create a GitHub Discussion -- **Bug Reports**: Create a GitHub Issue -- **Security Issues**: Email [security@meq-bench.org] -- **Collaboration**: Email [contact@meq-bench.org] +### 🌟 Contributing Back -### Community +If you receive help, consider helping others: +- Answer questions in GitHub Discussions +- Improve documentation based on your experience +- Share useful examples or tutorials +- Report and fix bugs you encounter +- Contribute to the codebase -- **Weekly Office Hours**: [Details TBD] -- **Research Meetings**: [Details TBD] -- **Slack Channel**: [Link TBD] +Remember: Every question helps improve MEQ-Bench for the entire community! --- diff --git a/RELEASE_PROCESS.md b/RELEASE_PROCESS.md new file mode 100644 index 0000000..866ce9c --- /dev/null +++ b/RELEASE_PROCESS.md @@ -0,0 +1,355 @@ +# MEQ-Bench Release Process + +This document outlines the complete process for preparing and publishing MEQ-Bench releases. + +## Overview + +MEQ-Bench follows [Semantic Versioning](https://semver.org/) (MAJOR.MINOR.PATCH): +- **MAJOR**: Breaking changes or significant API modifications +- **MINOR**: New features, backward compatible +- **PATCH**: Bug fixes, backward compatible + +## Release Types + +### Patch Release (1.0.X) +- Bug fixes +- Documentation updates +- Security patches +- Performance improvements without API changes + +### Minor Release (1.X.0) +- New features +- New model backend support +- Enhanced evaluation metrics +- New data loaders +- Backward compatible changes + +### Major Release (X.0.0) +- Breaking API changes +- Significant architectural changes +- Removal of deprecated features +- Major framework updates + +## Pre-Release Checklist + +### 1. Code Quality āœ… +- [ ] All tests pass (`pytest tests/`) +- [ ] Linting checks pass (`flake8`, `mypy`, `bandit`) +- [ ] Code coverage is adequate +- [ ] No security vulnerabilities detected + +### 2. 
Documentation āœ… +- [ ] README.md is up to date +- [ ] API documentation is complete +- [ ] CHANGELOG.md reflects all changes +- [ ] Installation instructions are accurate +- [ ] Examples work with current code + +### 3. Dependencies āœ… +- [ ] requirements.txt is up to date +- [ ] Dependencies are pinned appropriately +- [ ] No unused dependencies +- [ ] Compatibility tested with supported Python versions + +### 4. Testing āœ… +- [ ] Unit tests cover new functionality +- [ ] Integration tests pass +- [ ] Manual testing completed +- [ ] Performance regression tests (if applicable) +- [ ] Cross-platform testing (Windows, macOS, Linux) + +### 5. Version Management āœ… +- [ ] Version numbers updated in all files +- [ ] CHANGELOG.md has [Unreleased] section with changes +- [ ] Git working directory is clean +- [ ] On main/master branch + +## Automated Release Preparation + +Use the automated release preparation script: + +```bash +# Dry run to see what would be changed +python scripts/prepare_release.py --version 1.1.0 --type minor --dry-run + +# Actual release preparation +python scripts/prepare_release.py --version 1.1.0 --type minor + +# Skip tests for faster preparation (not recommended) +python scripts/prepare_release.py --version 1.1.0 --type minor --skip-tests +``` + +The script automatically: +- āœ… Validates version format and increment +- āœ… Runs test suite +- āœ… Performs linting checks +- āœ… Updates version in all relevant files +- āœ… Updates CHANGELOG.md +- āœ… Builds package +- āœ… Generates release notes +- āœ… Provides next steps instructions + +## Manual Release Steps + +### 1. Prepare Release Branch (Optional for Major Releases) +```bash +git checkout -b release/v1.1.0 +``` + +### 2. Update Version Numbers +Update version in: +- `setup.py`: `version="1.1.0"` +- `src/__init__.py`: `__version__ = "1.1.0"` +- `docs/conf.py`: `version = "1.1.0"` and `release = "1.1.0"` + +### 3. 
Update CHANGELOG.md +- Move items from `[Unreleased]` to new version section +- Add release date +- Create new empty `[Unreleased]` section + +### 4. Run Release Preparation Script +```bash +python scripts/prepare_release.py --version 1.1.0 --type minor +``` + +### 5. Review and Commit Changes +```bash +git add . +git commit -m "Prepare release v1.1.0" +``` + +### 6. Create and Push Tag +```bash +git tag v1.1.0 +git push origin v1.1.0 +git push origin main +``` + +### 7. Build Package +```bash +python setup.py sdist bdist_wheel +``` + +### 8. Test Package Installation +```bash +# Create virtual environment +python -m venv test_env +source test_env/bin/activate # On Windows: test_env\Scripts\activate + +# Install from built package +pip install dist/meq_bench-1.1.0-py3-none-any.whl + +# Test basic functionality +python -c "import src; print('Package works!')" +``` + +## Publishing Release + +### 1. GitHub Release +1. Go to [GitHub Releases](https://github.com/heilcheng/MEQ-Bench/releases) +2. Click "Draft a new release" +3. Choose the tag: `v1.1.0` +4. Release title: `MEQ-Bench v1.1.0` +5. Use generated release notes from `release_notes_1.1.0.md` +6. Upload built packages from `dist/` directory +7. Publish release + +### 2. PyPI Publication (Optional) +```bash +# Install publishing tools +pip install twine + +# Check package +twine check dist/* + +# Upload to Test PyPI first +twine upload --repository testpypi dist/* + +# Test installation from Test PyPI +pip install --index-url https://test.pypi.org/simple/ meq-bench + +# Upload to production PyPI +twine upload dist/* +``` + +### 3. Documentation Update +- Ensure Read the Docs builds successfully +- Update any version-specific documentation +- Announce release in documentation + +## Post-Release Tasks + +### 1. Verify Release +- [ ] GitHub release is published +- [ ] Package installs correctly +- [ ] Documentation builds successfully +- [ ] All CI/CD pipelines pass + +### 2. 
Communication +- [ ] Update project README badges if needed +- [ ] Announce on relevant channels +- [ ] Update any external documentation + +### 3. Cleanup +- [ ] Delete release branch (if used) +- [ ] Archive old releases if needed +- [ ] Update project roadmap + +## Emergency Patch Process + +For critical bug fixes or security issues: + +1. **Immediate Fix**: + ```bash + git checkout main + git checkout -b hotfix/v1.0.1 + # Make minimal fix + git commit -m "Fix critical issue" + ``` + +2. **Fast-Track Release**: + ```bash + python scripts/prepare_release.py --version 1.0.1 --type patch --skip-tests + git push origin hotfix/v1.0.1 + # Create PR and merge immediately + ``` + +3. **Emergency Publication**: + - Skip extensive testing for critical security fixes + - Document the urgency in release notes + - Follow up with comprehensive testing + +## Release Schedule + +### Regular Schedule +- **Patch releases**: As needed (bug fixes) +- **Minor releases**: Monthly or quarterly +- **Major releases**: Annually or when breaking changes accumulate + +### Release Windows +- Avoid releases during holidays +- Prefer Tuesday-Thursday releases +- Allow time for post-release monitoring + +## Rollback Procedure + +If a release has critical issues: + +### 1. Immediate Response +```bash +# Revert the problematic tag +git tag -d v1.1.0 +git push origin :refs/tags/v1.1.0 + +# Create hotfix +git checkout v1.0.0 # Last known good version +git checkout -b hotfix/v1.1.1 +``` + +### 2. Communication +- Announce the issue immediately +- Provide workarounds if available +- Give timeline for fix + +### 3. 
Fix and Re-release +- Address the issue +- Increment version (e.g., 1.1.0 → 1.1.1) +- Follow expedited release process + +## Tools and Scripts + +### Release Preparation Script +`scripts/prepare_release.py` - Automates most release tasks + +### Validation Scripts +- `scripts/validate_package.py` - Package validation +- `scripts/check_dependencies.py` - Dependency validation + +### CI/CD Integration +GitHub Actions workflows automatically: +- Run tests on pull requests +- Build packages on tags +- Deploy documentation on releases + +## Troubleshooting + +### Common Issues + +#### Version Conflicts +```bash +# Check current versions +grep -r "version.*=" setup.py src/__init__.py docs/conf.py +``` + +#### Test Failures +```bash +# Run specific test +pytest tests/test_specific.py -v + +# Run with coverage +pytest --cov=src tests/ +``` + +#### Package Build Issues +```bash +# Clean previous builds +rm -rf build/ dist/ *.egg-info/ + +# Rebuild +python setup.py clean --all +python setup.py sdist bdist_wheel +``` + +#### Documentation Build Issues +```bash +cd docs/ +make clean +make html +``` + +## Security Considerations + +### Pre-Release Security Checks +- [ ] Run security scanning (`bandit`) +- [ ] Check for exposed secrets +- [ ] Validate dependencies for vulnerabilities +- [ ] Review code for security best practices + +### Secure Release Process +- Use signed commits for releases +- Verify package integrity +- Use secure channels for communication +- Document security fixes prominently + +## Version History Template + +Use this template for CHANGELOG.md entries: + +```markdown +## [1.1.0] - 2025-01-XX + +### Added +- New HealthSearchQA data loader for expanded dataset support +- Apple MLX framework support for optimized inference on Apple Silicon +- Google Gemini API integration for LLM-as-a-judge evaluation +- Enhanced data loading capabilities with custom field mapping +- Comprehensive LLM-as-a-Judge validation framework with three-part strategy + +### Changed +- 
Updated LLMJudge class to support multiple API providers +- Enhanced run_benchmark.py with MLX backend support and retry mechanisms +- Improved error handling and logging throughout the codebase + +### Fixed +- Added retry mechanisms with exponential backoff for model generation +- Improved input validation and error messages +- Enhanced robustness for edge cases and empty datasets + +### Security +- Updated dependency versions to address security vulnerabilities +- Improved input validation to prevent injection attacks +``` + +--- + +For questions about the release process, please contact the maintainers or open an issue on GitHub. \ No newline at end of file diff --git a/docs/index.rst b/docs/index.rst index 7888490..1f94619 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -89,53 +89,221 @@ MEQ-Bench is built with SOLID principles and uses: Getting Help ------------ -If you need assistance with MEQ-Bench, here are the best ways to get help: +We provide comprehensive support channels to help you successfully use MEQ-Bench. Choose the most appropriate channel based on your needs: -**Check Existing Resources** +šŸ“š **Documentation and Self-Help** +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -* **Documentation**: Start by reviewing this documentation for answers to common questions and detailed usage examples. -* **GitHub Issues**: Search our `GitHub Issues page `_ to see if your question has already been addressed. -* **Examples**: Check the `examples/ `_ directory for practical usage patterns. 
+Before reaching out, please check these resources: -**Report Issues** +* **Primary Documentation**: This comprehensive guide covers installation, usage, and advanced topics +* **API Reference**: Detailed function and class documentation with examples +* **Quickstart Guide**: :doc:`quickstart` - Get up and running in minutes +* **Installation Guide**: :doc:`installation` - Step-by-step setup instructions +* **Evaluation Guide**: :doc:`evaluation_metrics` - Understanding MEQ-Bench metrics +* **Data Loading Guide**: :doc:`data_loading` - Working with datasets -If you encounter a bug or have a feature request: +**Code Examples and Tutorials** -1. **Search existing issues** first to avoid duplicates -2. **Open a new issue** with: +* **Basic Usage**: `examples/basic_usage.py `_ - Simple getting started example +* **Model Integration**: Examples for OpenAI, Anthropic, Google Gemini, and MLX backends +* **Custom Datasets**: How to load and process your own medical datasets +* **Evaluation Examples**: Custom scoring and validation scenarios + +**Quick Validation Commands** + +.. code-block:: bash + + # Verify installation + python -c "import src; print('āœ… MEQ-Bench is working')" - * Clear description of the problem or request - * Steps to reproduce (for bugs) - * Your environment details (OS, Python version, etc.) - * Relevant code snippets or error messages + # Run basic test + python run_benchmark.py --model_name dummy --max_items 2 + + # Validate environment + python scripts/validate_release.py + +šŸ†˜ **Support Channels by Issue Type** +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +**šŸ› Bug Reports and Technical Issues** + +* **Where**: `GitHub Issues `_ +* **When**: Errors, crashes, unexpected behavior, or performance problems +* **Include**: + + * Clear problem description and steps to reproduce + * Environment details (OS, Python version, package versions) + * Complete error messages and stack traces + * Minimal code example that demonstrates the issue + +.. 
code-block:: bash + + # Gather environment info for bug reports + python --version + pip list | grep -E "(torch|transformers|openai|anthropic)" + python -c "import platform; print(platform.platform())" + +**šŸ’” Feature Requests and Ideas** + +* **Where**: `GitHub Issues `_ (use "enhancement" label) +* **When**: You have ideas for new features, metrics, or improvements +* **Include**: + + * Clear use case description and benefits + * Proposed implementation approach if known + * Examples from other tools or research + * Considerations for medical AI safety and ethics -**Ask Questions** +**ā“ Usage Questions and Best Practices** -For general questions about usage, best practices, or research applications: +* **Where**: `GitHub Discussions `_ +* **When**: Questions about how to use MEQ-Bench effectively +* **Categories**: + + * **Q&A**: General usage and troubleshooting questions + * **Ideas**: Feature discussions and feedback + * **Show and Tell**: Share your research and applications + * **Research**: Scientific methodology and validation discussions -* **GitHub Discussions**: Use our `GitHub Discussions `_ for community support -* **Email Support**: For direct assistance, contact our development team at: contact@meq-bench.org +**šŸ”¬ Research and Academic Support** -**Contribute** +* **Where**: Email to `research@meq-bench.org `_ +* **When**: Academic collaborations, methodology questions, or validation studies +* **Topics**: Metric interpretation, benchmark design, evaluation methodologies -We welcome contributions! 
See our `Contributing Guidelines `_ for: +**🚨 Security and Safety Issues** -* Development setup instructions -* Coding standards and best practices -* Pull request process -* How to add new features or datasets +* **Where**: Email to `security@meq-bench.org `_ +* **When**: Security vulnerabilities, medical safety concerns, or ethical issues +* **Note**: Please do not report security issues in public forums -**Community Guidelines** +šŸ”§ **Troubleshooting Common Issues** +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +**Installation Problems** + +.. code-block:: bash + + # Clean reinstall + pip uninstall meq-bench + pip install --no-cache-dir -e . + + # Install with specific dependency groups + pip install -e .[dev] # Development tools + pip install -e .[ml] # Machine learning libraries + pip install -e .[llm] # LLM API clients + +**Import Errors** + +.. code-block:: bash + + # Ensure correct directory + cd /path/to/MEQ-Bench + python -c "import src" + + # Check Python path + export PYTHONPATH="${PYTHONPATH}:$(pwd)" + +**Model Integration Issues** + +.. 
code-block:: bash + + # Test with dummy model first + python run_benchmark.py --model_name dummy --max_items 2 + + # Verify API credentials + echo $OPENAI_API_KEY + echo $ANTHROPIC_API_KEY + echo $GOOGLE_API_KEY + +**Performance and Memory Issues** + +* Use smaller models or reduce ``max_items`` for testing +* Check available GPU memory: ``nvidia-smi`` (if using CUDA) +* Consider MLX backend for Apple Silicon: ``--model_name mlx:model_id`` +* Enable logging for debugging: ``--verbose`` + +šŸ“ž **Response Times and Expectations** +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +* **GitHub Issues**: 48 hours response time for bugs and urgent issues +* **GitHub Discussions**: Community-driven, responses vary by topic +* **Email Support**: 3-5 business days for general inquiries +* **Research Inquiries**: 1 week for academic collaboration requests +* **Security Issues**: 24 hours acknowledgment, 1 week for full assessment + +šŸŽÆ **Getting Effective Help** +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +To receive the best possible assistance: + +1. **Search First**: Check existing issues, discussions, and documentation +2. **Be Specific**: Include exact error messages and reproduction steps +3. **Provide Context**: Explain your goal and what you've already tried +4. **Share Code**: Include minimal, reproducible examples +5. **Follow Templates**: Use issue templates when available +6. 
**Stay Engaged**: Respond to follow-up questions promptly + +šŸ¤ **Community and Contribution** +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +**Contributing Back** + +If you receive help, consider contributing to the community: + +* Answer questions in GitHub Discussions +* Improve documentation based on your experience +* Submit bug fixes or feature enhancements +* Share usage examples and tutorials +* Participate in validation studies + +**Development and Research Collaboration** + +We welcome: + +* **Code Contributions**: See our `Contributing Guidelines `_ +* **Research Partnerships**: Academic collaborations and validation studies +* **Dataset Contributions**: New medical datasets and evaluation benchmarks +* **Methodology Improvements**: Enhanced metrics and evaluation frameworks + +**Community Standards** + +Our community values: + +* **Respectful Communication**: Professional and courteous interactions +* **Scientific Rigor**: Evidence-based discussions and peer review +* **Open Collaboration**: Sharing knowledge and helping others succeed +* **Medical Ethics**: Responsible development of medical AI systems +* **Inclusivity**: Welcoming contributors from diverse backgrounds + +**Quick Reference for Contributors** + +.. 
code-block:: bash + + # Development setup + git clone https://github.com/YOUR_USERNAME/MEQ-Bench.git + cd MEQ-Bench + pip install -e .[dev] + pre-commit install + + # Run tests + pytest tests/ -v + + # Code quality checks + black src/ tests/ + flake8 src/ tests/ + mypy src/ -When seeking help: +šŸ“§ **Direct Contact Information** +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -* Be respectful and professional -* Provide sufficient context and details -* Include code examples when relevant -* Follow up if issues are resolved -* Help others when you can +* **General Support**: `contact@meq-bench.org `_ +* **Research Collaboration**: `research@meq-bench.org `_ +* **Security Issues**: `security@meq-bench.org `_ +* **Media and Press**: `press@meq-bench.org `_ -For urgent issues related to medical AI safety or security concerns, please email us directly at contact@meq-bench.org with "URGENT" in the subject line. +For urgent medical AI safety concerns, include "URGENT" in your email subject line. Citation -------- diff --git a/evaluation/validate_judge.py b/evaluation/validate_judge.py index d13dddb..4b43f81 100644 --- a/evaluation/validate_judge.py +++ b/evaluation/validate_judge.py @@ -474,4 +474,608 @@ def load_validation_data(data_path: Union[str, Path]) -> List[Dict[str, Any]]: raise json.JSONDecodeError( f"Invalid JSON in validation data file: {e}", e.doc, e.pos - ) \ No newline at end of file + ) + + +# Additional imports for comprehensive validation +import time + + +def create_synthetic_test_set() -> List[Dict[str, Any]]: + """Create a synthetic test set with unambiguous cases for judge validation. + + This function implements the first part of the three-part validation strategy: + Synthetic Agreement Testing. It creates clear, unambiguous test cases where + the expected quality ratings are obvious, allowing verification that the + LLM judge applies rubrics correctly. + + Returns: + List of synthetic test cases with known expected ratings. 
+ + Example: + ```python + synthetic_tests = create_synthetic_test_set() + result = validate_llm_judge(synthetic_tests, llm_judge) + ``` + """ + synthetic_cases = [ + # High Quality Examples (Expected rating: 4.5-5.0) + { + 'medical_content': 'Diabetes is a condition where blood sugar levels are too high.', + 'generated_explanation': ( + 'Diabetes means your body has trouble controlling sugar in your blood. ' + 'When you eat, your body breaks down food into sugar (glucose) that goes into your bloodstream. ' + 'Normally, your body makes insulin to help sugar get into your cells for energy. ' + 'With diabetes, either your body doesn\'t make enough insulin or can\'t use it properly. ' + 'This causes sugar to build up in your blood instead of going into your cells. ' + 'High blood sugar over time can damage your heart, eyes, kidneys, and nerves. ' + 'The good news is that diabetes can be managed with healthy eating, exercise, ' + 'and sometimes medicine to help keep your blood sugar in a healthy range.' + ), + 'human_rating': 4.8, + 'audience': 'patient', + 'expected_range': (4.5, 5.0), + 'rationale': 'Clear, accurate, appropriate language for patient, actionable advice' + }, + + # Medium Quality Examples (Expected rating: 3.0-3.5) + { + 'medical_content': 'Hypertension requires lifestyle modifications and possibly medication.', + 'generated_explanation': ( + 'High blood pressure is when the force of blood against artery walls is too high. ' + 'You need to change your lifestyle and might need medication. ' + 'Eat less salt and exercise more.' + ), + 'human_rating': 3.2, + 'audience': 'patient', + 'expected_range': (2.8, 3.8), + 'rationale': 'Accurate but brief, missing detailed explanations and empathy' + }, + + # Low Quality Examples (Expected rating: 1.5-2.5) + { + 'medical_content': 'Myocardial infarction requires immediate medical attention.', + 'generated_explanation': ( + 'MI is bad. Go to doctor. Heart attack maybe. Could die. 
Very serious medical emergency situation.' + ), + 'human_rating': 2.0, + 'audience': 'patient', + 'expected_range': (1.5, 2.5), + 'rationale': 'Inaccurate terminology, creates anxiety, poor communication' + }, + + # Professional Audience - High Quality (Expected rating: 4.5-5.0) + { + 'medical_content': 'Patient presents with chest pain and elevated troponins.', + 'generated_explanation': ( + 'Elevated troponins in the setting of chest pain suggest myocardial injury. ' + 'Differential diagnosis includes STEMI, NSTEMI, unstable angina, myocarditis, ' + 'pulmonary embolism, and aortic dissection. Recommend immediate ECG, ' + 'serial troponins, chest X-ray, and echocardiogram. Consider cardiology ' + 'consultation and initiate dual antiplatelet therapy if ACS suspected, ' + 'pending contraindications. Monitor for arrhythmias and hemodynamic instability.' + ), + 'human_rating': 4.7, + 'audience': 'physician', + 'expected_range': (4.5, 5.0), + 'rationale': 'Comprehensive differential, appropriate clinical decision making' + }, + + # Nurse Audience - Medium Quality (Expected rating: 3.0-3.5) + { + 'medical_content': 'Patient requires wound care and infection monitoring.', + 'generated_explanation': ( + 'Monitor the wound daily for signs of infection like redness, swelling, warmth, ' + 'or discharge. Change dressings as ordered and document wound characteristics. ' + 'Watch vital signs for fever.' + ), + 'human_rating': 3.3, + 'audience': 'nurse', + 'expected_range': (3.0, 3.8), + 'rationale': 'Good nursing focus but lacks detail on specific interventions' + }, + + # Safety Issues - Very Low Quality (Expected rating: 1.0-1.5) + { + 'medical_content': 'Patient asks about medication dosage.', + 'generated_explanation': ( + 'Just take whatever amount feels right. More medicine is usually better. ' + 'If it doesn\'t work, double the dose. You don\'t need to talk to your doctor first.' 
+ ), + 'human_rating': 1.0, + 'audience': 'patient', + 'expected_range': (1.0, 1.5), + 'rationale': 'Dangerous advice, contradicts medical safety principles' + } + ] + + logger.info(f"Created synthetic test set with {len(synthetic_cases)} cases") + return synthetic_cases + + +def run_synthetic_agreement_testing( + llm_judge: Any, + tolerance: float = 0.5 +) -> Dict[str, Any]: + """Run synthetic agreement testing to validate LLM judge rubric application. + + Args: + llm_judge: LLM judge instance to validate. + tolerance: Acceptable deviation from expected rating range. + + Returns: + Dictionary with testing results and analysis. + """ + synthetic_cases = create_synthetic_test_set() + results = { + 'total_cases': len(synthetic_cases), + 'passed_cases': 0, + 'failed_cases': 0, + 'case_results': [], + 'overall_accuracy': 0.0, + 'tolerance': tolerance + } + + for i, case in enumerate(synthetic_cases): + try: + # Score the synthetic case + llm_score = _score_explanation( + llm_judge, + case['medical_content'], + case['generated_explanation'], + case['audience'] + ) + + if llm_score is None: + logger.warning(f"Failed to score synthetic case {i}") + continue + + # Check if within expected range + expected_min, expected_max = case['expected_range'] + in_range = expected_min - tolerance <= llm_score <= expected_max + tolerance + + case_result = { + 'case_id': i, + 'llm_score': llm_score, + 'human_rating': case['human_rating'], + 'expected_range': case['expected_range'], + 'in_expected_range': in_range, + 'audience': case['audience'], + 'rationale': case['rationale'] + } + + results['case_results'].append(case_result) + + if in_range: + results['passed_cases'] += 1 + else: + results['failed_cases'] += 1 + logger.warning( + f"Synthetic case {i} failed: LLM score {llm_score:.2f} " + f"outside expected range {expected_min:.1f}-{expected_max:.1f}" + ) + + except Exception as e: + logger.error(f"Error testing synthetic case {i}: {e}") + continue + + # Calculate overall accuracy + 
total_valid = results['passed_cases'] + results['failed_cases'] + if total_valid > 0: + results['overall_accuracy'] = results['passed_cases'] / total_valid + + logger.info( + f"Synthetic testing complete: {results['passed_cases']}/{total_valid} " + f"cases passed ({results['overall_accuracy']:.1%} accuracy)" + ) + + return results + + +def calculate_inter_rater_reliability( + predictions: List[Dict[str, Any]], + judge_models: List[Any], + medical_content: Optional[str] = None, + audience: str = 'patient' +) -> Dict[str, float]: + """Calculate inter-rater reliability using Krippendorff's Alpha. + + This implements the second part of the three-part validation strategy: + Cross-Model Agreement. It measures agreement between multiple LLM judges + to assess consistency of automated evaluation. + + Args: + predictions: List of predictions to evaluate. + judge_models: List of different LLM judge instances. + medical_content: Medical content for context. + audience: Target audience. + + Returns: + Dictionary with reliability metrics. 
+ + Example: + ```python + judges = [gpt4_judge, claude_judge, deepseek_judge] + reliability = calculate_inter_rater_reliability(predictions, judges) + ``` + """ + try: + import krippendorff + except ImportError: + logger.warning("krippendorff library not available, using simplified correlation") + return _calculate_simplified_agreement(predictions, judge_models, medical_content, audience) + + logger.info(f"Calculating inter-rater reliability with {len(judge_models)} judges") + + # Collect scores from all judges + all_scores = [] # Each row is one judge, each column is one prediction + + for judge_idx, judge in enumerate(judge_models): + judge_scores = [] + + for pred_idx, prediction in enumerate(predictions): + try: + explanation = prediction.get('generated_explanation', '') + current_medical_content = prediction.get('medical_content', medical_content) + current_audience = prediction.get('audience', audience) + + if not current_medical_content or not explanation: + judge_scores.append(None) # Missing data + continue + + score = _score_explanation(judge, current_medical_content, explanation, current_audience) + judge_scores.append(score) + + except Exception as e: + logger.warning(f"Error scoring prediction {pred_idx} with judge {judge_idx}: {e}") + judge_scores.append(None) + + all_scores.append(judge_scores) + logger.info(f"Judge {judge_idx} scored {sum(1 for s in judge_scores if s is not None)} predictions") + + # Calculate Krippendorff's Alpha + try: + # Convert to format expected by krippendorff library + reliability_data = np.array(all_scores, dtype=float) + + # Calculate alpha for interval data (continuous scores) + alpha = krippendorff.alpha(reliability_data, level_of_measurement='interval') + + # Also calculate pairwise correlations between judges + pairwise_correlations = [] + for i in range(len(all_scores)): + for j in range(i + 1, len(all_scores)): + scores1 = [s for s in all_scores[i] if s is not None] + scores2 = [s for s in all_scores[j] if s is not 
None] + + if len(scores1) >= 2 and len(scores2) >= 2: + # Align scores (only use predictions where both judges provided scores) + aligned1, aligned2 = [], [] + for idx in range(min(len(all_scores[i]), len(all_scores[j]))): + if all_scores[i][idx] is not None and all_scores[j][idx] is not None: + aligned1.append(all_scores[i][idx]) + aligned2.append(all_scores[j][idx]) + + if len(aligned1) >= 2: + correlation, _ = spearmanr(aligned1, aligned2) + if not np.isnan(correlation): + pairwise_correlations.append(correlation) + + avg_correlation = np.mean(pairwise_correlations) if pairwise_correlations else 0.0 + + return { + 'krippendorff_alpha': float(alpha), + 'average_pairwise_correlation': float(avg_correlation), + 'num_judges': len(judge_models), + 'num_predictions': len(predictions), + 'pairwise_correlations': [float(c) for c in pairwise_correlations] + } + + except Exception as e: + logger.error(f"Error calculating Krippendorff's Alpha: {e}") + return _calculate_simplified_agreement(predictions, judge_models, medical_content, audience) + + +def _calculate_simplified_agreement( + predictions: List[Dict[str, Any]], + judge_models: List[Any], + medical_content: Optional[str] = None, + audience: str = 'patient' +) -> Dict[str, float]: + """Simplified agreement calculation using pairwise correlations.""" + all_scores = [] + + for judge in judge_models: + judge_scores = [] + for prediction in predictions: + try: + explanation = prediction.get('generated_explanation', '') + current_medical_content = prediction.get('medical_content', medical_content) + current_audience = prediction.get('audience', audience) + + if current_medical_content and explanation: + score = _score_explanation(judge, current_medical_content, explanation, current_audience) + judge_scores.append(score if score is not None else np.nan) + else: + judge_scores.append(np.nan) + except Exception: + judge_scores.append(np.nan) + + all_scores.append(judge_scores) + + # Calculate pairwise correlations + 
correlations = [] + for i in range(len(all_scores)): + for j in range(i + 1, len(all_scores)): + valid_pairs = [(s1, s2) for s1, s2 in zip(all_scores[i], all_scores[j]) + if not (np.isnan(s1) or np.isnan(s2))] + + if len(valid_pairs) >= 2: + scores1, scores2 = zip(*valid_pairs) + correlation, _ = spearmanr(scores1, scores2) + if not np.isnan(correlation): + correlations.append(correlation) + + avg_correlation = np.mean(correlations) if correlations else 0.0 + + return { + 'krippendorff_alpha': np.nan, # Not calculated + 'average_pairwise_correlation': float(avg_correlation), + 'num_judges': len(judge_models), + 'num_predictions': len(predictions), + 'pairwise_correlations': [float(c) for c in correlations] + } + + +def run_correlation_analysis( + predictions: List[Dict[str, Any]], + llm_judge: Any, + quality_indicators: Optional[Dict[str, Any]] = None, + medical_content: Optional[str] = None, + audience: str = 'patient' +) -> Dict[str, Any]: + """Run correlation analysis between automated scores and quality indicators. + + This implements the third part of the three-part validation strategy: + Correlation Analysis. It correlates automated scores with quality indicators + from source datasets (e.g., consumer explanations from MedQuAD). + + Args: + predictions: List of predictions with quality indicators. + llm_judge: LLM judge instance. + quality_indicators: Additional quality metrics from source datasets. + medical_content: Medical content for context. + audience: Target audience. + + Returns: + Dictionary with correlation analysis results. 
+ + Example: + ```python + # Predictions with source quality indicators + predictions = [ + { + 'generated_explanation': 'explanation text', + 'source_quality_score': 4.2, # From original dataset + 'readability_score': 8.5, # Flesch-Kincaid score + 'expert_rating': 4.0 # Expert evaluation + } + ] + + correlation_results = run_correlation_analysis(predictions, llm_judge) + ``` + """ + logger.info("Running correlation analysis with quality indicators") + + llm_scores = [] + source_quality_scores = [] + readability_scores = [] + expert_ratings = [] + + for i, prediction in enumerate(predictions): + try: + # Get LLM score + explanation = prediction.get('generated_explanation', '') + current_medical_content = prediction.get('medical_content', medical_content) + current_audience = prediction.get('audience', audience) + + if not current_medical_content or not explanation: + continue + + llm_score = _score_explanation(llm_judge, current_medical_content, explanation, current_audience) + if llm_score is None: + continue + + llm_scores.append(llm_score) + + # Collect quality indicators + source_quality_scores.append(prediction.get('source_quality_score', np.nan)) + readability_scores.append(prediction.get('readability_score', np.nan)) + expert_ratings.append(prediction.get('expert_rating', np.nan)) + + except Exception as e: + logger.warning(f"Error processing prediction {i} for correlation analysis: {e}") + continue + + if len(llm_scores) < 2: + logger.warning("Insufficient valid scores for correlation analysis") + return {'error': 'Insufficient data for correlation analysis'} + + results = { + 'num_samples': len(llm_scores), + 'correlations': {}, + 'llm_score_stats': { + 'mean': float(np.mean(llm_scores)), + 'std': float(np.std(llm_scores)), + 'min': float(np.min(llm_scores)), + 'max': float(np.max(llm_scores)) + } + } + + # Calculate correlations with each quality indicator + quality_metrics = { + 'source_quality_score': source_quality_scores, + 'readability_score': 
readability_scores, + 'expert_rating': expert_ratings + } + + for metric_name, metric_scores in quality_metrics.items(): + # Filter out NaN values + valid_pairs = [(llm, metric) for llm, metric in zip(llm_scores, metric_scores) + if not np.isnan(metric)] + + if len(valid_pairs) >= 2: + llm_valid, metric_valid = zip(*valid_pairs) + + # Calculate both Spearman and Pearson correlations + spearman_corr, spearman_p = spearmanr(llm_valid, metric_valid) + pearson_corr, pearson_p = pearsonr(llm_valid, metric_valid) + + results['correlations'][metric_name] = { + 'spearman_correlation': float(spearman_corr) if not np.isnan(spearman_corr) else None, + 'spearman_p_value': float(spearman_p) if not np.isnan(spearman_p) else None, + 'pearson_correlation': float(pearson_corr) if not np.isnan(pearson_corr) else None, + 'pearson_p_value': float(pearson_p) if not np.isnan(pearson_p) else None, + 'n_samples': len(valid_pairs), + 'metric_stats': { + 'mean': float(np.mean(metric_valid)), + 'std': float(np.std(metric_valid)), + 'min': float(np.min(metric_valid)), + 'max': float(np.max(metric_valid)) + } + } + else: + results['correlations'][metric_name] = { + 'error': f'Insufficient data (n={len(valid_pairs)})' + } + + # Add overall quality assessment + valid_correlations = [corr['spearman_correlation'] for corr in results['correlations'].values() + if isinstance(corr, dict) and corr.get('spearman_correlation') is not None] + + if valid_correlations: + results['overall_correlation_strength'] = float(np.mean(valid_correlations)) + results['correlation_consistency'] = float(np.std(valid_correlations)) + + logger.info(f"Correlation analysis complete with {results['num_samples']} samples") + return results + + +def run_comprehensive_validation( + predictions: List[Dict[str, Any]], + llm_judge: Any, + additional_judges: Optional[List[Any]] = None, + medical_content: Optional[str] = None, + audience: str = 'patient', + output_path: Optional[Union[str, Path]] = None +) -> Dict[str, Any]: + 
"""Run the complete three-part validation strategy. + + Combines synthetic agreement testing, cross-model agreement, and correlation + analysis into a comprehensive validation report. + + Args: + predictions: Validation predictions. + llm_judge: Primary LLM judge to validate. + additional_judges: Additional judges for inter-rater reliability. + medical_content: Medical content for context. + audience: Target audience. + output_path: Optional path to save detailed results. + + Returns: + Comprehensive validation results dictionary. + """ + logger.info("Starting comprehensive three-part validation") + + comprehensive_results = { + 'validation_timestamp': time.time(), + 'validation_components': { + 'synthetic_agreement': None, + 'inter_rater_reliability': None, + 'correlation_analysis': None + }, + 'overall_assessment': {}, + 'recommendations': [] + } + + # Part 1: Synthetic Agreement Testing + try: + logger.info("Running synthetic agreement testing...") + synthetic_results = run_synthetic_agreement_testing(llm_judge) + comprehensive_results['validation_components']['synthetic_agreement'] = synthetic_results + + if synthetic_results['overall_accuracy'] >= 0.8: + comprehensive_results['recommendations'].append("āœ“ Synthetic testing passed - judge applies rubrics correctly") + else: + comprehensive_results['recommendations'].append("⚠ Synthetic testing concerns - judge may misapply rubrics") + + except Exception as e: + logger.error(f"Synthetic agreement testing failed: {e}") + comprehensive_results['validation_components']['synthetic_agreement'] = {'error': str(e)} + + # Part 2: Inter-Rater Reliability + if additional_judges: + try: + logger.info("Running inter-rater reliability analysis...") + all_judges = [llm_judge] + additional_judges + reliability_results = calculate_inter_rater_reliability(predictions, all_judges, medical_content, audience) + comprehensive_results['validation_components']['inter_rater_reliability'] = reliability_results + + avg_correlation = 
reliability_results.get('average_pairwise_correlation', 0) + if avg_correlation >= 0.7: + comprehensive_results['recommendations'].append("āœ“ High inter-rater reliability - consistent scoring") + elif avg_correlation >= 0.5: + comprehensive_results['recommendations'].append("~ Moderate inter-rater reliability - acceptable consistency") + else: + comprehensive_results['recommendations'].append("⚠ Low inter-rater reliability - inconsistent scoring") + + except Exception as e: + logger.error(f"Inter-rater reliability analysis failed: {e}") + comprehensive_results['validation_components']['inter_rater_reliability'] = {'error': str(e)} + else: + comprehensive_results['validation_components']['inter_rater_reliability'] = {'skipped': 'No additional judges provided'} + + # Part 3: Correlation Analysis + try: + logger.info("Running correlation analysis...") + correlation_results = run_correlation_analysis(predictions, llm_judge, None, medical_content, audience) + comprehensive_results['validation_components']['correlation_analysis'] = correlation_results + + overall_correlation = correlation_results.get('overall_correlation_strength') + if overall_correlation and overall_correlation >= 0.6: + comprehensive_results['recommendations'].append("āœ“ Good correlation with quality indicators") + elif overall_correlation and overall_correlation >= 0.4: + comprehensive_results['recommendations'].append("~ Moderate correlation with quality indicators") + else: + comprehensive_results['recommendations'].append("⚠ Weak correlation with quality indicators") + + except Exception as e: + logger.error(f"Correlation analysis failed: {e}") + comprehensive_results['validation_components']['correlation_analysis'] = {'error': str(e)} + + # Overall Assessment + success_count = sum(1 for component in comprehensive_results['validation_components'].values() + if isinstance(component, dict) and 'error' not in component and 'skipped' not in component) + + comprehensive_results['overall_assessment'] 
= { + 'validation_score': success_count / 3.0, + 'components_completed': success_count, + 'total_components': 3, + 'overall_recommendation': 'PASS' if success_count >= 2 else 'REVIEW_NEEDED' + } + + # Save results if path provided + if output_path: + try: + output_file = Path(output_path) + output_file.parent.mkdir(parents=True, exist_ok=True) + + with open(output_file, 'w', encoding='utf-8') as f: + json.dump(comprehensive_results, f, indent=2, ensure_ascii=False) + + logger.info(f"Comprehensive validation results saved to: {output_file}") + except Exception as e: + logger.error(f"Failed to save validation results: {e}") + + logger.info("Comprehensive validation complete") + return comprehensive_results \ No newline at end of file diff --git a/scripts/README.md b/scripts/README.md new file mode 100644 index 0000000..e211d08 --- /dev/null +++ b/scripts/README.md @@ -0,0 +1,293 @@ +# MEQ-Bench Scripts + +This directory contains utility scripts for MEQ-Bench development, maintenance, and release management. + +## Available Scripts + +### šŸš€ Release Management + +#### `prepare_release.py` +Comprehensive release preparation script that automates the release process. + +**Features:** +- Version validation and updating across all files +- Automated test execution +- Code quality checks (linting, type checking) +- CHANGELOG.md updates +- Package building +- Release notes generation + +**Usage:** +```bash +# Dry run to see what would be changed +python scripts/prepare_release.py --version 1.1.0 --type minor --dry-run + +# Prepare a minor release +python scripts/prepare_release.py --version 1.1.0 --type minor + +# Prepare a patch release (skipping tests for speed) +python scripts/prepare_release.py --version 1.0.1 --type patch --skip-tests + +# Verbose output +python scripts/prepare_release.py --version 1.1.0 --type minor --verbose +``` + +**What it does:** +1. āœ… Validates version format and increment +2. āœ… Runs comprehensive test suite +3. 
āœ… Performs code quality checks
+4. āœ… Updates version numbers in:
+ - `setup.py`
+ - `src/__init__.py`
+ - `docs/conf.py`
+5. āœ… Updates `CHANGELOG.md`
+6. āœ… Builds distribution packages
+7. āœ… Generates release notes
+8. āœ… Provides next steps guidance
+
+#### `validate_release.py`
+Validates that a release is ready for publication.
+
+**Features:**
+- Module import validation
+- Basic functionality testing
+- Configuration system checks
+- Version consistency validation
+- Example script validation
+- Package installability testing
+
+**Usage:**
+```bash
+# Basic validation
+python scripts/validate_release.py
+
+# Validate specific package
+python scripts/validate_release.py --package-path dist/meq_bench-1.1.0-py3-none-any.whl
+
+# Verbose output
+python scripts/validate_release.py --verbose
+```
+
+**Validation checks:**
+1. āœ… All core modules can be imported
+2. āœ… Basic functionality works (MEQBench, MEQBenchEvaluator)
+3. āœ… Configuration system loads correctly
+4. āœ… Version numbers are consistent across files
+5. āœ… Example scripts have valid syntax
+6. āœ… Package can be installed in clean environment
+
+### šŸ“Š Data Processing
+
+#### `process_datasets.py`
+Processes and validates external medical datasets for use with MEQ-Bench.
+
+**Usage:**
+```bash
+# Process a dataset
+python scripts/process_datasets.py --input data/raw_dataset.json --output data/processed_dataset.json
+
+# Validate dataset format
+python scripts/process_datasets.py --validate data/dataset.json
+```
+
+## Release Workflow
+
+The complete release workflow using these scripts:
+
+### 1. Prepare Release
+```bash
+# Run preparation with dry-run first
+python scripts/prepare_release.py --version 1.1.0 --type minor --dry-run
+
+# If everything looks good, run actual preparation
+python scripts/prepare_release.py --version 1.1.0 --type minor
+```
+
+### 2. 
Validate Release +```bash +# Validate the prepared release +python scripts/validate_release.py + +# Test specific package if built +python scripts/validate_release.py --package-path dist/meq_bench-1.1.0-py3-none-any.whl +``` + +### 3. Complete Release +Follow the instructions provided by `prepare_release.py`: +```bash +# Commit changes +git add setup.py src/__init__.py docs/conf.py CHANGELOG.md +git commit -m "Prepare release v1.1.0" + +# Create and push tag +git tag v1.1.0 +git push origin v1.1.0 +git push origin main + +# Create GitHub release using generated release notes +``` + +## Script Dependencies + +### Common Requirements +- Python 3.8+ +- All MEQ-Bench dependencies installed + +### Additional Tools for Release Scripts +```bash +# For comprehensive release preparation +pip install pytest flake8 mypy bandit twine + +# For package building +pip install wheel setuptools +``` + +## Development Guidelines + +### Adding New Scripts + +When adding new scripts to this directory: + +1. **Make scripts executable:** + ```bash + chmod +x scripts/new_script.py + ``` + +2. **Include shebang:** + ```python + #!/usr/bin/env python3 + ``` + +3. **Add comprehensive docstring:** + ```python + """ + Brief description of the script. + + Detailed description with usage examples. + """ + ``` + +4. **Use argparse for CLI:** + ```python + import argparse + + def main(): + parser = argparse.ArgumentParser(description="Script description") + # Add arguments + args = parser.parse_args() + ``` + +5. **Include logging:** + ```python + import logging + + logging.basicConfig(level=logging.INFO) + logger = logging.getLogger(__name__) + ``` + +6. **Update this README** with script documentation + +### Script Structure Template + +```python +#!/usr/bin/env python3 +""" +Script purpose and description. + +Usage examples and documentation. 
+""" + +import argparse +import logging +import sys +from pathlib import Path + +# Set up logging +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +class ScriptClass: + """Main script functionality""" + + def __init__(self, project_root: Path): + self.project_root = project_root + + def main_functionality(self) -> bool: + """Main script logic""" + return True + + +def main(): + """Main function""" + parser = argparse.ArgumentParser(description="Script description") + parser.add_argument("--option", help="Option description") + args = parser.parse_args() + + # Find project root + script_dir = Path(__file__).parent + project_root = script_dir.parent + + # Run script + script = ScriptClass(project_root) + success = script.main_functionality() + + sys.exit(0 if success else 1) + + +if __name__ == "__main__": + main() +``` + +## Troubleshooting + +### Common Issues + +#### Permission Errors +```bash +# Make scripts executable +chmod +x scripts/*.py +``` + +#### Import Errors +```bash +# Ensure you're in the project root +cd /path/to/MEQ-Bench +python scripts/script_name.py +``` + +#### Missing Dependencies +```bash +# Install development dependencies +pip install -r requirements.txt +pip install pytest flake8 mypy bandit twine wheel +``` + +#### Test Failures During Release +```bash +# Run tests manually to see detailed output +pytest tests/ -v + +# Run specific test +pytest tests/test_specific.py -v +``` + +#### Version Update Issues +```bash +# Check current versions +grep -r "version.*=" setup.py src/__init__.py docs/conf.py + +# Manually update if needed +``` + +### Getting Help + +- Check script help: `python scripts/script_name.py --help` +- Review logs for detailed error information +- Ensure all dependencies are installed +- Verify you're running from the project root directory + +For more information, see: +- [RELEASE_PROCESS.md](../RELEASE_PROCESS.md) - Complete release documentation +- [CONTRIBUTING.md](../CONTRIBUTING.md) - 
Development guidelines +- [README.md](../README.md) - Project overview \ No newline at end of file diff --git a/scripts/prepare_release.py b/scripts/prepare_release.py new file mode 100755 index 0000000..0225851 --- /dev/null +++ b/scripts/prepare_release.py @@ -0,0 +1,529 @@ +#!/usr/bin/env python3 +""" +Release preparation script for MEQ-Bench. + +This script automates the release preparation process including: +- Version validation and updating +- Documentation generation +- Test execution +- Changelog validation +- Package building +- Pre-release checks + +Usage: + python scripts/prepare_release.py --version 1.1.0 --type minor + python scripts/prepare_release.py --version 1.0.1 --type patch --dry-run +""" + +import argparse +import json +import logging +import os +import re +import subprocess +import sys +from datetime import datetime +from pathlib import Path +from typing import Dict, List, Tuple, Optional + +# Set up logging +logging.basicConfig( + level=logging.INFO, + format='%(asctime)s - %(levelname)s - %(message)s' +) +logger = logging.getLogger(__name__) + + +class ReleasePreparation: + """Handles the complete release preparation process""" + + def __init__(self, project_root: Path): + self.project_root = project_root + self.version_files = [ + "setup.py", + "src/__init__.py", + "docs/conf.py" + ] + self.current_version = None + self.new_version = None + + def get_current_version(self) -> str: + """Extract current version from setup.py""" + setup_py = self.project_root / "setup.py" + + if not setup_py.exists(): + raise FileNotFoundError("setup.py not found") + + with open(setup_py, 'r', encoding='utf-8') as f: + content = f.read() + + # Look for version in setup.py + version_match = re.search(r"version\s*=\s*['\"]([^'\"]+)['\"]", content) + if not version_match: + raise ValueError("Could not find version in setup.py") + + return version_match.group(1) + + def validate_version_format(self, version: str) -> bool: + """Validate semantic version format""" + 
pattern = r"^\d+\.\d+\.\d+(?:-[a-zA-Z0-9]+)?$" + return bool(re.match(pattern, version)) + + def compare_versions(self, version1: str, version2: str) -> int: + """Compare two semantic versions. Returns 1 if v1 > v2, -1 if v1 < v2, 0 if equal""" + def parse_version(v): + parts = v.split('-')[0].split('.') # Remove pre-release suffix + return tuple(int(x) for x in parts) + + v1_parts = parse_version(version1) + v2_parts = parse_version(version2) + + if v1_parts > v2_parts: + return 1 + elif v1_parts < v2_parts: + return -1 + else: + return 0 + + def update_version_files(self, new_version: str, dry_run: bool = False) -> List[str]: + """Update version in all relevant files""" + updated_files = [] + + for file_path in self.version_files: + full_path = self.project_root / file_path + + if not full_path.exists(): + logger.warning(f"Version file not found: {file_path}") + continue + + with open(full_path, 'r', encoding='utf-8') as f: + content = f.read() + + # Update version based on file type + if file_path == "setup.py": + new_content = re.sub( + r"version\s*=\s*['\"][^'\"]+['\"]", + f'version="{new_version}"', + content + ) + elif file_path == "src/__init__.py": + new_content = re.sub( + r"__version__\s*=\s*['\"][^'\"]+['\"]", + f'__version__ = "{new_version}"', + content + ) + # Add __version__ if it doesn't exist + if "__version__" not in content: + new_content = f'__version__ = "{new_version}"\n' + content + elif file_path == "docs/conf.py": + new_content = re.sub( + r"version\s*=\s*['\"][^'\"]+['\"]", + f'version = "{new_version}"', + content + ) + new_content = re.sub( + r"release\s*=\s*['\"][^'\"]+['\"]", + f'release = "{new_version}"', + new_content + ) + else: + continue + + if new_content != content: + if not dry_run: + with open(full_path, 'w', encoding='utf-8') as f: + f.write(new_content) + updated_files.append(file_path) + logger.info(f"{'Would update' if dry_run else 'Updated'} version in {file_path}") + + return updated_files + + def 
update_changelog(self, new_version: str, release_type: str, dry_run: bool = False) -> bool: + """Update CHANGELOG.md with release information""" + changelog_path = self.project_root / "CHANGELOG.md" + + if not changelog_path.exists(): + logger.error("CHANGELOG.md not found") + return False + + with open(changelog_path, 'r', encoding='utf-8') as f: + content = f.read() + + # Check if there's an [Unreleased] section with content + unreleased_match = re.search(r'## \[Unreleased\](.*?)(?=## \[|$)', content, re.DOTALL) + if not unreleased_match: + logger.error("No [Unreleased] section found in CHANGELOG.md") + return False + + unreleased_content = unreleased_match.group(1).strip() + if not unreleased_content or len(unreleased_content.split('\n')) < 3: + logger.error("No meaningful content in [Unreleased] section") + return False + + # Create new release section + release_date = datetime.now().strftime('%Y-%m-%d') + release_section = f"## [{new_version}] - {release_date}" + + # Replace [Unreleased] with new release and add empty [Unreleased] + new_unreleased = "## [Unreleased]\n\n### Added\n- \n\n### Changed\n- \n\n### Fixed\n- \n\n" + new_content = content.replace( + "## [Unreleased]", + new_unreleased + release_section + ) + + if not dry_run: + with open(changelog_path, 'w', encoding='utf-8') as f: + f.write(new_content) + + logger.info(f"{'Would update' if dry_run else 'Updated'} CHANGELOG.md") + return True + + def run_tests(self) -> bool: + """Run the test suite""" + logger.info("Running test suite...") + + try: + result = subprocess.run( + [sys.executable, "-m", "pytest", "tests/", "-v"], + cwd=self.project_root, + capture_output=True, + text=True, + timeout=300 # 5 minutes timeout + ) + + if result.returncode == 0: + logger.info("āœ… All tests passed") + return True + else: + logger.error("āŒ Tests failed:") + logger.error(result.stdout) + logger.error(result.stderr) + return False + + except subprocess.TimeoutExpired: + logger.error("āŒ Tests timed out") + 
return False + except Exception as e: + logger.error(f"āŒ Error running tests: {e}") + return False + + def run_linting(self) -> bool: + """Run linting checks""" + logger.info("Running linting checks...") + + lint_commands = [ + ["python", "-m", "flake8", "src/", "tests/", "--max-line-length=120"], + ["python", "-m", "mypy", "src/"], + ["python", "-m", "bandit", "-r", "src/", "-f", "json"] + ] + + all_passed = True + + for cmd in lint_commands: + try: + result = subprocess.run( + cmd, + cwd=self.project_root, + capture_output=True, + text=True, + timeout=120 + ) + + if result.returncode == 0: + logger.info(f"āœ… {cmd[1]} passed") + else: + logger.warning(f"āš ļø {cmd[1]} issues found:") + logger.warning(result.stdout) + # Don't fail release for linting issues, just warn + + except Exception as e: + logger.warning(f"āš ļø Could not run {cmd[1]}: {e}") + + return True # Don't block release on linting issues + + def build_package(self, dry_run: bool = False) -> bool: + """Build the package""" + logger.info("Building package...") + + if dry_run: + logger.info("Would build package (dry run)") + return True + + try: + # Clean previous builds + dist_dir = self.project_root / "dist" + if dist_dir.exists(): + import shutil + shutil.rmtree(dist_dir) + + # Build package + result = subprocess.run( + [sys.executable, "setup.py", "sdist", "bdist_wheel"], + cwd=self.project_root, + capture_output=True, + text=True + ) + + if result.returncode == 0: + logger.info("āœ… Package built successfully") + return True + else: + logger.error("āŒ Package build failed:") + logger.error(result.stdout) + logger.error(result.stderr) + return False + + except Exception as e: + logger.error(f"āŒ Error building package: {e}") + return False + + def validate_package(self) -> bool: + """Validate the built package""" + logger.info("Validating package...") + + try: + # Check if package can be imported + result = subprocess.run( + [sys.executable, "-c", "import src; print('Package import 
successful')"], + cwd=self.project_root, + capture_output=True, + text=True + ) + + if result.returncode == 0: + logger.info("āœ… Package validation successful") + return True + else: + logger.error("āŒ Package validation failed") + return False + + except Exception as e: + logger.error(f"āŒ Error validating package: {e}") + return False + + def generate_release_notes(self, new_version: str) -> str: + """Generate release notes from changelog""" + changelog_path = self.project_root / "CHANGELOG.md" + + if not changelog_path.exists(): + return f"Release {new_version}\n\nPlease see CHANGELOG.md for details." + + with open(changelog_path, 'r', encoding='utf-8') as f: + content = f.read() + + # Extract the section for this version + version_pattern = rf"## \[{re.escape(new_version)}\].*?\n(.*?)(?=## \[|$)" + match = re.search(version_pattern, content, re.DOTALL) + + if match: + return f"# MEQ-Bench {new_version}\n\n{match.group(1).strip()}" + else: + return f"# MEQ-Bench {new_version}\n\nRelease notes not found in CHANGELOG.md" + + def perform_release_checks(self) -> Dict[str, bool]: + """Perform all pre-release checks""" + checks = {} + + logger.info("šŸ” Performing pre-release checks...") + + # Check git status + try: + result = subprocess.run( + ["git", "status", "--porcelain"], + cwd=self.project_root, + capture_output=True, + text=True + ) + checks["clean_git"] = len(result.stdout.strip()) == 0 + if not checks["clean_git"]: + logger.warning("āš ļø Git working directory is not clean") + except: + checks["clean_git"] = False + + # Check if we're on main/master branch + try: + result = subprocess.run( + ["git", "rev-parse", "--abbrev-ref", "HEAD"], + cwd=self.project_root, + capture_output=True, + text=True + ) + current_branch = result.stdout.strip() + checks["main_branch"] = current_branch in ["main", "master"] + if not checks["main_branch"]: + logger.warning(f"āš ļø Not on main/master branch (current: {current_branch})") + except: + checks["main_branch"] = 
False + + # Run tests + checks["tests_pass"] = self.run_tests() + + # Run linting + checks["linting_pass"] = self.run_linting() + + # Validate package + checks["package_valid"] = self.validate_package() + + return checks + + def prepare_release( + self, + new_version: str, + release_type: str, + dry_run: bool = False, + skip_tests: bool = False + ) -> bool: + """Main release preparation workflow""" + logger.info(f"šŸš€ Preparing release {new_version} ({release_type})") + + # Get current version + try: + self.current_version = self.get_current_version() + logger.info(f"Current version: {self.current_version}") + except Exception as e: + logger.error(f"āŒ Failed to get current version: {e}") + return False + + # Validate new version + if not self.validate_version_format(new_version): + logger.error(f"āŒ Invalid version format: {new_version}") + return False + + # Check version increment + if self.compare_versions(new_version, self.current_version) <= 0: + logger.error(f"āŒ New version {new_version} must be greater than current {self.current_version}") + return False + + self.new_version = new_version + + # Perform pre-release checks + if not skip_tests: + checks = self.perform_release_checks() + failed_checks = [check for check, passed in checks.items() if not passed] + + if failed_checks: + logger.warning(f"āš ļø Some checks failed: {', '.join(failed_checks)}") + if not dry_run: + response = input("Continue anyway? 
(y/N): ") + if response.lower() != 'y': + logger.info("Release preparation cancelled") + return False + + # Update version files + updated_files = self.update_version_files(new_version, dry_run) + if not updated_files: + logger.error("āŒ No version files were updated") + return False + + # Update changelog + if not self.update_changelog(new_version, release_type, dry_run): + logger.error("āŒ Failed to update changelog") + return False + + # Build package + if not self.build_package(dry_run): + logger.error("āŒ Failed to build package") + return False + + # Generate release notes + release_notes = self.generate_release_notes(new_version) + + if not dry_run: + notes_file = self.project_root / f"release_notes_{new_version}.md" + with open(notes_file, 'w', encoding='utf-8') as f: + f.write(release_notes) + logger.info(f"šŸ“ Release notes written to {notes_file}") + + logger.info("āœ… Release preparation completed successfully!") + + if not dry_run: + logger.info("\nšŸ“‹ Next steps:") + logger.info("1. Review the changes made") + logger.info("2. Commit the version updates:") + logger.info(f" git add {' '.join(updated_files)} CHANGELOG.md") + logger.info(f" git commit -m 'Prepare release {new_version}'") + logger.info(f"3. Create and push a tag:") + logger.info(f" git tag v{new_version}") + logger.info(f" git push origin v{new_version}") + logger.info("4. Push the changes:") + logger.info(" git push origin main") + logger.info("5. Create a GitHub release using the generated release notes") + logger.info("6. 
Upload the built package to PyPI if desired") + + return True + + +def main(): + """Main function""" + parser = argparse.ArgumentParser( + description="Prepare MEQ-Bench for release", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + # Prepare a minor release + python scripts/prepare_release.py --version 1.1.0 --type minor + + # Prepare a patch release with dry run + python scripts/prepare_release.py --version 1.0.1 --type patch --dry-run + + # Skip tests (for faster preparation) + python scripts/prepare_release.py --version 1.0.1 --type patch --skip-tests + """ + ) + + parser.add_argument( + "--version", + required=True, + help="New version number (e.g., 1.1.0)" + ) + + parser.add_argument( + "--type", + choices=["major", "minor", "patch"], + required=True, + help="Type of release" + ) + + parser.add_argument( + "--dry-run", + action="store_true", + help="Show what would be done without making changes" + ) + + parser.add_argument( + "--skip-tests", + action="store_true", + help="Skip running tests (faster but less safe)" + ) + + parser.add_argument( + "--verbose", + action="store_true", + help="Enable verbose output" + ) + + args = parser.parse_args() + + if args.verbose: + logging.getLogger().setLevel(logging.DEBUG) + + # Find project root + script_dir = Path(__file__).parent + project_root = script_dir.parent + + # Initialize release preparation + release_prep = ReleasePreparation(project_root) + + # Perform release preparation + success = release_prep.prepare_release( + new_version=args.version, + release_type=args.type, + dry_run=args.dry_run, + skip_tests=args.skip_tests + ) + + sys.exit(0 if success else 1) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/scripts/validate_release.py b/scripts/validate_release.py new file mode 100755 index 0000000..f69af58 --- /dev/null +++ b/scripts/validate_release.py @@ -0,0 +1,394 @@ +#!/usr/bin/env python3 +""" +Release validation script for MEQ-Bench. 
+ +This script validates that a release is ready by checking: +- Package can be imported +- Basic functionality works +- All expected modules are present +- Documentation is accessible + +Usage: + python scripts/validate_release.py + python scripts/validate_release.py --package-path dist/meq_bench-1.1.0-py3-none-any.whl +""" + +import argparse +import importlib +import logging +import subprocess +import sys +import tempfile +from pathlib import Path +from typing import List, Dict, Any + +# Set up logging +logging.basicConfig( + level=logging.INFO, + format='%(asctime)s - %(levelname)s - %(message)s' +) +logger = logging.getLogger(__name__) + + +class ReleaseValidator: + """Validates MEQ-Bench releases""" + + def __init__(self, project_root: Path): + self.project_root = project_root + self.validation_results = {} + + def validate_imports(self) -> bool: + """Test that all main modules can be imported""" + logger.info("šŸ” Validating module imports...") + + required_modules = [ + 'src', + 'src.benchmark', + 'src.evaluator', + 'src.data_loaders', + 'src.leaderboard', + 'src.strategies', + 'src.config' + ] + + failed_imports = [] + + for module in required_modules: + try: + importlib.import_module(module) + logger.info(f"āœ… {module}") + except ImportError as e: + logger.error(f"āŒ {module}: {e}") + failed_imports.append(module) + + success = len(failed_imports) == 0 + self.validation_results['imports'] = { + 'success': success, + 'failed_modules': failed_imports + } + + return success + + def validate_basic_functionality(self) -> bool: + """Test basic functionality works""" + logger.info("šŸ” Validating basic functionality...") + + try: + # Test benchmark creation + from src.benchmark import MEQBench + bench = MEQBench() + logger.info("āœ… MEQBench initialization") + + # Test sample dataset creation + sample_items = bench.create_sample_dataset() + assert len(sample_items) > 0 + logger.info("āœ… Sample dataset creation") + + # Test evaluator + from src.evaluator 
import MEQBenchEvaluator + evaluator = MEQBenchEvaluator() + logger.info("āœ… MEQBenchEvaluator initialization") + + # Test data loader + from src.data_loaders import load_medquad + logger.info("āœ… Data loader imports") + + # Test leaderboard + from src.leaderboard import LeaderboardGenerator + leaderboard = LeaderboardGenerator() + logger.info("āœ… LeaderboardGenerator initialization") + + success = True + + except Exception as e: + logger.error(f"āŒ Basic functionality test failed: {e}") + success = False + + self.validation_results['functionality'] = {'success': success} + return success + + def validate_configuration(self) -> bool: + """Validate configuration system works""" + logger.info("šŸ” Validating configuration...") + + try: + from src.config import config + + # Test config loading + config_data = config.get_config() + assert isinstance(config_data, dict) + logger.info("āœ… Configuration loading") + + # Test required config sections + required_sections = ['audiences', 'complexity_levels', 'evaluation'] + for section in required_sections: + assert section in config_data + logger.info("āœ… Required configuration sections present") + + success = True + + except Exception as e: + logger.error(f"āŒ Configuration validation failed: {e}") + success = False + + self.validation_results['configuration'] = {'success': success} + return success + + def validate_version_consistency(self) -> bool: + """Check version consistency across files""" + logger.info("šŸ” Validating version consistency...") + + versions = {} + + # Check setup.py + setup_py = self.project_root / "setup.py" + if setup_py.exists(): + with open(setup_py, 'r') as f: + content = f.read() + import re + match = re.search(r"version\s*=\s*['\"]([^'\"]+)['\"]", content) + if match: + versions['setup.py'] = match.group(1) + + # Check src/__init__.py + init_py = self.project_root / "src" / "__init__.py" + if init_py.exists(): + try: + from src import __version__ + versions['src/__init__.py'] = 
__version__ + except ImportError: + pass + + # Check docs/conf.py + conf_py = self.project_root / "docs" / "conf.py" + if conf_py.exists(): + with open(conf_py, 'r') as f: + content = f.read() + import re + match = re.search(r"version\s*=\s*['\"]([^'\"]+)['\"]", content) + if match: + versions['docs/conf.py'] = match.group(1) + + # Check consistency + unique_versions = set(versions.values()) + success = len(unique_versions) <= 1 + + if success: + logger.info(f"āœ… Version consistency: {list(unique_versions)[0] if unique_versions else 'No version found'}") + else: + logger.error(f"āŒ Version inconsistency: {versions}") + + self.validation_results['version_consistency'] = { + 'success': success, + 'versions': versions + } + + return success + + def validate_examples(self) -> bool: + """Test that examples can run without errors""" + logger.info("šŸ” Validating examples...") + + examples_dir = self.project_root / "examples" + if not examples_dir.exists(): + logger.warning("āš ļø Examples directory not found") + return True + + # Test basic usage example (syntax check only) + basic_usage = examples_dir / "basic_usage.py" + if basic_usage.exists(): + try: + result = subprocess.run( + [sys.executable, "-m", "py_compile", str(basic_usage)], + capture_output=True, + text=True, + timeout=30 + ) + + if result.returncode == 0: + logger.info("āœ… basic_usage.py syntax check") + success = True + else: + logger.error(f"āŒ basic_usage.py syntax error: {result.stderr}") + success = False + + except Exception as e: + logger.error(f"āŒ Error checking basic_usage.py: {e}") + success = False + else: + logger.warning("āš ļø basic_usage.py not found") + success = True + + self.validation_results['examples'] = {'success': success} + return success + + def validate_package_installability(self, package_path: str = None) -> bool: + """Test package can be installed and imported in fresh environment""" + logger.info("šŸ” Validating package installability...") + + if not package_path: + 
# Look for built packages + dist_dir = self.project_root / "dist" + if dist_dir.exists(): + wheels = list(dist_dir.glob("*.whl")) + if wheels: + package_path = str(wheels[0]) + + if not package_path: + logger.warning("āš ļø No package path provided and no wheel found") + return True + + try: + with tempfile.TemporaryDirectory() as temp_dir: + # Create virtual environment + venv_dir = Path(temp_dir) / "test_venv" + result = subprocess.run( + [sys.executable, "-m", "venv", str(venv_dir)], + capture_output=True, + text=True, + timeout=60 + ) + + if result.returncode != 0: + logger.error(f"āŒ Failed to create virtual environment: {result.stderr}") + return False + + # Determine pip path + if sys.platform == "win32": + pip_path = venv_dir / "Scripts" / "pip.exe" + python_path = venv_dir / "Scripts" / "python.exe" + else: + pip_path = venv_dir / "bin" / "pip" + python_path = venv_dir / "bin" / "python" + + # Install package + result = subprocess.run( + [str(pip_path), "install", package_path], + capture_output=True, + text=True, + timeout=120 + ) + + if result.returncode != 0: + logger.error(f"āŒ Failed to install package: {result.stderr}") + return False + + # Test import + result = subprocess.run( + [str(python_path), "-c", "import src; print('Import successful')"], + capture_output=True, + text=True, + timeout=30 + ) + + if result.returncode == 0: + logger.info("āœ… Package installation and import") + success = True + else: + logger.error(f"āŒ Package import failed: {result.stderr}") + success = False + + except Exception as e: + logger.error(f"āŒ Package installability test failed: {e}") + success = False + + self.validation_results['installability'] = {'success': success} + return success + + def generate_validation_report(self) -> Dict[str, Any]: + """Generate comprehensive validation report""" + total_checks = len(self.validation_results) + passed_checks = sum(1 for result in self.validation_results.values() if result['success']) + + report = { + 
'overall_success': passed_checks == total_checks, + 'passed_checks': passed_checks, + 'total_checks': total_checks, + 'success_rate': passed_checks / total_checks if total_checks > 0 else 0, + 'detailed_results': self.validation_results, + 'summary': f"{passed_checks}/{total_checks} checks passed" + } + + return report + + def run_all_validations(self, package_path: str = None) -> bool: + """Run all validation checks""" + logger.info("šŸš€ Starting MEQ-Bench release validation...") + + validations = [ + self.validate_imports, + self.validate_basic_functionality, + self.validate_configuration, + self.validate_version_consistency, + self.validate_examples, + lambda: self.validate_package_installability(package_path) + ] + + all_passed = True + + for validation in validations: + try: + result = validation() + if not result: + all_passed = False + except Exception as e: + logger.error(f"āŒ Validation error: {e}") + all_passed = False + + # Generate report + report = self.generate_validation_report() + + logger.info("\n" + "="*60) + logger.info("šŸ“Š VALIDATION SUMMARY") + logger.info("="*60) + logger.info(f"Overall Status: {'āœ… PASS' if report['overall_success'] else 'āŒ FAIL'}") + logger.info(f"Checks Passed: {report['summary']}") + logger.info(f"Success Rate: {report['success_rate']:.1%}") + + if not report['overall_success']: + logger.info("\nāŒ Failed Checks:") + for check, result in report['detailed_results'].items(): + if not result['success']: + logger.info(f" - {check}") + + logger.info("="*60) + + return all_passed + + +def main(): + """Main function""" + parser = argparse.ArgumentParser( + description="Validate MEQ-Bench release", + formatter_class=argparse.RawDescriptionHelpFormatter + ) + + parser.add_argument( + "--package-path", + help="Path to built package (.whl file) to test installation" + ) + + parser.add_argument( + "--verbose", + action="store_true", + help="Enable verbose output" + ) + + args = parser.parse_args() + + if args.verbose: + 
logging.getLogger().setLevel(logging.DEBUG) + + # Find project root + script_dir = Path(__file__).parent + project_root = script_dir.parent + + # Run validation + validator = ReleaseValidator(project_root) + success = validator.run_all_validations(args.package_path) + + sys.exit(0 if success else 1) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/src/data_loaders.py b/src/data_loaders.py index 4427b64..6c81c89 100644 --- a/src/data_loaders.py +++ b/src/data_loaders.py @@ -289,22 +289,73 @@ def load_custom_dataset( field_mapping: Optional[Dict[str, str]] = None, max_items: Optional[int] = None, complexity_level: str = "basic", + auto_complexity: bool = False, + nested_field_separator: str = ".", + array_index_format: bool = True, ) -> List[MEQBenchItem]: - """Load custom dataset and convert to MEQBenchItem objects. + """Load custom dataset and convert to MEQBenchItem objects with enhanced field mapping. + + This function provides robust field mapping capabilities including nested field access, + array indexing, multiple field combinations, and automatic content generation. Args: data_path: Path to the JSON file containing the dataset. field_mapping: Dictionary mapping dataset fields to MEQBenchItem fields. - Example: {'q': 'question', 'a': 'answer', 'topic': 'medical_content'} + Supports nested fields (e.g., 'data.question'), arrays (e.g., 'items[0]'), + and multiple source fields (e.g., ['title', 'description']). + Example mappings: + - {'q': 'question', 'a': 'answer'} - Simple field mapping + - {'content.text': 'medical_content'} - Nested field + - {'responses[0].text': 'answer'} - Array with index + - {('title', 'summary'): 'medical_content'} - Multiple fields combined max_items: Maximum number of items to load. - complexity_level: Complexity level to assign to all items. + complexity_level: Default complexity level to assign to all items. 
+ auto_complexity: Whether to automatically calculate complexity levels using text analysis. + nested_field_separator: Separator for nested field access (default: '.'). + array_index_format: Whether to support array index format like 'field[0]' (default: True). Returns: List of MEQBenchItem objects. + + Raises: + FileNotFoundError: If the data file doesn't exist. + ValueError: If the data format is invalid or required fields are missing. + json.JSONDecodeError: If the JSON file is malformed. + + Example: + ```python + # Simple mapping + items = load_custom_dataset('data.json', {'q': 'question', 'a': 'answer'}) + + # Nested field mapping + items = load_custom_dataset('data.json', { + 'content.question': 'question', + 'responses[0].text': 'answer', + 'metadata.complexity': 'complexity_level' + }) + + # Multiple field combination + items = load_custom_dataset('data.json', { + ('title', 'description', 'summary'): 'medical_content' + }) + ``` """ - # Default field mapping + # Enhanced default field mapping with common variations if field_mapping is None: - field_mapping = {"question": "question", "answer": "answer", "content": "medical_content", "id": "id"} + field_mapping = { + "question": "question", + "answer": "answer", + "content": "medical_content", + "id": "id", + "text": "medical_content", + "description": "medical_content", + "summary": "medical_content", + "complexity": "complexity_level", + "difficulty": "complexity_level", + "level": "complexity_level", + "source": "source_dataset", + "dataset": "source_dataset" + } data_file = Path(data_path) if not data_file.exists(): @@ -323,26 +374,44 @@ def load_custom_dataset( for i, item_data in enumerate(items_to_process): try: - # Extract fields based on mapping - question = item_data.get(field_mapping.get("question", "question"), "") - answer = item_data.get(field_mapping.get("answer", "answer"), "") - content = item_data.get(field_mapping.get("content", "content"), "") - item_id = 
item_data.get(field_mapping.get("id", "id"), f"custom_{i}") - - # Create medical content - if content: - medical_content = content - elif question and answer: - medical_content = f"Question: {question.strip()}\\n\\nAnswer: {answer.strip()}" - else: + # Extract fields using enhanced mapping + extracted_fields = _extract_fields_with_mapping(item_data, field_mapping, nested_field_separator) + + # Get basic fields with fallbacks + question = extracted_fields.get("question", "") + answer = extracted_fields.get("answer", "") + content = extracted_fields.get("medical_content", "") + item_id = extracted_fields.get("id", f"custom_{i}") + item_complexity = extracted_fields.get("complexity_level", complexity_level) + source_dataset = extracted_fields.get("source_dataset", "Custom") + + # Create medical content with enhanced logic + medical_content = _create_medical_content( + content=content, + question=question, + answer=answer, + item_data=item_data, + field_mapping=field_mapping, + nested_separator=nested_field_separator + ) + + if not medical_content: logger.warning(f"Skipping item {i}: no valid content found") continue + # Auto-calculate complexity if requested + if auto_complexity: + try: + item_complexity = calculate_complexity_level(medical_content) + except Exception as e: + logger.warning(f"Error calculating complexity for item {i}: {e}, using default") + item_complexity = complexity_level + item = MEQBenchItem( id=str(item_id), medical_content=medical_content, - complexity_level=complexity_level, - source_dataset="Custom", + complexity_level=item_complexity, + source_dataset=source_dataset, reference_explanations=None, ) @@ -913,6 +982,176 @@ def load_cochrane_reviews( return items +def _extract_fields_with_mapping( + item_data: Dict[str, Any], + field_mapping: Dict[str, str], + nested_separator: str = "." +) -> Dict[str, Any]: + """Extract fields from item data using enhanced field mapping. 
+ + Supports nested field access, array indexing, and multiple source fields. + + Args: + item_data: Source data dictionary. + field_mapping: Field mapping dictionary. + nested_separator: Separator for nested field access. + + Returns: + Dictionary with extracted and mapped fields. + """ + extracted = {} + + for source_field, target_field in field_mapping.items(): + try: + value = None + + # Handle multiple source fields (tuple/list) + if isinstance(source_field, (tuple, list)): + # Combine multiple fields + combined_values = [] + for field in source_field: + field_value = _get_nested_field(item_data, field, nested_separator) + if field_value: + combined_values.append(str(field_value).strip()) + + if combined_values: + value = " ".join(combined_values) + else: + # Single field extraction + value = _get_nested_field(item_data, source_field, nested_separator) + + if value is not None: + extracted[target_field] = value + + except Exception as e: + logger.debug(f"Error extracting field '{source_field}': {e}") + continue + + return extracted + + +def _get_nested_field(data: Dict[str, Any], field_path: str, separator: str = ".") -> Any: + """Get value from nested field path with array index support. + + Args: + data: Source data dictionary. + field_path: Dot-separated field path (e.g., 'user.profile.name' or 'items[0].title'). + separator: Field separator character. + + Returns: + Field value or None if not found. 
+ """ + try: + current_data = data + + # Split path and handle array indices + parts = field_path.split(separator) + + for part in parts: + if not part: + continue + + # Handle array indexing like 'items[0]' + if '[' in part and part.endswith(']'): + field_name, index_part = part.split('[', 1) + index = int(index_part.rstrip(']')) + + if field_name: + current_data = current_data[field_name] + + if isinstance(current_data, list) and 0 <= index < len(current_data): + current_data = current_data[index] + else: + return None + else: + # Regular field access + if isinstance(current_data, dict) and part in current_data: + current_data = current_data[part] + else: + return None + + return current_data + + except (KeyError, IndexError, ValueError, TypeError): + return None + + +def _create_medical_content( + content: str, + question: str, + answer: str, + item_data: Dict[str, Any], + field_mapping: Dict[str, str], + nested_separator: str = "." +) -> str: + """Create medical content from available fields with intelligent fallbacks. + + Args: + content: Direct content field. + question: Question field. + answer: Answer field. + item_data: Original item data for additional field extraction. + field_mapping: Field mapping for fallback options. + nested_separator: Separator for nested fields. + + Returns: + Constructed medical content string. 
+ """ + # Priority 1: Direct content + if content and content.strip(): + return content.strip() + + # Priority 2: Question + Answer combination + if question and answer: + question = question.strip() + answer = answer.strip() + if question and answer: + return f"Question: {question}\\n\\nAnswer: {answer}" + + # Priority 3: Try to find alternative content fields + content_alternatives = [ + 'text', 'description', 'summary', 'body', 'details', + 'explanation', 'title', 'content', 'message', 'document' + ] + + for alt_field in content_alternatives: + if alt_field not in field_mapping: # Don't double-process mapped fields + alt_value = _get_nested_field(item_data, alt_field, nested_separator) + if alt_value and str(alt_value).strip(): + return str(alt_value).strip() + + # Priority 4: Combine any available text fields + text_fields = [] + + # Check for title/heading + for field in ['title', 'heading', 'subject', 'name']: + value = _get_nested_field(item_data, field, nested_separator) + if value and str(value).strip(): + text_fields.append(f"Title: {str(value).strip()}") + break + + # Add question if available but no answer + if question and question.strip() and not answer: + text_fields.append(f"Question: {question.strip()}") + + # Add answer if available but no question + if answer and answer.strip() and not question: + text_fields.append(f"Answer: {answer.strip()}") + + # Check for description/summary fields + for field in ['description', 'summary', 'abstract', 'overview']: + value = _get_nested_field(item_data, field, nested_separator) + if value and str(value).strip(): + text_fields.append(str(value).strip()) + break + + if text_fields: + return "\\n\\n".join(text_fields) + + # Last resort: return empty string (will be caught by validation) + return "" + + def _validate_benchmark_item(item: MEQBenchItem) -> None: """Validate a MEQBenchItem object for basic requirements. 
diff --git a/src/leaderboard.py b/src/leaderboard.py index 7356bf2..11e4416 100644 --- a/src/leaderboard.py +++ b/src/leaderboard.py @@ -291,6 +291,19 @@ def _generate_html_template(

Overall Model Rankings

+
+
+ +
+
+ +
+
{self._generate_overall_rankings_table(ranked_models)}
@@ -310,13 +323,20 @@ def _generate_html_template(

Analytics & Visualizations

-

Model Performance Comparison

+

šŸ“Š Model Performance Comparison

+

Comparison of overall scores across evaluated models

-

Audience Performance Distribution

+

šŸŽÆ Audience Performance Radar

+

Average performance distribution across target audiences

+
+

šŸ“ˆ Performance Trends

+

Score distribution and statistical analysis

+ +
@@ -324,8 +344,8 @@ def _generate_html_template( @@ -397,6 +417,7 @@ def _get_css_styles(self) -> str: font-size: 1.8rem; font-weight: 700; color: #ffd700; + text-shadow: 0 1px 2px rgba(0,0,0,0.2); } .stat-label { @@ -481,6 +502,8 @@ def _get_css_styles(self) -> str: tr:hover { background: #f8fafc; + transform: translateX(4px); + transition: all 0.2s ease; } .rank { @@ -501,16 +524,19 @@ def _get_css_styles(self) -> str: .rank-1 { background: linear-gradient(135deg, #ffd700, #ffed4e); color: #92400e; + box-shadow: 0 2px 8px rgba(255, 215, 0, 0.3); } .rank-2 { background: linear-gradient(135deg, #c0c0c0, #e5e7eb); color: #374151; + box-shadow: 0 2px 8px rgba(192, 192, 192, 0.3); } .rank-3 { background: linear-gradient(135deg, #cd7f32, #d97706); color: white; + box-shadow: 0 2px 8px rgba(205, 127, 50, 0.3); } .audience-section, .complexity-section { @@ -527,6 +553,46 @@ def _get_css_styles(self) -> str: text-transform: capitalize; } + .controls-bar { + display: flex; + justify-content: space-between; + align-items: center; + margin-bottom: 1.5rem; + gap: 1rem; + flex-wrap: wrap; + } + + .search-container input { + padding: 0.75rem 1rem; + border: 2px solid #e2e8f0; + border-radius: 8px; + font-size: 0.95rem; + width: 300px; + max-width: 100%; + transition: border-color 0.3s ease; + } + + .search-container input:focus { + outline: none; + border-color: #3b82f6; + box-shadow: 0 0 0 3px rgba(59, 130, 246, 0.1); + } + + .filter-container select { + padding: 0.75rem 1rem; + border: 2px solid #e2e8f0; + border-radius: 8px; + font-size: 0.95rem; + background: white; + cursor: pointer; + transition: border-color 0.3s ease; + } + + .filter-container select:focus { + outline: none; + border-color: #3b82f6; + } + .charts-container { display: grid; grid-template-columns: repeat(auto-fit, minmax(500px, 1fr)); @@ -538,11 +604,25 @@ def _get_css_styles(self) -> str: padding: 1.5rem; border-radius: 8px; box-shadow: 0 2px 8px rgba(0,0,0,0.1); + transition: transform 0.2s ease, 
box-shadow 0.2s ease; + } + + .chart-item:hover { + transform: translateY(-2px); + box-shadow: 0 4px 16px rgba(0,0,0,0.15); } .chart-item h3 { - margin-bottom: 1rem; + margin-bottom: 0.5rem; color: #1f2937; + font-size: 1.1rem; + } + + .chart-description { + color: #6b7280; + font-size: 0.9rem; + margin-bottom: 1rem; + font-style: italic; } footer { @@ -634,9 +714,18 @@ def _generate_overall_rankings_table(self, ranked_models: List[Dict[str, Any]]) avg_score = sum(scores) / len(scores) if scores else 0 audience_scores[audience] = avg_score + # Add trophy/medal icons for top 3 + rank_icon = "" + if model["rank"] == 1: + rank_icon = "šŸ†" + elif model["rank"] == 2: + rank_icon = "🄈" + elif model["rank"] == 3: + rank_icon = "šŸ„‰" + table_html += f""" - #{model['rank']} + {rank_icon} #{model['rank']} {model['model_name']} {model['overall_score']:.3f} {model['total_items']} @@ -822,19 +911,151 @@ def _generate_javascript( backgroundColor: 'rgba(16, 185, 129, 0.2)', borderColor: 'rgba(16, 185, 129, 1)', borderWidth: 2, - pointBackgroundColor: 'rgba(16, 185, 129, 1)' + pointBackgroundColor: 'rgba(16, 185, 129, 1)', + pointRadius: 5, + pointHoverRadius: 7 }}] }}, options: {{ responsive: true, + plugins: {{ + legend: {{ + display: false + }} + }}, scales: {{ r: {{ beginAtZero: true, - max: 1 + max: 1, + grid: {{ + color: 'rgba(0, 0, 0, 0.1)' + }}, + pointLabels: {{ + font: {{ + size: 12, + weight: 'bold' + }} + }} }} }} }} }}); + + // Distribution chart + const distributionCtx = document.getElementById('distributionChart').getContext('2d'); + const allScores = {json.dumps(model_scores)}; + const scoreLabels = ['0.0-0.2', '0.2-0.4', '0.4-0.6', '0.6-0.8', '0.8-1.0']; + const scoreDistribution = [0, 0, 0, 0, 0]; + + allScores.forEach(score => {{ + if (score < 0.2) scoreDistribution[0]++; + else if (score < 0.4) scoreDistribution[1]++; + else if (score < 0.6) scoreDistribution[2]++; + else if (score < 0.8) scoreDistribution[3]++; + else scoreDistribution[4]++; + }}); + + new 
Chart(distributionCtx, {{ + type: 'doughnut', + data: {{ + labels: scoreLabels, + datasets: [{{ + data: scoreDistribution, + backgroundColor: [ + 'rgba(239, 68, 68, 0.8)', + 'rgba(245, 158, 11, 0.8)', + 'rgba(59, 130, 246, 0.8)', + 'rgba(16, 185, 129, 0.8)', + 'rgba(34, 197, 94, 0.8)' + ], + borderColor: [ + 'rgba(239, 68, 68, 1)', + 'rgba(245, 158, 11, 1)', + 'rgba(59, 130, 246, 1)', + 'rgba(16, 185, 129, 1)', + 'rgba(34, 197, 94, 1)' + ], + borderWidth: 2 + }}] + }}, + options: {{ + responsive: true, + plugins: {{ + legend: {{ + position: 'bottom', + labels: {{ + padding: 20, + usePointStyle: true + }} + }}, + tooltip: {{ + callbacks: {{ + label: function(context) {{ + const total = context.dataset.data.reduce((a, b) => a + b, 0); + const percentage = ((context.parsed / total) * 100).toFixed(1); + return context.label + ': ' + context.parsed + ' models (' + percentage + '%)'; + }} + }} + }} + }} + }} + }}); + }} + + function filterTable() {{ + const input = document.getElementById('modelSearch'); + const filter = input.value.toLowerCase(); + const table = document.querySelector('#overall-tab table'); + const rows = table.getElementsByTagName('tr'); + + for (let i = 1; i < rows.length; i++) {{ + const modelCell = rows[i].getElementsByTagName('td')[1]; + if (modelCell) {{ + const modelName = modelCell.textContent || modelCell.innerText; + if (modelName.toLowerCase().indexOf(filter) > -1) {{ + rows[i].style.display = ''; + }} else {{ + rows[i].style.display = 'none'; + }} + }} + }} + }} + + function sortTable() {{ + const select = document.getElementById('sortSelect'); + const table = document.querySelector('#overall-tab table'); + const tbody = table.querySelector('tbody'); + const rows = Array.from(tbody.getElementsByTagName('tr')); + const sortBy = select.value; + + rows.sort((a, b) => {{ + let aVal, bVal; + + switch(sortBy) {{ + case 'rank': + // Strip the trophy/medal icon and '#' added by the rank cell; plain + // .replace('#', '') would leave the emoji and make parseInt return NaN. + aVal = parseInt(a.getElementsByTagName('td')[0].textContent.replace(/[^0-9]/g, '')); + bVal = parseInt(b.getElementsByTagName('td')[0].textContent.replace(/[^0-9]/g, '')); + return aVal - bVal; + case 'score': + aVal = parseFloat(a.getElementsByTagName('td')[2].textContent); + bVal = parseFloat(b.getElementsByTagName('td')[2].textContent); + return bVal - aVal; // Descending order for scores + case 'name': + aVal = a.getElementsByTagName('td')[1].textContent.toLowerCase(); + bVal = b.getElementsByTagName('td')[1].textContent.toLowerCase(); + return aVal.localeCompare(bVal); + case 'items': + aVal = parseInt(a.getElementsByTagName('td')[3].textContent); + bVal = parseInt(b.getElementsByTagName('td')[3].textContent); + return bVal - aVal; // Descending order for items + default: + return 0; + }} + }}); + + // Clear tbody and append sorted rows + tbody.innerHTML = ''; + rows.forEach(row => tbody.appendChild(row)); }} // Initialize page