Merge pull request #37 from instructkr/dev-v2

LogicKor V2 Update
instructkr · Jun 28, 2024 · eabb637 · eabb637
2 parents cd32ad8 + a61d06f
commit eabb637
Show file tree

Hide file tree

Showing 397 changed files with 15,119 additions and 1,541 deletions.
diff --git a/.github/workflows/pre-commit.yaml b/.github/workflows/pre-commit.yaml
@@ -0,0 +1,33 @@
+name: pre-commit
+
+# Lint gate: runs the hooks from .pre-commit-config.yaml against all files
+# on every pull request and every push.
+on:
+  pull_request:
+  push:
+
+jobs:
+  pre-commit:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - uses: actions/setup-python@v5
+        with:
+          python-version: '3.8'
+      - name: Get pip cache dir
+        id: pip-cache
+        # Publishes pip's cache directory as `steps.pip-cache.outputs.dir`.
+        run: |
+          echo "dir=$(pip cache dir)" >> "$GITHUB_OUTPUT"
+      - name: pip/pre-commit cache
+        uses: actions/cache@v4
+        with:
+          path: |
+            ${{ steps.pip-cache.outputs.dir }}
+            ~/.cache/pre-commit
+          key: ${{ runner.os }}-pip-pre-commit-${{ hashFiles('**/.pre-commit-config.yaml') }}
+          restore-keys: |
+            ${{ runner.os }}-pip-pre-commit
+      # pre-commit is installed and invoked by hand here: the packaged
+      # `- uses: pre-commit/action@<version>` step must not be used on
+      # self-hosted runners.
+      - name: pre-commit
+        run: |
+          pip install pre-commit
+          pre-commit install --install-hooks
+          pre-commit run -a
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,4 @@
+# macOS Finder metadata and Jupyter autosave directories
+.DS_Store
+.ipynb_checkpoints
+**/.DS_Store
+**/.ipynb_checkpoints
+# Python bytecode caches (files that are already tracked must additionally
+# be removed from the index with `git rm --cached`)
+__pycache__/
+*.py[cod]
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -0,0 +1,33 @@
+# Paths that begin with `exps` are skipped by every hook.
+exclude: ^(exps)
+repos:
+  # Generic file hygiene hooks.
+  - repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v4.6.0
+    hooks:
+      - id: check-yaml
+      - id: end-of-file-fixer
+        types:
+          - file
+          - python
+      - id: trailing-whitespace
+        types:
+          - file
+          - python
+      - id: mixed-line-ending
+      - id: check-added-large-files
+        args:
+          - '--maxkb=4096'
+  # Ruff: the linter runs first, then the formatter.
+  - repo: https://github.com/astral-sh/ruff-pre-commit
+    rev: v0.4.10
+    hooks:
+      - id: ruff
+      - id: ruff-format
+  # Import sorting with the black-compatible profile and a 120-column limit.
+  - repo: https://github.com/pycqa/isort
+    rev: '5.13.2'
+    hooks:
+      - id: isort
+        name: isort (python)
+        args:
+          - '--profile'
+          - 'black'
+          - '-l'
+          - '120'
+          - '--lines-after-imports'
+          - '2'
+  # flake8 on Python files only, with the listed checks switched off.
+  - repo: https://github.com/pycqa/flake8.git
+    rev: '7.1.0'
+    hooks:
+      - id: flake8
+        types:
+          - python
+        args:
+          - '--max-line-length'
+          - '120'
+          - '--ignore'
+          - 'F811,F841,E203,E402,E501,E712,W503,E704,E731'
diff --git a/__pycache__/templates.cpython-310.pyc b/__pycache__/templates.cpython-310.pyc
diff --git a/evaluated/Bllossom/llama-3-Korean-Bllossom-70B/1-shot.jsonl b/evaluated/Bllossom/llama-3-Korean-Bllossom-70B/1-shot.jsonl
diff --git a/evaluated/Bllossom/llama-3-Korean-Bllossom-70B/cot-1-shot.jsonl b/evaluated/Bllossom/llama-3-Korean-Bllossom-70B/cot-1-shot.jsonl
diff --git a/evaluated/Bllossom/llama-3-Korean-Bllossom-70B/default.jsonl b/evaluated/Bllossom/llama-3-Korean-Bllossom-70B/default.jsonl
diff --git a/evaluated/CohereForAI/aya-23-35B/1-shot.jsonl b/evaluated/CohereForAI/aya-23-35B/1-shot.jsonl
diff --git a/evaluated/CohereForAI/aya-23-35B/cot-1-shot.jsonl b/evaluated/CohereForAI/aya-23-35B/cot-1-shot.jsonl
diff --git a/evaluated/CohereForAI/aya-23-35B/default.jsonl b/evaluated/CohereForAI/aya-23-35B/default.jsonl
diff --git a/evaluated/CohereForAI/c4ai-command-r-plus/1-shot.jsonl b/evaluated/CohereForAI/c4ai-command-r-plus/1-shot.jsonl
diff --git a/evaluated/CohereForAI/c4ai-command-r-plus/cot-1-shot.jsonl b/evaluated/CohereForAI/c4ai-command-r-plus/cot-1-shot.jsonl
diff --git a/evaluated/CohereForAI/c4ai-command-r-plus/default.jsonl b/evaluated/CohereForAI/c4ai-command-r-plus/default.jsonl
diff --git a/evaluated/OrionStarAI/Orion-14B-Chat/1-shot.jsonl b/evaluated/OrionStarAI/Orion-14B-Chat/1-shot.jsonl
diff --git a/evaluated/OrionStarAI/Orion-14B-Chat/cot-1-shot.jsonl b/evaluated/OrionStarAI/Orion-14B-Chat/cot-1-shot.jsonl
diff --git a/evaluated/OrionStarAI/Orion-14B-Chat/default.jsonl b/evaluated/OrionStarAI/Orion-14B-Chat/default.jsonl
diff --git a/evaluated/Qwen/Qwen1.5-14B-Chat/1-shot.jsonl b/evaluated/Qwen/Qwen1.5-14B-Chat/1-shot.jsonl
diff --git a/evaluated/Qwen/Qwen1.5-14B-Chat/cot-1-shot.jsonl b/evaluated/Qwen/Qwen1.5-14B-Chat/cot-1-shot.jsonl
diff --git a/evaluated/Qwen/Qwen1.5-14B-Chat/default.jsonl b/evaluated/Qwen/Qwen1.5-14B-Chat/default.jsonl
diff --git a/evaluated/Qwen/Qwen2-57B-A14B-Instruct/1-shot.jsonl b/evaluated/Qwen/Qwen2-57B-A14B-Instruct/1-shot.jsonl
diff --git a/evaluated/Qwen/Qwen2-57B-A14B-Instruct/cot-1-shot.jsonl b/evaluated/Qwen/Qwen2-57B-A14B-Instruct/cot-1-shot.jsonl
diff --git a/evaluated/Qwen/Qwen2-57B-A14B-Instruct/default.jsonl b/evaluated/Qwen/Qwen2-57B-A14B-Instruct/default.jsonl
diff --git a/evaluated/Qwen/Qwen2-72B-Instruct/1-shot.jsonl b/evaluated/Qwen/Qwen2-72B-Instruct/1-shot.jsonl
diff --git a/evaluated/Qwen/Qwen2-72B-Instruct/cot-1-shot.jsonl b/evaluated/Qwen/Qwen2-72B-Instruct/cot-1-shot.jsonl
diff --git a/evaluated/Qwen/Qwen2-72B-Instruct/default.jsonl b/evaluated/Qwen/Qwen2-72B-Instruct/default.jsonl
diff --git a/evaluated/Qwen/Qwen2-7B-Instruct/1-shot.jsonl b/evaluated/Qwen/Qwen2-7B-Instruct/1-shot.jsonl
diff --git a/evaluated/Qwen/Qwen2-7B-Instruct/cot-1-shot.jsonl b/evaluated/Qwen/Qwen2-7B-Instruct/cot-1-shot.jsonl
diff --git a/evaluated/Qwen/Qwen2-7B-Instruct/default.jsonl b/evaluated/Qwen/Qwen2-7B-Instruct/default.jsonl
diff --git a/evaluated/THUDM/glm-4-9b-chat/1-shot.jsonl b/evaluated/THUDM/glm-4-9b-chat/1-shot.jsonl
diff --git a/evaluated/THUDM/glm-4-9b-chat/cot-1-shot.jsonl b/evaluated/THUDM/glm-4-9b-chat/cot-1-shot.jsonl
diff --git a/evaluated/THUDM/glm-4-9b-chat/default.jsonl b/evaluated/THUDM/glm-4-9b-chat/default.jsonl
diff --git a/evaluated/allganize/Llama-3-Alpha-Ko-8B-Instruct/1-shot.jsonl b/evaluated/allganize/Llama-3-Alpha-Ko-8B-Instruct/1-shot.jsonl
diff --git a/evaluated/allganize/Llama-3-Alpha-Ko-8B-Instruct/cot-1-shot.jsonl b/evaluated/allganize/Llama-3-Alpha-Ko-8B-Instruct/cot-1-shot.jsonl
diff --git a/evaluated/allganize/Llama-3-Alpha-Ko-8B-Instruct/default.jsonl b/evaluated/allganize/Llama-3-Alpha-Ko-8B-Instruct/default.jsonl
diff --git a/evaluated/alpindale/WizardLM-2-8x22B/1-shot.jsonl b/evaluated/alpindale/WizardLM-2-8x22B/1-shot.jsonl
diff --git a/evaluated/alpindale/WizardLM-2-8x22B/cot-1-shot.jsonl b/evaluated/alpindale/WizardLM-2-8x22B/cot-1-shot.jsonl
diff --git a/evaluated/alpindale/WizardLM-2-8x22B/default.jsonl b/evaluated/alpindale/WizardLM-2-8x22B/default.jsonl
diff --git a/evaluated/anthropic/claude-3-5-sonnet-20240620/1-shot.jsonl b/evaluated/anthropic/claude-3-5-sonnet-20240620/1-shot.jsonl
diff --git a/evaluated/anthropic/claude-3-5-sonnet-20240620/cot-1-shot.jsonl b/evaluated/anthropic/claude-3-5-sonnet-20240620/cot-1-shot.jsonl
diff --git a/evaluated/anthropic/claude-3-5-sonnet-20240620/default.jsonl b/evaluated/anthropic/claude-3-5-sonnet-20240620/default.jsonl
diff --git a/evaluated/anthropic/claude-3-haiku-20240307/1-shot.jsonl b/evaluated/anthropic/claude-3-haiku-20240307/1-shot.jsonl
diff --git a/evaluated/anthropic/claude-3-haiku-20240307/cot-1-shot.jsonl b/evaluated/anthropic/claude-3-haiku-20240307/cot-1-shot.jsonl
diff --git a/evaluated/anthropic/claude-3-haiku-20240307/default.jsonl b/evaluated/anthropic/claude-3-haiku-20240307/default.jsonl
diff --git a/evaluated/anthropic/claude-3-opus-20240229/1-shot.jsonl b/evaluated/anthropic/claude-3-opus-20240229/1-shot.jsonl
diff --git a/evaluated/anthropic/claude-3-opus-20240229/cot-1-shot.jsonl b/evaluated/anthropic/claude-3-opus-20240229/cot-1-shot.jsonl
diff --git a/evaluated/anthropic/claude-3-opus-20240229/default.jsonl b/evaluated/anthropic/claude-3-opus-20240229/default.jsonl
diff --git a/evaluated/anthropic/claude-3-sonnet-20240229/1-shot.jsonl b/evaluated/anthropic/claude-3-sonnet-20240229/1-shot.jsonl
diff --git a/evaluated/anthropic/claude-3-sonnet-20240229/cot-1-shot.jsonl b/evaluated/anthropic/claude-3-sonnet-20240229/cot-1-shot.jsonl
diff --git a/evaluated/anthropic/claude-3-sonnet-20240229/default.jsonl b/evaluated/anthropic/claude-3-sonnet-20240229/default.jsonl
diff --git a/evaluated/chihoonlee10/T3Q-ko-solar-dpo-v3.0/1-shot.jsonl b/evaluated/chihoonlee10/T3Q-ko-solar-dpo-v3.0/1-shot.jsonl
diff --git a/evaluated/chihoonlee10/T3Q-ko-solar-dpo-v3.0/cot-1-shot.jsonl b/evaluated/chihoonlee10/T3Q-ko-solar-dpo-v3.0/cot-1-shot.jsonl
diff --git a/evaluated/chihoonlee10/T3Q-ko-solar-dpo-v3.0/default.jsonl b/evaluated/chihoonlee10/T3Q-ko-solar-dpo-v3.0/default.jsonl
diff --git a/evaluated/davidkim205/nox-solar-10.7b-v4/1-shot.jsonl b/evaluated/davidkim205/nox-solar-10.7b-v4/1-shot.jsonl
diff --git a/evaluated/davidkim205/nox-solar-10.7b-v4/cot-1-shot.jsonl b/evaluated/davidkim205/nox-solar-10.7b-v4/cot-1-shot.jsonl
diff --git a/evaluated/davidkim205/nox-solar-10.7b-v4/default.jsonl b/evaluated/davidkim205/nox-solar-10.7b-v4/default.jsonl
diff --git a/evaluated/google/gemma-7b-it/1-shot.jsonl b/evaluated/google/gemma-7b-it/1-shot.jsonl
diff --git a/evaluated/google/gemma-7b-it/cot-1-shot.jsonl b/evaluated/google/gemma-7b-it/cot-1-shot.jsonl
diff --git a/evaluated/google/gemma-7b-it/default.jsonl b/evaluated/google/gemma-7b-it/default.jsonl
diff --git a/evaluated/google/palm-2-codechat-bison-32k/1-shot.jsonl b/evaluated/google/palm-2-codechat-bison-32k/1-shot.jsonl
diff --git a/evaluated/google/palm-2-codechat-bison-32k/cot-1-shot.jsonl b/evaluated/google/palm-2-codechat-bison-32k/cot-1-shot.jsonl
diff --git a/evaluated/google/palm-2-codechat-bison-32k/default.jsonl b/evaluated/google/palm-2-codechat-bison-32k/default.jsonl
diff --git a/evaluated/maywell/Synatra-7B-v0.3-dpo/1-shot.jsonl b/evaluated/maywell/Synatra-7B-v0.3-dpo/1-shot.jsonl
diff --git a/evaluated/maywell/Synatra-7B-v0.3-dpo/cot-1-shot.jsonl b/evaluated/maywell/Synatra-7B-v0.3-dpo/cot-1-shot.jsonl
diff --git a/evaluated/maywell/Synatra-7B-v0.3-dpo/default.jsonl b/evaluated/maywell/Synatra-7B-v0.3-dpo/default.jsonl
diff --git a/evaluated/maywell/Synatra-kiqu-10.7B/1-shot.jsonl b/evaluated/maywell/Synatra-kiqu-10.7B/1-shot.jsonl
diff --git a/evaluated/maywell/Synatra-kiqu-10.7B/cot-1-shot.jsonl b/evaluated/maywell/Synatra-kiqu-10.7B/cot-1-shot.jsonl
diff --git a/evaluated/maywell/Synatra-kiqu-10.7B/default.jsonl b/evaluated/maywell/Synatra-kiqu-10.7B/default.jsonl
diff --git a/evaluated/maywell/Synatra-kiqu-7B/1-shot.jsonl b/evaluated/maywell/Synatra-kiqu-7B/1-shot.jsonl
diff --git a/evaluated/maywell/Synatra-kiqu-7B/cot-1-shot.jsonl b/evaluated/maywell/Synatra-kiqu-7B/cot-1-shot.jsonl
diff --git a/evaluated/maywell/Synatra-kiqu-7B/default.jsonl b/evaluated/maywell/Synatra-kiqu-7B/default.jsonl
diff --git a/evaluated/maywell/TinyWand-kiqu/1-shot.jsonl b/evaluated/maywell/TinyWand-kiqu/1-shot.jsonl
diff --git a/evaluated/maywell/TinyWand-kiqu/cot-1-shot.jsonl b/evaluated/maywell/TinyWand-kiqu/cot-1-shot.jsonl
diff --git a/evaluated/maywell/TinyWand-kiqu/default.jsonl b/evaluated/maywell/TinyWand-kiqu/default.jsonl
diff --git a/evaluated/meta-llama/Meta-Llama-3-70B-Instruct/1-shot.jsonl b/evaluated/meta-llama/Meta-Llama-3-70B-Instruct/1-shot.jsonl
diff --git a/evaluated/meta-llama/Meta-Llama-3-70B-Instruct/cot-1-shot.jsonl b/evaluated/meta-llama/Meta-Llama-3-70B-Instruct/cot-1-shot.jsonl
diff --git a/evaluated/meta-llama/Meta-Llama-3-70B-Instruct/default.jsonl b/evaluated/meta-llama/Meta-Llama-3-70B-Instruct/default.jsonl
diff --git a/evaluated/meta-llama/Meta-Llama-3-8B-Instruct/1-shot.jsonl b/evaluated/meta-llama/Meta-Llama-3-8B-Instruct/1-shot.jsonl
diff --git a/evaluated/meta-llama/Meta-Llama-3-8B-Instruct/cot-1-shot.jsonl b/evaluated/meta-llama/Meta-Llama-3-8B-Instruct/cot-1-shot.jsonl
diff --git a/evaluated/meta-llama/Meta-Llama-3-8B-Instruct/default.jsonl b/evaluated/meta-llama/Meta-Llama-3-8B-Instruct/default.jsonl
diff --git a/evaluated/meta-llama/llama-2-13b-chat/1-shot.jsonl b/evaluated/meta-llama/llama-2-13b-chat/1-shot.jsonl
diff --git a/evaluated/meta-llama/llama-2-13b-chat/cot-1-shot.jsonl b/evaluated/meta-llama/llama-2-13b-chat/cot-1-shot.jsonl
diff --git a/evaluated/meta-llama/llama-2-13b-chat/default.jsonl b/evaluated/meta-llama/llama-2-13b-chat/default.jsonl
diff --git a/evaluated/meta-llama/llama-2-70b-chat/1-shot.jsonl b/evaluated/meta-llama/llama-2-70b-chat/1-shot.jsonl
diff --git a/evaluated/meta-llama/llama-2-70b-chat/cot-1-shot.jsonl b/evaluated/meta-llama/llama-2-70b-chat/cot-1-shot.jsonl
diff --git a/evaluated/meta-llama/llama-2-70b-chat/default.jsonl b/evaluated/meta-llama/llama-2-70b-chat/default.jsonl
diff --git a/evaluated/microsoft/Phi-3-medium-4k-instruct/1-shot.jsonl b/evaluated/microsoft/Phi-3-medium-4k-instruct/1-shot.jsonl
diff --git a/evaluated/microsoft/Phi-3-medium-4k-instruct/cot-1-shot.jsonl b/evaluated/microsoft/Phi-3-medium-4k-instruct/cot-1-shot.jsonl
diff --git a/evaluated/microsoft/Phi-3-medium-4k-instruct/default.jsonl b/evaluated/microsoft/Phi-3-medium-4k-instruct/default.jsonl
diff --git a/evaluated/mirlab/AkaLlama-llama3-70b-v0.1/1-shot.jsonl b/evaluated/mirlab/AkaLlama-llama3-70b-v0.1/1-shot.jsonl
diff --git a/evaluated/mirlab/AkaLlama-llama3-70b-v0.1/cot-1-shot.jsonl b/evaluated/mirlab/AkaLlama-llama3-70b-v0.1/cot-1-shot.jsonl
diff --git a/evaluated/mirlab/AkaLlama-llama3-70b-v0.1/default.jsonl b/evaluated/mirlab/AkaLlama-llama3-70b-v0.1/default.jsonl
diff --git a/evaluated/mistralai/mistral-7b-instruct-v0.1/1-shot.jsonl b/evaluated/mistralai/mistral-7b-instruct-v0.1/1-shot.jsonl
diff --git a/evaluated/mistralai/mistral-7b-instruct-v0.1/cot-1-shot.jsonl b/evaluated/mistralai/mistral-7b-instruct-v0.1/cot-1-shot.jsonl
diff --git a/evaluated/mistralai/mistral-7b-instruct-v0.1/default.jsonl b/evaluated/mistralai/mistral-7b-instruct-v0.1/default.jsonl
diff --git a/evaluated/mistralai/mistral-7b-instruct-v0.2/1-shot.jsonl b/evaluated/mistralai/mistral-7b-instruct-v0.2/1-shot.jsonl
diff --git a/evaluated/mistralai/mistral-7b-instruct-v0.2/cot-1-shot.jsonl b/evaluated/mistralai/mistral-7b-instruct-v0.2/cot-1-shot.jsonl
diff --git a/evaluated/mistralai/mistral-7b-instruct-v0.2/default.jsonl b/evaluated/mistralai/mistral-7b-instruct-v0.2/default.jsonl
diff --git a/evaluated/mistralai/mistral-7b-instruct-v0.3/1-shot.jsonl b/evaluated/mistralai/mistral-7b-instruct-v0.3/1-shot.jsonl
diff --git a/evaluated/mistralai/mistral-7b-instruct-v0.3/cot-1-shot.jsonl b/evaluated/mistralai/mistral-7b-instruct-v0.3/cot-1-shot.jsonl
diff --git a/evaluated/mistralai/mistral-7b-instruct-v0.3/default.jsonl b/evaluated/mistralai/mistral-7b-instruct-v0.3/default.jsonl
diff --git a/evaluated/mistralai/mistral-7b-instruct/1-shot.jsonl b/evaluated/mistralai/mistral-7b-instruct/1-shot.jsonl
diff --git a/evaluated/mistralai/mistral-7b-instruct/cot-1-shot.jsonl b/evaluated/mistralai/mistral-7b-instruct/cot-1-shot.jsonl
diff --git a/evaluated/mistralai/mistral-7b-instruct/default.jsonl b/evaluated/mistralai/mistral-7b-instruct/default.jsonl
diff --git a/evaluated/mistralai/mistral-tiny/1-shot.jsonl b/evaluated/mistralai/mistral-tiny/1-shot.jsonl
diff --git a/evaluated/mistralai/mistral-tiny/cot-1-shot.jsonl b/evaluated/mistralai/mistral-tiny/cot-1-shot.jsonl
diff --git a/evaluated/mistralai/mistral-tiny/default.jsonl b/evaluated/mistralai/mistral-tiny/default.jsonl
diff --git a/evaluated/mistralai/mixtral-8x22b/1-shot.jsonl b/evaluated/mistralai/mixtral-8x22b/1-shot.jsonl
diff --git a/evaluated/mistralai/mixtral-8x22b/cot-1-shot.jsonl b/evaluated/mistralai/mixtral-8x22b/cot-1-shot.jsonl
diff --git a/evaluated/mistralai/mixtral-8x22b/default.jsonl b/evaluated/mistralai/mixtral-8x22b/default.jsonl
diff --git a/evaluated/mistralai/mixtral-8x7b-instruct/1-shot.jsonl b/evaluated/mistralai/mixtral-8x7b-instruct/1-shot.jsonl
diff --git a/evaluated/mistralai/mixtral-8x7b-instruct/cot-1-shot.jsonl b/evaluated/mistralai/mixtral-8x7b-instruct/cot-1-shot.jsonl
diff --git a/evaluated/mistralai/mixtral-8x7b-instruct/default.jsonl b/evaluated/mistralai/mixtral-8x7b-instruct/default.jsonl
diff --git a/evaluated/neversleep/llama-3-lumimaid-70b/1-shot.jsonl b/evaluated/neversleep/llama-3-lumimaid-70b/1-shot.jsonl
diff --git a/evaluated/neversleep/llama-3-lumimaid-70b/cot-1-shot.jsonl b/evaluated/neversleep/llama-3-lumimaid-70b/cot-1-shot.jsonl
diff --git a/evaluated/neversleep/llama-3-lumimaid-70b/default.jsonl b/evaluated/neversleep/llama-3-lumimaid-70b/default.jsonl
diff --git a/evaluated/neversleep/llama-3-lumimaid-8b/1-shot.jsonl b/evaluated/neversleep/llama-3-lumimaid-8b/1-shot.jsonl
diff --git a/evaluated/neversleep/llama-3-lumimaid-8b/cot-1-shot.jsonl b/evaluated/neversleep/llama-3-lumimaid-8b/cot-1-shot.jsonl
diff --git a/evaluated/neversleep/llama-3-lumimaid-8b/default.jsonl b/evaluated/neversleep/llama-3-lumimaid-8b/default.jsonl
diff --git a/evaluated/neversleep/noromaid-20b/1-shot.jsonl b/evaluated/neversleep/noromaid-20b/1-shot.jsonl
diff --git a/evaluated/neversleep/noromaid-20b/cot-1-shot.jsonl b/evaluated/neversleep/noromaid-20b/cot-1-shot.jsonl
diff --git a/evaluated/neversleep/noromaid-20b/default.jsonl b/evaluated/neversleep/noromaid-20b/default.jsonl
diff --git a/evaluated/nlpai-lab/KULLM3/1-shot.jsonl b/evaluated/nlpai-lab/KULLM3/1-shot.jsonl
diff --git a/evaluated/nlpai-lab/KULLM3/cot-1-shot.jsonl b/evaluated/nlpai-lab/KULLM3/cot-1-shot.jsonl
diff --git a/evaluated/nlpai-lab/KULLM3/default.jsonl b/evaluated/nlpai-lab/KULLM3/default.jsonl
diff --git a/evaluated/nousresearch/nous-hermes-2-mistral-7b-dpo/1-shot.jsonl b/evaluated/nousresearch/nous-hermes-2-mistral-7b-dpo/1-shot.jsonl
diff --git a/evaluated/nousresearch/nous-hermes-2-mistral-7b-dpo/cot-1-shot.jsonl b/evaluated/nousresearch/nous-hermes-2-mistral-7b-dpo/cot-1-shot.jsonl
diff --git a/evaluated/nousresearch/nous-hermes-2-mistral-7b-dpo/default.jsonl b/evaluated/nousresearch/nous-hermes-2-mistral-7b-dpo/default.jsonl
diff --git a/evaluated/openai/gpt-3.5-turbo-0125/1-shot.jsonl b/evaluated/openai/gpt-3.5-turbo-0125/1-shot.jsonl
diff --git a/evaluated/openai/gpt-3.5-turbo-0125/cot-1-shot.jsonl b/evaluated/openai/gpt-3.5-turbo-0125/cot-1-shot.jsonl
diff --git a/evaluated/openai/gpt-3.5-turbo-0125/default.jsonl b/evaluated/openai/gpt-3.5-turbo-0125/default.jsonl
diff --git a/evaluated/openai/gpt-3.5-turbo-1106/1-shot.jsonl b/evaluated/openai/gpt-3.5-turbo-1106/1-shot.jsonl
diff --git a/evaluated/openai/gpt-3.5-turbo-1106/cot-1-shot.jsonl b/evaluated/openai/gpt-3.5-turbo-1106/cot-1-shot.jsonl
diff --git a/evaluated/openai/gpt-3.5-turbo-1106/default.jsonl b/evaluated/openai/gpt-3.5-turbo-1106/default.jsonl
diff --git a/evaluated/openai/gpt-4-0125-preview/1-shot.jsonl b/evaluated/openai/gpt-4-0125-preview/1-shot.jsonl
diff --git a/evaluated/openai/gpt-4-0125-preview/cot-1-shot.jsonl b/evaluated/openai/gpt-4-0125-preview/cot-1-shot.jsonl
diff --git a/evaluated/openai/gpt-4-0125-preview/default.jsonl b/evaluated/openai/gpt-4-0125-preview/default.jsonl
diff --git a/evaluated/openai/gpt-4-0613/1-shot.jsonl b/evaluated/openai/gpt-4-0613/1-shot.jsonl
diff --git a/evaluated/openai/gpt-4-0613/cot-1-shot.jsonl b/evaluated/openai/gpt-4-0613/cot-1-shot.jsonl
diff --git a/evaluated/openai/gpt-4-0613/default.jsonl b/evaluated/openai/gpt-4-0613/default.jsonl
diff --git a/evaluated/openai/gpt-4-1106-preview/1-shot.jsonl b/evaluated/openai/gpt-4-1106-preview/1-shot.jsonl
diff --git a/evaluated/openai/gpt-4-1106-preview/cot-1-shot.jsonl b/evaluated/openai/gpt-4-1106-preview/cot-1-shot.jsonl
diff --git a/evaluated/openai/gpt-4-1106-preview/default.jsonl b/evaluated/openai/gpt-4-1106-preview/default.jsonl
diff --git a/evaluated/openai/gpt-4-turbo-2024-04-09/1-shot.jsonl b/evaluated/openai/gpt-4-turbo-2024-04-09/1-shot.jsonl
diff --git a/evaluated/openai/gpt-4-turbo-2024-04-09/cot-1-shot.jsonl b/evaluated/openai/gpt-4-turbo-2024-04-09/cot-1-shot.jsonl
diff --git a/evaluated/openai/gpt-4-turbo-2024-04-09/default.jsonl b/evaluated/openai/gpt-4-turbo-2024-04-09/default.jsonl
diff --git a/evaluated/openai/gpt-4o-2024-05-13/1-shot.jsonl b/evaluated/openai/gpt-4o-2024-05-13/1-shot.jsonl
diff --git a/evaluated/openai/gpt-4o-2024-05-13/cot-1-shot.jsonl b/evaluated/openai/gpt-4o-2024-05-13/cot-1-shot.jsonl
diff --git a/evaluated/openai/gpt-4o-2024-05-13/default.jsonl b/evaluated/openai/gpt-4o-2024-05-13/default.jsonl
diff --git a/evaluated/openchat/openchat-3.5-0106/1-shot.jsonl b/evaluated/openchat/openchat-3.5-0106/1-shot.jsonl
diff --git a/evaluated/openchat/openchat-3.5-0106/cot-1-shot.jsonl b/evaluated/openchat/openchat-3.5-0106/cot-1-shot.jsonl
diff --git a/evaluated/openchat/openchat-3.5-0106/default.jsonl b/evaluated/openchat/openchat-3.5-0106/default.jsonl
diff --git a/evaluated/perplexity/llama-3-sonar-large-32k-chat/1-shot.jsonl b/evaluated/perplexity/llama-3-sonar-large-32k-chat/1-shot.jsonl
diff --git a/evaluated/perplexity/llama-3-sonar-large-32k-chat/cot-1-shot.jsonl b/evaluated/perplexity/llama-3-sonar-large-32k-chat/cot-1-shot.jsonl
diff --git a/evaluated/perplexity/llama-3-sonar-large-32k-chat/default.jsonl b/evaluated/perplexity/llama-3-sonar-large-32k-chat/default.jsonl
diff --git a/evaluated/perplexity/llama-3-sonar-small-32k-online/1-shot.jsonl b/evaluated/perplexity/llama-3-sonar-small-32k-online/1-shot.jsonl
diff --git a/evaluated/perplexity/llama-3-sonar-small-32k-online/cot-1-shot.jsonl b/evaluated/perplexity/llama-3-sonar-small-32k-online/cot-1-shot.jsonl
diff --git a/evaluated/perplexity/llama-3-sonar-small-32k-online/default.jsonl b/evaluated/perplexity/llama-3-sonar-small-32k-online/default.jsonl
diff --git a/evaluated/qwen/qwen-14b-chat/1-shot.jsonl b/evaluated/qwen/qwen-14b-chat/1-shot.jsonl
diff --git a/evaluated/qwen/qwen-14b-chat/cot-1-shot.jsonl b/evaluated/qwen/qwen-14b-chat/cot-1-shot.jsonl
diff --git a/evaluated/qwen/qwen-14b-chat/default.jsonl b/evaluated/qwen/qwen-14b-chat/default.jsonl
diff --git a/evaluated/qwen/qwen-32b-chat/1-shot.jsonl b/evaluated/qwen/qwen-32b-chat/1-shot.jsonl
diff --git a/evaluated/qwen/qwen-32b-chat/cot-1-shot.jsonl b/evaluated/qwen/qwen-32b-chat/cot-1-shot.jsonl
diff --git a/evaluated/qwen/qwen-32b-chat/default.jsonl b/evaluated/qwen/qwen-32b-chat/default.jsonl
diff --git a/evaluated/qwen/qwen-72b-chat/1-shot.jsonl b/evaluated/qwen/qwen-72b-chat/1-shot.jsonl
diff --git a/evaluated/qwen/qwen-72b-chat/cot-1-shot.jsonl b/evaluated/qwen/qwen-72b-chat/cot-1-shot.jsonl
diff --git a/evaluated/qwen/qwen-72b-chat/default.jsonl b/evaluated/qwen/qwen-72b-chat/default.jsonl
diff --git a/evaluated/qwen/qwen-7b-chat/1-shot.jsonl b/evaluated/qwen/qwen-7b-chat/1-shot.jsonl
diff --git a/evaluated/qwen/qwen-7b-chat/cot-1-shot.jsonl b/evaluated/qwen/qwen-7b-chat/cot-1-shot.jsonl
diff --git a/evaluated/qwen/qwen-7b-chat/default.jsonl b/evaluated/qwen/qwen-7b-chat/default.jsonl
diff --git a/evaluated/undi95/toppy-m-7b/1-shot.jsonl b/evaluated/undi95/toppy-m-7b/1-shot.jsonl
diff --git a/evaluated/undi95/toppy-m-7b/cot-1-shot.jsonl b/evaluated/undi95/toppy-m-7b/cot-1-shot.jsonl
diff --git a/evaluated/undi95/toppy-m-7b/default.jsonl b/evaluated/undi95/toppy-m-7b/default.jsonl
diff --git a/evaluated/yanolja/Bookworm-10.7B-v0.4-DPO/1-shot.jsonl b/evaluated/yanolja/Bookworm-10.7B-v0.4-DPO/1-shot.jsonl
diff --git a/evaluated/yanolja/Bookworm-10.7B-v0.4-DPO/cot-1-shot.jsonl b/evaluated/yanolja/Bookworm-10.7B-v0.4-DPO/cot-1-shot.jsonl
diff --git a/evaluated/yanolja/Bookworm-10.7B-v0.4-DPO/default.jsonl b/evaluated/yanolja/Bookworm-10.7B-v0.4-DPO/default.jsonl
diff --git a/evaluated/yanolja/EEVE-Korean-Instruct-10.8B-v1.0/1-shot.jsonl b/evaluated/yanolja/EEVE-Korean-Instruct-10.8B-v1.0/1-shot.jsonl
diff --git a/evaluated/yanolja/EEVE-Korean-Instruct-10.8B-v1.0/cot-1-shot.jsonl b/evaluated/yanolja/EEVE-Korean-Instruct-10.8B-v1.0/cot-1-shot.jsonl
diff --git a/evaluated/yanolja/EEVE-Korean-Instruct-10.8B-v1.0/default.jsonl b/evaluated/yanolja/EEVE-Korean-Instruct-10.8B-v1.0/default.jsonl
diff --git a/evaluator.py b/evaluator.py
@@ -0,0 +1,168 @@
+import argparse
+import json
+import re
+import time
+from concurrent.futures import ThreadPoolExecutor
+from datetime import datetime
+from pathlib import Path
+from threading import Lock
+from typing import Dict, Union
+
+import pandas as pd
+from openai import OpenAI
+
+from templates import JUDGE_TEMPLATE
+
+
+# Constants
+# Timestamp taken once at import time, formatted as YYYYmmdd_HHMMSS.
+# NOTE(review): not referenced anywhere else in this file.
+TIME_START = datetime.now().strftime("%Y%m%d_%H%M%S")
+# Held by process_item while it appends a result line to the output file, so
+# that worker threads do not interleave their writes.
+LOCK = Lock()
+
+
+def get_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "-o", "--model-output-dir", help="Model Output Directory", required=True
+    )
+    parser.add_argument("-k", "--openai-api-key", help="OpenAI API Key", required=True)
+    parser.add_argument(
+        "-j", "--judge-model", help="Judge Model", default="gpt-4-1106-preview"
+    )
+    parser.add_argument("-t", "--threads", help="Thread count", default=42, type=int)
+    return parser.parse_args()
+
+
+def create_azure_client(api_key: str):
+    return OpenAI(api_key=api_key)
+
+
+def create_answers(
+    client, model_output, judge_model, is_multi_turn: bool = False, i=0
+) -> Dict[str, Union[str, float]]:
+    model_questions = model_output["questions"]
+    model_outputs = model_output["outputs"]
+    model_references = model_output["references"]
+
+    prompt = (
+        f"아래의 내용을 주어진 평가 기준들을 충실히 반영하여 평가해라. 특히 모델 답변이 언어 요구사항을 준수하는지 반드시 확인해야 한다.\n\n"
+        f"**Question**\n{model_questions[0]}"
+    )
+
+    if model_references and model_references[0]:
+        prompt += f"\n\n**Additional Reference**\n{model_references[0]}"
+
+    prompt += f"\n\n**Model's Response**\n{model_outputs[0]}"
+
+    if is_multi_turn:
+        prompt += f"\n\n**Follow-up Question.**\n{model_questions[1]}"
+        if model_references and model_references[1]:
+            prompt += f"\n\n**Additional Reference**\n{model_references[1]}"
+        prompt += f"\n\n**Model's Response**\n{model_outputs[1]}"
+
+    prompt += "\n\n[[대화 종료. 평가 시작.]]"
+
+    try:
+        response = client.chat.completions.create(
+            model=judge_model,
+            temperature=0.0,
+            n=1,
+            messages=[
+                {
+                    "role": "system",
+                    "content": JUDGE_TEMPLATE[
+                        "multi_turn" if is_multi_turn else "single_turn"
+                    ],
+                },
+                {"role": "user", "content": prompt},
+            ],
+        )
+
+        content = response.choices[0].message.content
+        judge_message_match = re.search(
+            r"평가:(.*?)점수:", content.replace("*", ""), re.DOTALL
+        )
+        judge_message = (
+            judge_message_match.group(1).strip()
+            if judge_message_match
+            else "No judge message found"
+        )
+        judge_score_match = re.search(
+            r"점수:\s*(\d+(\.\d+)?)", content.replace("*", "")
+        )
+        if judge_score_match:
+            judge_score = float(judge_score_match.group(1))
+        else:
+            raise ValueError("No score found in response")
+
+        return {"judge_message": judge_message, "judge_score": judge_score}
+
+    except Exception as e:
+        print("Error. Retrying after 20 sec", e)
+        time.sleep(20)
+
+        # 꼭 아래 이유가 아닐 수 있음. 핸들링 필요.
+        if i > 3:
+            print("Impossible prompt, aborting..!")
+            return {
+                "judge_message": "Impossible to judge due to repetition.",
+                "judge_score": 0.0,
+            }
+        i += 1
+        return create_answers(client, model_output, judge_model, is_multi_turn, i)
+
+
+def process_item(client, row, judge_model, output_file):
+    query_single = create_answers(client, row, judge_model)
+    query_multi = create_answers(client, row, judge_model, is_multi_turn=True)
+
+    row["query_single"] = query_single
+    row["query_multi"] = query_multi
+    row = row.to_dict()
+
+    with LOCK:
+        with output_file.open("a", encoding="utf-8-sig") as f:
+            f.write(json.dumps(row, ensure_ascii=False))
+            f.write("\n")
+
+
+def process_file(
+    client, file_path: Path, output_dir: Path, judge_model, threads: int, args
+):
+    print(f"- 현재 Processing : {file_path}")
+    df_model_outputs = pd.read_json(file_path, lines=True)
+
+    output_file = output_dir / file_path.relative_to(args.model_output_dir)
+    output_file.parent.mkdir(parents=True, exist_ok=True)
+
+    with ThreadPoolExecutor(max_workers=threads) as executor:
+        for row in df_model_outputs.iterrows():
+            executor.submit(process_item, client, row[1], judge_model, output_file)
+
+
+def is_hidden(filepath: Path) -> bool:
+    return any(part.startswith(".") for part in filepath.parts)
+
+
+def main():
+    args = get_args()
+    client = create_azure_client(args.openai_api_key)
+
+    input_dir = Path(args.model_output_dir)
+    output_dir = Path("./evaluated")
+
+    # Filter out hidden files
+    json_files = [file for file in input_dir.rglob("*.jsonl") if not is_hidden(file)]
+
+    for file_path in json_files:
+        output_file_path = output_dir / file_path.relative_to(input_dir)
+        if output_file_path.exists():
+            print(f"이미 평가 완료.. : {file_path}")
+            continue
+        process_file(
+            client, file_path, output_dir, args.judge_model, args.threads, args
+        )
+        time.sleep(20)  # to handle ratelimit!
+
+
+# Run the evaluation only when this file is executed as a script, not when it
+# is imported.
+if __name__ == "__main__":
+    main()