diff --git a/.vscode/settings.json b/.vscode/settings.json
new file mode 100644
index 000000000..68550d87f
--- /dev/null
+++ b/.vscode/settings.json
@@ -0,0 +1,6 @@
+{
+    "python.analysis.typeCheckingMode": "basic",
+    "mypy-type-checker.args": [
+        "--config-file=mypy.ini"
+    ]
+}
\ No newline at end of file
diff --git a/evals/README.md b/evals/README.md
new file mode 100644
index 000000000..3910f6277
--- /dev/null
+++ b/evals/README.md
@@ -0,0 +1,24 @@
+# AI Evals
+
+## Running the tests
+
+1. Create a new virtual environment
+```
+python -m venv venv
+```
+
+2. Activate the virtual environment:
+```
+source venv/bin/activate
+```
+
+3. Install the dependencies:
+```
+pip install -r requirements.txt
+```
+
+4. Run the tests from the `mito` folder:
+TODO: Improve the running so that we don't have to be in the `mito` folder.
+```
+python -m evals.main
+```
\ No newline at end of file
diff --git a/evals/__init__.py b/evals/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/evals/ai_api_calls/__init__.py b/evals/ai_api_calls/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/evals/ai_api_calls/get_open_ai_completion.py b/evals/ai_api_calls/get_open_ai_completion.py
new file mode 100644
index 000000000..0e706da71
--- /dev/null
+++ b/evals/ai_api_calls/get_open_ai_completion.py
@@ -0,0 +1,13 @@
+import os
+from openai import OpenAI
+
+def get_open_ai_completion(prompt: str):
+    client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
+
+    response = client.chat.completions.create(
+        model="gpt-4",
+        messages=[{"role": "user", "content": prompt}],
+        temperature=0.0
+    )
+
+    return response.choices[0].message.content
\ No newline at end of file
diff --git a/evals/eval_types.py b/evals/eval_types.py
new file mode 100644
index 000000000..4b1f9ec29
--- /dev/null
+++ b/evals/eval_types.py
@@ -0,0 +1,23 @@
+from dataclasses import dataclass
+from typing import Any, Dict, List, Literal
+
+@dataclass
+class NotebookState:
+    """Represents the state of variables in a notebook at test time"""
+    global_vars: Dict[str, Any]
+    cell_contents: List[str]
+
+
+@dataclass
+class TestCase:
+    """A single test case with input state and expected output"""
+    name: str
+    notebook_state: NotebookState
+    user_input: str
+    expected_code: str
+    tags: List[Literal[
+        'variable declaration',
+        'function declaration',
+        'dataframe transformation'
+    ]]
+
diff --git a/evals/main.py b/evals/main.py
new file mode 100644
index 000000000..66710eed8
--- /dev/null
+++ b/evals/main.py
@@ -0,0 +1,79 @@
+from typing import List
+from evals.ai_api_calls.get_open_ai_completion import get_open_ai_completion
+from evals.prompts.simple_prompt import get_simple_prompt
+from evals.eval_types import NotebookState, TestCase
+from evals.utils import get_globals_to_compare, get_script_from_cells, print_green, print_red
+
+
+EMPTY_NOTEBOOK_STATE: NotebookState = NotebookState(
+    global_vars={},
+    cell_contents=[]
+)
+
+INITIALIZED_VARIABLES_NOTEBOOK_STATE: NotebookState = NotebookState(
+    global_vars={'x': 1, 'y': 2, 'z': 3},
+    cell_contents=['x = 1', 'y = 2', 'z = 3', '']
+)
+
+
+TESTS: List[TestCase] = [
+    TestCase(
+        name="empty_notebook_variable_declaration",
+        notebook_state=EMPTY_NOTEBOOK_STATE,
+        user_input="create a variable x and set it equal to 1",
+        expected_code='x=1',
+        tags=['variable declaration']
+    ),
+    TestCase(
+        name="empty_notebook_function_declaration",
+        notebook_state=EMPTY_NOTEBOOK_STATE,
+        user_input="create a function my_sum that takes two arguments and returns their sum",
+        expected_code="""def my_sum(a, b):
+    return a + b""",
+        tags=['function declaration']
+    ),
+    TestCase(
+        name="initialized_variables_variable_declaration",
+        notebook_state=INITIALIZED_VARIABLES_NOTEBOOK_STATE,
+        user_input="create a new variable that is the product of x, y, and z",
+        expected_code="w = x * y * z",
+        tags=['variable declaration']
+    )
+]
+
+for test in TESTS:
+
+    # Get the script from the cells
+    current_cell_contents_script = get_script_from_cells(test.notebook_state.cell_contents)
+
+    # Get the expected code script
+    expected_code = current_cell_contents_script + "\n" + test.expected_code
+
+    # Create the actual code script produced by the LLM
+    prompt = get_simple_prompt(test.user_input, test.notebook_state)
+    ai_generated_code = get_open_ai_completion(prompt)
+    actual_code = current_cell_contents_script + "\n" + ai_generated_code
+
+    # So that we can compare the results of the two scripts, create global context for
+    # each script. When calling exec, the globals are updated in place.
+    expected_globals = {}
+    actual_globals = {}
+
+    exec(expected_code, expected_globals)
+    exec(actual_code, actual_globals)
+
+    expected_globals = get_globals_to_compare(expected_globals)
+    actual_globals = get_globals_to_compare(actual_globals)
+
+    # TODO: Add statistics on how many tests pass/fail
+
+    if expected_globals == actual_globals:
+        print_green(f"Test {test.name} passed")
+    else:
+        print_red(f"Test {test.name} failed")
+        print("Expected globals:")
+        print(expected_globals)
+        print("Actual globals:")
+        print(actual_globals)
+
+    
\ No newline at end of file
diff --git a/evals/mypy.ini b/evals/mypy.ini
new file mode 100644
index 000000000..2a8561a52
--- /dev/null
+++ b/evals/mypy.ini
@@ -0,0 +1,16 @@
+[mypy]
+python_version = 3.8
+warn_return_any = False
+warn_unused_configs = True
+disallow_untyped_defs = False
+disallow_incomplete_defs = False
+check_untyped_defs = True
+disallow_untyped_decorators = False
+no_implicit_optional = True
+warn_redundant_casts = True
+warn_unused_ignores = True
+warn_no_return = True
+warn_unreachable = True
+strict_optional = True
+ignore_missing_imports = True
+disable_error_code = var-annotated
\ No newline at end of file
diff --git a/evals/prompts/__init__.py b/evals/prompts/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/evals/prompts/simple_prompt.py b/evals/prompts/simple_prompt.py
new file mode 100644
index 000000000..d66a2f670
--- /dev/null
+++ b/evals/prompts/simple_prompt.py
@@ -0,0 +1,35 @@
+from evals.eval_types import NotebookState
+
+
+def get_simple_prompt(user_input: str, notebook_state: NotebookState) -> str:
+    return f"""You are an expert python programmer. You are given a set of variables, existing code, and a task.
+
+Respond with the python code and nothing else.
+
+
+You have these variables:
+{{'x': 1, 'y': 2}}
+
+The current code cell is:
+x = 1
+y = 2
+
+Your job is to:
+Create a new variable z that is the sum of x and y
+
+Response:
+z = x + y
+
+
+Now complete this task:
+
+You have these variables:
+{notebook_state.global_vars}
+
+The current code cell is:
+{notebook_state.cell_contents[-1] if len(notebook_state.cell_contents) > 0 else ""}
+
+Your job is to:
+{user_input}
+
+Response:"""
\ No newline at end of file
diff --git a/evals/requirements.txt b/evals/requirements.txt
new file mode 100644
index 000000000..34139c636
--- /dev/null
+++ b/evals/requirements.txt
@@ -0,0 +1,3 @@
+mypy>=1.0.0
+types-setuptools
+openai>=1.0.0
\ No newline at end of file
diff --git a/evals/utils.py b/evals/utils.py
new file mode 100644
index 000000000..20ea1d71e
--- /dev/null
+++ b/evals/utils.py
@@ -0,0 +1,29 @@
+from typing import List, Dict, Any
+
+
+def get_script_from_cells(cells: List[str]) -> str:
+    return "\n".join(cells)
+
+def get_globals_to_compare(globals: Dict[str, Any]) -> Dict[str, Any]:
+    """
+    Globals have a lot of stuff we don't actually care about comparing.
+    For now, we only care about comparing variables created by the script.
+    This function removes everything else.
+    """
+
+    globals = {k: v for k, v in globals.items() if k != "__builtins__"}
+
+    # Remove functions from the globals since we don't want to compare them
+    globals = {k: v for k, v in globals.items() if not callable(v)}
+
+    return globals
+
+def print_green(text: str):
+    print("\033[92m", end="")
+    print(text)
+    print("\033[0m", end="")
+
+def print_red(text: str):
+    print("\033[91m", end="")
+    print(text)
+    print("\033[0m", end="")
\ No newline at end of file
diff --git a/mito-ai/mito-ai/_version.py b/mito-ai/mito-ai/_version.py
deleted file mode 100644
index b566da045..000000000
--- a/mito-ai/mito-ai/_version.py
+++ /dev/null
@@ -1,25 +0,0 @@
-#!/usr/bin/env python
-# coding: utf-8
-
-# Copyright (c) Saga Inc.
-import json
-import os
-from pathlib import Path
-
-"""
-This code is responsible for getting the package.json that is bundled
-with the javascript code bundle inside this Python package, so we can
-get the version of the package. This means we can easily bump the package
-version in one place (package.json) and have it automatically update here too.
-This is nice for the release process.
-
-Since this is a Jupyter Lab 4 extension, the mito-ai/labextension folder
-contains the package.json directly. So we can just read it there.
-"""
-lab_extension_folder = os.path.join(Path(__file__).parent, 'labextension')
-
-package_json_path = os.path.join(lab_extension_folder, 'package.json')
-package_json = json.loads(open(package_json_path).read())
-
-__version__ = package_json['version']
-
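As a usage sketch of the `TestCase` and `NotebookState` dataclasses introduced in `evals/eval_types.py`, a new eval could be appended to `TESTS` in `evals/main.py` along these lines. This is illustrative only and not part of the diff above; the notebook state, user input, and expected code are assumptions.

```
# Hypothetical example -- not part of the change above. It shows the shape of an
# additional eval that could be appended to TESTS in evals/main.py; the concrete
# notebook state, user input, and expected code are assumptions.
from evals.eval_types import NotebookState, TestCase

EXAMPLE_NOTEBOOK_STATE = NotebookState(
    global_vars={'x': 1, 'y': 2, 'z': 3},
    cell_contents=['x = 1', 'y = 2', 'z = 3', '']
)

example_test = TestCase(
    name="initialized_variables_function_declaration",
    notebook_state=EXAMPLE_NOTEBOOK_STATE,
    user_input="create a function add_x that takes one argument and returns it plus x",
    expected_code="""def add_x(a):
    return a + x""",
    tags=['function declaration']
)

# Appending example_test to TESTS would have the existing loop build the prompt,
# exec() the expected and AI-generated scripts, and compare their filtered globals.
```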