diff --git a/automation-api/lib/llms/iflytek/spark_api.py b/automation-api/lib/llms/iflytek/spark_api.py
index ab51604..8811cc6 100644
--- a/automation-api/lib/llms/iflytek/spark_api.py
+++ b/automation-api/lib/llms/iflytek/spark_api.py
@@ -100,7 +100,8 @@ class SparkClient:
 
     def __init__(self, appid: str, api_key: str, api_secret: str) -> None:
         self.appid = appid
-        self.ws_url = Ws_Param(appid, api_key, api_secret, self.gpt_url).create_url()
+        self.api_key = api_key
+        self.api_secret = api_secret
 
     def gen_parameters(
         self,
@@ -133,9 +134,12 @@ def gen_payload(self, content):
         return data
 
     def generate_text(self, content, **kwargs) -> Dict[str, Any]:
+        ws_url = Ws_Param(
+            self.appid, self.api_key, self.api_secret, self.gpt_url
+        ).create_url()
         data = self.gen_parameters(**kwargs)
         data.update(self.gen_payload(content))
-        res = get_reply(self.ws_url, data)
+        res = get_reply(ws_url, data)
         return res
 
     def chat(self):
diff --git a/automation-api/lib/pilot/helpers.py b/automation-api/lib/pilot/helpers.py
index e7503fd..1465c98 100644
--- a/automation-api/lib/pilot/helpers.py
+++ b/automation-api/lib/pilot/helpers.py
@@ -1,7 +1,7 @@
 import json
 import uuid
 from datetime import datetime
-from typing import Any, Dict, List, Tuple
+from typing import Any, Dict, List, Optional, Tuple
 
 import pandas as pd
 from langchain.chains import LLMChain
@@ -81,19 +81,29 @@ def class_objects_from_df(df: pd.DataFrame, cls: type) -> list:
     return [cls(**rec) for rec in df.to_dict(orient="records")]
 
 
-def get_questions(sheet: AiEvalData) -> List[QuestionAndOptions]:
-    questions = filter_included_rows(sheet.questions.data.df)
+def get_questions(
+    sheet: AiEvalData, include_all: bool = False, language: Optional[str] = None
+) -> List[QuestionAndOptions]:
+    if include_all:
+        questions = sheet.questions.data.df
+    else:
+        questions = filter_included_rows(sheet.questions.data.df)
+
+    if language is not None:
+        questions = questions.loc[questions["language"] == language]
+
     options = sheet.question_options.data.df
     qs = class_objects_from_df(questions, Question)
 
     res = []
     for q in qs:
         qid = q.question_id
+        lang = q.language
         qopts = [
             QuestionOption(**rec)
-            for rec in options.loc[options["question_id"] == qid].to_dict(
-                orient="records"
-            )
+            for rec in options.loc[
+                (options["question_id"] == qid) & (options["language"] == lang)
+            ].to_dict(orient="records")
         ]
 
         res.append((q, qopts))
@@ -198,8 +208,13 @@ def check_llm_eval_output(eval_output: str) -> str:
     return "failed"
 
 
-def get_prompt_variants(sheet: AiEvalData) -> List[PromptVariation]:
-    prompt_variations = filter_included_rows(sheet.prompt_variations.data.df)
+def get_prompt_variants(
+    sheet: AiEvalData, include_all: bool = False
+) -> List[PromptVariation]:
+    if include_all:
+        prompt_variations = sheet.prompt_variations.data.df
+    else:
+        prompt_variations = filter_included_rows(sheet.prompt_variations.data.df)
     res = class_objects_from_df(prompt_variations, PromptVariation)
     return res
 
diff --git a/automation-api/notebooks/exploration-notebook.py b/automation-api/notebooks/exploration-notebook.py
index 003ab18..ce818e0 100644
--- a/automation-api/notebooks/exploration-notebook.py
+++ b/automation-api/notebooks/exploration-notebook.py
@@ -76,9 +76,6 @@
 # -
 
 
-
-
-
 
 # ## Read from AI Eval spreadsheet
 
@@ -116,3 +113,72 @@
 
 ai_eval_data.gen_ai_models.data.df
 
+# ## Test individual question and prompt
+# Here is how you can ask one question to llm
+
+# read the ai eval spreadsheet
+ai_eval_data = read_ai_eval_data(ai_eval_spreadsheet)
+
+# there are some helper functions to parse the spreadsheet
+from lib.pilot.helpers import get_questions, get_prompt_variants, create_question_data_for_test
+
+# load all questions and filter by language
+qs = get_questions(ai_eval_data, include_all=True, language='en-US')
+
+# define a function to load one question
+def get_question(qid, qs):
+    if isinstance(qid, int):
+        qid = str(qid)
+    for q, opts in qs:
+        if q.question_id == qid:
+            return (q, opts)
+
+
+q = get_question("10", qs)
+
+q
+
+# load all prompt variants
+prompts = get_prompt_variants(ai_eval_data, include_all=True)
+prompts
+
+# define a function to load one prompt
+def get_prompt_variant(pid, ps):
+    for p in ps:
+        if pid == p.variation_id:
+            return p
+
+
+pv = get_prompt_variant("simple", prompts)
+pv
+
+# There are many fields in PromptVariation objects:
+# - pv.question_template: how to format the question.
+#   Expects `question`, `option_a`, `option_b`, `option_c` as input.
+# - pv.question_prompt_template: how to format the prompt that is sent to the LLM.
+#   Expects `question` as input (which is formatted according to the above template).
+# - pv.ai_prefix and pv.question_prefix are for memory; they are the prefixes
+#   for question messages and LLM responses.
+# - pv.follow_up_answer_correctness_evaluation_prompt_template: the template to
+#   format a follow-up question to double-check the answer.
+#   Expects `question`, `option_a`, `option_b`, `option_c`, `text` as input.
+
+# to run a model with a given prompt and question:
+
+# format the question with the question template
+qd = create_question_data_for_test(pv.question_template, q)  # returns a dict
+print(qd['question'])
+
+# get llm and run model
+llm = get_openai_model('gpt-3.5-turbo', temperature=0.1)
+answer = run_model(llm, pv.question_prompt_template, verbose=True, **qd)
+print(answer)
+
+# if the llm is good at following instructions and produces answers in the format we want,
+# we can check the answer with this function
+from lib.pilot.helpers import simple_evaluation
+
+simple_evaluation(q, answer)
+
+# +
+# otherwise, we can use another LLM to check if the answer is correct.
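
A rough sketch of that last step, not part of the diff above: the follow-up template together with check_llm_eval_output from lib/pilot/helpers.py could drive the LLM-based check. It assumes the dict returned by create_question_data_for_test also carries the option_a/option_b/option_c keys the follow-up template expects (if it only holds `question`, those fields would have to be filled in by hand), and the choice of 'gpt-4' as the grader model is just an example.

# Sketch only; assumes `qd` contains the `question`, `option_a`, `option_b`,
# `option_c` keys that the follow-up evaluation template expects.
from lib.pilot.helpers import check_llm_eval_output

# use a second model as the grader (model name here is an arbitrary example)
eval_llm = get_openai_model('gpt-4', temperature=0)
eval_output = run_model(
    eval_llm,
    pv.follow_up_answer_correctness_evaluation_prompt_template,
    verbose=True,
    text=answer,  # the first model's answer fills the `text` slot of the template
    **qd,
)
# check_llm_eval_output parses the grader's reply; it falls back to "failed"
# when the reply cannot be interpreted
print(check_llm_eval_output(eval_output))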