Make dataset generation work with preferences & seed file (#49)

freddiev4 · web-flow · commit 3af612c04249 · 2024-06-10T14:53:50.000-07:00
* pass along description

* Start adding preferences

* Add human preferences to ds generation

* Remove rich prompt typo

* Fix up seed file reading

* Refactor

* Add method for generating full dataset

* Ask how many dataset examples

* Fix typo &amp; comment out client method
diff --git a/quotientai/cli/generate/dataset.py b/quotientai/cli/generate/dataset.py
@@ -1,20 +1,24 @@
 import json
+import os
 import time
 
+from pathlib import Path
+from typing import List, Optional
+
 from quotientai._enums import GenerateDatasetType
 from quotientai.client import QuotientClient
 from rich import print
 from rich.console import Console
 from rich.panel import Panel
 from rich.progress import Progress, SpinnerColumn, TextColumn, TimeElapsedColumn
 from rich.prompt import Confirm, IntPrompt, Prompt
+from rich.table import Table
 
 console = Console()
+client = QuotientClient()
 
-from rich.table import Table
 
-
-def view_examples(graded_examples):
+def show_graded_examples(graded_examples):
     table = Table(show_header=True, header_style="bold magenta")
     table.add_column("ID")
     table.add_column("Context")
@@ -36,55 +40,59 @@ def view_examples(graded_examples):
     console.print(table)
 
 
-def get_seed_data(seed: str):
+def get_seed_data(seed: str) -> List[Optional[str]]:
     if seed is None:
         seed_path = Confirm.ask(
-            "Do you have a seed file (.jsonl) with examples to assist the creation of the dataset?",
+            "Do you have a seed file ([green].jsonl[/green]) with examples to assist the creation of the dataset?",
         )
         if not seed_path:
-            seed_data = "Here is some fake data REPLACE ME"
             console.print(
                 "No problem! We'll generate some examples for you, and you can grade them\n"
             )
-            return
+            return []
         else:
-            valid_format = False
-            while not valid_format:
-                filepath = Prompt.ask("Please provide the path to the seed file.")
-
-                if filepath.endswith(".jsonl") or filepath.endswith(".jsonlines"):
-                    valid_format = True
+            valid_file = False
+            while not valid_file:
+                filepath = Prompt.ask("Please provide the path to the seed file")
+
+                is_valid_format = filepath.endswith(".jsonl") or filepath.endswith(".jsonlines")
+                is_valid_path = os.path.exists(filepath)
+                if is_valid_format and is_valid_path:
+                    valid_file = True
                 else:
                     console.print(
-                        "The seed file should be in the .jsonl format. Please provide a valid file."
+                        f"[yellow]Please provide a valid .jsonl file. Got {filepath}"
                     )
     else:
         filepath = seed
 
     try:
         with open(filepath, "r") as file:
-            seed_data = file.readlines()
-            seed_data = [json.loads(seed) for seed in seed_data]
+            raw_data = file.readlines()
+            raw_data = [json.loads(line) for line in raw_data]
     except FileNotFoundError:
         console.print("The file could not be found. Please provide a valid file.")
 
-        valid_field = False
-        # check that we can get the field name by looking at the first example
-        while not valid_field:
-            field = Prompt.ask(
-                "Please indicate the field in the JSONL file that contain an example to use as a seed."
+    valid_field = False
+    # check that we can get the field name by looking at the first example
+    while not valid_field:
+        available_fields = list(raw_data[0].keys())
+        field = Prompt.ask(
+            "Please indicate the field in the file that contains examples to use as a seed. "
+            f"Available fields: [magenta]{available_fields}"
+        )
+        if field not in raw_data[0]:
+            console.print(
+                f"The field '{field}' is not present in the seed file."
             )
-            if field not in seed_data[0]:
-                console.print(
-                    f"The field '{field}' is not present in the seed file. Please provide a valid field."
-                )
-            else:
-                valid_field = True
+        else:
+            valid_field = True
 
-    console.print("Here is an example from the seed file:")
-    seed_one = seed_data[0][field]
-    console.print(seed_one)
+    console.print("\nHere is an example from the seed file:")
+    seed_one = raw_data[0][field]
+    console.print(f"[green] {seed_one}\n")
 
+    seed_data = [line[field] for line in raw_data]
     return seed_data
 
 
@@ -103,7 +111,6 @@ def grade_examples(
         task = progress.add_task("generation", total=0)
 
         # Step 4
-        client = QuotientClient()
         examples = client.generate_examples(
             generation_type=generation_type,
             description=description,
@@ -173,6 +180,36 @@ def grade_examples(
     console.print()
     return data
 
+def select_next_action():
+    next_action_choices = {
+        1: {
+            "type": "Generate more examples",
+            "description": "Continue grading more examples.",
+        },
+        2: {
+            "type": "View graded examples",
+            "description": "View the graded examples.",
+        },
+        3: {
+            "type": "Stop grading and generate the dataset",
+            "description": "Stop grading and generate the dataset.",
+        },
+    }
+
+    console.print("What would you like to do next?")
+    for index, choice in next_action_choices.items():
+        console.print(
+            f"[magenta]{index}[/magenta]. {choice['type']}: [white]{choice['description']}[/white]",
+            style="yellow",
+        )
+
+    console.print()
+    next_action = IntPrompt.ask(
+        "Choose an option",
+        choices=[str(index) for index in next_action_choices.keys()],
+    )
+    return next_action
+
 
 def generation_workflow(seed: str = None):
     """
@@ -236,11 +273,13 @@ def generation_workflow(seed: str = None):
     )
 
     description = Prompt.ask(
-        "[bold]Please describe in detail what the context is like[/bold]"
+        "[bold]Please describe in detail the context of your problem[/bold]"
     )
 
     # if the seed is not provided, ask the user if they have a seed file
-    seed_data = get_seed_data(seed=seed)
+    seed_data: Optional[List[str]] = get_seed_data(seed=seed)
+    if seed_data:
+        seed_data = seed_data[0]
 
     graded_examples = []
     preferences = []
@@ -276,33 +315,7 @@ def generation_workflow(seed: str = None):
             f"For better results, we recommend grading [red]5 to 10[/red] examples.\n"
         )
 
-        next_action_choices = {
-            1: {
-                "type": "Generate more examples",
-                "description": "Continue grading more examples.",
-            },
-            2: {
-                "type": "View graded examples",
-                "description": "View the graded examples.",
-            },
-            3: {
-                "type": "Stop grading and generate the dataset",
-                "description": "Stop grading and generate the dataset.",
-            },
-        }
-
-        console.print("What would you like to do next?")
-        for index, choice in next_action_choices.items():
-            console.print(
-                f"[magenta]{index}[/magenta]. {choice['type']}: [white]{choice['description']}[/white]",
-                style="yellow",
-            )
-
-        console.print()
-        next_action = IntPrompt.ask(
-            "Choose an option",
-            choices=[str(index) for index in next_action_choices.keys()],
-        )
+        next_action = select_next_action()
 
         if next_action == 1:
             # Generate more examples
@@ -312,26 +325,32 @@ def generation_workflow(seed: str = None):
             )
             continue
         elif next_action == 2:
-            view_examples(graded_examples)
-            if Confirm.ask("Would you like to continue grading more examples?"):
-                num_examples = IntPrompt.ask(
-                    "How many more examples would you like to generate?",
-                    default=3,
-                    min_value=3,
-                    max_value=10,
-                )
-                continue
-            else:
-                # Stop grading and generate the dataset
-                console.print()
-                console.print(
-                    "[bold]🧪 We will now generate a dataset using the graded examples as a seed.[/bold]"
-                )
-                return
+            show_graded_examples(graded_examples)
         else:
             # Stop grading and generate the dataset
             console.print()
+            console.print("Sweet!")
+            num_dataset_examples = IntPrompt.ask(
+                "How many examples would you like to generate for your dataset? [magenta](Max: 1000)[/magenta]",
+            )
+            console.print(
+                f"[bold]🧪 We will now generate a dataset with {num_dataset_examples} examples, using the graded examples as a seed...[/bold]\n"
+            )
+            time.sleep(5)
+            # client.generate_dataset(
+            #     generation_type=generation_type,
+            #     description=description,
+            #     num_examples=num_examples,
+            #     seed_data=seed_data,
+            #     preferences=preferences,
+            # )
+            console.print(
+                "[green][bold]🚀 Dataset request submitted! "
+                "You will soon receive an email with your downloadable dataset![/bold][/green]"
+            )
             console.print(
-                "[bold]🧪 We will now generate a dataset using the graded examples as a seed.[/bold]"
+                "[yellow]Note: If you see the email in your spam folder please "
+                "let us know at [red]support@quotientai.co[/red][/yellow]"
             )
+            time.sleep(0.5)
             return
diff --git a/quotientai/client.py b/quotientai/client.py
@@ -1166,3 +1166,50 @@ def generate_examples(
 
         except Exception as e:
             raise QuotientAIException(f"Failed to generate examples: {str(e)}") from e
+
+    @require_api_key
+    def generate_dataset(
+        self,
+        generation_type: GenerateDatasetType,
+        description: str,
+        num_examples: int = 3,
+        seed_data: str = None,
+        preferences: List[dict] = None,
+    ) -> List[str]:
+        try:
+            url = f"{self.eval_scheduler_url}/generate/dataset"
+
+            headers = {
+                "Authorization": f"Bearer {self.api_key}",
+            }
+            params = {
+                "generation_type": generation_type.value,
+            }
+
+            data = {
+                "inputs": seed_data,
+                "description": description,
+                "num_examples": num_examples,
+                "preferences": preferences,
+            }
+            response = requests.post(
+                url,
+                headers=headers,
+                params=params,
+                json=data,
+            )
+            result = response.json()
+            if response.status_code != 200:
+                if "detail" in result:
+                    raise FastAPIError(response.status_code, result["detail"])
+                else:
+                    response.raise_for_status()
+
+            return result
+        except FastAPIError as fast_err:
+            raise QuotientAIException(
+                f"Failed to generate dataset: {fast_err.status_code} {fast_err.detail}"
+            ) from fast_err
+
+        except Exception as e:
+            raise QuotientAIException(f"Failed to generate dataset: {str(e)}") from e