Issue pytorch#1123 datasets doc improvement (pytorch#1134)
Co-authored-by: Rafi Ayub <[email protected]>
Co-authored-by: RdoubleA <[email protected]>
3 people authored Jul 26, 2024
1 parent e101420 commit 1157b94
Showing 5 changed files with 50 additions and 13 deletions.
13 changes: 12 additions & 1 deletion docs/source/tutorials/datasets.rst
@@ -60,7 +60,7 @@ all of our built-in datasets and dataset builders are using Hugging Face's ``loa
to load in your data, whether local or on the hub.

You can pass in a Hugging Face dataset path to the ``source`` parameter in any of our builders
-to specify which dataset on the hub to download. Additionally, all builders accept
+to specify which dataset on the hub to download or use from a local directory path (see `Local and remote datasets`_). Additionally, all builders accept
any keyword-arguments that ``load_dataset()`` supports. You can see a full list
on Hugging Face's `documentation. <https://huggingface.co/docs/datasets/en/loading>`_
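For context, the behavior described here can be sketched directly with Hugging Face's ``load_dataset`` (illustrative only, not part of the commit; the Hub dataset name and local file path are assumptions):

    from datasets import load_dataset

    # A dataset hosted on the Hugging Face Hub, addressed by repository path.
    hub_data = load_dataset("samsum", split="train")

    # A local dataset: the file type goes in the first (path) argument and the
    # actual file location in data_files; this is how the builders forward it.
    local_data = load_dataset("json", data_files="data/my_data.json", split="train")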

@@ -295,6 +295,17 @@ and create your own class.
dataset.template=import.path.to.CustomTemplate
+torchtune uses :code:`importlib.import_module` (see ``importlib`` `docs <https://docs.python.org/3/library/importlib.html>`_ for more details)
+to locate components from their dotpaths. You can place your custom template class
+in any Python file as long as the file is accessible by Python's import mechanism.
+This means the module should be in a directory that is included in Python's search
+paths (:code:`sys.path`). This often includes:
+
+- The current directory from which your Python interpreter or script is run.
+- Directories where Python packages are installed (like :code:`site-packages`).
+- Any directories added to :code:`sys.path` at runtime using :code:`sys.path.append` or through the :code:`PYTHONPATH` environment variable.
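
To illustrate the search-path mechanics just described (not part of the commit; the module name and directory are hypothetical):

    import sys

    # Make the directory that holds my_templates.py importable at runtime.
    # Equivalent: export PYTHONPATH=/path/to/templates before launching.
    sys.path.append("/path/to/templates")  # hypothetical directory

    # The class is now resolvable by its dotpath, e.g. in a config override:
    #   dataset.template=my_templates.CustomTemplate
    from my_templates import CustomTemplate  # hypothetical module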


Custom chat dataset and chat formats
------------------------------------

16 changes: 12 additions & 4 deletions torchtune/datasets/_chat.py
@@ -46,8 +46,11 @@ class ChatDataset(Dataset):
Args:
tokenizer (ModelTokenizer): Tokenizer used by the model that implements the ``tokenize_messages`` method.
-source (str): path string of dataset, anything supported by Hugging Face's ``load_dataset``
+source (str): path to dataset repository on Hugging Face. For local datasets,
+define source as the data file type (e.g. "json", "csv", "text") and pass
+in the filepath in ``data_files``. See Hugging Face's ``load_dataset``
+(https://huggingface.co/docs/datasets/en/package_reference/loading_methods#datasets.load_dataset.path)
+for more details.
convert_to_messages (Callable[[Mapping[str, Any]], List[Message]]): function that keys into the desired field in the sample
and converts to a list of :class:`~torchtune.data.Message` that follows the Llama format with the expected keys
chat_format (Optional[ChatFormat]): template used to format the chat. This is used to add structured text around the actual
@@ -56,7 +59,8 @@ class ChatDataset(Dataset):
unless you want to structure messages in a particular way for inference.
max_seq_len (int): Maximum number of tokens in the returned input and label token id lists.
train_on_input (bool): Whether the model is trained on the prompt or not. Default is False.
-**load_dataset_kwargs (Dict[str, Any]): additional keyword arguments to pass to ``load_dataset``.
+**load_dataset_kwargs (Dict[str, Any]): additional keyword arguments to pass to ``load_dataset``,
+such as ``data_files`` or ``split``.
"""

def __init__(
@@ -122,8 +126,11 @@ def chat_dataset(
Args:
tokenizer (ModelTokenizer): Tokenizer used by the model that implements the ``tokenize_messages`` method.
-source (str): path string of dataset, anything supported by Hugging Face's ``load_dataset``
+source (str): path to dataset repository on Hugging Face. For local datasets,
+define source as the data file type (e.g. "json", "csv", "text") and pass
+in the filepath in ``data_files``. See Hugging Face's ``load_dataset``
+(https://huggingface.co/docs/datasets/en/package_reference/loading_methods#datasets.load_dataset.path)
+for more details.
conversation_style (str): string specifying expected style of conversations in the dataset
for automatic conversion to the :class:`~torchtune.data.Message` structure. Supported styles are: "sharegpt", "openai"
chat_format (Optional[str]): full import path of :class:`~torchtune.data.ChatFormat` class used to format the messages.
@@ -132,7 +139,8 @@ def chat_dataset(
max_seq_len (int): Maximum number of tokens in the returned input and label token id lists.
train_on_input (bool): Whether the model is trained on the prompt or not. Default is False.
packed (bool): Whether or not to pack the dataset to ``max_seq_len`` prior to training. Default is False.
-**load_dataset_kwargs (Dict[str, Any]): additional keyword arguments to pass to ``load_dataset``.
+**load_dataset_kwargs (Dict[str, Any]): additional keyword arguments to pass to ``load_dataset``,
+such as ``data_files`` or ``split``.
Examples:
>>> from torchtune.datasets import chat_dataset
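As a concrete reading of the updated ``source``/``data_files`` docstring, a minimal sketch (not part of the commit; ``my_tokenizer`` stands in for any torchtune ``ModelTokenizer`` and the file path is hypothetical):

    from torchtune.datasets import chat_dataset

    ds = chat_dataset(
        tokenizer=my_tokenizer,               # placeholder: any ModelTokenizer
        source="json",                        # local file type, not a Hub path
        data_files="data/my_chat_data.json",  # hypothetical local file
        conversation_style="sharegpt",        # samples follow the ShareGPT schema
        max_seq_len=2048,
        split="train",                        # forwarded to load_dataset()
    )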
18 changes: 14 additions & 4 deletions torchtune/datasets/_instruct.py
@@ -39,19 +39,25 @@ class InstructDataset(Dataset):
Args:
tokenizer (ModelTokenizer): Tokenizer used by the model that implements the ``tokenize_messages`` method.
-source (str): path string of dataset, anything supported by Hugging Face's ``load_dataset``
+source (str): path to dataset repository on Hugging Face. For local datasets,
+define source as the data file type (e.g. "json", "csv", "text") and pass
+in the filepath in ``data_files``. See Hugging Face's ``load_dataset``
+(https://huggingface.co/docs/datasets/en/package_reference/loading_methods#datasets.load_dataset.path)
+for more details.
template (InstructTemplate): template used to format the prompt. If the placeholder variable
names in the template do not match the column/key names in the dataset, use ``column_map`` to map them.
transform (Optional[Callable]): transform to apply to the sample before formatting to the template.
Default is None.
column_map (Optional[Dict[str, str]]): a mapping from the expected placeholder names in the template
to the column/key names in the sample. If None, assume these are identical.
The output column can be indicated using the ``output`` key mapping.
If no placeholder for the ``output`` column is provided in ``column_map`` it is assumed to be ``output``.
train_on_input (bool): Whether the model is trained on the prompt or not. Default is False.
max_seq_len (Optional[int]): Maximum number of tokens in the returned input and label token id lists.
Default is None, disabling truncation. We recommend setting this to the highest you can fit in memory
and is supported by the model. For example, llama2-7B supports up to 4096 for sequence length.
-**load_dataset_kwargs (Dict[str, Any]): additional keyword arguments to pass to ``load_dataset``.
+**load_dataset_kwargs (Dict[str, Any]): additional keyword arguments to pass to ``load_dataset``,
+such as ``data_files`` or ``split``.
"""

def __init__(
@@ -130,8 +136,11 @@ def instruct_dataset(
Args:
tokenizer (ModelTokenizer): Tokenizer used by the model that implements the ``tokenize_messages`` method.
-source (str): path string of dataset, anything supported by Hugging Face's ``load_dataset``
+source (str): path to dataset repository on Hugging Face. For local datasets,
+define source as the data file type (e.g. "json", "csv", "text") and pass
+in the filepath in ``data_files``. See Hugging Face's ``load_dataset``
+(https://huggingface.co/docs/datasets/en/package_reference/loading_methods#datasets.load_dataset.path)
+for more details.
template (str): full import path of class used to format the prompt. If the placeholder variable
names in the template do not match the column/key names in the dataset, use ``column_map`` to map them.
column_map (Optional[Dict[str, str]]): a mapping from the expected placeholder names in the template
@@ -141,7 +150,8 @@ def instruct_dataset(
Default is None, disabling truncation. We recommend setting this to the highest you can fit in memory
and is supported by the model. For example, llama2-7B supports up to 4096 for sequence length.
packed (bool): Whether or not to pack the dataset to ``max_seq_len`` prior to training. Default is False.
-**load_dataset_kwargs (Dict[str, Any]): additional keyword arguments to pass to ``load_dataset``.
+**load_dataset_kwargs (Dict[str, Any]): additional keyword arguments to pass to ``load_dataset``,
+such as ``data_files`` or ``split``.
Examples:
>>> from torchtune.datasets import instruct_dataset
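A similar sketch for this builder, adding a ``column_map`` for a local CSV (not part of the commit; the path and column names are hypothetical, and ``AlpacaInstructTemplate`` is assumed as the template):

    from torchtune.datasets import instruct_dataset

    ds = instruct_dataset(
        tokenizer=my_tokenizer,                            # placeholder: any ModelTokenizer
        source="csv",                                      # local file type
        data_files="data/my_data.csv",                     # hypothetical local file
        template="torchtune.data.AlpacaInstructTemplate",  # full import path, per the docstring
        column_map={"instruction": "prompt", "output": "response"},  # placeholder -> column
        split="train",
    )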
8 changes: 6 additions & 2 deletions torchtune/datasets/_preference.py
@@ -28,8 +28,11 @@ class PreferenceDataset(Dataset):
Args:
tokenizer (ModelTokenizer): Tokenizer used by the model that implements the ``tokenize_messages`` method.
-source (str): path string of dataset, anything supported by Hugging Face's ``load_dataset``
+source (str): path to dataset repository on Hugging Face. For local datasets,
+define source as the data file type (e.g. "json", "csv", "text") and pass
+in the filepath in ``data_files``. See Hugging Face's ``load_dataset``
+(https://huggingface.co/docs/datasets/en/package_reference/loading_methods#datasets.load_dataset.path)
+for more details.
template (InstructTemplate): template used to format the prompt. If the placeholder variable
names in the template do not match the column/key names in the dataset, use ``column_map`` to map them.
transform (Optional[Callable]): transform to apply to the sample before formatting to the template.
@@ -39,7 +42,8 @@ class PreferenceDataset(Dataset):
max_seq_len (Optional[int]): Maximum number of tokens in the returned input and label token id lists.
Default is None, disabling truncation. We recommend setting this to the highest you can fit in memory
and is supported by the model. For example, llama2-7B supports up to 4096 for sequence length.
-**load_dataset_kwargs (Dict[str, Any]): additional keyword arguments to pass to ``load_dataset``.
+**load_dataset_kwargs (Dict[str, Any]): additional keyword arguments to pass to ``load_dataset``,
+such as ``data_files`` or ``split``.
"""

def __init__(
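For the preference case, a loose sketch of what the updated docstring implies (not part of the commit; the template class and file path are assumptions, and the exact constructor signature may differ from what this truncated hunk shows):

    from torchtune.data import StackExchangedPairedTemplate  # assumed template class
    from torchtune.datasets import PreferenceDataset

    ds = PreferenceDataset(
        tokenizer=my_tokenizer,                   # placeholder: any ModelTokenizer
        source="json",                            # local file type
        data_files="data/preference_pairs.json",  # hypothetical local file
        template=StackExchangedPairedTemplate,
        max_seq_len=1024,
        split="train",
    )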
8 changes: 6 additions & 2 deletions torchtune/datasets/_text_completion.py
@@ -20,16 +20,20 @@ class TextCompletionDataset(Dataset):
Args:
tokenizer (ModelTokenizer): Tokenizer used by the model that implements the ``tokenize_messages`` method.
-source (str): path string of dataset, anything supported by Hugging Face's ``load_dataset``
+source (str): path to dataset repository on Hugging Face. For local datasets,
+define source as the data file type (e.g. "json", "csv", "text") and pass
+in the filepath in ``data_files``. See Hugging Face's ``load_dataset``
+(https://huggingface.co/docs/datasets/en/package_reference/loading_methods#datasets.load_dataset.path)
+for more details.
column (str): name of column in the sample that contains the text data. This is typically required
for Hugging Face datasets or tabular data. For local datasets with a single column, use the default "text",
which is what is assigned by Hugging Face datasets when loaded into memory. Default is "text".
max_seq_len (Optional[int]): Maximum number of tokens in the returned input and label token id lists.
Default is None, disabling truncation. We recommend setting this to the highest you can fit in memory
and is supported by the model. For example, llama2-7B supports up to 4096 for sequence length.
add_eos (bool): Whether to add an EOS token to the end of the sequence. Default is True.
-**load_dataset_kwargs (Dict[str, Any]): additional keyword arguments to pass to ``load_dataset``.
+**load_dataset_kwargs (Dict[str, Any]): additional keyword arguments to pass to ``load_dataset``,
+such as ``data_files`` or ``split``.
"""

def __init__(
Expand Down
