
DF data validation cli #74

Merged · 4 commits, merged on Oct 18, 2023
19 changes: 6 additions & 13 deletions src/resin/tokenizer/openai.py
@@ -1,7 +1,7 @@
 import tiktoken
 from typing import List
 from .base import BaseTokenizer
-from ..models.data_models import Messages, MessageBase, Role
+from ..models.data_models import Messages


 class OpenAITokenizer(BaseTokenizer):
@@ -14,15 +14,18 @@ def __init__(self, model_name: str = "gpt-3.5-turbo"):

     def tokenize(self, text: str) -> List[str]:
         return [self._encoder.decode([encoded_token])
-                for encoded_token in self._encoder.encode(text)]
+                for encoded_token in self._encode(text)]

     def detokenize(self, tokens: List[str]) -> str:
         if not isinstance(tokens, List):
             raise TypeError(f"detokenize expect List[str], got f{type(tokens)}")
         return "".join(tokens)

     def token_count(self, text: str) -> int:
-        return len(self._encoder.encode(text))
+        return len(self._encode(text))

+    def _encode(self, text):
+        return self._encoder.encode(text, disallowed_special=())
+
     def messages_token_count(self, messages: Messages) -> int:
         # Adapted from: https://github.com/openai/openai-cookbook/.../How_to_format_inputs_to_ChatGPT_models.ipynb  # noqa
@@ -33,13 +36,3 @@ def messages_token_count(self, messages: Messages) -> int:
             num_tokens += self.token_count(value)
         num_tokens += self.FIXED_PREFIX_TOKENS
         return num_tokens
-
-    @staticmethod
-    def test_messages_token_count(tokenizer):
-        messages = [MessageBase(role=Role.USER, content="hello"),
-                    MessageBase(role=Role.ASSISTANT, content="hi")]
-        assert tokenizer.messages_token_count(messages) == 11
-
-    @staticmethod
-    def test_messages_token_count_empty_messages(tokenizer):
-        assert tokenizer.messages_token_count([]) == 0
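For context on the new _encode() helper: by default, tiktoken's encode() refuses special tokens such as "<|endoftext|>" and raises ValueError; passing disallowed_special=() tells it to encode them as ordinary text instead. A minimal sketch of the difference (assumes tiktoken is installed; the encoding name here is just an example):

import tiktoken

enc = tiktoken.get_encoding("cl100k_base")

# Default: special tokens are disallowed and encode() raises ValueError.
try:
    enc.encode("<|endoftext|>")
except ValueError:
    print("special token rejected by default")

# With disallowed_special=(), the string is encoded as plain text.
ids = enc.encode("<|endoftext|>", disallowed_special=())
print([enc.decode([i]) for i in ids])  # e.g. ['<', '|', 'endo', 'ft', 'ext', '|', '>']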
11 changes: 7 additions & 4 deletions src/resin_cli/cli.py
@@ -100,11 +100,11 @@ def health(host, port, ssl):
@click.argument("index-name", nargs=1, envvar="INDEX_NAME", type=str, required=True)
@click.option("--tokenizer-model", default="gpt-3.5-turbo", help="Tokenizer model")
def new(index_name, tokenizer_model):
Tokenizer.initialize(OpenAITokenizer, model_name=tokenizer_model)
kb = KnowledgeBase(index_name=index_name)
click.echo("Resin is going to create a new index: ", nl=False)
click.echo(click.style(f"{kb.index_name}", fg="green"))
click.confirm(click.style("Do you want to continue?", fg="red"), abort=True)
Tokenizer.initialize(OpenAITokenizer, tokenizer_model)
with spinner:
kb.create_resin_index()
click.echo(click.style("Success!", fg="green"))
@@ -126,7 +126,7 @@ def upsert(index_name, data_path, tokenizer_model):
                '`export INDEX_NAME="MY_INDEX_NAME`')
         click.echo(click.style(msg, fg="red"), err=True)
         sys.exit(1)
-    Tokenizer.initialize(OpenAITokenizer, tokenizer_model)
+    Tokenizer.initialize(OpenAITokenizer, model_name=tokenizer_model)
     if data_path is None:
         msg = ("Data path is not provided," +
                " please provide it with --data-path or set it with env var")
@@ -173,8 +173,11 @@ def upsert(index_name, data_path, tokenizer_model):
         click.echo(click.style(msg, fg="red"), err=True)
         sys.exit(1)
     pd.options.display.max_colwidth = 20
-    click.echo(data[0].json(exclude_none=True, indent=2))
-    click.confirm(click.style("\nDoes this data look right?", fg="red"), abort=True)
+
+    click.echo(pd.DataFrame([doc.dict(exclude_none=True) for doc in data[:5]]))
+    click.echo(click.style(f"\nTotal records: {len(data)}"))
+    click.confirm(click.style("\nDoes this data look right?", fg="red"),
+                  abort=True)
     kb.upsert(data)
     click.echo(click.style("Success!", fg="green"))
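The upsert preview now renders the first five records as a pandas DataFrame plus a total count, rather than dumping only the first record's JSON. A rough sketch of what that preview produces (the Document model below is a hypothetical stand-in; the real schema lives in resin's data models):

from typing import Optional

import pandas as pd
from pydantic import BaseModel


class Document(BaseModel):
    # Hypothetical stand-in for resin's document schema.
    id: str
    text: str
    source: Optional[str] = None


data = [Document(id=str(i), text=f"example text {i}", source="example.txt")
        for i in range(8)]

pd.options.display.max_colwidth = 20
# Show only the first five records, as the updated CLI does.
print(pd.DataFrame([doc.dict(exclude_none=True) for doc in data[:5]]))
print(f"\nTotal records: {len(data)}")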
9 changes: 9 additions & 0 deletions tests/unit/tokenizer/test_openai_tokenizer.py
@@ -33,3 +33,12 @@ def test_messages_token_count(tokenizer):
     @staticmethod
     def test_messages_token_count_empty_messages(tokenizer):
         assert tokenizer.messages_token_count([]) == 3
+
+    @staticmethod
+    def test_special_tokens_to_natural_text(tokenizer):
+        tokens = tokenizer.tokenize("<|endoftext|>")
+        assert tokens == ['<', '|', 'endo', 'ft', 'ext', '|', '>']
+
+        assert tokenizer.detokenize(tokens) == "<|endoftext|>"
+
+        assert tokenizer.token_count("<|endoftext|>") == 7
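These tests receive a tokenizer fixture defined elsewhere in the test module. A plausible sketch, assuming it simply constructs an OpenAITokenizer (the fixture body is an assumption, not shown in this diff):

import pytest

from resin.tokenizer.openai import OpenAITokenizer


@pytest.fixture
def tokenizer():
    # Assumed: the real fixture may live in a base test class or conftest.py.
    return OpenAITokenizer(model_name="gpt-3.5-turbo")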