Merge pull request #73 from pinecone-io/special-tokens-as-natural-text
OpenAI tokenizer to treat special tokens as natural text
acatav authored Oct 17, 2023
2 parents 47c67ad + 444d7de commit 9b15cde
Showing 2 changed files with 15 additions and 13 deletions.
19 changes: 6 additions & 13 deletions src/resin/tokenizer/openai.py
@@ -1,7 +1,7 @@
 import tiktoken
 from typing import List
 from .base import BaseTokenizer
-from ..models.data_models import Messages, MessageBase, Role
+from ..models.data_models import Messages
 
 
 class OpenAITokenizer(BaseTokenizer):
@@ -14,15 +14,18 @@ def __init__(self, model_name: str = "gpt-3.5-turbo"):
 
     def tokenize(self, text: str) -> List[str]:
         return [self._encoder.decode([encoded_token])
-                for encoded_token in self._encoder.encode(text)]
+                for encoded_token in self._encode(text)]
 
     def detokenize(self, tokens: List[str]) -> str:
         if not isinstance(tokens, List):
            raise TypeError(f"detokenize expects List[str], got {type(tokens)}")
         return "".join(tokens)
 
     def token_count(self, text: str) -> int:
-        return len(self._encoder.encode(text))
+        return len(self._encode(text))
+
+    def _encode(self, text):
+        return self._encoder.encode(text, disallowed_special=())
 
     def messages_token_count(self, messages: Messages) -> int:
         # Adapted from: https://github.com/openai/openai-cookbook/.../How_to_format_inputs_to_ChatGPT_models.ipynb  # noqa
@@ -33,13 +36,3 @@ def messages_token_count(self, messages: Messages) -> int:
             num_tokens += self.token_count(value)
         num_tokens += self.FIXED_PREFIX_TOKENS
         return num_tokens
-
-    @staticmethod
-    def test_messages_token_count(tokenizer):
-        messages = [MessageBase(role=Role.USER, content="hello"),
-                    MessageBase(role=Role.ASSISTANT, content="hi")]
-        assert tokenizer.messages_token_count(messages) == 11
-
-    @staticmethod
-    def test_messages_token_count_empty_messages(tokenizer):
-        assert tokenizer.messages_token_count([]) == 0
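
For context, a minimal sketch (not part of the commit) of the tiktoken behavior the new _encode helper works around: Encoding.encode defaults to disallowed_special="all" and raises a ValueError when the input contains a special-token string such as "<|endoftext|>", while passing disallowed_special=() encodes that string as ordinary text.

import tiktoken

enc = tiktoken.encoding_for_model("gpt-3.5-turbo")

try:
    # Default behavior: disallowed_special="all" rejects special-token strings
    enc.encode("<|endoftext|>")
except ValueError as err:
    print("default encode() raised:", err)

# disallowed_special=() treats the special-token string as natural text
token_ids = enc.encode("<|endoftext|>", disallowed_special=())
print([enc.decode([t]) for t in token_ids])
# expected, per the test added below: ['<', '|', 'endo', 'ft', 'ext', '|', '>']
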
9 changes: 9 additions & 0 deletions tests/unit/tokenizer/test_openai_tokenizer.py
@@ -33,3 +33,12 @@ def test_messages_token_count(tokenizer):
     @staticmethod
     def test_messages_token_count_empty_messages(tokenizer):
         assert tokenizer.messages_token_count([]) == 3
+
+    @staticmethod
+    def test_special_tokens_to_natural_text(tokenizer):
+        tokens = tokenizer.tokenize("<|endoftext|>")
+        assert tokens == ['<', '|', 'endo', 'ft', 'ext', '|', '>']
+
+        assert tokenizer.detokenize(tokens) == "<|endoftext|>"
+
+        assert tokenizer.token_count("<|endoftext|>") == 7
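
A hedged usage sketch mirroring the new test; the import path resin.tokenizer.openai is assumed from this repository's src/resin/tokenizer/openai.py layout and may differ for an installed package.

# Assumed import path, derived from src/resin/tokenizer/openai.py
from resin.tokenizer.openai import OpenAITokenizer

tokenizer = OpenAITokenizer(model_name="gpt-3.5-turbo")

tokens = tokenizer.tokenize("<|endoftext|>")   # no ValueError after this change
print(tokens)                                  # ['<', '|', 'endo', 'ft', 'ext', '|', '>']
print(tokenizer.detokenize(tokens))            # <|endoftext|>
print(tokenizer.token_count("<|endoftext|>"))  # 7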
