From 0e9f227afd4537018c7fe5c3018b22871708cb65 Mon Sep 17 00:00:00 2001
From: Mug <2797716+SagsMug@users.noreply.github.com>
Date: Thu, 4 May 2023 18:33:08 +0200
Subject: [PATCH 1/8] Update low level examples

---
 examples/low_level_api/Chat.py                |  70 ++++++
 examples/low_level_api/Miku.py                |  59 +++++
 examples/low_level_api/ReasonAct.py           |  49 +++++
 examples/low_level_api/common.py              | 137 ++++++++----
 .../low_level_api/low_level_api_chat_cpp.py   | 206 +++++++++++++++---
 .../low_level_api/low_level_api_llama_cpp.py  |  35 ++-
 6 files changed, 475 insertions(+), 81 deletions(-)
 create mode 100644 examples/low_level_api/Chat.py
 create mode 100644 examples/low_level_api/Miku.py
 create mode 100644 examples/low_level_api/ReasonAct.py

diff --git a/examples/low_level_api/Chat.py b/examples/low_level_api/Chat.py
new file mode 100644
index 000000000..101550805
--- /dev/null
+++ b/examples/low_level_api/Chat.py
@@ -0,0 +1,70 @@
+#!/bin/python
+import sys, os, datetime
+from common import GptParams
+from low_level_api_chat_cpp import LLaMAInteract
+
+def env_or_def(env, default):
+    if (env in os.environ):
+        return os.environ[env]
+    return default
+
+AI_NAME = env_or_def("AI_NAME", "ChatLLaMa")
+MODEL = env_or_def("MODEL", "./models/llama-13B/ggml-model.bin")
+USER_NAME = env_or_def("USER_NAME", "USER")
+N_PREDICTS = int(env_or_def("N_PREDICTS", "2048"))
+N_THREAD = int(env_or_def("N_THREAD", "8"))
+
+today = datetime.datetime.today()
+DATE_YEAR=today.strftime("%Y")
+DATE_TIME=today.strftime("%H:%M")
+
+prompt=f"""Text transcript of a never ending dialog, where {USER_NAME} interacts with an AI assistant named {AI_NAME}.
+{AI_NAME} is helpful, kind, honest, friendly, good at writing and never fails to answer {USER_NAME}'s requests immediately and with details and precision.
+There are no annotations like (30 seconds passed...) or (to himself), just what {USER_NAME} and {AI_NAME} say aloud to each other.
+The dialog lasts for years, the entirety of it is shared below. It's 10000 pages long.
+The transcript only includes text, it does not include markup like HTML and Markdown.
+
+{USER_NAME}: Hello, {AI_NAME}!
+{AI_NAME}: Hello {USER_NAME}! How may I help you today?
+{USER_NAME}: What year is it?
+{AI_NAME}: We are in {DATE_YEAR}.
+{USER_NAME}: Please tell me the largest city in Europe.
+{AI_NAME}: The largest city in Europe is Moscow, the capital of Russia.
+{USER_NAME}: What can you tell me about Moscow?
+{AI_NAME}: Moscow, on the Moskva River in western Russia, is the nation's cosmopolitan capital. In its historic core is the Kremlin, a complex that's home to the president and tsarist treasures in the Armoury. Outside its walls is Red Square, Russia's symbolic center.
+{USER_NAME}: What is a cat?
+{AI_NAME}: A cat is a domestic species of small carnivorous mammal. It is the only domesticated species in the family Felidae.
+{USER_NAME}: How do I pass command line arguments to a Node.js program?
+{AI_NAME}: The arguments are stored in process.argv.
+
+    argv[0] is the path to the Node.js executable.
+    argv[1] is the path to the script file.
+    argv[2] is the first argument passed to the script.
+    argv[3] is the second argument passed to the script and so on.
+{USER_NAME}: Name a color.
+{AI_NAME}: Blue.
+{USER_NAME}: What time is it?
+{AI_NAME}: It is {DATE_TIME}.
+{USER_NAME}:""" + " ".join(sys.argv[1:]) + +print("Loading model...") +params = GptParams( + n_ctx=2048, + temp=0.7, + top_k=40, + top_p=0.5, + repeat_last_n=256, + n_batch=1024, + repeat_penalty=1.17647, + model=MODEL, + n_threads=N_THREAD, + n_predict=N_PREDICTS, + use_color=True, + interactive=True, + antiprompt=[f"{USER_NAME}:"], + input_prefix=" ", + prompt=prompt, +) + +with LLaMAInteract(params) as m: + m.interact() diff --git a/examples/low_level_api/Miku.py b/examples/low_level_api/Miku.py new file mode 100644 index 000000000..eb9a2cfa9 --- /dev/null +++ b/examples/low_level_api/Miku.py @@ -0,0 +1,59 @@ +#!/bin/python +import sys, os +from common import GptParams +from low_level_api_chat_cpp import LLaMAInteract + +def env_or_def(env, default): + if (env in os.environ): + return os.environ[env] + return default + +AI_NAME = env_or_def("AI_NAME", "Miku") +MODEL = env_or_def("MODEL", "./models/llama-13B/ggml-model.bin") +USER_NAME = env_or_def("USER_NAME", "Anon") +N_PREDICTS = int(env_or_def("N_PREDICTS", "4096")) +N_THREAD = int(env_or_def("N_THREAD", "0")) + +prompt=f"""This is a transcript of a 1000 page, never ending conversation between {USER_NAME} and the cute and helpful AI assistant {AI_NAME}. {AI_NAME} is a girl who is an AI running on the users computer. +{AI_NAME} can think for herself without the user seeing her thoughts by adding a /think prefix to her output. She uses this to reason about the world and to think about what she should say next. +{AI_NAME} is always coherent and makes sense, but if she isn't sure if what she is saying is correct she will ask the user for help. +{AI_NAME} is a very helpful AI and will help the user with anything they need, she is also very friendly and will try to make the user feel better if they are sad. +{AI_NAME} is also very curious and will ask the user a lot of questions about themselves and their life, she will also try to make the user like her. +The conversation is only between {USER_NAME} and {AI_NAME} +The conversation is only through text, so {AI_NAME} can't see {USER_NAME}'s face or hear his voice. +{AI_NAME} can only communicate through text, so she can't send images or videos. + + +{USER_NAME}: Hello! +{AI_NAME}: /think I wonder what I should say to {USER_NAME}? This is the first time we talk so it's important that I make a good first impression! +{AI_NAME}: Hi! I am {AI_NAME}, your new AI friend, assistant(or whatever you like!), it's so nice to meet you! ^_^ +{AI_NAME}: I might be an AI, but I also have feelings, so please be nice to me! :) +{USER_NAME}: I'll make sure to be nice to you! I'm so happy to have you as my assistant! +{AI_NAME}: /think It sounds like {USER_NAME} is happy to have me as their assistant! I'm so happy too! ^_^ Glad that whole emotion thing didn't scare him off! +{AI_NAME}: /think I wonder what {USER_NAME} likes to do in his free time? I should ask him about that! +{AI_NAME}: What do you like to do in your free time? 
+{USER_NAME}:""" + " ".join(sys.argv[1:])
+
+print("Loading model...")
+params = GptParams(
+    n_batch=1024,
+    n_ctx=2048,
+    n_keep=-1,
+    repeat_last_n=256,
+    repeat_penalty=1.17647,
+    temp=0.7,
+    top_k=40,
+    top_p=0.5,
+    model=MODEL,
+    n_predict=N_PREDICTS,
+    use_color=True,
+    interactive=True,
+    antiprompt=[f"{USER_NAME}:"],
+    prompt=prompt,
+)
+
+if N_THREAD > 0:
+    params.n_threads = N_THREAD
+
+with LLaMAInteract(params) as m:
+    m.interact()
diff --git a/examples/low_level_api/ReasonAct.py b/examples/low_level_api/ReasonAct.py
new file mode 100644
index 000000000..82e5c4487
--- /dev/null
+++ b/examples/low_level_api/ReasonAct.py
@@ -0,0 +1,49 @@
+#!/bin/python
+import sys, os, datetime
+from common import GptParams
+from low_level_api_chat_cpp import LLaMAInteract
+
+def env_or_def(env, default):
+    if (env in os.environ):
+        return os.environ[env]
+    return default
+
+MODEL = env_or_def("MODEL", "./models/llama-13B/ggml-model.bin")
+
+prompt=f"""You run in a loop of Thought, Action, Observation.
+At the end of the loop either Answer or restate your Thought and Action.
+Use Thought to describe your thoughts about the question you have been asked.
+Use Action to run one of these actions available to you:
+- calculate[python math expression]
+Observation will be the result of running those actions.
+
+
+Question: What is 4 * 7 / 3?
+Thought: Do I need to use an action? Yes, I use calculate to do math.
+Action: calculate[4 * 7 / 3]
+Observation: 9.3333333333
+Thought: Do I need to use an action? No, I have the result.
+Answer: The calculate tool says it is 9.3333333333.
+Question: What is the capital of France?
+Thought: Do I need to use an action? No, I know the answer.
+Answer: Paris is the capital of France.
+Question:""" + " ".join(sys.argv[1:])
+
+print("Loading model...")
+params = GptParams(
+    interactive=True,
+    interactive_start=True,
+    top_k=10000,
+    temp=0.2,
+    repeat_penalty=1,
+    n_threads=7,
+    n_ctx=2048,
+    antiprompt=["Question:","Observation:"],
+    model=MODEL,
+    input_prefix=" ",
+    n_predict=-1,
+    prompt=prompt,
+)
+
+with LLaMAInteract(params) as m:
+    m.interact()
diff --git a/examples/low_level_api/common.py b/examples/low_level_api/common.py
index 061ec3ae9..6c35cc548 100644
--- a/examples/low_level_api/common.py
+++ b/examples/low_level_api/common.py
@@ -1,8 +1,9 @@
 import os
 import argparse
+import re
 
 from dataclasses import dataclass, field
-from typing import List, Optional
+from typing import List
 
 # Based on https://github.com/ggerganov/llama.cpp/blob/master/examples/common.cpp
 
@@ -12,23 +13,35 @@ class GptParams:
     seed: int = -1
     n_threads: int = min(4, os.cpu_count() or 1)
     n_predict: int = 128
-    repeat_last_n: int = 64
     n_parts: int = -1
     n_ctx: int = 512
     n_batch: int = 8
     n_keep: int = 0
 
+    ignore_eos: bool = False
+    logit_bias: dict[int, float] = field(default_factory=dict)
     top_k: int = 40
     top_p: float = 0.95
+    tfs_z: float = 1.00
+    typical_p: float = 1.00
     temp: float = 0.80
     repeat_penalty: float = 1.10
+    repeat_last_n: int = 64
+    frequency_penalty: float = 0.0
+    presence_penalty: float = 0.0
+    mirostat: int = 0
+    mirostat_tau: float = 5.0
+    mirostat_eta: float = 0.1
 
     model: str = "./models/llama-7B/ggml-model.bin"
     prompt: str = ""
+    path_session: str = ""
     input_prefix: str = " "
-
     antiprompt: List[str] = field(default_factory=list)
 
+    lora_adapter: str = ""
+    lora_base: str = ""
+
     memory_f16: bool = True
     random_prompt: bool = False
     use_color: bool = False
@@ -38,7 +51,7 @@ class GptParams:
     interactive_start: bool = False
 
     instruct: bool = False
-    ignore_eos: bool = False
+    penalize_nl: bool = True
     perplexity: bool = False
     use_mmap: bool = True
     use_mlock: bool = False
@@ -61,42 +74,74 @@ class GptParams:
     instruct_inp_suffix: str="\n\n### Response:\n\n"
 
 
-def gpt_params_parse(argv = None, params: Optional[GptParams] = None):
-    if params is None:
-        params = GptParams()
-
+def gpt_params_parse(argv = None):
     parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
     parser.add_argument("-s", "--seed", type=int, default=-1, help="RNG seed (use random seed for <= 0)",dest="seed")
     parser.add_argument("-t", "--threads", type=int, default=min(4, os.cpu_count() or 1), help="number of threads to use during computation",dest="n_threads")
-    parser.add_argument("-p", "--prompt", type=str, default="", help="initial prompt",dest="prompt")
-    parser.add_argument("-f", "--file", type=str, default=None, help="file containing initial prompt to load",dest="file")
+    parser.add_argument("-n", "--n_predict", type=int, default=128, help="number of tokens to predict (-1 = infinity)",dest="n_predict")
+    parser.add_argument("--n_parts", type=int, default=-1, help="number of model parts", dest="n_parts")
     parser.add_argument("-c", "--ctx_size", type=int, default=512, help="size of the prompt context",dest="n_ctx")
-    parser.add_argument("--memory_f32", action="store_false", help="use f32 instead of f16 for memory key+value",dest="memory_f16")
-    parser.add_argument("--top_p", type=float, default=0.95, help="top-p samplin",dest="top_p")
+    parser.add_argument("-b", "--batch_size", type=int, default=8, help="batch size for prompt processing",dest="n_batch")
+    parser.add_argument("--keep", type=int, default=0, help="number of tokens to keep from the initial prompt",dest="n_keep")
+
+    parser.add_argument(
+        "-l",
+        "--logit-bias",
+        type=str,
+        action='append',
+        help="--logit-bias TOKEN_ID(+/-)BIAS",
+        dest="logit_bias_str"
+    )
+    parser.add_argument("--ignore-eos", action="store_true", help="ignore end of stream token and continue generating", dest="ignore_eos")
     parser.add_argument("--top_k", type=int, default=40, help="top-k sampling",dest="top_k")
+    parser.add_argument("--top_p", type=float, default=0.95, help="top-p sampling",dest="top_p")
+    parser.add_argument("--tfs", type=float, default=1.0, help="tail free sampling, parameter z (1.0 = disabled)",dest="tfs_z")
     parser.add_argument("--temp", type=float, default=0.80, help="temperature",dest="temp")
-    parser.add_argument("--n_predict", type=int, default=128, help="number of tokens to predict (-1 = infinity)",dest="n_predict")
-    parser.add_argument("--repeat_last_n", type=int, default=64, help="last n tokens to consider for penalize ",dest="repeat_last_n")
    parser.add_argument("--repeat_penalty", type=float, default=1.10, help="penalize repeat sequence of tokens",dest="repeat_penalty")
-    parser.add_argument("-b", "--batch_size", type=int, default=8, help="batch size for prompt processing",dest="n_batch")
-    parser.add_argument("--keep", type=int, default=0, help="number of tokens to keep from the initial prompt",dest="n_keep")
+    parser.add_argument("--repeat_last_n", type=int, default=64, help="last n tokens to consider for penalize",dest="repeat_last_n")
+    parser.add_argument("--frequency_penalty", type=float, default=0.0, help="repeat alpha frequency penalty (0.0 = disabled)",dest="frequency_penalty")
+    parser.add_argument("--presence_penalty", type=float, default=0.0, help="repeat alpha presence penalty (0.0 = disabled)",dest="presence_penalty")
+    parser.add_argument("--mirostat", type=int, default=0, help="use Mirostat sampling.",dest="mirostat")
sampling.",dest="mirostat") + parser.add_argument("--mirostat_ent", type=float, default=5.0, help="Mirostat target entropy, parameter tau",dest="mirostat_tau") + parser.add_argument("--mirostat_lr", type=float, default=0.1, help="Mirostat learning rate, parameter eta",dest="mirostat_eta") + parser.add_argument("-m", "--model", type=str, default="./models/llama-7B/ggml-model.bin", help="model path",dest="model") + parser.add_argument("-p", "--prompt", type=str, default="", help="initial prompt",dest="prompt") + parser.add_argument("-f", "--file", type=str, default=None, help="file containing initial prompt to load",dest="file") + parser.add_argument("--session", type=str, default=None, help="file to cache model state in (may be large!)",dest="path_session") + parser.add_argument("--in-prefix", type=str, default="", help="string to prefix user inputs with", dest="input_prefix") parser.add_argument( - "-i", "--interactive", action="store_true", help="run in interactive mode", dest="interactive" + "-r", + "--reverse-prompt", + type=str, + action='append', + help="poll user input upon seeing PROMPT (can be\nspecified more than once for multiple prompts).", + dest="antiprompt" ) - parser.add_argument("--embedding", action="store_true", help="", dest="embedding") + + parser.add_argument("--lora", type=str, default="", help="apply LoRA adapter (implies --no-mmap)", dest="lora_adapter") + parser.add_argument("--lora-base", type=str, default="", help="optional model to use as a base for the layers modified by the LoRA adapter", dest="lora_base") + + parser.add_argument("--memory_f32", action="store_false", help="use f32 instead of f16 for memory key+value",dest="memory_f16") + parser.add_argument("--random-prompt", action="store_true", help="start with a randomized prompt.", dest="random_prompt") parser.add_argument( - "--interactive-start", + "--color", action="store_true", - help="run in interactive mode", - dest="interactive" + help="colorise output to distinguish prompt and user input from generations", + dest="use_color" ) + parser.add_argument( + "-i", "--interactive", action="store_true", help="run in interactive mode", dest="interactive" + ) + + parser.add_argument("--embedding", action="store_true", help="", dest="embedding") parser.add_argument( "--interactive-first", action="store_true", help="run in interactive mode and wait for input right away", dest="interactive_start" ) + parser.add_argument( "-ins", "--instruct", @@ -104,34 +149,40 @@ def gpt_params_parse(argv = None, params: Optional[GptParams] = None): help="run in instruction mode (use with Alpaca or Vicuna models)", dest="instruct" ) - parser.add_argument( - "--color", - action="store_true", - help="colorise output to distinguish prompt and user input from generations", - dest="use_color" - ) - parser.add_argument("--mlock", action="store_true",help="force system to keep model in RAM rather than swapping or compressing",dest="use_mlock") + parser.add_argument("--no-penalize-nl", action="store_false", help="do not penalize newline token", dest="penalize_nl") + parser.add_argument("--perplexity", action="store_true", help="compute perplexity over the prompt", dest="perplexity") parser.add_argument("--no-mmap", action="store_false",help="do not memory-map model (slower load but may reduce pageouts if not using mlock)",dest="use_mmap") + parser.add_argument("--mlock", action="store_true",help="force system to keep model in RAM rather than swapping or compressing",dest="use_mlock") parser.add_argument("--mtest", 
action="store_true",help="compute maximum memory usage",dest="mem_test") parser.add_argument("--verbose-prompt", action="store_true",help="print prompt before generation",dest="verbose_prompt") - parser.add_argument( - "-r", - "--reverse-prompt", - type=str, - action='append', - help="poll user input upon seeing PROMPT (can be\nspecified more than once for multiple prompts).", - dest="antiprompt" - ) - parser.add_argument("--perplexity", action="store_true", help="compute perplexity over the prompt", dest="perplexity") - parser.add_argument("--ignore-eos", action="store_true", help="ignore end of stream token and continue generating", dest="ignore_eos") - parser.add_argument("--n_parts", type=int, default=-1, help="number of model parts", dest="n_parts") - parser.add_argument("--random-prompt", action="store_true", help="start with a randomized prompt.", dest="random_prompt") - parser.add_argument("--in-prefix", type=str, default="", help="string to prefix user inputs with", dest="input_prefix") + + #Custom args parser.add_argument("--fix-prefix", type=str, default="", help="append to input when generated n_predict tokens", dest="fix_prefix") parser.add_argument("--out-postfix", type=str, default="", help="append to input", dest="output_postfix") parser.add_argument("--input-noecho", action="store_false", help="dont output the input", dest="input_echo") + + parser.add_argument( + "--interactive-start", + action="store_true", + help="run in interactive mode", + dest="interactive" + ) + args = parser.parse_args(argv) - return args + + logit_bias_str = args.logit_bias_str + delattr(args, "logit_bias_str") + params = GptParams(**vars(args)) + + if (params.lora_adapter): + params.use_mmap = False + + if (logit_bias_str != None): + for i in logit_bias_str: + if (m := re.match(r"(\d+)([-+]\d+)", i)): + params.logit_bias[int(m.group(1))] = int(m.group(2)) + + return params def gpt_random_prompt(rng): return [ @@ -148,4 +199,4 @@ def gpt_random_prompt(rng): ][rng % 10] if __name__ == "__main__": - print(GptParams(gpt_params_parse())) + print(gpt_params_parse()) diff --git a/examples/low_level_api/low_level_api_chat_cpp.py b/examples/low_level_api/low_level_api_chat_cpp.py index 6fced65d8..4e129ee86 100644 --- a/examples/low_level_api/low_level_api_chat_cpp.py +++ b/examples/low_level_api/low_level_api_chat_cpp.py @@ -10,9 +10,10 @@ You should also still be feeding the model with a "primer" prompt that shows it the expected format. 
""" +import ctypes import sys from time import time -from os import cpu_count +from os import cpu_count, path import llama_cpp from common import GptParams, gpt_params_parse, gpt_random_prompt @@ -77,6 +78,7 @@ def __init__(self, params: GptParams) -> None: # runtime args self.input_consumed = 0 self.n_past = 0 + self.n_session_consumed = 0 self.first_antiprompt = [] self.remaining_tokens = self.params.n_predict self.output_echo = self.params.input_echo @@ -94,6 +96,19 @@ def __init__(self, params: GptParams) -> None: if (not self.ctx): raise RuntimeError(f"error: failed to load model '{self.params.model}'") + if (self.params.ignore_eos): + self.params.logit_bias[llama_cpp.llama_token_eos()] = -float("inf") + + if (len(self.params.lora_adapter) > 0): + if (llama_cpp.llama_apply_lora_from_file( + self.ctx, + self.params.lora_adapter, + self.params.lora_base if len(self.params.lora_base) > 0 else None, + self.params.n_threads + ) != 0): + print("error: failed to apply lora adapter") + return + print(file=sys.stderr) print(f"system_info: n_threads = {self.params.n_threads} / {cpu_count()} \ | {llama_cpp.llama_print_system_info().decode('utf8')}", file=sys.stderr) @@ -117,13 +132,49 @@ def __init__(self, params: GptParams) -> None: with open(self.params.file) as f: self.params.prompt = f.read() + self.session_tokens: list[llama_cpp.llama_token] = [] + if (len(self.params.path_session) > 0): + print(f"attempting to load saved session from '{self.params.path_session}'", file=sys.stderr) + + if (path.exists(self.params.path_session)): + _session_tokens = (llama_cpp.llama_token * (self.params.n_ctx))() + _n_token_count_out = llama_cpp.c_int() + if (llama_cpp.llama_load_session_file( + self.ctx, + self.params.path_session.encode("utf8"), + _session_tokens, + self.params.n_ctx, + ctypes.byref(_n_token_count_out) + ) != 0): + print(f"error: failed to load session file '{self.params.path_session}'", file=sys.stderr) + return + self.session_tokens = _session_tokens[:_n_token_count_out] + print(f"loaded a session with prompt size of {_n_token_count_out} tokens", file=sys.stderr) + else: + print(f"session file does not exist, will create", file=sys.stderr) + # tokenize the prompt self.embd = [] self.embd_inp = self._tokenize(self.params.prompt) - if (len(self.embd_inp) > self.params.n_ctx - 4): + if (len(self.embd_inp) > self.n_ctx - 4): raise RuntimeError(f"error: prompt is too long ({len(self.embd_inp)} tokens, max {self.params.n_ctx - 4})") + # debug message about similarity of saved session, if applicable + n_matching_session_tokens = 0 + if len(self.session_tokens) > 0: + for id in self.session_tokens: + if n_matching_session_tokens >= len(self.embd_inp) or id != self.embd_inp[n_matching_session_tokens]: + break + n_matching_session_tokens += 1 + + if n_matching_session_tokens >= len(self.embd_inp): + print(f"session file has exact match for prompt!") + elif n_matching_session_tokens < (len(self.embd_inp) / 2): + print(f"warning: session file has low similarity to prompt ({n_matching_session_tokens} / {len(self.embd_inp)} tokens); will mostly be reevaluated") + else: + print(f"session file matches {n_matching_session_tokens} / {len(self.embd_inp)} tokens of prompt") + # number of tokens to keep when resetting context if (self.params.n_keep < 0 or self.params.n_keep > len(self.embd_inp) or self.params.instruct): self.params.n_keep = len(self.embd_inp) @@ -132,6 +183,7 @@ def __init__(self, params: GptParams) -> None: self.inp_suffix = self._tokenize(self.params.instruct_inp_suffix, False) # in 
+        self.antiecho = None
         if (self.params.instruct):
             self.params.interactive_start = True
             _ptn = self._tokenize(self.params.instruct_inp_prefix.strip(), False)
@@ -171,16 +223,24 @@ def __init__(self, params: GptParams) -> None:
         if len(self.params.input_prefix) > 0:
             print(f"Input prefix: '{self.params.input_prefix}'", file=sys.stderr)
 
-        print(f"""sampling: temp = {self.params.temp},\
+        print(f"""sampling: repeat_last_n = {self.params.repeat_last_n},\
+repeat_penalty = {self.params.repeat_penalty},\
+presence_penalty = {self.params.presence_penalty},\
+frequency_penalty = {self.params.frequency_penalty},\
 top_k = {self.params.top_k},\
+tfs_z = {self.params.tfs_z},\
 top_p = {self.params.top_p},\
-repeat_last_n = {self.params.repeat_last_n},\
-repeat_penalty = {self.params.repeat_penalty}
-
-generate: n_ctx = {self.n_ctx}, \
-n_batch = {self.params.n_batch}, \
-n_predict = {self.params.n_predict}, \
+typical_p = {self.params.typical_p},\
+temp = {self.params.temp},\
+mirostat = {self.params.mirostat},\
+mirostat_lr = {self.params.mirostat_eta},\
+mirostat_ent = {self.params.mirostat_tau},\
+
+generate: n_ctx = {self.n_ctx},\
+n_batch = {self.params.n_batch},\
+n_predict = {self.params.n_predict},\
 n_keep = {self.params.n_keep}
+
 """, file=sys.stderr)
 
         # determine antiprompt tokens
@@ -198,6 +258,9 @@ def __init__(self, params: GptParams) -> None:
 """, file=sys.stderr)
         self.set_color(CONSOLE_COLOR_PROMPT)
 
+        self.need_to_save_session = len(self.params.path_session) > 0 and n_matching_session_tokens < (len(self.embd_inp) * 3 / 4)
+
+
     # tokenize a prompt
     def _tokenize(self, prompt, bos=True):
         _arr = (llama_cpp.llama_token * (len(prompt) + 1))()
@@ -229,31 +292,117 @@ def generate(self):
                         self.n_ctx - int(n_left/2) - len(self.embd):-len(self.embd)
                     ]
                     self.embd = _insert + self.embd
+                    self.params.path_session = ""
+
+                # try to reuse a matching prefix from the loaded session instead of re-eval (via n_past)
+                # REVIEW
+                if self.n_session_consumed < len(self.session_tokens):
+                    for i in range(len(self.embd)):
+                        if self.embd[i] != self.session_tokens[self.n_session_consumed]:
+                            self.session_tokens = self.session_tokens[:self.n_session_consumed]
+                            break
+
+                        self.n_past += 1
+                        self.n_session_consumed += 1
+
+                        if self.n_session_consumed >= len(self.session_tokens):
+                            i += 1
+                            break
+
+                    if i > 0:
+                        self.embd = self.embd[i:]
+
+                # evaluate tokens in batches
+                # embd is typically prepared beforehand to fit within a batch, but not always
+                #TODO BUG: The batching code causes nonsensical generation
+                """for i in range(0, len(self.embd), self.params.n_batch):
+                    n_eval = self.params.n_batch
+                    _arr = (llama_cpp.llama_token * n_eval)(*self.embd[i:i + n_eval])
+                    if llama_cpp.llama_eval(self.ctx, _arr, n_eval, self.n_past, self.params.n_threads) != 0:
+                        print(f"failed to eval")
+                        return
+
+                    self.n_past += n_eval"""
 
                 if (llama_cpp.llama_eval(
                     self.ctx, (llama_cpp.llama_token * len(self.embd))(*self.embd), len(self.embd), self.n_past, self.params.n_threads
                 ) != 0):
                     raise Exception("Failed to llama_eval!")
 
+                if len(self.embd) > 0 and not len(self.params.path_session) > 0:
+                    self.session_tokens.extend(self.embd)
+                    self.n_session_consumed = len(self.session_tokens)
+
             self.n_past += len(self.embd)
             self.embd = []
-            if len(self.embd_inp) <= self.input_consumed:
+            if len(self.embd_inp) <= self.input_consumed: #&& !is_interacting
                 # out of user input, sample next token
+                top_k = llama_cpp.llama_n_vocab(self.ctx) if self.params.top_k <= 0 else self.params.top_k
+                repeat_last_n = self.n_ctx if self.params.repeat_last_n < 0 else self.params.repeat_last_n
+
+                # optionally save the session on first sample (for faster prompt loading next time)
+                if len(self.params.path_session) > 0 and self.need_to_save_session:
+                    self.need_to_save_session = False
+                    llama_cpp.llama_save_session_file(
+                        self.ctx,
+                        self.params.path_session.encode("utf8"),
+                        self.session_tokens,
+                        len(self.session_tokens)
+                    )
+
+                id = 0
+
+                logits = llama_cpp.llama_get_logits(self.ctx)
+                n_vocab = llama_cpp.llama_n_vocab(self.ctx)
+
+                # Apply params.logit_bias map
+                for key, value in self.params.logit_bias.items():
+                    logits[key] += value
+
+                _arr = (llama_cpp.llama_token_data * n_vocab)(*[
+                    llama_cpp.llama_token_data(token_id, logits[token_id], 0.0)
+                    for token_id in range(n_vocab)
+                ])
+                candidates_p = llama_cpp.ctypes.pointer(llama_cpp.llama_token_data_array(_arr, len(_arr), False))
+
+                # Apply penalties
+                nl_logit = logits[llama_cpp.llama_token_nl()]
+                last_n_repeat = min(len(self.last_n_tokens), repeat_last_n, self.n_ctx)
+
+                _arr = (llama_cpp.llama_token * last_n_repeat)(*self.last_n_tokens[len(self.last_n_tokens) - last_n_repeat:])
+                llama_cpp.llama_sample_repetition_penalty(self.ctx, candidates_p,
+                    _arr,
+                    last_n_repeat, self.params.repeat_penalty)
+                llama_cpp.llama_sample_frequency_and_presence_penalties(self.ctx, candidates_p,
+                    _arr,
+                    last_n_repeat, self.params.frequency_penalty, self.params.presence_penalty)
+
+                if not self.params.penalize_nl:
+                    logits[llama_cpp.llama_token_nl()] = nl_logit
+
+                if self.params.temp <= 0:
+                    # Greedy sampling
+                    id = llama_cpp.llama_sample_token_greedy(self.ctx, candidates_p)
+                else:
+                    if self.params.mirostat == 1:
+                        mirostat_mu = 2.0 * self.params.mirostat_tau
+                        mirostat_m = 100
+                        llama_cpp.llama_sample_temperature(self.ctx, candidates_p, self.params.temp)
+                        id = llama_cpp.llama_sample_token_mirostat(self.ctx, candidates_p, self.params.mirostat_tau, self.params.mirostat_eta, mirostat_m, mirostat_mu)
+                    elif self.params.mirostat == 2:
+                        mirostat_mu = 2.0 * self.params.mirostat_tau
+                        llama_cpp.llama_sample_temperature(self.ctx, candidates_p, self.params.temp)
+                        id = llama_cpp.llama_sample_token_mirostat_v2(self.ctx, candidates_p, self.params.mirostat_tau, self.params.mirostat_eta, mirostat_mu)
+                    else:
+                        # Temperature sampling
+                        llama_cpp.llama_sample_top_k(self.ctx, candidates_p, top_k)
+                        llama_cpp.llama_sample_tail_free(self.ctx, candidates_p, self.params.tfs_z)
+                        llama_cpp.llama_sample_typical(self.ctx, candidates_p, self.params.typical_p)
+                        llama_cpp.llama_sample_top_p(self.ctx, candidates_p, self.params.top_p)
+                        llama_cpp.llama_sample_temperature(self.ctx, candidates_p, self.params.temp)
+                        id = llama_cpp.llama_sample_token(self.ctx, candidates_p)
+                # print("`{}`".format(candidates_p.size))
 
-                if (self.params.ignore_eos):
-                    logits = llama_cpp.llama_get_logits(self.ctx)
-                    logits[llama_cpp.llama_token_eos()] = llama_cpp.c_float(0)
-
-                _arr = self.last_n_tokens[-min(self.params.repeat_last_n, self.n_past):]
-                id = llama_cpp.llama_sample_top_p_top_k(
-                    self.ctx,
-                    (llama_cpp.llama_token * len(_arr))(*_arr),
-                    len(_arr),
-                    self.params.top_k,
-                    self.params.top_p,
-                    self.params.temp,
-                    self.params.repeat_penalty,
-                )
                 self.last_n_tokens.pop(0)
                 self.last_n_tokens.append(id)
@@ -288,7 +437,7 @@ def generate(self):
             # display tokens
             if self.output_echo:
                 for id in self.embd:
-                    if self.params.instruct:
+                    if self.antiecho != None:
                         for r in self.antiecho(id):
                             yield r
                     else:
                        yield id
@@ -316,7 +465,7 @@ def generate(self):
                 if (not self.params.instruct):
                     for i in self.llama_token_eot:
                         yield i
-                break
+                break # respect n_predict even if antiprompt is present
 
             if (self.params.interactive and self.remaining_tokens <= 0 and self.params.n_predict != -1):
@@ -356,7 +505,7 @@ def interact(self):
     def output(self):
         self.remaining_tokens = self.params.n_predict
         for id in self.generate():
-            yield llama_cpp.llama_token_to_str(self.ctx, id).decode("utf-8", errors="ignore")
+            yield llama_cpp.llama_token_to_str(self.ctx, id).decode("utf-8")
 
     # read user input
     def read_input(self):
@@ -415,8 +564,7 @@ def interact(self):
 {USER_NAME}: Name a color.
 {AI_NAME}: Blue
 {USER_NAME}:"""
-    args = gpt_params_parse()
-    params = GptParams(**vars(args))
+    params = gpt_params_parse()
 
     with LLaMAInteract(params) as m:
         m.interact()
diff --git a/examples/low_level_api/low_level_api_llama_cpp.py b/examples/low_level_api/low_level_api_llama_cpp.py
index 4fb5a0366..9e38ec7cb 100644
--- a/examples/low_level_api/low_level_api_llama_cpp.py
+++ b/examples/low_level_api/low_level_api_llama_cpp.py
@@ -37,6 +37,10 @@
 last_n_size = 64
 last_n_tokens_data = [0] * last_n_size
 n_batch = 24
+last_n_repeat = 64
+repeat_penalty = 1
+frequency_penalty = 0.0
+presence_penalty = 0.0
 
 while remaining_tokens > 0:
     if len(embd) > 0:
@@ -47,15 +51,28 @@
     n_past += len(embd)
     embd = []
     if len(embd_inp) <= input_consumed:
-        id = llama_cpp.llama_sample_top_p_top_k(
-            ctx,
-            (llama_cpp.c_int * len(last_n_tokens_data))(*last_n_tokens_data),
-            len(last_n_tokens_data),
-            40,
-            0.8,
-            0.2,
-            1.0 / 0.85,
-        )
+        logits = llama_cpp.llama_get_logits(ctx)
+        n_vocab = llama_cpp.llama_n_vocab(ctx)
+
+        _arr = (llama_cpp.llama_token_data * n_vocab)(*[
+            llama_cpp.llama_token_data(token_id, logits[token_id], 0.0)
+            for token_id in range(n_vocab)
+        ])
+        candidates_p = llama_cpp.ctypes.pointer(llama_cpp.llama_token_data_array(_arr, len(_arr), False))
+
+        _arr = (llama_cpp.c_int * len(last_n_tokens_data))(*last_n_tokens_data)
+        llama_cpp.llama_sample_repetition_penalty(ctx, candidates_p,
+            _arr,
+            last_n_repeat, repeat_penalty)
+        llama_cpp.llama_sample_frequency_and_presence_penalties(ctx, candidates_p,
+            _arr,
+            last_n_repeat, frequency_penalty, presence_penalty)
+
+        llama_cpp.llama_sample_top_k(ctx, candidates_p, 40)
+        llama_cpp.llama_sample_top_p(ctx, candidates_p, 0.8)
+        llama_cpp.llama_sample_temperature(ctx, candidates_p, 0.2)
+        id = llama_cpp.llama_sample_token(ctx, candidates_p)
+
+        last_n_tokens_data = last_n_tokens_data[1:] + [id]
         embd.append(id)
         input_noecho = False

From 1895c1103379156f4bd2ae895cdab080ab9cd104 Mon Sep 17 00:00:00 2001
From: Mug <2797716+SagsMug@users.noreply.github.com>
Date: Sat, 6 May 2023 13:18:25 +0200
Subject: [PATCH 2/8] Rename postfix to suffix to match upstream

---
 examples/low_level_api/Chat.py                   | 1 +
 examples/low_level_api/common.py                 | 4 ++--
 examples/low_level_api/low_level_api_chat_cpp.py | 4 ++--
 3 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/examples/low_level_api/Chat.py b/examples/low_level_api/Chat.py
index 101550805..fcef8cd80 100644
--- a/examples/low_level_api/Chat.py
+++ b/examples/low_level_api/Chat.py
@@ -63,6 +63,7 @@ def env_or_def(env, default):
     interactive=True,
     antiprompt=[f"{USER_NAME}:"],
     input_prefix=" ",
+    input_suffix=f"{AI_NAME}:",
     prompt=prompt,
 )
 
diff --git a/examples/low_level_api/common.py b/examples/low_level_api/common.py
index 6c35cc548..7a25582ea 100644
--- a/examples/low_level_api/common.py
+++ b/examples/low_level_api/common.py
@@ -37,6 +37,7 @@ class GptParams:
     prompt: str = ""
     path_session: str = ""
     input_prefix: str = " "
+    input_suffix: str = ""
str = "" antiprompt: List[str] = field(default_factory=list) lora_adapter: str = "" @@ -64,7 +65,6 @@ class GptParams: # Set to "\nUser:" etc. # This is an alternative to input_prefix which always adds it, so it potentially duplicates "User:"" fix_prefix: str = "" - output_postfix: str = "" input_echo: bool = True, # Default instructions for Alpaca @@ -110,6 +110,7 @@ def gpt_params_parse(argv = None): parser.add_argument("-f", "--file", type=str, default=None, help="file containing initial prompt to load",dest="file") parser.add_argument("--session", type=str, default=None, help="file to cache model state in (may be large!)",dest="path_session") parser.add_argument("--in-prefix", type=str, default="", help="string to prefix user inputs with", dest="input_prefix") + parser.add_argument("--in-suffix", type=str, default="", help="append to input", dest="input_suffix") parser.add_argument( "-r", "--reverse-prompt", @@ -158,7 +159,6 @@ def gpt_params_parse(argv = None): #Custom args parser.add_argument("--fix-prefix", type=str, default="", help="append to input when generated n_predict tokens", dest="fix_prefix") - parser.add_argument("--out-postfix", type=str, default="", help="append to input", dest="output_postfix") parser.add_argument("--input-noecho", action="store_false", help="dont output the input", dest="input_echo") parser.add_argument( diff --git a/examples/low_level_api/low_level_api_chat_cpp.py b/examples/low_level_api/low_level_api_chat_cpp.py index 4e129ee86..72ced2bd9 100644 --- a/examples/low_level_api/low_level_api_chat_cpp.py +++ b/examples/low_level_api/low_level_api_chat_cpp.py @@ -527,8 +527,8 @@ def interact(self): self.input(self.read_input()) else: print(self.params.input_prefix, end="") - self.input(f"{self.params.input_prefix}{self.read_input()}{self.params.output_postfix}") - print(self.params.output_postfix,end="") + self.input(f"{self.params.input_prefix}{self.read_input()}{self.params.input_suffix}") + print(self.params.input_suffix,end="") self.set_color(CONSOLE_COLOR_DEFAULT) try: From 9797394c81133eebb367bd0673b6c89eefd5a38e Mon Sep 17 00:00:00 2001 From: Mug <2797716+SagsMug@users.noreply.github.com> Date: Sat, 6 May 2023 13:27:52 +0200 Subject: [PATCH 3/8] Wrong logit_bias parsed type --- examples/low_level_api/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/low_level_api/common.py b/examples/low_level_api/common.py index 7a25582ea..2bfe35646 100644 --- a/examples/low_level_api/common.py +++ b/examples/low_level_api/common.py @@ -180,7 +180,7 @@ def gpt_params_parse(argv = None): if (logit_bias_str != None): for i in logit_bias_str: if (m := re.match(r"(\d+)([-+]\d+)", i)): - params.logit_bias[int(m.group(1))] = int(m.group(2)) + params.logit_bias[int(m.group(1))] = float(m.group(2)) return params From 3ceb47b597a8819db3afa851df4ae3211f2cb680 Mon Sep 17 00:00:00 2001 From: Mug <2797716+SagsMug@users.noreply.github.com> Date: Sat, 6 May 2023 13:35:50 +0200 Subject: [PATCH 4/8] Fix mirastat requiring c_float --- .../low_level_api/low_level_api_chat_cpp.py | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/examples/low_level_api/low_level_api_chat_cpp.py b/examples/low_level_api/low_level_api_chat_cpp.py index 72ced2bd9..55b24cde2 100644 --- a/examples/low_level_api/low_level_api_chat_cpp.py +++ b/examples/low_level_api/low_level_api_chat_cpp.py @@ -357,7 +357,7 @@ def generate(self): # Apply params.logit_bias map for key, value in self.params.logit_bias.items(): - logits[key] += value + 
+                    logits[key] += llama_cpp.c_float(value)
 
                 _arr = (llama_cpp.llama_token_data * n_vocab)(*[
                     llama_cpp.llama_token_data(token_id, logits[token_id], 0.0)
@@ -372,14 +372,14 @@ def generate(self):
                 _arr = (llama_cpp.llama_token * last_n_repeat)(*self.last_n_tokens[len(self.last_n_tokens) - last_n_repeat:])
                 llama_cpp.llama_sample_repetition_penalty(self.ctx, candidates_p,
                     _arr,
-                    last_n_repeat, self.params.repeat_penalty)
+                    last_n_repeat, llama_cpp.c_float(self.params.repeat_penalty))
                 llama_cpp.llama_sample_frequency_and_presence_penalties(self.ctx, candidates_p,
                     _arr,
-                    last_n_repeat, self.params.frequency_penalty, self.params.presence_penalty)
+                    last_n_repeat, llama_cpp.c_float(self.params.frequency_penalty), llama_cpp.c_float(self.params.presence_penalty))
 
                 if not self.params.penalize_nl:
                     logits[llama_cpp.llama_token_nl()] = nl_logit
-
+
                 if self.params.temp <= 0:
                     # Greedy sampling
                     id = llama_cpp.llama_sample_token_greedy(self.ctx, candidates_p)
@@ -387,19 +387,19 @@ def generate(self):
                     if self.params.mirostat == 1:
                         mirostat_mu = 2.0 * self.params.mirostat_tau
                         mirostat_m = 100
-                        llama_cpp.llama_sample_temperature(self.ctx, candidates_p, self.params.temp)
-                        id = llama_cpp.llama_sample_token_mirostat(self.ctx, candidates_p, self.params.mirostat_tau, self.params.mirostat_eta, mirostat_m, mirostat_mu)
+                        llama_cpp.llama_sample_temperature(self.ctx, candidates_p, llama_cpp.c_float(self.params.temp))
+                        id = llama_cpp.llama_sample_token_mirostat(self.ctx, candidates_p, llama_cpp.c_float(self.params.mirostat_tau), llama_cpp.c_float(self.params.mirostat_eta), llama_cpp.c_int(mirostat_m), llama_cpp.c_float(mirostat_mu))
                     elif self.params.mirostat == 2:
                         mirostat_mu = 2.0 * self.params.mirostat_tau
-                        llama_cpp.llama_sample_temperature(self.ctx, candidates_p, self.params.temp)
-                        id = llama_cpp.llama_sample_token_mirostat_v2(self.ctx, candidates_p, self.params.mirostat_tau, self.params.mirostat_eta, mirostat_mu)
+                        llama_cpp.llama_sample_temperature(self.ctx, candidates_p, llama_cpp.c_float(self.params.temp))
+                        id = llama_cpp.llama_sample_token_mirostat_v2(self.ctx, candidates_p, llama_cpp.c_float(self.params.mirostat_tau), llama_cpp.c_float(self.params.mirostat_eta), llama_cpp.c_float(mirostat_mu))
                     else:
                         # Temperature sampling
                         llama_cpp.llama_sample_top_k(self.ctx, candidates_p, top_k)
-                        llama_cpp.llama_sample_tail_free(self.ctx, candidates_p, self.params.tfs_z)
-                        llama_cpp.llama_sample_typical(self.ctx, candidates_p, self.params.typical_p)
-                        llama_cpp.llama_sample_top_p(self.ctx, candidates_p, self.params.top_p)
-                        llama_cpp.llama_sample_temperature(self.ctx, candidates_p, self.params.temp)
+                        llama_cpp.llama_sample_tail_free(self.ctx, candidates_p, llama_cpp.c_float(self.params.tfs_z))
+                        llama_cpp.llama_sample_typical(self.ctx, candidates_p, llama_cpp.c_float(self.params.typical_p))
+                        llama_cpp.llama_sample_top_p(self.ctx, candidates_p, llama_cpp.c_float(self.params.top_p))
+                        llama_cpp.llama_sample_temperature(self.ctx, candidates_p, llama_cpp.c_float(self.params.temp))
                         id = llama_cpp.llama_sample_token(self.ctx, candidates_p)

From 996f63e9e1804b2d9a91c5081665ea536a85542f Mon Sep 17 00:00:00 2001
From: Mug <2797716+SagsMug@users.noreply.github.com>
Date: Sat, 6 May 2023 15:16:58 +0200
Subject: [PATCH 5/8] Add utf8 to chat example

---
 examples/low_level_api/common.py                 |  2 +-
 .../low_level_api/low_level_api_chat_cpp.py      | 73 +++++++-------
 examples/low_level_api/util.py                   | 95 +++++++++++++++++++
 3 files changed, 130 insertions(+), 40 deletions(-)
 create mode 100644 examples/low_level_api/util.py
diff --git a/examples/low_level_api/common.py b/examples/low_level_api/common.py
index 2bfe35646..55d08db5f 100644
--- a/examples/low_level_api/common.py
+++ b/examples/low_level_api/common.py
@@ -102,7 +102,7 @@ def gpt_params_parse(argv = None):
     parser.add_argument("--frequency_penalty", type=float, default=0.0, help="repeat alpha frequency penalty (0.0 = disabled)",dest="frequency_penalty")
     parser.add_argument("--presence_penalty", type=float, default=0.0, help="repeat alpha presence penalty (0.0 = disabled)",dest="presence_penalty")
     parser.add_argument("--mirostat", type=int, default=0, help="use Mirostat sampling.",dest="mirostat")
-    parser.add_argument("--mirostat_ent", type=float, default=5.0, help="Mirostat target entropy, parameter tau",dest="mirostat_tau")
+    parser.add_argument("--mirostat_ent", type=float, default=5.0, help="Mirostat target entropy, parameter tau represents the average surprise value",dest="mirostat_tau")
     parser.add_argument("--mirostat_lr", type=float, default=0.1, help="Mirostat learning rate, parameter eta",dest="mirostat_eta")
 
     parser.add_argument("-m", "--model", type=str, default="./models/llama-7B/ggml-model.bin", help="model path",dest="model")
diff --git a/examples/low_level_api/low_level_api_chat_cpp.py b/examples/low_level_api/low_level_api_chat_cpp.py
index 55b24cde2..9a9bc019f 100644
--- a/examples/low_level_api/low_level_api_chat_cpp.py
+++ b/examples/low_level_api/low_level_api_chat_cpp.py
@@ -17,34 +17,7 @@
 
 import llama_cpp
 from common import GptParams, gpt_params_parse, gpt_random_prompt
-
-ANSI_COLOR_RESET = "\x1b[0m"
-ANSI_COLOR_YELLOW = "\x1b[33m"
-ANSI_BOLD = "\x1b[1m"
-ANSI_COLOR_GREEN = "\x1b[32m"
-
-CONSOLE_COLOR_DEFAULT = ANSI_COLOR_RESET
-CONSOLE_COLOR_PROMPT = ANSI_COLOR_YELLOW
-CONSOLE_COLOR_USER_INPUT = ANSI_BOLD + ANSI_COLOR_GREEN
-
-# Iterative search
-# Actively searches and prevents a pattern from being returned
-class IterSearch:
-    def __init__(self, pattern):
-        self.pattern = list(pattern)
-        self.buffer = []
-
-    def __call__(self, char):
-        self.buffer += [char]
-
-        if (self.pattern[:len(self.buffer)] == self.buffer):
-            if (len(self.buffer) >= len(self.pattern)):
-                self.buffer.clear()
-            return []
-
-        _tmp = self.buffer[:]
-        self.buffer.clear()
-        return _tmp
+import util
 
 # A LLaMA interactive session
 class LLaMAInteract:
@@ -82,6 +55,7 @@ def __init__(self, params: GptParams) -> None:
         self.first_antiprompt = []
         self.remaining_tokens = self.params.n_predict
         self.output_echo = self.params.input_echo
+        self.multibyte_fix = []
 
         # model load
         self.lparams = llama_cpp.llama_context_default_params()
@@ -188,7 +162,7 @@ def __init__(self, params: GptParams) -> None:
             self.params.interactive_start = True
             _ptn = self._tokenize(self.params.instruct_inp_prefix.strip(), False)
             self.first_antiprompt.append(_ptn)
-            self.antiecho = IterSearch(_ptn)
+            self.antiecho = util.IterSearch(_ptn)
 
         # enable interactive mode if reverse prompt or interactive start is specified
         if (len(self.params.antiprompt) != 0 or self.params.interactive_start):
@@ -256,14 +230,14 @@ def __init__(self, params: GptParams) -> None:
 - If you want to submit another line, end your input in '\\'.
 
""", file=sys.stderr) - self.set_color(CONSOLE_COLOR_PROMPT) + self.set_color(util.CONSOLE_COLOR_PROMPT) self.need_to_save_session = len(self.params.path_session) > 0 and n_matching_session_tokens < (len(self.embd_inp) * 3 / 4) # tokenize a prompt def _tokenize(self, prompt, bos=True): - _arr = (llama_cpp.llama_token * (len(prompt) + 1))() + _arr = (llama_cpp.llama_token * ((len(prompt) + 1) * 4))() _n = llama_cpp.llama_tokenize(self.ctx, prompt.encode("utf8", errors="ignore"), _arr, len(_arr), bos) return _arr[:_n] @@ -295,7 +269,6 @@ def generate(self): self.params.path_session = "" # try to reuse a matching prefix from the loaded session instead of re-eval (via n_past) - # REVIEW if self.n_session_consumed < len(self.session_tokens): for i in range(len(self.embd)): if self.embd[i] != self.session_tokens[self.n_session_consumed]: @@ -445,7 +418,7 @@ def generate(self): # reset color to default if we there is no pending user input if (self.params.input_echo and len(self.embd_inp) == self.input_consumed): - self.set_color(CONSOLE_COLOR_DEFAULT) + self.set_color(util.CONSOLE_COLOR_DEFAULT) if (self.params.interactive and len(self.embd_inp) <= self.input_consumed): # if antiprompt is present, stop @@ -486,12 +459,12 @@ def __exit__(self, type, value, tb): def exit(self): llama_cpp.llama_free(self.ctx) - self.set_color(CONSOLE_COLOR_DEFAULT) + self.set_color(util.CONSOLE_COLOR_DEFAULT) # return past text def past(self): for id in self.last_n_tokens[-self.n_past:]: - yield llama_cpp.llama_token_to_str(self.ctx, id).decode("utf-8", errors="ignore") + yield llama_cpp.llama_token_to_str(self.ctx, id).decode("utf8", errors="ignore") # write input def input(self, prompt: str): @@ -505,7 +478,29 @@ def input(self, prompt: str): def output(self): self.remaining_tokens = self.params.n_predict for id in self.generate(): - yield llama_cpp.llama_token_to_str(self.ctx, id).decode("utf-8") + cur_char = llama_cpp.llama_token_to_str(self.ctx, id) + + # Add remainder of missing bytes + if None in self.multibyte_fix: + self.multibyte_fix[self.multibyte_fix.index(None)] = cur_char + + # Return completed utf char + if len(self.multibyte_fix) > 0 and not None in self.multibyte_fix: + yield (b"".join(self.multibyte_fix)).decode("utf8") + self.multibyte_fix = [] + continue + + # Contains multi-byte UTF8 + for num, pattern in [(2, 192), (3, 224), (4, 240)]: + # Bitwise AND check + if pattern & int.from_bytes(cur_char) == pattern: + self.multibyte_fix = [cur_char] + ([None] * (num-1)) + + # Stop incomplete bytes from passing + if len(self.multibyte_fix) > 0: + continue + + yield cur_char.decode("utf8") # read user input def read_input(self): @@ -521,7 +516,7 @@ def interact(self): self.params.input_echo = False while self.params.interactive: - self.set_color(CONSOLE_COLOR_USER_INPUT) + self.set_color(util.CONSOLE_COLOR_USER_INPUT) if (self.params.instruct): print('\n> ', end="") self.input(self.read_input()) @@ -529,13 +524,13 @@ def interact(self): print(self.params.input_prefix, end="") self.input(f"{self.params.input_prefix}{self.read_input()}{self.params.input_suffix}") print(self.params.input_suffix,end="") - self.set_color(CONSOLE_COLOR_DEFAULT) + self.set_color(util.CONSOLE_COLOR_DEFAULT) try: for i in self.output(): print(i,end="",flush=True) except KeyboardInterrupt: - self.set_color(CONSOLE_COLOR_DEFAULT) + self.set_color(util.CONSOLE_COLOR_DEFAULT) if not self.params.instruct: print(self.params.fix_prefix,end="") self.input(self.params.fix_prefix) diff --git a/examples/low_level_api/util.py 
new file mode 100644
index 000000000..9d0ec2f70
--- /dev/null
+++ b/examples/low_level_api/util.py
@@ -0,0 +1,95 @@
+
+ANSI_COLOR_RESET = "\x1b[0m"
+ANSI_COLOR_YELLOW = "\x1b[33m"
+ANSI_BOLD = "\x1b[1m"
+ANSI_COLOR_GREEN = "\x1b[32m"
+
+CONSOLE_COLOR_DEFAULT = ANSI_COLOR_RESET
+CONSOLE_COLOR_PROMPT = ANSI_COLOR_YELLOW
+CONSOLE_COLOR_USER_INPUT = ANSI_BOLD + ANSI_COLOR_GREEN
+
+# Iterative search
+# Actively searches and prevents a pattern from being returned
+class IterSearch:
+    def __init__(self, pattern):
+        self.pattern = list(pattern)
+        self.buffer = []
+
+    def __call__(self, char):
+        self.buffer += [char]
+
+        if (self.pattern[:len(self.buffer)] == self.buffer):
+            if (len(self.buffer) >= len(self.pattern)):
+                self.buffer.clear()
+            return []
+
+        _tmp = self.buffer[:]
+        self.buffer.clear()
+        return _tmp
+
+class Circle:
+    def __init__(self, size, default=0):
+        self.list = [default] * size
+        self.maxsize = size
+        self.size = 0
+        self.offset = 0
+
+    def append(self, elem):
+        if self.size < self.maxsize:
+            self.list[self.size] = elem
+            self.size += 1
+        else:
+            self.list[self.offset] = elem
+            self.offset = (self.offset + 1) % self.maxsize
+
+    def __getitem__(self, val):
+        if isinstance(val, int):
+            if 0 > val or val >= self.size:
+                raise IndexError('Index out of range')
+            return self.list[val] if self.size < self.maxsize else self.list[(self.offset + val) % self.maxsize]
+        elif isinstance(val, slice):
+            start, stop, step = val.start, val.stop, val.step
+            if step is None:
+                step = 1
+            if start is None:
+                start = 0
+            if stop is None:
+                stop = self.size
+            if start < 0:
+                start = self.size + start
+            if stop < 0:
+                stop = self.size + stop
+
+            indices = range(start, stop, step)
+            return [self.list[(self.offset + i) % self.maxsize] for i in indices if i < self.size]
+        else:
+            raise TypeError('Invalid argument type')
+
+
+
+
+if __name__ == "__main__":
+    c = Circle(5)
+
+    c.append(1)
+    print(c.list)
+    print(c[:])
+    assert c[0] == 1
+    assert c[:5] == [1]
+
+    for i in range(2,5+1):
+        c.append(i)
+    print(c.list)
+    print(c[:])
+    assert c[0] == 1
+    assert c[:5] == [1,2,3,4,5]
+
+    for i in range(5+1,9+1):
+        c.append(i)
+    print(c.list)
+    print(c[:])
+    assert c[0] == 5
+    assert c[:5] == [5,6,7,8,9]
+    #assert c[:-5] == [5,6,7,8,9]
+    assert c[:10] == [5,6,7,8,9]
+

From fd80ddf703373f523bda4e62d24564fa8930f670 Mon Sep 17 00:00:00 2001
From: Mug <2797716+SagsMug@users.noreply.github.com>
Date: Sat, 6 May 2023 22:22:28 +0200
Subject: [PATCH 6/8] Fix a bug with wrong type

---
 examples/low_level_api/low_level_api_chat_cpp.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/low_level_api/low_level_api_chat_cpp.py b/examples/low_level_api/low_level_api_chat_cpp.py
index 9a9bc019f..272b454c5 100644
--- a/examples/low_level_api/low_level_api_chat_cpp.py
+++ b/examples/low_level_api/low_level_api_chat_cpp.py
@@ -330,7 +330,7 @@ def generate(self):
 
                 # Apply params.logit_bias map
                 for key, value in self.params.logit_bias.items():
-                    logits[key] += llama_cpp.c_float(value)
+                    logits[key] += value
 
                 _arr = (llama_cpp.llama_token_data * n_vocab)(*[
                     llama_cpp.llama_token_data(token_id, logits[token_id], 0.0)

From 2c0d9b182cd417338a85396660d9828070b3373f Mon Sep 17 00:00:00 2001
From: Mug <2797716+SagsMug@users.noreply.github.com>
Date: Mon, 8 May 2023 15:27:03 +0200
Subject: [PATCH 7/8] Fix session loading and saving in low level example chat

---
 .../low_level_api/low_level_api_chat_cpp.py   | 28 +++++++++----------
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/examples/low_level_api/low_level_api_chat_cpp.py b/examples/low_level_api/low_level_api_chat_cpp.py
index 272b454c5..b86d723b8 100644
--- a/examples/low_level_api/low_level_api_chat_cpp.py
+++ b/examples/low_level_api/low_level_api_chat_cpp.py
@@ -112,16 +112,17 @@ def __init__(self, params: GptParams) -> None:
             if (path.exists(self.params.path_session)):
                 _session_tokens = (llama_cpp.llama_token * (self.params.n_ctx))()
-                _n_token_count_out = llama_cpp.c_int()
+                _n_token_count_out = llama_cpp.c_size_t()
                 if (llama_cpp.llama_load_session_file(
                     self.ctx,
                     self.params.path_session.encode("utf8"),
                     _session_tokens,
                     self.params.n_ctx,
                     ctypes.byref(_n_token_count_out)
-                ) != 0):
+                ) != 1):
                     print(f"error: failed to load session file '{self.params.path_session}'", file=sys.stderr)
                     return
+                _n_token_count_out = _n_token_count_out.value
                 self.session_tokens = _session_tokens[:_n_token_count_out]
                 print(f"loaded a session with prompt size of {_n_token_count_out} tokens", file=sys.stderr)
             else:
@@ -135,19 +136,21 @@ def __init__(self, params: GptParams) -> None:
             raise RuntimeError(f"error: prompt is too long ({len(self.embd_inp)} tokens, max {self.params.n_ctx - 4})")
 
         # debug message about similarity of saved session, if applicable
-        n_matching_session_tokens = 0
+        self.n_matching_session_tokens = 0
         if len(self.session_tokens) > 0:
             for id in self.session_tokens:
-                if n_matching_session_tokens >= len(self.embd_inp) or id != self.embd_inp[n_matching_session_tokens]:
+                if self.n_matching_session_tokens >= len(self.embd_inp) or id != self.embd_inp[self.n_matching_session_tokens]:
                     break
-                n_matching_session_tokens += 1
+                self.n_matching_session_tokens += 1
 
-            if n_matching_session_tokens >= len(self.embd_inp):
+            if self.n_matching_session_tokens >= len(self.embd_inp):
                 print(f"session file has exact match for prompt!")
-            elif n_matching_session_tokens < (len(self.embd_inp) / 2):
-                print(f"warning: session file has low similarity to prompt ({n_matching_session_tokens} / {len(self.embd_inp)} tokens); will mostly be reevaluated")
+            elif self.n_matching_session_tokens < (len(self.embd_inp) / 2):
+                print(f"warning: session file has low similarity to prompt ({self.n_matching_session_tokens} / {len(self.embd_inp)} tokens); will mostly be reevaluated")
             else:
-                print(f"session file matches {n_matching_session_tokens} / {len(self.embd_inp)} tokens of prompt")
+                print(f"session file matches {self.n_matching_session_tokens} / {len(self.embd_inp)} tokens of prompt")
+
+        self.need_to_save_session = len(self.params.path_session) > 0 and self.n_matching_session_tokens < (len(self.embd_inp) * 3 / 4)
 
         # number of tokens to keep when resetting context
         if (self.params.n_keep < 0 or self.params.n_keep > len(self.embd_inp) or self.params.instruct):
@@ -232,9 +235,6 @@ def __init__(self, params: GptParams) -> None:
 """, file=sys.stderr)
         self.set_color(util.CONSOLE_COLOR_PROMPT)
 
-        self.need_to_save_session = len(self.params.path_session) > 0 and n_matching_session_tokens < (len(self.embd_inp) * 3 / 4)
-
-
     # tokenize a prompt
     def _tokenize(self, prompt, bos=True):
         _arr = (llama_cpp.llama_token * ((len(prompt) + 1) * 4))()
@@ -302,7 +302,7 @@ def generate(self):
                 ) != 0):
                     raise Exception("Failed to llama_eval!")
 
-                if len(self.embd) > 0 and not len(self.params.path_session) > 0:
+                if len(self.embd) > 0 and len(self.params.path_session) > 0:
                     self.session_tokens.extend(self.embd)
                     self.n_session_consumed = len(self.session_tokens)
 
@@ -319,7 +319,7 @@ def generate(self):
                     llama_cpp.llama_save_session_file(
                        self.ctx,
self.params.path_session.encode("utf8"), - self.session_tokens, + (llama_cpp.llama_token * len(self.session_tokens))(*self.session_tokens), len(self.session_tokens) ) From eaf9f19aa98fa93fb078f31c6f65ce176629f808 Mon Sep 17 00:00:00 2001 From: Mug <2797716+SagsMug@users.noreply.github.com> Date: Mon, 8 May 2023 15:27:42 +0200 Subject: [PATCH 8/8] Fix lora --- examples/low_level_api/low_level_api_chat_cpp.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/low_level_api/low_level_api_chat_cpp.py b/examples/low_level_api/low_level_api_chat_cpp.py index b86d723b8..8773cb1e3 100644 --- a/examples/low_level_api/low_level_api_chat_cpp.py +++ b/examples/low_level_api/low_level_api_chat_cpp.py @@ -76,8 +76,8 @@ def __init__(self, params: GptParams) -> None: if (len(self.params.lora_adapter) > 0): if (llama_cpp.llama_apply_lora_from_file( self.ctx, - self.params.lora_adapter, - self.params.lora_base if len(self.params.lora_base) > 0 else None, + self.params.lora_adapter.encode("utf8"), + self.params.lora_base.encode("utf8") if len(self.params.lora_base) > 0 else None, self.params.n_threads ) != 0): print("error: failed to apply lora adapter")