diff --git a/packages/llm/local_llm/Dockerfile b/packages/llm/local_llm/Dockerfile index 72c6a371c..c2480786f 100644 --- a/packages/llm/local_llm/Dockerfile +++ b/packages/llm/local_llm/Dockerfile @@ -1,7 +1,7 @@ #--- # name: local_llm # group: llm -# depends: [mlc, jetson-utils, riva-client:python] +# depends: [mlc, riva-client:python, jetson-inference, torch2trt] # requires: '>=34.1.0' # docs: docs.md #--- diff --git a/packages/llm/local_llm/agent.py b/packages/llm/local_llm/agent.py index 990b22218..4a0c97152 100644 --- a/packages/llm/local_llm/agent.py +++ b/packages/llm/local_llm/agent.py @@ -35,7 +35,7 @@ def __call__(self, input, channel=0, **kwargs): """ Operator overload for process() """ - return self.process[channel](input, **kwargs) + return self.process(input, channel, **kwargs) def start(self): """ diff --git a/packages/llm/local_llm/agents/chat.py b/packages/llm/local_llm/agents/chat.py index a3aa86405..db236e2d6 100644 --- a/packages/llm/local_llm/agents/chat.py +++ b/packages/llm/local_llm/agents/chat.py @@ -66,8 +66,6 @@ def print_input_prompt(self): if __name__ == "__main__": - from local_llm.utils import ArgParser - parser = ArgParser() parser.add_argument("-it", "--interactive", action="store_true", help="enable interactive user input from the terminal") args = parser.parse_args() diff --git a/packages/llm/local_llm/agents/mp_chat.py b/packages/llm/local_llm/agents/mp_chat.py new file mode 100644 index 000000000..43d3bbdab --- /dev/null +++ b/packages/llm/local_llm/agents/mp_chat.py @@ -0,0 +1,26 @@ +#!/usr/bin/env python3 +from local_llm import Agent, Pipeline + +from local_llm.plugins import UserPrompt, ChatQuery, PrintStream, ProcessProxy +from local_llm.utils import ArgParser + + +class MultiprocessChat(Agent): + """ + Test of running a LLM and chat session in a subprocess. + """ + def __init__(self, **kwargs): + super().__init__() + + self.pipeline = Pipeline([ + UserPrompt(interactive=False, **kwargs), # interactive=False if kwargs.get('prompt') else True + ProcessProxy((lambda **kwargs: ChatQuery(**kwargs)), **kwargs), + PrintStream(color='green') + ]) + + +if __name__ == "__main__": + parser = ArgParser() + args = parser.parse_args() + + agent = MultiprocessChat(**vars(args)).run() \ No newline at end of file diff --git a/packages/llm/local_llm/agents/mp_test.py b/packages/llm/local_llm/agents/mp_test.py new file mode 100644 index 000000000..0d2d63d30 --- /dev/null +++ b/packages/llm/local_llm/agents/mp_test.py @@ -0,0 +1,31 @@ +#!/usr/bin/env python3 +import time + +from local_llm import Agent, Pipeline + +from local_llm.plugins import PrintStream, ProcessProxy +from local_llm.utils import ArgParser + + +class MultiprocessTest(Agent): + """ + This is a test of the ProcessProxy plugin for running pipelines and plugins in their own processes. 
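The corrected `Agent.__call__` above forwards directly to `process(input, channel)`, which is what lets agents like `mp_test.py` be driven by simply calling `agent(...)` in a loop. Below is a minimal single-process sketch of that pattern, assuming the `Pipeline`/`PrintStream`/`ArgParser` API exactly as it appears in this diff (it mirrors `mp_test.py` without the `ProcessProxy` subprocess):

```python
#!/usr/bin/env python3
# Minimal sketch, assuming the local_llm Pipeline/Plugin API used in this diff.
import time

from local_llm import Agent, Pipeline
from local_llm.plugins import PrintStream
from local_llm.utils import ArgParser


class PrintChain(Agent):
    """
    Chains two PrintStream plugins so the same text is echoed twice,
    like mp_test.py but without ProcessProxy/subprocesses.
    """
    def __init__(self, **kwargs):
        super().__init__()

        self.pipeline = Pipeline([
            PrintStream(color='green', relay=True, **kwargs),  # relays its input downstream
            PrintStream(color='blue', **kwargs),
        ])


if __name__ == "__main__":
    args = ArgParser().parse_args()
    agent = PrintChain(**vars(args)).start()

    while True:
        agent("hello from the parent process")  # Agent.__call__ -> process(input, channel=0)
        time.sleep(1.0)
```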
+ """ + def __init__(self, **kwargs): + super().__init__() + + self.pipeline = Pipeline([ + ProcessProxy((lambda **kwargs: PrintStream(**kwargs)), color='green', relay=True, **kwargs), + PrintStream(color='blue', **kwargs), + ]) + + +if __name__ == "__main__": + parser = ArgParser() + args = parser.parse_args() + + agent = MultiprocessTest(**vars(args)).start() + + while True: + agent("INSERT MESSAGE HERE") + time.sleep(1.0) \ No newline at end of file diff --git a/packages/llm/local_llm/agents/mp_video.py b/packages/llm/local_llm/agents/mp_video.py new file mode 100644 index 000000000..93bd922c1 --- /dev/null +++ b/packages/llm/local_llm/agents/mp_video.py @@ -0,0 +1,35 @@ +#!/usr/bin/env python3 +import logging + +from local_llm import Agent + +from local_llm.plugins import VideoSource, VideoOutput, ProcessProxy +from local_llm.utils import ArgParser + + +class MultiprocessVideo(Agent): + """ + Test of running a video stream across processes. + """ + def __init__(self, **kwargs): + super().__init__() + + self.video_source = VideoSource(return_tensors='np', **kwargs) + self.video_output = ProcessProxy((lambda **kwargs: VideoOutput(**kwargs)), **kwargs) + + self.video_source.add(self.on_video, threaded=False) + self.video_source.add(self.video_output) + + self.pipeline = [self.video_source] + + def on_video(self, image): + logging.debug(f"captured {image.shape} ({image.dtype}) frame from {self.video_source.resource}") + + +if __name__ == "__main__": + from local_llm.utils import ArgParser + + parser = ArgParser(extras=['video_input', 'video_output', 'log']) + args = parser.parse_args() + + agent = MultiprocessVideo(**vars(args)).run() \ No newline at end of file diff --git a/packages/llm/local_llm/agents/video_query.py b/packages/llm/local_llm/agents/video_query.py index 9d4c25ed1..f0e3b9e2a 100644 --- a/packages/llm/local_llm/agents/video_query.py +++ b/packages/llm/local_llm/agents/video_query.py @@ -1,56 +1,116 @@ #!/usr/bin/env python3 -from local_llm import Agent, Pipeline, ChatTemplates +import time +import threading -from local_llm.plugins import UserPrompt, ChatQuery, PrintStream +from local_llm import Agent + +from local_llm.plugins import VideoSource, VideoOutput, ChatQuery, PrintStream, ProcessProxy from local_llm.utils import ArgParser, print_table from termcolor import cprint +from jetson_utils import cudaFont, cudaMemcpy, cudaToNumpy, cudaDeviceSynchronize + -# -# TODO max-new-tokens and other generation args... -# TODO custom arg parser -# -class ChatAgent(Agent): +class VideoQuery(Agent): """ - Agent for two-turn multimodal chat. + Perpetual always-on closed-loop visual agent that applies prompts to a video stream. 
""" - def __init__(self, model="meta-llama/Llama-2-7b-chat-hf", interactive=True, **kwargs): + def __init__(self, model="liuhaotian/llava-v1.5-13b", **kwargs): super().__init__() + + # load model in another process for smooth streaming + self.llm = ProcessProxy((lambda **kwargs: ChatQuery(model, drop_inputs=True, **kwargs)), **kwargs) + self.llm.add(PrintStream(color='green', relay=True).add(self.on_eos)) + self.llm.start() + + # test / warm-up query + self.warmup = True + self.last_text = "" + self.eos = False - """ - # Equivalent to: - self.pipeline = UserPrompt(interactive=interactive, **kwargs).add( - LLMQuery(model, **kwargs).add( - PrintStream(relay=True).add(self.on_eos) - )) - """ - self.pipeline = Pipeline([ - UserPrompt(interactive=interactive, **kwargs), - ChatQuery(model, **kwargs), - PrintStream(relay=True), - self.on_eos - ]) + self.llm("What is 2+2?") - self.model = self.pipeline[ChatQuery].model - self.interactive = interactive + while self.warmup: + time.sleep(0.25) + + # create video streams + self.video_source = VideoSource(**kwargs) + self.video_output = VideoOutput(**kwargs) - self.print_input_prompt() - - def on_eos(self, input): - if input.endswith(''): - print_table(self.model.stats) - self.print_input_prompt() + self.video_source.add(self.on_video, threaded=False) + self.video_output.start() + + self.font = cudaFont() - def print_input_prompt(self): - if self.interactive: - cprint('>> PROMPT: ', 'blue', end='', flush=True) + # setup prompts + self.prompt = 0 + self.prompts = kwargs.get('prompt') + if not self.prompts: + self.prompts = [ + 'Describe the image concisely.', + 'How many fingers is the person holding up?', + 'What does the text in the image say?', + 'There is a question asked in the image. What is the answer?', + ] -if __name__ == "__main__": - from local_llm.utils import ArgParser + self.keyboard_thread = threading.Thread(target=self.poll_keyboard) + self.keyboard_thread.start() + + # entry node + self.pipeline = [self.video_source] + + def on_video(self, image): + np_image = cudaToNumpy(image) + cudaDeviceSynchronize() + + self.llm([ + 'reset', + np_image, + self.prompts[self.prompt], + ]) + + text = self.last_text.replace('\n', '').replace('', '').strip() - parser = ArgParser() - parser.add_argument("-it", "--interactive", action="store_true", help="enable interactive user input from the terminal") + if text: + self.font.OverlayText(image, text=text, x=5, y=42, color=self.font.White, background=self.font.Gray40) + + self.font.OverlayText(image, text=self.prompts[self.prompt], x=5, y=5, color=(120,215,21), background=self.font.Gray40) + self.video_output(image) + + def on_eos(self, text): + if self.eos: + self.last_text = text # new query response + self.eos = False + elif not self.warmup: # don't view warmup response + self.last_text = self.last_text + text + + if text.endswith(''): + #print_table(self.llm.model.stats) + self.warmup = False + self.eos = True + + def poll_keyboard(self): + while True: + try: + key = input().strip() #getch.getch() + + if key == 'd' or key == 'l': + self.prompt = (self.prompt + 1) % len(self.prompts) + elif key == 'a' or key == 'j': + self.prompt = self.prompt - 1 + if self.prompt < 0: + self.prompt = len(self.prompts) - 1 + + num = int(key) + + if num > 0 and num <= len(self.prompts): + self.prompt = num - 1 + except Exception as err: + continue + +if __name__ == "__main__": + parser = ArgParser(extras=ArgParser.Defaults+['video_input', 'video_output']) args = parser.parse_args() - - agent = 
ChatAgent(**vars(args)).run() \ No newline at end of file + agent = VideoQuery(**vars(args)).run() + \ No newline at end of file diff --git a/packages/llm/local_llm/agents/video_stream.py b/packages/llm/local_llm/agents/video_stream.py new file mode 100644 index 000000000..79f168cf9 --- /dev/null +++ b/packages/llm/local_llm/agents/video_stream.py @@ -0,0 +1,44 @@ +#!/usr/bin/env python3 +import logging + +from local_llm import Agent + +from local_llm.plugins import VideoSource, VideoOutput +from local_llm.utils import ArgParser + + +class VideoStream(Agent): + """ + Relay, view, or test a video stream. Use the --video-input and --video-output arguments + to set the video source and output protocols used from: + + https://github.com/dusty-nv/jetson-inference/blob/master/docs/aux-streaming.md + + For example, this will capture a V4L2 camera and serve it via WebRTC with H.264 encoding: + + python3 -m local_llm.agents.video_stream \ + --video-input /dev/video0 \ + --video-output webrtc://@:8554/output + + It's also used as a basic test of video streaming before using more complex agents that rely on it. + """ + def __init__(self, video_input=None, video_output=None, **kwargs): + super().__init__() + + self.video_source = VideoSource(video_input, **kwargs) + self.video_output = VideoOutput(video_output, **kwargs) + + self.video_source.add(self.on_video, threaded=False) + self.video_source.add(self.video_output) + + self.pipeline = [self.video_source] + + def on_video(self, image): + logging.debug(f"captured {image.width}x{image.height} frame from {self.video_source.resource}") + + +if __name__ == "__main__": + parser = ArgParser(extras=['video_input', 'video_output', 'log']) + args = parser.parse_args() + + agent = VideoStream(**vars(args)).run() \ No newline at end of file diff --git a/packages/llm/local_llm/agents/voice_chat.py b/packages/llm/local_llm/agents/voice_chat.py index 07381b5da..52c36ac9b 100644 --- a/packages/llm/local_llm/agents/voice_chat.py +++ b/packages/llm/local_llm/agents/voice_chat.py @@ -78,8 +78,6 @@ def on_eos(self, text): if __name__ == "__main__": - from local_llm.utils import ArgParser - parser = ArgParser(extras=ArgParser.Defaults+['asr', 'tts', 'audio_output']) args = parser.parse_args() diff --git a/packages/llm/local_llm/agents/web_chat.py b/packages/llm/local_llm/agents/web_chat.py index feed079d6..f22d492e1 100644 --- a/packages/llm/local_llm/agents/web_chat.py +++ b/packages/llm/local_llm/agents/web_chat.py @@ -112,8 +112,6 @@ def start(self): if __name__ == "__main__": - from local_llm.utils import ArgParser - parser = ArgParser(extras=ArgParser.Defaults+['asr', 'tts', 'audio_output', 'web']) args = parser.parse_args() diff --git a/packages/llm/local_llm/history.py b/packages/llm/local_llm/history.py index 4b5c1f506..d172cde3a 100644 --- a/packages/llm/local_llm/history.py +++ b/packages/llm/local_llm/history.py @@ -285,7 +285,7 @@ def embed_chat(self, use_cache=True): cached = embed_key in entry and use_cache if logging.getLogger().isEnabledFor(logging.DEBUG) and not cached: - logging.debug(f"processing chat entry {i} role='{entry.role}' template='{role_template}' open_user_prompt={open_user_prompt} cached={'true' if cached else 'false'} {key}='{entry[key]}'".replace('\n', '\\n')) + logging.debug(f"processing chat entry {i} role='{entry.role}' template='{role_template}' open_user_prompt={open_user_prompt} cached={'true' if cached else 'false'} {key}='{entry[key] if isinstance(entry[key], str) else type(entry[key])}'".replace('\n', '\\n')) if use_cache: if 
embed_key not in entry: # TODO and entry.role != 'bot' -- only compute bot embeddings when needed diff --git a/packages/llm/local_llm/local_llm.py b/packages/llm/local_llm/local_llm.py index 28c97e592..5e0571d37 100644 --- a/packages/llm/local_llm/local_llm.py +++ b/packages/llm/local_llm/local_llm.py @@ -76,9 +76,11 @@ def from_pretrained(model, api=None, **kwargs): else: raise ValueError(f"invalid API: {api}") + # moved CLIP to after LLM is loaded because of MLC CUDA errors when running in subprocess + model.init_vision() model.config.load_time = time.perf_counter() - load_begin - print_table(model.config) + return model def generate(self, inputs, streaming=True, **kwargs): @@ -144,7 +146,8 @@ def __init__(self, model_path, **kwargs): self.model_path = model_path self.model_config = AutoConfig.from_pretrained(model_path) - self.init_vision(**kwargs) + # moved CLIP to after LLM is loaded because of MLC CUDA errors when running in subprocess + #self.init_vision(**kwargs) def init_vision(self, **kwargs): """ diff --git a/packages/llm/local_llm/models/mlc.py b/packages/llm/local_llm/models/mlc.py index 6b6c94572..a6c90c44e 100644 --- a/packages/llm/local_llm/models/mlc.py +++ b/packages/llm/local_llm/models/mlc.py @@ -285,13 +285,9 @@ def _generate(self, stream): # decode until EOS or max_new_tokens time_begin_decode = time.perf_counter() - + while True: - #time_begin_sample = time.perf_counter() token = self._sample(output[0], do_sample, temperature, top_p, repetition_penalty) - #sample_time = (time.perf_counter() - time_begin_sample) * 1000 - #print(f"SAMPLE_TIME: {sample_time:.2f} SAMPLE_RATE: {1000/sample_time:.2f}") - stream.output_tokens.append(token) stream.event.set() @@ -306,15 +302,11 @@ def _generate(self, stream): stream.kv_cache.num_tokens += 1 self.stats.output_tokens += 1 - - #time_begin_decode2 = time.perf_counter() + output = self._decode( tvm.nd.array(np.array([[stream.output_tokens[-1]]], dtype=np.int32), self.device), tvm.runtime.ShapeTuple([stream.kv_cache.num_tokens]), stream.kv_cache, self.params ) - #decode_time = (time.perf_counter() - time_begin_decode2) * 1000 - #print(f"DECODE_TIME: {decode_time:.2f} DECODE_RATE: {1000/decode_time:.2f}") - #time.sleep(0.225) time_end_decode = time.perf_counter() diff --git a/packages/llm/local_llm/plugin.py b/packages/llm/local_llm/plugin.py index eee863b9d..d719c3e24 100644 --- a/packages/llm/local_llm/plugin.py +++ b/packages/llm/local_llm/plugin.py @@ -127,6 +127,7 @@ def input(self, input): TODO: multiple input channels? 
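The `drop_inputs` path handled in `Plugin.input()` below (and used by `VideoQuery` via `ChatQuery(..., drop_inputs=True)`) amounts to a keep-only-the-latest queue, so a slow consumer such as the LLM always works on the freshest frame instead of a growing backlog. An illustrative, standalone sketch of that policy (not the library's implementation):

```python
# Illustrative sketch of the "drop_inputs" policy: discard any backlog, keep the newest item.
import queue

def put_latest(q: queue.Queue, item):
    """Clear everything already queued, then enqueue the newest input."""
    while True:
        try:
            q.get_nowait()   # equivalent of clear_inputs()
        except queue.Empty:
            break
    q.put(item)

q = queue.Queue()
for frame_id in range(5):
    put_latest(q, frame_id)

print(q.get())   # -> 4, only the newest input survives
```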
""" if self.threaded: + #self.start() # thread may not be started if plugin only called from a callback if self.drop_inputs: self.clear_inputs() self.input_queue.put(input) diff --git a/packages/llm/local_llm/plugins/__init__.py b/packages/llm/local_llm/plugins/__init__.py index ce2de01b1..ec65a9fdb 100644 --- a/packages/llm/local_llm/plugins/__init__.py +++ b/packages/llm/local_llm/plugins/__init__.py @@ -5,6 +5,7 @@ from .print_stream import PrintStream from .user_prompt import UserPrompt from .rate_limit import RateLimit +from .process_proxy import ProcessProxy from .audio import AudioOutputDevice, AudioOutputFile from .video import VideoSource, VideoOutput diff --git a/packages/llm/local_llm/plugins/chat_query.py b/packages/llm/local_llm/plugins/chat_query.py index 240d4a7b5..b08da189c 100644 --- a/packages/llm/local_llm/plugins/chat_query.py +++ b/packages/llm/local_llm/plugins/chat_query.py @@ -2,7 +2,7 @@ import logging from local_llm import Plugin, LocalLM, ChatHistory -from local_llm.utils import print_table +from local_llm.utils import ImageTypes, print_table class ChatQuery(Plugin): @@ -75,6 +75,10 @@ def __init__(self, model="meta-llama/Llama-2-7b-chat-hf", **kwargs): self.temperature = kwargs.get('temperature', 0.7) self.top_p = kwargs.get('top_p', 0.95) + #warmup_query = '2+2 is ' + #logging.debug(f"Warming up LLM with query '{warmup_query}'") + #logging.debug(f"Warmup response: '{self.model.generate(warmup_query, streaming=False)}'") + def process(self, input, **kwargs): """ Generate the reply to a prompt or the latest ChatHistory. @@ -105,13 +109,13 @@ def process(self, input, **kwargs): return # add prompt to chat history - if isinstance(input, str) or isinstance(input, dict): + if isinstance(input, str) or isinstance(input, dict) or isinstance(input, ImageTypes): self.chat_history.append(role='user', msg=input) chat_history = self.chat_history elif isinstance(input, ChatHistory): chat_history = input else: - raise TypeError(f"LLMQuery plugin expects inputs of type str, dict, or ChatHistory (was {type(input)})") + raise TypeError(f"LLMQuery plugin expects inputs of type str, dict, image, or ChatHistory (was {type(input)})") # images should be followed by text prompts if 'image' in chat_history[-1] and 'text' not in chat_history[-1]: diff --git a/packages/llm/local_llm/plugins/process_proxy.py b/packages/llm/local_llm/plugins/process_proxy.py new file mode 100644 index 000000000..c1366fe94 --- /dev/null +++ b/packages/llm/local_llm/plugins/process_proxy.py @@ -0,0 +1,124 @@ +#!/usr/bin/env python3 +import os +import time +import pickle +import logging +import threading +import multiprocessing as mp + +from local_llm import Plugin + + +class ProcessProxy(Plugin): + """ + Proxy wrapper for running a plugin in a subprocess + """ + def __init__(self, plugin_factory, **kwargs): + """ + Parameters: + plugin_factory (callable) -- Factory function that returns a plugin instance. + This will be called from the new process to create it. 
+        """
+        self.data_parent, self.data_child = mp.Pipe(duplex=True)
+        self.control_parent, self.control_child = mp.Pipe(duplex=True)
+
+        self.subprocess = mp.Process(target=self.run_process, args=(plugin_factory, kwargs))
+        self.subprocess.start()
+
+        init_msg = self.control_parent.recv()
+
+        if init_msg['status'] != 'initialized':
+            raise RuntimeError(f"subprocess has an invalid initialization status ({init_msg['status']})")
+
+        super().__init__(output_channels=init_msg['output_channels'], **kwargs)
+        logging.info(f"ProcessProxy initialized, output_channels={self.output_channels}")
+
+    def input(self, input):
+        #time_begin = time.perf_counter()
+        #input_type = type(input)
+        input = pickle.dumps(input, protocol=-1)
+        #logging.debug(f"ProcessProxy time to pickle {input_type} - {(time.perf_counter()-time_begin)*1000} ms")
+        self.data_parent.send_bytes(input)
+
+    def start(self):
+        self.control_parent.send('start')
+        self.assert_message(self.control_parent.recv(), 'started')
+        return super().start()
+
+    def run(self):
+        while True:
+            output, channel = pickle.loads(self.data_parent.recv_bytes())
+            #logging.debug(f"parent process received {type(output)} (channel={channel})")
+            self.output(output, channel)
+
+    def run_process(self, factory, kwargs):
+        logging.debug(f"subprocess {os.getpid()} started")
+
+        try:
+            # create an instance of the plugin
+            plugin = factory(**kwargs)
+        except Exception as error:
+            self.control_child.send({'status': str(type(error))})
+            raise error
+
+        self.control_child.send({
+            'status': 'initialized',
+            'output_channels': plugin.output_channels,
+        })
+
+        # forward outputs back to parent process
+        for i in range(plugin.output_channels):
+            plugin.add(OutputProxy(self.data_child, i), i)
+
+        # start the plugin processing
+        self.assert_message(self.control_child.recv(), 'start')
+
+        try:
+            plugin.start()
+        except Exception as error:
+            self.control_child.send(str(type(error)))
+            raise error
+
+        # start a thread to receive control messages from the parent
+        control_thread = threading.Thread(target=self.run_control)
+        control_thread.start()
+
+        self.control_child.send('started')
+
+        # receive inputs from the parent to process
+        while True:
+            #time_begin = time.perf_counter()
+            input = self.data_child.recv_bytes()
+            input = pickle.loads(input)
+            #logging.debug(f"subprocess received {str(type(input))} input (depickle={(time.perf_counter()-time_begin)*1000} ms)")
+            plugin(input)
+
+    def run_control(self):
+        while True:
+            msg = self.control_child.recv()
+            logging.debug(f"subprocess received control msg from parent process ('{msg}')")
+
+    def assert_message(self, msg, expected, verbose=True):
+        if msg != expected:
+            raise RuntimeError(f"received unexpected cross-process message '{msg}' (expected '{expected}')")
+        elif verbose:
+            logging.debug(f"received cross-process message '{msg}'")
+
+
+class OutputProxy(Plugin):
+    def __init__(self, pipe, channel, **kwargs):
+        super().__init__(threaded=False)
+        self.pipe = pipe
+        self.channel = channel
+        self.enabled = True
+
+    def process(self, input, **kwargs):
+        #logging.debug(f"subprocess sending {type(input)} {input} (channel={self.channel})")
+        try:
+            if self.enabled:
+                self.pipe.send_bytes(pickle.dumps((input, self.channel), protocol=-1)) #self.pipe.send((input, self.channel))
+        except ValueError as error:
+            if 'pickle' in str(error):
+                logging.info(f"subprocess output could not be pickled ({type(input)}), disabling channel {self.channel}")
+                self.enabled = False
+
\ No newline at end of file
diff --git 
a/packages/llm/local_llm/plugins/video.py b/packages/llm/local_llm/plugins/video.py index d47fbc097..6d4f5b919 100644 --- a/packages/llm/local_llm/plugins/video.py +++ b/packages/llm/local_llm/plugins/video.py @@ -5,7 +5,7 @@ from local_llm import Plugin from local_llm.utils import cuda_image -from jetson_utils import videoSource, videoOutput, cudaMemcpy +from jetson_utils import videoSource, videoOutput, cudaDeviceSynchronize, cudaToNumpy class VideoSource(Plugin): @@ -16,7 +16,7 @@ class VideoSource(Plugin): def __init__(self, video_input='/dev/video0', video_input_width=None, video_input_height=None, video_input_codec=None, video_input_framerate=None, - **kwargs): + video_input_save=None, return_tensors='cuda', **kwargs): """ Parameters: @@ -24,6 +24,7 @@ def __init__(self, video_input='/dev/video0', input_width (int) -- the disired width in pixels (default uses stream's resolution) input_height (int) -- the disired height in pixels (default uses stream's resolution) input_codec (str) -- force a particular codec ('h264', 'h265', 'vp8', 'vp9', 'mjpeg', ect) + return_tensors (str) -- the object datatype of the image to output ('np', 'pt', 'cuda') """ super().__init__(**kwargs) @@ -41,20 +42,32 @@ def __init__(self, video_input='/dev/video0', if video_input_framerate: options['framerate'] = video_input_framerate + if video_input_save: + options['save'] = video_input_save + self.stream = videoSource(video_input, options=options) - self.resource = video_input - + self.resource = video_input # self.stream.GetOptions().resource['string'] + self.return_tensors = return_tensors + def run(self): """ Capture images from the video source as long as it's streaming """ while True: image = self.stream.Capture(format='rgb8', timeout=-1) - + if image is None: continue + + if self.return_tensors == 'pt': + image = torch.as_tensor(image, device='cuda') + elif self.return_tensors == 'np': + image = cudaToNumpy(image) + cudaDeviceSynchronize() + elif self.return_tensors != 'cuda': + raise ValueError(f"return_tensors should be 'np', 'pt', or 'cuda' (was '{self.return_tensors}')") - self.output(cudaMemcpy(image)) + self.output(image) class VideoOutput(Plugin): @@ -62,7 +75,7 @@ class VideoOutput(Plugin): Saves images to a compressed video or directory of individual images, the display, or a network stream. 
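The new `return_tensors` option decides how `VideoSource.run()` hands frames downstream: `'cuda'` keeps the raw `cudaImage`, `'np'` maps it to a numpy array (with a `cudaDeviceSynchronize()` so the capture has finished before the host reads it), and `'pt'` wraps it as a CUDA tensor via `__cuda_array_interface__` (this branch assumes `torch` is imported in `video.py`). A condensed sketch of the same logic, assuming `jetson_utils` and a CUDA-enabled PyTorch build:

```python
# Sketch of the return_tensors conversion added to VideoSource.run(); not the plugin itself.
import torch
from jetson_utils import videoSource, cudaToNumpy, cudaDeviceSynchronize

def capture_as(stream, return_tensors='cuda'):
    image = stream.Capture(format='rgb8', timeout=-1)   # cudaImage, or None on timeout

    if image is None or return_tensors == 'cuda':
        return image
    if return_tensors == 'np':
        array = cudaToNumpy(image)   # zero-copy mapping of the CUDA buffer
        cudaDeviceSynchronize()      # make sure the frame is fully written
        return array
    if return_tensors == 'pt':
        # cudaImage exposes __cuda_array_interface__, so the tensor stays on the GPU
        return torch.as_tensor(image, device='cuda')
    raise ValueError(f"return_tensors should be 'np', 'pt', or 'cuda' (was '{return_tensors}')")

if __name__ == "__main__":
    stream = videoSource('/dev/video0', options={'width': 1280, 'height': 720})
    frame = capture_as(stream, return_tensors='np')
    print(None if frame is None else frame.shape)
```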
https://github.com/dusty-nv/jetson-inference/blob/master/docs/aux-streaming.md """ - def __init__(self, video_output=None, video_output_codec=None, video_output_bitrate=None, **kwargs): + def __init__(self, video_output=None, video_output_codec=None, video_output_bitrate=None, video_output_save=None, **kwargs): """ Parameters: @@ -80,8 +93,14 @@ def __init__(self, video_output=None, video_output_codec=None, video_output_bitr if video_output_bitrate: options['bitrate'] = video_output_bitrate + if video_output_save: + options['save'] = video_output_save + + if video_output is None: + video_output = '' + self.stream = videoOutput(video_output, options=options) - self.resource = output + self.resource = video_output def process(self, input, **kwargs): """ diff --git a/packages/llm/local_llm/requirements.txt b/packages/llm/local_llm/requirements.txt index 6ed0b2055..4ceb674a4 100644 --- a/packages/llm/local_llm/requirements.txt +++ b/packages/llm/local_llm/requirements.txt @@ -2,4 +2,5 @@ websockets termcolor tabulate flask -tqdm \ No newline at end of file +tqdm +getch \ No newline at end of file diff --git a/packages/llm/local_llm/utils/args.py b/packages/llm/local_llm/utils/args.py index 6d71b33f6..cbfa29a40 100644 --- a/packages/llm/local_llm/utils/args.py +++ b/packages/llm/local_llm/utils/args.py @@ -5,6 +5,7 @@ import logging from .log import LogFormatter +from .prompts import DefaultChatPrompts, DefaultCompletionPrompts class ArgParser(argparse.ArgumentParser): @@ -61,12 +62,14 @@ def __init__(self, extras=Defaults, **kwargs): self.add_argument("--video-input-height", type=int, default=None, help="manually set the resolution of the video input") self.add_argument("--video-input-codec", type=str, default=None, choices=['h264', 'h265', 'vp8', 'vp9', 'mjpeg'], help="manually set the input video codec to use") self.add_argument("--video-input-framerate", type=int, default=None, help="set the desired framerate of input video") + self.add_argument("--video-input-save", type=str, default=None, help="path to video file to save the incoming video feed to") if 'video_output' in extras: self.add_argument("--video-output", type=str, default=None, help="display, stream URL, file/dir path") self.add_argument("--video-output-codec", type=str, default=None, choices=['h264', 'h265', 'vp8', 'vp9', 'mjpeg'], help="set the output video codec to use") self.add_argument("--video-output-bitrate", type=int, default=None, help="set the output bitrate to use") - + self.add_argument("--video-output-save", type=str, default=None, help="path to video file to save the outgoing video stream to") + # AUDIO if 'audio_input' not in extras and 'asr' in extras: extras += ['audio_input'] diff --git a/packages/llm/local_llm/utils/image.py b/packages/llm/local_llm/utils/image.py index 054b61707..422133535 100644 --- a/packages/llm/local_llm/utils/image.py +++ b/packages/llm/local_llm/utils/image.py @@ -18,6 +18,18 @@ def is_image(image): """ return isinstance(image, ImageTypes) + +def image_size(image): + """ + Returns the dimensions of the image as a tuple (height, width, channels) + """ + if isinstance(image, (cudaImage, np.ndarray, torch.Tensor)): + return image.shape + elif isinstance(image, PIL.Image.Image): + return image.size + else: + raise TypeError(f"expected an image of type {ImageTypes} (was {type(image)})") + def load_image(path): """ @@ -65,6 +77,17 @@ def cuda_image(image): format=torch_image_format(input) ) + +def torch_image(image): + """ + Convert the image to a type that is compatible with PyTorch 
(torch.Tensor, ndarray, PIL.Image) + """ + if isinstance(image, cudaImage): + return torch.as_tensor(image, device='cuda') + elif is_image(image): + return image + raise TypeError(f"expected an image of type {ImageTypes} (was {type(image)})") + def torch_image_format(tensor): """ diff --git a/packages/llm/local_llm/vision/clip_hf.py b/packages/llm/local_llm/vision/clip_hf.py index 2d13618df..a37479bee 100644 --- a/packages/llm/local_llm/vision/clip_hf.py +++ b/packages/llm/local_llm/vision/clip_hf.py @@ -6,7 +6,7 @@ import logging from transformers import CLIPImageProcessor, CLIPVisionModel -from ..utils import AttributeDict, load_image, download_model, print_table +from ..utils import AttributeDict, load_image, torch_image, image_size, download_model, print_table _clip_model_cache = dict(image={}, text={}) @@ -56,11 +56,11 @@ def __call__(self, image, crop=False, hidden_state=None, return_tensors='pt', ** """ if isinstance(image, str): image = load_image(image) - + else: + image = torch_image(image) + time_begin_pre = time.perf_counter() - - image_size = image.size - + if not crop: image = image.resize(self.config.input_shape, PIL.Image.BILINEAR) # PIL.Image.BICUBIC @@ -85,7 +85,7 @@ def __call__(self, image, crop=False, hidden_state=None, return_tensors='pt', ** self.stats.clip_rate = 1.0 / self.stats.clip_time self.stats.preprocess_time = time_begin_enc - time_begin_pre self.stats.encode_time = time_end_enc - time_begin_enc - self.stats.input_shape = f"{image_size[0]}x{image_size[1]} -> {self.model.config.image_size}x{self.model.config.image_size}" + self.stats.input_shape = f"{image_size(image)} -> {self.model.config.image_size}x{self.model.config.image_size}" self.stats.output_shape = self.config.output_shape if return_tensors == 'pt': diff --git a/packages/llm/mlc/Dockerfile b/packages/llm/mlc/Dockerfile index a20f63ff2..18e126a83 100644 --- a/packages/llm/mlc/Dockerfile +++ b/packages/llm/mlc/Dockerfile @@ -18,12 +18,13 @@ ARG MLC_PATCH ARG CUDAARCHS ARG TORCH_CUDA_ARCH_LIST +ARG LLVM_VERSION # install LLVM the upstream way instead of apt because of: # https://discourse.llvm.org/t/llvm-15-0-7-missing-libpolly-a/67942 RUN wget https://apt.llvm.org/llvm.sh && \ chmod +x llvm.sh && \ - ./llvm.sh all && \ + ./llvm.sh ${LLVM_VERSION} all && \ ln -s /usr/bin/llvm-config-* /usr/bin/llvm-config # could NOT find zstd (missing: zstd_LIBRARY zstd_INCLUDE_DIR) diff --git a/packages/llm/mlc/config.py b/packages/llm/mlc/config.py index f79102853..58ee536c0 100644 --- a/packages/llm/mlc/config.py +++ b/packages/llm/mlc/config.py @@ -11,7 +11,7 @@ 'TORCH_CUDA_ARCH_LIST': ';'.join([f'{x/10:.1f}' for x in CUDA_ARCHITECTURES]) } -def mlc(version, patch, tag=None, requires=None, default=False): +def mlc(version, patch, llvm=17, tag=None, requires=None, default=False): pkg = copy.deepcopy(package) if default: @@ -28,6 +28,7 @@ def mlc(version, patch, tag=None, requires=None, default=False): pkg['build_args'].update({ 'MLC_VERSION': version, 'MLC_PATCH': patch, + 'LLVM_VERSION': llvm, }) pkg['notes'] = f"[{repo}](https://github.com/{repo}/tree/{version}) commit SHA [`{version}`](https://github.com/{repo}/tree/{version})" @@ -40,7 +41,7 @@ def mlc(version, patch, tag=None, requires=None, default=False): #default_dev=(L4T_VERSION.major >= 36) package = [ - mlc(latest_sha, 'patches/51fb0f4.diff', tag='dev'), #, default=default_dev), - mlc('9bf5723', 'patches/9bf5723.diff', requires='==35.*'), # 10/20/2023 - mlc('51fb0f4', 'patches/51fb0f4.diff', default=True), # 12/15/2023 + mlc(latest_sha, 
'patches/51fb0f4.diff', llvm=18, tag='dev'), #, default=default_dev), + mlc('9bf5723', 'patches/9bf5723.diff', llvm=17, requires='==35.*'), # 10/20/2023 + mlc('51fb0f4', 'patches/51fb0f4.diff', llvm=17, default=True), # 12/15/2023 ] \ No newline at end of file
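The `llvm` parameter added to `mlc()` ends up as the `LLVM_VERSION` build-arg consumed by `./llvm.sh ${LLVM_VERSION} all` in the Dockerfile, so each MLC variant can pin its own LLVM release. A simplified sketch of how that argument flows through the package config (the real jetson-containers package dict carries more fields than shown here):

```python
# Simplified sketch of config.py's llvm -> LLVM_VERSION plumbing.
import copy

package = {'name': 'mlc', 'build_args': {}}

def mlc(version, patch, llvm=17, tag=None, requires=None, default=False):
    pkg = copy.deepcopy(package)

    if tag:
        pkg['name'] = f"mlc:{tag}"
    if requires:
        pkg['requires'] = requires

    pkg['build_args'] = {
        'MLC_VERSION': version,
        'MLC_PATCH': patch,
        'LLVM_VERSION': llvm,   # consumed by: RUN ./llvm.sh ${LLVM_VERSION} all
    }
    return pkg

if __name__ == "__main__":
    for pkg in [mlc('51fb0f4', 'patches/51fb0f4.diff', llvm=17, default=True),
                mlc('9bf5723', 'patches/9bf5723.diff', llvm=17, requires='==35.*')]:
        print(pkg['name'], pkg['build_args'])
```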