Commit

updated local_llm and MLC containers
dusty-nv committed Jan 27, 2024
1 parent a408d40 commit dcdb996
Showing 24 changed files with 447 additions and 84 deletions.
2 changes: 1 addition & 1 deletion packages/llm/local_llm/Dockerfile
@@ -1,7 +1,7 @@
#---
# name: local_llm
# group: llm
# depends: [mlc, jetson-utils, riva-client:python]
# depends: [mlc, riva-client:python, jetson-inference, torch2trt]
# requires: '>=34.1.0'
# docs: docs.md
#---
2 changes: 1 addition & 1 deletion packages/llm/local_llm/agent.py
@@ -35,7 +35,7 @@ def __call__(self, input, channel=0, **kwargs):
        """
        Operator overload for process()
        """
        return self.process[channel](input, **kwargs)
        return self.process(input, channel, **kwargs)

    def start(self):
        """
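The __call__ change above forwards the channel index into process() rather than indexing a per-channel callable. A minimal illustrative sketch of the new dispatch (not the actual local_llm.Agent class, names are hypothetical):

#!/usr/bin/env python3
# Sketch of the dispatch style used by the updated Agent.__call__():
# the channel index is passed as an argument to a single process() method.
class TinyAgent:
    def process(self, input, channel=0):
        return f"channel {channel} got: {input}"

    def __call__(self, input, channel=0, **kwargs):
        return self.process(input, channel, **kwargs)

if __name__ == "__main__":
    agent = TinyAgent()
    print(agent("hello"))             # -> channel 0 got: hello
    print(agent("hello", channel=1))  # -> channel 1 got: hello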
2 changes: 0 additions & 2 deletions packages/llm/local_llm/agents/chat.py
@@ -66,8 +66,6 @@ def print_input_prompt(self):


if __name__ == "__main__":
    from local_llm.utils import ArgParser

    parser = ArgParser()
    parser.add_argument("-it", "--interactive", action="store_true", help="enable interactive user input from the terminal")
    args = parser.parse_args()
26 changes: 26 additions & 0 deletions packages/llm/local_llm/agents/mp_chat.py
@@ -0,0 +1,26 @@
#!/usr/bin/env python3
from local_llm import Agent, Pipeline

from local_llm.plugins import UserPrompt, ChatQuery, PrintStream, ProcessProxy
from local_llm.utils import ArgParser


class MultiprocessChat(Agent):
"""
Test of running a LLM and chat session in a subprocess.
"""
def __init__(self, **kwargs):
super().__init__()

self.pipeline = Pipeline([
UserPrompt(interactive=False, **kwargs), # interactive=False if kwargs.get('prompt') else True
ProcessProxy((lambda **kwargs: ChatQuery(**kwargs)), **kwargs),
PrintStream(color='green')
])


if __name__ == "__main__":
parser = ArgParser()
args = parser.parse_args()

agent = MultiprocessChat(**vars(args)).run()
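For context, ProcessProxy takes a plugin factory (here a lambda returning a ChatQuery) so the plugin is constructed and run inside its own process while the parent relays inputs and outputs. A minimal sketch of that general pattern, assuming nothing about the actual local_llm ProcessProxy implementation (all names below are hypothetical):

#!/usr/bin/env python3
# Hedged sketch of a process-proxy pattern: the parent sends inputs over a queue,
# a worker process builds the plugin from a factory and relays its outputs back.
import multiprocessing as mp

def _worker(factory, inputs, outputs):
    plugin = factory()                    # construct the plugin inside the subprocess
    for item in iter(inputs.get, None):   # None acts as the shutdown sentinel
        outputs.put(plugin(item))

def make_plugin():
    # stand-in factory for something like (lambda **kwargs: ChatQuery(**kwargs))
    return str.upper

class SimpleProcessProxy:
    def __init__(self, factory):
        self.inputs = mp.Queue()
        self.outputs = mp.Queue()
        self.proc = mp.Process(target=_worker, args=(factory, self.inputs, self.outputs), daemon=True)
        self.proc.start()

    def __call__(self, item):
        self.inputs.put(item)
        return self.outputs.get()

    def stop(self):
        self.inputs.put(None)
        self.proc.join()

if __name__ == "__main__":
    proxy = SimpleProcessProxy(make_plugin)
    print(proxy("hello from the parent process"))   # -> HELLO FROM THE PARENT PROCESS
    proxy.stop()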
31 changes: 31 additions & 0 deletions packages/llm/local_llm/agents/mp_test.py
@@ -0,0 +1,31 @@
#!/usr/bin/env python3
import time

from local_llm import Agent, Pipeline

from local_llm.plugins import PrintStream, ProcessProxy
from local_llm.utils import ArgParser


class MultiprocessTest(Agent):
"""
This is a test of the ProcessProxy plugin for running pipelines and plugins in their own processes.
"""
def __init__(self, **kwargs):
super().__init__()

self.pipeline = Pipeline([
ProcessProxy((lambda **kwargs: PrintStream(**kwargs)), color='green', relay=True, **kwargs),
PrintStream(color='blue', **kwargs),
])


if __name__ == "__main__":
parser = ArgParser()
args = parser.parse_args()

agent = MultiprocessTest(**vars(args)).start()

while True:
agent("INSERT MESSAGE HERE")
time.sleep(1.0)
35 changes: 35 additions & 0 deletions packages/llm/local_llm/agents/mp_video.py
@@ -0,0 +1,35 @@
#!/usr/bin/env python3
import logging

from local_llm import Agent

from local_llm.plugins import VideoSource, VideoOutput, ProcessProxy
from local_llm.utils import ArgParser


class MultiprocessVideo(Agent):
"""
Test of running a video stream across processes.
"""
def __init__(self, **kwargs):
super().__init__()

self.video_source = VideoSource(return_tensors='np', **kwargs)
self.video_output = ProcessProxy((lambda **kwargs: VideoOutput(**kwargs)), **kwargs)

self.video_source.add(self.on_video, threaded=False)
self.video_source.add(self.video_output)

self.pipeline = [self.video_source]

def on_video(self, image):
logging.debug(f"captured {image.shape} ({image.dtype}) frame from {self.video_source.resource}")


if __name__ == "__main__":
from local_llm.utils import ArgParser

parser = ArgParser(extras=['video_input', 'video_output', 'log'])
args = parser.parse_args()

agent = MultiprocessVideo(**vars(args)).run()
138 changes: 99 additions & 39 deletions packages/llm/local_llm/agents/video_query.py
@@ -1,56 +1,116 @@
#!/usr/bin/env python3
from local_llm import Agent, Pipeline, ChatTemplates
import time
import threading

from local_llm.plugins import UserPrompt, ChatQuery, PrintStream
from local_llm import Agent

from local_llm.plugins import VideoSource, VideoOutput, ChatQuery, PrintStream, ProcessProxy
from local_llm.utils import ArgParser, print_table

from termcolor import cprint
from jetson_utils import cudaFont, cudaMemcpy, cudaToNumpy, cudaDeviceSynchronize


#
# TODO max-new-tokens and other generation args...
# TODO custom arg parser
#
class ChatAgent(Agent):
class VideoQuery(Agent):
"""
Agent for two-turn multimodal chat.
Perpetual always-on closed-loop visual agent that applies prompts to a video stream.
"""
def __init__(self, model="meta-llama/Llama-2-7b-chat-hf", interactive=True, **kwargs):
def __init__(self, model="liuhaotian/llava-v1.5-13b", **kwargs):
super().__init__()

# load model in another process for smooth streaming
self.llm = ProcessProxy((lambda **kwargs: ChatQuery(model, drop_inputs=True, **kwargs)), **kwargs)
self.llm.add(PrintStream(color='green', relay=True).add(self.on_eos))
self.llm.start()

# test / warm-up query
self.warmup = True
self.last_text = ""
self.eos = False

"""
# Equivalent to:
self.pipeline = UserPrompt(interactive=interactive, **kwargs).add(
LLMQuery(model, **kwargs).add(
PrintStream(relay=True).add(self.on_eos)
))
"""
self.pipeline = Pipeline([
UserPrompt(interactive=interactive, **kwargs),
ChatQuery(model, **kwargs),
PrintStream(relay=True),
self.on_eos
])
self.llm("What is 2+2?")

self.model = self.pipeline[ChatQuery].model
self.interactive = interactive
while self.warmup:
time.sleep(0.25)

# create video streams
self.video_source = VideoSource(**kwargs)
self.video_output = VideoOutput(**kwargs)

self.print_input_prompt()

def on_eos(self, input):
if input.endswith('</s>'):
print_table(self.model.stats)
self.print_input_prompt()
self.video_source.add(self.on_video, threaded=False)
self.video_output.start()

self.font = cudaFont()

def print_input_prompt(self):
if self.interactive:
cprint('>> PROMPT: ', 'blue', end='', flush=True)
# setup prompts
self.prompt = 0
self.prompts = kwargs.get('prompt')

if not self.prompts:
self.prompts = [
'Describe the image concisely.',
'How many fingers is the person holding up?',
'What does the text in the image say?',
'There is a question asked in the image. What is the answer?',
]

if __name__ == "__main__":
from local_llm.utils import ArgParser
self.keyboard_thread = threading.Thread(target=self.poll_keyboard)
self.keyboard_thread.start()

# entry node
self.pipeline = [self.video_source]

def on_video(self, image):
np_image = cudaToNumpy(image)
cudaDeviceSynchronize()

self.llm([
'reset',
np_image,
self.prompts[self.prompt],
])

text = self.last_text.replace('\n', '').replace('</s>', '').strip()

parser = ArgParser()
parser.add_argument("-it", "--interactive", action="store_true", help="enable interactive user input from the terminal")
if text:
self.font.OverlayText(image, text=text, x=5, y=42, color=self.font.White, background=self.font.Gray40)

self.font.OverlayText(image, text=self.prompts[self.prompt], x=5, y=5, color=(120,215,21), background=self.font.Gray40)
self.video_output(image)

def on_eos(self, text):
if self.eos:
self.last_text = text # new query response
self.eos = False
elif not self.warmup: # don't view warmup response
self.last_text = self.last_text + text

if text.endswith('</s>'):
#print_table(self.llm.model.stats)
self.warmup = False
self.eos = True

def poll_keyboard(self):
while True:
try:
key = input().strip() #getch.getch()

if key == 'd' or key == 'l':
self.prompt = (self.prompt + 1) % len(self.prompts)
elif key == 'a' or key == 'j':
self.prompt = self.prompt - 1
if self.prompt < 0:
self.prompt = len(self.prompts) - 1

num = int(key)

if num > 0 and num <= len(self.prompts):
self.prompt = num - 1
except Exception as err:
continue

if __name__ == "__main__":
parser = ArgParser(extras=ArgParser.Defaults+['video_input', 'video_output'])
args = parser.parse_args()

agent = ChatAgent(**vars(args)).run()
agent = VideoQuery(**vars(args)).run()
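The new on_eos() callback accumulates streamed tokens and uses the '</s>' end-of-sequence marker to latch a completed reply for the video overlay. A standalone hedged sketch of that accumulation logic (illustrative only, the warmup handling is omitted and the class name is hypothetical):

#!/usr/bin/env python3
# Sketch of stream accumulation as done in VideoQuery.on_eos(): chunks are appended
# until '</s>' arrives, then the full reply is latched for the overlay text.
class StreamAccumulator:
    def __init__(self):
        self.last_text = ""   # latest in-progress or completed reply
        self.eos = False      # True once the previous reply has finished

    def on_token(self, text):
        if self.eos:
            self.last_text = text        # first chunk of a new reply resets the buffer
            self.eos = False
        else:
            self.last_text += text       # otherwise keep appending

        if text.endswith('</s>'):        # end-of-sequence -> reply is complete
            self.eos = True

    def overlay_text(self):
        # strip newlines and the EOS marker before drawing, as on_video() does
        return self.last_text.replace('\n', '').replace('</s>', '').strip()

if __name__ == "__main__":
    acc = StreamAccumulator()
    for chunk in ["The person is ", "holding up ", "three fingers.</s>"]:
        acc.on_token(chunk)
    print(acc.overlay_text())   # -> The person is holding up three fingers.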

44 changes: 44 additions & 0 deletions packages/llm/local_llm/agents/video_stream.py
@@ -0,0 +1,44 @@
#!/usr/bin/env python3
import logging

from local_llm import Agent

from local_llm.plugins import VideoSource, VideoOutput
from local_llm.utils import ArgParser


class VideoStream(Agent):
"""
Relay, view, or test a video stream. Use the --video-input and --video-output arguments
to set the video source and output protocols used from:
https://github.com/dusty-nv/jetson-inference/blob/master/docs/aux-streaming.md
For example, this will capture a V4L2 camera and serve it via WebRTC with H.264 encoding:
python3 -m local_llm.agents.video_stream \
--video-input /dev/video0 \
--video-output webrtc://@:8554/output
It's also used as a basic test of video streaming before using more complex agents that rely on it.
"""
def __init__(self, video_input=None, video_output=None, **kwargs):
super().__init__()

self.video_source = VideoSource(video_input, **kwargs)
self.video_output = VideoOutput(video_output, **kwargs)

self.video_source.add(self.on_video, threaded=False)
self.video_source.add(self.video_output)

self.pipeline = [self.video_source]

def on_video(self, image):
logging.debug(f"captured {image.width}x{image.height} frame from {self.video_source.resource}")


if __name__ == "__main__":
parser = ArgParser(extras=['video_input', 'video_output', 'log'])
args = parser.parse_args()

agent = VideoStream(**vars(args)).run()
2 changes: 0 additions & 2 deletions packages/llm/local_llm/agents/voice_chat.py
@@ -78,8 +78,6 @@ def on_eos(self, text):


if __name__ == "__main__":
    from local_llm.utils import ArgParser

    parser = ArgParser(extras=ArgParser.Defaults+['asr', 'tts', 'audio_output'])
    args = parser.parse_args()

2 changes: 0 additions & 2 deletions packages/llm/local_llm/agents/web_chat.py
@@ -112,8 +112,6 @@ def start(self):


if __name__ == "__main__":
    from local_llm.utils import ArgParser

    parser = ArgParser(extras=ArgParser.Defaults+['asr', 'tts', 'audio_output', 'web'])
    args = parser.parse_args()

2 changes: 1 addition & 1 deletion packages/llm/local_llm/history.py
@@ -285,7 +285,7 @@ def embed_chat(self, use_cache=True):
            cached = embed_key in entry and use_cache

            if logging.getLogger().isEnabledFor(logging.DEBUG) and not cached:
                logging.debug(f"processing chat entry {i} role='{entry.role}' template='{role_template}' open_user_prompt={open_user_prompt} cached={'true' if cached else 'false'} {key}='{entry[key]}'".replace('\n', '\\n'))
                logging.debug(f"processing chat entry {i} role='{entry.role}' template='{role_template}' open_user_prompt={open_user_prompt} cached={'true' if cached else 'false'} {key}='{entry[key] if isinstance(entry[key], str) else type(entry[key])}'".replace('\n', '\\n'))

            if use_cache:
                if embed_key not in entry: # TODO and entry.role != 'bot' -- only compute bot embeddings when needed
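The history.py change keeps debug logs readable when a chat entry holds a non-text input (such as an image tensor) by logging its type instead of its value. A tiny illustration of that guard (hypothetical values, not the actual ChatHistory code):

#!/usr/bin/env python3
# Log strings verbatim, but only the type of non-string entries, mirroring the
# guarded f-string added in embed_chat().
import logging
logging.basicConfig(level=logging.DEBUG)

def describe(value):
    return value if isinstance(value, str) else type(value)

logging.debug(f"entry text={describe('hello')}")        # logs: entry text=hello
logging.debug(f"entry image={describe(bytearray(8))}")  # logs: entry image=<class 'bytearray'>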
7 changes: 5 additions & 2 deletions packages/llm/local_llm/local_llm.py
@@ -76,9 +76,11 @@ def from_pretrained(model, api=None, **kwargs):
        else:
            raise ValueError(f"invalid API: {api}")

        # moved CLIP to after LLM is loaded because of MLC CUDA errors when running in subprocess
        model.init_vision()
        model.config.load_time = time.perf_counter() - load_begin

        print_table(model.config)

        return model

    def generate(self, inputs, streaming=True, **kwargs):
@@ -144,7 +146,8 @@ def __init__(self, model_path, **kwargs):
        self.model_path = model_path
        self.model_config = AutoConfig.from_pretrained(model_path)

        self.init_vision(**kwargs)
        # moved CLIP to after LLM is loaded because of MLC CUDA errors when running in subprocess
        #self.init_vision(**kwargs)

    def init_vision(self, **kwargs):
        """
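As the inline comments explain, constructing the CLIP vision encoder before the MLC LLM was loaded triggered CUDA errors when the model runs in a subprocess, so vision init now happens in from_pretrained() after the LLM backend is up. A generic hedged sketch of that deferred-initialization ordering (hypothetical class and attribute names, not the actual local_llm classes):

#!/usr/bin/env python3
# Sketch of deferred initialization: the heavyweight secondary component is built only
# after the primary backend has finished loading, rather than inside __init__().
import time

class FakeLLM:
    def __init__(self, model_path):
        self.model_path = model_path
        self.vision = None              # no longer initialized here

    def init_vision(self):
        # stands in for loading a CLIP/vision encoder once the LLM backend exists
        self.vision = f"vision-encoder-for-{self.model_path}"

    @staticmethod
    def from_pretrained(model_path):
        begin = time.perf_counter()
        model = FakeLLM(model_path)     # 1) load the LLM backend first
        model.init_vision()             # 2) then bring up the vision encoder
        model.load_time = time.perf_counter() - begin
        return model

if __name__ == "__main__":
    m = FakeLLM.from_pretrained("liuhaotian/llava-v1.5-13b")
    print(m.vision, f"loaded in {m.load_time:.4f}s")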