From 8a88903e9ab6145e1d55b1042419fbe26d5ddb8d Mon Sep 17 00:00:00 2001
From: RemiliaForever
Date: Fri, 3 Oct 2025 19:51:01 +0800
Subject: [PATCH 1/3] fix(run): pass param to serve

---
 runner/cmd/nexa-cli/run.go          |  7 ++++++
 runner/server/handler/completion.go | 33 ++++++++++++++++++++---------
 2 files changed, 30 insertions(+), 10 deletions(-)

diff --git a/runner/cmd/nexa-cli/run.go b/runner/cmd/nexa-cli/run.go
index ba7a6f6e..73b5b36a 100644
--- a/runner/cmd/nexa-cli/run.go
+++ b/runner/cmd/nexa-cli/run.go
@@ -257,6 +257,13 @@ func runFunc(cmd *cobra.Command, args []string) {
         FrequencyPenalty: openai.Float(float64(frequencyPenalty)),
         Seed:             openai.Int(int64(seed)),
     },
+
+        option.WithJSONSet("enable_think", enableThink),
+        option.WithJSONSet("top_k", topK),
+        option.WithJSONSet("min_p", minP),
+        option.WithJSONSet("repetition_penalty", repetitionPenalty),
+        option.WithJSONSet("grammar_path", grammarPath),
+        option.WithJSONSet("grammar_string", grammarString),
         option.WithJSONSet("enable_json", enableJson),
         option.WithHeaderAdd("Nexa-KeepCache", "true"))
 
diff --git a/runner/server/handler/completion.go b/runner/server/handler/completion.go
index 6018294a..341be2e4 100644
--- a/runner/server/handler/completion.go
+++ b/runner/server/handler/completion.go
@@ -29,20 +29,33 @@ type ChatCompletionNewParams openai.ChatCompletionNewParams
 // ChatCompletionRequest defines the request body for the chat completions API.
 // example: { "model": "nexaml/nexaml-models", "messages": [ { "role": "user", "content": "why is the sky blue?" } ] }
 type ChatCompletionRequest struct {
-    Stream bool `json:"stream" default:"false"`
+    Stream bool `json:"stream"`
 
-    EnableThink bool `json:"enable_think" default:"true"`
-
-    TopK              int32   `json:"top_k" default:"0"`
-    MinP              float32 `json:"min_p" default:"0.0"`
-    ReqetitionPenalty float32 `json:"repetition_penalty" default:"1.0"`
-    GrammarPath       string  `json:"grammar_path" default:""`
-    GrammarString     string  `json:"grammar_string" default:""`
-    EnableJson        bool    `json:"enable_json" default:"false"`
+    EnableThink       bool    `json:"enable_think"`
+    TopK              int32   `json:"top_k"`
+    MinP              float32 `json:"min_p"`
+    ReqetitionPenalty float32 `json:"repetition_penalty"`
+    GrammarPath       string  `json:"grammar_path"`
+    GrammarString     string  `json:"grammar_string"`
+    EnableJson        bool    `json:"enable_json"`
 
     ChatCompletionNewParams
 }
 
+func defaultChatCompletionRequest() ChatCompletionRequest {
+    return ChatCompletionRequest{
+        Stream:      false,
+        EnableThink: true,
+
+        TopK:              0,
+        MinP:              0.0,
+        ReqetitionPenalty: 1.0,
+        GrammarPath:       "",
+        GrammarString:     "",
+        EnableJson:        false,
+    }
+}
+
 var toolCallRegex = regexp.MustCompile(`<tool_call>([\s\S]+)<\/tool_call>` + "|" + "```json([\\s\\S]+)```")
 
 // @Router /chat/completions [post]
@@ -53,7 +66,7 @@
 // @Accept json
 // @Produce json
 // @Success 200 {object} openai.ChatCompletion "Successful response for non-streaming requests."
 func ChatCompletions(c *gin.Context) {
-    param := ChatCompletionRequest{}
+    param := defaultChatCompletionRequest()
     if err := c.ShouldBindJSON(&param); err != nil {
         c.JSON(http.StatusBadRequest, map[string]any{"error": err.Error()})
         return

From cc249f2560b52db1a9d339b0890a3de39309ceae Mon Sep 17 00:00:00 2001
From: RemiliaForever
Date: Fri, 3 Oct 2025 19:57:11 +0800
Subject: [PATCH 2/3] fix(run): add system prompt in warm up request

---
 runner/cmd/nexa-cli/run.go          | 9 +++++++--
 runner/server/handler/completion.go | 4 ++--
 2 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/runner/cmd/nexa-cli/run.go b/runner/cmd/nexa-cli/run.go
index 73b5b36a..57f5ffca 100644
--- a/runner/cmd/nexa-cli/run.go
+++ b/runner/cmd/nexa-cli/run.go
@@ -37,6 +37,7 @@ func run() *cobra.Command {
 
     llmFlags := pflag.NewFlagSet("LLM/VLM Model", pflag.ExitOnError)
     llmFlags.SortFlags = false
+    llmFlags.BoolVarP(&enableThink, "think", "", true, "enable thinking mode")
     llmFlags.StringVarP(&systemPrompt, "system-prompt", "s", "", "system prompt to set model behavior")
     runCmd.Flags().AddFlagSet(llmFlags)
 
@@ -186,10 +187,14 @@ func runFunc(cmd *cobra.Command, args []string) {
     // warm up
     spin := render.NewSpinner("loading model...")
     spin.Start()
-    _, err = client.Chat.Completions.New(context.TODO(), openai.ChatCompletionNewParams{
+    warmUpRequest := openai.ChatCompletionNewParams{
         Messages: nil,
         Model:    model,
-    })
+    }
+    if systemPrompt != "" {
+        warmUpRequest.Messages = append(warmUpRequest.Messages, openai.SystemMessage(systemPrompt))
+    }
+    _, err = client.Chat.Completions.New(context.TODO(), warmUpRequest)
     spin.Stop()
 
     if err != nil {
diff --git a/runner/server/handler/completion.go b/runner/server/handler/completion.go
index 341be2e4..9d9c1e6f 100644
--- a/runner/server/handler/completion.go
+++ b/runner/server/handler/completion.go
@@ -138,7 +138,7 @@ func chatCompletionsLLM(c *gin.Context, param ChatCompletionRequest) {
         return
     }
     // Empty request for warm up
-    if len(param.Messages) == 0 {
+    if len(param.Messages) <= 0 || (systemPrompt != "" && len(param.Messages) <= 1) {
         c.JSON(http.StatusOK, nil)
         return
     }
@@ -368,7 +368,7 @@ func chatCompletionsVLM(c *gin.Context, param ChatCompletionRequest) {
     }
 
     // Empty request for warm up, just reset model state
-    if len(param.Messages) == 0 {
+    if len(param.Messages) <= 0 || (systemPrompt != "" && len(param.Messages) <= 1) {
         c.JSON(http.StatusOK, nil)
         return
     }

From d61ef79c5d2d1567762da4e39d9a70d8c286a585 Mon Sep 17 00:00:00 2001
From: RemiliaForever
Date: Fri, 3 Oct 2025 20:07:53 +0800
Subject: [PATCH 3/3] fix(serve): fix typo

---
 runner/server/handler/completion.go | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/runner/server/handler/completion.go b/runner/server/handler/completion.go
index 9d9c1e6f..6740eea5 100644
--- a/runner/server/handler/completion.go
+++ b/runner/server/handler/completion.go
@@ -34,7 +34,7 @@ type ChatCompletionRequest struct {
     EnableThink       bool    `json:"enable_think"`
     TopK              int32   `json:"top_k"`
     MinP              float32 `json:"min_p"`
-    ReqetitionPenalty float32 `json:"repetition_penalty"`
+    RepetitionPenalty float32 `json:"repetition_penalty"`
     GrammarPath       string  `json:"grammar_path"`
     GrammarString     string  `json:"grammar_string"`
     EnableJson        bool    `json:"enable_json"`
@@ -49,7 +49,7 @@ func defaultChatCompletionRequest() ChatCompletionRequest {
 
         TopK:              0,
         MinP:              0.0,
-        ReqetitionPenalty: 1.0,
+        RepetitionPenalty: 1.0,
         GrammarPath:       "",
         GrammarString:     "",
         EnableJson:        false,
@@ -138,7 +138,7 @@ func chatCompletionsLLM(c *gin.Context, param ChatCompletionRequest) {
         return
     }
     // Empty request for warm up
-    if len(param.Messages) <= 0 || (systemPrompt != "" && len(param.Messages) <= 1) {
+    if len(param.Messages) == 0 || (systemPrompt != "" && len(param.Messages) <= 1) {
         c.JSON(http.StatusOK, nil)
         return
     }
@@ -368,7 +368,7 @@ func chatCompletionsVLM(c *gin.Context, param ChatCompletionRequest) {
     }
 
     // Empty request for warm up, just reset model state
-    if len(param.Messages) <= 0 || (systemPrompt != "" && len(param.Messages) <= 1) {
+    if len(param.Messages) == 0 || (systemPrompt != "" && len(param.Messages) <= 1) {
         c.JSON(http.StatusOK, nil)
         return
     }
@@ -507,7 +507,7 @@ func parseSamplerConfig(param ChatCompletionRequest) *nexa_sdk.SamplerConfig {
         TopP:              float32(param.TopP.Value),
         TopK:              param.TopK,
         MinP:              param.MinP,
-        RepetitionPenalty: param.ReqetitionPenalty,
+        RepetitionPenalty: param.RepetitionPenalty,
         PresencePenalty:   float32(param.PresencePenalty.Value),
         FrequencyPenalty:  float32(param.FrequencyPenalty.Value),
         Seed:              int32(param.Seed.Value),
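
Note on PATCH 1/3: the `default:"..."` struct tags removed here are not honored by Go's
encoding/json, and therefore not by gin's ShouldBindJSON either; fields absent from the
request body simply keep their zero values. That is why the defaults move into
defaultChatCompletionRequest() and the handler binds into a pre-populated struct. A
minimal standalone sketch of the pattern follows (illustrative names only, not the
runner's actual types):

    package main

    import (
        "encoding/json"
        "fmt"
    )

    type request struct {
        EnableThink       bool    `json:"enable_think"`
        RepetitionPenalty float32 `json:"repetition_penalty"`
    }

    // defaultRequest mirrors the role of defaultChatCompletionRequest():
    // any field the client omits keeps the value set here.
    func defaultRequest() request {
        return request{EnableThink: true, RepetitionPenalty: 1.0}
    }

    func main() {
        req := defaultRequest()
        // Unmarshal only overwrites keys present in the body, so
        // enable_think stays true while repetition_penalty is overridden.
        if err := json.Unmarshal([]byte(`{"repetition_penalty": 1.2}`), &req); err != nil {
            panic(err)
        }
        fmt.Println(req.EnableThink, req.RepetitionPenalty) // true 1.2
    }

The same reading explains the warm-up check touched in PATCH 2/3 and 3/3: once the CLI
attaches the configured system prompt to the otherwise-empty warm-up request, a request
carrying only that single system message must still count as a warm-up, hence the extra
`systemPrompt != "" && len(param.Messages) <= 1` clause in the handlers.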