From 8a88903e9ab6145e1d55b1042419fbe26d5ddb8d Mon Sep 17 00:00:00 2001
From: RemiliaForever
Date: Fri, 3 Oct 2025 19:51:01 +0800
Subject: [PATCH 1/3] fix(run): pass param to serve

---
 runner/cmd/nexa-cli/run.go          |  7 ++++++
 runner/server/handler/completion.go | 33 ++++++++++++++++++++---------
 2 files changed, 30 insertions(+), 10 deletions(-)

diff --git a/runner/cmd/nexa-cli/run.go b/runner/cmd/nexa-cli/run.go
index ba7a6f6e..73b5b36a 100644
--- a/runner/cmd/nexa-cli/run.go
+++ b/runner/cmd/nexa-cli/run.go
@@ -257,6 +257,13 @@ func runFunc(cmd *cobra.Command, args []string) {
         FrequencyPenalty: openai.Float(float64(frequencyPenalty)),
         Seed:             openai.Int(int64(seed)),
     },
+
+        option.WithJSONSet("enable_think", enableThink),
+        option.WithJSONSet("top_k", topK),
+        option.WithJSONSet("min_p", minP),
+        option.WithJSONSet("repetition_penalty", repetitionPenalty),
+        option.WithJSONSet("grammar_path", grammarPath),
+        option.WithJSONSet("grammar_string", grammarString),
         option.WithJSONSet("enable_json", enableJson),
         option.WithHeaderAdd("Nexa-KeepCache", "true"))
 
diff --git a/runner/server/handler/completion.go b/runner/server/handler/completion.go
index 6018294a..341be2e4 100644
--- a/runner/server/handler/completion.go
+++ b/runner/server/handler/completion.go
@@ -29,20 +29,33 @@ type ChatCompletionNewParams openai.ChatCompletionNewParams
 // ChatCompletionRequest defines the request body for the chat completions API.
 // example: { "model": "nexaml/nexaml-models", "messages": [ { "role": "user", "content": "why is the sky blue?" } ] }
 type ChatCompletionRequest struct {
-    Stream bool `json:"stream" default:"false"`
+    Stream bool `json:"stream"`
 
-    EnableThink bool `json:"enable_think" default:"true"`
-
-    TopK              int32   `json:"top_k" default:"0"`
-    MinP              float32 `json:"min_p" default:"0.0"`
-    ReqetitionPenalty float32 `json:"repetition_penalty" default:"1.0"`
-    GrammarPath       string  `json:"grammar_path" default:""`
-    GrammarString     string  `json:"grammar_string" default:""`
-    EnableJson        bool    `json:"enable_json" default:"false"`
+    EnableThink       bool    `json:"enable_think"`
+    TopK              int32   `json:"top_k"`
+    MinP              float32 `json:"min_p"`
+    ReqetitionPenalty float32 `json:"repetition_penalty"`
+    GrammarPath       string  `json:"grammar_path"`
+    GrammarString     string  `json:"grammar_string"`
+    EnableJson        bool    `json:"enable_json"`
 
     ChatCompletionNewParams
 }
 
+func defaultChatCompletionRequest() ChatCompletionRequest {
+    return ChatCompletionRequest{
+        Stream:      false,
+        EnableThink: true,
+
+        TopK:              0,
+        MinP:              0.0,
+        ReqetitionPenalty: 1.0,
+        GrammarPath:       "",
+        GrammarString:     "",
+        EnableJson:        false,
+    }
+}
+
 var toolCallRegex = regexp.MustCompile(`<tool_call>([\s\S]+)<\/tool_call>` + "|" + "```json([\\s\\S]+)```")
 
 // @Router /chat/completions [post]
@@ -53,7 +66,7 @@
 // @Accept json
 // @Produce json
 // @Success 200 {object} openai.ChatCompletion "Successful response for non-streaming requests."
 func ChatCompletions(c *gin.Context) {
-    param := ChatCompletionRequest{}
+    param := defaultChatCompletionRequest()
     if err := c.ShouldBindJSON(&param); err != nil {
         c.JSON(http.StatusBadRequest, map[string]any{"error": err.Error()})
         return

From cc249f2560b52db1a9d339b0890a3de39309ceae Mon Sep 17 00:00:00 2001
From: RemiliaForever
Date: Fri, 3 Oct 2025 19:57:11 +0800
Subject: [PATCH 2/3] fix(run): add system prompt in warm up request

---
 runner/cmd/nexa-cli/run.go          | 9 +++++++--
 runner/server/handler/completion.go | 4 ++--
 2 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/runner/cmd/nexa-cli/run.go b/runner/cmd/nexa-cli/run.go
index 73b5b36a..57f5ffca 100644
--- a/runner/cmd/nexa-cli/run.go
+++ b/runner/cmd/nexa-cli/run.go
@@ -37,6 +37,7 @@ func run() *cobra.Command {
 
     llmFlags := pflag.NewFlagSet("LLM/VLM Model", pflag.ExitOnError)
     llmFlags.SortFlags = false
+    llmFlags.BoolVarP(&enableThink, "think", "", true, "enable thinking mode")
     llmFlags.StringVarP(&systemPrompt, "system-prompt", "s", "", "system prompt to set model behavior")
     runCmd.Flags().AddFlagSet(llmFlags)
 
@@ -186,10 +187,14 @@ func runFunc(cmd *cobra.Command, args []string) {
     // warm up
     spin := render.NewSpinner("loading model...")
     spin.Start()
-    _, err = client.Chat.Completions.New(context.TODO(), openai.ChatCompletionNewParams{
+    warmUpRequest := openai.ChatCompletionNewParams{
         Messages: nil,
         Model:    model,
-    })
+    }
+    if systemPrompt != "" {
+        warmUpRequest.Messages = append(warmUpRequest.Messages, openai.SystemMessage(systemPrompt))
+    }
+    _, err = client.Chat.Completions.New(context.TODO(), warmUpRequest)
     spin.Stop()
 
     if err != nil {
diff --git a/runner/server/handler/completion.go b/runner/server/handler/completion.go
index 341be2e4..9d9c1e6f 100644
--- a/runner/server/handler/completion.go
+++ b/runner/server/handler/completion.go
@@ -138,7 +138,7 @@ func chatCompletionsLLM(c *gin.Context, param ChatCompletionRequest) {
         return
     }
     // Empty request for warm up
-    if len(param.Messages) == 0 {
+    if len(param.Messages) <= 0 || (systemPrompt != "" && len(param.Messages) <= 1) {
         c.JSON(http.StatusOK, nil)
         return
     }
@@ -368,7 +368,7 @@ func chatCompletionsVLM(c *gin.Context, param ChatCompletionRequest) {
     }
 
     // Empty request for warm up, just reset model state
-    if len(param.Messages) == 0 {
+    if len(param.Messages) <= 0 || (systemPrompt != "" && len(param.Messages) <= 1) {
         c.JSON(http.StatusOK, nil)
         return
     }

From d61ef79c5d2d1567762da4e39d9a70d8c286a585 Mon Sep 17 00:00:00 2001
From: RemiliaForever
Date: Fri, 3 Oct 2025 20:07:53 +0800
Subject: [PATCH 3/3] fix(serve): fix typo

---
 runner/server/handler/completion.go | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/runner/server/handler/completion.go b/runner/server/handler/completion.go
index 9d9c1e6f..6740eea5 100644
--- a/runner/server/handler/completion.go
+++ b/runner/server/handler/completion.go
@@ -34,7 +34,7 @@ type ChatCompletionRequest struct {
     EnableThink       bool    `json:"enable_think"`
     TopK              int32   `json:"top_k"`
     MinP              float32 `json:"min_p"`
-    ReqetitionPenalty float32 `json:"repetition_penalty"`
+    RepetitionPenalty float32 `json:"repetition_penalty"`
     GrammarPath       string  `json:"grammar_path"`
     GrammarString     string  `json:"grammar_string"`
     EnableJson        bool    `json:"enable_json"`
@@ -49,7 +49,7 @@ func defaultChatCompletionRequest() ChatCompletionRequest {
 
         TopK:              0,
         MinP:              0.0,
-        ReqetitionPenalty: 1.0,
+        RepetitionPenalty: 1.0,
         GrammarPath:       "",
         GrammarString:     "",
         EnableJson:        false,
@@ -138,7 +138,7 @@ func chatCompletionsLLM(c *gin.Context, param ChatCompletionRequest) {
         return
     }
     // Empty request for warm up
-    if len(param.Messages) <= 0 || (systemPrompt != "" && len(param.Messages) <= 1) {
+    if len(param.Messages) == 0 || (systemPrompt != "" && len(param.Messages) <= 1) {
         c.JSON(http.StatusOK, nil)
         return
     }
@@ -368,7 +368,7 @@ func chatCompletionsVLM(c *gin.Context, param ChatCompletionRequest) {
     }
 
     // Empty request for warm up, just reset model state
-    if len(param.Messages) <= 0 || (systemPrompt != "" && len(param.Messages) <= 1) {
+    if len(param.Messages) == 0 || (systemPrompt != "" && len(param.Messages) <= 1) {
         c.JSON(http.StatusOK, nil)
         return
     }
@@ -507,7 +507,7 @@ func parseSamplerConfig(param ChatCompletionRequest) *nexa_sdk.SamplerConfig {
         TopP:              float32(param.TopP.Value),
         TopK:              param.TopK,
         MinP:              param.MinP,
-        RepetitionPenalty: param.ReqetitionPenalty,
+        RepetitionPenalty: param.RepetitionPenalty,
         PresencePenalty:   float32(param.PresencePenalty.Value),
         FrequencyPenalty:  float32(param.FrequencyPenalty.Value),
         Seed:              int32(param.Seed.Value),
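
Note on PATCH 1/3: the `default:"..."` struct tags removed here are not honored by Go's
encoding/json, and therefore not by gin's ShouldBindJSON either; fields absent from the
request body simply keep their zero values. That is why the defaults move into
defaultChatCompletionRequest() and the handler binds into a pre-populated struct. A
minimal standalone sketch of the pattern follows (illustrative names only, not the
runner's actual types):

    package main

    import (
        "encoding/json"
        "fmt"
    )

    type request struct {
        EnableThink       bool    `json:"enable_think"`
        RepetitionPenalty float32 `json:"repetition_penalty"`
    }

    // defaultRequest mirrors the role of defaultChatCompletionRequest():
    // any field the client omits keeps the value set here.
    func defaultRequest() request {
        return request{EnableThink: true, RepetitionPenalty: 1.0}
    }

    func main() {
        req := defaultRequest()
        // Unmarshal only overwrites keys present in the body, so
        // enable_think stays true while repetition_penalty is overridden.
        if err := json.Unmarshal([]byte(`{"repetition_penalty": 1.2}`), &req); err != nil {
            panic(err)
        }
        fmt.Println(req.EnableThink, req.RepetitionPenalty) // true 1.2
    }

The same reading explains the warm-up check touched in PATCH 2/3 and 3/3: once the CLI
attaches the configured system prompt to the otherwise-empty warm-up request, a request
carrying only that single system message must still count as a warm-up, hence the extra
`systemPrompt != "" && len(param.Messages) <= 1` clause in the handlers.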