From 56176897f7bad8bdec0c6a6db53223ac7683ba5e Mon Sep 17 00:00:00 2001 From: ksg Date: Sat, 25 Jan 2025 04:10:32 +0900 Subject: [PATCH 1/6] Refactor Ollama analyzer for stricter JSON use Signed-off-by: ksg --- .../analyzers/llm_analyzers/ollama.cr | 79 ++++++++++++------- src/llm/ollama/ollama.cr | 1 + 2 files changed, 51 insertions(+), 29 deletions(-) diff --git a/src/analyzer/analyzers/llm_analyzers/ollama.cr b/src/analyzer/analyzers/llm_analyzers/ollama.cr index f4ec7ab2..71142318 100644 --- a/src/analyzer/analyzers/llm_analyzers/ollama.cr +++ b/src/analyzer/analyzers/llm_analyzers/ollama.cr @@ -26,11 +26,27 @@ module Analyzer::AI # Filter files that are likely to contain endpoints filter_prompt = <<-PROMPT - !! Respond only in JSON format. Do not include explanations, comments, or any additional text. !! - --- - Analyze the following list of file paths and identify which files are likely to represent endpoints, including API endpoints, web pages, or static resources. - Exclude directories from the analysis and focus only on individual files. - Return the result as a JSON array of file paths that should be analyzed further. + Analyze the provided list of file paths and identify individual files that are likely to represent endpoints, such as API endpoints, web pages, or static resources. + Ignore directories and focus exclusively on files. + + Return the result strictly in the following JSON structure: + { + "files": [ + "string / e.g., /path/to/file1", + "string / e.g., /path/to/file2", + "string / e.g., /path/to/file3" + ] + } + + If no relevant files are found, return: + { + "files": [] + } + + Guidelines: + - Do not include directories in the output. + - Focus on files related to endpoints (API, web pages, or static resources). + - Provide only the JSON response with no explanations or additional text. File paths: #{all_paths.join("\n")} @@ -40,7 +56,7 @@ module Analyzer::AI filtered_paths = JSON.parse(filter_response.to_s) logger.debug_sub filter_response - filtered_paths.as_a.each do |fpath| + filtered_paths["files"].as_a.each do |fpath| target_paths << fpath.as_s end else @@ -61,30 +77,34 @@ module Analyzer::AI begin prompt = <<-PROMPT - !! Respond only in JSON format. Do not include explanations, comments, or any additional text. !! - --- - Analyze the given source code and extract the endpoint and parameter details. Strictly follow this JSON structure: - - [ - { - "url": "string / e.g., /api/v1/users", - "method": "string / e.g., GET, POST, PUT, DELETE", - "params": [ - { - "name": "string / e.g., id", - "param_type": "string / one of: query, json, form, header, cookie, path", - "value": "string / optional, default empty" - } - ] - } - ] - - - Ensure `param_type` uses only these values: `query`, `json`, `form`, `header`, `cookie`, `path`. - - If no endpoints are found in the code, respond with an empty array `[]`. - - Do not deviate from the specified JSON structure. + Analyze the provided source code to extract details about the endpoints and their parameters. + + Return the result strictly in the following JSON structure: + { + "endpoints": [ + { + "url": "string / e.g., /api/v1/users", + "method": "string / e.g., GET, POST, PUT, DELETE", + "params": [ + { + "name": "string / e.g., id", + "param_type": "string / one of: query, json, form, header, cookie, path", + "value": "string / optional, default empty" + } + ] + } + ] + } + + If no endpoints are found, return: + {"endpoints": []} + + Guidelines: + - `param_type` must strictly use one of these values: `query`, `json`, `form`, `header`, `cookie`, `path`. + - Do not include explanations, comments, or additional text. + - Provide only the JSON response as output. Input Code: - #{content} PROMPT @@ -93,7 +113,8 @@ module Analyzer::AI logger.debug_sub response response_json = JSON.parse(response.to_s) - response_json.as_a.each do |endpoint| + next unless response_json["endpoints"].as_a.size > 0 + response_json["endpoints"].as_a.each do |endpoint| url = endpoint["url"].as_s method = endpoint["method"].as_s params = endpoint["params"].as_a.map do |param| diff --git a/src/llm/ollama/ollama.cr b/src/llm/ollama/ollama.cr index 4b7e3065..ad7a193c 100644 --- a/src/llm/ollama/ollama.cr +++ b/src/llm/ollama/ollama.cr @@ -11,6 +11,7 @@ module LLM :model => @model, :prompt => prompt, :stream => false, + :format => "json", } response = Crest.post(@api, body, json: true) From 56d6fbccc6b418484d46559ffd14e898c37271fc Mon Sep 17 00:00:00 2001 From: ksg Date: Sat, 25 Jan 2025 04:18:32 +0900 Subject: [PATCH 2/6] Standardize file path handling in Ollama analyzer Signed-off-by: ksg --- src/analyzer/analyzers/llm_analyzers/ollama.cr | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/analyzer/analyzers/llm_analyzers/ollama.cr b/src/analyzer/analyzers/llm_analyzers/ollama.cr index 71142318..e044542f 100644 --- a/src/analyzer/analyzers/llm_analyzers/ollama.cr +++ b/src/analyzer/analyzers/llm_analyzers/ollama.cr @@ -49,7 +49,7 @@ module Analyzer::AI - Provide only the JSON response with no explanations or additional text. File paths: - #{all_paths.join("\n")} + #{all_paths.map { |path| "- \"#{File.expand_path(path)}\"" }.join("\n")} PROMPT filter_response = ollama.request(filter_prompt) From 6c293f7fdd9dfbe327a7134da5bbc84e4692ad4f Mon Sep 17 00:00:00 2001 From: ksg Date: Sat, 25 Jan 2025 04:38:20 +0900 Subject: [PATCH 3/6] Refactor prompt for clarity and conciseness Signed-off-by: ksg --- src/analyzer/analyzers/llm_analyzers/ollama.cr | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/src/analyzer/analyzers/llm_analyzers/ollama.cr b/src/analyzer/analyzers/llm_analyzers/ollama.cr index e044542f..74ba3740 100644 --- a/src/analyzer/analyzers/llm_analyzers/ollama.cr +++ b/src/analyzer/analyzers/llm_analyzers/ollama.cr @@ -26,7 +26,7 @@ module Analyzer::AI # Filter files that are likely to contain endpoints filter_prompt = <<-PROMPT - Analyze the provided list of file paths and identify individual files that are likely to represent endpoints, such as API endpoints, web pages, or static resources. + Analyze the provided list of file paths and identify individual files that are likely to represent endpoints, such as API endpoints, web pages or static resources. Ignore directories and focus exclusively on files. Return the result strictly in the following JSON structure: @@ -45,7 +45,7 @@ module Analyzer::AI Guidelines: - Do not include directories in the output. - - Focus on files related to endpoints (API, web pages, or static resources). + - Focus on files related to endpoints (API, web pages or static resources). - Provide only the JSON response with no explanations or additional text. File paths: @@ -100,9 +100,11 @@ module Analyzer::AI {"endpoints": []} Guidelines: - - `param_type` must strictly use one of these values: `query`, `json`, `form`, `header`, `cookie`, `path`. - - Do not include explanations, comments, or additional text. - - Provide only the JSON response as output. + - The JSON should include only the fields: "url", "method" and "params" for each endpoint. + - The "method" field should strictly use one of these values: GET, POST, PUT, DELETE. + - The "params" field should consist of "name", "param_type" and "value". + - "param_type" must strictly use one of these values: "query", "json", "form", "header", "cookie" and "path". + - Do not include explanations, comments or additional text. Input Code: #{content} From 2e72c093966f19453596e52adb38b2621c7d992d Mon Sep 17 00:00:00 2001 From: ksg Date: Sat, 25 Jan 2025 16:21:58 +0900 Subject: [PATCH 4/6] Standardize response formatting across Ollama methods --- .../analyzers/llm_analyzers/ollama.cr | 107 +++++++++++------- src/llm/ollama/ollama.cr | 22 +++- 2 files changed, 85 insertions(+), 44 deletions(-) diff --git a/src/analyzer/analyzers/llm_analyzers/ollama.cr b/src/analyzer/analyzers/llm_analyzers/ollama.cr index 74ba3740..50f0b0fe 100644 --- a/src/analyzer/analyzers/llm_analyzers/ollama.cr +++ b/src/analyzer/analyzers/llm_analyzers/ollama.cr @@ -26,33 +26,30 @@ module Analyzer::AI # Filter files that are likely to contain endpoints filter_prompt = <<-PROMPT - Analyze the provided list of file paths and identify individual files that are likely to represent endpoints, such as API endpoints, web pages or static resources. - Ignore directories and focus exclusively on files. + Analyze the following list of file paths and identify which files are likely to represent endpoints, including API endpoints, web pages, or static resources. + Exclude directories from the analysis and focus only on individual files. + Return the result as a JSON array of file paths that should be analyzed further. - Return the result strictly in the following JSON structure: - { - "files": [ - "string / e.g., /path/to/file1", - "string / e.g., /path/to/file2", - "string / e.g., /path/to/file3" - ] - } + File paths: + #{all_paths.join("\n")} + PROMPT - If no relevant files are found, return: + format = <<-FORMAT { - "files": [] + "type": "object", + "properties": { + "files": { + "type": "array", + "items": { + "type": "string" + } + } + }, + "required": ["files"] } + FORMAT - Guidelines: - - Do not include directories in the output. - - Focus on files related to endpoints (API, web pages or static resources). - - Provide only the JSON response with no explanations or additional text. - - File paths: - #{all_paths.map { |path| "- \"#{File.expand_path(path)}\"" }.join("\n")} - PROMPT - - filter_response = ollama.request(filter_prompt) + filter_response = ollama.request_with_format(filter_prompt, format) filtered_paths = JSON.parse(filter_response.to_s) logger.debug_sub filter_response @@ -79,23 +76,6 @@ module Analyzer::AI prompt = <<-PROMPT Analyze the provided source code to extract details about the endpoints and their parameters. - Return the result strictly in the following JSON structure: - { - "endpoints": [ - { - "url": "string / e.g., /api/v1/users", - "method": "string / e.g., GET, POST, PUT, DELETE", - "params": [ - { - "name": "string / e.g., id", - "param_type": "string / one of: query, json, form, header, cookie, path", - "value": "string / optional, default empty" - } - ] - } - ] - } - If no endpoints are found, return: {"endpoints": []} @@ -110,7 +90,48 @@ module Analyzer::AI #{content} PROMPT - response = ollama.request(prompt) + format = <<-FORMAT + { + "type": "object", + "properties": { + "endpoints": { + "type": "array", + "items": { + "type": "object", + "properties": { + "url": { + "type": "string" + }, + "method": { + "type": "string" + }, + "params": { + "type": "array", + "items": { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "param_type": { + "type": "string" + }, + "value": { + "type": "string" + } + }, + "required": ["name", "param_type", "value"] + } + } + }, + "required": ["url", "method", "params"] + } + } + } + } + FORMAT + + response = ollama.request_with_format(prompt, format) logger.debug "Ollama response (#{relative_path}):" logger.debug_sub response @@ -130,8 +151,8 @@ module Analyzer::AI @result << Endpoint.new(url, method, params, details) end rescue ex : Exception - puts "Error processing file: #{path}" - puts "Error: #{ex.message}" + logger.debug "Error processing file: #{path}" + logger.debug "Error: #{ex.message}" end end end @@ -145,7 +166,7 @@ module Analyzer::AI end def ignore_extensions - [".css", ".xml", ".json", ".yml", ".yaml", ".md", ".jpg", ".jpeg", ".png", ".gif", ".svg", ".ico", ".eot", ".ttf", ".woff", ".woff2", ".otf", ".mp3", ".mp4", ".avi", ".mov", ".webm", ".zip", ".tar", ".gz", ".7z", ".rar", ".pdf", ".doc", ".docx", ".xls", ".xlsx", ".ppt", ".pptx", ".txt", ".csv", ".log", ".sql", ".bak", ".swp"] + [".css", ".xml", ".json", ".yml", ".yaml", ".md", ".jpg", ".jpeg", ".png", ".gif", ".svg", ".ico", ".eot", ".ttf", ".woff", ".woff2", ".otf", ".mp3", ".mp4", ".avi", ".mov", ".webm", ".zip", ".tar", ".gz", ".7z", ".rar", ".pdf", ".doc", ".docx", ".xls", ".xlsx", ".ppt", ".pptx", ".txt", ".csv", ".log", ".sql", ".bak", ".swp", ".jar"] end end end diff --git a/src/llm/ollama/ollama.cr b/src/llm/ollama/ollama.cr index ad7a193c..c8c0788b 100644 --- a/src/llm/ollama/ollama.cr +++ b/src/llm/ollama/ollama.cr @@ -1,3 +1,5 @@ +require "json" + module LLM class Ollama def initialize(url : String, model : String) @@ -11,7 +13,25 @@ module LLM :model => @model, :prompt => prompt, :stream => false, - :format => "json", + } + + response = Crest.post(@api, body, json: true) + response_json = JSON.parse response.body + + response_json["response"] + rescue ex : Exception + puts "Error: #{ex.message}" + + "" + end + + def request_with_format(prompt : String, format : String) + body = { + :model => @model, + :prompt => prompt, + :stream => false, + :format => JSON.parse(format), + :temperature => 0.5, } response = Crest.post(@api, body, json: true) From 4069803c7cdd5c5f4c5906213ffbb3d2fb1d8039 Mon Sep 17 00:00:00 2001 From: ksg Date: Sat, 25 Jan 2025 16:40:00 +0900 Subject: [PATCH 5/6] Refactor and consolidate Ollama request handling Signed-off-by: ksg --- .../analyzers/llm_analyzers/ollama.cr | 24 +++++++++++-------- src/llm/ollama/ollama.cr | 23 +++--------------- 2 files changed, 17 insertions(+), 30 deletions(-) diff --git a/src/analyzer/analyzers/llm_analyzers/ollama.cr b/src/analyzer/analyzers/llm_analyzers/ollama.cr index 50f0b0fe..2cb3be05 100644 --- a/src/analyzer/analyzers/llm_analyzers/ollama.cr +++ b/src/analyzer/analyzers/llm_analyzers/ollama.cr @@ -27,11 +27,17 @@ module Analyzer::AI # Filter files that are likely to contain endpoints filter_prompt = <<-PROMPT Analyze the following list of file paths and identify which files are likely to represent endpoints, including API endpoints, web pages, or static resources. - Exclude directories from the analysis and focus only on individual files. - Return the result as a JSON array of file paths that should be analyzed further. - File paths: - #{all_paths.join("\n")} + If no files are found, return: + {"files": []} + + Guidelines: + - Focus only on individual files. + - Do not include directories. + - Do not include explanations, comments or additional text. + + Input Files: + #{all_paths.map { |path| File.expand_path(path) }.join("\n")} PROMPT format = <<-FORMAT @@ -49,7 +55,7 @@ module Analyzer::AI } FORMAT - filter_response = ollama.request_with_format(filter_prompt, format) + filter_response = ollama.request(filter_prompt, format) filtered_paths = JSON.parse(filter_response.to_s) logger.debug_sub filter_response @@ -80,10 +86,8 @@ module Analyzer::AI {"endpoints": []} Guidelines: - - The JSON should include only the fields: "url", "method" and "params" for each endpoint. - - The "method" field should strictly use one of these values: GET, POST, PUT, DELETE. - - The "params" field should consist of "name", "param_type" and "value". - - "param_type" must strictly use one of these values: "query", "json", "form", "header", "cookie" and "path". + - The "method" field should strictly use one of these values: "GET", "POST", "PUT", "DELETE", "PATCH", "OPTIONS", "HEAD". + - The "param_type" must strictly use one of these values: "query", "json", "form", "header", "cookie" and "path". - Do not include explanations, comments or additional text. Input Code: @@ -131,7 +135,7 @@ module Analyzer::AI } FORMAT - response = ollama.request_with_format(prompt, format) + response = ollama.request(prompt, format) logger.debug "Ollama response (#{relative_path}):" logger.debug_sub response diff --git a/src/llm/ollama/ollama.cr b/src/llm/ollama/ollama.cr index c8c0788b..9c871a4d 100644 --- a/src/llm/ollama/ollama.cr +++ b/src/llm/ollama/ollama.cr @@ -8,30 +8,13 @@ module LLM @model = model end - def request(prompt : String) + def request(prompt : String, format : String = "json") body = { :model => @model, :prompt => prompt, :stream => false, - } - - response = Crest.post(@api, body, json: true) - response_json = JSON.parse response.body - - response_json["response"] - rescue ex : Exception - puts "Error: #{ex.message}" - - "" - end - - def request_with_format(prompt : String, format : String) - body = { - :model => @model, - :prompt => prompt, - :stream => false, - :format => JSON.parse(format), - :temperature => 0.5, + :temperature => 0.3, + :format => format == "json" ? "json" : JSON.parse(format) } response = Crest.post(@api, body, json: true) From 1641e38d26c76a2fc6db3d4e67c56e4b90aec903 Mon Sep 17 00:00:00 2001 From: ksg Date: Sat, 25 Jan 2025 16:50:21 +0900 Subject: [PATCH 6/6] Improve the Ollama analyzer for validating responses. Signed-off-by: ksg --- src/analyzer/analyzers/llm_analyzers/ollama.cr | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/src/analyzer/analyzers/llm_analyzers/ollama.cr b/src/analyzer/analyzers/llm_analyzers/ollama.cr index 2cb3be05..135edd54 100644 --- a/src/analyzer/analyzers/llm_analyzers/ollama.cr +++ b/src/analyzer/analyzers/llm_analyzers/ollama.cr @@ -28,16 +28,13 @@ module Analyzer::AI filter_prompt = <<-PROMPT Analyze the following list of file paths and identify which files are likely to represent endpoints, including API endpoints, web pages, or static resources. - If no files are found, return: - {"files": []} - Guidelines: - Focus only on individual files. - Do not include directories. - Do not include explanations, comments or additional text. Input Files: - #{all_paths.map { |path| File.expand_path(path) }.join("\n")} + #{all_paths.map { |path| "- #{File.expand_path(path)}" }.join("\n")} PROMPT format = <<-FORMAT @@ -82,9 +79,6 @@ module Analyzer::AI prompt = <<-PROMPT Analyze the provided source code to extract details about the endpoints and their parameters. - If no endpoints are found, return: - {"endpoints": []} - Guidelines: - The "method" field should strictly use one of these values: "GET", "POST", "PUT", "DELETE", "PATCH", "OPTIONS", "HEAD". - The "param_type" must strictly use one of these values: "query", "json", "form", "header", "cookie" and "path". @@ -131,7 +125,8 @@ module Analyzer::AI "required": ["url", "method", "params"] } } - } + }, + "required": ["endpoints"] } FORMAT