diff --git a/gradle.properties b/gradle.properties
index bd6f7adc8..b82136041 100644
--- a/gradle.properties
+++ b/gradle.properties
@@ -1,7 +1,7 @@
 pluginName=Cognotik
 pluginRepositoryUrl=https://github.com/SimiaCryptus/Cognotik
 libraryGroup=com.simiacryptus
-libraryVersion=2.0.19
+libraryVersion=2.0.20
 gradleVersion=8.13
 org.gradle.caching=true
diff --git a/intellij/src/main/kotlin/cognotik/actions/plan/TaskConfigEditDialog.kt b/intellij/src/main/kotlin/cognotik/actions/plan/TaskConfigEditDialog.kt
index 47f56a3dc..df0390a1a 100644
--- a/intellij/src/main/kotlin/cognotik/actions/plan/TaskConfigEditDialog.kt
+++ b/intellij/src/main/kotlin/cognotik/actions/plan/TaskConfigEditDialog.kt
@@ -20,8 +20,9 @@ import com.simiacryptus.cognotik.plan.tools.SelfHealingTask
 import com.simiacryptus.cognotik.plan.tools.SubPlanningTask
 import com.simiacryptus.cognotik.plan.tools.mcp.MCPToolTask
 import com.simiacryptus.cognotik.plan.tools.online.CrawlerAgentTask
-import com.simiacryptus.cognotik.plan.tools.online.FetchMethod
-import com.simiacryptus.cognotik.plan.tools.online.SeedMethod
+import com.simiacryptus.cognotik.plan.tools.online.processing.ProcessingStrategyType
+import com.simiacryptus.cognotik.plan.tools.online.fetch.FetchMethod
+import com.simiacryptus.cognotik.plan.tools.online.seed.SeedMethod
 import java.awt.Component
 import java.awt.Dimension
 import javax.swing.*
@@ -329,8 +330,17 @@ private fun com.intellij.ui.dsl.builder.Panel.createSubPlanningFields(config: Su
-    private fun com.intellij.ui.dsl.builder.Panel.createCrawlerFields(config: CrawlerAgentTask.CrawlerTaskTypeConfig) {
+private fun com.intellij.ui.dsl.builder.Panel.createCrawlerFields(config: CrawlerAgentTask.CrawlerTaskTypeConfig) {
     group("Web Crawler Settings") {
+        row("Processing Strategy:") {
+            val strategies = ProcessingStrategyType.entries.map { it.name }.toTypedArray()
+            val combo = ComboBox(strategies)
+            combo.selectedItem = config.processing_strategy?.name ?: "DefaultSummarizer"
+            combo.toolTipText = "Strategy for processing and analyzing page content"
+            cell(combo)
+                .comment("Select how pages should be processed and analyzed")
+            configFields["processing_strategy"] = combo
+        }
         row("Seed Method:") {
             val methods = SeedMethod.entries.map { it.name }.toTypedArray()
             val combo = ComboBox(methods)
@@ -356,7 +366,7 @@ private fun com.intellij.ui.dsl.builder.Panel.createSubPlanningFields(config: Su
         }
         row("Max Pages Per Task:") {
             val field = JBTextField(config.max_pages_per_task?.toString() ?: "30")
-            field.toolTipText = "Maximum number of pages to process (1-100)"
+            field.toolTipText = "Maximum number of pages to process (1-500)"
             cell(field)
                 .comment("Limit the number of pages crawled per task")
             configFields["max_pages_per_task"] = field
@@ -493,9 +503,9 @@ private fun com.intellij.ui.dsl.builder.Panel.createSubPlanningFields(config: Su
         val maxPages = (configFields["max_pages_per_task"] as? JBTextField)?.text?.trim()
         if (!maxPages.isNullOrEmpty()) {
             val value = maxPages.toIntOrNull()
-            if (value == null || value !in 1..100) {
+            if (value == null || value !in 1..1000) {
                 Messages.showWarningDialog(
-                    "Max Pages Per Task must be between 1 and 100",
+                    "Max Pages Per Task must be between 1 and 1000",
                     "Invalid Value"
                 )
                 configFields["max_pages_per_task"]?.requestFocusInWindow()
@@ -645,11 +655,15 @@ private fun com.intellij.ui.dsl.builder.Panel.createSubPlanningFields(config: Su
             }
-            is CrawlerAgentTask.CrawlerTaskTypeConfig -> {
+is CrawlerAgentTask.CrawlerTaskTypeConfig -> {
                 CrawlerAgentTask.CrawlerTaskTypeConfig(
                     task_type = baseConfig.task_type!!,
                     name = baseConfig.name,
                     model = baseConfig.model,
+                    processing_strategy = ProcessingStrategyType.valueOf(
+                        (configFields["processing_strategy"] as? ComboBox<*>)?.selectedItem as? String
+                            ?: "DefaultSummarizer"
+                    ),
                     seed_method = SeedMethod.valueOf(
                         (configFields["seed_method"] as? ComboBox<*>)?.selectedItem as? String
                             ?: "GoogleProxy"
diff --git a/jo-penai/src/main/kotlin/com/simiacryptus/cognotik/agents/ImageModificationAgent.kt b/jo-penai/src/main/kotlin/com/simiacryptus/cognotik/agents/ImageProcessingAgent.kt
similarity index 84%
rename from jo-penai/src/main/kotlin/com/simiacryptus/cognotik/agents/ImageModificationAgent.kt
rename to jo-penai/src/main/kotlin/com/simiacryptus/cognotik/agents/ImageProcessingAgent.kt
index 8ddd8c430..9cd7f1839 100644
--- a/jo-penai/src/main/kotlin/com/simiacryptus/cognotik/agents/ImageModificationAgent.kt
+++ b/jo-penai/src/main/kotlin/com/simiacryptus/cognotik/agents/ImageProcessingAgent.kt
@@ -11,10 +11,10 @@ import java.util.*
 import javax.imageio.ImageIO
 
 /**
- * Agent that processes images using multimodal chat models.
- * Takes an input image and text prompt, and returns modified image with description.
+ * Agent that processes text/images input and generates text/images output based on the prompt.
+ * Can be used for image generation, image captioning, and image editing tasks.
 */
-open class ImageModificationAgent(
+open class ImageProcessingAgent(
    prompt: String = "Analyze and describe the image based on the user's request",
    name: String? = null,
    model: ChatInterface,
@@ -57,7 +57,7 @@ open class ImageModificationAgent(
         return ImageAndText(text = text, image = image ?: input.map { it.image }.firstOrNull())
     }
-    override fun withModel(model: ChatInterface): ImageModificationAgent = ImageModificationAgent(
+    override fun withModel(model: ChatInterface): ImageProcessingAgent = ImageProcessingAgent(
        prompt = prompt,
        name = name,
        model = model,
@@ -65,7 +65,7 @@ open class ImageModificationAgent(
    )
 
    companion object {
-        private val log = org.slf4j.LoggerFactory.getLogger(ImageModificationAgent::class.java)
+        private val log = org.slf4j.LoggerFactory.getLogger(ImageProcessingAgent::class.java)
    }
 }
diff --git a/jo-penai/src/main/kotlin/com/simiacryptus/cognotik/agents/ParsedAgent.kt b/jo-penai/src/main/kotlin/com/simiacryptus/cognotik/agents/ParsedAgent.kt
index e2cc31647..1c8720646 100644
--- a/jo-penai/src/main/kotlin/com/simiacryptus/cognotik/agents/ParsedAgent.kt
+++ b/jo-penai/src/main/kotlin/com/simiacryptus/cognotik/agents/ParsedAgent.kt
@@ -9,6 +9,7 @@ import com.simiacryptus.cognotik.util.LoggerFactory
 import com.simiacryptus.cognotik.util.MultiExeption
 import com.simiacryptus.cognotik.util.ValidatedObject
 import com.simiacryptus.cognotik.util.toContentList
+import com.simiacryptus.cognotik.util.toJson
 import java.util.function.Function
 
 open class ParsedAgent(
@@ -175,4 +176,20 @@ open class ParsedAgent(
         private val log = LoggerFactory.getLogger(ParsedAgent::class.java)
     }
-}
\ No newline at end of file
+}
+
+
+inline fun Any.parserCast(
+    model: ChatInterface, describer: TypeDescriber = object : AbbrevWhitelistYamlDescriber(
+        "com.simiacryptus", "aicoder.actions"
+    ) {
+        override val includeMethods: Boolean get() = false
+    }
+): T = ParsedAgent(
+    prompt = "",
+    resultClass = T::class.java,
+    model = model,
+    parsingChatter = model,
+    describer = describer
+).getParser().apply(this.toJson())
+
diff --git a/webui/src/main/kotlin/com/simiacryptus/cognotik/plan/cognitive/ConversationalMode.kt b/webui/src/main/kotlin/com/simiacryptus/cognotik/plan/cognitive/ConversationalMode.kt
index 6a9b954d7..d94b960ea 100644
--- a/webui/src/main/kotlin/com/simiacryptus/cognotik/plan/cognitive/ConversationalMode.kt
+++ b/webui/src/main/kotlin/com/simiacryptus/cognotik/plan/cognitive/ConversationalMode.kt
@@ -80,7 +80,7 @@ open class ConversationalMode(
             isProcessing = true
         }
-        task.echo(userMessage.renderMarkdown())
+        task.echo(userMessage.renderMarkdown)
         writeToTranscript("## User\n\n$userMessage\n\n")
         this.task.ui.pool.submit {
             try {
diff --git a/webui/src/main/kotlin/com/simiacryptus/cognotik/plan/tools/file/GenerateImageTask.kt b/webui/src/main/kotlin/com/simiacryptus/cognotik/plan/tools/file/GenerateImageTask.kt
index b1b5b5385..1aa0ba215 100644
--- a/webui/src/main/kotlin/com/simiacryptus/cognotik/plan/tools/file/GenerateImageTask.kt
+++ b/webui/src/main/kotlin/com/simiacryptus/cognotik/plan/tools/file/GenerateImageTask.kt
@@ -1,7 +1,7 @@
 package com.simiacryptus.cognotik.plan.tools.file
 
 import com.simiacryptus.cognotik.agents.ImageAndText
-import com.simiacryptus.cognotik.agents.ImageModificationAgent
+import com.simiacryptus.cognotik.agents.ImageProcessingAgent
 import com.simiacryptus.cognotik.describe.Description
 import com.simiacryptus.cognotik.plan.OrchestrationConfig
 import com.simiacryptus.cognotik.plan.TaskOrchestrator
@@ -106,7 +106,7 @@ GenerateImage - Create images using AI image generation models
         task.add(MarkdownUtil.renderMarkdown("### Generating image...", ui = task.ui))
 
         // Use the image generation agent
-        val imageAgent = ImageModificationAgent(
+        val imageAgent = ImageProcessingAgent(
            prompt = "Transform the user request into an image",
            name = "ImageGenerator",
            model = orchestrationConfig.imageChatChatter,
diff --git a/webui/src/main/kotlin/com/simiacryptus/cognotik/plan/tools/file/GeneratePresentationTask.kt b/webui/src/main/kotlin/com/simiacryptus/cognotik/plan/tools/file/GeneratePresentationTask.kt
index 6f92ea821..8ca73c9fa 100644
--- a/webui/src/main/kotlin/com/simiacryptus/cognotik/plan/tools/file/GeneratePresentationTask.kt
+++ b/webui/src/main/kotlin/com/simiacryptus/cognotik/plan/tools/file/GeneratePresentationTask.kt
@@ -2,7 +2,7 @@ package com.simiacryptus.cognotik.plan.tools.file
 
 import com.simiacryptus.cognotik.agents.ChatAgent
 import com.simiacryptus.cognotik.agents.ImageAndText
-import com.simiacryptus.cognotik.agents.ImageModificationAgent
+import com.simiacryptus.cognotik.agents.ImageProcessingAgent
 import com.simiacryptus.cognotik.describe.Description
 import com.simiacryptus.cognotik.plan.OrchestrationConfig
 import com.simiacryptus.cognotik.plan.TaskOrchestrator
@@ -456,7 +456,7 @@ Provide only the CSS code within a code block:
                 ui = task.ui
             )
         )
-        val imageAgent = ImageModificationAgent(
+        val imageAgent = ImageProcessingAgent(
            prompt = "Create a professional, visually appealing image for a presentation slide",
            model = orchestrationConfig.imageChatChatter,
            temperature = 0.7,
diff --git a/webui/src/main/kotlin/com/simiacryptus/cognotik/plan/tools/file/WriteHtmlTask.kt b/webui/src/main/kotlin/com/simiacryptus/cognotik/plan/tools/file/WriteHtmlTask.kt
index 266f31de4..02a2e60b2 100644
--- a/webui/src/main/kotlin/com/simiacryptus/cognotik/plan/tools/file/WriteHtmlTask.kt
+++ b/webui/src/main/kotlin/com/simiacryptus/cognotik/plan/tools/file/WriteHtmlTask.kt
@@ -2,7 +2,7 @@ package com.simiacryptus.cognotik.plan.tools.file
 
 import com.simiacryptus.cognotik.agents.ChatAgent
 import com.simiacryptus.cognotik.agents.ImageAndText
-import com.simiacryptus.cognotik.agents.ImageModificationAgent
+import com.simiacryptus.cognotik.agents.ImageProcessingAgent
 import com.simiacryptus.cognotik.apps.general.renderMarkdown
 import com.simiacryptus.cognotik.describe.Description
 import com.simiacryptus.cognotik.plan.OrchestrationConfig
@@ -240,7 +240,7 @@ DESCRIPTION: another detailed description
         val filename = filename
         try {
             newTask.add(MarkdownUtil.renderMarkdown("Generating image: `$filename`...", ui = ui))
-            val imageAgent = ImageModificationAgent(
+            val imageAgent = ImageProcessingAgent(
                prompt = "Create a high-quality image for a web page based on the description",
                model = orchestrationConfig.imageChatChatter,
                temperature = 0.7,
diff --git a/webui/src/main/kotlin/com/simiacryptus/cognotik/plan/tools/online/CrawlerAgentTask.kt b/webui/src/main/kotlin/com/simiacryptus/cognotik/plan/tools/online/CrawlerAgentTask.kt
index b00b21f61..eaa549ab1 100644
--- a/webui/src/main/kotlin/com/simiacryptus/cognotik/plan/tools/online/CrawlerAgentTask.kt
+++ b/webui/src/main/kotlin/com/simiacryptus/cognotik/plan/tools/online/CrawlerAgentTask.kt
@@ -3,12 +3,18 @@ package com.simiacryptus.cognotik.plan.tools.online
 import com.fasterxml.jackson.core.JsonProcessingException
 import com.fasterxml.jackson.databind.ObjectMapper
 import com.simiacryptus.cognotik.agents.ChatAgent
-import com.simiacryptus.cognotik.agents.ParsedAgent
-import com.simiacryptus.cognotik.agents.ParsedResponse
+import com.simiacryptus.cognotik.agents.CodeAgent.Companion.indent
 import com.simiacryptus.cognotik.apps.general.renderMarkdown
+import com.simiacryptus.cognotik.chat.model.ChatInterface
 import com.simiacryptus.cognotik.describe.Description
-import com.simiacryptus.cognotik.describe.TypeDescriber
 import com.simiacryptus.cognotik.plan.*
+import com.simiacryptus.cognotik.plan.tools.online.fetch.FetchMethod
+import com.simiacryptus.cognotik.plan.tools.online.fetch.FetchStrategy
+import com.simiacryptus.cognotik.plan.tools.online.processing.PageProcessingStrategy
+import com.simiacryptus.cognotik.plan.tools.online.processing.PageProcessingStrategy.PageProcessingResult
+import com.simiacryptus.cognotik.plan.tools.online.processing.PageProcessingStrategy.ProcessingContext
+import com.simiacryptus.cognotik.plan.tools.online.processing.ProcessingStrategyType
+import com.simiacryptus.cognotik.plan.tools.online.seed.SeedMethod
 import com.simiacryptus.cognotik.platform.model.ApiChatModel
 import com.simiacryptus.cognotik.util.*
 import com.simiacryptus.cognotik.webui.session.SessionTask
@@ -32,13 +38,13 @@ class CrawlerAgentTask(
     orchestrationConfig: OrchestrationConfig,
     planTask: CrawlerTaskExecutionConfigData?,
 ) : AbstractTask(
-    orchestrationConfig,
-    planTask
+    orchestrationConfig, planTask
 ) {
     class CrawlerTaskTypeConfig(
         @Description("Method to seed the crawler (optional)") val seed_method: SeedMethod? = SeedMethod.GoogleProxy,
         @Description("Method used to fetch content from URLs (optional)") val fetch_method: FetchMethod? = FetchMethod.HttpClient,
+        @Description("Strategy for processing pages (optional)") val processing_strategy: ProcessingStrategyType? = ProcessingStrategyType.DefaultSummarizer,
         @Description("Whitespace-separated list of allowed domains/URL prefixes to restrict crawling (optional)") val allowed_domains: String? = null,
         @Description("Respect robots.txt rules when crawling (default: true)") val respect_robots_txt: Boolean? = true,
         @Description("Maximum number of pages to process in a single task") val max_pages_per_task: Int? = null,
@@ -82,20 +88,18 @@ class CrawlerAgentTask(
         @Description("The search query to use for Google search") val search_query: String? = null,
         @Description("Direct URLs to analyze (comma-separated)") val direct_urls: List? = null,
         @Description("The query considered when processing the content - this should contain a detailed listing of the desired data, evaluation criteria, and filtering priorities used to transform the page into the desired summary") val content_queries: Any? = null,
-        @Description("Whitespace-separated list of allowed domains/URL prefixes to restrict crawling (optional)") val allowed_domains: String? = null,
+        //@Description("Whitespace-separated list of allowed domains/URL prefixes to restrict crawling (optional)") val allowed_domains: String? = null,
         task_description: String? = null,
         task_dependencies: List? = null,
         state: TaskState? = null,
     ) : TaskExecutionConfig(
-        task_type = CrawlerAgent.name,
-        task_description = task_description,
-        task_dependencies = task_dependencies?.toMutableList(),
-        state = state
+        task_type = CrawlerAgent.name, task_description = task_description, task_dependencies = task_dependencies?.toMutableList(), state = state
     ), ValidatedObject {
         override fun validate(): String? {
             if (search_query.isNullOrBlank() && direct_urls.isNullOrEmpty()) {
                 return "Either search_query or direct_urls must be provided"
             }
+
             if (!direct_urls.isNullOrEmpty()) {
                 direct_urls.forEach { url ->
                     if (!url.matches(Regex("^(http|https)://.*"))) {
@@ -103,12 +107,12 @@ class CrawlerAgentTask(
                 }
             }
-            if (!allowed_domains.isNullOrBlank()) {
-                val domains = allowed_domains.split(Regex("\\s+")).filter { it.isNotBlank() }
-                if (domains.isEmpty()) {
-                    return "allowed_domains must contain at least one valid domain when specified"
-                }
-            }
+//            if (!allowed_domains.isNullOrBlank()) {
+//                val domains = allowed_domains.split(Regex("\\s+")).filter { it.isNotBlank() }
+//                if (domains.isEmpty()) {
+//                    return "allowed_domains must contain at least one valid domain when specified"
+//                }
+//            }
             return ValidatedObject.validateFields(this)
         }
     }
@@ -121,18 +125,39 @@ class CrawlerAgentTask(
     // Use a priority queue that sorts by calculated priority (higher first)
     private val pageQueue = java.util.PriorityQueue(
-        compareByDescending { it.calculatePriority() }
-    )
+        compareByDescending { it.calculatePriority() })
     private val seenUrls = ConcurrentHashMap.newKeySet()
 
-    override fun promptSegment() = """
-        CrawlerAgent - Search Google, fetch top results, and analyze content
-        ** Specify the search query
-        ** Or provide direct URLs to analyze
-        ** Specify a detailed query/analysis prompt to guide content processing
-        ** Results will be saved to .websearch directory for future reference
-        ** Links found in analysis can be automatically followed for deeper research
-    """.trimIndent()
+    override fun promptSegment(): String {
+        val str = buildString {
+            appendLine("CrawlerAgent - Search Google, fetch top results, and analyze content")
+            appendLine("** Specify the search query")
+            appendLine("** Or provide direct URLs to analyze")
+            appendLine("** Specify a detailed query/analysis prompt to guide content processing")
+            appendLine("** Choose a processing strategy: DefaultSummarizer, FactChecking, or JobMatching")
+            appendLine("** Results will be saved to .websearch directory for future reference")
+            appendLine("** Links found in analysis can be automatically followed for deeper research")
+            val typeConfig = this@CrawlerAgentTask.typeConfig
+            if (null != typeConfig) {
+                when (typeConfig.processing_strategy) {
+                    ProcessingStrategyType.DefaultSummarizer -> {
+                        // No additional notes for DefaultSummarizer
+                    }
+
+                    else -> {
+                        appendLine(
+                            "** Using processing strategy: ${typeConfig.processing_strategy?.name} - ${
+                                typeConfig.processing_strategy?.createStrategy()?.description?.indent(
+                                    " "
+                                )
+                            }"
+                        )
+                    }
+                }
+            }
+        }
+        return str
+    }
 
     fun cleanup() {
         try {
@@ -152,12 +177,9 @@ class CrawlerAgentTask(
     }
 
     data class LinkData(
-        @Description("The URL of the link to crawl")
-        val url: String? = null,
-        @Description("The title of the link (optional)")
-        val title: String? = null,
-        @Description("Tags associated with the link (optional)")
-        val tags: List? = null,
+        @Description("The URL of the link to crawl") var url: String? = null,
+        @Description("The title of the link (optional)") val title: String? = null,
+        @Description("Tags associated with the link (optional)") val tags: List? = null,
        @Description("1-100") val relevance_score: Double = 100.0
    ) : ValidatedObject {
        var started: Boolean = false
@@ -172,8 +194,8 @@ class CrawlerAgentTask(
            if (url.isNullOrBlank()) {
                return "link cannot be null or blank"
            }
-            if (!url.matches(Regex("^(http|https)://.*"))) {
-                return "link must be a valid HTTP/HTTPS URL: $url"
+            if (false == url?.matches(Regex("^(http|https)://.*"))) {
+                url = "https://$url"
            }
            if (relevance_score < 1.0 || relevance_score > 100.0) {
                return "relevance_score must be between 1 and 100"
@@ -183,9 +205,7 @@ class CrawlerAgentTask(
    }
 
    enum class PageType {
-        Error,
-        Irrelevant,
-        OK
+        Error, Irrelevant, OK
    }
 
    data class ParsedPage(
@@ -206,33 +226,52 @@ class CrawlerAgentTask(
    }
 
    override fun run(
-        agent: TaskOrchestrator,
-        messages: List,
-        task: SessionTask,
-        resultFn: (String) -> Unit,
-        orchestrationConfig: OrchestrationConfig
+        agent: TaskOrchestrator, messages: List, task: SessionTask, resultFn: (String) -> Unit, orchestrationConfig: OrchestrationConfig
    ) {
        log.info("Starting CrawlerAgentTask.run() with messages count: ${messages.size}")
+        var transcriptStream: FileOutputStream? = null
        try {
-            resultFn(innerRun(agent, task, orchestrationConfig))
+            transcriptStream = if (typeConfig?.generate_transcript != false) {
+                initializeTranscript(task)
+            } else null
+            val chatInterface = (
+                typeConfig?.model?.let { this@CrawlerAgentTask.orchestrationConfig.instance(it) }
+                    ?: this@CrawlerAgentTask.orchestrationConfig.parsingChatter
+                ).getChildClient(task)
+            resultFn(innerRun(agent, messages, task, orchestrationConfig, transcriptStream, chatInterface))
        } catch (e: Throwable) {
            log.error("Unhandled exception in CrawlerAgentTask", e)
            val errorMessage = "Error: ${e.message ?: "Unknown error occurred"}"
            resultFn(errorMessage)
            task.error(e)
        } finally {
+            transcriptStream?.let { stream ->
+                try {
+                    stream.close()
+                    log.debug("Transcript stream closed in run() finally block")
+                } catch (e: Exception) {
+                    log.error("Failed to close transcript stream", e)
+                }
+            }
            cleanup()
        }
    }
 
    private fun innerRun(
        agent: TaskOrchestrator,
+        messages: List,
        task: SessionTask,
-        orchestrationConfig: OrchestrationConfig
+        orchestrationConfig: OrchestrationConfig,
+        transcriptStream: FileOutputStream?,
+        chatInterface: ChatInterface
    ): String {
-        var transcriptStream: FileOutputStream? = null
        try {
-            val typeConfig = typeConfig ?: throw RuntimeException()
+            val typeConfig = typeConfig ?: throw RuntimeException("Missing type config")
+            // Initialize processing strategy
+            val strategyType = typeConfig.processing_strategy ?: ProcessingStrategyType.DefaultSummarizer
+            val processingStrategy = strategyType.createStrategy()
+            log.info("Using processing strategy: ${strategyType.name} - ${processingStrategy.javaClass.simpleName}")
+
            val startTime = System.currentTimeMillis()
            log.info(
                "Starting CrawlerAgentTask with config: search_query='${executionConfig?.search_query}', direct_urls='${
@@ -250,17 +289,16 @@ class CrawlerAgentTask(
                log.debug("Created websearch directory: ${webSearchDir.absolutePath}")
            }
            val tabs = TabbedDisplay(task)
-            // Initialize transcript if enabled
-            if (typeConfig.generate_transcript != false) {
-                transcriptStream = initializeTranscript(task)
-                transcriptStream?.let { stream ->
-                    writeTranscriptHeader(stream, startTime)
-                }
+            val crawlTask = task.linkedTask("Crawl Details")
+            val crawlTabs = TabbedDisplay(crawlTask)
+            task.update()
+            transcriptStream?.let { stream ->
+                writeTranscriptHeader(stream)
            }
            val seedMethod = when {
                !executionConfig?.direct_urls.isNullOrEmpty() -> SeedMethod.DirectUrls
-                typeConfig.seed_method != null -> typeConfig.seed_method!!
+                typeConfig.seed_method != null -> typeConfig.seed_method
                !executionConfig?.search_query.isNullOrBlank() -> SeedMethod.GoogleProxy
                else -> {
                    log.error("No seed method specified and no search query or direct URLs provided")
@@ -280,8 +318,8 @@ class CrawlerAgentTask(
                return "Warning: No seed items found to start crawling"
            }
            // Create seed links tab
-            val seedLinksTask = task.ui.newTask(false)
-            tabs["Seed Links"] = seedLinksTask.placeholder
+            val seedLinksTask = crawlTask.ui.newTask(false)
+            crawlTabs["Seed Links"] = seedLinksTask.placeholder
            val seedLinksContent = buildString {
                appendLine("# Seed Links")
                appendLine()
@@ -292,7 +330,7 @@ class CrawlerAgentTask(
                appendLine("---")
                appendLine()
                seedItems.forEachIndexed { index, item ->
-                    appendLine("## ${index + 1}. [${item.title ?: "Untitled"}](${item.link})")
+                    appendLine("## ${index + 1}. [${item.title}](${item.link})")
                    appendLine()
                    appendLine("- **URL:** ${item.link}")
                    appendLine("- **Relevance Score:** ${item.relevance_score}")
@@ -303,7 +341,7 @@ class CrawlerAgentTask(
                }
            }
            seedLinksTask.add(seedLinksContent.renderMarkdown)
-            task.update()
+            crawlTask.update()
            // Log seed links to transcript
            transcriptStream?.let { stream ->
                writeToTranscript(stream, "## Seed Links\n\n$seedLinksContent\n\n")
@@ -312,25 +350,20 @@ class CrawlerAgentTask(
 
            synchronized(pageQueueLock) {
                seedItems.forEach { item ->
-                    if (item.link != null && isBlacklistedDomain(item.link)) {
+                    if (isBlacklistedDomain(item.link)) {
                        log.info("Skipping blacklisted seed URL: ${item.link}")
                        return@forEach
                    }
-                    if (typeConfig.respect_robots_txt == true && !robotsTxtParser.isAllowed(item.link ?: "")) {
+                    if (typeConfig.respect_robots_txt == true && !robotsTxtParser.isAllowed(item.link)) {
                        log.info("Skipping seed URL disallowed by robots.txt: ${item.link}")
                        return@forEach
                    }
                    LinkData(
-                        url = item.link,
-                        title = item.title,
-                        tags = item.tags,
-                        relevance_score = item.relevance_score
+                        url = item.link, title = item.title, tags = item.tags, relevance_score = item.relevance_score
                    ).let { linkData ->
                        log.debug("Adding seed item to page queue: {}", linkData)
                        if (!addToQueue(
-                                linkData,
-                                typeConfig.max_depth ?: 3,
-                                typeConfig.max_queue_size ?: 100
+                                linkData, typeConfig.max_depth ?: 3, typeConfig.max_queue_size ?: 100
                            )
                        ) {
                            log.warn("No valid seed items found after processing")
@@ -349,6 +382,21 @@ class CrawlerAgentTask(
            val concurrentProcessing = /*taskConfig?.concurrent_page_processing ?:*/
                typeConfig.concurrent_page_processing ?: 3
            log.info("Processing configuration: maxPages=$maxPages, concurrentProcessing=$concurrentProcessing")
+// Create processing context
+            val processingContext = ProcessingContext(
+                executionConfig = executionConfig ?: throw RuntimeException("Missing execution config"),
+                typeConfig = typeConfig,
+                orchestrationConfig = orchestrationConfig,
+                messages = messages,
+                task = crawlTask,
+                webSearchDir = webSearchDir,
+                processedCount = AtomicInteger(0),
+                maxPages = maxPages,
+                transcriptStream = transcriptStream
+            )
+            // Track all page results for strategy
+            val allPageResults = ConcurrentHashMap()
+
            val completionService: CompletionService = ExecutorCompletionService(agent.pool)
 
            val activeTasks = ConcurrentHashMap.newKeySet()
@@ -356,8 +404,7 @@ class CrawlerAgentTask(
            val errorCount = AtomicInteger(0)
            val maxErrors = maxPages / 2 // Stop if too many errors
            log.info("Starting crawling loop with maxErrors threshold: $maxErrors")
-            val fetchStrategy = (this@CrawlerAgentTask.typeConfig?.fetch_method
-                ?: FetchMethod.HttpClient).createStrategy(
+            val fetchStrategy = (this@CrawlerAgentTask.typeConfig?.fetch_method ?: FetchMethod.HttpClient).createStrategy(
                this@CrawlerAgentTask
            )
@@ -376,8 +423,7 @@ class CrawlerAgentTask(
                    "queue_size=${pageQueue.size}, seen=${seenUrls.size}, active=${activeTasks.size}"
                }
                // Queue new tasks while we have capacity and unstarted pages
-                while (
-                    activeTasks.size < concurrentProcessing && // Limit concurrent tasks
+                while (activeTasks.size < concurrentProcessing && // Limit concurrent tasks
                    synchronized(pageQueueLock) { pageQueue.isNotEmpty() } && // There are still unstarted pages
                    errorCount.get() < maxErrors && // Not too many errors
                    processedCount.get() < maxPages // Haven't hit max pages yet
@@ -388,8 +434,7 @@ class CrawlerAgentTask(
                        activeTasks = activeTasks,
                        errorCount = errorCount,
                        maxErrors = maxErrors,
-                        task = task,
-                        tabs = tabs,
+                        tabs = crawlTabs,
                        processedCount = processedCount,
                        maxPages = maxPages,
                        maxDepth = maxDepthConfig,
@@ -397,9 +442,11 @@ class CrawlerAgentTask(
                        webSearchDir = webSearchDir,
                        agent = agent,
                        fetchStrategy = fetchStrategy,
-                        orchestrationConfig = orchestrationConfig,
                        analysisResultsMap = analysisResultsMap,
-                        transcriptStream = transcriptStream
+                        transcriptStream = transcriptStream,
+                        processingStrategy = processingStrategy,
+                        processingContext = processingContext,
+                        allPageResults = allPageResults
                    )
                }
@@ -426,7 +473,15 @@ class CrawlerAgentTask(
                }
                log.info("Crawling progress: processed=${processedCount.get()}/$maxPages, queue=${pageQueue.size}, active_tasks=${activeTasks.size}, errors=${errorCount.get()}/$maxErrors")
-                //while (activeTasks.isNotEmpty()) sleep(1000)
+
+                // Check if strategy wants to terminate early
+                val continuationDecision = processingStrategy.shouldContinueCrawling(
+                    allPageResults.values.toList(), processingContext
+                )
+                if (!continuationDecision.shouldContinue) {
+                    log.info("Strategy requested early termination: ${continuationDecision.reason}")
+                    break
+                }
            }
            if (loopIterations.get() >= 1000) {
                log.warn("Reached maximum iteration limit: ${1000}")
@@ -439,10 +494,17 @@ class CrawlerAgentTask(
            }
            val totalTime = System.currentTimeMillis() - startTime
            log.info("CrawlerAgentTask completed: total_time=${totalTime}ms, pages_processed=${processedCount.get()}, errors=${errorCount.get()}, success_rate=${if (processedCount.get() > 0) ((processedCount.get() - errorCount.get()) * 100 / processedCount.get()) else 0}%")
+            // Add page queue details tab
+            addPageQueueDetailsTab(crawlTabs, crawlTask, processedCount.get(), errorCount.get())
+
            task.complete("Completed in ${totalTime / 1000} seconds, processed ${processedCount.get()} pages with ${errorCount.get()} errors.")
            // Write completion stats to transcript
            transcriptStream?.let { stream ->
-                writeTranscriptFooter(stream, totalTime, processedCount.get(), errorCount.get())
+                try {
+                    writeTranscriptFooter(stream, totalTime, processedCount.get(), errorCount.get())
+                } catch (e: Exception) {
+                    log.error("Failed to write transcript footer", e)
+                }
            }
            val analysisResults = (1..processedCount.get()).asSequence().mapNotNull {
@@ -457,31 +519,32 @@ class CrawlerAgentTask(
            val summaryTask = task.ui.newTask(false)
            tabs["Final Summary"] = summaryTask.placeholder
-            val finalOutput =
-                if (typeConfig.create_final_summary != false && analysisResults.length > typeConfig.max_final_output_size ?: 15000) {
-                    log.info("Creating final summary: original_size=${analysisResults.length}, max_size=${typeConfig.max_final_output_size ?: 15000}")
-                    try {
-                        createFinalSummary(analysisResults, summaryTask)
-                    } catch (e: Exception) {
-                        log.error("Failed to create final summary, using truncated results", e)
-                        analysisResults.substring(
-                            0, minOf(
-                                analysisResults.length,
-                                typeConfig.max_final_output_size ?: 15000
-                            )
-                        ) +
-                            "\n\n---\n\n*Note: Summary generation failed, showing truncated results*"
-                    }
+
+            // Use strategy to generate final output
+            val finalOutput = try {
+                log.info("Generating final output using strategy: ${strategyType.name}")
+                processingStrategy.generateFinalOutput(
+                    allPageResults.values.toList(), processingContext
+                )
+            } catch (e: Exception) {
+                log.error("Failed to generate final output using strategy, falling back to basic summary", e)
+                if (typeConfig.create_final_summary != false && analysisResults.length > (typeConfig.max_final_output_size ?: 15000)) {
+                    createFinalSummary(analysisResults, chatInterface)
                } else {
-                    log.info("Using analysis results directly: size=${analysisResults.length}")
                    analysisResults
                }
+            }
+
            try {
                summaryTask.add(finalOutput.renderMarkdown)
                task.update()
                // Write final summary to transcript
                transcriptStream?.let { stream ->
-                    writeToTranscript(stream, "\n\n## Final Summary\n\n$finalOutput\n\n")
+                    try {
+                        writeToTranscript(stream, "\n\n## Final Summary\n\n$finalOutput\n\n")
+                    } catch (e: Exception) {
+                        log.error("Failed to write final summary to transcript", e)
+                    }
                }
            } catch (e: Exception) {
                log.error("Failed to update task with final summary", e)
@@ -492,9 +555,6 @@ class CrawlerAgentTask(
            log.error("Unhandled exception in CrawlerAgentTask", e)
            task.error(e)
            return "Error: ${e.javaClass.simpleName} - ${e.message ?: "Unknown error"}"
-        } finally {
-            transcriptStream?.close()
-            log.debug("Transcript stream closed")
        }
    }
@@ -503,9 +563,11 @@ class CrawlerAgentTask(
            val (link, file) = task.createFile("crawler_transcript.md")
            val transcriptStream = file?.outputStream()
            task.complete(
-                "Writing transcript to $link " +
-                    "html " +
-                    "pdf"
+                "Writing transcript to $link " + "html " + "pdf"
            )
            log.info("Initialized transcript file: $link")
            transcriptStream
@@ -515,7 +577,7 @@ class CrawlerAgentTask(
        }
    }
 
-    private fun writeTranscriptHeader(stream: FileOutputStream, startTime: Long) {
+    private fun writeTranscriptHeader(stream: FileOutputStream) {
        try {
            val header = buildString {
                appendLine("# Crawler Agent Transcript")
@@ -532,7 +594,9 @@ class CrawlerAgentTask(
            stream.write(header.toByteArray(StandardCharsets.UTF_8))
            stream.flush()
        } catch (e: Exception) {
-            log.error("Failed to write transcript header", e)
+            if (e !is java.io.IOException || e.message?.contains("closed") != true) {
+                log.error("Failed to write transcript header", e)
+            }
        }
    }
@@ -541,7 +605,9 @@ class CrawlerAgentTask(
            stream.write(content.toByteArray(StandardCharsets.UTF_8))
            stream.flush()
        } catch (e: Exception) {
-            log.error("Failed to write to transcript", e)
+            if (e !is java.io.IOException || e.message?.contains("closed") != true) {
+                log.error("Failed to write to transcript", e)
+            }
        }
    }
@@ -563,23 +629,24 @@ class CrawlerAgentTask(
            stream.write(footer.toByteArray(StandardCharsets.UTF_8))
            stream.flush()
        } catch (e: Exception) {
-            log.error("Failed to write transcript footer", e)
+            if (e !is java.io.IOException || e.message?.contains("closed") != true) {
+                log.error("Failed to write transcript footer", e)
+            }
        }
    }
 
    fun addToQueue(
-        newLink: LinkData,
-        maxDepth: Int,
-        maxQueueSize: Int
+        newLink: LinkData, maxDepth: Int, maxQueueSize: Int
    ): Boolean = synchronized(pageQueueLock) {
-        val typeConfig = typeConfig ?: throw RuntimeException()
-        if (newLink.url.isNullOrBlank()) {
+        val typeConfig = typeConfig ?: throw RuntimeException("Missing type config")
+        val newUrl = newLink.url
+        if (newUrl.isNullOrBlank()) {
            log.warn("Attempted to add invalid or empty URL to queue: $newLink")
            return false
        }
-        if (typeConfig.respect_robots_txt == true && !robotsTxtParser.isAllowed(newLink.url)) {
-            log.debug("Skipping URL disallowed by robots.txt: ${newLink.url}")
+        if (typeConfig.respect_robots_txt == true && !robotsTxtParser.isAllowed(newUrl)) {
+            log.debug("Skipping URL disallowed by robots.txt: $newUrl")
            return false
        }
        if (pageQueue.size >= maxQueueSize) {
@@ -587,16 +654,16 @@ class CrawlerAgentTask(
            return false
        }
        if (newLink.depth > maxDepth) {
-            log.debug("Skipping link due to depth limit (depth=${newLink.depth} > maxDepth=$maxDepth): ${newLink.url}")
+            log.debug("Skipping link due to depth limit (depth=${newLink.depth} > maxDepth=$maxDepth): $newUrl")
            return false
        }
-        if (seenUrls.contains(newLink.url)) {
-            log.debug("Skipping duplicate link already in queue: ${newLink.url}")
+        if (seenUrls.contains(newUrl)) {
+            log.debug("Skipping duplicate link already in queue: $newUrl")
            return false
        }
-        seenUrls.add(newLink.url)
+        seenUrls.add(newUrl)
        pageQueue.add(newLink)
-        log.debug("Added new link to queue: ${newLink.url} (depth=${newLink.depth}, priority=${newLink.calculatePriority()})")
+        log.debug("Added new link to queue: $newUrl (depth=${newLink.depth}, priority=${newLink.calculatePriority()})")
        true
    }
@@ -611,11 +678,7 @@ class CrawlerAgentTask(
    }
 
    private fun shouldContinue(
-        maxPages: Int,
-        errorCount: AtomicInteger,
-        maxErrors: Int,
-        loopIterations: AtomicInteger,
-        activeTasks: MutableSet
+        maxPages: Int, errorCount: AtomicInteger, maxErrors: Int, loopIterations: AtomicInteger, activeTasks: MutableSet
    ): Boolean = synchronized(pageQueueLock) {
        val completed = seenUrls.size - pageQueue.size - activeTasks.size
        val unstarted = pageQueue.size
@@ -625,10 +688,7 @@ class CrawlerAgentTask(
        // 1. We have active tasks (they might add more links), OR
        // 2. We have unstarted pages in the queue
        // AND we haven't hit our limits
-        val shouldContinue = (hasActiveTasks || unstarted > 0) &&
-            completed < maxPages &&
-            errorCount.get() < maxErrors &&
-            loopIterations.getAndIncrement() < 1000
+        val shouldContinue = (hasActiveTasks || unstarted > 0) && completed < maxPages && errorCount.get() < maxErrors && loopIterations.getAndIncrement() < 1000
        if (!shouldContinue) {
            log.info("Stopping crawl: completed=$completed/$maxPages, unstarted=$unstarted, active=$hasActiveTasks, errors=${errorCount.get()}/$maxErrors")
@@ -643,7 +703,6 @@ class CrawlerAgentTask(
        activeTasks: MutableSet,
        errorCount: AtomicInteger,
        maxErrors: Int,
-        task: SessionTask,
        tabs: TabbedDisplay,
        processedCount: AtomicInteger,
        maxPages: Int,
@@ -652,13 +711,16 @@ class CrawlerAgentTask(
        webSearchDir: File,
        agent: TaskOrchestrator,
        fetchStrategy: FetchStrategy,
-        orchestrationConfig: OrchestrationConfig,
        analysisResultsMap: ConcurrentHashMap,
-        transcriptStream: FileOutputStream?
+ transcriptStream: FileOutputStream?, + processingStrategy: PageProcessingStrategy, + processingContext: ProcessingContext, + allPageResults: ConcurrentHashMap ): Boolean { log.info("Status before queuing next page: $queueStats, active_tasks=${activeTasks.size}, errors=${errorCount.get()}/$maxErrors") val page = getNextPage() ?: return true - if (page.url.isNullOrBlank()) { + val pageUrl = page.url + if (pageUrl.isNullOrBlank()) { log.error("Invalid page link encountered: $page") errorCount.incrementAndGet() page.completed = true @@ -666,17 +728,17 @@ class CrawlerAgentTask( page.error = "Invalid or empty URL" return false } - activeTasks.add(page.url) + activeTasks.add(pageUrl) - log.info("Queuing page for processing: url='${page.url}', title='${page.title}', depth=${page.depth}, relevance=${page.relevance_score}") + log.info("Queuing page for processing: url='$pageUrl', title='${page.title}', depth=${page.depth}, relevance=${page.relevance_score}") val subTask = try { - task.ui.newTask(false).apply { - tabs[page.url] = placeholder - task.update() + tabs.task.ui.newTask(false).apply { + tabs[pageUrl] = placeholder + tabs.task.update() } } catch (e: Exception) { - log.error("Failed to create subtask for URL: ${page.url}", e) + log.error("Failed to create subtask for URL: $pageUrl", e) errorCount.incrementAndGet() page.completed = true page.completed = true @@ -688,7 +750,7 @@ class CrawlerAgentTask( try { crawlPage( processedCount, - page.url, + pageUrl, page, maxPages, maxDepth, @@ -696,20 +758,24 @@ class CrawlerAgentTask( webSearchDir, agent, fetchStrategy, - orchestrationConfig, errorCount, subTask, analysisResultsMap, - transcriptStream + transcriptStream, + processingStrategy, + processingContext.copy( + task = subTask + ), + allPageResults ) } catch (e: Exception) { - log.error("Uncaught exception in page processing task for: ${page.url}", e) + log.error("Uncaught exception in page processing task for: $pageUrl", e) errorCount.incrementAndGet() page.completed = true page.completed = true page.error = "Uncaught exception: ${e.message}" } finally { - activeTasks.remove(page.url) + activeTasks.remove(pageUrl) } }) return false @@ -725,16 +791,21 @@ class CrawlerAgentTask( webSearchDir: File, agent: TaskOrchestrator, fetchStrategy: FetchStrategy, - orchestrationConfig: OrchestrationConfig, errorCount: AtomicInteger, task: SessionTask, analysisResultsMap: ConcurrentHashMap, - transcriptStream: FileOutputStream? + transcriptStream: FileOutputStream?, + processingStrategy: PageProcessingStrategy, + processingContext: ProcessingContext, + allPageResults: ConcurrentHashMap ) { - val typeConfig = typeConfig ?: throw RuntimeException() + val typeConfig = typeConfig ?: throw RuntimeException("Missing type config") val pageStartTime = System.currentTimeMillis() log.info("Starting to process page ${processedCount.get() + 1}: url='${link}', title='${page.title}'") val currentIndex = processedCount.incrementAndGet() + // Update processing context with current count + processingContext.processedCount.set(currentIndex) + // Apply crawl delay if robots.txt specifies one if (typeConfig.respect_robots_txt == true) { robotsTxtParser.getCrawlDelay(link)?.let { delay -> @@ -749,182 +820,188 @@ class CrawlerAgentTask( try { val url = link val title = page.title - val processPageResult = - buildString { - this.appendLine("## ${currentIndex}. 
[${title}]($url)") - this.appendLine() - try { - // Log page processing start to transcript - transcriptStream?.let { stream -> + val processPageResult = buildString { + this.appendLine("## ${currentIndex}. [${title}]($url)") + this.appendLine() + try { + // Log page processing start to transcript + transcriptStream?.let { stream -> + try { writeToTranscript(stream, "### Processing Page ${currentIndex}: [$title]($url)\n\n") writeToTranscript(stream, "**Started:** ${LocalDateTime.now().format(DateTimeFormatter.ofPattern("HH:mm:ss"))}\n\n") + } catch (e: Exception) { + log.debug("Failed to write page start to transcript (stream may be closed)", e) } + } - val content = fetchAndProcessUrl( - url, - webSearchDir = webSearchDir, - index = currentIndex, - pool = agent.pool, - fetchStrategy = fetchStrategy + val content = fetchAndProcessUrl( + url, webSearchDir = webSearchDir, index = currentIndex, pool = agent.pool, fetchStrategy = fetchStrategy + ) + log.debug("Fetched content for '$url': ${content.length} characters") + if (content.length < (typeConfig.min_content_length ?: 500)) { + log.info("Content too short for '$url': ${content.length} < ${typeConfig.min_content_length ?: 500} chars, skipping") + this.appendLine("*Content too short (${content.length} chars), skipping this result*") + this.appendLine() + // Record as irrelevant for strategy + val pageResult = PageProcessingResult( + url = url, + pageType = PageType.Irrelevant, + content = "*Content too short*", + extractedLinks = null, + metadata = mapOf("content_length" to content.length) ) - log.debug("Fetched content for '$url': ${content.length} characters") - if (content.length < typeConfig.min_content_length ?: 500) { - log.info("Content too short for '$url': ${content.length} < ${typeConfig.min_content_length ?: 500} chars, skipping") - this.appendLine("*Content too short (${content.length} chars), skipping this result*") - this.appendLine() - return@buildString - } + allPageResults[currentIndex] = pageResult + return@buildString + } - val analysisGoal = when { - this@CrawlerAgentTask.executionConfig?.content_queries != null -> executionConfig.toJson() - this@CrawlerAgentTask.executionConfig?.task_description?.isNotBlank() == true -> executionConfig.toString() - else -> "Analyze the content and provide insights." 
- } - log.debug("Analyzing content for '$url' with goal: $analysisGoal") - val analysis: ParsedResponse = - transformContent( - content, - analysisGoal, - orchestrationConfig, - task - ) + // Use strategy to process the page + log.debug("Processing page with strategy: ${processingStrategy.javaClass.simpleName}") + val pageResult = processingStrategy.processPage(url, content, processingContext) + allPageResults[currentIndex] = pageResult - val parsedPage = analysis.obj - if (parsedPage.page_type == PageType.Error) { - log.warn("Analysis returned error for '$url': ${parsedPage.page_information}") - this.appendLine( - "*Error processing this result: ${ - parsedPage.page_information?.let { - JsonUtil.toJson( - it - ) - } - }*" - ) - this.appendLine() - saveAnalysis(webSearchDir.resolve("error").apply { - mkdirs() - }, url, analysis, currentIndex) - return@buildString - } + // Handle different page types + if (pageResult.pageType == PageType.Error) { + log.warn("Strategy returned error for '$url': ${pageResult.metadata["error"]}") + this.appendLine("*Error processing this result: ${pageResult.metadata["error"]}*") + this.appendLine() + saveStrategyResult(webSearchDir.resolve("error").apply { mkdirs() }, url, pageResult, currentIndex) + return@buildString + } - if (parsedPage.page_type == PageType.Irrelevant) { - log.info("Content marked as irrelevant for '$url', skipping") - this.appendLine("*Irrelevant content, skipping this result*") - this.appendLine() - saveAnalysis(webSearchDir.resolve("irrelevant").apply { - mkdirs() - }, url, analysis, currentIndex) - return@buildString - } - log.debug("Successfully analyzed content for '$url', saving results") + if (pageResult.pageType == PageType.Irrelevant) { + log.info("Strategy marked content as irrelevant for '$url'") + this.appendLine("*Irrelevant content, skipping this result*") + this.appendLine() + saveStrategyResult(webSearchDir.resolve("irrelevant").apply { mkdirs() }, url, pageResult, currentIndex) + return@buildString + } - saveAnalysis( - webSearchDir = webSearchDir, - url = url, - analysis = analysis, - index = currentIndex - ) + saveStrategyResult(webSearchDir, url, pageResult, currentIndex) - this.appendLine(analysis.text) + this.appendLine(pageResult.content) + this.appendLine() + // Check for early termination + if (pageResult.shouldTerminate) { + log.info("Strategy requested termination: ${pageResult.terminationReason}") this.appendLine() + this.appendLine("---") + this.appendLine() + this.appendLine("**Crawling terminated:** ${pageResult.terminationReason}") + this.appendLine() + } - if (typeConfig.follow_links == true) { - - var linkData = parsedPage.link_data - val allowRevisit = /*taskConfig?.allow_revisit_pages ?:*/ - typeConfig.allow_revisit_pages == true - if (linkData.isNullOrEmpty()) { - linkData = extractLinksFromMarkdown(analysis.text) - log.debug("Extracted ${linkData.size} links from markdown for '$url'") - } else { - log.debug("Using ${linkData.size} structured links from analysis for '$url'") - } - // Add extracted links section to UI - if (linkData.isNotEmpty()) { - this.appendLine() - this.appendLine("### Extracted Links (${linkData.size} found)") - this.appendLine() - } + if (typeConfig.follow_links == true) { + var linkData = pageResult.extractedLinks + val allowRevisit = /*taskConfig?.allow_revisit_pages ?:*/ typeConfig.allow_revisit_pages == true + if (linkData.isNullOrEmpty()) { + linkData = extractLinksFromMarkdown(pageResult.content) + log.debug("Extracted ${linkData.size} links from markdown for '$url'") + } 
else { + log.debug("Using ${linkData.size} structured links from analysis for '$url'") + } + // Add extracted links section to UI + if (linkData.isNotEmpty()) { + this.appendLine() + this.appendLine("### Extracted Links (${linkData.size} found)") + this.appendLine() + } - var addedCount = 0 - val skippedLinks = mutableListOf>() - - linkData - .take(10) // Limit links per page to prevent explosion - .filter { link -> - val isValid = VALID_URL_PATTERN.matcher(link.url!!).matches() - val isNotBlacklisted = !isBlacklistedDomain(link.url) - val isNotDuplicate = allowRevisit || !seenUrls.contains(link.url) - val isAllowedByRobots = typeConfig.respect_robots_txt != true || - robotsTxtParser.isAllowed(link.url) - - if (!isValid) { - skippedLinks.add(link to "Invalid URL format") - } else if (!isNotBlacklisted) { - skippedLinks.add(link to "Blacklisted domain") - } else if (!isNotDuplicate) { - skippedLinks.add(link to "Already in queue") - } else if (!isAllowedByRobots) { - skippedLinks.add(link to "Disallowed by robots.txt") - } - isValid && isNotBlacklisted && isNotDuplicate && isAllowedByRobots - } - .forEach { link -> - val newLink = link.apply { depth = page.depth + 1 } - if (addToQueue(newLink, maxDepth, maxQueueSize)) { - addedCount++ - this.appendLine("- ✅ **[${link.title ?: "Untitled"}](${link.url})** (depth: ${newLink.depth}, relevance: ${link.relevance_score})") - } else { - skippedLinks.add(link to "Queue limit reached or max depth exceeded") - } + var addedCount = 0 + val skippedLinks = mutableListOf>() + + linkData.take(10) // Limit links per page to prevent explosion + .filter { link -> + val linkUrl = link.url + val isValid = VALID_URL_PATTERN.matcher(linkUrl!!).matches() + val isNotBlacklisted = !isBlacklistedDomain(linkUrl) + val isNotDuplicate = allowRevisit || !seenUrls.contains(linkUrl) + val isAllowedByRobots = typeConfig.respect_robots_txt != true || robotsTxtParser.isAllowed(linkUrl) + + if (!isValid) { + skippedLinks.add(link to "Invalid URL format") + } else if (!isNotBlacklisted) { + skippedLinks.add(link to "Blacklisted domain") + } else if (!isNotDuplicate) { + skippedLinks.add(link to "Already in queue") + } else if (!isAllowedByRobots) { + skippedLinks.add(link to "Disallowed by robots.txt") } - // Show skipped links - if (skippedLinks.isNotEmpty()) { - this.appendLine() - this.appendLine("
") - this.appendLine("Skipped Links (${skippedLinks.size})") - this.appendLine() - skippedLinks.forEach { (link, reason) -> - this.appendLine("- ⏭️ **[${link.title ?: "Untitled"}](${link.url})** - *${reason}*") - } - this.appendLine() - this.appendLine("
") - this.appendLine() - } - log.info("Added $addedCount new links to queue from '$url' (filtered from ${linkData.size} total)") - // Add summary - if (linkData.isNotEmpty()) { - this.appendLine() - this.appendLine("**Link Processing Summary:** ${addedCount} added to queue, ${skippedLinks.size} skipped") - this.appendLine() + isValid && isNotBlacklisted && isNotDuplicate && isAllowedByRobots + }.forEach { link -> + val newLink = link.apply { depth = page.depth + 1 } + if (addToQueue(newLink, maxDepth, maxQueueSize)) { + addedCount++ + this.appendLine("- ✅ **[${link.title ?: "Untitled"}](${link.url})** (depth: ${newLink.depth}, relevance: ${link.relevance_score})") + } else { + skippedLinks.add(link to "Queue limit reached or max depth exceeded") + } } - // Log link processing to transcript - transcriptStream?.let { stream -> - writeToTranscript( - stream, - "**Links Found:** ${linkData.size}, **Added to Queue:** $addedCount, **Skipped:** ${skippedLinks.size}\n\n" - ) + // Show skipped links + if (skippedLinks.isNotEmpty()) { + this.appendLine() + this.appendLine("
") + this.appendLine("Skipped Links (${skippedLinks.size})") + this.appendLine() + skippedLinks.forEach { (link, reason) -> + this.appendLine("- ⏭️ **[${link.title ?: "Untitled"}](${link.url})** - *${reason}*") } + this.appendLine() + this.appendLine("
") + this.appendLine() } - } catch (e: Exception) { - log.error("Error processing URL: $url", e) - errorCount.incrementAndGet() - synchronized(pageQueueLock) { - page.error = e.message + log.info("Added $addedCount new links to queue from '$url' (filtered from ${linkData.size} total)") + // Add summary + if (linkData.isNotEmpty()) { + this.appendLine() + this.appendLine("**Link Processing Summary:** ${addedCount} added to queue, ${skippedLinks.size} skipped") + this.appendLine() } - this.appendLine("*Error processing this result: ${e.message}*") - this.appendLine() - // Log error to transcript transcriptStream?.let { stream -> + writeToTranscript( + stream, buildString { + appendLine() + appendLine("### Link Processing Summary for [${title}]($url)") + appendLine("
") + appendLine("**Links Found:** ${linkData.size}, **Added to Queue:** $addedCount, **Skipped:** ${skippedLinks.size}") + appendLine() + linkData.forEach { link -> + val wasAdded = seenUrls.contains(link.url) + appendLine( + "- ${if (wasAdded) "✅" else "⏭️"} **[${link.title ?: "Untitled"}](${link.url})** - Relevance: ${link.relevance_score} ${ + link.tags?.joinToString( + ", " + )?.let { " - Tags: $it" } ?: "" + }") + } + appendLine() + appendLine("
") + appendLine() + } + ) + } + } + } catch (e: Exception) { + log.error("Error processing URL: $url", e) + errorCount.incrementAndGet() + synchronized(pageQueueLock) { + page.error = e.message + } + this.appendLine("*Error processing this result: ${e.message}*") + this.appendLine() + // Log error to transcript + transcriptStream?.let { stream -> + try { writeToTranscript(stream, "**Error:** ${e.message}\n\n") + } catch (ex: Exception) { + log.debug("Failed to write error to transcript (stream may be closed)", ex) } } } + } task.add(processPageResult.renderMarkdown) analysisResultsMap[currentIndex] = processPageResult log.info("Successfully processed page ${currentIndex}: url='${link}', processing_time=${System.currentTimeMillis() - pageStartTime}ms") @@ -934,13 +1011,16 @@ class CrawlerAgentTask( errorCount.incrementAndGet() page.error = e.message page.error = e.message - analysisResultsMap[currentIndex] = - "## ${currentIndex}. [${page.title}](${link})\n\n*Error processing this result: ${e.message}*\n\n" + analysisResultsMap[currentIndex] = "## ${currentIndex}. [${page.title}](${link})\n\n*Error processing this result: ${e.message}*\n\n" } finally { // Log page completion to transcript transcriptStream?.let { stream -> - writeToTranscript(stream, "**Completed:** ${LocalDateTime.now().format(DateTimeFormatter.ofPattern("HH:mm:ss"))}\n") - writeToTranscript(stream, "**Processing Time:** ${System.currentTimeMillis() - pageStartTime}ms\n\n---\n\n") + try { + writeToTranscript(stream, "**Completed:** ${LocalDateTime.now().format(DateTimeFormatter.ofPattern("HH:mm:ss"))}\n") + writeToTranscript(stream, "**Processing Time:** ${System.currentTimeMillis() - pageStartTime}ms\n\n---\n\n") + } catch (e: Exception) { + log.debug("Failed to write page completion to transcript (stream may be closed)", e) + } } page.completed = true @@ -953,20 +1033,29 @@ class CrawlerAgentTask( private fun isBlacklistedDomain(url: String): Boolean { val blacklistedDomains = setOf( - "facebook.com", "twitter.com", "instagram.com", "linkedin.com", - "youtube.com", "tiktok.com", "pinterest.com", "reddit.com", - "amazon.com", "ebay.com", "aliexpress.com" + "facebook.com", + "twitter.com", + "instagram.com", + "linkedin.com", + "youtube.com", + "tiktok.com", + "pinterest.com", + "reddit.com", + "amazon.com", + "ebay.com", + "aliexpress.com" ) return try { val uri = URI.create(url) - val typeConfig = typeConfig ?: throw RuntimeException() + val typeConfig = typeConfig ?: throw RuntimeException("Missing type config") // Check if URL is restricted by allowed_domains whitelist val allowedDomains = - ((typeConfig.allowed_domains?.split(Regex("\\s+"))?.filter { it.isNotBlank() } ?: listOf()) + - (executionConfig?.allowed_domains?.split(Regex("\\s+"))?.filter { it.isNotBlank() } - ?: listOf())).toSet() - if (!allowedDomains.isNullOrEmpty()) { + (( + (typeConfig.allowed_domains?.split(Regex("\\s+"))?.filter { it.isNotBlank() } ?: listOf()) + //+ (executionConfig?.allowed_domains?.split(Regex("\\s+") + )?.filter { it.isNotBlank() } ?: listOf()).toSet() + if (allowedDomains.isNotEmpty()) { val isAllowed = allowedDomains.any { allowedDomainOrPrefix -> val normalizedAllowed = allowedDomainOrPrefix.lowercase().trim() when { @@ -1004,15 +1093,14 @@ class CrawlerAgentTask( } } - private fun createFinalSummary(analysisResults: String, task: SessionTask): String { + private fun createFinalSummary(analysisResults: String, chatInterface: ChatInterface): String { log.info("Creating final summary of analysis results (original size: 
${analysisResults.length})") - val typeConfig = typeConfig ?: throw RuntimeException() + val typeConfig = typeConfig ?: throw RuntimeException("Missing type config") if (analysisResults.length < (typeConfig.max_final_output_size ?: 15000) * 1.2) { log.info("Analysis results only slightly exceed max size, truncating instead of summarizing") return analysisResults.substring( - 0, - min(analysisResults.length, typeConfig.max_final_output_size ?: 15000) + 0, min(analysisResults.length, typeConfig.max_final_output_size ?: 15000) ) + "\n\n---\n\n*Note: Some content has been truncated due to length limitations.*" } @@ -1036,8 +1124,7 @@ class CrawlerAgentTask( "Include the most important links that should be followed up on.", "Keep your response under ${(typeConfig.max_final_output_size ?: 15000) / 1000}K characters." ).joinToString("\n\n"), - model = (typeConfig.model?.let { orchestrationConfig.instance(it) } - ?: orchestrationConfig.parsingChatter).getChildClient(task), + model = chatInterface, ).answer( listOf( "Here are summaries of each analyzed page:\n${analysisResults}" @@ -1060,6 +1147,147 @@ class CrawlerAgentTask( return sections } + private fun addPageQueueDetailsTab( + tabs: TabbedDisplay, + task: SessionTask, + processedCount: Int, + errorCount: Int + ) { + try { + val queueDetailsTask = task.ui.newTask(false) + tabs["Queue Details"] = queueDetailsTask.placeholder + val queueDetails = buildString { + appendLine("# Page Queue Details") + appendLine() + appendLine("## Summary Statistics") + appendLine() + appendLine("- **Total Pages Processed:** $processedCount") + appendLine("- **Successful:** ${processedCount - errorCount}") + appendLine("- **Errors:** $errorCount") + appendLine("- **Success Rate:** ${if (processedCount > 0) ((processedCount - errorCount) * 100 / processedCount) else 0}%") + synchronized(pageQueueLock) { + val unprocessedPages = pageQueue.toList() + val allSeenUrls = seenUrls.toList() + appendLine("- **Total URLs Discovered:** ${allSeenUrls.size}") + appendLine("- **Pages Not Processed:** ${unprocessedPages.size}") + appendLine() + // Processed pages section + appendLine("## Processed Pages") + appendLine() + val processedPages = allSeenUrls.filter { url -> + unprocessedPages.none { it.url == url } + } + if (processedPages.isEmpty()) { + appendLine("*No pages were processed.*") + } else { + appendLine("| # | URL | Status | Depth | Processing Time | Error |") + appendLine("|---|-----|--------|-------|-----------------|-------|") + processedPages.forEachIndexed { index, url -> + val status = if (urlContentCache.containsKey(url)) "✅ Success" else "❌ Failed" + // Try to find the LinkData for this URL to get more details + val linkData = seenUrls.find { it == url } + val depth = "N/A" // We don't track this for completed pages currently + val processingTime = "N/A" // We don't track this for completed pages currently + val error = "" // We don't track this for completed pages currently + appendLine("| ${index + 1} | [${url.take(50)}...](${url}) | $status | $depth | $processingTime | $error |") + } + } + appendLine() + // Unprocessed pages section + appendLine("## Unprocessed Pages (Still in Queue)") + appendLine() + if (unprocessedPages.isEmpty()) { + appendLine("*All discovered pages were processed.*") + } else { + appendLine("| # | URL | Title | Relevance | Depth | Priority | Status |") + appendLine("|---|-----|-------|-----------|-------|----------|--------|") + unprocessedPages.sortedByDescending { it.calculatePriority() }.forEachIndexed { index, page -> + val url = 
page.url ?: "N/A" + val title = page.title?.take(30) ?: "Untitled" + val relevance = String.format("%.1f", page.relevance_score) + val depth = page.depth.toString() + val priority = String.format("%.2f", page.calculatePriority()) + val status = when { + page.error != null -> "❌ Error: ${page.error}" + page.started && !page.completed -> "⏳ In Progress" + page.completed -> "✅ Completed" + else -> "⏸️ Queued" + } + appendLine("| ${index + 1} | [${url.take(50)}...](${url}) | $title | $relevance | $depth | $priority | $status |") + } + } + appendLine() + // Depth distribution + appendLine("## Depth Distribution") + appendLine() + val depthCounts = unprocessedPages.groupBy { it.depth }.mapValues { it.value.size } + val maxDepth = (depthCounts.keys.maxOrNull() ?: 0) + if (depthCounts.isEmpty()) { + appendLine("*No depth data available.*") + } else { + appendLine("| Depth | Count | Percentage |") + appendLine("|-------|-------|------------|") + (0..maxDepth).forEach { depth -> + val count = depthCounts[depth] ?: 0 + val percentage = if (unprocessedPages.isNotEmpty()) { + (count * 100.0 / unprocessedPages.size).toInt() + } else 0 + appendLine("| $depth | $count | $percentage% |") + } + } + appendLine() + // Relevance distribution + appendLine("## Relevance Score Distribution") + appendLine() + val relevanceBuckets = unprocessedPages.groupBy { + ((it.relevance_score / 10).toInt() * 10).coerceIn(0, 100) + }.mapValues { it.value.size } + if (relevanceBuckets.isEmpty()) { + appendLine("*No relevance data available.*") + } else { + appendLine("| Score Range | Count | Percentage |") + appendLine("|-------------|-------|------------|") + (0..100 step 10).reversed().forEach { bucket -> + val count = relevanceBuckets[bucket] ?: 0 + val percentage = if (unprocessedPages.isNotEmpty()) { + (count * 100.0 / unprocessedPages.size).toInt() + } else 0 + val range = "${bucket}-${bucket + 9}" + appendLine("| $range | $count | $percentage% |") + } + } + appendLine() + // Top unprocessed pages by priority + appendLine("## Top 10 Unprocessed Pages by Priority") + appendLine() + val topPages = unprocessedPages.sortedByDescending { it.calculatePriority() }.take(10) + if (topPages.isEmpty()) { + appendLine("*No unprocessed pages.*") + } else { + topPages.forEachIndexed { index, page -> + appendLine("### ${index + 1}. 
[${page.title ?: "Untitled"}](${page.url})") + appendLine() + appendLine("- **URL:** ${page.url}") + appendLine("- **Relevance Score:** ${String.format("%.1f", page.relevance_score)}") + appendLine("- **Depth:** ${page.depth}") + appendLine("- **Priority:** ${String.format("%.2f", page.calculatePriority())}") + if (!page.tags.isNullOrEmpty()) { + appendLine("- **Tags:** ${page.tags.joinToString(", ")}") + } + appendLine() + } + } + } + } + queueDetailsTask.add(queueDetails.renderMarkdown) + task.update() + log.info("Added page queue details tab with statistics") + } catch (e: Exception) { + log.error("Failed to create page queue details tab", e) + } + } + + private fun summarizeSection(content: String): String { val firstParagraph = content.split("\n\n").firstOrNull()?.trim() ?: "" @@ -1072,7 +1300,7 @@ class CrawlerAgentTask( private fun fetchAndProcessUrl( url: String, webSearchDir: File, index: Int, pool: ExecutorService, fetchStrategy: FetchStrategy ): String { - val typeConfig = typeConfig ?: throw RuntimeException() + val typeConfig = typeConfig ?: throw RuntimeException("Missing type config") if (url.isBlank()) { throw IllegalArgumentException("URL cannot be blank") } @@ -1083,9 +1311,7 @@ class CrawlerAgentTask( return urlContentCache[url]!! } log.debug( - "Fetching content for URL: {} using method: {}", - url, - typeConfig.fetch_method ?: FetchMethod.HttpClient + "Fetching content for URL: {} using method: {}", url, typeConfig.fetch_method ?: FetchMethod.HttpClient ) return try { @@ -1129,9 +1355,7 @@ class CrawlerAgentTask( log.debug("Extracted ${links.size} valid links from markdown") return links.map { (linkText, linkUrl) -> LinkData( - url = linkUrl, - title = linkText, - relevance_score = 50.0 + url = linkUrl, title = linkText, relevance_score = 50.0 ) } } @@ -1163,18 +1387,22 @@ class CrawlerAgentTask( } } - private fun saveAnalysis(webSearchDir: File, url: String, analysis: ParsedResponse, index: Int) { + private fun saveStrategyResult( + webSearchDir: File, url: String, result: PageProcessingResult, index: Int + ) { try { val timestamp = LocalDateTime.now().format(DateTimeFormatter.ofPattern("yyyyMMdd_HHmmss")) val urlSafe = url.replace(Regex("https?://"), "").replace(Regex("[^a-zA-Z0-9]"), "_").take(100) - val analysisFile = File(webSearchDir, "${urlSafe}_${index}_${timestamp}.md") + val resultFile = File(webSearchDir, "${urlSafe}_${index}_${timestamp}.md") val metadata = mapOf( "url" to url, "timestamp" to LocalDateTime.now().toString(), "index" to index, + "page_type" to result.pageType.name, "query" to (executionConfig?.search_query ?: ""), - "content_query" to (executionConfig?.content_queries ?: "") + "content_query" to (executionConfig?.content_queries ?: ""), + "metadata" to result.metadata ) val metadataJson = try { ObjectMapper().writerWithDefaultPrettyPrinter().writeValueAsString(metadata) @@ -1183,121 +1411,14 @@ class CrawlerAgentTask( "{}" } - val objJson = try { - analysis.obj.let { JsonUtil.toJson(it) } - } catch (e: Exception) { - log.error("Failed to serialize analysis object for URL: $url", e) - "" - } - - val contentWithHeader = "\n\n${analysis.text}" - analysisFile.writeText(contentWithHeader) - log.debug("Saved analysis to file: ${analysisFile.absolutePath} (size: ${contentWithHeader.length} chars)") + val contentWithHeader = "\n\n${result.content}" + resultFile.writeText(contentWithHeader) + log.debug("Saved strategy result to file: ${resultFile.absolutePath} (size: ${contentWithHeader.length} chars)") } catch (e: Exception) { - log.error("Failed to 
save analysis for URL: $url", e) - } - } - - private fun transformContent( - content: String, - analysisGoal: String, - orchestrationConfig: OrchestrationConfig, - task: SessionTask - ): ParsedResponse { - val describer = TaskContextYamlDescriber(orchestrationConfig) - val maxChunkSize = 50000 - if (content.length <= maxChunkSize) { - log.debug("Content size (${content.length}) within limit, processing as single chunk") - return pageParsedResponse(orchestrationConfig, analysisGoal, content, describer, task) - } - - log.debug("Content size (${content.length}) exceeds limit, splitting into chunks") - val chunks = splitContentIntoChunks(content, maxChunkSize) - log.debug("Split content into ${chunks.size} chunks") - val chunkResults = chunks.mapIndexed { index, chunk -> - log.debug("Processing chunk ${index + 1}/${chunks.size} (size: ${chunk.length})") - val chunkGoal = "$analysisGoal (Part ${index + 1}/${chunks.size})" - pageParsedResponse(orchestrationConfig, chunkGoal, chunk, describer, task) - } - if (chunkResults.size == 1) { - log.debug("Only one chunk result, returning directly") - return chunkResults[0] - } - log.debug("Combining ${chunkResults.size} chunk results into final analysis") - val combinedAnalysis = chunkResults.joinToString("\n\n---\n\n") { it.text } - return pageParsedResponse(orchestrationConfig, analysisGoal, combinedAnalysis, describer, task) - } - - private fun pageParsedResponse( - orchestrationConfig: OrchestrationConfig, - analysisGoal: String, - content: String, - describer: TypeDescriber, - task: SessionTask - ) = try { - val typeConfig = typeConfig ?: throw RuntimeException() - val model = (typeConfig.model?.let { orchestrationConfig.instance(it) } - ?: orchestrationConfig.parsingChatter).getChildClient(task) - ParsedAgent( - prompt = listOf( - "Below are analyses of different parts of a web page related to this goal: $analysisGoal", - "Create a unified summary that combines the key insights from all parts.", - "Use markdown formatting for your response, with * characters for bullets.", - "Identify the most important links that should be followed up on according to the goal." 
- ).joinToString("\n\n"), - resultClass = ParsedPage::class.java, - model = model, - describer = describer, - parsingChatter = model, - ).answer(listOf(content)) - } catch (e: Exception) { - log.error("Error during content transformation", e) - object : ParsedResponse( - clazz = ParsedPage::class.java - ) { - override val obj: ParsedPage - get() = ParsedPage( - page_type = PageType.Error, - page_information = "Error during analysis: ${e.message}" - ) - override val text: String - get() = "Error during analysis: ${e.message}" + log.error("Failed to save strategy result for URL: $url", e) } } - private fun splitContentIntoChunks(content: String, maxChunkSize: Int): List { - val chunks = mutableListOf() - var remainingContent = content - while (remainingContent.isNotEmpty()) { - val chunkSize = if (remainingContent.length <= maxChunkSize) { - remainingContent.length - } else { - val breakPoint = findBreakPoint(remainingContent, maxChunkSize) - breakPoint - } - chunks.add(remainingContent.substring(0, chunkSize)) - remainingContent = remainingContent.substring(chunkSize) - } - return chunks - } - - private fun findBreakPoint(text: String, maxSize: Int): Int { - val paragraphBreakSearch = text.substring(0, minOf(maxSize, text.length)).lastIndexOf("\n\n") - if (paragraphBreakSearch > maxSize * 0.7) { - return paragraphBreakSearch + 2 - } - val newlineSearch = text.substring(0, minOf(maxSize, text.length)).lastIndexOf("\n") - if (newlineSearch > maxSize * 0.7) { - return newlineSearch + 1 - } - val sentenceSearch = text.substring(0, minOf(maxSize, text.length)).lastIndexOf(". ") - if (sentenceSearch > maxSize * 0.7) { - return sentenceSearch + 2 - - } - return minOf(maxSize, text.length) - } - companion object { private val log = LoggerFactory.getLogger(CrawlerAgentTask::class.java) private val LINK_PATTERN = Pattern.compile("""\[([^]]+)]\(([^)]+)\)""") @@ -1314,9 +1435,10 @@ class CrawlerAgentTask(
  • Fetches top search results
  • Analyzes content for specific goals
  • Generates detailed analysis reports
  • - + """ ) } + } \ No newline at end of file diff --git a/webui/src/main/kotlin/com/simiacryptus/cognotik/plan/tools/online/README.md b/webui/src/main/kotlin/com/simiacryptus/cognotik/plan/tools/online/README.md new file mode 100644 index 000000000..b1b02dfee --- /dev/null +++ b/webui/src/main/kotlin/com/simiacryptus/cognotik/plan/tools/online/README.md @@ -0,0 +1,444 @@ +# Online Tools Package + +## Overview + +The `online` package provides a comprehensive framework for web crawling, content extraction, and intelligent data processing. It combines AI-powered analysis with configurable processing strategies to extract structured information from websites, making it ideal for research, data mining, competitive analysis, and automated information gathering. + +## Architecture + +The package is organized into four main sub-packages: + +``` +online/ +├── seed/ # URL discovery and seed generation +├── fetch/ # Content retrieval strategies +├── processing/ # Page analysis and data extraction +└── CrawlerAgentTask.kt # Main orchestration logic +``` + +### Component Flow + +``` +[Seed Strategy] → [URL Queue] → [Fetch Strategy] → [Processing Strategy] → [Output] + ↓ ↓ ↓ + Initial URLs Raw Content Structured Data +``` + +## Core Components + +### 1. CrawlerAgentTask + +The main orchestrator that coordinates all crawling operations. + +**Key Features:** +- Intelligent URL queue management with priority-based processing +- Concurrent page processing with configurable parallelism +- Automatic link discovery and depth-limited crawling +- Robots.txt compliance +- Domain whitelisting/blacklisting +- Real-time transcript generation +- Comprehensive error handling and recovery + +**Configuration:** + +```kotlin +data class CrawlerTaskTypeConfig( + val seed_method: SeedMethod? = SeedMethod.GoogleProxy, + val fetch_method: FetchMethod? = FetchMethod.HttpClient, + val processing_strategy: ProcessingStrategyType? = ProcessingStrategyType.DefaultSummarizer, + val allowed_domains: String? = null, + val respect_robots_txt: Boolean? = true, + val max_pages_per_task: Int? = 30, + val max_depth: Int? = 3, + val max_queue_size: Int? = 100, + val concurrent_page_processing: Int? = 3, + val follow_links: Boolean? = true, + val allow_revisit_pages: Boolean? = false, + val create_final_summary: Boolean? = true, + val generate_transcript: Boolean? = true +) +``` + +### 2. Seed Package + +Provides multiple strategies for discovering initial URLs to crawl. + +**Available Methods:** +- **DirectUrls**: Use provided URLs directly +- **GoogleProxy**: Quick Google searches via proxy +- **GoogleSearch**: Full Google Custom Search API integration +- **SearchAPI.io**: Unified API for multiple search engines (Google, Bing, DuckDuckGo, Amazon, eBay, etc.) + +See [seed/README.md](seed/README.md) for detailed documentation. + +### 3. Fetch Package + +Handles content retrieval from URLs with multiple strategies. + +**Available Methods:** +- **HttpClient**: Fast, efficient HTTP-based fetching (default) +- **Selenium**: Browser-based rendering for JavaScript-heavy sites + +**Supported Content Types:** +- HTML pages +- PDF documents +- Microsoft Office files (DOC, DOCX, XLS, XLSX, PPT, PPTX) +- OpenDocument formats (ODT) +- Plain text files + +See [fetch/README.md](fetch/README.md) for detailed documentation. + +### 4. Processing Package + +Provides specialized strategies for analyzing and extracting data from web pages. + +**Available Strategies:** + +1. **DefaultSummarizer**: General content analysis and summarization +2. 
**FactChecking**: Multi-source claim verification with evidence tracking +3. **JobMatching**: Automated job search and application material generation +4. **SchemaExtraction**: Structured data extraction with custom schemas +5. **DataTableAccumulation**: Build comprehensive datasets with configurable columns + +See [processing/README.md](processing/README.md) for detailed documentation. + +## Quick Start + +### Basic Web Search and Analysis + +```kotlin +val task = CrawlerAgentTask( + orchestrationConfig = config, + planTask = CrawlerTaskExecutionConfigData( + search_query = "artificial intelligence trends 2024", + content_queries = "Extract key trends, predictions, and expert opinions" + ) +) + +// Configure task type +task.typeConfig = CrawlerTaskTypeConfig( + seed_method = SeedMethod.GoogleProxy, + fetch_method = FetchMethod.HttpClient, + processing_strategy = ProcessingStrategyType.DefaultSummarizer, + max_pages_per_task = 10 +) + +// Run the task +task.run(agent, messages, sessionTask, resultFn, orchestrationConfig) +``` + +### Job Search Automation + +```kotlin +val task = CrawlerAgentTask( + orchestrationConfig = config, + planTask = CrawlerTaskExecutionConfigData( + search_query = "senior software engineer remote", + content_queries = JobMatchingConfig( + user_experience = "5 years Kotlin, AWS, microservices...", + target_roles = listOf("Senior Engineer", "Tech Lead"), + required_skills = listOf("Kotlin", "AWS", "Docker"), + preferred_locations = listOf("Remote", "San Francisco"), + min_match_score = 0.7, + work_arrangement_preference = "remote", + min_salary = 150000 + ) + ) +) + +task.typeConfig = CrawlerTaskTypeConfig( + processing_strategy = ProcessingStrategyType.JobMatching, + max_pages_per_task = 20 +) +``` + +### Data Extraction + +```kotlin +val task = CrawlerAgentTask( + orchestrationConfig = config, + planTask = CrawlerTaskExecutionConfigData( + search_query = "best laptops 2024", + content_queries = SchemaExtractionConfig( + schema_definition = """ + { + "model": "string", + "price": "number", + "rating": "number", + "specs": { + "processor": "string", + "ram": "string", + "storage": "string" + } + } + """, + aggregate_results = true, + deduplicate = true + ) + ) +) + +task.typeConfig = CrawlerTaskTypeConfig( + processing_strategy = ProcessingStrategyType.SchemaExtraction, + max_pages_per_task = 15 +) +``` + +### Fact Checking + +```kotlin +val task = CrawlerAgentTask( + orchestrationConfig = config, + planTask = CrawlerTaskExecutionConfigData( + search_query = "company X employee count revenue", + content_queries = FactCheckingConfig( + claims_to_verify = listOf( + "Company X has 10,000 employees", + "Company X revenue is $1B annually" + ), + required_sources = 3, + confidence_threshold = 0.8 + ) + ) +) + +task.typeConfig = CrawlerTaskTypeConfig( + processing_strategy = ProcessingStrategyType.FactChecking, + max_pages_per_task = 20 +) +``` + +## Advanced Features + +### Domain Whitelisting + +Restrict crawling to specific domains or URL prefixes: + +```kotlin +task.typeConfig = CrawlerTaskTypeConfig( + allowed_domains = "example.com wikipedia.org https://docs.example.com/api", + // Space-separated list of domains or URL prefixes +) +``` + +### Link Following + +Automatically discover and follow links found in analyzed pages: + +```kotlin +task.typeConfig = CrawlerTaskTypeConfig( + follow_links = true, + max_depth = 3, // How many levels deep to crawl + max_queue_size = 100 // Maximum URLs in queue +) +``` + +### Robots.txt Compliance + +Respect website crawling rules: 
+ +```kotlin +task.typeConfig = CrawlerTaskTypeConfig( + respect_robots_txt = true // Honors robots.txt and crawl delays +) +``` + +### Concurrent Processing + +Control parallelism for faster crawling: + +```kotlin +task.typeConfig = CrawlerTaskTypeConfig( + concurrent_page_processing = 5 // Process 5 pages simultaneously +) +``` + +## Output Structure + +The crawler generates organized output in the `.websearch` directory: + +``` +.websearch/ +├── crawler_transcript.md # Real-time processing log +├── raw_pages/ # Original HTML content +├── reduced_pages/ # Simplified HTML +├── documents/ # Downloaded documents +├── extracted_text/ # Text from documents +├── aggregated_data.json # Extracted structured data +├── data_table.csv # Tabular datasets +└── job_matches/ # Job application materials + ├── Company_Position_timestamp.md + └── ... +``` + +## Transcript Generation + +The crawler generates a detailed markdown transcript of all operations: + +```markdown +# Crawler Agent Transcript + +**Started:** 2024-01-15 10:30:00 +**Search Query:** artificial intelligence trends + +## Seed Links +1. [AI Trends 2024](https://example.com/ai-trends) + - Relevance: 95.0 + +### Processing Page 1: [AI Trends 2024](https://example.com/ai-trends) +**Started:** 10:30:15 +**Completed:** 10:30:45 +**Processing Time:** 30000ms + +[Content analysis...] + +### Link Processing Summary +**Links Found:** 15, **Added to Queue:** 8, **Skipped:** 7 + +## Final Summary +[Comprehensive analysis of all processed pages...] +``` + +## Error Handling + +The package implements comprehensive error handling: + +1. **Network Errors**: Automatic retries with exponential backoff +2. **Content Errors**: Graceful handling of malformed content +3. **Processing Errors**: Individual page failures don't stop the crawl +4. **Resource Limits**: Automatic cleanup and memory management +5. **Early Termination**: Strategies can stop crawling when goals are met + +## Performance Considerations + +### Memory Management + +- URL queue size limits prevent memory exhaustion +- Content caching with size limits +- Automatic cleanup of temporary resources +- Selenium WebDriver cleanup on task completion + +### Speed Optimization + +- Concurrent page processing (configurable) +- HTTP connection pooling +- Content size limits (5MB HTML, 10MB documents) +- Early termination when goals are met +- Priority-based URL queue + +### Cost Control + +- Page limits per task +- Depth limits for crawling +- Domain restrictions +- Robots.txt compliance +- Configurable processing strategies + +## Best Practices + +### 1. Choose the Right Strategy + +- **Research & Analysis**: Use `DefaultSummarizer` +- **Claim Verification**: Use `FactChecking` +- **Job Hunting**: Use `JobMatching` +- **Data Mining**: Use `SchemaExtraction` or `DataTableAccumulation` + +### 2. Configure Appropriately + +```kotlin +// For quick research (fast, limited scope) +CrawlerTaskTypeConfig( + max_pages_per_task = 10, + max_depth = 2, + concurrent_page_processing = 3 +) + +// For comprehensive analysis (thorough, slower) +CrawlerTaskTypeConfig( + max_pages_per_task = 50, + max_depth = 4, + concurrent_page_processing = 5, + follow_links = true +) + +// For targeted extraction (focused, efficient) +CrawlerTaskTypeConfig( + max_pages_per_task = 20, + max_depth = 2, + allowed_domains = "target-site.com", + follow_links = false +) +``` + +### 3. 
Handle Results + +```kotlin +// Access structured data +val extractedData = File(".websearch/aggregated_data.json") + .readText() + .let { ObjectMapper().readValue(it, List::class.java) } + +// Read transcript +val transcript = File(".websearch/crawler_transcript.md").readText() + +// Process job matches +File(".websearch/job_matches").listFiles()?.forEach { jobReport -> + println("Found match: ${jobReport.name}") +} +``` + +### 4. Monitor Progress + +```kotlin +// Enable transcript for real-time monitoring +task.typeConfig = CrawlerTaskTypeConfig( + generate_transcript = true +) + +// Check logs for detailed operation info +// Logs include: URL queue status, processing times, error rates +``` + +## Security Considerations + +1. **API Keys**: Store securely in user settings, never log +2. **Domain Restrictions**: Use `allowed_domains` to prevent unauthorized crawling +3. **Robots.txt**: Respect website policies with `respect_robots_txt = true` +4. **Rate Limiting**: Built-in delays and concurrent processing limits +5. **Content Validation**: All URLs and content validated before processing + +## Troubleshooting + +### Common Issues + +**No results returned:** +- Check search query or direct URLs +- Verify API credentials (for Google/SearchAPI methods) +- Check domain restrictions +- Review transcript for errors + +**Too many errors:** +- Reduce concurrent processing +- Check network connectivity +- Verify target sites are accessible +- Review robots.txt compliance + +**Memory issues:** +- Reduce `max_queue_size` +- Lower `max_pages_per_task` +- Decrease `concurrent_page_processing` + +**Slow performance:** +- Increase `concurrent_page_processing` +- Use `HttpClient` instead of `Selenium` +- Reduce `max_depth` +- Set stricter domain restrictions + +## Dependencies + +- **Jackson**: JSON parsing and serialization +- **Java HTTP Client**: HTTP requests +- **Selenium WebDriver**: Browser automation (optional) +- **DocumentReader**: Document text extraction +- **CognoTik Platform**: AI/LLM integration + diff --git a/webui/src/main/kotlin/com/simiacryptus/cognotik/plan/tools/online/FetchMethod.kt b/webui/src/main/kotlin/com/simiacryptus/cognotik/plan/tools/online/fetch/FetchMethod.kt similarity index 88% rename from webui/src/main/kotlin/com/simiacryptus/cognotik/plan/tools/online/FetchMethod.kt rename to webui/src/main/kotlin/com/simiacryptus/cognotik/plan/tools/online/fetch/FetchMethod.kt index 7a8f25f7a..10623fa50 100644 --- a/webui/src/main/kotlin/com/simiacryptus/cognotik/plan/tools/online/FetchMethod.kt +++ b/webui/src/main/kotlin/com/simiacryptus/cognotik/plan/tools/online/fetch/FetchMethod.kt @@ -1,6 +1,7 @@ -package com.simiacryptus.cognotik.plan.tools.online +package com.simiacryptus.cognotik.plan.tools.online.fetch import com.simiacryptus.cognotik.plan.OrchestrationConfig +import com.simiacryptus.cognotik.plan.tools.online.CrawlerAgentTask import com.simiacryptus.cognotik.util.EnabledStrategy import com.simiacryptus.cognotik.util.LoggerFactory import java.io.File diff --git a/webui/src/main/kotlin/com/simiacryptus/cognotik/plan/tools/online/HttpClientFetch.kt b/webui/src/main/kotlin/com/simiacryptus/cognotik/plan/tools/online/fetch/HttpClientFetch.kt similarity index 98% rename from webui/src/main/kotlin/com/simiacryptus/cognotik/plan/tools/online/HttpClientFetch.kt rename to webui/src/main/kotlin/com/simiacryptus/cognotik/plan/tools/online/fetch/HttpClientFetch.kt index fca3bf8a7..06532da7d 100644 --- 
a/webui/src/main/kotlin/com/simiacryptus/cognotik/plan/tools/online/HttpClientFetch.kt +++ b/webui/src/main/kotlin/com/simiacryptus/cognotik/plan/tools/online/fetch/HttpClientFetch.kt @@ -1,7 +1,8 @@ -package com.simiacryptus.cognotik.plan.tools.online +package com.simiacryptus.cognotik.plan.tools.online.fetch import com.simiacryptus.cognotik.input.getReader import com.simiacryptus.cognotik.plan.OrchestrationConfig +import com.simiacryptus.cognotik.plan.tools.online.CrawlerAgentTask import com.simiacryptus.cognotik.util.HtmlSimplifier import java.io.File import java.io.FileOutputStream diff --git a/webui/src/main/kotlin/com/simiacryptus/cognotik/plan/tools/online/fetch/README.md b/webui/src/main/kotlin/com/simiacryptus/cognotik/plan/tools/online/fetch/README.md new file mode 100644 index 000000000..fa0eb3fe6 --- /dev/null +++ b/webui/src/main/kotlin/com/simiacryptus/cognotik/plan/tools/online/fetch/README.md @@ -0,0 +1,196 @@ +# Online Fetch Package + +This package provides flexible strategies for fetching web content, supporting both HTTP-based and browser-based approaches to web scraping and content retrieval. + +## Overview + +The fetch package implements a strategy pattern for web content retrieval, allowing seamless switching between different fetching methods based on requirements and availability. It handles various content types including HTML, documents (PDF, DOCX, etc.), and plain text. + +## Components + +### FetchStrategy Interface + +The core interface that defines the contract for fetching web content: + +```kotlin +interface FetchStrategy : EnabledStrategy { + fun fetch( + url: String, + webSearchDir: File, + index: Int, + pool: ExecutorService, + orchestrationConfig: OrchestrationConfig + ): String +} +``` + +### Fetch Methods + +#### 1. HttpClient (Default) + +A lightweight, efficient HTTP-based fetching strategy using Java's built-in HttpClient. + +**Features:** +- Fast and resource-efficient +- Handles multiple content types: + - HTML pages (with simplification) + - PDF documents + - Microsoft Office documents (DOC, DOCX, XLS, XLSX, PPT, PPTX) + - OpenDocument formats (ODT) + - Plain text files +- SSL/TLS support with flexible certificate validation +- Automatic text extraction from documents +- Content size limits (5MB for HTML, 10MB for documents) +- Proper error handling and fallbacks + +**Usage:** +```kotlin +val strategy = HttpClientFetch().createStrategy(task) +val content = strategy.fetch(url, webSearchDir, index, pool, config) +``` + +#### 2. Selenium (Optional) + +A browser-based fetching strategy using Selenium WebDriver for JavaScript-heavy sites. + +**Features:** +- Full browser rendering +- JavaScript execution support +- Automatic fallback to HttpClient on failure +- Can be enabled/disabled via `FetchConfig.isSeleniumEnabled` + +**Usage:** +```kotlin +FetchConfig.isSeleniumEnabled = true +val strategy = Selenium().createStrategy(task) +val content = strategy.fetch(url, webSearchDir, index, pool, config) +``` + +### FetchMethod Enum + +Factory enum for creating fetch strategies: + +```kotlin +enum class FetchMethod : FetchMethodFactory { + Selenium, + HttpClient +} +``` + +## Content Processing + +### HTML Processing + +1. **Raw Content Storage**: Original HTML is saved to `raw_pages/` +2. **Simplification**: HTML is cleaned and simplified using `HtmlSimplifier`: + - Removes CSS, scripts, and interactive elements + - Preserves semantic structure + - Removes event handlers and media elements +3. 
**Reduced Content Storage**: Simplified HTML is saved to `reduced_pages/` + +### Document Processing + +1. **Binary Download**: Documents are downloaded as byte arrays +2. **Storage**: Original documents saved to `documents/` directory +3. **Text Extraction**: Text content extracted using `DocumentReader` +4. **Extracted Text Storage**: Plain text saved to `extracted_text/` + +Supported formats: +- PDF (`.pdf`) +- Microsoft Word (`.doc`, `.docx`) +- Microsoft Excel (`.xls`, `.xlsx`) +- Microsoft PowerPoint (`.ppt`, `.pptx`) +- OpenDocument Text (`.odt`) +- Rich Text Format (`.rtf`) + +## Configuration + +### FetchConfig + +Global configuration for fetch behavior: + +```kotlin +object FetchConfig { + var isSeleniumEnabled: Boolean = false +} +``` + +### Content Limits + +- **HTML Content**: 5MB maximum (truncated if larger) +- **Document Files**: 10MB maximum (skipped if larger) +- **HTTP Timeout**: 60 seconds +- **Connection Timeout**: 30 seconds + +## Error Handling + +The package implements robust error handling: + +1. **HTTP Errors**: Non-2xx status codes throw descriptive exceptions +2. **Selenium Fallback**: Automatic fallback to HttpClient if Selenium fails +3. **Document Extraction**: Graceful handling of extraction failures +4. **Content Type Validation**: Skips unsupported content types with warnings +5. **Size Limits**: Enforces reasonable content size limits + +## Directory Structure + +``` +webSearchDir/ +├── raw_pages/ # Original HTML content +├── reduced_pages/ # Simplified HTML content +├── documents/ # Original document files +├── text_pages/ # Plain text content +└── extracted_text/ # Extracted text from documents +``` + +## Logging + +Comprehensive logging at multiple levels: +- **INFO**: Major operations (fetching, processing) +- **DEBUG**: Detailed operation steps +- **WARN**: Fallbacks and skipped content +- **ERROR**: Failures and exceptions + +## Best Practices + +1. **Use HttpClient by default** - It's faster and more reliable for most content +2. **Enable Selenium only when needed** - For JavaScript-heavy sites +3. **Monitor content sizes** - Large documents may be skipped +4. **Handle exceptions** - Network issues and timeouts can occur +5. **Check content types** - Not all content types are supported + +## Example Usage + +```kotlin +// Create a crawler task +val task = CrawlerAgentTask(...) 
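+// Note (illustrative assumption, not part of the original example): the
+// `executorService` and `config` values referenced below are assumed to be an
+// ExecutorService and an OrchestrationConfig already available in the calling
+// scope; they are not constructed in this snippet.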
+ +// Use HttpClient (recommended) +val httpStrategy = HttpClientFetch().createStrategy(task) +val content = httpStrategy.fetch( + url = "https://example.com", + webSearchDir = File("./output"), + index = 0, + pool = executorService, + orchestrationConfig = config +) + +// Or use Selenium for JavaScript sites +FetchConfig.isSeleniumEnabled = true +val seleniumStrategy = Selenium().createStrategy(task) +val jsContent = seleniumStrategy.fetch( + url = "https://js-heavy-site.com", + webSearchDir = File("./output"), + index = 1, + pool = executorService, + orchestrationConfig = config +) +``` + +## Dependencies + +- Java 11+ HttpClient +- Selenium WebDriver (optional) +- DocumentReader for text extraction +- HtmlSimplifier for HTML processing + diff --git a/webui/src/main/kotlin/com/simiacryptus/cognotik/plan/tools/online/Selenium.kt b/webui/src/main/kotlin/com/simiacryptus/cognotik/plan/tools/online/fetch/Selenium.kt similarity index 93% rename from webui/src/main/kotlin/com/simiacryptus/cognotik/plan/tools/online/Selenium.kt rename to webui/src/main/kotlin/com/simiacryptus/cognotik/plan/tools/online/fetch/Selenium.kt index b932238fa..e33f47961 100644 --- a/webui/src/main/kotlin/com/simiacryptus/cognotik/plan/tools/online/Selenium.kt +++ b/webui/src/main/kotlin/com/simiacryptus/cognotik/plan/tools/online/fetch/Selenium.kt @@ -1,6 +1,7 @@ -package com.simiacryptus.cognotik.plan.tools.online +package com.simiacryptus.cognotik.plan.tools.online.fetch import com.simiacryptus.cognotik.plan.OrchestrationConfig +import com.simiacryptus.cognotik.plan.tools.online.CrawlerAgentTask import com.simiacryptus.cognotik.util.LoggerFactory import com.simiacryptus.cognotik.util.Selenium2S3 import java.io.File diff --git a/webui/src/main/kotlin/com/simiacryptus/cognotik/plan/tools/online/processing/DataTableAccumulationStrategy.kt b/webui/src/main/kotlin/com/simiacryptus/cognotik/plan/tools/online/processing/DataTableAccumulationStrategy.kt new file mode 100644 index 000000000..84ffa0286 --- /dev/null +++ b/webui/src/main/kotlin/com/simiacryptus/cognotik/plan/tools/online/processing/DataTableAccumulationStrategy.kt @@ -0,0 +1,651 @@ +package com.simiacryptus.cognotik.plan.tools.online.processing + +import com.simiacryptus.cognotik.agents.ParsedAgent +import com.simiacryptus.cognotik.describe.Description +import com.simiacryptus.cognotik.plan.tools.online.CrawlerAgentTask +import com.simiacryptus.cognotik.util.JsonUtil +import com.simiacryptus.cognotik.util.jsonCast +import com.simiacryptus.cognotik.util.toJson +import com.simiacryptus.cognotik.webui.session.getChildClient +import org.slf4j.LoggerFactory +import java.io.File +import java.nio.charset.StandardCharsets +import java.time.LocalDateTime +import java.time.format.DateTimeFormatter +import java.util.concurrent.ConcurrentHashMap + +class DataTableAccumulationStrategy : DefaultSummarizerStrategy() { + + data class DataTableConfig( + @Description("Column names for the data table (comma-separated)") + val column_names: String = "", + @Description("Description of what data to extract for each column") + val column_descriptions: Map = emptyMap(), + @Description("Data types for each column (string, number, boolean, date)") + val column_types: Map = emptyMap(), + @Description("Instructions for extracting data from pages") + val extraction_instructions: String = "", + @Description("Whether to automatically detect and extract HTML tables") + val auto_detect_tables: Boolean = true, + @Description("Minimum number of rows to consider a table valid") + val min_rows: 
Int = 1, + @Description("Maximum number of rows to extract per page (null for unlimited)") + val max_rows_per_page: Int? = null, + @Description("Whether to deduplicate rows based on key columns") + val deduplicate: Boolean = true, + @Description("Column names to use as unique keys for deduplication (comma-separated)") + val key_columns: String? = null, + @Description("Whether to validate data types for each column") + val validate_types: Boolean = true, + @Description("Whether to normalize/clean data values") + val normalize_data: Boolean = true, + @Description("Export format for final table (csv, json, markdown)") + val export_format: String = "csv", + @Description("Whether to include source URLs in the output") + val include_source_urls: Boolean = true + ) + + override val description: String + get() = "Extracts and accumulates tabular data from web pages, building a comprehensive dataset with configurable columns and validation." + + data class ExtractedTableData( + @Description("List of rows extracted from the page, each row is a map of column_name -> value") + val rows: List> = emptyList(), + @Description("Confidence score for the extraction (0.0-1.0)") + val confidence: Double = 1.0, + @Description("Any validation errors or warnings for the extracted data") + val validation_notes: List = emptyList(), + @Description("Metadata about the extracted table") + val metadata: Map = emptyMap() + ) + + private val accumulatedRows = ConcurrentHashMap>>() + private val seenRowKeys = ConcurrentHashMap.newKeySet() + private val columnStats = ConcurrentHashMap() + + data class ColumnStatistics( + var totalValues: Int = 0, + var nullValues: Int = 0, + var uniqueValues: MutableSet = mutableSetOf(), + var typeViolations: Int = 0 + ) + + companion object { + private val log = LoggerFactory.getLogger(DataTableAccumulationStrategy::class.java) + } + + override fun processPage( + url: String, + content: String, + context: PageProcessingStrategy.ProcessingContext + ): PageProcessingStrategy.PageProcessingResult { + log.debug("Processing page for data table accumulation: $url") + + val config = try { + context.executionConfig.content_queries?.let { queries -> + when (queries) { + is String -> JsonUtil.fromJson(queries, DataTableConfig::class.java) + else -> queries.jsonCast() + } + } ?: run { + log.warn("No data table config provided, using default") + DataTableConfig() + } + } catch (e: Exception) { + log.error("Failed to parse data table config", e) + return PageProcessingStrategy.PageProcessingResult( + url = url, + pageType = CrawlerAgentTask.PageType.Error, + content = "Configuration error: ${e.message}", + error = e + ) + } + + // Extract table data + val extractionResult = try { + extractTableData(url, content, config, context) + } catch (e: Exception) { + log.error("Failed to extract table data from: $url", e) + return PageProcessingStrategy.PageProcessingResult( + url = url, + pageType = CrawlerAgentTask.PageType.Error, + content = "Extraction error: ${e.message}", + error = e + ) + } + + // Validate and normalize rows + val processedRows = extractionResult.rows.mapNotNull { row -> + processRow(row, config, url) + } + + // Store accumulated data + if (processedRows.isNotEmpty()) { + storeTableData(url, processedRows, config) + updateColumnStatistics(processedRows, config) + } + + // Generate summary + val summary = buildString { + appendLine("## Data Table Extraction Results") + appendLine() + appendLine("**URL:** [$url]($url)") + appendLine("**Confidence:** ${(extractionResult.confidence * 
100).toInt()}%") + appendLine("**Rows Extracted:** ${processedRows.size}") + appendLine("**Total Accumulated Rows:** ${accumulatedRows.values.sumOf { it.size }}") + appendLine() + + if (extractionResult.validation_notes.isNotEmpty()) { + appendLine("### Validation Notes") + extractionResult.validation_notes.forEach { note -> + appendLine("- ⚠️ $note") + } + appendLine() + } + + if (processedRows.isNotEmpty()) { + appendLine("### Sample Data (First 3 Rows)") + appendLine() + appendLine(formatRowsAsMarkdownTable(processedRows.take(3), config)) + appendLine() + } else { + appendLine("*No valid data extracted from this page*") + appendLine() + } + } + + // Also get standard link extraction + val standardResult = super.processPage(url, content, context) + + return PageProcessingStrategy.PageProcessingResult( + url = url, + pageType = if (processedRows.isNotEmpty()) CrawlerAgentTask.PageType.OK else CrawlerAgentTask.PageType.Irrelevant, + content = summary, + extractedLinks = standardResult.extractedLinks, + metadata = mapOf( + "rows_extracted" to processedRows.size, + "total_rows" to accumulatedRows.values.sumOf { it.size }, + "confidence" to extractionResult.confidence, + "validation_notes" to extractionResult.validation_notes + ) + ) + } + + private fun extractTableData( + url: String, + content: String, + config: DataTableConfig, + context: PageProcessingStrategy.ProcessingContext + ): ExtractedTableData { + log.debug("Extracting table data from: $url") + + val columns = config.column_names.split(",").map { it.trim() }.filter { it.isNotEmpty() } + + val prompt = buildString { + appendLine("Extract tabular data from the following web page content.") + appendLine() + appendLine("REQUIRED COLUMNS:") + columns.forEach { column -> + appendLine("- $column") + config.column_descriptions[column]?.let { desc -> + appendLine(" Description: $desc") + } + config.column_types[column]?.let { type -> + appendLine(" Type: $type") + } + } + appendLine() + appendLine("EXTRACTION INSTRUCTIONS:") + appendLine(config.extraction_instructions.ifBlank { "Extract all relevant data matching the column definitions" }) + appendLine() + + if (config.auto_detect_tables) { + appendLine("Look for HTML tables, lists, or structured data that matches these columns.") + appendLine() + } + + if (config.max_rows_per_page != null) { + appendLine("Extract up to ${config.max_rows_per_page} rows from this page.") + appendLine() + } + + appendLine("Return:") + appendLine("1. A list of rows, where each row is a map of column_name -> value") + appendLine("2. A confidence score (0.0-1.0) for the extraction quality") + appendLine("3. Any validation notes or warnings") + appendLine() + appendLine("If a column value is not found or not applicable, use null.") + appendLine("Ensure all rows have the same columns.") + } + + val model = (context.typeConfig.model?.let { context.orchestrationConfig.instance(it) } + ?: context.orchestrationConfig.parsingChatter).getChildClient(context.task) + + val result = ParsedAgent( + prompt = prompt, + resultClass = ExtractedTableData::class.java, + model = model, + parsingChatter = model + ).answer(listOf(content.take(50000))).obj + + // Apply row limit if configured + val limitedRows = if (config.max_rows_per_page != null && result.rows.size > config.max_rows_per_page) { + result.rows.take(config.max_rows_per_page) + } else { + result.rows + } + + return result.copy(rows = limitedRows) + } + + private fun processRow( + row: Map, + config: DataTableConfig, + sourceUrl: String + ): Map? 
{ + // Validate minimum data presence + val nonNullValues = row.values.count { it != null && it.toString().isNotBlank() } + if (nonNullValues == 0) { + log.debug("Skipping empty row from: $sourceUrl") + return null + } + + // Check for duplicates if enabled + if (config.deduplicate && config.key_columns != null) { + val keyColumns = config.key_columns.split(",").map { it.trim() } + val rowKey = keyColumns.mapNotNull { col -> + row[col]?.toString() + }.joinToString("|") + + if (rowKey.isNotBlank()) { + if (seenRowKeys.contains(rowKey)) { + log.debug("Skipping duplicate row with key: $rowKey") + return null + } + seenRowKeys.add(rowKey) + } + } + + // Normalize and validate data + val processedRow = row.toMutableMap() + + if (config.normalize_data) { + processedRow.replaceAll { key, value -> + normalizeValue(value, config.column_types[key]) + } + } + + if (config.validate_types) { + processedRow.forEach { (column, value) -> + config.column_types[column]?.let { expectedType -> + if (!validateType(value, expectedType)) { + log.warn("Type validation failed for column '$column': expected $expectedType, got ${value?.javaClass?.simpleName}") + } + } + } + } + + // Add source URL if configured + if (config.include_source_urls) { + processedRow["_source_url"] = sourceUrl + } + + return processedRow + } + + private fun normalizeValue(value: Any?, expectedType: String?): Any? { + if (value == null) return null + + val stringValue = value.toString().trim() + if (stringValue.isEmpty()) return null + + return when (expectedType?.lowercase()) { + "number" -> stringValue.replace(Regex("[^0-9.-]"), "").toDoubleOrNull() + "boolean" -> when (stringValue.lowercase()) { + "true", "yes", "1", "y" -> true + "false", "no", "0", "n" -> false + else -> null + } + "date" -> stringValue // Keep as string, could add date parsing + else -> stringValue + } + } + + private fun validateType(value: Any?, expectedType: String): Boolean { + if (value == null) return true // null is valid for any type + + return when (expectedType.lowercase()) { + "string" -> value is String + "number" -> value is Number || value.toString().toDoubleOrNull() != null + "boolean" -> value is Boolean + "date" -> true // Accept any string for dates + else -> true + } + } + + private fun storeTableData( + url: String, + rows: List>, + config: DataTableConfig + ) { + accumulatedRows.computeIfAbsent(url) { mutableListOf() }.addAll(rows) + log.info("Stored ${rows.size} rows from: $url (total: ${accumulatedRows.values.sumOf { it.size }})") + } + + private fun updateColumnStatistics( + rows: List>, + config: DataTableConfig + ) { + val columns = config.column_names.split(",").map { it.trim() } + + rows.forEach { row -> + columns.forEach { column -> + val stats = columnStats.computeIfAbsent(column) { ColumnStatistics() } + stats.totalValues++ + + val value = row[column] + if (value == null || value.toString().isBlank()) { + stats.nullValues++ + } else { + stats.uniqueValues.add(value.toString()) + + // Check type violations + config.column_types[column]?.let { expectedType -> + if (!validateType(value, expectedType)) { + stats.typeViolations++ + } + } + } + } + } + } + + private fun formatRowsAsMarkdownTable( + rows: List>, + config: DataTableConfig + ): String { + if (rows.isEmpty()) return "*No data*" + + val columns = config.column_names.split(",").map { it.trim() }.filter { it.isNotEmpty() } + if (columns.isEmpty()) return "*No columns defined*" + + return buildString { + // Header + appendLine("| ${columns.joinToString(" | ")} |") + appendLine("| 
${columns.joinToString(" | ") { "---" }} |") + + // Rows + rows.forEach { row -> + val values = columns.map { column -> + row[column]?.toString()?.take(50) ?: "" + } + appendLine("| ${values.joinToString(" | ")} |") + } + } + } + + override fun shouldContinueCrawling( + currentResults: List, + context: PageProcessingStrategy.ProcessingContext + ): PageProcessingStrategy.ContinuationDecision { + val totalRows = accumulatedRows.values.sumOf { it.size } + val successfulExtractions = currentResults.count { + (it.metadata["rows_extracted"] as? Int ?: 0) > 0 + } + + log.debug("Crawling status: $successfulExtractions successful extractions, $totalRows total rows") + + return PageProcessingStrategy.ContinuationDecision( + shouldContinue = context.processedCount.get() < context.maxPages, + reason = "Accumulated $totalRows rows from $successfulExtractions pages so far" + ) + } + + override fun generateFinalOutput( + results: List, + context: PageProcessingStrategy.ProcessingContext + ): String { + log.info("Generating final data table output") + + val config = try { + context.executionConfig.content_queries?.let { queries -> + when (queries) { + is String -> JsonUtil.fromJson(queries, DataTableConfig::class.java) + else -> queries.jsonCast() + } + } ?: DataTableConfig() + } catch (e: Exception) { + log.error("Failed to parse config for final output", e) + DataTableConfig() + } + + // Aggregate all rows + val allRows = accumulatedRows.values.flatten() + + // Export data in requested format + exportDataTable(allRows, config, context) + + // Generate summary report + return buildString { + appendLine("# Data Table Accumulation Results") + appendLine() + appendLine("**Extraction Completed:** ${LocalDateTime.now().format(DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss"))}") + appendLine() + + appendLine("## Summary Statistics") + appendLine() + appendLine("- **Total Pages Processed:** ${results.size}") + appendLine("- **Successful Extractions:** ${accumulatedRows.size}") + appendLine("- **Total Rows Accumulated:** ${allRows.size}") + appendLine("- **Unique Sources:** ${accumulatedRows.keys.size}") + if (config.deduplicate) { + appendLine("- **Deduplication:** Enabled (${seenRowKeys.size} unique keys)") + } + appendLine() + + appendLine("## Column Statistics") + appendLine() + appendLine("| Column | Total Values | Null Values | Unique Values | Type Violations |") + appendLine("|--------|--------------|-------------|---------------|-----------------|") + + val columns = config.column_names.split(",").map { it.trim() } + columns.forEach { column -> + val stats = columnStats[column] ?: ColumnStatistics() + val nullPct = if (stats.totalValues > 0) { + (stats.nullValues * 100.0 / stats.totalValues).toInt() + } else 0 + appendLine("| $column | ${stats.totalValues} | ${stats.nullValues} ($nullPct%) | ${stats.uniqueValues.size} | ${stats.typeViolations} |") + } + appendLine() + + appendLine("## Data Quality") + appendLine() + val avgConfidence = results.mapNotNull { + it.metadata["confidence"] as? Double + }.average().takeIf { !it.isNaN() } ?: 0.0 + appendLine("- **Average Confidence:** ${(avgConfidence * 100).toInt()}%") + + val allValidationNotes = results.flatMap { + (it.metadata["validation_notes"] as? 
List<*>)?.filterIsInstance() ?: emptyList() + }.distinct() + + if (allValidationNotes.isEmpty()) { + appendLine("- **Validation Issues:** None detected ✅") + } else { + appendLine("- **Validation Issues:** ${allValidationNotes.size} warnings") + allValidationNotes.take(5).forEach { note -> + appendLine(" - ⚠️ $note") + } + if (allValidationNotes.size > 5) { + appendLine(" - *... and ${allValidationNotes.size - 5} more*") + } + } + appendLine() + + appendLine("## Exported Files") + appendLine() + appendLine("The complete dataset has been exported to:") + when (config.export_format.lowercase()) { + "csv" -> appendLine("- `${context.webSearchDir.name}/data_table.csv`") + "json" -> appendLine("- `${context.webSearchDir.name}/data_table.json`") + "markdown" -> appendLine("- `${context.webSearchDir.name}/data_table.md`") + else -> appendLine("- `${context.webSearchDir.name}/data_table.${config.export_format}`") + } + appendLine() + + appendLine("## Sample Data (First 10 Rows)") + appendLine() + appendLine(formatRowsAsMarkdownTable(allRows.take(10), config)) + appendLine() + + if (allRows.size > 10) { + appendLine("*... and ${allRows.size - 10} more rows*") + appendLine() + } + + appendLine("## Data Sources") + appendLine() + accumulatedRows.forEach { (url, rows) -> + appendLine("- [$url]($url) - ${rows.size} rows") + } + appendLine() + + appendLine("## Next Steps") + appendLine() + appendLine("1. Review the exported data file for completeness") + appendLine("2. Validate data quality and handle any null values") + appendLine("3. Import data into your analysis tools or database") + appendLine("4. Consider additional crawling if coverage is incomplete") + } + } + + private fun exportDataTable( + rows: List>, + config: DataTableConfig, + context: PageProcessingStrategy.ProcessingContext + ) { + if (rows.isEmpty()) { + log.warn("No rows to export") + return + } + + val columns = config.column_names.split(",").map { it.trim() }.filter { it.isNotEmpty() } + if (columns.isEmpty()) { + log.warn("No columns defined for export") + return + } + + try { + when (config.export_format.lowercase()) { + "csv" -> exportAsCSV(rows, columns, context) + "json" -> exportAsJSON(rows, context) + "markdown" -> exportAsMarkdown(rows, columns, config, context) + else -> { + log.warn("Unknown export format: ${config.export_format}, defaulting to CSV") + exportAsCSV(rows, columns, context) + } + } + } catch (e: Exception) { + log.error("Failed to export data table", e) + } + } + + private fun exportAsCSV( + rows: List>, + columns: List, + context: PageProcessingStrategy.ProcessingContext + ) { + val csvFile = File(context.webSearchDir, "data_table.csv") + + csvFile.bufferedWriter(StandardCharsets.UTF_8).use { writer -> + // Write header + writer.write(columns.joinToString(",") { escapeCSV(it) }) + writer.newLine() + + // Write rows + rows.forEach { row -> + val values = columns.map { column -> + escapeCSV(row[column]?.toString() ?: "") + } + writer.write(values.joinToString(",")) + writer.newLine() + } + } + + log.info("Exported ${rows.size} rows to CSV: ${csvFile.absolutePath}") + } + + private fun exportAsJSON( + rows: List>, + context: PageProcessingStrategy.ProcessingContext + ) { + val jsonFile = File(context.webSearchDir, "data_table.json") + jsonFile.writeText(rows.toJson(), StandardCharsets.UTF_8) + log.info("Exported ${rows.size} rows to JSON: ${jsonFile.absolutePath}") + } + + private fun exportAsMarkdown( + rows: List>, + columns: List, + config: DataTableConfig, + context: PageProcessingStrategy.ProcessingContext 
+ ) { + val mdFile = File(context.webSearchDir, "data_table.md") + + mdFile.bufferedWriter(StandardCharsets.UTF_8).use { writer -> + writer.write("# Data Table Export\n\n") + writer.write("**Generated:** ${LocalDateTime.now()}\n\n") + writer.write("**Total Rows:** ${rows.size}\n\n") + writer.write("## Data\n\n") + writer.write(formatRowsAsMarkdownTable(rows, config)) + } + + log.info("Exported ${rows.size} rows to Markdown: ${mdFile.absolutePath}") + } + + private fun escapeCSV(value: String): String { + return if (value.contains(",") || value.contains("\"") || value.contains("\n")) { + "\"${value.replace("\"", "\"\"")}\"" + } else { + value + } + } + + override fun validateConfig(config: Any?): String? { + if (config == null) return "Data table config is required" + + return try { + val tableConfig = when (config) { + is DataTableConfig -> config + is String -> JsonUtil.fromJson(config, DataTableConfig::class.java) + else -> return "Invalid config type: ${config.javaClass.name}" + } + + if (tableConfig.column_names.isBlank()) { + return "column_names is required and cannot be blank" + } + + val columns = tableConfig.column_names.split(",").map { it.trim() }.filter { it.isNotEmpty() } + if (columns.isEmpty()) { + return "At least one column must be defined" + } + + if (tableConfig.min_rows < 0) { + return "min_rows must be non-negative" + } + + if (tableConfig.max_rows_per_page != null && tableConfig.max_rows_per_page <= 0) { + return "max_rows_per_page must be greater than 0" + } + + val validExportFormats = setOf("csv", "json", "markdown") + if (tableConfig.export_format.lowercase() !in validExportFormats) { + return "export_format must be one of: ${validExportFormats.joinToString(", ")}" + } + + null + } catch (e: Exception) { + "Config validation error: ${e.message}" + } + } +} \ No newline at end of file diff --git a/webui/src/main/kotlin/com/simiacryptus/cognotik/plan/tools/online/processing/DefaultSummarizerStrategy.kt b/webui/src/main/kotlin/com/simiacryptus/cognotik/plan/tools/online/processing/DefaultSummarizerStrategy.kt new file mode 100644 index 000000000..e87e927f2 --- /dev/null +++ b/webui/src/main/kotlin/com/simiacryptus/cognotik/plan/tools/online/processing/DefaultSummarizerStrategy.kt @@ -0,0 +1,201 @@ +package com.simiacryptus.cognotik.plan.tools.online.processing + +import com.simiacryptus.cognotik.agents.ChatAgent +import com.simiacryptus.cognotik.agents.ParsedAgent +import com.simiacryptus.cognotik.agents.ParsedResponse +import com.simiacryptus.cognotik.describe.TypeDescriber +import com.simiacryptus.cognotik.plan.TaskContextYamlDescriber +import com.simiacryptus.cognotik.plan.tools.online.CrawlerAgentTask +import com.simiacryptus.cognotik.util.toJson +import com.simiacryptus.cognotik.webui.session.getChildClient +import org.slf4j.LoggerFactory +import kotlin.math.min + +open class DefaultSummarizerStrategy : PageProcessingStrategy { + companion object { + private val log = LoggerFactory.getLogger(DefaultSummarizerStrategy::class.java) + } + + override val description: String // Describe both the strategy and its configuration + get() = "" + + open override fun processPage( + url: String, + content: String, + context: PageProcessingStrategy.ProcessingContext + ): PageProcessingStrategy.PageProcessingResult { + val analysisGoal = analysisGoal(context) + + val analysis = try { + transformContent(content, analysisGoal, context) + } catch (e: Exception) { + log.error("Error transforming content for URL: $url", e) + return PageProcessingStrategy.PageProcessingResult( + url 
= url, + pageType = CrawlerAgentTask.PageType.Error, + content = "Error analyzing content: ${e.message}", + extractedLinks = null, + metadata = mapOf("error" to (e.message ?: "Unknown error")) + ) + } + + return PageProcessingStrategy.PageProcessingResult( + url = url, + pageType = analysis.obj.page_type, + content = analysis.text, + extractedLinks = analysis.obj.link_data, + metadata = mapOf( + "tags" to (analysis.obj.tags ?: emptyList()) + ), + shouldTerminate = false + ) + } + + open fun analysisGoal(context: PageProcessingStrategy.ProcessingContext): String = when { + context.executionConfig.content_queries != null -> context.executionConfig.content_queries.toJson() + context.executionConfig.task_description?.isNotBlank() == true -> context.executionConfig.task_description!! + else -> "Analyze the content and provide insights." + } + + override fun shouldContinueCrawling( + currentResults: List, + context: PageProcessingStrategy.ProcessingContext + ): PageProcessingStrategy.ContinuationDecision { + // Never early-terminate, use existing page limit logic + return PageProcessingStrategy.ContinuationDecision( + shouldContinue = context.processedCount.get() < context.maxPages, + reason = "Processing up to max_pages limit" + ) + } + + override fun generateFinalOutput( + results: List, + context: PageProcessingStrategy.ProcessingContext + ): String { + val analysisResults = results.joinToString("\n") { it.content } + return createFinalSummary(analysisResults, context) + } + + private fun createFinalSummary(analysisResults: String, context: PageProcessingStrategy.ProcessingContext): String { + val maxFinalOutputSize = context.typeConfig.max_final_output_size ?: 15000 + + if (analysisResults.length < maxFinalOutputSize * 1.2) { + return analysisResults.substring(0, min(analysisResults.length, maxFinalOutputSize)) + + "\n\n---\n\n*Note: Some content has been truncated due to length limitations.*" + } + + val summary = ChatAgent( + prompt = listOf( + "Create a comprehensive summary of the following web search results and analyses.", + "Analysis goal: ${context.executionConfig.content_queries ?: context.executionConfig.task_description ?: "Provide key insights"}", + "For each source, extract the most important insights, facts, and conclusions.", + "Organize information by themes rather than by source when possible.", + "Use markdown formatting with headers, bullet points, and emphasis where appropriate.", + "Include the most important links that should be followed up on.", + "Keep your response under ${maxFinalOutputSize / 1000}K characters." + ).joinToString("\n\n"), + model = (context.typeConfig.model?.let { context.orchestrationConfig.instance(it) } + ?: context.orchestrationConfig.parsingChatter).getChildClient(context.task), + ).answer( + listOf("Here are summaries of each analyzed page:\n${analysisResults}"), + ) + + return summary + } + + override fun validateConfig(config: Any?): String? 
{ + // Default strategy doesn't require specific config validation + return null + } + + private fun transformContent( + content: String, + analysisGoal: String, + context: PageProcessingStrategy.ProcessingContext + ): ParsedResponse { + val describer = TaskContextYamlDescriber(context.orchestrationConfig) + val maxChunkSize = 50000 + if (content.length <= maxChunkSize) { + return pageParsedResponse(context, analysisGoal, content, describer) + } + val chunks = splitContentIntoChunks(content, maxChunkSize) + val chunkResults = chunks.mapIndexed { index, chunk -> + val chunkGoal = "$analysisGoal (Part ${index + 1}/${chunks.size})" + pageParsedResponse(context, chunkGoal, chunk, describer) + } + if (chunkResults.size == 1) { + return chunkResults[0] + } + val combinedAnalysis = chunkResults.joinToString("\n\n---\n\n") { it.text } + return pageParsedResponse(context, analysisGoal, combinedAnalysis, describer) + } + + private fun pageParsedResponse( + context: PageProcessingStrategy.ProcessingContext, + analysisGoal: String, + content: String, + describer: TypeDescriber + ): ParsedResponse { + return try { + val model = (context.typeConfig.model?.let { context.orchestrationConfig.instance(it) } + ?: context.orchestrationConfig.parsingChatter).getChildClient(context.task) + ParsedAgent( + prompt = listOf( + "Below are analyses of different parts of a web page related to this goal: $analysisGoal", + "Create a unified summary that combines the key insights from all parts.", + "Use markdown formatting for your response, with * characters for bullets.", + "Identify the most important links that should be followed up on according to the goal." + ).joinToString("\n\n"), + resultClass = CrawlerAgentTask.ParsedPage::class.java, + model = model, + describer = describer, + parsingChatter = model, + ).answer(listOf(content)) + } catch (e: Exception) { + log.error("Error during content transformation", e) + object : ParsedResponse( + clazz = CrawlerAgentTask.ParsedPage::class.java + ) { + override val obj: CrawlerAgentTask.ParsedPage + get() = CrawlerAgentTask.ParsedPage( + page_type = CrawlerAgentTask.PageType.Error, + page_information = "Error during analysis: ${e.message}" + ) + override val text: String + get() = "Error during analysis: ${e.message}" + } + } + } + + private fun splitContentIntoChunks(content: String, maxChunkSize: Int): List { + val chunks = mutableListOf() + var remainingContent = content + while (remainingContent.isNotEmpty()) { + val chunkSize = if (remainingContent.length <= maxChunkSize) { + remainingContent.length + } else { + findBreakPoint(remainingContent, maxChunkSize) + } + chunks.add(remainingContent.substring(0, chunkSize)) + remainingContent = remainingContent.substring(chunkSize) + } + return chunks + } + + private fun findBreakPoint(text: String, maxSize: Int): Int { + val paragraphBreakSearch = text.substring(0, minOf(maxSize, text.length)).lastIndexOf("\n\n") + if (paragraphBreakSearch > maxSize * 0.7) { + return paragraphBreakSearch + 2 + } + val newlineSearch = text.substring(0, minOf(maxSize, text.length)).lastIndexOf("\n") + if (newlineSearch > maxSize * 0.7) { + return newlineSearch + 1 + } + val sentenceSearch = text.substring(0, minOf(maxSize, text.length)).lastIndexOf(". 
") + if (sentenceSearch > maxSize * 0.7) { + return sentenceSearch + 2 + } + return minOf(maxSize, text.length) + } + +} \ No newline at end of file diff --git a/webui/src/main/kotlin/com/simiacryptus/cognotik/plan/tools/online/processing/FactCheckingStrategy.kt b/webui/src/main/kotlin/com/simiacryptus/cognotik/plan/tools/online/processing/FactCheckingStrategy.kt new file mode 100644 index 000000000..c60c75358 --- /dev/null +++ b/webui/src/main/kotlin/com/simiacryptus/cognotik/plan/tools/online/processing/FactCheckingStrategy.kt @@ -0,0 +1,288 @@ +package com.simiacryptus.cognotik.plan.tools.online.processing + +import com.simiacryptus.cognotik.agents.ParsedAgent +import com.simiacryptus.cognotik.agents.parserCast +import com.simiacryptus.cognotik.describe.Description +import com.simiacryptus.cognotik.plan.tools.online.CrawlerAgentTask +import com.simiacryptus.cognotik.util.jsonCast +import com.simiacryptus.cognotik.webui.session.getChildClient +import org.slf4j.LoggerFactory +import java.time.LocalDateTime +import java.util.concurrent.ConcurrentHashMap +import java.util.regex.Pattern + +class FactCheckingStrategy : PageProcessingStrategy { + + data class FactCheckingConfig( + @Description("Claims to verify") + val claims_to_verify: List, + @Description("Required confidence level (0.0-1.0)") + val confidence_threshold: Double = 0.8, + @Description("Stop after finding N supporting sources") + val required_sources: Int = 3, + @Description("Stop after finding N contradicting sources") + val contradiction_threshold: Int = 2 + ) + + override val description: String // Describe both the strategy and its configuration + get() = """Fact-Checking Strategy: Verifies specified claims against web page content. + |Configuration options include claims to verify, confidence thresholds, and source requirements.""".trimMargin() + + data class FactCheckResult( + val claim: String, + val verdict: FactVerdict, + val confidence: Double, + val supporting_evidence: List, + val contradicting_evidence: List, + val neutral_evidence: List + ) + + enum class FactVerdict { + SUPPORTED, + CONTRADICTED, + INSUFFICIENT_EVIDENCE, + MIXED + } + + data class Evidence( + val source_url: String, + val excerpt: String, + val relevance_score: Double, + val credibility_score: Double + ) + + private val verificationResults = ConcurrentHashMap>() + + companion object { + private val log = LoggerFactory.getLogger(FactCheckingStrategy::class.java) + } + + override fun processPage( + url: String, + content: String, + context: PageProcessingStrategy.ProcessingContext + ): PageProcessingStrategy.PageProcessingResult { + val config = context.executionConfig.content_queries?.parserCast(context.orchestrationConfig.parsingChatter) + ?: return PageProcessingStrategy.PageProcessingResult( + url = url, + pageType = CrawlerAgentTask.PageType.Error, + content = "Missing FactCheckingConfig", + extractedLinks = null, + metadata = emptyMap(), + shouldTerminate = true, + terminationReason = "Configuration error" + ) + + // Analyze page for each claim + val pageResults = config.claims_to_verify.map { claim -> + analyzeClaimEvidence(claim, content, url, context) + } + + // Update global verification state + pageResults.forEach { result -> + verificationResults.getOrPut(result.claim) { mutableListOf() }.add(result) + } + + // Check if we have enough evidence to terminate + val shouldTerminate = checkTerminationConditions(config) + + return PageProcessingStrategy.PageProcessingResult( + url = url, + pageType = CrawlerAgentTask.PageType.OK, + content = 
formatFactCheckResults(pageResults), + extractedLinks = extractRelevantLinks(content, config.claims_to_verify), + metadata = mapOf( + "fact_check_results" to pageResults, + "claims_analyzed" to config.claims_to_verify.size + ), + shouldTerminate = shouldTerminate, + terminationReason = if (shouldTerminate) "Sufficient evidence gathered" else null + ) + } + + private fun formatFactCheckResults(results: List): String { + return buildString { + results.forEach { result -> + appendLine("### Claim: ${result.claim}") + appendLine("**Verdict:** ${result.verdict.name}") + appendLine("**Confidence:** ${result.confidence}") + appendLine() + if (result.supporting_evidence.isNotEmpty()) { + appendLine("**Supporting Evidence:**") + result.supporting_evidence.forEach { evidence -> + appendLine("- ${evidence.excerpt} (relevance: ${evidence.relevance_score})") + } + appendLine() + } + if (result.contradicting_evidence.isNotEmpty()) { + appendLine("**Contradicting Evidence:**") + result.contradicting_evidence.forEach { evidence -> + appendLine("- ${evidence.excerpt} (relevance: ${evidence.relevance_score})") + } + appendLine() + } + } + } + } + + private fun extractRelevantLinks(content: String, claims: List): List { + val linkPattern = Pattern.compile("""\[([^]]+)]\(([^)]+)\)""") + val matcher = linkPattern.matcher(content) + val links = mutableListOf() + while (matcher.find()) { + val linkText = matcher.group(1) + val linkUrl = matcher.group(2) + // Check if link text is relevant to any claim + val isRelevant = claims.any { claim -> + linkText.contains(claim, ignoreCase = true) || + claim.split(" ").any { word -> linkText.contains(word, ignoreCase = true) } + } + if (isRelevant) { + links.add( + CrawlerAgentTask.LinkData( + url = linkUrl, + title = linkText, + relevance_score = 80.0 + ) + ) + } + } + return links + } + + private fun analyzeClaimEvidence( + claim: String, + content: String, + url: String, + context: PageProcessingStrategy.ProcessingContext + ): FactCheckResult { + val prompt = """ + Analyze the following content for evidence related to this claim: + + CLAIM: $claim + + Determine if the content: + 1. Supports the claim + 2. Contradicts the claim + 3. Is neutral/irrelevant + + Extract specific excerpts that serve as evidence. + Rate the relevance (0.0-1.0) and credibility (0.0-1.0) of the source. 
+ """.trimIndent() + + val analysis = ParsedAgent( + prompt = prompt, + resultClass = FactCheckResult::class.java, + model = context.orchestrationConfig.parsingChatter.getChildClient(context.task), + parsingChatter = context.orchestrationConfig.parsingChatter.getChildClient(context.task) + ).answer(listOf(content)) + + return analysis.obj.copy( + supporting_evidence = analysis.obj.supporting_evidence.map { + it.copy(source_url = url) + }, + contradicting_evidence = analysis.obj.contradicting_evidence.map { + it.copy(source_url = url) + } + ) + } + + private fun checkTerminationConditions(config: FactCheckingConfig): Boolean { + return config.claims_to_verify.all { claim -> + val results = verificationResults[claim] ?: return@all false + + val supportCount = results.count { it.verdict == FactVerdict.SUPPORTED } + val contradictCount = results.count { it.verdict == FactVerdict.CONTRADICTED } + + // Terminate if we have enough supporting OR contradicting evidence + supportCount >= config.required_sources || + contradictCount >= config.contradiction_threshold + } + } + + override fun shouldContinueCrawling( + currentResults: List, + context: PageProcessingStrategy.ProcessingContext + ): PageProcessingStrategy.ContinuationDecision { + val anyTermination = currentResults.any { it.shouldTerminate } + + return PageProcessingStrategy.ContinuationDecision( + shouldContinue = !anyTermination && context.processedCount.get() < context.maxPages, + reason = if (anyTermination) { + currentResults.first { it.shouldTerminate }.terminationReason ?: "Early termination" + } else { + "Continue gathering evidence" + } + ) + } + + override fun generateFinalOutput( + results: List, + context: PageProcessingStrategy.ProcessingContext + ): String { + val config = context.executionConfig.content_queries?.jsonCast() + ?: return "Error: Missing FactCheckingConfig" + + return buildString { + appendLine("# Fact-Checking Report") + appendLine() + appendLine("**Generated:** ${LocalDateTime.now()}") + appendLine("**Pages Analyzed:** ${results.size}") + appendLine() + + config.claims_to_verify.forEach { claim -> + appendLine("## Claim: $claim") + appendLine() + + val claimResults = verificationResults[claim] ?: emptyList() + val verdict = determineOverallVerdict(claimResults, config.confidence_threshold) + + appendLine("**Verdict:** ${verdict.name}") + appendLine() + + appendLine("### Supporting Evidence (${claimResults.flatMap { it.supporting_evidence }.size})") + claimResults.flatMap { it.supporting_evidence }.forEach { evidence -> + appendLine("- [${evidence.source_url}](${evidence.source_url})") + appendLine(" - Relevance: ${evidence.relevance_score}") + appendLine(" - Credibility: ${evidence.credibility_score}") + appendLine(" - Excerpt: \"${evidence.excerpt}\"") + appendLine() + } + + appendLine("### Contradicting Evidence (${claimResults.flatMap { it.contradicting_evidence }.size})") + claimResults.flatMap { it.contradicting_evidence }.forEach { evidence -> + appendLine("- [${evidence.source_url}](${evidence.source_url})") + appendLine(" - Relevance: ${evidence.relevance_score}") + appendLine(" - Credibility: ${evidence.credibility_score}") + appendLine(" - Excerpt: \"${evidence.excerpt}\"") + appendLine() + } + + appendLine("---") + appendLine() + } + } + } + + private fun determineOverallVerdict( + results: List, + threshold: Double + ): FactVerdict { + if (results.isEmpty()) return FactVerdict.INSUFFICIENT_EVIDENCE + + val avgConfidence = results.map { it.confidence }.average() + val supportCount = results.count { 
it.verdict == FactVerdict.SUPPORTED } + val contradictCount = results.count { it.verdict == FactVerdict.CONTRADICTED } + + return when { + avgConfidence < threshold -> FactVerdict.INSUFFICIENT_EVIDENCE + supportCount > contradictCount * 2 -> FactVerdict.SUPPORTED + contradictCount > supportCount * 2 -> FactVerdict.CONTRADICTED + else -> FactVerdict.MIXED + } + } + + override fun validateConfig(config: Any?): String? { + return null + } +} \ No newline at end of file diff --git a/webui/src/main/kotlin/com/simiacryptus/cognotik/plan/tools/online/processing/JobMatchingStrategy.kt b/webui/src/main/kotlin/com/simiacryptus/cognotik/plan/tools/online/processing/JobMatchingStrategy.kt new file mode 100644 index 000000000..15c20bb7e --- /dev/null +++ b/webui/src/main/kotlin/com/simiacryptus/cognotik/plan/tools/online/processing/JobMatchingStrategy.kt @@ -0,0 +1,734 @@ +package com.simiacryptus.cognotik.plan.tools.online.processing + +import com.simiacryptus.cognotik.agents.CodeAgent.Companion.indent +import com.simiacryptus.cognotik.agents.ParsedAgent +import com.simiacryptus.cognotik.agents.parserCast +import com.simiacryptus.cognotik.chat.model.ChatInterface +import com.simiacryptus.cognotik.describe.Description +import com.simiacryptus.cognotik.plan.tools.online.CrawlerAgentTask +import com.simiacryptus.cognotik.util.toJson +import com.simiacryptus.cognotik.webui.session.getChildClient +import org.slf4j.LoggerFactory +import java.io.File +import java.io.IOException +import java.nio.charset.StandardCharsets +import java.time.LocalDateTime +import java.time.format.DateTimeFormatter +import java.util.concurrent.ConcurrentHashMap + +class JobMatchingStrategy : DefaultSummarizerStrategy() { + + data class JobMatchingConfig( + @Description("User's resume/experience summary") + val user_experience: String = "", + @Description("Desired job titles or keywords") + val target_roles: List = listOf(), + @Description("Required skills to match") + val required_skills: List? = null, + @Description("Preferred locations (cities, states, countries, or 'Remote')") + val preferred_locations: List? = null, + @Description("Acceptable locations (if different from preferred)") + val acceptable_locations: List? = null, + @Description("Excluded locations") + val excluded_locations: List? = null, + @Description("Minimum match score (0.0-1.0)") + val min_match_score: Double = 0.6, + @Description("(Optional) Stop after finding N good matches") + val target_matches: Int? = null, + @Description("Automatically adjust match threshold based on results") + val adaptive_threshold: Boolean = false, + @Description("Preferred industries") + val preferred_industries: List? = null, + @Description("Excluded companies") + val excluded_companies: List? = null, + @Description("Minimum acceptable salary (annual)") + val min_salary: Int? = null, + @Description("Target salary (annual)") + val target_salary: Int? = null, + @Description("Maximum salary expectation (annual)") + val max_salary: Int? = null, + @Description("Currency for salary (e.g., USD, EUR, GBP)") + val salary_currency: String = "USD", + @Description("Work arrangement preference: 'remote', 'hybrid', 'onsite', or 'flexible'") + val work_arrangement_preference: String? = null, + @Description("Maximum acceptable days in office per week (for hybrid roles)") + val max_days_in_office: Int? = null, + @Description("Willing to travel (percentage or 'none', 'occasional', 'frequent')") + val travel_willingness: String? 
= null, + @Description("Maximum acceptable travel percentage (0-100)") + val max_travel_percentage: Int? = null, + @Description("Willing to relocate") + val willing_to_relocate: Boolean = false, + @Description("Relocation assistance required") + val requires_relocation_assistance: Boolean = false + ) + + + override val description: String + get() = "Analyzes job postings against user experience to find strong matches, generates application materials, and saves detailed reports." + + data class JobAnalysis( + @Description("Job title/position name") + val job_title: String? = null, + @Description("Company/organization name") + val company: String = "", + @Description("Primary job location (city, state, country)") + val location: String? = null, + @Description("Additional locations or service areas") + val additional_locations: List? = null, + @Description("Work arrangement: 'remote', 'hybrid', 'onsite'") + val work_arrangement: String? = null, + @Description("Days in office per week (for hybrid)") + val days_in_office: Int? = null, + @Description("Travel requirements description") + val travel_requirements: String? = null, + @Description("Travel percentage (0-100)") + val travel_percentage: Int? = null, + @Description("Relocation offered") + val relocation_offered: Boolean? = null, + @Description("Relocation assistance details") + val relocation_assistance: String? = null, + @Description("URL where the candidate can apply for the position") + val application_url: String = "", + @Description("URL of the original job description page") + val job_description_url: String = "", + @Description("Full text of the job description") + val job_description: String = "", + @Description("Minimum salary offered (if disclosed)") + val salary_min: Int? = null, + @Description("Maximum salary offered (if disclosed)") + val salary_max: Int? = null, + @Description("Salary currency") + val salary_currency: String? = null, + @Description("Salary period: 'annual', 'hourly', 'monthly'") + val salary_period: String? = null, + @Description("Additional compensation details (bonus, equity, etc.)") + val compensation_details: String? 
= null, + @Description("List of skills explicitly required for the position") + val required_skills: List = listOf(), + @Description("List of skills that are preferred but not required") + val preferred_skills: List = listOf(), + @Description("Overall match score between candidate and position (0.0-1.0)") + val match_score: Double = 0.0, + @Description("Location compatibility score (0.0-1.0)") + val location_score: Double = 0.0, + @Description("Salary compatibility score (0.0-1.0)") + val salary_score: Double = 0.0, + @Description("Work arrangement compatibility score (0.0-1.0)") + val work_arrangement_score: Double = 0.0, + @Description("Detailed analysis of how well the candidate matches the position") + val match_analysis: String = "", + @Description("Analysis of location and work arrangement fit") + val location_analysis: String = "", + @Description("Analysis of compensation fit") + val compensation_analysis: String = "", + @Description("Skills the candidate lacks that are required or preferred") + val skill_gaps: List = listOf(), + @Description("Skills the candidate has that match the job requirements") + val skill_matches: List = listOf(), + @Description("Draft cover letter tailored to this specific position") + val cover_letter: String = "", + @Description("Strategic notes and recommendations for the application") + val application_notes: String = "" + ) + + private val goodMatches = ConcurrentHashMap() + + companion object { + private val log = LoggerFactory.getLogger(JobMatchingStrategy::class.java) + } + + override fun processPage( + url: String, + content: String, + context: PageProcessingStrategy.ProcessingContext + ) = try { + log.debug("Processing page: $url") + val chatInterface = context.orchestrationConfig.parsingChatter.getChildClient(context.task) + val config = context.executionConfig.content_queries?.parserCast(chatInterface) + ?: run { + val errorMsg = "Missing JobMatchingConfig for job matching strategy" + log.error(errorMsg) + writeToTranscript(context, "**ERROR:** $errorMsg\n") + throw IllegalArgumentException(errorMsg) + } + + if (detectJobPosting(content, chatInterface)) { + log.info("Job posting detected at: $url") + processJD(url, content, config, context, chatInterface).let { result -> + val standardProcessing = super.processPage(url, content, context) + result.copy( + extractedLinks = standardProcessing.extractedLinks + ) + } + } else { + log.debug("Page is not a job posting, using default processing: $url") + super.processPage(url, content, context) + } + } catch (e: Exception) { + val errorMsg = "Error processing page for URL: $url - ${e.message}" + log.error(errorMsg, e) + context.task.error(e) + writeToTranscript(context, "\n**ERROR:** $errorMsg\n```text\n${e.stackTraceToString().indent(" ")}\n```\n\n") + super.processPage(url, content, context) + } + + private fun writeToTranscript(context: PageProcessingStrategy.ProcessingContext, message: String) { + context.transcriptStream?.let { stream -> + try { + stream.write(message.toByteArray(StandardCharsets.UTF_8)) + stream.flush() + } catch (e: IOException) { + log.warn("Failed to write to transcript stream", e) + } + } + } + + private fun processJD( + url: String, + content: String, + config: JobMatchingConfig, + context: PageProcessingStrategy.ProcessingContext, + chatInterface: ChatInterface + ): PageProcessingStrategy.PageProcessingResult { + log.debug("Processing job description for URL: $url") + + // Log job detection to transcript + val timestamp = 
LocalDateTime.now().format(DateTimeFormatter.ofPattern("HH:mm:ss")) + writeToTranscript(context, "\n#### Job Posting Detected at $timestamp\n**URL:** [$url]($url)\n\n") + + // Extract and analyze job details + val jobAnalysis = try { + analyzeJobMatch(url, content, config, context, chatInterface) + } catch (e: Exception) { + val errorMsg = "Failed to analyze job match for URL: $url - ${e.message}" + log.error(errorMsg, e) + context.task.error(e) + writeToTranscript(context, "\n**ERROR:** $errorMsg\n```\n${e.stackTraceToString().indent(" ")}\n```\n\n") + throw e + } + + // If it's a good match, save detailed report + val shouldTerminate = if (jobAnalysis.match_score >= config.min_match_score) { + goodMatches[url] = jobAnalysis + try { + saveJobReport(jobAnalysis, context) + } catch (e: Exception) { + val errorMsg = "Failed to save job report for ${jobAnalysis.company} - ${jobAnalysis.job_title}: ${e.message}" + log.error(errorMsg, e) + context.task.error(e) + writeToTranscript(context, "\n**ERROR:** \n```text\n${errorMsg.indent(" ")}\n```\n") + // Don't throw - we still want to continue processing + } + + // Log good match to transcript + writeToTranscript(context, buildString { + + appendLine("
    ") + appendLine("**✅ GOOD MATCH FOUND** (Score: ${(jobAnalysis.match_score * 100).toInt()}%)") + appendLine("\n\n```json\n${jobAnalysis.toJson()}\n```\n\n") + appendLine("
    \n") + appendLine("- **URL:** [$url]($url)\n\n") + appendLine("- **Position:** ${jobAnalysis.job_title}") + appendLine("- **Company:** ${jobAnalysis.company}") + appendLine("- **Location:** ${jobAnalysis.location ?: "Not specified"}") + appendLine("- **Total Matches Found:** ${goodMatches.size}/${config.target_matches}\n") + }) + + log.info("Good match found: ${jobAnalysis.company} - ${jobAnalysis.job_title} (Score: ${jobAnalysis.match_score})") + + // Check if we've found enough good matches + config.target_matches != null && config.target_matches > 0 && goodMatches.size >= config.target_matches + } else { + // Log weak match to transcript + writeToTranscript(context, buildString { + appendLine("
    ") + appendLine("**⚠️ Weak Match** (Score: ${(jobAnalysis.match_score * 100).toInt()}%)") + appendLine("\n\n```json\n${jobAnalysis.toJson()}\n```\n\n") + appendLine("
    \n") + appendLine("- **URL:** [$url]($url)\n\n") + appendLine("- **Position:** ${jobAnalysis.job_title}") + appendLine("- **Company:** ${jobAnalysis.company}\n") + }) + + log.debug("Weak match: ${jobAnalysis.company} - ${jobAnalysis.job_title} (Score: ${jobAnalysis.match_score})") + false + } + + return PageProcessingStrategy.PageProcessingResult( + url = url, + pageType = CrawlerAgentTask.PageType.OK, + content = formatJobAnalysis(jobAnalysis), + metadata = mapOf( + "job_analysis" to jobAnalysis, + "match_score" to jobAnalysis.match_score, + "is_good_match" to (jobAnalysis.match_score >= config.min_match_score) + ), + shouldTerminate = shouldTerminate, + terminationReason = if (shouldTerminate) { + "Found ${config.target_matches} good job matches" + } else null + ) + } + + data class JobDetection( + val is_job_posting: Boolean = false, + val confidence: Double = 0.0, + val detected_title: String? = null, + ) + + private fun detectJobPosting( + content: String, + chatInterface: ChatInterface + ): Boolean { + log.debug("Detecting if content is a job posting") + + val prompt = """ + Analyze if this content is a job posting/description. + + Look for: + - Job title + - Company name + - Job responsibilities + - Required qualifications + - Application instructions + """.trimIndent() + val detection = try { + ParsedAgent( + prompt = prompt, + resultClass = JobDetection::class.java, + model = chatInterface, + parsingChatter = chatInterface + ).getParser().apply(content.take(5000)) // Only analyze first 5K chars for detection, using 1 pass (faster) + } catch (e: Exception) { + log.error("Failed to detect job posting", e) + throw e + } + + log.debug("Job posting detection result: is_job=${detection.is_job_posting}, confidence=${detection.confidence}") + return detection.is_job_posting && detection.confidence > 0.7 + } + + private fun analyzeJobMatch( + url: String, + content: String, + config: JobMatchingConfig, + context: PageProcessingStrategy.ProcessingContext, + chatInterface: ChatInterface + ): JobAnalysis { + log.debug("Analyzing job match for URL: $url") + + // Log analysis start to transcript + writeToTranscript(context, "**Analyzing job match...**\n") + + // Build enriched context from messages + val additionalContext = if (context.messages.isNotEmpty()) { + buildString { + appendLine() + appendLine("ADDITIONAL CONTEXT FROM USER:") + context.messages.forEach { message -> + appendLine(message) + appendLine() + } + } + } else "" + val locationContext = buildString { + appendLine() + appendLine("LOCATION PREFERENCES:") + config.preferred_locations?.let { appendLine("Preferred: ${it.joinToString(", ")}") } + config.acceptable_locations?.let { appendLine("Acceptable: ${it.joinToString(", ")}") } + config.excluded_locations?.let { appendLine("Excluded: ${it.joinToString(", ")}") } + appendLine("Willing to relocate: ${config.willing_to_relocate}") + if (config.requires_relocation_assistance) appendLine("Requires relocation assistance") + } + val compensationContext = buildString { + appendLine() + appendLine("COMPENSATION EXPECTATIONS:") + config.min_salary?.let { appendLine("Minimum: ${config.salary_currency} $it/year") } + config.target_salary?.let { appendLine("Target: ${config.salary_currency} $it/year") } + config.max_salary?.let { appendLine("Maximum: ${config.salary_currency} $it/year") } + } + val workArrangementContext = buildString { + appendLine() + appendLine("WORK ARRANGEMENT PREFERENCES:") + config.work_arrangement_preference?.let { appendLine("Preference: $it") } + 
config.max_days_in_office?.let { appendLine("Max days in office: $it/week") } + config.travel_willingness?.let { appendLine("Travel willingness: $it") } + config.max_travel_percentage?.let { appendLine("Max travel: $it%") } + } + + + val prompt = """ + Analyze this job posting and compare it to the candidate's experience. + + CANDIDATE EXPERIENCE: + ${config.user_experience} + + TARGET ROLES: ${config.target_roles.joinToString(", ")} + REQUIRED SKILLS: ${config.required_skills?.joinToString(", ") ?: "Not specified"} + ${locationContext} + ${compensationContext} + ${workArrangementContext} + ${additionalContext} + + Extract: + 1. Job title, company, location + 2. Work arrangement (remote/hybrid/onsite), days in office if hybrid + 3. Travel requirements and percentage + 4. Relocation information + 5. Salary range and compensation details (if disclosed) + 6. Application URL + 7. Required and preferred skills + 8. Overall match score (0.0-1.0) based on experience alignment + 9. Location compatibility score (0.0-1.0) considering preferences and work arrangement + 10. Salary compatibility score (0.0-1.0) if salary disclosed + 11. Work arrangement compatibility score (0.0-1.0) + 12. Detailed match analysis + 13. Location and work arrangement analysis + 14. Compensation analysis (if salary disclosed) + 15. Skill gaps and matches + 16. Draft a compelling cover letter (200-300 words) that incorporates the additional context and highlights relevant experience + 17. Application strategy notes + + When drafting the cover letter, pay special attention to any specific requirements, preferences, or context + provided in the additional context section. Tailor the letter to address these points directly. + IMPORTANT: When calculating scores, consider: + - Location score: Match against preferred/acceptable locations, work arrangement fit, relocation needs + - Salary score: Only calculate if salary is disclosed; compare against min/target/max expectations + - Work arrangement score: Match remote/hybrid/onsite preference, travel requirements, days in office + - Overall match score: Weight skills heavily, but factor in location, salary, and work arrangement + """.trimIndent() + + val analysis = try { + ParsedAgent( + prompt = prompt, + resultClass = JobAnalysis::class.java, + model = chatInterface, + parsingChatter = chatInterface + ).answer(listOf(content)) + } catch (e: Exception) { + log.error("Failed to analyze job match", e) + throw e + } + + log.debug("Job analysis completed with match score: ${analysis.obj.match_score}") + return analysis.obj.copy( + job_description_url = url, + ) + } + + private fun saveJobReport( + jobAnalysis: JobAnalysis, + context: PageProcessingStrategy.ProcessingContext + ) { + log.debug("Saving job report for: ${jobAnalysis.company} - ${jobAnalysis.job_title}") + + // Log report save to transcript + writeToTranscript(context, "**Saving detailed job report...**\n") + + val timestamp = LocalDateTime.now().format(DateTimeFormatter.ofPattern("yyyyMMdd_HHmmss")) + val companySafe = jobAnalysis.company.replace(Regex("[^a-zA-Z0-9]"), "_").take(30) + val titleSafe = jobAnalysis.job_title?.replace(Regex("[^a-zA-Z0-9]"), "_")?.take(30) + + val reportDir = File(context.webSearchDir, "job_matches") + + try { + if (!reportDir.exists() && !reportDir.mkdirs()) { + throw IOException("Failed to create report directory: ${reportDir.absolutePath}") + } + } catch (e: Exception) { + log.error("Failed to create report directory", e) + context.task.error(e) + throw e + } + + val reportFile = 
File(reportDir, "${companySafe}_${titleSafe}_${timestamp}.md") + + val report = buildString { + appendLine("# Job Application Report") + appendLine() + appendLine("**Generated:** ${LocalDateTime.now()}") + appendLine("**Match Score:** ${jobAnalysis.match_score}") + appendLine("**Job URL:** ${jobAnalysis.application_url}") + appendLine() + + appendLine("## Position Details") + appendLine("- **Title:** ${jobAnalysis.job_title}") + appendLine("- **Company:** ${jobAnalysis.company}") + appendLine("- **Location:** ${jobAnalysis.location ?: "Not specified"}") + jobAnalysis.additional_locations?.let { + if (it.isNotEmpty()) { + appendLine("- **Additional Locations:** ${it.joinToString(", ")}") + } + } + appendLine("- **Work Arrangement:** ${jobAnalysis.work_arrangement ?: "Not specified"}") + jobAnalysis.days_in_office?.let { + appendLine("- **Days in Office:** $it/week") + } + jobAnalysis.travel_percentage?.let { + appendLine("- **Travel Required:** $it%") + } + jobAnalysis.travel_requirements?.let { + appendLine("- **Travel Details:** $it") + } + appendLine("- **Application URL:** [Apply Here](${jobAnalysis.application_url})") + appendLine() + appendLine("## Compensation") + if (jobAnalysis.salary_min != null || jobAnalysis.salary_max != null) { + val salaryRange = buildString { + jobAnalysis.salary_min?.let { append("${jobAnalysis.salary_currency ?: "USD"} $it") } + if (jobAnalysis.salary_min != null && jobAnalysis.salary_max != null) append(" - ") + jobAnalysis.salary_max?.let { append("${jobAnalysis.salary_currency ?: "USD"} $it") } + jobAnalysis.salary_period?.let { append(" ($it)") } + } + appendLine("- **Salary Range:** $salaryRange") + } else { + appendLine("- **Salary Range:** Not disclosed") + } + jobAnalysis.compensation_details?.let { + appendLine("- **Additional Compensation:** $it") + } + if (jobAnalysis.relocation_offered == true) { + appendLine("- **Relocation:** Offered") + jobAnalysis.relocation_assistance?.let { + appendLine(" - $it") + } + } + appendLine() + + appendLine("## Match Analysis") + appendLine("### Overall Match Score: ${(jobAnalysis.match_score * 100).toInt()}%") + appendLine(jobAnalysis.match_analysis) + appendLine() + appendLine("### Location Compatibility: ${(jobAnalysis.location_score * 100).toInt()}%") + appendLine(jobAnalysis.location_analysis) + appendLine() + if (jobAnalysis.salary_min != null || jobAnalysis.salary_max != null) { + appendLine("### Salary Compatibility: ${(jobAnalysis.salary_score * 100).toInt()}%") + appendLine(jobAnalysis.compensation_analysis) + appendLine() + } + appendLine("### Work Arrangement Fit: ${(jobAnalysis.work_arrangement_score * 100).toInt()}%") + appendLine() + + appendLine("## Skills Assessment") + appendLine("### Matching Skills (${jobAnalysis.skill_matches.size})") + jobAnalysis.skill_matches.forEach { skill -> + appendLine("- ✅ $skill") + } + appendLine() + + appendLine("### Skill Gaps (${jobAnalysis.skill_gaps.size})") + jobAnalysis.skill_gaps.forEach { skill -> + appendLine("- ⚠️ $skill") + } + appendLine() + + appendLine("## Cover Letter Draft") + appendLine() + appendLine(jobAnalysis.cover_letter) + appendLine() + + appendLine("## Application Strategy") + appendLine(jobAnalysis.application_notes) + appendLine() + + appendLine("---") + appendLine() + appendLine("## Job Description") + appendLine() + appendLine(jobAnalysis.job_description) + appendLine("---") + appendLine() + + appendLine("
    ") + appendLine("Job Analysis Data (JSON)") + appendLine() + appendLine("```json") + appendLine(jobAnalysis.toJson()) + appendLine("```") + appendLine() + appendLine("
    ") + } + + try { + reportFile.writeText(report) + log.info("Saved job report: ${reportFile.absolutePath}") + } catch (e: IOException) { + val errorMsg = "Failed to write job report to file: ${reportFile.absolutePath}" + log.error(errorMsg, e) + context.task.error(e) + throw IOException(errorMsg, e) + } + + // Log report location to transcript + } + + private fun formatJobAnalysis(jobAnalysis: JobAnalysis): String { + return buildString { + appendLine("### ${jobAnalysis.job_title} at ${jobAnalysis.company}") + appendLine() + appendLine("**Match Score:** ${(jobAnalysis.match_score * 100).toInt()}%") + appendLine("**Location:** ${jobAnalysis.location ?: "Not specified"}") + jobAnalysis.work_arrangement?.let { + appendLine("**Work Arrangement:** $it") + } + if (jobAnalysis.salary_min != null || jobAnalysis.salary_max != null) { + val salaryRange = buildString { + jobAnalysis.salary_min?.let { append("${jobAnalysis.salary_currency ?: "USD"} $it") } + if (jobAnalysis.salary_min != null && jobAnalysis.salary_max != null) append(" - ") + jobAnalysis.salary_max?.let { append("${jobAnalysis.salary_currency ?: "USD"} $it") } + } + appendLine("**Salary:** $salaryRange") + } + appendLine("**Application:** [Apply Here](${jobAnalysis.application_url})") + appendLine() + + if (jobAnalysis.match_score >= 0.6) { + appendLine("✅ **Good Match** - Detailed report saved") + } else { + appendLine("⚠️ **Weak Match** - Consider other opportunities") + } + appendLine() + appendLine("**Compatibility Scores:**") + appendLine("- Skills: ${(jobAnalysis.match_score * 100).toInt()}%") + appendLine("- Location: ${(jobAnalysis.location_score * 100).toInt()}%") + if (jobAnalysis.salary_min != null || jobAnalysis.salary_max != null) { + appendLine("- Salary: ${(jobAnalysis.salary_score * 100).toInt()}%") + } + appendLine("- Work Arrangement: ${(jobAnalysis.work_arrangement_score * 100).toInt()}%") + appendLine() + + appendLine("
    ") + appendLine("Match Analysis") + appendLine() + appendLine(jobAnalysis.match_analysis) + appendLine() + appendLine("
    ") + appendLine() + } + } + + override fun analysisGoal(context: PageProcessingStrategy.ProcessingContext): String = when { + context.executionConfig.content_queries != null -> context.executionConfig.content_queries.toJson() + context.executionConfig.task_description?.isNotBlank() == true -> context.executionConfig.task_description!! + else -> "Analyze the content and provide insights." + } + " - Identify pages that contain or are likely to lead to job postings matching the user's experience and target roles." + + override fun shouldContinueCrawling( + currentResults: List, + context: PageProcessingStrategy.ProcessingContext + ): PageProcessingStrategy.ContinuationDecision { + val anyTermination = currentResults.any { it.shouldTerminate } + val reason = if (anyTermination) { + currentResults.first { it.shouldTerminate }.terminationReason ?: "Target matches found" + } else { + "Continue searching for job matches (${goodMatches.size} found so far)" + } + val shouldContinue = !anyTermination && context.processedCount.get() < context.maxPages + return PageProcessingStrategy.ContinuationDecision(shouldContinue, reason) + } + + override fun generateFinalOutput( + results: List, + context: PageProcessingStrategy.ProcessingContext + ): String { + // Log final summary generation to transcript + context.transcriptStream?.let { stream -> + try { + stream.write("\n\n## Job Search Final Summary\n\n".toByteArray(StandardCharsets.UTF_8)) + stream.write("**Total Pages Analyzed:** ${results.size}\n".toByteArray(StandardCharsets.UTF_8)) + stream.write("**Job Postings Found:** ${results.count { it.metadata["is_job_posting"] == true }}\n".toByteArray(StandardCharsets.UTF_8)) + stream.write("**Good Matches:** ${goodMatches.size}\n\n".toByteArray(StandardCharsets.UTF_8)) + stream.flush() + } catch (e: Exception) { + log.debug("Failed to write final summary to transcript", e) + } + } + + return buildString { + appendLine("# Job Search Results") + appendLine() + appendLine("**Search Completed:** ${LocalDateTime.now()}") + appendLine("**Pages Analyzed:** ${results.size}") + appendLine("**Job Postings Found:** ${results.count { it.metadata["is_job_posting"] == true }}") + appendLine("**Good Matches:** ${goodMatches.size}") + appendLine() + + if (goodMatches.isEmpty()) { + appendLine("⚠️ No strong matches found. 
Consider:") + appendLine("- Broadening search criteria") + appendLine("- Adjusting required skills") + appendLine("- Expanding target roles") + appendLine("- Relaxing location or work arrangement preferences") + appendLine("- Adjusting salary expectations") + appendLine() + return@buildString + } + + appendLine("## Top Matches") + appendLine() + + goodMatches.values + .sortedByDescending { it.match_score } + .forEach { job -> + appendLine("### ${job.job_title} at ${job.company}") + appendLine() + appendLine("**Match Score:** ${(job.match_score * 100).toInt()}%") + appendLine("**Location:** ${job.location ?: "Not specified"}") + job.work_arrangement?.let { + appendLine("**Work Arrangement:** $it") + } + if (job.salary_min != null || job.salary_max != null) { + val salaryRange = buildString { + job.salary_min?.let { append("${job.salary_currency ?: "USD"} $it") } + if (job.salary_min != null && job.salary_max != null) append(" - ") + job.salary_max?.let { append("${job.salary_currency ?: "USD"} $it") } + } + appendLine("**Salary:** $salaryRange") + } + appendLine("**Application:** [Apply Here](${job.application_url})") + appendLine() + appendLine("**Compatibility:**") + appendLine("- Skills: ${(job.match_score * 100).toInt()}%") + appendLine("- Location: ${(job.location_score * 100).toInt()}%") + if (job.salary_min != null || job.salary_max != null) { + appendLine("- Salary: ${(job.salary_score * 100).toInt()}%") + } + appendLine("- Work Arrangement: ${(job.work_arrangement_score * 100).toInt()}%") + appendLine() + appendLine("**Skills Match:** ${job.skill_matches.size}/${job.skill_matches.size + job.skill_gaps.size}") + appendLine() + + appendLine() + appendLine(job.match_analysis.take(300) + "...") + appendLine() + appendLine("---") + appendLine() + } + + appendLine("## Next Steps") + appendLine() + appendLine("1. Review detailed reports in the `job_matches` directory") + appendLine("2. Customize cover letters for each application") + appendLine("3. Prepare for interviews by reviewing skill gaps") + appendLine("4. Track application status") + appendLine("5. Verify work arrangement and compensation details during screening") + appendLine("6. Prepare questions about travel requirements and relocation assistance") + // Log completion to transcript + context.transcriptStream?.let { stream -> + try { + stream.write("\n**Job search crawling completed successfully**\n".toByteArray(StandardCharsets.UTF_8)) + stream.write("**Good matches found:** ${goodMatches.size}\n\n".toByteArray(StandardCharsets.UTF_8)) + stream.flush() + } catch (e: Exception) { + log.debug("Failed to write completion to transcript", e) + } + } + } + } + + override fun validateConfig(config: Any?): String? 
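+    // No upfront validation here; a missing or malformed JobMatchingConfig surfaces when processPage parses content_queries.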
{ + return null + } +} \ No newline at end of file diff --git a/webui/src/main/kotlin/com/simiacryptus/cognotik/plan/tools/online/processing/PageProcessingStrategy.kt b/webui/src/main/kotlin/com/simiacryptus/cognotik/plan/tools/online/processing/PageProcessingStrategy.kt new file mode 100644 index 000000000..39a708e54 --- /dev/null +++ b/webui/src/main/kotlin/com/simiacryptus/cognotik/plan/tools/online/processing/PageProcessingStrategy.kt @@ -0,0 +1,91 @@ +package com.simiacryptus.cognotik.plan.tools.online.processing + + import com.simiacryptus.cognotik.plan.OrchestrationConfig + import com.simiacryptus.cognotik.plan.tools.online.CrawlerAgentTask.CrawlerTaskExecutionConfigData + import com.simiacryptus.cognotik.plan.tools.online.CrawlerAgentTask.CrawlerTaskTypeConfig + import com.simiacryptus.cognotik.plan.tools.online.CrawlerAgentTask.LinkData + import com.simiacryptus.cognotik.plan.tools.online.CrawlerAgentTask.PageType +import com.simiacryptus.cognotik.webui.session.SessionTask +import java.io.File + import java.io.FileOutputStream +import java.util.concurrent.atomic.AtomicInteger + +interface PageProcessingStrategy { + val description: String + + /** + * Process a single page and return results + * + * @param url The URL of the page being processed + * @param content The content of the page + * @param context The processing context containing configuration and state + * @return PageProcessingResult containing the processing outcome + * @throws Exception if processing fails critically + */ + fun processPage( + url: String, + content: String, + context: ProcessingContext + ): PageProcessingResult + + /** + * Determine if crawling should continue + * + * @param currentResults The results from pages processed so far + * @param context The processing context + * @return ContinuationDecision indicating whether to continue and why + */ + fun shouldContinueCrawling( + currentResults: List, + context: ProcessingContext + ): ContinuationDecision + + /** + * Generate final output from all processed pages + * + * @param results All page processing results + * @param context The processing context + * @return String containing the final formatted output + */ + fun generateFinalOutput( + results: List, + context: ProcessingContext + ): String + + /** + * Strategy-specific configuration validation + * + * @param config The configuration to validate + * @return Error message if validation fails, null if valid + */ + fun validateConfig(config: Any?): String? + + + data class ProcessingContext( + val executionConfig: CrawlerTaskExecutionConfigData, + val typeConfig: CrawlerTaskTypeConfig, + val orchestrationConfig: OrchestrationConfig, + val messages: List = emptyList(), + val task: SessionTask, + val webSearchDir: File = File("websearch"), + val processedCount: AtomicInteger = AtomicInteger(0), + val maxPages: Int = Int.MAX_VALUE, + val transcriptStream: FileOutputStream? = null + ) + + data class PageProcessingResult( + val url: String = "", + val pageType: PageType = PageType.Error, + val content: String = "", + val extractedLinks: List? = null, + val metadata: Map = emptyMap(), + val shouldTerminate: Boolean = false, + val terminationReason: String? = null, + val error: Throwable? 
= null + ) + + data class ContinuationDecision( + val shouldContinue: Boolean = true, + val reason: String = "No specific reason", + ) +} \ No newline at end of file diff --git a/webui/src/main/kotlin/com/simiacryptus/cognotik/plan/tools/online/processing/ProcessingStrategyType.kt b/webui/src/main/kotlin/com/simiacryptus/cognotik/plan/tools/online/processing/ProcessingStrategyType.kt new file mode 100644 index 000000000..d7af5c83e --- /dev/null +++ b/webui/src/main/kotlin/com/simiacryptus/cognotik/plan/tools/online/processing/ProcessingStrategyType.kt @@ -0,0 +1,22 @@ +package com.simiacryptus.cognotik.plan.tools.online.processing + +enum class ProcessingStrategyType { + + DefaultSummarizer { + override fun createStrategy(): PageProcessingStrategy = DefaultSummarizerStrategy() + }, + FactChecking { + override fun createStrategy(): PageProcessingStrategy = FactCheckingStrategy() + }, + JobMatching { + override fun createStrategy(): PageProcessingStrategy = JobMatchingStrategy() + }, + SchemaExtraction { + override fun createStrategy(): PageProcessingStrategy = SchemaExtractionStrategy() + }, + DataTableAccumulation {; + override fun createStrategy(): PageProcessingStrategy = DataTableAccumulationStrategy() + }; + + abstract fun createStrategy(): PageProcessingStrategy +} \ No newline at end of file diff --git a/webui/src/main/kotlin/com/simiacryptus/cognotik/plan/tools/online/processing/README.md b/webui/src/main/kotlin/com/simiacryptus/cognotik/plan/tools/online/processing/README.md new file mode 100644 index 000000000..7252b41cd --- /dev/null +++ b/webui/src/main/kotlin/com/simiacryptus/cognotik/plan/tools/online/processing/README.md @@ -0,0 +1,449 @@ +# Web Crawling and Processing Package + +## Overview + +This package provides a flexible and powerful framework for web crawling, content extraction, and intelligent data processing. It combines AI-powered analysis with configurable processing strategies to extract structured information from websites. + +## Core Components + +### Page Processing Strategies + +The package implements a strategy pattern for processing web pages, allowing different extraction and analysis approaches: + +#### 1. **DefaultSummarizerStrategy** +Basic content analysis and summarization strategy. + +**Use Cases:** +- General web content analysis +- Research and information gathering +- Content summarization + +**Features:** +- AI-powered content transformation +- Automatic link extraction +- Markdown-formatted output +- Chunked processing for large pages + +**Configuration:** +```json +{ + "task_description": "Analyze content and provide insights", + "content_queries": "Optional specific queries" +} +``` + +#### 2. **FactCheckingStrategy** +Verifies claims against web content with evidence tracking. + +**Use Cases:** +- Claim verification +- Research validation +- Source credibility assessment + +**Features:** +- Multi-source evidence collection +- Confidence scoring +- Supporting/contradicting evidence tracking +- Automatic termination when sufficient evidence is gathered + +**Configuration:** +```json +{ + "claims_to_verify": ["Claim 1", "Claim 2"], + "confidence_threshold": 0.8, + "required_sources": 3, + "contradiction_threshold": 2 +} +``` + +**Example:** +```kotlin +val config = FactCheckingConfig( + claims_to_verify = listOf( + "Company X has 10,000 employees", + "Product Y was released in 2023" + ), + confidence_threshold = 0.8, + required_sources = 3 +) +``` + +#### 3. 
**JobMatchingStrategy** +Analyzes job postings and matches them against candidate profiles. + +**Use Cases:** +- Job search automation +- Resume matching +- Application material generation + +**Features:** +- Automatic job posting detection +- Multi-dimensional matching (skills, location, salary, work arrangement) +- Cover letter generation +- Detailed application reports +- Location and work arrangement compatibility scoring +- Salary range analysis + +**Configuration:** +```json +{ + "user_experience": "Your resume/experience summary", + "target_roles": ["Software Engineer", "Senior Developer"], + "required_skills": ["Python", "AWS", "Docker"], + "preferred_locations": ["San Francisco", "Remote"], + "acceptable_locations": ["California", "New York"], + "excluded_locations": ["International"], + "min_match_score": 0.6, + "target_matches": 10, + "work_arrangement_preference": "remote", + "max_days_in_office": 2, + "min_salary": 120000, + "target_salary": 150000, + "max_salary": 180000, + "salary_currency": "USD", + "willing_to_relocate": false +} +``` + +**Output:** +- Individual job reports in `job_matches/` directory +- Cover letters tailored to each position +- Compatibility scores (skills, location, salary, work arrangement) +- Application strategy notes + +#### 4. **SchemaExtractionStrategy** +Extracts structured data according to user-defined schemas. + +**Use Cases:** +- Data mining +- Structured data extraction +- API-like data collection from websites + +**Features:** +- Custom JSON schema definition +- Automatic data validation +- Deduplication +- Aggregated JSON output +- Confidence-based filtering + +**Configuration:** +```json +{ + "schema_definition": "{\"name\": \"string\", \"price\": \"number\"}", + "extraction_instructions": "Extract product information", + "aggregate_results": true, + "min_confidence": 0.7, + "max_items_per_page": 50, + "validate_schema": true, + "deduplicate": true, + "deduplication_keys": "name,id" +} +``` + +**Example Schema:** +```json +{ + "product_name": "string", + "price": "number", + "rating": "number", + "availability": "boolean", + "specifications": { + "weight": "string", + "dimensions": "string" + } +} +``` + +**Output:** +- `aggregated_data.json` - All extracted data +- `aggregated_data_pretty.json` - Pretty-printed version +- `extraction_metadata.json` - Extraction statistics + +#### 5. **DataTableAccumulationStrategy** +Builds comprehensive datasets from web pages with configurable columns. 
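+
+The CSV export quotes any cell that contains a comma, double quote, or newline and doubles embedded quotes, mirroring the `escapeCSV` helper used by the strategy's export step. The snippet below is a minimal standalone sketch of that rule (the `main` harness is illustrative only and not part of the package); use cases and configuration follow.
+
+```kotlin
+// Quote a cell when it contains a delimiter, quote, or newline; double any embedded quotes.
+fun escapeCSV(value: String): String =
+    if (value.contains(",") || value.contains("\"") || value.contains("\n"))
+        "\"${value.replace("\"", "\"\"")}\""
+    else value
+
+fun main() {
+    println(escapeCSV("Widget A"))       // Widget A
+    println(escapeCSV("12,000"))         // "12,000"
+    println(escapeCSV("He said \"ok\"")) // "He said ""ok"""
+}
+```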
+ +**Use Cases:** +- Competitive analysis +- Market research +- Price comparison +- Feature matrices + +**Features:** +- Configurable column definitions +- Type validation +- Data normalization +- Multiple export formats (CSV, JSON, Markdown) +- Automatic HTML table detection +- Row deduplication + +**Configuration:** +```json +{ + "column_names": "Product,Price,Rating,Availability", + "column_descriptions": { + "Product": "Product name or title", + "Price": "Price in USD", + "Rating": "Customer rating out of 5", + "Availability": "In stock status" + }, + "column_types": { + "Product": "string", + "Price": "number", + "Rating": "number", + "Availability": "boolean" + }, + "extraction_instructions": "Extract product comparison data", + "auto_detect_tables": true, + "min_rows": 1, + "max_rows_per_page": 100, + "deduplicate": true, + "key_columns": "Product", + "validate_types": true, + "normalize_data": true, + "export_format": "csv", + "include_source_urls": true +} +``` + +**Output:** +- `data_table.csv` / `.json` / `.md` - Exported table +- Column statistics +- Data quality metrics + +## Architecture + +### ProcessingContext + +Shared context for all strategies: + +```kotlin +data class ProcessingContext( + val executionConfig: CrawlerTaskExecutionConfigData, + val typeConfig: CrawlerTaskTypeConfig, + val orchestrationConfig: OrchestrationConfig, + val messages: List, + val task: SessionTask, + val webSearchDir: File, + val processedCount: AtomicInteger, + val maxPages: Int, + val transcriptStream: FileOutputStream? +) +``` + +### PageProcessingResult + +Standard result format: + +```kotlin +data class PageProcessingResult( + val url: String, + val pageType: PageType, + val content: String, + val extractedLinks: List?, + val metadata: Map, + val shouldTerminate: Boolean, + val terminationReason: String?, + val error: Throwable? +) +``` + +## Strategy Selection + +Strategies are selected via the `ProcessingStrategyType` enum: + +```kotlin +enum class ProcessingStrategyType { + DefaultSummarizer, + FactChecking, + JobMatching, + SchemaExtraction, + DataTableAccumulation +} +``` + +## Common Features + +### Early Termination +All strategies support early termination based on: +- Target achievement (e.g., finding N job matches) +- Confidence thresholds +- Evidence sufficiency +- Custom strategy-specific conditions + +### Link Extraction +Intelligent link prioritization based on: +- Relevance scoring +- Content analysis +- Strategy-specific criteria + +### Progress Tracking +- Real-time transcript updates +- Processing statistics +- Error handling and reporting + +### Output Formats +- Markdown reports +- JSON data exports +- CSV tables +- Structured metadata + +## Usage Examples + +### Basic Crawling with Summarization + +```kotlin +val strategy = DefaultSummarizerStrategy() +val context = ProcessingContext( + executionConfig = config, + typeConfig = typeConfig, + orchestrationConfig = orchestrationConfig, + task = task, + webSearchDir = File("output") +) + +val result = strategy.processPage(url, content, context) +``` + +### Job Search Automation + +```kotlin +val strategy = JobMatchingStrategy() +val config = JobMatchingConfig( + user_experience = resumeText, + target_roles = listOf("Senior Engineer", "Tech Lead"), + required_skills = listOf("Kotlin", "AWS", "Kubernetes"), + preferred_locations = listOf("Remote", "San Francisco"), + min_match_score = 0.7, + target_matches = 5, + work_arrangement_preference = "remote", + min_salary = 150000 +) + +// Strategy will automatically: +// 1. 
Detect job postings +// 2. Analyze compatibility +// 3. Generate cover letters +// 4. Save detailed reports +// 5. Terminate after finding 5 good matches +``` + +### Data Extraction + +```kotlin +val strategy = SchemaExtractionStrategy() +val config = SchemaExtractionConfig( + schema_definition = """ + { + "title": "string", + "price": "number", + "features": ["string"] + } + """, + aggregate_results = true, + deduplicate = true +) + +// Extracts structured data matching schema +// Outputs aggregated JSON file +``` + +### Fact Verification + +```kotlin +val strategy = FactCheckingStrategy() +val config = FactCheckingConfig( + claims_to_verify = listOf( + "The company was founded in 2010", + "The product has 1M+ users" + ), + required_sources = 3, + confidence_threshold = 0.8 +) + +// Collects evidence from multiple sources +// Terminates when sufficient evidence found +``` + +## Error Handling + +All strategies implement robust error handling: + +```kotlin +try { + val result = strategy.processPage(url, content, context) + if (result.error != null) { + // Handle processing error + } +} catch (e: Exception) { + // Handle critical failure +} +``` + +## Configuration Validation + +Each strategy validates its configuration: + +```kotlin +val error = strategy.validateConfig(config) +if (error != null) { + throw IllegalArgumentException(error) +} +``` + +## Best Practices + +1. **Choose the Right Strategy** + - Use `DefaultSummarizer` for general content analysis + - Use `JobMatching` for recruitment automation + - Use `SchemaExtraction` for structured data mining + - Use `FactChecking` for claim verification + - Use `DataTableAccumulation` for comparative datasets + +2. **Configure Appropriately** + - Set realistic confidence thresholds + - Define clear extraction criteria + - Use deduplication for large datasets + - Set page limits to control costs + +3. **Monitor Progress** + - Check transcript streams for real-time updates + - Review metadata for extraction statistics + - Handle early termination gracefully + +4. **Handle Errors** + - Implement retry logic for transient failures + - Log errors for debugging + - Validate configurations before execution + +5. **Optimize Performance** + - Use appropriate page limits + - Enable deduplication when needed + - Set confidence thresholds to filter noise + - Leverage early termination + +## Output Structure + +``` +output/ +├── transcript.md # Real-time processing log +├── final_report.md # Final summary +├── aggregated_data.json # Extracted data (SchemaExtraction) +├── data_table.csv # Tabular data (DataTableAccumulation) +└── job_matches/ # Job reports (JobMatching) + ├── Company_Position_timestamp.md + └── ... 
+``` + +## Dependencies + +- AI/LLM integration via `ChatInterface` +- JSON parsing via Jackson +- Markdown generation +- Concurrent processing support + +## Thread Safety + +All strategies use thread-safe data structures: +- `ConcurrentHashMap` for shared state +- `AtomicInteger` for counters +- Synchronized file I/O diff --git a/webui/src/main/kotlin/com/simiacryptus/cognotik/plan/tools/online/processing/SchemaExtractionStrategy.kt b/webui/src/main/kotlin/com/simiacryptus/cognotik/plan/tools/online/processing/SchemaExtractionStrategy.kt new file mode 100644 index 000000000..56d9b1888 --- /dev/null +++ b/webui/src/main/kotlin/com/simiacryptus/cognotik/plan/tools/online/processing/SchemaExtractionStrategy.kt @@ -0,0 +1,445 @@ +package com.simiacryptus.cognotik.plan.tools.online.processing + +import com.fasterxml.jackson.databind.ObjectMapper +import com.simiacryptus.cognotik.agents.ParsedAgent +import com.simiacryptus.cognotik.describe.Description +import com.simiacryptus.cognotik.plan.tools.online.CrawlerAgentTask +import com.simiacryptus.cognotik.util.JsonUtil +import com.simiacryptus.cognotik.util.jsonCast +import com.simiacryptus.cognotik.util.toJson +import com.simiacryptus.cognotik.webui.session.getChildClient +import org.slf4j.LoggerFactory +import java.io.File +import java.nio.charset.StandardCharsets +import java.time.LocalDateTime +import java.time.format.DateTimeFormatter +import java.util.concurrent.ConcurrentHashMap + +class SchemaExtractionStrategy : DefaultSummarizerStrategy() { + + data class SchemaExtractionConfig( + @Description("JSON schema definition describing the data structure to extract from each page") + val schema_definition: String = "{}", + @Description("Human-readable description of what data to extract") + val extraction_instructions: String = "", + @Description("Whether to aggregate all extracted data into a single JSON array") + val aggregate_results: Boolean = true, + @Description("Minimum confidence score (0.0-1.0) for extracted data to be included") + val min_confidence: Double = 0.7, + @Description("Maximum number of items to extract per page (null for unlimited)") + val max_items_per_page: Int? = null, + @Description("Whether to validate extracted data against the schema") + val validate_schema: Boolean = true, + @Description("Whether to deduplicate extracted items based on key fields") + val deduplicate: Boolean = true, + @Description("Field names to use as unique keys for deduplication (comma-separated)") + val deduplication_keys: String? = null + ) + + override val description: String + get() = "Extracts structured data from web pages according to a user-defined schema and aggregates results into a comprehensive JSON dataset." + + data class ExtractedData( + @Description("The extracted data matching the schema") + val data: Any? 
= null, + @Description("Confidence score for the extraction (0.0-1.0)") + val confidence: Double = 1.0, + @Description("Additional metadata about the extraction") + val metadata: Map = emptyMap(), + @Description("Any validation errors or warnings") + val validation_notes: List = emptyList() + ) + + private val extractedDataStore = ConcurrentHashMap>>() + private val seenKeys = ConcurrentHashMap.newKeySet() + + companion object { + private val log = LoggerFactory.getLogger(SchemaExtractionStrategy::class.java) + } + + override fun processPage( + url: String, + content: String, + context: PageProcessingStrategy.ProcessingContext + ): PageProcessingStrategy.PageProcessingResult { + log.debug("Processing page with schema extraction: $url") + val config = try { + context.executionConfig.content_queries?.let { queries -> + when (queries) { + is String -> JsonUtil.fromJson(queries, SchemaExtractionConfig::class.java) + else -> queries.jsonCast() + } + } ?: run { + log.warn("No schema extraction config provided, using default") + SchemaExtractionConfig() + } + } catch (e: Exception) { + log.error("Failed to parse schema extraction config", e) + return PageProcessingStrategy.PageProcessingResult( + url = url, + pageType = CrawlerAgentTask.PageType.Error, + content = "Configuration error: ${e.message}", + error = e + ) + } + + // Extract data using the schema + val extractionResult = try { + extractSchemaData(url, content, config, context) + } catch (e: Exception) { + log.error("Failed to extract schema data from: $url", e) + return PageProcessingStrategy.PageProcessingResult( + url = url, + pageType = CrawlerAgentTask.PageType.Error, + content = "Extraction error: ${e.message}", + error = e + ) + } + + // Filter by confidence + val filteredData = extractionResult.data?.let { data -> + if (extractionResult.confidence >= config.min_confidence) { + data + } else { + log.debug("Extracted data below confidence threshold: ${extractionResult.confidence} < ${config.min_confidence}") + null + } + } + + // Store extracted data + if (filteredData != null) { + storeExtractedData(url, filteredData, config) + } + + // Generate summary of extraction + val summary = buildString { + appendLine("## Data Extraction Results") + appendLine() + appendLine("**URL:** [$url]($url)") + appendLine("**Confidence:** ${(extractionResult.confidence * 100).toInt()}%") + appendLine() + + if (extractionResult.validation_notes.isNotEmpty()) { + appendLine("### Validation Notes") + extractionResult.validation_notes.forEach { note -> + appendLine("- $note") + } + appendLine() + } + + if (filteredData != null) { + appendLine("### Extracted Data") + appendLine() + appendLine("```json") + appendLine(filteredData.toJson()) + appendLine("```") + appendLine() + + val itemCount = when (filteredData) { + is List<*> -> filteredData.size + is Map<*, *> -> 1 + else -> 1 + } + appendLine("**Items Extracted:** $itemCount") + appendLine("**Total Items in Store:** ${extractedDataStore.values.sumOf { it.size }}") + } else { + appendLine("*No data met the confidence threshold*") + } + appendLine() + } + + // Also get standard link extraction + val standardResult = super.processPage(url, content, context) + + return PageProcessingStrategy.PageProcessingResult( + url = url, + pageType = if (filteredData != null) CrawlerAgentTask.PageType.OK else CrawlerAgentTask.PageType.Irrelevant, + content = summary, + extractedLinks = standardResult.extractedLinks, + metadata = mapOf( + "extracted_data" to (filteredData ?: emptyMap()), + "confidence" to 
extractionResult.confidence, + "validation_notes" to extractionResult.validation_notes, + "total_items" to extractedDataStore.values.sumOf { it.size } + ) + ) + } + + private fun extractSchemaData( + url: String, + content: String, + config: SchemaExtractionConfig, + context: PageProcessingStrategy.ProcessingContext + ): ExtractedData { + log.debug("Extracting schema data from: $url") + + val prompt = buildString { + appendLine("Extract structured data from the following web page content according to the schema provided.") + appendLine() + appendLine("SCHEMA DEFINITION:") + appendLine(config.schema_definition) + appendLine() + appendLine("EXTRACTION INSTRUCTIONS:") + appendLine(config.extraction_instructions.ifBlank { "Extract all data matching the schema" }) + appendLine() + if (config.max_items_per_page != null) { + appendLine("Extract up to ${config.max_items_per_page} items from this page.") + appendLine() + } + appendLine("Provide:") + appendLine("1. The extracted data matching the schema (as 'data' field)") + appendLine("2. A confidence score (0.0-1.0) indicating extraction quality") + appendLine("3. Any validation notes or warnings") + appendLine() + appendLine("If multiple items match the schema, return them as a list in the 'data' field.") + appendLine("If no data matches the schema, return null for 'data' with an explanation in validation_notes.") + } + + val model = (context.typeConfig.model?.let { context.orchestrationConfig.instance(it) } + ?: context.orchestrationConfig.parsingChatter).getChildClient(context.task) + + return ParsedAgent( + prompt = prompt, + resultClass = ExtractedData::class.java, + model = model, + parsingChatter = model + ).answer(listOf(content.take(50000))).obj // Limit content size for processing + } + + private fun storeExtractedData( + url: String, + data: Any, + config: SchemaExtractionConfig + ) { + val items = when (data) { + is List<*> -> data.filterIsInstance>() + is Map<*, *> -> listOf(data as Map) + else -> { + log.warn("Unexpected data type: ${data.javaClass.name}") + return + } + } + + val deduplicationKeys = config.deduplication_keys?.split(",")?.map { it.trim() } ?: emptyList() + + items.forEach { item -> + // Check for duplicates if enabled + if (config.deduplicate && deduplicationKeys.isNotEmpty()) { + val key = deduplicationKeys.mapNotNull { keyField -> + item[keyField]?.toString() + }.joinToString("|") + + if (key.isNotBlank()) { + if (seenKeys.contains(key)) { + log.debug("Skipping duplicate item with key: $key") + return@forEach + } + seenKeys.add(key) + } + } + + // Store the item + extractedDataStore.computeIfAbsent(url) { mutableListOf() }.add(item) + } + + log.info("Stored ${items.size} items from: $url (total: ${extractedDataStore.values.sumOf { it.size }})") + } + + override fun shouldContinueCrawling( + currentResults: List, + context: PageProcessingStrategy.ProcessingContext + ): PageProcessingStrategy.ContinuationDecision { + val totalItems = extractedDataStore.values.sumOf { it.size } + val successfulExtractions = currentResults.count { + it.metadata["extracted_data"] != null && it.metadata["extracted_data"] != emptyMap() + } + + log.debug("Crawling status: $successfulExtractions successful extractions, $totalItems total items") + + return PageProcessingStrategy.ContinuationDecision( + shouldContinue = context.processedCount.get() < context.maxPages, + reason = "Extracted $totalItems items from $successfulExtractions pages so far" + ) + } + + override fun generateFinalOutput( + results: List, + context: 
PageProcessingStrategy.ProcessingContext + ): String { + log.info("Generating final aggregated output") + val config = try { + context.executionConfig.content_queries?.let { queries -> + when (queries) { + is String -> JsonUtil.fromJson(queries, SchemaExtractionConfig::class.java) + else -> queries.jsonCast() + } + } ?: SchemaExtractionConfig() + } catch (e: Exception) { + log.error("Failed to parse config for final output", e) + SchemaExtractionConfig() + } + + // Aggregate all extracted data + val allData = if (config.aggregate_results) { + extractedDataStore.values.flatten() + } else { + extractedDataStore.map { (url, items) -> + mapOf( + "source_url" to url, + "items" to items, + "count" to items.size + ) + } + } + + // Save aggregated JSON + saveAggregatedJson(allData, context) + + // Generate summary report + return buildString { + appendLine("# Schema Extraction Results") + appendLine() + appendLine("**Extraction Completed:** ${LocalDateTime.now().format(DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss"))}") + appendLine() + + appendLine("## Summary Statistics") + appendLine() + appendLine("- **Total Pages Processed:** ${results.size}") + appendLine("- **Successful Extractions:** ${extractedDataStore.size}") + appendLine("- **Total Items Extracted:** ${allData.size}") + appendLine("- **Unique Sources:** ${extractedDataStore.keys.size}") + if (config.deduplicate) { + appendLine("- **Deduplication:** Enabled") + appendLine("- **Unique Items After Deduplication:** ${allData.size}") + } + appendLine() + + appendLine("## Schema Definition") + appendLine() + appendLine("```json") + appendLine(config.schema_definition) + appendLine("```") + appendLine() + + appendLine("## Extraction Statistics by Source") + appendLine() + appendLine("| Source | Items Extracted | Confidence |") + appendLine("|--------|-----------------|------------|") + + results.filter { it.metadata.containsKey("extracted_data") }.forEach { result -> + val itemCount = when (val data = result.metadata["extracted_data"]) { + is List<*> -> data.size + is Map<*, *> -> if (data.isEmpty()) 0 else 1 + else -> 0 + } + val confidence = result.metadata["confidence"] as? Double ?: 0.0 + val shortUrl = result.url.take(50) + if (result.url.length > 50) "..." else "" + appendLine("| [$shortUrl](${result.url}) | $itemCount | ${(confidence * 100).toInt()}% |") + } + appendLine() + + appendLine("## Aggregated Data") + appendLine() + appendLine("The complete extracted dataset has been saved to:") + appendLine("- `${context.webSearchDir.name}/aggregated_data.json`") + appendLine() + + appendLine("### Sample Data (First 3 Items)") + appendLine() + appendLine("```json") + appendLine(allData.take(3).toJson()) + appendLine("```") + appendLine() + + if (allData.size > 3) { + appendLine("*... and ${allData.size - 3} more items*") + appendLine() + } + + appendLine("## Data Quality Notes") + appendLine() + val allValidationNotes = results.flatMap { + (it.metadata["validation_notes"] as? List<*>)?.filterIsInstance() ?: emptyList() + }.distinct() + + if (allValidationNotes.isEmpty()) { + appendLine("✅ No validation issues detected") + } else { + allValidationNotes.forEach { note -> + appendLine("- ⚠️ $note") + } + } + appendLine() + + appendLine("## Next Steps") + appendLine() + appendLine("1. Review the aggregated JSON file for completeness") + appendLine("2. Validate data quality and schema compliance") + appendLine("3. Import data into your target system") + appendLine("4. 
Consider additional crawling if coverage is incomplete") + } + } + + private fun saveAggregatedJson( + data: Any, + context: PageProcessingStrategy.ProcessingContext + ) { + try { + val jsonFile = File(context.webSearchDir, "aggregated_data.json") + val jsonContent = data.toJson() + jsonFile.writeText(jsonContent, StandardCharsets.UTF_8) + log.info("Saved aggregated JSON to: ${jsonFile.absolutePath} (${jsonContent.length} bytes)") + + // Also save a pretty-printed version + val prettyFile = File(context.webSearchDir, "aggregated_data_pretty.json") + val prettyJson = ObjectMapper() + .writerWithDefaultPrettyPrinter() + .writeValueAsString(data) + prettyFile.writeText(prettyJson, StandardCharsets.UTF_8) + + // Save metadata + val metadataFile = File(context.webSearchDir, "extraction_metadata.json") + val metadata = mapOf( + "timestamp" to LocalDateTime.now().toString(), + "total_items" to when (data) { + is List<*> -> data.size + else -> 1 + }, + "sources" to extractedDataStore.keys.toList(), + "schema" to (context.executionConfig.content_queries?.toString() ?: "") + ) + metadataFile.writeText(metadata.toJson(), StandardCharsets.UTF_8) + + } catch (e: Exception) { + log.error("Failed to save aggregated JSON", e) + } + } + + override fun validateConfig(config: Any?): String? { + if (config == null) return "Schema extraction config is required" + + return try { + val schemaConfig = when (config) { + is SchemaExtractionConfig -> config + is String -> ObjectMapper().readValue(config, SchemaExtractionConfig::class.java) + else -> return "Invalid config type: ${config.javaClass.name}" + } + + if (schemaConfig.schema_definition.isBlank()) { + return "schema_definition is required and cannot be blank" + } + + if (schemaConfig.min_confidence < 0.0 || schemaConfig.min_confidence > 1.0) { + return "min_confidence must be between 0.0 and 1.0" + } + + if (schemaConfig.max_items_per_page != null && schemaConfig.max_items_per_page <= 0) { + return "max_items_per_page must be greater than 0" + } + + null + } catch (e: Exception) { + "Config validation error: ${e.message}" + } + } +} \ No newline at end of file diff --git a/webui/src/main/kotlin/com/simiacryptus/cognotik/plan/tools/online/DirectUrls.kt b/webui/src/main/kotlin/com/simiacryptus/cognotik/plan/tools/online/seed/DirectUrls.kt similarity index 92% rename from webui/src/main/kotlin/com/simiacryptus/cognotik/plan/tools/online/DirectUrls.kt rename to webui/src/main/kotlin/com/simiacryptus/cognotik/plan/tools/online/seed/DirectUrls.kt index 579975b98..c4c089ab4 100644 --- a/webui/src/main/kotlin/com/simiacryptus/cognotik/plan/tools/online/DirectUrls.kt +++ b/webui/src/main/kotlin/com/simiacryptus/cognotik/plan/tools/online/seed/DirectUrls.kt @@ -1,6 +1,7 @@ -package com.simiacryptus.cognotik.plan.tools.online +package com.simiacryptus.cognotik.plan.tools.online.seed import com.simiacryptus.cognotik.plan.OrchestrationConfig +import com.simiacryptus.cognotik.plan.tools.online.CrawlerAgentTask import com.simiacryptus.cognotik.platform.model.User import java.net.URI diff --git a/webui/src/main/kotlin/com/simiacryptus/cognotik/plan/tools/online/GoogleProxy.kt b/webui/src/main/kotlin/com/simiacryptus/cognotik/plan/tools/online/seed/GoogleProxy.kt similarity index 95% rename from webui/src/main/kotlin/com/simiacryptus/cognotik/plan/tools/online/GoogleProxy.kt rename to webui/src/main/kotlin/com/simiacryptus/cognotik/plan/tools/online/seed/GoogleProxy.kt index 46d75f9f9..ccf5a19bc 100644 --- 
a/webui/src/main/kotlin/com/simiacryptus/cognotik/plan/tools/online/GoogleProxy.kt +++ b/webui/src/main/kotlin/com/simiacryptus/cognotik/plan/tools/online/seed/GoogleProxy.kt @@ -1,8 +1,9 @@ -package com.simiacryptus.cognotik.plan.tools.online +package com.simiacryptus.cognotik.plan.tools.online.seed import com.fasterxml.jackson.databind.ObjectMapper import com.fasterxml.jackson.module.kotlin.readValue import com.simiacryptus.cognotik.plan.OrchestrationConfig +import com.simiacryptus.cognotik.plan.tools.online.CrawlerAgentTask import com.simiacryptus.cognotik.platform.model.User import java.net.URI import java.net.URLEncoder @@ -10,7 +11,6 @@ import java.net.http.HttpClient import java.net.http.HttpRequest import java.net.http.HttpResponse import java.time.Duration -import kotlin.math.min class GoogleProxy : SeedMethodFactory { companion object { @@ -33,8 +33,8 @@ class GoogleProxy : SeedMethodFactory { val client = HttpClient.newBuilder().build() val query = taskConfig.search_query.trim() val encodedQuery = URLEncoder.encode(query, "UTF-8") - val resultCount = min(10, 20) - val searchLimit = 15 + val resultCount = 20 + val searchLimit = resultCount SeedMethod.log.debug("Using proxy endpoint: $PROXY_ENDPOINT") diff --git a/webui/src/main/kotlin/com/simiacryptus/cognotik/plan/tools/online/GoogleSearch.kt b/webui/src/main/kotlin/com/simiacryptus/cognotik/plan/tools/online/seed/GoogleSearch.kt similarity index 95% rename from webui/src/main/kotlin/com/simiacryptus/cognotik/plan/tools/online/GoogleSearch.kt rename to webui/src/main/kotlin/com/simiacryptus/cognotik/plan/tools/online/seed/GoogleSearch.kt index 1d31a6c66..8fe79a60c 100644 --- a/webui/src/main/kotlin/com/simiacryptus/cognotik/plan/tools/online/GoogleSearch.kt +++ b/webui/src/main/kotlin/com/simiacryptus/cognotik/plan/tools/online/seed/GoogleSearch.kt @@ -1,9 +1,10 @@ -package com.simiacryptus.cognotik.plan.tools.online +package com.simiacryptus.cognotik.plan.tools.online.seed import com.fasterxml.jackson.databind.ObjectMapper import com.fasterxml.jackson.module.kotlin.readValue import com.simiacryptus.cognotik.models.APIProvider import com.simiacryptus.cognotik.plan.OrchestrationConfig +import com.simiacryptus.cognotik.plan.tools.online.CrawlerAgentTask import com.simiacryptus.cognotik.platform.ApplicationServices import com.simiacryptus.cognotik.platform.file.UserSettingsManager import com.simiacryptus.cognotik.platform.model.User @@ -32,8 +33,8 @@ class GoogleSearch : SeedMethodFactory { SeedMethod.log.debug("Using search query: $query") val encodedQuery = URLEncoder.encode(query, "UTF-8") - val resultCount = min(10, 20) // Ensure we don't exceed API limits - val searchLimit = 15 // Reduced from 20 to be more conservative + val resultCount = 20 // Ensure we don't exceed API limits + val searchLimit = resultCount // Reduced from 20 to be more conservative SeedMethod.log.debug("Fetching user settings for Google Search API") val userSettings = ApplicationServices.fileApplicationServices().userSettingsManager.getUserSettings( user ?: UserSettingsManager.defaultUser diff --git a/webui/src/main/kotlin/com/simiacryptus/cognotik/plan/tools/online/seed/README.md b/webui/src/main/kotlin/com/simiacryptus/cognotik/plan/tools/online/seed/README.md new file mode 100644 index 000000000..f6baf079d --- /dev/null +++ b/webui/src/main/kotlin/com/simiacryptus/cognotik/plan/tools/online/seed/README.md @@ -0,0 +1,187 @@ +# Online Seed Package + +## Overview + +The `seed` package provides various strategies for discovering and collecting initial URLs 
(seed items) for web crawling operations. It supports multiple search engines and methods for generating starting points for web crawlers. + +## Components + +### Core Interfaces + +#### `SeedStrategy` +The main interface for implementing seed collection strategies. Each strategy must implement: +- `getSeedItems()`: Returns a list of `SeedItem` objects based on task configuration +- `isEnabled()`: Determines if the strategy is available for use + +#### `SeedMethodFactory` +Factory interface for creating `SeedStrategy` instances with proper user context and task configuration. + +#### `SeedItem` +Data class representing a discovered URL with metadata: +- `link`: The URL to crawl +- `title`: Human-readable title +- `tags`: Optional categorization tags +- `relevance_score`: Relevance rating (1-100) +- `additionalData`: Extra metadata from the source + +### Seed Methods + +#### `DirectUrls` +Directly uses a list of provided URLs without any search or discovery. + +**Use Case**: When you have specific URLs to crawl +**Configuration**: Requires `direct_urls` list in task config +**Enabled**: Always available + +#### `GoogleProxy` +Uses a proxy endpoint to perform Google searches without requiring API credentials. + +**Use Case**: Quick Google searches without API setup +**Configuration**: +- Requires `search_query` in task config +- Uses environment variable `GOOGLE_SEARCH_PROXY_ENDPOINT` (defaults to AWS endpoint) + **Enabled**: Always available + **Limitations**: Returns up to 20 results + +#### `GoogleSearch` +Direct integration with Google Custom Search API. + +**Use Case**: Production Google searches with full API access +**Configuration**: +- Requires `search_query` in task config +- Requires Google API key and Search Engine ID in user settings + **Enabled**: Only when user has configured Google API credentials + **Limitations**: Subject to Google API quotas and rate limits + +#### `SearchAPISearch` +Base class for SearchAPI.io integrations, supporting multiple search engines: + +##### Available Engines: +- **SearchIO_Google_Search**: Standard Google web search +- **SearchIO_Google_Maps**: Location-based business search +- **SearchIO_Google_Scholar**: Academic paper search +- **SearchIO_Google_Patents**: Patent database search +- **SearchIO_Google_News**: News article search +- **SearchIO_Google_Jobs**: Job listing search +- **SearchIO_Amazon**: Amazon product search +- **SearchIO_Bing**: Bing web search +- **SearchIO_DuckDuckGo**: DuckDuckGo web search +- **SearchIO_EBay**: eBay product search + +**Use Case**: Unified API for multiple search engines +**Configuration**: +- Requires `search_query` in task config +- Requires SearchAPI.io API key in user settings + **Enabled**: Only when user has configured SearchAPI.io credentials + **Limitations**: Returns up to 20 results per query + +## Usage Example + +```kotlin +// Create a seed strategy +val seedMethod = SeedMethod.GoogleProxy +val strategy = seedMethod.createStrategy(crawlerTask, user) + +// Check if strategy is available +if (strategy.isEnabled()) { + // Get seed items + val seedItems = strategy.getSeedItems(taskConfig, orchestrationConfig) + + // Process results + seedItems?.forEach { item -> + println("Found: ${item.title} at ${item.link}") + println("Relevance: ${item.relevance_score}") + } +} +``` + +## Configuration + +### Task Configuration +```kotlin +data class CrawlerTaskExecutionConfigData( + val search_query: String? 
= null, // For search-based methods + val direct_urls: List = emptyList() // For DirectUrls method +) +``` + +### User Settings +Required API credentials in user settings: +- **Google Search**: API key and Search Engine ID +- **SearchAPI.io**: API key + +## Error Handling + +All seed methods implement robust error handling: +- Invalid URLs are filtered out +- Missing configuration throws `IllegalArgumentException` +- API failures throw `RuntimeException` with descriptive messages +- Empty results return empty lists (not errors) + +## Logging + +Comprehensive logging at multiple levels: +- **INFO**: Method start/completion, result counts +- **DEBUG**: Configuration details, parsing steps +- **WARN**: Invalid data, missing results +- **ERROR**: API failures, configuration issues + +## Best Practices + +1. **Choose the Right Method**: + - Use `DirectUrls` for known URLs + - Use `GoogleProxy` for quick testing + - Use `GoogleSearch` for production with API access + - Use `SearchAPISearch` variants for specialized searches + +2. **Handle Rate Limits**: + - Implement delays between requests + - Monitor API quotas + - Use appropriate result limits + +3. **Validate Results**: + - Check `isEnabled()` before using a strategy + - Handle empty result sets gracefully + - Validate URLs before crawling + +4. **Security**: + - Store API keys securely in user settings + - Never log API keys + - Use HTTPS endpoints only + +## Extension + +To add a new seed method: + +1. Implement `SeedMethodFactory`: +```kotlin +class CustomSearch : SeedMethodFactory { + override fun createStrategy(task: CrawlerAgentTask, user: User?): SeedStrategy { + return object : SeedStrategy { + override fun getSeedItems(...): List { + // Implementation + } + + override fun isEnabled(): Boolean { + // Check availability + } + } + } +} +``` + +2. Add to `SeedMethod` enum: +```kotlin +enum class SeedMethod : SeedMethodFactory { + CustomSearch { + override fun createStrategy(...) = CustomSearch().createStrategy(...) 
+ } +} +``` + +## Dependencies + +- Jackson for JSON parsing +- Java HTTP Client for API requests +- CognoTik platform services for user settings + diff --git a/webui/src/main/kotlin/com/simiacryptus/cognotik/plan/tools/online/SearchAPISearch.kt b/webui/src/main/kotlin/com/simiacryptus/cognotik/plan/tools/online/seed/SearchAPISearch.kt similarity index 74% rename from webui/src/main/kotlin/com/simiacryptus/cognotik/plan/tools/online/SearchAPISearch.kt rename to webui/src/main/kotlin/com/simiacryptus/cognotik/plan/tools/online/seed/SearchAPISearch.kt index 167f0ef12..26acfe0ad 100644 --- a/webui/src/main/kotlin/com/simiacryptus/cognotik/plan/tools/online/SearchAPISearch.kt +++ b/webui/src/main/kotlin/com/simiacryptus/cognotik/plan/tools/online/seed/SearchAPISearch.kt @@ -1,8 +1,8 @@ -package com.simiacryptus.cognotik.plan.tools.online +package com.simiacryptus.cognotik.plan.tools.online.seed import com.simiacryptus.cognotik.models.APIProvider import com.simiacryptus.cognotik.plan.OrchestrationConfig -import com.simiacryptus.cognotik.plan.tools.online.SeedMethod.Companion.log +import com.simiacryptus.cognotik.plan.tools.online.CrawlerAgentTask import com.simiacryptus.cognotik.platform.ApplicationServices import com.simiacryptus.cognotik.platform.file.UserSettingsManager import com.simiacryptus.cognotik.platform.model.User @@ -26,18 +26,18 @@ open class SearchAPISearch( taskConfig: CrawlerAgentTask.CrawlerTaskExecutionConfigData?, orchestrationConfig: OrchestrationConfig, ): List { - log.info("Starting SearchAPI.io seed method with query: ${taskConfig?.search_query}") + SeedMethod.Companion.log.info("Starting SearchAPI.io seed method with query: ${taskConfig?.search_query}") if (taskConfig?.search_query.isNullOrBlank()) { - log.error("Search query is missing for SearchAPI.io seed method") + SeedMethod.Companion.log.error("Search query is missing for SearchAPI.io seed method") throw IllegalArgumentException("Search query is required when using SearchAPI.io seed method") } val client = HttpClient.newBuilder().build() val query = taskConfig.search_query.trim() - log.debug("Using search query: $query") + SeedMethod.Companion.log.debug("Using search query: $query") val encodedQuery = URLEncoder.encode(query, "UTF-8") val resultCount = 10 val searchLimit = 20 - log.debug("Fetching user settings for SearchAPI.io") + SeedMethod.Companion.log.debug("Fetching user settings for SearchAPI.io") val userSettings = ApplicationServices.fileApplicationServices().userSettingsManager.getUserSettings( user ?: UserSettingsManager.defaultUser @@ -45,7 +45,7 @@ open class SearchAPISearch( val apiKey = userSettings .apis.firstOrNull { it.provider == APIProvider.SearchAPI }?.key?.trim() ?: throw RuntimeException("SearchAPI.io API key is required") - log.debug("Preparing SearchAPI.io request") + SeedMethod.Companion.log.debug("Preparing SearchAPI.io request") val uriBuilder = "https://www.searchapi.io/api/v1/search?engine=$engine&q=$encodedQuery&num=$resultCount&api_key=$apiKey" val request = HttpRequest.newBuilder() @@ -53,17 +53,17 @@ open class SearchAPISearch( .header("User-Agent", "CognoTik-Crawler/1.0") .GET() .build() - log.info("Sending request to SearchAPI.io") + SeedMethod.Companion.log.info("Sending request to SearchAPI.io") val response = client.send(request, HttpResponse.BodyHandlers.ofString()) val statusCode = response.statusCode() val body = response.body() if (statusCode != 200) { - log.error("SearchAPI.io request failed with status $statusCode: $body") + SeedMethod.Companion.log.error("SearchAPI.io 
request failed with status $statusCode: $body") throw RuntimeException("SearchAPI.io request failed with status $statusCode: $body") } - log.debug("Parsing SearchAPI.io response") + SeedMethod.Companion.log.debug("Parsing SearchAPI.io response") var results = handleResult(body, query) - log.info( + SeedMethod.Companion.log.info( "Successfully retrieved ${results.size} search results, returning ${ results.size.coerceAtMost(searchLimit) } items" @@ -91,7 +91,7 @@ open class SearchAPISearch( } ) } else { - log.warn("Skipping invalid search result missing link or title: $result") + SeedMethod.Companion.log.warn("Skipping invalid search result missing link or title: $result") null } } @@ -113,25 +113,25 @@ open class SearchAPISearch( ).let { rawData -> try { if (!rawData.containsKey(mainResultField)) { - log.warn("Expected field '$mainResultField' not found in SearchAPI.io response for query: $query") + SeedMethod.Companion.log.warn("Expected field '$mainResultField' not found in SearchAPI.io response for query: $query") listOf(rawData) } else { val list = (rawData[mainResultField] as List>) if (list.isEmpty()) { - log.warn("No search results found for query: $query") + SeedMethod.Companion.log.warn("No search results found for query: $query") listOf(rawData) } else { - log.debug("Parsed ${list.size} results from SearchAPI.io response") + SeedMethod.Companion.log.debug("Parsed ${list.size} results from SearchAPI.io response") list } } } catch (e: Exception) { - log.debug("Failed to parse SearchAPI.io response", e) + SeedMethod.Companion.log.debug("Failed to parse SearchAPI.io response", e) listOf(rawData) } } } catch (e: Exception) { - log.debug("Failed to parse SearchAPI.io response", e) + SeedMethod.Companion.log.debug("Failed to parse SearchAPI.io response", e) listOf(JsonUtil.fromJson(body, Map::class.java)) } } \ No newline at end of file diff --git a/webui/src/main/kotlin/com/simiacryptus/cognotik/plan/tools/online/SeedMethod.kt b/webui/src/main/kotlin/com/simiacryptus/cognotik/plan/tools/online/seed/SeedMethod.kt similarity index 94% rename from webui/src/main/kotlin/com/simiacryptus/cognotik/plan/tools/online/SeedMethod.kt rename to webui/src/main/kotlin/com/simiacryptus/cognotik/plan/tools/online/seed/SeedMethod.kt index 1850642b1..be254df1c 100644 --- a/webui/src/main/kotlin/com/simiacryptus/cognotik/plan/tools/online/SeedMethod.kt +++ b/webui/src/main/kotlin/com/simiacryptus/cognotik/plan/tools/online/seed/SeedMethod.kt @@ -1,7 +1,9 @@ -package com.simiacryptus.cognotik.plan.tools.online +package com.simiacryptus.cognotik.plan.tools.online.seed import com.simiacryptus.cognotik.describe.Description import com.simiacryptus.cognotik.plan.OrchestrationConfig +import com.simiacryptus.cognotik.plan.tools.online.CrawlerAgentTask +import com.simiacryptus.cognotik.plan.tools.online.seed.DirectUrls import com.simiacryptus.cognotik.platform.model.User import com.simiacryptus.cognotik.util.EnabledStrategy import com.simiacryptus.cognotik.util.LoggerFactory diff --git a/webui/src/main/kotlin/com/simiacryptus/cognotik/plan/tools/writing/NarrativeGenerationTask.kt b/webui/src/main/kotlin/com/simiacryptus/cognotik/plan/tools/writing/NarrativeGenerationTask.kt index 02f3ac555..76f9d013f 100644 --- a/webui/src/main/kotlin/com/simiacryptus/cognotik/plan/tools/writing/NarrativeGenerationTask.kt +++ b/webui/src/main/kotlin/com/simiacryptus/cognotik/plan/tools/writing/NarrativeGenerationTask.kt @@ -4,7 +4,7 @@ package com.simiacryptus.cognotik.plan.tools.writing import 
com.simiacryptus.cognotik.agents.ChatAgent import com.simiacryptus.cognotik.agents.ImageAndText import com.simiacryptus.cognotik.agents.ParsedAgent -import com.simiacryptus.cognotik.agents.ImageModificationAgent +import com.simiacryptus.cognotik.agents.ImageProcessingAgent import com.simiacryptus.cognotik.apps.general.renderMarkdown import com.simiacryptus.cognotik.describe.Description import com.simiacryptus.cognotik.plan.OrchestrationConfig @@ -787,7 +787,7 @@ Provide the revised scene content only. }.renderMarkdown ) task.update() - val imageAgent = ImageModificationAgent( + val imageAgent = ImageProcessingAgent( prompt = "Create a compelling book cover image that captures the essence of this narrative", model = orchestrationConfig.imageChatChatter, temperature = 0.8, @@ -853,7 +853,7 @@ Provide the revised scene content only. }.renderMarkdown ) task.update() - val imageAgent = ImageModificationAgent( + val imageAgent = ImageProcessingAgent( prompt = "Create a cinematic scene illustration that captures the key moment and atmosphere", model = orchestrationConfig.imageChatChatter, temperature = 0.7, diff --git a/webui/src/main/kotlin/com/simiacryptus/cognotik/plan/tools/writing/NarrativeReasoningTask.kt b/webui/src/main/kotlin/com/simiacryptus/cognotik/plan/tools/writing/NarrativeReasoningTask.kt index 34b582a26..beb4dfb30 100644 --- a/webui/src/main/kotlin/com/simiacryptus/cognotik/plan/tools/writing/NarrativeReasoningTask.kt +++ b/webui/src/main/kotlin/com/simiacryptus/cognotik/plan/tools/writing/NarrativeReasoningTask.kt @@ -5,7 +5,7 @@ import com.simiacryptus.cognotik.agents.ChatAgent import com.simiacryptus.cognotik.agents.CodeAgent.Companion.indent import com.simiacryptus.cognotik.agents.ImageAndText import com.simiacryptus.cognotik.agents.ParsedAgent -import com.simiacryptus.cognotik.agents.ImageModificationAgent +import com.simiacryptus.cognotik.agents.ImageProcessingAgent import com.simiacryptus.cognotik.apps.general.renderMarkdown import com.simiacryptus.cognotik.describe.Description import com.simiacryptus.cognotik.plan.* @@ -1264,7 +1264,7 @@ Be concise but insightful. Focus on actionable insights. 
}.renderMarkdown ) task.update() - val imageAgent = ImageModificationAgent( + val imageAgent = ImageProcessingAgent( prompt = "Transform the narrative description into a vivid, cinematic image", model = orchestrationConfig.imageChatChatter, temperature = 0.7, diff --git a/webui/src/main/kotlin/com/simiacryptus/cognotik/util/AddApplyFileDiffLinks.kt b/webui/src/main/kotlin/com/simiacryptus/cognotik/util/AddApplyFileDiffLinks.kt index 3d7a61829..86f9773a8 100644 --- a/webui/src/main/kotlin/com/simiacryptus/cognotik/util/AddApplyFileDiffLinks.kt +++ b/webui/src/main/kotlin/com/simiacryptus/cognotik/util/AddApplyFileDiffLinks.kt @@ -131,7 +131,7 @@ open class AddApplyFileDiffLinks(val processor: PatchProcessor) { val relativePath = UUID.randomUUID().toString() + ".json" require(relativePath.isNotBlank()) { "File path cannot be blank" } socketManager.resolve(relativePath)?.writeText(data.toJson()) - return "Patch Data" + return "Patch Data" } fun instrument( diff --git a/webui/src/main/kotlin/com/simiacryptus/cognotik/webui/session/SessionTask.kt b/webui/src/main/kotlin/com/simiacryptus/cognotik/webui/session/SessionTask.kt index 9b6a0fa4f..4149dde00 100644 --- a/webui/src/main/kotlin/com/simiacryptus/cognotik/webui/session/SessionTask.kt +++ b/webui/src/main/kotlin/com/simiacryptus/cognotik/webui/session/SessionTask.kt @@ -16,7 +16,7 @@ import java.util.function.Consumer open class SessionTask( - val messageID: String, + val messageID: String = Session.long64(), private var buffer: MutableList = mutableListOf(), private val spinner: String = SessionTask.spinner, val ui: SocketManager