Merged
234 changes: 234 additions & 0 deletions .github/workflows/detect-duplicate-issues.yml
@@ -0,0 +1,234 @@
---
name: Detect Duplicate Issues

on:
  issues:
    types:
      - opened

permissions:
**Contributor Author:**
You're absolutely right! Added the `models: read` permission as required by the GitHub Models API. The workflow now has the proper permissions:

```yaml
permissions:
  contents: read
  issues: write
  models: read
```

This follows the GitHub Models best practices from the blog post.

Commit: f8e9a34

  contents: read
  issues: write
  models: read

jobs:
  detect-duplicates:
    runs-on: ubuntu-latest
    steps:
      - name: Detect potential duplicate issues
        uses: actions/github-script@v7
        env:
          # Expose the token for the direct Models API call below; the octokit
          # client that github-script provides does not expose the raw token.
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        with:
          script: |
            const { owner, repo } = context.repo;
            const issueNumber = context.issue.number;

            // Get the newly created issue
            const { data: newIssue } = await github.rest.issues.get({
              owner,
              repo,
              issue_number: issueNumber,
            });

            // Skip if the issue is a pull request
            if (newIssue.pull_request) {
              console.log('Skipping pull request');
              return;
            }

            console.log('Analyzing issue #' + issueNumber + ': "' + newIssue.title + '"');

            // Get existing open issues (excluding the current one)
            const { data: existingIssues } = await github.rest.issues.listForRepo({
              owner,
              repo,
              state: 'open',
              per_page: 100,
            });

            // Filter out pull requests and the current issue
            const openIssues = existingIssues.filter(issue =>
              !issue.pull_request && issue.number !== issueNumber
            );

            console.log('Found ' + openIssues.length + ' existing open issues to compare against');

            if (openIssues.length === 0) {
              console.log('No existing issues to compare against');
              return;
            }

            // Use GitHub Models to find potential duplicates
            const duplicates = [];

            // Pre-filter issues using lightweight text similarity to reduce AI API calls
            const newTitle = newIssue.title.toLowerCase();
            const newBody = (newIssue.body || '').toLowerCase();
            const newTitleWords = newTitle.split(/\s+/).filter(w => w.length > 3);

            const candidateIssues = [];

            // First pass: quick text similarity to identify candidates
            for (const issue of openIssues) {
              const existingTitle = issue.title.toLowerCase();
              const existingBody = (issue.body || '').toLowerCase();

              // Calculate title word overlap
              const titleOverlap = newTitleWords.filter(word => existingTitle.includes(word)).length;
              const titleSimilarity = newTitleWords.length > 0 ? titleOverlap / newTitleWords.length : 0;

              // Calculate body keyword overlap for additional context
              const bodyHasKeywords = newTitleWords.some(word => existingBody.includes(word));

              // Include if there's significant title similarity or body keywords match
              if (titleSimilarity > 0.3 || bodyHasKeywords) {
                candidateIssues.push({
                  issue,
                  titleSimilarity,
                  quickMatch: titleSimilarity > 0.6 // High confidence for potential duplicates
                });
              }
            }

            console.log('Pre-filtered to ' + candidateIssues.length + ' candidate issues from ' + openIssues.length + ' total issues');

            // Sort candidates by similarity score (highest first) and limit to top 20 for AI analysis
            candidateIssues.sort((a, b) => b.titleSimilarity - a.titleSimilarity);
            const topCandidates = candidateIssues.slice(0, 20);

            if (topCandidates.length === 0) {
              console.log('No candidate issues found after pre-filtering');
              return;
            }

            // Process high-confidence matches first (may not need AI)
            for (const candidate of topCandidates) {
              if (candidate.quickMatch) {
                duplicates.push({
                  issue: candidate.issue,
                  similarity: 'medium'
                });
                console.log('Found similar issue (pre-filter): #' + candidate.issue.number + ' - ' + candidate.issue.title);
              }
            }

            // Use AI for remaining candidates if we haven't found enough duplicates
            const remainingCandidates = topCandidates.filter(c => !c.quickMatch);

            if (remainingCandidates.length > 0 && duplicates.length < 3) {
              // Batch process up to 10 issues in a single AI call for efficiency
              const batchSize = Math.min(10, remainingCandidates.length);
              const batch = remainingCandidates.slice(0, batchSize);

              try {
                // Create a single prompt that compares the new issue against multiple existing issues
                let promptContent = 'Compare this NEW ISSUE against the following EXISTING ISSUES and identify which ones are duplicates or similar:\n\n';
                promptContent += 'NEW ISSUE:\n';
                promptContent += 'Title: ' + newIssue.title + '\n';
                promptContent += 'Body: ' + (newIssue.body || 'No description provided') + '\n\n';
                promptContent += 'EXISTING ISSUES TO COMPARE:\n';

                batch.forEach((candidate, index) => {
                  promptContent += (index + 1) + '. Issue #' + candidate.issue.number + '\n';
                  promptContent += '   Title: ' + candidate.issue.title + '\n';
                  promptContent += '   Body: ' + (candidate.issue.body || 'No description provided') + '\n\n';
                });

                promptContent += 'For each existing issue, respond with the issue number followed by: DUPLICATE, SIMILAR, or DIFFERENT. Example: "1: DUPLICATE, 2: DIFFERENT, 3: SIMILAR"';

                // Call GitHub Models API with batch comparison
                const response = await fetch('https://models.inference.ai.azure.com/chat/completions', {
                  method: 'POST',
                  headers: {
                    'Authorization': 'Bearer ' + process.env.GITHUB_TOKEN,
                    'Content-Type': 'application/json',
                  },
                  body: JSON.stringify({
                    messages: [
                      {
                        role: 'system',
                        content: 'You are an expert at analyzing GitHub issues to detect duplicates. Compare issues and determine if they are likely duplicates. For each comparison, respond with: DUPLICATE (same core problem), SIMILAR (related but different aspects), or DIFFERENT (unrelated). Focus on the core problem being reported.'
                      },
                      {
                        role: 'user',
                        content: promptContent
                      }
                    ],
                    model: 'gpt-4o-mini',
                    temperature: 0.1,
                    max_tokens: 100
                  })
                });

                if (response.ok) {
                  const result = await response.json();
                  const analysis = result.choices[0]?.message?.content?.trim();
                  console.log('AI batch analysis result: ' + analysis);

                  // Parse the batch response
                  const lines = analysis.split(/[,\n]/).map(l => l.trim());
                  for (const line of lines) {
                    const match = line.match(/(\d+):\s*(DUPLICATE|SIMILAR|DIFFERENT)/i);
                    if (match) {
                      const issueIndex = parseInt(match[1]) - 1;
                      const classification = match[2].toUpperCase();

                      if (issueIndex >= 0 && issueIndex < batch.length && (classification === 'DUPLICATE' || classification === 'SIMILAR')) {
                        const candidate = batch[issueIndex];
                        duplicates.push({
                          issue: candidate.issue,
                          similarity: classification === 'DUPLICATE' ? 'high' : 'medium'
                        });
                        console.log('Found ' + classification.toLowerCase() + ' issue: #' + candidate.issue.number + ' - ' + candidate.issue.title);
                      }
                    }
                  }
                } else {
                  const errorText = await response.text();
                  console.log('GitHub Models API failed: ' + response.status + ' - ' + errorText);
                  console.log('Falling back to pre-filter results only');
                }
              } catch (error) {
                console.log('Error in batch AI analysis: ' + error.message);
                console.log('Falling back to pre-filter results only');
              }
            }

            // Post comment if duplicates found
            if (duplicates.length > 0) {
              const highPriority = duplicates.filter(d => d.similarity === 'high');
              const mediumPriority = duplicates.filter(d => d.similarity === 'medium');

              let commentBody = 'πŸ‘‹ **Potential duplicate issues detected**\n\n';
              commentBody += 'This issue appears to be similar to existing open issues:\n\n';

              if (highPriority.length > 0) {
                commentBody += '### 🚨 Likely Duplicates\n';
                for (const { issue } of highPriority) {
                  commentBody += '- #' + issue.number + ' - [' + issue.title + '](' + issue.html_url + ')\n';
                }
                commentBody += '\n';
              }

              if (mediumPriority.length > 0) {
                commentBody += '### πŸ” Similar Issues\n';
                for (const { issue } of mediumPriority) {
                  commentBody += '- #' + issue.number + ' - [' + issue.title + '](' + issue.html_url + ')\n';
                }
                commentBody += '\n';
              }

              commentBody += 'Please review these issues to see if your issue is already covered. ';
              commentBody += 'If this is indeed a duplicate, consider closing this issue and contributing to the existing discussion.\n\n';
              commentBody += '---\n';
              commentBody += '*This comment was automatically generated using AI to help identify potential duplicates.*';

              await github.rest.issues.createComment({
                owner,
                repo,
                issue_number: issueNumber,
                body: commentBody,
              });

              console.log('Posted comment with ' + duplicates.length + ' potential duplicate(s)');
            } else {
              console.log('No potential duplicates found');
            }
72 changes: 72 additions & 0 deletions docs/contribute/duplicate-detection.md
@@ -0,0 +1,72 @@
# Duplicate Issue Detection

The docs-builder repository includes an automated workflow that identifies potential duplicate issues using AI-powered analysis while keeping API usage low.

## How It Works

1. **Trigger**: The workflow is triggered when a new issue is created in the repository.
2. **Pre-filtering**: Uses lightweight text similarity to identify candidate issues, reducing AI API calls by roughly 80–90% (a sketch of this heuristic follows the list).
3. **AI Analysis**: Uses GitHub Models (GPT-4o-mini) to analyze promising candidates in batches for efficiency.
4. **Comment**: If potential duplicates are found, the workflow posts a comment on the new issue with links to similar issues.
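
The pre-filtering step is a plain word-overlap heuristic rather than anything semantic. Here is a minimal standalone sketch of that logic, with the 0.3 candidate and 0.6 quick-match thresholds taken from the workflow script:

```javascript
// Word-overlap pre-filter, mirroring the logic in detect-duplicate-issues.yml.
// Returns the top candidates, most similar first.
function preFilterCandidates(newIssue, openIssues) {
  const titleWords = newIssue.title
    .toLowerCase()
    .split(/\s+/)
    .filter(word => word.length > 3); // skip short, low-signal words

  const candidates = [];
  for (const issue of openIssues) {
    const title = issue.title.toLowerCase();
    const body = (issue.body || '').toLowerCase();

    // Fraction of the new title's words that also appear in the existing title
    const overlap = titleWords.filter(word => title.includes(word)).length;
    const similarity = titleWords.length > 0 ? overlap / titleWords.length : 0;

    // Keep issues with meaningful title overlap, or any title keyword in the body
    if (similarity > 0.3 || titleWords.some(word => body.includes(word))) {
      candidates.push({ issue, similarity, quickMatch: similarity > 0.6 });
    }
  }

  // Highest similarity first; only the top 20 go forward to AI analysis
  return candidates.sort((a, b) => b.similarity - a.similarity).slice(0, 20);
}
```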

## Workflow Features

- **Efficient Processing**: Pre-filters issues using text similarity before AI analysis, reducing API calls from potentially 100+ to typically 1-2.
- **Batch AI Analysis**: Processes multiple issue comparisons in a single API call for maximum efficiency.
- **Smart Candidate Selection**: Focuses AI analysis on the most promising candidates based on title and content similarity.
- **Fallback Mechanism**: If the AI service is unavailable, the workflow falls back to the pre-filtering results.
- **Categorized Results**: Distinguishes between "likely duplicates" and "similar issues" to help maintainers prioritize.
- **Non-Intrusive**: Comments only when potential duplicates are found and doesn't interfere with the normal issue workflow.

## Performance Optimizations

- **Pre-filtering**: Reduces candidates from 100+ issues to typically 5-20 relevant ones
- **Batch Processing**: Single AI API call instead of individual calls per issue
- **Early Termination**: Stops processing when sufficient duplicates are found
- **Smart Limits**: Analyzes only the top 20 most relevant candidates and processes at most 10 in a single AI batch (see the sketch below)
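
A condensed, self-contained sketch of how these limits interact (the constants — 20 candidates, 3 duplicates, batch of 10 — are taken from the workflow script):

```javascript
// Candidate-selection limits applied before the single batched Models API call.
function selectBatchForAi(sortedCandidates, duplicatesFoundSoFar) {
  // Only the 20 highest-similarity candidates are considered at all
  const topCandidates = sortedCandidates.slice(0, 20);

  // Quick matches were already accepted without any API call
  const remaining = topCandidates.filter(c => !c.quickMatch);

  // Early termination: skip the model entirely if enough duplicates are known
  if (remaining.length === 0 || duplicatesFoundSoFar >= 3) {
    return [];
  }

  // At most 10 issues go into the one batched API call
  return remaining.slice(0, 10);
}
```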

## Example Output

When duplicates are detected, the workflow posts a comment like this:

```markdown
πŸ‘‹ **Potential duplicate issues detected**

This issue appears to be similar to existing open issues:

### 🚨 Likely Duplicates
- #123 - [Build fails with .NET 9](https://github.com/elastic/docs-builder/issues/123)

### πŸ” Similar Issues
- #456 - [Performance issues during build](https://github.com/elastic/docs-builder/issues/456)

Please review these issues to see if your issue is already covered.
If this is indeed a duplicate, consider closing this issue and contributing to the existing discussion.

---
*This comment was automatically generated using AI to help identify potential duplicates.*
```

## Workflow Configuration

The workflow is defined in `.github/workflows/detect-duplicate-issues.yml` and includes:

- **Permissions**: Read access to repository contents, write access to issues, and read access to GitHub Models (shown below)
- **Efficient Processing**: Pre-filtering and batch processing to minimize AI API calls
- **Error Handling**: Graceful handling of API failures with fallback mechanisms
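
The key trigger and permissions blocks from that file (`models: read` is what authorizes the GitHub Models API call):

```yaml
on:
  issues:
    types:
      - opened

permissions:
  contents: read
  issues: write
  models: read
```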

## Benefits

- **Reduces Maintenance Overhead**: Helps maintainers quickly identify duplicate issues
- **Improves Issue Quality**: Encourages users to search existing issues before creating new ones
- **Enhances Collaboration**: Directs users to existing discussions where they can contribute
- **Cost Effective**: Optimized to minimize AI API usage while maintaining accuracy

## Technical Details

- **GitHub Models Integration**: Uses the GitHub Models API with GPT-4o-mini for semantic analysis
- **Pre-filtering Algorithm**: Text similarity analysis to identify relevant candidates
- **Batch Processing**: Compares up to 10 issues in a single AI API call (an example request body is shown below)
- **Performance**: Reduces API calls by 80-90% compared to individual comparisons
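
For reference, the request body the workflow sends to `https://models.inference.ai.azure.com/chat/completions` looks roughly like this (message contents truncated here for brevity). The model is instructed to answer in the compact form `1: DUPLICATE, 2: DIFFERENT, 3: SIMILAR`, which the script then parses with a regular expression:

```json
{
  "model": "gpt-4o-mini",
  "temperature": 0.1,
  "max_tokens": 100,
  "messages": [
    { "role": "system", "content": "You are an expert at analyzing GitHub issues to detect duplicates. ..." },
    { "role": "user", "content": "Compare this NEW ISSUE against the following EXISTING ISSUES ..." }
  ]
}
```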

The workflow is designed to be both helpful and efficient, providing accurate duplicate detection while minimizing resource usage.
4 changes: 4 additions & 0 deletions docs/contribute/index.md
@@ -42,6 +42,10 @@ In Docs V3, a single branch is published per repository. This branch is set to `
* For **documentation** problems: [Open a docs issue](https://github.com/elastic/docs-content/issues/new?template=internal-request.yaml) *or* [Fix it myself](locally.md). You can open sensitive issues in our [internal repo](https://github.com/elastic/docs-content-internal/issues/new/choose).
* For **build tool (docs-builder)** problems: [Open a bug report](https://github.com/elastic/docs-builder/issues/new?template=bug-report.yaml)

:::{note}
When you create a new issue in the docs-builder repository, our [automated duplicate detection system](duplicate-detection.md) will help identify whether similar issues already exist.
:::

## Request an enhancement or documentation for a new feature

* Make the **documentation** better: [Open a docs issue](https://github.com/elastic/docs-content/issues/new?template=internal-request.yaml). Elastic employees can open sensitive issues in our [internal repo](https://github.com/elastic/docs-content-internal/issues/new/choose).