elastic · reakaleek · Sep 19, 2025 · Sep 18, 2025 · Sep 18, 2025 · Sep 18, 2025
@@ -0,0 +1,287 @@
+---
+# Posts a comment on a newly opened issue that links to existing open issues
+# which an AI model classifies as duplicates of, or similar to, the new issue.
+name: Detect Duplicate Issues
+
+# Runs only when an issue is opened; no other issue activity type is listed.
+on:
+  issues:
+    types:
+      - opened
+
+# issues: write is needed because the script creates a comment on the new issue.
+# models: read is requested for the chat-completions calls made by the script.
+permissions:
+  contents: read
+  issues: write
+  models: read
+
+jobs:
+  detect-duplicates:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Detect potential duplicate issues
+        uses: actions/github-script@v7
+        with:
+          # Inline JavaScript; it uses the `github` and `context` objects provided
+          # by the action.
+          script: |
+            const { owner, repo } = context.repo;
+            const issueNumber = context.issue.number;
+
+            // Get the newly created issue
+            const { data: newIssue } = await github.rest.issues.get({
+              owner,
+              repo,
+              issue_number: issueNumber,
+            });
+
+            // Skip if the issue is a pull request
+            if (newIssue.pull_request) {
+              console.log('Skipping pull request');
+              return;
+            }
+
+            console.log('Analyzing issue #' + issueNumber + ': "' + newIssue.title + '"');
+
+            // Get existing open issues (excluding the current one)
+            const { data: existingIssues } = await github.rest.issues.listForRepo({
+              owner,
+              repo,
+              state: 'open',
+              per_page: 100,
+            });
+
+            // Filter out pull requests and the current issue
+            const openIssues = existingIssues.filter(issue => 
+              !issue.pull_request && issue.number !== issueNumber
+            );
+
+            console.log('Found ' + openIssues.length + ' existing open issues to compare against');
+
+            if (openIssues.length === 0) {
+              console.log('No existing issues to compare against');
+              return;
+            }
+
+            // Use GitHub Models to find potential duplicates
+            const duplicates = [];
+
+            if (openIssues.length === 0) {
+              console.log('No existing issues to compare against');
+              return;
+            }
+
+            console.log('Analyzing ' + openIssues.length + ' existing issues for potential duplicates');
+
+            try {
+              // Step 1: Send all issue titles and numbers to get top 5 candidates
+              let titlePrompt = 'Analyze this NEW ISSUE against all EXISTING ISSUES and identify the top 5 most similar ones:\n\n';
+              titlePrompt += 'NEW ISSUE:\n';
+              titlePrompt += 'Title: ' + newIssue.title + '\n';
+              titlePrompt += 'Body: ' + (newIssue.body || 'No description provided') + '\n\n';
+              titlePrompt += 'EXISTING ISSUES:\n';
+
+              openIssues.forEach((issue, index) => {
+                titlePrompt += (index + 1) + '. Issue #' + issue.number + ' - ' + issue.title + '\n';
+              });
+
+              titlePrompt += '\nRespond with a JSON object containing the top 5 most similar issues. Format: {"similar_issues": [{"rank": 1, "issue_number": 123, "similarity": "high|medium"}, ...]}';
+
+              const titleResponse = await fetch('https://models.inference.ai.azure.com/chat/completions', {
+                method: 'POST',
+                headers: {
+                  'Authorization': 'Bearer ' + github.token,
+                  'Content-Type': 'application/json',
+                },
+                body: JSON.stringify({
+                  messages: [
+                    {
+                      role: 'system',
+                      content: 'You are an expert at analyzing GitHub issues to detect duplicates. Compare issue titles and descriptions to identify the most similar ones. Respond only with valid JSON containing the top 5 most similar issues ranked by relevance. Use "high" for likely duplicates and "medium" for related issues.'
+                    },
+                    {
+                      role: 'user',
+                      content: titlePrompt
+                    }
+                  ],
+                  model: 'gpt-4o-mini',
+                  temperature: 0.1,
+                  max_tokens: 200
+                })
+              });
+
+              if (!titleResponse.ok) {
+                const errorText = await titleResponse.text();
+                console.log('First AI call failed: ' + titleResponse.status + ' - ' + errorText);
+                return;
+              }
+
+              const titleResult = await titleResponse.json();
+              const titleAnalysis = titleResult.choices[0]?.message?.content?.trim();
+              console.log('AI title analysis result: ' + titleAnalysis);
+
+              // Parse JSON response to get top 5 candidates
+              let candidateIssueNumbers = [];
+              try {
+                const jsonMatch = titleAnalysis.match(/\{.*\}/s);
+                if (jsonMatch) {
+                  const jsonData = JSON.parse(jsonMatch[0]);
+                  candidateIssueNumbers = jsonData.similar_issues || [];
+                }
+              } catch (parseError) {
+                console.log('Failed to parse JSON response, falling back to number extraction');
+                // Fallback: extract issue numbers from response
+                const numberMatches = titleAnalysis.match(/#(\d+)/g);
+                if (numberMatches) {
+                  candidateIssueNumbers = numberMatches.slice(0, 5).map(match => ({
+                    issue_number: parseInt(match.replace('#', '')),
+                    similarity: 'medium'
+                  }));
+                }
+              }
+
+              if (candidateIssueNumbers.length === 0) {
+                console.log('No candidate issues identified in first pass');
+                return;
+              }
+
+              console.log('Found ' + candidateIssueNumbers.length + ' candidate issues from title analysis');
+
+              // Step 2: Get full details for top candidates and do detailed analysis
+              const candidateIssues = [];
+              for (const candidate of candidateIssueNumbers) {
+                const issue = openIssues.find(i => i.number === candidate.issue_number);
+                if (issue) {
+                  candidateIssues.push({
+                    issue,
+                    initialSimilarity: candidate.similarity
+                  });
+                }
+              }
+
+              if (candidateIssues.length === 0) {
+                console.log('No valid candidate issues found');
+                return;
+              }
+
+              // Step 3: Detailed analysis with full issue bodies
+              let detailPrompt = 'Perform detailed comparison of this NEW ISSUE against the TOP CANDIDATE ISSUES:\n\n';
+              detailPrompt += 'NEW ISSUE:\n';
+              detailPrompt += 'Title: ' + newIssue.title + '\n';
+              detailPrompt += 'Body: ' + (newIssue.body || 'No description provided') + '\n\n';
+              detailPrompt += 'CANDIDATE ISSUES FOR DETAILED ANALYSIS:\n';
+
+              candidateIssues.forEach((candidate, index) => {
+                detailPrompt += (index + 1) + '. Issue #' + candidate.issue.number + '\n';
+                detailPrompt += '   Title: ' + candidate.issue.title + '\n';
+                detailPrompt += '   Body: ' + (candidate.issue.body || 'No description provided') + '\n\n';
+              });
+
+              detailPrompt += 'Respond with JSON format: {"duplicates": [{"issue_number": 123, "classification": "DUPLICATE|SIMILAR|DIFFERENT", "reason": "brief explanation"}]}';
+
+              const detailResponse = await fetch('https://models.inference.ai.azure.com/chat/completions', {
+                method: 'POST',
+                headers: {
+                  'Authorization': 'Bearer ' + github.token,
+                  'Content-Type': 'application/json',
+                },
+                body: JSON.stringify({
+                  messages: [
+                    {
+                      role: 'system',
+                      content: 'You are an expert at analyzing GitHub issues for duplicates. Compare the full content and determine: DUPLICATE (same core problem), SIMILAR (related but different aspects), or DIFFERENT (unrelated). Respond only with valid JSON.'
+                    },
+                    {
+                      role: 'user',
+                      content: detailPrompt
+                    }
+                  ],
+                  model: 'gpt-4o-mini',
+                  temperature: 0.1,
+                  max_tokens: 300
+                })
+              });
+
+              if (detailResponse.ok) {
+                const detailResult = await detailResponse.json();
+                const detailAnalysis = detailResult.choices[0]?.message?.content?.trim();
+                console.log('AI detailed analysis result: ' + detailAnalysis);
+
+                // Parse detailed analysis JSON
+                try {
+                  const jsonMatch = detailAnalysis.match(/\{.*\}/s);
+                  if (jsonMatch) {
+                    const jsonData = JSON.parse(jsonMatch[0]);
+                    const results = jsonData.duplicates || [];
+
+                    for (const result of results) {
+                      if (result.classification === 'DUPLICATE' || result.classification === 'SIMILAR') {
+                        const issue = candidateIssues.find(c => c.issue.number === result.issue_number)?.issue;
+                        if (issue) {
+                          duplicates.push({
+                            issue,
+                            similarity: result.classification === 'DUPLICATE' ? 'high' : 'medium'
+                          });
+                          console.log('Found ' + result.classification.toLowerCase() + ' issue: #' + issue.number + ' - ' + issue.title);
+                        }
+                      }
+                    }
+                  }
+                } catch (parseError) {
+                  console.log('Failed to parse detailed analysis JSON, using fallback');
+                  // Fallback: look for DUPLICATE/SIMILAR mentions
+                  candidateIssues.forEach(candidate => {
+                    const issueRef = '#' + candidate.issue.number;
+                    if (detailAnalysis.includes(issueRef) && 
+                        (detailAnalysis.includes('DUPLICATE') || detailAnalysis.includes('SIMILAR'))) {
+                      duplicates.push({
+                        issue: candidate.issue,
+                        similarity: detailAnalysis.includes('DUPLICATE') ? 'high' : 'medium'
+                      });
+                      console.log('Found similar issue (fallback): #' + candidate.issue.number + ' - ' + candidate.issue.title);
+                    }
+                  });
+                }
+              } else {
+                const errorText = await detailResponse.text();
+                console.log('Detailed analysis failed: ' + detailResponse.status + ' - ' + errorText);
+              }
+
+            } catch (error) {
+              console.log('Error in AI analysis: ' + error.message);
+            }
+
+            // Post comment if duplicates found
+            if (duplicates.length > 0) {
+              const highPriority = duplicates.filter(d => d.similarity === 'high');
+              const mediumPriority = duplicates.filter(d => d.similarity === 'medium');
+
+              let commentBody = '👋 **Potential duplicate issues detected**\n\n';
+              commentBody += 'This issue appears to be similar to existing open issues:\n\n';
+
+              if (highPriority.length > 0) {
+                commentBody += '### 🚨 Likely Duplicates\n';
+                for (const { issue } of highPriority) {
+                  commentBody += '- #' + issue.number + ' - [' + issue.title + '](' + issue.html_url + ')\n';
+                }
+                commentBody += '\n';
+              }
+
+              if (mediumPriority.length > 0) {
+                commentBody += '### 🔍 Similar Issues\n';
+                for (const { issue } of mediumPriority) {
+                  commentBody += '- #' + issue.number + ' - [' + issue.title + '](' + issue.html_url + ')\n';
+                }
+                commentBody += '\n';
+              }
+
+              commentBody += 'Please review these issues to see if your issue is already covered. ';
+              commentBody += 'If this is indeed a duplicate, consider closing this issue and contributing to the existing discussion.\n\n';
+              commentBody += '---\n';
+              commentBody += '*This comment was automatically generated using AI to help identify potential duplicates.*';
+
+              await github.rest.issues.createComment({
+                owner,
+                repo,
+                issue_number: issueNumber,
+                body: commentBody,
+              });
+
+              console.log('Posted comment with ' + duplicates.length + ' potential duplicate(s)');
+            } else {
+              console.log('No potential duplicates found');
+            }
diff --git a/docs/contribute/duplicate-detection.md b/docs/contribute/duplicate-detection.md
@@ -0,0 +1,79 @@
+# Duplicate Issue Detection
+
+The docs-builder repository includes an automated workflow that helps identify potential duplicate issues using a two-step AI-powered analysis approach.
+
+## How It Works
+
+1. **Trigger**: The workflow is triggered when a new issue is created in the repository.
+2. **First AI Call**: Sends the new issue's title and body, together with the titles and numbers of the existing open issues, to GitHub Models to get the top 5 most similar issues in JSON format.
+3. **Second AI Call**: Performs detailed analysis on the top 5 candidates using their full content (title + body).
+4. **Comment**: If potential duplicates are found, the workflow posts a comment on the new issue with links to similar issues.
+
+## Workflow Features
+
+- **Pure AI Analysis**: Relies entirely on GitHub Models for duplicate detection without pre-filtering algorithms.
+- **Two-Step Process**: First identifies candidates by title similarity, then performs detailed analysis with full content.
+- **JSON-Structured Responses**: Uses structured JSON responses for reliable parsing of AI analysis results.
+- **Comprehensive Coverage**: Analyzes the existing open issues in the first pass, limited to a single API page of up to 100 entries (pull requests on that page are filtered out).
+- **Fallback Mechanism**: If JSON parsing fails, falls back to text pattern matching.
+- **Categorized Results**: Distinguishes between "likely duplicates" and "similar issues" to help maintainers prioritize.
+- **Non-Intrusive**: Only comments when potential duplicates are found, doesn't interfere with normal issue workflow.
+
+## AI Analysis Process
+
+### Step 1: Title-Based Candidate Selection
+- Sends new issue title and description along with all existing issue titles
+- AI responds with JSON containing top 5 most similar issues
+- Each candidate includes issue number and similarity level (high/medium)
+
+### Step 2: Detailed Content Analysis
+- Performs deep analysis on the top 5 candidates using full issue bodies
+- AI provides detailed comparison with reasoning
+- Results in final classification: DUPLICATE, SIMILAR, or DIFFERENT
+
+## Example Output
+
+When duplicates are detected, the workflow posts a comment like this:
+
+```markdown
+👋 **Potential duplicate issues detected**
+
+This issue appears to be similar to existing open issues:
+
+### 🚨 Likely Duplicates
+- #123 - [Build fails with .NET 9](https://github.com/elastic/docs-builder/issues/123)
+
+### 🔍 Similar Issues
+- #456 - [Performance issues during build](https://github.com/elastic/docs-builder/issues/456)
+
+Please review these issues to see if your issue is already covered. 
+If this is indeed a duplicate, consider closing this issue and contributing to the existing discussion.
+
+---
+*This comment was automatically generated using AI to help identify potential duplicates.*
+```
+
+## Workflow Configuration
+
+The workflow is defined in `.github/workflows/detect-duplicate-issues.yml` and includes:
+
+- **Permissions**: Read access to repository content, write access to issues, and read access to GitHub Models
+- **Two AI Calls**: Structured for candidate selection and detailed analysis
+- **Error Handling**: Graceful handling of API failures with fallback mechanisms
+
+## Benefits
+
+- **Reduces Maintenance Overhead**: Helps maintainers quickly identify duplicate issues
+- **Improves Issue Quality**: Encourages users to search existing issues before creating new ones
+- **Enhances Collaboration**: Directs users to existing discussions where they can contribute
+- **High Accuracy**: Two-step analysis ensures thorough evaluation of potential duplicates
+
+## Technical Details
+
+- **GitHub Models Integration**: Uses the GitHub Models API with GPT-4o-mini for semantic analysis
+- **Two-Step Analysis**: First pass identifies candidates, second pass performs detailed analysis
+- **JSON Responses**: Structured responses for reliable parsing and error handling
+- **Comprehensive Scope**: Analyzes the fetched open issues (up to 100) without pre-filtering
+- **API Efficiency**: Typically requires only 2 AI API calls regardless of repository size
+
+The workflow is designed to provide accurate duplicate detection through comprehensive AI analysis while maintaining simplicity and reliability.
diff --git a/docs/contribute/index.md b/docs/contribute/index.md
@@ -42,6 +42,10 @@ In Docs V3, a single branch is published per repository. This branch is set to `
 * For **documentation** problems: [Open a docs issue](https://github.com/elastic/docs-content/issues/new?template=internal-request.yaml) *or* [Fix it myself](locally.md). You can open sensitive issues in our [internal repo](https://github.com/elastic/docs-content-internal/issues/new/choose).
 * For **build tool (docs-builder)** problems: [Open a bug report](https://github.com/elastic/docs-builder/issues/new?template=bug-report.yaml)
 
+:::{note}
+When you create a new issue in the docs-builder repository, our [automated duplicate detection system](duplicate-detection.md) will help identify if similar issues already exist.
+:::
+
 ## Request an enhancement or documentation for a new feature
 
 * Make the **documentation** better: [Open a docs issue](https://github.com/elastic/docs-content/issues/new?template=internal-request.yaml). Elastic employees can open sensitive issues in our [internal repo](https://github.com/elastic/docs-content-internal/issues/new/choose).