Skip to content

Commit dc3f9c3

Browse files
Copilot and reakaleek authored
Add automated duplicate issue detection workflow using GitHub Models AI (#1905)
* Initial plan * Add duplicate issue detection workflow with AI-powered analysis Co-authored-by: reakaleek <[email protected]> * Update GitHub Models implementation following best practices Co-authored-by: reakaleek <[email protected]> * Optimize duplicate detection workflow to minimize AI API calls Co-authored-by: reakaleek <[email protected]> * Add models: read permission as required by GitHub Models API Co-authored-by: reakaleek <[email protected]> * Replace text similarity with pure AI-based two-step analysis approach Co-authored-by: reakaleek <[email protected]> * Remove duplicate detection documentation file and references Co-authored-by: reakaleek <[email protected]> * Add resilience improvements: retry logic, content sanitization, and prompt limits Co-authored-by: reakaleek <[email protected]> --------- Co-authored-by: copilot-swe-agent[bot] <[email protected]> Co-authored-by: reakaleek <[email protected]> Co-authored-by: Jan Calanog <[email protected]>
1 parent 72314a0 commit dc3f9c3

File tree

1 file changed

+328
-0
lines changed

1 file changed

+328
-0
lines changed
Lines changed: 328 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,328 @@
1+
---
2+
name: Detect Duplicate Issues
3+
4+
on:
5+
issues:
6+
types:
7+
- opened
8+
9+
permissions:
10+
contents: read
11+
issues: write
12+
models: read
13+
14+
jobs:
15+
detect-duplicates:
16+
runs-on: ubuntu-latest
17+
steps:
18+
- name: Detect potential duplicate issues
19+
uses: actions/github-script@v7
20+
with:
21+
script: |
22+
const { owner, repo } = context.repo;
23+
const issueNumber = context.issue.number;
24+
25+
// Get the newly created issue
26+
const { data: newIssue } = await github.rest.issues.get({
27+
owner,
28+
repo,
29+
issue_number: issueNumber,
30+
});
31+
32+
// Skip if the issue is a pull request
33+
if (newIssue.pull_request) {
34+
console.log('Skipping pull request');
35+
return;
36+
}
37+
38+
console.log('Analyzing issue #' + issueNumber + ': "' + newIssue.title + '"');
39+
40+
// Get existing open issues (excluding the current one)
41+
const { data: existingIssues } = await github.rest.issues.listForRepo({
42+
owner,
43+
repo,
44+
state: 'open',
45+
per_page: 100,
46+
});
47+
48+
// Filter out pull requests and the current issue
49+
const openIssues = existingIssues.filter(issue =>
50+
!issue.pull_request && issue.number !== issueNumber
51+
);
52+
53+
console.log('Found ' + openIssues.length + ' existing open issues to compare against');
54+
55+
if (openIssues.length === 0) {
56+
console.log('No existing issues to compare against');
57+
return;
58+
}
59+
60+
// Use GitHub Models to find potential duplicates
61+
const duplicates = [];
62+
63+
if (openIssues.length === 0) {
64+
console.log('No existing issues to compare against');
65+
return;
66+
}
67+
68+
console.log('Analyzing ' + openIssues.length + ' existing issues for potential duplicates');
69+
70+
try {
71+
/**
 * Prepare untrusted issue text for embedding in an AI prompt.
 *
 * Backticks, single/double quotes and backslashes are replaced with spaces
 * (they are removed, not escaped), and the result is truncated.
 *
 * @param {*} content - Issue title or body. Falsy values yield a placeholder;
 *   other non-string values are converted with String().
 * @param {number} [maxLength=500] - Maximum length of the result, in UTF-16
 *   code units.
 * @returns {string} The cleaned, truncated text, or 'No description provided'.
 */
function sanitizeContent(content, maxLength = 500) {
  if (!content) return 'No description provided';

  const cleaned = String(content).replace(/[`'"\\]/g, ' ');
  let clipped = cleaned.slice(0, maxLength);

  // If truncation cut a surrogate pair in half, drop the dangling high
  // surrogate so the prompt never contains a lone, invalid code unit.
  const lastUnit = clipped.charCodeAt(clipped.length - 1);
  if (cleaned.length > clipped.length && lastUnit >= 0xd800 && lastUnit <= 0xdbff) {
    clipped = clipped.slice(0, -1);
  }

  return clipped;
}
76+
77+
/**
 * Invoke a fetch-style call, retrying transient failures with exponential
 * backoff.
 *
 * A response is retried only when its status is 408, 429 or >= 500; any other
 * non-ok response (e.g. 400/401/403) is returned immediately because repeating
 * the same request cannot succeed. Thrown errors are always retried until the
 * attempts are exhausted.
 *
 * @param {() => Promise<{ok: boolean, status: number}>} apiCallFn - Performs
 *   one attempt and resolves to a Response-like object.
 * @param {number} [maxRetries=2] - Retries after the first attempt. Values
 *   below 0 are treated as 0, so the call is always attempted at least once.
 * @param {number} [baseDelayMs=1000] - Delay before the first retry; doubled
 *   for each subsequent retry (1s, 2s with the defaults).
 * @returns {Promise<object>} The first ok or non-retryable response, or the
 *   last failed response once retries are exhausted.
 * @throws Rethrows the error from apiCallFn if the final attempt throws.
 */
async function retryApiCall(apiCallFn, maxRetries = 2, baseDelayMs = 1000) {
  const finalAttempt = Math.max(0, maxRetries);
  const isRetryableStatus = (status) => status === 408 || status === 429 || status >= 500;
  const wait = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

  for (let attempt = 0; attempt <= finalAttempt; attempt++) {
    const delay = baseDelayMs * 2 ** attempt;
    try {
      const response = await apiCallFn();
      if (response.ok || attempt === finalAttempt || !isRetryableStatus(response.status)) {
        return response;
      }
      console.log('API call failed, retrying in ' + delay + 'ms (attempt ' + (attempt + 1) + '/' + (finalAttempt + 1) + ')');
    } catch (error) {
      if (attempt === finalAttempt) throw error;
      console.log('API call error, retrying in ' + delay + 'ms: ' + error.message);
    }
    await wait(delay);
  }
}
99+
100+
// Limit the number of issues to analyze to prevent token overflow
101+
const maxIssuesForAnalysis = Math.min(openIssues.length, 50); // Limit to 50 issues max
102+
const issuesToAnalyze = openIssues.slice(0, maxIssuesForAnalysis);
103+
104+
if (issuesToAnalyze.length < openIssues.length) {
105+
console.log('Limiting analysis to ' + maxIssuesForAnalysis + ' most recent issues (out of ' + openIssues.length + ' total)');
106+
}
107+
108+
// Step 1: Send issue titles and numbers to get top 5 candidates
109+
let titlePrompt = 'Analyze this NEW ISSUE against EXISTING ISSUES and identify the top 5 most similar ones:\n\n';
110+
titlePrompt += 'NEW ISSUE:\n';
111+
titlePrompt += 'Title: ' + sanitizeContent(newIssue.title) + '\n';
112+
titlePrompt += 'Body: ' + sanitizeContent(newIssue.body) + '\n\n';
113+
titlePrompt += 'EXISTING ISSUES:\n';
114+
115+
issuesToAnalyze.forEach((issue, index) => {
116+
titlePrompt += (index + 1) + '. Issue #' + issue.number + ' - ' + sanitizeContent(issue.title) + '\n';
117+
});
118+
119+
titlePrompt += '\nRespond with a JSON object containing the top 5 most similar issues. Format: {"similar_issues": [{"rank": 1, "issue_number": 123, "similarity": "high|medium"}, ...]}';
120+
121+
const titleResponse = await retryApiCall(() =>
122+
fetch('https://models.inference.ai.azure.com/chat/completions', {
123+
method: 'POST',
124+
headers: {
125+
'Authorization': 'Bearer ' + github.token,
126+
'Content-Type': 'application/json',
127+
},
128+
body: JSON.stringify({
129+
messages: [
130+
{
131+
role: 'system',
132+
content: 'You are an expert at analyzing GitHub issues to detect duplicates. Compare issue titles and descriptions to identify the most similar ones. Respond only with valid JSON containing the top 5 most similar issues ranked by relevance. Use "high" for likely duplicates and "medium" for related issues.'
133+
},
134+
{
135+
role: 'user',
136+
content: titlePrompt
137+
}
138+
],
139+
model: 'gpt-4o-mini',
140+
temperature: 0.1,
141+
max_tokens: 200
142+
})
143+
})
144+
);
145+
146+
if (!titleResponse.ok) {
147+
const errorText = await titleResponse.text();
148+
console.log('First AI call failed after retries: ' + titleResponse.status + ' - ' + errorText);
149+
return;
150+
}
151+
152+
const titleResult = await titleResponse.json();
153+
const titleAnalysis = titleResult.choices[0]?.message?.content?.trim();
154+
console.log('AI title analysis result: ' + titleAnalysis);
155+
156+
// Parse JSON response to get top 5 candidates
157+
let candidateIssueNumbers = [];
158+
try {
159+
const jsonMatch = titleAnalysis.match(/\{.*\}/s);
160+
if (jsonMatch) {
161+
const jsonData = JSON.parse(jsonMatch[0]);
162+
candidateIssueNumbers = jsonData.similar_issues || [];
163+
}
164+
} catch (parseError) {
165+
console.log('Failed to parse JSON response, falling back to number extraction');
166+
// Fallback: extract issue numbers from response
167+
const numberMatches = titleAnalysis.match(/#(\d+)/g);
168+
if (numberMatches) {
169+
candidateIssueNumbers = numberMatches.slice(0, 5).map(match => ({
170+
issue_number: parseInt(match.replace('#', '')),
171+
similarity: 'medium'
172+
}));
173+
}
174+
}
175+
176+
if (candidateIssueNumbers.length === 0) {
177+
console.log('No candidate issues identified in first pass');
178+
return;
179+
}
180+
181+
console.log('Found ' + candidateIssueNumbers.length + ' candidate issues from title analysis');
182+
183+
// Step 2: Get full details for top candidates and do detailed analysis
184+
const candidateIssues = [];
185+
for (const candidate of candidateIssueNumbers) {
186+
const issue = openIssues.find(i => i.number === candidate.issue_number);
187+
if (issue) {
188+
candidateIssues.push({
189+
issue,
190+
initialSimilarity: candidate.similarity
191+
});
192+
}
193+
}
194+
195+
if (candidateIssues.length === 0) {
196+
console.log('No valid candidate issues found');
197+
return;
198+
}
199+
200+
// Step 3: Detailed analysis with full issue bodies (each truncated by
// sanitizeContent) for the candidates selected in the previous step.
const detailPrompt =
  'Perform detailed comparison of this NEW ISSUE against the TOP CANDIDATE ISSUES:\n\n' +
  'NEW ISSUE:\n' +
  'Title: ' + sanitizeContent(newIssue.title) + '\n' +
  'Body: ' + sanitizeContent(newIssue.body) + '\n\n' +
  'CANDIDATE ISSUES FOR DETAILED ANALYSIS:\n' +
  candidateIssues
    .map((candidate, index) =>
      (index + 1) + '. Issue #' + candidate.issue.number + '\n' +
      '   Title: ' + sanitizeContent(candidate.issue.title) + '\n' +
      '   Body: ' + sanitizeContent(candidate.issue.body) + '\n\n')
    .join('') +
  'Respond with JSON format: {"duplicates": [{"issue_number": 123, "classification": "DUPLICATE|SIMILAR|DIFFERENT", "reason": "brief explanation"}]}';

const detailResponse = await retryApiCall(() =>
  fetch('https://models.inference.ai.azure.com/chat/completions', {
    method: 'POST',
    headers: {
      // The `github` object injected by actions/github-script is an Octokit
      // client and has no `token` property (see the github-script docs), so
      // `github.token` would produce "Bearer undefined". Read the token from
      // the action's own `github-token` input instead, falling back to the
      // GITHUB_TOKEN environment variable if the workflow provides one.
      'Authorization': 'Bearer ' + (core.getInput('github-token') || process.env.GITHUB_TOKEN),
      'Content-Type': 'application/json',
    },
    body: JSON.stringify({
      messages: [
        {
          role: 'system',
          content: 'You are an expert at analyzing GitHub issues for duplicates. Compare the full content and determine: DUPLICATE (same core problem), SIMILAR (related but different aspects), or DIFFERENT (unrelated). Respond only with valid JSON.'
        },
        {
          role: 'user',
          content: detailPrompt
        }
      ],
      model: 'gpt-4o-mini',
      temperature: 0.1,
      max_tokens: 300
    })
  })
);

if (detailResponse.ok) {
  const detailResult = await detailResponse.json();
  const detailAnalysis = detailResult.choices?.[0]?.message?.content?.trim() ?? '';
  console.log('AI detailed analysis result: ' + detailAnalysis);

  // Records one classified candidate in `duplicates`. Issue numbers are
  // coerced (the model may return them as strings), unknown numbers are
  // ignored, and each issue is recorded at most once.
  const flaggedNumbers = new Set();
  const recordDuplicate = (rawNumber, classification, viaFallback) => {
    const wantedNumber = Number(rawNumber);
    const issue = candidateIssues.find((c) => c.issue.number === wantedNumber)?.issue;
    if (!issue || flaggedNumbers.has(issue.number)) return;
    flaggedNumbers.add(issue.number);
    duplicates.push({
      issue,
      similarity: classification === 'DUPLICATE' ? 'high' : 'medium'
    });
    if (viaFallback) {
      console.log('Found similar issue (fallback): #' + issue.number + ' - ' + issue.title);
    } else {
      console.log('Found ' + classification.toLowerCase() + ' issue: #' + issue.number + ' - ' + issue.title);
    }
  };

  // Parse detailed analysis JSON
  try {
    const jsonMatch = detailAnalysis.match(/\{.*\}/s);
    if (jsonMatch) {
      const jsonData = JSON.parse(jsonMatch[0]);
      const results = Array.isArray(jsonData.duplicates) ? jsonData.duplicates : [];

      for (const result of results) {
        if (result?.classification === 'DUPLICATE' || result?.classification === 'SIMILAR') {
          recordDuplicate(result.issue_number, result.classification, false);
        }
      }
    }
  } catch (parseError) {
    console.log('Failed to parse detailed analysis JSON, using fallback');
    // Fallback: the text looked like JSON but did not parse (for example it
    // was cut off by max_tokens). Salvage every complete
    // "issue_number"/"classification" pair so each issue keeps its own
    // classification, instead of applying one global DUPLICATE/SIMILAR
    // keyword to every candidate that happens to be mentioned.
    const entryPattern = /"issue_number"\s*:\s*"?#?(\d+)"?\s*,\s*"classification"\s*:\s*"(DUPLICATE|SIMILAR)"/g;
    for (const [, rawNumber, classification] of detailAnalysis.matchAll(entryPattern)) {
      recordDuplicate(rawNumber, classification, true);
    }
  }
} else {
  const errorText = await detailResponse.text();
  console.log('Detailed analysis failed after retries: ' + detailResponse.status + ' - ' + errorText);
}
284+
285+
} catch (error) {
286+
console.log('Error in AI analysis: ' + error.message);
287+
}
288+
289+
// Post a single comment on the new issue listing what the AI analysis found.
// Entries with similarity 'high' go under "Likely Duplicates", entries with
// 'medium' under "Similar Issues"; nothing is posted when the list is empty.
if (duplicates.length > 0) {
  // Issue titles are user-controlled: escape backslashes and square brackets
  // so a title cannot terminate or corrupt the Markdown link it is placed in.
  const escapeLinkText = (text) => String(text).replace(/[\\\[\]]/g, '\\$&');

  // Renders one headed bullet list, or an empty string when there are no entries.
  const renderSection = (heading, entries) => {
    if (entries.length === 0) return '';
    const lines = entries.map(({ issue }) =>
      '- #' + issue.number + ' - [' + escapeLinkText(issue.title) + '](' + issue.html_url + ')\n');
    return heading + '\n' + lines.join('') + '\n';
  };

  const highPriority = duplicates.filter((d) => d.similarity === 'high');
  const mediumPriority = duplicates.filter((d) => d.similarity === 'medium');

  const commentBody =
    '👋 **Potential duplicate issues detected**\n\n' +
    'This issue appears to be similar to existing open issues:\n\n' +
    renderSection('### 🚨 Likely Duplicates', highPriority) +
    renderSection('### 🔍 Similar Issues', mediumPriority) +
    'Please review these issues to see if your issue is already covered. ' +
    'If this is indeed a duplicate, consider closing this issue and contributing to the existing discussion.\n\n' +
    '---\n' +
    '*This comment was automatically generated using AI to help identify potential duplicates.*';

  await github.rest.issues.createComment({
    owner,
    repo,
    issue_number: issueNumber,
    body: commentBody,
  });

  console.log('Posted comment with ' + duplicates.length + ' potential duplicate(s)');
} else {
  console.log('No potential duplicates found');
}

0 commit comments

Comments
 (0)