Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 34 additions & 0 deletions migrations/0004_add_canonical_url_and_composite_uniqueness.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
-- Migration: Add canonical_url metadata and composite uniqueness for PR identity
-- Created: 2026-03-25
-- Description:
-- 1) Adds canonical_url column for normalized PR URL storage
-- 2) Backfills canonical_url for existing rows
-- 3) Removes duplicate rows by (repo_owner, repo_name, pr_number), keeping latest id
-- 4) Enforces uniqueness on (repo_owner, repo_name, pr_number)

ALTER TABLE prs ADD COLUMN canonical_url TEXT;

UPDATE prs
SET canonical_url = 'https://github.com/' || repo_owner || '/' || repo_name || '/pull/' || pr_number
WHERE canonical_url IS NULL OR canonical_url = '';

WITH ranked_prs AS (
SELECT
id,
ROW_NUMBER() OVER (
PARTITION BY LOWER(repo_owner), LOWER(repo_name), pr_number
ORDER BY readiness_computed_at DESC NULLS LAST,
updated_at DESC NULLS LAST,
id DESC
) AS row_num
FROM prs
)
DELETE FROM prs
WHERE id IN (
SELECT id
FROM ranked_prs
WHERE row_num > 1
);

CREATE UNIQUE INDEX IF NOT EXISTS idx_prs_identity_unique
ON prs(repo_owner COLLATE NOCASE, repo_name COLLATE NOCASE, pr_number);
5 changes: 3 additions & 2 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,9 @@
"db:migrations:list": "wrangler d1 migrations list pr_tracker",
"lint": "echo 'Linting not configured yet' && exit 0",
"format:check": "echo 'Format checking not configured yet' && exit 0",
"test": "node test-data-display.js",
"test:data-display": "node test-data-display.js"
"test": "node test-data-display.js && node test-url-validation.js",
"test:data-display": "node test-data-display.js",
"test:url-validation": "node test-url-validation.js"
},
"keywords": [
"github",
Expand Down
243 changes: 211 additions & 32 deletions public/index.html

Large diffs are not rendered by default.

11 changes: 7 additions & 4 deletions src/database.py
Original file line number Diff line number Diff line change
Expand Up @@ -261,14 +261,17 @@ async def delete_readiness_from_db(env, pr_id):
async def upsert_pr(db, pr_url, owner, repo, pr_number, pr_data):
"""Helper to insert or update PR in database (Deduplicates logic)"""
current_timestamp = datetime.now(timezone.utc).isoformat().replace('+00:00', 'Z')
canonical_url = f'https://github.com/{owner}/{repo}/pull/{pr_number}'

stmt = db.prepare('''
INSERT INTO prs (pr_url, repo_owner, repo_name, pr_number, title, state,
INSERT INTO prs (pr_url, canonical_url, repo_owner, repo_name, pr_number, title, state,
is_merged, mergeable_state, files_changed, author_login, author_avatar, repo_owner_avatar, checks_passed, checks_failed, checks_skipped,
commits_count, behind_by, review_status, last_updated_at,
last_refreshed_at, updated_at, is_draft, open_conversations_count, reviewers_json, etag)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
ON CONFLICT(pr_url) DO UPDATE SET
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
ON CONFLICT(repo_owner, repo_name, pr_number) DO UPDATE SET
pr_url = excluded.pr_url,
canonical_url = excluded.canonical_url,
title = excluded.title,
state = excluded.state,
is_merged = excluded.is_merged,
Expand All @@ -289,7 +292,7 @@ async def upsert_pr(db, pr_url, owner, repo, pr_number, pr_data):
reviewers_json = excluded.reviewers_json,
etag = excluded.etag
''').bind(
pr_url, owner, repo, pr_number,
pr_url, canonical_url, owner, repo, pr_number,
pr_data.get('title') or '',
pr_data.get('state') or '',
1 if pr_data.get('is_merged') else 0,
Expand Down
44 changes: 27 additions & 17 deletions src/handlers.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,24 +85,32 @@ async def handle_add_pr(request, env):

db = get_db(env)

normalized_pr_url = pr_url.strip()

# Auto-detect URL type: if it's an org or repo URL (not a PR URL),
# automatically treat as bulk import regardless of checkbox state
if not add_all:
# Check if the URL is an org URL or a repo URL (not a PR URL)
if parse_org_url(pr_url) or (parse_repo_url(pr_url) and '/pull/' not in pr_url):
is_pr_url = False
try:
parse_pr_url(normalized_pr_url)
is_pr_url = True
except ValueError:
is_pr_url = False

if not is_pr_url and (parse_org_url(normalized_pr_url) or parse_repo_url(normalized_pr_url)):
add_all = True

if add_all:
org_owner = ''
# Try parsing as a repo URL first (e.g. https://github.com/owner/repo)
parsed = parse_repo_url(pr_url)
parsed = parse_repo_url(normalized_pr_url)
if parsed:
# Single repo import
repos_to_import = [{'owner': parsed['owner'], 'name': parsed['repo']}]
is_org_import = False
else:
# Parse org url
org_parsed = parse_org_url(pr_url)
org_parsed = parse_org_url(normalized_pr_url)
if not org_parsed:
return Response.new(json.dumps({'error': 'Invalid GitHub Repository or Organization URL'}),
{'status': 400, 'headers': {'Content-Type': 'application/json'}})
Expand Down Expand Up @@ -211,7 +219,8 @@ async def handle_add_pr(request, env):
'reviewers_json': '[]'
}

await upsert_pr(db, item['html_url'], owner, repo, item['number'], pr_data)
canonical_pr_url = f"https://github.com/{owner}/{repo}/pull/{item['number']}"
await upsert_pr(db, canonical_pr_url, owner, repo, item['number'], pr_data)
added_count += 1

# Build response message
Expand All @@ -236,7 +245,7 @@ async def handle_add_pr(request, env):
# Add single pr
# Catch ValueError from parse_pr_url
try:
parsed = parse_pr_url(pr_url)
parsed = parse_pr_url(normalized_pr_url)
except ValueError as e:
return Response.new(
json.dumps({'error': str(e)}),
Expand All @@ -259,27 +268,28 @@ async def handle_add_pr(request, env):
return Response.new(json.dumps({'error': 'Cannot add merged/closed PRs'}),
{'status': 400, 'headers': {'Content-Type': 'application/json'}})

await upsert_pr(db, pr_url, parsed['owner'], parsed['repo'], parsed['pr_number'], pr_data)
canonical_pr_url = parsed['canonical_url']
await upsert_pr(db, canonical_pr_url, parsed['owner'], parsed['repo'], parsed['pr_number'], pr_data)

# Auto-run readiness analysis for the newly added PR
readiness_data = None
try:
pr_result = await db.prepare(
'SELECT * FROM prs WHERE pr_url = ?'
).bind(pr_url).first()
'SELECT * FROM prs WHERE repo_owner = ? COLLATE NOCASE AND repo_name = ? COLLATE NOCASE AND pr_number = ?'
).bind(parsed['owner'], parsed['repo'], parsed['pr_number']).first()
if pr_result:
pr_row = pr_result.to_py()
readiness_data = await _run_readiness_analysis(env, pr_row, pr_row['id'], user_token)
except Exception as analysis_err:
print(f"Auto-analysis failed for PR {pr_url}: {type(analysis_err).__name__}: {str(analysis_err)}")
print(f"Auto-analysis failed for PR {canonical_pr_url}: {type(analysis_err).__name__}: {str(analysis_err)}")

# Include repo_owner, repo_name, pr_number, and pr_url in the response for frontend display
response_data = {
**pr_data,
'repo_owner': parsed['owner'],
'repo_name': parsed['repo'],
'pr_number': parsed['pr_number'],
'pr_url': pr_url
'pr_url': canonical_pr_url
}

result_obj = {'success': True, 'data': response_data}
Expand Down Expand Up @@ -1110,8 +1120,8 @@ async def handle_github_webhook(request, env):
db = get_db(env)
pr_url = f"https://github.com/{repo_owner}/{repo_name}/pull/{pr_number}"
result = await db.prepare(
'SELECT id FROM prs WHERE pr_url = ?'
).bind(pr_url).first()
'SELECT id FROM prs WHERE repo_owner = ? COLLATE NOCASE AND repo_name = ? COLLATE NOCASE AND pr_number = ?'
).bind(repo_owner, repo_name, pr_number).first()

# Handle opened PRs - add to tracking automatically
if action == 'opened':
Expand All @@ -1136,8 +1146,8 @@ async def handle_github_webhook(request, env):

# Get the newly created PR ID
new_result = await db.prepare(
'SELECT id FROM prs WHERE pr_url = ?'
).bind(pr_url).first()
'SELECT id FROM prs WHERE repo_owner = ? COLLATE NOCASE AND repo_name = ? COLLATE NOCASE AND pr_number = ?'
).bind(repo_owner, repo_name, pr_number).first()
new_pr_id = new_result.to_py()['id'] if new_result else None

return Response.new(
Expand Down Expand Up @@ -1295,8 +1305,8 @@ async def handle_github_webhook(request, env):
for pr_number, repo_owner, repo_name in prs_to_update:
pr_url = f"https://github.com/{repo_owner}/{repo_name}/pull/{pr_number}"
result = await db.prepare(
'SELECT id FROM prs WHERE pr_url = ?'
).bind(pr_url).first()
'SELECT id FROM prs WHERE repo_owner = ? COLLATE NOCASE AND repo_name = ? COLLATE NOCASE AND pr_number = ?'
).bind(repo_owner, repo_name, pr_number).first()

if not result:
# PR not being tracked - skip it
Expand Down
88 changes: 56 additions & 32 deletions src/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,50 +11,73 @@
# Reduces overall readiness score by 33% when mergeable state is 'dirty' (conflicts)
_MERGE_CONFLICTS_SCORE_MULTIPLIER = 0.67

_GITHUB_PR_RE = re.compile(
r'^https?://github\.com/([^/?#\s]+)/([^/?#\s]+)/pull/(\d+)(?:[/?#].*)?$',
re.IGNORECASE,
)
_GITHUB_REPO_RE = re.compile(
r'^https?://github\.com/(?!orgs(?:/|$))([^/?#\s]+)/([^/?#\s]+)/?(?:[?#].*)?$',
re.IGNORECASE,
)
_GITHUB_ORG_RE = re.compile(
r'^https?://github\.com/(?:orgs/)?([A-Za-z0-9_.-]+)/?(?:[?#].*)?$',
re.IGNORECASE,
)

def parse_pr_url(pr_url):

def parse_pr_url(pr_url: str):
"""
Parse GitHub PR URL to extract owner, repo, and PR number.

Security Hardening (Issue #45):
- Type validation to prevent type confusion attacks
- Anchored regex pattern to block malformed URLs with trailing junk
- Raises ValueError instead of returning None for better error handling
Validates and parses a GitHub PR URL.
Returns a dict with owner, repo, and pr_number or raises ValueError.
"""
# FIX Issue #45: Type validation
if not isinstance(pr_url, str):
raise ValueError("PR URL must be a string")

if not pr_url:
raise ValueError("PR URL is required")

pr_url = pr_url.strip().rstrip('/')

# FIX Issue #45: Anchored regex - must match EXACTLY, no trailing junk allowed
pattern = r'^https?://github\.com/([^/]+)/([^/]+)/pull/(\d+)$'
match = re.match(pattern, pr_url)

normalized_url = pr_url.strip()
match = _GITHUB_PR_RE.match(normalized_url)

if not match:
# FIX Issue #45: Raise error instead of returning None
raise ValueError("Invalid GitHub PR URL. Format: https://github.com/OWNER/REPO/pull/NUMBER")

raise ValueError(
"Invalid GitHub PR URL. Expected format: "
"https://github.com/owner/repo/pull/123"
)

owner, repo, pr_num_str = match.group(1), match.group(2), match.group(3)
pr_number = int(pr_num_str)

if pr_number < 1:
raise ValueError("PR number must be a positive integer.")

# Return as a dictionary to match the original function's signature
return {
'owner': match.group(1),
'repo': match.group(2),
'pr_number': int(match.group(3))
'owner': owner,
'repo': repo,
'pr_number': pr_number,
'canonical_url': f"https://github.com/{owner}/{repo}/pull/{pr_number}"
}


def parse_repo_url(url):
"""Parse GitHub Repo URL to extract owner and repo name"""
if not url: return None
url = url.strip().rstrip('/')
pattern = r'https?://github\.com/([^/]+)/([^/]+)(?:/.*)?$'
match = re.match(pattern, url)
if not url:
return None
normalized_url = url.strip()
match = _GITHUB_REPO_RE.match(normalized_url)
if match:
owner = match.group(1)
# Security guard: don't misroute /orgs/<owner> bulk URLs to parse_repo_url()
# GitHub uses /orgs/<owner> for organization homepages/dashboards.
if owner.lower() == 'orgs':
return None

repo = match.group(2)
return {
'owner': match.group(1),
'repo': match.group(2)
'owner': owner,
'repo': repo,
'canonical_url': f"https://github.com/{owner}/{repo}"
}
return None

Expand All @@ -65,21 +88,22 @@ def parse_org_url(url):
"""
if not url:
return None
url = url.strip().rstrip('/')
# Match org/user URL: github.com/<owner> with no further path segments
pattern = r'^https?://github\.com/([A-Za-z0-9_.-]+)$'
match = re.match(pattern, url)
normalized_url = url.strip()
match = _GITHUB_ORG_RE.match(normalized_url)
if match:
owner = match.group(1)
# Exclude GitHub reserved paths that aren't orgs/users
reserved = {'settings', 'organizations', 'explore', 'marketplace',
'notifications', 'new', 'login', 'signup', 'features',
'enterprise', 'pricing', 'topics', 'collections',
'trending', 'sponsors', 'about', 'security', 'pulls',
'trending', 'sponsors', 'about', 'security', 'pulls', 'orgs',
'issues', 'codespaces', 'discussions'}
if owner.lower() in reserved:
return None
return {'owner': owner}
return {
'owner': owner,
'canonical_url': f"https://github.com/{owner}"
}
return None


Expand Down
Loading
Loading