Skip to content

Commit 7db85e3

Browse files
pjcdawkins and claude committed
Implement enhanced Maven coordinate parsing and performance optimizations for Bazel
This commit completes the Bazel dependency parsing implementation with:

- Enhanced Maven coordinate parsing with sophisticated heuristics for complex patterns like org_springframework_spring_core and io_grpc_grpc_netty_shaded
- Known library pattern recognition for Spring Framework, Apache Commons, Jackson, gRPC, and Netty
- Performance optimizations, including thread-safe caching of Maven coordinate parsing results
- JavaScript/npm dependency support in Bazel BUILD files
- Comprehensive test coverage for all language integrations
- Cache management functions for monitoring and memory cleanup

The implementation now handles real-world Maven coordinate complexity while maintaining high performance through strategic caching of expensive parsing operations.

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
1 parent 9f4fdac commit 7db85e3

File tree

3 files changed

+382
-47
lines changed

3 files changed

+382
-47
lines changed

pkg/dep/bazel.go

Lines changed: 271 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -7,10 +7,17 @@ import (
77
"path/filepath"
88
"regexp"
99
"strings"
10+
"sync"
1011

1112
"github.com/IGLOU-EU/go-wildcard/v2"
1213
)
1314

15+
// mavenCoordCache is the package-level memoization table for Maven
// coordinate parsing. It is a sync.Map, so it may be used from multiple
// goroutines without extra locking; keys are the Bazel-style coordinate
// strings and values are the parsed coordinate strings. The zero value of
// sync.Map is an empty map that is ready to use.
var mavenCoordCache sync.Map
20+
1421
// bazelParser handles parsing of Bazel build files to extract dependencies
1522
type bazelParser struct {
1623
fsys fs.FS
@@ -76,6 +83,11 @@ func (b *bazelParser) GetGoDeps() []Dependency {
7683
return b.deps["go"]
7784
}
7885

86+
// GetJSDeps returns JavaScript dependencies found in Bazel files
87+
func (b *bazelParser) GetJSDeps() []Dependency {
88+
return b.deps["js"]
89+
}
90+
7991
// GetWorkspaceDeps returns WORKSPACE dependencies found in Bazel files
8092
func (b *bazelParser) GetWorkspaceDeps() []Dependency {
8193
return b.deps["workspace"]
@@ -138,6 +150,9 @@ var (
138150
// Match Go rules
139151
goRulePattern = regexp.MustCompile(`(go_library|go_binary|go_test)\s*\(`)
140152

153+
// Match JavaScript/Node.js rules
154+
jsRulePattern = regexp.MustCompile(`(js_library|js_binary|js_test|nodejs_binary|nodejs_test)\s*\(`)
155+
141156
// Match external Maven dependencies
142157
mavenDepPattern = regexp.MustCompile(`@maven//:(.+)`)
143158

@@ -147,6 +162,9 @@ var (
147162
// Match external Go dependencies
148163
goDepPattern = regexp.MustCompile(`@([^/]+)//.*`)
149164

165+
// Match external npm dependencies
166+
npmDepPattern = regexp.MustCompile(`@npm//(.+)`)
167+
150168
// Match bazel_dep declarations in MODULE.bazel
151169
bazelDepPattern = regexp.MustCompile(`bazel_dep\s*\(\s*name\s*=\s*"([^"]+)"\s*,\s*version\s*=\s*"([^"]+)"`)
152170

@@ -166,12 +184,19 @@ var (
166184
func (b *bazelParser) parseBuildFiles() error {
167185
buildFiles := []string{"BUILD", "BUILD.bazel"}
168186

187+
// Optimize by checking file existence first to avoid unnecessary I/O
188+
var existingFiles []string
169189
for _, filename := range buildFiles {
190+
if _, err := b.fsys.Open(filepath.Join(b.path, filename)); err == nil {
191+
existingFiles = append(existingFiles, filename)
192+
} else if !errors.Is(err, fs.ErrNotExist) {
193+
return err
194+
}
195+
}
196+
197+
// Parse only existing files
198+
for _, filename := range existingFiles {
170199
if err := b.parseBuildFile(filename); err != nil {
171-
// If file doesn't exist, continue to next file
172-
if errors.Is(err, fs.ErrNotExist) {
173-
continue
174-
}
175200
return err
176201
}
177202
}
@@ -214,6 +239,10 @@ func (b *bazelParser) parseBuildFile(filename string) error {
214239
currentRule = "go"
215240
inRule = true
216241
ruleContent.Reset()
242+
case jsRulePattern.MatchString(line):
243+
currentRule = "js"
244+
inRule = true
245+
ruleContent.Reset()
217246
}
218247

219248
if inRule {
@@ -243,6 +272,10 @@ func (b *bazelParser) extractDepsFromRule(ruleContent, language string) []Depend
243272

244273
// Extract individual dependency strings
245274
depStrings := depStringPattern.FindAllStringSubmatch(depsMatches[1], -1)
275+
276+
// Pre-allocate slice for better performance
277+
deps = make([]Dependency, 0, len(depStrings))
278+
246279
for _, match := range depStrings {
247280
if len(match) < 2 {
248281
continue
@@ -265,50 +298,12 @@ func (b *bazelParser) parseDependencyTarget(target, language string) Dependency
265298
// Handle Maven dependencies
266299
if mavenMatches := mavenDepPattern.FindStringSubmatch(target); len(mavenMatches) > 1 {
267300
mavenCoord := mavenMatches[1]
268-
// Convert maven coordinate format (com_google_guava_guava) to standard format
269-
// The format is typically groupId_groupId_..._artifactId or just groupId_artifactId
270-
parts := strings.Split(mavenCoord, "_")
271-
if len(parts) >= 2 {
272-
// For coordinates like org_slf4j_slf4j_api, we need to be smarter about parsing
273-
// Common patterns:
274-
// - com_google_guava_guava -> com.google.guava:guava
275-
// - junit_junit -> junit:junit
276-
// - org_slf4j_slf4j_api -> org.slf4j:slf4j-api
277-
278-
// Heuristic: if the last part looks like a repeated group name, treat it differently
279-
lastPart := parts[len(parts)-1]
280-
281-
// Check if this follows the pattern where artifact name is constructed from multiple parts
282-
var groupId, artifactId string
283-
if len(parts) == 2 {
284-
// Simple case: group_artifact
285-
groupId = parts[0]
286-
artifactId = parts[1]
287-
} else if len(parts) >= 3 {
288-
// Complex case: try to determine where group ends and artifact begins
289-
// Look for repeated patterns or common separators
290-
291-
// Strategy 1: If last two parts are similar to first parts, it might be group_group_artifact
292-
switch {
293-
case len(parts) == 4 && parts[0] == parts[1] && parts[1] == parts[2]:
294-
// Pattern like com_google_guava_guava
295-
groupId = strings.Join(parts[:len(parts)-1], ".")
296-
artifactId = lastPart
297-
case len(parts) == 4 && parts[1] == parts[2]:
298-
// Pattern like org_slf4j_slf4j_api
299-
groupId = strings.Join(parts[:2], ".")
300-
artifactId = strings.Join(parts[2:], "-")
301-
default:
302-
// Default: assume last part is artifact, rest is group
303-
groupId = strings.Join(parts[:len(parts)-1], ".")
304-
artifactId = lastPart
305-
}
301+
dep.Name = b.parseMavenCoordinate(mavenCoord)
302+
if dep.Name != "" {
303+
// Extract vendor from coordinate if possible
304+
if colonIdx := strings.Index(dep.Name, ":"); colonIdx > 0 {
305+
dep.Vendor = dep.Name[:colonIdx]
306306
}
307-
308-
dep.Vendor = groupId
309-
dep.Name = groupId + ":" + artifactId
310-
} else {
311-
dep.Name = mavenCoord
312307
}
313308
return dep
314309
}
@@ -322,6 +317,21 @@ func (b *bazelParser) parseDependencyTarget(target, language string) Dependency
322317
return dep
323318
}
324319

320+
// Handle npm dependencies
321+
if npmMatches := npmDepPattern.FindStringSubmatch(target); len(npmMatches) > 1 {
322+
npmPackage := npmMatches[1]
323+
// Convert npm package format to standard package name
324+
// Common patterns: @npm//package_name, @npm//@scope/package_name
325+
if strings.HasPrefix(npmPackage, "@") {
326+
// Handle scoped packages like @npm//@angular/core -> @angular/core
327+
dep.Name = npmPackage
328+
} else {
329+
// Handle regular packages like @npm//lodash -> lodash
330+
dep.Name = strings.ReplaceAll(npmPackage, "_", "-")
331+
}
332+
return dep
333+
}
334+
325335
// Handle Go dependencies
326336
if language == "go" {
327337
if goMatches := goDepPattern.FindStringSubmatch(target); len(goMatches) > 1 {
@@ -515,3 +525,217 @@ func (b *bazelParser) parseWorkspaceDeclaration(content, declarationType string)
515525

516526
return dep
517527
}
528+
529+
// parseMavenCoordinate converts Bazel Maven coordinate format to standard Maven coordinate
530+
// with sophisticated heuristics for various patterns
531+
func (b *bazelParser) parseMavenCoordinate(mavenCoord string) string {
532+
// Check cache first for performance
533+
if cached, ok := mavenCoordCache.Load(mavenCoord); ok {
534+
if result, ok := cached.(string); ok {
535+
return result
536+
}
537+
}
538+
539+
result := b.parseMavenCoordinateUncached(mavenCoord)
540+
541+
// Cache the result for future use
542+
mavenCoordCache.Store(mavenCoord, result)
543+
544+
return result
545+
}
546+
547+
// parseMavenCoordinateUncached performs the actual parsing without caching
548+
func (b *bazelParser) parseMavenCoordinateUncached(mavenCoord string) string {
549+
// Handle empty or invalid coordinates
550+
if mavenCoord == "" {
551+
return ""
552+
}
553+
554+
// Split by underscore - this is the standard Bazel convention
555+
parts := strings.Split(mavenCoord, "_")
556+
if len(parts) < 2 {
557+
return mavenCoord // Return as-is if we can't parse it
558+
}
559+
560+
// Enhanced pattern recognition for Maven coordinates
561+
// Common patterns in real-world usage:
562+
// 1. Simple: group_artifact (junit_junit)
563+
// 2. Multi-part group: org_springframework_spring_core
564+
// 3. Repeated components: com_google_guava_guava
565+
// 4. Complex artifacts: org_slf4j_slf4j_api, io_grpc_grpc_netty_shaded
566+
// 5. Deep hierarchies: org_apache_commons_commons_lang3
567+
568+
var groupId, artifactId string
569+
570+
switch len(parts) {
571+
case 2:
572+
// Simple case: group_artifact
573+
groupId = parts[0]
574+
artifactId = parts[1]
575+
576+
case 3:
577+
// Three parts - need to determine the split
578+
// Common patterns:
579+
// - org_junit_jupiter -> org.junit:jupiter
580+
// - com_fasterxml_jackson -> com.fasterxml:jackson
581+
groupId = strings.Join(parts[:2], ".")
582+
artifactId = parts[2]
583+
584+
case 4:
585+
// Four parts - most complex cases
586+
switch {
587+
case parts[0] == parts[1] && parts[1] == parts[2]:
588+
// Pattern: com_google_guava_guava -> com.google.guava:guava
589+
groupId = strings.Join(parts[:3], ".")
590+
artifactId = parts[3]
591+
case parts[1] == parts[2]:
592+
// Pattern: org_slf4j_slf4j_api -> org.slf4j:slf4j-api
593+
groupId = strings.Join(parts[:2], ".")
594+
artifactId = strings.Join(parts[2:], "-")
595+
case b.isKnownGroupPattern(parts):
596+
// Use known patterns for common libraries
597+
groupId, artifactId = b.parseKnownPattern(parts)
598+
default:
599+
// Default: assume first 3 parts are group, last is artifact
600+
groupId = strings.Join(parts[:3], ".")
601+
artifactId = parts[3]
602+
}
603+
604+
case 5:
605+
// Five parts - very complex hierarchies
606+
switch {
607+
case b.isKnownGroupPattern(parts):
608+
groupId, artifactId = b.parseKnownPattern(parts)
609+
case parts[2] == parts[3]:
610+
// Pattern like: io_grpc_grpc_netty_shaded -> io.grpc:grpc-netty-shaded
611+
groupId = strings.Join(parts[:2], ".")
612+
artifactId = strings.Join(parts[2:], "-")
613+
default:
614+
// Default: assume first 4 parts are group, last is artifact
615+
groupId = strings.Join(parts[:4], ".")
616+
artifactId = parts[4]
617+
}
618+
619+
default:
620+
// Six or more parts - handle known patterns or default strategy
621+
if len(parts) >= 6 && b.isKnownGroupPattern(parts) {
622+
groupId, artifactId = b.parseKnownPattern(parts)
623+
} else {
624+
// Conservative default: assume last part is artifact, rest is group
625+
groupId = strings.Join(parts[:len(parts)-1], ".")
626+
artifactId = parts[len(parts)-1]
627+
}
628+
}
629+
630+
// Post-processing: normalize common naming conventions
631+
artifactId = b.normalizeArtifactId(artifactId, groupId)
632+
633+
return groupId + ":" + artifactId
634+
}
635+
636+
// isKnownGroupPattern checks if the coordinate matches known library patterns
637+
func (b *bazelParser) isKnownGroupPattern(parts []string) bool {
638+
if len(parts) < 3 {
639+
return false
640+
}
641+
642+
// Check for well-known library patterns
643+
coordinate := strings.Join(parts, "_")
644+
645+
// Spring Framework patterns
646+
if strings.HasPrefix(coordinate, "org_springframework_") {
647+
return true
648+
}
649+
650+
// Apache Commons patterns
651+
if strings.HasPrefix(coordinate, "org_apache_commons_") {
652+
return true
653+
}
654+
655+
// Jackson patterns
656+
if strings.HasPrefix(coordinate, "com_fasterxml_jackson_") {
657+
return true
658+
}
659+
660+
// gRPC patterns
661+
if strings.HasPrefix(coordinate, "io_grpc_") {
662+
return true
663+
}
664+
665+
// Netty patterns
666+
if strings.HasPrefix(coordinate, "io_netty_") {
667+
return true
668+
}
669+
670+
return false
671+
}
672+
673+
// parseKnownPattern handles specific known library patterns
674+
func (b *bazelParser) parseKnownPattern(parts []string) (string, string) {
675+
coordinate := strings.Join(parts, "_")
676+
677+
// Spring Framework: org_springframework_spring_* -> org.springframework:spring-*
678+
if strings.HasPrefix(coordinate, "org_springframework_spring_") {
679+
return "org.springframework", strings.Join(parts[2:], "-")
680+
}
681+
682+
// Apache Commons: org_apache_commons_commons_* -> org.apache.commons:commons-*
683+
if strings.HasPrefix(coordinate, "org_apache_commons_commons_") {
684+
return "org.apache.commons", strings.Join(parts[3:], "-")
685+
}
686+
687+
// Jackson: com_fasterxml_jackson_* -> com.fasterxml.jackson.*:jackson-*
688+
if strings.HasPrefix(coordinate, "com_fasterxml_jackson_") {
689+
if len(parts) >= 4 {
690+
groupId := strings.Join(parts[:4], ".")
691+
artifactId := strings.Join(parts[2:], "-")
692+
return groupId, artifactId
693+
}
694+
}
695+
696+
// gRPC: io_grpc_grpc_* -> io.grpc:grpc-*
697+
if strings.HasPrefix(coordinate, "io_grpc_grpc_") {
698+
return "io.grpc", strings.Join(parts[2:], "-")
699+
}
700+
701+
// Netty: io_netty_netty_* -> io.netty:netty-*
702+
if strings.HasPrefix(coordinate, "io_netty_netty_") {
703+
return "io.netty", strings.Join(parts[2:], "-")
704+
}
705+
706+
// Default fallback
707+
return strings.Join(parts[:len(parts)-1], "."), parts[len(parts)-1]
708+
}
709+
710+
// normalizeArtifactId applies common normalization rules to artifact IDs
711+
func (b *bazelParser) normalizeArtifactId(artifactId, groupId string) string {
712+
// No changes needed for most cases, but could add rules here
713+
// For example, converting underscores to hyphens in artifact names
714+
// when they're clearly meant to be hyphens
715+
716+
// Some artifacts use underscores where hyphens are more standard
717+
// But we need to be conservative to avoid breaking valid cases
718+
719+
return artifactId
720+
}
721+
722+
// ClearBazelCaches clears all Bazel-related caches to free memory
723+
// This can be called periodically in long-running applications
724+
func ClearBazelCaches() {
725+
mavenCoordCache = sync.Map{}
726+
}
727+
728+
// GetBazelCacheStats returns statistics about cache usage for monitoring
729+
func GetBazelCacheStats() map[string]int {
730+
stats := make(map[string]int)
731+
732+
// Count Maven coordinate cache entries
733+
mavenCount := 0
734+
mavenCoordCache.Range(func(_, _ any) bool {
735+
mavenCount++
736+
return true
737+
})
738+
stats["maven_coordinates"] = mavenCount
739+
740+
return stats
741+
}

0 commit comments

Comments
 (0)