-
Notifications
You must be signed in to change notification settings - Fork 2
Add MANIFEST.MF fallback parser for JARs without pom.properties #150
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,311 @@ | ||
| package java | ||
|
|
||
| import ( | ||
| "archive/zip" | ||
| "bufio" | ||
| "path/filepath" | ||
| "strings" | ||
|
|
||
| "github.com/DataDog/datadog-sbom-generator/internal/cachedregexp" | ||
| "github.com/DataDog/datadog-sbom-generator/pkg/lockfile" | ||
| "github.com/DataDog/datadog-sbom-generator/pkg/models" | ||
| ) | ||
|
|
||
// jarFilenameRegex matches Maven-convention JAR filenames: artifactId-version.jar
// The version must start with a digit. The artifactId uses a greedy match so that
// the split occurs at the LAST hyphen-digit boundary, not the first. This correctly
// handles artifactIds that themselves contain a hyphen-digit segment, e.g.
// log4j-1.2-api-2.17.1.jar → artifactId="log4j-1.2-api", version="2.17.1".
//
// Capture group 1 is the artifactId and capture group 2 is the version, which may
// still carry a classifier suffix. Because the split is made at the last
// hyphen-digit boundary, a version that itself contains a hyphen followed by a
// digit (e.g. my-lib-1.0-20260513.123456-1.jar) is split inside the version when
// this pattern is applied on its own.
var jarFilenameRegex = cachedregexp.MustCompile(`^(.+)-(\d.*)\.jar$`)
|
|
||
// jarClassifierRegex matches known Maven classifier suffixes appended at the end of a
// version string in a JAR filename. Classifiers are either OS/architecture identifiers
// (linux, windows, osx, …) optionally followed by an arch token, or well-known
// descriptor strings (sources, javadoc, native, …).
//
// Version qualifiers such as -SNAPSHOT, -Final, -RC1 are intentionally excluded so
// they are never stripped.
//
// Both alternatives are anchored at the end of the input and only match lowercase
// text; the pattern carries no case-insensitivity flag.
var jarClassifierRegex = cachedregexp.MustCompile(
	`-(?:linux|windows|osx|macos|darwin|freebsd|sunos|solaris|aix)(?:[_-][a-z0-9_]+)*$` +
		`|-(?:sources|javadoc|tests|native|all|uber|shaded|assembly|no_aop)$`,
)
|
|
||
| // parseJarFilename extracts artifactId and version from a JAR filename following | ||
| // Maven naming conventions. If the version portion contains a known classifier suffix | ||
| // (e.g. "-linux-x86_64", "-sources"), the classifier is stripped so that only the | ||
| // canonical Maven version is returned. Returns empty strings if the filename doesn't | ||
| // match the expected pattern. | ||
| func parseJarFilename(filename string) (artifactID, version string) { | ||
| matches := jarFilenameRegex.FindStringSubmatch(filename) | ||
| if matches == nil { | ||
| return "", "" | ||
| } | ||
|
|
||
| return matches[1], jarClassifierRegex.ReplaceAllString(matches[2], "") | ||
| } | ||
|
|
||
// cleanBundleSymbolicName returns the symbolic-name portion of a raw
// Bundle-SymbolicName value: surrounding whitespace is removed and anything from
// the first ';' onwards (the OSGi directives) is discarded.
func cleanBundleSymbolicName(raw string) string {
	// Cut yields the whole input when no ';' is present, so one trailing
	// TrimSpace covers both the "with directives" and "without" cases.
	symbolicName, _, _ := strings.Cut(strings.TrimSpace(raw), ";")

	return strings.TrimSpace(symbolicName)
}
|
|
||
// cleanName turns a manifest attribute value into an artifactId-like token.
// The value is trimmed and lowercased, each space becomes a hyphen, and every
// character other than a-z, 0-9, '-', '_' and '.' is dropped. An empty or
// whitespace-only input yields "".
func cleanName(raw string) string {
	trimmed := strings.TrimSpace(raw)
	if trimmed == "" {
		return ""
	}

	// normalize maps one rune of the lowercased input to its output rune;
	// a negative result tells strings.Map to drop the rune.
	normalize := func(r rune) rune {
		switch {
		case r == ' ':
			return '-'
		case r >= 'a' && r <= 'z', r >= '0' && r <= '9', r == '-', r == '_', r == '.':
			return r
		default:
			return -1
		}
	}

	return strings.Map(normalize, strings.ToLower(trimmed))
}
|
|
||
// manifestAttrs holds the MANIFEST.MF attributes relevant to fallback package inference.
// Raw attribute values are stored (not cleaned) so callers can apply appropriate cleaning.
type manifestAttrs struct {
	bundleSymbolicName  string // Bundle-SymbolicName; may still carry OSGi directives after ';'
	bundleName          string // Bundle-Name
	bundleVersion       string // Bundle-Version
	implVersion         string // Implementation-Version
	automaticModuleName string // Automatic-Module-Name
}

// parseManifestAttributes reads META-INF/MANIFEST.MF from a zip archive and
// returns the attributes relevant to fallback package inference.
//
// Parsing stops at the first blank line, which ends the main section; per-entry
// sections that follow must not overwrite main-section values.
//
// Attribute names are matched case-insensitively, and the manifest entry itself
// is located case-insensitively when no exact-case "META-INF/MANIFEST.MF" entry
// exists, so JARs produced by tools that do not use the canonical spelling are
// still understood.
//
// A missing manifest is not an error: the zero manifestAttrs and a nil error are
// returned. An error is returned only when the entry cannot be opened or read.
func parseManifestAttributes(zipReader *zip.Reader) (manifestAttrs, error) {
	const manifestPath = "META-INF/MANIFEST.MF"

	// Prefer the exact-case entry; otherwise fall back to the first entry whose
	// path differs only by case.
	var manifestEntry *zip.File
	for _, entry := range zipReader.File {
		if entry.Name == manifestPath {
			manifestEntry = entry

			break
		}

		if manifestEntry == nil && strings.EqualFold(entry.Name, manifestPath) {
			manifestEntry = entry
		}
	}

	if manifestEntry == nil {
		return manifestAttrs{}, nil
	}

	rc, err := manifestEntry.Open()
	if err != nil {
		return manifestAttrs{}, err
	}
	defer rc.Close()

	// Parse MANIFEST.MF format: key-value pairs with continuation lines
	// (lines starting with a space are appended to the previous value).
	// Keys are stored lowercased so lookups are case-insensitive.
	attrs := make(map[string]string)

	var currentKey string

	scanner := bufio.NewScanner(rc)
	for scanner.Scan() {
		line := scanner.Text()

		// A blank line ends the main section of the manifest.
		// Per-entry sections follow and must not overwrite main-section attributes.
		if line == "" {
			break
		}

		// Continuation line: starts with a single space. It extends the previous
		// header; when that header was unparsable the continuation is skipped
		// rather than being interpreted as a header of its own.
		if strings.HasPrefix(line, " ") {
			if currentKey != "" {
				attrs[currentKey] += line[1:]
			}

			continue
		}

		key, value, found := strings.Cut(line, ": ")
		if !found {
			currentKey = ""

			continue
		}

		currentKey = strings.ToLower(key)
		attrs[currentKey] = value
	}

	if err := scanner.Err(); err != nil {
		return manifestAttrs{}, err
	}

	return manifestAttrs{
		bundleSymbolicName:  attrs["bundle-symbolicname"],
		bundleName:          attrs["bundle-name"],
		bundleVersion:       attrs["bundle-version"],
		implVersion:         attrs["implementation-version"],
		automaticModuleName: attrs["automatic-module-name"],
	}, nil
}
|
|
||
| // parseGroupID infers a Maven groupId from MANIFEST.MF attributes using the | ||
| // dot-prefix heuristic from the Java Tracer's Dependency.java guessFallbackNoPom. | ||
| // | ||
| // Primary algorithm (BSN-based): | ||
| // 1. Build candidate names: [filenameArtifact, cleanName(bundleName)] | ||
| // 2. For each candidate, check if BSN ends with "." + candidate AND BSN contains "." AND len(BSN) > 5 | ||
| // 3. If match: groupId = BSN prefix (BSN minus "." + candidate) | ||
| // | ||
| // Fallback when BSN has no dots (poor OSGi metadata, e.g. Bundle-SymbolicName: "bcprov"): | ||
| // 1. Apply the same dot-prefix heuristic to Automatic-Module-Name | ||
| // 2. If no candidate matches, strip the last dot-segment of AMN as a best-effort groupId | ||
| // (requires AMN to have at least 2 dots for confidence) | ||
| // | ||
| // Final fallback: BSN as-is. | ||
| func parseGroupID(bundleSymbolicName, bundleName, filenameArtifact, automaticModuleName string) string { | ||
| if bundleSymbolicName == "" { | ||
| return "" | ||
| } | ||
|
|
||
| // Build candidate list: filename artifact first, then cleaned bundle name | ||
| var candidates []string | ||
| if filenameArtifact != "" { | ||
| candidates = append(candidates, filenameArtifact) | ||
| } | ||
|
|
||
| if cleanedBundleName := cleanName(bundleName); cleanedBundleName != "" { | ||
| candidates = append(candidates, cleanedBundleName) | ||
| } | ||
|
|
||
| // Primary: dot-prefix heuristic on BSN | ||
| for _, candidate := range candidates { | ||
| suffix := "." + candidate | ||
| if strings.HasSuffix(bundleSymbolicName, suffix) && | ||
| strings.Contains(bundleSymbolicName, ".") && | ||
| len(bundleSymbolicName) > 5 { | ||
| return bundleSymbolicName[:len(bundleSymbolicName)-len(suffix)] | ||
| } | ||
| } | ||
|
|
||
| // BSN-based inference failed. When BSN has no dots (e.g. "bcprov"), it carries | ||
| // no package hierarchy. Try Automatic-Module-Name as a more reliable source. | ||
| if !strings.Contains(bundleSymbolicName, ".") && automaticModuleName != "" { | ||
| // Dot-prefix heuristic on AMN | ||
| for _, candidate := range candidates { | ||
| suffix := "." + candidate | ||
| if strings.HasSuffix(automaticModuleName, suffix) && | ||
| strings.Contains(automaticModuleName, ".") && | ||
| len(automaticModuleName) > 5 { | ||
| return automaticModuleName[:len(automaticModuleName)-len(suffix)] | ||
| } | ||
| } | ||
|
|
||
| // No candidate matched. Strip the last dot-segment of AMN as a best-effort | ||
| // groupId (e.g. "org.bouncycastle.provider" → "org.bouncycastle"). | ||
| // Require at least 2 dots (3 segments) to avoid over-truncating short names. | ||
| if strings.Count(automaticModuleName, ".") >= 2 { | ||
| if idx := strings.LastIndex(automaticModuleName, "."); idx > 0 { | ||
| return automaticModuleName[:idx] | ||
| } | ||
| } | ||
| } | ||
|
|
||
| // No candidate matched; use BSN as-is | ||
| return bundleSymbolicName | ||
| } | ||
|
|
||
| // resolveManifestPackage determines the final package name and version from | ||
| // MANIFEST.MF attributes and filename-derived values. | ||
| // | ||
| // ArtifactId: always the filename-derived artifact ID. | ||
| // Bundle-Name and Implementation-Title are OSGi/JAR display names and frequently | ||
| // do not match the Maven artifactId (e.g. Bundle-Name "bcprov" vs filename artifact | ||
| // "bcprov-jdk18on"). The filename is the authoritative source for the Maven coordinate. | ||
| // | ||
| // Version priority: bundleVersion==implVersion agreement > filenameVersion > bundleVersion > implVersion > "" | ||
| // | ||
| // Implementation-Title is intentionally excluded: it is an OSGi/JAR display name | ||
| // and is not reliably set to the Maven artifactId. | ||
| // | ||
| // Returns empty name if groupId or artifactId cannot be determined. | ||
| func resolveManifestPackage(filenameArtifact, filenameVersion, rawBSN, bundleName, bundleVersion, implVersion, automaticModuleName string) (name, version string) { | ||
| bsn := cleanBundleSymbolicName(rawBSN) | ||
| if bsn == "" { | ||
| return "", "" | ||
| } | ||
|
|
||
| // artifactId comes from the filename: it is the most reliable source of the Maven | ||
| // artifact ID. Bundle-Name / Implementation-Title are display names only. | ||
| artifactID := filenameArtifact | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Here we unconditionally use the file name but the PR description says
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. That was the original plan, but it was not resulting in the expected result for BC, so the file name took preference when the bundle name was not correct, let me update the description. I also sent the improvements to dd-trace-java so that they could implement them in the Java Tracer |
||
| if artifactID == "" { | ||
| return "", "" | ||
| } | ||
|
|
||
| // Resolve version by priority | ||
| if bundleVersion != "" && bundleVersion == implVersion { | ||
| // High confidence: both sources agree | ||
| version = bundleVersion | ||
| } else if filenameVersion != "" { | ||
| version = filenameVersion | ||
| } else if bundleVersion != "" { | ||
| version = bundleVersion | ||
| } else { | ||
| version = implVersion | ||
| } | ||
|
|
||
| // Resolve groupId using filename artifact and bundle name as candidates | ||
| // (not the resolved artifactId — groupId inference has its own candidate list) | ||
| groupID := parseGroupID(bsn, bundleName, filenameArtifact, automaticModuleName) | ||
| if groupID == "" { | ||
| return "", "" | ||
| } | ||
|
|
||
| return groupID + ":" + artifactID, version | ||
| } | ||
|
|
||
| // extractFromManifest attempts to infer a Maven package from the JAR's | ||
| // MANIFEST.MF attributes and filename. This is the fallback path used when | ||
| // no pom.properties are found inside the JAR. | ||
| func extractFromManifest(jarPath string, zipReader *zip.Reader, packages []lockfile.PackageDetails) []lockfile.PackageDetails { | ||
| filenameArtifact, filenameVersion := parseJarFilename(filepath.Base(jarPath)) | ||
| if filenameArtifact == "" { | ||
| // Filename doesn't match Maven convention; cannot infer package | ||
| return packages | ||
| } | ||
|
|
||
| mf, err := parseManifestAttributes(zipReader) | ||
| if err != nil { | ||
| // Silently skip on parse error — this is a best-effort fallback | ||
| return packages | ||
| } | ||
|
|
||
| name, version := resolveManifestPackage( | ||
| filenameArtifact, filenameVersion, | ||
| mf.bundleSymbolicName, mf.bundleName, mf.bundleVersion, | ||
| mf.implVersion, | ||
| mf.automaticModuleName, | ||
| ) | ||
|
|
||
| if name == "" { | ||
| return packages | ||
| } | ||
|
|
||
| return append(packages, lockfile.PackageDetails{ | ||
| Name: name, | ||
| Version: version, | ||
| PackageManager: jarPomPropertiesPackageManager, | ||
| Ecosystem: models.EcosystemMaven, | ||
| Opaque: true, | ||
| IsDirect: true, | ||
| }) | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
For no-pom JARs whose filename version itself contains a hyphen followed by a digit, such as Maven's timestamped snapshots (`my-lib-1.0-20260513.123456-1.jar`) or other hyphenated numeric versions, this greedy match treats the last `-\d` as the artifact/version boundary. That yields `filenameArtifact="my-lib-1.0-20260513.123456"` and `filenameVersion="1"`, so the manifest fallback can emit a non-existent coordinate even when the manifest has the correct package metadata. The split needs to account for numeric hyphens inside valid versions, ideally by using the manifest version when available or validating candidate splits from the right.

Useful? React with 👍 / 👎.